diff --git a/.env.docker b/.env.docker
index 278bd3e50..d77601b02 100644
--- a/.env.docker
+++ b/.env.docker
@@ -3,9 +3,9 @@ TZ=UTC
 # Greenplum
 ONETL_GP_HOST=greenplum
 ONETL_GP_PORT=5432
-ONETL_GP_DATABASE=testdb
-ONETL_GP_USER=tester
-ONETL_GP_PASSWORD=pivotal
+ONETL_GP_DATABASE=postgres
+ONETL_GP_USER=gpadmin
+ONETL_GP_PASSWORD=
 
 # ClickHouse
 ONETL_CH_HOST=clickhouse
diff --git a/.env.local b/.env.local
index 0327f1cc0..bb076a46a 100644
--- a/.env.local
+++ b/.env.local
@@ -3,9 +3,9 @@ export TZ=UTC
 # Greenplum
 export ONETL_GP_HOST=localhost
 export ONETL_GP_PORT=5433
-export ONETL_GP_DATABASE=testdb
-export ONETL_GP_USER=tester
-export ONETL_GP_PASSWORD=pivotal
+export ONETL_GP_DATABASE=postgres
+export ONETL_GP_USER=gpadmin
+export ONETL_GP_PASSWORD=
 
 # ClickHouse
 export ONETL_CH_HOST=localhost
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..a5c01bee7
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,10 @@
+# Set update schedule for GitHub Actions
+
+version: 2
+updates:
+  - package-ecosystem: github-actions
+    directory: /
+    schedule:
+      interval: daily
+    labels:
+      - type:ci
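The new dependabot.yml above only watches GitHub Actions. For context, Dependabot can track several ecosystems from the same file, one `updates` entry each; a sketch of what a later extension might look like (the pip entry and its directory are hypothetical, not part of this diff):

    # Hypothetical extension, not part of this change.
    version: 2
    updates:
      - package-ecosystem: github-actions
        directory: /
        schedule:
          interval: daily
        labels:
          - type:ci
      - package-ecosystem: pip      # hypothetical second ecosystem
        directory: /requirements    # assumed location of requirements files
        schedule:
          interval: weekly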
diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index be7746b84..c29d2d6c2 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -10,10 +10,10 @@ jobs:
     if: github.event.pull_request.user.login == 'pre-commit-ci[bot]' || github.event.pull_request.user.login == 'dependabot[bot]'
     steps:
-    - uses: alexwilson/enable-github-automerge-action@1.0.0
-      with:
-        github-token: ${{ secrets.AUTOMERGE_TOKEN }}
-        merge-method: REBASE
+      - uses: alexwilson/enable-github-automerge-action@1.0.0
+        with:
+          github-token: ${{ secrets.AUTOMERGE_TOKEN }}
+          merge-method: REBASE
 
   autoapprove:
     name: Automatically approve pull request
@@ -22,6 +22,6 @@ jobs:
     if: github.event.pull_request.user.login == 'pre-commit-ci[bot]' || github.event.pull_request.user.login == 'dependabot[bot]'
     steps:
-    - uses: hmarr/auto-approve-action@v3
-      with:
-        github-token: ${{ secrets.AUTOMERGE_TOKEN }}
+      - uses: hmarr/auto-approve-action@v4
+        with:
+          github-token: ${{ secrets.AUTOMERGE_TOKEN }}
diff --git a/.github/workflows/changelog.yml b/.github/workflows/changelog.yml
index b7e93d401..061d69df4 100644
--- a/.github/workflows/changelog.yml
+++ b/.github/workflows/changelog.yml
@@ -4,7 +4,7 @@ on:
   pull_request:
     types: [opened, synchronize, labeled, unlabeled, reopened]
     branches-ignore:
-    - master
+      - master
 
 env:
   DEFAULT_PYTHON: '3.12'
@@ -19,44 +19,44 @@ jobs:
     timeout-minutes: 10
     if: "!contains(github.event.pull_request.labels.*.name, 'ci:skip-changelog') && github.event.pull_request.user.login != 'pre-commit-ci[bot]' && github.event.pull_request.user.login != 'dependabot[bot]'"
     steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
-      with:
-        fetch-depth: 0
-
-    - name: Set up Python ${{ env.DEFAULT_PYTHON }}
-      id: python
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ env.DEFAULT_PYTHON }}
-
-    - name: Cache pip
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/pip
-        key: ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-changelog-${{ hashFiles('requirements/core.txt', 'requirements/docs.txt') }}
-        restore-keys: |
-          ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-changelog-${{ hashFiles('requirements/core.txt', 'requirements/docs.txt') }}
-          ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-changelog-
-
-    - name: Upgrade pip
-      run: python -m pip install --upgrade pip setuptools wheel
-
-    - name: Install dependencies
-      run: |
-        pip install -I -r requirements/core.txt -r requirements/docs.txt
-
-    - name: Check changelog entry exists
-      run: |
-        if [ ! -s docs/changelog/next_release/${{ github.event.pull_request.number }}.*.rst ]; then
-          echo "Please add corresponding file 'docs/changelog/next_release/..rst' with changes description"
-          exit 1
-        fi
-
-    - name: Validate changelog
-      run: |
-        # Fetch the pull request' base branch so towncrier will be able to
-        # compare the current branch with the base branch.
-        git fetch --no-tags origin +refs/heads/${{ github.base_ref }}:refs/remotes/origin/${{ github.base_ref }}
-        towncrier check --compare-with origin/${{ github.base_ref }}
-        towncrier --draft
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python ${{ env.DEFAULT_PYTHON }}
+        id: python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.DEFAULT_PYTHON }}
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-changelog-${{ hashFiles('requirements/core.txt', 'requirements/docs.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-changelog-${{ hashFiles('requirements/core.txt', 'requirements/docs.txt') }}
+            ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-changelog-
+
+      - name: Upgrade pip
+        run: python -m pip install --upgrade pip setuptools wheel
+
+      - name: Install dependencies
+        run: |
+          pip install -I -r requirements/core.txt -r requirements/docs.txt
+
+      - name: Check changelog entry exists
+        run: |
+          if [ ! -s docs/changelog/next_release/${{ github.event.pull_request.number }}.*.rst ]; then
+            echo "Please add corresponding file 'docs/changelog/next_release/..rst' with changes description"
+            exit 1
+          fi
+
+      - name: Validate changelog
+        run: |
+          # Fetch the pull request's base branch so towncrier will be able to
+          # compare the current branch with the base branch.
+          git fetch --no-tags origin +refs/heads/${{ github.base_ref }}:refs/remotes/origin/${{ github.base_ref }}
+          towncrier check --compare-with origin/${{ github.base_ref }}
+          towncrier --draft
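A note on the "Check changelog entry exists" step: `[ ! -s path/NNN.*.rst ]` relies on the shell expanding the glob. With zero matching fragments the pattern stays literal, `-s` fails, and the step exits 1 as intended; with exactly one match the file's size is checked; with two or more matches `[` receives extra arguments and the test itself errors out, so the message is skipped. A standalone sketch of that behavior, written as a workflow step (paths and names illustrative, not from this diff):

    # Sketch only: behaves correctly for 0 or 1 matching fragment files;
    # with several matches '[' errors out instead of printing the message.
    - name: Check changelog entry exists (sketch)
      run: |
        if [ ! -s docs/changelog/next_release/${{ github.event.pull_request.number }}.*.rst ]; then
          echo "Please add a changelog entry for this pull request"
          exit 1
        fi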
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 1b3aee7f5..f3397b706 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -3,10 +3,10 @@ name: Code analysis
 on:
   push:
     branches:
-    - develop
+      - develop
   pull_request:
     branches-ignore:
-    - master
+      - master
   workflow_dispatch:
 
 concurrency:
@@ -28,55 +28,55 @@ jobs:
       security-events: write
     steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-
-    - name: Set up Python ${{ env.DEFAULT_PYTHON }}
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ env.DEFAULT_PYTHON }}
-
-    - name: Install Kerberos headers
-      run: |
-        sudo apt-get update
-        sudo apt-get install --no-install-recommends libkrb5-dev
-
-    - name: Cache pip
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/pip
-        key: ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-codeql-${{ hashFiles('requirements*.txt') }}
-        restore-keys: |
-          ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-codeql-${{ hashFiles('requirements*.txt') }}
-          ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-codeql-
-          ${{ runner.os }}-python
-          ${{ runner.os }}-
-
-    - name: Upgrade pip
-      run: python -m pip install --upgrade pip setuptools wheel
-
-    - name: Install dependencies
-      run: |
-        pip install -I \
-        -r requirements/core.txt \
-        -r requirements/dev.txt \
-        -r requirements/ftp.txt \
-        -r requirements/hdfs.txt \
-        -r requirements/kerberos.txt \
-        -r requirements/s3.txt \
-        -r requirements/sftp.txt \
-        -r requirements/spark.txt \
-        -r requirements/webdav.txt
-
-        # Set the `CODEQL-PYTHON` environment variable to the Python executable
-        # that includes the dependencies
-        echo "CODEQL_PYTHON=$(which python)" >> $GITHUB_ENV
-
-    - name: Run flake8
-      run: python3 -m flake8 --config setup.cfg .
-
-    - name: Run mypy
-      run: python3 -m mypy --config-file setup.cfg onetl
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ env.DEFAULT_PYTHON }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.DEFAULT_PYTHON }}
+
+      - name: Install Kerberos headers
+        run: |
+          sudo apt-get update
+          sudo apt-get install --no-install-recommends libkrb5-dev
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-codeql-${{ hashFiles('requirements*.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-codeql-${{ hashFiles('requirements*.txt') }}
+            ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-codeql-
+            ${{ runner.os }}-python
+            ${{ runner.os }}-
+
+      - name: Upgrade pip
+        run: python -m pip install --upgrade pip setuptools wheel
+
+      - name: Install dependencies
+        run: |
+          pip install -I \
+          -r requirements/core.txt \
+          -r requirements/dev.txt \
+          -r requirements/ftp.txt \
+          -r requirements/hdfs.txt \
+          -r requirements/kerberos.txt \
+          -r requirements/s3.txt \
+          -r requirements/sftp.txt \
+          -r requirements/spark.txt \
+          -r requirements/webdav.txt
+
+          # Set the `CODEQL_PYTHON` environment variable to the Python executable
+          # that includes the dependencies
+          echo "CODEQL_PYTHON=$(which python)" >> $GITHUB_ENV
+
+      - name: Run flake8
+        run: python3 -m flake8 --config setup.cfg .
+
+      - name: Run mypy
+        run: python3 -m mypy --config-file setup.cfg onetl
 
   codeql:
     name: CodeQL
@@ -87,21 +87,21 @@ jobs:
       security-events: write
     steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
+      - name: Checkout repository
+        uses: actions/checkout@v4
 
-    - name: Set up Python ${{ env.DEFAULT_PYTHON }}
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ env.DEFAULT_PYTHON }}
+      - name: Set up Python ${{ env.DEFAULT_PYTHON }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.DEFAULT_PYTHON }}
 
     # Initializes the CodeQL tools for scanning.
-    - name: Initialize CodeQL
-      uses: github/codeql-action/init@v2
-      with:
-        languages: python
-
-    - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v2
-      with:
-        category: /language:python
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v3
+        with:
+          languages: python
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v3
+        with:
+          category: /language:python
diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml
index 6ee63269e..7c9b53384 100644
--- a/.github/workflows/data/clickhouse/matrix.yml
+++ b/.github/workflows/data/clickhouse/matrix.yml
@@ -18,21 +18,21 @@ latest: &latest
 
 matrix:
   small:
-  - clickhouse-image: clickhouse/clickhouse-server
-    clickhouse-version: 23.6.1-alpine
-    <<: *max
+    - clickhouse-image: clickhouse/clickhouse-server
+      clickhouse-version: 23.6.1-alpine
+      <<: *max
   full:
     # the lowest supported Clickhouse version by JDBC driver
-  - clickhouse-image: yandex/clickhouse-server
-    clickhouse-version: '20.7'
-    <<: *min
-  - clickhouse-image: clickhouse/clickhouse-server
-    clickhouse-version: 23.6.1-alpine
-    <<: *max
+    - clickhouse-image: yandex/clickhouse-server
+      clickhouse-version: '20.7'
+      <<: *min
+    - clickhouse-image: clickhouse/clickhouse-server
+      clickhouse-version: 23.6.1-alpine
+      <<: *max
   nightly:
-  - clickhouse-image: yandex/clickhouse-server
-    clickhouse-version: '20.7'
-    <<: *min
-  - clickhouse-image: clickhouse/clickhouse-server
-    clickhouse-version: latest-alpine
-    <<: *latest
+    - clickhouse-image: yandex/clickhouse-server
+      clickhouse-version: '20.7'
+      <<: *min
+    - clickhouse-image: clickhouse/clickhouse-server
+      clickhouse-version: latest-alpine
+      <<: *latest
diff --git a/.github/workflows/data/clickhouse/tracked.txt b/.github/workflows/data/clickhouse/tracked.txt
index eca76fa67..699984b0e 100644
--- a/.github/workflows/data/clickhouse/tracked.txt
+++ b/.github/workflows/data/clickhouse/tracked.txt
@@ -1 +1,2 @@
-**/clickhouse*
+**/*clickhouse*
+**/*clickhouse*/**
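Two patterns recur in the matrix.yml and tracked.txt changes here and below. First, most of the matrix.yml churn is pure re-indentation: `&min`/`&max`/`&latest` are YAML anchors and `<<: *max` is a merge key, so each list entry resolves to the same mapping at either indent level. Roughly, the small Clickhouse matrix expands to something like the following (the fields pulled in from `*max` depend on the full file and are assumptions here):

    # Illustrative expansion of `small` after anchor merging;
    # spark/python versions below are assumed, not from this diff.
    small:
      - clickhouse-image: clickhouse/clickhouse-server
        clickhouse-version: 23.6.1-alpine
        spark-version: 3.5.0    # merged in from the max anchor (assumed)
        python-version: '3.12'  # merged in from the max anchor (assumed)

Second, the tracked.txt globs are broadened: `**/clickhouse*` only matched files whose own name starts with `clickhouse`, while the new pair `**/*clickhouse*` and `**/*clickhouse*/**` also matches names that merely contain the word, plus everything under a matching directory.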
diff --git a/.github/workflows/data/ftp/matrix.yml b/.github/workflows/data/ftp/matrix.yml
index 58061a5ed..86243228c 100644
--- a/.github/workflows/data/ftp/matrix.yml
+++ b/.github/workflows/data/ftp/matrix.yml
@@ -9,11 +9,11 @@ max: &max
 matrix:
   small:
     # chonjay21/ftps image has only latest tag
-  - ftp-version: latest
-    <<: *max
+    - ftp-version: latest
+      <<: *max
   full: &full
-  - ftp-version: latest
-    <<: *min
-  - ftp-version: latest
-    <<: *max
+    - ftp-version: latest
+      <<: *min
+    - ftp-version: latest
+      <<: *max
   nightly: *full
diff --git a/.github/workflows/data/ftp/tracked.txt b/.github/workflows/data/ftp/tracked.txt
index 6ca4507ac..f83f0d21e 100644
--- a/.github/workflows/data/ftp/tracked.txt
+++ b/.github/workflows/data/ftp/tracked.txt
@@ -1,3 +1,4 @@
-**/ftp*
+**/*ftp*
+**/*ftp*/**
 docker-compose.yml
 .env.dependencies
diff --git a/.github/workflows/data/ftps/matrix.yml b/.github/workflows/data/ftps/matrix.yml
index 1ff40b12d..bb61dc6b0 100644
--- a/.github/workflows/data/ftps/matrix.yml
+++ b/.github/workflows/data/ftps/matrix.yml
@@ -9,11 +9,11 @@ max: &max
 matrix:
   small:
     # chonjay21/ftps image has only latest tag
-  - ftps-version: latest
-    <<: *max
+    - ftps-version: latest
+      <<: *max
   full: &full
-  - ftps-version: latest
-    <<: *min
-  - ftps-version: latest
-    <<: *max
+    - ftps-version: latest
+      <<: *min
+    - ftps-version: latest
+      <<: *max
   nightly: *full
diff --git a/.github/workflows/data/ftps/tracked.txt b/.github/workflows/data/ftps/tracked.txt
index 6ca4507ac..f83f0d21e 100644
--- a/.github/workflows/data/ftps/tracked.txt
+++ b/.github/workflows/data/ftps/tracked.txt
@@ -1,3 +1,4 @@
-**/ftp*
+**/*ftp*
+**/*ftp*/**
 docker-compose.yml
 .env.dependencies
diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml
index 292319bb5..7715ed5c3 100644
--- a/.github/workflows/data/greenplum/matrix.yml
+++ b/.github/workflows/data/greenplum/matrix.yml
@@ -14,11 +14,11 @@ max: &max
 
 matrix:
   small:
-  - greenplum-version: 6.8
-    <<: *max
+    - greenplum-version: 7.0.0
+      <<: *max
   full: &full
-  - greenplum-version: 6.1
-    <<: *min
-  - greenplum-version: 6.8
-    <<: *max
+    - greenplum-version: 6.25.3
+      <<: *min
+    - greenplum-version: 7.0.0
+      <<: *max
   nightly: *full
diff --git a/.github/workflows/data/greenplum/tracked.txt b/.github/workflows/data/greenplum/tracked.txt
index 46f52535e..57dad4935 100644
--- a/.github/workflows/data/greenplum/tracked.txt
+++ b/.github/workflows/data/greenplum/tracked.txt
@@ -1 +1,2 @@
-**/greenplum*
+**/*greenplum*
+**/*greenplum*/**
diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml
index 465c7eb81..c117c8d6f 100644
--- a/.github/workflows/data/hdfs/matrix.yml
+++ b/.github/workflows/data/hdfs/matrix.yml
@@ -21,10 +21,10 @@ latest: &latest
 
 matrix:
   small:
-  - *max
+    - *max
   full:
-  - *min
-  - *max
+    - *min
+    - *max
   nightly:
-  - *min
-  - *latest
+    - *min
+    - *latest
diff --git a/.github/workflows/data/hdfs/tracked.txt b/.github/workflows/data/hdfs/tracked.txt
index 6fed0b82a..47e0c040f 100644
--- a/.github/workflows/data/hdfs/tracked.txt
+++ b/.github/workflows/data/hdfs/tracked.txt
@@ -1,4 +1,5 @@
 **/*hdfs*
+**/*hdfs*/**
 onetl/connection/kerberos_helpers.py
 docker-compose.yml
 .env.dependencies
diff --git a/.github/workflows/data/hive/tracked.txt b/.github/workflows/data/hive/tracked.txt
index 4e2200117..00690a779 100644
--- a/.github/workflows/data/hive/tracked.txt
+++ b/.github/workflows/data/hive/tracked.txt
@@ -1 +1,2 @@
-**/hive*
+**/*hive*
+**/*hive*/**
diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml
index cf4668e9a..26ccf4af7 100644
--- a/.github/workflows/data/kafka/matrix.yml
+++ b/.github/workflows/data/kafka/matrix.yml
@@ -22,10 +22,10 @@ latest: &latest
 
 matrix:
   small:
-  - *max
+    - *max
   full:
-  - *min
-  - *max
+    - *min
+    - *max
   nightly:
-  - *min
-  - *latest
+    - *min
+    - *latest
diff --git a/.github/workflows/data/kafka/tracked.txt b/.github/workflows/data/kafka/tracked.txt
index a2639c3e4..3cfcf7c7f 100644
--- a/.github/workflows/data/kafka/tracked.txt
+++ b/.github/workflows/data/kafka/tracked.txt
@@ -1 +1,2 @@
-**/kafka*
+**/*kafka*
+**/*kafka*/**
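The ftp/ftps/greenplum files above (and samba/webdav below) also reuse a whole list via an alias: `full: &full` names the list and `nightly: *full` points at it, so the nightly run gets exactly the full set without duplication. A minimal self-contained sketch of that idiom (the spark versions are illustrative, not taken from this diff):

    # Minimal sketch of list aliasing as used in these matrix files.
    min: &min
      spark-version: 2.3.1   # illustrative
    max: &max
      spark-version: 3.5.0   # illustrative

    matrix:
      full: &full            # anchor the whole list...
        - ftp-version: latest
          <<: *min
        - ftp-version: latest
          <<: *max
      nightly: *full         # ...and reuse it verbatim for nightly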
diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml
index df9bdeffe..6dcf8efd3 100644
--- a/.github/workflows/data/local-fs/matrix.yml
+++ b/.github/workflows/data/local-fs/matrix.yml
@@ -30,14 +30,14 @@ latest: &latest
 
 matrix:
   small:
-  - <<: *max
+    - <<: *max
   full:
-  - <<: *min
-  - <<: *min_avro
-  - <<: *min_excel
-  - <<: *max
+    - <<: *min
+    - <<: *min_avro
+    - <<: *min_excel
+    - <<: *max
   nightly:
-  - <<: *min
-  - <<: *min_avro
-  - <<: *min_excel
-  - <<: *latest
+    - <<: *min
+    - <<: *min_avro
+    - <<: *min_excel
+    - <<: *latest
diff --git a/.github/workflows/data/local-fs/tracked.txt b/.github/workflows/data/local-fs/tracked.txt
index 013c04894..bb8d4c276 100644
--- a/.github/workflows/data/local-fs/tracked.txt
+++ b/.github/workflows/data/local-fs/tracked.txt
@@ -1,2 +1,4 @@
 **/*local_fs*
 **/*local-fs*
+**/*local_fs*/**
+**/*local-fs*/**
diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml
index 63aca7454..34e66b85a 100644
--- a/.github/workflows/data/mongodb/matrix.yml
+++ b/.github/workflows/data/mongodb/matrix.yml
@@ -19,15 +19,15 @@ latest: &latest
 
 matrix:
   small:
-  - mongodb-version: 6.0.7
-    <<: *max
+    - mongodb-version: 6.0.7
+      <<: *max
   full:
-  - mongodb-version: 4.0.0
-    <<: *min
-  - mongodb-version: 6.0.7
-    <<: *max
+    - mongodb-version: 4.0.0
+      <<: *min
+    - mongodb-version: 6.0.7
+      <<: *max
   nightly:
-  - mongodb-version: 4.0.0
-    <<: *min
-  - mongodb-version: latest
-    <<: *latest
+    - mongodb-version: 4.0.0
+      <<: *min
+    - mongodb-version: latest
+      <<: *latest
diff --git a/.github/workflows/data/mongodb/tracked.txt b/.github/workflows/data/mongodb/tracked.txt
index 0d362cd8b..11189fca6 100644
--- a/.github/workflows/data/mongodb/tracked.txt
+++ b/.github/workflows/data/mongodb/tracked.txt
@@ -1 +1,2 @@
-**/mongodb*
+**/*mongodb*
+**/*mongodb*/**
diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml
index a2c115bc2..bbb121c39 100644
--- a/.github/workflows/data/mssql/matrix.yml
+++ b/.github/workflows/data/mssql/matrix.yml
@@ -18,16 +18,16 @@ latest: &latest
 
 matrix:
   small:
-  - mssql-version: v2017.CU24.0
-    <<: *max
+    - mssql-version: v2017.CU24.0
+      <<: *max
   full:
-  - mssql-version: v2017.CU24.0
-    <<: *min
     # v2019.CU4.0 is not very stable
-  - mssql-version: v2017.CU24.0
-    <<: *max
+    - mssql-version: v2017.CU24.0
+      <<: *min
+    - mssql-version: v2017.CU24.0
+      <<: *max
   nightly:
-  - mssql-version: v2017.CU24.0
-    <<: *min
-  - mssql-version: v2017.CU24.0
-    <<: *latest
+    - mssql-version: v2017.CU24.0
+      <<: *min
+    - mssql-version: v2017.CU24.0
+      <<: *latest
diff --git a/.github/workflows/data/mssql/tracked.txt b/.github/workflows/data/mssql/tracked.txt
index 6432bff7b..d0b4b2c24 100644
--- a/.github/workflows/data/mssql/tracked.txt
+++ b/.github/workflows/data/mssql/tracked.txt
@@ -1 +1,2 @@
-**/mssql*
+**/*mssql*
+**/*mssql*/**
diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml
index 435ce9f90..aa1c575d1 100644
--- a/.github/workflows/data/mysql/matrix.yml
+++ b/.github/workflows/data/mysql/matrix.yml
@@ -18,17 +18,17 @@ latest: &latest
 
 matrix:
   small:
-  - mysql-version: 8.0.33
-    <<: *max
+    - mysql-version: 8.0.33
+      <<: *max
   full:
     # Min supported version by JDBC driver is 5.7
-  - mysql-version: 5.7.42
-    <<: *min
     # Max supported version by JDBC driver is 8.0
-  - mysql-version: 8.0.33
-    <<: *max
+    - mysql-version: 5.7.42
+      <<: *min
+    - mysql-version: 8.0.33
+      <<: *max
   nightly:
-  - mysql-version: 5.7.42
-    <<: *min
-  - mysql-version: latest
-    <<: *latest
+    - mysql-version: 5.7.42
+      <<: *min
+    - mysql-version: latest
+      <<: *latest
diff --git a/.github/workflows/data/mysql/tracked.txt b/.github/workflows/data/mysql/tracked.txt
index 07268aad8..6fc8b7eba 100644
--- a/.github/workflows/data/mysql/tracked.txt
+++ b/.github/workflows/data/mysql/tracked.txt
@@ -1 +1,2 @@
-**/mysql*
+**/*mysql*
+**/*mysql*/**
diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml
index dcf725f51..1230f659c 100644
--- a/.github/workflows/data/oracle/matrix.yml
+++ b/.github/workflows/data/oracle/matrix.yml
@@ -18,29 +18,29 @@ latest: &latest
 
 matrix:
   small:
-  - oracle-image: gvenzl/oracle-free
-    oracle-version: 23.3-slim-faststart
-    db-name: FREEPDB1
-    <<: *max
+    - oracle-image: gvenzl/oracle-free
+      oracle-version: 23.3-slim-faststart
+      db-name: FREEPDB1
+      <<: *max
   full:
-  - oracle-image: gvenzl/oracle-xe
-    oracle-version: 11.2.0.2-slim-faststart
-    db-name: XE
-    <<: *min
-  - oracle-image: gvenzl/oracle-xe
-    oracle-version: 21.3.0-slim-faststart
-    db-name: XEPDB1
-    <<: *max
-  - oracle-image: gvenzl/oracle-free
-    oracle-version: 23.3-slim-faststart
-    db-name: FREEPDB1
-    <<: *max
+    - oracle-image: gvenzl/oracle-xe
+      oracle-version: 11.2.0.2-slim-faststart
+      db-name: XE
+      <<: *min
+    - oracle-image: gvenzl/oracle-xe
+      oracle-version: 21.3.0-slim-faststart
+      db-name: XEPDB1
+      <<: *max
+    - oracle-image: gvenzl/oracle-free
+      oracle-version: 23.3-slim-faststart
+      db-name: FREEPDB1
+      <<: *max
   nightly:
-  - oracle-image: gvenzl/oracle-xe
-    oracle-version: 11.2.0.2-slim-faststart
-    db-name: XE
-    <<: *min
-  - oracle-image: gvenzl/oracle-free
-    oracle-version: slim-faststart
-    db-name: FREEPDB1
-    <<: *latest
+    - oracle-image: gvenzl/oracle-xe
+      oracle-version: 11.2.0.2-slim-faststart
+      db-name: XE
+      <<: *min
+    - oracle-image: gvenzl/oracle-free
+      oracle-version: slim-faststart
+      db-name: FREEPDB1
+      <<: *latest
diff --git a/.github/workflows/data/oracle/tracked.txt b/.github/workflows/data/oracle/tracked.txt
index 0ccaed3c0..740ec5589 100644
--- a/.github/workflows/data/oracle/tracked.txt
+++ b/.github/workflows/data/oracle/tracked.txt
@@ -1 +1,2 @@
-**/oracle*
+**/*oracle*
+**/*oracle*/**
diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml
index 1f2c6077c..78de0b38b 100644
--- a/.github/workflows/data/postgres/matrix.yml
+++ b/.github/workflows/data/postgres/matrix.yml
@@ -18,16 +18,16 @@ latest: &latest
 
 matrix:
   small:
-  - postgres-version: 15.2-alpine
-    <<: *max
+    - postgres-version: 15.2-alpine
+      <<: *max
   full:
     # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life
-  - postgres-version: 9.4.26-alpine
-    <<: *min
-  - postgres-version: 15.2-alpine
-    <<: *max
+    - postgres-version: 9.4.26-alpine
+      <<: *min
+    - postgres-version: 15.2-alpine
+      <<: *max
   nightly:
-  - postgres-version: 9.4.26-alpine
-    <<: *min
-  - postgres-version: alpine
-    <<: *latest
+    - postgres-version: 9.4.26-alpine
+      <<: *min
+    - postgres-version: alpine
+      <<: *latest
diff --git a/.github/workflows/data/postgres/tracked.txt b/.github/workflows/data/postgres/tracked.txt
index b717fb751..179aa6b02 100644
--- a/.github/workflows/data/postgres/tracked.txt
+++ b/.github/workflows/data/postgres/tracked.txt
@@ -1 +1,2 @@
-**/postgres*
+**/*postgres*
+**/*postgres*/**
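Each of these per-store matrices is later selected by key (small/full/nightly) and exported as JSON by the get-matrix.yml workflow further below. Downstream test jobs presumably consume that output along these lines (job and output names are assumed for illustration, not taken from this diff):

    # Assumed consumer side: feeding one exported JSON matrix into a
    # test job's strategy. Names are illustrative.
    tests-postgres:
      needs: [get-matrix]
      strategy:
        matrix:
          include: ${{ fromJson(needs.get-matrix.outputs.matrix-postgres) }}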
diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml
index 64357c229..b2d595d6b 100644
--- a/.github/workflows/data/s3/matrix.yml
+++ b/.github/workflows/data/s3/matrix.yml
@@ -23,10 +23,10 @@ latest: &latest
 
 matrix:
   small:
-  - *max
+    - *max
   full:
-  - *min
-  - *max
+    - *min
+    - *max
   nightly:
-  - *min
-  - *latest
+    - *min
+    - *latest
diff --git a/.github/workflows/data/s3/tracked.txt b/.github/workflows/data/s3/tracked.txt
index 9754e6cd2..164eed96f 100644
--- a/.github/workflows/data/s3/tracked.txt
+++ b/.github/workflows/data/s3/tracked.txt
@@ -1 +1,2 @@
 **/*s3*
+**/*s3*/**
diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml
index fc1573e02..f03adcd08 100644
--- a/.github/workflows/data/samba/matrix.yml
+++ b/.github/workflows/data/samba/matrix.yml
@@ -8,11 +8,11 @@ max: &max
 
 matrix:
   small:
-  - server-version: latest
-    <<: *max
+    - server-version: latest
+      <<: *max
   full: &full
-  - server-version: latest
-    <<: *min
-  - server-version: latest
-    <<: *max
+    - server-version: latest
+      <<: *min
+    - server-version: latest
+      <<: *max
   nightly: *full
diff --git a/.github/workflows/data/samba/tracked.txt b/.github/workflows/data/samba/tracked.txt
index 5f7fcf905..81f81ae17 100644
--- a/.github/workflows/data/samba/tracked.txt
+++ b/.github/workflows/data/samba/tracked.txt
@@ -1 +1,2 @@
-**/samba*
+**/*samba*
+**/*samba*/**
diff --git a/.github/workflows/data/sftp/matrix.yml b/.github/workflows/data/sftp/matrix.yml
index 44852908b..3379a6e3e 100644
--- a/.github/workflows/data/sftp/matrix.yml
+++ b/.github/workflows/data/sftp/matrix.yml
@@ -8,16 +8,16 @@ max: &max
 
 matrix:
   small:
-  - openssh-version: 9.3_p1-r3-ls120
-    <<: *max
+    - openssh-version: 9.3_p1-r3-ls120
+      <<: *max
   full:
     # prior image versions do not accept incoming connections, seems like a bug
-  - openssh-version: 8.1_p1-r0-ls5
-    <<: *min
-  - openssh-version: 9.3_p1-r3-ls120
-    <<: *max
+    - openssh-version: 8.1_p1-r0-ls5
+      <<: *min
+    - openssh-version: 9.3_p1-r3-ls120
+      <<: *max
   nightly:
-  - openssh-version: 8.1_p1-r0-ls5
-    <<: *min
-  - openssh-version: latest
-    <<: *max
+    - openssh-version: 8.1_p1-r0-ls5
+      <<: *min
+    - openssh-version: latest
+      <<: *max
diff --git a/.github/workflows/data/sftp/tracked.txt b/.github/workflows/data/sftp/tracked.txt
index 0432594f5..918066c25 100644
--- a/.github/workflows/data/sftp/tracked.txt
+++ b/.github/workflows/data/sftp/tracked.txt
@@ -1 +1,2 @@
-**/sftp*
+**/*sftp*
+**/*sftp*/**
diff --git a/.github/workflows/data/teradata/tracked.txt b/.github/workflows/data/teradata/tracked.txt
index 8e0f7b49a..a83eceebe 100644
--- a/.github/workflows/data/teradata/tracked.txt
+++ b/.github/workflows/data/teradata/tracked.txt
@@ -1 +1,2 @@
-**/teradata*
+**/*teradata*
+**/*teradata*/**
diff --git a/.github/workflows/data/webdav/matrix.yml b/.github/workflows/data/webdav/matrix.yml
index 87b492c31..0a67ff838 100644
--- a/.github/workflows/data/webdav/matrix.yml
+++ b/.github/workflows/data/webdav/matrix.yml
@@ -9,11 +9,11 @@ max: &max
 matrix:
   small:
     # chonjay21/webdav image has only latest tag
-  - webdav-version: latest
-    <<: *max
+    - webdav-version: latest
+      <<: *max
   full: &full
-  - webdav-version: latest
-    <<: *min
-  - webdav-version: latest
-    <<: *max
+    - webdav-version: latest
+      <<: *min
+    - webdav-version: latest
+      <<: *max
   nightly: *full
diff --git a/.github/workflows/data/webdav/tracked.txt b/.github/workflows/data/webdav/tracked.txt
index e7dd068b9..b8ac5e88f 100644
--- a/.github/workflows/data/webdav/tracked.txt
+++ b/.github/workflows/data/webdav/tracked.txt
@@ -1,3 +1,4 @@
-**/webdav*
+**/*webdav*
+**/*webdav*/**
 docker-compose.yml
 .env.dependencies
diff --git a/.github/workflows/dev-release.yml b/.github/workflows/dev-release.yml
index 8dd3dc5d1..57aae7045 100644
--- a/.github/workflows/dev-release.yml
+++ b/.github/workflows/dev-release.yml
@@ -3,9 +3,9 @@ name: Dev release
 on:
   push:
     branches-ignore:
-    - dependabot/**
-    - pre-commit-ci-update-config
-    - master
+      - dependabot/**
+      - pre-commit-ci-update-config
+      - master
   workflow_dispatch:
 
 env:
@@ -28,28 +28,28 @@ jobs:
       id-token: write  # to auth in Test PyPI
     steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
-      with:
-        fetch-depth: 0
-
-    - name: Set up Python ${{ env.DEFAULT_PYTHON }}
-      id: python
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ env.DEFAULT_PYTHON }}
-
-    - name: Upgrade pip
-      run: python -m pip install --upgrade pip setuptools wheel
-
-    - name: Fix logo in Readme
-      run: |
-        sed -i "s#image:: docs/#image:: https://raw.githubusercontent.com/MobileTeleSystems/onetl/$GITHUB_SHA/docs/#g" README.rst
-
-    - name: Build package
-      run: python setup.py sdist bdist_wheel
-
-    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@release/v1
-      with:
-        repository-url: https://test.pypi.org/legacy/
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python ${{ env.DEFAULT_PYTHON }}
+        id: python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.DEFAULT_PYTHON }}
+
+      - name: Upgrade pip
+        run: python -m pip install --upgrade pip setuptools wheel
+
+      - name: Fix logo in Readme
+        run: |
+          sed -i "s#image:: docs/#image:: https://raw.githubusercontent.com/MobileTeleSystems/onetl/$GITHUB_SHA/docs/#g" README.rst
+
+      - name: Build package
+        run: python setup.py sdist bdist_wheel
+
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          repository-url: https://test.pypi.org/legacy/
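The "Fix logo in Readme" step rewrites relative image paths so they still resolve once the README is rendered off-repository (e.g. on Test PyPI). With a hypothetical README.rst line, the sed substitution behaves like this:

    # Effect of the sed call on a hypothetical input line
    # (logo path below is illustrative, not from this diff):
    #   before: .. image:: docs/static/logo.svg
    #   after:  .. image:: https://raw.githubusercontent.com/MobileTeleSystems/onetl/<GITHUB_SHA>/docs/static/logo.svg
    - name: Fix logo in Readme (sketch)
      run: |
        sed -i "s#image:: docs/#image:: https://raw.githubusercontent.com/MobileTeleSystems/onetl/$GITHUB_SHA/docs/#g" README.rst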
steps.changed-file-df.outputs.all_changed_files }}' - - - name: Check if core files are changed - id: changed-core - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/core/tracked.txt - files_ignore_from_source_file: .github/workflows/data/core/ignored.txt - - - name: Print core files changed - run: | - echo '${{ steps.changed-core.outputs.all_changed_files }}' - - - name: Calculate Core matrix key - id: key-core - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-core.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get Core matrix - id: matrix-core - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml - - - name: Check if Clickhouse files are changed - id: changed-clickhouse - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/clickhouse/tracked.txt - files_ignore_from_source_file: .github/workflows/data/clickhouse/ignored.txt - - - name: Print Clickhouse files changed - run: | - echo '${{ steps.changed-clickhouse.outputs.all_changed_files }}' - - - name: Calculate Clickhouse matrix key - id: key-clickhouse - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-clickhouse.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get Clickhouse matrix - id: matrix-clickhouse - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml - - - name: Check if Greenplum files are changed - id: changed-greenplum - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/greenplum/tracked.txt - files_ignore_from_source_file: .github/workflows/data/greenplum/ignored.txt - - - name: Print Greenplum files changed - run: | - echo '${{ steps.changed-greenplum.outputs.all_changed_files }}' - - - name: Calculate Greenplum matrix key - id: key-greenplum - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-greenplum.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get Greenplum matrix - id: matrix-greenplum - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml - - - name: Check if Hive files are changed - id: changed-hive - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/hive/tracked.txt - files_ignore_from_source_file: .github/workflows/data/hive/ignored.txt - - - name: Print Hive files changed - run: | - echo '${{ steps.changed-hive.outputs.all_changed_files }}' - - - name: Calculate Hive matrix key - id: key-hive - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-hive.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get Hive matrix - id: matrix-hive - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml - - - 
name: Check if Kafka files are changed - id: changed-kafka - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/kafka/tracked.txt - files_ignore_from_source_file: .github/workflows/data/kafka/ignored.txt - - - name: Print Kafka files changed - run: | - echo '${{ steps.changed-kafka.outputs.all_changed_files }}' - - - name: Calculate Kafka matrix key - id: key-kafka - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-kafka.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get Kafka matrix - id: matrix-kafka - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml - - - name: Check if LocalFS files are changed - id: changed-local-fs - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/local-fs/tracked.txt - files_ignore_from_source_file: .github/workflows/data/local-fs/ignored.txt - - - name: Print LocalFS files changed - run: | - echo '${{ steps.changed-local-fs.outputs.all_changed_files }}' - - - name: Calculate LocalFS matrix key - id: key-local-fs - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file-df.outputs.any_changed }} || ${{ steps.changed-local-fs.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get LocalFS matrix - id: matrix-local-fs - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml - - - name: Check if MongoDB files are changed - id: changed-mongodb - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/mongodb/tracked.txt - files_ignore_from_source_file: .github/workflows/data/mongodb/ignored.txt - - - name: Print MongoDB files changed - run: | - echo '${{ steps.changed-mongodb.outputs.all_changed_files }}' - - - name: Calculate MongoDB matrix key - id: key-mongodb - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-mongodb.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get MongoDB matrix - id: matrix-mongodb - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml - - - name: Check if MSSQL files are changed - id: changed-mssql - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/mssql/tracked.txt - files_ignore_from_source_file: .github/workflows/data/mssql/ignored.txt - - - name: Print MSSQL files changed - run: | - echo '${{ steps.changed-mssql.outputs.all_changed_files }}' - - - name: Calculate MSSQL matrix key - id: key-mssql - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-mssql.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get MSSQL matrix - id: matrix-mssql - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml - - - name: Check if MySQL files are 
changed - id: changed-mysql - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/mysql/tracked.txt - files_ignore_from_source_file: .github/workflows/data/mysql/ignored.txt - - - name: Print MSSQL files changed - run: | - echo '${{ steps.changed-mysql.outputs.all_changed_files }}' - - - name: Calculate MySQL matrix key - id: key-mysql - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-mysql.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get MySQL matrix - id: matrix-mysql - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml - - - name: Check if Oracle files are changed - id: changed-oracle - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/oracle/tracked.txt - files_ignore_from_source_file: .github/workflows/data/oracle/ignored.txt - - - name: Print Oracle files changed - run: | - echo '${{ steps.changed-oracle.outputs.all_changed_files }}' - - - name: Calculate Oracle matrix key - id: key-oracle - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-oracle.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get Oracle matrix - id: matrix-oracle - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml - - - name: Check if Postgres files are changed - id: changed-postgres - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/postgres/tracked.txt - files_ignore_from_source_file: .github/workflows/data/postgres/ignored.txt - - - name: Print Postgres files changed - run: | - echo '${{ steps.changed-postgres.outputs.all_changed_files }}' - - - name: Calculate Postgres matrix key - id: key-postgres - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-postgres.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get Postgres matrix - id: matrix-postgres - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml - - - name: Check if Teradata files are changed - id: changed-teradata - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/teradata/tracked.txt - files_ignore_from_source_file: .github/workflows/data/teradata/ignored.txt - - - name: Print Teradata files changed - run: | - echo '${{ steps.changed-teradata.outputs.all_changed_files }}' - - - name: Calculate Teradata matrix key - id: key-teradata - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-teradata.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get Teradata matrix - id: matrix-teradata - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml - - - name: Check if FTP files are changed - 
id: changed-ftp - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/ftp/tracked.txt - files_ignore_from_source_file: .github/workflows/data/ftp/ignored.txt - - - name: Print FTP files changed - run: | - echo '${{ steps.changed-ftp.outputs.all_changed_files }}' - - - name: Calculate FTP matrix key - id: key-ftp - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-ftp.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get FTP matrix - id: matrix-ftp - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml - - - name: Check if FTPS files are changed - id: changed-ftps - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/ftps/tracked.txt - files_ignore_from_source_file: .github/workflows/data/ftps/ignored.txt - - - name: Print FTPS files changed - run: | - echo '${{ steps.changed-ftps.outputs.all_changed_files }}' - - - name: Calculate FTPS matrix key - id: key-ftps - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-ftps.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get FTPS matrix - id: matrix-ftps - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml - - - name: Check if HDFS files are changed - id: changed-hdfs - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/hdfs/tracked.txt - files_ignore_from_source_file: .github/workflows/data/hdfs/ignored.txt - - - name: Print HDFS files changed - run: | - echo '${{ steps.changed-hdfs.outputs.all_changed_files }}' - - - name: Calculate HDFS matrix key - id: key-hdfs - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-hdfs.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get HDFS matrix - id: matrix-hdfs - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml - - - name: Check if S3 files are changed - id: changed-s3 - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/s3/tracked.txt - files_ignore_from_source_file: .github/workflows/data/s3/ignored.txt - - - name: Print S3 files changed - run: | - echo '${{ steps.changed-s3.outputs.all_changed_files }}' - - - name: Calculate S3 matrix key - id: key-s3 - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-s3.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get S3 matrix - id: matrix-s3 - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml - - - name: Check if SFTP files are changed - id: changed-sftp - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/sftp/tracked.txt - files_ignore_from_source_file: 
.github/workflows/data/sftp/ignored.txt - - - name: Print SFTP files changed - run: | - echo '${{ steps.changed-sftp.outputs.all_changed_files }}' - - - name: Calculate SFTP matrix key - id: key-sftp - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-sftp.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get SFTP matrix - id: matrix-sftp - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml - - - name: Check if Samba files are changed - id: changed-samba - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/samba/tracked.txt - files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt - - - name: Print Samba files changed - run: | - echo '${{ steps.changed-samba.outputs.all_changed_files }}' - - - name: Calculate Samba matrix key - id: key-samba - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-samba.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get Samba matrix - id: matrix-samba - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml - - - name: Check if WebDAV files are changed - id: changed-webdav - uses: tj-actions/changed-files@v35 - with: - files_from_source_file: .github/workflows/data/webdav/tracked.txt - files_ignore_from_source_file: .github/workflows/data/webdav/ignored.txt - - - name: Print WebDAV files changed - run: | - echo '${{ steps.changed-webdav.outputs.all_changed_files }}' - - - name: Calculate WebDAV matrix key - id: key-webdav - run: | - if ${{ inputs.nightly }}; then - key=nightly - elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-webdav.outputs.any_changed }}; then - key=full - else - key=small - fi - echo key=$key - echo key=$key >> $GITHUB_OUTPUT - - - name: Get WebDAV matrix - id: matrix-webdav - uses: mikefarah/yq@v4.33.3 - with: - cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 2 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.DEFAULT_PYTHON }} + + - name: Check if base files are changed + id: changed-base + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/base/tracked.txt + files_ignore_from_source_file: .github/workflows/data/base/ignored.txt + + - name: Print base files changed + run: | + echo '${{ steps.changed-base.outputs.all_changed_files }}' + + - name: Check if db-related files are changed + id: changed-db + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/db/tracked.txt + files_ignore_from_source_file: .github/workflows/data/db/ignored.txt + + - name: Print db-related files changed + run: | + echo '${{ steps.changed-db.outputs.all_changed_files }}' + + - name: Check if file-related files are changed + id: changed-file + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/file/tracked.txt + files_ignore_from_source_file: .github/workflows/data/file/ignored.txt + + - 
name: Print file-related files changed + run: | + echo '${{ steps.changed-file.outputs.all_changed_files }}' + + - name: Check if file-df-related files are changed + id: changed-file-df + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/file-df/tracked.txt + files_ignore_from_source_file: .github/workflows/data/file-df/ignored.txt + + - name: Print file-df-related files changed + run: | + echo '${{ steps.changed-file-df.outputs.all_changed_files }}' + + - name: Check if core files are changed + id: changed-core + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/core/tracked.txt + files_ignore_from_source_file: .github/workflows/data/core/ignored.txt + + - name: Print core files changed + run: | + echo '${{ steps.changed-core.outputs.all_changed_files }}' + + - name: Calculate Core matrix key + id: key-core + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-core.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Core matrix + id: matrix-core + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml + + - name: Check if Clickhouse files are changed + id: changed-clickhouse + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/clickhouse/tracked.txt + files_ignore_from_source_file: .github/workflows/data/clickhouse/ignored.txt + + - name: Print Clickhouse files changed + run: | + echo '${{ steps.changed-clickhouse.outputs.all_changed_files }}' + + - name: Calculate Clickhouse matrix key + id: key-clickhouse + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-clickhouse.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Clickhouse matrix + id: matrix-clickhouse + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml + + - name: Check if Greenplum files are changed + id: changed-greenplum + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/greenplum/tracked.txt + files_ignore_from_source_file: .github/workflows/data/greenplum/ignored.txt + + - name: Print Greenplum files changed + run: | + echo '${{ steps.changed-greenplum.outputs.all_changed_files }}' + + - name: Calculate Greenplum matrix key + id: key-greenplum + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-greenplum.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Greenplum matrix + id: matrix-greenplum + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml + + - name: Check if Hive files are changed + id: changed-hive + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/hive/tracked.txt + files_ignore_from_source_file: .github/workflows/data/hive/ignored.txt + + - name: Print Hive files changed + run: | + echo '${{ steps.changed-hive.outputs.all_changed_files }}' + + - name: Calculate Hive matrix key + id: key-hive + run: 
| + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-hive.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Hive matrix + id: matrix-hive + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml + + - name: Check if Kafka files are changed + id: changed-kafka + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/kafka/tracked.txt + files_ignore_from_source_file: .github/workflows/data/kafka/ignored.txt + + - name: Print Kafka files changed + run: | + echo '${{ steps.changed-kafka.outputs.all_changed_files }}' + + - name: Calculate Kafka matrix key + id: key-kafka + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-kafka.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Kafka matrix + id: matrix-kafka + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml + + - name: Check if LocalFS files are changed + id: changed-local-fs + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/local-fs/tracked.txt + files_ignore_from_source_file: .github/workflows/data/local-fs/ignored.txt + + - name: Print LocalFS files changed + run: | + echo '${{ steps.changed-local-fs.outputs.all_changed_files }}' + + - name: Calculate LocalFS matrix key + id: key-local-fs + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file-df.outputs.any_changed }} || ${{ steps.changed-local-fs.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get LocalFS matrix + id: matrix-local-fs + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml + + - name: Check if MongoDB files are changed + id: changed-mongodb + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/mongodb/tracked.txt + files_ignore_from_source_file: .github/workflows/data/mongodb/ignored.txt + + - name: Print MongoDB files changed + run: | + echo '${{ steps.changed-mongodb.outputs.all_changed_files }}' + + - name: Calculate MongoDB matrix key + id: key-mongodb + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-mongodb.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get MongoDB matrix + id: matrix-mongodb + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml + + - name: Check if MSSQL files are changed + id: changed-mssql + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/mssql/tracked.txt + files_ignore_from_source_file: .github/workflows/data/mssql/ignored.txt + + - name: Print MSSQL files changed + run: | + echo '${{ steps.changed-mssql.outputs.all_changed_files }}' + + - name: Calculate MSSQL matrix key + id: key-mssql + run: | + if ${{ inputs.nightly }}; then + 
key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-mssql.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get MSSQL matrix + id: matrix-mssql + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml + + - name: Check if MySQL files are changed + id: changed-mysql + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/mysql/tracked.txt + files_ignore_from_source_file: .github/workflows/data/mysql/ignored.txt + + - name: Print MSSQL files changed + run: | + echo '${{ steps.changed-mysql.outputs.all_changed_files }}' + + - name: Calculate MySQL matrix key + id: key-mysql + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-mysql.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get MySQL matrix + id: matrix-mysql + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml + + - name: Check if Oracle files are changed + id: changed-oracle + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/oracle/tracked.txt + files_ignore_from_source_file: .github/workflows/data/oracle/ignored.txt + + - name: Print Oracle files changed + run: | + echo '${{ steps.changed-oracle.outputs.all_changed_files }}' + + - name: Calculate Oracle matrix key + id: key-oracle + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-oracle.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Oracle matrix + id: matrix-oracle + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml + + - name: Check if Postgres files are changed + id: changed-postgres + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/postgres/tracked.txt + files_ignore_from_source_file: .github/workflows/data/postgres/ignored.txt + + - name: Print Postgres files changed + run: | + echo '${{ steps.changed-postgres.outputs.all_changed_files }}' + + - name: Calculate Postgres matrix key + id: key-postgres + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-postgres.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Postgres matrix + id: matrix-postgres + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml + + - name: Check if Teradata files are changed + id: changed-teradata + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/teradata/tracked.txt + files_ignore_from_source_file: .github/workflows/data/teradata/ignored.txt + + - name: Print Teradata files changed + run: | + echo '${{ steps.changed-teradata.outputs.all_changed_files }}' + + - name: Calculate Teradata matrix key + id: key-teradata + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ 
steps.changed-base.outputs.any_changed }} || ${{ steps.changed-db.outputs.any_changed }} || ${{ steps.changed-teradata.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Teradata matrix + id: matrix-teradata + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml + + - name: Check if FTP files are changed + id: changed-ftp + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/ftp/tracked.txt + files_ignore_from_source_file: .github/workflows/data/ftp/ignored.txt + + - name: Print FTP files changed + run: | + echo '${{ steps.changed-ftp.outputs.all_changed_files }}' + + - name: Calculate FTP matrix key + id: key-ftp + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-ftp.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get FTP matrix + id: matrix-ftp + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml + + - name: Check if FTPS files are changed + id: changed-ftps + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/ftps/tracked.txt + files_ignore_from_source_file: .github/workflows/data/ftps/ignored.txt + + - name: Print FTPS files changed + run: | + echo '${{ steps.changed-ftps.outputs.all_changed_files }}' + + - name: Calculate FTPS matrix key + id: key-ftps + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-ftps.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get FTPS matrix + id: matrix-ftps + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml + + - name: Check if HDFS files are changed + id: changed-hdfs + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/hdfs/tracked.txt + files_ignore_from_source_file: .github/workflows/data/hdfs/ignored.txt + + - name: Print HDFS files changed + run: | + echo '${{ steps.changed-hdfs.outputs.all_changed_files }}' + + - name: Calculate HDFS matrix key + id: key-hdfs + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-hdfs.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get HDFS matrix + id: matrix-hdfs + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml + + - name: Check if S3 files are changed + id: changed-s3 + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/s3/tracked.txt + files_ignore_from_source_file: .github/workflows/data/s3/ignored.txt + + - name: Print S3 files changed + run: | + echo '${{ steps.changed-s3.outputs.all_changed_files }}' + + - name: Calculate S3 matrix key + id: key-s3 + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-s3.outputs.any_changed }}; then + 
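# base files, shared file-connection files, or S3-specific files changed: run the full test matrix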
key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get S3 matrix + id: matrix-s3 + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml + + - name: Check if SFTP files are changed + id: changed-sftp + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/sftp/tracked.txt + files_ignore_from_source_file: .github/workflows/data/sftp/ignored.txt + + - name: Print SFTP files changed + run: | + echo '${{ steps.changed-sftp.outputs.all_changed_files }}' + + - name: Calculate SFTP matrix key + id: key-sftp + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-sftp.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get SFTP matrix + id: matrix-sftp + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml + + - name: Check if Samba files are changed + id: changed-samba + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/samba/tracked.txt + files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt + + - name: Print Samba files changed + run: | + echo '${{ steps.changed-samba.outputs.all_changed_files }}' + + - name: Calculate Samba matrix key + id: key-samba + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-samba.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Samba matrix + id: matrix-samba + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml + + - name: Check if WebDAV files are changed + id: changed-webdav + uses: tj-actions/changed-files@v42 + with: + files_from_source_file: .github/workflows/data/webdav/tracked.txt + files_ignore_from_source_file: .github/workflows/data/webdav/ignored.txt + + - name: Print WebDAV files changed + run: | + echo '${{ steps.changed-webdav.outputs.all_changed_files }}' + + - name: Calculate WebDAV matrix key + id: key-webdav + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-webdav.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get WebDAV matrix + id: matrix-webdav + uses: mikefarah/yq@v4.40.5 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index be5958b1b..58b34cee1 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -2,7 +2,7 @@ name: Nightly tests on: schedule: # Run every week - - cron: 30 12 * * WED + - cron: 30 12 * * WED workflow_dispatch: concurrency: @@ -338,28 +338,28 @@ jobs: name: Tests done runs-on: ubuntu-latest needs: - - tests-core - - tests-clickhouse - - tests-hive - - tests-kafka - - tests-local-fs - - tests-mongodb - - tests-mssql - - tests-mysql - - tests-oracle - - tests-postgres - - tests-teradata - - tests-ftp - - tests-ftps - - tests-hdfs - - tests-s3 - - tests-sftp - - tests-samba - - tests-webdav + - tests-core + - 
tests-clickhouse + - tests-hive + - tests-kafka + - tests-local-fs + - tests-mongodb + - tests-mssql + - tests-mysql + - tests-oracle + - tests-postgres + - tests-teradata + - tests-ftp + - tests-ftps + - tests-hdfs + - tests-s3 + - tests-sftp + - tests-samba + - tests-webdav steps: # Do not send coverage report. Nightly tests are used only to find bugs with new versions of DB and filesystems. - - name: All done - run: echo 1 + - name: All done + run: echo 1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 84cf5817c..90dfc638d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,7 +3,7 @@ name: Release on: push: tags: - - '[0-9]+.[0-9]+.[0-9]+' + - '[0-9]+.[0-9]+.[0-9]+' env: DEFAULT_PYTHON: '3.12' @@ -22,92 +22,95 @@ jobs: contents: write # to create Github release steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Python ${{ env.DEFAULT_PYTHON }} - id: python - uses: actions/setup-python@v4 - with: - python-version: ${{ env.DEFAULT_PYTHON }} - - - name: Cache pip - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-release-${{ hashFiles('requirements/core.txt', 'requirements/docs.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-release-${{ hashFiles('requirements/core.txt', 'requirements/docs.txt') }} - ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-release- - - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel - - - name: Install dependencies - run: pip install -I -r requirements/core.txt -r requirements/docs.txt - - - name: Fix logo in Readme - run: | - sed -i "s#image:: docs/#image:: https://raw.githubusercontent.com/MobileTeleSystems/onetl/$GITHUB_REF_NAME/docs/#g" README.rst - - - name: Build package - run: python setup.py sdist bdist_wheel - - - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 - - - name: Get changelog - run: | - cat docs/changelog/$GITHUB_REF_NAME.rst > changelog.rst - - - name: Fix Github links - run: | - # Replace Github links from Sphinx syntax with Markdown - sed -i -E 's/:github:issue:`(.*)`/#\1/g' changelog.rst - sed -i -E 's/:github:pull:`(.*)`/#\1/g' changelog.rst - sed -i -E 's/:github:user:`(.*)`/@\1/g' changelog.rst - sed -i -E 's/:github:org:`(.*)`/@\1/g' changelog.rst - - - name: Convert changelog to markdown - uses: docker://pandoc/core:2.9 - with: - args: >- - --output=changelog.md - --from=rst - --to=gfm - --wrap=none - changelog.rst - - - name: Fix Github code blocks - run: | - # Replace ``` {.python caption="abc"} with ```python caption="abc" - sed -i -E 's/``` \{\.(.*)\}/```\1/g' changelog.md - - # Replace ``` python with ```python - sed -i -E 's/``` (\w+)/```\1/g' changelog.md - - - name: Get release name - id: release-name - run: | - # Release name looks like: 0.7.0 (2023-05-15) - echo -n name= > "$GITHUB_OUTPUT" - cat changelog.md | head -1 | sed -E "s/#+\s*//g" >> "$GITHUB_OUTPUT" - - - name: Fix headers - run: | - # Remove header with release name - sed -i -e '1,2d' changelog.md - - - name: Create Github release - id: create_release - uses: softprops/action-gh-release@v1 - with: - token: ${{ secrets.GITHUB_TOKEN }} - draft: false - prerelease: false - name: ${{ steps.release-name.outputs.name }} - body_path: changelog.md - files: | - dist/* + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python ${{ env.DEFAULT_PYTHON }} + id: python + uses: 
actions/setup-python@v5 + with: + python-version: ${{ env.DEFAULT_PYTHON }} + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-release-${{ hashFiles('requirements/core.txt', 'requirements/docs.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-release-${{ hashFiles('requirements/core.txt', 'requirements/docs.txt') }} + ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-release- + + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel + + - name: Install dependencies + run: pip install -I -r requirements/core.txt -r requirements/docs.txt + + - name: Fix logo in Readme + run: | + sed -i "s#image:: docs/#image:: https://raw.githubusercontent.com/MobileTeleSystems/onetl/$GITHUB_REF_NAME/docs/#g" README.rst + + - name: Build package + run: python setup.py sdist bdist_wheel + + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1 + + - name: Get changelog + run: | + cat docs/changelog/$GITHUB_REF_NAME.rst > changelog.rst + + - name: Prepare rST syntax for conversion to Markdown + run: | + # Replace Github links from Sphinx syntax with Markdown + sed -i -E 's/:github:issue:`(.*)`/#\1/g' changelog.rst + sed -i -E 's/:github:pull:`(.*)`/#\1/g' changelog.rst + sed -i -E 's/:github:user:`(.*)`/@\1/g' changelog.rst + sed -i -E 's/:github:org:`(.*)`/@\1/g' changelog.rst + + - name: Convert rST to Markdown + uses: docker://pandoc/core:2.9 + with: + args: >- + --output=changelog.md + --from=rst + --to=gfm + --wrap=none + changelog.rst + + - name: Fix Markdown syntax after conversion + run: | + # Replace ``` {.python caption="abc"} with ```python caption="abc" + sed -i -E 's/``` \{\.(.*)\}/```\1/g' changelog.md + + # Replace ``` python with ```python + sed -i -E 's/``` (\w+)/```\1/g' changelog.md + + # Replace \# with # + sed -i -E 's/\\#/#/g' changelog.md + + - name: Get release name + id: release-name + run: | + # Release name looks like: 0.7.0 (2023-05-15) + echo -n name= > "$GITHUB_OUTPUT" + cat changelog.md | head -1 | sed -E "s/#+\s*//g" >> "$GITHUB_OUTPUT" + + - name: Fix headers + run: | + # Remove header with release name + sed -i -e '1,2d' changelog.md + + - name: Create Github release + id: create_release + uses: softprops/action-gh-release@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + draft: false + prerelease: false + name: ${{ steps.release-name.outputs.name }} + body_path: changelog.md + files: | + dist/* diff --git a/.github/workflows/test-clickhouse.yml b/.github/workflows/test-clickhouse.yml index 903e50665..236c1b125 100644 --- a/.github/workflows/test-clickhouse.yml +++ b/.github/workflows/test-clickhouse.yml @@ -35,64 +35,64 @@ jobs: env: TZ: UTC ports: - - 8123:8123 - - 9001:9000 + - 8123:8123 + - 9001:9000 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: 
actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/clickhouse.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/clickhouse.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Wait for Clickhouse to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 8123 -t 60 + - name: Wait for Clickhouse to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 8123 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m clickhouse + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m clickhouse - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: clickhouse-${{ 
inputs.clickhouse-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: clickhouse-${{ inputs.clickhouse-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-core.yml b/.github/workflows/test-core.yml index f3f24f7a8..6e9036df1 100644 --- a/.github/workflows/test-core.yml +++ b/.github/workflows/test-core.yml @@ -25,54 +25,54 @@ jobs: runs-on: ${{ inputs.os }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-core-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-core-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-core- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + ${{ runner.os }}-python-${{ 
inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Run tests - run: | - ./run_tests.sh -m 'not connection' - ./run_tests.sh onetl/_util + - name: Run tests + run: | + ./run_tests.sh -m 'not connection' + ./run_tests.sh onetl/_util - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: core-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: core-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-ftp.yml b/.github/workflows/test-ftp.yml index 648a5ca36..962013077 100644 --- a/.github/workflows/test-ftp.yml +++ b/.github/workflows/test-ftp.yml @@ -22,62 +22,62 @@ jobs: runs-on: ${{ inputs.os }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftp/pull/3 # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
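# As a workaround, the 'Start FTP' step below launches the container with docker compose after the checkout.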
# See https://github.com/orgs/community/discussions/25792 - - name: Start FTP - run: | - docker compose down -v --remove-orphans - docker compose up -d ftp - env: - FTP_IMAGE: chonjay21/ftps:${{ inputs.ftp-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftp${{ inputs.ftp-version }} + - name: Start FTP + run: | + docker compose down -v --remove-orphans + docker compose up -d ftp + env: + FTP_IMAGE: chonjay21/ftps:${{ inputs.ftp-version }} + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftp${{ inputs.ftp-version }} - - name: Wait for FTP to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 2121 -t 60 + - name: Wait for FTP to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 2121 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m ftp + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m ftp - - name: Shutdown FTP - if: always() - run: | - docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftp${{ inputs.ftp-version }} + - name: Shutdown FTP + if: always() + run: | + docker compose down -v --remove-orphans + env: + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftp${{ inputs.ftp-version }} - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: ftp-${{ inputs.ftp-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: ftp-${{ inputs.ftp-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-ftps.yml b/.github/workflows/test-ftps.yml index 4725420a0..3be745aed 100644 --- a/.github/workflows/test-ftps.yml +++ b/.github/workflows/test-ftps.yml @@ -22,62 +22,62 @@ jobs: runs-on: ${{ inputs.os }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - 
name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftps/pull/3 # Cannot use services because we need to mount config file from the repo, but services start before checkout. # See https://github.com/orgs/community/discussions/25792 - - name: Start FTPS - run: | - docker compose down -v --remove-orphans - docker compose up -d ftps - env: - FTPS_IMAGE: chonjay21/ftps:${{ inputs.ftps-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftps${{ inputs.ftps-version }} + - name: Start FTPS + run: | + docker compose down -v --remove-orphans + docker compose up -d ftps + env: + FTPS_IMAGE: chonjay21/ftps:${{ inputs.ftps-version }} + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftps${{ inputs.ftps-version }} - - name: Wait for FTPS to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 2122 -t 60 + - name: Wait for FTPS to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 2122 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m ftps + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m ftps - - name: Shutdown FTPS - if: always() - run: | - docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftps${{ inputs.ftps-version }} + - name: Shutdown FTPS + if: always() + run: | + docker compose down -v --remove-orphans + env: + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftps${{ inputs.ftps-version }} - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: ftps-${{ inputs.ftps-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: ftps-${{ inputs.ftps-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index 26bac4a68..94b99436c 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -34,84 +34,84 @@ jobs: runs-on: ${{ inputs.os }} services: greenplum: - image: datagrip/greenplum:${{ inputs.greenplum-version }} + image: andruche/greenplum:${{ inputs.greenplum-version }} env: TZ: UTC ports: - - 5433:5432 + - 5433:5432 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache 
Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum- - - name: Set up Postgres client - if: runner.os == 'Linux' - run: | - sudo apt-get update - sudo apt-get install --no-install-recommends postgresql-client + - name: Set up Postgres client + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install --no-install-recommends postgresql-client - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Wait for Greenplum to be ready - run: | - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env + - name: Wait for Greenplum to be ready + run: | + sed '/^$/d' 
./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env - # Greenplum init script is running very late - sleep 30 + # Greenplum init script runs very late + sleep 30 - export PGPASSWORD=$ONETL_GP_PASSWORD - pg_isready -h localhost -p 5433 -U $ONETL_GP_USER -d $ONETL_GP_DATABASE -t 60 + export PGPASSWORD=$ONETL_GP_PASSWORD + pg_isready -h localhost -p 5433 -U $ONETL_GP_USER -d $ONETL_GP_DATABASE -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m greenplum - env: - ONETL_DB_WITH_GREENPLUM: 'true' - GREENPLUM_PACKAGES_USER: ${{ secrets.GREENPLUM_PACKAGES_USER }} - GREENPLUM_PACKAGES_PASSWORD: ${{ secrets.GREENPLUM_PACKAGES_PASSWORD }} + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m greenplum + env: + ONETL_DB_WITH_GREENPLUM: 'true' + GREENPLUM_PACKAGES_USER: ${{ secrets.GREENPLUM_PACKAGES_USER }} + GREENPLUM_PACKAGES_PASSWORD: ${{ secrets.GREENPLUM_PACKAGES_PASSWORD }} - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: greenplum-${{ inputs.greenplum-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: greenplum-${{ inputs.greenplum-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index b0c53c475..c97f5357c 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -28,87 +28,87 @@ jobs: runs-on: ${{ inputs.os }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Set up Kerberos libs - if: runner.os == 'Linux' - run: | - sudo apt-get update - sudo apt-get install --no-install-recommends libkrb5-dev gcc + - name: Set up Kerberos libs + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install --no-install-recommends libkrb5-dev gcc - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hdfs-${{ 
hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hdfs- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/kerberos.txt -r requirements/hdfs.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/kerberos.txt -r requirements/hdfs.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt # Cannot use services because we need to mount config file from the repo, but services start before checkout. # See https://github.com/orgs/community/discussions/25792 - - name: Start HDFS - run: | - docker compose down -v --remove-orphans - docker compose up -d hdfs --wait --wait-timeout 200 & - wait_pid=$! - docker compose logs -f hdfs & - wait $wait_pid - env: - HDFS_IMAGE: mtsrus/hadoop:${{ inputs.hadoop-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-hadoop${{ inputs.hadoop-version }} + - name: Start HDFS + run: | + docker compose down -v --remove-orphans + docker compose up -d hdfs --wait --wait-timeout 200 & + wait_pid=$! 
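+ # tail container logs while the backgrounded 'docker compose up --wait' above blocks on the healthcheck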
+ docker compose logs -f hdfs & + wait $wait_pid + env: + HDFS_IMAGE: mtsrus/hadoop:${{ inputs.hadoop-version }} + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-hadoop${{ inputs.hadoop-version }} - - name: Wait for HDFS to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 9870 -t 60 + - name: Wait for HDFS to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 9870 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - echo "127.0.0.1 hdfs" | sudo tee -a /etc/hosts - ./pytest_runner.sh -m hdfs + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + echo "127.0.0.1 hdfs" | sudo tee -a /etc/hosts + ./pytest_runner.sh -m hdfs - - name: Shutdown HDFS - if: always() - run: | - docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-hadoop${{ inputs.hadoop-version }} + - name: Shutdown HDFS + if: always() + run: | + docker compose down -v --remove-orphans + env: + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-hadoop${{ inputs.hadoop-version }} - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: hdfs-${{ inputs.hadoop-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: hdfs-${{ inputs.hadoop-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-hive.yml b/.github/workflows/test-hive.yml index 16e9ef3f4..6c77690fb 100644 --- a/.github/workflows/test-hive.yml +++ b/.github/workflows/test-hive.yml @@ -25,56 +25,56 @@ jobs: runs-on: ${{ inputs.os }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + 
${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m hive + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m hive - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: hive-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: hive-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-kafka.yml b/.github/workflows/test-kafka.yml index 3b8894bcc..a72172d84 100644 --- a/.github/workflows/test-kafka.yml +++ b/.github/workflows/test-kafka.yml @@ -63,8 +63,8 @@ jobs: KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,EXTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,INTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT,EXTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT KAFKA_SASL_ENABLED_MECHANISMS: PLAIN,SCRAM-SHA-256,SCRAM-SHA-512 ports: - - 9093:9093 - - 9095:9095 + - 9093:9093 + - 9095:9095 options: >- --health-cmd "kafka-topics.sh --bootstrap-server 127.0.0.1:9092 --list" --health-interval 10s @@ -72,61 +72,61 @@ jobs: --health-retries 5 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ 
inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/kafka.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/kafka.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Wait for Kafka to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 9093 -t 60 - ./docker/wait-for-it.sh -h localhost -p 9095 -t 60 + - name: Wait for Kafka to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 9093 -t 60 + ./docker/wait-for-it.sh -h localhost -p 9095 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m kafka + - name: Run tests + 
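# 'kafka' is a pytest marker (presumably defined in the repo's pytest config); '-m kafka' below limits the run to Kafka tests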
run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m kafka - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: kafka-${{ inputs.kafka-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: kafka-${{ inputs.kafka-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-local-fs.yml b/.github/workflows/test-local-fs.yml index a15c04337..98c98e652 100644 --- a/.github/workflows/test-local-fs.yml +++ b/.github/workflows/test-local-fs.yml @@ -25,56 +25,56 @@ jobs: runs-on: ${{ inputs.os }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs-${{ 
hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m local_fs + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m local_fs - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: local-fs-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: local-fs-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-mongodb.yml b/.github/workflows/test-mongodb.yml index 2229c9a20..3366bb3cb 100644 --- a/.github/workflows/test-mongodb.yml +++ b/.github/workflows/test-mongodb.yml @@ -34,63 +34,63 @@ jobs: MONGO_INITDB_ROOT_USERNAME: onetl MONGO_INITDB_ROOT_PASSWORD: E4j7h!9A ports: - - 27017:27017 + - 27017:27017 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb- - - name: Cache pip - 
uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mongodb.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mongodb.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Wait for MongoDB to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 27017 -t 60 + - name: Wait for MongoDB to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 27017 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m mongodb + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m mongodb - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: mongodb-${{ inputs.mongodb-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: mongodb-${{ inputs.mongodb-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml index 59a88978a..529f9edec 100644 --- a/.github/workflows/test-mssql.yml +++ b/.github/workflows/test-mssql.yml @@ -37,63 +37,63 @@ jobs: ACCEPT_EULA: Y SA_PASSWORD: 2astazeY ports: - - 1433:1433 + - 1433:1433 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up 
Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mssql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mssql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Wait for MSSQL to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 1433 -t 60 + - name: Wait for MSSQL to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 1433 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m mssql + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | 
sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m mssql - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: mssql-${{ inputs.mssql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: mssql-${{ inputs.mssql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-mysql.yml b/.github/workflows/test-mysql.yml index a03bd6304..36d800a57 100644 --- a/.github/workflows/test-mysql.yml +++ b/.github/workflows/test-mysql.yml @@ -36,63 +36,63 @@ jobs: MYSQL_USER: onetl MYSQL_PASSWORD: ohbuz9Eochaj9saibooK3thooGa5aesh ports: - - 3306:3306 + - 3306:3306 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version 
}}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mysql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mysql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Wait for MySQL to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 3306 -t 60 + - name: Wait for MySQL to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 3306 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m mysql + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m mysql - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: mysql-${{ inputs.mysql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: mysql-${{ inputs.mysql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-oracle.yml b/.github/workflows/test-oracle.yml index 0fbcbc73b..a61395c5b 100644 --- a/.github/workflows/test-oracle.yml +++ b/.github/workflows/test-oracle.yml @@ -42,77 +42,77 @@ jobs: APP_USER: onetl APP_USER_PASSWORD: Yoequ2Hoeceit4ch ports: - - 1522:1521 + - 1522:1521 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle-${{ 
hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle- - - name: Set up Oracle instantclient - if: runner.os == 'Linux' - run: | - mkdir ./tmp - wget -P ./tmp https://download.oracle.com/otn_software/linux/instantclient/2110000/instantclient-basic-linux.x64-21.10.0.0.0dbru.zip - mkdir -p ./oracle - unzip ./tmp/instantclient-basic-linux.x64-21.10.0.0.0dbru.zip -d ./oracle - rm -rf ./tmp/instantclient-basic-linux.x64-21.10.0.0.0dbru.zip + - name: Set up Oracle instantclient + if: runner.os == 'Linux' + run: | + mkdir ./tmp + wget -P ./tmp https://download.oracle.com/otn_software/linux/instantclient/2110000/instantclient-basic-linux.x64-21.10.0.0.0dbru.zip + mkdir -p ./oracle + unzip ./tmp/instantclient-basic-linux.x64-21.10.0.0.0dbru.zip -d ./oracle + rm -rf ./tmp/instantclient-basic-linux.x64-21.10.0.0.0dbru.zip - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/oracle.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/oracle.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Wait for Oracle to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 1522 -t 60 + - name: Wait for Oracle to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 1522 -t 60 - - name: Run tests - run: | - export ONETL_ORA_CLIENT_PATH=./oracle/instantclient_21_10 - export LD_LIBRARY_PATH=${ONETL_ORA_CLIENT_PATH}:${LD_LIBRARY_PATH} - export PATH=${ONETL_ORA_CLIENT_PATH}:${PATH} + - name: Run tests + run: | + export ONETL_ORA_CLIENT_PATH=./oracle/instantclient_21_10 + export LD_LIBRARY_PATH=${ONETL_ORA_CLIENT_PATH}:${LD_LIBRARY_PATH} + export PATH=${ONETL_ORA_CLIENT_PATH}:${PATH} - mkdir reports/ || echo "Directory exists" - 
sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - export "ONETL_ORA_SERVICE_NAME=${{ inputs.db-name }}" - ./pytest_runner.sh -m oracle + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + export "ONETL_ORA_SERVICE_NAME=${{ inputs.db-name }}" + ./pytest_runner.sh -m oracle - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: oracle-${{ inputs.oracle-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: oracle-${{ inputs.oracle-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-postgres.yml b/.github/workflows/test-postgres.yml index 601a57a02..bc2666248 100644 --- a/.github/workflows/test-postgres.yml +++ b/.github/workflows/test-postgres.yml @@ -35,63 +35,63 @@ jobs: POSTGRES_DB: onetl POSTGRES_PASSWORD: ohtae0luxeshi1uraeluMoh9IShah7ai ports: - - 5432:5432 + - 5432:5432 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres- + - name: Cache pip + uses: 
actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Wait for Postgres to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 5432 -t 60 + - name: Wait for Postgres to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 5432 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m postgres + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m postgres - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: postgres-${{ inputs.postgres-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: postgres-${{ inputs.postgres-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-s3.yml b/.github/workflows/test-s3.yml index 365331e37..0796ed171 100644 --- a/.github/workflows/test-s3.yml +++ b/.github/workflows/test-s3.yml @@ -36,63 +36,63 @@ jobs: MINIO_ROOT_PASSWORD: woh3fogh3Biereu3quee1aidu9theiro MINIO_SECRET_KEY: woh3fogh3Biereu3quee1aidu9theiro ports: - - 9010:9000 + - 9010:9000 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ 
runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3- - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/s3.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/s3.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Wait for S3 to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 9010 -t 60 + - name: Wait for S3 to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 9010 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m s3 + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m s3 - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: s3-${{ inputs.minio-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: s3-${{ inputs.minio-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml index d82005d89..4612ff95f 100644 --- 
a/.github/workflows/test-samba.yml +++ b/.github/workflows/test-samba.yml @@ -22,60 +22,60 @@ jobs: runs-on: ${{ inputs.os }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/samba.txt -r requirements/tests/base.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/samba.txt -r requirements/tests/base.txt # Replace with GitHub Actions' services; currently not possible because the Samba container requires custom start parameters - - name: Start Samba - run: | - docker compose down -v --remove-orphans - docker compose up -d samba - env: - SAMBA_IMAGE: elswork/samba:${{ inputs.server-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} + - name: Start Samba + run: | + docker compose down -v --remove-orphans + docker compose up -d samba + env: + SAMBA_IMAGE: elswork/samba:${{ inputs.server-version }} + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} - - name: Wait for Samba to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 445 -t 60 + - name: Wait for Samba to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 445 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m samba + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m samba - - name: Shutdown Samba - if: always() - run: | - docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} + - name: Shutdown Samba + if: always() + run: | + docker compose down -v --remove-orphans + env: + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} - - name: Upload coverage results - uses: 
actions/upload-artifact@v4 - with: - name: samba-${{ inputs.server-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: samba-${{ inputs.server-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-sftp.yml b/.github/workflows/test-sftp.yml index 5ea61b2e6..301dd27c8 100644 --- a/.github/workflows/test-sftp.yml +++ b/.github/workflows/test-sftp.yml @@ -29,47 +29,47 @@ jobs: PASSWORD_ACCESS: 'true' USER_PASSWORD: AesujeifohgoaCu0Boosiet5aimeitho ports: - - 2222:2222 + - 2222:2222 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/sftp.txt -r requirements/tests/base.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/sftp.txt -r requirements/tests/base.txt - - name: Wait for SFTP to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 2222 -t 60 + - name: Wait for SFTP to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 2222 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m sftp + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m sftp - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: sftp-${{ inputs.openssh-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: sftp-${{ inputs.openssh-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-teradata.yml 
b/.github/workflows/test-teradata.yml index b865cfff6..64d332994 100644 --- a/.github/workflows/test-teradata.yml +++ b/.github/workflows/test-teradata.yml @@ -25,55 +25,55 @@ jobs: runs-on: ${{ inputs.os }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Java ${{ inputs.java-version }} - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ inputs.java-version }} + - name: Set up Java ${{ inputs.java-version }} + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ inputs.java-version }} - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache Ivy - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata- + - name: Cache Ivy + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.ivy2 + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + restore-keys: | + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata- - - name: Cache pip - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata- + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + - name: Install dependencies + 
run: | + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m teradata + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m teradata - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: teradata-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: teradata-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/test-webdav.yml b/.github/workflows/test-webdav.yml index bece2465f..e259d4dfe 100644 --- a/.github/workflows/test-webdav.yml +++ b/.github/workflows/test-webdav.yml @@ -22,62 +22,62 @@ jobs: runs-on: ${{ inputs.os }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Python ${{ inputs.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} - - name: Cache pip - uses: actions/cache@v3 - if: inputs.with-cache - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} - restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav- + - name: Cache pip + uses: actions/cache@v4 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav- - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel - - name: Install dependencies - run: | - pip install -I -r requirements/core.txt -r requirements/webdav.txt -r requirements/tests/base.txt + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/webdav.txt -r requirements/tests/base.txt # Replace with Github Actions' services after https://github.com/chonjay21/docker-webdav/pull/3 # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
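# For illustration, a services-based setup would look roughly like the hypothetical sketch below.
# The blocker is the volume mount: the repository path does not exist until after checkout.
# (The image tag and in-container config path here are assumptions, not the project's actual values.)
#   services:
#     webdav:
#       image: chonjay21/webdav:latest      # hypothetical tag; the job actually uses ${{ inputs.webdav-version }}
#       ports:
#         - 8000:8000                       # matches the wait-for-it check below
#       volumes:
#         - ./docker/webdav:/sites/config   # assumed config location; repo not yet checked out when services start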
# See https://github.com/orgs/community/discussions/25792 - - name: Start WebDAV - run: | - docker compose down -v --remove-orphans - docker compose up -d webdav - env: - WEBDAV_IMAGE: chonjay21/webdav:${{ inputs.webdav-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-webdav${{ inputs.webdav-version }} + - name: Start WebDAV + run: | + docker compose down -v --remove-orphans + docker compose up -d webdav + env: + WEBDAV_IMAGE: chonjay21/webdav:${{ inputs.webdav-version }} + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-webdav${{ inputs.webdav-version }} - - name: Wait for WebDAV to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 8000 -t 60 + - name: Wait for WebDAV to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 8000 -t 60 - - name: Run tests - run: | - mkdir reports/ || echo "Directory exists" - sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env - source ./env - ./pytest_runner.sh -m webdav + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m webdav - - name: Shutdown WebDAV - if: always() - run: | - docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-webdav${{ inputs.webdav-version }} + - name: Shutdown WebDAV + if: always() + run: | + docker compose down -v --remove-orphans + env: + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-webdav${{ inputs.webdav-version }} - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: webdav-${{ inputs.webdav-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: webdav-${{ inputs.webdav-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 670f981e0..b48f88c5a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -2,10 +2,10 @@ name: Tests on: push: branches: - - develop + - develop pull_request: branches-ignore: - - master + - master workflow_dispatch: concurrency: @@ -321,63 +321,65 @@ jobs: name: Tests done runs-on: ubuntu-latest needs: - - tests-core - - tests-clickhouse - - tests-hive - - tests-kafka - - tests-local-fs - - tests-mongodb - - tests-mssql - - tests-mysql - - tests-oracle - - tests-postgres - - tests-teradata - - tests-ftp - - tests-ftps - - tests-hdfs - - tests-s3 - - tests-sftp - - tests-samba - - tests-webdav + - tests-core + - tests-clickhouse + - tests-hive + - tests-kafka + - tests-local-fs + - tests-mongodb + - tests-mssql + - tests-mysql + - tests-oracle + - tests-postgres + - tests-teradata + - tests-ftp + - tests-ftps + - tests-hdfs + - tests-s3 + - tests-sftp + - tests-samba + - tests-webdav steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python ${{ env.DEFAULT_PYTHON }} - uses: actions/setup-python@v4 - with: - python-version: ${{ env.DEFAULT_PYTHON }} - - - name: Cache pip - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-coverage - - - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel - - - name: Install dependencies - run: pip install -I coverage pytest - - - name: Download all coverage reports - uses: actions/download-artifact@v4 - with: - path: reports - - - name: Move coverage data to the root folder - run: find reports 
-type f -exec mv '{}' reports \; - - name: Generate coverage reports - run: ./combine_coverage.sh - - name: Check coverage - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - directory: ./reports - fail_ci_if_error: true - - name: All done - run: echo 1 + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ env.DEFAULT_PYTHON }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.DEFAULT_PYTHON }} + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-coverage + + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel + + - name: Install dependencies + run: pip install -I coverage pytest + + - name: Download all coverage reports + uses: actions/download-artifact@v4 + with: + path: reports + + - name: Move coverage data to the root folder + run: find reports -type f -exec mv '{}' reports \; + + - name: Generate coverage reports + run: ./combine_coverage.sh + + - name: Check coverage + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ./reports/coverage.xml + fail_ci_if_error: true + # TODO: remove after fixing https://github.com/codecov/codecov-cli/issues/367 + plugin: 'gcov' + + - name: All done + run: echo 1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1f1f1fc6d..e209d7e54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,113 +1,151 @@ +default_language_version: + python: python3.11 + repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 - hooks: - - id: check-ast - - id: check-case-conflict - - id: check-docstring-first - - id: check-executables-have-shebangs - - id: check-merge-conflict - - id: check-toml - - id: check-vcs-permalinks - - id: check-yaml - args: [--unsafe] - - id: requirements-txt-fixer - files: ^(requirements/.*\.txt)$ - - id: end-of-file-fixer - exclude: ^(.*/VERSION|tests/resources/.*/.*)$ - - id: fix-byte-order-marker - - id: fix-encoding-pragma - args: [--remove] - - id: name-tests-test - files: ^tests/(tests_integration|tests_unit)/.*\.py$ - args: [--django] - - id: trailing-whitespace -- repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.5.4 - hooks: - - id: remove-tabs - exclude: ^docs/(make.bat|Makefile) -- repo: https://github.com/codespell-project/codespell - rev: v2.2.6 - hooks: - - id: codespell - args: [-w] -- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks - rev: v2.11.0 - hooks: - - id: pretty-format-yaml - args: [--autofix, --indent, '2', --preserve-quotes] -- repo: https://github.com/lovesegfault/beautysh - rev: v6.2.1 - hooks: - - id: beautysh -- repo: https://github.com/IamTheFij/docker-pre-commit - rev: v3.0.1 - hooks: - - id: docker-compose-check -- repo: https://github.com/pycqa/isort - rev: 5.13.0 - hooks: - - id: isort -- repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.10.0 - hooks: - - id: python-no-log-warn - - id: python-no-eval - - id: rst-backticks - - id: rst-directive-colons - - id: rst-inline-touching-normal - - id: text-unicode-replacement-char -- repo: https://github.com/asottile/pyupgrade - rev: v3.15.0 - hooks: - - id: pyupgrade - args: [--py37-plus, --keep-runtime-typing] -- repo: https://github.com/psf/black - rev: 23.11.0 - hooks: - - id: black - language_version: python3 -- repo: https://github.com/asottile/blacken-docs - rev: 1.16.0 - hooks: - - id: blacken-docs -- repo: meta - hooks: - - id: 
check-hooks-apply - - id: check-useless-excludes -- repo: https://github.com/PyCQA/autoflake - rev: v2.2.1 - hooks: - - id: autoflake - args: - - --in-place - - --config=setup.cfg -- repo: local - hooks: - - id: flake8 - name: flake8 - entry: python3 -m flake8 - language: system - types: [python] - files: ^(onetl|tests)/.*$ - pass_filenames: true - - id: mypy - name: mypy - entry: python3 -m mypy --config-file setup.cfg onetl - language: system - types: [python] - pass_filenames: false - - id: towncrier - name: towncrier - entry: towncrier build --draft - language: system - types: [rst] - pass_filenames: false + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-ast + - id: check-case-conflict + - id: check-docstring-first + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-toml + - id: check-vcs-permalinks + - id: check-yaml + args: [--unsafe] + - id: requirements-txt-fixer + files: ^(requirements/.*\.txt)$ + - id: end-of-file-fixer + exclude: ^(.*/VERSION|tests/resources/.*/.*)$ + - id: fix-byte-order-marker + - id: fix-encoding-pragma + args: [--remove] + - id: name-tests-test + files: ^tests/(tests_integration|tests_unit)/.*\.py$ + args: [--django] + - id: trailing-whitespace + + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.5.4 + hooks: + - id: forbid-tabs + - id: remove-tabs + args: [--whitespaces-count, '2'] + - id: chmod + args: ['644'] + exclude_types: [shell] + exclude: ^(tests/resources/file_df_connection/generate_files\.py)$ + - id: chmod + args: ['755'] + types: [shell] + - id: chmod + args: ['755'] + files: ^(tests/resources/file_df_connection/generate_files\.py)$ + - id: insert-license + files: .*\.py$ + exclude: ^(setup\.py|conftest\.py|docs/.*\.py|tests/.*\.py)$ + args: + - --license-filepath + - .spdx-license-header.txt + - --use-current-year + - --no-extra-eol + + - repo: https://github.com/codespell-project/codespell + rev: v2.2.6 + hooks: + - id: codespell + args: [-w] + + - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks + rev: v2.12.0 + hooks: + - id: pretty-format-yaml + args: [--autofix, --indent, '2', --preserve-quotes, --offset, '2'] + + - repo: https://github.com/lovesegfault/beautysh + rev: v6.2.1 + hooks: + - id: beautysh + + - repo: https://github.com/IamTheFij/docker-pre-commit + rev: v3.0.1 + hooks: + - id: docker-compose-check + + - repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + + - repo: https://github.com/pre-commit/pygrep-hooks + rev: v1.10.0 + hooks: + - id: python-no-log-warn + - id: python-no-eval + - id: rst-backticks + - id: rst-directive-colons + - id: rst-inline-touching-normal + - id: text-unicode-replacement-char + + - repo: https://github.com/asottile/pyupgrade + rev: v3.15.0 + hooks: + - id: pyupgrade + args: [--py37-plus, --keep-runtime-typing] + + - repo: https://github.com/psf/black + rev: 24.1.1 + hooks: + - id: black + language_version: python3 + + - repo: https://github.com/asottile/blacken-docs + rev: 1.16.0 + hooks: + - id: blacken-docs + additional_dependencies: + - black==24.1.1 + + - repo: meta + hooks: + - id: check-hooks-apply + - id: check-useless-excludes + + - repo: https://github.com/PyCQA/autoflake + rev: v2.2.1 + hooks: + - id: autoflake + args: + - --in-place + - --config=setup.cfg + + - repo: local + hooks: + - id: flake8 + name: flake8 + entry: python3 -m flake8 + language: system + types: [python] + files: ^(onetl|tests)/.*$ + pass_filenames: true + - id: mypy + name: mypy + 
entry: python3 -m mypy --config-file setup.cfg onetl + language: system + types: [python] + pass_filenames: false + - id: towncrier + name: towncrier + entry: towncrier build --draft + language: system + types: [rst] + pass_filenames: false ci: skip: - - flake8 # checked with Github Actions - - mypy # checked with Github Actions - - towncrier # checked with Github Actions - - docker-compose-check # cannot run on pre-commit.ci + - flake8 # checked with Github Actions + - mypy # checked with Github Actions + - towncrier # checked with Github Actions + - docker-compose-check # cannot run on pre-commit.ci + - chmod # failing in pre-commit.ci diff --git a/.readthedocs.yml b/.readthedocs.yml index 13358b8b3..efb1a83cd 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -4,18 +4,29 @@ build: os: ubuntu-22.04 tools: python: "3.12" + # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 + commands: + - python -m virtualenv $READTHEDOCS_VIRTUALENV_PATH + - python -m pip install --upgrade --no-cache-dir pip setuptools wheel + - python -m pip install --upgrade --no-cache-dir sphinx readthedocs-sphinx-ext + - python -m pip install --exists-action=w --no-cache-dir -r requirements/docs.txt + - python -m pip install --exists-action=w --no-cache-dir --no-deps sphinx-plantuml + - python -m pip install --exists-action=w --upgrade --upgrade-strategy only-if-needed --no-cache-dir .[ftp,ftps,hdfs,samba,s3,sftp,webdav,spark] + - cat docs/conf.py + - cd docs && python -m sphinx -T -E -b html -d _build/doctrees -D language=en . $READTHEDOCS_OUTPUT/html -python: - install: - - requirements: requirements/docs.txt - - method: pip - path: . - extra_requirements: - - ftp - - ftps - - hdfs - - samba - - s3 - - sftp - - webdav - - spark +# TODO: uncomment after https://github.com/zqmillet/sphinx-plantuml/pull/4 +#python: +# install: +# - requirements: requirements/docs.txt +# - method: pip +# path: . +# extra_requirements: +# - ftp +# - ftps +# - hdfs +# - samba +# - s3 +# - sftp +# - webdav +# - spark diff --git a/.spdx-license-header.txt b/.spdx-license-header.txt new file mode 100644 index 000000000..19a8b2e4d --- /dev/null +++ b/.spdx-license-header.txt @@ -0,0 +1,2 @@ +SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +SPDX-License-Identifier: Apache-2.0 diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 8c21a15f6..cab05a780 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -72,6 +72,9 @@ Create virtualenv and install dependencies: -r requirements/tests/oracle.txt \ -r requirements/tests/spark-3.5.0.txt + # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 + pip install sphinx-plantuml --no-deps + Enable pre-commit hooks ~~~~~~~~~~~~~~~~~~~~~~~ @@ -170,7 +173,7 @@ Without docker-compose To run Greenplum tests, you should: - * Download `Pivotal connector for Spark `_ + * Download `VMware Greenplum connector for Spark `_ * Either move it to ``~/.ivy2/jars/``, or pass file path to ``CLASSPATH`` * Set environment variable ``ONETL_DB_WITH_GREENPLUM=true`` to enable adding connector to Spark session @@ -334,3 +337,44 @@ How to skip change notes check? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Just add ``ci:skip-changelog`` label to pull request. + +Release Process +^^^^^^^^^^^^^^^ + +Before making a release from the ``develop`` branch, follow these steps: + +1. Backup ``NEXT_RELEASE.rst`` + +.. code:: bash + + cp docs/changelog/NEXT_RELEASE.rst docs/changelog/temp_NEXT_RELEASE.rst + +2. Build the Release notes with Towncrier + +.. 
code:: bash + + export VERSION=$(cat onetl/VERSION) + towncrier build --version=${VERSION} + +3. Update Changelog + +.. code:: bash + + mv docs/changelog/NEXT_RELEASE.rst docs/changelog/${VERSION}.rst + +4. Edit the ``${VERSION}.rst`` file +Remove content above the version number heading in the ``${VERSION}.rst`` file. + +5. Update Changelog Index + +.. code:: bash + + awk -v version=${VERSION} '/NEXT_RELEASE/{print;print " " version;next}1' docs/changelog/index.rst > temp && mv temp docs/changelog/index.rst + +6. Reset ``NEXT_RELEASE.rst`` file + +.. code:: bash + + mv docs/changelog/temp_NEXT_RELEASE.rst docs/changelog/NEXT_RELEASE.rst + +7. Update the patch version in the ``VERSION`` file of ``develop`` branch **after release**. diff --git a/LICENSE.txt b/LICENSE.txt index 67c428820..a22e190ad 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright 2023 MTS (Mobile Telesystems). All rights reserved. +Copyright 2021-2024 MTS (Mobile Telesystems). All rights reserved. Apache License Version 2.0, January 2004 diff --git a/README.rst b/README.rst index 01ff7a7f5..0372381ae 100644 --- a/README.rst +++ b/README.rst @@ -59,49 +59,49 @@ Requirements Supported storages ------------------ -+--------------------+--------------+----------------------------------------------------------------------------------------------------------------------+ -| Type | Storage | Powered by | -+====================+==============+======================================================================================================================+ -| Database | Clickhouse | Apache Spark `JDBC Data Source `_ | -+ +--------------+ + -| | MSSQL | | -+ +--------------+ + -| | MySQL | | -+ +--------------+ + -| | Postgres | | -+ +--------------+ + -| | Oracle | | -+ +--------------+ + -| | Teradata | | -+ +--------------+----------------------------------------------------------------------------------------------------------------------+ -| | Hive | Apache Spark `Hive integration `_ | -+ +--------------+----------------------------------------------------------------------------------------------------------------------+ -| | Kafka | Apache Spark `Kafka integration `_ | -+ +--------------+----------------------------------------------------------------------------------------------------------------------+ -| | Greenplum | Pivotal `Greenplum Spark connector `_ | -+ +--------------+----------------------------------------------------------------------------------------------------------------------+ -| | MongoDB | `MongoDB Spark connector `_ | -+--------------------+--------------+----------------------------------------------------------------------------------------------------------------------+ -| File | HDFS | `HDFS Python client `_ | -+ +--------------+----------------------------------------------------------------------------------------------------------------------+ -| | S3 | `minio-py client `_ | -+ +--------------+----------------------------------------------------------------------------------------------------------------------+ -| | SFTP | `Paramiko library `_ | -+ +--------------+----------------------------------------------------------------------------------------------------------------------+ -| | FTP | `FTPUtil library `_ | -+ +--------------+ + -| | FTPS | | -+ +--------------+----------------------------------------------------------------------------------------------------------------------+ -| | WebDAV | `WebdavClient3 library `_ | -+ 
+--------------+----------------------------------------------------------------------------------------------------------------------+ -| | Samba | `pysmb library `_ | -+--------------------+--------------+----------------------------------------------------------------------------------------------------------------------+ -| Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | -| +--------------+ + -| | SparkHDFS | | -| +--------------+----------------------------------------------------------------------------------------------------------------------+ -| | SparkS3 | `Hadoop AWS `_ library | -+--------------------+--------------+----------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Type | Storage | Powered by | ++====================+==============+=========================================================================================================================+ +| Database | Clickhouse | Apache Spark `JDBC Data Source `_ | ++ +--------------+ + +| | MSSQL | | ++ +--------------+ + +| | MySQL | | ++ +--------------+ + +| | Postgres | | ++ +--------------+ + +| | Oracle | | ++ +--------------+ + +| | Teradata | | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Hive | Apache Spark `Hive integration `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Kafka | Apache Spark `Kafka integration `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Greenplum | VMware `Greenplum Spark connector `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | MongoDB | `MongoDB Spark connector `_ | ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ +| File | HDFS | `HDFS Python client `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | S3 | `minio-py client `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | SFTP | `Paramiko library `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | FTP | `FTPUtil library `_ | ++ +--------------+ + +| | FTPS | | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | WebDAV | `WebdavClient3 library `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Samba | `pysmb library `_ | ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | +| +--------------+ + +| | SparkHDFS 
| | +| +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | SparkS3 | `Hadoop AWS `_ library | ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ .. documentation @@ -337,6 +337,9 @@ Read data from MSSQL, transform & write to Hive. options=MSSQL.ReadOptions(fetchsize=10000), ) + # checks that there is data in the table, otherwise raises exception + reader.raise_if_no_data() + # Read data to DataFrame df = reader.run() df.printSchema() diff --git a/docker-compose.yml b/docker-compose.yml index 724e3d084..5b4fea437 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,10 +12,10 @@ services: SPARK_VERSION: 3.5.0 env_file: .env.docker volumes: - - ./:/app/ + - ./:/app/ networks: - - onetl - - default + - onetl + - default ulimits: # https://stackoverflow.com/a/56895801 nofile: @@ -24,31 +24,31 @@ services: # no dependencies from other containers to allow running limited set of tests instead of all greenplum: - image: ${GREENPLUM_IMAGE:-datagrip/greenplum:6.8} + image: ${GREENPLUM_IMAGE:-andruche/greenplum:7.0.0} restart: unless-stopped env_file: .env.dependencies ports: - - 5433:5432 + - 5433:5432 networks: - - onetl + - onetl clickhouse: image: ${CLICKHOUSE_IMAGE:-clickhouse/clickhouse-server:latest-alpine} restart: unless-stopped ports: - - 8123:8123 - - 9001:9000 + - 8123:8123 + - 9001:9000 networks: - - onetl + - onetl zookeeper: image: ${ZOOKEEPER_IMAGE:-bitnami/zookeeper:3.8} ports: - - 2181:2181 + - 2181:2181 networks: - - onetl + - onetl environment: - - ALLOW_ANONYMOUS_LOGIN=yes + - ALLOW_ANONYMOUS_LOGIN=yes healthcheck: test: ["CMD-SHELL", "nc -z localhost 2181 || exit"] interval: 10s @@ -59,11 +59,11 @@ services: image: ${KAFKA_IMAGE:-bitnami/kafka:latest} restart: unless-stopped ports: - - 9093:9093 - - 9095:9095 + - 9093:9093 + - 9095:9095 env_file: .env.dependencies networks: - - onetl + - onetl depends_on: zookeeper: condition: service_healthy @@ -77,37 +77,37 @@ services: image: ${MONGODB_IMAGE:-mongo:latest} restart: unless-stopped ports: - - 27017:27017 + - 27017:27017 env_file: .env.dependencies networks: - - onetl + - onetl mssql: image: ${MSSQL_IMAGE:-mcmoe/mssqldocker:latest} restart: unless-stopped env_file: .env.dependencies ports: - - 1433:1433 + - 1433:1433 networks: - - onetl + - onetl mysql: image: ${MYSQL_IMAGE:-mysql:latest} restart: unless-stopped env_file: .env.dependencies ports: - - 3306:3306 + - 3306:3306 networks: - - onetl + - onetl postgres: image: ${POSTGRES_IMAGE:-postgres:15.2-alpine} restart: unless-stopped env_file: .env.dependencies ports: - - 5432:5432 + - 5432:5432 networks: - - onetl + - onetl hdfs: image: ${HDFS_IMAGE:-mtsrus/hadoop:hadoop2-hdfs} @@ -115,22 +115,22 @@ services: restart: unless-stopped env_file: .env.dependencies ports: - - 9820:9820 # HDFS IPC - - 9870:9870 # WebHDFS - - 9864:9864 # Datanode UI + - 9820:9820 # HDFS IPC + - 9870:9870 # WebHDFS + - 9864:9864 # Datanode UI volumes: - - ./docker/hdfs/conf/hadoop/:/var/hadoop/conf/ + - ./docker/hdfs/conf/hadoop/:/var/hadoop/conf/ networks: - - onetl + - onetl oracle: image: ${ORACLE_IMAGE:-gvenzl/oracle-free:23.3-slim-faststart} restart: unless-stopped env_file: .env.dependencies ports: - - 1522:1521 + - 1522:1521 networks: - - onetl + - onetl healthcheck: test: ["CMD", "healthcheck.sh"] interval: 10s @@ -142,75 +142,75 @@ services: restart: unless-stopped env_file: 
.env.dependencies environment: - - USE_SSL=false - - PASSV_MIN_PORT=30000 - - PASSV_MAX_PORT=30010 + - USE_SSL=false + - PASSV_MIN_PORT=30000 + - PASSV_MAX_PORT=30010 ports: - - 2121:21 - - 30000-30010:30000-30010 + - 2121:21 + - 30000-30010:30000-30010 volumes: - - ./docker/ftp/on_post_init.sh:/sources/ftps/eventscripts/on_post_init.sh + - ./docker/ftp/on_post_init.sh:/sources/ftps/eventscripts/on_post_init.sh networks: - - onetl + - onetl ftps: image: ${FTPS_IMAGE:-chonjay21/ftps:latest} restart: unless-stopped env_file: .env.dependencies environment: - - USE_SSL=true - - PASSV_MIN_PORT=30020 - - PASSV_MAX_PORT=30030 + - USE_SSL=true + - PASSV_MIN_PORT=30020 + - PASSV_MAX_PORT=30030 ports: - - 2122:21 - - 30020-30030:30020-30030 + - 2122:21 + - 30020-30030:30020-30030 volumes: - - ./docker/ftp/on_post_init.sh:/sources/ftps/eventscripts/on_post_init.sh + - ./docker/ftp/on_post_init.sh:/sources/ftps/eventscripts/on_post_init.sh networks: - - onetl + - onetl samba: image: elswork/samba restart: unless-stopped ports: - - "139:139" - - "445:445" + - "139:139" + - "445:445" volumes: - - ./docker/samba/custom_entrypoint.sh:/custom_entrypoint.sh + - ./docker/samba/custom_entrypoint.sh:/custom_entrypoint.sh entrypoint: ["/custom_entrypoint.sh"] networks: - - onetl + - onetl s3: image: ${S3_IMAGE:-bitnami/minio:latest} restart: unless-stopped env_file: .env.dependencies ports: - - 9010:9000 - - 9011:9001 + - 9010:9000 + - 9011:9001 networks: - - onetl + - onetl sftp: image: ${SFTP_IMAGE:-linuxserver/openssh-server} restart: unless-stopped env_file: .env.dependencies ports: - - 2222:2222 + - 2222:2222 networks: - - onetl + - onetl webdav: image: ${WEBDAV_IMAGE:-chonjay21/webdav:latest} restart: unless-stopped env_file: .env.dependencies ports: - - 8000:80 + - 8000:80 volumes: # Remove after https://github.com/chonjay21/docker-webdav/pull/3 - - ./docker/webdav/on_post_init.sh:/sources/webdav/eventscripts/on_post_init.sh + - ./docker/webdav/on_post_init.sh:/sources/webdav/eventscripts/on_post_init.sh networks: - - onetl + - onetl networks: onetl: diff --git a/docs/changelog/0.10.0.rst b/docs/changelog/0.10.0.rst index 79cc1d2d5..e546d150c 100644 --- a/docs/changelog/0.10.0.rst +++ b/docs/changelog/0.10.0.rst @@ -45,7 +45,6 @@ Breaking Changes - New ``HWM`` classes have flat structure instead of nested. - New ``HWM`` classes have mandatory ``name`` attribute (it was known as ``qualified_name`` before). - Type aliases used while serializing and deserializing ``HWM`` objects to ``dict`` representation were changed too: ``int`` -> ``column_int``. - - HWM Store implementations now can handle only new ``HWM`` classes, old ones are **NOT** supported. To make migration simpler, you can use new method: @@ -65,14 +64,192 @@ Breaking Changes - YAMLHWMStore **CANNOT read files created by older onETL versions** (0.9.x or older). - If you use it, please: + .. dropdown:: Update procedure - * Find ``.yml`` file for specific HWM. Path can be found in logs, it is usually in form ``/home/USERNAME/.local/share/onETL/yml_hwm_store/QUALIFIED_NAME.yml`` (on Linux). - * Take latest ``value`` from file content. - * Delete the file. - * Update ``DBReader(where=...)`` value to include filter like ``hwm_column >= old_value`` (it should match column type). - * Run your code. ``DBReader.run()`` will get new HWM value, and save it to ``.yml`` file with new structure. - * Undo changes of ``DBReader(where=...)``. + .. code-block:: python + + # pip install onetl==0.9.5 + + # Get qualified_name for HWM + + + # Option 1. 
HWM is built manually + from etl_entities import IntHWM, FileListHWM + from etl_entities.source import Column, Table, RemoteFolder + from etl_entities.process import Process + + # for column HWM + old_column_hwm = IntHWM( + process=Process(name="myprocess", task="abc", dag="cde", host="myhost"), + source=Table(name="schema.table", instance="postgres://host:5432/db"), + column=Column(name="col1"), + ) + qualified_name = old_column_hwm.qualified_name + # "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost" + + # for file HWM + old_file_hwm = FileListHWM( + process=Process(name="myprocess", task="abc", dag="cde", host="myhost"), + source=RemoteFolder(name="/absolute/path", instance="ftp://ftp.server:21"), + ) + qualified_name = old_file_hwm.qualified_name + # "file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost" + + + # Option 2. HWM is generated automatically (by DBReader/FileDownloader) + # See onETL logs and search for string like qualified_name = '...' + + qualified_name = "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost" + + + # Get .yml file path by qualified_name + + import os + from pathlib import PurePosixPath + from onetl.hwm.store import YAMLHWMStore + + # here you should pass the same arguments as used on production, if any + yaml_hwm_store = YAMLHWMStore() + hwm_path = yaml_hwm_store.get_file_path(qualified_name) + print(hwm_path) + + # for column HWM + # LocalPosixPath('/home/maxim/.local/share/onETL/yml_hwm_store/col1__schema.table__postgres_host_5432_db__cde.abc.myprocess__myhost.yml') + + # for file HWM + # LocalPosixPath('/home/maxim/.local/share/onETL/yml_hwm_store/file_list__absolute_path__ftp_ftp.server_21__cde.abc.myprocess__myhost.yml') + + + # Read raw .yml file content + + from yaml import safe_load, dump + + raw_old_hwm_items = safe_load(hwm_path.read_text()) + print(raw_old_hwm_items) + + # for column HWM + # [ + # { + # "column": { "name": "col1", "partition": {} }, + # "modified_time": "2023-12-18T10: 39: 47.377378", + # "process": { "dag": "cde", "host": "myhost", "name": "myprocess", "task": "abc" }, + # "source": { "instance": "postgres: //host:5432/db", "name": "schema.table" }, + # "type": "int", + # "value": "123", + # }, + # ] + + # for file HWM + # [ + # { + # "modified_time": "2023-12-18T11:15:36.478462", + # "process": { "dag": "cde", "host": "myhost", "name": "myprocess", "task": "abc" }, + # "source": { "instance": "ftp://ftp.server:21", "name": "/absolute/path" }, + # "type": "file_list", + # "value": ["file1.txt", "file2.txt"], + # }, + # ] + + + # Convert file content to new structure, compatible with onETL 0.10.x + raw_new_hwm_items = [] + for old_hwm in raw_old_hwm_items: + new_hwm = {"name": qualified_name, "modified_time": old_hwm["modified_time"]} + + if "column" in old_hwm: + new_hwm["expression"] = old_hwm["column"]["name"] + new_hwm["entity"] = old_hwm["source"]["name"] + old_hwm.pop("process", None) + + if old_hwm["type"] == "int": + new_hwm["type"] = "column_int" + new_hwm["value"] = old_hwm["value"] + + elif old_hwm["type"] == "date": + new_hwm["type"] = "column_date" + new_hwm["value"] = old_hwm["value"] + + elif old_hwm["type"] == "datetime": + new_hwm["type"] = "column_datetime" + new_hwm["value"] = old_hwm["value"] + + elif old_hwm["type"] == "file_list": + new_hwm["type"] = "file_list" + new_hwm["value"] = [ + os.fspath(PurePosixPath(old_hwm["source"]["name"]).joinpath(path)) + for path in old_hwm["value"] + ] + + else: + raise ValueError("WAT?") + + 
raw_new_hwm_items.append(new_hwm)
+
+
+    print(raw_new_hwm_items)
+    # for column HWM
+    # [
+    #     {
+    #         "name": "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost",
+    #         "modified_time": "2023-12-18T10:39:47.377378",
+    #         "expression": "col1",
+    #         "source": "schema.table",
+    #         "type": "column_int",
+    #         "value": 123,
+    #     },
+    # ]
+
+    # for file HWM
+    # [
+    #     {
+    #         "name": "file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost",
+    #         "modified_time": "2023-12-18T11:15:36.478462",
+    #         "entity": "/absolute/path",
+    #         "type": "file_list",
+    #         "value": ["/absolute/path/file1.txt", "/absolute/path/file2.txt"],
+    #     },
+    # ]
+
+
+    # Save file with new content
+    with open(hwm_path, "w") as file:
+        dump(raw_new_hwm_items, file)
+
+
+    # Stop Python interpreter and update onETL
+    # pip install onetl==0.10.0
+    # Check that new .yml file can be read
+
+    from onetl.hwm.store import YAMLHWMStore
+
+    qualified_name = ...
+
+    # here you should pass the same arguments as used on production, if any
+    yaml_hwm_store = YAMLHWMStore()
+    yaml_hwm_store.get_hwm(qualified_name)
+
+    # for column HWM
+    # ColumnIntHWM(
+    #     name='col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost',
+    #     description='',
+    #     entity='schema.table',
+    #     value=123,
+    #     expression='col1',
+    #     modified_time=datetime.datetime(2023, 12, 18, 10, 39, 47, 377378),
+    # )
+
+    # for file HWM
+    # FileListHWM(
+    #     name='file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost',
+    #     description='',
+    #     entity=AbsolutePath('/absolute/path'),
+    #     value=frozenset({AbsolutePath('/absolute/path/file1.txt'), AbsolutePath('/absolute/path/file2.txt')}),
+    #     expression=None,
+    #     modified_time=datetime.datetime(2023, 12, 18, 11, 15, 36, 478462)
+    # )
+
+
+    # That's all! But most users use other HWM store implementations which do not have such issues.

@@ -196,7 +373,7 @@ Breaking Changes

   Automatic ``name`` generation using the old ``DBReader.hwm_column`` / ``FileDownloader.hwm_type`` syntax is still supported, but will be removed in v1.0.0 release. (:github:pull:`179`)

-- Performance of read Incremental abd Batch strategies has been drastically improved. (:github:pull:`182`).
+- Performance of read Incremental and Batch strategies has been drastically improved. (:github:pull:`182`).

  .. dropdown:: Before and after in details

diff --git a/docs/changelog/0.10.1.rst b/docs/changelog/0.10.1.rst
new file mode 100644
index 000000000..7f30ca284
--- /dev/null
+++ b/docs/changelog/0.10.1.rst
@@ -0,0 +1,28 @@
+0.10.1 (2024-02-05)
+===================
+
+Features
+--------
+
+- Add support for ``Incremental Strategies`` in the ``Kafka`` connection:
+
+  .. code-block:: python
+
+    reader = DBReader(
+        connection=Kafka(...),
+        source="topic_name",
+        hwm=AutoDetectHWM(name="some_hwm_name", expression="offset"),
+    )
+
+    with IncrementalStrategy():
+        df = reader.run()
+
+  This lets you resume reading data from a Kafka topic starting at the last committed offset from your previous run. (:github:pull:`202`)
+- Add ``has_data``, ``raise_if_no_data`` methods to ``DBReader`` class. (:github:pull:`203`)
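A minimal sketch of how the two new ``DBReader`` methods might be used (the ``reader`` construction below is a placeholder, not part of this diff; ``has_data`` returning a boolean is inferred from its name and the ``db_reader.rst`` autodoc entries later in this diff, while ``raise_if_no_data`` raising on an empty source matches the README example added by this change):

.. code-block:: python

    from onetl.db import DBReader

    reader = DBReader(connection=..., source=...)  # hypothetical connection/source

    # branch on availability of data:
    if reader.has_data():
        df = reader.run()

    # or fail fast: raises an exception when the source is empty
    reader.raise_if_no_data()
    df = reader.run()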
+- Update VMware Greenplum connector from ``2.1.4`` to ``2.3.0``. This implies:
+
+  * Greenplum 7.x support
+  * `Kubernetes support `_
+  * New read option `gpdb.matchDistributionPolicy `_
+    which allows matching each Spark executor with a specific Greenplum segment, avoiding redundant data transfer between Greenplum segments
+  * Allows overriding `Greenplum optimizer parameters `_ in read/write operations (:github:pull:`208`)
+- ``Greenplum.get_packages()`` method now accepts an optional arg ``package_version``, which allows overriding the version of the Greenplum connector package. (:github:pull:`208`)
diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst
index 29163e700..557ac69b4 100644
--- a/docs/changelog/index.rst
+++ b/docs/changelog/index.rst
@@ -4,6 +4,7 @@

    DRAFT
    NEXT_RELEASE
+   0.10.1
    0.10.0
    0.9.5
    0.9.4
diff --git a/docs/conf.py b/docs/conf.py
index 291f04069..958bfe1c5 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -25,7 +25,7 @@

 # -- Project information -----------------------------------------------------

 project = "onETL"
-copyright = "2023, DataOps.ETL"
+copyright = "2021-2024 MTS (Mobile Telesystems)"
 author = "DataOps.ETL"

 # The version info for the project you're documenting, acts as replacement for
diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst
index 57db9635e..4f33b51a0 100644
--- a/docs/connection/db_connection/greenplum/prerequisites.rst
+++ b/docs/connection/db_connection/greenplum/prerequisites.rst
@@ -6,11 +6,11 @@ Prerequisites
 Version Compatibility
 ---------------------

-* Greenplum server versions: 5.x, 6.x
+* Greenplum server versions: 5.x, 6.x, 7.x
 * Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet)
 * Java versions: 8 - 11

-See `official documentation `_.
+See `official documentation `_.

 Installing PySpark
 ------------------
@@ -20,11 +20,11 @@
 BEFORE creating the connector instance.

 See :ref:`install-spark` installation instruction for more details.

-Downloading Pivotal package
----------------------------
+Downloading VMware package
+--------------------------

 To use Greenplum connector you should download connector ``.jar`` file from
-`Pivotal website `_
+`VMware website `_
 and then pass it to Spark session.

 .. warning::
@@ -46,12 +46,12 @@ Interaction schema
 Spark executors open ports to listen incoming requests.
 Greenplum segments are initiating connections to Spark executors using
 `EXTERNAL TABLE `_
-functionality, and send/read data using `gpfdist `_ protocol.
+functionality, and send/read data using `gpfdist protocol `_.

 Data is **not** send through Greenplum master.
 Greenplum master only receives commands to start reading/writing process,
 and manages all the metadata (external table location, schema and so on).

-More details can be found in `official documentation `_.
+More details can be found in `official documentation `_.

 Number of parallel connections
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -60,7 +60,7 @@

     This is very important!!!

-    If you don't limit number of connections, you can exceed the `max_connections `_
+    If you don't limit number of connections, you can exceed the `max_connections `_
     limit set on the Greenplum side. It's usually not so high, e.g. 500-1000 connections max,
     depending on your Greenplum instance settings and using connection balancers like ``pgbouncer``.
@@ -125,7 +125,7 @@ Number of connections can be limited by 2 ways:
         },
     )

-See `connection pooling `_
+See `connection pooling `_
 documentation.
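To make the ``max_connections`` warning above concrete: each concurrently running Spark task holds roughly one connection to Greenplum, so capping executor resources caps the connection count. A minimal sketch using standard Spark options (the numbers are placeholders; the right limits depend on your cluster and the Greenplum ``max_connections`` setting):

.. code-block:: python

    from pyspark.sql import SparkSession

    # 2 executors * 2 cores each = at most 4 concurrent tasks,
    # i.e. roughly 4 parallel connections to Greenplum segments
    spark = (
        SparkSession.builder
        .config("spark.executor.instances", "2")
        .config("spark.executor.cores", "2")
        .config("spark.dynamicAllocation.enabled", "false")
        .getOrCreate()
    )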
@@ -138,11 +138,19 @@ Allowing connection to Greenplum master Ask your Greenplum cluster administrator to allow your user to connect to Greenplum master node, e.g. by updating ``pg_hba.conf`` file. -More details can be found in `official documentation `_. +More details can be found in `official documentation `_. Network ports ~~~~~~~~~~~~~ +Spark with ``master=k8s`` +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Please follow `the official documentation `_ + +Spark with ``master=yarn`` or ``master=local`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + To read data from Greenplum using Spark, following ports should be opened in firewall between Spark and Greenplum: * Spark driver and all Spark executors -> port ``5432`` on Greenplum master node. @@ -153,7 +161,7 @@ To read data from Greenplum using Spark, following ports should be opened in fir Greenplum(host="master.host", port=5432, ...) -* Greenplum segments -> some port range (e.g. ``41000-42000``) **listened by Spark executor**. +* Greenplum segments -> some port range (e.g. ``41000-42000``) **listened by Spark executors**. This range should be set in ``extra`` option: @@ -168,18 +176,18 @@ To read data from Greenplum using Spark, following ports should be opened in fir Number of ports in this range is ``number of parallel running Spark sessions`` * ``number of parallel connections per session``. - Number of connections per session (see below) is usually less than ``30`` (see below). + Number of connections per session (see below) is usually less than ``30`` (see above). Number of session depends on your environment: * For ``master=local`` only few ones-tens sessions can be started on the same host, depends on available RAM and CPU. - * For ``master=yarn`` / ``master=k8s`` hundreds or thousands of sessions can be started simultaneously, + * For ``master=yarn`` hundreds or thousands of sessions can be started simultaneously, but they are executing on different cluster nodes, so one port can be opened on different nodes at the same time. More details can be found in official documentation: - * `port requirements `_ - * `format of server.port value `_ - * `port troubleshooting `_ + * `port requirements `_ + * `format of server.port value `_ + * `port troubleshooting `_ Required grants ~~~~~~~~~~~~~~~ @@ -189,27 +197,74 @@ used for creating a connection: .. tabs:: - .. code-tab:: sql Reading & writing - + .. code-tab:: sql Read + write + + -- get access to get tables metadata & cluster information + GRANT SELECT ON information_schema.tables TO username; + GRANT SELECT ON pg_attribute TO username; + GRANT SELECT ON pg_class TO username; + GRANT SELECT ON pg_namespace TO username; + GRANT SELECT ON pg_settings TO username; + GRANT SELECT ON pg_stats TO username; + GRANT SELECT ON gp_distributed_xacts TO username; + GRANT SELECT ON gp_segment_configuration TO username; + -- Greenplum 5.x only + GRANT SELECT ON gp_distribution_policy TO username; + + -- allow creating external tables in the same schema as source/target table GRANT USAGE ON SCHEMA myschema TO username; GRANT CREATE ON SCHEMA myschema TO username; - GRANT SELECT, INSERT ON SCHEMA myschema.mytable TO username; ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); - .. code-tab:: sql Reading from Greenplum - + -- allow read access to specific table (to get column types) + -- allow write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username; + + .. 
code-tab:: sql Read only + + -- get access to get tables metadata & cluster information + GRANT SELECT ON information_schema.tables TO username; + GRANT SELECT ON pg_attribute TO username; + GRANT SELECT ON pg_class TO username; + GRANT SELECT ON pg_namespace TO username; + GRANT SELECT ON pg_settings TO username; + GRANT SELECT ON pg_stats TO username; + GRANT SELECT ON gp_distributed_xacts TO username; + GRANT SELECT ON gp_segment_configuration TO username; + -- Greenplum 5.x only + GRANT SELECT ON gp_distribution_policy TO username; + + -- allow creating external tables in the same schema as source table GRANT USAGE ON SCHEMA schema_to_read TO username; GRANT CREATE ON SCHEMA schema_to_read TO username; - GRANT SELECT ON SCHEMA schema_to_read.table_to_read TO username; -- yes, ``writable``, because data is written from Greenplum to Spark executor. ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); - .. code-tab:: sql Writing to Greenplum + -- allow read access to specific table + GRANT SELECT ON schema_to_read.table_to_read TO username; + + .. code-tab:: sql Write only + -- get access to get tables metadata & cluster information + GRANT SELECT ON information_schema.tables TO username; + GRANT SELECT ON pg_attribute TO username; + GRANT SELECT ON pg_class TO username; + GRANT SELECT ON pg_namespace TO username; + GRANT SELECT ON pg_settings TO username; + GRANT SELECT ON pg_stats TO username; + GRANT SELECT ON gp_distributed_xacts TO username; + GRANT SELECT ON gp_segment_configuration TO username; + -- Greenplum 5.x only + GRANT SELECT ON gp_distribution_policy TO username; + + -- allow creating external tables in the same schema as target table GRANT USAGE ON SCHEMA schema_to_write TO username; GRANT CREATE ON SCHEMA schema_to_write TO username; - GRANT SELECT, INSERT ON SCHEMA schema_to_write.table_to_write TO username; -- yes, ``readable``, because data is read from Spark executor to Greenplum. ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist'); -More details can be found in `official documentation `_. + -- allow read access to specific table (to get column types) + -- allow write access to specific table + GRANT SELECT, INSERT ON schema_to_write.table_to_write TO username; + +More details can be found in `official documentation `_. diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst index 30d669fea..feecaf0bb 100644 --- a/docs/connection/db_connection/greenplum/read.rst +++ b/docs/connection/db_connection/greenplum/read.rst @@ -97,7 +97,7 @@ Reading from views This connector is **NOT** designed to read data from views. You can technically read data from a view which has -`gp_segment_id `_ column. +`gp_segment_id `_ column. But this is **not** recommended because each Spark executor will run the same query, which may lead to running duplicated calculations and sending data between segments only to skip most of the result and select only small part. @@ -139,7 +139,7 @@ to write data to staging table without generating useless WAL logs. Mapping of Greenplum types to Spark types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -See `official documentation `_ +See `official documentation `_ for more details. onETL does not perform any additional casting of types while reading data. 
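Tying the read-side documentation above together, a minimal sketch of reading a Greenplum table with onETL (host, credentials and the port range are placeholders; the connector ``.jar`` must already be on the Spark classpath, as described in the prerequisites):

.. code-block:: python

    from onetl.connection import Greenplum
    from onetl.db import DBReader

    greenplum = Greenplum(
        host="master.host",  # Greenplum master node
        port=5432,
        user="username",
        password="***",
        database="mydb",
        extra={"server.port": "41000-42000"},  # port range listened by Spark executors
        spark=spark,  # an existing SparkSession with the connector .jar loaded
    )

    reader = DBReader(connection=greenplum, source="schema_to_read.table_to_read")
    df = reader.run()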
diff --git a/docs/connection/db_connection/greenplum/write.rst b/docs/connection/db_connection/greenplum/write.rst index c7a4f1560..8dfa5a4c6 100644 --- a/docs/connection/db_connection/greenplum/write.rst +++ b/docs/connection/db_connection/greenplum/write.rst @@ -93,7 +93,7 @@ Recommendations Mapping of Spark types to Greenplum types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -See `official documentation `_ +See `official documentation `_ for more details. onETL does not perform any additional casting of types while writing data. diff --git a/docs/connection/db_connection/kafka/read.rst b/docs/connection/db_connection/kafka/read.rst index d502c453e..8b2917943 100644 --- a/docs/connection/db_connection/kafka/read.rst +++ b/docs/connection/db_connection/kafka/read.rst @@ -5,10 +5,6 @@ Reading from Kafka For reading data from Kafka, use :obj:`DBReader ` with specific options (see below). -.. warning:: - - Currently, Kafka does not support :ref:`strategy`. You can only read the **whole** topic. - .. note:: Unlike other connection classes, Kafka always return dataframe with fixed schema diff --git a/docs/db/db_reader.rst b/docs/db/db_reader.rst index 4e256d42c..3571be7bc 100644 --- a/docs/db/db_reader.rst +++ b/docs/db/db_reader.rst @@ -9,6 +9,8 @@ DB Reader DBReader DBReader.run + DBReader.has_data + DBReader.raise_if_no_data .. autoclass:: DBReader - :members: run + :members: run, has_data, raise_if_no_data diff --git a/docs/make.bat b/docs/make.bat index 8084272b4..53ad1e82c 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -5,7 +5,7 @@ pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build + set SPHINXBUILD=sphinx-build ) set SOURCEDIR=. set BUILDDIR=_build @@ -14,15 +14,15 @@ if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% diff --git a/onetl/VERSION b/onetl/VERSION index 78bc1abd1..571215736 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.10.0 +0.10.1 diff --git a/onetl/__init__.py b/onetl/__init__.py index 454896bbe..04793543c 100644 --- a/onetl/__init__.py +++ b/onetl/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 import os from onetl.plugins import import_plugins diff --git a/onetl/_internal.py b/onetl/_internal.py index 1476a6414..3b533fac0 100644 --- a/onetl/_internal.py +++ b/onetl/_internal.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 """ Helpers """ diff --git a/onetl/_util/__init__.py b/onetl/_util/__init__.py index e69de29bb..07325b1d1 100644 --- a/onetl/_util/__init__.py +++ b/onetl/_util/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/_util/classproperty.py b/onetl/_util/classproperty.py index e7ec3ff36..8738304dc 100644 --- a/onetl/_util/classproperty.py +++ b/onetl/_util/classproperty.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_util/file.py b/onetl/_util/file.py index f51a9b210..06e6ef047 100644 --- a/onetl/_util/file.py +++ b/onetl/_util/file.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import hashlib diff --git a/onetl/_util/hadoop.py b/onetl/_util/hadoop.py index 242be692f..1faee4188 100644 --- a/onetl/_util/hadoop.py +++ b/onetl/_util/hadoop.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING diff --git a/onetl/_util/java.py b/onetl/_util/java.py index a29b10ca0..df88b1a59 100644 --- a/onetl/_util/java.py +++ b/onetl/_util/java.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING diff --git a/onetl/_util/scala.py b/onetl/_util/scala.py index b27280654..ec5d53fd8 100644 --- a/onetl/_util/scala.py +++ b/onetl/_util/scala.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from onetl._util.version import Version diff --git a/onetl/_util/spark.py b/onetl/_util/spark.py index cbedbdf7b..218c8d7de 100644 --- a/onetl/_util/spark.py +++ b/onetl/_util/spark.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import textwrap diff --git a/onetl/_util/version.py b/onetl/_util/version.py index 7c7bc157b..87583ee63 100644 --- a/onetl/_util/version.py +++ b/onetl/_util/version.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from functools import total_ordering diff --git a/onetl/base/__init__.py b/onetl/base/__init__.py index 733b2d524..4178e7c9b 100644 --- a/onetl/base/__init__.py +++ b/onetl/base/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.base.base_connection import BaseConnection from onetl.base.base_db_connection import BaseDBConnection, BaseDBDialect from onetl.base.base_file_connection import BaseFileConnection diff --git a/onetl/base/base_connection.py b/onetl/base/base_connection.py index e67af2ae0..dc2cbd4f3 100644 --- a/onetl/base/base_connection.py +++ b/onetl/base/base_connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod from typing import TypeVar diff --git a/onetl/base/base_db_connection.py b/onetl/base/base_db_connection.py index 88c9b11a8..c11fb802e 100644 --- a/onetl/base/base_db_connection.py +++ b/onetl/base/base_db_connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from abc import ABC, abstractmethod @@ -21,7 +9,7 @@ from onetl.hwm import Window if TYPE_CHECKING: - from etl_entities.hwm import HWM, ColumnHWM + from etl_entities.hwm import HWM from pyspark.sql import DataFrame from pyspark.sql.types import StructField, StructType @@ -107,7 +95,7 @@ def validate_hint(self, hint: Any) -> Any | None: """ @abstractmethod - def detect_hwm_class(self, field: StructField) -> type[ColumnHWM] | None: + def detect_hwm_class(self, field: StructField) -> type[HWM] | None: """ Detects hwm column type based on specific data types in connections data stores """ diff --git a/onetl/base/base_file_connection.py b/onetl/base/base_file_connection.py index 5c3b50aed..3d1aa2357 100644 --- a/onetl/base/base_file_connection.py +++ b/onetl/base/base_file_connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/base/base_file_df_connection.py b/onetl/base/base_file_df_connection.py index fef5a38cc..c432bd5a2 100644 --- a/onetl/base/base_file_df_connection.py +++ b/onetl/base/base_file_df_connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/base/base_file_filter.py b/onetl/base/base_file_filter.py index 1b2176d2a..641a5b3ed 100644 --- a/onetl/base/base_file_filter.py +++ b/onetl/base/base_file_filter.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from abc import ABC, abstractmethod diff --git a/onetl/base/base_file_format.py b/onetl/base/base_file_format.py index c1ac32dd1..b1f32aa07 100644 --- a/onetl/base/base_file_format.py +++ b/onetl/base/base_file_format.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from abc import ABC, abstractmethod diff --git a/onetl/base/base_file_limit.py b/onetl/base/base_file_limit.py index fdb91d4a4..f71f1910c 100644 --- a/onetl/base/base_file_limit.py +++ b/onetl/base/base_file_limit.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from abc import ABC, abstractmethod diff --git a/onetl/base/contains_exception.py b/onetl/base/contains_exception.py index 2b46fcdef..cf9ae5aac 100644 --- a/onetl/base/contains_exception.py +++ b/onetl/base/contains_exception.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from typing_extensions import Protocol, runtime_checkable diff --git a/onetl/base/contains_get_df_schema.py b/onetl/base/contains_get_df_schema.py index 917811c91..ccb7d34a4 100644 --- a/onetl/base/contains_get_df_schema.py +++ b/onetl/base/contains_get_df_schema.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING diff --git a/onetl/base/contains_get_min_max_values.py b/onetl/base/contains_get_min_max_values.py index 9ac74b91d..d23029d52 100644 --- a/onetl/base/contains_get_min_max_values.py +++ b/onetl/base/contains_get_min_max_values.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Any diff --git a/onetl/base/path_protocol.py b/onetl/base/path_protocol.py index 703bbf2e3..68594a108 100644 --- a/onetl/base/path_protocol.py +++ b/onetl/base/path_protocol.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing_extensions import Protocol, runtime_checkable diff --git a/onetl/base/path_stat_protocol.py b/onetl/base/path_stat_protocol.py index 830c46261..a42f0569d 100644 --- a/onetl/base/path_stat_protocol.py +++ b/onetl/base/path_stat_protocol.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing_extensions import Protocol, runtime_checkable diff --git a/onetl/base/pure_path_protocol.py b/onetl/base/pure_path_protocol.py index d7c21f1f1..d5757c3b4 100644 --- a/onetl/base/pure_path_protocol.py +++ b/onetl/base/pure_path_protocol.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Sequence, TypeVar diff --git a/onetl/base/supports_rename_dir.py b/onetl/base/supports_rename_dir.py index 1b774f5b5..703866992 100644 --- a/onetl/base/supports_rename_dir.py +++ b/onetl/base/supports_rename_dir.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os @@ -32,5 +20,4 @@ def rename_dir( source_dir_path: str | os.PathLike, target_dir_path: str | os.PathLike, replace: bool = False, - ) -> PathWithStatsProtocol: - ... + ) -> PathWithStatsProtocol: ... 
diff --git a/onetl/connection/__init__.py b/onetl/connection/__init__.py index 3e40e2a2a..4a25e2102 100644 --- a/onetl/connection/__init__.py +++ b/onetl/connection/__init__.py @@ -1,16 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from importlib import import_module diff --git a/onetl/connection/db_connection/__init__.py b/onetl/connection/db_connection/__init__.py index e69de29bb..07325b1d1 100644 --- a/onetl/connection/db_connection/__init__.py +++ b/onetl/connection/db_connection/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/connection/db_connection/clickhouse/__init__.py b/onetl/connection/db_connection/clickhouse/__init__.py index 0fbebdd70..b830a78d7 100644 --- a/onetl/connection/db_connection/clickhouse/__init__.py +++ b/onetl/connection/db_connection/clickhouse/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.clickhouse.connection import ( Clickhouse, ClickhouseExtra, diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 94e46c739..63eeb455e 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -1,30 +1,16 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging import warnings -from typing import Any, ClassVar, Optional +from typing import ClassVar, Optional from onetl._util.classproperty import classproperty from onetl.connection.db_connection.clickhouse.dialect import ClickhouseDialect from onetl.connection.db_connection.jdbc_connection import JDBCConnection -from onetl.connection.db_connection.jdbc_connection.options import JDBCReadOptions from onetl.connection.db_connection.jdbc_mixin import JDBCStatementType from onetl.hooks import slot, support_hooks -from onetl.hwm import Window from onetl.impl import GenericOptions # do not import PySpark here, as we allow user to use `Clickhouse.get_packages()` for creating Spark session @@ -110,8 +96,6 @@ class Clickhouse(JDBCConnection): from onetl.connection import Clickhouse from pyspark.sql import SparkSession - extra = {"continueBatchOnError": "false"} - # Create Spark session with Clickhouse driver loaded maven_packages = Clickhouse.get_packages() spark = ( @@ -125,7 +109,7 @@ class Clickhouse(JDBCConnection): host="database.host.or.ip", user="user", password="*****", - extra=extra, + extra={"continueBatchOnError": "false"}, spark=spark, ) @@ -175,27 +159,6 @@ def jdbc_url(self) -> str: return f"jdbc:clickhouse://{self.host}:{self.port}?{parameters}".rstrip("?") - @slot - def get_min_max_values( - self, - source: str, - window: Window, - hint: Any | None = None, - where: Any | None = None, - options: JDBCReadOptions | None = None, - ) -> tuple[Any, Any]: - min_value, max_value = super().get_min_max_values( - source=source, - window=window, - hint=hint, - where=where, - options=options, - ) - # Clickhouse for some reason return min/max=0 if there are no rows - if min_value == max_value == 0: - return None, None - return min_value, max_value - @staticmethod def _build_statement( statement: str, diff --git a/onetl/connection/db_connection/clickhouse/dialect.py b/onetl/connection/db_connection/clickhouse/dialect.py index 3fa6eb9ad..187b2e787 100644 --- a/onetl/connection/db_connection/clickhouse/dialect.py +++ b/onetl/connection/db_connection/clickhouse/dialect.py @@ -1,20 +1,9 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from datetime import date, datetime +from typing import Any from onetl.connection.db_connection.jdbc_connection import JDBCDialect @@ -26,6 +15,16 @@ def get_partition_column_hash(self, partition_column: str, num_partitions: int) def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str: return f"{partition_column} % {num_partitions}" + def get_max_value(self, value: Any) -> str: + # Max function in Clickhouse returns 0 instead of NULL for empty table + result = self._serialize_value(value) + return f"maxOrNull({result})" + + def get_min_value(self, value: Any) -> str: + # Min function in Clickhouse returns 0 instead of NULL for empty table + result = self._serialize_value(value) + return f"minOrNull({result})" + def _serialize_datetime(self, value: datetime) -> str: result = value.strftime("%Y-%m-%d %H:%M:%S") return f"CAST('{result}' AS DateTime)" diff --git a/onetl/connection/db_connection/db_connection/__init__.py b/onetl/connection/db_connection/db_connection/__init__.py index 9eb19e84f..71439ddc6 100644 --- a/onetl/connection/db_connection/db_connection/__init__.py +++ b/onetl/connection/db_connection/db_connection/__init__.py @@ -1,16 +1,4 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.db_connection.connection import DBConnection from onetl.connection.db_connection.db_connection.dialect import DBDialect diff --git a/onetl/connection/db_connection/db_connection/connection.py b/onetl/connection/db_connection/db_connection/connection.py index 731ef872d..ff3e0af7f 100644 --- a/onetl/connection/db_connection/db_connection/connection.py +++ b/onetl/connection/db_connection/db_connection/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
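
The `maxOrNull`/`minOrNull` dialect methods added above replace the `get_min_max_values` override removed from the Clickhouse connection class: instead of post-processing a bogus `0` returned for an empty table, the generated SQL itself now yields NULL. A minimal self-contained sketch of what the new methods emit (`_serialize_value` is simplified to identity here, so this is illustrative, not the actual implementation):

```python
# ClickHouse aggregates return a type default (0) for empty input:
#   SELECT max(id) FROM events       -- 0 when `events` is empty
# The *OrNull combinators return NULL instead:
#   SELECT maxOrNull(id) FROM events -- NULL when `events` is empty

def get_min_value(column: str) -> str:
    # mirrors ClickhouseDialect.get_min_value from the hunk above
    return f"minOrNull({column})"

def get_max_value(column: str) -> str:
    # mirrors ClickhouseDialect.get_max_value from the hunk above
    return f"maxOrNull({column})"

assert get_min_value("id") == "minOrNull(id)"
assert get_max_value("id") == "maxOrNull(id)"
```
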
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from logging import getLogger diff --git a/onetl/connection/db_connection/db_connection/dialect.py b/onetl/connection/db_connection/db_connection/dialect.py index da52e817c..73efba338 100644 --- a/onetl/connection/db_connection/db_connection/dialect.py +++ b/onetl/connection/db_connection/db_connection/dialect.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os @@ -23,12 +11,12 @@ from onetl.hwm.store import SparkTypeToHWM if TYPE_CHECKING: - from etl_entities.hwm import ColumnHWM + from etl_entities.hwm import HWM from pyspark.sql.types import StructField class DBDialect(BaseDBDialect): - def detect_hwm_class(self, field: StructField) -> type[ColumnHWM] | None: + def detect_hwm_class(self, field: StructField) -> type[HWM] | None: return SparkTypeToHWM.get(field.dataType.typeName()) # type: ignore def get_sql_query( diff --git a/onetl/connection/db_connection/dialect_mixins/__init__.py b/onetl/connection/db_connection/dialect_mixins/__init__.py index 43f1df40d..da36f0893 100644 --- a/onetl/connection/db_connection/dialect_mixins/__init__.py +++ b/onetl/connection/db_connection/dialect_mixins/__init__.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.dialect_mixins.not_support_columns import ( NotSupportColumns, ) diff --git a/onetl/connection/db_connection/dialect_mixins/not_support_columns.py b/onetl/connection/db_connection/dialect_mixins/not_support_columns.py index 74fca09cf..2d98ac742 100644 --- a/onetl/connection/db_connection/dialect_mixins/not_support_columns.py +++ b/onetl/connection/db_connection/dialect_mixins/not_support_columns.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Any diff --git a/onetl/connection/db_connection/dialect_mixins/not_support_df_schema.py b/onetl/connection/db_connection/dialect_mixins/not_support_df_schema.py index 8ae04939b..b99f3873d 100644 --- a/onetl/connection/db_connection/dialect_mixins/not_support_df_schema.py +++ b/onetl/connection/db_connection/dialect_mixins/not_support_df_schema.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Any diff --git a/onetl/connection/db_connection/dialect_mixins/not_support_hint.py b/onetl/connection/db_connection/dialect_mixins/not_support_hint.py index bbc8e13a4..7680c4aac 100644 --- a/onetl/connection/db_connection/dialect_mixins/not_support_hint.py +++ b/onetl/connection/db_connection/dialect_mixins/not_support_hint.py @@ -1,3 
+1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Any diff --git a/onetl/connection/db_connection/dialect_mixins/not_support_where.py b/onetl/connection/db_connection/dialect_mixins/not_support_where.py index 0bd06f76f..122de982a 100644 --- a/onetl/connection/db_connection/dialect_mixins/not_support_where.py +++ b/onetl/connection/db_connection/dialect_mixins/not_support_where.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Any diff --git a/onetl/connection/db_connection/dialect_mixins/requires_df_schema.py b/onetl/connection/db_connection/dialect_mixins/requires_df_schema.py index a66bff553..9b026572f 100644 --- a/onetl/connection/db_connection/dialect_mixins/requires_df_schema.py +++ b/onetl/connection/db_connection/dialect_mixins/requires_df_schema.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING diff --git a/onetl/connection/db_connection/dialect_mixins/support_columns_list.py b/onetl/connection/db_connection/dialect_mixins/support_columns_list.py index 0ab4eb24b..a45f47d28 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_columns_list.py +++ b/onetl/connection/db_connection/dialect_mixins/support_columns_list.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Any diff --git a/onetl/connection/db_connection/dialect_mixins/support_hint_str.py b/onetl/connection/db_connection/dialect_mixins/support_hint_str.py index 73f3b2875..b90d93099 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_hint_str.py +++ b/onetl/connection/db_connection/dialect_mixins/support_hint_str.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Any diff --git a/onetl/connection/db_connection/dialect_mixins/support_hwm_expression_str.py b/onetl/connection/db_connection/dialect_mixins/support_hwm_expression_str.py index 0be0fe615..157ffecdc 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_hwm_expression_str.py +++ b/onetl/connection/db_connection/dialect_mixins/support_hwm_expression_str.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from etl_entities.hwm import HWM diff --git a/onetl/connection/db_connection/dialect_mixins/support_name_any.py b/onetl/connection/db_connection/dialect_mixins/support_name_any.py index eb87b8097..dbe230243 100644 --- 
a/onetl/connection/db_connection/dialect_mixins/support_name_any.py +++ b/onetl/connection/db_connection/dialect_mixins/support_name_any.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/support_name_with_schema_only.py b/onetl/connection/db_connection/dialect_mixins/support_name_with_schema_only.py index 4b9f4a29d..d13cfc747 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_name_with_schema_only.py +++ b/onetl/connection/db_connection/dialect_mixins/support_name_with_schema_only.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/support_where_str.py b/onetl/connection/db_connection/dialect_mixins/support_where_str.py index 1354b5345..c171ec4de 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_where_str.py +++ b/onetl/connection/db_connection/dialect_mixins/support_where_str.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Any diff --git a/onetl/connection/db_connection/greenplum/__init__.py b/onetl/connection/db_connection/greenplum/__init__.py index d080e8932..8a401d0be 100644 --- a/onetl/connection/db_connection/greenplum/__init__.py +++ b/onetl/connection/db_connection/greenplum/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.greenplum.connection import Greenplum from onetl.connection.db_connection.greenplum.dialect import GreenplumDialect from onetl.connection.db_connection.greenplum.options import ( diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index ad9287e7d..5d0268590 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging @@ -75,8 +63,8 @@ class Config: class Greenplum(JDBCMixin, DBConnection): """Greenplum connection. |support_hooks| - Based on package ``io.pivotal:greenplum-spark:2.1.4`` - (`Pivotal connector for Spark `_). + Based on package ``io.pivotal:greenplum-spark:2.3.0`` + (`VMware Greenplum connector for Spark `_). .. warning:: @@ -111,7 +99,7 @@ class Greenplum(JDBCMixin, DBConnection): Supported options are: * All `Postgres JDBC driver properties `_ - * Properties from `Greenplum connector for Spark documentation `_ page, but only starting with ``server.`` or ``pool.`` + * Properties from `Greenplum connector for Spark documentation `_ page, but only starting with ``server.`` or ``pool.`` Examples -------- @@ -171,15 +159,17 @@ class Greenplum(JDBCMixin, DBConnection): @classmethod def get_packages( cls, - scala_version: str | None = None, - spark_version: str | None = None, + *, + scala_version: str | Version | None = None, + spark_version: str | Version | None = None, + package_version: str | Version | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| .. warning:: - You should pass at least one parameter. + You should pass either ``scala_version`` or ``spark_version``. Parameters ---------- @@ -193,6 +183,9 @@ def get_packages( Used only if ``scala_version=None``. + package_version : str, optional, default ``2.3.0`` + Package version in format ``major.minor.patch`` + Examples -------- @@ -206,6 +199,11 @@ def get_packages( """ # Connector version is fixed, so we can perform checks for Scala/Spark version + if package_version: + package_ver = Version.parse(package_version) + else: + package_ver = Version(2, 3, 0) + if scala_version: scala_ver = Version.parse(scala_version) elif spark_version: @@ -219,28 +217,28 @@ def get_packages( if scala_ver.digits(2) < (2, 11) or scala_ver.digits(2) > (2, 12): raise ValueError(f"Scala version must be 2.11 - 2.12, got {scala_ver}") - return [f"io.pivotal:greenplum-spark_{scala_ver.digits(2)}:2.1.4"] + return [f"io.pivotal:greenplum-spark_{scala_ver.digits(2)}:{package_ver.digits(3)}"] @classproperty def package_spark_2_3(cls) -> str: """Get package name to be downloaded by Spark 2.3.""" msg = "`Greenplum.package_2_3` will be removed in 1.0.0, use `Greenplum.get_packages(spark_version='2.3')` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "io.pivotal:greenplum-spark_2.11:2.1.4" + return "io.pivotal:greenplum-spark_2.11:2.3.0" @classproperty def package_spark_2_4(cls) -> str: """Get package name to be downloaded by Spark 2.4.""" msg = "`Greenplum.package_2_4` will be removed in 1.0.0, use `Greenplum.get_packages(spark_version='2.4')` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "io.pivotal:greenplum-spark_2.11:2.1.4" + return "io.pivotal:greenplum-spark_2.11:2.3.0" @classproperty def package_spark_3_2(cls) -> str: """Get package name to be downloaded by Spark 3.2.""" msg = "`Greenplum.package_3_2` will be removed in 1.0.0, use `Greenplum.get_packages(spark_version='3.2')` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "io.pivotal:greenplum-spark_2.12:2.1.4" + return "io.pivotal:greenplum-spark_2.12:2.3.0" @property def instance_url(self) -> str: diff --git a/onetl/connection/db_connection/greenplum/connection_limit.py b/onetl/connection/db_connection/greenplum/connection_limit.py index 7dec5ac26..32cb99d8e 100644 
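
A usage sketch for the extended `get_packages` signature above; `package_version` is keyword-only and defaults to ``2.3.0`` (the ``2.3.1`` pin below is illustrative, not taken from the diff):

```python
from onetl.connection import Greenplum

# Scala version is inferred from the Spark version (3.2 -> Scala 2.12):
Greenplum.get_packages(spark_version="3.2")
# -> ["io.pivotal:greenplum-spark_2.12:2.3.0"]

# Pin an explicit connector version in "major.minor.patch" format:
Greenplum.get_packages(scala_version="2.12", package_version="2.3.1")
# -> ["io.pivotal:greenplum-spark_2.12:2.3.1"]
```
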
--- a/onetl/connection/db_connection/greenplum/connection_limit.py +++ b/onetl/connection/db_connection/greenplum/connection_limit.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import textwrap diff --git a/onetl/connection/db_connection/greenplum/dialect.py b/onetl/connection/db_connection/greenplum/dialect.py index 723503a77..f4bafa688 100644 --- a/onetl/connection/db_connection/greenplum/dialect.py +++ b/onetl/connection/db_connection/greenplum/dialect.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from datetime import date, datetime diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index 7d4638412..123edec1b 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings @@ -30,6 +18,10 @@ ), ) +READ_WRITE_OPTIONS = frozenset( + ("gpdb.guc.*",), +) + WRITE_OPTIONS = frozenset( ( "mode", @@ -44,6 +36,7 @@ "partitions", "numPartitions", "partitionColumn", + "gpdb.matchDistributionPolicy", ), ) @@ -70,12 +63,12 @@ def _missing_(cls, value: object): # noqa: WPS120 class GreenplumReadOptions(JDBCOptions): - """Pivotal's Greenplum Spark connector reading options. + """VMware's Greenplum Spark connector reading options. .. 
note :: You can pass any value - `supported by connector `_, + `supported by connector `_, even if it is not mentioned in this documentation. The set of supported options depends on connector version. See link above. @@ -99,7 +92,7 @@ class GreenplumReadOptions(JDBCOptions): """ class Config: - known_options = READ_OPTIONS + known_options = READ_OPTIONS | READ_WRITE_OPTIONS prohibited_options = JDBCOptions.Config.prohibited_options | GENERIC_PROHIBITED_OPTIONS | WRITE_OPTIONS partition_column: Optional[str] = Field(alias="partitionColumn") @@ -203,12 +196,12 @@ class Config: class GreenplumWriteOptions(JDBCOptions): - """Pivotal's Greenplum Spark connector writing options. + """VMware's Greenplum Spark connector writing options. .. note :: You can pass any value - `supported by connector `_, + `supported by connector `_, even if it is not mentioned in this documentation. The set of supported options depends on connector version. See link above. @@ -233,7 +226,7 @@ class GreenplumWriteOptions(JDBCOptions): """ class Config: - known_options = WRITE_OPTIONS + known_options = WRITE_OPTIONS | READ_WRITE_OPTIONS prohibited_options = JDBCOptions.Config.prohibited_options | GENERIC_PROHIBITED_OPTIONS | READ_OPTIONS if_exists: GreenplumTableExistBehavior = Field(default=GreenplumTableExistBehavior.APPEND, alias="mode") diff --git a/onetl/connection/db_connection/hive/__init__.py b/onetl/connection/db_connection/hive/__init__.py index 54a00f004..bc1a73e35 100644 --- a/onetl/connection/db_connection/hive/__init__.py +++ b/onetl/connection/db_connection/hive/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.hive.connection import Hive from onetl.connection.db_connection.hive.dialect import HiveDialect from onetl.connection.db_connection.hive.options import ( diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index ddcc3ea8f..34d5af5cb 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
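
A hedged sketch of the effect of the new `READ_WRITE_OPTIONS` group: ``gpdb.guc.*`` options are now accepted by both the read and the write options classes, and ``gpdb.matchDistributionPolicy`` becomes a known write option. The GUC name and values below are illustrative assumptions, not taken from the diff:

```python
from onetl.connection import Greenplum

read_options = Greenplum.ReadOptions(
    partitionColumn="id",
    numPartitions=10,
    **{"gpdb.guc.statement_mem": "512MB"},  # illustrative server GUC
)

write_options = Greenplum.WriteOptions(
    if_exists="append",
    **{"gpdb.matchDistributionPolicy": "true"},  # write option from the hunk above
)
```
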
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/connection/db_connection/hive/dialect.py b/onetl/connection/db_connection/hive/dialect.py index 7acd4dbd2..38b737fe0 100644 --- a/onetl/connection/db_connection/hive/dialect.py +++ b/onetl/connection/db_connection/hive/dialect.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from onetl.connection.db_connection.db_connection import DBDialect diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index 81445851d..357584660 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings diff --git a/onetl/connection/db_connection/hive/slots.py b/onetl/connection/db_connection/hive/slots.py index 4105b5901..ce4ceea97 100644 --- a/onetl/connection/db_connection/hive/slots.py +++ b/onetl/connection/db_connection/hive/slots.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from onetl.hooks import slot, support_hooks diff --git a/onetl/connection/db_connection/jdbc_connection/__init__.py b/onetl/connection/db_connection/jdbc_connection/__init__.py index 9c11532c1..476e7ea1d 100644 --- a/onetl/connection/db_connection/jdbc_connection/__init__.py +++ b/onetl/connection/db_connection/jdbc_connection/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.jdbc_connection.connection import JDBCConnection from onetl.connection.db_connection.jdbc_connection.dialect import JDBCDialect from onetl.connection.db_connection.jdbc_connection.options import ( diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index b689efcc1..9396dc802 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/connection/db_connection/jdbc_connection/dialect.py b/onetl/connection/db_connection/jdbc_connection/dialect.py index f051cb92b..cbf0ceb60 100644 --- a/onetl/connection/db_connection/jdbc_connection/dialect.py +++ b/onetl/connection/db_connection/jdbc_connection/dialect.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from abc import abstractmethod @@ -37,9 +25,7 @@ class JDBCDialect( # noqa: WPS215 DBDialect, ): @abstractmethod - def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str: - ... + def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str: ... @abstractmethod - def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str: - ... + def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str: ... diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index dacaded77..8793fc084 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings diff --git a/onetl/connection/db_connection/jdbc_mixin/__init__.py b/onetl/connection/db_connection/jdbc_mixin/__init__.py index 062fdb74d..0f368a129 100644 --- a/onetl/connection/db_connection/jdbc_mixin/__init__.py +++ b/onetl/connection/db_connection/jdbc_mixin/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.jdbc_mixin.connection import ( JDBCMixin, JDBCStatementType, diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index e5e3e312e..d7324db0e 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging @@ -601,7 +589,7 @@ def _statement_to_optional_dataframe(self, jdbc_statement) -> DataFrame | None: result_set = jdbc_statement.getResultSet() - if not result_set: + if not result_set or result_set.isClosed(): return None result_metadata = result_set.getMetaData() diff --git a/onetl/connection/db_connection/jdbc_mixin/options.py b/onetl/connection/db_connection/jdbc_mixin/options.py index dd889b8fc..583cb4f53 100644 --- a/onetl/connection/db_connection/jdbc_mixin/options.py +++ b/onetl/connection/db_connection/jdbc_mixin/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Optional diff --git a/onetl/connection/db_connection/kafka/__init__.py b/onetl/connection/db_connection/kafka/__init__.py index 41938b1a5..71d01ebce 100644 --- a/onetl/connection/db_connection/kafka/__init__.py +++ b/onetl/connection/db_connection/kafka/__init__.py @@ -1,15 +1,3 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.kafka.connection import Kafka diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index 0f32f4e9b..b7f06194c 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -1,19 +1,8 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
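
The `isClosed()` guard added above means a JDBC statement whose result set is missing or already closed now yields `None` instead of failing later while reading metadata. A stubbed mirror of the control flow (the `_ResultSet` class below is a stand-in, not the real Java object):

```python
class _ResultSet:
    """Stand-in for java.sql.ResultSet, exposing only what the guard needs."""

    def __init__(self, closed: bool):
        self._closed = closed

    def isClosed(self) -> bool:
        return self._closed

def statement_to_optional_dataframe(result_set):
    # mirrors the new guard from the hunk above
    if not result_set or result_set.isClosed():
        return None
    return "DataFrame(...)"  # placeholder for the actual conversion

assert statement_to_optional_dataframe(None) is None
assert statement_to_optional_dataframe(_ResultSet(closed=True)) is None
assert statement_to_optional_dataframe(_ResultSet(closed=False)) == "DataFrame(...)"
```
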
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import json import logging from contextlib import closing from typing import TYPE_CHECKING, Any, List, Optional @@ -258,7 +247,7 @@ def check(self): return self @slot - def read_source_as_df( + def read_source_as_df( # noqa: WPS231 self, source: str, columns: list[str] | None = None, @@ -276,7 +265,30 @@ def read_source_as_df( result_options = {f"kafka.{key}": value for key, value in self._get_connection_properties().items()} result_options.update(options.dict(by_alias=True, exclude_none=True)) result_options["subscribe"] = source + + if window and window.expression == "offset": + # the 'including' flag in window values are relevant for batch strategies which are not + # supported by Kafka, therefore we always get offsets including border values + starting_offsets = dict(window.start_from.value) if window.start_from.value else {} + ending_offsets = dict(window.stop_at.value) if window.stop_at.value else {} + + # when the Kafka topic's number of partitions has increased during incremental processing, + # new partitions, which are present in ending_offsets but not in + # starting_offsets, are assigned a default offset (0 in this case). + for partition in ending_offsets: + if partition not in starting_offsets: + starting_offsets[partition] = 0 + + if starting_offsets: + result_options["startingOffsets"] = json.dumps({source: starting_offsets}) + if ending_offsets: + result_options["endingOffsets"] = json.dumps({source: ending_offsets}) + df = self.spark.read.format("kafka").options(**result_options).load() + + if limit is not None: + df = df.limit(limit) + log.info("|%s| Dataframe is successfully created.", self.__class__.__name__) return df @@ -346,38 +358,38 @@ def get_df_schema( TimestampType, ) - schema = StructType( - [ - StructField("key", BinaryType(), nullable=True), - StructField("value", BinaryType(), nullable=False), - StructField("topic", StringType(), nullable=True), - StructField("partition", IntegerType(), nullable=True), - StructField("offset", LongType(), nullable=True), - StructField("timestamp", TimestampType(), nullable=True), - StructField("timestampType", IntegerType(), nullable=True), - StructField( - "headers", - ArrayType( - StructType( - [ - StructField("key", StringType(), nullable=True), - StructField("value", BinaryType(), nullable=True), - ], - ), + all_fields = [ + StructField("key", BinaryType(), nullable=True), + StructField("value", BinaryType(), nullable=False), + StructField("topic", StringType(), nullable=True), + StructField("partition", IntegerType(), nullable=True), + StructField("offset", LongType(), nullable=True), + StructField("timestamp", TimestampType(), nullable=True), + StructField("timestampType", IntegerType(), nullable=True), + StructField( + "headers", + ArrayType( + StructType( + [ + StructField("key", StringType(), nullable=True), + StructField("value", BinaryType(), nullable=True), + ], ), - nullable=True, ), - ], - ) + nullable=True, + ), + ] + + filtered_fields = [field for field in 
all_fields if columns is None or field.name in columns] - return schema # noqa: WPS331 + return StructType(filtered_fields) @slot @classmethod def get_packages( cls, - spark_version: str, - scala_version: str | None = None, + spark_version: str | Version, + scala_version: str | Version | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| @@ -466,6 +478,63 @@ def close(self): # because this can influence dataframes created by this connection. # For example, .close() deletes local keytab copy. + @slot + def get_min_max_values( + self, + source: str, + window: Window, + hint: Any | None = None, + where: Any | None = None, + options: KafkaReadOptions | dict | None = None, + ) -> tuple[dict[int, int], dict[int, int]]: + log.info("|%s| Getting min and max offset values for topic %r ...", self.__class__.__name__, source) + + consumer = self._get_java_consumer() + with closing(consumer): + # https://kafka.apache.org/22/javadoc/org/apache/kafka/clients/consumer/KafkaConsumer.html#partitionsFor-java.lang.String- + partition_infos = consumer.partitionsFor(source) + + jvm = self.spark._jvm + topic_partitions = [ + jvm.org.apache.kafka.common.TopicPartition(source, p.partition()) # type: ignore[union-attr] + for p in partition_infos + ] + + # https://kafka.apache.org/22/javadoc/org/apache/kafka/clients/consumer/KafkaConsumer.html#beginningOffsets-java.util.Collection- + beginning_offsets = consumer.beginningOffsets(topic_partitions) + + # https://kafka.apache.org/22/javadoc/org/apache/kafka/clients/consumer/KafkaConsumer.html#endOffsets-java.util.Collection- + end_offsets = consumer.endOffsets(topic_partitions) + + min_offsets = {} + max_offsets = {} + for tp in topic_partitions: + partition_id = tp.partition() + begin_offset = beginning_offsets[tp] + end_offset = end_offsets[tp] + + if window.start_from and window.start_from.is_set(): + window_start = window.start_from.value.get(partition_id, begin_offset) + begin_offset = max(window_start, begin_offset) + if window.stop_at and window.stop_at.is_set(): + window_stop = window.stop_at.value.get(partition_id, end_offset) + end_offset = min(window_stop, end_offset) + + min_offsets[partition_id] = begin_offset + max_offsets[partition_id] = end_offset + + log.info("|%s| Received min and max offset values for each partition.", self.__class__.__name__) + for partition_id in sorted(min_offsets.keys()): + log.debug( + "|%s| Partition %d: Min Offset = %d, Max Offset = %d", + self.__class__.__name__, + partition_id, + min_offsets[partition_id], + max_offsets[partition_id], + ) + + return min_offsets, max_offsets + @property def instance_url(self): return "kafka://" + self.cluster diff --git a/onetl/connection/db_connection/kafka/dialect.py b/onetl/connection/db_connection/kafka/dialect.py index b0507467c..16c4d6056 100644 --- a/onetl/connection/db_connection/kafka/dialect.py +++ b/onetl/connection/db_connection/kafka/dialect.py @@ -1,23 +1,11 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
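
Two pieces of the Kafka changes above benefit from a worked example: `read_source_as_df` serializes per-partition window boundaries into Spark's `startingOffsets`/`endingOffsets` JSON, and `get_min_max_values` clamps the window to the offsets actually present on the broker. A self-contained sketch (topic name and offset values are invented):

```python
import json

# partition -> offset maps produced from the HWM window; partition 1 is new,
# so it falls back to the default starting offset 0, as in the hunk above
starting_offsets = {0: 100, 1: 0}
ending_offsets = {0: 250, 1: 80}

options = {
    "subscribe": "orders",
    "startingOffsets": json.dumps({"orders": starting_offsets}),
    "endingOffsets": json.dumps({"orders": ending_offsets}),
}
# json.dumps stringifies int keys, matching Spark's expected format:
# {"orders": {"0": 100, "1": 0}}

# get_min_max_values can only narrow the broker's [beginning, end) range:
def clamp(begin_offset, end_offset, window_start=None, window_stop=None):
    if window_start is not None:
        begin_offset = max(window_start, begin_offset)
    if window_stop is not None:
        end_offset = min(window_stop, end_offset)
    return begin_offset, end_offset

assert clamp(0, 500, window_start=100, window_stop=250) == (100, 250)
assert clamp(0, 500, window_start=100, window_stop=900) == (100, 500)  # clipped to log end
```
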
-# See the License for the specific language governing permissions and -# limitations under the License. - - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging +from typing import TYPE_CHECKING -from etl_entities.hwm import HWM +from etl_entities.hwm import HWM, KeyValueIntHWM from onetl._util.spark import get_spark_version from onetl.connection.db_connection.db_connection.dialect import DBDialect @@ -26,9 +14,11 @@ NotSupportDFSchema, NotSupportHint, NotSupportWhere, - SupportNameAny, ) +if TYPE_CHECKING: + from pyspark.sql.types import StructField + log = logging.getLogger(__name__) @@ -37,10 +27,17 @@ class KafkaDialect( # noqa: WPS215 NotSupportDFSchema, NotSupportHint, NotSupportWhere, - SupportNameAny, DBDialect, ): - SUPPORTED_HWM_COLUMNS = {"offset", "timestamp"} + SUPPORTED_HWM_COLUMNS = {"offset"} + + def validate_name(self, value: str) -> str: + if "*" in value or "," in value: + raise ValueError( + f"source/target={value} is not supported by {self.connection.__class__.__name__}. " + f"Provide a singular topic.", + ) + return value def validate_hwm( self, @@ -63,3 +60,10 @@ def validate_hwm( f"Spark version must be 3.x for the timestamp column. Current version is: {spark_version}", ) return hwm + + def detect_hwm_class(self, field: StructField) -> type[KeyValueIntHWM] | None: + kafka_field_to_hwm_class = { + "offset": KeyValueIntHWM, + # add "timestamp" in future + } + return kafka_field_to_hwm_class.get(field.name) diff --git a/onetl/connection/db_connection/kafka/extra.py b/onetl/connection/db_connection/kafka/extra.py index 9ed5c4a1b..6dd95e2c3 100644 --- a/onetl/connection/db_connection/kafka/extra.py +++ b/onetl/connection/db_connection/kafka/extra.py @@ -1,18 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.impl import GenericOptions PROHIBITED_OPTIONS = frozenset( diff --git a/onetl/connection/db_connection/kafka/kafka_auth.py b/onetl/connection/db_connection/kafka/kafka_auth.py index adb428c1d..f39d39a3e 100644 --- a/onetl/connection/db_connection/kafka/kafka_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_auth.py @@ -1,16 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
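
An illustration of the stricter topic validation added to `KafkaDialect` above: wildcard and comma-separated subscriptions are rejected, so a HWM always tracks exactly one topic, and `detect_hwm_class` maps only the `offset` field (to `KeyValueIntHWM`) for now. A runnable mirror of the check:

```python
def validate_name(value: str) -> str:
    # mirrors KafkaDialect.validate_name from the hunk above
    if "*" in value or "," in value:
        raise ValueError(f"source/target={value} is not supported. Provide a singular topic.")
    return value

assert validate_name("orders") == "orders"
for bad in ("orders,sales", "orders*"):
    try:
        validate_name(bad)
    except ValueError:
        pass  # expected: multi-topic and wildcard subscriptions are rejected
```
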
+# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from abc import ABC, abstractmethod diff --git a/onetl/connection/db_connection/kafka/kafka_basic_auth.py b/onetl/connection/db_connection/kafka/kafka_basic_auth.py index 32d9fe8d1..56037cb32 100644 --- a/onetl/connection/db_connection/kafka/kafka_basic_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_basic_auth.py @@ -1,16 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING diff --git a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py index 587f23972..2af64f165 100644 --- a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py @@ -1,16 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py b/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py index 264bd3fa5..fd1384461 100644 --- a/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py @@ -1,16 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING diff --git a/onetl/connection/db_connection/kafka/kafka_protocol.py b/onetl/connection/db_connection/kafka/kafka_protocol.py index 646302528..12de89cf8 100644 --- a/onetl/connection/db_connection/kafka/kafka_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_protocol.py @@ -1,16 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from abc import ABC, abstractmethod diff --git a/onetl/connection/db_connection/kafka/kafka_scram_auth.py b/onetl/connection/db_connection/kafka/kafka_scram_auth.py index 518d63b35..579dc7ed0 100644 --- a/onetl/connection/db_connection/kafka/kafka_scram_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_scram_auth.py @@ -1,16 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING diff --git a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py index 600d56876..d76c4e333 100644 --- a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py @@ -1,16 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from pathlib import Path diff --git a/onetl/connection/db_connection/kafka/options.py b/onetl/connection/db_connection/kafka/options.py index 6d7e4ef4c..28bf44f6b 100644 --- a/onetl/connection/db_connection/kafka/options.py +++ b/onetl/connection/db_connection/kafka/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from enum import Enum diff --git a/onetl/connection/db_connection/kafka/slots.py b/onetl/connection/db_connection/kafka/slots.py index 90ef0df79..cd6bfcbe8 100644 --- a/onetl/connection/db_connection/kafka/slots.py +++ b/onetl/connection/db_connection/kafka/slots.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from onetl.hooks import slot, support_hooks diff --git a/onetl/connection/db_connection/mongodb/__init__.py b/onetl/connection/db_connection/mongodb/__init__.py index 0ec92b2d7..b452ca536 100644 --- a/onetl/connection/db_connection/mongodb/__init__.py +++ b/onetl/connection/db_connection/mongodb/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.mongodb.connection import MongoDB, MongoDBExtra from onetl.connection.db_connection.mongodb.dialect import MongoDBDialect from onetl.connection.db_connection.mongodb.options import ( diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index facdf9fee..c84317d88 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import json @@ -160,8 +148,8 @@ class MongoDB(DBConnection): @classmethod def get_packages( cls, - scala_version: str | None = None, - spark_version: str | None = None, + scala_version: str | Version | None = None, + spark_version: str | Version | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| diff --git a/onetl/connection/db_connection/mongodb/dialect.py b/onetl/connection/db_connection/mongodb/dialect.py index 51c6a64d7..247c58aa6 100644 --- a/onetl/connection/db_connection/mongodb/dialect.py +++ b/onetl/connection/db_connection/mongodb/dialect.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from datetime import datetime diff --git a/onetl/connection/db_connection/mongodb/options.py b/onetl/connection/db_connection/mongodb/options.py index 13c256aff..86ff82f1b 100644 --- a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
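Note on the ``MongoDB.get_packages`` hunk above: both version arguments now accept ``Version`` objects as well as plain strings (or ``None``). A minimal usage sketch — the version numbers are illustrative, not taken from this diff:

.. code:: python

    from onetl.connection import MongoDB

    # Both arguments may now be str, Version, or None; plain strings work as before.
    packages = MongoDB.get_packages(spark_version="3.4.1", scala_version="2.12")

    # The resulting Maven coordinates are then passed to Spark, e.g. via
    # spark.jars.packages when building the SparkSession.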
-# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings diff --git a/onetl/connection/db_connection/mssql/__init__.py b/onetl/connection/db_connection/mssql/__init__.py index efd6e7072..5b07949ff 100644 --- a/onetl/connection/db_connection/mssql/__init__.py +++ b/onetl/connection/db_connection/mssql/__init__.py @@ -1,16 +1,4 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.mssql.connection import MSSQL, MSSQLExtra from onetl.connection.db_connection.mssql.dialect import MSSQLDialect diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index dac49e3cf..c2716ce72 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings @@ -182,7 +170,7 @@ class MSSQL(JDBCConnection): @classmethod def get_packages( cls, - java_version: str = "8", + java_version: str | Version | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| @@ -203,6 +191,9 @@ def get_packages( MSSQL.get_packages(java_version="8") """ + if java_version is None: + java_version = "8" + java_ver = Version.parse(java_version) if java_ver.major < 8: raise ValueError(f"Java version must be at least 8, got {java_ver}") diff --git a/onetl/connection/db_connection/mssql/dialect.py b/onetl/connection/db_connection/mssql/dialect.py index f39568423..7dcfdd61e 100644 --- a/onetl/connection/db_connection/mssql/dialect.py +++ b/onetl/connection/db_connection/mssql/dialect.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
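The ``MSSQL.get_packages`` hunk above replaces the hard-coded ``java_version: str = "8"`` default with ``None``, restoring ``"8"`` inside the method body, so existing callers are unaffected. A hedged sketch:

.. code:: python

    from onetl.connection import MSSQL

    # Omitting java_version now falls back to "8" inside the method,
    # so these two calls should be equivalent:
    assert MSSQL.get_packages() == MSSQL.get_packages(java_version="8")

    # Versions below 8 are still rejected by the Version.parse() major-version check:
    # MSSQL.get_packages(java_version="7")  # raises ValueError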
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from datetime import date, datetime @@ -27,6 +15,29 @@ def get_partition_column_hash(self, partition_column: str, num_partitions: int) def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str: return f"{partition_column} % {num_partitions}" + def get_sql_query( + self, + table: str, + columns: list[str] | None = None, + where: str | list[str] | None = None, + hint: str | None = None, + limit: int | None = None, + compact: bool = False, + ) -> str: + query = super().get_sql_query( + table=table, + columns=columns, + where=where, + hint=hint, + limit=0 if limit == 0 else None, + compact=compact, + ) + # MSSQL-specific handling for the LIMIT clause using TOP + if limit is not None and limit > 0: + query = query.replace("SELECT", f"SELECT TOP {limit}", 1) + + return query + def _serialize_datetime(self, value: datetime) -> str: result = value.isoformat() return f"CAST('{result}' AS datetime2)" diff --git a/onetl/connection/db_connection/mysql/__init__.py b/onetl/connection/db_connection/mysql/__init__.py index ba7337b23..490f356ec 100644 --- a/onetl/connection/db_connection/mysql/__init__.py +++ b/onetl/connection/db_connection/mysql/__init__.py @@ -1,16 +1,4 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.mysql.connection import MySQL, MySQLExtra from onetl.connection.db_connection.mysql.dialect import MySQLDialect diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index 3af51a2d1..224504962 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
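The new ``MSSQLDialect.get_sql_query`` override compensates for MSSQL's lack of a ``LIMIT`` clause: a positive limit is rewritten into ``SELECT TOP n``, while ``limit=0`` is delegated to the base implementation (which already produces an empty-result query). An illustrative before/after — the dialect instance and table name are hypothetical:

.. code:: python

    # limit=5: the base class emits a plain SELECT, then the override rewrites it.
    query = dialect.get_sql_query("dbo.users", columns=["id", "name"], limit=5)
    # base query:    SELECT id, name FROM dbo.users
    # after rewrite: SELECT TOP 5 id, name FROM dbo.users
    # Note: only the first occurrence of "SELECT" is replaced (count=1),
    # so subqueries in WHERE conditions are left untouched.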
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings diff --git a/onetl/connection/db_connection/mysql/dialect.py b/onetl/connection/db_connection/mysql/dialect.py index 59f663aed..b9c186e2e 100644 --- a/onetl/connection/db_connection/mysql/dialect.py +++ b/onetl/connection/db_connection/mysql/dialect.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from datetime import date, datetime diff --git a/onetl/connection/db_connection/oracle/__init__.py b/onetl/connection/db_connection/oracle/__init__.py index 79b1b9278..3bcca7061 100644 --- a/onetl/connection/db_connection/oracle/__init__.py +++ b/onetl/connection/db_connection/oracle/__init__.py @@ -1,16 +1,4 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.oracle.connection import Oracle, OracleExtra from onetl.connection.db_connection.oracle.dialect import OracleDialect diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index a3ec87921..6d87cb559 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging @@ -195,7 +183,7 @@ class Oracle(JDBCConnection): @classmethod def get_packages( cls, - java_version: str = "8", + java_version: str | Version | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| @@ -216,6 +204,9 @@ def get_packages( Oracle.get_packages(java_version="8") """ + if java_version is None: + java_version = "8" + java_ver = Version.parse(java_version) if java_ver.major < 8: raise ValueError(f"Java version must be at least 8, got {java_ver}") diff --git a/onetl/connection/db_connection/oracle/dialect.py b/onetl/connection/db_connection/oracle/dialect.py index 9484524fd..70e0eff3c 100644 --- a/onetl/connection/db_connection/oracle/dialect.py +++ b/onetl/connection/db_connection/oracle/dialect.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from datetime import date, datetime @@ -33,12 +21,24 @@ def get_sql_query( new_columns = columns or ["*"] if len(new_columns) > 1: new_columns = [table + ".*" if column.strip() == "*" else column for column in new_columns] + + where = where or [] + if isinstance(where, str): + where = [where] + + if limit is not None: + if limit == 0: + where = ["1=0"] + else: + # Oracle does not support LIMIT + where.append(f"ROWNUM <= {limit}") + return super().get_sql_query( table=table, columns=new_columns, where=where, hint=hint, - limit=limit, + limit=None, compact=compact, ) diff --git a/onetl/connection/db_connection/postgres/__init__.py b/onetl/connection/db_connection/postgres/__init__.py index 42bad7d54..3eef06fa1 100644 --- a/onetl/connection/db_connection/postgres/__init__.py +++ b/onetl/connection/db_connection/postgres/__init__.py @@ -1,16 +1,4 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
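Oracle's ``get_sql_query`` takes the other route: since ``ROWNUM`` is evaluated in the ``WHERE`` clause, the limit is folded into the filter list and the base ``limit`` argument is forced to ``None``. Roughly, as an illustration rather than output from the test suite:

.. code:: python

    # limit=10 appends a ROWNUM predicate to the existing filters:
    query = dialect.get_sql_query("SCOTT.EMP", where="DEPTNO = 10", limit=10)
    # SELECT ... FROM SCOTT.EMP WHERE (DEPTNO = 10) AND (ROWNUM <= 10)

    # limit=0 short-circuits to an always-false filter:
    query = dialect.get_sql_query("SCOTT.EMP", limit=0)
    # SELECT ... FROM SCOTT.EMP WHERE 1=0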
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.postgres.connection import Postgres, PostgresExtra from onetl.connection.db_connection.postgres.dialect import PostgresDialect diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index a0bd508b0..1353204fd 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings diff --git a/onetl/connection/db_connection/postgres/dialect.py b/onetl/connection/db_connection/postgres/dialect.py index 54124ce3b..0e4f67ab8 100644 --- a/onetl/connection/db_connection/postgres/dialect.py +++ b/onetl/connection/db_connection/postgres/dialect.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from datetime import date, datetime diff --git a/onetl/connection/db_connection/teradata/__init__.py b/onetl/connection/db_connection/teradata/__init__.py index 9a70a22f8..8356d51e8 100644 --- a/onetl/connection/db_connection/teradata/__init__.py +++ b/onetl/connection/db_connection/teradata/__init__.py @@ -1,16 +1,4 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.teradata.connection import Teradata, TeradataExtra from onetl.connection.db_connection.teradata.dialect import TeradataDialect diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index ed46d24d1..95741927b 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings diff --git a/onetl/connection/db_connection/teradata/dialect.py b/onetl/connection/db_connection/teradata/dialect.py index 7845dc360..ac225ce41 100644 --- a/onetl/connection/db_connection/teradata/dialect.py +++ b/onetl/connection/db_connection/teradata/dialect.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from datetime import date, datetime diff --git a/onetl/connection/file_connection/__init__.py b/onetl/connection/file_connection/__init__.py index e69de29bb..07325b1d1 100644 --- a/onetl/connection/file_connection/__init__.py +++ b/onetl/connection/file_connection/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/connection/file_connection/file_connection.py b/onetl/connection/file_connection/file_connection.py index cc5ebbb9e..3d7437ad8 100644 --- a/onetl/connection/file_connection/file_connection.py +++ b/onetl/connection/file_connection/file_connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index b7dd82257..5e91648e1 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import ftplib # noqa: S402 diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index dfcd05553..859cca942 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 import ftplib # NOQA: S402 import textwrap diff --git a/onetl/connection/file_connection/hdfs/__init__.py b/onetl/connection/file_connection/hdfs/__init__.py index 56be3211c..0eedb25b4 100644 --- a/onetl/connection/file_connection/hdfs/__init__.py +++ b/onetl/connection/file_connection/hdfs/__init__.py @@ -1,16 +1,4 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.file_connection.hdfs.connection import HDFS from onetl.connection.file_connection.hdfs.slots import HDFSSlots diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index aa58f7e0a..9cf4482d7 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os @@ -541,7 +529,7 @@ def _extract_stat_from_entry(self, top: RemotePath, entry: ENTRY_TYPE) -> PathSt st_mtime=entry_stat["modificationTime"] / 1000, # HDFS uses timestamps with milliseconds st_uid=entry_stat["owner"], st_gid=entry_stat["group"], - st_mode=int(entry_stat["permission"], 8) | stat.S_IFDIR - if entry_stat["type"] == "DIRECTORY" - else stat.S_IFREG, + st_mode=( + int(entry_stat["permission"], 8) | stat.S_IFDIR if entry_stat["type"] == "DIRECTORY" else stat.S_IFREG + ), ) diff --git a/onetl/connection/file_connection/hdfs/slots.py b/onetl/connection/file_connection/hdfs/slots.py index c57e69af4..8bd8431a1 100644 --- a/onetl/connection/file_connection/hdfs/slots.py +++ b/onetl/connection/file_connection/hdfs/slots.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from onetl.hooks import slot, support_hooks diff --git a/onetl/connection/file_connection/mixins/__init__.py b/onetl/connection/file_connection/mixins/__init__.py index 7a01169aa..7b11e58ae 100644 --- a/onetl/connection/file_connection/mixins/__init__.py +++ b/onetl/connection/file_connection/mixins/__init__.py @@ -1,15 +1,3 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.file_connection.mixins.rename_dir_mixin import RenameDirMixin diff --git a/onetl/connection/file_connection/mixins/rename_dir_mixin.py b/onetl/connection/file_connection/mixins/rename_dir_mixin.py index 4193e48d9..c183ebf56 100644 --- a/onetl/connection/file_connection/mixins/rename_dir_mixin.py +++ b/onetl/connection/file_connection/mixins/rename_dir_mixin.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os @@ -91,5 +79,4 @@ def rename_dir( return self.resolve_dir(target_dir) @abstractmethod - def _rename_dir(self, source: RemotePath, target: RemotePath) -> None: - ... + def _rename_dir(self, source: RemotePath, target: RemotePath) -> None: ... diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 2f8d298f1..9deb016c2 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import io diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index bef7ed276..81c6cce8a 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index bef53ce2d..61ed3ee95 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import contextlib @@ -143,7 +131,7 @@ def _get_client(self) -> SFTPClient: client.load_system_host_keys() if not self.host_key_check: # Default is RejectPolicy - client.set_missing_host_key_policy(WarningPolicy()) + client.set_missing_host_key_policy(WarningPolicy()) # noqa: S507 client.connect( hostname=self.host, diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index 9825a0525..b0eb7decc 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import datetime diff --git a/onetl/connection/file_df_connection/__init__.py b/onetl/connection/file_df_connection/__init__.py index e69de29bb..07325b1d1 100644 --- a/onetl/connection/file_df_connection/__init__.py +++ b/onetl/connection/file_df_connection/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/connection/file_df_connection/spark_file_df_connection.py b/onetl/connection/file_df_connection/spark_file_df_connection.py index 10853078b..d3061385d 100644 --- a/onetl/connection/file_df_connection/spark_file_df_connection.py +++ b/onetl/connection/file_df_connection/spark_file_df_connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from abc import abstractmethod diff --git a/onetl/connection/file_df_connection/spark_hdfs/__init__.py b/onetl/connection/file_df_connection/spark_hdfs/__init__.py index 3cbf3d48b..6977eb4ae 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/__init__.py +++ b/onetl/connection/file_df_connection/spark_hdfs/__init__.py @@ -1,16 +1,4 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.file_df_connection.spark_hdfs.connection import SparkHDFS from onetl.connection.file_df_connection.spark_hdfs.slots import SparkHDFSSlots diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 76d1520cc..7f23e8b58 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import getpass diff --git a/onetl/connection/file_df_connection/spark_hdfs/slots.py b/onetl/connection/file_df_connection/spark_hdfs/slots.py index cf997c726..d16c6527e 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/slots.py +++ b/onetl/connection/file_df_connection/spark_hdfs/slots.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from onetl.hooks import slot, support_hooks diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index b2fa4625e..d2accb68a 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/connection/file_df_connection/spark_s3/__init__.py b/onetl/connection/file_df_connection/spark_s3/__init__.py index af69c5678..303a74960 100644 --- a/onetl/connection/file_df_connection/spark_s3/__init__.py +++ b/onetl/connection/file_df_connection/spark_s3/__init__.py @@ -1,16 +1,4 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.connection.file_df_connection.spark_s3.connection import SparkS3 from onetl.connection.file_df_connection.spark_s3.extra import SparkS3Extra diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 73dc1cb07..ff516abf7 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging @@ -239,8 +227,8 @@ class SparkS3(SparkFileDFConnection): @classmethod def get_packages( cls, - spark_version: str, - scala_version: str | None = None, + spark_version: str | Version, + scala_version: str | Version | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| diff --git a/onetl/connection/file_df_connection/spark_s3/extra.py b/onetl/connection/file_df_connection/spark_s3/extra.py index 9ffa71db2..440eabed1 100644 --- a/onetl/connection/file_df_connection/spark_s3/extra.py +++ b/onetl/connection/file_df_connection/spark_s3/extra.py @@ -1,18 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 import re from onetl.impl import GenericOptions diff --git a/onetl/connection/kerberos_helpers.py b/onetl/connection/kerberos_helpers.py index 48e0e2b73..5e2bd65b6 100644 --- a/onetl/connection/kerberos_helpers.py +++ b/onetl/connection/kerberos_helpers.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
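``SparkS3.get_packages`` gets the same relaxation: ``spark_version`` stays required but may now be a ``Version`` object, and ``scala_version`` remains optional. A short sketch (version numbers illustrative):

.. code:: python

    from onetl.connection import SparkS3

    # scala_version should be inferred from spark_version when omitted
    packages = SparkS3.get_packages(spark_version="3.5.0")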
-# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/core/__init__.py b/onetl/core/__init__.py index 9f7bbda6d..1768b6039 100644 --- a/onetl/core/__init__.py +++ b/onetl/core/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 import textwrap import warnings from importlib import import_module diff --git a/onetl/core/file_filter/__init__.py b/onetl/core/file_filter/__init__.py index f40419a19..9f2c9a5ae 100644 --- a/onetl/core/file_filter/__init__.py +++ b/onetl/core/file_filter/__init__.py @@ -1,15 +1,3 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.core.file_filter.file_filter import FileFilter diff --git a/onetl/core/file_filter/file_filter.py b/onetl/core/file_filter/file_filter.py index 2c3243a13..a3dac8654 100644 --- a/onetl/core/file_filter/file_filter.py +++ b/onetl/core/file_filter/file_filter.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import glob diff --git a/onetl/core/file_limit/__init__.py b/onetl/core/file_limit/__init__.py index dfc600499..58759c163 100644 --- a/onetl/core/file_limit/__init__.py +++ b/onetl/core/file_limit/__init__.py @@ -1,15 +1,3 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.core.file_limit.file_limit import FileLimit diff --git a/onetl/core/file_limit/file_limit.py b/onetl/core/file_limit/file_limit.py index b79ef1c34..b178433e7 100644 --- a/onetl/core/file_limit/file_limit.py +++ b/onetl/core/file_limit/file_limit.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import textwrap diff --git a/onetl/db/__init__.py b/onetl/db/__init__.py index 98024de34..2cd609817 100644 --- a/onetl/db/__init__.py +++ b/onetl/db/__init__.py @@ -1,16 +1,4 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.db.db_reader import DBReader from onetl.db.db_writer import DBWriter diff --git a/onetl/db/db_reader/__init__.py b/onetl/db/db_reader/__init__.py index 078b92aff..a71bb5268 100644 --- a/onetl/db/db_reader/__init__.py +++ b/onetl/db/db_reader/__init__.py @@ -1,15 +1,3 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.db.db_reader.db_reader import DBReader diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index 625599cd6..7f6f6f0d9 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import textwrap @@ -6,7 +8,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union import frozendict -from etl_entities.hwm import HWM, ColumnHWM +from etl_entities.hwm import HWM, ColumnHWM, KeyValueHWM from etl_entities.old_hwm import IntHWM as OldColumnHWM from etl_entities.source import Column, Table from pydantic import Field, PrivateAttr, root_validator, validator @@ -17,6 +19,7 @@ ContainsGetDFSchemaMethod, ContainsGetMinMaxValues, ) +from onetl.exception import NoDataError from onetl.hooks import slot, support_hooks from onetl.hwm import AutoDetectHWM, Edge, Window from onetl.impl import FrozenModel, GenericOptions @@ -123,7 +126,7 @@ class DBReader(FrozenModel): Some sources do not support data filtering. hwm : type[HWM] | None, default: ``None`` - HWM class to be used as :etl-entities:`HWM ` value. + HWM class to be used as :etl-entities:`HWM ` value. .. code:: python @@ -362,8 +365,6 @@ class DBReader(FrozenModel): df = reader.run() """ - AutoDetectHWM = AutoDetectHWM - connection: BaseDBConnection source: str = Field(alias="table") columns: Optional[List[str]] = Field(default=None, min_items=1) @@ -372,9 +373,11 @@ class DBReader(FrozenModel): df_schema: Optional[StructType] = None hwm_column: Optional[Union[str, tuple]] = None hwm_expression: Optional[str] = None - hwm: Optional[ColumnHWM] = None + hwm: Optional[Union[AutoDetectHWM, ColumnHWM, KeyValueHWM]] = None options: Optional[GenericOptions] = None + AutoDetectHWM = AutoDetectHWM + _connection_checked: bool = PrivateAttr(default=False) @validator("source", always=True) @@ -414,7 +417,7 @@ def validate_hwm(cls, values: dict) -> dict: # noqa: WPS231 source: str = values["source"] hwm_column: str | tuple[str, str] | None = values.get("hwm_column") hwm_expression: str | None = values.get("hwm_expression") - hwm: ColumnHWM | None = values.get("hwm") + hwm: HWM | None = values.get("hwm") if hwm_column is not None: if hwm: @@ -501,6 +504,97 @@ def validate_options(cls, options, values): return None + @slot + def has_data(self) -> bool: + """Returns ``True`` if there is some data in the source, ``False`` otherwise. |support_hooks| + + .. note:: + + This method can return different results depending on :ref:`strategy` + + .. warning:: + + If :etl-entities:`hwm ` is used, then this method should be called inside a :ref:`strategy` context. And vice versa: if HWM is not used, this method should not be called within a strategy. + + Raises + ------ + RuntimeError + + Current strategy is not compatible with the HWM parameter. + + Examples + -------- + + .. code:: python + + reader = DBReader(...) + + # handle the situation when there is no data in the source + if reader.has_data(): + df = reader.run() + else: + # implement your handling logic here + ...
+ """ + self._check_strategy() + + if not self._connection_checked: + self._log_parameters() + self.connection.check() + + window, limit = self._calculate_window_and_limit() + if limit == 0: + return False + + df = self.connection.read_source_as_df( + source=str(self.source), + columns=self.columns, + hint=self.hint, + where=self.where, + df_schema=self.df_schema, + window=window, + limit=1, + **self._get_read_kwargs(), + ) + + return bool(df.take(1)) + + @slot + def raise_if_no_data(self) -> None: + """Raises exception ``NoDataError`` if source does not contain any data. |support_hooks| + + .. note:: + + This method can return different results depending on :ref:`strategy` + + .. warning:: + + If :etl-entities:`hwm ` is used, then method should be called inside :ref:`strategy` context. And vise-versa, if HWM is not used, this method should not be called within strategy. + + Raises + ------ + RuntimeError + + Current strategy is not compatible with HWM parameter. + + :obj:`onetl.exception.NoDataError` + + There is no data in source. + + Examples + -------- + + .. code:: python + + reader = DBReader(...) + + # ensure that there is some data in the source before reading it using Spark + reader.raise_if_no_data() + """ + + if not self.has_data(): + raise NoDataError(f"No data in the source: {self.source}") + @slot def run(self) -> DataFrame: """ @@ -510,6 +604,10 @@ def run(self) -> DataFrame: This method can return different results depending on :ref:`strategy` + .. warning:: + + If :etl-entities:`hwm ` is used, then method should be called inside :ref:`strategy` context. And vise-versa, if HWM is not used, this method should not be called within strategy. + Returns ------- df : pyspark.sql.dataframe.DataFrame @@ -541,6 +639,12 @@ def run(self) -> DataFrame: self._connection_checked = True window, limit = self._calculate_window_and_limit() + + # update the HWM with the stop value + if self.hwm and window: + strategy: HWMStrategy = StrategyManager.get_current() # type: ignore[assignment] + strategy.update_hwm(window.stop_at.value) + df = self.connection.read_source_as_df( source=str(self.source), columns=self.columns, @@ -562,7 +666,9 @@ def _check_strategy(self): if self.hwm: if not isinstance(strategy, HWMStrategy): - raise RuntimeError(f"{class_name}(hwm=...) cannot be used with {strategy_name}") + raise RuntimeError( + f"{class_name}(hwm=...) cannot be used with {strategy_name}. 
+                )

             self._prepare_hwm(strategy, self.hwm)

         elif isinstance(strategy, HWMStrategy):
@@ -578,7 +684,7 @@ def _prepare_hwm(self, strategy: HWMStrategy, hwm: ColumnHWM):
             strategy.fetch_hwm()
             return

-        if not isinstance(strategy.hwm, ColumnHWM) or strategy.hwm.name != hwm.name:
+        if not isinstance(strategy.hwm, (ColumnHWM, KeyValueHWM)) or strategy.hwm.name != hwm.name:
             # exception raised when inside one strategy >1 processes on the same table but with different hwm columns
             # are executed, example: test_postgres_strategy_incremental_hwm_set_twice
             error_message = textwrap.dedent(
@@ -660,7 +766,7 @@ def _get_hwm_field(self, hwm: HWM) -> StructField:
         log.info("|%s| Got Spark field: %s", self.__class__.__name__, result)
         return result

-    def _calculate_window_and_limit(self) -> tuple[Window | None, int | None]:
+    def _calculate_window_and_limit(self) -> tuple[Window | None, int | None]:  # noqa: WPS231
         if not self.hwm:
             # SnapshotStrategy - always select all the data from source
             return None, None
@@ -673,7 +779,6 @@ def _calculate_window_and_limit(self) -> tuple[Window | None, int | None]:
         if start_value is not None and stop_value is not None:
             # we already have start and stop values, nothing to do
             window = Window(self.hwm.expression, start_from=strategy.current, stop_at=strategy.next)
-            strategy.update_hwm(window.stop_at.value)
             return window, None

         if not isinstance(self.connection, ContainsGetMinMaxValues):
@@ -737,7 +842,6 @@ def _calculate_window_and_limit(self) -> tuple[Window | None, int | None]:
             stop_at=Edge(value=max_value),
         )

-        strategy.update_hwm(window.stop_at.value)
         return window, None

     def _log_parameters(self) -> None:
diff --git a/onetl/db/db_writer/__init__.py b/onetl/db/db_writer/__init__.py
index 9451a3834..b181c7f04 100644
--- a/onetl/db/db_writer/__init__.py
+++ b/onetl/db/db_writer/__init__.py
@@ -1,15 +1,3 @@
-# Copyright 2023 MTS (Mobile Telesystems)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems)
+# SPDX-License-Identifier: Apache-2.0
 from onetl.db.db_writer.db_writer import DBWriter
diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py
index 5f4264042..e79bb5205 100644
--- a/onetl/db/db_writer/db_writer.py
+++ b/onetl/db/db_writer/db_writer.py
@@ -1,17 +1,5 @@
-# Copyright 2023 MTS (Mobile Telesystems)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from logging import getLogger diff --git a/onetl/exception.py b/onetl/exception.py index ca9b915ca..45ea80020 100644 --- a/onetl/exception.py +++ b/onetl/exception.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 import textwrap from evacuator import NeedEvacuation diff --git a/onetl/file/__init__.py b/onetl/file/__init__.py index 37dbd44a8..a747f037d 100644 --- a/onetl/file/__init__.py +++ b/onetl/file/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.file.file_df_reader import FileDFReader from onetl.file.file_df_writer import FileDFWriter from onetl.file.file_downloader import DownloadResult, FileDownloader diff --git a/onetl/file/file_df_reader/__init__.py b/onetl/file/file_df_reader/__init__.py index 2048b13b2..b7b1ff185 100644 --- a/onetl/file/file_df_reader/__init__.py +++ b/onetl/file/file_df_reader/__init__.py @@ -1,15 +1,3 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.file.file_df_reader.file_df_reader import FileDFReader diff --git a/onetl/file/file_df_reader/file_df_reader.py b/onetl/file/file_df_reader/file_df_reader.py index 8fdef08a4..5b8d9bed6 100644 --- a/onetl/file/file_df_reader/file_df_reader.py +++ b/onetl/file/file_df_reader/file_df_reader.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/file/file_df_reader/options.py b/onetl/file/file_df_reader/options.py index 1eee0b2ca..ea4564549 100644 --- a/onetl/file/file_df_reader/options.py +++ b/onetl/file/file_df_reader/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING, Optional diff --git a/onetl/file/file_df_writer/__init__.py b/onetl/file/file_df_writer/__init__.py index 6fdf94b18..0cbb35bb5 100644 --- a/onetl/file/file_df_writer/__init__.py +++ b/onetl/file/file_df_writer/__init__.py @@ -1,15 +1,3 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.file.file_df_writer.file_df_writer import FileDFWriter diff --git a/onetl/file/file_df_writer/file_df_writer.py b/onetl/file/file_df_writer/file_df_writer.py index f2dd52438..3b7a2724c 100644 --- a/onetl/file/file_df_writer/file_df_writer.py +++ b/onetl/file/file_df_writer/file_df_writer.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/file/file_df_writer/options.py b/onetl/file/file_df_writer/options.py index fece10e8f..01d808135 100644 --- a/onetl/file/file_df_writer/options.py +++ b/onetl/file/file_df_writer/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from collections.abc import Iterator diff --git a/onetl/file/file_downloader/__init__.py b/onetl/file/file_downloader/__init__.py index 232bbccb7..e19f5052d 100644 --- a/onetl/file/file_downloader/__init__.py +++ b/onetl/file/file_downloader/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.file.file_downloader.file_downloader import FileDownloader from onetl.file.file_downloader.options import FileDownloaderOptions from onetl.file.file_downloader.result import DownloadResult diff --git a/onetl/file/file_downloader/file_downloader.py b/onetl/file/file_downloader/file_downloader.py index fa1257303..a3db0ef19 100644 --- a/onetl/file/file_downloader/file_downloader.py +++ b/onetl/file/file_downloader/file_downloader.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/file/file_downloader/options.py b/onetl/file/file_downloader/options.py index a1b635f81..11b776239 100644 --- a/onetl/file/file_downloader/options.py +++ b/onetl/file/file_downloader/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings diff --git a/onetl/file/file_downloader/result.py b/onetl/file/file_downloader/result.py index 2f24f8ad6..7d6233025 100644 --- a/onetl/file/file_downloader/result.py +++ b/onetl/file/file_downloader/result.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from pydantic import Field diff --git a/onetl/file/file_mover/__init__.py b/onetl/file/file_mover/__init__.py index fa416ec6f..a1baa0db4 100644 --- a/onetl/file/file_mover/__init__.py +++ b/onetl/file/file_mover/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.file.file_mover.file_mover import FileMover from onetl.file.file_mover.options import FileMoverOptions from onetl.file.file_mover.result import MoveResult diff --git a/onetl/file/file_mover/file_mover.py b/onetl/file/file_mover/file_mover.py index 8dd48be6f..c39f2170f 100644 --- a/onetl/file/file_mover/file_mover.py +++ b/onetl/file/file_mover/file_mover.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/file/file_mover/options.py b/onetl/file/file_mover/options.py index a3a89727c..78a6fd63d 100644 --- a/onetl/file/file_mover/options.py +++ b/onetl/file/file_mover/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings diff --git a/onetl/file/file_mover/result.py b/onetl/file/file_mover/result.py index 003dc20d4..59fc433e8 100644 --- a/onetl/file/file_mover/result.py +++ b/onetl/file/file_mover/result.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from pydantic import Field diff --git a/onetl/file/file_result.py b/onetl/file/file_result.py index a54417c97..0ba074288 100644 --- a/onetl/file/file_result.py +++ b/onetl/file/file_result.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/file/file_set.py b/onetl/file/file_set.py index 121dcfc12..4cbf86ea5 100644 --- a/onetl/file/file_set.py +++ b/onetl/file/file_set.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 import os import textwrap from typing import Generic, TypeVar diff --git a/onetl/file/file_uploader/__init__.py b/onetl/file/file_uploader/__init__.py index 7f24451f7..85fc5fcd7 100644 --- a/onetl/file/file_uploader/__init__.py +++ b/onetl/file/file_uploader/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.file.file_uploader.file_uploader import FileUploader from onetl.file.file_uploader.options import FileUploaderOptions from onetl.file.file_uploader.result import UploadResult diff --git a/onetl/file/file_uploader/file_uploader.py b/onetl/file/file_uploader/file_uploader.py index e2ec9cc48..d4e77ecdd 100644 --- a/onetl/file/file_uploader/file_uploader.py +++ b/onetl/file/file_uploader/file_uploader.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/file/file_uploader/options.py b/onetl/file/file_uploader/options.py index 5d1f1dbdd..af4e588fe 100644 --- a/onetl/file/file_uploader/options.py +++ b/onetl/file/file_uploader/options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import warnings diff --git a/onetl/file/file_uploader/result.py b/onetl/file/file_uploader/result.py index 9b348ca28..48fbbcc85 100644 --- a/onetl/file/file_uploader/result.py +++ b/onetl/file/file_uploader/result.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from pydantic import Field diff --git a/onetl/file/filter/__init__.py b/onetl/file/filter/__init__.py index a2f218d7a..1ebee0306 100644 --- a/onetl/file/filter/__init__.py +++ b/onetl/file/filter/__init__.py @@ -1,18 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.file.filter.exclude_dir import ExcludeDir from onetl.file.filter.file_hwm import FileHWMFilter from onetl.file.filter.glob import Glob diff --git a/onetl/file/filter/exclude_dir.py b/onetl/file/filter/exclude_dir.py index 820f8f278..33b387623 100644 --- a/onetl/file/filter/exclude_dir.py +++ b/onetl/file/filter/exclude_dir.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/file/filter/file_hwm.py b/onetl/file/filter/file_hwm.py index 395443fc3..232bf451a 100644 --- a/onetl/file/filter/file_hwm.py +++ b/onetl/file/filter/file_hwm.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from etl_entities.hwm import FileHWM diff --git a/onetl/file/filter/glob.py b/onetl/file/filter/glob.py index 56d162c27..045ee1bcc 100644 --- a/onetl/file/filter/glob.py +++ b/onetl/file/filter/glob.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import glob diff --git a/onetl/file/filter/match_all_filters.py b/onetl/file/filter/match_all_filters.py index 9686d3fad..e5c6a7d56 100644 --- a/onetl/file/filter/match_all_filters.py +++ b/onetl/file/filter/match_all_filters.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 import logging from typing import Iterable diff --git a/onetl/file/filter/regexp.py b/onetl/file/filter/regexp.py index 0f1bbd83b..e07ba7039 100644 --- a/onetl/file/filter/regexp.py +++ b/onetl/file/filter/regexp.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/file/format/__init__.py b/onetl/file/format/__init__.py index 74475c8a9..f149f5c2a 100644 --- a/onetl/file/format/__init__.py +++ b/onetl/file/format/__init__.py @@ -1,18 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.file.format.avro import Avro from onetl.file.format.csv import CSV from onetl.file.format.excel import Excel diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index b07c34b01..04d07af84 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import json @@ -126,8 +114,8 @@ class Config: @classmethod def get_packages( cls, - spark_version: str, - scala_version: str | None = None, + spark_version: str | Version, + scala_version: str | Version | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. 
|support_hooks| diff --git a/onetl/file/format/csv.py b/onetl/file/format/csv.py index d54e9747b..e21eb5d82 100644 --- a/onetl/file/format/csv.py +++ b/onetl/file/format/csv.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING, ClassVar diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py index 3f5b2bdcf..a7342f702 100644 --- a/onetl/file/format/excel.py +++ b/onetl/file/format/excel.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging @@ -126,9 +114,9 @@ class Config: @classmethod def get_packages( cls, - spark_version: str, - scala_version: str | None = None, - package_version: str | None = None, + spark_version: str | Version, + scala_version: str | Version | None = None, + package_version: str | Version | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| @@ -149,7 +137,7 @@ def get_packages( If ``None``, ``spark_version`` is used to determine Scala version. - version: str, optional + package_version : str, optional Package version in format ``major.minor.patch``. Default is ``0.20.3``. .. warning:: @@ -187,7 +175,7 @@ def get_packages( raise ValueError(f"Package version should be at least 0.15, got {package_version}") log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) else: - version = Version.parse("0.20.3") + version = Version(0, 20, 3) spark_ver = Version.parse(spark_version) if spark_ver < (3, 2): diff --git a/onetl/file/format/file_format.py b/onetl/file/format/file_format.py index 15fd6e8e6..4a7de4f3e 100644 --- a/onetl/file/format/file_format.py +++ b/onetl/file/format/file_format.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING, ClassVar, TypeVar diff --git a/onetl/file/format/json.py b/onetl/file/format/json.py index 58b18e2f8..3ea82d7e6 100644 --- a/onetl/file/format/json.py +++ b/onetl/file/format/json.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING, ClassVar diff --git a/onetl/file/format/jsonline.py b/onetl/file/format/jsonline.py index 7dc0c1eb0..9bfd84159 100644 --- a/onetl/file/format/jsonline.py +++ b/onetl/file/format/jsonline.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING, ClassVar diff --git a/onetl/file/format/orc.py b/onetl/file/format/orc.py index 0a4d0fc39..a292b6c83 100644 --- a/onetl/file/format/orc.py +++ b/onetl/file/format/orc.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING, ClassVar diff --git a/onetl/file/format/parquet.py b/onetl/file/format/parquet.py index 008a0060a..460819463 100644 --- a/onetl/file/format/parquet.py +++ b/onetl/file/format/parquet.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import TYPE_CHECKING, ClassVar diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 1a4c99803..794593d34 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging @@ -150,9 +138,9 @@ class Config: @classmethod def get_packages( # noqa: WPS231 cls, - spark_version: str, - scala_version: str | None = None, - package_version: str | None = None, + spark_version: str | Version, + scala_version: str | Version | None = None, + package_version: str | Version | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| @@ -167,7 +155,7 @@ def get_packages( # noqa: WPS231 If ``None``, ``spark_version`` is used to determine Scala version. - version: str, optional + package_version : str, optional Package version in format ``major.minor.patch``. Default is ``0.17.0``. See `Maven index `_ @@ -205,7 +193,7 @@ def get_packages( # noqa: WPS231 raise ValueError(f"Package version must be above 0.13, got {version}") log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) else: - version = Version.parse("0.17.0") + version = Version(0, 17, 0) spark_ver = Version.parse(spark_version) scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) diff --git a/onetl/file/limit/__init__.py b/onetl/file/limit/__init__.py index 4f5d0d760..1e00ffddc 100644 --- a/onetl/file/limit/__init__.py +++ b/onetl/file/limit/__init__.py @@ -1,18 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.file.limit.limits_reached import limits_reached from onetl.file.limit.limits_stop_at import limits_stop_at from onetl.file.limit.max_files_count import MaxFilesCount diff --git a/onetl/file/limit/limits_reached.py b/onetl/file/limit/limits_reached.py index f750d3c1e..fe3d98b85 100644 --- a/onetl/file/limit/limits_reached.py +++ b/onetl/file/limit/limits_reached.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/file/limit/limits_stop_at.py b/onetl/file/limit/limits_stop_at.py index 2ad2f63e1..472c94426 100644 --- a/onetl/file/limit/limits_stop_at.py +++ b/onetl/file/limit/limits_stop_at.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/file/limit/max_files_count.py b/onetl/file/limit/max_files_count.py index cdbc2e5ea..5f2f584d6 100644 --- a/onetl/file/limit/max_files_count.py +++ b/onetl/file/limit/max_files_count.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/file/limit/reset_limits.py b/onetl/file/limit/reset_limits.py index 5a648752c..4a30ca445 100644 --- a/onetl/file/limit/reset_limits.py +++ b/onetl/file/limit/reset_limits.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/hooks/__init__.py b/onetl/hooks/__init__.py index 2c35260ef..6002b402a 100644 --- a/onetl/hooks/__init__.py +++ b/onetl/hooks/__init__.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.hooks.hook import HookPriority, hook from onetl.hooks.hooks_state import resume_all_hooks, skip_all_hooks, stop_all_hooks from onetl.hooks.slot import slot diff --git a/onetl/hooks/hook.py b/onetl/hooks/hook.py index e49039a3c..c0e9a9f25 100644 --- a/onetl/hooks/hook.py +++ b/onetl/hooks/hook.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging @@ -62,8 +64,7 @@ class Hook(Generic[T]): # noqa: WPS338 from onetl.hooks.hook import Hook, HookPriority - def some_func(*args, **kwargs): - ... + def some_func(*args, **kwargs): ... hook = Hook(callback=some_func, enabled=True, priority=HookPriority.FIRST) @@ -209,8 +210,7 @@ def __call__(self, *args, **kwargs) -> T | ContextDecorator: from onetl.hooks.hook import Hook, HookPriority - def some_func(*args, **kwargs): - ... + def some_func(*args, **kwargs): ... hook = Hook(callback=some_func) @@ -231,8 +231,7 @@ class CanProcessResult(Protocol): allowing it to process result of original method call and modify/replace it with something else. """ - def process_result(self, result: T) -> T: - ... + def process_result(self, result: T) -> T: ... 
class ContextDecorator: diff --git a/onetl/hooks/hook_collection.py b/onetl/hooks/hook_collection.py index eb64c0bbe..6c6643122 100644 --- a/onetl/hooks/hook_collection.py +++ b/onetl/hooks/hook_collection.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/hooks/hooks_state.py b/onetl/hooks/hooks_state.py index b5f9b8cf7..de46b33c0 100644 --- a/onetl/hooks/hooks_state.py +++ b/onetl/hooks/hooks_state.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/hooks/method_inheritance_stack.py b/onetl/hooks/method_inheritance_stack.py index cb755a317..c65a2924e 100644 --- a/onetl/hooks/method_inheritance_stack.py +++ b/onetl/hooks/method_inheritance_stack.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from collections import defaultdict diff --git a/onetl/hooks/slot.py b/onetl/hooks/slot.py index b6f80efe9..cf9171f63 100644 --- a/onetl/hooks/slot.py +++ b/onetl/hooks/slot.py @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import inspect @@ -139,8 +141,7 @@ def _prepare_hook_args( @support_hooks class MyClass: @slot - def method(self, some, named="abc"): - ... + def method(self, some, named="abc"): ... then hook should have a compatible signature, like these ones: @@ -148,22 +149,19 @@ def method(self, some, named="abc"): @MyClass.method.bind @hook - def callback(self, some, named): - ... + def callback(self, some, named): ... .. code:: python @MyClass.method.bind @hook - def callback(self, some, **kwargs): - ... + def callback(self, some, **kwargs): ... .. code:: python @MyClass.method.bind @hook - def callback(my_class_instance, *args, **kwargs): - ... + def callback(my_class_instance, *args, **kwargs): ... .. note:: @@ -469,8 +467,7 @@ def is_slot(method: Callable) -> bool: class Slot(Protocol): """Protocol which is implemented by a method after applying :obj:`~slot` decorator.""" - def __call__(self, *args, **kwargs): - ... + def __call__(self, *args, **kwargs): ... @property def __hooks__(self) -> HookCollection: @@ -561,14 +558,12 @@ def suspend_hooks(self): @support_hooks class MyClass: @slot - def my_method(self, arg): - ... + def my_method(self, arg): ... @MyClass.my_method.bind @hook - def callback1(self, arg): - ... + def callback1(self, arg): ... obj = MyClass() @@ -598,14 +593,12 @@ def resume_hooks(self): @support_hooks class MyClass: @slot - def my_method(self, arg): - ... + def my_method(self, arg): ... @MyClass.my_method.bind @hook - def callback1(self, arg): - ... + def callback1(self, arg): ... obj = MyClass() @@ -619,8 +612,7 @@ def callback1(self, arg): """ @wraps(bind_hook) - def bind(self): - ... + def bind(self): ... def slot(method: Method) -> Method: @@ -656,36 +648,30 @@ def slot(method: Method) -> Method: @support_hooks class MyClass: @slot - def my_method(self, arg): - ... + def my_method(self, arg): ... @slot # decorator should be on top of all other decorators @classmethod - def class_method(cls): - ... + def class_method(cls): ... @slot # decorator should be on top of all other decorators @staticmethod - def static_method(arg): - ... + def static_method(arg): ... 
     @MyClass.my_method.bind
     @hook
-    def callback1(self, arg):
-        ...
+    def callback1(self, arg): ...


     @MyClass.class_method.bind
     @hook
-    def callback2(cls):
-        ...
+    def callback2(cls): ...


     @MyClass.static_method.bind
     @hook
-    def callback3(arg):
-        ...
+    def callback3(arg): ...


     obj = MyClass()
diff --git a/onetl/hooks/support_hooks.py b/onetl/hooks/support_hooks.py
index 2500653f4..052ac64fd 100644
--- a/onetl/hooks/support_hooks.py
+++ b/onetl/hooks/support_hooks.py
@@ -1,3 +1,5 @@
+# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems)
+# SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations

 import logging
@@ -111,13 +113,11 @@ def suspend_hooks(cls: type) -> None:
     @support_hooks
     class MyClass:
         @slot
-        def my_method(self, arg):
-            ...
+        def my_method(self, arg): ...


     @MyClass.my_method.hook
-    def callback(self, arg):
-        ...
+    def callback(self, arg): ...


     obj = MyClass()
@@ -146,13 +146,11 @@ def resume_hooks(cls: type) -> None:
     @support_hooks
     class MyClass:
         @slot
-        def my_method(self, arg):
-            ...
+        def my_method(self, arg): ...


     @MyClass.my_method.hook
-    def callback(self, arg):
-        ...
+    def callback(self, arg): ...


     obj = MyClass()
@@ -190,13 +188,11 @@ def support_hooks(cls: Klass) -> Klass:
     @support_hooks
     class MyClass:
         @slot
-        def my_method(self, arg):
-            ...
+        def my_method(self, arg): ...


     @MyClass.my_method.hook
-    def callback(self, arg):
-        ...
+    def callback(self, arg): ...


     MyClass().my_method()  # will execute callback function
diff --git a/onetl/hwm/__init__.py b/onetl/hwm/__init__.py
index 333d3958a..e516ebead 100644
--- a/onetl/hwm/__init__.py
+++ b/onetl/hwm/__init__.py
@@ -1,2 +1,4 @@
+# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems)
+# SPDX-License-Identifier: Apache-2.0
 from onetl.hwm.auto_hwm import AutoDetectHWM
 from onetl.hwm.window import Edge, Window
diff --git a/onetl/hwm/auto_hwm.py b/onetl/hwm/auto_hwm.py
index aae9c89fd..26ee406c8 100644
--- a/onetl/hwm/auto_hwm.py
+++ b/onetl/hwm/auto_hwm.py
@@ -1,21 +1,38 @@
-# Copyright 2023 MTS (Mobile Telesystems)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from etl_entities.hwm import ColumnHWM
+# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems)
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+from typing import Any
+
+from etl_entities.hwm import HWM
+from pydantic import root_validator
 from typing_extensions import Literal


-class AutoDetectHWM(ColumnHWM):
+class AutoDetectHWM(HWM):
     value: Literal[None] = None
+
+    @root_validator(pre=True)
+    def handle_aliases(cls, values):
+        # this validator is a hack for accommodating multiple aliases for a single field in pydantic v1.

+        # 'source' is an alias used specifically for instances of the ColumnHWM class.
+        if "source" in values and "entity" not in values:
+            values["entity"] = values.pop("source")
+
+        # 'topic' is an alias used for instances of the KeyValueHWM class.
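+        # (in both branches the alias is applied only when 'entity' itself was not passed)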
+        elif "topic" in values and "entity" not in values:
+            values["entity"] = values.pop("topic")
+
+        return values
+
+    def update(self: AutoDetectHWM, value: Any) -> AutoDetectHWM:
+        """Update current HWM value with some implementation-specific logic, and return HWM"""
+        raise NotImplementedError("update method should be implemented in auto-detected subclasses")
+
+    def dict(self, **kwargs):
+        serialized_data = super().dict(**kwargs)
+        # as the default value for 'value' in HWM classes may be any structure,
+        # e.g. frozendict for KeyValueHWM, there should be a unified dict representation
+        serialized_data.pop("value")
+        return serialized_data
diff --git a/onetl/hwm/store/__init__.py b/onetl/hwm/store/__init__.py
index 4f71ba0d5..0e34caa00 100644
--- a/onetl/hwm/store/__init__.py
+++ b/onetl/hwm/store/__init__.py
@@ -1,16 +1,5 @@
-# Copyright 2023 MTS (Mobile Telesystems)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems)
+# SPDX-License-Identifier: Apache-2.0
 import textwrap
 import warnings
 from importlib import import_module
diff --git a/onetl/hwm/store/hwm_class_registry.py b/onetl/hwm/store/hwm_class_registry.py
index d9a2ce674..ccd074e40 100644
--- a/onetl/hwm/store/hwm_class_registry.py
+++ b/onetl/hwm/store/hwm_class_registry.py
@@ -1,17 +1,5 @@
-# Copyright 2023 MTS (Mobile Telesystems)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems)
+# SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations

 from typing import ClassVar
@@ -77,8 +65,7 @@ def register_spark_type_to_hwm_type_mapping(*type_names: str):

         @register_spark_type_to_hwm_type_mapping("somename", "anothername")
-        class MyHWM(HWM):
-            ...
+        class MyHWM(HWM): ...

         assert SparkTypeToHWM.get("somename") == MyHWM
diff --git a/onetl/hwm/store/yaml_hwm_store.py b/onetl/hwm/store/yaml_hwm_store.py
index 31064d572..759637bab 100644
--- a/onetl/hwm/store/yaml_hwm_store.py
+++ b/onetl/hwm/store/yaml_hwm_store.py
@@ -1,17 +1,5 @@
-# Copyright 2023 MTS (Mobile Telesystems)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import operator @@ -50,8 +38,7 @@ def default_hwm_store_class(klass: type[BaseHWMStore]) -> type[BaseHWMStore]: @default_hwm_store_class - class MyClass(BaseHWMStore): - ... + class MyClass(BaseHWMStore): ... HWMStoreClassRegistry.get() == MyClass # default diff --git a/onetl/hwm/window.py b/onetl/hwm/window.py index f4eba5bc7..7a902000f 100644 --- a/onetl/hwm/window.py +++ b/onetl/hwm/window.py @@ -1,18 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from dataclasses import dataclass, field diff --git a/onetl/impl/__init__.py b/onetl/impl/__init__.py index 12319f92b..76d32f48f 100644 --- a/onetl/impl/__init__.py +++ b/onetl/impl/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.impl.base_model import BaseModel from onetl.impl.failed_local_file import FailedLocalFile from onetl.impl.file_exist_behavior import FileExistBehavior diff --git a/onetl/impl/base_model.py b/onetl/impl/base_model.py index 91b028eac..cf76cd48c 100644 --- a/onetl/impl/base_model.py +++ b/onetl/impl/base_model.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import ClassVar diff --git a/onetl/impl/failed_local_file.py b/onetl/impl/failed_local_file.py index f9de4eda5..34a12bb97 100644 --- a/onetl/impl/failed_local_file.py +++ b/onetl/impl/failed_local_file.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/impl/file_exist_behavior.py b/onetl/impl/file_exist_behavior.py index 5bf82de55..1081aa0e3 100644 --- a/onetl/impl/file_exist_behavior.py +++ b/onetl/impl/file_exist_behavior.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 import logging import warnings from enum import Enum diff --git a/onetl/impl/frozen_model.py b/onetl/impl/frozen_model.py index 5f155abbb..10cb06a64 100644 --- a/onetl/impl/frozen_model.py +++ b/onetl/impl/frozen_model.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from onetl.impl.base_model import BaseModel @@ -19,4 +7,5 @@ class FrozenModel(BaseModel): class Config: + smart_union = True frozen = True diff --git a/onetl/impl/generic_options.py b/onetl/impl/generic_options.py index 84826df17..057e69a65 100644 --- a/onetl/impl/generic_options.py +++ b/onetl/impl/generic_options.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/impl/local_path.py b/onetl/impl/local_path.py index 9708fc200..0dc709860 100644 --- a/onetl/impl/local_path.py +++ b/onetl/impl/local_path.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 import os import sys from pathlib import Path, PurePosixPath, PureWindowsPath diff --git a/onetl/impl/path_container.py b/onetl/impl/path_container.py index 1f00610a2..85ee7e82a 100644 --- a/onetl/impl/path_container.py +++ b/onetl/impl/path_container.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/impl/path_repr.py b/onetl/impl/path_repr.py index cdf89da45..5a61b7406 100644 --- a/onetl/impl/path_repr.py +++ b/onetl/impl/path_repr.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/impl/remote_directory.py b/onetl/impl/remote_directory.py index 7e48303b9..2b52346e7 100644 --- a/onetl/impl/remote_directory.py +++ b/onetl/impl/remote_directory.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/impl/remote_file.py b/onetl/impl/remote_file.py index 6044b1602..86193d440 100644 --- a/onetl/impl/remote_file.py +++ b/onetl/impl/remote_file.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import os diff --git a/onetl/impl/remote_path.py b/onetl/impl/remote_path.py index d8d065904..78fce3948 100644 --- a/onetl/impl/remote_path.py +++ b/onetl/impl/remote_path.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from pathlib import PurePosixPath diff --git a/onetl/impl/remote_path_stat.py b/onetl/impl/remote_path_stat.py index 95b34482a..859748034 100644 --- a/onetl/impl/remote_path_stat.py +++ b/onetl/impl/remote_path_stat.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Optional, Union diff --git a/onetl/log.py b/onetl/log.py index 158e6e1b6..c97fe6620 100644 --- a/onetl/log.py +++ b/onetl/log.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import io diff --git a/onetl/plugins/__init__.py b/onetl/plugins/__init__.py index c80146bf8..8127ff0d4 100644 --- a/onetl/plugins/__init__.py +++ b/onetl/plugins/__init__.py @@ -1 +1,3 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.plugins.import_plugins import import_plugins diff --git a/onetl/plugins/import_plugins.py b/onetl/plugins/import_plugins.py index c0b480b6c..f22bd2805 100644 --- a/onetl/plugins/import_plugins.py +++ b/onetl/plugins/import_plugins.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import inspect diff --git a/onetl/strategy/__init__.py b/onetl/strategy/__init__.py index 3842b6864..1a0ff4643 100644 --- a/onetl/strategy/__init__.py +++ b/onetl/strategy/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from onetl.strategy.base_strategy import BaseStrategy from onetl.strategy.incremental_strategy import ( IncrementalBatchStrategy, diff --git a/onetl/strategy/base_strategy.py b/onetl/strategy/base_strategy.py index e22ebd5ca..0daf309df 100644 --- a/onetl/strategy/base_strategy.py +++ b/onetl/strategy/base_strategy.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/strategy/batch_hwm_strategy.py b/onetl/strategy/batch_hwm_strategy.py index 711a4ccff..dc02fb29a 100644 --- a/onetl/strategy/batch_hwm_strategy.py +++ b/onetl/strategy/batch_hwm_strategy.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging @@ -142,6 +130,9 @@ def check_hwm_increased(self, next_value: Any) -> None: @property def next(self) -> Edge: if self.current.is_set(): + if not hasattr(self.current.value, "__add__"): + raise RuntimeError(f"HWM: {self.hwm!r} cannot be used with Batch strategies") + result = Edge(value=self.current.value + self.step) else: result = Edge(value=self.stop) diff --git a/onetl/strategy/hwm_store/__init__.py b/onetl/strategy/hwm_store/__init__.py index 124fd3ccc..0b931301e 100644 --- a/onetl/strategy/hwm_store/__init__.py +++ b/onetl/strategy/hwm_store/__init__.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 # TODO: remove in 1.0.0 import textwrap diff --git a/onetl/strategy/hwm_strategy.py b/onetl/strategy/hwm_strategy.py index c50c74372..02249554d 100644 --- a/onetl/strategy/hwm_strategy.py +++ b/onetl/strategy/hwm_strategy.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/strategy/incremental_strategy.py b/onetl/strategy/incremental_strategy.py index 04cc87903..8946f7d5d 100644 --- a/onetl/strategy/incremental_strategy.py +++ b/onetl/strategy/incremental_strategy.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from typing import Any, Optional @@ -294,6 +282,40 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): FROM public.mydata WHERE business_dt > CAST('2021-01-09' AS DATE); -- from HWM-offset (EXCLUDING first row) + Incremental run with :ref:`db-reader` and :ref:`kafka` connection + (by ``offset`` in topic - :etl-entities:`KeyValueHWM `): + + .. code:: python + + from onetl.connection import Kafka + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + from onetl.hwm import AutoDetectHWM + + from pyspark.sql import SparkSession + + maven_packages = Kafka.get_packages() + spark = ( + SparkSession.builder.appName("spark-app-name") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + + kafka = Kafka( + addresses=["mybroker:9092", "anotherbroker:9092"], + cluster="my-cluster", + spark=spark, + ) + + reader = DBReader( + connection=kafka, + source="topic_name", + hwm=DBReader.AutoDetectHWM(name="some_hwm_name", expression="offset"), + ) + + with IncrementalStrategy(): + df = reader.run() + Incremental run with :ref:`file-downloader` and ``hwm=FileListHWM(...)``: .. 
code:: python diff --git a/onetl/strategy/snapshot_strategy.py b/onetl/strategy/snapshot_strategy.py index 3bf6913cb..49bce44dd 100644 --- a/onetl/strategy/snapshot_strategy.py +++ b/onetl/strategy/snapshot_strategy.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/strategy/strategy_manager.py b/onetl/strategy/strategy_manager.py index 3f5470fe1..bb380ae54 100644 --- a/onetl/strategy/strategy_manager.py +++ b/onetl/strategy/strategy_manager.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import logging diff --git a/onetl/version.py b/onetl/version.py index ee0661f89..dada22dd7 100644 --- a/onetl/version.py +++ b/onetl/version.py @@ -1,17 +1,5 @@ -# Copyright 2023 MTS (Mobile Telesystems) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 """ __version__ parameter required to be able to output to the console """ diff --git a/requirements/core.txt b/requirements/core.txt index 01398e154..f0ca45793 100644 --- a/requirements/core.txt +++ b/requirements/core.txt @@ -1,5 +1,5 @@ deprecated -etl-entities>=2.1.2,<2.2 +etl-entities>=2.2,<2.3 evacuator>=1.0,<1.1 frozendict humanize diff --git a/requirements/docs-plantuml.txt b/requirements/docs-plantuml.txt new file mode 100644 index 000000000..efaaff7ac --- /dev/null +++ b/requirements/docs-plantuml.txt @@ -0,0 +1,2 @@ +--no-deps +sphinx-plantuml diff --git a/requirements/docs.txt b/requirements/docs.txt index f6062e62f..fedf2562a 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -7,7 +7,8 @@ sphinx sphinx-copybutton sphinx-design sphinx-favicon -sphinx-plantuml +# TODO: uncomment after https://github.com/zqmillet/sphinx-plantuml/pull/4 +# sphinx-plantuml sphinx-tabs sphinx-toolbox sphinx_substitution_extensions diff --git a/requirements/tests/base.txt b/requirements/tests/base.txt index fd01d6f52..17e7ef712 100644 --- a/requirements/tests/base.txt +++ b/requirements/tests/base.txt @@ -1,6 +1,6 @@ coverage omegaconf -pytest +pytest<8 pytest-lazy-fixture pytest-mock pytest-rerunfailures diff --git a/setup.cfg b/setup.cfg index 435f97203..d73a28390 100644 --- a/setup.cfg +++ b/setup.cfg @@ -268,7 +268,11 @@ ignore = # WPS201 Found module with too many imports: 26 > 25 WPS201, # WPS429 Found multiple assign targets - WPS429 + WPS429, +# E704 multiple statements on one line: def func(): ... + E704, +# WPS220 Found too deep nesting: 46 > 20 + WPS220 # http://flake8.pycqa.org/en/latest/user/options.html?highlight=per-file-ignores#cmdoption-flake8-per-file-ignores per-file-ignores = diff --git a/setup.py b/setup.py index 6cff75434..d428bd826 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def parse_requirements(file: Path) -> list[str]: description="One ETL tool to rule them all", long_description=long_description, long_description_content_type="text/x-rst", - license="Apache License 2.0", + license="Apache-2.0", license_files=("LICENSE.txt",), url="https://github.com/MobileTeleSystems/onetl", classifiers=[ @@ -85,6 +85,7 @@ def parse_requirements(file: Path) -> list[str]: }, keywords=["Spark", "ETL", "JDBC", "HWM"], packages=find_packages(exclude=["docs", "docs.*", "tests", "tests.*"]), + entry_points={"tricoder_package_spy.register": ["onetl=onetl"]}, python_requires=">=3.7", install_requires=requirements_core, extras_require={ diff --git a/tests/fixtures/processing/base_processing.py b/tests/fixtures/processing/base_processing.py index e45d3d883..c3159f334 100644 --- a/tests/fixtures/processing/base_processing.py +++ b/tests/fixtures/processing/base_processing.py @@ -35,8 +35,7 @@ def create_schema_ddl( def create_schema( self, schema: str, - ) -> None: - ... + ) -> None: ... def create_table_ddl( self, @@ -53,8 +52,7 @@ def create_table( table: str, fields: dict[str, str], schema: str, - ) -> None: - ... + ) -> None: ... def drop_database_ddl( self, @@ -66,8 +64,7 @@ def drop_database_ddl( def drop_database( self, schema: str, - ) -> None: - ... + ) -> None: ... def drop_table_ddl( self, @@ -81,8 +78,7 @@ def drop_table( self, table: str, schema: str, - ) -> None: - ... + ) -> None: ... @abstractmethod def insert_data( @@ -90,8 +86,7 @@ def insert_data( schema: str, table: str, values: list, - ) -> None: - ... + ) -> None: ... 
def get_expected_dataframe_ddl( self, @@ -112,8 +107,7 @@ def get_expected_dataframe( schema: str, table: str, order_by: str | None = None, - ) -> pandas.DataFrame: - ... + ) -> pandas.DataFrame: ... def get_column_type(self, name: str) -> str: return self._column_types_and_names_matching[name] diff --git a/tests/fixtures/processing/fixtures.py b/tests/fixtures/processing/fixtures.py index 38b614441..3f541f692 100644 --- a/tests/fixtures/processing/fixtures.py +++ b/tests/fixtures/processing/fixtures.py @@ -76,3 +76,35 @@ def load_table_data(prepare_schema_table, processing): ) return prepare_schema_table + + +@pytest.fixture +def kafka_topic(processing, request): + topic = secrets.token_hex(6) + processing.create_topic(topic, num_partitions=1) + + def delete_topic(): + processing.delete_topic([topic]) + + request.addfinalizer(delete_topic) + return topic + + +@pytest.fixture +def kafka_dataframe_schema(): + from pyspark.sql.types import ( + FloatType, + LongType, + StringType, + StructField, + StructType, + ) + + return StructType( + [ + StructField("id_int", LongType(), nullable=True), + StructField("text_string", StringType(), nullable=True), + StructField("hwm_int", LongType(), nullable=True), + StructField("float_value", FloatType(), nullable=True), + ], + ) diff --git a/tests/fixtures/processing/kafka.py b/tests/fixtures/processing/kafka.py index e80371ea9..bddb3490b 100644 --- a/tests/fixtures/processing/kafka.py +++ b/tests/fixtures/processing/kafka.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import os from typing import TYPE_CHECKING @@ -17,6 +18,17 @@ class KafkaProcessing(BaseProcessing): column_names: list[str] = ["id_int", "text_string", "hwm_int", "float_value"] + def __enter__(self): + return self + + def __exit__(self, _exc_type, _exc_value, _traceback): + return False + + @property + def schema(self) -> str: + # Kafka does not support schemas + return "" + def get_consumer(self): from confluent_kafka import Consumer @@ -111,11 +123,47 @@ def get_expected_df(self, topic: str, num_messages: int = 1, timeout: float = DE def insert_data(self, schema: str, table: str, values: list) -> None: pass + def change_topic_partitions(self, topic: str, num_partitions: int, timeout: float = DEFAULT_TIMEOUT): + from confluent_kafka.admin import NewPartitions + + admin_client = self.get_admin_client() + + if not self.topic_exists(topic): + self.create_topic(topic, num_partitions) + else: + new_partitions = [NewPartitions(topic, num_partitions)] + # change the number of partitions + fs = admin_client.create_partitions(new_partitions, request_timeout=timeout) + + for topic, f in fs.items(): + try: + f.result() + except Exception as e: + raise Exception(f"Failed to update number of partitions for topic '{topic}': {e}") # noqa: WPS454 + + def create_topic(self, topic: str, num_partitions: int, timeout: float = DEFAULT_TIMEOUT): + from confluent_kafka.admin import KafkaException, NewTopic + + admin_client = self.get_admin_client() + topic_config = NewTopic(topic, num_partitions=num_partitions, replication_factor=1) + fs = admin_client.create_topics([topic_config], request_timeout=timeout) + + for topic, f in fs.items(): + try: + f.result() + except Exception as e: + raise KafkaException(f"Error creating topic '{topic}': {e}") + def delete_topic(self, topics: list[str], timeout: float = DEFAULT_TIMEOUT): admin = self.get_admin_client() # https://github.com/confluentinc/confluent-kafka-python/issues/813 admin.delete_topics(topics, request_timeout=timeout) + def 
insert_pandas_df_into_topic(self, df: pandas.DataFrame, topic: str): + for _, row in df.iterrows(): + message = json.dumps(row.to_dict()) + self.send_message(topic, message.encode("utf-8")) + def topic_exists(self, topic: str, timeout: float = DEFAULT_TIMEOUT) -> bool: admin = self.get_admin_client() topic_metadata = admin.list_topics(timeout=timeout) diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index 2dbe213a0..3196210e1 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -72,7 +72,7 @@ def maven_packages(): packages.extend(SparkS3.get_packages(spark_version=pyspark_version)) # There is no XML files support for Spark less than 3 - packages.extend(XML.get_packages(pyspark_version)) + packages.extend(XML.get_packages(spark_version=pyspark_version)) # There is no MongoDB connector for Spark less than 3.2 packages.extend(MongoDB.get_packages(spark_version=pyspark_version)) diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py index 692c3ea45..e38de7413 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py @@ -338,6 +338,11 @@ def test_clickhouse_reader_snapshot_nothing_to_read(spark, processing, prepare_s first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end) + with pytest.raises(Exception, match="No data in the source:"): + reader.raise_if_no_data() + + assert not reader.has_data() + # no data yet, nothing to read df = reader.run() assert not df.count() @@ -352,8 +357,12 @@ def test_clickhouse_reader_snapshot_nothing_to_read(spark, processing, prepare_s # .run() is not called, but dataframes are lazy, so it now contains all data from the source processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") + # check that read df has data + assert reader.has_data() + # read data explicitly df = reader.run() + processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") # insert second span diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_greenplum_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_greenplum_reader_integration.py index 0e60a6689..f0ede8a82 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_greenplum_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_greenplum_reader_integration.py @@ -280,6 +280,11 @@ def test_greenplum_reader_snapshot_nothing_to_read(spark, processing, prepare_sc first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end) + with pytest.raises(Exception, match="No data in the source:"): + reader.raise_if_no_data() + + assert not reader.has_data() + # no data yet, nothing to read df = reader.run() assert not df.count() @@ -294,8 +299,12 @@ def test_greenplum_reader_snapshot_nothing_to_read(spark, processing, prepare_sc # .run() is not called, but dataframes are lazy, so it now contains all 
data from the source processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") + # check that read df has data + assert reader.has_data() + # read data explicitly df = reader.run() + processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") # insert second span diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_hive_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_hive_reader_integration.py index 3a3a0a8a0..7831f835b 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_hive_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_hive_reader_integration.py @@ -220,6 +220,11 @@ def test_hive_reader_snapshot_nothing_to_read(spark, processing, prepare_schema_ first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end) + with pytest.raises(Exception, match="No data in the source:"): + reader.raise_if_no_data() + + assert not reader.has_data() + # no data yet, nothing to read df = reader.run() assert not df.count() @@ -234,8 +239,12 @@ def test_hive_reader_snapshot_nothing_to_read(spark, processing, prepare_schema_ # .run() is not called, but dataframes are lazy, so it now contains all data from the source processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") + # check that read df has data + assert reader.has_data() + # read data explicitly df = reader.run() + processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") # insert second span diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py index 635e257b9..86f2b7e98 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py @@ -1,6 +1,3 @@ -import json -import secrets - import pytest try: @@ -15,26 +12,6 @@ pytestmark = [pytest.mark.kafka, pytest.mark.db_connection, pytest.mark.connection] -@pytest.fixture(name="schema") -def dataframe_schema(): - from pyspark.sql.types import ( - FloatType, - LongType, - StringType, - StructField, - StructType, - ) - - return StructType( - [ - StructField("id_int", LongType(), nullable=True), - StructField("text_string", StringType(), nullable=True), - StructField("hwm_int", LongType(), nullable=True), - StructField("float_value", FloatType(), nullable=True), - ], - ) - - @pytest.fixture def kafka_schema(): from pyspark.sql.types import ( @@ -100,26 +77,7 @@ def kafka_schema_with_headers(): return schema # noqa: WPS331 -@pytest.fixture(name="kafka_processing") -def create_kafka_data(spark): - from tests.fixtures.processing.kafka import KafkaProcessing - - topic = secrets.token_hex(5) - proc = KafkaProcessing() - df = proc.create_spark_df(spark) - rows = [row.asDict() for row in df.collect()] - - for row_to_send in rows: - proc.send_message(topic, json.dumps(row_to_send).encode("utf-8")) - - yield topic, proc, df - # Release - proc.delete_topic([topic]) - - -def test_kafka_reader(spark, kafka_processing, schema): - topic, processing, expected_df = kafka_processing - +def 
test_kafka_reader(spark, processing, kafka_dataframe_schema, kafka_topic):
     kafka = Kafka(
         spark=spark,
         addresses=[f"{processing.host}:{processing.port}"],
@@ -128,16 +86,20 @@ def test_kafka_reader(spark, kafka_processing, schema):

     reader = DBReader(
         connection=kafka,
-        source=topic,
+        source=kafka_topic,
     )

-    df = reader.run()
-    processing.assert_equal_df(processing.json_deserialize(df, df_schema=schema), other_frame=expected_df)
+    first_span = processing.create_pandas_df(min_id=0, max_id=100)
+    processing.insert_pandas_df_into_topic(first_span, kafka_topic)

+    df = reader.run()
+    processing.assert_equal_df(
+        processing.json_deserialize(df, df_schema=kafka_dataframe_schema),
+        other_frame=first_span,
+    )

-def test_kafka_reader_columns_and_types_without_headers(spark, kafka_processing, kafka_schema):
-    topic, processing, _ = kafka_processing

+def test_kafka_reader_columns_and_types_without_headers(spark, processing, kafka_schema, kafka_topic):
     kafka = Kafka(
         spark=spark,
         addresses=[f"{processing.host}:{processing.port}"],
@@ -146,20 +108,21 @@ def test_kafka_reader_columns_and_types_without_headers(spark, kafka_processing,

     reader = DBReader(
         connection=kafka,
-        source=topic,
+        source=kafka_topic,
     )

+    first_span = processing.create_pandas_df(min_id=0, max_id=100)
+    processing.insert_pandas_df_into_topic(first_span, kafka_topic)
+
     df = reader.run()

     assert df.schema == kafka_schema  # headers aren't included in schema if includeHeaders=False


-def test_kafka_reader_columns_and_types_with_headers(spark, kafka_processing, kafka_schema_with_headers):
+def test_kafka_reader_columns_and_types_with_headers(spark, processing, kafka_schema_with_headers, kafka_topic):
     if get_spark_version(spark).major < 3:
         pytest.skip("Spark 3.x or later is required to write/read 'headers' from Kafka messages")

-    topic, processing, _ = kafka_processing
-
     kafka = Kafka(
         spark=spark,
         addresses=[f"{processing.host}:{processing.port}"],
@@ -169,18 +132,19 @@ def test_kafka_reader_columns_and_types_with_headers(spark, kafka_processing, ka
     # Check that the DataFrame also has a "headers" column when includeHeaders=True
     reader = DBReader(
         connection=kafka,
-        source=topic,
+        source=kafka_topic,
         options=Kafka.ReadOptions(includeHeaders=True),
     )

+    first_span = processing.create_pandas_df(min_id=0, max_id=100)
+    processing.insert_pandas_df_into_topic(first_span, kafka_topic)
+
     df = reader.run()

     assert df.schema == kafka_schema_with_headers


-def test_kafka_reader_topic_does_not_exist(spark, kafka_processing):
-    _, processing, _ = kafka_processing
-
+def test_kafka_reader_topic_does_not_exist(spark, processing):
     kafka = Kafka(
         spark=spark,
         addresses=[f"{processing.host}:{processing.port}"],
@@ -197,11 +161,12 @@ def test_kafka_reader_topic_does_not_exist(spark, kafka_processing):


 @pytest.mark.parametrize("group_id_option", ["group.id", "groupIdPrefix"])
-def test_kafka_reader_with_group_id(group_id_option, spark, kafka_processing, schema):
+def test_kafka_reader_with_group_id(group_id_option, spark, processing, kafka_dataframe_schema, kafka_topic):
     if get_spark_version(spark).major < 3:
         pytest.skip("Spark 3.x or later is required to pass group.id")

-    topic, processing, expected_df = kafka_processing
+    first_span = processing.create_pandas_df(min_id=0, max_id=100)
+    processing.insert_pandas_df_into_topic(first_span, kafka_topic)

     kafka = Kafka(
         spark=spark,
@@ -212,12 +177,81 @@ def test_kafka_reader_with_group_id(group_id_option, spark, kafka_processing, sc

     reader = DBReader(
         connection=kafka,
-        source=topic,
+        source=kafka_topic,
     )

     df 
= reader.run() - processing.assert_equal_df(processing.json_deserialize(df, df_schema=schema), other_frame=expected_df) + processing.assert_equal_df( + processing.json_deserialize(df, df_schema=kafka_dataframe_schema), + other_frame=first_span, + ) # Spark does not report to Kafka which messages were read, so Kafka does not remember latest offsets for groupId # https://stackoverflow.com/a/64003569 df = reader.run() - processing.assert_equal_df(processing.json_deserialize(df, df_schema=schema), other_frame=expected_df) + processing.assert_equal_df( + processing.json_deserialize(df, df_schema=kafka_dataframe_schema), + other_frame=first_span, + ) + + +def test_kafka_reader_snapshot_nothing_to_read(spark, processing, kafka_dataframe_schema, kafka_topic): + kafka = Kafka( + spark=spark, + addresses=[f"{processing.host}:{processing.port}"], + cluster="cluster", + ) + + reader = DBReader( + connection=kafka, + source=kafka_topic, + ) + + # 0..100 + first_span_begin = 0 + first_span_end = 100 + + # 110..210 + second_span_begin = 110 + second_span_end = 210 + + first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) + second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end) + + with pytest.raises(Exception, match="No data in the source:"): + reader.raise_if_no_data() + + assert not reader.has_data() + + # no data yet, nothing to read + df = reader.run() + assert not df.count() + + # insert first span + processing.insert_pandas_df_into_topic(first_span, kafka_topic) + + # .run() is not called, but dataframes are lazy, so it now contains all data from the source + deserialized_df = processing.json_deserialize(df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_df, other_frame=first_span, order_by="id_int") + + # check that read df has data + assert reader.has_data() + + # read data explicitly + df = reader.run() + + deserialized_df = processing.json_deserialize(df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_df, other_frame=first_span, order_by="id_int") + + # insert second span + processing.insert_pandas_df_into_topic(second_span, kafka_topic) + total_span = pandas.concat([first_span, second_span], ignore_index=True) + + # .run() is not called, but dataframes are lazy, so it now contains all data from the source + deserialized_df = processing.json_deserialize(df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_df, other_frame=total_span, order_by="id_int") + + # read data explicitly + df = reader.run() + + deserialized_df = processing.json_deserialize(df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_df, other_frame=total_span, order_by="id_int") diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mongodb_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mongodb_reader_integration.py index 5071270d2..b06527a0b 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mongodb_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mongodb_reader_integration.py @@ -160,6 +160,11 @@ def test_mongodb_reader_snapshot_nothing_to_read(spark, processing, prepare_sche first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) second_span = processing.create_pandas_df(min_id=second_span_begin, 
max_id=second_span_end) + with pytest.raises(Exception, match="No data in the source:"): + reader.raise_if_no_data() + + assert not reader.has_data() + # no data yet, nothing to read df = reader.run() assert not df.count() @@ -174,8 +179,12 @@ def test_mongodb_reader_snapshot_nothing_to_read(spark, processing, prepare_sche # .run() is not called, but dataframes are lazy, so it now contains all data from the source processing.assert_equal_df(df=df, other_frame=first_span, order_by="_id") + # check that read df has data + assert reader.has_data() + # read data explicitly df = reader.run() + processing.assert_equal_df(df=df, other_frame=first_span, order_by="_id") # insert second span diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mssql_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mssql_reader_integration.py index 5409b85e9..781121e4b 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mssql_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mssql_reader_integration.py @@ -315,6 +315,11 @@ def test_mssql_reader_snapshot_nothing_to_read(spark, processing, prepare_schema first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end) + with pytest.raises(Exception, match="No data in the source:"): + reader.raise_if_no_data() + + assert not reader.has_data() + # no data yet, nothing to read df = reader.run() assert not df.count() @@ -329,8 +334,12 @@ def test_mssql_reader_snapshot_nothing_to_read(spark, processing, prepare_schema # .run() is not called, but dataframes are lazy, so it now contains all data from the source processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") + # check that read df has data + assert reader.has_data() + # read data explicitly df = reader.run() + processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") # insert second span diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mysql_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mysql_reader_integration.py index 65b922732..3f12746b8 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mysql_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mysql_reader_integration.py @@ -331,6 +331,11 @@ def test_mysql_reader_snapshot_nothing_to_read(spark, processing, prepare_schema first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end) + with pytest.raises(Exception, match="No data in the source:"): + reader.raise_if_no_data() + + assert not reader.has_data() + # no data yet, nothing to read df = reader.run() assert not df.count() @@ -345,8 +350,12 @@ def test_mysql_reader_snapshot_nothing_to_read(spark, processing, prepare_schema # .run() is not called, but dataframes are lazy, so it now contains all data from the source processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") + # check that read df has data + assert reader.has_data() + # read data explicitly df = reader.run() + processing.assert_equal_df(df=df, other_frame=first_span, 
order_by="id_int") # insert second span diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_oracle_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_oracle_reader_integration.py index 126420c61..d9864967c 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_oracle_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_oracle_reader_integration.py @@ -315,6 +315,11 @@ def test_oracle_reader_snapshot_nothing_to_read(spark, processing, prepare_schem first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end) + with pytest.raises(Exception, match="No data in the source:"): + reader.raise_if_no_data() + + assert not reader.has_data() + # no data yet, nothing to read df = reader.run() assert not df.count() @@ -329,8 +334,12 @@ def test_oracle_reader_snapshot_nothing_to_read(spark, processing, prepare_schem # .run() is not called, but dataframes are lazy, so it now contains all data from the source processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") + # check that read df has data + assert reader.has_data() + # read data explicitly df = reader.run() + processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") # insert second span diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_postgres_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_postgres_reader_integration.py index bbb2ee472..248a575b9 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_postgres_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_postgres_reader_integration.py @@ -377,6 +377,11 @@ def test_postgres_reader_snapshot_nothing_to_read(spark, processing, prepare_sch first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end) + with pytest.raises(Exception, match="No data in the source:"): + reader.raise_if_no_data() + + assert not reader.has_data() + # no data yet, nothing to read df = reader.run() assert not df.count() @@ -391,8 +396,12 @@ def test_postgres_reader_snapshot_nothing_to_read(spark, processing, prepare_sch # .run() is not called, but dataframes are lazy, so it now contains all data from the source processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") + # check that read df has data + assert reader.has_data() + # read data explicitly df = reader.run() + processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") # insert second span diff --git a/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py b/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py index a424282d3..4514594c5 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py @@ -246,6 +246,7 @@ def table_finalizer(): assert not greenplum.fetch(f"SELECT * FROM {temp_table}{suffix}").count() +@pytest.mark.xfail(reason="Greenplum prior to 7.x does not support 
procedures") @pytest.mark.parametrize("suffix", ["", ";"]) def test_greenplum_connection_execute_procedure( request, @@ -267,7 +268,50 @@ def test_greenplum_connection_execute_procedure( table = load_table_data.full_name proc = f"{load_table_data.table}_proc" - # Greenplum does not support procedures + assert not greenplum.execute( + f""" + CREATE PROCEDURE {proc} () + LANGUAGE SQL + AS $$ + SELECT COUNT(*) FROM {table}; + $${suffix} + """, + ) + + def proc_finalizer(): + greenplum.execute(f"DROP PROCEDURE {proc}") + + request.addfinalizer(proc_finalizer) + + assert not greenplum.execute(f"CALL {proc}(){suffix}") + + # wrong syntax + with pytest.raises(Exception): + greenplum.execute(f"CALL {proc}{suffix}") + + # EXECUTE is supported only for prepared statements + with pytest.raises(Exception): + greenplum.execute(f"EXECUTE {proc}{suffix}") + + with pytest.raises(Exception): + greenplum.execute(f"EXECUTE {proc}(){suffix}") + + # syntax proposed by https://docs.oracle.com/javase/8/docs/api/java/sql/CallableStatement.html + # supported only for functions + with pytest.raises(Exception): + greenplum.execute(f"{{call {proc}}}") + + with pytest.raises(Exception): + greenplum.execute(f"{{call {proc}()}}") + + # not supported by greenplum + with pytest.raises(Exception): + greenplum.execute(f"{{?= call {proc}}}") + + with pytest.raises(Exception): + greenplum.execute(f"{{?= call {proc}()}}") + + # already exists with pytest.raises(Exception): greenplum.execute( f""" @@ -279,6 +323,235 @@ def test_greenplum_connection_execute_procedure( """, ) + # recreate + assert not greenplum.execute( + f""" + CREATE OR REPLACE PROCEDURE {proc} () + LANGUAGE SQL + AS $$ + SELECT COUNT(*) FROM {table}; + $${suffix} + """, + ) + + with pytest.raises(Exception): + greenplum.execute("CALL MissingProcedure") + + with pytest.raises(Exception): + greenplum.execute("CALL MissingProcedure()") + + with pytest.raises(Exception): + greenplum.execute("DROP PROCEDURE MissingProcedure") + + # missing semicolon in body + with pytest.raises(Exception): + greenplum.execute( + f""" + CREATE PROCEDURE {proc} () + LANGUAGE SQL + AS $$ + SELECT COUNT(*) FROM {table} + $$ + """, + ) + + +@pytest.mark.xfail(reason="Greenplum prior to 7.x does not support procedures") +@pytest.mark.parametrize("suffix", ["", ";"]) +def test_greenplum_connection_execute_procedure_arguments( + request, + spark, + processing, + load_table_data, + suffix, +): + greenplum = Greenplum( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra=processing.extra, + ) + + table = load_table_data.full_name + proc = f"{load_table_data.table}_proc" + + assert not greenplum.execute( + f""" + CREATE PROCEDURE {proc} (idd int) + LANGUAGE SQL + AS $$ + SELECT COUNT(*) FROM {table} + WHERE id_int = idd; + $${suffix} + """, + ) + + def proc_finalizer(): + greenplum.execute(f"DROP PROCEDURE {proc}") + + request.addfinalizer(proc_finalizer) + + assert not greenplum.execute(f"CALL {proc}(10){suffix}") + + # not enough options + with pytest.raises(Exception): + greenplum.execute(f"CALL {proc}{suffix}") + + with pytest.raises(Exception): + greenplum.execute(f"CALL {proc}(){suffix}") + + # too many options + with pytest.raises(Exception): + greenplum.execute(f"CALL {proc}(10, 1){suffix}") + + +@pytest.mark.xfail(reason="Greenplum prior to 7.x does not support procedures") +@pytest.mark.parametrize("suffix", ["", ";"]) +def test_greenplum_connection_execute_procedure_inout( + 
request, + spark, + processing, + load_table_data, + suffix, +): + greenplum = Greenplum( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra=processing.extra, + ) + + table = load_table_data.full_name + proc = f"{load_table_data.table}_proc_inout" + + table_df = processing.get_expected_dataframe( + schema=load_table_data.schema, + table=load_table_data.table, + order_by="id_int", + ) + + assert not greenplum.execute( + f""" + CREATE PROCEDURE {proc} (IN idd int, INOUT result int) + LANGUAGE PLPGSQL + AS $$ + BEGIN + SELECT COUNT(*) INTO result FROM {table} + WHERE id_int < idd; + END + $${suffix} + """, + ) + + def proc_finalizer(): + greenplum.execute(f"DROP PROCEDURE {proc}{suffix}") + + request.addfinalizer(proc_finalizer) + + df = greenplum.execute(f"CALL {proc}(10, 1){suffix}") + matching_df = table_df[table_df.id_int < 10] + result_df = pandas.DataFrame([[len(matching_df)]], columns=["result"]) + processing.assert_equal_df(df=df, other_frame=result_df) + + # option 1 value is missing + # greenplum does not support OUT arguments + with pytest.raises(Exception): + greenplum.execute(f"CALL {proc}(10, ?){suffix}") + + +@pytest.mark.xfail(reason="Greenplum prior to 7.x does not support procedures") +@pytest.mark.parametrize("suffix", ["", ";"]) +def test_greenplum_connection_execute_procedure_ddl( + request, + spark, + processing, + get_schema_table, + suffix, +): + greenplum = Greenplum( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra=processing.extra, + ) + + table = get_schema_table.full_name + proc = f"{table}_ddl" + + assert not greenplum.execute( + f""" + CREATE PROCEDURE {proc} () + LANGUAGE SQL + AS $$ + CREATE TABLE {table} (iid INT, text VARCHAR(400)); + $${suffix} + """, + ) + + def proc_finalizer(): + greenplum.execute(f"DROP PROCEDURE {proc}") + + request.addfinalizer(proc_finalizer) + + assert not greenplum.execute(f"CALL {proc}()") + assert not greenplum.execute(f"DROP TABLE {table}") + + +@pytest.mark.xfail(reason="Greenplum prior to 7.x does not support procedures") +@pytest.mark.parametrize("suffix", ["", ";"]) +def test_greenplum_connection_execute_procedure_dml( + request, + spark, + processing, + get_schema_table, + suffix, +): + greenplum = Greenplum( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra=processing.extra, + ) + + table = get_schema_table.full_name + proc = f"{table}_dml" + + assert not greenplum.execute(f"CREATE TABLE {table} (iid INT, text VARCHAR(400)){suffix}") + + def table_finalizer(): + greenplum.execute(f"DROP TABLE {table}") + + request.addfinalizer(table_finalizer) + + assert not greenplum.execute( + f""" + CREATE PROCEDURE {proc} (idd int, text VARCHAR) + LANGUAGE SQL + AS $$ + INSERT INTO {table} VALUES(idd, text); + $${suffix} + """, + ) + + def proc_finalizer(): + greenplum.execute(f"DROP PROCEDURE {proc}") + + request.addfinalizer(proc_finalizer) + + assert not greenplum.execute(f"CALL {proc}(1, 'abc')") + @pytest.mark.parametrize("suffix", ["", ";"]) def test_greenplum_connection_execute_function( diff --git a/tests/tests_integration/tests_strategy_integration/test_strategy_incremental_batch.py b/tests/tests_integration/tests_strategy_integration/test_strategy_incremental_batch.py index 5371e8e27..66c7ad31a 100644 --- 
a/tests/tests_integration/tests_strategy_integration/test_strategy_incremental_batch.py
+++ b/tests/tests_integration/tests_strategy_integration/test_strategy_incremental_batch.py
@@ -944,3 +944,27 @@ def test_postgres_strategy_incremental_batch_nothing_to_read(spark, processing,
     processing.assert_equal_df(df=df, other_frame=second_span, order_by="id_int")
     hwm = store.get_hwm(name=hwm_name)
     assert hwm.value == second_span_max
+
+
+def test_postgres_has_data_outside_incremental_strategy(spark, processing, prepare_schema_table):
+    postgres = Postgres(
+        host=processing.host,
+        user=processing.user,
+        password=processing.password,
+        database=processing.database,
+        spark=spark,
+    )
+
+    reader = DBReader(
+        connection=postgres,
+        source=prepare_schema_table.full_name,
+        hwm=ColumnIntHWM(name=secrets.token_hex(5), expression="text_string"),
+    )
+
+    with pytest.raises(
+        RuntimeError,
+        match=re.escape(
+            "Check documentation DBReader.has_data(): ",
+        ),
+    ):
+        reader.has_data()
diff --git a/tests/tests_integration/tests_strategy_integration/test_strategy_snapshot_batch.py b/tests/tests_integration/tests_strategy_integration/test_strategy_snapshot_batch.py
index e1984ab05..4c8121dbc 100644
--- a/tests/tests_integration/tests_strategy_integration/test_strategy_snapshot_batch.py
+++ b/tests/tests_integration/tests_strategy_integration/test_strategy_snapshot_batch.py
@@ -696,3 +696,27 @@ def test_postgres_strategy_snapshot_batch_nothing_to_read(spark, processing, pre
     total_span = pandas.concat([first_span, second_span], ignore_index=True)
     processing.assert_equal_df(df=df, other_frame=total_span, order_by="id_int")
+
+
+def test_postgres_has_data_outside_snapshot_batch_strategy(spark, processing, prepare_schema_table):
+    postgres = Postgres(
+        host=processing.host,
+        user=processing.user,
+        password=processing.password,
+        database=processing.database,
+        spark=spark,
+    )
+
+    reader = DBReader(
+        connection=postgres,
+        source=prepare_schema_table.full_name,
+        hwm=ColumnIntHWM(name=secrets.token_hex(5), expression="text_string"),
+    )
+
+    with pytest.raises(
+        RuntimeError,
+        match=re.escape(
+            "Check documentation DBReader.has_data(): ",
+        ),
+    ):
+        reader.has_data()
diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_batch_strategy_integration/test_strategy_incremental_batch_kafka.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_batch_strategy_integration/test_strategy_incremental_batch_kafka.py
index aad831c01..634f7bbf1 100644
--- a/tests/tests_integration/tests_strategy_integration/tests_incremental_batch_strategy_integration/test_strategy_incremental_batch_kafka.py
+++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_batch_strategy_integration/test_strategy_incremental_batch_kafka.py
@@ -21,7 +21,7 @@ def test_strategy_kafka_with_batch_strategy_error(strategy, spark):
     processing = KafkaProcessing()

-    with strategy(step=10):
+    with strategy(step=10) as batches:
         reader = DBReader(
             connection=Kafka(
                 addresses=[f"{processing.host}:{processing.port}"],
@@ -31,5 +31,10 @@
             table="topic",
             hwm=DBReader.AutoDetectHWM(name=secrets.token_hex(5), expression="offset"),
         )
-        with pytest.raises(RuntimeError):
-            reader.run()
+        # raises because there is currently no way to distribute the step size between Kafka partitions
+        with pytest.raises(
+            RuntimeError,
+            match=r"HWM: .* cannot be used with Batch strategies",
+        ):
+            for _ in batches:
+                reader.run()
diff --git
a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py index 9a628c3e6..77ec071b3 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py @@ -145,6 +145,7 @@ def test_clickhouse_strategy_incremental_nothing_to_read(spark, processing, prep # no data yet, nothing to read with IncrementalStrategy(): + assert not reader.has_data() df = reader.run() assert not df.count() @@ -165,6 +166,7 @@ def test_clickhouse_strategy_incremental_nothing_to_read(spark, processing, prep # set hwm value to 50 with IncrementalStrategy(): + assert reader.has_data() df = reader.run() processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_greenplum.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_greenplum.py index 7a2be6b68..8159bca25 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_greenplum.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_greenplum.py @@ -152,6 +152,7 @@ def test_greenplum_strategy_incremental_nothing_to_read(spark, processing, prepa # no data yet, nothing to read with IncrementalStrategy(): + assert not reader.has_data() df = reader.run() assert not df.count() @@ -172,6 +173,7 @@ def test_greenplum_strategy_incremental_nothing_to_read(spark, processing, prepa # set hwm value to 50 with IncrementalStrategy(): + assert reader.has_data() df = reader.run() processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_hive.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_hive.py index c576a5d8c..6cc860f56 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_hive.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_hive.py @@ -136,6 +136,7 @@ def test_hive_strategy_incremental_nothing_to_read(spark, processing, prepare_sc # no data yet, nothing to read with IncrementalStrategy(): + assert not reader.has_data() df = reader.run() assert not df.count() @@ -156,6 +157,7 @@ def test_hive_strategy_incremental_nothing_to_read(spark, processing, prepare_sc # set hwm value to 50 with IncrementalStrategy(): + assert reader.has_data() df = reader.run() processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py new file mode 100644 index 000000000..1ba96cee1 --- /dev/null +++ 
b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py @@ -0,0 +1,287 @@ +import secrets + +import pytest +from etl_entities.hwm import KeyValueIntHWM +from etl_entities.hwm_store import HWMStoreStackManager + +from onetl.connection import Kafka +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +pytestmark = pytest.mark.kafka + + +@pytest.mark.parametrize( + "num_partitions", + [ + None, # default number of partitions is 1 + 5, + 10, + ], +) +def test_kafka_strategy_incremental( + spark, + processing, + kafka_dataframe_schema, + kafka_topic, + num_partitions, +): + from pyspark.sql.functions import count as spark_count + + hwm_type = KeyValueIntHWM + hwm_name = secrets.token_hex(5) + store = HWMStoreStackManager.get_current() + + kafka = Kafka( + addresses=[f"{processing.host}:{processing.port}"], + cluster="cluster", + spark=spark, + ) + + # change the number of partitions for the Kafka topic to test work for different partitioning cases + if num_partitions is not None: + processing.change_topic_partitions(kafka_topic, num_partitions) + + reader = DBReader( + connection=kafka, + source=kafka_topic, + hwm=DBReader.AutoDetectHWM(name=hwm_name, expression="offset"), + ) + + # there are 2 spans with a gap between + + # 0..100 + first_span_begin = 0 + first_span_end = 100 + + # 110..210 + second_span_begin = 110 + second_span_end = 210 + + first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) + second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end) + + # insert first span + processing.insert_pandas_df_into_topic(first_span, kafka_topic) + + # hwm is not in the store + assert store.get_hwm(hwm_name) is None + + # incremental run + with IncrementalStrategy(): + first_df = reader.run() + + hwm = store.get_hwm(hwm_name) + assert hwm is not None + assert isinstance(hwm, hwm_type) + + # check that HWM distribution of messages in partitions matches the distribution in sparkDF + partition_counts = first_df.groupBy("partition").agg(spark_count("*").alias("count")) + partition_count_dict_first_df = {row["partition"]: row["count"] for row in partition_counts.collect()} + assert hwm.value == partition_count_dict_first_df + + # all the data has been read + deserialized_first_df = processing.json_deserialize(first_df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_first_df, other_frame=first_span, order_by="id_int") + + # insert second span + processing.insert_pandas_df_into_topic(second_span, kafka_topic) + + with IncrementalStrategy(): + second_df = reader.run() + + hwm = store.get_hwm(hwm_name) + + # check that HWM distribution of messages in partitions matches the distribution in sparkDF + combined_df = first_df.union(second_df) + partition_counts_combined = combined_df.groupBy("partition").agg(spark_count("*").alias("count")) + partition_count_dict_combined = {row["partition"]: row["count"] for row in partition_counts_combined.collect()} + assert hwm.value == partition_count_dict_combined + + deserialized_second_df = processing.json_deserialize(second_df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_second_df, other_frame=second_span, order_by="id_int") + + +@pytest.mark.parametrize( + "num_partitions", + [ + None, # default number of partitions is 1 + 5, + 10, + ], +) +def test_kafka_strategy_incremental_nothing_to_read( + spark, + processing, + 
kafka_dataframe_schema, + num_partitions, + kafka_topic, +): + from pyspark.sql.functions import count as spark_count + + hwm_name = secrets.token_hex(5) + store = HWMStoreStackManager.get_current() + + kafka = Kafka( + addresses=[f"{processing.host}:{processing.port}"], + cluster="cluster", + spark=spark, + ) + + # change the number of partitions for the Kafka topic to test work for different partitioning cases + if num_partitions is not None: + processing.change_topic_partitions(kafka_topic, num_partitions) + + reader = DBReader( + connection=kafka, + source=kafka_topic, + hwm=DBReader.AutoDetectHWM(name=hwm_name, expression="offset"), + ) + + # 0..50 + first_span_begin = 0 + first_span_end = 50 + # 60..110 + second_span_begin = 60 + second_span_end = 110 + + first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end) + second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end) + + # no data yet, nothing to read + with IncrementalStrategy(): + assert not reader.has_data() + df = reader.run() + + assert not df.count() + hwm = store.get_hwm(name=hwm_name) + assert all(value == 0 for value in hwm.value.values()) + + # insert first span + processing.insert_pandas_df_into_topic(first_span, kafka_topic) + + # .run() is not called - dataframe still empty - HWM not updated + assert not df.count() + hwm = store.get_hwm(name=hwm_name) + assert all(value == 0 for value in hwm.value.values()) + + # set hwm value to 50 + with IncrementalStrategy(): + assert reader.has_data() + first_df = reader.run() + + hwm = store.get_hwm(name=hwm_name) + # check that HWM distribution of messages in partitions matches the distribution in sparkDF + partition_counts = first_df.groupBy("partition").agg(spark_count("*").alias("count")) + partition_count_dict_first_df = {row["partition"]: row["count"] for row in partition_counts.collect()} + assert hwm.value == partition_count_dict_first_df + + deserialized_df = processing.json_deserialize(first_df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_df, other_frame=first_span, order_by="id_int") + + # no new data yet, nothing to read + with IncrementalStrategy(): + df = reader.run() + + assert not df.count() + # HWM value is unchanged + hwm = store.get_hwm(name=hwm_name) + assert hwm.value == partition_count_dict_first_df + + # insert second span + processing.insert_pandas_df_into_topic(second_span, kafka_topic) + + # .run() is not called - dataframe still empty - HWM not updated + assert not df.count() + # HWM value is unchanged + hwm = store.get_hwm(name=hwm_name) + assert hwm.value == partition_count_dict_first_df + + # read data + with IncrementalStrategy(): + df = reader.run() + + hwm = store.get_hwm(name=hwm_name) + # check that HWM distribution of messages in partitions matches the distribution in sparkDF + combined_df = df.union(first_df) + partition_counts_combined = combined_df.groupBy("partition").agg(spark_count("*").alias("count")) + partition_count_dict_combined = {row["partition"]: row["count"] for row in partition_counts_combined.collect()} + assert hwm.value == partition_count_dict_combined + + deserialized_df = processing.json_deserialize(df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_df, other_frame=second_span, order_by="id_int") + + +@pytest.mark.parametrize( + "initial_partitions, additional_partitions", + [ + (3, 2), # starting with 3 partitions, adding 2 more + (5, 1), # starting with 5 partitions, adding 1 more + ], +) 
+def test_kafka_strategy_incremental_with_new_partition(
+    spark,
+    processing,
+    initial_partitions,
+    additional_partitions,
+    kafka_topic,
+):
+    from pyspark.sql.functions import count as spark_count
+
+    hwm_name = secrets.token_hex(5)
+    store = HWMStoreStackManager.get_current()
+
+    kafka = Kafka(
+        addresses=[f"{processing.host}:{processing.port}"],
+        cluster="cluster",
+        spark=spark,
+    )
+
+    reader = DBReader(
+        connection=kafka,
+        source=kafka_topic,
+        hwm=DBReader.AutoDetectHWM(name=hwm_name, expression="offset"),
+    )
+
+    # Initial setup with `initial_partitions` partitions
+    processing.change_topic_partitions(kafka_topic, initial_partitions)
+
+    # 0..100
+    first_span_begin = 0
+    first_span_end = 100
+
+    # 60..110
+    second_span_begin = 60
+    second_span_end = 110
+
+    first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end)
+    second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end)
+
+    processing.insert_pandas_df_into_topic(first_span, kafka_topic)
+    with IncrementalStrategy():
+        first_df = reader.run()
+
+    # it is crucial to cache the dataframe right after reading: if the number of partitions is altered
+    # before any subsequent operation, Spark fails to run it with
+    # Caused by: java.lang.AssertionError: assertion failed: If startingOffsets contains specific offsets, you must specify all TopicPartitions.
+    # Use -1 for latest, -2 for earliest.
+    # Specified: Set(topic1, topic2) Assigned: Set(topic1, topic2, additional_topic3, additional_topic4)
+    first_df.cache()
+
+    hwm = store.get_hwm(name=hwm_name)
+    first_run_hwm_keys_num = len(hwm.value.keys())
+
+    processing.change_topic_partitions(kafka_topic, initial_partitions + additional_partitions)
+    processing.insert_pandas_df_into_topic(second_span, kafka_topic)
+
+    with IncrementalStrategy():
+        second_df = reader.run()
+
+    hwm = store.get_hwm(name=hwm_name)
+    second_run_hwm_keys_num = len(hwm.value)
+    assert first_run_hwm_keys_num + additional_partitions == second_run_hwm_keys_num
+
+    # check that HWM distribution of messages in partitions matches the distribution in sparkDF
+    combined_df = second_df.union(first_df)
+    partition_counts_combined = combined_df.groupBy("partition").agg(spark_count("*").alias("count"))
+    partition_count_dict_combined = {row["partition"]: row["count"] for row in partition_counts_combined.collect()}
+    assert hwm.value == partition_count_dict_combined
diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mongodb.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mongodb.py
index 888300a2d..3e8287257 100644
--- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mongodb.py
+++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mongodb.py
@@ -181,6 +181,7 @@ def test_mongodb_strategy_incremental_nothing_to_read(
     # no data yet, nothing to read
     with IncrementalStrategy():
+        assert not reader.has_data()
         df = reader.run()

     assert not df.count()
@@ -201,6 +202,7 @@ def test_mongodb_strategy_incremental_nothing_to_read(
     # set hwm value to 50
     with IncrementalStrategy():
+        assert reader.has_data()
         df = reader.run()

     processing.assert_equal_df(df=df, other_frame=first_span, order_by="_id")
diff --git
a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mssql.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mssql.py index 90447d3b3..a03db2759 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mssql.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mssql.py @@ -152,6 +152,7 @@ def test_mssql_strategy_incremental_nothing_to_read(spark, processing, prepare_s # no data yet, nothing to read with IncrementalStrategy(): + assert not reader.has_data() df = reader.run() assert not df.count() @@ -172,6 +173,7 @@ def test_mssql_strategy_incremental_nothing_to_read(spark, processing, prepare_s # set hwm value to 50 with IncrementalStrategy(): + assert reader.has_data() df = reader.run() processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mysql.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mysql.py index 29bba214e..e762eb0da 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mysql.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mysql.py @@ -150,6 +150,7 @@ def test_mysql_strategy_incremental_nothing_to_read(spark, processing, prepare_s # no data yet, nothing to read with IncrementalStrategy(): + assert not reader.has_data() df = reader.run() assert not df.count() @@ -170,6 +171,7 @@ def test_mysql_strategy_incremental_nothing_to_read(spark, processing, prepare_s # set hwm value to 50 with IncrementalStrategy(): + assert reader.has_data() df = reader.run() processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_oracle.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_oracle.py index 3e07546f4..2bb9165e6 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_oracle.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_oracle.py @@ -165,6 +165,7 @@ def test_oracle_strategy_incremental_nothing_to_read(spark, processing, prepare_ # no data yet, nothing to read with IncrementalStrategy(): + assert not reader.has_data() df = reader.run() assert not df.count() @@ -185,6 +186,7 @@ def test_oracle_strategy_incremental_nothing_to_read(spark, processing, prepare_ # set hwm value to 50 with IncrementalStrategy(): + assert reader.has_data() df = reader.run() processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_postgres.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_postgres.py index 424949751..8d5f73705 100644 --- 
a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_postgres.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_postgres.py @@ -252,6 +252,7 @@ def test_postgres_strategy_incremental_nothing_to_read(spark, processing, prepar # no data yet, nothing to read with IncrementalStrategy(): + assert not reader.has_data() df = reader.run() assert not df.count() @@ -272,6 +273,7 @@ def test_postgres_strategy_incremental_nothing_to_read(spark, processing, prepar # set hwm value to 50 with IncrementalStrategy(): + assert reader.has_data() df = reader.run() processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int") diff --git a/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py b/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py index eb4f68c68..a21fb1daa 100644 --- a/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py +++ b/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py @@ -1,7 +1,9 @@ import re +import secrets import textwrap import pytest +from frozendict import frozendict from onetl.connection import Hive from onetl.db import DBReader @@ -158,3 +160,29 @@ def test_reader_no_hwm_expression(spark_mock): table="schema.table", hwm=DBReader.AutoDetectHWM(name="some_name"), ) + + +@pytest.mark.parametrize( + "alias_key, alias_value", + [ + ("source", "test_source"), + ("topic", "test_topic"), + ("entity", "test_entity"), + ], +) +def test_auto_detect_hwm_handle_aliases(alias_key, alias_value): + values = {alias_key: alias_value} + auto_detect_hwm = DBReader.AutoDetectHWM(name=secrets.token_hex(6), **values) + assert auto_detect_hwm.entity == alias_value + + +@pytest.mark.parametrize( + "value", + [None, 123, "test_string", frozendict({1: 100})], +) +def test_auto_detect_hwm_dict_without_value_field(value): + hwm = DBReader.AutoDetectHWM(name=secrets.token_hex(6)) + object.__setattr__(hwm, "value", value) + serialized_data = hwm.dict() + + assert "value" not in serialized_data diff --git a/tests/tests_unit/test_db/test_db_reader_unit/test_kafka_reader_unit.py b/tests/tests_unit/test_db/test_db_reader_unit/test_kafka_reader_unit.py index 3506a415f..3ff5c9502 100644 --- a/tests/tests_unit/test_db/test_db_reader_unit/test_kafka_reader_unit.py +++ b/tests/tests_unit/test_db/test_db_reader_unit/test_kafka_reader_unit.py @@ -82,29 +82,36 @@ def test_kafka_reader_hwm_offset_is_valid(spark_mock): ) -def test_kafka_reader_hwm_timestamp_depends_on_spark_version(spark_mock, mocker): +@pytest.mark.parametrize( + "hwm_expression", + ["unknown", "timestamp"], +) +def test_kafka_reader_invalid_hwm_column(spark_mock, hwm_expression): kafka = Kafka( addresses=["localhost:9092"], cluster="my_cluster", spark=spark_mock, ) - mocker.patch.object(spark_mock, "version", new="3.2.0") - DBReader( - connection=kafka, - table="table", - hwm=DBReader.AutoDetectHWM(name=secrets.token_hex(5), expression="timestamp"), - ) - mocker.patch.object(spark_mock, "version", new="2.4.0") - with pytest.raises(ValueError, match="Spark version must be 3.x"): + with pytest.raises( + ValueError, + match=f"hwm.expression='{hwm_expression}' is not supported by Kafka", + ): DBReader( connection=kafka, table="table", - hwm=DBReader.AutoDetectHWM(name=secrets.token_hex(5), expression="timestamp"), + hwm=DBReader.AutoDetectHWM(name=secrets.token_hex(5), expression=hwm_expression), ) -def 
test_kafka_reader_invalid_hwm_column(spark_mock): +@pytest.mark.parametrize( + "topic, error_message", + [ + ("*", r"source/target=\* is not supported by Kafka. Provide a singular topic."), + ("topic1, topic2", "source/target=topic1, topic2 is not supported by Kafka. Provide a singular topic."), + ], +) +def test_kafka_reader_invalid_source(spark_mock, topic, error_message): kafka = Kafka( addresses=["localhost:9092"], cluster="my_cluster", @@ -113,10 +120,10 @@ def test_kafka_reader_invalid_hwm_column(spark_mock): with pytest.raises( ValueError, - match="hwm.expression='unknown' is not supported by Kafka", + match=error_message, ): DBReader( connection=kafka, - table="table", - hwm=DBReader.AutoDetectHWM(name=secrets.token_hex(5), expression="unknown"), + table=topic, + hwm=DBReader.AutoDetectHWM(name=secrets.token_hex(5), expression="offset"), ) diff --git a/tests/tests_unit/tests_db_connection_unit/test_dialect_unit.py b/tests/tests_unit/tests_db_connection_unit/test_dialect_unit.py index 8faed9256..7dc7ac7d2 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_dialect_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_dialect_unit.py @@ -2,7 +2,7 @@ import pytest -from onetl.connection import Oracle, Postgres +from onetl.connection import MSSQL, Oracle, Postgres pytestmark = [pytest.mark.postgres] @@ -262,3 +262,41 @@ def test_db_dialect_get_sql_query_compact_true(spark_mock): ).strip() assert result == expected + + +@pytest.mark.parametrize( + "limit, where, expected_query", + [ + (None, None, "SELECT\n *\nFROM\n default.test"), + (0, None, "SELECT\n *\nFROM\n default.test\nWHERE\n 1=0"), + (5, None, "SELECT\n *\nFROM\n default.test\nWHERE\n ROWNUM <= 5"), + (None, "column1 = 'value'", "SELECT\n *\nFROM\n default.test\nWHERE\n column1 = 'value'"), + (0, "column1 = 'value'", "SELECT\n *\nFROM\n default.test\nWHERE\n 1=0"), + ( + 5, + "column1 = 'value'", + "SELECT\n *\nFROM\n default.test\nWHERE\n (column1 = 'value')\n AND\n (ROWNUM <= 5)", + ), + ], +) +def test_oracle_dialect_get_sql_query_limit_where(spark_mock, limit, where, expected_query): + conn = Oracle(host="some_host", user="user", sid="XE", password="passwd", spark=spark_mock) + result = conn.dialect.get_sql_query(table="default.test", limit=limit, where=where) + assert result.strip() == expected_query.strip() + + +@pytest.mark.parametrize( + "limit, where, expected_query", + [ + (None, None, "SELECT\n *\nFROM\n default.test"), + (0, None, "SELECT\n *\nFROM\n default.test\nWHERE\n 1 = 0"), + (5, None, "SELECT TOP 5\n *\nFROM\n default.test"), + (None, "column1 = 'value'", "SELECT\n *\nFROM\n default.test\nWHERE\n column1 = 'value'"), + (0, "column1 = 'value'", "SELECT\n *\nFROM\n default.test\nWHERE\n 1 = 0"), + (5, "column1 = 'value'", "SELECT TOP 5\n *\nFROM\n default.test\nWHERE\n column1 = 'value'"), + ], +) +def test_mssql_dialect_get_sql_query_limit_where(spark_mock, limit, where, expected_query): + conn = MSSQL(host="some_host", user="user", database="database", password="passwd", spark=spark_mock) + result = conn.dialect.get_sql_query(table="default.test", limit=limit, where=where) + assert result.strip() == expected_query.strip() diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index 276a3c892..71308e2f9 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -15,9 +15,9 @@ def test_greenplum_driver(): def 
test_greenplum_package(): warning_msg = re.escape("will be removed in 1.0.0, use `Greenplum.get_packages(spark_version=") with pytest.warns(UserWarning, match=warning_msg): - assert Greenplum.package_spark_2_3 == "io.pivotal:greenplum-spark_2.11:2.1.4" - assert Greenplum.package_spark_2_4 == "io.pivotal:greenplum-spark_2.11:2.1.4" - assert Greenplum.package_spark_3_2 == "io.pivotal:greenplum-spark_2.12:2.1.4" + assert Greenplum.package_spark_2_3 == "io.pivotal:greenplum-spark_2.11:2.3.0" + assert Greenplum.package_spark_2_4 == "io.pivotal:greenplum-spark_2.11:2.3.0" + assert Greenplum.package_spark_3_2 == "io.pivotal:greenplum-spark_2.12:2.3.0" def test_greenplum_get_packages_no_input(): @@ -56,21 +56,33 @@ def test_greenplum_get_packages_scala_version_not_supported(scala_version): "spark_version, scala_version, package", [ # use Scala version directly - (None, "2.11", "io.pivotal:greenplum-spark_2.11:2.1.4"), - (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.1.4"), + (None, "2.11", "io.pivotal:greenplum-spark_2.11:2.3.0"), + (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), # Detect Scala version by Spark version - ("2.3", None, "io.pivotal:greenplum-spark_2.11:2.1.4"), - ("2.4", None, "io.pivotal:greenplum-spark_2.11:2.1.4"), - ("3.2", None, "io.pivotal:greenplum-spark_2.12:2.1.4"), + ("2.3", None, "io.pivotal:greenplum-spark_2.11:2.3.0"), + ("2.4", None, "io.pivotal:greenplum-spark_2.11:2.3.0"), + ("3.2", None, "io.pivotal:greenplum-spark_2.12:2.3.0"), # Override Scala version detected automatically - ("2.3", "2.11", "io.pivotal:greenplum-spark_2.11:2.1.4"), - ("2.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.1.4"), + ("2.3", "2.11", "io.pivotal:greenplum-spark_2.11:2.3.0"), + ("2.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), ], ) def test_greenplum_get_packages(spark_version, scala_version, package): assert Greenplum.get_packages(spark_version=spark_version, scala_version=scala_version) == [package] +@pytest.mark.parametrize( + "package_version, scala_version, package", + [ + (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), + ("2.3.0", "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), + ("2.1.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.1.4"), + ], +) +def test_greenplum_get_packages_explicit_version(package_version, scala_version, package): + assert Greenplum.get_packages(package_version=package_version, scala_version=scala_version) == [package] + + def test_greenplum_missing_package(spark_no_packages): msg = "Cannot import Java class 'io.pivotal.greenplum.spark.GreenplumRelationProvider'" with pytest.raises(ValueError, match=msg): @@ -242,6 +254,7 @@ def test_greenplum_write_options_cannot_be_used_in_read_options(arg, value): ("partitions", 10), ("numPartitions", 10), ("partitionColumn", "abc"), + ("gpdb.matchDistributionPolicy", "true"), ], ) def test_greenplum_read_options_cannot_be_used_in_write_options(arg, value): diff --git a/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py b/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py index 404ca57fa..a97212560 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py @@ -5,6 +5,17 @@ import pytest +try: + from pyspark.sql.types import ( + BinaryType, + LongType, + StructField, + StructType, + TimestampType, + ) +except ImportError: + pytest.skip("Missing pyspark", allow_module_level=True) + from onetl.connection import Kafka from onetl.connection.db_connection.kafka.extra import KafkaExtra from 
onetl.connection.db_connection.kafka.options import KafkaTopicExistBehaviorKafka @@ -883,3 +894,46 @@ def test_kafka_ssl_protocol_with_basic_auth(spark_mock): "ssl.truststore.certificates": "", "security.protocol": "SASL_SSL", } + + +@pytest.mark.parametrize( + "columns,expected_schema", + [ + ( + ["key", "value", "offset"], + StructType( + [ + StructField("key", BinaryType(), nullable=True), + StructField("value", BinaryType(), nullable=False), + StructField("offset", LongType(), nullable=True), + ], + ), + ), + ( + ["key", "timestamp"], + StructType( + [ + StructField("key", BinaryType(), nullable=True), + StructField("timestamp", TimestampType(), nullable=True), + ], + ), + ), + ( + ["value"], + StructType( + [ + StructField("value", BinaryType(), nullable=False), + ], + ), + ), + ], +) +def test_get_df_schema(spark_mock, columns, expected_schema): + kafka = Kafka( + addresses=["localhost:9092"], + cluster="my_cluster", + spark=spark_mock, + ) + + df_schema = kafka.get_df_schema(source="test_topic", columns=columns) + assert df_schema == expected_schema
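
The reader tests above exercise two new DBReader methods: has_data(), which reports whether the source currently has any rows to read, and raise_if_no_data(), which raises an exception whose message starts with "No data in the source:". A minimal usage sketch, assuming a Postgres source; the connection details and table name are placeholders, not values from this patch:

from onetl.connection import Postgres
from onetl.db import DBReader

postgres = Postgres(
    host="postgres.example.com",  # placeholder connection details
    user="user",
    password="password",
    database="test",
    spark=spark,  # an already created SparkSession
)

reader = DBReader(connection=postgres, source="schema.table")

# skip the run entirely when the source is empty
if reader.has_data():
    df = reader.run()

# or fail fast instead of producing an empty dataframe
reader.raise_if_no_data()  # raises "No data in the source: ..."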
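
test_postgres_has_data_outside_incremental_strategy and test_postgres_has_data_outside_snapshot_batch_strategy pin a restriction: when the reader is configured with an HWM, has_data() may only be called inside a non-batch strategy block; otherwise a RuntimeError pointing to the DBReader.has_data() documentation is raised. A sketch of both sides, reusing the connection from the previous sketch; the HWM name and column are hypothetical:

from etl_entities.hwm import ColumnIntHWM

from onetl.strategy import IncrementalStrategy

reader = DBReader(
    connection=postgres,  # connection from the previous sketch
    source="schema.table",
    hwm=ColumnIntHWM(name="my_hwm", expression="id_int"),  # hypothetical HWM
)

with IncrementalStrategy():
    if reader.has_data():  # valid: the HWM value is resolved by the surrounding strategy
        df = reader.run()

reader.has_data()  # outside a strategy block: raises RuntimeError ("Check documentation DBReader.has_data(): ...")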
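
The new test_strategy_increment_kafka.py module relies on DBReader.AutoDetectHWM with expression="offset" resolving to a KeyValueIntHWM, whose value maps each partition number to its read offset; since offsets start at zero, the tests can compare hwm.value with per-partition row counts of the read dataframes. A condensed sketch of the read loop used in the tests, with placeholder broker address and topic name:

import secrets

from onetl.connection import Kafka
from onetl.db import DBReader
from onetl.strategy import IncrementalStrategy

kafka = Kafka(
    addresses=["kafka.example.com:9092"],  # placeholder
    cluster="cluster",
    spark=spark,
)

reader = DBReader(
    connection=kafka,
    source="topic",
    hwm=DBReader.AutoDetectHWM(name=secrets.token_hex(5), expression="offset"),
)

with IncrementalStrategy():
    df = reader.run()  # first run reads the whole topic, HWM stores {partition: offset}

with IncrementalStrategy():
    df = reader.run()  # later runs read only messages appended after the stored offsets

Batch strategies remain unsupported for this HWM: as test_strategy_kafka_with_batch_strategy_error asserts, iterating the batches raises "HWM: ... cannot be used with Batch strategies", because a step size cannot be distributed across Kafka partitions.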
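
test_greenplum_get_packages_explicit_version covers the new package_version argument of Greenplum.get_packages(), with connector 2.3.0 as the new default. A usage sketch mirroring the parametrized cases:

from onetl.connection import Greenplum

# default connector version
Greenplum.get_packages(scala_version="2.12")
# -> ["io.pivotal:greenplum-spark_2.12:2.3.0"]

# pin the previous connector release explicitly
Greenplum.get_packages(package_version="2.1.4", scala_version="2.12")
# -> ["io.pivotal:greenplum-spark_2.12:2.1.4"]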
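
The two new dialect tests pin how row limits are rendered per database: Oracle folds the limit into the WHERE clause via ROWNUM, MSSQL uses SELECT TOP, and limit=0 degenerates into an always-false predicate in both. A sketch of the calls, with placeholder hosts and credentials; the comments show the generated queries flattened onto one line, while the actual output is formatted across multiple lines as the assertions above show:

from onetl.connection import MSSQL, Oracle

oracle = Oracle(host="oracle.example.com", user="user", sid="XE", password="passwd", spark=spark)
oracle.dialect.get_sql_query(table="default.test", limit=5, where="column1 = 'value'")
# SELECT * FROM default.test WHERE (column1 = 'value') AND (ROWNUM <= 5)

mssql = MSSQL(host="mssql.example.com", user="user", database="database", password="passwd", spark=spark)
mssql.dialect.get_sql_query(table="default.test", limit=5, where="column1 = 'value'")
# SELECT TOP 5 * FROM default.test WHERE column1 = 'value'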
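
Finally, test_get_df_schema checks that Kafka.get_df_schema() returns a Spark schema containing only the requested columns, with value being the only non-nullable field. A short sketch, reusing a Kafka connection like the one above; the topic name is a placeholder:

schema = kafka.get_df_schema(source="some_topic", columns=["key", "value", "offset"])
# StructType([
#     StructField("key", BinaryType(), nullable=True),
#     StructField("value", BinaryType(), nullable=False),
#     StructField("offset", LongType(), nullable=True),
# ])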