diff --git a/.env.docker b/.env.docker index b9c2105aa..cb0394806 100644 --- a/.env.docker +++ b/.env.docker @@ -87,6 +87,16 @@ ONETL_SFTP_PORT=2222 ONETL_SFTP_USER=onetl ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho +# Samba +ONETL_SAMBA_HOST=samba +ONETL_SAMBA_PROTOCOL=SMB +ONETL_SAMBA_UID=1000 +ONETL_SAMBA_GID=1000 +ONETL_SAMBA_PORT=445 +ONETL_SAMBA_SHARE=SmbShare +ONETL_SAMBA_USER=onetl +ONETL_SAMBA_PASSWORD=awd123fd1 + # Webdav ONETL_WEBDAV_HOST=webdav ONETL_WEBDAV_PORT=80 diff --git a/.env.local b/.env.local index af2551dbd..2e05030f3 100644 --- a/.env.local +++ b/.env.local @@ -87,6 +87,16 @@ export ONETL_SFTP_PORT=2222 export ONETL_SFTP_USER=onetl export ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho +# Samba +export ONETL_SAMBA_HOST=localhost +export ONETL_SAMBA_PROTOCOL=SMB +export ONETL_SAMBA_UID=1000 +export ONETL_SAMBA_GID=1000 +export ONETL_SAMBA_PORT=445 +export ONETL_SAMBA_SHARE=SmbShare +export ONETL_SAMBA_USER=onetl +export ONETL_SAMBA_PASSWORD=awd123fd1 + # Webdav export ONETL_WEBDAV_HOST=localhost export ONETL_WEBDAV_PORT=8000 diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml index 43e02d8c2..292319bb5 100644 --- a/.github/workflows/data/greenplum/matrix.yml +++ b/.github/workflows/data/greenplum/matrix.yml @@ -7,7 +7,7 @@ min: &min max: &max # Greenplum connector does not support Spark 3.3+ - spark-version: 3.2.3 + spark-version: 3.2.4 python-version: '3.10' java-version: 11 os: ubuntu-latest diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index af841433b..e956169ba 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -4,12 +4,18 @@ min: &min java-version: 8 os: ubuntu-latest -avro: &avro +min_avro: &min_avro spark-version: 2.4.8 python-version: '3.7' java-version: 8 os: ubuntu-latest +min_excel: &min_excel + spark-version: 3.2.4 + python-version: '3.7' + java-version: 8 + os: ubuntu-latest + max: &max spark-version: 3.4.1 python-version: '3.11' @@ -25,12 +31,15 @@ latest: &latest matrix: small: - <<: *max - - <<: *avro + - <<: *min_avro + - <<: *min_excel full: - <<: *min - - <<: *avro + - <<: *min_avro + - <<: *min_excel - <<: *max nightly: - <<: *min - - <<: *avro + - <<: *min_avro + - <<: *min_excel - <<: *latest diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index 80f81aacf..f91e1baaa 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -1,6 +1,6 @@ min: &min # MongoDB connector does not support Spark 2 - spark-version: 3.2.3 + spark-version: 3.2.4 python-version: '3.7' java-version: 8 os: ubuntu-latest diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index 57fe2ca8f..44779fe95 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -2,7 +2,7 @@ min: &min # prior image versions returns empty content of bucket root, some kind of bug minio-version: 2021.3.17 # Minimal Spark version with Hadoop 3.x support - spark-version: 3.2.3 + spark-version: 3.2.4 python-version: '3.7' java-version: 8 os: ubuntu-latest diff --git a/.github/workflows/data/samba/ignored.txt b/.github/workflows/data/samba/ignored.txt new file mode 100644 index 000000000..d8f8d4692 --- /dev/null +++ b/.github/workflows/data/samba/ignored.txt @@ -0,0 +1 @@ +docs diff --git a/.github/workflows/data/samba/matrix.yml 
b/.github/workflows/data/samba/matrix.yml new file mode 100644 index 000000000..a4a3afe30 --- /dev/null +++ b/.github/workflows/data/samba/matrix.yml @@ -0,0 +1,18 @@ +min: &min + python-version: '3.7' + os: ubuntu-latest + +max: &max + python-version: '3.11' + os: ubuntu-latest + +matrix: + small: + - server-version: latest + <<: *max + full: &full + - server-version: latest + <<: *min + - server-version: latest + <<: *max + nightly: *full diff --git a/.github/workflows/data/samba/tracked.txt b/.github/workflows/data/samba/tracked.txt new file mode 100644 index 000000000..5f7fcf905 --- /dev/null +++ b/.github/workflows/data/samba/tracked.txt @@ -0,0 +1 @@ +**/samba* diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index fd7e24aae..b9d160b42 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -41,6 +41,8 @@ on: value: ${{ jobs.get-matrix.outputs.matrix-s3 }} matrix-sftp: value: ${{ jobs.get-matrix.outputs.matrix-sftp }} + matrix-samba: + value: ${{ jobs.get-matrix.outputs.matrix-samba }} matrix-webdav: value: ${{ jobs.get-matrix.outputs.matrix-webdav }} @@ -69,6 +71,7 @@ jobs: matrix-hdfs: ${{ toJson(fromJson(steps.matrix-hdfs.outputs.result)[steps.key-hdfs.outputs.key]) }} matrix-s3: ${{ toJson(fromJson(steps.matrix-s3.outputs.result)[steps.key-s3.outputs.key]) }} matrix-sftp: ${{ toJson(fromJson(steps.matrix-sftp.outputs.result)[steps.key-sftp.outputs.key]) }} + matrix-samba: ${{ toJson(fromJson(steps.matrix-samba.outputs.result)[steps.key-samba.outputs.key]) }} matrix-webdav: ${{ toJson(fromJson(steps.matrix-webdav.outputs.result)[steps.key-webdav.outputs.key]) }} steps: - name: Checkout code @@ -635,6 +638,36 @@ jobs: with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml + - name: Check if Samba files are changed + id: changed-samba + uses: tj-actions/changed-files@v35 + with: + files_from_source_file: .github/workflows/data/samba/tracked.txt + files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt + + - name: Print Samba files changed + run: | + echo '${{ steps.changed-samba.outputs.all_changed_files }}' + + - name: Calculate Samba matrix key + id: key-samba + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-samba.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Samba matrix + id: matrix-samba + uses: mikefarah/yq@v4.33.3 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml + - name: Check if WebDAV files are changed id: changed-webdav uses: tj-actions/changed-files@v35 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 209364f4b..7608ebe6e 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -303,6 +303,21 @@ jobs: os: ${{ matrix.os }} with-cache: false + tests-samba: + name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + needs: [get-matrix] + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.get-matrix.outputs.matrix-samba) }} + + uses: ./.github/workflows/test-samba.yml + with: + server-version: ${{ matrix.server-version }} + python-version: ${{ matrix.python-version }} + os: ${{ matrix.os }} + with-cache: false + tests-webdav: name: Run WebDAV tests (server=${{ matrix.openwebdavssh-version }}, python=${{ 
matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] @@ -338,6 +353,7 @@ jobs: - tests-hdfs - tests-s3 - tests-sftp + - tests-samba - tests-webdav steps: diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml new file mode 100644 index 000000000..d823a9ae7 --- /dev/null +++ b/.github/workflows/test-samba.yml @@ -0,0 +1,81 @@ +name: Tests for Samba +on: + workflow_call: + inputs: + server-version: + required: true + type: string + python-version: + required: true + type: string + os: + required: true + type: string + with-cache: + required: false + type: boolean + default: true + +jobs: + test-samba: + name: Run Samba tests (server=${{ inputs.server-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + runs-on: ${{ inputs.os }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + + - name: Cache pip + uses: actions/cache@v3 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba- + + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel + + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/samba.txt -r requirements/tests/base.txt + + # Replace with Github Actions' because of custom parameter for samba container start + - name: Start Samba + run: | + docker compose down -v --remove-orphans + docker compose up -d samba + env: + SAMBA_IMAGE: elswork/samba:${{ inputs.server-version }} + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} + + - name: Wait for Samba to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 445 -t 60 + + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m samba + + - name: Shutdown Samba + if: always() + run: | + docker compose down -v --remove-orphans + env: + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} + + - name: Upload coverage results + uses: actions/upload-artifact@v3 + with: + name: samba-${{ inputs.server-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 44125d701..1df7f5306 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -287,6 +287,21 @@ jobs: python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} + tests-samba: + name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + needs: [get-matrix] + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.get-matrix.outputs.matrix-samba) }} + + uses: ./.github/workflows/test-samba.yml + with: + server-version: ${{ matrix.server-version }} + python-version: ${{ matrix.python-version }} + os: ${{ matrix.os }} + + tests-webdav: name: Run WebDAV tests (server=${{ matrix.webdav-version }}, python=${{ 
matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] @@ -321,6 +336,7 @@ jobs: - tests-hdfs - tests-s3 - tests-sftp + - tests-samba - tests-webdav steps: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fd0c89d6b..193ae3c3d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -59,12 +59,12 @@ repos: - id: rst-inline-touching-normal - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.10.1 + rev: v3.13.0 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 23.9.1 hooks: - id: black language_version: python3 diff --git a/.readthedocs.yml b/.readthedocs.yml index 4d54479b4..923741b22 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -14,6 +14,7 @@ python: - ftp - ftps - hdfs + - samba - s3 - sftp - webdav diff --git a/README.rst b/README.rst index 13b280830..4f8b0aca8 100644 --- a/README.rst +++ b/README.rst @@ -54,7 +54,7 @@ Requirements * **Python 3.7 - 3.11** * PySpark 2.3.x - 3.4.x (depends on used connector) * Java 8+ (required by Spark, see below) -* Kerberos libs & GCC (required by ``Hive`` and ``HDFS`` connectors) +* Kerberos libs & GCC (required by ``Hive``, ``HDFS`` and ``SparkHDFS`` connectors) Supported storages ------------------ @@ -93,6 +93,8 @@ Supported storages | | FTPS | | + +--------------+----------------------------------------------------------------------------------------------------------------------+ | | WebDAV | `WebdavClient3 library `_ | ++ +--------------+----------------------------------------------------------------------------------------------------------------------+ +| | Samba | `pysmb library `_ | +--------------------+--------------+----------------------------------------------------------------------------------------------------------------------+ | Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | | +--------------+ + @@ -109,16 +111,16 @@ Documentation See https://onetl.readthedocs.io/ -.. install - How to install --------------- -.. _minimal-install: +.. _install: Minimal installation ~~~~~~~~~~~~~~~~~~~~ +.. _minimal-install: + Base ``onetl`` package contains: * ``DBReader``, ``DBWriter`` and related classes @@ -140,14 +142,16 @@ It can be installed via: This method is recommended for use in third-party libraries which require for ``onetl`` to be installed, but do not use its connection classes. -.. _spark-install: - With DB and FileDF connections ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _spark-install: + All DB connection classes (``Clickhouse``, ``Greenplum``, ``Hive`` and others) and all FileDF connection classes (``SparkHDFS``, ``SparkLocalFS``, ``SparkS3``) -require PySpark to be installed. +require Spark to be installed. + +.. _java-install: Firstly, you should install JDK. 
The exact installation instruction depends on your OS, here are some examples: @@ -169,13 +173,15 @@ Compatibility matrix +--------------------------------------------------------------+-------------+-------------+-------+ | `2.4.x `_ | 3.7 only | 8 only | 2.11 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +| `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | +| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ | `3.4.x `_ | 3.7 - 3.11 | 8u362 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ +.. _pyspark-install: + Then you should install PySpark via passing ``spark`` to ``extras``: .. code:: bash @@ -191,12 +197,11 @@ or install PySpark explicitly: or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance. **Otherwise connection object cannot be created.** - -.. _files-install: - With File connections ~~~~~~~~~~~~~~~~~~~~~ +.. _files-install: + All File (but not *FileDF*) connection classes (``FTP``, ``SFTP``, ``HDFS`` and so on) requires specific Python clients to be installed. Each client can be installed explicitly by passing connector name (in lowercase) to ``extras``: @@ -204,7 +209,7 @@ Each client can be installed explicitly by passing connector name (in lowercase) .. code:: bash pip install onetl[ftp] # specific connector - pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav] # multiple connectors + pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba] # multiple connectors To install all file connectors at once you can pass ``files`` to ``extras``: @@ -214,22 +219,21 @@ To install all file connectors at once you can pass ``files`` to ``extras``: **Otherwise class import will fail.** - -.. _kerberos-install: - With Kerberos support ~~~~~~~~~~~~~~~~~~~~~ +.. _kerberos-install: + Most of Hadoop instances set up with Kerberos support, so some connections require additional setup to work properly. * ``HDFS`` Uses `requests-kerberos `_ and - `GSSApi `_ for authentication in WebHDFS. + `GSSApi `_ for authentication. It also uses ``kinit`` executable to generate Kerberos ticket. * ``Hive`` and ``SparkHDFS`` - Requires Kerberos ticket to exist before creating Spark session. + require Kerberos ticket to exist before creating Spark session. So you need to install OS packages with: @@ -250,12 +254,11 @@ Also you should pass ``kerberos`` to ``extras`` to install required Python packa pip install onetl[kerberos] - -.. _full-install: - Full bundle ~~~~~~~~~~~ +.. _full-bundle: + To install all connectors and dependencies, you can pass ``all`` into ``extras``: .. code:: bash @@ -269,7 +272,7 @@ To install all connectors and dependencies, you can pass ``all`` into ``extras`` This method consumes a lot of disk space, and requires for Java & Kerberos libraries to be installed into your OS. -.. quick-start +.. 
_quick-start: Quick start ------------ diff --git a/conftest.py b/conftest.py index ab0b60a5c..52b6c5754 100644 --- a/conftest.py +++ b/conftest.py @@ -19,5 +19,6 @@ "tests.fixtures.connections.local_fs", "tests.fixtures.connections.s3", "tests.fixtures.connections.sftp", + "tests.fixtures.connections.samba", "tests.fixtures.connections.webdav", ] diff --git a/docker-compose.yml b/docker-compose.yml index a08d8fc38..bdcfe3954 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -164,6 +164,18 @@ services: networks: - onetl + samba: + image: elswork/samba + restart: unless-stopped + ports: + - "139:139" + - "445:445" + volumes: + - ./docker/samba/custom_entrypoint.sh:/custom_entrypoint.sh + entrypoint: ["/custom_entrypoint.sh"] + networks: + - onetl + s3: image: ${S3_IMAGE:-bitnami/minio:latest} restart: unless-stopped diff --git a/docker/Dockerfile b/docker/Dockerfile index 103cc2b26..817d4eab2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -57,6 +57,7 @@ RUN pip install \ -r /app/requirements/hdfs.txt \ -r /app/requirements/s3.txt \ -r /app/requirements/sftp.txt \ + -r /app/requirements/samba.txt \ -r /app/requirements/webdav.txt \ -r /app/requirements/kerberos.txt \ -r /app/requirements/docs.txt \ diff --git a/docker/samba/custom_entrypoint.sh b/docker/samba/custom_entrypoint.sh new file mode 100755 index 000000000..f0d4078c0 --- /dev/null +++ b/docker/samba/custom_entrypoint.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +# allow create files and directories +mkdir -p /share/folder +chmod 0777 /share/folder +/entrypoint.sh -u "1000:1000:onetl:onetl:awd123fd1" -s "SmbShare:/share/folder:rw:onetl" diff --git a/docs/changelog/0.9.4.rst b/docs/changelog/0.9.4.rst new file mode 100644 index 000000000..4eb406ae0 --- /dev/null +++ b/docs/changelog/0.9.4.rst @@ -0,0 +1,30 @@ +0.9.4 (2023-09-26) +================== + +Features +-------- + +- Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` (:github:pull:`143`) +- Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` (:github:pull:`144`) +- Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` (:github:pull:`145`) +- Add ``Excel`` file format support. (:github:pull:`148`) +- Add ``Samba`` file connection. + It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. (:github:pull:`150`) + + +Improvements +------------ + +- Add documentation about different ways of passing packages to Spark session. (:github:pull:`151`) +- Drastically improve ``Greenplum`` documentation: + * Added information about network ports, grants, ``pg_hba.conf`` and so on. + * Added interaction schemas for reading, writing and executing statements in Greenplum. + * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. (:github:pull:`154`) +- Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. (:github:pull:`156`) +- Call ``.close()`` on FileConnection then it is removed by garbage collector. (:github:pull:`156`) + + +Bug Fixes +--------- + +- Fix issue while stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. (:github:pull:`156`) diff --git a/docs/changelog/NEXT_RELEASE.rst b/docs/changelog/NEXT_RELEASE.rst index 5e26856b4..ee4196843 100644 --- a/docs/changelog/NEXT_RELEASE.rst +++ b/docs/changelog/NEXT_RELEASE.rst @@ -3,3 +3,34 @@ .. and add it to index.rst .. 
towncrier release notes start + +0.9.4 (2023-09-26) +================== + +Features +-------- + +- Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` (:github:pull:`143`) +- Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` (:github:pull:`144`) +- Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` (:github:pull:`145`) +- Add ``Excel`` file format support. (:github:pull:`148`) +- Add ``Samba`` file connection. + It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. (:github:pull:`150`) + + +Improvements +------------ + +- Add documentation about different ways of passing packages to Spark session. (:github:pull:`151`) +- Drastically improve ``Greenplum`` documentation: + * Added information about network ports, grants, ``pg_hba.conf`` and so on. + * Added interaction schemas for reading, writing and executing statements in Greenplum. + * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. (:github:pull:`154`) +- Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. (:github:pull:`156`) +- Call ``.close()`` on FileConnection then it is removed by garbage collector. (:github:pull:`156`) + + +Bug Fixes +--------- + +- Fix issue while stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. (:github:pull:`156`) diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index 92701e1e1..6130bfdc8 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -4,6 +4,7 @@ DRAFT NEXT_RELEASE + 0.9.4 0.9.3 0.9.2 0.9.1 diff --git a/docs/conf.py b/docs/conf.py index 87d6fd17b..06a5b08aa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -56,6 +56,7 @@ "sphinx.ext.autosummary", "sphinxcontrib.autodoc_pydantic", "sphinxcontrib.towncrier", # provides `towncrier-draft-entries` directive + "sphinxcontrib.plantuml", ] numpydoc_show_class_members = True autodoc_pydantic_model_show_config = False diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index b0833b213..e2179a4ec 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -3,6 +3,47 @@ Executing statements in Greenplum ================================== +Interaction schema +------------------ + +Unlike reading & writing, executing statements in Greenplum is done **only** through Greenplum master node, +without any interaction between Greenplum segments and Spark executors. More than that, Spark executors are not used in this case. + +The only port used while interacting with Greenplum in this case is ``5432`` (Greenplum master port). + +.. dropdown:: Spark <-> Greenplum interaction during Greenplum.execute()/Greenplum.fetch() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + end box + + box "Greenplum" + participant "Greenplum master" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + + == Greenplum.execute(statement) == + "Spark driver" --> "Greenplum master" : EXECUTE statement + "Greenplum master" -> "Spark driver" : RETURN result + + == Greenplum.close() == + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml + +Options +------- + .. 
currentmodule:: onetl.connection.db_connection.greenplum.connection .. automethod:: Greenplum.fetch diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 964d9cdcf..57db9635e 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -7,7 +7,7 @@ Version Compatibility --------------------- * Greenplum server versions: 5.x, 6.x -* Spark versions: 2.3.x - 3.2.x (Spark 3.3.x is not supported yet) +* Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet) * Java versions: 8 - 11 See `official documentation `_. @@ -18,13 +18,7 @@ Installing PySpark To use Greenplum connector you should have PySpark installed (or injected to ``sys.path``) BEFORE creating the connector instance. -You can install PySpark as follows: - -.. code:: bash - - pip install onetl pyspark=3.2.3 # pass specific PySpark version - -See :ref:`spark-install` instruction for more details. +See :ref:`install-spark` installation instruction for more details. Downloading Pivotal package --------------------------- @@ -33,140 +27,189 @@ To use Greenplum connector you should download connector ``.jar`` file from `Pivotal website `_ and then pass it to Spark session. -There are several ways to do that. +.. warning:: + + Please pay attention to :ref:`Spark & Scala version compatibility `. + +There are several ways to do that. See :ref:`java-packages` for details. .. note:: - Please pay attention to Spark <-> Scala version compatibility. See :ref:`spark-compatibility-matrix`. + If you're uploading package to private package repo, use ``groupId=io.pivotal`` and ``artifactoryId=greenplum-spark_2.12`` + (``2.12`` is Scala version) to give uploaded package a proper name. -Using ``spark.jars`` -~~~~~~~~~~~~~~~~~~~~ +Connecting to Greenplum +----------------------- -The most simple solution, but this requires to store/deploy ``.jar`` file in the local environment. +Interaction schema +~~~~~~~~~~~~~~~~~~ -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Create Spark session with passing ``.jar`` absolute file path to ``spark.jars`` Spark config option, e.g. +Spark executors open ports to listen incoming requests. +Greenplum segments are initiating connections to Spark executors using `EXTERNAL TABLE `_ +functionality, and send/read data using `gpfdist `_ protocol. -.. code:: python +Data is **not** send through Greenplum master. +Greenplum master only receives commands to start reading/writing process, and manages all the metadata (external table location, schema and so on). - # no need to use spark.jars.packages - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars", "/path/to/downloaded.jar") - .getOrCreate() - ) +More details can be found in `official documentation `_. -Using ``spark.jars.repositories`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Number of parallel connections +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Can be used if you have access both to public repos (like Maven) and a private Artifactory/Nexus repo. +.. warning:: -* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Upload ``.jar`` file to private repository (with ``groupId=io.pivotal``, ``artifactoryId=greenplum-spark_2.12``). 
-* Pass repo URL to ``spark.jars.repositories`` Spark config option -* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option. + This is very important!!! + If you don't limit number of connections, you can exceed the `max_connections `_ + limit set on the Greenplum side. It's usually not so high, e.g. 500-1000 connections max, + depending on your Greenplum instance settings and using connection balancers like ``pgbouncer``. -Example -^^^^^^^ + Consuming all available connections means **nobody** (even admin users) can connect to Greenplum. -.. code:: python +Each job on the Spark executor makes its own connection to Greenplum master node, +so you need to limit number of connections to avoid opening too many of them. + +* Reading about ``5-10Gb`` of data requires about ``3-5`` parallel connections. +* Reading about ``20-30Gb`` of data requires about ``5-10`` parallel connections. +* Reading about ``50Gb`` of data requires ~ ``10-20`` parallel connections. +* Reading about ``100+Gb`` of data requires ``20-30`` parallel connections. +* Opening more than ``30-50`` connections is not recommended. + +Number of connections can be limited by 2 ways: + +* By limiting number of Spark executors and number of cores per-executor. Max number of parallel jobs is ``executors * cores``. + +.. tabs:: - maven_packages = Greenplum.get_packages(spark_version="3.2") - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars.repositories", "http://nexus.domain.com/example-repo/") - .config("spark.jars.packages", ",".join(maven_packages)) - .getOrCreate() - ) - - -Using ``spark.jars.ivySettings`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Same as above, but can be used even if there is no network access to public repos like Maven. - -* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Upload ``.jar`` file to private repository (with ``groupId=io.pivotal``, ``artifactoryId=greenplum-spark_2.12``). -* Create `ivysettings.xml `_ file. -* Add here a resolver with repository URL (and credentials, if required). -* Pass ``ivysettings.xml`` absolute path to ``spark.jars.ivySettings`` Spark config option. -* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option. - -Example -^^^^^^^ - -.. code-block:: xml - :caption: ivysettings.xml - - - - - - - - - - - - - - - - - - - - -.. code-block:: python - :caption: script.py - - maven_packages = Greenplum.get_packages(spark_version="3.2") - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars.ivySettings", "/path/to/ivysettings.xml") - .config("spark.jars.packages", ",".join(maven_packages)) - .getOrCreate() - ) - -Moving ``.jar`` file to ``~/.ivy2/jars/`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven. - -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Move it to ``~/.ivy2/jars/`` folder -* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option. - -Example -^^^^^^^ + .. code-tab:: py Spark with master=local + + ( + SparkSession.builder + # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 + .config("spark.master", "local[10]") + .config("spark.executor.cores", 1) + ) + + .. 
code-tab:: py Spark with master=yarn or master=k8s, dynamic allocation + + ( + SparkSession.builder + .config("spark.master", "yarn") + # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10 + .config("spark.dynamicAllocation.maxExecutors", 10) + .config("spark.executor.cores", 1) + ) + + .. code-tab:: py Spark with master=yarn or master=k8s, static allocation + + ( + SparkSession.builder + .config("spark.master", "yarn") + # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 + .config("spark.executor.instances", 10) + .config("spark.executor.cores", 1) + ) + +* By limiting connection pool size user by Spark (**only** for Spark with ``master=local``): .. code:: python - maven_packages = Greenplum.get_packages(spark_version="3.2") - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars.packages", ",".join(maven_packages)) - .getOrCreate() - ) + spark = SparkSession.builder.config("spark.master", "local[*]").getOrCreate() + + # No matter how many executors are started and how many cores they have, + # number of connections cannot exceed pool size: + Greenplum( + ..., + extra={ + "pool.maxSize": 10, + }, + ) + +See `connection pooling `_ +documentation. + + +* By setting :obj:`num_partitions ` + and :obj:`partition_column ` (not recommended). + +Allowing connection to Greenplum master +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Ask your Greenplum cluster administrator to allow your user to connect to Greenplum master node, +e.g. by updating ``pg_hba.conf`` file. + +More details can be found in `official documentation `_. + +Network ports +~~~~~~~~~~~~~ + +To read data from Greenplum using Spark, following ports should be opened in firewall between Spark and Greenplum: + +* Spark driver and all Spark executors -> port ``5432`` on Greenplum master node. + + This port number should be set while connecting to Greenplum: + + .. code:: python + + Greenplum(host="master.host", port=5432, ...) + +* Greenplum segments -> some port range (e.g. ``41000-42000``) **listened by Spark executor**. + + This range should be set in ``extra`` option: + + .. code:: python + + Greenplum( + ..., + extra={ + "server.port": "41000-42000", + }, + ) + + Number of ports in this range is ``number of parallel running Spark sessions`` * ``number of parallel connections per session``. + + Number of connections per session (see below) is usually less than ``30`` (see below). + + Number of session depends on your environment: + * For ``master=local`` only few ones-tens sessions can be started on the same host, depends on available RAM and CPU. + + * For ``master=yarn`` / ``master=k8s`` hundreds or thousands of sessions can be started simultaneously, + but they are executing on different cluster nodes, so one port can be opened on different nodes at the same time. + +More details can be found in official documentation: + * `port requirements `_ + * `format of server.port value `_ + * `port troubleshooting `_ + +Required grants +~~~~~~~~~~~~~~~ + +Ask your Greenplum cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: -Inserting ``.jar`` file to Spark jars folder -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. code-tab:: sql Reading & writing -Can be used to embed ``.jar`` files to a default Spark classpath. 
+ GRANT USAGE ON SCHEMA myschema TO username; + GRANT CREATE ON SCHEMA myschema TO username; + GRANT SELECT, INSERT ON SCHEMA myschema.mytable TO username; + ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``. -* Create Spark session **WITHOUT** passing Greenplum package name to ``spark.jars.packages`` + .. code-tab:: sql Reading from Greenplum + GRANT USAGE ON SCHEMA schema_to_read TO username; + GRANT CREATE ON SCHEMA schema_to_read TO username; + GRANT SELECT ON SCHEMA schema_to_read.table_to_read TO username; + -- yes, ``writable``, because data is written from Greenplum to Spark executor. + ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); -Manually adding ``.jar`` files to ``CLASSPATH`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. code-tab:: sql Writing to Greenplum -Can be used to embed ``.jar`` files to a default Java classpath. + GRANT USAGE ON SCHEMA schema_to_write TO username; + GRANT CREATE ON SCHEMA schema_to_write TO username; + GRANT SELECT, INSERT ON SCHEMA schema_to_write.table_to_write TO username; + -- yes, ``readable``, because data is read from Spark executor to Greenplum. + ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist'); -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Set environment variable ``CLASSPATH`` to ``/path/to/downloader.jar`` -* Create Spark session **WITHOUT** passing Greenplum package name to ``spark.jars.packages`` +More details can be found in `official documentation `_. diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst index 2640f7e6c..30d669fea 100644 --- a/docs/connection/db_connection/greenplum/read.rst +++ b/docs/connection/db_connection/greenplum/read.rst @@ -8,20 +8,143 @@ For reading data from Greenplum, use :obj:`DBReader `, - and drop staging table after reading is finished. +Interaction schema +------------------ - In this case data will be read directly from Greenplum segment nodes in a distributed way. +High-level schema is described in :ref:`greenplum-prerequisites`. You can find detailed interaction schema below. + +.. dropdown:: Spark <-> Greenplum interaction during DBReader.run() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + participant "Spark executor1" + participant "Spark executor2" + participant "Spark executorN" + end box + + box "Greenplum" + participant "Greenplum master" + participant "Greenplum segment1" + participant "Greenplum segment2" + participant "Greenplum segmentN" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + + "Spark driver" --> "Greenplum master" : CHECK IF TABLE EXISTS gp_table + "Greenplum master" --> "Spark driver" : TABLE EXISTS + "Spark driver" -> "Greenplum master" : SHOW SCHEMA FOR gp_table + "Greenplum master" --> "Spark driver" : (id bigint, col1 int, col2 text, ...) + + == DBReader.run() == + + "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 + "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) 
PARTITION 2 + "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV + "Spark executor1" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1 + note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master + "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1 + note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments. + + "Spark executor2" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) USING address=executor2_host:executor2_port;\nINSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2 + "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2 + + "Spark executorN" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) USING address=executorN_host:executorN_port;\nINSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N + "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN + + "Greenplum segment1" ->o "Spark executor1" -- : INITIALIZE CONNECTION TO Spark executor1\nPUSH DATA TO Spark executor1 + note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor + + "Greenplum segment2" ->o "Spark executor2" -- : INITIALIZE CONNECTION TO Spark executor2\nPUSH DATA TO Spark executor2 + "Greenplum segmentN" ->o "Spark executorN" -- : INITIALIZE CONNECTION TO Spark executorN\nPUSH DATA TO Spark executorN + + == Spark.stop() == + + "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1 + deactivate "Greenplum master" + "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2 + deactivate "Greenplum master" + "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN + deactivate "Greenplum master" + + "Spark executor1" --> "Spark driver" -- : DONE + "Spark executor2" --> "Spark driver" -- : DONE + "Spark executorN" --> "Spark driver" -- : DONE + + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml + +Recommendations +--------------- + +Reading from views +~~~~~~~~~~~~~~~~~~ + +This connector is **NOT** designed to read data from views. + +You can technically read data from a view which has +`gp_segment_id `_ column. +But this is **not** recommended because each Spark executor will run the same query, which may lead to running duplicated calculations +and sending data between segments only to skip most of the result and select only small part. + +Prefer following option: + * Create staging table to store result data, using :obj:`Greenplum.execute ` + * Use the same ``.execute`` method run a query ``INSERT INTO staging_table AS SELECT FROM some_view``. This will be done on Greenplum segments side, query will be run only once. 
+ * Read data from staging table to Spark executor using :obj:`DBReader `. + * Drop staging table using ``.execute`` method. + +Using ``JOIN`` on Greenplum side +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you need to get the result of joining 2 tables in Greenplum, you should: + * Create staging table to store result data, using ``Greenplum.execute`` + * Use the same ``Greenplum.execute`` to run a query ``INSERT INTO staging_table AS SELECT FROM table1 JOIN table2``. This will be done on Greenplum segments side, in a distributed way. + * Read data from staging table to Spark executor using ``DBReader``. + * Drop staging table using ``Greenplum.execute``. .. warning:: - Greenplum connection does **NOT** support reading data from views which does not have ``gp_segment_id`` column. - Either add this column to a view, or use stating table solution (see above). + Do **NOT** try to read data from ``table1`` and ``table2`` using ``DBReader``, and then join the resulting dataframes! + + This will lead to sending all the data from both tables to Spark executor memory, and then ``JOIN`` + will be performed on Spark side, not Greenplum. This is **very** inefficient. + +Using ``TEMPORARY`` tables +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One might think that writing data from a ``VIEW`` or the result of a ``JOIN`` to a ``TEMPORARY`` table, +and then passing it to DBReader, is an efficient way to read data from Greenplum, because temp tables do not generate WAL files, +and are automatically deleted after finishing the transaction. + +That will **not** work. Each Spark executor establishes its own connection to Greenplum, +and thus reads its own temporary table, which does not contain any data. + +You should use `UNLOGGED `_ tables +to write data to a staging table without generating useless WAL logs. + +Mapping of Greenplum types to Spark types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See `official documentation `_ +for more details. +onETL does not perform any additional casting of types while reading data. + +Options +------- .. currentmodule:: onetl.connection.db_connection.greenplum.options diff --git a/docs/connection/db_connection/greenplum/write.rst b/docs/connection/db_connection/greenplum/write.rst index aeb688ac5..c7a4f1560 100644 --- a/docs/connection/db_connection/greenplum/write.rst +++ b/docs/connection/db_connection/greenplum/write.rst @@ -5,6 +5,101 @@ Writing to Greenplum For writing data to Greenplum, use :obj:`DBWriter ` with options below. + + +Interaction schema +------------------ + +High-level schema is described in :ref:`greenplum-prerequisites`. You can find detailed interaction schema below. + +.. dropdown:: Spark <-> Greenplum interaction during DBWriter.run() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + participant "Spark executor1" + participant "Spark executor2" + participant "Spark executorN" + end box + + box "Greenplum" + participant "Greenplum master" + participant "Greenplum segment1" + participant "Greenplum segment2" + participant "Greenplum segmentN" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + "Spark driver" --> "Greenplum master" ++ : CHECK IF TABLE EXISTS gp_table + "Greenplum master" --> "Spark driver" : TABLE NOT EXISTS + + == DBWriter.run(df) == + + "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...)
PARTITION 1 + "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 + "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV + "Spark executor1" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO gp_table FROM spark_executor1 + note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master + "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM spark_executor1 TO gp_table_data_on_segment1 + note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments. + + "Spark executor2" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) USING address=executor2_host:executor2_port;\nINSERT INTO gp_table FROM spark_executor2 + "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM spark_executor2 TO gp_table_data_on_segment2 + + "Spark executorN" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) USING address=executorN_host:executorN_port;\nINSERT INTO gp_table FROM spark_executorN + "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM spark_executorN TO gp_table_data_on_segmentN + + "Greenplum segment1" -->o "Spark executor1" : INITIALIZE CONNECTION TO Spark executor1 + "Spark executor1" -> "Greenplum segment1" : READ DATA FROM Spark executor1 + note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor + deactivate "Greenplum segment1" + + "Greenplum segment2" -->o "Spark executor2" : INITIALIZE CONNECTION TO Spark executor2 + "Spark executor2" -> "Greenplum segment2" : READ DATA FROM Spark executor2 + deactivate "Greenplum segment2" + + "Greenplum segmentN" -->o "Spark executorN" : INITIALIZE CONNECTION TO Spark executorN + "Spark executorN" -> "Greenplum segmentN" : READ DATA FROM Spark executorN + deactivate "Greenplum segmentN" + + == Finished == + + "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1 + deactivate "Greenplum master" + "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2 + deactivate "Greenplum master" + "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN + deactivate "Greenplum master" + + "Spark executor1" --> "Spark driver" -- : DONE + "Spark executor2" --> "Spark driver" -- : DONE + "Spark executorN" --> "Spark driver" -- : DONE + + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml + +Recommendations +--------------- + +Mapping of Spark types to Greenplum types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See `official documentation `_ +for more details. +onETL does not perform any additional casting of types while writing data. + +Options +------- + .. currentmodule:: onetl.connection.db_connection.greenplum.options .. 
autopydantic_model:: GreenplumWriteOptions diff --git a/docs/connection/db_connection/kafka/read.rst b/docs/connection/db_connection/kafka/read.rst index a19c5e57b..3e6c846b1 100644 --- a/docs/connection/db_connection/kafka/read.rst +++ b/docs/connection/db_connection/kafka/read.rst @@ -7,7 +7,8 @@ For reading data from Kafka, use :obj:`DBReader FTPS HDFS + Samba SFTP S3 Webdav diff --git a/docs/connection/file_connection/samba.rst b/docs/connection/file_connection/samba.rst new file mode 100644 index 000000000..73f7ac3f9 --- /dev/null +++ b/docs/connection/file_connection/samba.rst @@ -0,0 +1,9 @@ +.. _samba: + +Samba connection +============== + +.. currentmodule:: onetl.connection.file_connection.samba + +.. autoclass:: Samba + :members: __init__, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_file, list_dir, download_file, upload_file diff --git a/docs/file_df/file_formats/avro.rst b/docs/file_df/file_formats/avro.rst index 6251a5154..7f1ec0d4f 100644 --- a/docs/file_df/file_formats/avro.rst +++ b/docs/file_df/file_formats/avro.rst @@ -1,7 +1,7 @@ .. _avro-file-format: Avro -======== +==== .. currentmodule:: onetl.file.format.avro diff --git a/docs/file_df/file_formats/excel.rst b/docs/file_df/file_formats/excel.rst new file mode 100644 index 000000000..f9b680085 --- /dev/null +++ b/docs/file_df/file_formats/excel.rst @@ -0,0 +1,9 @@ +.. _excel-file-format: + +Excel +===== + +.. currentmodule:: onetl.file.format.excel + +.. autoclass:: Excel + :members: get_packages diff --git a/docs/file_df/file_formats/index.rst b/docs/file_df/file_formats/index.rst index 7e3367bc6..3a39bc061 100644 --- a/docs/file_df/file_formats/index.rst +++ b/docs/file_df/file_formats/index.rst @@ -9,6 +9,7 @@ File Formats avro csv + excel json jsonline orc diff --git a/docs/file_df/file_formats/orc.rst b/docs/file_df/file_formats/orc.rst index 2d82b3584..491492bac 100644 --- a/docs/file_df/file_formats/orc.rst +++ b/docs/file_df/file_formats/orc.rst @@ -1,7 +1,7 @@ .. _orc-file-format: ORC -======== +=== .. currentmodule:: onetl.file.format.orc diff --git a/docs/index.rst b/docs/index.rst index cc8fdb87d..54ced3d06 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,7 +14,7 @@ :hidden: self - install + install/index quickstart concepts diff --git a/docs/install.rst b/docs/install.rst deleted file mode 100644 index abf328c75..000000000 --- a/docs/install.rst +++ /dev/null @@ -1,3 +0,0 @@ -.. include:: ../README.rst - :start-after: install - :end-before: quick-start diff --git a/docs/install/files.rst b/docs/install/files.rst new file mode 100644 index 000000000..b32c7a807 --- /dev/null +++ b/docs/install/files.rst @@ -0,0 +1,8 @@ +.. _install-files: + +File connections +================= + +.. include:: ../../README.rst + :start-after: .. _files-install: + :end-before: With Kerberos support diff --git a/docs/install/full.rst b/docs/install/full.rst new file mode 100644 index 000000000..a3853207c --- /dev/null +++ b/docs/install/full.rst @@ -0,0 +1,8 @@ +.. _install-full: + +Full bundle +=========== + +.. include:: ../../README.rst + :start-after: .. _full-bundle: + :end-before: .. _quick-start: diff --git a/docs/install/index.rst b/docs/install/index.rst new file mode 100644 index 000000000..47f86287c --- /dev/null +++ b/docs/install/index.rst @@ -0,0 +1,21 @@ +.. _install: + +How to install +============== + +.. include:: ../../README.rst + :start-after: .. 
_minimal-install: + :end-before: With DB and FileDF connections + +Installation in details +----------------------- + +.. toctree:: + :maxdepth: 1 + :caption: How to install + + self + spark + files + kerberos + full diff --git a/docs/install/kerberos.rst b/docs/install/kerberos.rst new file mode 100644 index 000000000..2ba28de4d --- /dev/null +++ b/docs/install/kerberos.rst @@ -0,0 +1,8 @@ +.. _install-kerberos: + +Kerberos support +================ + +.. include:: ../../README.rst + :start-after: .. _kerberos-install: + :end-before: Full bundle diff --git a/docs/install/spark.rst b/docs/install/spark.rst new file mode 100644 index 000000000..861527341 --- /dev/null +++ b/docs/install/spark.rst @@ -0,0 +1,323 @@ +.. _install-spark: + +Spark +===== + +.. include:: ../../README.rst + :start-after: .. _spark-install: + :end-before: .. _java-install: + +Installing Java +--------------- + +.. include:: ../../README.rst + :start-after: .. _java-install: + :end-before: .. _pyspark-install: + +Installing PySpark +------------------ + +.. include:: ../../README.rst + :start-after: .. _pyspark-install: + :end-before: With File connections + +.. _java-packages: + +Injecting Java packages +----------------------- + +Some DB and FileDF connection classes require specific packages to be inserted to ``CLASSPATH`` of Spark session, +like JDBC drivers. + +This is usually done by setting up ``spark.jars.packages`` option while creating Spark session: + +.. code:: python + + # here is a list of packages to be downloaded: + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) + + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + + +Spark automatically resolves package and all its dependencies, download them and inject to Spark session +(both driver and all executors). + +This requires internet access, because package metadata and ``.jar`` files are fetched from `Maven Repository `_. + +But sometimes it is required to: + +* Install package without direct internet access (isolated network) +* Install package which is not available in Maven + +There are several ways to do that. + +Using ``spark.jars`` +^^^^^^^^^^^^^^^^^^^^ + +The most simple solution, but this requires to store raw ``.jar`` files somewhere on filesystem or web server. + +* Download ``package.jar`` files (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. +* (For ``spark.submit.deployMode=cluster``) place downloaded files to HDFS or deploy to any HTTP web server serving static files. See `official documentation `_ for more details. +* Create Spark session with passing ``.jar`` absolute file path to ``spark.jars`` Spark config option: + +.. tabs:: + + .. code-tab:: py for spark.submit.deployMode=client (default) + + jar_files = ["/path/to/package.jar"] + + # do not pass spark.jars.packages + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars", ",".join(jar_files)) + .getOrCreate() + ) + + .. 
code-tab:: py for spark.submit.deployMode=cluster + + # you can also pass URLs like http://domain.com/path/to/downloadable/package.jar + jar_files = ["hdfs:///path/to/package.jar"] + + # do not pass spark.jars.packages + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars", ",".join(jar_files)) + .getOrCreate() + ) + +Using ``spark.jars.repositories`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + In this case Spark still will try to fetch packages from the internet, so if you don't have internet access, + Spark session will be created with significant delay because of all attempts to fetch packages. + +Can be used if you have access both to public repos (like Maven) and a private Artifactory/Nexus repo. + +* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter. +* Upload ``package.jar`` file to private repository (with same ``groupId`` and ``artifactoryId`` as in source package in Maven). +* Pass repo URL to ``spark.jars.repositories`` Spark config option. +* Create Spark session with passing Package name to ``spark.jars.packages`` Spark config option: + +.. code:: python + + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) + + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.repositories", "http://nexus.mydomain.com/private-repo/") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + + +Using ``spark.jars.ivySettings`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Same as above, but can be used even if there is no network access to public repos like Maven. + +* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter. +* Upload ``package.jar`` file to `private repository `_ (with same ``groupId`` and ``artifactoryId`` as in source package in Maven). +* Create ``ivysettings.xml`` file (see below). +* Add here a resolver with repository URL (and credentials, if required). +* Pass ``ivysettings.xml`` absolute path to ``spark.jars.ivySettings`` Spark config option. +* Create Spark session with passing package name to ``spark.jars.packages`` Spark config option: + +.. tabs:: + + .. code-tab:: xml ivysettings-all-packages-uploaded-to-nexus.xml + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-private-packages-in-nexus-public-in-maven.xml + + + + + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-private-packages-in-nexus-public-fetched-using-proxy-repo.xml + + + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-nexus-with-auth-required.xml + + + + + + + + + + + + + + + + + + + + + + +.. code-block:: python + :caption: script.py + + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) + + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.ivySettings", "/path/to/ivysettings.xml") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + +Place ``.jar`` file to ``-/.ivy2/jars/`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven. 
+ +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. +* Move it to the ``~/.ivy2/jars/`` folder. +* Create Spark session, passing the package name to the ``spark.jars.packages`` Spark config option: + +.. code:: python + + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) + + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + +Place ``.jar`` file to Spark jars folder +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + Package file should be placed on all hosts/containers where Spark is running, + both driver and all executors. + + Usually this is used only with either: + * ``spark.master=local`` (driver and executors are running on the same host), + * ``spark.master=k8s://...`` (``.jar`` files are added to image or to volume mounted to all pods). + +Can be used to embed ``.jar`` files into the default Spark classpath. + +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. +* Move it to the ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``. +* Create Spark session **WITHOUT** passing package name to ``spark.jars.packages``: + +.. code:: python + + # no need to set spark.jars.packages or any other spark.jars.* option + # all jars already present in CLASSPATH, and loaded automatically + + spark = SparkSession.builder.config("spark.app.name", "onetl").getOrCreate() + + +Manually adding ``.jar`` files to ``CLASSPATH`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + Package file should be placed on all hosts/containers where Spark is running, + both driver and all executors. + + Usually this is used only with either: + * ``spark.master=local`` (driver and executors are running on the same host), + * ``spark.master=k8s://...`` (``.jar`` files are added to image or to volume mounted to all pods). + +Can be used to embed ``.jar`` files into the default Java classpath. + +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter. +* Set environment variable ``CLASSPATH`` to ``/path/to/package.jar``. You can set multiple file paths, separated by ``:`` on Linux or ``;`` on Windows. +* Create Spark session **WITHOUT** passing package name to ``spark.jars.packages``: + +..
code:: python + + # no need to set spark.jars.packages or any other spark.jars.* option + # all jars already present in CLASSPATH, and loaded automatically + + import os + + jar_files = ["/path/to/package.jar"] + # different delimiters for Windows and Linux + delimiter = ";" if os.name == "nt" else ":" + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.driver.extraClassPath", delimiter.join(jar_files)) + .config("spark.executor.extraClassPath", delimiter.join(jar_files)) + .getOrCreate() + ) diff --git a/onetl/VERSION b/onetl/VERSION index 965065db5..a602fc9e2 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.9.3 +0.9.4 diff --git a/onetl/connection/__init__.py b/onetl/connection/__init__.py index 1c50f7fee..3e40e2a2a 100644 --- a/onetl/connection/__init__.py +++ b/onetl/connection/__init__.py @@ -37,6 +37,7 @@ from onetl.connection.file_connection.ftps import FTPS from onetl.connection.file_connection.hdfs import HDFS from onetl.connection.file_connection.s3 import S3 + from onetl.connection.file_connection.samba import Samba from onetl.connection.file_connection.sftp import SFTP from onetl.connection.file_connection.webdav import WebDAV from onetl.connection.file_df_connection.spark_hdfs import SparkHDFS @@ -62,6 +63,7 @@ "HDFS": "hdfs", "S3": "s3", "SFTP": "sftp", + "Samba": "samba", "WebDAV": "webdav", } diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index dc6acf163..f95884f7d 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -65,7 +65,7 @@ class Clickhouse(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 99de7d90c..d1eedff7f 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -124,34 +124,24 @@ class Greenplum(JDBCMixin, DBConnection): from onetl.connection import Greenplum from pyspark.sql import SparkSession - # Please ask your DevOps and Greenplum admin what port range - # on Spark side can be used to accept requests from Greenplum segments - - extra = { - "server.port": "49152-65535", - } - # Create Spark session with Greenplum connector loaded - # See Prerequisites page for more details maven_packages = Greenplum.get_packages(spark_version="3.2") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.executor.allowSparkContext", "true") + # IMPORTANT!!! + # Set number of executors according to "Prerequisites" -> "Number of executors" .config("spark.dynamicAllocation.maxExecutors", 10) .config("spark.executor.cores", 1) .getOrCreate() ) # IMPORTANT!!! - # Each job on the Spark executor make its own connection to Greenplum master node, - # so we need to limit number of connections to avoid opening too many of them. - # - # Table size ~20Gb requires about 10 executors * cores, - # ~50Gb requires ~ 20 executors * cores, - # 100Gb+ requires 30 executors * cores. - # - # Cores number can be increased, but executors count should be reduced - # to keep the same number of executors * cores. 
+ # Set port range of executors according to "Prerequisites" -> "Network ports" + extra = { + "server.port": "41000-42000", + } # Create connection greenplum = Greenplum( diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index 86785155e..7d4638412 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -107,7 +107,9 @@ class Config: .. warning:: - You should not change this option, unless you know what you're doing + You should not change this option, unless you know what you're doing. + + It's preferable to use default values to read data parallel by number of segments in Greenplum cluster. Possible values: * ``None`` (default): diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index d0bc08d29..6d768ea2e 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -69,7 +69,7 @@ class Hive(DBConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. warning:: @@ -82,7 +82,7 @@ class Hive(DBConnection): .. note:: Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit`` - **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`kerberos-install`. + **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS. @@ -340,6 +340,14 @@ def write_df_to_target( # https://stackoverflow.com/a/72747050 if table_exists and write_options.if_exists != HiveTableExistBehavior.REPLACE_ENTIRE_TABLE: + if write_options.if_exists == HiveTableExistBehavior.ERROR: + raise ValueError("Operation stopped due to Hive.WriteOptions(if_exists='error')") + elif write_options.if_exists == HiveTableExistBehavior.IGNORE: + log.info( + "|%s| Skip writing to existing table because of Hive.WriteOptions(if_exists='ignore')", + self.__class__.__name__, + ) + return # using saveAsTable on existing table does not handle # spark.sql.sources.partitionOverwriteMode=dynamic, so using insertInto instead. self._insert_into(df, target, options) diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index c46b7882d..81445851d 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -26,6 +26,8 @@ class HiveTableExistBehavior(str, Enum): APPEND = "append" + IGNORE = "ignore" + ERROR = "error" REPLACE_ENTIRE_TABLE = "replace_entire_table" REPLACE_OVERLAPPING_PARTITIONS = "replace_overlapping_partitions" @@ -173,9 +175,30 @@ class Config: Table is recreated using options provided by user (``format``, ``compression``, etc) **instead of using original table options**. Be careful - .. note:: + * ``ignore`` + Ignores the write operation if the table/partition already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user (``format``, ``compression``, etc). + + * Table exists + If the table exists, **no further action is taken**. 
This is true whether or not new partition + values are present and whether the partitioning scheme differs or not + + * ``error`` + Raises an error if the table/partition already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user (``format``, ``compression``, etc). + + * Table exists + If the table exists, **raises an error**. This is true whether or not new partition + values are present and whether the partitioning scheme differs or not - ``error`` and ``ignore`` modes are not supported. .. note:: diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 3eb83f538..f5b611910 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -218,7 +218,11 @@ def write_df_to_target( write_options = self.WriteOptions.parse(options) jdbc_params = self.options_to_jdbc_params(write_options) - mode = "append" if write_options.if_exists == JDBCTableExistBehavior.APPEND else "overwrite" + mode = ( + "overwrite" + if write_options.if_exists == JDBCTableExistBehavior.REPLACE_ENTIRE_TABLE + else write_options.if_exists.value + ) log.info("|%s| Saving data to a table %r", self.__class__.__name__, target) df.write.jdbc(table=target, mode=mode, **jdbc_params) log.info("|%s| Table %r successfully written", self.__class__.__name__, target) diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index c998055fe..dacaded77 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -84,6 +84,8 @@ class JDBCTableExistBehavior(str, Enum): APPEND = "append" + IGNORE = "ignore" + ERROR = "error" REPLACE_ENTIRE_TABLE = "replace_entire_table" def __str__(self) -> str: @@ -413,44 +415,65 @@ class Config: .. dropdown:: Behavior in details - * Table does not exist - Table is created using options provided by user - (``createTableOptions``, ``createTableColumnTypes``, etc). + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). - * Table exists - Data is appended to a table. Table has the same DDL as before writing data + * Table exists + Data is appended to a table. Table has the same DDL as before writing data - .. warning:: + .. warning:: - This mode does not check whether table already contains - rows from dataframe, so duplicated rows can be created. + This mode does not check whether table already contains + rows from dataframe, so duplicated rows can be created. - Also Spark does not support passing custom options to - insert statement, like ``ON CONFLICT``, so don't try to - implement deduplication using unique indexes or constraints. + Also Spark does not support passing custom options to + insert statement, like ``ON CONFLICT``, so don't try to + implement deduplication using unique indexes or constraints. - Instead, write to staging table and perform deduplication - using :obj:`~execute` method. + Instead, write to staging table and perform deduplication + using :obj:`~execute` method. * ``replace_entire_table`` **Table is dropped and then created, or truncated**. .. dropdown:: Behavior in details - * Table does not exist - Table is created using options provided by user - (``createTableOptions``, ``createTableColumnTypes``, etc). 
+ * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). - * Table exists - Table content is replaced with dataframe content. + * Table exists + Table content is replaced with dataframe content. - After writing completed, target table could either have the same DDL as - before writing data (``truncate=True``), or can be recreated (``truncate=False`` - or source does not support truncation). + After writing completed, target table could either have the same DDL as + before writing data (``truncate=True``), or can be recreated (``truncate=False`` + or source does not support truncation). - .. note:: + * ``ignore`` + Ignores the write operation if the table already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). + + * Table exists + The write operation is ignored, and no data is written to the table. + + * ``error`` + Raises an error if the table already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). + + * Table exists + An error is raised, and no data is written to the table. - ``error`` and ``ignore`` modes are not supported. """ batchsize: int = 20_000 diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index c02fb82f1..e5e3e312e 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -15,10 +15,11 @@ from __future__ import annotations import logging +import threading from abc import abstractmethod from contextlib import closing, suppress from enum import Enum, auto -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Tuple, TypeVar +from typing import TYPE_CHECKING, Callable, ClassVar, Optional, TypeVar from pydantic import Field, PrivateAttr, SecretStr, validator @@ -76,7 +77,7 @@ class JDBCMixin(FrozenModel): _CHECK_QUERY: ClassVar[str] = "SELECT 1" # cached JDBC connection (Java object), plus corresponding GenericOptions (Python object) - _last_connection_and_options: Optional[Tuple[Any, JDBCMixinOptions]] = PrivateAttr(default=None) + _last_connection_and_options: Optional[threading.local] = PrivateAttr(default=None) @property @abstractmethod @@ -126,6 +127,7 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # noqa: U101 def __del__(self): # noqa: WPS603 # If current object is collected by GC, close all opened connections + # This is safe because closing connection on Spark driver does not influence Spark executors self.close() @slot @@ -459,8 +461,14 @@ def _options_to_connection_properties(self, options: JDBCMixinOptions): return jdbc_options.asConnectionProperties() def _get_jdbc_connection(self, options: JDBCMixinOptions): + if not self._last_connection_and_options: + # connection class can be used in multiple threads. 
+ # each Python thread creates its own thread in JVM + # so we need local variable to create per-thread persistent connection + self._last_connection_and_options = threading.local() + with suppress(Exception): # nothing cached, or JVM failed - last_connection, last_options = self._last_connection_and_options + last_connection, last_options = self._last_connection_and_options.data if options == last_options and not last_connection.isClosed(): return last_connection @@ -471,15 +479,18 @@ def _get_jdbc_connection(self, options: JDBCMixinOptions): driver_manager = self.spark._jvm.java.sql.DriverManager # type: ignore new_connection = driver_manager.getConnection(self.jdbc_url, connection_properties) - self._last_connection_and_options = (new_connection, options) + self._last_connection_and_options.data = (new_connection, options) return new_connection def _close_connections(self): with suppress(Exception): - last_connection, _ = self._last_connection_and_options + # connection maybe not opened yet + last_connection, _ = self._last_connection_and_options.data last_connection.close() - self._last_connection_and_options = None + with suppress(Exception): + # connection maybe not opened yet + del self._last_connection_and_options.data def _get_statement_args(self) -> tuple[int, ...]: resultset = self.spark._jvm.java.sql.ResultSet # type: ignore diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index 3aa8f0fd2..51053df0c 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -72,6 +72,7 @@ class Kafka(DBConnection): * Apache Kafka versions: 0.10 or higher * Spark versions: 2.4.x - 3.4.x + * Scala versions: 2.11 - 2.13 Parameters ---------- @@ -381,6 +382,9 @@ def get_packages( """ Get package names to be downloaded by Spark. |support_hooks| + See `Maven package index `_ + for all available packages. + Parameters ---------- spark_version : str @@ -458,6 +462,10 @@ def close(self): self.auth.cleanup(self) return self + # Do not all __del__ with calling .close(), like other connections, + # because this can influence dataframes created by this connection. + # For example, .close() deletes local keytab copy. + @property def instance_url(self): return "kafka://" + self.cluster diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 8e6110f14..860f7b215 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -65,6 +65,7 @@ class MongoDB(DBConnection): * MongoDB server versions: 4.0 or higher * Spark versions: 3.2.x - 3.4.x * Java versions: 8 - 20 + * Scala versions: 2.11 - 2.13 See `official documentation `_. @@ -82,7 +83,7 @@ class MongoDB(DBConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` installation instruction for more details. 
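The ``JDBCMixin`` change above swaps a plain cached attribute for ``threading.local``, so that each Python thread keeps its own JDBC connection. A standalone sketch of that caching pattern (class and method names here are illustrative, not onETL's public API):

.. code:: python

    import threading


    class PerThreadConnectionCache:
        """Cache one connection object per Python thread."""

        def __init__(self):
            # every thread sees its own independent ``self._local.connection``
            self._local = threading.local()

        def get(self, factory):
            # reuse the connection created earlier by the *current* thread, if any
            connection = getattr(self._local, "connection", None)
            if connection is None:
                connection = factory()
                self._local.connection = connection
            return connection

        def close(self):
            # closes only the current thread's connection
            connection = getattr(self._local, "connection", None)
            if connection is not None:
                connection.close()
                del self._local.connection

Each thread calling ``cache.get(open_connection)`` ends up with its own connection object, which is why a per-thread cache is safe to use from parallel threads.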
Parameters ---------- @@ -124,7 +125,7 @@ class MongoDB(DBConnection): from pyspark.sql import SparkSession # Create Spark session with MongoDB connector loaded - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = MongoDB.get_packages(spark_version="3.2") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) @@ -206,6 +207,7 @@ def get_packages( if scala_ver.digits(2) < (2, 12) or scala_ver.digits(2) > (2, 13): raise ValueError(f"Scala version must be 2.12 - 2.13, got {scala_ver}") + # https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector return [f"org.mongodb.spark:mongo-spark-connector_{scala_ver.digits(2)}:10.1.1"] @classproperty @@ -504,6 +506,16 @@ def write_df_to_target( else "append" ) + if self._collection_exists(target): + if write_options.if_exists == MongoDBCollectionExistBehavior.ERROR: + raise ValueError("Operation stopped due to MongoDB.WriteOptions(if_exists='error')") + elif write_options.if_exists == MongoDBCollectionExistBehavior.IGNORE: + log.info( + "|%s| Skip writing to existing collection because of MongoDB.WriteOptions(if_exists='ignore')", + self.__class__.__name__, + ) + return + log.info("|%s| Saving data to a collection %r", self.__class__.__name__, target) df.write.format("mongodb").mode(mode).options(**write_options_dict).save() log.info("|%s| Collection %r is successfully written", self.__class__.__name__, target) @@ -533,3 +545,13 @@ def _check_java_class_imported(cls, spark): log.debug("Missing Java class", exc_info=e, stack_info=True) raise ValueError(msg) from e return spark + + def _collection_exists(self, source: str) -> bool: + jvm = self.spark._jvm + client = jvm.com.mongodb.client.MongoClients.create(self.connection_url) # type: ignore + collections = set(client.getDatabase(self.database).listCollectionNames().iterator()) + if source in collections: + log.info("|%s| Collection %r exists", self.__class__.__name__, source) + return True + log.info("|%s| Collection %r does not exist", self.__class__.__name__, source) + return False diff --git a/onetl/connection/db_connection/mongodb/options.py b/onetl/connection/db_connection/mongodb/options.py index 85f1935a3..13c256aff 100644 --- a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -81,6 +81,8 @@ class MongoDBCollectionExistBehavior(str, Enum): APPEND = "append" + IGNORE = "ignore" + ERROR = "error" REPLACE_ENTIRE_COLLECTION = "replace_entire_collection" def __str__(self) -> str: @@ -207,33 +209,52 @@ class MongoDBWriteOptions(GenericOptions): .. dropdown:: Behavior in details - * Collection does not exist - Collection is created using options provided by user - (``shardkey`` and others). + * Collection does not exist + Collection is created using options provided by user + (``shardkey`` and others). - * Collection exists - Data is appended to a collection. + * Collection exists + Data is appended to a collection. - .. warning:: + .. warning:: - This mode does not check whether collection already contains - objects from dataframe, so duplicated objects can be created. + This mode does not check whether collection already contains + objects from dataframe, so duplicated objects can be created. * ``replace_entire_collection`` **Collection is deleted and then created**. .. dropdown:: Behavior in details - * Collection does not exist - Collection is created using options provided by user - (``shardkey`` and others). 
+ * Collection does not exist + Collection is created using options provided by user + (``shardkey`` and others). - * Collection exists - Collection content is replaced with dataframe content. + * Collection exists + Collection content is replaced with dataframe content. - .. note:: + * ``ignore`` + Ignores the write operation if the collection already exists. + + .. dropdown:: Behavior in details + + * Collection does not exist + Collection is created using options provided by user + + * Collection exists + The write operation is ignored, and no data is written to the collection. + + * ``error`` + Raises an error if the collection already exists. + + .. dropdown:: Behavior in details + + * Collection does not exist + Collection is created using options provided by user + + * Collection exists + An error is raised, and no data is written to the collection. - ``error`` and ``ignore`` modes are not supported. """ class Config: diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 49fc825d9..6738c2541 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -64,7 +64,7 @@ class MSSQL(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index 868731eaf..abd17df33 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -63,7 +63,7 @@ class MySQL(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 69d7e2c5b..2e1f3e916 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -103,7 +103,7 @@ class Oracle(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index eb07a68f6..22b42c296 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -61,7 +61,7 @@ class Postgres(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 7e730f9eb..2c797b3d8 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -66,7 +66,7 @@ class Teradata(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. 
+ See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/file_connection.py b/onetl/connection/file_connection/file_connection.py index 39e27f2c6..cc5ebbb9e 100644 --- a/onetl/connection/file_connection/file_connection.py +++ b/onetl/connection/file_connection/file_connection.py @@ -17,6 +17,7 @@ import os import threading from abc import abstractmethod +from contextlib import suppress from logging import getLogger from typing import Any, Iterable, Iterator @@ -72,8 +73,10 @@ def client(self): if client and not self._is_client_closed(client): return client except AttributeError: - self._clients_cache.client = self._get_client() - return self._clients_cache.client + pass + + self._clients_cache.client = self._get_client() + return self._clients_cache.client @slot def close(self): @@ -112,8 +115,14 @@ def close(self): except AttributeError: return self - self._close_client(client) - del self._clients_cache.client + with suppress(Exception): + # exceptions while closing client should be ignored + self._close_client(client) + + with suppress(Exception): + # .close() could be called from destructor, and modifying self is not allowed here + del self._clients_cache.client + return self def __enter__(self): @@ -122,6 +131,10 @@ def __enter__(self): def __exit__(self, _exc_type, _exc_value, _traceback): self.close() + def __del__(self): # noqa: WPS603 + # If current object is collected by GC, close opened connection + self.close() + @slot def check(self): log.info("|%s| Checking connection availability...", self.__class__.__name__) diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index 6710a4303..b7dd82257 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -68,7 +68,7 @@ class FTP(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 211ff6030..dfcd05553 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -69,7 +69,7 @@ class FTPS(FTP): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 2419aae2f..aa58f7e0a 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -72,14 +72,14 @@ class HDFS(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` installation instruction for more details. .. note:: To access Hadoop cluster with Kerberos installed, you should have ``kinit`` executable in some path in ``PATH`` environment variable. - See onETL :ref:`kerberos-install` instruction for more details. + See :ref:`install-kerberos` instruction for more details. 
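As a concrete example of the Kerberos note above, obtaining a ticket before creating the connection could look like the sketch below; the keytab path and principal are placeholders, and calling ``kinit`` through ``subprocess`` is only one possible approach:

.. code:: python

    import subprocess

    # obtain a Kerberos ticket (TGT) for the current OS user;
    # keytab path and principal below are placeholders
    subprocess.run(
        ["kinit", "-kt", "/etc/security/keytabs/etl_user.keytab", "etl_user@EXAMPLE.COM"],
        check=True,
    )

Once the ticket is issued, Kerberos-aware connections can be created as usual.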
Parameters ---------- diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 7198a05aa..2f8d298f1 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -67,7 +67,7 @@ class S3(FileConnection): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py new file mode 100644 index 000000000..bef7ed276 --- /dev/null +++ b/onetl/connection/file_connection/samba.py @@ -0,0 +1,327 @@ +# Copyright 2023 MTS (Mobile Telesystems) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import os +import stat +import textwrap +from io import BytesIO +from logging import getLogger +from pathlib import Path +from typing import Optional, Union + +from etl_entities.instance import Host +from pydantic import SecretStr, validator +from typing_extensions import Literal + +from onetl.connection.file_connection.file_connection import FileConnection +from onetl.hooks import slot, support_hooks +from onetl.impl import LocalPath, RemotePath, RemotePathStat + +try: + from smb.smb_structs import OperationFailure + from smb.SMBConnection import SMBConnection +except (ImportError, NameError) as e: + raise ImportError( + textwrap.dedent( + """ + Cannot import module "pysmb". + + You should install package as follows: + pip install onetl[samba] + + or + pip install onetl[files] + """, + ).strip(), + ) from e + + +log = getLogger(__name__) + + +@support_hooks +class Samba(FileConnection): + """Samba file connection. |support_hooks| + + Based on `pysmb library `_. + + .. versionadded:: 0.9.4 + + .. warning:: + + To use Samba connector you should install package as follows: + + .. code:: bash + + pip install onetl[samba] + + # or + pip install onetl[files] + + See :ref:`install-files` installation instruction for more details. + + Parameters + ---------- + host : str + Host of Samba source. For example: ``mydomain.com``. + + share : str + The name of the share on the Samba server. + + protocol : str, default: ``SMB`` + The protocol to use for the connection. Either ``SMB`` or ``NetBIOS``. + Affects the default port and the `is_direct_tcp` flag in `SMBConnection`. + + port : int, default: 445 + Port of Samba source. + + domain : str, default: `` + Domain name for the Samba connection. Empty strings means use ``host`` as domain name. + + auth_type : str, default: ``NTLMv2`` + The authentication type to use. Either ``NTLMv2`` or ``NTLMv1``. + Affects the `use_ntlm_v2` flag in `SMBConnection`. + + user : str, default: None + User, which have access to the file source. Can be `None` for anonymous connection. + + password : str, default: None + Password for file source connection. Can be `None` for anonymous connection. + + Examples + -------- + + Samba file connection initialization + + .. 
code:: python + + from onetl.connection import Samba + + samba = Samba( + host="mydomain.com", + share="share_name", + protocol="SMB", + port=445, + user="user", + password="password", + ) + + + """ + + host: Host + share: str + protocol: Union[Literal["SMB"], Literal["NetBIOS"]] = "SMB" + port: Optional[int] = None + domain: Optional[str] = "" + auth_type: Union[Literal["NTLMv1"], Literal["NTLMv2"]] = "NTLMv2" + user: Optional[str] = None + password: Optional[SecretStr] = None + + @property + def instance_url(self) -> str: + return f"smb://{self.host}:{self.port}" + + @slot + def check(self): + log.info("|%s| Checking connection availability...", self.__class__.__name__) + self._log_parameters() + try: + available_shares = {share.name for share in self.client.listShares()} + if self.share in available_shares: + log.info("|%s| Connection is available.", self.__class__.__name__) + else: + log.error( + "|%s| Share %r not found among existing shares %r", + self.__class__.__name__, + self.share, + available_shares, + ) + raise ConnectionError("Failed to connect to the Samba server.") + except Exception as exc: + log.exception("|%s| Connection is unavailable", self.__class__.__name__) + raise RuntimeError("Connection is unavailable") from exc + + return self + + @slot + def path_exists(self, path: os.PathLike | str) -> bool: + try: + self.client.getAttributes(self.share, os.fspath(path)) + return True + except OperationFailure: + return False + + def _scan_entries(self, path: RemotePath) -> list: + if self._is_dir(path): + return [ + entry + for entry in self.client.listPath( + self.share, + os.fspath(path), + ) + if entry.filename not in {".", ".."} # Filter out '.' and '..' + ] + return [self.client.getAttributes(self.share, os.fspath(path))] + + def _extract_name_from_entry(self, entry) -> str: + return entry.filename + + def _is_dir_entry(self, top: RemotePath, entry) -> bool: + return entry.isDirectory + + def _is_file_entry(self, top: RemotePath, entry) -> bool: + return not entry.isDirectory + + def _extract_stat_from_entry(self, top: RemotePath, entry) -> RemotePathStat: + if entry.isDirectory: + return RemotePathStat(st_mode=stat.S_IFDIR) + + return RemotePathStat( + st_size=entry.file_size, + st_mtime=entry.last_write_time, + st_uid=entry.filename, + ) + + def _get_client(self) -> SMBConnection: + is_direct_tcp = self.protocol == "SMB" + use_ntlm_v2 = self.auth_type == "NTLMv2" + conn = SMBConnection( + username=self.user, + password=self.password.get_secret_value() if self.password else None, + my_name="onetl", + remote_name=self.host, + domain=self.domain, + use_ntlm_v2=use_ntlm_v2, + sign_options=2, + is_direct_tcp=is_direct_tcp, + ) + conn.connect(self.host, port=self.port) + return conn + + def _is_client_closed(self, client: SMBConnection) -> bool: + try: + socket_fileno = client.sock.fileno() + except (AttributeError, OSError): + return True + + return socket_fileno == -1 + + def _close_client(self, client: SMBConnection) -> None: + self.client.close() + + def _download_file(self, remote_file_path: RemotePath, local_file_path: LocalPath) -> None: + with open(local_file_path, "wb") as local_file: + self.client.retrieveFile( + self.share, + os.fspath(remote_file_path), + local_file, + ) + + def _get_stat(self, path: RemotePath) -> RemotePathStat: + info = self.client.getAttributes(self.share, os.fspath(path)) + + if self.is_dir(os.fspath(path)): + return RemotePathStat(st_mode=stat.S_IFDIR) + + return RemotePathStat( + st_size=info.file_size, + st_mtime=info.last_write_time, + 
st_uid=info.filename, + ) + + def _remove_file(self, remote_file_path: RemotePath) -> None: + self.client.deleteFiles( + self.share, + os.fspath(remote_file_path), + ) + + def _create_dir(self, path: RemotePath) -> None: + path_obj = Path(path) + for parent in reversed(path_obj.parents): + # create dirs sequentially as .createDirectory(...) cannot create nested dirs + try: + self.client.getAttributes(self.share, os.fspath(parent)) + except OperationFailure: + self.client.createDirectory(self.share, os.fspath(parent)) + + self.client.createDirectory(self.share, os.fspath(path)) + + def _upload_file(self, local_file_path: LocalPath, remote_file_path: RemotePath) -> None: + with open(local_file_path, "rb") as file_obj: + self.client.storeFile( + self.share, + os.fspath(remote_file_path), + file_obj, + ) + + def _rename_file(self, source: RemotePath, target: RemotePath) -> None: + self.client.rename( + self.share, + os.fspath(source), + os.fspath(target), + ) + + def _remove_dir(self, path: RemotePath) -> None: + files = self.client.listPath(self.share, os.fspath(path)) + + for item in files: + if item.filename not in {".", ".."}: # skip current and parent directory entries + full_path = path / item.filename + if item.isDirectory: + # recursively delete subdirectory + self._remove_dir(full_path) + else: + self.client.deleteFiles(self.share, os.fspath(full_path)) + + self.client.deleteDirectory(self.share, os.fspath(path)) + + def _read_text(self, path: RemotePath, encoding: str) -> str: + return self._read_bytes(path).decode(encoding) + + def _read_bytes(self, path: RemotePath) -> bytes: + file_obj = BytesIO() + self.client.retrieveFile( + self.share, + os.fspath(path), + file_obj, + ) + file_obj.seek(0) + return file_obj.read() + + def _write_text(self, path: RemotePath, content: str, encoding: str) -> None: + self._write_bytes(path, bytes(content, encoding)) + + def _write_bytes(self, path: RemotePath, content: bytes) -> None: + file_obj = BytesIO(content) + + self.client.storeFile( + self.share, + os.fspath(path), + file_obj, + ) + + def _is_dir(self, path: RemotePath) -> bool: + return self.client.getAttributes(self.share, os.fspath(path)).isDirectory + + def _is_file(self, path: RemotePath) -> bool: + return not self.client.getAttributes(self.share, os.fspath(path)).isDirectory + + @validator("port", pre=True, always=True) + def _set_port_based_on_protocol(cls, port, values): + if port is None: + return 445 if values.get("protocol") == "SMB" else 139 + return port diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 3b84df658..bef53ce2d 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -71,7 +71,7 @@ class SFTP(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index 52aab0419..9825a0525 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -70,7 +70,7 @@ class WebDAV(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` installation instruction for more details. 
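A short usage sketch for the ``Samba`` connection defined above, using only methods declared in this class (``check`` and ``path_exists``); the host, share and credentials are placeholders:

.. code:: python

    from onetl.connection import Samba

    samba = Samba(
        host="mydomain.com",
        share="SmbShare",
        protocol="SMB",  # with SMB the port defaults to 445, with NetBIOS to 139
        user="onetl",
        password="somepassword",
    )

    # check() verifies that the configured share is visible on the server
    samba.check()

    # path_exists() calls getAttributes() and returns False if the path is missing
    if samba.path_exists("/data/source/file.csv"):
        print("File is already on the share")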
Parameters ---------- diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 04bdfae48..6855fe595 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -17,6 +17,7 @@ import getpass import logging import os +from contextlib import suppress from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -58,12 +59,12 @@ class SparkHDFS(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. note:: Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit`` - **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`kerberos-install`. + **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS. @@ -224,10 +225,16 @@ def close(self): """ log.debug("Reset FileSystem cache") - self._get_spark_fs().close() - object.__setattr__(self, "_active_host", None) # noqa: WPS609 + with suppress(Exception): + self._get_spark_fs().close() + + with suppress(Exception): + self._active_host = None return self + # Do not all __del__ with calling .close(), like other connections, + # because this can influence dataframes created by this connection + @slot @classmethod def get_current(cls, spark: SparkSession): @@ -360,7 +367,7 @@ def _convert_to_url(self, path: PurePathProtocol) -> str: else: host = self._get_host() # cache value to avoid getting active namenode for every path - object.__setattr__(self, "_active_host", host) # noqa: WPS609 + self._active_host = host return f"hdfs://{host}:{self.ipc_port}" + path.as_posix() def _get_default_path(self): diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index b914c714f..264fac3a2 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -49,7 +49,7 @@ class SparkLocalFS(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. warning:: diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 0fd72a0ca..992e11627 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -16,6 +16,7 @@ import logging import os +from contextlib import suppress from typing import TYPE_CHECKING, ClassVar, List, Optional from etl_entities.instance import Host @@ -63,6 +64,7 @@ class SparkS3(SparkFileDFConnection): * Spark versions: 3.2.x - 3.4.x (only with Hadoop 3.x libraries) * Java versions: 8 - 20 + * Scala versions: 2.11 - 2.13 .. warning:: @@ -82,7 +84,7 @@ class SparkS3(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. 
note:: @@ -160,9 +162,17 @@ class SparkS3(SparkFileDFConnection): # Create Spark session with Hadoop AWS libraries loaded maven_packages = SparkS3.get_packages(spark_version="3.4.1") + # Some dependencies are not used, but downloading takes a lot of time. Skipping them. + excluded_packages = [ + "com.google.cloud.bigdataoss:gcs-connector", + "org.apache.hadoop:hadoop-aliyun", + "org.apache.hadoop:hadoop-azure-datalake", + "org.apache.hadoop:hadoop-azure", + ] spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.jars.excludes", ",".join(excluded_packages)) .config("spark.hadoop.fs.s3a.committer.magic.enabled", "true") .config("spark.hadoop.fs.s3a.committer.name", "magic") .config( @@ -263,6 +273,7 @@ def get_packages( raise ValueError(f"Spark version must be at least 3.x, got {spark_ver}") scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + # https://mvnrepository.com/artifact/org.apache.spark/spark-hadoop-cloud return [f"org.apache.spark:spark-hadoop-cloud_{scala_ver.digits(2)}:{spark_ver.digits(3)}"] @slot @@ -311,9 +322,13 @@ def close(self): connection.close() """ - self._reset_hadoop_conf() + with suppress(Exception): + self._reset_hadoop_conf() return self + # Do not all __del__ with calling .close(), like other connections, + # because this can influence dataframes created by this connection + @slot def check(self): self._patch_hadoop_conf() diff --git a/onetl/file/format/__init__.py b/onetl/file/format/__init__.py index d41c76aac..0c9d6b742 100644 --- a/onetl/file/format/__init__.py +++ b/onetl/file/format/__init__.py @@ -15,6 +15,7 @@ from onetl.file.format.avro import Avro from onetl.file.format.csv import CSV +from onetl.file.format.excel import Excel from onetl.file.format.json import JSON from onetl.file.format.jsonline import JSONLine from onetl.file.format.orc import ORC diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 2fc5a1cb5..b0c58e18d 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -73,6 +73,7 @@ class Avro(ReadWriteFileFormat): * Spark versions: 2.4.x - 3.4.x * Java versions: 8 - 20 + * Scala versions: 2.11 - 2.13 See documentation from link above. @@ -131,6 +132,9 @@ def get_packages( """ Get package names to be downloaded by Spark. |support_hooks| + See `Maven package index `_ + for all available packages. + Parameters ---------- spark_version : str diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py new file mode 100644 index 000000000..ffd11a5da --- /dev/null +++ b/onetl/file/format/excel.py @@ -0,0 +1,220 @@ +# Copyright 2023 MTS (Mobile Telesystems) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, ClassVar + +from onetl._util.java import try_import_java_class +from onetl._util.scala import get_default_scala_version +from onetl._util.spark import get_spark_version +from onetl._util.version import Version +from onetl.exception import MISSING_JVM_CLASS_MSG +from onetl.file.format.file_format import ReadWriteFileFormat +from onetl.hooks import slot, support_hooks + +if TYPE_CHECKING: + from pyspark.sql import SparkSession + +READ_OPTIONS = frozenset( + ( + "dataAddress", + "treatEmptyValuesAsNulls", + "setErrorCellsToFallbackValues", + "usePlainNumberFormat", + "inferSchema", + "addColorColumns", + "timestampFormat", + "maxRowsInMemory", + "maxByteArraySize", + "tempFileThreshold", + "excerptSize", + "workbookPassword", + ), +) + +WRITE_OPTIONS = frozenset( + ( + "dataAddress", + "dateFormat", + "timestampFormat", + ), +) + +log = logging.getLogger(__name__) + + +@support_hooks +class Excel(ReadWriteFileFormat): + """ + Excel file format. |support_hooks| + + Based on `Spark Excel `_ file format. + + Supports reading/writing files with ``.xlsx`` (read/write) and ``.xls`` (read only) extensions. + + .. versionadded:: 0.9.4 + + .. dropdown:: Version compatibility + + * Spark versions: 3.2.x - 3.4.x. + + .. warning:: + + Not all combinations of Spark version and package version are supported. + See `Maven index `_ + and `official documentation `_. + + * Scala versions: 2.12 - 2.13 + * Java versions: 8 - 20 + + See documentation from link above. + + .. note :: + + You can pass any option to the constructor, even if it is not mentioned in this documentation. + **Option names should be in** ``camelCase``! + + The set of supported options depends on Spark version. See link above. + + Examples + -------- + + Describe options how to read from/write to Excel file with specific options: + + .. code:: python + + from onetl.file.format import Excel + from pyspark.sql import SparkSession + + # Create Spark session with Excel package loaded + maven_packages = Excel.get_packages(spark_version="3.4.1") + spark = ( + SparkSession.builder.appName("spark-app-name") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + + excel = Excel( + header=True, + inferSchema=True, + ) + + """ + + name: ClassVar[str] = "excel" + + header: bool = False + + class Config: + known_options = READ_OPTIONS | WRITE_OPTIONS + extra = "allow" + + @slot + @classmethod + def get_packages( + cls, + spark_version: str, + scala_version: str | None = None, + package_version: str | None = None, + ) -> list[str]: + """ + Get package names to be downloaded by Spark. |support_hooks| + + .. warning:: + + Not all combinations of Spark version and package version are supported. + See `Maven index `_ + and `official documentation `_. + + Parameters + ---------- + spark_version : str + Spark version in format ``major.minor.patch``. + + scala_version : str, optional + Scala version in format ``major.minor``. + + If ``None``, ``spark_version`` is used to determine Scala version. + + version: str, optional + Package version in format ``major.minor.patch``. Default is ``0.19.0``. + + .. warning:: + + Version ``0.14`` and below are not supported. + + .. note:: + + It is not guaranteed that custom package versions are supported. + Tests are performed only for default version. + + Examples + -------- + + .. 
code:: python + + from onetl.file.format import Excel + + Excel.get_packages(spark_version="3.4.1") + Excel.get_packages(spark_version="3.4.1", scala_version="2.13") + Excel.get_packages( + spark_version="3.4.1", + scala_version="2.13", + package_version="0.19.0", + ) + + """ + + if package_version: + version = Version.parse(package_version) + if version < (0, 15): + # format="com.crealytics.spark.excel" does not support reading folder with files + # format="excel" was added only in 0.14, but Maven package for 0.14 has different naming convention than recent versions. + # So using 0.15 as the lowest supported version. + raise ValueError(f"Package version should be at least 0.15, got {package_version}") + log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) + else: + version = Version.parse("0.19.0") + + spark_ver = Version.parse(spark_version) + if spark_ver < (3, 2): + # Actually, Spark 2.4 is supported, but packages are built only for Scala 2.12 + # when default pyspark==2.4.1 is built with Scala 2.11. + # See https://github.com/crealytics/spark-excel/issues/426 + raise ValueError(f"Spark version should be at least 3.2, got {spark_version}") + + scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + if scala_ver.digits(2) < (2, 12): + raise ValueError(f"Scala version should be at least 2.12, got {scala_ver}") + + return [f"com.crealytics:spark-excel_{scala_ver.digits(2)}:{spark_ver.digits(3)}_{version.digits(3)}"] + + @slot + def check_if_supported(self, spark: SparkSession) -> None: + java_class = "com.crealytics.spark.excel.v2.ExcelDataSource" + + try: + try_import_java_class(spark, java_class) + except Exception as e: + spark_version = get_spark_version(spark) + msg = MISSING_JVM_CLASS_MSG.format( + java_class=java_class, + package_source=self.__class__.__name__, + args=f"spark_version='{spark_version}'", + ) + if log.isEnabledFor(logging.DEBUG): + log.debug("Missing Java class", exc_info=e, stack_info=True) + raise ValueError(msg) from e diff --git a/pytest.ini b/pytest.ini index 5e40e75d7..3c71e8eb6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -24,5 +24,6 @@ markers = postgres: Postgres tests s3: S3 tests sftp: SFTP tests + samba: Samba tests teradata: Teradata tests webdav: WebDAV tests diff --git a/requirements/docs.txt b/requirements/docs.txt index d3fc9555e..4ff1db3e9 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -3,10 +3,10 @@ furo importlib-resources<6 numpydoc pygments-csv-lexer -# https://github.com/sphinx-doc/sphinx/issues/11662 -sphinx<7.2.5 +sphinx sphinx-copybutton sphinx-design +sphinx-plantuml sphinx-tabs sphinx-toolbox sphinx_substitution_extensions diff --git a/requirements/samba.txt b/requirements/samba.txt new file mode 100644 index 000000000..619ee4f51 --- /dev/null +++ b/requirements/samba.txt @@ -0,0 +1 @@ +pysmb diff --git a/requirements/tests/samba.txt b/requirements/tests/samba.txt new file mode 100644 index 000000000..619ee4f51 --- /dev/null +++ b/requirements/tests/samba.txt @@ -0,0 +1 @@ +pysmb diff --git a/requirements/tests/spark-3.2.3.txt b/requirements/tests/spark-3.2.4.txt similarity index 80% rename from requirements/tests/spark-3.2.3.txt rename to requirements/tests/spark-3.2.4.txt index 44291430a..1acafab9a 100644 --- a/requirements/tests/spark-3.2.3.txt +++ b/requirements/tests/spark-3.2.4.txt @@ -1,5 +1,5 @@ numpy>=1.16,<1.24 pandas>=1.0,<2 pyarrow>=1.0 -pyspark==3.2.3 +pyspark==3.2.4 sqlalchemy<2.0 diff --git a/setup.py b/setup.py 
index 422085535..f8b560707 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ def parse_requirements(file: Path) -> list[str]: requirements_ftp = parse_requirements(here / "requirements" / "ftp.txt") requirements_sftp = parse_requirements(here / "requirements" / "sftp.txt") +requirements_samba = parse_requirements(here / "requirements" / "samba.txt") requirements_hdfs = parse_requirements(here / "requirements" / "hdfs.txt") requirements_s3 = parse_requirements(here / "requirements" / "s3.txt") requirements_webdav = parse_requirements(here / "requirements" / "webdav.txt") @@ -90,6 +91,7 @@ def parse_requirements(file: Path) -> list[str]: "ftp": requirements_ftp, "ftps": requirements_ftp, "sftp": requirements_sftp, + "samba": requirements_samba, "hdfs": requirements_hdfs, "s3": requirements_s3, "webdav": requirements_webdav, diff --git a/tests/fixtures/connections/file_connections.py b/tests/fixtures/connections/file_connections.py index e8ef7253e..f44240894 100644 --- a/tests/fixtures/connections/file_connections.py +++ b/tests/fixtures/connections/file_connections.py @@ -12,6 +12,7 @@ lazy_fixture("hdfs_file_connection"), lazy_fixture("s3_file_connection"), lazy_fixture("sftp_file_connection"), + lazy_fixture("samba_file_connection"), lazy_fixture("webdav_file_connection"), ], ) @@ -26,6 +27,7 @@ def file_connection(request): lazy_fixture("hdfs_file_connection_with_path"), lazy_fixture("s3_file_connection_with_path"), lazy_fixture("sftp_file_connection_with_path"), + lazy_fixture("samba_file_connection_with_path"), lazy_fixture("webdav_file_connection_with_path"), ], ) @@ -40,6 +42,7 @@ def file_connection_with_path(request): lazy_fixture("hdfs_file_connection_with_path_and_files"), lazy_fixture("s3_file_connection_with_path_and_files"), lazy_fixture("sftp_file_connection_with_path_and_files"), + lazy_fixture("samba_file_connection_with_path_and_files"), lazy_fixture("webdav_file_connection_with_path_and_files"), ], ) diff --git a/tests/fixtures/connections/samba.py b/tests/fixtures/connections/samba.py new file mode 100644 index 000000000..52a294d5b --- /dev/null +++ b/tests/fixtures/connections/samba.py @@ -0,0 +1,63 @@ +import os +from collections import namedtuple +from pathlib import PurePosixPath + +import pytest + +from tests.util.upload_files import upload_files + + +@pytest.fixture( + scope="session", + params=[ + pytest.param("real-samba", marks=[pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection]), + ], +) +def samba_server(): + SambaServer = namedtuple("SambaServer", ["host", "protocol", "port", "share", "user", "password"]) + + return SambaServer( + host=os.getenv("ONETL_SAMBA_HOST"), + protocol=os.getenv("ONETL_SAMBA_PROTOCOL"), + port=os.getenv("ONETL_SAMBA_PORT"), + share=os.getenv("ONETL_SAMBA_SHARE"), + user=os.getenv("ONETL_SAMBA_USER"), + password=os.getenv("ONETL_SAMBA_PASSWORD"), + ) + + +@pytest.fixture() +def samba_file_connection(samba_server): + from onetl.connection import Samba + + return Samba( + host=samba_server.host, + protocol=samba_server.protocol, + port=samba_server.port, + share=samba_server.share, + user=samba_server.user, + password=samba_server.password, + ) + + +@pytest.fixture() +def samba_file_connection_with_path(request, samba_file_connection): + connection = samba_file_connection + root = PurePosixPath("/data") + + def finalizer(): + connection.remove_dir(root, recursive=True) + + request.addfinalizer(finalizer) + + connection.remove_dir(root, recursive=True) + + return connection, root + + +@pytest.fixture() +def 
samba_file_connection_with_path_and_files(resource_path, samba_file_connection_with_path): + connection, upload_to = samba_file_connection_with_path + upload_from = resource_path / "file_connection" + files = upload_files(upload_from, upload_to, connection) + return connection, upload_to, files diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index 2135f3b52..05358b9c0 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -44,7 +44,7 @@ def maven_packages(): SparkS3, Teradata, ) - from onetl.file.format import Avro + from onetl.file.format import Avro, Excel pyspark_version = get_pyspark_version() packages = ( @@ -74,9 +74,23 @@ def maven_packages(): # There is no MongoDB connector for Spark less than 3.2 packages.extend(MongoDB.get_packages(spark_version=pyspark_version)) + # There is no Excel files support for Spark less than 3.2 + packages.extend(Excel.get_packages(spark_version=pyspark_version)) + return packages +@pytest.fixture(scope="session") +def excluded_packages(): + # These packages are a part of org.apache.spark:spark-hadoop-cloud, but not used in tests + return [ + "com.google.cloud.bigdataoss:gcs-connector", + "org.apache.hadoop:hadoop-aliyun", + "org.apache.hadoop:hadoop-azure-datalake", + "org.apache.hadoop:hadoop-azure", + ] + + @pytest.fixture( scope="session", name="spark", @@ -84,13 +98,14 @@ def maven_packages(): pytest.param("real-spark", marks=[pytest.mark.db_connection, pytest.mark.connection]), ], ) -def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, maven_packages): +def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, maven_packages, excluded_packages): from pyspark.sql import SparkSession spark = ( SparkSession.builder.config("spark.app.name", "onetl") # noqa: WPS221 .config("spark.master", "local[*]") .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.jars.excludes", ",".join(excluded_packages)) .config("spark.jars.ivySettings", os.fspath(ivysettings_path)) .config("spark.driver.memory", "1g") .config("spark.driver.maxResultSize", "1g") diff --git a/tests/resources/file_df_connection/generate_files.py b/tests/resources/file_df_connection/generate_files.py index 874045f8c..698c81ea7 100755 --- a/tests/resources/file_df_connection/generate_files.py +++ b/tests/resources/file_df_connection/generate_files.py @@ -14,10 +14,13 @@ from contextlib import contextmanager from datetime import date, datetime, timezone from pathlib import Path +from tempfile import gettempdir from typing import TYPE_CHECKING, Any, Iterator, TextIO +from zipfile import ZipFile if TYPE_CHECKING: from avro.schema import Schema as AvroSchema + from pandas import DataFrame as PandasDataFrame from pyarrow import Schema as ArrowSchema from pyarrow import Table as ArrowTable @@ -85,6 +88,12 @@ def get_data() -> list[dict]: ] +def get_pandas_dataframe(data: list[dict]) -> PandasDataFrame: + import pandas as pd + + return pd.DataFrame(data) + + def get_pyarrow_schema() -> ArrowSchema: import pyarrow as pa @@ -382,6 +391,87 @@ def save_as_avro(data: list[dict], path: Path) -> None: save_as_avro_snappy(data, root / "with_compression") +def save_as_xls_with_options( + data: list[dict], + path: Path, + index: bool = False, + **kwargs, +) -> None: + # required to register xlwt writer which supports generating .xls files + import pandas_xlwt + + path.mkdir(parents=True, exist_ok=True) + file = path / "file.xls" + + df = get_pandas_dataframe(data) + df["datetime_value"] = 
df.datetime_value.dt.tz_localize(None) + df.to_excel(file, index=index, engine="xlwt", **kwargs) + + +def make_zip_deterministic(path: Path) -> None: + temp_dir = gettempdir() + file_copy = Path(shutil.copy(path, temp_dir)) + + with ZipFile(file_copy, "r") as original_file: + with ZipFile(path, "w") as new_file: + for item in original_file.infolist(): + if item.filename == "docProps/core.xml": + # this file contains modification time, which produces files with different hashes + continue + # reset modification time of all files + item.date_time = (1980, 1, 1, 0, 0, 0) + new_file.writestr(item, original_file.read(item.filename)) + + +def save_as_xlsx_with_options( + data: list[dict], + path: Path, + index: bool = False, + **kwargs, +) -> None: + path.mkdir(parents=True, exist_ok=True) + file = path / "file.xls" + + df = get_pandas_dataframe(data) + df["datetime_value"] = df.datetime_value.dt.tz_localize(None) + df.to_excel(file, index=index, engine="openpyxl", **kwargs) + make_zip_deterministic(file) + + +def save_as_xlsx(data: list[dict], path: Path) -> None: + root = path / "xlsx" + shutil.rmtree(root, ignore_errors=True) + root.mkdir(parents=True, exist_ok=True) + + save_as_xlsx_with_options(data, root / "without_header", header=False) + save_as_xlsx_with_options(data, root / "with_header", header=True) + save_as_xlsx_with_options( + data, + root / "with_data_address", + header=False, + sheet_name="ABC", + startcol=10, + startrow=5, + ) + + +def save_as_xls(data: list[dict], path: Path) -> None: + root = path / "xls" + shutil.rmtree(root, ignore_errors=True) + root.mkdir(parents=True, exist_ok=True) + + save_as_xls_with_options(data, root / "without_header", header=False) + save_as_xls_with_options(data, root / "with_header", header=True) + save_as_xls_with_options( + data, + root / "with_data_address", + header=False, + sheet_name="ABC", + startcol=10, + startrow=5, + ) + + format_mapping = { "csv": save_as_csv, "json": save_as_json, @@ -389,6 +479,8 @@ def save_as_avro(data: list[dict], path: Path) -> None: "orc": save_as_orc, "parquet": save_as_parquet, "avro": save_as_avro, + "xlsx": save_as_xlsx, + "xls": save_as_xls, } diff --git a/tests/resources/file_df_connection/xls/with_data_address/file.xls b/tests/resources/file_df_connection/xls/with_data_address/file.xls new file mode 100644 index 000000000..28288eb8e Binary files /dev/null and b/tests/resources/file_df_connection/xls/with_data_address/file.xls differ diff --git a/tests/resources/file_df_connection/xls/with_header/file.xls b/tests/resources/file_df_connection/xls/with_header/file.xls new file mode 100644 index 000000000..efb43b4a9 Binary files /dev/null and b/tests/resources/file_df_connection/xls/with_header/file.xls differ diff --git a/tests/resources/file_df_connection/xls/without_header/file.xls b/tests/resources/file_df_connection/xls/without_header/file.xls new file mode 100644 index 000000000..420aa1107 Binary files /dev/null and b/tests/resources/file_df_connection/xls/without_header/file.xls differ diff --git a/tests/resources/file_df_connection/xlsx/with_data_address/file.xls b/tests/resources/file_df_connection/xlsx/with_data_address/file.xls new file mode 100644 index 000000000..bf2343c0a Binary files /dev/null and b/tests/resources/file_df_connection/xlsx/with_data_address/file.xls differ diff --git a/tests/resources/file_df_connection/xlsx/with_header/file.xls b/tests/resources/file_df_connection/xlsx/with_header/file.xls new file mode 100644 index 000000000..b19c54d02 Binary files /dev/null and 
b/tests/resources/file_df_connection/xlsx/with_header/file.xls differ diff --git a/tests/resources/file_df_connection/xlsx/without_header/file.xls b/tests/resources/file_df_connection/xlsx/without_header/file.xls new file mode 100644 index 000000000..78632de24 Binary files /dev/null and b/tests/resources/file_df_connection/xlsx/without_header/file.xls differ diff --git a/tests/resources/requirements.txt b/tests/resources/requirements.txt index 56d154dd9..033953205 100644 --- a/tests/resources/requirements.txt +++ b/tests/resources/requirements.txt @@ -1,2 +1,5 @@ avro[snappy] +openpyxl +pandas pyarrow +pandas-xlwt diff --git a/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py b/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py index 3b47df0d0..778d3a20c 100644 --- a/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py +++ b/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py @@ -26,13 +26,16 @@ def test_spark_hdfs_check(hdfs_file_df_connection, caplog): def test_spark_hdfs_file_connection_check_failed(spark): from onetl.connection import SparkHDFS - with pytest.raises(RuntimeError, match="Connection is unavailable"): - SparkHDFS( - cluster="rnd-dwh", - host="hive1", - port=1234, - spark=spark, - ).check() + wrong_hdfs = SparkHDFS( + cluster="rnd-dwh", + host="hive1", + port=1234, + spark=spark, + ) + + with wrong_hdfs: + with pytest.raises(RuntimeError, match="Connection is unavailable"): + wrong_hdfs.check() def test_spark_hdfs_file_connection_check_with_hooks(spark, request, hdfs_server): diff --git a/tests/tests_integration/test_file_format_integration/test_avro_integration.py b/tests/tests_integration/test_file_format_integration/test_avro_integration.py index a73fa06c5..cb687776c 100644 --- a/tests/tests_integration/test_file_format_integration/test_avro_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_avro_integration.py @@ -56,7 +56,7 @@ def test_avro_reader( """Reading Avro files working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) if spark_version < (2, 4): - pytest.skip("Avro only supported on Spark 2.4+") + pytest.skip("Avro files are supported on Spark 3.2+ only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files df = file_df_dataframe @@ -76,10 +76,10 @@ def test_avro_reader( @pytest.mark.parametrize( - "path, options", + "options", [ - ("without_compression", {}), - ("with_compression", {"compression": "snappy"}), + {}, + {"compression": "snappy"}, ], ids=["without_compression", "with_compression"], ) @@ -88,13 +88,12 @@ def test_avro_writer( local_fs_file_df_connection_with_path, file_df_dataframe, avro_schema, - path, options, ): """Written files can be read by Spark""" spark_version = get_spark_version(spark) if spark_version < (2, 4): - pytest.skip("Avro only supported on Spark 2.4+") + pytest.skip("Avro files are supported on Spark 3.2+ only") file_df_connection, source_path = local_fs_file_df_connection_with_path df = file_df_dataframe diff --git a/tests/tests_integration/test_file_format_integration/test_csv_integration.py b/tests/tests_integration/test_file_format_integration/test_csv_integration.py index a6cd14591..289e88273 100644 --- a/tests/tests_integration/test_file_format_integration/test_csv_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_csv_integration.py @@ -27,6 
+27,7 @@ def test_csv_reader_with_infer_schema( local_fs_file_df_connection_with_path_and_files, file_df_dataframe, ): + """Reading CSV files with inferSchema=True working as expected on any Spark, Python and Java versions""" file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files df = file_df_dataframe csv_root = source_path / "csv/without_header" @@ -42,9 +43,13 @@ def test_csv_reader_with_infer_schema( expected_df = df - if get_spark_version(spark).major < 3: + spark_version = get_spark_version(spark) + if spark_version.major < 3: # Spark 2 infers "date_value" as timestamp instead of date expected_df = df.withColumn("date_value", col("date_value").cast("timestamp")) + elif spark_version < (3, 3): + # Spark 3.2 cannot infer "date_value", and returns it as a string + expected_df = df.withColumn("date_value", col("date_value").cast("string")) # csv does not have header, so columns are named like "_c0", "_c1", etc expected_df = reset_column_names(expected_df) diff --git a/tests/tests_integration/test_file_format_integration/test_excel_integration.py b/tests/tests_integration/test_file_format_integration/test_excel_integration.py new file mode 100644 index 000000000..de8cc9cf9 --- /dev/null +++ b/tests/tests_integration/test_file_format_integration/test_excel_integration.py @@ -0,0 +1,142 @@ +"""Integration tests for Excel file format. + +Test only that options are passed to Spark in both FileDFReader & FileDFWriter. +Do not test all the possible options and combinations, we are not testing Spark here. +""" + +import pytest + +from onetl._util.spark import get_spark_version +from onetl.file import FileDFReader, FileDFWriter +from onetl.file.format import Excel + +try: + from pyspark.sql.functions import col + + from tests.util.assert_df import assert_equal_df + from tests.util.spark_df import reset_column_names +except ImportError: + # pandas and spark can be missing if someone runs tests for file connections only + pass + +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] + + +@pytest.mark.parametrize("format", ["xlsx", "xls"]) +def test_excel_reader_with_infer_schema( + spark, + local_fs_file_df_connection_with_path_and_files, + file_df_dataframe, + format, +): + """Reading Excel files with inferSchema=True working as expected on any Spark, Python and Java versions""" + spark_version = get_spark_version(spark) + if spark_version < (3, 2): + pytest.skip("Excel files are supported on Spark 3.2+ only") + + file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files + df = file_df_dataframe + excel_root = source_path / format / "without_header" + + reader = FileDFReader( + connection=file_df_connection, + format=Excel(inferSchema=True), + source_path=excel_root, + ) + read_df = reader.run() + + assert read_df.count() + + expected_df = df + # Spark infers "date_value" as timestamp instead of date + expected_df = df.withColumn("date_value", col("date_value").cast("timestamp")) + + # excel does not have header, so columns are named like "_c0", "_c1", etc + expected_df = reset_column_names(expected_df) + + assert read_df.schema != df.schema + assert read_df.schema == expected_df.schema + assert_equal_df(read_df, expected_df) + + +@pytest.mark.parametrize("format", ["xlsx", "xls"]) +@pytest.mark.parametrize( + "path, options", + [ + ("without_header", {}), + ("with_header", {"header": True}), + ("with_data_address", {"dataAddress": "'ABC'!K6"}), + ], + ids=["without_header", "with_header",
"with_data_address"], +) +def test_excel_reader_with_options( + spark, + local_fs_file_df_connection_with_path_and_files, + file_df_dataframe, + format, + path, + options, +): + """Reading Excel files working as expected on any Spark, Python and Java versions""" + spark_version = get_spark_version(spark) + if spark_version < (3, 2): + pytest.skip("Excel files are supported on Spark 3.2+ only") + + local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files + df = file_df_dataframe + excel_root = source_path / format / path + + reader = FileDFReader( + connection=local_fs, + format=Excel.parse(options), + df_schema=df.schema, + source_path=excel_root, + ) + read_df = reader.run() + + assert read_df.count() + assert read_df.schema == df.schema + assert_equal_df(read_df, df) + + +@pytest.mark.parametrize( + "options", + [ + {}, + {"header": True}, + ], + ids=["without_header", "with_header"], +) +def test_excel_writer( + spark, + local_fs_file_df_connection_with_path, + file_df_dataframe, + options, +): + """Written files can be read by Spark""" + spark_version = get_spark_version(spark) + if spark_version < (3, 2): + pytest.skip("Excel files are supported on Spark 3.2+ only") + + file_df_connection, source_path = local_fs_file_df_connection_with_path + df = file_df_dataframe + excel_root = source_path / "excel" + + writer = FileDFWriter( + connection=file_df_connection, + format=Excel.parse(options), + target_path=excel_root, + ) + writer.run(df) + + reader = FileDFReader( + connection=file_df_connection, + format=Excel.parse(options), + source_path=excel_root, + df_schema=df.schema, + ) + read_df = reader.run() + + assert read_df.count() + assert read_df.schema == df.schema + assert_equal_df(read_df, df) diff --git a/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py b/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py index ed290ab43..0a932dd46 100644 --- a/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py +++ b/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py @@ -635,10 +635,11 @@ def test_file_downloader_mode_replace_entire_directory( caplog, ): file_connection, remote_path, _ = file_connection_with_path_and_files + # Reason for using .resolve(): https://stackoverflow.com/a/58719476 if local_dir_exist: - local_path = tmp_path_factory.mktemp("local_path") + local_path = tmp_path_factory.mktemp("local_path").resolve() else: - local_path = Path(tempfile.gettempdir()) / secrets.token_hex() + local_path = Path(tempfile.gettempdir()).resolve() / secrets.token_hex() temp_file = local_path / secrets.token_hex(5) if local_dir_exist: @@ -755,7 +756,11 @@ def finalizer(): local_path=file.name, ) - with pytest.raises(NotADirectoryError, match=rf"'{file.name}' \(kind='file', .*\) is not a directory"): + # Reason for .realpath(): https://stackoverflow.com/a/58719476 + with pytest.raises( + NotADirectoryError, + match=rf"'{os.path.realpath(file.name)}' \(kind='file', .*\) is not a directory", + ): downloader.run() diff --git a/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py b/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py index 522cf2dd4..feedeaa45 100644 --- a/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py +++ b/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py @@ -490,7 +490,11 @@ def 
test_file_uploader_run_local_path_not_a_directory(file_connection): with tempfile.NamedTemporaryFile() as file: uploader = FileUploader(connection=file_connection, target_path=target_path, local_path=file.name) - with pytest.raises(NotADirectoryError, match=rf"'{file.name}' \(kind='file', .*\) is not a directory"): + # Reason for .realpath(): https://stackoverflow.com/a/58719476 + with pytest.raises( + NotADirectoryError, + match=rf"'{os.path.realpath(file.name)}' \(kind='file', .*\) is not a directory", + ): uploader.run() diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py index c97105a44..338de0c67 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py @@ -137,6 +137,14 @@ def test_greenplum_writer_if_exists_error(spark, processing, prepare_schema_tabl ): writer.run(df) + empty_df = spark.createDataFrame([], df.schema) + + processing.assert_equal_df( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + df=empty_df, + ) + def test_greenplum_writer_if_exists_ignore(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py index 44553539b..8ca74b06d 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py @@ -1,4 +1,5 @@ import logging +import re import textwrap import pytest @@ -225,6 +226,8 @@ def test_hive_writer_default_not_partitioned(spark, processing, get_schema_table "options", [ Hive.WriteOptions(if_exists="append"), + Hive.WriteOptions(if_exists="ignore"), + Hive.WriteOptions(if_exists="error"), Hive.WriteOptions(if_exists="replace_entire_table"), Hive.WriteOptions(if_exists="replace_overlapping_partitions"), ], @@ -363,6 +366,105 @@ def test_hive_writer_insert_into_append(spark, processing, get_schema_table, ori ) +@pytest.mark.parametrize( + "original_options, new_options", + [ + pytest.param({}, {"partitionBy": "id_int"}, id="table_not_partitioned_dataframe_is"), + pytest.param({"partitionBy": "text_string"}, {}, id="table_partitioned_dataframe_is_not"), + pytest.param({"partitionBy": "text_string"}, {"partitionBy": "id_int"}, id="different_partitioning_schema"), + pytest.param({"partitionBy": "id_int"}, {"partitionBy": "id_int"}, id="same_partitioning_schema"), + ], +) +def test_hive_writer_insert_into_ignore(spark, processing, get_schema_table, original_options, new_options, caplog): + df = processing.create_spark_df(spark=spark) + + df1 = df[df.id_int <= 25] + df2 = df.where("id_int > 25 AND id_int <= 50") + df3 = df[df.id_int > 50] + + hive = Hive(cluster="rnd-dwh", spark=spark) + writer1 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=original_options, + ) + # create & fill up the table with some data + writer1.run(df1.union(df2)) + old_ddl = hive.sql(f"SHOW CREATE TABLE 
{get_schema_table.full_name}").collect()[0][0] + + writer2 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=Hive.WriteOptions(if_exists="ignore", **new_options), + ) + + with caplog.at_level(logging.INFO): + writer2.run(df1.union(df3)) + + assert "|Hive| Skip writing to existing table because of Hive.WriteOptions(if_exists='ignore')" in caplog.text + + new_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + + # table DDL remains the same + assert new_ddl == old_ddl + + # table should only contain old data, because 'ignore' should not have added new data + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df1.union(df2), + order_by="id_int", + ) + + +@pytest.mark.parametrize( + "original_options, new_options", + [ + pytest.param({}, {"partitionBy": "id_int"}, id="table_not_partitioned_dataframe_is"), + pytest.param({"partitionBy": "text_string"}, {}, id="table_partitioned_dataframe_is_not"), + pytest.param({"partitionBy": "text_string"}, {"partitionBy": "id_int"}, id="different_partitioning_schema"), + pytest.param({"partitionBy": "id_int"}, {"partitionBy": "id_int"}, id="same_partitioning_schema"), + ], +) +def test_hive_writer_insert_into_error(spark, processing, get_schema_table, original_options, new_options, caplog): + df = processing.create_spark_df(spark=spark) + + hive = Hive(cluster="rnd-dwh", spark=spark) + writer1 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=original_options, + ) + + # Create & fill up the table with some data + writer1.run(df) + old_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + + writer2 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=Hive.WriteOptions(if_exists="error", **new_options), + ) + + with pytest.raises( + ValueError, + match=re.escape("Operation stopped due to Hive.WriteOptions(if_exists='error')"), + ): + writer2.run(df) + + # table DDL remains the same + new_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + assert new_ddl == old_ddl + + # validate that the table contains only old data + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df, + order_by="id_int", + ) + + @pytest.mark.parametrize( "original_options, new_options", [ diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py index 458a6902f..d5cd94fed 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py @@ -1,3 +1,6 @@ +import logging +import re + import pytest from onetl.connection import MongoDB @@ -6,8 +9,18 @@ pytestmark = pytest.mark.mongodb +@pytest.mark.parametrize( + "options", + [ + {}, + {"if_exists": "append"}, + {"if_exists": "replace_entire_collection"}, + {"if_exists": "error"}, + {"if_exists": "ignore"}, + ], +) @pytest.mark.flaky(reruns=2) -def test_mongodb_writer_snapshot(spark, processing, prepare_schema_table): +def test_mongodb_writer_snapshot(spark, processing, get_schema_table, options, caplog): df = processing.create_spark_df(spark=spark) mongo = MongoDB( @@ -21,12 +34,144 @@ def test_mongodb_writer_snapshot(spark, 
processing, prepare_schema_table): writer = DBWriter( connection=mongo, - table=prepare_schema_table.table, + table=get_schema_table.table, + options=MongoDB.WriteOptions(**options), + ) + + with caplog.at_level(logging.INFO): + writer.run(df) + + assert f"|MongoDB| Collection '{get_schema_table.table}' does not exist" in caplog.text + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df, + ) + + +def test_mongodb_writer_if_exists_append(spark, processing, get_schema_table): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + df1 = df[df._id < 1001] + df2 = df[df._id > 1000] + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="append"), + ) + writer.run(df1) + writer.run(df2) + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df, + ) + + +def test_mongodb_writer_if_exists_replace_entire_collection(spark, processing, get_schema_table): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + df1 = df[df._id < 1001] + df2 = df[df._id > 1000] + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="replace_entire_collection"), + ) + writer.run(df1) + writer.run(df2) + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df2, + ) + + +def test_mongodb_writer_if_exists_error(spark, processing, get_schema_table, caplog): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="error"), ) writer.run(df) + with pytest.raises( + ValueError, + match=re.escape("Operation stopped due to MongoDB.WriteOptions(if_exists='error')"), + ): + writer.run(df) + processing.assert_equal_df( - schema=prepare_schema_table.schema, - table=prepare_schema_table.table, + schema=get_schema_table.schema, + table=get_schema_table.table, df=df, ) + + +def test_mongodb_writer_if_exists_ignore(spark, processing, get_schema_table, caplog): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + df1 = df[df._id < 1001] + df2 = df[df._id > 1000] + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="ignore"), + ) + writer.run(df1) + + with caplog.at_level(logging.INFO): + writer.run(df2) # The write operation is ignored + + assert f"|MongoDB| Collection '{get_schema_table.table}' exists" in caplog.text + assert ( + "|MongoDB| Skip writing to existing collection because of MongoDB.WriteOptions(if_exists='ignore')" + in caplog.text + ) + + processing.assert_equal_df( + schema=get_schema_table.schema, + 
table=get_schema_table.table, + df=df1, + ) diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py index 195b16e02..cda43c8a8 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py @@ -6,7 +6,17 @@ pytestmark = pytest.mark.postgres -def test_postgres_writer_snapshot(spark, processing, prepare_schema_table): +@pytest.mark.parametrize( + "options", + [ + {}, + {"if_exists": "append"}, + {"if_exists": "replace_entire_table"}, + {"if_exists": "error"}, + {"if_exists": "ignore"}, + ], +) +def test_postgres_writer_snapshot(spark, processing, get_schema_table, options): df = processing.create_spark_df(spark=spark) postgres = Postgres( @@ -20,14 +30,15 @@ def test_postgres_writer_snapshot(spark, processing, prepare_schema_table): writer = DBWriter( connection=postgres, - target=prepare_schema_table.full_name, + target=get_schema_table.full_name, + options=Postgres.WriteOptions(**options), ) writer.run(df) processing.assert_equal_df( - schema=prepare_schema_table.schema, - table=prepare_schema_table.table, + schema=get_schema_table.schema, + table=get_schema_table.table, df=df, ) @@ -86,7 +97,7 @@ def test_postgres_writer_snapshot_with_pydantic_options(spark, processing, prepa ) -def test_postgres_writer_mode_append(spark, processing, prepare_schema_table): +def test_postgres_writer_if_exists_append(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) df1 = df[df.id_int < 1001] df2 = df[df.id_int > 1000] @@ -116,7 +127,70 @@ def test_postgres_writer_mode_append(spark, processing, prepare_schema_table): ) -def test_postgres_writer_mode_replace_entire_table(spark, processing, prepare_schema_table): +def test_postgres_writer_if_exists_error(spark, processing, prepare_schema_table): + from pyspark.sql.utils import AnalysisException + + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=postgres, + target=prepare_schema_table.full_name, + options=Postgres.WriteOptions(if_exists="error"), + ) + + with pytest.raises( + AnalysisException, + match=f"Table or view '{prepare_schema_table.full_name}' already exists. 
SaveMode: ErrorIfExists.", + ): + writer.run(df) + + empty_df = spark.createDataFrame([], df.schema) + + processing.assert_equal_df( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + df=empty_df, + ) + + +def test_postgres_writer_if_exists_ignore(spark, processing, prepare_schema_table): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=postgres, + target=prepare_schema_table.full_name, + options=Postgres.WriteOptions(if_exists="ignore"), + ) + + writer.run(df) # The write operation is ignored + empty_df = spark.createDataFrame([], df.schema) + + processing.assert_equal_df( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + df=empty_df, + ) + + +def test_postgres_writer_if_exists_replace_entire_table(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) df1 = df[df.id_int < 1001] df2 = df[df.id_int > 1000] diff --git a/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py b/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py new file mode 100644 index 000000000..7c5c8f5d5 --- /dev/null +++ b/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py @@ -0,0 +1,58 @@ +import logging + +import pytest + +pytestmark = [pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection] + + +def test_samba_file_connection_check_success(samba_file_connection, caplog): + samba = samba_file_connection + with caplog.at_level(logging.INFO): + assert samba.check() == samba + + assert "|Samba|" in caplog.text + assert f"host = '{samba.host}'" in caplog.text + assert f"port = {samba.port}" in caplog.text + assert f"protocol = '{samba.protocol}'" in caplog.text + assert f"user = '{samba.user}'" in caplog.text + assert f"share = '{samba.share}'" in caplog.text + assert "password = SecretStr('**********')" in caplog.text + assert samba.password.get_secret_value() not in caplog.text + + assert "Connection is available." 
in caplog.text + + +def test_samba_file_connection_check_not_existing_share_failed(samba_server, caplog): + from onetl.connection import Samba + + not_existing_share = "NotExistingShare" + samba = Samba( + host=samba_server.host, + share=not_existing_share, + protocol=samba_server.protocol, + port=samba_server.port, + user=samba_server.user, + password=samba_server.password, + ) + + with caplog.at_level(logging.INFO): + with pytest.raises(RuntimeError, match="Connection is unavailable"): + samba.check() + + assert f"Share '{not_existing_share}' not found among existing shares" in caplog.text + + +def test_samba_file_connection_check_runtime_failed(samba_server): + from onetl.connection import Samba + + samba = Samba( + host=samba_server.host, + share=samba_server.share, + protocol=samba_server.protocol, + port=samba_server.port, + user="unknown", + password="unknown", + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + samba.check() diff --git a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py new file mode 100644 index 000000000..e94386120 --- /dev/null +++ b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py @@ -0,0 +1,106 @@ +import logging + +import pytest + +from onetl.file.format import Excel + + +@pytest.mark.parametrize( + "spark_version", + [ + "2.2.1", + "2.3.1", + "2.4.8", + ], +) +def test_excel_get_packages_spark_version_not_supported(spark_version): + with pytest.raises(ValueError, match=f"Spark version should be at least 3.2, got {spark_version}"): + Excel.get_packages(spark_version=spark_version) + + +def test_excel_get_packages_scala_version_not_supported(): + with pytest.raises(ValueError, match="Scala version should be at least 2.12, got 2.11"): + Excel.get_packages(spark_version="3.2.4", scala_version="2.11") + + +def test_excel_get_packages_package_version_not_supported(): + with pytest.raises(ValueError, match="Package version should be at least 0.15, got 0.13.7"): + Excel.get_packages(spark_version="3.2.4", package_version="0.13.7") + + +@pytest.mark.parametrize( + "spark_version, scala_version, package_version, packages", + [ + # Detect Scala version by Spark version + ("3.2.4", None, None, ["com.crealytics:spark-excel_2.12:3.2.4_0.19.0"]), + ("3.4.1", None, None, ["com.crealytics:spark-excel_2.12:3.4.1_0.19.0"]), + # Override Scala version + ("3.2.4", "2.12", None, ["com.crealytics:spark-excel_2.12:3.2.4_0.19.0"]), + ("3.2.4", "2.13", None, ["com.crealytics:spark-excel_2.13:3.2.4_0.19.0"]), + ("3.4.1", "2.12", None, ["com.crealytics:spark-excel_2.12:3.4.1_0.19.0"]), + ("3.4.1", "2.13", None, ["com.crealytics:spark-excel_2.13:3.4.1_0.19.0"]), + # Override package version + ("3.2.0", None, "0.16.0", ["com.crealytics:spark-excel_2.12:3.2.0_0.16.0"]), + ("3.4.1", None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.4.1_0.18.0"]), + ], +) +def test_excel_get_packages(caplog, spark_version, scala_version, package_version, packages): + with caplog.at_level(level=logging.WARNING): + result = Excel.get_packages( + spark_version=spark_version, + scala_version=scala_version, + package_version=package_version, + ) + + if package_version: + assert f"Passed custom package version '{package_version}', it is not guaranteed to be supported" in caplog.text + + assert result == packages + + +def test_excel_options_default(): + excel = Excel() + assert not excel.header + + +def test_excel_options_default_override(): + excel = Excel(header=True) + assert excel.header + +
+@pytest.mark.parametrize( + "known_option", + [ + "dataAddress", + "treatEmptyValuesAsNulls", + "setErrorCellsToFallbackValues", + "usePlainNumberFormat", + "inferSchema", + "addColorColumns", + "timestampFormat", + "maxRowsInMemory", + "maxByteArraySize", + "tempFileThreshold", + "excerptSize", + "workbookPassword", + "dateFormat", + ], +) +def test_excel_options_known(known_option): + excel = Excel.parse({known_option: "value"}) + assert getattr(excel, known_option) == "value" + + +def test_excel_options_unknown(caplog): + with caplog.at_level(logging.WARNING): + excel = Excel(unknown="abc") + assert excel.unknown == "abc" + + assert ("Options ['unknown'] are not known by Excel, are you sure they are valid?") in caplog.text + + +@pytest.mark.local_fs +def test_excel_missing_package(spark_no_packages): + msg = "Cannot import Java class 'com.crealytics.spark.excel.v2.ExcelDataSource'" + with pytest.raises(ValueError, match=msg): + Excel().check_if_supported(spark_no_packages) diff --git a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py index 7e633206e..6469b10c8 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py @@ -153,6 +153,8 @@ def test_hive_write_options_unsupported_insert_into(insert_into): ({"if_exists": "append"}, HiveTableExistBehavior.APPEND), ({"if_exists": "replace_overlapping_partitions"}, HiveTableExistBehavior.REPLACE_OVERLAPPING_PARTITIONS), ({"if_exists": "replace_entire_table"}, HiveTableExistBehavior.REPLACE_ENTIRE_TABLE), + ({"if_exists": "error"}, HiveTableExistBehavior.ERROR), + ({"if_exists": "ignore"}, HiveTableExistBehavior.IGNORE), ], ) def test_hive_write_options_if_exists(options, value): @@ -198,6 +200,18 @@ def test_hive_write_options_if_exists(options, value): "Mode `overwrite_table` is deprecated since v0.9.0 and will be removed in v1.0.0. " "Use `replace_entire_table` instead", ), + ( + {"mode": "error"}, + HiveTableExistBehavior.ERROR, + "Option `Hive.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. " + "Use `Hive.WriteOptions(if_exists=...)` instead", + ), + ( + {"mode": "ignore"}, + HiveTableExistBehavior.IGNORE, + "Option `Hive.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. 
" + "Use `Hive.WriteOptions(if_exists=...)` instead", + ), ], ) def test_hive_write_options_mode_deprecated(options, value, message): @@ -209,10 +223,6 @@ def test_hive_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( "options", [ - # disallowed modes - {"mode": "error"}, - {"mode": "ignore"}, - # wrong mode {"mode": "wrong_mode"}, ], ) diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index ae81402cc..f932408d0 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -266,6 +266,8 @@ def test_jdbc_write_options_to_jdbc(spark_mock): [ ({}, JDBCTableExistBehavior.APPEND), ({"if_exists": "append"}, JDBCTableExistBehavior.APPEND), + ({"if_exists": "ignore"}, JDBCTableExistBehavior.IGNORE), + ({"if_exists": "error"}, JDBCTableExistBehavior.ERROR), ({"if_exists": "replace_entire_table"}, JDBCTableExistBehavior.REPLACE_ENTIRE_TABLE), ], ) @@ -294,6 +296,18 @@ def test_jdbc_write_options_if_exists(options, value): "Mode `overwrite` is deprecated since v0.9.0 and will be removed in v1.0.0. " "Use `replace_entire_table` instead", ), + ( + {"mode": "ignore"}, + JDBCTableExistBehavior.IGNORE, + "Option `WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. " + "Use `WriteOptions(if_exists=...)` instead", + ), + ( + {"mode": "error"}, + JDBCTableExistBehavior.ERROR, + "Option `WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. " + "Use `WriteOptions(if_exists=...)` instead", + ), ], ) def test_jdbc_write_options_mode_deprecated(options, value, message): @@ -305,10 +319,6 @@ def test_jdbc_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( "options", [ - # disallowed modes - {"mode": "error"}, - {"mode": "ignore"}, - # wrong mode {"mode": "wrong_mode"}, ], ) diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py index 8775f6dbc..eb3f1db23 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py @@ -233,6 +233,8 @@ def test_mongodb_convert_dict_to_str(): [ ({}, MongoDBCollectionExistBehavior.APPEND), ({"if_exists": "append"}, MongoDBCollectionExistBehavior.APPEND), + ({"if_exists": "ignore"}, MongoDBCollectionExistBehavior.IGNORE), + ({"if_exists": "error"}, MongoDBCollectionExistBehavior.ERROR), ({"if_exists": "replace_entire_collection"}, MongoDBCollectionExistBehavior.REPLACE_ENTIRE_COLLECTION), ], ) @@ -261,6 +263,18 @@ def test_mongodb_write_options_if_exists(options, value): "Mode `overwrite` is deprecated since v0.9.0 and will be removed in v1.0.0. " "Use `replace_entire_collection` instead", ), + ( + {"mode": "ignore"}, + MongoDBCollectionExistBehavior.IGNORE, + "Option `MongoDB.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. " + "Use `MongoDB.WriteOptions(if_exists=...)` instead", + ), + ( + {"mode": "error"}, + MongoDBCollectionExistBehavior.ERROR, + "Option `MongoDB.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. 
" + "Use `MongoDB.WriteOptions(if_exists=...)` instead", + ), ], ) def test_mongodb_write_options_mode_deprecated(options, value, message): @@ -272,10 +286,6 @@ def test_mongodb_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( "options", [ - # disallowed modes - {"mode": "error"}, - {"mode": "ignore"}, - # wrong mode {"mode": "wrong_mode"}, ], ) diff --git a/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py b/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py new file mode 100644 index 000000000..42f95b368 --- /dev/null +++ b/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py @@ -0,0 +1,47 @@ +import pytest + +from onetl.connection import FileConnection + +pytestmark = [pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection] + + +def test_samba_connection(): + from onetl.connection import Samba + + samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd") + assert isinstance(samba, FileConnection) + assert samba.host == "some_host" + assert samba.protocol == "SMB" + assert samba.domain == "" + assert samba.auth_type == "NTLMv2" + assert samba.port == 445 + assert samba.user == "some_user" + assert samba.password != "pwd" + assert samba.password.get_secret_value() == "pwd" + + assert "password='pwd'" not in str(samba) + assert "password='pwd'" not in repr(samba) + + +def test_samba_connection_with_net_bios(): + from onetl.connection import Samba + + samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol="NetBIOS") + assert samba.protocol == "NetBIOS" + assert samba.port == 139 + + +@pytest.mark.parametrize("protocol", ["SMB", "NetBIOS"]) +def test_samba_connection_with_custom_port(protocol): + from onetl.connection import Samba + + samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol=protocol, port=444) + assert samba.protocol == protocol + assert samba.port == 444 + + +def test_samba_connection_without_mandatory_args(): + from onetl.connection import Samba + + with pytest.raises(ValueError): + Samba() diff --git a/tests/util/spark_df.py b/tests/util/spark_df.py index 8e4c667b8..f4e239026 100644 --- a/tests/util/spark_df.py +++ b/tests/util/spark_df.py @@ -10,7 +10,7 @@ def reset_column_names(df: SparkDataFrame, columns: list[str] | None = None) -> """ Reset columns to ``_c0`` format. - If `columns` is None, reset all columns names. + If `columns` is None, apply to all columns in df. """ columns = columns or df.columns for i, column in enumerate(columns):