diff --git a/.env.docker b/.env.docker
index b9c2105aa..cb0394806 100644
--- a/.env.docker
+++ b/.env.docker
@@ -87,6 +87,16 @@ ONETL_SFTP_PORT=2222
ONETL_SFTP_USER=onetl
ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho
+# Samba
+ONETL_SAMBA_HOST=samba
+ONETL_SAMBA_PROTOCOL=SMB
+ONETL_SAMBA_UID=1000
+ONETL_SAMBA_GID=1000
+ONETL_SAMBA_PORT=445
+ONETL_SAMBA_SHARE=SmbShare
+ONETL_SAMBA_USER=onetl
+ONETL_SAMBA_PASSWORD=awd123fd1
+
# Webdav
ONETL_WEBDAV_HOST=webdav
ONETL_WEBDAV_PORT=80
diff --git a/.env.local b/.env.local
index af2551dbd..2e05030f3 100644
--- a/.env.local
+++ b/.env.local
@@ -87,6 +87,16 @@ export ONETL_SFTP_PORT=2222
export ONETL_SFTP_USER=onetl
export ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho
+# Samba
+export ONETL_SAMBA_HOST=localhost
+export ONETL_SAMBA_PROTOCOL=SMB
+export ONETL_SAMBA_UID=1000
+export ONETL_SAMBA_GID=1000
+export ONETL_SAMBA_PORT=445
+export ONETL_SAMBA_SHARE=SmbShare
+export ONETL_SAMBA_USER=onetl
+export ONETL_SAMBA_PASSWORD=awd123fd1
+
# Webdav
export ONETL_WEBDAV_HOST=localhost
export ONETL_WEBDAV_PORT=8000
diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml
index 43e02d8c2..292319bb5 100644
--- a/.github/workflows/data/greenplum/matrix.yml
+++ b/.github/workflows/data/greenplum/matrix.yml
@@ -7,7 +7,7 @@ min: &min
max: &max
# Greenplum connector does not support Spark 3.3+
- spark-version: 3.2.3
+ spark-version: 3.2.4
python-version: '3.10'
java-version: 11
os: ubuntu-latest
diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml
index af841433b..e956169ba 100644
--- a/.github/workflows/data/local-fs/matrix.yml
+++ b/.github/workflows/data/local-fs/matrix.yml
@@ -4,12 +4,18 @@ min: &min
java-version: 8
os: ubuntu-latest
-avro: &avro
+min_avro: &min_avro
spark-version: 2.4.8
python-version: '3.7'
java-version: 8
os: ubuntu-latest
+min_excel: &min_excel
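+ # Excel file format requires Spark 3.2+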
+ spark-version: 3.2.4
+ python-version: '3.7'
+ java-version: 8
+ os: ubuntu-latest
+
max: &max
spark-version: 3.4.1
python-version: '3.11'
@@ -25,12 +31,15 @@ latest: &latest
matrix:
small:
- <<: *max
- - <<: *avro
+ - <<: *min_avro
+ - <<: *min_excel
full:
- <<: *min
- - <<: *avro
+ - <<: *min_avro
+ - <<: *min_excel
- <<: *max
nightly:
- <<: *min
- - <<: *avro
+ - <<: *min_avro
+ - <<: *min_excel
- <<: *latest
diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml
index 80f81aacf..f91e1baaa 100644
--- a/.github/workflows/data/mongodb/matrix.yml
+++ b/.github/workflows/data/mongodb/matrix.yml
@@ -1,6 +1,6 @@
min: &min
# MongoDB connector does not support Spark 2
- spark-version: 3.2.3
+ spark-version: 3.2.4
python-version: '3.7'
java-version: 8
os: ubuntu-latest
diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml
index 57fe2ca8f..44779fe95 100644
--- a/.github/workflows/data/s3/matrix.yml
+++ b/.github/workflows/data/s3/matrix.yml
@@ -2,7 +2,7 @@ min: &min
# prior image versions returns empty content of bucket root, some kind of bug
minio-version: 2021.3.17
# Minimal Spark version with Hadoop 3.x support
- spark-version: 3.2.3
+ spark-version: 3.2.4
python-version: '3.7'
java-version: 8
os: ubuntu-latest
diff --git a/.github/workflows/data/samba/ignored.txt b/.github/workflows/data/samba/ignored.txt
new file mode 100644
index 000000000..d8f8d4692
--- /dev/null
+++ b/.github/workflows/data/samba/ignored.txt
@@ -0,0 +1 @@
+docs
diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml
new file mode 100644
index 000000000..a4a3afe30
--- /dev/null
+++ b/.github/workflows/data/samba/matrix.yml
@@ -0,0 +1,18 @@
+min: &min
+ python-version: '3.7'
+ os: ubuntu-latest
+
+max: &max
+ python-version: '3.11'
+ os: ubuntu-latest
+
+matrix:
+ small:
+ - server-version: latest
+ <<: *max
+ full: &full
+ - server-version: latest
+ <<: *min
+ - server-version: latest
+ <<: *max
+ nightly: *full
diff --git a/.github/workflows/data/samba/tracked.txt b/.github/workflows/data/samba/tracked.txt
new file mode 100644
index 000000000..5f7fcf905
--- /dev/null
+++ b/.github/workflows/data/samba/tracked.txt
@@ -0,0 +1 @@
+**/samba*
diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml
index fd7e24aae..b9d160b42 100644
--- a/.github/workflows/get-matrix.yml
+++ b/.github/workflows/get-matrix.yml
@@ -41,6 +41,8 @@ on:
value: ${{ jobs.get-matrix.outputs.matrix-s3 }}
matrix-sftp:
value: ${{ jobs.get-matrix.outputs.matrix-sftp }}
+ matrix-samba:
+ value: ${{ jobs.get-matrix.outputs.matrix-samba }}
matrix-webdav:
value: ${{ jobs.get-matrix.outputs.matrix-webdav }}
@@ -69,6 +71,7 @@ jobs:
matrix-hdfs: ${{ toJson(fromJson(steps.matrix-hdfs.outputs.result)[steps.key-hdfs.outputs.key]) }}
matrix-s3: ${{ toJson(fromJson(steps.matrix-s3.outputs.result)[steps.key-s3.outputs.key]) }}
matrix-sftp: ${{ toJson(fromJson(steps.matrix-sftp.outputs.result)[steps.key-sftp.outputs.key]) }}
+ matrix-samba: ${{ toJson(fromJson(steps.matrix-samba.outputs.result)[steps.key-samba.outputs.key]) }}
matrix-webdav: ${{ toJson(fromJson(steps.matrix-webdav.outputs.result)[steps.key-webdav.outputs.key]) }}
steps:
- name: Checkout code
@@ -635,6 +638,36 @@ jobs:
with:
cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml
+ - name: Check if Samba files are changed
+ id: changed-samba
+ uses: tj-actions/changed-files@v35
+ with:
+ files_from_source_file: .github/workflows/data/samba/tracked.txt
+ files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt
+
+ - name: Print Samba files changed
+ run: |
+ echo '${{ steps.changed-samba.outputs.all_changed_files }}'
+
+ - name: Calculate Samba matrix key
+ id: key-samba
+ run: |
+ if ${{ inputs.nightly }}; then
+ key=nightly
+ elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-samba.outputs.any_changed }}; then
+ key=full
+ else
+ key=small
+ fi
+ echo key=$key
+ echo key=$key >> $GITHUB_OUTPUT
+
+ - name: Get Samba matrix
+ id: matrix-samba
+ uses: mikefarah/yq@v4.33.3
+ with:
+ cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml
+
- name: Check if WebDAV files are changed
id: changed-webdav
uses: tj-actions/changed-files@v35
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 209364f4b..7608ebe6e 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -303,6 +303,21 @@ jobs:
os: ${{ matrix.os }}
with-cache: false
+ tests-samba:
+ name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }})
+ needs: [get-matrix]
+ strategy:
+ fail-fast: false
+ matrix:
+ include: ${{ fromJson(needs.get-matrix.outputs.matrix-samba) }}
+
+ uses: ./.github/workflows/test-samba.yml
+ with:
+ server-version: ${{ matrix.server-version }}
+ python-version: ${{ matrix.python-version }}
+ os: ${{ matrix.os }}
+ with-cache: false
+
tests-webdav:
name: Run WebDAV tests (server=${{ matrix.openwebdavssh-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }})
needs: [get-matrix]
@@ -338,6 +353,7 @@ jobs:
- tests-hdfs
- tests-s3
- tests-sftp
+ - tests-samba
- tests-webdav
steps:
diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml
new file mode 100644
index 000000000..d823a9ae7
--- /dev/null
+++ b/.github/workflows/test-samba.yml
@@ -0,0 +1,81 @@
+name: Tests for Samba
+on:
+ workflow_call:
+ inputs:
+ server-version:
+ required: true
+ type: string
+ python-version:
+ required: true
+ type: string
+ os:
+ required: true
+ type: string
+ with-cache:
+ required: false
+ type: boolean
+ default: true
+
+jobs:
+ test-samba:
+ name: Run Samba tests (server=${{ inputs.server-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }})
+ runs-on: ${{ inputs.os }}
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Set up Python ${{ inputs.python-version }}
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{ inputs.python-version }}
+
+ - name: Cache pip
+ uses: actions/cache@v3
+ if: inputs.with-cache
+ with:
+ path: ~/.cache/pip
+ key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }}
+ restore-keys: |
+ ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }}
+ ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-
+
+ - name: Upgrade pip
+ run: python -m pip install --upgrade pip setuptools wheel
+
+ - name: Install dependencies
+ run: |
+ pip install -I -r requirements/core.txt -r requirements/samba.txt -r requirements/tests/base.txt
+
+ # docker compose is used here instead of GitHub Actions services, because the samba container requires custom startup parameters
+ - name: Start Samba
+ run: |
+ docker compose down -v --remove-orphans
+ docker compose up -d samba
+ env:
+ SAMBA_IMAGE: elswork/samba:${{ inputs.server-version }}
+ COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }}
+
+ - name: Wait for Samba to be ready
+ run: |
+ ./docker/wait-for-it.sh -h localhost -p 445 -t 60
+
+ - name: Run tests
+ run: |
+ mkdir reports/ || echo "Directory exists"
+ sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env
+ source ./env
+ ./pytest_runner.sh -m samba
+
+ - name: Shutdown Samba
+ if: always()
+ run: |
+ docker compose down -v --remove-orphans
+ env:
+ COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }}
+
+ - name: Upload coverage results
+ uses: actions/upload-artifact@v3
+ with:
+ name: samba-${{ inputs.server-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }}
+ path: reports/*
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 44125d701..1df7f5306 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -287,6 +287,21 @@ jobs:
python-version: ${{ matrix.python-version }}
os: ${{ matrix.os }}
+ tests-samba:
+ name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }})
+ needs: [get-matrix]
+ strategy:
+ fail-fast: false
+ matrix:
+ include: ${{ fromJson(needs.get-matrix.outputs.matrix-samba) }}
+
+ uses: ./.github/workflows/test-samba.yml
+ with:
+ server-version: ${{ matrix.server-version }}
+ python-version: ${{ matrix.python-version }}
+ os: ${{ matrix.os }}
+
+
tests-webdav:
name: Run WebDAV tests (server=${{ matrix.webdav-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }})
needs: [get-matrix]
@@ -321,6 +336,7 @@ jobs:
- tests-hdfs
- tests-s3
- tests-sftp
+ - tests-samba
- tests-webdav
steps:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index fd0c89d6b..193ae3c3d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -59,12 +59,12 @@ repos:
- id: rst-inline-touching-normal
- id: text-unicode-replacement-char
- repo: https://github.com/asottile/pyupgrade
- rev: v3.10.1
+ rev: v3.13.0
hooks:
- id: pyupgrade
args: [--py37-plus, --keep-runtime-typing]
- repo: https://github.com/psf/black
- rev: 23.7.0
+ rev: 23.9.1
hooks:
- id: black
language_version: python3
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 4d54479b4..923741b22 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -14,6 +14,7 @@ python:
- ftp
- ftps
- hdfs
+ - samba
- s3
- sftp
- webdav
diff --git a/README.rst b/README.rst
index 13b280830..4f8b0aca8 100644
--- a/README.rst
+++ b/README.rst
@@ -54,7 +54,7 @@ Requirements
* **Python 3.7 - 3.11**
* PySpark 2.3.x - 3.4.x (depends on used connector)
* Java 8+ (required by Spark, see below)
-* Kerberos libs & GCC (required by ``Hive`` and ``HDFS`` connectors)
+* Kerberos libs & GCC (required by ``Hive``, ``HDFS`` and ``SparkHDFS`` connectors)
Supported storages
------------------
@@ -93,6 +93,8 @@ Supported storages
| | FTPS | |
+ +--------------+----------------------------------------------------------------------------------------------------------------------+
| | WebDAV | `WebdavClient3 library `_ |
++ +--------------+----------------------------------------------------------------------------------------------------------------------+
+| | Samba | `pysmb library `_ |
+--------------------+--------------+----------------------------------------------------------------------------------------------------------------------+
| Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ |
| +--------------+ +
@@ -109,16 +111,16 @@ Documentation
See https://onetl.readthedocs.io/
-.. install
-
How to install
---------------
-.. _minimal-install:
+.. _install:
Minimal installation
~~~~~~~~~~~~~~~~~~~~
+.. _minimal-install:
+
Base ``onetl`` package contains:
* ``DBReader``, ``DBWriter`` and related classes
@@ -140,14 +142,16 @@ It can be installed via:
This method is recommended for use in third-party libraries which require for ``onetl`` to be installed,
but do not use its connection classes.
-.. _spark-install:
-
With DB and FileDF connections
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. _spark-install:
+
All DB connection classes (``Clickhouse``, ``Greenplum``, ``Hive`` and others)
and all FileDF connection classes (``SparkHDFS``, ``SparkLocalFS``, ``SparkS3``)
-require PySpark to be installed.
+require Spark to be installed.
+
+.. _java-install:
Firstly, you should install JDK. The exact installation instruction depends on your OS, here are some examples:
@@ -169,13 +173,15 @@ Compatibility matrix
+--------------------------------------------------------------+-------------+-------------+-------+
| `2.4.x `_ | 3.7 only | 8 only | 2.11 |
+--------------------------------------------------------------+-------------+-------------+-------+
-| `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 |
+| `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 |
+--------------------------------------------------------------+-------------+-------------+-------+
-| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 |
+| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 |
+--------------------------------------------------------------+-------------+-------------+-------+
| `3.4.x `_ | 3.7 - 3.11 | 8u362 - 20 | 2.12 |
+--------------------------------------------------------------+-------------+-------------+-------+
+.. _pyspark-install:
+
Then you should install PySpark via passing ``spark`` to ``extras``:
.. code:: bash
@@ -191,12 +197,11 @@ or install PySpark explicitly:
or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance.
**Otherwise connection object cannot be created.**
-
-.. _files-install:
-
With File connections
~~~~~~~~~~~~~~~~~~~~~
+.. _files-install:
+
All File (but not *FileDF*) connection classes (``FTP``, ``SFTP``, ``HDFS`` and so on) requires specific Python clients to be installed.
Each client can be installed explicitly by passing connector name (in lowercase) to ``extras``:
@@ -204,7 +209,7 @@ Each client can be installed explicitly by passing connector name (in lowercase)
.. code:: bash
pip install onetl[ftp] # specific connector
- pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav] # multiple connectors
+ pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba] # multiple connectors
To install all file connectors at once you can pass ``files`` to ``extras``:
@@ -214,22 +219,21 @@ To install all file connectors at once you can pass ``files`` to ``extras``:
**Otherwise class import will fail.**
-
-.. _kerberos-install:
-
With Kerberos support
~~~~~~~~~~~~~~~~~~~~~
+.. _kerberos-install:
+
Most of Hadoop instances set up with Kerberos support,
so some connections require additional setup to work properly.
* ``HDFS``
Uses `requests-kerberos `_ and
- `GSSApi `_ for authentication in WebHDFS.
+ `GSSApi `_ for authentication.
It also uses ``kinit`` executable to generate Kerberos ticket.
* ``Hive`` and ``SparkHDFS``
- Requires Kerberos ticket to exist before creating Spark session.
+ require Kerberos ticket to exist before creating Spark session.
So you need to install OS packages with:
@@ -250,12 +254,11 @@ Also you should pass ``kerberos`` to ``extras`` to install required Python packa
pip install onetl[kerberos]
-
-.. _full-install:
-
Full bundle
~~~~~~~~~~~
+.. _full-bundle:
+
To install all connectors and dependencies, you can pass ``all`` into ``extras``:
.. code:: bash
@@ -269,7 +272,7 @@ To install all connectors and dependencies, you can pass ``all`` into ``extras``
This method consumes a lot of disk space, and requires for Java & Kerberos libraries to be installed into your OS.
-.. quick-start
+.. _quick-start:
Quick start
------------
diff --git a/conftest.py b/conftest.py
index ab0b60a5c..52b6c5754 100644
--- a/conftest.py
+++ b/conftest.py
@@ -19,5 +19,6 @@
"tests.fixtures.connections.local_fs",
"tests.fixtures.connections.s3",
"tests.fixtures.connections.sftp",
+ "tests.fixtures.connections.samba",
"tests.fixtures.connections.webdav",
]
diff --git a/docker-compose.yml b/docker-compose.yml
index a08d8fc38..bdcfe3954 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -164,6 +164,18 @@ services:
networks:
- onetl
+ samba:
+ image: elswork/samba
+ restart: unless-stopped
+ ports:
+ - "139:139"
+ - "445:445"
+ volumes:
+ - ./docker/samba/custom_entrypoint.sh:/custom_entrypoint.sh
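+ # custom entrypoint creates the user and share expected by tests (see docker/samba/custom_entrypoint.sh)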
+ entrypoint: ["/custom_entrypoint.sh"]
+ networks:
+ - onetl
+
s3:
image: ${S3_IMAGE:-bitnami/minio:latest}
restart: unless-stopped
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 103cc2b26..817d4eab2 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -57,6 +57,7 @@ RUN pip install \
-r /app/requirements/hdfs.txt \
-r /app/requirements/s3.txt \
-r /app/requirements/sftp.txt \
+ -r /app/requirements/samba.txt \
-r /app/requirements/webdav.txt \
-r /app/requirements/kerberos.txt \
-r /app/requirements/docs.txt \
diff --git a/docker/samba/custom_entrypoint.sh b/docker/samba/custom_entrypoint.sh
new file mode 100755
index 000000000..f0d4078c0
--- /dev/null
+++ b/docker/samba/custom_entrypoint.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# allow create files and directories
+mkdir -p /share/folder
+chmod 0777 /share/folder
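+# -u creates the user matching ONETL_SAMBA_UID/GID/USER/PASSWORD from .env.docker,
+# -s exposes /share/folder as share "SmbShare" with rw access for that user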
+/entrypoint.sh -u "1000:1000:onetl:onetl:awd123fd1" -s "SmbShare:/share/folder:rw:onetl"
diff --git a/docs/changelog/0.9.4.rst b/docs/changelog/0.9.4.rst
new file mode 100644
index 000000000..4eb406ae0
--- /dev/null
+++ b/docs/changelog/0.9.4.rst
@@ -0,0 +1,30 @@
+0.9.4 (2023-09-26)
+==================
+
+Features
+--------
+
+- Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` (:github:pull:`143`)
+- Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` (:github:pull:`144`)
+- Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` (:github:pull:`145`)
+- Add ``Excel`` file format support. (:github:pull:`148`)
+- Add ``Samba`` file connection.
+ It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. (:github:pull:`150`)
+
+
+Improvements
+------------
+
+- Add documentation about different ways of passing packages to Spark session. (:github:pull:`151`)
+- Drastically improve ``Greenplum`` documentation:
+ * Added information about network ports, grants, ``pg_hba.conf`` and so on.
+ * Added interaction schemas for reading, writing and executing statements in Greenplum.
+ * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. (:github:pull:`154`)
+- Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. (:github:pull:`156`)
+- Call ``.close()`` on FileConnection when it is removed by garbage collector. (:github:pull:`156`)
+
+
+Bug Fixes
+---------
+
+- Fix issue where stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. (:github:pull:`156`)
diff --git a/docs/changelog/NEXT_RELEASE.rst b/docs/changelog/NEXT_RELEASE.rst
index 5e26856b4..ee4196843 100644
--- a/docs/changelog/NEXT_RELEASE.rst
+++ b/docs/changelog/NEXT_RELEASE.rst
@@ -3,3 +3,34 @@
.. and add it to index.rst
.. towncrier release notes start
+
+0.9.4 (2023-09-26)
+==================
+
+Features
+--------
+
+- Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` (:github:pull:`143`)
+- Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` (:github:pull:`144`)
+- Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` (:github:pull:`145`)
+- Add ``Excel`` file format support. (:github:pull:`148`)
+- Add ``Samba`` file connection.
+ It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. (:github:pull:`150`)
+
+
+Improvements
+------------
+
+- Add documentation about different ways of passing packages to Spark session. (:github:pull:`151`)
+- Drastically improve ``Greenplum`` documentation:
+ * Added information about network ports, grants, ``pg_hba.conf`` and so on.
+ * Added interaction schemas for reading, writing and executing statements in Greenplum.
+ * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. (:github:pull:`154`)
+- Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. (:github:pull:`156`)
+- Call ``.close()`` on FileConnection when it is removed by garbage collector. (:github:pull:`156`)
+
+
+Bug Fixes
+---------
+
+- Fix issue where stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. (:github:pull:`156`)
diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst
index 92701e1e1..6130bfdc8 100644
--- a/docs/changelog/index.rst
+++ b/docs/changelog/index.rst
@@ -4,6 +4,7 @@
DRAFT
NEXT_RELEASE
+ 0.9.4
0.9.3
0.9.2
0.9.1
diff --git a/docs/conf.py b/docs/conf.py
index 87d6fd17b..06a5b08aa 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -56,6 +56,7 @@
"sphinx.ext.autosummary",
"sphinxcontrib.autodoc_pydantic",
"sphinxcontrib.towncrier", # provides `towncrier-draft-entries` directive
+ "sphinxcontrib.plantuml",
]
numpydoc_show_class_members = True
autodoc_pydantic_model_show_config = False
diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst
index b0833b213..e2179a4ec 100644
--- a/docs/connection/db_connection/greenplum/execute.rst
+++ b/docs/connection/db_connection/greenplum/execute.rst
@@ -3,6 +3,47 @@
Executing statements in Greenplum
==================================
+Interaction schema
+------------------
+
+Unlike reading & writing, executing statements in Greenplum is done **only** through Greenplum master node,
+without any interaction between Greenplum segments and Spark executors. Moreover, Spark executors are not used at all in this case.
+
+The only port used while interacting with Greenplum in this case is ``5432`` (Greenplum master port).
+
+.. dropdown:: Spark <-> Greenplum interaction during Greenplum.execute()/Greenplum.fetch()
+
+ .. plantuml::
+
+ @startuml
+ title Greenplum master <-> Spark driver
+ box "Spark"
+ participant "Spark driver"
+ end box
+
+ box "Greenplum"
+ participant "Greenplum master"
+ end box
+
+ == Greenplum.check() ==
+
+ activate "Spark driver"
+ "Spark driver" -> "Greenplum master" ++ : CONNECT
+
+ == Greenplum.execute(statement) ==
+ "Spark driver" --> "Greenplum master" : EXECUTE statement
+ "Greenplum master" -> "Spark driver" : RETURN result
+
+ == Greenplum.close() ==
+ "Spark driver" --> "Greenplum master" : CLOSE CONNECTION
+
+ deactivate "Greenplum master"
+ deactivate "Spark driver"
+ @enduml
+
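+For example, both methods can be used like this (a minimal sketch; connection arguments are omitted and table names are purely illustrative):
+
+.. code:: python
+
+    from onetl.connection import Greenplum
+
+    greenplum = Greenplum(..., spark=spark)
+
+    # both calls are handled by the Greenplum master only,
+    # Spark executors are not involved
+    df = greenplum.fetch("SELECT id, name FROM myschema.mytable WHERE id > 100")
+    greenplum.execute("TRUNCATE TABLE myschema.staging_table")
+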
+Options
+-------
+
.. currentmodule:: onetl.connection.db_connection.greenplum.connection
.. automethod:: Greenplum.fetch
diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst
index 964d9cdcf..57db9635e 100644
--- a/docs/connection/db_connection/greenplum/prerequisites.rst
+++ b/docs/connection/db_connection/greenplum/prerequisites.rst
@@ -7,7 +7,7 @@ Version Compatibility
---------------------
* Greenplum server versions: 5.x, 6.x
-* Spark versions: 2.3.x - 3.2.x (Spark 3.3.x is not supported yet)
+* Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet)
* Java versions: 8 - 11
See `official documentation `_.
@@ -18,13 +18,7 @@ Installing PySpark
To use Greenplum connector you should have PySpark installed (or injected to ``sys.path``)
BEFORE creating the connector instance.
-You can install PySpark as follows:
-
-.. code:: bash
-
- pip install onetl pyspark=3.2.3 # pass specific PySpark version
-
-See :ref:`spark-install` instruction for more details.
+See :ref:`install-spark` installation instruction for more details.
Downloading Pivotal package
---------------------------
@@ -33,140 +27,189 @@ To use Greenplum connector you should download connector ``.jar`` file from
`Pivotal website `_
and then pass it to Spark session.
-There are several ways to do that.
+.. warning::
+
+ Please pay attention to :ref:`Spark & Scala version compatibility `.
+
+There are several ways to do that. See :ref:`java-packages` for details.
.. note::
- Please pay attention to Spark <-> Scala version compatibility. See :ref:`spark-compatibility-matrix`.
+ If you're uploading the package to a private package repo, use ``groupId=io.pivotal`` and ``artifactId=greenplum-spark_2.12``
+ (``2.12`` is the Scala version) to give the uploaded package a proper name.
-Using ``spark.jars``
-~~~~~~~~~~~~~~~~~~~~
+Connecting to Greenplum
+-----------------------
-The most simple solution, but this requires to store/deploy ``.jar`` file in the local environment.
+Interaction schema
+~~~~~~~~~~~~~~~~~~
-* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file.
-* Create Spark session with passing ``.jar`` absolute file path to ``spark.jars`` Spark config option, e.g.
+Spark executors open ports to listen for incoming requests.
+Greenplum segments initiate connections to Spark executors using `EXTERNAL TABLE `_
+functionality, and send/read data using the `gpfdist `_ protocol.
-.. code:: python
+Data is **not** sent through Greenplum master.
+Greenplum master only receives commands to start the reading/writing process, and manages all the metadata (external table location, schema and so on).
- # no need to use spark.jars.packages
- spark = (
- SparkSession.builder.config("spark.app.name", "onetl")
- .config("spark.jars", "/path/to/downloaded.jar")
- .getOrCreate()
- )
+More details can be found in `official documentation `_.
-Using ``spark.jars.repositories``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Number of parallel connections
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Can be used if you have access both to public repos (like Maven) and a private Artifactory/Nexus repo.
+.. warning::
-* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_.
-* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file.
-* Upload ``.jar`` file to private repository (with ``groupId=io.pivotal``, ``artifactoryId=greenplum-spark_2.12``).
-* Pass repo URL to ``spark.jars.repositories`` Spark config option
-* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option.
+ This is very important!!!
+ If you don't limit the number of connections, you can exceed the `max_connections `_
+ limit set on the Greenplum side. It's usually not that high, e.g. 500-1000 connections max,
+ depending on your Greenplum instance settings and whether connection balancers like ``pgbouncer`` are used.
-Example
-^^^^^^^
+ Consuming all available connections means **nobody** (even admin users) can connect to Greenplum.
-.. code:: python
+Each job on the Spark executor makes its own connection to the Greenplum master node,
+so you need to limit the number of connections to avoid opening too many of them.
+
+* Reading about ``5-10Gb`` of data requires about ``3-5`` parallel connections.
+* Reading about ``20-30Gb`` of data requires about ``5-10`` parallel connections.
+* Reading about ``50Gb`` of data requires about ``10-20`` parallel connections.
+* Reading about ``100+Gb`` of data requires ``20-30`` parallel connections.
+* Opening more than ``30-50`` connections is not recommended.
+
+Number of connections can be limited in 2 ways:
+
+* By limiting the number of Spark executors and the number of cores per executor. Max number of parallel jobs is ``executors * cores``.
+
+.. tabs::
- maven_packages = Greenplum.get_packages(spark_version="3.2")
- spark = (
- SparkSession.builder.config("spark.app.name", "onetl")
- .config("spark.jars.repositories", "http://nexus.domain.com/example-repo/")
- .config("spark.jars.packages", ",".join(maven_packages))
- .getOrCreate()
- )
-
-
-Using ``spark.jars.ivySettings``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Same as above, but can be used even if there is no network access to public repos like Maven.
-
-* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_.
-* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file.
-* Upload ``.jar`` file to private repository (with ``groupId=io.pivotal``, ``artifactoryId=greenplum-spark_2.12``).
-* Create `ivysettings.xml `_ file.
-* Add here a resolver with repository URL (and credentials, if required).
-* Pass ``ivysettings.xml`` absolute path to ``spark.jars.ivySettings`` Spark config option.
-* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option.
-
-Example
-^^^^^^^
-
-.. code-block:: xml
- :caption: ivysettings.xml
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. code-block:: python
- :caption: script.py
-
- maven_packages = Greenplum.get_packages(spark_version="3.2")
- spark = (
- SparkSession.builder.config("spark.app.name", "onetl")
- .config("spark.jars.ivySettings", "/path/to/ivysettings.xml")
- .config("spark.jars.packages", ",".join(maven_packages))
- .getOrCreate()
- )
-
-Moving ``.jar`` file to ``~/.ivy2/jars/``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven.
-
-* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file.
-* Move it to ``~/.ivy2/jars/`` folder
-* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option.
-
-Example
-^^^^^^^
+ .. code-tab:: py Spark with master=local
+
+ (
+ SparkSession.builder
+ # Spark with master=local[10] runs everything in a single JVM with 10 cores, so max number of parallel jobs is 10
+ .config("spark.master", "local[10]")
+ .config("spark.executor.cores", 1)
+ )
+
+ .. code-tab:: py Spark with master=yarn or master=k8s, dynamic allocation
+
+ (
+ SparkSession.builder
+ .config("spark.master", "yarn")
+ # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10
+ .config("spark.dynamicAllocation.maxExecutors", 10)
+ .config("spark.executor.cores", 1)
+ )
+
+ .. code-tab:: py Spark with master=yarn or master=k8s, static allocation
+
+ (
+ SparkSession.builder
+ .config("spark.master", "yarn")
+ # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10
+ .config("spark.executor.instances", 10)
+ .config("spark.executor.cores", 1)
+ )
+
+* By limiting the connection pool size used by Spark (**only** for Spark with ``master=local``):
.. code:: python
- maven_packages = Greenplum.get_packages(spark_version="3.2")
- spark = (
- SparkSession.builder.config("spark.app.name", "onetl")
- .config("spark.jars.packages", ",".join(maven_packages))
- .getOrCreate()
- )
+ spark = SparkSession.builder.config("spark.master", "local[*]").getOrCreate()
+
+ # No matter how many executors are started and how many cores they have,
+ # number of connections cannot exceed pool size:
+ Greenplum(
+ ...,
+ extra={
+ "pool.maxSize": 10,
+ },
+ )
+
+See `connection pooling `_
+documentation.
+
+
+* By setting :obj:`num_partitions `
+ and :obj:`partition_column ` (not recommended).
+
+Allowing connection to Greenplum master
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Ask your Greenplum cluster administrator to allow your user to connect to Greenplum master node,
+e.g. by updating ``pg_hba.conf`` file.
+
+More details can be found in `official documentation `_.
+
+Network ports
+~~~~~~~~~~~~~
+
+To read data from Greenplum using Spark, the following ports should be opened in the firewall between Spark and Greenplum:
+
+* Spark driver and all Spark executors -> port ``5432`` on Greenplum master node.
+
+ This port number should be set while connecting to Greenplum:
+
+ .. code:: python
+
+ Greenplum(host="master.host", port=5432, ...)
+
+* Greenplum segments -> some port range (e.g. ``41000-42000``) **listened on by Spark executors**.
+
+ This range should be set in ``extra`` option:
+
+ .. code:: python
+
+ Greenplum(
+ ...,
+ extra={
+ "server.port": "41000-42000",
+ },
+ )
+
+ Number of ports in this range is ``number of parallel running Spark sessions`` * ``number of parallel connections per session``.
+
+ Number of connections per session is usually less than ``30`` (see above).
+
+ Number of sessions depends on your environment:
+ * For ``master=local``, only a few (up to tens of) sessions can be started on the same host, depending on available RAM and CPU.
+
+ * For ``master=yarn`` / ``master=k8s``, hundreds or thousands of sessions can be started simultaneously,
+ but they are executed on different cluster nodes, so the same port can be opened on different nodes at the same time.
+
+More details can be found in official documentation:
+ * `port requirements `_
+ * `format of server.port value `_
+ * `port troubleshooting `_
+
+Required grants
+~~~~~~~~~~~~~~~
+
+Ask your Greenplum cluster administrator to set the following grants for the user
+used for creating the connection:
+
+.. tabs::
-Inserting ``.jar`` file to Spark jars folder
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ .. code-tab:: sql Reading & writing
-Can be used to embed ``.jar`` files to a default Spark classpath.
+ GRANT USAGE ON SCHEMA myschema TO username;
+ GRANT CREATE ON SCHEMA myschema TO username;
+ GRANT SELECT, INSERT ON TABLE myschema.mytable TO username;
+ ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist');
-* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file.
-* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``.
-* Create Spark session **WITHOUT** passing Greenplum package name to ``spark.jars.packages``
+ .. code-tab:: sql Reading from Greenplum
+ GRANT USAGE ON SCHEMA schema_to_read TO username;
+ GRANT CREATE ON SCHEMA schema_to_read TO username;
+ GRANT SELECT ON TABLE schema_to_read.table_to_read TO username;
+ -- yes, ``writable``, because data is written from Greenplum to Spark executor.
+ ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist');
-Manually adding ``.jar`` files to ``CLASSPATH``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ .. code-tab:: sql Writing to Greenplum
-Can be used to embed ``.jar`` files to a default Java classpath.
+ GRANT USAGE ON SCHEMA schema_to_write TO username;
+ GRANT CREATE ON SCHEMA schema_to_write TO username;
+ GRANT SELECT, INSERT ON TABLE schema_to_write.table_to_write TO username;
+ -- yes, ``readable``, because data is read from Spark executor to Greenplum.
+ ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist');
-* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file.
-* Set environment variable ``CLASSPATH`` to ``/path/to/downloader.jar``
-* Create Spark session **WITHOUT** passing Greenplum package name to ``spark.jars.packages``
+More details can be found in `official documentation `_.
diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst
index 2640f7e6c..30d669fea 100644
--- a/docs/connection/db_connection/greenplum/read.rst
+++ b/docs/connection/db_connection/greenplum/read.rst
@@ -8,20 +8,143 @@ For reading data from Greenplum, use :obj:`DBReader `,
- and drop staging table after reading is finished.
+Interaction schema
+------------------
- In this case data will be read directly from Greenplum segment nodes in a distributed way.
+High-level schema is described in :ref:`greenplum-prerequisites`. You can find detailed interaction schema below.
+
+.. dropdown:: Spark <-> Greenplum interaction during DBReader.run()
+
+ .. plantuml::
+
+ @startuml
+ title Greenplum master <-> Spark driver
+ box "Spark"
+ participant "Spark driver"
+ participant "Spark executor1"
+ participant "Spark executor2"
+ participant "Spark executorN"
+ end box
+
+ box "Greenplum"
+ participant "Greenplum master"
+ participant "Greenplum segment1"
+ participant "Greenplum segment2"
+ participant "Greenplum segmentN"
+ end box
+
+ == Greenplum.check() ==
+
+ activate "Spark driver"
+ "Spark driver" -> "Greenplum master" ++ : CONNECT
+
+ "Spark driver" --> "Greenplum master" : CHECK IF TABLE EXISTS gp_table
+ "Greenplum master" --> "Spark driver" : TABLE EXISTS
+ "Spark driver" -> "Greenplum master" : SHOW SCHEMA FOR gp_table
+ "Greenplum master" --> "Spark driver" : (id bigint, col1 int, col2 text, ...)
+
+ == DBReader.run() ==
+
+ "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1
+ "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2
+ "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N
+
+ note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV
+ "Spark executor1" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1
+ note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master
+ "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1
+ note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments.
+
+ "Spark executor2" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) USING address=executor2_host:executor2_port;\nINSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2
+ "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2
+
+ "Spark executorN" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) USING address=executorN_host:executorN_port;\nINSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N
+ "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN
+
+ "Greenplum segment1" ->o "Spark executor1" -- : INITIALIZE CONNECTION TO Spark executor1\nPUSH DATA TO Spark executor1
+ note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor
+
+ "Greenplum segment2" ->o "Spark executor2" -- : INITIALIZE CONNECTION TO Spark executor2\nPUSH DATA TO Spark executor2
+ "Greenplum segmentN" ->o "Spark executorN" -- : INITIALIZE CONNECTION TO Spark executorN\nPUSH DATA TO Spark executorN
+
+ == Spark.stop() ==
+
+ "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1
+ deactivate "Greenplum master"
+ "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2
+ deactivate "Greenplum master"
+ "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN
+ deactivate "Greenplum master"
+
+ "Spark executor1" --> "Spark driver" -- : DONE
+ "Spark executor2" --> "Spark driver" -- : DONE
+ "Spark executorN" --> "Spark driver" -- : DONE
+
+ "Spark driver" --> "Greenplum master" : CLOSE CONNECTION
+ deactivate "Greenplum master"
+ deactivate "Spark driver"
+ @enduml
+
+Recommendations
+---------------
+
+Reading from views
+~~~~~~~~~~~~~~~~~~
+
+This connector is **NOT** designed to read data from views.
+
+You can technically read data from a view which has
+`gp_segment_id `_ column.
+But this is **not** recommended, because each Spark executor will run the same query, which may lead to running duplicated calculations
+and sending data between segments, only to skip most of the result and select only a small part of it.
+
+Prefer the following approach instead (see the sketch after this list):
+ * Create a staging table to store result data, using :obj:`Greenplum.execute `
+ * Use the same ``.execute`` method to run a query ``INSERT INTO staging_table SELECT * FROM some_view``. This will be done on the Greenplum segments side, and the query will be run only once.
+ * Read data from the staging table to Spark executors using :obj:`DBReader `.
+ * Drop the staging table using the ``.execute`` method.
+
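+A minimal sketch of this approach (connection arguments are omitted, table and view names are purely illustrative):
+
+.. code:: python
+
+    from onetl.connection import Greenplum
+    from onetl.db import DBReader
+
+    greenplum = Greenplum(..., spark=spark)
+
+    # create and fill the staging table on the Greenplum side, the query is run only once.
+    # UNLOGGED is used to avoid generating WAL (see below)
+    greenplum.execute("CREATE UNLOGGED TABLE myschema.staging_table AS SELECT * FROM myschema.some_view")
+
+    # read the staging table in a distributed way
+    reader = DBReader(connection=greenplum, source="myschema.staging_table")
+    df = reader.run()
+
+    # cleanup
+    greenplum.execute("DROP TABLE myschema.staging_table")
+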
+Using ``JOIN`` on Greenplum side
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you need to get the result of joining 2 tables in Greenplum, you should:
+ * Create a staging table to store result data, using ``Greenplum.execute``
+ * Use the same ``Greenplum.execute`` to run a query ``INSERT INTO staging_table SELECT * FROM table1 JOIN table2 ON ...``. This will be done on the Greenplum segments side, in a distributed way.
+ * Read data from the staging table to Spark executors using ``DBReader``.
+ * Drop the staging table using ``Greenplum.execute``.
.. warning::
- Greenplum connection does **NOT** support reading data from views which does not have ``gp_segment_id`` column.
- Either add this column to a view, or use stating table solution (see above).
+ Do **NOT** try to read data from ``table1`` and ``table2`` using ``DBReader``, and then join the resulting dataframes!
+
+ This will lead to sending all the data from both tables to Spark executor memory, and then the ``JOIN``
+ will be performed on the Spark side, not in Greenplum. This is **very** inefficient.
+
+Using ``TEMPORARY`` tables
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+One could think that writing data from a ``VIEW`` or the result of a ``JOIN`` to a ``TEMPORARY`` table,
+and then passing it to DBReader, is an efficient way to read data from Greenplum, because temp tables do not generate WAL files,
+and are automatically deleted after the transaction is finished.
+
+That will **not** work. Each Spark executor establishes its own connection to Greenplum,
+and thus reads its own temporary table, which does not contain any data.
+
+You should use `UNLOGGED `_ tables
+to write data to a staging table without generating useless WAL logs.
+
+Mapping of Greenplum types to Spark types
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+See `official documentation `_
+for more details.
+onETL does not perform any additional casting of types while reading data.
+
+Options
+-------
.. currentmodule:: onetl.connection.db_connection.greenplum.options
diff --git a/docs/connection/db_connection/greenplum/write.rst b/docs/connection/db_connection/greenplum/write.rst
index aeb688ac5..c7a4f1560 100644
--- a/docs/connection/db_connection/greenplum/write.rst
+++ b/docs/connection/db_connection/greenplum/write.rst
@@ -5,6 +5,101 @@ Writing to Greenplum
For writing data to Greenplum, use :obj:`DBWriter ` with options below.
+
+Interaction schema
+------------------
+
+High-level schema is described in :ref:`greenplum-prerequisites`. You can find detailed interaction schema below.
+
+.. dropdown:: Spark <-> Greenplum interaction during DBWriter.run()
+
+ .. plantuml::
+
+ @startuml
+ title Greenplum master <-> Spark driver
+ box "Spark"
+ participant "Spark driver"
+ participant "Spark executor1"
+ participant "Spark executor2"
+ participant "Spark executorN"
+ end box
+
+ box "Greenplum"
+ participant "Greenplum master"
+ participant "Greenplum segment1"
+ participant "Greenplum segment2"
+ participant "Greenplum segmentN"
+ end box
+
+ == Greenplum.check() ==
+
+ activate "Spark driver"
+ "Spark driver" -> "Greenplum master" ++ : CONNECT
+ "Spark driver" --> "Greenplum master" ++ : CHECK IF TABLE EXISTS gp_table
+ "Greenplum master" --> "Spark driver" : TABLE NOT EXISTS
+
+ == DBWriter.run(df) ==
+
+ "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1
+ "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2
+ "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N
+
+ note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV
+ "Spark executor1" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO gp_table FROM spark_executor1
+ note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master
+ "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM spark_executor1 TO gp_table_data_on_segment1
+ note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments.
+
+ "Spark executor2" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) USING address=executor2_host:executor2_port;\nINSERT INTO gp_table FROM spark_executor2
+ "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM spark_executor2 TO gp_table_data_on_segment2
+
+ "Spark executorN" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) USING address=executorN_host:executorN_port;\nINSERT INTO gp_table FROM spark_executorN
+ "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM spark_executorN TO gp_table_data_on_segmentN
+
+ "Greenplum segment1" -->o "Spark executor1" : INITIALIZE CONNECTION TO Spark executor1
+ "Spark executor1" -> "Greenplum segment1" : READ DATA FROM Spark executor1
+ note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor
+ deactivate "Greenplum segment1"
+
+ "Greenplum segment2" -->o "Spark executor2" : INITIALIZE CONNECTION TO Spark executor2
+ "Spark executor2" -> "Greenplum segment2" : READ DATA FROM Spark executor2
+ deactivate "Greenplum segment2"
+
+ "Greenplum segmentN" -->o "Spark executorN" : INITIALIZE CONNECTION TO Spark executorN
+ "Spark executorN" -> "Greenplum segmentN" : READ DATA FROM Spark executorN
+ deactivate "Greenplum segmentN"
+
+ == Finished ==
+
+ "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1
+ deactivate "Greenplum master"
+ "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2
+ deactivate "Greenplum master"
+ "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN
+ deactivate "Greenplum master"
+
+ "Spark executor1" --> "Spark driver" -- : DONE
+ "Spark executor2" --> "Spark driver" -- : DONE
+ "Spark executorN" --> "Spark driver" -- : DONE
+
+ "Spark driver" --> "Greenplum master" : CLOSE CONNECTION
+ deactivate "Greenplum master"
+ deactivate "Spark driver"
+ @enduml
+
+Recommendations
+---------------
+
+Mapping of Spark types to Greenplum types
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+See `official documentation `_
+for more details.
+onETL does not perform any additional casting of types while writing data.
+
+Options
+-------
+
.. currentmodule:: onetl.connection.db_connection.greenplum.options
.. autopydantic_model:: GreenplumWriteOptions
diff --git a/docs/connection/db_connection/kafka/read.rst b/docs/connection/db_connection/kafka/read.rst
index a19c5e57b..3e6c846b1 100644
--- a/docs/connection/db_connection/kafka/read.rst
+++ b/docs/connection/db_connection/kafka/read.rst
@@ -7,7 +7,8 @@ For reading data from Kafka, use :obj:`DBReader
FTPS
HDFS
+ Samba
SFTP
S3
Webdav
diff --git a/docs/connection/file_connection/samba.rst b/docs/connection/file_connection/samba.rst
new file mode 100644
index 000000000..73f7ac3f9
--- /dev/null
+++ b/docs/connection/file_connection/samba.rst
@@ -0,0 +1,9 @@
+.. _samba:
+
+Samba connection
+==============
+
+.. currentmodule:: onetl.connection.file_connection.samba
+
+.. autoclass:: Samba
+ :members: __init__, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_file, list_dir, download_file, upload_file
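+
+A minimal usage sketch is shown below. Constructor arguments mirror the ``ONETL_SAMBA_*`` variables used in tests and are purely illustrative,
+see the class reference above for the exact signature:
+
+.. code:: python
+
+    from onetl.connection import Samba
+    from onetl.file import FileDownloader
+
+    samba = Samba(
+        host="samba.domain.com",
+        port=445,
+        protocol="SMB",
+        share="SmbShare",
+        user="onetl",
+        password="*****",
+    )
+    samba.check()
+
+    # download files from the Samba share to the local filesystem
+    downloader = FileDownloader(
+        connection=samba,
+        source_path="/remote/folder",
+        local_path="/local/folder",
+    )
+    downloader.run()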
diff --git a/docs/file_df/file_formats/avro.rst b/docs/file_df/file_formats/avro.rst
index 6251a5154..7f1ec0d4f 100644
--- a/docs/file_df/file_formats/avro.rst
+++ b/docs/file_df/file_formats/avro.rst
@@ -1,7 +1,7 @@
.. _avro-file-format:
Avro
-========
+====
.. currentmodule:: onetl.file.format.avro
diff --git a/docs/file_df/file_formats/excel.rst b/docs/file_df/file_formats/excel.rst
new file mode 100644
index 000000000..f9b680085
--- /dev/null
+++ b/docs/file_df/file_formats/excel.rst
@@ -0,0 +1,9 @@
+.. _excel-file-format:
+
+Excel
+=====
+
+.. currentmodule:: onetl.file.format.excel
+
+.. autoclass:: Excel
+ :members: get_packages
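+
+``Excel`` format requires an additional Maven package to be injected into the Spark session (hence the ``get_packages`` method above).
+A minimal sketch of creating such a session (the Spark version here is illustrative):
+
+.. code:: python
+
+    from pyspark.sql import SparkSession
+
+    from onetl.file.format import Excel
+
+    # get the list of Maven packages for the used Spark version
+    maven_packages = Excel.get_packages(spark_version="3.4.1")
+
+    spark = (
+        SparkSession.builder.config("spark.app.name", "onetl")
+        .config("spark.jars.packages", ",".join(maven_packages))
+        .getOrCreate()
+    )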
diff --git a/docs/file_df/file_formats/index.rst b/docs/file_df/file_formats/index.rst
index 7e3367bc6..3a39bc061 100644
--- a/docs/file_df/file_formats/index.rst
+++ b/docs/file_df/file_formats/index.rst
@@ -9,6 +9,7 @@ File Formats
avro
csv
+ excel
json
jsonline
orc
diff --git a/docs/file_df/file_formats/orc.rst b/docs/file_df/file_formats/orc.rst
index 2d82b3584..491492bac 100644
--- a/docs/file_df/file_formats/orc.rst
+++ b/docs/file_df/file_formats/orc.rst
@@ -1,7 +1,7 @@
.. _orc-file-format:
ORC
-========
+===
.. currentmodule:: onetl.file.format.orc
diff --git a/docs/index.rst b/docs/index.rst
index cc8fdb87d..54ced3d06 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -14,7 +14,7 @@
:hidden:
self
- install
+ install/index
quickstart
concepts
diff --git a/docs/install.rst b/docs/install.rst
deleted file mode 100644
index abf328c75..000000000
--- a/docs/install.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-.. include:: ../README.rst
- :start-after: install
- :end-before: quick-start
diff --git a/docs/install/files.rst b/docs/install/files.rst
new file mode 100644
index 000000000..b32c7a807
--- /dev/null
+++ b/docs/install/files.rst
@@ -0,0 +1,8 @@
+.. _install-files:
+
+File connections
+=================
+
+.. include:: ../../README.rst
+ :start-after: .. _files-install:
+ :end-before: With Kerberos support
diff --git a/docs/install/full.rst b/docs/install/full.rst
new file mode 100644
index 000000000..a3853207c
--- /dev/null
+++ b/docs/install/full.rst
@@ -0,0 +1,8 @@
+.. _install-full:
+
+Full bundle
+===========
+
+.. include:: ../../README.rst
+ :start-after: .. _full-bundle:
+ :end-before: .. _quick-start:
diff --git a/docs/install/index.rst b/docs/install/index.rst
new file mode 100644
index 000000000..47f86287c
--- /dev/null
+++ b/docs/install/index.rst
@@ -0,0 +1,21 @@
+.. _install:
+
+How to install
+==============
+
+.. include:: ../../README.rst
+ :start-after: .. _minimal-install:
+ :end-before: With DB and FileDF connections
+
+Installation in detail
+-----------------------
+
+.. toctree::
+ :maxdepth: 1
+ :caption: How to install
+
+ self
+ spark
+ files
+ kerberos
+ full
diff --git a/docs/install/kerberos.rst b/docs/install/kerberos.rst
new file mode 100644
index 000000000..2ba28de4d
--- /dev/null
+++ b/docs/install/kerberos.rst
@@ -0,0 +1,8 @@
+.. _install-kerberos:
+
+Kerberos support
+================
+
+.. include:: ../../README.rst
+ :start-after: .. _kerberos-install:
+ :end-before: Full bundle
diff --git a/docs/install/spark.rst b/docs/install/spark.rst
new file mode 100644
index 000000000..861527341
--- /dev/null
+++ b/docs/install/spark.rst
@@ -0,0 +1,323 @@
+.. _install-spark:
+
+Spark
+=====
+
+.. include:: ../../README.rst
+ :start-after: .. _spark-install:
+ :end-before: .. _java-install:
+
+Installing Java
+---------------
+
+.. include:: ../../README.rst
+ :start-after: .. _java-install:
+ :end-before: .. _pyspark-install:
+
+Installing PySpark
+------------------
+
+.. include:: ../../README.rst
+ :start-after: .. _pyspark-install:
+ :end-before: With File connections
+
+.. _java-packages:
+
+Injecting Java packages
+-----------------------
+
+Some DB and FileDF connection classes require specific packages to be inserted to ``CLASSPATH`` of Spark session,
+like JDBC drivers.
+
+This is usually done by setting up ``spark.jars.packages`` option while creating Spark session:
+
+.. code:: python
+
+ # here is a list of packages to be downloaded:
+ maven_packages = (
+ Greenplum.get_packages(spark_version="3.2")
+ + MySQL.get_packages()
+ + Teradata.get_packages()
+ )
+
+ spark = (
+ SparkSession.builder.config("spark.app.name", "onetl")
+ .config("spark.jars.packages", ",".join(maven_packages))
+ .getOrCreate()
+ )
+
+
+Spark automatically resolves the package and all its dependencies, downloads them and injects them into the Spark session
+(both driver and all executors).
+
+This requires internet access, because package metadata and ``.jar`` files are fetched from `Maven Repository `_.
+
+But sometimes it is required to:
+
+* Install package without direct internet access (isolated network)
+* Install package which is not available in Maven
+
+There are several ways to do that.
+
+Using ``spark.jars``
+^^^^^^^^^^^^^^^^^^^^
+
+The simplest solution, but it requires storing raw ``.jar`` files somewhere on the filesystem or a web server.
+
+* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique.
+* (For ``spark.submit.deployMode=cluster``) place downloaded files to HDFS or deploy to any HTTP web server serving static files. See `official documentation `_ for more details.
+* Create Spark session with passing ``.jar`` absolute file path to ``spark.jars`` Spark config option:
+
+.. tabs::
+
+ .. code-tab:: py for spark.submit.deployMode=client (default)
+
+ jar_files = ["/path/to/package.jar"]
+
+ # do not pass spark.jars.packages
+ spark = (
+ SparkSession.builder.config("spark.app.name", "onetl")
+ .config("spark.jars", ",".join(jar_files))
+ .getOrCreate()
+ )
+
+ .. code-tab:: py for spark.submit.deployMode=cluster
+
+ # you can also pass URLs like http://domain.com/path/to/downloadable/package.jar
+ jar_files = ["hdfs:///path/to/package.jar"]
+
+ # do not pass spark.jars.packages
+ spark = (
+ SparkSession.builder.config("spark.app.name", "onetl")
+ .config("spark.jars", ",".join(jar_files))
+ .getOrCreate()
+ )
+
+Using ``spark.jars.repositories``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. note::
+
+ In this case Spark will still try to fetch packages from the internet, so if you don't have internet access,
+ the Spark session will be created with a significant delay because of the attempts to fetch packages.
+
+Can be used if you have access both to public repos (like Maven) and a private Artifactory/Nexus repo.
+
+* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_.
+* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter.
+* Upload ``package.jar`` file to the private repository (with the same ``groupId`` and ``artifactId`` as the source package in Maven).
+* Pass repo URL to ``spark.jars.repositories`` Spark config option.
+* Create Spark session with passing Package name to ``spark.jars.packages`` Spark config option:
+
+.. code:: python
+
+ maven_packages = (
+ Greenplum.get_packages(spark_version="3.2")
+ + MySQL.get_packages()
+ + Teradata.get_packages()
+ )
+
+ spark = (
+ SparkSession.builder.config("spark.app.name", "onetl")
+ .config("spark.jars.repositories", "http://nexus.mydomain.com/private-repo/")
+ .config("spark.jars.packages", ",".join(maven_packages))
+ .getOrCreate()
+ )
+
+
+Using ``spark.jars.ivySettings``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Same as above, but can be used even if there is no network access to public repos like Maven.
+
+* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_.
+* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter.
+* Upload ``package.jar`` file to `private repository `_ (with the same ``groupId`` and ``artifactId`` as the source package in Maven).
+* Create ``ivysettings.xml`` file (see below).
+* Add here a resolver with repository URL (and credentials, if required).
+* Pass ``ivysettings.xml`` absolute path to ``spark.jars.ivySettings`` Spark config option.
+* Create Spark session with passing package name to ``spark.jars.packages`` Spark config option:
+
+.. tabs::
+
+ .. code-tab:: xml ivysettings-all-packages-uploaded-to-nexus.xml
+
+        <!-- Example settings (reconstructed): all packages, private and public, are resolved from a single private repo. -->
+        <!-- Replace host and repo path with your repository URL. -->
+        <ivysettings>
+            <settings defaultResolver="main"/>
+            <resolvers>
+                <chain name="main" returnFirst="true">
+                    <ibiblio name="private-nexus" m2compatible="true" root="http://nexus.mydomain.com/private-repo/"/>
+                </chain>
+            </resolvers>
+        </ivysettings>
+
+ .. code-tab:: xml ivysettings-private-packages-in-nexus-public-in-maven.xml
+
+        <!-- Example settings (reconstructed): private packages are resolved from a private repo, -->
+        <!-- public packages are fetched from Maven Central (requires internet access). -->
+        <ivysettings>
+            <settings defaultResolver="main"/>
+            <resolvers>
+                <chain name="main" returnFirst="true">
+                    <!-- Maven Central is the default root of the ibiblio resolver -->
+                    <ibiblio name="central" m2compatible="true"/>
+                    <ibiblio name="private-nexus" m2compatible="true" root="http://nexus.mydomain.com/private-repo/"/>
+                </chain>
+            </resolvers>
+        </ivysettings>
+
+ .. code-tab:: xml ivysettings-private-packages-in-nexus-public-fetched-using-proxy-repo.xml
+
+        <!-- Example settings (reconstructed): private packages are resolved from a private repo, -->
+        <!-- public packages are fetched through a Maven Central proxy repo hosted by the same Nexus/Artifactory. -->
+        <ivysettings>
+            <settings defaultResolver="main"/>
+            <resolvers>
+                <chain name="main" returnFirst="true">
+                    <ibiblio name="maven-central-proxy" m2compatible="true" root="http://nexus.mydomain.com/maven-central-proxy/"/>
+                    <ibiblio name="private-nexus" m2compatible="true" root="http://nexus.mydomain.com/private-repo/"/>
+                </chain>
+            </resolvers>
+        </ivysettings>
+
+ .. code-tab:: xml ivysettings-nexus-with-auth-required.xml
+
+        <!-- Example settings (reconstructed): same as above, but the repo requires authentication. -->
+        <!-- The realm value depends on your repository manager; the one below is typical for Sonatype Nexus. -->
+        <ivysettings>
+            <credentials host="nexus.mydomain.com"
+                         realm="Sonatype Nexus Repository Manager"
+                         username="username"
+                         passwd="password"/>
+            <settings defaultResolver="main"/>
+            <resolvers>
+                <chain name="main" returnFirst="true">
+                    <ibiblio name="private-nexus" m2compatible="true" root="http://nexus.mydomain.com/private-repo/"/>
+                </chain>
+            </resolvers>
+        </ivysettings>
+
+
+.. code-block:: python
+ :caption: script.py
+
+ maven_packages = (
+ Greenplum.get_packages(spark_version="3.2")
+ + MySQL.get_packages()
+ + Teradata.get_packages()
+ )
+
+ spark = (
+ SparkSession.builder.config("spark.app.name", "onetl")
+ .config("spark.jars.ivySettings", "/path/to/ivysettings.xml")
+ .config("spark.jars.packages", ",".join(maven_packages))
+ .getOrCreate()
+ )
+
+Place ``.jar`` file to ``~/.ivy2/jars/``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Can be used to pass an already downloaded file to Ivy, skipping package resolution from Maven.
+
+* Download the ``package.jar`` file (usually named something like ``some-package_1.0.0.jar``). The local file name does not matter, but it should be unique.
+* Move it to the ``~/.ivy2/jars/`` folder (see the sketch after the code block below).
+* Create Spark session, passing the package names to the ``spark.jars.packages`` Spark config option:
+
+.. code:: python
+
+ maven_packages = (
+ Greenplum.get_packages(spark_version="3.2")
+ + MySQL.get_packages()
+ + Teradata.get_packages()
+ )
+
+ spark = (
+ SparkSession.builder.config("spark.app.name", "onetl")
+ .config("spark.jars.packages", ",".join(maven_packages))
+ .getOrCreate()
+ )
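+
+A minimal sketch of the "move" step above (the file name is a placeholder):
+
+.. code:: bash
+
+    mkdir -p ~/.ivy2/jars/
+    cp some-package_2.12-1.0.0.jar ~/.ivy2/jars/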
+
+Place ``.jar`` file to Spark jars folder
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. note::
+
+ Package file should be placed on all hosts/containers Spark is running on,
+ both the driver and all executors.
+
+ Usually this is used only with either:
+
+ * ``spark.master=local`` (driver and executors are running on the same host),
+ * ``spark.master=k8s://...`` (``.jar`` files are added to the image or to a volume mounted to all pods).
+
+Can be used to embed ``.jar`` files into the default Spark classpath.
+
+* Download the ``package.jar`` file (usually named something like ``some-package_1.0.0.jar``). The local file name does not matter, but it should be unique.
+* Move it to the ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/`` (see the sketch below).
+* Create Spark session **WITHOUT** passing package names to ``spark.jars.packages``:
+
+.. code:: python
+
+ # no need to set spark.jars.packages or any other spark.jars.* option
+ # all jars already present in CLASSPATH, and loaded automatically
+
+ spark = SparkSession.builder.config("spark.app.name", "onetl").getOrCreate()
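+
+Below is a minimal sketch of copying the file into the ``jars/`` folder of a pip-installed PySpark. The file name is a placeholder; for a standalone Spark distribution copy it to ``$SPARK_HOME/jars/`` instead:
+
+.. code:: bash
+
+    # locate the jars/ folder of the pyspark package installed in the current environment
+    SPARK_JARS_DIR="$(python -c 'import os, pyspark; print(os.path.join(os.path.dirname(pyspark.__file__), "jars"))')"
+    cp some-package_2.12-1.0.0.jar "$SPARK_JARS_DIR/"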
+
+
+Manually adding ``.jar`` files to ``CLASSPATH``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. note::
+
+ Package file should be placed on all hosts/containers Spark is running on,
+ both the driver and all executors.
+
+ Usually this is used only with either:
+
+ * ``spark.master=local`` (driver and executors are running on the same host),
+ * ``spark.master=k8s://...`` (``.jar`` files are added to the image or to a volume mounted to all pods).
+
+Can be used to embed ``.jar`` files into the default Java classpath.
+
+* Download the ``package.jar`` file (usually named something like ``some-package_1.0.0.jar``). The local file name does not matter.
+* Set the environment variable ``CLASSPATH`` to ``/path/to/package.jar``. You can set multiple file paths, separated by ``:`` on Linux or ``;`` on Windows (see the example and the sketch below).
+* Create Spark session **WITHOUT** passing package names to ``spark.jars.packages``:
+
+.. code:: python
+
+ # no need to set spark.jars.packages or any other spark.jars.* option
+ # all jars already present in CLASSPATH, and loaded automatically
+
+ import os
+
+ jar_files = ["/path/to/package.jar"]
+ # different delimiters for Windows and Linux
+ delimiter = ";" if os.name == "nt" else ":"
+ spark = (
+ SparkSession.builder.config("spark.app.name", "onetl")
+ .config("spark.driver.extraClassPath", delimiter.join(jar_files))
+ .config("spark.executor.extraClassPath", delimiter.join(jar_files))
+ .getOrCreate()
+ )
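+
+Below is a minimal sketch of setting the ``CLASSPATH`` environment variable before starting the Python process (paths are placeholders):
+
+.. code:: bash
+
+    # use ":" as delimiter on Linux, ";" on Windows
+    export CLASSPATH="/path/to/package.jar:/path/to/another-package.jar"
+    python script.py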
diff --git a/onetl/VERSION b/onetl/VERSION
index 965065db5..a602fc9e2 100644
--- a/onetl/VERSION
+++ b/onetl/VERSION
@@ -1 +1 @@
-0.9.3
+0.9.4
diff --git a/onetl/connection/__init__.py b/onetl/connection/__init__.py
index 1c50f7fee..3e40e2a2a 100644
--- a/onetl/connection/__init__.py
+++ b/onetl/connection/__init__.py
@@ -37,6 +37,7 @@
from onetl.connection.file_connection.ftps import FTPS
from onetl.connection.file_connection.hdfs import HDFS
from onetl.connection.file_connection.s3 import S3
+ from onetl.connection.file_connection.samba import Samba
from onetl.connection.file_connection.sftp import SFTP
from onetl.connection.file_connection.webdav import WebDAV
from onetl.connection.file_df_connection.spark_hdfs import SparkHDFS
@@ -62,6 +63,7 @@
"HDFS": "hdfs",
"S3": "s3",
"SFTP": "sftp",
+ "Samba": "samba",
"WebDAV": "webdav",
}
diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py
index dc6acf163..f95884f7d 100644
--- a/onetl/connection/db_connection/clickhouse/connection.py
+++ b/onetl/connection/db_connection/clickhouse/connection.py
@@ -65,7 +65,7 @@ class Clickhouse(JDBCConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py
index 99de7d90c..d1eedff7f 100644
--- a/onetl/connection/db_connection/greenplum/connection.py
+++ b/onetl/connection/db_connection/greenplum/connection.py
@@ -124,34 +124,24 @@ class Greenplum(JDBCMixin, DBConnection):
from onetl.connection import Greenplum
from pyspark.sql import SparkSession
- # Please ask your DevOps and Greenplum admin what port range
- # on Spark side can be used to accept requests from Greenplum segments
-
- extra = {
- "server.port": "49152-65535",
- }
-
# Create Spark session with Greenplum connector loaded
- # See Prerequisites page for more details
maven_packages = Greenplum.get_packages(spark_version="3.2")
spark = (
SparkSession.builder.appName("spark-app-name")
.config("spark.jars.packages", ",".join(maven_packages))
+ .config("spark.executor.allowSparkContext", "true")
+ # IMPORTANT!!!
+ # Set number of executors according to "Prerequisites" -> "Number of executors"
.config("spark.dynamicAllocation.maxExecutors", 10)
.config("spark.executor.cores", 1)
.getOrCreate()
)
# IMPORTANT!!!
- # Each job on the Spark executor make its own connection to Greenplum master node,
- # so we need to limit number of connections to avoid opening too many of them.
- #
- # Table size ~20Gb requires about 10 executors * cores,
- # ~50Gb requires ~ 20 executors * cores,
- # 100Gb+ requires 30 executors * cores.
- #
- # Cores number can be increased, but executors count should be reduced
- # to keep the same number of executors * cores.
+ # Set port range of executors according to "Prerequisites" -> "Network ports"
+ extra = {
+ "server.port": "41000-42000",
+ }
# Create connection
greenplum = Greenplum(
diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py
index 86785155e..7d4638412 100644
--- a/onetl/connection/db_connection/greenplum/options.py
+++ b/onetl/connection/db_connection/greenplum/options.py
@@ -107,7 +107,9 @@ class Config:
.. warning::
- You should not change this option, unless you know what you're doing
+ You should not change this option, unless you know what you're doing.
+
+ It's preferable to use the default values, so data is read in parallel, with parallelism matching the number of segments in the Greenplum cluster.
Possible values:
* ``None`` (default):
diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py
index d0bc08d29..6d768ea2e 100644
--- a/onetl/connection/db_connection/hive/connection.py
+++ b/onetl/connection/db_connection/hive/connection.py
@@ -69,7 +69,7 @@ class Hive(DBConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
.. warning::
@@ -82,7 +82,7 @@ class Hive(DBConnection):
.. note::
Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit``
- **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`kerberos-install`.
+ **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`.
In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options
to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS.
@@ -340,6 +340,14 @@ def write_df_to_target(
# https://stackoverflow.com/a/72747050
if table_exists and write_options.if_exists != HiveTableExistBehavior.REPLACE_ENTIRE_TABLE:
+ if write_options.if_exists == HiveTableExistBehavior.ERROR:
+ raise ValueError("Operation stopped due to Hive.WriteOptions(if_exists='error')")
+ elif write_options.if_exists == HiveTableExistBehavior.IGNORE:
+ log.info(
+ "|%s| Skip writing to existing table because of Hive.WriteOptions(if_exists='ignore')",
+ self.__class__.__name__,
+ )
+ return
# using saveAsTable on existing table does not handle
# spark.sql.sources.partitionOverwriteMode=dynamic, so using insertInto instead.
self._insert_into(df, target, options)
diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py
index c46b7882d..81445851d 100644
--- a/onetl/connection/db_connection/hive/options.py
+++ b/onetl/connection/db_connection/hive/options.py
@@ -26,6 +26,8 @@
class HiveTableExistBehavior(str, Enum):
APPEND = "append"
+ IGNORE = "ignore"
+ ERROR = "error"
REPLACE_ENTIRE_TABLE = "replace_entire_table"
REPLACE_OVERLAPPING_PARTITIONS = "replace_overlapping_partitions"
@@ -173,9 +175,30 @@ class Config:
Table is recreated using options provided by user (``format``, ``compression``, etc)
**instead of using original table options**. Be careful
- .. note::
+ * ``ignore``
+ Ignores the write operation if the table/partition already exists.
+
+ .. dropdown:: Behavior in details
+
+ * Table does not exist
+ Table is created using options provided by user (``format``, ``compression``, etc).
+
+ * Table exists
+ If the table exists, **no further action is taken**. This is true whether or not new partition
+ values are present and whether the partitioning scheme differs or not.
+
+ * ``error``
+ Raises an error if the table/partition already exists.
+
+ .. dropdown:: Behavior in details
+
+ * Table does not exist
+ Table is created using options provided by user (``format``, ``compression``, etc).
+
+ * Table exists
+ If the table exists, **raises an error**. This is true whether or not new partition
+ values are present and whether the partitioning scheme differs or not.
- ``error`` and ``ignore`` modes are not supported.
.. note::
diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py
index 3eb83f538..f5b611910 100644
--- a/onetl/connection/db_connection/jdbc_connection/connection.py
+++ b/onetl/connection/db_connection/jdbc_connection/connection.py
@@ -218,7 +218,11 @@ def write_df_to_target(
write_options = self.WriteOptions.parse(options)
jdbc_params = self.options_to_jdbc_params(write_options)
- mode = "append" if write_options.if_exists == JDBCTableExistBehavior.APPEND else "overwrite"
+ mode = (
+ "overwrite"
+ if write_options.if_exists == JDBCTableExistBehavior.REPLACE_ENTIRE_TABLE
+ else write_options.if_exists.value
+ )
log.info("|%s| Saving data to a table %r", self.__class__.__name__, target)
df.write.jdbc(table=target, mode=mode, **jdbc_params)
log.info("|%s| Table %r successfully written", self.__class__.__name__, target)
diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py
index c998055fe..dacaded77 100644
--- a/onetl/connection/db_connection/jdbc_connection/options.py
+++ b/onetl/connection/db_connection/jdbc_connection/options.py
@@ -84,6 +84,8 @@
class JDBCTableExistBehavior(str, Enum):
APPEND = "append"
+ IGNORE = "ignore"
+ ERROR = "error"
REPLACE_ENTIRE_TABLE = "replace_entire_table"
def __str__(self) -> str:
@@ -413,44 +415,65 @@ class Config:
.. dropdown:: Behavior in details
- * Table does not exist
- Table is created using options provided by user
- (``createTableOptions``, ``createTableColumnTypes``, etc).
+ * Table does not exist
+ Table is created using options provided by user
+ (``createTableOptions``, ``createTableColumnTypes``, etc).
- * Table exists
- Data is appended to a table. Table has the same DDL as before writing data
+ * Table exists
+ Data is appended to a table. Table has the same DDL as before writing data
- .. warning::
+ .. warning::
- This mode does not check whether table already contains
- rows from dataframe, so duplicated rows can be created.
+ This mode does not check whether table already contains
+ rows from dataframe, so duplicated rows can be created.
- Also Spark does not support passing custom options to
- insert statement, like ``ON CONFLICT``, so don't try to
- implement deduplication using unique indexes or constraints.
+ Also Spark does not support passing custom options to
+ insert statement, like ``ON CONFLICT``, so don't try to
+ implement deduplication using unique indexes or constraints.
- Instead, write to staging table and perform deduplication
- using :obj:`~execute` method.
+ Instead, write to staging table and perform deduplication
+ using :obj:`~execute` method.
* ``replace_entire_table``
**Table is dropped and then created, or truncated**.
.. dropdown:: Behavior in details
- * Table does not exist
- Table is created using options provided by user
- (``createTableOptions``, ``createTableColumnTypes``, etc).
+ * Table does not exist
+ Table is created using options provided by user
+ (``createTableOptions``, ``createTableColumnTypes``, etc).
- * Table exists
- Table content is replaced with dataframe content.
+ * Table exists
+ Table content is replaced with dataframe content.
- After writing completed, target table could either have the same DDL as
- before writing data (``truncate=True``), or can be recreated (``truncate=False``
- or source does not support truncation).
+ After writing completed, target table could either have the same DDL as
+ before writing data (``truncate=True``), or can be recreated (``truncate=False``
+ or source does not support truncation).
- .. note::
+ * ``ignore``
+ Ignores the write operation if the table already exists.
+
+ .. dropdown:: Behavior in details
+
+ * Table does not exist
+ Table is created using options provided by user
+ (``createTableOptions``, ``createTableColumnTypes``, etc).
+
+ * Table exists
+ The write operation is ignored, and no data is written to the table.
+
+ * ``error``
+ Raises an error if the table already exists.
+
+ .. dropdown:: Behavior in details
+
+ * Table does not exist
+ Table is created using options provided by user
+ (``createTableOptions``, ``createTableColumnTypes``, etc).
+
+ * Table exists
+ An error is raised, and no data is written to the table.
- ``error`` and ``ignore`` modes are not supported.
"""
batchsize: int = 20_000
diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py
index c02fb82f1..e5e3e312e 100644
--- a/onetl/connection/db_connection/jdbc_mixin/connection.py
+++ b/onetl/connection/db_connection/jdbc_mixin/connection.py
@@ -15,10 +15,11 @@
from __future__ import annotations
import logging
+import threading
from abc import abstractmethod
from contextlib import closing, suppress
from enum import Enum, auto
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Tuple, TypeVar
+from typing import TYPE_CHECKING, Callable, ClassVar, Optional, TypeVar
from pydantic import Field, PrivateAttr, SecretStr, validator
@@ -76,7 +77,7 @@ class JDBCMixin(FrozenModel):
_CHECK_QUERY: ClassVar[str] = "SELECT 1"
# cached JDBC connection (Java object), plus corresponding GenericOptions (Python object)
- _last_connection_and_options: Optional[Tuple[Any, JDBCMixinOptions]] = PrivateAttr(default=None)
+ _last_connection_and_options: Optional[threading.local] = PrivateAttr(default=None)
@property
@abstractmethod
@@ -126,6 +127,7 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # noqa: U101
def __del__(self): # noqa: WPS603
# If current object is collected by GC, close all opened connections
+ # This is safe because closing connection on Spark driver does not influence Spark executors
self.close()
@slot
@@ -459,8 +461,14 @@ def _options_to_connection_properties(self, options: JDBCMixinOptions):
return jdbc_options.asConnectionProperties()
def _get_jdbc_connection(self, options: JDBCMixinOptions):
+ if not self._last_connection_and_options:
+ # connection class can be used in multiple threads.
+ # each Python thread creates its own thread in JVM
+ # so we need local variable to create per-thread persistent connection
+ self._last_connection_and_options = threading.local()
+
with suppress(Exception): # nothing cached, or JVM failed
- last_connection, last_options = self._last_connection_and_options
+ last_connection, last_options = self._last_connection_and_options.data
if options == last_options and not last_connection.isClosed():
return last_connection
@@ -471,15 +479,18 @@ def _get_jdbc_connection(self, options: JDBCMixinOptions):
driver_manager = self.spark._jvm.java.sql.DriverManager # type: ignore
new_connection = driver_manager.getConnection(self.jdbc_url, connection_properties)
- self._last_connection_and_options = (new_connection, options)
+ self._last_connection_and_options.data = (new_connection, options)
return new_connection
def _close_connections(self):
with suppress(Exception):
- last_connection, _ = self._last_connection_and_options
+ # connection may not be opened yet
+ last_connection, _ = self._last_connection_and_options.data
last_connection.close()
- self._last_connection_and_options = None
+ with suppress(Exception):
+ # connection may not be opened yet
+ del self._last_connection_and_options.data
def _get_statement_args(self) -> tuple[int, ...]:
resultset = self.spark._jvm.java.sql.ResultSet # type: ignore
diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py
index 3aa8f0fd2..51053df0c 100644
--- a/onetl/connection/db_connection/kafka/connection.py
+++ b/onetl/connection/db_connection/kafka/connection.py
@@ -72,6 +72,7 @@ class Kafka(DBConnection):
* Apache Kafka versions: 0.10 or higher
* Spark versions: 2.4.x - 3.4.x
+ * Scala versions: 2.11 - 2.13
Parameters
----------
@@ -381,6 +382,9 @@ def get_packages(
"""
Get package names to be downloaded by Spark. |support_hooks|
+ See `Maven package index `_
+ for all available packages.
+
Parameters
----------
spark_version : str
@@ -458,6 +462,10 @@ def close(self):
self.auth.cleanup(self)
return self
+ # Do not add __del__ calling .close(), like other connections do,
+ # because this can influence dataframes created by this connection.
+ # For example, .close() deletes local keytab copy.
+
@property
def instance_url(self):
return "kafka://" + self.cluster
diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py
index 8e6110f14..860f7b215 100644
--- a/onetl/connection/db_connection/mongodb/connection.py
+++ b/onetl/connection/db_connection/mongodb/connection.py
@@ -65,6 +65,7 @@ class MongoDB(DBConnection):
* MongoDB server versions: 4.0 or higher
* Spark versions: 3.2.x - 3.4.x
* Java versions: 8 - 20
+ * Scala versions: 2.11 - 2.13
See `official documentation `_.
@@ -82,7 +83,7 @@ class MongoDB(DBConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
Parameters
----------
@@ -124,7 +125,7 @@ class MongoDB(DBConnection):
from pyspark.sql import SparkSession
# Create Spark session with MongoDB connector loaded
- maven_packages = Greenplum.get_packages(spark_version="3.2")
+ maven_packages = MongoDB.get_packages(spark_version="3.2")
spark = (
SparkSession.builder.appName("spark-app-name")
.config("spark.jars.packages", ",".join(maven_packages))
@@ -206,6 +207,7 @@ def get_packages(
if scala_ver.digits(2) < (2, 12) or scala_ver.digits(2) > (2, 13):
raise ValueError(f"Scala version must be 2.12 - 2.13, got {scala_ver}")
+ # https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector
return [f"org.mongodb.spark:mongo-spark-connector_{scala_ver.digits(2)}:10.1.1"]
@classproperty
@@ -504,6 +506,16 @@ def write_df_to_target(
else "append"
)
+ if self._collection_exists(target):
+ if write_options.if_exists == MongoDBCollectionExistBehavior.ERROR:
+ raise ValueError("Operation stopped due to MongoDB.WriteOptions(if_exists='error')")
+ elif write_options.if_exists == MongoDBCollectionExistBehavior.IGNORE:
+ log.info(
+ "|%s| Skip writing to existing collection because of MongoDB.WriteOptions(if_exists='ignore')",
+ self.__class__.__name__,
+ )
+ return
+
log.info("|%s| Saving data to a collection %r", self.__class__.__name__, target)
df.write.format("mongodb").mode(mode).options(**write_options_dict).save()
log.info("|%s| Collection %r is successfully written", self.__class__.__name__, target)
@@ -533,3 +545,13 @@ def _check_java_class_imported(cls, spark):
log.debug("Missing Java class", exc_info=e, stack_info=True)
raise ValueError(msg) from e
return spark
+
+ def _collection_exists(self, source: str) -> bool:
+ jvm = self.spark._jvm
+ client = jvm.com.mongodb.client.MongoClients.create(self.connection_url) # type: ignore
+ collections = set(client.getDatabase(self.database).listCollectionNames().iterator())
+ if source in collections:
+ log.info("|%s| Collection %r exists", self.__class__.__name__, source)
+ return True
+ log.info("|%s| Collection %r does not exist", self.__class__.__name__, source)
+ return False
diff --git a/onetl/connection/db_connection/mongodb/options.py b/onetl/connection/db_connection/mongodb/options.py
index 85f1935a3..13c256aff 100644
--- a/onetl/connection/db_connection/mongodb/options.py
+++ b/onetl/connection/db_connection/mongodb/options.py
@@ -81,6 +81,8 @@
class MongoDBCollectionExistBehavior(str, Enum):
APPEND = "append"
+ IGNORE = "ignore"
+ ERROR = "error"
REPLACE_ENTIRE_COLLECTION = "replace_entire_collection"
def __str__(self) -> str:
@@ -207,33 +209,52 @@ class MongoDBWriteOptions(GenericOptions):
.. dropdown:: Behavior in details
- * Collection does not exist
- Collection is created using options provided by user
- (``shardkey`` and others).
+ * Collection does not exist
+ Collection is created using options provided by user
+ (``shardkey`` and others).
- * Collection exists
- Data is appended to a collection.
+ * Collection exists
+ Data is appended to a collection.
- .. warning::
+ .. warning::
- This mode does not check whether collection already contains
- objects from dataframe, so duplicated objects can be created.
+ This mode does not check whether collection already contains
+ objects from dataframe, so duplicated objects can be created.
* ``replace_entire_collection``
**Collection is deleted and then created**.
.. dropdown:: Behavior in details
- * Collection does not exist
- Collection is created using options provided by user
- (``shardkey`` and others).
+ * Collection does not exist
+ Collection is created using options provided by user
+ (``shardkey`` and others).
- * Collection exists
- Collection content is replaced with dataframe content.
+ * Collection exists
+ Collection content is replaced with dataframe content.
- .. note::
+ * ``ignore``
+ Ignores the write operation if the collection already exists.
+
+ .. dropdown:: Behavior in details
+
+ * Collection does not exist
+ Collection is created using options provided by user
+
+ * Collection exists
+ The write operation is ignored, and no data is written to the collection.
+
+ * ``error``
+ Raises an error if the collection already exists.
+
+ .. dropdown:: Behavior in details
+
+ * Collection does not exist
+ Collection is created using options provided by user
+
+ * Collection exists
+ An error is raised, and no data is written to the collection.
- ``error`` and ``ignore`` modes are not supported.
"""
class Config:
diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py
index 49fc825d9..6738c2541 100644
--- a/onetl/connection/db_connection/mssql/connection.py
+++ b/onetl/connection/db_connection/mssql/connection.py
@@ -64,7 +64,7 @@ class MSSQL(JDBCConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py
index 868731eaf..abd17df33 100644
--- a/onetl/connection/db_connection/mysql/connection.py
+++ b/onetl/connection/db_connection/mysql/connection.py
@@ -63,7 +63,7 @@ class MySQL(JDBCConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py
index 69d7e2c5b..2e1f3e916 100644
--- a/onetl/connection/db_connection/oracle/connection.py
+++ b/onetl/connection/db_connection/oracle/connection.py
@@ -103,7 +103,7 @@ class Oracle(JDBCConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py
index eb07a68f6..22b42c296 100644
--- a/onetl/connection/db_connection/postgres/connection.py
+++ b/onetl/connection/db_connection/postgres/connection.py
@@ -61,7 +61,7 @@ class Postgres(JDBCConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py
index 7e730f9eb..2c797b3d8 100644
--- a/onetl/connection/db_connection/teradata/connection.py
+++ b/onetl/connection/db_connection/teradata/connection.py
@@ -66,7 +66,7 @@ class Teradata(JDBCConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/file_connection/file_connection.py b/onetl/connection/file_connection/file_connection.py
index 39e27f2c6..cc5ebbb9e 100644
--- a/onetl/connection/file_connection/file_connection.py
+++ b/onetl/connection/file_connection/file_connection.py
@@ -17,6 +17,7 @@
import os
import threading
from abc import abstractmethod
+from contextlib import suppress
from logging import getLogger
from typing import Any, Iterable, Iterator
@@ -72,8 +73,10 @@ def client(self):
if client and not self._is_client_closed(client):
return client
except AttributeError:
- self._clients_cache.client = self._get_client()
- return self._clients_cache.client
+ pass
+
+ self._clients_cache.client = self._get_client()
+ return self._clients_cache.client
@slot
def close(self):
@@ -112,8 +115,14 @@ def close(self):
except AttributeError:
return self
- self._close_client(client)
- del self._clients_cache.client
+ with suppress(Exception):
+ # exceptions while closing client should be ignored
+ self._close_client(client)
+
+ with suppress(Exception):
+ # .close() could be called from destructor, and modifying self is not allowed here
+ del self._clients_cache.client
+
return self
def __enter__(self):
@@ -122,6 +131,10 @@ def __enter__(self):
def __exit__(self, _exc_type, _exc_value, _traceback):
self.close()
+ def __del__(self): # noqa: WPS603
+ # If current object is collected by GC, close opened connection
+ self.close()
+
@slot
def check(self):
log.info("|%s| Checking connection availability...", self.__class__.__name__)
diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py
index 6710a4303..b7dd82257 100644
--- a/onetl/connection/file_connection/ftp.py
+++ b/onetl/connection/file_connection/ftp.py
@@ -68,7 +68,7 @@ class FTP(FileConnection, RenameDirMixin):
# or
pip install onetl[files]
- See :ref:`files-install` instruction for more details.
+ See :ref:`install-files` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py
index 211ff6030..dfcd05553 100644
--- a/onetl/connection/file_connection/ftps.py
+++ b/onetl/connection/file_connection/ftps.py
@@ -69,7 +69,7 @@ class FTPS(FTP):
# or
pip install onetl[files]
- See :ref:`files-install` instruction for more details.
+ See :ref:`install-files` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py
index 2419aae2f..aa58f7e0a 100644
--- a/onetl/connection/file_connection/hdfs/connection.py
+++ b/onetl/connection/file_connection/hdfs/connection.py
@@ -72,14 +72,14 @@ class HDFS(FileConnection, RenameDirMixin):
# or
pip install onetl[files]
- See :ref:`files-install` instruction for more details.
+ See :ref:`install-files` installation instruction for more details.
.. note::
To access Hadoop cluster with Kerberos installed, you should have ``kinit`` executable
in some path in ``PATH`` environment variable.
- See onETL :ref:`kerberos-install` instruction for more details.
+ See :ref:`install-kerberos` instruction for more details.
Parameters
----------
diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py
index 7198a05aa..2f8d298f1 100644
--- a/onetl/connection/file_connection/s3.py
+++ b/onetl/connection/file_connection/s3.py
@@ -67,7 +67,7 @@ class S3(FileConnection):
# or
pip install onetl[files]
- See :ref:`files-install` instruction for more details.
+ See :ref:`install-files` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py
new file mode 100644
index 000000000..bef7ed276
--- /dev/null
+++ b/onetl/connection/file_connection/samba.py
@@ -0,0 +1,327 @@
+# Copyright 2023 MTS (Mobile Telesystems)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import os
+import stat
+import textwrap
+from io import BytesIO
+from logging import getLogger
+from pathlib import Path
+from typing import Optional, Union
+
+from etl_entities.instance import Host
+from pydantic import SecretStr, validator
+from typing_extensions import Literal
+
+from onetl.connection.file_connection.file_connection import FileConnection
+from onetl.hooks import slot, support_hooks
+from onetl.impl import LocalPath, RemotePath, RemotePathStat
+
+try:
+ from smb.smb_structs import OperationFailure
+ from smb.SMBConnection import SMBConnection
+except (ImportError, NameError) as e:
+ raise ImportError(
+ textwrap.dedent(
+ """
+ Cannot import module "pysmb".
+
+ You should install package as follows:
+ pip install onetl[samba]
+
+ or
+ pip install onetl[files]
+ """,
+ ).strip(),
+ ) from e
+
+
+log = getLogger(__name__)
+
+
+@support_hooks
+class Samba(FileConnection):
+ """Samba file connection. |support_hooks|
+
+ Based on `pysmb library `_.
+
+ .. versionadded:: 0.9.4
+
+ .. warning::
+
+ To use Samba connector you should install package as follows:
+
+ .. code:: bash
+
+ pip install onetl[samba]
+
+ # or
+ pip install onetl[files]
+
+ See :ref:`install-files` installation instruction for more details.
+
+ Parameters
+ ----------
+ host : str
+ Host of Samba source. For example: ``mydomain.com``.
+
+ share : str
+ The name of the share on the Samba server.
+
+ protocol : str, default: ``SMB``
+ The protocol to use for the connection. Either ``SMB`` or ``NetBIOS``.
+ Affects the default port and the ``is_direct_tcp`` flag in ``SMBConnection``.
+
+ port : int, default: 445
+ Port of Samba source.
+
+ domain : str, default: ``''``
+ Domain name for the Samba connection. An empty string means ``host`` is used as the domain name.
+
+ auth_type : str, default: ``NTLMv2``
+ The authentication type to use. Either ``NTLMv2`` or ``NTLMv1``.
+ Affects the ``use_ntlm_v2`` flag in ``SMBConnection``.
+
+ user : str, default: None
+ User which has access to the file source. Can be ``None`` for anonymous connection.
+
+ password : str, default: None
+ Password for the file source connection. Can be ``None`` for anonymous connection.
+
+ Examples
+ --------
+
+ Samba file connection initialization
+
+ .. code:: python
+
+ from onetl.connection import Samba
+
+ samba = Samba(
+ host="mydomain.com",
+ share="share_name",
+ protocol="SMB",
+ port=445,
+ user="user",
+ password="password",
+ )
+
+
+ """
+
+ host: Host
+ share: str
+ protocol: Union[Literal["SMB"], Literal["NetBIOS"]] = "SMB"
+ port: Optional[int] = None
+ domain: Optional[str] = ""
+ auth_type: Union[Literal["NTLMv1"], Literal["NTLMv2"]] = "NTLMv2"
+ user: Optional[str] = None
+ password: Optional[SecretStr] = None
+
+ @property
+ def instance_url(self) -> str:
+ return f"smb://{self.host}:{self.port}"
+
+ @slot
+ def check(self):
+ log.info("|%s| Checking connection availability...", self.__class__.__name__)
+ self._log_parameters()
+ try:
+ available_shares = {share.name for share in self.client.listShares()}
+ if self.share in available_shares:
+ log.info("|%s| Connection is available.", self.__class__.__name__)
+ else:
+ log.error(
+ "|%s| Share %r not found among existing shares %r",
+ self.__class__.__name__,
+ self.share,
+ available_shares,
+ )
+ raise ConnectionError("Failed to connect to the Samba server.")
+ except Exception as exc:
+ log.exception("|%s| Connection is unavailable", self.__class__.__name__)
+ raise RuntimeError("Connection is unavailable") from exc
+
+ return self
+
+ @slot
+ def path_exists(self, path: os.PathLike | str) -> bool:
+ try:
+ self.client.getAttributes(self.share, os.fspath(path))
+ return True
+ except OperationFailure:
+ return False
+
+ def _scan_entries(self, path: RemotePath) -> list:
+ if self._is_dir(path):
+ return [
+ entry
+ for entry in self.client.listPath(
+ self.share,
+ os.fspath(path),
+ )
+ if entry.filename not in {".", ".."} # Filter out '.' and '..'
+ ]
+ return [self.client.getAttributes(self.share, os.fspath(path))]
+
+ def _extract_name_from_entry(self, entry) -> str:
+ return entry.filename
+
+ def _is_dir_entry(self, top: RemotePath, entry) -> bool:
+ return entry.isDirectory
+
+ def _is_file_entry(self, top: RemotePath, entry) -> bool:
+ return not entry.isDirectory
+
+ def _extract_stat_from_entry(self, top: RemotePath, entry) -> RemotePathStat:
+ if entry.isDirectory:
+ return RemotePathStat(st_mode=stat.S_IFDIR)
+
+ return RemotePathStat(
+ st_size=entry.file_size,
+ st_mtime=entry.last_write_time,
+ st_uid=entry.filename,
+ )
+
+ def _get_client(self) -> SMBConnection:
+ is_direct_tcp = self.protocol == "SMB"
+ use_ntlm_v2 = self.auth_type == "NTLMv2"
+ conn = SMBConnection(
+ username=self.user,
+ password=self.password.get_secret_value() if self.password else None,
+ my_name="onetl",
+ remote_name=self.host,
+ domain=self.domain,
+ use_ntlm_v2=use_ntlm_v2,
+ sign_options=2,
+ is_direct_tcp=is_direct_tcp,
+ )
+ conn.connect(self.host, port=self.port)
+ return conn
+
+ def _is_client_closed(self, client: SMBConnection) -> bool:
+ try:
+ socket_fileno = client.sock.fileno()
+ except (AttributeError, OSError):
+ return True
+
+ return socket_fileno == -1
+
+ def _close_client(self, client: SMBConnection) -> None:
+ client.close()
+
+ def _download_file(self, remote_file_path: RemotePath, local_file_path: LocalPath) -> None:
+ with open(local_file_path, "wb") as local_file:
+ self.client.retrieveFile(
+ self.share,
+ os.fspath(remote_file_path),
+ local_file,
+ )
+
+ def _get_stat(self, path: RemotePath) -> RemotePathStat:
+ info = self.client.getAttributes(self.share, os.fspath(path))
+
+ if self.is_dir(os.fspath(path)):
+ return RemotePathStat(st_mode=stat.S_IFDIR)
+
+ return RemotePathStat(
+ st_size=info.file_size,
+ st_mtime=info.last_write_time,
+ st_uid=info.filename,
+ )
+
+ def _remove_file(self, remote_file_path: RemotePath) -> None:
+ self.client.deleteFiles(
+ self.share,
+ os.fspath(remote_file_path),
+ )
+
+ def _create_dir(self, path: RemotePath) -> None:
+ path_obj = Path(path)
+ for parent in reversed(path_obj.parents):
+ # create dirs sequentially as .createDirectory(...) cannot create nested dirs
+ try:
+ self.client.getAttributes(self.share, os.fspath(parent))
+ except OperationFailure:
+ self.client.createDirectory(self.share, os.fspath(parent))
+
+ self.client.createDirectory(self.share, os.fspath(path))
+
+ def _upload_file(self, local_file_path: LocalPath, remote_file_path: RemotePath) -> None:
+ with open(local_file_path, "rb") as file_obj:
+ self.client.storeFile(
+ self.share,
+ os.fspath(remote_file_path),
+ file_obj,
+ )
+
+ def _rename_file(self, source: RemotePath, target: RemotePath) -> None:
+ self.client.rename(
+ self.share,
+ os.fspath(source),
+ os.fspath(target),
+ )
+
+ def _remove_dir(self, path: RemotePath) -> None:
+ files = self.client.listPath(self.share, os.fspath(path))
+
+ for item in files:
+ if item.filename not in {".", ".."}: # skip current and parent directory entries
+ full_path = path / item.filename
+ if item.isDirectory:
+ # recursively delete subdirectory
+ self._remove_dir(full_path)
+ else:
+ self.client.deleteFiles(self.share, os.fspath(full_path))
+
+ self.client.deleteDirectory(self.share, os.fspath(path))
+
+ def _read_text(self, path: RemotePath, encoding: str) -> str:
+ return self._read_bytes(path).decode(encoding)
+
+ def _read_bytes(self, path: RemotePath) -> bytes:
+ file_obj = BytesIO()
+ self.client.retrieveFile(
+ self.share,
+ os.fspath(path),
+ file_obj,
+ )
+ file_obj.seek(0)
+ return file_obj.read()
+
+ def _write_text(self, path: RemotePath, content: str, encoding: str) -> None:
+ self._write_bytes(path, bytes(content, encoding))
+
+ def _write_bytes(self, path: RemotePath, content: bytes) -> None:
+ file_obj = BytesIO(content)
+
+ self.client.storeFile(
+ self.share,
+ os.fspath(path),
+ file_obj,
+ )
+
+ def _is_dir(self, path: RemotePath) -> bool:
+ return self.client.getAttributes(self.share, os.fspath(path)).isDirectory
+
+ def _is_file(self, path: RemotePath) -> bool:
+ return not self.client.getAttributes(self.share, os.fspath(path)).isDirectory
+
+ @validator("port", pre=True, always=True)
+ def _set_port_based_on_protocol(cls, port, values):
+ if port is None:
+ return 445 if values.get("protocol") == "SMB" else 139
+ return port
diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py
index 3b84df658..bef53ce2d 100644
--- a/onetl/connection/file_connection/sftp.py
+++ b/onetl/connection/file_connection/sftp.py
@@ -71,7 +71,7 @@ class SFTP(FileConnection, RenameDirMixin):
# or
pip install onetl[files]
- See :ref:`files-install` instruction for more details.
+ See :ref:`install-files` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py
index 52aab0419..9825a0525 100644
--- a/onetl/connection/file_connection/webdav.py
+++ b/onetl/connection/file_connection/webdav.py
@@ -70,7 +70,7 @@ class WebDAV(FileConnection, RenameDirMixin):
# or
pip install onetl[files]
- See :ref:`files-install` instruction for more details.
+ See :ref:`install-files` installation instruction for more details.
Parameters
----------
diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py
index 04bdfae48..6855fe595 100644
--- a/onetl/connection/file_df_connection/spark_hdfs/connection.py
+++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py
@@ -17,6 +17,7 @@
import getpass
import logging
import os
+from contextlib import suppress
from pathlib import Path
from typing import TYPE_CHECKING, Optional
@@ -58,12 +59,12 @@ class SparkHDFS(SparkFileDFConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
.. note::
Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit``
- **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`kerberos-install`.
+ **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`.
In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options
to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS.
@@ -224,10 +225,16 @@ def close(self):
"""
log.debug("Reset FileSystem cache")
- self._get_spark_fs().close()
- object.__setattr__(self, "_active_host", None) # noqa: WPS609
+ with suppress(Exception):
+ self._get_spark_fs().close()
+
+ with suppress(Exception):
+ self._active_host = None
return self
+ # Do not add __del__ calling .close(), like other connections do,
+ # because this can influence dataframes created by this connection
+
@slot
@classmethod
def get_current(cls, spark: SparkSession):
@@ -360,7 +367,7 @@ def _convert_to_url(self, path: PurePathProtocol) -> str:
else:
host = self._get_host()
# cache value to avoid getting active namenode for every path
- object.__setattr__(self, "_active_host", host) # noqa: WPS609
+ self._active_host = host
return f"hdfs://{host}:{self.ipc_port}" + path.as_posix()
def _get_default_path(self):
diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py
index b914c714f..264fac3a2 100644
--- a/onetl/connection/file_df_connection/spark_local_fs.py
+++ b/onetl/connection/file_df_connection/spark_local_fs.py
@@ -49,7 +49,7 @@ class SparkLocalFS(SparkFileDFConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
.. warning::
diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py
index 0fd72a0ca..992e11627 100644
--- a/onetl/connection/file_df_connection/spark_s3/connection.py
+++ b/onetl/connection/file_df_connection/spark_s3/connection.py
@@ -16,6 +16,7 @@
import logging
import os
+from contextlib import suppress
from typing import TYPE_CHECKING, ClassVar, List, Optional
from etl_entities.instance import Host
@@ -63,6 +64,7 @@ class SparkS3(SparkFileDFConnection):
* Spark versions: 3.2.x - 3.4.x (only with Hadoop 3.x libraries)
* Java versions: 8 - 20
+ * Scala versions: 2.11 - 2.13
.. warning::
@@ -82,7 +84,7 @@ class SparkS3(SparkFileDFConnection):
# or
pip install onetl pyspark=3.4.1 # pass specific PySpark version
- See :ref:`spark-install` instruction for more details.
+ See :ref:`install-spark` installation instruction for more details.
.. note::
@@ -160,9 +162,17 @@ class SparkS3(SparkFileDFConnection):
# Create Spark session with Hadoop AWS libraries loaded
maven_packages = SparkS3.get_packages(spark_version="3.4.1")
+ # Some dependencies are not used, but downloading takes a lot of time. Skipping them.
+ excluded_packages = [
+ "com.google.cloud.bigdataoss:gcs-connector",
+ "org.apache.hadoop:hadoop-aliyun",
+ "org.apache.hadoop:hadoop-azure-datalake",
+ "org.apache.hadoop:hadoop-azure",
+ ]
spark = (
SparkSession.builder.appName("spark-app-name")
.config("spark.jars.packages", ",".join(maven_packages))
+ .config("spark.jars.excludes", ",".join(excluded_packages))
.config("spark.hadoop.fs.s3a.committer.magic.enabled", "true")
.config("spark.hadoop.fs.s3a.committer.name", "magic")
.config(
@@ -263,6 +273,7 @@ def get_packages(
raise ValueError(f"Spark version must be at least 3.x, got {spark_ver}")
scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver)
+ # https://mvnrepository.com/artifact/org.apache.spark/spark-hadoop-cloud
return [f"org.apache.spark:spark-hadoop-cloud_{scala_ver.digits(2)}:{spark_ver.digits(3)}"]
@slot
@@ -311,9 +322,13 @@ def close(self):
connection.close()
"""
- self._reset_hadoop_conf()
+ with suppress(Exception):
+ self._reset_hadoop_conf()
return self
+ # Do not add __del__ calling .close(), like other connections do,
+ # because this can influence dataframes created by this connection
+
@slot
def check(self):
self._patch_hadoop_conf()
diff --git a/onetl/file/format/__init__.py b/onetl/file/format/__init__.py
index d41c76aac..0c9d6b742 100644
--- a/onetl/file/format/__init__.py
+++ b/onetl/file/format/__init__.py
@@ -15,6 +15,7 @@
from onetl.file.format.avro import Avro
from onetl.file.format.csv import CSV
+from onetl.file.format.excel import Excel
from onetl.file.format.json import JSON
from onetl.file.format.jsonline import JSONLine
from onetl.file.format.orc import ORC
diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py
index 2fc5a1cb5..b0c58e18d 100644
--- a/onetl/file/format/avro.py
+++ b/onetl/file/format/avro.py
@@ -73,6 +73,7 @@ class Avro(ReadWriteFileFormat):
* Spark versions: 2.4.x - 3.4.x
* Java versions: 8 - 20
+ * Scala versions: 2.11 - 2.13
See documentation from link above.
@@ -131,6 +132,9 @@ def get_packages(
"""
Get package names to be downloaded by Spark. |support_hooks|
+ See `Maven package index `_
+ for all available packages.
+
Parameters
----------
spark_version : str
diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py
new file mode 100644
index 000000000..ffd11a5da
--- /dev/null
+++ b/onetl/file/format/excel.py
@@ -0,0 +1,220 @@
+# Copyright 2023 MTS (Mobile Telesystems)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, ClassVar
+
+from onetl._util.java import try_import_java_class
+from onetl._util.scala import get_default_scala_version
+from onetl._util.spark import get_spark_version
+from onetl._util.version import Version
+from onetl.exception import MISSING_JVM_CLASS_MSG
+from onetl.file.format.file_format import ReadWriteFileFormat
+from onetl.hooks import slot, support_hooks
+
+if TYPE_CHECKING:
+ from pyspark.sql import SparkSession
+
+READ_OPTIONS = frozenset(
+ (
+ "dataAddress",
+ "treatEmptyValuesAsNulls",
+ "setErrorCellsToFallbackValues",
+ "usePlainNumberFormat",
+ "inferSchema",
+ "addColorColumns",
+ "timestampFormat",
+ "maxRowsInMemory",
+ "maxByteArraySize",
+ "tempFileThreshold",
+ "excerptSize",
+ "workbookPassword",
+ ),
+)
+
+WRITE_OPTIONS = frozenset(
+ (
+ "dataAddress",
+ "dateFormat",
+ "timestampFormat",
+ ),
+)
+
+log = logging.getLogger(__name__)
+
+
+@support_hooks
+class Excel(ReadWriteFileFormat):
+ """
+ Excel file format. |support_hooks|
+
+ Based on `Spark Excel `_ file format.
+
+ Supports reading/writing files with ``.xlsx`` (read/write) and ``.xls`` (read only) extensions.
+
+ .. versionadded:: 0.9.4
+
+ .. dropdown:: Version compatibility
+
+ * Spark versions: 3.2.x - 3.4.x.
+
+ .. warning::
+
+ Not all combinations of Spark version and package version are supported.
+ See `Maven index `_
+ and `official documentation `_.
+
+ * Scala versions: 2.12 - 2.13
+ * Java versions: 8 - 20
+
+ See documentation from link above.
+
+ .. note ::
+
+ You can pass any option to the constructor, even if it is not mentioned in this documentation.
+ **Option names should be in** ``camelCase``!
+
+ The set of supported options depends on Spark version. See link above.
+
+ Examples
+ --------
+
+ Describe options how to read from/write to Excel file with specific options:
+
+ .. code:: python
+
+ from onetl.file.format import Excel
+ from pyspark.sql import SparkSession
+
+ # Create Spark session with Excel package loaded
+ maven_packages = Excel.get_packages(spark_version="3.4.1")
+ spark = (
+ SparkSession.builder.appName("spark-app-name")
+ .config("spark.jars.packages", ",".join(maven_packages))
+ .getOrCreate()
+ )
+
+ excel = Excel(
+ header=True,
+ inferSchema=True,
+ )
+
+ """
+
+ name: ClassVar[str] = "excel"
+
+ header: bool = False
+
+ class Config:
+ known_options = READ_OPTIONS | WRITE_OPTIONS
+ extra = "allow"
+
+ @slot
+ @classmethod
+ def get_packages(
+ cls,
+ spark_version: str,
+ scala_version: str | None = None,
+ package_version: str | None = None,
+ ) -> list[str]:
+ """
+ Get package names to be downloaded by Spark. |support_hooks|
+
+ .. warning::
+
+ Not all combinations of Spark version and package version are supported.
+ See `Maven index `_
+ and `official documentation `_.
+
+ Parameters
+ ----------
+ spark_version : str
+ Spark version in format ``major.minor.patch``.
+
+ scala_version : str, optional
+ Scala version in format ``major.minor``.
+
+ If ``None``, ``spark_version`` is used to determine Scala version.
+
+ package_version : str, optional
+ Package version in format ``major.minor.patch``. Default is ``0.19.0``.
+
+ .. warning::
+
+ Version ``0.14`` and below are not supported.
+
+ .. note::
+
+ It is not guaranteed that custom package versions are supported.
+ Tests are performed only for default version.
+
+ Examples
+ --------
+
+ .. code:: python
+
+ from onetl.file.format import Excel
+
+ Excel.get_packages(spark_version="3.4.1")
+ Excel.get_packages(spark_version="3.4.1", scala_version="2.13")
+ Excel.get_packages(
+ spark_version="3.4.1",
+ scala_version="2.13",
+ package_version="0.19.0",
+ )
+
+ """
+
+ if package_version:
+ version = Version.parse(package_version)
+ if version < (0, 15):
+ # format="com.crealytics.spark.excel" does not support reading folder with files
+ # format="excel" was added only in 0.14, but Maven package for 0.14 has different naming convention than recent versions.
+ # So using 0.15 as the lowest supported version.
+ raise ValueError(f"Package version should be at least 0.15, got {package_version}")
+ log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version)
+ else:
+ version = Version.parse("0.19.0")
+
+ spark_ver = Version.parse(spark_version)
+ if spark_ver < (3, 2):
+ # Actually, Spark 2.4 is supported, but packages are built only for Scala 2.12
+ # when default pyspark==2.4.1 is built with Scala 2.11.
+ # See https://github.com/crealytics/spark-excel/issues/426
+ raise ValueError(f"Spark version should be at least 3.2, got {spark_version}")
+
+ scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver)
+ if scala_ver.digits(2) < (2, 12):
+ raise ValueError(f"Scala version should be at least 2.12, got {scala_ver}")
+
+ return [f"com.crealytics:spark-excel_{scala_ver.digits(2)}:{spark_ver.digits(3)}_{version.digits(3)}"]
+
+ @slot
+ def check_if_supported(self, spark: SparkSession) -> None:
+ java_class = "com.crealytics.spark.excel.v2.ExcelDataSource"
+
+ try:
+ try_import_java_class(spark, java_class)
+ except Exception as e:
+ spark_version = get_spark_version(spark)
+ msg = MISSING_JVM_CLASS_MSG.format(
+ java_class=java_class,
+ package_source=self.__class__.__name__,
+ args=f"spark_version='{spark_version}'",
+ )
+ if log.isEnabledFor(logging.DEBUG):
+ log.debug("Missing Java class", exc_info=e, stack_info=True)
+ raise ValueError(msg) from e
diff --git a/pytest.ini b/pytest.ini
index 5e40e75d7..3c71e8eb6 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -24,5 +24,6 @@ markers =
postgres: Postgres tests
s3: S3 tests
sftp: SFTP tests
+ samba: Samba tests
teradata: Teradata tests
webdav: WebDAV tests
diff --git a/requirements/docs.txt b/requirements/docs.txt
index d3fc9555e..4ff1db3e9 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -3,10 +3,10 @@ furo
importlib-resources<6
numpydoc
pygments-csv-lexer
-# https://github.com/sphinx-doc/sphinx/issues/11662
-sphinx<7.2.5
+sphinx
sphinx-copybutton
sphinx-design
+sphinx-plantuml
sphinx-tabs
sphinx-toolbox
sphinx_substitution_extensions
diff --git a/requirements/samba.txt b/requirements/samba.txt
new file mode 100644
index 000000000..619ee4f51
--- /dev/null
+++ b/requirements/samba.txt
@@ -0,0 +1 @@
+pysmb
diff --git a/requirements/tests/samba.txt b/requirements/tests/samba.txt
new file mode 100644
index 000000000..619ee4f51
--- /dev/null
+++ b/requirements/tests/samba.txt
@@ -0,0 +1 @@
+pysmb
diff --git a/requirements/tests/spark-3.2.3.txt b/requirements/tests/spark-3.2.4.txt
similarity index 80%
rename from requirements/tests/spark-3.2.3.txt
rename to requirements/tests/spark-3.2.4.txt
index 44291430a..1acafab9a 100644
--- a/requirements/tests/spark-3.2.3.txt
+++ b/requirements/tests/spark-3.2.4.txt
@@ -1,5 +1,5 @@
numpy>=1.16,<1.24
pandas>=1.0,<2
pyarrow>=1.0
-pyspark==3.2.3
+pyspark==3.2.4
sqlalchemy<2.0
diff --git a/setup.py b/setup.py
index 422085535..f8b560707 100644
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@ def parse_requirements(file: Path) -> list[str]:
requirements_ftp = parse_requirements(here / "requirements" / "ftp.txt")
requirements_sftp = parse_requirements(here / "requirements" / "sftp.txt")
+requirements_samba = parse_requirements(here / "requirements" / "samba.txt")
requirements_hdfs = parse_requirements(here / "requirements" / "hdfs.txt")
requirements_s3 = parse_requirements(here / "requirements" / "s3.txt")
requirements_webdav = parse_requirements(here / "requirements" / "webdav.txt")
@@ -90,6 +91,7 @@ def parse_requirements(file: Path) -> list[str]:
"ftp": requirements_ftp,
"ftps": requirements_ftp,
"sftp": requirements_sftp,
+ "samba": requirements_samba,
"hdfs": requirements_hdfs,
"s3": requirements_s3,
"webdav": requirements_webdav,
diff --git a/tests/fixtures/connections/file_connections.py b/tests/fixtures/connections/file_connections.py
index e8ef7253e..f44240894 100644
--- a/tests/fixtures/connections/file_connections.py
+++ b/tests/fixtures/connections/file_connections.py
@@ -12,6 +12,7 @@
lazy_fixture("hdfs_file_connection"),
lazy_fixture("s3_file_connection"),
lazy_fixture("sftp_file_connection"),
+ lazy_fixture("samba_file_connection"),
lazy_fixture("webdav_file_connection"),
],
)
@@ -26,6 +27,7 @@ def file_connection(request):
lazy_fixture("hdfs_file_connection_with_path"),
lazy_fixture("s3_file_connection_with_path"),
lazy_fixture("sftp_file_connection_with_path"),
+ lazy_fixture("samba_file_connection_with_path"),
lazy_fixture("webdav_file_connection_with_path"),
],
)
@@ -40,6 +42,7 @@ def file_connection_with_path(request):
lazy_fixture("hdfs_file_connection_with_path_and_files"),
lazy_fixture("s3_file_connection_with_path_and_files"),
lazy_fixture("sftp_file_connection_with_path_and_files"),
+ lazy_fixture("samba_file_connection_with_path_and_files"),
lazy_fixture("webdav_file_connection_with_path_and_files"),
],
)
diff --git a/tests/fixtures/connections/samba.py b/tests/fixtures/connections/samba.py
new file mode 100644
index 000000000..52a294d5b
--- /dev/null
+++ b/tests/fixtures/connections/samba.py
@@ -0,0 +1,63 @@
+import os
+from collections import namedtuple
+from pathlib import PurePosixPath
+
+import pytest
+
+from tests.util.upload_files import upload_files
+
+
+@pytest.fixture(
+ scope="session",
+ params=[
+ pytest.param("real-samba", marks=[pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection]),
+ ],
+)
+def samba_server():
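+ # connection parameters are taken from the ONETL_SAMBA_* environment variables of the test environment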
+ SambaServer = namedtuple("SambaServer", ["host", "protocol", "port", "share", "user", "password"])
+
+ return SambaServer(
+ host=os.getenv("ONETL_SAMBA_HOST"),
+ protocol=os.getenv("ONETL_SAMBA_PROTOCOL"),
+ port=os.getenv("ONETL_SAMBA_PORT"),
+ share=os.getenv("ONETL_SAMBA_SHARE"),
+ user=os.getenv("ONETL_SAMBA_USER"),
+ password=os.getenv("ONETL_SAMBA_PASSWORD"),
+ )
+
+
+@pytest.fixture()
+def samba_file_connection(samba_server):
+ from onetl.connection import Samba
+
+ return Samba(
+ host=samba_server.host,
+ protocol=samba_server.protocol,
+ port=samba_server.port,
+ share=samba_server.share,
+ user=samba_server.user,
+ password=samba_server.password,
+ )
+
+
+@pytest.fixture()
+def samba_file_connection_with_path(request, samba_file_connection):
+ connection = samba_file_connection
+ root = PurePosixPath("/data")
+
+ def finalizer():
+ connection.remove_dir(root, recursive=True)
+
+ request.addfinalizer(finalizer)
+
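+ # make sure the directory does not exist before the test starts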
+ connection.remove_dir(root, recursive=True)
+
+ return connection, root
+
+
+@pytest.fixture()
+def samba_file_connection_with_path_and_files(resource_path, samba_file_connection_with_path):
+ connection, upload_to = samba_file_connection_with_path
+ upload_from = resource_path / "file_connection"
+ files = upload_files(upload_from, upload_to, connection)
+ return connection, upload_to, files
diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py
index 2135f3b52..05358b9c0 100644
--- a/tests/fixtures/spark.py
+++ b/tests/fixtures/spark.py
@@ -44,7 +44,7 @@ def maven_packages():
SparkS3,
Teradata,
)
- from onetl.file.format import Avro
+ from onetl.file.format import Avro, Excel
pyspark_version = get_pyspark_version()
packages = (
@@ -74,9 +74,23 @@ def maven_packages():
# There is no MongoDB connector for Spark less than 3.2
packages.extend(MongoDB.get_packages(spark_version=pyspark_version))
+ # There is no Excel file support for Spark less than 3.2
+ packages.extend(Excel.get_packages(spark_version=pyspark_version))
+
return packages
+@pytest.fixture(scope="session")
+def excluded_packages():
+ # These packages are a part of org.apache.spark:spark-hadoop-cloud, but not used in tests
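+ # spark.jars.excludes accepts "groupId:artifactId" coordinates, without versions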
+ return [
+ "com.google.cloud.bigdataoss:gcs-connector",
+ "org.apache.hadoop:hadoop-aliyun",
+ "org.apache.hadoop:hadoop-azure-datalake",
+ "org.apache.hadoop:hadoop-azure",
+ ]
+
+
@pytest.fixture(
scope="session",
name="spark",
@@ -84,13 +98,14 @@ def maven_packages():
pytest.param("real-spark", marks=[pytest.mark.db_connection, pytest.mark.connection]),
],
)
-def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, maven_packages):
+def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, maven_packages, excluded_packages):
from pyspark.sql import SparkSession
spark = (
SparkSession.builder.config("spark.app.name", "onetl") # noqa: WPS221
.config("spark.master", "local[*]")
.config("spark.jars.packages", ",".join(maven_packages))
+ .config("spark.jars.excludes", ",".join(excluded_packages))
.config("spark.jars.ivySettings", os.fspath(ivysettings_path))
.config("spark.driver.memory", "1g")
.config("spark.driver.maxResultSize", "1g")
diff --git a/tests/resources/file_df_connection/generate_files.py b/tests/resources/file_df_connection/generate_files.py
index 874045f8c..698c81ea7 100755
--- a/tests/resources/file_df_connection/generate_files.py
+++ b/tests/resources/file_df_connection/generate_files.py
@@ -14,10 +14,13 @@
from contextlib import contextmanager
from datetime import date, datetime, timezone
from pathlib import Path
+from tempfile import gettempdir
from typing import TYPE_CHECKING, Any, Iterator, TextIO
+from zipfile import ZipFile
if TYPE_CHECKING:
from avro.schema import Schema as AvroSchema
+ from pandas import DataFrame as PandasDataFrame
from pyarrow import Schema as ArrowSchema
from pyarrow import Table as ArrowTable
@@ -85,6 +88,12 @@ def get_data() -> list[dict]:
]
+def get_pandas_dataframe(data: list[dict]) -> PandasDataFrame:
+ import pandas as pd
+
+ return pd.DataFrame(data)
+
+
def get_pyarrow_schema() -> ArrowSchema:
import pyarrow as pa
@@ -382,6 +391,87 @@ def save_as_avro(data: list[dict], path: Path) -> None:
save_as_avro_snappy(data, root / "with_compression")
+def save_as_xls_with_options(
+ data: list[dict],
+ path: Path,
+ index: bool = False,
+ **kwargs,
+) -> None:
+ # required to register xlwt writer which supports generating .xls files
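+ # (xlwt writes the legacy binary .xls format used by Excel 97-2003)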
+ import pandas_xlwt
+
+ path.mkdir(parents=True, exist_ok=True)
+ file = path / "file.xls"
+
+ df = get_pandas_dataframe(data)
+ df["datetime_value"] = df.datetime_value.dt.tz_localize(None)
+ df.to_excel(file, index=index, engine="xlwt", **kwargs)
+
+
+def make_zip_deterministic(path: Path) -> None:
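+ # .xlsx files are zip archives, so zip entry metadata affects the resulting file hash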
+ temp_dir = gettempdir()
+ file_copy = Path(shutil.copy(path, temp_dir))
+
+ with ZipFile(file_copy, "r") as original_file:
+ with ZipFile(path, "w") as new_file:
+ for item in original_file.infolist():
+ if item.filename == "docProps/core.xml":
+ # this file contains modification time, which produces files with different hashes
+ continue
+ # reset modification time of all files
+ item.date_time = (1980, 1, 1, 0, 0, 0)
+ new_file.writestr(item, original_file.read(item.filename))
+
+
+def save_as_xlsx_with_options(
+ data: list[dict],
+ path: Path,
+ index: bool = False,
+ **kwargs,
+) -> None:
+ path.mkdir(parents=True, exist_ok=True)
+ file = path / "file.xls"
+
+ df = get_pandas_dataframe(data)
+ df["datetime_value"] = df.datetime_value.dt.tz_localize(None)
+ df.to_excel(file, index=index, engine="openpyxl", **kwargs)
+ make_zip_deterministic(file)
+
+
+def save_as_xlsx(data: list[dict], path: Path) -> None:
+ root = path / "xlsx"
+ shutil.rmtree(root, ignore_errors=True)
+ root.mkdir(parents=True, exist_ok=True)
+
+ save_as_xlsx_with_options(data, root / "without_header", header=False)
+ save_as_xlsx_with_options(data, root / "with_header", header=True)
+ save_as_xlsx_with_options(
+ data,
+ root / "with_data_address",
+ header=False,
+ sheet_name="ABC",
+ startcol=10,
+ startrow=5,
+ )
+
+
+def save_as_xls(data: list[dict], path: Path) -> None:
+ root = path / "xls"
+ shutil.rmtree(root, ignore_errors=True)
+ root.mkdir(parents=True, exist_ok=True)
+
+ save_as_xls_with_options(data, root / "without_header", header=False)
+ save_as_xls_with_options(data, root / "with_header", header=True)
+ save_as_xls_with_options(
+ data,
+ root / "with_data_address",
+ header=False,
+ sheet_name="ABC",
+ startcol=10,
+ startrow=5,
+ )
+
+
format_mapping = {
"csv": save_as_csv,
"json": save_as_json,
@@ -389,6 +479,8 @@ def save_as_avro(data: list[dict], path: Path) -> None:
"orc": save_as_orc,
"parquet": save_as_parquet,
"avro": save_as_avro,
+ "xlsx": save_as_xlsx,
+ "xls": save_as_xls,
}
diff --git a/tests/resources/file_df_connection/xls/with_data_address/file.xls b/tests/resources/file_df_connection/xls/with_data_address/file.xls
new file mode 100644
index 000000000..28288eb8e
Binary files /dev/null and b/tests/resources/file_df_connection/xls/with_data_address/file.xls differ
diff --git a/tests/resources/file_df_connection/xls/with_header/file.xls b/tests/resources/file_df_connection/xls/with_header/file.xls
new file mode 100644
index 000000000..efb43b4a9
Binary files /dev/null and b/tests/resources/file_df_connection/xls/with_header/file.xls differ
diff --git a/tests/resources/file_df_connection/xls/without_header/file.xls b/tests/resources/file_df_connection/xls/without_header/file.xls
new file mode 100644
index 000000000..420aa1107
Binary files /dev/null and b/tests/resources/file_df_connection/xls/without_header/file.xls differ
diff --git a/tests/resources/file_df_connection/xlsx/with_data_address/file.xls b/tests/resources/file_df_connection/xlsx/with_data_address/file.xls
new file mode 100644
index 000000000..bf2343c0a
Binary files /dev/null and b/tests/resources/file_df_connection/xlsx/with_data_address/file.xls differ
diff --git a/tests/resources/file_df_connection/xlsx/with_header/file.xls b/tests/resources/file_df_connection/xlsx/with_header/file.xls
new file mode 100644
index 000000000..b19c54d02
Binary files /dev/null and b/tests/resources/file_df_connection/xlsx/with_header/file.xls differ
diff --git a/tests/resources/file_df_connection/xlsx/without_header/file.xls b/tests/resources/file_df_connection/xlsx/without_header/file.xls
new file mode 100644
index 000000000..78632de24
Binary files /dev/null and b/tests/resources/file_df_connection/xlsx/without_header/file.xls differ
diff --git a/tests/resources/requirements.txt b/tests/resources/requirements.txt
index 56d154dd9..033953205 100644
--- a/tests/resources/requirements.txt
+++ b/tests/resources/requirements.txt
@@ -1,2 +1,5 @@
avro[snappy]
+openpyxl
+pandas
pyarrow
+pandas-xlwt
diff --git a/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py b/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py
index 3b47df0d0..778d3a20c 100644
--- a/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py
+++ b/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py
@@ -26,13 +26,16 @@ def test_spark_hdfs_check(hdfs_file_df_connection, caplog):
def test_spark_hdfs_file_connection_check_failed(spark):
from onetl.connection import SparkHDFS
- with pytest.raises(RuntimeError, match="Connection is unavailable"):
- SparkHDFS(
- cluster="rnd-dwh",
- host="hive1",
- port=1234,
- spark=spark,
- ).check()
+ wrong_hdfs = SparkHDFS(
+ cluster="rnd-dwh",
+ host="hive1",
+ port=1234,
+ spark=spark,
+ )
+
+ with wrong_hdfs:
+ with pytest.raises(RuntimeError, match="Connection is unavailable"):
+ wrong_hdfs.check()
def test_spark_hdfs_file_connection_check_with_hooks(spark, request, hdfs_server):
diff --git a/tests/tests_integration/test_file_format_integration/test_avro_integration.py b/tests/tests_integration/test_file_format_integration/test_avro_integration.py
index a73fa06c5..cb687776c 100644
--- a/tests/tests_integration/test_file_format_integration/test_avro_integration.py
+++ b/tests/tests_integration/test_file_format_integration/test_avro_integration.py
@@ -56,7 +56,7 @@ def test_avro_reader(
"""Reading Avro files working as expected on any Spark, Python and Java versions"""
spark_version = get_spark_version(spark)
if spark_version < (2, 4):
- pytest.skip("Avro only supported on Spark 2.4+")
+ pytest.skip("Avro files are supported on Spark 3.2+ only")
local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files
df = file_df_dataframe
@@ -76,10 +76,10 @@ def test_avro_reader(
@pytest.mark.parametrize(
- "path, options",
+ "options",
[
- ("without_compression", {}),
- ("with_compression", {"compression": "snappy"}),
+ {},
+ {"compression": "snappy"},
],
ids=["without_compression", "with_compression"],
)
@@ -88,13 +88,12 @@ def test_avro_writer(
local_fs_file_df_connection_with_path,
file_df_dataframe,
avro_schema,
- path,
options,
):
"""Written files can be read by Spark"""
spark_version = get_spark_version(spark)
if spark_version < (2, 4):
- pytest.skip("Avro only supported on Spark 2.4+")
+ pytest.skip("Avro files are supported on Spark 3.2+ only")
file_df_connection, source_path = local_fs_file_df_connection_with_path
df = file_df_dataframe
diff --git a/tests/tests_integration/test_file_format_integration/test_csv_integration.py b/tests/tests_integration/test_file_format_integration/test_csv_integration.py
index a6cd14591..289e88273 100644
--- a/tests/tests_integration/test_file_format_integration/test_csv_integration.py
+++ b/tests/tests_integration/test_file_format_integration/test_csv_integration.py
@@ -27,6 +27,7 @@ def test_csv_reader_with_infer_schema(
local_fs_file_df_connection_with_path_and_files,
file_df_dataframe,
):
+ """Reading CSV files with inferSchema=True working as expected on any Spark, Python and Java versions"""
file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files
df = file_df_dataframe
csv_root = source_path / "csv/without_header"
@@ -42,9 +43,13 @@ def test_csv_reader_with_infer_schema(
expected_df = df
- if get_spark_version(spark).major < 3:
+ spark_version = get_spark_version(spark)
+ if spark_version.major < 3:
# Spark 2 infers "date_value" as timestamp instead of date
expected_df = df.withColumn("date_value", col("date_value").cast("timestamp"))
+ elif spark_version < (3, 3):
+ # Spark 3.2 cannot infer "date_value", and returns it as a string
+ expected_df = df.withColumn("date_value", col("date_value").cast("string"))
# csv does not have header, so columns are named like "_c0", "_c1", etc
expected_df = reset_column_names(expected_df)
diff --git a/tests/tests_integration/test_file_format_integration/test_excel_integration.py b/tests/tests_integration/test_file_format_integration/test_excel_integration.py
new file mode 100644
index 000000000..de8cc9cf9
--- /dev/null
+++ b/tests/tests_integration/test_file_format_integration/test_excel_integration.py
@@ -0,0 +1,142 @@
+"""Integration tests for Excel file format.
+
+Test only that options are passed to Spark in both FileDFReader & FileDFWriter.
+Do not test all the possible options and combinations, we are not testing Spark here.
+"""
+
+import pytest
+
+from onetl._util.spark import get_spark_version
+from onetl.file import FileDFReader, FileDFWriter
+from onetl.file.format import Excel
+
+try:
+ from pyspark.sql.functions import col
+
+ from tests.util.assert_df import assert_equal_df
+ from tests.util.spark_df import reset_column_names
+except ImportError:
+ # pandas and spark can be missing if someone runs tests for file connections only
+ pass
+
+pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection]
+
+
+@pytest.mark.parametrize("format", ["xlsx", "xls"])
+def test_excel_reader_with_infer_schema(
+ spark,
+ local_fs_file_df_connection_with_path_and_files,
+ file_df_dataframe,
+ format,
+):
+ """Reading CSV files with inferSchema=True working as expected on any Spark, Python and Java versions"""
+ spark_version = get_spark_version(spark)
+ if spark_version < (3, 2):
+ pytest.skip("Excel files are supported on Spark 3.2+ only")
+
+ file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files
+ df = file_df_dataframe
+ excel_root = source_path / format / "without_header"
+
+ reader = FileDFReader(
+ connection=file_df_connection,
+ format=Excel(inferSchema=True),
+ source_path=excel_root,
+ )
+ read_df = reader.run()
+
+ assert read_df.count()
+
+ expected_df = df
+ # Spark infers "date_value" as timestamp instead of date
+ expected_df = expected_df.withColumn("date_value", col("date_value").cast("timestamp"))
+
+ # excel does not have header, so columns are named like "_c0", "_c1", etc
+ expected_df = reset_column_names(expected_df)
+
+ assert read_df.schema != df.schema
+ assert read_df.schema == expected_df.schema
+ assert_equal_df(read_df, expected_df)
+
+
+@pytest.mark.parametrize("format", ["xlsx", "xls"])
+@pytest.mark.parametrize(
+ "path, options",
+ [
+ ("without_header", {}),
+ ("with_header", {"header": True}),
+ ("with_data_address", {"dataAddress": "'ABC'!K6"}),
+ ],
+ ids=["without_header", "with_header", "with_data_address"],
+)
+def test_excel_reader_with_options(
+ spark,
+ local_fs_file_df_connection_with_path_and_files,
+ file_df_dataframe,
+ format,
+ path,
+ options,
+):
+ """Reading Excel files working as expected on any Spark, Python and Java versions"""
+ spark_version = get_spark_version(spark)
+ if spark_version < (3, 2):
+ pytest.skip("Excel files are supported on Spark 3.2+ only")
+
+ local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files
+ df = file_df_dataframe
+ excel_root = source_path / format / path
+
+ reader = FileDFReader(
+ connection=local_fs,
+ format=Excel.parse(options),
+ df_schema=df.schema,
+ source_path=excel_root,
+ )
+ read_df = reader.run()
+
+ assert read_df.count()
+ assert read_df.schema == df.schema
+ assert_equal_df(read_df, df)
+
+
+@pytest.mark.parametrize(
+ "options",
+ [
+ {},
+ {"header": True},
+ ],
+ ids=["without_header", "with_header"],
+)
+def test_excel_writer(
+ spark,
+ local_fs_file_df_connection_with_path,
+ file_df_dataframe,
+ options,
+):
+ """Written files can be read by Spark"""
+ spark_version = get_spark_version(spark)
+ if spark_version < (3, 2):
+ pytest.skip("Excel files are supported on Spark 3.2+ only")
+
+ file_df_connection, source_path = local_fs_file_df_connection_with_path
+ df = file_df_dataframe
+ excel_root = source_path / "excel"
+
+ writer = FileDFWriter(
+ connection=file_df_connection,
+ format=Excel.parse(options),
+ target_path=excel_root,
+ )
+ writer.run(df)
+
+ reader = FileDFReader(
+ connection=file_df_connection,
+ format=Excel.parse(options),
+ source_path=excel_root,
+ df_schema=df.schema,
+ )
+ read_df = reader.run()
+
+ assert read_df.count()
+ assert read_df.schema == df.schema
+ assert_equal_df(read_df, df)
diff --git a/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py b/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py
index ed290ab43..0a932dd46 100644
--- a/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py
+++ b/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py
@@ -635,10 +635,11 @@ def test_file_downloader_mode_replace_entire_directory(
caplog,
):
file_connection, remote_path, _ = file_connection_with_path_and_files
+ # Reason for using .resolve(): https://stackoverflow.com/a/58719476
if local_dir_exist:
- local_path = tmp_path_factory.mktemp("local_path")
+ local_path = tmp_path_factory.mktemp("local_path").resolve()
else:
- local_path = Path(tempfile.gettempdir()) / secrets.token_hex()
+ local_path = Path(tempfile.gettempdir()).resolve() / secrets.token_hex()
temp_file = local_path / secrets.token_hex(5)
if local_dir_exist:
@@ -755,7 +756,11 @@ def finalizer():
local_path=file.name,
)
- with pytest.raises(NotADirectoryError, match=rf"'{file.name}' \(kind='file', .*\) is not a directory"):
+ # Reason for .realpath(): https://stackoverflow.com/a/58719476
+ with pytest.raises(
+ NotADirectoryError,
+ match=rf"'{os.path.realpath(file.name)}' \(kind='file', .*\) is not a directory",
+ ):
downloader.run()
diff --git a/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py b/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py
index 522cf2dd4..feedeaa45 100644
--- a/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py
+++ b/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py
@@ -490,7 +490,11 @@ def test_file_uploader_run_local_path_not_a_directory(file_connection):
with tempfile.NamedTemporaryFile() as file:
uploader = FileUploader(connection=file_connection, target_path=target_path, local_path=file.name)
- with pytest.raises(NotADirectoryError, match=rf"'{file.name}' \(kind='file', .*\) is not a directory"):
+ # Reason for .realpath(): https://stackoverflow.com/a/58719476
+ with pytest.raises(
+ NotADirectoryError,
+ match=rf"'{os.path.realpath(file.name)}' \(kind='file', .*\) is not a directory",
+ ):
uploader.run()
diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py
index c97105a44..338de0c67 100644
--- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py
+++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py
@@ -137,6 +137,14 @@ def test_greenplum_writer_if_exists_error(spark, processing, prepare_schema_tabl
):
writer.run(df)
+ empty_df = spark.createDataFrame([], df.schema)
+
+ processing.assert_equal_df(
+ schema=prepare_schema_table.schema,
+ table=prepare_schema_table.table,
+ df=empty_df,
+ )
+
def test_greenplum_writer_if_exists_ignore(spark, processing, prepare_schema_table):
df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500)
diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py
index 44553539b..8ca74b06d 100644
--- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py
+++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py
@@ -1,4 +1,5 @@
import logging
+import re
import textwrap
import pytest
@@ -225,6 +226,8 @@ def test_hive_writer_default_not_partitioned(spark, processing, get_schema_table
"options",
[
Hive.WriteOptions(if_exists="append"),
+ Hive.WriteOptions(if_exists="ignore"),
+ Hive.WriteOptions(if_exists="error"),
Hive.WriteOptions(if_exists="replace_entire_table"),
Hive.WriteOptions(if_exists="replace_overlapping_partitions"),
],
@@ -363,6 +366,105 @@ def test_hive_writer_insert_into_append(spark, processing, get_schema_table, ori
)
+@pytest.mark.parametrize(
+ "original_options, new_options",
+ [
+ pytest.param({}, {"partitionBy": "id_int"}, id="table_not_partitioned_dataframe_is"),
+ pytest.param({"partitionBy": "text_string"}, {}, id="table_partitioned_dataframe_is_not"),
+ pytest.param({"partitionBy": "text_string"}, {"partitionBy": "id_int"}, id="different_partitioning_schema"),
+ pytest.param({"partitionBy": "id_int"}, {"partitionBy": "id_int"}, id="same_partitioning_schema"),
+ ],
+)
+def test_hive_writer_insert_into_ignore(spark, processing, get_schema_table, original_options, new_options, caplog):
+ df = processing.create_spark_df(spark=spark)
+
+ df1 = df[df.id_int <= 25]
+ df2 = df.where("id_int > 25 AND id_int <= 50")
+ df3 = df[df.id_int > 50]
+
+ hive = Hive(cluster="rnd-dwh", spark=spark)
+ writer1 = DBWriter(
+ connection=hive,
+ target=get_schema_table.full_name,
+ options=original_options,
+ )
+ # create & fill up the table with some data
+ writer1.run(df1.union(df2))
+ old_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0]
+
+ writer2 = DBWriter(
+ connection=hive,
+ target=get_schema_table.full_name,
+ options=Hive.WriteOptions(if_exists="ignore", **new_options),
+ )
+
+ with caplog.at_level(logging.INFO):
+ writer2.run(df1.union(df3))
+
+ assert "|Hive| Skip writing to existing table because of Hive.WriteOptions(if_exists='ignore')" in caplog.text
+
+ new_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0]
+
+ # table DDL remains the same
+ assert new_ddl == old_ddl
+
+ # table should only contain old data, because 'ignore' should not have added new data
+ processing.assert_equal_df(
+ schema=get_schema_table.schema,
+ table=get_schema_table.table,
+ df=df1.union(df2),
+ order_by="id_int",
+ )
+
+
+@pytest.mark.parametrize(
+ "original_options, new_options",
+ [
+ pytest.param({}, {"partitionBy": "id_int"}, id="table_not_partitioned_dataframe_is"),
+ pytest.param({"partitionBy": "text_string"}, {}, id="table_partitioned_dataframe_is_not"),
+ pytest.param({"partitionBy": "text_string"}, {"partitionBy": "id_int"}, id="different_partitioning_schema"),
+ pytest.param({"partitionBy": "id_int"}, {"partitionBy": "id_int"}, id="same_partitioning_schema"),
+ ],
+)
+def test_hive_writer_insert_into_error(spark, processing, get_schema_table, original_options, new_options, caplog):
+ df = processing.create_spark_df(spark=spark)
+
+ hive = Hive(cluster="rnd-dwh", spark=spark)
+ writer1 = DBWriter(
+ connection=hive,
+ target=get_schema_table.full_name,
+ options=original_options,
+ )
+
+ # Create & fill up the table with some data
+ writer1.run(df)
+ old_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0]
+
+ writer2 = DBWriter(
+ connection=hive,
+ target=get_schema_table.full_name,
+ options=Hive.WriteOptions(if_exists="error", **new_options),
+ )
+
+ with pytest.raises(
+ ValueError,
+ match=re.escape("Operation stopped due to Hive.WriteOptions(if_exists='error')"),
+ ):
+ writer2.run(df)
+
+ # table DDL remains the same
+ new_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0]
+ assert new_ddl == old_ddl
+
+ # validate that the table contains only old data
+ processing.assert_equal_df(
+ schema=get_schema_table.schema,
+ table=get_schema_table.table,
+ df=df,
+ order_by="id_int",
+ )
+
+
@pytest.mark.parametrize(
"original_options, new_options",
[
diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py
index 458a6902f..d5cd94fed 100644
--- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py
+++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py
@@ -1,3 +1,6 @@
+import logging
+import re
+
import pytest
from onetl.connection import MongoDB
@@ -6,8 +9,18 @@
pytestmark = pytest.mark.mongodb
+@pytest.mark.parametrize(
+ "options",
+ [
+ {},
+ {"if_exists": "append"},
+ {"if_exists": "replace_entire_collection"},
+ {"if_exists": "error"},
+ {"if_exists": "ignore"},
+ ],
+)
@pytest.mark.flaky(reruns=2)
-def test_mongodb_writer_snapshot(spark, processing, prepare_schema_table):
+def test_mongodb_writer_snapshot(spark, processing, get_schema_table, options, caplog):
df = processing.create_spark_df(spark=spark)
mongo = MongoDB(
@@ -21,12 +34,144 @@ def test_mongodb_writer_snapshot(spark, processing, prepare_schema_table):
writer = DBWriter(
connection=mongo,
- table=prepare_schema_table.table,
+ table=get_schema_table.table,
+ options=MongoDB.WriteOptions(**options),
+ )
+
+ with caplog.at_level(logging.INFO):
+ writer.run(df)
+
+ assert f"|MongoDB| Collection '{get_schema_table.table}' does not exist" in caplog.text
+
+ processing.assert_equal_df(
+ schema=get_schema_table.schema,
+ table=get_schema_table.table,
+ df=df,
+ )
+
+
+def test_mongodb_writer_if_exists_append(spark, processing, get_schema_table):
+ df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500)
+ df1 = df[df._id < 1001]
+ df2 = df[df._id > 1000]
+
+ mongo = MongoDB(
+ host=processing.host,
+ port=processing.port,
+ user=processing.user,
+ password=processing.password,
+ database=processing.database,
+ spark=spark,
+ )
+
+ writer = DBWriter(
+ connection=mongo,
+ table=get_schema_table.table,
+ options=MongoDB.WriteOptions(if_exists="append"),
+ )
+ writer.run(df1)
+ writer.run(df2)
+
+ processing.assert_equal_df(
+ schema=get_schema_table.schema,
+ table=get_schema_table.table,
+ df=df,
+ )
+
+
+def test_mongodb_writer_if_exists_replace_entire_collection(spark, processing, get_schema_table):
+ df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500)
+ df1 = df[df._id < 1001]
+ df2 = df[df._id > 1000]
+
+ mongo = MongoDB(
+ host=processing.host,
+ port=processing.port,
+ user=processing.user,
+ password=processing.password,
+ database=processing.database,
+ spark=spark,
+ )
+
+ writer = DBWriter(
+ connection=mongo,
+ table=get_schema_table.table,
+ options=MongoDB.WriteOptions(if_exists="replace_entire_collection"),
+ )
+ writer.run(df1)
+ writer.run(df2)
+
+ processing.assert_equal_df(
+ schema=get_schema_table.schema,
+ table=get_schema_table.table,
+ df=df2,
+ )
+
+
+def test_mongodb_writer_if_exists_error(spark, processing, get_schema_table, caplog):
+ df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500)
+
+ mongo = MongoDB(
+ host=processing.host,
+ port=processing.port,
+ user=processing.user,
+ password=processing.password,
+ database=processing.database,
+ spark=spark,
+ )
+
+ writer = DBWriter(
+ connection=mongo,
+ table=get_schema_table.table,
+ options=MongoDB.WriteOptions(if_exists="error"),
)
writer.run(df)
+ with pytest.raises(
+ ValueError,
+ match=re.escape("Operation stopped due to MongoDB.WriteOptions(if_exists='error')"),
+ ):
+ writer.run(df)
+
processing.assert_equal_df(
- schema=prepare_schema_table.schema,
- table=prepare_schema_table.table,
+ schema=get_schema_table.schema,
+ table=get_schema_table.table,
df=df,
)
+
+
+def test_mongodb_writer_if_exists_ignore(spark, processing, get_schema_table, caplog):
+ df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500)
+ df1 = df[df._id < 1001]
+ df2 = df[df._id > 1000]
+
+ mongo = MongoDB(
+ host=processing.host,
+ port=processing.port,
+ user=processing.user,
+ password=processing.password,
+ database=processing.database,
+ spark=spark,
+ )
+
+ writer = DBWriter(
+ connection=mongo,
+ table=get_schema_table.table,
+ options=MongoDB.WriteOptions(if_exists="ignore"),
+ )
+ writer.run(df1)
+
+ with caplog.at_level(logging.INFO):
+ writer.run(df2) # The write operation is ignored
+
+ assert f"|MongoDB| Collection '{get_schema_table.table}' exists" in caplog.text
+ assert (
+ "|MongoDB| Skip writing to existing collection because of MongoDB.WriteOptions(if_exists='ignore')"
+ in caplog.text
+ )
+
+ processing.assert_equal_df(
+ schema=get_schema_table.schema,
+ table=get_schema_table.table,
+ df=df1,
+ )
diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py
index 195b16e02..cda43c8a8 100644
--- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py
+++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py
@@ -6,7 +6,17 @@
pytestmark = pytest.mark.postgres
-def test_postgres_writer_snapshot(spark, processing, prepare_schema_table):
+@pytest.mark.parametrize(
+ "options",
+ [
+ {},
+ {"if_exists": "append"},
+ {"if_exists": "replace_entire_table"},
+ {"if_exists": "error"},
+ {"if_exists": "ignore"},
+ ],
+)
+def test_postgres_writer_snapshot(spark, processing, get_schema_table, options):
df = processing.create_spark_df(spark=spark)
postgres = Postgres(
@@ -20,14 +30,15 @@ def test_postgres_writer_snapshot(spark, processing, prepare_schema_table):
writer = DBWriter(
connection=postgres,
- target=prepare_schema_table.full_name,
+ target=get_schema_table.full_name,
+ options=Postgres.WriteOptions(**options),
)
writer.run(df)
processing.assert_equal_df(
- schema=prepare_schema_table.schema,
- table=prepare_schema_table.table,
+ schema=get_schema_table.schema,
+ table=get_schema_table.table,
df=df,
)
@@ -86,7 +97,7 @@ def test_postgres_writer_snapshot_with_pydantic_options(spark, processing, prepa
)
-def test_postgres_writer_mode_append(spark, processing, prepare_schema_table):
+def test_postgres_writer_if_exists_append(spark, processing, prepare_schema_table):
df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500)
df1 = df[df.id_int < 1001]
df2 = df[df.id_int > 1000]
@@ -116,7 +127,70 @@ def test_postgres_writer_mode_append(spark, processing, prepare_schema_table):
)
-def test_postgres_writer_mode_replace_entire_table(spark, processing, prepare_schema_table):
+def test_postgres_writer_if_exists_error(spark, processing, prepare_schema_table):
+ from pyspark.sql.utils import AnalysisException
+
+ df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500)
+
+ postgres = Postgres(
+ host=processing.host,
+ port=processing.port,
+ user=processing.user,
+ password=processing.password,
+ database=processing.database,
+ spark=spark,
+ )
+
+ writer = DBWriter(
+ connection=postgres,
+ target=prepare_schema_table.full_name,
+ options=Postgres.WriteOptions(if_exists="error"),
+ )
+
+ with pytest.raises(
+ AnalysisException,
+ match=f"Table or view '{prepare_schema_table.full_name}' already exists. SaveMode: ErrorIfExists.",
+ ):
+ writer.run(df)
+
+ empty_df = spark.createDataFrame([], df.schema)
+
+ processing.assert_equal_df(
+ schema=prepare_schema_table.schema,
+ table=prepare_schema_table.table,
+ df=empty_df,
+ )
+
+
+def test_postgres_writer_if_exists_ignore(spark, processing, prepare_schema_table):
+ df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500)
+
+ postgres = Postgres(
+ host=processing.host,
+ port=processing.port,
+ user=processing.user,
+ password=processing.password,
+ database=processing.database,
+ spark=spark,
+ )
+
+ writer = DBWriter(
+ connection=postgres,
+ target=prepare_schema_table.full_name,
+ options=Postgres.WriteOptions(if_exists="ignore"),
+ )
+
+ writer.run(df) # The write operation is ignored
+ empty_df = spark.createDataFrame([], df.schema)
+
+ processing.assert_equal_df(
+ schema=prepare_schema_table.schema,
+ table=prepare_schema_table.table,
+ df=empty_df,
+ )
+
+
+def test_postgres_writer_if_exists_replace_entire_table(spark, processing, prepare_schema_table):
df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500)
df1 = df[df.id_int < 1001]
df2 = df[df.id_int > 1000]
diff --git a/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py b/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py
new file mode 100644
index 000000000..7c5c8f5d5
--- /dev/null
+++ b/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py
@@ -0,0 +1,58 @@
+import logging
+
+import pytest
+
+pytestmark = [pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection]
+
+
+def test_samba_file_connection_check_success(samba_file_connection, caplog):
+ samba = samba_file_connection
+ with caplog.at_level(logging.INFO):
+ assert samba.check() == samba
+
+ assert "|Samba|" in caplog.text
+ assert f"host = '{samba.host}'" in caplog.text
+ assert f"port = {samba.port}" in caplog.text
+ assert f"protocol = '{samba.protocol}'" in caplog.text
+ assert f"user = '{samba.user}'" in caplog.text
+ assert f"share = '{samba.share}'" in caplog.text
+ assert "password = SecretStr('**********')" in caplog.text
+ assert samba.password.get_secret_value() not in caplog.text
+
+ assert "Connection is available." in caplog.text
+
+
+def test_samba_file_connection_check_not_existing_share_failed(samba_server, caplog):
+ from onetl.connection import Samba
+
+ not_existing_share = "NotExistingShare"
+ samba = Samba(
+ host=samba_server.host,
+ share=not_existing_share,
+ protocol=samba_server.protocol,
+ port=samba_server.port,
+ user=samba_server.user,
+ password=samba_server.password,
+ )
+
+ with caplog.at_level(logging.INFO):
+ with pytest.raises(RuntimeError, match="Connection is unavailable"):
+ samba.check()
+
+ assert f"Share '{not_existing_share}' not found among existing shares" in caplog.text
+
+
+def test_samba_file_connection_check_runtime_failed(samba_server):
+ from onetl.connection import Samba
+
+ samba = Samba(
+ host=samba_server.host,
+ share=samba_server.share,
+ protocol=samba_server.protocol,
+ port=samba_server.port,
+ user="unknown",
+ password="unknown",
+ )
+
+ with pytest.raises(RuntimeError, match="Connection is unavailable"):
+ samba.check()
diff --git a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py
new file mode 100644
index 000000000..e94386120
--- /dev/null
+++ b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py
@@ -0,0 +1,106 @@
+import logging
+
+import pytest
+
+from onetl.file.format import Excel
+
+
+@pytest.mark.parametrize(
+ "spark_version",
+ [
+ "2.2.1",
+ "2.3.1",
+ "2.4.8",
+ ],
+)
+def test_excel_get_packages_spark_version_not_supported(spark_version):
+ with pytest.raises(ValueError, match=f"Spark version should be at least 3.2, got {spark_version}"):
+ Excel.get_packages(spark_version=spark_version)
+
+
+def test_excel_get_packages_scala_version_not_supported():
+ with pytest.raises(ValueError, match="Scala version should be at least 2.12, got 2.11"):
+ Excel.get_packages(spark_version="3.2.4", scala_version="2.11")
+
+
+def test_excel_get_packages_package_version_not_supported():
+ with pytest.raises(ValueError, match="Package version should be at least 0.15, got 0.13.7"):
+ Excel.get_packages(spark_version="3.2.4", package_version="0.13.7")
+
+
+@pytest.mark.parametrize(
+ "spark_version, scala_version, package_version, packages",
+ [
+ # Detect Scala version by Spark version
+ ("3.2.4", None, None, ["com.crealytics:spark-excel_2.12:3.2.4_0.19.0"]),
+ ("3.4.1", None, None, ["com.crealytics:spark-excel_2.12:3.4.1_0.19.0"]),
+ # Override Scala version
+ ("3.2.4", "2.12", None, ["com.crealytics:spark-excel_2.12:3.2.4_0.19.0"]),
+ ("3.2.4", "2.13", None, ["com.crealytics:spark-excel_2.13:3.2.4_0.19.0"]),
+ ("3.4.1", "2.12", None, ["com.crealytics:spark-excel_2.12:3.4.1_0.19.0"]),
+ ("3.4.1", "2.13", None, ["com.crealytics:spark-excel_2.13:3.4.1_0.19.0"]),
+ # Override package version
+ ("3.2.0", None, "0.16.0", ["com.crealytics:spark-excel_2.12:3.2.0_0.16.0"]),
+ ("3.4.1", None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.4.1_0.18.0"]),
+ ],
+)
+def test_excel_get_packages(caplog, spark_version, scala_version, package_version, packages):
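+ # expected coordinates follow the "com.crealytics:spark-excel_<scala>:<spark>_<package>" pattern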
+ with caplog.at_level(level=logging.WARNING):
+ result = Excel.get_packages(
+ spark_version=spark_version,
+ scala_version=scala_version,
+ package_version=package_version,
+ )
+
+ if package_version:
+ assert f"Passed custom package version '{package_version}', it is not guaranteed to be supported"
+
+ assert result == packages
+
+
+def test_excel_options_default():
+ excel = Excel()
+ assert not excel.header
+
+
+def test_excel_options_default_override():
+ excel = Excel(header=True)
+ assert excel.header
+
+
+@pytest.mark.parametrize(
+ "known_option",
+ [
+ "dataAddress",
+ "treatEmptyValuesAsNulls",
+ "setErrorCellsToFallbackValues",
+ "usePlainNumberFormat",
+ "inferSchema",
+ "addColorColumns",
+ "timestampFormat",
+ "maxRowsInMemory",
+ "maxByteArraySize",
+ "tempFileThreshold",
+ "excerptSize",
+ "workbookPassword",
+ "dateFormat",
+ ],
+)
+def test_excel_options_known(known_option):
+ excel = Excel.parse({known_option: "value"})
+ assert getattr(excel, known_option) == "value"
+
+
+def test_excel_options_unknown(caplog):
+ with caplog.at_level(logging.WARNING):
+ excel = Excel(unknown="abc")
+ assert excel.unknown == "abc"
+
+ assert ("Options ['unknown'] are not known by Excel, are you sure they are valid?") in caplog.text
+
+
+@pytest.mark.local_fs
+def test_excel_missing_package(spark_no_packages):
+ msg = "Cannot import Java class 'com.crealytics.spark.excel.v2.ExcelDataSource'"
+ with pytest.raises(ValueError, match=msg):
+ Excel().check_if_supported(spark_no_packages)
diff --git a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py
index 7e633206e..6469b10c8 100644
--- a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py
+++ b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py
@@ -153,6 +153,8 @@ def test_hive_write_options_unsupported_insert_into(insert_into):
({"if_exists": "append"}, HiveTableExistBehavior.APPEND),
({"if_exists": "replace_overlapping_partitions"}, HiveTableExistBehavior.REPLACE_OVERLAPPING_PARTITIONS),
({"if_exists": "replace_entire_table"}, HiveTableExistBehavior.REPLACE_ENTIRE_TABLE),
+ ({"if_exists": "error"}, HiveTableExistBehavior.ERROR),
+ ({"if_exists": "ignore"}, HiveTableExistBehavior.IGNORE),
],
)
def test_hive_write_options_if_exists(options, value):
@@ -198,6 +200,18 @@ def test_hive_write_options_if_exists(options, value):
"Mode `overwrite_table` is deprecated since v0.9.0 and will be removed in v1.0.0. "
"Use `replace_entire_table` instead",
),
+ (
+ {"mode": "error"},
+ HiveTableExistBehavior.ERROR,
+ "Option `Hive.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. "
+ "Use `Hive.WriteOptions(if_exists=...)` instead",
+ ),
+ (
+ {"mode": "ignore"},
+ HiveTableExistBehavior.IGNORE,
+ "Option `Hive.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. "
+ "Use `Hive.WriteOptions(if_exists=...)` instead",
+ ),
],
)
def test_hive_write_options_mode_deprecated(options, value, message):
@@ -209,10 +223,6 @@ def test_hive_write_options_mode_deprecated(options, value, message):
@pytest.mark.parametrize(
"options",
[
- # disallowed modes
- {"mode": "error"},
- {"mode": "ignore"},
- # wrong mode
{"mode": "wrong_mode"},
],
)
diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py
index ae81402cc..f932408d0 100644
--- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py
+++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py
@@ -266,6 +266,8 @@ def test_jdbc_write_options_to_jdbc(spark_mock):
[
({}, JDBCTableExistBehavior.APPEND),
({"if_exists": "append"}, JDBCTableExistBehavior.APPEND),
+ ({"if_exists": "ignore"}, JDBCTableExistBehavior.IGNORE),
+ ({"if_exists": "error"}, JDBCTableExistBehavior.ERROR),
({"if_exists": "replace_entire_table"}, JDBCTableExistBehavior.REPLACE_ENTIRE_TABLE),
],
)
@@ -294,6 +296,18 @@ def test_jdbc_write_options_if_exists(options, value):
"Mode `overwrite` is deprecated since v0.9.0 and will be removed in v1.0.0. "
"Use `replace_entire_table` instead",
),
+ (
+ {"mode": "ignore"},
+ JDBCTableExistBehavior.IGNORE,
+ "Option `WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. "
+ "Use `WriteOptions(if_exists=...)` instead",
+ ),
+ (
+ {"mode": "error"},
+ JDBCTableExistBehavior.ERROR,
+ "Option `WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. "
+ "Use `WriteOptions(if_exists=...)` instead",
+ ),
],
)
def test_jdbc_write_options_mode_deprecated(options, value, message):
@@ -305,10 +319,6 @@ def test_jdbc_write_options_mode_deprecated(options, value, message):
@pytest.mark.parametrize(
"options",
[
- # disallowed modes
- {"mode": "error"},
- {"mode": "ignore"},
- # wrong mode
{"mode": "wrong_mode"},
],
)
diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py
index 8775f6dbc..eb3f1db23 100644
--- a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py
+++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py
@@ -233,6 +233,8 @@ def test_mongodb_convert_dict_to_str():
[
({}, MongoDBCollectionExistBehavior.APPEND),
({"if_exists": "append"}, MongoDBCollectionExistBehavior.APPEND),
+ ({"if_exists": "ignore"}, MongoDBCollectionExistBehavior.IGNORE),
+ ({"if_exists": "error"}, MongoDBCollectionExistBehavior.ERROR),
({"if_exists": "replace_entire_collection"}, MongoDBCollectionExistBehavior.REPLACE_ENTIRE_COLLECTION),
],
)
@@ -261,6 +263,18 @@ def test_mongodb_write_options_if_exists(options, value):
"Mode `overwrite` is deprecated since v0.9.0 and will be removed in v1.0.0. "
"Use `replace_entire_collection` instead",
),
+ (
+ {"mode": "ignore"},
+ MongoDBCollectionExistBehavior.IGNORE,
+ "Option `MongoDB.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. "
+ "Use `MongoDB.WriteOptions(if_exists=...)` instead",
+ ),
+ (
+ {"mode": "error"},
+ MongoDBCollectionExistBehavior.ERROR,
+ "Option `MongoDB.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. "
+ "Use `MongoDB.WriteOptions(if_exists=...)` instead",
+ ),
],
)
def test_mongodb_write_options_mode_deprecated(options, value, message):
@@ -272,10 +286,6 @@ def test_mongodb_write_options_mode_deprecated(options, value, message):
@pytest.mark.parametrize(
"options",
[
- # disallowed modes
- {"mode": "error"},
- {"mode": "ignore"},
- # wrong mode
{"mode": "wrong_mode"},
],
)
diff --git a/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py b/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py
new file mode 100644
index 000000000..42f95b368
--- /dev/null
+++ b/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py
@@ -0,0 +1,47 @@
+import pytest
+
+from onetl.connection import FileConnection
+
+pytestmark = [pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection]
+
+
+def test_samba_connection():
+ from onetl.connection import Samba
+
+ samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd")
+ assert isinstance(samba, FileConnection)
+ assert samba.host == "some_host"
+ assert samba.protocol == "SMB"
+ assert samba.domain == ""
+ assert samba.auth_type == "NTLMv2"
+ assert samba.port == 445
+ assert samba.user == "some_user"
+ assert samba.password != "pwd"
+ assert samba.password.get_secret_value() == "pwd"
+
+ assert "password='pwd'" not in str(samba)
+ assert "password='pwd'" not in repr(samba)
+
+
+def test_samba_connection_with_net_bios():
+ from onetl.connection import Samba
+
+ samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol="NetBIOS")
+ assert samba.protocol == "NetBIOS"
+ assert samba.port == 139
+
+
+@pytest.mark.parametrize("protocol", ["SMB", "NetBIOS"])
+def test_samba_connection_with_custom_port(protocol):
+ from onetl.connection import Samba
+
+ samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol=protocol, port=444)
+ assert samba.protocol == protocol
+ assert samba.port == 444
+
+
+def test_samba_connection_without_mandatory_args():
+ from onetl.connection import Samba
+
+ with pytest.raises(ValueError):
+ Samba()
diff --git a/tests/util/spark_df.py b/tests/util/spark_df.py
index 8e4c667b8..f4e239026 100644
--- a/tests/util/spark_df.py
+++ b/tests/util/spark_df.py
@@ -10,7 +10,7 @@ def reset_column_names(df: SparkDataFrame, columns: list[str] | None = None) ->
"""
Reset columns to ``_c0`` format.
- If `columns` is None, reset all columns names.
+ If `columns` is None, apply to all columns in df.
"""
columns = columns or df.columns
for i, column in enumerate(columns):