From 2b0942cccecf71f279f7d156b12c2c5db2ebbf95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 6 Sep 2023 15:07:32 +0000 Subject: [PATCH 01/26] [DOP-8511] Bump version --- onetl/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onetl/VERSION b/onetl/VERSION index 965065db5..a602fc9e2 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.9.3 +0.9.4 From 0f6b274bbd5a0e030f8ec41fbcff85e2eaa2c6fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 6 Sep 2023 15:23:34 +0000 Subject: [PATCH 02/26] [DOP-8511] Update Kafka documentation --- docs/connection/db_connection/kafka/write.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/connection/db_connection/kafka/write.rst b/docs/connection/db_connection/kafka/write.rst index eb04ecccb..064c8ead1 100644 --- a/docs/connection/db_connection/kafka/write.rst +++ b/docs/connection/db_connection/kafka/write.rst @@ -30,6 +30,7 @@ For writing data to Kafka, use :obj:`DBWriter Date: Wed, 6 Sep 2023 15:31:13 +0000 Subject: [PATCH 03/26] [DOP-8511] Update Kafka documentation --- docs/connection/db_connection/kafka/read.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/connection/db_connection/kafka/read.rst b/docs/connection/db_connection/kafka/read.rst index a19c5e57b..3e6c846b1 100644 --- a/docs/connection/db_connection/kafka/read.rst +++ b/docs/connection/db_connection/kafka/read.rst @@ -7,7 +7,8 @@ For reading data from Kafka, use :obj:`DBReader Date: Thu, 7 Sep 2023 11:45:57 +0000 Subject: [PATCH 04/26] [DOP-8511] Update README --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 13b280830..6e21bb672 100644 --- a/README.rst +++ b/README.rst @@ -229,7 +229,7 @@ so some connections require additional setup to work properly. It also uses ``kinit`` executable to generate Kerberos ticket. * ``Hive`` and ``SparkHDFS`` - Requires Kerberos ticket to exist before creating Spark session. + require Kerberos ticket to exist before creating Spark session. 
So you need to install OS packages with: From 0adf0555b71a83d798d8f7a90db68138bff57501 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:27:00 +0300 Subject: [PATCH 05/26] [DOP-8665] - Allow modes "ignore" and "error" in JDBC.WriteOptions (#144) * [DOP-8665] - Allow modes "ignore" and "error" in JDBC.WriteOptions * [DOP-8665] - updated tests --- docs/changelog/next_release/144.feature.rst | 1 + .../jdbc_connection/connection.py | 6 +- .../db_connection/jdbc_connection/options.py | 69 ++++++++++----- .../test_postgres_writer_integration.py | 86 +++++++++++++++++-- .../test_jdbc_options_unit.py | 18 +++- 5 files changed, 146 insertions(+), 34 deletions(-) create mode 100644 docs/changelog/next_release/144.feature.rst diff --git a/docs/changelog/next_release/144.feature.rst b/docs/changelog/next_release/144.feature.rst new file mode 100644 index 000000000..a0cf257e4 --- /dev/null +++ b/docs/changelog/next_release/144.feature.rst @@ -0,0 +1 @@ +Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 3eb83f538..f5b611910 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -218,7 +218,11 @@ def write_df_to_target( write_options = self.WriteOptions.parse(options) jdbc_params = self.options_to_jdbc_params(write_options) - mode = "append" if write_options.if_exists == JDBCTableExistBehavior.APPEND else "overwrite" + mode = ( + "overwrite" + if write_options.if_exists == JDBCTableExistBehavior.REPLACE_ENTIRE_TABLE + else write_options.if_exists.value + ) log.info("|%s| Saving data to a table %r", self.__class__.__name__, target) df.write.jdbc(table=target, mode=mode, **jdbc_params) log.info("|%s| Table %r successfully written", self.__class__.__name__, target) diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index c998055fe..dacaded77 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -84,6 +84,8 @@ class JDBCTableExistBehavior(str, Enum): APPEND = "append" + IGNORE = "ignore" + ERROR = "error" REPLACE_ENTIRE_TABLE = "replace_entire_table" def __str__(self) -> str: @@ -413,44 +415,65 @@ class Config: .. dropdown:: Behavior in details - * Table does not exist - Table is created using options provided by user - (``createTableOptions``, ``createTableColumnTypes``, etc). + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). - * Table exists - Data is appended to a table. Table has the same DDL as before writing data + * Table exists + Data is appended to a table. Table has the same DDL as before writing data - .. warning:: + .. warning:: - This mode does not check whether table already contains - rows from dataframe, so duplicated rows can be created. + This mode does not check whether table already contains + rows from dataframe, so duplicated rows can be created. - Also Spark does not support passing custom options to - insert statement, like ``ON CONFLICT``, so don't try to - implement deduplication using unique indexes or constraints. 
+ Also Spark does not support passing custom options to + insert statement, like ``ON CONFLICT``, so don't try to + implement deduplication using unique indexes or constraints. - Instead, write to staging table and perform deduplication - using :obj:`~execute` method. + Instead, write to staging table and perform deduplication + using :obj:`~execute` method. * ``replace_entire_table`` **Table is dropped and then created, or truncated**. .. dropdown:: Behavior in details - * Table does not exist - Table is created using options provided by user - (``createTableOptions``, ``createTableColumnTypes``, etc). + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). - * Table exists - Table content is replaced with dataframe content. + * Table exists + Table content is replaced with dataframe content. - After writing completed, target table could either have the same DDL as - before writing data (``truncate=True``), or can be recreated (``truncate=False`` - or source does not support truncation). + After writing completed, target table could either have the same DDL as + before writing data (``truncate=True``), or can be recreated (``truncate=False`` + or source does not support truncation). - .. note:: + * ``ignore`` + Ignores the write operation if the table already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). + + * Table exists + The write operation is ignored, and no data is written to the table. + + * ``error`` + Raises an error if the table already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). + + * Table exists + An error is raised, and no data is written to the table. - ``error`` and ``ignore`` modes are not supported. 
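Not part of the patch itself: a minimal usage sketch of the new JDBC write modes, mirroring the integration tests later in this patch. The connection parameters, the ``df`` dataframe and the import paths are placeholders or assumptions, not something this diff introduces.

.. code:: python

    from onetl.connection import Postgres
    from onetl.db import DBWriter

    postgres = Postgres(
        host="example-host",  # placeholder credentials, not from this patch
        user="user",
        password="***",
        database="db",
        spark=spark,  # existing SparkSession with the Postgres JDBC driver loaded
    )

    writer = DBWriter(
        connection=postgres,
        target="schema.table",
        # "ignore" skips the write when the table already exists,
        # "error" makes Spark raise instead of appending
        options=Postgres.WriteOptions(if_exists="ignore"),
    )
    writer.run(df)  # df is an existing Spark DataFrame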
""" batchsize: int = 20_000 diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py index 195b16e02..cda43c8a8 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py @@ -6,7 +6,17 @@ pytestmark = pytest.mark.postgres -def test_postgres_writer_snapshot(spark, processing, prepare_schema_table): +@pytest.mark.parametrize( + "options", + [ + {}, + {"if_exists": "append"}, + {"if_exists": "replace_entire_table"}, + {"if_exists": "error"}, + {"if_exists": "ignore"}, + ], +) +def test_postgres_writer_snapshot(spark, processing, get_schema_table, options): df = processing.create_spark_df(spark=spark) postgres = Postgres( @@ -20,14 +30,15 @@ def test_postgres_writer_snapshot(spark, processing, prepare_schema_table): writer = DBWriter( connection=postgres, - target=prepare_schema_table.full_name, + target=get_schema_table.full_name, + options=Postgres.WriteOptions(**options), ) writer.run(df) processing.assert_equal_df( - schema=prepare_schema_table.schema, - table=prepare_schema_table.table, + schema=get_schema_table.schema, + table=get_schema_table.table, df=df, ) @@ -86,7 +97,7 @@ def test_postgres_writer_snapshot_with_pydantic_options(spark, processing, prepa ) -def test_postgres_writer_mode_append(spark, processing, prepare_schema_table): +def test_postgres_writer_if_exists_append(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) df1 = df[df.id_int < 1001] df2 = df[df.id_int > 1000] @@ -116,7 +127,70 @@ def test_postgres_writer_mode_append(spark, processing, prepare_schema_table): ) -def test_postgres_writer_mode_replace_entire_table(spark, processing, prepare_schema_table): +def test_postgres_writer_if_exists_error(spark, processing, prepare_schema_table): + from pyspark.sql.utils import AnalysisException + + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=postgres, + target=prepare_schema_table.full_name, + options=Postgres.WriteOptions(if_exists="error"), + ) + + with pytest.raises( + AnalysisException, + match=f"Table or view '{prepare_schema_table.full_name}' already exists. 
SaveMode: ErrorIfExists.", + ): + writer.run(df) + + empty_df = spark.createDataFrame([], df.schema) + + processing.assert_equal_df( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + df=empty_df, + ) + + +def test_postgres_writer_if_exists_ignore(spark, processing, prepare_schema_table): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=postgres, + target=prepare_schema_table.full_name, + options=Postgres.WriteOptions(if_exists="ignore"), + ) + + writer.run(df) # The write operation is ignored + empty_df = spark.createDataFrame([], df.schema) + + processing.assert_equal_df( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + df=empty_df, + ) + + +def test_postgres_writer_if_exists_replace_entire_table(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) df1 = df[df.id_int < 1001] df2 = df[df.id_int > 1000] diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index ae81402cc..f932408d0 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -266,6 +266,8 @@ def test_jdbc_write_options_to_jdbc(spark_mock): [ ({}, JDBCTableExistBehavior.APPEND), ({"if_exists": "append"}, JDBCTableExistBehavior.APPEND), + ({"if_exists": "ignore"}, JDBCTableExistBehavior.IGNORE), + ({"if_exists": "error"}, JDBCTableExistBehavior.ERROR), ({"if_exists": "replace_entire_table"}, JDBCTableExistBehavior.REPLACE_ENTIRE_TABLE), ], ) @@ -294,6 +296,18 @@ def test_jdbc_write_options_if_exists(options, value): "Mode `overwrite` is deprecated since v0.9.0 and will be removed in v1.0.0. " "Use `replace_entire_table` instead", ), + ( + {"mode": "ignore"}, + JDBCTableExistBehavior.IGNORE, + "Option `WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. " + "Use `WriteOptions(if_exists=...)` instead", + ), + ( + {"mode": "error"}, + JDBCTableExistBehavior.ERROR, + "Option `WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. 
" + "Use `WriteOptions(if_exists=...)` instead", + ), ], ) def test_jdbc_write_options_mode_deprecated(options, value, message): @@ -305,10 +319,6 @@ def test_jdbc_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( "options", [ - # disallowed modes - {"mode": "error"}, - {"mode": "ignore"}, - # wrong mode {"mode": "wrong_mode"}, ], ) From 13b498ad432c09b3f8b80d5034cd1fbd2d5e8767 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:45:20 +0300 Subject: [PATCH 06/26] [DOP-8664] - Allow modes "ignore" and "error" in MongoDB.WriteOptions (#145) * [DOP-8664] - Allow modes "ignore" and "error" in MongoDB.WriteOptions * [DOP-8664] - replaced collection existence check with java client * [DOP-8664] - removed useless type ignore * [DOP-8664] - updated tests * [DOP-8664] - updated tests in greenplum.WriteOptions(if_exists='error') * [DOP-8664] - updated logs messages in WriteOptions * [DOP-8664] - updated logs messages in MongoDB.WriteOptions * [DOP-8664] - updated logs messages in MongoDB.WriteOptions --- docs/changelog/next_release/145.feature.rst | 1 + .../db_connection/mongodb/connection.py | 20 +++ .../db_connection/mongodb/options.py | 51 ++++-- .../test_greenplum_writer_integration.py | 8 + .../test_mongodb_writer_integration.py | 153 +++++++++++++++++- .../test_mongodb_unit.py | 18 ++- 6 files changed, 228 insertions(+), 23 deletions(-) create mode 100644 docs/changelog/next_release/145.feature.rst diff --git a/docs/changelog/next_release/145.feature.rst b/docs/changelog/next_release/145.feature.rst new file mode 100644 index 000000000..975e0b96d --- /dev/null +++ b/docs/changelog/next_release/145.feature.rst @@ -0,0 +1 @@ +Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 8e6110f14..5a8640a68 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -504,6 +504,16 @@ def write_df_to_target( else "append" ) + if self._collection_exists(target): + if write_options.if_exists == MongoDBCollectionExistBehavior.ERROR: + raise ValueError("Operation stopped due to MongoDB.WriteOptions(if_exists='error')") + elif write_options.if_exists == MongoDBCollectionExistBehavior.IGNORE: + log.info( + "|%s| Skip writing to existing collection because of MongoDB.WriteOptions(if_exists='ignore')", + self.__class__.__name__, + ) + return + log.info("|%s| Saving data to a collection %r", self.__class__.__name__, target) df.write.format("mongodb").mode(mode).options(**write_options_dict).save() log.info("|%s| Collection %r is successfully written", self.__class__.__name__, target) @@ -533,3 +543,13 @@ def _check_java_class_imported(cls, spark): log.debug("Missing Java class", exc_info=e, stack_info=True) raise ValueError(msg) from e return spark + + def _collection_exists(self, source: str) -> bool: + jvm = self.spark._jvm + client = jvm.com.mongodb.client.MongoClients.create(self.connection_url) # type: ignore + collections = set(client.getDatabase(self.database).listCollectionNames().iterator()) + if source in collections: + log.info("|%s| Collection %r exists", self.__class__.__name__, source) + return True + log.info("|%s| Collection %r does not exist", self.__class__.__name__, source) + return False diff --git a/onetl/connection/db_connection/mongodb/options.py 
b/onetl/connection/db_connection/mongodb/options.py index 85f1935a3..13c256aff 100644 --- a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -81,6 +81,8 @@ class MongoDBCollectionExistBehavior(str, Enum): APPEND = "append" + IGNORE = "ignore" + ERROR = "error" REPLACE_ENTIRE_COLLECTION = "replace_entire_collection" def __str__(self) -> str: @@ -207,33 +209,52 @@ class MongoDBWriteOptions(GenericOptions): .. dropdown:: Behavior in details - * Collection does not exist - Collection is created using options provided by user - (``shardkey`` and others). + * Collection does not exist + Collection is created using options provided by user + (``shardkey`` and others). - * Collection exists - Data is appended to a collection. + * Collection exists + Data is appended to a collection. - .. warning:: + .. warning:: - This mode does not check whether collection already contains - objects from dataframe, so duplicated objects can be created. + This mode does not check whether collection already contains + objects from dataframe, so duplicated objects can be created. * ``replace_entire_collection`` **Collection is deleted and then created**. .. dropdown:: Behavior in details - * Collection does not exist - Collection is created using options provided by user - (``shardkey`` and others). + * Collection does not exist + Collection is created using options provided by user + (``shardkey`` and others). - * Collection exists - Collection content is replaced with dataframe content. + * Collection exists + Collection content is replaced with dataframe content. - .. note:: + * ``ignore`` + Ignores the write operation if the collection already exists. + + .. dropdown:: Behavior in details + + * Collection does not exist + Collection is created using options provided by user + + * Collection exists + The write operation is ignored, and no data is written to the collection. + + * ``error`` + Raises an error if the collection already exists. + + .. dropdown:: Behavior in details + + * Collection does not exist + Collection is created using options provided by user + + * Collection exists + An error is raised, and no data is written to the collection. - ``error`` and ``ignore`` modes are not supported. 
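As with the JDBC options above, the new MongoDB modes are driven through ``DBWriter``. A minimal sketch based on the integration tests in this patch; host, credentials and ``df`` are placeholders, not part of the diff.

.. code:: python

    from onetl.connection import MongoDB
    from onetl.db import DBWriter

    mongo = MongoDB(
        host="example-host",  # placeholder credentials, not from this patch
        user="user",
        password="***",
        database="db",
        spark=spark,  # existing SparkSession with the MongoDB connector loaded
    )

    writer = DBWriter(
        connection=mongo,
        table="target_collection",
        options=MongoDB.WriteOptions(if_exists="error"),
    )

    # if "target_collection" already exists, this raises
    # ValueError("Operation stopped due to MongoDB.WriteOptions(if_exists='error')")
    writer.run(df)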
""" class Config: diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py index c97105a44..338de0c67 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py @@ -137,6 +137,14 @@ def test_greenplum_writer_if_exists_error(spark, processing, prepare_schema_tabl ): writer.run(df) + empty_df = spark.createDataFrame([], df.schema) + + processing.assert_equal_df( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + df=empty_df, + ) + def test_greenplum_writer_if_exists_ignore(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py index 458a6902f..d5cd94fed 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py @@ -1,3 +1,6 @@ +import logging +import re + import pytest from onetl.connection import MongoDB @@ -6,8 +9,18 @@ pytestmark = pytest.mark.mongodb +@pytest.mark.parametrize( + "options", + [ + {}, + {"if_exists": "append"}, + {"if_exists": "replace_entire_collection"}, + {"if_exists": "error"}, + {"if_exists": "ignore"}, + ], +) @pytest.mark.flaky(reruns=2) -def test_mongodb_writer_snapshot(spark, processing, prepare_schema_table): +def test_mongodb_writer_snapshot(spark, processing, get_schema_table, options, caplog): df = processing.create_spark_df(spark=spark) mongo = MongoDB( @@ -21,12 +34,144 @@ def test_mongodb_writer_snapshot(spark, processing, prepare_schema_table): writer = DBWriter( connection=mongo, - table=prepare_schema_table.table, + table=get_schema_table.table, + options=MongoDB.WriteOptions(**options), + ) + + with caplog.at_level(logging.INFO): + writer.run(df) + + assert f"|MongoDB| Collection '{get_schema_table.table}' does not exist" in caplog.text + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df, + ) + + +def test_mongodb_writer_if_exists_append(spark, processing, get_schema_table): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + df1 = df[df._id < 1001] + df2 = df[df._id > 1000] + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="append"), + ) + writer.run(df1) + writer.run(df2) + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df, + ) + + +def test_mongodb_writer_if_exists_replace_entire_collection(spark, processing, get_schema_table): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + df1 = df[df._id < 1001] + df2 = df[df._id > 1000] + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + 
password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="replace_entire_collection"), + ) + writer.run(df1) + writer.run(df2) + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df2, + ) + + +def test_mongodb_writer_if_exists_error(spark, processing, get_schema_table, caplog): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="error"), ) writer.run(df) + with pytest.raises( + ValueError, + match=re.escape("Operation stopped due to MongoDB.WriteOptions(if_exists='error')"), + ): + writer.run(df) + processing.assert_equal_df( - schema=prepare_schema_table.schema, - table=prepare_schema_table.table, + schema=get_schema_table.schema, + table=get_schema_table.table, df=df, ) + + +def test_mongodb_writer_if_exists_ignore(spark, processing, get_schema_table, caplog): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + df1 = df[df._id < 1001] + df2 = df[df._id > 1000] + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="ignore"), + ) + writer.run(df1) + + with caplog.at_level(logging.INFO): + writer.run(df2) # The write operation is ignored + + assert f"|MongoDB| Collection '{get_schema_table.table}' exists" in caplog.text + assert ( + "|MongoDB| Skip writing to existing collection because of MongoDB.WriteOptions(if_exists='ignore')" + in caplog.text + ) + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df1, + ) diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py index 8775f6dbc..eb3f1db23 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py @@ -233,6 +233,8 @@ def test_mongodb_convert_dict_to_str(): [ ({}, MongoDBCollectionExistBehavior.APPEND), ({"if_exists": "append"}, MongoDBCollectionExistBehavior.APPEND), + ({"if_exists": "ignore"}, MongoDBCollectionExistBehavior.IGNORE), + ({"if_exists": "error"}, MongoDBCollectionExistBehavior.ERROR), ({"if_exists": "replace_entire_collection"}, MongoDBCollectionExistBehavior.REPLACE_ENTIRE_COLLECTION), ], ) @@ -261,6 +263,18 @@ def test_mongodb_write_options_if_exists(options, value): "Mode `overwrite` is deprecated since v0.9.0 and will be removed in v1.0.0. " "Use `replace_entire_collection` instead", ), + ( + {"mode": "ignore"}, + MongoDBCollectionExistBehavior.IGNORE, + "Option `MongoDB.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. " + "Use `MongoDB.WriteOptions(if_exists=...)` instead", + ), + ( + {"mode": "error"}, + MongoDBCollectionExistBehavior.ERROR, + "Option `MongoDB.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. 
" + "Use `MongoDB.WriteOptions(if_exists=...)` instead", + ), ], ) def test_mongodb_write_options_mode_deprecated(options, value, message): @@ -272,10 +286,6 @@ def test_mongodb_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( "options", [ - # disallowed modes - {"mode": "error"}, - {"mode": "ignore"}, - # wrong mode {"mode": "wrong_mode"}, ], ) From 023aa42b2bc465e5c2585f43e51109020d7f2a8e Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:59:15 +0300 Subject: [PATCH 07/26] [DOP-8647] - Allow modes "ignore" and "error" in Hive.WriteOptions (#143) * [DOP-8647] - Allow modes "ignore" and "error" in HiveWriteOptions * [DOP-8647] - remove log.error message * [DOP-8647] - move write_options check to write_df_to_target * [DOP-8647] - updated logs messages in Hive.WriteOptions * [DOP-8647] - updated logs messages in Hive.WriteOptions * [DOP-8647] - updated logs messages in Hive.WriteOptions --- docs/changelog/next_release/143.feature.rst | 1 + .../db_connection/hive/connection.py | 8 ++ .../connection/db_connection/hive/options.py | 27 ++++- .../test_hive_writer_integration.py | 102 ++++++++++++++++++ .../test_hive_unit.py | 18 +++- 5 files changed, 150 insertions(+), 6 deletions(-) create mode 100644 docs/changelog/next_release/143.feature.rst diff --git a/docs/changelog/next_release/143.feature.rst b/docs/changelog/next_release/143.feature.rst new file mode 100644 index 000000000..97756efc4 --- /dev/null +++ b/docs/changelog/next_release/143.feature.rst @@ -0,0 +1 @@ +Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index d0bc08d29..d0997f512 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -340,6 +340,14 @@ def write_df_to_target( # https://stackoverflow.com/a/72747050 if table_exists and write_options.if_exists != HiveTableExistBehavior.REPLACE_ENTIRE_TABLE: + if write_options.if_exists == HiveTableExistBehavior.ERROR: + raise ValueError("Operation stopped due to Hive.WriteOptions(if_exists='error')") + elif write_options.if_exists == HiveTableExistBehavior.IGNORE: + log.info( + "|%s| Skip writing to existing table because of Hive.WriteOptions(if_exists='ignore')", + self.__class__.__name__, + ) + return # using saveAsTable on existing table does not handle # spark.sql.sources.partitionOverwriteMode=dynamic, so using insertInto instead. self._insert_into(df, target, options) diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index c46b7882d..81445851d 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -26,6 +26,8 @@ class HiveTableExistBehavior(str, Enum): APPEND = "append" + IGNORE = "ignore" + ERROR = "error" REPLACE_ENTIRE_TABLE = "replace_entire_table" REPLACE_OVERLAPPING_PARTITIONS = "replace_overlapping_partitions" @@ -173,9 +175,30 @@ class Config: Table is recreated using options provided by user (``format``, ``compression``, etc) **instead of using original table options**. Be careful - .. note:: + * ``ignore`` + Ignores the write operation if the table/partition already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user (``format``, ``compression``, etc). 
+ + * Table exists + If the table exists, **no further action is taken**. This is true whether or not new partition + values are present and whether the partitioning scheme differs or not + + * ``error`` + Raises an error if the table/partition already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user (``format``, ``compression``, etc). + + * Table exists + If the table exists, **raises an error**. This is true whether or not new partition + values are present and whether the partitioning scheme differs or not - ``error`` and ``ignore`` modes are not supported. .. note:: diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py index 44553539b..8ca74b06d 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py @@ -1,4 +1,5 @@ import logging +import re import textwrap import pytest @@ -225,6 +226,8 @@ def test_hive_writer_default_not_partitioned(spark, processing, get_schema_table "options", [ Hive.WriteOptions(if_exists="append"), + Hive.WriteOptions(if_exists="ignore"), + Hive.WriteOptions(if_exists="error"), Hive.WriteOptions(if_exists="replace_entire_table"), Hive.WriteOptions(if_exists="replace_overlapping_partitions"), ], @@ -363,6 +366,105 @@ def test_hive_writer_insert_into_append(spark, processing, get_schema_table, ori ) +@pytest.mark.parametrize( + "original_options, new_options", + [ + pytest.param({}, {"partitionBy": "id_int"}, id="table_not_partitioned_dataframe_is"), + pytest.param({"partitionBy": "text_string"}, {}, id="table_partitioned_dataframe_is_not"), + pytest.param({"partitionBy": "text_string"}, {"partitionBy": "id_int"}, id="different_partitioning_schema"), + pytest.param({"partitionBy": "id_int"}, {"partitionBy": "id_int"}, id="same_partitioning_schema"), + ], +) +def test_hive_writer_insert_into_ignore(spark, processing, get_schema_table, original_options, new_options, caplog): + df = processing.create_spark_df(spark=spark) + + df1 = df[df.id_int <= 25] + df2 = df.where("id_int > 25 AND id_int <= 50") + df3 = df[df.id_int > 50] + + hive = Hive(cluster="rnd-dwh", spark=spark) + writer1 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=original_options, + ) + # create & fill up the table with some data + writer1.run(df1.union(df2)) + old_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + + writer2 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=Hive.WriteOptions(if_exists="ignore", **new_options), + ) + + with caplog.at_level(logging.INFO): + writer2.run(df1.union(df3)) + + assert "|Hive| Skip writing to existing table because of Hive.WriteOptions(if_exists='ignore')" in caplog.text + + new_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + + # table DDL remains the same + assert new_ddl == old_ddl + + # table should only contain old data, because 'ignore' should not have added new data + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df1.union(df2), + order_by="id_int", + ) + + +@pytest.mark.parametrize( + "original_options, new_options", + [ + pytest.param({}, {"partitionBy": 
"id_int"}, id="table_not_partitioned_dataframe_is"), + pytest.param({"partitionBy": "text_string"}, {}, id="table_partitioned_dataframe_is_not"), + pytest.param({"partitionBy": "text_string"}, {"partitionBy": "id_int"}, id="different_partitioning_schema"), + pytest.param({"partitionBy": "id_int"}, {"partitionBy": "id_int"}, id="same_partitioning_schema"), + ], +) +def test_hive_writer_insert_into_error(spark, processing, get_schema_table, original_options, new_options, caplog): + df = processing.create_spark_df(spark=spark) + + hive = Hive(cluster="rnd-dwh", spark=spark) + writer1 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=original_options, + ) + + # Create & fill up the table with some data + writer1.run(df) + old_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + + writer2 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=Hive.WriteOptions(if_exists="error", **new_options), + ) + + with pytest.raises( + ValueError, + match=re.escape("Operation stopped due to Hive.WriteOptions(if_exists='error')"), + ): + writer2.run(df) + + # table DDL remains the same + new_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + assert new_ddl == old_ddl + + # validate that the table contains only old data + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df, + order_by="id_int", + ) + + @pytest.mark.parametrize( "original_options, new_options", [ diff --git a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py index 7e633206e..6469b10c8 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py @@ -153,6 +153,8 @@ def test_hive_write_options_unsupported_insert_into(insert_into): ({"if_exists": "append"}, HiveTableExistBehavior.APPEND), ({"if_exists": "replace_overlapping_partitions"}, HiveTableExistBehavior.REPLACE_OVERLAPPING_PARTITIONS), ({"if_exists": "replace_entire_table"}, HiveTableExistBehavior.REPLACE_ENTIRE_TABLE), + ({"if_exists": "error"}, HiveTableExistBehavior.ERROR), + ({"if_exists": "ignore"}, HiveTableExistBehavior.IGNORE), ], ) def test_hive_write_options_if_exists(options, value): @@ -198,6 +200,18 @@ def test_hive_write_options_if_exists(options, value): "Mode `overwrite_table` is deprecated since v0.9.0 and will be removed in v1.0.0. " "Use `replace_entire_table` instead", ), + ( + {"mode": "error"}, + HiveTableExistBehavior.ERROR, + "Option `Hive.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. " + "Use `Hive.WriteOptions(if_exists=...)` instead", + ), + ( + {"mode": "ignore"}, + HiveTableExistBehavior.IGNORE, + "Option `Hive.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. 
" + "Use `Hive.WriteOptions(if_exists=...)` instead", + ), ], ) def test_hive_write_options_mode_deprecated(options, value, message): @@ -209,10 +223,6 @@ def test_hive_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( "options", [ - # disallowed modes - {"mode": "error"}, - {"mode": "ignore"}, - # wrong mode {"mode": "wrong_mode"}, ], ) From 9cde99381232fe14d836c09896a0ced42226f3cb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Sep 2023 05:30:00 +0000 Subject: [PATCH 08/26] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 23.7.0 → 23.9.1](https://github.com/psf/black/compare/23.7.0...23.9.1) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fd0c89d6b..aea58de6b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -64,7 +64,7 @@ repos: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 23.9.1 hooks: - id: black language_version: python3 From 1aa3fc07343b016c8aca46b74eef5ddd86fd96d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 12 Sep 2023 10:42:01 +0000 Subject: [PATCH 09/26] [DOP-8511] Fix MongoDB documentation example --- onetl/connection/db_connection/mongodb/connection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 5a8640a68..7f02f20ef 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -124,7 +124,7 @@ class MongoDB(DBConnection): from pyspark.sql import SparkSession # Create Spark session with MongoDB connector loaded - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = MongoDB.get_packages(spark_version="3.2") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) From 2caf72b27a4e50f8b7383731c4b5d2525b9c2c56 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 04:54:50 +0000 Subject: [PATCH 10/26] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.10.1 → v3.11.0](https://github.com/asottile/pyupgrade/compare/v3.10.1...v3.11.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aea58de6b..d43a63307 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -59,7 +59,7 @@ repos: - id: rst-inline-touching-normal - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.10.1 + rev: v3.11.0 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] From 982d96cc11536b42f676a881a2eeddfefa9fcee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= 
=?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 18 Sep 2023 11:19:38 +0000 Subject: [PATCH 11/26] [DOP-8959] Add Excel file format support --- .github/workflows/data/greenplum/matrix.yml | 2 +- .github/workflows/data/local-fs/matrix.yml | 17 +- .github/workflows/data/mongodb/matrix.yml | 2 +- .github/workflows/data/s3/matrix.yml | 2 +- README.rst | 4 +- docs/changelog/next_release/148.feature.rst | 1 + .../db_connection/greenplum/prerequisites.rst | 4 +- docs/file_df/file_formats/avro.rst | 2 +- docs/file_df/file_formats/excel.rst | 9 + docs/file_df/file_formats/index.rst | 1 + docs/file_df/file_formats/orc.rst | 2 +- .../db_connection/kafka/connection.py | 4 + .../db_connection/mongodb/connection.py | 2 + .../file_df_connection/spark_s3/connection.py | 2 + onetl/file/format/__init__.py | 1 + onetl/file/format/avro.py | 4 + onetl/file/format/excel.py | 220 ++++++++++++++++++ .../{spark-3.2.3.txt => spark-3.2.4.txt} | 2 +- tests/fixtures/spark.py | 19 +- .../file_df_connection/generate_files.py | 92 ++++++++ .../xls/with_data_address/file.xls | Bin 0 -> 5632 bytes .../xls/with_header/file.xls | Bin 0 -> 5632 bytes .../xls/without_header/file.xls | Bin 0 -> 5632 bytes .../xlsx/with_data_address/file.xls | Bin 0 -> 4891 bytes .../xlsx/with_header/file.xls | Bin 0 -> 5026 bytes .../xlsx/without_header/file.xls | Bin 0 -> 4881 bytes tests/resources/requirements.txt | 3 + .../test_avro_integration.py | 11 +- .../test_csv_integration.py | 7 +- .../test_excel_integration.py | 142 +++++++++++ .../test_format_unit/test_excel_unit.py | 106 +++++++++ tests/util/spark_df.py | 2 +- 32 files changed, 639 insertions(+), 24 deletions(-) create mode 100644 docs/changelog/next_release/148.feature.rst create mode 100644 docs/file_df/file_formats/excel.rst create mode 100644 onetl/file/format/excel.py rename requirements/tests/{spark-3.2.3.txt => spark-3.2.4.txt} (80%) create mode 100644 tests/resources/file_df_connection/xls/with_data_address/file.xls create mode 100644 tests/resources/file_df_connection/xls/with_header/file.xls create mode 100644 tests/resources/file_df_connection/xls/without_header/file.xls create mode 100644 tests/resources/file_df_connection/xlsx/with_data_address/file.xls create mode 100644 tests/resources/file_df_connection/xlsx/with_header/file.xls create mode 100644 tests/resources/file_df_connection/xlsx/without_header/file.xls create mode 100644 tests/tests_integration/test_file_format_integration/test_excel_integration.py create mode 100644 tests/tests_unit/test_file/test_format_unit/test_excel_unit.py diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml index 43e02d8c2..292319bb5 100644 --- a/.github/workflows/data/greenplum/matrix.yml +++ b/.github/workflows/data/greenplum/matrix.yml @@ -7,7 +7,7 @@ min: &min max: &max # Greenplum connector does not support Spark 3.3+ - spark-version: 3.2.3 + spark-version: 3.2.4 python-version: '3.10' java-version: 11 os: ubuntu-latest diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index af841433b..e956169ba 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -4,12 +4,18 @@ min: &min java-version: 8 os: ubuntu-latest -avro: &avro +min_avro: &min_avro spark-version: 2.4.8 python-version: '3.7' java-version: 8 os: ubuntu-latest +min_excel: &min_excel + spark-version: 3.2.4 + python-version: '3.7' + java-version: 8 + os: ubuntu-latest + max: &max spark-version: 3.4.1 python-version: '3.11' 
@@ -25,12 +31,15 @@ latest: &latest matrix: small: - <<: *max - - <<: *avro + - <<: *min_avro + - <<: *min_excel full: - <<: *min - - <<: *avro + - <<: *min_avro + - <<: *min_excel - <<: *max nightly: - <<: *min - - <<: *avro + - <<: *min_avro + - <<: *min_excel - <<: *latest diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index 80f81aacf..f91e1baaa 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -1,6 +1,6 @@ min: &min # MongoDB connector does not support Spark 2 - spark-version: 3.2.3 + spark-version: 3.2.4 python-version: '3.7' java-version: 8 os: ubuntu-latest diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index 57fe2ca8f..44779fe95 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -2,7 +2,7 @@ min: &min # prior image versions returns empty content of bucket root, some kind of bug minio-version: 2021.3.17 # Minimal Spark version with Hadoop 3.x support - spark-version: 3.2.3 + spark-version: 3.2.4 python-version: '3.7' java-version: 8 os: ubuntu-latest diff --git a/README.rst b/README.rst index 6e21bb672..e20086214 100644 --- a/README.rst +++ b/README.rst @@ -169,9 +169,9 @@ Compatibility matrix +--------------------------------------------------------------+-------------+-------------+-------+ | `2.4.x `_ | 3.7 only | 8 only | 2.11 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +| `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | +| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ | `3.4.x `_ | 3.7 - 3.11 | 8u362 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ diff --git a/docs/changelog/next_release/148.feature.rst b/docs/changelog/next_release/148.feature.rst new file mode 100644 index 000000000..87b1b48a8 --- /dev/null +++ b/docs/changelog/next_release/148.feature.rst @@ -0,0 +1 @@ +Add ``Excel`` file format support. diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 964d9cdcf..815a12b27 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -22,7 +22,7 @@ You can install PySpark as follows: .. code:: bash - pip install onetl pyspark=3.2.3 # pass specific PySpark version + pip install onetl pyspark=3.2.4 # pass specific PySpark version See :ref:`spark-install` instruction for more details. @@ -158,7 +158,7 @@ Inserting ``.jar`` file to Spark jars folder Can be used to embed ``.jar`` files to a default Spark classpath. * Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``. +* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.4/jars/``. 
* Create Spark session **WITHOUT** passing Greenplum package name to ``spark.jars.packages`` diff --git a/docs/file_df/file_formats/avro.rst b/docs/file_df/file_formats/avro.rst index 6251a5154..7f1ec0d4f 100644 --- a/docs/file_df/file_formats/avro.rst +++ b/docs/file_df/file_formats/avro.rst @@ -1,7 +1,7 @@ .. _avro-file-format: Avro -======== +==== .. currentmodule:: onetl.file.format.avro diff --git a/docs/file_df/file_formats/excel.rst b/docs/file_df/file_formats/excel.rst new file mode 100644 index 000000000..f9b680085 --- /dev/null +++ b/docs/file_df/file_formats/excel.rst @@ -0,0 +1,9 @@ +.. _excel-file-format: + +Excel +===== + +.. currentmodule:: onetl.file.format.excel + +.. autoclass:: Excel + :members: get_packages diff --git a/docs/file_df/file_formats/index.rst b/docs/file_df/file_formats/index.rst index 7e3367bc6..3a39bc061 100644 --- a/docs/file_df/file_formats/index.rst +++ b/docs/file_df/file_formats/index.rst @@ -9,6 +9,7 @@ File Formats avro csv + excel json jsonline orc diff --git a/docs/file_df/file_formats/orc.rst b/docs/file_df/file_formats/orc.rst index 2d82b3584..491492bac 100644 --- a/docs/file_df/file_formats/orc.rst +++ b/docs/file_df/file_formats/orc.rst @@ -1,7 +1,7 @@ .. _orc-file-format: ORC -======== +=== .. currentmodule:: onetl.file.format.orc diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index 3aa8f0fd2..cf9a669c9 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -72,6 +72,7 @@ class Kafka(DBConnection): * Apache Kafka versions: 0.10 or higher * Spark versions: 2.4.x - 3.4.x + * Scala versions: 2.11 - 2.13 Parameters ---------- @@ -381,6 +382,9 @@ def get_packages( """ Get package names to be downloaded by Spark. |support_hooks| + See `Maven package index `_ + for all available packages. + Parameters ---------- spark_version : str diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 7f02f20ef..771fb3b69 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -65,6 +65,7 @@ class MongoDB(DBConnection): * MongoDB server versions: 4.0 or higher * Spark versions: 3.2.x - 3.4.x * Java versions: 8 - 20 + * Scala versions: 2.11 - 2.13 See `official documentation `_. @@ -206,6 +207,7 @@ def get_packages( if scala_ver.digits(2) < (2, 12) or scala_ver.digits(2) > (2, 13): raise ValueError(f"Scala version must be 2.12 - 2.13, got {scala_ver}") + # https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector return [f"org.mongodb.spark:mongo-spark-connector_{scala_ver.digits(2)}:10.1.1"] @classproperty diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 0fd72a0ca..464487f52 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -63,6 +63,7 @@ class SparkS3(SparkFileDFConnection): * Spark versions: 3.2.x - 3.4.x (only with Hadoop 3.x libraries) * Java versions: 8 - 20 + * Scala versions: 2.11 - 2.13 .. 
warning:: @@ -263,6 +264,7 @@ def get_packages( raise ValueError(f"Spark version must be at least 3.x, got {spark_ver}") scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + # https://mvnrepository.com/artifact/org.apache.spark/spark-hadoop-cloud return [f"org.apache.spark:spark-hadoop-cloud_{scala_ver.digits(2)}:{spark_ver.digits(3)}"] @slot diff --git a/onetl/file/format/__init__.py b/onetl/file/format/__init__.py index d41c76aac..0c9d6b742 100644 --- a/onetl/file/format/__init__.py +++ b/onetl/file/format/__init__.py @@ -15,6 +15,7 @@ from onetl.file.format.avro import Avro from onetl.file.format.csv import CSV +from onetl.file.format.excel import Excel from onetl.file.format.json import JSON from onetl.file.format.jsonline import JSONLine from onetl.file.format.orc import ORC diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 2fc5a1cb5..b0c58e18d 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -73,6 +73,7 @@ class Avro(ReadWriteFileFormat): * Spark versions: 2.4.x - 3.4.x * Java versions: 8 - 20 + * Scala versions: 2.11 - 2.13 See documentation from link above. @@ -131,6 +132,9 @@ def get_packages( """ Get package names to be downloaded by Spark. |support_hooks| + See `Maven package index `_ + for all available packages. + Parameters ---------- spark_version : str diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py new file mode 100644 index 000000000..ffd11a5da --- /dev/null +++ b/onetl/file/format/excel.py @@ -0,0 +1,220 @@ +# Copyright 2023 MTS (Mobile Telesystems) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, ClassVar + +from onetl._util.java import try_import_java_class +from onetl._util.scala import get_default_scala_version +from onetl._util.spark import get_spark_version +from onetl._util.version import Version +from onetl.exception import MISSING_JVM_CLASS_MSG +from onetl.file.format.file_format import ReadWriteFileFormat +from onetl.hooks import slot, support_hooks + +if TYPE_CHECKING: + from pyspark.sql import SparkSession + +READ_OPTIONS = frozenset( + ( + "dataAddress", + "treatEmptyValuesAsNulls", + "setErrorCellsToFallbackValues", + "usePlainNumberFormat", + "inferSchema", + "addColorColumns", + "timestampFormat", + "maxRowsInMemory", + "maxByteArraySize", + "tempFileThreshold", + "excerptSize", + "workbookPassword", + ), +) + +WRITE_OPTIONS = frozenset( + ( + "dataAddress", + "dateFormat", + "timestampFormat", + ), +) + +log = logging.getLogger(__name__) + + +@support_hooks +class Excel(ReadWriteFileFormat): + """ + Excel file format. |support_hooks| + + Based on `Spark Excel `_ file format. + + Supports reading/writing files with ``.xlsx`` (read/write) and ``.xls`` (read only) extensions. + + .. versionadded:: 0.9.4 + + .. dropdown:: Version compatibility + + * Spark versions: 3.2.x - 3.4.x. + + .. 
warning:: + + Not all combinations of Spark version and package version are supported. + See `Maven index `_ + and `official documentation `_. + + * Scala versions: 2.12 - 2.13 + * Java versions: 8 - 20 + + See documentation from link above. + + .. note :: + + You can pass any option to the constructor, even if it is not mentioned in this documentation. + **Option names should be in** ``camelCase``! + + The set of supported options depends on Spark version. See link above. + + Examples + -------- + + Describe options how to read from/write to Excel file with specific options: + + .. code:: python + + from onetl.file.format import Excel + from pyspark.sql import SparkSession + + # Create Spark session with Excel package loaded + maven_packages = Excel.get_packages(spark_version="3.4.1") + spark = ( + SparkSession.builder.appName("spark-app-name") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + + excel = Excel( + header=True, + inferSchema=True, + ) + + """ + + name: ClassVar[str] = "excel" + + header: bool = False + + class Config: + known_options = READ_OPTIONS | WRITE_OPTIONS + extra = "allow" + + @slot + @classmethod + def get_packages( + cls, + spark_version: str, + scala_version: str | None = None, + package_version: str | None = None, + ) -> list[str]: + """ + Get package names to be downloaded by Spark. |support_hooks| + + .. warning:: + + Not all combinations of Spark version and package version are supported. + See `Maven index `_ + and `official documentation `_. + + Parameters + ---------- + spark_version : str + Spark version in format ``major.minor.patch``. + + scala_version : str, optional + Scala version in format ``major.minor``. + + If ``None``, ``spark_version`` is used to determine Scala version. + + version: str, optional + Package version in format ``major.minor.patch``. Default is ``0.19.0``. + + .. warning:: + + Version ``0.14`` and below are not supported. + + .. note:: + + It is not guaranteed that custom package versions are supported. + Tests are performed only for default version. + + Examples + -------- + + .. code:: python + + from onetl.file.format import Excel + + Excel.get_packages(spark_version="3.4.1") + Excel.get_packages(spark_version="3.4.1", scala_version="2.13") + Excel.get_packages( + spark_version="3.4.1", + scala_version="2.13", + package_version="0.19.0", + ) + + """ + + if package_version: + version = Version.parse(package_version) + if version < (0, 15): + # format="com.crealytics.spark.excel" does not support reading folder with files + # format="excel" was added only in 0.14, but Maven package for 0.14 has different naming convention than recent versions. + # So using 0.15 as the lowest supported version. + raise ValueError(f"Package version should be at least 0.15, got {package_version}") + log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) + else: + version = Version.parse("0.19.0") + + spark_ver = Version.parse(spark_version) + if spark_ver < (3, 2): + # Actually, Spark 2.4 is supported, but packages are built only for Scala 2.12 + # when default pyspark==2.4.1 is built with Scala 2.11. 
+ # See https://github.com/crealytics/spark-excel/issues/426 + raise ValueError(f"Spark version should be at least 3.2, got {spark_version}") + + scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + if scala_ver.digits(2) < (2, 12): + raise ValueError(f"Scala version should be at least 2.12, got {scala_ver}") + + return [f"com.crealytics:spark-excel_{scala_ver.digits(2)}:{spark_ver.digits(3)}_{version.digits(3)}"] + + @slot + def check_if_supported(self, spark: SparkSession) -> None: + java_class = "com.crealytics.spark.excel.v2.ExcelDataSource" + + try: + try_import_java_class(spark, java_class) + except Exception as e: + spark_version = get_spark_version(spark) + msg = MISSING_JVM_CLASS_MSG.format( + java_class=java_class, + package_source=self.__class__.__name__, + args=f"spark_version='{spark_version}'", + ) + if log.isEnabledFor(logging.DEBUG): + log.debug("Missing Java class", exc_info=e, stack_info=True) + raise ValueError(msg) from e diff --git a/requirements/tests/spark-3.2.3.txt b/requirements/tests/spark-3.2.4.txt similarity index 80% rename from requirements/tests/spark-3.2.3.txt rename to requirements/tests/spark-3.2.4.txt index 44291430a..1acafab9a 100644 --- a/requirements/tests/spark-3.2.3.txt +++ b/requirements/tests/spark-3.2.4.txt @@ -1,5 +1,5 @@ numpy>=1.16,<1.24 pandas>=1.0,<2 pyarrow>=1.0 -pyspark==3.2.3 +pyspark==3.2.4 sqlalchemy<2.0 diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index 2135f3b52..05358b9c0 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -44,7 +44,7 @@ def maven_packages(): SparkS3, Teradata, ) - from onetl.file.format import Avro + from onetl.file.format import Avro, Excel pyspark_version = get_pyspark_version() packages = ( @@ -74,9 +74,23 @@ def maven_packages(): # There is no MongoDB connector for Spark less than 3.2 packages.extend(MongoDB.get_packages(spark_version=pyspark_version)) + # There is no Excel files support for Spark less than 3.2 + packages.extend(Excel.get_packages(spark_version=pyspark_version)) + return packages +@pytest.fixture(scope="session") +def excluded_packages(): + # These packages are a part of org.apache.spark:spark-hadoop-cloud, but not used in tests + return [ + "com.google.cloud.bigdataoss:gcs-connector", + "org.apache.hadoop:hadoop-aliyun", + "org.apache.hadoop:hadoop-azure-datalake", + "org.apache.hadoop:hadoop-azure", + ] + + @pytest.fixture( scope="session", name="spark", @@ -84,13 +98,14 @@ def maven_packages(): pytest.param("real-spark", marks=[pytest.mark.db_connection, pytest.mark.connection]), ], ) -def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, maven_packages): +def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, maven_packages, excluded_packages): from pyspark.sql import SparkSession spark = ( SparkSession.builder.config("spark.app.name", "onetl") # noqa: WPS221 .config("spark.master", "local[*]") .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.jars.excludes", ",".join(excluded_packages)) .config("spark.jars.ivySettings", os.fspath(ivysettings_path)) .config("spark.driver.memory", "1g") .config("spark.driver.maxResultSize", "1g") diff --git a/tests/resources/file_df_connection/generate_files.py b/tests/resources/file_df_connection/generate_files.py index 874045f8c..698c81ea7 100755 --- a/tests/resources/file_df_connection/generate_files.py +++ b/tests/resources/file_df_connection/generate_files.py @@ -14,10 +14,13 @@ from contextlib import 
contextmanager from datetime import date, datetime, timezone from pathlib import Path +from tempfile import gettempdir from typing import TYPE_CHECKING, Any, Iterator, TextIO +from zipfile import ZipFile if TYPE_CHECKING: from avro.schema import Schema as AvroSchema + from pandas import DataFrame as PandasDataFrame from pyarrow import Schema as ArrowSchema from pyarrow import Table as ArrowTable @@ -85,6 +88,12 @@ def get_data() -> list[dict]: ] +def get_pandas_dataframe(data: list[dict]) -> PandasDataFrame: + import pandas as pd + + return pd.DataFrame(data) + + def get_pyarrow_schema() -> ArrowSchema: import pyarrow as pa @@ -382,6 +391,87 @@ def save_as_avro(data: list[dict], path: Path) -> None: save_as_avro_snappy(data, root / "with_compression") +def save_as_xls_with_options( + data: list[dict], + path: Path, + index: bool = False, + **kwargs, +) -> None: + # required to register xlwt writer which supports generating .xls files + import pandas_xlwt + + path.mkdir(parents=True, exist_ok=True) + file = path / "file.xls" + + df = get_pandas_dataframe(data) + df["datetime_value"] = df.datetime_value.dt.tz_localize(None) + df.to_excel(file, index=index, engine="xlwt", **kwargs) + + +def make_zip_deterministic(path: Path) -> None: + temp_dir = gettempdir() + file_copy = Path(shutil.copy(path, temp_dir)) + + with ZipFile(file_copy, "r") as original_file: + with ZipFile(path, "w") as new_file: + for item in original_file.infolist(): + if item.filename == "docProps/core.xml": + # this file contains modification time, which produces files with different hashes + continue + # reset modification time of all files + item.date_time = (1980, 1, 1, 0, 0, 0) + new_file.writestr(item, original_file.read(item.filename)) + + +def save_as_xlsx_with_options( + data: list[dict], + path: Path, + index: bool = False, + **kwargs, +) -> None: + path.mkdir(parents=True, exist_ok=True) + file = path / "file.xls" + + df = get_pandas_dataframe(data) + df["datetime_value"] = df.datetime_value.dt.tz_localize(None) + df.to_excel(file, index=index, engine="openpyxl", **kwargs) + make_zip_deterministic(file) + + +def save_as_xlsx(data: list[dict], path: Path) -> None: + root = path / "xlsx" + shutil.rmtree(root, ignore_errors=True) + root.mkdir(parents=True, exist_ok=True) + + save_as_xlsx_with_options(data, root / "without_header", header=False) + save_as_xlsx_with_options(data, root / "with_header", header=True) + save_as_xlsx_with_options( + data, + root / "with_data_address", + header=False, + sheet_name="ABC", + startcol=10, + startrow=5, + ) + + +def save_as_xls(data: list[dict], path: Path) -> None: + root = path / "xls" + shutil.rmtree(root, ignore_errors=True) + root.mkdir(parents=True, exist_ok=True) + + save_as_xls_with_options(data, root / "without_header", header=False) + save_as_xls_with_options(data, root / "with_header", header=True) + save_as_xls_with_options( + data, + root / "with_data_address", + header=False, + sheet_name="ABC", + startcol=10, + startrow=5, + ) + + format_mapping = { "csv": save_as_csv, "json": save_as_json, @@ -389,6 +479,8 @@ def save_as_avro(data: list[dict], path: Path) -> None: "orc": save_as_orc, "parquet": save_as_parquet, "avro": save_as_avro, + "xlsx": save_as_xlsx, + "xls": save_as_xls, } diff --git a/tests/resources/file_df_connection/xls/with_data_address/file.xls b/tests/resources/file_df_connection/xls/with_data_address/file.xls new file mode 100644 index 0000000000000000000000000000000000000000..28288eb8e5e89ab2d4059d39a10c5b0b10133a9f GIT binary patch literal 5632 
zcmeI0U1$_n6vzLw*^imoWV1VoqA4y*sBL~)H&sI@l6fe>KUoXmJiEcV-4as0Bc7T~~?C9?3Y3oV`%drzS&_bD!$;?==AQ_y1FHn(hN;Ngj&SYjU zT{?(L&X67WGn)ebkO+w!PWkr78wUd^;4fi{xbcrLFC*DaCU_*f#Vi-wKMDkaO~}5a zE3Ow&G}8dS3+6XGe;<=%=I{cb^>wZaw|LGi>Nt(y?kDkr{iICSSt)fp%C(-*)fR_| z`(NHZms#DMUZs{zw1~*!n<9Nd7VpR%ygA0T2$b?`gL@Wg^^Rv%zbig?O6ygsE9uR> zqo#RWuCi#e`fSnG8glAX&Znp5)t_O#_vQ-+lD${c(^%itIP_-au8ZkDRP)W(PKJCo zW4bZj*#4==!L(Nz+-YRD`c<{NPv!4WW+1L3h;vLTMmgRN*tFS5=)OD5T8J@EXmdZG z7vgvm_?_W!@X6i)%21+lh;e;RzB}o~zK^ffyU!@6-!1wHMfUJwN?0k9A)$l|C}E|1 z#k@NB>3~NGbGNNP<8O2JHyn257Vku>Oj$$cA@+)JplyZrkbI^LiqgYDWdurPc$6H;BrIgF`6N*H?m|`oX zDo3#kD7I3z^ELxFeB^0O^l@l4AFMb642>223Bj88CkHC`<0Xdz?t2}q8 z*AuFoALVo>|LY9%E2S@EGXhJno}br{!a{Vj?7&(K(vi3w-#S>p#BqqoDCY7xEMG(a1yd^T+=Py70gF{{lfZDN6tV literal 0 HcmV?d00001 diff --git a/tests/resources/file_df_connection/xls/with_header/file.xls b/tests/resources/file_df_connection/xls/with_header/file.xls new file mode 100644 index 0000000000000000000000000000000000000000..efb43b4a972dd3618e170aaf8e19bbcf06e7189a GIT binary patch literal 5632 zcmeI0ZHQD=7{~wj&b>3U+rG`6RcfPQSj%$L51U`Y(3wPAzl>N5nQLI9JDbZ+yD?2g zL1pQF>kWE=tS<`|RG?{KRw@Z)(Ti_EMo{i51yK}5sf0HDpXbcAnM*7ak!0Lwm~+na zoadb1InQ~ycjnN+#N@>Gt|Kx!n&Mo@ zsEfB-KaR;-%6d=1#%s<#roeH2Z^fNhJ~moNpGR4&W5`=FhJ{Qmyg*AEF40kWny%9{ zsp&TL+^uO=Estrs5FRG20UXRM@41$Jfj!+~cWYf6Qwwisn=jPTiy8QUK9z@_l+Dgs z7uT1D^Tq!_g~d_rmT>$lqe?j~*aB{WzQSmslrN^7cD#sYU=?LvE;nz)+&WwEYb6c*B7+opCmq_)ZyYS7fz!eiG4YRaT`bw&rxS^6I-B7F!4O)h%VUEh7#?Kd zu5xM3J^A9@1(t_L%cjhoHkdCLjM8S5^6(v1!$rVQaV%f;C4udGO?HjR=CcEs%dDgB zGHcaC-okVc?OIOANYH^e&b<87k=)u9nKf$JK#Pc+{3~Q$sNi8~!NwdB#nj6c*`qD4 z*7Ona!KGU6(zKJ;1{t$g%%~P_*r-06w01W8nXmF`=E|D#RV7B!i_(QInctI;CVBIh)Rn#9P|xzt>0!jbm1oCob@wRyA}f0>8T_Ar;d>EQ=9 zOXArbHW?Ad+hXsXxcua0t0$iRI&1oH>Gvc3gd*b#QpD!ztYlQODt}UJrM$ukB=O-c zpJFSe@B8a^F5Yh_Hs|5tRr(1<<{PAl&ErltbBQQLo}Dt6h*E|aiX?vC>r*0189BZx zvFVhdM3izn{e&WM3{u4A@na}X9mP>fH=#-NyysIKr9AQZy~U+-zR`#|N|{AJp~(6K zDPr?5vr2K5BD-AXB74}wS5%W2_{gWYO4)JXnfLb(7>cWuWAqb>tZ|ScHV<7@O0{Fsj`Rv`-yVpNzC{d+6Pd}kZyn__6c}P_$u{ugjDL1zWW$#g+5>v`= zZykQ%+dYO7Q_4#EEwVlAlnsTfBvmmd#@E6;j6fKHFalu&!U$ZT5s;TPdCJL4jl6Q0 zmjvhoR{qyH%dh02{68ab6UO){BaNjPWnRL4+y#xxOt_XG8QDvXi2r|~_^u?qgb}RO paf~$5yv~O|d(QD?9Cek`DdzXkbRLb?{cV2j{2PT#GeE2g&agWhvmEweBQztbnR`2$II zwg&N1&#LyW@zM}=im)AA>4s!?vZuW(>XhIMOaNLbGZTrK^XEsMDfkj)nWk7{!{1{1Wr+Pc{w~V!$0?ikSEpa9ti}4|9PZXD{R`=h{EC9bgMm zOLfQfalf_DAg4mh%4BlQtb3R$$-`#@z=pNl7^Zl^6tkI0Fh@#h){!z%w^|YPa+GM@ z)eTpQFz$VI<5;4jIle?KTWArH{r9X42PwQIckt$7Y0v?fgQ`aFcwY6};)AEPUZJ|2 zo0rE&Rr92D<>jsF)2ppjgsQ;sj^`?&?{jSqbC=$I~ir8H3Unnw*M-)pb zUvpg}IJn!RSX|^XG+dp#Yu=!t$bsnM8~O=F=9)_po6E#8bCET1bu=;;TPbe@g>rMB zN3oUCdA`-{y=^GAQdZDUD3WZs6tTJN#ZUrSlz>tynWqud?Dr@ErM!P?UDtx@OB%6& zQl6ooP$biGDPnUmIZbhtBJ-2EI7&IoWR0NhkVkQpvg63d2L{^=#Zk&R`U!(1(^b?9?Qtn*D<|3M=gt90hrMy-o zlzk^XN=PYpe;nR;d9R^_l(LY1b#?^1BowF@l@^o9-j!ea5%446N5GGO9|1oC|IrA@ zOQAf+<%LRKcgzcldC8Xlb?);kr7z<%0VJ`P`o6!ygq7`7(~I#<4EUFR`&a@@@Oa{LK013mL=jU-AC}a4RR? 
literal 0 HcmV?d00001 diff --git a/tests/resources/file_df_connection/xlsx/with_data_address/file.xls b/tests/resources/file_df_connection/xlsx/with_data_address/file.xls new file mode 100644 index 0000000000000000000000000000000000000000..bf2343c0a7cf9bb4c811a0925a2a903e64f845d9 GIT binary patch literal 4891 zcmZ`-2UHX5wod3xA%q&bG!di)lr9hn(h(4mNCzSGl0y$D(m}fPa_At^B=laD-jo(; z0@6g9ph&ns2YY_k6p|*3}>+q5}W`WcbAefE#yG1>sNC_(g+X z)-G1M?k=t#A{MT$!ro4fS}|IbU7|F%f2eQHJKx*m6ytoGztrz{=jM$#YnW#U&R@SX z7Av9xsa9s4wMFuSz-C!}NNOf3#a`^{w-krNn2_lfjyjhU3*z>~@5H))=$rO1lbePF z0LbJA0Kj;Cy&XkRb~a8nzjv{pD(@I0Jm#e60(L9Myls!2DEn?asl$Lhke*mfTJVlG zM+nI1nWK+&d~jg9>J6H~pL_(XVBPy!v8&W=ppM@PI+}whK&FH2KoA?uF zQ5GZ4z!8_?Rc+(&WrMDs4tv{Mh7PaCC+<8B>PIILH>n-xPtuc#Uy_x{G8*ypKknJd zi5{3kNjO24R`$K6>INsGTv>^h?lxz4WqomMFHj=KiSnI z)kQ9KB}_bwn;I3U)rF9yzj<<`8pPH6zVhrSrMbTV>YIg@8F5GoM3w=IcbYRs6LB6J zt2xDq(^|?St}R03`={Ipi!gp?S+KhdtF5yC=<#uUA+Px8?g_=Vwwx`8VRDtp=;&&8 zv^#h6{IL=GSVz^Fd%Cov=E`&uoJS?+&Gn{LkW{VOC2{aAvaHZldtFw70&=nB+|X~B zzL%gSFuJ~t?9Jk{@ULYq>GJn;jnw9XztO-UA2K~YdW<`sk`}_pkMFg-&rdO;2c|GB zO1zL1>Yeg_-mzbvDaQt;9ZsSoQ3_4r;iD~;4$!&-J{K&UJI|a{+)EPS`K}%xi>%=; zM)3`AKp~y*N|wLE>?Y)V&xubmB`H|rS=b0{S!&>1!N}%4_DY_Z^ztr{pt6D`^9_i5 zEyI&Kx6)v9W7@uPZ|w;T2>0NnWTJxkn@_{>e~nDVUj`+Z-o(E<5)}t(XAhN9Pv6ON zGa@?6HtkmH+z1+cbk-0M`VQ(S6!TT}Eu;D>HDB?qO>qg*{PYLpt-*%&IWNK_6iV2| zDTwzQ*%j7&y(w-c_qFI&1o;7fjL+TdD*G1HAsr};9xdE${34$_QGb&gZQD5+g;;<) zBYI9N#O9_Ygs)oNcOQQ9+MqH~U`9HpxBmUT@woy)Q(!WNm)^Z_=lgcL@&k?wN}r`UnN%^87WURW@ALQy!YF zj3=yMi)?D0D+Romov!FfcOg^3wq*kgHr1W?*hcRm4yxMh7OA z?^q=<7OCeQGV8ak3##?plWJFi*3qUgwXA%6?WV2YKJAE$n&Y(1cA6rdj|SC9@=5qcV`Z92hA zOBY#1Ma;$MzOuWUXzvw0{axsZQ?P-!^*hD^SVX5h)EzcszXXPKg>^CVy_AF$nb6GQhE8qXo~$rd{NpJE3-go`VlMyD+uUB zM@Bvdm07kmRqCAZX3Apx_MdQO;HIzz$MZ{6{%^~S3#|n~w=_uoITA>$q7h#t%wp** z%ZsYZ`{u-`d>$JeQsq99S`1T3hd|#&y_9}N;iB|TaqSlN?IpY8OA@T{#HM0tbvN=? zwJGG@pYL9(>gtJE-ic}l{#ihV&)Ht45&!^!`1;56yMTDPxI1{*+1Q{wM1I|VRgve( zJI)!ROd&4BZa@+ioFs8U#U!SbL#e#(tSBP!Z-SBN3`eh983k`WXJT*1TOP9K94hcd zJYX&KW%E?t*wC3ihJki53xVnh7PVj#c|ap$YpG+wmc3JveFl{u@77U6i32#Ph-Y)F zD0Z;dyIgNgxoxv8F(xqRFH=xdGLT6MSuE?~KI_A{n6E-4D#N&)QRNhrQBP zjmEKSx%6F5=d#Yc)x${xp!A9a4QaL58Nm9?Lz*Aw9Lrx^= z#5uM|CfZK{dBEzvRhG<8PjQcP_%0k|!Q>vOvQ|jIe?zPaES42sny(5JYfYFr95{HK zIznT`UXqLooheZqa8F-}E_zU<`aFdq(s+wIaC%kc+jpk4=LcX%9Mm0b!8p=?Tj7yk#0pO8a7Z@Iv z`+CtbrcDGlfFJGPB#D$|C(kk-6dzF zchcFRcHFs4+JGe{?_923%4V_Ql4>F1k2|1YH4Q>AE6(6zL3U-^g zH=8|BK8`jXKO3QW!$p@lQJVAnLhUsGq<#>wh|u-tLKk4YL}R~ z4rsa^bXmha_%QD==WXjySZ0+@M;F83mhfg6|NW@+|G;*U81w1NB*$0oc z8ywCtzS`&LPf%}RI&J)GcEkBRUo!dDR14i}=KvwnANRM=-RH2g=Js-qBYw}j$z^#| zWkl(+`HM^GZ)%8U>y(XiPiK|rh^Ke$S5~iMn{qM)(e>?1ybgtENJn(`j-<%#vI&c6 z?t*u_GFY=*QvQC|noN;g%79gs1)c4)I%otmb2pML-pqXpF5z zzO2)c%0RGGSA+nGK*>; z;UQsm!0yX^`xUkWHx6^jvl3X9=Dv21c<(4AZm$u@t=0Xm4+kJDBbbt1y7h_F_u5c| zRmn957SE?{JX&HzzGCn5)q*m$336P~RS4SGNC+H2sY>&jiJ%^&T1Cc5LKU&{S^nEc zG+k?3ao%Cz-kQjtbWVwEX6?Y!7>uVhp3wj^cN<3!5#gVERzjn5w?1En`<@-CxFc0i7gfr1Mx%!`e%AoJQ^y;N z9En%1cq_y%88)GK8*2rtG8R%$6lQ)@-)hZJ>``z({NZ#~p*&VZN6F=JrHrPVGV+cF z^zz>DM4ePH!a%my$8nRJl~z|UzKD$z zH#-B0nDQ4cHKMfUF_WT9~0sx$NL#cnq?awM?>Ehz>GjgBZ9k=gN zq`^IRB!{B>;Tvo&WeUlzC}oIMl%J*%8#s z;f+nlVSGN(RG)<1<4#M9F{o0GnJ__pH|bLotqH!tEcFl_Vz?t*`L*fa9V@qYb= z_c7$Re}8l7|9bfsuWIAvFFzHf3aO-B14!zY!iY6Zj0A;gh!44GX?rUu>P13;aWW{2 zSJ~GCcl)(-?C89l|E@dafjf{4SPy>M1r#7Kk-{S@qB{1BFF!m8C25&xwWmqA+N3lC zwa)&WmX|4a4nzGlkMx&1dnL6;E!twTQL17spiU22)$FpXyV0rlVNf$UuSA?V@>;U1 zG3SSVPA8-xx=KeIv+l!q^Q%h5@CtdwB9*H*>ij}0V-x3-z>GI=l<@bzFaqY{jIwb? 
znHl)F+IX1!BvN^zmUg!&4c?G5%l~*#~ufZogICGNla({1~--%QO)0Ngx3o;eVfl;5qZ_dW`q)|DTFn zL|>dr{Kf(RkAYSA`^aDx!4W-TMDq9{Fjt}dV`D5 zi<$QyXbjaKdH5pmVkY|s=!I`c{zvow&1)CI7sK=)@bu+h;Qtb|7v)@xgn#6;;Ir$0 Yhznf}V!U5}4#CI(Kk%QJ1oZRiU#uEuS^xk5 literal 0 HcmV?d00001 diff --git a/tests/resources/file_df_connection/xlsx/with_header/file.xls b/tests/resources/file_df_connection/xlsx/with_header/file.xls new file mode 100644 index 0000000000000000000000000000000000000000..b19c54d02d7c37068481b74f543c0e22f9cc24ca GIT binary patch literal 5026 zcmZ`-1yqz<*B)94VHk16XI0onfo+z~ z7CLUuF7A9!TwHj)933=cHA&j}DenJN+njZJxX&WM@}*#**H=aKZoH+!^U!lYy|y^C z%n7!IGRqEnM}elSksOr_-FRly8bX6v z=Sr;) zTQC&l0Lt;xzd`J`5*KNh))?CYI>h{YL6jXcK~dM#y{P)eO%`BaKKAji)#zD+5L<^l zI%P{Ue^ZE!?6AJBvGZe=>eZVcQ5s+a;4^)*4VdlK2u!RLM$}78U!dt9;mttmQyEyrbe(SLv|FmMs_eMTLFgCXJ1}((;xMC3VfJEQ$pH zb$LGCKZ=)Xdi(TQGI)xqX;#i;AG$043V7Moi9c>TN}npM&`M6U z8++3%2I`N|R(4{ac-#7cZ6aBTLov7KPU8wttVWeg5F~RgJ1pH!hY`2%xp50330X`*xz_? z4QvR3+msLt&%$j+C4DXlPT(TMj0p@(xYi6GlvqP-oA#Lz9I=_@?LZ!RX> ziyYGq)wYeGzNZ)UFT%!z93ZjB%3o;JR>-;D$ZQG<6BT3{UH=$jV3+$YTv)n)yIoiG_|W)ry|G~=jX_@XnX08pjPn!UjLz@y@qd6d82ir?Ecnms3_Q+k`oMh zRv|DwAKMC1ER@h&&Hz$k2)8kr2yCPiP=U zJ&1SiIN9v<0xyYA;WzlF8#=B}j*OhGx9L_E>`A6Fbw?cLv+m}Fr4Mx|gnRowPPb0! z(h;U|bSUG1DAk$OPaHs~7A(!~ye0}p=;9tecxWa7>%!xS&-c@aR@`t|PcwPEJd(JK zF1D(1LfrPGcVv4Y-fo)?vM&8RXI0hqkZI^4?69)MPO6$(Yv|IbyeF3ws6|6UsbY~# zTdbCUM6cJf&ZCNaDE3KFsFpH~u6cQ_+f_@ip7MDuIj&$sllWU7D*@}`!}&X;*CDV7 zBg!SsM>}KdS?~%Uw&$m5o^57_7cRZZWIkekA!KIc)zG#5!IdP#qv3EaYErrzP> zlzoT$C))IjR5h)X(osRi0`rG(_bi|KcEz%v&~fGy!Fh2Pw8R{q=^6;4Q23(FKQi(q zu++Sz5uts;3BOnEdl1Nybv}+R#LO;``h6)iF0$kS%BU0h-Ap93h=zR=ejG<-US3>P z-Zd>i>irUWM4I?}7XyDEeJLS};@6J|^xFcrwCy<4!NMJzdU zxkpynj*MO0iTZ@|&kKZ)wNY*W7XX;O4gk>ozCb*k-R#|MtgJlT`TiXKyhTz{cbumA z=|Y_eR&n9T1H*R|wp7Cyiw9$N=%HQ$Jl&*pal+cBTqYaKtYPAi`*8R%qYNTxVTof5 zRtJ*TzP$Sktyz#geikmaYjAI5|3=S8n}L)VeX@zCCAF?~hCzt4{NmUriShM|4;fMv z19nwYLM1mE?K+b(J;I9@zs_rXITVf=-MB!y*UZ}bvhhYq%}3Ja1#IUg({~2^(W$y%DNNdJsQhH zokCzX$2{Nj-InbsvObHzW2p#iPo$XfZ{Zp!gD(E)+>*zJU6N|qJC^(Xy$tOcQ>I5Qhezr zPVSRK=rNY$wJ(Nu2H!#S1M%CU<=b5{Y;sKu(7UGGvf&GEAg?s8z71*s^k*^y%8 z`cV!clO?k46pCU6sx;qhHj8k%$Jv$e1l+#0z3U%#G#69a_GV?IC;%?Ot6tK@P%u!d zuxje1nnkP1ZvVNq3s2zrM?7;s?y7a_L5EnxE7vaEG2F)>7`2LO>|qGo5n04X{RN~C z(vpQMRvMZ@-=GPP0s5M9)Ss>m+|*fAiZ^}zC8#8Z2?h%=@O@} z=Y@Q*xQfAN&$JM)RMmoyhl5P9$S)_G8&&LNaH`UX;n28rGGzYZ(bFZ# zVYm68pcb`qE7Z;1{!8^+Jp2=)5567;&SGV=@lfWU6d+?7Z9KtJppp~-Ap1=LcMop| zEB9a7)2tyncAB5!f|_Sfm}E-xI-824L20@EJGE_vGn)tn$VsBaQ-ZVa-;t~);~(rw zF-39|zOh5_i~}*?(Ks_cY%JR7eVTlYw>}Krzf?c_MPn3`?Ue4OQWR4-t~qemiJ`-0 z3RgAK%gsHhE76Vm&YB(f%|gdsrM={V0(9H`^Buhm<>ha8+2$VQsi!IvdBzknrdUxC zP0T;4__-K@20;7{)x%*>LrZQELiMwy#j0Gn3tyQ;9!CjB#6rNt#i7va zCOJ!!h`t{969MOvn*I_Z(M_@J1M2&y2dF&PJV!FAXl~DuSHrXL9^0<9>qD zGIqL_@@=l;*e$wi0B?C2WdXL+8>+zUTvvX1@NFqpWR#4s8AGuQP)-P!YQEf{93 zGIFo!qy_vg| zQ4XgC{j<`00fs#ao}s|#6o}oem?>SWU%JLO~(XU&-rWWs$`F5D&5fk}q?RmfNWFgVo`W$DAX_$OVRy$N&6}Ks}Gc4PTu~(2Z(3JGpCX;PUbrCdo zb32(sH8m92+79Hn9@c7$1VV5la1;^?eaTHf#w{)T{vmy#Y;=zQPmZKTHZithxdFoR zgakW2cC&JD=i~j=XD2o|b?}qQ26PVR@DmNQMhICQ(BPlo803>rQW8WeW`9CV7rDlh z(0fZsQqC~j%I=62)<(UhyP$YrYdltuqf*NmXFHf6U-1RPED}B{dmn9~P-zU2mgS{? 
zTKCbC=8b!yVT8#_z1>6$LPLW8zVp2n)7qAK1o;tBpQ!?2H9xw5xgbrASaO`SuIIKQ{_) zWGLmw0RY&s8zuiO2EXn@b7yD!UvdzmJEGPhOL3m!a?|X=NKnycl;WKT8Xo^o99d5{ zXB{We)zb*iBnrrnCDhy?q@#c4P{R)r5!;*Q$+}hyw57EL_YpS7h<=gRDdxf$c&sXb zVT7^cti_#H4rV-` zX4#aievYcVBN74ATM^Hi5_=nF`jyPdA<)Y}&yT&@FeDBT<_8r?%6ASTlE5*A1zHzZbh>O<2`l>zLganAeIP zgHnHBm`!^P^tv_!46Z0G2Xj__IX0+2umJ}D4({)R^xq-=^H9|!NZknLCk;hVt^!1K z-YO7iJc9D@QV<-mQ&M(T5ZCdA;>1gMJb9mUCt$BvE7yj~)9H2X5r>jJ;b+Uf?L9yt ztU8i+aG77*j<#&cD2%Xq^rIa`;;lxxNg>Oe-Hd#=ms)S;SXUUSVuIca5calp!r?C+DwOuBwlxDSc6 z%$eA{DymtX0)0cNyfDI z+o{nPh^SFcbhd5XO#XVmq``gI%$kiCsuNinVsgUd^40gAteTQSRl+ONB8!xba#;!E zR-fsm&-4mtFo8tL$K_P81@7A_JQnM%5f1@Pt{nd8p{A+xwx`5BpEZer;#K$o-P!ED zvkxuVRV<4UUaarGZ=cC&!Gc>729_^KZADiC&7o^UKYY7re=-an0NZyz$y7CuYk>pS zq84RL1F(_A!KK3c-{lzWnfdd1iH-07FV0-GzFHjlZ3_Ur#Hqwi|D{xN72s-e{1-sr z0k-CX1^Cxg`6|lQ{NgVZoNw5q8;kOHwsF<;s_g!4N{7u<{!4_f8eWyLzYTG2{N?%o ziP}};t2F)F*c2;?Scm@yX|Lj3Wy0S$3fS7(zwkmwod6rxU$wYvfS=e;90UCI_8;g< B$p8QV literal 0 HcmV?d00001 diff --git a/tests/resources/file_df_connection/xlsx/without_header/file.xls b/tests/resources/file_df_connection/xlsx/without_header/file.xls new file mode 100644 index 0000000000000000000000000000000000000000..78632de24b88a9e74dffb761070c678595bafc75 GIT binary patch literal 4881 zcmZ`-cUTkOwhbKv3ZWCaAfklco0Lc=bP$QsBZL|t(xeCi(xrnmrGrS7BE5qF>79Ud z0TF2$MFDTr_wMh>yLZ0t%$z^YT64~xwaaXMZ6abi002OaA6x)6^G?bT{Hz*3sPV(j z%~s#T4dp3hjY0|fxH!Y&U=&@#)Hi=>Z7;f->~o56PUbIT9%x8O#@j)?!cP2+I-g+W z-wDSw(GQsQaC=LH_Y?<$FzSGM3bk(NL@BfS6)@6_U1``ins&~?UiA#Ok%}vWBqqEOeTq2;a9fr=z&AnHG+Ew_|IP_N4D5`hV^B>76WYYEl3o z>nZ?1i{G!0vk=+=;ez;c7x}g2U30kSyfj_lUgfBdJa!9STo6gk4gXf# z_Y$;B8_>{3{$?pQ;%k{(hQh5}GtK$XZ`5kw)hy5Vo@35G$O_fQa3(Da`DteK1Zhl5 z;x8m1z0*D^9S7xE@@%v;L&+4R%He4|yfmdUfiMl)GycN)v#bfF{bV7Y?^+2@kgvFl z(Y!-jV&G1-N|yVN9meJT&Pa~4B!H|5ENq1KEU(nKLXpk;?3Fxm8RcCdew7nS_tH=+bDF*}AKmd<(8=wW5=l3#-h3QNxIa9da0!%XA%$N%932m8XAhUw%Gk|w zHzPjHvFO(9+zJ`IcltUod`iq268BYol2L1&inmyCTU4AZKjZd=)=*Q&XD=R$-zZ@h z1(F;zvftS7^8rew_O<9&gghYlIW{lVRrW2ULncTtV5D%b@v}nic!LyofPLpgH2j^K zE4=4Ng~fQ%f=TW;14QHVhs zJhVU5C%4t{N^5UJbk1a`q=k@ zWx9PzpT0Pqi}On!h+2cy>zM-x-LjpPz;m)kl?H@|h9*`b@IE5`_&k4|7}YJ*X4)Og zwXwuCY!RZ)wep%bv&&^enJ#2Ht$o?RJ4AJ-3EPMX{IIIcQK6PzcjWAL`I~2yAYBFu z8V%cI#v-k}BW9zvO@7TD6X|wUv3i;`rk1t${qDL(uW7vMsR%_In`KJ<5F++Phf4yK z7a;J6+cc}NySr1HnOPNnm%YBHd3RbJo}w`7R2y=emY?rJD*|zgYu{ayv>sA6UZWo_zabN(3V9T+YcYO> zhAyg#l7x%VV{LCQ$XiM5zisAW`8JIa5$%>L@JXQtolF4dPm&8FFpMVDkySlM?(7Vl{x(1O6u zfT*a4pfa1brb@lzD_L^24-SGkGf$?m1-Qj!O8?0+^Flj*kfJu3KSv^|Z4CUgxaAW% zoARRS^1gWyO5cZON0hnuq?aD6W`M<}qF>6y0^O9Slr|KxlNTLu7sXijLE$&h{==b@jxp>_)c}{GC7+uLENe2mt^jKL0WONg&>C9!{PP2n5lq7Qz28SamR#jSvzL`My9(9R>8WoW6 z;l4aDH>o{9Dkl1K$(L>I)L3l()-n0ICS?&q$Yi9h^aWHhCwaNK)MyDIJc67)^3urk z^TU;7n73_Lu=V6?^)<9@epoU@AJ%9@e_rb}dLZrigVX5)tzv|IZcq+;N-1I(F@v*V zg3It|ePrC8Sse_&eaiD~Dr#^mg;x9U@z^yw+MW%;|e1mh7 zaCp_6n!Wwb0^NNt$w=(i`i;7BwvP)6P6+XG=1|w+7BNGYg8EONse&1A=3)m7b&WoW zGcnRrnCz4R`>w~^Mlcmzx%Q%}pI3v#!ug&Rn+Uq6O)4g0CA%k>u?r-U#l>qMh#xt%u>xx(tc9Hx0Q=3YcuO-#fXWwk*tVND*N zrwVySS7UecafknNJ(L4#uLFjFMxk7#dF+{-^gh)DBKLbHC)AHx z3x4rJ<}3OAv~?&LvY}3X+Ri2b8c+0CSK#HkziW)6IqRItW$P=TeJ~+F_SPA6zjd~PUO;bx&$3{; z=2?{bksxO>eLAi+`Fp<2p2=6Qu@kbtWvsq;GrRyV<9G@H;Nl+{d!l`v5uU#)pheRq zw|Qaevs;k%R{*3@2sxE@m@HB!WefV%A%ZCNgj+ich?^N>8*om8S&p+tbL~rGx0PPS27gMi0juZ zNFPNvj+UEW!^USL>xQ<5s&1aFX(x5iaQ{z#*4i6v@4$NjL&PR*0Ix1OQn z`cs<2^|a!uDzWj&o+jvfB`^LVJu1Bu%Hj4I1C`9Cqg@3w6}ggz46$%wpmk$OdR@yj zztpz$;;LN4XWT@o2k4`dSB8$;_R~>?tfcZ%3Jc4P?X*-1jP?$B-7Lp45=KwpP;M~H zM~r0|YGSfrZe6kSzB6!9e96kEM)rltfQj~T;JKAo1G|yObz=28BYGL}( 
zczG)C*>hZ57r-_>%j6GeXkWhKR2Ts6h{@TN5V~15Zau?YFtsO(wag{Eib1`~ z63V3rTvvUkw{u!A7AclB69`xlsQIAA)G5tQ!Tr!Wu!>wb&fYrktl0ma!wvRsn@o}f z=7FcVtRp}!->;n~p)#c`D>@V6aY@8-WAapL@wS_(Jo^{Rtg8u#j)-ys_g)@2uCX1u zb6827mO!g?4s?4&dq=?W`;7$Lue;k4Yc;zu&S>u)1%ORh7pcs+9Gfr$|N zi7e!6hGgjyK0^gm!D;%DU^M`RI(0u2VFO6Liky>_GIH&c!ngMUbgga0c}GF}8$y4} zIW4N0wF56>TD+w3iUwGEAe=pg1b^ML6B}K-g(;PSdPj4F$ws*%#1IDz#K#1tc~rAB zBvGo_?UnO|?(r1Nz6$a*3+zaxUFm}Q=u)OrYD1*?)N2Bb`YTV6!wD)ClMr@^$Ky&j zv9{1EbI1)PLFRi6t#%B>o&~oe?#yHtDqw~5l-(Xy%IdhQAT_kbF6|GE2Z???qLfhG z(Be*kgMB78&b*d2$8s;5F=VJ>Ci2({gV_3U@e|vXwwG)D;9JM;4#vQ^1=}kz?l!e& z5+N9>kAZ-{4+?H#DHA3D0J!l7rTXKyztWG5o14=w&y6)0)9O~DK1o4wSQ(Cm6mCbW z3PdpQ2ek8K-rHVunZ?%5S9&K=L%yv}yc>pezg;-g@rOjjVbZ+0-WP$8j7abxX-lls zq>6qKFU}Ob0R_WJ;k~3vE!u9coOAr5J71_7&Ek<3N}$Q?=6L z!|cqJ-3}ghBEJ@;l+3q0)wxF$A{KlR_nQ+jjSFwpY^We+XTjdB7(I9o0SNAICXtxw zTNEPUk-TCv+VJ6k*iYUydz4m|zycm@UsTbH+Cqq{`qA5NS!dWrQk?0<8qb(@Wef~M zg{mSmho0c>O{UK3u5Umb+V}s&?13=)vncT@Lk71N#9dQL+h27BrAu4N=A?d7{CpsKCxFSE&P z2L5z}CStk%P=)Mwk51-1`+ z+F^5BKDO(vqZV_%gci+AZ|4q2@`sxxjh>@c_Ll|C^rOl`?;NwCK0J_9(o%y~Cq!Bl z+NNxkE6JL7_|2~lFhj9m64{d0wN$VzVQDoHr~S?tS_HXvC;aM1hc6-A36yd?<;>4^S&_XDq)-`7LDegA(iT;vz#vl{>?)0`oeFPe+q;1(DPyUALwJszk=|2;Q2824-kc~ zM*e5#{~OfKgU`F^Kj5KDzrp{-Y0vYV_k@3VYVe`;Klp{dHVNLYzgl19fS>qJeDbT~ G1^5>%<7VFg literal 0 HcmV?d00001 diff --git a/tests/resources/requirements.txt b/tests/resources/requirements.txt index 56d154dd9..033953205 100644 --- a/tests/resources/requirements.txt +++ b/tests/resources/requirements.txt @@ -1,2 +1,5 @@ avro[snappy] +openpyxl +pandas pyarrow +pandas-xlwt diff --git a/tests/tests_integration/test_file_format_integration/test_avro_integration.py b/tests/tests_integration/test_file_format_integration/test_avro_integration.py index a73fa06c5..cb687776c 100644 --- a/tests/tests_integration/test_file_format_integration/test_avro_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_avro_integration.py @@ -56,7 +56,7 @@ def test_avro_reader( """Reading Avro files working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) if spark_version < (2, 4): - pytest.skip("Avro only supported on Spark 2.4+") + pytest.skip("Avro files are supported on Spark 3.2+ only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files df = file_df_dataframe @@ -76,10 +76,10 @@ def test_avro_reader( @pytest.mark.parametrize( - "path, options", + "options", [ - ("without_compression", {}), - ("with_compression", {"compression": "snappy"}), + {}, + {"compression": "snappy"}, ], ids=["without_compression", "with_compression"], ) @@ -88,13 +88,12 @@ def test_avro_writer( local_fs_file_df_connection_with_path, file_df_dataframe, avro_schema, - path, options, ): """Written files can be read by Spark""" spark_version = get_spark_version(spark) if spark_version < (2, 4): - pytest.skip("Avro only supported on Spark 2.4+") + pytest.skip("Avro files are supported on Spark 3.2+ only") file_df_connection, source_path = local_fs_file_df_connection_with_path df = file_df_dataframe diff --git a/tests/tests_integration/test_file_format_integration/test_csv_integration.py b/tests/tests_integration/test_file_format_integration/test_csv_integration.py index a6cd14591..289e88273 100644 --- a/tests/tests_integration/test_file_format_integration/test_csv_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_csv_integration.py @@ -27,6 +27,7 @@ def test_csv_reader_with_infer_schema( local_fs_file_df_connection_with_path_and_files, 
file_df_dataframe, ): + """Reading CSV files with inferSchema=True working as expected on any Spark, Python and Java versions""" file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files df = file_df_dataframe csv_root = source_path / "csv/without_header" @@ -42,9 +43,13 @@ def test_csv_reader_with_infer_schema( expected_df = df - if get_spark_version(spark).major < 3: + spark_version = get_spark_version(spark) + if spark_version.major < 3: # Spark 2 infers "date_value" as timestamp instead of date expected_df = df.withColumn("date_value", col("date_value").cast("timestamp")) + elif spark_version < (3, 3): + # Spark 3.2 cannot infer "date_value", and return it as string + expected_df = df.withColumn("date_value", col("date_value").cast("string")) # csv does not have header, so columns are named like "_c0", "_c1", etc expected_df = reset_column_names(expected_df) diff --git a/tests/tests_integration/test_file_format_integration/test_excel_integration.py b/tests/tests_integration/test_file_format_integration/test_excel_integration.py new file mode 100644 index 000000000..de8cc9cf9 --- /dev/null +++ b/tests/tests_integration/test_file_format_integration/test_excel_integration.py @@ -0,0 +1,142 @@ +"""Integration tests for Excel file format. + +Test only that options are passed to Spark in both FileDFReader & FileDFWriter. +Do not test all the possible options and combinations, we are not testing Spark here. +""" + +import pytest + +from onetl._util.spark import get_spark_version +from onetl.file import FileDFReader, FileDFWriter +from onetl.file.format import Excel + +try: + from pyspark.sql.functions import col + + from tests.util.assert_df import assert_equal_df + from tests.util.spark_df import reset_column_names +except ImportError: + # pandas and spark can be missing if someone runs tests for file connections only + pass + +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] + + +@pytest.mark.parametrize("format", ["xlsx", "xls"]) +def test_excel_reader_with_infer_schema( + spark, + local_fs_file_df_connection_with_path_and_files, + file_df_dataframe, + format, +): + """Reading CSV files with inferSchema=True working as expected on any Spark, Python and Java versions""" + spark_version = get_spark_version(spark) + if spark_version < (3, 2): + pytest.skip("Excel files are supported on Spark 3.2+ only") + + file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files + df = file_df_dataframe + excel_root = source_path / format / "without_header" + + reader = FileDFReader( + connection=file_df_connection, + format=Excel(inferSchema=True), + source_path=excel_root, + ) + read_df = reader.run() + + assert read_df.count() + + expected_df = df + # Spark infers "date_value" as timestamp instead of date + expected_df = df.withColumn("date_value", col("date_value").cast("timestamp")) + + # excel does not have header, so columns are named like "_c0", "_c1", etc + expected_df = reset_column_names(expected_df) + + assert read_df.schema != df.schema + assert read_df.schema == expected_df.schema + assert_equal_df(read_df, expected_df) + + +@pytest.mark.parametrize("format", ["xlsx", "xls"]) +@pytest.mark.parametrize( + "path, options", + [ + ("without_header", {}), + ("with_header", {"header": True}), + ("with_data_address", {"dataAddress": "'ABC'!K6"}), + ], + ids=["without_header", "with_header", "with_data_address"], +) +def test_excel_reader_with_options( + spark, + 
local_fs_file_df_connection_with_path_and_files, + file_df_dataframe, + format, + path, + options, +): + """Reading Excel files working as expected on any Spark, Python and Java versions""" + spark_version = get_spark_version(spark) + if spark_version < (3, 2): + pytest.skip("Excel files are supported on Spark 3.2+ only") + + local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files + df = file_df_dataframe + excel_root = source_path / format / path + + reader = FileDFReader( + connection=local_fs, + format=Excel.parse(options), + df_schema=df.schema, + source_path=excel_root, + ) + read_df = reader.run() + + assert read_df.count() + assert read_df.schema == df.schema + assert_equal_df(read_df, df) + + +@pytest.mark.parametrize( + "options", + [ + {}, + {"header": True}, + ], + ids=["without_header", "with_header"], +) +def test_excel_writer( + spark, + local_fs_file_df_connection_with_path, + file_df_dataframe, + options, +): + """Written files can be read by Spark""" + spark_version = get_spark_version(spark) + if spark_version < (3, 2): + pytest.skip("Excel files are supported on Spark 3.2+ only") + + file_df_connection, source_path = local_fs_file_df_connection_with_path + df = file_df_dataframe + excel_root = source_path / "excel" + + writer = FileDFWriter( + connection=file_df_connection, + format=Excel.parse(options), + target_path=excel_root, + ) + writer.run(df) + + reader = FileDFReader( + connection=file_df_connection, + format=Excel.parse(options), + source_path=excel_root, + df_schema=df.schema, + ) + read_df = reader.run() + + assert read_df.count() + assert read_df.schema == df.schema + assert_equal_df(read_df, df) diff --git a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py new file mode 100644 index 000000000..e94386120 --- /dev/null +++ b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py @@ -0,0 +1,106 @@ +import logging + +import pytest + +from onetl.file.format import Excel + + +@pytest.mark.parametrize( + "spark_version", + [ + "2.2.1", + "2.3.1", + "2.4.8", + ], +) +def test_excel_get_packages_spark_version_not_supported(spark_version): + with pytest.raises(ValueError, match=f"Spark version should be at least 3.2, got {spark_version}"): + Excel.get_packages(spark_version=spark_version) + + +def test_excel_get_packages_scala_version_not_supported(): + with pytest.raises(ValueError, match="Scala version should be at least 2.12, got 2.11"): + Excel.get_packages(spark_version="3.2.4", scala_version="2.11") + + +def test_excel_get_packages_package_version_not_supported(): + with pytest.raises(ValueError, match="Package version should be at least 0.15, got 0.13.7"): + Excel.get_packages(spark_version="3.2.4", package_version="0.13.7") + + +@pytest.mark.parametrize( + "spark_version, scala_version, package_version, packages", + [ + # Detect Scala version by Spark version + ("3.2.4", None, None, ["com.crealytics:spark-excel_2.12:3.2.4_0.19.0"]), + ("3.4.1", None, None, ["com.crealytics:spark-excel_2.12:3.4.1_0.19.0"]), + # Override Scala version + ("3.2.4", "2.12", None, ["com.crealytics:spark-excel_2.12:3.2.4_0.19.0"]), + ("3.2.4", "2.13", None, ["com.crealytics:spark-excel_2.13:3.2.4_0.19.0"]), + ("3.4.1", "2.12", None, ["com.crealytics:spark-excel_2.12:3.4.1_0.19.0"]), + ("3.4.1", "2.13", None, ["com.crealytics:spark-excel_2.13:3.4.1_0.19.0"]), + # Override package version + ("3.2.0", None, "0.16.0", ["com.crealytics:spark-excel_2.12:3.2.0_0.16.0"]), + ("3.4.1", 
None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.4.1_0.18.0"]), + ], +) +def test_excel_get_packages(caplog, spark_version, scala_version, package_version, packages): + with caplog.at_level(level=logging.WARNING): + result = Excel.get_packages( + spark_version=spark_version, + scala_version=scala_version, + package_version=package_version, + ) + + if package_version: + assert f"Passed custom package version '{package_version}', it is not guaranteed to be supported" + + assert result == packages + + +def test_excel_options_default(): + excel = Excel() + assert not excel.header + + +def test_excel_options_default_override(): + excel = Excel(header=True) + assert excel.header + + +@pytest.mark.parametrize( + "known_option", + [ + "dataAddress", + "treatEmptyValuesAsNulls", + "setErrorCellsToFallbackValues", + "usePlainNumberFormat", + "inferSchema", + "addColorColumns", + "timestampFormat", + "maxRowsInMemory", + "maxByteArraySize", + "tempFileThreshold", + "excerptSize", + "workbookPassword", + "dateFormat", + ], +) +def test_excel_options_known(known_option): + excel = Excel.parse({known_option: "value"}) + assert getattr(excel, known_option) == "value" + + +def test_excel_options_unknown(caplog): + with caplog.at_level(logging.WARNING): + excel = Excel(unknown="abc") + assert excel.unknown == "abc" + + assert ("Options ['unknown'] are not known by Excel, are you sure they are valid?") in caplog.text + + +@pytest.mark.local_fs +def test_excel_missing_package(spark_no_packages): + msg = "Cannot import Java class 'com.crealytics.spark.excel.v2.ExcelDataSource'" + with pytest.raises(ValueError, match=msg): + Excel().check_if_supported(spark_no_packages) diff --git a/tests/util/spark_df.py b/tests/util/spark_df.py index 8e4c667b8..f4e239026 100644 --- a/tests/util/spark_df.py +++ b/tests/util/spark_df.py @@ -10,7 +10,7 @@ def reset_column_names(df: SparkDataFrame, columns: list[str] | None = None) -> """ Reset columns to ``_c0`` format. - If `columns` is None, reset all columns names. + If `columns` is None, apply to all columns in df. 
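+
+    For example (illustrative): a dataframe with columns ``["id", "name"]``
+    gets columns ``["_c0", "_c1"]``, keeping the original order.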
""" columns = columns or df.columns for i, column in enumerate(columns): From 252ce9a6fec72ea56ba16676db9f59682242b111 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Wed, 20 Sep 2023 13:46:35 +0300 Subject: [PATCH 12/26] [DOP-8837] - add Samba file_connection class (#150) * [DOP-8837] - add draft Samba connection class * [DOP-8837] - implement all abstract methods for Samba connection * [DOP-8837] - updated comments * [DOP-8837] - add unit tests * [DOP-8837] - update pytest.ini * [DOP-8837] - update docker-compose.yml * [DOP-8837] - add .sh script to allow dirs/files creation * [DOP-8837] - remove extra .env * [DOP-8837] - modified Samba connection methods * [DOP-8837] - update Samba connection method * [DOP-8837] - update Samba connection method * Update onetl/connection/file_connection/samba.py Co-authored-by: Maxim Martynov * [DOP-8837] - update Samba connection method * [DOP-8837] - update _create_dir method in Samba connection * [DOP-8837] - hardcode env vars in docker compose configuration * Update .env.docker Co-authored-by: Maxim Martynov --------- Co-authored-by: Maxim Martynov --- .env.docker | 10 + .env.local | 10 + conftest.py | 1 + docker-compose.yml | 12 + docker/Dockerfile | 1 + docker/samba/on_post_init.sh | 4 + onetl/connection/__init__.py | 2 + onetl/connection/file_connection/samba.py | 292 ++++++++++++++++++ pytest.ini | 1 + requirements/samba.txt | 1 + requirements/tests/samba.txt | 1 + setup.py | 2 + .../fixtures/connections/file_connections.py | 3 + tests/fixtures/connections/samba.py | 63 ++++ .../test_file_downloader_integration.py | 11 +- .../test_file_uploader_integration.py | 6 +- .../test_samba_file_connection_integration.py | 58 ++++ .../test_samba_unit.py | 47 +++ 18 files changed, 521 insertions(+), 4 deletions(-) create mode 100755 docker/samba/on_post_init.sh create mode 100644 onetl/connection/file_connection/samba.py create mode 100644 requirements/samba.txt create mode 100644 requirements/tests/samba.txt create mode 100644 tests/fixtures/connections/samba.py create mode 100644 tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py create mode 100644 tests/tests_unit/tests_file_connection_unit/test_samba_unit.py diff --git a/.env.docker b/.env.docker index b9c2105aa..cb0394806 100644 --- a/.env.docker +++ b/.env.docker @@ -87,6 +87,16 @@ ONETL_SFTP_PORT=2222 ONETL_SFTP_USER=onetl ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho +# Samba +ONETL_SAMBA_HOST=samba +ONETL_SAMBA_PROTOCOL=SMB +ONETL_SAMBA_UID=1000 +ONETL_SAMBA_GID=1000 +ONETL_SAMBA_PORT=445 +ONETL_SAMBA_SHARE=SmbShare +ONETL_SAMBA_USER=onetl +ONETL_SAMBA_PASSWORD=awd123fd1 + # Webdav ONETL_WEBDAV_HOST=webdav ONETL_WEBDAV_PORT=80 diff --git a/.env.local b/.env.local index af2551dbd..2e05030f3 100644 --- a/.env.local +++ b/.env.local @@ -87,6 +87,16 @@ export ONETL_SFTP_PORT=2222 export ONETL_SFTP_USER=onetl export ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho +# Samba +export ONETL_SAMBA_HOST=localhost +export ONETL_SAMBA_PROTOCOL=SMB +export ONETL_SAMBA_UID=1000 +export ONETL_SAMBA_GID=1000 +export ONETL_SAMBA_PORT=445 +export ONETL_SAMBA_SHARE=SmbShare +export ONETL_SAMBA_USER=onetl +export ONETL_SAMBA_PASSWORD=awd123fd1 + # Webdav export ONETL_WEBDAV_HOST=localhost export ONETL_WEBDAV_PORT=8000 diff --git a/conftest.py b/conftest.py index ab0b60a5c..52b6c5754 100644 --- a/conftest.py +++ b/conftest.py @@ -19,5 +19,6 @@ "tests.fixtures.connections.local_fs", 
"tests.fixtures.connections.s3", "tests.fixtures.connections.sftp", + "tests.fixtures.connections.samba", "tests.fixtures.connections.webdav", ] diff --git a/docker-compose.yml b/docker-compose.yml index a08d8fc38..3d93c02af 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -164,6 +164,18 @@ services: networks: - onetl + samba: + image: elswork/samba + restart: unless-stopped + ports: + - "139:139" + - "445:445" + volumes: + - ./docker/samba:/share/folder + command: '-u "1000:1000:onetl:onetl:awd123fd1" -s "SmbShare:/share/folder:rw:onetl"' + networks: + - onetl + s3: image: ${S3_IMAGE:-bitnami/minio:latest} restart: unless-stopped diff --git a/docker/Dockerfile b/docker/Dockerfile index 103cc2b26..817d4eab2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -57,6 +57,7 @@ RUN pip install \ -r /app/requirements/hdfs.txt \ -r /app/requirements/s3.txt \ -r /app/requirements/sftp.txt \ + -r /app/requirements/samba.txt \ -r /app/requirements/webdav.txt \ -r /app/requirements/kerberos.txt \ -r /app/requirements/docs.txt \ diff --git a/docker/samba/on_post_init.sh b/docker/samba/on_post_init.sh new file mode 100755 index 000000000..f71af2a03 --- /dev/null +++ b/docker/samba/on_post_init.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +# allow create files and directories +chmod 777 /share/folder diff --git a/onetl/connection/__init__.py b/onetl/connection/__init__.py index 1c50f7fee..3e40e2a2a 100644 --- a/onetl/connection/__init__.py +++ b/onetl/connection/__init__.py @@ -37,6 +37,7 @@ from onetl.connection.file_connection.ftps import FTPS from onetl.connection.file_connection.hdfs import HDFS from onetl.connection.file_connection.s3 import S3 + from onetl.connection.file_connection.samba import Samba from onetl.connection.file_connection.sftp import SFTP from onetl.connection.file_connection.webdav import WebDAV from onetl.connection.file_df_connection.spark_hdfs import SparkHDFS @@ -62,6 +63,7 @@ "HDFS": "hdfs", "S3": "s3", "SFTP": "sftp", + "Samba": "samba", "WebDAV": "webdav", } diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py new file mode 100644 index 000000000..7a7f21132 --- /dev/null +++ b/onetl/connection/file_connection/samba.py @@ -0,0 +1,292 @@ +# Copyright 2023 MTS (Mobile Telesystems) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import os +import stat +import textwrap +from io import BytesIO +from logging import getLogger +from pathlib import Path +from typing import Literal, Optional, Union + +from etl_entities.instance import Host +from pydantic import SecretStr, validator + +from onetl.connection.file_connection.file_connection import FileConnection +from onetl.hooks import slot, support_hooks +from onetl.impl import LocalPath, RemotePath, RemotePathStat + +try: + from smb.smb_structs import OperationFailure + from smb.SMBConnection import SMBConnection +except (ImportError, NameError) as e: + raise ImportError( + textwrap.dedent( + """ + Cannot import module "pysmb". 
+ + You should install package as follows: + pip install onetl[samba] + + or + pip install onetl[files] + """, + ).strip(), + ) from e + + +log = getLogger(__name__) + + +@support_hooks +class Samba(FileConnection): + """Samba file connection. + + Based on `pysmb library `_. + + Parameters + ---------- + host : str + Host of Samba source. For example: ``mydomain.com``. + + share : str + The name of the share on the Samba server. + + protocol : str, default: ``SMB`` + The protocol to use for the connection. Either ``SMB`` or ``NetBIOS``. + Affects the default port and the `is_direct_tcp` flag in `SMBConnection`. + + port : int, default: 445 + Port of Samba source. + + domain : str, default: `` + Domain name for the Samba connection. Empty strings means use ``host`` as domain name. + + auth_type : str, default: ``NTLMv2`` + The authentication type to use. Either ``NTLMv2`` or ``NTLMv1``. + Affects the `use_ntlm_v2` flag in `SMBConnection`. + + user : str, default: None + User, which have access to the file source. Can be `None` for anonymous connection. + + password : str, default: None + Password for file source connection. Can be `None` for anonymous connection. + + """ + + host: Host + share: str + protocol: Union[Literal["SMB"], Literal["NetBIOS"]] = "SMB" + port: Optional[int] = None + domain: Optional[str] = "" + auth_type: Union[Literal["NTLMv1"], Literal["NTLMv2"]] = "NTLMv2" + user: Optional[str] = None + password: Optional[SecretStr] = None + + @property + def instance_url(self) -> str: + return f"smb://{self.host}:{self.port}" + + @slot + def check(self): + log.info("|%s| Checking connection availability...", self.__class__.__name__) + self._log_parameters() + try: + available_shares = {share.name for share in self.client.listShares()} + if self.share in available_shares: + log.info("|%s| Connection is available.", self.__class__.__name__) + else: + log.error( + "|%s| Share %r not found among existing shares %r", + self.__class__.__name__, + self.share, + available_shares, + ) + raise ConnectionError("Failed to connect to the Samba server.") + except Exception as exc: + log.exception("|%s| Connection is unavailable", self.__class__.__name__) + raise RuntimeError("Connection is unavailable") from exc + + return self + + @slot + def path_exists(self, path: os.PathLike | str) -> bool: + try: + self.client.getAttributes(self.share, os.fspath(path)) + return True + except OperationFailure: + return False + + def _scan_entries(self, path: RemotePath) -> list: + if self._is_dir(path): + return [ + entry + for entry in self.client.listPath( + self.share, + os.fspath(path), + ) + if entry.filename not in {".", ".."} # Filter out '.' and '..' 
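+                # listPath also returns the "." and ".." pseudo-entries, hence the filter above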
+ ] + return [self.client.getAttributes(self.share, os.fspath(path))] + + def _extract_name_from_entry(self, entry) -> str: + return entry.filename + + def _is_dir_entry(self, top: RemotePath, entry) -> bool: + return entry.isDirectory + + def _is_file_entry(self, top: RemotePath, entry) -> bool: + return not entry.isDirectory + + def _extract_stat_from_entry(self, top: RemotePath, entry) -> RemotePathStat: + if entry.isDirectory: + return RemotePathStat(st_mode=stat.S_IFDIR) + + return RemotePathStat( + st_size=entry.file_size, + st_mtime=entry.last_write_time, + st_uid=entry.filename, + ) + + def _get_client(self) -> SMBConnection: + is_direct_tcp = self.protocol == "SMB" + use_ntlm_v2 = self.auth_type == "NTLMv2" + conn = SMBConnection( + username=self.user, + password=self.password.get_secret_value() if self.password else None, + my_name="optional_client_name", + remote_name=self.host, + domain=self.domain, + use_ntlm_v2=use_ntlm_v2, + sign_options=2, + is_direct_tcp=is_direct_tcp, + ) + conn.connect(self.host, port=self.port) + return conn + + def _is_client_closed(self, client: SMBConnection) -> bool: + try: + socket_fileno = client.sock.fileno() + except (AttributeError, OSError): + return True + + return socket_fileno == -1 + + def _close_client(self, client: SMBConnection) -> None: + self.client.close() + + def _download_file(self, remote_file_path: RemotePath, local_file_path: LocalPath) -> None: + with open(local_file_path, "wb") as local_file: + self.client.retrieveFile( + self.share, + os.fspath(remote_file_path), + local_file, + ) + + def _get_stat(self, path: RemotePath) -> RemotePathStat: + info = self.client.getAttributes(self.share, os.fspath(path)) + + if self.is_dir(os.fspath(path)): + return RemotePathStat(st_mode=stat.S_IFDIR) + + return RemotePathStat( + st_size=info.file_size, + st_mtime=info.last_write_time, + st_uid=info.filename, + ) + + def _remove_file(self, remote_file_path: RemotePath) -> None: + self.client.deleteFiles( + self.share, + os.fspath(remote_file_path), + ) + + def _create_dir(self, path: RemotePath) -> None: + path_obj = Path(path) + for parent in reversed(path_obj.parents): + # create dirs sequentially as .createDirectory(...) 
cannot create nested dirs + try: + self.client.getAttributes(self.share, os.fspath(parent)) + except OperationFailure: + self.client.createDirectory(self.share, os.fspath(parent)) + + self.client.createDirectory(self.share, os.fspath(path)) + + def _upload_file(self, local_file_path: LocalPath, remote_file_path: RemotePath) -> None: + with open(local_file_path, "rb") as file_obj: + self.client.storeFile( + self.share, + os.fspath(remote_file_path), + file_obj, + ) + + def _rename_file(self, source: RemotePath, target: RemotePath) -> None: + self.client.rename( + self.share, + os.fspath(source), + os.fspath(target), + ) + + def _remove_dir(self, path: RemotePath) -> None: + files = self.client.listPath(self.share, os.fspath(path)) + + for item in files: + if item.filename not in {".", ".."}: # skip current and parent directory entries + full_path = path / item.filename + if item.isDirectory: + # recursively delete subdirectory + self._remove_dir(full_path) + else: + self.client.deleteFiles(self.share, os.fspath(full_path)) + + self.client.deleteDirectory(self.share, os.fspath(path)) + + def _read_text(self, path: RemotePath, encoding: str) -> str: + return self._read_bytes(path).decode(encoding) + + def _read_bytes(self, path: RemotePath) -> bytes: + file_obj = BytesIO() + self.client.retrieveFile( + self.share, + os.fspath(path), + file_obj, + ) + file_obj.seek(0) + return file_obj.read() + + def _write_text(self, path: RemotePath, content: str, encoding: str) -> None: + self._write_bytes(path, bytes(content, encoding)) + + def _write_bytes(self, path: RemotePath, content: bytes) -> None: + file_obj = BytesIO(content) + + self.client.storeFile( + self.share, + os.fspath(path), + file_obj, + ) + + def _is_dir(self, path: RemotePath) -> bool: + return self.client.getAttributes(self.share, os.fspath(path)).isDirectory + + def _is_file(self, path: RemotePath) -> bool: + return not self.client.getAttributes(self.share, os.fspath(path)).isDirectory + + @validator("port", pre=True, always=True) + def _set_port_based_on_protocol(cls, port, values): + if port is None: + return 445 if values.get("protocol") == "SMB" else 139 + return port diff --git a/pytest.ini b/pytest.ini index 5e40e75d7..3c71e8eb6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -24,5 +24,6 @@ markers = postgres: Postgres tests s3: S3 tests sftp: SFTP tests + samba: Samba tests teradata: Teradata tests webdav: WebDAV tests diff --git a/requirements/samba.txt b/requirements/samba.txt new file mode 100644 index 000000000..619ee4f51 --- /dev/null +++ b/requirements/samba.txt @@ -0,0 +1 @@ +pysmb diff --git a/requirements/tests/samba.txt b/requirements/tests/samba.txt new file mode 100644 index 000000000..619ee4f51 --- /dev/null +++ b/requirements/tests/samba.txt @@ -0,0 +1 @@ +pysmb diff --git a/setup.py b/setup.py index 422085535..f8b560707 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ def parse_requirements(file: Path) -> list[str]: requirements_ftp = parse_requirements(here / "requirements" / "ftp.txt") requirements_sftp = parse_requirements(here / "requirements" / "sftp.txt") +requirements_samba = parse_requirements(here / "requirements" / "samba.txt") requirements_hdfs = parse_requirements(here / "requirements" / "hdfs.txt") requirements_s3 = parse_requirements(here / "requirements" / "s3.txt") requirements_webdav = parse_requirements(here / "requirements" / "webdav.txt") @@ -90,6 +91,7 @@ def parse_requirements(file: Path) -> list[str]: "ftp": requirements_ftp, "ftps": requirements_ftp, "sftp": requirements_sftp, + 
"samba": requirements_samba, "hdfs": requirements_hdfs, "s3": requirements_s3, "webdav": requirements_webdav, diff --git a/tests/fixtures/connections/file_connections.py b/tests/fixtures/connections/file_connections.py index e8ef7253e..f44240894 100644 --- a/tests/fixtures/connections/file_connections.py +++ b/tests/fixtures/connections/file_connections.py @@ -12,6 +12,7 @@ lazy_fixture("hdfs_file_connection"), lazy_fixture("s3_file_connection"), lazy_fixture("sftp_file_connection"), + lazy_fixture("samba_file_connection"), lazy_fixture("webdav_file_connection"), ], ) @@ -26,6 +27,7 @@ def file_connection(request): lazy_fixture("hdfs_file_connection_with_path"), lazy_fixture("s3_file_connection_with_path"), lazy_fixture("sftp_file_connection_with_path"), + lazy_fixture("samba_file_connection_with_path"), lazy_fixture("webdav_file_connection_with_path"), ], ) @@ -40,6 +42,7 @@ def file_connection_with_path(request): lazy_fixture("hdfs_file_connection_with_path_and_files"), lazy_fixture("s3_file_connection_with_path_and_files"), lazy_fixture("sftp_file_connection_with_path_and_files"), + lazy_fixture("samba_file_connection_with_path_and_files"), lazy_fixture("webdav_file_connection_with_path_and_files"), ], ) diff --git a/tests/fixtures/connections/samba.py b/tests/fixtures/connections/samba.py new file mode 100644 index 000000000..52a294d5b --- /dev/null +++ b/tests/fixtures/connections/samba.py @@ -0,0 +1,63 @@ +import os +from collections import namedtuple +from pathlib import PurePosixPath + +import pytest + +from tests.util.upload_files import upload_files + + +@pytest.fixture( + scope="session", + params=[ + pytest.param("real-samba", marks=[pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection]), + ], +) +def samba_server(): + SambaServer = namedtuple("SambaServer", ["host", "protocol", "port", "share", "user", "password"]) + + return SambaServer( + host=os.getenv("ONETL_SAMBA_HOST"), + protocol=os.getenv("ONETL_SAMBA_PROTOCOL"), + port=os.getenv("ONETL_SAMBA_PORT"), + share=os.getenv("ONETL_SAMBA_SHARE"), + user=os.getenv("ONETL_SAMBA_USER"), + password=os.getenv("ONETL_SAMBA_PASSWORD"), + ) + + +@pytest.fixture() +def samba_file_connection(samba_server): + from onetl.connection import Samba + + return Samba( + host=samba_server.host, + protocol=samba_server.protocol, + port=samba_server.port, + share=samba_server.share, + user=samba_server.user, + password=samba_server.password, + ) + + +@pytest.fixture() +def samba_file_connection_with_path(request, samba_file_connection): + connection = samba_file_connection + root = PurePosixPath("/data") + + def finalizer(): + connection.remove_dir(root, recursive=True) + + request.addfinalizer(finalizer) + + connection.remove_dir(root, recursive=True) + + return connection, root + + +@pytest.fixture() +def samba_file_connection_with_path_and_files(resource_path, samba_file_connection_with_path): + connection, upload_to = samba_file_connection_with_path + upload_from = resource_path / "file_connection" + files = upload_files(upload_from, upload_to, connection) + return connection, upload_to, files diff --git a/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py b/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py index ed290ab43..0a932dd46 100644 --- a/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py +++ b/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py @@ -635,10 +635,11 @@ def 
test_file_downloader_mode_replace_entire_directory( caplog, ): file_connection, remote_path, _ = file_connection_with_path_and_files + # Reason for using .resolve(): https://stackoverflow.com/a/58719476 if local_dir_exist: - local_path = tmp_path_factory.mktemp("local_path") + local_path = tmp_path_factory.mktemp("local_path").resolve() else: - local_path = Path(tempfile.gettempdir()) / secrets.token_hex() + local_path = Path(tempfile.gettempdir()).resolve() / secrets.token_hex() temp_file = local_path / secrets.token_hex(5) if local_dir_exist: @@ -755,7 +756,11 @@ def finalizer(): local_path=file.name, ) - with pytest.raises(NotADirectoryError, match=rf"'{file.name}' \(kind='file', .*\) is not a directory"): + # Reason for .realpath(): https://stackoverflow.com/a/58719476 + with pytest.raises( + NotADirectoryError, + match=rf"'{os.path.realpath(file.name)}' \(kind='file', .*\) is not a directory", + ): downloader.run() diff --git a/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py b/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py index 522cf2dd4..feedeaa45 100644 --- a/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py +++ b/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py @@ -490,7 +490,11 @@ def test_file_uploader_run_local_path_not_a_directory(file_connection): with tempfile.NamedTemporaryFile() as file: uploader = FileUploader(connection=file_connection, target_path=target_path, local_path=file.name) - with pytest.raises(NotADirectoryError, match=rf"'{file.name}' \(kind='file', .*\) is not a directory"): + # Reason for .realpath(): https://stackoverflow.com/a/58719476 + with pytest.raises( + NotADirectoryError, + match=rf"'{os.path.realpath(file.name)}' \(kind='file', .*\) is not a directory", + ): uploader.run() diff --git a/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py b/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py new file mode 100644 index 000000000..7c5c8f5d5 --- /dev/null +++ b/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py @@ -0,0 +1,58 @@ +import logging + +import pytest + +pytestmark = [pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection] + + +def test_samba_file_connection_check_success(samba_file_connection, caplog): + samba = samba_file_connection + with caplog.at_level(logging.INFO): + assert samba.check() == samba + + assert "|Samba|" in caplog.text + assert f"host = '{samba.host}'" in caplog.text + assert f"port = {samba.port}" in caplog.text + assert f"protocol = '{samba.protocol}'" in caplog.text + assert f"user = '{samba.user}'" in caplog.text + assert f"share = '{samba.share}'" in caplog.text + assert "password = SecretStr('**********')" in caplog.text + assert samba.password.get_secret_value() not in caplog.text + + assert "Connection is available." 
in caplog.text + + +def test_samba_file_connection_check_not_existing_share_failed(samba_server, caplog): + from onetl.connection import Samba + + not_existing_share = "NotExistingShare" + samba = Samba( + host=samba_server.host, + share=not_existing_share, + protocol=samba_server.protocol, + port=samba_server.port, + user=samba_server.user, + password=samba_server.password, + ) + + with caplog.at_level(logging.INFO): + with pytest.raises(RuntimeError, match="Connection is unavailable"): + samba.check() + + assert f"Share '{not_existing_share}' not found among existing shares" in caplog.text + + +def test_samba_file_connection_check_runtime_failed(samba_server): + from onetl.connection import Samba + + samba = Samba( + host=samba_server.host, + share=samba_server.share, + protocol=samba_server.protocol, + port=samba_server.port, + user="unknown", + password="unknown", + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + samba.check() diff --git a/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py b/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py new file mode 100644 index 000000000..42f95b368 --- /dev/null +++ b/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py @@ -0,0 +1,47 @@ +import pytest + +from onetl.connection import FileConnection + +pytestmark = [pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection] + + +def test_samba_connection(): + from onetl.connection import Samba + + samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd") + assert isinstance(samba, FileConnection) + assert samba.host == "some_host" + assert samba.protocol == "SMB" + assert samba.domain == "" + assert samba.auth_type == "NTLMv2" + assert samba.port == 445 + assert samba.user == "some_user" + assert samba.password != "pwd" + assert samba.password.get_secret_value() == "pwd" + + assert "password='pwd'" not in str(samba) + assert "password='pwd'" not in repr(samba) + + +def test_samba_connection_with_net_bios(): + from onetl.connection import Samba + + samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol="NetBIOS") + assert samba.protocol == "NetBIOS" + assert samba.port == 139 + + +@pytest.mark.parametrize("protocol", ["SMB", "NetBIOS"]) +def test_samba_connection_with_custom_port(protocol): + from onetl.connection import Samba + + samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol=protocol, port=444) + assert samba.protocol == protocol + assert samba.port == 444 + + +def test_samba_connection_without_mandatory_args(): + from onetl.connection import Samba + + with pytest.raises(ValueError): + Samba() From 6313b126f6d581b0ad76fbf2870d28169400a0d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 19 Sep 2023 07:49:40 +0000 Subject: [PATCH 13/26] [DOP-9007] Add ivysettings.xml examples to documentation --- README.rst | 2 +- .../next_release/151.improvement.rst | 1 + .../db_connection/greenplum/prerequisites.rst | 141 +-------- docs/index.rst | 2 +- docs/install.rst | 3 - docs/install/index.rst | 11 + docs/install/java_packages.rst | 281 ++++++++++++++++++ docs/install/python_packages.rst | 8 + 8 files changed, 310 insertions(+), 139 deletions(-) create mode 100644 docs/changelog/next_release/151.improvement.rst delete mode 100644 docs/install.rst 
create mode 100644 docs/install/index.rst create mode 100644 docs/install/java_packages.rst create mode 100644 docs/install/python_packages.rst diff --git a/README.rst b/README.rst index e20086214..ea9518d91 100644 --- a/README.rst +++ b/README.rst @@ -114,7 +114,7 @@ See https://onetl.readthedocs.io/ How to install --------------- -.. _minimal-install: +.. minimal-install Minimal installation ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/changelog/next_release/151.improvement.rst b/docs/changelog/next_release/151.improvement.rst new file mode 100644 index 000000000..d8da800ae --- /dev/null +++ b/docs/changelog/next_release/151.improvement.rst @@ -0,0 +1 @@ +Add documentation about different ways of passing packages to Spark session. diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 815a12b27..a545fdc27 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -7,7 +7,7 @@ Version Compatibility --------------------- * Greenplum server versions: 5.x, 6.x -* Spark versions: 2.3.x - 3.2.x (Spark 3.3.x is not supported yet) +* Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet) * Java versions: 8 - 11 See `official documentation `_. @@ -33,140 +33,13 @@ To use Greenplum connector you should download connector ``.jar`` file from `Pivotal website `_ and then pass it to Spark session. -There are several ways to do that. +.. warning:: -.. note:: - - Please pay attention to Spark <-> Scala version compatibility. See :ref:`spark-compatibility-matrix`. - -Using ``spark.jars`` -~~~~~~~~~~~~~~~~~~~~ - -The most simple solution, but this requires to store/deploy ``.jar`` file in the local environment. - -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Create Spark session with passing ``.jar`` absolute file path to ``spark.jars`` Spark config option, e.g. - -.. code:: python - - # no need to use spark.jars.packages - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars", "/path/to/downloaded.jar") - .getOrCreate() - ) - -Using ``spark.jars.repositories`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Can be used if you have access both to public repos (like Maven) and a private Artifactory/Nexus repo. - -* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Upload ``.jar`` file to private repository (with ``groupId=io.pivotal``, ``artifactoryId=greenplum-spark_2.12``). -* Pass repo URL to ``spark.jars.repositories`` Spark config option -* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option. - - -Example -^^^^^^^ - -.. code:: python - - maven_packages = Greenplum.get_packages(spark_version="3.2") - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars.repositories", "http://nexus.domain.com/example-repo/") - .config("spark.jars.packages", ",".join(maven_packages)) - .getOrCreate() - ) - - -Using ``spark.jars.ivySettings`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Same as above, but can be used even if there is no network access to public repos like Maven. - -* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. 
-* Upload ``.jar`` file to private repository (with ``groupId=io.pivotal``, ``artifactoryId=greenplum-spark_2.12``). -* Create `ivysettings.xml `_ file. -* Add here a resolver with repository URL (and credentials, if required). -* Pass ``ivysettings.xml`` absolute path to ``spark.jars.ivySettings`` Spark config option. -* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option. + Please pay attention to :ref:`Spark <-> Scala version compatibility `. -Example -^^^^^^^ +There are several ways to do that. See :ref:`java-packages` for details. -.. code-block:: xml - :caption: ivysettings.xml - - - - - - - - - - - - - - - - - - - - -.. code-block:: python - :caption: script.py - - maven_packages = Greenplum.get_packages(spark_version="3.2") - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars.ivySettings", "/path/to/ivysettings.xml") - .config("spark.jars.packages", ",".join(maven_packages)) - .getOrCreate() - ) - -Moving ``.jar`` file to ``~/.ivy2/jars/`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven. - -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Move it to ``~/.ivy2/jars/`` folder -* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option. - -Example -^^^^^^^ - -.. code:: python - - maven_packages = Greenplum.get_packages(spark_version="3.2") - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars.packages", ",".join(maven_packages)) - .getOrCreate() - ) - -Inserting ``.jar`` file to Spark jars folder -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Can be used to embed ``.jar`` files to a default Spark classpath. - -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.4/jars/``. -* Create Spark session **WITHOUT** passing Greenplum package name to ``spark.jars.packages`` - - -Manually adding ``.jar`` files to ``CLASSPATH`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Can be used to embed ``.jar`` files to a default Java classpath. +.. note:: -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Set environment variable ``CLASSPATH`` to ``/path/to/downloader.jar`` -* Create Spark session **WITHOUT** passing Greenplum package name to ``spark.jars.packages`` + If you're uploading package to private package repo, use ``groupId=io.pivotal`` and ``artifactoryId=greenplum-spark_2.12`` + (``2.12`` is Scala version) to give uploaded package a proper name. diff --git a/docs/index.rst b/docs/index.rst index cc8fdb87d..54ced3d06 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,7 +14,7 @@ :hidden: self - install + install/index quickstart concepts diff --git a/docs/install.rst b/docs/install.rst deleted file mode 100644 index abf328c75..000000000 --- a/docs/install.rst +++ /dev/null @@ -1,3 +0,0 @@ -.. include:: ../README.rst - :start-after: install - :end-before: quick-start diff --git a/docs/install/index.rst b/docs/install/index.rst new file mode 100644 index 000000000..86365e381 --- /dev/null +++ b/docs/install/index.rst @@ -0,0 +1,11 @@ +.. _install: + +How to install +============== + +.. 
toctree:: + :maxdepth: 1 + :caption: How to install + + python_packages + java_packages diff --git a/docs/install/java_packages.rst b/docs/install/java_packages.rst new file mode 100644 index 000000000..a64c9e7c5 --- /dev/null +++ b/docs/install/java_packages.rst @@ -0,0 +1,281 @@ +.. _java-packages: + +Java packages +============== + +``DB`` and ``FileDF`` connection classes require specific packages to be inserted to ``CLASSPATH`` of Spark session, +like JDBC drivers. + +This is usually done by setting up ``spark.jars.packages`` option while creating Spark session: + +.. code:: python + + # here is a list of packages to be downloaded: + maven_packages = Greenplum.get_packages(spark_version="3.2") + + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + + +Spark automatically resolves package and all its dependencies, download them and inject to Spark session +(both driver and all executors). + +This requires internet access, because package metadata and ``.jar`` files are fetched from `Maven Repository `_. + +But sometimes it is required to: + +* Install package without direct internet access (isolated network) +* Install package which is not available in Maven + +There are several ways to do that. + +Using ``spark.jars`` +-------------------- + +The most simple solution, but this requires to store raw ``.jar`` files somewhere on filesystem or web server. + +* Download ``package.jar`` files (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. +* (For ``spark.submit.deployMode=cluster``) place downloaded files to HDFS or deploy to any HTTP web server serving static files. See `official documentation `_ for more details. +* Create Spark session with passing ``.jar`` absolute file path to ``spark.jars`` Spark config option: + +.. tabs:: + + .. code-tab:: py for spark.submit.deployMode=client (default) + + jar_files = ["/path/to/package.jar"] + + # do not pass spark.jars.packages + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars", ",".join(jar_files)) + .getOrCreate() + ) + + .. code-tab:: py for spark.submit.deployMode=cluster + + # you can also pass URLs like http://domain.com/path/to/downloadable/package.jar + jar_files = ["hdfs:///path/to/package.jar"] + + # do not pass spark.jars.packages + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars", ",".join(jar_files)) + .getOrCreate() + ) + +Using ``spark.jars.repositories`` +--------------------------------- + +.. note:: + + In this case Spark still will try to fetch packages from the internet, so if you don't have internet access, + Spark session will be created with significant delay because of all attempts to fetch packages. + +Can be used if you have access both to public repos (like Maven) and a private Artifactory/Nexus repo. + +* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter. +* Upload ``package.jar`` file to private repository (with same ``groupId`` and ``artifactoryId`` as in source package in Maven). +* Pass repo URL to ``spark.jars.repositories`` Spark config option. +* Create Spark session with passing Package name to ``spark.jars.packages`` Spark config option: + +.. 
code:: python + + maven_packages = Greenplum.get_packages(spark_version="3.2") + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.repositories", "http://nexus.mydomain.com/private-repo/") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + + +Using ``spark.jars.ivySettings`` +-------------------------------- + +Same as above, but can be used even if there is no network access to public repos like Maven. + +* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter. +* Upload ``package.jar`` file to `private repository `_ (with same ``groupId`` and ``artifactoryId`` as in source package in Maven). +* Create ``ivysettings.xml`` file (see below). +* Add here a resolver with repository URL (and credentials, if required). +* Pass ``ivysettings.xml`` absolute path to ``spark.jars.ivySettings`` Spark config option. +* Create Spark session with passing package name to ``spark.jars.packages`` Spark config option: + +.. tabs:: + + .. code-tab:: xml ivysettings-all-packages-uploaded-to-nexus.xml + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-private-packages-in-nexus-public-in-maven.xml + + + + + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-private-packages-in-nexus-public-fetched-using-proxy-repo.xml + + + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-nexus-with-auth-required.xml + + + + + + + + + + + + + + + + + + + + + + +.. code-block:: python + :caption: script.py + + maven_packages = Greenplum.get_packages(spark_version="3.2") + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.ivySettings", "/path/to/ivysettings.xml") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + +Place ``.jar`` file to ``-/.ivy2/jars/`` +---------------------------------------- + +Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven. + +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. +* Move it to ``-/.ivy2/jars/`` folder. +* Create Spark session with passing package name to ``spark.jars.packages`` Spark config option: + +.. code:: python + + maven_packages = Greenplum.get_packages(spark_version="3.2") + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + +Place ``.jar`` file to Spark jars folder +---------------------------------------- + +.. note:: + + Package file should be placed on all hosts/containers Spark is running, + both driver and all executors. + + Usually this is used only with either: + * ``spark.master=local`` (driver and executors are running on the same host), + * ``spark.master=k8s://...`` (``.jar`` files are added to image or to volume mounted to all pods). + +Can be used to embed ``.jar`` files to a default Spark classpath. + +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. +* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``. +* Create Spark session **WITHOUT** passing Package name to ``spark.jars.packages`` +.. 
code:: python + + # no need to set spark.jars.packages or any other spark.jars.* option + # all jars already present in CLASSPATH, and loaded automatically + + spark = SparkSession.builder.config("spark.app.name", "onetl").getOrCreate() + + +Manually adding ``.jar`` files to ``CLASSPATH`` +----------------------------------------------- + +.. note:: + + Package file should be placed on all hosts/containers Spark is running, + both driver and all executors. + + Usually this is used only with either: + * ``spark.master=local`` (driver and executors are running on the same host), + * ``spark.master=k8s://...`` (``.jar`` files are added to image or to volume mounted to all pods). + +Can be used to embed ``.jar`` files to a default Java classpath. + +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter. +* Set environment variable ``CLASSPATH`` to ``/path/to/package.jar``. You can set multiple file paths +* Create Spark session **WITHOUT** passing Package name to ``spark.jars.packages`` + +.. code:: python + + # no need to set spark.jars.packages or any other spark.jars.* option + # all jars already present in CLASSPATH, and loaded automatically + + import os + + jar_files = ["/path/to/package.jar"] + # different delimiters for Windows and Linux + delimiter = ";" if os.name == "nt" else ":" + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.driver.extraClassPath", delimiter.join(jar_files)) + .config("spark.executor.extraClassPath", delimiter.join(jar_files)) + .getOrCreate() + ) diff --git a/docs/install/python_packages.rst b/docs/install/python_packages.rst new file mode 100644 index 000000000..4459b2f37 --- /dev/null +++ b/docs/install/python_packages.rst @@ -0,0 +1,8 @@ +.. _python-packages: + +Python packages +=============== + +.. include:: ../../README.rst + :start-after: minimal-install + :end-before: quick-start From ab8632d94615e39e9f07503ff9a05073d24bd9d3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Sep 2023 05:10:06 +0000 Subject: [PATCH 14/26] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.11.0 → v3.13.0](https://github.com/asottile/pyupgrade/compare/v3.11.0...v3.13.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d43a63307..193ae3c3d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -59,7 +59,7 @@ repos: - id: rst-inline-touching-normal - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.11.0 + rev: v3.13.0 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] From 3719442ee1e3dc23cfc6bb6936da2df6a12507bb Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Tue, 26 Sep 2023 10:30:23 +0300 Subject: [PATCH 15/26] [DOP-8838] - add CI/CD for Samba connection (#152) * [DOP-8838] - add CI/CD for Samba connection * [DOP-8838] - update get-matrix.yml * [DOP-8838] - update services parameters * Revert "[DOP-8838] - update services parameters" This reverts commit 8f88cea6a5845aade610b7376ae99947c9bbe962. 
* [DOP-8838] - add 'options' parameter to workflow in test_samba.yml * [DOP-8838] - change way to start samba container * [DOP-8838] - change way to start samba container in docker-compose * [DOP-8838] - fix unsupported import * [DOP-8838] - change parameters for starting samba container * [DOP-8838] - update test-samba.yml * [DOP-8838] - update get-matrix.yml * [DOP-8838] - update tests.yml * [DOP-8838] - replace openssh-version with server-version * [DOP-8838] - replace openssh-version with server-version --- .github/workflows/data/samba/ignored.txt | 1 + .github/workflows/data/samba/matrix.yml | 18 +++++ .github/workflows/data/samba/tracked.txt | 1 + .github/workflows/get-matrix.yml | 33 +++++++++ .github/workflows/nightly.yml | 16 +++++ .github/workflows/test-samba.yml | 81 +++++++++++++++++++++++ .github/workflows/tests.yml | 16 +++++ docker-compose.yml | 4 +- docker/samba/custom_entrypoint.sh | 6 ++ docker/samba/on_post_init.sh | 4 -- onetl/connection/file_connection/samba.py | 3 +- 11 files changed, 176 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/data/samba/ignored.txt create mode 100644 .github/workflows/data/samba/matrix.yml create mode 100644 .github/workflows/data/samba/tracked.txt create mode 100644 .github/workflows/test-samba.yml create mode 100755 docker/samba/custom_entrypoint.sh delete mode 100755 docker/samba/on_post_init.sh diff --git a/.github/workflows/data/samba/ignored.txt b/.github/workflows/data/samba/ignored.txt new file mode 100644 index 000000000..d8f8d4692 --- /dev/null +++ b/.github/workflows/data/samba/ignored.txt @@ -0,0 +1 @@ +docs diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml new file mode 100644 index 000000000..a4a3afe30 --- /dev/null +++ b/.github/workflows/data/samba/matrix.yml @@ -0,0 +1,18 @@ +min: &min + python-version: '3.7' + os: ubuntu-latest + +max: &max + python-version: '3.11' + os: ubuntu-latest + +matrix: + small: + - server-version: latest + <<: *max + full: &full + - server-version: latest + <<: *min + - server-version: latest + <<: *max + nightly: *full diff --git a/.github/workflows/data/samba/tracked.txt b/.github/workflows/data/samba/tracked.txt new file mode 100644 index 000000000..5f7fcf905 --- /dev/null +++ b/.github/workflows/data/samba/tracked.txt @@ -0,0 +1 @@ +**/samba* diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index fd7e24aae..b9d160b42 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -41,6 +41,8 @@ on: value: ${{ jobs.get-matrix.outputs.matrix-s3 }} matrix-sftp: value: ${{ jobs.get-matrix.outputs.matrix-sftp }} + matrix-samba: + value: ${{ jobs.get-matrix.outputs.matrix-samba }} matrix-webdav: value: ${{ jobs.get-matrix.outputs.matrix-webdav }} @@ -69,6 +71,7 @@ jobs: matrix-hdfs: ${{ toJson(fromJson(steps.matrix-hdfs.outputs.result)[steps.key-hdfs.outputs.key]) }} matrix-s3: ${{ toJson(fromJson(steps.matrix-s3.outputs.result)[steps.key-s3.outputs.key]) }} matrix-sftp: ${{ toJson(fromJson(steps.matrix-sftp.outputs.result)[steps.key-sftp.outputs.key]) }} + matrix-samba: ${{ toJson(fromJson(steps.matrix-samba.outputs.result)[steps.key-samba.outputs.key]) }} matrix-webdav: ${{ toJson(fromJson(steps.matrix-webdav.outputs.result)[steps.key-webdav.outputs.key]) }} steps: - name: Checkout code @@ -635,6 +638,36 @@ jobs: with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml + - name: Check if Samba files are changed + id: changed-samba + uses: 
tj-actions/changed-files@v35 + with: + files_from_source_file: .github/workflows/data/samba/tracked.txt + files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt + + - name: Print Samba files changed + run: | + echo '${{ steps.changed-samba.outputs.all_changed_files }}' + + - name: Calculate Samba matrix key + id: key-samba + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-samba.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Samba matrix + id: matrix-samba + uses: mikefarah/yq@v4.33.3 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml + - name: Check if WebDAV files are changed id: changed-webdav uses: tj-actions/changed-files@v35 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 209364f4b..7608ebe6e 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -303,6 +303,21 @@ jobs: os: ${{ matrix.os }} with-cache: false + tests-samba: + name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + needs: [get-matrix] + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.get-matrix.outputs.matrix-samba) }} + + uses: ./.github/workflows/test-samba.yml + with: + server-version: ${{ matrix.server-version }} + python-version: ${{ matrix.python-version }} + os: ${{ matrix.os }} + with-cache: false + tests-webdav: name: Run WebDAV tests (server=${{ matrix.openwebdavssh-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] @@ -338,6 +353,7 @@ jobs: - tests-hdfs - tests-s3 - tests-sftp + - tests-samba - tests-webdav steps: diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml new file mode 100644 index 000000000..d823a9ae7 --- /dev/null +++ b/.github/workflows/test-samba.yml @@ -0,0 +1,81 @@ +name: Tests for Samba +on: + workflow_call: + inputs: + server-version: + required: true + type: string + python-version: + required: true + type: string + os: + required: true + type: string + with-cache: + required: false + type: boolean + default: true + +jobs: + test-samba: + name: Run Samba tests (server=${{ inputs.server-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + runs-on: ${{ inputs.os }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + + - name: Cache pip + uses: actions/cache@v3 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba- + + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel + + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/samba.txt -r requirements/tests/base.txt + + # Replace with Github Actions' because of custom parameter for samba container start + - name: Start Samba + run: 
| + docker compose down -v --remove-orphans + docker compose up -d samba + env: + SAMBA_IMAGE: elswork/samba:${{ inputs.server-version }} + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} + + - name: Wait for Samba to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 445 -t 60 + + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m samba + + - name: Shutdown Samba + if: always() + run: | + docker compose down -v --remove-orphans + env: + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} + + - name: Upload coverage results + uses: actions/upload-artifact@v3 + with: + name: samba-${{ inputs.server-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 44125d701..1df7f5306 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -287,6 +287,21 @@ jobs: python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} + tests-samba: + name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + needs: [get-matrix] + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.get-matrix.outputs.matrix-samba) }} + + uses: ./.github/workflows/test-samba.yml + with: + server-version: ${{ matrix.server-version }} + python-version: ${{ matrix.python-version }} + os: ${{ matrix.os }} + + tests-webdav: name: Run WebDAV tests (server=${{ matrix.webdav-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] @@ -321,6 +336,7 @@ jobs: - tests-hdfs - tests-s3 - tests-sftp + - tests-samba - tests-webdav steps: diff --git a/docker-compose.yml b/docker-compose.yml index 3d93c02af..bdcfe3954 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -171,8 +171,8 @@ services: - "139:139" - "445:445" volumes: - - ./docker/samba:/share/folder - command: '-u "1000:1000:onetl:onetl:awd123fd1" -s "SmbShare:/share/folder:rw:onetl"' + - ./docker/samba/custom_entrypoint.sh:/custom_entrypoint.sh + entrypoint: ["/custom_entrypoint.sh"] networks: - onetl diff --git a/docker/samba/custom_entrypoint.sh b/docker/samba/custom_entrypoint.sh new file mode 100755 index 000000000..f0d4078c0 --- /dev/null +++ b/docker/samba/custom_entrypoint.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +# allow create files and directories +mkdir -p /share/folder +chmod 0777 /share/folder +/entrypoint.sh -u "1000:1000:onetl:onetl:awd123fd1" -s "SmbShare:/share/folder:rw:onetl" diff --git a/docker/samba/on_post_init.sh b/docker/samba/on_post_init.sh deleted file mode 100755 index f71af2a03..000000000 --- a/docker/samba/on_post_init.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -# allow create files and directories -chmod 777 /share/folder diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index 7a7f21132..8073622c3 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -20,10 +20,11 @@ from io import BytesIO from logging import getLogger from pathlib import Path -from typing import Literal, Optional, Union +from typing import Optional, Union from etl_entities.instance import Host from pydantic import SecretStr, validator +from typing_extensions import Literal from onetl.connection.file_connection.file_connection import 
FileConnection from onetl.hooks import slot, support_hooks From b8bd6dd2e4077c58cd149c2b5e40c44d404e79ac Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Tue, 26 Sep 2023 10:47:45 +0300 Subject: [PATCH 16/26] [DOP-8839] - add documentation to Samba connection (#153) * [DOP-8839] - add documentation to Samba connection * [DOP-8839] - updated documentation to Samba connection * [DOP-8839] - updated changelog * Update docs/changelog/next_release/150.feature.rst Co-authored-by: Maxim Martynov * [DOP-8839] - change default client name in SMB connection * Update onetl/connection/file_connection/samba.py Co-authored-by: Maxim Martynov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Maxim Martynov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .readthedocs.yml | 1 + README.rst | 4 +++- docs/changelog/next_release/150.feature.rst | 2 ++ docs/connection/file_connection/index.rst | 1 + docs/connection/file_connection/samba.rst | 9 ++++++++ onetl/connection/file_connection/samba.py | 25 +++++++++++++++++++-- 6 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 docs/changelog/next_release/150.feature.rst create mode 100644 docs/connection/file_connection/samba.rst diff --git a/.readthedocs.yml b/.readthedocs.yml index 4d54479b4..923741b22 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -14,6 +14,7 @@ python: - ftp - ftps - hdfs + - samba - s3 - sftp - webdav diff --git a/README.rst b/README.rst index ea9518d91..1c483e068 100644 --- a/README.rst +++ b/README.rst @@ -93,6 +93,8 @@ Supported storages | | FTPS | | + +--------------+----------------------------------------------------------------------------------------------------------------------+ | | WebDAV | `WebdavClient3 library `_ | ++ +--------------+----------------------------------------------------------------------------------------------------------------------+ +| | Samba | `pysmb library `_ | +--------------------+--------------+----------------------------------------------------------------------------------------------------------------------+ | Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | | +--------------+ + @@ -204,7 +206,7 @@ Each client can be installed explicitly by passing connector name (in lowercase) .. code:: bash pip install onetl[ftp] # specific connector - pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav] # multiple connectors + pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba] # multiple connectors To install all file connectors at once you can pass ``files`` to ``extras``: diff --git a/docs/changelog/next_release/150.feature.rst b/docs/changelog/next_release/150.feature.rst new file mode 100644 index 000000000..6ea0af9ff --- /dev/null +++ b/docs/changelog/next_release/150.feature.rst @@ -0,0 +1,2 @@ +Add ``Samba`` file connection. +It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. 
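For illustration, a file download over the new ``Samba`` connection could look like the minimal sketch below. Connection parameters and paths here are hypothetical, and it assumes ``FileDownloader`` is used with its usual ``connection``/``source_path``/``local_path`` arguments and ``run()`` method:

.. code:: python

    from onetl.connection import Samba
    from onetl.file import FileDownloader

    # hypothetical credentials and share name
    samba = Samba(
        host="mydomain.com",
        share="share_name",
        protocol="SMB",
        port=445,
        user="user",
        password="password",
    )

    # copy everything from the shared folder to a local directory
    downloader = FileDownloader(
        connection=samba,
        source_path="/remote/folder",  # hypothetical remote path
        local_path="/local/folder",  # hypothetical local path
    )
    result = downloader.run()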
diff --git a/docs/connection/file_connection/index.rst b/docs/connection/file_connection/index.rst index 2fc998c7f..3b6908c7d 100644 --- a/docs/connection/file_connection/index.rst +++ b/docs/connection/file_connection/index.rst @@ -10,6 +10,7 @@ File Connections FTP FTPS HDFS + Samba SFTP S3 Webdav diff --git a/docs/connection/file_connection/samba.rst b/docs/connection/file_connection/samba.rst new file mode 100644 index 000000000..73f7ac3f9 --- /dev/null +++ b/docs/connection/file_connection/samba.rst @@ -0,0 +1,9 @@ +.. _samba: + +Samba connection +============== + +.. currentmodule:: onetl.connection.file_connection.samba + +.. autoclass:: Samba + :members: __init__, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_file, list_dir, download_file, upload_file diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index 8073622c3..9e907ee3e 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -54,10 +54,12 @@ @support_hooks class Samba(FileConnection): - """Samba file connection. + """Samba file connection. |support_hooks| Based on `pysmb library `_. + .. versionadded:: 0.9.4 + Parameters ---------- host : str @@ -86,6 +88,25 @@ class Samba(FileConnection): password : str, default: None Password for file source connection. Can be `None` for anonymous connection. + Examples + -------- + + Samba file connection initialization + + .. code:: python + + from onetl.connection import Samba + + samba = Samba( + host="mydomain.com", + share="share_name", + protocol="SMB", + port=445, + user="user", + password="password", + ) + + """ host: Host @@ -168,7 +189,7 @@ def _get_client(self) -> SMBConnection: conn = SMBConnection( username=self.user, password=self.password.get_secret_value() if self.password else None, - my_name="optional_client_name", + my_name="onetl", remote_name=self.host, domain=self.domain, use_ntlm_v2=use_ntlm_v2, From 95572680bace2743f7980ce972a65abd5a87ec2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 25 Sep 2023 12:18:03 +0000 Subject: [PATCH 17/26] [DOP-9007] Rearrange installation documentation --- README.rst | 37 +++++----- docs/install/files.rst | 8 +++ docs/install/full.rst | 8 +++ docs/install/index.rst | 14 +++- docs/install/kerberos.rst | 8 +++ docs/install/python_packages.rst | 8 --- docs/install/{java_packages.rst => spark.rst} | 70 +++++++++++++++---- 7 files changed, 111 insertions(+), 42 deletions(-) create mode 100644 docs/install/files.rst create mode 100644 docs/install/full.rst create mode 100644 docs/install/kerberos.rst delete mode 100644 docs/install/python_packages.rst rename docs/install/{java_packages.rst => spark.rst} (89%) diff --git a/README.rst b/README.rst index 1c483e068..4f8b0aca8 100644 --- a/README.rst +++ b/README.rst @@ -54,7 +54,7 @@ Requirements * **Python 3.7 - 3.11** * PySpark 2.3.x - 3.4.x (depends on used connector) * Java 8+ (required by Spark, see below) -* Kerberos libs & GCC (required by ``Hive`` and ``HDFS`` connectors) +* Kerberos libs & GCC (required by ``Hive``, ``HDFS`` and ``SparkHDFS`` connectors) Supported storages ------------------ @@ -111,16 +111,16 @@ Documentation See https://onetl.readthedocs.io/ -.. install - How to install --------------- -.. minimal-install +.. 
_install: Minimal installation ~~~~~~~~~~~~~~~~~~~~ +.. _minimal-install: + Base ``onetl`` package contains: * ``DBReader``, ``DBWriter`` and related classes @@ -142,14 +142,16 @@ It can be installed via: This method is recommended for use in third-party libraries which require for ``onetl`` to be installed, but do not use its connection classes. -.. _spark-install: - With DB and FileDF connections ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _spark-install: + All DB connection classes (``Clickhouse``, ``Greenplum``, ``Hive`` and others) and all FileDF connection classes (``SparkHDFS``, ``SparkLocalFS``, ``SparkS3``) -require PySpark to be installed. +require Spark to be installed. + +.. _java-install: Firstly, you should install JDK. The exact installation instruction depends on your OS, here are some examples: @@ -178,6 +180,8 @@ Compatibility matrix | `3.4.x `_ | 3.7 - 3.11 | 8u362 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ +.. _pyspark-install: + Then you should install PySpark via passing ``spark`` to ``extras``: .. code:: bash @@ -193,12 +197,11 @@ or install PySpark explicitly: or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance. **Otherwise connection object cannot be created.** - -.. _files-install: - With File connections ~~~~~~~~~~~~~~~~~~~~~ +.. _files-install: + All File (but not *FileDF*) connection classes (``FTP``, ``SFTP``, ``HDFS`` and so on) requires specific Python clients to be installed. Each client can be installed explicitly by passing connector name (in lowercase) to ``extras``: @@ -216,18 +219,17 @@ To install all file connectors at once you can pass ``files`` to ``extras``: **Otherwise class import will fail.** - -.. _kerberos-install: - With Kerberos support ~~~~~~~~~~~~~~~~~~~~~ +.. _kerberos-install: + Most of Hadoop instances set up with Kerberos support, so some connections require additional setup to work properly. * ``HDFS`` Uses `requests-kerberos `_ and - `GSSApi `_ for authentication in WebHDFS. + `GSSApi `_ for authentication. It also uses ``kinit`` executable to generate Kerberos ticket. * ``Hive`` and ``SparkHDFS`` @@ -252,12 +254,11 @@ Also you should pass ``kerberos`` to ``extras`` to install required Python packa pip install onetl[kerberos] - -.. _full-install: - Full bundle ~~~~~~~~~~~ +.. _full-bundle: + To install all connectors and dependencies, you can pass ``all`` into ``extras``: .. code:: bash @@ -271,7 +272,7 @@ To install all connectors and dependencies, you can pass ``all`` into ``extras`` This method consumes a lot of disk space, and requires for Java & Kerberos libraries to be installed into your OS. -.. quick-start +.. _quick-start: Quick start ------------ diff --git a/docs/install/files.rst b/docs/install/files.rst new file mode 100644 index 000000000..b32c7a807 --- /dev/null +++ b/docs/install/files.rst @@ -0,0 +1,8 @@ +.. _install-files: + +File connections +================= + +.. include:: ../../README.rst + :start-after: .. _files-install: + :end-before: With Kerberos support diff --git a/docs/install/full.rst b/docs/install/full.rst new file mode 100644 index 000000000..a3853207c --- /dev/null +++ b/docs/install/full.rst @@ -0,0 +1,8 @@ +.. _install-full: + +Full bundle +=========== + +.. include:: ../../README.rst + :start-after: .. _full-bundle: + :end-before: .. 
_quick-start: diff --git a/docs/install/index.rst b/docs/install/index.rst index 86365e381..47f86287c 100644 --- a/docs/install/index.rst +++ b/docs/install/index.rst @@ -3,9 +3,19 @@ How to install ============== +.. include:: ../../README.rst + :start-after: .. _minimal-install: + :end-before: With DB and FileDF connections + +Installation in details +----------------------- + .. toctree:: :maxdepth: 1 :caption: How to install - python_packages - java_packages + self + spark + files + kerberos + full diff --git a/docs/install/kerberos.rst b/docs/install/kerberos.rst new file mode 100644 index 000000000..2ba28de4d --- /dev/null +++ b/docs/install/kerberos.rst @@ -0,0 +1,8 @@ +.. _install-kerberos: + +Kerberos support +================ + +.. include:: ../../README.rst + :start-after: .. _kerberos-install: + :end-before: Full bundle diff --git a/docs/install/python_packages.rst b/docs/install/python_packages.rst deleted file mode 100644 index 4459b2f37..000000000 --- a/docs/install/python_packages.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _python-packages: - -Python packages -=============== - -.. include:: ../../README.rst - :start-after: minimal-install - :end-before: quick-start diff --git a/docs/install/java_packages.rst b/docs/install/spark.rst similarity index 89% rename from docs/install/java_packages.rst rename to docs/install/spark.rst index a64c9e7c5..861527341 100644 --- a/docs/install/java_packages.rst +++ b/docs/install/spark.rst @@ -1,9 +1,32 @@ +.. _install-spark: + +Spark +===== + +.. include:: ../../README.rst + :start-after: .. _spark-install: + :end-before: .. _java-install: + +Installing Java +--------------- + +.. include:: ../../README.rst + :start-after: .. _java-install: + :end-before: .. _pyspark-install: + +Installing PySpark +------------------ + +.. include:: ../../README.rst + :start-after: .. _pyspark-install: + :end-before: With File connections + .. _java-packages: -Java packages -============== +Injecting Java packages +----------------------- -``DB`` and ``FileDF`` connection classes require specific packages to be inserted to ``CLASSPATH`` of Spark session, +Some DB and FileDF connection classes require specific packages to be inserted to ``CLASSPATH`` of Spark session, like JDBC drivers. This is usually done by setting up ``spark.jars.packages`` option while creating Spark session: @@ -11,7 +34,11 @@ This is usually done by setting up ``spark.jars.packages`` option while creating .. code:: python # here is a list of packages to be downloaded: - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) spark = ( SparkSession.builder.config("spark.app.name", "onetl") @@ -33,7 +60,7 @@ But sometimes it is required to: There are several ways to do that. Using ``spark.jars`` --------------------- +^^^^^^^^^^^^^^^^^^^^ The most simple solution, but this requires to store raw ``.jar`` files somewhere on filesystem or web server. @@ -67,7 +94,7 @@ The most simple solution, but this requires to store raw ``.jar`` files somewher ) Using ``spark.jars.repositories`` ---------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. note:: @@ -84,7 +111,12 @@ Can be used if you have access both to public repos (like Maven) and a private A .. 
code:: python - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) + spark = ( SparkSession.builder.config("spark.app.name", "onetl") .config("spark.jars.repositories", "http://nexus.mydomain.com/private-repo/") @@ -94,7 +126,7 @@ Can be used if you have access both to public repos (like Maven) and a private A Using ``spark.jars.ivySettings`` --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Same as above, but can be used even if there is no network access to public repos like Maven. @@ -194,7 +226,12 @@ Same as above, but can be used even if there is no network access to public repo .. code-block:: python :caption: script.py - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) + spark = ( SparkSession.builder.config("spark.app.name", "onetl") .config("spark.jars.ivySettings", "/path/to/ivysettings.xml") @@ -203,7 +240,7 @@ Same as above, but can be used even if there is no network access to public repo ) Place ``.jar`` file to ``-/.ivy2/jars/`` ----------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven. @@ -213,7 +250,12 @@ Can be used to pass already downloaded file to Ivy, and skip resolving package f .. code:: python - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) + spark = ( SparkSession.builder.config("spark.app.name", "onetl") .config("spark.jars.packages", ",".join(maven_packages)) @@ -221,7 +263,7 @@ Can be used to pass already downloaded file to Ivy, and skip resolving package f ) Place ``.jar`` file to Spark jars folder ----------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. note:: @@ -235,7 +277,7 @@ Place ``.jar`` file to Spark jars folder Can be used to embed ``.jar`` files to a default Spark classpath. * Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. -* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``. +* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``^/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``. * Create Spark session **WITHOUT** passing Package name to ``spark.jars.packages`` .. code:: python @@ -246,7 +288,7 @@ Can be used to embed ``.jar`` files to a default Spark classpath. Manually adding ``.jar`` files to ``CLASSPATH`` ------------------------------------------------ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
note:: From ae6bde61d15aa6754113fc8efc45312a123ca878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 25 Sep 2023 11:22:28 +0000 Subject: [PATCH 18/26] [DOP-9085] Improve Greenplum documentation --- .../next_release/154.improvement.rst | 4 + docs/conf.py | 1 + .../db_connection/greenplum/execute.rst | 41 ++++ .../db_connection/greenplum/prerequisites.rst | 176 ++++++++++++++++++ .../db_connection/greenplum/read.rst | 141 +++++++++++++- .../db_connection/greenplum/write.rst | 95 ++++++++++ .../db_connection/greenplum/connection.py | 24 +-- .../db_connection/greenplum/options.py | 4 +- .../file_df_connection/spark_s3/connection.py | 8 + requirements/docs.txt | 4 +- 10 files changed, 469 insertions(+), 29 deletions(-) create mode 100644 docs/changelog/next_release/154.improvement.rst diff --git a/docs/changelog/next_release/154.improvement.rst b/docs/changelog/next_release/154.improvement.rst new file mode 100644 index 000000000..745989a87 --- /dev/null +++ b/docs/changelog/next_release/154.improvement.rst @@ -0,0 +1,4 @@ +Drastically improve ``Greenplum`` documentation: +* Added information about network ports, grants, ``pg_hba.conf`` and so on. +* Added interaction schemas for reading, writing and executing statements in Greenplum. +* Added recommendations about reading data from views and ``JOIN`` results from Greenplum. diff --git a/docs/conf.py b/docs/conf.py index 87d6fd17b..06a5b08aa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -56,6 +56,7 @@ "sphinx.ext.autosummary", "sphinxcontrib.autodoc_pydantic", "sphinxcontrib.towncrier", # provides `towncrier-draft-entries` directive + "sphinxcontrib.plantuml", ] numpydoc_show_class_members = True autodoc_pydantic_model_show_config = False diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index b0833b213..e2179a4ec 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -3,6 +3,47 @@ Executing statements in Greenplum ================================== +Interaction schema +------------------ + +Unlike reading & writing, executing statements in Greenplum is done **only** through Greenplum master node, +without any interaction between Greenplum segments and Spark executors. More than that, Spark executors are not used in this case. + +The only port used while interacting with Greenplum in this case is ``5432`` (Greenplum master port). + +.. dropdown:: Spark <-> Greenplum interaction during Greenplum.execute()/Greenplum.fetch() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + end box + + box "Greenplum" + participant "Greenplum master" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + + == Greenplum.execute(statement) == + "Spark driver" --> "Greenplum master" : EXECUTE statement + "Greenplum master" -> "Spark driver" : RETURN result + + == Greenplum.close() == + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml + +Options +------- + .. currentmodule:: onetl.connection.db_connection.greenplum.connection .. 
automethod:: Greenplum.fetch diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index a545fdc27..f7b8e9d32 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -43,3 +43,179 @@ There are several ways to do that. See :ref:`java-packages` for details. If you're uploading package to private package repo, use ``groupId=io.pivotal`` and ``artifactoryId=greenplum-spark_2.12`` (``2.12`` is Scala version) to give uploaded package a proper name. + +Connecting to Greenplum +----------------------- + +Interaction schema +~~~~~~~~~~~~~~~~~~ + +Spark executors open ports to listen incoming requests. +Greenplum segments are initiating connections to Spark executors using `EXTERNAL TABLE `_ +functionality, and send/read data using `gpfdist `_ protocol. + +Data is **not** send through Greenplum master. +Greenplum master only receives commands to start reading/writing process, and manages all the metadata (external table location, schema and so on). + +More details can be found in `official documentation `_. + +Number of parallel connections +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + This is very important!!! + + If you don't limit number of connections, you can exceed the `max_connections `_ + limit set on the Greenplum side. It's usually not so high, e.g. 500-1000 connections max, + depending on your Greenplum instance settings and using connection balancers like ``pgbouncer``. + + Consuming all available connections means **nobody** (even admin users) can connect to Greenplum. + +Each job on the Spark executor makes its own connection to Greenplum master node, +so you need to limit number of connections to avoid opening too many of them. + +* Reading about ``5-10Gb`` of data requires about ``3-5`` parallel connections. +* Reading about ``20-30Gb`` of data requires about ``5-10`` parallel connections. +* Reading about ``50Gb`` of data requires ~ ``10-20`` parallel connections. +* Reading about ``100+Gb`` of data requires ``20-30`` parallel connections. +* Opening more than ``30-50`` connections is not recommended. + +Number of connections can be limited by 2 ways: + +* By limiting number of Spark executors and number of cores per-executor. Max number of parallel jobs is ``executors * cores``. + +.. tabs:: + + .. code-tab:: py Spark with master=local + + ( + SparkSession.builder + # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 + .config("spark.master", "local[10]") + .config("spark.executor.cores", 1) + ) + + .. code-tab:: py Spark with master=yarn or master=k8s, dynamic allocation + + ( + SparkSession.builder + .config("spark.master", "yarn") + # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10 + .config("spark.dynamicAllocation.maxExecutors", 10) + .config("spark.executor.cores", 1) + ) + + .. code-tab:: py Spark with master=yarn or master=k8s, static allocation + + ( + SparkSession.builder + .config("spark.master", "yarn") + # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 + .config("spark.executor.instances", 10) + .config("spark.executor.cores", 1) + ) + +* By limiting connection pool size user by Spark (**only** for Spark with ``master=local``): + +.. 
code:: python
+
+    spark = SparkSession.builder.config("spark.master", "local[*]").getOrCreate()
+
+    # No matter how many executors are started and how many cores they have,
+    # number of connections cannot exceed pool size:
+    Greenplum(
+        ...,
+        extra={
+            "pool.maxSize": 10,
+        },
+    )
+
+See `connection pooling `_
+documentation.
+
+
+* By setting :obj:`num_partitions `
+  and :obj:`partition_column ` (not recommended).
+
+Allowing connection to Greenplum master
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Ask your Greenplum cluster administrator to allow your user to connect to Greenplum master node,
+e.g. by updating the ``pg_hba.conf`` file.
+
+More details can be found in `official documentation `_.
+
+Network ports
+~~~~~~~~~~~~~
+
+To read data from Greenplum using Spark, the following ports should be opened in the firewall between Spark and Greenplum:
+
+* Spark driver and all Spark executors -> port ``5432`` on Greenplum master node.
+
+    This port number should be set while connecting to Greenplum:
+
+    .. code:: python
+
+        Greenplum(host="master.host", port=5432, ...)
+
+* Greenplum segments -> some port range (e.g. ``41000-42000``) **listened by Spark executor**.
+
+    This range should be set in ``extra`` option:
+
+    .. code:: python
+
+        Greenplum(
+            ...,
+            extra={
+                "server.port": "41000-42000",
+            },
+        )
+
+    Number of ports in this range is ``number of parallel running Spark sessions`` * ``number of parallel connections per session``.
+
+    Number of connections per session is usually less than ``30`` (see above).
+
+    Number of sessions depends on your environment:
+        * For ``master=local`` only a few or tens of sessions can be started on the same host, depending on available RAM and CPU.
+
+        * For ``master=yarn`` / ``master=k8s`` hundreds or thousands of sessions can be started simultaneously,
+          but they are executed on different cluster nodes, so the same port can be opened on different nodes at the same time.
+
+More details can be found in official documentation:
+    * `port requirements `_
+    * `format of server.port value `_
+    * `port troubleshooting `_
+
+Required grants
+~~~~~~~~~~~~~~~
+
+Ask your Greenplum cluster administrator to set the following grants for the user
+used for creating a connection:
+
+.. tabs::
+
+    .. code-tab:: sql Reading & writing
+
+        GRANT USAGE ON SCHEMA myschema TO username;
+        GRANT CREATE ON SCHEMA myschema TO username;
+        GRANT SELECT, INSERT ON TABLE myschema.mytable TO username;
+        ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist');
+
+    .. code-tab:: sql Reading from Greenplum
+
+        GRANT USAGE ON SCHEMA schema_to_read TO username;
+        GRANT CREATE ON SCHEMA schema_to_read TO username;
+        GRANT SELECT ON TABLE schema_to_read.table_to_read TO username;
+        -- yes, ``writable``, because data is written from Greenplum to Spark executor.
+        ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist');
+
+    .. code-tab:: sql Writing to Greenplum
+
+        GRANT USAGE ON SCHEMA schema_to_write TO username;
+        GRANT CREATE ON SCHEMA schema_to_write TO username;
+        GRANT SELECT, INSERT ON TABLE schema_to_write.table_to_write TO username;
+        -- yes, ``readable``, because data is read from Spark executor to Greenplum.
+        ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist');
+
+More details can be found in `official documentation `_.
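Putting the prerequisites above together, a connection setup could look like the following sketch. Host name, credentials, database name and port range are hypothetical, and the exact set of ``Greenplum`` constructor arguments may differ between onETL versions:

.. code:: python

    from pyspark.sql import SparkSession

    from onetl.connection import Greenplum

    maven_packages = Greenplum.get_packages(spark_version="3.2")

    # limit number of parallel connections to Greenplum master (see above)
    spark = (
        SparkSession.builder.config("spark.app.name", "onetl")
        .config("spark.jars.packages", ",".join(maven_packages))
        .config("spark.dynamicAllocation.maxExecutors", 10)
        .config("spark.executor.cores", 1)
        .getOrCreate()
    )

    greenplum = Greenplum(
        host="master.host",  # hypothetical Greenplum master host
        port=5432,
        user="username",
        password="***",
        database="mydb",  # hypothetical database name
        extra={
            # port range listened by Spark executors, see "Network ports" above
            "server.port": "41000-42000",
        },
        spark=spark,
    )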
diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst index 2640f7e6c..30d669fea 100644 --- a/docs/connection/db_connection/greenplum/read.rst +++ b/docs/connection/db_connection/greenplum/read.rst @@ -8,20 +8,143 @@ For reading data from Greenplum, use :obj:`DBReader `, - and drop staging table after reading is finished. +Interaction schema +------------------ - In this case data will be read directly from Greenplum segment nodes in a distributed way. +High-level schema is described in :ref:`greenplum-prerequisites`. You can find detailed interaction schema below. + +.. dropdown:: Spark <-> Greenplum interaction during DBReader.run() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + participant "Spark executor1" + participant "Spark executor2" + participant "Spark executorN" + end box + + box "Greenplum" + participant "Greenplum master" + participant "Greenplum segment1" + participant "Greenplum segment2" + participant "Greenplum segmentN" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + + "Spark driver" --> "Greenplum master" : CHECK IF TABLE EXISTS gp_table + "Greenplum master" --> "Spark driver" : TABLE EXISTS + "Spark driver" -> "Greenplum master" : SHOW SCHEMA FOR gp_table + "Greenplum master" --> "Spark driver" : (id bigint, col1 int, col2 text, ...) + + == DBReader.run() == + + "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 + "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 + "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV + "Spark executor1" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1 + note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master + "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1 + note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments. + + "Spark executor2" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) USING address=executor2_host:executor2_port;\nINSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2 + "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2 + + "Spark executorN" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) 
USING address=executorN_host:executorN_port;\nINSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N
+        "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN
+
+        "Greenplum segment1" ->o "Spark executor1" -- : INITIALIZE CONNECTION TO Spark executor1\nPUSH DATA TO Spark executor1
+        note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor
+
+        "Greenplum segment2" ->o "Spark executor2" -- : INITIALIZE CONNECTION TO Spark executor2\nPUSH DATA TO Spark executor2
+        "Greenplum segmentN" ->o "Spark executorN" -- : INITIALIZE CONNECTION TO Spark executorN\nPUSH DATA TO Spark executorN
+
+        == Spark.stop() ==
+
+        "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1
+        deactivate "Greenplum master"
+        "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2
+        deactivate "Greenplum master"
+        "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN
+        deactivate "Greenplum master"
+
+        "Spark executor1" --> "Spark driver" -- : DONE
+        "Spark executor2" --> "Spark driver" -- : DONE
+        "Spark executorN" --> "Spark driver" -- : DONE
+
+        "Spark driver" --> "Greenplum master" : CLOSE CONNECTION
+        deactivate "Greenplum master"
+        deactivate "Spark driver"
+        @enduml
+
+Recommendations
+---------------
+
+Reading from views
+~~~~~~~~~~~~~~~~~~
+
+This connector is **NOT** designed to read data from views.
+
+You can technically read data from a view which has a
+`gp_segment_id `_ column.
+But this is **not** recommended, because each Spark executor will run the same query, which may lead to running duplicated calculations
+and sending data between segments, only to skip most of the result and select only a small part of it.
+
+Prefer the following approach:
+    * Create a staging table to store the result data, using :obj:`Greenplum.execute `.
+    * Use the same ``.execute`` method to run a query like ``INSERT INTO staging_table SELECT ... FROM some_view``. This will be done on the Greenplum segments side, and the query will be run only once.
+    * Read data from the staging table to Spark executors using :obj:`DBReader `.
+    * Drop the staging table using the ``.execute`` method.
+
+Using ``JOIN`` on Greenplum side
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you need to get the result of joining 2 tables in Greenplum, you should:
+    * Create a staging table to store the result data, using ``Greenplum.execute``.
+    * Use the same ``Greenplum.execute`` to run a query like ``INSERT INTO staging_table SELECT ... FROM table1 JOIN table2``. This will be done on the Greenplum segments side, in a distributed way (see the sketch below).
+    * Read data from the staging table to Spark executors using ``DBReader``.
+    * Drop the staging table using ``Greenplum.execute``.
+
+.. warning::
+
+    Do **NOT** try to read data from ``table1`` and ``table2`` using ``DBReader``, and then join the resulting dataframes!
+
+    This will lead to sending all the data from both tables to Spark executor memory, and then ``JOIN``
+    will be performed on Spark side, not Greenplum. This is **very** inefficient.
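A rough sketch of the staging-table approach described above. Table, schema and column names are hypothetical, and ``DBReader`` is assumed to take its usual ``connection``/``source`` arguments:

.. code:: python

    from onetl.db import DBReader

    # 1. create an UNLOGGED staging table on the Greenplum side
    greenplum.execute(
        "CREATE UNLOGGED TABLE myschema.staging_table (id bigint, col1 int, col2 text)",
    )

    # 2. fill it with the JOIN result; this runs on Greenplum segments, only once
    greenplum.execute(
        """
        INSERT INTO myschema.staging_table
        SELECT t1.id, t1.col1, t2.col2
        FROM myschema.table1 t1
        JOIN myschema.table2 t2 ON t1.id = t2.id
        """,
    )

    # 3. read the staging table to Spark in a distributed way
    reader = DBReader(connection=greenplum, source="myschema.staging_table")
    df = reader.run()

    # 4. drop the staging table only after the dataframe is fully processed
    greenplum.execute("DROP TABLE myschema.staging_table")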
+ +Using ``TEMPORARY`` tables +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One might think that writing data from a ``VIEW`` or the result of a ``JOIN`` to a ``TEMPORARY`` table, +and then passing it to DBReader, is an efficient way to read data from Greenplum, because temporary tables do not generate WAL files, +and are automatically deleted when the transaction finishes. + +That will **not** work. Each Spark executor establishes its own connection to Greenplum, +and thus reads its own temporary table, which does not contain any data. + +You should use `UNLOGGED `_ tables +to write data to a staging table without generating useless WAL logs. + +Mapping of Greenplum types to Spark types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See `official documentation `_ +for more details. +onETL does not perform any additional casting of types while reading data. + +Options +------- .. currentmodule:: onetl.connection.db_connection.greenplum.options diff --git a/docs/connection/db_connection/greenplum/write.rst b/docs/connection/db_connection/greenplum/write.rst index aeb688ac5..c7a4f1560 100644 --- a/docs/connection/db_connection/greenplum/write.rst +++ b/docs/connection/db_connection/greenplum/write.rst @@ -5,6 +5,101 @@ Writing to Greenplum For writing data to Greenplum, use :obj:`DBWriter ` with options below. + +Interaction schema +------------------ + +High-level schema is described in :ref:`greenplum-prerequisites`. You can find detailed interaction schema below. + +.. dropdown:: Spark <-> Greenplum interaction during DBWriter.run() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + participant "Spark executor1" + participant "Spark executor2" + participant "Spark executorN" + end box + + box "Greenplum" + participant "Greenplum master" + participant "Greenplum segment1" + participant "Greenplum segment2" + participant "Greenplum segmentN" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + "Spark driver" --> "Greenplum master" ++ : CHECK IF TABLE EXISTS gp_table + "Greenplum master" --> "Spark driver" : TABLE NOT EXISTS + + == DBWriter.run(df) == + + "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 + "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 + "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV + "Spark executor1" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO gp_table FROM spark_executor1 + note right of "Greenplum master" : Each white vertical line here is an open connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master + "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM spark_executor1 TO gp_table_data_on_segment1 + note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments. + + "Spark executor2" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...)
USING address=executor2_host:executor2_port;\nINSERT INTO gp_table FROM spark_executor2 + "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM spark_executor2 TO gp_table_data_on_segment2 + + "Spark executorN" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) USING address=executorN_host:executorN_port;\nINSERT INTO gp_table FROM spark_executorN + "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM spark_executorN TO gp_table_data_on_segmentN + + "Greenplum segment1" -->o "Spark executor1" : INITIALIZE CONNECTION TO Spark executor1 + "Spark executor1" -> "Greenplum segment1" : READ DATA FROM Spark executor1 + note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor + deactivate "Greenplum segment1" + + "Greenplum segment2" -->o "Spark executor2" : INITIALIZE CONNECTION TO Spark executor2 + "Spark executor2" -> "Greenplum segment2" : READ DATA FROM Spark executor2 + deactivate "Greenplum segment2" + + "Greenplum segmentN" -->o "Spark executorN" : INITIALIZE CONNECTION TO Spark executorN + "Spark executorN" -> "Greenplum segmentN" : READ DATA FROM Spark executorN + deactivate "Greenplum segmentN" + + == Finished == + + "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1 + deactivate "Greenplum master" + "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2 + deactivate "Greenplum master" + "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN + deactivate "Greenplum master" + + "Spark executor1" --> "Spark driver" -- : DONE + "Spark executor2" --> "Spark driver" -- : DONE + "Spark executorN" --> "Spark driver" -- : DONE + + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml + +Recommendations +--------------- + +Mapping of Spark types to Greenplum types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See `official documentation `_ +for more details. +onETL does not perform any additional casting of types while writing data. + +Options +------- + .. currentmodule:: onetl.connection.db_connection.greenplum.options .. autopydantic_model:: GreenplumWriteOptions diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 99de7d90c..d1eedff7f 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -124,34 +124,24 @@ class Greenplum(JDBCMixin, DBConnection): from onetl.connection import Greenplum from pyspark.sql import SparkSession - # Please ask your DevOps and Greenplum admin what port range - # on Spark side can be used to accept requests from Greenplum segments - - extra = { - "server.port": "49152-65535", - } - # Create Spark session with Greenplum connector loaded - # See Prerequisites page for more details maven_packages = Greenplum.get_packages(spark_version="3.2") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.executor.allowSparkContext", "true") + # IMPORTANT!!! + # Set number of executors according to "Prerequisites" -> "Number of executors" .config("spark.dynamicAllocation.maxExecutors", 10) .config("spark.executor.cores", 1) .getOrCreate() ) # IMPORTANT!!! - # Each job on the Spark executor make its own connection to Greenplum master node, - # so we need to limit number of connections to avoid opening too many of them. 
- # - # Table size ~20Gb requires about 10 executors * cores, - # ~50Gb requires ~ 20 executors * cores, - # 100Gb+ requires 30 executors * cores. - # - # Cores number can be increased, but executors count should be reduced - # to keep the same number of executors * cores. + # Set port range of executors according to "Prerequisites" -> "Network ports" + extra = { + "server.port": "41000-42000", + } # Create connection greenplum = Greenplum( diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index 86785155e..7d4638412 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -107,7 +107,9 @@ class Config: .. warning:: - You should not change this option, unless you know what you're doing + You should not change this option, unless you know what you're doing. + + It's preferable to use default values to read data parallel by number of segments in Greenplum cluster. Possible values: * ``None`` (default): diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 464487f52..609bba034 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -161,9 +161,17 @@ class SparkS3(SparkFileDFConnection): # Create Spark session with Hadoop AWS libraries loaded maven_packages = SparkS3.get_packages(spark_version="3.4.1") + # Some dependencies are not used, but downloading takes a lot of time. Skipping them. + excluded_packages = [ + "com.google.cloud.bigdataoss:gcs-connector", + "org.apache.hadoop:hadoop-aliyun", + "org.apache.hadoop:hadoop-azure-datalake", + "org.apache.hadoop:hadoop-azure", + ] spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.jars.excludes", ",".join(excluded_packages)) .config("spark.hadoop.fs.s3a.committer.magic.enabled", "true") .config("spark.hadoop.fs.s3a.committer.name", "magic") .config( diff --git a/requirements/docs.txt b/requirements/docs.txt index d3fc9555e..4ff1db3e9 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -3,10 +3,10 @@ furo importlib-resources<6 numpydoc pygments-csv-lexer -# https://github.com/sphinx-doc/sphinx/issues/11662 -sphinx<7.2.5 +sphinx sphinx-copybutton sphinx-design +sphinx-plantuml sphinx-tabs sphinx-toolbox sphinx_substitution_extensions From e757a65e46e09fd326d0bc79cdf5a29de064076f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 11:21:46 +0000 Subject: [PATCH 19/26] [DOP-9007] Fix links to installation instruction --- docs/connection/db_connection/greenplum/prerequisites.rst | 2 +- onetl/connection/db_connection/clickhouse/connection.py | 2 +- onetl/connection/db_connection/hive/connection.py | 4 ++-- onetl/connection/db_connection/mongodb/connection.py | 2 +- onetl/connection/db_connection/mssql/connection.py | 2 +- onetl/connection/db_connection/mysql/connection.py | 2 +- onetl/connection/db_connection/oracle/connection.py | 2 +- onetl/connection/db_connection/postgres/connection.py | 2 +- onetl/connection/db_connection/teradata/connection.py | 2 +- onetl/connection/file_connection/ftp.py | 2 +- onetl/connection/file_connection/ftps.py | 2 +- 
onetl/connection/file_connection/hdfs/connection.py | 4 ++-- onetl/connection/file_connection/s3.py | 2 +- onetl/connection/file_connection/sftp.py | 2 +- onetl/connection/file_connection/webdav.py | 2 +- onetl/connection/file_df_connection/spark_hdfs/connection.py | 4 ++-- onetl/connection/file_df_connection/spark_local_fs.py | 2 +- onetl/connection/file_df_connection/spark_s3/connection.py | 2 +- 18 files changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index f7b8e9d32..0f0c1c53d 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -24,7 +24,7 @@ You can install PySpark as follows: pip install onetl pyspark=3.2.4 # pass specific PySpark version -See :ref:`spark-install` instruction for more details. +See :ref:`install-spark` instruction for more details. Downloading Pivotal package --------------------------- diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index dc6acf163..19f953b47 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -65,7 +65,7 @@ class Clickhouse(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index d0997f512..83b219acc 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -69,7 +69,7 @@ class Hive(DBConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. .. warning:: @@ -82,7 +82,7 @@ class Hive(DBConnection): .. note:: Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit`` - **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`kerberos-install`. + **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS. diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 771fb3b69..57c06cac0 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -83,7 +83,7 @@ class MongoDB(DBConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. 
Parameters ---------- diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 49fc825d9..b5de3bab0 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -64,7 +64,7 @@ class MSSQL(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index 868731eaf..252ee60a4 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -63,7 +63,7 @@ class MySQL(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 69d7e2c5b..18d9addb3 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -103,7 +103,7 @@ class Oracle(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index eb07a68f6..377f9eb04 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -61,7 +61,7 @@ class Postgres(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 7e730f9eb..4ae7d5760 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -66,7 +66,7 @@ class Teradata(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index 6710a4303..fce0a9f3a 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -68,7 +68,7 @@ class FTP(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 211ff6030..97dbc0972 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -69,7 +69,7 @@ class FTPS(FTP): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. 
Parameters ---------- diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 2419aae2f..71e0c07c6 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -72,14 +72,14 @@ class HDFS(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. .. note:: To access Hadoop cluster with Kerberos installed, you should have ``kinit`` executable in some path in ``PATH`` environment variable. - See onETL :ref:`kerberos-install` instruction for more details. + See onETL :ref:`install-kerberos` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 7198a05aa..5d51c80fa 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -67,7 +67,7 @@ class S3(FileConnection): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 3b84df658..007bb147b 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -71,7 +71,7 @@ class SFTP(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index 52aab0419..0a3f55f23 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -70,7 +70,7 @@ class WebDAV(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 04bdfae48..73b8c9914 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -58,12 +58,12 @@ class SparkHDFS(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. .. note:: Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit`` - **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`kerberos-install`. + **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS. 
diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index b914c714f..a40255186 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -49,7 +49,7 @@ class SparkLocalFS(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. .. warning:: diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 609bba034..d93e6e9f6 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -83,7 +83,7 @@ class SparkS3(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. .. note:: From 8c3ae34817e926c33f484dd89663163351289ec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 11:50:21 +0000 Subject: [PATCH 20/26] [DOP-9007] Fix changelog --- docs/changelog/next_release/154.improvement.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/changelog/next_release/154.improvement.rst b/docs/changelog/next_release/154.improvement.rst index 745989a87..d22b5a566 100644 --- a/docs/changelog/next_release/154.improvement.rst +++ b/docs/changelog/next_release/154.improvement.rst @@ -1,4 +1,4 @@ Drastically improve ``Greenplum`` documentation: -* Added information about network ports, grants, ``pg_hba.conf`` and so on. -* Added interaction schemas for reading, writing and executing statements in Greenplum. -* Added recommendations about reading data from views and ``JOIN`` results from Greenplum. + * Added information about network ports, grants, ``pg_hba.conf`` and so on. + * Added interaction schemas for reading, writing and executing statements in Greenplum. + * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. From 13f0e4bdfc72dbf8577a0a12d1fc5fb11e76c7fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 11:53:31 +0000 Subject: [PATCH 21/26] [DOP-9007] Fix Samba documentation --- onetl/connection/file_connection/samba.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index 9e907ee3e..def44943c 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -60,6 +60,19 @@ class Samba(FileConnection): .. versionadded:: 0.9.4 + .. warning:: + + To use Samba connector you should install package as follows: + + .. code:: bash + + pip install onetl[samba] + + # or + pip install onetl[files] + + See :ref:`install-files` instruction for more details. 
+ Parameters ---------- host : str From 661e0c09fba81dbc5040b852d25e1c01548c4031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 11:55:48 +0000 Subject: [PATCH 22/26] [DOP-9007] Fix link names --- docs/connection/db_connection/greenplum/prerequisites.rst | 2 +- onetl/connection/db_connection/clickhouse/connection.py | 2 +- onetl/connection/db_connection/hive/connection.py | 2 +- onetl/connection/db_connection/mongodb/connection.py | 2 +- onetl/connection/db_connection/mssql/connection.py | 2 +- onetl/connection/db_connection/mysql/connection.py | 2 +- onetl/connection/db_connection/oracle/connection.py | 2 +- onetl/connection/db_connection/postgres/connection.py | 2 +- onetl/connection/db_connection/teradata/connection.py | 2 +- onetl/connection/file_connection/ftp.py | 2 +- onetl/connection/file_connection/ftps.py | 2 +- onetl/connection/file_connection/hdfs/connection.py | 4 ++-- onetl/connection/file_connection/s3.py | 2 +- onetl/connection/file_connection/samba.py | 2 +- onetl/connection/file_connection/sftp.py | 2 +- onetl/connection/file_connection/webdav.py | 2 +- onetl/connection/file_df_connection/spark_hdfs/connection.py | 2 +- onetl/connection/file_df_connection/spark_local_fs.py | 2 +- onetl/connection/file_df_connection/spark_s3/connection.py | 2 +- 19 files changed, 20 insertions(+), 20 deletions(-) diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 0f0c1c53d..a181f16dc 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -24,7 +24,7 @@ You can install PySpark as follows: pip install onetl pyspark=3.2.4 # pass specific PySpark version -See :ref:`install-spark` instruction for more details. +See :ref:`install-spark` installation instruction for more details. Downloading Pivotal package --------------------------- diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 19f953b47..f95884f7d 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -65,7 +65,7 @@ class Clickhouse(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 83b219acc..6d768ea2e 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -69,7 +69,7 @@ class Hive(DBConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. 
warning:: diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 57c06cac0..860f7b215 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -83,7 +83,7 @@ class MongoDB(DBConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index b5de3bab0..6738c2541 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -64,7 +64,7 @@ class MSSQL(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index 252ee60a4..abd17df33 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -63,7 +63,7 @@ class MySQL(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 18d9addb3..2e1f3e916 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -103,7 +103,7 @@ class Oracle(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index 377f9eb04..22b42c296 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -61,7 +61,7 @@ class Postgres(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 4ae7d5760..2c797b3d8 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -66,7 +66,7 @@ class Teradata(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. 
Parameters ---------- diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index fce0a9f3a..b7dd82257 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -68,7 +68,7 @@ class FTP(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 97dbc0972..dfcd05553 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -69,7 +69,7 @@ class FTPS(FTP): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 71e0c07c6..aa58f7e0a 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -72,14 +72,14 @@ class HDFS(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. .. note:: To access Hadoop cluster with Kerberos installed, you should have ``kinit`` executable in some path in ``PATH`` environment variable. - See onETL :ref:`install-kerberos` instruction for more details. + See :ref:`install-kerberos` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 5d51c80fa..2f8d298f1 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -67,7 +67,7 @@ class S3(FileConnection): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index def44943c..bef7ed276 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -71,7 +71,7 @@ class Samba(FileConnection): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 007bb147b..bef53ce2d 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -71,7 +71,7 @@ class SFTP(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index 0a3f55f23..9825a0525 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -70,7 +70,7 @@ class WebDAV(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. 
Parameters ---------- diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 73b8c9914..ce83c8b1d 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -58,7 +58,7 @@ class SparkHDFS(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. note:: diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index a40255186..264fac3a2 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -49,7 +49,7 @@ class SparkLocalFS(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. warning:: diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index d93e6e9f6..b766389cf 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -83,7 +83,7 @@ class SparkS3(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. note:: From ad8edcb2efcdae056bb064853c20bc0de0b639c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 11:59:18 +0000 Subject: [PATCH 23/26] [DOP-9007] Fix Greenplum documentation --- docs/connection/db_connection/greenplum/prerequisites.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index a181f16dc..3d1f9c80b 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -18,12 +18,6 @@ Installing PySpark To use Greenplum connector you should have PySpark installed (or injected to ``sys.path``) BEFORE creating the connector instance. -You can install PySpark as follows: - -.. code:: bash - - pip install onetl pyspark=3.2.4 # pass specific PySpark version - See :ref:`install-spark` installation instruction for more details. 
Downloading Pivotal package From f574c081c7e30ad47d53b6304b8d85b7dff7f91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 12:00:01 +0000 Subject: [PATCH 24/26] [DOP-9007] Fix Greenplum documentation --- docs/connection/db_connection/greenplum/prerequisites.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 3d1f9c80b..57db9635e 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -29,7 +29,7 @@ and then pass it to Spark session. .. warning:: - Please pay attention to :ref:`Spark <-> Scala version compatibility `. + Please pay attention to :ref:`Spark & Scala version compatibility `. There are several ways to do that. See :ref:`java-packages` for details. From f3aa33540eb3adae2d44643bcd196e1694edde71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 25 Sep 2023 13:09:18 +0000 Subject: [PATCH 25/26] [DOP-9208] Fix calling .close() method in __del__ --- docs/changelog/next_release/156.bugfix.rst | 1 + .../next_release/156.improvement.1.rst | 1 + .../next_release/156.improvement.2.rst | 1 + .../db_connection/jdbc_mixin/connection.py | 23 ++++++++++++++----- .../db_connection/kafka/connection.py | 4 ++++ .../file_connection/file_connection.py | 21 +++++++++++++---- .../spark_hdfs/connection.py | 13 ++++++++--- .../file_df_connection/spark_s3/connection.py | 7 +++++- .../test_spark_hdfs_integration.py | 17 ++++++++------ 9 files changed, 67 insertions(+), 21 deletions(-) create mode 100644 docs/changelog/next_release/156.bugfix.rst create mode 100644 docs/changelog/next_release/156.improvement.1.rst create mode 100644 docs/changelog/next_release/156.improvement.2.rst diff --git a/docs/changelog/next_release/156.bugfix.rst b/docs/changelog/next_release/156.bugfix.rst new file mode 100644 index 000000000..2953ab3d6 --- /dev/null +++ b/docs/changelog/next_release/156.bugfix.rst @@ -0,0 +1 @@ +Fix issue while stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. diff --git a/docs/changelog/next_release/156.improvement.1.rst b/docs/changelog/next_release/156.improvement.1.rst new file mode 100644 index 000000000..5607eb69c --- /dev/null +++ b/docs/changelog/next_release/156.improvement.1.rst @@ -0,0 +1 @@ +Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. diff --git a/docs/changelog/next_release/156.improvement.2.rst b/docs/changelog/next_release/156.improvement.2.rst new file mode 100644 index 000000000..5824c8a9b --- /dev/null +++ b/docs/changelog/next_release/156.improvement.2.rst @@ -0,0 +1 @@ +Call ``.close()`` on FileConnection then it is removed by garbage collector. 
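The thread-safety improvement above ("each thread works with its own connection") is implemented in the following ``jdbc_mixin/connection.py`` diff by caching the connection in a ``threading.local`` container. A minimal standalone sketch of the same pattern; the class and factory names here are hypothetical, not the library's real internals:

.. code:: python

    import threading
    from contextlib import suppress


    class PerThreadConnectionCache:
        """Cache one raw connection per thread, mirroring the JDBCMixin change below."""

        def __init__(self, connect):
            self._connect = connect  # factory that opens a new raw connection
            self._local = threading.local()  # each thread sees only its own attributes

        def get(self):
            # reuse the connection already opened by the current thread, if any
            connection = getattr(self._local, "connection", None)
            if connection is None:
                connection = self._connect()
                self._local.connection = connection
            return connection

        def close(self):
            # close only the current thread's connection, ignoring errors
            connection = getattr(self._local, "connection", None)
            if connection is not None:
                with suppress(Exception):
                    connection.close()
                del self._local.connection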
diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index c02fb82f1..e5e3e312e 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -15,10 +15,11 @@ from __future__ import annotations import logging +import threading from abc import abstractmethod from contextlib import closing, suppress from enum import Enum, auto -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Tuple, TypeVar +from typing import TYPE_CHECKING, Callable, ClassVar, Optional, TypeVar from pydantic import Field, PrivateAttr, SecretStr, validator @@ -76,7 +77,7 @@ class JDBCMixin(FrozenModel): _CHECK_QUERY: ClassVar[str] = "SELECT 1" # cached JDBC connection (Java object), plus corresponding GenericOptions (Python object) - _last_connection_and_options: Optional[Tuple[Any, JDBCMixinOptions]] = PrivateAttr(default=None) + _last_connection_and_options: Optional[threading.local] = PrivateAttr(default=None) @property @abstractmethod @@ -126,6 +127,7 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # noqa: U101 def __del__(self): # noqa: WPS603 # If current object is collected by GC, close all opened connections + # This is safe because closing connection on Spark driver does not influence Spark executors self.close() @slot @@ -459,8 +461,14 @@ def _options_to_connection_properties(self, options: JDBCMixinOptions): return jdbc_options.asConnectionProperties() def _get_jdbc_connection(self, options: JDBCMixinOptions): + if not self._last_connection_and_options: + # connection class can be used in multiple threads. + # each Python thread creates its own thread in JVM + # so we need local variable to create per-thread persistent connection + self._last_connection_and_options = threading.local() + with suppress(Exception): # nothing cached, or JVM failed - last_connection, last_options = self._last_connection_and_options + last_connection, last_options = self._last_connection_and_options.data if options == last_options and not last_connection.isClosed(): return last_connection @@ -471,15 +479,18 @@ def _get_jdbc_connection(self, options: JDBCMixinOptions): driver_manager = self.spark._jvm.java.sql.DriverManager # type: ignore new_connection = driver_manager.getConnection(self.jdbc_url, connection_properties) - self._last_connection_and_options = (new_connection, options) + self._last_connection_and_options.data = (new_connection, options) return new_connection def _close_connections(self): with suppress(Exception): - last_connection, _ = self._last_connection_and_options + # connection maybe not opened yet + last_connection, _ = self._last_connection_and_options.data last_connection.close() - self._last_connection_and_options = None + with suppress(Exception): + # connection maybe not opened yet + del self._last_connection_and_options.data def _get_statement_args(self) -> tuple[int, ...]: resultset = self.spark._jvm.java.sql.ResultSet # type: ignore diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index cf9a669c9..51053df0c 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -462,6 +462,10 @@ def close(self): self.auth.cleanup(self) return self + # Do not all __del__ with calling .close(), like other connections, + # because this can influence dataframes created by this connection. 
+ # For example, .close() deletes local keytab copy. + @property def instance_url(self): return "kafka://" + self.cluster diff --git a/onetl/connection/file_connection/file_connection.py b/onetl/connection/file_connection/file_connection.py index 39e27f2c6..cc5ebbb9e 100644 --- a/onetl/connection/file_connection/file_connection.py +++ b/onetl/connection/file_connection/file_connection.py @@ -17,6 +17,7 @@ import os import threading from abc import abstractmethod +from contextlib import suppress from logging import getLogger from typing import Any, Iterable, Iterator @@ -72,8 +73,10 @@ def client(self): if client and not self._is_client_closed(client): return client except AttributeError: - self._clients_cache.client = self._get_client() - return self._clients_cache.client + pass + + self._clients_cache.client = self._get_client() + return self._clients_cache.client @slot def close(self): @@ -112,8 +115,14 @@ def close(self): except AttributeError: return self - self._close_client(client) - del self._clients_cache.client + with suppress(Exception): + # exceptions while closing client should be ignored + self._close_client(client) + + with suppress(Exception): + # .close() could be called from destructor, and modifying self is not allowed here + del self._clients_cache.client + return self def __enter__(self): @@ -122,6 +131,10 @@ def __enter__(self): def __exit__(self, _exc_type, _exc_value, _traceback): self.close() + def __del__(self): # noqa: WPS603 + # If current object is collected by GC, close opened connection + self.close() + @slot def check(self): log.info("|%s| Checking connection availability...", self.__class__.__name__) diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index ce83c8b1d..6855fe595 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -17,6 +17,7 @@ import getpass import logging import os +from contextlib import suppress from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -224,10 +225,16 @@ def close(self): """ log.debug("Reset FileSystem cache") - self._get_spark_fs().close() - object.__setattr__(self, "_active_host", None) # noqa: WPS609 + with suppress(Exception): + self._get_spark_fs().close() + + with suppress(Exception): + self._active_host = None return self + # Do not all __del__ with calling .close(), like other connections, + # because this can influence dataframes created by this connection + @slot @classmethod def get_current(cls, spark: SparkSession): @@ -360,7 +367,7 @@ def _convert_to_url(self, path: PurePathProtocol) -> str: else: host = self._get_host() # cache value to avoid getting active namenode for every path - object.__setattr__(self, "_active_host", host) # noqa: WPS609 + self._active_host = host return f"hdfs://{host}:{self.ipc_port}" + path.as_posix() def _get_default_path(self): diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index b766389cf..992e11627 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -16,6 +16,7 @@ import logging import os +from contextlib import suppress from typing import TYPE_CHECKING, ClassVar, List, Optional from etl_entities.instance import Host @@ -321,9 +322,13 @@ def close(self): connection.close() """ - self._reset_hadoop_conf() + with 
suppress(Exception): + self._reset_hadoop_conf() return self + # Do not all __del__ with calling .close(), like other connections, + # because this can influence dataframes created by this connection + @slot def check(self): self._patch_hadoop_conf() diff --git a/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py b/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py index 3b47df0d0..778d3a20c 100644 --- a/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py +++ b/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py @@ -26,13 +26,16 @@ def test_spark_hdfs_check(hdfs_file_df_connection, caplog): def test_spark_hdfs_file_connection_check_failed(spark): from onetl.connection import SparkHDFS - with pytest.raises(RuntimeError, match="Connection is unavailable"): - SparkHDFS( - cluster="rnd-dwh", - host="hive1", - port=1234, - spark=spark, - ).check() + wrong_hdfs = SparkHDFS( + cluster="rnd-dwh", + host="hive1", + port=1234, + spark=spark, + ) + + with wrong_hdfs: + with pytest.raises(RuntimeError, match="Connection is unavailable"): + wrong_hdfs.check() def test_spark_hdfs_file_connection_check_with_hooks(spark, request, hdfs_server): From 146abdb57a287166bb9e83491f6a4261211e7467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 12:22:01 +0000 Subject: [PATCH 26/26] [DOP-9007] Prepare for release --- docs/changelog/0.9.4.rst | 30 ++++++++++++++++++ docs/changelog/NEXT_RELEASE.rst | 31 +++++++++++++++++++ docs/changelog/index.rst | 1 + docs/changelog/next_release/143.feature.rst | 1 - docs/changelog/next_release/144.feature.rst | 1 - docs/changelog/next_release/145.feature.rst | 1 - docs/changelog/next_release/148.feature.rst | 1 - docs/changelog/next_release/150.feature.rst | 2 -- .../next_release/151.improvement.rst | 1 - .../next_release/154.improvement.rst | 4 --- docs/changelog/next_release/156.bugfix.rst | 1 - .../next_release/156.improvement.1.rst | 1 - .../next_release/156.improvement.2.rst | 1 - 13 files changed, 62 insertions(+), 14 deletions(-) create mode 100644 docs/changelog/0.9.4.rst delete mode 100644 docs/changelog/next_release/143.feature.rst delete mode 100644 docs/changelog/next_release/144.feature.rst delete mode 100644 docs/changelog/next_release/145.feature.rst delete mode 100644 docs/changelog/next_release/148.feature.rst delete mode 100644 docs/changelog/next_release/150.feature.rst delete mode 100644 docs/changelog/next_release/151.improvement.rst delete mode 100644 docs/changelog/next_release/154.improvement.rst delete mode 100644 docs/changelog/next_release/156.bugfix.rst delete mode 100644 docs/changelog/next_release/156.improvement.1.rst delete mode 100644 docs/changelog/next_release/156.improvement.2.rst diff --git a/docs/changelog/0.9.4.rst b/docs/changelog/0.9.4.rst new file mode 100644 index 000000000..4eb406ae0 --- /dev/null +++ b/docs/changelog/0.9.4.rst @@ -0,0 +1,30 @@ +0.9.4 (2023-09-26) +================== + +Features +-------- + +- Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` (:github:pull:`143`) +- Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` (:github:pull:`144`) +- Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` (:github:pull:`145`) +- Add ``Excel`` file 
format support. (:github:pull:`148`) +- Add ``Samba`` file connection. + It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. (:github:pull:`150`) + + +Improvements +------------ + +- Add documentation about different ways of passing packages to Spark session. (:github:pull:`151`) +- Drastically improve ``Greenplum`` documentation: + * Added information about network ports, grants, ``pg_hba.conf`` and so on. + * Added interaction schemas for reading, writing and executing statements in Greenplum. + * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. (:github:pull:`154`) +- Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. (:github:pull:`156`) +- Call ``.close()`` on FileConnection then it is removed by garbage collector. (:github:pull:`156`) + + +Bug Fixes +--------- + +- Fix issue while stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. (:github:pull:`156`) diff --git a/docs/changelog/NEXT_RELEASE.rst b/docs/changelog/NEXT_RELEASE.rst index 5e26856b4..ee4196843 100644 --- a/docs/changelog/NEXT_RELEASE.rst +++ b/docs/changelog/NEXT_RELEASE.rst @@ -3,3 +3,34 @@ .. and add it to index.rst .. towncrier release notes start + +0.9.4 (2023-09-26) +================== + +Features +-------- + +- Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` (:github:pull:`143`) +- Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` (:github:pull:`144`) +- Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` (:github:pull:`145`) +- Add ``Excel`` file format support. (:github:pull:`148`) +- Add ``Samba`` file connection. + It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. (:github:pull:`150`) + + +Improvements +------------ + +- Add documentation about different ways of passing packages to Spark session. (:github:pull:`151`) +- Drastically improve ``Greenplum`` documentation: + * Added information about network ports, grants, ``pg_hba.conf`` and so on. + * Added interaction schemas for reading, writing and executing statements in Greenplum. + * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. (:github:pull:`154`) +- Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. (:github:pull:`156`) +- Call ``.close()`` on FileConnection then it is removed by garbage collector. (:github:pull:`156`) + + +Bug Fixes +--------- + +- Fix issue while stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. 
(:github:pull:`156`) diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index 92701e1e1..6130bfdc8 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -4,6 +4,7 @@ DRAFT NEXT_RELEASE + 0.9.4 0.9.3 0.9.2 0.9.1 diff --git a/docs/changelog/next_release/143.feature.rst b/docs/changelog/next_release/143.feature.rst deleted file mode 100644 index 97756efc4..000000000 --- a/docs/changelog/next_release/143.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` diff --git a/docs/changelog/next_release/144.feature.rst b/docs/changelog/next_release/144.feature.rst deleted file mode 100644 index a0cf257e4..000000000 --- a/docs/changelog/next_release/144.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` diff --git a/docs/changelog/next_release/145.feature.rst b/docs/changelog/next_release/145.feature.rst deleted file mode 100644 index 975e0b96d..000000000 --- a/docs/changelog/next_release/145.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` diff --git a/docs/changelog/next_release/148.feature.rst b/docs/changelog/next_release/148.feature.rst deleted file mode 100644 index 87b1b48a8..000000000 --- a/docs/changelog/next_release/148.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``Excel`` file format support. diff --git a/docs/changelog/next_release/150.feature.rst b/docs/changelog/next_release/150.feature.rst deleted file mode 100644 index 6ea0af9ff..000000000 --- a/docs/changelog/next_release/150.feature.rst +++ /dev/null @@ -1,2 +0,0 @@ -Add ``Samba`` file connection. -It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. diff --git a/docs/changelog/next_release/151.improvement.rst b/docs/changelog/next_release/151.improvement.rst deleted file mode 100644 index d8da800ae..000000000 --- a/docs/changelog/next_release/151.improvement.rst +++ /dev/null @@ -1 +0,0 @@ -Add documentation about different ways of passing packages to Spark session. diff --git a/docs/changelog/next_release/154.improvement.rst b/docs/changelog/next_release/154.improvement.rst deleted file mode 100644 index d22b5a566..000000000 --- a/docs/changelog/next_release/154.improvement.rst +++ /dev/null @@ -1,4 +0,0 @@ -Drastically improve ``Greenplum`` documentation: - * Added information about network ports, grants, ``pg_hba.conf`` and so on. - * Added interaction schemas for reading, writing and executing statements in Greenplum. - * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. diff --git a/docs/changelog/next_release/156.bugfix.rst b/docs/changelog/next_release/156.bugfix.rst deleted file mode 100644 index 2953ab3d6..000000000 --- a/docs/changelog/next_release/156.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix issue while stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. diff --git a/docs/changelog/next_release/156.improvement.1.rst b/docs/changelog/next_release/156.improvement.1.rst deleted file mode 100644 index 5607eb69c..000000000 --- a/docs/changelog/next_release/156.improvement.1.rst +++ /dev/null @@ -1 +0,0 @@ -Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. 
diff --git a/docs/changelog/next_release/156.improvement.2.rst b/docs/changelog/next_release/156.improvement.2.rst deleted file mode 100644 index 5824c8a9b..000000000 --- a/docs/changelog/next_release/156.improvement.2.rst +++ /dev/null @@ -1 +0,0 @@ -Call ``.close()`` on FileConnection then it is removed by garbage collector.
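Taken together, the ``__del__``-related fixes in this series follow a single pattern: close resources when an object is garbage-collected, but never let exceptions escape (for example, during interpreter shutdown). A small sketch of that pattern, using a hypothetical ``client`` attribute rather than any real onETL internals:

.. code:: python

    from contextlib import suppress


    class ClosableResource:
        """Sketch of the close-on-GC pattern applied to FileConnection and JDBCMixin here."""

        def __init__(self, client):
            self.client = client  # hypothetical underlying client/handle

        def close(self):
            # ignore errors: .close() may be called from __del__ while the interpreter shuts down
            with suppress(Exception):
                self.client.close()
            return self

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.close()

        def __del__(self):  # noqa: WPS603
            # if this object is collected by GC, close the underlying client silently
            self.close()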