From 2b0942cccecf71f279f7d156b12c2c5db2ebbf95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 6 Sep 2023 15:07:32 +0000 Subject: [PATCH 01/26] [DOP-8511] Bump version --- onetl/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onetl/VERSION b/onetl/VERSION index 965065db5..a602fc9e2 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.9.3 +0.9.4 From 0f6b274bbd5a0e030f8ec41fbcff85e2eaa2c6fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 6 Sep 2023 15:23:34 +0000 Subject: [PATCH 02/26] [DOP-8511] Update Kafka documentation --- docs/connection/db_connection/kafka/write.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/connection/db_connection/kafka/write.rst b/docs/connection/db_connection/kafka/write.rst index eb04ecccb..064c8ead1 100644 --- a/docs/connection/db_connection/kafka/write.rst +++ b/docs/connection/db_connection/kafka/write.rst @@ -30,6 +30,7 @@ For writing data to Kafka, use :obj:`DBWriter Date: Wed, 6 Sep 2023 15:31:13 +0000 Subject: [PATCH 03/26] [DOP-8511] Update Kafka documentation --- docs/connection/db_connection/kafka/read.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/connection/db_connection/kafka/read.rst b/docs/connection/db_connection/kafka/read.rst index a19c5e57b..3e6c846b1 100644 --- a/docs/connection/db_connection/kafka/read.rst +++ b/docs/connection/db_connection/kafka/read.rst @@ -7,7 +7,8 @@ For reading data from Kafka, use :obj:`DBReader Date: Thu, 7 Sep 2023 11:45:57 +0000 Subject: [PATCH 04/26] [DOP-8511] Update README --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 13b280830..6e21bb672 100644 --- a/README.rst +++ b/README.rst @@ -229,7 +229,7 @@ so some connections require additional setup to work properly. It also uses ``kinit`` executable to generate Kerberos ticket. * ``Hive`` and ``SparkHDFS`` - Requires Kerberos ticket to exist before creating Spark session. + require Kerberos ticket to exist before creating Spark session. 
So you need to install OS packages with: From 0adf0555b71a83d798d8f7a90db68138bff57501 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:27:00 +0300 Subject: [PATCH 05/26] [DOP-8665] - Allow modes "ignore" and "error" in JDBC.WriteOptions (#144) * [DOP-8665] - Allow modes "ignore" and "error" in JDBC.WriteOptions * [DOP-8665] - updated tests --- docs/changelog/next_release/144.feature.rst | 1 + .../jdbc_connection/connection.py | 6 +- .../db_connection/jdbc_connection/options.py | 69 ++++++++++----- .../test_postgres_writer_integration.py | 86 +++++++++++++++++-- .../test_jdbc_options_unit.py | 18 +++- 5 files changed, 146 insertions(+), 34 deletions(-) create mode 100644 docs/changelog/next_release/144.feature.rst diff --git a/docs/changelog/next_release/144.feature.rst b/docs/changelog/next_release/144.feature.rst new file mode 100644 index 000000000..a0cf257e4 --- /dev/null +++ b/docs/changelog/next_release/144.feature.rst @@ -0,0 +1 @@ +Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 3eb83f538..f5b611910 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -218,7 +218,11 @@ def write_df_to_target( write_options = self.WriteOptions.parse(options) jdbc_params = self.options_to_jdbc_params(write_options) - mode = "append" if write_options.if_exists == JDBCTableExistBehavior.APPEND else "overwrite" + mode = ( + "overwrite" + if write_options.if_exists == JDBCTableExistBehavior.REPLACE_ENTIRE_TABLE + else write_options.if_exists.value + ) log.info("|%s| Saving data to a table %r", self.__class__.__name__, target) df.write.jdbc(table=target, mode=mode, **jdbc_params) log.info("|%s| Table %r successfully written", self.__class__.__name__, target) diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index c998055fe..dacaded77 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -84,6 +84,8 @@ class JDBCTableExistBehavior(str, Enum): APPEND = "append" + IGNORE = "ignore" + ERROR = "error" REPLACE_ENTIRE_TABLE = "replace_entire_table" def __str__(self) -> str: @@ -413,44 +415,65 @@ class Config: .. dropdown:: Behavior in details - * Table does not exist - Table is created using options provided by user - (``createTableOptions``, ``createTableColumnTypes``, etc). + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). - * Table exists - Data is appended to a table. Table has the same DDL as before writing data + * Table exists + Data is appended to a table. Table has the same DDL as before writing data - .. warning:: + .. warning:: - This mode does not check whether table already contains - rows from dataframe, so duplicated rows can be created. + This mode does not check whether table already contains + rows from dataframe, so duplicated rows can be created. - Also Spark does not support passing custom options to - insert statement, like ``ON CONFLICT``, so don't try to - implement deduplication using unique indexes or constraints. 
+ Also Spark does not support passing custom options to + insert statement, like ``ON CONFLICT``, so don't try to + implement deduplication using unique indexes or constraints. - Instead, write to staging table and perform deduplication - using :obj:`~execute` method. + Instead, write to staging table and perform deduplication + using :obj:`~execute` method. * ``replace_entire_table`` **Table is dropped and then created, or truncated**. .. dropdown:: Behavior in details - * Table does not exist - Table is created using options provided by user - (``createTableOptions``, ``createTableColumnTypes``, etc). + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). - * Table exists - Table content is replaced with dataframe content. + * Table exists + Table content is replaced with dataframe content. - After writing completed, target table could either have the same DDL as - before writing data (``truncate=True``), or can be recreated (``truncate=False`` - or source does not support truncation). + After writing completed, target table could either have the same DDL as + before writing data (``truncate=True``), or can be recreated (``truncate=False`` + or source does not support truncation). - .. note:: + * ``ignore`` + Ignores the write operation if the table already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). + + * Table exists + The write operation is ignored, and no data is written to the table. + + * ``error`` + Raises an error if the table already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user + (``createTableOptions``, ``createTableColumnTypes``, etc). + + * Table exists + An error is raised, and no data is written to the table. - ``error`` and ``ignore`` modes are not supported. 
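Not part of the patch itself: a minimal usage sketch of the new JDBC write modes, mirroring the integration tests later in this patch. The connection parameters, the ``df`` dataframe and the import paths are placeholders or assumptions, not something this diff introduces.

.. code:: python

    from onetl.connection import Postgres
    from onetl.db import DBWriter

    postgres = Postgres(
        host="example-host",  # placeholder credentials, not from this patch
        user="user",
        password="***",
        database="db",
        spark=spark,  # existing SparkSession with the Postgres JDBC driver loaded
    )

    writer = DBWriter(
        connection=postgres,
        target="schema.table",
        # "ignore" skips the write when the table already exists,
        # "error" makes Spark raise instead of appending
        options=Postgres.WriteOptions(if_exists="ignore"),
    )
    writer.run(df)  # df is an existing Spark DataFrame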
""" batchsize: int = 20_000 diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py index 195b16e02..cda43c8a8 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py @@ -6,7 +6,17 @@ pytestmark = pytest.mark.postgres -def test_postgres_writer_snapshot(spark, processing, prepare_schema_table): +@pytest.mark.parametrize( + "options", + [ + {}, + {"if_exists": "append"}, + {"if_exists": "replace_entire_table"}, + {"if_exists": "error"}, + {"if_exists": "ignore"}, + ], +) +def test_postgres_writer_snapshot(spark, processing, get_schema_table, options): df = processing.create_spark_df(spark=spark) postgres = Postgres( @@ -20,14 +30,15 @@ def test_postgres_writer_snapshot(spark, processing, prepare_schema_table): writer = DBWriter( connection=postgres, - target=prepare_schema_table.full_name, + target=get_schema_table.full_name, + options=Postgres.WriteOptions(**options), ) writer.run(df) processing.assert_equal_df( - schema=prepare_schema_table.schema, - table=prepare_schema_table.table, + schema=get_schema_table.schema, + table=get_schema_table.table, df=df, ) @@ -86,7 +97,7 @@ def test_postgres_writer_snapshot_with_pydantic_options(spark, processing, prepa ) -def test_postgres_writer_mode_append(spark, processing, prepare_schema_table): +def test_postgres_writer_if_exists_append(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) df1 = df[df.id_int < 1001] df2 = df[df.id_int > 1000] @@ -116,7 +127,70 @@ def test_postgres_writer_mode_append(spark, processing, prepare_schema_table): ) -def test_postgres_writer_mode_replace_entire_table(spark, processing, prepare_schema_table): +def test_postgres_writer_if_exists_error(spark, processing, prepare_schema_table): + from pyspark.sql.utils import AnalysisException + + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=postgres, + target=prepare_schema_table.full_name, + options=Postgres.WriteOptions(if_exists="error"), + ) + + with pytest.raises( + AnalysisException, + match=f"Table or view '{prepare_schema_table.full_name}' already exists. 
SaveMode: ErrorIfExists.", + ): + writer.run(df) + + empty_df = spark.createDataFrame([], df.schema) + + processing.assert_equal_df( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + df=empty_df, + ) + + +def test_postgres_writer_if_exists_ignore(spark, processing, prepare_schema_table): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=postgres, + target=prepare_schema_table.full_name, + options=Postgres.WriteOptions(if_exists="ignore"), + ) + + writer.run(df) # The write operation is ignored + empty_df = spark.createDataFrame([], df.schema) + + processing.assert_equal_df( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + df=empty_df, + ) + + +def test_postgres_writer_if_exists_replace_entire_table(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) df1 = df[df.id_int < 1001] df2 = df[df.id_int > 1000] diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index ae81402cc..f932408d0 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -266,6 +266,8 @@ def test_jdbc_write_options_to_jdbc(spark_mock): [ ({}, JDBCTableExistBehavior.APPEND), ({"if_exists": "append"}, JDBCTableExistBehavior.APPEND), + ({"if_exists": "ignore"}, JDBCTableExistBehavior.IGNORE), + ({"if_exists": "error"}, JDBCTableExistBehavior.ERROR), ({"if_exists": "replace_entire_table"}, JDBCTableExistBehavior.REPLACE_ENTIRE_TABLE), ], ) @@ -294,6 +296,18 @@ def test_jdbc_write_options_if_exists(options, value): "Mode `overwrite` is deprecated since v0.9.0 and will be removed in v1.0.0. " "Use `replace_entire_table` instead", ), + ( + {"mode": "ignore"}, + JDBCTableExistBehavior.IGNORE, + "Option `WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. " + "Use `WriteOptions(if_exists=...)` instead", + ), + ( + {"mode": "error"}, + JDBCTableExistBehavior.ERROR, + "Option `WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. 
" + "Use `WriteOptions(if_exists=...)` instead", + ), ], ) def test_jdbc_write_options_mode_deprecated(options, value, message): @@ -305,10 +319,6 @@ def test_jdbc_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( "options", [ - # disallowed modes - {"mode": "error"}, - {"mode": "ignore"}, - # wrong mode {"mode": "wrong_mode"}, ], ) From 13b498ad432c09b3f8b80d5034cd1fbd2d5e8767 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:45:20 +0300 Subject: [PATCH 06/26] [DOP-8664] - Allow modes "ignore" and "error" in MongoDB.WriteOptions (#145) * [DOP-8664] - Allow modes "ignore" and "error" in MongoDB.WriteOptions * [DOP-8664] - replaced collection existence check with java client * [DOP-8664] - removed useless type ignore * [DOP-8664] - updated tests * [DOP-8664] - updated tests in greenplum.WriteOptions(if_exists='error') * [DOP-8664] - updated logs messages in WriteOptions * [DOP-8664] - updated logs messages in MongoDB.WriteOptions * [DOP-8664] - updated logs messages in MongoDB.WriteOptions --- docs/changelog/next_release/145.feature.rst | 1 + .../db_connection/mongodb/connection.py | 20 +++ .../db_connection/mongodb/options.py | 51 ++++-- .../test_greenplum_writer_integration.py | 8 + .../test_mongodb_writer_integration.py | 153 +++++++++++++++++- .../test_mongodb_unit.py | 18 ++- 6 files changed, 228 insertions(+), 23 deletions(-) create mode 100644 docs/changelog/next_release/145.feature.rst diff --git a/docs/changelog/next_release/145.feature.rst b/docs/changelog/next_release/145.feature.rst new file mode 100644 index 000000000..975e0b96d --- /dev/null +++ b/docs/changelog/next_release/145.feature.rst @@ -0,0 +1 @@ +Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 8e6110f14..5a8640a68 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -504,6 +504,16 @@ def write_df_to_target( else "append" ) + if self._collection_exists(target): + if write_options.if_exists == MongoDBCollectionExistBehavior.ERROR: + raise ValueError("Operation stopped due to MongoDB.WriteOptions(if_exists='error')") + elif write_options.if_exists == MongoDBCollectionExistBehavior.IGNORE: + log.info( + "|%s| Skip writing to existing collection because of MongoDB.WriteOptions(if_exists='ignore')", + self.__class__.__name__, + ) + return + log.info("|%s| Saving data to a collection %r", self.__class__.__name__, target) df.write.format("mongodb").mode(mode).options(**write_options_dict).save() log.info("|%s| Collection %r is successfully written", self.__class__.__name__, target) @@ -533,3 +543,13 @@ def _check_java_class_imported(cls, spark): log.debug("Missing Java class", exc_info=e, stack_info=True) raise ValueError(msg) from e return spark + + def _collection_exists(self, source: str) -> bool: + jvm = self.spark._jvm + client = jvm.com.mongodb.client.MongoClients.create(self.connection_url) # type: ignore + collections = set(client.getDatabase(self.database).listCollectionNames().iterator()) + if source in collections: + log.info("|%s| Collection %r exists", self.__class__.__name__, source) + return True + log.info("|%s| Collection %r does not exist", self.__class__.__name__, source) + return False diff --git a/onetl/connection/db_connection/mongodb/options.py 
b/onetl/connection/db_connection/mongodb/options.py index 85f1935a3..13c256aff 100644 --- a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -81,6 +81,8 @@ class MongoDBCollectionExistBehavior(str, Enum): APPEND = "append" + IGNORE = "ignore" + ERROR = "error" REPLACE_ENTIRE_COLLECTION = "replace_entire_collection" def __str__(self) -> str: @@ -207,33 +209,52 @@ class MongoDBWriteOptions(GenericOptions): .. dropdown:: Behavior in details - * Collection does not exist - Collection is created using options provided by user - (``shardkey`` and others). + * Collection does not exist + Collection is created using options provided by user + (``shardkey`` and others). - * Collection exists - Data is appended to a collection. + * Collection exists + Data is appended to a collection. - .. warning:: + .. warning:: - This mode does not check whether collection already contains - objects from dataframe, so duplicated objects can be created. + This mode does not check whether collection already contains + objects from dataframe, so duplicated objects can be created. * ``replace_entire_collection`` **Collection is deleted and then created**. .. dropdown:: Behavior in details - * Collection does not exist - Collection is created using options provided by user - (``shardkey`` and others). + * Collection does not exist + Collection is created using options provided by user + (``shardkey`` and others). - * Collection exists - Collection content is replaced with dataframe content. + * Collection exists + Collection content is replaced with dataframe content. - .. note:: + * ``ignore`` + Ignores the write operation if the collection already exists. + + .. dropdown:: Behavior in details + + * Collection does not exist + Collection is created using options provided by user + + * Collection exists + The write operation is ignored, and no data is written to the collection. + + * ``error`` + Raises an error if the collection already exists. + + .. dropdown:: Behavior in details + + * Collection does not exist + Collection is created using options provided by user + + * Collection exists + An error is raised, and no data is written to the collection. - ``error`` and ``ignore`` modes are not supported. 
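As with the JDBC options above, the new MongoDB modes are driven through ``DBWriter``. A minimal sketch based on the integration tests in this patch; host, credentials and ``df`` are placeholders, not part of the diff.

.. code:: python

    from onetl.connection import MongoDB
    from onetl.db import DBWriter

    mongo = MongoDB(
        host="example-host",  # placeholder credentials, not from this patch
        user="user",
        password="***",
        database="db",
        spark=spark,  # existing SparkSession with the MongoDB connector loaded
    )

    writer = DBWriter(
        connection=mongo,
        table="target_collection",
        options=MongoDB.WriteOptions(if_exists="error"),
    )

    # if "target_collection" already exists, this raises
    # ValueError("Operation stopped due to MongoDB.WriteOptions(if_exists='error')")
    writer.run(df)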
""" class Config: diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py index c97105a44..338de0c67 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py @@ -137,6 +137,14 @@ def test_greenplum_writer_if_exists_error(spark, processing, prepare_schema_tabl ): writer.run(df) + empty_df = spark.createDataFrame([], df.schema) + + processing.assert_equal_df( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + df=empty_df, + ) + def test_greenplum_writer_if_exists_ignore(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py index 458a6902f..d5cd94fed 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py @@ -1,3 +1,6 @@ +import logging +import re + import pytest from onetl.connection import MongoDB @@ -6,8 +9,18 @@ pytestmark = pytest.mark.mongodb +@pytest.mark.parametrize( + "options", + [ + {}, + {"if_exists": "append"}, + {"if_exists": "replace_entire_collection"}, + {"if_exists": "error"}, + {"if_exists": "ignore"}, + ], +) @pytest.mark.flaky(reruns=2) -def test_mongodb_writer_snapshot(spark, processing, prepare_schema_table): +def test_mongodb_writer_snapshot(spark, processing, get_schema_table, options, caplog): df = processing.create_spark_df(spark=spark) mongo = MongoDB( @@ -21,12 +34,144 @@ def test_mongodb_writer_snapshot(spark, processing, prepare_schema_table): writer = DBWriter( connection=mongo, - table=prepare_schema_table.table, + table=get_schema_table.table, + options=MongoDB.WriteOptions(**options), + ) + + with caplog.at_level(logging.INFO): + writer.run(df) + + assert f"|MongoDB| Collection '{get_schema_table.table}' does not exist" in caplog.text + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df, + ) + + +def test_mongodb_writer_if_exists_append(spark, processing, get_schema_table): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + df1 = df[df._id < 1001] + df2 = df[df._id > 1000] + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="append"), + ) + writer.run(df1) + writer.run(df2) + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df, + ) + + +def test_mongodb_writer_if_exists_replace_entire_collection(spark, processing, get_schema_table): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + df1 = df[df._id < 1001] + df2 = df[df._id > 1000] + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + 
password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="replace_entire_collection"), + ) + writer.run(df1) + writer.run(df2) + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df2, + ) + + +def test_mongodb_writer_if_exists_error(spark, processing, get_schema_table, caplog): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="error"), ) writer.run(df) + with pytest.raises( + ValueError, + match=re.escape("Operation stopped due to MongoDB.WriteOptions(if_exists='error')"), + ): + writer.run(df) + processing.assert_equal_df( - schema=prepare_schema_table.schema, - table=prepare_schema_table.table, + schema=get_schema_table.schema, + table=get_schema_table.table, df=df, ) + + +def test_mongodb_writer_if_exists_ignore(spark, processing, get_schema_table, caplog): + df = processing.create_spark_df(spark=spark, min_id=1, max_id=1500) + df1 = df[df._id < 1001] + df2 = df[df._id > 1000] + + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + writer = DBWriter( + connection=mongo, + table=get_schema_table.table, + options=MongoDB.WriteOptions(if_exists="ignore"), + ) + writer.run(df1) + + with caplog.at_level(logging.INFO): + writer.run(df2) # The write operation is ignored + + assert f"|MongoDB| Collection '{get_schema_table.table}' exists" in caplog.text + assert ( + "|MongoDB| Skip writing to existing collection because of MongoDB.WriteOptions(if_exists='ignore')" + in caplog.text + ) + + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df1, + ) diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py index 8775f6dbc..eb3f1db23 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py @@ -233,6 +233,8 @@ def test_mongodb_convert_dict_to_str(): [ ({}, MongoDBCollectionExistBehavior.APPEND), ({"if_exists": "append"}, MongoDBCollectionExistBehavior.APPEND), + ({"if_exists": "ignore"}, MongoDBCollectionExistBehavior.IGNORE), + ({"if_exists": "error"}, MongoDBCollectionExistBehavior.ERROR), ({"if_exists": "replace_entire_collection"}, MongoDBCollectionExistBehavior.REPLACE_ENTIRE_COLLECTION), ], ) @@ -261,6 +263,18 @@ def test_mongodb_write_options_if_exists(options, value): "Mode `overwrite` is deprecated since v0.9.0 and will be removed in v1.0.0. " "Use `replace_entire_collection` instead", ), + ( + {"mode": "ignore"}, + MongoDBCollectionExistBehavior.IGNORE, + "Option `MongoDB.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. " + "Use `MongoDB.WriteOptions(if_exists=...)` instead", + ), + ( + {"mode": "error"}, + MongoDBCollectionExistBehavior.ERROR, + "Option `MongoDB.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. 
" + "Use `MongoDB.WriteOptions(if_exists=...)` instead", + ), ], ) def test_mongodb_write_options_mode_deprecated(options, value, message): @@ -272,10 +286,6 @@ def test_mongodb_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( "options", [ - # disallowed modes - {"mode": "error"}, - {"mode": "ignore"}, - # wrong mode {"mode": "wrong_mode"}, ], ) From 023aa42b2bc465e5c2585f43e51109020d7f2a8e Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:59:15 +0300 Subject: [PATCH 07/26] [DOP-8647] - Allow modes "ignore" and "error" in Hive.WriteOptions (#143) * [DOP-8647] - Allow modes "ignore" and "error" in HiveWriteOptions * [DOP-8647] - remove log.error message * [DOP-8647] - move write_options check to write_df_to_target * [DOP-8647] - updated logs messages in Hive.WriteOptions * [DOP-8647] - updated logs messages in Hive.WriteOptions * [DOP-8647] - updated logs messages in Hive.WriteOptions --- docs/changelog/next_release/143.feature.rst | 1 + .../db_connection/hive/connection.py | 8 ++ .../connection/db_connection/hive/options.py | 27 ++++- .../test_hive_writer_integration.py | 102 ++++++++++++++++++ .../test_hive_unit.py | 18 +++- 5 files changed, 150 insertions(+), 6 deletions(-) create mode 100644 docs/changelog/next_release/143.feature.rst diff --git a/docs/changelog/next_release/143.feature.rst b/docs/changelog/next_release/143.feature.rst new file mode 100644 index 000000000..97756efc4 --- /dev/null +++ b/docs/changelog/next_release/143.feature.rst @@ -0,0 +1 @@ +Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index d0bc08d29..d0997f512 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -340,6 +340,14 @@ def write_df_to_target( # https://stackoverflow.com/a/72747050 if table_exists and write_options.if_exists != HiveTableExistBehavior.REPLACE_ENTIRE_TABLE: + if write_options.if_exists == HiveTableExistBehavior.ERROR: + raise ValueError("Operation stopped due to Hive.WriteOptions(if_exists='error')") + elif write_options.if_exists == HiveTableExistBehavior.IGNORE: + log.info( + "|%s| Skip writing to existing table because of Hive.WriteOptions(if_exists='ignore')", + self.__class__.__name__, + ) + return # using saveAsTable on existing table does not handle # spark.sql.sources.partitionOverwriteMode=dynamic, so using insertInto instead. self._insert_into(df, target, options) diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index c46b7882d..81445851d 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -26,6 +26,8 @@ class HiveTableExistBehavior(str, Enum): APPEND = "append" + IGNORE = "ignore" + ERROR = "error" REPLACE_ENTIRE_TABLE = "replace_entire_table" REPLACE_OVERLAPPING_PARTITIONS = "replace_overlapping_partitions" @@ -173,9 +175,30 @@ class Config: Table is recreated using options provided by user (``format``, ``compression``, etc) **instead of using original table options**. Be careful - .. note:: + * ``ignore`` + Ignores the write operation if the table/partition already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user (``format``, ``compression``, etc). 
+ + * Table exists + If the table exists, **no further action is taken**. This is true whether or not new partition + values are present and whether the partitioning scheme differs or not + + * ``error`` + Raises an error if the table/partition already exists. + + .. dropdown:: Behavior in details + + * Table does not exist + Table is created using options provided by user (``format``, ``compression``, etc). + + * Table exists + If the table exists, **raises an error**. This is true whether or not new partition + values are present and whether the partitioning scheme differs or not - ``error`` and ``ignore`` modes are not supported. .. note:: diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py index 44553539b..8ca74b06d 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py @@ -1,4 +1,5 @@ import logging +import re import textwrap import pytest @@ -225,6 +226,8 @@ def test_hive_writer_default_not_partitioned(spark, processing, get_schema_table "options", [ Hive.WriteOptions(if_exists="append"), + Hive.WriteOptions(if_exists="ignore"), + Hive.WriteOptions(if_exists="error"), Hive.WriteOptions(if_exists="replace_entire_table"), Hive.WriteOptions(if_exists="replace_overlapping_partitions"), ], @@ -363,6 +366,105 @@ def test_hive_writer_insert_into_append(spark, processing, get_schema_table, ori ) +@pytest.mark.parametrize( + "original_options, new_options", + [ + pytest.param({}, {"partitionBy": "id_int"}, id="table_not_partitioned_dataframe_is"), + pytest.param({"partitionBy": "text_string"}, {}, id="table_partitioned_dataframe_is_not"), + pytest.param({"partitionBy": "text_string"}, {"partitionBy": "id_int"}, id="different_partitioning_schema"), + pytest.param({"partitionBy": "id_int"}, {"partitionBy": "id_int"}, id="same_partitioning_schema"), + ], +) +def test_hive_writer_insert_into_ignore(spark, processing, get_schema_table, original_options, new_options, caplog): + df = processing.create_spark_df(spark=spark) + + df1 = df[df.id_int <= 25] + df2 = df.where("id_int > 25 AND id_int <= 50") + df3 = df[df.id_int > 50] + + hive = Hive(cluster="rnd-dwh", spark=spark) + writer1 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=original_options, + ) + # create & fill up the table with some data + writer1.run(df1.union(df2)) + old_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + + writer2 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=Hive.WriteOptions(if_exists="ignore", **new_options), + ) + + with caplog.at_level(logging.INFO): + writer2.run(df1.union(df3)) + + assert "|Hive| Skip writing to existing table because of Hive.WriteOptions(if_exists='ignore')" in caplog.text + + new_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + + # table DDL remains the same + assert new_ddl == old_ddl + + # table should only contain old data, because 'ignore' should not have added new data + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df1.union(df2), + order_by="id_int", + ) + + +@pytest.mark.parametrize( + "original_options, new_options", + [ + pytest.param({}, {"partitionBy": 
"id_int"}, id="table_not_partitioned_dataframe_is"), + pytest.param({"partitionBy": "text_string"}, {}, id="table_partitioned_dataframe_is_not"), + pytest.param({"partitionBy": "text_string"}, {"partitionBy": "id_int"}, id="different_partitioning_schema"), + pytest.param({"partitionBy": "id_int"}, {"partitionBy": "id_int"}, id="same_partitioning_schema"), + ], +) +def test_hive_writer_insert_into_error(spark, processing, get_schema_table, original_options, new_options, caplog): + df = processing.create_spark_df(spark=spark) + + hive = Hive(cluster="rnd-dwh", spark=spark) + writer1 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=original_options, + ) + + # Create & fill up the table with some data + writer1.run(df) + old_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + + writer2 = DBWriter( + connection=hive, + target=get_schema_table.full_name, + options=Hive.WriteOptions(if_exists="error", **new_options), + ) + + with pytest.raises( + ValueError, + match=re.escape("Operation stopped due to Hive.WriteOptions(if_exists='error')"), + ): + writer2.run(df) + + # table DDL remains the same + new_ddl = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}").collect()[0][0] + assert new_ddl == old_ddl + + # validate that the table contains only old data + processing.assert_equal_df( + schema=get_schema_table.schema, + table=get_schema_table.table, + df=df, + order_by="id_int", + ) + + @pytest.mark.parametrize( "original_options, new_options", [ diff --git a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py index 7e633206e..6469b10c8 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py @@ -153,6 +153,8 @@ def test_hive_write_options_unsupported_insert_into(insert_into): ({"if_exists": "append"}, HiveTableExistBehavior.APPEND), ({"if_exists": "replace_overlapping_partitions"}, HiveTableExistBehavior.REPLACE_OVERLAPPING_PARTITIONS), ({"if_exists": "replace_entire_table"}, HiveTableExistBehavior.REPLACE_ENTIRE_TABLE), + ({"if_exists": "error"}, HiveTableExistBehavior.ERROR), + ({"if_exists": "ignore"}, HiveTableExistBehavior.IGNORE), ], ) def test_hive_write_options_if_exists(options, value): @@ -198,6 +200,18 @@ def test_hive_write_options_if_exists(options, value): "Mode `overwrite_table` is deprecated since v0.9.0 and will be removed in v1.0.0. " "Use `replace_entire_table` instead", ), + ( + {"mode": "error"}, + HiveTableExistBehavior.ERROR, + "Option `Hive.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. " + "Use `Hive.WriteOptions(if_exists=...)` instead", + ), + ( + {"mode": "ignore"}, + HiveTableExistBehavior.IGNORE, + "Option `Hive.WriteOptions(mode=...)` is deprecated since v0.9.0 and will be removed in v1.0.0. 
" + "Use `Hive.WriteOptions(if_exists=...)` instead", + ), ], ) def test_hive_write_options_mode_deprecated(options, value, message): @@ -209,10 +223,6 @@ def test_hive_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( "options", [ - # disallowed modes - {"mode": "error"}, - {"mode": "ignore"}, - # wrong mode {"mode": "wrong_mode"}, ], ) From 9cde99381232fe14d836c09896a0ced42226f3cb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Sep 2023 05:30:00 +0000 Subject: [PATCH 08/26] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 23.7.0 → 23.9.1](https://github.com/psf/black/compare/23.7.0...23.9.1) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fd0c89d6b..aea58de6b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -64,7 +64,7 @@ repos: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 23.9.1 hooks: - id: black language_version: python3 From 1aa3fc07343b016c8aca46b74eef5ddd86fd96d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 12 Sep 2023 10:42:01 +0000 Subject: [PATCH 09/26] [DOP-8511] Fix MongoDB documentation example --- onetl/connection/db_connection/mongodb/connection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 5a8640a68..7f02f20ef 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -124,7 +124,7 @@ class MongoDB(DBConnection): from pyspark.sql import SparkSession # Create Spark session with MongoDB connector loaded - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = MongoDB.get_packages(spark_version="3.2") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) From 2caf72b27a4e50f8b7383731c4b5d2525b9c2c56 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 04:54:50 +0000 Subject: [PATCH 10/26] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.10.1 → v3.11.0](https://github.com/asottile/pyupgrade/compare/v3.10.1...v3.11.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aea58de6b..d43a63307 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -59,7 +59,7 @@ repos: - id: rst-inline-touching-normal - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.10.1 + rev: v3.11.0 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] From 982d96cc11536b42f676a881a2eeddfefa9fcee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= 
=?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 18 Sep 2023 11:19:38 +0000 Subject: [PATCH 11/26] [DOP-8959] Add Excel file format support --- .github/workflows/data/greenplum/matrix.yml | 2 +- .github/workflows/data/local-fs/matrix.yml | 17 +- .github/workflows/data/mongodb/matrix.yml | 2 +- .github/workflows/data/s3/matrix.yml | 2 +- README.rst | 4 +- docs/changelog/next_release/148.feature.rst | 1 + .../db_connection/greenplum/prerequisites.rst | 4 +- docs/file_df/file_formats/avro.rst | 2 +- docs/file_df/file_formats/excel.rst | 9 + docs/file_df/file_formats/index.rst | 1 + docs/file_df/file_formats/orc.rst | 2 +- .../db_connection/kafka/connection.py | 4 + .../db_connection/mongodb/connection.py | 2 + .../file_df_connection/spark_s3/connection.py | 2 + onetl/file/format/__init__.py | 1 + onetl/file/format/avro.py | 4 + onetl/file/format/excel.py | 220 ++++++++++++++++++ .../{spark-3.2.3.txt => spark-3.2.4.txt} | 2 +- tests/fixtures/spark.py | 19 +- .../file_df_connection/generate_files.py | 92 ++++++++ .../xls/with_data_address/file.xls | Bin 0 -> 5632 bytes .../xls/with_header/file.xls | Bin 0 -> 5632 bytes .../xls/without_header/file.xls | Bin 0 -> 5632 bytes .../xlsx/with_data_address/file.xls | Bin 0 -> 4891 bytes .../xlsx/with_header/file.xls | Bin 0 -> 5026 bytes .../xlsx/without_header/file.xls | Bin 0 -> 4881 bytes tests/resources/requirements.txt | 3 + .../test_avro_integration.py | 11 +- .../test_csv_integration.py | 7 +- .../test_excel_integration.py | 142 +++++++++++ .../test_format_unit/test_excel_unit.py | 106 +++++++++ tests/util/spark_df.py | 2 +- 32 files changed, 639 insertions(+), 24 deletions(-) create mode 100644 docs/changelog/next_release/148.feature.rst create mode 100644 docs/file_df/file_formats/excel.rst create mode 100644 onetl/file/format/excel.py rename requirements/tests/{spark-3.2.3.txt => spark-3.2.4.txt} (80%) create mode 100644 tests/resources/file_df_connection/xls/with_data_address/file.xls create mode 100644 tests/resources/file_df_connection/xls/with_header/file.xls create mode 100644 tests/resources/file_df_connection/xls/without_header/file.xls create mode 100644 tests/resources/file_df_connection/xlsx/with_data_address/file.xls create mode 100644 tests/resources/file_df_connection/xlsx/with_header/file.xls create mode 100644 tests/resources/file_df_connection/xlsx/without_header/file.xls create mode 100644 tests/tests_integration/test_file_format_integration/test_excel_integration.py create mode 100644 tests/tests_unit/test_file/test_format_unit/test_excel_unit.py diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml index 43e02d8c2..292319bb5 100644 --- a/.github/workflows/data/greenplum/matrix.yml +++ b/.github/workflows/data/greenplum/matrix.yml @@ -7,7 +7,7 @@ min: &min max: &max # Greenplum connector does not support Spark 3.3+ - spark-version: 3.2.3 + spark-version: 3.2.4 python-version: '3.10' java-version: 11 os: ubuntu-latest diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index af841433b..e956169ba 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -4,12 +4,18 @@ min: &min java-version: 8 os: ubuntu-latest -avro: &avro +min_avro: &min_avro spark-version: 2.4.8 python-version: '3.7' java-version: 8 os: ubuntu-latest +min_excel: &min_excel + spark-version: 3.2.4 + python-version: '3.7' + java-version: 8 + os: ubuntu-latest + max: &max spark-version: 3.4.1 python-version: '3.11' 
@@ -25,12 +31,15 @@ latest: &latest matrix: small: - <<: *max - - <<: *avro + - <<: *min_avro + - <<: *min_excel full: - <<: *min - - <<: *avro + - <<: *min_avro + - <<: *min_excel - <<: *max nightly: - <<: *min - - <<: *avro + - <<: *min_avro + - <<: *min_excel - <<: *latest diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index 80f81aacf..f91e1baaa 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -1,6 +1,6 @@ min: &min # MongoDB connector does not support Spark 2 - spark-version: 3.2.3 + spark-version: 3.2.4 python-version: '3.7' java-version: 8 os: ubuntu-latest diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index 57fe2ca8f..44779fe95 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -2,7 +2,7 @@ min: &min # prior image versions returns empty content of bucket root, some kind of bug minio-version: 2021.3.17 # Minimal Spark version with Hadoop 3.x support - spark-version: 3.2.3 + spark-version: 3.2.4 python-version: '3.7' java-version: 8 os: ubuntu-latest diff --git a/README.rst b/README.rst index 6e21bb672..e20086214 100644 --- a/README.rst +++ b/README.rst @@ -169,9 +169,9 @@ Compatibility matrix +--------------------------------------------------------------+-------------+-------------+-------+ | `2.4.x `_ | 3.7 only | 8 only | 2.11 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +| `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | +| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ | `3.4.x `_ | 3.7 - 3.11 | 8u362 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ diff --git a/docs/changelog/next_release/148.feature.rst b/docs/changelog/next_release/148.feature.rst new file mode 100644 index 000000000..87b1b48a8 --- /dev/null +++ b/docs/changelog/next_release/148.feature.rst @@ -0,0 +1 @@ +Add ``Excel`` file format support. diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 964d9cdcf..815a12b27 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -22,7 +22,7 @@ You can install PySpark as follows: .. code:: bash - pip install onetl pyspark=3.2.3 # pass specific PySpark version + pip install onetl pyspark=3.2.4 # pass specific PySpark version See :ref:`spark-install` instruction for more details. @@ -158,7 +158,7 @@ Inserting ``.jar`` file to Spark jars folder Can be used to embed ``.jar`` files to a default Spark classpath. * Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``. +* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.4/jars/``. 
* Create Spark session **WITHOUT** passing Greenplum package name to ``spark.jars.packages`` diff --git a/docs/file_df/file_formats/avro.rst b/docs/file_df/file_formats/avro.rst index 6251a5154..7f1ec0d4f 100644 --- a/docs/file_df/file_formats/avro.rst +++ b/docs/file_df/file_formats/avro.rst @@ -1,7 +1,7 @@ .. _avro-file-format: Avro -======== +==== .. currentmodule:: onetl.file.format.avro diff --git a/docs/file_df/file_formats/excel.rst b/docs/file_df/file_formats/excel.rst new file mode 100644 index 000000000..f9b680085 --- /dev/null +++ b/docs/file_df/file_formats/excel.rst @@ -0,0 +1,9 @@ +.. _excel-file-format: + +Excel +===== + +.. currentmodule:: onetl.file.format.excel + +.. autoclass:: Excel + :members: get_packages diff --git a/docs/file_df/file_formats/index.rst b/docs/file_df/file_formats/index.rst index 7e3367bc6..3a39bc061 100644 --- a/docs/file_df/file_formats/index.rst +++ b/docs/file_df/file_formats/index.rst @@ -9,6 +9,7 @@ File Formats avro csv + excel json jsonline orc diff --git a/docs/file_df/file_formats/orc.rst b/docs/file_df/file_formats/orc.rst index 2d82b3584..491492bac 100644 --- a/docs/file_df/file_formats/orc.rst +++ b/docs/file_df/file_formats/orc.rst @@ -1,7 +1,7 @@ .. _orc-file-format: ORC -======== +=== .. currentmodule:: onetl.file.format.orc diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index 3aa8f0fd2..cf9a669c9 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -72,6 +72,7 @@ class Kafka(DBConnection): * Apache Kafka versions: 0.10 or higher * Spark versions: 2.4.x - 3.4.x + * Scala versions: 2.11 - 2.13 Parameters ---------- @@ -381,6 +382,9 @@ def get_packages( """ Get package names to be downloaded by Spark. |support_hooks| + See `Maven package index `_ + for all available packages. + Parameters ---------- spark_version : str diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 7f02f20ef..771fb3b69 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -65,6 +65,7 @@ class MongoDB(DBConnection): * MongoDB server versions: 4.0 or higher * Spark versions: 3.2.x - 3.4.x * Java versions: 8 - 20 + * Scala versions: 2.11 - 2.13 See `official documentation `_. @@ -206,6 +207,7 @@ def get_packages( if scala_ver.digits(2) < (2, 12) or scala_ver.digits(2) > (2, 13): raise ValueError(f"Scala version must be 2.12 - 2.13, got {scala_ver}") + # https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector return [f"org.mongodb.spark:mongo-spark-connector_{scala_ver.digits(2)}:10.1.1"] @classproperty diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 0fd72a0ca..464487f52 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -63,6 +63,7 @@ class SparkS3(SparkFileDFConnection): * Spark versions: 3.2.x - 3.4.x (only with Hadoop 3.x libraries) * Java versions: 8 - 20 + * Scala versions: 2.11 - 2.13 .. 
warning:: @@ -263,6 +264,7 @@ def get_packages( raise ValueError(f"Spark version must be at least 3.x, got {spark_ver}") scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + # https://mvnrepository.com/artifact/org.apache.spark/spark-hadoop-cloud return [f"org.apache.spark:spark-hadoop-cloud_{scala_ver.digits(2)}:{spark_ver.digits(3)}"] @slot diff --git a/onetl/file/format/__init__.py b/onetl/file/format/__init__.py index d41c76aac..0c9d6b742 100644 --- a/onetl/file/format/__init__.py +++ b/onetl/file/format/__init__.py @@ -15,6 +15,7 @@ from onetl.file.format.avro import Avro from onetl.file.format.csv import CSV +from onetl.file.format.excel import Excel from onetl.file.format.json import JSON from onetl.file.format.jsonline import JSONLine from onetl.file.format.orc import ORC diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 2fc5a1cb5..b0c58e18d 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -73,6 +73,7 @@ class Avro(ReadWriteFileFormat): * Spark versions: 2.4.x - 3.4.x * Java versions: 8 - 20 + * Scala versions: 2.11 - 2.13 See documentation from link above. @@ -131,6 +132,9 @@ def get_packages( """ Get package names to be downloaded by Spark. |support_hooks| + See `Maven package index `_ + for all available packages. + Parameters ---------- spark_version : str diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py new file mode 100644 index 000000000..ffd11a5da --- /dev/null +++ b/onetl/file/format/excel.py @@ -0,0 +1,220 @@ +# Copyright 2023 MTS (Mobile Telesystems) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, ClassVar + +from onetl._util.java import try_import_java_class +from onetl._util.scala import get_default_scala_version +from onetl._util.spark import get_spark_version +from onetl._util.version import Version +from onetl.exception import MISSING_JVM_CLASS_MSG +from onetl.file.format.file_format import ReadWriteFileFormat +from onetl.hooks import slot, support_hooks + +if TYPE_CHECKING: + from pyspark.sql import SparkSession + +READ_OPTIONS = frozenset( + ( + "dataAddress", + "treatEmptyValuesAsNulls", + "setErrorCellsToFallbackValues", + "usePlainNumberFormat", + "inferSchema", + "addColorColumns", + "timestampFormat", + "maxRowsInMemory", + "maxByteArraySize", + "tempFileThreshold", + "excerptSize", + "workbookPassword", + ), +) + +WRITE_OPTIONS = frozenset( + ( + "dataAddress", + "dateFormat", + "timestampFormat", + ), +) + +log = logging.getLogger(__name__) + + +@support_hooks +class Excel(ReadWriteFileFormat): + """ + Excel file format. |support_hooks| + + Based on `Spark Excel `_ file format. + + Supports reading/writing files with ``.xlsx`` (read/write) and ``.xls`` (read only) extensions. + + .. versionadded:: 0.9.4 + + .. dropdown:: Version compatibility + + * Spark versions: 3.2.x - 3.4.x. + + .. 
warning:: + + Not all combinations of Spark version and package version are supported. + See `Maven index `_ + and `official documentation `_. + + * Scala versions: 2.12 - 2.13 + * Java versions: 8 - 20 + + See documentation from link above. + + .. note :: + + You can pass any option to the constructor, even if it is not mentioned in this documentation. + **Option names should be in** ``camelCase``! + + The set of supported options depends on Spark version. See link above. + + Examples + -------- + + Describe options how to read from/write to Excel file with specific options: + + .. code:: python + + from onetl.file.format import Excel + from pyspark.sql import SparkSession + + # Create Spark session with Excel package loaded + maven_packages = Excel.get_packages(spark_version="3.4.1") + spark = ( + SparkSession.builder.appName("spark-app-name") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + + excel = Excel( + header=True, + inferSchema=True, + ) + + """ + + name: ClassVar[str] = "excel" + + header: bool = False + + class Config: + known_options = READ_OPTIONS | WRITE_OPTIONS + extra = "allow" + + @slot + @classmethod + def get_packages( + cls, + spark_version: str, + scala_version: str | None = None, + package_version: str | None = None, + ) -> list[str]: + """ + Get package names to be downloaded by Spark. |support_hooks| + + .. warning:: + + Not all combinations of Spark version and package version are supported. + See `Maven index `_ + and `official documentation `_. + + Parameters + ---------- + spark_version : str + Spark version in format ``major.minor.patch``. + + scala_version : str, optional + Scala version in format ``major.minor``. + + If ``None``, ``spark_version`` is used to determine Scala version. + + version: str, optional + Package version in format ``major.minor.patch``. Default is ``0.19.0``. + + .. warning:: + + Version ``0.14`` and below are not supported. + + .. note:: + + It is not guaranteed that custom package versions are supported. + Tests are performed only for default version. + + Examples + -------- + + .. code:: python + + from onetl.file.format import Excel + + Excel.get_packages(spark_version="3.4.1") + Excel.get_packages(spark_version="3.4.1", scala_version="2.13") + Excel.get_packages( + spark_version="3.4.1", + scala_version="2.13", + package_version="0.19.0", + ) + + """ + + if package_version: + version = Version.parse(package_version) + if version < (0, 15): + # format="com.crealytics.spark.excel" does not support reading folder with files + # format="excel" was added only in 0.14, but Maven package for 0.14 has different naming convention than recent versions. + # So using 0.15 as the lowest supported version. + raise ValueError(f"Package version should be at least 0.15, got {package_version}") + log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) + else: + version = Version.parse("0.19.0") + + spark_ver = Version.parse(spark_version) + if spark_ver < (3, 2): + # Actually, Spark 2.4 is supported, but packages are built only for Scala 2.12 + # when default pyspark==2.4.1 is built with Scala 2.11. 
+ # See https://github.com/crealytics/spark-excel/issues/426 + raise ValueError(f"Spark version should be at least 3.2, got {spark_version}") + + scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + if scala_ver.digits(2) < (2, 12): + raise ValueError(f"Scala version should be at least 2.12, got {scala_ver}") + + return [f"com.crealytics:spark-excel_{scala_ver.digits(2)}:{spark_ver.digits(3)}_{version.digits(3)}"] + + @slot + def check_if_supported(self, spark: SparkSession) -> None: + java_class = "com.crealytics.spark.excel.v2.ExcelDataSource" + + try: + try_import_java_class(spark, java_class) + except Exception as e: + spark_version = get_spark_version(spark) + msg = MISSING_JVM_CLASS_MSG.format( + java_class=java_class, + package_source=self.__class__.__name__, + args=f"spark_version='{spark_version}'", + ) + if log.isEnabledFor(logging.DEBUG): + log.debug("Missing Java class", exc_info=e, stack_info=True) + raise ValueError(msg) from e diff --git a/requirements/tests/spark-3.2.3.txt b/requirements/tests/spark-3.2.4.txt similarity index 80% rename from requirements/tests/spark-3.2.3.txt rename to requirements/tests/spark-3.2.4.txt index 44291430a..1acafab9a 100644 --- a/requirements/tests/spark-3.2.3.txt +++ b/requirements/tests/spark-3.2.4.txt @@ -1,5 +1,5 @@ numpy>=1.16,<1.24 pandas>=1.0,<2 pyarrow>=1.0 -pyspark==3.2.3 +pyspark==3.2.4 sqlalchemy<2.0 diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index 2135f3b52..05358b9c0 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -44,7 +44,7 @@ def maven_packages(): SparkS3, Teradata, ) - from onetl.file.format import Avro + from onetl.file.format import Avro, Excel pyspark_version = get_pyspark_version() packages = ( @@ -74,9 +74,23 @@ def maven_packages(): # There is no MongoDB connector for Spark less than 3.2 packages.extend(MongoDB.get_packages(spark_version=pyspark_version)) + # There is no Excel files support for Spark less than 3.2 + packages.extend(Excel.get_packages(spark_version=pyspark_version)) + return packages +@pytest.fixture(scope="session") +def excluded_packages(): + # These packages are a part of org.apache.spark:spark-hadoop-cloud, but not used in tests + return [ + "com.google.cloud.bigdataoss:gcs-connector", + "org.apache.hadoop:hadoop-aliyun", + "org.apache.hadoop:hadoop-azure-datalake", + "org.apache.hadoop:hadoop-azure", + ] + + @pytest.fixture( scope="session", name="spark", @@ -84,13 +98,14 @@ def maven_packages(): pytest.param("real-spark", marks=[pytest.mark.db_connection, pytest.mark.connection]), ], ) -def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, maven_packages): +def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, maven_packages, excluded_packages): from pyspark.sql import SparkSession spark = ( SparkSession.builder.config("spark.app.name", "onetl") # noqa: WPS221 .config("spark.master", "local[*]") .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.jars.excludes", ",".join(excluded_packages)) .config("spark.jars.ivySettings", os.fspath(ivysettings_path)) .config("spark.driver.memory", "1g") .config("spark.driver.maxResultSize", "1g") diff --git a/tests/resources/file_df_connection/generate_files.py b/tests/resources/file_df_connection/generate_files.py index 874045f8c..698c81ea7 100755 --- a/tests/resources/file_df_connection/generate_files.py +++ b/tests/resources/file_df_connection/generate_files.py @@ -14,10 +14,13 @@ from contextlib import 
contextmanager from datetime import date, datetime, timezone from pathlib import Path +from tempfile import gettempdir from typing import TYPE_CHECKING, Any, Iterator, TextIO +from zipfile import ZipFile if TYPE_CHECKING: from avro.schema import Schema as AvroSchema + from pandas import DataFrame as PandasDataFrame from pyarrow import Schema as ArrowSchema from pyarrow import Table as ArrowTable @@ -85,6 +88,12 @@ def get_data() -> list[dict]: ] +def get_pandas_dataframe(data: list[dict]) -> PandasDataFrame: + import pandas as pd + + return pd.DataFrame(data) + + def get_pyarrow_schema() -> ArrowSchema: import pyarrow as pa @@ -382,6 +391,87 @@ def save_as_avro(data: list[dict], path: Path) -> None: save_as_avro_snappy(data, root / "with_compression") +def save_as_xls_with_options( + data: list[dict], + path: Path, + index: bool = False, + **kwargs, +) -> None: + # required to register xlwt writer which supports generating .xls files + import pandas_xlwt + + path.mkdir(parents=True, exist_ok=True) + file = path / "file.xls" + + df = get_pandas_dataframe(data) + df["datetime_value"] = df.datetime_value.dt.tz_localize(None) + df.to_excel(file, index=index, engine="xlwt", **kwargs) + + +def make_zip_deterministic(path: Path) -> None: + temp_dir = gettempdir() + file_copy = Path(shutil.copy(path, temp_dir)) + + with ZipFile(file_copy, "r") as original_file: + with ZipFile(path, "w") as new_file: + for item in original_file.infolist(): + if item.filename == "docProps/core.xml": + # this file contains modification time, which produces files with different hashes + continue + # reset modification time of all files + item.date_time = (1980, 1, 1, 0, 0, 0) + new_file.writestr(item, original_file.read(item.filename)) + + +def save_as_xlsx_with_options( + data: list[dict], + path: Path, + index: bool = False, + **kwargs, +) -> None: + path.mkdir(parents=True, exist_ok=True) + file = path / "file.xls" + + df = get_pandas_dataframe(data) + df["datetime_value"] = df.datetime_value.dt.tz_localize(None) + df.to_excel(file, index=index, engine="openpyxl", **kwargs) + make_zip_deterministic(file) + + +def save_as_xlsx(data: list[dict], path: Path) -> None: + root = path / "xlsx" + shutil.rmtree(root, ignore_errors=True) + root.mkdir(parents=True, exist_ok=True) + + save_as_xlsx_with_options(data, root / "without_header", header=False) + save_as_xlsx_with_options(data, root / "with_header", header=True) + save_as_xlsx_with_options( + data, + root / "with_data_address", + header=False, + sheet_name="ABC", + startcol=10, + startrow=5, + ) + + +def save_as_xls(data: list[dict], path: Path) -> None: + root = path / "xls" + shutil.rmtree(root, ignore_errors=True) + root.mkdir(parents=True, exist_ok=True) + + save_as_xls_with_options(data, root / "without_header", header=False) + save_as_xls_with_options(data, root / "with_header", header=True) + save_as_xls_with_options( + data, + root / "with_data_address", + header=False, + sheet_name="ABC", + startcol=10, + startrow=5, + ) + + format_mapping = { "csv": save_as_csv, "json": save_as_json, @@ -389,6 +479,8 @@ def save_as_avro(data: list[dict], path: Path) -> None: "orc": save_as_orc, "parquet": save_as_parquet, "avro": save_as_avro, + "xlsx": save_as_xlsx, + "xls": save_as_xls, } diff --git a/tests/resources/file_df_connection/xls/with_data_address/file.xls b/tests/resources/file_df_connection/xls/with_data_address/file.xls new file mode 100644 index 0000000000000000000000000000000000000000..28288eb8e5e89ab2d4059d39a10c5b0b10133a9f GIT binary patch literal 5632 
zcmeI0U1$_n6vzLw*^imoWV1VoqA4y*sBL~)H&sI@l6fe>KUoXmJiEcV-4as0Bc7T~~?C9?3Y3oV`%drzS&_bD!$;?==AQ_y1FHn(hN;Ngj&SYjU zT{?(L&X67WGn)ebkO+w!PWkr78wUd^;4fi{xbcrLFC*DaCU_*f#Vi-wKMDkaO~}5a zE3Ow&G}8dS3+6XGe;<=%=I{cb^>wZaw|LGi>Nt(y?kDkr{iICSSt)fp%C(-*)fR_| z`(NHZms#DMUZs{zw1~*!n<9Nd7VpR%ygA0T2$b?`gL@Wg^^Rv%zbig?O6ygsE9uR> zqo#RWuCi#e`fSnG8glAX&Znp5)t_O#_vQ-+lD${c(^%itIP_-au8ZkDRP)W(PKJCo zW4bZj*#4==!L(Nz+-YRD`c<{NPv!4WW+1L3h;vLTMmgRN*tFS5=)OD5T8J@EXmdZG z7vgvm_?_W!@X6i)%21+lh;e;RzB}o~zK^ffyU!@6-!1wHMfUJwN?0k9A)$l|C}E|1 z#k@NB>3~NGbGNNP<8O2JHyn257Vku>Oj$$cA@+)JplyZrkbI^LiqgYDWdurPc$6H;BrIgF`6N*H?m|`oX zDo3#kD7I3z^ELxFeB^0O^l@l4AFMb642>223Bj88CkHC`<0Xdz?t2}q8 z*AuFoALVo>|LY9%E2S@EGXhJno}br{!a{Vj?7&(K(vi3w-#S>p#BqqoDCY7xEMG(a1yd^T+=Py70gF{{lfZDN6tV literal 0 HcmV?d00001 diff --git a/tests/resources/file_df_connection/xls/with_header/file.xls b/tests/resources/file_df_connection/xls/with_header/file.xls new file mode 100644 index 0000000000000000000000000000000000000000..efb43b4a972dd3618e170aaf8e19bbcf06e7189a GIT binary patch literal 5632 zcmeI0ZHQD=7{~wj&b>3U+rG`6RcfPQSj%$L51U`Y(3wPAzl>N5nQLI9JDbZ+yD?2g zL1pQF>kWE=tS<`|RG?{KRw@Z)(Ti_EMo{i51yK}5sf0HDpXbcAnM*7ak!0Lwm~+na zoadb1InQ~ycjnN+#N@>Gt|Kx!n&Mo@ zsEfB-KaR;-%6d=1#%s<#roeH2Z^fNhJ~moNpGR4&W5`=FhJ{Qmyg*AEF40kWny%9{ zsp&TL+^uO=Estrs5FRG20UXRM@41$Jfj!+~cWYf6Qwwisn=jPTiy8QUK9z@_l+Dgs z7uT1D^Tq!_g~d_rmT>$lqe?j~*aB{WzQSmslrN^7cD#sYU=?LvE;nz)+&WwEYb6c*B7+opCmq_)ZyYS7fz!eiG4YRaT`bw&rxS^6I-B7F!4O)h%VUEh7#?Kd zu5xM3J^A9@1(t_L%cjhoHkdCLjM8S5^6(v1!$rVQaV%f;C4udGO?HjR=CcEs%dDgB zGHcaC-okVc?OIOANYH^e&b<87k=)u9nKf$JK#Pc+{3~Q$sNi8~!NwdB#nj6c*`qD4 z*7Ona!KGU6(zKJ;1{t$g%%~P_*r-06w01W8nXmF`=E|D#RV7B!i_(QInctI;CVBIh)Rn#9P|xzt>0!jbm1oCob@wRyA}f0>8T_Ar;d>EQ=9 zOXArbHW?Ad+hXsXxcua0t0$iRI&1oH>Gvc3gd*b#QpD!ztYlQODt}UJrM$ukB=O-c zpJFSe@B8a^F5Yh_Hs|5tRr(1<<{PAl&ErltbBQQLo}Dt6h*E|aiX?vC>r*0189BZx zvFVhdM3izn{e&WM3{u4A@na}X9mP>fH=#-NyysIKr9AQZy~U+-zR`#|N|{AJp~(6K zDPr?5vr2K5BD-AXB74}wS5%W2_{gWYO4)JXnfLb(7>cWuWAqb>tZ|ScHV<7@O0{Fsj`Rv`-yVpNzC{d+6Pd}kZyn__6c}P_$u{ugjDL1zWW$#g+5>v`= zZykQ%+dYO7Q_4#EEwVlAlnsTfBvmmd#@E6;j6fKHFalu&!U$ZT5s;TPdCJL4jl6Q0 zmjvhoR{qyH%dh02{68ab6UO){BaNjPWnRL4+y#xxOt_XG8QDvXi2r|~_^u?qgb}RO paf~$5yv~O|d(QD?9Cek`DdzXkbRLb?{cV2j{2PT#GeE2g&agWhvmEweBQztbnR`2$II zwg&N1&#LyW@zM}=im)AA>4s!?vZuW(>XhIMOaNLbGZTrK^XEsMDfkj)nWk7{!{1{1Wr+Pc{w~V!$0?ikSEpa9ti}4|9PZXD{R`=h{EC9bgMm zOLfQfalf_DAg4mh%4BlQtb3R$$-`#@z=pNl7^Zl^6tkI0Fh@#h){!z%w^|YPa+GM@ z)eTpQFz$VI<5;4jIle?KTWArH{r9X42PwQIckt$7Y0v?fgQ`aFcwY6};)AEPUZJ|2 zo0rE&Rr92D<>jsF)2ppjgsQ;sj^`?&?{jSqbC=$I~ir8H3Unnw*M-)pb zUvpg}IJn!RSX|^XG+dp#Yu=!t$bsnM8~O=F=9)_po6E#8bCET1bu=;;TPbe@g>rMB zN3oUCdA`-{y=^GAQdZDUD3WZs6tTJN#ZUrSlz>tynWqud?Dr@ErM!P?UDtx@OB%6& zQl6ooP$biGDPnUmIZbhtBJ-2EI7&IoWR0NhkVkQpvg63d2L{^=#Zk&R`U!(1(^b?9?Qtn*D<|3M=gt90hrMy-o zlzk^XN=PYpe;nR;d9R^_l(LY1b#?^1BowF@l@^o9-j!ea5%446N5GGO9|1oC|IrA@ zOQAf+<%LRKcgzcldC8Xlb?);kr7z<%0VJ`P`o6!ygq7`7(~I#<4EUFR`&a@@@Oa{LK013mL=jU-AC}a4RR? 
literal 0 HcmV?d00001 diff --git a/tests/resources/file_df_connection/xlsx/with_data_address/file.xls b/tests/resources/file_df_connection/xlsx/with_data_address/file.xls new file mode 100644 index 0000000000000000000000000000000000000000..bf2343c0a7cf9bb4c811a0925a2a903e64f845d9 GIT binary patch literal 4891 zcmZ`-2UHX5wod3xA%q&bG!di)lr9hn(h(4mNCzSGl0y$D(m}fPa_At^B=laD-jo(; z0@6g9ph&ns2YY_k6p|*3}>+q5}W`WcbAefE#yG1>sNC_(g+X z)-G1M?k=t#A{MT$!ro4fS}|IbU7|F%f2eQHJKx*m6ytoGztrz{=jM$#YnW#U&R@SX z7Av9xsa9s4wMFuSz-C!}NNOf3#a`^{w-krNn2_lfjyjhU3*z>~@5H))=$rO1lbePF z0LbJA0Kj;Cy&XkRb~a8nzjv{pD(@I0Jm#e60(L9Myls!2DEn?asl$Lhke*mfTJVlG zM+nI1nWK+&d~jg9>J6H~pL_(XVBPy!v8&W=ppM@PI+}whK&FH2KoA?uF zQ5GZ4z!8_?Rc+(&WrMDs4tv{Mh7PaCC+<8B>PIILH>n-xPtuc#Uy_x{G8*ypKknJd zi5{3kNjO24R`$K6>INsGTv>^h?lxz4WqomMFHj=KiSnI z)kQ9KB}_bwn;I3U)rF9yzj<<`8pPH6zVhrSrMbTV>YIg@8F5GoM3w=IcbYRs6LB6J zt2xDq(^|?St}R03`={Ipi!gp?S+KhdtF5yC=<#uUA+Px8?g_=Vwwx`8VRDtp=;&&8 zv^#h6{IL=GSVz^Fd%Cov=E`&uoJS?+&Gn{LkW{VOC2{aAvaHZldtFw70&=nB+|X~B zzL%gSFuJ~t?9Jk{@ULYq>GJn;jnw9XztO-UA2K~YdW<`sk`}_pkMFg-&rdO;2c|GB zO1zL1>Yeg_-mzbvDaQt;9ZsSoQ3_4r;iD~;4$!&-J{K&UJI|a{+)EPS`K}%xi>%=; zM)3`AKp~y*N|wLE>?Y)V&xubmB`H|rS=b0{S!&>1!N}%4_DY_Z^ztr{pt6D`^9_i5 zEyI&Kx6)v9W7@uPZ|w;T2>0NnWTJxkn@_{>e~nDVUj`+Z-o(E<5)}t(XAhN9Pv6ON zGa@?6HtkmH+z1+cbk-0M`VQ(S6!TT}Eu;D>HDB?qO>qg*{PYLpt-*%&IWNK_6iV2| zDTwzQ*%j7&y(w-c_qFI&1o;7fjL+TdD*G1HAsr};9xdE${34$_QGb&gZQD5+g;;<) zBYI9N#O9_Ygs)oNcOQQ9+MqH~U`9HpxBmUT@woy)Q(!WNm)^Z_=lgcL@&k?wN}r`UnN%^87WURW@ALQy!YF zj3=yMi)?D0D+Romov!FfcOg^3wq*kgHr1W?*hcRm4yxMh7OA z?^q=<7OCeQGV8ak3##?plWJFi*3qUgwXA%6?WV2YKJAE$n&Y(1cA6rdj|SC9@=5qcV`Z92hA zOBY#1Ma;$MzOuWUXzvw0{axsZQ?P-!^*hD^SVX5h)EzcszXXPKg>^CVy_AF$nb6GQhE8qXo~$rd{NpJE3-go`VlMyD+uUB zM@Bvdm07kmRqCAZX3Apx_MdQO;HIzz$MZ{6{%^~S3#|n~w=_uoITA>$q7h#t%wp** z%ZsYZ`{u-`d>$JeQsq99S`1T3hd|#&y_9}N;iB|TaqSlN?IpY8OA@T{#HM0tbvN=? zwJGG@pYL9(>gtJE-ic}l{#ihV&)Ht45&!^!`1;56yMTDPxI1{*+1Q{wM1I|VRgve( zJI)!ROd&4BZa@+ioFs8U#U!SbL#e#(tSBP!Z-SBN3`eh983k`WXJT*1TOP9K94hcd zJYX&KW%E?t*wC3ihJki53xVnh7PVj#c|ap$YpG+wmc3JveFl{u@77U6i32#Ph-Y)F zD0Z;dyIgNgxoxv8F(xqRFH=xdGLT6MSuE?~KI_A{n6E-4D#N&)QRNhrQBP zjmEKSx%6F5=d#Yc)x${xp!A9a4QaL58Nm9?Lz*Aw9Lrx^= z#5uM|CfZK{dBEzvRhG<8PjQcP_%0k|!Q>vOvQ|jIe?zPaES42sny(5JYfYFr95{HK zIznT`UXqLooheZqa8F-}E_zU<`aFdq(s+wIaC%kc+jpk4=LcX%9Mm0b!8p=?Tj7yk#0pO8a7Z@Iv z`+CtbrcDGlfFJGPB#D$|C(kk-6dzF zchcFRcHFs4+JGe{?_923%4V_Ql4>F1k2|1YH4Q>AE6(6zL3U-^g zH=8|BK8`jXKO3QW!$p@lQJVAnLhUsGq<#>wh|u-tLKk4YL}R~ z4rsa^bXmha_%QD==WXjySZ0+@M;F83mhfg6|NW@+|G;*U81w1NB*$0oc z8ywCtzS`&LPf%}RI&J)GcEkBRUo!dDR14i}=KvwnANRM=-RH2g=Js-qBYw}j$z^#| zWkl(+`HM^GZ)%8U>y(XiPiK|rh^Ke$S5~iMn{qM)(e>?1ybgtENJn(`j-<%#vI&c6 z?t*u_GFY=*QvQC|noN;g%79gs1)c4)I%otmb2pML-pqXpF5z zzO2)c%0RGGSA+nGK*>; z;UQsm!0yX^`xUkWHx6^jvl3X9=Dv21c<(4AZm$u@t=0Xm4+kJDBbbt1y7h_F_u5c| zRmn957SE?{JX&HzzGCn5)q*m$336P~RS4SGNC+H2sY>&jiJ%^&T1Cc5LKU&{S^nEc zG+k?3ao%Cz-kQjtbWVwEX6?Y!7>uVhp3wj^cN<3!5#gVERzjn5w?1En`<@-CxFc0i7gfr1Mx%!`e%AoJQ^y;N z9En%1cq_y%88)GK8*2rtG8R%$6lQ)@-)hZJ>``z({NZ#~p*&VZN6F=JrHrPVGV+cF z^zz>DM4ePH!a%my$8nRJl~z|UzKD$z zH#-B0nDQ4cHKMfUF_WT9~0sx$NL#cnq?awM?>Ehz>GjgBZ9k=gN zq`^IRB!{B>;Tvo&WeUlzC}oIMl%J*%8#s z;f+nlVSGN(RG)<1<4#M9F{o0GnJ__pH|bLotqH!tEcFl_Vz?t*`L*fa9V@qYb= z_c7$Re}8l7|9bfsuWIAvFFzHf3aO-B14!zY!iY6Zj0A;gh!44GX?rUu>P13;aWW{2 zSJ~GCcl)(-?C89l|E@dafjf{4SPy>M1r#7Kk-{S@qB{1BFF!m8C25&xwWmqA+N3lC zwa)&WmX|4a4nzGlkMx&1dnL6;E!twTQL17spiU22)$FpXyV0rlVNf$UuSA?V@>;U1 zG3SSVPA8-xx=KeIv+l!q^Q%h5@CtdwB9*H*>ij}0V-x3-z>GI=l<@bzFaqY{jIwb? 
znHl)F+IX1!BvN^zmUg!&4c?G5%l~*#~ufZogICGNla({1~--%QO)0Ngx3o;eVfl;5qZ_dW`q)|DTFn zL|>dr{Kf(RkAYSA`^aDx!4W-TMDq9{Fjt}dV`D5 zi<$QyXbjaKdH5pmVkY|s=!I`c{zvow&1)CI7sK=)@bu+h;Qtb|7v)@xgn#6;;Ir$0 Yhznf}V!U5}4#CI(Kk%QJ1oZRiU#uEuS^xk5 literal 0 HcmV?d00001 diff --git a/tests/resources/file_df_connection/xlsx/with_header/file.xls b/tests/resources/file_df_connection/xlsx/with_header/file.xls new file mode 100644 index 0000000000000000000000000000000000000000..b19c54d02d7c37068481b74f543c0e22f9cc24ca GIT binary patch literal 5026 zcmZ`-1yqz<*B)94VHk16XI0onfo+z~ z7CLUuF7A9!TwHj)933=cHA&j}DenJN+njZJxX&WM@}*#**H=aKZoH+!^U!lYy|y^C z%n7!IGRqEnM}elSksOr_-FRly8bX6v z=Sr;) zTQC&l0Lt;xzd`J`5*KNh))?CYI>h{YL6jXcK~dM#y{P)eO%`BaKKAji)#zD+5L<^l zI%P{Ue^ZE!?6AJBvGZe=>eZVcQ5s+a;4^)*4VdlK2u!RLM$}78U!dt9;mttmQyEyrbe(SLv|FmMs_eMTLFgCXJ1}((;xMC3VfJEQ$pH zb$LGCKZ=)Xdi(TQGI)xqX;#i;AG$043V7Moi9c>TN}npM&`M6U z8++3%2I`N|R(4{ac-#7cZ6aBTLov7KPU8wttVWeg5F~RgJ1pH!hY`2%xp50330X`*xz_? z4QvR3+msLt&%$j+C4DXlPT(TMj0p@(xYi6GlvqP-oA#Lz9I=_@?LZ!RX> ziyYGq)wYeGzNZ)UFT%!z93ZjB%3o;JR>-;D$ZQG<6BT3{UH=$jV3+$YTv)n)yIoiG_|W)ry|G~=jX_@XnX08pjPn!UjLz@y@qd6d82ir?Ecnms3_Q+k`oMh zRv|DwAKMC1ER@h&&Hz$k2)8kr2yCPiP=U zJ&1SiIN9v<0xyYA;WzlF8#=B}j*OhGx9L_E>`A6Fbw?cLv+m}Fr4Mx|gnRowPPb0! z(h;U|bSUG1DAk$OPaHs~7A(!~ye0}p=;9tecxWa7>%!xS&-c@aR@`t|PcwPEJd(JK zF1D(1LfrPGcVv4Y-fo)?vM&8RXI0hqkZI^4?69)MPO6$(Yv|IbyeF3ws6|6UsbY~# zTdbCUM6cJf&ZCNaDE3KFsFpH~u6cQ_+f_@ip7MDuIj&$sllWU7D*@}`!}&X;*CDV7 zBg!SsM>}KdS?~%Uw&$m5o^57_7cRZZWIkekA!KIc)zG#5!IdP#qv3EaYErrzP> zlzoT$C))IjR5h)X(osRi0`rG(_bi|KcEz%v&~fGy!Fh2Pw8R{q=^6;4Q23(FKQi(q zu++Sz5uts;3BOnEdl1Nybv}+R#LO;``h6)iF0$kS%BU0h-Ap93h=zR=ejG<-US3>P z-Zd>i>irUWM4I?}7XyDEeJLS};@6J|^xFcrwCy<4!NMJzdU zxkpynj*MO0iTZ@|&kKZ)wNY*W7XX;O4gk>ozCb*k-R#|MtgJlT`TiXKyhTz{cbumA z=|Y_eR&n9T1H*R|wp7Cyiw9$N=%HQ$Jl&*pal+cBTqYaKtYPAi`*8R%qYNTxVTof5 zRtJ*TzP$Sktyz#geikmaYjAI5|3=S8n}L)VeX@zCCAF?~hCzt4{NmUriShM|4;fMv z19nwYLM1mE?K+b(J;I9@zs_rXITVf=-MB!y*UZ}bvhhYq%}3Ja1#IUg({~2^(W$y%DNNdJsQhH zokCzX$2{Nj-InbsvObHzW2p#iPo$XfZ{Zp!gD(E)+>*zJU6N|qJC^(Xy$tOcQ>I5Qhezr zPVSRK=rNY$wJ(Nu2H!#S1M%CU<=b5{Y;sKu(7UGGvf&GEAg?s8z71*s^k*^y%8 z`cV!clO?k46pCU6sx;qhHj8k%$Jv$e1l+#0z3U%#G#69a_GV?IC;%?Ot6tK@P%u!d zuxje1nnkP1ZvVNq3s2zrM?7;s?y7a_L5EnxE7vaEG2F)>7`2LO>|qGo5n04X{RN~C z(vpQMRvMZ@-=GPP0s5M9)Ss>m+|*fAiZ^}zC8#8Z2?h%=@O@} z=Y@Q*xQfAN&$JM)RMmoyhl5P9$S)_G8&&LNaH`UX;n28rGGzYZ(bFZ# zVYm68pcb`qE7Z;1{!8^+Jp2=)5567;&SGV=@lfWU6d+?7Z9KtJppp~-Ap1=LcMop| zEB9a7)2tyncAB5!f|_Sfm}E-xI-824L20@EJGE_vGn)tn$VsBaQ-ZVa-;t~);~(rw zF-39|zOh5_i~}*?(Ks_cY%JR7eVTlYw>}Krzf?c_MPn3`?Ue4OQWR4-t~qemiJ`-0 z3RgAK%gsHhE76Vm&YB(f%|gdsrM={V0(9H`^Buhm<>ha8+2$VQsi!IvdBzknrdUxC zP0T;4__-K@20;7{)x%*>LrZQELiMwy#j0Gn3tyQ;9!CjB#6rNt#i7va zCOJ!!h`t{969MOvn*I_Z(M_@J1M2&y2dF&PJV!FAXl~DuSHrXL9^0<9>qD zGIqL_@@=l;*e$wi0B?C2WdXL+8>+zUTvvX1@NFqpWR#4s8AGuQP)-P!YQEf{93 zGIFo!qy_vg| zQ4XgC{j<`00fs#ao}s|#6o}oem?>SWU%JLO~(XU&-rWWs$`F5D&5fk}q?RmfNWFgVo`W$DAX_$OVRy$N&6}Ks}Gc4PTu~(2Z(3JGpCX;PUbrCdo zb32(sH8m92+79Hn9@c7$1VV5la1;^?eaTHf#w{)T{vmy#Y;=zQPmZKTHZithxdFoR zgakW2cC&JD=i~j=XD2o|b?}qQ26PVR@DmNQMhICQ(BPlo803>rQW8WeW`9CV7rDlh z(0fZsQqC~j%I=62)<(UhyP$YrYdltuqf*NmXFHf6U-1RPED}B{dmn9~P-zU2mgS{? 
zTKCbC=8b!yVT8#_z1>6$LPLW8zVp2n)7qAK1o;tBpQ!?2H9xw5xgbrASaO`SuIIKQ{_) zWGLmw0RY&s8zuiO2EXn@b7yD!UvdzmJEGPhOL3m!a?|X=NKnycl;WKT8Xo^o99d5{ zXB{We)zb*iBnrrnCDhy?q@#c4P{R)r5!;*Q$+}hyw57EL_YpS7h<=gRDdxf$c&sXb zVT7^cti_#H4rV-` zX4#aievYcVBN74ATM^Hi5_=nF`jyPdA<)Y}&yT&@FeDBT<_8r?%6ASTlE5*A1zHzZbh>O<2`l>zLganAeIP zgHnHBm`!^P^tv_!46Z0G2Xj__IX0+2umJ}D4({)R^xq-=^H9|!NZknLCk;hVt^!1K z-YO7iJc9D@QV<-mQ&M(T5ZCdA;>1gMJb9mUCt$BvE7yj~)9H2X5r>jJ;b+Uf?L9yt ztU8i+aG77*j<#&cD2%Xq^rIa`;;lxxNg>Oe-Hd#=ms)S;SXUUSVuIca5calp!r?C+DwOuBwlxDSc6 z%$eA{DymtX0)0cNyfDI z+o{nPh^SFcbhd5XO#XVmq``gI%$kiCsuNinVsgUd^40gAteTQSRl+ONB8!xba#;!E zR-fsm&-4mtFo8tL$K_P81@7A_JQnM%5f1@Pt{nd8p{A+xwx`5BpEZer;#K$o-P!ED zvkxuVRV<4UUaarGZ=cC&!Gc>729_^KZADiC&7o^UKYY7re=-an0NZyz$y7CuYk>pS zq84RL1F(_A!KK3c-{lzWnfdd1iH-07FV0-GzFHjlZ3_Ur#Hqwi|D{xN72s-e{1-sr z0k-CX1^Cxg`6|lQ{NgVZoNw5q8;kOHwsF<;s_g!4N{7u<{!4_f8eWyLzYTG2{N?%o ziP}};t2F)F*c2;?Scm@yX|Lj3Wy0S$3fS7(zwkmwod6rxU$wYvfS=e;90UCI_8;g< B$p8QV literal 0 HcmV?d00001 diff --git a/tests/resources/file_df_connection/xlsx/without_header/file.xls b/tests/resources/file_df_connection/xlsx/without_header/file.xls new file mode 100644 index 0000000000000000000000000000000000000000..78632de24b88a9e74dffb761070c678595bafc75 GIT binary patch literal 4881 zcmZ`-cUTkOwhbKv3ZWCaAfklco0Lc=bP$QsBZL|t(xeCi(xrnmrGrS7BE5qF>79Ud z0TF2$MFDTr_wMh>yLZ0t%$z^YT64~xwaaXMZ6abi002OaA6x)6^G?bT{Hz*3sPV(j z%~s#T4dp3hjY0|fxH!Y&U=&@#)Hi=>Z7;f->~o56PUbIT9%x8O#@j)?!cP2+I-g+W z-wDSw(GQsQaC=LH_Y?<$FzSGM3bk(NL@BfS6)@6_U1``ins&~?UiA#Ok%}vWBqqEOeTq2;a9fr=z&AnHG+Ew_|IP_N4D5`hV^B>76WYYEl3o z>nZ?1i{G!0vk=+=;ez;c7x}g2U30kSyfj_lUgfBdJa!9STo6gk4gXf# z_Y$;B8_>{3{$?pQ;%k{(hQh5}GtK$XZ`5kw)hy5Vo@35G$O_fQa3(Da`DteK1Zhl5 z;x8m1z0*D^9S7xE@@%v;L&+4R%He4|yfmdUfiMl)GycN)v#bfF{bV7Y?^+2@kgvFl z(Y!-jV&G1-N|yVN9meJT&Pa~4B!H|5ENq1KEU(nKLXpk;?3Fxm8RcCdew7nS_tH=+bDF*}AKmd<(8=wW5=l3#-h3QNxIa9da0!%XA%$N%932m8XAhUw%Gk|w zHzPjHvFO(9+zJ`IcltUod`iq268BYol2L1&inmyCTU4AZKjZd=)=*Q&XD=R$-zZ@h z1(F;zvftS7^8rew_O<9&gghYlIW{lVRrW2ULncTtV5D%b@v}nic!LyofPLpgH2j^K zE4=4Ng~fQ%f=TW;14QHVhs zJhVU5C%4t{N^5UJbk1a`q=k@ zWx9PzpT0Pqi}On!h+2cy>zM-x-LjpPz;m)kl?H@|h9*`b@IE5`_&k4|7}YJ*X4)Og zwXwuCY!RZ)wep%bv&&^enJ#2Ht$o?RJ4AJ-3EPMX{IIIcQK6PzcjWAL`I~2yAYBFu z8V%cI#v-k}BW9zvO@7TD6X|wUv3i;`rk1t${qDL(uW7vMsR%_In`KJ<5F++Phf4yK z7a;J6+cc}NySr1HnOPNnm%YBHd3RbJo}w`7R2y=emY?rJD*|zgYu{ayv>sA6UZWo_zabN(3V9T+YcYO> zhAyg#l7x%VV{LCQ$XiM5zisAW`8JIa5$%>L@JXQtolF4dPm&8FFpMVDkySlM?(7Vl{x(1O6u zfT*a4pfa1brb@lzD_L^24-SGkGf$?m1-Qj!O8?0+^Flj*kfJu3KSv^|Z4CUgxaAW% zoARRS^1gWyO5cZON0hnuq?aD6W`M<}qF>6y0^O9Slr|KxlNTLu7sXijLE$&h{==b@jxp>_)c}{GC7+uLENe2mt^jKL0WONg&>C9!{PP2n5lq7Qz28SamR#jSvzL`My9(9R>8WoW6 z;l4aDH>o{9Dkl1K$(L>I)L3l()-n0ICS?&q$Yi9h^aWHhCwaNK)MyDIJc67)^3urk z^TU;7n73_Lu=V6?^)<9@epoU@AJ%9@e_rb}dLZrigVX5)tzv|IZcq+;N-1I(F@v*V zg3It|ePrC8Sse_&eaiD~Dr#^mg;x9U@z^yw+MW%;|e1mh7 zaCp_6n!Wwb0^NNt$w=(i`i;7BwvP)6P6+XG=1|w+7BNGYg8EONse&1A=3)m7b&WoW zGcnRrnCz4R`>w~^Mlcmzx%Q%}pI3v#!ug&Rn+Uq6O)4g0CA%k>u?r-U#l>qMh#xt%u>xx(tc9Hx0Q=3YcuO-#fXWwk*tVND*N zrwVySS7UecafknNJ(L4#uLFjFMxk7#dF+{-^gh)DBKLbHC)AHx z3x4rJ<}3OAv~?&LvY}3X+Ri2b8c+0CSK#HkziW)6IqRItW$P=TeJ~+F_SPA6zjd~PUO;bx&$3{; z=2?{bksxO>eLAi+`Fp<2p2=6Qu@kbtWvsq;GrRyV<9G@H;Nl+{d!l`v5uU#)pheRq zw|Qaevs;k%R{*3@2sxE@m@HB!WefV%A%ZCNgj+ich?^N>8*om8S&p+tbL~rGx0PPS27gMi0juZ zNFPNvj+UEW!^USL>xQ<5s&1aFX(x5iaQ{z#*4i6v@4$NjL&PR*0Ix1OQn z`cs<2^|a!uDzWj&o+jvfB`^LVJu1Bu%Hj4I1C`9Cqg@3w6}ggz46$%wpmk$OdR@yj zztpz$;;LN4XWT@o2k4`dSB8$;_R~>?tfcZ%3Jc4P?X*-1jP?$B-7Lp45=KwpP;M~H zM~r0|YGSfrZe6kSzB6!9e96kEM)rltfQj~T;JKAo1G|yObz=28BYGL}( 
zczG)C*>hZ57r-_>%j6GeXkWhKR2Ts6h{@TN5V~15Zau?YFtsO(wag{Eib1`~ z63V3rTvvUkw{u!A7AclB69`xlsQIAA)G5tQ!Tr!Wu!>wb&fYrktl0ma!wvRsn@o}f z=7FcVtRp}!->;n~p)#c`D>@V6aY@8-WAapL@wS_(Jo^{Rtg8u#j)-ys_g)@2uCX1u zb6827mO!g?4s?4&dq=?W`;7$Lue;k4Yc;zu&S>u)1%ORh7pcs+9Gfr$|N zi7e!6hGgjyK0^gm!D;%DU^M`RI(0u2VFO6Liky>_GIH&c!ngMUbgga0c}GF}8$y4} zIW4N0wF56>TD+w3iUwGEAe=pg1b^ML6B}K-g(;PSdPj4F$ws*%#1IDz#K#1tc~rAB zBvGo_?UnO|?(r1Nz6$a*3+zaxUFm}Q=u)OrYD1*?)N2Bb`YTV6!wD)ClMr@^$Ky&j zv9{1EbI1)PLFRi6t#%B>o&~oe?#yHtDqw~5l-(Xy%IdhQAT_kbF6|GE2Z???qLfhG z(Be*kgMB78&b*d2$8s;5F=VJ>Ci2({gV_3U@e|vXwwG)D;9JM;4#vQ^1=}kz?l!e& z5+N9>kAZ-{4+?H#DHA3D0J!l7rTXKyztWG5o14=w&y6)0)9O~DK1o4wSQ(Cm6mCbW z3PdpQ2ek8K-rHVunZ?%5S9&K=L%yv}yc>pezg;-g@rOjjVbZ+0-WP$8j7abxX-lls zq>6qKFU}Ob0R_WJ;k~3vE!u9coOAr5J71_7&Ek<3N}$Q?=6L z!|cqJ-3}ghBEJ@;l+3q0)wxF$A{KlR_nQ+jjSFwpY^We+XTjdB7(I9o0SNAICXtxw zTNEPUk-TCv+VJ6k*iYUydz4m|zycm@UsTbH+Cqq{`qA5NS!dWrQk?0<8qb(@Wef~M zg{mSmho0c>O{UK3u5Umb+V}s&?13=)vncT@Lk71N#9dQL+h27BrAu4N=A?d7{CpsKCxFSE&P z2L5z}CStk%P=)Mwk51-1`+ z+F^5BKDO(vqZV_%gci+AZ|4q2@`sxxjh>@c_Ll|C^rOl`?;NwCK0J_9(o%y~Cq!Bl z+NNxkE6JL7_|2~lFhj9m64{d0wN$VzVQDoHr~S?tS_HXvC;aM1hc6-A36yd?<;>4^S&_XDq)-`7LDegA(iT;vz#vl{>?)0`oeFPe+q;1(DPyUALwJszk=|2;Q2824-kc~ zM*e5#{~OfKgU`F^Kj5KDzrp{-Y0vYV_k@3VYVe`;Klp{dHVNLYzgl19fS>qJeDbT~ G1^5>%<7VFg literal 0 HcmV?d00001 diff --git a/tests/resources/requirements.txt b/tests/resources/requirements.txt index 56d154dd9..033953205 100644 --- a/tests/resources/requirements.txt +++ b/tests/resources/requirements.txt @@ -1,2 +1,5 @@ avro[snappy] +openpyxl +pandas pyarrow +pandas-xlwt diff --git a/tests/tests_integration/test_file_format_integration/test_avro_integration.py b/tests/tests_integration/test_file_format_integration/test_avro_integration.py index a73fa06c5..cb687776c 100644 --- a/tests/tests_integration/test_file_format_integration/test_avro_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_avro_integration.py @@ -56,7 +56,7 @@ def test_avro_reader( """Reading Avro files working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) if spark_version < (2, 4): - pytest.skip("Avro only supported on Spark 2.4+") + pytest.skip("Avro files are supported on Spark 3.2+ only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files df = file_df_dataframe @@ -76,10 +76,10 @@ def test_avro_reader( @pytest.mark.parametrize( - "path, options", + "options", [ - ("without_compression", {}), - ("with_compression", {"compression": "snappy"}), + {}, + {"compression": "snappy"}, ], ids=["without_compression", "with_compression"], ) @@ -88,13 +88,12 @@ def test_avro_writer( local_fs_file_df_connection_with_path, file_df_dataframe, avro_schema, - path, options, ): """Written files can be read by Spark""" spark_version = get_spark_version(spark) if spark_version < (2, 4): - pytest.skip("Avro only supported on Spark 2.4+") + pytest.skip("Avro files are supported on Spark 3.2+ only") file_df_connection, source_path = local_fs_file_df_connection_with_path df = file_df_dataframe diff --git a/tests/tests_integration/test_file_format_integration/test_csv_integration.py b/tests/tests_integration/test_file_format_integration/test_csv_integration.py index a6cd14591..289e88273 100644 --- a/tests/tests_integration/test_file_format_integration/test_csv_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_csv_integration.py @@ -27,6 +27,7 @@ def test_csv_reader_with_infer_schema( local_fs_file_df_connection_with_path_and_files, 
file_df_dataframe, ): + """Reading CSV files with inferSchema=True working as expected on any Spark, Python and Java versions""" file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files df = file_df_dataframe csv_root = source_path / "csv/without_header" @@ -42,9 +43,13 @@ def test_csv_reader_with_infer_schema( expected_df = df - if get_spark_version(spark).major < 3: + spark_version = get_spark_version(spark) + if spark_version.major < 3: # Spark 2 infers "date_value" as timestamp instead of date expected_df = df.withColumn("date_value", col("date_value").cast("timestamp")) + elif spark_version < (3, 3): + # Spark 3.2 cannot infer "date_value", and return it as string + expected_df = df.withColumn("date_value", col("date_value").cast("string")) # csv does not have header, so columns are named like "_c0", "_c1", etc expected_df = reset_column_names(expected_df) diff --git a/tests/tests_integration/test_file_format_integration/test_excel_integration.py b/tests/tests_integration/test_file_format_integration/test_excel_integration.py new file mode 100644 index 000000000..de8cc9cf9 --- /dev/null +++ b/tests/tests_integration/test_file_format_integration/test_excel_integration.py @@ -0,0 +1,142 @@ +"""Integration tests for Excel file format. + +Test only that options are passed to Spark in both FileDFReader & FileDFWriter. +Do not test all the possible options and combinations, we are not testing Spark here. +""" + +import pytest + +from onetl._util.spark import get_spark_version +from onetl.file import FileDFReader, FileDFWriter +from onetl.file.format import Excel + +try: + from pyspark.sql.functions import col + + from tests.util.assert_df import assert_equal_df + from tests.util.spark_df import reset_column_names +except ImportError: + # pandas and spark can be missing if someone runs tests for file connections only + pass + +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] + + +@pytest.mark.parametrize("format", ["xlsx", "xls"]) +def test_excel_reader_with_infer_schema( + spark, + local_fs_file_df_connection_with_path_and_files, + file_df_dataframe, + format, +): + """Reading CSV files with inferSchema=True working as expected on any Spark, Python and Java versions""" + spark_version = get_spark_version(spark) + if spark_version < (3, 2): + pytest.skip("Excel files are supported on Spark 3.2+ only") + + file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files + df = file_df_dataframe + excel_root = source_path / format / "without_header" + + reader = FileDFReader( + connection=file_df_connection, + format=Excel(inferSchema=True), + source_path=excel_root, + ) + read_df = reader.run() + + assert read_df.count() + + expected_df = df + # Spark infers "date_value" as timestamp instead of date + expected_df = df.withColumn("date_value", col("date_value").cast("timestamp")) + + # excel does not have header, so columns are named like "_c0", "_c1", etc + expected_df = reset_column_names(expected_df) + + assert read_df.schema != df.schema + assert read_df.schema == expected_df.schema + assert_equal_df(read_df, expected_df) + + +@pytest.mark.parametrize("format", ["xlsx", "xls"]) +@pytest.mark.parametrize( + "path, options", + [ + ("without_header", {}), + ("with_header", {"header": True}), + ("with_data_address", {"dataAddress": "'ABC'!K6"}), + ], + ids=["without_header", "with_header", "with_data_address"], +) +def test_excel_reader_with_options( + spark, + 
local_fs_file_df_connection_with_path_and_files, + file_df_dataframe, + format, + path, + options, +): + """Reading Excel files working as expected on any Spark, Python and Java versions""" + spark_version = get_spark_version(spark) + if spark_version < (3, 2): + pytest.skip("Excel files are supported on Spark 3.2+ only") + + local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files + df = file_df_dataframe + excel_root = source_path / format / path + + reader = FileDFReader( + connection=local_fs, + format=Excel.parse(options), + df_schema=df.schema, + source_path=excel_root, + ) + read_df = reader.run() + + assert read_df.count() + assert read_df.schema == df.schema + assert_equal_df(read_df, df) + + +@pytest.mark.parametrize( + "options", + [ + {}, + {"header": True}, + ], + ids=["without_header", "with_header"], +) +def test_excel_writer( + spark, + local_fs_file_df_connection_with_path, + file_df_dataframe, + options, +): + """Written files can be read by Spark""" + spark_version = get_spark_version(spark) + if spark_version < (3, 2): + pytest.skip("Excel files are supported on Spark 3.2+ only") + + file_df_connection, source_path = local_fs_file_df_connection_with_path + df = file_df_dataframe + excel_root = source_path / "excel" + + writer = FileDFWriter( + connection=file_df_connection, + format=Excel.parse(options), + target_path=excel_root, + ) + writer.run(df) + + reader = FileDFReader( + connection=file_df_connection, + format=Excel.parse(options), + source_path=excel_root, + df_schema=df.schema, + ) + read_df = reader.run() + + assert read_df.count() + assert read_df.schema == df.schema + assert_equal_df(read_df, df) diff --git a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py new file mode 100644 index 000000000..e94386120 --- /dev/null +++ b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py @@ -0,0 +1,106 @@ +import logging + +import pytest + +from onetl.file.format import Excel + + +@pytest.mark.parametrize( + "spark_version", + [ + "2.2.1", + "2.3.1", + "2.4.8", + ], +) +def test_excel_get_packages_spark_version_not_supported(spark_version): + with pytest.raises(ValueError, match=f"Spark version should be at least 3.2, got {spark_version}"): + Excel.get_packages(spark_version=spark_version) + + +def test_excel_get_packages_scala_version_not_supported(): + with pytest.raises(ValueError, match="Scala version should be at least 2.12, got 2.11"): + Excel.get_packages(spark_version="3.2.4", scala_version="2.11") + + +def test_excel_get_packages_package_version_not_supported(): + with pytest.raises(ValueError, match="Package version should be at least 0.15, got 0.13.7"): + Excel.get_packages(spark_version="3.2.4", package_version="0.13.7") + + +@pytest.mark.parametrize( + "spark_version, scala_version, package_version, packages", + [ + # Detect Scala version by Spark version + ("3.2.4", None, None, ["com.crealytics:spark-excel_2.12:3.2.4_0.19.0"]), + ("3.4.1", None, None, ["com.crealytics:spark-excel_2.12:3.4.1_0.19.0"]), + # Override Scala version + ("3.2.4", "2.12", None, ["com.crealytics:spark-excel_2.12:3.2.4_0.19.0"]), + ("3.2.4", "2.13", None, ["com.crealytics:spark-excel_2.13:3.2.4_0.19.0"]), + ("3.4.1", "2.12", None, ["com.crealytics:spark-excel_2.12:3.4.1_0.19.0"]), + ("3.4.1", "2.13", None, ["com.crealytics:spark-excel_2.13:3.4.1_0.19.0"]), + # Override package version + ("3.2.0", None, "0.16.0", ["com.crealytics:spark-excel_2.12:3.2.0_0.16.0"]), + ("3.4.1", 
None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.4.1_0.18.0"]), + ], +) +def test_excel_get_packages(caplog, spark_version, scala_version, package_version, packages): + with caplog.at_level(level=logging.WARNING): + result = Excel.get_packages( + spark_version=spark_version, + scala_version=scala_version, + package_version=package_version, + ) + + if package_version: + assert f"Passed custom package version '{package_version}', it is not guaranteed to be supported" + + assert result == packages + + +def test_excel_options_default(): + excel = Excel() + assert not excel.header + + +def test_excel_options_default_override(): + excel = Excel(header=True) + assert excel.header + + +@pytest.mark.parametrize( + "known_option", + [ + "dataAddress", + "treatEmptyValuesAsNulls", + "setErrorCellsToFallbackValues", + "usePlainNumberFormat", + "inferSchema", + "addColorColumns", + "timestampFormat", + "maxRowsInMemory", + "maxByteArraySize", + "tempFileThreshold", + "excerptSize", + "workbookPassword", + "dateFormat", + ], +) +def test_excel_options_known(known_option): + excel = Excel.parse({known_option: "value"}) + assert getattr(excel, known_option) == "value" + + +def test_excel_options_unknown(caplog): + with caplog.at_level(logging.WARNING): + excel = Excel(unknown="abc") + assert excel.unknown == "abc" + + assert ("Options ['unknown'] are not known by Excel, are you sure they are valid?") in caplog.text + + +@pytest.mark.local_fs +def test_excel_missing_package(spark_no_packages): + msg = "Cannot import Java class 'com.crealytics.spark.excel.v2.ExcelDataSource'" + with pytest.raises(ValueError, match=msg): + Excel().check_if_supported(spark_no_packages) diff --git a/tests/util/spark_df.py b/tests/util/spark_df.py index 8e4c667b8..f4e239026 100644 --- a/tests/util/spark_df.py +++ b/tests/util/spark_df.py @@ -10,7 +10,7 @@ def reset_column_names(df: SparkDataFrame, columns: list[str] | None = None) -> """ Reset columns to ``_c0`` format. - If `columns` is None, reset all columns names. + If `columns` is None, apply to all columns in df. 
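+
+    For example (illustrative): a dataframe with columns ``["id", "name"]``
+    gets columns ``["_c0", "_c1"]``, keeping the original order.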
""" columns = columns or df.columns for i, column in enumerate(columns): From 252ce9a6fec72ea56ba16676db9f59682242b111 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Wed, 20 Sep 2023 13:46:35 +0300 Subject: [PATCH 12/26] [DOP-8837] - add Samba file_connection class (#150) * [DOP-8837] - add draft Samba connection class * [DOP-8837] - implement all abstract methods for Samba connection * [DOP-8837] - updated comments * [DOP-8837] - add unit tests * [DOP-8837] - update pytest.ini * [DOP-8837] - update docker-compose.yml * [DOP-8837] - add .sh script to allow dirs/files creation * [DOP-8837] - remove extra .env * [DOP-8837] - modified Samba connection methods * [DOP-8837] - update Samba connection method * [DOP-8837] - update Samba connection method * Update onetl/connection/file_connection/samba.py Co-authored-by: Maxim Martynov * [DOP-8837] - update Samba connection method * [DOP-8837] - update _create_dir method in Samba connection * [DOP-8837] - hardcode env vars in docker compose configuration * Update .env.docker Co-authored-by: Maxim Martynov --------- Co-authored-by: Maxim Martynov --- .env.docker | 10 + .env.local | 10 + conftest.py | 1 + docker-compose.yml | 12 + docker/Dockerfile | 1 + docker/samba/on_post_init.sh | 4 + onetl/connection/__init__.py | 2 + onetl/connection/file_connection/samba.py | 292 ++++++++++++++++++ pytest.ini | 1 + requirements/samba.txt | 1 + requirements/tests/samba.txt | 1 + setup.py | 2 + .../fixtures/connections/file_connections.py | 3 + tests/fixtures/connections/samba.py | 63 ++++ .../test_file_downloader_integration.py | 11 +- .../test_file_uploader_integration.py | 6 +- .../test_samba_file_connection_integration.py | 58 ++++ .../test_samba_unit.py | 47 +++ 18 files changed, 521 insertions(+), 4 deletions(-) create mode 100755 docker/samba/on_post_init.sh create mode 100644 onetl/connection/file_connection/samba.py create mode 100644 requirements/samba.txt create mode 100644 requirements/tests/samba.txt create mode 100644 tests/fixtures/connections/samba.py create mode 100644 tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py create mode 100644 tests/tests_unit/tests_file_connection_unit/test_samba_unit.py diff --git a/.env.docker b/.env.docker index b9c2105aa..cb0394806 100644 --- a/.env.docker +++ b/.env.docker @@ -87,6 +87,16 @@ ONETL_SFTP_PORT=2222 ONETL_SFTP_USER=onetl ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho +# Samba +ONETL_SAMBA_HOST=samba +ONETL_SAMBA_PROTOCOL=SMB +ONETL_SAMBA_UID=1000 +ONETL_SAMBA_GID=1000 +ONETL_SAMBA_PORT=445 +ONETL_SAMBA_SHARE=SmbShare +ONETL_SAMBA_USER=onetl +ONETL_SAMBA_PASSWORD=awd123fd1 + # Webdav ONETL_WEBDAV_HOST=webdav ONETL_WEBDAV_PORT=80 diff --git a/.env.local b/.env.local index af2551dbd..2e05030f3 100644 --- a/.env.local +++ b/.env.local @@ -87,6 +87,16 @@ export ONETL_SFTP_PORT=2222 export ONETL_SFTP_USER=onetl export ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho +# Samba +export ONETL_SAMBA_HOST=localhost +export ONETL_SAMBA_PROTOCOL=SMB +export ONETL_SAMBA_UID=1000 +export ONETL_SAMBA_GID=1000 +export ONETL_SAMBA_PORT=445 +export ONETL_SAMBA_SHARE=SmbShare +export ONETL_SAMBA_USER=onetl +export ONETL_SAMBA_PASSWORD=awd123fd1 + # Webdav export ONETL_WEBDAV_HOST=localhost export ONETL_WEBDAV_PORT=8000 diff --git a/conftest.py b/conftest.py index ab0b60a5c..52b6c5754 100644 --- a/conftest.py +++ b/conftest.py @@ -19,5 +19,6 @@ "tests.fixtures.connections.local_fs", 
"tests.fixtures.connections.s3", "tests.fixtures.connections.sftp", + "tests.fixtures.connections.samba", "tests.fixtures.connections.webdav", ] diff --git a/docker-compose.yml b/docker-compose.yml index a08d8fc38..3d93c02af 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -164,6 +164,18 @@ services: networks: - onetl + samba: + image: elswork/samba + restart: unless-stopped + ports: + - "139:139" + - "445:445" + volumes: + - ./docker/samba:/share/folder + command: '-u "1000:1000:onetl:onetl:awd123fd1" -s "SmbShare:/share/folder:rw:onetl"' + networks: + - onetl + s3: image: ${S3_IMAGE:-bitnami/minio:latest} restart: unless-stopped diff --git a/docker/Dockerfile b/docker/Dockerfile index 103cc2b26..817d4eab2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -57,6 +57,7 @@ RUN pip install \ -r /app/requirements/hdfs.txt \ -r /app/requirements/s3.txt \ -r /app/requirements/sftp.txt \ + -r /app/requirements/samba.txt \ -r /app/requirements/webdav.txt \ -r /app/requirements/kerberos.txt \ -r /app/requirements/docs.txt \ diff --git a/docker/samba/on_post_init.sh b/docker/samba/on_post_init.sh new file mode 100755 index 000000000..f71af2a03 --- /dev/null +++ b/docker/samba/on_post_init.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +# allow create files and directories +chmod 777 /share/folder diff --git a/onetl/connection/__init__.py b/onetl/connection/__init__.py index 1c50f7fee..3e40e2a2a 100644 --- a/onetl/connection/__init__.py +++ b/onetl/connection/__init__.py @@ -37,6 +37,7 @@ from onetl.connection.file_connection.ftps import FTPS from onetl.connection.file_connection.hdfs import HDFS from onetl.connection.file_connection.s3 import S3 + from onetl.connection.file_connection.samba import Samba from onetl.connection.file_connection.sftp import SFTP from onetl.connection.file_connection.webdav import WebDAV from onetl.connection.file_df_connection.spark_hdfs import SparkHDFS @@ -62,6 +63,7 @@ "HDFS": "hdfs", "S3": "s3", "SFTP": "sftp", + "Samba": "samba", "WebDAV": "webdav", } diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py new file mode 100644 index 000000000..7a7f21132 --- /dev/null +++ b/onetl/connection/file_connection/samba.py @@ -0,0 +1,292 @@ +# Copyright 2023 MTS (Mobile Telesystems) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import os +import stat +import textwrap +from io import BytesIO +from logging import getLogger +from pathlib import Path +from typing import Literal, Optional, Union + +from etl_entities.instance import Host +from pydantic import SecretStr, validator + +from onetl.connection.file_connection.file_connection import FileConnection +from onetl.hooks import slot, support_hooks +from onetl.impl import LocalPath, RemotePath, RemotePathStat + +try: + from smb.smb_structs import OperationFailure + from smb.SMBConnection import SMBConnection +except (ImportError, NameError) as e: + raise ImportError( + textwrap.dedent( + """ + Cannot import module "pysmb". 
+ + You should install package as follows: + pip install onetl[samba] + + or + pip install onetl[files] + """, + ).strip(), + ) from e + + +log = getLogger(__name__) + + +@support_hooks +class Samba(FileConnection): + """Samba file connection. + + Based on `pysmb library `_. + + Parameters + ---------- + host : str + Host of Samba source. For example: ``mydomain.com``. + + share : str + The name of the share on the Samba server. + + protocol : str, default: ``SMB`` + The protocol to use for the connection. Either ``SMB`` or ``NetBIOS``. + Affects the default port and the `is_direct_tcp` flag in `SMBConnection`. + + port : int, default: 445 + Port of Samba source. + + domain : str, default: `` + Domain name for the Samba connection. Empty strings means use ``host`` as domain name. + + auth_type : str, default: ``NTLMv2`` + The authentication type to use. Either ``NTLMv2`` or ``NTLMv1``. + Affects the `use_ntlm_v2` flag in `SMBConnection`. + + user : str, default: None + User, which have access to the file source. Can be `None` for anonymous connection. + + password : str, default: None + Password for file source connection. Can be `None` for anonymous connection. + + """ + + host: Host + share: str + protocol: Union[Literal["SMB"], Literal["NetBIOS"]] = "SMB" + port: Optional[int] = None + domain: Optional[str] = "" + auth_type: Union[Literal["NTLMv1"], Literal["NTLMv2"]] = "NTLMv2" + user: Optional[str] = None + password: Optional[SecretStr] = None + + @property + def instance_url(self) -> str: + return f"smb://{self.host}:{self.port}" + + @slot + def check(self): + log.info("|%s| Checking connection availability...", self.__class__.__name__) + self._log_parameters() + try: + available_shares = {share.name for share in self.client.listShares()} + if self.share in available_shares: + log.info("|%s| Connection is available.", self.__class__.__name__) + else: + log.error( + "|%s| Share %r not found among existing shares %r", + self.__class__.__name__, + self.share, + available_shares, + ) + raise ConnectionError("Failed to connect to the Samba server.") + except Exception as exc: + log.exception("|%s| Connection is unavailable", self.__class__.__name__) + raise RuntimeError("Connection is unavailable") from exc + + return self + + @slot + def path_exists(self, path: os.PathLike | str) -> bool: + try: + self.client.getAttributes(self.share, os.fspath(path)) + return True + except OperationFailure: + return False + + def _scan_entries(self, path: RemotePath) -> list: + if self._is_dir(path): + return [ + entry + for entry in self.client.listPath( + self.share, + os.fspath(path), + ) + if entry.filename not in {".", ".."} # Filter out '.' and '..' 
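+                # listPath also returns the "." and ".." pseudo-entries, hence the filter above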
+ ] + return [self.client.getAttributes(self.share, os.fspath(path))] + + def _extract_name_from_entry(self, entry) -> str: + return entry.filename + + def _is_dir_entry(self, top: RemotePath, entry) -> bool: + return entry.isDirectory + + def _is_file_entry(self, top: RemotePath, entry) -> bool: + return not entry.isDirectory + + def _extract_stat_from_entry(self, top: RemotePath, entry) -> RemotePathStat: + if entry.isDirectory: + return RemotePathStat(st_mode=stat.S_IFDIR) + + return RemotePathStat( + st_size=entry.file_size, + st_mtime=entry.last_write_time, + st_uid=entry.filename, + ) + + def _get_client(self) -> SMBConnection: + is_direct_tcp = self.protocol == "SMB" + use_ntlm_v2 = self.auth_type == "NTLMv2" + conn = SMBConnection( + username=self.user, + password=self.password.get_secret_value() if self.password else None, + my_name="optional_client_name", + remote_name=self.host, + domain=self.domain, + use_ntlm_v2=use_ntlm_v2, + sign_options=2, + is_direct_tcp=is_direct_tcp, + ) + conn.connect(self.host, port=self.port) + return conn + + def _is_client_closed(self, client: SMBConnection) -> bool: + try: + socket_fileno = client.sock.fileno() + except (AttributeError, OSError): + return True + + return socket_fileno == -1 + + def _close_client(self, client: SMBConnection) -> None: + self.client.close() + + def _download_file(self, remote_file_path: RemotePath, local_file_path: LocalPath) -> None: + with open(local_file_path, "wb") as local_file: + self.client.retrieveFile( + self.share, + os.fspath(remote_file_path), + local_file, + ) + + def _get_stat(self, path: RemotePath) -> RemotePathStat: + info = self.client.getAttributes(self.share, os.fspath(path)) + + if self.is_dir(os.fspath(path)): + return RemotePathStat(st_mode=stat.S_IFDIR) + + return RemotePathStat( + st_size=info.file_size, + st_mtime=info.last_write_time, + st_uid=info.filename, + ) + + def _remove_file(self, remote_file_path: RemotePath) -> None: + self.client.deleteFiles( + self.share, + os.fspath(remote_file_path), + ) + + def _create_dir(self, path: RemotePath) -> None: + path_obj = Path(path) + for parent in reversed(path_obj.parents): + # create dirs sequentially as .createDirectory(...) 
cannot create nested dirs + try: + self.client.getAttributes(self.share, os.fspath(parent)) + except OperationFailure: + self.client.createDirectory(self.share, os.fspath(parent)) + + self.client.createDirectory(self.share, os.fspath(path)) + + def _upload_file(self, local_file_path: LocalPath, remote_file_path: RemotePath) -> None: + with open(local_file_path, "rb") as file_obj: + self.client.storeFile( + self.share, + os.fspath(remote_file_path), + file_obj, + ) + + def _rename_file(self, source: RemotePath, target: RemotePath) -> None: + self.client.rename( + self.share, + os.fspath(source), + os.fspath(target), + ) + + def _remove_dir(self, path: RemotePath) -> None: + files = self.client.listPath(self.share, os.fspath(path)) + + for item in files: + if item.filename not in {".", ".."}: # skip current and parent directory entries + full_path = path / item.filename + if item.isDirectory: + # recursively delete subdirectory + self._remove_dir(full_path) + else: + self.client.deleteFiles(self.share, os.fspath(full_path)) + + self.client.deleteDirectory(self.share, os.fspath(path)) + + def _read_text(self, path: RemotePath, encoding: str) -> str: + return self._read_bytes(path).decode(encoding) + + def _read_bytes(self, path: RemotePath) -> bytes: + file_obj = BytesIO() + self.client.retrieveFile( + self.share, + os.fspath(path), + file_obj, + ) + file_obj.seek(0) + return file_obj.read() + + def _write_text(self, path: RemotePath, content: str, encoding: str) -> None: + self._write_bytes(path, bytes(content, encoding)) + + def _write_bytes(self, path: RemotePath, content: bytes) -> None: + file_obj = BytesIO(content) + + self.client.storeFile( + self.share, + os.fspath(path), + file_obj, + ) + + def _is_dir(self, path: RemotePath) -> bool: + return self.client.getAttributes(self.share, os.fspath(path)).isDirectory + + def _is_file(self, path: RemotePath) -> bool: + return not self.client.getAttributes(self.share, os.fspath(path)).isDirectory + + @validator("port", pre=True, always=True) + def _set_port_based_on_protocol(cls, port, values): + if port is None: + return 445 if values.get("protocol") == "SMB" else 139 + return port diff --git a/pytest.ini b/pytest.ini index 5e40e75d7..3c71e8eb6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -24,5 +24,6 @@ markers = postgres: Postgres tests s3: S3 tests sftp: SFTP tests + samba: Samba tests teradata: Teradata tests webdav: WebDAV tests diff --git a/requirements/samba.txt b/requirements/samba.txt new file mode 100644 index 000000000..619ee4f51 --- /dev/null +++ b/requirements/samba.txt @@ -0,0 +1 @@ +pysmb diff --git a/requirements/tests/samba.txt b/requirements/tests/samba.txt new file mode 100644 index 000000000..619ee4f51 --- /dev/null +++ b/requirements/tests/samba.txt @@ -0,0 +1 @@ +pysmb diff --git a/setup.py b/setup.py index 422085535..f8b560707 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ def parse_requirements(file: Path) -> list[str]: requirements_ftp = parse_requirements(here / "requirements" / "ftp.txt") requirements_sftp = parse_requirements(here / "requirements" / "sftp.txt") +requirements_samba = parse_requirements(here / "requirements" / "samba.txt") requirements_hdfs = parse_requirements(here / "requirements" / "hdfs.txt") requirements_s3 = parse_requirements(here / "requirements" / "s3.txt") requirements_webdav = parse_requirements(here / "requirements" / "webdav.txt") @@ -90,6 +91,7 @@ def parse_requirements(file: Path) -> list[str]: "ftp": requirements_ftp, "ftps": requirements_ftp, "sftp": requirements_sftp, + 
"samba": requirements_samba, "hdfs": requirements_hdfs, "s3": requirements_s3, "webdav": requirements_webdav, diff --git a/tests/fixtures/connections/file_connections.py b/tests/fixtures/connections/file_connections.py index e8ef7253e..f44240894 100644 --- a/tests/fixtures/connections/file_connections.py +++ b/tests/fixtures/connections/file_connections.py @@ -12,6 +12,7 @@ lazy_fixture("hdfs_file_connection"), lazy_fixture("s3_file_connection"), lazy_fixture("sftp_file_connection"), + lazy_fixture("samba_file_connection"), lazy_fixture("webdav_file_connection"), ], ) @@ -26,6 +27,7 @@ def file_connection(request): lazy_fixture("hdfs_file_connection_with_path"), lazy_fixture("s3_file_connection_with_path"), lazy_fixture("sftp_file_connection_with_path"), + lazy_fixture("samba_file_connection_with_path"), lazy_fixture("webdav_file_connection_with_path"), ], ) @@ -40,6 +42,7 @@ def file_connection_with_path(request): lazy_fixture("hdfs_file_connection_with_path_and_files"), lazy_fixture("s3_file_connection_with_path_and_files"), lazy_fixture("sftp_file_connection_with_path_and_files"), + lazy_fixture("samba_file_connection_with_path_and_files"), lazy_fixture("webdav_file_connection_with_path_and_files"), ], ) diff --git a/tests/fixtures/connections/samba.py b/tests/fixtures/connections/samba.py new file mode 100644 index 000000000..52a294d5b --- /dev/null +++ b/tests/fixtures/connections/samba.py @@ -0,0 +1,63 @@ +import os +from collections import namedtuple +from pathlib import PurePosixPath + +import pytest + +from tests.util.upload_files import upload_files + + +@pytest.fixture( + scope="session", + params=[ + pytest.param("real-samba", marks=[pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection]), + ], +) +def samba_server(): + SambaServer = namedtuple("SambaServer", ["host", "protocol", "port", "share", "user", "password"]) + + return SambaServer( + host=os.getenv("ONETL_SAMBA_HOST"), + protocol=os.getenv("ONETL_SAMBA_PROTOCOL"), + port=os.getenv("ONETL_SAMBA_PORT"), + share=os.getenv("ONETL_SAMBA_SHARE"), + user=os.getenv("ONETL_SAMBA_USER"), + password=os.getenv("ONETL_SAMBA_PASSWORD"), + ) + + +@pytest.fixture() +def samba_file_connection(samba_server): + from onetl.connection import Samba + + return Samba( + host=samba_server.host, + protocol=samba_server.protocol, + port=samba_server.port, + share=samba_server.share, + user=samba_server.user, + password=samba_server.password, + ) + + +@pytest.fixture() +def samba_file_connection_with_path(request, samba_file_connection): + connection = samba_file_connection + root = PurePosixPath("/data") + + def finalizer(): + connection.remove_dir(root, recursive=True) + + request.addfinalizer(finalizer) + + connection.remove_dir(root, recursive=True) + + return connection, root + + +@pytest.fixture() +def samba_file_connection_with_path_and_files(resource_path, samba_file_connection_with_path): + connection, upload_to = samba_file_connection_with_path + upload_from = resource_path / "file_connection" + files = upload_files(upload_from, upload_to, connection) + return connection, upload_to, files diff --git a/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py b/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py index ed290ab43..0a932dd46 100644 --- a/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py +++ b/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py @@ -635,10 +635,11 @@ def 
test_file_downloader_mode_replace_entire_directory( caplog, ): file_connection, remote_path, _ = file_connection_with_path_and_files + # Reason for using .resolve(): https://stackoverflow.com/a/58719476 if local_dir_exist: - local_path = tmp_path_factory.mktemp("local_path") + local_path = tmp_path_factory.mktemp("local_path").resolve() else: - local_path = Path(tempfile.gettempdir()) / secrets.token_hex() + local_path = Path(tempfile.gettempdir()).resolve() / secrets.token_hex() temp_file = local_path / secrets.token_hex(5) if local_dir_exist: @@ -755,7 +756,11 @@ def finalizer(): local_path=file.name, ) - with pytest.raises(NotADirectoryError, match=rf"'{file.name}' \(kind='file', .*\) is not a directory"): + # Reason for .realpath(): https://stackoverflow.com/a/58719476 + with pytest.raises( + NotADirectoryError, + match=rf"'{os.path.realpath(file.name)}' \(kind='file', .*\) is not a directory", + ): downloader.run() diff --git a/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py b/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py index 522cf2dd4..feedeaa45 100644 --- a/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py +++ b/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py @@ -490,7 +490,11 @@ def test_file_uploader_run_local_path_not_a_directory(file_connection): with tempfile.NamedTemporaryFile() as file: uploader = FileUploader(connection=file_connection, target_path=target_path, local_path=file.name) - with pytest.raises(NotADirectoryError, match=rf"'{file.name}' \(kind='file', .*\) is not a directory"): + # Reason for .realpath(): https://stackoverflow.com/a/58719476 + with pytest.raises( + NotADirectoryError, + match=rf"'{os.path.realpath(file.name)}' \(kind='file', .*\) is not a directory", + ): uploader.run() diff --git a/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py b/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py new file mode 100644 index 000000000..7c5c8f5d5 --- /dev/null +++ b/tests/tests_integration/tests_file_connection_integration/test_samba_file_connection_integration.py @@ -0,0 +1,58 @@ +import logging + +import pytest + +pytestmark = [pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection] + + +def test_samba_file_connection_check_success(samba_file_connection, caplog): + samba = samba_file_connection + with caplog.at_level(logging.INFO): + assert samba.check() == samba + + assert "|Samba|" in caplog.text + assert f"host = '{samba.host}'" in caplog.text + assert f"port = {samba.port}" in caplog.text + assert f"protocol = '{samba.protocol}'" in caplog.text + assert f"user = '{samba.user}'" in caplog.text + assert f"share = '{samba.share}'" in caplog.text + assert "password = SecretStr('**********')" in caplog.text + assert samba.password.get_secret_value() not in caplog.text + + assert "Connection is available." 
in caplog.text + + +def test_samba_file_connection_check_not_existing_share_failed(samba_server, caplog): + from onetl.connection import Samba + + not_existing_share = "NotExistingShare" + samba = Samba( + host=samba_server.host, + share=not_existing_share, + protocol=samba_server.protocol, + port=samba_server.port, + user=samba_server.user, + password=samba_server.password, + ) + + with caplog.at_level(logging.INFO): + with pytest.raises(RuntimeError, match="Connection is unavailable"): + samba.check() + + assert f"Share '{not_existing_share}' not found among existing shares" in caplog.text + + +def test_samba_file_connection_check_runtime_failed(samba_server): + from onetl.connection import Samba + + samba = Samba( + host=samba_server.host, + share=samba_server.share, + protocol=samba_server.protocol, + port=samba_server.port, + user="unknown", + password="unknown", + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + samba.check() diff --git a/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py b/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py new file mode 100644 index 000000000..42f95b368 --- /dev/null +++ b/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py @@ -0,0 +1,47 @@ +import pytest + +from onetl.connection import FileConnection + +pytestmark = [pytest.mark.samba, pytest.mark.file_connection, pytest.mark.connection] + + +def test_samba_connection(): + from onetl.connection import Samba + + samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd") + assert isinstance(samba, FileConnection) + assert samba.host == "some_host" + assert samba.protocol == "SMB" + assert samba.domain == "" + assert samba.auth_type == "NTLMv2" + assert samba.port == 445 + assert samba.user == "some_user" + assert samba.password != "pwd" + assert samba.password.get_secret_value() == "pwd" + + assert "password='pwd'" not in str(samba) + assert "password='pwd'" not in repr(samba) + + +def test_samba_connection_with_net_bios(): + from onetl.connection import Samba + + samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol="NetBIOS") + assert samba.protocol == "NetBIOS" + assert samba.port == 139 + + +@pytest.mark.parametrize("protocol", ["SMB", "NetBIOS"]) +def test_samba_connection_with_custom_port(protocol): + from onetl.connection import Samba + + samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol=protocol, port=444) + assert samba.protocol == protocol + assert samba.port == 444 + + +def test_samba_connection_without_mandatory_args(): + from onetl.connection import Samba + + with pytest.raises(ValueError): + Samba() From 6313b126f6d581b0ad76fbf2870d28169400a0d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 19 Sep 2023 07:49:40 +0000 Subject: [PATCH 13/26] [DOP-9007] Add ivysettings.xml examples to documentation --- README.rst | 2 +- .../next_release/151.improvement.rst | 1 + .../db_connection/greenplum/prerequisites.rst | 141 +-------- docs/index.rst | 2 +- docs/install.rst | 3 - docs/install/index.rst | 11 + docs/install/java_packages.rst | 281 ++++++++++++++++++ docs/install/python_packages.rst | 8 + 8 files changed, 310 insertions(+), 139 deletions(-) create mode 100644 docs/changelog/next_release/151.improvement.rst delete mode 100644 docs/install.rst 
create mode 100644 docs/install/index.rst create mode 100644 docs/install/java_packages.rst create mode 100644 docs/install/python_packages.rst diff --git a/README.rst b/README.rst index e20086214..ea9518d91 100644 --- a/README.rst +++ b/README.rst @@ -114,7 +114,7 @@ See https://onetl.readthedocs.io/ How to install --------------- -.. _minimal-install: +.. minimal-install Minimal installation ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/changelog/next_release/151.improvement.rst b/docs/changelog/next_release/151.improvement.rst new file mode 100644 index 000000000..d8da800ae --- /dev/null +++ b/docs/changelog/next_release/151.improvement.rst @@ -0,0 +1 @@ +Add documentation about different ways of passing packages to Spark session. diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 815a12b27..a545fdc27 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -7,7 +7,7 @@ Version Compatibility --------------------- * Greenplum server versions: 5.x, 6.x -* Spark versions: 2.3.x - 3.2.x (Spark 3.3.x is not supported yet) +* Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet) * Java versions: 8 - 11 See `official documentation `_. @@ -33,140 +33,13 @@ To use Greenplum connector you should download connector ``.jar`` file from `Pivotal website `_ and then pass it to Spark session. -There are several ways to do that. +.. warning:: -.. note:: - - Please pay attention to Spark <-> Scala version compatibility. See :ref:`spark-compatibility-matrix`. - -Using ``spark.jars`` -~~~~~~~~~~~~~~~~~~~~ - -The most simple solution, but this requires to store/deploy ``.jar`` file in the local environment. - -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Create Spark session with passing ``.jar`` absolute file path to ``spark.jars`` Spark config option, e.g. - -.. code:: python - - # no need to use spark.jars.packages - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars", "/path/to/downloaded.jar") - .getOrCreate() - ) - -Using ``spark.jars.repositories`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Can be used if you have access both to public repos (like Maven) and a private Artifactory/Nexus repo. - -* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Upload ``.jar`` file to private repository (with ``groupId=io.pivotal``, ``artifactoryId=greenplum-spark_2.12``). -* Pass repo URL to ``spark.jars.repositories`` Spark config option -* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option. - - -Example -^^^^^^^ - -.. code:: python - - maven_packages = Greenplum.get_packages(spark_version="3.2") - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars.repositories", "http://nexus.domain.com/example-repo/") - .config("spark.jars.packages", ",".join(maven_packages)) - .getOrCreate() - ) - - -Using ``spark.jars.ivySettings`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Same as above, but can be used even if there is no network access to public repos like Maven. - -* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. 
-* Upload ``.jar`` file to private repository (with ``groupId=io.pivotal``, ``artifactoryId=greenplum-spark_2.12``). -* Create `ivysettings.xml `_ file. -* Add here a resolver with repository URL (and credentials, if required). -* Pass ``ivysettings.xml`` absolute path to ``spark.jars.ivySettings`` Spark config option. -* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option. + Please pay attention to :ref:`Spark <-> Scala version compatibility `. -Example -^^^^^^^ +There are several ways to do that. See :ref:`java-packages` for details. -.. code-block:: xml - :caption: ivysettings.xml - - - - - - - - - - - - - - - - - - - - -.. code-block:: python - :caption: script.py - - maven_packages = Greenplum.get_packages(spark_version="3.2") - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars.ivySettings", "/path/to/ivysettings.xml") - .config("spark.jars.packages", ",".join(maven_packages)) - .getOrCreate() - ) - -Moving ``.jar`` file to ``~/.ivy2/jars/`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven. - -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Move it to ``~/.ivy2/jars/`` folder -* Create Spark session with passing Greenplum package name to ``spark.jars.packages`` Spark config option. - -Example -^^^^^^^ - -.. code:: python - - maven_packages = Greenplum.get_packages(spark_version="3.2") - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") - .config("spark.jars.packages", ",".join(maven_packages)) - .getOrCreate() - ) - -Inserting ``.jar`` file to Spark jars folder -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Can be used to embed ``.jar`` files to a default Spark classpath. - -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.4/jars/``. -* Create Spark session **WITHOUT** passing Greenplum package name to ``spark.jars.packages`` - - -Manually adding ``.jar`` files to ``CLASSPATH`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Can be used to embed ``.jar`` files to a default Java classpath. +.. note:: -* Download ``greenplum-connector-apache-spark-scala_2.12-2.1.4.jar`` file. -* Set environment variable ``CLASSPATH`` to ``/path/to/downloader.jar`` -* Create Spark session **WITHOUT** passing Greenplum package name to ``spark.jars.packages`` + If you're uploading package to private package repo, use ``groupId=io.pivotal`` and ``artifactoryId=greenplum-spark_2.12`` + (``2.12`` is Scala version) to give uploaded package a proper name. diff --git a/docs/index.rst b/docs/index.rst index cc8fdb87d..54ced3d06 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,7 +14,7 @@ :hidden: self - install + install/index quickstart concepts diff --git a/docs/install.rst b/docs/install.rst deleted file mode 100644 index abf328c75..000000000 --- a/docs/install.rst +++ /dev/null @@ -1,3 +0,0 @@ -.. include:: ../README.rst - :start-after: install - :end-before: quick-start diff --git a/docs/install/index.rst b/docs/install/index.rst new file mode 100644 index 000000000..86365e381 --- /dev/null +++ b/docs/install/index.rst @@ -0,0 +1,11 @@ +.. _install: + +How to install +============== + +.. 
toctree:: + :maxdepth: 1 + :caption: How to install + + python_packages + java_packages diff --git a/docs/install/java_packages.rst b/docs/install/java_packages.rst new file mode 100644 index 000000000..a64c9e7c5 --- /dev/null +++ b/docs/install/java_packages.rst @@ -0,0 +1,281 @@ +.. _java-packages: + +Java packages +============== + +``DB`` and ``FileDF`` connection classes require specific packages to be inserted to ``CLASSPATH`` of Spark session, +like JDBC drivers. + +This is usually done by setting up ``spark.jars.packages`` option while creating Spark session: + +.. code:: python + + # here is a list of packages to be downloaded: + maven_packages = Greenplum.get_packages(spark_version="3.2") + + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + + +Spark automatically resolves package and all its dependencies, download them and inject to Spark session +(both driver and all executors). + +This requires internet access, because package metadata and ``.jar`` files are fetched from `Maven Repository `_. + +But sometimes it is required to: + +* Install package without direct internet access (isolated network) +* Install package which is not available in Maven + +There are several ways to do that. + +Using ``spark.jars`` +-------------------- + +The most simple solution, but this requires to store raw ``.jar`` files somewhere on filesystem or web server. + +* Download ``package.jar`` files (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. +* (For ``spark.submit.deployMode=cluster``) place downloaded files to HDFS or deploy to any HTTP web server serving static files. See `official documentation `_ for more details. +* Create Spark session with passing ``.jar`` absolute file path to ``spark.jars`` Spark config option: + +.. tabs:: + + .. code-tab:: py for spark.submit.deployMode=client (default) + + jar_files = ["/path/to/package.jar"] + + # do not pass spark.jars.packages + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars", ",".join(jar_files)) + .getOrCreate() + ) + + .. code-tab:: py for spark.submit.deployMode=cluster + + # you can also pass URLs like http://domain.com/path/to/downloadable/package.jar + jar_files = ["hdfs:///path/to/package.jar"] + + # do not pass spark.jars.packages + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars", ",".join(jar_files)) + .getOrCreate() + ) + +Using ``spark.jars.repositories`` +--------------------------------- + +.. note:: + + In this case Spark still will try to fetch packages from the internet, so if you don't have internet access, + Spark session will be created with significant delay because of all attempts to fetch packages. + +Can be used if you have access both to public repos (like Maven) and a private Artifactory/Nexus repo. + +* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter. +* Upload ``package.jar`` file to private repository (with same ``groupId`` and ``artifactoryId`` as in source package in Maven). +* Pass repo URL to ``spark.jars.repositories`` Spark config option. +* Create Spark session with passing Package name to ``spark.jars.packages`` Spark config option: + +.. 
code:: python + + maven_packages = Greenplum.get_packages(spark_version="3.2") + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.repositories", "http://nexus.mydomain.com/private-repo/") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + + +Using ``spark.jars.ivySettings`` +-------------------------------- + +Same as above, but can be used even if there is no network access to public repos like Maven. + +* Setup private Maven repository in `JFrog Artifactory `_ or `Sonatype Nexus `_. +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter. +* Upload ``package.jar`` file to `private repository `_ (with same ``groupId`` and ``artifactoryId`` as in source package in Maven). +* Create ``ivysettings.xml`` file (see below). +* Add here a resolver with repository URL (and credentials, if required). +* Pass ``ivysettings.xml`` absolute path to ``spark.jars.ivySettings`` Spark config option. +* Create Spark session with passing package name to ``spark.jars.packages`` Spark config option: + +.. tabs:: + + .. code-tab:: xml ivysettings-all-packages-uploaded-to-nexus.xml + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-private-packages-in-nexus-public-in-maven.xml + + + + + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-private-packages-in-nexus-public-fetched-using-proxy-repo.xml + + + + + + + + + + + + + + + + + + .. code-tab:: xml ivysettings-nexus-with-auth-required.xml + + + + + + + + + + + + + + + + + + + + + + +.. code-block:: python + :caption: script.py + + maven_packages = Greenplum.get_packages(spark_version="3.2") + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.ivySettings", "/path/to/ivysettings.xml") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + +Place ``.jar`` file to ``-/.ivy2/jars/`` +---------------------------------------- + +Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven. + +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. +* Move it to ``-/.ivy2/jars/`` folder. +* Create Spark session with passing package name to ``spark.jars.packages`` Spark config option: + +.. code:: python + + maven_packages = Greenplum.get_packages(spark_version="3.2") + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() + ) + +Place ``.jar`` file to Spark jars folder +---------------------------------------- + +.. note:: + + Package file should be placed on all hosts/containers Spark is running, + both driver and all executors. + + Usually this is used only with either: + * ``spark.master=local`` (driver and executors are running on the same host), + * ``spark.master=k8s://...`` (``.jar`` files are added to image or to volume mounted to all pods). + +Can be used to embed ``.jar`` files to a default Spark classpath. + +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. +* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``. +* Create Spark session **WITHOUT** passing Package name to ``spark.jars.packages`` +.. 
code:: python + + # no need to set spark.jars.packages or any other spark.jars.* option + # all jars already present in CLASSPATH, and loaded automatically + + spark = SparkSession.builder.config("spark.app.name", "onetl").getOrCreate() + + +Manually adding ``.jar`` files to ``CLASSPATH`` +----------------------------------------------- + +.. note:: + + Package file should be placed on all hosts/containers Spark is running, + both driver and all executors. + + Usually this is used only with either: + * ``spark.master=local`` (driver and executors are running on the same host), + * ``spark.master=k8s://...`` (``.jar`` files are added to image or to volume mounted to all pods). + +Can be used to embed ``.jar`` files to a default Java classpath. + +* Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter. +* Set environment variable ``CLASSPATH`` to ``/path/to/package.jar``. You can set multiple file paths +* Create Spark session **WITHOUT** passing Package name to ``spark.jars.packages`` + +.. code:: python + + # no need to set spark.jars.packages or any other spark.jars.* option + # all jars already present in CLASSPATH, and loaded automatically + + import os + + jar_files = ["/path/to/package.jar"] + # different delimiters for Windows and Linux + delimiter = ";" if os.name == "nt" else ":" + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.driver.extraClassPath", delimiter.join(jar_files)) + .config("spark.executor.extraClassPath", delimiter.join(jar_files)) + .getOrCreate() + ) diff --git a/docs/install/python_packages.rst b/docs/install/python_packages.rst new file mode 100644 index 000000000..4459b2f37 --- /dev/null +++ b/docs/install/python_packages.rst @@ -0,0 +1,8 @@ +.. _python-packages: + +Python packages +=============== + +.. include:: ../../README.rst + :start-after: minimal-install + :end-before: quick-start From ab8632d94615e39e9f07503ff9a05073d24bd9d3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Sep 2023 05:10:06 +0000 Subject: [PATCH 14/26] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.11.0 → v3.13.0](https://github.com/asottile/pyupgrade/compare/v3.11.0...v3.13.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d43a63307..193ae3c3d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -59,7 +59,7 @@ repos: - id: rst-inline-touching-normal - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.11.0 + rev: v3.13.0 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] From 3719442ee1e3dc23cfc6bb6936da2df6a12507bb Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Tue, 26 Sep 2023 10:30:23 +0300 Subject: [PATCH 15/26] [DOP-8838] - add CI/CD for Samba connection (#152) * [DOP-8838] - add CI/CD for Samba connection * [DOP-8838] - update get-matrix.yml * [DOP-8838] - update services parameters * Revert "[DOP-8838] - update services parameters" This reverts commit 8f88cea6a5845aade610b7376ae99947c9bbe962. 
* [DOP-8838] - add 'options' parameter to workflow in test_samba.yml * [DOP-8838] - change way to start samba container * [DOP-8838] - change way to start samba container in docker-compose * [DOP-8838] - fix unsupported import * [DOP-8838] - change parameters for starting samba container * [DOP-8838] - update test-samba.yml * [DOP-8838] - update get-matrix.yml * [DOP-8838] - update tests.yml * [DOP-8838] - replace openssh-version with server-version * [DOP-8838] - replace openssh-version with server-version --- .github/workflows/data/samba/ignored.txt | 1 + .github/workflows/data/samba/matrix.yml | 18 +++++ .github/workflows/data/samba/tracked.txt | 1 + .github/workflows/get-matrix.yml | 33 +++++++++ .github/workflows/nightly.yml | 16 +++++ .github/workflows/test-samba.yml | 81 +++++++++++++++++++++++ .github/workflows/tests.yml | 16 +++++ docker-compose.yml | 4 +- docker/samba/custom_entrypoint.sh | 6 ++ docker/samba/on_post_init.sh | 4 -- onetl/connection/file_connection/samba.py | 3 +- 11 files changed, 176 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/data/samba/ignored.txt create mode 100644 .github/workflows/data/samba/matrix.yml create mode 100644 .github/workflows/data/samba/tracked.txt create mode 100644 .github/workflows/test-samba.yml create mode 100755 docker/samba/custom_entrypoint.sh delete mode 100755 docker/samba/on_post_init.sh diff --git a/.github/workflows/data/samba/ignored.txt b/.github/workflows/data/samba/ignored.txt new file mode 100644 index 000000000..d8f8d4692 --- /dev/null +++ b/.github/workflows/data/samba/ignored.txt @@ -0,0 +1 @@ +docs diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml new file mode 100644 index 000000000..a4a3afe30 --- /dev/null +++ b/.github/workflows/data/samba/matrix.yml @@ -0,0 +1,18 @@ +min: &min + python-version: '3.7' + os: ubuntu-latest + +max: &max + python-version: '3.11' + os: ubuntu-latest + +matrix: + small: + - server-version: latest + <<: *max + full: &full + - server-version: latest + <<: *min + - server-version: latest + <<: *max + nightly: *full diff --git a/.github/workflows/data/samba/tracked.txt b/.github/workflows/data/samba/tracked.txt new file mode 100644 index 000000000..5f7fcf905 --- /dev/null +++ b/.github/workflows/data/samba/tracked.txt @@ -0,0 +1 @@ +**/samba* diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index fd7e24aae..b9d160b42 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -41,6 +41,8 @@ on: value: ${{ jobs.get-matrix.outputs.matrix-s3 }} matrix-sftp: value: ${{ jobs.get-matrix.outputs.matrix-sftp }} + matrix-samba: + value: ${{ jobs.get-matrix.outputs.matrix-samba }} matrix-webdav: value: ${{ jobs.get-matrix.outputs.matrix-webdav }} @@ -69,6 +71,7 @@ jobs: matrix-hdfs: ${{ toJson(fromJson(steps.matrix-hdfs.outputs.result)[steps.key-hdfs.outputs.key]) }} matrix-s3: ${{ toJson(fromJson(steps.matrix-s3.outputs.result)[steps.key-s3.outputs.key]) }} matrix-sftp: ${{ toJson(fromJson(steps.matrix-sftp.outputs.result)[steps.key-sftp.outputs.key]) }} + matrix-samba: ${{ toJson(fromJson(steps.matrix-samba.outputs.result)[steps.key-samba.outputs.key]) }} matrix-webdav: ${{ toJson(fromJson(steps.matrix-webdav.outputs.result)[steps.key-webdav.outputs.key]) }} steps: - name: Checkout code @@ -635,6 +638,36 @@ jobs: with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml + - name: Check if Samba files are changed + id: changed-samba + uses: 
tj-actions/changed-files@v35 + with: + files_from_source_file: .github/workflows/data/samba/tracked.txt + files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt + + - name: Print Samba files changed + run: | + echo '${{ steps.changed-samba.outputs.all_changed_files }}' + + - name: Calculate Samba matrix key + id: key-samba + run: | + if ${{ inputs.nightly }}; then + key=nightly + elif ${{ steps.changed-base.outputs.any_changed }} || ${{ steps.changed-file.outputs.any_changed }} || ${{ steps.changed-samba.outputs.any_changed }}; then + key=full + else + key=small + fi + echo key=$key + echo key=$key >> $GITHUB_OUTPUT + + - name: Get Samba matrix + id: matrix-samba + uses: mikefarah/yq@v4.33.3 + with: + cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml + - name: Check if WebDAV files are changed id: changed-webdav uses: tj-actions/changed-files@v35 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 209364f4b..7608ebe6e 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -303,6 +303,21 @@ jobs: os: ${{ matrix.os }} with-cache: false + tests-samba: + name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + needs: [get-matrix] + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.get-matrix.outputs.matrix-samba) }} + + uses: ./.github/workflows/test-samba.yml + with: + server-version: ${{ matrix.server-version }} + python-version: ${{ matrix.python-version }} + os: ${{ matrix.os }} + with-cache: false + tests-webdav: name: Run WebDAV tests (server=${{ matrix.openwebdavssh-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] @@ -338,6 +353,7 @@ jobs: - tests-hdfs - tests-s3 - tests-sftp + - tests-samba - tests-webdav steps: diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml new file mode 100644 index 000000000..d823a9ae7 --- /dev/null +++ b/.github/workflows/test-samba.yml @@ -0,0 +1,81 @@ +name: Tests for Samba +on: + workflow_call: + inputs: + server-version: + required: true + type: string + python-version: + required: true + type: string + os: + required: true + type: string + with-cache: + required: false + type: boolean + default: true + +jobs: + test-samba: + name: Run Samba tests (server=${{ inputs.server-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + runs-on: ${{ inputs.os }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + + - name: Cache pip + uses: actions/cache@v3 + if: inputs.with-cache + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + restore-keys: | + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba- + + - name: Upgrade pip + run: python -m pip install --upgrade pip setuptools wheel + + - name: Install dependencies + run: | + pip install -I -r requirements/core.txt -r requirements/samba.txt -r requirements/tests/base.txt + + # Replace with Github Actions' because of custom parameter for samba container start + - name: Start Samba + run: 
| + docker compose down -v --remove-orphans + docker compose up -d samba + env: + SAMBA_IMAGE: elswork/samba:${{ inputs.server-version }} + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} + + - name: Wait for Samba to be ready + run: | + ./docker/wait-for-it.sh -h localhost -p 445 -t 60 + + - name: Run tests + run: | + mkdir reports/ || echo "Directory exists" + sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env + source ./env + ./pytest_runner.sh -m samba + + - name: Shutdown Samba + if: always() + run: | + docker compose down -v --remove-orphans + env: + COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} + + - name: Upload coverage results + uses: actions/upload-artifact@v3 + with: + name: samba-${{ inputs.server-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 44125d701..1df7f5306 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -287,6 +287,21 @@ jobs: python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} + tests-samba: + name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + needs: [get-matrix] + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.get-matrix.outputs.matrix-samba) }} + + uses: ./.github/workflows/test-samba.yml + with: + server-version: ${{ matrix.server-version }} + python-version: ${{ matrix.python-version }} + os: ${{ matrix.os }} + + tests-webdav: name: Run WebDAV tests (server=${{ matrix.webdav-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] @@ -321,6 +336,7 @@ jobs: - tests-hdfs - tests-s3 - tests-sftp + - tests-samba - tests-webdav steps: diff --git a/docker-compose.yml b/docker-compose.yml index 3d93c02af..bdcfe3954 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -171,8 +171,8 @@ services: - "139:139" - "445:445" volumes: - - ./docker/samba:/share/folder - command: '-u "1000:1000:onetl:onetl:awd123fd1" -s "SmbShare:/share/folder:rw:onetl"' + - ./docker/samba/custom_entrypoint.sh:/custom_entrypoint.sh + entrypoint: ["/custom_entrypoint.sh"] networks: - onetl diff --git a/docker/samba/custom_entrypoint.sh b/docker/samba/custom_entrypoint.sh new file mode 100755 index 000000000..f0d4078c0 --- /dev/null +++ b/docker/samba/custom_entrypoint.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +# allow create files and directories +mkdir -p /share/folder +chmod 0777 /share/folder +/entrypoint.sh -u "1000:1000:onetl:onetl:awd123fd1" -s "SmbShare:/share/folder:rw:onetl" diff --git a/docker/samba/on_post_init.sh b/docker/samba/on_post_init.sh deleted file mode 100755 index f71af2a03..000000000 --- a/docker/samba/on_post_init.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -# allow create files and directories -chmod 777 /share/folder diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index 7a7f21132..8073622c3 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -20,10 +20,11 @@ from io import BytesIO from logging import getLogger from pathlib import Path -from typing import Literal, Optional, Union +from typing import Optional, Union from etl_entities.instance import Host from pydantic import SecretStr, validator +from typing_extensions import Literal from onetl.connection.file_connection.file_connection import 
FileConnection from onetl.hooks import slot, support_hooks From b8bd6dd2e4077c58cd149c2b5e40c44d404e79ac Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Tue, 26 Sep 2023 10:47:45 +0300 Subject: [PATCH 16/26] [DOP-8839] - add documentation to Samba connection (#153) * [DOP-8839] - add documentation to Samba connection * [DOP-8839] - updated documentation to Samba connection * [DOP-8839] - updated changelog * Update docs/changelog/next_release/150.feature.rst Co-authored-by: Maxim Martynov * [DOP-8839] - change default client name in SMB connection * Update onetl/connection/file_connection/samba.py Co-authored-by: Maxim Martynov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Maxim Martynov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .readthedocs.yml | 1 + README.rst | 4 +++- docs/changelog/next_release/150.feature.rst | 2 ++ docs/connection/file_connection/index.rst | 1 + docs/connection/file_connection/samba.rst | 9 ++++++++ onetl/connection/file_connection/samba.py | 25 +++++++++++++++++++-- 6 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 docs/changelog/next_release/150.feature.rst create mode 100644 docs/connection/file_connection/samba.rst diff --git a/.readthedocs.yml b/.readthedocs.yml index 4d54479b4..923741b22 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -14,6 +14,7 @@ python: - ftp - ftps - hdfs + - samba - s3 - sftp - webdav diff --git a/README.rst b/README.rst index ea9518d91..1c483e068 100644 --- a/README.rst +++ b/README.rst @@ -93,6 +93,8 @@ Supported storages | | FTPS | | + +--------------+----------------------------------------------------------------------------------------------------------------------+ | | WebDAV | `WebdavClient3 library `_ | ++ +--------------+----------------------------------------------------------------------------------------------------------------------+ +| | Samba | `pysmb library `_ | +--------------------+--------------+----------------------------------------------------------------------------------------------------------------------+ | Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | | +--------------+ + @@ -204,7 +206,7 @@ Each client can be installed explicitly by passing connector name (in lowercase) .. code:: bash pip install onetl[ftp] # specific connector - pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav] # multiple connectors + pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba] # multiple connectors To install all file connectors at once you can pass ``files`` to ``extras``: diff --git a/docs/changelog/next_release/150.feature.rst b/docs/changelog/next_release/150.feature.rst new file mode 100644 index 000000000..6ea0af9ff --- /dev/null +++ b/docs/changelog/next_release/150.feature.rst @@ -0,0 +1,2 @@ +Add ``Samba`` file connection. +It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. 
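For illustration, a file download over the new ``Samba`` connection could look like the minimal sketch below. Connection parameters and paths here are hypothetical, and it assumes ``FileDownloader`` is used with its usual ``connection``/``source_path``/``local_path`` arguments and ``run()`` method:

.. code:: python

    from onetl.connection import Samba
    from onetl.file import FileDownloader

    # hypothetical credentials and share name
    samba = Samba(
        host="mydomain.com",
        share="share_name",
        protocol="SMB",
        port=445,
        user="user",
        password="password",
    )

    # copy everything from the shared folder to a local directory
    downloader = FileDownloader(
        connection=samba,
        source_path="/remote/folder",  # hypothetical remote path
        local_path="/local/folder",  # hypothetical local path
    )
    result = downloader.run()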
diff --git a/docs/connection/file_connection/index.rst b/docs/connection/file_connection/index.rst index 2fc998c7f..3b6908c7d 100644 --- a/docs/connection/file_connection/index.rst +++ b/docs/connection/file_connection/index.rst @@ -10,6 +10,7 @@ File Connections FTP FTPS HDFS + Samba SFTP S3 Webdav diff --git a/docs/connection/file_connection/samba.rst b/docs/connection/file_connection/samba.rst new file mode 100644 index 000000000..73f7ac3f9 --- /dev/null +++ b/docs/connection/file_connection/samba.rst @@ -0,0 +1,9 @@ +.. _samba: + +Samba connection +============== + +.. currentmodule:: onetl.connection.file_connection.samba + +.. autoclass:: Samba + :members: __init__, check, path_exists, is_file, is_dir, get_stat, resolve_dir, resolve_file, create_dir, remove_file, remove_dir, rename_file, list_dir, download_file, upload_file diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index 8073622c3..9e907ee3e 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -54,10 +54,12 @@ @support_hooks class Samba(FileConnection): - """Samba file connection. + """Samba file connection. |support_hooks| Based on `pysmb library `_. + .. versionadded:: 0.9.4 + Parameters ---------- host : str @@ -86,6 +88,25 @@ class Samba(FileConnection): password : str, default: None Password for file source connection. Can be `None` for anonymous connection. + Examples + -------- + + Samba file connection initialization + + .. code:: python + + from onetl.connection import Samba + + samba = Samba( + host="mydomain.com", + share="share_name", + protocol="SMB", + port=445, + user="user", + password="password", + ) + + """ host: Host @@ -168,7 +189,7 @@ def _get_client(self) -> SMBConnection: conn = SMBConnection( username=self.user, password=self.password.get_secret_value() if self.password else None, - my_name="optional_client_name", + my_name="onetl", remote_name=self.host, domain=self.domain, use_ntlm_v2=use_ntlm_v2, From 95572680bace2743f7980ce972a65abd5a87ec2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 25 Sep 2023 12:18:03 +0000 Subject: [PATCH 17/26] [DOP-9007] Rearrange installation documentation --- README.rst | 37 +++++----- docs/install/files.rst | 8 +++ docs/install/full.rst | 8 +++ docs/install/index.rst | 14 +++- docs/install/kerberos.rst | 8 +++ docs/install/python_packages.rst | 8 --- docs/install/{java_packages.rst => spark.rst} | 70 +++++++++++++++---- 7 files changed, 111 insertions(+), 42 deletions(-) create mode 100644 docs/install/files.rst create mode 100644 docs/install/full.rst create mode 100644 docs/install/kerberos.rst delete mode 100644 docs/install/python_packages.rst rename docs/install/{java_packages.rst => spark.rst} (89%) diff --git a/README.rst b/README.rst index 1c483e068..4f8b0aca8 100644 --- a/README.rst +++ b/README.rst @@ -54,7 +54,7 @@ Requirements * **Python 3.7 - 3.11** * PySpark 2.3.x - 3.4.x (depends on used connector) * Java 8+ (required by Spark, see below) -* Kerberos libs & GCC (required by ``Hive`` and ``HDFS`` connectors) +* Kerberos libs & GCC (required by ``Hive``, ``HDFS`` and ``SparkHDFS`` connectors) Supported storages ------------------ @@ -111,16 +111,16 @@ Documentation See https://onetl.readthedocs.io/ -.. install - How to install --------------- -.. minimal-install +.. 
_install: Minimal installation ~~~~~~~~~~~~~~~~~~~~ +.. _minimal-install: + Base ``onetl`` package contains: * ``DBReader``, ``DBWriter`` and related classes @@ -142,14 +142,16 @@ It can be installed via: This method is recommended for use in third-party libraries which require for ``onetl`` to be installed, but do not use its connection classes. -.. _spark-install: - With DB and FileDF connections ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _spark-install: + All DB connection classes (``Clickhouse``, ``Greenplum``, ``Hive`` and others) and all FileDF connection classes (``SparkHDFS``, ``SparkLocalFS``, ``SparkS3``) -require PySpark to be installed. +require Spark to be installed. + +.. _java-install: Firstly, you should install JDK. The exact installation instruction depends on your OS, here are some examples: @@ -178,6 +180,8 @@ Compatibility matrix | `3.4.x `_ | 3.7 - 3.11 | 8u362 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ +.. _pyspark-install: + Then you should install PySpark via passing ``spark`` to ``extras``: .. code:: bash @@ -193,12 +197,11 @@ or install PySpark explicitly: or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance. **Otherwise connection object cannot be created.** - -.. _files-install: - With File connections ~~~~~~~~~~~~~~~~~~~~~ +.. _files-install: + All File (but not *FileDF*) connection classes (``FTP``, ``SFTP``, ``HDFS`` and so on) requires specific Python clients to be installed. Each client can be installed explicitly by passing connector name (in lowercase) to ``extras``: @@ -216,18 +219,17 @@ To install all file connectors at once you can pass ``files`` to ``extras``: **Otherwise class import will fail.** - -.. _kerberos-install: - With Kerberos support ~~~~~~~~~~~~~~~~~~~~~ +.. _kerberos-install: + Most of Hadoop instances set up with Kerberos support, so some connections require additional setup to work properly. * ``HDFS`` Uses `requests-kerberos `_ and - `GSSApi `_ for authentication in WebHDFS. + `GSSApi `_ for authentication. It also uses ``kinit`` executable to generate Kerberos ticket. * ``Hive`` and ``SparkHDFS`` @@ -252,12 +254,11 @@ Also you should pass ``kerberos`` to ``extras`` to install required Python packa pip install onetl[kerberos] - -.. _full-install: - Full bundle ~~~~~~~~~~~ +.. _full-bundle: + To install all connectors and dependencies, you can pass ``all`` into ``extras``: .. code:: bash @@ -271,7 +272,7 @@ To install all connectors and dependencies, you can pass ``all`` into ``extras`` This method consumes a lot of disk space, and requires for Java & Kerberos libraries to be installed into your OS. -.. quick-start +.. _quick-start: Quick start ------------ diff --git a/docs/install/files.rst b/docs/install/files.rst new file mode 100644 index 000000000..b32c7a807 --- /dev/null +++ b/docs/install/files.rst @@ -0,0 +1,8 @@ +.. _install-files: + +File connections +================= + +.. include:: ../../README.rst + :start-after: .. _files-install: + :end-before: With Kerberos support diff --git a/docs/install/full.rst b/docs/install/full.rst new file mode 100644 index 000000000..a3853207c --- /dev/null +++ b/docs/install/full.rst @@ -0,0 +1,8 @@ +.. _install-full: + +Full bundle +=========== + +.. include:: ../../README.rst + :start-after: .. _full-bundle: + :end-before: .. 
_quick-start: diff --git a/docs/install/index.rst b/docs/install/index.rst index 86365e381..47f86287c 100644 --- a/docs/install/index.rst +++ b/docs/install/index.rst @@ -3,9 +3,19 @@ How to install ============== +.. include:: ../../README.rst + :start-after: .. _minimal-install: + :end-before: With DB and FileDF connections + +Installation in details +----------------------- + .. toctree:: :maxdepth: 1 :caption: How to install - python_packages - java_packages + self + spark + files + kerberos + full diff --git a/docs/install/kerberos.rst b/docs/install/kerberos.rst new file mode 100644 index 000000000..2ba28de4d --- /dev/null +++ b/docs/install/kerberos.rst @@ -0,0 +1,8 @@ +.. _install-kerberos: + +Kerberos support +================ + +.. include:: ../../README.rst + :start-after: .. _kerberos-install: + :end-before: Full bundle diff --git a/docs/install/python_packages.rst b/docs/install/python_packages.rst deleted file mode 100644 index 4459b2f37..000000000 --- a/docs/install/python_packages.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _python-packages: - -Python packages -=============== - -.. include:: ../../README.rst - :start-after: minimal-install - :end-before: quick-start diff --git a/docs/install/java_packages.rst b/docs/install/spark.rst similarity index 89% rename from docs/install/java_packages.rst rename to docs/install/spark.rst index a64c9e7c5..861527341 100644 --- a/docs/install/java_packages.rst +++ b/docs/install/spark.rst @@ -1,9 +1,32 @@ +.. _install-spark: + +Spark +===== + +.. include:: ../../README.rst + :start-after: .. _spark-install: + :end-before: .. _java-install: + +Installing Java +--------------- + +.. include:: ../../README.rst + :start-after: .. _java-install: + :end-before: .. _pyspark-install: + +Installing PySpark +------------------ + +.. include:: ../../README.rst + :start-after: .. _pyspark-install: + :end-before: With File connections + .. _java-packages: -Java packages -============== +Injecting Java packages +----------------------- -``DB`` and ``FileDF`` connection classes require specific packages to be inserted to ``CLASSPATH`` of Spark session, +Some DB and FileDF connection classes require specific packages to be inserted to ``CLASSPATH`` of Spark session, like JDBC drivers. This is usually done by setting up ``spark.jars.packages`` option while creating Spark session: @@ -11,7 +34,11 @@ This is usually done by setting up ``spark.jars.packages`` option while creating .. code:: python # here is a list of packages to be downloaded: - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) spark = ( SparkSession.builder.config("spark.app.name", "onetl") @@ -33,7 +60,7 @@ But sometimes it is required to: There are several ways to do that. Using ``spark.jars`` --------------------- +^^^^^^^^^^^^^^^^^^^^ The most simple solution, but this requires to store raw ``.jar`` files somewhere on filesystem or web server. @@ -67,7 +94,7 @@ The most simple solution, but this requires to store raw ``.jar`` files somewher ) Using ``spark.jars.repositories`` ---------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. note:: @@ -84,7 +111,12 @@ Can be used if you have access both to public repos (like Maven) and a private A .. 
code:: python - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) + spark = ( SparkSession.builder.config("spark.app.name", "onetl") .config("spark.jars.repositories", "http://nexus.mydomain.com/private-repo/") @@ -94,7 +126,7 @@ Can be used if you have access both to public repos (like Maven) and a private A Using ``spark.jars.ivySettings`` --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Same as above, but can be used even if there is no network access to public repos like Maven. @@ -194,7 +226,12 @@ Same as above, but can be used even if there is no network access to public repo .. code-block:: python :caption: script.py - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) + spark = ( SparkSession.builder.config("spark.app.name", "onetl") .config("spark.jars.ivySettings", "/path/to/ivysettings.xml") @@ -203,7 +240,7 @@ Same as above, but can be used even if there is no network access to public repo ) Place ``.jar`` file to ``-/.ivy2/jars/`` ----------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven. @@ -213,7 +250,12 @@ Can be used to pass already downloaded file to Ivy, and skip resolving package f .. code:: python - maven_packages = Greenplum.get_packages(spark_version="3.2") + maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Teradata.get_packages() + ) + spark = ( SparkSession.builder.config("spark.app.name", "onetl") .config("spark.jars.packages", ",".join(maven_packages)) @@ -221,7 +263,7 @@ Can be used to pass already downloaded file to Ivy, and skip resolving package f ) Place ``.jar`` file to Spark jars folder ----------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. note:: @@ -235,7 +277,7 @@ Place ``.jar`` file to Spark jars folder Can be used to embed ``.jar`` files to a default Spark classpath. * Download ``package.jar`` file (it's usually something like ``some-package_1.0.0.jar``). Local file name does not matter, but it should be unique. -* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``~/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``. +* Move it to ``$SPARK_HOME/jars/`` folder, e.g. ``^/.local/lib/python3.7/site-packages/pyspark/jars/`` or ``/opt/spark/3.2.3/jars/``. * Create Spark session **WITHOUT** passing Package name to ``spark.jars.packages`` .. code:: python @@ -246,7 +288,7 @@ Can be used to embed ``.jar`` files to a default Spark classpath. Manually adding ``.jar`` files to ``CLASSPATH`` ------------------------------------------------ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
note:: From ae6bde61d15aa6754113fc8efc45312a123ca878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 25 Sep 2023 11:22:28 +0000 Subject: [PATCH 18/26] [DOP-9085] Improve Greenplum documentation --- .../next_release/154.improvement.rst | 4 + docs/conf.py | 1 + .../db_connection/greenplum/execute.rst | 41 ++++ .../db_connection/greenplum/prerequisites.rst | 176 ++++++++++++++++++ .../db_connection/greenplum/read.rst | 141 +++++++++++++- .../db_connection/greenplum/write.rst | 95 ++++++++++ .../db_connection/greenplum/connection.py | 24 +-- .../db_connection/greenplum/options.py | 4 +- .../file_df_connection/spark_s3/connection.py | 8 + requirements/docs.txt | 4 +- 10 files changed, 469 insertions(+), 29 deletions(-) create mode 100644 docs/changelog/next_release/154.improvement.rst diff --git a/docs/changelog/next_release/154.improvement.rst b/docs/changelog/next_release/154.improvement.rst new file mode 100644 index 000000000..745989a87 --- /dev/null +++ b/docs/changelog/next_release/154.improvement.rst @@ -0,0 +1,4 @@ +Drastically improve ``Greenplum`` documentation: +* Added information about network ports, grants, ``pg_hba.conf`` and so on. +* Added interaction schemas for reading, writing and executing statements in Greenplum. +* Added recommendations about reading data from views and ``JOIN`` results from Greenplum. diff --git a/docs/conf.py b/docs/conf.py index 87d6fd17b..06a5b08aa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -56,6 +56,7 @@ "sphinx.ext.autosummary", "sphinxcontrib.autodoc_pydantic", "sphinxcontrib.towncrier", # provides `towncrier-draft-entries` directive + "sphinxcontrib.plantuml", ] numpydoc_show_class_members = True autodoc_pydantic_model_show_config = False diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index b0833b213..e2179a4ec 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -3,6 +3,47 @@ Executing statements in Greenplum ================================== +Interaction schema +------------------ + +Unlike reading & writing, executing statements in Greenplum is done **only** through Greenplum master node, +without any interaction between Greenplum segments and Spark executors. More than that, Spark executors are not used in this case. + +The only port used while interacting with Greenplum in this case is ``5432`` (Greenplum master port). + +.. dropdown:: Spark <-> Greenplum interaction during Greenplum.execute()/Greenplum.fetch() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + end box + + box "Greenplum" + participant "Greenplum master" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + + == Greenplum.execute(statement) == + "Spark driver" --> "Greenplum master" : EXECUTE statement + "Greenplum master" -> "Spark driver" : RETURN result + + == Greenplum.close() == + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml + +Options +------- + .. currentmodule:: onetl.connection.db_connection.greenplum.connection .. 
automethod:: Greenplum.fetch diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index a545fdc27..f7b8e9d32 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -43,3 +43,179 @@ There are several ways to do that. See :ref:`java-packages` for details. If you're uploading package to private package repo, use ``groupId=io.pivotal`` and ``artifactoryId=greenplum-spark_2.12`` (``2.12`` is Scala version) to give uploaded package a proper name. + +Connecting to Greenplum +----------------------- + +Interaction schema +~~~~~~~~~~~~~~~~~~ + +Spark executors open ports to listen incoming requests. +Greenplum segments are initiating connections to Spark executors using `EXTERNAL TABLE `_ +functionality, and send/read data using `gpfdist `_ protocol. + +Data is **not** send through Greenplum master. +Greenplum master only receives commands to start reading/writing process, and manages all the metadata (external table location, schema and so on). + +More details can be found in `official documentation `_. + +Number of parallel connections +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + This is very important!!! + + If you don't limit number of connections, you can exceed the `max_connections `_ + limit set on the Greenplum side. It's usually not so high, e.g. 500-1000 connections max, + depending on your Greenplum instance settings and using connection balancers like ``pgbouncer``. + + Consuming all available connections means **nobody** (even admin users) can connect to Greenplum. + +Each job on the Spark executor makes its own connection to Greenplum master node, +so you need to limit number of connections to avoid opening too many of them. + +* Reading about ``5-10Gb`` of data requires about ``3-5`` parallel connections. +* Reading about ``20-30Gb`` of data requires about ``5-10`` parallel connections. +* Reading about ``50Gb`` of data requires ~ ``10-20`` parallel connections. +* Reading about ``100+Gb`` of data requires ``20-30`` parallel connections. +* Opening more than ``30-50`` connections is not recommended. + +Number of connections can be limited by 2 ways: + +* By limiting number of Spark executors and number of cores per-executor. Max number of parallel jobs is ``executors * cores``. + +.. tabs:: + + .. code-tab:: py Spark with master=local + + ( + SparkSession.builder + # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 + .config("spark.master", "local[10]") + .config("spark.executor.cores", 1) + ) + + .. code-tab:: py Spark with master=yarn or master=k8s, dynamic allocation + + ( + SparkSession.builder + .config("spark.master", "yarn") + # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10 + .config("spark.dynamicAllocation.maxExecutors", 10) + .config("spark.executor.cores", 1) + ) + + .. code-tab:: py Spark with master=yarn or master=k8s, static allocation + + ( + SparkSession.builder + .config("spark.master", "yarn") + # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 + .config("spark.executor.instances", 10) + .config("spark.executor.cores", 1) + ) + +* By limiting connection pool size user by Spark (**only** for Spark with ``master=local``): + +.. 
code:: python
+
+    spark = SparkSession.builder.config("spark.master", "local[*]").getOrCreate()
+
+    # No matter how many executors are started and how many cores they have,
+    # number of connections cannot exceed pool size:
+    Greenplum(
+        ...,
+        extra={
+            "pool.maxSize": 10,
+        },
+    )
+
+See `connection pooling `_
+documentation.
+
+
+* By setting :obj:`num_partitions `
+  and :obj:`partition_column ` (not recommended).
+
+Allowing connection to Greenplum master
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Ask your Greenplum cluster administrator to allow your user to connect to Greenplum master node,
+e.g. by updating the ``pg_hba.conf`` file.
+
+More details can be found in `official documentation `_.
+
+Network ports
+~~~~~~~~~~~~~
+
+To read data from Greenplum using Spark, the following ports should be opened in the firewall between Spark and Greenplum:
+
+* Spark driver and all Spark executors -> port ``5432`` on Greenplum master node.
+
+    This port number should be set while connecting to Greenplum:
+
+    .. code:: python
+
+        Greenplum(host="master.host", port=5432, ...)
+
+* Greenplum segments -> some port range (e.g. ``41000-42000``) **listened by Spark executor**.
+
+    This range should be set in ``extra`` option:
+
+    .. code:: python
+
+        Greenplum(
+            ...,
+            extra={
+                "server.port": "41000-42000",
+            },
+        )
+
+    Number of ports in this range is ``number of parallel running Spark sessions`` * ``number of parallel connections per session``.
+
+    Number of connections per session is usually less than ``30`` (see above).
+
+    Number of sessions depends on your environment:
+        * For ``master=local`` only a few or tens of sessions can be started on the same host, depending on available RAM and CPU.
+
+        * For ``master=yarn`` / ``master=k8s`` hundreds or thousands of sessions can be started simultaneously,
+          but they are executed on different cluster nodes, so the same port can be opened on different nodes at the same time.
+
+More details can be found in official documentation:
+    * `port requirements `_
+    * `format of server.port value `_
+    * `port troubleshooting `_
+
+Required grants
+~~~~~~~~~~~~~~~
+
+Ask your Greenplum cluster administrator to set the following grants for the user
+used for creating a connection:
+
+.. tabs::
+
+    .. code-tab:: sql Reading & writing
+
+        GRANT USAGE ON SCHEMA myschema TO username;
+        GRANT CREATE ON SCHEMA myschema TO username;
+        GRANT SELECT, INSERT ON TABLE myschema.mytable TO username;
+        ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist');
+
+    .. code-tab:: sql Reading from Greenplum
+
+        GRANT USAGE ON SCHEMA schema_to_read TO username;
+        GRANT CREATE ON SCHEMA schema_to_read TO username;
+        GRANT SELECT ON TABLE schema_to_read.table_to_read TO username;
+        -- yes, ``writable``, because data is written from Greenplum to Spark executor.
+        ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist');
+
+    .. code-tab:: sql Writing to Greenplum
+
+        GRANT USAGE ON SCHEMA schema_to_write TO username;
+        GRANT CREATE ON SCHEMA schema_to_write TO username;
+        GRANT SELECT, INSERT ON TABLE schema_to_write.table_to_write TO username;
+        -- yes, ``readable``, because data is read from Spark executor to Greenplum.
+        ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist');
+
+More details can be found in `official documentation `_.
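Putting the prerequisites above together, a connection setup could look like the following sketch. Host name, credentials, database name and port range are hypothetical, and the exact set of ``Greenplum`` constructor arguments may differ between onETL versions:

.. code:: python

    from pyspark.sql import SparkSession

    from onetl.connection import Greenplum

    maven_packages = Greenplum.get_packages(spark_version="3.2")

    # limit number of parallel connections to Greenplum master (see above)
    spark = (
        SparkSession.builder.config("spark.app.name", "onetl")
        .config("spark.jars.packages", ",".join(maven_packages))
        .config("spark.dynamicAllocation.maxExecutors", 10)
        .config("spark.executor.cores", 1)
        .getOrCreate()
    )

    greenplum = Greenplum(
        host="master.host",  # hypothetical Greenplum master host
        port=5432,
        user="username",
        password="***",
        database="mydb",  # hypothetical database name
        extra={
            # port range listened by Spark executors, see "Network ports" above
            "server.port": "41000-42000",
        },
        spark=spark,
    )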
diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst index 2640f7e6c..30d669fea 100644 --- a/docs/connection/db_connection/greenplum/read.rst +++ b/docs/connection/db_connection/greenplum/read.rst @@ -8,20 +8,143 @@ For reading data from Greenplum, use :obj:`DBReader `, - and drop staging table after reading is finished. +Interaction schema +------------------ - In this case data will be read directly from Greenplum segment nodes in a distributed way. +High-level schema is described in :ref:`greenplum-prerequisites`. You can find detailed interaction schema below. + +.. dropdown:: Spark <-> Greenplum interaction during DBReader.run() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + participant "Spark executor1" + participant "Spark executor2" + participant "Spark executorN" + end box + + box "Greenplum" + participant "Greenplum master" + participant "Greenplum segment1" + participant "Greenplum segment2" + participant "Greenplum segmentN" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + + "Spark driver" --> "Greenplum master" : CHECK IF TABLE EXISTS gp_table + "Greenplum master" --> "Spark driver" : TABLE EXISTS + "Spark driver" -> "Greenplum master" : SHOW SCHEMA FOR gp_table + "Greenplum master" --> "Spark driver" : (id bigint, col1 int, col2 text, ...) + + == DBReader.run() == + + "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 + "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 + "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV + "Spark executor1" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1 + note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master + "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1 + note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments. + + "Spark executor2" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) USING address=executor2_host:executor2_port;\nINSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2 + "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2 + + "Spark executorN" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) 
USING address=executorN_host:executorN_port;\nINSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N
+        "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN
+
+        "Greenplum segment1" ->o "Spark executor1" -- : INITIALIZE CONNECTION TO Spark executor1\nPUSH DATA TO Spark executor1
+        note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor
+
+        "Greenplum segment2" ->o "Spark executor2" -- : INITIALIZE CONNECTION TO Spark executor2\nPUSH DATA TO Spark executor2
+        "Greenplum segmentN" ->o "Spark executorN" -- : INITIALIZE CONNECTION TO Spark executorN\nPUSH DATA TO Spark executorN
+
+        == Spark.stop() ==
+
+        "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1
+        deactivate "Greenplum master"
+        "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2
+        deactivate "Greenplum master"
+        "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN
+        deactivate "Greenplum master"
+
+        "Spark executor1" --> "Spark driver" -- : DONE
+        "Spark executor2" --> "Spark driver" -- : DONE
+        "Spark executorN" --> "Spark driver" -- : DONE
+
+        "Spark driver" --> "Greenplum master" : CLOSE CONNECTION
+        deactivate "Greenplum master"
+        deactivate "Spark driver"
+        @enduml
+
+Recommendations
+---------------
+
+Reading from views
+~~~~~~~~~~~~~~~~~~
+
+This connector is **NOT** designed to read data from views.
+
+You can technically read data from a view which has a
+`gp_segment_id `_ column.
+But this is **not** recommended, because each Spark executor will run the same query, which may lead to running duplicated calculations
+and sending data between segments, only to skip most of the result and select only a small part of it.
+
+Prefer the following approach:
+    * Create a staging table to store the result data, using :obj:`Greenplum.execute `.
+    * Use the same ``.execute`` method to run a query like ``INSERT INTO staging_table SELECT ... FROM some_view``. This will be done on the Greenplum segments side, and the query will be run only once.
+    * Read data from the staging table to Spark executors using :obj:`DBReader `.
+    * Drop the staging table using the ``.execute`` method.
+
+Using ``JOIN`` on Greenplum side
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you need to get the result of joining 2 tables in Greenplum, you should:
+    * Create a staging table to store the result data, using ``Greenplum.execute``.
+    * Use the same ``Greenplum.execute`` to run a query like ``INSERT INTO staging_table SELECT ... FROM table1 JOIN table2``. This will be done on the Greenplum segments side, in a distributed way (see the sketch below).
+    * Read data from the staging table to Spark executors using ``DBReader``.
+    * Drop the staging table using ``Greenplum.execute``.
+
+.. warning::
+
+    Do **NOT** try to read data from ``table1`` and ``table2`` using ``DBReader``, and then join the resulting dataframes!
+
+    This will lead to sending all the data from both tables to Spark executor memory, and then ``JOIN``
+    will be performed on Spark side, not Greenplum. This is **very** inefficient.
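A rough sketch of the staging-table approach described above. Table, schema and column names are hypothetical, and ``DBReader`` is assumed to take its usual ``connection``/``source`` arguments:

.. code:: python

    from onetl.db import DBReader

    # 1. create an UNLOGGED staging table on the Greenplum side
    greenplum.execute(
        "CREATE UNLOGGED TABLE myschema.staging_table (id bigint, col1 int, col2 text)",
    )

    # 2. fill it with the JOIN result; this runs on Greenplum segments, only once
    greenplum.execute(
        """
        INSERT INTO myschema.staging_table
        SELECT t1.id, t1.col1, t2.col2
        FROM myschema.table1 t1
        JOIN myschema.table2 t2 ON t1.id = t2.id
        """,
    )

    # 3. read the staging table to Spark in a distributed way
    reader = DBReader(connection=greenplum, source="myschema.staging_table")
    df = reader.run()

    # 4. drop the staging table only after the dataframe is fully processed
    greenplum.execute("DROP TABLE myschema.staging_table")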
+ +Using ``TEMPORARY`` tables +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One might think that writing data from a ``VIEW`` or the result of a ``JOIN`` to a ``TEMPORARY`` table, +and then passing it to DBReader, is an efficient way to read data from Greenplum, because temporary tables do not generate WAL files, +and are automatically deleted when the transaction finishes. + +That will **not** work. Each Spark executor establishes its own connection to Greenplum, +and thus reads its own temporary table, which does not contain any data. + +You should use `UNLOGGED `_ tables +to write data to a staging table without generating useless WAL logs. + +Mapping of Greenplum types to Spark types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See `official documentation `_ +for more details. +onETL does not perform any additional casting of types while reading data. + +Options +------- .. currentmodule:: onetl.connection.db_connection.greenplum.options diff --git a/docs/connection/db_connection/greenplum/write.rst b/docs/connection/db_connection/greenplum/write.rst index aeb688ac5..c7a4f1560 100644 --- a/docs/connection/db_connection/greenplum/write.rst +++ b/docs/connection/db_connection/greenplum/write.rst @@ -5,6 +5,101 @@ Writing to Greenplum For writing data to Greenplum, use :obj:`DBWriter ` with options below. + +Interaction schema +------------------ + +High-level schema is described in :ref:`greenplum-prerequisites`. You can find detailed interaction schema below. + +.. dropdown:: Spark <-> Greenplum interaction during DBWriter.run() + + .. plantuml:: + + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + participant "Spark executor1" + participant "Spark executor2" + participant "Spark executorN" + end box + + box "Greenplum" + participant "Greenplum master" + participant "Greenplum segment1" + participant "Greenplum segment2" + participant "Greenplum segmentN" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + "Spark driver" --> "Greenplum master" ++ : CHECK IF TABLE EXISTS gp_table + "Greenplum master" --> "Spark driver" : TABLE NOT EXISTS + + == DBWriter.run(df) == + + "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 + "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 + "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV + "Spark executor1" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO gp_table FROM spark_executor1 + note right of "Greenplum master" : Each white vertical line here is an open connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master + "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM spark_executor1 TO gp_table_data_on_segment1 + note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments. + + "Spark executor2" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...)
USING address=executor2_host:executor2_port;\nINSERT INTO gp_table FROM spark_executor2 + "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM spark_executor2 TO gp_table_data_on_segment2 + + "Spark executorN" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) USING address=executorN_host:executorN_port;\nINSERT INTO gp_table FROM spark_executorN + "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM spark_executorN TO gp_table_data_on_segmentN + + "Greenplum segment1" -->o "Spark executor1" : INITIALIZE CONNECTION TO Spark executor1 + "Spark executor1" -> "Greenplum segment1" : READ DATA FROM Spark executor1 + note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor + deactivate "Greenplum segment1" + + "Greenplum segment2" -->o "Spark executor2" : INITIALIZE CONNECTION TO Spark executor2 + "Spark executor2" -> "Greenplum segment2" : READ DATA FROM Spark executor2 + deactivate "Greenplum segment2" + + "Greenplum segmentN" -->o "Spark executorN" : INITIALIZE CONNECTION TO Spark executorN + "Spark executorN" -> "Greenplum segmentN" : READ DATA FROM Spark executorN + deactivate "Greenplum segmentN" + + == Finished == + + "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1 + deactivate "Greenplum master" + "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2 + deactivate "Greenplum master" + "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN + deactivate "Greenplum master" + + "Spark executor1" --> "Spark driver" -- : DONE + "Spark executor2" --> "Spark driver" -- : DONE + "Spark executorN" --> "Spark driver" -- : DONE + + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml + +Recommendations +--------------- + +Mapping of Spark types to Greenplum types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See `official documentation `_ +for more details. +onETL does not perform any additional casting of types while writing data. + +Options +------- + .. currentmodule:: onetl.connection.db_connection.greenplum.options .. autopydantic_model:: GreenplumWriteOptions diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 99de7d90c..d1eedff7f 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -124,34 +124,24 @@ class Greenplum(JDBCMixin, DBConnection): from onetl.connection import Greenplum from pyspark.sql import SparkSession - # Please ask your DevOps and Greenplum admin what port range - # on Spark side can be used to accept requests from Greenplum segments - - extra = { - "server.port": "49152-65535", - } - # Create Spark session with Greenplum connector loaded - # See Prerequisites page for more details maven_packages = Greenplum.get_packages(spark_version="3.2") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.executor.allowSparkContext", "true") + # IMPORTANT!!! + # Set number of executors according to "Prerequisites" -> "Number of executors" .config("spark.dynamicAllocation.maxExecutors", 10) .config("spark.executor.cores", 1) .getOrCreate() ) # IMPORTANT!!! - # Each job on the Spark executor make its own connection to Greenplum master node, - # so we need to limit number of connections to avoid opening too many of them. 
- # - # Table size ~20Gb requires about 10 executors * cores, - # ~50Gb requires ~ 20 executors * cores, - # 100Gb+ requires 30 executors * cores. - # - # Cores number can be increased, but executors count should be reduced - # to keep the same number of executors * cores. + # Set port range of executors according to "Prerequisites" -> "Network ports" + extra = { + "server.port": "41000-42000", + } # Create connection greenplum = Greenplum( diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index 86785155e..7d4638412 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -107,7 +107,9 @@ class Config: .. warning:: - You should not change this option, unless you know what you're doing + You should not change this option, unless you know what you're doing. + + It's preferable to use default values to read data parallel by number of segments in Greenplum cluster. Possible values: * ``None`` (default): diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 464487f52..609bba034 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -161,9 +161,17 @@ class SparkS3(SparkFileDFConnection): # Create Spark session with Hadoop AWS libraries loaded maven_packages = SparkS3.get_packages(spark_version="3.4.1") + # Some dependencies are not used, but downloading takes a lot of time. Skipping them. + excluded_packages = [ + "com.google.cloud.bigdataoss:gcs-connector", + "org.apache.hadoop:hadoop-aliyun", + "org.apache.hadoop:hadoop-azure-datalake", + "org.apache.hadoop:hadoop-azure", + ] spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.jars.excludes", ",".join(excluded_packages)) .config("spark.hadoop.fs.s3a.committer.magic.enabled", "true") .config("spark.hadoop.fs.s3a.committer.name", "magic") .config( diff --git a/requirements/docs.txt b/requirements/docs.txt index d3fc9555e..4ff1db3e9 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -3,10 +3,10 @@ furo importlib-resources<6 numpydoc pygments-csv-lexer -# https://github.com/sphinx-doc/sphinx/issues/11662 -sphinx<7.2.5 +sphinx sphinx-copybutton sphinx-design +sphinx-plantuml sphinx-tabs sphinx-toolbox sphinx_substitution_extensions From e757a65e46e09fd326d0bc79cdf5a29de064076f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 11:21:46 +0000 Subject: [PATCH 19/26] [DOP-9007] Fix links to installation instruction --- docs/connection/db_connection/greenplum/prerequisites.rst | 2 +- onetl/connection/db_connection/clickhouse/connection.py | 2 +- onetl/connection/db_connection/hive/connection.py | 4 ++-- onetl/connection/db_connection/mongodb/connection.py | 2 +- onetl/connection/db_connection/mssql/connection.py | 2 +- onetl/connection/db_connection/mysql/connection.py | 2 +- onetl/connection/db_connection/oracle/connection.py | 2 +- onetl/connection/db_connection/postgres/connection.py | 2 +- onetl/connection/db_connection/teradata/connection.py | 2 +- onetl/connection/file_connection/ftp.py | 2 +- onetl/connection/file_connection/ftps.py | 2 +- 
onetl/connection/file_connection/hdfs/connection.py | 4 ++-- onetl/connection/file_connection/s3.py | 2 +- onetl/connection/file_connection/sftp.py | 2 +- onetl/connection/file_connection/webdav.py | 2 +- onetl/connection/file_df_connection/spark_hdfs/connection.py | 4 ++-- onetl/connection/file_df_connection/spark_local_fs.py | 2 +- onetl/connection/file_df_connection/spark_s3/connection.py | 2 +- 18 files changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index f7b8e9d32..0f0c1c53d 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -24,7 +24,7 @@ You can install PySpark as follows: pip install onetl pyspark=3.2.4 # pass specific PySpark version -See :ref:`spark-install` instruction for more details. +See :ref:`install-spark` instruction for more details. Downloading Pivotal package --------------------------- diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index dc6acf163..19f953b47 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -65,7 +65,7 @@ class Clickhouse(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index d0997f512..83b219acc 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -69,7 +69,7 @@ class Hive(DBConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. .. warning:: @@ -82,7 +82,7 @@ class Hive(DBConnection): .. note:: Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit`` - **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`kerberos-install`. + **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS. diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 771fb3b69..57c06cac0 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -83,7 +83,7 @@ class MongoDB(DBConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. 
Parameters ---------- diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 49fc825d9..b5de3bab0 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -64,7 +64,7 @@ class MSSQL(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index 868731eaf..252ee60a4 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -63,7 +63,7 @@ class MySQL(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 69d7e2c5b..18d9addb3 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -103,7 +103,7 @@ class Oracle(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index eb07a68f6..377f9eb04 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -61,7 +61,7 @@ class Postgres(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 7e730f9eb..4ae7d5760 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -66,7 +66,7 @@ class Teradata(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index 6710a4303..fce0a9f3a 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -68,7 +68,7 @@ class FTP(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 211ff6030..97dbc0972 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -69,7 +69,7 @@ class FTPS(FTP): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. 
Parameters ---------- diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 2419aae2f..71e0c07c6 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -72,14 +72,14 @@ class HDFS(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. .. note:: To access Hadoop cluster with Kerberos installed, you should have ``kinit`` executable in some path in ``PATH`` environment variable. - See onETL :ref:`kerberos-install` instruction for more details. + See onETL :ref:`install-kerberos` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 7198a05aa..5d51c80fa 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -67,7 +67,7 @@ class S3(FileConnection): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 3b84df658..007bb147b 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -71,7 +71,7 @@ class SFTP(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index 52aab0419..0a3f55f23 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -70,7 +70,7 @@ class WebDAV(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`files-install` instruction for more details. + See :ref:`install-files` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 04bdfae48..73b8c9914 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -58,12 +58,12 @@ class SparkHDFS(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. .. note:: Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit`` - **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`kerberos-install`. + **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS. 
diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index b914c714f..a40255186 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -49,7 +49,7 @@ class SparkLocalFS(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. .. warning:: diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 609bba034..d93e6e9f6 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -83,7 +83,7 @@ class SparkS3(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`spark-install` instruction for more details. + See :ref:`install-spark` instruction for more details. .. note:: From 8c3ae34817e926c33f484dd89663163351289ec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 11:50:21 +0000 Subject: [PATCH 20/26] [DOP-9007] Fix changelog --- docs/changelog/next_release/154.improvement.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/changelog/next_release/154.improvement.rst b/docs/changelog/next_release/154.improvement.rst index 745989a87..d22b5a566 100644 --- a/docs/changelog/next_release/154.improvement.rst +++ b/docs/changelog/next_release/154.improvement.rst @@ -1,4 +1,4 @@ Drastically improve ``Greenplum`` documentation: -* Added information about network ports, grants, ``pg_hba.conf`` and so on. -* Added interaction schemas for reading, writing and executing statements in Greenplum. -* Added recommendations about reading data from views and ``JOIN`` results from Greenplum. + * Added information about network ports, grants, ``pg_hba.conf`` and so on. + * Added interaction schemas for reading, writing and executing statements in Greenplum. + * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. From 13f0e4bdfc72dbf8577a0a12d1fc5fb11e76c7fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 11:53:31 +0000 Subject: [PATCH 21/26] [DOP-9007] Fix Samba documentation --- onetl/connection/file_connection/samba.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index 9e907ee3e..def44943c 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -60,6 +60,19 @@ class Samba(FileConnection): .. versionadded:: 0.9.4 + .. warning:: + + To use Samba connector you should install package as follows: + + .. code:: bash + + pip install onetl[samba] + + # or + pip install onetl[files] + + See :ref:`install-files` instruction for more details. 
+ Parameters ---------- host : str From 661e0c09fba81dbc5040b852d25e1c01548c4031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 11:55:48 +0000 Subject: [PATCH 22/26] [DOP-9007] Fix link names --- docs/connection/db_connection/greenplum/prerequisites.rst | 2 +- onetl/connection/db_connection/clickhouse/connection.py | 2 +- onetl/connection/db_connection/hive/connection.py | 2 +- onetl/connection/db_connection/mongodb/connection.py | 2 +- onetl/connection/db_connection/mssql/connection.py | 2 +- onetl/connection/db_connection/mysql/connection.py | 2 +- onetl/connection/db_connection/oracle/connection.py | 2 +- onetl/connection/db_connection/postgres/connection.py | 2 +- onetl/connection/db_connection/teradata/connection.py | 2 +- onetl/connection/file_connection/ftp.py | 2 +- onetl/connection/file_connection/ftps.py | 2 +- onetl/connection/file_connection/hdfs/connection.py | 4 ++-- onetl/connection/file_connection/s3.py | 2 +- onetl/connection/file_connection/samba.py | 2 +- onetl/connection/file_connection/sftp.py | 2 +- onetl/connection/file_connection/webdav.py | 2 +- onetl/connection/file_df_connection/spark_hdfs/connection.py | 2 +- onetl/connection/file_df_connection/spark_local_fs.py | 2 +- onetl/connection/file_df_connection/spark_s3/connection.py | 2 +- 19 files changed, 20 insertions(+), 20 deletions(-) diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 0f0c1c53d..a181f16dc 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -24,7 +24,7 @@ You can install PySpark as follows: pip install onetl pyspark=3.2.4 # pass specific PySpark version -See :ref:`install-spark` instruction for more details. +See :ref:`install-spark` installation instruction for more details. Downloading Pivotal package --------------------------- diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 19f953b47..f95884f7d 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -65,7 +65,7 @@ class Clickhouse(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 83b219acc..6d768ea2e 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -69,7 +69,7 @@ class Hive(DBConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. 
warning:: diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 57c06cac0..860f7b215 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -83,7 +83,7 @@ class MongoDB(DBConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index b5de3bab0..6738c2541 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -64,7 +64,7 @@ class MSSQL(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index 252ee60a4..abd17df33 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -63,7 +63,7 @@ class MySQL(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 18d9addb3..2e1f3e916 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -103,7 +103,7 @@ class Oracle(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index 377f9eb04..22b42c296 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -61,7 +61,7 @@ class Postgres(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 4ae7d5760..2c797b3d8 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -66,7 +66,7 @@ class Teradata(JDBCConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. 
Parameters ---------- diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index fce0a9f3a..b7dd82257 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -68,7 +68,7 @@ class FTP(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 97dbc0972..dfcd05553 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -69,7 +69,7 @@ class FTPS(FTP): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 71e0c07c6..aa58f7e0a 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -72,14 +72,14 @@ class HDFS(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. .. note:: To access Hadoop cluster with Kerberos installed, you should have ``kinit`` executable in some path in ``PATH`` environment variable. - See onETL :ref:`install-kerberos` instruction for more details. + See :ref:`install-kerberos` instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 5d51c80fa..2f8d298f1 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -67,7 +67,7 @@ class S3(FileConnection): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index def44943c..bef7ed276 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -71,7 +71,7 @@ class Samba(FileConnection): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 007bb147b..bef53ce2d 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -71,7 +71,7 @@ class SFTP(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. Parameters ---------- diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index 0a3f55f23..9825a0525 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -70,7 +70,7 @@ class WebDAV(FileConnection, RenameDirMixin): # or pip install onetl[files] - See :ref:`install-files` instruction for more details. + See :ref:`install-files` installation instruction for more details. 
Parameters ---------- diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 73b8c9914..ce83c8b1d 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -58,7 +58,7 @@ class SparkHDFS(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. note:: diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index a40255186..264fac3a2 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -49,7 +49,7 @@ class SparkLocalFS(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. warning:: diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index d93e6e9f6..b766389cf 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -83,7 +83,7 @@ class SparkS3(SparkFileDFConnection): # or pip install onetl pyspark=3.4.1 # pass specific PySpark version - See :ref:`install-spark` instruction for more details. + See :ref:`install-spark` installation instruction for more details. .. note:: From ad8edcb2efcdae056bb064853c20bc0de0b639c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 11:59:18 +0000 Subject: [PATCH 23/26] [DOP-9007] Fix Greenplum documentation --- docs/connection/db_connection/greenplum/prerequisites.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index a181f16dc..3d1f9c80b 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -18,12 +18,6 @@ Installing PySpark To use Greenplum connector you should have PySpark installed (or injected to ``sys.path``) BEFORE creating the connector instance. -You can install PySpark as follows: - -.. code:: bash - - pip install onetl pyspark=3.2.4 # pass specific PySpark version - See :ref:`install-spark` installation instruction for more details. 
Downloading Pivotal package From f574c081c7e30ad47d53b6304b8d85b7dff7f91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 12:00:01 +0000 Subject: [PATCH 24/26] [DOP-9007] Fix Greenplum documentation --- docs/connection/db_connection/greenplum/prerequisites.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 3d1f9c80b..57db9635e 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -29,7 +29,7 @@ and then pass it to Spark session. .. warning:: - Please pay attention to :ref:`Spark <-> Scala version compatibility `. + Please pay attention to :ref:`Spark & Scala version compatibility `. There are several ways to do that. See :ref:`java-packages` for details. From f3aa33540eb3adae2d44643bcd196e1694edde71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 25 Sep 2023 13:09:18 +0000 Subject: [PATCH 25/26] [DOP-9208] Fix calling .close() method in __del__ --- docs/changelog/next_release/156.bugfix.rst | 1 + .../next_release/156.improvement.1.rst | 1 + .../next_release/156.improvement.2.rst | 1 + .../db_connection/jdbc_mixin/connection.py | 23 ++++++++++++++----- .../db_connection/kafka/connection.py | 4 ++++ .../file_connection/file_connection.py | 21 +++++++++++++---- .../spark_hdfs/connection.py | 13 ++++++++--- .../file_df_connection/spark_s3/connection.py | 7 +++++- .../test_spark_hdfs_integration.py | 17 ++++++++------ 9 files changed, 67 insertions(+), 21 deletions(-) create mode 100644 docs/changelog/next_release/156.bugfix.rst create mode 100644 docs/changelog/next_release/156.improvement.1.rst create mode 100644 docs/changelog/next_release/156.improvement.2.rst diff --git a/docs/changelog/next_release/156.bugfix.rst b/docs/changelog/next_release/156.bugfix.rst new file mode 100644 index 000000000..2953ab3d6 --- /dev/null +++ b/docs/changelog/next_release/156.bugfix.rst @@ -0,0 +1 @@ +Fix issue while stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. diff --git a/docs/changelog/next_release/156.improvement.1.rst b/docs/changelog/next_release/156.improvement.1.rst new file mode 100644 index 000000000..5607eb69c --- /dev/null +++ b/docs/changelog/next_release/156.improvement.1.rst @@ -0,0 +1 @@ +Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. diff --git a/docs/changelog/next_release/156.improvement.2.rst b/docs/changelog/next_release/156.improvement.2.rst new file mode 100644 index 000000000..5824c8a9b --- /dev/null +++ b/docs/changelog/next_release/156.improvement.2.rst @@ -0,0 +1 @@ +Call ``.close()`` on FileConnection then it is removed by garbage collector. 
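The thread-safety improvement above ("each thread works with its own connection") is implemented in the following ``jdbc_mixin/connection.py`` diff by caching the connection in a ``threading.local`` container. A minimal standalone sketch of the same pattern; the class and factory names here are hypothetical, not the library's real internals:

.. code:: python

    import threading
    from contextlib import suppress


    class PerThreadConnectionCache:
        """Cache one raw connection per thread, mirroring the JDBCMixin change below."""

        def __init__(self, connect):
            self._connect = connect  # factory that opens a new raw connection
            self._local = threading.local()  # each thread sees only its own attributes

        def get(self):
            # reuse the connection already opened by the current thread, if any
            connection = getattr(self._local, "connection", None)
            if connection is None:
                connection = self._connect()
                self._local.connection = connection
            return connection

        def close(self):
            # close only the current thread's connection, ignoring errors
            connection = getattr(self._local, "connection", None)
            if connection is not None:
                with suppress(Exception):
                    connection.close()
                del self._local.connection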
diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index c02fb82f1..e5e3e312e 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -15,10 +15,11 @@ from __future__ import annotations import logging +import threading from abc import abstractmethod from contextlib import closing, suppress from enum import Enum, auto -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Tuple, TypeVar +from typing import TYPE_CHECKING, Callable, ClassVar, Optional, TypeVar from pydantic import Field, PrivateAttr, SecretStr, validator @@ -76,7 +77,7 @@ class JDBCMixin(FrozenModel): _CHECK_QUERY: ClassVar[str] = "SELECT 1" # cached JDBC connection (Java object), plus corresponding GenericOptions (Python object) - _last_connection_and_options: Optional[Tuple[Any, JDBCMixinOptions]] = PrivateAttr(default=None) + _last_connection_and_options: Optional[threading.local] = PrivateAttr(default=None) @property @abstractmethod @@ -126,6 +127,7 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # noqa: U101 def __del__(self): # noqa: WPS603 # If current object is collected by GC, close all opened connections + # This is safe because closing connection on Spark driver does not influence Spark executors self.close() @slot @@ -459,8 +461,14 @@ def _options_to_connection_properties(self, options: JDBCMixinOptions): return jdbc_options.asConnectionProperties() def _get_jdbc_connection(self, options: JDBCMixinOptions): + if not self._last_connection_and_options: + # connection class can be used in multiple threads. + # each Python thread creates its own thread in JVM + # so we need local variable to create per-thread persistent connection + self._last_connection_and_options = threading.local() + with suppress(Exception): # nothing cached, or JVM failed - last_connection, last_options = self._last_connection_and_options + last_connection, last_options = self._last_connection_and_options.data if options == last_options and not last_connection.isClosed(): return last_connection @@ -471,15 +479,18 @@ def _get_jdbc_connection(self, options: JDBCMixinOptions): driver_manager = self.spark._jvm.java.sql.DriverManager # type: ignore new_connection = driver_manager.getConnection(self.jdbc_url, connection_properties) - self._last_connection_and_options = (new_connection, options) + self._last_connection_and_options.data = (new_connection, options) return new_connection def _close_connections(self): with suppress(Exception): - last_connection, _ = self._last_connection_and_options + # connection maybe not opened yet + last_connection, _ = self._last_connection_and_options.data last_connection.close() - self._last_connection_and_options = None + with suppress(Exception): + # connection maybe not opened yet + del self._last_connection_and_options.data def _get_statement_args(self) -> tuple[int, ...]: resultset = self.spark._jvm.java.sql.ResultSet # type: ignore diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index cf9a669c9..51053df0c 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -462,6 +462,10 @@ def close(self): self.auth.cleanup(self) return self + # Do not all __del__ with calling .close(), like other connections, + # because this can influence dataframes created by this connection. 
+ # For example, .close() deletes local keytab copy. + @property def instance_url(self): return "kafka://" + self.cluster diff --git a/onetl/connection/file_connection/file_connection.py b/onetl/connection/file_connection/file_connection.py index 39e27f2c6..cc5ebbb9e 100644 --- a/onetl/connection/file_connection/file_connection.py +++ b/onetl/connection/file_connection/file_connection.py @@ -17,6 +17,7 @@ import os import threading from abc import abstractmethod +from contextlib import suppress from logging import getLogger from typing import Any, Iterable, Iterator @@ -72,8 +73,10 @@ def client(self): if client and not self._is_client_closed(client): return client except AttributeError: - self._clients_cache.client = self._get_client() - return self._clients_cache.client + pass + + self._clients_cache.client = self._get_client() + return self._clients_cache.client @slot def close(self): @@ -112,8 +115,14 @@ def close(self): except AttributeError: return self - self._close_client(client) - del self._clients_cache.client + with suppress(Exception): + # exceptions while closing client should be ignored + self._close_client(client) + + with suppress(Exception): + # .close() could be called from destructor, and modifying self is not allowed here + del self._clients_cache.client + return self def __enter__(self): @@ -122,6 +131,10 @@ def __enter__(self): def __exit__(self, _exc_type, _exc_value, _traceback): self.close() + def __del__(self): # noqa: WPS603 + # If current object is collected by GC, close opened connection + self.close() + @slot def check(self): log.info("|%s| Checking connection availability...", self.__class__.__name__) diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index ce83c8b1d..6855fe595 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -17,6 +17,7 @@ import getpass import logging import os +from contextlib import suppress from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -224,10 +225,16 @@ def close(self): """ log.debug("Reset FileSystem cache") - self._get_spark_fs().close() - object.__setattr__(self, "_active_host", None) # noqa: WPS609 + with suppress(Exception): + self._get_spark_fs().close() + + with suppress(Exception): + self._active_host = None return self + # Do not all __del__ with calling .close(), like other connections, + # because this can influence dataframes created by this connection + @slot @classmethod def get_current(cls, spark: SparkSession): @@ -360,7 +367,7 @@ def _convert_to_url(self, path: PurePathProtocol) -> str: else: host = self._get_host() # cache value to avoid getting active namenode for every path - object.__setattr__(self, "_active_host", host) # noqa: WPS609 + self._active_host = host return f"hdfs://{host}:{self.ipc_port}" + path.as_posix() def _get_default_path(self): diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index b766389cf..992e11627 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -16,6 +16,7 @@ import logging import os +from contextlib import suppress from typing import TYPE_CHECKING, ClassVar, List, Optional from etl_entities.instance import Host @@ -321,9 +322,13 @@ def close(self): connection.close() """ - self._reset_hadoop_conf() + with 
suppress(Exception): + self._reset_hadoop_conf() return self + # Do not all __del__ with calling .close(), like other connections, + # because this can influence dataframes created by this connection + @slot def check(self): self._patch_hadoop_conf() diff --git a/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py b/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py index 3b47df0d0..778d3a20c 100644 --- a/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py +++ b/tests/tests_integration/test_file_df_connection_integration/test_spark_hdfs_integration.py @@ -26,13 +26,16 @@ def test_spark_hdfs_check(hdfs_file_df_connection, caplog): def test_spark_hdfs_file_connection_check_failed(spark): from onetl.connection import SparkHDFS - with pytest.raises(RuntimeError, match="Connection is unavailable"): - SparkHDFS( - cluster="rnd-dwh", - host="hive1", - port=1234, - spark=spark, - ).check() + wrong_hdfs = SparkHDFS( + cluster="rnd-dwh", + host="hive1", + port=1234, + spark=spark, + ) + + with wrong_hdfs: + with pytest.raises(RuntimeError, match="Connection is unavailable"): + wrong_hdfs.check() def test_spark_hdfs_file_connection_check_with_hooks(spark, request, hdfs_server): From 146abdb57a287166bb9e83491f6a4261211e7467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Sep 2023 12:22:01 +0000 Subject: [PATCH 26/26] [DOP-9007] Prepare for release --- docs/changelog/0.9.4.rst | 30 ++++++++++++++++++ docs/changelog/NEXT_RELEASE.rst | 31 +++++++++++++++++++ docs/changelog/index.rst | 1 + docs/changelog/next_release/143.feature.rst | 1 - docs/changelog/next_release/144.feature.rst | 1 - docs/changelog/next_release/145.feature.rst | 1 - docs/changelog/next_release/148.feature.rst | 1 - docs/changelog/next_release/150.feature.rst | 2 -- .../next_release/151.improvement.rst | 1 - .../next_release/154.improvement.rst | 4 --- docs/changelog/next_release/156.bugfix.rst | 1 - .../next_release/156.improvement.1.rst | 1 - .../next_release/156.improvement.2.rst | 1 - 13 files changed, 62 insertions(+), 14 deletions(-) create mode 100644 docs/changelog/0.9.4.rst delete mode 100644 docs/changelog/next_release/143.feature.rst delete mode 100644 docs/changelog/next_release/144.feature.rst delete mode 100644 docs/changelog/next_release/145.feature.rst delete mode 100644 docs/changelog/next_release/148.feature.rst delete mode 100644 docs/changelog/next_release/150.feature.rst delete mode 100644 docs/changelog/next_release/151.improvement.rst delete mode 100644 docs/changelog/next_release/154.improvement.rst delete mode 100644 docs/changelog/next_release/156.bugfix.rst delete mode 100644 docs/changelog/next_release/156.improvement.1.rst delete mode 100644 docs/changelog/next_release/156.improvement.2.rst diff --git a/docs/changelog/0.9.4.rst b/docs/changelog/0.9.4.rst new file mode 100644 index 000000000..4eb406ae0 --- /dev/null +++ b/docs/changelog/0.9.4.rst @@ -0,0 +1,30 @@ +0.9.4 (2023-09-26) +================== + +Features +-------- + +- Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` (:github:pull:`143`) +- Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` (:github:pull:`144`) +- Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` (:github:pull:`145`) +- Add ``Excel`` file 
format support. (:github:pull:`148`) +- Add ``Samba`` file connection. + It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. (:github:pull:`150`) + + +Improvements +------------ + +- Add documentation about different ways of passing packages to Spark session. (:github:pull:`151`) +- Drastically improve ``Greenplum`` documentation: + * Added information about network ports, grants, ``pg_hba.conf`` and so on. + * Added interaction schemas for reading, writing and executing statements in Greenplum. + * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. (:github:pull:`154`) +- Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. (:github:pull:`156`) +- Call ``.close()`` on FileConnection then it is removed by garbage collector. (:github:pull:`156`) + + +Bug Fixes +--------- + +- Fix issue while stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. (:github:pull:`156`) diff --git a/docs/changelog/NEXT_RELEASE.rst b/docs/changelog/NEXT_RELEASE.rst index 5e26856b4..ee4196843 100644 --- a/docs/changelog/NEXT_RELEASE.rst +++ b/docs/changelog/NEXT_RELEASE.rst @@ -3,3 +3,34 @@ .. and add it to index.rst .. towncrier release notes start + +0.9.4 (2023-09-26) +================== + +Features +-------- + +- Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` (:github:pull:`143`) +- Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` (:github:pull:`144`) +- Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` (:github:pull:`145`) +- Add ``Excel`` file format support. (:github:pull:`148`) +- Add ``Samba`` file connection. + It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. (:github:pull:`150`) + + +Improvements +------------ + +- Add documentation about different ways of passing packages to Spark session. (:github:pull:`151`) +- Drastically improve ``Greenplum`` documentation: + * Added information about network ports, grants, ``pg_hba.conf`` and so on. + * Added interaction schemas for reading, writing and executing statements in Greenplum. + * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. (:github:pull:`154`) +- Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. (:github:pull:`156`) +- Call ``.close()`` on FileConnection then it is removed by garbage collector. (:github:pull:`156`) + + +Bug Fixes +--------- + +- Fix issue while stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. 
(:github:pull:`156`) diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index 92701e1e1..6130bfdc8 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -4,6 +4,7 @@ DRAFT NEXT_RELEASE + 0.9.4 0.9.3 0.9.2 0.9.1 diff --git a/docs/changelog/next_release/143.feature.rst b/docs/changelog/next_release/143.feature.rst deleted file mode 100644 index 97756efc4..000000000 --- a/docs/changelog/next_release/143.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``if_exists="ignore"`` and ``error`` to ``Hive.WriteOptions`` diff --git a/docs/changelog/next_release/144.feature.rst b/docs/changelog/next_release/144.feature.rst deleted file mode 100644 index a0cf257e4..000000000 --- a/docs/changelog/next_release/144.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``if_exists="ignore"`` and ``error`` to ``JDBC.WriteOptions`` diff --git a/docs/changelog/next_release/145.feature.rst b/docs/changelog/next_release/145.feature.rst deleted file mode 100644 index 975e0b96d..000000000 --- a/docs/changelog/next_release/145.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``if_exists="ignore"`` and ``error`` to ``MongoDB.WriteOptions`` diff --git a/docs/changelog/next_release/148.feature.rst b/docs/changelog/next_release/148.feature.rst deleted file mode 100644 index 87b1b48a8..000000000 --- a/docs/changelog/next_release/148.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``Excel`` file format support. diff --git a/docs/changelog/next_release/150.feature.rst b/docs/changelog/next_release/150.feature.rst deleted file mode 100644 index 6ea0af9ff..000000000 --- a/docs/changelog/next_release/150.feature.rst +++ /dev/null @@ -1,2 +0,0 @@ -Add ``Samba`` file connection. -It is now possible to download and upload files to Samba shared folders using ``FileDownloader``/``FileUploader``. diff --git a/docs/changelog/next_release/151.improvement.rst b/docs/changelog/next_release/151.improvement.rst deleted file mode 100644 index d8da800ae..000000000 --- a/docs/changelog/next_release/151.improvement.rst +++ /dev/null @@ -1 +0,0 @@ -Add documentation about different ways of passing packages to Spark session. diff --git a/docs/changelog/next_release/154.improvement.rst b/docs/changelog/next_release/154.improvement.rst deleted file mode 100644 index d22b5a566..000000000 --- a/docs/changelog/next_release/154.improvement.rst +++ /dev/null @@ -1,4 +0,0 @@ -Drastically improve ``Greenplum`` documentation: - * Added information about network ports, grants, ``pg_hba.conf`` and so on. - * Added interaction schemas for reading, writing and executing statements in Greenplum. - * Added recommendations about reading data from views and ``JOIN`` results from Greenplum. diff --git a/docs/changelog/next_release/156.bugfix.rst b/docs/changelog/next_release/156.bugfix.rst deleted file mode 100644 index 2953ab3d6..000000000 --- a/docs/changelog/next_release/156.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix issue while stopping Python interpreter calls ``JDBCMixin.close()`` and prints exceptions to log. diff --git a/docs/changelog/next_release/156.improvement.1.rst b/docs/changelog/next_release/156.improvement.1.rst deleted file mode 100644 index 5607eb69c..000000000 --- a/docs/changelog/next_release/156.improvement.1.rst +++ /dev/null @@ -1 +0,0 @@ -Make ``.fetch`` and ``.execute`` methods of DB connections thread-safe. Each thread works with its own connection. 
diff --git a/docs/changelog/next_release/156.improvement.2.rst b/docs/changelog/next_release/156.improvement.2.rst deleted file mode 100644 index 5824c8a9b..000000000 --- a/docs/changelog/next_release/156.improvement.2.rst +++ /dev/null @@ -1 +0,0 @@ -Call ``.close()`` on FileConnection then it is removed by garbage collector.
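Taken together, the ``__del__``-related fixes in this series follow a single pattern: close resources when an object is garbage-collected, but never let exceptions escape (for example, during interpreter shutdown). A small sketch of that pattern, using a hypothetical ``client`` attribute rather than any real onETL internals:

.. code:: python

    from contextlib import suppress


    class ClosableResource:
        """Sketch of the close-on-GC pattern applied to FileConnection and JDBCMixin here."""

        def __init__(self, client):
            self.client = client  # hypothetical underlying client/handle

        def close(self):
            # ignore errors: .close() may be called from __del__ while the interpreter shuts down
            with suppress(Exception):
                self.client.close()
            return self

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.close()

        def __del__(self):  # noqa: WPS603
            # if this object is collected by GC, close the underlying client silently
            self.close()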