Skip to content

Commit

Permalink
[DOP-21732] Fix Oracle reading with partitioning_mode=hash
Browse files Browse the repository at this point in the history
  • Loading branch information
dolfinus committed Nov 27, 2024
1 parent 8c39d1d commit 83e6c80
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/changelog/next_release/319.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix ``DBReader(conn=oracle, options={"partitioning_mode": "hash"})`` leading to data skew in the last partition due to wrong ``ora_hash`` usage.
2 changes: 1 addition & 1 deletion onetl/connection/db_connection/clickhouse/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

class ClickhouseDialect(JDBCDialect):
def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str:
return f"modulo(halfMD5({partition_column}), {num_partitions})"
return f"halfMD5({partition_column}) % {num_partitions}"

def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str:
return f"{partition_column} % {num_partitions}"
Expand Down
2 changes: 1 addition & 1 deletion onetl/connection/db_connection/mssql/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class MSSQLDialect(JDBCDialect):
# https://docs.microsoft.com/ru-ru/sql/t-sql/functions/hashbytes-transact-sql?view=sql-server-ver16
def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str:
return f"CONVERT(BIGINT, HASHBYTES ( 'SHA' , {partition_column} )) % {num_partitions}"
return f"CONVERT(BIGINT, HASHBYTES ('SHA', {partition_column})) % {num_partitions}"

def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str:
return f"{partition_column} % {num_partitions}"
Expand Down
4 changes: 3 additions & 1 deletion onetl/connection/db_connection/oracle/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def get_sql_query(
)

def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str:
return f"ora_hash({partition_column}, {num_partitions})"
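# ora_hash(expr, N) returns bucket numbers from 0 to N inclusive, i.e. N+1 distinct values.
# Spreading N+1 buckets over N partitions puts two buckets into the last partition (data skew),
# so pass num_partitions - 1 as the maximum bucket number.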
return f"ora_hash({partition_column}, {num_partitions - 1})"

def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str:
return f"MOD({partition_column}, {num_partitions})"
Expand Down

0 comments on commit 83e6c80

Please sign in to comment.