Skip to content

Commit

Permalink
feat(table): add Table.diff method (#5814)
Browse files Browse the repository at this point in the history
Fixes #5756

---------

Co-authored-by: Chip Kent <[email protected]>
  • Loading branch information
jmao-denver and chipkent authored Aug 10, 2024
1 parent f3fb208 commit d26e1b2
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 8 deletions.
68 changes: 67 additions & 1 deletion py/server/deephaven/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import inspect
from enum import Enum
from enum import auto
from typing import Any, Optional, Callable, Dict, Generator, Tuple
from typing import Any, Optional, Callable, Dict, Generator, Tuple, Literal
from typing import Sequence, List, Union, Protocol

import jpy
Expand Down Expand Up @@ -43,6 +43,8 @@
_JSearchDisplayMode = jpy.get_type("io.deephaven.engine.util.LayoutHintBuilder$SearchDisplayModes")
_JSnapshotWhenOptions = jpy.get_type("io.deephaven.api.snapshot.SnapshotWhenOptions")
_JBlinkTableTools = jpy.get_type("io.deephaven.engine.table.impl.BlinkTableTools")
_JDiffItems = jpy.get_type("io.deephaven.engine.util.TableDiff$DiffItems")
_JEnumSet = jpy.get_type("java.util.EnumSet")

# PartitionedTable
_JPartitionedTable = jpy.get_type("io.deephaven.engine.table.PartitionedTable")
Expand Down Expand Up @@ -3765,3 +3767,67 @@ def multi_join(input: Union[Table, Sequence[Table], MultiJoinInput, Sequence[Mul
table() method.
"""
return MultiJoinTable(input, on)


# region utility functions

def table_diff(t1: Table, t2: Table, max_diffs: int = 1,
               floating_comparison: Literal['exact', 'absolute', 'relative'] = 'exact',
               ignore_column_order: bool = False) -> str:
    """Compares two tables and describes how they differ in a human-readable string.

    An empty string means that no difference was found. The comparison first looks at the table sizes and then at the
    schemas of the two tables (number of columns, column names, column types, column order). If the schemas differ,
    the comparison stops there and those differences are returned. Otherwise the data is compared column by column
    (not row by row), and only the first difference found in each column is recorded.

    Because floating point numbers are inherently imprecise, an inexact comparison is sometimes preferable. With
    floating_comparison set to 'absolute', the absolute value of the difference between two floating numbers is
    compared against a threshold (0.0001 for Doubles, 0.005 for Floats) and only differences greater than the
    threshold are recorded. With 'relative', the relative difference is compared against the threshold instead; it is
    calculated as the absolute difference divided by the smaller absolute value of the two numbers.

    Args:
        t1 (Table): the table to compare
        t2 (Table): the table to compare against
        max_diffs (int): the maximum number of differences to return, must be at least 1, default is 1
        floating_comparison (Literal['exact', 'absolute', 'relative']): how floating numbers are compared,
            default is 'exact'
        ignore_column_order (bool): when True, columns that exist in both tables but in different positions are not
            treated as differences; when False (default), column order matters

    Returns:
        a string describing the differences, empty when none are found

    Raises:
        DHError: if an argument is invalid or the comparison itself fails
    """
    try:
        # Argument checks live inside the try block so that callers only ever see DHError.
        if max_diffs < 1:
            raise ValueError("max_diffs must be greater than 0.")

        if floating_comparison not in ('exact', 'absolute', 'relative'):
            raise ValueError("floating_comparison must be one of 'exact', 'absolute', or 'relative'.")

        options = []
        if floating_comparison != 'exact':
            # Any inexact mode sets DoublesExact; 'relative' additionally sets DoubleFraction.
            options.append(_JDiffItems.DoublesExact)
            if floating_comparison == 'relative':
                options.append(_JDiffItems.DoubleFraction)
        if ignore_column_order:
            options.append(_JDiffItems.ColumnsOrder)

        with auto_locking_ctx(t1, t2):
            if not options:
                return _JTableTools.diff(t1.j_table, t2.j_table, max_diffs)
            return _JTableTools.diff(t1.j_table, t2.j_table, max_diffs, _JEnumSet.of(*options))
    except Exception as e:
        raise DHError(e, "table diff failed") from e

# endregion
61 changes: 59 additions & 2 deletions py/server/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
from deephaven.html import to_html
from deephaven.jcompat import j_hashmap
from deephaven.pandas import to_pandas
from deephaven.stream.table_publisher import table_publisher
from deephaven.table import Table, SearchDisplayMode
from deephaven.table import Table, SearchDisplayMode, table_diff
from tests.testbase import BaseTestCase, table_equals


Expand Down Expand Up @@ -1124,6 +1123,64 @@ def test_arg_validation(self):
t.partition_by("A", "B")
self.assertIn("drop_keys must be", str(cm.exception))

def test_table_diff(self):
    """Checks table_diff for plain data differences, column-order handling and the floating comparison modes."""
    with self.subTest("diff"):
        base = empty_table(10).update(["A = i", "B = i", "C = i"])
        changed = empty_table(10).update(["A = i", "B = i % 2 == 0? i: i + 1", "C = i % 2 == 0? i + 1: i"])
        # B first differs on an odd row (row 1) and C on an even row (row 0).
        # NOTE(review): the counts below are one more than the number of reported differences, presumably because
        # the report ends with a newline - confirm against TableTools.diff output.
        report = table_diff(base, changed, max_diffs=10).split("\n")
        self.assertEqual(len(report), 3)
        self.assertIn("row 1", report[0])
        self.assertIn("row 0", report[1])

        # With the default max_diffs of 1, only one difference is reported.
        report = table_diff(base, changed).split("\n")
        self.assertEqual(len(report), 2)

    with self.subTest("diff - ignore column order"):
        a_then_b = empty_table(10).update(["A = i", "B = i + 1"])
        b_then_a = empty_table(10).update(["B = i + 1", "A = i"])
        report = table_diff(a_then_b, b_then_a, max_diffs=10).split("\n")
        self.assertEqual(len(report), 3)

        a_then_b = empty_table(10).update(["A = i", "B = i"])
        b_then_a = empty_table(10).update(["B = i", "A = i"])
        outcome = table_diff(a_then_b, b_then_a, max_diffs=10, ignore_column_order=True)
        self.assertEqual(outcome, "")

    with self.subTest("diff - floating_comparison = 'absolute'-double"):
        # The same pair of tables is compared exactly first, then with the absolute tolerance.
        expected = empty_table(10).update(["A = i", "B = i + 1.0"])
        nearby = empty_table(10).update(["A = i", "B = i + 1.00001"])
        report = table_diff(expected, nearby, max_diffs=10, floating_comparison='exact').split("\n")
        self.assertEqual(len(report), 2)

        outcome = table_diff(expected, nearby, max_diffs=10, floating_comparison='absolute')
        self.assertEqual(outcome, "")

    with self.subTest("diff - floating_comparison = 'absolute'-float"):
        expected = empty_table(10).update(["A = i", "B = (float)(i + 1.0)"])
        nearby = empty_table(10).update(["A = i", "B = (float)(i + 1.005)"])
        report = table_diff(expected, nearby, max_diffs=10, floating_comparison='exact').split("\n")
        self.assertEqual(len(report), 2)

        # 1.005 would cause the difference to be greater than 0.005, something like 0.00500001144
        nearby = empty_table(10).update(["A = i", "B = (float)(i + 1.004999)"])
        outcome = table_diff(expected, nearby, max_diffs=10, floating_comparison='absolute')
        self.assertEqual(outcome, "")

    with self.subTest("diff - floating_comparison='relative'-double"):
        expected = empty_table(10).update(["A = i", "B = i + 1.0"])
        nearby = empty_table(10).update(["A = i", "B = i + 1.00001"])
        outcome = table_diff(expected, nearby, max_diffs=10, floating_comparison='relative')
        self.assertEqual(outcome, "")

    with self.subTest("diff - floating_comparison='relative'-float"):
        expected = empty_table(10).update(["A = i", "B = (float)(i + 1.0)"])
        nearby = empty_table(10).update(["A = i", "B = (float)(i + 1.005)"])
        outcome = table_diff(expected, nearby, max_diffs=10, floating_comparison='relative')
        self.assertFalse(outcome)


if __name__ == "__main__":
unittest.main()
7 changes: 2 additions & 5 deletions py/server/tests/testbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,13 @@
from deephaven.liveness_scope import liveness_scope

from deephaven.update_graph import exclusive_lock
from deephaven.table import Table, PartitionedTableProxy
from deephaven.table import Table, PartitionedTableProxy, table_diff

from test_helper import py_dh_session

_JTableTools = jpy.get_type("io.deephaven.engine.util.TableTools")


def table_equals(table_a: Table, table_b: Table) -> bool:
    """Reports whether two tables are equal, i.e. table_diff finds no difference between them.

    Args:
        table_a (Table): the first table
        table_b (Table): the second table

    Returns:
        True if no difference is reported, False otherwise

    Raises:
        DHError: if the comparison fails
    """
    try:
        # A single difference is enough to decide inequality, so max_diffs is 1; an empty report means equal.
        return not table_diff(table_a, table_b, 1)
    except Exception as e:
        raise DHError(e, "table equality test failed.") from e

Expand Down

0 comments on commit d26e1b2

Please sign in to comment.