tensorflow · lijuanzhang78 · May 1, 2023 · May 1, 2023 · May 3, 2023 · May 3, 2023
diff --git a/tests/test_atds_avro/benchmark/test_atds_autotuning_benchmark.py b/tests/test_atds_avro/benchmark/test_atds_autotuning_benchmark.py
@@ -0,0 +1,54 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""ATDS benchmark with autotuning."""
+
+import pytest
+import tensorflow as tf
+
+from tests.test_atds_avro.utils.data_source import DataSource
+from tests.test_atds_avro.utils.data_source_registry import LARGE_NUM_RECORDS
+from tests.test_atds_avro.utils.atds_benchmark_utils import (
+    run_atds_benchmark_from_data_source,
+)
+from tests.test_atds_avro.utils.benchmark_utils import MIXED_TYPES_SCENARIO
+
+# Batch sizes: powers of two from 8 through 1024.
+BATCH_SIZES = [2**exponent for exponent in range(3, 11)]
+# Parallelism: fixed values 1 through 6, then tf.data.AUTOTUNE.
+PARALLELISM = [*range(1, 7), tf.data.AUTOTUNE]
+# Every (batch size, parallelism) pair, batch size varying slowest; the shuffle
+# buffer size (1024) and codec ("deflate") are the same in every case.
+PARAMS = [(b, 1024, "deflate", p) for b in BATCH_SIZES for p in PARALLELISM]
+
+
+@pytest.mark.benchmark(group="autotuning")
+@pytest.mark.parametrize(
+    ["batch_size", "shuffle_buffer_size", "codec", "parallelism"], PARAMS
+)
+def test_autotuning(batch_size, shuffle_buffer_size, codec, parallelism, benchmark):
+    """Run the ATDS benchmark for one PARAMS case on MIXED_TYPES_SCENARIO."""
+    mixed_source = DataSource(
+        scenario=MIXED_TYPES_SCENARIO, num_records=LARGE_NUM_RECORDS
+    )
+    # Keyword options are gathered first so the runner call itself stays short.
+    reader_options = {
+        "parallelism": parallelism,
+        "codec": codec,
+        "shuffle_buffer_size": shuffle_buffer_size,
+        "rounds": 10,
+    }
+    run_atds_benchmark_from_data_source(
+        mixed_source, batch_size, benchmark, **reader_options
+    )
diff --git a/tests/test_atds_avro/benchmark/test_atds_parallelism_benchmark.py b/tests/test_atds_avro/benchmark/test_atds_parallelism_benchmark.py
@@ -0,0 +1,107 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""ATDS benchmark with parallelism."""
+
+import pytest
+import tensorflow as tf
+
+from tests.test_atds_avro.utils.data_source import DataSource
+from tests.test_atds_avro.utils.data_source_registry import LARGE_NUM_RECORDS
+from tests.test_atds_avro.utils.atds_benchmark_utils import (
+    run_atds_benchmark_from_data_source,
+)
+from tests.test_atds_avro.utils.benchmark_utils import MIXED_TYPES_SCENARIO
+
+
+@pytest.mark.benchmark(group="parallelism")
+@pytest.mark.parametrize(
+    ["batch_size", "shuffle_buffer_size", "codec", "parallelism"],
+    [
+        # Batch size 128 and shuffle buffer size 1024 are fixed. Each codec is
+        # paired with parallelism 1 through 6, with the codec varying slowest,
+        # which gives 18 cases in total.
+        (128, 1024, codec_name, num_parallel)
+        for codec_name in ("null", "deflate", "snappy")
+        for num_parallel in range(1, 7)
+    ],
+)
+def test_parallelism(batch_size, shuffle_buffer_size, codec, parallelism, benchmark):
+    """Run the ATDS benchmark for one (codec, parallelism) combination.
+
+    A fresh DataSource is built for every case from MIXED_TYPES_SCENARIO with
+    LARGE_NUM_RECORDS records, and the runner is always called with
+    rounds=10.
+
+    Args:
+        batch_size: passed positionally to the benchmark runner.
+        shuffle_buffer_size: passed on as ``shuffle_buffer_size``.
+        codec: passed on as ``codec``; "null", "deflate" or "snappy" here.
+        parallelism: passed on as ``parallelism``; 1 through 6 here.
+        benchmark: the ``benchmark`` fixture, handed to the runner.
+    """
+    mixed_source = DataSource(
+        scenario=MIXED_TYPES_SCENARIO, num_records=LARGE_NUM_RECORDS
+    )
+    # Keyword options are gathered first so the runner call itself stays short.
+    reader_options = {
+        "parallelism": parallelism,
+        "codec": codec,
+        "shuffle_buffer_size": shuffle_buffer_size,
+        "rounds": 10,
+    }
+    run_atds_benchmark_from_data_source(
+        mixed_source, batch_size, benchmark, **reader_options
+    )
+
+
+@pytest.mark.benchmark(group="parallelism")
+@pytest.mark.parametrize(
+    ["batch_size", "shuffle_buffer_size", "parallelism", "interleave"],
+    [
+        # Both batch sizes run the same eight (parallelism, interleave) pairs.
+        (batch, 1024, num_parallel, num_interleave)
+        for batch in (32, 128)
+        for num_parallel, num_interleave in (
+            # Fixed pairs whose product is always 6.
+            (1, 6),
+            (2, 3),
+            (3, 2),
+            (6, 1),
+            # AUTOTUNE parallelism against each interleave value.
+            (tf.data.AUTOTUNE, 1),
+            (tf.data.AUTOTUNE, 2),
+            (tf.data.AUTOTUNE, 3),
+            (tf.data.AUTOTUNE, 6),
+        )
+    ],
+)
+def test_parallelism_with_interleave(
+    batch_size, shuffle_buffer_size, parallelism, interleave, benchmark
+):
+    """Run the ATDS benchmark for one parallelism/interleave pair (6 partitions)."""
+    # NOTE(review): unlike test_parallelism, no ``rounds`` argument is passed
+    # here -- confirm that relying on the runner's default is intended.
+    partitioned_source = DataSource(
+        scenario=MIXED_TYPES_SCENARIO, num_records=LARGE_NUM_RECORDS, partitions=6
+    )
+    run_atds_benchmark_from_data_source(
+        partitioned_source,
+        batch_size,
+        benchmark,
+        codec="deflate",
+        parallelism=parallelism,
+        interleave_parallelism=interleave,
+        shuffle_buffer_size=shuffle_buffer_size,
+    )
diff --git a/tests/test_atds_avro/benchmark/test_codec_atds_benchmark.py b/tests/test_atds_avro/benchmark/test_codec_atds_benchmark.py
@@ -0,0 +1,37 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""ATDS benchmark with different codecs."""
+
+import pytest
+
+from tests.test_atds_avro.utils.data_source import DataSource
+from tests.test_atds_avro.utils.data_source_registry import SMALL_NUM_RECORDS
+from tests.test_atds_avro.utils.atds_benchmark_utils import (
+    run_atds_benchmark_from_data_source,
+)
+from tests.test_atds_avro.utils.benchmark_utils import MIXED_TYPES_SCENARIO
+
+
+@pytest.mark.benchmark(group="codec")
+@pytest.mark.parametrize(
+    ["batch_size", "codec"],
+    [(128, codec_name) for codec_name in ("null", "deflate", "snappy")],
+)
+def test_codec(batch_size, codec, benchmark):
+    """Run the ATDS benchmark for a single codec on MIXED_TYPES_SCENARIO."""
+    source = DataSource(
+        scenario=MIXED_TYPES_SCENARIO, num_records=SMALL_NUM_RECORDS
+    )
+    run_atds_benchmark_from_data_source(source, batch_size, benchmark, codec=codec)
diff --git a/tests/test_atds_avro/benchmark/test_mixed_benchmark.py b/tests/test_atds_avro/benchmark/test_mixed_benchmark.py
@@ -0,0 +1,108 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""ATDS benchmark for schema with mixed data types."""
+
+import glob
+import os
+import pytest
+import tensorflow as tf
+
+from tests.test_atds_avro.utils.data_source import DataSource
+from tests.test_atds_avro.utils.data_source_registry import SMALL_NUM_RECORDS
+from tests.test_atds_avro.utils.generator.tensor_generator import (
+    IntTensorGenerator,
+    FloatTensorGenerator,
+    WordTensorGenerator,
+)
+from tests.test_atds_avro.utils.generator.sparse_tensor_generator import (
+    FloatSparseTensorGenerator,
+    ValueDistribution,
+)
+from tests.test_atds_avro.utils.atds_writer import ATDSWriter
+from tests.test_atds_avro.utils.benchmark_utils import benchmark_func
+from tests.test_atds_avro.utils.atds_benchmark_utils import (
+    get_dataset,
+    get_features_from_data_source,
+)
+
+
+@pytest.mark.benchmark(group="mixed")
+def test_mixed_benchmark_data():
+    """Write a 19-feature mixed-types data source and run benchmark_func on it.
+
+    The scenario combines dense and sparse float features with dense int and
+    string features. The data source is created with 10 partitions, written
+    with ATDSWriter, loaded through get_dataset, unbatched and then handed to
+    benchmark_func.
+    """
+    # NOTE(review): this test carries the benchmark marker but never requests
+    # the ``benchmark`` fixture -- confirm that is intended.
+
+    # Dense float32 feature of the given shape.
+    def dense_float(shape):
+        return FloatTensorGenerator(tf.TensorSpec(shape, tf.dtypes.float32))
+
+    # Sparse float32 feature of the given shape, SINGLE_VALUE distribution.
+    def sparse_float(shape):
+        return FloatSparseTensorGenerator(
+            tf.SparseTensorSpec(shape, tf.dtypes.float32),
+            ValueDistribution.SINGLE_VALUE,
+        )
+
+    # Scalar (shape []) integer feature of the given dtype.
+    def scalar_int(dtype):
+        return IntTensorGenerator(tf.TensorSpec([], dtype))
+
+    # Scalar (shape []) string feature generated with avg_length=24.
+    def scalar_word():
+        return WordTensorGenerator(
+            tf.TensorSpec([], tf.dtypes.string), avg_length=24
+        )
+
+    # Features are declared in a fixed order; each call builds a new generator.
+    scenario = {
+        "sparse_1d_float_small_1": sparse_float([3]),
+        "sparse_1d_float_large": sparse_float([50001]),
+        "dense_0d_float": dense_float([]),
+        "dense_1d_float_large_1": dense_float([200]),
+        "dense_0d_int_1": scalar_int(tf.dtypes.int32),
+        "sparse_1d_float_medium_1": sparse_float([10]),
+        "dense_1d_float_large_2": dense_float([200]),
+        "dense_1d_float_small_1": dense_float([2]),
+        "dense_1d_float_large_3": dense_float([200]),
+        "dense_1d_float_small_2": dense_float([2]),
+        "dense_1d_float_small_3": dense_float([2]),
+        "sparse_1d_float_medium_2": sparse_float([51]),
+        "sparse_1d_float_small_2": sparse_float([3]),
+        "dense_1d_float_large_4": dense_float([200]),
+        "dense_1d_float_small_4": dense_float([1]),
+        "dense_0d_string_1": scalar_word(),
+        "dense_0d_int_2": scalar_int(tf.dtypes.int32),
+        "dense_0d_string_2": scalar_word(),
+        "dense_0d_long": scalar_int(tf.dtypes.int64),
+    }
+    num_partitions = 10
+    data_source = DataSource(
+        scenario=scenario, num_records=SMALL_NUM_RECORDS, partitions=num_partitions
+    )
+    with ATDSWriter() as writer:
+        output_dir = writer.write(data_source)
+        file_pattern = os.path.join(output_dir, f"*.{writer.extension}")
+        # The files are globbed before the feature config is requested.
+        matched_files = glob.glob(file_pattern)
+        features = get_features_from_data_source(writer, data_source)
+        dataset = get_dataset(matched_files, features)
+        unbatched = dataset.unbatch()
+        benchmark_func(unbatched)
diff --git a/tests/test_atds_avro/benchmark/test_multiple_files_atds_benchmark.py b/tests/test_atds_avro/benchmark/test_multiple_files_atds_benchmark.py
@@ -0,0 +1,40 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""ATDS benchmark with multiple files."""
+
+import pytest
+
+from tests.test_atds_avro.utils.data_source import DataSource
+from tests.test_atds_avro.utils.data_source_registry import (
+    LARGE_NUM_RECORDS,
+    MULTIPLE_PARTITION,
+)
+from tests.test_atds_avro.utils.atds_benchmark_utils import (
+    run_atds_benchmark_from_data_source,
+)
+from tests.test_atds_avro.utils.benchmark_utils import MIXED_TYPES_SCENARIO
+
+
+@pytest.mark.benchmark(group="multi_partition")
+@pytest.mark.parametrize(["batch_size", "partitions"], [(128, MULTIPLE_PARTITION)])
+def test_multiple_partitions(batch_size, partitions, benchmark):
+    """Run the ATDS benchmark with the data source's ``partitions`` set."""
+    source_options = {
+        "scenario": MIXED_TYPES_SCENARIO,
+        "num_records": LARGE_NUM_RECORDS,
+        "partitions": partitions,
+    }
+    partitioned_source = DataSource(**source_options)
+    run_atds_benchmark_from_data_source(partitioned_source, batch_size, benchmark)