Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Python bindings to annotating_importer. #139

Merged
merged 7 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions gematria/datasets/python/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,58 @@ gematria_py_binary(
],
)

# Python extension module built from annotating_importer.cc; it is imported
# from Python as gematria.datasets.python.annotating_importer.
gematria_pybind_extension(
    name = "annotating_importer",
    srcs = ["annotating_importer.cc"],
    # NOTE(review): presumably the Python targets that must be available when
    # the extension is loaded (the canonicalizer wrapper and the generated
    # proto modules) - confirm against the gematria_pybind_extension macro.
    py_deps = [
        "//gematria/llvm/python:canonicalizer",
        "//gematria/proto:basic_block_py_pb2",
        "//gematria/proto:canonicalized_instruction_py_pb2",
        "//gematria/proto:throughput_py_pb2",
    ],
    visibility = ["//:internal_users"],
    # C++ dependencies of annotating_importer.cc: the wrapped importer library
    # plus the pybind11 casters for protos and absl::Status.
    deps = [
        "//gematria/basic_block:basic_block_protos",
        "//gematria/datasets:annotating_importer",
        "//gematria/datasets:bhive_importer",
        "//gematria/llvm:canonicalizer",
        "@com_google_pybind11_protobuf//pybind11_protobuf:native_proto_caster",
        "@llvm-project//llvm:Support",
        "@pybind11_abseil_repo//pybind11_abseil:import_status_module",
        "@pybind11_abseil_repo//pybind11_abseil:status_casters",
    ],
)

# Python test for the annotating_importer extension module.
gematria_py_test(
    name = "annotating_importer_test",
    size = "small",
    srcs = ["annotating_importer_test.py"],
    # Test inputs: an x86 ELF object and the matching perf.data file. The test
    # locates both through the runfiles library.
    data = [
        "//gematria/testing/testdata:simple_x86_elf_object",
        "//gematria/testing/testdata:simple_x86_elf_object.perf.data",
    ],
    deps = [
        ":annotating_importer",
        "//gematria/llvm/python:canonicalizer",
        "//gematria/llvm/python:llvm_architecture_support",
        "//gematria/proto:basic_block_py_pb2",
        "//gematria/proto:canonicalized_instruction_py_pb2",
        "//gematria/proto:throughput_py_pb2",
        "@rules_python//python/runfiles",
    ],
)

# Command-line tool that writes annotated basic blocks to a TFRecord file.
# NOTE(review): import_annotated_basic_blocks.py also imports
# gematria.proto.basic_block_pb2 and tensorflow, neither of which is listed
# directly in `deps`; presumably they are provided transitively or by the
# gematria_py_binary macro - confirm.
gematria_py_binary(
    name = "import_annotated_basic_blocks",
    srcs = ["import_annotated_basic_blocks.py"],
    deps = [
        ":annotating_importer",
        "//gematria/llvm/python:canonicalizer",
        "//gematria/llvm/python:llvm_architecture_support",
        "//gematria/utils/python:pybind11_abseil_status",
    ],
)

gematria_py_binary(
name = "extract_tokens_file",
srcs = ["extract_tokens_file.py"],
Expand Down
81 changes: 81 additions & 0 deletions gematria/datasets/python/annotating_importer.cc
virajbshah marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// Copyright 2024 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "gematria/datasets/annotating_importer.h"

#include <algorithm>
#include <cstdint>
#include <optional>
#include <string>
#include <string_view>
#include <tuple>
#include <vector>

#include "gematria/llvm/canonicalizer.h"
#include "llvm/Support/Error.h"
#include "pybind11/cast.h"
#include "pybind11/detail/common.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include "pybind11_abseil/import_status_module.h"
#include "pybind11_abseil/status_casters.h"
#include "pybind11_protobuf/native_proto_caster.h"

namespace gematria {

namespace py = ::pybind11;

// Defines the `annotating_importer` Python extension module, which exposes
// AnnotatingImporter and its GetAnnotatedBasicBlockProtos method to Python.
PYBIND11_MODULE(annotating_importer, m) {
  m.doc() = "Support code for importing annotated basic block data.";

  // Makes the pybind11_abseil status module available, so that non-OK
  // statuses returned by the bound method can be raised as `StatusNotOk`.
  py::google::ImportStatusModule();
  // pybind11_protobuf documents this call as required in every module
  // definition that uses the native proto casters; the bound method below
  // returns protos through them.
  pybind11_protobuf::ImportNativeProtoCasters();

  py::class_<AnnotatingImporter>(m, "AnnotatingImporter")
      .def(  //
          py::init<const Canonicalizer* /* canonicalizer */>(),
          py::arg("canonicalizer"),
          // The importer receives the canonicalizer as a raw pointer; keep
          // the Python canonicalizer object (argument 2) alive at least as
          // long as the importer object (argument 1).
          py::keep_alive<1, 2>(),
          R"(Initializes a new annotation collector for a given architecture.

          Args:
            canonicalizer: The canonicalizer used to disassemble instructions
              and convert them to the Gematria proto representation.)")
      .def(  //
          "get_annotated_basic_block_protos",
          &AnnotatingImporter::GetAnnotatedBasicBlockProtos,
          py::arg("elf_file_name"), py::arg("perf_data_file_name"),
          py::arg("source_name"),
          R"(Creates annotated BasicBlockProtos from an ELF object and samples.

          Reads an ELF object along with a corresponding `perf.data`-like file
          and creates a list of annotated `BasicBlockProto`s consisting of
          basic blocks from the ELF object annotated using samples from the
          `perf.data`-like file.

          Args:
            elf_file_name: The path to the ELF object from which basic blocks
              are to be extracted.
            perf_data_file_name: The path to the `perf.data`-like file from
              which samples are to be extracted along with LBR data.
            source_name: The source name the timing data in the annotated
              `BasicBlockProto`s should be attributed to.

          Returns:
            A list of annotated `BasicBlockProto`s.

          Raises:
            StatusNotOk: When extracting basic blocks and samples or creating
              the annotated `BasicBlockProto`s fails.)");
}

}  // namespace gematria
148 changes: 148 additions & 0 deletions gematria/datasets/python/annotating_importer_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Copyright 2024 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from absl.testing import absltest
from gematria.datasets.python import annotating_importer
from gematria.llvm.python import canonicalizer
from gematria.llvm.python import llvm_architecture_support
from gematria.proto import basic_block_pb2
from gematria.proto import canonicalized_instruction_pb2
from gematria.proto import throughput_pb2
from rules_python.python.runfiles import runfiles

# Short aliases for the proto message classes used repeatedly below.
_CanonicalizedOperandProto = (
    canonicalized_instruction_pb2.CanonicalizedOperandProto
)
_CanonicalizedInstructionProto = (
    canonicalized_instruction_pb2.CanonicalizedInstructionProto
)


# The basic block the test expects the importer to return: four x86
# instructions (mov, imul, add, dec), given both as raw machine instructions
# and in canonicalized form.
_EXPECTED_BASIC_BLOCK_PROTO = basic_block_pb2.BasicBlockProto(
    # Consecutive addresses differ by the byte length of the preceding
    # instruction's `machine_code` (2, 3 and 2 bytes).
    machine_instructions=(
        basic_block_pb2.MachineInstructionProto(
            assembly="\tmovl\t%ecx, %edx",
            address=18446744073709547787,
            machine_code=b"\211\312",
        ),
        basic_block_pb2.MachineInstructionProto(
            assembly="\timull\t%edx, %edx",
            address=18446744073709547789,
            machine_code=b"\017\257\322",
        ),
        basic_block_pb2.MachineInstructionProto(
            assembly="\taddl\t%edx, %eax",
            address=18446744073709547792,
            machine_code=b"\001\320",
        ),
        basic_block_pb2.MachineInstructionProto(
            assembly="\tdecl\t%ecx",
            address=18446744073709547794,
            machine_code=b"\377\311",
        ),
    ),
    # One canonicalized instruction per machine instruction above, in the same
    # order.
    canonicalized_instructions=(
        _CanonicalizedInstructionProto(
            mnemonic="MOV",
            llvm_mnemonic="MOV32rr",
            output_operands=(_CanonicalizedOperandProto(register_name="EDX"),),
            input_operands=(_CanonicalizedOperandProto(register_name="ECX"),),
        ),
        _CanonicalizedInstructionProto(
            mnemonic="IMUL",
            llvm_mnemonic="IMUL32rr",
            output_operands=(_CanonicalizedOperandProto(register_name="EDX"),),
            input_operands=(
                _CanonicalizedOperandProto(register_name="EDX"),
                _CanonicalizedOperandProto(register_name="EDX"),
            ),
            implicit_output_operands=(
                _CanonicalizedOperandProto(register_name="EFLAGS"),
            ),
        ),
        _CanonicalizedInstructionProto(
            mnemonic="ADD",
            llvm_mnemonic="ADD32rr",
            output_operands=(_CanonicalizedOperandProto(register_name="EAX"),),
            input_operands=(
                _CanonicalizedOperandProto(register_name="EAX"),
                _CanonicalizedOperandProto(register_name="EDX"),
            ),
            implicit_output_operands=(
                _CanonicalizedOperandProto(register_name="EFLAGS"),
            ),
        ),
        _CanonicalizedInstructionProto(
            mnemonic="DEC",
            llvm_mnemonic="DEC32r",
            output_operands=(_CanonicalizedOperandProto(register_name="ECX"),),
            input_operands=(_CanonicalizedOperandProto(register_name="ECX"),),
            implicit_output_operands=(
                _CanonicalizedOperandProto(register_name="EFLAGS"),
            ),
        ),
    ),
)


class AnnotatingImporterTest(absltest.TestCase):
  """Tests the Python bindings of `AnnotatingImporter`."""

  # Runfiles-relative paths of the test inputs.
  _ELF_OBJECT_FILEPATH = (
      r"com_google_gematria/gematria/testing/testdata/simple_x86_elf_object"
  )
  _PERF_DATA_FILEPATH = (
      r"com_google_gematria/gematria/testing/testdata/"
      r"simple_x86_elf_object.perf.data"
  )
  # Throughput source name passed to the importer and expected back in the
  # returned protos.
  _SOURCE_NAME = "test: skl"

  def setUp(self):
    """Creates the x86-64 canonicalizer and the runfiles lookup object."""
    super().setUp()

    self._x86_llvm = llvm_architecture_support.LlvmArchitectureSupport.x86_64()
    self._x86_canonicalizer = canonicalizer.Canonicalizer.x86_64(self._x86_llvm)

    self._runfiles_dir = os.environ.get("PYTHON_RUNFILES")
    self._runfiles_env = runfiles.Create({"RUNFILES_DIR": self._runfiles_dir})
    self.assertIsNotNone(self._runfiles_env)

  def test_x86_basic_block_proto_from_binary_and_profile(self):
    """Checks the protos produced from the test ELF object and perf data."""
    importer = annotating_importer.AnnotatingImporter(self._x86_canonicalizer)
    block_protos = importer.get_annotated_basic_block_protos(
        elf_file_name=self._runfiles_env.Rlocation(self._ELF_OBJECT_FILEPATH),
        perf_data_file_name=self._runfiles_env.Rlocation(
            self._PERF_DATA_FILEPATH
        ),
        source_name=self._SOURCE_NAME,
    )
    self.assertSequenceEqual(
        block_protos,
        (
            throughput_pb2.BasicBlockWithThroughputProto(
                basic_block=_EXPECTED_BASIC_BLOCK_PROTO,
                inverse_throughputs=(
                    throughput_pb2.ThroughputWithSourceProto(
                        # Use the same constant that was passed to the
                        # importer so the two values cannot drift apart.
                        source=self._SOURCE_NAME,
                        inverse_throughput_cycles=[1.532258064516129],
                    ),
                ),
            ),
        ),
    )


# Run the tests when this file is executed as a script.
if __name__ == "__main__":
  absltest.main()
99 changes: 99 additions & 0 deletions gematria/datasets/python/import_annotated_basic_blocks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright 2024 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Creates an annotated Gematria data set from an ELF object and perf samples.

Reads basic blocks from an ELF object, along with event samples and block
latencies from a `perf.data`-like file, usually generated using `perf record`,
and writes a Gematria TFRecord dataset containing basic blocks with instruction
annotations as derived from the samples.
"""

from collections.abc import Sequence

from absl import app
from absl import flags
from absl import logging
from gematria.datasets.python import annotating_importer
from gematria.llvm.python import canonicalizer
from gematria.llvm.python import llvm_architecture_support
from gematria.proto import basic_block_pb2
from pybind11_abseil import status
import tensorflow as tf


# Command-line flags. The input and output paths are required; the source
# name and the LLVM triple have defaults.
_INPUT_ELF_FILE = flags.DEFINE_string(
    'gematria_input_elf',
    None,
    'The name of the ELF file from which basic blocks are to be imported.',
    required=True,
)
_INPUT_PERF_FILE = flags.DEFINE_string(
    'gematria_input_perf_data',
    None,
    'The name of the `perf.data`-like file from which samples are to be'
    ' imported.',
    required=True,
)
_OUTPUT_TFRECORD_FILE = flags.DEFINE_string(
    'gematria_output_tfrecord',
    None,
    'The name of the TFRecord file to write the data to.',
    required=True,
)
_SOURCE_NAME = flags.DEFINE_string(
    'gematria_throughput_source_name',
    'perf_lbr_data',
    'The name of the throughput source used for the throughput data.',
    required=False,
)
# NOTE(review): the triple is only used to create the LLVM architecture
# support object in main(); the canonicalizer created there is always the
# x86-64 one.
_LLVM_TRIPLE = flags.DEFINE_string(
    'gematria_llvm_triple',
    'x86_64',
    'The LLVM triple used for disassembling the instructions in the data set.',
)


def main(argv: Sequence[str]) -> None:
  """Imports annotated basic blocks and writes them to a TFRecord file.

  Creates an `AnnotatingImporter` for the configured LLVM triple, extracts the
  annotated basic block protos from the input ELF object and `perf.data`-like
  file, and writes each proto in serialized form to the output TFRecord file.

  Args:
    argv: The positional command-line arguments; nothing beyond the first
      element is accepted.

  Raises:
    app.UsageError: When more than one positional argument is given.
    SystemExit: With exit code 1 when the LLVM triple is not known or
      supported.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  try:
    llvm = llvm_architecture_support.LlvmArchitectureSupport.from_triple(
        _LLVM_TRIPLE.value
    )
  except status.StatusNotOk:
    logging.exception(
        'LLVM triple "%s" is not known or supported.', _LLVM_TRIPLE.value
    )
    # Raise SystemExit directly rather than calling the `exit` builtin: `exit`
    # is injected by the `site` module for interactive use and is not
    # guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(1)

  # TODO(ondrasej): Update this so that the canonicalizer is created using the
  # LLVM triple. As of 2024-08, this is OK, because we support only x86-64
  # anyway.
  canonicalizer_obj = canonicalizer.Canonicalizer.x86_64(llvm)
  importer = annotating_importer.AnnotatingImporter(canonicalizer_obj)

  protos = importer.get_annotated_basic_block_protos(
      _INPUT_ELF_FILE.value,
      _INPUT_PERF_FILE.value,
      _SOURCE_NAME.value,
  )

  with tf.io.TFRecordWriter(_OUTPUT_TFRECORD_FILE.value) as writer:
    for proto in protos:
      writer.write(proto.SerializeToString())


# Run the importer when this file is executed as a script.
if __name__ == '__main__':
  app.run(main)
Loading
Loading