From 772a75b6c11dedbbe663f9acde4d4b9754a6b3d7 Mon Sep 17 00:00:00 2001
From: Foster Brereton <fbrereto@adobe.com>
Date: Mon, 17 Jun 2024 08:57:46 -0700
Subject: [PATCH] Improved typedef handling (#85)

* Improvement to handle anonymous typedefs

* adding fix for issue 84

* adding test case for anonymous struct conflict

* adding a special case for `objc_object`
---
 include/orc/hash.hpp                          |  2 +-
 include/orc/orc.hpp                           |  2 +
 src/dwarf.cpp                                 | 98 +++++++++++++++----
 src/main.cpp                                  | 24 +++--
 src/orc.cpp                                   | 10 ++
 test/battery/typedef_anonymous_struct/a.cpp   |  4 +
 test/battery/typedef_anonymous_struct/b.cpp   |  4 +
 .../typedef_anonymous_struct/odrv_test.toml   |  9 ++
 test/src/main.cpp                             | 14 ++-
 9 files changed, 132 insertions(+), 35 deletions(-)
 create mode 100644 test/battery/typedef_anonymous_struct/a.cpp
 create mode 100644 test/battery/typedef_anonymous_struct/b.cpp
 create mode 100644 test/battery/typedef_anonymous_struct/odrv_test.toml
diff --git a/include/orc/hash.hpp b/include/orc/hash.hpp
index f018b17..caf6ae7 100644
--- a/include/orc/hash.hpp
+++ b/include/orc/hash.hpp
@@ -37,7 +37,7 @@ inline std::size_t hash_combine(std::size_t seed, const T& x) {
     // This is the terminating hash_combine call when there's only one item left to hash into the
     // seed. It also serves as the combiner the other routine variant uses to generate its new
     // seed.
-    return (seed ^ x) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    return seed ^ (x + 0x9e3779b9 + (seed << 6) + (seed >> 2));
 }
 
 template <class T, class... Args>
diff --git a/include/orc/orc.hpp b/include/orc/orc.hpp
index 1f9729d..65fcb85 100644
--- a/include/orc/orc.hpp
+++ b/include/orc/orc.hpp
@@ -63,6 +63,8 @@ void register_dies(dies die_vector);
 
 std::string to_json(const std::vector<odrv_report>&);
 
+std::string version_json();
+
 } // namespace orc
 
 void orc_reset();
diff --git a/src/dwarf.cpp b/src/dwarf.cpp
index e6b56d0..4273d82 100644
--- a/src/dwarf.cpp
+++ b/src/dwarf.cpp
@@ -341,16 +341,15 @@ void line_header::read(freader& s, bool needs_byteswap) {
 }
 
 /**************************************************************************************************/
-
-std::size_t fatal_attribute_hash(const attribute_sequence& attributes) {
-#if ORC_FEATURE(PROFILE_DIE_DETAILS)
-    ZoneScoped;
-#endif // ORC_FEATURE(PROFILE_DIE_DETAILS)
-
-    // We only hash the attributes that could contribute to an ODRV. We also sort that set of
-    // attributes by name to make sure the hashing is consistent regardless of attribute order.
-    constexpr const std::size_t max_names_k{32};
-    std::array<dw::at, max_names_k> names{dw::at::none};
+// It is fixed to keep allocations from happening.
+constexpr std::size_t max_names_k{32};
+using fixed_attribute_array = std::array<dw::at, max_names_k>;
+
+// for an incoming set of arbitrary attributes, return the subset of those that are fatal.
+// We also sort the set of attributes by name to make sure subsequent traversal of this set
+// is consistent.
+fixed_attribute_array fatal_attributes_within(const attribute_sequence& attributes) {
+    fixed_attribute_array names{dw::at::none};
     std::size_t count{0};
 
     for (const auto& attr : attributes) {
@@ -362,17 +361,32 @@ std::size_t fatal_attribute_hash(const attribute_sequence& attributes) {
     }
     std::sort(&names[0], &names[count]);
 
+    return names;
+}
+
+/**************************************************************************************************/
+
+std::size_t fatal_attribute_hash(const attribute_sequence& attributes) {
+#if ORC_FEATURE(PROFILE_DIE_DETAILS)
+    ZoneScoped;
+#endif // ORC_FEATURE(PROFILE_DIE_DETAILS)
+
+    // We only hash the attributes that could contribute to an ODRV. That's what makes them "fatal"
+
     std::size_t h{0};
 
-    for (std::size_t i{0}; i < count; ++i) {
+    for (auto name : fatal_attributes_within(attributes)) {
+        // As soon as we hit our first no-name-attribute, we're done.
+        if (name == dw::at::none) break;
+
         // If this assert fires, it means an attribute's value was passed over during evaluation,
         // but it was necessary for ODRV evaluation after all. The fix is to improve the attribute
         // form evaluation engine such that this attribute's value is no longer passed over.
-        const auto name = names[i];
         const auto& attribute = attributes.get(name);
         assert(!attributes.has(name, attribute_value::type::passover));
-        const auto attribute_hash = attribute._value.hash();
-        h = orc::hash_combine(h, attribute_hash);
+
+        const auto hash = attribute._value.hash();
+        h = orc::hash_combine(h, hash);
     }
 
     return h;
@@ -474,6 +488,7 @@ struct dwarf::implementation {
     std::vector<pool_string> _decl_files;
     std::unordered_map<std::size_t, pool_string> _type_cache;
     std::unordered_map<std::size_t, pool_string> _debug_str_cache;
+    pool_string _last_typedef_name; // for unnamed structs - see https://github.com/adobe/orc/issues/84
     cu_header _cu_header;
     std::size_t _cu_header_offset{0}; // offset of the compilation unit header. Relative to __debug_info.
     std::size_t _cu_die_offset{0}; // offset of the `compile_unit` die. Relative to start of `debug_info`
@@ -1380,6 +1395,18 @@ pool_string dwarf::implementation::resolve_type(attribute type) {
         result = empool("const " + recurse(attributes).allocate_string());
     } else if (die._tag == dw::tag::pointer_type) {
         result = empool(recurse(attributes).allocate_string() + "*");
+    } else if (die._tag == dw::tag::typedef_) {
+        if (const auto maybe_type = recurse(attributes)) {
+            result = maybe_type;
+        } else if (attributes.has_string(dw::at::name)) {
+            result = attributes.string(dw::at::name);
+        } else {
+            // `result` will be empty if we hit this in release
+            // builds. It's bad, but not UB, so the program can
+            // continue from here (though the results will be
+            // untrustworthy).
+            assert(!"Got a typedef with no name?");
+        }
     } else if (attributes.has_string(dw::at::type)) {
         result = type.string();
     } else if (attributes.has_reference(dw::at::type)) {
@@ -1416,6 +1443,7 @@ die_pair dwarf::implementation::abbreviation_to_die(std::size_t die_address, pro
     die._tag = a._tag;
     die._has_children = a._has_children;
 
+    // Can we get rid of this memory allocation? This happens a lot...
     attributes.reserve(a._attributes.size());
 
     std::transform(a._attributes.begin(), a._attributes.end(), std::back_inserter(attributes),
@@ -1425,8 +1453,9 @@ die_pair dwarf::implementation::abbreviation_to_die(std::size_t die_address, pro
                    });
 
     if (mode == process_mode::complete) {
+        // These statements must be kept in sync with the ones dealing with issue 84 below.
+        // See https://github.com/adobe/orc/issues/84
         path_identifier_set(die_identifier(die, attributes));
-
         die._path = empool(std::string_view(qualified_symbol_name(die, attributes)));
     }
 
@@ -1507,6 +1536,16 @@ bool dwarf::implementation::skip_die(die& d, const attribute_sequence& attribute
         return true;
     }
 
+    // There are some symbols that are owned by the compiler and/or OS vendor (read: Apple)
+    // that have been observed to conflict. We skip them for our purposes, as we have no
+    // control over them.
+    if (d._path.view().find("::[u]::objc_object") == 0) {
+#if ORC_FEATURE(PROFILE_DIE_DETAILS)
+        ZoneTextL("skipping: vendor- or compiler-defined symbol");
+#endif // ORC_FEATURE(PROFILE_DIE_DETAILS)
+        return true;
+    }
+
     // lambdas are ephemeral and can't cause (hopefully) an ODRV
     if (d._path.view().find("lambda") != std::string::npos) {
 #if ORC_FEATURE(PROFILE_DIE_DETAILS)
@@ -1652,12 +1691,37 @@ void dwarf::implementation::process_all_dies() {
                 }
             }
 
+            post_process_die_attributes(attributes);
+
+            // See https://github.com/adobe/orc/issues/84
+            // This code accounts for unnamed structs that are part of
+            // a typedef expression. In such case the typedef actually adds a name
+            // for the purpose of linking, but the structure's
+            // die does not contain the name. Furthermore, the typedef die has
+            // been found to precede the die for the structure. So we grab it if
+            // we find a typedef, and use it if an ensuing structure_type has no name.
+            if (die._tag == dw::tag::typedef_ && attributes.has(dw::at::name)) {
+                _last_typedef_name = attributes.get(dw::at::name).string();
+            } else if (die._tag == dw::tag::structure_type &&
+                       !attributes.has(dw::at::name) &&
+                       _last_typedef_name) {
+                attribute name;
+                name._name = dw::at::name;
+                name._form = dw::form::strp;
+                name._value.string(_last_typedef_name);
+                attributes.push_back(name);
+
+                // These two statements must be in sync with the ones in `abbreviation_to_die`
+                path_identifier_set(die_identifier(die, attributes));
+                die._path = empool(std::string_view(qualified_symbol_name(die, attributes)));
+
+                _last_typedef_name = pool_string(); // reset it to avoid misuse
+            }
+
             if (die._has_children) {
                 path_identifier_push();
             }
 
-            post_process_die_attributes(attributes);
-
             die._skippable = skip_die(die, attributes);
             die._ofd_index = _ofd_index;
             die._hash = die_hash(die, attributes);
diff --git a/src/main.cpp b/src/main.cpp
index 528d596..60031e1 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -451,13 +451,19 @@ auto epilogue(bool exception) {
     const auto& g = globals::instance();
 
     if (g._object_file_count == 0) {
-        cout_safe([&](auto& s) {
-            const auto local_build = ORC_VERSION_STR() == std::string("local");
-            const std::string tag_url = local_build ? "" : std::string(" (https://github.com/adobe/orc/releases/tag/") + ORC_VERSION_STR() + ")";
-            s << "ORC (https://github.com/adobe/orc)\n";
-            s << "    version: " << ORC_VERSION_STR() << tag_url << '\n';
-            s << "    sha: " << ORC_SHA_STR() << '\n';
-        });
+        if (settings::instance()._output_file_mode == settings::output_file_mode::json) {
+            cout_safe([](auto& s){
+                s << orc::version_json() << '\n';
+            });
+        } else {
+            cout_safe([&](auto& s) {
+                const auto local_build = ORC_VERSION_STR() == std::string("local");
+                const std::string tag_url = local_build ? "" : std::string(" (https://github.com/adobe/orc/releases/tag/") + ORC_VERSION_STR() + ")";
+                s << "ORC (https://github.com/adobe/orc)\n";
+                s << "    version: " << ORC_VERSION_STR() << tag_url << '\n';
+                s << "    sha: " << ORC_SHA_STR() << '\n';
+            });
+        }
     } else if (log_level_at_least(settings::log_level::warning)) {
         // Make sure these values are in sync with the `synopsis` json blob in `to_json`.
         cout_safe([&](auto& s) {
@@ -510,9 +516,9 @@ void maybe_forward_to_linker(int argc, char** argv, const cmdline_results& cmdli
     } else if (cmdline._libtool_mode) {
         executable_path /= "libtool";
     } else {
-        if (log_level_at_least(settings::log_level::warning)) {
+        if (log_level_at_least(settings::log_level::verbose)) {
             cout_safe([&](auto& s) {
-                s << "warning: libtool/ld mode could not be derived; forwarding to linker disabled\n";
+                s << "verbose: libtool/ld mode could not be derived; forwarding to linker disabled\n";
             });
         }
 
diff --git a/src/orc.cpp b/src/orc.cpp
index cbf4a9a..9ccf765 100644
--- a/src/orc.cpp
+++ b/src/orc.cpp
@@ -46,6 +46,7 @@
 #include "orc/str.hpp"
 #include "orc/string_pool.hpp"
 #include "orc/tracy.hpp"
+#include "orc/version.hpp"
 
 /**************************************************************************************************/
 
@@ -702,6 +703,15 @@ std::string to_json(const std::vector<odrv_report>& reports) {
     return result.dump(spaces_k);
 }
 
+std::string version_json() {
+    nlohmann::json result;
+
+    result["version"] = ORC_VERSION_STR();
+    result["sha256"] = ORC_SHA_STR();
+
+    return result.dump();
+}
+
 /**************************************************************************************************/
 
 } // namespace orc
diff --git a/test/battery/typedef_anonymous_struct/a.cpp b/test/battery/typedef_anonymous_struct/a.cpp
new file mode 100644
index 0000000..def3ad5
--- /dev/null
+++ b/test/battery/typedef_anonymous_struct/a.cpp
@@ -0,0 +1,4 @@
+typedef struct { double _d; } S;
+
+// Required so the compiler generates a symbol.
+int getd(S s) { return s._d; }
diff --git a/test/battery/typedef_anonymous_struct/b.cpp b/test/battery/typedef_anonymous_struct/b.cpp
new file mode 100644
index 0000000..2296da3
--- /dev/null
+++ b/test/battery/typedef_anonymous_struct/b.cpp
@@ -0,0 +1,4 @@
+typedef struct {int _i;} S;
+
+// Required so the compiler generates a symbol.
+int geti(S s) { return s._i; }
diff --git a/test/battery/typedef_anonymous_struct/odrv_test.toml b/test/battery/typedef_anonymous_struct/odrv_test.toml
new file mode 100644
index 0000000..f7036de
--- /dev/null
+++ b/test/battery/typedef_anonymous_struct/odrv_test.toml
@@ -0,0 +1,9 @@
+[[source]]
+    path = "a.cpp"
+
+[[source]]
+    path = "b.cpp"
+
+[[odrv]]
+    category = "structure:byte_size"
+    symbol = "S"
diff --git a/test/src/main.cpp b/test/src/main.cpp
index e095310..06a3b29 100644
--- a/test/src/main.cpp
+++ b/test/src/main.cpp
@@ -504,19 +504,17 @@ void run_battery_test(const std::filesystem::path& home) {
 
 /**************************************************************************************************/
 
-void traverse_directory_tree(std::filesystem::path& directory) {
+void traverse_directory_tree(const std::filesystem::path& directory) {
     assert(is_directory(directory));
 
+    if (exists(directory / tomlname_k)) {
+        run_battery_test(directory);
+    }
+
     for (const auto& entry : std::filesystem::directory_iterator(directory)) {
         try {
             if (is_directory(entry)) {
-                std::filesystem::path path = entry.path();
-
-                if (exists(path / tomlname_k)) {
-                    run_battery_test(path);
-                }
-
-                traverse_directory_tree(path);
+                traverse_directory_tree(entry.path());
             }
         } catch (...) {
             console_error() << "\nIn battery " << entry.path() << ":";