From 7879fad1579fe891de0ea29ea9808241c5cc96bd Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 22 Oct 2024 09:18:58 -0400 Subject: [PATCH] [libc++] Rewrite the transitive header checking machinery (#110554) Since we don't generate a full dependency graph of headers, we can greatly simplify the script that parses the result of --trace-includes. At the same time, we also unify the mechanism for detecting whether a header is a public/C compat/internal/etc header with the existing mechanism in header_information.py. As a drive-by this fixes the headers_in_modulemap.sh.py test which had been disabled by mistake because it used its own way of determining the list of libc++ headers. By consistently using header_information.py to get that information, problems like this shouldn't happen anymore. This should also unblock #110303, which was blocked because of a brittle implementation of the transitive includes check which broke when the repository was cloned at a path like /path/__something/more. --- libcxx/test/libcxx/header_inclusions.gen.py | 2 +- libcxx/test/libcxx/headers_in_modulemap.sh.py | 22 +- libcxx/test/libcxx/transitive_includes.gen.py | 12 +- .../test/libcxx/transitive_includes/to_csv.py | 120 ++++++++ .../test/libcxx/transitive_includes_to_csv.py | 147 ---------- libcxx/utils/generate_iwyu_mapping.py | 2 +- libcxx/utils/generate_libcxx_cppm_in.py | 14 +- libcxx/utils/libcxx/header_information.py | 276 ++++++++++-------- 8 files changed, 297 insertions(+), 298 deletions(-) create mode 100755 libcxx/test/libcxx/transitive_includes/to_csv.py delete mode 100755 libcxx/test/libcxx/transitive_includes_to_csv.py diff --git a/libcxx/test/libcxx/header_inclusions.gen.py b/libcxx/test/libcxx/header_inclusions.gen.py index 2ecc47cbb1891a7..e5def1ad4cb70d9 100644 --- a/libcxx/test/libcxx/header_inclusions.gen.py +++ b/libcxx/test/libcxx/header_inclusions.gen.py @@ -16,7 +16,7 @@ from libcxx.header_information import lit_header_restrictions, public_headers, mandatory_inclusions for header in public_headers: - header_guard = lambda h: f"_LIBCPP_{h.upper().replace('.', '_').replace('/', '_')}" + header_guard = lambda h: f"_LIBCPP_{str(h).upper().replace('.', '_').replace('/', '_')}" # has no header guards if header == 'cassert': diff --git a/libcxx/test/libcxx/headers_in_modulemap.sh.py b/libcxx/test/libcxx/headers_in_modulemap.sh.py index 237b006115ccdf3..51b14377ba89bb4 100644 --- a/libcxx/test/libcxx/headers_in_modulemap.sh.py +++ b/libcxx/test/libcxx/headers_in_modulemap.sh.py @@ -1,25 +1,15 @@ -# RUN: %{python} %s %{libcxx-dir}/utils %{include-dir} +# RUN: %{python} %s %{libcxx-dir}/utils import sys - sys.path.append(sys.argv[1]) +from libcxx.header_information import all_headers, libcxx_include -import pathlib -import sys -from libcxx.header_information import is_modulemap_header, is_header - -headers = list(pathlib.Path(sys.argv[2]).rglob("*")) -modulemap = open(f"{sys.argv[2]}/module.modulemap").read() +with open(libcxx_include / "module.modulemap") as f: + modulemap = f.read() isHeaderMissing = False - -for header in headers: - if not is_header(header): - continue - - header = header.relative_to(pathlib.Path(sys.argv[2])).as_posix() - - if not is_modulemap_header(header): +for header in all_headers: + if not header.is_in_modulemap(): continue if not str(header) in modulemap: diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py index 22075364bf1b799..2693617bcb0e5b4 100644 --- a/libcxx/test/libcxx/transitive_includes.gen.py +++ b/libcxx/test/libcxx/transitive_includes.gen.py @@ -42,10 +42,10 @@ all_traces = [] for header in sorted(public_headers): - if header.endswith(".h"): # Skip C compatibility or detail headers + if header.is_C_compatibility() or header.is_internal(): continue - normalized_header = re.sub("/", "_", header) + normalized_header = re.sub("/", "_", str(header)) print( f"""\ // RUN: echo "#include <{header}>" | %{{cxx}} -xc++ - %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.{normalized_header}.txt @@ -55,17 +55,17 @@ print( f"""\ -// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv +// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes/to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv """ ) else: for header in public_headers: - if header.endswith(".h"): # Skip C compatibility or detail headers + if header.is_C_compatibility() or header.is_internal(): continue # Escape slashes for the awk command below - escaped_header = header.replace("/", "\\/") + escaped_header = str(header).replace("/", "\\/") print( f"""\ @@ -92,7 +92,7 @@ // RUN: mkdir %t // RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt -// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv +// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes/to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv // RUN: cat %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv // RUN: diff -w %t/expected_transitive_includes.csv %t/actual_transitive_includes.csv #include <{header}> diff --git a/libcxx/test/libcxx/transitive_includes/to_csv.py b/libcxx/test/libcxx/transitive_includes/to_csv.py new file mode 100755 index 000000000000000..69d94deedf6f507 --- /dev/null +++ b/libcxx/test/libcxx/transitive_includes/to_csv.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from typing import List, Tuple, Optional +import argparse +import io +import itertools +import os +import pathlib +import re +import sys + +libcxx_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +sys.path.append(os.path.join(libcxx_root, "utils")) +from libcxx.header_information import Header + +def parse_line(line: str) -> Tuple[int, str]: + """ + Parse a single line of --trace-includes output. + + Returns the inclusion level and the raw file name being included. + """ + match = re.match(r"(\.+) (.+)", line) + if not match: + raise ArgumentError(f"Line {line} contains invalid data.") + + # The number of periods in front of the header name is the nesting level of + # that header. + return (len(match.group(1)), match.group(2)) + +def make_cxx_v1_relative(header: str) -> Optional[str]: + """ + Returns the path of the header as relative to /c++/v1, or None if the path + doesn't contain c++/v1. + + We use that heuristic to figure out which headers are libc++ headers. + """ + # On Windows, the path separators can either be forward slash or backslash. + # If it is a backslash, Clang prints it escaped as two consecutive + # backslashes, and they need to be escaped in the RE. (Use a raw string for + # the pattern to avoid needing another level of escaping on the Python string + # literal level.) + pathsep = r"(?:/|\\\\)" + CXX_V1_REGEX = r"^.*c\+\+" + pathsep + r"v[0-9]+" + pathsep + r"(.+)$" + match = re.match(CXX_V1_REGEX, header) + if not match: + return None + else: + return match.group(1) + +def parse_file(file: io.TextIOBase) -> List[Tuple[Header, Header]]: + """ + Parse a file containing --trace-includes output to generate a list of the + transitive includes contained in it. + """ + result = [] + includer = None + for line in file.readlines(): + (level, header) = parse_line(line) + relative = make_cxx_v1_relative(header) + + # Not a libc++ header + if relative is None: + continue + + # If we're at the first level, remember this header as being the one who includes other headers. + # There's usually exactly one, except if the compiler is passed a file with `-include`. + if level == 1: + includer = Header(relative) + continue + + # Otherwise, take note that this header is being included by the top-level includer. + else: + assert includer is not None + result.append((includer, Header(relative))) + return result + +def print_csv(includes: List[Tuple[Header, Header]]) -> None: + """ + Print the transitive includes as space-delimited CSV. + + This function only prints public libc++ headers that are not C compatibility headers. + """ + # Sort and group by includer + by_includer = lambda t: t[0] + includes = itertools.groupby(sorted(includes, key=by_includer), key=by_includer) + + for (includer, includees) in includes: + includees = map(lambda t: t[1], includees) + for h in sorted(set(includees)): + if h.is_public() and not h.is_C_compatibility(): + print(f"{includer} {h}") + +def main(argv): + parser = argparse.ArgumentParser( + description=""" + Given a list of headers produced by --trace-includes, produce a list of libc++ headers in that output. + + Note that -fshow-skipped-includes must also be passed to the compiler in order to get sufficient + information for this script to run. + + The output of this script is provided in space-delimited CSV format where each line contains: + +
+ """) + parser.add_argument("inputs", type=argparse.FileType("r"), nargs='+', default=None, + help="One or more files containing the result of --trace-includes") + args = parser.parse_args(argv) + + includes = [line for file in args.inputs for line in parse_file(file)] + print_csv(includes) + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/libcxx/test/libcxx/transitive_includes_to_csv.py b/libcxx/test/libcxx/transitive_includes_to_csv.py deleted file mode 100755 index 73571fb404ae7be..000000000000000 --- a/libcxx/test/libcxx/transitive_includes_to_csv.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python -# ===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ===----------------------------------------------------------------------===## - -from dataclasses import dataclass -from typing import List # Needed for python 3.8 compatibility. -import argparse -import pathlib -import re -import sys - - -@dataclass -class header: - name: str = None - level: int = -1 - - -def parse_line(line: str) -> header: - """ - Parse an output line from --trace-includes into a `header`. - """ - match = re.match(r"(\.+) (.+)", line) - if not match: - sys.exit(f"Line {line} contains invalid data.") - - # The number of periods in front of the header name is the nesting level of - # that header. - return header(match.group(2), len(match.group(1))) - - -# On Windows, the path separators can either be forward slash or backslash. -# If it is a backslash, Clang prints it escaped as two consecutive -# backslashes, and they need to be escaped in the RE. (Use a raw string for -# the pattern to avoid needing another level of escaping on the Python string -# literal level.) -LIBCXX_HEADER_REGEX = r".*c\+\+(?:/|\\\\)v[0-9]+(?:/|\\\\)(.+)" - -def is_libcxx_header(header: str) -> bool: - """ - Returns whether a header is a libc++ header, excluding the C-compatibility headers. - """ - # Only keep files in the c++/vN directory. - match = re.match(LIBCXX_HEADER_REGEX, header) - if not match: - return False - - # Skip C compatibility headers (in particular, make sure not to skip libc++ detail headers). - relative = match.group(1) - if relative.endswith(".h") and not ( - relative.startswith("__") or re.search(r"(/|\\\\)__", relative) - ): - return False - - return True - - -def parse_file(file: pathlib.Path) -> List[str]: - """ - Parse a file containing --trace-includes output to generate a list of the top-level C++ includes - contained in it. - - This effectively generates the list of public libc++ headers of the header whose --trace-includes it is. - In order to get the expected result of --trace-includes, the -fshow-skipped-includes flag also needs to - be passed. - """ - result = list() - with file.open(encoding="utf-8") as f: - for line in f.readlines(): - header = parse_line(line) - - # Skip non-libc++ headers - if not is_libcxx_header(header.name): - continue - - # Include top-level headers in the output. There's usually exactly one, - # except if the compiler is passed a file with `-include`. Top-level - # headers are transparent, in the sense that we want to go look at - # transitive includes underneath. - if header.level == 1: - level = 999 - result.append(header) - continue - - # Detail headers are transparent too: we attribute all includes of public libc++ - # headers under a detail header to the last public libc++ header that included it. - if header.name.startswith("__") or re.search(r"(/|\\\\)__", header.name): - level = 999 - continue - - # Add the non-detail libc++ header to the list. - level = header.level - result.append(header) - return result - - -def create_include_graph(trace_includes: List[pathlib.Path]) -> List[str]: - result = list() - for file in trace_includes: - headers = parse_file(file) - - # Get actual filenames relative to libc++'s installation directory instead of full paths - relative = lambda h: re.match(LIBCXX_HEADER_REGEX, h).group(1) - - top_level = relative( - next(h.name for h in headers if h.level == 1) - ) # There should be only one top-level header - includes = [relative(h.name) for h in headers if h.level != 1] - - # Remove duplicates in all includes. - includes = list(set(includes)) - - if len(includes) != 0: - result.append([top_level] + includes) - return result - - -def print_csv(graph: List[str]) -> None: - for includes in graph: - header = includes[0] - for include in sorted(includes[1:]): - if header == include: - sys.exit(f"Cycle detected: header {header} includes itself.") - print(f"{header} {include}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="""Produce a dependency graph of libc++ headers, in CSV format. -This script is normally executed by libcxx/test/libcxx/transitive_includes.gen.py""", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "inputs", - default=None, - metavar="FILE", - nargs='+', - help="One or more files containing the result of --trace-includes on the headers one wishes to graph.", - ) - options = parser.parse_args() - - print_csv(create_include_graph(map(pathlib.Path, options.inputs))) diff --git a/libcxx/utils/generate_iwyu_mapping.py b/libcxx/utils/generate_iwyu_mapping.py index 599201808bb79bc..7976f4afc24fa83 100644 --- a/libcxx/utils/generate_iwyu_mapping.py +++ b/libcxx/utils/generate_iwyu_mapping.py @@ -71,7 +71,7 @@ def main(argv: typing.List[str]): mappings = [] # Pairs of (header, public_header) for header in libcxx.header_information.all_headers: - public_headers = IWYU_mapping(header) + public_headers = IWYU_mapping(str(header)) if public_headers is not None: mappings.extend((header, public) for public in public_headers) diff --git a/libcxx/utils/generate_libcxx_cppm_in.py b/libcxx/utils/generate_libcxx_cppm_in.py index e98ac1b24331448..39076a61b55b845 100644 --- a/libcxx/utils/generate_libcxx_cppm_in.py +++ b/libcxx/utils/generate_libcxx_cppm_in.py @@ -9,19 +9,11 @@ import os.path import sys -from libcxx.header_information import module_c_headers -from libcxx.header_information import module_headers -from libcxx.header_information import header_restrictions -from libcxx.header_information import headers_not_available +from libcxx.header_information import module_c_headers, module_headers, header_restrictions, headers_not_available, libcxx_root def write_file(module): - libcxx_module_directory = os.path.join( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "modules" - ) - with open( - os.path.join(libcxx_module_directory, f"{module}.cppm.in"), "w" - ) as module_cpp_in: + with open(libcxx_root / "modules" / f"{module}.cppm.in", "w") as module_cpp_in: module_cpp_in.write( """\ // -*- C++ -*- @@ -45,7 +37,7 @@ def write_file(module): // and the headers of Table 25: C++ headers for C library facilitiesā€ƒ[tab:headers.cpp.c] """ ) - for header in module_headers if module == "std" else module_c_headers: + for header in sorted(module_headers if module == "std" else module_c_headers): if header in header_restrictions: module_cpp_in.write( f"""\ diff --git a/libcxx/utils/libcxx/header_information.py b/libcxx/utils/libcxx/header_information.py index 2ed52e8c1dbf225..c5849e80f085b62 100644 --- a/libcxx/utils/libcxx/header_information.py +++ b/libcxx/utils/libcxx/header_information.py @@ -6,7 +6,166 @@ # # ===----------------------------------------------------------------------===## -import os, pathlib +import os, pathlib, functools + +libcxx_root = pathlib.Path(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +libcxx_include = libcxx_root / "include" +assert libcxx_root.exists() + +def _is_header_file(file): + """Returns whether the given file is a header file, i.e. not a directory or the modulemap file.""" + return not file.is_dir() and not file.name in [ + "module.modulemap", + "CMakeLists.txt", + "libcxx.imp", + "__config_site.in", + ] + +@functools.total_ordering +class Header: + _name: str + """Relative path from the root of libcxx/include""" + + def __init__(self, name: str): + """Create a Header. + + name: The path of the header relative to libc++'s include directory. + For example '__algorithm/find.h' or 'coroutine'. + """ + self._name = name + + def is_public(self) -> bool: + """Returns whether the header is a public libc++ API header.""" + return "__" not in self._name and not self._name.startswith("ext/") + + def is_internal(self) -> bool: + """Returns whether the header is an internal implementation detail of the library.""" + return not self.is_public() + + def is_C_compatibility(self) -> bool: + """ + Returns whether the header is a C compatibility header (headers ending in .h like stdlib.h). + + Note that headers like are not considered C compatibility headers. + """ + return self.is_public() and self._name.endswith(".h") + + def is_cstd(self) -> bool: + """Returns whether the header is a C 'std' header, like , , etc.""" + return self._name in [ + "cassert", + "ccomplex", + "cctype", + "cerrno", + "cfenv", + "cfloat", + "cinttypes", + "ciso646", + "climits", + "clocale", + "cmath", + "csetjmp", + "csignal", + "cstdarg", + "cstdbool", + "cstddef", + "cstdint", + "cstdio", + "cstdlib", + "cstring", + "ctgmath", + "ctime", + "cuchar", + "cwchar", + "cwctype", + ] + + def is_experimental(self) -> bool: + """Returns whether the header is a public experimental header.""" + return self.is_public() and self._name.startswith("experimental/") + + def has_cxx20_module(self) -> bool: + """ + Returns whether the header is in the std and std.compat C++20 modules. + + These headers are all C++23-and-later headers, excluding C compatibility headers and + experimental headers. + """ + # These headers have been removed in C++20 so are never part of a module. + removed_in_20 = ["ccomplex", "ciso646", "cstdbool", "ctgmath"] + return self.is_public() and not self.is_experimental() and not self.is_C_compatibility() and not self._name in removed_in_20 + + def is_in_modulemap(self) -> bool: + """Returns whether a header should be listed in the modulemap.""" + # TODO: Should `__config_site` be in the modulemap? + if self._name == "__config_site": + return False + + if self._name == "__assertion_handler": + return False + + # exclude libc++abi files + if self._name in ["cxxabi.h", "__cxxabi_config.h"]: + return False + + # exclude headers in __support/ - these aren't supposed to work everywhere, + # so they shouldn't be included in general + if self._name.startswith("__support/"): + return False + + # exclude ext/ headers - these are non-standard extensions and are barely + # maintained. People should migrate away from these and we don't need to + # burden ourself with maintaining them in any way. + if self._name.startswith("ext/"): + return False + return True + + def __str__(self) -> str: + return self._name + + def __repr__(self) -> str: + return repr(self._name) + + def __eq__(self, other) -> bool: + if isinstance(other, str): + return self._name == other + return self._name == other._name + + def __lt__(self, other) -> bool: + if isinstance(other, str): + return self._name < other + return self._name < other._name + + def __hash__(self) -> int: + return hash(self._name) + + +# Commonly-used sets of headers +all_headers = [Header(p.relative_to(libcxx_include).as_posix()) for p in libcxx_include.rglob("[_a-z]*") if _is_header_file(p)] +all_headers += [Header("__config_site"), Header("__assertion_handler")] # Headers generated during the build process +public_headers = [h for h in all_headers if h.is_public()] +module_headers = [h for h in all_headers if h.has_cxx20_module()] +module_c_headers = [h for h in all_headers if h.has_cxx20_module() and h.is_cstd()] + +# These headers are not yet implemented in libc++ +# +# These headers are required by the latest (draft) Standard but have not been +# implemented yet. They are used in the generated module input. The C++23 standard +# modules will fail to build if a header is added but this list is not updated. +headers_not_available = list(map(Header, [ + "debugging", + "flat_map", + "flat_set", + "generator", + "hazard_pointer", + "inplace_vector", + "linalg", + "rcu", + "spanstream", + "stacktrace", + "stdfloat", + "text_encoding", +])) header_restrictions = { # headers with #error directives @@ -112,118 +271,3 @@ "variant": ["compare"], "vector": ["compare", "initializer_list"], } - - -# These headers are not yet implemented in libc++ -# -# These headers are required by the latest (draft) Standard but have not been -# implemented yet. They are used in the generated module input. The C++23 standard -# modules will fail to build if a header is added but this list is not updated. -headers_not_available = [ - "debugging", - "flat_map", - "flat_set", - "generator", - "hazard_pointer", - "inplace_vector", - "linalg", - "rcu", - "spanstream", - "stacktrace", - "stdfloat", - "text_encoding", -] - - -def is_header(file): - """Returns whether the given file is a header (i.e. not a directory or the modulemap file).""" - return not file.is_dir() and not file.name in [ - "module.modulemap", - "CMakeLists.txt", - "libcxx.imp", - ] - - -def is_public_header(header): - return "__" not in header and not header.startswith("ext/") - - -def is_modulemap_header(header): - """Returns whether a header should be listed in the modulemap""" - # TODO: Should `__config_site` be in the modulemap? - if header == "__config_site": - return False - - if header == "__assertion_handler": - return False - - # exclude libc++abi files - if header in ["cxxabi.h", "__cxxabi_config.h"]: - return False - - # exclude headers in __support/ - these aren't supposed to work everywhere, - # so they shouldn't be included in general - if header.startswith("__support/"): - return False - - # exclude ext/ headers - these are non-standard extensions and are barely - # maintained. People should migrate away from these and we don't need to - # burden ourself with maintaining them in any way. - if header.startswith("ext/"): - return False - return True - -libcxx_root = pathlib.Path(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) -include = pathlib.Path(os.path.join(libcxx_root, "include")) -test = pathlib.Path(os.path.join(libcxx_root, "test")) -assert libcxx_root.exists() - -all_headers = sorted( - p.relative_to(include).as_posix() for p in include.rglob("[_a-z]*") if is_header(p) -) -toplevel_headers = sorted( - p.relative_to(include).as_posix() for p in include.glob("[_a-z]*") if is_header(p) -) -experimental_headers = sorted( - p.relative_to(include).as_posix() - for p in include.glob("experimental/[a-z]*") - if is_header(p) -) - -public_headers = [p for p in all_headers if is_public_header(p)] - -# The headers used in the std and std.compat modules. -# -# This is the set of all C++23-and-later headers, excluding C compatibility headers. -module_headers = [ - header - for header in toplevel_headers - if not header.endswith(".h") and is_public_header(header) - # These headers have been removed in C++20 so are never part of a module. - and not header in ["ccomplex", "ciso646", "cstdbool", "ctgmath"] -] - -# The C headers used in the std and std.compat modules. -module_c_headers = [ - "cassert", - "cctype", - "cerrno", - "cfenv", - "cfloat", - "cinttypes", - "climits", - "clocale", - "cmath", - "csetjmp", - "csignal", - "cstdarg", - "cstddef", - "cstdint", - "cstdio", - "cstdlib", - "cstring", - "ctime", - "cuchar", - "cwchar", - "cwctype", -]