handler: report errors on skipped merged cloud-config.txt parts

Log errors instead of warnings on skipped/invalid cloud-config parts. Also add error comments to the merged /var/lib/cloud/instance/cloud-config.txt. These error breadcrumb comments also allow `cloud-init schema` to better report failures in loading user-data parts when either MIME messages or jinja templates contain invalid YAML or otherwise unparseable content. LP: #1999952
canonical · Jun 20, 2023 · e6c069d · e6c069d
1 parent 4538a2e
commit e6c069d
Show file tree

Hide file tree

Showing 5 changed files with 233 additions and 107 deletions.
diff --git a/cloudinit/config/schema.py b/cloudinit/config/schema.py
@@ -19,12 +19,7 @@
 from cloudinit import importer, safeyaml
 from cloudinit.cmd.devel import read_cfg_paths
 from cloudinit.handlers import INCLUSION_TYPES_MAP, type_from_starts_with
-from cloudinit.util import (
-    decode_binary,
-    error,
-    get_modules_from_dir,
-    load_file,
-)
+from cloudinit.util import error, get_modules_from_dir, load_file
 
 try:
     from jsonschema import ValidationError as _ValidationError
@@ -110,6 +105,11 @@ def format(self) -> str:
 SchemaProblems = List[SchemaProblem]
 
 
+class UserDataTypeAndDecodedContent(NamedTuple):
+    userdata_type: str
+    content: str
+
+
 def _format_schema_problems(
     schema_problems: SchemaProblems,
     *,
@@ -517,7 +517,7 @@ class _Annotator:
     def __init__(
         self,
         cloudconfig: dict,
-        original_content: bytes,
+        original_content: str,
         schemamarks: dict,
     ):
         self._cloudconfig = cloudconfig
@@ -608,10 +608,10 @@ def annotate(
         self,
         schema_errors: SchemaProblems,
         schema_deprecations: SchemaProblems,
-    ) -> Union[str, bytes]:
+    ) -> str:
         if not schema_errors and not schema_deprecations:
             return self._original_content
-        lines = self._original_content.decode().split("\n")
+        lines = self._original_content.split("\n")
         if not isinstance(self._cloudconfig, dict):
             # Return a meaningful message on empty cloud-config
             return "\n".join(
@@ -632,7 +632,7 @@ def annotate(
 
 def annotated_cloudconfig_file(
     cloudconfig: dict,
-    original_content: bytes,
+    original_content: str,
     schemamarks: dict,
     *,
     schema_errors: Optional[SchemaProblems] = None,
@@ -654,56 +654,63 @@ def annotated_cloudconfig_file(
     )
 
 
-def validate_cloudconfig_file(
+def process_merged_cloud_config_part_problems(
+    content: str,
+) -> List[SchemaProblem]:
+    """Annotate and return schema validation errors in merged cloud-config.txt
+
+    When merging multiple cloud-config parts cloud-init logs an error and
+    ignores any user-data parts which are declared as #cloud-config but
+    cannot be processed. The handlers.cloud_config module also leaves comments
+    in the final merged config for every invalid part file which begin with
+    MERGED_PART_SCHEMA_ERROR_PREFIX to aid in triage.
+    """
+    from cloudinit.handlers.cloud_config import MERGED_PART_SCHEMA_ERROR_PREFIX
+
+    if MERGED_PART_SCHEMA_ERROR_PREFIX not in content:
+        return []
+    errors: List[SchemaProblem] = []
+    for line_num, line in enumerate(content.splitlines(), 1):
+        if line.startswith(MERGED_PART_SCHEMA_ERROR_PREFIX):
+            errors.append(
+                SchemaProblem(
+                    f"format-l{line_num}.c1",
+                    line.replace(
+                        MERGED_PART_SCHEMA_ERROR_PREFIX,
+                        "Ignored invalid user-data: ",
+                    ),
+                )
+            )
+    return errors
+
+
+def _get_config_type_and_rendered_userdata(
     config_path: str,
-    schema: dict,
-    annotate: bool = False,
+    content: str,
     instance_data_path: str = None,
-):
-    """Validate cloudconfig file adheres to a specific jsonschema.
+) -> UserDataTypeAndDecodedContent:
+    """
+    Return tuple of user-data-type and rendered content.
 
-    @param config_path: Path to the yaml cloud-config file to parse, or None
-        to default to system userdata from Paths object.
-    @param schema: Dict describing a valid jsonschema to validate against.
-    @param annotate: Boolean set True to print original config file with error
-        annotations on the offending lines.
-    @param instance_data_path: Path to instance_data JSON, used for text/jinja
-        rendering.
+    When encountering jinja user-data, render said content.
 
-    @raises SchemaValidationError containing any of schema_errors encountered.
-    @raises RuntimeError when config_path does not exist.
+    :return: UserDataTypeAndDecodedContent
+    :raises: SchemaValidationError when non-jinja content found but
+        header declared ## template: jinja.
     """
     from cloudinit.handlers.jinja_template import (
         JinjaLoadError,
         NotJinjaError,
         render_jinja_payload_from_file,
     )
 
-    content = load_file(config_path, decode=False)
     user_data_type = type_from_starts_with(content)
     schema_position = "format-l1.c1"
-    if not user_data_type:
-        raise SchemaValidationError(
-            [
-                SchemaProblem(
-                    schema_position,
-                    f"No valid cloud-init user-data header in {config_path}.\n"
-                    "Expected first line to be one of: "
-                    f"{', '.join(USERDATA_VALID_HEADERS)}",
-                )
-            ]
-        )
-    if user_data_type not in ("text/cloud-config", "text/jinja2"):
-        print(
-            f"User-data type '{user_data_type}' not currently evaluated"
-            " by cloud-init schema"
-        )
-        return
     if user_data_type == "text/jinja2":
         try:
             content = render_jinja_payload_from_file(
-                decode_binary(content), config_path, instance_data_path
-            ).encode()
+                content, config_path, instance_data_path
+            )
         except NotJinjaError as e:
             raise SchemaValidationError(
                 [
@@ -718,18 +725,52 @@ def validate_cloudconfig_file(
             error(str(e), sys_exit=True)
         schema_position = "format-l2.c1"
         user_data_type = type_from_starts_with(content)
-    if not user_data_type:
-        content_header = content[: decode_binary(content).find("\n")]
+    if not user_data_type:  # Neither jinja2 nor #cloud-config
+        header_line, _, _ = content.partition("\n")
         raise SchemaValidationError(
             [
                 SchemaProblem(
                     schema_position,
                     f"Unrecognized user-data header in {config_path}:"
-                    f" {content_header}. Expected one of the following "
-                    f"headers: {', '.join(USERDATA_VALID_HEADERS)}",
+                    f' "{header_line}".\nExpected first line'
+                    f" to be one of: {', '.join(USERDATA_VALID_HEADERS)}",
                 )
             ]
         )
+    elif user_data_type != "text/cloud-config":
+        print(
+            f"User-data type '{user_data_type}' not currently evaluated"
+            " by cloud-init schema"
+        )
+    return UserDataTypeAndDecodedContent(user_data_type, content)
+
+
+def validate_cloudconfig_file(
+    config_path: str,
+    schema: dict,
+    annotate: bool = False,
+    instance_data_path: str = None,
+):
+    """Validate cloudconfig file adheres to a specific jsonschema.
+
+    @param config_path: Path to the yaml cloud-config file to parse, or None
+        to default to system userdata from Paths object.
+    @param schema: Dict describing a valid jsonschema to validate against.
+    @param annotate: Boolean set True to print original config file with error
+        annotations on the offending lines.
+    @param instance_data_path: Path to instance_data JSON, used for text/jinja
+        rendering.
+
+    @raises SchemaValidationError containing any of schema_errors encountered.
+    @raises RuntimeError when config_path does not exist.
+    """
+    decoded_userdata = _get_config_type_and_rendered_userdata(
+        config_path, load_file(config_path, decode=True), instance_data_path
+    )
+    if decoded_userdata.userdata_type != "text/cloud-config":
+        return  # Neither nested #cloud-config in jinja2 nor raw #cloud-config
+    content = decoded_userdata.content
+    errors = process_merged_cloud_config_part_problems(content)
     try:
         if annotate:
             cloudconfig, marks = safeyaml.load_with_marks(content)
@@ -746,12 +787,12 @@ def validate_cloudconfig_file(
         if mark:
             line = mark.line + 1
             column = mark.column + 1
-        errors = [
+        errors.append(
             SchemaProblem(
                 "format-l{line}.c{col}".format(line=line, col=column),
                 "File {0} is not valid yaml. {1}".format(config_path, str(e)),
             ),
-        ]
+        )
         schema_error = SchemaValidationError(errors)
         if annotate:
             print(
@@ -769,13 +810,15 @@ def validate_cloudconfig_file(
             cloudconfig, schema, strict=True, log_deprecations=False
         )
     except SchemaValidationError as e:
+        if e.has_errors():
+            errors += e.schema_errors
         if annotate:
             print(
                 annotated_cloudconfig_file(
                     cloudconfig,
                     content,
                     marks,
-                    schema_errors=e.schema_errors,
+                    schema_errors=errors,
                     schema_deprecations=e.schema_deprecations,
                 )
             )
@@ -786,8 +829,8 @@ def validate_cloudconfig_file(
                 separator=", ",
             )
             print(message)
-        if e.has_errors():  # We do not consider deprecations as error
-            raise
+        if errors:
+            raise SchemaValidationError(schema_errors=errors) from e
 
 
 def _sort_property_order(value):

diff --git a/cloudinit/handlers/cloud_config.py b/cloudinit/handlers/cloud_config.py
@@ -40,6 +40,8 @@
 CLOUD_PREFIX = "#cloud-config"
 JSONP_PREFIX = "#cloud-config-jsonp"
 
+MERGED_PART_SCHEMA_ERROR_PREFIX = "# Cloud-config part ignored SCHEMA_ERROR: "
+
 
 class CloudConfigPartHandler(handlers.Handler):
 
@@ -53,6 +55,7 @@ def __init__(self, paths, **_kwargs):
         if "cloud_config_path" in _kwargs:
             self.cloud_fn = paths.get_ipath(_kwargs["cloud_config_path"])
         self.file_names = []
+        self.error_file_names = []
 
     def _write_cloud_config(self):
         if not self.cloud_fn:
@@ -66,6 +69,8 @@ def _write_cloud_config(self):
                     fn = "?"
                 file_lines.append("# %s" % (fn))
             file_lines.append("")
+        for error_file in self.error_file_names:
+            file_lines.append(f"{MERGED_PART_SCHEMA_ERROR_PREFIX}{error_file}")
         if self.cloud_buf is not None:
             # Something was actually gathered....
             lines = [
@@ -143,6 +148,7 @@ def handle_part(self, data, ctype, filename, payload, frequency, headers):
                 filename = filename.replace(i, " ")
             self.file_names.append(filename.strip())
         except ValueError as err:
+            self.error_file_names.append(filename.strip())
             LOG.warning(
                 "Failed at merging in cloud config part from %s: %s",
                 filename,