cli: schema validation of jinja template user-data (SC-1385) (#2132)

The CLI cloud-init schema now asserts that the leading header comment in user-data files is a valid user-data type. Raise an informative error otherwise about valid user-data types. For user-data files declared with '## template: jinja', render those files first sourcing jinja variables from /run/cloud-init/instance-data.json or a new --instance-data parameter. Once the jinja template is rendered, validate schema of the resulting #cloud-config user-data. This branch also ensures any errors and deprecation warnings are unique. LP: #1881925
author: Chad Smith <chad.smith@canonical.com> 2023-04-28 02:01:31 -0600
committer: GitHub <noreply@github.com> 2023-04-28 10:01:31 +0200
commit: e9d1d3a0efce95b6bc7f63fe01d579b98664c69f (patch)
tree: 91b8868fa804517d54f500e5b5e0a562fd41dae1
parent: aa0cd62c81866d632522bbc54dc03eb4fa7fd913 (diff)
download: cloud-init-git-e9d1d3a0efce95b6bc7f63fe01d579b98664c69f.tar.gz
2 files changed, 278 insertions, 73 deletions
diff --git a/cloudinit/config/schema.py b/cloudinit/config/schema.py
index 9886bde6..9005e924 100644
--- a/cloudinit/config/schema.py
+++ b/cloudinit/config/schema.py
@@ -17,8 +17,14 @@ from typing import TYPE_CHECKING, List, NamedTuple, Optional, Type, Union, cast
 import yaml
 
 from cloudinit import importer, safeyaml
-from cloudinit.stages import Init
-from cloudinit.util import error, get_modules_from_dir, load_file
+from cloudinit.cmd.devel import read_cfg_paths
+from cloudinit.handlers import INCLUSION_TYPES_MAP, type_from_starts_with
+from cloudinit.util import (
+    decode_binary,
+    error,
+    get_modules_from_dir,
+    load_file,
+)
 
 try:
     from jsonschema import ValidationError as _ValidationError
@@ -35,7 +41,6 @@ VERSIONED_USERDATA_SCHEMA_FILE = "versions.schema.cloud-config.json"
 # Also add new version definition to versions.schema.json.
 USERDATA_SCHEMA_FILE = "schema-cloud-config-v1.json"
 _YAML_MAP = {True: "true", False: "false", None: "null"}
-CLOUD_CONFIG_HEADER = b"#cloud-config"
 SCHEMA_DOC_TMPL = """
 {name}
 {title_underbar}
@@ -64,6 +69,10 @@ SCHEMA_EXAMPLES_SPACER_TEMPLATE = "\n    # --- Example{0} ---"
 DEPRECATED_KEY = "deprecated"
 DEPRECATED_PREFIX = "DEPRECATED: "
 
+# user-data files typically must begin with a leading '#'
+USERDATA_VALID_HEADERS = sorted(
+    [t for t in INCLUSION_TYPES_MAP.keys() if t[0] == "#"]
+)
 
 # type-annotate only if type-checking.
 # Consider to add `type_extensions` as a dependency when Bionic is EOL.
@@ -129,20 +138,26 @@ class SchemaValidationError(ValueError):
             ((flat.config.key, msg),)
         """
         message = ""
-        if schema_errors:
-            message += _format_schema_problems(
-                schema_errors, prefix="Cloud config schema errors: "
-            )
-        if schema_deprecations:
+
+        def handle_problems(problems, prefix):
+            if not problems:
+                return problems
+            nonlocal message
             if message:
                 message += "\n\n"
-            message += _format_schema_problems(
-                schema_deprecations,
-                prefix="Cloud config schema deprecations: ",
-            )
+            problems = sorted(list(set(problems)))
+            message += _format_schema_problems(problems, prefix=prefix)
+            return problems
+
+        self.schema_errors = handle_problems(
+            schema_errors,
+            prefix="Cloud config schema errors: ",
+        )
+        self.schema_deprecations = handle_problems(
+            schema_deprecations,
+            prefix="Cloud config schema deprecations: ",
+        )
         super().__init__(message)
-        self.schema_errors = schema_errors
-        self.schema_deprecations = schema_deprecations
 
     def has_errors(self) -> bool:
         return bool(self.schema_errors)
@@ -639,7 +654,12 @@ def annotated_cloudconfig_file(
     )
 
 
-def validate_cloudconfig_file(config_path, schema, annotate=False):
+def validate_cloudconfig_file(
+    config_path: str,
+    schema: dict,
+    annotate: bool = False,
+    instance_data_path: str = None,
+):
     """Validate cloudconfig file adheres to a specific jsonschema.
 
     @param config_path: Path to the yaml cloud-config file to parse, or None
@@ -647,28 +667,69 @@ def validate_cloudconfig_file(config_path, schema, annotate=False):
     @param schema: Dict describing a valid jsonschema to validate against.
     @param annotate: Boolean set True to print original config file with error
         annotations on the offending lines.
+    @param instance_data_path: Path to instance_data JSON, used for text/jinja
+        rendering.
 
     @raises SchemaValidationError containing any of schema_errors encountered.
     @raises RuntimeError when config_path does not exist.
     """
+    from cloudinit.handlers.jinja_template import (
+        JinjaLoadError,
+        NotJinjaError,
+        render_jinja_payload_from_file,
+    )
+
     content = load_file(config_path, decode=False)
-    if not content.startswith(CLOUD_CONFIG_HEADER):
-        errors = [
-            SchemaProblem(
-                "format-l1.c1",
-                'File {0} needs to begin with "{1}"'.format(
-                    config_path, CLOUD_CONFIG_HEADER.decode()
-                ),
-            ),
-        ]
-        error = SchemaValidationError(errors)
-        if annotate:
-            print(
-                annotated_cloudconfig_file(
-                    {}, content, {}, schema_errors=error.schema_errors
+    user_data_type = type_from_starts_with(content)
+    schema_position = "format-l1.c1"
+    if not user_data_type:
+        raise SchemaValidationError(
+            [
+                SchemaProblem(
+                    schema_position,
+                    f"No valid cloud-init user-data header in {config_path}.\n"
+                    "Expected first line to be one of: "
+                    f"{', '.join(USERDATA_VALID_HEADERS)}",
                 )
-            )
-        raise error
+            ]
+        )
+    if user_data_type not in ("text/cloud-config", "text/jinja2"):
+        print(
+            f"User-data type '{user_data_type}' not currently evaluated"
+            " by cloud-init schema"
+        )
+        return
+    if user_data_type == "text/jinja2":
+        try:
+            content = render_jinja_payload_from_file(
+                decode_binary(content), config_path, instance_data_path
+            ).encode()
+        except NotJinjaError as e:
+            raise SchemaValidationError(
+                [
+                    SchemaProblem(
+                        schema_position,
+                        "Detected type '{user_data_type}' from header. "
+                        "But, content is not a jinja template",
+                    )
+                ]
+            ) from e
+        except JinjaLoadError as e:
+            error(str(e), sys_exit=True)
+        schema_position = "format-l2.c1"
+        user_data_type = type_from_starts_with(content)
+    if not user_data_type:
+        content_header = content[: decode_binary(content).find("\n")]
+        raise SchemaValidationError(
+            [
+                SchemaProblem(
+                    schema_position,
+                    f"Unrecognized user-data header in {config_path}:"
+                    f" {content_header}. Expected one of the following "
+                    f"headers: {', '.join(USERDATA_VALID_HEADERS)}",
+                )
+            ]
+        )
     try:
         if annotate:
             cloudconfig, marks = safeyaml.load_with_marks(content)
@@ -691,14 +752,14 @@ def validate_cloudconfig_file(config_path, schema, annotate=False):
                 "File {0} is not valid yaml. {1}".format(config_path, str(e)),
             ),
         ]
-        error = SchemaValidationError(errors)
+        schema_error = SchemaValidationError(errors)
         if annotate:
             print(
                 annotated_cloudconfig_file(
-                    {}, content, {}, schema_errors=error.schema_errors
+                    {}, content, {}, schema_errors=schema_error.schema_errors
                 )
             )
-        raise error from e
+        raise schema_error from e
     if not isinstance(cloudconfig, dict):
         # Return a meaningful message on empty cloud-config
         if not annotate:
@@ -1153,6 +1214,16 @@ def get_parser(parser=None):
         help="Path of the cloud-config yaml file to validate",
     )
     parser.add_argument(
+        "-i",
+        "--instance-data",
+        type=str,
+        help=(
+            "Path to instance-data.json file for variable expansion "
+            "of '##template: jinja' user-data. Default: "
+            f"{read_cfg_paths().get_runpath('instance_data')}"
+        ),
+    )
+    parser.add_argument(
         "--system",
         action="store_true",
         default=False,
@@ -1193,6 +1264,13 @@ def handle_schema_args(name, args):
     if args.docs:
         print(load_doc(args.docs))
         return
+    paths = read_cfg_paths()
+    if args.instance_data:
+        instance_data_path = args.instance_data
+    elif os.getuid() != 0:
+        instance_data_path = paths.get_runpath("instance_data")
+    else:
+        instance_data_path = paths.get_runpath("instance_data_sensitive")
     if args.config_file:
         config_files = (("user-data", args.config_file),)
     else:
@@ -1202,9 +1280,7 @@ def handle_schema_args(name, args):
                 " user. Try using sudo.",
                 sys_exit=True,
             )
-        init = Init(ds_deps=[])
-        init.fetch(existing="trust")
-        userdata_file = init.paths.get_ipath("cloud_config")
+        userdata_file = paths.get_ipath("cloud_config")
         if not userdata_file:
             error(
                 "Unable to obtain user data file. No instance data available",
@@ -1213,8 +1289,8 @@ def handle_schema_args(name, args):
             return  # Helps typing
         config_files = (("user-data", userdata_file),)
         vendor_config_files = (
-            ("vendor-data", init.paths.get_ipath("vendor_cloud_config")),
-            ("vendor2-data", init.paths.get_ipath("vendor2_cloud_config")),
+            ("vendor-data", paths.get_ipath("vendor_cloud_config")),
+            ("vendor2-data", paths.get_ipath("vendor2_cloud_config")),
         )
         for cfg_type, vendor_file in vendor_config_files:
             if vendor_file and os.path.exists(vendor_file):
@@ -1240,7 +1316,9 @@ def handle_schema_args(name, args):
         if multi_config_output:
             print(f"\n{idx}. {cfg_type} at {cfg_file}:")
         try:
-            validate_cloudconfig_file(cfg_file, full_schema, args.annotate)
+            validate_cloudconfig_file(
+                cfg_file, full_schema, args.annotate, instance_data_path
+            )
         except SchemaValidationError as e:
             if not args.annotate:
                 print(f"{nested_output_prefix}Invalid cloud-config {cfg_file}")
diff --git a/tests/unittests/config/test_schema.py b/tests/unittests/config/test_schema.py
index 0e49dbf3..ceb689a4 100644
--- a/tests/unittests/config/test_schema.py
+++ b/tests/unittests/config/test_schema.py
@@ -19,9 +19,8 @@ from typing import List, Optional, Sequence, Set
 
 import pytest
 
-from cloudinit import log, stages
+from cloudinit import log
 from cloudinit.config.schema import (
-    CLOUD_CONFIG_HEADER,
     VERSIONED_USERDATA_SCHEMA_FILE,
     MetaSchema,
     SchemaProblem,
@@ -53,7 +52,6 @@ from tests.unittests.helpers import (
     skipUnlessJsonSchema,
     skipUnlessJsonSchemaVersionGreaterThan,
 )
-from tests.unittests.util import FakeDataSource
 
 M_PATH = "cloudinit.config.schema."
 DEPRECATED_LOG_LEVEL = 35
@@ -720,14 +718,14 @@ class TestValidateCloudConfigFile:
         """On invalid header, validate_cloudconfig_file errors.
 
         A SchemaValidationError is raised when the file doesn't begin with
-        CLOUD_CONFIG_HEADER.
+        known headers.
         """
         config_file = tmpdir.join("my.yaml")
         config_file.write("#junk")
         error_msg = (
-            "Cloud config schema errors: format-l1.c1: File"
-            f" {config_file} needs to begin with"
-            f' "{CLOUD_CONFIG_HEADER.decode()}"'
+            f"No valid cloud-init user-data header in {config_file}.\n"
+            "Expected first line to be one of: #!, ## template: jinja, "
+            "#cloud-boothook, #cloud-config,"
         )
         with pytest.raises(SchemaValidationError, match=error_msg):
             validate_cloudconfig_file(config_file.strpath, {}, annotate)
@@ -777,28 +775,46 @@ class TestValidateCloudConfigFile:
 
     @skipUnlessJsonSchema()
     @pytest.mark.parametrize("annotate", (True, False))
+    def test_validateconfig_file_squelches_duplicate_errors(
+        self, annotate, tmpdir
+    ):
+        """validate_cloudconfig_file raises only unique errors."""
+        config_file = tmpdir.join("my.yaml")
+        schema = {  # Define duplicate schema definitions in different sections
+            "allOf": [
+                {"properties": {"p1": {"type": "string", "format": "string"}}},
+                {"properties": {"p1": {"type": "string", "format": "string"}}},
+            ]
+        }
+        config_file.write("#cloud-config\np1: -1")
+        error_msg = (  # Strict match of full error
+            "Cloud config schema errors: p1: -1 is not of type 'string'$"
+        )
+        with pytest.raises(SchemaValidationError, match=error_msg):
+            validate_cloudconfig_file(config_file.strpath, schema, annotate)
+
+    @skipUnlessJsonSchema()
+    @pytest.mark.parametrize("annotate", (True, False))
+    @mock.patch(M_PATH + "read_cfg_paths")
     @mock.patch("cloudinit.url_helper.time.sleep")
     def test_validateconfig_file_no_cloud_cfg(
-        self, m_sleep, annotate, capsys, mocker
+        self, m_sleep, read_cfg_paths, annotate, paths, capsys, mocker
     ):
         """validate_cloudconfig_file does noop with empty user-data."""
         schema = {"properties": {"p1": {"type": "string", "format": "string"}}}
-        blob = ""
 
-        ci = stages.Init()
-        ci.datasource = FakeDataSource(blob)
-        mocker.patch(M_PATH + "Init", return_value=ci)
-        cloud_config_file = ci.paths.get_ipath_cur("cloud_config")
+        paths.get_ipath = paths.get_ipath_cur
+        read_cfg_paths.return_value = paths
+        cloud_config_file = paths.get_ipath_cur("cloud_config")
         write_file(cloud_config_file, b"")
 
-        with pytest.raises(
-            SchemaValidationError,
-            match=re.escape(
-                "Cloud config schema errors: format-l1.c1:"
-                f" File {cloud_config_file} needs to begin with"
-                ' "#cloud-config"'
-            ),
-        ):
+        error_msg = (
+            "Cloud config schema errors: format-l1.c1: "
+            f"No valid cloud-init user-data header in {cloud_config_file}.\n"
+            "Expected first line to be one of: #!, ## template: jinja,"
+            " #cloud-boothook, #cloud-config,"
+        )
+        with pytest.raises(SchemaValidationError, match=error_msg):
             validate_cloudconfig_file(cloud_config_file, schema, annotate)
 
 
@@ -1603,6 +1619,7 @@ class TestAnnotatedCloudconfigFile:
         )
 
 
+@mock.patch(M_PATH + "read_cfg_paths")  # called by parse_args help docs
 class TestMain:
 
     exclusive_combinations = itertools.combinations(
@@ -1610,7 +1627,7 @@ class TestMain:
     )
 
     @pytest.mark.parametrize("params", exclusive_combinations)
-    def test_main_exclusive_args(self, params, capsys):
+    def test_main_exclusive_args(self, _read_cfg_paths, params, capsys):
         """Main exits non-zero and error on required exclusive args."""
         params = list(itertools.chain(*[a.split() for a in params]))
         with mock.patch("sys.argv", ["mycmd"] + params):
@@ -1625,7 +1642,7 @@ class TestMain:
         )
         assert expected == err
 
-    def test_main_missing_args(self, capsys):
+    def test_main_missing_args(self, _read_cfg_paths, capsys):
         """Main exits non-zero and reports an error on missing parameters."""
         with mock.patch("sys.argv", ["mycmd"]):
             with pytest.raises(SystemExit) as context_manager:
@@ -1639,7 +1656,7 @@ class TestMain:
         )
         assert expected == err
 
-    def test_main_absent_config_file(self, capsys):
+    def test_main_absent_config_file(self, _read_cfg_paths, capsys):
         """Main exits non-zero when config file is absent."""
         myargs = ["mycmd", "--annotate", "--config-file", "NOT_A_FILE"]
         with mock.patch("sys.argv", myargs):
@@ -1649,7 +1666,7 @@ class TestMain:
         _out, err = capsys.readouterr()
         assert "Error: Config file NOT_A_FILE does not exist\n" == err
 
-    def test_main_invalid_flag_combo(self, capsys):
+    def test_main_invalid_flag_combo(self, _read_cfg_paths, capsys):
         """Main exits non-zero when invalid flag combo used."""
         myargs = ["mycmd", "--annotate", "--docs", "DOES_NOT_MATTER"]
         with mock.patch("sys.argv", myargs):
@@ -1662,7 +1679,7 @@ class TestMain:
             "Cannot use --annotate with --docs\n" == err
         )
 
-    def test_main_prints_docs(self, capsys):
+    def test_main_prints_docs(self, _read_cfg_paths, capsys):
         """When --docs parameter is provided, main generates documentation."""
         myargs = ["mycmd", "--docs", "all"]
         with mock.patch("sys.argv", myargs):
@@ -1671,7 +1688,7 @@ class TestMain:
         assert "\nNTP\n---\n" in out
         assert "\nRuncmd\n------\n" in out
 
-    def test_main_validates_config_file(self, tmpdir, capsys):
+    def test_main_validates_config_file(self, _read_cfg_paths, tmpdir, capsys):
         """When --config-file parameter is provided, main validates schema."""
         myyaml = tmpdir.join("my.yaml")
         myargs = ["mycmd", "--config-file", myyaml.strpath]
@@ -1681,13 +1698,14 @@ class TestMain:
         out, _err = capsys.readouterr()
         assert f"Valid cloud-config: {myyaml}\n" == out
 
+    @mock.patch(M_PATH + "read_cfg_paths")
     @mock.patch(M_PATH + "os.getuid", return_value=0)
     def test_main_validates_system_userdata_and_vendordata(
-        self, _getuid, capsys, mocker, paths
+        self, _read_cfg_paths, _getuid, read_cfg_paths, capsys, mocker, paths
     ):
         """When --system is provided, main validates system userdata."""
-        m_init = mocker.patch(M_PATH + "Init")
-        m_init.return_value.paths.get_ipath = paths.get_ipath_cur
+        paths.get_ipath = paths.get_ipath_cur
+        read_cfg_paths.return_value = paths
         cloud_config_file = paths.get_ipath_cur("cloud_config")
         write_file(cloud_config_file, b"#cloud-config\nntp:")
         vd_file = paths.get_ipath_cur("vendor_cloud_config")
@@ -1721,7 +1739,9 @@ class TestMain:
         )
 
     @mock.patch(M_PATH + "os.getuid", return_value=1000)
-    def test_main_system_userdata_requires_root(self, m_getuid, capsys, paths):
+    def test_main_system_userdata_requires_root(
+        self, _read_cfg_paths, m_getuid, capsys, paths
+    ):
         """Non-root user can't use --system param"""
         myargs = ["mycmd", "--system"]
         with mock.patch("sys.argv", myargs):
@@ -1861,7 +1881,7 @@ class TestSchemaFuzz:
 
 class TestHandleSchemaArgs:
 
-    Args = namedtuple("Args", "config_file docs system annotate")
+    Args = namedtuple("Args", "config_file docs system annotate instance_data")
 
     @pytest.mark.parametrize(
         "annotate, expected_output",
@@ -1902,9 +1922,19 @@ apt_reboot_if_required: Default: ``false``. Deprecated in version 22.2.\
             ),
         ],
     )
+    @mock.patch(M_PATH + "read_cfg_paths")
     def test_handle_schema_args_annotate_deprecated_config(
-        self, annotate, expected_output, caplog, capsys, tmpdir
+        self,
+        read_cfg_paths,
+        annotate,
+        expected_output,
+        paths,
+        caplog,
+        capsys,
+        tmpdir,
     ):
+        paths.get_ipath = paths.get_ipath_cur
+        read_cfg_paths.return_value = paths
         user_data_fn = tmpdir.join("user-data")
         with open(user_data_fn, "w") as f:
             f.write(
@@ -1924,6 +1954,7 @@ apt_reboot_if_required: Default: ``false``. Deprecated in version 22.2.\
             annotate=annotate,
             docs=None,
             system=None,
+            instance_data=None,
         )
         handle_schema_args("unused", args)
         out, err = capsys.readouterr()
@@ -1933,3 +1964,99 @@ apt_reboot_if_required: Default: ``false``. Deprecated in version 22.2.\
         )
         assert not err
         assert "deprec" not in caplog.text
+
+    @pytest.mark.parametrize(
+        "uid, annotate, expected_out, expected_err, expectation",
+        [
+            pytest.param(
+                0,
+                True,
+                dedent(
+                    """\
+                    #cloud-config
+                    hostname: 123		# E1
+
+                    # Errors: -------------
+                    # E1: 123 is not of type 'string'
+
+
+                    """  # noqa: E501
+                ),
+                "",
+                does_not_raise(),
+                id="root_annotate_unique_errors_no_exception",
+            ),
+            pytest.param(
+                0,
+                False,
+                dedent(
+                    """\
+                    Invalid cloud-config {cfg_file}
+                    """  # noqa: E501
+                ),
+                dedent(
+                    """\
+                    Error: Cloud config schema errors: hostname: 123 is not of type 'string'
+
+                    Error: Invalid cloud-config schema: user-data
+
+                    """  # noqa: E501
+                ),
+                pytest.raises(SystemExit),
+                id="root_no_annotate_exception_with_unique_errors",
+            ),
+        ],
+    )
+    @mock.patch(M_PATH + "os.getuid")
+    @mock.patch(M_PATH + "read_cfg_paths")
+    def test_handle_schema_args_jinja_with_errors(
+        self,
+        read_cfg_paths,
+        getuid,
+        uid,
+        annotate,
+        expected_out,
+        expected_err,
+        expectation,
+        paths,
+        caplog,
+        capsys,
+        tmpdir,
+    ):
+        getuid.return_value = uid
+        paths.get_ipath = paths.get_ipath_cur
+        read_cfg_paths.return_value = paths
+        user_data_fn = tmpdir.join("user-data")
+        if uid == 0:
+            id_path = paths.get_runpath("instance_data_sensitive")
+        else:
+            id_path = paths.get_runpath("instance_data")
+        with open(id_path, "w") as f:
+            f.write(json.dumps({"ds": {"asdf": 123}}))
+        with open(user_data_fn, "w") as f:
+            f.write(
+                dedent(
+                    """\
+                    ## template: jinja
+                    #cloud-config
+                    hostname: {{ ds.asdf }}
+                    """
+                )
+            )
+        args = self.Args(
+            config_file=str(user_data_fn),
+            annotate=annotate,
+            docs=None,
+            system=None,
+            instance_data=None,
+        )
+        with expectation:
+            handle_schema_args("unused", args)
+        out, err = capsys.readouterr()
+        assert (
+            expected_out.format(cfg_file=user_data_fn, id_path=id_path) == out
+        )
+        assert (
+            expected_err.format(cfg_file=user_data_fn, id_path=id_path) == err
+        )
+        assert "deprec" not in caplog.text
author	Chad Smith <chad.smith@canonical.com>	2023-04-28 02:01:31 -0600
committer	GitHub <noreply@github.com>	2023-04-28 10:01:31 +0200
commit	e9d1d3a0efce95b6bc7f63fe01d579b98664c69f (patch)
tree	91b8868fa804517d54f500e5b5e0a562fd41dae1
parent	aa0cd62c81866d632522bbc54dc03eb4fa7fd913 (diff)
download	cloud-init-git-e9d1d3a0efce95b6bc7f63fe01d579b98664c69f.tar.gz