config_tools: composing operations around XMLs as pipelines

There is an increasing demand for composing different operations around XML schemas and/or data in different ways for different purposes. Today we already have: - Validate XML data, which takes XML schemas and data (board and scenario) as inputs. - Fill in missing nodes in XML data with default values, which takes XML schema and data (scenario only) as inputs. In the near future we'll extend the operations around XMLs by introducing XML schema preprocessing and XML data upgrading, adding more possibilities to construct a larger operation by composing smaller ones. To minimize code repetition and ease composition, this patch introduces an infrastructure that abstracts each operation as a pipeline stage. Each stage defines its own inputs and outputs, and stages can be composed sequentially into a larger, single operation. The existing operations listed above, along with XML file loaders, are then refactored to provide pipeline stages. The main methods are also refined to complete their tasks by constructing and invoking pipelines. Tracked-On: #6690 Signed-off-by: Junjie Mao <junjie.mao@intel.com>
2025-08-09 20:18:19 +00:00 · 2022-02-23 22:05:51 +08:00 · 2022-02-23 22:05:51 +08:00 · 0a7910c7f0
commit 0a7910c7f0
parent 4fb6ad247a
5 changed files with 273 additions and 74 deletions
--- a/misc/config_tools/scenario_config/default_populator.py
+++ b/misc/config_tools/scenario_config/default_populator.py
@ -5,10 +5,13 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #

+import os
 import argparse
-import lxml.etree as etree
+
 from scenario_transformer import ScenarioTransformer

+from pipeline import PipelineObject, PipelineStage, PipelineEngine
+
 class DefaultValuePopulator(ScenarioTransformer):
    def add_missing_nodes(self, xsd_element_node, xml_parent_node, new_node_index):
        element_name = xsd_element_node.get("name")
@ -20,7 +23,7 @@ class DefaultValuePopulator(ScenarioTransformer):
        if self.complex_type_of_element(xsd_element_node) is None and default_value is None:
            return []

-        new_node = etree.Element(element_name)
+        new_node = xml_parent_node.makeelement(element_name, {})
        new_node.text = default_value

        if new_node_index is not None:
@ -30,15 +33,30 @@ class DefaultValuePopulator(ScenarioTransformer):

        return [new_node]

+class DefaultValuePopulatingStage(PipelineStage):
+    """Pipeline stage that runs DefaultValuePopulator over the scenario tree."""
+
+    uses = {"schema_etree", "scenario_etree"}
+    provides = {"scenario_etree"}
+
+    def run(self, obj):
+        """Transform the tree stored under "scenario_etree" and store the same tree object back."""
+        transformer = DefaultValuePopulator(obj.get("schema_etree"))
+        scenario_tree = obj.get("scenario_etree")
+        transformer.transform(scenario_tree)
+        obj.set("scenario_etree", scenario_tree)
+
 def main(xsd_file, xml_file, out_file):
-    xsd_etree = etree.parse(xsd_file)
-    xsd_etree.xinclude()
-    populator = DefaultValuePopulator(xsd_etree)
+    from xml_loader import XMLLoadStage
+    from lxml_loader import LXMLLoadStage

-    xml_etree = etree.parse(xml_file, etree.XMLParser(remove_blank_text=True))
-    populator.transform(xml_etree)
+    pipeline = PipelineEngine(["schema_path", "scenario_path"])
+    pipeline.add_stages([
+        LXMLLoadStage("schema"),
+        XMLLoadStage("scenario"),
+        DefaultValuePopulatingStage(),
+    ])

-    xml_etree.write(out_file, pretty_print=True)
+    obj = PipelineObject(schema_path = xsd_file, scenario_path = xml_file)
+    pipeline.run(obj)
+    obj.get("scenario_etree").write(out_file)

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Populate a given scenario XML with default values of nonexistent nodes")
--- a/misc/config_tools/scenario_config/lxml_loader.py
+++ b/misc/config_tools/scenario_config/lxml_loader.py
@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2022 Intel Corporation.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+from lxml.etree import parse, XMLParser
+from pipeline import PipelineStage
+
+class LXMLLoadStage(PipelineStage):
+    """Pipeline stage that parses the file at "<tag>_path" with lxml and provides "<tag>_etree"."""
+
+    def __init__(self, tag):
+        # Per-instance strings; the pipeline engine interprets a string as a unit set.
+        self.consumes, self.provides = f"{tag}_path", f"{tag}_etree"
+
+    def run(self, obj):
+        """Parse the file with blank text removed, process XInclude, and store the tree in obj."""
+        source_path = obj.get(self.consumes)
+        tree = parse(source_path, XMLParser(remove_blank_text=True))
+        tree.xinclude()
+        obj.set(self.provides, tree)
--- a/misc/config_tools/scenario_config/pipeline.py
+++ b/misc/config_tools/scenario_config/pipeline.py
@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2022 Intel Corporation.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+class PipelineObject:
+    """Tag-indexed container of the data passed between pipeline stages."""
+
+    def __init__(self, **kwargs):
+        """Store every keyword argument as an initial tag/value pair."""
+        self.data = {}
+        for tag, value in kwargs.items():
+            self.set(tag, value)
+
+    def set(self, tag, data):
+        """Bind data to tag, replacing any previous value."""
+        self.data[tag] = data
+
+    def get(self, tag):
+        """Return the value bound to tag; raises KeyError if tag is absent."""
+        return self.data[tag]
+
+    def has(self, tag):
+        """Tell whether a value is bound to tag."""
+        return tag in self.data
+
+    def consume(self, tag):
+        """Remove tag and return its value, or None if tag is absent."""
+        return self.data.pop(tag, None)
+
+    def dump(self):
+        """Print the underlying dictionary to standard output."""
+        print(self.data)
+
+class PipelineStage:
+    # Base class of pipeline stages. Subclasses override run() and declare their data dependencies below.
+    #
+    # The following three class variables define the inputs and outputs of the stage. Each of them can be either a set
+    # or a string (which is interpreted as a unit set).
+
+    consumes = set()        # Data consumed by this stage. Consumed data will be unavailable to later stages.
+    uses = set()            # Data used but not consumed by this stage.
+    provides = set()        # Data provided by this stage.
+
+    def run(self, obj):
+        # Execute the stage on the pipeline object `obj`. The base implementation always raises.
+        raise NotImplementedError
+
+class PipelineEngine:
+    """Run a sequence of stages on a pipeline object, checking data dependencies as stages are added.
+
+    `initial_data` is an iterable of tags that the object must already hold when run() is called.
+    """
+
+    def __init__(self, initial_data = ()):
+        # An immutable default avoids sharing one mutable list object across all calls.
+        self.stages = []
+        self.initial_data = set(initial_data)
+        self.available_data = set(initial_data)
+
+    @staticmethod
+    def _as_set(tags):
+        """Normalize a stage's tag declaration: a set is returned as is, anything else becomes a unit set."""
+        return tags if isinstance(tags, set) else {tags}
+
+    def add_stage(self, stage):
+        """Append `stage` to the pipeline and update the set of data available to later stages.
+
+        Raises Exception if the stage consumes or uses data that is not available at this point of the pipeline.
+        """
+        consumes = self._as_set(stage.consumes)
+        uses = self._as_set(stage.uses)
+        provides = self._as_set(stage.provides)
+
+        # Report everything that is missing, including consumed tags and not only the merely used ones.
+        missing = consumes.union(uses) - self.available_data
+        if missing:
+            raise Exception(f"Data {missing} needed by stage {stage.__class__.__name__} but not provided by the pipeline")
+
+        self.stages.append(stage)
+        self.available_data = self.available_data.difference(consumes).union(provides)
+
+    def add_stages(self, stages):
+        """Append each stage in `stages`, in order."""
+        for stage in stages:
+            self.add_stage(stage)
+
+    def run(self, obj):
+        """Run all stages on `obj` in order, dropping each stage's consumed data after it runs.
+
+        Raises AttributeError if `obj` lacks any of the pipeline's initial data.
+        """
+        for tag in self.initial_data:
+            if not obj.has(tag):
+                raise AttributeError(f"Data {tag} is needed by the pipeline but not provided by the object")
+
+        for stage in self.stages:
+            stage.run(obj)
+
+            # A tag that the stage both consumes and provides holds the stage's fresh output; keep it so that the
+            # object stays consistent with the availability computed in add_stage().
+            for tag in self._as_set(stage.consumes) - self._as_set(stage.provides):
+                obj.consume(tag)
--- a/misc/config_tools/scenario_config/validator.py
+++ b/misc/config_tools/scenario_config/validator.py
@ -7,8 +7,9 @@

 import sys, os
 import argparse
-import lxml.etree as etree
 import logging
+from copy import copy
+from collections import namedtuple

 try:
    import xmlschema
@ -18,7 +19,8 @@ except ImportError:
                  "To enable the validation, install the python package by executing: pip3 install xmlschema.")
    sys.exit(0)

-from default_populator import DefaultValuePopulator
+from pipeline import PipelineObject, PipelineStage, PipelineEngine
+from default_populator import DefaultValuePopulatingStage

 def existing_file_type(parser):
    def aux(arg):
@ -39,106 +41,174 @@ def log_level_type(parser):
            parser.error(f"{arg} is not a valid log level")
    return aux

-def load_schema(xsd_xml, datachecks_xml):
-    global schema, schema_etree, datachecks
+class ValidationError(dict):
+    def __init__(self, paths, message, severity):
+        super().__init__(paths = paths, message = message, severity = severity)

-    schema_etree = etree.parse(xsd_xml)
-    schema_etree.xinclude()
-    schema = xmlschema.XMLSchema11(etree.tostring(schema_etree, encoding="unicode"))
+    def __str__(self):
+        return f"{', '.join(self['paths'])}: {self['message']}"

-    datachecks_etree = etree.parse(datachecks_xml)
-    datachecks_etree.xinclude()
-    datachecks = xmlschema.XMLSchema11(etree.tostring(datachecks_etree, encoding="unicode"))
+class ScenarioValidator:
+    def __init__(self, schema_etree, datachecks_etree):
+        """Initialize the validator with preprocessed schemas in ElementTree."""
+        self.schema = xmlschema.XMLSchema11(schema_etree)
+        self.datachecks = xmlschema.XMLSchema11(datachecks_etree)

-config_tools_dir = os.path.join(os.path.dirname(__file__), "..")
-schema_dir = os.path.join(config_tools_dir, "schema")
-schema = None
-schema_etree = None
-datachecks = None
-load_schema(os.path.join(schema_dir, "config.xsd"), os.path.join(schema_dir, "datachecks.xsd"))
+    def check_syntax(self, scenario_etree):
+        errors = []

-def validate_one(board_xml, scenario_xml):
-    nr_schema_errors = 0
-    nr_check_errors = 0
-    nr_check_warnings = 0
-    board_name = os.path.basename(board_xml)
-    scenario_name = os.path.basename(scenario_xml)
-
-    scenario_etree = etree.parse(scenario_xml, etree.XMLParser(remove_blank_text=True))
-    DefaultValuePopulator(schema_etree).transform(scenario_etree)
-
-    it = schema.iter_errors(scenario_etree)
+        it = self.schema.iter_errors(scenario_etree)
        for error in it:
-        logging.debug(error)
-        nr_schema_errors += 1
+            # Syntactic errors are always critical.
+            e = ValidationError([error.path], error.reason, "critical")
+            logging.debug(e)
+            errors.append(e)

-    if nr_schema_errors == 0:
-        main_etree = etree.parse(board_xml)
-        main_etree.getroot().extend(scenario_etree.getroot()[:])
+        return errors

-        it = datachecks.iter_errors(main_etree)
+    def check_semantics(self, board_etree, scenario_etree):
+        errors = []
+
+        unified_node = copy(scenario_etree.getroot())
+        unified_node.extend(board_etree.getroot())
+        it = self.datachecks.iter_errors(unified_node)
        for error in it:
-            logging.debug(error)
-
+            logging.debug(f"{error.elem}: {error.message}")
            anno = error.validator.annotation
            severity = anno.elem.get("{https://projectacrn.org}severity")
+            errors.append(ValidationError([error.elem.tag], error.message, severity))

-            if severity == "error":
-                nr_check_errors += 1
-            elif severity == "warning":
-                nr_check_warnings += 1
+        return errors

-        if nr_check_errors > 0:
-            logging.error(f"Board {board_name} and scenario {scenario_name} have inconsistent data: {nr_check_errors} errors, {nr_check_warnings} warnings.")
-        elif nr_check_warnings > 0:
-            logging.warning(f"Board {board_name} and scenario {scenario_name} have inconsistent data: {nr_check_warnings} warnings.")
+class ValidatorConstructionStage(PipelineStage):
+    """Pipeline stage that builds a ScenarioValidator from "schema_etree" and "datachecks_etree"."""
+
+    # The schema etree may still be useful for schema-based transformation. Do not consume it.
+    uses = {"schema_etree"}
+    consumes = {"datachecks_etree"}
+    provides = {"validator"}
+
+    def run(self, obj):
+        """Construct the validator and store it under "validator"."""
+        schema_tree, datachecks_tree = obj.get("schema_etree"), obj.get("datachecks_etree")
+        obj.set("validator", ScenarioValidator(schema_tree, datachecks_tree))
+
+class ValidatorConstructionByFileStage(PipelineStage):
+    """Pipeline stage that builds a ScenarioValidator directly from "schema_path" and "datachecks_path"."""
+
+    uses = {"schema_path", "datachecks_path"}
+    provides = {"validator"}
+
+    def run(self, obj):
+        """Construct the validator from the two paths and store it under "validator"."""
+        schema_file, datachecks_file = obj.get("schema_path"), obj.get("datachecks_path")
+        obj.set("validator", ScenarioValidator(schema_file, datachecks_file))
+
+class SyntacticValidationStage(PipelineStage):
+    """Pipeline stage that stores the validator's check_syntax() result as "syntactic_errors"."""
+
+    uses = {"validator", "scenario_etree"}
+    provides = {"syntactic_errors"}
+
+    def run(self, obj):
+        """Run check_syntax() on "scenario_etree" and store the returned errors."""
+        validator = obj.get("validator")
+        scenario_tree = obj.get("scenario_etree")
+        obj.set("syntactic_errors", validator.check_syntax(scenario_tree))
+
+class SemanticValidationStage(PipelineStage):
+    """Pipeline stage that stores the validator's check_semantics() result as "semantic_errors"."""
+
+    uses = {"validator", "board_etree", "scenario_etree"}
+    provides = {"semantic_errors"}
+
+    def run(self, obj):
+        """Run check_semantics() on "board_etree" and "scenario_etree" and store the returned errors."""
+        validator = obj.get("validator")
+        board_tree = obj.get("board_etree")
+        scenario_tree = obj.get("scenario_etree")
+        obj.set("semantic_errors", validator.check_semantics(board_tree, scenario_tree))
+
+class ReportValidationResultStage(PipelineStage):
+    consumes = {"board_etree", "scenario_etree", "syntactic_errors", "semantic_errors"}
+    provides = {"nr_all_errors"}
+
+    def run(self, obj):
+        board_name = obj.get("board_etree").getroot().get("board")
+        scenario_name = obj.get("scenario_etree").getroot().get("scenario")
+
+        nr_critical = len(obj.get("syntactic_errors"))
+        nr_error = len(list(filter(lambda e: e["severity"] == "error", obj.get("semantic_errors"))))
+        nr_warning = len(list(filter(lambda e: e["severity"] == "warning", obj.get("semantic_errors"))))
+
+        if nr_critical > 0 or nr_error > 0:
+            logging.error(f"Board {board_name} and scenario {scenario_name} are inconsistent: {nr_critical} syntax errors, {nr_error} data errors, {nr_warning} warnings.")
+        elif nr_warning > 0:
+            logging.warning(f"Board {board_name} and scenario {scenario_name} are potentially inconsistent: {nr_warning} warnings.")
        else:
            logging.info(f"Board {board_name} and scenario {scenario_name} are valid and consistent.")
-    else:
-        logging.warning(f"Scenario {scenario_name} is invalid: {nr_schema_errors} schema errors.")

-    return nr_schema_errors + nr_check_errors + nr_check_warnings
+        obj.set("nr_all_errors", nr_critical + nr_error + nr_warning)

-def validate_board(board_xml):
+def validate_one(validation_pipeline, pipeline_obj, board_xml, scenario_xml):
+    """Run the validation pipeline on one board/scenario pair.
+
+    Returns the value stored under "nr_all_errors", which is removed from pipeline_obj.
+    """
+    for tag, path in (("board_path", board_xml), ("scenario_path", scenario_xml)):
+        pipeline_obj.set(tag, path)
+    validation_pipeline.run(pipeline_obj)
+    return pipeline_obj.consume("nr_all_errors")
+
+def validate_board(validation_pipeline, pipeline_obj, board_xml):
    board_dir = os.path.dirname(board_xml)
-    nr_violations = 0
+    nr_all_errors = 0

    for f in os.listdir(board_dir):
        if not f.endswith(".xml"):
            continue
        if f == os.path.basename(board_xml) or "launch" in f:
            continue
+        nr_all_errors += validate_one(validation_pipeline, pipeline_obj, board_xml, os.path.join(board_dir, f))

-        nr_violations += validate_one(board_xml, os.path.join(board_dir, f))
+    return nr_all_errors

-    return nr_violations
-
-def validate_all(data_dir):
-    nr_violations = 0
+def validate_all(validation_pipeline, pipeline_obj, data_dir):
+    nr_all_errors = 0

    for f in os.listdir(data_dir):
        board_xml = os.path.join(data_dir, f, f"{f}.xml")
        if os.path.isfile(board_xml):
-            nr_violations += validate_board(board_xml)
+            nr_all_errors += validate_board(validation_pipeline, pipeline_obj, board_xml)
        else:
            logging.warning(f"Cannot find a board XML under {os.path.join(data_dir, f)}")

-    return nr_violations
+    return nr_all_errors
+
+def main(args):
+    """Validate scenario XMLs as selected by the parsed command-line arguments `args`, then exit.
+
+    With both `args.board` and `args.scenario`, that single pair is validated; with only `args.board`, validate_board()
+    is used; otherwise validate_all() runs on the "data" directory. The process exits with status 1 if the accumulated
+    count is greater than zero and 0 otherwise.
+    """
+    from xml_loader import XMLLoadStage
+    from lxml_loader import LXMLLoadStage
+
+    # Built once: loads both schemas and leaves "schema_etree" and "validator" in the pipeline object.
+    validator_construction_pipeline = PipelineEngine(["schema_path", "datachecks_path"])
+    validator_construction_pipeline.add_stages([
+        LXMLLoadStage("schema"),
+        LXMLLoadStage("datachecks"),
+        ValidatorConstructionStage(),
+    ])
+
+    # Run once per board/scenario pair on the same pipeline object.
+    validation_pipeline = PipelineEngine(["board_path", "scenario_path", "schema_etree", "validator"])
+    validation_pipeline.add_stages([
+        XMLLoadStage("board"),
+        XMLLoadStage("scenario"),
+        DefaultValuePopulatingStage(),
+        SyntacticValidationStage(),
+        SemanticValidationStage(),
+        ReportValidationResultStage(),
+    ])
+
+    obj = PipelineObject(schema_path = args.schema, datachecks_path = args.datachecks)
+    validator_construction_pipeline.run(obj)
+    if args.board and args.scenario:
+        nr_all_errors = validate_one(validation_pipeline, obj, args.board, args.scenario)
+    elif args.board:
+        nr_all_errors = validate_board(validation_pipeline, obj, args.board)
+    else:
+        # Derive the data directory here instead of reading the `config_tools_dir` global, which is only defined when
+        # this file is executed as a script; the resulting path string is the same.
+        data_dir = os.path.join(os.path.dirname(__file__), "..", "data")
+        nr_all_errors = validate_all(validation_pipeline, obj, data_dir)
+
+    sys.exit(1 if nr_all_errors > 0 else 0)

 if __name__ == "__main__":
+    config_tools_dir = os.path.join(os.path.dirname(__file__), "..")
+    schema_dir = os.path.join(config_tools_dir, "schema")
+
    parser = argparse.ArgumentParser()
    parser.add_argument("board", nargs="?", type=existing_file_type(parser), help="the board XML file to be validated")
    parser.add_argument("scenario", nargs="?", type=existing_file_type(parser), help="the scenario XML file to be validated")
    parser.add_argument("--loglevel", default="warning", type=log_level_type(parser), help="choose log level, e.g. debug, info, warning or error")
+    parser.add_argument("--schema", default=os.path.join(schema_dir, "config.xsd"), help="the XML schema that defines the syntax of scenario XMLs")
+    parser.add_argument("--datachecks", default=os.path.join(schema_dir, "datachecks.xsd"), help="the XML schema that defines the semantic rules against board and scenario data")
    args = parser.parse_args()

    logging.basicConfig(level=args.loglevel.upper())
-
-    if args.board and args.scenario:
-        nr_violations = validate_one(args.board, args.scenario)
-    elif args.board:
-        nr_violations = validate_board(args.board)
-    else:
-        nr_violations = validate_all(os.path.join(config_tools_dir, "data"))
-
-    sys.exit(1 if nr_violations > 0 else 0)
+    main(args)
--- a/misc/config_tools/scenario_config/xml_loader.py
+++ b/misc/config_tools/scenario_config/xml_loader.py
@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2022 Intel Corporation.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+from defusedxml.ElementTree import parse
+from pipeline import PipelineStage
+
+class XMLLoadStage(PipelineStage):
+    """Pipeline stage that parses the file at "<tag>_path" with defusedxml and provides "<tag>_etree"."""
+
+    def __init__(self, tag):
+        # Per-instance strings; the pipeline engine interprets a string as a unit set.
+        self.consumes, self.provides = f"{tag}_path", f"{tag}_etree"
+
+    def run(self, obj):
+        """Parse the file and store the resulting tree in obj."""
+        source_path = obj.get(self.consumes)
+        obj.set(self.provides, parse(source_path))