config_tools: composing operations around XMLs as pipelines

There is an increasing demand for composing different operations around XML
schemas and/or data in different ways for different purposes. Today we
already have:

  - Validating XML data, which takes XML schemas and data (board and
    scenario) as inputs.
  - Filling in missing nodes in XML data with default values, which takes
    an XML schema and data (scenario only) as inputs.

In the near future we'll extend the operations around XMLs by introducing
XML schema preprocessing and XML data upgrading, adding more possibilities
for constructing larger operations by composing smaller ones.

To minimize code repetition and ease composition, this patch introduces an
infrastructure that abstracts each operation as a pipeline stage. Each
stage defines its own inputs and outputs, and stages can be composed
sequentially into a single, larger operation.
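
As an illustration of the stage abstraction (not part of the patch), here is a
minimal sketch built on the PipelineObject, PipelineStage and PipelineEngine
classes introduced in pipeline.py below; UpperCaseStage and the "text" /
"upper_text" tags are invented for the example.

    from pipeline import PipelineObject, PipelineStage, PipelineEngine

    class UpperCaseStage(PipelineStage):
        uses = {"text"}             # read, but left available to later stages
        provides = {"upper_text"}   # made available to later stages

        def run(self, obj):
            obj.set("upper_text", obj.get("text").upper())

    pipeline = PipelineEngine(["text"])        # data the caller must provide up front
    pipeline.add_stages([UpperCaseStage()])    # inputs are checked as stages are added

    obj = PipelineObject(text = "hello")
    pipeline.run(obj)
    print(obj.get("upper_text"))               # prints "HELLO"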

The existing operations listed above, along with XML file loaders, are then
refactored to provide pipeline stages. The main methods are also refined to
complete their tasks by constructing and invoking pipelines.

Tracked-On: #6690
Signed-off-by: Junjie Mao <junjie.mao@intel.com>
Junjie Mao 2022-02-23 22:05:51 +08:00 committed by acrnsi-robot
parent 4fb6ad247a
commit 0a7910c7f0
5 changed files with 273 additions and 74 deletions

File: default_populator.py

@@ -5,10 +5,13 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #

+import os
 import argparse
-import lxml.etree as etree

 from scenario_transformer import ScenarioTransformer
+from pipeline import PipelineObject, PipelineStage, PipelineEngine

 class DefaultValuePopulator(ScenarioTransformer):
     def add_missing_nodes(self, xsd_element_node, xml_parent_node, new_node_index):
         element_name = xsd_element_node.get("name")
@@ -20,7 +23,7 @@ class DefaultValuePopulator(ScenarioTransformer):
         if self.complex_type_of_element(xsd_element_node) is None and default_value is None:
             return []

-        new_node = etree.Element(element_name)
+        new_node = xml_parent_node.makeelement(element_name, {})
         new_node.text = default_value

         if new_node_index is not None:
@@ -30,15 +33,30 @@ class DefaultValuePopulator(ScenarioTransformer):

         return [new_node]

+class DefaultValuePopulatingStage(PipelineStage):
+    uses = {"schema_etree", "scenario_etree"}
+    provides = {"scenario_etree"}
+
+    def run(self, obj):
+        populator = DefaultValuePopulator(obj.get("schema_etree"))
+        etree = obj.get("scenario_etree")
+        populator.transform(etree)
+        obj.set("scenario_etree", etree)
+
 def main(xsd_file, xml_file, out_file):
-    xsd_etree = etree.parse(xsd_file)
-    xsd_etree.xinclude()
-    populator = DefaultValuePopulator(xsd_etree)
-    xml_etree = etree.parse(xml_file, etree.XMLParser(remove_blank_text=True))
-    populator.transform(xml_etree)
-    xml_etree.write(out_file, pretty_print=True)
+    from xml_loader import XMLLoadStage
+    from lxml_loader import LXMLLoadStage
+
+    pipeline = PipelineEngine(["schema_path", "scenario_path"])
+    pipeline.add_stages([
+        LXMLLoadStage("schema"),
+        XMLLoadStage("scenario"),
+        DefaultValuePopulatingStage(),
+    ])
+
+    obj = PipelineObject(schema_path = xsd_file, scenario_path = xml_file)
+    pipeline.run(obj)
+    obj.get("scenario_etree").write(out_file)

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Populate a given scenario XML with default values of nonexistent nodes")

File: lxml_loader.py (new file)

@@ -0,0 +1,20 @@
#!/usr/bin/env python3
#
# Copyright (C) 2022 Intel Corporation.
#
# SPDX-License-Identifier: BSD-3-Clause
#

from lxml.etree import parse, XMLParser
from pipeline import PipelineStage

class LXMLLoadStage(PipelineStage):
    def __init__(self, tag):
        self.consumes = f"{tag}_path"
        self.provides = f"{tag}_etree"

    def run(self, obj):
        xml_path = obj.get(self.consumes)
        etree = parse(xml_path, XMLParser(remove_blank_text=True))
        etree.xinclude()
        obj.set(self.provides, etree)
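
A small usage sketch, not part of the patch: the loader stage can also be driven
directly, outside an engine. The path "config.xsd" is illustrative.

    from pipeline import PipelineObject
    from lxml_loader import LXMLLoadStage

    obj = PipelineObject(schema_path = "config.xsd")
    stage = LXMLLoadStage("schema")        # consumes "schema_path", provides "schema_etree"
    stage.run(obj)
    schema_etree = obj.get("schema_etree") # lxml ElementTree with XIncludes resolved

The patch adds two loader stages: this lxml-based one, used for schemas because it
resolves XIncludes and strips ignorable whitespace, and a defusedxml-based one
(xml_loader.py below), presumably preferred for user-supplied board and scenario files.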

File: pipeline.py (new file)

@@ -0,0 +1,72 @@
#!/usr/bin/env python3
#
# Copyright (C) 2022 Intel Corporation.
#
# SPDX-License-Identifier: BSD-3-Clause
#

class PipelineObject:
    def __init__(self, **kwargs):
        self.data = {}
        for k,v in kwargs.items():
            self.set(k, v)

    def set(self, tag, data):
        self.data[tag] = data

    def get(self, tag):
        return self.data[tag]

    def has(self, tag):
        return tag in self.data.keys()

    def consume(self, tag):
        return self.data.pop(tag, None)

    def dump(self):
        print(self.data)

class PipelineStage:
    # The following three class variables define the inputs and outputs of the stage. Each of them can be either a set
    # or a string (which is interpreted as a unit set).
    consumes = set()  # Data consumed by this stage. Consumed data will be unavailable to later stages.
    uses = set()      # Data used but not consumed by this stage.
    provides = set()  # Data provided by this stage.

    def run(self, obj):
        raise NotImplementedError

class PipelineEngine:
    def __init__(self, initial_data = []):
        self.stages = []
        self.initial_data = set(initial_data)
        self.available_data = set(initial_data)

    def add_stage(self, stage):
        consumes = stage.consumes if isinstance(stage.consumes, set) else {stage.consumes}
        uses = stage.uses if isinstance(stage.uses, set) else {stage.uses}
        provides = stage.provides if isinstance(stage.provides, set) else {stage.provides}

        all_uses = consumes.union(uses)
        if not all_uses.issubset(self.available_data):
            raise Exception(f"Data {all_uses - self.available_data} needed by stage {stage.__class__.__name__} but not provided by the pipeline")

        self.stages.append(stage)
        self.available_data = self.available_data.difference(consumes).union(provides)

    def add_stages(self, stages):
        for stage in stages:
            self.add_stage(stage)

    def run(self, obj):
        for tag in self.initial_data:
            if not obj.has(tag):
                raise AttributeError(f"Data {tag} is needed by the pipeline but not provided by the object")

        for stage in self.stages:
            stage.run(obj)
            consumes = stage.consumes if isinstance(stage.consumes, set) else {stage.consumes}
            for tag in consumes:
                obj.consume(tag)
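
A quick sketch, not part of the patch, of the dependency check performed by
PipelineEngine.add_stage; NeedsFooStage and the "foo" / "bar" / "baz" tags are
invented for the example.

    from pipeline import PipelineStage, PipelineEngine

    class NeedsFooStage(PipelineStage):
        uses = {"foo"}
        provides = {"bar"}

        def run(self, obj):
            obj.set("bar", obj.get("foo"))

    engine = PipelineEngine(["baz"])     # the pipeline only starts with "baz"
    try:
        engine.add_stage(NeedsFooStage())
    except Exception as e:
        print(e)   # "foo" is needed by NeedsFooStage but not provided by the pipeline

Because available_data is updated as stages are added, a mis-ordered pipeline is
rejected at construction time rather than failing halfway through a run.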

File: the scenario validator script

@@ -7,8 +7,9 @@

 import sys, os
 import argparse
-import lxml.etree as etree
 import logging
+from copy import copy
+from collections import namedtuple

 try:
     import xmlschema
@@ -18,7 +19,8 @@ except ImportError:
                     "To enable the validation, install the python package by executing: pip3 install xmlschema.")
     sys.exit(0)

-from default_populator import DefaultValuePopulator
+from pipeline import PipelineObject, PipelineStage, PipelineEngine
+from default_populator import DefaultValuePopulatingStage

 def existing_file_type(parser):
     def aux(arg):
@@ -39,106 +41,174 @@ def log_level_type(parser):
            parser.error(f"{arg} is not a valid log level")
     return aux

-def load_schema(xsd_xml, datachecks_xml):
-    global schema, schema_etree, datachecks
-
-    schema_etree = etree.parse(xsd_xml)
-    schema_etree.xinclude()
-    schema = xmlschema.XMLSchema11(etree.tostring(schema_etree, encoding="unicode"))
-
-    datachecks_etree = etree.parse(datachecks_xml)
-    datachecks_etree.xinclude()
-    datachecks = xmlschema.XMLSchema11(etree.tostring(datachecks_etree, encoding="unicode"))
-
-config_tools_dir = os.path.join(os.path.dirname(__file__), "..")
-schema_dir = os.path.join(config_tools_dir, "schema")
-schema = None
-schema_etree = None
-datachecks = None
-load_schema(os.path.join(schema_dir, "config.xsd"), os.path.join(schema_dir, "datachecks.xsd"))
-
-def validate_one(board_xml, scenario_xml):
-    nr_schema_errors = 0
-    nr_check_errors = 0
-    nr_check_warnings = 0
-    board_name = os.path.basename(board_xml)
-    scenario_name = os.path.basename(scenario_xml)
-
-    scenario_etree = etree.parse(scenario_xml, etree.XMLParser(remove_blank_text=True))
-    DefaultValuePopulator(schema_etree).transform(scenario_etree)
-    it = schema.iter_errors(scenario_etree)
-    for error in it:
-        logging.debug(error)
-        nr_schema_errors += 1
-
-    if nr_schema_errors == 0:
-        main_etree = etree.parse(board_xml)
-        main_etree.getroot().extend(scenario_etree.getroot()[:])
-        it = datachecks.iter_errors(main_etree)
-        for error in it:
-            logging.debug(error)
-            anno = error.validator.annotation
-            severity = anno.elem.get("{https://projectacrn.org}severity")
-            if severity == "error":
-                nr_check_errors += 1
-            elif severity == "warning":
-                nr_check_warnings += 1
-        if nr_check_errors > 0:
-            logging.error(f"Board {board_name} and scenario {scenario_name} have inconsistent data: {nr_check_errors} errors, {nr_check_warnings} warnings.")
-        elif nr_check_warnings > 0:
-            logging.warning(f"Board {board_name} and scenario {scenario_name} have inconsistent data: {nr_check_warnings} warnings.")
-        else:
-            logging.info(f"Board {board_name} and scenario {scenario_name} are valid and consistent.")
-    else:
-        logging.warning(f"Scenario {scenario_name} is invalid: {nr_schema_errors} schema errors.")
-
-    return nr_schema_errors + nr_check_errors + nr_check_warnings
+class ValidationError(dict):
+    def __init__(self, paths, message, severity):
+        super().__init__(paths = paths, message = message, severity = severity)
+
+    def __str__(self):
+        return f"{', '.join(self['paths'])}: {self['message']}"
+
+class ScenarioValidator:
+    def __init__(self, schema_etree, datachecks_etree):
+        """Initialize the validator with preprocessed schemas in ElementTree."""
+        self.schema = xmlschema.XMLSchema11(schema_etree)
+        self.datachecks = xmlschema.XMLSchema11(datachecks_etree)
+
+    def check_syntax(self, scenario_etree):
+        errors = []
+
+        it = self.schema.iter_errors(scenario_etree)
+        for error in it:
+            # Syntactic errors are always critical.
+            e = ValidationError([error.path], error.reason, "critical")
+            logging.debug(e)
+            errors.append(e)
+
+        return errors
+
+    def check_semantics(self, board_etree, scenario_etree):
+        errors = []
+
+        unified_node = copy(scenario_etree.getroot())
+        unified_node.extend(board_etree.getroot())
+        it = self.datachecks.iter_errors(unified_node)
+        for error in it:
+            logging.debug(f"{error.elem}: {error.message}")
+            anno = error.validator.annotation
+            severity = anno.elem.get("{https://projectacrn.org}severity")
+            errors.append(ValidationError([error.elem.tag], error.message, severity))
+
+        return errors
+
+class ValidatorConstructionStage(PipelineStage):
+    # The schema etree may still be useful for schema-based transformation. Do not consume it.
+    uses = {"schema_etree"}
+    consumes = {"datachecks_etree"}
+    provides = {"validator"}
+
+    def run(self, obj):
+        validator = ScenarioValidator(obj.get("schema_etree"), obj.get("datachecks_etree"))
+        obj.set("validator", validator)
+
+class ValidatorConstructionByFileStage(PipelineStage):
+    uses = {"schema_path", "datachecks_path"}
+    provides = {"validator"}
+
+    def run(self, obj):
+        validator = ScenarioValidator(obj.get("schema_path"), obj.get("datachecks_path"))
+        obj.set("validator", validator)
+
+class SyntacticValidationStage(PipelineStage):
+    uses = {"validator", "scenario_etree"}
+    provides = {"syntactic_errors"}
+
+    def run(self, obj):
+        errors = obj.get("validator").check_syntax(obj.get("scenario_etree"))
+        obj.set("syntactic_errors", errors)
+
+class SemanticValidationStage(PipelineStage):
+    uses = {"validator", "board_etree", "scenario_etree"}
+    provides = {"semantic_errors"}
+
+    def run(self, obj):
+        errors = obj.get("validator").check_semantics(obj.get("board_etree"), obj.get("scenario_etree"))
+        obj.set("semantic_errors", errors)
+
+class ReportValidationResultStage(PipelineStage):
+    consumes = {"board_etree", "scenario_etree", "syntactic_errors", "semantic_errors"}
+    provides = {"nr_all_errors"}
+
+    def run(self, obj):
+        board_name = obj.get("board_etree").getroot().get("board")
+        scenario_name = obj.get("scenario_etree").getroot().get("scenario")
+
+        nr_critical = len(obj.get("syntactic_errors"))
+        nr_error = len(list(filter(lambda e: e["severity"] == "error", obj.get("semantic_errors"))))
+        nr_warning = len(list(filter(lambda e: e["severity"] == "warning", obj.get("semantic_errors"))))
+
+        if nr_critical > 0 or nr_error > 0:
+            logging.error(f"Board {board_name} and scenario {scenario_name} are inconsistent: {nr_critical} syntax errors, {nr_error} data errors, {nr_warning} warnings.")
+        elif nr_warning > 0:
+            logging.warning(f"Board {board_name} and scenario {scenario_name} are potentially inconsistent: {nr_warning} warnings.")
+        else:
+            logging.info(f"Board {board_name} and scenario {scenario_name} are valid and consistent.")
+
+        obj.set("nr_all_errors", nr_critical + nr_error + nr_warning)
+
+def validate_one(validation_pipeline, pipeline_obj, board_xml, scenario_xml):
+    pipeline_obj.set("board_path", board_xml)
+    pipeline_obj.set("scenario_path", scenario_xml)
+    validation_pipeline.run(pipeline_obj)
+    return pipeline_obj.consume("nr_all_errors")

-def validate_board(board_xml):
+def validate_board(validation_pipeline, pipeline_obj, board_xml):
     board_dir = os.path.dirname(board_xml)
-    nr_violations = 0
+    nr_all_errors = 0
     for f in os.listdir(board_dir):
         if not f.endswith(".xml"):
             continue
         if f == os.path.basename(board_xml) or "launch" in f:
             continue
-        nr_violations += validate_one(board_xml, os.path.join(board_dir, f))
-    return nr_violations
+        nr_all_errors += validate_one(validation_pipeline, pipeline_obj, board_xml, os.path.join(board_dir, f))
+    return nr_all_errors

-def validate_all(data_dir):
-    nr_violations = 0
+def validate_all(validation_pipeline, pipeline_obj, data_dir):
+    nr_all_errors = 0
     for f in os.listdir(data_dir):
         board_xml = os.path.join(data_dir, f, f"{f}.xml")
         if os.path.isfile(board_xml):
-            nr_violations += validate_board(board_xml)
+            nr_all_errors += validate_board(validation_pipeline, pipeline_obj, board_xml)
         else:
             logging.warning(f"Cannot find a board XML under {os.path.join(data_dir, f)}")
-    return nr_violations
+    return nr_all_errors
+
+def main(args):
+    from xml_loader import XMLLoadStage
+    from lxml_loader import LXMLLoadStage
+
+    validator_construction_pipeline = PipelineEngine(["schema_path", "datachecks_path"])
+    validator_construction_pipeline.add_stages([
+        LXMLLoadStage("schema"),
+        LXMLLoadStage("datachecks"),
+        ValidatorConstructionStage(),
+    ])
+
+    validation_pipeline = PipelineEngine(["board_path", "scenario_path", "schema_etree", "validator"])
+    validation_pipeline.add_stages([
+        XMLLoadStage("board"),
+        XMLLoadStage("scenario"),
+        DefaultValuePopulatingStage(),
+        SyntacticValidationStage(),
+        SemanticValidationStage(),
+        ReportValidationResultStage(),
+    ])
+
+    obj = PipelineObject(schema_path = args.schema, datachecks_path = args.datachecks)
+    validator_construction_pipeline.run(obj)
+
+    if args.board and args.scenario:
+        nr_all_errors = validate_one(validation_pipeline, obj, args.board, args.scenario)
+    elif args.board:
+        nr_all_errors = validate_board(validation_pipeline, obj, args.board)
+    else:
+        nr_all_errors = validate_all(validation_pipeline, obj, os.path.join(config_tools_dir, "data"))
+
+    sys.exit(1 if nr_all_errors > 0 else 0)

 if __name__ == "__main__":
+    config_tools_dir = os.path.join(os.path.dirname(__file__), "..")
+    schema_dir = os.path.join(config_tools_dir, "schema")
+
     parser = argparse.ArgumentParser()
     parser.add_argument("board", nargs="?", type=existing_file_type(parser), help="the board XML file to be validated")
     parser.add_argument("scenario", nargs="?", type=existing_file_type(parser), help="the scenario XML file to be validated")
     parser.add_argument("--loglevel", default="warning", type=log_level_type(parser), help="choose log level, e.g. debug, info, warning or error")
+    parser.add_argument("--schema", default=os.path.join(schema_dir, "config.xsd"), help="the XML schema that defines the syntax of scenario XMLs")
+    parser.add_argument("--datachecks", default=os.path.join(schema_dir, "datachecks.xsd"), help="the XML schema that defines the semantic rules against board and scenario data")
     args = parser.parse_args()
     logging.basicConfig(level=args.loglevel.upper())

-    if args.board and args.scenario:
-        nr_violations = validate_one(args.board, args.scenario)
-    elif args.board:
-        nr_violations = validate_board(args.board)
-    else:
-        nr_violations = validate_all(os.path.join(config_tools_dir, "data"))
-    sys.exit(1 if nr_violations > 0 else 0)
+    main(args)
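
ValidatorConstructionByFileStage is defined above but not wired into main(); a
sketch, not part of the patch, of how it could be used, assuming xmlschema
accepts schema file paths directly (the paths are illustrative):

    from pipeline import PipelineObject, PipelineEngine

    pipeline = PipelineEngine(["schema_path", "datachecks_path"])
    pipeline.add_stages([ValidatorConstructionByFileStage()])

    obj = PipelineObject(schema_path = "schema/config.xsd", datachecks_path = "schema/datachecks.xsd")
    pipeline.run(obj)
    validator = obj.get("validator")

Note that this shortcut skips the XInclude resolution that LXMLLoadStage
performs, which is presumably why main() builds the validator from
preprocessed etrees instead.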

File: xml_loader.py (new file)

@@ -0,0 +1,19 @@
#!/usr/bin/env python3
#
# Copyright (C) 2022 Intel Corporation.
#
# SPDX-License-Identifier: BSD-3-Clause
#

from defusedxml.ElementTree import parse
from pipeline import PipelineStage

class XMLLoadStage(PipelineStage):
    def __init__(self, tag):
        self.consumes = f"{tag}_path"
        self.provides = f"{tag}_etree"

    def run(self, obj):
        xml_path = obj.get(self.consumes)
        etree = parse(xml_path)
        obj.set(self.provides, etree)
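
A final sketch, not part of the patch, showing the consume semantics when this
loader runs inside an engine; "scenario.xml" is an illustrative path.

    from pipeline import PipelineObject, PipelineEngine
    from xml_loader import XMLLoadStage

    engine = PipelineEngine(["scenario_path"])
    engine.add_stages([XMLLoadStage("scenario")])

    obj = PipelineObject(scenario_path = "scenario.xml")
    engine.run(obj)
    print(obj.has("scenario_path"))    # False: the path was consumed by the loader
    print(obj.has("scenario_etree"))   # True: the parsed tree is now available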