feat(core): Read and save system information to tracer

FangYin Cheng 2023-10-13 16:22:33 +08:00
parent a9241e1d75
commit 16542bf7d3
8 changed files with 307 additions and 11 deletions

View File

@@ -1,2 +1,5 @@
models/
plugins/
pilot/data
pilot/message
logs/

View File

@@ -3,7 +3,7 @@ ARG BASE_IMAGE="nvidia/cuda:11.8.0-runtime-ubuntu22.04"
FROM ${BASE_IMAGE}
ARG BASE_IMAGE
RUN apt-get update && apt-get install -y git python3 pip wget sqlite3 \
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y git python3 pip wget sqlite3 tzdata \
    && apt-get clean
ARG BUILD_LOCAL_CODE="false"
@@ -44,11 +44,6 @@ ARG BUILD_LOCAL_CODE="false"
# COPY the rest of the app
COPY . /app
# TODO: Need to find a better way to determine whether to build docker image with local code.
RUN (if [ "${BUILD_LOCAL_CODE}" = "true" ]; \
    then rm -rf /app/logs && rm -rf /app/pilot/data && rm -rf /app/pilot/message; \
    fi;)
ARG LOAD_EXAMPLES="true"
RUN (if [ "${LOAD_EXAMPLES}" = "true" ]; \
@@ -57,6 +52,11 @@ RUN (if [ "${LOAD_EXAMPLES}" = "true" ]; \
    && sqlite3 /app/pilot/data/default_sqlite.db < /app/docker/examples/sqls/test_case_info_sqlite.sql; \
    fi;)
RUN (if [ "${LANGUAGE}" = "zh" ]; \
    then ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo "Asia/Shanghai" > /etc/timezone; \
    fi;)
ENV PYTHONPATH "/app:$PYTHONPATH"
EXPOSE 5000
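
With tzdata installed noninteractively and the Asia/Shanghai symlink applied for zh builds, the container's wall clock follows Chinese local time. A quick in-container check (a sketch; the output depends on the build args):

import datetime, time

print(time.tzname)  # e.g. ('CST', 'CST') once /etc/localtime points at Asia/Shanghai
print(datetime.datetime.now().astimezone())  # aware datetime in the configured zone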

View File

@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING
from pilot.model.parameter import BaseEmbeddingModelParameters
from pilot.utils.parameter_utils import _get_dict_from_obj
from pilot.utils.tracer import root_tracer, SpanType, SpanTypeRunName
from pilot.utils.system_utils import get_system_info
if TYPE_CHECKING:
from langchain.embeddings.base import Embeddings
@@ -21,6 +22,7 @@ class EmbeddingLoader:
            "model_name": model_name,
            "run_service": SpanTypeRunName.EMBEDDING_MODEL.value,
            "params": _get_dict_from_obj(param),
            "sys_infos": _get_dict_from_obj(get_system_info()),
        }
        with root_tracer.start_span(
            "EmbeddingLoader.load", span_type=SpanType.RUN, metadata=metadata

View File

@@ -11,6 +11,7 @@ from pilot.model.cluster.worker_base import ModelWorker
from pilot.utils.model_utils import _clear_model_cache
from pilot.utils.parameter_utils import EnvArgumentParser, _get_dict_from_obj
from pilot.utils.tracer import root_tracer, SpanType, SpanTypeRunName
from pilot.utils.system_utils import get_system_info
logger = logging.getLogger(__name__)
@@ -102,6 +103,7 @@ class DefaultModelWorker(ModelWorker):
            "llm_adapter": str(self.llm_adapter),
            "run_service": SpanTypeRunName.MODEL_WORKER,
            "params": _get_dict_from_obj(model_params),
            "sys_infos": _get_dict_from_obj(get_system_info()),
        }
        with root_tracer.start_span(
            "DefaultModelWorker.start", span_type=SpanType.RUN, metadata=metadata

View File

@@ -40,6 +40,7 @@ from pilot.utils.parameter_utils import (
)
from pilot.utils.utils import setup_logging
from pilot.utils.tracer import initialize_tracer, root_tracer, SpanType, SpanTypeRunName
from pilot.utils.system_utils import get_system_info
logger = logging.getLogger(__name__)
@@ -838,6 +839,7 @@ def _start_local_worker(
        metadata={
            "run_service": SpanTypeRunName.WORKER_MANAGER,
            "params": _get_dict_from_obj(worker_params),
            "sys_infos": _get_dict_from_obj(get_system_info()),
        },
    ):
        worker = _build_worker(worker_params)
@@ -974,6 +976,7 @@ def run_worker_manager(
        os.path.join(LOGDIR, "dbgpt_model_worker_manager_tracer.jsonl"),
        root_operation_name="DB-GPT-WorkerManager-Entry",
    )
    _start_local_worker(worker_manager, worker_params)
    _start_local_embedding_worker(
        worker_manager, embedding_model_name, embedding_model_path
@@ -985,11 +988,13 @@
    if not embedded_mod:
        import uvicorn

        loop = asyncio.get_event_loop()
        loop.run_until_complete(worker_manager.start())
        uvicorn.run(
            app, host=worker_params.host, port=worker_params.port, log_level="info"
        )
    else:
        # Embedded mode: start the worker manager on the current loop
        loop = asyncio.get_event_loop()
        loop.run_until_complete(worker_manager.start())
if __name__ == "__main__":
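
Both branches now drive the async startup to completion on the event loop before control moves on: the standalone path then blocks inside uvicorn, while the embedded path simply returns with the workers running. The pattern in isolation (names below are illustrative stand-ins, not the DB-GPT API):

import asyncio

async def start_services():
    ...  # stands in for worker_manager.start()

loop = asyncio.get_event_loop()
loop.run_until_complete(start_services())
# standalone case: a blocking server such as uvicorn.run(app, ...) takes over here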

View File

@@ -5,7 +5,6 @@ from typing import List
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(ROOT_PATH)
import signal
from pilot.configs.config import Config
from pilot.configs.model_config import LLM_MODEL_CONFIG, EMBEDDING_MODEL_CONFIG, LOGDIR
from pilot.component import SystemApp
@@ -40,6 +39,7 @@ from pilot.utils.utils import (
)
from pilot.utils.tracer import root_tracer, initialize_tracer, SpanType, SpanTypeRunName
from pilot.utils.parameter_utils import _get_dict_from_obj
from pilot.utils.system_utils import get_system_info
static_file_path = os.path.join(os.getcwd(), "server/static")
@@ -190,6 +190,7 @@ def run_webserver(param: WebWerverParameters = None):
        metadata={
            "run_service": SpanTypeRunName.WEBSERVER,
            "params": _get_dict_from_obj(param),
            "sys_infos": _get_dict_from_obj(get_system_info()),
        },
    ):
        param = initialize_app(param)

pilot/utils/system_utils.py (new file, +266 lines)
View File

@@ -0,0 +1,266 @@
from dataclasses import dataclass, asdict
from enum import Enum
from typing import Tuple, Dict
import os
import platform
import subprocess
import re
from functools import cache

@dataclass
class SystemInfo:
    platform: str
    distribution: str
    python_version: str
    cpu: str
    cpu_avx: str
    memory: str
    torch_version: str
    device: str
    device_version: str
    device_count: int
    device_other: str

    def to_dict(self) -> Dict:
        return asdict(self)

class AVXType(Enum):
    BASIC = "basic"
    AVX = "AVX"
    AVX2 = "AVX2"
    AVX512 = "AVX512"

    @staticmethod
    def of_type(avx: str):
        for item in AVXType:
            if item.value == avx:
                return item
        return None
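
# Example (illustrative): AVXType.of_type("AVX2") -> AVXType.AVX2;
# an unrecognized value such as "sse" -> None.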

class OSType(str, Enum):
    WINDOWS = "win"
    LINUX = "linux"
    DARWIN = "darwin"
    OTHER = "other"

def get_cpu_avx_support() -> Tuple[OSType, AVXType, str]:
    system = platform.system()
    os_type = OSType.OTHER
    cpu_avx = AVXType.BASIC
    env_cpu_avx = AVXType.of_type(os.getenv("DBGPT_LLAMA_CPP_AVX"))
    distribution = "Unknown Distribution"
    if "windows" in system.lower():
        os_type = OSType.WINDOWS
        output = "avx2"
        distribution = "Windows " + platform.release()
        print("Current platform is Windows; assuming avx2 as the default CPU architecture")
    elif system == "Linux":
        os_type = OSType.LINUX
        result = subprocess.run(
            ["lscpu"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        output = result.stdout.decode()
        distribution = get_linux_distribution()
    elif system == "Darwin":
        os_type = OSType.DARWIN
        result = subprocess.run(
            ["sysctl", "-a"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        distribution = "Mac OS " + platform.mac_ver()[0]
        output = result.stdout.decode()
    else:
        os_type = OSType.OTHER
        print("Unsupported OS for CPU AVX detection; using the default")
        return os_type, env_cpu_avx if env_cpu_avx else cpu_avx, distribution

    if "avx512" in output.lower():
        cpu_avx = AVXType.AVX512
    elif "avx2" in output.lower():
        cpu_avx = AVXType.AVX2
    elif "avx " in output.lower():
        # cpu_avx = AVXType.AVX
        pass
    return os_type, env_cpu_avx if env_cpu_avx else cpu_avx, distribution
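
# Example (illustrative): on a recent x86_64 Ubuntu host this typically returns
# (OSType.LINUX, AVXType.AVX2, "Ubuntu 22.04"); setting DBGPT_LLAMA_CPP_AVX
# overrides the detected AVX level.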

def get_device() -> str:
    try:
        import torch

        return (
            "cuda"
            if torch.cuda.is_available()
            else "mps"
            if torch.backends.mps.is_available()
            else "cpu"
        )
    except ModuleNotFoundError:
        return "cpu"

def get_device_info() -> Tuple[str, str, str, int, str]:
    torch_version, device, device_version, device_count, device_other = (
        None,
        "cpu",
        None,
        0,
        "",
    )
    try:
        import torch

        torch_version = torch.__version__
        if torch.cuda.is_available():
            device = "cuda"
            device_version = torch.version.cuda
            device_count = torch.cuda.device_count()
        elif torch.backends.mps.is_available():
            device = "mps"
    except ModuleNotFoundError:
        pass
    if not device_version:
        device_version = (
            get_cuda_version_from_nvcc() or get_cuda_version_from_nvidia_smi()
        )
    if device == "cuda":
        try:
            output = subprocess.check_output(
                [
                    "nvidia-smi",
                    "--query-gpu=name,driver_version,memory.total,memory.free,memory.used",
                    "--format=csv",
                ]
            )
            device_other = output.decode("utf-8")
        except Exception:
            pass
    return torch_version, device, device_version, device_count, device_other

def get_cuda_version_from_nvcc():
    try:
        output = subprocess.check_output(["nvcc", "--version"])
        version_line = [
            line for line in output.decode("utf-8").split("\n") if "release" in line
        ][0]
        return version_line.split("release")[-1].strip().split(",")[0]
    except Exception:
        return None

def get_cuda_version_from_nvidia_smi():
    try:
        output = subprocess.check_output(["nvidia-smi"]).decode("utf-8")
        match = re.search(r"CUDA Version:\s+(\d+\.\d+)", output)
        if match:
            return match.group(1)
        else:
            return None
    except Exception:
        return None

def get_linux_distribution():
    """Get distribution of Linux"""
    if os.path.isfile("/etc/os-release"):
        with open("/etc/os-release", "r") as f:
            info = {}
            for line in f:
                key, _, value = line.partition("=")
                info[key] = value.strip().strip('"')
            return f"{info.get('NAME', 'Unknown')} {info.get('VERSION_ID', '')}".strip()
    return "Unknown Linux Distribution"

def get_cpu_info():
    # Getting platform
    os_type, avx_type, distribution = get_cpu_avx_support()

    # Getting CPU information
    cpu_info = "Unknown CPU"
    if os_type == OSType.LINUX:
        try:
            output = subprocess.check_output(["lscpu"]).decode("utf-8")
            match = re.search(r".*Model name:\s*(.+)", output)
            if match:
                cpu_info = match.group(1).strip()
            # Chinese-locale lscpu prints "型号名称" instead of "Model name"
            match = re.search(r".*型号名称:\s*(.+)", output)
            if match:
                cpu_info = match.group(1).strip()
        except Exception:
            pass
    elif os_type == OSType.DARWIN:
        try:
            output = subprocess.check_output(
                ["sysctl", "machdep.cpu.brand_string"]
            ).decode("utf-8")
            match = re.search(r"machdep.cpu.brand_string:\s*(.+)", output)
            if match:
                cpu_info = match.group(1).strip()
        except Exception:
            pass
    elif os_type == OSType.WINDOWS:
        # TODO: CPU model detection is not implemented for Windows yet
        raise NotImplementedError
    return os_type, avx_type, cpu_info, distribution

def get_memory_info(os_type: OSType) -> str:
    memory = "Unknown Memory"
    try:
        import psutil

        memory = f"{psutil.virtual_memory().total // (1024 ** 3)} GB"
    except ImportError:
        pass
    if os_type == OSType.LINUX:
        try:
            with open("/proc/meminfo", "r") as f:
                mem_info = f.readlines()
            for line in mem_info:
                if "MemTotal" in line:
                    memory = line.split(":")[1].strip()
                    break
        except Exception:
            pass
    return memory
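
# Example (illustrative): with psutil installed this yields e.g. "62 GB"; on
# Linux the /proc/meminfo pass then replaces it with e.g. "65705532 kB".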

@cache
def get_system_info() -> SystemInfo:
    """Get System information"""
    os_type, avx_type, cpu_info, distribution = get_cpu_info()

    # Getting Python version
    python_version = platform.python_version()
    memory = get_memory_info(os_type)
    (
        torch_version,
        device,
        device_version,
        device_count,
        device_other,
    ) = get_device_info()
    return SystemInfo(
        platform=os_type.value,
        distribution=distribution,
        python_version=python_version,
        cpu=cpu_info,
        cpu_avx=avx_type.value,
        memory=memory,
        torch_version=torch_version,
        device=device,
        device_version=device_version,
        device_count=device_count,
        device_other=device_other,
    )
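
The module funnels everything through a single cached entry point, so callers can query it freely. A minimal usage sketch (assumes DB-GPT is on the Python path; printed values are illustrative):

from pilot.utils.system_utils import get_system_info

info = get_system_info()  # cached via functools.cache, so repeated calls are cheap
print(info.platform, info.cpu_avx, info.device)  # e.g. "linux AVX2 cuda"
print(info.to_dict())  # plain dict, as attached to tracer spans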

View File

@@ -259,6 +259,7 @@ def chat(
        found_trace_id = trace_id

    service_tables = {}
    system_infos_table = {}
    out_kwargs = {"ensure_ascii": False} if output == "json" else {}
    for service_name, sp in service_spans.items():
        metadata = sp["metadata"]
@@ -266,6 +267,15 @@
        for k, v in metadata["params"].items():
            table.add_row([k, v])
        service_tables[service_name] = table
        sys_infos = metadata.get("sys_infos")
        if sys_infos and isinstance(sys_infos, dict):
            sys_table = PrettyTable(
                ["System Config Key", "System Config Value"],
                title=f"{service_name} System information",
            )
            for k, v in sys_infos.items():
                sys_table.add_row([k, v])
            system_infos_table[service_name] = sys_table

    if not hide_run_params:
        merged_table1 = merge_tables_horizontally(
@@ -276,16 +286,23 @@
        )
        merged_table2 = merge_tables_horizontally(
            [
                service_tables.get(SpanTypeRunName.MODEL_WORKER),
                service_tables.get(SpanTypeRunName.WORKER_MANAGER),
                service_tables.get(SpanTypeRunName.MODEL_WORKER.value),
                service_tables.get(SpanTypeRunName.WORKER_MANAGER.value),
            ]
        )
        sys_table = system_infos_table.get(SpanTypeRunName.WORKER_MANAGER.value)
        if system_infos_table:
            for k, v in system_infos_table.items():
                sys_table = v
                break
        if output == "text":
            print(merged_table1)
            print(merged_table2)
        else:
            for service_name, table in service_tables.items():
                print(table.get_formatted_string(out_format=output, **out_kwargs))
        if sys_table:
            print(sys_table.get_formatted_string(out_format=output, **out_kwargs))

    if hide_conv:
        return
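
The sys_infos table reuses the PrettyTable machinery the CLI already uses for params. The rendering step in isolation (PrettyTable is the CLI's existing dependency; the row data below is illustrative):

from prettytable import PrettyTable

sys_table = PrettyTable(
    ["System Config Key", "System Config Value"],
    title="worker_manager System information",
)
for k, v in {"platform": "linux", "device": "cuda", "device_count": 1}.items():
    sys_table.add_row([k, v])
print(sys_table.get_formatted_string(out_format="json", ensure_ascii=False))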