From 16542bf7d3301b456abbfc3c18ef9d6ceab03c90 Mon Sep 17 00:00:00 2001 From: FangYin Cheng Date: Fri, 13 Oct 2023 16:22:33 +0800 Subject: [PATCH] feat(core): Read and save system information to tracer --- .dockerignore | 3 + docker/base/Dockerfile | 12 +- pilot/model/cluster/embedding/loader.py | 2 + pilot/model/cluster/worker/default_worker.py | 2 + pilot/model/cluster/worker/manager.py | 9 +- pilot/server/dbgpt_server.py | 3 +- pilot/utils/system_utils.py | 266 +++++++++++++++++++ pilot/utils/tracer/tracer_cli.py | 21 +- 8 files changed, 307 insertions(+), 11 deletions(-) create mode 100644 pilot/utils/system_utils.py diff --git a/.dockerignore b/.dockerignore index e5b067a78..efded29b9 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,5 @@ models/ plugins/ +pilot/data +pilot/message +logs/ \ No newline at end of file diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 7c6bbf598..63486e260 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -3,7 +3,7 @@ ARG BASE_IMAGE="nvidia/cuda:11.8.0-runtime-ubuntu22.04" FROM ${BASE_IMAGE} ARG BASE_IMAGE -RUN apt-get update && apt-get install -y git python3 pip wget sqlite3 \ +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y git python3 pip wget sqlite3 tzdata \ && apt-get clean ARG BUILD_LOCAL_CODE="false" @@ -44,11 +44,6 @@ ARG BUILD_LOCAL_CODE="false" # COPY the rest of the app COPY . /app -# TODO:Need to find a better way to determine whether to build docker image with local code. -RUN (if [ "${BUILD_LOCAL_CODE}" = "true" ]; \ - then rm -rf /app/logs && rm -rf /app/pilot/data && rm -rf /app/pilot/message; \ - fi;) - ARG LOAD_EXAMPLES="true" RUN (if [ "${LOAD_EXAMPLES}" = "true" ]; \ @@ -57,6 +52,11 @@ RUN (if [ "${LOAD_EXAMPLES}" = "true" ]; \ && sqlite3 /app/pilot/data/default_sqlite.db < /app/docker/examples/sqls/test_case_info_sqlite.sql; \ fi;) +RUN (if [ "${LANGUAGE}" = "zh" ]; \ + then ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \ + && echo "Asia/Shanghai" > /etc/timezone; \ + fi;) + ENV PYTHONPATH "/app:$PYTHONPATH" EXPOSE 5000 diff --git a/pilot/model/cluster/embedding/loader.py b/pilot/model/cluster/embedding/loader.py index caf4bda9a..258e3ec2d 100644 --- a/pilot/model/cluster/embedding/loader.py +++ b/pilot/model/cluster/embedding/loader.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING from pilot.model.parameter import BaseEmbeddingModelParameters from pilot.utils.parameter_utils import _get_dict_from_obj from pilot.utils.tracer import root_tracer, SpanType, SpanTypeRunName +from pilot.utils.system_utils import get_system_info if TYPE_CHECKING: from langchain.embeddings.base import Embeddings @@ -21,6 +22,7 @@ class EmbeddingLoader: "model_name": model_name, "run_service": SpanTypeRunName.EMBEDDING_MODEL.value, "params": _get_dict_from_obj(param), + "sys_infos": _get_dict_from_obj(get_system_info()), } with root_tracer.start_span( "EmbeddingLoader.load", span_type=SpanType.RUN, metadata=metadata diff --git a/pilot/model/cluster/worker/default_worker.py b/pilot/model/cluster/worker/default_worker.py index 378fee2ea..04b47cbdb 100644 --- a/pilot/model/cluster/worker/default_worker.py +++ b/pilot/model/cluster/worker/default_worker.py @@ -11,6 +11,7 @@ from pilot.model.cluster.worker_base import ModelWorker from pilot.utils.model_utils import _clear_model_cache from pilot.utils.parameter_utils import EnvArgumentParser, _get_dict_from_obj from pilot.utils.tracer import root_tracer, SpanType, SpanTypeRunName +from pilot.utils.system_utils import get_system_info logger = logging.getLogger(__name__) @@ -102,6 +103,7 @@ class DefaultModelWorker(ModelWorker): "llm_adapter": str(self.llm_adapter), "run_service": SpanTypeRunName.MODEL_WORKER, "params": _get_dict_from_obj(model_params), + "sys_infos": _get_dict_from_obj(get_system_info()), } with root_tracer.start_span( "DefaultModelWorker.start", span_type=SpanType.RUN, metadata=metadata diff --git a/pilot/model/cluster/worker/manager.py b/pilot/model/cluster/worker/manager.py index 5648c8e01..a85ee0ed7 100644 --- a/pilot/model/cluster/worker/manager.py +++ b/pilot/model/cluster/worker/manager.py @@ -40,6 +40,7 @@ from pilot.utils.parameter_utils import ( ) from pilot.utils.utils import setup_logging from pilot.utils.tracer import initialize_tracer, root_tracer, SpanType, SpanTypeRunName +from pilot.utils.system_utils import get_system_info logger = logging.getLogger(__name__) @@ -838,6 +839,7 @@ def _start_local_worker( metadata={ "run_service": SpanTypeRunName.WORKER_MANAGER, "params": _get_dict_from_obj(worker_params), + "sys_infos": _get_dict_from_obj(get_system_info()), }, ): worker = _build_worker(worker_params) @@ -974,6 +976,7 @@ def run_worker_manager( os.path.join(LOGDIR, "dbgpt_model_worker_manager_tracer.jsonl"), root_operation_name="DB-GPT-WorkerManager-Entry", ) + _start_local_worker(worker_manager, worker_params) _start_local_embedding_worker( worker_manager, embedding_model_name, embedding_model_path @@ -985,11 +988,13 @@ def run_worker_manager( if not embedded_mod: import uvicorn - loop = asyncio.get_event_loop() - loop.run_until_complete(worker_manager.start()) uvicorn.run( app, host=worker_params.host, port=worker_params.port, log_level="info" ) + else: + # Embedded mod, start worker manager + loop = asyncio.get_event_loop() + loop.run_until_complete(worker_manager.start()) if __name__ == "__main__": diff --git a/pilot/server/dbgpt_server.py b/pilot/server/dbgpt_server.py index e58c61756..2b35eaf10 100644 --- a/pilot/server/dbgpt_server.py +++ b/pilot/server/dbgpt_server.py @@ -5,7 +5,6 @@ from typing import List ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(ROOT_PATH) -import signal from pilot.configs.config import Config from pilot.configs.model_config import LLM_MODEL_CONFIG, EMBEDDING_MODEL_CONFIG, LOGDIR from pilot.component import SystemApp @@ -40,6 +39,7 @@ from pilot.utils.utils import ( ) from pilot.utils.tracer import root_tracer, initialize_tracer, SpanType, SpanTypeRunName from pilot.utils.parameter_utils import _get_dict_from_obj +from pilot.utils.system_utils import get_system_info static_file_path = os.path.join(os.getcwd(), "server/static") @@ -190,6 +190,7 @@ def run_webserver(param: WebWerverParameters = None): metadata={ "run_service": SpanTypeRunName.WEBSERVER, "params": _get_dict_from_obj(param), + "sys_infos": _get_dict_from_obj(get_system_info()), }, ): param = initialize_app(param) diff --git a/pilot/utils/system_utils.py b/pilot/utils/system_utils.py new file mode 100644 index 000000000..2673edc0e --- /dev/null +++ b/pilot/utils/system_utils.py @@ -0,0 +1,266 @@ +from dataclasses import dataclass, asdict +from enum import Enum +from typing import Tuple, Dict +import os +import platform +import subprocess +import re +from functools import cache + + +@dataclass +class SystemInfo: + platform: str + distribution: str + python_version: str + cpu: str + cpu_avx: str + memory: str + torch_version: str + device: str + device_version: str + device_count: int + device_other: str + + def to_dict(self) -> Dict: + return asdict(self) + + +class AVXType(Enum): + BASIC = "basic" + AVX = "AVX" + AVX2 = "AVX2" + AVX512 = "AVX512" + + @staticmethod + def of_type(avx: str): + for item in AVXType: + if item._value_ == avx: + return item + return None + + +class OSType(str, Enum): + WINDOWS = "win" + LINUX = "linux" + DARWIN = "darwin" + OTHER = "other" + + +def get_cpu_avx_support() -> Tuple[OSType, AVXType, str]: + system = platform.system() + os_type = OSType.OTHER + cpu_avx = AVXType.BASIC + env_cpu_avx = AVXType.of_type(os.getenv("DBGPT_LLAMA_CPP_AVX")) + distribution = "Unknown Distribution" + if "windows" in system.lower(): + os_type = OSType.WINDOWS + output = "avx2" + distribution = "Windows " + platform.release() + print("Current platform is windows, use avx2 as default cpu architecture") + elif system == "Linux": + os_type = OSType.LINUX + result = subprocess.run( + ["lscpu"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + output = result.stdout.decode() + distribution = get_linux_distribution() + elif system == "Darwin": + os_type = OSType.DARWIN + result = subprocess.run( + ["sysctl", "-a"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + distribution = "Mac OS " + platform.mac_ver()[0] + output = result.stdout.decode() + else: + os_type = OSType.OTHER + print("Unsupported OS to get cpu avx, use default") + return os_type, env_cpu_avx if env_cpu_avx else cpu_avx, distribution + + if "avx512" in output.lower(): + cpu_avx = AVXType.AVX512 + elif "avx2" in output.lower(): + cpu_avx = AVXType.AVX2 + elif "avx " in output.lower(): + # cpu_avx = AVXType.AVX + pass + return os_type, env_cpu_avx if env_cpu_avx else cpu_avx, distribution + + +def get_device() -> str: + try: + import torch + + return ( + "cuda" + if torch.cuda.is_available() + else "mps" + if torch.backends.mps.is_available() + else "cpu" + ) + except ModuleNotFoundError: + return "cpu" + + +def get_device_info() -> Tuple[str, str, str, int, str]: + torch_version, device, device_version, device_count, device_other = ( + None, + "cpu", + None, + 0, + "", + ) + try: + import torch + + torch_version = torch.__version__ + if torch.cuda.is_available(): + device = "cuda" + device_version = torch.version.cuda + device_count = torch.cuda.device_count() + elif torch.backends.mps.is_available(): + device = "mps" + except ModuleNotFoundError: + pass + + if not device_version: + device_version = ( + get_cuda_version_from_nvcc() or get_cuda_version_from_nvidia_smi() + ) + if device == "cuda": + try: + output = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=name,driver_version,memory.total,memory.free,memory.used", + "--format=csv", + ] + ) + device_other = output.decode("utf-8") + except: + pass + return torch_version, device, device_version, device_count, device_other + + +def get_cuda_version_from_nvcc(): + try: + output = subprocess.check_output(["nvcc", "--version"]) + version_line = [ + line for line in output.decode("utf-8").split("\n") if "release" in line + ][0] + return version_line.split("release")[-1].strip().split(",")[0] + except: + return None + + +def get_cuda_version_from_nvidia_smi(): + try: + output = subprocess.check_output(["nvidia-smi"]).decode("utf-8") + match = re.search(r"CUDA Version:\s+(\d+\.\d+)", output) + if match: + return match.group(1) + else: + return None + except: + return None + + +def get_linux_distribution(): + """Get distribution of Linux""" + if os.path.isfile("/etc/os-release"): + with open("/etc/os-release", "r") as f: + info = {} + for line in f: + key, _, value = line.partition("=") + info[key] = value.strip().strip('"') + return f"{info.get('NAME', 'Unknown')} {info.get('VERSION_ID', '')}".strip() + return "Unknown Linux Distribution" + + +def get_cpu_info(): + # Getting platform + os_type, avx_type, distribution = get_cpu_avx_support() + + # Getting CPU information + cpu_info = "Unknown CPU" + if os_type == OSType.LINUX: + try: + output = subprocess.check_output(["lscpu"]).decode("utf-8") + match = re.search(r".*Model name:\s*(.+)", output) + if match: + cpu_info = match.group(1).strip() + match = re.search(f".*型号名称:\s*(.+)", output) + if match: + cpu_info = match.group(1).strip() + except: + pass + elif os_type == OSType.DARWIN: + try: + output = subprocess.check_output( + ["sysctl", "machdep.cpu.brand_string"] + ).decode("utf-8") + match = re.search(r"machdep.cpu.brand_string:\s*(.+)", output) + if match: + cpu_info = match.group(1).strip() + except: + pass + elif os_type == OSType.WINDOWS: + # TODO + raise NotImplementedError + + return os_type, avx_type, cpu_info, distribution + + +def get_memory_info(os_type: OSType) -> str: + memory = "Unknown Memory" + try: + import psutil + + memory = f"{psutil.virtual_memory().total // (1024 ** 3)} GB" + except ImportError: + pass + if os_type == OSType.LINUX: + try: + with open("/proc/meminfo", "r") as f: + mem_info = f.readlines() + for line in mem_info: + if "MemTotal" in line: + memory = line.split(":")[1].strip() + break + except: + pass + return memory + + +@cache +def get_system_info() -> SystemInfo: + """Get System information""" + + os_type, avx_type, cpu_info, distribution = get_cpu_info() + + # Getting Python version + python_version = platform.python_version() + + memory = get_memory_info(os_type) + + ( + torch_version, + device, + device_version, + device_count, + device_other, + ) = get_device_info() + + return SystemInfo( + platform=os_type._value_, + distribution=distribution, + python_version=python_version, + cpu=cpu_info, + cpu_avx=avx_type._value_, + memory=memory, + torch_version=torch_version, + device=device, + device_version=device_version, + device_count=device_count, + device_other=device_other, + ) diff --git a/pilot/utils/tracer/tracer_cli.py b/pilot/utils/tracer/tracer_cli.py index 822b039ee..7df18f516 100644 --- a/pilot/utils/tracer/tracer_cli.py +++ b/pilot/utils/tracer/tracer_cli.py @@ -259,6 +259,7 @@ def chat( found_trace_id = trace_id service_tables = {} + system_infos_table = {} out_kwargs = {"ensure_ascii": False} if output == "json" else {} for service_name, sp in service_spans.items(): metadata = sp["metadata"] @@ -266,6 +267,15 @@ def chat( for k, v in metadata["params"].items(): table.add_row([k, v]) service_tables[service_name] = table + sys_infos = metadata.get("sys_infos") + if sys_infos and isinstance(sys_infos, dict): + sys_table = PrettyTable( + ["System Config Key", "System Config Value"], + title=f"{service_name} System information", + ) + for k, v in sys_infos.items(): + sys_table.add_row([k, v]) + system_infos_table[service_name] = sys_table if not hide_run_params: merged_table1 = merge_tables_horizontally( @@ -276,16 +286,23 @@ def chat( ) merged_table2 = merge_tables_horizontally( [ - service_tables.get(SpanTypeRunName.MODEL_WORKER), - service_tables.get(SpanTypeRunName.WORKER_MANAGER), + service_tables.get(SpanTypeRunName.MODEL_WORKER.value), + service_tables.get(SpanTypeRunName.WORKER_MANAGER.value), ] ) + sys_table = system_infos_table.get(SpanTypeRunName.WORKER_MANAGER.value) + if system_infos_table: + for k, v in system_infos_table.items(): + sys_table = v + break if output == "text": print(merged_table1) print(merged_table2) else: for service_name, table in service_tables.items(): print(table.get_formatted_string(out_format=output, **out_kwargs)) + if sys_table: + print(sys_table.get_formatted_string(out_format=output, **out_kwargs)) if hide_conv: return