mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-24 20:47:46 +00:00
feat(core): Read and save system information to tracer
This commit is contained in:
parent
a9241e1d75
commit
16542bf7d3
@ -1,2 +1,5 @@
|
||||
models/
|
||||
plugins/
|
||||
pilot/data
|
||||
pilot/message
|
||||
logs/
|
@ -3,7 +3,7 @@ ARG BASE_IMAGE="nvidia/cuda:11.8.0-runtime-ubuntu22.04"
|
||||
FROM ${BASE_IMAGE}
|
||||
ARG BASE_IMAGE
|
||||
|
||||
RUN apt-get update && apt-get install -y git python3 pip wget sqlite3 \
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y git python3 pip wget sqlite3 tzdata \
|
||||
&& apt-get clean
|
||||
|
||||
ARG BUILD_LOCAL_CODE="false"
|
||||
@ -44,11 +44,6 @@ ARG BUILD_LOCAL_CODE="false"
|
||||
# COPY the rest of the app
|
||||
COPY . /app
|
||||
|
||||
# TODO:Need to find a better way to determine whether to build docker image with local code.
|
||||
RUN (if [ "${BUILD_LOCAL_CODE}" = "true" ]; \
|
||||
then rm -rf /app/logs && rm -rf /app/pilot/data && rm -rf /app/pilot/message; \
|
||||
fi;)
|
||||
|
||||
ARG LOAD_EXAMPLES="true"
|
||||
|
||||
RUN (if [ "${LOAD_EXAMPLES}" = "true" ]; \
|
||||
@ -57,6 +52,11 @@ RUN (if [ "${LOAD_EXAMPLES}" = "true" ]; \
|
||||
&& sqlite3 /app/pilot/data/default_sqlite.db < /app/docker/examples/sqls/test_case_info_sqlite.sql; \
|
||||
fi;)
|
||||
|
||||
RUN (if [ "${LANGUAGE}" = "zh" ]; \
|
||||
then ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
|
||||
&& echo "Asia/Shanghai" > /etc/timezone; \
|
||||
fi;)
|
||||
|
||||
ENV PYTHONPATH "/app:$PYTHONPATH"
|
||||
EXPOSE 5000
|
||||
|
||||
|
@ -5,6 +5,7 @@ from typing import TYPE_CHECKING
|
||||
from pilot.model.parameter import BaseEmbeddingModelParameters
|
||||
from pilot.utils.parameter_utils import _get_dict_from_obj
|
||||
from pilot.utils.tracer import root_tracer, SpanType, SpanTypeRunName
|
||||
from pilot.utils.system_utils import get_system_info
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain.embeddings.base import Embeddings
|
||||
@ -21,6 +22,7 @@ class EmbeddingLoader:
|
||||
"model_name": model_name,
|
||||
"run_service": SpanTypeRunName.EMBEDDING_MODEL.value,
|
||||
"params": _get_dict_from_obj(param),
|
||||
"sys_infos": _get_dict_from_obj(get_system_info()),
|
||||
}
|
||||
with root_tracer.start_span(
|
||||
"EmbeddingLoader.load", span_type=SpanType.RUN, metadata=metadata
|
||||
|
@ -11,6 +11,7 @@ from pilot.model.cluster.worker_base import ModelWorker
|
||||
from pilot.utils.model_utils import _clear_model_cache
|
||||
from pilot.utils.parameter_utils import EnvArgumentParser, _get_dict_from_obj
|
||||
from pilot.utils.tracer import root_tracer, SpanType, SpanTypeRunName
|
||||
from pilot.utils.system_utils import get_system_info
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -102,6 +103,7 @@ class DefaultModelWorker(ModelWorker):
|
||||
"llm_adapter": str(self.llm_adapter),
|
||||
"run_service": SpanTypeRunName.MODEL_WORKER,
|
||||
"params": _get_dict_from_obj(model_params),
|
||||
"sys_infos": _get_dict_from_obj(get_system_info()),
|
||||
}
|
||||
with root_tracer.start_span(
|
||||
"DefaultModelWorker.start", span_type=SpanType.RUN, metadata=metadata
|
||||
|
@ -40,6 +40,7 @@ from pilot.utils.parameter_utils import (
|
||||
)
|
||||
from pilot.utils.utils import setup_logging
|
||||
from pilot.utils.tracer import initialize_tracer, root_tracer, SpanType, SpanTypeRunName
|
||||
from pilot.utils.system_utils import get_system_info
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -838,6 +839,7 @@ def _start_local_worker(
|
||||
metadata={
|
||||
"run_service": SpanTypeRunName.WORKER_MANAGER,
|
||||
"params": _get_dict_from_obj(worker_params),
|
||||
"sys_infos": _get_dict_from_obj(get_system_info()),
|
||||
},
|
||||
):
|
||||
worker = _build_worker(worker_params)
|
||||
@ -974,6 +976,7 @@ def run_worker_manager(
|
||||
os.path.join(LOGDIR, "dbgpt_model_worker_manager_tracer.jsonl"),
|
||||
root_operation_name="DB-GPT-WorkerManager-Entry",
|
||||
)
|
||||
|
||||
_start_local_worker(worker_manager, worker_params)
|
||||
_start_local_embedding_worker(
|
||||
worker_manager, embedding_model_name, embedding_model_path
|
||||
@ -985,11 +988,13 @@ def run_worker_manager(
|
||||
if not embedded_mod:
|
||||
import uvicorn
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(worker_manager.start())
|
||||
uvicorn.run(
|
||||
app, host=worker_params.host, port=worker_params.port, log_level="info"
|
||||
)
|
||||
else:
|
||||
# Embedded mod, start worker manager
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(worker_manager.start())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -5,7 +5,6 @@ from typing import List
|
||||
|
||||
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
sys.path.append(ROOT_PATH)
|
||||
import signal
|
||||
from pilot.configs.config import Config
|
||||
from pilot.configs.model_config import LLM_MODEL_CONFIG, EMBEDDING_MODEL_CONFIG, LOGDIR
|
||||
from pilot.component import SystemApp
|
||||
@ -40,6 +39,7 @@ from pilot.utils.utils import (
|
||||
)
|
||||
from pilot.utils.tracer import root_tracer, initialize_tracer, SpanType, SpanTypeRunName
|
||||
from pilot.utils.parameter_utils import _get_dict_from_obj
|
||||
from pilot.utils.system_utils import get_system_info
|
||||
|
||||
static_file_path = os.path.join(os.getcwd(), "server/static")
|
||||
|
||||
@ -190,6 +190,7 @@ def run_webserver(param: WebWerverParameters = None):
|
||||
metadata={
|
||||
"run_service": SpanTypeRunName.WEBSERVER,
|
||||
"params": _get_dict_from_obj(param),
|
||||
"sys_infos": _get_dict_from_obj(get_system_info()),
|
||||
},
|
||||
):
|
||||
param = initialize_app(param)
|
||||
|
266
pilot/utils/system_utils.py
Normal file
266
pilot/utils/system_utils.py
Normal file
@ -0,0 +1,266 @@
|
||||
import os
import platform
import re
import subprocess
from dataclasses import asdict, dataclass
from enum import Enum
from functools import cache
from typing import Dict, Optional, Tuple
|
||||
|
||||
|
||||
@dataclass
class SystemInfo:
    """Snapshot of the host environment that is attached to tracer metadata.

    Instances are produced by ``get_system_info``. ``torch_version`` and
    ``device_version`` are ``None`` when the corresponding probe found nothing
    (torch not importable, no CUDA toolkit/driver detected), so they are
    typed as optional.
    """

    # Value of the detected ``OSType`` member ("win", "linux", "darwin", "other").
    platform: str
    # Human readable OS name and version.
    distribution: str
    python_version: str
    # CPU model name, or "Unknown CPU" when it could not be determined.
    cpu: str
    # Value of the detected ``AVXType`` member.
    cpu_avx: str
    # Total memory as text; the unit depends on which probe supplied it.
    memory: str
    torch_version: Optional[str]
    # "cuda", "mps" or "cpu".
    device: str
    device_version: Optional[str]
    device_count: int
    # Raw ``nvidia-smi`` CSV output for CUDA devices, otherwise "".
    device_other: str

    def to_dict(self) -> Dict:
        """Return the fields as a plain dictionary (via ``dataclasses.asdict``)."""
        return asdict(self)
|
||||
|
||||
|
||||
class AVXType(Enum):
    """CPU vector-instruction levels that the AVX probe can report."""

    BASIC = "basic"
    AVX = "AVX"
    AVX2 = "AVX2"
    AVX512 = "AVX512"

    @staticmethod
    def of_type(avx: str):
        """Look up a member by its exact (case-sensitive) value.

        Args:
            avx: Candidate value, e.g. ``"AVX2"``. ``None`` is accepted.

        Returns:
            The matching ``AVXType`` member, or ``None`` when nothing matches.
        """
        return next((member for member in AVXType if member._value_ == avx), None)
|
||||
|
||||
|
||||
class OSType(str, Enum):
    """Operating-system families distinguished by the system probes.

    The ``str`` mix-in makes every member compare equal to its string value.
    """

    WINDOWS = "win"
    LINUX = "linux"
    DARWIN = "darwin"
    # Fallback for any platform that is not one of the three above.
    OTHER = "other"
|
||||
|
||||
|
||||
def get_cpu_avx_support() -> Tuple[OSType, AVXType, str]:
    """Detect the OS family, the CPU's AVX level and the OS distribution.

    The AVX level is read from ``lscpu`` on Linux and ``sysctl -a`` on macOS;
    Windows is assumed to support AVX2. If the ``DBGPT_LLAMA_CPP_AVX``
    environment variable holds a valid ``AVXType`` value it overrides the
    detected level. A missing or failing probe command no longer raises: the
    level simply stays at ``AVXType.BASIC``.

    Returns:
        A ``(os_type, avx_type, distribution)`` tuple.
    """
    system = platform.system()
    cpu_avx = AVXType.BASIC
    env_cpu_avx = AVXType.of_type(os.getenv("DBGPT_LLAMA_CPP_AVX"))

    if "windows" in system.lower():
        os_type = OSType.WINDOWS
        output = "avx2"
        distribution = "Windows " + platform.release()
        print("Current platform is windows, use avx2 as default cpu architecture")
    elif system == "Linux":
        os_type = OSType.LINUX
        output = _capture_stdout(["lscpu"])
        distribution = get_linux_distribution()
    elif system == "Darwin":
        os_type = OSType.DARWIN
        output = _capture_stdout(["sysctl", "-a"])
        distribution = "Mac OS " + platform.mac_ver()[0]
    else:
        print("Unsupported OS to get cpu avx, use default")
        return (
            OSType.OTHER,
            env_cpu_avx if env_cpu_avx else cpu_avx,
            "Unknown Distribution",
        )

    flags = output.lower()
    if "avx512" in flags:
        cpu_avx = AVXType.AVX512
    elif "avx2" in flags:
        cpu_avx = AVXType.AVX2
    # NOTE(review): a CPU that only reports plain "avx" is left at BASIC, as
    # before (the AVXType.AVX assignment was disabled) - confirm this is wanted.
    return os_type, env_cpu_avx if env_cpu_avx else cpu_avx, distribution


def _capture_stdout(command) -> str:
    """Run ``command`` and return its stdout as text, or "" if it cannot start."""
    try:
        result = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except OSError:
        # Binary missing or not executable (e.g. minimal container images).
        return ""
    return result.stdout.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def get_device() -> str:
    """Return the preferred torch device name.

    Returns:
        ``"cuda"`` when CUDA is available, ``"mps"`` when the Apple Metal
        backend is available, otherwise ``"cpu"``. ``"cpu"`` is also returned
        when torch cannot be imported at all.
    """
    try:
        import torch
    except ImportError:
        # Covers ModuleNotFoundError as well as a present-but-broken install.
        return "cpu"

    if torch.cuda.is_available():
        return "cuda"
    # torch.backends.mps does not exist on older torch releases.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"
|
||||
|
||||
|
||||
def get_device_info() -> Tuple[Optional[str], str, Optional[str], int, str]:
    """Collect torch and accelerator details.

    Returns:
        A ``(torch_version, device, device_version, device_count,
        device_other)`` tuple. ``torch_version`` is ``None`` when torch cannot
        be imported. ``device`` is ``"cuda"``, ``"mps"`` or ``"cpu"``.
        ``device_version`` is the CUDA version reported by torch, falling back
        to ``nvcc`` and then ``nvidia-smi``, or ``None`` if none of them
        report one. ``device_other`` holds the raw ``nvidia-smi`` CSV output
        for CUDA devices and is ``""`` otherwise.
    """
    torch_version = None
    device = "cpu"
    device_version = None
    device_count = 0
    device_other = ""

    try:
        import torch
    except ImportError:
        # Covers ModuleNotFoundError as well as a present-but-broken install.
        torch = None

    if torch is not None:
        torch_version = torch.__version__
        if torch.cuda.is_available():
            device = "cuda"
            device_version = torch.version.cuda
            device_count = torch.cuda.device_count()
        else:
            # torch.backends.mps does not exist on older torch releases.
            mps_backend = getattr(torch.backends, "mps", None)
            if mps_backend is not None and mps_backend.is_available():
                device = "mps"

    if not device_version:
        device_version = (
            get_cuda_version_from_nvcc() or get_cuda_version_from_nvidia_smi()
        )

    if device == "cuda":
        try:
            output = subprocess.check_output(
                [
                    "nvidia-smi",
                    "--query-gpu=name,driver_version,memory.total,memory.free,memory.used",
                    "--format=csv",
                ]
            )
            device_other = output.decode("utf-8")
        except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
            # Best effort only: the extra GPU details are optional.
            pass
    return torch_version, device, device_version, device_count, device_other
|
||||
|
||||
|
||||
def get_cuda_version_from_nvcc():
    """Read the CUDA toolkit version from ``nvcc --version``.

    Returns:
        The text following ``release`` on the first line that contains it, up
        to the next comma, or ``None`` when ``nvcc`` is unavailable, fails, or
        prints no such line.
    """
    try:
        output = subprocess.check_output(["nvcc", "--version"])
    except (OSError, subprocess.SubprocessError):
        # nvcc is not installed, not executable, or exited with an error.
        return None

    for line in output.decode("utf-8", errors="replace").splitlines():
        if "release" in line:
            return line.split("release")[-1].strip().split(",")[0]
    return None
|
||||
|
||||
|
||||
def get_cuda_version_from_nvidia_smi():
    """Read the CUDA version from the ``nvidia-smi`` banner.

    Returns:
        The ``major.minor`` string following ``CUDA Version:``, or ``None``
        when ``nvidia-smi`` is unavailable, fails, or its output carries no
        such field.
    """
    try:
        raw = subprocess.check_output(["nvidia-smi"])
    except (OSError, subprocess.SubprocessError):
        # nvidia-smi is not installed, not executable, or exited with an error.
        return None

    match = re.search(
        r"CUDA Version:\s+(\d+\.\d+)", raw.decode("utf-8", errors="replace")
    )
    return match.group(1) if match else None
|
||||
|
||||
|
||||
def get_linux_distribution():
    """Get distribution of Linux.

    Parses ``/etc/os-release`` and combines its ``NAME`` and ``VERSION_ID``
    fields.

    Returns:
        ``"<NAME> <VERSION_ID>"`` (stripped), or
        ``"Unknown Linux Distribution"`` when the file is missing or cannot
        be read.
    """
    info = {}
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the locale.
        with open("/etc/os-release", "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                key, sep, value = line.partition("=")
                key = key.strip()
                if not sep or key.startswith("#"):
                    # Blank line, comment, or anything that is not KEY=VALUE.
                    continue
                # Values may be wrapped in double or single quotes.
                info[key] = value.strip().strip("\"'")
    except OSError:
        return "Unknown Linux Distribution"
    return f"{info.get('NAME', 'Unknown')} {info.get('VERSION_ID', '')}".strip()
|
||||
|
||||
|
||||
def get_cpu_info():
    """Collect the OS type, AVX level, CPU model name and OS distribution.

    The CPU model is read from ``lscpu`` on Linux (English or Chinese field
    label), from ``sysctl machdep.cpu.brand_string`` on macOS and from
    ``platform.processor()`` on Windows. Probing is best effort: when it
    fails the model stays ``"Unknown CPU"`` instead of raising, so callers
    that only want diagnostics never crash.

    Returns:
        A ``(os_type, avx_type, cpu_info, distribution)`` tuple.
    """
    os_type, avx_type, distribution = get_cpu_avx_support()

    cpu_info = "Unknown CPU"
    if os_type == OSType.LINUX:
        output = _read_command_output(["lscpu"])
        # The localized label is checked last, so it wins when both are present.
        for pattern in (r".*Model name:\s*(.+)", r".*型号名称[:：]\s*(.+)"):
            match = re.search(pattern, output)
            if match:
                cpu_info = match.group(1).strip()
    elif os_type == OSType.DARWIN:
        output = _read_command_output(["sysctl", "machdep.cpu.brand_string"])
        match = re.search(r"machdep.cpu.brand_string:\s*(.+)", output)
        if match:
            cpu_info = match.group(1).strip()
    elif os_type == OSType.WINDOWS:
        # Previously raised NotImplementedError, which aborted every caller
        # on Windows; fall back to what the standard library can report.
        cpu_info = platform.processor() or cpu_info

    return os_type, avx_type, cpu_info, distribution


def _read_command_output(command) -> str:
    """Return the stdout of ``command`` as text, or "" when it cannot be run."""
    try:
        raw = subprocess.check_output(command)
    except (OSError, subprocess.SubprocessError):
        return ""
    return raw.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def get_memory_info(os_type: OSType) -> str:
    """Return the total system memory as display text.

    Args:
        os_type: Detected operating system; on Linux ``/proc/meminfo`` is
            consulted in addition to ``psutil``.

    Returns:
        The ``MemTotal`` value from ``/proc/meminfo`` on Linux when it can be
        read, otherwise ``"<N> GB"`` computed with ``psutil`` when that
        package is importable, otherwise ``"Unknown Memory"``.
    """
    memory = "Unknown Memory"
    try:
        import psutil

        memory = f"{psutil.virtual_memory().total // (1024 ** 3)} GB"
    except ImportError:
        # psutil is optional.
        pass

    if os_type == OSType.LINUX:
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal" in line:
                        memory = line.split(":")[1].strip()
                        break
        except (OSError, IndexError):
            # Unreadable or oddly formatted file: keep the value found so far.
            pass
    return memory
|
||||
|
||||
|
||||
@cache
def get_system_info() -> SystemInfo:
    """Gather all system details into one ``SystemInfo`` record.

    The result is memoized with ``functools.cache``, so the probes run at
    most once per process.
    """
    os_type, avx_type, cpu_info, distribution = get_cpu_info()
    fields = {
        "platform": os_type.value,
        "distribution": distribution,
        "python_version": platform.python_version(),
        "cpu": cpu_info,
        "cpu_avx": avx_type.value,
        "memory": get_memory_info(os_type),
    }

    device_keys = (
        "torch_version",
        "device",
        "device_version",
        "device_count",
        "device_other",
    )
    # get_device_info() returns its values in exactly this order.
    fields.update(zip(device_keys, get_device_info()))
    return SystemInfo(**fields)
|
@ -259,6 +259,7 @@ def chat(
|
||||
found_trace_id = trace_id
|
||||
|
||||
service_tables = {}
|
||||
system_infos_table = {}
|
||||
out_kwargs = {"ensure_ascii": False} if output == "json" else {}
|
||||
for service_name, sp in service_spans.items():
|
||||
metadata = sp["metadata"]
|
||||
@ -266,6 +267,15 @@ def chat(
|
||||
for k, v in metadata["params"].items():
|
||||
table.add_row([k, v])
|
||||
service_tables[service_name] = table
|
||||
sys_infos = metadata.get("sys_infos")
|
||||
if sys_infos and isinstance(sys_infos, dict):
|
||||
sys_table = PrettyTable(
|
||||
["System Config Key", "System Config Value"],
|
||||
title=f"{service_name} System information",
|
||||
)
|
||||
for k, v in sys_infos.items():
|
||||
sys_table.add_row([k, v])
|
||||
system_infos_table[service_name] = sys_table
|
||||
|
||||
if not hide_run_params:
|
||||
merged_table1 = merge_tables_horizontally(
|
||||
@ -276,16 +286,23 @@ def chat(
|
||||
)
|
||||
merged_table2 = merge_tables_horizontally(
|
||||
[
|
||||
service_tables.get(SpanTypeRunName.MODEL_WORKER),
|
||||
service_tables.get(SpanTypeRunName.WORKER_MANAGER),
|
||||
service_tables.get(SpanTypeRunName.MODEL_WORKER.value),
|
||||
service_tables.get(SpanTypeRunName.WORKER_MANAGER.value),
|
||||
]
|
||||
)
|
||||
sys_table = system_infos_table.get(SpanTypeRunName.WORKER_MANAGER.value)
|
||||
if system_infos_table:
|
||||
for k, v in system_infos_table.items():
|
||||
sys_table = v
|
||||
break
|
||||
if output == "text":
|
||||
print(merged_table1)
|
||||
print(merged_table2)
|
||||
else:
|
||||
for service_name, table in service_tables.items():
|
||||
print(table.get_formatted_string(out_format=output, **out_kwargs))
|
||||
if sys_table:
|
||||
print(sys_table.get_formatted_string(out_format=output, **out_kwargs))
|
||||
if hide_conv:
|
||||
return
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user