from abc import ABC
from typing import Optional, Dict, List, Any, Union, AsyncIterator

import time
from dataclasses import dataclass, asdict
import copy

from dbgpt.util.annotations import PublicAPI
from dbgpt.util.model_utils import GPUInfo
from dbgpt.core.interface.message import ModelMessage, ModelMessageRoleType
from dbgpt.core.awel import MapOperator, StreamifyAbsOperator


@dataclass
class ModelInferenceMetrics:
    """A class to represent metrics for assessing the inference performance of an LLM."""

    collect_index: Optional[int] = 0
    """The index of the current metrics collection point."""

    start_time_ms: Optional[int] = None
    """The timestamp (in milliseconds) when the model inference starts."""

    end_time_ms: Optional[int] = None
    """The timestamp (in milliseconds) when the model inference ends."""

    current_time_ms: Optional[int] = None
    """The current timestamp (in milliseconds) when the model inference returns partial output (stream)."""

    first_token_time_ms: Optional[int] = None
    """The timestamp (in milliseconds) when the first token is generated."""

    first_completion_time_ms: Optional[int] = None
    """The timestamp (in milliseconds) when the first completion is generated."""

    first_completion_tokens: Optional[int] = None
    """The number of tokens when the first completion is generated."""

    prompt_tokens: Optional[int] = None
    """The number of tokens in the input prompt."""

    completion_tokens: Optional[int] = None
    """The number of tokens in the generated completion."""

    total_tokens: Optional[int] = None
    """The total number of tokens (prompt plus completion)."""

    speed_per_second: Optional[float] = None
    """The average number of tokens generated per second."""

    current_gpu_infos: Optional[List[GPUInfo]] = None
    """Current GPU information for all devices."""

    avg_gpu_infos: Optional[List[GPUInfo]] = None
    """Average memory usage across all collection points."""

    @staticmethod
    def create_metrics(
        last_metrics: Optional["ModelInferenceMetrics"] = None,
    ) -> "ModelInferenceMetrics":
        """Create new metrics, carrying over values from ``last_metrics`` when provided."""
        start_time_ms = last_metrics.start_time_ms if last_metrics else None
        first_token_time_ms = last_metrics.first_token_time_ms if last_metrics else None
        first_completion_time_ms = (
            last_metrics.first_completion_time_ms if last_metrics else None
        )
        first_completion_tokens = (
            last_metrics.first_completion_tokens if last_metrics else None
        )
        prompt_tokens = last_metrics.prompt_tokens if last_metrics else None
        completion_tokens = last_metrics.completion_tokens if last_metrics else None
        total_tokens = last_metrics.total_tokens if last_metrics else None
        speed_per_second = last_metrics.speed_per_second if last_metrics else None
        current_gpu_infos = last_metrics.current_gpu_infos if last_metrics else None
        avg_gpu_infos = last_metrics.avg_gpu_infos if last_metrics else None

        if not start_time_ms:
            start_time_ms = time.time_ns() // 1_000_000
        current_time_ms = time.time_ns() // 1_000_000
        end_time_ms = current_time_ms

        return ModelInferenceMetrics(
            start_time_ms=start_time_ms,
            end_time_ms=end_time_ms,
            current_time_ms=current_time_ms,
            first_token_time_ms=first_token_time_ms,
            first_completion_time_ms=first_completion_time_ms,
            first_completion_tokens=first_completion_tokens,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            speed_per_second=speed_per_second,
            current_gpu_infos=current_gpu_infos,
            avg_gpu_infos=avg_gpu_infos,
        )

    def to_dict(self) -> Dict:
        return asdict(self)
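

# Illustrative sketch: how ModelInferenceMetrics is typically chained during streaming
# inference, each call to create_metrics carrying over the values collected so far.
# The token counts below are arbitrary placeholder values.
def _example_metrics_chain() -> Dict:
    metrics = ModelInferenceMetrics.create_metrics()
    for generated_tokens in (1, 8, 32):
        # Pass the previous metrics so timestamps and counters are carried forward
        metrics = ModelInferenceMetrics.create_metrics(last_metrics=metrics)
        metrics.completion_tokens = generated_tokens
    return metrics.to_dict()

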
@dataclass
class ModelOutput:
    """A class to represent the output of an LLM."""

    text: str
    """The generated text."""
    error_code: int
    """The error code of the model inference. If the model inference is successful, the error code is 0."""
    model_context: Optional[Dict] = None
    """The context of the model inference."""
    finish_reason: Optional[str] = None
    """The reason the model inference finished."""
    usage: Optional[Dict[str, Any]] = None
    """The usage information of the model inference."""
    metrics: Optional[ModelInferenceMetrics] = None
    """Some metrics for model inference."""

    def to_dict(self) -> Dict:
        return asdict(self)


_ModelMessageType = Union[ModelMessage, Dict[str, Any]]


@dataclass
class ModelRequest:
    model: str
    """The name of the model."""

    messages: List[_ModelMessageType]
    """The input messages."""

    temperature: Optional[float] = None
    """The temperature of the model inference."""

    max_new_tokens: Optional[int] = None
    """The maximum number of tokens to generate."""

    stop: Optional[str] = None
    """The stop condition of the model inference."""
    stop_token_ids: Optional[List[int]] = None
    """The stop token ids of the model inference."""
    context_len: Optional[int] = None
    """The context length of the model inference."""
    echo: Optional[bool] = True
    """Whether to echo the input messages."""
    span_id: Optional[str] = None
    """The span id of the model inference."""

    def to_dict(self) -> Dict:
        new_request = copy.deepcopy(self)
        new_request.messages = list(
            map(lambda m: m if isinstance(m, dict) else m.dict(), new_request.messages)
        )
        # Skip fields with empty or None values
        return {k: v for k, v in asdict(new_request).items() if v}

    def _get_messages(self) -> List[ModelMessage]:
        return list(
            map(
                lambda m: m if isinstance(m, ModelMessage) else ModelMessage(**m),
                self.messages,
            )
        )

    @staticmethod
    def _build(model: str, prompt: str, **kwargs):
        return ModelRequest(
            model=model,
            messages=[ModelMessage(role=ModelMessageRoleType.HUMAN, content=prompt)],
            **kwargs,
        )
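

# Illustrative sketch: building a single-turn request with the internal ``_build`` helper
# and serializing it. The model name and temperature are arbitrary placeholder values.
def _example_build_request() -> Dict:
    request = ModelRequest._build("gpt-3.5-turbo", "Hello", temperature=0.7)
    # to_dict converts messages to plain dicts and drops empty fields
    return request.to_dict()

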
class RequestBuildOperator(MapOperator[str, ModelRequest], ABC):
    def __init__(self, model: str, **kwargs):
        self._model = model
        super().__init__(**kwargs)

    async def map(self, input_value: str) -> ModelRequest:
        return ModelRequest._build(self._model, input_value)
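

# Illustrative sketch: RequestBuildOperator turns a prompt string into a ModelRequest for
# the configured model. In a real application the operator is composed into an AWEL DAG;
# here it is simply awaited directly, and the model name is a placeholder.
async def _example_build_operator() -> ModelRequest:
    builder = RequestBuildOperator(model="gpt-3.5-turbo")
    return await builder.map("What is AWEL?")

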
class BaseLLMOperator(
    MapOperator[ModelRequest, ModelOutput],
    StreamifyAbsOperator[ModelRequest, ModelOutput],
    ABC,
):
    """The abstract operator for an LLM."""


@PublicAPI(stability="beta")
class OpenAILLM(BaseLLMOperator):
    """The operator for OpenAI LLM.

    Examples:

        .. code-block:: python

            llm = OpenAILLM()
            model_request = ModelRequest(
                model="gpt-3.5-turbo",
                messages=[
                    ModelMessage(role=ModelMessageRoleType.HUMAN, content="Hello")
                ],
            )
            model_output = await llm.map(model_request)
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        try:
            import openai
        except ImportError as e:
            raise ImportError("Please install openai package to use OpenAILLM") from e
        import importlib.metadata as metadata

        if not metadata.version("openai") >= "1.0.0":
            raise ImportError("Please upgrade openai package to version 1.0.0 or above")

    async def _send_request(
        self, model_request: ModelRequest, stream: Optional[bool] = False
    ):
        import os
        from openai import AsyncOpenAI

        client = AsyncOpenAI(
            api_key=os.environ.get("OPENAI_API_KEY"),
            base_url=os.environ.get("OPENAI_API_BASE"),
        )
        messages = ModelMessage.to_openai_messages(model_request._get_messages())
        payloads = {
            "model": model_request.model,
            "stream": stream,
        }
        if model_request.temperature is not None:
            payloads["temperature"] = model_request.temperature
        if model_request.max_new_tokens:
            payloads["max_tokens"] = model_request.max_new_tokens

        return await client.chat.completions.create(messages=messages, **payloads)

    async def map(self, model_request: ModelRequest) -> ModelOutput:
        try:
            chat_completion = await self._send_request(model_request, stream=False)
            text = chat_completion.choices[0].message.content
            usage = chat_completion.usage.dict()
            return ModelOutput(text=text, error_code=0, usage=usage)
        except Exception as e:
            return ModelOutput(
                text=f"**LLMServer Generate Error, Please CheckErrorInfo.**: {e}",
                error_code=1,
            )

    async def streamify(
        self, model_request: ModelRequest
    ) -> AsyncIterator[ModelOutput]:
        try:
            chat_completion = await self._send_request(model_request, stream=True)
            text = ""
            # A streaming request returns an async iterator of completion chunks
            async for r in chat_completion:
                if len(r.choices) == 0:
                    continue
                if r.choices[0].delta.content is not None:
                    content = r.choices[0].delta.content
                    text += content
                    yield ModelOutput(text=text, error_code=0)
        except Exception as e:
            yield ModelOutput(
                text=f"**LLMServer Generate Error, Please CheckErrorInfo.**: {e}",
                error_code=1,
            )
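

# Illustrative sketch: consuming the streaming output of OpenAILLM. Assumes the openai
# package is installed and OPENAI_API_KEY (and optionally OPENAI_API_BASE) is set in the
# environment; the model name is a placeholder.
async def _example_stream_chat(prompt: str) -> str:
    llm = OpenAILLM()
    request = ModelRequest._build("gpt-3.5-turbo", prompt)
    last_text = ""
    async for output in llm.streamify(request):
        # Each yielded ModelOutput carries the text accumulated so far
        last_text = output.text
    return last_text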