feat(core): Support opentelemetry exporter (#1690)

This commit is contained in:
Fangyin Cheng
2024-07-05 15:20:21 +08:00
committed by GitHub
parent 84fc1fc7fe
commit bf978d2bf9
39 changed files with 1176 additions and 218 deletions

View File

@@ -3,7 +3,7 @@ import contextvars
from abc import ABC, abstractmethod
from concurrent.futures import Executor, ThreadPoolExecutor
from functools import partial
from typing import Any, Awaitable, Callable
from typing import Any, Callable, Optional
from dbgpt.component import BaseComponent, ComponentType, SystemApp
@@ -67,6 +67,11 @@ async def blocking_func_to_async(
return await loop.run_in_executor(executor, run_with_context)
async def blocking_func_to_async_no_executor(func: BlockingFunction, *args, **kwargs):
    """Run a potentially blocking function without an explicit executor.

    Convenience wrapper around ``blocking_func_to_async`` that passes ``None``
    as the executor argument (presumably falling back to the event loop's
    default ThreadPoolExecutor — confirm in ``blocking_func_to_async``).

    Args:
        func: The blocking callable to run.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The result of ``func(*args, **kwargs)``.
    """
    return await blocking_func_to_async(None, func, *args, **kwargs)  # type: ignore
class AsyncToSyncIterator:
def __init__(self, async_iterable, loop: asyncio.BaseEventLoop):
self.async_iterable = async_iterable

View File

@@ -108,6 +108,99 @@ class BaseParameters:
return asdict(self)
@dataclass
class BaseServerParameters(BaseParameters):
    """Common server parameters shared by DB-GPT server entry points.

    Covers network binding, daemonization, logging, file-based tracing, and
    the OpenTelemetry (OTLP) exporter configuration.
    """

    host: Optional[str] = field(
        default="0.0.0.0", metadata={"help": "The host IP address to bind to."}
    )
    port: Optional[int] = field(
        default=None, metadata={"help": "The port number to bind to."}
    )
    daemon: Optional[bool] = field(
        default=False, metadata={"help": "Run the server as a daemon."}
    )
    log_level: Optional[str] = field(
        default=None,
        metadata={
            "help": "Logging level",
            # Fix: "WARNING" was listed twice in the original valid values.
            "valid_values": [
                "FATAL",
                "ERROR",
                "WARNING",
                "INFO",
                "DEBUG",
                "NOTSET",
            ],
        },
    )
    log_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "The filename to store log",
        },
    )
    tracer_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "The filename to store tracer span records",
        },
    )
    # NOTE: the default is resolved from the environment at import time (when
    # the dataclass is created), not at instantiation time.
    tracer_to_open_telemetry: Optional[bool] = field(
        default=os.getenv("TRACER_TO_OPEN_TELEMETRY", "False").lower() == "true",
        metadata={
            "help": "Whether send tracer span records to OpenTelemetry",
        },
    )
    otel_exporter_otlp_traces_endpoint: Optional[str] = field(
        default=None,
        metadata={
            "help": "`OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` target to which the span "
            "exporter is going to send spans. The endpoint MUST be a valid URL host, "
            "and MAY contain a scheme (http or https), port and path. A scheme of https"
            " indicates a secure connection and takes precedence over this "
            "configuration setting.",
        },
    )
    otel_exporter_otlp_traces_insecure: Optional[bool] = field(
        default=None,
        metadata={
            # Fix: the help string was missing its opening backtick and
            # contained "over the this".
            "help": "`OTEL_EXPORTER_OTLP_TRACES_INSECURE` represents whether to "
            "enable client transport security for gRPC requests for spans. A scheme "
            "of https takes precedence over this configuration setting. "
            "Default: False"
        },
    )
    otel_exporter_otlp_traces_certificate: Optional[str] = field(
        default=None,
        metadata={
            "help": "`OTEL_EXPORTER_OTLP_TRACES_CERTIFICATE` stores the path to the "
            "certificate file for TLS credentials of gRPC client for traces. "
            "Should only be used for a secure connection for tracing",
        },
    )
    otel_exporter_otlp_traces_headers: Optional[str] = field(
        default=None,
        metadata={
            "help": "`OTEL_EXPORTER_OTLP_TRACES_HEADERS` contains the key-value pairs "
            "to be used as headers for spans associated with gRPC or HTTP requests.",
        },
    )
    otel_exporter_otlp_traces_timeout: Optional[int] = field(
        default=None,
        metadata={
            "help": "`OTEL_EXPORTER_OTLP_TRACES_TIMEOUT` is the maximum time the OTLP "
            "exporter will wait for each batch export for spans.",
        },
    )
    otel_exporter_otlp_traces_compression: Optional[str] = field(
        default=None,
        metadata={
            "help": "`OTEL_EXPORTER_OTLP_COMPRESSION` but only for the span exporter. "
            "If both are present, this takes higher precedence.",
        },
    )
def _get_dataclass_print_str(obj):
class_name = obj.__class__.__name__
parameters = [

View File

@@ -1,4 +1,5 @@
from dbgpt.util.tracer.base import (
DBGPT_TRACER_SPAN_ID,
Span,
SpanStorage,
SpanStorageType,
@@ -28,6 +29,7 @@ __all__ = [
"SpanStorage",
"SpanStorageType",
"TracerContext",
"DBGPT_TRACER_SPAN_ID",
"MemorySpanStorage",
"FileSpanStorage",
"SpanStorageContainer",

View File

@@ -1,15 +1,24 @@
from __future__ import annotations
import json
import secrets
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Any, Callable, Dict, List, Optional
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from dbgpt.component import BaseComponent, ComponentType, SystemApp
DBGPT_TRACER_SPAN_ID = "DB-GPT-Trace-Span-Id"
# Compatibility with OpenTelemetry API
_TRACE_ID_MAX_VALUE = 2**128 - 1
_SPAN_ID_MAX_VALUE = 2**64 - 1
INVALID_SPAN_ID = 0x0000000000000000
INVALID_TRACE_ID = 0x00000000000000000000000000000000
class SpanType(str, Enum):
BASE = "base"
@@ -60,7 +69,7 @@ class Span:
# Timestamp when this span ended, initially None
self.end_time = None
# Additional metadata associated with the span
self.metadata = metadata
self.metadata = metadata or {}
self._end_callers = []
if end_caller:
self._end_callers.append(end_caller)
@@ -91,13 +100,17 @@ class Span:
"span_id": self.span_id,
"parent_span_id": self.parent_span_id,
"operation_name": self.operation_name,
"start_time": None
if not self.start_time
else self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3],
"end_time": None
if not self.end_time
else self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3],
"metadata": _clean_for_json(self.metadata),
"start_time": (
None
if not self.start_time
else self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
),
"end_time": (
None
if not self.end_time
else self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
),
"metadata": _clean_for_json(self.metadata) if self.metadata else None,
}
def copy(self) -> Span:
@@ -200,6 +213,60 @@ class Tracer(BaseComponent, ABC):
"""
return str(uuid.uuid4())
def _new_random_trace_id(self) -> str:
    """Create a new random trace ID.

    Returns:
        str: A 32-character hex string (128 bits of randomness), produced by
            the module-level ``_new_random_trace_id`` helper.
    """
    return _new_random_trace_id()
def _new_random_span_id(self) -> str:
    """Create a new random span ID.

    Returns:
        str: A 16-character hex string (64 bits of randomness), produced by
            the module-level ``_new_random_span_id`` helper.
    """
    return _new_random_span_id()
def _new_random_trace_id() -> str:
"""Create a new random trace ID."""
# Generate a 128-bit hex string
return secrets.token_hex(16)
def _is_valid_trace_id(trace_id: Union[str, int]) -> bool:
    """Return True if ``trace_id`` is a parseable, in-range trace ID.

    A string is interpreted as hexadecimal; a value is valid when it lies
    strictly above ``INVALID_TRACE_ID`` and at most ``_TRACE_ID_MAX_VALUE``.
    """
    value = trace_id
    if isinstance(value, str):
        try:
            value = int(value, 16)
        except ValueError:
            # Not a hex string, so it cannot be a valid trace ID.
            return False
    return INVALID_TRACE_ID < int(value) <= _TRACE_ID_MAX_VALUE
def _new_random_span_id() -> str:
"""Create a new random span ID."""
# Generate a 64-bit hex string
return secrets.token_hex(8)
def _is_valid_span_id(span_id: Union[str, int]) -> bool:
    """Return True if ``span_id`` is a parseable, in-range span ID.

    A string is interpreted as hexadecimal; a value is valid when it lies
    strictly above ``INVALID_SPAN_ID`` and at most ``_SPAN_ID_MAX_VALUE``.
    """
    value = span_id
    if isinstance(value, str):
        try:
            value = int(value, 16)
        except ValueError:
            # Not a hex string, so it cannot be a valid span ID.
            return False
    return INVALID_SPAN_ID < int(value) <= _SPAN_ID_MAX_VALUE
def _split_span_id(span_id: str) -> Tuple[int, int]:
parent_span_id_parts = span_id.split(":")
if len(parent_span_id_parts) != 2:
return 0, 0
trace_id, parent_span_id = parent_span_id_parts
try:
trace_id = int(trace_id, 16)
span_id = int(parent_span_id, 16)
return trace_id, span_id
except ValueError:
return 0, 0
@dataclass
class TracerContext:
@@ -240,3 +307,28 @@ def _clean_for_json(data: Optional[str, Any] = None):
return data
except TypeError:
return None
def _parse_span_id(body: Any) -> Optional[str]:
    """Extract a trace span ID from an incoming request body.

    Supports a Starlette ``Request`` (via the ``DBGPT_TRACER_SPAN_ID``
    header), a plain ``dict``, or a pydantic ``BaseModel`` (via the
    ``DBGPT_TRACER_SPAN_ID`` or ``span_id`` keys).

    Args:
        body: The request object, dict, or pydantic model to inspect.

    Returns:
        The span ID string in ``"<trace_id>:<span_id>"`` form when present
        and valid, otherwise ``None``.
    """
    from starlette.requests import Request

    from dbgpt._private.pydantic import BaseModel, model_to_dict

    span_id: Optional[str] = None
    if isinstance(body, Request):
        span_id = body.headers.get(DBGPT_TRACER_SPAN_ID)
    elif isinstance(body, dict):
        span_id = body.get(DBGPT_TRACER_SPAN_ID) or body.get("span_id")
    elif isinstance(body, BaseModel):
        dict_body = model_to_dict(body)
        span_id = dict_body.get(DBGPT_TRACER_SPAN_ID) or dict_body.get("span_id")
    if not span_id:
        return None
    int_trace_id, int_span_id = _split_span_id(span_id)
    if not int_trace_id:
        return None
    # Bug fix: the original returned span_id on BOTH branches of this check,
    # making the range validation a no-op. An out-of-range trace/span ID is
    # now rejected instead of propagated.
    if _is_valid_span_id(int_span_id) and _is_valid_trace_id(int_trace_id):
        return span_id
    return None

View File

@@ -0,0 +1,122 @@
from typing import Dict, List, Optional
from .base import Span, SpanStorage, _split_span_id
try:
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import Span as OTSpan
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace import SpanContext, SpanKind
except ImportError:
raise ImportError(
"To use OpenTelemetrySpanStorage, you must install opentelemetry-api, "
"opentelemetry-sdk and opentelemetry-exporter-otlp."
"You can install it via `pip install opentelemetry-api opentelemetry-sdk "
"opentelemetry-exporter-otlp`"
)
class OpenTelemetrySpanStorage(SpanStorage):
    """OpenTelemetry span storage.

    Bridges DB-GPT spans to OpenTelemetry: each DB-GPT span is mirrored as
    an OTel span and exported via an OTLP gRPC ``BatchSpanProcessor``.
    """

    def __init__(
        self,
        service_name: str,
        otlp_endpoint: Optional[str] = None,
        otlp_insecure: Optional[bool] = None,
        otlp_timeout: Optional[int] = None,
    ):
        """Create the storage and install a global tracer provider.

        Args:
            service_name: Value for the OTel ``service.name`` resource.
            otlp_endpoint: OTLP collector endpoint; exporter default if None.
            otlp_insecure: Disable transport security for gRPC if True.
            otlp_timeout: Max seconds to wait for each batch export.
        """
        super().__init__()
        self.service_name = service_name

        resource = Resource(attributes={"service.name": service_name})
        self.tracer_provider = TracerProvider(resource=resource)
        self.tracer = self.tracer_provider.get_tracer(__name__)
        # Spans that have started but not yet ended, keyed by DB-GPT span id.
        self.spans: Dict[str, OTSpan] = {}
        otlp_exporter = OTLPSpanExporter(
            endpoint=otlp_endpoint,
            insecure=otlp_insecure,
            timeout=otlp_timeout,
        )
        span_processor = BatchSpanProcessor(otlp_exporter)
        self.tracer_provider.add_span_processor(span_processor)
        # NOTE: sets the process-wide tracer provider as a side effect.
        trace.set_tracer_provider(self.tracer_provider)

    def append_span(self, span: Span):
        """Record a DB-GPT span start or end event as an OTel span.

        The first sighting of a span id starts an OTel span; the second
        sighting (the end event) updates attributes and ends it.
        """
        span_id = span.span_id

        if span_id in self.spans:
            # End event for a previously started span.
            otel_span = self.spans.pop(span_id)
            self._set_metadata_attributes(otel_span, span)
            # Datetime -> nanoseconds since the epoch, as OTel expects.
            end_time = int(span.end_time.timestamp() * 1e9) if span.end_time else None
            if end_time is not None:
                otel_span.end(end_time=end_time)
            else:
                otel_span.end()
            return

        parent_context = self._create_parent_context(span)
        start_time = int(span.start_time.timestamp() * 1e9)
        otel_span = self.tracer.start_span(
            span.operation_name,
            context=parent_context,
            kind=SpanKind.INTERNAL,
            start_time=start_time,
        )
        otel_span.set_attribute("dbgpt_trace_id", span.trace_id)
        otel_span.set_attribute("dbgpt_span_id", span.span_id)
        if span.parent_span_id:
            otel_span.set_attribute("dbgpt_parent_span_id", span.parent_span_id)
        otel_span.set_attribute("span_type", span.span_type.value)
        self._set_metadata_attributes(otel_span, span)
        if span.end_time:
            # Bug fix: a span arriving already complete was previously
            # started but never ended, so it was never exported.
            otel_span.end(end_time=int(span.end_time.timestamp() * 1e9))
        else:
            self.spans[span_id] = otel_span

    @staticmethod
    def _set_metadata_attributes(otel_span: OTSpan, span: Span) -> None:
        """Copy OTLP-compatible metadata entries onto the OTel span.

        Only primitives (bool/str/bytes/int/float) and homogeneous lists of
        primitives are accepted by the OTel attribute API; other values are
        silently skipped.
        """
        if not span.metadata:
            return
        primitives = (bool, str, bytes, int, float)
        for key, value in span.metadata.items():
            if isinstance(value, primitives) or (
                isinstance(value, list)
                and all(isinstance(item, primitives) for item in value)
            ):
                otel_span.set_attribute(key, value)

    def append_span_batch(self, spans: List[Span]):
        """Append each span in ``spans`` in order."""
        for span in spans:
            self.append_span(span)

    def _create_parent_context(self, span: Span):
        """Build an OTel context carrying the span's remote parent, if any.

        Falls back to an INVALID_SPAN context (i.e. a new root) when the
        parent span id is missing or unparseable.
        """
        if not span.parent_span_id:
            return trace.set_span_in_context(trace.INVALID_SPAN)
        trace_id, parent_span_id = _split_span_id(span.parent_span_id)
        if not trace_id:
            return trace.set_span_in_context(trace.INVALID_SPAN)

        span_context = SpanContext(
            trace_id=trace_id,
            span_id=parent_span_id,
            is_remote=True,
            trace_flags=trace.TraceFlags(0x01),  # Default: SAMPLED
        )
        return trace.set_span_in_context(trace.NonRecordingSpan(span_context))

    def close(self):
        """Shut down the tracer provider, flushing any pending spans."""
        self.tracer_provider.shutdown()

View File

@@ -249,7 +249,7 @@ def chat(
for sp in spans:
span_type = sp["span_type"]
metadata = sp.get("metadata")
if span_type == SpanType.RUN:
if span_type == SpanType.RUN and metadata and "run_service" in metadata:
service_name = metadata["run_service"]
service_spans[service_name] = sp.copy()
if set(service_spans.keys()) == service_names and found_trace_id:

View File

@@ -3,7 +3,7 @@ import inspect
import logging
from contextvars import ContextVar
from functools import wraps
from typing import Dict, Optional
from typing import Any, AsyncIterator, Dict, Optional
from dbgpt.component import ComponentType, SystemApp
from dbgpt.util.module_utils import import_from_checked_string
@@ -46,9 +46,12 @@ class DefaultTracer(Tracer):
metadata: Dict = None,
) -> Span:
trace_id = (
self._new_uuid() if parent_span_id is None else parent_span_id.split(":")[0]
self._new_random_trace_id()
if parent_span_id is None
else parent_span_id.split(":")[0]
)
span_id = f"{trace_id}:{self._new_uuid()}"
span_id = f"{trace_id}:{self._new_random_span_id()}"
span = Span(
trace_id,
span_id,
@@ -164,6 +167,33 @@ class TracerManager:
current_span = self.get_current_span()
return current_span.span_type if current_span else None
def _parse_span_id(self, body: Any) -> Optional[str]:
    """Extract a span ID from a request body.

    Thin delegation to the module-level ``_parse_span_id`` in
    ``dbgpt.util.tracer.base``; see that function for the supported body
    types and the returned format.
    """
    from .base import _parse_span_id

    return _parse_span_id(body)
def wrapper_async_stream(
    self,
    generator: AsyncIterator[Any],
    operation_name: str,
    parent_span_id: Optional[str] = None,
    span_type: Optional[SpanType] = None,
    metadata: Optional[Dict] = None,
) -> AsyncIterator[Any]:
    """Wrap an async generator so its consumption is traced by a span.

    The span starts on first iteration of the returned iterator and ends
    when the stream is exhausted or closed (via the ``finally`` block).

    Args:
        generator: The async iterator to wrap.
        operation_name: Operation name recorded on the span.
        parent_span_id: Explicit parent span ID; defaults to the current
            span in context at call time.
        span_type: Span type recorded on the span.
        metadata: Extra metadata attached to the span.

    Returns:
        An async iterator yielding the same items as ``generator``.
    """
    # Resolve the parent NOW, not inside ``wrapper`` — the current-span
    # context may have changed by the time the stream is consumed.
    parent_span_id = parent_span_id or self.get_current_span_id()

    async def wrapper():
        span = self.start_span(operation_name, parent_span_id, span_type, metadata)
        try:
            async for item in generator:
                yield item
        finally:
            # Ensure the span ends even if the consumer abandons the stream.
            span.end()

    return wrapper()
root_tracer: TracerManager = TracerManager()
@@ -206,10 +236,14 @@ def _parse_operation_name(func, *args):
def initialize_tracer(
tracer_filename: str,
root_operation_name: str = "DB-GPT-Web-Entry",
root_operation_name: str = "DB-GPT-Webserver",
system_app: Optional[SystemApp] = None,
tracer_storage_cls: Optional[str] = None,
create_system_app: bool = False,
enable_open_telemetry: bool = False,
otlp_endpoint: Optional[str] = None,
otlp_insecure: Optional[bool] = None,
otlp_timeout: Optional[int] = None,
):
"""Initialize the tracer with the given filename and system app."""
from dbgpt.util.tracer.span_storage import FileSpanStorage, SpanStorageContainer
@@ -227,6 +261,17 @@ def initialize_tracer(
storage_container = SpanStorageContainer(system_app)
storage_container.append_storage(FileSpanStorage(tracer_filename))
if enable_open_telemetry:
from dbgpt.util.tracer.opentelemetry import OpenTelemetrySpanStorage
storage_container.append_storage(
OpenTelemetrySpanStorage(
service_name=root_operation_name,
otlp_endpoint=otlp_endpoint,
otlp_insecure=otlp_insecure,
otlp_timeout=otlp_timeout,
)
)
if tracer_storage_cls:
logger.info(f"Begin parse storage class {tracer_storage_cls}")

View File

@@ -1,4 +1,4 @@
import uuid
import logging
from contextvars import ContextVar
from starlette.middleware.base import BaseHTTPMiddleware
@@ -7,7 +7,11 @@ from starlette.types import ASGIApp
from dbgpt.util.tracer import Tracer, TracerContext
_DEFAULT_EXCLUDE_PATHS = ["/api/controller/heartbeat"]
from .base import _parse_span_id
_DEFAULT_EXCLUDE_PATHS = ["/api/controller/heartbeat", "/api/health"]
logger = logging.getLogger(__name__)
class TraceIDMiddleware(BaseHTTPMiddleware):
@@ -33,11 +37,12 @@ class TraceIDMiddleware(BaseHTTPMiddleware):
):
return await call_next(request)
span_id = request.headers.get("DBGPT_TRACER_SPAN_ID")
# if not span_id:
# span_id = str(uuid.uuid4())
# self.trace_context_var.set(TracerContext(span_id=span_id))
# Read trace_id from request headers
span_id = _parse_span_id(request)
logger.debug(
f"TraceIDMiddleware: span_id={span_id}, path={request.url.path}, "
f"headers={request.headers}"
)
with self.tracer.start_span(
self.root_operation_name, span_id, metadata={"path": request.url.path}
):