feat(storage): Support oss and s3

Fangyin Cheng 2025-03-17 11:59:00 +08:00
parent 8eba2a3b2e
commit b2dd66dc6d
21 changed files with 1535 additions and 12 deletions

View File

@ -0,0 +1,51 @@
[system]
# Load language from environment variable (it is set by the hook)
language = "${env:DBGPT_LANG:-zh}"
log_level = "INFO"
api_keys = []
encrypt_key = "your_secret_key"
# Server Configurations
[service.web]
host = "0.0.0.0"
port = 5670
[service.web.database]
type = "sqlite"
path = "pilot/meta_data/dbgpt.db"
[[serves]]
type = "file"
# Default backend for file server
default_backend = "s3"
[[serves.backends]]
type = "oss"
endpoint = "https://oss-cn-beijing.aliyuncs.com"
region = "oss-cn-beijing"
access_key_id = "${env:OSS_ACCESS_KEY_ID}"
access_key_secret = "${env:OSS_ACCESS_KEY_SECRET}"
fixed_bucket = "{your_bucket_name}"
[[serves.backends]]
# Use the Tencent COS S3-compatible API as the file server
type = "s3"
endpoint = "https://cos.ap-beijing.myqcloud.com"
region = "ap-beijing"
access_key_id = "${env:COS_SECRETID}"
access_key_secret = "${env:COS_SECRETKEY}"
fixed_bucket = "{your_bucket_name}"
# Model Configurations
[models]
[[models.llms]]
name = "${env:LLM_MODEL_NAME:-gpt-4o}"
provider = "${env:LLM_MODEL_PROVIDER:-proxy/openai}"
api_base = "${env:OPENAI_API_BASE:-https://api.openai.com/v1}"
api_key = "${env:OPENAI_API_KEY}"
[[models.embeddings]]
name = "${env:EMBEDDING_MODEL_NAME:-text-embedding-3-small}"
provider = "${env:EMBEDDING_MODEL_PROVIDER:-proxy/openai}"
api_url = "${env:EMBEDDING_MODEL_API_URL:-https://api.openai.com/v1/embeddings}"
api_key = "${env:OPENAI_API_KEY}"

View File

@ -393,7 +393,6 @@ async def document_upload(
bucket,
safe_filename,
doc_file.file,
storage_type="distributed",
custom_metadata=custom_metadata,
)

View File

@ -359,7 +359,6 @@ async def file_upload(
bucket,
file_name,
doc_file.file,
storage_type="distributed",
custom_metadata=custom_metadata,
)

View File

@ -181,7 +181,6 @@ class ChatExcel(BaseChat):
self.fs_client.upload_file,
self._bucket,
self._database_file_path,
storage_type="distributed",
file_id=self._database_file_id,
)
return result

View File

@ -1115,6 +1115,7 @@ def auto_register_resource(
alias: Optional[List[str]] = None,
tags: Optional[Dict[str, str]] = None,
show_in_ui: bool = True,
skip_fields: Optional[List[str]] = None,
**decorator_kwargs,
):
"""Auto register the resource.
@ -1130,6 +1131,8 @@ def auto_register_resource(
alias (Optional[List[str]], optional): The alias of the resource. Defaults to
None. For compatibility, we can use the alias to register the resource.
tags (Optional[Dict[str, str]]): The tags of the resource
show_in_ui (bool): Whether to show the resource in the UI.
skip_fields (Optional[List[str]]): The fields to skip when building parameters.
"""
from dataclasses import fields, is_dataclass
@ -1147,6 +1150,8 @@ def auto_register_resource(
parameters: List[Parameter] = []
raw_fields = fields(cls)
for i, fd in enumerate(fields_desc_list):
if skip_fields and fd.param_name in skip_fields:
continue
param_type = fd.param_type
if param_type in TYPE_STRING_TO_TYPE:
# Basic type

View File

@ -16,6 +16,7 @@ import requests
from dbgpt.component import BaseComponent, ComponentType, SystemApp
from dbgpt.util.tracer import root_tracer, trace
from ...util import BaseParameters, RegisterParameters
from .storage import (
InMemoryStorage,
QuerySpec,
@ -116,6 +117,17 @@ class FileMetadata(StorageItem):
self._identifier = obj._identifier
@dataclasses.dataclass
class StorageBackendConfig(BaseParameters, RegisterParameters):
"""Storage backend configuration"""
__type__ = "___storage_backend_config___"
def create_storage(self) -> "StorageBackend":
"""Create the storage"""
raise NotImplementedError()
class FileStorageURI:
"""File storage URI."""
@ -489,6 +501,7 @@ class FileStorageClient(BaseComponent):
system_app: Optional[SystemApp] = None,
storage_system: Optional[FileStorageSystem] = None,
save_chunk_size: int = 1024 * 1024,
default_storage_type: Optional[str] = None,
):
"""Initialize the file storage client."""
super().__init__(system_app=system_app)
@ -503,10 +516,14 @@ class FileStorageClient(BaseComponent):
)
}
)
if not default_storage_type:
if storage_system and storage_system.storage_backends:
default_storage_type = list(storage_system.storage_backends.keys())[0]
self.system_app = system_app
self._storage_system = storage_system
self.save_chunk_size = save_chunk_size
self.default_storage_type = default_storage_type
def init_app(self, system_app: SystemApp):
"""Initialize the application."""
@ -523,7 +540,7 @@ class FileStorageClient(BaseComponent):
self,
bucket: str,
file_path: str,
storage_type: str,
storage_type: Optional[str] = None,
custom_metadata: Optional[Dict[str, Any]] = None,
file_id: Optional[str] = None,
) -> str:
@ -556,7 +573,7 @@ class FileStorageClient(BaseComponent):
bucket: str,
file_name: str,
file_data: BinaryIO,
storage_type: str,
storage_type: Optional[str] = None,
custom_metadata: Optional[Dict[str, Any]] = None,
file_id: Optional[str] = None,
) -> str:
@ -575,12 +592,20 @@ class FileStorageClient(BaseComponent):
Returns:
str: The file URI
"""
if not storage_type:
storage_type = self.default_storage_type
if not storage_type:
raise ValueError("Storage type not provided")
return self.storage_system.save_file(
bucket, file_name, file_data, storage_type, custom_metadata, file_id
)
def download_file(
self, uri: str, dest_path: Optional[str] = None, dest_dir: Optional[str] = None
self,
uri: str,
dest_path: Optional[str] = None,
dest_dir: Optional[str] = None,
cache: bool = True,
) -> Tuple[str, FileMetadata]:
"""Download a file from the storage system.
@ -595,6 +620,7 @@ class FileStorageClient(BaseComponent):
uri (str): The file URI
dest_path (str, optional): The destination path. Defaults to None.
dest_dir (str, optional): The destination directory. Defaults to None.
cache (bool, optional): Whether to cache the file. Defaults to True.
Raises:
FileNotFoundError: If the file is not found
@ -617,7 +643,7 @@ class FileStorageClient(BaseComponent):
os.makedirs(base_path, exist_ok=True)
target_path = os.path.join(base_path, file_metadata.file_id + extension)
file_hash = file_metadata.file_hash
if os.path.exists(target_path):
if os.path.exists(target_path) and cache:
logger.debug(f"File {target_path} already exists, begin hash check")
with open(target_path, "rb") as f:
if file_hash == calculate_file_hash(f, self.save_chunk_size):
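
With default_storage_type in place, callers can omit storage_type and the client falls back to the configured default backend (or the first registered one). A minimal usage sketch, assuming the usual BaseComponent.get_instance helper and the upload_file method seen at the call sites in this commit; the system_app, bucket, path, and metadata are placeholders:

from dbgpt.core.interface.file import FileStorageClient

# system_app is an already-initialized SystemApp (assumed available here)
client = FileStorageClient.get_instance(system_app)

# storage_type is now optional; the default backend configured under
# [[serves]] (e.g. "s3") is used when it is omitted.
uri = client.upload_file(
    bucket="my_bucket",
    file_path="/tmp/report.xlsx",
    custom_metadata={"owner": "demo"},
)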

View File

@ -253,6 +253,26 @@ class ModelScanner(Generic[T]):
for key, value in scanned_items.items():
self._registered_items[key] = value
child_items = {}
for key, value in self._registered_items.items():
if hasattr(value, "__scan_config__"):
_child_scanner = ModelScanner()
_child_config = value.__scan_config__
if not isinstance(_child_config, ScannerConfig):
continue
if (
hasattr(value, "__is_already_scanned__")
and value.__is_already_scanned__
):
continue
try:
_child_scanner.scan_and_register(_child_config)
child_items.update(_child_scanner.get_registered_items())
value.__is_already_scanned__ = True
except Exception as e:
logger.warning(f"Error scanning child module {key}: {str(e)}")
self._registered_items.update(child_items)
except ImportError as e:
logger.warning(f"Error importing module {config.module_path}: {str(e)}")

View File

@ -75,6 +75,13 @@ storage_chromadb = [
storage_elasticsearch = ["elasticsearch"]
storage_obvector = ["pyobvector"]
file_oss = [
"oss2" # Aliyun OSS
]
file_s3 = [
"boto3"
]
[tool.uv]
managed = true
dev-dependencies = [

View File

@ -0,0 +1,102 @@
from dataclasses import dataclass, field
from typing import Optional
from dbgpt.core.interface.file import StorageBackend, StorageBackendConfig
from dbgpt.util.i18n_utils import _
@dataclass
class OSSStorageConfig(StorageBackendConfig):
__type__ = "oss"
endpoint: str = field(
metadata={
"help": _(
"The endpoint of the OSS server. "
"e.g. https://oss-cn-hangzhou.aliyuncs.com"
)
},
)
region: str = field(
metadata={"help": _("The region of the OSS server. e.g. cn-hangzhou")},
)
access_key_id: Optional[str] = field(
default=None,
metadata={
"help": _(
"The access key ID of the OSS server. You can also set it in the "
"environment variable OSS_ACCESS_KEY_ID"
),
"tags": "privacy",
},
)
access_key_secret: Optional[str] = field(
default=None,
metadata={
"help": _(
"The access key secret of the OSS server. You can also set it in the "
"environment variable OSS_ACCESS_KEY_SECRET"
),
"tags": "privacy",
},
)
use_environment_credentials: Optional[bool] = field(
default=False,
metadata={
"help": _(
"Whether to use the environment variables OSS_ACCESS_KEY_ID and "
"OSS_ACCESS_KEY_SECRET as the credentials. Default is False."
),
},
)
fixed_bucket: Optional[str] = field(
default=None,
metadata={
"help": _(
"The fixed bucket name to use. If set, all logical buckets in DB-GPT "
"will be mapped to this bucket. We suggest you set this value to avoid "
"bucket name conflicts."
)
},
)
bucket_prefix: Optional[str] = field(
default="dbgpt-fs-",
metadata={
"help": _(
"The prefix of the bucket name. If set, all logical buckets in DB-GPT "
"will be prefixed with this value. Just work when fixed_bucket is None."
)
},
)
auto_create_bucket: Optional[bool] = field(
default=True,
metadata={
"help": _(
"Whether to create the bucket automatically if it does not exist. "
"If set to False, the bucket must exist before using it."
)
},
)
save_chunk_size: Optional[int] = field(
default=1024 * 1024,
metadata={
"help": _(
"The chunk size when saving the file. When the file is larger 10x than "
"this value, it will be uploaded in multiple parts. Default is 1M."
)
},
)
def create_storage(self) -> StorageBackend:
from .oss_storage import AliyunOSSStorage
return AliyunOSSStorage(
endpoint=self.endpoint,
region=self.region,
access_key_id=self.access_key_id,
access_key_secret=self.access_key_secret,
use_environment_credentials=self.use_environment_credentials,
fixed_bucket=self.fixed_bucket,
bucket_prefix=self.bucket_prefix,
auto_create_bucket=self.auto_create_bucket,
save_chunk_size=self.save_chunk_size,
)
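
A minimal usage sketch for this config; the import path is an assumption, only the fields and create_storage() come from the dataclass above, and the credentials and bucket name are placeholders:

from dbgpt_ext.storage.file.oss.config import OSSStorageConfig  # import path assumed

config = OSSStorageConfig(
    endpoint="https://oss-cn-hangzhou.aliyuncs.com",
    region="cn-hangzhou",
    access_key_id="your-access-key-id",          # or set OSS_ACCESS_KEY_ID and
    access_key_secret="your-access-key-secret",  # use_environment_credentials=True
    fixed_bucket="my-dbgpt-files",  # map all logical buckets into one real bucket
)
storage = config.create_storage()  # returns an AliyunOSSStorage backend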

View File

@ -0,0 +1,484 @@
"""Aliyun OSS storage backend."""
import hashlib
import io
import logging
import os
import random
import time
from typing import BinaryIO, Callable, Dict, Optional, Union
import oss2
from oss2.credentials import EnvironmentVariableCredentialsProvider
from dbgpt.core.interface.file import FileMetadata, StorageBackend
logger = logging.getLogger(__name__)
def does_bucket_exist(bucket: oss2.Bucket) -> bool:
    """Return True if the given OSS bucket exists."""
    try:
        bucket.get_bucket_info()
    except oss2.exceptions.NoSuchBucket:
        return False
    return True
class AliyunOSSStorage(StorageBackend):
"""Aliyun OSS storage backend implementation."""
storage_type: str = "oss"
def __init__(
self,
endpoint: str,
region: str,
access_key_id: Optional[str] = None,
access_key_secret: Optional[str] = None,
save_chunk_size: int = 1024 * 1024,
use_environment_credentials: bool = False,
fixed_bucket: Optional[str] = None,
bucket_prefix: str = "dbgpt-fs-",
bucket_mapper: Optional[Callable[[str], str]] = None,
auto_create_bucket: bool = True,
):
"""Initialize the Aliyun OSS storage backend.
Args:
endpoint (str): OSS endpoint, e.g., "https://oss-cn-hangzhou.aliyuncs.com"
region (str): OSS region, e.g., "cn-hangzhou"
access_key_id (Optional[str], optional): Aliyun Access Key ID. Defaults to
None.
access_key_secret (Optional[str], optional): Aliyun Access Key Secret.
Defaults to None.
save_chunk_size (int, optional): Chunk size for saving files. Defaults to
1024*1024 (1MB).
use_environment_credentials (bool, optional): Whether to use credentials
from environment variables. Defaults to False.
fixed_bucket (Optional[str], optional): A fixed OSS bucket to use for all
operations. If provided, all logical buckets will be mapped to this
single bucket. Defaults to None.
bucket_prefix (str, optional): Prefix for dynamically created buckets.
Defaults to "dbgpt-fs-".
bucket_mapper (Optional[Callable[[str], str]], optional): Custom function
to map logical bucket names to actual OSS bucket names. Defaults to
None.
auto_create_bucket (bool, optional): Whether to automatically create
buckets that don't exist. Defaults to True.
"""
self.endpoint = endpoint
self.region = region
self._save_chunk_size = save_chunk_size
self.fixed_bucket = fixed_bucket
self.bucket_prefix = bucket_prefix
self.custom_bucket_mapper = bucket_mapper
self.auto_create_bucket = auto_create_bucket
# Initialize OSS authentication
if use_environment_credentials:
# Check required environment variables
required_env_vars = ["OSS_ACCESS_KEY_ID", "OSS_ACCESS_KEY_SECRET"]
for var in required_env_vars:
if var not in os.environ:
raise ValueError(f"Environment variable {var} is not set.")
self.auth = oss2.ProviderAuthV4(EnvironmentVariableCredentialsProvider())
else:
if not access_key_id or not access_key_secret:
raise ValueError(
"Access key ID and secret are required when not using environment "
"credentials"
)
# Use provided credentials
self.auth = oss2.Auth(access_key_id, access_key_secret)
# Store buckets dict to avoid recreating bucket objects
self._buckets: Dict[str, oss2.Bucket] = {}
# Create fixed bucket if specified
if self.fixed_bucket and self.auto_create_bucket:
self._ensure_bucket_exists(self.fixed_bucket)
@property
def save_chunk_size(self) -> int:
"""Get the save chunk size."""
return self._save_chunk_size
def _map_bucket_name(self, logical_bucket: str) -> str:
"""Map logical bucket name to actual OSS bucket name.
Args:
logical_bucket (str): Logical bucket name used by the application
Returns:
str: Actual OSS bucket name to use
"""
# 1. If using a fixed bucket, always return that
if self.fixed_bucket:
return self.fixed_bucket
# 2. If a custom mapper is provided, use that
if self.custom_bucket_mapper:
return self.custom_bucket_mapper(logical_bucket)
# 3. Otherwise, use a hash-based approach to generate a unique but
# deterministic name
# This avoids bucket name conflicts while maintaining consistency
bucket_hash = hashlib.md5(logical_bucket.encode()).hexdigest()[:8]
return f"{self.bucket_prefix}{bucket_hash}-{logical_bucket}"
def _generate_dynamic_bucket_name(self) -> str:
"""Generate a unique bucket name for dynamic creation.
Returns:
str: A unique bucket name
"""
# Using timestamp + random number to ensure uniqueness
timestamp = int(time.time())
random_number = random.randint(0, 9999)
return f"{self.bucket_prefix}{timestamp}-{random_number}"
def _ensure_bucket_exists(self, bucket_name: str) -> bool:
"""Ensure the bucket exists, create it if needed and if auto_create_bucket is
True.
Args:
bucket_name (str): Bucket name
Returns:
bool: True if the bucket exists or was created, False otherwise
"""
bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name, region=self.region)
try:
if does_bucket_exist(bucket):
return True
if not self.auto_create_bucket:
logger.warning(
f"Bucket {bucket_name} does not exist and auto_create_bucket is "
f"False"
)
return False
logger.info(f"Creating bucket {bucket_name}")
bucket.create_bucket(oss2.models.BUCKET_ACL_PRIVATE)
return True
except oss2.exceptions.ServerError as e:
# Handle the case where bucket name is already taken by someone else
if e.status == 409 and "BucketAlreadyExists" in str(e):
logger.warning(
f"Bucket name {bucket_name} already exists and is owned by "
"someone else"
)
return False
raise
except oss2.exceptions.OssError as e:
logger.error(f"Failed to create or check bucket {bucket_name}: {e}")
raise
def _get_bucket(self, logical_bucket: str) -> Union[oss2.Bucket, None]:
"""Get or create an OSS bucket object for the given logical bucket.
Args:
logical_bucket (str): Logical bucket name
Returns:
Union[oss2.Bucket, None]: Bucket object or None if bucket creation failed
"""
# Get the actual OSS bucket name
actual_bucket_name = self._map_bucket_name(logical_bucket)
# Check if we've already cached this bucket
if actual_bucket_name in self._buckets:
return self._buckets[actual_bucket_name]
# Try to ensure the mapped bucket exists
if self._ensure_bucket_exists(actual_bucket_name):
# Cache and return the bucket
self._buckets[actual_bucket_name] = oss2.Bucket(
self.auth, self.endpoint, actual_bucket_name, region=self.region
)
return self._buckets[actual_bucket_name]
# If we get here, the bucket doesn't exist and couldn't be created
# Try to create a dynamic bucket if we're not using a fixed bucket
if not self.fixed_bucket and self.auto_create_bucket:
# Generate a new unique bucket name
dynamic_bucket = self._generate_dynamic_bucket_name()
logger.info(
f"Attempting to create dynamic bucket {dynamic_bucket} for logical "
f"bucket {logical_bucket}"
)
if self._ensure_bucket_exists(dynamic_bucket):
self._buckets[actual_bucket_name] = oss2.Bucket(
self.auth, self.endpoint, dynamic_bucket, region=self.region
)
return self._buckets[actual_bucket_name]
# If all attempts failed
raise ValueError(
f"Failed to get or create bucket for logical bucket {logical_bucket}"
)
def save(self, bucket: str, file_id: str, file_data: BinaryIO) -> str:
"""Save the file data to Aliyun OSS.
Args:
bucket (str): The logical bucket name
file_id (str): The file ID
file_data (BinaryIO): The file data
Returns:
str: The storage path (OSS URI)
"""
# Get the actual OSS bucket
oss_bucket = self._get_bucket(bucket)
# Generate OSS object name based on whether we're using fixed bucket
object_name = file_id
if self.fixed_bucket:
# When using a fixed bucket, we need to prefix with logical bucket name to
# avoid conflicts
object_name = f"{bucket}/{file_id}"
# For large files, use multipart upload
file_size = self._get_file_size(file_data)
if file_size > 10 * self.save_chunk_size: # If file is larger than 10MB
logger.info(
f"Using multipart upload for large file: {object_name} "
f"(size: {file_size})"
)
self._multipart_upload(oss_bucket, object_name, file_data)
else:
logger.info(f"Uploading file using simple upload: {object_name}")
try:
oss_bucket.put_object(object_name, file_data)
except oss2.exceptions.OssError as e:
logger.error(
f"Failed to upload file {object_name} to bucket "
f"{oss_bucket.bucket_name}: {e}"
)
raise
# Store the OSS bucket name and object path for future reference
actual_bucket_name = oss_bucket.bucket_name
# Format: oss://{actual_bucket_name}/{object_name}
# We store both the actual bucket name and the object path in the URI
# But we'll also keep the logical bucket in the external URI format
return f"oss://{bucket}/{file_id}?actual_bucket={actual_bucket_name}&object_name={object_name}" # noqa
def _get_file_size(self, file_data: BinaryIO) -> int:
"""Get file size without consuming the file object.
Args:
file_data (BinaryIO): The file data
Returns:
int: The file size in bytes
"""
current_pos = file_data.tell()
file_data.seek(0, io.SEEK_END)
size = file_data.tell()
file_data.seek(current_pos) # Reset the file pointer
return size
def _multipart_upload(
self, oss_bucket: oss2.Bucket, file_id: str, file_data: BinaryIO
) -> None:
"""Handle multipart upload for large files.
Args:
oss_bucket (oss2.Bucket): OSS bucket object
file_id (str): The file ID
file_data (BinaryIO): The file data
"""
# Initialize multipart upload
upload_id = oss_bucket.init_multipart_upload(file_id).upload_id
# Upload parts
part_number = 1
parts = []
while True:
chunk = file_data.read(self.save_chunk_size)
if not chunk:
break
# Upload part
etag = oss_bucket.upload_part(file_id, upload_id, part_number, chunk).etag
parts.append(oss2.models.PartInfo(part_number, etag))
part_number += 1
# Complete multipart upload
oss_bucket.complete_multipart_upload(file_id, upload_id, parts)
def _parse_storage_path(self, storage_path: str) -> Dict[str, str]:
"""Parse the OSS storage path to extract actual bucket and object name.
Args:
storage_path (str): The storage path URI
Returns:
Dict[str, str]: A dictionary with actual_bucket and object_name keys
"""
if not storage_path.startswith("oss://"):
raise ValueError(f"Invalid storage path for Aliyun OSS: {storage_path}")
# Example URI:
# oss://logical_bucket/file_id?actual_bucket=oss_bucket&object_name=logical_bucket/file_id # noqa
# Try to parse the URL parameters
from urllib.parse import parse_qs, urlparse
parsed_url = urlparse(storage_path)
params = parse_qs(parsed_url.query)
# Extract the parameters
actual_bucket = params.get("actual_bucket", [None])[0]
object_name = params.get("object_name", [None])[0]
# Extract the logical bucket and file_id from the path
path_parts = parsed_url.path.strip("/").split("/", 1)
logical_bucket = path_parts[0] if path_parts else None
logical_file_id = path_parts[1] if len(path_parts) > 1 else None
# If parameters aren't in the URL (backward compatibility or simplified URL),
# derive them from the logical values
if not actual_bucket:
# Try to use the bucket mapper to get the actual bucket
actual_bucket = (
self._map_bucket_name(logical_bucket) if logical_bucket else None
)
if not object_name:
# If using fixed bucket, the object name includes the logical bucket
# as prefix
if self.fixed_bucket:
object_name = (
f"{logical_bucket}/{logical_file_id}"
if logical_bucket and logical_file_id
else None
)
else:
object_name = logical_file_id
return {
"logical_bucket": logical_bucket,
"logical_file_id": logical_file_id,
"actual_bucket": actual_bucket,
"object_name": object_name,
}
def load(self, fm: FileMetadata) -> BinaryIO:
"""Load the file data from Aliyun OSS.
Args:
fm (FileMetadata): The file metadata
Returns:
BinaryIO: The file data as a binary IO object
"""
# Parse the storage path
path_info = self._parse_storage_path(fm.storage_path)
# Get actual bucket and object name
actual_bucket_name = path_info["actual_bucket"]
object_name = path_info["object_name"]
logical_bucket = path_info["logical_bucket"]
# If we couldn't determine the actual bucket from the URI, try with the
# logical bucket
if not actual_bucket_name and logical_bucket:
actual_bucket_name = self._map_bucket_name(logical_bucket)
# Use the file_id as object name if object_name is still None
if not object_name:
object_name = fm.file_id
# If using fixed bucket, prefix with logical bucket
if self.fixed_bucket and logical_bucket:
object_name = f"{logical_bucket}/{fm.file_id}"
# Get the bucket object
try:
oss_bucket = oss2.Bucket(
self.auth, self.endpoint, actual_bucket_name, region=self.region
)
# Get object as stream
object_stream = oss_bucket.get_object(object_name)
# Convert to BytesIO for compatibility
content = io.BytesIO(object_stream.read())
content.seek(0)
return content
except oss2.exceptions.NoSuchKey as e:
logger.error(
f"File {object_name} not found in bucket {actual_bucket_name}: {e}"
)
raise FileNotFoundError(
f"File {object_name} not found in bucket {actual_bucket_name}"
)
except oss2.exceptions.OssError as e:
logger.error(
f"Failed to download file {object_name} from bucket "
f"{actual_bucket_name}: {e}"
)
raise
def delete(self, fm: FileMetadata) -> bool:
"""Delete the file data from Aliyun OSS.
Args:
fm (FileMetadata): The file metadata
Returns:
bool: True if the file was deleted, False otherwise
"""
# Parse the storage path
path_info = self._parse_storage_path(fm.storage_path)
# Get actual bucket and object name
actual_bucket_name = path_info["actual_bucket"]
object_name = path_info["object_name"]
logical_bucket = path_info["logical_bucket"]
# If we couldn't determine the actual bucket from the URI, try with the
# logical bucket
if not actual_bucket_name and logical_bucket:
actual_bucket_name = self._map_bucket_name(logical_bucket)
# Use the file_id as object name if object_name is still None
if not object_name:
object_name = fm.file_id
# If using fixed bucket, prefix with logical bucket
if self.fixed_bucket and logical_bucket:
object_name = f"{logical_bucket}/{fm.file_id}"
try:
# Get the bucket object
oss_bucket = oss2.Bucket(
self.auth, self.endpoint, actual_bucket_name, region=self.region
)
# Check if the object exists
if not oss_bucket.object_exists(object_name):
logger.warning(
f"File {object_name} does not exist in bucket {actual_bucket_name}"
)
return False
# Delete the object
oss_bucket.delete_object(object_name)
logger.info(f"File {object_name} deleted from bucket {actual_bucket_name}")
return True
except oss2.exceptions.OssError as e:
logger.error(
f"Failed to delete file {object_name} from bucket {actual_bucket_name}:"
f" {e}"
)
return False
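
Used directly (outside of the serve configuration), the backend can be exercised like this; a sketch with placeholder credentials and the module path assumed:

import io

from dbgpt_ext.storage.file.oss.oss_storage import AliyunOSSStorage  # path assumed

storage = AliyunOSSStorage(
    endpoint="https://oss-cn-hangzhou.aliyuncs.com",
    region="cn-hangzhou",
    access_key_id="your-access-key-id",
    access_key_secret="your-access-key-secret",
    fixed_bucket="my-dbgpt-files",
)

# With fixed_bucket set, the object is stored as "my_bucket/file-123" inside
# the single real bucket, and save() returns a URI like:
# oss://my_bucket/file-123?actual_bucket=my-dbgpt-files&object_name=my_bucket/file-123
uri = storage.save("my_bucket", "file-123", io.BytesIO(b"hello oss"))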

View File

@ -0,0 +1,118 @@
from dataclasses import dataclass, field
from typing import Any, Dict, Optional
from dbgpt.core.interface.file import StorageBackend, StorageBackendConfig
from dbgpt.util.i18n_utils import _
@dataclass
class S3StorageConfig(StorageBackendConfig):
__type__ = "s3"
endpoint: str = field(
metadata={
"help": _(
"The endpoint of the s3 server. e.g. https://s3.us-east-1.amazonaws.com"
)
},
)
region: str = field(
metadata={"help": _("The region of the s3 server. e.g. us-east-1")},
)
access_key_id: Optional[str] = field(
default=None,
metadata={
"help": _(
"The access key ID of the s3 server. You can also set it in the "
"environment variable AWS_ACCESS_KEY_ID"
),
"tags": "privacy",
},
)
access_key_secret: Optional[str] = field(
default=None,
metadata={
"help": _(
"The access key secret of the s3 server. You can also set it in the "
"environment variable AWS_SECRET_ACCESS_KEY"
),
"tags": "privacy",
},
)
use_environment_credentials: Optional[bool] = field(
default=False,
metadata={
"help": _(
"Whether to use the environment variables AWS_ACCESS_KEY_ID and "
"AWS_SECRET_ACCESS_KEY as the credentials. Default is False."
),
},
)
fixed_bucket: Optional[str] = field(
default=None,
metadata={
"help": _(
"The fixed bucket name to use. If set, all logical buckets in DB-GPT "
"will be mapped to this bucket. We suggest you set this value to avoid "
"bucket name conflicts."
)
},
)
bucket_prefix: Optional[str] = field(
default="dbgpt-fs-",
metadata={
"help": _(
"The prefix of the bucket name. If set, all logical buckets in DB-GPT "
"will be prefixed with this value. Just work when fixed_bucket is None."
)
},
)
auto_create_bucket: Optional[bool] = field(
default=True,
metadata={
"help": _(
"Whether to create the bucket automatically if it does not exist. "
"If set to False, the bucket must exist before using it."
)
},
)
save_chunk_size: Optional[int] = field(
default=1024 * 1024,
metadata={
"help": _(
"The chunk size when saving the file. When the file is larger 10x than "
"this value, it will be uploaded in multiple parts. Default is 1M."
)
},
)
signature_version: Optional[str] = field(
default=None,
metadata={
"help": _(
"The signature version of the s3 server. "
"e.g. s3v4, s3v2, None (default)"
)
},
)
s3_config: Optional[Dict[str, Any]] = field(
default_factory=dict,
metadata={
"help": _("The additional configuration for the S3 client."),
},
)
def create_storage(self) -> StorageBackend:
from .s3_storage import S3Storage
return S3Storage(
endpoint_url=self.endpoint,
region_name=self.region,
access_key_id=self.access_key_id,
secret_access_key=self.access_key_secret,
use_environment_credentials=self.use_environment_credentials,
fixed_bucket=self.fixed_bucket,
bucket_prefix=self.bucket_prefix,
auto_create_bucket=self.auto_create_bucket,
save_chunk_size=self.save_chunk_size,
signature_version=self.signature_version,
s3_config=self.s3_config,
)
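
The same config works against any S3-compatible service, e.g. the Tencent COS endpoint from the TOML example at the top of this commit; a sketch with the import path assumed and placeholder credentials:

from dbgpt_ext.storage.file.s3.config import S3StorageConfig  # import path assumed

config = S3StorageConfig(
    endpoint="https://cos.ap-beijing.myqcloud.com",
    region="ap-beijing",
    access_key_id="your-secret-id",
    access_key_secret="your-secret-key",
    fixed_bucket="my-dbgpt-files",
    signature_version="s3v4",  # optional; some providers need an explicit version
)
storage = config.create_storage()  # returns an S3Storage backend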

View File

@ -0,0 +1,589 @@
"""S3 compatible storage backend."""
import hashlib
import io
import logging
import os
import random
import time
from typing import BinaryIO, Callable, Dict, Optional, Union
from urllib.parse import parse_qs, urlparse
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError
from dbgpt.core.interface.file import FileMetadata, StorageBackend
logger = logging.getLogger(__name__)
class S3Storage(StorageBackend):
"""S3 compatible storage backend implementation."""
storage_type: str = "s3"
def __init__(
self,
endpoint_url: str,
region_name: str,
access_key_id: str,
secret_access_key: str,
save_chunk_size: int = 1024 * 1024,
use_environment_credentials: bool = False,
fixed_bucket: Optional[str] = None,
bucket_prefix: str = "dbgpt-fs-",
bucket_mapper: Optional[Callable[[str], str]] = None,
auto_create_bucket: bool = True,
signature_version: Optional[str] = None,
s3_config: Optional[Dict[str, Union[str, int]]] = None,
):
"""Initialize the S3 compatible storage backend.
Args:
endpoint_url (str): S3 endpoint URL, e.g.,
"https://s3.us-east-1.amazonaws.com"
region_name (str): S3 region, e.g., "us-east-1"
access_key_id (str): AWS/S3 Access Key ID
secret_access_key (str): AWS/S3 Secret Access Key
save_chunk_size (int, optional): Chunk size for saving files. Defaults to
1024*1024 (1MB).
use_environment_credentials (bool, optional): Whether to use credentials
from environment variables. Defaults to False.
fixed_bucket (Optional[str], optional): A fixed S3 bucket to use for all
operations. If provided, all logical buckets will be mapped to this
single bucket. Defaults to None.
bucket_prefix (str, optional): Prefix for dynamically created buckets.
Defaults to "dbgpt-fs-".
bucket_mapper (Optional[Callable[[str], str]], optional): Custom function
to map logical bucket names to actual S3 bucket names. Defaults to None.
auto_create_bucket (bool, optional): Whether to automatically create
buckets that don't exist. Defaults to True.
signature_version (str, optional): S3 signature version to use.
s3_config (Optional[Dict[str, Union[str, int]]], optional): Additional
S3 configuration options. Defaults to None.
"""
self.endpoint_url = endpoint_url
self.region_name = region_name
self._save_chunk_size = save_chunk_size
self.fixed_bucket = fixed_bucket
self.bucket_prefix = bucket_prefix
self.custom_bucket_mapper = bucket_mapper
self.auto_create_bucket = auto_create_bucket
self.signature_version = signature_version
# Build S3 client configuration
if not s3_config:
s3_config = {
"s3": {
# Use virtual addressing style
"addressing_style": "virtual",
},
"signature_version": signature_version or "v4",
}
if "request_checksum_calculation" not in s3_config:
s3_config["request_checksum_calculation"] = "when_required"
if "response_checksum_validation" not in s3_config:
s3_config["response_checksum_validation"] = "when_required"
config = Config(**s3_config)
# Initialize S3 authentication
if use_environment_credentials:
# Check required environment variables
required_env_vars = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"]
for var in required_env_vars:
if var not in os.environ:
raise ValueError(f"Environment variable {var} is not set.")
# Use environment credentials
self.s3_client = boto3.client(
"s3",
endpoint_url=self.endpoint_url,
region_name=self.region_name,
config=config,
)
else:
if not access_key_id or not secret_access_key:
raise ValueError(
"Access key ID and secret are required when not using environment "
"credentials"
)
# Use provided credentials
self.s3_client = boto3.client(
"s3",
endpoint_url=self.endpoint_url,
region_name=self.region_name,
aws_access_key_id=access_key_id,
aws_secret_access_key=secret_access_key,
config=config,
)
# Create fixed bucket if specified
if self.fixed_bucket and self.auto_create_bucket:
self._ensure_bucket_exists(self.fixed_bucket)
@property
def save_chunk_size(self) -> int:
"""Get the save chunk size."""
return self._save_chunk_size
def _map_bucket_name(self, logical_bucket: str) -> str:
"""Map logical bucket name to actual S3 bucket name.
Args:
logical_bucket (str): Logical bucket name used by the application
Returns:
str: Actual S3 bucket name to use
"""
# 1. If using a fixed bucket, always return that
if self.fixed_bucket:
return self.fixed_bucket
# 2. If a custom mapper is provided, use that
if self.custom_bucket_mapper:
return self.custom_bucket_mapper(logical_bucket)
# 3. Otherwise, use a hash-based approach to generate a unique but
# deterministic name
# This avoids bucket name conflicts while maintaining consistency
bucket_hash = hashlib.md5(logical_bucket.encode()).hexdigest()[:8]
return f"{self.bucket_prefix}{bucket_hash}-{logical_bucket}"
def _generate_dynamic_bucket_name(self) -> str:
"""Generate a unique bucket name for dynamic creation.
Returns:
str: A unique bucket name
"""
# Using timestamp + random number to ensure uniqueness
timestamp = int(time.time())
random_number = random.randint(0, 9999)
return f"{self.bucket_prefix}{timestamp}-{random_number}"
def _ensure_bucket_exists(self, bucket_name: str) -> bool:
"""Ensure the bucket exists, create it if needed and if auto_create_bucket is
True.
Args:
bucket_name (str): Bucket name
Returns:
bool: True if the bucket exists or was created, False otherwise
"""
try:
# Check if bucket exists
self.s3_client.head_bucket(Bucket=bucket_name)
logger.info(f"Bucket {bucket_name} exists")
return True
except ClientError as e:
error_code = e.response.get("Error", {}).get("Code")
error_msg = str(e)
logger.info(
f"Bucket check failed with error_code={error_code}, msg={error_msg}"
)
# Bucket doesn't exist or we don't have permission to access it
if error_code in ["404", "403", "NoSuchBucket", "Forbidden"]:
if not self.auto_create_bucket:
logger.warning(
f"Bucket {bucket_name} does not exist and auto_create_bucket "
"is False"
)
return False
# Create bucket
try:
logger.info(f"Creating bucket {bucket_name}")
# Try different creation methods to adapt to different
# S3-compatible APIs
creation_methods = [
# Method 1: Use LocationConstraint
lambda: self.s3_client.create_bucket(
Bucket=bucket_name,
CreateBucketConfiguration={
"LocationConstraint": self.region_name
},
),
# Method 2: Without LocationConstraint
lambda: self.s3_client.create_bucket(Bucket=bucket_name),
# Method 3: Use empty CreateBucketConfiguration
lambda: self.s3_client.create_bucket(
Bucket=bucket_name, CreateBucketConfiguration={}
),
]
# Try different creation methods
last_error = None
for create_method in creation_methods:
try:
create_method()
logger.info(f"Successfully created bucket {bucket_name}")
return True
except ClientError as method_error:
logger.info(
f"Bucket creation method failed: {method_error}"
)
last_error = method_error
continue
# If all methods failed, raise the last error
if last_error:
raise last_error
return False
except ClientError as create_error:
# Handle the case where bucket name is already taken by someone else
logger.error(
f"Failed to create bucket {bucket_name}: {create_error}"
)
if "BucketAlreadyExists" in str(create_error):
logger.warning(
f"Bucket name {bucket_name} already exists and is owned by "
"someone else"
)
return False
else:
# Some other error
logger.error(f"Failed to check bucket {bucket_name}: {e}")
return False
def save(self, bucket: str, file_id: str, file_data: BinaryIO) -> str:
"""Save the file data to S3.
Args:
bucket (str): The logical bucket name
file_id (str): The file ID
file_data (BinaryIO): The file data
Returns:
str: The storage path (S3 URI)
"""
# Get the actual S3 bucket
actual_bucket_name = self._map_bucket_name(bucket)
logger.info(
f"Mapped logical bucket '{bucket}' to actual bucket '{actual_bucket_name}'"
)
# Ensure bucket exists
bucket_exists = self._ensure_bucket_exists(actual_bucket_name)
if not bucket_exists:
logger.warning(
f"Could not ensure bucket {actual_bucket_name} exists, trying "
"alternatives"
)
# Try to create a dynamic bucket if we're not using a fixed bucket
if not self.fixed_bucket and self.auto_create_bucket:
dynamic_bucket = self._generate_dynamic_bucket_name()
logger.info(
f"Attempting to create dynamic bucket {dynamic_bucket} for logical "
f"bucket {bucket}"
)
if self._ensure_bucket_exists(dynamic_bucket):
logger.info(f"Successfully created dynamic bucket {dynamic_bucket}")
actual_bucket_name = dynamic_bucket
else:
error_msg = (
f"Failed to get or create bucket for logical bucket {bucket}"
)
logger.error(error_msg)
raise ValueError(error_msg)
else:
error_msg = (
f"Failed to get or create bucket for logical bucket {bucket}"
)
logger.error(error_msg)
raise ValueError(error_msg)
# Generate S3 object key based on whether we're using fixed bucket
object_key = file_id
if self.fixed_bucket:
# When using a fixed bucket, we need to prefix with logical bucket name to
# avoid conflicts
object_key = f"{bucket}/{file_id}"
# For large files, use multipart upload
file_size = self._get_file_size(file_data)
if file_size > 10 * self.save_chunk_size: # If file is larger than 10MB
logger.info(
f"Using multipart upload for large file: {object_key} "
f"(size: {file_size})"
)
self._multipart_upload(actual_bucket_name, object_key, file_data)
else:
logger.info(f"Uploading file using simple upload: {object_key}")
try:
# Reset the file pointer to the beginning
file_data.seek(0)
# Read the file content into memory
file_content = file_data.read()
# Use put_object for small files
self.s3_client.put_object(
Bucket=actual_bucket_name, Key=object_key, Body=file_content
)
except ClientError as e:
logger.error(
f"Failed to upload file {object_key} to bucket "
f"{actual_bucket_name}: {e}"
)
raise
# Format: s3://{logical_bucket}/{file_id}?actual_bucket={actual_bucket_name}&object_key={object_key} # noqa
return f"s3://{bucket}/{file_id}?actual_bucket={actual_bucket_name}&object_key={object_key}" # noqa
def _get_file_size(self, file_data: BinaryIO) -> int:
"""Get file size without consuming the file object.
Args:
file_data (BinaryIO): The file data
Returns:
int: The file size in bytes
"""
current_pos = file_data.tell()
file_data.seek(0, io.SEEK_END)
size = file_data.tell()
file_data.seek(current_pos) # Reset the file pointer
return size
def _multipart_upload(
self, bucket_name: str, object_key: str, file_data: BinaryIO
) -> None:
"""Handle multipart upload for large files.
Args:
bucket_name (str): S3 bucket name
object_key (str): The object key (file path in S3)
file_data (BinaryIO): The file data
"""
# Initialize multipart upload
try:
mpu = self.s3_client.create_multipart_upload(
Bucket=bucket_name, Key=object_key
)
upload_id = mpu["UploadId"]
# Upload parts
part_number = 1
parts = []
file_data.seek(0) # Make sure we're at the beginning of the file
while True:
# Read the chunk
chunk = file_data.read(self.save_chunk_size)
if not chunk:
break
# Upload the part
response = self.s3_client.upload_part(
Bucket=bucket_name,
Key=object_key,
UploadId=upload_id,
PartNumber=part_number,
Body=chunk,
)
parts.append({"PartNumber": part_number, "ETag": response["ETag"]})
part_number += 1
# Complete multipart upload
self.s3_client.complete_multipart_upload(
Bucket=bucket_name,
Key=object_key,
UploadId=upload_id,
MultipartUpload={"Parts": parts},
)
except ClientError as e:
logger.error(f"Error in multipart upload: {e}")
# Attempt to abort the multipart upload if it was initialized
if "upload_id" in locals():
try:
self.s3_client.abort_multipart_upload(
Bucket=bucket_name, Key=object_key, UploadId=upload_id
)
except ClientError as abort_error:
logger.error(f"Error aborting multipart upload: {abort_error}")
raise
def _parse_storage_path(self, storage_path: str) -> Dict[str, str]:
"""Parse the S3 storage path to extract actual bucket and object key.
Args:
storage_path (str): The storage path URI
Returns:
Dict[str, str]: A dictionary with actual_bucket and object_key keys
"""
if not storage_path.startswith("s3://"):
raise ValueError(f"Invalid storage path for S3: {storage_path}")
# Example URI:
# s3://logical_bucket/file_id?actual_bucket=s3_bucket&object_key=logical_bucket/file_id # noqa
# Parse the URL
parsed_url = urlparse(storage_path)
params = parse_qs(parsed_url.query)
# Extract the parameters
actual_bucket = params.get("actual_bucket", [None])[0]
object_key = params.get("object_key", [None])[0]
# Extract the logical bucket and file_id from the path
path_parts = parsed_url.path.strip("/").split("/", 1)
logical_bucket = path_parts[0] if path_parts else None
logical_file_id = path_parts[1] if len(path_parts) > 1 else None
# If parameters aren't in the URL (backward compatibility or simplified URL),
# derive them from the logical values
if not actual_bucket:
# Try to use the bucket mapper to get the actual bucket
actual_bucket = (
self._map_bucket_name(logical_bucket) if logical_bucket else None
)
if not object_key:
# If using fixed bucket, the object key includes the logical bucket
# as prefix
if self.fixed_bucket:
object_key = (
f"{logical_bucket}/{logical_file_id}"
if logical_bucket and logical_file_id
else None
)
else:
object_key = logical_file_id
return {
"logical_bucket": logical_bucket,
"logical_file_id": logical_file_id,
"actual_bucket": actual_bucket,
"object_key": object_key,
}
def load(self, fm: FileMetadata) -> BinaryIO:
"""Load the file data from S3.
Args:
fm (FileMetadata): The file metadata
Returns:
BinaryIO: The file data as a binary IO object
"""
# Parse the storage path
path_info = self._parse_storage_path(fm.storage_path)
# Get actual bucket and object key
actual_bucket_name = path_info["actual_bucket"]
object_key = path_info["object_key"]
logical_bucket = path_info["logical_bucket"]
# If we couldn't determine the actual bucket from the URI, try with the
# logical bucket
if not actual_bucket_name and logical_bucket:
actual_bucket_name = self._map_bucket_name(logical_bucket)
# Use the file_id as object key if object_key is still None
if not object_key:
object_key = fm.file_id
# If using fixed bucket, prefix with logical bucket
if self.fixed_bucket and logical_bucket:
object_key = f"{logical_bucket}/{fm.file_id}"
try:
# Get object from S3
response = self.s3_client.get_object(
Bucket=actual_bucket_name, Key=object_key
)
# Read the streaming body into a BytesIO object
content = io.BytesIO()
body = response["Body"]
# Stream the data in chunks
while True:
chunk = body.read(self.save_chunk_size)
if not chunk:
break
content.write(chunk)
content.seek(0)
return content
except ClientError as e:
error_code = e.response.get("Error", {}).get("Code")
if error_code == "NoSuchKey":
logger.error(
f"File {object_key} not found in bucket {actual_bucket_name}: {e}"
)
raise FileNotFoundError(
f"File {object_key} not found in bucket {actual_bucket_name}"
)
logger.error(
f"Failed to download file {object_key} from bucket "
f"{actual_bucket_name}: {e}"
)
raise
def delete(self, fm: FileMetadata) -> bool:
"""Delete the file data from S3.
Args:
fm (FileMetadata): The file metadata
Returns:
bool: True if the file was deleted, False otherwise
"""
# Parse the storage path
path_info = self._parse_storage_path(fm.storage_path)
# Get actual bucket and object key
actual_bucket_name = path_info["actual_bucket"]
object_key = path_info["object_key"]
logical_bucket = path_info["logical_bucket"]
# If we couldn't determine the actual bucket from the URI, try with the
# logical bucket
if not actual_bucket_name and logical_bucket:
actual_bucket_name = self._map_bucket_name(logical_bucket)
# Use the file_id as object key if object_key is still None
if not object_key:
object_key = fm.file_id
# If using fixed bucket, prefix with logical bucket
if self.fixed_bucket and logical_bucket:
object_key = f"{logical_bucket}/{fm.file_id}"
try:
# Check if the object exists
try:
self.s3_client.head_object(Bucket=actual_bucket_name, Key=object_key)
except ClientError as e:
error_code = e.response.get("Error", {}).get("Code")
if error_code == "404" or error_code == "NoSuchKey":
logger.warning(
f"File {object_key} does not exist in bucket "
f"{actual_bucket_name}"
)
return False
raise
# Delete the object
self.s3_client.delete_object(Bucket=actual_bucket_name, Key=object_key)
logger.info(f"File {object_key} deleted from bucket {actual_bucket_name}")
return True
except ClientError as e:
logger.error(
f"Failed to delete file {object_key} from bucket {actual_bucket_name}:"
f" {e}"
)
return False
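
For reference, the storage path emitted by S3Storage.save() carries both the logical and the actual location, so it can be unpacked with the standard library alone; a short sketch with illustrative values:

from urllib.parse import parse_qs, urlparse

uri = (
    "s3://my_bucket/file-123"
    "?actual_bucket=dbgpt-fs-1a2b3c4d-my_bucket&object_key=file-123"
)
parsed = urlparse(uri)
params = parse_qs(parsed.query)

logical_bucket = parsed.netloc              # "my_bucket" (logical bucket)
file_id = parsed.path.strip("/")            # "file-123"
actual_bucket = params["actual_bucket"][0]  # real S3 bucket to query
object_key = params["object_key"][0]        # key inside that bucket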

View File

@ -120,7 +120,6 @@ async def upload_files(
global_system_app,
service.upload_files,
bucket,
"distributed",
files,
user_name,
sys_code,

View File

@ -1,12 +1,14 @@
from dataclasses import dataclass, field
from typing import Optional
from typing import List, Optional
from dbgpt.core.awel.flow import (
TAGS_ORDER_HIGH,
ResourceCategory,
auto_register_resource,
)
from dbgpt.core.interface.file import StorageBackendConfig
from dbgpt.util.i18n_utils import _
from dbgpt.util.module_utils import ScannerConfig
from dbgpt_serve.core import BaseServeConfig
APP_NAME = "file"
@ -27,6 +29,7 @@ SERVER_APP_TABLE_NAME = "dbgpt_serve_file"
"files in the file server."
),
show_in_ui=False,
skip_fields=["backends"],
)
@dataclass
class ServeConfig(BaseServeConfig):
@ -34,6 +37,13 @@ class ServeConfig(BaseServeConfig):
__type__ = APP_NAME
__scan_config__ = ScannerConfig(
module_path="dbgpt_ext.storage.file",
base_class=StorageBackendConfig,
recursive=True,
specific_files=["config"],
)
check_hash: Optional[bool] = field(
default=True,
metadata={"help": _("Check the hash of the file when downloading")},
@ -62,6 +72,14 @@ class ServeConfig(BaseServeConfig):
local_storage_path: Optional[str] = field(
default=None, metadata={"help": _("The local storage path")}
)
default_backend: Optional[str] = field(
default=None,
metadata={"help": _("The default storage backend")},
)
backends: List[StorageBackendConfig] = field(
default_factory=list,
metadata={"help": _("The storage backend configurations")},
)
def get_node_address(self) -> str:
"""Get the node address"""

View File

@ -88,6 +88,7 @@ class Serve(BaseServe):
FileMetadataAdapter(),
serializer,
)
default_backend = self._serve_config.default_backend
simple_distributed_storage = SimpleDistributedStorage(
node_address=self._serve_config.get_node_address(),
local_storage_path=self._serve_config.get_local_storage_path(),
@ -98,6 +99,15 @@ class Serve(BaseServe):
storage_backends = {
simple_distributed_storage.storage_type: simple_distributed_storage,
}
for backend_config in self._serve_config.backends:
storage_backend = backend_config.create_storage()
storage_backends[storage_backend.storage_type] = storage_backend
if not default_backend:
# First backend is the default backend
default_backend = storage_backend.storage_type
if not default_backend:
default_backend = simple_distributed_storage.storage_type
fs = FileStorageSystem(
storage_backends,
metadata_storage=storage,
@ -107,6 +117,7 @@ class Serve(BaseServe):
system_app=self._system_app,
storage_system=fs,
save_chunk_size=self._serve_config.save_chunk_size,
default_storage_type=default_backend,
)
self._system_app.register_instance(self._file_storage_client)

View File

@ -79,7 +79,6 @@ class Service(BaseService[ServeEntity, ServeRequest, ServerResponse]):
def upload_files(
self,
bucket: str,
storage_type: str,
files: List[UploadFile],
user_name: Optional[str] = None,
sys_code: Optional[str] = None,
@ -97,7 +96,6 @@ class Service(BaseService[ServeEntity, ServeRequest, ServerResponse]):
bucket,
file_name,
file_data=file.file,
storage_type=storage_type,
custom_metadata=custom_metadata,
)
parsed_uri = FileStorageURI.parse(uri)

View File

@ -187,7 +187,6 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
bucket,
safe_filename,
doc_file.file,
storage_type="distributed",
custom_metadata=custom_metadata,
)
request.content = file_uri

uv.lock
View File

@ -211,6 +211,28 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/a2/8b/46919127496036c8e990b2b236454a0d8655fd46e1df2fd35610a9cbc842/alembic-1.12.0-py3-none-any.whl", hash = "sha256:03226222f1cf943deee6c85d9464261a6c710cd19b4fe867a3ad1f25afda610f", size = 226041 },
]
[[package]]
name = "aliyun-python-sdk-core"
version = "2.16.0"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "cryptography" },
{ name = "jmespath" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3e/09/da9f58eb38b4fdb97ba6523274fbf445ef6a06be64b433693da8307b4bec/aliyun-python-sdk-core-2.16.0.tar.gz", hash = "sha256:651caad597eb39d4fad6cf85133dffe92837d53bdf62db9d8f37dab6508bb8f9", size = 449555 }
[[package]]
name = "aliyun-python-sdk-kms"
version = "2.16.5"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "aliyun-python-sdk-core" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/2c/9877d0e6b18ecf246df671ac65a5d1d9fecbf85bdcb5d43efbde0d4662eb/aliyun-python-sdk-kms-2.16.5.tar.gz", hash = "sha256:f328a8a19d83ecbb965ffce0ec1e9930755216d104638cd95ecd362753b813b3", size = 12018 }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/11/5c/0132193d7da2c735669a1ed103b142fd63c9455984d48c5a88a1a516efaa/aliyun_python_sdk_kms-2.16.5-py2.py3-none-any.whl", hash = "sha256:24b6cdc4fd161d2942619479c8d050c63ea9cd22b044fe33b60bbb60153786f0", size = 99495 },
]
[[package]]
name = "annotated-types"
version = "0.7.0"
@ -668,6 +690,34 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/9a/91/4aea63dccee6491a54c630d9817656a886e086ab97222e2d8101d8cdf894/blis-0.7.11-cp312-cp312-win_amd64.whl", hash = "sha256:5a305dbfc96d202a20d0edd6edf74a406b7e1404f4fa4397d24c68454e60b1b4", size = 6624079 },
]
[[package]]
name = "boto3"
version = "1.37.13"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "botocore" },
{ name = "jmespath" },
{ name = "s3transfer" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d6/50/1183ffa4782408907891af344a8e91d7bc5d7a9bae12e43fca8874da567e/boto3-1.37.13.tar.gz", hash = "sha256:295648f887464ab74c5c301a44982df76f9ba39ebfc16be5b8f071ad1a81fe95", size = 111349 }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/a2/64/9f9578142ba1ed3ecc6b82a53c5c4c4352108e1424f1d5d02b6239b4314f/boto3-1.37.13-py3-none-any.whl", hash = "sha256:90fa5a91d7d7456219f0b7c4a93b38335dc5cf4613d885da4d4c1d099e04c6b7", size = 139552 },
]
[[package]]
name = "botocore"
version = "1.37.13"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "jmespath" },
{ name = "python-dateutil" },
{ name = "urllib3" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/18/53/3593b438ab1f9b6837cc90a8582dfa71c71c639e9359a01fd4d110f0566e/botocore-1.37.13.tar.gz", hash = "sha256:60dfb831c54eb466db9b91891a6c8a0c223626caa049969d5d42858ad1e7f8c7", size = 13647494 }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/43/2aa89ca8ab69196890b0682820469e62d93c4cf402ceb46a3007fd44b0c3/botocore-1.37.13-py3-none-any.whl", hash = "sha256:aa417bac0f4d79533080e6e17c0509e149353aec83cfe7879597a7942f7f08d0", size = 13411385 },
]
[[package]]
name = "bs4"
version = "0.0.2"
@ -1266,6 +1316,12 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/5c/3ba7d12e7a79566f97b8f954400926d7b6eb33bcdccc1315a857f200f1f1/crashtest-0.4.1-py3-none-any.whl", hash = "sha256:8d23eac5fa660409f57472e3851dab7ac18aba459a8d19cbbba86d3d5aecd2a5", size = 7558 },
]
[[package]]
name = "crcmod"
version = "1.7"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6b/b0/e595ce2a2527e169c3bcd6c33d2473c1918e0b7f6826a043ca1245dd4e5b/crcmod-1.7.tar.gz", hash = "sha256:dc7051a0db5f2bd48665a990d3ec1cc305a466a77358ca4492826f41f283601e", size = 89670 }
[[package]]
name = "cryptography"
version = "44.0.1"
@ -1855,6 +1911,12 @@ datasource-spark = [
datasource-vertica = [
{ name = "vertica-python" },
]
file-oss = [
{ name = "oss2" },
]
file-s3 = [
{ name = "boto3" },
]
graph-rag = [
{ name = "dbgpt-tugraph-plugins" },
{ name = "neo4j" },
@ -1896,6 +1958,7 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "boto3", marker = "extra == 'file-s3'" },
{ name = "bs4", marker = "extra == 'rag'" },
{ name = "chromadb", marker = "extra == 'storage-chromadb'", specifier = ">=0.4.22" },
{ name = "clickhouse-connect", marker = "extra == 'datasource-clickhouse'" },
@ -1910,6 +1973,7 @@ requires-dist = [
{ name = "neo4j", marker = "extra == 'graph-rag'" },
{ name = "networkx", marker = "extra == 'graph-rag'" },
{ name = "onnxruntime", marker = "extra == 'storage-chromadb'", specifier = ">=1.14.1,<=1.18.1" },
{ name = "oss2", marker = "extra == 'file-oss'" },
{ name = "pdfplumber", marker = "extra == 'rag'" },
{ name = "psycopg2-binary", marker = "extra == 'datasource-postgres'" },
{ name = "pyhive", marker = "extra == 'datasource-hive'" },
@ -3177,6 +3241,15 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/91/61/c80ef80ed8a0a21158e289ef70dac01e351d929a1c30cb0f49be60772547/jiter-0.8.2-cp313-cp313t-win_amd64.whl", hash = "sha256:3ac9f578c46f22405ff7f8b1f5848fb753cc4b8377fbec8470a7dc3997ca7566", size = 202374 },
]
[[package]]
name = "jmespath"
version = "0.10.0"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3c/56/3f325b1eef9791759784aa5046a8f6a1aff8f7c898a2e34506771d3b99d8/jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", size = 21607 }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/cb/5f001272b6faeb23c1c9e0acc04d48eaaf5c862c17709d20e3469c6e0139/jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f", size = 24489 },
]
[[package]]
name = "joblib"
version = "1.4.2"
@ -5478,6 +5551,20 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/27/f1/1d7ec15b20f8ce9300bc850de1e059132b88990e46cd0ccac29cbf11e4f9/orjson-3.10.15-cp313-cp313-win_amd64.whl", hash = "sha256:fd56a26a04f6ba5fb2045b0acc487a63162a958ed837648c5781e1fe3316cfbf", size = 133444 },
]
[[package]]
name = "oss2"
version = "2.19.1"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "aliyun-python-sdk-core" },
{ name = "aliyun-python-sdk-kms" },
{ name = "crcmod" },
{ name = "pycryptodome" },
{ name = "requests" },
{ name = "six" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/df/b5/f2cb1950dda46ac2284d6c950489fdacd0e743c2d79a347924d3cc44b86f/oss2-2.19.1.tar.gz", hash = "sha256:a8ab9ee7eb99e88a7e1382edc6ea641d219d585a7e074e3776e9dec9473e59c1", size = 298845 }
[[package]]
name = "outlines"
version = "0.1.11"
@ -7470,6 +7557,18 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/5e/d3a6fdf61f6373e53bfb45d6819a72dfef741bc8a9ff31c64496688e7c39/ruff_lsp-0.0.62-py3-none-any.whl", hash = "sha256:fb6c04a0cb09bb3ae316121b084ff09497edd01df58b36fa431f14515c63029e", size = 20980 },
]
[[package]]
name = "s3transfer"
version = "0.11.4"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "botocore" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/ec/aa1a215e5c126fe5decbee2e107468f51d9ce190b9763cb649f76bb45938/s3transfer-0.11.4.tar.gz", hash = "sha256:559f161658e1cf0a911f45940552c696735f5c74e64362e515f333ebed87d679", size = 148419 }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/86/62/8d3fc3ec6640161a5649b2cddbbf2b9fa39c92541225b33f117c37c5a2eb/s3transfer-0.11.4-py3-none-any.whl", hash = "sha256:ac265fa68318763a03bf2dc4f39d5cbd6a9e178d81cc9483ad27da33637e320d", size = 84412 },
]
[[package]]
name = "safetensors"
version = "0.5.2"