Mirror of https://github.com/csunny/DB-GPT.git (synced 2025-07-24 12:45:45 +00:00)

feat(storage): Support oss and s3

commit b2dd66dc6d, parent 8eba2a3b2e
configs/dbgpt-cloud-storage.example.toml (new file, 51 lines)
@@ -0,0 +1,51 @@
[system]
# Load language from environment variable(It is set by the hook)
language = "${env:DBGPT_LANG:-zh}"
log_level = "INFO"
api_keys = []
encrypt_key = "your_secret_key"

# Server Configurations
[service.web]
host = "0.0.0.0"
port = 5670

[service.web.database]
type = "sqlite"
path = "pilot/meta_data/dbgpt.db"

[[serves]]
type = "file"
# Default backend for file server
default_backend = "s3"

[[serves.backends]]
type = "oss"
endpoint = "https://oss-cn-beijing.aliyuncs.com"
region = "oss-cn-beijing"
access_key_id = "${env:OSS_ACCESS_KEY_ID}"
access_key_secret = "${env:OSS_ACCESS_KEY_SECRET}"
fixed_bucket = "{your_bucket_name}"

[[serves.backends]]
# Use Tencent COS s3 compatible API as the file server
type = "s3"
endpoint = "https://cos.ap-beijing.myqcloud.com"
region = "ap-beijing"
access_key_id = "${env:COS_SECRETID}"
access_key_secret = "${env:COS_SECRETKEY}"
fixed_bucket = "{your_bucket_name}"

# Model Configurations
[models]
[[models.llms]]
name = "${env:LLM_MODEL_NAME:-gpt-4o}"
provider = "${env:LLM_MODEL_PROVIDER:-proxy/openai}"
api_base = "${env:OPENAI_API_BASE:-https://api.openai.com/v1}"
api_key = "${env:OPENAI_API_KEY}"

[[models.embeddings]]
name = "${env:EMBEDDING_MODEL_NAME:-text-embedding-3-small}"
provider = "${env:EMBEDDING_MODEL_PROVIDER:-proxy/openai}"
api_url = "${env:EMBEDDING_MODEL_API_URL:-https://api.openai.com/v1/embeddings}"
api_key = "${env:OPENAI_API_KEY}"
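Each [[serves.backends]] entry above is materialized into a concrete storage backend when the file serve starts, and default_backend decides where uploads land when no storage type is given. A simplified sketch of that wiring (it mirrors the Serve changes further down in this diff; not the exact code):

# Simplified wiring sketch; serve_config stands for the parsed [[serves]] section.
storage_backends = {}
default_backend = serve_config.default_backend  # "s3" in the example above

for backend_config in serve_config.backends:   # e.g. OSSStorageConfig, S3StorageConfig
    backend = backend_config.create_storage()  # -> AliyunOSSStorage / S3Storage
    storage_backends[backend.storage_type] = backend
    if not default_backend:
        # If no default is configured, the first declared backend wins
        default_backend = backend.storage_type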
@@ -393,7 +393,6 @@ async def document_upload(
        bucket,
        safe_filename,
        doc_file.file,
        storage_type="distributed",
        custom_metadata=custom_metadata,
    )

@@ -359,7 +359,6 @@ async def file_upload(
        bucket,
        file_name,
        doc_file.file,
        storage_type="distributed",
        custom_metadata=custom_metadata,
    )

@@ -181,7 +181,6 @@ class ChatExcel(BaseChat):
            self.fs_client.upload_file,
            self._bucket,
            self._database_file_path,
            storage_type="distributed",
            file_id=self._database_file_id,
        )
        return result

@@ -1115,6 +1115,7 @@ def auto_register_resource(
    alias: Optional[List[str]] = None,
    tags: Optional[Dict[str, str]] = None,
    show_in_ui: bool = True,
    skip_fields: Optional[List[str]] = None,
    **decorator_kwargs,
):
    """Auto register the resource.

@@ -1130,6 +1131,8 @@ def auto_register_resource(
        alias (Optional[List[str]], optional): The alias of the resource. Defaults to
            None. For compatibility, we can use the alias to register the resource.
        tags (Optional[Dict[str, str]]): The tags of the resource
        show_in_ui (bool): Whether show the resource in UI.
        skip_fields (Optional[List[str]]): The fields to skip.
    """
    from dataclasses import fields, is_dataclass

@@ -1147,6 +1150,8 @@ def auto_register_resource(
    parameters: List[Parameter] = []
    raw_fields = fields(cls)
    for i, fd in enumerate(fields_desc_list):
        if skip_fields and fd.param_name in skip_fields:
            continue
        param_type = fd.param_type
        if param_type in TYPE_STRING_TO_TYPE:
            # Basic type
@@ -16,6 +16,7 @@ import requests
from dbgpt.component import BaseComponent, ComponentType, SystemApp
from dbgpt.util.tracer import root_tracer, trace

from ...util import BaseParameters, RegisterParameters
from .storage import (
    InMemoryStorage,
    QuerySpec,

@@ -116,6 +117,17 @@ class FileMetadata(StorageItem):
        self._identifier = obj._identifier


@dataclasses.dataclass
class StorageBackendConfig(BaseParameters, RegisterParameters):
    """Storage backend configuration"""

    __type__ = "___storage_backend_config___"

    def create_storage(self) -> "StorageBackend":
        """Create the storage"""
        raise NotImplementedError()


class FileStorageURI:
    """File storage URI."""

@@ -489,6 +501,7 @@ class FileStorageClient(BaseComponent):
        system_app: Optional[SystemApp] = None,
        storage_system: Optional[FileStorageSystem] = None,
        save_chunk_size: int = 1024 * 1024,
        default_storage_type: Optional[str] = None,
    ):
        """Initialize the file storage client."""
        super().__init__(system_app=system_app)

@@ -503,10 +516,14 @@ class FileStorageClient(BaseComponent):
                )
            }
        )
        if not default_storage_type:
            if storage_system and storage_system.storage_backends:
                default_storage_type = list(storage_system.storage_backends.keys())[0]

        self.system_app = system_app
        self._storage_system = storage_system
        self.save_chunk_size = save_chunk_size
        self.default_storage_type = default_storage_type

    def init_app(self, system_app: SystemApp):
        """Initialize the application."""

@@ -523,7 +540,7 @@ class FileStorageClient(BaseComponent):
        self,
        bucket: str,
        file_path: str,
        storage_type: str,
        storage_type: Optional[str] = None,
        custom_metadata: Optional[Dict[str, Any]] = None,
        file_id: Optional[str] = None,
    ) -> str:

@@ -556,7 +573,7 @@ class FileStorageClient(BaseComponent):
        bucket: str,
        file_name: str,
        file_data: BinaryIO,
        storage_type: str,
        storage_type: Optional[str] = None,
        custom_metadata: Optional[Dict[str, Any]] = None,
        file_id: Optional[str] = None,
    ) -> str:

@@ -575,12 +592,20 @@ class FileStorageClient(BaseComponent):
        Returns:
            str: The file URI
        """
        if not storage_type:
            storage_type = self.default_storage_type
        if not storage_type:
            raise ValueError("Storage type not provided")
        return self.storage_system.save_file(
            bucket, file_name, file_data, storage_type, custom_metadata, file_id
        )

    def download_file(
        self, uri: str, dest_path: Optional[str] = None, dest_dir: Optional[str] = None
        self,
        uri: str,
        dest_path: Optional[str] = None,
        dest_dir: Optional[str] = None,
        cache: bool = True,
    ) -> Tuple[str, FileMetadata]:
        """Download a file from the storage system.

@@ -595,6 +620,7 @@ class FileStorageClient(BaseComponent):
            uri (str): The file URI
            dest_path (str, optional): The destination path. Defaults to None.
            dest_dir (str, optional): The destination directory. Defaults to None.
            cache (bool, optional): Whether to cache the file. Defaults to True.

        Raises:
            FileNotFoundError: If the file is not found

@@ -617,7 +643,7 @@ class FileStorageClient(BaseComponent):
        os.makedirs(base_path, exist_ok=True)
        target_path = os.path.join(base_path, file_metadata.file_id + extension)
        file_hash = file_metadata.file_hash
        if os.path.exists(target_path):
        if os.path.exists(target_path) and cache:
            logger.debug(f"File {target_path} already exists, begin hash check")
            with open(target_path, "rb") as f:
                if file_hash == calculate_file_hash(f, self.save_chunk_size):
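With default_storage_type in place, callers no longer have to pass storage_type explicitly. A minimal usage sketch (it assumes the client has been registered on the SystemApp as in the Serve changes below and retrieved via the usual component accessor; paths and bucket names are illustrative):

from dbgpt.component import SystemApp
from dbgpt.core.interface.file import FileStorageClient


def upload_report(system_app: SystemApp) -> str:
    # Retrieve the registered client (assumes the standard component lookup)
    client = FileStorageClient.get_instance(system_app)
    # storage_type omitted -> falls back to default_storage_type (e.g. "s3")
    uri = client.upload_file(bucket="my_bucket", file_path="./report.xlsx")
    # cache=True (the default) reuses a previously downloaded copy if the hash matches
    local_path, metadata = client.download_file(uri, dest_dir="/tmp")
    return local_path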
@@ -253,6 +253,26 @@ class ModelScanner(Generic[T]):
            for key, value in scanned_items.items():
                self._registered_items[key] = value

            child_items = {}
            for key, value in self._registered_items.items():
                if hasattr(value, "__scan_config__"):
                    _child_scanner = ModelScanner()
                    _child_config = value.__scan_config__
                    if not isinstance(_child_config, ScannerConfig):
                        continue
                    if (
                        hasattr(value, "__is_already_scanned__")
                        and value.__is_already_scanned__
                    ):
                        continue
                    try:
                        _child_scanner.scan_and_register(_child_config)
                        child_items.update(_child_scanner.get_registered_items())
                        value.__is_already_scanned__ = True
                    except Exception as e:
                        logger.warning(f"Error scanning child module {key}: {str(e)}")
            self._registered_items.update(child_items)

        except ImportError as e:
            logger.warning(f"Error importing module {config.module_path}: {str(e)}")
@@ -75,6 +75,13 @@ storage_chromadb = [
storage_elasticsearch = ["elasticsearch"]
storage_obvector = ["pyobvector"]

file_oss = [
    "oss2" # Aliyun OSS
]
file_s3 = [
    "boto3"
]

[tool.uv]
managed = true
dev-dependencies = [
packages/dbgpt-ext/src/dbgpt_ext/storage/file/oss/config.py (new file, 102 lines)
@@ -0,0 +1,102 @@
from dataclasses import dataclass, field
from typing import Optional

from dbgpt.core.interface.file import StorageBackend, StorageBackendConfig
from dbgpt.util.i18n_utils import _


@dataclass
class OSSStorageConfig(StorageBackendConfig):
    __type__ = "oss"
    endpoint: str = field(
        metadata={
            "help": _(
                "The endpoint of the OSS server. "
                "e.g. https://oss-cn-hangzhou.aliyuncs.com"
            )
        },
    )
    region: str = field(
        metadata={"help": _("The region of the OSS server. e.g. cn-hangzhou")},
    )
    access_key_id: Optional[str] = field(
        default=None,
        metadata={
            "help": _(
                "The access key ID of the OSS server. You can also set it in the "
                "environment variable OSS_ACCESS_KEY_ID"
            ),
            "tags": "privacy",
        },
    )
    access_key_secret: Optional[str] = field(
        default=None,
        metadata={
            "help": _(
                "The access key secret of the OSS server. You can also set it in the "
                "environment variable OSS_ACCESS_KEY_SECRET"
            ),
            "tags": "privacy",
        },
    )
    use_environment_credentials: Optional[bool] = field(
        default=False,
        metadata={
            "help": _(
                "Whether to use the environment variables OSS_ACCESS_KEY_ID and "
                "OSS_ACCESS_KEY_SECRET as the credentials. Default is False."
            ),
        },
    )
    fixed_bucket: Optional[str] = field(
        default=None,
        metadata={
            "help": _(
                "The fixed bucket name to use. If set, all logical buckets in DB-GPT "
                "will be mapped to this bucket. We suggest you set this value to avoid "
                "bucket name conflicts."
            )
        },
    )
    bucket_prefix: Optional[str] = field(
        default="dbgpt-fs-",
        metadata={
            "help": _(
                "The prefix of the bucket name. If set, all logical buckets in DB-GPT "
                "will be prefixed with this value. Just work when fixed_bucket is None."
            )
        },
    )
    auto_create_bucket: Optional[bool] = field(
        default=True,
        metadata={
            "help": _(
                "Whether to create the bucket automatically if it does not exist. "
                "If set to False, the bucket must exist before using it."
            )
        },
    )
    save_chunk_size: Optional[int] = field(
        default=1024 * 1024,
        metadata={
            "help": _(
                "The chunk size when saving the file. When the file is larger 10x than "
                "this value, it will be uploaded in multiple parts. Default is 1M."
            )
        },
    )

    def create_storage(self) -> StorageBackend:
        from .oss_storage import AliyunOSSStorage

        return AliyunOSSStorage(
            endpoint=self.endpoint,
            region=self.region,
            access_key_id=self.access_key_id,
            access_key_secret=self.access_key_secret,
            use_environment_credentials=self.use_environment_credentials,
            fixed_bucket=self.fixed_bucket,
            bucket_prefix=self.bucket_prefix,
            auto_create_bucket=self.auto_create_bucket,
            save_chunk_size=self.save_chunk_size,
        )
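For reference, constructing the OSS backend programmatically is equivalent to a [[serves.backends]] entry of type "oss" in the example TOML above; a minimal sketch (endpoint and bucket name are illustrative):

from dbgpt_ext.storage.file.oss.config import OSSStorageConfig

config = OSSStorageConfig(
    endpoint="https://oss-cn-beijing.aliyuncs.com",
    region="oss-cn-beijing",
    # Reads OSS_ACCESS_KEY_ID / OSS_ACCESS_KEY_SECRET from the environment
    use_environment_credentials=True,
    fixed_bucket="my-dbgpt-files",
)
backend = config.create_storage()  # returns an AliyunOSSStorage instance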
packages/dbgpt-ext/src/dbgpt_ext/storage/file/oss/oss_storage.py (new file, 484 lines)
@@ -0,0 +1,484 @@
"""Aliyun OSS storage backend."""

import hashlib
import io
import logging
import os
import random
import time
from typing import BinaryIO, Callable, Dict, Optional, Union

import oss2
from oss2.credentials import EnvironmentVariableCredentialsProvider

from dbgpt.core.interface.file import FileMetadata, StorageBackend

logger = logging.getLogger(__name__)


def does_bucket_exist(bucket):
    try:
        bucket.get_bucket_info()
    except oss2.exceptions.NoSuchBucket:
        return False
    except:
        raise
    return True


class AliyunOSSStorage(StorageBackend):
    """Aliyun OSS storage backend implementation."""

    storage_type: str = "oss"

    def __init__(
        self,
        endpoint: str,
        region: str,
        access_key_id: Optional[str] = None,
        access_key_secret: Optional[str] = None,
        save_chunk_size: int = 1024 * 1024,
        use_environment_credentials: bool = False,
        fixed_bucket: Optional[str] = None,
        bucket_prefix: str = "dbgpt-fs-",
        bucket_mapper: Optional[Callable[[str], str]] = None,
        auto_create_bucket: bool = True,
    ):
        """Initialize the Aliyun OSS storage backend.

        Args:
            endpoint (str): OSS endpoint, e.g., "https://oss-cn-hangzhou.aliyuncs.com"
            region (str): OSS region, e.g., "cn-hangzhou"
            access_key_id (Optional[str], optional): Aliyun Access Key ID. Defaults to
                None.
            access_key_secret (Optional[str], optional): Aliyun Access Key Secret.
                Defaults to None.
            save_chunk_size (int, optional): Chunk size for saving files. Defaults to
                1024*1024 (1MB).
            use_environment_credentials (bool, optional): Whether to use credentials
                from environment variables. Defaults to False.
            fixed_bucket (Optional[str], optional): A fixed OSS bucket to use for all
                operations. If provided, all logical buckets will be mapped to this
                single bucket. Defaults to None.
            bucket_prefix (str, optional): Prefix for dynamically created buckets.
                Defaults to "dbgpt-fs-".
            bucket_mapper (Optional[Callable[[str], str]], optional): Custom function
                to map logical bucket names to actual OSS bucket names. Defaults to
                None.
            auto_create_bucket (bool, optional): Whether to automatically create
                buckets that don't exist. Defaults to True.
        """
        self.endpoint = endpoint
        self.region = region
        self._save_chunk_size = save_chunk_size
        self.fixed_bucket = fixed_bucket
        self.bucket_prefix = bucket_prefix
        self.custom_bucket_mapper = bucket_mapper
        self.auto_create_bucket = auto_create_bucket

        # Initialize OSS authentication
        if use_environment_credentials:
            # Check required environment variables
            required_env_vars = ["OSS_ACCESS_KEY_ID", "OSS_ACCESS_KEY_SECRET"]
            for var in required_env_vars:
                if var not in os.environ:
                    raise ValueError(f"Environment variable {var} is not set.")
            self.auth = oss2.ProviderAuthV4(EnvironmentVariableCredentialsProvider())
        else:
            if not access_key_id or not access_key_secret:
                raise ValueError(
                    "Access key ID and secret are required when not using environment "
                    "credentials"
                )
            # Use provided credentials
            self.auth = oss2.Auth(access_key_id, access_key_secret)

        # Store buckets dict to avoid recreating bucket objects
        self._buckets: Dict[str, oss2.Bucket] = {}

        # Create fixed bucket if specified
        if self.fixed_bucket and self.auto_create_bucket:
            self._ensure_bucket_exists(self.fixed_bucket)

    @property
    def save_chunk_size(self) -> int:
        """Get the save chunk size."""
        return self._save_chunk_size

    def _map_bucket_name(self, logical_bucket: str) -> str:
        """Map logical bucket name to actual OSS bucket name.

        Args:
            logical_bucket (str): Logical bucket name used by the application

        Returns:
            str: Actual OSS bucket name to use
        """
        # 1. If using a fixed bucket, always return that
        if self.fixed_bucket:
            return self.fixed_bucket

        # 2. If a custom mapper is provided, use that
        if self.custom_bucket_mapper:
            return self.custom_bucket_mapper(logical_bucket)

        # 3. Otherwise, use a hash-based approach to generate a unique but
        # deterministic name
        # This avoids bucket name conflicts while maintaining consistency
        bucket_hash = hashlib.md5(logical_bucket.encode()).hexdigest()[:8]
        return f"{self.bucket_prefix}{bucket_hash}-{logical_bucket}"

    def _generate_dynamic_bucket_name(self) -> str:
        """Generate a unique bucket name for dynamic creation.

        Returns:
            str: A unique bucket name
        """
        # Using timestamp + random number to ensure uniqueness
        timestamp = int(time.time())
        random_number = random.randint(0, 9999)
        return f"{self.bucket_prefix}{timestamp}-{random_number}"

    def _ensure_bucket_exists(self, bucket_name: str) -> bool:
        """Ensure the bucket exists, create it if needed and if auto_create_bucket is
        True.

        Args:
            bucket_name (str): Bucket name

        Returns:
            bool: True if the bucket exists or was created, False otherwise
        """
        bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name, region=self.region)

        try:
            if does_bucket_exist(bucket):
                return True

            if not self.auto_create_bucket:
                logger.warning(
                    f"Bucket {bucket_name} does not exist and auto_create_bucket is "
                    f"False"
                )
                return False

            logger.info(f"Creating bucket {bucket_name}")
            bucket.create_bucket(oss2.models.BUCKET_ACL_PRIVATE)
            return True
        except oss2.exceptions.ServerError as e:
            # Handle the case where bucket name is already taken by someone else
            if e.status == 409 and "BucketAlreadyExists" in str(e):
                logger.warning(
                    f"Bucket name {bucket_name} already exists and is owned by "
                    "someone else"
                )
                return False
            raise
        except oss2.exceptions.OssError as e:
            logger.error(f"Failed to create or check bucket {bucket_name}: {e}")
            raise

    def _get_bucket(self, logical_bucket: str) -> Union[oss2.Bucket, None]:
        """Get or create an OSS bucket object for the given logical bucket.

        Args:
            logical_bucket (str): Logical bucket name

        Returns:
            Union[oss2.Bucket, None]: Bucket object or None if bucket creation failed
        """
        # Get the actual OSS bucket name
        actual_bucket_name = self._map_bucket_name(logical_bucket)

        # Check if we've already cached this bucket
        if actual_bucket_name in self._buckets:
            return self._buckets[actual_bucket_name]

        # Try to ensure the mapped bucket exists
        if self._ensure_bucket_exists(actual_bucket_name):
            # Cache and return the bucket
            self._buckets[actual_bucket_name] = oss2.Bucket(
                self.auth, self.endpoint, actual_bucket_name, region=self.region
            )
            return self._buckets[actual_bucket_name]

        # If we get here, the bucket doesn't exist and couldn't be created
        # Try to create a dynamic bucket if we're not using a fixed bucket
        if not self.fixed_bucket and self.auto_create_bucket:
            # Generate a new unique bucket name
            dynamic_bucket = self._generate_dynamic_bucket_name()
            logger.info(
                f"Attempting to create dynamic bucket {dynamic_bucket} for logical "
                f"bucket {logical_bucket}"
            )

            if self._ensure_bucket_exists(dynamic_bucket):
                self._buckets[actual_bucket_name] = oss2.Bucket(
                    self.auth, self.endpoint, dynamic_bucket, region=self.region
                )
                return self._buckets[actual_bucket_name]

        # If all attempts failed
        raise ValueError(
            f"Failed to get or create bucket for logical bucket {logical_bucket}"
        )

    def save(self, bucket: str, file_id: str, file_data: BinaryIO) -> str:
        """Save the file data to Aliyun OSS.

        Args:
            bucket (str): The logical bucket name
            file_id (str): The file ID
            file_data (BinaryIO): The file data

        Returns:
            str: The storage path (OSS URI)
        """
        # Get the actual OSS bucket
        oss_bucket = self._get_bucket(bucket)

        # Generate OSS object name based on whether we're using fixed bucket
        object_name = file_id
        if self.fixed_bucket:
            # When using a fixed bucket, we need to prefix with logical bucket name to
            # avoid conflicts
            object_name = f"{bucket}/{file_id}"

        # For large files, use multipart upload
        file_size = self._get_file_size(file_data)

        if file_size > 10 * self.save_chunk_size:  # If file is larger than 10MB
            logger.info(
                f"Using multipart upload for large file: {object_name} "
                f"(size: {file_size})"
            )
            self._multipart_upload(oss_bucket, object_name, file_data)
        else:
            logger.info(f"Uploading file using simple upload: {object_name}")
            try:
                oss_bucket.put_object(object_name, file_data)
            except oss2.exceptions.OssError as e:
                logger.error(
                    f"Failed to upload file {object_name} to bucket "
                    f"{oss_bucket.bucket_name}: {e}"
                )
                raise

        # Store the OSS bucket name and object path for future reference
        actual_bucket_name = oss_bucket.bucket_name

        # Format: oss://{actual_bucket_name}/{object_name}
        # We store both the actual bucket name and the object path in the URI
        # But we'll also keep the logical bucket in the external URI format
        return f"oss://{bucket}/{file_id}?actual_bucket={actual_bucket_name}&object_name={object_name}"  # noqa

    def _get_file_size(self, file_data: BinaryIO) -> int:
        """Get file size without consuming the file object.

        Args:
            file_data (BinaryIO): The file data

        Returns:
            int: The file size in bytes
        """
        current_pos = file_data.tell()
        file_data.seek(0, io.SEEK_END)
        size = file_data.tell()
        file_data.seek(current_pos)  # Reset the file pointer
        return size

    def _multipart_upload(
        self, oss_bucket: oss2.Bucket, file_id: str, file_data: BinaryIO
    ) -> None:
        """Handle multipart upload for large files.

        Args:
            oss_bucket (oss2.Bucket): OSS bucket object
            file_id (str): The file ID
            file_data (BinaryIO): The file data
        """
        # Initialize multipart upload
        upload_id = oss_bucket.init_multipart_upload(file_id).upload_id

        # Upload parts
        part_number = 1
        parts = []

        while True:
            chunk = file_data.read(self.save_chunk_size)
            if not chunk:
                break

            # Upload part
            etag = oss_bucket.upload_part(file_id, upload_id, part_number, chunk).etag
            parts.append(oss2.models.PartInfo(part_number, etag))
            part_number += 1

        # Complete multipart upload
        oss_bucket.complete_multipart_upload(file_id, upload_id, parts)

    def _parse_storage_path(self, storage_path: str) -> Dict[str, str]:
        """Parse the OSS storage path to extract actual bucket and object name.

        Args:
            storage_path (str): The storage path URI

        Returns:
            Dict[str, str]: A dictionary with actual_bucket and object_name keys
        """
        if not storage_path.startswith("oss://"):
            raise ValueError(f"Invalid storage path for Aliyun OSS: {storage_path}")

        # Example URI:
        # oss://logical_bucket/file_id?actual_bucket=oss_bucket&object_name=logical_bucket/file_id # noqa

        # Try to parse the URL parameters
        from urllib.parse import parse_qs, urlparse

        parsed_url = urlparse(storage_path)
        params = parse_qs(parsed_url.query)

        # Extract the parameters
        actual_bucket = params.get("actual_bucket", [None])[0]
        object_name = params.get("object_name", [None])[0]

        # Extract the logical bucket and file_id from the path
        path_parts = parsed_url.path.strip("/").split("/", 1)
        logical_bucket = path_parts[0] if path_parts else None
        logical_file_id = path_parts[1] if len(path_parts) > 1 else None

        # If parameters aren't in the URL (backward compatibility or simplified URL),
        # derive them from the logical values
        if not actual_bucket:
            # Try to use the bucket mapper to get the actual bucket
            actual_bucket = (
                self._map_bucket_name(logical_bucket) if logical_bucket else None
            )

        if not object_name:
            # If using fixed bucket, the object name includes the logical bucket
            # as prefix
            if self.fixed_bucket:
                object_name = (
                    f"{logical_bucket}/{logical_file_id}"
                    if logical_bucket and logical_file_id
                    else None
                )
            else:
                object_name = logical_file_id

        return {
            "logical_bucket": logical_bucket,
            "logical_file_id": logical_file_id,
            "actual_bucket": actual_bucket,
            "object_name": object_name,
        }

    def load(self, fm: FileMetadata) -> BinaryIO:
        """Load the file data from Aliyun OSS.

        Args:
            fm (FileMetadata): The file metadata

        Returns:
            BinaryIO: The file data as a binary IO object
        """
        # Parse the storage path
        path_info = self._parse_storage_path(fm.storage_path)

        # Get actual bucket and object name
        actual_bucket_name = path_info["actual_bucket"]
        object_name = path_info["object_name"]
        logical_bucket = path_info["logical_bucket"]

        # If we couldn't determine the actual bucket from the URI, try with the
        # logical bucket
        if not actual_bucket_name and logical_bucket:
            actual_bucket_name = self._map_bucket_name(logical_bucket)

        # Use the file_id as object name if object_name is still None
        if not object_name:
            object_name = fm.file_id
            # If using fixed bucket, prefix with logical bucket
            if self.fixed_bucket and logical_bucket:
                object_name = f"{logical_bucket}/{fm.file_id}"

        # Get the bucket object
        try:
            oss_bucket = oss2.Bucket(
                self.auth, self.endpoint, actual_bucket_name, region=self.region
            )

            # Get object as stream
            object_stream = oss_bucket.get_object(object_name)

            # Convert to BytesIO for compatibility
            content = io.BytesIO(object_stream.read())
            content.seek(0)
            return content
        except oss2.exceptions.NoSuchKey as e:
            logger.error(
                f"File {object_name} not found in bucket {actual_bucket_name}: {e}"
            )
            raise FileNotFoundError(
                f"File {object_name} not found in bucket {actual_bucket_name}"
            )
        except oss2.exceptions.OssError as e:
            logger.error(
                f"Failed to download file {object_name} from bucket "
                f"{actual_bucket_name}: {e}"
            )
            raise

    def delete(self, fm: FileMetadata) -> bool:
        """Delete the file data from Aliyun OSS.

        Args:
            fm (FileMetadata): The file metadata

        Returns:
            bool: True if the file was deleted, False otherwise
        """
        # Parse the storage path
        path_info = self._parse_storage_path(fm.storage_path)

        # Get actual bucket and object name
        actual_bucket_name = path_info["actual_bucket"]
        object_name = path_info["object_name"]
        logical_bucket = path_info["logical_bucket"]

        # If we couldn't determine the actual bucket from the URI, try with the
        # logical bucket
        if not actual_bucket_name and logical_bucket:
            actual_bucket_name = self._map_bucket_name(logical_bucket)

        # Use the file_id as object name if object_name is still None
        if not object_name:
            object_name = fm.file_id
            # If using fixed bucket, prefix with logical bucket
            if self.fixed_bucket and logical_bucket:
                object_name = f"{logical_bucket}/{fm.file_id}"

        try:
            # Get the bucket object
            oss_bucket = oss2.Bucket(
                self.auth, self.endpoint, actual_bucket_name, region=self.region
            )

            # Check if the object exists
            if not oss_bucket.object_exists(object_name):
                logger.warning(
                    f"File {object_name} does not exist in bucket {actual_bucket_name}"
                )
                return False

            # Delete the object
            oss_bucket.delete_object(object_name)
            logger.info(f"File {object_name} deleted from bucket {actual_bucket_name}")
            return True
        except oss2.exceptions.OssError as e:
            logger.error(
                f"Failed to delete file {object_name} from bucket {actual_bucket_name}:"
                f" {e}"
            )
            return False
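A standalone sketch of the backend's save path; normally FileStorageSystem drives these calls, and the endpoint and bucket names here are only examples:

import io

from dbgpt_ext.storage.file.oss.oss_storage import AliyunOSSStorage

storage = AliyunOSSStorage(
    endpoint="https://oss-cn-beijing.aliyuncs.com",
    region="oss-cn-beijing",
    use_environment_credentials=True,
    fixed_bucket="my-dbgpt-files",  # all logical buckets map to this OSS bucket
)
uri = storage.save("my_bucket", "file-123", io.BytesIO(b"hello"))
# With fixed_bucket set, the object is stored as "my_bucket/file-123" and the
# returned URI keeps both the logical and the actual location, e.g.:
# oss://my_bucket/file-123?actual_bucket=my-dbgpt-files&object_name=my_bucket/file-123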
packages/dbgpt-ext/src/dbgpt_ext/storage/file/s3/config.py (new file, 118 lines)
@@ -0,0 +1,118 @@
from dataclasses import dataclass, field
from typing import Any, Dict, Optional

from dbgpt.core.interface.file import StorageBackend, StorageBackendConfig
from dbgpt.util.i18n_utils import _


@dataclass
class S3StorageConfig(StorageBackendConfig):
    __type__ = "s3"
    endpoint: str = field(
        metadata={
            "help": _(
                "The endpoint of the s3 server. e.g. https://s3.us-east-1.amazonaws.com"
            )
        },
    )
    region: str = field(
        metadata={"help": _("The region of the s3 server. e.g. us-east-1")},
    )
    access_key_id: Optional[str] = field(
        default=None,
        metadata={
            "help": _(
                "The access key ID of the s3 server. You can also set it in the "
                "environment variable AWS_ACCESS_KEY_ID"
            ),
            "tags": "privacy",
        },
    )
    access_key_secret: Optional[str] = field(
        default=None,
        metadata={
            "help": _(
                "The access key secret of the s3 server. You can also set it in the "
                "environment variable AWS_SECRET_ACCESS_KEY"
            ),
            "tags": "privacy",
        },
    )
    use_environment_credentials: Optional[bool] = field(
        default=False,
        metadata={
            "help": _(
                "Whether to use the environment variables AWS_ACCESS_KEY_ID and "
                "AWS_SECRET_ACCESS_KEY as the credentials. Default is False."
            ),
        },
    )
    fixed_bucket: Optional[str] = field(
        default=None,
        metadata={
            "help": _(
                "The fixed bucket name to use. If set, all logical buckets in DB-GPT "
                "will be mapped to this bucket. We suggest you set this value to avoid "
                "bucket name conflicts."
            )
        },
    )
    bucket_prefix: Optional[str] = field(
        default="dbgpt-fs-",
        metadata={
            "help": _(
                "The prefix of the bucket name. If set, all logical buckets in DB-GPT "
                "will be prefixed with this value. Just work when fixed_bucket is None."
            )
        },
    )
    auto_create_bucket: Optional[bool] = field(
        default=True,
        metadata={
            "help": _(
                "Whether to create the bucket automatically if it does not exist. "
                "If set to False, the bucket must exist before using it."
            )
        },
    )
    save_chunk_size: Optional[int] = field(
        default=1024 * 1024,
        metadata={
            "help": _(
                "The chunk size when saving the file. When the file is larger 10x than "
                "this value, it will be uploaded in multiple parts. Default is 1M."
            )
        },
    )
    signature_version: Optional[str] = field(
        default=None,
        metadata={
            "help": _(
                "The signature version of the s3 server. "
                "e.g. s3v4, s3v2, None (default)"
            )
        },
    )
    s3_config: Optional[Dict[str, Any]] = field(
        default_factory=dict,
        metadata={
            "help": _("The additional configuration for the S3 client."),
        },
    )

    def create_storage(self) -> StorageBackend:
        from .s3_storage import S3Storage

        return S3Storage(
            endpoint_url=self.endpoint,
            region_name=self.region,
            access_key_id=self.access_key_id,
            secret_access_key=self.access_key_secret,
            use_environment_credentials=self.use_environment_credentials,
            fixed_bucket=self.fixed_bucket,
            bucket_prefix=self.bucket_prefix,
            auto_create_bucket=self.auto_create_bucket,
            save_chunk_size=self.save_chunk_size,
            signature_version=self.signature_version,
            s3_config=self.s3_config,
        )
packages/dbgpt-ext/src/dbgpt_ext/storage/file/s3/s3_storage.py (new file, 589 lines)
@@ -0,0 +1,589 @@
"""S3 compatible storage backend."""

import hashlib
import io
import logging
import os
import random
import time
from typing import BinaryIO, Callable, Dict, Optional, Union
from urllib.parse import parse_qs, urlparse

import boto3
from botocore.config import Config
from botocore.exceptions import ClientError

from dbgpt.core.interface.file import FileMetadata, StorageBackend

logger = logging.getLogger(__name__)


class S3Storage(StorageBackend):
    """S3 compatible storage backend implementation."""

    storage_type: str = "s3"

    def __init__(
        self,
        endpoint_url: str,
        region_name: str,
        access_key_id: str,
        secret_access_key: str,
        save_chunk_size: int = 1024 * 1024,
        use_environment_credentials: bool = False,
        fixed_bucket: Optional[str] = None,
        bucket_prefix: str = "dbgpt-fs-",
        bucket_mapper: Optional[Callable[[str], str]] = None,
        auto_create_bucket: bool = True,
        signature_version: Optional[str] = None,
        s3_config: Optional[Dict[str, Union[str, int]]] = None,
    ):
        """Initialize the S3 compatible storage backend.

        Args:
            endpoint_url (str): S3 endpoint URL, e.g.,
                "https://s3.us-east-1.amazonaws.com"
            region_name (str): S3 region, e.g., "us-east-1"
            access_key_id (str): AWS/S3 Access Key ID
            secret_access_key (str): AWS/S3 Secret Access Key
            save_chunk_size (int, optional): Chunk size for saving files. Defaults to
                1024*1024 (1MB).
            use_environment_credentials (bool, optional): Whether to use credentials
                from environment variables. Defaults to False.
            fixed_bucket (Optional[str], optional): A fixed S3 bucket to use for all
                operations. If provided, all logical buckets will be mapped to this
                single bucket. Defaults to None.
            bucket_prefix (str, optional): Prefix for dynamically created buckets.
                Defaults to "dbgpt-fs-".
            bucket_mapper (Optional[Callable[[str], str]], optional): Custom function
                to map logical bucket names to actual S3 bucket names. Defaults to None.
            auto_create_bucket (bool, optional): Whether to automatically create
                buckets that don't exist. Defaults to True.
            signature_version (str, optional): S3 signature version to use.
            s3_config (Optional[Dict[str, Union[str, int]]], optional): Additional
                S3 configuration options. Defaults to None.
        """
        self.endpoint_url = endpoint_url
        self.region_name = region_name
        self._save_chunk_size = save_chunk_size
        self.fixed_bucket = fixed_bucket
        self.bucket_prefix = bucket_prefix
        self.custom_bucket_mapper = bucket_mapper
        self.auto_create_bucket = auto_create_bucket
        self.signature_version = signature_version

        # Build S3 client configuration
        if not s3_config:
            s3_config = {
                "s3": {
                    # Use virtual addressing style
                    "addressing_style": "virtual",
                },
                "signature_version": signature_version or "v4",
            }
        if "request_checksum_calculation" not in s3_config:
            s3_config["request_checksum_calculation"] = "when_required"
        if "response_checksum_validation" not in s3_config:
            s3_config["response_checksum_validation"] = "when_required"
        config = Config(**s3_config)

        # Initialize S3 authentication
        if use_environment_credentials:
            # Check required environment variables
            required_env_vars = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"]
            for var in required_env_vars:
                if var not in os.environ:
                    raise ValueError(f"Environment variable {var} is not set.")

            # Use environment credentials
            self.s3_client = boto3.client(
                "s3",
                endpoint_url=self.endpoint_url,
                region_name=self.region_name,
                config=config,
            )
        else:
            if not access_key_id or not secret_access_key:
                raise ValueError(
                    "Access key ID and secret are required when not using environment "
                    "credentials"
                )
            # Use provided credentials
            self.s3_client = boto3.client(
                "s3",
                endpoint_url=self.endpoint_url,
                region_name=self.region_name,
                aws_access_key_id=access_key_id,
                aws_secret_access_key=secret_access_key,
                config=config,
            )

        # Create fixed bucket if specified
        if self.fixed_bucket and self.auto_create_bucket:
            self._ensure_bucket_exists(self.fixed_bucket)

    @property
    def save_chunk_size(self) -> int:
        """Get the save chunk size."""
        return self._save_chunk_size

    def _map_bucket_name(self, logical_bucket: str) -> str:
        """Map logical bucket name to actual S3 bucket name.

        Args:
            logical_bucket (str): Logical bucket name used by the application

        Returns:
            str: Actual S3 bucket name to use
        """
        # 1. If using a fixed bucket, always return that
        if self.fixed_bucket:
            return self.fixed_bucket

        # 2. If a custom mapper is provided, use that
        if self.custom_bucket_mapper:
            return self.custom_bucket_mapper(logical_bucket)

        # 3. Otherwise, use a hash-based approach to generate a unique but
        # deterministic name
        # This avoids bucket name conflicts while maintaining consistency
        bucket_hash = hashlib.md5(logical_bucket.encode()).hexdigest()[:8]
        return f"{self.bucket_prefix}{bucket_hash}-{logical_bucket}"

    def _generate_dynamic_bucket_name(self) -> str:
        """Generate a unique bucket name for dynamic creation.

        Returns:
            str: A unique bucket name
        """
        # Using timestamp + random number to ensure uniqueness
        timestamp = int(time.time())
        random_number = random.randint(0, 9999)
        return f"{self.bucket_prefix}{timestamp}-{random_number}"

    def _ensure_bucket_exists(self, bucket_name: str) -> bool:
        """Ensure the bucket exists, create it if needed and if auto_create_bucket is
        True.

        Args:
            bucket_name (str): Bucket name

        Returns:
            bool: True if the bucket exists or was created, False otherwise
        """
        try:
            # Check if bucket exists
            self.s3_client.head_bucket(Bucket=bucket_name)
            logger.info(f"Bucket {bucket_name} exists")
            return True
        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code")
            error_msg = str(e)

            logger.info(
                f"Bucket check failed with error_code={error_code}, msg={error_msg}"
            )

            # Bucket doesn't exist or we don't have permission to access it
            if error_code in ["404", "403", "NoSuchBucket", "Forbidden"]:
                if not self.auto_create_bucket:
                    logger.warning(
                        f"Bucket {bucket_name} does not exist and auto_create_bucket "
                        "is False"
                    )
                    return False

                # Create bucket
                try:
                    logger.info(f"Creating bucket {bucket_name}")

                    # Try different creation methods to adapt to different
                    # S3-compatible APIs
                    creation_methods = [
                        # Method 1: Use LocationConstraint
                        lambda: self.s3_client.create_bucket(
                            Bucket=bucket_name,
                            CreateBucketConfiguration={
                                "LocationConstraint": self.region_name
                            },
                        ),
                        # Method 2: Without LocationConstraint
                        lambda: self.s3_client.create_bucket(Bucket=bucket_name),
                        # Method 3: Use empty CreateBucketConfiguration
                        lambda: self.s3_client.create_bucket(
                            Bucket=bucket_name, CreateBucketConfiguration={}
                        ),
                    ]

                    # Try different creation methods
                    last_error = None
                    for create_method in creation_methods:
                        try:
                            create_method()
                            logger.info(f"Successfully created bucket {bucket_name}")
                            return True
                        except ClientError as method_error:
                            logger.info(
                                f"Bucket creation method failed: {method_error}"
                            )
                            last_error = method_error
                            continue

                    # If all methods failed, raise the last error
                    if last_error:
                        raise last_error

                    return False

                except ClientError as create_error:
                    # Handle the case where bucket name is already taken by someone else
                    logger.error(
                        f"Failed to create bucket {bucket_name}: {create_error}"
                    )
                    if "BucketAlreadyExists" in str(create_error):
                        logger.warning(
                            f"Bucket name {bucket_name} already exists and is owned by "
                            "someone else"
                        )
                    return False
            else:
                # Some other error
                logger.error(f"Failed to check bucket {bucket_name}: {e}")
                return False

    def save(self, bucket: str, file_id: str, file_data: BinaryIO) -> str:
        """Save the file data to S3.

        Args:
            bucket (str): The logical bucket name
            file_id (str): The file ID
            file_data (BinaryIO): The file data

        Returns:
            str: The storage path (S3 URI)
        """
        # Get the actual S3 bucket
        actual_bucket_name = self._map_bucket_name(bucket)
        logger.info(
            f"Mapped logical bucket '{bucket}' to actual bucket '{actual_bucket_name}'"
        )

        # Ensure bucket exists
        bucket_exists = self._ensure_bucket_exists(actual_bucket_name)

        if not bucket_exists:
            logger.warning(
                f"Could not ensure bucket {actual_bucket_name} exists, trying "
                "alternatives"
            )

            # Try to create a dynamic bucket if we're not using a fixed bucket
            if not self.fixed_bucket and self.auto_create_bucket:
                dynamic_bucket = self._generate_dynamic_bucket_name()
                logger.info(
                    f"Attempting to create dynamic bucket {dynamic_bucket} for logical "
                    f"bucket {bucket}"
                )

                if self._ensure_bucket_exists(dynamic_bucket):
                    logger.info(f"Successfully created dynamic bucket {dynamic_bucket}")
                    actual_bucket_name = dynamic_bucket
                else:
                    error_msg = (
                        f"Failed to get or create bucket for logical bucket {bucket}"
                    )
                    logger.error(error_msg)
                    raise ValueError(error_msg)
            else:
                error_msg = (
                    f"Failed to get or create bucket for logical bucket {bucket}"
                )
                logger.error(error_msg)
                raise ValueError(error_msg)

        # Generate S3 object key based on whether we're using fixed bucket
        object_key = file_id
        if self.fixed_bucket:
            # When using a fixed bucket, we need to prefix with logical bucket name to
            # avoid conflicts
            object_key = f"{bucket}/{file_id}"

        # For large files, use multipart upload
        file_size = self._get_file_size(file_data)

        if file_size > 10 * self.save_chunk_size:  # If file is larger than 10MB
            logger.info(
                f"Using multipart upload for large file: {object_key} "
                f"(size: {file_size})"
            )
            self._multipart_upload(actual_bucket_name, object_key, file_data)
        else:
            logger.info(f"Uploading file using simple upload: {object_key}")
            try:
                # Reset the file pointer to the beginning
                file_data.seek(0)

                # Read the file content into memory
                file_content = file_data.read()

                # Use put_object for small files
                self.s3_client.put_object(
                    Bucket=actual_bucket_name, Key=object_key, Body=file_content
                )
            except ClientError as e:
                logger.error(
                    f"Failed to upload file {object_key} to bucket "
                    f"{actual_bucket_name}: {e}"
                )
                raise

        # Format: s3://{logical_bucket}/{file_id}?actual_bucket={actual_bucket_name}&object_key={object_key} # noqa
        return f"s3://{bucket}/{file_id}?actual_bucket={actual_bucket_name}&object_key={object_key}"  # noqa

    def _get_file_size(self, file_data: BinaryIO) -> int:
        """Get file size without consuming the file object.

        Args:
            file_data (BinaryIO): The file data

        Returns:
            int: The file size in bytes
        """
        current_pos = file_data.tell()
        file_data.seek(0, io.SEEK_END)
        size = file_data.tell()
        file_data.seek(current_pos)  # Reset the file pointer
        return size

    def _multipart_upload(
        self, bucket_name: str, object_key: str, file_data: BinaryIO
    ) -> None:
        """Handle multipart upload for large files.

        Args:
            bucket_name (str): S3 bucket name
            object_key (str): The object key (file path in S3)
            file_data (BinaryIO): The file data
        """
        # Initialize multipart upload
        try:
            mpu = self.s3_client.create_multipart_upload(
                Bucket=bucket_name, Key=object_key
            )
            upload_id = mpu["UploadId"]

            # Upload parts
            part_number = 1
            parts = []
            file_data.seek(0)  # Make sure we're at the beginning of the file

            while True:
                # Read the chunk
                chunk = file_data.read(self.save_chunk_size)
                if not chunk:
                    break

                # Upload the part
                response = self.s3_client.upload_part(
                    Bucket=bucket_name,
                    Key=object_key,
                    UploadId=upload_id,
                    PartNumber=part_number,
                    Body=chunk,
                )

                parts.append({"PartNumber": part_number, "ETag": response["ETag"]})

                part_number += 1

            # Complete multipart upload
            self.s3_client.complete_multipart_upload(
                Bucket=bucket_name,
                Key=object_key,
                UploadId=upload_id,
                MultipartUpload={"Parts": parts},
            )
        except ClientError as e:
            logger.error(f"Error in multipart upload: {e}")
            # Attempt to abort the multipart upload if it was initialized
            if "upload_id" in locals():
                try:
                    self.s3_client.abort_multipart_upload(
                        Bucket=bucket_name, Key=object_key, UploadId=upload_id
                    )
                except ClientError as abort_error:
                    logger.error(f"Error aborting multipart upload: {abort_error}")
            raise

    def _parse_storage_path(self, storage_path: str) -> Dict[str, str]:
        """Parse the S3 storage path to extract actual bucket and object key.

        Args:
            storage_path (str): The storage path URI

        Returns:
            Dict[str, str]: A dictionary with actual_bucket and object_key keys
        """
        if not storage_path.startswith("s3://"):
            raise ValueError(f"Invalid storage path for S3: {storage_path}")

        # Example URI:
        # s3://logical_bucket/file_id?actual_bucket=s3_bucket&object_key=logical_bucket/file_id # noqa

        # Parse the URL
        parsed_url = urlparse(storage_path)
        params = parse_qs(parsed_url.query)

        # Extract the parameters
        actual_bucket = params.get("actual_bucket", [None])[0]
        object_key = params.get("object_key", [None])[0]

        # Extract the logical bucket and file_id from the path
        path_parts = parsed_url.path.strip("/").split("/", 1)
        logical_bucket = path_parts[0] if path_parts else None
        logical_file_id = path_parts[1] if len(path_parts) > 1 else None

        # If parameters aren't in the URL (backward compatibility or simplified URL),
        # derive them from the logical values
        if not actual_bucket:
            # Try to use the bucket mapper to get the actual bucket
            actual_bucket = (
                self._map_bucket_name(logical_bucket) if logical_bucket else None
            )

        if not object_key:
            # If using fixed bucket, the object key includes the logical bucket
            # as prefix
            if self.fixed_bucket:
                object_key = (
                    f"{logical_bucket}/{logical_file_id}"
                    if logical_bucket and logical_file_id
                    else None
                )
            else:
                object_key = logical_file_id

        return {
            "logical_bucket": logical_bucket,
            "logical_file_id": logical_file_id,
            "actual_bucket": actual_bucket,
            "object_key": object_key,
        }

    def load(self, fm: FileMetadata) -> BinaryIO:
        """Load the file data from S3.

        Args:
            fm (FileMetadata): The file metadata

        Returns:
            BinaryIO: The file data as a binary IO object
        """
        # Parse the storage path
        path_info = self._parse_storage_path(fm.storage_path)

        # Get actual bucket and object key
        actual_bucket_name = path_info["actual_bucket"]
        object_key = path_info["object_key"]
        logical_bucket = path_info["logical_bucket"]

        # If we couldn't determine the actual bucket from the URI, try with the
        # logical bucket
        if not actual_bucket_name and logical_bucket:
            actual_bucket_name = self._map_bucket_name(logical_bucket)

        # Use the file_id as object key if object_key is still None
        if not object_key:
            object_key = fm.file_id
            # If using fixed bucket, prefix with logical bucket
            if self.fixed_bucket and logical_bucket:
                object_key = f"{logical_bucket}/{fm.file_id}"

        try:
            # Get object from S3
            response = self.s3_client.get_object(
                Bucket=actual_bucket_name, Key=object_key
            )

            # Read the streaming body into a BytesIO object
            content = io.BytesIO()
            body = response["Body"]

            # Stream the data in chunks
            while True:
                chunk = body.read(self.save_chunk_size)
                if not chunk:
                    break
                content.write(chunk)

            content.seek(0)
            return content
        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code")
            if error_code == "NoSuchKey":
                logger.error(
                    f"File {object_key} not found in bucket {actual_bucket_name}: {e}"
                )
                raise FileNotFoundError(
                    f"File {object_key} not found in bucket {actual_bucket_name}"
                )
            logger.error(
                f"Failed to download file {object_key} from bucket "
                f"{actual_bucket_name}: {e}"
            )
            raise

    def delete(self, fm: FileMetadata) -> bool:
        """Delete the file data from S3.

        Args:
            fm (FileMetadata): The file metadata

        Returns:
            bool: True if the file was deleted, False otherwise
        """
        # Parse the storage path
        path_info = self._parse_storage_path(fm.storage_path)

        # Get actual bucket and object key
        actual_bucket_name = path_info["actual_bucket"]
        object_key = path_info["object_key"]
        logical_bucket = path_info["logical_bucket"]

        # If we couldn't determine the actual bucket from the URI, try with the
        # logical bucket
        if not actual_bucket_name and logical_bucket:
            actual_bucket_name = self._map_bucket_name(logical_bucket)

        # Use the file_id as object key if object_key is still None
        if not object_key:
            object_key = fm.file_id
            # If using fixed bucket, prefix with logical bucket
            if self.fixed_bucket and logical_bucket:
                object_key = f"{logical_bucket}/{fm.file_id}"

        try:
            # Check if the object exists
            try:
                self.s3_client.head_object(Bucket=actual_bucket_name, Key=object_key)
            except ClientError as e:
                error_code = e.response.get("Error", {}).get("Code")
                if error_code == "404" or error_code == "NoSuchKey":
                    logger.warning(
                        f"File {object_key} does not exist in bucket "
                        f"{actual_bucket_name}"
                    )
                    return False
                raise

            # Delete the object
            self.s3_client.delete_object(Bucket=actual_bucket_name, Key=object_key)

            logger.info(f"File {object_key} deleted from bucket {actual_bucket_name}")
            return True
        except ClientError as e:
            logger.error(
                f"Failed to delete file {object_key} from bucket {actual_bucket_name}:"
                f" {e}"
            )
            return False
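Because the backend only relies on the generic boto3 client, it also works against S3-compatible services such as the Tencent COS endpoint used in the example TOML; a sketch with illustrative, placeholder values:

import io

from dbgpt_ext.storage.file.s3.s3_storage import S3Storage

storage = S3Storage(
    endpoint_url="https://cos.ap-beijing.myqcloud.com",
    region_name="ap-beijing",
    access_key_id="<COS_SECRETID>",          # placeholder values
    secret_access_key="<COS_SECRETKEY>",
    fixed_bucket="my-dbgpt-files",
)
uri = storage.save("my_bucket", "file-123", io.BytesIO(b"hello"))
# -> "s3://my_bucket/file-123?actual_bucket=my-dbgpt-files&object_key=my_bucket/file-123"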
@@ -120,7 +120,6 @@ async def upload_files(
        global_system_app,
        service.upload_files,
        bucket,
        "distributed",
        files,
        user_name,
        sys_code,
@@ -1,12 +1,14 @@
from dataclasses import dataclass, field
from typing import Optional
from typing import List, Optional

from dbgpt.core.awel.flow import (
    TAGS_ORDER_HIGH,
    ResourceCategory,
    auto_register_resource,
)
from dbgpt.core.interface.file import StorageBackendConfig
from dbgpt.util.i18n_utils import _
from dbgpt.util.module_utils import ScannerConfig
from dbgpt_serve.core import BaseServeConfig

APP_NAME = "file"

@@ -27,6 +29,7 @@ SERVER_APP_TABLE_NAME = "dbgpt_serve_file"
        "files in the file server."
    ),
    show_in_ui=False,
    skip_fields=["backends"],
)
@dataclass
class ServeConfig(BaseServeConfig):

@@ -34,6 +37,13 @@ class ServeConfig(BaseServeConfig):

    __type__ = APP_NAME

    __scan_config__ = ScannerConfig(
        module_path="dbgpt_ext.storage.file",
        base_class=StorageBackendConfig,
        recursive=True,
        specific_files=["config"],
    )

    check_hash: Optional[bool] = field(
        default=True,
        metadata={"help": _("Check the hash of the file when downloading")},

@@ -62,6 +72,14 @@ class ServeConfig(BaseServeConfig):
    local_storage_path: Optional[str] = field(
        default=None, metadata={"help": _("The local storage path")}
    )
    default_backend: Optional[str] = field(
        default=None,
        metadata={"help": _("The default storage backend")},
    )
    backends: List[StorageBackendConfig] = field(
        default_factory=list,
        metadata={"help": _("The storage backend configurations")},
    )

    def get_node_address(self) -> str:
        """Get the node address"""
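The new __scan_config__ means backend configs are auto-discovered: any config.py module under dbgpt_ext.storage.file whose dataclass subclasses StorageBackendConfig is picked up, and the value of its __type__ becomes the type selectable in [[serves.backends]]. A hypothetical third-party backend could therefore plug in with the same layout (all names below are illustrative, not part of this commit):

# dbgpt_ext/storage/file/mybackend/config.py (hypothetical module)
from dataclasses import dataclass, field

from dbgpt.core.interface.file import StorageBackend, StorageBackendConfig


@dataclass
class MyBackendConfig(StorageBackendConfig):
    __type__ = "mybackend"  # used as `type = "mybackend"` in [[serves.backends]]
    endpoint: str = field(metadata={"help": "Endpoint of the custom backend"})

    def create_storage(self) -> StorageBackend:
        # MyBackendStorage must implement save/load/delete like the OSS/S3 backends
        from .my_storage import MyBackendStorage

        return MyBackendStorage(endpoint=self.endpoint)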
@ -88,6 +88,7 @@ class Serve(BaseServe):
            FileMetadataAdapter(),
            serializer,
        )
        default_backend = self._serve_config.default_backend
        simple_distributed_storage = SimpleDistributedStorage(
            node_address=self._serve_config.get_node_address(),
            local_storage_path=self._serve_config.get_local_storage_path(),
@ -98,6 +99,15 @@ class Serve(BaseServe):
        storage_backends = {
            simple_distributed_storage.storage_type: simple_distributed_storage,
        }
        for backend_config in self._serve_config.backends:
            storage_backend = backend_config.create_storage()
            storage_backends[storage_backend.storage_type] = storage_backend
            if not default_backend:
                # First backend is the default backend
                default_backend = storage_backend.storage_type
        if not default_backend:
            default_backend = simple_distributed_storage.storage_type

        fs = FileStorageSystem(
            storage_backends,
            metadata_storage=storage,
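
To restate the selection rule above in isolation: the configured default_backend wins, otherwise the first configured backend becomes the default, and only when nothing is configured does the built-in distributed storage stay the default. A self-contained sketch with stand-in objects (class names and storage_type values are illustrative, not DB-GPT APIs):

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple


@dataclass
class FakeBackend:
    """Stand-in for a storage backend; only storage_type matters here."""

    storage_type: str


def build_backends(
    distributed: FakeBackend,
    configured: List[FakeBackend],
    configured_default: Optional[str] = None,
) -> Tuple[Dict[str, FakeBackend], str]:
    backends = {distributed.storage_type: distributed}
    default = configured_default
    for backend in configured:
        backends[backend.storage_type] = backend
        if not default:
            # First configured backend becomes the default
            default = backend.storage_type
    if not default:
        default = distributed.storage_type
    return backends, default


# Example: with no explicit default, the first configured backend ("oss") wins.
backends, default = build_backends(
    FakeBackend("distributed"), [FakeBackend("oss"), FakeBackend("s3")]
)
assert default == "oss"
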
@ -107,6 +117,7 @@ class Serve(BaseServe):
            system_app=self._system_app,
            storage_system=fs,
            save_chunk_size=self._serve_config.save_chunk_size,
            default_storage_type=default_backend,
        )
        self._system_app.register_instance(self._file_storage_client)

@ -79,7 +79,6 @@ class Service(BaseService[ServeEntity, ServeRequest, ServerResponse]):
    def upload_files(
        self,
        bucket: str,
        storage_type: str,
        files: List[UploadFile],
        user_name: Optional[str] = None,
        sys_code: Optional[str] = None,
@ -97,7 +96,6 @@ class Service(BaseService[ServeEntity, ServeRequest, ServerResponse]):
                bucket,
                file_name,
                file_data=file.file,
                storage_type=storage_type,
                custom_metadata=custom_metadata,
            )
            parsed_uri = FileStorageURI.parse(uri)
@ -187,7 +187,6 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
            bucket,
            safe_filename,
            doc_file.file,
            storage_type="distributed",
            custom_metadata=custom_metadata,
        )
        request.content = file_uri
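
Both upload paths above now omit storage_type, so the backend is decided by the default_storage_type the file storage client was constructed with (see the Serve changes earlier). A hedged usage sketch; the save_file method name and keyword arguments mirror the calls in this diff but are assumptions about the client API rather than verified signatures:

from typing import Any, BinaryIO


def save_report(fs_client: Any, bucket: str, file_name: str, data: BinaryIO) -> str:
    """Upload one file and return the URI issued by the file server."""
    # No storage_type argument: the client falls back to its configured
    # default backend (for example, the first configured cloud backend).
    uri = fs_client.save_file(
        bucket,
        file_name,
        data,
        custom_metadata={"source": "example"},  # illustrative metadata
    )
    return uri
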
99
uv.lock
@ -211,6 +211,28 @@ wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a2/8b/46919127496036c8e990b2b236454a0d8655fd46e1df2fd35610a9cbc842/alembic-1.12.0-py3-none-any.whl", hash = "sha256:03226222f1cf943deee6c85d9464261a6c710cd19b4fe867a3ad1f25afda610f", size = 226041 },
]

[[package]]
name = "aliyun-python-sdk-core"
version = "2.16.0"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
    { name = "cryptography" },
    { name = "jmespath" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3e/09/da9f58eb38b4fdb97ba6523274fbf445ef6a06be64b433693da8307b4bec/aliyun-python-sdk-core-2.16.0.tar.gz", hash = "sha256:651caad597eb39d4fad6cf85133dffe92837d53bdf62db9d8f37dab6508bb8f9", size = 449555 }

[[package]]
name = "aliyun-python-sdk-kms"
version = "2.16.5"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
    { name = "aliyun-python-sdk-core" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/2c/9877d0e6b18ecf246df671ac65a5d1d9fecbf85bdcb5d43efbde0d4662eb/aliyun-python-sdk-kms-2.16.5.tar.gz", hash = "sha256:f328a8a19d83ecbb965ffce0ec1e9930755216d104638cd95ecd362753b813b3", size = 12018 }
wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/11/5c/0132193d7da2c735669a1ed103b142fd63c9455984d48c5a88a1a516efaa/aliyun_python_sdk_kms-2.16.5-py2.py3-none-any.whl", hash = "sha256:24b6cdc4fd161d2942619479c8d050c63ea9cd22b044fe33b60bbb60153786f0", size = 99495 },
]

[[package]]
name = "annotated-types"
version = "0.7.0"
@ -668,6 +690,34 @@ wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9a/91/4aea63dccee6491a54c630d9817656a886e086ab97222e2d8101d8cdf894/blis-0.7.11-cp312-cp312-win_amd64.whl", hash = "sha256:5a305dbfc96d202a20d0edd6edf74a406b7e1404f4fa4397d24c68454e60b1b4", size = 6624079 },
]

[[package]]
name = "boto3"
version = "1.37.13"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
    { name = "botocore" },
    { name = "jmespath" },
    { name = "s3transfer" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d6/50/1183ffa4782408907891af344a8e91d7bc5d7a9bae12e43fca8874da567e/boto3-1.37.13.tar.gz", hash = "sha256:295648f887464ab74c5c301a44982df76f9ba39ebfc16be5b8f071ad1a81fe95", size = 111349 }
wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a2/64/9f9578142ba1ed3ecc6b82a53c5c4c4352108e1424f1d5d02b6239b4314f/boto3-1.37.13-py3-none-any.whl", hash = "sha256:90fa5a91d7d7456219f0b7c4a93b38335dc5cf4613d885da4d4c1d099e04c6b7", size = 139552 },
]

[[package]]
name = "botocore"
version = "1.37.13"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
    { name = "jmespath" },
    { name = "python-dateutil" },
    { name = "urllib3" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/18/53/3593b438ab1f9b6837cc90a8582dfa71c71c639e9359a01fd4d110f0566e/botocore-1.37.13.tar.gz", hash = "sha256:60dfb831c54eb466db9b91891a6c8a0c223626caa049969d5d42858ad1e7f8c7", size = 13647494 }
wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/43/2aa89ca8ab69196890b0682820469e62d93c4cf402ceb46a3007fd44b0c3/botocore-1.37.13-py3-none-any.whl", hash = "sha256:aa417bac0f4d79533080e6e17c0509e149353aec83cfe7879597a7942f7f08d0", size = 13411385 },
]

[[package]]
name = "bs4"
version = "0.0.2"
@ -1266,6 +1316,12 @@ wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/5c/3ba7d12e7a79566f97b8f954400926d7b6eb33bcdccc1315a857f200f1f1/crashtest-0.4.1-py3-none-any.whl", hash = "sha256:8d23eac5fa660409f57472e3851dab7ac18aba459a8d19cbbba86d3d5aecd2a5", size = 7558 },
]

[[package]]
name = "crcmod"
version = "1.7"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6b/b0/e595ce2a2527e169c3bcd6c33d2473c1918e0b7f6826a043ca1245dd4e5b/crcmod-1.7.tar.gz", hash = "sha256:dc7051a0db5f2bd48665a990d3ec1cc305a466a77358ca4492826f41f283601e", size = 89670 }

[[package]]
name = "cryptography"
version = "44.0.1"
@ -1855,6 +1911,12 @@ datasource-spark = [
datasource-vertica = [
    { name = "vertica-python" },
]
file-oss = [
    { name = "oss2" },
]
file-s3 = [
    { name = "boto3" },
]
graph-rag = [
    { name = "dbgpt-tugraph-plugins" },
    { name = "neo4j" },
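
These lockfile entries back two new optional extras, file-oss (pulling in oss2) and file-s3 (pulling in boto3). Since the SDKs are only present when the matching extra is installed, a common pattern is to import them lazily; a small sketch, with an illustrative error message:

def _require_boto3():
    """Import boto3 lazily; it is only available with the 'file-s3' extra."""
    try:
        import boto3
    except ImportError as e:
        raise ImportError(
            "boto3 is required for the S3 file backend; "
            "install it via the 'file-s3' extra."
        ) from e
    return boto3
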
@ -1896,6 +1958,7 @@ dev = [

[package.metadata]
requires-dist = [
    { name = "boto3", marker = "extra == 'file-s3'" },
    { name = "bs4", marker = "extra == 'rag'" },
    { name = "chromadb", marker = "extra == 'storage-chromadb'", specifier = ">=0.4.22" },
    { name = "clickhouse-connect", marker = "extra == 'datasource-clickhouse'" },
@ -1910,6 +1973,7 @@ requires-dist = [
    { name = "neo4j", marker = "extra == 'graph-rag'" },
    { name = "networkx", marker = "extra == 'graph-rag'" },
    { name = "onnxruntime", marker = "extra == 'storage-chromadb'", specifier = ">=1.14.1,<=1.18.1" },
    { name = "oss2", marker = "extra == 'file-oss'" },
    { name = "pdfplumber", marker = "extra == 'rag'" },
    { name = "psycopg2-binary", marker = "extra == 'datasource-postgres'" },
    { name = "pyhive", marker = "extra == 'datasource-hive'" },
@ -3177,6 +3241,15 @@ wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/91/61/c80ef80ed8a0a21158e289ef70dac01e351d929a1c30cb0f49be60772547/jiter-0.8.2-cp313-cp313t-win_amd64.whl", hash = "sha256:3ac9f578c46f22405ff7f8b1f5848fb753cc4b8377fbec8470a7dc3997ca7566", size = 202374 },
]

[[package]]
name = "jmespath"
version = "0.10.0"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3c/56/3f325b1eef9791759784aa5046a8f6a1aff8f7c898a2e34506771d3b99d8/jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", size = 21607 }
wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/cb/5f001272b6faeb23c1c9e0acc04d48eaaf5c862c17709d20e3469c6e0139/jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f", size = 24489 },
]

[[package]]
name = "joblib"
version = "1.4.2"
@ -5478,6 +5551,20 @@ wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/27/f1/1d7ec15b20f8ce9300bc850de1e059132b88990e46cd0ccac29cbf11e4f9/orjson-3.10.15-cp313-cp313-win_amd64.whl", hash = "sha256:fd56a26a04f6ba5fb2045b0acc487a63162a958ed837648c5781e1fe3316cfbf", size = 133444 },
]

[[package]]
name = "oss2"
version = "2.19.1"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
    { name = "aliyun-python-sdk-core" },
    { name = "aliyun-python-sdk-kms" },
    { name = "crcmod" },
    { name = "pycryptodome" },
    { name = "requests" },
    { name = "six" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/df/b5/f2cb1950dda46ac2284d6c950489fdacd0e743c2d79a347924d3cc44b86f/oss2-2.19.1.tar.gz", hash = "sha256:a8ab9ee7eb99e88a7e1382edc6ea641d219d585a7e074e3776e9dec9473e59c1", size = 298845 }

[[package]]
name = "outlines"
version = "0.1.11"
@ -7470,6 +7557,18 @@ wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/5e/d3a6fdf61f6373e53bfb45d6819a72dfef741bc8a9ff31c64496688e7c39/ruff_lsp-0.0.62-py3-none-any.whl", hash = "sha256:fb6c04a0cb09bb3ae316121b084ff09497edd01df58b36fa431f14515c63029e", size = 20980 },
]

[[package]]
name = "s3transfer"
version = "0.11.4"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
    { name = "botocore" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/ec/aa1a215e5c126fe5decbee2e107468f51d9ce190b9763cb649f76bb45938/s3transfer-0.11.4.tar.gz", hash = "sha256:559f161658e1cf0a911f45940552c696735f5c74e64362e515f333ebed87d679", size = 148419 }
wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/86/62/8d3fc3ec6640161a5649b2cddbbf2b9fa39c92541225b33f117c37c5a2eb/s3transfer-0.11.4-py3-none-any.whl", hash = "sha256:ac265fa68318763a03bf2dc4f39d5cbd6a9e178d81cc9483ad27da33637e320d", size = 84412 },
]

[[package]]
name = "safetensors"
version = "0.5.2"