# One decimal octet in the range 0-255. A leading zero is tolerated
# (e.g. "01"), matching the behavior of the original pattern.
_IPV4_OCTET = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"

# Four octets separated by literal dots. Compiled once at import time so the
# pattern is not rebuilt on every call.
_IPV4_PATTERN = re.compile(r"\.".join([_IPV4_OCTET] * 4))


def is_valid_ipv4(address):
    """Check if the address is a valid dotted-quad IPv4 address.

    Args:
        address: The candidate value. Anything that is not a ``str`` is
            reported as invalid instead of raising ``TypeError``.

    Returns:
        bool: True when ``address`` consists of exactly four dot-separated
        decimal octets, each between 0 and 255; False otherwise.

    Examples:
        >>> is_valid_ipv4("192.168.1.1")
        True
        >>> is_valid_ipv4("256.1.1.1")
        False
        >>> is_valid_ipv4("192.168.1.1\\n")
        False
    """
    if not isinstance(address, str):
        return False
    # fullmatch() is used instead of match() with a "$" anchor because "$"
    # also matches just before a trailing newline, which would wrongly
    # accept "1.2.3.4\n".
    return _IPV4_PATTERN.fullmatch(address) is not None
+ if not _valid_chroma_collection_name(name): + hash_object = hashlib.sha256(name.encode("utf-8")) + hex_hash = hash_object.hexdigest() + # ensure the collection name is less than 64 characters + self._collection_name = hex_hash[:63] if len(hex_hash) > 63 else hex_hash chroma_settings = Settings( # chroma_db_impl="duckdb+parquet", => deprecated configuration of Chroma persist_directory=self.persist_dir, @@ -420,3 +423,31 @@ def _transform_chroma_metadata( if isinstance(value, (str, int, float, bool)): transformed[key] = value return transformed + + +def _valid_chroma_collection_name(name): + """Check if the collection name is valid.""" + # ensure the collection name is less than 64 characters + if not (3 <= len(name) <= 63): + return False + + # ensure the collection name starts and ends with an alphanumeric character + if not re.match(r"^[a-zA-Z0-9].*[a-zA-Z0-9]$", name): + return False + + # ensure the collection name contains only alphanumeric characters, + # hyphens, underscores, and dots + if not re.match(r"^[a-zA-Z0-9_][-a-zA-Z0-9_.]*$", name): + return False + + # ensure the collection name does not contain the '..' substring + if ".." in name: + return False + + if string_utils.is_valid_ipv4(name): + return False + + if string_utils.contains_chinese(name): + return False + + return True