mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-30 23:28:35 +00:00
fix:issue2484
This commit is contained in:
parent
0e9faf475f
commit
2d489bfe45
@ -41,6 +41,17 @@ def is_scientific_notation(string):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_ipv4(address):
|
||||||
|
"""Check if the address is a valid IPv4 address."""
|
||||||
|
pattern = re.compile(
|
||||||
|
r"^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\."
|
||||||
|
r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\."
|
||||||
|
r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\."
|
||||||
|
r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$"
|
||||||
|
)
|
||||||
|
return pattern.match(address) is not None
|
||||||
|
|
||||||
|
|
||||||
def extract_content(long_string, s1, s2, is_include: bool = False) -> Dict[int, str]:
|
def extract_content(long_string, s1, s2, is_include: bool = False) -> Dict[int, str]:
|
||||||
# extract text
|
# extract text
|
||||||
match_map = {}
|
match_map = {}
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
"""Chroma vector store."""
|
"""Chroma vector store."""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Any, Dict, Iterable, List, Mapping, Optional, Union
|
from typing import Any, Dict, Iterable, List, Mapping, Optional, Union
|
||||||
|
|
||||||
@ -112,10 +114,11 @@ class ChromaStore(VectorStoreBase):
|
|||||||
if not self.embeddings:
|
if not self.embeddings:
|
||||||
raise ValueError("Embeddings is None")
|
raise ValueError("Embeddings is None")
|
||||||
self._collection_name = name
|
self._collection_name = name
|
||||||
if string_utils.contains_chinese(name):
|
if not _valid_chroma_collection_name(name):
|
||||||
bytes_str = self._collection_name.encode("utf-8")
|
hash_object = hashlib.sha256(name.encode("utf-8"))
|
||||||
hex_str = bytes_str.hex()
|
hex_hash = hash_object.hexdigest()
|
||||||
self._collection_name = hex_str
|
# ensure the collection name is less than 64 characters
|
||||||
|
self._collection_name = hex_hash[:63] if len(hex_hash) > 63 else hex_hash
|
||||||
chroma_settings = Settings(
|
chroma_settings = Settings(
|
||||||
# chroma_db_impl="duckdb+parquet", => deprecated configuration of Chroma
|
# chroma_db_impl="duckdb+parquet", => deprecated configuration of Chroma
|
||||||
persist_directory=self.persist_dir,
|
persist_directory=self.persist_dir,
|
||||||
@ -420,3 +423,31 @@ def _transform_chroma_metadata(
|
|||||||
if isinstance(value, (str, int, float, bool)):
|
if isinstance(value, (str, int, float, bool)):
|
||||||
transformed[key] = value
|
transformed[key] = value
|
||||||
return transformed
|
return transformed
|
||||||
|
|
||||||
|
|
||||||
|
def _valid_chroma_collection_name(name):
|
||||||
|
"""Check if the collection name is valid."""
|
||||||
|
# ensure the collection name is less than 64 characters
|
||||||
|
if not (3 <= len(name) <= 63):
|
||||||
|
return False
|
||||||
|
|
||||||
|
# ensure the collection name starts and ends with an alphanumeric character
|
||||||
|
if not re.match(r"^[a-zA-Z0-9].*[a-zA-Z0-9]$", name):
|
||||||
|
return False
|
||||||
|
|
||||||
|
# ensure the collection name contains only alphanumeric characters,
|
||||||
|
# hyphens, underscores, and dots
|
||||||
|
if not re.match(r"^[a-zA-Z0-9_][-a-zA-Z0-9_.]*$", name):
|
||||||
|
return False
|
||||||
|
|
||||||
|
# ensure the collection name does not contain the '..' substring
|
||||||
|
if ".." in name:
|
||||||
|
return False
|
||||||
|
|
||||||
|
if string_utils.is_valid_ipv4(name):
|
||||||
|
return False
|
||||||
|
|
||||||
|
if string_utils.contains_chinese(name):
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
Loading…
Reference in New Issue
Block a user