fix:issue2484 (#2488)

Close #issue2484
# Description
Fix the Chroma collection name rules. A valid name:
"(1) contains 3-63 characters, "
"(2) starts and ends with an alphanumeric character, "
"(3) otherwise contains only alphanumeric characters, underscores or
hyphens (-), "
"(4) contains no two consecutive periods (..) and "
"(5) is not a valid IPv4 address, "
# How Has This Been Tested?

# Snapshots:

When creating a knowledge base, its name must follow the rules below.
"(1) contains 3-63 characters, "
"(2) starts and ends with an alphanumeric character, "
"(3) otherwise contains only alphanumeric characters, underscores or
hyphens (-), "
"(4) contains no two consecutive periods (..) and "
"(5) is not a valid IPv4 address, "

# Checklist:

- [ ] My code follows the style guidelines of this project
- [ ] I have already rebased the commits and made the commit message
conform to the project standard.
- [ ] I have performed a self-review of my own code
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] Any dependent changes have been merged and published in downstream
modules
This commit is contained in:
magic.chen 2025-03-19 21:11:17 +08:00 committed by GitHub
commit 3fa7bee289
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 46 additions and 4 deletions

View File

@ -41,6 +41,17 @@ def is_scientific_notation(string):
return False return False
def is_valid_ipv4(address: str) -> bool:
    """Check if the address is a valid IPv4 address.

    The address must consist of exactly four dot-separated decimal octets,
    each in the range 0-255. Octets with leading zeros (e.g. ``"01"``) are
    accepted.

    Args:
        address: The string to test.

    Returns:
        True if the whole string is a dotted-quad IPv4 address, else False.
    """
    octet = r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
    pattern = re.compile(r"\.".join([octet] * 4))
    # fullmatch (instead of match with a trailing "$") is required here:
    # "$" also matches just before a final newline, which would let
    # "1.2.3.4\n" pass as a valid address.
    return pattern.fullmatch(address) is not None
def extract_content(long_string, s1, s2, is_include: bool = False) -> Dict[int, str]: def extract_content(long_string, s1, s2, is_include: bool = False) -> Dict[int, str]:
# extract text # extract text
match_map = {} match_map = {}

View File

@ -1,7 +1,9 @@
"""Chroma vector store.""" """Chroma vector store."""
import hashlib
import logging import logging
import os import os
import re
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Mapping, Optional, Union from typing import Any, Dict, Iterable, List, Mapping, Optional, Union
@ -112,10 +114,11 @@ class ChromaStore(VectorStoreBase):
if not self.embeddings: if not self.embeddings:
raise ValueError("Embeddings is None") raise ValueError("Embeddings is None")
self._collection_name = name self._collection_name = name
if string_utils.contains_chinese(name): if not _valid_chroma_collection_name(name):
bytes_str = self._collection_name.encode("utf-8") hash_object = hashlib.sha256(name.encode("utf-8"))
hex_str = bytes_str.hex() hex_hash = hash_object.hexdigest()
self._collection_name = hex_str # ensure the collection name is less than 64 characters
self._collection_name = hex_hash[:63] if len(hex_hash) > 63 else hex_hash
chroma_settings = Settings( chroma_settings = Settings(
# chroma_db_impl="duckdb+parquet", => deprecated configuration of Chroma # chroma_db_impl="duckdb+parquet", => deprecated configuration of Chroma
persist_directory=self.persist_dir, persist_directory=self.persist_dir,
@ -420,3 +423,31 @@ def _transform_chroma_metadata(
if isinstance(value, (str, int, float, bool)): if isinstance(value, (str, int, float, bool)):
transformed[key] = value transformed[key] = value
return transformed return transformed
def _valid_chroma_collection_name(name):
"""Check if the collection name is valid."""
# ensure the collection name is less than 64 characters
if not (3 <= len(name) <= 63):
return False
# ensure the collection name starts and ends with an alphanumeric character
if not re.match(r"^[a-zA-Z0-9].*[a-zA-Z0-9]$", name):
return False
# ensure the collection name contains only alphanumeric characters,
# hyphens, underscores, and dots
if not re.match(r"^[a-zA-Z0-9_][-a-zA-Z0-9_.]*$", name):
return False
# ensure the collection name does not contain the '..' substring
if ".." in name:
return False
if string_utils.is_valid_ipv4(name):
return False
if string_utils.contains_chinese(name):
return False
return True