fix:issue2484 (#2488)

Close #2484
# Description
Fix the Chroma collection name rules. A valid collection name:
"(1) contains 3-63 characters, "
"(2) starts and ends with an alphanumeric character, "
"(3) otherwise contains only alphanumeric characters, underscores or
hyphens (-), "
"(4) contains no two consecutive periods (..) and "
"(5) is not a valid IPv4 address, "
# How Has This Been Tested?

# Snapshots:

The name rules applied when creating a knowledge are as follows:
"(1) contains 3-63 characters, "
"(2) starts and ends with an alphanumeric character, "
"(3) otherwise contains only alphanumeric characters, underscores or
hyphens (-), "
"(4) contains no two consecutive periods (..) and "
"(5) is not a valid IPv4 address, "

# Checklist:

- [ ] My code follows the style guidelines of this project
- [ ] I have already rebased the commits and made the commit message
conform to the project standard.
- [ ] I have performed a self-review of my own code
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] Any dependent changes have been merged and published in downstream
modules
This commit is contained in:
magic.chen 2025-03-19 21:11:17 +08:00 committed by GitHub
commit 3fa7bee289
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 46 additions and 4 deletions

View File

@ -41,6 +41,17 @@ def is_scientific_notation(string):
return False
# One dotted-quad octet: 250-255, 200-249, or 0-199 (an optional leading
# 0/1 digit means zero-padded forms such as "01" or "007" are accepted too).
_IPV4_OCTET = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
# Compiled once at import time instead of on every call.
_IPV4_PATTERN = re.compile(r"\.".join([_IPV4_OCTET] * 4))


def is_valid_ipv4(address):
    """Check if the address is a valid IPv4 address.

    Args:
        address: The string to test.

    Returns:
        True when ``address`` consists of exactly four dot-separated octets,
        each in the range 0-255, and nothing else; False otherwise.
    """
    # fullmatch is used instead of match + "$": "$" also matches just before
    # a trailing newline, which would wrongly accept "1.2.3.4\n".
    return _IPV4_PATTERN.fullmatch(address) is not None
def extract_content(long_string, s1, s2, is_include: bool = False) -> Dict[int, str]:
# extract text
match_map = {}

View File

@ -1,7 +1,9 @@
"""Chroma vector store."""
import hashlib
import logging
import os
import re
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Mapping, Optional, Union
@ -112,10 +114,11 @@ class ChromaStore(VectorStoreBase):
if not self.embeddings:
raise ValueError("Embeddings is None")
self._collection_name = name
if string_utils.contains_chinese(name):
bytes_str = self._collection_name.encode("utf-8")
hex_str = bytes_str.hex()
self._collection_name = hex_str
if not _valid_chroma_collection_name(name):
hash_object = hashlib.sha256(name.encode("utf-8"))
hex_hash = hash_object.hexdigest()
# ensure the collection name is less than 64 characters
self._collection_name = hex_hash[:63] if len(hex_hash) > 63 else hex_hash
chroma_settings = Settings(
# chroma_db_impl="duckdb+parquet", => deprecated configuration of Chroma
persist_directory=self.persist_dir,
@ -420,3 +423,31 @@ def _transform_chroma_metadata(
if isinstance(value, (str, int, float, bool)):
transformed[key] = value
return transformed
def _valid_chroma_collection_name(name):
"""Check if the collection name is valid."""
# ensure the collection name is less than 64 characters
if not (3 <= len(name) <= 63):
return False
# ensure the collection name starts and ends with an alphanumeric character
if not re.match(r"^[a-zA-Z0-9].*[a-zA-Z0-9]$", name):
return False
# ensure the collection name contains only alphanumeric characters,
# hyphens, underscores, and dots
if not re.match(r"^[a-zA-Z0-9_][-a-zA-Z0-9_.]*$", name):
return False
# ensure the collection name does not contain the '..' substring
if ".." in name:
return False
if string_utils.is_valid_ipv4(name):
return False
if string_utils.contains_chinese(name):
return False
return True