milvus[patch]: fix vectorstore integration tests (#26583)

Resolves https://github.com/langchain-ai/langchain/issues/26564
This commit is contained in:
ccurme
2024-09-17 14:17:05 -04:00
committed by GitHub
parent 145a49cca2
commit 7c05f71e0f
3 changed files with 628 additions and 530 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry] [tool.poetry]
name = "langchain-milvus" name = "langchain-milvus"
version = "0.1.6" version = "0.1.5"
description = "An integration package connecting Milvus and LangChain" description = "An integration package connecting Milvus and LangChain"
authors = [] authors = []
readme = "README.md" readme = "README.md"
@@ -25,10 +25,17 @@ ignore_missing_imports = "True"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-milvus%3D%3D0%22&expanded=true" "Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-milvus%3D%3D0%22&expanded=true"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.9,<4.0" python = ">=3.8.1,<4.0"
langchain-core = "^0.3"
pymilvus = "^2.4.3" pymilvus = "^2.4.3"
[[tool.poetry.dependencies.langchain-core]]
version = ">=0.2.38,<0.4"
python = ">=3.9"
[[tool.poetry.dependencies.langchain-core]]
version = ">=0.2.38,<0.3"
python = "<3.9"
[[tool.poetry.dependencies.scipy]] [[tool.poetry.dependencies.scipy]]
version = "^1.7" version = "^1.7"
python = "<3.12" python = "<3.12"
@@ -73,6 +80,15 @@ pytest-watcher = "^0.3.4"
pytest-asyncio = "^0.21.1" pytest-asyncio = "^0.21.1"
milvus_model = "^0.2.0" milvus_model = "^0.2.0"
[[tool.poetry.group.test.dependencies.langchain-core]]
path = "../../core"
develop = true
python = ">=3.9"
[[tool.poetry.group.test.dependencies.langchain-core]]
version = ">=0.2.38,<0.3"
python = "<3.9"
[tool.poetry.group.codespell.dependencies] [tool.poetry.group.codespell.dependencies]
codespell = "^2.2.0" codespell = "^2.2.0"
@@ -87,14 +103,21 @@ mypy = "^0.991"
types-requests = "^2" types-requests = "^2"
simsimd = "^5.0.0" simsimd = "^5.0.0"
[tool.poetry.group.test.dependencies.langchain-core] [[tool.poetry.group.typing.dependencies.langchain-core]]
path = "../../core" path = "../../core"
develop = true develop = true
python = ">=3.9"
[tool.poetry.group.typing.dependencies.langchain-core] [[tool.poetry.group.typing.dependencies.langchain-core]]
path = "../../core" version = ">=0.2.38,<0.3"
develop = true python = "<3.9"
[tool.poetry.group.dev.dependencies.langchain-core] [tool.poetry.group.dev.dependencies]
[[tool.poetry.group.dev.dependencies.langchain-core]]
path = "../../core" path = "../../core"
develop = true develop = true
python = ">=3.9"
[[tool.poetry.group.dev.dependencies.langchain-core]]
version = ">=0.2.38,<0.3"
python = "<3.9"

View File

@@ -1,5 +1,6 @@
"""Test Milvus functionality.""" """Test Milvus functionality."""
import tempfile
from typing import Any, List, Optional from typing import Any, List, Optional
import pytest import pytest
@@ -13,6 +14,7 @@ from tests.integration_tests.utils import (
fake_texts, fake_texts,
) )
# #
# To run this test properly, please start a Milvus server with the following command: # To run this test properly, please start a Milvus server with the following command:
# #
@@ -24,12 +26,17 @@ from tests.integration_tests.utils import (
# Here is the reference: # Here is the reference:
# https://milvus.io/docs/install_standalone-docker.md # https://milvus.io/docs/install_standalone-docker.md
# #
@pytest.fixture
def temp_milvus_db() -> Any:
with tempfile.NamedTemporaryFile(suffix=".db") as temp_file:
yield temp_file.name
def _milvus_from_texts( def _milvus_from_texts(
metadatas: Optional[List[dict]] = None, metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None, ids: Optional[List[str]] = None,
drop: bool = True, drop: bool = True,
db_path: str = "./milvus_demo.db",
**kwargs: Any, **kwargs: Any,
) -> Milvus: ) -> Milvus:
return Milvus.from_texts( return Milvus.from_texts(
@@ -38,7 +45,7 @@ def _milvus_from_texts(
metadatas=metadatas, metadatas=metadatas,
ids=ids, ids=ids,
# connection_args={"uri": "http://127.0.0.1:19530"}, # connection_args={"uri": "http://127.0.0.1:19530"},
connection_args={"uri": "./milvus_demo.db"}, connection_args={"uri": db_path},
drop_old=drop, drop_old=drop,
consistency_level="Strong", consistency_level="Strong",
**kwargs, **kwargs,
@@ -49,35 +56,37 @@ def _get_pks(expr: str, docsearch: Milvus) -> List[Any]:
return docsearch.get_pks(expr) # type: ignore[return-value] return docsearch.get_pks(expr) # type: ignore[return-value]
def test_milvus() -> None: def test_milvus(temp_milvus_db: Any) -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
docsearch = _milvus_from_texts() docsearch = _milvus_from_texts(db_path=temp_milvus_db)
output = docsearch.similarity_search("foo", k=1) output = docsearch.similarity_search("foo", k=1)
assert_docs_equal_without_pk(output, [Document(page_content="foo")]) assert_docs_equal_without_pk(output, [Document(page_content="foo")])
def test_milvus_vector_search() -> None: def test_milvus_vector_search(temp_milvus_db: Any) -> None:
"""Test end to end construction and search by vector.""" """Test end to end construction and search by vector."""
docsearch = _milvus_from_texts() docsearch = _milvus_from_texts(db_path=temp_milvus_db)
output = docsearch.similarity_search_by_vector( output = docsearch.similarity_search_by_vector(
FakeEmbeddings().embed_query("foo"), k=1 FakeEmbeddings().embed_query("foo"), k=1
) )
assert_docs_equal_without_pk(output, [Document(page_content="foo")]) assert_docs_equal_without_pk(output, [Document(page_content="foo")])
def test_milvus_with_metadata() -> None: def test_milvus_with_metadata(temp_milvus_db: Any) -> None:
"""Test with metadata""" """Test with metadata"""
docsearch = _milvus_from_texts(metadatas=[{"label": "test"}] * len(fake_texts)) docsearch = _milvus_from_texts(
metadatas=[{"label": "test"}] * len(fake_texts), db_path=temp_milvus_db
)
output = docsearch.similarity_search("foo", k=1) output = docsearch.similarity_search("foo", k=1)
assert_docs_equal_without_pk( assert_docs_equal_without_pk(
output, [Document(page_content="foo", metadata={"label": "test"})] output, [Document(page_content="foo", metadata={"label": "test"})]
) )
def test_milvus_with_id() -> None: def test_milvus_with_id(temp_milvus_db: Any) -> None:
"""Test with ids""" """Test with ids"""
ids = ["id_" + str(i) for i in range(len(fake_texts))] ids = ["id_" + str(i) for i in range(len(fake_texts))]
docsearch = _milvus_from_texts(ids=ids) docsearch = _milvus_from_texts(ids=ids, db_path=temp_milvus_db)
output = docsearch.similarity_search("foo", k=1) output = docsearch.similarity_search("foo", k=1)
assert_docs_equal_without_pk(output, [Document(page_content="foo")]) assert_docs_equal_without_pk(output, [Document(page_content="foo")])
@@ -86,16 +95,16 @@ def test_milvus_with_id() -> None:
try: try:
ids = ["dup_id" for _ in fake_texts] ids = ["dup_id" for _ in fake_texts]
_milvus_from_texts(ids=ids) _milvus_from_texts(ids=ids, db_path=temp_milvus_db)
except Exception as e: except Exception as e:
assert isinstance(e, AssertionError) assert isinstance(e, AssertionError)
def test_milvus_with_score() -> None: def test_milvus_with_score(temp_milvus_db: Any) -> None:
"""Test end to end construction and search with scores and IDs.""" """Test end to end construction and search with scores and IDs."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))] metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas) docsearch = _milvus_from_texts(metadatas=metadatas, db_path=temp_milvus_db)
output = docsearch.similarity_search_with_score("foo", k=3) output = docsearch.similarity_search_with_score("foo", k=3)
docs = [o[0] for o in output] docs = [o[0] for o in output]
scores = [o[1] for o in output] scores = [o[1] for o in output]
@@ -110,11 +119,11 @@ def test_milvus_with_score() -> None:
assert scores[0] < scores[1] < scores[2] assert scores[0] < scores[1] < scores[2]
def test_milvus_max_marginal_relevance_search() -> None: def test_milvus_max_marginal_relevance_search(temp_milvus_db: Any) -> None:
"""Test end to end construction and MRR search.""" """Test end to end construction and MRR search."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))] metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas) docsearch = _milvus_from_texts(metadatas=metadatas, db_path=temp_milvus_db)
output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3) output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3)
assert_docs_equal_without_pk( assert_docs_equal_without_pk(
output, output,
@@ -125,11 +134,15 @@ def test_milvus_max_marginal_relevance_search() -> None:
) )
def test_milvus_max_marginal_relevance_search_with_dynamic_field() -> None: def test_milvus_max_marginal_relevance_search_with_dynamic_field(
temp_milvus_db: Any,
) -> None:
"""Test end to end construction and MRR search with enabling dynamic field.""" """Test end to end construction and MRR search with enabling dynamic field."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))] metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas, enable_dynamic_field=True) docsearch = _milvus_from_texts(
metadatas=metadatas, enable_dynamic_field=True, db_path=temp_milvus_db
)
output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3) output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3)
assert_docs_equal_without_pk( assert_docs_equal_without_pk(
output, output,
@@ -140,11 +153,11 @@ def test_milvus_max_marginal_relevance_search_with_dynamic_field() -> None:
) )
def test_milvus_add_extra() -> None: def test_milvus_add_extra(temp_milvus_db: Any) -> None:
"""Test end to end construction and MRR search.""" """Test end to end construction and MRR search."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))] metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas) docsearch = _milvus_from_texts(metadatas=metadatas, db_path=temp_milvus_db)
docsearch.add_texts(texts, metadatas) docsearch.add_texts(texts, metadatas)
@@ -152,45 +165,47 @@ def test_milvus_add_extra() -> None:
assert len(output) == 6 assert len(output) == 6
def test_milvus_no_drop() -> None: def test_milvus_no_drop(temp_milvus_db: Any) -> None:
"""Test construction without dropping old data.""" """Test construction without dropping old data."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))] metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas) docsearch = _milvus_from_texts(metadatas=metadatas, db_path=temp_milvus_db)
del docsearch del docsearch
docsearch = _milvus_from_texts(metadatas=metadatas, drop=False) docsearch = _milvus_from_texts(
metadatas=metadatas, drop=False, db_path=temp_milvus_db
)
output = docsearch.similarity_search("foo", k=10) output = docsearch.similarity_search("foo", k=10)
assert len(output) == 6 assert len(output) == 6
def test_milvus_get_pks() -> None: def test_milvus_get_pks(temp_milvus_db: Any) -> None:
"""Test end to end construction and get pks with expr""" """Test end to end construction and get pks with expr"""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"id": i} for i in range(len(texts))] metadatas = [{"id": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas) docsearch = _milvus_from_texts(metadatas=metadatas, db_path=temp_milvus_db)
expr = "id in [1,2]" expr = "id in [1,2]"
output = _get_pks(expr, docsearch) output = _get_pks(expr, docsearch)
assert len(output) == 2 assert len(output) == 2
def test_milvus_delete_entities() -> None: def test_milvus_delete_entities(temp_milvus_db: Any) -> None:
"""Test end to end construction and delete entities""" """Test end to end construction and delete entities"""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"id": i} for i in range(len(texts))] metadatas = [{"id": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas) docsearch = _milvus_from_texts(metadatas=metadatas, db_path=temp_milvus_db)
expr = "id in [1,2]" expr = "id in [1,2]"
pks = _get_pks(expr, docsearch) pks = _get_pks(expr, docsearch)
result = docsearch.delete(pks) result = docsearch.delete(pks)
assert result.delete_count == 2 # type: ignore[attr-defined] assert result.delete_count == 2 # type: ignore[attr-defined]
def test_milvus_upsert_entities() -> None: def test_milvus_upsert_entities(temp_milvus_db: Any) -> None:
"""Test end to end construction and upsert entities""" """Test end to end construction and upsert entities"""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"id": i} for i in range(len(texts))] metadatas = [{"id": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas) docsearch = _milvus_from_texts(metadatas=metadatas, db_path=temp_milvus_db)
expr = "id in [1,2]" expr = "id in [1,2]"
pks = _get_pks(expr, docsearch) pks = _get_pks(expr, docsearch)
documents = [ documents = [
@@ -201,11 +216,13 @@ def test_milvus_upsert_entities() -> None:
assert len(ids) == 2 # type: ignore[arg-type] assert len(ids) == 2 # type: ignore[arg-type]
def test_milvus_enable_dynamic_field() -> None: def test_milvus_enable_dynamic_field(temp_milvus_db: Any) -> None:
"""Test end to end construction and enable dynamic field""" """Test end to end construction and enable dynamic field"""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"id": i} for i in range(len(texts))] metadatas = [{"id": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas, enable_dynamic_field=True) docsearch = _milvus_from_texts(
metadatas=metadatas, enable_dynamic_field=True, db_path=temp_milvus_db
)
output = docsearch.similarity_search("foo", k=10) output = docsearch.similarity_search("foo", k=10)
assert len(output) == 3 assert len(output) == 3
@@ -223,11 +240,13 @@ def test_milvus_enable_dynamic_field() -> None:
} }
def test_milvus_disable_dynamic_field() -> None: def test_milvus_disable_dynamic_field(temp_milvus_db: Any) -> None:
"""Test end to end construction and disable dynamic field""" """Test end to end construction and disable dynamic field"""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"id": i} for i in range(len(texts))] metadatas = [{"id": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas, enable_dynamic_field=False) docsearch = _milvus_from_texts(
metadatas=metadatas, enable_dynamic_field=False, db_path=temp_milvus_db
)
output = docsearch.similarity_search("foo", k=10) output = docsearch.similarity_search("foo", k=10)
assert len(output) == 3 assert len(output) == 3
# ["pk", "text", "vector", "id"] # ["pk", "text", "vector", "id"]
@@ -255,11 +274,13 @@ def test_milvus_disable_dynamic_field() -> None:
docsearch.add_texts(texts, new_metadatas) docsearch.add_texts(texts, new_metadatas)
def test_milvus_metadata_field() -> None: def test_milvus_metadata_field(temp_milvus_db: Any) -> None:
"""Test end to end construction and use metadata field""" """Test end to end construction and use metadata field"""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"id": i} for i in range(len(texts))] metadatas = [{"id": i} for i in range(len(texts))]
docsearch = _milvus_from_texts(metadatas=metadatas, metadata_field="metadata") docsearch = _milvus_from_texts(
metadatas=metadatas, metadata_field="metadata", db_path=temp_milvus_db
)
output = docsearch.similarity_search("foo", k=10) output = docsearch.similarity_search("foo", k=10)
assert len(output) == 3 assert len(output) == 3
@@ -277,7 +298,7 @@ def test_milvus_metadata_field() -> None:
} }
def test_milvus_enable_dynamic_field_with_partition_key() -> None: def test_milvus_enable_dynamic_field_with_partition_key(temp_milvus_db: Any) -> None:
""" """
Test end to end construction and enable dynamic field Test end to end construction and enable dynamic field
with partition_key_field with partition_key_field
@@ -286,7 +307,10 @@ def test_milvus_enable_dynamic_field_with_partition_key() -> None:
metadatas = [{"id": i, "namespace": f"name_{i}"} for i in range(len(texts))] metadatas = [{"id": i, "namespace": f"name_{i}"} for i in range(len(texts))]
docsearch = _milvus_from_texts( docsearch = _milvus_from_texts(
metadatas=metadatas, enable_dynamic_field=True, partition_key_field="namespace" metadatas=metadatas,
enable_dynamic_field=True,
partition_key_field="namespace",
db_path=temp_milvus_db,
) )
# filter on a single namespace # filter on a single namespace
@@ -318,19 +342,27 @@ def test_milvus_sparse_embeddings() -> None:
"in a surreal world of nightmares and illusions, where the boundaries between " "in a surreal world of nightmares and illusions, where the boundaries between "
"reality and fantasy blur.", "reality and fantasy blur.",
] ]
sparse_embedding_func = BM25SparseEmbedding(corpus=texts) try:
docsearch = Milvus.from_texts( sparse_embedding_func = BM25SparseEmbedding(corpus=texts)
embedding=sparse_embedding_func, except LookupError:
texts=texts, import nltk # type: ignore[import]
connection_args={"uri": "./milvus_demo.db"},
drop_old=True,
)
output = docsearch.similarity_search("Pilgrim", k=1) nltk.download("punkt_tab")
sparse_embedding_func = BM25SparseEmbedding(corpus=texts)
with tempfile.NamedTemporaryFile(suffix=".db") as temp_db:
docsearch = Milvus.from_texts(
embedding=sparse_embedding_func,
texts=texts,
connection_args={"uri": temp_db.name},
drop_old=True,
)
output = docsearch.similarity_search("Pilgrim", k=1)
assert "Pilgrim" in output[0].page_content assert "Pilgrim" in output[0].page_content
def test_milvus_array_field() -> None: def test_milvus_array_field(temp_milvus_db: Any) -> None:
"""Manually specify metadata schema, including an array_field. """Manually specify metadata schema, including an array_field.
For more information about array data type and filtering, please refer to For more information about array data type and filtering, please refer to
https://milvus.io/docs/array_data_type.md https://milvus.io/docs/array_data_type.md
@@ -353,6 +385,7 @@ def test_milvus_array_field() -> None:
# "dtype": DataType.INT64, # "dtype": DataType.INT64,
# } # }
}, },
db_path=temp_milvus_db,
) )
output = docsearch.similarity_search("foo", k=10, expr="array_field[0] < 2") output = docsearch.similarity_search("foo", k=10, expr="array_field[0] < 2")
assert len(output) == 2 assert len(output) == 2
@@ -366,6 +399,7 @@ def test_milvus_array_field() -> None:
docsearch = _milvus_from_texts( docsearch = _milvus_from_texts(
enable_dynamic_field=True, enable_dynamic_field=True,
metadatas=metadatas, metadatas=metadatas,
db_path=temp_milvus_db,
) )
output = docsearch.similarity_search("foo", k=10, expr="array_field[0] < 2") output = docsearch.similarity_search("foo", k=10, expr="array_field[0] < 2")
assert len(output) == 2 assert len(output) == 2