Mirror of https://github.com/hwchase17/langchain.git (synced 2025-07-13 00:16:01 +00:00)
IMPROVEMENT Increase flexibility of ElasticVectorSearch (#6863)
Hey @rlancemartin, @eyurtsev, I did some minimal changes to the `ElasticVectorSearch` client so that it plays better with existing ES indices. The main changes are:

1. You can pass the dense vector field name into `_default_script_query`.
2. You can pass a custom script query implementation and its parameters to `similarity_search_with_score`.
3. You can pass functions for building the page content and metadata of the resulting `Document`.
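For illustration, a minimal sketch of how these hooks can be combined from the caller's side, based on the `custom_query` and `doc_builder` parameters visible in the diff below. The store instance `db`, the Elasticsearch field names (`content`, `filename`), and the assumption that `similarity_search` forwards these keyword arguments are illustrative assumptions, not part of the change.

from typing import Dict, Optional

from langchain.docstore.document import Document

def knn_only_query(query_body: Dict, query: Optional[str]) -> Dict:
    # Hypothetical custom_query hook: receives the query body LangChain built
    # plus the raw query string and returns the body actually sent to
    # Elasticsearch. Here we keep only the "knn" clause if one is present.
    return {"knn": query_body["knn"]} if "knn" in query_body else query_body

def doc_from_hit(hit: Dict) -> Document:
    # Hypothetical doc_builder hook: maps a raw Elasticsearch hit to a Document.
    src = hit.get("_source", {})
    return Document(
        page_content=src.get("content", ""),         # "content" is an assumed field name
        metadata={"filename": src.get("filename")},  # "filename" is an assumed field name
    )

# `db` is assumed to be an already initialised ElasticsearchStore instance.
results = db.similarity_search(
    "What did the president say about Ketanji Brown Jackson",
    k=4,
    custom_query=knn_only_query,
    doc_builder=doc_from_hit,
)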
parent 39852dffd2
commit 9b4974871d
@@ -776,6 +776,40 @@
     "print(results[0])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Customize the Document Builder\n",
+    "\n",
+    "With the ```doc_builder``` parameter at search time, you can adjust how a Document is built from the data retrieved from Elasticsearch. This is especially useful if you have indices that were not created using Langchain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Dict\n",
+    "from langchain.docstore.document import Document\n",
+    "\n",
+    "def custom_document_builder(hit: Dict) -> Document:\n",
+    "    src = hit.get(\"_source\", {})\n",
+    "    return Document(\n",
+    "        page_content=src.get(\"content\", \"Missing content!\"),\n",
+    "        metadata={\"page_number\": src.get(\"page_number\", -1), \"original_filename\": src.get(\"original_filename\", \"Missing filename!\")},\n",
+    "    )\n",
+    "\n",
+    "results = db.similarity_search(\n",
+    "    \"What did the president say about Ketanji Brown Jackson\",\n",
+    "    k=4,\n",
+    "    doc_builder=custom_document_builder,\n",
+    ")\n",
+    "print(\"Results:\")\n",
+    "print(results[0])"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "3242fd42",
@@ -929,7 +963,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.3"
+   "version": "3.9.7"
   }
  },
  "nbformat": 4,
@@ -727,6 +727,8 @@ class ElasticsearchStore(VectorStore):
         fields: Optional[List[str]] = None,
         filter: Optional[List[dict]] = None,
         custom_query: Optional[Callable[[Dict, Union[str, None]], Dict]] = None,
+        doc_builder: Optional[Callable[[Dict], Document]] = None,
+        **kwargs: Any,
     ) -> List[Tuple[Document, float]]:
         """Return Elasticsearch documents most similar to query, along with scores.
 
@@ -781,6 +783,14 @@ class ElasticsearchStore(VectorStore):
             source=fields,
         )
 
+        def default_doc_builder(hit: Dict) -> Document:
+            return Document(
+                page_content=hit["_source"].get(self.query_field, ""),
+                metadata=hit["_source"]["metadata"],
+            )
+
+        doc_builder = doc_builder or default_doc_builder
+
         docs_and_scores = []
         for hit in response["hits"]["hits"]:
             for field in fields:
@@ -792,10 +802,7 @@ class ElasticsearchStore(VectorStore):
 
             docs_and_scores.append(
                 (
-                    Document(
-                        page_content=hit["_source"].get(self.query_field, ""),
-                        metadata=hit["_source"]["metadata"],
-                    ),
+                    doc_builder(hit),
                     hit["_score"],
                 )
             )
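Read together, the two hunks above make the hit-to-Document mapping pluggable: when no `doc_builder` is supplied, the previous behaviour is preserved via `default_doc_builder`; when one is supplied, it fully controls `page_content` and `metadata`. A standalone sketch with a hypothetical hit payload (field names are assumed, not taken from the change):

from typing import Dict

from langchain.docstore.document import Document

# Hypothetical raw hit as returned by Elasticsearch.
hit: Dict = {
    "_source": {"text": "foo", "metadata": {"page": 1}, "page_number": 7},
    "_score": 0.87,
}

def default_doc_builder(hit: Dict) -> Document:
    # Mirrors the fallback added above, with query_field fixed to "text" for this sketch.
    return Document(
        page_content=hit["_source"].get("text", ""),
        metadata=hit["_source"]["metadata"],
    )

def custom_doc_builder(hit: Dict) -> Document:
    # A caller-supplied builder can surface any field of the hit instead.
    src = hit["_source"]
    return Document(page_content=src["text"], metadata={"page_number": src["page_number"]})

print(default_doc_builder(hit))  # page_content='foo', metadata={'page': 1}
print(custom_doc_builder(hit))   # page_content='foo', metadata={'page_number': 7}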
@@ -254,6 +254,35 @@ class TestElasticsearch:
         )
         assert output == [Document(page_content="foo", metadata={"page": 1})]
 
+    def test_similarity_search_with_doc_builder(
+        self, elasticsearch_connection: dict, index_name: str
+    ) -> None:
+        texts = ["foo", "foo", "foo"]
+        metadatas = [{"page": i} for i in range(len(texts))]
+        docsearch = ElasticsearchStore.from_texts(
+            texts,
+            FakeEmbeddings(),
+            metadatas=metadatas,
+            **elasticsearch_connection,
+            index_name=index_name,
+        )
+
+        def custom_document_builder(_: Dict) -> Document:
+            return Document(
+                page_content="Mock content!",
+                metadata={
+                    "page_number": -1,
+                    "original_filename": "Mock filename!",
+                },
+            )
+
+        output = docsearch.similarity_search(
+            query="foo", k=1, doc_builder=custom_document_builder
+        )
+        assert output[0].page_content == "Mock content!"
+        assert output[0].metadata["page_number"] == -1
+        assert output[0].metadata["original_filename"] == "Mock filename!"
+
     def test_similarity_search_exact_search(
         self, elasticsearch_connection: dict, index_name: str
     ) -> None: