IMPROVEMENT Increase flexibility of ElasticVectorSearch (#6863)
Hey @rlancemartin, @eyurtsev, I made some minimal changes to the `ElasticVectorSearch` client so that it plays better with existing ES indices. The main changes are:
1. You can pass the dense vector field name into `_default_script_query`.
2. You can pass a custom script query implementation, and its parameters, to `similarity_search_with_score`.
3. You can pass functions for building the page content and metadata of the resulting `Document`.
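The `doc_builder` hook (change 3) is demonstrated in the notebook cell added by this diff; nothing in the diff shows a caller for the custom query hook (change 2), so the following is a minimal, hypothetical sketch based on its `Callable[[Dict, Union[str, None]], Dict]` annotation. The store instance `db`, the `plain_match_query` name, and the `"content"` field are illustrative assumptions, not part of this change.

    from typing import Dict, Union

    def plain_match_query(query_body: Dict, query: Union[str, None]) -> Dict:
        # The hook receives the query body generated by the store plus the raw
        # query string, and returns the body that is actually sent to
        # Elasticsearch. Here: a plain BM25 match on an assumed "content" field.
        return {"query": {"match": {"content": query}}}

    docs_and_scores = db.similarity_search_with_score(
        "What did the president say about Ketanji Brown Jackson",
        custom_query=plain_match_query,
    )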
This commit is contained in: parent 39852dffd2, commit 9b4974871d
@@ -776,6 +776,40 @@
     "print(results[0])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Customize the Document Builder\n",
+    "\n",
"With ```doc_builder``` parameter at search, you are able to adjust how a Document is being built using data retrieved from Elasticsearch. This is especially useful if you have indices which were not created using Langchain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Dict\n",
+    "from langchain.docstore.document import Document\n",
+    "\n",
+    "def custom_document_builder(hit: Dict) -> Document:\n",
+    "    src = hit.get(\"_source\", {})\n",
+    "    return Document(\n",
+    "        page_content=src.get(\"content\", \"Missing content!\"),\n",
+    "        metadata={\"page_number\": src.get(\"page_number\", -1), \"original_filename\": src.get(\"original_filename\", \"Missing filename!\")},\n",
+    "    )\n",
+    "\n",
+    "results = db.similarity_search(\n",
+    "    \"What did the president say about Ketanji Brown Jackson\",\n",
+    "    k=4,\n",
+    "    doc_builder=custom_document_builder,\n",
+    ")\n",
+    "print(\"Results:\")\n",
+    "print(results[0])"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "3242fd42",
@@ -929,7 +963,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.3"
+   "version": "3.9.7"
   }
  },
  "nbformat": 4,
@@ -727,6 +727,8 @@ class ElasticsearchStore(VectorStore):
         fields: Optional[List[str]] = None,
         filter: Optional[List[dict]] = None,
+        custom_query: Optional[Callable[[Dict, Union[str, None]], Dict]] = None,
+        doc_builder: Optional[Callable[[Dict], Document]] = None,
         **kwargs: Any,
     ) -> List[Tuple[Document, float]]:
         """Return Elasticsearch documents most similar to query, along with scores.

@@ -781,6 +783,14 @@ class ElasticsearchStore(VectorStore):
             source=fields,
         )

+        def default_doc_builder(hit: Dict) -> Document:
+            return Document(
+                page_content=hit["_source"].get(self.query_field, ""),
+                metadata=hit["_source"]["metadata"],
+            )
+
+        doc_builder = doc_builder or default_doc_builder
+
         docs_and_scores = []
         for hit in response["hits"]["hits"]:
             for field in fields:
@@ -792,10 +802,7 @@ class ElasticsearchStore(VectorStore):

             docs_and_scores.append(
                 (
-                    Document(
-                        page_content=hit["_source"].get(self.query_field, ""),
-                        metadata=hit["_source"]["metadata"],
-                    ),
+                    doc_builder(hit),
                     hit["_score"],
                 )
             )
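The `default_doc_builder` fallback above reproduces the old inline `Document` construction (reading `self.query_field` and the `metadata` object from `_source`), so existing callers are unaffected. For an index that was not created by LangChain, a builder can instead map whatever `_source` fields exist; a rough sketch, assuming an existing store instance `db` and a hypothetical `body_text` field:

    from typing import Dict

    from langchain.docstore.document import Document

    def legacy_index_doc_builder(hit: Dict) -> Document:
        # "hit" is the raw Elasticsearch hit; "body_text" is a hypothetical
        # field name for an index that predates LangChain.
        src = dict(hit.get("_source", {}))
        text = src.pop("body_text", "")
        # Keep every remaining _source field as Document metadata instead of
        # expecting a LangChain-style "metadata" object.
        return Document(page_content=text, metadata=src)

    docs_and_scores = db.similarity_search_with_score(
        "example query",
        doc_builder=legacy_index_doc_builder,
    )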
@@ -254,6 +254,35 @@ class TestElasticsearch:
         )
         assert output == [Document(page_content="foo", metadata={"page": 1})]

+    def test_similarity_search_with_doc_builder(
+        self, elasticsearch_connection: dict, index_name: str
+    ) -> None:
+        texts = ["foo", "foo", "foo"]
+        metadatas = [{"page": i} for i in range(len(texts))]
+        docsearch = ElasticsearchStore.from_texts(
+            texts,
+            FakeEmbeddings(),
+            metadatas=metadatas,
+            **elasticsearch_connection,
+            index_name=index_name,
+        )
+
+        def custom_document_builder(_: Dict) -> Document:
+            return Document(
+                page_content="Mock content!",
+                metadata={
+                    "page_number": -1,
+                    "original_filename": "Mock filename!",
+                },
+            )
+
+        output = docsearch.similarity_search(
+            query="foo", k=1, doc_builder=custom_document_builder
+        )
+        assert output[0].page_content == "Mock content!"
+        assert output[0].metadata["page_number"] == -1
+        assert output[0].metadata["original_filename"] == "Mock filename!"
+
     def test_similarity_search_exact_search(
         self, elasticsearch_connection: dict, index_name: str
     ) -> None: