diff --git a/docs/docs/integrations/vectorstores/elasticsearch.ipynb b/docs/docs/integrations/vectorstores/elasticsearch.ipynb index cc772b4d6c6..93343a499cc 100644 --- a/docs/docs/integrations/vectorstores/elasticsearch.ipynb +++ b/docs/docs/integrations/vectorstores/elasticsearch.ipynb @@ -776,6 +776,40 @@ "print(results[0])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Customize the Document Builder\n", + "\n", + "With the ```doc_builder``` parameter at search time, you can adjust how a Document is built from the data retrieved from Elasticsearch. This is especially useful if you have indices that were not created using LangChain." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "from langchain.docstore.document import Document\n", + "\n", + "def custom_document_builder(hit: Dict) -> Document:\n", + " src = hit.get(\"_source\", {})\n", + " return Document(\n", + " page_content=src.get(\"content\", \"Missing content!\"),\n", + " metadata={\"page_number\": src.get(\"page_number\", -1), \"original_filename\": src.get(\"original_filename\", \"Missing filename!\")},\n", + " )\n", + "\n", + "results = db.similarity_search(\n", + " \"What did the president say about Ketanji Brown Jackson\",\n", + " k=4,\n", + " doc_builder=custom_document_builder,\n", + ")\n", + "print(\"Results:\")\n", + "print(results[0])" + ] + }, { "cell_type": "markdown", "id": "3242fd42", @@ -929,7 +963,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.3" + "version": "3.9.7" } }, "nbformat": 4, diff --git a/libs/langchain/langchain/vectorstores/elasticsearch.py b/libs/langchain/langchain/vectorstores/elasticsearch.py index 3210cf4f9ea..a635cdea40a 100644 --- a/libs/langchain/langchain/vectorstores/elasticsearch.py +++ b/libs/langchain/langchain/vectorstores/elasticsearch.py @@ -727,6 +727,8 @@ class 
ElasticsearchStore(VectorStore): fields: Optional[List[str]] = None, filter: Optional[List[dict]] = None, custom_query: Optional[Callable[[Dict, Union[str, None]], Dict]] = None, + doc_builder: Optional[Callable[[Dict], Document]] = None, + **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return Elasticsearch documents most similar to query, along with scores. @@ -781,6 +783,14 @@ class ElasticsearchStore(VectorStore): source=fields, ) + def default_doc_builder(hit: Dict) -> Document: + return Document( + page_content=hit["_source"].get(self.query_field, ""), + metadata=hit["_source"]["metadata"], + ) + + doc_builder = doc_builder or default_doc_builder + docs_and_scores = [] for hit in response["hits"]["hits"]: for field in fields: @@ -792,10 +802,7 @@ class ElasticsearchStore(VectorStore): docs_and_scores.append( ( - Document( - page_content=hit["_source"].get(self.query_field, ""), - metadata=hit["_source"]["metadata"], - ), + doc_builder(hit), hit["_score"], ) ) diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_elasticsearch.py b/libs/langchain/tests/integration_tests/vectorstores/test_elasticsearch.py index 2bca8c4a7b8..3c597a827c9 100644 --- a/libs/langchain/tests/integration_tests/vectorstores/test_elasticsearch.py +++ b/libs/langchain/tests/integration_tests/vectorstores/test_elasticsearch.py @@ -254,6 +254,35 @@ class TestElasticsearch: ) assert output == [Document(page_content="foo", metadata={"page": 1})] + def test_similarity_search_with_doc_builder( + self, elasticsearch_connection: dict, index_name: str + ) -> None: + texts = ["foo", "foo", "foo"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = ElasticsearchStore.from_texts( + texts, + FakeEmbeddings(), + metadatas=metadatas, + **elasticsearch_connection, + index_name=index_name, + ) + + def custom_document_builder(_: Dict) -> Document: + return Document( + page_content="Mock content!", + metadata={ + "page_number": -1, + "original_filename": "Mock 
filename!", + }, + ) + + output = docsearch.similarity_search( + query="foo", k=1, doc_builder=custom_document_builder + ) + assert output[0].page_content == "Mock content!" + assert output[0].metadata["page_number"] == -1 + assert output[0].metadata["original_filename"] == "Mock filename!" + def test_similarity_search_exact_search( self, elasticsearch_connection: dict, index_name: str ) -> None: