From 95883a99a9ee9010e76f72b614c138bfd42500e7 Mon Sep 17 00:00:00 2001
From: Gabriele Ghisleni <74197369+GabrieleGhisleni@users.noreply.github.com>
Date: Thu, 6 Jun 2024 16:36:43 +0200
Subject: [PATCH] docs: ElasticsearchCacheStore in stores integrations
 documentation (#22612)

The package for LangChain integrations with Elasticsearch
https://github.com/langchain-ai/langchain-elastic contains an
Elasticsearch byte store cache integration (see
https://github.com/langchain-ai/langchain-elastic/pull/27). This is the
documentation contribution on the page dedicated to stores integrations.

Co-authored-by: Gabriele Ghisleni
---
 .../integrations/stores/elasticsearch.ipynb | 139 ++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 docs/docs/integrations/stores/elasticsearch.ipynb

diff --git a/docs/docs/integrations/stores/elasticsearch.ipynb b/docs/docs/integrations/stores/elasticsearch.ipynb
new file mode 100644
index 00000000000..f55919c23d6
--- /dev/null
+++ b/docs/docs/integrations/stores/elasticsearch.ipynb
@@ -0,0 +1,139 @@
{
 "cells": [
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "---\n",
    "sidebar_label: Elasticsearch\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ElasticsearchEmbeddingsCache\n",
    "\n",
    "The `ElasticsearchEmbeddingsCache` is a `ByteStore` implementation that uses your Elasticsearch instance for efficient storage and retrieval of embeddings.\n",
    "\n",
    "First, install the LangChain integration with Elasticsearch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -U langchain-elasticsearch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "It can be instantiated and wired into an embedder using the `CacheBackedEmbeddings.from_bytes_store` method."
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.embeddings import CacheBackedEmbeddings\n",
    "from langchain_elasticsearch import ElasticsearchEmbeddingsCache\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "\n",
    "underlying_embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
    "\n",
    "store = ElasticsearchEmbeddingsCache(\n",
    "    es_url=\"http://localhost:9200\",\n",
    "    index_name=\"llm-chat-cache\",\n",
    "    metadata={\"project\": \"my_chatgpt_project\"},\n",
    "    namespace=\"my_chatgpt_project\",\n",
    ")\n",
    "\n",
    "embeddings = CacheBackedEmbeddings.from_bytes_store(\n",
    "    underlying_embeddings=underlying_embeddings,\n",
    "    document_embedding_cache=store,\n",
    "    query_embedding_cache=store,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `index_name` parameter can also accept aliases. This makes it possible to use [ILM: Manage the index lifecycle](https://www.elastic.co/guide/en/elasticsearch/reference/current/index-lifecycle-management.html), which we suggest considering for managing retention and controlling cache growth.\n",
    "\n",
    "Look at the class docstring for all parameters."
   ]
  },
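  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a rough sketch of that setup (the policy, index, and alias names below are illustrative assumptions, not part of `langchain-elasticsearch`), you could create an ILM policy and a rollover alias with the official `elasticsearch` Python client, then pass the alias as `index_name`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from elasticsearch import Elasticsearch\n",
    "\n",
    "es = Elasticsearch(\"http://localhost:9200\")\n",
    "\n",
    "# Illustrative policy: roll the backing index over weekly and delete\n",
    "# rolled-over indices (i.e. old cache entries) after 30 days.\n",
    "es.ilm.put_lifecycle(\n",
    "    name=\"llm-chat-cache-policy\",\n",
    "    policy={\n",
    "        \"phases\": {\n",
    "            \"hot\": {\"actions\": {\"rollover\": {\"max_age\": \"7d\"}}},\n",
    "            \"delete\": {\"min_age\": \"30d\", \"actions\": {\"delete\": {}}},\n",
    "        }\n",
    "    },\n",
    ")\n",
    "\n",
    "# First backing index, with a write alias managed by the policy.\n",
    "es.indices.create(\n",
    "    index=\"llm-chat-cache-000001\",\n",
    "    aliases={\"llm-chat-cache\": {\"is_write_index\": True}},\n",
    "    settings={\n",
    "        \"index.lifecycle.name\": \"llm-chat-cache-policy\",\n",
    "        \"index.lifecycle.rollover_alias\": \"llm-chat-cache\",\n",
    "    },\n",
    ")\n",
    "\n",
    "# The alias \"llm-chat-cache\" can now be passed as index_name above."
   ]
  },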
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any, Dict, List\n", + "\n", + "from langchain_elasticsearch import ElasticsearchEmbeddingsCache\n", + "\n", + "\n", + "class SearchableElasticsearchStore(ElasticsearchEmbeddingsCache):\n", + " @property\n", + " def mapping(self) -> Dict[str, Any]:\n", + " mapping = super().mapping\n", + " mapping[\"mappings\"][\"properties\"][\"vector\"] = {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 1536,\n", + " \"index\": True,\n", + " \"similarity\": \"dot_product\",\n", + " }\n", + " return mapping\n", + "\n", + " def build_document(self, llm_input: str, vector: List[float]) -> Dict[str, Any]:\n", + " body = super().build_document(llm_input, vector)\n", + " body[\"vector\"] = vector\n", + " return body" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "When overriding the mapping and the document building, please only make additive modifications, keeping the base mapping intact." + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}