From b0a298894d9a74eef86d6a197ed534f1e7de0506 Mon Sep 17 00:00:00 2001 From: ScriptShi Date: Sat, 14 Dec 2024 03:17:28 +0800 Subject: [PATCH] community[minor]: Add TablestoreVectorStore (#25767) Thank you for contributing to LangChain! - [x] **PR title**: community: add TablestoreVectorStore - [x] **PR message**: - **Description:** add TablestoreVectorStore - **Dependencies:** none - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration: yes 2. an example notebook showing its use: yes If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur --- .../integrations/providers/alibaba_cloud.mdx | 8 + .../vectorstores/tablestore.ipynb | 385 ++++++++++++ .../vectorstores/__init__.py | 5 + .../vectorstores/tablestore.py | 564 ++++++++++++++++++ .../vectorstores/test_tablestore.py | 93 +++ .../unit_tests/vectorstores/test_imports.py | 1 + .../vectorstores/test_indexing_docs.py | 1 + 7 files changed, 1057 insertions(+) create mode 100644 docs/docs/integrations/vectorstores/tablestore.ipynb create mode 100644 libs/community/langchain_community/vectorstores/tablestore.py create mode 100644 libs/community/tests/integration_tests/vectorstores/test_tablestore.py diff --git a/docs/docs/integrations/providers/alibaba_cloud.mdx b/docs/docs/integrations/providers/alibaba_cloud.mdx index 74c3045a642..baf0fe9fe07 100644 --- a/docs/docs/integrations/providers/alibaba_cloud.mdx +++ b/docs/docs/integrations/providers/alibaba_cloud.mdx @@ -89,3 +89,11 @@ See [installation instructions and a usage example](/docs/integrations/vectorsto ```python from langchain_community.vectorstores import Hologres ``` + +### Tablestore + +See [installation instructions and a usage example](/docs/integrations/vectorstores/tablestore). 
+ +```python +from langchain_community.vectorstores import TablestoreVectorStore +``` \ No newline at end of file diff --git a/docs/docs/integrations/vectorstores/tablestore.ipynb b/docs/docs/integrations/vectorstores/tablestore.ipynb new file mode 100644 index 00000000000..0c82261d97f --- /dev/null +++ b/docs/docs/integrations/vectorstores/tablestore.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# TablestoreVectorStore\n", + "\n", + "> [Tablestore](https://www.aliyun.com/product/ots) is a fully managed NoSQL cloud database service that enables storage of a massive amount of structured\n", + "and semi-structured data.\n", + "\n", + "This notebook shows how to use functionality related to the `Tablestore` vector database.\n", + "\n", + "To use Tablestore, you must create an instance.\n", + "Here are the [creating instance instructions](https://help.aliyun.com/zh/tablestore/getting-started/manage-the-wide-column-model-in-the-tablestore-console)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": "## Setup" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade --quiet langchain-community tablestore" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Initialization" + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-20T11:10:04.469458Z", + "start_time": "2024-08-20T11:09:49.541150Z" + }, + "pycharm": { + "is_executing": true, + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "os.environ[\"end_point\"] = getpass.getpass(\"Tablestore end_point:\")\n", + "os.environ[\"instance_name\"] = getpass.getpass(\"Tablestore instance_name:\")\n", + "os.environ[\"access_key_id\"] = getpass.getpass(\"Tablestore access_key_id:\")\n", + "os.environ[\"access_key_secret\"] = getpass.getpass(\"Tablestore access_key_secret:\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "Create vector store. 
" + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-20T11:10:07.911086Z", + "start_time": "2024-08-20T11:10:07.351293Z" + } + }, + "outputs": [], + "source": [ + "import tablestore\n", + "from langchain_community.embeddings import FakeEmbeddings\n", + "from langchain_community.vectorstores import TablestoreVectorStore\n", + "from langchain_core.documents import Document\n", + "\n", + "test_embedding_dimension_size = 4\n", + "embeddings = FakeEmbeddings(size=test_embedding_dimension_size)\n", + "\n", + "store = TablestoreVectorStore(\n", + " embedding=embeddings,\n", + " endpoint=os.getenv(\"end_point\"),\n", + " instance_name=os.getenv(\"instance_name\"),\n", + " access_key_id=os.getenv(\"access_key_id\"),\n", + " access_key_secret=os.getenv(\"access_key_secret\"),\n", + " vector_dimension=test_embedding_dimension_size,\n", + " # metadata mapping is used to filter non-vector fields.\n", + " metadata_mappings=[\n", + " tablestore.FieldSchema(\n", + " \"type\", tablestore.FieldType.KEYWORD, index=True, enable_sort_and_agg=True\n", + " ),\n", + " tablestore.FieldSchema(\n", + " \"time\", tablestore.FieldType.LONG, index=True, enable_sort_and_agg=True\n", + " ),\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Manage vector store" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "Create table and index." + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-20T11:10:10.875422Z", + "start_time": "2024-08-20T11:10:10.566400Z" + } + }, + "outputs": [], + "source": [ + "store.create_table_if_not_exist()\n", + "store.create_search_index_if_not_exist()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "Add documents." 
+ }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-20T11:10:14.974253Z", + "start_time": "2024-08-20T11:10:14.894190Z" + }, + "pycharm": { + "is_executing": true, + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['1', '2', '3', '4', '5']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "store.add_documents(\n", + " [\n", + " Document(\n", + " id=\"1\", page_content=\"1 hello world\", metadata={\"type\": \"pc\", \"time\": 2000}\n", + " ),\n", + " Document(\n", + " id=\"2\", page_content=\"abc world\", metadata={\"type\": \"pc\", \"time\": 2009}\n", + " ),\n", + " Document(\n", + " id=\"3\", page_content=\"3 text world\", metadata={\"type\": \"sky\", \"time\": 2010}\n", + " ),\n", + " Document(\n", + " id=\"4\", page_content=\"hi world\", metadata={\"type\": \"sky\", \"time\": 2030}\n", + " ),\n", + " Document(\n", + " id=\"5\", page_content=\"hi world\", metadata={\"type\": \"sky\", \"time\": 2030}\n", + " ),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": "Delete document." + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-20T11:10:17.408739Z", + "start_time": "2024-08-20T11:10:17.269593Z" + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "store.delete([\"3\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": "Get documents." 
+ }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Query vector store" + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-20T11:10:19.379617Z", + "start_time": "2024-08-20T11:10:19.339970Z" + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(id='1', metadata={'embedding': '[1.3296732307905934, 0.0037521341868022385, 0.9821875819319514, 2.5644103644492393]', 'time': 2000, 'type': 'pc'}, page_content='1 hello world'),\n", + " None,\n", + " Document(id='5', metadata={'embedding': '[1.4558082172139821, -1.6441137122167426, -0.13113098640337423, -1.889685473174525]', 'time': 2030, 'type': 'sky'}, page_content='hi world')]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "store.get_by_ids([\"1\", \"3\", \"5\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "Similarity search." + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-20T11:10:21.306717Z", + "start_time": "2024-08-20T11:10:21.284244Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(id='1', metadata={'embedding': [1.3296732307905934, 0.0037521341868022385, 0.9821875819319514, 2.5644103644492393], 'time': 2000, 'type': 'pc'}, page_content='1 hello world'),\n", + " Document(id='4', metadata={'embedding': [-0.3310144199800685, 0.29250046478723635, -0.0646862290377582, -0.23664360156781225], 'time': 2030, 'type': 'sky'}, page_content='hi world')]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "store.similarity_search(query=\"hello world\", k=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "Similarity search with filters. 
" + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-20T11:10:23.231425Z", + "start_time": "2024-08-20T11:10:23.213046Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(id='5', metadata={'embedding': [1.4558082172139821, -1.6441137122167426, -0.13113098640337423, -1.889685473174525], 'time': 2030, 'type': 'sky'}, page_content='hi world'),\n", + " Document(id='4', metadata={'embedding': [-0.3310144199800685, 0.29250046478723635, -0.0646862290377582, -0.23664360156781225], 'time': 2030, 'type': 'sky'}, page_content='hi world')]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "store.similarity_search(\n", + " query=\"hello world\",\n", + " k=10,\n", + " tablestore_filter_query=tablestore.BoolQuery(\n", + " must_queries=[tablestore.TermQuery(field_name=\"type\", column_value=\"sky\")],\n", + " should_queries=[tablestore.RangeQuery(field_name=\"time\", range_from=2020)],\n", + " must_not_queries=[tablestore.TermQuery(field_name=\"type\", column_value=\"pc\")],\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage for retrieval-augmented generation\n", + "\n", + "For guides on how to use this vector store for retrieval-augmented generation (RAG), see the following sections:\n", + "\n", + "- [Tutorials](/docs/tutorials/)\n", + "- [How-to: Question and answer with RAG](https://python.langchain.com/docs/how_to/#qa-with-rag)\n", + "- [Retrieval conceptual docs](https://python.langchain.com/docs/concepts/retrieval)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all `TablestoreVectorStore` features and configurations head to the API reference:\n", + " https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.tablestore.TablestoreVectorStore.html" + ] 
+ } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index c38beea0ed6..b3f1ac6a27c 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -245,6 +245,9 @@ if TYPE_CHECKING: from langchain_community.vectorstores.surrealdb import ( SurrealDBStore, ) + from langchain_community.vectorstores.tablestore import ( + TablestoreVectorStore, + ) from langchain_community.vectorstores.tair import ( Tair, ) @@ -391,6 +394,7 @@ __all__ = [ "StarRocks", "SupabaseVectorStore", "SurrealDBStore", + "TablestoreVectorStore", "Tair", "TencentVectorDB", "TiDBVectorStore", @@ -495,6 +499,7 @@ _module_lookup = { "StarRocks": "langchain_community.vectorstores.starrocks", "SupabaseVectorStore": "langchain_community.vectorstores.supabase", "SurrealDBStore": "langchain_community.vectorstores.surrealdb", + "TablestoreVectorStore": "langchain_community.vectorstores.tablestore", "Tair": "langchain_community.vectorstores.tair", "TencentVectorDB": "langchain_community.vectorstores.tencentvectordb", "TiDBVectorStore": "langchain_community.vectorstores.tidb_vector", diff --git a/libs/community/langchain_community/vectorstores/tablestore.py b/libs/community/langchain_community/vectorstores/tablestore.py new file mode 100644 index 00000000000..a00e102839c --- /dev/null +++ b/libs/community/langchain_community/vectorstores/tablestore.py @@ -0,0 +1,564 @@ +import json +import logging +import uuid +from typing 
import json
import logging
import uuid
from typing import (
    Any,
    Iterable,
    List,
    Optional,
    Sequence,
    Tuple,
)

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

logger = logging.getLogger(__name__)


def _import_tablestore() -> Any:
    """Import and return the optional ``tablestore`` SDK module.

    The SDK is imported lazily so that this module can be imported without
    the optional dependency being installed.

    Raises:
        ImportError: If the ``tablestore`` package is not installed.
    """
    try:
        import tablestore
    except ImportError:
        raise ImportError(
            "Could not import tablestore python package. "
            "Please install it with `pip install tablestore`."
        )
    return tablestore


class TablestoreVectorStore(VectorStore):
    """`Tablestore` vector store.

    To use, you should have the ``tablestore`` python package installed.

    Each document is stored as one row whose primary key is the document id.
    The text is stored in ``text_field``, the embedding is stored as a JSON
    encoded list in ``vector_field`` and every metadata entry is stored as an
    additional attribute column.

    Example:
        .. code-block:: python

            import os

            from langchain_openai import OpenAIEmbeddings
            from langchain_community.vectorstores import TablestoreVectorStore
            import tablestore

            embeddings = OpenAIEmbeddings()
            store = TablestoreVectorStore(
                embeddings,
                endpoint=os.getenv("end_point"),
                instance_name=os.getenv("instance_name"),
                access_key_id=os.getenv("access_key_id"),
                access_key_secret=os.getenv("access_key_secret"),
                vector_dimension=512,
                # metadata mapping is used to filter non-vector fields.
                metadata_mappings=[
                    tablestore.FieldSchema(
                        "type",
                        tablestore.FieldType.KEYWORD,
                        index=True,
                        enable_sort_and_agg=True
                    ),
                    tablestore.FieldSchema(
                        "time",
                        tablestore.FieldType.LONG,
                        index=True,
                        enable_sort_and_agg=True
                    ),
                ]
            )
    """

    def __init__(
        self,
        embedding: Embeddings,
        *,
        endpoint: Optional[str] = None,
        instance_name: Optional[str] = None,
        access_key_id: Optional[str] = None,
        access_key_secret: Optional[str] = None,
        table_name: Optional[str] = "langchain_vector_store_ots_v1",
        index_name: Optional[str] = "langchain_vector_store_ots_index_v1",
        text_field: Optional[str] = "content",
        vector_field: Optional[str] = "embedding",
        vector_dimension: int = 512,
        vector_metric_type: Optional[str] = "cosine",
        metadata_mappings: Optional[List[Any]] = None,
    ):
        """Create the store and its Tablestore client.

        No table or index is created here; call
        ``create_table_if_not_exist`` and ``create_search_index_if_not_exist``.

        Args:
            embedding: Embedding model used for documents and queries.
            endpoint: Tablestore instance endpoint.
            instance_name: Tablestore instance name.
            access_key_id: Access key id used by the client.
            access_key_secret: Access key secret used by the client.
            table_name: Name of the data table.
            index_name: Name of the search index on the data table.
            text_field: Column that stores the document text.
            vector_field: Column that stores the JSON encoded embedding.
            vector_dimension: Dimension declared for the vector field.
            vector_metric_type: One of ``"cosine"``, ``"euclidean"`` or
                ``"dot_product"``.
            metadata_mappings: Extra ``tablestore.FieldSchema`` objects to add
                to the search index. Entries named like ``text_field`` or
                ``vector_field`` are ignored.

        Raises:
            ImportError: If the ``tablestore`` package is not installed.
            ValueError: If ``vector_metric_type`` is unsupported or a metadata
                mapping is not a ``tablestore.FieldSchema``.
        """
        tablestore = _import_tablestore()
        self.__embedding = embedding
        self.__tablestore_client = tablestore.OTSClient(
            endpoint,
            access_key_id,
            access_key_secret,
            instance_name,
            retry_policy=tablestore.WriteRetryPolicy(),
        )
        self.__table_name = table_name
        self.__index_name = index_name
        self.__vector_dimension = vector_dimension
        self.__vector_field = vector_field
        self.__text_field = text_field
        if vector_metric_type == "cosine":
            self.__vector_metric_type = tablestore.VectorMetricType.VM_COSINE
        elif vector_metric_type == "euclidean":
            self.__vector_metric_type = tablestore.VectorMetricType.VM_EUCLIDEAN
        elif vector_metric_type == "dot_product":
            self.__vector_metric_type = tablestore.VectorMetricType.VM_DOT_PRODUCT
        else:
            raise ValueError(
                f"Unsupported vector_metric_type operator: {vector_metric_type}"
            )

        # The text and vector fields are always part of the index schema.
        self.__metadata_mappings = [
            tablestore.FieldSchema(
                self.__text_field,
                tablestore.FieldType.TEXT,
                index=True,
                enable_sort_and_agg=False,
                store=False,
                analyzer=tablestore.AnalyzerType.MAXWORD,
            ),
            tablestore.FieldSchema(
                self.__vector_field,
                tablestore.FieldType.VECTOR,
                vector_options=tablestore.VectorOptions(
                    data_type=tablestore.VectorDataType.VD_FLOAT_32,
                    dimension=self.__vector_dimension,
                    metric_type=self.__vector_metric_type,
                ),
            ),
        ]

        for mapping in metadata_mappings or []:
            if not isinstance(mapping, tablestore.FieldSchema):
                raise ValueError(
                    f"meta_data mapping should be an "
                    f"instance of tablestore.FieldSchema, "
                    f"but got {type(mapping)}"
                )
            # The built-in text/vector schemas above take precedence.
            if mapping.field_name in (text_field, vector_field):
                continue
            self.__metadata_mappings.append(mapping)

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Return the embedding model this store was constructed with."""
        return self.__embedding

    def create_table_if_not_exist(self) -> None:
        """Create the data table unless it is already listed by the client.

        Raises:
            tablestore.OTSClientError: If the request fails on the client side.
            tablestore.OTSServiceError: If the service rejects the request.
        """
        tablestore = _import_tablestore()
        table_list = self.__tablestore_client.list_table()
        if self.__table_name in table_list:
            logger.info("Tablestore system table[%s] already exists", self.__table_name)
            return None
        logger.info(
            "Tablestore system table[%s] does not exist, try to create the table.",
            self.__table_name,
        )

        schema_of_primary_key = [("id", "STRING")]
        table_meta = tablestore.TableMeta(self.__table_name, schema_of_primary_key)
        table_options = tablestore.TableOptions()
        reserved_throughput = tablestore.ReservedThroughput(
            tablestore.CapacityUnit(0, 0)
        )
        try:
            self.__tablestore_client.create_table(
                table_meta, table_options, reserved_throughput
            )
            logger.info("Tablestore create table[%s] successfully.", self.__table_name)
        except tablestore.OTSClientError as e:
            logger.exception(
                "Tablestore create system table[%s] failed with client error, "
                "http_status:%d, error_message:%s",
                self.__table_name,
                e.get_http_status(),
                e.get_error_message(),
            )
            # Propagate: later steps (index creation, writes) need the table.
            raise
        except tablestore.OTSServiceError as e:
            logger.exception(
                "Tablestore create system table[%s] failed with service error, "
                "http_status:%d, error_code:%s, error_message:%s, request_id:%s",
                self.__table_name,
                e.get_http_status(),
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise

    def create_search_index_if_not_exist(self) -> None:
        """Create the search index unless one with the same name exists."""
        tablestore = _import_tablestore()
        search_index_list = self.__tablestore_client.list_search_index(
            table_name=self.__table_name
        )
        # Entries are indexed as (table_name, index_name) pairs.
        if self.__index_name in [t[1] for t in search_index_list]:
            logger.info("Tablestore system index[%s] already exists", self.__index_name)
            return None
        index_meta = tablestore.SearchIndexMeta(self.__metadata_mappings)
        self.__tablestore_client.create_search_index(
            self.__table_name, self.__index_name, index_meta
        )
        logger.info(
            "Tablestore create system index[%s] successfully.", self.__index_name
        )

    def delete_table_if_exists(self) -> None:
        """Delete the data table and all of its search indexes, if present.

        Does nothing when the table is not listed by the client.
        """
        if self.__table_name not in self.__tablestore_client.list_table():
            logger.info(
                "Tablestore table[%s] does not exist, nothing to delete.",
                self.__table_name,
            )
            return None
        search_index_list = self.__tablestore_client.list_search_index(
            table_name=self.__table_name
        )
        # Search indexes are removed before the table itself is deleted.
        for resp_tuple in search_index_list:
            self.__tablestore_client.delete_search_index(resp_tuple[0], resp_tuple[1])
        self.__tablestore_client.delete_table(self.__table_name)

    def delete_search_index(self, table_name: str, index_name: str) -> None:
        """Delete the search index ``index_name`` of table ``table_name``."""
        self.__tablestore_client.delete_search_index(table_name, index_name)

    def __write_row(
        self, row_id: str, content: str, embedding_vector: List[float], meta_data: dict
    ) -> None:
        """Write one document row; log and re-raise SDK errors."""
        tablestore = _import_tablestore()
        primary_key = [("id", row_id)]
        attribute_columns = [
            (self.__text_field, content),
            # The vector is stored as a JSON encoded list.
            (self.__vector_field, json.dumps(embedding_vector)),
        ]
        attribute_columns.extend(meta_data.items())
        row = tablestore.Row(primary_key, attribute_columns)

        try:
            self.__tablestore_client.put_row(self.__table_name, row)
            logger.debug(
                "Tablestore put row successfully. id:%s, content:%s, meta_data:%s",
                row_id,
                content,
                meta_data,
            )
        except tablestore.OTSClientError as e:
            logger.exception(
                "Tablestore put row failed with client error:%s, "
                "id:%s, content:%s, meta_data:%s",
                e,
                row_id,
                content,
                meta_data,
            )
            # Re-raise so callers never report an id that was not stored.
            raise
        except tablestore.OTSServiceError as e:
            logger.exception(
                "Tablestore put row failed with service error:%s, id:%s, content:%s, "
                "meta_data:%s, http_status:%d, "
                "error_code:%s, error_message:%s, request_id:%s",
                e,
                row_id,
                content,
                meta_data,
                e.get_http_status(),
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise

    def __delete_row(self, row_id: str) -> None:
        """Delete one row by id; log and re-raise SDK errors."""
        tablestore = _import_tablestore()
        primary_key = [("id", row_id)]
        try:
            self.__tablestore_client.delete_row(self.__table_name, primary_key, None)
            logger.info("Tablestore delete row successfully. id:%s", row_id)
        except tablestore.OTSClientError as e:
            logger.exception(
                "Tablestore delete row failed with client error:%s, id:%s", e, row_id
            )
            raise
        except tablestore.OTSServiceError as e:
            logger.exception(
                "Tablestore delete row failed with service error:%s, "
                "id:%s, http_status:%d, error_code:%s, error_message:%s, request_id:%s",
                e,
                row_id,
                e.get_http_status(),
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise

    def __get_row(self, row_id: str) -> Optional[Document]:
        """Fetch one row by id and convert it to a ``Document``.

        Returns ``None`` when no row exists for ``row_id``. SDK errors are
        logged and re-raised.
        """
        tablestore = _import_tablestore()
        primary_key = [("id", row_id)]
        try:
            _, row, _ = self.__tablestore_client.get_row(
                self.__table_name, primary_key, None, None, 1
            )
        except tablestore.OTSClientError as e:
            logger.exception(
                "Tablestore get row failed with client error:%s, id:%s", e, row_id
            )
            raise
        except tablestore.OTSServiceError as e:
            logger.exception(
                "Tablestore get row failed with service error:%s, "
                "id:%s, http_status:%d, error_code:%s, error_message:%s, request_id:%s",
                e,
                row_id,
                e.get_http_status(),
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise

        if row is None:
            logger.debug("Can not find row_id:%s in tablestore.", row_id)
            return None
        logger.debug("Tablestore get row successfully. id:%s", row_id)

        document_id = row.primary_key[0][1]
        meta_data = {}
        text = ""
        for col in row.attribute_columns:
            key, val = col[0], col[1]
            if key == self.__text_field:
                text = val
                continue
            # NOTE: the vector column is returned as stored (a JSON string);
            # `_tablestore_search` decodes it into a list instead.
            meta_data[key] = val
        return Document(
            id=document_id,
            page_content=text,
            metadata=meta_data,
        )

    def _tablestore_search(
        self,
        query_embedding: List[float],
        k: int = 5,
        tablestore_filter_query: Optional[Any] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Run a KNN vector query against the search index.

        Args:
            query_embedding: Query vector.
            k: Maximum number of hits to return (the search ``limit``).
            tablestore_filter_query: Optional ``tablestore.Query`` used as the
                filter of the KNN query.
            **kwargs: ``knn_top_k`` overrides the ``top_k`` of the KNN query,
                which otherwise equals ``k``.

        Returns:
            ``(document, score)`` tuples, requested in descending score order.

        Raises:
            ValueError: If ``tablestore_filter_query`` is not a
                ``tablestore.Query``.
            tablestore.OTSClientError: On client side failures.
            tablestore.OTSServiceError: On service side failures.
        """
        tablestore = _import_tablestore()
        if tablestore_filter_query:
            if not isinstance(tablestore_filter_query, tablestore.Query):
                raise ValueError(
                    f"table_store_filter_query should be "
                    f"an instance of tablestore.Query, "
                    f"but got {type(tablestore_filter_query)}"
                )
        knn_top_k = kwargs.get("knn_top_k", k)
        ots_query = tablestore.KnnVectorQuery(
            field_name=self.__vector_field,
            top_k=knn_top_k,
            float32_query_vector=query_embedding,
            filter=tablestore_filter_query,
        )
        sort = tablestore.Sort(
            sorters=[tablestore.ScoreSort(sort_order=tablestore.SortOrder.DESC)]
        )
        search_query = tablestore.SearchQuery(
            ots_query, limit=k, get_total_count=False, sort=sort
        )
        try:
            search_response = self.__tablestore_client.search(
                table_name=self.__table_name,
                index_name=self.__index_name,
                search_query=search_query,
                columns_to_get=tablestore.ColumnsToGet(
                    return_type=tablestore.ColumnReturnType.ALL
                ),
            )
        except tablestore.OTSClientError as e:
            logger.exception("Tablestore search failed with client error:%s", e)
            raise
        except tablestore.OTSServiceError as e:
            logger.exception(
                "Tablestore search failed with service error:%s, "
                "http_status:%d, error_code:%s, error_message:%s, request_id:%s",
                e,
                e.get_http_status(),
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise

        logger.info(
            "Tablestore search successfully. request_id:%s",
            search_response.request_id,
        )
        tuple_list = []
        for hit in search_response.search_hits:
            # hit.row is indexed as (primary_key_columns, attribute_columns).
            row = hit.row
            document_id = row[0][0][1]
            meta_data = {}
            text = ""
            for col in row[1]:
                key, val = col[0], col[1]
                if key == self.__text_field:
                    text = val
                    continue
                if key == self.__vector_field:
                    # Vectors are stored JSON encoded; decode back to a list.
                    val = json.loads(val)
                meta_data[key] = val
            doc = Document(
                id=document_id,
                page_content=text,
                metadata=meta_data,
            )
            tuple_list.append((doc, hit.score))
        return tuple_list

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed ``texts`` and write one row per text.

        Args:
            texts: Texts to store. Any iterable, including a generator.
            metadatas: Optional metadata dicts, one per text.
            ids: Optional row ids, one per text. Random hex UUIDs are
                generated when omitted or empty.

        Returns:
            The ids of the written rows, in input order.

        Raises:
            ValueError: If ``ids`` or ``metadatas`` do not have one entry per
                text.
            tablestore.OTSClientError: If a write fails on the client side.
            tablestore.OTSServiceError: If the service rejects a write.
        """
        # Materialise first: `texts` may be a one-shot iterable and is needed
        # both for id generation and for embedding.
        text_list = list(texts)
        ids = ids or [uuid.uuid4().hex for _ in text_list]
        if len(ids) != len(text_list):
            raise ValueError(
                f"Number of ids ({len(ids)}) does not match "
                f"number of texts ({len(text_list)})."
            )
        if metadatas and len(metadatas) != len(text_list):
            raise ValueError(
                f"Number of metadatas ({len(metadatas)}) does not match "
                f"number of texts ({len(text_list)})."
            )
        if not text_list:
            return ids

        embeddings = self.__embedding.embed_documents(text_list)
        for i, (row_id, text, embedding_vector) in enumerate(
            zip(ids, text_list, embeddings)
        ):
            # A missing or empty metadata entry is stored as no extra columns.
            metadata = (metadatas[i] if metadatas else None) or {}
            self.__write_row(
                row_id=row_id,
                content=text,
                embedding_vector=embedding_vector,
                meta_data=metadata,
            )
        return ids

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete rows by id.

        Every id is attempted even if an earlier deletion fails; failures are
        logged by the row-level helper.

        Args:
            ids: Ids of the rows to delete. Nothing is deleted when ``None``
                or empty.

        Returns:
            ``True`` if every deletion succeeded (or there was nothing to
            delete), ``False`` if at least one deletion failed.
        """
        if not ids:
            return True
        tablestore = _import_tablestore()
        succeeded = True
        for row_id in ids:
            try:
                self.__delete_row(row_id)
            except (tablestore.OTSClientError, tablestore.OTSServiceError):
                succeeded = False
        return succeeded

    def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Return the documents stored under ``ids``.

        Ids without a stored row are skipped, so the result may be shorter
        than ``ids``.
        """
        found = (self.__get_row(row_id) for row_id in ids)
        return [doc for doc in found if doc is not None]

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        tablestore_filter_query: Optional[Any] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return up to ``k`` documents most similar to ``query``.

        See ``similarity_search_with_score`` for the arguments.
        """
        return [
            doc
            for (doc, score) in self.similarity_search_with_score(
                query, k=k, tablestore_filter_query=tablestore_filter_query, **kwargs
            )
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        tablestore_filter_query: Optional[Any] = None,
        *args: Any,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return up to ``k`` ``(document, score)`` pairs.

        Args:
            query: Query text, embedded with the store's embedding model.
            k: Maximum number of results.
            tablestore_filter_query: Optional ``tablestore.Query`` filter.
            **kwargs: Forwarded to the search; ``knn_top_k`` is recognised.
        """
        query_embedding = self.__embedding.embed_query(query)
        return self._tablestore_search(
            query_embedding,
            k=k,
            tablestore_filter_query=tablestore_filter_query,
            **kwargs,
        )

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        tablestore_filter_query: Optional[Any] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return up to ``k`` documents most similar to the given vector."""
        return [
            doc
            for (doc, score) in self._tablestore_search(
                embedding,
                k=k,
                tablestore_filter_query=tablestore_filter_query,
                **kwargs,
            )
        ]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        endpoint: Optional[str] = None,
        instance_name: Optional[str] = None,
        access_key_id: Optional[str] = None,
        access_key_secret: Optional[str] = None,
        table_name: Optional[str] = "langchain_vector_store_ots_v1",
        index_name: Optional[str] = "langchain_vector_store_ots_index_v1",
        text_field: Optional[str] = "content",
        vector_field: Optional[str] = "embedding",
        vector_dimension: int = 512,
        vector_metric_type: Optional[str] = "cosine",
        metadata_mappings: Optional[List[Any]] = None,
        **kwargs: Any,
    ) -> "TablestoreVectorStore":
        """Build a store, create its table and index, and add ``texts``.

        The connection and schema arguments are the same as for the
        constructor. An ``ids`` entry in ``kwargs`` is used as the row ids.

        Returns:
            The populated store.
        """
        store = cls(
            embedding=embedding,
            endpoint=endpoint,
            instance_name=instance_name,
            access_key_id=access_key_id,
            access_key_secret=access_key_secret,
            table_name=table_name,
            index_name=index_name,
            text_field=text_field,
            vector_field=vector_field,
            vector_dimension=vector_dimension,
            vector_metric_type=vector_metric_type,
            metadata_mappings=metadata_mappings,
        )
        store.create_table_if_not_exist()
        store.create_search_index_if_not_exist()
        # Honour ids passed through kwargs instead of silently dropping them.
        store.add_texts(texts, metadatas, ids=kwargs.get("ids"))
        return store
add document + """ + store.add_documents( + [ + Document( + id="1", + page_content="1 hello world", + metadata={"type": "pc", "time": 2000}, + ), + Document( + id="2", page_content="abc world", metadata={"type": "pc", "time": 2009} + ), + Document( + id="3", + page_content="3 text world", + metadata={"type": "sky", "time": 2010}, + ), + Document( + id="4", page_content="hi world", metadata={"type": "sky", "time": 2030} + ), + Document( + id="5", page_content="hi world", metadata={"type": "sky", "time": 2030} + ), + ] + ) + + """ + 4. delete document + """ + assert store.delete(["3"]) + + """ + 5. get document + """ + get_docs = store.get_by_ids(["1", "4"]) + assert len(get_docs) == 2 + assert get_docs[0].id == "1" + assert get_docs[1].id == "4" + + """ + 6. similarity_search + """ + search_result = store.similarity_search_with_score(query="hello world", k=2) + assert len(search_result) == 2 diff --git a/libs/community/tests/unit_tests/vectorstores/test_imports.py b/libs/community/tests/unit_tests/vectorstores/test_imports.py index 5ac0ca72b49..d65d9d41a13 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_imports.py +++ b/libs/community/tests/unit_tests/vectorstores/test_imports.py @@ -84,6 +84,7 @@ EXPECTED_ALL = [ "StarRocks", "SupabaseVectorStore", "SurrealDBStore", + "TablestoreVectorStore", "Tair", "TencentVectorDB", "TiDBVectorStore", diff --git a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py index 041f4172b2d..b7da470ef26 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py +++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py @@ -88,6 +88,7 @@ def test_compatible_vectorstore_documentation() -> None: "SingleStoreDB", "SupabaseVectorStore", "SurrealDBStore", + "TablestoreVectorStore", "TileDB", "TimescaleVector", "TencentVectorDB",