mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-06 03:27:55 +00:00
community[minor]: Add TablestoreVectorStore (#25767)
Thank you for contributing to LangChain! - [x] **PR title**: community: add TablestoreVectorStore - [x] **PR message**: - **Description:** add TablestoreVectorStore - **Dependencies:** none - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration: yes 2. an example notebook showing its use: yes If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
86b3c6e81c
commit
b0a298894d
@ -89,3 +89,11 @@ See [installation instructions and a usage example](/docs/integrations/vectorsto
|
||||
```python
|
||||
from langchain_community.vectorstores import Hologres
|
||||
```
|
||||
|
||||
### Tablestore
|
||||
|
||||
See [installation instructions and a usage example](/docs/integrations/vectorstores/tablestore).
|
||||
|
||||
```python
|
||||
from langchain_community.vectorstores import TablestoreVectorStore
|
||||
```
|
385
docs/docs/integrations/vectorstores/tablestore.ipynb
Normal file
385
docs/docs/integrations/vectorstores/tablestore.ipynb
Normal file
@ -0,0 +1,385 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# TablestoreVectorStore\n",
|
||||
"\n",
|
||||
"> [Tablestore](https://www.aliyun.com/product/ots) is a fully managed NoSQL cloud database service that enables storage of a massive amount of structured\n",
|
||||
"and semi-structured data.\n",
|
||||
"\n",
|
||||
"This notebook shows how to use functionality related to the `Tablestore` vector database.\n",
|
||||
"\n",
|
||||
"To use Tablestore, you must create an instance.\n",
|
||||
"See the [instructions for creating an instance](https://help.aliyun.com/zh/tablestore/getting-started/manage-the-wide-column-model-in-the-tablestore-console)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": "## Setup"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet langchain-community tablestore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": "## Initialization"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-08-20T11:10:04.469458Z",
|
||||
"start_time": "2024-08-20T11:09:49.541150Z"
|
||||
},
|
||||
"pycharm": {
|
||||
"is_executing": true,
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"end_point\"] = getpass.getpass(\"Tablestore end_point:\")\n",
|
||||
"os.environ[\"instance_name\"] = getpass.getpass(\"Tablestore instance_name:\")\n",
|
||||
"os.environ[\"access_key_id\"] = getpass.getpass(\"Tablestore access_key_id:\")\n",
|
||||
"os.environ[\"access_key_secret\"] = getpass.getpass(\"Tablestore access_key_secret:\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": "Create vector store. "
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-08-20T11:10:07.911086Z",
|
||||
"start_time": "2024-08-20T11:10:07.351293Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import tablestore\n",
|
||||
"from langchain_community.embeddings import FakeEmbeddings\n",
|
||||
"from langchain_community.vectorstores import TablestoreVectorStore\n",
|
||||
"from langchain_core.documents import Document\n",
|
||||
"\n",
|
||||
"test_embedding_dimension_size = 4\n",
|
||||
"embeddings = FakeEmbeddings(size=test_embedding_dimension_size)\n",
|
||||
"\n",
|
||||
"store = TablestoreVectorStore(\n",
|
||||
" embedding=embeddings,\n",
|
||||
" endpoint=os.getenv(\"end_point\"),\n",
|
||||
" instance_name=os.getenv(\"instance_name\"),\n",
|
||||
" access_key_id=os.getenv(\"access_key_id\"),\n",
|
||||
" access_key_secret=os.getenv(\"access_key_secret\"),\n",
|
||||
" vector_dimension=test_embedding_dimension_size,\n",
|
||||
" # metadata mapping is used to filter non-vector fields.\n",
|
||||
" metadata_mappings=[\n",
|
||||
" tablestore.FieldSchema(\n",
|
||||
" \"type\", tablestore.FieldType.KEYWORD, index=True, enable_sort_and_agg=True\n",
|
||||
" ),\n",
|
||||
" tablestore.FieldSchema(\n",
|
||||
" \"time\", tablestore.FieldType.LONG, index=True, enable_sort_and_agg=True\n",
|
||||
" ),\n",
|
||||
" ],\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": "## Manage vector store"
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": "Create table and index."
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-08-20T11:10:10.875422Z",
|
||||
"start_time": "2024-08-20T11:10:10.566400Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"store.create_table_if_not_exist()\n",
|
||||
"store.create_search_index_if_not_exist()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": "Add documents."
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-08-20T11:10:14.974253Z",
|
||||
"start_time": "2024-08-20T11:10:14.894190Z"
|
||||
},
|
||||
"pycharm": {
|
||||
"is_executing": true,
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['1', '2', '3', '4', '5']"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"store.add_documents(\n",
|
||||
" [\n",
|
||||
" Document(\n",
|
||||
" id=\"1\", page_content=\"1 hello world\", metadata={\"type\": \"pc\", \"time\": 2000}\n",
|
||||
" ),\n",
|
||||
" Document(\n",
|
||||
" id=\"2\", page_content=\"abc world\", metadata={\"type\": \"pc\", \"time\": 2009}\n",
|
||||
" ),\n",
|
||||
" Document(\n",
|
||||
" id=\"3\", page_content=\"3 text world\", metadata={\"type\": \"sky\", \"time\": 2010}\n",
|
||||
" ),\n",
|
||||
" Document(\n",
|
||||
" id=\"4\", page_content=\"hi world\", metadata={\"type\": \"sky\", \"time\": 2030}\n",
|
||||
" ),\n",
|
||||
" Document(\n",
|
||||
" id=\"5\", page_content=\"hi world\", metadata={\"type\": \"sky\", \"time\": 2030}\n",
|
||||
" ),\n",
|
||||
" ]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": "Delete document."
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-08-20T11:10:17.408739Z",
|
||||
"start_time": "2024-08-20T11:10:17.269593Z"
|
||||
},
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"store.delete([\"3\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": "Get documents."
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": "## Query vector store"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-08-20T11:10:19.379617Z",
|
||||
"start_time": "2024-08-20T11:10:19.339970Z"
|
||||
},
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(id='1', metadata={'embedding': '[1.3296732307905934, 0.0037521341868022385, 0.9821875819319514, 2.5644103644492393]', 'time': 2000, 'type': 'pc'}, page_content='1 hello world'),\n",
|
||||
" None,\n",
|
||||
" Document(id='5', metadata={'embedding': '[1.4558082172139821, -1.6441137122167426, -0.13113098640337423, -1.889685473174525]', 'time': 2030, 'type': 'sky'}, page_content='hi world')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"store.get_by_ids([\"1\", \"3\", \"5\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": "Similarity search."
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-08-20T11:10:21.306717Z",
|
||||
"start_time": "2024-08-20T11:10:21.284244Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(id='1', metadata={'embedding': [1.3296732307905934, 0.0037521341868022385, 0.9821875819319514, 2.5644103644492393], 'time': 2000, 'type': 'pc'}, page_content='1 hello world'),\n",
|
||||
" Document(id='4', metadata={'embedding': [-0.3310144199800685, 0.29250046478723635, -0.0646862290377582, -0.23664360156781225], 'time': 2030, 'type': 'sky'}, page_content='hi world')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"store.similarity_search(query=\"hello world\", k=2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": "Similarity search with filters. "
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-08-20T11:10:23.231425Z",
|
||||
"start_time": "2024-08-20T11:10:23.213046Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(id='5', metadata={'embedding': [1.4558082172139821, -1.6441137122167426, -0.13113098640337423, -1.889685473174525], 'time': 2030, 'type': 'sky'}, page_content='hi world'),\n",
|
||||
" Document(id='4', metadata={'embedding': [-0.3310144199800685, 0.29250046478723635, -0.0646862290377582, -0.23664360156781225], 'time': 2030, 'type': 'sky'}, page_content='hi world')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"store.similarity_search(\n",
|
||||
" query=\"hello world\",\n",
|
||||
" k=10,\n",
|
||||
" tablestore_filter_query=tablestore.BoolQuery(\n",
|
||||
" must_queries=[tablestore.TermQuery(field_name=\"type\", column_value=\"sky\")],\n",
|
||||
" should_queries=[tablestore.RangeQuery(field_name=\"time\", range_from=2020)],\n",
|
||||
" must_not_queries=[tablestore.TermQuery(field_name=\"type\", column_value=\"pc\")],\n",
|
||||
" ),\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Usage for retrieval-augmented generation\n",
|
||||
"\n",
|
||||
"For guides on how to use this vector store for retrieval-augmented generation (RAG), see the following sections:\n",
|
||||
"\n",
|
||||
"- [Tutorials](/docs/tutorials/)\n",
|
||||
"- [How-to: Question and answer with RAG](https://python.langchain.com/docs/how_to/#qa-with-rag)\n",
|
||||
"- [Retrieval conceptual docs](https://python.langchain.com/docs/concepts/retrieval)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all `TablestoreVectorStore` features and configurations head to the API reference:\n",
|
||||
" https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.tablestore.TablestoreVectorStore.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
@ -245,6 +245,9 @@ if TYPE_CHECKING:
|
||||
from langchain_community.vectorstores.surrealdb import (
|
||||
SurrealDBStore,
|
||||
)
|
||||
from langchain_community.vectorstores.tablestore import (
|
||||
TablestoreVectorStore,
|
||||
)
|
||||
from langchain_community.vectorstores.tair import (
|
||||
Tair,
|
||||
)
|
||||
@ -391,6 +394,7 @@ __all__ = [
|
||||
"StarRocks",
|
||||
"SupabaseVectorStore",
|
||||
"SurrealDBStore",
|
||||
"TablestoreVectorStore",
|
||||
"Tair",
|
||||
"TencentVectorDB",
|
||||
"TiDBVectorStore",
|
||||
@ -495,6 +499,7 @@ _module_lookup = {
|
||||
"StarRocks": "langchain_community.vectorstores.starrocks",
|
||||
"SupabaseVectorStore": "langchain_community.vectorstores.supabase",
|
||||
"SurrealDBStore": "langchain_community.vectorstores.surrealdb",
|
||||
"TablestoreVectorStore": "langchain_community.vectorstores.tablestore",
|
||||
"Tair": "langchain_community.vectorstores.tair",
|
||||
"TencentVectorDB": "langchain_community.vectorstores.tencentvectordb",
|
||||
"TiDBVectorStore": "langchain_community.vectorstores.tidb_vector",
|
||||
|
564
libs/community/langchain_community/vectorstores/tablestore.py
Normal file
564
libs/community/langchain_community/vectorstores/tablestore.py
Normal file
@ -0,0 +1,564 @@
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from typing import (
|
||||
Any,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
)
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TablestoreVectorStore(VectorStore):
    """`Tablestore` vector store.

    To use, you should have the ``tablestore`` python package installed.

    Each document is stored as one row of a Tablestore table whose primary key
    is a string column named ``id``. The page content is written to
    ``text_field``, the embedding is written to ``vector_field`` as a JSON
    encoded list of floats, and every metadata entry becomes an additional
    attribute column. Similarity search runs a KNN vector query against a
    search index built over those columns.

    Example:
        .. code-block:: python

            import os

            from langchain_openai import OpenAIEmbeddings
            from langchain_community.vectorstores import TablestoreVectorStore
            import tablestore

            embeddings = OpenAIEmbeddings()
            store = TablestoreVectorStore(
                embeddings,
                endpoint=os.getenv("end_point"),
                instance_name=os.getenv("instance_name"),
                access_key_id=os.getenv("access_key_id"),
                access_key_secret=os.getenv("access_key_secret"),
                # must match the output size of the embedding model.
                vector_dimension=1536,
                # metadata mapping is used to filter non-vector fields.
                metadata_mappings=[
                    tablestore.FieldSchema(
                        "type",
                        tablestore.FieldType.KEYWORD,
                        index=True,
                        enable_sort_and_agg=True
                    ),
                    tablestore.FieldSchema(
                        "time",
                        tablestore.FieldType.LONG,
                        index=True,
                        enable_sort_and_agg=True
                    ),
                ]
            )
    """

    def __init__(
        self,
        embedding: Embeddings,
        *,
        endpoint: Optional[str] = None,
        instance_name: Optional[str] = None,
        access_key_id: Optional[str] = None,
        access_key_secret: Optional[str] = None,
        table_name: Optional[str] = "langchain_vector_store_ots_v1",
        index_name: Optional[str] = "langchain_vector_store_ots_index_v1",
        text_field: Optional[str] = "content",
        vector_field: Optional[str] = "embedding",
        vector_dimension: int = 512,
        vector_metric_type: Optional[str] = "cosine",
        metadata_mappings: Optional[List[Any]] = None,
    ):
        """Create a client for the given instance; no table or index is created.

        Args:
            embedding: Embedding model used for documents and queries.
            endpoint: Tablestore endpoint passed to ``tablestore.OTSClient``.
            instance_name: Tablestore instance name.
            access_key_id: Access key id used to authenticate.
            access_key_secret: Access key secret used to authenticate.
            table_name: Name of the table holding the documents.
            index_name: Name of the search index used for queries.
            text_field: Column that stores the page content.
            vector_field: Column that stores the JSON-encoded embedding.
            vector_dimension: Dimension declared for the vector index field.
            vector_metric_type: One of ``"cosine"``, ``"euclidean"`` or
                ``"dot_product"``.
            metadata_mappings: Extra ``tablestore.FieldSchema`` entries to add
                to the search index so metadata can be used in filters.
                Entries named like ``text_field`` or ``vector_field`` are
                ignored because those fields are always indexed.

        Raises:
            ImportError: If the ``tablestore`` package is not installed.
            ValueError: If ``vector_metric_type`` is unsupported or a metadata
                mapping is not a ``tablestore.FieldSchema``.
        """
        tablestore = self._import_tablestore()
        self.__embedding = embedding
        self.__tablestore_client = tablestore.OTSClient(
            endpoint,
            access_key_id,
            access_key_secret,
            instance_name,
            retry_policy=tablestore.WriteRetryPolicy(),
        )
        self.__table_name = table_name
        self.__index_name = index_name
        self.__vector_dimension = vector_dimension
        self.__vector_field = vector_field
        self.__text_field = text_field
        if vector_metric_type == "cosine":
            self.__vector_metric_type = tablestore.VectorMetricType.VM_COSINE
        elif vector_metric_type == "euclidean":
            self.__vector_metric_type = tablestore.VectorMetricType.VM_EUCLIDEAN
        elif vector_metric_type == "dot_product":
            self.__vector_metric_type = tablestore.VectorMetricType.VM_DOT_PRODUCT
        else:
            raise ValueError(
                f"Unsupported vector_metric_type operator: {vector_metric_type}"
            )

        # The text and vector fields are always part of the search index.
        self.__metadata_mappings = [
            tablestore.FieldSchema(
                self.__text_field,
                tablestore.FieldType.TEXT,
                index=True,
                enable_sort_and_agg=False,
                store=False,
                analyzer=tablestore.AnalyzerType.MAXWORD,
            ),
            tablestore.FieldSchema(
                self.__vector_field,
                tablestore.FieldType.VECTOR,
                vector_options=tablestore.VectorOptions(
                    data_type=tablestore.VectorDataType.VD_FLOAT_32,
                    dimension=self.__vector_dimension,
                    metric_type=self.__vector_metric_type,
                ),
            ),
        ]

        if metadata_mappings:
            for mapping in metadata_mappings:
                if not isinstance(mapping, tablestore.FieldSchema):
                    raise ValueError(
                        f"meta_data mapping should be an "
                        f"instance of tablestore.FieldSchema, "
                        f"but got {type(mapping)}"
                    )
                # The built-in schemas above win over user supplied duplicates.
                if (
                    mapping.field_name == text_field
                    or mapping.field_name == vector_field
                ):
                    continue
                self.__metadata_mappings.append(mapping)

    @staticmethod
    def _import_tablestore() -> Any:
        """Import and return the ``tablestore`` module.

        Raises:
            ImportError: With installation instructions when the package is
                missing.
        """
        try:
            import tablestore
        except ImportError as e:
            raise ImportError(
                "Could not import tablestore python package. "
                "Please install it with `pip install tablestore`."
            ) from e
        return tablestore

    def __build_document(
        self,
        document_id: Any,
        attribute_columns: Iterable[Any],
        parse_vector: bool,
    ) -> Document:
        """Convert a row's primary key value and attribute columns to a Document.

        The text column becomes ``page_content``; every other column is copied
        into the metadata. When ``parse_vector`` is true the vector column is
        JSON-decoded, otherwise it is kept as stored.
        """
        meta_data = {}
        text = ""
        for col in attribute_columns:
            key = col[0]
            val = col[1]
            if key == self.__text_field:
                text = val
                continue
            if parse_vector and key == self.__vector_field:
                val = json.loads(val)
            meta_data[key] = val
        return Document(
            id=document_id,
            page_content=text,
            metadata=meta_data,
        )

    def create_table_if_not_exist(self) -> None:
        """Create the table if it does not exist.

        Errors reported by Tablestore while creating the table are logged and
        not re-raised.
        """
        tablestore = self._import_tablestore()
        table_list = self.__tablestore_client.list_table()
        if self.__table_name in table_list:
            logger.info("Tablestore system table[%s] already exists", self.__table_name)
            return None
        logger.info(
            "Tablestore system table[%s] does not exist, try to create the table.",
            self.__table_name,
        )

        schema_of_primary_key = [("id", "STRING")]
        table_meta = tablestore.TableMeta(self.__table_name, schema_of_primary_key)
        table_options = tablestore.TableOptions()
        reserved_throughput = tablestore.ReservedThroughput(
            tablestore.CapacityUnit(0, 0)
        )
        try:
            self.__tablestore_client.create_table(
                table_meta, table_options, reserved_throughput
            )
            logger.info("Tablestore create table[%s] successfully.", self.__table_name)
        except tablestore.OTSClientError as e:
            logger.exception(
                "Tablestore create system table[%s] failed with client error, "
                "http_status:%d, error_message:%s",
                self.__table_name,
                e.get_http_status(),
                e.get_error_message(),
            )
        except tablestore.OTSServiceError as e:
            logger.exception(
                "Tablestore create system table[%s] failed with service error, "
                "http_status:%d, error_code:%s, error_message:%s, request_id:%s",
                self.__table_name,
                e.get_http_status(),
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )

    def create_search_index_if_not_exist(self) -> None:
        """Create the search index if it does not exist."""
        tablestore = self._import_tablestore()
        search_index_list = self.__tablestore_client.list_search_index(
            table_name=self.__table_name
        )
        # Entries are indexed as (table_name, index_name).
        if self.__index_name in [t[1] for t in search_index_list]:
            logger.info("Tablestore system index[%s] already exists", self.__index_name)
            return None
        index_meta = tablestore.SearchIndexMeta(self.__metadata_mappings)
        self.__tablestore_client.create_search_index(
            self.__table_name, self.__index_name, index_meta
        )
        logger.info(
            "Tablestore create system index[%s] successfully.", self.__index_name
        )

    def delete_table_if_exists(self) -> None:
        """Delete every search index of the table, then the table itself."""
        search_index_list = self.__tablestore_client.list_search_index(
            table_name=self.__table_name
        )
        for resp_tuple in search_index_list:
            self.__tablestore_client.delete_search_index(resp_tuple[0], resp_tuple[1])
        self.__tablestore_client.delete_table(self.__table_name)

    def delete_search_index(self, table_name: str, index_name: str) -> None:
        """Delete the search index ``index_name`` of table ``table_name``."""
        self.__tablestore_client.delete_search_index(table_name, index_name)

    def __write_row(
        self, row_id: str, content: str, embedding_vector: List[float], meta_data: dict
    ) -> None:
        """Put one document row.

        Errors reported by Tablestore are logged and not re-raised, so callers
        are not told about failed writes.
        """
        tablestore = self._import_tablestore()
        primary_key = [("id", row_id)]
        attribute_columns = [
            (self.__text_field, content),
            # The vector is stored as a JSON string.
            (self.__vector_field, json.dumps(embedding_vector)),
        ]
        attribute_columns.extend(meta_data.items())
        row = tablestore.Row(primary_key, attribute_columns)

        try:
            self.__tablestore_client.put_row(self.__table_name, row)
            logger.debug(
                "Tablestore put row successfully. id:%s, content:%s, meta_data:%s",
                row_id,
                content,
                meta_data,
            )
        except tablestore.OTSClientError as e:
            logger.exception(
                "Tablestore put row failed with client error:%s, "
                "id:%s, content:%s, meta_data:%s",
                e,
                row_id,
                content,
                meta_data,
            )
        except tablestore.OTSServiceError as e:
            logger.exception(
                "Tablestore put row failed with service error:%s, id:%s, content:%s, "
                "meta_data:%s, http_status:%d, "
                "error_code:%s, error_message:%s, request_id:%s",
                e,
                row_id,
                content,
                meta_data,
                e.get_http_status(),
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )

    def __delete_row(self, row_id: str) -> bool:
        """Delete one row; return False (after logging) if Tablestore failed."""
        tablestore = self._import_tablestore()
        primary_key = [("id", row_id)]
        try:
            self.__tablestore_client.delete_row(self.__table_name, primary_key, None)
            logger.info("Tablestore delete row successfully. id:%s", row_id)
            return True
        except tablestore.OTSClientError as e:
            logger.exception(
                "Tablestore delete row failed with client error:%s, id:%s", e, row_id
            )
        except tablestore.OTSServiceError as e:
            logger.exception(
                "Tablestore delete row failed with service error:%s, "
                "id:%s, http_status:%d, error_code:%s, error_message:%s, request_id:%s",
                e,
                row_id,
                e.get_http_status(),
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
        return False

    def __get_row(self, row_id: str) -> Optional[Document]:
        """Fetch one row by id; return None when the row does not exist.

        The vector column is returned in the metadata as stored (a JSON
        string). Tablestore errors are logged and re-raised.
        """
        tablestore = self._import_tablestore()
        primary_key = [("id", row_id)]
        try:
            _, row, _ = self.__tablestore_client.get_row(
                self.__table_name, primary_key, None, None, 1
            )
        except tablestore.OTSClientError as e:
            logger.exception(
                "Tablestore get row failed with client error:%s, id:%s", e, row_id
            )
            raise
        except tablestore.OTSServiceError as e:
            logger.exception(
                "Tablestore get row failed with service error:%s, "
                "id:%s, http_status:%d, error_code:%s, error_message:%s, request_id:%s",
                e,
                row_id,
                e.get_http_status(),
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise
        logger.debug("Tablestore get row successfully. id:%s", row_id)
        if row is None:
            logger.debug("Can not find row_id:%s in tablestore.", row_id)
            return None
        return self.__build_document(
            row.primary_key[0][1], row.attribute_columns, parse_vector=False
        )

    def _tablestore_search(
        self,
        query_embedding: List[float],
        k: int = 5,
        tablestore_filter_query: Optional[Any] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Run a KNN vector query and return ``(document, score)`` pairs.

        Args:
            query_embedding: Query vector.
            k: Maximum number of hits to return.
            tablestore_filter_query: Optional ``tablestore.Query`` applied as
                the filter of the KNN query.
            **kwargs: ``knn_top_k`` overrides the ``top_k`` of the KNN query
                (defaults to ``k``).

        Raises:
            ValueError: If the filter is not a ``tablestore.Query``.
        """
        tablestore = self._import_tablestore()
        if tablestore_filter_query:
            if not isinstance(tablestore_filter_query, tablestore.Query):
                raise ValueError(
                    f"table_store_filter_query should be "
                    f"an instance of tablestore.Query, "
                    f"but got {type(tablestore_filter_query)}"
                )
        knn_top_k = kwargs.get("knn_top_k", k)
        ots_query = tablestore.KnnVectorQuery(
            field_name=self.__vector_field,
            top_k=knn_top_k,
            float32_query_vector=query_embedding,
            filter=tablestore_filter_query,
        )
        sort = tablestore.Sort(
            sorters=[tablestore.ScoreSort(sort_order=tablestore.SortOrder.DESC)]
        )
        search_query = tablestore.SearchQuery(
            ots_query, limit=k, get_total_count=False, sort=sort
        )
        try:
            search_response = self.__tablestore_client.search(
                table_name=self.__table_name,
                index_name=self.__index_name,
                search_query=search_query,
                columns_to_get=tablestore.ColumnsToGet(
                    return_type=tablestore.ColumnReturnType.ALL
                ),
            )
        except tablestore.OTSClientError as e:
            logger.exception("Tablestore search failed with client error:%s", e)
            raise
        except tablestore.OTSServiceError as e:
            logger.exception(
                "Tablestore search failed with service error:%s, "
                "http_status:%d, error_code:%s, error_message:%s, request_id:%s",
                e,
                e.get_http_status(),
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise
        logger.info(
            "Tablestore search successfully. request_id:%s",
            search_response.request_id,
        )
        tuple_list = []
        for hit in search_response.search_hits:
            row = hit.row
            # row[0] holds the primary key columns, row[1] the attribute columns.
            doc = self.__build_document(row[0][0][1], row[1], parse_vector=True)
            tuple_list.append((doc, hit.score))
        return tuple_list

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed ``texts`` and write one row per text.

        Args:
            texts: Texts to store; any iterable, including a generator.
            metadatas: Optional metadata dicts, one per text.
            ids: Optional row ids, one per text. Random hex UUIDs are
                generated when omitted or empty.

        Returns:
            The ids used for the rows. Write failures are logged by the row
            writer and are not reflected in this list.

        Raises:
            ValueError: If ``ids`` does not match ``texts`` in length or
                ``metadatas`` has fewer entries than ``texts``.
        """
        # Materialize first: ``texts`` may be a one-shot iterator, and it is
        # needed both for id generation and for embedding.
        text_list = list(texts)
        ids = ids or [uuid.uuid4().hex for _ in text_list]
        if len(ids) != len(text_list):
            raise ValueError(
                f"Number of ids ({len(ids)}) does not match "
                f"number of texts ({len(text_list)})"
            )
        if metadatas and len(metadatas) < len(text_list):
            raise ValueError(
                f"Number of metadatas ({len(metadatas)}) is less than "
                f"number of texts ({len(text_list)})"
            )
        embeddings = self.__embedding.embed_documents(text_list)
        for i in range(len(ids)):
            metadata = dict()
            if metadatas and metadatas[i]:
                metadata = metadatas[i]
            self.__write_row(
                row_id=ids[i],
                content=text_list[i],
                embedding_vector=embeddings[i],
                meta_data=metadata,
            )
        return ids

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete rows by id.

        Every id is attempted even if an earlier one fails.

        Returns:
            True if all deletes succeeded (or no ids were given), False if at
            least one delete failed.
        """
        if not ids:
            return True
        results = [self.__delete_row(row_id) for row_id in ids]
        return all(results)

    def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Get documents by id, skipping ids that do not exist."""
        found = (self.__get_row(row_id) for row_id in ids)
        return [doc for doc in found if doc is not None]

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        tablestore_filter_query: Optional[Any] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the documents most similar to ``query``."""
        return [
            doc
            for (doc, score) in self.similarity_search_with_score(
                query, k=k, tablestore_filter_query=tablestore_filter_query, **kwargs
            )
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        tablestore_filter_query: Optional[Any] = None,
        *args: Any,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return ``(document, score)`` pairs most similar to ``query``."""
        query_embedding = self.__embedding.embed_query(query)
        return self._tablestore_search(
            query_embedding,
            k=k,
            tablestore_filter_query=tablestore_filter_query,
            **kwargs,
        )

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        tablestore_filter_query: Optional[Any] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the documents most similar to the given embedding vector."""
        return [
            doc
            for (doc, score) in self._tablestore_search(
                embedding,
                k=k,
                tablestore_filter_query=tablestore_filter_query,
                **kwargs,
            )
        ]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        endpoint: Optional[str] = None,
        instance_name: Optional[str] = None,
        access_key_id: Optional[str] = None,
        access_key_secret: Optional[str] = None,
        table_name: Optional[str] = "langchain_vector_store_ots_v1",
        index_name: Optional[str] = "langchain_vector_store_ots_index_v1",
        text_field: Optional[str] = "content",
        vector_field: Optional[str] = "embedding",
        vector_dimension: int = 512,
        vector_metric_type: Optional[str] = "cosine",
        metadata_mappings: Optional[List[Any]] = None,
        **kwargs: Any,
    ) -> "TablestoreVectorStore":
        """Create a store, ensure the table and index exist, and add ``texts``.

        An ``ids`` entry in ``kwargs`` is used as the row ids; the remaining
        arguments are passed to the constructor.
        """
        store = cls(
            embedding=embedding,
            endpoint=endpoint,
            instance_name=instance_name,
            access_key_id=access_key_id,
            access_key_secret=access_key_secret,
            table_name=table_name,
            index_name=index_name,
            text_field=text_field,
            vector_field=vector_field,
            vector_dimension=vector_dimension,
            vector_metric_type=vector_metric_type,
            metadata_mappings=metadata_mappings,
        )
        store.create_table_if_not_exist()
        store.create_search_index_if_not_exist()
        store.add_texts(texts, metadatas, ids=kwargs.get("ids"))
        return store
|
@ -0,0 +1,93 @@
|
||||
"""Test tablestore functionality."""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.embeddings import FakeEmbeddings
|
||||
from langchain_community.vectorstores.tablestore import TablestoreVectorStore
|
||||
|
||||
|
||||
def test_tablestore() -> None:
    """Test end to end construction and search."""
    dimension = 4
    fake_embeddings = FakeEmbeddings(size=dimension)

    # Connection settings come from the environment; skip when any is unset.
    end_point = os.getenv("end_point")
    instance_name = os.getenv("instance_name")
    access_key_id = os.getenv("access_key_id")
    access_key_secret = os.getenv("access_key_secret")
    credentials = (end_point, instance_name, access_key_id, access_key_secret)
    if any(value is None for value in credentials):
        pytest.skip(
            "end_point is None or instance_name is None or "
            "access_key_id is None or access_key_secret is None"
        )

    # 1. create vector store
    store = TablestoreVectorStore(
        embedding=fake_embeddings,
        endpoint=end_point,
        instance_name=instance_name,
        access_key_id=access_key_id,
        access_key_secret=access_key_secret,
        vector_dimension=dimension,
    )

    # 2. create table and index. (only needs to be run once)
    store.create_table_if_not_exist()
    store.create_search_index_if_not_exist()

    # 3. add document
    documents = [
        Document(
            id="1",
            page_content="1 hello world",
            metadata={"type": "pc", "time": 2000},
        ),
        Document(
            id="2",
            page_content="abc world",
            metadata={"type": "pc", "time": 2009},
        ),
        Document(
            id="3",
            page_content="3 text world",
            metadata={"type": "sky", "time": 2010},
        ),
        Document(
            id="4",
            page_content="hi world",
            metadata={"type": "sky", "time": 2030},
        ),
        Document(
            id="5",
            page_content="hi world",
            metadata={"type": "sky", "time": 2030},
        ),
    ]
    store.add_documents(documents)

    # 4. delete document
    assert store.delete(["3"])

    # 5. get document
    fetched = store.get_by_ids(["1", "4"])
    assert len(fetched) == 2
    assert fetched[0].id == "1"
    assert fetched[1].id == "4"

    # 6. similarity_search
    hits = store.similarity_search_with_score(query="hello world", k=2)
    assert len(hits) == 2
|
@ -84,6 +84,7 @@ EXPECTED_ALL = [
|
||||
"StarRocks",
|
||||
"SupabaseVectorStore",
|
||||
"SurrealDBStore",
|
||||
"TablestoreVectorStore",
|
||||
"Tair",
|
||||
"TencentVectorDB",
|
||||
"TiDBVectorStore",
|
||||
|
@ -88,6 +88,7 @@ def test_compatible_vectorstore_documentation() -> None:
|
||||
"SingleStoreDB",
|
||||
"SupabaseVectorStore",
|
||||
"SurrealDBStore",
|
||||
"TablestoreVectorStore",
|
||||
"TileDB",
|
||||
"TimescaleVector",
|
||||
"TencentVectorDB",
|
||||
|
Loading…
Reference in New Issue
Block a user