mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-10 06:55:09 +00:00
Update Hologres vector store: use hologres-vector (#13767)
Hi, I made some code changes to the Hologres vector store to improve data insertion performance. This version of the code also uses the `hologres-vector` library, which is more convenient for us to update and more efficient in performance. The code has passed the format/lint/spell checks, and I have run the Hologres unit test against my own database. Please check this PR again and tell me if anything needs to change. Best, Changgeng, Developer @ Alibaba Cloud Co-authored-by: Changgeng Zhao <zhaochanggeng.zcg@alibaba-inc.com> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
0de7cf898d
commit
9b59bde93d
@ -11,7 +11,7 @@
|
||||
Click [here](https://www.alibabacloud.com/zh/product/hologres) to fast deploy a Hologres cloud instance.
|
||||
|
||||
```bash
|
||||
pip install psycopg2
|
||||
pip install hologres-vector
|
||||
```
|
||||
|
||||
## Vector Store
|
||||
|
@ -22,7 +22,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install psycopg2"
|
||||
"!pip install hologres-vector"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
|
||||
@ -15,104 +14,6 @@ ADA_TOKEN_COUNT = 1536
|
||||
_LANGCHAIN_DEFAULT_TABLE_NAME = "langchain_pg_embedding"
|
||||
|
||||
|
||||
class HologresWrapper:
|
||||
"""`Hologres API` wrapper."""
|
||||
|
||||
def __init__(self, connection_string: str, ndims: int, table_name: str) -> None:
|
||||
"""Initialize the wrapper.
|
||||
|
||||
Args:
|
||||
connection_string: Hologres connection string.
|
||||
ndims: Number of dimensions of the embedding output.
|
||||
table_name: Name of the table to store embeddings and data.
|
||||
"""
|
||||
|
||||
import psycopg2
|
||||
|
||||
self.table_name = table_name
|
||||
self.conn = psycopg2.connect(connection_string)
|
||||
self.cursor = self.conn.cursor()
|
||||
self.conn.autocommit = False
|
||||
self.ndims = ndims
|
||||
|
||||
def create_vector_extension(self) -> None:
|
||||
self.cursor.execute("create extension if not exists proxima")
|
||||
self.conn.commit()
|
||||
|
||||
def create_table(self, drop_if_exist: bool = True) -> None:
|
||||
if drop_if_exist:
|
||||
self.cursor.execute(f"drop table if exists {self.table_name}")
|
||||
self.conn.commit()
|
||||
|
||||
self.cursor.execute(
|
||||
f"""create table if not exists {self.table_name} (
|
||||
id text,
|
||||
embedding float4[] check(array_ndims(embedding) = 1 and \
|
||||
array_length(embedding, 1) = {self.ndims}),
|
||||
metadata json,
|
||||
document text);"""
|
||||
)
|
||||
self.cursor.execute(
|
||||
f"call set_table_property('{self.table_name}'"
|
||||
+ """, 'proxima_vectors',
|
||||
'{"embedding":{"algorithm":"Graph",
|
||||
"distance_method":"SquaredEuclidean",
|
||||
"build_params":{"min_flush_proxima_row_count" : 1,
|
||||
"min_compaction_proxima_row_count" : 1,
|
||||
"max_total_size_to_merge_mb" : 2000}}}');"""
|
||||
)
|
||||
self.conn.commit()
|
||||
|
||||
def get_by_id(self, id: str) -> List[Tuple]:
|
||||
statement = (
|
||||
f"select id, embedding, metadata, "
|
||||
f"document from {self.table_name} where id = %s;"
|
||||
)
|
||||
self.cursor.execute(
|
||||
statement,
|
||||
(id),
|
||||
)
|
||||
self.conn.commit()
|
||||
return self.cursor.fetchall()
|
||||
|
||||
def insert(
|
||||
self,
|
||||
embedding: List[float],
|
||||
metadata: dict,
|
||||
document: str,
|
||||
id: Optional[str] = None,
|
||||
) -> None:
|
||||
self.cursor.execute(
|
||||
f'insert into "{self.table_name}" '
|
||||
f"values (%s, array{json.dumps(embedding)}::float4[], %s, %s)",
|
||||
(id if id is not None else "null", json.dumps(metadata), document),
|
||||
)
|
||||
self.conn.commit()
|
||||
|
||||
def query_nearest_neighbours(
|
||||
self, embedding: List[float], k: int, filter: Optional[Dict[str, str]] = None
|
||||
) -> List[Tuple[str, str, float]]:
|
||||
params = []
|
||||
filter_clause = ""
|
||||
if filter is not None:
|
||||
conjuncts = []
|
||||
for key, val in filter.items():
|
||||
conjuncts.append("metadata->>%s=%s")
|
||||
params.append(key)
|
||||
params.append(val)
|
||||
filter_clause = "where " + " and ".join(conjuncts)
|
||||
|
||||
sql = (
|
||||
f"select document, metadata::text, "
|
||||
f"pm_approx_squared_euclidean_distance(array{json.dumps(embedding)}"
|
||||
f"::float4[], embedding) as distance from"
|
||||
f" {self.table_name} {filter_clause} order by distance asc limit {k};"
|
||||
)
|
||||
self.cursor.execute(sql, tuple(params))
|
||||
self.conn.commit()
|
||||
return self.cursor.fetchall()
|
||||
|
||||
|
||||
class Hologres(VectorStore):
|
||||
"""`Hologres API` vector store.
|
||||
|
||||
@ -152,26 +53,20 @@ class Hologres(VectorStore):
|
||||
"""
|
||||
Initialize the store.
|
||||
"""
|
||||
self.storage = HologresWrapper(
|
||||
self.connection_string, self.ndims, self.table_name
|
||||
from hologres_vector import HologresVector
|
||||
|
||||
self.storage = HologresVector(
|
||||
self.connection_string,
|
||||
ndims=self.ndims,
|
||||
table_name=self.table_name,
|
||||
table_schema={"document": "text"},
|
||||
pre_delete_table=self.pre_delete_table,
|
||||
)
|
||||
self.create_vector_extension()
|
||||
self.create_table()
|
||||
|
||||
@property
|
||||
def embeddings(self) -> Embeddings:
|
||||
return self.embedding_function
|
||||
|
||||
def create_vector_extension(self) -> None:
|
||||
try:
|
||||
self.storage.create_vector_extension()
|
||||
except Exception as e:
|
||||
self.logger.exception(e)
|
||||
raise e
|
||||
|
||||
def create_table(self) -> None:
|
||||
self.storage.create_table(self.pre_delete_table)
|
||||
|
||||
@classmethod
|
||||
def __from(
|
||||
cls,
|
||||
@ -224,11 +119,10 @@ class Hologres(VectorStore):
|
||||
kwargs: vectorstore specific parameters
|
||||
"""
|
||||
try:
|
||||
for text, metadata, embedding, id in zip(texts, metadatas, embeddings, ids):
|
||||
self.storage.insert(embedding, metadata, text, id)
|
||||
schema_datas = [{"document": t} for t in texts]
|
||||
self.storage.upsert_vectors(embeddings, ids, metadatas, schema_datas)
|
||||
except Exception as e:
|
||||
self.logger.exception(e)
|
||||
self.storage.conn.commit()
|
||||
|
||||
def add_texts(
|
||||
self,
|
||||
@ -333,17 +227,17 @@ class Hologres(VectorStore):
|
||||
k: int = 4,
|
||||
filter: Optional[dict] = None,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
results: List[Tuple[str, str, float]] = self.storage.query_nearest_neighbours(
|
||||
embedding, k, filter
|
||||
results: List[dict[str, Any]] = self.storage.search(
|
||||
embedding, k=k, select_columns=["document"], metadata_filters=filter
|
||||
)
|
||||
|
||||
docs = [
|
||||
(
|
||||
Document(
|
||||
page_content=result[0],
|
||||
metadata=json.loads(result[1]),
|
||||
page_content=result["document"],
|
||||
metadata=result["metadata"],
|
||||
),
|
||||
result[2],
|
||||
result["distance"],
|
||||
)
|
||||
for result in results
|
||||
]
|
||||
@ -363,9 +257,11 @@ class Hologres(VectorStore):
|
||||
) -> Hologres:
|
||||
"""
|
||||
Return VectorStore initialized from texts and embeddings.
|
||||
Postgres connection string is required
|
||||
Hologres connection string is required
|
||||
"Either pass it as a parameter
|
||||
or set the HOLOGRES_CONNECTION_STRING environment variable.
|
||||
Create the connection string by calling
|
||||
HologresVector.connection_string_from_db_params
|
||||
"""
|
||||
embeddings = embedding.embed_documents(list(texts))
|
||||
|
||||
@ -397,9 +293,11 @@ class Hologres(VectorStore):
|
||||
generated embeddings.
|
||||
|
||||
Return VectorStore initialized from documents and embeddings.
|
||||
Postgres connection string is required
|
||||
Hologres connection string is required
|
||||
"Either pass it as a parameter
|
||||
or set the HOLOGRES_CONNECTION_STRING environment variable.
|
||||
Create the connection string by calling
|
||||
HologresVector.connection_string_from_db_params
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
@ -463,9 +361,11 @@ class Hologres(VectorStore):
|
||||
|
||||
if not connection_string:
|
||||
raise ValueError(
|
||||
"Postgres connection string is required"
|
||||
"Hologres connection string is required"
|
||||
"Either pass it as a parameter"
|
||||
"or set the HOLOGRES_CONNECTION_STRING environment variable."
|
||||
"Create the connection string by calling"
|
||||
"HologresVector.connection_string_from_db_params"
|
||||
)
|
||||
|
||||
return connection_string
|
||||
@ -483,9 +383,11 @@ class Hologres(VectorStore):
|
||||
) -> Hologres:
|
||||
"""
|
||||
Return VectorStore initialized from documents and embeddings.
|
||||
Postgres connection string is required
|
||||
Hologres connection string is required
|
||||
"Either pass it as a parameter
|
||||
or set the HOLOGRES_CONNECTION_STRING environment variable.
|
||||
Create the connection string by calling
|
||||
HologresVector.connection_string_from_db_params
|
||||
"""
|
||||
|
||||
texts = [d.page_content for d in documents]
|
||||
|
43
libs/langchain/poetry.lock
generated
43
libs/langchain/poetry.lock
generated
@ -3281,6 +3281,22 @@ files = [
|
||||
[package.dependencies]
|
||||
numpy = "*"
|
||||
|
||||
[[package]]
|
||||
name = "hologres-vector"
|
||||
version = "0.0.6"
|
||||
description = ""
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "hologres_vector-0.0.6-py3-none-any.whl", hash = "sha256:c506eaafd9ae8c529955605fae71856e95191a64dde144d0a25b06536e6544a4"},
|
||||
{file = "hologres_vector-0.0.6.tar.gz", hash = "sha256:13251b74bcb9ef2af61cc39c6f155e16452e03891c2f0a07f708f0157baf7b08"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
psycopg2-binary = "*"
|
||||
typing = "*"
|
||||
uuid = "*"
|
||||
|
||||
[[package]]
|
||||
name = "hpack"
|
||||
version = "4.0.0"
|
||||
@ -10452,6 +10468,17 @@ files = [
|
||||
{file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing"
|
||||
version = "3.7.4.3"
|
||||
description = "Type Hints for Python"
|
||||
optional = true
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
files = [
|
||||
{file = "typing-3.7.4.3-py2-none-any.whl", hash = "sha256:283d868f5071ab9ad873e5e52268d611e851c870a2ba354193026f2dfb29d8b5"},
|
||||
{file = "typing-3.7.4.3.tar.gz", hash = "sha256:1187fb9c82fd670d10aa07bbb6cfcfe4bdda42d6fab8d5134f04e8c4d0b71cc9"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.8.0"
|
||||
@ -10583,6 +10610,16 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotl
|
||||
secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"]
|
||||
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "uuid"
|
||||
version = "1.30"
|
||||
description = "UUID object and generation functions (Python 2.3 or higher)"
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "uuid-1.30.tar.gz", hash = "sha256:1f87cc004ac5120466f36c5beae48b4c48cc411968eed0eaecd3da82aa96193f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "validators"
|
||||
version = "0.22.0"
|
||||
@ -11431,14 +11468,14 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
|
||||
cffi = ["cffi (>=1.11)"]
|
||||
|
||||
[extras]
|
||||
all = ["O365", "aleph-alpha-client", "amadeus", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "dgml-utils", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jinja2", "jq", "lancedb", "langkit", "lark", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
|
||||
all = ["O365", "aleph-alpha-client", "amadeus", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "dgml-utils", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "hologres-vector", "html2text", "huggingface_hub", "jinja2", "jq", "lancedb", "langkit", "lark", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
|
||||
azure = ["azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"]
|
||||
clarifai = ["clarifai"]
|
||||
cli = ["typer"]
|
||||
cohere = ["cohere"]
|
||||
docarray = ["docarray"]
|
||||
embeddings = ["sentence-transformers"]
|
||||
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
|
||||
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
|
||||
javascript = ["esprima"]
|
||||
llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
|
||||
openai = ["openai", "tiktoken"]
|
||||
@ -11448,4 +11485,4 @@ text-helpers = ["chardet"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "92909a7f5f12e9861a45e19cdd271ca516aebe71f4b2702c95b651966d2db6b7"
|
||||
content-hash = "0cd9769243ade0dc1df941e902aa66c18a57333ae50309f004b4f60e6e27b5cf"
|
||||
|
@ -143,6 +143,7 @@ azure-ai-textanalytics = {version = "^5.3.0", optional = true}
|
||||
google-cloud-documentai = {version = "^2.20.1", optional = true}
|
||||
fireworks-ai = {version = "^0.6.0", optional = true, python = ">=3.9,<4.0"}
|
||||
javelin-sdk = {version = "^0.1.8", optional = true}
|
||||
hologres-vector = {version = "^0.0.6", optional = true}
|
||||
praw = {version = "^7.7.1", optional = true}
|
||||
msal = {version = "^1.25.0", optional = true}
|
||||
databricks-vectorsearch = {version = "^0.21", optional = true}
|
||||
@ -315,6 +316,7 @@ all = [
|
||||
"amadeus",
|
||||
"librosa",
|
||||
"python-arango",
|
||||
"hologres-vector",
|
||||
"dgml-utils",
|
||||
]
|
||||
|
||||
@ -386,6 +388,7 @@ extended_testing = [
|
||||
"rspace_client",
|
||||
"fireworks-ai",
|
||||
"javelin-sdk",
|
||||
"hologres-vector",
|
||||
"praw",
|
||||
"databricks-vectorsearch",
|
||||
"dgml-utils",
|
||||
|
Loading…
Reference in New Issue
Block a user