Update Hologres vector store: use hologres-vector (#13767)

Hi,
I made some changes to the Hologres vector store to improve
data-insertion performance.
This version also switches to the `hologres-vector` library, which is
easier for us to maintain and performs better.
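
For reference, this is roughly how the new library is used (a sketch based
on the calls in this diff; the connection string and data values below are
placeholders):

```python
from hologres_vector import HologresVector

# Placeholder DSN: build yours from your instance's host, port,
# database, user, and password.
storage = HologresVector(
    "postgresql://user:password@host:80/dbname",
    ndims=1536,                          # embedding dimensionality
    table_name="langchain_pg_embedding",
    table_schema={"document": "text"},   # extra non-vector column
    pre_delete_table=False,
)

# One batched call replaces the old per-row INSERT + commit loop,
# which is where the insertion speedup comes from.
storage.upsert_vectors(
    [[0.1] * 1536, [0.2] * 1536],        # embeddings
    ["id-1", "id-2"],                    # ids
    [{"topic": "a"}, {"topic": "b"}],    # metadata
    [{"document": "first text"}, {"document": "second text"}],
)

# Nearest-neighbour search with an optional metadata filter.
results = storage.search(
    [0.1] * 1536,
    k=4,
    select_columns=["document"],
    metadata_filters={"topic": "a"},
)
```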
The code passes the format/lint/spell checks, and I have run the
Hologres unit tests against my own database.
Please review this PR again and let me know if anything needs to change.

Best,
Changgeng,
Developer @ Alibaba Cloud

Co-authored-by: Changgeng Zhao <zhaochanggeng.zcg@alibaba-inc.com>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Changgeng Zhao authored on 2023-12-04 03:50:45 +08:00, committed by GitHub
parent 0de7cf898d
commit 9b59bde93d
5 changed files with 72 additions and 130 deletions

View File

@@ -11,7 +11,7 @@
 Click [here](https://www.alibabacloud.com/zh/product/hologres) to fast deploy a Hologres cloud instance.

 ```bash
-pip install psycopg2
+pip install hologres-vector
 ```

 ## Vector Store

View File

@@ -22,7 +22,7 @@
     "metadata": {},
     "outputs": [],
     "source": [
-     "#!pip install psycopg2"
+     "!pip install hologres-vector"
     ]
    },
    {

View File

@ -1,6 +1,5 @@
from __future__ import annotations from __future__ import annotations
import json
import logging import logging
import uuid import uuid
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
@@ -15,104 +14,6 @@ ADA_TOKEN_COUNT = 1536
 _LANGCHAIN_DEFAULT_TABLE_NAME = "langchain_pg_embedding"


-class HologresWrapper:
-    """`Hologres API` wrapper."""
-
-    def __init__(self, connection_string: str, ndims: int, table_name: str) -> None:
-        """Initialize the wrapper.
-
-        Args:
-            connection_string: Hologres connection string.
-            ndims: Number of dimensions of the embedding output.
-            table_name: Name of the table to store embeddings and data.
-        """
-        import psycopg2
-
-        self.table_name = table_name
-        self.conn = psycopg2.connect(connection_string)
-        self.cursor = self.conn.cursor()
-        self.conn.autocommit = False
-        self.ndims = ndims
-
-    def create_vector_extension(self) -> None:
-        self.cursor.execute("create extension if not exists proxima")
-        self.conn.commit()
-
-    def create_table(self, drop_if_exist: bool = True) -> None:
-        if drop_if_exist:
-            self.cursor.execute(f"drop table if exists {self.table_name}")
-            self.conn.commit()
-
-        self.cursor.execute(
-            f"""create table if not exists {self.table_name} (
-id text,
-embedding float4[] check(array_ndims(embedding) = 1 and \
-array_length(embedding, 1) = {self.ndims}),
-metadata json,
-document text);"""
-        )
-        self.cursor.execute(
-            f"call set_table_property('{self.table_name}'"
-            + """, 'proxima_vectors',
-'{"embedding":{"algorithm":"Graph",
-"distance_method":"SquaredEuclidean",
-"build_params":{"min_flush_proxima_row_count" : 1,
-"min_compaction_proxima_row_count" : 1,
-"max_total_size_to_merge_mb" : 2000}}}');"""
-        )
-        self.conn.commit()
-
-    def get_by_id(self, id: str) -> List[Tuple]:
-        statement = (
-            f"select id, embedding, metadata, "
-            f"document from {self.table_name} where id = %s;"
-        )
-        self.cursor.execute(
-            statement,
-            (id),
-        )
-        self.conn.commit()
-        return self.cursor.fetchall()
-
-    def insert(
-        self,
-        embedding: List[float],
-        metadata: dict,
-        document: str,
-        id: Optional[str] = None,
-    ) -> None:
-        self.cursor.execute(
-            f'insert into "{self.table_name}" '
-            f"values (%s, array{json.dumps(embedding)}::float4[], %s, %s)",
-            (id if id is not None else "null", json.dumps(metadata), document),
-        )
-        self.conn.commit()
-
-    def query_nearest_neighbours(
-        self, embedding: List[float], k: int, filter: Optional[Dict[str, str]] = None
-    ) -> List[Tuple[str, str, float]]:
-        params = []
-        filter_clause = ""
-        if filter is not None:
-            conjuncts = []
-            for key, val in filter.items():
-                conjuncts.append("metadata->>%s=%s")
-                params.append(key)
-                params.append(val)
-            filter_clause = "where " + " and ".join(conjuncts)
-        sql = (
-            f"select document, metadata::text, "
-            f"pm_approx_squared_euclidean_distance(array{json.dumps(embedding)}"
-            f"::float4[], embedding) as distance from"
-            f" {self.table_name} {filter_clause} order by distance asc limit {k};"
-        )
-        self.cursor.execute(sql, tuple(params))
-        self.conn.commit()
-        return self.cursor.fetchall()
-
-
 class Hologres(VectorStore):
     """`Hologres API` vector store.
@@ -152,26 +53,20 @@ class Hologres(VectorStore):
         """
         Initialize the store.
         """
-        self.storage = HologresWrapper(
-            self.connection_string, self.ndims, self.table_name
+        from hologres_vector import HologresVector
+
+        self.storage = HologresVector(
+            self.connection_string,
+            ndims=self.ndims,
+            table_name=self.table_name,
+            table_schema={"document": "text"},
+            pre_delete_table=self.pre_delete_table,
         )
-        self.create_vector_extension()
-        self.create_table()

     @property
     def embeddings(self) -> Embeddings:
         return self.embedding_function

-    def create_vector_extension(self) -> None:
-        try:
-            self.storage.create_vector_extension()
-        except Exception as e:
-            self.logger.exception(e)
-            raise e
-
-    def create_table(self) -> None:
-        self.storage.create_table(self.pre_delete_table)
-
     @classmethod
     def __from(
         cls,
@@ -224,11 +119,10 @@
             kwargs: vectorstore specific parameters
         """
         try:
-            for text, metadata, embedding, id in zip(texts, metadatas, embeddings, ids):
-                self.storage.insert(embedding, metadata, text, id)
+            schema_datas = [{"document": t} for t in texts]
+            self.storage.upsert_vectors(embeddings, ids, metadatas, schema_datas)
         except Exception as e:
             self.logger.exception(e)
-            self.storage.conn.commit()

     def add_texts(
         self,
@ -333,17 +227,17 @@ class Hologres(VectorStore):
k: int = 4, k: int = 4,
filter: Optional[dict] = None, filter: Optional[dict] = None,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
results: List[Tuple[str, str, float]] = self.storage.query_nearest_neighbours( results: List[dict[str, Any]] = self.storage.search(
embedding, k, filter embedding, k=k, select_columns=["document"], metadata_filters=filter
) )
docs = [ docs = [
( (
Document( Document(
page_content=result[0], page_content=result["document"],
metadata=json.loads(result[1]), metadata=result["metadata"],
), ),
result[2], result["distance"],
) )
for result in results for result in results
] ]
@@ -363,9 +257,11 @@
     ) -> Hologres:
         """
         Return VectorStore initialized from texts and embeddings.
-        Postgres connection string is required
+        Hologres connection string is required
         "Either pass it as a parameter
         or set the HOLOGRES_CONNECTION_STRING environment variable.
+        Create the connection string by calling
+        HologresVector.connection_string_from_db_params
         """
         embeddings = embedding.embed_documents(list(texts))
@ -397,9 +293,11 @@ class Hologres(VectorStore):
generated embeddings. generated embeddings.
Return VectorStore initialized from documents and embeddings. Return VectorStore initialized from documents and embeddings.
Postgres connection string is required Hologres connection string is required
"Either pass it as a parameter "Either pass it as a parameter
or set the HOLOGRES_CONNECTION_STRING environment variable. or set the HOLOGRES_CONNECTION_STRING environment variable.
Create the connection string by calling
HologresVector.connection_string_from_db_params
Example: Example:
.. code-block:: python .. code-block:: python
@ -463,9 +361,11 @@ class Hologres(VectorStore):
if not connection_string: if not connection_string:
raise ValueError( raise ValueError(
"Postgres connection string is required" "Hologres connection string is required"
"Either pass it as a parameter" "Either pass it as a parameter"
"or set the HOLOGRES_CONNECTION_STRING environment variable." "or set the HOLOGRES_CONNECTION_STRING environment variable."
"Create the connection string by calling"
"HologresVector.connection_string_from_db_params"
) )
return connection_string return connection_string
@@ -483,9 +383,11 @@
     ) -> Hologres:
         """
         Return VectorStore initialized from documents and embeddings.
-        Postgres connection string is required
+        Hologres connection string is required
         "Either pass it as a parameter
         or set the HOLOGRES_CONNECTION_STRING environment variable.
+        Create the connection string by calling
+        HologresVector.connection_string_from_db_params
         """
         texts = [d.page_content for d in documents]
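
Taken together, end-to-end usage of the updated store looks roughly like
the sketch below. `OpenAIEmbeddings` stands in for any `Embeddings`
implementation, the connection parameters are placeholders, and the store
reads the connection string from the `connection_string` kwarg or the
`HOLOGRES_CONNECTION_STRING` environment variable, per the docstrings above:

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Hologres

# Placeholder credentials; alternatively export HOLOGRES_CONNECTION_STRING.
connection_string = Hologres.connection_string_from_db_params(
    host="my-instance-cn-hangzhou.hologres.aliyuncs.com",
    port=80,
    database="langchain",
    user="my_user",
    password="my_password",
)

vectorstore = Hologres.from_texts(
    texts=["foo", "bar"],
    embedding=OpenAIEmbeddings(),
    metadatas=[{"page": "0"}, {"page": "1"}],
    connection_string=connection_string,
)

# Nearest-neighbour lookup over the stored embeddings.
docs = vectorstore.similarity_search("foo", k=1)
```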

View File

@@ -3281,6 +3281,22 @@ files = [
 [package.dependencies]
 numpy = "*"

+[[package]]
+name = "hologres-vector"
+version = "0.0.6"
+description = ""
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "hologres_vector-0.0.6-py3-none-any.whl", hash = "sha256:c506eaafd9ae8c529955605fae71856e95191a64dde144d0a25b06536e6544a4"},
+    {file = "hologres_vector-0.0.6.tar.gz", hash = "sha256:13251b74bcb9ef2af61cc39c6f155e16452e03891c2f0a07f708f0157baf7b08"},
+]
+
+[package.dependencies]
+psycopg2-binary = "*"
+typing = "*"
+uuid = "*"
+
 [[package]]
 name = "hpack"
 version = "4.0.0"
@@ -10452,6 +10468,17 @@ files = [
     {file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"},
 ]

+[[package]]
+name = "typing"
+version = "3.7.4.3"
+description = "Type Hints for Python"
+optional = true
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "typing-3.7.4.3-py2-none-any.whl", hash = "sha256:283d868f5071ab9ad873e5e52268d611e851c870a2ba354193026f2dfb29d8b5"},
+    {file = "typing-3.7.4.3.tar.gz", hash = "sha256:1187fb9c82fd670d10aa07bbb6cfcfe4bdda42d6fab8d5134f04e8c4d0b71cc9"},
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.8.0"
@@ -10583,6 +10610,16 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotl
 secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"]
 socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]

+[[package]]
+name = "uuid"
+version = "1.30"
+description = "UUID object and generation functions (Python 2.3 or higher)"
+optional = true
+python-versions = "*"
+files = [
+    {file = "uuid-1.30.tar.gz", hash = "sha256:1f87cc004ac5120466f36c5beae48b4c48cc411968eed0eaecd3da82aa96193f"},
+]
+
 [[package]]
 name = "validators"
 version = "0.22.0"
@@ -11431,14 +11468,14 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]

 [extras]
-all = ["O365", "aleph-alpha-client", "amadeus", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "dgml-utils", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jinja2", "jq", "lancedb", "langkit", "lark", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
+all = ["O365", "aleph-alpha-client", "amadeus", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "dgml-utils", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "hologres-vector", "html2text", "huggingface_hub", "jinja2", "jq", "lancedb", "langkit", "lark", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
 azure = ["azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"]
 clarifai = ["clarifai"]
 cli = ["typer"]
 cohere = ["cohere"]
 docarray = ["docarray"]
 embeddings = ["sentence-transformers"]
-extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
+extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
 javascript = ["esprima"]
 llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
 openai = ["openai", "tiktoken"]
@@ -11448,4 +11485,4 @@ text-helpers = ["chardet"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "92909a7f5f12e9861a45e19cdd271ca516aebe71f4b2702c95b651966d2db6b7"
+content-hash = "0cd9769243ade0dc1df941e902aa66c18a57333ae50309f004b4f60e6e27b5cf"

View File

@@ -143,6 +143,7 @@ azure-ai-textanalytics = {version = "^5.3.0", optional = true}
 google-cloud-documentai = {version = "^2.20.1", optional = true}
 fireworks-ai = {version = "^0.6.0", optional = true, python = ">=3.9,<4.0"}
 javelin-sdk = {version = "^0.1.8", optional = true}
+hologres-vector = {version = "^0.0.6", optional = true}
 praw = {version = "^7.7.1", optional = true}
 msal = {version = "^1.25.0", optional = true}
 databricks-vectorsearch = {version = "^0.21", optional = true}
@@ -315,6 +316,7 @@ all = [
     "amadeus",
     "librosa",
     "python-arango",
+    "hologres-vector",
     "dgml-utils",
 ]
@@ -386,6 +388,7 @@ extended_testing = [
     "rspace_client",
     "fireworks-ai",
     "javelin-sdk",
+    "hologres-vector",
     "praw",
     "databricks-vectorsearch",
     "dgml-utils",