diff --git a/docs/docs/integrations/providers/hologres.mdx b/docs/docs/integrations/providers/hologres.mdx
index 44ecf3008b1..dea4c567ab7 100644
--- a/docs/docs/integrations/providers/hologres.mdx
+++ b/docs/docs/integrations/providers/hologres.mdx
@@ -11,7 +11,7 @@ Click [here](https://www.alibabacloud.com/zh/product/hologres) to fast deploy a Hologres cloud instance.
 
 ```bash
-pip install psycopg2
+pip install hologres-vector
 ```
 
 ## Vector Store
 
diff --git a/docs/docs/integrations/vectorstores/hologres.ipynb b/docs/docs/integrations/vectorstores/hologres.ipynb
index 12892056803..a3c1da5531a 100644
--- a/docs/docs/integrations/vectorstores/hologres.ipynb
+++ b/docs/docs/integrations/vectorstores/hologres.ipynb
@@ -22,7 +22,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#!pip install psycopg2"
+    "!pip install hologres-vector"
    ]
   },
   {
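For orientation, the install swap above is what the updated notebook drives end to end. Below is a minimal sketch of that flow, assuming OpenAI credentials, a reachable Hologres instance, and a placeholder DSN; `HOLOGRES_CONNECTION_STRING` and the `ndims` default of 1536 come from the vector store code later in this patch, while the exact `from_texts` keywords are an assumption based on the existing class.

```python
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.hologres import Hologres

# The store falls back to HOLOGRES_CONNECTION_STRING when no connection
# string is passed explicitly; the DSN below is a placeholder.
os.environ["HOLOGRES_CONNECTION_STRING"] = (
    "dbname=langchain user=user password=password host=host-url port=80"
)

vectorstore = Hologres.from_texts(
    texts=["foo", "bar"],
    embedding=OpenAIEmbeddings(),  # 1536-dim vectors, matching ADA_TOKEN_COUNT
    table_name="langchain_example",
)
print(vectorstore.similarity_search("foo", k=1)[0].page_content)
```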
diff --git a/libs/langchain/langchain/vectorstores/hologres.py b/libs/langchain/langchain/vectorstores/hologres.py
index 1739a4b9b87..eed7006b09d 100644
--- a/libs/langchain/langchain/vectorstores/hologres.py
+++ b/libs/langchain/langchain/vectorstores/hologres.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import json
 import logging
 import uuid
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
@@ -15,104 +14,6 @@
 ADA_TOKEN_COUNT = 1536
 _LANGCHAIN_DEFAULT_TABLE_NAME = "langchain_pg_embedding"
 
 
-class HologresWrapper:
-    """`Hologres API` wrapper."""
-
-    def __init__(self, connection_string: str, ndims: int, table_name: str) -> None:
-        """Initialize the wrapper.
-
-        Args:
-            connection_string: Hologres connection string.
-            ndims: Number of dimensions of the embedding output.
-            table_name: Name of the table to store embeddings and data.
-        """
-
-        import psycopg2
-
-        self.table_name = table_name
-        self.conn = psycopg2.connect(connection_string)
-        self.cursor = self.conn.cursor()
-        self.conn.autocommit = False
-        self.ndims = ndims
-
-    def create_vector_extension(self) -> None:
-        self.cursor.execute("create extension if not exists proxima")
-        self.conn.commit()
-
-    def create_table(self, drop_if_exist: bool = True) -> None:
-        if drop_if_exist:
-            self.cursor.execute(f"drop table if exists {self.table_name}")
-            self.conn.commit()
-
-        self.cursor.execute(
-            f"""create table if not exists {self.table_name} (
-id text,
-embedding float4[] check(array_ndims(embedding) = 1 and \
-array_length(embedding, 1) = {self.ndims}),
-metadata json,
-document text);"""
-        )
-        self.cursor.execute(
-            f"call set_table_property('{self.table_name}'"
-            + """, 'proxima_vectors',
-'{"embedding":{"algorithm":"Graph",
-"distance_method":"SquaredEuclidean",
-"build_params":{"min_flush_proxima_row_count" : 1,
-"min_compaction_proxima_row_count" : 1,
-"max_total_size_to_merge_mb" : 2000}}}');"""
-        )
-        self.conn.commit()
-
-    def get_by_id(self, id: str) -> List[Tuple]:
-        statement = (
-            f"select id, embedding, metadata, "
-            f"document from {self.table_name} where id = %s;"
-        )
-        self.cursor.execute(
-            statement,
-            (id),
-        )
-        self.conn.commit()
-        return self.cursor.fetchall()
-
-    def insert(
-        self,
-        embedding: List[float],
-        metadata: dict,
-        document: str,
-        id: Optional[str] = None,
-    ) -> None:
-        self.cursor.execute(
-            f'insert into "{self.table_name}" '
-            f"values (%s, array{json.dumps(embedding)}::float4[], %s, %s)",
-            (id if id is not None else "null", json.dumps(metadata), document),
-        )
-        self.conn.commit()
-
-    def query_nearest_neighbours(
-        self, embedding: List[float], k: int, filter: Optional[Dict[str, str]] = None
-    ) -> List[Tuple[str, str, float]]:
-        params = []
-        filter_clause = ""
-        if filter is not None:
-            conjuncts = []
-            for key, val in filter.items():
-                conjuncts.append("metadata->>%s=%s")
-                params.append(key)
-                params.append(val)
-            filter_clause = "where " + " and ".join(conjuncts)
-
-        sql = (
-            f"select document, metadata::text, "
-            f"pm_approx_squared_euclidean_distance(array{json.dumps(embedding)}"
-            f"::float4[], embedding) as distance from"
-            f" {self.table_name} {filter_clause} order by distance asc limit {k};"
-        )
-        self.cursor.execute(sql, tuple(params))
-        self.conn.commit()
-        return self.cursor.fetchall()
-
-
 class Hologres(VectorStore):
     """`Hologres API` vector store.
@@ -152,26 +53,20 @@ class Hologres(VectorStore):
         """
         Initialize the store.
         """
-        self.storage = HologresWrapper(
-            self.connection_string, self.ndims, self.table_name
+        from hologres_vector import HologresVector
+
+        self.storage = HologresVector(
+            self.connection_string,
+            ndims=self.ndims,
+            table_name=self.table_name,
+            table_schema={"document": "text"},
+            pre_delete_table=self.pre_delete_table,
         )
-        self.create_vector_extension()
-        self.create_table()
 
     @property
     def embeddings(self) -> Embeddings:
         return self.embedding_function
 
-    def create_vector_extension(self) -> None:
-        try:
-            self.storage.create_vector_extension()
-        except Exception as e:
-            self.logger.exception(e)
-            raise e
-
-    def create_table(self) -> None:
-        self.storage.create_table(self.pre_delete_table)
-
     @classmethod
     def __from(
         cls,
@@ -224,11 +119,10 @@ class Hologres(VectorStore):
             kwargs: vectorstore specific parameters
         """
         try:
-            for text, metadata, embedding, id in zip(texts, metadatas, embeddings, ids):
-                self.storage.insert(embedding, metadata, text, id)
+            schema_datas = [{"document": t} for t in texts]
+            self.storage.upsert_vectors(embeddings, ids, metadatas, schema_datas)
         except Exception as e:
             self.logger.exception(e)
-        self.storage.conn.commit()
 
     def add_texts(
         self,
@@ -333,17 +227,17 @@ class Hologres(VectorStore):
         k: int = 4,
         filter: Optional[dict] = None,
     ) -> List[Tuple[Document, float]]:
-        results: List[Tuple[str, str, float]] = self.storage.query_nearest_neighbours(
-            embedding, k, filter
+        results: List[dict[str, Any]] = self.storage.search(
+            embedding, k=k, select_columns=["document"], metadata_filters=filter
         )
 
         docs = [
             (
                 Document(
-                    page_content=result[0],
-                    metadata=json.loads(result[1]),
+                    page_content=result["document"],
+                    metadata=result["metadata"],
                 ),
-                result[2],
+                result["distance"],
             )
             for result in results
         ]
@@ -363,9 +257,11 @@ class Hologres(VectorStore):
     ) -> Hologres:
         """
         Return VectorStore initialized from texts and embeddings.
-        Postgres connection string is required
+        Hologres connection string is required
         "Either pass it as a parameter or set the HOLOGRES_CONNECTION_STRING
         environment variable.
+        Create the connection string by calling
+        HologresVector.connection_string_from_db_params
         """
 
         embeddings = embedding.embed_documents(list(texts))
@@ -397,9 +293,11 @@ class Hologres(VectorStore):
             generated embeddings.
 
         Return VectorStore initialized from documents and embeddings.
-        Postgres connection string is required
+        Hologres connection string is required
         "Either pass it as a parameter or set the HOLOGRES_CONNECTION_STRING
         environment variable.
+        Create the connection string by calling
+        HologresVector.connection_string_from_db_params
 
         Example:
             .. code-block:: python
@@ -463,9 +361,11 @@ class Hologres(VectorStore):
 
         if not connection_string:
             raise ValueError(
-                "Postgres connection string is required"
+                "Hologres connection string is required"
                 "Either pass it as a parameter"
                 "or set the HOLOGRES_CONNECTION_STRING environment variable."
+                "Create the connection string by calling"
+                "HologresVector.connection_string_from_db_params"
             )
 
         return connection_string
@@ -483,9 +383,11 @@ class Hologres(VectorStore):
     ) -> Hologres:
         """
         Return VectorStore initialized from documents and embeddings.
-        Postgres connection string is required
+        Hologres connection string is required
         "Either pass it as a parameter or set the HOLOGRES_CONNECTION_STRING
         environment variable.
+        Create the connection string by calling
+        HologresVector.connection_string_from_db_params
         """
 
         texts = [d.page_content for d in documents]
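The hunks above remove roughly a hundred lines of hand-rolled psycopg2 plumbing in favor of three calls into `hologres-vector`. The sketch below exercises those same calls standalone, assuming `hologres-vector` 0.0.6 and a placeholder DSN; the constructor keywords, the `upsert_vectors` argument order, and the `document`/`metadata`/`distance` keys on `search` results are taken directly from the new code paths in this diff.

```python
import uuid

from hologres_vector import HologresVector

conn_str = "dbname=langchain user=user password=password host=host-url port=80"  # placeholder

store = HologresVector(
    conn_str,
    ndims=1536,                         # ADA_TOKEN_COUNT in the module above
    table_name="langchain_pg_embedding",
    table_schema={"document": "text"},  # extra column holding the raw text
    pre_delete_table=False,
)

# upsert_vectors(embeddings, ids, metadatas, schema_datas) replaces the old
# row-by-row insert() loop and explicit conn.commit().
store.upsert_vectors(
    [[0.1] * 1536],
    [str(uuid.uuid4())],
    [{"topic": "demo"}],
    [{"document": "hello hologres"}],
)

# search() replaces query_nearest_neighbours(); metadata_filters maps onto
# the old metadata->>key = value conjuncts.
for row in store.search(
    [0.1] * 1536, k=4, select_columns=["document"], metadata_filters={"topic": "demo"}
):
    print(row["document"], row["metadata"], row["distance"])
```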
diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock
index 607f41237db..717a022829d 100644
--- a/libs/langchain/poetry.lock
+++ b/libs/langchain/poetry.lock
@@ -3281,6 +3281,22 @@ files = [
 [package.dependencies]
 numpy = "*"
 
+[[package]]
+name = "hologres-vector"
+version = "0.0.6"
+description = ""
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "hologres_vector-0.0.6-py3-none-any.whl", hash = "sha256:c506eaafd9ae8c529955605fae71856e95191a64dde144d0a25b06536e6544a4"},
+    {file = "hologres_vector-0.0.6.tar.gz", hash = "sha256:13251b74bcb9ef2af61cc39c6f155e16452e03891c2f0a07f708f0157baf7b08"},
+]
+
+[package.dependencies]
+psycopg2-binary = "*"
+typing = "*"
+uuid = "*"
+
 [[package]]
 name = "hpack"
 version = "4.0.0"
@@ -10452,6 +10468,17 @@ files = [
     {file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"},
 ]
 
+[[package]]
+name = "typing"
+version = "3.7.4.3"
+description = "Type Hints for Python"
+optional = true
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "typing-3.7.4.3-py2-none-any.whl", hash = "sha256:283d868f5071ab9ad873e5e52268d611e851c870a2ba354193026f2dfb29d8b5"},
+    {file = "typing-3.7.4.3.tar.gz", hash = "sha256:1187fb9c82fd670d10aa07bbb6cfcfe4bdda42d6fab8d5134f04e8c4d0b71cc9"},
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.8.0"
@@ -10583,6 +10610,16 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"]
 secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"]
 socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
 
+[[package]]
+name = "uuid"
+version = "1.30"
+description = "UUID object and generation functions (Python 2.3 or higher)"
+optional = true
+python-versions = "*"
+files = [
+    {file = "uuid-1.30.tar.gz", hash = "sha256:1f87cc004ac5120466f36c5beae48b4c48cc411968eed0eaecd3da82aa96193f"},
+]
+
 [[package]]
 name = "validators"
 version = "0.22.0"
@@ -11431,14 +11468,14 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""}
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-all = ["O365", "aleph-alpha-client", "amadeus", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "dgml-utils", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jinja2", "jq", "lancedb", "langkit", "lark", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
+all = ["O365", "aleph-alpha-client", "amadeus", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "dgml-utils", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "hologres-vector", "html2text", "huggingface_hub", "jinja2", "jq", "lancedb", "langkit", "lark", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
 azure = ["azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"]
 clarifai = ["clarifai"]
 cli = ["typer"]
 cohere = ["cohere"]
 docarray = ["docarray"]
 embeddings = ["sentence-transformers"]
-extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
+extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
 javascript = ["esprima"]
 llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
 openai = ["openai", "tiktoken"]
@@ -11448,4 +11485,4 @@ text-helpers = ["chardet"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "92909a7f5f12e9861a45e19cdd271ca516aebe71f4b2702c95b651966d2db6b7"
+content-hash = "0cd9769243ade0dc1df941e902aa66c18a57333ae50309f004b4f60e6e27b5cf"
"jq", "lancedb", "langkit", "lark", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] +all = ["O365", "aleph-alpha-client", "amadeus", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "dgml-utils", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "hologres-vector", "html2text", "huggingface_hub", "jinja2", "jq", "lancedb", "langkit", "lark", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] azure = ["azure-ai-formrecognizer", "azure-ai-textanalytics", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"] clarifai = ["clarifai"] cli = ["typer"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"] +extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", 
"pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"] javascript = ["esprima"] llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] @@ -11448,4 +11485,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "92909a7f5f12e9861a45e19cdd271ca516aebe71f4b2702c95b651966d2db6b7" +content-hash = "0cd9769243ade0dc1df941e902aa66c18a57333ae50309f004b4f60e6e27b5cf" diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index 93bbf763847..31e0a60f89a 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -143,6 +143,7 @@ azure-ai-textanalytics = {version = "^5.3.0", optional = true} google-cloud-documentai = {version = "^2.20.1", optional = true} fireworks-ai = {version = "^0.6.0", optional = true, python = ">=3.9,<4.0"} javelin-sdk = {version = "^0.1.8", optional = true} +hologres-vector = {version = "^0.0.6", optional = true} praw = {version = "^7.7.1", optional = true} msal = {version = "^1.25.0", optional = true} databricks-vectorsearch = {version = "^0.21", optional = true} @@ -315,6 +316,7 @@ all = [ "amadeus", "librosa", "python-arango", + "hologres-vector", "dgml-utils", ] @@ -386,6 +388,7 @@ extended_testing = [ "rspace_client", "fireworks-ai", "javelin-sdk", + "hologres-vector", "praw", "databricks-vectorsearch", "dgml-utils",