Compare commits


2 Commits

Author            SHA1        Message                                                         Date
Dev 2049          64d1e2042e  cr                                                              2023-05-05 16:33:32 -07:00
Martin Holzhauer  0a7245f721  env variable based fix for non chatgpt embeddings (#3964)      2023-05-05 16:31:38 -07:00
                              this is a simple fix for #2219
                              i also added some documentation for this environment variable
3 changed files with 45 additions and 3 deletions

View File

@@ -24,6 +24,10 @@ To import this vectorstore:
 from langchain.vectorstores.pgvector import PGVector
 ```
 
+The PGVector embedding size is not autodetected. If you are using ChatGPT (OpenAI) or any other embedding model
+with 1536 dimensions, the default is fine. If you are going to use, for example, HuggingFaceEmbeddings, set the
+environment variable `PGVECTOR_VECTOR_SIZE` to the required value; for HuggingFaceEmbeddings this would be `PGVECTOR_VECTOR_SIZE=768`.
+
 ### Usage
 
 For a more detailed walkthrough of the PGVector Wrapper, see [this notebook](../modules/indexes/vectorstores/examples/pgvector.ipynb)
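
A rough usage sketch of the workflow this documentation describes (not part of the PR): the HuggingFace model default, the connection string, and the collection name below are illustrative placeholders, and it assumes `PGVECTOR_VECTOR_SIZE` is set before the store is created.

```python
import os

# Assumption for this sketch: the default HuggingFaceEmbeddings model produces
# 768-dimensional vectors, so the column size is overridden before creating the store.
os.environ["PGVECTOR_VECTOR_SIZE"] = "768"

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.pgvector import PGVector

embeddings = HuggingFaceEmbeddings()  # 768-dimensional by default
store = PGVector(
    connection_string="postgresql+psycopg2://user:password@localhost:5432/vectordb",  # placeholder
    embedding_function=embeddings,
    collection_name="my_documents",  # illustrative name
)
store.add_texts(["pgvector stores embeddings alongside your relational data"])
```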

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
 
 import enum
 import logging
+import os
 import uuid
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
 
@@ -13,13 +14,13 @@ from sqlalchemy.orm import Session, declarative_base, relationship
 
 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
-from langchain.utils import get_from_dict_or_env
+from langchain.utils import get_from_dict_or_env, get_from_env
 from langchain.vectorstores.base import VectorStore
 
 Base = declarative_base()  # type: Any
 
-ADA_TOKEN_COUNT = 1536
+PGVECTOR_VECTOR_SIZE = 1536
 
 _LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain"
@@ -79,13 +80,28 @@ class EmbeddingStore(BaseModel):
     )
     collection = relationship(CollectionStore, back_populates="embeddings")
 
-    embedding: Vector = sqlalchemy.Column(Vector(ADA_TOKEN_COUNT))
+    embedding: Vector = sqlalchemy.Column(Vector(PGVECTOR_VECTOR_SIZE))
     document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
     cmetadata = sqlalchemy.Column(JSON, nullable=True)
 
     # custom_id : any user defined id
     custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
 
+    def __init__(
+        self, *args: Any, vector_size: Optional[int] = None, **kwargs: Any
+    ) -> None:
+        if "embedding" not in kwargs:
+            if vector_size is None:
+                vector_size = int(
+                    get_from_env(
+                        "vector_size",
+                        "PGVECTOR_VECTOR_SIZE",
+                        default=str(PGVECTOR_VECTOR_SIZE),
+                    )
+                )
+            kwargs["embedding"] = sqlalchemy.Column(Vector(vector_size))
+        super().__init__(*args, **kwargs)
+
 
 class QueryResult:
     EmbeddingStore: EmbeddingStore
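
The new `__init__` resolves the embedding column dimension with a simple precedence: an explicit `vector_size` argument wins, then the `PGVECTOR_VECTOR_SIZE` environment variable, then the 1536 default. The snippet below is a standalone sketch of that lookup for illustration only; the helper name `resolve_vector_size` is not part of the PR.

```python
import os
from typing import Optional

PGVECTOR_VECTOR_SIZE = 1536  # default, mirroring the module-level constant above


def resolve_vector_size(vector_size: Optional[int] = None) -> int:
    """Illustrative restatement of the precedence used in EmbeddingStore.__init__."""
    if vector_size is not None:
        return vector_size  # explicit argument wins
    # otherwise fall back to the environment variable, then to the default
    return int(os.environ.get("PGVECTOR_VECTOR_SIZE", str(PGVECTOR_VECTOR_SIZE)))


assert resolve_vector_size(768) == 768   # explicit argument
os.environ["PGVECTOR_VECTOR_SIZE"] = "384"
assert resolve_vector_size() == 384      # environment variable
del os.environ["PGVECTOR_VECTOR_SIZE"]
assert resolve_vector_size() == 1536     # default
```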

View File

@@ -0,0 +1,22 @@
+import os
+
+from langchain.vectorstores.pgvector import PGVECTOR_VECTOR_SIZE, EmbeddingStore
+
+
+def test_embedding_store_init_defaults() -> None:
+    expected = PGVECTOR_VECTOR_SIZE
+    actual = EmbeddingStore().embedding.type.dim
+    assert expected == actual
+
+
+def test_embedding_store_init_vector_size() -> None:
+    expected = 2
+    actual = EmbeddingStore(vector_size=2).embedding.type.dim
+    assert expected == actual
+
+
+def test_embedding_store_init_env_vector_size() -> None:
+    os.environ["PGVECTOR_VECTOR_SIZE"] = "3"
+    expected = 3
+    actual = EmbeddingStore().embedding.type.dim
+    assert expected == actual