Compare commits


2 Commits

Author            SHA1        Message                                                         Date
Dev 2049          64d1e2042e  cr                                                              2023-05-05 16:33:32 -07:00
Martin Holzhauer  0a7245f721  env variable based fix for non chatgpt embeddings (#3964)      2023-05-05 16:31:38 -07:00
                              this is a simple fix for #2219
                              i also added some documentation for this environment variable
3 changed files with 45 additions and 3 deletions

View File

@@ -24,6 +24,10 @@ To import this vectorstore:
 from langchain.vectorstores.pgvector import PGVector
 ```
 
+The PGVector embedding size is not autodetected. If you are using ChatGPT (OpenAI) or any other embedding model
+with 1536 dimensions, the default is fine. If you are going to use, for example, HuggingFaceEmbeddings, set the
+environment variable `PGVECTOR_VECTOR_SIZE` to the required value; for HuggingFaceEmbeddings this would be `PGVECTOR_VECTOR_SIZE=768`.
+
 ### Usage
 
 For a more detailed walkthrough of the PGVector Wrapper, see [this notebook](../modules/indexes/vectorstores/examples/pgvector.ipynb)
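
A rough usage sketch of the workflow this documentation describes (not part of the PR): the HuggingFace model default, the connection string, and the collection name below are illustrative placeholders, and it assumes `PGVECTOR_VECTOR_SIZE` is set before the store is created.

```python
import os

# Assumption for this sketch: the default HuggingFaceEmbeddings model produces
# 768-dimensional vectors, so the column size is overridden before creating the store.
os.environ["PGVECTOR_VECTOR_SIZE"] = "768"

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.pgvector import PGVector

embeddings = HuggingFaceEmbeddings()  # 768-dimensional by default
store = PGVector(
    connection_string="postgresql+psycopg2://user:password@localhost:5432/vectordb",  # placeholder
    embedding_function=embeddings,
    collection_name="my_documents",  # illustrative name
)
store.add_texts(["pgvector stores embeddings alongside your relational data"])
```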

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
 
 import enum
 import logging
+import os
 import uuid
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
 
@@ -13,13 +14,13 @@ from sqlalchemy.orm import Session, declarative_base, relationship
 
 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
-from langchain.utils import get_from_dict_or_env
+from langchain.utils import get_from_dict_or_env, get_from_env
 from langchain.vectorstores.base import VectorStore
 
 Base = declarative_base()  # type: Any
 
-ADA_TOKEN_COUNT = 1536
+PGVECTOR_VECTOR_SIZE = 1536
 
 _LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain"
@@ -79,13 +80,28 @@ class EmbeddingStore(BaseModel):
     )
     collection = relationship(CollectionStore, back_populates="embeddings")
 
-    embedding: Vector = sqlalchemy.Column(Vector(ADA_TOKEN_COUNT))
+    embedding: Vector = sqlalchemy.Column(Vector(PGVECTOR_VECTOR_SIZE))
     document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
     cmetadata = sqlalchemy.Column(JSON, nullable=True)
 
     # custom_id : any user defined id
     custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
 
+    def __init__(
+        self, *args: Any, vector_size: Optional[int] = None, **kwargs: Any
+    ) -> None:
+        if "embedding" not in kwargs:
+            if vector_size is None:
+                vector_size = int(
+                    get_from_env(
+                        "vector_size",
+                        "PGVECTOR_VECTOR_SIZE",
+                        default=str(PGVECTOR_VECTOR_SIZE),
+                    )
+                )
+            kwargs["embedding"] = sqlalchemy.Column(Vector(vector_size))
+        super().__init__(*args, **kwargs)
+
 
 class QueryResult:
     EmbeddingStore: EmbeddingStore
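
The new `__init__` resolves the embedding column dimension with a simple precedence: an explicit `vector_size` argument wins, then the `PGVECTOR_VECTOR_SIZE` environment variable, then the 1536 default. The snippet below is a standalone sketch of that lookup for illustration only; the helper name `resolve_vector_size` is not part of the PR.

```python
import os
from typing import Optional

PGVECTOR_VECTOR_SIZE = 1536  # default, mirroring the module-level constant above


def resolve_vector_size(vector_size: Optional[int] = None) -> int:
    """Illustrative restatement of the precedence used in EmbeddingStore.__init__."""
    if vector_size is not None:
        return vector_size  # explicit argument wins
    # otherwise fall back to the environment variable, then to the default
    return int(os.environ.get("PGVECTOR_VECTOR_SIZE", str(PGVECTOR_VECTOR_SIZE)))


assert resolve_vector_size(768) == 768   # explicit argument
os.environ["PGVECTOR_VECTOR_SIZE"] = "384"
assert resolve_vector_size() == 384      # environment variable
del os.environ["PGVECTOR_VECTOR_SIZE"]
assert resolve_vector_size() == 1536     # default
```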

View File

@@ -0,0 +1,22 @@
+import os
+
+from langchain.vectorstores.pgvector import PGVECTOR_VECTOR_SIZE, EmbeddingStore
+
+
+def test_embedding_store_init_defaults() -> None:
+    expected = PGVECTOR_VECTOR_SIZE
+    actual = EmbeddingStore().embedding.type.dim
+    assert expected == actual
+
+
+def test_embedding_store_init_vector_size() -> None:
+    expected = 2
+    actual = EmbeddingStore(vector_size=2).embedding.type.dim
+    assert expected == actual
+
+
+def test_embedding_store_init_env_vector_size() -> None:
+    os.environ["PGVECTOR_VECTOR_SIZE"] = "3"
+    expected = 3
+    actual = EmbeddingStore().embedding.type.dim
+    assert expected == actual