From 97abe337bacb27cc3fdbe1bb8dd40e040ca06e5b Mon Sep 17 00:00:00 2001 From: Martin Holzhauer Date: Sat, 6 May 2023 01:31:38 +0200 Subject: [PATCH] env variable based fix for non chatgpt embeddings (#3964) this is a simple fix for #2219 i also added some documentation for this environment variable --- docs/integrations/pgvector.md | 4 ++++ langchain/vectorstores/pgvector.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/integrations/pgvector.md b/docs/integrations/pgvector.md index 3dcf1cb81b7..5da6fdb535d 100644 --- a/docs/integrations/pgvector.md +++ b/docs/integrations/pgvector.md @@ -24,6 +24,10 @@ To import this vectorstore: from langchain.vectorstores.pgvector import PGVector ``` +PGVector embedding size is not autodetected. If you are using ChatGPT or any other embedding with 1536 dimensions, +the default is fine. If you are going to use, for example, HuggingFaceEmbeddings, you need to set the environment variable named `PGVECTOR_VECTOR_SIZE` +to the needed value. In the case of HuggingFaceEmbeddings it would be: `PGVECTOR_VECTOR_SIZE=768` + ### Usage For a more detailed walkthrough of the PGVector Wrapper, see [this notebook](../modules/indexes/vectorstores/examples/pgvector.ipynb) diff --git a/langchain/vectorstores/pgvector.py b/langchain/vectorstores/pgvector.py index 161e8e2c52e..5edde1b8c5e 100644 --- a/langchain/vectorstores/pgvector.py +++ b/langchain/vectorstores/pgvector.py @@ -3,6 +3,7 @@ from __future__ import annotations import enum import logging +import os import uuid from typing import Any, Dict, Iterable, List, Optional, Tuple, Type @@ -19,7 +20,7 @@ from langchain.vectorstores.base import VectorStore Base = declarative_base() # type: Any -ADA_TOKEN_COUNT = 1536 +PGVECTOR_VECTOR_SIZE = int(os.getenv("PGVECTOR_VECTOR_SIZE", default="1536")) _LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain" @@ -79,7 +80,7 @@ class EmbeddingStore(BaseModel): ) collection = relationship(CollectionStore, back_populates="embeddings") - embedding: 
Vector = sqlalchemy.Column(Vector(ADA_TOKEN_COUNT)) + embedding: Vector = sqlalchemy.Column(Vector(PGVECTOR_VECTOR_SIZE)) document = sqlalchemy.Column(sqlalchemy.String, nullable=True) cmetadata = sqlalchemy.Column(JSON, nullable=True)