mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-24 15:43:54 +00:00
community[patch]: update the default hf bge embeddings (#22627)
**Description:** This updates the langchain_community > huggingface > default bge embeddings ([the current default recommends this change](https://huggingface.co/BAAI/bge-large-en)) **Issue:** None **Dependencies:** None **Twitter handle:** @jonzeolla --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
150251fd49
commit
78ff51ce83
@ -94,15 +94,6 @@ from langchain_huggingface import HuggingFaceEmbeddings
|
|||||||
|
|
||||||
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
|
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also leave the `model_name` blank to use the default [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) model.
|
|
||||||
|
|
||||||
```python
|
|
||||||
from langchain_huggingface import HuggingFaceEmbeddings
|
|
||||||
|
|
||||||
embeddings_model = HuggingFaceEmbeddings()
|
|
||||||
```
|
|
||||||
|
|
||||||
</TabItem>
|
</TabItem>
|
||||||
</Tabs>
|
</Tabs>
|
||||||
|
|
||||||
|
@ -54,7 +54,7 @@ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
|
|||||||
|
|
||||||
### HuggingFaceBgeEmbeddings
|
### HuggingFaceBgeEmbeddings
|
||||||
|
|
||||||
>[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en) are [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).
|
>[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en-v1.5) are one of [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).
|
||||||
>BGE model is created by the [Beijing Academy of Artificial Intelligence (BAAI)](https://en.wikipedia.org/wiki/Beijing_Academy_of_Artificial_Intelligence). `BAAI` is a private non-profit organization engaged in AI research and development.
|
>BGE model is created by the [Beijing Academy of Artificial Intelligence (BAAI)](https://en.wikipedia.org/wiki/Beijing_Academy_of_Artificial_Intelligence). `BAAI` is a private non-profit organization engaged in AI research and development.
|
||||||
|
|
||||||
See a [usage example](/docs/integrations/text_embedding/bge_huggingface).
|
See a [usage example](/docs/integrations/text_embedding/bge_huggingface).
|
||||||
|
@ -44,11 +44,12 @@ from langchain_community.vectorstores.vdms import VDMS_Client
|
|||||||
from langchain_huggingface import HuggingFaceEmbeddings
|
from langchain_huggingface import HuggingFaceEmbeddings
|
||||||
|
|
||||||
client = VDMS_Client("localhost", 55555)
|
client = VDMS_Client("localhost", 55555)
|
||||||
|
model_name = "sentence-transformers/all-mpnet-base-v2"
|
||||||
vectorstore = VDMS.from_documents(
|
vectorstore = VDMS.from_documents(
|
||||||
docs,
|
docs,
|
||||||
client=client,
|
client=client,
|
||||||
collection_name="langchain-demo",
|
collection_name="langchain-demo",
|
||||||
embedding_function=HuggingFaceEmbeddings(),
|
embedding_function=HuggingFaceEmbeddings(model_name=model_name),
|
||||||
engine="FaissFlat"
|
engine="FaissFlat"
|
||||||
distance_strategy="L2",
|
distance_strategy="L2",
|
||||||
)
|
)
|
||||||
@ -58,5 +59,3 @@ results = vectorstore.similarity_search(query)
|
|||||||
```
|
```
|
||||||
|
|
||||||
For a more detailed walkthrough of the VDMS wrapper, see [this notebook](/docs/integrations/vectorstores/vdms)
|
For a more detailed walkthrough of the VDMS wrapper, see [this notebook](/docs/integrations/vectorstores/vdms)
|
||||||
|
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# BGE on Hugging Face\n",
|
"# BGE on Hugging Face\n",
|
||||||
"\n",
|
"\n",
|
||||||
">[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en) are [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).\n",
|
">[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en-v1.5) are one of [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).\n",
|
||||||
">BGE model is created by the [Beijing Academy of Artificial Intelligence (BAAI)](https://en.wikipedia.org/wiki/Beijing_Academy_of_Artificial_Intelligence). `BAAI` is a private non-profit organization engaged in AI research and development.\n",
|
">BGE model is created by the [Beijing Academy of Artificial Intelligence (BAAI)](https://en.wikipedia.org/wiki/Beijing_Academy_of_Artificial_Intelligence). `BAAI` is a private non-profit organization engaged in AI research and development.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"This notebook shows how to use `BGE Embeddings` through `Hugging Face`"
|
"This notebook shows how to use `BGE Embeddings` through `Hugging Face`"
|
||||||
|
@ -36,7 +36,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"embeddings = HuggingFaceEmbeddings()"
|
"embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -57,7 +57,8 @@
|
|||||||
"from langchain_community.vectorstores import Annoy\n",
|
"from langchain_community.vectorstores import Annoy\n",
|
||||||
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
||||||
"\n",
|
"\n",
|
||||||
"embeddings_func = HuggingFaceEmbeddings()"
|
"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
|
||||||
|
"embeddings_func = HuggingFaceEmbeddings(model_name=model_name)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -61,7 +61,8 @@
|
|||||||
"docs = text_splitter.split_documents(documents)\n",
|
"docs = text_splitter.split_documents(documents)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"embeddings = HuggingFaceEmbeddings()\n",
|
"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
|
||||||
|
"embeddings = HuggingFaceEmbeddings(model_name=model_name)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"db = ScaNN.from_documents(docs, embeddings)\n",
|
"db = ScaNN.from_documents(docs, embeddings)\n",
|
||||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||||
|
@ -45,7 +45,8 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
||||||
"\n",
|
"\n",
|
||||||
"embeddings = HuggingFaceEmbeddings()"
|
"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
|
||||||
|
"embeddings = HuggingFaceEmbeddings(model_name=model_name)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -92,7 +92,8 @@
|
|||||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||||
"docs = text_splitter.split_documents(documents)\n",
|
"docs = text_splitter.split_documents(documents)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"embeddings = HuggingFaceEmbeddings()"
|
"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
|
||||||
|
"embeddings = HuggingFaceEmbeddings(model_name=model_name)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -51,7 +51,8 @@
|
|||||||
"raw_documents = TextLoader(\"../../how_to/state_of_the_union.txt\").load()\n",
|
"raw_documents = TextLoader(\"../../how_to/state_of_the_union.txt\").load()\n",
|
||||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||||
"documents = text_splitter.split_documents(raw_documents)\n",
|
"documents = text_splitter.split_documents(raw_documents)\n",
|
||||||
"embeddings = HuggingFaceEmbeddings()\n",
|
"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
|
||||||
|
"embeddings = HuggingFaceEmbeddings(model_name=model_name)\n",
|
||||||
"db = TileDB.from_documents(\n",
|
"db = TileDB.from_documents(\n",
|
||||||
" documents, embeddings, index_uri=\"/tmp/tiledb_index\", index_type=\"FLAT\"\n",
|
" documents, embeddings, index_uri=\"/tmp/tiledb_index\", index_type=\"FLAT\"\n",
|
||||||
")"
|
")"
|
||||||
|
@ -50,7 +50,8 @@
|
|||||||
"raw_documents = TextLoader(\"state_of_the_union.txt\").load()\n",
|
"raw_documents = TextLoader(\"state_of_the_union.txt\").load()\n",
|
||||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||||
"documents = text_splitter.split_documents(raw_documents)\n",
|
"documents = text_splitter.split_documents(raw_documents)\n",
|
||||||
"embeddings = HuggingFaceEmbeddings()\n",
|
"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
|
||||||
|
"embeddings = HuggingFaceEmbeddings(model_name=model_name)\n",
|
||||||
"db = Vald.from_documents(documents, embeddings, host=\"localhost\", port=8080)"
|
"db = Vald.from_documents(documents, embeddings, host=\"localhost\", port=8080)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -197,7 +198,8 @@
|
|||||||
"raw_documents = TextLoader(\"state_of_the_union.txt\").load()\n",
|
"raw_documents = TextLoader(\"state_of_the_union.txt\").load()\n",
|
||||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||||
"documents = text_splitter.split_documents(raw_documents)\n",
|
"documents = text_splitter.split_documents(raw_documents)\n",
|
||||||
"embeddings = HuggingFaceEmbeddings()\n",
|
"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
|
||||||
|
"embeddings = HuggingFaceEmbeddings(model_name=model_name)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"db = Vald.from_documents(\n",
|
"db = Vald.from_documents(\n",
|
||||||
" documents,\n",
|
" documents,\n",
|
||||||
|
@ -200,7 +200,8 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# create the open-source embedding function\n",
|
"# create the open-source embedding function\n",
|
||||||
"embedding = HuggingFaceEmbeddings()\n",
|
"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
|
||||||
|
"embedding = HuggingFaceEmbeddings(model_name=model_name)\n",
|
||||||
"print(\n",
|
"print(\n",
|
||||||
" f\"# Embedding Dimensions: {len(embedding.embed_query('This is a test document.'))}\"\n",
|
" f\"# Embedding Dimensions: {len(embedding.embed_query('This is a test document.'))}\"\n",
|
||||||
")"
|
")"
|
||||||
|
@ -67,6 +67,19 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
|
|||||||
def __init__(self, **kwargs: Any):
|
def __init__(self, **kwargs: Any):
|
||||||
"""Initialize the sentence_transformer."""
|
"""Initialize the sentence_transformer."""
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
|
if "model_name" not in kwargs:
|
||||||
|
since = "0.2.16"
|
||||||
|
removal = "0.4.0"
|
||||||
|
warn_deprecated(
|
||||||
|
since=since,
|
||||||
|
removal=removal,
|
||||||
|
message=f"Default values for {self.__class__.__name__}.model_name"
|
||||||
|
+ f" were deprecated in LangChain {since} and will be removed in"
|
||||||
|
+ f" {removal}. Explicitly pass a model_name to the"
|
||||||
|
+ f" {self.__class__.__name__} constructor instead.",
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import sentence_transformers
|
import sentence_transformers
|
||||||
|
|
||||||
@ -159,6 +172,19 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
|
|||||||
def __init__(self, **kwargs: Any):
|
def __init__(self, **kwargs: Any):
|
||||||
"""Initialize the sentence_transformer."""
|
"""Initialize the sentence_transformer."""
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
|
if "model_name" not in kwargs:
|
||||||
|
since = "0.2.16"
|
||||||
|
removal = "0.4.0"
|
||||||
|
warn_deprecated(
|
||||||
|
since=since,
|
||||||
|
removal=removal,
|
||||||
|
message=f"Default values for {self.__class__.__name__}.model_name"
|
||||||
|
+ f" were deprecated in LangChain {since} and will be removed in"
|
||||||
|
+ f" {removal}. Explicitly pass a model_name to the"
|
||||||
|
+ f" {self.__class__.__name__} constructor instead.",
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from InstructorEmbedding import INSTRUCTOR
|
from InstructorEmbedding import INSTRUCTOR
|
||||||
|
|
||||||
@ -231,7 +257,7 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):
|
|||||||
|
|
||||||
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
||||||
|
|
||||||
model_name = "BAAI/bge-large-en"
|
model_name = "BAAI/bge-large-en-v1.5"
|
||||||
model_kwargs = {'device': 'cpu'}
|
model_kwargs = {'device': 'cpu'}
|
||||||
encode_kwargs = {'normalize_embeddings': True}
|
encode_kwargs = {'normalize_embeddings': True}
|
||||||
hf = HuggingFaceBgeEmbeddings(
|
hf = HuggingFaceBgeEmbeddings(
|
||||||
@ -279,6 +305,19 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):
|
|||||||
def __init__(self, **kwargs: Any):
|
def __init__(self, **kwargs: Any):
|
||||||
"""Initialize the sentence_transformer."""
|
"""Initialize the sentence_transformer."""
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
|
if "model_name" not in kwargs:
|
||||||
|
since = "0.2.5"
|
||||||
|
removal = "0.4.0"
|
||||||
|
warn_deprecated(
|
||||||
|
since=since,
|
||||||
|
removal=removal,
|
||||||
|
message=f"Default values for {self.__class__.__name__}.model_name"
|
||||||
|
+ f" were deprecated in LangChain {since} and will be removed in"
|
||||||
|
+ f" {removal}. Explicitly pass a model_name to the"
|
||||||
|
+ f" {self.__class__.__name__} constructor instead.",
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import sentence_transformers
|
import sentence_transformers
|
||||||
|
|
||||||
|
@ -303,7 +303,7 @@ class OpenVINOBgeEmbeddings(OpenVINOEmbeddings):
|
|||||||
|
|
||||||
from langchain_community.embeddings import OpenVINOBgeEmbeddings
|
from langchain_community.embeddings import OpenVINOBgeEmbeddings
|
||||||
|
|
||||||
model_name = "BAAI/bge-large-en"
|
model_name = "BAAI/bge-large-en-v1.5"
|
||||||
model_kwargs = {'device': 'CPU'}
|
model_kwargs = {'device': 'CPU'}
|
||||||
encode_kwargs = {'normalize_embeddings': True}
|
encode_kwargs = {'normalize_embeddings': True}
|
||||||
ov = OpenVINOBgeEmbeddings(
|
ov = OpenVINOBgeEmbeddings(
|
||||||
|
@ -41,9 +41,10 @@ class ScaNN(VectorStore):
|
|||||||
from langchain_community.embeddings import HuggingFaceEmbeddings
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
||||||
from langchain_community.vectorstores import ScaNN
|
from langchain_community.vectorstores import ScaNN
|
||||||
|
|
||||||
|
model_name = "sentence-transformers/all-mpnet-base-v2"
|
||||||
db = ScaNN.from_texts(
|
db = ScaNN.from_texts(
|
||||||
['foo', 'bar', 'barz', 'qux'],
|
['foo', 'bar', 'barz', 'qux'],
|
||||||
HuggingFaceEmbeddings())
|
HuggingFaceEmbeddings(model_name=model_name))
|
||||||
db.similarity_search('foo?', k=1)
|
db.similarity_search('foo?', k=1)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -1,12 +1,5 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from typing import (
|
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||||
Any,
|
|
||||||
Dict,
|
|
||||||
Iterable,
|
|
||||||
List,
|
|
||||||
Optional,
|
|
||||||
Tuple,
|
|
||||||
)
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
@ -40,7 +33,8 @@ class SurrealDBStore(VectorStore):
|
|||||||
from langchain_community.vectorstores.surrealdb import SurrealDBStore
|
from langchain_community.vectorstores.surrealdb import SurrealDBStore
|
||||||
from langchain_community.embeddings import HuggingFaceEmbeddings
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
||||||
|
|
||||||
embedding_function = HuggingFaceEmbeddings()
|
model_name = "sentence-transformers/all-mpnet-base-v2"
|
||||||
|
embedding_function = HuggingFaceEmbeddings(model_name=model_name)
|
||||||
dburl = "ws://localhost:8000/rpc"
|
dburl = "ws://localhost:8000/rpc"
|
||||||
ns = "langchain"
|
ns = "langchain"
|
||||||
db = "docstore"
|
db = "docstore"
|
||||||
|
@ -23,10 +23,11 @@ class Vald(VectorStore):
|
|||||||
from langchain_community.embeddings import HuggingFaceEmbeddings
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
||||||
from langchain_community.vectorstores import Vald
|
from langchain_community.vectorstores import Vald
|
||||||
|
|
||||||
|
model_name = "sentence-transformers/all-mpnet-base-v2"
|
||||||
texts = ['foo', 'bar', 'baz']
|
texts = ['foo', 'bar', 'baz']
|
||||||
vald = Vald.from_texts(
|
vald = Vald.from_texts(
|
||||||
texts=texts,
|
texts=texts,
|
||||||
embedding=HuggingFaceEmbeddings(),
|
embedding=HuggingFaceEmbeddings(model_name=model_name),
|
||||||
host="localhost",
|
host="localhost",
|
||||||
port=8080,
|
port=8080,
|
||||||
skip_strict_exist_check=False,
|
skip_strict_exist_check=False,
|
||||||
|
@ -161,9 +161,10 @@ class VDMS(VectorStore):
|
|||||||
from langchain_huggingface import HuggingFaceEmbeddings
|
from langchain_huggingface import HuggingFaceEmbeddings
|
||||||
from langchain_community.vectorstores.vdms import VDMS, VDMS_Client
|
from langchain_community.vectorstores.vdms import VDMS, VDMS_Client
|
||||||
|
|
||||||
|
model_name = "sentence-transformers/all-mpnet-base-v2"
|
||||||
vectorstore = VDMS(
|
vectorstore = VDMS(
|
||||||
client=VDMS_Client("localhost", 55555),
|
client=VDMS_Client("localhost", 55555),
|
||||||
embedding=HuggingFaceEmbeddings(),
|
embedding=HuggingFaceEmbeddings(model_name=model_name),
|
||||||
collection_name="langchain-demo",
|
collection_name="langchain-demo",
|
||||||
distance_strategy="L2",
|
distance_strategy="L2",
|
||||||
engine="FaissFlat",
|
engine="FaissFlat",
|
||||||
|
@ -92,9 +92,10 @@ from langchain.chains.query_constructor.schema import AttributeInfo
|
|||||||
|
|
||||||
from self_query_qdrant.chain import create_chain
|
from self_query_qdrant.chain import create_chain
|
||||||
|
|
||||||
|
model_name = "sentence-transformers/all-mpnet-base-v2"
|
||||||
chain = create_chain(
|
chain = create_chain(
|
||||||
llm=Cohere(),
|
llm=Cohere(),
|
||||||
embeddings=HuggingFaceEmbeddings(),
|
embeddings=HuggingFaceEmbeddings(model_name=model_name),
|
||||||
document_contents="Descriptions of cats, along with their names and breeds.",
|
document_contents="Descriptions of cats, along with their names and breeds.",
|
||||||
metadata_field_info=[
|
metadata_field_info=[
|
||||||
AttributeInfo(name="name", description="Name of the cat", type="string"),
|
AttributeInfo(name="name", description="Name of the cat", type="string"),
|
||||||
@ -112,8 +113,9 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
|
|||||||
|
|
||||||
from self_query_qdrant.chain import initialize
|
from self_query_qdrant.chain import initialize
|
||||||
|
|
||||||
|
model_name = "sentence-transformers/all-mpnet-base-v2"
|
||||||
initialize(
|
initialize(
|
||||||
embeddings=HuggingFaceEmbeddings(),
|
embeddings=HuggingFaceEmbeddings(model_name=model_name),
|
||||||
collection_name="cats",
|
collection_name="cats",
|
||||||
documents=[
|
documents=[
|
||||||
Document(
|
Document(
|
||||||
|
Loading…
Reference in New Issue
Block a user