diff --git a/docs/docs/how_to/embed_text.mdx b/docs/docs/how_to/embed_text.mdx index 0cf48c24ec7..842259d2806 100644 --- a/docs/docs/how_to/embed_text.mdx +++ b/docs/docs/how_to/embed_text.mdx @@ -8,7 +8,7 @@ The Embeddings class is a class designed for interfacing with text embedding mod Embeddings create a vector representation of a piece of text. This is useful because it means we can think about text in the vector space, and do things like semantic search where we look for pieces of text that are most similar in the vector space. -The base Embeddings class in LangChain provides two methods: one for embedding documents and one for embedding a query. The former, `.embed_documents`, takes as input multiple texts, while the latter, `.embed_query`, takes a single text. The reason for having these as two separate methods is that some embedding providers have different embedding methods for documents (to be searched over) vs queries (the search query itself). +The base Embeddings class in LangChain provides two methods: one for embedding documents and one for embedding a query. The former, `.embed_documents`, takes as input multiple texts, while the latter, `.embed_query`, takes a single text. The reason for having these as two separate methods is that some embedding providers have different embedding methods for documents (to be searched over) vs queries (the search query itself). `.embed_query` will return a list of floats, whereas `.embed_documents` returns a list of lists of floats. ## Get started @@ -94,15 +94,6 @@ from langchain_huggingface import HuggingFaceEmbeddings embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") ``` - -You can also leave the `model_name` blank to use the default [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) model. 
- -```python -from langchain_huggingface import HuggingFaceEmbeddings - -embeddings_model = HuggingFaceEmbeddings() -``` - diff --git a/docs/docs/integrations/platforms/huggingface.mdx b/docs/docs/integrations/platforms/huggingface.mdx index 0ddfaa6d58e..b558f8947fc 100644 --- a/docs/docs/integrations/platforms/huggingface.mdx +++ b/docs/docs/integrations/platforms/huggingface.mdx @@ -54,7 +54,7 @@ from langchain_community.embeddings import HuggingFaceInstructEmbeddings ### HuggingFaceBgeEmbeddings ->[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en) are [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard). +>[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en-v1.5) are among [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard). >BGE model is created by the [Beijing Academy of Artificial Intelligence (BAAI)](https://en.wikipedia.org/wiki/Beijing_Academy_of_Artificial_Intelligence). `BAAI` is a private non-profit organization engaged in AI research and development. See a [usage example](/docs/integrations/text_embedding/bge_huggingface). @@ -86,10 +86,10 @@ from langchain_community.embeddings import HuggingFaceHubEmbeddings ### Hugging Face dataset ->[Hugging Face Hub](https://huggingface.co/docs/hub/index) is home to over 75,000 -> [datasets](https://huggingface.co/docs/hub/index#datasets) in more than 100 languages +>[Hugging Face Hub](https://huggingface.co/docs/hub/index) is home to over 75,000 +> [datasets](https://huggingface.co/docs/hub/index#datasets) in more than 100 languages > that can be used for a broad range of tasks across NLP, Computer Vision, and Audio. -> They used for a diverse range of tasks such as translation, automatic speech +> They are used for a diverse range of tasks such as translation, automatic speech > recognition, and image classification. We need to install `datasets` python package. 
@@ -110,7 +110,7 @@ from langchain_community.document_loaders.hugging_face_dataset import HuggingFac ### Hugging Face Hub Tools ->[Hugging Face Tools](https://huggingface.co/docs/transformers/v4.29.0/en/custom_tools) +>[Hugging Face Tools](https://huggingface.co/docs/transformers/v4.29.0/en/custom_tools) > support text I/O and are loaded using the `load_huggingface_tool` function. We need to install several python packages. diff --git a/docs/docs/integrations/providers/vdms.mdx b/docs/docs/integrations/providers/vdms.mdx index f2480e3383b..2ed0ea4455d 100644 --- a/docs/docs/integrations/providers/vdms.mdx +++ b/docs/docs/integrations/providers/vdms.mdx @@ -44,11 +44,12 @@ from langchain_community.vectorstores.vdms import VDMS_Client from langchain_huggingface import HuggingFaceEmbeddings client = VDMS_Client("localhost", 55555) +model_name = "sentence-transformers/all-mpnet-base-v2" vectorstore = VDMS.from_documents( docs, client=client, collection_name="langchain-demo", - embedding_function=HuggingFaceEmbeddings(), + embedding_function=HuggingFaceEmbeddings(model_name=model_name), engine="FaissFlat" distance_strategy="L2", ) @@ -58,5 +59,3 @@ results = vectorstore.similarity_search(query) ``` For a more detailed walkthrough of the VDMS wrapper, see [this notebook](/docs/integrations/vectorstores/vdms) - - diff --git a/docs/docs/integrations/text_embedding/bge_huggingface.ipynb b/docs/docs/integrations/text_embedding/bge_huggingface.ipynb index dfc6de33443..11e66d6d56d 100644 --- a/docs/docs/integrations/text_embedding/bge_huggingface.ipynb +++ b/docs/docs/integrations/text_embedding/bge_huggingface.ipynb @@ -7,7 +7,7 @@ "source": [ "# BGE on Hugging Face\n", "\n", - ">[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en) are [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).\n", + ">[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en-v1.5) are among [the best open-source embedding 
models](https://huggingface.co/spaces/mteb/leaderboard).\n", ">BGE model is created by the [Beijing Academy of Artificial Intelligence (BAAI)](https://en.wikipedia.org/wiki/Beijing_Academy_of_Artificial_Intelligence). `BAAI` is a private non-profit organization engaged in AI research and development.\n", "\n", "This notebook shows how to use `BGE Embeddings` through `Hugging Face`" diff --git a/docs/docs/integrations/text_embedding/huggingfacehub.ipynb b/docs/docs/integrations/text_embedding/huggingfacehub.ipynb index 6575a4ee30f..48f20fbd8b4 100644 --- a/docs/docs/integrations/text_embedding/huggingfacehub.ipynb +++ b/docs/docs/integrations/text_embedding/huggingfacehub.ipynb @@ -36,7 +36,7 @@ "metadata": {}, "outputs": [], "source": [ - "embeddings = HuggingFaceEmbeddings()" + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" ] }, { diff --git a/docs/docs/integrations/vectorstores/annoy.ipynb b/docs/docs/integrations/vectorstores/annoy.ipynb index 98bc14995f7..eff415a0512 100644 --- a/docs/docs/integrations/vectorstores/annoy.ipynb +++ b/docs/docs/integrations/vectorstores/annoy.ipynb @@ -57,7 +57,8 @@ "from langchain_community.vectorstores import Annoy\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", "\n", - "embeddings_func = HuggingFaceEmbeddings()" + "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", + "embeddings_func = HuggingFaceEmbeddings(model_name=model_name)" ] }, { diff --git a/docs/docs/integrations/vectorstores/scann.ipynb b/docs/docs/integrations/vectorstores/scann.ipynb index 3b56fab3ba4..c2c5ae4e896 100644 --- a/docs/docs/integrations/vectorstores/scann.ipynb +++ b/docs/docs/integrations/vectorstores/scann.ipynb @@ -61,7 +61,8 @@ "docs = text_splitter.split_documents(documents)\n", "\n", "\n", - "embeddings = HuggingFaceEmbeddings()\n", + "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", + "embeddings = HuggingFaceEmbeddings(model_name=model_name)\n", "\n", "db = 
ScaNN.from_documents(docs, embeddings)\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", diff --git a/docs/docs/integrations/vectorstores/semadb.ipynb b/docs/docs/integrations/vectorstores/semadb.ipynb index 65eca6cebae..b1f1b573780 100644 --- a/docs/docs/integrations/vectorstores/semadb.ipynb +++ b/docs/docs/integrations/vectorstores/semadb.ipynb @@ -45,7 +45,8 @@ "source": [ "from langchain_huggingface import HuggingFaceEmbeddings\n", "\n", - "embeddings = HuggingFaceEmbeddings()" + "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", + "embeddings = HuggingFaceEmbeddings(model_name=model_name)" ] }, { diff --git a/docs/docs/integrations/vectorstores/surrealdb.ipynb b/docs/docs/integrations/vectorstores/surrealdb.ipynb index 422ffed1065..1a1d8f1ebcb 100644 --- a/docs/docs/integrations/vectorstores/surrealdb.ipynb +++ b/docs/docs/integrations/vectorstores/surrealdb.ipynb @@ -92,7 +92,8 @@ "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "docs = text_splitter.split_documents(documents)\n", "\n", - "embeddings = HuggingFaceEmbeddings()" + "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", + "embeddings = HuggingFaceEmbeddings(model_name=model_name)" ] }, { diff --git a/docs/docs/integrations/vectorstores/tiledb.ipynb b/docs/docs/integrations/vectorstores/tiledb.ipynb index 4da8ebd17a1..adb857858da 100644 --- a/docs/docs/integrations/vectorstores/tiledb.ipynb +++ b/docs/docs/integrations/vectorstores/tiledb.ipynb @@ -51,7 +51,8 @@ "raw_documents = TextLoader(\"../../how_to/state_of_the_union.txt\").load()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "documents = text_splitter.split_documents(raw_documents)\n", - "embeddings = HuggingFaceEmbeddings()\n", + "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", + "embeddings = HuggingFaceEmbeddings(model_name=model_name)\n", "db = TileDB.from_documents(\n", " documents, embeddings, 
index_uri=\"/tmp/tiledb_index\", index_type=\"FLAT\"\n", ")" diff --git a/docs/docs/integrations/vectorstores/vald.ipynb b/docs/docs/integrations/vectorstores/vald.ipynb index 4a994886327..b4c3fa2d887 100644 --- a/docs/docs/integrations/vectorstores/vald.ipynb +++ b/docs/docs/integrations/vectorstores/vald.ipynb @@ -50,7 +50,8 @@ "raw_documents = TextLoader(\"state_of_the_union.txt\").load()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "documents = text_splitter.split_documents(raw_documents)\n", - "embeddings = HuggingFaceEmbeddings()\n", + "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", + "embeddings = HuggingFaceEmbeddings(model_name=model_name)\n", "db = Vald.from_documents(documents, embeddings, host=\"localhost\", port=8080)" ] }, @@ -197,7 +198,8 @@ "raw_documents = TextLoader(\"state_of_the_union.txt\").load()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "documents = text_splitter.split_documents(raw_documents)\n", - "embeddings = HuggingFaceEmbeddings()\n", + "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", + "embeddings = HuggingFaceEmbeddings(model_name=model_name)\n", "\n", "db = Vald.from_documents(\n", " documents,\n", diff --git a/docs/docs/integrations/vectorstores/vdms.ipynb b/docs/docs/integrations/vectorstores/vdms.ipynb index 7828ecbce77..dd3adceab35 100644 --- a/docs/docs/integrations/vectorstores/vdms.ipynb +++ b/docs/docs/integrations/vectorstores/vdms.ipynb @@ -200,7 +200,8 @@ "\n", "\n", "# create the open-source embedding function\n", - "embedding = HuggingFaceEmbeddings()\n", + "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", + "embedding = HuggingFaceEmbeddings(model_name=model_name)\n", "print(\n", " f\"# Embedding Dimensions: {len(embedding.embed_query('This is a test document.'))}\"\n", ")" diff --git a/libs/community/langchain_community/embeddings/huggingface.py b/libs/community/langchain_community/embeddings/huggingface.py 
index 2304d28a3c9..ed188c7e61e 100644 --- a/libs/community/langchain_community/embeddings/huggingface.py +++ b/libs/community/langchain_community/embeddings/huggingface.py @@ -67,6 +67,19 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings): def __init__(self, **kwargs: Any): """Initialize the sentence_transformer.""" super().__init__(**kwargs) + + if "model_name" not in kwargs: + since = "0.2.16" + removal = "0.4.0" + warn_deprecated( + since=since, + removal=removal, + message=f"Default values for {self.__class__.__name__}.model_name" + + f" were deprecated in LangChain {since} and will be removed in" + + f" {removal}. Explicitly pass a model_name to the" + + f" {self.__class__.__name__} constructor instead.", + ) + try: import sentence_transformers @@ -159,6 +172,19 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings): def __init__(self, **kwargs: Any): """Initialize the sentence_transformer.""" super().__init__(**kwargs) + + if "model_name" not in kwargs: + since = "0.2.16" + removal = "0.4.0" + warn_deprecated( + since=since, + removal=removal, + message=f"Default values for {self.__class__.__name__}.model_name" + + f" were deprecated in LangChain {since} and will be removed in" + + f" {removal}. 
Explicitly pass a model_name to the" + + f" {self.__class__.__name__} constructor instead.", + ) + try: from InstructorEmbedding import INSTRUCTOR @@ -231,7 +257,7 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings): from langchain_community.embeddings import HuggingFaceBgeEmbeddings - model_name = "BAAI/bge-large-en" + model_name = "BAAI/bge-large-en-v1.5" model_kwargs = {'device': 'cpu'} encode_kwargs = {'normalize_embeddings': True} hf = HuggingFaceBgeEmbeddings( @@ -279,6 +305,19 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings): def __init__(self, **kwargs: Any): """Initialize the sentence_transformer.""" super().__init__(**kwargs) + + if "model_name" not in kwargs: + since = "0.2.5" + removal = "0.4.0" + warn_deprecated( + since=since, + removal=removal, + message=f"Default values for {self.__class__.__name__}.model_name" + + f" were deprecated in LangChain {since} and will be removed in" + + f" {removal}. Explicitly pass a model_name to the" + + f" {self.__class__.__name__} constructor instead.", + ) + try: import sentence_transformers diff --git a/libs/community/langchain_community/embeddings/openvino.py b/libs/community/langchain_community/embeddings/openvino.py index 3256ff7d3d0..f8dfc4077c8 100644 --- a/libs/community/langchain_community/embeddings/openvino.py +++ b/libs/community/langchain_community/embeddings/openvino.py @@ -303,7 +303,7 @@ class OpenVINOBgeEmbeddings(OpenVINOEmbeddings): from langchain_community.embeddings import OpenVINOBgeEmbeddings - model_name = "BAAI/bge-large-en" + model_name = "BAAI/bge-large-en-v1.5" model_kwargs = {'device': 'CPU'} encode_kwargs = {'normalize_embeddings': True} ov = OpenVINOBgeEmbeddings( diff --git a/libs/community/langchain_community/vectorstores/scann.py b/libs/community/langchain_community/vectorstores/scann.py index 917a946cad8..e163ba43cbf 100644 --- a/libs/community/langchain_community/vectorstores/scann.py +++ b/libs/community/langchain_community/vectorstores/scann.py @@ -41,9 +41,10 @@ 
class ScaNN(VectorStore): from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import ScaNN + model_name = "sentence-transformers/all-mpnet-base-v2" db = ScaNN.from_texts( ['foo', 'bar', 'barz', 'qux'], - HuggingFaceEmbeddings()) + HuggingFaceEmbeddings(model_name=model_name)) db.similarity_search('foo?', k=1) """ diff --git a/libs/community/langchain_community/vectorstores/surrealdb.py b/libs/community/langchain_community/vectorstores/surrealdb.py index 60db49130b1..ce05abdc930 100644 --- a/libs/community/langchain_community/vectorstores/surrealdb.py +++ b/libs/community/langchain_community/vectorstores/surrealdb.py @@ -1,12 +1,5 @@ import asyncio -from typing import ( - Any, - Dict, - Iterable, - List, - Optional, - Tuple, -) +from typing import Any, Dict, Iterable, List, Optional, Tuple import numpy as np from langchain_core.documents import Document @@ -40,7 +33,8 @@ class SurrealDBStore(VectorStore): from langchain_community.vectorstores.surrealdb import SurrealDBStore from langchain_community.embeddings import HuggingFaceEmbeddings - embedding_function = HuggingFaceEmbeddings() + model_name = "sentence-transformers/all-mpnet-base-v2" + embedding_function = HuggingFaceEmbeddings(model_name=model_name) dburl = "ws://localhost:8000/rpc" ns = "langchain" db = "docstore" diff --git a/libs/community/langchain_community/vectorstores/vald.py b/libs/community/langchain_community/vectorstores/vald.py index 69e77f17ced..5b2c00a0d98 100644 --- a/libs/community/langchain_community/vectorstores/vald.py +++ b/libs/community/langchain_community/vectorstores/vald.py @@ -23,10 +23,11 @@ class Vald(VectorStore): from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import Vald + model_name = "sentence-transformers/all-mpnet-base-v2" texts = ['foo', 'bar', 'baz'] vald = Vald.from_texts( texts=texts, - embedding=HuggingFaceEmbeddings(), + 
embedding=HuggingFaceEmbeddings(model_name=model_name), host="localhost", port=8080, skip_strict_exist_check=False, diff --git a/libs/community/langchain_community/vectorstores/vdms.py b/libs/community/langchain_community/vectorstores/vdms.py index face7b12157..8f1a2422bd0 100644 --- a/libs/community/langchain_community/vectorstores/vdms.py +++ b/libs/community/langchain_community/vectorstores/vdms.py @@ -161,9 +161,10 @@ class VDMS(VectorStore): from langchain_huggingface import HuggingFaceEmbeddings from langchain_community.vectorstores.vdms import VDMS, VDMS_Client + model_name = "sentence-transformers/all-mpnet-base-v2" vectorstore = VDMS( client=VDMS_Client("localhost", 55555), - embedding=HuggingFaceEmbeddings(), + embedding=HuggingFaceEmbeddings(model_name=model_name), collection_name="langchain-demo", distance_strategy="L2", engine="FaissFlat", diff --git a/templates/self-query-qdrant/README.md b/templates/self-query-qdrant/README.md index a4d7eeaf964..fb67fd88b46 100644 --- a/templates/self-query-qdrant/README.md +++ b/templates/self-query-qdrant/README.md @@ -92,9 +92,10 @@ from langchain.chains.query_constructor.schema import AttributeInfo from self_query_qdrant.chain import create_chain +model_name = "sentence-transformers/all-mpnet-base-v2" chain = create_chain( llm=Cohere(), - embeddings=HuggingFaceEmbeddings(), + embeddings=HuggingFaceEmbeddings(model_name=model_name), document_contents="Descriptions of cats, along with their names and breeds.", metadata_field_info=[ AttributeInfo(name="name", description="Name of the cat", type="string"), @@ -112,8 +113,9 @@ from langchain_community.embeddings import HuggingFaceEmbeddings from self_query_qdrant.chain import initialize +model_name = "sentence-transformers/all-mpnet-base-v2" initialize( - embeddings=HuggingFaceEmbeddings(), + embeddings=HuggingFaceEmbeddings(model_name=model_name), collection_name="cats", documents=[ Document( @@ -145,7 +147,7 @@ langchain serve ### Local Server -This will start the 
FastAPI app with a server running locally at +This will start the FastAPI app with a server running locally at [http://localhost:8000](http://localhost:8000) You can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)