mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-08 14:31:55 +00:00
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
This commit is contained in:
@@ -0,0 +1,7 @@
|
||||
from langchain_community.vectorstores.docarray.hnsw import DocArrayHnswSearch
|
||||
from langchain_community.vectorstores.docarray.in_memory import DocArrayInMemorySearch
|
||||
|
||||
# Names re-exported as the public API of the docarray vector store package.
__all__ = ["DocArrayHnswSearch", "DocArrayInMemorySearch"]
|
203
libs/community/langchain_community/vectorstores/docarray/base.py
Normal file
203
libs/community/langchain_community/vectorstores/docarray/base.py
Normal file
@@ -0,0 +1,203 @@
|
||||
from abc import ABC
|
||||
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
|
||||
|
||||
import numpy as np
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
from langchain_community.vectorstores.utils import maximal_marginal_relevance
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docarray import BaseDoc
|
||||
from docarray.index.abstract import BaseDocIndex
|
||||
|
||||
|
||||
def _check_docarray_import() -> None:
    """Verify that a supported version of ``docarray`` is installed.

    Raises:
        ImportError: If ``docarray`` cannot be imported, or if the installed
            version is older than 0.32.0. The two cases carry different
            messages so the user knows whether to install or to upgrade.
    """
    # Only the import itself lives in the ``try`` block: the version error
    # raised below is also an ImportError and must not be caught and replaced
    # by the "could not import" message.
    try:
        import docarray
    except ImportError as exc:
        raise ImportError(
            "Could not import docarray python package. "
            'Please install it with `pip install "langchain[docarray]"`.'
        ) from exc

    da_version = docarray.__version__.split(".")
    # Every 0.x release up to and including 0.31 is too old.
    if int(da_version[0]) == 0 and int(da_version[1]) <= 31:
        raise ImportError(
            f"To use the DocArray vector stores, docarray "
            f"version >=0.32.0 is expected, received: {docarray.__version__}. "
            f"To upgrade, please run: `pip install -U docarray`."
        )
|
||||
|
||||
|
||||
class DocArrayIndex(VectorStore, ABC):
    """Base class for `DocArray` based vector stores.

    Pairs a docarray document index with the embedding model used to vectorize
    both the stored texts and the incoming queries.
    """

    def __init__(
        self,
        doc_index: "BaseDocIndex",
        embedding: Embeddings,
    ):
        """Initialize a vector store from DocArray's DocIndex.

        Args:
            doc_index: docarray document index used to store and search documents.
            embedding: Embedding model applied to texts and queries.
        """
        self.doc_index = doc_index
        self.embedding = embedding

    @staticmethod
    def _get_doc_cls(**embeddings_params: Any) -> Type["BaseDoc"]:
        """Get docarray Document class describing the schema of DocIndex.

        Args:
            **embeddings_params: Keyword arguments forwarded to ``Field`` for the
                ``embedding`` attribute of the generated class.

        Returns:
            A ``BaseDoc`` subclass with ``text``, ``embedding`` and ``metadata``
            attributes.
        """
        # docarray is imported inside the function; at module level it is only
        # imported under TYPE_CHECKING.
        from docarray import BaseDoc
        from docarray.typing import NdArray

        class DocArrayDoc(BaseDoc):
            text: Optional[str]
            embedding: Optional[NdArray] = Field(**embeddings_params)
            metadata: Optional[dict]

        return DocArrayDoc

    @property
    def doc_cls(self) -> Type["BaseDoc"]:
        """Document class (schema) of the wrapped index.

        Raises:
            ValueError: If the index has no schema set.
        """
        if self.doc_index._schema is None:
            raise ValueError("doc_index expected to have non-null _schema attribute.")
        return self.doc_index._schema

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed texts and add to the vector store.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        # Materialize once: ``texts`` may be a one-shot iterator, and it is
        # needed both for embedding and for building the documents below.
        text_list = list(texts)
        embeddings = self.embedding.embed_documents(text_list)

        ids: List[str] = []
        for i, (t, e) in enumerate(zip(text_list, embeddings)):
            m = metadatas[i] if metadatas else {}
            doc = self.doc_cls(text=t, embedding=e, metadata=m)
            self.doc_index.index([doc])
            ids.append(str(doc.id))

        return ids

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query, with their scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of ``(Document, score)`` pairs. The score is the value reported
            by the underlying index's ``find`` call.
            NOTE(review): whether a lower or a higher score means "more similar"
            presumably depends on the metric the index was built with - confirm
            against the index in use.
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)  # type: ignore
        docs, scores = self.doc_index.find(query_doc, search_field="embedding", limit=k)

        result = [
            (Document(page_content=doc.text, metadata=doc.metadata), score)
            for doc, score in zip(docs, scores)
        ]
        return result

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.
        """
        results = self.similarity_search_with_score(query, k=k, **kwargs)
        return [doc for doc, _ in results]

    def _similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs and relevance scores, normalized on a scale from 0 to 1.

        0 is dissimilar, 1 is most similar.

        Raises:
            NotImplementedError: Always; this class does not implement it.
        """
        raise NotImplementedError()

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query vector.
        """
        query_doc = self.doc_cls(embedding=embedding)  # type: ignore
        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=k
        ).documents

        result = [
            Document(page_content=doc.text, metadata=doc.metadata) for doc in docs
        ]
        return result

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)  # type: ignore

        # Fetch a wider candidate pool; MMR then narrows it down to ``k``.
        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=fetch_k
        ).documents

        # ``lambda_mult`` must be forwarded explicitly, otherwise the helper
        # falls back to its own default and the caller's value is ignored.
        mmr_selected = maximal_marginal_relevance(
            np.array(query_embedding),
            docs.embedding,
            lambda_mult=lambda_mult,
            k=k,
        )
        results = [
            Document(page_content=docs[idx].text, metadata=docs[idx].metadata)
            for idx in mmr_selected
        ]
        return results
|
109
libs/community/langchain_community/vectorstores/docarray/hnsw.py
Normal file
109
libs/community/langchain_community/vectorstores/docarray/hnsw.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, List, Literal, Optional
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
|
||||
from langchain_community.vectorstores.docarray.base import (
|
||||
DocArrayIndex,
|
||||
_check_docarray_import,
|
||||
)
|
||||
|
||||
|
||||
class DocArrayHnswSearch(DocArrayIndex):
    """Vector store backed by docarray's `HnswLib` document index.

    Requires the ``docarray`` package, version >=0.32.0.
    Install it with `pip install "langchain[docarray]"`.
    """

    @classmethod
    def from_params(
        cls,
        embedding: Embeddings,
        work_dir: str,
        n_dim: int,
        dist_metric: Literal["cosine", "ip", "l2"] = "cosine",
        max_elements: int = 1024,
        index: bool = True,
        ef_construction: int = 200,
        ef: int = 10,
        M: int = 16,
        allow_replace_deleted: bool = True,
        num_threads: int = 1,
        **kwargs: Any,
    ) -> DocArrayHnswSearch:
        """Build an empty DocArrayHnswSearch store from index parameters.

        Args:
            embedding (Embeddings): Embedding function.
            work_dir (str): Path to the location where all the data will be stored.
            n_dim (int): Dimension of an embedding.
            dist_metric (str): Distance metric, one of "cosine", "ip" and "l2".
                Defaults to "cosine".
            max_elements (int): Maximum number of vectors that can be stored.
                Defaults to 1024.
            index (bool): Whether an index should be built for this field.
                Defaults to True.
            ef_construction (int): Construction time/accuracy trade-off.
                Defaults to 200.
            ef (int): Query time/accuracy trade-off. Defaults to 10.
            M (int): Maximum number of outgoing connections in the graph.
                Defaults to 16.
            allow_replace_deleted (bool): Enables replacing of deleted elements
                with new added ones. Defaults to True.
            num_threads (int): Number of cpu threads to use. Defaults to 1.
            **kwargs: Other keyword arguments to be passed to the
                ``_get_doc_cls`` method.

        Returns:
            A store wrapping a new ``HnswDocumentIndex`` located at ``work_dir``.
        """
        _check_docarray_import()
        from docarray.index import HnswDocumentIndex

        # Settings attached to the ``embedding`` field of the document schema.
        hnsw_field_options = {
            "dim": n_dim,
            "space": dist_metric,
            "max_elements": max_elements,
            "index": index,
            "ef_construction": ef_construction,
            "ef": ef,
            "M": M,
            "allow_replace_deleted": allow_replace_deleted,
            "num_threads": num_threads,
        }
        # Both mappings are unpacked in the call itself, so a key repeated in
        # ``kwargs`` is still rejected as a duplicate keyword argument.
        schema = cls._get_doc_cls(**hnsw_field_options, **kwargs)
        hnsw_index = HnswDocumentIndex[schema](work_dir=work_dir)  # type: ignore
        return cls(hnsw_index, embedding)

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        work_dir: Optional[str] = None,
        n_dim: Optional[int] = None,
        **kwargs: Any,
    ) -> DocArrayHnswSearch:
        """Create a DocArrayHnswSearch store and insert data.

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
                Defaults to None.
            work_dir (str): Path to the location where all the data will be stored.
                Required despite the ``None`` default.
            n_dim (int): Dimension of an embedding. Required despite the ``None``
                default.
            **kwargs: Other keyword arguments to be passed to ``from_params``.

        Returns:
            DocArrayHnswSearch Vector Store

        Raises:
            ValueError: If ``work_dir`` or ``n_dim`` is not provided.
        """
        # ``work_dir`` is validated before ``n_dim``, so it is the one reported
        # when both are missing.
        if work_dir is None:
            raise ValueError("`work_dir` parameter has not been set.")
        if n_dim is None:
            raise ValueError("`n_dim` parameter has not been set.")

        vector_store = cls.from_params(embedding, work_dir, n_dim, **kwargs)
        vector_store.add_texts(texts=texts, metadatas=metadatas)
        return vector_store
|
@@ -0,0 +1,70 @@
|
||||
"""Wrapper around in-memory storage."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Literal, Optional
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
|
||||
from langchain_community.vectorstores.docarray.base import (
|
||||
DocArrayIndex,
|
||||
_check_docarray_import,
|
||||
)
|
||||
|
||||
|
||||
class DocArrayInMemorySearch(DocArrayIndex):
    """In-memory `DocArray` storage for exact search.

    To use it, you should have the ``docarray`` package with version >=0.32.0 installed.
    You can install it with `pip install "langchain[docarray]"`.
    """

    @classmethod
    def from_params(
        cls,
        embedding: Embeddings,
        # The literal values were previously misspelled ("euclidian_dist",
        # "sgeuclidean_dist"); they now match the names documented below.
        metric: Literal[
            "cosine_sim", "euclidean_dist", "sqeuclidean_dist"
        ] = "cosine_sim",
        **kwargs: Any,
    ) -> DocArrayInMemorySearch:
        """Initialize DocArrayInMemorySearch store.

        Args:
            embedding (Embeddings): Embedding function.
            metric (str): metric for exact nearest-neighbor search.
                Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
                Defaults to "cosine_sim".
            **kwargs: Other keyword arguments to be passed to the
                ``_get_doc_cls`` method.

        Returns:
            A store wrapping a new, empty ``InMemoryExactNNIndex``.
        """
        _check_docarray_import()
        from docarray.index import InMemoryExactNNIndex

        # The metric is attached to the ``embedding`` field as its ``space``.
        doc_cls = cls._get_doc_cls(space=metric, **kwargs)
        doc_index = InMemoryExactNNIndex[doc_cls]()  # type: ignore
        return cls(doc_index, embedding)

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        **kwargs: Any,
    ) -> DocArrayInMemorySearch:
        """Create a DocArrayInMemorySearch store and insert data.

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[Dict[Any, Any]]]): Metadata for each text
                if it exists. Defaults to None.
            **kwargs: Forwarded to ``from_params``; this is how ``metric`` is
                supplied. ``metric`` can be one of: "cosine_sim",
                "euclidean_dist" and "sqeuclidean_dist", and defaults to
                "cosine_sim".

        Returns:
            DocArrayInMemorySearch Vector Store
        """
        store = cls.from_params(embedding, **kwargs)
        store.add_texts(texts=texts, metadatas=metadatas)
        return store
|
Reference in New Issue
Block a user