community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)

Moved the following modules to new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to core
```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes
This commit is contained in:
Bagatur
2023-12-11 13:53:30 -08:00
committed by GitHub
parent c0f4b95aa9
commit ed58eeb9c5
2446 changed files with 171805 additions and 137118 deletions

View File

@@ -0,0 +1,7 @@
from langchain_community.vectorstores.docarray.hnsw import DocArrayHnswSearch
from langchain_community.vectorstores.docarray.in_memory import DocArrayInMemorySearch
__all__ = [
"DocArrayHnswSearch",
"DocArrayInMemorySearch",
]

View File

@@ -0,0 +1,203 @@
from abc import ABC
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import Field
from langchain_core.vectorstores import VectorStore
from langchain_community.vectorstores.utils import maximal_marginal_relevance
if TYPE_CHECKING:
from docarray import BaseDoc
from docarray.index.abstract import BaseDocIndex
def _check_docarray_import() -> None:
try:
import docarray
da_version = docarray.__version__.split(".")
if int(da_version[0]) == 0 and int(da_version[1]) <= 31:
raise ImportError(
f"To use the DocArrayHnswSearch VectorStore the docarray "
f"version >=0.32.0 is expected, received: {docarray.__version__}."
f"To upgrade, please run: `pip install -U docarray`."
)
except ImportError:
raise ImportError(
"Could not import docarray python package. "
'Please install it with `pip install "langchain[docarray]"`.'
)
class DocArrayIndex(VectorStore, ABC):
"""Base class for `DocArray` based vector stores."""
def __init__(
self,
doc_index: "BaseDocIndex",
embedding: Embeddings,
):
"""Initialize a vector store from DocArray's DocIndex."""
self.doc_index = doc_index
self.embedding = embedding
@staticmethod
def _get_doc_cls(**embeddings_params: Any) -> Type["BaseDoc"]:
"""Get docarray Document class describing the schema of DocIndex."""
from docarray import BaseDoc
from docarray.typing import NdArray
class DocArrayDoc(BaseDoc):
text: Optional[str]
embedding: Optional[NdArray] = Field(**embeddings_params)
metadata: Optional[dict]
return DocArrayDoc
@property
def doc_cls(self) -> Type["BaseDoc"]:
if self.doc_index._schema is None:
raise ValueError("doc_index expected to have non-null _schema attribute.")
return self.doc_index._schema
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> List[str]:
"""Embed texts and add to the vector store.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
Returns:
List of ids from adding the texts into the vectorstore.
"""
ids: List[str] = []
embeddings = self.embedding.embed_documents(list(texts))
for i, (t, e) in enumerate(zip(texts, embeddings)):
m = metadatas[i] if metadatas else {}
doc = self.doc_cls(text=t, embedding=e, metadata=m)
self.doc_index.index([doc])
ids.append(str(doc.id))
return ids
def similarity_search_with_score(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of documents most similar to the query text and
cosine distance in float for each.
Lower score represents more similarity.
"""
query_embedding = self.embedding.embed_query(query)
query_doc = self.doc_cls(embedding=query_embedding) # type: ignore
docs, scores = self.doc_index.find(query_doc, search_field="embedding", limit=k)
result = [
(Document(page_content=doc.text, metadata=doc.metadata), score)
for doc, score in zip(docs, scores)
]
return result
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of Documents most similar to the query.
"""
results = self.similarity_search_with_score(query, k=k, **kwargs)
return [doc for doc, _ in results]
def _similarity_search_with_relevance_scores(
self,
query: str,
k: int = 4,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs and relevance scores, normalized on a scale from 0 to 1.
0 is dissimilar, 1 is most similar.
"""
raise NotImplementedError()
def similarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of Documents most similar to the query vector.
"""
query_doc = self.doc_cls(embedding=embedding) # type: ignore
docs = self.doc_index.find(
query_doc, search_field="embedding", limit=k
).documents
result = [
Document(page_content=doc.text, metadata=doc.metadata) for doc in docs
]
return result
def max_marginal_relevance_search(
self,
query: str,
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
Returns:
List of Documents selected by maximal marginal relevance.
"""
query_embedding = self.embedding.embed_query(query)
query_doc = self.doc_cls(embedding=query_embedding) # type: ignore
docs = self.doc_index.find(
query_doc, search_field="embedding", limit=fetch_k
).documents
mmr_selected = maximal_marginal_relevance(
np.array(query_embedding), docs.embedding, k=k
)
results = [
Document(page_content=docs[idx].text, metadata=docs[idx].metadata)
for idx in mmr_selected
]
return results

View File

@@ -0,0 +1,109 @@
from __future__ import annotations
from typing import Any, List, Literal, Optional
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores.docarray.base import (
DocArrayIndex,
_check_docarray_import,
)
class DocArrayHnswSearch(DocArrayIndex):
"""`HnswLib` storage using `DocArray` package.
To use it, you should have the ``docarray`` package with version >=0.32.0 installed.
You can install it with `pip install "langchain[docarray]"`.
"""
@classmethod
def from_params(
cls,
embedding: Embeddings,
work_dir: str,
n_dim: int,
dist_metric: Literal["cosine", "ip", "l2"] = "cosine",
max_elements: int = 1024,
index: bool = True,
ef_construction: int = 200,
ef: int = 10,
M: int = 16,
allow_replace_deleted: bool = True,
num_threads: int = 1,
**kwargs: Any,
) -> DocArrayHnswSearch:
"""Initialize DocArrayHnswSearch store.
Args:
embedding (Embeddings): Embedding function.
work_dir (str): path to the location where all the data will be stored.
n_dim (int): dimension of an embedding.
dist_metric (str): Distance metric for DocArrayHnswSearch can be one of:
"cosine", "ip", and "l2". Defaults to "cosine".
max_elements (int): Maximum number of vectors that can be stored.
Defaults to 1024.
index (bool): Whether an index should be built for this field.
Defaults to True.
ef_construction (int): defines a construction time/accuracy trade-off.
Defaults to 200.
ef (int): parameter controlling query time/accuracy trade-off.
Defaults to 10.
M (int): parameter that defines the maximum number of outgoing
connections in the graph. Defaults to 16.
allow_replace_deleted (bool): Enables replacing of deleted elements
with new added ones. Defaults to True.
num_threads (int): Sets the number of cpu threads to use. Defaults to 1.
**kwargs: Other keyword arguments to be passed to the get_doc_cls method.
"""
_check_docarray_import()
from docarray.index import HnswDocumentIndex
doc_cls = cls._get_doc_cls(
dim=n_dim,
space=dist_metric,
max_elements=max_elements,
index=index,
ef_construction=ef_construction,
ef=ef,
M=M,
allow_replace_deleted=allow_replace_deleted,
num_threads=num_threads,
**kwargs,
)
doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) # type: ignore
return cls(doc_index, embedding)
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
work_dir: Optional[str] = None,
n_dim: Optional[int] = None,
**kwargs: Any,
) -> DocArrayHnswSearch:
"""Create an DocArrayHnswSearch store and insert data.
Args:
texts (List[str]): Text data.
embedding (Embeddings): Embedding function.
metadatas (Optional[List[dict]]): Metadata for each text if it exists.
Defaults to None.
work_dir (str): path to the location where all the data will be stored.
n_dim (int): dimension of an embedding.
**kwargs: Other keyword arguments to be passed to the __init__ method.
Returns:
DocArrayHnswSearch Vector Store
"""
if work_dir is None:
raise ValueError("`work_dir` parameter has not been set.")
if n_dim is None:
raise ValueError("`n_dim` parameter has not been set.")
store = cls.from_params(embedding, work_dir, n_dim, **kwargs)
store.add_texts(texts=texts, metadatas=metadatas)
return store

View File

@@ -0,0 +1,70 @@
"""Wrapper around in-memory storage."""
from __future__ import annotations
from typing import Any, Dict, List, Literal, Optional
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores.docarray.base import (
DocArrayIndex,
_check_docarray_import,
)
class DocArrayInMemorySearch(DocArrayIndex):
"""In-memory `DocArray` storage for exact search.
To use it, you should have the ``docarray`` package with version >=0.32.0 installed.
You can install it with `pip install "langchain[docarray]"`.
"""
@classmethod
def from_params(
cls,
embedding: Embeddings,
metric: Literal[
"cosine_sim", "euclidian_dist", "sgeuclidean_dist"
] = "cosine_sim",
**kwargs: Any,
) -> DocArrayInMemorySearch:
"""Initialize DocArrayInMemorySearch store.
Args:
embedding (Embeddings): Embedding function.
metric (str): metric for exact nearest-neighbor search.
Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
Defaults to "cosine_sim".
**kwargs: Other keyword arguments to be passed to the get_doc_cls method.
"""
_check_docarray_import()
from docarray.index import InMemoryExactNNIndex
doc_cls = cls._get_doc_cls(space=metric, **kwargs)
doc_index = InMemoryExactNNIndex[doc_cls]() # type: ignore
return cls(doc_index, embedding)
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[Dict[Any, Any]]] = None,
**kwargs: Any,
) -> DocArrayInMemorySearch:
"""Create an DocArrayInMemorySearch store and insert data.
Args:
texts (List[str]): Text data.
embedding (Embeddings): Embedding function.
metadatas (Optional[List[Dict[Any, Any]]]): Metadata for each text
if it exists. Defaults to None.
metric (str): metric for exact nearest-neighbor search.
Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
Defaults to "cosine_sim".
Returns:
DocArrayInMemorySearch Vector Store
"""
store = cls.from_params(embedding, **kwargs)
store.add_texts(texts=texts, metadatas=metadatas)
return store