Mirror of https://github.com/hwchase17/langchain.git, synced 2025-09-03 20:16:52 +00:00
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to the new package langchain-community in a backwards-compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to core:

```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes.
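Because the move is backwards compatible, existing `langchain` imports keep resolving while `langchain_community` becomes the canonical home. A minimal sketch of what that means for user code, using the retriever added in this diff as the example (assumes `langchain-community` is installed):

```
# Old path: still works via backwards-compatible re-exports in `langchain`.
from langchain.retrievers import DocArrayRetriever

# New canonical path after the split (same class; this import shadows the one above).
from langchain_community.retrievers import DocArrayRetriever
```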
libs/community/langchain_community/retrievers/docarray.py — 207 additions (new file)
@@ -0,0 +1,207 @@
```
from enum import Enum
from typing import Any, Dict, List, Optional, Union

import numpy as np
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.retrievers import BaseRetriever

from langchain_community.vectorstores.utils import maximal_marginal_relevance


class SearchType(str, Enum):
    """Enumerator of the types of search to perform."""

    similarity = "similarity"
    mmr = "mmr"


class DocArrayRetriever(BaseRetriever):
    """`DocArray Document Indices` retriever.

    Currently, it supports 5 backends:
    InMemoryExactNNIndex, HnswDocumentIndex, QdrantDocumentIndex,
    ElasticDocIndex, and WeaviateDocumentIndex.

    Args:
        index: One of the above-mentioned index instances
        embeddings: Embedding model to represent text as vectors
        search_field: Field to consider for searching in the documents.
            Should be an embedding/vector/tensor.
        content_field: Field that represents the main content in your document schema.
            Will be used as a `page_content`. Everything else will go into `metadata`.
        search_type: Type of search to perform (similarity / mmr)
        filters: Filters applied for document retrieval.
        top_k: Number of documents to return
    """

    index: Any
    embeddings: Embeddings
    search_field: str
    content_field: str
    search_type: SearchType = SearchType.similarity
    top_k: int = 1
    filters: Optional[Any] = None

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
    ) -> List[Document]:
        """Get documents relevant for a query.

        Args:
            query: string to find relevant documents for

        Returns:
            List of relevant documents
        """
        query_emb = np.array(self.embeddings.embed_query(query))

        if self.search_type == SearchType.similarity:
            results = self._similarity_search(query_emb)
        elif self.search_type == SearchType.mmr:
            results = self._mmr_search(query_emb)
        else:
            raise ValueError(
                f"Search type {self.search_type} does not exist. "
                f"Choose either 'similarity' or 'mmr'."
            )

        return results

    def _search(
        self, query_emb: np.ndarray, top_k: int
    ) -> List[Union[Dict[str, Any], Any]]:
        """
        Perform a search using the query embedding and return top_k documents.

        Args:
            query_emb: Query represented as an embedding
            top_k: Number of documents to return

        Returns:
            A list of top_k documents matching the query
        """

        from docarray.index import ElasticDocIndex, WeaviateDocumentIndex

        filter_args = {}
        search_field = self.search_field
        if isinstance(self.index, WeaviateDocumentIndex):
            filter_args["where_filter"] = self.filters
            search_field = ""
        elif isinstance(self.index, ElasticDocIndex):
            filter_args["query"] = self.filters
        else:
            filter_args["filter_query"] = self.filters

        if self.filters:
            query = (
                self.index.build_query()  # get empty query object
                .find(
                    query=query_emb, search_field=search_field
                )  # add vector similarity search
                .filter(**filter_args)  # add filter search
                .build(limit=top_k)  # build the query
            )
            # execute the combined query and return the results
            docs = self.index.execute_query(query)
            if hasattr(docs, "documents"):
                docs = docs.documents
            docs = docs[:top_k]
        else:
            docs = self.index.find(
                query=query_emb, search_field=search_field, limit=top_k
            ).documents
        return docs

    def _similarity_search(self, query_emb: np.ndarray) -> List[Document]:
        """
        Perform a similarity search.

        Args:
            query_emb: Query represented as an embedding

        Returns:
            A list of documents most similar to the query
        """
        docs = self._search(query_emb=query_emb, top_k=self.top_k)
        results = [self._docarray_to_langchain_doc(doc) for doc in docs]
        return results

    def _mmr_search(self, query_emb: np.ndarray) -> List[Document]:
        """
        Perform a maximal marginal relevance (mmr) search.

        Args:
            query_emb: Query represented as an embedding

        Returns:
            A list of diverse documents related to the query
        """
        docs = self._search(query_emb=query_emb, top_k=20)

        mmr_selected = maximal_marginal_relevance(
            query_emb,
            [
                doc[self.search_field]
                if isinstance(doc, dict)
                else getattr(doc, self.search_field)
                for doc in docs
            ],
            k=self.top_k,
        )
        results = [self._docarray_to_langchain_doc(docs[idx]) for idx in mmr_selected]
        return results

    def _docarray_to_langchain_doc(self, doc: Union[Dict[str, Any], Any]) -> Document:
        """
        Convert a DocArray document (which also might be a dict)
        to a langchain document format.

        DocArray document can contain arbitrary fields, so the mapping is done
        in the following way:

        page_content <-> content_field
        metadata <-> all other fields excluding
            tensors and embeddings (so float, int, string)

        Args:
            doc: DocArray document

        Returns:
            Document in langchain format

        Raises:
            ValueError: If the document doesn't contain the content field
        """

        fields = doc.keys() if isinstance(doc, dict) else doc.__fields__

        if self.content_field not in fields:
            raise ValueError(
                f"Document does not contain the content field - {self.content_field}."
            )
        lc_doc = Document(
            page_content=doc[self.content_field]
            if isinstance(doc, dict)
            else getattr(doc, self.content_field)
        )

        for name in fields:
            value = doc[name] if isinstance(doc, dict) else getattr(doc, name)
            if (
                isinstance(value, (str, int, float, bool))
                and name != self.content_field
            ):
                lc_doc.metadata[name] = value

        return lc_doc
```
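For context, a minimal end-to-end sketch of how this retriever is used, assuming `docarray` is installed; the document schema, field names, and the `FakeEmbeddings` stand-in below are illustrative, not part of this diff:

```
from docarray import BaseDoc, DocList
from docarray.index import InMemoryExactNNIndex
from docarray.typing import NdArray

from langchain_community.embeddings import FakeEmbeddings
from langchain_community.retrievers import DocArrayRetriever

embeddings = FakeEmbeddings(size=32)


class MyDoc(BaseDoc):
    title: str
    title_embedding: NdArray[32]


# Build an in-memory index over a few toy documents.
index = InMemoryExactNNIndex[MyDoc]()
index.index(
    DocList[MyDoc](
        MyDoc(title=title, title_embedding=embeddings.embed_query(title))
        for title in ["about retrievers", "about embeddings", "about indexes"]
    )
)

retriever = DocArrayRetriever(
    index=index,
    embeddings=embeddings,
    search_field="title_embedding",  # vector field searched by the backend
    content_field="title",  # becomes Document.page_content
    top_k=2,
)
docs = retriever.get_relevant_documents("how do retrievers work?")
```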
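One design note worth calling out: `_mmr_search` over-fetches a hard-coded pool of 20 candidates from the index, then re-ranks them with `maximal_marginal_relevance`, which returns the indices of the `top_k` candidates that balance query relevance against diversity. A standalone sketch of that selection step with made-up vectors:

```
import numpy as np

from langchain_community.vectorstores.utils import maximal_marginal_relevance

rng = np.random.default_rng(0)
query_emb = rng.random(32)
candidate_embs = [rng.random(32) for _ in range(20)]  # the over-fetched pool

# Indices into candidate_embs of the 4 picks balancing relevance and diversity.
selected = maximal_marginal_relevance(query_emb, candidate_embs, k=4)
```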