From 0a752a74cccb8cde99278397a54a90bf024a5cb1 Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Sat, 31 Aug 2024 02:42:00 +0200 Subject: [PATCH] community[patch], docs: Add API reference doc for GraphVectorStore (#25751) --- .../graph_vectorstores/__init__.py | 162 +++++++++++++++++- .../graph_vectorstores/base.py | 7 + .../graph_vectorstores/links.py | 8 + .../langchain_core/graph_vectorstores/base.py | 41 +++-- .../graph_vectorstores/links.py | 2 + 5 files changed, 204 insertions(+), 16 deletions(-) create mode 100644 libs/community/langchain_community/graph_vectorstores/base.py create mode 100644 libs/community/langchain_community/graph_vectorstores/links.py diff --git a/libs/community/langchain_community/graph_vectorstores/__init__.py b/libs/community/langchain_community/graph_vectorstores/__init__.py index f5281743f71..485123a96b0 100644 --- a/libs/community/langchain_community/graph_vectorstores/__init__.py +++ b/libs/community/langchain_community/graph_vectorstores/__init__.py @@ -1,3 +1,161 @@ -from langchain_community.graph_vectorstores.cassandra import CassandraGraphVectorStore +"""**Graph Vector Store** -__all__ = ["CassandraGraphVectorStore"] +Sometimes embedding models don’t capture all the important relationships between +documents. +Graph Vector Stores are an extension to both vector stores and retrievers that allow +documents to be explicitly connected to each other. + +Graph vector store retrievers use both vector similarity and links to find documents +related to an unstructured query. + +Graphs allow linking between documents. +Each document identifies tags that link to and from it. +For example, a paragraph of text may be linked to URLs based on the anchor tags in +its content and linked from the URL(s) it is published at. + +Link extractors can be used to extract links from documents. + +Example: + +..
code-block:: python + + graph_vector_store = CassandraGraphVectorStore() + link_extractor = HtmlLinkExtractor() + links = link_extractor.extract_one(HtmlInput(document.page_content, "http://mysite")) + add_links(document, links) + graph_vector_store.add_document(document) + +*********** +Get started +*********** + +We chunk the State of the Union text and split it into documents. + +.. code-block:: python + + from langchain_community.document_loaders import TextLoader + from langchain_text_splitters import CharacterTextSplitter + + raw_documents = TextLoader("state_of_the_union.txt").load() + text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) + documents = text_splitter.split_documents(raw_documents) + +Links can be added to documents manually but it's easier to use a +:class:`~langchain_community.graph_vectorstores.extractors.LinkExtractor`. +Several common link extractors are available and you can build your own. +For this guide, we'll use the +:class:`~langchain_community.graph_vectorstores.extractors.KeybertLinkExtractor` +which uses the KeyBERT model to tag documents with keywords and uses these keywords to +create links between documents. + +.. code-block:: python + + from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor + from langchain_community.graph_vectorstores.links import add_links + + extractor = KeybertLinkExtractor() + + for doc in documents: + add_links(doc, extractor.extract_one(doc)) + +*********************************************** +Create the graph vector store and add documents +*********************************************** + +We'll use an Apache Cassandra or Astra DB database as an example. +We create a :class:`~langchain_community.graph_vectorstores.CassandraGraphVectorStore` +from the documents and an :class:`~langchain_openai.OpenAIEmbeddings` model. + +.. 
code-block:: python + + import cassio + from langchain_community.graph_vectorstores import CassandraGraphVectorStore + from langchain_openai import OpenAIEmbeddings + + # Initialize cassio and the Cassandra session from the environment variables + cassio.init(auto=True) + + store = CassandraGraphVectorStore.from_documents( + embedding=OpenAIEmbeddings(), + documents=documents, + ) + +***************** +Similarity search +***************** + +If we don't traverse the graph, a graph vector store behaves like a regular vector +store. +So all methods available in a vector store are also available in a graph vector store. +The :meth:`~langchain_community.graph_vectorstores.base.GraphVectorStore.similarity_search` +method returns documents similar to a query without considering +the links between documents. + +.. code-block:: python + + docs = store.similarity_search( + "What did the president say about Ketanji Brown Jackson?" + ) + +**************** +Traversal search +**************** + +The :meth:`~langchain_community.graph_vectorstores.base.GraphVectorStore.traversal_search` +method returns documents similar to a query considering the links +between documents. It first does a similarity search and then traverses the graph to +find linked documents. + +.. code-block:: python + + docs = list( + store.traversal_search("What did the president say about Ketanji Brown Jackson?") + ) + +************* +Async methods +************* + +The graph vector store has async versions of the methods prefixed with ``a``. + +.. code-block:: python + + docs = [ + doc + async for doc in store.atraversal_search( + "What did the president say about Ketanji Brown Jackson?" + ) + ] + +**************************** +Graph vector store retriever +**************************** + +The graph vector store can be converted to a retriever. +It is similar to the vector store retriever but it also has traversal search methods +such as ``traversal`` and ``mmr_traversal``. + +.. 
code-block:: python + + retriever = store.as_retriever(search_type="mmr_traversal") + docs = retriever.invoke("What did the president say about Ketanji Brown Jackson?") + +""" # noqa: E501 + +from langchain_community.graph_vectorstores.base import ( + GraphVectorStore, + GraphVectorStoreRetriever, + Node, +) +from langchain_community.graph_vectorstores.cassandra import CassandraGraphVectorStore +from langchain_community.graph_vectorstores.links import ( + Link, +) + +__all__ = [ + "GraphVectorStore", + "GraphVectorStoreRetriever", + "Node", + "Link", + "CassandraGraphVectorStore", +] diff --git a/libs/community/langchain_community/graph_vectorstores/base.py b/libs/community/langchain_community/graph_vectorstores/base.py new file mode 100644 index 00000000000..a00eb79271a --- /dev/null +++ b/libs/community/langchain_community/graph_vectorstores/base.py @@ -0,0 +1,7 @@ +from langchain_core.graph_vectorstores.base import ( + GraphVectorStore, + GraphVectorStoreRetriever, + Node, +) + +__all__ = ["GraphVectorStore", "GraphVectorStoreRetriever", "Node"] diff --git a/libs/community/langchain_community/graph_vectorstores/links.py b/libs/community/langchain_community/graph_vectorstores/links.py new file mode 100644 index 00000000000..921a2be8f98 --- /dev/null +++ b/libs/community/langchain_community/graph_vectorstores/links.py @@ -0,0 +1,8 @@ +from langchain_core.graph_vectorstores.links import ( + Link, + add_links, + copy_with_links, + get_links, +) + +__all__ = ["Link", "add_links", "get_links", "copy_with_links"] diff --git a/libs/core/langchain_core/graph_vectorstores/base.py b/libs/core/langchain_core/graph_vectorstores/base.py index 6d0eff91819..adc9064cb48 100644 --- a/libs/core/langchain_core/graph_vectorstores/base.py +++ b/libs/core/langchain_core/graph_vectorstores/base.py @@ -38,10 +38,11 @@ class Node(Serializable): Edges exist from nodes with an outgoing link to nodes with a matching incoming link. 
- For instance two nodes `a` and `b` connected over a hyperlink `https://some-url` + For instance two nodes `a` and `b` connected over a hyperlink ``https://some-url`` would look like: .. code-block:: python + [ Node( id="a", @@ -118,6 +119,13 @@ def _documents_to_nodes(documents: Iterable[Document]) -> Iterator[Node]: @beta() def nodes_to_documents(nodes: Iterable[Node]) -> Iterator[Document]: + """Convert nodes to documents. + + Args: + nodes: The nodes to convert to documents. + Returns: + The documents generated from the nodes. + """ for node in nodes: metadata = node.metadata.copy() metadata[METADATA_LINKS_KEY] = [ @@ -594,19 +602,24 @@ class GraphVectorStore(VectorStore): """Return GraphVectorStoreRetriever initialized from this GraphVectorStore. Args: - search_type (Optional[str]): Defines the type of search that - the Retriever should perform. - Can be "traversal" (default), "similarity", "mmr", or - "similarity_score_threshold". - search_kwargs (Optional[Dict]): Keyword arguments to pass to the - search function. Can include things like: - k: Amount of documents to return (Default: 4) - depth: The maximum depth of edges to traverse (Default: 1) - score_threshold: Minimum relevance threshold - for similarity_score_threshold - fetch_k: Amount of documents to pass to MMR algorithm (Default: 20) - lambda_mult: Diversity of results returned by MMR; - 1 for minimum diversity and 0 for maximum. (Default: 0.5) + **kwargs: Keyword arguments to pass to the search function. + Can include: + + - search_type (Optional[str]): Defines the type of search that + the Retriever should perform. + Can be ``traversal`` (default), ``similarity``, ``mmr``, or + ``similarity_score_threshold``. + - search_kwargs (Optional[Dict]): Keyword arguments to pass to the + search function. Can include things like: + + - k(int): Amount of documents to return (Default: 4). + - depth(int): The maximum depth of edges to traverse (Default: 1). 
+ - score_threshold(float): Minimum relevance threshold + for similarity_score_threshold. + - fetch_k(int): Amount of documents to pass to MMR algorithm + (Default: 20). + - lambda_mult(float): Diversity of results returned by MMR; + 1 for minimum diversity and 0 for maximum (Default: 0.5). Returns: Retriever for this GraphVectorStore. diff --git a/libs/core/langchain_core/graph_vectorstores/links.py b/libs/core/langchain_core/graph_vectorstores/links.py index 1e2353b0b4b..7464eb4aa95 100644 --- a/libs/core/langchain_core/graph_vectorstores/links.py +++ b/libs/core/langchain_core/graph_vectorstores/links.py @@ -43,6 +43,7 @@ METADATA_LINKS_KEY = "links" @beta() def get_links(doc: Document) -> List[Link]: """Get the links from a document. + Args: doc: The document to get the link tags from. Returns: @@ -60,6 +61,7 @@ def get_links(doc: Document) -> List[Link]: @beta() def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None: """Add links to the given metadata. + Args: doc: The document to add the links to. *links: The links to add to the document.