Compare commits

...

4 Commits

Author SHA1 Message Date
Eugene Yurtsev
5f8858a490 x 2024-07-11 12:49:55 -04:00
Eugene Yurtsev
7b9c1359fe x 2024-07-11 12:36:26 -04:00
Eugene Yurtsev
a41689b891 x 2024-07-11 12:33:01 -04:00
Eugene Yurtsev
833008976c x 2024-07-11 12:30:40 -04:00
10 changed files with 124 additions and 100 deletions

View File

@@ -1,3 +1,19 @@
from langchain_community.graph_vectorstores.base import (
GraphStoreNode,
GraphVectorStore,
GraphVectorStoreRetriever,
Node,
)
from langchain_community.graph_vectorstores.cassandra import CassandraGraphVectorStore
from langchain_community.graph_vectorstores.links import GraphStoreLink, Link
__all__ = ["CassandraGraphVectorStore"]
__all__ = [
"GraphVectorStore",
"GraphVectorStoreRetriever",
"GraphStoreNode",
"GraphStoreLink",
"Link", # for backward compatibility
"GraphStoreNode",
"Node", # for backward compatibility
"CassandraGraphVectorStore",
]

View File

@@ -17,12 +17,16 @@ from langchain_core.callbacks import (
CallbackManagerForRetrieverRun,
)
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import METADATA_LINKS_KEY, Link
from langchain_core.load import Serializable
from langchain_core.pydantic_v1 import Field
from langchain_core.runnables import run_in_executor
from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
from langchain_community.graph_vectorstores.links import (
METADATA_LINKS_KEY,
GraphStoreLink,
)
def _has_next(iterator: Iterator) -> bool:
"""Checks if the iterator has more elements.
@@ -31,7 +35,7 @@ def _has_next(iterator: Iterator) -> bool:
return next(iterator, sentinel) is not sentinel
class Node(Serializable):
class GraphStoreNode(Serializable):
"""Node in the GraphVectorStore.
Edges exist from nodes with an outgoing link to nodes with a matching incoming link.
@@ -41,18 +45,22 @@ class Node(Serializable):
.. code-block:: python
[
Node(
GraphStoreNode(
id="a",
text="some text a",
links= [
Link(kind="hyperlink", tag="https://some-url", direction="incoming")
GraphStoreLink(
kind="hyperlink", tag="https://some-url", direction="in"
)
],
),
Node(
GraphStoreNode(
id="b",
text="some text b",
links= [
Link(kind="hyperlink", tag="https://some-url", direction="outgoing")
GraphStoreLink(
kind="hyperlink", tag="https://some-url", direction="out"
)
],
)
]
@@ -64,15 +72,18 @@ class Node(Serializable):
"""Text contained by the node."""
metadata: dict = Field(default_factory=dict)
"""Metadata for the node."""
links: List[Link] = Field(default_factory=list)
links: List[GraphStoreLink] = Field(default_factory=list)
"""Links associated with the node."""
Node = GraphStoreNode # Alias for backwards compatibility
def _texts_to_nodes(
texts: Iterable[str],
metadatas: Optional[Iterable[dict]],
ids: Optional[Iterable[str]],
) -> Iterator[Node]:
) -> Iterator[GraphStoreNode]:
metadatas_it = iter(metadatas) if metadatas else None
ids_it = iter(ids) if ids else None
for text in texts:
@@ -88,7 +99,7 @@ def _texts_to_nodes(
links = _metadata.pop(METADATA_LINKS_KEY, [])
if not isinstance(links, list):
links = list(links)
yield Node(
yield GraphStoreNode(
id=_id,
metadata=_metadata,
text=text,
@@ -100,13 +111,13 @@ def _texts_to_nodes(
raise ValueError("metadatas iterable longer than texts")
def _documents_to_nodes(documents: Iterable[Document]) -> Iterator[Node]:
def _documents_to_nodes(documents: Iterable[Document]) -> Iterator[GraphStoreNode]:
for doc in documents:
metadata = doc.metadata.copy()
links = metadata.pop(METADATA_LINKS_KEY, [])
if not isinstance(links, list):
links = list(links)
yield Node(
yield GraphStoreNode(
id=doc.id,
metadata=metadata,
text=doc.page_content,
@@ -114,12 +125,12 @@ def _documents_to_nodes(documents: Iterable[Document]) -> Iterator[Node]:
)
def nodes_to_documents(nodes: Iterable[Node]) -> Iterator[Document]:
def nodes_to_documents(nodes: Iterable[GraphStoreNode]) -> Iterator[Document]:
for node in nodes:
metadata = node.metadata.copy()
metadata[METADATA_LINKS_KEY] = [
# Copy each link on the node into a new link object for the document metadata.
Link(kind=link.kind, direction=link.direction, tag=link.tag)
GraphStoreLink(kind=link.kind, direction=link.direction, tag=link.tag)
for link in node.links
]
@@ -140,7 +151,7 @@ class GraphVectorStore(VectorStore):
@abstractmethod
def add_nodes(
self,
nodes: Iterable[Node],
nodes: Iterable[GraphStoreNode],
**kwargs: Any,
) -> Iterable[str]:
"""Add nodes to the graph store.
@@ -151,7 +162,7 @@ class GraphVectorStore(VectorStore):
async def aadd_nodes(
self,
nodes: Iterable[Node],
nodes: Iterable[GraphStoreNode],
**kwargs: Any,
) -> AsyncIterable[str]:
"""Add nodes to the graph store.
@@ -206,7 +217,7 @@ class GraphVectorStore(VectorStore):
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
The metadata key `links` shall be an iterable of
:py:class:`~langchain_core.graph_vectorstores.links.Link`.
:py:class:`~langchain_community.graph_vectorstores.links.GraphStoreLink`.
**kwargs: vectorstore specific parameters.
Returns:
@@ -254,7 +265,7 @@ class GraphVectorStore(VectorStore):
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
The metadata key `links` shall be an iterable of
:py:class:`~langchain_core.graph_vectorstores.links.Link`.
:py:class:`~langchain_community.graph_vectorstores.links.GraphStoreLink`.
**kwargs: vectorstore specific parameters.
Returns:
@@ -305,7 +316,7 @@ class GraphVectorStore(VectorStore):
Args:
documents: Documents to add to the vectorstore.
The document's metadata key `links` shall be an iterable of
:py:class:`~langchain_core.graph_vectorstores.links.Link`.
:py:class:`~langchain_community.graph_vectorstores.links.GraphStoreLink`.
Returns:
List of IDs of the added texts.
@@ -355,7 +366,7 @@ class GraphVectorStore(VectorStore):
Args:
documents: Documents to add to the vectorstore.
The document's metadata key `links` shall be an iterable of
:py:class:`~langchain_core.graph_vectorstores.links.Link`.
:py:class:`~langchain_community.graph_vectorstores.links.GraphStoreLink`.
Returns:
List of IDs of the added texts.

View File

@@ -11,12 +11,12 @@ from typing import (
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.graph_vectorstores.base import (
from langchain_community.graph_vectorstores.base import (
GraphVectorStore,
Node,
nodes_to_documents,
)
from langchain_community.utilities.cassandra import SetupMode
if TYPE_CHECKING:

View File

@@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, List, Optional, Set, Union
from urllib.parse import urldefrag, urljoin, urlparse
from langchain_core.documents import Document
from langchain_core.graph_vectorstores import Link
from langchain_community.graph_vectorstores import GraphStoreLink
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
@@ -106,7 +106,7 @@ class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
def extract_one(
self,
input: HtmlInput, # noqa: A002
) -> Set[Link]:
) -> Set[GraphStoreLink]:
content = input.content
if isinstance(content, str):
from bs4 import BeautifulSoup
@@ -119,6 +119,6 @@ class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
hrefs = _parse_hrefs(content, base_url, self.drop_fragments)
links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
links.add(Link.incoming(kind=self._kind, tag=base_url))
links = {GraphStoreLink.outgoing(kind=self._kind, tag=url) for url in hrefs}
links.add(GraphStoreLink.incoming(kind=self._kind, tag=base_url))
return links

View File

@@ -3,7 +3,7 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Generic, Iterable, Set, TypeVar
from langchain_core.graph_vectorstores import Link
from langchain_community.graph_vectorstores.links import GraphStoreLink
InputT = TypeVar("InputT")
@@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]):
"""Interface for extracting links (incoming, outgoing, bidirectional)."""
@abstractmethod
def extract_one(self, input: InputT) -> set[Link]: # noqa: A002
def extract_one(self, input: InputT) -> set[GraphStoreLink]: # noqa: A002
"""Extract the set of links from the given `input`.
Args:
@@ -24,7 +24,7 @@ class LinkExtractor(ABC, Generic[InputT]):
Set of links extracted from the input.
"""
def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[GraphStoreLink]]:
"""Extract a set of links for each of the given `inputs`.
Args:

View File

@@ -1,7 +1,6 @@
from typing import Callable, Iterable, Set, TypeVar
from langchain_core.graph_vectorstores import Link
from langchain_community.graph_vectorstores import GraphStoreLink
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
@@ -19,9 +18,9 @@ class LinkExtractorAdapter(LinkExtractor[InputT]):
self._underlying = underlying
self._transform = transform
def extract_one(self, input: InputT) -> Set[Link]: # noqa: A002
def extract_one(self, input: InputT) -> Set[GraphStoreLink]: # noqa: A002
return self._underlying.extract_one(self._transform(input))
def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[GraphStoreLink]]:
underlying_inputs = map(self._transform, inputs)
return self._underlying.extract_many(underlying_inputs)

View File

@@ -5,7 +5,7 @@ from langchain_core.documents import Document
@dataclass(frozen=True)
class Link:
class GraphStoreLink:
"""A link to/from a tag of a given kind.
Edges exist from nodes with an outgoing link to nodes with a matching incoming link.
@@ -20,28 +20,32 @@ class Link:
"""The tag of the link."""
@staticmethod
def incoming(kind: str, tag: str) -> "Link":
def incoming(kind: str, tag: str) -> "GraphStoreLink":
"""Create an incoming link."""
return Link(kind=kind, direction="in", tag=tag)
return GraphStoreLink(kind=kind, direction="in", tag=tag)
@staticmethod
def outgoing(kind: str, tag: str) -> "Link":
def outgoing(kind: str, tag: str) -> "GraphStoreLink":
"""Create an outgoing link."""
return Link(kind=kind, direction="out", tag=tag)
return GraphStoreLink(kind=kind, direction="out", tag=tag)
@staticmethod
def bidir(kind: str, tag: str) -> "Link":
def bidir(kind: str, tag: str) -> "GraphStoreLink":
"""Create a bidirectional link."""
return Link(kind=kind, direction="bidir", tag=tag)
return GraphStoreLink(kind=kind, direction="bidir", tag=tag)
Link = GraphStoreLink # Alias for backwards compatibility
METADATA_LINKS_KEY = "links"
def get_links(doc: Document) -> List[Link]:
def get_links(doc: Document) -> List[GraphStoreLink]:
"""Get the links from a document.
Args:
doc: The document to get the links from.
Returns:
The list of links from the document.
"""
@@ -54,8 +58,11 @@ def get_links(doc: Document) -> List[Link]:
return links
def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
def add_links(
doc: Document, *links: Union[GraphStoreLink, Iterable[GraphStoreLink]]
) -> None:
"""Add links to the given metadata.
Args:
doc: The document to add the links to.
*links: The links to add to the document.

View File

@@ -1,6 +1,6 @@
import pytest
from langchain_core.graph_vectorstores import Link
from langchain_community.graph_vectorstores import GraphStoreLink
from langchain_community.graph_vectorstores.extractors import (
HtmlInput,
HtmlLinkExtractor,
@@ -33,20 +33,20 @@ def test_one_from_str() -> None:
results = extractor.extract_one(HtmlInput(PAGE_1, base_url="https://foo.com/bar/"))
assert results == {
Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
Link.outgoing(kind="hyperlink", tag="https://same.foo"),
GraphStoreLink.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
GraphStoreLink.outgoing(kind="hyperlink", tag="http://cnn.com"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://same.foo"),
}
results = extractor.extract_one(HtmlInput(PAGE_1, base_url="http://foo.com/bar/"))
assert results == {
Link.incoming(kind="hyperlink", tag="http://foo.com/bar/"),
Link.outgoing(kind="hyperlink", tag="http://foo.com/bar/relative"),
Link.outgoing(kind="hyperlink", tag="http://foo.com/relative-base"),
Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
Link.outgoing(kind="hyperlink", tag="http://same.foo"),
GraphStoreLink.incoming(kind="hyperlink", tag="http://foo.com/bar/"),
GraphStoreLink.outgoing(kind="hyperlink", tag="http://foo.com/bar/relative"),
GraphStoreLink.outgoing(kind="hyperlink", tag="http://foo.com/relative-base"),
GraphStoreLink.outgoing(kind="hyperlink", tag="http://cnn.com"),
GraphStoreLink.outgoing(kind="hyperlink", tag="http://same.foo"),
}
@@ -58,11 +58,11 @@ def test_one_from_beautiful_soup() -> None:
soup = BeautifulSoup(PAGE_1, "html.parser")
results = extractor.extract_one(HtmlInput(soup, base_url="https://foo.com/bar/"))
assert results == {
Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
Link.outgoing(kind="hyperlink", tag="https://same.foo"),
GraphStoreLink.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
GraphStoreLink.outgoing(kind="hyperlink", tag="http://cnn.com"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://same.foo"),
}
@@ -74,8 +74,8 @@ def test_drop_fragments() -> None:
)
assert results == {
Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
GraphStoreLink.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
}
@@ -87,8 +87,8 @@ def test_include_fragments() -> None:
)
assert results == {
Link.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"),
Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"),
GraphStoreLink.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"),
}
@@ -105,13 +105,13 @@ def test_batch_from_str() -> None:
)
assert results[0] == {
Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
Link.outgoing(kind="hyperlink", tag="https://same.foo"),
GraphStoreLink.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
GraphStoreLink.outgoing(kind="hyperlink", tag="http://cnn.com"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://same.foo"),
}
assert results[1] == {
Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
GraphStoreLink.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
GraphStoreLink.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
}

View File

@@ -1,34 +1,38 @@
import pytest
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.base import (
Node,
from langchain_community.graph_vectorstores.base import (
GraphStoreNode,
_documents_to_nodes,
_texts_to_nodes,
)
from langchain_core.graph_vectorstores.links import Link
from langchain_community.graph_vectorstores.links import GraphStoreLink
def test_texts_to_nodes() -> None:
assert list(_texts_to_nodes(["a", "b"], [{"a": "b"}, {"c": "d"}], ["a", "b"])) == [
Node(id="a", metadata={"a": "b"}, text="a"),
Node(id="b", metadata={"c": "d"}, text="b"),
GraphStoreNode(id="a", metadata={"a": "b"}, text="a"),
GraphStoreNode(id="b", metadata={"c": "d"}, text="b"),
]
assert list(_texts_to_nodes(["a", "b"], None, ["a", "b"])) == [
Node(id="a", metadata={}, text="a"),
Node(id="b", metadata={}, text="b"),
GraphStoreNode(id="a", metadata={}, text="a"),
GraphStoreNode(id="b", metadata={}, text="b"),
]
assert list(_texts_to_nodes(["a", "b"], [{"a": "b"}, {"c": "d"}], None)) == [
Node(metadata={"a": "b"}, text="a"),
Node(metadata={"c": "d"}, text="b"),
GraphStoreNode(metadata={"a": "b"}, text="a"),
GraphStoreNode(metadata={"c": "d"}, text="b"),
]
assert list(
_texts_to_nodes(
["a"],
[{"links": {Link.incoming(kind="hyperlink", tag="http://b")}}],
[{"links": {GraphStoreLink.incoming(kind="hyperlink", tag="http://b")}}],
None,
)
) == [Node(links=[Link.incoming(kind="hyperlink", tag="http://b")], text="a")]
) == [
GraphStoreNode(
links=[GraphStoreLink.incoming(kind="hyperlink", tag="http://b")], text="a"
)
]
with pytest.raises(ValueError):
list(_texts_to_nodes(["a", "b"], None, ["a"]))
with pytest.raises(ValueError):
@@ -44,16 +48,18 @@ def test_documents_to_nodes() -> None:
Document(
id="a",
page_content="some text a",
metadata={"links": [Link.incoming(kind="hyperlink", tag="http://b")]},
metadata={
"links": [GraphStoreLink.incoming(kind="hyperlink", tag="http://b")]
},
),
Document(id="b", page_content="some text b", metadata={"c": "d"}),
]
assert list(_documents_to_nodes(documents)) == [
Node(
GraphStoreNode(
id="a",
metadata={},
links=[Link.incoming(kind="hyperlink", tag="http://b")],
links=[GraphStoreLink.incoming(kind="hyperlink", tag="http://b")],
text="some text a",
),
Node(id="b", metadata={"c": "d"}, text="some text b"),
GraphStoreNode(id="b", metadata={"c": "d"}, text="some text b"),
]

View File

@@ -1,15 +0,0 @@
from langchain_core.graph_vectorstores.base import (
GraphVectorStore,
GraphVectorStoreRetriever,
Node,
)
from langchain_core.graph_vectorstores.links import (
Link,
)
__all__ = [
"GraphVectorStore",
"GraphVectorStoreRetriever",
"Node",
"Link",
]