mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 00:23:25 +00:00
community[minor]: Add graph store extractors (#24065)
This adds an extractor interface and an implementation for HTML pages. Extractors are used to create GraphVectorStore Links on loaded content. **Twitter handle:** cbornet_
This commit is contained in:
parent
9bcf8f867d
commit
5fc5ef2b52
@ -0,0 +1,17 @@
|
|||||||
|
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
|
||||||
|
HtmlInput,
|
||||||
|
HtmlLinkExtractor,
|
||||||
|
)
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||||
|
LinkExtractor,
|
||||||
|
)
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
|
||||||
|
LinkExtractorAdapter,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Public API of the extractors package; order mirrors dependency order.
__all__ = ["LinkExtractor", "LinkExtractorAdapter", "HtmlInput", "HtmlLinkExtractor"]
|
@ -0,0 +1,124 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import TYPE_CHECKING, List, Optional, Set, Union
|
||||||
|
from urllib.parse import urldefrag, urljoin, urlparse
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.graph_vectorstores import Link
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||||
|
LinkExtractor,
|
||||||
|
)
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
|
||||||
|
LinkExtractorAdapter,
|
||||||
|
)
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Tag
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_url(link: Tag, page_url: str, drop_fragments: bool = True) -> Optional[str]:
|
||||||
|
href = link.get("href")
|
||||||
|
if href is None:
|
||||||
|
return None
|
||||||
|
url = urlparse(href)
|
||||||
|
if url.scheme not in ["http", "https", ""]:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Join the HREF with the page_url to convert relative paths to absolute.
|
||||||
|
url = str(urljoin(page_url, href))
|
||||||
|
|
||||||
|
# Fragments would be useful if we chunked a page based on section.
|
||||||
|
# Then, each chunk would have a different URL based on the fragment.
|
||||||
|
# Since we aren't doing that yet, they just "break" links. So, drop
|
||||||
|
# the fragment.
|
||||||
|
if drop_fragments:
|
||||||
|
return urldefrag(url).url
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_hrefs(
    soup: BeautifulSoup, url: str, drop_fragments: bool = True
) -> Set[str]:
    """Collect the set of outgoing link URLs found in a parsed page.

    Anchors that fail to parse (missing href, unsupported scheme, etc.)
    and links pointing back at the page itself are excluded.
    """
    anchors: List[Tag] = soup.find_all("a")
    candidates = (
        _parse_url(anchor, page_url=url, drop_fragments=drop_fragments)
        for anchor in anchors
    )
    # Filter out failed parses (None) and self-links in one pass.
    return {target for target in candidates if target and target != url}
|
||||||
|
|
||||||
|
@dataclass
class HtmlInput:
    """Input to ``HtmlLinkExtractor``: a page plus the URL it came from."""

    # Raw HTML string, or an already-parsed BeautifulSoup document.
    content: Union[str, BeautifulSoup]
    # URL of the page; used to resolve relative hrefs.
    base_url: str
|
||||||
|
|
||||||
|
class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
    """Extract hyperlinks from HTML content.

    Produces one incoming ``Link`` for the page itself and one outgoing
    ``Link`` per unique, non-self hyperlink found in the content.
    """

    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
        """Extract hyperlinks from HTML content.

        Expects the input to be an HTML string or a `BeautifulSoup` object.

        Args:
            kind: The kind of edge to extract. Defaults to "hyperlink".
            drop_fragments: Whether fragments in URLs and links should be
                dropped. Defaults to `True`.

        Raises:
            ImportError: If BeautifulSoup4 is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for HtmlLinkExtractor. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

        self._kind = kind
        self.drop_fragments = drop_fragments

    def as_document_extractor(
        self, url_metadata_key: str = "source"
    ) -> LinkExtractor[Document]:
        """Return a LinkExtractor that applies to documents.

        NOTE: Since the HtmlLinkExtractor parses HTML, if you use with other similar
        link extractors it may be more efficient to call the link extractors directly
        on the parsed BeautifulSoup object.

        Args:
            url_metadata_key: The name of the field in document metadata with the URL of
                the document.
        """
        return LinkExtractorAdapter(
            underlying=self,
            transform=lambda doc: HtmlInput(
                doc.page_content, doc.metadata[url_metadata_key]
            ),
        )

    def extract_one(
        self,
        input: HtmlInput,  # noqa: A002
    ) -> Set[Link]:
        """Extract links from a single HTML input.

        Returns:
            An incoming link tagged with the page's own URL plus one
            outgoing link per unique hyperlink found in the content.
        """
        content = input.content
        if isinstance(content, str):
            # Parse lazily so raw-string and pre-parsed inputs share one path.
            from bs4 import BeautifulSoup

            content = BeautifulSoup(content, "html.parser")

        base_url = input.base_url
        if self.drop_fragments:
            # Keep the page's own tag consistent with the de-fragmented hrefs.
            base_url = urldefrag(base_url).url

        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)

        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
        links.add(Link.incoming(kind=self._kind, tag=base_url))
        return links
|
@ -0,0 +1,36 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Generic, Iterable, Set, TypeVar
|
||||||
|
|
||||||
|
from langchain_core.graph_vectorstores import Link
|
||||||
|
|
||||||
|
# Type of input accepted by a LinkExtractor implementation.
InputT = TypeVar("InputT")

# Document metadata key under which extracted links are stored.
METADATA_LINKS_KEY = "links"
|
||||||
|
|
||||||
|
class LinkExtractor(ABC, Generic[InputT]):
    """Interface for extracting links (incoming, outgoing, bidirectional)."""

    @abstractmethod
    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        """Extract links from the given input.

        Args:
            input: The input content to extract links from.

        Returns:
            Set of links extracted from the input.
        """

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        """Extract links from each of the given inputs.

        The default implementation lazily applies `extract_one` to each
        input; implementations may override this with a batched version.

        Args:
            inputs: The input contents to extract links from.

        Returns:
            Iterable over the sets of links extracted from each input.
        """
        return map(self.extract_one, inputs)
|
@ -0,0 +1,27 @@
|
|||||||
|
from typing import Callable, Iterable, Set, TypeVar
|
||||||
|
|
||||||
|
from langchain_core.graph_vectorstores import Link
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||||
|
LinkExtractor,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Public input type accepted by the adapter.
InputT = TypeVar("InputT")
# Input type expected by the wrapped (underlying) extractor.
UnderlyingInputT = TypeVar("UnderlyingInputT")
||||||
|
|
||||||
|
class LinkExtractorAdapter(LinkExtractor[InputT]):
    """Adapt an extractor of one input type to accept another.

    Each incoming input is converted with ``transform`` and handed to the
    wrapped extractor, letting extraction logic be reused across input types.
    """

    def __init__(
        self,
        underlying: LinkExtractor[UnderlyingInputT],
        transform: Callable[[InputT], UnderlyingInputT],
    ) -> None:
        self._underlying = underlying
        self._transform = transform

    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        # Convert, then delegate to the wrapped extractor.
        return self._underlying.extract_one(self._transform(input))

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        # Delegate the whole (lazily converted) batch so the wrapped
        # extractor's own batching, if any, still applies.
        converted = (self._transform(item) for item in inputs)
        return self._underlying.extract_many(converted)
|
@ -0,0 +1,117 @@
|
|||||||
|
import pytest
|
||||||
|
from langchain_core.graph_vectorstores import Link
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors import (
|
||||||
|
HtmlInput,
|
||||||
|
HtmlLinkExtractor,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Fixture page with relative, root-relative, absolute, and
# scheme-relative ("//") hyperlinks.
PAGE_1 = """
<html>
<body>
Hello.
<a href="relative">Relative</a>
<a href="/relative-base">Relative base.</a>
<a href="http://cnn.com">Absolute</a>
<a href="//same.foo">Test</a>
</body>
</html>
"""

# Fixture page whose only link (and, in some tests, whose base URL)
# carries a URL fragment.
PAGE_2 = """
<html>
<body>
Hello.
<a href="/bar/#fragment">Relative</a>
</body>
</html>
"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
def test_one_from_str() -> None:
    """Extracted links must follow the scheme of the base URL."""
    extractor = HtmlLinkExtractor()

    for scheme in ("https", "http"):
        base = f"{scheme}://foo.com/bar/"
        links = extractor.extract_one(HtmlInput(PAGE_1, base_url=base))
        assert links == {
            Link.incoming(kind="hyperlink", tag=base),
            Link.outgoing(kind="hyperlink", tag=f"{base}relative"),
            Link.outgoing(kind="hyperlink", tag=f"{scheme}://foo.com/relative-base"),
            # A fully-qualified href keeps its own scheme regardless of the page.
            Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
            # A scheme-relative ("//") href inherits the page's scheme.
            Link.outgoing(kind="hyperlink", tag=f"{scheme}://same.foo"),
        }
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
def test_one_from_beautiful_soup() -> None:
    """A pre-parsed BeautifulSoup input behaves the same as raw HTML."""
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(PAGE_1, "html.parser")
    links = HtmlLinkExtractor().extract_one(
        HtmlInput(parsed, base_url="https://foo.com/bar/")
    )

    outgoing_urls = [
        "https://foo.com/bar/relative",
        "https://foo.com/relative-base",
        "http://cnn.com",
        "https://same.foo",
    ]
    assert links == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        *(Link.outgoing(kind="hyperlink", tag=url) for url in outgoing_urls),
    }
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
def test_drop_fragments() -> None:
    """With drop_fragments=True, fragments vanish from base URL and hrefs."""
    extractor = HtmlLinkExtractor(drop_fragments=True)
    links = extractor.extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )

    expected = {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }
    assert links == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
def test_include_fragments() -> None:
    """With drop_fragments=False, fragments survive in base URL and hrefs."""
    extractor = HtmlLinkExtractor(drop_fragments=False)
    links = extractor.extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )

    expected = {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"),
    }
    assert links == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
def test_batch_from_str() -> None:
    """extract_many yields one link set per input, in input order."""
    extractor = HtmlLinkExtractor()
    inputs = [
        HtmlInput(PAGE_1, base_url="https://foo.com/bar/"),
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/"),
    ]
    first, second = list(extractor.extract_many(inputs))

    assert first == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    assert second == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }
|
Loading…
Reference in New Issue
Block a user