mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 00:23:25 +00:00
community[minor]: Add graph store extractors (#24065)
This adds an extractor interface and an implementation for HTML pages. Extractors are used to create GraphVectorStore Links on loaded content. **Twitter handle:** cbornet_
This commit is contained in:
parent
9bcf8f867d
commit
5fc5ef2b52
@ -0,0 +1,17 @@
|
|||||||
|
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
|
||||||
|
HtmlInput,
|
||||||
|
HtmlLinkExtractor,
|
||||||
|
)
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||||
|
LinkExtractor,
|
||||||
|
)
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
|
||||||
|
LinkExtractorAdapter,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Public API of the extractors package; order mirrors dependency order.
__all__ = ["LinkExtractor", "LinkExtractorAdapter", "HtmlInput", "HtmlLinkExtractor"]
|
@ -0,0 +1,124 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import TYPE_CHECKING, List, Optional, Set, Union
|
||||||
|
from urllib.parse import urldefrag, urljoin, urlparse
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.graph_vectorstores import Link
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||||
|
LinkExtractor,
|
||||||
|
)
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
|
||||||
|
LinkExtractorAdapter,
|
||||||
|
)
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Tag
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_url(link: Tag, page_url: str, drop_fragments: bool = True) -> Optional[str]:
|
||||||
|
href = link.get("href")
|
||||||
|
if href is None:
|
||||||
|
return None
|
||||||
|
url = urlparse(href)
|
||||||
|
if url.scheme not in ["http", "https", ""]:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Join the HREF with the page_url to convert relative paths to absolute.
|
||||||
|
url = str(urljoin(page_url, href))
|
||||||
|
|
||||||
|
# Fragments would be useful if we chunked a page based on section.
|
||||||
|
# Then, each chunk would have a different URL based on the fragment.
|
||||||
|
# Since we aren't doing that yet, they just "break" links. So, drop
|
||||||
|
# the fragment.
|
||||||
|
if drop_fragments:
|
||||||
|
return urldefrag(url).url
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_hrefs(
    soup: BeautifulSoup, url: str, drop_fragments: bool = True
) -> Set[str]:
    """Collect the set of outgoing link URLs found in a parsed page.

    Anchors that fail to parse (missing href, unsupported scheme, etc.)
    and links pointing back at the page itself are excluded.
    """
    anchors: List[Tag] = soup.find_all("a")
    candidates = (
        _parse_url(anchor, page_url=url, drop_fragments=drop_fragments)
        for anchor in anchors
    )
    # Filter out failed parses (None) and self-links in one pass.
    return {target for target in candidates if target and target != url}
|
||||||
|
|
||||||
|
@dataclass
class HtmlInput:
    """Input to ``HtmlLinkExtractor``: a page plus the URL it came from."""

    # Raw HTML string, or an already-parsed BeautifulSoup document.
    content: Union[str, BeautifulSoup]
    # URL of the page; used to resolve relative hrefs.
    base_url: str
|
||||||
|
|
||||||
|
class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
    """Extract hyperlinks from HTML content.

    Produces one incoming ``Link`` for the page itself and one outgoing
    ``Link`` per unique, non-self hyperlink found in the content.
    """

    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
        """Extract hyperlinks from HTML content.

        Expects the input to be an HTML string or a `BeautifulSoup` object.

        Args:
            kind: The kind of edge to extract. Defaults to "hyperlink".
            drop_fragments: Whether fragments in URLs and links should be
                dropped. Defaults to `True`.

        Raises:
            ImportError: If BeautifulSoup4 is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for HtmlLinkExtractor. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

        self._kind = kind
        self.drop_fragments = drop_fragments

    def as_document_extractor(
        self, url_metadata_key: str = "source"
    ) -> LinkExtractor[Document]:
        """Return a LinkExtractor that applies to documents.

        NOTE: Since the HtmlLinkExtractor parses HTML, if you use with other similar
        link extractors it may be more efficient to call the link extractors directly
        on the parsed BeautifulSoup object.

        Args:
            url_metadata_key: The name of the field in document metadata with the URL of
                the document.
        """
        return LinkExtractorAdapter(
            underlying=self,
            transform=lambda doc: HtmlInput(
                doc.page_content, doc.metadata[url_metadata_key]
            ),
        )

    def extract_one(
        self,
        input: HtmlInput,  # noqa: A002
    ) -> Set[Link]:
        """Extract links from a single HTML input.

        Returns:
            An incoming link tagged with the page's own URL plus one
            outgoing link per unique hyperlink found in the content.
        """
        content = input.content
        if isinstance(content, str):
            # Parse lazily so raw-string and pre-parsed inputs share one path.
            from bs4 import BeautifulSoup

            content = BeautifulSoup(content, "html.parser")

        base_url = input.base_url
        if self.drop_fragments:
            # Keep the page's own tag consistent with the de-fragmented hrefs.
            base_url = urldefrag(base_url).url

        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)

        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
        links.add(Link.incoming(kind=self._kind, tag=base_url))
        return links
|
@ -0,0 +1,36 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Generic, Iterable, Set, TypeVar
|
||||||
|
|
||||||
|
from langchain_core.graph_vectorstores import Link
|
||||||
|
|
||||||
|
# Type of input accepted by a LinkExtractor implementation.
InputT = TypeVar("InputT")

# Document metadata key under which extracted links are stored.
METADATA_LINKS_KEY = "links"
|
||||||
|
|
||||||
|
class LinkExtractor(ABC, Generic[InputT]):
    """Interface for extracting links (incoming, outgoing, bidirectional)."""

    @abstractmethod
    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        """Extract links from the given input.

        Args:
            input: The input content to extract links from.

        Returns:
            Set of links extracted from the input.
        """

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        """Extract links from each of the given inputs.

        The default implementation lazily applies `extract_one` to each
        input; implementations may override this with a batched version.

        Args:
            inputs: The input contents to extract links from.

        Returns:
            Iterable over the sets of links extracted from each input.
        """
        return map(self.extract_one, inputs)
|
@ -0,0 +1,27 @@
|
|||||||
|
from typing import Callable, Iterable, Set, TypeVar
|
||||||
|
|
||||||
|
from langchain_core.graph_vectorstores import Link
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||||
|
LinkExtractor,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Public input type accepted by the adapter.
InputT = TypeVar("InputT")
# Input type expected by the wrapped (underlying) extractor.
UnderlyingInputT = TypeVar("UnderlyingInputT")
||||||
|
|
||||||
|
class LinkExtractorAdapter(LinkExtractor[InputT]):
    """Adapt an extractor of one input type to accept another.

    Each incoming input is converted with ``transform`` and handed to the
    wrapped extractor, letting extraction logic be reused across input types.
    """

    def __init__(
        self,
        underlying: LinkExtractor[UnderlyingInputT],
        transform: Callable[[InputT], UnderlyingInputT],
    ) -> None:
        self._underlying = underlying
        self._transform = transform

    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        # Convert, then delegate to the wrapped extractor.
        return self._underlying.extract_one(self._transform(input))

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        # Delegate the whole (lazily converted) batch so the wrapped
        # extractor's own batching, if any, still applies.
        converted = (self._transform(item) for item in inputs)
        return self._underlying.extract_many(converted)
|
@ -0,0 +1,117 @@
|
|||||||
|
import pytest
|
||||||
|
from langchain_core.graph_vectorstores import Link
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors import (
|
||||||
|
HtmlInput,
|
||||||
|
HtmlLinkExtractor,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Fixture page with relative, root-relative, absolute, and
# scheme-relative ("//") hyperlinks.
PAGE_1 = """
<html>
<body>
Hello.
<a href="relative">Relative</a>
<a href="/relative-base">Relative base.</a>
<a href="http://cnn.com">Absolute</a>
<a href="//same.foo">Test</a>
</body>
</html>
"""

# Fixture page whose only link (and, in some tests, whose base URL)
# carries a URL fragment.
PAGE_2 = """
<html>
<body>
Hello.
<a href="/bar/#fragment">Relative</a>
</body>
</html>
"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
def test_one_from_str() -> None:
    """Extracted links must follow the scheme of the base URL."""
    extractor = HtmlLinkExtractor()

    for scheme in ("https", "http"):
        base = f"{scheme}://foo.com/bar/"
        links = extractor.extract_one(HtmlInput(PAGE_1, base_url=base))
        assert links == {
            Link.incoming(kind="hyperlink", tag=base),
            Link.outgoing(kind="hyperlink", tag=f"{base}relative"),
            Link.outgoing(kind="hyperlink", tag=f"{scheme}://foo.com/relative-base"),
            # A fully-qualified href keeps its own scheme regardless of the page.
            Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
            # A scheme-relative ("//") href inherits the page's scheme.
            Link.outgoing(kind="hyperlink", tag=f"{scheme}://same.foo"),
        }
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
def test_one_from_beautiful_soup() -> None:
    """A pre-parsed BeautifulSoup input behaves the same as raw HTML."""
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(PAGE_1, "html.parser")
    links = HtmlLinkExtractor().extract_one(
        HtmlInput(parsed, base_url="https://foo.com/bar/")
    )

    outgoing_urls = [
        "https://foo.com/bar/relative",
        "https://foo.com/relative-base",
        "http://cnn.com",
        "https://same.foo",
    ]
    assert links == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        *(Link.outgoing(kind="hyperlink", tag=url) for url in outgoing_urls),
    }
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
def test_drop_fragments() -> None:
    """With drop_fragments=True, fragments vanish from base URL and hrefs."""
    extractor = HtmlLinkExtractor(drop_fragments=True)
    links = extractor.extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )

    expected = {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }
    assert links == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
def test_include_fragments() -> None:
    """With drop_fragments=False, fragments survive in base URL and hrefs."""
    extractor = HtmlLinkExtractor(drop_fragments=False)
    links = extractor.extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )

    expected = {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"),
    }
    assert links == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4")
def test_batch_from_str() -> None:
    """extract_many yields one link set per input, in input order."""
    extractor = HtmlLinkExtractor()
    inputs = [
        HtmlInput(PAGE_1, base_url="https://foo.com/bar/"),
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/"),
    ]
    first, second = list(extractor.extract_many(inputs))

    assert first == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    assert second == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }
|
Loading…
Reference in New Issue
Block a user