From 5ac936a284934ca47b0c81021ec7dcb0fa905a5b Mon Sep 17 00:00:00 2001 From: Ben Chambers <35960+bjchambers@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:01:21 -0700 Subject: [PATCH] community[minor]: add document transformer for extracting links (#24186) - **Description:** Add a DocumentTransformer for executing one or more `LinkExtractor`s and adding the extracted links to each document. - **Issue:** n/a - **Dependencies:** none --------- Co-authored-by: Eugene Yurtsev --- .../graph_vectorstores/extractors/__init__.py | 17 ++-- .../extractors/link_extractor.py | 2 +- .../extractors/link_extractor_transformer.py | 43 +++++++++ .../test_link_extractor_transformer.py | 92 +++++++++++++++++++ .../graph_vectorstores/links.py | 28 +++++- 5 files changed, 174 insertions(+), 8 deletions(-) create mode 100644 libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_transformer.py create mode 100644 libs/community/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py b/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py index a78eb3807be..5acd8000038 100644 --- a/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py +++ b/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py @@ -6,20 +6,24 @@ from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor HierarchyInput, HierarchyLinkExtractor, ) -from langchain_community.graph_vectorstores.extractors.html_link_extractor import ( - HtmlInput, - HtmlLinkExtractor, -) from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import ( KeybertInput, KeybertLinkExtractor, ) -from langchain_community.graph_vectorstores.extractors.link_extractor import ( + +from .html_link_extractor import ( + HtmlInput, + HtmlLinkExtractor, +) +from .link_extractor import ( LinkExtractor, ) -from 
langchain_community.graph_vectorstores.extractors.link_extractor_adapter import ( +from .link_extractor_adapter import ( LinkExtractorAdapter, ) +from .link_extractor_transformer import ( + LinkExtractorTransformer, +) __all__ = [ "GLiNERInput", @@ -34,4 +38,5 @@ __all__ = [ "LinkExtractor", "LinkExtractorAdapter", "LinkExtractorAdapter", + "LinkExtractorTransformer", ] diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py index 2eaa94bd86b..619ba2a6d13 100644 --- a/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py +++ b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py @@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]): """Interface for extracting links (incoming, outgoing, bidirectional).""" @abstractmethod - def extract_one(self, input: InputT) -> set[Link]: # noqa: A002 + def extract_one(self, input: InputT) -> Set[Link]: """Add edges from each `input` to the corresponding documents. Args: diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_transformer.py b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_transformer.py new file mode 100644 index 00000000000..52e4347d45b --- /dev/null +++ b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_transformer.py @@ -0,0 +1,43 @@ +from typing import Any, Iterable, Sequence + +from langchain_core.documents import Document +from langchain_core.documents.transformers import BaseDocumentTransformer +from langchain_core.graph_vectorstores.links import copy_with_links + +from langchain_community.graph_vectorstores.extractors.link_extractor import ( + LinkExtractor, +) + + +class LinkExtractorTransformer(BaseDocumentTransformer): + """DocumentTransformer for applying one or more LinkExtractors. + + Example: + .. 
code-block:: python + + extract_links = LinkExtractorTransformer([ + HtmlLinkExtractor().as_document_extractor(), + ]) + extract_links.transform_documents(docs) + """ + + def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]): + """Create a DocumentTransformer which adds extracted links to each document.""" + self.link_extractors = link_extractors + + def transform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + # Implement `transform_documents` directly, so that LinkExtractors which operate + # better in batch (`extract_many`) get a chance to do so. + + # Run each extractor over all documents. + links_per_extractor = [e.extract_many(documents) for e in self.link_extractors] + + # Transpose the list of lists to pair each document with the tuple of links. + links_per_document = zip(*links_per_extractor) + + return [ + copy_with_links(document, *links) + for document, links in zip(documents, links_per_document) + ] diff --git a/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py b/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py new file mode 100644 index 00000000000..c791c759417 --- /dev/null +++ b/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py @@ -0,0 +1,92 @@ +from typing import Set + +from langchain_core.documents import Document +from langchain_core.graph_vectorstores.links import Link, get_links + +from langchain_community.graph_vectorstores.extractors import ( + LinkExtractor, + LinkExtractorTransformer, +) + +TEXT1 = "Text1" +TEXT2 = "Text2" + + +class FakeKeywordExtractor(LinkExtractor[Document]): + def extract_one(self, input: Document) -> Set[Link]: + kws: Set[str] = set() + if input.page_content == TEXT1: + kws = {"a", "b"} + elif input.page_content == TEXT2: + kws = {"b", "c"} + + return {Link.bidir(kind="fakekw", tag=kw) for kw in kws} + + +class 
FakeHyperlinkExtractor(LinkExtractor[Document]): + def extract_one(self, input: Document) -> Set[Link]: + if input.page_content == TEXT1: + return { + Link.incoming(kind="fakehref", tag="http://text1"), + Link.outgoing(kind="fakehref", tag="http://text2"), + Link.outgoing(kind="fakehref", tag="http://text3"), + } + elif input.page_content == TEXT2: + return { + Link.incoming(kind="fakehref", tag="http://text2"), + Link.outgoing(kind="fakehref", tag="http://text3"), + } + else: + raise ValueError( + f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'" + ) + + +def test_one_extractor() -> None: + transformer = LinkExtractorTransformer( + [ + FakeKeywordExtractor(), + ] + ) + doc1 = Document(TEXT1) + doc2 = Document(TEXT2) + results = transformer.transform_documents([doc1, doc2]) + + assert set(get_links(results[0])) == { + Link.bidir(kind="fakekw", tag="a"), + Link.bidir(kind="fakekw", tag="b"), + } + + assert set(get_links(results[1])) == { + Link.bidir(kind="fakekw", tag="b"), + Link.bidir(kind="fakekw", tag="c"), + } + + +def test_multiple_extractors() -> None: + transformer = LinkExtractorTransformer( + [ + FakeKeywordExtractor(), + FakeHyperlinkExtractor(), + ] + ) + + doc1 = Document(TEXT1) + doc2 = Document(TEXT2) + + results = transformer.transform_documents([doc1, doc2]) + + assert set(get_links(results[0])) == { + Link.bidir(kind="fakekw", tag="a"), + Link.bidir(kind="fakekw", tag="b"), + Link.incoming(kind="fakehref", tag="http://text1"), + Link.outgoing(kind="fakehref", tag="http://text2"), + Link.outgoing(kind="fakehref", tag="http://text3"), + } + + assert set(get_links(results[1])) == { + Link.bidir(kind="fakekw", tag="b"), + Link.bidir(kind="fakekw", tag="c"), + Link.incoming(kind="fakehref", tag="http://text2"), + Link.outgoing(kind="fakehref", tag="http://text3"), + } diff --git a/libs/core/langchain_core/graph_vectorstores/links.py b/libs/core/langchain_core/graph_vectorstores/links.py index 9da58a39276..11c95986ee9 100644 --- 
a/libs/core/langchain_core/graph_vectorstores/links.py +++ b/libs/core/langchain_core/graph_vectorstores/links.py @@ -12,7 +12,7 @@ class Link: """ kind: str - """The kind of link. Allows different extractors to use the same tag name without + """The kind of link. Allows different extractors to use the same tag name without creating collisions between extractors. For example “keyword” vs “url”.""" direction: Literal["in", "out", "bidir"] """The direction of the link.""" @@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None: links_in_metadata.extend(link) else: links_in_metadata.append(link) + + +def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document: + """Return a document with the given links added. + + Args: + doc: The document to add the links to. + *links: The links to add to the document. + + Returns: + A document with a shallow-copy of the metadata with the links added. + """ + new_links = set(get_links(doc)) + for link in links: + if isinstance(link, Iterable): + new_links.update(link) + else: + new_links.add(link) + + return Document( + page_content=doc.page_content, + metadata={ + **doc.metadata, + METADATA_LINKS_KEY: list(new_links), + }, + )