community[minor]: add document transformer for extracting links (#24186)

- **Description:** Add a DocumentTransformer for executing one or more
`LinkExtractor`s and adding the extracted links to each document.
- **Issue:** n/a
- **Dependencies:** none

---------

Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
This commit is contained in:
Ben Chambers 2024-07-22 19:01:21 -07:00 committed by GitHub
parent 3c4652c906
commit 5ac936a284
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 174 additions and 8 deletions

View File

@ -6,20 +6,24 @@ from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor
HierarchyInput,
HierarchyLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
HtmlInput,
HtmlLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
KeybertInput,
KeybertLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor import (
from .html_link_extractor import (
HtmlInput,
HtmlLinkExtractor,
)
from .link_extractor import (
LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
from .link_extractor_adapter import (
LinkExtractorAdapter,
)
from .link_extractor_transformer import (
LinkExtractorTransformer,
)
__all__ = [
"GLiNERInput",
@ -34,4 +38,5 @@ __all__ = [
"LinkExtractor",
"LinkExtractorAdapter",
"LinkExtractorAdapter",
"LinkExtractorTransformer",
]

View File

@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]):
"""Interface for extracting links (incoming, outgoing, bidirectional)."""
@abstractmethod
def extract_one(self, input: InputT) -> set[Link]: # noqa: A002
def extract_one(self, input: InputT) -> Set[Link]:
"""Add edges from each `input` to the corresponding documents.
Args:

View File

@ -0,0 +1,43 @@
from typing import Any, Iterable, Sequence
from langchain_core.documents import Document
from langchain_core.documents.transformers import BaseDocumentTransformer
from langchain_core.graph_vectorstores.links import copy_with_links
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
class LinkExtractorTransformer(BaseDocumentTransformer):
    """DocumentTransformer for applying one or more LinkExtractors.

    Example:
        .. code-block:: python

            extract_links = LinkExtractorTransformer([
                HtmlLinkExtractor().as_document_extractor(),
            ])
            extract_links.transform_documents(docs)
    """

    def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]):
        """Create a DocumentTransformer which adds extracted links to each document.

        Args:
            link_extractors: The extractors to apply to each document. The
                iterable is materialized eagerly so that passing a generator
                does not exhaust it after the first ``transform_documents`` call.
        """
        self.link_extractors = list(link_extractors)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Return copies of `documents` with links from every extractor added.

        Implemented directly (rather than per-document), so that
        LinkExtractors which operate better in batch (`extract_many`)
        get a chance to do so.
        """
        # With no extractors there is nothing to add; return the inputs
        # unchanged. Without this guard, zip(*[]) below would yield nothing
        # and every document would be silently dropped from the result.
        if not self.link_extractors:
            return documents

        # Run each extractor over all documents.
        links_per_extractor = [e.extract_many(documents) for e in self.link_extractors]

        # Transpose the list of lists to pair each document with the tuple of
        # links contributed by each extractor.
        links_per_document = zip(*links_per_extractor)

        return [
            copy_with_links(document, *links)
            for document, links in zip(documents, links_per_document)
        ]

View File

@ -0,0 +1,92 @@
from typing import Set
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link, get_links
from langchain_community.graph_vectorstores.extractors import (
LinkExtractor,
LinkExtractorTransformer,
)
TEXT1 = "Text1"
TEXT2 = "Text2"
class FakeKeywordExtractor(LinkExtractor[Document]):
    """Test double emitting bidirectional keyword links for the known texts."""

    # Fake keyword sets keyed by page content; unknown content yields no links.
    _KEYWORDS = {
        TEXT1: {"a", "b"},
        TEXT2: {"b", "c"},
    }

    def extract_one(self, input: Document) -> Set[Link]:
        keywords = self._KEYWORDS.get(input.page_content, set())
        return {Link.bidir(kind="fakekw", tag=keyword) for keyword in keywords}
class FakeHyperlinkExtractor(LinkExtractor[Document]):
    """Test double emitting directional hyperlink links for the known texts."""

    def extract_one(self, input: Document) -> Set[Link]:
        content = input.page_content
        if content == TEXT1:
            return {
                Link.incoming(kind="fakehref", tag="http://text1"),
                Link.outgoing(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        if content == TEXT2:
            return {
                Link.incoming(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        # Any other content is a test-setup error, not a soft failure.
        raise ValueError(
            f"Unsupported input for FakeHyperlinkExtractor: '{content}'"
        )
def test_one_extractor() -> None:
    """A single extractor's links are attached to each transformed document."""
    transformer = LinkExtractorTransformer([FakeKeywordExtractor()])

    transformed = transformer.transform_documents(
        [Document(TEXT1), Document(TEXT2)]
    )

    assert set(get_links(transformed[0])) == {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
    }
    assert set(get_links(transformed[1])) == {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
    }
def test_multiple_extractors() -> None:
    """Links from every extractor are merged onto each transformed document."""
    transformer = LinkExtractorTransformer(
        [FakeKeywordExtractor(), FakeHyperlinkExtractor()]
    )

    transformed = transformer.transform_documents(
        [Document(TEXT1), Document(TEXT2)]
    )

    assert set(get_links(transformed[0])) == {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
        Link.incoming(kind="fakehref", tag="http://text1"),
        Link.outgoing(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }
    assert set(get_links(transformed[1])) == {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
        Link.incoming(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }

View File

@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
links_in_metadata.extend(link)
else:
links_in_metadata.append(link)
def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
    """Return a document with the given links added.

    Args:
        doc: The document to add the links to.
        *links: The links to add to the document.

    Returns:
        A document with a shallow-copy of the metadata with the links added.
    """
    # Start from the links already on the document; a set de-duplicates.
    combined = set(get_links(doc))
    for item in links:
        # Each positional argument may be a single Link or an iterable of them.
        if isinstance(item, Iterable):
            combined.update(item)
        else:
            combined.add(item)

    # Shallow-copy the metadata so the original document is left untouched.
    metadata = dict(doc.metadata)
    metadata[METADATA_LINKS_KEY] = list(combined)
    return Document(page_content=doc.page_content, metadata=metadata)