mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-28 09:28:48 +00:00
community[minor]: add document transformer for extracting links (#24186)
- **Description:** Add a DocumentTransformer for executing one or more `LinkExtractor`s and adding the extracted links to each document. - **Issue:** n/a - **Dependencies:** none --------- Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
This commit is contained in:
parent
3c4652c906
commit
5ac936a284
@ -6,20 +6,24 @@ from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor
|
|||||||
HierarchyInput,
|
HierarchyInput,
|
||||||
HierarchyLinkExtractor,
|
HierarchyLinkExtractor,
|
||||||
)
|
)
|
||||||
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
|
|
||||||
HtmlInput,
|
|
||||||
HtmlLinkExtractor,
|
|
||||||
)
|
|
||||||
from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
|
from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
|
||||||
KeybertInput,
|
KeybertInput,
|
||||||
KeybertLinkExtractor,
|
KeybertLinkExtractor,
|
||||||
)
|
)
|
||||||
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
|
||||||
|
from .html_link_extractor import (
|
||||||
|
HtmlInput,
|
||||||
|
HtmlLinkExtractor,
|
||||||
|
)
|
||||||
|
from .link_extractor import (
|
||||||
LinkExtractor,
|
LinkExtractor,
|
||||||
)
|
)
|
||||||
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
|
from .link_extractor_adapter import (
|
||||||
LinkExtractorAdapter,
|
LinkExtractorAdapter,
|
||||||
)
|
)
|
||||||
|
from .link_extractor_transformer import (
|
||||||
|
LinkExtractorTransformer,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"GLiNERInput",
|
"GLiNERInput",
|
||||||
@ -34,4 +38,5 @@ __all__ = [
|
|||||||
"LinkExtractor",
|
"LinkExtractor",
|
||||||
"LinkExtractorAdapter",
|
"LinkExtractorAdapter",
|
||||||
"LinkExtractorAdapter",
|
"LinkExtractorAdapter",
|
||||||
|
"LinkExtractorTransformer",
|
||||||
]
|
]
|
||||||
|
@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]):
|
|||||||
"""Interface for extracting links (incoming, outgoing, bidirectional)."""
|
"""Interface for extracting links (incoming, outgoing, bidirectional)."""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def extract_one(self, input: InputT) -> set[Link]: # noqa: A002
|
def extract_one(self, input: InputT) -> Set[Link]:
|
||||||
"""Add edges from each `input` to the corresponding documents.
|
"""Add edges from each `input` to the corresponding documents.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -0,0 +1,43 @@
|
|||||||
|
from typing import Any, Iterable, Sequence
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.documents.transformers import BaseDocumentTransformer
|
||||||
|
from langchain_core.graph_vectorstores.links import copy_with_links
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||||
|
LinkExtractor,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class LinkExtractorTransformer(BaseDocumentTransformer):
    """DocumentTransformer for applying one or more LinkExtractors.

    Example:
        .. code-block:: python

            extract_links = LinkExtractorTransformer([
                HtmlLinkExtractor().as_document_extractor(),
            ])
            extract_links.transform_documents(docs)
    """

    def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]):
        """Create a DocumentTransformer which adds extracted links to each document.

        Args:
            link_extractors: The extractors to apply. Materialized into a list so
                that a generator argument survives repeated ``transform_documents``
                calls (iterating a raw generator twice would yield nothing the
                second time).
        """
        self.link_extractors = list(link_extractors)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Return copies of `documents` with links from every extractor added.

        Implements `transform_documents` directly (rather than per-document), so
        that LinkExtractors which operate better in batch (`extract_many`) get a
        chance to do so.
        """
        # With no extractors there is nothing to add; return the documents
        # unchanged. (Without this guard, zip(*[]) below yields nothing and
        # every document would be silently dropped from the result.)
        if not self.link_extractors:
            return documents

        # Run each extractor over all documents.
        links_per_extractor = [e.extract_many(documents) for e in self.link_extractors]

        # Transpose the list of lists to pair each document with its tuple of
        # link sets (one per extractor).
        links_per_document = zip(*links_per_extractor)

        return [
            copy_with_links(document, *links)
            for document, links in zip(documents, links_per_document)
        ]
|
@ -0,0 +1,92 @@
|
|||||||
|
from typing import Set
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.graph_vectorstores.links import Link, get_links
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors import (
|
||||||
|
LinkExtractor,
|
||||||
|
LinkExtractorTransformer,
|
||||||
|
)
|
||||||
|
|
||||||
|
TEXT1 = "Text1"
|
||||||
|
TEXT2 = "Text2"
|
||||||
|
|
||||||
|
|
||||||
|
class FakeKeywordExtractor(LinkExtractor[Document]):
    """Test double that emits fixed bidirectional keyword links per document text."""

    def extract_one(self, input: Document) -> Set[Link]:
        if input.page_content == TEXT1:
            keywords = {"a", "b"}
        elif input.page_content == TEXT2:
            keywords = {"b", "c"}
        else:
            keywords = set()

        return {Link.bidir(kind="fakekw", tag=keyword) for keyword in keywords}
|
||||||
|
|
||||||
|
|
||||||
|
class FakeHyperlinkExtractor(LinkExtractor[Document]):
    """Test double that emits fixed hyperlink links for the two known texts."""

    def extract_one(self, input: Document) -> Set[Link]:
        # Guard-clause style: return early for each known text, raise otherwise.
        if input.page_content == TEXT1:
            return {
                Link.incoming(kind="fakehref", tag="http://text1"),
                Link.outgoing(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        if input.page_content == TEXT2:
            return {
                Link.incoming(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        raise ValueError(
            f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def test_one_extractor() -> None:
    """A single keyword extractor adds its links to each transformed document."""
    transformer = LinkExtractorTransformer([FakeKeywordExtractor()])
    docs = [Document(TEXT1), Document(TEXT2)]
    results = transformer.transform_documents(docs)

    expected_first = {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
    }
    expected_second = {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
    }
    assert set(get_links(results[0])) == expected_first
    assert set(get_links(results[1])) == expected_second
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_extractors() -> None:
    """Links from every configured extractor are merged onto each document."""
    transformer = LinkExtractorTransformer(
        [FakeKeywordExtractor(), FakeHyperlinkExtractor()]
    )

    docs = [Document(TEXT1), Document(TEXT2)]
    results = transformer.transform_documents(docs)

    expected_first = {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
        Link.incoming(kind="fakehref", tag="http://text1"),
        Link.outgoing(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }
    assert set(get_links(results[0])) == expected_first

    expected_second = {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
        Link.incoming(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }
    assert set(get_links(results[1])) == expected_second
|
@ -12,7 +12,7 @@ class Link:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
kind: str
|
kind: str
|
||||||
"""The kind of link. Allows different extractors to use the same tag name without
|
"""The kind of link. Allows different extractors to use the same tag name without
|
||||||
creating collisions between extractors. For example “keyword” vs “url”."""
|
creating collisions between extractors. For example “keyword” vs “url”."""
|
||||||
direction: Literal["in", "out", "bidir"]
|
direction: Literal["in", "out", "bidir"]
|
||||||
"""The direction of the link."""
|
"""The direction of the link."""
|
||||||
@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
|
|||||||
links_in_metadata.extend(link)
|
links_in_metadata.extend(link)
|
||||||
else:
|
else:
|
||||||
links_in_metadata.append(link)
|
links_in_metadata.append(link)
|
||||||
|
|
||||||
|
|
||||||
|
def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
    """Return a document with the given links added.

    Args:
        doc: The document to add the links to.
        *links: The links to add to the document.

    Returns:
        A document with a shallow-copy of the metadata with the links added.
    """
    # Start from the document's existing links, then fold in the new ones.
    combined = set(get_links(doc))
    for item in links:
        if isinstance(item, Iterable):
            combined.update(item)
        else:
            combined.add(item)

    merged_metadata = {**doc.metadata, METADATA_LINKS_KEY: list(combined)}
    return Document(page_content=doc.page_content, metadata=merged_metadata)
|
||||||
|
Loading…
Reference in New Issue
Block a user