community[minor]: add document transformer for extracting links (#24186)

- **Description:** Add a DocumentTransformer for executing one or more
`LinkExtractor`s and adding the extracted links to each document.
- **Issue:** n/a
- **Dependencies:** none

---------

Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
This commit is contained in:
Ben Chambers
2024-07-22 19:01:21 -07:00
committed by GitHub
parent 3c4652c906
commit 5ac936a284
5 changed files with 174 additions and 8 deletions

View File

@@ -12,7 +12,7 @@ class Link:
"""
kind: str
"""The kind of link. Allows different extractors to use the same tag name without
"""The kind of link. Allows different extractors to use the same tag name without
creating collisions between extractors. For example “keyword” vs “url”."""
direction: Literal["in", "out", "bidir"]
"""The direction of the link."""
@@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
links_in_metadata.extend(link)
else:
links_in_metadata.append(link)
def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
    """Return a document with the given links added.

    Links already present on the document come first, followed by the new
    links in the order they were passed. Duplicate links are dropped, keeping
    the first occurrence, so the resulting list has a deterministic order.

    Args:
        doc: The document to add the links to.
        *links: The links to add to the document. Each argument is either a
            single link or an iterable of links.

    Returns:
        A document with a shallow-copy of the metadata with the links added.
    """
    # A dict serves as an insertion-ordered set: it de-duplicates like a set
    # but, unlike a set, yields a stable order that does not depend on hash
    # values (which vary between processes for string-bearing objects).
    merged = dict.fromkeys(get_links(doc))
    for link in links:
        if isinstance(link, Iterable):
            # Updating with keys that already exist keeps their original slot.
            merged.update(dict.fromkeys(link))
        else:
            merged[link] = None
    return Document(
        page_content=doc.page_content,
        metadata={
            **doc.metadata,
            METADATA_LINKS_KEY: list(merged),
        },
    )