mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-12 12:59:07 +00:00
community[minor]: add document transformer for extracting links (#24186)
- **Description:** Add a DocumentTransformer for executing one or more `LinkExtractor`s and adding the extracted links to each document. - **Issue:** n/a - **Depedencies:** none --------- Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
This commit is contained in:
@@ -12,7 +12,7 @@ class Link:
|
||||
"""
|
||||
|
||||
kind: str
|
||||
"""The kind of link. Allows different extractors to use the same tag name without
|
||||
"""The kind of link. Allows different extractors to use the same tag name without
|
||||
creating collisions between extractors. For example “keyword” vs “url”."""
|
||||
direction: Literal["in", "out", "bidir"]
|
||||
"""The direction of the link."""
|
||||
@@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
|
||||
links_in_metadata.extend(link)
|
||||
else:
|
||||
links_in_metadata.append(link)
|
||||
|
||||
|
||||
def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
|
||||
"""Return a document with the given links added.
|
||||
|
||||
Args:
|
||||
doc: The document to add the links to.
|
||||
*links: The links to add to the document.
|
||||
|
||||
Returns:
|
||||
A document with a shallow-copy of the metadata with the links added.
|
||||
"""
|
||||
new_links = set(get_links(doc))
|
||||
for link in links:
|
||||
if isinstance(link, Iterable):
|
||||
new_links.update(link)
|
||||
else:
|
||||
new_links.add(link)
|
||||
|
||||
return Document(
|
||||
page_content=doc.page_content,
|
||||
metadata={
|
||||
**doc.metadata,
|
||||
METADATA_LINKS_KEY: list(new_links),
|
||||
},
|
||||
)
|
||||
|
Reference in New Issue
Block a user