community[minor]: Add keybert-based link extractor (#24311)

- **Description:** Add a `KeybertLinkExtractor` for graph vectorstores. This allows extracting links from keywords in a Document and linking nodes that have common keywords.
- **Issue:** None
- **Dependencies:** None.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: ccurme <chester.curme@gmail.com>
2025-09-18 08:03:36 +00:00 · 2024-07-19 09:25:07 -07:00
parent ef049769f0
commit 3691701d58
4 changed files with 146 additions and 0 deletions
--- a/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py
+++ b/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py
@@ -10,6 +10,10 @@ from langchain_community.graph_vectorstores.extractors.html_link_extractor impor
    HtmlInput,
    HtmlLinkExtractor,
 )
+from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
+    KeybertInput,
+    KeybertLinkExtractor,
+)
 from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
 )
@@ -24,6 +28,10 @@ __all__ = [
    "HierarchyLinkExtractor",
    "HtmlInput",
    "HtmlLinkExtractor",
+    "KeybertInput",
+    "KeybertLinkExtractor",
+    "LinkExtractor",
    "LinkExtractor",
    "LinkExtractorAdapter",
+    "LinkExtractorAdapter",
 ]
--- a/libs/community/langchain_community/graph_vectorstores/extractors/keybert_link_extractor.py
+++ b/libs/community/langchain_community/graph_vectorstores/extractors/keybert_link_extractor.py
@@ -0,0 +1,73 @@
+from typing import Any, Dict, Iterable, Optional, Set, Union
+
+from langchain_core.documents import Document
+from langchain_core.graph_vectorstores.links import Link
+
+from langchain_community.graph_vectorstores.extractors.link_extractor import (
+    LinkExtractor,
+)
+
+# Input accepted by KeybertLinkExtractor: either raw text or a Document,
+# in which case its ``page_content`` is the text that gets analysed.
+KeybertInput = Union[str, Document]
+
+
+class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
+    """Link extractor that turns KeyBERT keywords into bidirectional links.
+
+    Each keyword found in an input becomes a ``Link.bidir`` of the configured
+    kind whose tag is the keyword text.
+    """
+
+    def __init__(
+        self,
+        *,
+        kind: str = "kw",
+        embedding_model: str = "all-MiniLM-L6-v2",
+        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """Extract keywords using KeyBERT <https://maartengr.github.io/KeyBERT/>.
+
+        Example:
+
+            .. code-block:: python
+
+                extractor = KeybertLinkExtractor()
+
+                results = extractor.extract_one(PAGE_1)
+
+        Args:
+            kind: Kind of links to produce with this extractor.
+            embedding_model: Name of the embedding model to use with KeyBERT.
+            extract_keywords_kwargs: Keyword arguments to pass to KeyBERT's
+                `extract_keywords` method.
+
+        Raises:
+            ImportError: If the ``keybert`` package is not installed.
+        """
+        try:
+            import keybert
+        except ImportError:
+            raise ImportError(
+                "keybert is required for KeybertLinkExtractor. "
+                "Please install it with `pip install keybert`."
+            ) from None
+
+        # Build the model outside the ``try``: an ImportError raised while the
+        # embedding model is being set up must surface unchanged instead of
+        # being reported as "keybert is not installed".
+        self._kw_model = keybert.KeyBERT(model=embedding_model)
+        self._kind = kind
+        self._extract_keywords_kwargs = extract_keywords_kwargs or {}
+
+    @staticmethod
+    def _text_of(item: KeybertInput) -> str:
+        """Return the raw text of ``item`` (a string or a Document)."""
+        return item if isinstance(item, str) else item.page_content
+
+    def _links_from_keywords(self, keywords: Iterable[Any]) -> Set[Link]:
+        """Build one bidirectional link per keyword entry.
+
+        Only the first element of each entry is used as the link tag; KeyBERT
+        documents its entries as ``(keyword, score)`` pairs.
+        """
+        return {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}
+
+    def extract_one(self, input: KeybertInput) -> Set[Link]:  # noqa: A002
+        """Extract keyword links from a single input.
+
+        Args:
+            input: Text, or a Document whose ``page_content`` is used.
+
+        Returns:
+            The set of links built from the keywords KeyBERT returns.
+        """
+        keywords = self._kw_model.extract_keywords(
+            self._text_of(input),
+            **self._extract_keywords_kwargs,
+        )
+        return self._links_from_keywords(keywords)
+
+    def extract_many(
+        self,
+        inputs: Iterable[KeybertInput],
+    ) -> Iterable[Set[Link]]:
+        """Extract keyword links from several inputs with one KeyBERT call.
+
+        Args:
+            inputs: Texts and/or Documents to analyse.
+
+        Yields:
+            One set of links per input, in input order. Nothing is yielded
+            for an empty ``inputs``.
+        """
+        items = list(inputs)
+        if not items:
+            return
+
+        if len(items) == 1:
+            # Even though we would pass a list, if it contains one item,
+            # keybert will flatten the result. It is easier to just call the
+            # special case for one item.
+            yield self.extract_one(items[0])
+            return
+
+        texts = [self._text_of(item) for item in items]
+        extracted = self._kw_model.extract_keywords(
+            texts, **self._extract_keywords_kwargs
+        )
+        for keywords in extracted:
+            yield self._links_from_keywords(keywords)