community[minor]: GLiNER link extraction (#24314)

- **Description:** This allows extracting links between documents with common named entities using [GLiNER](https://github.com/urchade/GLiNER).
- **Issue:** None
- **Dependencies:** None

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
2025-09-18 08:03:36 +00:00 · 2024-07-19 08:34:54 -07:00
parent b5acb91080
commit 83f3d95ffa
5 changed files with 126 additions and 3 deletions
--- a/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py
+++ b/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py
@@ -1,3 +1,7 @@
+from langchain_community.graph_vectorstores.extractors.gliner_link_extractor import (
+    GLiNERInput,
+    GLiNERLinkExtractor,
+)
 from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
    HierarchyInput,
    HierarchyLinkExtractor,
@@ -14,10 +18,12 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
 )

 __all__ = [
-    "LinkExtractor",
-    "LinkExtractorAdapter",
+    "GLiNERInput",
+    "GLiNERLinkExtractor",
    "HierarchyInput",
    "HierarchyLinkExtractor",
    "HtmlInput",
    "HtmlLinkExtractor",
+    "LinkExtractor",
+    "LinkExtractorAdapter",
 ]
--- a/libs/community/langchain_community/graph_vectorstores/extractors/gliner_link_extractor.py
+++ b/libs/community/langchain_community/graph_vectorstores/extractors/gliner_link_extractor.py
@@ -0,0 +1,71 @@
+from typing import Any, Dict, Iterable, List, Optional, Set, Union
+
+from langchain_core.documents import Document
+from langchain_core.graph_vectorstores.links import Link
+
+from langchain_community.graph_vectorstores.extractors.link_extractor import (
+    LinkExtractor,
+)
+
+# Plain alias for Python 3.9: `TypeAlias` needs 3.10+, the `type` statement 3.12+.
+GLiNERInput = Union[str, Document]
+
+
+class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
+    """Link documents with common named entities using GLiNER.
+
+    See <https://github.com/urchade/GLiNER>.
+    """
+
+    def __init__(
+        self,
+        labels: List[str],
+        *,
+        kind: str = "entity",
+        model: str = "urchade/gliner_mediumv2.1",
+        extract_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """Extract named entities using GLiNER.
+
+        Example:
+
+            .. code-block:: python
+
+                extractor = GLiNERLinkExtractor(
+                    labels=["Person", "Award", "Date", "Competitions", "Teams"]
+                )
+                results = extractor.extract_one("some long text...")
+
+        Args:
+            labels: List of kinds of entities to extract.
+            kind: Kind of links to produce with this extractor. The entity
+                label is appended, so each link has kind ``"{kind}:{label}"``.
+            model: GLiNER model to use.
+            extract_kwargs: Keyword arguments to pass to GLiNER.
+
+        Raises:
+            ImportError: If the ``gliner`` package is not installed.
+        """
+        # Guard only the import itself: an ImportError raised while loading
+        # the model must surface unchanged rather than being reported as a
+        # missing `gliner` package.
+        try:
+            from gliner import GLiNER
+        except ImportError:
+            raise ImportError(
+                "gliner is required for GLiNERLinkExtractor. "
+                "Please install it with `pip install gliner`."
+            ) from None
+
+        self._model = GLiNER.from_pretrained(model)
+        self._labels = labels
+        self._kind = kind
+        self._extract_kwargs = extract_kwargs or {}
+
+    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
+        """Extract the entity links of a single input.
+
+        Args:
+            input: Text, or a document whose ``page_content`` is used.
+
+        Returns:
+            The set of links extracted from the input.
+        """
+        # Delegate to the batch path with a one-element batch and take its
+        # only result.
+        return next(iter(self.extract_many([input])))
+
+    def extract_many(
+        self,
+        inputs: Iterable[GLiNERInput],
+    ) -> Iterable[Set[Link]]:
+        """Extract the entity links of several inputs in one GLiNER batch.
+
+        Args:
+            inputs: Texts, or documents whose ``page_content`` is used.
+
+        Yields:
+            One set of bidirectional links per result returned by the model.
+            Each link has kind ``"{kind}:{label}"`` and the entity text as
+            its tag.
+        """
+        # Documents are reduced to their text; plain strings pass through.
+        texts = [i if isinstance(i, str) else i.page_content for i in inputs]
+        for entities in self._model.batch_predict_entities(
+            texts, self._labels, **self._extract_kwargs
+        ):
+            yield {
+                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
+                for e in entities
+            }
--- a/libs/community/langchain_community/graph_vectorstores/extractors/hierarchy_link_extractor.py
+++ b/libs/community/langchain_community/graph_vectorstores/extractors/hierarchy_link_extractor.py
@@ -30,7 +30,9 @@ class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
        """Extract links from a document hierarchy.

        Example:
-            .. code_block: python
+
+            .. code-block:: python
+
                # Given three paths (in this case, within the "Root" document):
                h1 = ["Root", "H1"]
                h1a = ["Root", "H1", "a"]