community[minor]: GLiNER link extraction (#24314)

- **Description:** This allows extracting links between documents with
common named entities using [GLiNER](https://github.com/urchade/GLiNER).
- **Issue:** None
- **Dependencies:** None

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Ben Chambers
2024-07-19 08:34:54 -07:00
committed by GitHub
parent b5acb91080
commit 83f3d95ffa
5 changed files with 126 additions and 3 deletions

View File

@@ -1,3 +1,7 @@
from langchain_community.graph_vectorstores.extractors.gliner_link_extractor import (
GLiNERInput,
GLiNERLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
HierarchyInput,
HierarchyLinkExtractor,
@@ -14,10 +18,12 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
)
# Names exported when this package is star-imported.
# NOTE(review): "LinkExtractor" and "LinkExtractorAdapter" appear twice in this
# listing. This looks like the diff view showing both the removed entries and
# their re-added (alphabetically sorted) positions; confirm against the real
# file that each name is listed only once.
__all__ = [
"LinkExtractor",
"LinkExtractorAdapter",
"GLiNERInput",
"GLiNERLinkExtractor",
"HierarchyInput",
"HierarchyLinkExtractor",
"HtmlInput",
"HtmlLinkExtractor",
"LinkExtractor",
"LinkExtractorAdapter",
]

View File

@@ -0,0 +1,71 @@
from typing import Any, Dict, Iterable, List, Optional, Set, Union
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
# Input accepted by the GLiNER extractor: either raw text, or a `Document`
# whose `page_content` is used as the text to analyse.
GLiNERInput = Union[str, Document]
class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
    """Link documents with common named entities using GLiNER.

    See <https://github.com/urchade/GLiNER>.
    """

    def __init__(
        self,
        labels: List[str],
        *,
        kind: str = "entity",
        model: str = "urchade/gliner_mediumv2.1",
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract named entities using GLiNER.

        Example:

            .. code-block:: python

                extractor = GLiNERLinkExtractor(
                    labels=["Person", "Award", "Date", "Competitions", "Teams"]
                )
                results = extractor.extract_one("some long text...")

        Args:
            labels: List of kinds of entities to extract.
            kind: Kind of links to produce with this extractor.
            model: GLiNER model to use.
            extract_kwargs: Keyword arguments to pass to GLiNER.

        Raises:
            ImportError: If the ``gliner`` package cannot be imported.
        """
        # Keep the ``try`` body limited to the import itself: the handler
        # below replaces the original exception (``from None``), so anything
        # else inside the ``try`` that raised ImportError would be misreported
        # as "gliner is not installed".
        try:
            from gliner import GLiNER
        except ImportError:
            raise ImportError(
                "gliner is required for GLiNERLinkExtractor. "
                "Please install it with `pip install gliner`."
            ) from None

        # Errors raised while loading the model propagate unchanged.
        self._model = GLiNER.from_pretrained(model)
        self._labels = labels
        self._kind = kind
        # Normalise ``None`` to an empty dict so it can always be ``**``-expanded.
        self._extract_kwargs = extract_kwargs or {}

    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
        """Extract the entity links for a single text or document.

        Args:
            input: Text, or a document whose ``page_content`` is analysed.

        Returns:
            The set of links produced for ``input``.
        """
        # Delegate to the batch path with a one-element batch and take the
        # single result it yields.
        return next(iter(self.extract_many([input])))

    def extract_many(
        self,
        inputs: Iterable[GLiNERInput],
    ) -> Iterable[Set[Link]]:
        """Extract entity links for a batch of texts or documents.

        Args:
            inputs: Texts and/or documents; for documents the ``page_content``
                is analysed.

        Yields:
            One set of bidirectional links per result returned by the model.
            Each link has kind ``"<kind>:<entity label>"`` and the entity text
            as its tag.
        """
        # The model is called once for the whole batch, so the inputs are
        # materialised into a list of plain strings first.
        texts = [
            item if isinstance(item, str) else item.page_content for item in inputs
        ]
        if not texts:
            # Nothing to analyse; skip the model call entirely.
            return

        for entities in self._model.batch_predict_entities(
            texts, self._labels, **self._extract_kwargs
        ):
            # A set is used, so repeated mentions of the same entity within
            # one input collapse into a single link.
            yield {
                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
                for e in entities
            }

View File

@@ -30,7 +30,9 @@ class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
"""Extract links from a document hierarchy.
Example:
.. code_block: python
.. code-block:: python
# Given three paths (in this case, within the "Root" document):
h1 = ["Root", "H1"]
h1a = ["Root", "H1", "a"]