mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-18 08:03:36 +00:00
community[minor]: GLiNER link extraction (#24314)
- **Description:** This allows extracting links between documents with common named entities using [GLiNER](https://github.com/urchade/GLiNER).
- **Issue:** None
- **Dependencies:** None

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
@@ -1,3 +1,7 @@
|
||||
from langchain_community.graph_vectorstores.extractors.gliner_link_extractor import (
|
||||
GLiNERInput,
|
||||
GLiNERLinkExtractor,
|
||||
)
|
||||
from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
|
||||
HierarchyInput,
|
||||
HierarchyLinkExtractor,
|
||||
@@ -14,10 +18,12 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
|
||||
)
|
||||
|
||||
# Names exported by this module, kept in alphabetical order with each
# name listed exactly once.
__all__ = [
    "GLiNERInput",
    "GLiNERLinkExtractor",
    "HierarchyInput",
    "HierarchyLinkExtractor",
    "HtmlInput",
    "HtmlLinkExtractor",
    "LinkExtractor",
    "LinkExtractorAdapter",
]
|
||||
|
@@ -0,0 +1,71 @@
|
||||
from typing import Any, Dict, Iterable, List, Optional, Set, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.graph_vectorstores.links import Link
|
||||
|
||||
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||
LinkExtractor,
|
||||
)
|
||||
|
||||
# `TypeAlias` is not available in Python 3.9, so we can use neither it nor the newer `type` statement.
|
||||
GLiNERInput = Union[str, Document]
|
||||
|
||||
|
||||
class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
    """Link documents with common named entities using GLiNER.

    See <https://github.com/urchade/GLiNER>.
    """

    def __init__(
        self,
        labels: List[str],
        *,
        kind: str = "entity",
        model: str = "urchade/gliner_mediumv2.1",
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract named entities using GLiNER.

        Example:

            .. code-block:: python

                extractor = GLiNERLinkExtractor(
                    labels=["Person", "Award", "Date", "Competitions", "Teams"]
                )
                results = extractor.extract_one("some long text...")

        Args:
            labels: List of kinds of entities to extract.
            kind: Kind of links to produce with this extractor.
            model: GLiNER model to use.
            extract_kwargs: Keyword arguments to pass to GLiNER.

        Raises:
            ImportError: If the ``gliner`` package cannot be imported.
        """
        # Guard only the import itself. Loading the model happens outside the
        # ``try`` so that an ImportError raised while loading (for example
        # from a missing transitive dependency) is not misreported as
        # "gliner is not installed" with its original cause suppressed.
        try:
            from gliner import GLiNER
        except ImportError:
            raise ImportError(
                "gliner is required for GLiNERLinkExtractor. "
                "Please install it with `pip install gliner`."
            ) from None

        self._model = GLiNER.from_pretrained(model)
        self._labels = labels
        self._kind = kind
        self._extract_kwargs = extract_kwargs or {}

    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
        """Extract the links for a single input.

        Args:
            input: A string or a ``Document``; for a ``Document`` the
                ``page_content`` is what gets analysed.

        Returns:
            The set of links produced for ``input``.
        """
        # Delegate to the batch path with a one-element batch and take its
        # only result.
        return next(iter(self.extract_many([input])))

    def extract_many(
        self,
        inputs: Iterable[GLiNERInput],
    ) -> Iterable[Set[Link]]:
        """Extract links for a batch of inputs.

        Args:
            inputs: Strings or ``Document`` objects; for a ``Document`` the
                ``page_content`` is what gets analysed.

        Yields:
            One set of bidirectional links per result returned by the model's
            ``batch_predict_entities``. Each link has kind
            ``"<kind>:<entity label>"`` and the entity text as its tag.
        """
        # Normalise every input to plain text; the whole batch is handed to
        # the model in a single call.
        strs = [i if isinstance(i, str) else i.page_content for i in inputs]
        for entities in self._model.batch_predict_entities(
            strs, self._labels, **self._extract_kwargs
        ):
            yield {
                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
                for e in entities
            }
|
@@ -30,7 +30,9 @@ class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
|
||||
"""Extract links from a document hierarchy.
|
||||
|
||||
Example:
|
||||
.. code_block: python
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Given three paths (in this case, within the "Root" document):
|
||||
h1 = ["Root", "H1"]
|
||||
h1a = ["Root", "H1", "a"]
|
||||
|
Reference in New Issue
Block a user