mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 08:33:49 +00:00
community[minor]: GLiNER link extraction (#24314)
- **Description:** This allows extracting links between documents with common named entities using [GLiNER](https://github.com/urchade/GLiNER). - **Issue:** None - **Dependencies:** None --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
b5acb91080
commit
83f3d95ffa
@ -25,6 +25,7 @@ fireworks-ai>=0.9.0,<0.10
|
||||
friendli-client>=1.2.4,<2
|
||||
geopandas>=0.13.1
|
||||
gitpython>=3.1.32,<4
|
||||
gliner>=0.2.7
|
||||
google-cloud-documentai>=2.20.1,<3
|
||||
gql>=3.4.1,<4
|
||||
gradientai>=1.4.0,<2
|
||||
|
@ -1,3 +1,7 @@
|
||||
from langchain_community.graph_vectorstores.extractors.gliner_link_extractor import (
|
||||
GLiNERInput,
|
||||
GLiNERLinkExtractor,
|
||||
)
|
||||
from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
|
||||
HierarchyInput,
|
||||
HierarchyLinkExtractor,
|
||||
@ -14,10 +18,12 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"LinkExtractor",
|
||||
"LinkExtractorAdapter",
|
||||
"GLiNERInput",
|
||||
"GLiNERLinkExtractor",
|
||||
"HierarchyInput",
|
||||
"HierarchyLinkExtractor",
|
||||
"HtmlInput",
|
||||
"HtmlLinkExtractor",
|
||||
"LinkExtractor",
|
||||
"LinkExtractorAdapter",
|
||||
]
|
||||
|
@ -0,0 +1,71 @@
|
||||
from typing import Any, Dict, Iterable, List, Optional, Set, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.graph_vectorstores.links import Link
|
||||
|
||||
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||
LinkExtractor,
|
||||
)
|
||||
|
||||
# `typing.TypeAlias` requires Python 3.10+ and the `type` statement requires
# Python 3.12+, so a plain assignment is used to stay compatible with Python 3.9.
|
||||
GLiNERInput = Union[str, Document]
|
||||
|
||||
|
||||
class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
    """Link documents with common named entities using GLiNER.

    See <https://github.com/urchade/GLiNER>.

    Every entity predicted for an input becomes a bidirectional link whose
    kind is ``"<kind>:<entity label>"`` and whose tag is the entity text, so
    inputs mentioning the same entity end up sharing a link.
    """

    def __init__(
        self,
        labels: List[str],
        *,
        kind: str = "entity",
        model: str = "urchade/gliner_mediumv2.1",
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract named-entity links using GLiNER.

        Example:

            .. code-block:: python

                extractor = GLiNERLinkExtractor(
                    labels=["Person", "Award", "Date", "Competitions", "Teams"]
                )
                results = extractor.extract_one("some long text...")

        Args:
            labels: List of kinds of entities to extract.
            kind: Kind of links to produce with this extractor. It is used as
                the prefix of each link kind (``"<kind>:<entity label>"``).
            model: GLiNER model to use.
            extract_kwargs: Keyword arguments to pass to GLiNER.

        Raises:
            ImportError: If the ``gliner`` package is not installed.
        """
        # Keep the try body to the import alone: an ImportError raised while
        # loading the model must surface as-is rather than being reported as
        # "gliner is not installed".
        try:
            from gliner import GLiNER
        except ImportError:
            raise ImportError(
                "gliner is required for GLiNERLinkExtractor. "
                "Please install it with `pip install gliner`."
            ) from None

        self._model = GLiNER.from_pretrained(model)

        # Store copies so later mutation of the caller's list/dict cannot
        # silently change what this extractor does.
        self._labels = list(labels)
        self._kind = kind
        self._extract_kwargs = dict(extract_kwargs) if extract_kwargs else {}

    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
        """Extract the entity links of a single input.

        Args:
            input: A plain string or a ``Document``.

        Returns:
            The set of links produced for ``input``.
        """
        # Delegate to the batch implementation with a one-element batch.
        return next(iter(self.extract_many([input])))

    def extract_many(
        self,
        inputs: Iterable[GLiNERInput],
    ) -> Iterable[Set[Link]]:
        """Extract the entity links of several inputs in one GLiNER batch.

        Args:
            inputs: Plain strings and/or ``Document`` objects; for anything
                that is not a ``str`` the ``page_content`` attribute is used.

        Yields:
            One set of bidirectional links per prediction list returned by
            the model (presumably one per input, in input order - confirm
            against the GLiNER documentation).
        """
        # The inputs are materialized because the model is called once for
        # the whole batch.
        strs = [i if isinstance(i, str) else i.page_content for i in inputs]
        for entities in self._model.batch_predict_entities(
            strs, self._labels, **self._extract_kwargs
        ):
            # A set comprehension collapses repeated mentions of the same
            # (label, text) pair into a single link.
            yield {
                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
                for e in entities
            }
|
@ -30,7 +30,9 @@ class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
|
||||
"""Extract links from a document hierarchy.
|
||||
|
||||
Example:
|
||||
.. code_block: python
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Given three paths (in this case, within the "Root" document):
|
||||
h1 = ["Root", "H1"]
|
||||
h1a = ["Root", "H1", "a"]
|
||||
|
@ -0,0 +1,43 @@
|
||||
import pytest
|
||||
from langchain_core.graph_vectorstores.links import Link
|
||||
|
||||
from langchain_community.graph_vectorstores.extractors import GLiNERLinkExtractor
|
||||
|
||||
PAGE_1 = """
|
||||
Cristiano Ronaldo dos Santos Aveiro (Portuguese pronunciation: [kɾiʃ'tjɐnu
|
||||
ʁɔ'naldu]; born 5 February 1985) is a Portuguese professional footballer who
|
||||
plays as a forward for and captains both Saudi Pro League club Al Nassr and the
|
||||
Portugal national team. Widely regarded as one of the greatest players of all
|
||||
time, Ronaldo has won five Ballon d'Or awards,[note 3] a record three UEFA Men's
|
||||
Player of the Year Awards, and four European Golden Shoes, the most by a
|
||||
European player. He has won 33 trophies in his career, including seven league
|
||||
titles, five UEFA Champions Leagues, the UEFA European Championship and the UEFA
|
||||
Nations League. Ronaldo holds the records for most appearances (183), goals
|
||||
(140) and assists (42) in the Champions League, goals in the European
|
||||
Championship (14), international goals (128) and international appearances
|
||||
(205). He is one of the few players to have made over 1,200 professional career
|
||||
appearances, the most by an outfield player, and has scored over 850 official
|
||||
senior career goals for club and country, making him the top goalscorer of all
|
||||
time.
|
||||
"""
|
||||
|
||||
|
||||
@pytest.mark.requires("gliner")
def test_one_from_keywords() -> None:
    """Check the links GLiNERLinkExtractor produces for the PAGE_1 sample."""
    entity_labels = ["Person", "Award", "Date", "Competitions", "Teams"]
    extractor = GLiNERLinkExtractor(labels=entity_labels)

    # (kind, tag) pairs of the links expected for PAGE_1; tags containing
    # "\n" are entities that span a line break in the sample text.
    expected_pairs = [
        ("entity:Person", "Cristiano Ronaldo dos Santos Aveiro"),
        ("entity:Award", "European Golden Shoes"),
        ("entity:Competitions", "European\nChampionship"),
        ("entity:Award", "UEFA Men's\nPlayer of the Year Awards"),
        ("entity:Date", "5 February 1985"),
        ("entity:Competitions", "UEFA Champions Leagues"),
        ("entity:Teams", "Portugal national team"),
        ("entity:Competitions", "UEFA European Championship"),
        ("entity:Competitions", "UEFA\nNations League"),
        ("entity:Award", "Ballon d'Or"),
    ]

    results = extractor.extract_one(PAGE_1)

    expected = {
        Link.bidir(kind=link_kind, tag=link_tag)
        for link_kind, link_tag in expected_pairs
    }
    assert results == expected
|
Loading…
Reference in New Issue
Block a user