community[minor]: GLiNER link extraction (#24314)

- **Description:** This allows extracting links between documents with
common named entities using [GLiNER](https://github.com/urchade/GLiNER).
- **Issue:** None
- **Dependencies:** None

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Ben Chambers 2024-07-19 08:34:54 -07:00 committed by GitHub
parent b5acb91080
commit 83f3d95ffa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 126 additions and 3 deletions

View File

@ -25,6 +25,7 @@ fireworks-ai>=0.9.0,<0.10
friendli-client>=1.2.4,<2
geopandas>=0.13.1
gitpython>=3.1.32,<4
gliner>=0.2.7
google-cloud-documentai>=2.20.1,<3
gql>=3.4.1,<4
gradientai>=1.4.0,<2

View File

@ -1,3 +1,7 @@
from langchain_community.graph_vectorstores.extractors.gliner_link_extractor import (
GLiNERInput,
GLiNERLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
HierarchyInput,
HierarchyLinkExtractor,
@ -14,10 +18,12 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
)
__all__ = [
"LinkExtractor",
"LinkExtractorAdapter",
"GLiNERInput",
"GLiNERLinkExtractor",
"HierarchyInput",
"HierarchyLinkExtractor",
"HtmlInput",
"HtmlLinkExtractor",
"LinkExtractor",
"LinkExtractorAdapter",
]

View File

@ -0,0 +1,71 @@
from typing import Any, Dict, Iterable, List, Optional, Set, Union
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
# Inputs accepted by GLiNERLinkExtractor: raw text, or a Document whose
# `page_content` is used as the text.
# `typing.TypeAlias` is not available in Python 3.9 and the `type` statement is
# newer still, so the alias is written as a plain assignment.
GLiNERInput = Union[str, Document]
class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
    """Link documents with common named entities using GLiNER.

    See <https://github.com/urchade/GLiNER>.

    Every entity found in an input becomes a bidirectional link whose kind is
    ``"<kind>:<entity label>"`` and whose tag is the entity text, so inputs
    that mention the same entity end up carrying the same link.
    """

    def __init__(
        self,
        labels: List[str],
        *,
        kind: str = "entity",
        model: str = "urchade/gliner_mediumv2.1",
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract named entities using GLiNER.

        Example:

            .. code-block:: python

                extractor = GLiNERLinkExtractor(
                    labels=["Person", "Award", "Date", "Competitions", "Teams"]
                )
                results = extractor.extract_one("some long text...")

        Args:
            labels: List of kinds of entities to extract.
            kind: Kind of links to produce with this extractor.
            model: GLiNER model to use.
            extract_kwargs: Keyword arguments to pass to GLiNER.

        Raises:
            ImportError: If the ``gliner`` package is not installed.
        """
        # Guard only the import. Loading the model stays outside the `try` so
        # that an ImportError raised while loading is not misreported as
        # "gliner is not installed" with its real cause suppressed.
        try:
            from gliner import GLiNER
        except ImportError:
            raise ImportError(
                "gliner is required for GLiNERLinkExtractor. "
                "Please install it with `pip install gliner`."
            ) from None

        self._model = GLiNER.from_pretrained(model)
        self._labels = labels
        self._kind = kind
        self._extract_kwargs = extract_kwargs or {}

    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
        """Return the links for a single input.

        Args:
            input: Text, or a Document whose ``page_content`` is the text.

        Returns:
            The set of links for the entities found in ``input``.
        """
        # Reuse the batch path with a one-element batch and take its result.
        return next(iter(self.extract_many([input])))

    def extract_many(
        self,
        inputs: Iterable[GLiNERInput],
    ) -> Iterable[Set[Link]]:
        """Yield one set of links per input, in input order.

        Args:
            inputs: Texts, or Documents whose ``page_content`` is the text.

        Yields:
            For each input, the bidirectional links for its entities.
        """
        texts = [i if isinstance(i, str) else i.page_content for i in inputs]
        if not texts:
            # Nothing to predict; skip the model call for an empty batch.
            return

        for entities in self._model.batch_predict_entities(
            texts, self._labels, **self._extract_kwargs
        ):
            yield {
                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
                for e in entities
            }

View File

@ -30,7 +30,9 @@ class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
"""Extract links from a document hierarchy.
Example:
.. code_block: python
.. code-block:: python
# Given three paths (in this case, within the "Root" document):
h1 = ["Root", "H1"]
h1a = ["Root", "H1", "a"]

View File

@ -0,0 +1,43 @@
import pytest
from langchain_core.graph_vectorstores.links import Link
from langchain_community.graph_vectorstores.extractors import GLiNERLinkExtractor
# Sample text passed to `extract_one` in `test_one_from_keywords`. The line
# breaks are significant: several expected tags in that test contain "\n"
# where an entity spans two lines of this text.
PAGE_1 = """
Cristiano Ronaldo dos Santos Aveiro (Portuguese pronunciation: [kɾiʃ'tjɐnu
ʁɔ'naldu]; born 5 February 1985) is a Portuguese professional footballer who
plays as a forward for and captains both Saudi Pro League club Al Nassr and the
Portugal national team. Widely regarded as one of the greatest players of all
time, Ronaldo has won five Ballon d'Or awards,[note 3] a record three UEFA Men's
Player of the Year Awards, and four European Golden Shoes, the most by a
European player. He has won 33 trophies in his career, including seven league
titles, five UEFA Champions Leagues, the UEFA European Championship and the UEFA
Nations League. Ronaldo holds the records for most appearances (183), goals
(140) and assists (42) in the Champions League, goals in the European
Championship (14), international goals (128) and international appearances
(205). He is one of the few players to have made over 1,200 professional career
appearances, the most by an outfield player, and has scored over 850 official
senior career goals for club and country, making him the top goalscorer of all
time.
"""
@pytest.mark.requires("gliner")
def test_one_from_keywords() -> None:
    """Check the exact set of links extracted from the PAGE_1 sample text."""
    entity_labels = ["Person", "Award", "Date", "Competitions", "Teams"]
    actual_links = GLiNERLinkExtractor(labels=entity_labels).extract_one(PAGE_1)

    # Expected links, grouped by entity kind. Tags containing "\n" are
    # entities that span a line break in PAGE_1.
    person_and_date = {
        Link.bidir(kind="entity:Person", tag="Cristiano Ronaldo dos Santos Aveiro"),
        Link.bidir(kind="entity:Date", tag="5 February 1985"),
    }
    teams = {
        Link.bidir(kind="entity:Teams", tag="Portugal national team"),
    }
    awards = {
        Link.bidir(kind="entity:Award", tag="Ballon d'Or"),
        Link.bidir(kind="entity:Award", tag="European Golden Shoes"),
        Link.bidir(kind="entity:Award", tag="UEFA Men's\nPlayer of the Year Awards"),
    }
    competitions = {
        Link.bidir(kind="entity:Competitions", tag="European\nChampionship"),
        Link.bidir(kind="entity:Competitions", tag="UEFA Champions Leagues"),
        Link.bidir(kind="entity:Competitions", tag="UEFA European Championship"),
        Link.bidir(kind="entity:Competitions", tag="UEFA\nNations League"),
    }

    assert actual_links == person_and_date | teams | awards | competitions