mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 16:43:35 +00:00
community[minor]: GLiNER link extraction (#24314)
- **Description:** This allows extracting links between documents with common named entities using [GLiNER](https://github.com/urchade/GLiNER).
- **Issue:** None
- **Dependencies:** None

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
b5acb91080
commit
83f3d95ffa
@ -25,6 +25,7 @@ fireworks-ai>=0.9.0,<0.10
|
|||||||
friendli-client>=1.2.4,<2
|
friendli-client>=1.2.4,<2
|
||||||
geopandas>=0.13.1
|
geopandas>=0.13.1
|
||||||
gitpython>=3.1.32,<4
|
gitpython>=3.1.32,<4
|
||||||
|
gliner>=0.2.7
|
||||||
google-cloud-documentai>=2.20.1,<3
|
google-cloud-documentai>=2.20.1,<3
|
||||||
gql>=3.4.1,<4
|
gql>=3.4.1,<4
|
||||||
gradientai>=1.4.0,<2
|
gradientai>=1.4.0,<2
|
||||||
|
@ -1,3 +1,7 @@
|
|||||||
|
from langchain_community.graph_vectorstores.extractors.gliner_link_extractor import (
|
||||||
|
GLiNERInput,
|
||||||
|
GLiNERLinkExtractor,
|
||||||
|
)
|
||||||
from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
|
from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
|
||||||
HierarchyInput,
|
HierarchyInput,
|
||||||
HierarchyLinkExtractor,
|
HierarchyLinkExtractor,
|
||||||
@ -14,10 +18,12 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
|
|||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"LinkExtractor",
|
"GLiNERInput",
|
||||||
"LinkExtractorAdapter",
|
"GLiNERLinkExtractor",
|
||||||
"HierarchyInput",
|
"HierarchyInput",
|
||||||
"HierarchyLinkExtractor",
|
"HierarchyLinkExtractor",
|
||||||
"HtmlInput",
|
"HtmlInput",
|
||||||
"HtmlLinkExtractor",
|
"HtmlLinkExtractor",
|
||||||
|
"LinkExtractor",
|
||||||
|
"LinkExtractorAdapter",
|
||||||
]
|
]
|
||||||
|
@ -0,0 +1,71 @@
|
|||||||
|
from typing import Any, Dict, Iterable, List, Optional, Set, Union
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.graph_vectorstores.links import Link
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||||
|
LinkExtractor,
|
||||||
|
)
|
||||||
|
|
||||||
|
# `TypeAlias` is not available in Python 3.9, so we can use neither it nor the
# newer `type` statement; a plain assignment is used instead.
|
||||||
|
GLiNERInput = Union[str, Document]
|
||||||
|
|
||||||
|
|
||||||
|
class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
    """Link documents with common named entities using GLiNER.

    See <https://github.com/urchade/GLiNER>.
    """

    def __init__(
        self,
        labels: List[str],
        *,
        kind: str = "entity",
        model: str = "urchade/gliner_mediumv2.1",
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract named entities using GLiNER.

        Example:

        .. code-block:: python

            extractor = GLiNERLinkExtractor(
                labels=["Person", "Award", "Date", "Competitions", "Teams"]
            )
            results = extractor.extract_one("some long text...")

        Args:
            labels: List of kinds of entities to extract.
            kind: Kind of links to produce with this extractor. It is used as
                the prefix of each link kind (``"<kind>:<label>"``).
            model: GLiNER model to use.
            extract_kwargs: Keyword arguments to pass to GLiNER.

        Raises:
            ImportError: If the ``gliner`` package cannot be imported.
        """
        # Keep the ``try`` limited to the import itself so that only a missing
        # ``gliner`` package is reported as "please install gliner".
        try:
            from gliner import GLiNER
        except ImportError:
            raise ImportError(
                "gliner is required for GLiNERLinkExtractor. "
                "Please install it with `pip install gliner`."
            ) from None

        # Loading happens outside the ``try``: an error raised while loading
        # the model must surface unchanged instead of being rewritten into a
        # misleading installation hint.
        self._model = GLiNER.from_pretrained(model)

        self._labels = labels
        self._kind = kind
        self._extract_kwargs = extract_kwargs or {}

    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
        """Extract the links for a single input.

        Args:
            input: Text, or a document whose ``page_content`` is used as text.

        Returns:
            The set of links produced for ``input``.
        """
        # Delegate to the batch path so both entry points share one code path.
        return next(iter(self.extract_many([input])))

    def extract_many(
        self,
        inputs: Iterable[GLiNERInput],
    ) -> Iterable[Set[Link]]:
        """Extract the links for each input, one set per input.

        Args:
            inputs: Texts, or documents whose ``page_content`` is used as text.

        Yields:
            For each batch result returned by the model, a set of
            bidirectional links with kind ``"<kind>:<label>"`` and the entity
            text as tag.
        """
        strs = [i if isinstance(i, str) else i.page_content for i in inputs]
        if not strs:
            # Nothing to predict: skip the model call for an empty batch.
            return

        for entities in self._model.batch_predict_entities(
            strs, self._labels, **self._extract_kwargs
        ):
            yield {
                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
                for e in entities
            }
|
@ -30,7 +30,9 @@ class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
|
|||||||
"""Extract links from a document hierarchy.
|
"""Extract links from a document hierarchy.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
.. code_block: python
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
# Given three paths (in this case, within the "Root" document):
|
# Given three paths (in this case, within the "Root" document):
|
||||||
h1 = ["Root", "H1"]
|
h1 = ["Root", "H1"]
|
||||||
h1a = ["Root", "H1", "a"]
|
h1a = ["Root", "H1", "a"]
|
||||||
|
@ -0,0 +1,43 @@
|
|||||||
|
import pytest
|
||||||
|
from langchain_core.graph_vectorstores.links import Link
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors import GLiNERLinkExtractor
|
||||||
|
|
||||||
|
PAGE_1 = """
|
||||||
|
Cristiano Ronaldo dos Santos Aveiro (Portuguese pronunciation: [kɾiʃ'tjɐnu
|
||||||
|
ʁɔ'naldu]; born 5 February 1985) is a Portuguese professional footballer who
|
||||||
|
plays as a forward for and captains both Saudi Pro League club Al Nassr and the
|
||||||
|
Portugal national team. Widely regarded as one of the greatest players of all
|
||||||
|
time, Ronaldo has won five Ballon d'Or awards,[note 3] a record three UEFA Men's
|
||||||
|
Player of the Year Awards, and four European Golden Shoes, the most by a
|
||||||
|
European player. He has won 33 trophies in his career, including seven league
|
||||||
|
titles, five UEFA Champions Leagues, the UEFA European Championship and the UEFA
|
||||||
|
Nations League. Ronaldo holds the records for most appearances (183), goals
|
||||||
|
(140) and assists (42) in the Champions League, goals in the European
|
||||||
|
Championship (14), international goals (128) and international appearances
|
||||||
|
(205). He is one of the few players to have made over 1,200 professional career
|
||||||
|
appearances, the most by an outfield player, and has scored over 850 official
|
||||||
|
senior career goals for club and country, making him the top goalscorer of all
|
||||||
|
time.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("gliner")
def test_one_from_keywords() -> None:
    """Check the links extracted from a single biography passage."""
    entity_labels = ["Person", "Award", "Date", "Competitions", "Teams"]
    link_extractor = GLiNERLinkExtractor(labels=entity_labels)

    # Some tags contain "\n" because the entity spans a line break in PAGE_1.
    expected_links = {
        # Person / Date / Teams
        Link.bidir(kind="entity:Person", tag="Cristiano Ronaldo dos Santos Aveiro"),
        Link.bidir(kind="entity:Date", tag="5 February 1985"),
        Link.bidir(kind="entity:Teams", tag="Portugal national team"),
        # Awards
        Link.bidir(kind="entity:Award", tag="Ballon d'Or"),
        Link.bidir(kind="entity:Award", tag="European Golden Shoes"),
        Link.bidir(kind="entity:Award", tag="UEFA Men's\nPlayer of the Year Awards"),
        # Competitions
        Link.bidir(kind="entity:Competitions", tag="European\nChampionship"),
        Link.bidir(kind="entity:Competitions", tag="UEFA Champions Leagues"),
        Link.bidir(kind="entity:Competitions", tag="UEFA European Championship"),
        Link.bidir(kind="entity:Competitions", tag="UEFA\nNations League"),
    }

    assert link_extractor.extract_one(PAGE_1) == expected_links
|
Loading…
Reference in New Issue
Block a user