mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 08:58:48 +00:00
community: Add docstring for GLiNERLinkExtractor (#26218)
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
e235a572a0
commit
56580b5fff
@ -14,7 +14,117 @@ GLiNERInput = Union[str, Document]
|
|||||||
|
|
||||||
@beta()
|
@beta()
|
||||||
class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
|
class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
|
||||||
"""Link documents with common named entities using GLiNER <https://github.com/urchade/GLiNER>."""
|
"""Link documents with common named entities using `GLiNER`_.
|
||||||
|
|
||||||
|
`GLiNER`_ is a Named Entity Recognition (NER) model capable of identifying any
|
||||||
|
entity type using a bidirectional transformer encoder (BERT-like).
|
||||||
|
|
||||||
|
The ``GLiNERLinkExtractor`` uses GLiNER to create links between documents that
|
||||||
|
have named entities in common.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
extractor = GLiNERLinkExtractor(
|
||||||
|
labels=["Person", "Award", "Date", "Competitions", "Teams"]
|
||||||
|
)
|
||||||
|
results = extractor.extract_one("some long text...")
|
||||||
|
|
||||||
|
.. _GLiNER: https://github.com/urchade/GLiNER
|
||||||
|
|
||||||
|
.. seealso::
|
||||||
|
|
||||||
|
- :mod:`How to use a graph vector store <langchain_community.graph_vectorstores>`
|
||||||
|
- :class:`How to create links between documents <langchain_core.graph_vectorstores.links.Link>`
|
||||||
|
|
||||||
|
How to link Documents on common named entities
|
||||||
|
==============================================
|
||||||
|
|
||||||
|
Preliminaries
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Install the ``gliner`` package:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
pip install -q langchain_community gliner
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
|
||||||
|
We load the ``state_of_the_union.txt`` file, chunk it, then for each chunk we
|
||||||
|
extract named entity links and add them to the chunk.
|
||||||
|
|
||||||
|
Using extract_one()
|
||||||
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
We can use :meth:`extract_one` on a document to get the links and add the links
|
||||||
|
to the document metadata with
|
||||||
|
:func:`~langchain_core.graph_vectorstores.links.add_links`::
|
||||||
|
|
||||||
|
from langchain_community.document_loaders import TextLoader
|
||||||
|
from langchain_community.graph_vectorstores import CassandraGraphVectorStore
|
||||||
|
from langchain_community.graph_vectorstores.extractors import GLiNERLinkExtractor
|
||||||
|
from langchain_core.graph_vectorstores.links import add_links
|
||||||
|
from langchain_text_splitters import CharacterTextSplitter
|
||||||
|
|
||||||
|
loader = TextLoader("state_of_the_union.txt")
|
||||||
|
raw_documents = loader.load()
|
||||||
|
|
||||||
|
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
||||||
|
documents = text_splitter.split_documents(raw_documents)
|
||||||
|
|
||||||
|
ner_extractor = GLiNERLinkExtractor(["Person", "Topic"])
|
||||||
|
for document in documents:
|
||||||
|
links = ner_extractor.extract_one(document)
|
||||||
|
add_links(document, links)
|
||||||
|
|
||||||
|
print(documents[0].metadata)
|
||||||
|
|
||||||
|
.. code-block:: output
|
||||||
|
|
||||||
|
{'source': 'state_of_the_union.txt', 'links': [Link(kind='entity:Person', direction='bidir', tag='President Zelenskyy'), Link(kind='entity:Person', direction='bidir', tag='Vladimir Putin')]}
|
||||||
|
|
||||||
|
Using LinkExtractorTransformer
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Using the :class:`~langchain_community.graph_vectorstores.extractors.link_extractor_transformer.LinkExtractorTransformer`,
|
||||||
|
we can simplify the link extraction::
|
||||||
|
|
||||||
|
from langchain_community.document_loaders import TextLoader
|
||||||
|
from langchain_community.graph_vectorstores.extractors import (
|
||||||
|
GLiNERLinkExtractor,
|
||||||
|
LinkExtractorTransformer,
|
||||||
|
)
|
||||||
|
from langchain_text_splitters import CharacterTextSplitter
|
||||||
|
|
||||||
|
loader = TextLoader("state_of_the_union.txt")
|
||||||
|
raw_documents = loader.load()
|
||||||
|
|
||||||
|
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
||||||
|
documents = text_splitter.split_documents(raw_documents)
|
||||||
|
|
||||||
|
ner_extractor = GLiNERLinkExtractor(["Person", "Topic"])
|
||||||
|
transformer = LinkExtractorTransformer([ner_extractor])
|
||||||
|
documents = transformer.transform_documents(documents)
|
||||||
|
|
||||||
|
print(documents[0].metadata)
|
||||||
|
|
||||||
|
.. code-block:: output
|
||||||
|
|
||||||
|
{'source': 'state_of_the_union.txt', 'links': [Link(kind='entity:Person', direction='bidir', tag='President Zelenskyy'), Link(kind='entity:Person', direction='bidir', tag='Vladimir Putin')]}
|
||||||
|
|
||||||
|
The documents with named entity links can then be added to a :class:`~langchain_core.graph_vectorstores.base.GraphVectorStore`::
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores import CassandraGraphVectorStore
|
||||||
|
|
||||||
|
store = CassandraGraphVectorStore.from_documents(documents=documents, embedding=...)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
labels: List of kinds of entities to extract.
|
||||||
|
kind: Kind of links to produce with this extractor.
|
||||||
|
model: GLiNER model to use.
|
||||||
|
extract_kwargs: Keyword arguments to pass to GLiNER.
|
||||||
|
""" # noqa: E501
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -24,23 +134,6 @@ class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
|
|||||||
model: str = "urchade/gliner_mediumv2.1",
|
model: str = "urchade/gliner_mediumv2.1",
|
||||||
extract_kwargs: Optional[Dict[str, Any]] = None,
|
extract_kwargs: Optional[Dict[str, Any]] = None,
|
||||||
):
|
):
|
||||||
"""Extract keywords using GLiNER.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
extractor = GLiNERLinkExtractor(
|
|
||||||
labels=["Person", "Award", "Date", "Competitions", "Teams"]
|
|
||||||
)
|
|
||||||
results = extractor.extract_one("some long text...")
|
|
||||||
|
|
||||||
Args:
|
|
||||||
labels: List of kinds of entities to extract.
|
|
||||||
kind: Kind of links to produce with this extractor.
|
|
||||||
model: GLiNER model to use.
|
|
||||||
extract_kwargs: Keyword arguments to pass to GLiNER.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
from gliner import GLiNER
|
from gliner import GLiNER
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user