From 56580b5fff81cf467a90072a8d54272def9049c0 Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Tue, 10 Sep 2024 02:27:23 +0200 Subject: [PATCH] community: Add docstring for GLiNERLinkExtractor (#26218) Co-authored-by: Erick Friis --- .../extractors/gliner_link_extractor.py | 129 +++++++++++++++--- 1 file changed, 111 insertions(+), 18 deletions(-) diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/gliner_link_extractor.py b/libs/community/langchain_community/graph_vectorstores/extractors/gliner_link_extractor.py index a485f849c7a..9d567a1f8d4 100644 --- a/libs/community/langchain_community/graph_vectorstores/extractors/gliner_link_extractor.py +++ b/libs/community/langchain_community/graph_vectorstores/extractors/gliner_link_extractor.py @@ -14,7 +14,117 @@ GLiNERInput = Union[str, Document] @beta() class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]): - """Link documents with common named entities using GLiNER <https://github.com/urchade/GLiNER>.""" + """Link documents with common named entities using `GLiNER`_. + + `GLiNER`_ is a Named Entity Recognition (NER) model capable of identifying any + entity type using a bidirectional transformer encoder (BERT-like). + + The ``GLiNERLinkExtractor`` uses GLiNER to create links between documents that + have named entities in common. + + Example:: + + extractor = GLiNERLinkExtractor( + labels=["Person", "Award", "Date", "Competitions", "Teams"] + ) + results = extractor.extract_one("some long text...") + + .. _GLiNER: https://github.com/urchade/GLiNER + + .. seealso:: + + - :mod:`How to use a graph vector store <langchain_community.graph_vectorstores>` + - :class:`How to create links between documents <langchain_core.graph_vectorstores.links.Link>` + + How to link Documents on common named entities + ============================================== + + Preliminaries + ------------- + + Install the ``gliner`` package: + + .. 
code-block:: bash + + pip install -q langchain_community gliner + + Usage + ----- + + We load the ``state_of_the_union.txt`` file, chunk it, then for each chunk we + extract named entity links and add them to the chunk. + + Using extract_one() + ^^^^^^^^^^^^^^^^^^^ + + We can use :meth:`extract_one` on a document to get the links and add the links + to the document metadata with + :func:`~langchain_core.graph_vectorstores.links.add_links`:: + + from langchain_community.document_loaders import TextLoader + from langchain_community.graph_vectorstores import CassandraGraphVectorStore + from langchain_community.graph_vectorstores.extractors import GLiNERLinkExtractor + from langchain_core.graph_vectorstores.links import add_links + from langchain_text_splitters import CharacterTextSplitter + + loader = TextLoader("state_of_the_union.txt") + raw_documents = loader.load() + + text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) + documents = text_splitter.split_documents(raw_documents) + + ner_extractor = GLiNERLinkExtractor(["Person", "Topic"]) + for document in documents: + links = ner_extractor.extract_one(document) + add_links(document, links) + + print(documents[0].metadata) + + .. 
code-block:: output + + {'source': 'state_of_the_union.txt', 'links': [Link(kind='entity:Person', direction='bidir', tag='President Zelenskyy'), Link(kind='entity:Person', direction='bidir', tag='Vladimir Putin')]} + + Using LinkExtractorTransformer + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + Using the :class:`~langchain_community.graph_vectorstores.extractors.link_extractor_transformer.LinkExtractorTransformer`, + we can simplify the link extraction:: + + from langchain_community.document_loaders import TextLoader + from langchain_community.graph_vectorstores.extractors import ( + GLiNERLinkExtractor, + LinkExtractorTransformer, + ) + from langchain_text_splitters import CharacterTextSplitter + + loader = TextLoader("state_of_the_union.txt") + raw_documents = loader.load() + + text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) + documents = text_splitter.split_documents(raw_documents) + + ner_extractor = GLiNERLinkExtractor(["Person", "Topic"]) + transformer = LinkExtractorTransformer([ner_extractor]) + documents = transformer.transform_documents(documents) + + print(documents[0].metadata) + + .. code-block:: output + + {'source': 'state_of_the_union.txt', 'links': [Link(kind='entity:Person', direction='bidir', tag='President Zelenskyy'), Link(kind='entity:Person', direction='bidir', tag='Vladimir Putin')]} + + The documents with named entity links can then be added to a :class:`~langchain_core.graph_vectorstores.base.GraphVectorStore`:: + + from langchain_community.graph_vectorstores import CassandraGraphVectorStore + + store = CassandraGraphVectorStore.from_documents(documents=documents, embedding=...) + + Args: + labels: List of kinds of entities to extract. + kind: Kind of links to produce with this extractor. + model: GLiNER model to use. + extract_kwargs: Keyword arguments to pass to GLiNER. 
+ """ # noqa: E501 def __init__( self, @@ -24,23 +134,6 @@ class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]): model: str = "urchade/gliner_mediumv2.1", extract_kwargs: Optional[Dict[str, Any]] = None, ): - """Extract keywords using GLiNER. - - Example: - - .. code-block:: python - - extractor = GLiNERLinkExtractor( - labels=["Person", "Award", "Date", "Competitions", "Teams"] - ) - results = extractor.extract_one("some long text...") - - Args: - labels: List of kinds of entities to extract. - kind: Kind of links to produce with this extractor. - model: GLiNER model to use. - extract_kwargs: Keyword arguments to pass to GLiNER. - """ try: from gliner import GLiNER