diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/keybert_link_extractor.py b/libs/community/langchain_community/graph_vectorstores/extractors/keybert_link_extractor.py index 54be34d4631..90a208b9878 100644 --- a/libs/community/langchain_community/graph_vectorstores/extractors/keybert_link_extractor.py +++ b/libs/community/langchain_community/graph_vectorstores/extractors/keybert_link_extractor.py @@ -20,22 +20,114 @@ class KeybertLinkExtractor(LinkExtractor[KeybertInput]): embedding_model: str = "all-MiniLM-L6-v2", extract_keywords_kwargs: Optional[Dict[str, Any]] = None, ): - """Extract keywords using KeyBERT . + """Extract keywords using `KeyBERT <https://github.com/MaartenGr/KeyBERT>`_. - Example: + KeyBERT is a minimal and easy-to-use keyword extraction technique that + leverages BERT embeddings to create keywords and keyphrases that are most + similar to a document. - .. code-block:: python + The KeybertLinkExtractor uses KeyBERT to create links between documents that + have keywords in common. - extractor = KeybertLinkExtractor() + Example:: - results = extractor.extract_one(PAGE_1) + extractor = KeybertLinkExtractor() + results = extractor.extract_one("lorem ipsum...") + + .. seealso:: + + - :mod:`How to use a graph vector store <langchain_community.graph_vectorstores>` + - :class:`How to create links between documents <langchain_core.graph_vectorstores.links.Link>` + + How to link Documents on common keywords using KeyBERT + ====================================================== + + Preliminaries + ------------- + + Install the keybert package: + + .. code-block:: bash + + pip install -q langchain_community keybert + + Usage + ----- + + We load the ``state_of_the_union.txt`` file, chunk it, then for each chunk we + extract keyword links and add them to the chunk. 
+ + Using extract_one() + ^^^^^^^^^^^^^^^^^^^ + + We can use :meth:`extract_one` on a document to get the links and add the links + to the document metadata with + :func:`~langchain_core.graph_vectorstores.links.add_links`:: + + from langchain_community.document_loaders import TextLoader + from langchain_community.graph_vectorstores import CassandraGraphVectorStore + from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor + from langchain_core.graph_vectorstores.links import add_links + from langchain_text_splitters import CharacterTextSplitter + + loader = TextLoader("state_of_the_union.txt") + + raw_documents = loader.load() + text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) + + documents = text_splitter.split_documents(raw_documents) + keyword_extractor = KeybertLinkExtractor() + + for document in documents: + links = keyword_extractor.extract_one(document) + add_links(document, links) + + print(documents[0].metadata) + + .. code-block:: output + + {'source': 'state_of_the_union.txt', 'links': [Link(kind='kw', direction='bidir', tag='ukraine'), Link(kind='kw', direction='bidir', tag='ukrainian'), Link(kind='kw', direction='bidir', tag='putin'), Link(kind='kw', direction='bidir', tag='vladimir'), Link(kind='kw', direction='bidir', tag='russia')]} + + Using LinkExtractorTransformer + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + Using the :class:`~langchain_community.graph_vectorstores.extractors.LinkExtractorTransformer`, + we can simplify the link extraction:: + + from langchain_community.document_loaders import TextLoader + from langchain_community.graph_vectorstores.extractors import ( + KeybertLinkExtractor, + LinkExtractorTransformer, + ) + from langchain_text_splitters import CharacterTextSplitter + + loader = TextLoader("state_of_the_union.txt") + raw_documents = loader.load() + + text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) + documents = 
text_splitter.split_documents(raw_documents) + + transformer = LinkExtractorTransformer([KeybertLinkExtractor()]) + documents = transformer.transform_documents(documents) + + print(documents[0].metadata) + + .. code-block:: output + + {'source': 'state_of_the_union.txt', 'links': [Link(kind='kw', direction='bidir', tag='ukraine'), Link(kind='kw', direction='bidir', tag='ukrainian'), Link(kind='kw', direction='bidir', tag='putin'), Link(kind='kw', direction='bidir', tag='vladimir'), Link(kind='kw', direction='bidir', tag='russia')]} + + The documents with keyword links can then be added to a :class:`~langchain_core.graph_vectorstores.base.GraphVectorStore`:: + + from langchain_community.graph_vectorstores import CassandraGraphVectorStore + + store = CassandraGraphVectorStore.from_documents(documents=documents, embedding=...) Args: kind: Kind of links to produce with this extractor. embedding_model: Name of the embedding model to use with KeyBERT. extract_keywords_kwargs: Keyword arguments to pass to KeyBERT's - `extract_keywords` method. - """ + ``extract_keywords`` method. + """ # noqa: E501 try: import keybert