community[minor]: Add keybert-based link extractor (#24311)

- **Description:** Add a `KeybertLinkExtractor` for graph vectorstores.
This allows extracting links from keywords in a Document and linking
nodes that have common keywords.
- **Issue:** None
- **Dependencies:** None.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
Ben Chambers
2024-07-19 09:25:07 -07:00
committed by GitHub
parent ef049769f0
commit 3691701d58
4 changed files with 146 additions and 0 deletions

View File

@@ -10,6 +10,10 @@ from langchain_community.graph_vectorstores.extractors.html_link_extractor impor
HtmlInput,
HtmlLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
KeybertInput,
KeybertLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
@@ -24,6 +28,10 @@ __all__ = [
"HierarchyLinkExtractor",
"HtmlInput",
"HtmlLinkExtractor",
"KeybertInput",
"KeybertLinkExtractor",
"LinkExtractor",
"LinkExtractor",
"LinkExtractorAdapter",
"LinkExtractorAdapter",
]

View File

@@ -0,0 +1,73 @@
from typing import Any, Dict, Iterable, Optional, Set, Union
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
# Input accepted by the KeyBERT link extractor: either raw text, or a
# Document whose `page_content` is used as the text to extract keywords from.
KeybertInput = Union[str, Document]
class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
    """Link extractor that turns KeyBERT keywords into bidirectional links.

    Every keyword KeyBERT reports for an input becomes a ``Link.bidir`` of the
    configured kind, tagged with the keyword text. Inputs that share a keyword
    therefore produce links with the same tag.
    """

    def __init__(
        self,
        *,
        kind: str = "kw",
        embedding_model: str = "all-MiniLM-L6-v2",
        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract keywords using KeyBERT <https://maartengr.github.io/KeyBERT/>.

        Example:

            .. code-block:: python

                extractor = KeybertLinkExtractor()
                results = extractor.extract_one(PAGE_1)

        Args:
            kind: Kind of links to produce with this extractor.
            embedding_model: Name of the embedding model to use with KeyBERT.
            extract_keywords_kwargs: Keyword arguments to pass to KeyBERT's
                `extract_keywords` method.

        Raises:
            ImportError: If the ``keybert`` package cannot be imported.
        """
        # Guard only the import. Building the model happens outside the `try`
        # so that an unrelated ImportError raised while loading the embedding
        # model is not misreported as "keybert is not installed".
        try:
            import keybert
        except ImportError:
            raise ImportError(
                "keybert is required for KeybertLinkExtractor. "
                "Please install it with `pip install keybert`."
            ) from None

        self._kw_model = keybert.KeyBERT(model=embedding_model)
        self._kind = kind
        self._extract_keywords_kwargs = extract_keywords_kwargs or {}

    @staticmethod
    def _text_of(item: KeybertInput) -> str:
        """Return the text to analyse: the string itself or `page_content`."""
        return item if isinstance(item, str) else item.page_content

    def _links_from(self, keywords: Iterable[Any]) -> Set[Link]:
        """Build one bidirectional link per keyword entry.

        The first element of each entry (``entry[0]``) is used as the tag.
        """
        return {Link.bidir(kind=self._kind, tag=entry[0]) for entry in keywords}

    def extract_one(self, input: KeybertInput) -> Set[Link]:  # noqa: A002
        """Extract the keyword links for a single input.

        Args:
            input: Text, or a Document whose `page_content` is analysed.

        Returns:
            The set of bidirectional links, one per extracted keyword.
        """
        keywords = self._kw_model.extract_keywords(
            self._text_of(input),
            **self._extract_keywords_kwargs,
        )
        return self._links_from(keywords)

    def extract_many(
        self,
        inputs: Iterable[KeybertInput],
    ) -> Iterable[Set[Link]]:
        """Extract keyword links for several inputs, one set per input.

        Args:
            inputs: The texts or Documents to analyse.

        Yields:
            For each input, in order, the set of links extracted from it.
        """
        items = list(inputs)
        if not items:
            return

        if len(items) == 1:
            # Even though we pass a list, if it contains one item, keybert will
            # flatten it. This means it's easier to just call the special case
            # for one item.
            yield self.extract_one(items[0])
            return

        texts = [self._text_of(item) for item in items]
        extracted = self._kw_model.extract_keywords(
            texts, **self._extract_keywords_kwargs
        )

        if len(extracted) != len(texts):
            # NOTE(review): KeyBERT appears able to return a result that is not
            # one list per document (e.g. an empty list when no candidate
            # vocabulary can be built) -- confirm against the installed
            # version. Fall back to per-document extraction so the output
            # stays aligned with the inputs.
            for item in items:
                yield self.extract_one(item)
            return

        for keywords in extracted:
            yield self._links_from(keywords)