community[minor]: Add keybert-based link extractor (#24311)

- **Description:** Add a `KeybertLinkExtractor` for graph vectorstores.
It extracts links from keywords found in a `Document`, so that nodes
sharing a keyword become linked (see the usage sketch below).
- **Issue:** None
- **Dependencies:** None (`keybert` is imported lazily and only required when the extractor is used).
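
A minimal usage sketch based on the code added in this PR (assumes `keybert` is installed; the exact keywords, and therefore the links, depend on the embedding model):

```python
from langchain_core.documents import Document
from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor

# Defaults from this PR: kind="kw", embedding_model="all-MiniLM-L6-v2"
extractor = KeybertLinkExtractor()

doc = Document(page_content="KeyBERT extracts keywords using BERT embeddings.")

# Each extracted keyword becomes a bidirectional Link tagged with that keyword,
# so documents sharing a keyword end up linked in the graph vectorstore.
links = extractor.extract_one(doc)
for link in links:
    print(link.kind, link.tag)
```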

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: ccurme <chester.curme@gmail.com>
Commit 3691701d58 (parent ef049769f0) by Ben Chambers, 2024-07-19 09:25:07 -07:00, committed by GitHub.
4 changed files with 146 additions and 0 deletions


@@ -38,6 +38,7 @@ javelin-sdk>=0.1.8,<0.2
jinja2>=3,<4
jq>=1.4.1,<2
jsonschema>1
keybert>=0.8.5
lxml>=4.9.3,<6.0
markdownify>=0.11.6,<0.12
motor>=3.3.1,<4


@@ -10,6 +10,10 @@ from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
    HtmlInput,
    HtmlLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
    KeybertInput,
    KeybertLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)
@@ -24,6 +28,10 @@ __all__ = [
    "HierarchyLinkExtractor",
    "HtmlInput",
    "HtmlLinkExtractor",
    "KeybertInput",
    "KeybertLinkExtractor",
    "LinkExtractor",
    "LinkExtractorAdapter",
]


@@ -0,0 +1,73 @@
from typing import Any, Dict, Iterable, Optional, Set, Union

from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)

KeybertInput = Union[str, Document]


class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
    def __init__(
        self,
        *,
        kind: str = "kw",
        embedding_model: str = "all-MiniLM-L6-v2",
        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract keywords using KeyBERT <https://maartengr.github.io/KeyBERT/>.

        Example:

            .. code-block:: python

                extractor = KeybertLinkExtractor()
                results = extractor.extract_one(PAGE_1)

        Args:
            kind: Kind of links to produce with this extractor.
            embedding_model: Name of the embedding model to use with KeyBERT.
            extract_keywords_kwargs: Keyword arguments to pass to KeyBERT's
                `extract_keywords` method.
        """
        try:
            import keybert

            self._kw_model = keybert.KeyBERT(model=embedding_model)
        except ImportError:
            raise ImportError(
                "keybert is required for KeybertLinkExtractor. "
                "Please install it with `pip install keybert`."
            ) from None

        self._kind = kind
        self._extract_keywords_kwargs = extract_keywords_kwargs or {}

    def extract_one(self, input: KeybertInput) -> Set[Link]:  # noqa: A002
        keywords = self._kw_model.extract_keywords(
            input if isinstance(input, str) else input.page_content,
            **self._extract_keywords_kwargs,
        )
        return {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}

    def extract_many(
        self,
        inputs: Iterable[KeybertInput],
    ) -> Iterable[Set[Link]]:
        inputs = list(inputs)
        if len(inputs) == 1:
            # Even though we pass a list, if it contains one item, keybert will
            # flatten it. This means it's easier to just call the special case
            # for one item.
            yield self.extract_one(inputs[0])
        elif len(inputs) > 1:
            strs = [i if isinstance(i, str) else i.page_content for i in inputs]
            extracted = self._kw_model.extract_keywords(
                strs, **self._extract_keywords_kwargs
            )
            for keywords in extracted:
                yield {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}
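
For reference, a hedged sketch of driving the batch path above with `Document` inputs; it uses only the API defined in this file, and the specific keyphrases returned will vary with the KeyBERT model:

```python
from langchain_core.documents import Document
from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor

docs = [
    Document(page_content="Supervised learning infers a function from labeled training data."),
    Document(page_content="KeyBERT uses BERT embeddings to extract keywords from a document."),
]

# extract_keywords_kwargs is forwarded verbatim to KeyBERT's extract_keywords,
# e.g. to request 1-2 word keyphrases instead of single keywords.
extractor = KeybertLinkExtractor(extract_keywords_kwargs={"keyphrase_ngram_range": (1, 2)})

# One set of Links is yielded per input, in order.
for doc, links in zip(docs, extractor.extract_many(docs)):
    print(doc.page_content[:40], "->", sorted(link.tag for link in links))
```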


@@ -0,0 +1,64 @@
import pytest
from langchain_core.graph_vectorstores.links import Link

from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor

PAGE_1 = """
Supervised learning is the machine learning task of learning a function that
maps an input to an output based on example input-output pairs. It infers a
function from labeled training data consisting of a set of training examples. In
supervised learning, each example is a pair consisting of an input object
(typically a vector) and a desired output value (also called the supervisory
signal). A supervised learning algorithm analyzes the training data and produces
an inferred function, which can be used for mapping new examples. An optimal
scenario will allow for the algorithm to correctly determine the class labels
for unseen instances. This requires the learning algorithm to generalize from
the training data to unseen situations in a 'reasonable' way (see inductive
bias).
"""

PAGE_2 = """
KeyBERT is a minimal and easy-to-use keyword extraction technique that leverages
BERT embeddings to create keywords and keyphrases that are most similar to a
document.
"""


@pytest.mark.requires("keybert")
def test_one_from_keywords() -> None:
    extractor = KeybertLinkExtractor()

    results = extractor.extract_one(PAGE_1)
    assert results == {
        Link.bidir(kind="kw", tag="supervised"),
        Link.bidir(kind="kw", tag="labels"),
        Link.bidir(kind="kw", tag="labeled"),
        Link.bidir(kind="kw", tag="learning"),
        Link.bidir(kind="kw", tag="training"),
    }


@pytest.mark.requires("keybert")
def test_many_from_keyphrases() -> None:
    extractor = KeybertLinkExtractor(
        extract_keywords_kwargs={
            "keyphrase_ngram_range": (1, 2),
        }
    )

    results = list(extractor.extract_many([PAGE_1, PAGE_2]))
    assert results[0] == {
        Link.bidir(kind="kw", tag="supervised"),
        Link.bidir(kind="kw", tag="labeled training"),
        Link.bidir(kind="kw", tag="supervised learning"),
        Link.bidir(kind="kw", tag="examples supervised"),
        Link.bidir(kind="kw", tag="signal supervised"),
    }

    assert results[1] == {
        Link.bidir(kind="kw", tag="keyphrases"),
        Link.bidir(kind="kw", tag="keyword extraction"),
        Link.bidir(kind="kw", tag="keybert"),
        Link.bidir(kind="kw", tag="keywords keyphrases"),
        Link.bidir(kind="kw", tag="keybert minimal"),
    }