Harrison/move flashrank rerank (#21448)

FlashrankRerank is a third-party integration, so it should live in langchain_community rather than in langchain.
This commit is contained in:
Harrison Chase 2024-05-15 13:08:52 -07:00 committed by GitHub
parent c6c2649a5a
commit 15be439719
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 122 additions and 75 deletions

View File

@ -2,6 +2,9 @@ import importlib
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from langchain_community.document_compressors.flashrank_rerank import (
FlashrankRerank,
)
from langchain_community.document_compressors.jina_rerank import (
JinaRerank, # noqa: F401
)
@ -12,12 +15,13 @@ if TYPE_CHECKING:
OpenVINOReranker,
)
__all__ = ["LLMLinguaCompressor", "OpenVINOReranker"]
# Public names of this package. Keep in sync with `_module_lookup`: every
# compressor that can be imported lazily from here -- including JinaRerank,
# which was previously left out -- is part of the public API.
__all__ = [
    "LLMLinguaCompressor",
    "OpenVINOReranker",
    "JinaRerank",
    "FlashrankRerank",
]
_module_lookup = {
"LLMLinguaCompressor": "langchain_community.document_compressors.llmlingua_filter",
"OpenVINOReranker": "langchain_community.document_compressors.openvino_rerank",
"JinaRerank": "langchain_community.document_compressors.jina_rerank",
"FlashrankRerank": "langchain_community.document_compressors.flashrank_rerank",
}

View File

@ -0,0 +1,76 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict, Optional, Sequence
from langchain_core.callbacks.manager import Callbacks
from langchain_core.documents import BaseDocumentCompressor, Document
from langchain_core.pydantic_v1 import Extra, root_validator
if TYPE_CHECKING:
from flashrank import Ranker, RerankRequest
else:
# Avoid pydantic annotation issues when actually instantiating
# while keeping this import optional
try:
from flashrank import Ranker, RerankRequest
except ImportError:
pass
DEFAULT_MODEL_NAME = "ms-marco-MultiBERT-L-12"
class FlashrankRerank(BaseDocumentCompressor):
    """Document compressor using Flashrank interface.

    Documents are converted to Flashrank passages, reranked against the query
    by ``client``, and the first ``top_n`` results are returned as new
    ``Document`` objects carrying a ``relevance_score`` metadata entry.
    """

    client: Ranker
    """Flashrank client to use for compressing documents"""
    top_n: int = 3
    """Number of documents to return."""
    model: Optional[str] = None
    """Model to use for reranking."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the flashrank package is installed and set up the client.

        Args:
            values: Raw constructor keyword arguments (pre-validation).

        Returns:
            ``values``; when no ``client`` was supplied, ``model`` is filled in
            (falling back to ``DEFAULT_MODEL_NAME``) and ``client`` is set to a
            ``Ranker`` built for that model.

        Raises:
            ImportError: If the ``flashrank`` package cannot be imported.
        """
        try:
            from flashrank import Ranker
        except ImportError as exc:
            raise ImportError(
                "Could not import flashrank python package. "
                "Please install it with `pip install flashrank`."
            ) from exc

        # Respect a client the caller built themselves instead of silently
        # replacing it with a freshly constructed Ranker.
        if values.get("client") is not None:
            return values

        # `or` (rather than a .get default) also covers an explicit model=None,
        # which would otherwise be forwarded to Ranker as the model name.
        values["model"] = values.get("model") or DEFAULT_MODEL_NAME
        values["client"] = Ranker(model_name=values["model"])
        return values

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        callbacks: Optional[Callbacks] = None,
    ) -> Sequence[Document]:
        """Rerank ``documents`` against ``query`` and keep the first ``top_n``.

        Args:
            documents: Candidate documents to rerank.
            query: Query the documents are scored against.
            callbacks: Accepted for interface compatibility; not used here.

        Returns:
            New ``Document`` objects, in the order returned by the client and
            truncated to ``top_n``, each with ``relevance_score`` added to a
            copy of its metadata.
        """
        # Nothing to rerank: skip the model call entirely.
        if not documents:
            return []

        passages = [
            {"id": i, "text": doc.page_content, "meta": doc.metadata}
            for i, doc in enumerate(documents)
        ]
        rerank_request = RerankRequest(query=query, passages=passages)
        rerank_response = self.client.rerank(rerank_request)[: self.top_n]

        # The "meta" dict was handed to the client by reference, so it may be
        # the very object owned by the input Document. Build a new dict rather
        # than writing the score into it, so the caller's documents are left
        # untouched.
        return [
            Document(
                page_content=r["text"],
                metadata={**r["meta"], "relevance_score": r["score"]},
            )
            for r in rerank_response
        ]

View File

@ -1,6 +1,11 @@
from langchain_community.document_compressors import __all__, _module_lookup
EXPECTED_ALL = ["LLMLinguaCompressor", "OpenVINOReranker", "JinaRerank"]
EXPECTED_ALL = [
"LLMLinguaCompressor",
"OpenVINOReranker",
"JinaRerank",
"FlashrankRerank",
]
def test_all_imports() -> None:

View File

@ -1,3 +1,6 @@
import importlib
from typing import Any
from langchain.retrievers.document_compressors.base import DocumentCompressorPipeline
from langchain.retrievers.document_compressors.chain_extract import (
LLMChainExtractor,
@ -12,7 +15,18 @@ from langchain.retrievers.document_compressors.cross_encoder_rerank import (
from langchain.retrievers.document_compressors.embeddings_filter import (
EmbeddingsFilter,
)
from langchain.retrievers.document_compressors.flashrank_rerank import FlashrankRerank
_module_lookup = {
"FlashrankRerank": "langchain_community.document_compressors.flashrank_rerank",
}
def __getattr__(name: str) -> Any:
    """Resolve ``name`` lazily through ``_module_lookup``.

    Names listed in ``_module_lookup`` are fetched from the module they map
    to, which is imported on first access; any other name raises
    ``AttributeError``.
    """
    if name not in _module_lookup:
        raise AttributeError(f"module {__name__} has no attribute {name}")
    return getattr(importlib.import_module(_module_lookup[name]), name)
__all__ = [
"DocumentCompressorPipeline",
@ -21,5 +35,4 @@ __all__ = [
"LLMChainFilter",
"CohereRerank",
"CrossEncoderReranker",
"FlashrankRerank",
]
] + list(_module_lookup.keys())

View File

@ -1,78 +1,27 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Dict, Optional, Sequence
from langchain_core.callbacks.manager import Callbacks
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import Extra, root_validator
from langchain.retrievers.document_compressors.base import BaseDocumentCompressor
from langchain._api import create_importer
if TYPE_CHECKING:
from flashrank import Ranker, RerankRequest
else:
# Avoid pydantic annotation issues when actually instantiating
# while keeping this import optional
try:
from flashrank import Ranker, RerankRequest
except ImportError:
pass
from langchain_community.document_compressors.flashrank_rerank import (
FlashrankRerank,
)
DEFAULT_MODEL_NAME = "ms-marco-MultiBERT-L-12"
# Create a way to dynamically look up deprecated imports.
# Used to consolidate logic for raising deprecation warnings and
# handling optional imports.
DEPRECATED_LOOKUP = {
"FlashrankRerank": "langchain_community.document_compressors.flashrank_rerank"
}
_import_attribute = create_importer(__package__, deprecated_lookups=DEPRECATED_LOOKUP)
class FlashrankRerank(BaseDocumentCompressor):
"""Document compressor using Flashrank interface."""
def __getattr__(name: str) -> Any:
    """Resolve a module attribute on demand.

    The lookup is delegated to ``_import_attribute``, the importer created
    for this package from ``DEPRECATED_LOOKUP``.
    """
    resolved = _import_attribute(name)
    return resolved
client: Ranker
"""Flashrank client to use for compressing documents"""
top_n: int = 3
"""Number of documents to return."""
model: Optional[str] = None
"""Model to use for reranking."""
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
arbitrary_types_allowed = True
@root_validator(pre=True)
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that api key and python package exists in environment."""
try:
from flashrank import Ranker
except ImportError:
raise ImportError(
"Could not import flashrank python package. "
"Please install it with `pip install flashrank`."
)
values["model"] = values.get("model", DEFAULT_MODEL_NAME)
values["client"] = Ranker(model_name=values["model"])
return values
def compress_documents(
self,
documents: Sequence[Document],
query: str,
callbacks: Optional[Callbacks] = None,
) -> Sequence[Document]:
passages = [
{"id": i, "text": doc.page_content, "meta": doc.metadata}
for i, doc in enumerate(documents)
]
rerank_request = RerankRequest(query=query, passages=passages)
rerank_response = self.client.rerank(rerank_request)[: self.top_n]
final_results = []
for r in rerank_response:
metadata = r["meta"]
metadata["relevance_score"] = r["score"]
doc = Document(
page_content=r["text"],
metadata=metadata,
)
final_results.append(doc)
return final_results
__all__ = [
"FlashrankRerank",
]