Add String Distance and Embedding Evaluators (#7123)

Add a string evaluator and pairwise string evaluator implementation for:
- Embedding distance
- String distance

Update docs
This commit is contained in:
William FH 2023-07-07 21:44:31 -07:00 committed by GitHub
parent fb6e63dc36
commit 4789c99bc2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 1372 additions and 92 deletions

View File

@ -165,28 +165,35 @@ Classes
callbacks.aim_callback.AimCallbackHandler callbacks.aim_callback.AimCallbackHandler
callbacks.argilla_callback.ArgillaCallbackHandler callbacks.argilla_callback.ArgillaCallbackHandler
callbacks.arize_callback.ArizeCallbackHandler callbacks.arize_callback.ArizeCallbackHandler
callbacks.arthur_callback.ArthurCallbackHandler
callbacks.base.AsyncCallbackHandler callbacks.base.AsyncCallbackHandler
callbacks.base.BaseCallbackHandler callbacks.base.BaseCallbackHandler
callbacks.base.BaseCallbackManager callbacks.base.BaseCallbackManager
callbacks.clearml_callback.ClearMLCallbackHandler callbacks.clearml_callback.ClearMLCallbackHandler
callbacks.comet_ml_callback.CometCallbackHandler callbacks.comet_ml_callback.CometCallbackHandler
callbacks.file.FileCallbackHandler callbacks.file.FileCallbackHandler
callbacks.flyte_callback.FlyteCallbackHandler
callbacks.human.HumanApprovalCallbackHandler callbacks.human.HumanApprovalCallbackHandler
callbacks.human.HumanRejectedException callbacks.human.HumanRejectedException
callbacks.infino_callback.InfinoCallbackHandler callbacks.infino_callback.InfinoCallbackHandler
callbacks.manager.AsyncCallbackManager callbacks.manager.AsyncCallbackManager
callbacks.manager.AsyncCallbackManagerForChainRun callbacks.manager.AsyncCallbackManagerForChainRun
callbacks.manager.AsyncCallbackManagerForLLMRun callbacks.manager.AsyncCallbackManagerForLLMRun
callbacks.manager.AsyncCallbackManagerForRetrieverRun
callbacks.manager.AsyncCallbackManagerForToolRun callbacks.manager.AsyncCallbackManagerForToolRun
callbacks.manager.AsyncParentRunManager
callbacks.manager.AsyncRunManager callbacks.manager.AsyncRunManager
callbacks.manager.BaseRunManager callbacks.manager.BaseRunManager
callbacks.manager.CallbackManager callbacks.manager.CallbackManager
callbacks.manager.CallbackManagerForChainRun callbacks.manager.CallbackManagerForChainRun
callbacks.manager.CallbackManagerForLLMRun callbacks.manager.CallbackManagerForLLMRun
callbacks.manager.CallbackManagerForRetrieverRun
callbacks.manager.CallbackManagerForToolRun callbacks.manager.CallbackManagerForToolRun
callbacks.manager.ParentRunManager
callbacks.manager.RunManager callbacks.manager.RunManager
callbacks.mlflow_callback.MlflowCallbackHandler callbacks.mlflow_callback.MlflowCallbackHandler
callbacks.openai_info.OpenAICallbackHandler callbacks.openai_info.OpenAICallbackHandler
callbacks.promptlayer_callback.PromptLayerCallbackHandler
callbacks.stdout.StdOutCallbackHandler callbacks.stdout.StdOutCallbackHandler
callbacks.streaming_aiter.AsyncIteratorCallbackHandler callbacks.streaming_aiter.AsyncIteratorCallbackHandler
callbacks.streaming_aiter_final_only.AsyncFinalIteratorCallbackHandler callbacks.streaming_aiter_final_only.AsyncFinalIteratorCallbackHandler
@ -229,6 +236,8 @@ Functions
callbacks.aim_callback.import_aim callbacks.aim_callback.import_aim
callbacks.clearml_callback.import_clearml callbacks.clearml_callback.import_clearml
callbacks.comet_ml_callback.import_comet_ml callbacks.comet_ml_callback.import_comet_ml
callbacks.flyte_callback.analyze_text
callbacks.flyte_callback.import_flytekit
callbacks.infino_callback.import_infino callbacks.infino_callback.import_infino
callbacks.manager.env_var_is_set callbacks.manager.env_var_is_set
callbacks.manager.get_openai_callback callbacks.manager.get_openai_callback
@ -283,9 +292,11 @@ Classes
chains.base.Chain chains.base.Chain
chains.combine_documents.base.AnalyzeDocumentChain chains.combine_documents.base.AnalyzeDocumentChain
chains.combine_documents.base.BaseCombineDocumentsChain chains.combine_documents.base.BaseCombineDocumentsChain
chains.combine_documents.map_reduce.CombineDocsProtocol
chains.combine_documents.map_reduce.MapReduceDocumentsChain chains.combine_documents.map_reduce.MapReduceDocumentsChain
chains.combine_documents.map_rerank.MapRerankDocumentsChain chains.combine_documents.map_rerank.MapRerankDocumentsChain
chains.combine_documents.reduce.AsyncCombineDocsProtocol
chains.combine_documents.reduce.CombineDocsProtocol
chains.combine_documents.reduce.ReduceDocumentsChain
chains.combine_documents.refine.RefineDocumentsChain chains.combine_documents.refine.RefineDocumentsChain
chains.combine_documents.stuff.StuffDocumentsChain chains.combine_documents.stuff.StuffDocumentsChain
chains.constitutional_ai.base.ConstitutionalChain chains.constitutional_ai.base.ConstitutionalChain
@ -299,8 +310,10 @@ Classes
chains.flare.prompts.FinishedOutputParser chains.flare.prompts.FinishedOutputParser
chains.graph_qa.base.GraphQAChain chains.graph_qa.base.GraphQAChain
chains.graph_qa.cypher.GraphCypherQAChain chains.graph_qa.cypher.GraphCypherQAChain
chains.graph_qa.hugegraph.HugeGraphQAChain
chains.graph_qa.kuzu.KuzuQAChain chains.graph_qa.kuzu.KuzuQAChain
chains.graph_qa.nebulagraph.NebulaGraphQAChain chains.graph_qa.nebulagraph.NebulaGraphQAChain
chains.graph_qa.sparql.GraphSparqlQAChain
chains.hyde.base.HypotheticalDocumentEmbedder chains.hyde.base.HypotheticalDocumentEmbedder
chains.llm.LLMChain chains.llm.LLMChain
chains.llm_bash.base.LLMBashChain chains.llm_bash.base.LLMBashChain
@ -363,7 +376,6 @@ Functions
.. autosummary:: .. autosummary::
:toctree: chains :toctree: chains
chains.combine_documents.base.format_document
chains.graph_qa.cypher.extract_cypher chains.graph_qa.cypher.extract_cypher
chains.loading.load_chain chains.loading.load_chain
chains.loading.load_chain_from_config chains.loading.load_chain_from_config
@ -415,6 +427,7 @@ Classes
chat_models.fake.FakeListChatModel chat_models.fake.FakeListChatModel
chat_models.google_palm.ChatGooglePalm chat_models.google_palm.ChatGooglePalm
chat_models.google_palm.ChatGooglePalmError chat_models.google_palm.ChatGooglePalmError
chat_models.human.HumanInputChatModel
chat_models.openai.ChatOpenAI chat_models.openai.ChatOpenAI
chat_models.promptlayer_openai.PromptLayerChatOpenAI chat_models.promptlayer_openai.PromptLayerChatOpenAI
chat_models.vertexai.ChatVertexAI chat_models.vertexai.ChatVertexAI
@ -513,6 +526,7 @@ Classes
document_loaders.blob_loaders.youtube_audio.YoutubeAudioLoader document_loaders.blob_loaders.youtube_audio.YoutubeAudioLoader
document_loaders.blockchain.BlockchainDocumentLoader document_loaders.blockchain.BlockchainDocumentLoader
document_loaders.blockchain.BlockchainType document_loaders.blockchain.BlockchainType
document_loaders.brave_search.BraveSearchLoader
document_loaders.chatgpt.ChatGPTLoader document_loaders.chatgpt.ChatGPTLoader
document_loaders.college_confidential.CollegeConfidentialLoader document_loaders.college_confidential.CollegeConfidentialLoader
document_loaders.confluence.ConfluenceLoader document_loaders.confluence.ConfluenceLoader
@ -520,6 +534,7 @@ Classes
document_loaders.conllu.CoNLLULoader document_loaders.conllu.CoNLLULoader
document_loaders.csv_loader.CSVLoader document_loaders.csv_loader.CSVLoader
document_loaders.csv_loader.UnstructuredCSVLoader document_loaders.csv_loader.UnstructuredCSVLoader
document_loaders.cube_semantic.CubeSemanticLoader
document_loaders.dataframe.DataFrameLoader document_loaders.dataframe.DataFrameLoader
document_loaders.diffbot.DiffbotLoader document_loaders.diffbot.DiffbotLoader
document_loaders.directory.DirectoryLoader document_loaders.directory.DirectoryLoader
@ -736,6 +751,7 @@ Classes
embeddings.self_hosted.SelfHostedEmbeddings embeddings.self_hosted.SelfHostedEmbeddings
embeddings.self_hosted_hugging_face.SelfHostedHuggingFaceEmbeddings embeddings.self_hosted_hugging_face.SelfHostedHuggingFaceEmbeddings
embeddings.self_hosted_hugging_face.SelfHostedHuggingFaceInstructEmbeddings embeddings.self_hosted_hugging_face.SelfHostedHuggingFaceInstructEmbeddings
embeddings.spacy_embeddings.SpacyEmbeddings
embeddings.tensorflow_hub.TensorflowHubEmbeddings embeddings.tensorflow_hub.TensorflowHubEmbeddings
embeddings.vertexai.VertexAIEmbeddings embeddings.vertexai.VertexAIEmbeddings
@ -790,6 +806,9 @@ Classes
evaluation.comparison.eval_chain.PairwiseStringResultOutputParser evaluation.comparison.eval_chain.PairwiseStringResultOutputParser
evaluation.criteria.eval_chain.CriteriaEvalChain evaluation.criteria.eval_chain.CriteriaEvalChain
evaluation.criteria.eval_chain.CriteriaResultOutputParser evaluation.criteria.eval_chain.CriteriaResultOutputParser
evaluation.embedding_distance.base.EmbeddingDistance
evaluation.embedding_distance.base.EmbeddingDistanceEvalChain
evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain
evaluation.qa.eval_chain.ContextQAEvalChain evaluation.qa.eval_chain.ContextQAEvalChain
evaluation.qa.eval_chain.CotQAEvalChain evaluation.qa.eval_chain.CotQAEvalChain
evaluation.qa.eval_chain.QAEvalChain evaluation.qa.eval_chain.QAEvalChain
@ -799,10 +818,16 @@ Classes
evaluation.run_evaluators.implementations.ChoicesOutputParser evaluation.run_evaluators.implementations.ChoicesOutputParser
evaluation.run_evaluators.implementations.CriteriaOutputParser evaluation.run_evaluators.implementations.CriteriaOutputParser
evaluation.run_evaluators.implementations.StringRunEvaluatorInputMapper evaluation.run_evaluators.implementations.StringRunEvaluatorInputMapper
evaluation.run_evaluators.implementations.TrajectoryEvalOutputParser
evaluation.run_evaluators.implementations.TrajectoryInputMapper evaluation.run_evaluators.implementations.TrajectoryInputMapper
evaluation.run_evaluators.implementations.TrajectoryRunEvalOutputParser
evaluation.schema.AgentTrajectoryEvaluator
evaluation.schema.EvaluatorType
evaluation.schema.LLMEvalChain
evaluation.schema.PairwiseStringEvaluator evaluation.schema.PairwiseStringEvaluator
evaluation.schema.StringEvaluator evaluation.schema.StringEvaluator
evaluation.string_distance.base.PairwiseStringDistanceEvalChain
evaluation.string_distance.base.StringDistance
evaluation.string_distance.base.StringDistanceEvalChain
Functions Functions
-------------- --------------
@ -812,6 +837,8 @@ Functions
:toctree: evaluation :toctree: evaluation
evaluation.loading.load_dataset evaluation.loading.load_dataset
evaluation.loading.load_evaluator
evaluation.loading.load_evaluators
evaluation.run_evaluators.implementations.get_criteria_evaluator evaluation.run_evaluators.implementations.get_criteria_evaluator
evaluation.run_evaluators.implementations.get_qa_evaluator evaluation.run_evaluators.implementations.get_qa_evaluator
evaluation.run_evaluators.implementations.get_trajectory_evaluator evaluation.run_evaluators.implementations.get_trajectory_evaluator
@ -1057,6 +1084,7 @@ Functions
llms.aviary.get_completions llms.aviary.get_completions
llms.aviary.get_models llms.aviary.get_models
llms.base.create_base_retry_decorator
llms.base.get_prompts llms.base.get_prompts
llms.base.update_cache llms.base.update_cache
llms.cohere.completion_with_retry llms.cohere.completion_with_retry
@ -1069,6 +1097,7 @@ Functions
llms.openai.completion_with_retry llms.openai.completion_with_retry
llms.openai.update_token_usage llms.openai.update_token_usage
llms.utils.enforce_stop_tokens llms.utils.enforce_stop_tokens
llms.vertexai.completion_with_retry
llms.vertexai.is_codey_model llms.vertexai.is_codey_model
:mod:`langchain.load`: Load :mod:`langchain.load`: Load
@ -1241,7 +1270,6 @@ Classes
:toctree: prompts :toctree: prompts
:template: class.rst :template: class.rst
prompts.base.BasePromptTemplate
prompts.base.StringPromptTemplate prompts.base.StringPromptTemplate
prompts.base.StringPromptValue prompts.base.StringPromptValue
prompts.chat.AIMessagePromptTemplate prompts.chat.AIMessagePromptTemplate
@ -1348,7 +1376,7 @@ Classes
retrievers.multi_query.LineListOutputParser retrievers.multi_query.LineListOutputParser
retrievers.multi_query.MultiQueryRetriever retrievers.multi_query.MultiQueryRetriever
retrievers.pinecone_hybrid_search.PineconeHybridSearchRetriever retrievers.pinecone_hybrid_search.PineconeHybridSearchRetriever
retrievers.pupmed.PubMedRetriever retrievers.pubmed.PubMedRetriever
retrievers.remote_retriever.RemoteLangChainRetriever retrievers.remote_retriever.RemoteLangChainRetriever
retrievers.self_query.base.SelfQueryRetriever retrievers.self_query.base.SelfQueryRetriever
retrievers.self_query.chroma.ChromaTranslator retrievers.self_query.chroma.ChromaTranslator
@ -1400,28 +1428,29 @@ Classes
:toctree: schema :toctree: schema
:template: class.rst :template: class.rst
schema.AIMessage schema.agent.AgentFinish
schema.AgentFinish schema.document.BaseDocumentTransformer
schema.BaseChatMessageHistory schema.document.Document
schema.BaseDocumentTransformer schema.memory.BaseChatMessageHistory
schema.BaseLLMOutputParser schema.memory.BaseMemory
schema.BaseMemory schema.messages.AIMessage
schema.BaseMessage schema.messages.BaseMessage
schema.BaseOutputParser schema.messages.ChatMessage
schema.BaseRetriever schema.messages.FunctionMessage
schema.ChatGeneration schema.messages.HumanMessage
schema.ChatMessage schema.messages.SystemMessage
schema.ChatResult schema.output.ChatGeneration
schema.Document schema.output.ChatResult
schema.FunctionMessage schema.output.Generation
schema.Generation schema.output.LLMResult
schema.HumanMessage schema.output.RunInfo
schema.LLMResult schema.output_parser.BaseLLMOutputParser
schema.NoOpOutputParser schema.output_parser.BaseOutputParser
schema.OutputParserException schema.output_parser.NoOpOutputParser
schema.PromptValue schema.output_parser.OutputParserException
schema.RunInfo schema.prompt.PromptValue
schema.SystemMessage schema.prompt_template.BasePromptTemplate
schema.retriever.BaseRetriever
Functions Functions
-------------- --------------
@ -1430,9 +1459,10 @@ Functions
.. autosummary:: .. autosummary::
:toctree: schema :toctree: schema
schema.get_buffer_string schema.messages.get_buffer_string
schema.messages_from_dict schema.messages.messages_from_dict
schema.messages_to_dict schema.messages.messages_to_dict
schema.prompt_template.format_document
:mod:`langchain.server`: Server :mod:`langchain.server`: Server
================================ ================================
@ -1535,6 +1565,8 @@ Classes
tools.bing_search.tool.BingSearchRun tools.bing_search.tool.BingSearchRun
tools.brave_search.tool.BraveSearch tools.brave_search.tool.BraveSearch
tools.convert_to_openai.FunctionDescription tools.convert_to_openai.FunctionDescription
tools.dataforseo_api_search.tool.DataForSeoAPISearchResults
tools.dataforseo_api_search.tool.DataForSeoAPISearchRun
tools.ddg_search.tool.DuckDuckGoSearchResults tools.ddg_search.tool.DuckDuckGoSearchResults
tools.ddg_search.tool.DuckDuckGoSearchRun tools.ddg_search.tool.DuckDuckGoSearchRun
tools.file_management.copy.CopyFileTool tools.file_management.copy.CopyFileTool
@ -1708,6 +1740,7 @@ Classes
utilities.bibtex.BibtexparserWrapper utilities.bibtex.BibtexparserWrapper
utilities.bing_search.BingSearchAPIWrapper utilities.bing_search.BingSearchAPIWrapper
utilities.brave_search.BraveSearchWrapper utilities.brave_search.BraveSearchWrapper
utilities.dataforseo_api_search.DataForSeoAPIWrapper
utilities.duckduckgo_search.DuckDuckGoSearchAPIWrapper utilities.duckduckgo_search.DuckDuckGoSearchAPIWrapper
utilities.google_places_api.GooglePlacesAPIWrapper utilities.google_places_api.GooglePlacesAPIWrapper
utilities.google_search.GoogleSearchAPIWrapper utilities.google_search.GoogleSearchAPIWrapper
@ -1805,12 +1838,17 @@ Classes
vectorstores.faiss.FAISS vectorstores.faiss.FAISS
vectorstores.hologres.Hologres vectorstores.hologres.Hologres
vectorstores.lancedb.LanceDB vectorstores.lancedb.LanceDB
vectorstores.marqo.Marqo
vectorstores.matching_engine.MatchingEngine vectorstores.matching_engine.MatchingEngine
vectorstores.milvus.Milvus vectorstores.milvus.Milvus
vectorstores.mongodb_atlas.MongoDBAtlasVectorSearch vectorstores.mongodb_atlas.MongoDBAtlasVectorSearch
vectorstores.myscale.MyScale vectorstores.myscale.MyScale
vectorstores.myscale.MyScaleSettings vectorstores.myscale.MyScaleSettings
vectorstores.opensearch_vector_search.OpenSearchVectorSearch vectorstores.opensearch_vector_search.OpenSearchVectorSearch
vectorstores.pgembedding.BaseModel
vectorstores.pgembedding.CollectionStore
vectorstores.pgembedding.EmbeddingStore
vectorstores.pgembedding.PGEmbedding
vectorstores.pgvector.BaseModel vectorstores.pgvector.BaseModel
vectorstores.pgvector.CollectionStore vectorstores.pgvector.CollectionStore
vectorstores.pgvector.DistanceStrategy vectorstores.pgvector.DistanceStrategy

View File

@ -3,32 +3,63 @@
This module contains off-the-shelf evaluation chains for grading the output of This module contains off-the-shelf evaluation chains for grading the output of
LangChain primitives such as language models and chains. LangChain primitives such as language models and chains.
To load an evaluator, you can use the :func:`load_evaluators <langchain.evaluation.loading.load_evaluators>` function with the **Loading an evaluator**
To load an evaluator, you can use the :func:`load_evaluators <langchain.evaluation.loading.load_evaluators>` or
:func:`load_evaluator <langchain.evaluation.loading.load_evaluator>` functions with the
names of the evaluators to load. names of the evaluators to load.
.. code-block:: python
from langchain.evaluation import load_evaluator
evaluator = load_evaluator("qa")
evaluator.evaluate_strings(
prediction="We sold more than 40,000 units last week",
input="How many units did we sell last week?",
reference="We sold 32,378 units",
)
The evaluator must be one of :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`.
**Datasets**
To load one of the LangChain HuggingFace datasets, you can use the :func:`load_dataset <langchain.evaluation.loading.load_dataset>` function with the To load one of the LangChain HuggingFace datasets, you can use the :func:`load_dataset <langchain.evaluation.loading.load_dataset>` function with the
name of the dataset to load. name of the dataset to load.
Some common use cases for evaluation include: .. code-block:: python
from langchain.evaluation import load_dataset
ds = load_dataset("llm-math")
**Some common use cases for evaluation include:**
- Grading the accuracy of a response against ground truth answers: :class:`QAEvalChain <langchain.evaluation.qa.eval_chain.QAEvalChain>` - Grading the accuracy of a response against ground truth answers: :class:`QAEvalChain <langchain.evaluation.qa.eval_chain.QAEvalChain>`
- Comparing the output of two models: :class:`PairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain>` - Comparing the output of two models: :class:`PairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain>`
- Judging the efficacy of an agent's tool usage: :class:`TrajectoryEvalChain <langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain>` - Judging the efficacy of an agent's tool usage: :class:`TrajectoryEvalChain <langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain>`
- Checking whether an output complies with a set of criteria: :class:`CriteriaEvalChain <langchain.evaluation.criteria.eval_chain.CriteriaEvalChain>` - Checking whether an output complies with a set of criteria: :class:`CriteriaEvalChain <langchain.evaluation.criteria.eval_chain.CriteriaEvalChain>`
- Computing semantic difference between a prediction and reference: :class:`EmbeddingDistanceEvalChain <langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain>` or between two predictions: :class:`PairwiseEmbeddingDistanceEvalChain <langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain>`
- Measuring the string distance between a prediction and reference: :class:`StringDistanceEvalChain <langchain.evaluation.string_distance.base.StringDistanceEvalChain>` or between two predictions: :class:`PairwiseStringDistanceEvalChain <langchain.evaluation.string_distance.base.PairwiseStringDistanceEvalChain>`
This module also contains low-level APIs for creating custom evaluators for **Low-level API**
specific evaluation tasks. These include:
These evaluators implement one of the following interfaces:
- :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`: Evaluate a prediction string against a reference label and/or input context. - :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`: Evaluate a prediction string against a reference label and/or input context.
- :class:`PairwiseStringEvaluator <langchain.evaluation.schema.PairwiseStringEvaluator>`: Evaluate two prediction strings against each other. - :class:`PairwiseStringEvaluator <langchain.evaluation.schema.PairwiseStringEvaluator>`: Evaluate two prediction strings against each other. Useful for scoring preferences, measuring similarity between two chain or llm agents, or comparing outputs on similar inputs.
Useful for scoring preferences, measuring similarity between two chain or llm agents, or comparing outputs on similar inputs. - :class:`AgentTrajectoryEvaluator <langchain.evaluation.schema.AgentTrajectoryEvaluator>`: Evaluate the full sequence of actions taken by an agent.
- :class:`AgentTrajectoryEvaluator <langchain.evaluation.schema.AgentTrajectoryEvaluator>`: Evaluate the full sequence of actions
taken by an agent. These interfaces enable easier composability and usage within a higher level evaluation framework.
""" # noqa: E501 """ # noqa: E501
from langchain.evaluation.agents import TrajectoryEvalChain from langchain.evaluation.agents import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria import CriteriaEvalChain from langchain.evaluation.criteria import CriteriaEvalChain
from langchain.evaluation.embedding_distance import (
EmbeddingDistance,
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
from langchain.evaluation.loading import load_dataset, load_evaluator, load_evaluators from langchain.evaluation.loading import load_dataset, load_evaluator, load_evaluators
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.schema import ( from langchain.evaluation.schema import (
@ -37,6 +68,11 @@ from langchain.evaluation.schema import (
PairwiseStringEvaluator, PairwiseStringEvaluator,
StringEvaluator, StringEvaluator,
) )
from langchain.evaluation.string_distance import (
PairwiseStringDistanceEvalChain,
StringDistance,
StringDistanceEvalChain,
)
__all__ = [ __all__ = [
"EvaluatorType", "EvaluatorType",
@ -48,6 +84,12 @@ __all__ = [
"PairwiseStringEvaluator", "PairwiseStringEvaluator",
"TrajectoryEvalChain", "TrajectoryEvalChain",
"CriteriaEvalChain", "CriteriaEvalChain",
"EmbeddingDistance",
"EmbeddingDistanceEvalChain",
"PairwiseEmbeddingDistanceEvalChain",
"StringDistance",
"StringDistanceEvalChain",
"PairwiseStringDistanceEvalChain",
"load_evaluators", "load_evaluators",
"load_evaluator", "load_evaluator",
"load_dataset", "load_dataset",

View File

@ -77,7 +77,9 @@ class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
the sequence of actions taken and their outcomes. the sequence of actions taken and their outcomes.
Example: Example:
.. code-block:: python .. code-block:: python
from langchain.agents import AgentType, initialize_agent from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI from langchain.chat_models import ChatOpenAI
from langchain.evaluation import TrajectoryEvalChain from langchain.evaluation import TrajectoryEvalChain
@ -336,7 +338,8 @@ The following is the expected answer. Use this to measure correctness:
callbacks (Callbacks): Callbacks to use for this chain run. callbacks (Callbacks): Callbacks to use for this chain run.
Returns: Returns:
dict: The evaluation result. dict: The evaluation result, which includes the score and optionally
the reasoning for reaching that.
""" """
inputs = { inputs = {
"question": input, "question": input,
@ -367,7 +370,8 @@ The following is the expected answer. Use this to measure correctness:
callbacks (Callbacks): Callbacks to use for this chain run. callbacks (Callbacks): Callbacks to use for this chain run.
Returns: Returns:
dict: The evaluation result. dict: The evaluation result, which includes the score and optionally
the reasoning for reaching that.
""" """
inputs = { inputs = {
"question": input, "question": input,

View File

@ -52,7 +52,8 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain): class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
"""A chain for comparing the output of two models. """A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs.
Example: Example:
>>> from langchain.chat_models import ChatOpenAI >>> from langchain.chat_models import ChatOpenAI

View File

@ -92,10 +92,37 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
-------- --------
>>> from langchain.chat_models import ChatAnthropic >>> from langchain.chat_models import ChatAnthropic
>>> from langchain.evaluation.criteria import CriteriaEvalChain >>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> llm = ChatAnthropic() >>> llm = ChatAnthropic(temperature=0)
>>> criteria = {"my-custom-criterion": "Is the submission the most amazing ever?"} >>> criteria = {"my-custom-criterion": "Is the submission the most amazing ever?"}
>>> chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria) >>> evaluator = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)
""" >>> evaluator.evaluate_strings(prediction="Imagine an ice cream flavor for the color aquamarine", input="Tell me an idea")
{
'reasoning': 'Here is my step-by-step reasoning for the given criteria:\\n\\nThe criterion is: "Is the submission the most amazing ever?" This is a subjective criterion and open to interpretation. The submission suggests an aquamarine-colored ice cream flavor which is creative but may or may not be considered the most amazing idea ever conceived. There are many possible amazing ideas and this one ice cream flavor suggestion may or may not rise to that level for every person. \\n\\nN',
'value': 'N',
'score': 0,
}
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> llm = ChatOpenAI(model="gpt-4", temperature=0)
>>> criteria = "correctness"
>>> evaluator = CriteriaEvalChain.from_llm(
... llm=llm,
... criteria=criteria,
... requires_reference=True,
... )
>>> evaluator.evaluate_strings(
... prediction="The answer is 4",
... input="How many apples are there?",
... reference="There are 3 apples",
... )
{
'score': 0,
'reasoning': 'The criterion for this task is the correctness of the submission. The submission states that there are 4 apples, but the reference indicates that there are actually 3 apples. Therefore, the submission is not correct, accurate, or factual according to the given criterion.\\n\\nN',
'value': 'N',
}
""" # noqa: E501
output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser) output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser)
"""The parser to use to map the output to a structured result.""" """The parser to use to map the output to a structured result."""

View File

@ -0,0 +1,12 @@
"""Evaluators that measure embedding distances."""
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistance,
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
__all__ = [
"EmbeddingDistance",
"EmbeddingDistanceEvalChain",
"PairwiseEmbeddingDistanceEvalChain",
]

View File

@ -0,0 +1,438 @@
"""A chain for comparing the output of two models using embeddings."""
from enum import Enum
from typing import Any, Dict, List, Optional
import numpy as np
from pydantic import Field, root_validator
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from langchain.math_utils import cosine_similarity
class EmbeddingDistance(str, Enum):
    """Names of the distance metrics available for comparing embeddings.

    Every member is also a ``str`` (the class mixes in ``str``), and its
    value is the lowercase name of the metric.

    Attributes:
        COSINE: Cosine distance metric.
        EUCLIDEAN: Euclidean distance metric.
        MANHATTAN: Manhattan distance metric.
        CHEBYSHEV: Chebyshev distance metric.
        HAMMING: Hamming distance metric.
    """

    # Member order is part of the enum's iteration order; keep it stable.
    COSINE = "cosine"
    EUCLIDEAN = "euclidean"
    MANHATTAN = "manhattan"
    CHEBYSHEV = "chebyshev"
    HAMMING = "hamming"
class _EmbeddingDistanceChainMixin(Chain):
"""Shared functionality for embedding distance evaluators.
Attributes:
embeddings (Embeddings): The embedding objects to vectorize the outputs.
distance_metric (EmbeddingDistance): The distance metric to use
for comparing the embeddings.
"""
embeddings: Embeddings = Field(default_factory=OpenAIEmbeddings)
distance_metric: EmbeddingDistance = Field(default=EmbeddingDistance.COSINE)
class Config:
"""Permit embeddings to go unvalidated."""
arbitrary_types_allowed: bool = True
@property
def output_keys(self) -> List[str]:
"""Return the output keys of the chain.
Returns:
List[str]: The output keys.
"""
return ["score"]
@root_validator
def _validate_distance_metric(cls, values: dict) -> dict:
"""Validate the distance metric.
Args:
values (dict): The values to validate.
Returns:
dict: The validated values.
"""
values["distance_metric"] = values["distance_metric"].lower()
return values
def _get_metric(self, metric: EmbeddingDistance) -> Any:
"""Get the metric function for the given metric name.
Args:
metric (EmbeddingDistance): The metric name.
Returns:
Any: The metric function.
"""
metrics = {
EmbeddingDistance.COSINE: self._cosine_distance,
EmbeddingDistance.EUCLIDEAN: self._euclidean_distance,
EmbeddingDistance.MANHATTAN: self._manhattan_distance,
EmbeddingDistance.CHEBYSHEV: self._chebyshev_distance,
EmbeddingDistance.HAMMING: self._hamming_distance,
}
if metric in metrics:
return metrics[metric]
else:
raise ValueError(f"Invalid metric: {metric}")
@staticmethod
def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
"""Compute the cosine distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.ndarray: The cosine distance.
"""
return 1.0 - cosine_similarity(a, b)
@staticmethod
def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Euclidean distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Euclidean distance.
"""
return np.linalg.norm(a - b)
@staticmethod
def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Manhattan distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Manhattan distance.
"""
return np.sum(np.abs(a - b))
@staticmethod
def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Chebyshev distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Chebyshev distance.
"""
return np.max(np.abs(a - b))
@staticmethod
def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Hamming distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Hamming distance.
"""
return np.mean(a != b)
def _compute_score(self, vectors: np.ndarray) -> float:
    """Score a pair of embeddings with the configured distance metric.

    Args:
        vectors (np.ndarray): Array whose first two rows are the
            embeddings to compare.

    Returns:
        float: The distance between the two embeddings.
    """
    distance_fn = self._get_metric(self.distance_metric)
    # Each embedding is passed as a single-row 2-D array.
    first_row = vectors[0].reshape(1, -1)
    second_row = vectors[1].reshape(1, -1)
    return distance_fn(first_row, second_row).item()
class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
    """Use embedding distances to score semantic difference between
    a prediction and reference.

    Examples:
        >>> chain = EmbeddingDistanceEvalChain()
        >>> result = chain.evaluate_strings(prediction="Hello", reference="Hi")
        >>> print(result)
        {'score': 0.5}
    """

    @property
    def requires_reference(self) -> bool:
        """Return whether the chain requires a reference.

        Returns:
            bool: Always True; the score is a distance to the reference.
        """
        return True

    @property
    def input_keys(self) -> List[str]:
        """Return the input keys of the chain.

        Returns:
            List[str]: The input keys, ``"prediction"`` and ``"reference"``.
        """
        return ["prediction", "reference"]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Compute the score for a prediction and reference.

        Args:
            inputs (Dict[str, Any]): The input data; must contain the
                ``"prediction"`` and ``"reference"`` strings.
            run_manager (Optional[CallbackManagerForChainRun], optional):
                The callback manager.

        Returns:
            Dict[str, Any]: The computed score under the ``"score"`` key.
        """
        # The keys read here must match `input_keys`; this chain compares
        # against the reference, not against a second prediction.
        vectors = np.array(
            self.embeddings.embed_documents(
                [inputs["prediction"], inputs["reference"]]
            )
        )
        score = self._compute_score(vectors)
        return {"score": score}

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Asynchronously compute the score for a prediction and reference.

        Args:
            inputs (Dict[str, Any]): The input data; must contain the
                ``"prediction"`` and ``"reference"`` strings.
            run_manager (AsyncCallbackManagerForChainRun, optional):
                The callback manager.

        Returns:
            Dict[str, Any]: The computed score under the ``"score"`` key.
        """
        # Same key contract as the synchronous `_call`.
        embedded = await self.embeddings.aembed_documents(
            [inputs["prediction"], inputs["reference"]]
        )
        vectors = np.array(embedded)
        score = self._compute_score(vectors)
        return {"score": score}

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate the embedding distance between a prediction and
        reference.

        Args:
            prediction (str): The output string to evaluate.
            reference (str): The reference string (required).
            callbacks (Callbacks, optional): The callbacks to use.
            **kwargs (Any): Additional keyword arguments (unused here).

        Returns:
            dict: The chain output, containing:
                - score: The embedding distance between the prediction
                    and the reference.
        """
        return self(
            inputs={"prediction": prediction, "reference": reference},
            callbacks=callbacks,
        )

    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate the embedding distance between
        a prediction and reference.

        Args:
            prediction (str): The output string to evaluate.
            reference (str): The reference string (required).
            callbacks (Callbacks, optional): The callbacks to use.
            **kwargs (Any): Additional keyword arguments (unused here).

        Returns:
            dict: The chain output, containing:
                - score: The embedding distance between the prediction
                    and the reference.
        """
        return await self.acall(
            inputs={"prediction": prediction, "reference": reference},
            callbacks=callbacks,
        )
class PairwiseEmbeddingDistanceEvalChain(
    _EmbeddingDistanceChainMixin, PairwiseStringEvaluator
):
    """Score the semantic difference between two predictions by the
    distance between their embeddings.

    Examples:
        >>> chain = PairwiseEmbeddingDistanceEvalChain()
        >>> result = chain.evaluate_string_pairs(prediction="Hello", prediction_b="Hi")
        >>> print(result)
        {'score': 0.5}
    """

    @property
    def input_keys(self) -> List[str]:
        """Names of the inputs this chain expects.

        Returns:
            List[str]: ``"prediction"`` and ``"prediction_b"``.
        """
        return ["prediction", "prediction_b"]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Embed both predictions and score the distance between them.

        Args:
            inputs (Dict[str, Any]): Must hold ``"prediction"`` and
                ``"prediction_b"``.
            run_manager (CallbackManagerForChainRun, optional):
                The callback manager.

        Returns:
            Dict[str, Any]: The distance under the ``"score"`` key.
        """
        embed = self.embeddings.embed_documents
        texts = [inputs["prediction"], inputs["prediction_b"]]
        embedded = np.array(embed(texts))
        return {"score": self._compute_score(embedded)}

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Asynchronous counterpart of ``_call``.

        Args:
            inputs (Dict[str, Any]): Must hold ``"prediction"`` and
                ``"prediction_b"``.
            run_manager (AsyncCallbackManagerForChainRun, optional):
                The callback manager.

        Returns:
            Dict[str, Any]: The distance under the ``"score"`` key.
        """
        aembed = self.embeddings.aembed_documents
        texts = [inputs["prediction"], inputs["prediction_b"]]
        raw_vectors = await aembed(texts)
        return {"score": self._compute_score(np.array(raw_vectors))}

    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> dict:
        """Run the chain on two predictions and report their distance.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            callbacks (Callbacks, optional): The callbacks to use.
            tags (List[str], optional): Tags to apply to traces.
            metadata (Dict[str, Any], optional): Metadata to apply to traces.
            **kwargs (Any): Additional keyword arguments (unused here).

        Returns:
            dict: A dictionary containing:
                - score: The embedding distance between the two
                    predictions.
        """
        chain_inputs = {"prediction": prediction, "prediction_b": prediction_b}
        outputs = self(
            inputs=chain_inputs,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
        )
        # Only the score is surfaced; the echoed inputs are dropped.
        return {"score": outputs["score"]}

    async def _aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously run the chain on two predictions and report
        their distance.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            callbacks (Callbacks, optional): The callbacks to use.
            tags (List[str], optional): Tags to apply to traces.
            metadata (Dict[str, Any], optional): Metadata to apply to traces.
            **kwargs (Any): Additional keyword arguments (unused here).

        Returns:
            dict: A dictionary containing:
                - score: The embedding distance between the two
                    predictions.
        """
        chain_inputs = {"prediction": prediction, "prediction_b": prediction_b}
        outputs = await self.acall(
            inputs=chain_inputs,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
        )
        # Only the score is surfaced; the echoed inputs are dropped.
        return {"score": outputs["score"]}

View File

@ -1,25 +1,46 @@
"""Loading datasets and evaluators.""" """Loading datasets and evaluators."""
from typing import Any, Dict, List, Optional, Sequence, Type from typing import Any, Dict, List, Optional, Sequence, Type, Union
from langchain.chains.base import Chain from langchain.chains.base import Chain
from langchain.chat_models.openai import ChatOpenAI from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.schema import EvaluatorType, LLMEvalChain from langchain.evaluation.schema import EvaluatorType, LLMEvalChain
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
StringDistanceEvalChain,
)
from langchain.schema.language_model import BaseLanguageModel from langchain.schema.language_model import BaseLanguageModel
def load_dataset(uri: str) -> List[Dict]: def load_dataset(uri: str) -> List[Dict]:
"""Load a dataset from the LangChainDatasets HuggingFace org. """Load a dataset from the `LangChainDatasets HuggingFace org <https://huggingface.co/LangChainDatasets>`_.
Args: Args:
uri: The uri of the dataset to load. uri: The uri of the dataset to load.
Returns: Returns:
A list of dictionaries, each representing a row in the dataset. A list of dictionaries, each representing a row in the dataset.
"""
**Prerequisites**
.. code-block:: shell
pip install datasets
Examples
--------
.. code-block:: python
from langchain.evaluation import load_dataset
ds = load_dataset("llm-math")
""" # noqa: E501
try: try:
from datasets import load_dataset from datasets import load_dataset
except ImportError: except ImportError:
@ -32,13 +53,17 @@ def load_dataset(uri: str) -> List[Dict]:
return [d for d in dataset["train"]] return [d for d in dataset["train"]]
_EVALUATOR_MAP: Dict[EvaluatorType, Type[LLMEvalChain]] = { _EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
EvaluatorType.QA: QAEvalChain, EvaluatorType.QA: QAEvalChain,
EvaluatorType.COT_QA: CotQAEvalChain, EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain, EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain, EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain, EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
EvaluatorType.CRITERIA: CriteriaEvalChain, EvaluatorType.CRITERIA: CriteriaEvalChain,
EvaluatorType.STRING_DISTANCE: StringDistanceEvalChain,
EvaluatorType.PAIRWISE_STRING_DISTANCE: PairwiseStringDistanceEvalChain,
EvaluatorType.EMBEDDING_DISTANCE: EmbeddingDistanceEvalChain,
EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain,
} }
@ -66,8 +91,8 @@ def load_evaluator(
Examples Examples
-------- --------
>>> llm = ChatOpenAI(model="gpt-4", temperature=0) >>> from langchain.evaluation import load_evaluator, EvaluatorType
>>> evaluator = _load_evaluator("qa", llm=llm) >>> evaluator = load_evaluator(EvaluatorType.QA)
""" """
llm = llm or ChatOpenAI(model="gpt-4", temperature=0) llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
if evaluator not in _EVALUATOR_MAP: if evaluator not in _EVALUATOR_MAP:
@ -75,7 +100,11 @@ def load_evaluator(
f"Unknown evaluator type: {evaluator}" f"Unknown evaluator type: {evaluator}"
f"Valid types are: {list(_EVALUATOR_MAP.keys())}" f"Valid types are: {list(_EVALUATOR_MAP.keys())}"
) )
return _EVALUATOR_MAP[evaluator].from_llm(llm=llm, **kwargs) evaluator_cls = _EVALUATOR_MAP[evaluator]
if issubclass(evaluator_cls, LLMEvalChain):
return evaluator_cls.from_llm(llm=llm, **kwargs)
else:
return evaluator_cls(**kwargs)
def load_evaluators( def load_evaluators(
@ -107,10 +136,9 @@ def load_evaluators(
Examples Examples
-------- --------
.. code-block:: python >>> from langchain.evaluation import load_evaluators, EvaluatorType
from langchain.evaluation import load_evaluators, EvaluatorType >>> evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA]
evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA] >>> loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness")
loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness")
""" """
llm = llm or ChatOpenAI(model="gpt-4", temperature=0) llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
loaded = [] loaded = []

View File

@ -167,6 +167,11 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
"""Whether the chain requires an input string.""" """Whether the chain requires an input string."""
return True return True
class Config:
"""Configuration for the QAEvalChain."""
extra = Extra.ignore
@classmethod @classmethod
def _validate_input_vars(cls, prompt: PromptTemplate) -> None: def _validate_input_vars(cls, prompt: PromptTemplate) -> None:
expected_input_vars = {"query", "context", "result"} expected_input_vars = {"query", "context", "result"}

View File

@ -77,7 +77,7 @@ class RunEvaluatorChain(Chain, RunEvaluator):
async def _acall( async def _acall(
self, self,
inputs: Dict[str, Any], inputs: Dict[str, Any],
run_manager: AsyncCallbackManagerForChainRun | None = None, run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
run: Run = inputs["run"] run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example") example: Optional[Example] = inputs.get("example")

View File

@ -33,6 +33,14 @@ class EvaluatorType(str, Enum):
CRITERIA = "criteria" CRITERIA = "criteria"
"""The criteria evaluator, which evaluates a model based on a """The criteria evaluator, which evaluates a model based on a
custom set of criteria.""" custom set of criteria."""
STRING_DISTANCE = "string_distance"
"""Compare predictions to a reference answer using string edit distances."""
PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
"""Compare predictions based on string edit distances."""
EMBEDDING_DISTANCE = "embedding_distance"
"""Compare a prediction to a reference label using embedding distance."""
PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
"""Compare two predictions using embedding distance."""
class LLMEvalChain(Chain): class LLMEvalChain(Chain):
@ -89,7 +97,8 @@ class _EvalArgsMixin:
class StringEvaluator(_EvalArgsMixin, ABC): class StringEvaluator(_EvalArgsMixin, ABC):
"""Protocol for evaluating strings.""" """Grade, tag, or otherwise evaluate predictions relative to their inputs
and/or reference labels."""
@property @property
def evaluation_name(self) -> str: def evaluation_name(self) -> str:
@ -204,7 +213,7 @@ class StringEvaluator(_EvalArgsMixin, ABC):
class PairwiseStringEvaluator(_EvalArgsMixin, ABC): class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
"""A protocol for comparing the output of two models.""" """Compare the output of two models (or two outputs of the same model)."""
@abstractmethod @abstractmethod
def _evaluate_string_pairs( def _evaluate_string_pairs(

View File

@ -0,0 +1,12 @@
"""String distance evaluators."""
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
StringDistance,
StringDistanceEvalChain,
)
__all__ = [
"PairwiseStringDistanceEvalChain",
"StringDistance",
"StringDistanceEvalChain",
]

View File

@ -0,0 +1,376 @@
"""String distance evaluators based on the RapidFuzz library."""
from enum import Enum
from typing import Any, Callable, Dict, List, Optional
from pydantic import Field, root_validator
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
def _load_rapidfuzz() -> Any:
"""
Load the RapidFuzz library.
Raises:
ImportError: If the rapidfuzz library is not installed.
Returns:
Any: The rapidfuzz.distance module.
"""
try:
import rapidfuzz
except ImportError:
raise ImportError(
"Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator."
)
return rapidfuzz.distance
class StringDistance(str, Enum):
    """Names of the string distance metrics that can be selected.

    Each member is also a ``str`` equal to its value.
    """

    # Edit distance that also counts adjacent transpositions.
    DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
    # Classic insert/delete/substitute edit distance.
    LEVENSHTEIN = "levenshtein"
    # Jaro distance.
    JARO = "jaro"
    # Jaro-Winkler distance.
    JARO_WINKLER = "jaro_winkler"
class _RapidFuzzChainMixin(Chain):
    """Behaviour shared by the rapidfuzz-backed string distance evaluators."""

    # Which distance metric to compute; Levenshtein unless overridden.
    distance: StringDistance = Field(default=StringDistance.LEVENSHTEIN)

    @root_validator
    def validate_dependencies(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """
        Check that rapidfuzz can be imported when the chain is validated.

        Args:
            values (Dict[str, Any]): The input values.

        Returns:
            Dict[str, Any]: The input values, unchanged.
        """
        # Called only for its ImportError side effect.
        _load_rapidfuzz()
        return values

    @property
    def output_keys(self) -> List[str]:
        """
        Names of the outputs produced by the chain.

        Returns:
            List[str]: A single key, ``"score"``.
        """
        return ["score"]

    @staticmethod
    def _get_metric(distance: str) -> Callable:
        """
        Resolve a distance name to the matching rapidfuzz function.

        Args:
            distance (str): The distance type.

        Returns:
            Callable: The ``distance`` function of the matching
                rapidfuzz algorithm.

        Raises:
            ValueError: If the distance metric is invalid.
        """
        rf_distance = _load_rapidfuzz()
        # (metric, rapidfuzz attribute) pairs, checked in declaration order;
        # the attribute is only dereferenced for the metric that matches.
        algorithms = (
            (StringDistance.DAMERAU_LEVENSHTEIN, "DamerauLevenshtein"),
            (StringDistance.LEVENSHTEIN, "Levenshtein"),
            (StringDistance.JARO, "Jaro"),
            (StringDistance.JARO_WINKLER, "JaroWinkler"),
        )
        for member, attr_name in algorithms:
            if distance == member:
                return getattr(rf_distance, attr_name).distance
        raise ValueError(f"Invalid distance metric: {distance}")

    @property
    def metric(self) -> Callable:
        """
        The distance function selected by ``self.distance``.

        Returns:
            Callable: The distance metric function.
        """
        resolve = _RapidFuzzChainMixin._get_metric
        return resolve(self.distance)
class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
    """Compute string distances between the prediction and the reference.

    The distance function is resolved by the inherited ``metric`` property
    and ``_get_metric`` static method of ``_RapidFuzzChainMixin``.
    """

    @property
    def requires_input(self) -> bool:
        """
        Check if input is required.

        Returns:
            bool: Always False; only prediction and reference are compared.
        """
        return False

    @property
    def requires_reference(self) -> bool:
        """
        Check if reference is required.

        Returns:
            bool: Always True; the score is a distance to the reference.
        """
        return True

    @property
    def input_keys(self) -> List[str]:
        """
        Get the input keys.

        Returns:
            List[str]: The input keys, ``"reference"`` and ``"prediction"``.
        """
        return ["reference", "prediction"]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """
        Compute the string distance between the prediction and the reference.

        Args:
            inputs (Dict[str, Any]): The input values; must contain
                ``"reference"`` and ``"prediction"``.
            run_manager (Optional[CallbackManagerForChainRun]):
                The callback manager.

        Returns:
            Dict[str, Any]: The evaluation results containing the score.
        """
        return {"score": self.metric(inputs["reference"], inputs["prediction"])}

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """
        Asynchronously compute the string distance between the prediction
        and the reference.

        Args:
            inputs (Dict[str, Any]): The input values; must contain
                ``"reference"`` and ``"prediction"``.
            run_manager (Optional[AsyncCallbackManagerForChainRun]):
                The callback manager.

        Returns:
            Dict[str, Any]: The evaluation results containing the score.
        """
        # The metric itself is a plain synchronous call.
        return {"score": self.metric(inputs["reference"], inputs["prediction"])}

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> dict:
        """
        Evaluate the string distance between the prediction and the reference.

        Args:
            prediction (str): The prediction string.
            reference (Optional[str], optional): The reference string.
            input (Optional[str], optional): The input string (not used
                by this evaluator).
            callbacks (Callbacks, optional): The callbacks to use.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: The evaluation results containing the score.
        """
        result = self(
            inputs={"prediction": prediction, "reference": reference},
            callbacks=callbacks,
        )
        return {"score": result["score"]}

    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> dict:
        """
        Asynchronously evaluate the string distance between the
        prediction and the reference.

        Args:
            prediction (str): The prediction string.
            reference (Optional[str], optional): The reference string.
            input (Optional[str], optional): The input string (not used
                by this evaluator).
            callbacks (Callbacks, optional): The callbacks to use.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: The evaluation results containing the score.
        """
        result = await self.acall(
            inputs={"prediction": prediction, "reference": reference},
            callbacks=callbacks,
        )
        return {"score": result["score"]}
class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvaluator):
    """Score two predictions by the string edit distance between them."""

    @property
    def input_keys(self) -> List[str]:
        """
        Names of the inputs this chain expects.

        Returns:
            List[str]: ``"prediction"`` and ``"prediction_b"``.
        """
        return ["prediction", "prediction_b"]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """
        Apply the configured metric to the two predictions.

        Args:
            inputs (Dict[str, Any]): Must hold ``"prediction"`` and
                ``"prediction_b"``.
            run_manager (CallbackManagerForChainRun, optional):
                The callback manager.

        Returns:
            Dict[str, Any]: The distance under the ``"score"`` key.
        """
        measure = self.metric
        distance = measure(inputs["prediction"], inputs["prediction_b"])
        return {"score": distance}

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """
        Asynchronous counterpart of ``_call``; the metric call itself
        is synchronous.

        Args:
            inputs (Dict[str, Any]): Must hold ``"prediction"`` and
                ``"prediction_b"``.
            run_manager (AsyncCallbackManagerForChainRun, optional):
                The callback manager.

        Returns:
            Dict[str, Any]: The distance under the ``"score"`` key.
        """
        measure = self.metric
        distance = measure(inputs["prediction"], inputs["prediction_b"])
        return {"score": distance}

    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> dict:
        """
        Run the chain on two predictions and report their distance.

        Args:
            prediction (str): The first prediction string.
            prediction_b (str): The second prediction string.
            callbacks (Callbacks, optional): The callbacks to use.
            tags (List[str], optional): Tags to apply to traces.
            metadata (Dict[str, Any], optional): Metadata to apply to traces.
            **kwargs: Additional keyword arguments (unused here).

        Returns:
            dict: The evaluation results containing the score.
        """
        chain_inputs = {"prediction": prediction, "prediction_b": prediction_b}
        outputs = self(
            inputs=chain_inputs,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
        )
        # Only the score is surfaced; the echoed inputs are dropped.
        return {"score": outputs["score"]}

    async def _aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> dict:
        """
        Asynchronously run the chain on two predictions and report
        their distance.

        Args:
            prediction (str): The first prediction string.
            prediction_b (str): The second prediction string.
            callbacks (Callbacks, optional): The callbacks to use.
            tags (List[str], optional): Tags to apply to traces.
            metadata (Dict[str, Any], optional): Metadata to apply to traces.
            **kwargs: Additional keyword arguments (unused here).

        Returns:
            dict: The evaluation results containing the score.
        """
        chain_inputs = {"prediction": prediction, "prediction_b": prediction_b}
        outputs = await self.acall(
            inputs=chain_inputs,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
        )
        # Only the score is surfaced; the echoed inputs are dropped.
        return {"score": outputs["score"]}

109
poetry.lock generated
View File

@ -8920,6 +8920,111 @@ packaging = "*"
[package.extras] [package.extras]
test = ["pytest (>=6,!=7.0.0,!=7.0.1)", "pytest-cov (>=3.0.0)", "pytest-qt"] test = ["pytest (>=6,!=7.0.0,!=7.0.1)", "pytest-cov (>=3.0.0)", "pytest-qt"]
[[package]]
name = "rapidfuzz"
version = "3.1.1"
description = "rapid fuzzy string matching"
category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "rapidfuzz-3.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:17e4cbe6632aae7c35101c4b7c498e83f6eacf61be0def4ff98167df30dc69ca"},
{file = "rapidfuzz-3.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:167dbce2da6bb5b73d43e53434c5a9d7d1214b658b315420e44044782f4c482b"},
{file = "rapidfuzz-3.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdee4f4d04761ce167538adbefa01a64e7cab949d89aa09df39ef0d5e859fb2a"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e77ed7d0bd8d9be530c462c921904ada8d3417671eed749784c5a315af334d"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fdd2ab5ab56fcaf839a9f58caa8756dbfeba0b3dc187850b763d0a1e6ee9c97a"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0843c53d54d5b7d6122d8f1d7574d8c91a7aacc5c316f74d6e33d98aec82949d"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b3e953dcef0302eeb4fe8c7c4907e50d175199fc07da05ad6bd1d8d141ff138"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec5523d5c08c639cd4e301d42f3ad7c6fb061a1f1cd6b5b627e59af345edfed7"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b4995792e106c3f1ab6f56dd6089918b065888e2e55a71e3fea8d0f66bf30989"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:cdbf9a76ea47f14026daaed43a2c2150ab0e9a4d5396909f028380f33e61c522"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f25d1975e846d07990cf946a5927a932aa7cccd308ae9979b03a58ff1cd80087"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e0755f5ac6c3d1dc2505eb2e6eaf5508ff17b42c084406714fbabf2d50d098b6"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:de784bbe06d32e66617cd20766c37aae2438902d54b3fa608d2e0a929ca705f4"},
{file = "rapidfuzz-3.1.1-cp310-cp310-win32.whl", hash = "sha256:ef6c38040d868dcc0132fad377aafeb5b2da71354759e77f41ae599316df2dee"},
{file = "rapidfuzz-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:7c74fde444bcd13ef3a803c578b28f33b4f9edf368f46ca3de57fda456065967"},
{file = "rapidfuzz-3.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:e549da8d68ad4ee385c918ea8b9efeda875df9edf6c6b48df927bd061c00bfef"},
{file = "rapidfuzz-3.1.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:58ca539cc6ce385d650138a9b1908b05622c2dd08a23d5aea4890523ef3774d5"},
{file = "rapidfuzz-3.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91946c496e6f380939dbea14ff6ce6de87480445c09d03964f5374101462594b"},
{file = "rapidfuzz-3.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f2024f83a9300440e845b441e71726471f7567021c1d80796ca02e71c5f0dc2"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17b017f9e1b88dfd6d9b03170ef8e86477de0d9d37fbfcbe72ca070cacbe1b65"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e6772eb7cc4429f1eae5a9b41e5b0b1af8f0d50727c6e338d9ad5bceee01da5a"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c089ce856919e03f4dd8f9168d60ac580d30cd0451fd60dcdef73010eca68973"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f2cd9a3760080876fc59edb26926e51d6db44dea65e85f1eb04aa5f58c3bc41"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f32791ee045a7b3d6a56208a55d996d5f7a32fdb688f5c5ee899cb7589539eb"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:68d910048b36613701ea671de68f701e2c1ba2839295238def840ff1fc1b15f4"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6f767d4823002e65c06ea273f952fda2b88775e1c2d508564f04d32cdd7f65b2"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:10313075642a9f1f948d356f4f0803ae28a496d7967b466b9cae1a4be8aa4df3"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:1465ea085154378e69bf4bc5e27bdac5c94684416882ace31865232adc9239a2"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:53e3c588e7ea158fa80095dd0ff53f49e2ede9a8d71a3a5b964ca045d845a9b9"},
{file = "rapidfuzz-3.1.1-cp311-cp311-win32.whl", hash = "sha256:cb08db5c122fea4196483b82f7596e50ef9cab1770f7696c197bf0815ac4dd17"},
{file = "rapidfuzz-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:b7c65112c87568274d399ad7a62902cef17801c2bd047b162e79e43758b3ce27"},
{file = "rapidfuzz-3.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:ea3e46a534de97a6cad2018cb950492a0fcacad380e35440ce3c1c8fef96a261"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a8bb256b34fcad4f3fa00be6b57fe35bcb54f031911195929145c67d9738ffec"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51f21f37aec6bc117e9083181ddc3cbbcbf56b6506492b128d8e836d3545ca80"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a371846f45ed9d24927a8d5222884536c1e171543396b36250fafb2e848bc92"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:25eea5c8006b6c8747ca204675c9e939f3c4d27167fb43b2aa211443d34f9abd"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:db5e71e5a810d2f1163c914e01b3ba241409a98286ac4850ff26076115ae401b"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c07e16ab38e717931319cff1340debbf2ef940a1cda4eb70e323079b62df306"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:aadc5a8b9859737a8f87831215b7fab0c04afeb960bb987c528421a4e6dfb8b6"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:0de229cb613be060580c71c1674acbde57921c7ed33d7a726e071a2562924113"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:b1bf8aba99b267aad0a01dfb44ee39803676007724abcfb72129c350476b2341"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d3264e4a02e4148e30078104fb0c1b6c8eb166ddc5ebe843a22433f58f87dc47"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:712331c1c70c79a219c2ac233b4e25e75ffad51042840d147d5e94519c7d8a1a"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-win32.whl", hash = "sha256:6ede2d42ad55bd4e7a3394e98c5f58ddace78775493391732d32be61268a4116"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:32a5c47b5153f25eb512dbb91f9850225d2dcfb3404a1c48406726c7732b0726"},
{file = "rapidfuzz-3.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:51bb8f7aa4fe45618e75cdccf08491c752a7f137ffbf7d3afd1809791ac8c326"},
{file = "rapidfuzz-3.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:788fb03c5acb5b48f5f918f4cbb5dc072498becf018c64e7e27d6b76e63e68b8"},
{file = "rapidfuzz-3.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dc7f25e20781c8d42e813516ee4ff9043ecce4a8e25fc94ee6732a83d81c1c99"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4a751f216fd1222a4a8c7ceff5180872a156202c3bdca1b337e5a5b09298dfd"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83b48b789f2da1688882cba595c40179194ab15ec17ea1d4c9de9ee239649904"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a6f5cd9f1282da49b8d0747c40f3fea2d64ab5e4c2cc2295baf87ff7a0d062"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5fe8054c244bf63be2380efc275edd86da3a706460d42911dc3ff914f3260a5"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d4d509e9aa011e1be5e4da7c5062dc4fc3688714687110536925980b3d03ac6"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ccc1b5b467766110085c80bb9311d233fccc8ed1ce965aebba3125e1bab04cba"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7e181411958d04d5b437a0981e87815e8f1b1909f5ae0e339246d3bc464f53e7"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:c53cf36cdb10819b7154fefdbffbef442ba567d9c1ca74a7e76fd759ace45e6c"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:851b44130393139cb336aa54c681d595d75a3160b7be330f3acc0c3b9dabce70"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49d900da023eeb3bfbe9feee126312eb9fd0458129aa5a581e4d8d8bf4483d14"},
{file = "rapidfuzz-3.1.1-cp38-cp38-win32.whl", hash = "sha256:6c0e96821029c46847df4ff266ea283a2b6163a4f76a4567f9986934e9c4410c"},
{file = "rapidfuzz-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:7af18372f576e36e93f4662bdf64043ac23dfa02d7f768d7e7e1d0211bb9cb35"},
{file = "rapidfuzz-3.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8b966344ed4122a71ab8ccdca2954db1ce0d8049cb9bcac58db07558f9d9ec32"},
{file = "rapidfuzz-3.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a293370448f2e46fdc6e086ac99923015bdc53973a65d3df35aefc685e1a5809"},
{file = "rapidfuzz-3.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:351d253fdee62d6d0e80c75f0505accc1ce8cc73a50779c60986ef21c92f20f9"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e951c874a0e5b375b2af9b5f264eefc679c0685c166ee0641e703ef0795509b"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4019def8a18bc867ac61f08a542bf474a7a9b3f662f5d5cd169c9135866562f5"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:086a2d84c2e497e3ab160ccf164e319bca874d9383d008fcadf91ede8ac7997f"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6d4da453fbd8793ebb11bed396f8a4b9041d6227bf055903447305dd7942312f"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10f56af1d46fbeaaa0dc50901c2dc439c7a455cfdac2f1acf6cffeb65ae82c48"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7726f67e4a0b2b4392f03aa62e16b12a697156c6735df27b21bd3ab561b01659"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d72916d27fb88741bfb576b0b0639354ca00f5e91046171c985262c68a86bbb5"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8c85bb6946fb02231d1e60ab45c36ecee04ecf7f725e094f5beee798b6b7d36d"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:fb7049dff52cded65184a3d2ff45cfd226bff7314f49a8f4b83f943eea9181a7"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:408007b4bc5a0a0cb9bfcdcc8cffa9b71fec6ee53ccdf9c26b57539f7e264ab5"},
{file = "rapidfuzz-3.1.1-cp39-cp39-win32.whl", hash = "sha256:9dc7154889937ca5a004d17f62b4798e0af52f69c38eb3112dbdb52b006d4419"},
{file = "rapidfuzz-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:16c506bac2e0a6f6581b334a7802c2f0d8343ec1d77e5cf9452c33d6219abef8"},
{file = "rapidfuzz-3.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:5e11e11880951e767342b56627ab2dc9d3ef90e2605b656e9b5e6e0beadaaf0f"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a8b8f32463781e4703965c9cf7a609a19a74478f332e0d62cd9d0e7a9db91321"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b408ac3c7f8c3414bfd5c6044ca4bb385b390bcf5eae3ad884cef48628c131ae"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ff1a517de2b1e80ddf1a3037a6ebca9925154c1af70751518d50d5c332e1ec8"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1e23665be5918f979180130babedab9317fbb34cdae237c7defad7e86bc684e"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:15260263a0c7bffac934a53b6622d77e06e10929ee4d2e62ac6f70c13988f351"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f7acc5c9c7cf567372de5b6c817f93db508e7b9bd7f29bd6187df8d2cc60ced5"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79f5a3ab7ff6c46336f38690f0564bc7689cefa180257ed9078c42f75b10c9d2"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:362e366e79fcc9a8866b41f20ef4d2987a06f8b134096e659594c059aa8a6d88"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:819d9317c3d86b508d87ab1bca5867f3abc18b902c822bc57366ccc6330a030b"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4a64ddfb7084b678da7778c1263aee2baae5a2ca55ec5589a022defc38103eb1"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8243bb4bb4db7c3501932ced6a978b284e19c3619b6802455e47bfd0905adb81"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39c7d0dbd77a7f28ff85a1dff2afb2ed73e5cd81cca3f654450ed339a271c0ab"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a4afab735bb0ac3ec9bafcc35376ed336d26af6140c4d81e4c869e77df77ecd5"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69d503a7641b5a63aa53c7aca0b857d38f48cd7bae39f8563679b324e3d2d47a"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ef3ad80458e47723812976a2ea1282ff207ad20e6cb19da1917f76699bd5aaa5"},
{file = "rapidfuzz-3.1.1.tar.gz", hash = "sha256:a06a08be3cb7d7df7993dd16e84aaf59bd5a7ff98a9f1b3e893d18b273a71c64"},
]
[package.extras]
full = ["numpy"]
[[package]] [[package]]
name = "ratelimiter" name = "ratelimiter"
version = "1.2.0.post0" version = "1.2.0.post0"
@ -12410,7 +12515,7 @@ clarifai = ["clarifai"]
cohere = ["cohere"] cohere = ["cohere"]
docarray = ["docarray"] docarray = ["docarray"]
embeddings = ["sentence-transformers"] embeddings = ["sentence-transformers"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "requests-toolbelt", "scikit-learn", "streamlit", "telethon", "tqdm", "zep-python"] extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "telethon", "tqdm", "zep-python"]
javascript = ["esprima"] javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers"] llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers"]
openai = ["openai", "tiktoken"] openai = ["openai", "tiktoken"]
@ -12420,4 +12525,4 @@ text-helpers = ["chardet"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.8.1,<4.0" python-versions = ">=3.8.1,<4.0"
content-hash = "cc95f4e0d4bee4ba19cf539be5ffd81f1ddb33229ace936ef3b6cbd4122493ca" content-hash = "6e2acbd4f760e92454f9f9e29840679fbd59b8662a99bcb89e2251a5b8736e6d"

View File

@ -116,6 +116,7 @@ streamlit = {version = "^1.18.0", optional = true, python = ">=3.8.1,<3.9.7 || >
psychicapi = {version = "^0.8.0", optional = true} psychicapi = {version = "^0.8.0", optional = true}
cassio = {version = "^0.0.7", optional = true} cassio = {version = "^0.0.7", optional = true}
rdflib = {version = "^6.3.2", optional = true} rdflib = {version = "^6.3.2", optional = true}
rapidfuzz = {version = "^3.1.1", optional = true}
[tool.poetry.group.docs.dependencies] [tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0" autodoc_pydantic = "^1.8.0"
@ -346,7 +347,8 @@ extended_testing = [
"scikit-learn", "scikit-learn",
"streamlit", "streamlit",
"pyspark", "pyspark",
"openai" "openai",
"rapidfuzz"
] ]
[[tool.poetry.source]] [[tool.poetry.source]]

View File

@ -0,0 +1,123 @@
from typing import Tuple
import numpy as np
import pytest
from langchain.evaluation.embedding_distance import (
EmbeddingDistance,
PairwiseEmbeddingDistanceEvalChain,
)
@pytest.fixture
def vectors() -> Tuple[np.ndarray, np.ndarray]:
    """Provide the two fixed 10-element vectors used by the distance tests.

    The values are hard-coded, so every test run sees the same pair.
    """
    values_a = [
        0.5488135,
        0.71518937,
        0.60276338,
        0.54488318,
        0.4236548,
        0.64589411,
        0.43758721,
        0.891773,
        0.96366276,
        0.38344152,
    ]
    values_b = [
        0.79172504,
        0.52889492,
        0.56804456,
        0.92559664,
        0.07103606,
        0.0871293,
        0.0202184,
        0.83261985,
        0.77815675,
        0.87001215,
    ]
    return np.array(values_a), np.array(values_b)
@pytest.fixture
def chain() -> PairwiseEmbeddingDistanceEvalChain:
    """Build a PairwiseEmbeddingDistanceEvalChain with its default arguments."""
    evaluator = PairwiseEmbeddingDistanceEvalChain()
    return evaluator
@pytest.mark.requires("scipy")
def test_cosine_similarity(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Check the COSINE metric against a hand-computed cosine distance."""
    first, second = vectors
    # Cosine distance is one minus the cosine of the angle between the vectors.
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    reference_value = 1.0 - np.dot(first, second) / norm_product

    chain.distance_metric = EmbeddingDistance.COSINE
    score = chain._compute_score(np.array(vectors))
    assert np.isclose(score, reference_value)
@pytest.mark.requires("scipy")
def test_euclidean_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Check the EUCLIDEAN metric against scipy's ``euclidean``."""
    from scipy.spatial.distance import euclidean

    first, second = vectors
    reference_value = euclidean(first, second)

    chain.distance_metric = EmbeddingDistance.EUCLIDEAN
    score = chain._compute_score(np.array(vectors))
    assert np.isclose(score, reference_value)
@pytest.mark.requires("scipy")
def test_manhattan_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Check the MANHATTAN metric against scipy's ``cityblock``."""
    from scipy.spatial.distance import cityblock

    first, second = vectors
    reference_value = cityblock(first, second)

    chain.distance_metric = EmbeddingDistance.MANHATTAN
    score = chain._compute_score(np.array(vectors))
    assert np.isclose(score, reference_value)
@pytest.mark.requires("scipy")
def test_chebyshev_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Check the CHEBYSHEV metric against scipy's ``chebyshev``."""
    from scipy.spatial.distance import chebyshev

    first, second = vectors
    reference_value = chebyshev(first, second)

    chain.distance_metric = EmbeddingDistance.CHEBYSHEV
    score = chain._compute_score(np.array(vectors))
    assert np.isclose(score, reference_value)
@pytest.mark.requires("scipy")
def test_hamming_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Check the HAMMING metric against scipy's ``hamming``."""
    from scipy.spatial.distance import hamming

    first, second = vectors
    reference_value = hamming(first, second)

    chain.distance_metric = EmbeddingDistance.HAMMING
    score = chain._compute_score(np.array(vectors))
    assert np.isclose(score, reference_value)
@pytest.mark.requires("openai", "tiktoken")
def test_embedding_distance(chain: PairwiseEmbeddingDistanceEvalChain) -> None:
    """Evaluating a string against itself should score approximately zero."""
    text = "A single cat"
    outcome = chain.evaluate_string_pairs(prediction=text, prediction_b=text)
    assert np.isclose(outcome["score"], 0.0)

View File

@ -0,0 +1,51 @@
import pytest
from langchain.evaluation.string_distance import (
PairwiseStringDistanceEvalChain,
StringDistance,
StringDistanceEvalChain,
)
@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
def test_zero_distance(distance: StringDistance) -> None:
    """A string compared with itself must score 0 for every StringDistance."""
    text = "三人行则必有我师"
    evaluator = StringDistanceEvalChain(distance=distance)
    outcome = evaluator.evaluate_strings(prediction=text, reference=text)
    assert "score" in outcome
    assert outcome["score"] == 0
@pytest.mark.asyncio
@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
async def test_zero_distance_async(distance: StringDistance) -> None:
    """Async variant: a string compared with itself must score 0."""
    text = "三人行则必有我师"
    evaluator = StringDistanceEvalChain(distance=distance)
    outcome = await evaluator.aevaluate_strings(prediction=text, reference=text)
    assert "score" in outcome
    assert outcome["score"] == 0
@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
def test_zero_distance_pairwise(distance: StringDistance) -> None:
    """Pairwise variant: two identical predictions must score 0."""
    text = "三人行则必有我师"
    evaluator = PairwiseStringDistanceEvalChain(distance=distance)
    outcome = evaluator.evaluate_string_pairs(prediction=text, prediction_b=text)
    assert "score" in outcome
    assert outcome["score"] == 0
@pytest.mark.asyncio
@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
async def test_zero_distance_pairwise_async(distance: StringDistance) -> None:
    """Async pairwise variant: two identical predictions must score 0."""
    text = "三人行则必有我师"
    evaluator = PairwiseStringDistanceEvalChain(distance=distance)
    outcome = await evaluator.aevaluate_string_pairs(
        prediction=text, prediction_b=text
    )
    assert "score" in outcome
    assert outcome["score"] == 0

View File

@ -2,20 +2,27 @@
import pytest import pytest
from langchain.embeddings.fake import FakeEmbeddings
from langchain.evaluation.loading import EvaluatorType, load_evaluators from langchain.evaluation.loading import EvaluatorType, load_evaluators
from langchain.evaluation.schema import StringEvaluator from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.llms.fake_chat_model import FakeChatModel from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM from tests.unit_tests.llms.fake_llm import FakeLLM
@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("evaluator_type", EvaluatorType)
def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
    """Test loading evaluators.

    Each EvaluatorType member is loaded twice: once by enum member and once
    by its string value. A fake chat model and fake embeddings are passed to
    both calls.
    """
    fake_llm = FakeChatModel()
    embeddings = FakeEmbeddings(size=32)
    load_evaluators([evaluator_type], llm=fake_llm, embeddings=embeddings)

    # Test as string
    load_evaluators(
        [evaluator_type.value],  # type: ignore
        llm=fake_llm,
        embeddings=embeddings,
    )
def test_criteria_eval_chain_requires_reference() -> None: def test_criteria_eval_chain_requires_reference() -> None: