From 4789c99bc24f6648fb3d5d4167050902478569e1 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Fri, 7 Jul 2023 21:44:31 -0700 Subject: [PATCH] Add String Distance and Embedding Evaluators (#7123) Add a string evaluator and pairwise string evaluator implementation for: - Embedding distance - String distance Update docs --- docs/api_reference/api_reference.rst | 98 ++-- langchain/evaluation/__init__.py | 58 ++- .../agents/trajectory_eval_chain.py | 66 +-- langchain/evaluation/comparison/eval_chain.py | 3 +- langchain/evaluation/criteria/eval_chain.py | 33 +- .../evaluation/embedding_distance/__init__.py | 12 + .../evaluation/embedding_distance/base.py | 438 ++++++++++++++++++ langchain/evaluation/loading.py | 50 +- langchain/evaluation/qa/eval_chain.py | 5 + langchain/evaluation/run_evaluators/base.py | 2 +- langchain/evaluation/schema.py | 13 +- .../evaluation/string_distance/__init__.py | 12 + langchain/evaluation/string_distance/base.py | 376 +++++++++++++++ poetry.lock | 109 ++++- pyproject.toml | 4 +- .../embedding_distance/test_embedding.py | 123 +++++ .../evaluation/string_distance/__init__.py | 0 .../evaluation/string_distance/test_base.py | 51 ++ tests/unit_tests/evaluation/test_loading.py | 11 +- 19 files changed, 1372 insertions(+), 92 deletions(-) create mode 100644 langchain/evaluation/embedding_distance/__init__.py create mode 100644 langchain/evaluation/embedding_distance/base.py create mode 100644 langchain/evaluation/string_distance/__init__.py create mode 100644 langchain/evaluation/string_distance/base.py create mode 100644 tests/integration_tests/evaluation/embedding_distance/test_embedding.py create mode 100644 tests/unit_tests/evaluation/string_distance/__init__.py create mode 100644 tests/unit_tests/evaluation/string_distance/test_base.py diff --git a/docs/api_reference/api_reference.rst b/docs/api_reference/api_reference.rst index fea725bc9bd..31d4ae19093 100644 --- a/docs/api_reference/api_reference.rst +++ b/docs/api_reference/api_reference.rst @@ -165,28 +165,35 @@ Classes callbacks.aim_callback.AimCallbackHandler callbacks.argilla_callback.ArgillaCallbackHandler callbacks.arize_callback.ArizeCallbackHandler + callbacks.arthur_callback.ArthurCallbackHandler callbacks.base.AsyncCallbackHandler callbacks.base.BaseCallbackHandler callbacks.base.BaseCallbackManager callbacks.clearml_callback.ClearMLCallbackHandler callbacks.comet_ml_callback.CometCallbackHandler callbacks.file.FileCallbackHandler + callbacks.flyte_callback.FlyteCallbackHandler callbacks.human.HumanApprovalCallbackHandler callbacks.human.HumanRejectedException callbacks.infino_callback.InfinoCallbackHandler callbacks.manager.AsyncCallbackManager callbacks.manager.AsyncCallbackManagerForChainRun callbacks.manager.AsyncCallbackManagerForLLMRun + callbacks.manager.AsyncCallbackManagerForRetrieverRun callbacks.manager.AsyncCallbackManagerForToolRun + callbacks.manager.AsyncParentRunManager callbacks.manager.AsyncRunManager callbacks.manager.BaseRunManager callbacks.manager.CallbackManager callbacks.manager.CallbackManagerForChainRun callbacks.manager.CallbackManagerForLLMRun + callbacks.manager.CallbackManagerForRetrieverRun callbacks.manager.CallbackManagerForToolRun + callbacks.manager.ParentRunManager callbacks.manager.RunManager callbacks.mlflow_callback.MlflowCallbackHandler callbacks.openai_info.OpenAICallbackHandler + callbacks.promptlayer_callback.PromptLayerCallbackHandler callbacks.stdout.StdOutCallbackHandler 
callbacks.streaming_aiter.AsyncIteratorCallbackHandler callbacks.streaming_aiter_final_only.AsyncFinalIteratorCallbackHandler @@ -229,6 +236,8 @@ Functions callbacks.aim_callback.import_aim callbacks.clearml_callback.import_clearml callbacks.comet_ml_callback.import_comet_ml + callbacks.flyte_callback.analyze_text + callbacks.flyte_callback.import_flytekit callbacks.infino_callback.import_infino callbacks.manager.env_var_is_set callbacks.manager.get_openai_callback @@ -283,9 +292,11 @@ Classes chains.base.Chain chains.combine_documents.base.AnalyzeDocumentChain chains.combine_documents.base.BaseCombineDocumentsChain - chains.combine_documents.map_reduce.CombineDocsProtocol chains.combine_documents.map_reduce.MapReduceDocumentsChain chains.combine_documents.map_rerank.MapRerankDocumentsChain + chains.combine_documents.reduce.AsyncCombineDocsProtocol + chains.combine_documents.reduce.CombineDocsProtocol + chains.combine_documents.reduce.ReduceDocumentsChain chains.combine_documents.refine.RefineDocumentsChain chains.combine_documents.stuff.StuffDocumentsChain chains.constitutional_ai.base.ConstitutionalChain @@ -299,8 +310,10 @@ Classes chains.flare.prompts.FinishedOutputParser chains.graph_qa.base.GraphQAChain chains.graph_qa.cypher.GraphCypherQAChain + chains.graph_qa.hugegraph.HugeGraphQAChain chains.graph_qa.kuzu.KuzuQAChain chains.graph_qa.nebulagraph.NebulaGraphQAChain + chains.graph_qa.sparql.GraphSparqlQAChain chains.hyde.base.HypotheticalDocumentEmbedder chains.llm.LLMChain chains.llm_bash.base.LLMBashChain @@ -363,7 +376,6 @@ Functions .. autosummary:: :toctree: chains - chains.combine_documents.base.format_document chains.graph_qa.cypher.extract_cypher chains.loading.load_chain chains.loading.load_chain_from_config @@ -415,6 +427,7 @@ Classes chat_models.fake.FakeListChatModel chat_models.google_palm.ChatGooglePalm chat_models.google_palm.ChatGooglePalmError + chat_models.human.HumanInputChatModel chat_models.openai.ChatOpenAI chat_models.promptlayer_openai.PromptLayerChatOpenAI chat_models.vertexai.ChatVertexAI @@ -513,6 +526,7 @@ Classes document_loaders.blob_loaders.youtube_audio.YoutubeAudioLoader document_loaders.blockchain.BlockchainDocumentLoader document_loaders.blockchain.BlockchainType + document_loaders.brave_search.BraveSearchLoader document_loaders.chatgpt.ChatGPTLoader document_loaders.college_confidential.CollegeConfidentialLoader document_loaders.confluence.ConfluenceLoader @@ -520,6 +534,7 @@ Classes document_loaders.conllu.CoNLLULoader document_loaders.csv_loader.CSVLoader document_loaders.csv_loader.UnstructuredCSVLoader + document_loaders.cube_semantic.CubeSemanticLoader document_loaders.dataframe.DataFrameLoader document_loaders.diffbot.DiffbotLoader document_loaders.directory.DirectoryLoader @@ -736,6 +751,7 @@ Classes embeddings.self_hosted.SelfHostedEmbeddings embeddings.self_hosted_hugging_face.SelfHostedHuggingFaceEmbeddings embeddings.self_hosted_hugging_face.SelfHostedHuggingFaceInstructEmbeddings + embeddings.spacy_embeddings.SpacyEmbeddings embeddings.tensorflow_hub.TensorflowHubEmbeddings embeddings.vertexai.VertexAIEmbeddings @@ -790,6 +806,9 @@ Classes evaluation.comparison.eval_chain.PairwiseStringResultOutputParser evaluation.criteria.eval_chain.CriteriaEvalChain evaluation.criteria.eval_chain.CriteriaResultOutputParser + evaluation.embedding_distance.base.EmbeddingDistance + evaluation.embedding_distance.base.EmbeddingDistanceEvalChain + evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain 
evaluation.qa.eval_chain.ContextQAEvalChain evaluation.qa.eval_chain.CotQAEvalChain evaluation.qa.eval_chain.QAEvalChain @@ -799,10 +818,16 @@ Classes evaluation.run_evaluators.implementations.ChoicesOutputParser evaluation.run_evaluators.implementations.CriteriaOutputParser evaluation.run_evaluators.implementations.StringRunEvaluatorInputMapper - evaluation.run_evaluators.implementations.TrajectoryEvalOutputParser evaluation.run_evaluators.implementations.TrajectoryInputMapper + evaluation.run_evaluators.implementations.TrajectoryRunEvalOutputParser + evaluation.schema.AgentTrajectoryEvaluator + evaluation.schema.EvaluatorType + evaluation.schema.LLMEvalChain evaluation.schema.PairwiseStringEvaluator evaluation.schema.StringEvaluator + evaluation.string_distance.base.PairwiseStringDistanceEvalChain + evaluation.string_distance.base.StringDistance + evaluation.string_distance.base.StringDistanceEvalChain Functions -------------- @@ -812,6 +837,8 @@ Functions :toctree: evaluation evaluation.loading.load_dataset + evaluation.loading.load_evaluator + evaluation.loading.load_evaluators evaluation.run_evaluators.implementations.get_criteria_evaluator evaluation.run_evaluators.implementations.get_qa_evaluator evaluation.run_evaluators.implementations.get_trajectory_evaluator @@ -1057,6 +1084,7 @@ Functions llms.aviary.get_completions llms.aviary.get_models + llms.base.create_base_retry_decorator llms.base.get_prompts llms.base.update_cache llms.cohere.completion_with_retry @@ -1069,6 +1097,7 @@ Functions llms.openai.completion_with_retry llms.openai.update_token_usage llms.utils.enforce_stop_tokens + llms.vertexai.completion_with_retry llms.vertexai.is_codey_model :mod:`langchain.load`: Load @@ -1241,7 +1270,6 @@ Classes :toctree: prompts :template: class.rst - prompts.base.BasePromptTemplate prompts.base.StringPromptTemplate prompts.base.StringPromptValue prompts.chat.AIMessagePromptTemplate @@ -1348,7 +1376,7 @@ Classes retrievers.multi_query.LineListOutputParser retrievers.multi_query.MultiQueryRetriever retrievers.pinecone_hybrid_search.PineconeHybridSearchRetriever - retrievers.pupmed.PubMedRetriever + retrievers.pubmed.PubMedRetriever retrievers.remote_retriever.RemoteLangChainRetriever retrievers.self_query.base.SelfQueryRetriever retrievers.self_query.chroma.ChromaTranslator @@ -1400,28 +1428,29 @@ Classes :toctree: schema :template: class.rst - schema.AIMessage - schema.AgentFinish - schema.BaseChatMessageHistory - schema.BaseDocumentTransformer - schema.BaseLLMOutputParser - schema.BaseMemory - schema.BaseMessage - schema.BaseOutputParser - schema.BaseRetriever - schema.ChatGeneration - schema.ChatMessage - schema.ChatResult - schema.Document - schema.FunctionMessage - schema.Generation - schema.HumanMessage - schema.LLMResult - schema.NoOpOutputParser - schema.OutputParserException - schema.PromptValue - schema.RunInfo - schema.SystemMessage + schema.agent.AgentFinish + schema.document.BaseDocumentTransformer + schema.document.Document + schema.memory.BaseChatMessageHistory + schema.memory.BaseMemory + schema.messages.AIMessage + schema.messages.BaseMessage + schema.messages.ChatMessage + schema.messages.FunctionMessage + schema.messages.HumanMessage + schema.messages.SystemMessage + schema.output.ChatGeneration + schema.output.ChatResult + schema.output.Generation + schema.output.LLMResult + schema.output.RunInfo + schema.output_parser.BaseLLMOutputParser + schema.output_parser.BaseOutputParser + schema.output_parser.NoOpOutputParser + schema.output_parser.OutputParserException + 
schema.prompt.PromptValue + schema.prompt_template.BasePromptTemplate + schema.retriever.BaseRetriever Functions -------------- @@ -1430,9 +1459,10 @@ Functions .. autosummary:: :toctree: schema - schema.get_buffer_string - schema.messages_from_dict - schema.messages_to_dict + schema.messages.get_buffer_string + schema.messages.messages_from_dict + schema.messages.messages_to_dict + schema.prompt_template.format_document :mod:`langchain.server`: Server ================================ @@ -1535,6 +1565,8 @@ Classes tools.bing_search.tool.BingSearchRun tools.brave_search.tool.BraveSearch tools.convert_to_openai.FunctionDescription + tools.dataforseo_api_search.tool.DataForSeoAPISearchResults + tools.dataforseo_api_search.tool.DataForSeoAPISearchRun tools.ddg_search.tool.DuckDuckGoSearchResults tools.ddg_search.tool.DuckDuckGoSearchRun tools.file_management.copy.CopyFileTool @@ -1708,6 +1740,7 @@ Classes utilities.bibtex.BibtexparserWrapper utilities.bing_search.BingSearchAPIWrapper utilities.brave_search.BraveSearchWrapper + utilities.dataforseo_api_search.DataForSeoAPIWrapper utilities.duckduckgo_search.DuckDuckGoSearchAPIWrapper utilities.google_places_api.GooglePlacesAPIWrapper utilities.google_search.GoogleSearchAPIWrapper @@ -1805,12 +1838,17 @@ Classes vectorstores.faiss.FAISS vectorstores.hologres.Hologres vectorstores.lancedb.LanceDB + vectorstores.marqo.Marqo vectorstores.matching_engine.MatchingEngine vectorstores.milvus.Milvus vectorstores.mongodb_atlas.MongoDBAtlasVectorSearch vectorstores.myscale.MyScale vectorstores.myscale.MyScaleSettings vectorstores.opensearch_vector_search.OpenSearchVectorSearch + vectorstores.pgembedding.BaseModel + vectorstores.pgembedding.CollectionStore + vectorstores.pgembedding.EmbeddingStore + vectorstores.pgembedding.PGEmbedding vectorstores.pgvector.BaseModel vectorstores.pgvector.CollectionStore vectorstores.pgvector.DistanceStrategy diff --git a/langchain/evaluation/__init__.py b/langchain/evaluation/__init__.py index eb1c4b64da6..93945272a5c 100644 --- a/langchain/evaluation/__init__.py +++ b/langchain/evaluation/__init__.py @@ -3,32 +3,63 @@ This module contains off-the-shelf evaluation chains for grading the output of LangChain primitives such as language models and chains. -To load an evaluator, you can use the :func:`load_evaluators ` function with the +**Loading an evaluator** + +To load an evaluator, you can use the :func:`load_evaluators ` or +:func:`load_evaluator ` functions with the names of the evaluators to load. +.. code-block:: python + + from langchain.evaluation import load_evaluator + + evaluator = load_evaluator("qa") + evaluator.evaluate_strings( + prediction="We sold more than 40,000 units last week", + input="How many units did we sell last week?", + reference="We sold 32,378 units", + ) + +The evaluator must be one of :class:`EvaluatorType `. + +**Datasets** + To load one of the LangChain HuggingFace datasets, you can use the :func:`load_dataset ` function with the name of the dataset to load. -Some common use cases for evaluation include: +.. 
code-block:: python + + from langchain.evaluation import load_dataset + ds = load_dataset("llm-math") + +**Some common use cases for evaluation include:** - Grading the accuracy of a response against ground truth answers: :class:`QAEvalChain ` - Comparing the output of two models: :class:`PairwiseStringEvalChain ` - Judging the efficacy of an agent's tool usage: :class:`TrajectoryEvalChain ` - Checking whether an output complies with a set of criteria: :class:`CriteriaEvalChain ` +- Computing semantic difference between a prediction and reference: :class:`EmbeddingDistanceEvalChain ` or between two predictions: :class:`PairwiseEmbeddingDistanceEvalChain ` +- Measuring the string distance between a prediction and reference :class:`StringDistanceEvalChain ` or between two predictions :class:`PairwiseStringDistanceEvalChain ` -This module also contains low-level APIs for creating custom evaluators for -specific evaluation tasks. These include: +**Low-level API** + +These evaluators implement one of the following interfaces: - :class:`StringEvaluator `: Evaluate a prediction string against a reference label and/or input context. -- :class:`PairwiseStringEvaluator `: Evaluate two prediction strings against each other. - Useful for scoring preferences, measuring similarity between two chain or llm agents, or comparing outputs on similar inputs. -- :class:`AgentTrajectoryEvaluator `: Evaluate the full sequence of actions - taken by an agent. +- :class:`PairwiseStringEvaluator `: Evaluate two prediction strings against each other. Useful for scoring preferences, measuring similarity between two chain or llm agents, or comparing outputs on similar inputs. +- :class:`AgentTrajectoryEvaluator ` Evaluate the full sequence of actions taken by an agent. + +These interfaces enable easier composability and usage within a higher level evaluation framework. """ # noqa: E501 from langchain.evaluation.agents import TrajectoryEvalChain from langchain.evaluation.comparison import PairwiseStringEvalChain from langchain.evaluation.criteria import CriteriaEvalChain +from langchain.evaluation.embedding_distance import ( + EmbeddingDistance, + EmbeddingDistanceEvalChain, + PairwiseEmbeddingDistanceEvalChain, +) from langchain.evaluation.loading import load_dataset, load_evaluator, load_evaluators from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain from langchain.evaluation.schema import ( @@ -37,6 +68,11 @@ from langchain.evaluation.schema import ( PairwiseStringEvaluator, StringEvaluator, ) +from langchain.evaluation.string_distance import ( + PairwiseStringDistanceEvalChain, + StringDistance, + StringDistanceEvalChain, +) __all__ = [ "EvaluatorType", @@ -48,6 +84,12 @@ __all__ = [ "PairwiseStringEvaluator", "TrajectoryEvalChain", "CriteriaEvalChain", + "EmbeddingDistance", + "EmbeddingDistanceEvalChain", + "PairwiseEmbeddingDistanceEvalChain", + "StringDistance", + "StringDistanceEvalChain", + "PairwiseStringDistanceEvalChain", "load_evaluators", "load_evaluator", "load_dataset", diff --git a/langchain/evaluation/agents/trajectory_eval_chain.py b/langchain/evaluation/agents/trajectory_eval_chain.py index fe34be36706..2d686ae5507 100644 --- a/langchain/evaluation/agents/trajectory_eval_chain.py +++ b/langchain/evaluation/agents/trajectory_eval_chain.py @@ -77,40 +77,42 @@ class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain): the sequence of actions taken and their outcomes. Example: - .. 
code-block:: python - from langchain.agents import AgentType, initialize_agent - from langchain.chat_models import ChatOpenAI - from langchain.evaluation import TrajectoryEvalChain - from langchain.tools import tool - @tool - def geography_answers(country: str, question: str) -> str: - \"\"\"Very helpful answers to geography questions.\"\"\" - return f"{country}? IDK - We may never know {question}." + .. code-block:: python - llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0) - agent = initialize_agent( - tools=[geography_answers], - llm=llm, - agent=AgentType.OPENAI_FUNCTIONS, - return_intermediate_steps=True, - ) + from langchain.agents import AgentType, initialize_agent + from langchain.chat_models import ChatOpenAI + from langchain.evaluation import TrajectoryEvalChain + from langchain.tools import tool - question = "How many dwell in the largest minor region in Argentina?" - response = agent(question) + @tool + def geography_answers(country: str, question: str) -> str: + \"\"\"Very helpful answers to geography questions.\"\"\" + return f"{country}? IDK - We may never know {question}." - eval_chain = TrajectoryEvalChain.from_llm( - llm=llm, agent_tools=[geography_answers], return_reasoning=True - ) + llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0) + agent = initialize_agent( + tools=[geography_answers], + llm=llm, + agent=AgentType.OPENAI_FUNCTIONS, + return_intermediate_steps=True, + ) - result = eval_chain.evaluate_agent_trajectory( - input=question, - agent_trajectory=response["intermediate_steps"], - prediction=response["output"], - reference="Paris", - ) - print(result["score"]) - # 0 + question = "How many dwell in the largest minor region in Argentina?" + response = agent(question) + + eval_chain = TrajectoryEvalChain.from_llm( + llm=llm, agent_tools=[geography_answers], return_reasoning=True + ) + + result = eval_chain.evaluate_agent_trajectory( + input=question, + agent_trajectory=response["intermediate_steps"], + prediction=response["output"], + reference="Paris", + ) + print(result["score"]) + # 0 """ # noqa: E501 agent_tools: Optional[List[BaseTool]] = None @@ -336,7 +338,8 @@ The following is the expected answer. Use this to measure correctness: callbacks (Callbacks): Callbacks to use for this chain run. Returns: - dict: The evaluation result. + dict: The evaluation result, which includes the score and optionally + the reasoning for reaching that. """ inputs = { "question": input, @@ -367,7 +370,8 @@ The following is the expected answer. Use this to measure correctness: callbacks (Callbacks): Callbacks to use for this chain run. Returns: - dict: The evaluation result. + dict: The evaluation result, which includes the score and optionally + the reasoning for reaching that. """ inputs = { "question": input, diff --git a/langchain/evaluation/comparison/eval_chain.py b/langchain/evaluation/comparison/eval_chain.py index d1aa81436d0..97aad4d9a8c 100644 --- a/langchain/evaluation/comparison/eval_chain.py +++ b/langchain/evaluation/comparison/eval_chain.py @@ -52,7 +52,8 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]): class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain): - """A chain for comparing the output of two models. + """A chain for comparing two outputs, such as the outputs + of two models, prompts, or outputs of a single model on similar inputs. 
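A minimal sketch of how the distance evaluators added in this patch can be loaded by name. It assumes the optional rapidfuzz dependency is installed and an OpenAI key is configured, since load_evaluator constructs a default ChatOpenAI (and the embedding evaluator defaults to OpenAIEmbeddings) even for the non-LLM evaluators:

.. code-block:: python

    from langchain.evaluation import EvaluatorType, load_evaluator

    # String edit distance between a prediction and a reference label.
    string_evaluator = load_evaluator(EvaluatorType.STRING_DISTANCE)
    print(string_evaluator.evaluate_strings(prediction="Hello", reference="Hallo"))

    # Embedding distance (defaults to OpenAIEmbeddings and cosine distance).
    embedding_evaluator = load_evaluator(EvaluatorType.EMBEDDING_DISTANCE)
    print(embedding_evaluator.evaluate_strings(prediction="Hello", reference="Hi"))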
Example: >>> from langchain.chat_models import ChatOpenAI diff --git a/langchain/evaluation/criteria/eval_chain.py b/langchain/evaluation/criteria/eval_chain.py index ef17f51f7da..42aa0ff7e25 100644 --- a/langchain/evaluation/criteria/eval_chain.py +++ b/langchain/evaluation/criteria/eval_chain.py @@ -92,10 +92,37 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain): -------- >>> from langchain.chat_models import ChatAnthropic >>> from langchain.evaluation.criteria import CriteriaEvalChain - >>> llm = ChatAnthropic() + >>> llm = ChatAnthropic(temperature=0) >>> criteria = {"my-custom-criterion": "Is the submission the most amazing ever?"} - >>> chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria) - """ + >>> evaluator = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria) + >>> evaluator.evaluate_strings(prediction="Imagine an ice cream flavor for the color aquamarine", input="Tell me an idea") + { + 'reasoning': 'Here is my step-by-step reasoning for the given criteria:\\n\\nThe criterion is: "Is the submission the most amazing ever?" This is a subjective criterion and open to interpretation. The submission suggests an aquamarine-colored ice cream flavor which is creative but may or may not be considered the most amazing idea ever conceived. There are many possible amazing ideas and this one ice cream flavor suggestion may or may not rise to that level for every person. \\n\\nN', + 'value': 'N', + 'score': 0, + } + + >>> from langchain.chat_models import ChatOpenAI + >>> from langchain.evaluation.criteria import CriteriaEvalChain + >>> llm = ChatOpenAI(model="gpt-4", temperature=0) + >>> criteria = "correctness" + >>> evaluator = CriteriaEvalChain.from_llm( + ... llm=llm, + ... criteria=criteria, + ... requires_reference=True, + ... ) + >>> evaluator.evaluate_strings( + ... prediction="The answer is 4", + ... input="How many apples are there?", + ... reference="There are 3 apples", + ... ) + { + 'score': 0, + 'reasoning': 'The criterion for this task is the correctness of the submission. The submission states that there are 4 apples, but the reference indicates that there are actually 3 apples. 
Therefore, the submission is not correct, accurate, or factual according to the given criterion.\\n\\nN', + 'value': 'N', + } + + """ # noqa: E501 output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser) """The parser to use to map the output to a structured result.""" diff --git a/langchain/evaluation/embedding_distance/__init__.py b/langchain/evaluation/embedding_distance/__init__.py new file mode 100644 index 00000000000..a7ebd68c4ef --- /dev/null +++ b/langchain/evaluation/embedding_distance/__init__.py @@ -0,0 +1,12 @@ +"""Evaluators that measure embedding distances.""" +from langchain.evaluation.embedding_distance.base import ( + EmbeddingDistance, + EmbeddingDistanceEvalChain, + PairwiseEmbeddingDistanceEvalChain, +) + +__all__ = [ + "EmbeddingDistance", + "EmbeddingDistanceEvalChain", + "PairwiseEmbeddingDistanceEvalChain", +] diff --git a/langchain/evaluation/embedding_distance/base.py b/langchain/evaluation/embedding_distance/base.py new file mode 100644 index 00000000000..b20411a6502 --- /dev/null +++ b/langchain/evaluation/embedding_distance/base.py @@ -0,0 +1,438 @@ +"""A chain for comparing the output of two models using embeddings.""" +from enum import Enum +from typing import Any, Dict, List, Optional + +import numpy as np +from pydantic import Field, root_validator + +from langchain.callbacks.manager import ( + AsyncCallbackManagerForChainRun, + CallbackManagerForChainRun, + Callbacks, +) +from langchain.chains.base import Chain +from langchain.embeddings.base import Embeddings +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator +from langchain.math_utils import cosine_similarity + + +class EmbeddingDistance(str, Enum): + """Embedding Distance Metric. + + Attributes: + COSINE: Cosine distance metric. + EUCLIDEAN: Euclidean distance metric. + MANHATTAN: Manhattan distance metric. + CHEBYSHEV: Chebyshev distance metric. + HAMMING: Hamming distance metric. + """ + + COSINE = "cosine" + EUCLIDEAN = "euclidean" + MANHATTAN = "manhattan" + CHEBYSHEV = "chebyshev" + HAMMING = "hamming" + + +class _EmbeddingDistanceChainMixin(Chain): + """Shared functionality for embedding distance evaluators. + + Attributes: + embeddings (Embeddings): The embedding objects to vectorize the outputs. + distance_metric (EmbeddingDistance): The distance metric to use + for comparing the embeddings. + """ + + embeddings: Embeddings = Field(default_factory=OpenAIEmbeddings) + distance_metric: EmbeddingDistance = Field(default=EmbeddingDistance.COSINE) + + class Config: + """Permit embeddings to go unvalidated.""" + + arbitrary_types_allowed: bool = True + + @property + def output_keys(self) -> List[str]: + """Return the output keys of the chain. + + Returns: + List[str]: The output keys. + """ + return ["score"] + + @root_validator + def _validate_distance_metric(cls, values: dict) -> dict: + """Validate the distance metric. + + Args: + values (dict): The values to validate. + + Returns: + dict: The validated values. + """ + values["distance_metric"] = values["distance_metric"].lower() + return values + + def _get_metric(self, metric: EmbeddingDistance) -> Any: + """Get the metric function for the given metric name. + + Args: + metric (EmbeddingDistance): The metric name. + + Returns: + Any: The metric function. 
+ """ + metrics = { + EmbeddingDistance.COSINE: self._cosine_distance, + EmbeddingDistance.EUCLIDEAN: self._euclidean_distance, + EmbeddingDistance.MANHATTAN: self._manhattan_distance, + EmbeddingDistance.CHEBYSHEV: self._chebyshev_distance, + EmbeddingDistance.HAMMING: self._hamming_distance, + } + if metric in metrics: + return metrics[metric] + else: + raise ValueError(f"Invalid metric: {metric}") + + @staticmethod + def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray: + """Compute the cosine distance between two vectors. + + Args: + a (np.ndarray): The first vector. + b (np.ndarray): The second vector. + + Returns: + np.ndarray: The cosine distance. + """ + return 1.0 - cosine_similarity(a, b) + + @staticmethod + def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating: + """Compute the Euclidean distance between two vectors. + + Args: + a (np.ndarray): The first vector. + b (np.ndarray): The second vector. + + Returns: + np.floating: The Euclidean distance. + """ + return np.linalg.norm(a - b) + + @staticmethod + def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating: + """Compute the Manhattan distance between two vectors. + + Args: + a (np.ndarray): The first vector. + b (np.ndarray): The second vector. + + Returns: + np.floating: The Manhattan distance. + """ + return np.sum(np.abs(a - b)) + + @staticmethod + def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating: + """Compute the Chebyshev distance between two vectors. + + Args: + a (np.ndarray): The first vector. + b (np.ndarray): The second vector. + + Returns: + np.floating: The Chebyshev distance. + """ + return np.max(np.abs(a - b)) + + @staticmethod + def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating: + """Compute the Hamming distance between two vectors. + + Args: + a (np.ndarray): The first vector. + b (np.ndarray): The second vector. + + Returns: + np.floating: The Hamming distance. + """ + return np.mean(a != b) + + def _compute_score(self, vectors: np.ndarray) -> float: + """Compute the score based on the distance metric. + + Args: + vectors (np.ndarray): The input vectors. + + Returns: + float: The computed score. + """ + metric = self._get_metric(self.distance_metric) + score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item() + return score + + +class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator): + """Use embedding distances to score semantic difference between + a prediction and reference. + + Examples: + >>> chain = EmbeddingDistanceEvalChain() + >>> result = chain.evaluate_strings(prediction="Hello", reference="Hi") + >>> print(result) + {'score': 0.5} + """ + + @property + def requires_reference(self) -> bool: + """Return whether the chain requires a reference. + + Returns: + bool: True if a reference is required, False otherwise. + """ + return True + + @property + def input_keys(self) -> List[str]: + """Return the input keys of the chain. + + Returns: + List[str]: The input keys. + """ + return ["prediction", "reference"] + + def _call( + self, + inputs: Dict[str, Any], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> Dict[str, Any]: + """Compute the score for a prediction and reference. + + Args: + inputs (Dict[str, Any]): The input data. + run_manager (Optional[CallbackManagerForChainRun], optional): + The callback manager. + + Returns: + Dict[str, Any]: The computed score. 
+ """ + vectors = np.array( + self.embeddings.embed_documents( + [inputs["prediction"], inputs["prediction_b"]] + ) + ) + score = self._compute_score(vectors) + return {"score": score} + + async def _acall( + self, + inputs: Dict[str, Any], + run_manager: Optional[AsyncCallbackManagerForChainRun] = None, + ) -> Dict[str, Any]: + """Asynchronously compute the score for a prediction and reference. + + Args: + inputs (Dict[str, Any]): The input data. + run_manager (AsyncCallbackManagerForChainRun, optional): + The callback manager. + + Returns: + Dict[str, Any]: The computed score. + """ + embedded = await self.embeddings.aembed_documents( + [inputs["prediction"], inputs["prediction_b"]] + ) + vectors = np.array(embedded) + score = self._compute_score(vectors) + return {"score": score} + + def _evaluate_strings( + self, + *, + prediction: str, + reference: Optional[str] = None, + callbacks: Callbacks = None, + **kwargs: Any, + ) -> dict: + """Evaluate the embedding distance between a prediction and + reference. + + Args: + prediction (str): The output string from the first model. + reference (str): The reference string (required) + callbacks (Callbacks, optional): The callbacks to use. + **kwargs (Any): Additional keyword arguments. + + Returns: + dict: A dictionary containing: + - score: The embedding distance between the two + predictions. + """ + return self( + inputs={"prediction": prediction, "reference": reference}, + callbacks=callbacks, + ) + + async def _aevaluate_strings( + self, + *, + prediction: str, + reference: Optional[str] = None, + callbacks: Callbacks = None, + **kwargs: Any, + ) -> dict: + """Asynchronously evaluate the embedding distance between + a prediction and reference. + + Args: + prediction (str): The output string from the first model. + reference (str): The output string from the second model. + callbacks (Callbacks, optional): The callbacks to use. + **kwargs (Any): Additional keyword arguments. + + Returns: + dict: A dictionary containing: + - score: The embedding distance between the two + predictions. + """ + return await self.acall( + inputs={"prediction": prediction, "reference": reference}, + callbacks=callbacks, + ) + + +class PairwiseEmbeddingDistanceEvalChain( + _EmbeddingDistanceChainMixin, PairwiseStringEvaluator +): + """Use embedding distances to score semantic difference between two predictions. + + Examples: + >>> chain = PairwiseEmbeddingDistanceEvalChain() + >>> result = chain.evaluate_string_pairs(prediction="Hello", prediction_b="Hi") + >>> print(result) + {'score': 0.5} + """ + + @property + def input_keys(self) -> List[str]: + """Return the input keys of the chain. + + Returns: + List[str]: The input keys. + """ + return ["prediction", "prediction_b"] + + def _call( + self, + inputs: Dict[str, Any], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> Dict[str, Any]: + """Compute the score for two predictions. + + Args: + inputs (Dict[str, Any]): The input data. + run_manager (CallbackManagerForChainRun, optional): + The callback manager. + + Returns: + Dict[str, Any]: The computed score. + """ + vectors = np.array( + self.embeddings.embed_documents( + [inputs["prediction"], inputs["prediction_b"]] + ) + ) + score = self._compute_score(vectors) + return {"score": score} + + async def _acall( + self, + inputs: Dict[str, Any], + run_manager: Optional[AsyncCallbackManagerForChainRun] = None, + ) -> Dict[str, Any]: + """Asynchronously compute the score for two predictions. + + Args: + inputs (Dict[str, Any]): The input data. 
+
+class PairwiseEmbeddingDistanceEvalChain(
+    _EmbeddingDistanceChainMixin, PairwiseStringEvaluator
+):
+    """Use embedding distances to score semantic difference between two predictions.
+
+    Examples:
+        >>> chain = PairwiseEmbeddingDistanceEvalChain()
+        >>> result = chain.evaluate_string_pairs(prediction="Hello", prediction_b="Hi")
+        >>> print(result)
+        {'score': 0.5}
+    """
+
+    @property
+    def input_keys(self) -> List[str]:
+        """Return the input keys of the chain.
+
+        Returns:
+            List[str]: The input keys.
+        """
+        return ["prediction", "prediction_b"]
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        """Compute the score for two predictions.
+
+        Args:
+            inputs (Dict[str, Any]): The input data.
+            run_manager (CallbackManagerForChainRun, optional):
+                The callback manager.
+
+        Returns:
+            Dict[str, Any]: The computed score.
+        """
+        vectors = np.array(
+            self.embeddings.embed_documents(
+                [inputs["prediction"], inputs["prediction_b"]]
+            )
+        )
+        score = self._compute_score(vectors)
+        return {"score": score}
+
+    async def _acall(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        """Asynchronously compute the score for two predictions.
+
+        Args:
+            inputs (Dict[str, Any]): The input data.
+            run_manager (AsyncCallbackManagerForChainRun, optional):
+                The callback manager.
+
+        Returns:
+            Dict[str, Any]: The computed score.
+        """
+        embedded = await self.embeddings.aembed_documents(
+            [inputs["prediction"], inputs["prediction_b"]]
+        )
+        vectors = np.array(embedded)
+        score = self._compute_score(vectors)
+        return {"score": score}
+
+    def _evaluate_string_pairs(
+        self,
+        *,
+        prediction: str,
+        prediction_b: str,
+        callbacks: Callbacks = None,
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate the embedding distance between two predictions.
+
+        Args:
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
+            callbacks (Callbacks, optional): The callbacks to use.
+            tags (List[str], optional): Tags to apply to traces.
+            metadata (Dict[str, Any], optional): Metadata to apply to traces.
+            **kwargs (Any): Additional keyword arguments.
+
+        Returns:
+            dict: A dictionary containing:
+                - score: The embedding distance between the two
+                    predictions.
+        """
+        result = self(
+            inputs={"prediction": prediction, "prediction_b": prediction_b},
+            callbacks=callbacks,
+            tags=tags,
+            metadata=metadata,
+        )
+        return {"score": result["score"]}
+
+    async def _aevaluate_string_pairs(
+        self,
+        *,
+        prediction: str,
+        prediction_b: str,
+        callbacks: Callbacks = None,
+        tags: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate the embedding distance between two predictions.
+
+        Args:
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
+            callbacks (Callbacks, optional): The callbacks to use.
+            tags (List[str], optional): Tags to apply to traces.
+            metadata (Dict[str, Any], optional): Metadata to apply to traces.
+            **kwargs (Any): Additional keyword arguments.
+
+        Returns:
+            dict: A dictionary containing:
+                - score: The embedding distance between the two
+                    predictions.
+        """
+        result = await self.acall(
+            inputs={"prediction": prediction, "prediction_b": prediction_b},
+            callbacks=callbacks,
+            tags=tags,
+            metadata=metadata,
+        )
+        return {"score": result["score"]}
diff --git a/langchain/evaluation/loading.py b/langchain/evaluation/loading.py
index e65f1b8fa6b..4880fb47b88 100644
--- a/langchain/evaluation/loading.py
+++ b/langchain/evaluation/loading.py
@@ -1,25 +1,46 @@
 """Loading datasets and evaluators."""
-from typing import Any, Dict, List, Optional, Sequence, Type
+from typing import Any, Dict, List, Optional, Sequence, Type, Union

 from langchain.chains.base import Chain
 from langchain.chat_models.openai import ChatOpenAI
 from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
 from langchain.evaluation.comparison import PairwiseStringEvalChain
 from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
+from langchain.evaluation.embedding_distance.base import (
+    EmbeddingDistanceEvalChain,
+    PairwiseEmbeddingDistanceEvalChain,
+)
 from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
 from langchain.evaluation.schema import EvaluatorType, LLMEvalChain
+from langchain.evaluation.string_distance.base import (
+    PairwiseStringDistanceEvalChain,
+    StringDistanceEvalChain,
+)
 from langchain.schema.language_model import BaseLanguageModel


 def load_dataset(uri: str) -> List[Dict]:
-    """Load a dataset from the LangChainDatasets HuggingFace org.
+ """Load a dataset from the `LangChainDatasets HuggingFace org `_. Args: uri: The uri of the dataset to load. Returns: A list of dictionaries, each representing a row in the dataset. - """ + + **Prerequisites** + + .. code-block:: shell + + pip install datasets + + Examples + -------- + .. code-block:: python + + from langchain.evaluation import load_dataset + ds = load_dataset("llm-math") + """ # noqa: E501 try: from datasets import load_dataset except ImportError: @@ -32,13 +53,17 @@ def load_dataset(uri: str) -> List[Dict]: return [d for d in dataset["train"]] -_EVALUATOR_MAP: Dict[EvaluatorType, Type[LLMEvalChain]] = { +_EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = { EvaluatorType.QA: QAEvalChain, EvaluatorType.COT_QA: CotQAEvalChain, EvaluatorType.CONTEXT_QA: ContextQAEvalChain, EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain, EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain, EvaluatorType.CRITERIA: CriteriaEvalChain, + EvaluatorType.STRING_DISTANCE: StringDistanceEvalChain, + EvaluatorType.PAIRWISE_STRING_DISTANCE: PairwiseStringDistanceEvalChain, + EvaluatorType.EMBEDDING_DISTANCE: EmbeddingDistanceEvalChain, + EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain, } @@ -66,8 +91,8 @@ def load_evaluator( Examples -------- - >>> llm = ChatOpenAI(model="gpt-4", temperature=0) - >>> evaluator = _load_evaluator("qa", llm=llm) + >>> from langchain.evaluation import load_evaluator, EvaluatorType + >>> evaluator = load_evaluator(EvaluatorType.QA) """ llm = llm or ChatOpenAI(model="gpt-4", temperature=0) if evaluator not in _EVALUATOR_MAP: @@ -75,7 +100,11 @@ def load_evaluator( f"Unknown evaluator type: {evaluator}" f"Valid types are: {list(_EVALUATOR_MAP.keys())}" ) - return _EVALUATOR_MAP[evaluator].from_llm(llm=llm, **kwargs) + evaluator_cls = _EVALUATOR_MAP[evaluator] + if issubclass(evaluator_cls, LLMEvalChain): + return evaluator_cls.from_llm(llm=llm, **kwargs) + else: + return evaluator_cls(**kwargs) def load_evaluators( @@ -107,10 +136,9 @@ def load_evaluators( Examples -------- - .. 
code-block:: python - from langchain.evaluation import load_evaluators, EvaluatorType - evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA] - loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness") + >>> from langchain.evaluation import load_evaluators, EvaluatorType + >>> evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA] + >>> loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness") """ llm = llm or ChatOpenAI(model="gpt-4", temperature=0) loaded = [] diff --git a/langchain/evaluation/qa/eval_chain.py b/langchain/evaluation/qa/eval_chain.py index 8f658f96066..a6e73af2dc7 100644 --- a/langchain/evaluation/qa/eval_chain.py +++ b/langchain/evaluation/qa/eval_chain.py @@ -167,6 +167,11 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain): """Whether the chain requires an input string.""" return True + class Config: + """Configuration for the QAEvalChain.""" + + extra = Extra.ignore + @classmethod def _validate_input_vars(cls, prompt: PromptTemplate) -> None: expected_input_vars = {"query", "context", "result"} diff --git a/langchain/evaluation/run_evaluators/base.py b/langchain/evaluation/run_evaluators/base.py index dfa90f2d80e..c54f961e4c7 100644 --- a/langchain/evaluation/run_evaluators/base.py +++ b/langchain/evaluation/run_evaluators/base.py @@ -77,7 +77,7 @@ class RunEvaluatorChain(Chain, RunEvaluator): async def _acall( self, inputs: Dict[str, Any], - run_manager: AsyncCallbackManagerForChainRun | None = None, + run_manager: Optional[AsyncCallbackManagerForChainRun] = None, ) -> Dict[str, Any]: run: Run = inputs["run"] example: Optional[Example] = inputs.get("example") diff --git a/langchain/evaluation/schema.py b/langchain/evaluation/schema.py index a1a1951ba23..8efe862a4ee 100644 --- a/langchain/evaluation/schema.py +++ b/langchain/evaluation/schema.py @@ -33,6 +33,14 @@ class EvaluatorType(str, Enum): CRITERIA = "criteria" """The criteria evaluator, which evaluates a model based on a custom set of criteria.""" + STRING_DISTANCE = "string_distance" + """Compare predictions to a reference answer using string edit distances.""" + PAIRWISE_STRING_DISTANCE = "pairwise_string_distance" + """Compare predictions based on string edit distances.""" + EMBEDDING_DISTANCE = "embedding_distance" + """Compare a prediction to a reference label using embedding distance.""" + PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance" + """Compare two predictions using embedding distance.""" class LLMEvalChain(Chain): @@ -89,7 +97,8 @@ class _EvalArgsMixin: class StringEvaluator(_EvalArgsMixin, ABC): - """Protocol for evaluating strings.""" + """Grade, tag, or otherwise evaluate predictions relative to their inputs + and/or reference labels.""" @property def evaluation_name(self) -> str: @@ -204,7 +213,7 @@ class StringEvaluator(_EvalArgsMixin, ABC): class PairwiseStringEvaluator(_EvalArgsMixin, ABC): - """A protocol for comparing the output of two models.""" + """Compare the output of two models (or two outputs of the same model).""" @abstractmethod def _evaluate_string_pairs( diff --git a/langchain/evaluation/string_distance/__init__.py b/langchain/evaluation/string_distance/__init__.py new file mode 100644 index 00000000000..72e4e26f214 --- /dev/null +++ b/langchain/evaluation/string_distance/__init__.py @@ -0,0 +1,12 @@ +"""String distance evaluators.""" +from langchain.evaluation.string_distance.base import ( + PairwiseStringDistanceEvalChain, + StringDistance, + StringDistanceEvalChain, +) + +__all__ = [ + 
"PairwiseStringDistanceEvalChain", + "StringDistance", + "StringDistanceEvalChain", +] diff --git a/langchain/evaluation/string_distance/base.py b/langchain/evaluation/string_distance/base.py new file mode 100644 index 00000000000..b03050e12b7 --- /dev/null +++ b/langchain/evaluation/string_distance/base.py @@ -0,0 +1,376 @@ +"""String distance evaluators based on the RapidFuzz library.""" + +from enum import Enum +from typing import Any, Callable, Dict, List, Optional + +from pydantic import Field, root_validator + +from langchain.callbacks.manager import ( + AsyncCallbackManagerForChainRun, + CallbackManagerForChainRun, + Callbacks, +) +from langchain.chains.base import Chain +from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator + + +def _load_rapidfuzz() -> Any: + """ + Load the RapidFuzz library. + + Raises: + ImportError: If the rapidfuzz library is not installed. + + Returns: + Any: The rapidfuzz.distance module. + """ + try: + import rapidfuzz + except ImportError: + raise ImportError( + "Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator." + ) + return rapidfuzz.distance + + +class StringDistance(str, Enum): + """Distance metric to use.""" + + DAMERAU_LEVENSHTEIN = "damerau_levenshtein" + LEVENSHTEIN = "levenshtein" + JARO = "jaro" + JARO_WINKLER = "jaro_winkler" + + +class _RapidFuzzChainMixin(Chain): + """Shared methods for the rapidfuzz string distance evaluators.""" + + distance: StringDistance = Field(default=StringDistance.LEVENSHTEIN) + + @root_validator + def validate_dependencies(cls, values: Dict[str, Any]) -> Dict[str, Any]: + """ + Validate that the rapidfuzz library is installed. + + Args: + values (Dict[str, Any]): The input values. + + Returns: + Dict[str, Any]: The validated values. + """ + _load_rapidfuzz() + return values + + @property + def output_keys(self) -> List[str]: + """ + Get the output keys. + + Returns: + List[str]: The output keys. + """ + return ["score"] + + @staticmethod + def _get_metric(distance: str) -> Callable: + """ + Get the distance metric function based on the distance type. + + Args: + distance (str): The distance type. + + Returns: + Callable: The distance metric function. + + Raises: + ValueError: If the distance metric is invalid. + """ + rf_distance = _load_rapidfuzz() + if distance == StringDistance.DAMERAU_LEVENSHTEIN: + return rf_distance.DamerauLevenshtein.distance + elif distance == StringDistance.LEVENSHTEIN: + return rf_distance.Levenshtein.distance + elif distance == StringDistance.JARO: + return rf_distance.Jaro.distance + elif distance == StringDistance.JARO_WINKLER: + return rf_distance.JaroWinkler.distance + else: + raise ValueError(f"Invalid distance metric: {distance}") + + @property + def metric(self) -> Callable: + """ + Get the distance metric function. + + Returns: + Callable: The distance metric function. + """ + return _RapidFuzzChainMixin._get_metric(self.distance) + + +class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator): + """Compute string distances between the prediction and the reference.""" + + @property + def requires_input(self) -> bool: + """ + Check if input is required. + + Returns: + bool: True if input is required, False otherwise. + """ + return False + + @property + def requires_reference(self) -> bool: + """ + Check if reference is required. + + Returns: + bool: True if reference is required, False otherwise. + """ + return True + + @property + def input_keys(self) -> List[str]: + """ + Get the input keys. 
+
+class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
+    """Compute string distances between the prediction and the reference."""
+
+    @property
+    def requires_input(self) -> bool:
+        """
+        Check if input is required.
+
+        Returns:
+            bool: True if input is required, False otherwise.
+        """
+        return False
+
+    @property
+    def requires_reference(self) -> bool:
+        """
+        Check if reference is required.
+
+        Returns:
+            bool: True if reference is required, False otherwise.
+        """
+        return True
+
+    @property
+    def input_keys(self) -> List[str]:
+        """
+        Get the input keys.
+
+        Returns:
+            List[str]: The input keys.
+        """
+        return ["reference", "prediction"]
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        """
+        Compute the string distance between the prediction and the reference.
+
+        Args:
+            inputs (Dict[str, Any]): The input values.
+            run_manager (Optional[CallbackManagerForChainRun]):
+                The callback manager.
+
+        Returns:
+            Dict[str, Any]: The evaluation results containing the score.
+        """
+        return {"score": self.metric(inputs["reference"], inputs["prediction"])}
+
+    async def _acall(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        """
+        Asynchronously compute the string distance between the prediction
+        and the reference.
+
+        Args:
+            inputs (Dict[str, Any]): The input values.
+            run_manager (Optional[AsyncCallbackManagerForChainRun]):
+                The callback manager.
+
+        Returns:
+            Dict[str, Any]: The evaluation results containing the score.
+        """
+        return {"score": self.metric(inputs["reference"], inputs["prediction"])}
+
+    def _evaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        callbacks: Callbacks = None,
+        **kwargs: Any,
+    ) -> dict:
+        """
+        Evaluate the string distance between the prediction and the reference.
+
+        Args:
+            prediction (str): The prediction string.
+            reference (Optional[str], optional): The reference string.
+            input (Optional[str], optional): The input string.
+            callbacks (Callbacks, optional): The callbacks to use.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            dict: The evaluation results containing the score.
+        """
+        result = self(
+            inputs={"prediction": prediction, "reference": reference},
+            callbacks=callbacks,
+        )
+        return {"score": result["score"]}
+
+    async def _aevaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        callbacks: Callbacks = None,
+        **kwargs: Any,
+    ) -> dict:
+        """
+        Asynchronously evaluate the string distance between the
+        prediction and the reference.
+
+        Args:
+            prediction (str): The prediction string.
+            reference (Optional[str], optional): The reference string.
+            input (Optional[str], optional): The input string.
+            callbacks (Callbacks, optional): The callbacks to use.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            dict: The evaluation results containing the score.
+        """
+        result = await self.acall(
+            inputs={"prediction": prediction, "reference": reference},
+            callbacks=callbacks,
+        )
+        return {"score": result["score"]}
+
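The chain above can also be driven directly, without any LLM; a short sketch, again assuming rapidfuzz is installed:

.. code-block:: python

    from langchain.evaluation.string_distance.base import (
        StringDistance,
        StringDistanceEvalChain,
    )

    chain = StringDistanceEvalChain(distance=StringDistance.LEVENSHTEIN)
    result = chain.evaluate_strings(
        prediction="Hello, world!",
        reference="Hello world!",
    )
    print(result["score"])  # 1 -- a single deletion (the comma)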
+ """ + result = await self.acall( + inputs={"prediction": prediction, "reference": reference}, + callbacks=callbacks, + ) + return {"score": result["score"]} + + +class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvaluator): + """Compute string edit distances between two predictions.""" + + @property + def input_keys(self) -> List[str]: + """ + Get the input keys. + + Returns: + List[str]: The input keys. + """ + return ["prediction", "prediction_b"] + + def _call( + self, + inputs: Dict[str, Any], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> Dict[str, Any]: + """ + Compute the string distance between two predictions. + + Args: + inputs (Dict[str, Any]): The input values. + run_manager (CallbackManagerForChainRun , optional): + The callback manager. + + Returns: + Dict[str, Any]: The evaluation results containing the score. + """ + return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])} + + async def _acall( + self, + inputs: Dict[str, Any], + run_manager: Optional[AsyncCallbackManagerForChainRun] = None, + ) -> Dict[str, Any]: + """ + Asynchronously compute the string distance between two predictions. + + Args: + inputs (Dict[str, Any]): The input values. + run_manager (AsyncCallbackManagerForChainRun , optional): + The callback manager. + + Returns: + Dict[str, Any]: The evaluation results containing the score. + """ + return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])} + + def _evaluate_string_pairs( + self, + *, + prediction: str, + prediction_b: str, + callbacks: Callbacks = None, + tags: Optional[List[str]] = None, + metadata: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> dict: + """ + Evaluate the string distance between two predictions. + + Args: + prediction (str): The first prediction string. + prediction_b (str): The second prediction string. + callbacks (Callbacks, optional): The callbacks to use. + tags (List[str], optional): Tags to apply to traces. + metadata (Dict[str, Any], optional): Metadata to apply to traces. + **kwargs: Additional keyword arguments. + + Returns: + dict: The evaluation results containing the score. + """ + result = self( + inputs={"prediction": prediction, "prediction_b": prediction_b}, + callbacks=callbacks, + tags=tags, + metadata=metadata, + ) + return {"score": result["score"]} + + async def _aevaluate_string_pairs( + self, + *, + prediction: str, + prediction_b: str, + callbacks: Callbacks = None, + tags: Optional[List[str]] = None, + metadata: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> dict: + """ + Asynchronously evaluate the string distance between two predictions. + + Args: + prediction (str): The first prediction string. + prediction_b (str): The second prediction string. + callbacks (Callbacks, optional): The callbacks to use. + tags (List[str], optional): Tags to apply to traces. + metadata (Dict[str, Any], optional): Metadata to apply to traces. + **kwargs: Additional keyword arguments. + + Returns: + dict: The evaluation results containing the score. 
+ """ + result = await self.acall( + inputs={"prediction": prediction, "prediction_b": prediction_b}, + callbacks=callbacks, + tags=tags, + metadata=metadata, + ) + return {"score": result["score"]} diff --git a/poetry.lock b/poetry.lock index 894fddb67cb..5ff55b38c1a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -8920,6 +8920,111 @@ packaging = "*" [package.extras] test = ["pytest (>=6,!=7.0.0,!=7.0.1)", "pytest-cov (>=3.0.0)", "pytest-qt"] +[[package]] +name = "rapidfuzz" +version = "3.1.1" +description = "rapid fuzzy string matching" +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "rapidfuzz-3.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:17e4cbe6632aae7c35101c4b7c498e83f6eacf61be0def4ff98167df30dc69ca"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:167dbce2da6bb5b73d43e53434c5a9d7d1214b658b315420e44044782f4c482b"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdee4f4d04761ce167538adbefa01a64e7cab949d89aa09df39ef0d5e859fb2a"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e77ed7d0bd8d9be530c462c921904ada8d3417671eed749784c5a315af334d"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fdd2ab5ab56fcaf839a9f58caa8756dbfeba0b3dc187850b763d0a1e6ee9c97a"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0843c53d54d5b7d6122d8f1d7574d8c91a7aacc5c316f74d6e33d98aec82949d"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b3e953dcef0302eeb4fe8c7c4907e50d175199fc07da05ad6bd1d8d141ff138"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec5523d5c08c639cd4e301d42f3ad7c6fb061a1f1cd6b5b627e59af345edfed7"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b4995792e106c3f1ab6f56dd6089918b065888e2e55a71e3fea8d0f66bf30989"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:cdbf9a76ea47f14026daaed43a2c2150ab0e9a4d5396909f028380f33e61c522"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f25d1975e846d07990cf946a5927a932aa7cccd308ae9979b03a58ff1cd80087"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e0755f5ac6c3d1dc2505eb2e6eaf5508ff17b42c084406714fbabf2d50d098b6"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:de784bbe06d32e66617cd20766c37aae2438902d54b3fa608d2e0a929ca705f4"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-win32.whl", hash = "sha256:ef6c38040d868dcc0132fad377aafeb5b2da71354759e77f41ae599316df2dee"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:7c74fde444bcd13ef3a803c578b28f33b4f9edf368f46ca3de57fda456065967"}, + {file = "rapidfuzz-3.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:e549da8d68ad4ee385c918ea8b9efeda875df9edf6c6b48df927bd061c00bfef"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:58ca539cc6ce385d650138a9b1908b05622c2dd08a23d5aea4890523ef3774d5"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91946c496e6f380939dbea14ff6ce6de87480445c09d03964f5374101462594b"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f2024f83a9300440e845b441e71726471f7567021c1d80796ca02e71c5f0dc2"}, + {file = 
"rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17b017f9e1b88dfd6d9b03170ef8e86477de0d9d37fbfcbe72ca070cacbe1b65"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e6772eb7cc4429f1eae5a9b41e5b0b1af8f0d50727c6e338d9ad5bceee01da5a"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c089ce856919e03f4dd8f9168d60ac580d30cd0451fd60dcdef73010eca68973"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f2cd9a3760080876fc59edb26926e51d6db44dea65e85f1eb04aa5f58c3bc41"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f32791ee045a7b3d6a56208a55d996d5f7a32fdb688f5c5ee899cb7589539eb"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:68d910048b36613701ea671de68f701e2c1ba2839295238def840ff1fc1b15f4"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6f767d4823002e65c06ea273f952fda2b88775e1c2d508564f04d32cdd7f65b2"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:10313075642a9f1f948d356f4f0803ae28a496d7967b466b9cae1a4be8aa4df3"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:1465ea085154378e69bf4bc5e27bdac5c94684416882ace31865232adc9239a2"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:53e3c588e7ea158fa80095dd0ff53f49e2ede9a8d71a3a5b964ca045d845a9b9"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-win32.whl", hash = "sha256:cb08db5c122fea4196483b82f7596e50ef9cab1770f7696c197bf0815ac4dd17"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:b7c65112c87568274d399ad7a62902cef17801c2bd047b162e79e43758b3ce27"}, + {file = "rapidfuzz-3.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:ea3e46a534de97a6cad2018cb950492a0fcacad380e35440ce3c1c8fef96a261"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a8bb256b34fcad4f3fa00be6b57fe35bcb54f031911195929145c67d9738ffec"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51f21f37aec6bc117e9083181ddc3cbbcbf56b6506492b128d8e836d3545ca80"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a371846f45ed9d24927a8d5222884536c1e171543396b36250fafb2e848bc92"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:25eea5c8006b6c8747ca204675c9e939f3c4d27167fb43b2aa211443d34f9abd"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:db5e71e5a810d2f1163c914e01b3ba241409a98286ac4850ff26076115ae401b"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c07e16ab38e717931319cff1340debbf2ef940a1cda4eb70e323079b62df306"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:aadc5a8b9859737a8f87831215b7fab0c04afeb960bb987c528421a4e6dfb8b6"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:0de229cb613be060580c71c1674acbde57921c7ed33d7a726e071a2562924113"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:b1bf8aba99b267aad0a01dfb44ee39803676007724abcfb72129c350476b2341"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = 
"sha256:d3264e4a02e4148e30078104fb0c1b6c8eb166ddc5ebe843a22433f58f87dc47"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:712331c1c70c79a219c2ac233b4e25e75ffad51042840d147d5e94519c7d8a1a"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-win32.whl", hash = "sha256:6ede2d42ad55bd4e7a3394e98c5f58ddace78775493391732d32be61268a4116"}, + {file = "rapidfuzz-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:32a5c47b5153f25eb512dbb91f9850225d2dcfb3404a1c48406726c7732b0726"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:51bb8f7aa4fe45618e75cdccf08491c752a7f137ffbf7d3afd1809791ac8c326"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:788fb03c5acb5b48f5f918f4cbb5dc072498becf018c64e7e27d6b76e63e68b8"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dc7f25e20781c8d42e813516ee4ff9043ecce4a8e25fc94ee6732a83d81c1c99"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4a751f216fd1222a4a8c7ceff5180872a156202c3bdca1b337e5a5b09298dfd"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83b48b789f2da1688882cba595c40179194ab15ec17ea1d4c9de9ee239649904"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a6f5cd9f1282da49b8d0747c40f3fea2d64ab5e4c2cc2295baf87ff7a0d062"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5fe8054c244bf63be2380efc275edd86da3a706460d42911dc3ff914f3260a5"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d4d509e9aa011e1be5e4da7c5062dc4fc3688714687110536925980b3d03ac6"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ccc1b5b467766110085c80bb9311d233fccc8ed1ce965aebba3125e1bab04cba"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7e181411958d04d5b437a0981e87815e8f1b1909f5ae0e339246d3bc464f53e7"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:c53cf36cdb10819b7154fefdbffbef442ba567d9c1ca74a7e76fd759ace45e6c"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:851b44130393139cb336aa54c681d595d75a3160b7be330f3acc0c3b9dabce70"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49d900da023eeb3bfbe9feee126312eb9fd0458129aa5a581e4d8d8bf4483d14"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-win32.whl", hash = "sha256:6c0e96821029c46847df4ff266ea283a2b6163a4f76a4567f9986934e9c4410c"}, + {file = "rapidfuzz-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:7af18372f576e36e93f4662bdf64043ac23dfa02d7f768d7e7e1d0211bb9cb35"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8b966344ed4122a71ab8ccdca2954db1ce0d8049cb9bcac58db07558f9d9ec32"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a293370448f2e46fdc6e086ac99923015bdc53973a65d3df35aefc685e1a5809"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:351d253fdee62d6d0e80c75f0505accc1ce8cc73a50779c60986ef21c92f20f9"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e951c874a0e5b375b2af9b5f264eefc679c0685c166ee0641e703ef0795509b"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4019def8a18bc867ac61f08a542bf474a7a9b3f662f5d5cd169c9135866562f5"}, + 
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:086a2d84c2e497e3ab160ccf164e319bca874d9383d008fcadf91ede8ac7997f"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6d4da453fbd8793ebb11bed396f8a4b9041d6227bf055903447305dd7942312f"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10f56af1d46fbeaaa0dc50901c2dc439c7a455cfdac2f1acf6cffeb65ae82c48"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7726f67e4a0b2b4392f03aa62e16b12a697156c6735df27b21bd3ab561b01659"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d72916d27fb88741bfb576b0b0639354ca00f5e91046171c985262c68a86bbb5"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8c85bb6946fb02231d1e60ab45c36ecee04ecf7f725e094f5beee798b6b7d36d"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:fb7049dff52cded65184a3d2ff45cfd226bff7314f49a8f4b83f943eea9181a7"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:408007b4bc5a0a0cb9bfcdcc8cffa9b71fec6ee53ccdf9c26b57539f7e264ab5"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-win32.whl", hash = "sha256:9dc7154889937ca5a004d17f62b4798e0af52f69c38eb3112dbdb52b006d4419"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:16c506bac2e0a6f6581b334a7802c2f0d8343ec1d77e5cf9452c33d6219abef8"}, + {file = "rapidfuzz-3.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:5e11e11880951e767342b56627ab2dc9d3ef90e2605b656e9b5e6e0beadaaf0f"}, + {file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a8b8f32463781e4703965c9cf7a609a19a74478f332e0d62cd9d0e7a9db91321"}, + {file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b408ac3c7f8c3414bfd5c6044ca4bb385b390bcf5eae3ad884cef48628c131ae"}, + {file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ff1a517de2b1e80ddf1a3037a6ebca9925154c1af70751518d50d5c332e1ec8"}, + {file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1e23665be5918f979180130babedab9317fbb34cdae237c7defad7e86bc684e"}, + {file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:15260263a0c7bffac934a53b6622d77e06e10929ee4d2e62ac6f70c13988f351"}, + {file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f7acc5c9c7cf567372de5b6c817f93db508e7b9bd7f29bd6187df8d2cc60ced5"}, + {file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79f5a3ab7ff6c46336f38690f0564bc7689cefa180257ed9078c42f75b10c9d2"}, + {file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:362e366e79fcc9a8866b41f20ef4d2987a06f8b134096e659594c059aa8a6d88"}, + {file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:819d9317c3d86b508d87ab1bca5867f3abc18b902c822bc57366ccc6330a030b"}, + {file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4a64ddfb7084b678da7778c1263aee2baae5a2ca55ec5589a022defc38103eb1"}, + {file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8243bb4bb4db7c3501932ced6a978b284e19c3619b6802455e47bfd0905adb81"}, + {file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:39c7d0dbd77a7f28ff85a1dff2afb2ed73e5cd81cca3f654450ed339a271c0ab"}, + {file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a4afab735bb0ac3ec9bafcc35376ed336d26af6140c4d81e4c869e77df77ecd5"}, + {file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69d503a7641b5a63aa53c7aca0b857d38f48cd7bae39f8563679b324e3d2d47a"}, + {file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ef3ad80458e47723812976a2ea1282ff207ad20e6cb19da1917f76699bd5aaa5"}, + {file = "rapidfuzz-3.1.1.tar.gz", hash = "sha256:a06a08be3cb7d7df7993dd16e84aaf59bd5a7ff98a9f1b3e893d18b273a71c64"}, +] + +[package.extras] +full = ["numpy"] + [[package]] name = "ratelimiter" version = "1.2.0.post0" @@ -12410,7 +12515,7 @@ clarifai = ["clarifai"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "requests-toolbelt", "scikit-learn", "streamlit", "telethon", "tqdm", "zep-python"] +extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "telethon", "tqdm", "zep-python"] javascript = ["esprima"] llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] @@ -12420,4 +12525,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "cc95f4e0d4bee4ba19cf539be5ffd81f1ddb33229ace936ef3b6cbd4122493ca" +content-hash = "6e2acbd4f760e92454f9f9e29840679fbd59b8662a99bcb89e2251a5b8736e6d" diff --git a/pyproject.toml b/pyproject.toml index e8ec961273b..f711152d9db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,6 +116,7 @@ streamlit = {version = "^1.18.0", optional = true, python = ">=3.8.1,<3.9.7 || > psychicapi = {version = "^0.8.0", optional = true} cassio = {version = "^0.0.7", optional = true} rdflib = {version = "^6.3.2", optional = true} +rapidfuzz = {version = "^3.1.1", optional = true} [tool.poetry.group.docs.dependencies] autodoc_pydantic = "^1.8.0" @@ -346,7 +347,8 @@ extended_testing = [ "scikit-learn", "streamlit", "pyspark", - "openai" + "openai", + "rapidfuzz" ] [[tool.poetry.source]] diff --git a/tests/integration_tests/evaluation/embedding_distance/test_embedding.py b/tests/integration_tests/evaluation/embedding_distance/test_embedding.py new file mode 100644 index 00000000000..b85c3fae7ff --- /dev/null +++ b/tests/integration_tests/evaluation/embedding_distance/test_embedding.py @@ -0,0 +1,123 @@ +from typing import Tuple + +import numpy as np +import pytest + +from langchain.evaluation.embedding_distance import ( + EmbeddingDistance, + PairwiseEmbeddingDistanceEvalChain, +) + + +@pytest.fixture +def vectors() -> Tuple[np.ndarray, np.ndarray]: + """Create two random vectors.""" + vector_a = np.array( + [ + 0.5488135, + 0.71518937, + 0.60276338, + 0.54488318, + 0.4236548, + 0.64589411, + 0.43758721, + 0.891773, + 0.96366276, + 0.38344152, + ] + 
diff --git a/tests/integration_tests/evaluation/embedding_distance/test_embedding.py b/tests/integration_tests/evaluation/embedding_distance/test_embedding.py
new file mode 100644
index 00000000000..b85c3fae7ff
--- /dev/null
+++ b/tests/integration_tests/evaluation/embedding_distance/test_embedding.py
@@ -0,0 +1,123 @@
+from typing import Tuple
+
+import numpy as np
+import pytest
+
+from langchain.evaluation.embedding_distance import (
+    EmbeddingDistance,
+    PairwiseEmbeddingDistanceEvalChain,
+)
+
+
+@pytest.fixture
+def vectors() -> Tuple[np.ndarray, np.ndarray]:
+    """Create two random vectors."""
+    vector_a = np.array(
+        [
+            0.5488135,
+            0.71518937,
+            0.60276338,
+            0.54488318,
+            0.4236548,
+            0.64589411,
+            0.43758721,
+            0.891773,
+            0.96366276,
+            0.38344152,
+        ]
+    )
+    vector_b = np.array(
+        [
+            0.79172504,
+            0.52889492,
+            0.56804456,
+            0.92559664,
+            0.07103606,
+            0.0871293,
+            0.0202184,
+            0.83261985,
+            0.77815675,
+            0.87001215,
+        ]
+    )
+    return vector_a, vector_b
+
+
+@pytest.fixture
+def chain() -> PairwiseEmbeddingDistanceEvalChain:
+    """Create a PairwiseEmbeddingDistanceEvalChain."""
+    return PairwiseEmbeddingDistanceEvalChain()
+
+
+@pytest.mark.requires("scipy")
+def test_cosine_similarity(
+    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
+) -> None:
+    """Test the cosine similarity."""
+    chain.distance_metric = EmbeddingDistance.COSINE
+    result = chain._compute_score(np.array(vectors))
+    expected = 1.0 - np.dot(vectors[0], vectors[1]) / (
+        np.linalg.norm(vectors[0]) * np.linalg.norm(vectors[1])
+    )
+    assert np.isclose(result, expected)
+
+
+@pytest.mark.requires("scipy")
+def test_euclidean_distance(
+    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
+) -> None:
+    """Test the euclidean distance."""
+    from scipy.spatial.distance import euclidean
+
+    chain.distance_metric = EmbeddingDistance.EUCLIDEAN
+    result = chain._compute_score(np.array(vectors))
+    expected = euclidean(*vectors)
+    assert np.isclose(result, expected)
+
+
+@pytest.mark.requires("scipy")
+def test_manhattan_distance(
+    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
+) -> None:
+    """Test the manhattan distance."""
+    from scipy.spatial.distance import cityblock
+
+    chain.distance_metric = EmbeddingDistance.MANHATTAN
+    result = chain._compute_score(np.array(vectors))
+    expected = cityblock(*vectors)
+    assert np.isclose(result, expected)
+
+
+@pytest.mark.requires("scipy")
+def test_chebyshev_distance(
+    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
+) -> None:
+    """Test the chebyshev distance."""
+    from scipy.spatial.distance import chebyshev
+
+    chain.distance_metric = EmbeddingDistance.CHEBYSHEV
+    result = chain._compute_score(np.array(vectors))
+    expected = chebyshev(*vectors)
+    assert np.isclose(result, expected)
+
+
+@pytest.mark.requires("scipy")
+def test_hamming_distance(
+    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
+) -> None:
+    """Test the hamming distance."""
+    from scipy.spatial.distance import hamming
+
+    chain.distance_metric = EmbeddingDistance.HAMMING
+    result = chain._compute_score(np.array(vectors))
+    expected = hamming(*vectors)
+    assert np.isclose(result, expected)
+
+
+@pytest.mark.requires("openai", "tiktoken")
+def test_embedding_distance(chain: PairwiseEmbeddingDistanceEvalChain) -> None:
+    """Test the embedding distance."""
+    result = chain.evaluate_string_pairs(
+        prediction="A single cat", prediction_b="A single cat"
+    )
+    assert np.isclose(result["score"], 0.0)
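The scipy-marked tests above check the private _compute_score helper against reference implementations, while the final test runs end to end through evaluate_string_pairs with the chain's default embeddings (hence the openai/tiktoken marker). A rough offline sketch of the same end-to-end path; passing FakeEmbeddings through the constructor is an assumption inferred from the loader test at the end of this patch, and since FakeEmbeddings returns random vectors the printed score is illustrative only:

    from langchain.embeddings.fake import FakeEmbeddings
    from langchain.evaluation.embedding_distance import (
        EmbeddingDistance,
        PairwiseEmbeddingDistanceEvalChain,
    )

    # Assumed: the chain accepts an embeddings override, as suggested by
    # test_loading.py below. FakeEmbeddings keeps this sketch offline; a real
    # Embeddings implementation would score identical strings near 0.0.
    chain = PairwiseEmbeddingDistanceEvalChain(embeddings=FakeEmbeddings(size=32))
    chain.distance_metric = EmbeddingDistance.COSINE
    result = chain.evaluate_string_pairs(
        prediction="A single cat", prediction_b="A single cat"
    )
    print(result["score"])
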
diff --git a/tests/unit_tests/evaluation/string_distance/__init__.py b/tests/unit_tests/evaluation/string_distance/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/unit_tests/evaluation/string_distance/test_base.py b/tests/unit_tests/evaluation/string_distance/test_base.py
new file mode 100644
index 00000000000..8fe35c08e43
--- /dev/null
+++ b/tests/unit_tests/evaluation/string_distance/test_base.py
@@ -0,0 +1,51 @@
+import pytest
+
+from langchain.evaluation.string_distance import (
+    PairwiseStringDistanceEvalChain,
+    StringDistance,
+    StringDistanceEvalChain,
+)
+
+
+@pytest.mark.requires("rapidfuzz")
+@pytest.mark.parametrize("distance", list(StringDistance))
+def test_zero_distance(distance: StringDistance) -> None:
+    eval_chain = StringDistanceEvalChain(distance=distance)
+    string = "三人行则必有我师"
+    result = eval_chain.evaluate_strings(prediction=string, reference=string)
+    assert "score" in result
+    assert result["score"] == 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.requires("rapidfuzz")
+@pytest.mark.parametrize("distance", list(StringDistance))
+async def test_zero_distance_async(distance: StringDistance) -> None:
+    eval_chain = StringDistanceEvalChain(distance=distance)
+    string = "三人行则必有我师"
+    result = await eval_chain.aevaluate_strings(prediction=string, reference=string)
+    assert "score" in result
+    assert result["score"] == 0
+
+
+@pytest.mark.requires("rapidfuzz")
+@pytest.mark.parametrize("distance", list(StringDistance))
+def test_zero_distance_pairwise(distance: StringDistance) -> None:
+    eval_chain = PairwiseStringDistanceEvalChain(distance=distance)
+    string = "三人行则必有我师"
+    result = eval_chain.evaluate_string_pairs(prediction=string, prediction_b=string)
+    assert "score" in result
+    assert result["score"] == 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.requires("rapidfuzz")
+@pytest.mark.parametrize("distance", list(StringDistance))
+async def test_zero_distance_pairwise_async(distance: StringDistance) -> None:
+    eval_chain = PairwiseStringDistanceEvalChain(distance=distance)
+    string = "三人行则必有我师"
+    result = await eval_chain.aevaluate_string_pairs(
+        prediction=string, prediction_b=string
+    )
+    assert "score" in result
+    assert result["score"] == 0
diff --git a/tests/unit_tests/evaluation/test_loading.py b/tests/unit_tests/evaluation/test_loading.py
index e707246fb04..20e0740d4da 100644
--- a/tests/unit_tests/evaluation/test_loading.py
+++ b/tests/unit_tests/evaluation/test_loading.py
@@ -2,20 +2,27 @@
 
 import pytest
 
+from langchain.embeddings.fake import FakeEmbeddings
 from langchain.evaluation.loading import EvaluatorType, load_evaluators
 from langchain.evaluation.schema import StringEvaluator
 from tests.unit_tests.llms.fake_chat_model import FakeChatModel
 from tests.unit_tests.llms.fake_llm import FakeLLM
 
 
+@pytest.mark.requires("rapidfuzz")
 @pytest.mark.parametrize("evaluator_type", EvaluatorType)
 def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
     """Test loading evaluators."""
     fake_llm = FakeChatModel()
-    load_evaluators([evaluator_type], llm=fake_llm)
+    embeddings = FakeEmbeddings(size=32)
+    load_evaluators([evaluator_type], llm=fake_llm, embeddings=embeddings)
 
     # Test as string
-    load_evaluators([evaluator_type.value], llm=fake_llm)  # type: ignore
+    load_evaluators(
+        [evaluator_type.value],  # type: ignore
+        llm=fake_llm,
+        embeddings=embeddings,
+    )
 
 
 def test_criteria_eval_chain_requires_reference() -> None:
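Since test_load_evaluators round-trips every EvaluatorType, the new evaluators should also be loadable by type rather than by constructing the chains directly. A sketch under the assumption that this PR's loading.py changes (not shown in this excerpt) register the members as STRING_DISTANCE and PAIRWISE_STRING_DISTANCE and that load_evaluators returns the chains in request order:

    from langchain.evaluation.loading import EvaluatorType, load_evaluators

    # Assumed member names, inferred from the string_distance module added here.
    string_eval, pairwise_eval = load_evaluators(
        [EvaluatorType.STRING_DISTANCE, EvaluatorType.PAIRWISE_STRING_DISTANCE]
    )
    # Identical inputs should yield a score of 0 for both evaluators.
    print(string_eval.evaluate_strings(prediction="hello", reference="hello"))
    print(pairwise_eval.evaluate_string_pairs(prediction="hello", prediction_b="hello"))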