Add Exact match and Regex Match Evaluators (#11132)

commit 33da8bd711 (parent e355606b11)
mirror of https://github.com/hwchase17/langchain.git (synced 2025-07-01 19:03:25 +00:00)
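In short, this commit adds two reference-based string evaluators, exact_match and regex_match, and registers them with load_evaluator. A minimal sketch of the resulting API, condensed from the notebooks and tests below (scores are 1 for a match, 0 otherwise):

    from langchain.evaluation import load_evaluator

    # Exact string equivalence, optionally normalized (see exact_match/base.py below).
    exact = load_evaluator("exact_match", ignore_case=True)
    exact.evaluate_strings(prediction="LangChain", reference="langchain")  # {'score': 1}

    # Regex matching of the prediction against a reference pattern.
    regex = load_evaluator("regex_match")
    regex.evaluate_strings(
        prediction="The delivery will be made on 2024-01-05",
        reference=".*\\b\\d{4}-\\d{2}-\\d{2}\\b.*",
    )  # {'score': 1}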
docs/extras/guides/evaluation/comparison/custom.ipynb:
@@ -6,6 +6,7 @@
 "metadata": {},
 "source": [
 "# Custom Pairwise Evaluator\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/comparison/custom.ipynb)\n",
 "\n",
 "You can make your own pairwise string evaluators by inheriting from the `PairwiseStringEvaluator` class and overwriting the `_evaluate_string_pairs` method (and the `_aevaluate_string_pairs` method if you want to use the evaluator asynchronously).\n",
 "\n",
docs/extras/guides/evaluation/comparison/pairwise_embedding_distance.ipynb:
@@ -8,6 +8,7 @@
 },
 "source": [
 "# Pairwise Embedding Distance \n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/comparison/pairwise_embedding_distance.ipynb)\n",
 "\n",
 "One way to measure the similarity (or dissimilarity) between two predictions on a shared or similar input is to embed the predictions and compute a vector distance between the two embeddings.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
 "\n",
docs/extras/guides/evaluation/comparison/pairwise_string.ipynb:
@@ -6,6 +6,7 @@
 "metadata": {},
 "source": [
 "# Pairwise String Comparison\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/comparison/pairwise_string.ipynb)\n",
 "\n",
 "Often you will want to compare predictions of an LLM, Chain, or Agent for a given input. The `StringComparison` evaluators facilitate this so you can answer questions like:\n",
 "\n",
docs/extras/guides/evaluation/examples/comparisons.ipynb:
@@ -5,6 +5,7 @@
 "metadata": {},
 "source": [
 "# Comparing Chain Outputs\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/examples/comparisons.ipynb)\n",
 "\n",
 "Suppose you have two different prompts (or LLMs). How do you know which will generate \"better\" results?\n",
 "\n",
(file deleted, 318 lines)
@@ -1,318 +0,0 @@
-{
-"cells": [
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "bce7335e-f3b2-44f3-90cc-8c0a23a89a21",
-"metadata": {},
-"outputs": [],
-"source": [
-"import os\n",
-"from langchain.agents import load_tools\n",
-"from langchain.agents import initialize_agent\n",
-"from langchain.chat_models import ChatOpenAI\n",
-"from langchain.utilities import GoogleSearchAPIWrapper\n",
-"from langchain.schema import (\n",
-" SystemMessage,\n",
-" HumanMessage,\n",
-" AIMessage\n",
-")\n",
-"\n",
-"# os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
-"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
-"# os.environ[\"LANGCHAIN_API_KEY\"] = \"******\"\n",
-"# os.environ[\"LANGCHAIN_PROJECT\"] = \"Jarvis\"\n",
-"\n",
-"\n",
-"prefix_messages = [{\"role\": \"system\", \"content\": \"You are a helpful discord Chatbot.\"}]\n",
-"\n",
-"llm = ChatOpenAI(model_name='gpt-3.5-turbo', \n",
-" temperature=0.5, \n",
-" max_tokens = 2000)\n",
-"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
-"agent = initialize_agent(tools,\n",
-" llm,\n",
-" agent=\"zero-shot-react-description\",\n",
-" verbose=True,\n",
-" handle_parsing_errors=True\n",
-" )\n",
-"\n",
-"\n",
-"async def on_ready():\n",
-" print(f'{bot.user} has connected to Discord!')\n",
-"\n",
-"async def on_message(message):\n",
-"\n",
-" print(\"Detected bot name in message:\", message.content)\n",
-"\n",
-" # Capture the output of agent.run() in the response variable\n",
-" response = agent.run(message.content)\n",
-"\n",
-" while response:\n",
-" print(response)\n",
-" chunk, response = response[:2000], response[2000:]\n",
-" print(f\"Chunk: {chunk}\")\n",
-" print(\"Response sent.\")\n",
-"\n"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 22,
-"id": "1551ce9f-b6de-4035-b6d6-825722823b48",
-"metadata": {
-"tags": []
-},
-"outputs": [],
-"source": [
-"from dataclasses import dataclass\n",
-"@dataclass\n",
-"class Message:\n",
-" content: str"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 23,
-"id": "6e6859ec-8544-4407-9663-6b53c0092903",
-"metadata": {
-"tags": []
-},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Detected bot name in message: Hi AI, how are you today?\n",
-"\n",
-"\n",
-"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
-"\u001b[32;1m\u001b[1;3mThis question is not something that can be answered using the available tools.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
-"Action: N/A\u001b[0m\n",
-"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
-"Thought:\u001b[32;1m\u001b[1;3m\u001b[0m\n",
-"\n",
-"\u001b[1m> Finished chain.\u001b[0m\n",
-"Agent stopped due to iteration limit or time limit.\n",
-"Chunk: Agent stopped due to iteration limit or time limit.\n",
-"Response sent.\n"
-]
-}
-],
-"source": [
-"await on_message(Message(content=\"Hi AI, how are you today?\"))"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 24,
-"id": "b850294c-7f8f-4e79-adcf-47e4e3a898df",
-"metadata": {
-"tags": []
-},
-"outputs": [],
-"source": [
-"from langsmith import Client\n",
-"\n",
-"client = Client()"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 25,
-"id": "6d089ddc-69bc-45a8-b8db-9962e4f1f5ee",
-"metadata": {
-"tags": []
-},
-"outputs": [],
-"source": [
-"from itertools import islice\n",
-"\n",
-"runs = list(islice(client.list_runs(), 10))"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 38,
-"id": "f0349fac-5a98-400f-ba03-61ed4e1332be",
-"metadata": {
-"tags": []
-},
-"outputs": [],
-"source": [
-"runs = sorted(runs, key=lambda x: x.start_time, reverse=True)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 26,
-"id": "02f133f0-39ee-4b46-b443-12c1f9b76fff",
-"metadata": {
-"tags": []
-},
-"outputs": [],
-"source": [
-"ids = [run.id for run in runs]"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 39,
-"id": "3366dce4-0c38-4a7d-8111-046a58b24917",
-"metadata": {
-"tags": []
-},
-"outputs": [],
-"source": [
-"runs2 = list(client.list_runs(id=ids))\n",
-"runs2 = sorted(runs2, key=lambda x: x.start_time, reverse=True)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 42,
-"id": "82915b90-39a0-47d6-9121-56a13f210f52",
-"metadata": {
-"tags": []
-},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"['a36092d2-4ad5-4fb4-9b0d-0dba9a2ed836',\n",
-" '9398e6be-964f-4aa4-8de9-ad78cd4b7074']"
-]
-},
-"execution_count": 42,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"[str(x) for x in ids[:2]]"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 48,
-"id": "f610ec91-dc48-4a17-91c5-5c4675c77abc",
-"metadata": {
-"tags": []
-},
-"outputs": [],
-"source": [
-"from langsmith.run_helpers import traceable\n",
-"\n",
-"@traceable(run_type=\"llm\", name=\"\"\"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/dQw4w9WgXcQ?start=5\" title=\"YouTube video player\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" allowfullscreen></iframe>\"\"\")\n",
-"def foo():\n",
-" return \"bar\""
-]
-},
-{
-"cell_type": "code",
-"execution_count": 49,
-"id": "bd317bd7-8b2a-433a-8ec3-098a84ba8e64",
-"metadata": {
-"tags": []
-},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"'bar'"
-]
-},
-"execution_count": 49,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"foo()"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 52,
-"id": "b142519b-6885-415c-83b9-4a346fb90589",
-"metadata": {
-"tags": []
-},
-"outputs": [],
-"source": [
-"from langchain.llms import AzureOpenAI"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "5c50bb2b-72b8-4322-9b16-d857ecd9f347",
-"metadata": {},
-"outputs": [],
-"source": []
-}
-],
-"metadata": {
-"kernelspec": {
-"display_name": "Python 3 (ipykernel)",
-"language": "python",
-"name": "python3"
-},
-"language_info": {
-"codemirror_mode": {
-"name": "ipython",
-"version": 3
-},
-"file_extension": ".py",
-"mimetype": "text/x-python",
-"name": "python",
-"nbconvert_exporter": "python",
-"pygments_lexer": "ipython3",
-"version": "3.11.2"
-}
-},
-"nbformat": 4,
-"nbformat_minor": 5
-}
docs/extras/guides/evaluation/string/criteria_eval_chain.ipynb:
@@ -6,6 +6,7 @@
 "metadata": {},
 "source": [
 "# Criteria Evaluation\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/criteria_eval_chain.ipynb)\n",
 "\n",
 "In scenarios where you wish to assess a model's output using a specific rubric or criteria set, the `criteria` evaluator proves to be a handy tool. It allows you to verify if an LLM or Chain's output complies with a defined set of criteria.\n",
 "\n",
docs/extras/guides/evaluation/string/custom.ipynb:
@@ -6,6 +6,7 @@
 "metadata": {},
 "source": [
 "# Custom String Evaluator\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/custom.ipynb)\n",
 "\n",
 "You can make your own custom string evaluators by inheriting from the `StringEvaluator` class and implementing the `_evaluate_strings` (and `_aevaluate_strings` for async support) methods.\n",
 "\n",
docs/extras/guides/evaluation/string/embedding_distance.ipynb:
@@ -7,6 +7,7 @@
 },
 "source": [
 "# Embedding Distance\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/embedding_distance.ipynb)\n",
 "\n",
 "To measure semantic similarity (or dissimilarity) between a prediction and a reference label string, you could use a vector distance metric to compare the two embedded representations using the `embedding_distance` evaluator.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
 "\n",
docs/extras/guides/evaluation/string/exact_match.ipynb (new file, 175 lines)
@@ -0,0 +1,175 @@
+{
+"cells": [
+{
+"cell_type": "markdown",
+"id": "2da95378",
+"metadata": {},
+"source": [
+"# Exact Match\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/exact_match.ipynb)\n",
+"\n",
+"Probably the simplest way to evaluate an LLM or runnable's string output against a reference label is by simple string equivalence.\n",
+"\n",
+"This can be accessed using the `exact_match` evaluator."
+]
+},
+{
+"cell_type": "code",
+"execution_count": 1,
+"id": "0de44d01-1fea-4701-b941-c4fb74e521e7",
+"metadata": {},
+"outputs": [],
+"source": [
+"from langchain.evaluation import ExactMatchStringEvaluator\n",
+"\n",
+"evaluator = ExactMatchStringEvaluator()"
+]
+},
+{
+"cell_type": "markdown",
+"id": "fe3baf5f-bfee-4745-bcd6-1a9b422ed46f",
+"metadata": {},
+"source": [
+"Alternatively via the loader:"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
+"id": "f6790c46",
+"metadata": {
+"tags": []
+},
+"outputs": [],
+"source": [
+"from langchain.evaluation import load_evaluator\n",
+"\n",
+"evaluator = load_evaluator(\"exact_match\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 3,
+"id": "49ad9139",
+"metadata": {
+"tags": []
+},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'score': 0}"
+]
+},
+"execution_count": 3,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"evaluator.evaluate_strings(\n",
+" prediction=\"1 LLM.\",\n",
+" reference=\"2 llm\",\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 4,
+"id": "1f5e82a3-247e-45a8-85fc-6af53bf7ff82",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'score': 0}"
+]
+},
+"execution_count": 4,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"evaluator.evaluate_strings(\n",
+" prediction=\"LangChain\",\n",
+" reference=\"langchain\",\n",
+")"
+]
+},
+{
+"cell_type": "markdown",
+"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
+"metadata": {},
+"source": [
+"## Configure the ExactMatchStringEvaluator\n",
+"\n",
+"You can relax the \"exactness\" when comparing strings."
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
+"metadata": {
+"tags": []
+},
+"outputs": [],
+"source": [
+"evaluator = ExactMatchStringEvaluator(\n",
+" ignore_case=True,\n",
+" ignore_numbers=True,\n",
+" ignore_punctuation=True,\n",
+")\n",
+"\n",
+"# Alternatively\n",
+"# evaluator = load_evaluator(\"exact_match\", ignore_case=True, ignore_numbers=True, ignore_punctuation=True)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 6,
+"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'score': 1}"
+]
+},
+"execution_count": 6,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"evaluator.evaluate_strings(\n",
+" prediction=\"1 LLM.\",\n",
+" reference=\"2 llm\",\n",
+")"
+]
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3 (ipykernel)",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.11.2"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}
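For reference, the `ignore_*` options above amount to plain string normalization before comparison. A sketch equivalent to the implementation added in libs/langchain/langchain/evaluation/exact_match/base.py later in this diff (the helper name is illustrative, not part of the library):

    import string

    def normalized_exact_match(prediction: str, reference: str) -> int:
        # Mirrors ExactMatchStringEvaluator with ignore_case, ignore_punctuation,
        # and ignore_numbers all enabled: strip punctuation and digits, then
        # compare case-insensitively.
        for chars in (string.punctuation, string.digits):
            table = str.maketrans("", "", chars)
            prediction = prediction.translate(table)
            reference = reference.translate(table)
        return int(prediction.lower() == reference.lower())

    normalized_exact_match("1 LLM.", "2 llm")  # 1, matching the notebook output above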
docs/extras/guides/evaluation/string/regex_match.ipynb (new file, 243 lines)
@@ -0,0 +1,243 @@
+{
+"cells": [
+{
+"cell_type": "markdown",
+"id": "2da95378",
+"metadata": {},
+"source": [
+"# Regex Match\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/regex_match.ipynb)\n",
+"\n",
+"To evaluate chain or runnable string predictions against a custom regex, you can use the `regex_match` evaluator."
+]
+},
+{
+"cell_type": "code",
+"execution_count": 1,
+"id": "0de44d01-1fea-4701-b941-c4fb74e521e7",
+"metadata": {},
+"outputs": [],
+"source": [
+"from langchain.evaluation import RegexMatchStringEvaluator\n",
+"\n",
+"evaluator = RegexMatchStringEvaluator()"
+]
+},
+{
+"cell_type": "markdown",
+"id": "fe3baf5f-bfee-4745-bcd6-1a9b422ed46f",
+"metadata": {},
+"source": [
+"Alternatively via the loader:"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
+"id": "f6790c46",
+"metadata": {
+"tags": []
+},
+"outputs": [],
+"source": [
+"from langchain.evaluation import load_evaluator\n",
+"\n",
+"evaluator = load_evaluator(\"regex_match\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 3,
+"id": "49ad9139",
+"metadata": {
+"tags": []
+},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'score': 1}"
+]
+},
+"execution_count": 3,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"# Check for the presence of a YYYY-MM-DD string.\n",
+"evaluator.evaluate_strings(\n",
+" prediction=\"The delivery will be made on 2024-01-05\",\n",
+" reference=\".*\\\\b\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\b.*\"\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 4,
+"id": "1f5e82a3-247e-45a8-85fc-6af53bf7ff82",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'score': 0}"
+]
+},
+"execution_count": 4,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"# Check for the presence of a MM-DD-YYYY string.\n",
+"evaluator.evaluate_strings(\n",
+" prediction=\"The delivery will be made on 2024-01-05\",\n",
+" reference=\".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"id": "168fcd92-dffb-4345-b097-02d0fedf52fd",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'score': 1}"
+]
+},
+"execution_count": 5,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"# Check for the presence of a MM-DD-YYYY string.\n",
+"evaluator.evaluate_strings(\n",
+" prediction=\"The delivery will be made on 01-05-2024\",\n",
+" reference=\".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"\n",
+")"
+]
+},
+{
+"cell_type": "markdown",
+"id": "1d82dab5-6a49-4fe7-b3fb-8bcfb27d26e0",
+"metadata": {},
+"source": [
+"## Match against multiple patterns\n",
+"\n",
+"To match against multiple patterns, use a regex union \"|\"."
+]
+},
+{
+"cell_type": "code",
+"execution_count": 6,
+"id": "b87b915e-b7c2-476b-a452-99688a22293a",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'score': 1}"
+]
+},
+"execution_count": 6,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"# Check for the presence of a MM-DD-YYYY string or YYYY-MM-DD\n",
+"evaluator.evaluate_strings(\n",
+" prediction=\"The delivery will be made on 01-05-2024\",\n",
+" reference=\"|\".join([\".*\\\\b\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\b.*\", \".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"])\n",
+")"
+]
+},
+{
+"cell_type": "markdown",
+"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
+"metadata": {},
+"source": [
+"## Configure the RegexMatchStringEvaluator\n",
+"\n",
+"You can specify any regex flags to use when matching."
+]
+},
+{
+"cell_type": "code",
+"execution_count": 7,
+"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
+"metadata": {
+"tags": []
+},
+"outputs": [],
+"source": [
+"import re\n",
+"\n",
+"evaluator = RegexMatchStringEvaluator(\n",
+" flags=re.IGNORECASE\n",
+")\n",
+"\n",
+"# Alternatively\n",
+"# evaluator = load_evaluator(\"regex_match\", flags=re.IGNORECASE)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 8,
+"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'score': 1}"
+]
+},
+"execution_count": 8,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"evaluator.evaluate_strings(\n",
+" prediction=\"I LOVE testing\",\n",
+" reference=\"I love testing\",\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "82de8d3e-c829-440e-a582-3fb70cecad3b",
+"metadata": {},
+"outputs": [],
+"source": []
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3 (ipykernel)",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.11.2"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}
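One detail the notebook does not spell out: `flags` takes the standard `re` module flags, which are bit masks and therefore compose with bitwise OR. A small illustration (the combined-flags call is hypothetical, not from the notebook):

    import re
    from langchain.evaluation import RegexMatchStringEvaluator

    # re flags compose with "|": relax case AND let "." cross newlines.
    evaluator = RegexMatchStringEvaluator(flags=re.IGNORECASE | re.DOTALL)
    evaluator.evaluate_strings(
        prediction="i love\ntesting",
        reference="I LOVE.TESTING",
    )  # {'score': 1}: IGNORECASE relaxes case, DOTALL lets "." match the newline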
docs/extras/guides/evaluation/string/string_distance.ipynb:
@@ -6,6 +6,7 @@
 "metadata": {},
 "source": [
 "# String Distance\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/string_distance.ipynb)\n",
 "\n",
 "One of the simplest ways to compare an LLM or chain's string output against a reference label is by using string distance measurements such as Levenshtein or postfix distance. This can be used alongside approximate/fuzzy matching criteria for very basic unit testing.\n",
 "\n",
docs/extras/guides/evaluation/trajectory/custom.ipynb:
@@ -6,6 +6,7 @@
 "metadata": {},
 "source": [
 "# Custom Trajectory Evaluator\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/trajectory/custom.ipynb)\n",
 "\n",
 "You can make your own custom trajectory evaluators by inheriting from the [AgentTrajectoryEvaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator) class and overwriting the `_evaluate_agent_trajectory` (and `_aevaluate_agent_trajectory`) method.\n",
 "\n",
docs/extras/guides/evaluation/trajectory/trajectory_eval.ipynb:
@@ -8,6 +8,7 @@
 },
 "source": [
 "# Agent Trajectory\n",
+"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/trajectory/trajectory_eval.ipynb)\n",
 "\n",
 "Agents can be difficult to holistically evaluate due to the breadth of actions and generations they can make. We recommend using multiple evaluation techniques appropriate to your use case. One way to evaluate an agent is to look at the whole trajectory of actions taken along with their responses.\n",
 "\n",
@@ -67,8 +67,10 @@ from langchain.evaluation.embedding_distance import (
     EmbeddingDistanceEvalChain,
     PairwiseEmbeddingDistanceEvalChain,
 )
+from langchain.evaluation.exact_match.base import ExactMatchStringEvaluator
 from langchain.evaluation.loading import load_dataset, load_evaluator, load_evaluators
 from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
+from langchain.evaluation.regex_match.base import RegexMatchStringEvaluator
 from langchain.evaluation.schema import (
     AgentTrajectoryEvaluator,
     EvaluatorType,
@@ -83,6 +85,8 @@ from langchain.evaluation.string_distance import (

 __all__ = [
     "EvaluatorType",
+    "ExactMatchStringEvaluator",
+    "RegexMatchStringEvaluator",
     "PairwiseStringEvalChain",
     "LabeledPairwiseStringEvalChain",
     "QAEvalChain",
libs/langchain/langchain/evaluation/exact_match/base.py (new file, 97 lines)
@@ -0,0 +1,97 @@
+import string
+from typing import Any, List
+
+from langchain.evaluation.schema import StringEvaluator
+
+
+class ExactMatchStringEvaluator(StringEvaluator):
+    """Compute an exact match between the prediction and the reference.
+
+    Examples
+    ----------
+    >>> evaluator = ExactMatchStringEvaluator()
+    >>> evaluator.evaluate_strings(
+            prediction="Mindy is the CTO",
+            reference="Mindy is the CTO",
+        )  # This will return {'score': 1.0}
+
+    >>> evaluator.evaluate_strings(
+            prediction="Mindy is the CTO",
+            reference="Mindy is the CEO",
+        )  # This will return {'score': 0.0}
+    """
+
+    def __init__(
+        self,
+        *,
+        ignore_case: bool = False,
+        ignore_punctuation: bool = False,
+        ignore_numbers: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__()
+        self.ignore_case = ignore_case
+        self.ignore_punctuation = ignore_punctuation
+        self.ignore_numbers = ignore_numbers
+
+    @property
+    def requires_input(self) -> bool:
+        """
+        This evaluator does not require input.
+        """
+        return False
+
+    @property
+    def requires_reference(self) -> bool:
+        """
+        This evaluator requires a reference.
+        """
+        return True
+
+    @property
+    def input_keys(self) -> List[str]:
+        """
+        Get the input keys.
+
+        Returns:
+            List[str]: The input keys.
+        """
+        return ["reference", "prediction"]
+
+    @property
+    def evaluation_name(self) -> str:
+        """
+        Get the evaluation name.
+
+        Returns:
+            str: The evaluation name.
+        """
+        return "exact_match"
+
+    def _evaluate_strings(  # type: ignore[arg-type,override]
+        self,
+        *,
+        prediction: str,
+        reference: str,
+        **kwargs: Any,
+    ) -> dict:
+        """
+        Evaluate the exact match between the prediction and the reference.
+
+        Args:
+            prediction (str): The prediction string.
+            reference (Optional[str], optional): The reference string.
+
+        Returns:
+            dict: The evaluation results containing the score.
+        """
+        if self.ignore_case:
+            prediction = prediction.lower()
+            reference = reference.lower()
+        if self.ignore_punctuation:
+            prediction = prediction.translate(str.maketrans("", "", string.punctuation))
+            reference = reference.translate(str.maketrans("", "", string.punctuation))
+        if self.ignore_numbers:
+            prediction = prediction.translate(str.maketrans("", "", string.digits))
+            reference = reference.translate(str.maketrans("", "", string.digits))
+        return {"score": int(prediction == reference)}
@@ -14,11 +14,13 @@ from langchain.evaluation.embedding_distance.base import (
     EmbeddingDistanceEvalChain,
     PairwiseEmbeddingDistanceEvalChain,
 )
+from langchain.evaluation.exact_match.base import ExactMatchStringEvaluator
 from langchain.evaluation.parsing.base import (
     JsonEqualityEvaluator,
     JsonValidityEvaluator,
 )
 from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
+from langchain.evaluation.regex_match.base import RegexMatchStringEvaluator
 from langchain.evaluation.schema import EvaluatorType, LLMEvalChain, StringEvaluator
 from langchain.evaluation.string_distance.base import (
     PairwiseStringDistanceEvalChain,
@@ -78,6 +80,8 @@ _EVALUATOR_MAP: Dict[
     EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain,
     EvaluatorType.JSON_VALIDITY: JsonValidityEvaluator,
     EvaluatorType.JSON_EQUALITY: JsonEqualityEvaluator,
+    EvaluatorType.REGEX_MATCH: RegexMatchStringEvaluator,
+    EvaluatorType.EXACT_MATCH: ExactMatchStringEvaluator,
 }


@@ -111,7 +115,7 @@ def load_evaluator(
     if evaluator not in _EVALUATOR_MAP:
         raise ValueError(
             f"Unknown evaluator type: {evaluator}"
-            f"Valid types are: {list(_EVALUATOR_MAP.keys())}"
+            f"\nValid types are: {list(_EVALUATOR_MAP.keys())}"
         )
     evaluator_cls = _EVALUATOR_MAP[evaluator]
     if issubclass(evaluator_cls, LLMEvalChain):
libs/langchain/langchain/evaluation/regex_match/base.py (new file, 86 lines)
@@ -0,0 +1,86 @@
+import re
+from typing import Any, List
+
+from langchain.evaluation.schema import StringEvaluator
+
+
+class RegexMatchStringEvaluator(StringEvaluator):
+    """Compute a regex match between the prediction and the reference.
+
+    Examples
+    ----------
+    >>> evaluator = RegexMatchStringEvaluator(flags=re.IGNORECASE)
+    >>> evaluator.evaluate_strings(
+            prediction="Mindy is the CTO",
+            reference="^mindy.*cto$",
+        )  # This will return {'score': 1.0} due to the IGNORECASE flag
+
+    >>> evaluator = RegexMatchStringEvaluator()
+    >>> evaluator.evaluate_strings(
+            prediction="Mindy is the CTO",
+            reference="^Mike.*CEO$",
+        )  # This will return {'score': 0.0}
+
+    >>> evaluator.evaluate_strings(
+            prediction="Mindy is the CTO",
+            reference="^Mike.*CEO$|^Mindy.*CTO$",
+        )  # This will return {'score': 1.0} as the prediction matches the second pattern in the union
+    """  # noqa: E501
+
+    def __init__(self, *, flags: int = 0, **kwargs: Any):  # Default is no flags
+        super().__init__()
+        self.flags = flags
+
+    @property
+    def requires_input(self) -> bool:
+        """
+        This evaluator does not require input.
+        """
+        return False
+
+    @property
+    def requires_reference(self) -> bool:
+        """
+        This evaluator requires a reference.
+        """
+        return True
+
+    @property
+    def input_keys(self) -> List[str]:
+        """
+        Get the input keys.
+
+        Returns:
+            List[str]: The input keys.
+        """
+        return ["reference", "prediction"]
+
+    @property
+    def evaluation_name(self) -> str:
+        """
+        Get the evaluation name.
+
+        Returns:
+            str: The evaluation name.
+        """
+        return "regex_match"
+
+    def _evaluate_strings(  # type: ignore[arg-type,override]
+        self,
+        *,
+        prediction: str,
+        reference: str,
+        **kwargs: Any,
+    ) -> dict:
+        """
+        Evaluate the regex match between the prediction and the reference.
+
+        Args:
+            prediction (str): The prediction string.
+            reference (Optional[str], optional): The reference regex pattern.
+
+        Returns:
+            dict: The evaluation results containing the score.
+        """
+        match = re.match(reference, prediction, flags=self.flags)
+        return {"score": int(bool(match))}
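Note that `_evaluate_strings` uses `re.match`, which anchors the pattern at the start of the prediction; that is why the notebook patterns above wrap the date regex in `.*`. A quick contrast, using standard `re` behavior for clarity:

    import re

    pattern = "\\b\\d{4}-\\d{2}-\\d{2}\\b"
    prediction = "The delivery will be made on 2024-01-05"

    bool(re.match(pattern, prediction))   # False: the pattern does not match at the start
    bool(re.search(pattern, prediction))  # True: found anywhere in the string
    bool(re.match(".*" + pattern + ".*", prediction))  # True: the notebook's wrapping trick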
@@ -44,6 +44,10 @@ class EvaluatorType(str, Enum):
     custom set of criteria, with a reference label."""
     STRING_DISTANCE = "string_distance"
     """Compare predictions to a reference answer using string edit distances."""
+    EXACT_MATCH = "exact_match"
+    """Compare predictions to a reference answer using exact matching."""
+    REGEX_MATCH = "regex_match"
+    """Compare predictions to a reference answer using regular expressions."""
     PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
     """Compare predictions based on string edit distances."""
     EMBEDDING_DISTANCE = "embedding_distance"
@@ -261,4 +261,34 @@ class RunEvalConfig(BaseModel):

         evaluator_type: EvaluatorType = EvaluatorType.JSON_EQUALITY

+    class ExactMatch(EvalConfig):
+        """Configuration for an exact match string evaluator.
+
+        Parameters
+        ----------
+        ignore_case : bool
+            Whether to ignore case when comparing strings.
+        ignore_punctuation : bool
+            Whether to ignore punctuation when comparing strings.
+        ignore_numbers : bool
+            Whether to ignore numbers when comparing strings.
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.EXACT_MATCH
+        ignore_case: bool = False
+        ignore_punctuation: bool = False
+        ignore_numbers: bool = False
+
+    class RegexMatch(EvalConfig):
+        """Configuration for a regex match string evaluator.
+
+        Parameters
+        ----------
+        flags : int
+            The flags to pass to the regex. Example: re.IGNORECASE.
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.REGEX_MATCH
+        flags: int = 0
+
     # TODO: Trajectory
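These config classes plug into RunEvalConfig for LangSmith test runs. A hedged sketch, assuming the `evaluators` field accepts EvalConfig instances the same way the existing configs in this file do (dataset and run setup omitted):

    import re
    from langchain.smith import RunEvalConfig

    # Hypothetical wiring of the two new configs into a run configuration.
    eval_config = RunEvalConfig(
        evaluators=[
            RunEvalConfig.ExactMatch(ignore_case=True),
            RunEvalConfig.RegexMatch(flags=re.IGNORECASE),
        ]
    )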
(new test file, 49 lines)
@@ -0,0 +1,49 @@
+import pytest
+
+from langchain.evaluation import ExactMatchStringEvaluator
+
+
+@pytest.fixture
+def exact_match_string_evaluator() -> ExactMatchStringEvaluator:
+    """Create an ExactMatchStringEvaluator with default configuration."""
+    return ExactMatchStringEvaluator()
+
+
+@pytest.fixture
+def exact_match_string_evaluator_ignore_case() -> ExactMatchStringEvaluator:
+    """Create an ExactMatchStringEvaluator with ignore_case set to True."""
+    return ExactMatchStringEvaluator(ignore_case=True)
+
+
+def test_default_exact_matching(
+    exact_match_string_evaluator: ExactMatchStringEvaluator,
+) -> None:
+    prediction = "Mindy is the CTO"
+    reference = "Mindy is the CTO"
+    result = exact_match_string_evaluator.evaluate_strings(
+        prediction=prediction, reference=reference
+    )
+    assert result["score"] == 1.0
+
+    reference = "Mindy is the CEO"
+    result = exact_match_string_evaluator.evaluate_strings(
+        prediction=prediction, reference=reference
+    )
+    assert result["score"] == 0.0
+
+
+def test_exact_matching_with_ignore_case(
+    exact_match_string_evaluator_ignore_case: ExactMatchStringEvaluator,
+) -> None:
+    prediction = "Mindy is the CTO"
+    reference = "mindy is the cto"
+    result = exact_match_string_evaluator_ignore_case.evaluate_strings(
+        prediction=prediction, reference=reference
+    )
+    assert result["score"] == 1.0
+
+    reference = "mindy is the CEO"
+    result = exact_match_string_evaluator_ignore_case.evaluate_strings(
+        prediction=prediction, reference=reference
+    )
+    assert result["score"] == 0.0
(new test file, 45 lines)
@@ -0,0 +1,45 @@
+import re
+
+import pytest
+
+from langchain.evaluation import RegexMatchStringEvaluator
+
+
+@pytest.fixture
+def regex_match_string_evaluator() -> RegexMatchStringEvaluator:
+    """Create a RegexMatchStringEvaluator with default configuration."""
+    return RegexMatchStringEvaluator()
+
+
+@pytest.fixture
+def regex_match_string_evaluator_ignore_case() -> RegexMatchStringEvaluator:
+    """Create a RegexMatchStringEvaluator with IGNORECASE flag."""
+    return RegexMatchStringEvaluator(flags=re.IGNORECASE)
+
+
+def test_default_regex_matching(
+    regex_match_string_evaluator: RegexMatchStringEvaluator,
+) -> None:
+    prediction = "Mindy is the CTO"
+    reference = "^Mindy.*CTO$"
+    result = regex_match_string_evaluator.evaluate_strings(
+        prediction=prediction, reference=reference
+    )
+    assert result["score"] == 1.0
+
+    reference = "^Mike.*CEO$"
+    result = regex_match_string_evaluator.evaluate_strings(
+        prediction=prediction, reference=reference
+    )
+    assert result["score"] == 0.0
+
+
+def test_regex_matching_with_ignore_case(
+    regex_match_string_evaluator_ignore_case: RegexMatchStringEvaluator,
+) -> None:
+    prediction = "Mindy is the CTO"
+    reference = "^mindy.*cto$"
+    result = regex_match_string_evaluator_ignore_case.evaluate_strings(
+        prediction=prediction, reference=reference
+    )
+    assert result["score"] == 1.0
|
|||||||
EvaluatorType.LABELED_PAIRWISE_STRING,
|
EvaluatorType.LABELED_PAIRWISE_STRING,
|
||||||
],
|
],
|
||||||
[EvaluatorType.JSON_EQUALITY],
|
[EvaluatorType.JSON_EQUALITY],
|
||||||
|
[EvaluatorType.EXACT_MATCH, EvaluatorType.REGEX_MATCH],
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_eval_chain_requires_references(evaluator_types: List[EvaluatorType]) -> None:
|
def test_eval_chain_requires_references(evaluator_types: List[EvaluatorType]) -> None:
|
||||||
|