Make eval output parsers more robust (#10658)

Ran through a few hundred generations with some models to fix up the
parsers
This commit is contained in:
William FH 2023-09-17 19:24:20 -07:00 committed by GitHub
parent 3992c1ae9b
commit a3e5507faa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 94 additions and 13 deletions

View File

@ -2,7 +2,8 @@
from __future__ import annotations from __future__ import annotations
import re import re
from typing import Any, List, Optional, Sequence import string
from typing import Any, List, Optional, Sequence, Tuple
from langchain.callbacks.manager import Callbacks from langchain.callbacks.manager import Callbacks
from langchain.chains.llm import LLMChain from langchain.chains.llm import LLMChain
@ -14,13 +15,32 @@ from langchain.schema import RUN_KEY
from langchain.schema.language_model import BaseLanguageModel from langchain.schema.language_model import BaseLanguageModel
def _get_score(verdict: str) -> Optional[int]: def _get_score(text: str) -> Optional[Tuple[str, int]]:
match = re.search(r"(?i)(?:grade:\s*)?(correct|incorrect)", verdict) match = re.search(r"grade:\s*(correct|incorrect)", text.strip(), re.IGNORECASE)
if match: if match:
if match.group(1).upper() == "CORRECT": if match.group(1).upper() == "CORRECT":
return 1 return "CORRECT", 1
elif match.group(1).upper() == "INCORRECT": elif match.group(1).upper() == "INCORRECT":
return 0 return "INCORRECT", 0
try:
first_word = (
text.strip().split()[0].translate(str.maketrans("", "", string.punctuation))
)
if first_word.upper() == "CORRECT":
return "CORRECT", 1
elif first_word.upper() == "INCORRECT":
return "INCORRECT", 0
last_word = (
text.strip()
.split()[-1]
.translate(str.maketrans("", "", string.punctuation))
)
if last_word.upper() == "CORRECT":
return "CORRECT", 1
elif last_word.upper() == "INCORRECT":
return "INCORRECT", 0
except IndexError:
pass
return None return None
@ -33,17 +53,15 @@ def _parse_string_eval_output(text: str) -> dict:
Returns: Returns:
Any: The parsed output. Any: The parsed output.
""" """
splits = text.strip().rsplit("\n", maxsplit=1) reasoning = text.strip()
if len(splits) == 1: parsed_scores = _get_score(reasoning)
verdict = splits[0] if parsed_scores is None:
reasoning = None value, score = None, None
else: else:
reasoning, verdict = splits value, score = parsed_scores
reasoning = reasoning.strip()
score = _get_score(verdict)
return { return {
"reasoning": reasoning, "reasoning": reasoning,
"value": verdict, "value": value,
"score": score, "score": score,
} }

View File

@ -9,6 +9,7 @@ from langchain.evaluation.qa.eval_chain import (
ContextQAEvalChain, ContextQAEvalChain,
CotQAEvalChain, CotQAEvalChain,
QAEvalChain, QAEvalChain,
_parse_string_eval_output,
) )
from langchain.evaluation.schema import StringEvaluator from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.llms.fake_llm import FakeLLM from tests.unit_tests.llms.fake_llm import FakeLLM
@ -67,3 +68,65 @@ def test_returns_expected_results(
prediction="my prediction", reference="my reference", input="my input" prediction="my prediction", reference="my reference", input="my input"
) )
assert results["score"] == 1 assert results["score"] == 1
@pytest.mark.parametrize(
    "output,expected",
    [
        # Explicit grade first, followed by trailing prompt-style text.
        (
            """ GRADE: CORRECT
QUESTION: according to the passage, what is the main reason that the author wrote this passage?
STUDENT ANSWER: to explain the importance of washing your hands
TRUE ANSWER: to explain the importance of washing your hands
GRADE:""",  # noqa: E501
            {
                "value": "CORRECT",
                "score": 1,
            },
        ),
        # Step-by-step reasoning that mentions "grade incorrect" (no colon)
        # before the real "GRADE: CORRECT" marker on the final line.
        (
            """ Here is my step-by-step reasoning to grade the student's answer:
1. The question asks who founded the Roanoke settlement.
2. The context states that the grade incorrect answer is Walter Raleigh.
3. The student's answer is "Sir Walter Raleigh".
4. The student's answer matches the context, which states the answer is Walter Raleigh.
5. The addition of "Sir" in the student's answer does not contradict the context. It provides extra detail about Walter Raleigh's title, but the core answer of Walter Raleigh is still correct.
6. Therefore, the student's answer contains the same factual information as the true answer, so it should be graded as correct.
GRADE: CORRECT""",  # noqa: E501
            {
                "value": "CORRECT",
                "score": 1,
            },
        ),
        # Bare verdict as the first word, with an empty trailing "GRADE:".
        (
            """ CORRECT
QUESTION: who was the first president of the united states?
STUDENT ANSWER: George Washington
TRUE ANSWER: George Washington was the first president of the United States.
GRADE:""",
            {
                "value": "CORRECT",
                "score": 1,
            },
        ),
        # Verdict only as the last word, with trailing punctuation.
        (
            """The student's answer is "Regent's Park," which matches the correct answer given in the context. Therefore, the student's answer is CORRECT.""",  # noqa: E501
            {
                "value": "CORRECT",
                "score": 1,
            },
        ),
        # Explicit negative grade.
        (
            "GRADE: INCORRECT",
            {
                "value": "INCORRECT",
                "score": 0,
            },
        ),
        # No recognisable verdict anywhere in the text.
        (
            "I am not sure about this answer.",
            {
                "value": None,
                "score": None,
            },
        ),
    ],
)
def test_qa_output_parser(output: str, expected: dict) -> None:
    """Check that the parser extracts the verdict and keeps the full reasoning."""
    # Build a fresh dict rather than mutating the shared parametrized one.
    expected_output = {**expected, "reasoning": output.strip()}
    assert _parse_string_eval_output(output) == expected_output