Mirror of https://github.com/hwchase17/langchain.git, synced 2026-02-03 15:55:44 +00:00.
Compare commits: 13 commits, `langchain-...` → `cc/depreca...`
| Author | SHA1 | Date |
|---|---|---|
| | 39d1759b66 | |
| | 7005c4fe5b | |
| | 1eace6523d | |
| | bacf4c58ef | |
| | b71b5bd3d7 | |
| | 31364de10c | |
| | 8a70754dfe | |
| | 9bd4459f9a | |
| | 50c1ecc5f1 | |
| | f51a9024ae | |
| | 15254d1027 | |
| | d38c9c7026 | |
| | d249318f94 | |
@@ -27,11 +27,11 @@ from langchain_core.callbacks.manager import (
 from langchain_core.exceptions import OutputParserException
 from langchain_core.language_models import BaseLanguageModel
 from langchain_core.language_models.chat_models import BaseChatModel
-from langchain_core.output_parsers import BaseOutputParser
+from langchain_core.output_parsers import BaseOutputParser, StrOutputParser
 from langchain_core.pydantic_v1 import Field
+from langchain_core.runnables import Runnable
 from langchain_core.tools import BaseTool

-from langchain.chains.llm import LLMChain
 from langchain.evaluation.agents.trajectory_eval_prompt import (
     EVAL_CHAT_PROMPT,
     TOOL_FREE_EVAL_CHAT_PROMPT,
@@ -147,7 +147,7 @@ class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):

     agent_tools: Optional[List[BaseTool]] = None
     """A list of tools available to the agent."""
-    eval_chain: LLMChain
+    eval_chain: Runnable
     """The language model chain used for evaluation."""
     output_parser: TrajectoryOutputParser = Field(
         default_factory=TrajectoryOutputParser
@@ -253,7 +253,7 @@ The following is the expected answer. Use this to measure correctness:
             prompt = EVAL_CHAT_PROMPT
         else:
             prompt = TOOL_FREE_EVAL_CHAT_PROMPT
-        eval_chain = LLMChain(llm=llm, prompt=prompt)
+        eval_chain = prompt | llm | StrOutputParser()
         return cls(
             agent_tools=agent_tools,  # type: ignore[arg-type]
             eval_chain=eval_chain,
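For orientation, a minimal sketch of what the new `from_llm` construction amounts to: the LLMChain wrapper is replaced by an LCEL pipeline typed as a plain `Runnable`. The model choice below is a placeholder, not part of the PR.

```python
# Minimal sketch of the new construction path, not taken verbatim from the PR.
# `ChatOpenAI` and its model name are assumptions for illustration only.
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

from langchain.evaluation.agents.trajectory_eval_prompt import EVAL_CHAT_PROMPT

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# The old code wrapped these pieces in LLMChain(llm=llm, prompt=prompt);
# the new code composes them directly into a Runnable.
eval_chain = EVAL_CHAT_PROMPT | llm | StrOutputParser()
```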
@@ -303,8 +303,8 @@ The following is the expected answer. Use this to measure correctness:
         if self.agent_tools:
             chain_input["tool_descriptions"] = self._tools_description
         _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
-        raw_output = self.eval_chain.run(
-            chain_input, callbacks=_run_manager.get_child()
+        raw_output = self.eval_chain.invoke(
+            chain_input, {"callbacks": _run_manager.get_child()}
         )
         return cast(dict, self.output_parser.parse(raw_output))
@@ -327,8 +327,8 @@ The following is the expected answer. Use this to measure correctness:
         if self.agent_tools:
             chain_input["tool_descriptions"] = self._tools_description
         _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
-        raw_output = await self.eval_chain.arun(
-            chain_input, callbacks=_run_manager.get_child()
+        raw_output = await self.eval_chain.ainvoke(
+            chain_input, {"callbacks": _run_manager.get_child()}
         )
         return cast(dict, self.output_parser.parse(raw_output))
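A hedged sketch of the call-site change in the two hunks above: the Runnable takes its callbacks as part of the config argument rather than as a `callbacks=` keyword to `.run()`. The tiny chain here is a stand-in so the snippet runs on its own; the real code uses the trajectory evaluation prompt.

```python
from langchain_core.callbacks.manager import CallbackManagerForChainRun
from langchain_core.language_models import FakeListChatModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Stand-in chain; the evaluator's real chain is prompt | llm | StrOutputParser().
eval_chain = (
    ChatPromptTemplate.from_template("Evaluate: {question}")
    | FakeListChatModel(responses=["Score: 5"])
    | StrOutputParser()
)

# Same pattern as the diff: a (possibly no-op) run manager hands its child
# callbacks to the sub-chain through the config dict.
_run_manager = CallbackManagerForChainRun.get_noop_manager()
raw_output = eval_chain.invoke(
    {"question": "What is 2 + 2?"}, {"callbacks": _run_manager.get_child()}
)
print(raw_output)  # -> "Score: 5"
```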
@@ -6,14 +6,15 @@ import logging
 import re
 from typing import Any, Dict, List, Optional, Union

-from langchain_core.callbacks.manager import Callbacks
+from langchain_core.callbacks.manager import CallbackManagerForChainRun, Callbacks
 from langchain_core.language_models import BaseLanguageModel
 from langchain_core.output_parsers import BaseOutputParser
 from langchain_core.prompts.base import BasePromptTemplate
 from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.pydantic_v1 import Field
+from langchain_core.runnables import RunnableConfig

 from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
 from langchain.chains.llm import LLMChain
 from langchain.evaluation.comparison.prompt import (
     COMPARISON_TEMPLATE,
     COMPARISON_TEMPLATE_WITH_REFERENCE,
@@ -151,7 +152,7 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
         }


-class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
+class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain):
    """A chain for comparing two outputs, such as the outputs
    of two models, prompts, or outputs of a single model on similar inputs.

@@ -186,6 +187,10 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
     output_parser: BaseOutputParser = Field(
         default_factory=PairwiseStringResultOutputParser
     )
+    llm: BaseLanguageModel
+    """The language model to use for scoring."""
+    prompt: BasePromptTemplate
+    """The prompt to use for scoring."""

     @classmethod
     def is_lc_serializable(cls) -> bool:
@@ -228,6 +233,22 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
                 " (EvaluatorType.LABELED_PAIRWISE_STRING) instead."
             )

+    @property
+    def input_keys(self) -> List[str]:
+        """Will be whatever keys the prompt expects.
+
+        :meta private:
+        """
+        return self.prompt.input_variables
+
+    @property
+    def output_keys(self) -> List[str]:
+        """Will always return text key.
+
+        :meta private:
+        """
+        return [self.output_key]
+
     @classmethod
     def from_llm(
         cls,
@@ -305,6 +326,19 @@ Performance may be significantly worse with other models."
             parsed[RUN_KEY] = result[RUN_KEY]
         return parsed

+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        if run_manager:
+            config = RunnableConfig(callbacks=run_manager.get_child())
+        else:
+            config = None
+        chain = self.prompt | self.llm | self.output_parser
+        response = chain.invoke(inputs, config=config)
+        return {self.output_key: response}
+
     def _evaluate_string_pairs(
         self,
         *,
@@ -338,13 +372,17 @@ Performance may be significantly worse with other models."

         """
         input_ = self._prepare_input(prediction, prediction_b, input, reference)
-        result = self(
-            inputs=input_,
-            callbacks=callbacks,
-            tags=tags,
-            metadata=metadata,
-            include_run_info=include_run_info,
-        )
+
+        # prep config
+        config: RunnableConfig = {}
+        if callbacks is not None:
+            config["callbacks"] = callbacks
+        if tags is not None:
+            config["tags"] = tags
+        if metadata is not None:
+            config["metadata"] = metadata
+
+        result = self.invoke(input_, config=config, include_run_info=include_run_info)
         return self._prepare_output(result)

     async def _aevaluate_string_pairs(
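The "prep config" block introduced above repeats in every evaluator in this PR: optional `callbacks`, `tags`, and `metadata` keyword arguments are folded into a single `RunnableConfig` before calling `Chain.invoke`. A hedged, self-contained sketch of that pattern (the helper name is mine, not the PR's):

```python
from typing import Any, Dict, List, Optional

from langchain_core.callbacks.manager import Callbacks
from langchain_core.runnables import RunnableConfig


def _prep_config(
    callbacks: Callbacks = None,
    tags: Optional[List[str]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> RunnableConfig:
    """Collect only the options that were actually supplied."""
    config: RunnableConfig = {}
    if callbacks is not None:
        config["callbacks"] = callbacks
    if tags is not None:
        config["tags"] = tags
    if metadata is not None:
        config["metadata"] = metadata
    return config


# e.g. result = chain.invoke(input_, config=_prep_config(tags=["eval"]))
print(_prep_config(tags=["eval"]))
```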
@@ -380,13 +418,20 @@ Performance may be significantly worse with other models."

         """
         input_ = self._prepare_input(prediction, prediction_b, input, reference)
-        result = await self.acall(
-            inputs=input_,
-            callbacks=callbacks,
-            tags=tags,
-            metadata=metadata,
-            include_run_info=include_run_info,
+
+        # prep config
+        config: RunnableConfig = {}
+        if callbacks is not None:
+            config["callbacks"] = callbacks
+        if tags is not None:
+            config["tags"] = tags
+        if metadata is not None:
+            config["metadata"] = metadata
+
+        result = await self.ainvoke(
+            input_, config=config, include_run_info=include_run_info
         )

         return self._prepare_output(result)
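A usage sketch for the refactored PairwiseStringEvalChain. The model choice and the example strings are placeholders; `from_llm` and `evaluate_string_pairs` are the evaluator's existing public interface, unchanged by this diff.

```python
from langchain_openai import ChatOpenAI

from langchain.evaluation.comparison.eval_chain import PairwiseStringEvalChain

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
evaluator = PairwiseStringEvalChain.from_llm(llm=llm)

result = evaluator.evaluate_string_pairs(
    prediction="Paris is the capital of France.",
    prediction_b="France's capital is Marseille.",
    input="What is the capital of France?",
)
print(result)  # typically {"reasoning": "...", "value": "A", "score": 1}
```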
@@ -4,14 +4,14 @@ import re
 from enum import Enum
 from typing import Any, Dict, List, Mapping, Optional, Union

-from langchain_core.callbacks.manager import Callbacks
+from langchain_core.callbacks.manager import CallbackManagerForChainRun, Callbacks
 from langchain_core.language_models import BaseLanguageModel
 from langchain_core.output_parsers import BaseOutputParser
 from langchain_core.prompts import BasePromptTemplate
 from langchain_core.pydantic_v1 import Field
+from langchain_core.runnables import RunnableConfig

 from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
-from langchain.chains.llm import LLMChain
 from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
 from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
 from langchain.schema import RUN_KEY
@@ -164,7 +164,7 @@ def resolve_criteria(
     return criteria_


-class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
+class CriteriaEvalChain(StringEvaluator, LLMEvalChain):
    """LLM Chain for evaluating runs against criteria.

    Parameters
@@ -184,7 +184,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
        reference labels in the prompt. Otherwise, the `PROMPT` template will be
        used, which is a reference-free prompt.
    **kwargs : Any
-       Additional keyword arguments to pass to the `LLMChain` constructor.
+       Additional keyword arguments to pass to the `Chain` constructor.

    Returns
    -------
@@ -231,6 +231,10 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
     criterion_name: str
     """The name of the criterion being evaluated."""
     output_key: str = "results"  #: :meta private:
+    llm: BaseLanguageModel
+    """The language model to use for scoring."""
+    prompt: BasePromptTemplate
+    """The prompt to use for scoring."""

     @classmethod
     def is_lc_serializable(cls) -> bool:
@@ -267,6 +271,22 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
                 "\nTo use references, use the labeled_criteria instead."
             )

+    @property
+    def input_keys(self) -> List[str]:
+        """Will be whatever keys the prompt expects.
+
+        :meta private:
+        """
+        return self.prompt.input_variables
+
+    @property
+    def output_keys(self) -> List[str]:
+        """Will always return text key.
+
+        :meta private:
+        """
+        return [self.output_key]
+
     @classmethod
     def _resolve_prompt(
         cls, prompt: Optional[BasePromptTemplate] = None
@@ -332,7 +352,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
            The prompt template to use for generating prompts. If not provided,
            a default prompt template will be used.
        **kwargs : Any
-           Additional keyword arguments to pass to the `LLMChain`
+           Additional keyword arguments to pass to the `Chain`
            constructor.

        Returns
@@ -396,6 +416,19 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
             parsed[RUN_KEY] = result[RUN_KEY]
         return parsed

+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        if run_manager:
+            config = RunnableConfig(callbacks=run_manager.get_child())
+        else:
+            config = None
+        chain = self.prompt | self.llm | self.output_parser
+        response = chain.invoke(inputs, config=config)
+        return {self.output_key: response}
+
     def _evaluate_strings(
         self,
         *,
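An illustrative aside on why `prompt | self.llm | self.output_parser` can feed `_call` directly: the configured output parser is a `BaseOutputParser` that turns the model text into the dict stored under the chain's output key. The parsing rule below is a made-up stand-in, not CriteriaResultOutputParser's real logic.

```python
from typing import Dict

from langchain_core.output_parsers import BaseOutputParser


class VerdictOutputParser(BaseOutputParser[Dict[str, str]]):
    """Split a 'reasoning ... verdict' style completion into fields."""

    def parse(self, text: str) -> Dict[str, str]:
        # Hypothetical rule: everything before the last line is reasoning,
        # the last line is the verdict.
        reasoning, _, verdict = text.strip().rpartition("\n")
        return {"reasoning": reasoning.strip(), "value": verdict.strip()}


parser = VerdictOutputParser()
print(parser.parse("The answer is concise and correct.\nY"))
```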
@@ -420,7 +453,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
        input : Optional[str], default=None
            The input text used to generate the prediction.
        **kwargs : Any
-           Additional keyword arguments to pass to the `LLMChain` `__call__`
+           Additional keyword arguments to pass to the `Chain` `invoke`
            method.

        Returns
@@ -442,13 +475,17 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
             )
         """
         input_ = self._get_eval_input(prediction, reference, input)
-        result = self(
-            input_,
-            callbacks=callbacks,
-            tags=tags,
-            metadata=metadata,
-            include_run_info=include_run_info,
-        )
+
+        # prep config
+        config: RunnableConfig = {}
+        if callbacks is not None:
+            config["callbacks"] = callbacks
+        if tags is not None:
+            config["tags"] = tags
+        if metadata is not None:
+            config["metadata"] = metadata
+
+        result = self.invoke(input_, config=config, include_run_info=include_run_info)
         return self._prepare_output(result)

     async def _aevaluate_strings(
@@ -475,7 +512,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
        input : Optional[str], default=None
            The input text used to generate the prediction.
        **kwargs : Any
-           Additional keyword arguments to pass to the `LLMChain` `acall`
+           Additional keyword arguments to pass to the `Chain` `acall`
            method.

        Returns
@@ -497,12 +534,18 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
             )
         """
         input_ = self._get_eval_input(prediction, reference, input)
-        result = await self.acall(
-            input_,
-            callbacks=callbacks,
-            tags=tags,
-            metadata=metadata,
-            include_run_info=include_run_info,
+
+        # prep config
+        config: RunnableConfig = {}
+        if callbacks is not None:
+            config["callbacks"] = callbacks
+        if tags is not None:
+            config["tags"] = tags
+        if metadata is not None:
+            config["metadata"] = metadata
+
+        result = await self.ainvoke(
+            input_, config=config, include_run_info=include_run_info
         )
         return self._prepare_output(result)
@@ -556,7 +599,7 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
            The prompt template to use for generating prompts. If not provided,
            a default prompt will be used.
        **kwargs : Any
-           Additional keyword arguments to pass to the `LLMChain`
+           Additional keyword arguments to pass to the `Chain`
            constructor.

        Returns
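A hedged usage sketch for the refactored CriteriaEvalChain. The model, criterion, and sample strings are placeholders; `from_llm` and `evaluate_strings` are the existing StringEvaluator interface, not something new in this diff.

```python
from langchain_openai import ChatOpenAI

from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
evaluator = CriteriaEvalChain.from_llm(llm=llm, criteria="conciseness")

result = evaluator.evaluate_strings(
    prediction="The capital of France is Paris, which is also its largest city.",
    input="What is the capital of France?",
)
print(result)  # typically {"reasoning": "...", "value": "Y", "score": 1}
```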
@@ -4,13 +4,15 @@ from __future__ import annotations

 import re
 import string
-from typing import Any, List, Optional, Sequence, Tuple
+from typing import Any, Dict, List, Optional, Sequence, Tuple

-from langchain_core.callbacks.manager import Callbacks
+from langchain_core.callbacks.manager import CallbackManagerForChainRun, Callbacks
 from langchain_core.language_models import BaseLanguageModel
+from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import PromptTemplate
+from langchain_core.prompts.base import BasePromptTemplate
+from langchain_core.runnables import RunnableConfig

-from langchain.chains.llm import LLMChain
 from langchain.evaluation.qa.eval_prompt import CONTEXT_PROMPT, COT_PROMPT, PROMPT
 from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
 from langchain.schema import RUN_KEY
@@ -67,10 +69,14 @@ def _parse_string_eval_output(text: str) -> dict:
     }


-class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
+class QAEvalChain(StringEvaluator, LLMEvalChain):
     """LLM Chain for evaluating question answering."""

     output_key: str = "results"  #: :meta private:
+    llm: BaseLanguageModel
+    """The language model to use for scoring."""
+    prompt: BasePromptTemplate
+    """The prompt to use for scoring."""

     class Config:
         extra = "ignore"
@@ -91,6 +97,35 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
     def requires_input(self) -> bool:
         return True

+    @property
+    def input_keys(self) -> List[str]:
+        """Will be whatever keys the prompt expects.
+
+        :meta private:
+        """
+        return self.prompt.input_variables
+
+    @property
+    def output_keys(self) -> List[str]:
+        """Will always return text key.
+
+        :meta private:
+        """
+        return [self.output_key]
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        if run_manager:
+            config = RunnableConfig(callbacks=run_manager.get_child())
+        else:
+            config = None
+        chain = self.prompt | self.llm | StrOutputParser()
+        response = chain.invoke(inputs, config=config)
+        return {self.output_key: response}
+
     @classmethod
     def from_llm(
         cls,
@@ -141,8 +176,14 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
             }
             for i, example in enumerate(examples)
         ]
+        if callbacks:
+            config = RunnableConfig(callbacks=callbacks)
+        else:
+            config = None
+        outputs = self.batch(inputs, config=config)

-        return self.apply(inputs, callbacks=callbacks)
+        # Subset to output key only
+        return [{self.output_key: output[self.output_key]} for output in outputs]

     def _prepare_output(self, result: dict) -> dict:
         parsed_result = _parse_string_eval_output(result[self.output_key])
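A small sketch of the `Chain.batch` call that replaces `LLMChain.apply` in the hunk above: a list of prompt-variable dicts goes in, one result dict per item comes back (which also echoes the inputs, hence the subsetting), and callbacks ride along in the `RunnableConfig`. The handler and `eval_chain` below are assumed for illustration.

```python
from langchain_core.callbacks import StdOutCallbackHandler
from langchain_core.runnables import RunnableConfig

inputs = [
    {"query": "What is 2 + 2?", "answer": "4", "result": "4"},
    {"query": "Capital of France?", "answer": "Paris", "result": "Lyon"},
]
config = RunnableConfig(callbacks=[StdOutCallbackHandler()])

# `eval_chain` is assumed to be a QAEvalChain built with from_llm(...).
outputs = eval_chain.batch(inputs, config=config)

# Keep only the graded output key, mirroring the diff's "subset" step.
results = [{eval_chain.output_key: o[eval_chain.output_key]} for o in outputs]
```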
@@ -174,13 +215,17 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
         Returns:
             dict: The evaluation results containing the score or value.
         """
-        result = self(
+        if callbacks:
+            config = RunnableConfig(callbacks=callbacks)
+        else:
+            config = None
+        result = self.invoke(
             {
                 "query": input,
                 "answer": reference,
                 "result": prediction,
             },
-            callbacks=callbacks,
+            config=config,
             include_run_info=include_run_info,
         )
         return self._prepare_output(result)
@@ -195,17 +240,31 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
         include_run_info: bool = False,
         **kwargs: Any,
     ) -> dict:
-        result = await self.acall(
-            inputs={"query": input, "answer": reference, "result": prediction},
-            callbacks=callbacks,
+        if callbacks:
+            config = RunnableConfig(callbacks=callbacks)
+        else:
+            config = None
+        result = await self.ainvoke(
+            {
+                "query": input,
+                "answer": reference,
+                "result": prediction,
+            },
+            config=config,
             include_run_info=include_run_info,
         )
         return self._prepare_output(result)


-class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
+class ContextQAEvalChain(StringEvaluator, LLMEvalChain):
     """LLM Chain for evaluating QA w/o GT based on context"""

     output_key: str = "text"  #: :meta private:
+    llm: BaseLanguageModel
+    """The language model to use for scoring."""
+    prompt: BasePromptTemplate
+    """The prompt to use for scoring."""

     @classmethod
     def is_lc_serializable(cls) -> bool:
         return False
@@ -220,6 +279,22 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
         """Whether the chain requires an input string."""
         return True

+    @property
+    def input_keys(self) -> List[str]:
+        """Will be whatever keys the prompt expects.
+
+        :meta private:
+        """
+        return self.prompt.input_variables
+
+    @property
+    def output_keys(self) -> List[str]:
+        """Will always return text key.
+
+        :meta private:
+        """
+        return [self.output_key]
+
     class Config:
         extra = "ignore"

@@ -236,6 +311,19 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
     def evaluation_name(self) -> str:
         return "Contextual Accuracy"

+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        if run_manager:
+            config = RunnableConfig(callbacks=run_manager.get_child())
+        else:
+            config = None
+        chain = self.prompt | self.llm | StrOutputParser()
+        response = chain.invoke(inputs, config=config)
+        return {self.output_key: response}
+
     @classmethod
     def from_llm(
         cls,
@@ -281,8 +369,13 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
             }
             for i, example in enumerate(examples)
         ]
+        if callbacks:
+            config = RunnableConfig(callbacks=callbacks)
+        else:
+            config = None
+        outputs = self.batch(inputs, config=config)

-        return self.apply(inputs, callbacks=callbacks)
+        return [{self.output_key: output[self.output_key]} for output in outputs]

     def _prepare_output(self, result: dict) -> dict:
         parsed_result = _parse_string_eval_output(result[self.output_key])
@@ -300,13 +393,17 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
         include_run_info: bool = False,
         **kwargs: Any,
     ) -> dict:
-        result = self(
+        if callbacks:
+            config = RunnableConfig(callbacks=callbacks)
+        else:
+            config = None
+        result = self.invoke(
             {
                 "query": input,
                 "context": reference,
                 "result": prediction,
             },
-            callbacks=callbacks,
+            config=config,
             include_run_info=include_run_info,
         )
         return self._prepare_output(result)
@@ -321,9 +418,17 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
         include_run_info: bool = False,
         **kwargs: Any,
     ) -> dict:
-        result = await self.acall(
-            inputs={"query": input, "context": reference, "result": prediction},
-            callbacks=callbacks,
+        if callbacks:
+            config = RunnableConfig(callbacks=callbacks)
+        else:
+            config = None
+        result = await self.ainvoke(
+            {
+                "query": input,
+                "context": reference,
+                "result": prediction,
+            },
+            config=config,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)
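A hedged end-to-end sketch for the refactored QAEvalChain. The grading model, dataset, and predictions are placeholders; `evaluate()` keeps its existing signature and default keys, it just runs through `batch()`/`invoke()` internally now.

```python
from langchain_openai import ChatOpenAI

from langchain.evaluation.qa.eval_chain import QAEvalChain

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
eval_chain = QAEvalChain.from_llm(llm)

examples = [{"query": "What is the capital of France?", "answer": "Paris"}]
predictions = [{"result": "The capital of France is Paris."}]

graded = eval_chain.evaluate(examples, predictions)
print(graded)  # e.g. [{"results": "GRADE: CORRECT"}]
```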
@@ -4,6 +4,7 @@ from __future__ import annotations

 from typing import Any

+from langchain_core._api import deprecated
 from langchain_core.language_models import BaseLanguageModel
 from langchain_core.output_parsers import BaseLLMOutputParser
 from langchain_core.pydantic_v1 import Field
@@ -17,8 +18,44 @@ _QA_OUTPUT_PARSER = RegexParser(
 )


+@deprecated(
+    since="0.2.13",
+    message=(
+        "This class is deprecated and will be removed in langchain 1.0. "
+        "See API reference for replacement: "
+        "https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.generate_chain.QAGenerateChain.html"  # noqa: E501
+    ),
+    removal="1.0",
+)
 class QAGenerateChain(LLMChain):
-    """LLM Chain for generating examples for question answering."""
+    """LLM Chain for generating examples for question answering.
+
+    Note: this class is deprecated. See below for a replacement implementation
+    that leverages LLM tool calling features.
+
+    .. code-block:: python
+
+        from langchain_core.prompts import ChatPromptTemplate
+        from langchain_openai import ChatOpenAI
+        from typing_extensions import TypedDict
+
+        template = \"\"\"You are a teacher coming up with questions to ask on a quiz.
+        Given the following document, please generate a question and answer based on that document.
+
+        These questions should be detailed and be based explicitly on information in the document.
+        \"\"\"
+
+        prompt = ChatPromptTemplate.from_template(template)
+
+        class QuestionAndAnswer(TypedDict):
+            \"\"\"Question and answer based on document.\"\"\"
+            question: str
+            answer: str
+
+        llm = ChatOpenAI(model="gpt-4o-mini").with_structured_output(QuestionAndAnswer)
+        llm.invoke("...")
+
+    """  # noqa: E501

     output_parser: BaseLLMOutputParser = Field(default=_QA_OUTPUT_PARSER)
     output_key: str = "qa_pairs"
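A follow-up sketch for the replacement shown in the deprecation docstring above: wiring the quiz-writing prompt to the structured-output model as one runnable. The `{doc}` input variable and the sample document are assumptions here (the docstring's template appears truncated in this view), not part of the PR.

```python
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from typing_extensions import TypedDict


class QuestionAndAnswer(TypedDict):
    """Question and answer based on document."""

    question: str
    answer: str


# Hypothetical template with an assumed {doc} placeholder.
prompt = ChatPromptTemplate.from_template(
    "You are a teacher coming up with questions to ask on a quiz.\n"
    "Given the following document, please generate a question and answer "
    "based on that document.\n\n{doc}"
)
llm = ChatOpenAI(model="gpt-4o-mini").with_structured_output(QuestionAndAnswer)

qa_generate = prompt | llm
pair = qa_generate.invoke({"doc": "The mitochondrion is the powerhouse of the cell."})
print(pair["question"], pair["answer"], sep="\n")
```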
@@ -6,14 +6,16 @@ import logging
 import re
 from typing import Any, Dict, List, Optional, Union

+from langchain_core.callbacks import CallbackManagerForChainRun
 from langchain_core.callbacks.manager import Callbacks
 from langchain_core.language_models import BaseLanguageModel
 from langchain_core.output_parsers import BaseOutputParser
 from langchain_core.prompts.base import BasePromptTemplate
 from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.pydantic_v1 import Field
+from langchain_core.runnables import RunnableConfig

 from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
 from langchain.chains.llm import LLMChain
 from langchain.evaluation.criteria.eval_chain import (
     CRITERIA_TYPE,
     Criteria,
@@ -144,7 +146,7 @@ class ScoreStringResultOutputParser(BaseOutputParser[dict]):
         }


-class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
+class ScoreStringEvalChain(StringEvaluator, LLMEvalChain):
     """A chain for scoring on a scale of 1-10 the output of a model.

     Attributes:
@@ -178,10 +180,43 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
     """The value to normalize the score by, if specified."""
     criterion_name: str
     """The name of the criterion being evaluated."""
+    llm: BaseLanguageModel
+    """The language model to use for scoring."""
+    prompt: BasePromptTemplate
+    """The prompt to use for scoring."""

     class Config:
         extra = "ignore"

+    @property
+    def input_keys(self) -> List[str]:
+        """Will be whatever keys the prompt expects.
+
+        :meta private:
+        """
+        return self.prompt.input_variables
+
+    @property
+    def output_keys(self) -> List[str]:
+        """Will always return text key.
+
+        :meta private:
+        """
+        return [self.output_key]
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        if run_manager:
+            config = RunnableConfig(callbacks=run_manager.get_child())
+        else:
+            config = None
+        chain = self.prompt | self.llm | self.output_parser
+        response = chain.invoke(inputs, config=config)
+        return {self.output_key: response}
+
     @classmethod
     def is_lc_serializable(cls) -> bool:
         return False
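A hedged usage sketch for ScoreStringEvalChain after the refactor; the model and example text are placeholders, and the score in the comment reflects the evaluator's documented 1-10 scale rather than output captured from this change.

```python
from langchain_openai import ChatOpenAI

from langchain.evaluation.scoring.eval_chain import ScoreStringEvalChain

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
scorer = ScoreStringEvalChain.from_llm(llm=llm)

result = scorer.evaluate_strings(
    prediction="You can cancel the subscription from the billing page.",
    input="How do I cancel my subscription?",
)
print(result)  # e.g. {"reasoning": "...", "score": 8}
```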
@@ -348,13 +383,17 @@ Performance may be significantly worse with other models."

         """
         input_ = self._prepare_input(prediction, input, reference)
-        result = self(
-            inputs=input_,
-            callbacks=callbacks,
-            tags=tags,
-            metadata=metadata,
-            include_run_info=include_run_info,
-        )
+
+        # prep config
+        config: RunnableConfig = {}
+        if callbacks is not None:
+            config["callbacks"] = callbacks
+        if tags is not None:
+            config["tags"] = tags
+        if metadata is not None:
+            config["metadata"] = metadata
+
+        result = self.invoke(input_, config=config, include_run_info=include_run_info)
         return self._prepare_output(result)

     async def _aevaluate_string_pairs(
@@ -385,12 +424,18 @@ Performance may be significantly worse with other models."

         """
         input_ = self._prepare_input(prediction, input, reference)
-        result = await self.acall(
-            inputs=input_,
-            callbacks=callbacks,
-            tags=tags,
-            metadata=metadata,
-            include_run_info=include_run_info,
+
+        # prep config
+        config: RunnableConfig = {}
+        if callbacks is not None:
+            config["callbacks"] = callbacks
+        if tags is not None:
+            config["tags"] = tags
+        if metadata is not None:
+            config["metadata"] = metadata
+
+        result = await self.ainvoke(
+            input_, config=config, include_run_info=include_run_info
         )
         return self._prepare_output(result)
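Finally, a condensed, runnable sketch of the migration pattern this branch applies to every evaluator: `LLMChain(...).run(...)` becomes `(prompt | llm | parser).invoke(...)`, with callbacks passed through the config argument. The fake model and toy prompt below are stand-ins, not code from the PR.

```python
from langchain_core.language_models import FakeListLLM
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template("Grade this answer: {answer}")
llm = FakeListLLM(responses=["GRADE: CORRECT"])

# Old style (deprecated by this branch):
#   eval_chain = LLMChain(llm=llm, prompt=prompt)
#   raw_output = eval_chain.run({"answer": "Paris"}, callbacks=handlers)

# New style: compose a Runnable and pass callbacks via the config argument.
eval_chain = prompt | llm | StrOutputParser()
raw_output = eval_chain.invoke({"answer": "Paris"}, {"callbacks": []})
print(raw_output)  # -> "GRADE: CORRECT"
```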