From d6d559d50d0dcbff1baf817b5cbf2c27ab09ac0e Mon Sep 17 00:00:00 2001
From: Dhruv Chawla <43818888+Dominastorm@users.noreply.github.com>
Date: Wed, 17 Apr 2024 01:02:03 +0530
Subject: [PATCH] community[minor]: add UpTrainCallbackHandler (#19956)

- **Description:** This PR adds a callback handler for UpTrain. It performs
  evaluations in the RAG pipeline to check the quality of retrieved documents,
  generated queries and responses.
- **Dependencies:**
    - The UpTrainCallbackHandler requires the uptrain package

---------

Co-authored-by: Eugene Yurtsev
---
 .../docs/integrations/callbacks/uptrain.ipynb | 421 ++++++++++++++++++
 docs/docs/integrations/providers/uptrain.md   |  20 +
 .../langchain_community/callbacks/__init__.py |   5 +
 .../callbacks/uptrain_callback.py             | 389 ++++++++++++++++
 .../unit_tests/callbacks/test_imports.py      |   1 +
 5 files changed, 836 insertions(+)
 create mode 100644 docs/docs/integrations/callbacks/uptrain.ipynb
 create mode 100644 docs/docs/integrations/providers/uptrain.md
 create mode 100644 libs/community/langchain_community/callbacks/uptrain_callback.py

diff --git a/docs/docs/integrations/callbacks/uptrain.ipynb b/docs/docs/integrations/callbacks/uptrain.ipynb
new file mode 100644
index 00000000000..0dbb04f9020
--- /dev/null
+++ b/docs/docs/integrations/callbacks/uptrain.ipynb
@@ -0,0 +1,421 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    " \"Open\n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# UpTrain\n",
+    "\n",
+    "> UpTrain [[github](https://github.com/uptrain-ai/uptrain) || [website](https://uptrain.ai/) || [docs](https://docs.uptrain.ai/getting-started/introduction)] is an open-source platform to evaluate and improve LLM applications. It provides grades for 20+ preconfigured checks (covering language, code, embedding use cases), performs root cause analyses on instances of failure cases and provides guidance for resolving them."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## UpTrain Callback Handler\n",
+    "\n",
+    "This notebook showcases the UpTrain callback handler seamlessly integrating into your pipeline, facilitating diverse evaluations. We have chosen a few evaluations that we deemed apt for evaluating the chains. These evaluations run automatically, with results displayed in the output. More details on UpTrain's evaluations can be found [here](https://github.com/uptrain-ai/uptrain?tab=readme-ov-file#pre-built-evaluations-we-offer-).\n",
+    "\n",
+    "Selected retrievers from LangChain are highlighted for demonstration:\n",
+    "\n",
+    "### 1. **Vanilla RAG**:\n",
+    "RAG plays a crucial role in retrieving context and generating responses. To ensure its performance and response quality, we conduct the following evaluations:\n",
+    "\n",
+    "- **[Context Relevance](https://docs.uptrain.ai/predefined-evaluations/context-awareness/context-relevance)**: Determines if the context extracted from the query is relevant to the response.\n",
+    "- **[Factual Accuracy](https://docs.uptrain.ai/predefined-evaluations/context-awareness/factual-accuracy)**: Assesses if the LLM is hallucinating or providing incorrect information.\n",
+    "- **[Response Completeness](https://docs.uptrain.ai/predefined-evaluations/response-quality/response-completeness)**: Checks if the response contains all the information requested by the query.\n",
+    "\n",
+    "### 2. **Multi Query Generation**:\n",
+    "MultiQueryRetriever creates multiple variants of a question that have a similar meaning to the original question. Given the complexity, we include the previous evaluations and add:\n",
+    "\n",
+    "- **[Multi Query Accuracy](https://docs.uptrain.ai/predefined-evaluations/query-quality/multi-query-accuracy)**: Assures that the multi-queries generated mean the same as the original query.\n",
+    "\n",
+    "### 3. **Context Compression and Reranking**:\n",
+    "Re-ranking involves reordering nodes based on relevance to the query and choosing the top n nodes. Since the number of nodes can reduce once the re-ranking is complete, we perform the following evaluations:\n",
+    "\n",
+    "- **[Context Reranking](https://docs.uptrain.ai/predefined-evaluations/context-awareness/context-reranking)**: Checks if the order of re-ranked nodes is more relevant to the query than the original order.\n",
+    "- **[Context Conciseness](https://docs.uptrain.ai/predefined-evaluations/context-awareness/context-conciseness)**: Examines whether the reduced number of nodes still provides all the required information.\n",
+    "\n",
+    "These evaluations collectively ensure the robustness and effectiveness of RAG, the MultiQueryRetriever, and the reranking process in the chain."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Install Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -qU langchain langchain_openai uptrain faiss-cpu flashrank"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "NOTE: You can also install `faiss-gpu` instead of `faiss-cpu` if you want to use the GPU-enabled version of the library."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from getpass import getpass\n",
+    "\n",
+    "from langchain.chains import RetrievalQA\n",
+    "from langchain.retrievers import ContextualCompressionRetriever\n",
+    "from langchain.retrievers.document_compressors import FlashrankRerank\n",
+    "from langchain.retrievers.multi_query import MultiQueryRetriever\n",
+    "from langchain_community.callbacks.uptrain_callback import UpTrainCallbackHandler\n",
+    "from langchain_community.document_loaders import TextLoader\n",
+    "from langchain_community.vectorstores import FAISS\n",
+    "from langchain_core.output_parsers.string import StrOutputParser\n",
+    "from langchain_core.prompts.chat import ChatPromptTemplate\n",
+    "from langchain_core.runnables.passthrough import RunnablePassthrough\n",
+    "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
+    "from langchain_text_splitters import (\n",
+    "    RecursiveCharacterTextSplitter,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load the documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
+    "documents = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Split the document into chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
+    "chunks = text_splitter.split_documents(documents)"
+   ]
+  },
+  {
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the retriever" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = OpenAIEmbeddings()\n", + "db = FAISS.from_documents(chunks, embeddings)\n", + "retriever = db.as_retriever()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define the LLM" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "llm = ChatOpenAI(temperature=0, model=\"gpt-4\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set the openai API key\n", + "This key is required to perform the evaluations. UpTrain uses the GPT models to evaluate the responses generated by the LLM." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "OPENAI_API_KEY = getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "For each of the retrievers below, it is better to define the callback handler again to avoid interference. You can choose between the following options for evaluating using UpTrain:\n", + "\n", + "### 1. **UpTrain's Open-Source Software (OSS)**: \n", + "You can use the open-source evaluation service to evaluate your model.\n", + "In this case, you will need to provie an OpenAI API key. You can get yours [here](https://platform.openai.com/account/api-keys).\n", + "\n", + "Parameters:\n", + "- key_type=\"openai\"\n", + "- api_key=\"OPENAI_API_KEY\"\n", + "- project_name_prefix=\"PROJECT_NAME_PREFIX\"\n", + "\n", + "\n", + "### 2. **UpTrain Managed Service and Dashboards**: \n", + "You can create a free UpTrain account [here](https://uptrain.ai/) and get free trial credits. If you want more trial credits, [book a call with the maintainers of UpTrain here](https://calendly.com/uptrain-sourabh/30min).\n", + "\n", + "UpTrain Managed service provides:\n", + "1. Dashboards with advanced drill-down and filtering options\n", + "1. Insights and common topics among failing cases\n", + "1. Observability and real-time monitoring of production data\n", + "1. Regression testing via seamless integration with your CI/CD pipelines\n", + "\n", + "The notebook contains some screenshots of the dashboards and the insights that you can get from the UpTrain managed service.\n", + "\n", + "Parameters:\n", + "- key_type=\"uptrain\"\n", + "- api_key=\"UPTRAIN_API_KEY\"\n", + "- project_name_prefix=\"PROJECT_NAME_PREFIX\"\n", + "\n", + "\n", + "**Note:** The `project_name_prefix` will be used as prefix for the project names in the UpTrain dashboard. These will be different for different types of evals. For example, if you set project_name_prefix=\"langchain\" and perform the multi_query evaluation, the project name will be \"langchain_multi_query\"." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. 
Vanilla RAG" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "UpTrain callback handler will automatically capture the query, context and response once generated and will run the following three evaluations *(Graded from 0 to 1)* on the response:\n", + "- **[Context Relevance](https://docs.uptrain.ai/predefined-evaluations/context-awareness/context-relevance)**: Check if the context extractedfrom the query is relevant to the response.\n", + "- **[Factual Accuracy](https://docs.uptrain.ai/predefined-evaluations/context-awareness/factual-accuracy)**: Check how factually accurate the response is.\n", + "- **[Response Completeness](https://docs.uptrain.ai/predefined-evaluations/response-quality/response-completeness)**: Check if the response contains all the information that the query is asking for." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the RAG prompt\n", + "template = \"\"\"Answer the question based only on the following context, which can include text and tables:\n", + "{context}\n", + "Question: {question}\n", + "\"\"\"\n", + "rag_prompt_text = ChatPromptTemplate.from_template(template)\n", + "\n", + "# Create the chain\n", + "chain = (\n", + " {\"context\": retriever, \"question\": RunnablePassthrough()}\n", + " | rag_prompt_text\n", + " | llm\n", + " | StrOutputParser()\n", + ")\n", + "\n", + "# Create the uptrain callback handler\n", + "uptrain_callback = UpTrainCallbackHandler(key_type=\"openai\", api_key=OPENAI_API_KEY)\n", + "config = {\"callbacks\": [uptrain_callback]}\n", + "\n", + "# Invoke the chain with a query\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = chain.invoke(query, config=config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Multi Query Generation\n", + "\n", + "The **MultiQueryRetriever** is used to tackle the problem that the RAG pipeline might not return the best set of documents based on the query. It generates multiple queries that mean the same as the original query and then fetches documents for each.\n", + "\n", + "To evluate this retriever, UpTrain will run the following evaluation:\n", + "- **[Multi Query Accuracy](https://docs.uptrain.ai/predefined-evaluations/query-quality/multi-query-accuracy)**: Checks if the multi-queries generated mean the same as the original query." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-04-10 14:09:15.887\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate_on_server\u001b[0m:\u001b[36m376\u001b[0m - \u001b[1mSending evaluation request for rows 0 to <50 to the Uptrain\u001b[0m\n",
+      "\u001b[32m2024-04-10 14:09:21.367\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate\u001b[0m:\u001b[36m365\u001b[0m - \u001b[1mLocal server not running, start the server to log data and visualize in the dashboard!\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Question: What did the president say about Ketanji Brown Jackson\n",
+      "Multi Queries:\n",
+      " - How did the president comment on Ketanji Brown Jackson?\n",
+      " - What were the president's remarks regarding Ketanji Brown Jackson?\n",
+      " - What statements has the president made about Ketanji Brown Jackson?\n",
+      "\n",
+      "Multi Query Accuracy Score: 1.0\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-04-10 14:09:29.142\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate_on_server\u001b[0m:\u001b[36m376\u001b[0m - \u001b[1mSending evaluation request for rows 0 to <50 to the Uptrain\u001b[0m\n",
+      "\u001b[32m2024-04-10 14:09:53.095\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate\u001b[0m:\u001b[36m365\u001b[0m - \u001b[1mLocal server not running, start the server to log data and visualize in the dashboard!\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Question: What did the president say about Ketanji Brown Jackson\n",
+      "Response: The president mentioned that he had nominated Ketanji Brown Jackson to serve on the United States Supreme Court 4 days ago. He described her as one of the nation's top legal minds who will continue Justice Breyer’s legacy of excellence. He also mentioned that she is a former top litigator in private practice, a former federal public defender, and comes from a family of public school educators and police officers. Since her nomination, she has received a broad range of support, including from the Fraternal Order of Police and former judges appointed by both Democrats and Republicans.\n",
+      "\n",
+      "Context Relevance Score: 1.0\n",
+      "Factual Accuracy Score: 1.0\n",
+      "Response Completeness Score: 1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create the retriever\n",
+    "multi_query_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=llm)\n",
+    "\n",
+    "# Create the uptrain callback\n",
+    "uptrain_callback = UpTrainCallbackHandler(key_type=\"openai\", api_key=OPENAI_API_KEY)\n",
+    "config = {\"callbacks\": [uptrain_callback]}\n",
+    "\n",
+    "# Create the RAG prompt\n",
+    "template = \"\"\"Answer the question based only on the following context, which can include text and tables:\n",
+    "{context}\n",
+    "Question: {question}\n",
+    "\"\"\"\n",
+    "rag_prompt_text = ChatPromptTemplate.from_template(template)\n",
+    "\n",
+    "chain = (\n",
+    "    {\"context\": multi_query_retriever, \"question\": RunnablePassthrough()}\n",
+    "    | rag_prompt_text\n",
+    "    | llm\n",
+    "    | StrOutputParser()\n",
+    ")\n",
+    "\n",
+    "# Invoke the chain with a query\n",
+    "question = \"What did the president say about Ketanji Brown Jackson\"\n",
+    "docs = chain.invoke(question, config=config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 3. Context Compression and Reranking\n",
+    "\n",
+    "The reranking process involves reordering nodes based on relevance to the query and choosing the top n nodes. Since the number of nodes can reduce once the reranking is complete, we perform the following evaluations:\n",
+    "- **[Context Reranking](https://docs.uptrain.ai/predefined-evaluations/context-awareness/context-reranking)**: Check if the order of re-ranked nodes is more relevant to the query than the original order.\n",
+    "- **[Context Conciseness](https://docs.uptrain.ai/predefined-evaluations/context-awareness/context-conciseness)**: Check if the reduced number of nodes still provides all the required information."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the retriever\n",
+    "compressor = FlashrankRerank()\n",
+    "compression_retriever = ContextualCompressionRetriever(\n",
+    "    base_compressor=compressor, base_retriever=retriever\n",
+    ")\n",
+    "\n",
+    "# Create the chain\n",
+    "chain = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever)\n",
+    "\n",
+    "# Create the uptrain callback\n",
+    "uptrain_callback = UpTrainCallbackHandler(key_type=\"openai\", api_key=OPENAI_API_KEY)\n",
+    "config = {\"callbacks\": [uptrain_callback]}\n",
+    "\n",
+    "# Invoke the chain with a query\n",
+    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
+    "result = chain.invoke(query, config=config)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

diff --git a/docs/docs/integrations/providers/uptrain.md b/docs/docs/integrations/providers/uptrain.md
new file mode 100644
index 00000000000..e371f27870d
--- /dev/null
+++ b/docs/docs/integrations/providers/uptrain.md
@@ -0,0 +1,20 @@
+# UpTrain
+
+>[UpTrain](https://uptrain.ai/) is an open-source unified platform to evaluate and
+>improve Generative AI applications. It provides grades for 20+ preconfigured evaluations
+>(covering language, code, embedding use cases), performs root cause analysis on failure
+>cases and gives insights on how to resolve them.
+
+## Installation and Setup
+
+```bash
+pip install uptrain
+```
+
+## Callbacks
+
+```python
+from langchain_community.callbacks.uptrain_callback import UpTrainCallbackHandler
+```
+
+See an [example](/docs/integrations/callbacks/uptrain).
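+
+Below is a minimal, self-contained sketch of how the handler can be wired into a
+small RAG chain (mirroring the notebook linked above). It is illustrative rather
+than definitive: it assumes `faiss-cpu` is installed and that the
+`OPENAI_API_KEY` environment variable is set for the embedding and chat models;
+the `api_key` passed to the handler (a placeholder here) is the key UpTrain uses
+to run its evaluations.
+
+```python
+from langchain_community.callbacks.uptrain_callback import UpTrainCallbackHandler
+from langchain_community.vectorstores import FAISS
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+
+# Build a tiny RAG chain (mirrors the notebook above).
+retriever = FAISS.from_texts(
+    ["The president nominated Ketanji Brown Jackson to the Supreme Court."],
+    OpenAIEmbeddings(),
+).as_retriever()
+prompt = ChatPromptTemplate.from_template(
+    "Answer using only this context:\n{context}\nQuestion: {question}"
+)
+chain = (
+    {"context": retriever, "question": RunnablePassthrough()}
+    | prompt
+    | ChatOpenAI(temperature=0)
+    | StrOutputParser()
+)
+
+# key_type="openai" evaluates via UpTrain's open-source mode using GPT models;
+# key_type="uptrain" logs results to the managed dashboards instead.
+uptrain_callback = UpTrainCallbackHandler(
+    key_type="openai",
+    api_key="YOUR_OPENAI_API_KEY",  # placeholder, not a real key
+)
+
+# Scores (context relevance, factual accuracy, response completeness)
+# are printed automatically as the chain runs.
+chain.invoke(
+    "Who was nominated to the Supreme Court?",
+    config={"callbacks": [uptrain_callback]},
+)
+```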
diff --git a/libs/community/langchain_community/callbacks/__init__.py b/libs/community/langchain_community/callbacks/__init__.py
index adc0c9750f2..64005983684 100644
--- a/libs/community/langchain_community/callbacks/__init__.py
+++ b/libs/community/langchain_community/callbacks/__init__.py
@@ -72,6 +72,9 @@ if TYPE_CHECKING:
     from langchain_community.callbacks.trubrics_callback import (
         TrubricsCallbackHandler,  # noqa: F401
     )
+    from langchain_community.callbacks.uptrain_callback import (
+        UpTrainCallbackHandler,  # noqa: F401
+    )
     from langchain_community.callbacks.wandb_callback import (
         WandbCallbackHandler,  # noqa: F401
     )
@@ -101,6 +104,7 @@ _module_lookup = {
     "SageMakerCallbackHandler": "langchain_community.callbacks.sagemaker_callback",
     "StreamlitCallbackHandler": "langchain_community.callbacks.streamlit",
     "TrubricsCallbackHandler": "langchain_community.callbacks.trubrics_callback",
+    "UpTrainCallbackHandler": "langchain_community.callbacks.uptrain_callback",
     "WandbCallbackHandler": "langchain_community.callbacks.wandb_callback",
     "WhyLabsCallbackHandler": "langchain_community.callbacks.whylabs_callback",
     "get_openai_callback": "langchain_community.callbacks.manager",
@@ -136,6 +140,7 @@ __all__ = [
     "SageMakerCallbackHandler",
     "StreamlitCallbackHandler",
     "TrubricsCallbackHandler",
+    "UpTrainCallbackHandler",
    "WandbCallbackHandler",
     "WhyLabsCallbackHandler",
     "get_openai_callback",

diff --git a/libs/community/langchain_community/callbacks/uptrain_callback.py b/libs/community/langchain_community/callbacks/uptrain_callback.py
new file mode 100644
index 00000000000..fd08fd02c58
--- /dev/null
+++ b/libs/community/langchain_community/callbacks/uptrain_callback.py
@@ -0,0 +1,389 @@
+"""
+UpTrain Callback Handler
+
+UpTrain is an open-source platform to evaluate and improve LLM applications. It
+provides grades for 20+ preconfigured checks (covering language, code, embedding
+use cases), performs root cause analyses on instances of failure cases and
+provides guidance for resolving them.
+
+This module contains a callback handler for integrating UpTrain seamlessly into
+your pipeline and facilitating diverse evaluations. The callback handler automates
+various evaluations to assess the performance and effectiveness of the components
+within the pipeline.
+
+The evaluations conducted include:
+
+1. RAG:
+   - Context Relevance: Determines the relevance of the context extracted from
+     the query to the response.
+   - Factual Accuracy: Assesses if the Language Model (LLM) is providing accurate
+     information or hallucinating.
+   - Response Completeness: Checks if the response contains all the information
+     requested by the query.
+
+2. Multi Query Generation:
+   MultiQueryRetriever generates multiple variants of a question with similar
+   meanings to the original question. This evaluation includes previous
+   assessments and adds:
+   - Multi Query Accuracy: Ensures that the multi-queries generated convey the
+     same meaning as the original query.
+
+3. Context Compression and Reranking:
+   Re-ranking involves reordering nodes based on relevance to the query and
+   selecting the top n nodes. Due to the potential reduction in the number of
+   nodes after re-ranking, the following evaluations are performed in addition
+   to the RAG evaluations:
+   - Context Reranking: Determines if the order of re-ranked nodes is more
+     relevant to the query than the original order.
+   - Context Conciseness: Examines whether the reduced number of nodes still
+     provides all the required information.
+
+These evaluations collectively ensure the robustness and effectiveness of the RAG
+query engine, MultiQueryRetriever, and the re-ranking process within the pipeline.
+
+Useful links:
+GitHub: https://github.com/uptrain-ai/uptrain
+Website: https://uptrain.ai/
+Docs: https://docs.uptrain.ai/getting-started/introduction
+
+"""
+
+import logging
+import sys
+from collections import defaultdict
+from typing import (
+    Any,
+    DefaultDict,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Set,
+)
+from uuid import UUID
+
+from langchain_core.callbacks.base import BaseCallbackHandler
+from langchain_core.documents import Document
+from langchain_core.outputs import LLMResult
+
+logger = logging.getLogger(__name__)
+handler = logging.StreamHandler(sys.stdout)
+formatter = logging.Formatter("%(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+
+def import_uptrain() -> Any:
+    """Import the `uptrain` package, raising a helpful error if it is missing."""
+    try:
+        import uptrain
+    except ImportError as e:
+        raise ImportError(
+            "To use the UpTrainCallbackHandler, you need the "
+            "`uptrain` package. Please install it with "
+            "`pip install uptrain`.",
+            e,
+        )
+
+    return uptrain
+
+
+class UpTrainDataSchema:
+    """The UpTrain data schema for tracking evaluation results.
+
+    Args:
+        project_name_prefix (str): Prefix for the project name.
+
+    Attributes:
+        project_name_prefix (str): Prefix for the project name.
+        uptrain_results (DefaultDict[str, Any]): Dictionary to store evaluation results.
+        eval_types (Set[str]): Set to store the types of evaluations.
+        query (str): Query for the RAG evaluation.
+        context (str): Context for the RAG evaluation.
+        response (str): Response for the RAG evaluation.
+        old_context (List[str]): Old context nodes for Context Conciseness evaluation.
+        new_context (List[str]): New context nodes for Context Conciseness evaluation.
+        context_conciseness_run_id (UUID): Run ID for Context Conciseness evaluation.
+        multi_queries (List[str]): List of multi queries for Multi Query evaluation.
+        multi_query_run_id (UUID): Run ID for Multi Query evaluation.
+        multi_query_daughter_run_id (UUID): Run ID for Multi Query daughter evaluation.
+
+    """
+
+    def __init__(self, project_name_prefix: str) -> None:
+        """Initialize the UpTrain data schema."""
+        # For tracking project name and results
+        self.project_name_prefix: str = project_name_prefix
+        self.uptrain_results: DefaultDict[str, Any] = defaultdict(list)
+
+        # For tracking event types
+        self.eval_types: Set[str] = set()
+
+        ## RAG
+        self.query: str = ""
+        self.context: str = ""
+        self.response: str = ""
+
+        ## CONTEXT CONCISENESS
+        self.old_context: List[str] = []
+        self.new_context: List[str] = []
+        self.context_conciseness_run_id: UUID = UUID(int=0)
+
+        ## MULTI QUERY
+        self.multi_queries: List[str] = []
+        self.multi_query_run_id: UUID = UUID(int=0)
+        self.multi_query_daughter_run_id: UUID = UUID(int=0)
+
+
+class UpTrainCallbackHandler(BaseCallbackHandler):
+    """Callback Handler that logs evaluation results to UpTrain and the console.
+
+    Args:
+        project_name_prefix (str): Prefix for the project name.
+        key_type (str): Type of key to use. Must be 'uptrain' or 'openai'.
+        api_key (str): API key for the UpTrain or OpenAI API.
+            (This key is required to perform evaluations using GPT.)
+
+    Raises:
+        ValueError: If the key type is invalid.
+        ImportError: If the `uptrain` package is not installed.
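+
+    Example:
+        A minimal sketch (``chain`` and ``query`` stand in for an existing
+        runnable and its input; the key is a placeholder)::
+
+            uptrain_callback = UpTrainCallbackHandler(
+                key_type="openai", api_key="YOUR_OPENAI_API_KEY"
+            )
+            chain.invoke(query, config={"callbacks": [uptrain_callback]})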
+ + """ + + def __init__( + self, + *, + project_name_prefix: str = "langchain", + key_type: str = "openai", + api_key: str = "sk-****************", # The API key to use for evaluation + model: str = "gpt-3.5-turbo", # The model to use for evaluation + log_results: bool = True, + ) -> None: + """Initializes the `UpTrainCallbackHandler`.""" + super().__init__() + + uptrain = import_uptrain() + + self.log_results = log_results + + # Set uptrain variables + self.schema = UpTrainDataSchema(project_name_prefix=project_name_prefix) + self.first_score_printed_flag = False + + if key_type == "uptrain": + settings = uptrain.Settings(uptrain_access_token=api_key, model=model) + self.uptrain_client = uptrain.APIClient(settings=settings) + elif key_type == "openai": + settings = uptrain.Settings( + openai_api_key=api_key, evaluate_locally=False, model=model + ) + self.uptrain_client = uptrain.EvalLLM(settings=settings) + else: + raise ValueError("Invalid key type: Must be 'uptrain' or 'openai'") + + def uptrain_evaluate( + self, + project_name: str, + data: List[Dict[str, Any]], + checks: List[str], + ) -> None: + """Run an evaluation on the UpTrain server using UpTrain client.""" + if self.uptrain_client.__class__.__name__ == "APIClient": + uptrain_result = self.uptrain_client.log_and_evaluate( + project_name=project_name, + data=data, + checks=checks, + ) + else: + uptrain_result = self.uptrain_client.evaluate( + data=data, + checks=checks, + ) + self.schema.uptrain_results[project_name].append(uptrain_result) + + score_name_map = { + "score_context_relevance": "Context Relevance Score", + "score_factual_accuracy": "Factual Accuracy Score", + "score_response_completeness": "Response Completeness Score", + "score_sub_query_completeness": "Sub Query Completeness Score", + "score_context_reranking": "Context Reranking Score", + "score_context_conciseness": "Context Conciseness Score", + "score_multi_query_accuracy": "Multi Query Accuracy Score", + } + + if self.log_results: + # Set logger level to INFO to print the evaluation results + logger.setLevel(logging.INFO) + + for row in uptrain_result: + columns = list(row.keys()) + for column in columns: + if column == "question": + logger.info(f"\nQuestion: {row[column]}") + self.first_score_printed_flag = False + elif column == "response": + logger.info(f"Response: {row[column]}") + self.first_score_printed_flag = False + elif column == "variants": + logger.info("Multi Queries:") + for variant in row[column]: + logger.info(f" - {variant}") + self.first_score_printed_flag = False + elif column.startswith("score"): + if not self.first_score_printed_flag: + logger.info("") + self.first_score_printed_flag = True + if column in score_name_map: + logger.info(f"{score_name_map[column]}: {row[column]}") + else: + logger.info(f"{column}: {row[column]}") + + if self.log_results: + # Set logger level back to WARNING + # (We are doing this to avoid printing the logs from HTTP requests) + logger.setLevel(logging.WARNING) + + def on_llm_end( + self, + response: LLMResult, + *, + run_id: UUID, + parent_run_id: Optional[UUID] = None, + **kwargs: Any, + ) -> None: + """Log records to uptrain when an LLM ends.""" + uptrain = import_uptrain() + self.schema.response = response.generations[0][0].text + if ( + "qa_rag" in self.schema.eval_types + and parent_run_id != self.schema.multi_query_daugher_run_id + ): + data = [ + { + "question": self.schema.query, + "context": self.schema.context, + "response": self.schema.response, + } + ] + + self.uptrain_evaluate( + 
project_name=f"{self.schema.project_name_prefix}_rag", + data=data, + checks=[ + uptrain.Evals.CONTEXT_RELEVANCE, + uptrain.Evals.FACTUAL_ACCURACY, + uptrain.Evals.RESPONSE_COMPLETENESS, + ], + ) + + def on_chain_start( + self, + serialized: Dict[str, Any], + inputs: Dict[str, Any], + *, + run_id: UUID, + tags: Optional[List[str]] = None, + parent_run_id: Optional[UUID] = None, + metadata: Optional[Dict[str, Any]] = None, + run_type: Optional[str] = None, + name: Optional[str] = None, + **kwargs: Any, + ) -> None: + """Do nothing when chain starts""" + if parent_run_id == self.schema.multi_query_run_id: + self.schema.multi_query_daugher_run_id = run_id + if isinstance(inputs, dict) and set(inputs.keys()) == {"context", "question"}: + self.schema.eval_types.add("qa_rag") + + context = "" + if isinstance(inputs["context"], Document): + context = inputs["context"].page_content + elif isinstance(inputs["context"], list): + for doc in inputs["context"]: + context += doc.page_content + "\n" + elif isinstance(inputs["context"], str): + context = inputs["context"] + self.schema.context = context + self.schema.query = inputs["question"] + pass + + def on_retriever_start( + self, + serialized: Dict[str, Any], + query: str, + *, + run_id: UUID, + parent_run_id: Optional[UUID] = None, + tags: Optional[List[str]] = None, + metadata: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> None: + if "contextual_compression" in serialized["id"]: + self.schema.eval_types.add("contextual_compression") + self.schema.query = query + self.schema.context_conciseness_run_id = run_id + + if "multi_query" in serialized["id"]: + self.schema.eval_types.add("multi_query") + self.schema.multi_query_run_id = run_id + self.schema.query = query + elif "multi_query" in self.schema.eval_types: + self.schema.multi_queries.append(query) + + def on_retriever_end( + self, + documents: Sequence[Document], + *, + run_id: UUID, + parent_run_id: Optional[UUID] = None, + **kwargs: Any, + ) -> Any: + """Run when Retriever ends running.""" + uptrain = import_uptrain() + if run_id == self.schema.multi_query_run_id: + data = [ + { + "question": self.schema.query, + "variants": self.schema.multi_queries, + } + ] + + self.uptrain_evaluate( + project_name=f"{self.schema.project_name_prefix}_multi_query", + data=data, + checks=[uptrain.Evals.MULTI_QUERY_ACCURACY], + ) + if "contextual_compression" in self.schema.eval_types: + if parent_run_id == self.schema.context_conciseness_run_id: + for doc in documents: + self.schema.old_context.append(doc.page_content) + elif run_id == self.schema.context_conciseness_run_id: + for doc in documents: + self.schema.new_context.append(doc.page_content) + context = "\n".join( + [ + f"{index}. {string}" + for index, string in enumerate(self.schema.old_context, start=1) + ] + ) + reranked_context = "\n".join( + [ + f"{index}. 
{string}" + for index, string in enumerate(self.schema.new_context, start=1) + ] + ) + data = [ + { + "question": self.schema.query, + "context": context, + "concise_context": reranked_context, + "reranked_context": reranked_context, + } + ] + self.uptrain_evaluate( + project_name=f"{self.schema.project_name_prefix}_context_reranking", + data=data, + checks=[ + uptrain.Evals.CONTEXT_CONCISENESS, + uptrain.Evals.CONTEXT_RERANKING, + ], + ) diff --git a/libs/community/tests/unit_tests/callbacks/test_imports.py b/libs/community/tests/unit_tests/callbacks/test_imports.py index 648e198c2c9..26e6b7daaad 100644 --- a/libs/community/tests/unit_tests/callbacks/test_imports.py +++ b/libs/community/tests/unit_tests/callbacks/test_imports.py @@ -25,6 +25,7 @@ EXPECTED_ALL = [ "LabelStudioCallbackHandler", "TrubricsCallbackHandler", "FiddlerCallbackHandler", + "UpTrainCallbackHandler", ]