From 41a4c06a949c9b76f32f2d56d5232f67182c7330 Mon Sep 17 00:00:00 2001 From: Rohan Dey <58871401+RohanDey02@users.noreply.github.com> Date: Wed, 29 Nov 2023 22:08:50 -0500 Subject: [PATCH] Added support for a Pandas DataFrame OutputParser (#13257) **Description:** Added support for a Pandas DataFrame OutputParser with format instructions, along with unit tests and a demo notebook. Namely, we've added the ability to request data from a DataFrame, have the LLM parse the request, and then use that request to retrieve a well-formatted response. Within LangChain, it seamlessly integrates with language models like OpenAI's `text-davinci-003`, facilitating streamlined interaction using the format instructions (just like the other output parsers). This parser structures its requests as `<operation>:<column or row>[<optional array parameters>]`. The instructions detail permissible operations, valid columns, and array formats, ensuring clarity and adherence to the required format. For example: - When the LLM receives the input: "Retrieve the mean of `num_legs` from rows 1 to 3." - The provided format instructions guide the LLM to structure the request as: "mean:num_legs[1..3]". The parser processes this formatted request, leveraging the LLM's understanding to extract the mean of `num_legs` from rows 1 to 3 within the Pandas DataFrame. This integration allows users to communicate requests naturally, with the LLM transforming these instructions into structured commands understood by the `PandasDataFrameOutputParser`. The format instructions act as a bridge between natural language queries and precise DataFrame operations, optimizing communication and data retrieval. **Issue:** - https://github.com/langchain-ai/langchain/issues/11532 **Dependencies:** No additional dependencies :) **Tag maintainer:** @baskaryan **Twitter handle:** No need. 
:) --------- Co-authored-by: Wasee Alam Co-authored-by: Harrison Chase --- .../output_parsers/pandas_dataframe.ipynb | 229 ++++++++++++++++++ .../langchain/output_parsers/__init__.py | 2 + .../output_parsers/format_instructions.py | 22 ++ .../output_parsers/pandas_dataframe.py | 157 ++++++++++++ .../unit_tests/output_parsers/test_imports.py | 1 + .../test_pandas_dataframe_parser.py | 110 +++++++++ 6 files changed, 521 insertions(+) create mode 100644 docs/docs/modules/model_io/output_parsers/pandas_dataframe.ipynb create mode 100644 libs/langchain/langchain/output_parsers/pandas_dataframe.py create mode 100644 libs/langchain/tests/unit_tests/output_parsers/test_pandas_dataframe_parser.py diff --git a/docs/docs/modules/model_io/output_parsers/pandas_dataframe.ipynb b/docs/docs/modules/model_io/output_parsers/pandas_dataframe.ipynb new file mode 100644 index 00000000000..ea0e32ed903 --- /dev/null +++ b/docs/docs/modules/model_io/output_parsers/pandas_dataframe.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas DataFrame Parser\n", + "\n", + "A Pandas DataFrame is a popular data structure in the Python programming language, commonly used for data manipulation and analysis. It provides a comprehensive set of tools for working with structured data, making it a versatile option for tasks such as data cleaning, transformation, and analysis.\n", + "\n", + "This output parser allows users to specify an arbitrary Pandas DataFrame and query LLMs for data in the form of a formatted dictionary that extracts data from the corresponding DataFrame. Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate a well-formed query as per the defined format instructions.\n", + "\n", + "Use Pandas' DataFrame object to declare the DataFrame you wish to perform queries on." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pprint\n", + "from typing import Any, Dict\n", + "\n", + "import pandas as pd\n", + "from langchain.llms import OpenAI\n", + "from langchain.output_parsers import PandasDataFrameOutputParser\n", + "from langchain.prompts import PromptTemplate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"text-davinci-003\"\n", + "temperature = 0.5\n", + "model = OpenAI(model_name=model_name, temperature=temperature)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Solely for documentation purposes.\n", + "def format_parser_output(parser_output: Dict[str, Any]) -> None:\n", + " for key in parser_output.keys():\n", + " parser_output[key] = parser_output[key].to_dict()\n", + " return pprint.PrettyPrinter(width=4, compact=True).pprint(parser_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Define your desired Pandas DataFrame.\n", + "df = pd.DataFrame(\n", + " {\n", + " \"num_legs\": [2, 4, 8, 0],\n", + " \"num_wings\": [2, 0, 0, 0],\n", + " \"num_specimen_seen\": [10, 2, 1, 8],\n", + " }\n", + ")\n", + "\n", + "# Set up a parser + inject instructions into the prompt template.\n", + "parser = PandasDataFrameOutputParser(dataframe=df)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LLM Output: column:num_wings\n", + "{'num_wings': {0: 2,\n", + " 1: 0,\n", + " 2: 0,\n", + " 3: 0}}\n" + ] + } + ], + "source": [ + "# Here's an example of a column operation being performed.\n", + "df_query = \"Retrieve the num_wings column.\"\n", + "\n", + "# Set up the prompt.\n", + "prompt = PromptTemplate(\n", + " template=\"Answer the user 
query.\\n{format_instructions}\\n{query}\\n\",\n", + " input_variables=[\"query\"],\n", + " partial_variables={\"format_instructions\": parser.get_format_instructions()},\n", + ")\n", + "\n", + "_input = prompt.format_prompt(query=df_query)\n", + "output = model(_input.to_string())\n", + "print(\"LLM Output:\", output)\n", + "parser_output = parser.parse(output)\n", + "\n", + "format_parser_output(parser_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LLM Output: row:1\n", + "{'1': {'num_legs': 4,\n", + " 'num_specimen_seen': 2,\n", + " 'num_wings': 0}}\n" + ] + } + ], + "source": [ + "# Here's an example of a row operation being performed.\n", + "df_query = \"Retrieve the first row.\"\n", + "\n", + "# Set up the prompt.\n", + "prompt = PromptTemplate(\n", + " template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n", + " input_variables=[\"query\"],\n", + " partial_variables={\"format_instructions\": parser.get_format_instructions()},\n", + ")\n", + "\n", + "_input = prompt.format_prompt(query=df_query)\n", + "output = model(_input.to_string())\n", + "print(\"LLM Output:\", output)\n", + "parser_output = parser.parse(output)\n", + "\n", + "format_parser_output(parser_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LLM Output: mean:num_legs[1..3]\n" + ] + }, + { + "data": { + "text/plain": [ + "{'mean': 4.0}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Here's an example of a random Pandas DataFrame operation limiting the number of rows\n", + "df_query = \"Retrieve the average of the num_legs column from rows 1 to 3.\"\n", + "\n", + "# Set up the prompt.\n", + "prompt = PromptTemplate(\n", + " template=\"Answer the user 
query.\\n{format_instructions}\\n{query}\\n\",\n", + " input_variables=[\"query\"],\n", + " partial_variables={\"format_instructions\": parser.get_format_instructions()},\n", + ")\n", + "\n", + "_input = prompt.format_prompt(query=df_query)\n", + "output = model(_input.to_string())\n", + "print(\"LLM Output:\", output)\n", + "parser.parse(output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Here's an example of a poorly formatted query\n", + "df_query = \"Retrieve the mean of the num_fingers column.\"\n", + "\n", + "# Set up the prompt.\n", + "prompt = PromptTemplate(\n", + " template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n", + " input_variables=[\"query\"],\n", + " partial_variables={\"format_instructions\": parser.get_format_instructions()},\n", + ")\n", + "\n", + "_input = prompt.format_prompt(query=df_query)\n", + "output = model(_input.to_string()) # Expected Output: \"Invalid column: num_fingers\".\n", + "print(\"LLM Output:\", output)\n", + "parser.parse(output) # Expected Output: Will raise an OutputParserException." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/langchain/langchain/output_parsers/__init__.py b/libs/langchain/langchain/output_parsers/__init__.py index 9f7848c984d..e37803b81d0 100644 --- a/libs/langchain/langchain/output_parsers/__init__.py +++ b/libs/langchain/langchain/output_parsers/__init__.py @@ -28,6 +28,7 @@ from langchain.output_parsers.openai_tools import ( JsonOutputToolsParser, PydanticToolsParser, ) +from langchain.output_parsers.pandas_dataframe import PandasDataFrameOutputParser from langchain.output_parsers.pydantic import PydanticOutputParser from langchain.output_parsers.rail_parser import GuardrailsOutputParser from langchain.output_parsers.regex import RegexParser @@ -47,6 +48,7 @@ __all__ = [ "MarkdownListOutputParser", "NumberedListOutputParser", "OutputFixingParser", + "PandasDataFrameOutputParser", "PydanticOutputParser", "RegexDictParser", "RegexParser", diff --git a/libs/langchain/langchain/output_parsers/format_instructions.py b/libs/langchain/langchain/output_parsers/format_instructions.py index fd9fb3ba238..91f2007b702 100644 --- a/libs/langchain/langchain/output_parsers/format_instructions.py +++ b/libs/langchain/langchain/output_parsers/format_instructions.py @@ -41,3 +41,25 @@ Here are the output tags: ``` {tags} ```""" + + +PANDAS_DATAFRAME_FORMAT_INSTRUCTIONS = """The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters. +1. The column names are limited to the possible columns below. +2. 
Arrays must either be a comma-seperated list of numbers formatted as [1,3,5], or it must be in range of numbers formatted as [0..4]. +3. Remember that arrays are optional and not necessarily required. +4. If the column is not in the possible columns or the operation is not a valid Pandas DataFrame operation, return why it is invalid as a sentence starting with either "Invalid column" or "Invalid operation". + +As an example, for the formats: +1. String "column:num_legs" is a well-formatted instance which gets the column num_legs, where num_legs is a possible column. +2. String "row:1" is a well-formatted instance which gets row 1. +3. String "column:num_legs[1,2]" is a well-formatted instance which gets the column num_legs for rows 1 and 2, where num_legs is a possible column. +4. String "row:1[num_legs]" is a well-formatted instance which gets row 1, but for just column num_legs, where num_legs is a possible column. +5. String "mean:num_legs[1..3]" is a well-formatted instance which takes the mean of num_legs from rows 1 to 3, where num_legs is a possible column and mean is a valid Pandas DataFrame operation. +6. String "do_something:num_legs" is a badly-formatted instance, where do_something is not a valid Pandas DataFrame operation. +7. String "mean:invalid_col" is a badly-formatted instance, where invalid_col is not a possible column. 
+ +Here are the possible columns: +``` +{columns} +``` +""" diff --git a/libs/langchain/langchain/output_parsers/pandas_dataframe.py b/libs/langchain/langchain/output_parsers/pandas_dataframe.py new file mode 100644 index 00000000000..85bde591026 --- /dev/null +++ b/libs/langchain/langchain/output_parsers/pandas_dataframe.py @@ -0,0 +1,157 @@ +import re +from typing import Any, Dict, List, Tuple, Union + +from langchain.output_parsers.format_instructions import ( + PANDAS_DATAFRAME_FORMAT_INSTRUCTIONS, +) +from langchain.pydantic_v1 import validator +from langchain.schema import BaseOutputParser, OutputParserException + + +class PandasDataFrameOutputParser(BaseOutputParser): + """Parse an output using Pandas DataFrame format.""" + + """The Pandas DataFrame to parse.""" + dataframe: Any + + @validator("dataframe") + def validate_dataframe(cls, val: Any) -> Any: + import pandas as pd + + if issubclass(type(val), pd.DataFrame): + return val + if pd.DataFrame(val).empty: + raise ValueError("DataFrame cannot be empty.") + + raise TypeError( + "Wrong type for 'dataframe', must be a subclass \ + of Pandas DataFrame (pd.DataFrame)" + ) + + def parse_array( + self, array: str, original_request_params: str + ) -> Tuple[List[Union[int, str]], str]: + parsed_array: List[Union[int, str]] = [] + + # Check if the format is [1,3,5] + if re.match(r"\[\d+(,\s*\d+)*\]", array): + parsed_array = [int(i) for i in re.findall(r"\d+", array)] + # Check if the format is [1..5] + elif re.match(r"\[(\d+)\.\.(\d+)\]", array): + match = re.match(r"\[(\d+)\.\.(\d+)\]", array) + if match: + start, end = map(int, match.groups()) + parsed_array = list(range(start, end + 1)) + else: + raise OutputParserException( + f"Unable to parse the array provided in {array}. \ + Please check the format instructions." 
+ ) + # Check if the format is ["column_name"] + elif re.match(r"\[[a-zA-Z0-9_]+(?:,[a-zA-Z0-9_]+)*\]", array): + match = re.match(r"\[[a-zA-Z0-9_]+(?:,[a-zA-Z0-9_]+)*\]", array) + if match: + parsed_array = list(map(str, match.group().strip("[]").split(","))) + else: + raise OutputParserException( + f"Unable to parse the array provided in {array}. \ + Please check the format instructions." + ) + + # Validate the array + if not parsed_array: + raise OutputParserException( + f"Invalid array format in '{original_request_params}'. \ + Please check the format instructions." + ) + elif ( + isinstance(parsed_array[0], int) + and parsed_array[-1] > self.dataframe.index.max() + ): + raise OutputParserException( + f"The maximum index {parsed_array[-1]} exceeds the maximum index of \ + the Pandas DataFrame {self.dataframe.index.max()}." + ) + + return parsed_array, original_request_params.split("[")[0] + + def parse(self, request: str) -> Dict[str, Any]: + stripped_request_params = None + splitted_request = request.strip().split(":") + if len(splitted_request) != 2: + raise OutputParserException( + f"Request '{request}' is not correctly formatted. \ + Please refer to the format instructions." + ) + result = {} + try: + request_type, request_params = splitted_request + if request_type in {"Invalid column", "Invalid operation"}: + raise OutputParserException( + f"{request}. Please check the format instructions." 
+ ) + array_exists = re.search(r"(\[.*?\])", request_params) + if array_exists: + parsed_array, stripped_request_params = self.parse_array( + array_exists.group(1), request_params + ) + if request_type == "column": + filtered_df = self.dataframe[ + self.dataframe.index.isin(parsed_array) + ] + if len(parsed_array) == 1: + result[stripped_request_params] = filtered_df[ + stripped_request_params + ].iloc[parsed_array[0]] + else: + result[stripped_request_params] = filtered_df[ + stripped_request_params + ] + elif request_type == "row": + filtered_df = self.dataframe[ + self.dataframe.columns.intersection(parsed_array) + ] + if len(parsed_array) == 1: + result[stripped_request_params] = filtered_df.iloc[ + int(stripped_request_params) + ][parsed_array[0]] + else: + result[stripped_request_params] = filtered_df.iloc[ + int(stripped_request_params) + ] + else: + filtered_df = self.dataframe[ + self.dataframe.index.isin(parsed_array) + ] + result[request_type] = getattr( + filtered_df[stripped_request_params], request_type + )() + else: + if request_type == "column": + result[request_params] = self.dataframe[request_params] + elif request_type == "row": + result[request_params] = self.dataframe.iloc[int(request_params)] + else: + result[request_type] = getattr( + self.dataframe[request_params], request_type + )() + except (AttributeError, IndexError, KeyError): + if request_type not in {"column", "row"}: + raise OutputParserException( + f"Unsupported request type '{request_type}'. \ + Please check the format instructions." 
+ ) + raise OutputParserException( + f"""Requested index { + request_params + if stripped_request_params is None + else stripped_request_params + } is out of bounds.""" + ) + + return result + + def get_format_instructions(self) -> str: + return PANDAS_DATAFRAME_FORMAT_INSTRUCTIONS.format( + columns=", ".join(self.dataframe.columns) + ) diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py index e4e8d0c1c14..1bd2bce22fc 100644 --- a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py +++ b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py @@ -11,6 +11,7 @@ EXPECTED_ALL = [ "MarkdownListOutputParser", "NumberedListOutputParser", "OutputFixingParser", + "PandasDataFrameOutputParser", "PydanticOutputParser", "RegexDictParser", "RegexParser", diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_pandas_dataframe_parser.py b/libs/langchain/tests/unit_tests/output_parsers/test_pandas_dataframe_parser.py new file mode 100644 index 00000000000..4d809220642 --- /dev/null +++ b/libs/langchain/tests/unit_tests/output_parsers/test_pandas_dataframe_parser.py @@ -0,0 +1,110 @@ +"""Test PandasDataframeParser""" +import pandas as pd + +from langchain.output_parsers.pandas_dataframe import PandasDataFrameOutputParser +from langchain.schema import OutputParserException + +df = pd.DataFrame( + {"chicken": [1, 2, 3, 4], "veggies": [5, 4, 3, 2], "steak": [9, 8, 7, 6]} +) + +parser = PandasDataFrameOutputParser(dataframe=df) + + +# Test Invalid Column +def test_pandas_output_parser_col_no_array() -> None: + try: + parser.parse("column:num_legs") + assert False, "Should have raised OutputParserException" + except OutputParserException: + assert True + + +# Test Column with invalid array (above DataFrame max index) +def test_pandas_output_parser_col_oob() -> None: + try: + parser.parse("row:10") + assert False, "Should have raised OutputParserException" + except 
OutputParserException: + assert True + + +# Test Column with array [x] +def test_pandas_output_parser_col_first_elem() -> None: + expected_output = {"chicken": 1} + actual_output = parser.parse("column:chicken[0]") + assert actual_output == expected_output + + +# Test Column with array [x,y,z] +def test_pandas_output_parser_col_multi_elem() -> None: + expected_output = {"chicken": pd.Series([1, 2], name="chicken", dtype="int64")} + actual_output = parser.parse("column:chicken[0, 1]") + for key in actual_output.keys(): + assert expected_output["chicken"].equals(actual_output[key]) + + +# Test Row with invalid row entry +def test_pandas_output_parser_row_no_array() -> None: + try: + parser.parse("row:5") + assert False, "Should have raised OutputParserException" + except OutputParserException: + assert True + + +# Test Row with valid row entry +def test_pandas_output_parser_row_first() -> None: + expected_output = {"1": pd.Series({"chicken": 2, "veggies": 4, "steak": 8})} + actual_output = parser.parse("row:1") + assert actual_output["1"].equals(expected_output["1"]) + + +# Test Row with invalid col entry +def test_pandas_output_parser_row_no_column() -> None: + try: + parser.parse("row:1[num_legs]") + assert False, "Should have raised OutputParserException" + except OutputParserException: + assert True + + +# Test Row with valid col entry +def test_pandas_output_parser_row_col_1() -> None: + expected_output = {"1": 2} + actual_output = parser.parse("row:1[chicken]") + assert actual_output == expected_output + + +def test_pandas_output_parser_special_ops() -> None: + actual_output = [ + {"mean": 3.0}, + {"median": 3.0}, + {"min": 2}, + {"max": 4}, + {"var": 1.0}, + {"std": 1.0}, + {"count": 3}, + {"quantile": 3.0}, + ] + + expected_output = [ + parser.parse("mean:chicken[1..3]"), + parser.parse("median:chicken[1..3]"), + parser.parse("min:chicken[1..3]"), + parser.parse("max:chicken[1..3]"), + parser.parse("var:chicken[1..3]"), + parser.parse("std:chicken[1..3]"), + 
parser.parse("count:chicken[1..3]"), + parser.parse("quantile:chicken[1..3]"), + ] + + assert actual_output == expected_output + + +def test_pandas_output_parser_invalid_special_op() -> None: + try: + parser.parse("riemann_sum:chicken") + assert False, "Should have raised OutputParserException" + except OutputParserException: + assert True