From 41a4c06a949c9b76f32f2d56d5232f67182c7330 Mon Sep 17 00:00:00 2001
From: Rohan Dey <58871401+RohanDey02@users.noreply.github.com>
Date: Wed, 29 Nov 2023 22:08:50 -0500
Subject: [PATCH] Added support for a Pandas DataFrame OutputParser (#13257)

**Description:**

Added support for a Pandas DataFrame OutputParser with format
instructions, along with unit tests and a demo notebook. Namely, we've
added the ability to request data from a DataFrame, have the LLM parse
the request, and then use that request to retrieve a well-formatted
response.

Within LangChain, it seamlessly integrates with language models like
OpenAI's `text-davinci-003`, facilitating streamlined interaction using
the format instructions (just like the other output parsers).

This parser structures its requests as
`<operation>:<column/row>[<optional_array_params>]`. The instructions
detail permissible operations, valid columns, and array formats,
ensuring clarity and adherence to the required format.

For example:

- When the LLM receives the input: "Retrieve the mean of `num_legs` from
rows 1 to 3."
- The provided format instructions guide the LLM to structure the
request as: "mean:num_legs[1..3]".

The parser processes this formatted request, leveraging the LLM's
understanding to extract the mean of `num_legs` from rows 1 to 3 within
the Pandas DataFrame.

This integration allows users to communicate requests naturally, with
the LLM transforming these instructions into structured commands
understood by the `PandasDataFrameOutputParser`. The format instructions
act as a bridge between natural language queries and precise DataFrame
operations, optimizing communication and data retrieval.

**Issue:**

- https://github.com/langchain-ai/langchain/issues/11532

**Dependencies:**

No additional dependencies :)

**Tag maintainer:**

@baskaryan

**Twitter handle:**

No need. :)

---------

Co-authored-by: Wasee Alam <waseealam@protonmail.com>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
---
 .../output_parsers/pandas_dataframe.ipynb     | 229 ++++++++++++++++++
 .../langchain/output_parsers/__init__.py      |   2 +
 .../output_parsers/format_instructions.py     |  22 ++
 .../output_parsers/pandas_dataframe.py        | 157 ++++++++++++
 .../unit_tests/output_parsers/test_imports.py |   1 +
 .../test_pandas_dataframe_parser.py           | 110 +++++++++
 6 files changed, 521 insertions(+)
 create mode 100644 docs/docs/modules/model_io/output_parsers/pandas_dataframe.ipynb
 create mode 100644 libs/langchain/langchain/output_parsers/pandas_dataframe.py
 create mode 100644 libs/langchain/tests/unit_tests/output_parsers/test_pandas_dataframe_parser.py

diff --git a/docs/docs/modules/model_io/output_parsers/pandas_dataframe.ipynb b/docs/docs/modules/model_io/output_parsers/pandas_dataframe.ipynb
new file mode 100644
index 00000000000..ea0e32ed903
--- /dev/null
+++ b/docs/docs/modules/model_io/output_parsers/pandas_dataframe.ipynb
@@ -0,0 +1,229 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Pandas DataFrame Parser\n",
+    "\n",
+    "A Pandas DataFrame is a popular data structure in the Python programming language, commonly used for data manipulation and analysis. It provides a comprehensive set of tools for working with structured data, making it a versatile option for tasks such as data cleaning, transformation, and analysis.\n",
+    "\n",
+    "This output parser allows users to specify an arbitrary Pandas DataFrame and query LLMs for data in the form of a formatted dictionary that extracts data from the corresponding DataFrame. Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate a well-formed query as per the defined format instructions.\n",
+    "\n",
+    "Use Pandas' DataFrame object to declare the DataFrame you wish to perform queries on."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pprint\n",
+    "from typing import Any, Dict\n",
+    "\n",
+    "import pandas as pd\n",
+    "from langchain.llms import OpenAI\n",
+    "from langchain.output_parsers import PandasDataFrameOutputParser\n",
+    "from langchain.prompts import PromptTemplate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_name = \"text-davinci-003\"\n",
+    "temperature = 0.5\n",
+    "model = OpenAI(model_name=model_name, temperature=temperature)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Solely for documentation purposes.\n",
+    "def format_parser_output(parser_output: Dict[str, Any]) -> None:\n",
+    "    for key in parser_output.keys():\n",
+    "        parser_output[key] = parser_output[key].to_dict()\n",
+    "    return pprint.PrettyPrinter(width=4, compact=True).pprint(parser_output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define your desired Pandas DataFrame.\n",
+    "df = pd.DataFrame(\n",
+    "    {\n",
+    "        \"num_legs\": [2, 4, 8, 0],\n",
+    "        \"num_wings\": [2, 0, 0, 0],\n",
+    "        \"num_specimen_seen\": [10, 2, 1, 8],\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "# Set up a parser + inject instructions into the prompt template.\n",
+    "parser = PandasDataFrameOutputParser(dataframe=df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LLM Output: column:num_wings\n",
+      "{'num_wings': {0: 2,\n",
+      "               1: 0,\n",
+      "               2: 0,\n",
+      "               3: 0}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Here's an example of a column operation being performed.\n",
+    "df_query = \"Retrieve the num_wings column.\"\n",
+    "\n",
+    "# Set up the prompt.\n",
+    "prompt = PromptTemplate(\n",
+    "    template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
+    "    input_variables=[\"query\"],\n",
+    "    partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
+    ")\n",
+    "\n",
+    "_input = prompt.format_prompt(query=df_query)\n",
+    "output = model(_input.to_string())\n",
+    "print(\"LLM Output:\", output)\n",
+    "parser_output = parser.parse(output)\n",
+    "\n",
+    "format_parser_output(parser_output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LLM Output: row:1\n",
+      "{'1': {'num_legs': 4,\n",
+      "       'num_specimen_seen': 2,\n",
+      "       'num_wings': 0}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Here's an example of a row operation being performed.\n",
+    "df_query = \"Retrieve the first row.\"\n",
+    "\n",
+    "# Set up the prompt.\n",
+    "prompt = PromptTemplate(\n",
+    "    template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
+    "    input_variables=[\"query\"],\n",
+    "    partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
+    ")\n",
+    "\n",
+    "_input = prompt.format_prompt(query=df_query)\n",
+    "output = model(_input.to_string())\n",
+    "print(\"LLM Output:\", output)\n",
+    "parser_output = parser.parse(output)\n",
+    "\n",
+    "format_parser_output(parser_output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LLM Output: mean:num_legs[1..3]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'mean': 4.0}"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Here's an example of a random Pandas DataFrame operation limiting the number of rows\n",
+    "df_query = \"Retrieve the average of the num_legs column from rows 1 to 3.\"\n",
+    "\n",
+    "# Set up the prompt.\n",
+    "prompt = PromptTemplate(\n",
+    "    template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
+    "    input_variables=[\"query\"],\n",
+    "    partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
+    ")\n",
+    "\n",
+    "_input = prompt.format_prompt(query=df_query)\n",
+    "output = model(_input.to_string())\n",
+    "print(\"LLM Output:\", output)\n",
+    "parser.parse(output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Here's an example of a poorly formatted query\n",
+    "df_query = \"Retrieve the mean of the num_fingers column.\"\n",
+    "\n",
+    "# Set up the prompt.\n",
+    "prompt = PromptTemplate(\n",
+    "    template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
+    "    input_variables=[\"query\"],\n",
+    "    partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
+    ")\n",
+    "\n",
+    "_input = prompt.format_prompt(query=df_query)\n",
+    "output = model(_input.to_string())  # Expected Output: \"Invalid column: num_fingers\".\n",
+    "print(\"LLM Output:\", output)\n",
+    "parser.parse(output)  # Expected Output: Will raise an OutputParserException."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/libs/langchain/langchain/output_parsers/__init__.py b/libs/langchain/langchain/output_parsers/__init__.py
index 9f7848c984d..e37803b81d0 100644
--- a/libs/langchain/langchain/output_parsers/__init__.py
+++ b/libs/langchain/langchain/output_parsers/__init__.py
@@ -28,6 +28,7 @@ from langchain.output_parsers.openai_tools import (
     JsonOutputToolsParser,
     PydanticToolsParser,
 )
+from langchain.output_parsers.pandas_dataframe import PandasDataFrameOutputParser
 from langchain.output_parsers.pydantic import PydanticOutputParser
 from langchain.output_parsers.rail_parser import GuardrailsOutputParser
 from langchain.output_parsers.regex import RegexParser
@@ -47,6 +48,7 @@ __all__ = [
     "MarkdownListOutputParser",
     "NumberedListOutputParser",
     "OutputFixingParser",
+    "PandasDataFrameOutputParser",
     "PydanticOutputParser",
     "RegexDictParser",
     "RegexParser",
diff --git a/libs/langchain/langchain/output_parsers/format_instructions.py b/libs/langchain/langchain/output_parsers/format_instructions.py
index fd9fb3ba238..91f2007b702 100644
--- a/libs/langchain/langchain/output_parsers/format_instructions.py
+++ b/libs/langchain/langchain/output_parsers/format_instructions.py
@@ -41,3 +41,25 @@ Here are the output tags:
 ```
 {tags}
 ```"""
+
+
+PANDAS_DATAFRAME_FORMAT_INSTRUCTIONS = """The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters.
+1. The column names are limited to the possible columns below.
+2. Arrays must either be a comma-separated list of numbers formatted as [1,3,5], or a range of numbers formatted as [0..4].
+3. Remember that arrays are optional and not necessarily required.
+4. If the column is not in the possible columns or the operation is not a valid Pandas DataFrame operation, return why it is invalid as a sentence starting with either "Invalid column" or "Invalid operation".
+
+As an example, for the formats:
+1. String "column:num_legs" is a well-formatted instance which gets the column num_legs, where num_legs is a possible column.
+2. String "row:1" is a well-formatted instance which gets row 1.
+3. String "column:num_legs[1,2]" is a well-formatted instance which gets the column num_legs for rows 1 and 2, where num_legs is a possible column.
+4. String "row:1[num_legs]" is a well-formatted instance which gets row 1, but for just column num_legs, where num_legs is a possible column.
+5. String "mean:num_legs[1..3]" is a well-formatted instance which takes the mean of num_legs from rows 1 to 3, where num_legs is a possible column and mean is a valid Pandas DataFrame operation.
+6. String "do_something:num_legs" is a badly-formatted instance, where do_something is not a valid Pandas DataFrame operation.
+7. String "mean:invalid_col" is a badly-formatted instance, where invalid_col is not a possible column.
+
+Here are the possible columns:
+```
+{columns}
+```
+"""
diff --git a/libs/langchain/langchain/output_parsers/pandas_dataframe.py b/libs/langchain/langchain/output_parsers/pandas_dataframe.py
new file mode 100644
index 00000000000..85bde591026
--- /dev/null
+++ b/libs/langchain/langchain/output_parsers/pandas_dataframe.py
@@ -0,0 +1,243 @@
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from langchain.output_parsers.format_instructions import (
+    PANDAS_DATAFRAME_FORMAT_INSTRUCTIONS,
+)
+from langchain.pydantic_v1 import validator
+from langchain.schema import BaseOutputParser, OutputParserException
+
+# Array syntaxes accepted after the column/row name of a request.
+_INT_LIST_PATTERN = re.compile(r"\[\d+(,\s*\d+)*\]")  # e.g. [1,3,5]
+_INT_RANGE_PATTERN = re.compile(r"\[(\d+)\.\.(\d+)\]")  # e.g. [0..4]
+_NAME_LIST_PATTERN = re.compile(r"\[[a-zA-Z0-9_]+(?:,\s*[a-zA-Z0-9_]+)*\]")
+_ARRAY_PATTERN = re.compile(r"(\[.*?\])")
+
+_CHECK_FORMAT = "Please check the format instructions."
+
+_Items = List[Union[int, str]]
+
+
+class PandasDataFrameOutputParser(BaseOutputParser):
+    """Parse an LLM response into a query against a Pandas DataFrame.
+
+    A request has the form ``<request_type>:<target>[<optional array>]``:
+
+    * ``column:<name>`` returns a column, optionally limited to the row
+      indices listed in the array.
+    * ``row:<position>`` returns a row, optionally limited to the columns
+      listed in the array.
+    * ``<operation>:<name>`` calls the Series method named ``<operation>``
+      (for example ``mean``) with no arguments on the column, optionally
+      limited to the row indices listed in the array.
+    """
+
+    # The Pandas DataFrame that requests are evaluated against.
+    dataframe: Any
+
+    @validator("dataframe")
+    def validate_dataframe(cls, val: Any) -> Any:
+        """Accept ``val`` only if it is a Pandas DataFrame.
+
+        Raises:
+            ValueError: If ``val`` is not a DataFrame and converts to an empty one.
+            TypeError: If ``val`` is any other non-DataFrame value.
+        """
+        import pandas as pd
+
+        if isinstance(val, pd.DataFrame):
+            return val
+        if pd.DataFrame(val).empty:
+            raise ValueError("DataFrame cannot be empty.")
+
+        raise TypeError(
+            "Wrong type for 'dataframe', must be a subclass "
+            "of Pandas DataFrame (pd.DataFrame)"
+        )
+
+    def parse_array(
+        self, array: str, original_request_params: str
+    ) -> Tuple[List[Union[int, str]], str]:
+        """Parse the bracketed array part of a request.
+
+        Args:
+            array: The bracketed text, e.g. ``"[1,3,5]"``, ``"[0..4]"`` or
+                ``"[col_a,col_b]"``.
+            original_request_params: The full text after the colon, e.g.
+                ``"num_legs[1,3,5]"``.
+
+        Returns:
+            The parsed items (ints for row indices, strings for names) and the
+            text of ``original_request_params`` before its first ``[``.
+
+        Raises:
+            OutputParserException: If the array matches no supported format, is
+                empty, or asks for a row index above the DataFrame's maximum.
+        """
+        parsed_array: List[Union[int, str]] = []
+
+        range_match = _INT_RANGE_PATTERN.match(array)
+        names_match = _NAME_LIST_PATTERN.match(array)
+        if _INT_LIST_PATTERN.match(array):
+            parsed_array = [int(i) for i in re.findall(r"\d+", array)]
+        elif range_match:
+            start, end = map(int, range_match.groups())
+            # Ranges are inclusive; a reversed range yields an empty list and is
+            # rejected below.
+            parsed_array = list(range(start, end + 1))
+        elif names_match:
+            parsed_array = [
+                name.strip() for name in names_match.group().strip("[]").split(",")
+            ]
+
+        if not parsed_array:
+            raise OutputParserException(
+                f"Invalid array format in '{original_request_params}'. "
+                f"{_CHECK_FORMAT}"
+            )
+
+        # Compare the largest requested index (not merely the last one) so that
+        # unsorted lists such as [5,1] are validated too.
+        row_indices = [item for item in parsed_array if isinstance(item, int)]
+        if row_indices:
+            highest = max(row_indices)
+            max_index = self.dataframe.index.max()
+            if highest > max_index:
+                raise OutputParserException(
+                    f"The maximum index {highest} exceeds the maximum index of "
+                    f"the Pandas DataFrame {max_index}."
+                )
+
+        return parsed_array, original_request_params.split("[")[0]
+
+    def parse(self, request: str) -> Dict[str, Any]:
+        """Evaluate a formatted request against the DataFrame.
+
+        Args:
+            request: Text such as ``"column:num_legs"``, ``"row:1[num_legs]"`` or
+                ``"mean:num_legs[1..3]"``.
+
+        Returns:
+            A single-entry dict. For ``column`` and ``row`` requests the key is
+            the requested column name or row position (as a string); for any
+            other request type the key is the operation name.
+
+        Raises:
+            OutputParserException: If the request is malformed, reports an
+                invalid column/operation, or cannot be evaluated.
+        """
+        splitted_request = request.strip().split(":")
+        if len(splitted_request) != 2:
+            raise OutputParserException(
+                f"Request '{request}' is not correctly formatted. "
+                "Please refer to the format instructions."
+            )
+        request_type, request_params = splitted_request
+
+        # The format instructions ask the model to answer with a sentence that
+        # starts with one of these prefixes when it cannot build a request.
+        if request_type in {"Invalid column", "Invalid operation"}:
+            raise OutputParserException(f"{request}. {_CHECK_FORMAT}")
+
+        parsed_array: Optional[_Items] = None
+        target = request_params
+        array_match = _ARRAY_PATTERN.search(request_params)
+        if array_match:
+            parsed_array, target = self.parse_array(
+                array_match.group(1), request_params
+            )
+
+        if request_type == "column":
+            return {target: self._select_column(target, parsed_array)}
+        if request_type == "row":
+            return {target: self._select_row(target, parsed_array)}
+        return {
+            request_type: self._apply_operation(request_type, target, parsed_array)
+        }
+
+    def _rows_matching(self, indices: Optional[_Items]) -> Any:
+        """Return rows whose index label is in ``indices``; all rows if None."""
+        if indices is None:
+            return self.dataframe
+        return self.dataframe[self.dataframe.index.isin(indices)]
+
+    def _get_column(self, column: str, indices: Optional[_Items]) -> Any:
+        """Return ``column`` limited to ``indices``; reject unknown columns."""
+        try:
+            return self._rows_matching(indices)[column]
+        except KeyError as exc:
+            raise OutputParserException(
+                f"Invalid column '{column}'. {_CHECK_FORMAT}"
+            ) from exc
+
+    def _select_column(self, column: str, indices: Optional[_Items]) -> Any:
+        """Handle ``column`` requests; a single row index yields a scalar."""
+        values = self._get_column(column, indices)
+        if indices is None or len(indices) != 1:
+            return values
+        try:
+            # Only the matching row is left, so it sits at position 0 whatever
+            # its index label is (``iloc[label]`` failed for any label but 0).
+            return values.iloc[0]
+        except IndexError as exc:
+            raise OutputParserException(
+                f"Requested index {indices[0]} is out of bounds."
+            ) from exc
+
+    def _select_row(self, row: str, columns: Optional[_Items]) -> Any:
+        """Handle ``row`` requests; a single column name yields a scalar."""
+        try:
+            position = int(row)
+        except ValueError as exc:
+            raise OutputParserException(
+                f"Row '{row}' is not an integer position. {_CHECK_FORMAT}"
+            ) from exc
+
+        frame = self.dataframe
+        if columns is not None:
+            # Unknown names are dropped here; for a single-name request the
+            # lookup below then reports the missing column.
+            frame = frame[frame.columns.intersection(columns)]
+        try:
+            selected = frame.iloc[position]
+        except IndexError as exc:
+            raise OutputParserException(
+                f"Requested index {row} is out of bounds."
+            ) from exc
+
+        if columns is None or len(columns) != 1:
+            return selected
+        try:
+            return selected[columns[0]]
+        except (IndexError, KeyError) as exc:
+            raise OutputParserException(
+                f"Invalid column '{columns[0]}'. {_CHECK_FORMAT}"
+            ) from exc
+
+    def _apply_operation(
+        self, operation: str, column: str, indices: Optional[_Items]
+    ) -> Any:
+        """Call the no-argument Series method ``operation`` on ``column``."""
+        series = self._get_column(column, indices)
+        # NOTE: ``operation`` comes from model output and selects the method to
+        # call. Private/dunder names and non-callable attributes are rejected,
+        # but any public no-argument Series method can still be invoked.
+        method = None
+        if not operation.startswith("_"):
+            method = getattr(series, operation, None)
+        if not callable(method):
+            raise OutputParserException(
+                f"Unsupported request type '{operation}'. {_CHECK_FORMAT}"
+            )
+        try:
+            return method()
+        except (AttributeError, IndexError, KeyError, TypeError, ValueError) as exc:
+            raise OutputParserException(
+                f"Operation '{operation}' failed on column '{column}': {exc}"
+            ) from exc
+
+    def get_format_instructions(self) -> str:
+        """Return the format instructions listing the DataFrame's columns."""
+        return PANDAS_DATAFRAME_FORMAT_INSTRUCTIONS.format(
+            columns=", ".join(map(str, self.dataframe.columns))
+        )
diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
index e4e8d0c1c14..1bd2bce22fc 100644
--- a/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_imports.py
@@ -11,6 +11,7 @@ EXPECTED_ALL = [
     "MarkdownListOutputParser",
     "NumberedListOutputParser",
     "OutputFixingParser",
+    "PandasDataFrameOutputParser",
     "PydanticOutputParser",
     "RegexDictParser",
     "RegexParser",
diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_pandas_dataframe_parser.py b/libs/langchain/tests/unit_tests/output_parsers/test_pandas_dataframe_parser.py
new file mode 100644
index 00000000000..4d809220642
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_pandas_dataframe_parser.py
@@ -0,0 +1,102 @@
+"""Test PandasDataframeParser"""
+import pandas as pd
+
+from langchain.output_parsers.pandas_dataframe import PandasDataFrameOutputParser
+from langchain.schema import OutputParserException
+
+df = pd.DataFrame(
+    {"chicken": [1, 2, 3, 4], "veggies": [5, 4, 3, 2], "steak": [9, 8, 7, 6]}
+)
+
+parser = PandasDataFrameOutputParser(dataframe=df)
+
+
+def _assert_parse_fails(request: str) -> None:
+    """Assert that parsing ``request`` raises an OutputParserException."""
+    try:
+        parser.parse(request)
+    except OutputParserException:
+        return
+    raise AssertionError("Should have raised OutputParserException")
+
+
+# Test Invalid Column
+def test_pandas_output_parser_col_no_array() -> None:
+    _assert_parse_fails("column:num_legs")
+
+
+# Test Column with invalid array (above DataFrame max index)
+def test_pandas_output_parser_col_oob() -> None:
+    _assert_parse_fails("column:chicken[10]")
+
+
+# Test Column with array [x]
+def test_pandas_output_parser_col_first_elem() -> None:
+    expected_output = {"chicken": 1}
+    actual_output = parser.parse("column:chicken[0]")
+    assert actual_output == expected_output
+
+
+# Test Column with array [x,y,z]
+def test_pandas_output_parser_col_multi_elem() -> None:
+    expected_output = {"chicken": pd.Series([1, 2], name="chicken", dtype="int64")}
+    actual_output = parser.parse("column:chicken[0, 1]")
+    assert list(actual_output) == ["chicken"]
+    assert expected_output["chicken"].equals(actual_output["chicken"])
+
+
+# Test Row with invalid row entry (above DataFrame max position)
+def test_pandas_output_parser_row_no_array() -> None:
+    _assert_parse_fails("row:5")
+    _assert_parse_fails("row:10")
+
+
+# Test Row with valid row entry
+def test_pandas_output_parser_row_first() -> None:
+    expected_output = {"1": pd.Series({"chicken": 2, "veggies": 4, "steak": 8})}
+    actual_output = parser.parse("row:1")
+    assert actual_output["1"].equals(expected_output["1"])
+
+
+# Test Row with invalid col entry
+def test_pandas_output_parser_row_no_column() -> None:
+    _assert_parse_fails("row:1[num_legs]")
+
+
+# Test Row with valid col entry
+def test_pandas_output_parser_row_col_1() -> None:
+    expected_output = {"1": 2}
+    actual_output = parser.parse("row:1[chicken]")
+    assert actual_output == expected_output
+
+
+# Test aggregate operations restricted to a row range
+def test_pandas_output_parser_special_ops() -> None:
+    expected_output = [
+        {"mean": 3.0},
+        {"median": 3.0},
+        {"min": 2},
+        {"max": 4},
+        {"var": 1.0},
+        {"std": 1.0},
+        {"count": 3},
+        {"quantile": 3.0},
+    ]
+
+    actual_output = [
+        parser.parse("mean:chicken[1..3]"),
+        parser.parse("median:chicken[1..3]"),
+        parser.parse("min:chicken[1..3]"),
+        parser.parse("max:chicken[1..3]"),
+        parser.parse("var:chicken[1..3]"),
+        parser.parse("std:chicken[1..3]"),
+        parser.parse("count:chicken[1..3]"),
+        parser.parse("quantile:chicken[1..3]"),
+    ]
+
+    assert actual_output == expected_output
+
+
+# Test unsupported operation name
+def test_pandas_output_parser_invalid_special_op() -> None:
+    _assert_parse_fails("riemann_sum:chicken")