{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Spark Dataframe Agent\n", "\n", "This notebook shows how to use agents to interact with a Spark DataFrame. It is mostly optimized for question answering.\n", "\n", "**NOTE: this agent calls the Python agent under the hood, which executes LLM-generated Python code. This can be dangerous if the generated code is harmful, so use it cautiously.**" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from langchain.agents import create_spark_dataframe_agent\n", "import os\n", "\n", "os.environ[\"OPENAI_API_KEY\"] = \"...input_your_openai_api_key...\"" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n", "|PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked|\n", "+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n", "| 1| 0| 3|Braund, Mr. Owen ...| male|22.0| 1| 0| A/5 21171| 7.25| null| S|\n", "| 2| 1| 1|Cumings, Mrs. Joh...|female|38.0| 1| 0| PC 17599|71.2833| C85| C|\n", "| 3| 1| 3|Heikkinen, Miss. ...|female|26.0| 0| 0|STON/O2. 3101282| 7.925| null| S|\n", "| 4| 1| 1|Futrelle, Mrs. Ja...|female|35.0| 1| 0| 113803| 53.1| C123| S|\n", "| 5| 0| 3|Allen, Mr. Willia...| male|35.0| 0| 0| 373450| 8.05| null| S|\n", "| 6| 0| 3| Moran, Mr. James| male|null| 0| 0| 330877| 8.4583| null| Q|\n", "| 7| 0| 1|McCarthy, Mr. Tim...| male|54.0| 0| 0| 17463|51.8625| E46| S|\n", "| 8| 0| 3|Palsson, Master. ...| male| 2.0| 3| 1| 349909| 21.075| null| S|\n", "| 9| 1| 3|Johnson, Mrs. Osc...|female|27.0| 0| 2| 347742|11.1333| null| S|\n", "| 10| 1| 2|Nasser, Mrs. Nich...|female|14.0| 1| 0| 237736|30.0708| null| C|\n", "| 11| 1| 3|Sandstrom, Miss. 
...|female| 4.0| 1| 1| PP 9549| 16.7| G6| S|\n", "| 12| 1| 1|Bonnell, Miss. El...|female|58.0| 0| 0| 113783| 26.55| C103| S|\n", "| 13| 0| 3|Saundercock, Mr. ...| male|20.0| 0| 0| A/5. 2151| 8.05| null| S|\n", "| 14| 0| 3|Andersson, Mr. An...| male|39.0| 1| 5| 347082| 31.275| null| S|\n", "| 15| 0| 3|Vestrom, Miss. Hu...|female|14.0| 0| 0| 350406| 7.8542| null| S|\n", "| 16| 1| 2|Hewlett, Mrs. (Ma...|female|55.0| 0| 0| 248706| 16.0| null| S|\n", "| 17| 0| 3|Rice, Master. Eugene| male| 2.0| 4| 1| 382652| 29.125| null| Q|\n", "| 18| 1| 2|Williams, Mr. Cha...| male|null| 0| 0| 244373| 13.0| null| S|\n", "| 19| 0| 3|Vander Planke, Mr...|female|31.0| 1| 0| 345763| 18.0| null| S|\n", "| 20| 1| 3|Masselmani, Mrs. ...|female|null| 0| 0| 2649| 7.225| null| C|\n", "+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "from langchain.llms import OpenAI\n", "from pyspark.sql import SparkSession\n", "\n", "spark = SparkSession.builder.getOrCreate()\n", "csv_file_path = \"titanic.csv\"\n", "df = spark.read.csv(csv_file_path, header=True, inferSchema=True)\n", "df.show()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "agent = create_spark_dataframe_agent(llm=OpenAI(temperature=0), df=df, verbose=True)\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3mThought: I need to find out how many rows are in the dataframe\n", "Action: python_repl_ast\n", "Action Input: df.count()\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3m891\u001b[0m\n", "Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n", "Final Answer: There are 891 rows in the dataframe.\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] }, { "data": { 
"text/plain": [ "'There are 891 rows in the dataframe.'" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "agent.run(\"how many rows are there?\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3mThought: I need to find out how many people have more than 3 siblings\n", "Action: python_repl_ast\n", "Action Input: df.filter(df.SibSp > 3).count()\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3m30\u001b[0m\n", "Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n", "Final Answer: 30 people have more than 3 siblings.\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] }, { "data": { "text/plain": [ "'30 people have more than 3 siblings.'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "agent.run(\"how many people have more than 3 siblings\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3mThought: I need to get the average age first\n", "Action: python_repl_ast\n", "Action Input: df.agg({\"Age\": \"mean\"}).collect()[0][0]\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3m29.69911764705882\u001b[0m\n", "Thought:\u001b[32;1m\u001b[1;3m I now have the average age, I need to get the square root\n", "Action: python_repl_ast\n", "Action Input: math.sqrt(29.69911764705882)\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3mname 'math' is not defined\u001b[0m\n", "Thought:\u001b[32;1m\u001b[1;3m I need to import math first\n", "Action: python_repl_ast\n", "Action Input: import math\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3m\u001b[0m\n", "Thought:\u001b[32;1m\u001b[1;3m I now have the math library 
imported, I can get the square root\n", "Action: python_repl_ast\n", "Action Input: math.sqrt(29.69911764705882)\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3m5.449689683556195\u001b[0m\n", "Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n", "Final Answer: 5.449689683556195\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] }, { "data": { "text/plain": [ "'5.449689683556195'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "agent.run(\"whats the square root of the average age?\")" ] } ], "metadata": { "kernelspec": { "display_name": "LangChain", "language": "python", "name": "langchain" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }