mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-10-31 07:41:40 +00:00 
			
		
		
		
	# Add minor fixes for PySpark Document Loader Docs Renamed "PySpack" to "PySpark" and executed the notebook to show outputs.
		
			
				
	
	
		
			156 lines
		
	
	
		
			6.1 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			156 lines
		
	
	
		
			6.1 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| {
 | |
|  "cells": [
 | |
|   {
 | |
|    "attachments": {},
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "# PySpark DataFrame Loader\n",
 | |
|     "\n",
 | |
|     "This notebook goes over how to load data from a [PySpark](https://spark.apache.org/docs/latest/api/python/) DataFrame."
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 1,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "#!pip install pyspark"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 2,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "from pyspark.sql import SparkSession"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 3,
 | |
|    "metadata": {},
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stderr",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "Setting default log level to \"WARN\".\n",
 | |
|       "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
 | |
|       "23/05/31 14:08:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "spark = SparkSession.builder.getOrCreate()"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 4,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "df = spark.read.csv('example_data/mlb_teams_2012.csv', header=True)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 5,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "from langchain.document_loaders import PySparkDataFrameLoader"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 6,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "loader = PySparkDataFrameLoader(spark, df, page_content_column=\"Team\")"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 7,
 | |
|    "metadata": {},
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stderr",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "[Stage 8:>                                                          (0 + 1) / 1]\r"
 | |
|      ]
 | |
|     },
 | |
|     {
 | |
|      "data": {
 | |
|       "text/plain": [
 | |
|        "[Document(page_content='Nationals', metadata={' \"Payroll (millions)\"': '     81.34', ' \"Wins\"': ' 98'}),\n",
 | |
|        " Document(page_content='Reds', metadata={' \"Payroll (millions)\"': '          82.20', ' \"Wins\"': ' 97'}),\n",
 | |
|        " Document(page_content='Yankees', metadata={' \"Payroll (millions)\"': '      197.96', ' \"Wins\"': ' 95'}),\n",
 | |
|        " Document(page_content='Giants', metadata={' \"Payroll (millions)\"': '       117.62', ' \"Wins\"': ' 94'}),\n",
 | |
|        " Document(page_content='Braves', metadata={' \"Payroll (millions)\"': '        83.31', ' \"Wins\"': ' 94'}),\n",
 | |
|        " Document(page_content='Athletics', metadata={' \"Payroll (millions)\"': '     55.37', ' \"Wins\"': ' 94'}),\n",
 | |
|        " Document(page_content='Rangers', metadata={' \"Payroll (millions)\"': '      120.51', ' \"Wins\"': ' 93'}),\n",
 | |
|        " Document(page_content='Orioles', metadata={' \"Payroll (millions)\"': '       81.43', ' \"Wins\"': ' 93'}),\n",
 | |
|        " Document(page_content='Rays', metadata={' \"Payroll (millions)\"': '          64.17', ' \"Wins\"': ' 90'}),\n",
 | |
|        " Document(page_content='Angels', metadata={' \"Payroll (millions)\"': '       154.49', ' \"Wins\"': ' 89'}),\n",
 | |
|        " Document(page_content='Tigers', metadata={' \"Payroll (millions)\"': '       132.30', ' \"Wins\"': ' 88'}),\n",
 | |
|        " Document(page_content='Cardinals', metadata={' \"Payroll (millions)\"': '    110.30', ' \"Wins\"': ' 88'}),\n",
 | |
|        " Document(page_content='Dodgers', metadata={' \"Payroll (millions)\"': '       95.14', ' \"Wins\"': ' 86'}),\n",
 | |
|        " Document(page_content='White Sox', metadata={' \"Payroll (millions)\"': '     96.92', ' \"Wins\"': ' 85'}),\n",
 | |
|        " Document(page_content='Brewers', metadata={' \"Payroll (millions)\"': '       97.65', ' \"Wins\"': ' 83'}),\n",
 | |
|        " Document(page_content='Phillies', metadata={' \"Payroll (millions)\"': '     174.54', ' \"Wins\"': ' 81'}),\n",
 | |
|        " Document(page_content='Diamondbacks', metadata={' \"Payroll (millions)\"': '  74.28', ' \"Wins\"': ' 81'}),\n",
 | |
|        " Document(page_content='Pirates', metadata={' \"Payroll (millions)\"': '       63.43', ' \"Wins\"': ' 79'}),\n",
 | |
|        " Document(page_content='Padres', metadata={' \"Payroll (millions)\"': '        55.24', ' \"Wins\"': ' 76'}),\n",
 | |
|        " Document(page_content='Mariners', metadata={' \"Payroll (millions)\"': '      81.97', ' \"Wins\"': ' 75'}),\n",
 | |
|        " Document(page_content='Mets', metadata={' \"Payroll (millions)\"': '          93.35', ' \"Wins\"': ' 74'}),\n",
 | |
|        " Document(page_content='Blue Jays', metadata={' \"Payroll (millions)\"': '     75.48', ' \"Wins\"': ' 73'}),\n",
 | |
|        " Document(page_content='Royals', metadata={' \"Payroll (millions)\"': '        60.91', ' \"Wins\"': ' 72'}),\n",
 | |
|        " Document(page_content='Marlins', metadata={' \"Payroll (millions)\"': '      118.07', ' \"Wins\"': ' 69'}),\n",
 | |
|        " Document(page_content='Red Sox', metadata={' \"Payroll (millions)\"': '      173.18', ' \"Wins\"': ' 69'}),\n",
 | |
|        " Document(page_content='Indians', metadata={' \"Payroll (millions)\"': '       78.43', ' \"Wins\"': ' 68'}),\n",
 | |
|        " Document(page_content='Twins', metadata={' \"Payroll (millions)\"': '         94.08', ' \"Wins\"': ' 66'}),\n",
 | |
|        " Document(page_content='Rockies', metadata={' \"Payroll (millions)\"': '       78.06', ' \"Wins\"': ' 64'}),\n",
 | |
|        " Document(page_content='Cubs', metadata={' \"Payroll (millions)\"': '          88.19', ' \"Wins\"': ' 61'}),\n",
 | |
|        " Document(page_content='Astros', metadata={' \"Payroll (millions)\"': '        60.65', ' \"Wins\"': ' 55'})]"
 | |
|       ]
 | |
|      },
 | |
|      "execution_count": 7,
 | |
|      "metadata": {},
 | |
|      "output_type": "execute_result"
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "loader.load()"
 | |
|    ]
 | |
|   }
 | |
|  ],
 | |
|  "metadata": {
 | |
|   "kernelspec": {
 | |
|    "display_name": "Python 3 (ipykernel)",
 | |
|    "language": "python",
 | |
|    "name": "python3"
 | |
|   },
 | |
|   "language_info": {
 | |
|    "codemirror_mode": {
 | |
|     "name": "ipython",
 | |
|     "version": 3
 | |
|    },
 | |
|    "file_extension": ".py",
 | |
|    "mimetype": "text/x-python",
 | |
|    "name": "python",
 | |
|    "nbconvert_exporter": "python",
 | |
|    "pygments_lexer": "ipython3",
 | |
|    "version": "3.10.9"
 | |
|   }
 | |
|  },
 | |
|  "nbformat": 4,
 | |
|  "nbformat_minor": 2
 | |
| }
 |