mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-10-30 23:29:54 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			223 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			223 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| {
 | |
|  "cells": [
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "# Google BigQuery\n",
 | |
|     "\n",
 | |
|     ">[Google BigQuery](https://cloud.google.com/bigquery) is a serverless and cost-effective enterprise data warehouse that works across clouds and scales with your data.\n",
 | |
|     "`BigQuery` is a part of the `Google Cloud Platform`.\n",
 | |
|     "\n",
 | |
|     "Load a `BigQuery` query with one document per row."
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "metadata": {
 | |
|     "tags": []
 | |
|    },
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "#!pip install google-cloud-bigquery"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 3,
 | |
|    "metadata": {
 | |
|     "tags": []
 | |
|    },
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "from langchain.document_loaders import BigQueryLoader"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 3,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "BASE_QUERY = \"\"\"\n",
 | |
|     "SELECT\n",
 | |
|     "  id,\n",
 | |
|     "  dna_sequence,\n",
 | |
|     "  organism\n",
 | |
|     "FROM (\n",
 | |
|     "  SELECT\n",
 | |
|     "    ARRAY (\n",
 | |
|     "    SELECT\n",
 | |
|     "      AS STRUCT 1 AS id, \"ATTCGA\" AS dna_sequence, \"Lokiarchaeum sp. (strain GC14_75).\" AS organism\n",
 | |
|     "    UNION ALL\n",
 | |
|     "    SELECT\n",
 | |
|     "      AS STRUCT 2 AS id, \"AGGCGA\" AS dna_sequence, \"Heimdallarchaeota archaeon (strain LC_2).\" AS organism\n",
 | |
|     "    UNION ALL\n",
 | |
|     "    SELECT\n",
 | |
|     "      AS STRUCT 3 AS id, \"TCCGGA\" AS dna_sequence, \"Acidianus hospitalis (strain W1).\" AS organism) AS new_array),\n",
 | |
|     "  UNNEST(new_array)\n",
 | |
|     "\"\"\""
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "## Basic Usage"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 6,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "loader = BigQueryLoader(BASE_QUERY)\n",
 | |
|     "\n",
 | |
|     "data = loader.load()"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 7,
 | |
|    "metadata": {},
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stdout",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "[Document(page_content='id: 1\\ndna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).', lookup_str='', metadata={}, lookup_index=0), Document(page_content='id: 2\\ndna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).', lookup_str='', metadata={}, lookup_index=0), Document(page_content='id: 3\\ndna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).', lookup_str='', metadata={}, lookup_index=0)]\n"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "print(data)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "## Specifying Which Columns are Content vs Metadata"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 8,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "loader = BigQueryLoader(\n",
 | |
|     "    BASE_QUERY,\n",
 | |
|     "    page_content_columns=[\"dna_sequence\", \"organism\"],\n",
 | |
|     "    metadata_columns=[\"id\"],\n",
 | |
|     ")\n",
 | |
|     "\n",
 | |
|     "data = loader.load()"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 9,
 | |
|    "metadata": {},
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stdout",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "[Document(page_content='dna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).', lookup_str='', metadata={'id': 1}, lookup_index=0), Document(page_content='dna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).', lookup_str='', metadata={'id': 2}, lookup_index=0), Document(page_content='dna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).', lookup_str='', metadata={'id': 3}, lookup_index=0)]\n"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "print(data)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "## Adding Source to Metadata"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 18,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "# Note that the `id` column is being returned twice, with one instance aliased as `source`\n",
 | |
|     "ALIASED_QUERY = \"\"\"\n",
 | |
|     "SELECT\n",
 | |
|     "  id,\n",
 | |
|     "  dna_sequence,\n",
 | |
|     "  organism,\n",
 | |
|     "  id as source\n",
 | |
|     "FROM (\n",
 | |
|     "  SELECT\n",
 | |
|     "    ARRAY (\n",
 | |
|     "    SELECT\n",
 | |
|     "      AS STRUCT 1 AS id, \"ATTCGA\" AS dna_sequence, \"Lokiarchaeum sp. (strain GC14_75).\" AS organism\n",
 | |
|     "    UNION ALL\n",
 | |
|     "    SELECT\n",
 | |
|     "      AS STRUCT 2 AS id, \"AGGCGA\" AS dna_sequence, \"Heimdallarchaeota archaeon (strain LC_2).\" AS organism\n",
 | |
|     "    UNION ALL\n",
 | |
|     "    SELECT\n",
 | |
|     "      AS STRUCT 3 AS id, \"TCCGGA\" AS dna_sequence, \"Acidianus hospitalis (strain W1).\" AS organism) AS new_array),\n",
 | |
|     "  UNNEST(new_array)\n",
 | |
|     "\"\"\""
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 19,
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "loader = BigQueryLoader(ALIASED_QUERY, metadata_columns=[\"source\"])\n",
 | |
|     "\n",
 | |
|     "data = loader.load()"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 20,
 | |
|    "metadata": {},
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stdout",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "[Document(page_content='id: 1\\ndna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).\\nsource: 1', lookup_str='', metadata={'source': 1}, lookup_index=0), Document(page_content='id: 2\\ndna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).\\nsource: 2', lookup_str='', metadata={'source': 2}, lookup_index=0), Document(page_content='id: 3\\ndna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).\\nsource: 3', lookup_str='', metadata={'source': 3}, lookup_index=0)]\n"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "print(data)"
 | |
|    ]
 | |
|   }
 | |
|  ],
 | |
|  "metadata": {
 | |
|   "kernelspec": {
 | |
|    "display_name": "Python 3 (ipykernel)",
 | |
|    "language": "python",
 | |
|    "name": "python3"
 | |
|   },
 | |
|   "language_info": {
 | |
|    "codemirror_mode": {
 | |
|     "name": "ipython",
 | |
|     "version": 3
 | |
|    },
 | |
|    "file_extension": ".py",
 | |
|    "mimetype": "text/x-python",
 | |
|    "name": "python",
 | |
|    "nbconvert_exporter": "python",
 | |
|    "pygments_lexer": "ipython3",
 | |
|    "version": "3.10.6"
 | |
|   }
 | |
|  },
 | |
|  "nbformat": 4,
 | |
|  "nbformat_minor": 4
 | |
| }
 |