mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-05 19:15:44 +00:00
Harrison/airbyte (#989)
Co-authored-by: zanderchase <zanderchase@gmail.com> Co-authored-by: Harrison Chase <harrisonchase@Harrisons-MacBook-Pro.local>
This commit is contained in:
parent
e9799d6821
commit
2e96704d59
171
docs/modules/document_loaders/examples/airbyte_json.ipynb
Normal file
171
docs/modules/document_loaders/examples/airbyte_json.ipynb
Normal file
@ -0,0 +1,171 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1f3a5ebf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Airbyte JSON\n",
|
||||
"This covers how to load any source from Airbyte into a local JSON file that can be read in as a document\n",
|
||||
"\n",
|
||||
"Prereqs:\n",
|
||||
"Have docker desktop installed\n",
|
||||
"\n",
|
||||
"Steps:\n",
|
||||
"\n",
|
||||
"1) clone Airbyte from GitHub - `git clone https://github.com/airbytehq/airbyte.git`\n",
|
||||
"\n",
|
||||
"2) switch into Airbyte directory - `cd airbyte`\n",
|
||||
"\n",
|
||||
"3) start Airbyte - `docker compose up`\n",
|
||||
"\n",
|
||||
"4) In your browser, just visit http://localhost:8000. You will be asked for a username and password. By default, that's username `airbyte` and password `password`.\n",
|
||||
"\n",
|
||||
"5) Setup any source you wish\n",
|
||||
"\n",
|
||||
"6) Set destination as Local JSON, with specified destination path - lets say `/json_data`. Set up manual sync.\n",
|
||||
"\n",
|
||||
"7) Run the connection!\n",
|
||||
"\n",
|
||||
"7) To see what files are create, you can navigate to: `file:///tmp/airbyte_local`\n",
|
||||
"\n",
|
||||
"8) Find your data and copy path. That path should be saved in the file variable below. It should start with `/tmp/airbyte_local`\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "180c8b74",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import AirbyteJSONLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4af10665",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"_airbyte_raw_pokemon.jsonl\r\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!ls /tmp/airbyte_local/json_data/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "721d9316",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = AirbyteJSONLoader('/tmp/airbyte_local/json_data/_airbyte_raw_pokemon.jsonl')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "9858b946",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "fca024cb",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"abilities: \n",
|
||||
"ability: \n",
|
||||
"name: blaze\n",
|
||||
"url: https://pokeapi.co/api/v2/ability/66/\n",
|
||||
"\n",
|
||||
"is_hidden: False\n",
|
||||
"slot: 1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"ability: \n",
|
||||
"name: solar-power\n",
|
||||
"url: https://pokeapi.co/api/v2/ability/94/\n",
|
||||
"\n",
|
||||
"is_hidden: True\n",
|
||||
"slot: 3\n",
|
||||
"\n",
|
||||
"base_experience: 267\n",
|
||||
"forms: \n",
|
||||
"name: charizard\n",
|
||||
"url: https://pokeapi.co/api/v2/pokemon-form/6/\n",
|
||||
"\n",
|
||||
"game_indices: \n",
|
||||
"game_index: 180\n",
|
||||
"version: \n",
|
||||
"name: red\n",
|
||||
"url: https://pokeapi.co/api/v2/version/1/\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"game_index: 180\n",
|
||||
"version: \n",
|
||||
"name: blue\n",
|
||||
"url: https://pokeapi.co/api/v2/version/2/\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"game_index: 180\n",
|
||||
"version: \n",
|
||||
"n\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(data[0].page_content[:500])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9fa002a5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -49,6 +49,8 @@ There are a lot of different document loaders that LangChain supports. Below are
|
||||
|
||||
`Gutenberg <./examples/gutenberg.html>`_: A walkthrough of how to load data from a Gutenberg ebook text.
|
||||
|
||||
`Airbyte Json <./examples/airbyte_json.html>`_: A walkthrough of how to load data from a local Airbyte JSON file.
|
||||
|
||||
`Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF.
|
||||
|
||||
.. toctree::
|
||||
|
@ -1,5 +1,6 @@
|
||||
"""All different types of document loaders."""
|
||||
|
||||
from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
|
||||
from langchain.document_loaders.azlyrics import AZLyricsLoader
|
||||
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
||||
from langchain.document_loaders.directory import DirectoryLoader
|
||||
@ -53,5 +54,6 @@ __all__ = [
|
||||
"GutenbergLoader",
|
||||
"PagedPDFSplitter",
|
||||
"EveryNoteLoader",
|
||||
"AirbyteJSONLoader",
|
||||
"OnlinePDFLoader",
|
||||
]
|
||||
|
41
langchain/document_loaders/airbyte_json.py
Normal file
41
langchain/document_loaders/airbyte_json.py
Normal file
@ -0,0 +1,41 @@
|
||||
"""Loader that loads local airbyte json files."""
|
||||
import json
|
||||
from typing import Any, List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
def _stringify_value(val: Any) -> str:
|
||||
if isinstance(val, str):
|
||||
return val
|
||||
elif isinstance(val, dict):
|
||||
return "\n" + _stringify_dict(val)
|
||||
elif isinstance(val, list):
|
||||
return "\n".join(_stringify_value(v) for v in val)
|
||||
else:
|
||||
return str(val)
|
||||
|
||||
|
||||
def _stringify_dict(data: dict) -> str:
|
||||
text = ""
|
||||
for key, value in data.items():
|
||||
text += key + ": " + _stringify_value(data[key]) + "\n"
|
||||
return text
|
||||
|
||||
|
||||
class AirbyteJSONLoader(BaseLoader):
|
||||
"""Loader that loads local airbyte json files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path. This should start with '/tmp/airbyte_local/'."""
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
text = ""
|
||||
for line in open(self.file_path, "r"):
|
||||
data = json.loads(line)["_airbyte_data"]
|
||||
text += _stringify_dict(data)
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
Loading…
Reference in New Issue
Block a user