mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-09 04:50:37 +00:00
Harrison/airbyte (#989)
Co-authored-by: zanderchase <zanderchase@gmail.com> Co-authored-by: Harrison Chase <harrisonchase@Harrisons-MacBook-Pro.local>
This commit is contained in:
parent
e9799d6821
commit
2e96704d59
171
docs/modules/document_loaders/examples/airbyte_json.ipynb
Normal file
171
docs/modules/document_loaders/examples/airbyte_json.ipynb
Normal file
@ -0,0 +1,171 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1f3a5ebf",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Airbyte JSON\n",
|
||||||
|
"This covers how to load any source from Airbyte into a local JSON file that can be read in as a document\n",
|
||||||
|
"\n",
|
||||||
|
"Prereqs:\n",
|
||||||
|
"Have docker desktop installed\n",
|
||||||
|
"\n",
|
||||||
|
"Steps:\n",
|
||||||
|
"\n",
|
||||||
|
"1) clone Airbyte from GitHub - `git clone https://github.com/airbytehq/airbyte.git`\n",
|
||||||
|
"\n",
|
||||||
|
"2) switch into Airbyte directory - `cd airbyte`\n",
|
||||||
|
"\n",
|
||||||
|
"3) start Airbyte - `docker compose up`\n",
|
||||||
|
"\n",
|
||||||
|
"4) In your browser, just visit http://localhost:8000. You will be asked for a username and password. By default, that's username `airbyte` and password `password`.\n",
|
||||||
|
"\n",
|
||||||
|
"5) Set up any source you wish\n",
|
||||||
|
"\n",
|
||||||
|
"6) Set destination as Local JSON, with specified destination path - let's say `/json_data`. Set up manual sync.\n",
|
||||||
|
"\n",
|
||||||
|
"7) Run the connection!\n",
|
||||||
|
"\n",
|
||||||
|
"8) To see what files are created, you can navigate to: `file:///tmp/airbyte_local`\n",
|
||||||
|
"\n",
|
||||||
|
"9) Find your data and copy its path. That path should be saved in the file variable below. It should start with `/tmp/airbyte_local`\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "180c8b74",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import AirbyteJSONLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "4af10665",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"_airbyte_raw_pokemon.jsonl\r\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!ls /tmp/airbyte_local/json_data/"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "721d9316",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = AirbyteJSONLoader('/tmp/airbyte_local/json_data/_airbyte_raw_pokemon.jsonl')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "9858b946",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "fca024cb",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"abilities: \n",
|
||||||
|
"ability: \n",
|
||||||
|
"name: blaze\n",
|
||||||
|
"url: https://pokeapi.co/api/v2/ability/66/\n",
|
||||||
|
"\n",
|
||||||
|
"is_hidden: False\n",
|
||||||
|
"slot: 1\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"ability: \n",
|
||||||
|
"name: solar-power\n",
|
||||||
|
"url: https://pokeapi.co/api/v2/ability/94/\n",
|
||||||
|
"\n",
|
||||||
|
"is_hidden: True\n",
|
||||||
|
"slot: 3\n",
|
||||||
|
"\n",
|
||||||
|
"base_experience: 267\n",
|
||||||
|
"forms: \n",
|
||||||
|
"name: charizard\n",
|
||||||
|
"url: https://pokeapi.co/api/v2/pokemon-form/6/\n",
|
||||||
|
"\n",
|
||||||
|
"game_indices: \n",
|
||||||
|
"game_index: 180\n",
|
||||||
|
"version: \n",
|
||||||
|
"name: red\n",
|
||||||
|
"url: https://pokeapi.co/api/v2/version/1/\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"game_index: 180\n",
|
||||||
|
"version: \n",
|
||||||
|
"name: blue\n",
|
||||||
|
"url: https://pokeapi.co/api/v2/version/2/\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"game_index: 180\n",
|
||||||
|
"version: \n",
|
||||||
|
"n\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(data[0].page_content[:500])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "9fa002a5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -49,6 +49,8 @@ There are a lot of different document loaders that LangChain supports. Below are
|
|||||||
|
|
||||||
`Gutenberg <./examples/gutenberg.html>`_: A walkthrough of how to load data from a Gutenberg ebook text.
|
`Gutenberg <./examples/gutenberg.html>`_: A walkthrough of how to load data from a Gutenberg ebook text.
|
||||||
|
|
||||||
|
`Airbyte JSON <./examples/airbyte_json.html>`_: A walkthrough of how to load data from a local Airbyte JSON file.
|
||||||
|
|
||||||
`Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF.
|
`Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
"""All different types of document loaders."""
|
"""All different types of document loaders."""
|
||||||
|
|
||||||
|
from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
|
||||||
from langchain.document_loaders.azlyrics import AZLyricsLoader
|
from langchain.document_loaders.azlyrics import AZLyricsLoader
|
||||||
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
||||||
from langchain.document_loaders.directory import DirectoryLoader
|
from langchain.document_loaders.directory import DirectoryLoader
|
||||||
@ -53,5 +54,6 @@ __all__ = [
|
|||||||
"GutenbergLoader",
|
"GutenbergLoader",
|
||||||
"PagedPDFSplitter",
|
"PagedPDFSplitter",
|
||||||
"EveryNoteLoader",
|
"EveryNoteLoader",
|
||||||
|
"AirbyteJSONLoader",
|
||||||
"OnlinePDFLoader",
|
"OnlinePDFLoader",
|
||||||
]
|
]
|
||||||
|
41
langchain/document_loaders/airbyte_json.py
Normal file
41
langchain/document_loaders/airbyte_json.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
"""Loader that loads local airbyte json files."""
|
||||||
|
import json
|
||||||
|
from typing import Any, List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
def _stringify_value(val: Any) -> str:
    """Render a JSON-decoded value as plain text.

    Args:
        val: Any value; strings, dicts and lists get dedicated handling.

    Returns:
        The string itself for a ``str``; a newline followed by the
        ``_stringify_dict`` rendering for a ``dict``; the newline-joined
        renderings of the items for a ``list``; ``str(val)`` otherwise.
    """
    # Containers first: they recurse, everything else is a leaf.
    if isinstance(val, dict):
        return "\n" + _stringify_dict(val)
    if isinstance(val, list):
        rendered_items = [_stringify_value(item) for item in val]
        return "\n".join(rendered_items)
    if isinstance(val, str):
        return val
    return str(val)
|
||||||
|
|
||||||
|
|
||||||
|
def _stringify_dict(data: dict) -> str:
    """Render a dict as ``key: value`` lines.

    Each entry becomes ``key + ": " + _stringify_value(value) + "\\n"``, in
    the dict's iteration order, and the pieces are concatenated.

    Args:
        data: Mapping to render. Keys are concatenated with ``": "``, so
            they must be strings.

    Returns:
        The concatenated lines, or an empty string for an empty dict.
    """
    # Use the unpacked value directly instead of re-indexing the dict, and
    # assemble the result with a single join rather than repeated ``+=``.
    return "".join(
        key + ": " + _stringify_value(value) + "\n" for key, value in data.items()
    )
|
||||||
|
|
||||||
|
|
||||||
|
class AirbyteJSONLoader(BaseLoader):
    """Loader that loads local airbyte json files.

    The file is read line by line; each non-blank line is parsed as a JSON
    object and the value under its ``"_airbyte_data"`` key is rendered to
    text. All records are concatenated into a single ``Document``.
    """

    def __init__(self, file_path: str):
        """Initialize with file path. This should start with '/tmp/airbyte_local/'."""
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load file.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content``
            is the rendered text of every record and whose metadata is
            ``{"source": self.file_path}``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record has no ``"_airbyte_data"`` key.
        """
        pieces = []
        # Context manager guarantees the handle is closed even if parsing
        # fails; explicit UTF-8 avoids depending on the platform locale.
        with open(self.file_path, "r", encoding="utf-8") as handle:
            for line in handle:
                # Tolerate blank or trailing empty lines instead of failing
                # in json.loads.
                if not line.strip():
                    continue
                record = json.loads(line)
                pieces.append(_stringify_dict(record["_airbyte_data"]))
        metadata = {"source": self.file_path}
        return [Document(page_content="".join(pieces), metadata=metadata)]
|
Loading…
Reference in New Issue
Block a user