mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-08 16:48:49 +00:00
adding webpage loading logic (#942)
This commit is contained in:
parent
c71027e725
commit
8e126bc9bd
93
docs/modules/document_loaders/examples/azlyrics.ipynb
Normal file
93
docs/modules/document_loaders/examples/azlyrics.ipynb
Normal file
@ -0,0 +1,93 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9c31caff",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# AZLyrics\n",
|
||||
"This covers how to load AZLyrics webpages into a document format that we can use downstream."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "7e6f5726",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import AZLyricsLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "a0df4c24",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = AZLyricsLoader(\"https://www.azlyrics.com/lyrics/mileycyrus/flowers.html\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8cd61b6e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "162fd286",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content=\"Miley Cyrus - Flowers Lyrics | AZLyrics.com\\n\\r\\nWe were good, we were gold\\nKinda dream that can't be sold\\nWe were right till we weren't\\nBuilt a home and watched it burn\\n\\nI didn't wanna leave you\\nI didn't wanna lie\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than you can\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\n\\nPaint my nails, cherry red\\nMatch the roses that you left\\nNo remorse, no regret\\nI forgive every word you said\\n\\nI didn't wanna leave you, baby\\nI didn't wanna fight\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours, yeah\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than you can\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI\\n\\nI didn't wanna wanna leave you\\nI didn't wanna fight\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours (Yeah)\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than\\nYeah, I can love me better than you can, uh\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby (Than you can)\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI\\n\", lookup_str='', metadata={'source': 'https://www.azlyrics.com/lyrics/mileycyrus/flowers.html'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6358000c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because one or more lines are too long
94
docs/modules/document_loaders/examples/imsdb.ipynb
Normal file
94
docs/modules/document_loaders/examples/imsdb.ipynb
Normal file
File diff suppressed because one or more lines are too long
117
docs/modules/document_loaders/examples/web_base.ipynb
Normal file
117
docs/modules/document_loaders/examples/web_base.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -37,6 +37,14 @@ There are a lot of different document loaders that LangChain supports. Below are
|
||||
|
||||
`GCS Directory <./examples/gcs_directory.html>`_: A walkthrough of how to load all files in a directory from Google Cloud Storage (GCS).
|
||||
|
||||
`Web Base <./examples/web_base.html>`_: A walkthrough of how to load all text data from webpages.
|
||||
|
||||
`IMSDb <./examples/imsdb.html>`_: A walkthrough of how to load all text data from IMSDb webpage.
|
||||
|
||||
`AZLyrics <./examples/azlyrics.html>`_: A walkthrough of how to load all text data from AZLyrics webpage.
|
||||
|
||||
`College Confidential <./examples/college_confidential.html>`_: A walkthrough of how to load all text data from College Confidential webpage.
|
||||
|
||||
`Gutenberg <./examples/gutenberg.html>`_: A walkthrough of how to load data from a Gutenberg ebook text.
|
||||
|
||||
.. toctree::
|
||||
|
@ -1,5 +1,7 @@
|
||||
"""All different types of document loaders."""
|
||||
|
||||
from langchain.document_loaders.azlyrics import AZLyricsLoader
|
||||
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
||||
from langchain.document_loaders.directory import DirectoryLoader
|
||||
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||
@ -8,6 +10,7 @@ from langchain.document_loaders.gcs_file import GCSFileLoader
|
||||
from langchain.document_loaders.googledrive import GoogleDriveLoader
|
||||
from langchain.document_loaders.gutenberg import GutenbergLoader
|
||||
from langchain.document_loaders.html import UnstructuredHTMLLoader
|
||||
from langchain.document_loaders.imsdb import IMSDbLoader
|
||||
from langchain.document_loaders.notion import NotionDirectoryLoader
|
||||
from langchain.document_loaders.obsidian import ObsidianLoader
|
||||
from langchain.document_loaders.pdf import UnstructuredPDFLoader
|
||||
@ -17,6 +20,7 @@ from langchain.document_loaders.roam import RoamLoader
|
||||
from langchain.document_loaders.s3_directory import S3DirectoryLoader
|
||||
from langchain.document_loaders.s3_file import S3FileLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
from langchain.document_loaders.youtube import YoutubeLoader
|
||||
|
||||
__all__ = [
|
||||
@ -37,5 +41,9 @@ __all__ = [
|
||||
"S3DirectoryLoader",
|
||||
"GCSFileLoader",
|
||||
"GCSDirectoryLoader",
|
||||
"WebBaseLoader",
|
||||
"IMSDbLoader",
|
||||
"AZLyricsLoader",
|
||||
"CollegeConfidentialLoader",
|
||||
"GutenbergLoader",
|
||||
]
|
||||
|
22
langchain/document_loaders/azlyrics.py
Normal file
22
langchain/document_loaders/azlyrics.py
Normal file
@ -0,0 +1,22 @@
|
||||
"""Loader that loads AZLyrics."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
|
||||
class AZLyricsLoader(WebBaseLoader):
|
||||
"""Loader that loads AZLyrics webpages."""
|
||||
|
||||
def __init__(self, web_path: str):
|
||||
"""Initialize with webpage path."""
|
||||
self.web_path = web_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load webpage."""
|
||||
soup = self.scrape()
|
||||
title = soup.title.text
|
||||
lyrics = soup.find_all("div", {"class": ""})[2].text
|
||||
text = title + lyrics
|
||||
metadata = {"source": self.web_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
20
langchain/document_loaders/college_confidential.py
Normal file
20
langchain/document_loaders/college_confidential.py
Normal file
@ -0,0 +1,20 @@
|
||||
"""Loader that loads College Confidential."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
|
||||
class CollegeConfidentialLoader(WebBaseLoader):
|
||||
"""Loader that loads College Confidential webpages."""
|
||||
|
||||
def __init__(self, web_path: str):
|
||||
"""Initialize with webpage path."""
|
||||
self.web_path = web_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load webpage."""
|
||||
soup = self.scrape()
|
||||
text = soup.select_one("main[class='skin-handler']").text
|
||||
metadata = {"source": self.web_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
20
langchain/document_loaders/imsdb.py
Normal file
20
langchain/document_loaders/imsdb.py
Normal file
@ -0,0 +1,20 @@
|
||||
"""Loader that loads IMSDb."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
|
||||
class IMSDbLoader(WebBaseLoader):
|
||||
"""Loader that loads IMSDb webpages."""
|
||||
|
||||
def __init__(self, web_path: str):
|
||||
"""Initialize with webpage path."""
|
||||
self.web_path = web_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load webpage."""
|
||||
soup = self.scrape()
|
||||
text = soup.select_one("td[class='scrtext']").text
|
||||
metadata = {"source": self.web_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
29
langchain/document_loaders/web_base.py
Normal file
29
langchain/document_loaders/web_base.py
Normal file
@ -0,0 +1,29 @@
|
||||
"""Web base loader class."""
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class WebBaseLoader(BaseLoader):
|
||||
"""Loader that uses urllib and beautiful soup to load webpages."""
|
||||
|
||||
def __init__(self, web_path: str):
|
||||
"""Initialize with webpage path."""
|
||||
self.web_path = web_path
|
||||
|
||||
def scrape(self) -> BeautifulSoup:
|
||||
"""Scrape data from webpage and return it in BeautifulSoup format."""
|
||||
html_doc = requests.get(self.web_path)
|
||||
soup = BeautifulSoup(html_doc.text, "html.parser")
|
||||
return soup
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load data into document objects."""
|
||||
soup = self.scrape()
|
||||
text = soup.get_text()
|
||||
metadata = {"source": self.web_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
Loading…
Reference in New Issue
Block a user