Harrison/hn loader (#1130)

Co-authored-by: William X <william.y.xuan@gmail.com>
This commit is contained in:
Harrison Chase 2023-02-17 15:15:02 -08:00 committed by GitHub
parent 47c3221fda
commit d5f3dfa1e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 168 additions and 3 deletions

View File

@ -0,0 +1,101 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "4babfba5",
"metadata": {},
"source": [
"# Hacker News\n",
"How to pull page data and comments from Hacker News"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ff49b177",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import HNLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "849a8d52",
"metadata": {},
"outputs": [],
"source": [
"loader = HNLoader(\"https://news.ycombinator.com/item?id=34817881\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c2826836",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fefa2adc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content=\"delta_p_delta_x 18 hours ago \\n | next [] \\n\\nAstrophysical and cosmological simulations are often insightful. They're also very cross-disciplinary; besides the obvious astrophysics, there's networking and sysadmin, parallel computing and algorithm theory (so that the simulation programs are actually fast but still accurate), systems design, and even a bit of graphic design for the visualisations.Some of my favourite simulation projects:- IllustrisTNG: https://www.tng-project.org/- SWIFT: https://swift.dur.ac.uk/- CO5BOLD: https://www.astro.uu.se/~bf/co5bold_main.html (which produced these animations of a red-giant star: https://www.astro.uu.se/~bf/movie/AGBmovie.html)- AbacusSummit: https://abacussummit.readthedocs.io/en/latest/And I can add the simulations in the article, too.\\n \\nreply\", lookup_str='', metadata={'source': 'https://news.ycombinator.com/item?id=34817881', 'title': 'What Lights the Universes Standard Candles?'}, lookup_index=0),\n",
" Document(page_content=\"andrewflnr 19 hours ago \\n | prev | next [] \\n\\nWhoa. I didn't know the accretion theory of Ia supernovae was dead, much less that it had been since 2011.\\n \\nreply\", lookup_str='', metadata={'source': 'https://news.ycombinator.com/item?id=34817881', 'title': 'What Lights the Universes Standard Candles?'}, lookup_index=0),\n",
" Document(page_content='andreareina 18 hours ago \\n | prev | next [] \\n\\nThis seems to be the paper https://academic.oup.com/mnras/article/517/4/5260/6779709\\n \\nreply', lookup_str='', metadata={'source': 'https://news.ycombinator.com/item?id=34817881', 'title': 'What Lights the Universes Standard Candles?'}, lookup_index=0),\n",
" Document(page_content=\"andreareina 18 hours ago \\n | prev [] \\n\\nWouldn't double detonation show up as variance in the brightness?\\n \\nreply\", lookup_str='', metadata={'source': 'https://news.ycombinator.com/item?id=34817881', 'title': 'What Lights the Universes Standard Candles?'}, lookup_index=0)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "938ff4ee",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
},
"vscode": {
"interpreter": {
"hash": "c05c795047059754c96cf5f30fd1289e4658e92c92d00704a3cddb24e146e3ef"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -31,6 +31,8 @@ There are a lot of different document loaders that LangChain supports. Below are
`YouTube <./examples/youtube.html>`_: A walkthrough of how to load the transcript from a YouTube video. `YouTube <./examples/youtube.html>`_: A walkthrough of how to load the transcript from a YouTube video.
`Hacker News <./examples/hn.html>`_: A walkthrough of how to load a Hacker News page.
`s3 File <./examples/s3_file.html>`_: A walkthrough of how to load a file from s3. `s3 File <./examples/s3_file.html>`_: A walkthrough of how to load a file from s3.
`s3 Directory <./examples/s3_directory.html>`_: A walkthrough of how to load all files in a directory from s3. `s3 Directory <./examples/s3_directory.html>`_: A walkthrough of how to load all files in a directory from s3.

View File

@ -11,6 +11,7 @@ from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
from langchain.document_loaders.gcs_file import GCSFileLoader from langchain.document_loaders.gcs_file import GCSFileLoader
from langchain.document_loaders.googledrive import GoogleDriveLoader from langchain.document_loaders.googledrive import GoogleDriveLoader
from langchain.document_loaders.gutenberg import GutenbergLoader from langchain.document_loaders.gutenberg import GutenbergLoader
from langchain.document_loaders.hn import HNLoader
from langchain.document_loaders.html import UnstructuredHTMLLoader from langchain.document_loaders.html import UnstructuredHTMLLoader
from langchain.document_loaders.imsdb import IMSDbLoader from langchain.document_loaders.imsdb import IMSDbLoader
from langchain.document_loaders.notion import NotionDirectoryLoader from langchain.document_loaders.notion import NotionDirectoryLoader
@ -47,6 +48,7 @@ __all__ = [
"YoutubeLoader", "YoutubeLoader",
"S3FileLoader", "S3FileLoader",
"TextLoader", "TextLoader",
"HNLoader",
"S3DirectoryLoader", "S3DirectoryLoader",
"GCSFileLoader", "GCSFileLoader",
"GCSDirectoryLoader", "GCSDirectoryLoader",

View File

@ -0,0 +1,59 @@
"""Loader that loads HN."""
from typing import Any, List
from langchain.docstore.document import Document
from langchain.document_loaders.web_base import WebBaseLoader
class HNLoader(WebBaseLoader):
"""Load Hacker News data from either main page results or the comments page."""
def load(self) -> List[Document]:
"""Get important HN webpage information.
Components are:
- title
- content
- source url,
- time of post
- author of the post
- number of comments
- rank of the post
"""
soup_info = self.scrape()
if "item" in self.web_path:
return self.load_comments(soup_info)
else:
return self.load_results(soup_info)
def load_comments(self, soup_info: Any) -> List[Document]:
"""Load comments from a HN post."""
comments = soup_info.select("tr[class='athing comtr']")
title = soup_info.select_one("tr[id='pagespace']").get("title")
documents = []
for comment in comments:
text = comment.text.strip()
metadata = {"source": self.web_path, "title": title}
documents.append(Document(page_content=text, metadata=metadata))
return documents
def load_results(self, soup: Any) -> List[Document]:
"""Load items from an HN page."""
items = soup.select("tr[class='athing']")
documents = []
for lineItem in items:
ranking = lineItem.select_one("span[class='rank']").text
link = lineItem.find("span", {"class": "titleline"}).find("a").get("href")
title = lineItem.find("span", {"class": "titleline"}).text.strip()
metadata = {
"source": self.web_path,
"title": title,
"link": link,
"ranking": ranking,
}
documents.append(
Document(
page_content=title, link=link, ranking=ranking, metadata=metadata
)
)
return documents

View File

@ -1,8 +1,7 @@
"""Web base loader class.""" """Web base loader class."""
from typing import List from typing import Any, List
import requests import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
@ -15,8 +14,10 @@ class WebBaseLoader(BaseLoader):
"""Initialize with webpage path.""" """Initialize with webpage path."""
self.web_path = web_path self.web_path = web_path
def scrape(self) -> BeautifulSoup: def scrape(self) -> Any:
"""Scrape data from webpage and return it in BeautifulSoup format.""" """Scrape data from webpage and return it in BeautifulSoup format."""
from bs4 import BeautifulSoup
html_doc = requests.get(self.web_path) html_doc = requests.get(self.web_path)
soup = BeautifulSoup(html_doc.text, "html.parser") soup = BeautifulSoup(html_doc.text, "html.parser")
return soup return soup