diff --git a/docs/modules/document_loaders/examples/hn.ipynb b/docs/modules/document_loaders/examples/hn.ipynb new file mode 100644 index 00000000000..5765b38c2e8 --- /dev/null +++ b/docs/modules/document_loaders/examples/hn.ipynb @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4babfba5", + "metadata": {}, + "source": [ + "# Hacker News\n", + "How to pull page data and comments from Hacker News" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ff49b177", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import HNLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "849a8d52", + "metadata": {}, + "outputs": [], + "source": [ + "loader = HNLoader(\"https://news.ycombinator.com/item?id=34817881\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c2826836", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fefa2adc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content=\"delta_p_delta_x 18 hours ago \\n | next [–] \\n\\nAstrophysical and cosmological simulations are often insightful. 
They're also very cross-disciplinary; besides the obvious astrophysics, there's networking and sysadmin, parallel computing and algorithm theory (so that the simulation programs are actually fast but still accurate), systems design, and even a bit of graphic design for the visualisations.Some of my favourite simulation projects:- IllustrisTNG: https://www.tng-project.org/- SWIFT: https://swift.dur.ac.uk/- CO5BOLD: https://www.astro.uu.se/~bf/co5bold_main.html (which produced these animations of a red-giant star: https://www.astro.uu.se/~bf/movie/AGBmovie.html)- AbacusSummit: https://abacussummit.readthedocs.io/en/latest/And I can add the simulations in the article, too.\\n \\nreply\", lookup_str='', metadata={'source': 'https://news.ycombinator.com/item?id=34817881', 'title': 'What Lights the Universe’s Standard Candles?'}, lookup_index=0),\n", + " Document(page_content=\"andrewflnr 19 hours ago \\n | prev | next [–] \\n\\nWhoa. I didn't know the accretion theory of Ia supernovae was dead, much less that it had been since 2011.\\n \\nreply\", lookup_str='', metadata={'source': 'https://news.ycombinator.com/item?id=34817881', 'title': 'What Lights the Universe’s Standard Candles?'}, lookup_index=0),\n", + " Document(page_content='andreareina 18 hours ago \\n | prev | next [–] \\n\\nThis seems to be the paper https://academic.oup.com/mnras/article/517/4/5260/6779709\\n \\nreply', lookup_str='', metadata={'source': 'https://news.ycombinator.com/item?id=34817881', 'title': 'What Lights the Universe’s Standard Candles?'}, lookup_index=0),\n", + " Document(page_content=\"andreareina 18 hours ago \\n | prev [–] \\n\\nWouldn't double detonation show up as variance in the brightness?\\n \\nreply\", lookup_str='', metadata={'source': 'https://news.ycombinator.com/item?id=34817881', 'title': 'What Lights the Universe’s Standard Candles?'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "938ff4ee", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + }, + "vscode": { + "interpreter": { + "hash": "c05c795047059754c96cf5f30fd1289e4658e92c92d00704a3cddb24e146e3ef" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/how_to_guides.rst b/docs/modules/document_loaders/how_to_guides.rst index bf14941567e..b4a2021b9e6 100644 --- a/docs/modules/document_loaders/how_to_guides.rst +++ b/docs/modules/document_loaders/how_to_guides.rst @@ -31,6 +31,8 @@ There are a lot of different document loaders that LangChain supports. Below are `YouTube <./examples/youtube.html>`_: A walkthrough of how to load the transcript from a YouTube video. +`Hacker News <./examples/hn.html>`_: A walkthrough of how to load a Hacker News page. + `s3 File <./examples/s3_file.html>`_: A walkthrough of how to load a file from s3. `s3 Directory <./examples/s3_directory.html>`_: A walkthrough of how to load all files in a directory from s3. 
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index fd2eac6229e..b64a5ee3c21 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -11,6 +11,7 @@ from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
 from langchain.document_loaders.gcs_file import GCSFileLoader
 from langchain.document_loaders.googledrive import GoogleDriveLoader
 from langchain.document_loaders.gutenberg import GutenbergLoader
+from langchain.document_loaders.hn import HNLoader
 from langchain.document_loaders.html import UnstructuredHTMLLoader
 from langchain.document_loaders.imsdb import IMSDbLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
@@ -47,6 +48,7 @@ __all__ = [
     "YoutubeLoader",
     "S3FileLoader",
     "TextLoader",
+    "HNLoader",
     "S3DirectoryLoader",
     "GCSFileLoader",
     "GCSDirectoryLoader",
diff --git a/langchain/document_loaders/hn.py b/langchain/document_loaders/hn.py
new file mode 100644
index 00000000000..b7546ecf38f
--- /dev/null
+++ b/langchain/document_loaders/hn.py
@@ -0,0 +1,58 @@
+"""Loader that loads HN."""
+from typing import Any, List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.web_base import WebBaseLoader
+
+
+class HNLoader(WebBaseLoader):
+    """Load Hacker News data from either main page results or the comments page."""
+
+    def load(self) -> List[Document]:
+        """Get important HN webpage information.
+
+        Components are:
+            - title
+            - content
+            - source url
+            - time of post
+            - author of the post
+            - number of comments
+            - rank of the post
+        """
+        soup_info = self.scrape()
+        # Comment pages live under /item?id=...; everything else is a listing.
+        if "item" in self.web_path:
+            return self.load_comments(soup_info)
+        else:
+            return self.load_results(soup_info)
+
+    def load_comments(self, soup_info: Any) -> List[Document]:
+        """Load comments from a HN post."""
+        comments = soup_info.select("tr[class='athing comtr']")
+        title = soup_info.select_one("tr[id='pagespace']").get("title")
+        documents = []
+        for comment in comments:
+            text = comment.text.strip()
+            metadata = {"source": self.web_path, "title": title}
+            documents.append(Document(page_content=text, metadata=metadata))
+        return documents
+
+    def load_results(self, soup: Any) -> List[Document]:
+        """Load items from an HN page."""
+        items = soup.select("tr[class='athing']")
+        documents = []
+        for item in items:
+            ranking = item.select_one("span[class='rank']").text
+            link = item.find("span", {"class": "titleline"}).find("a").get("href")
+            title = item.find("span", {"class": "titleline"}).text.strip()
+            metadata = {
+                "source": self.web_path,
+                "title": title,
+                "link": link,
+                "ranking": ranking,
+            }
+            # link/ranking already travel in metadata; Document only accepts
+            # page_content and metadata, so do not pass unknown keyword args.
+            documents.append(Document(page_content=title, metadata=metadata))
+        return documents
diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py
index 2514b581a8e..ffd9b536fe6 100644
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@@ -1,8 +1,7 @@
 """Web base loader class."""
-from typing import List
+from typing import Any, List
 
 import requests
-from bs4 import BeautifulSoup
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@@ -15,8 +14,10 @@ class WebBaseLoader(BaseLoader):
         """Initialize with webpage path."""
         self.web_path = web_path
 
-    def scrape(self) -> BeautifulSoup:
+    def scrape(self) -> Any:
         """Scrape data from webpage and return it in BeautifulSoup format."""
+        from bs4 import BeautifulSoup
+
         html_doc = requests.get(self.web_path)
         soup = BeautifulSoup(html_doc.text, "html.parser")
         return soup