RSS Feed / OPML loader (#8694)
- Description: Adds a document loader for a list of RSS feeds or an OPML file. It iterates through the list and uses NewsURLLoader to load each article.
- Issue: N/A
- Dependencies: feedparser, listparser
- Tag maintainer: @rlancemartin, @eyurtsev
- Twitter handle: @ruze

Co-authored-by: Bagatur <baskaryan@gmail.com>
Parent commit: 53e4148a1b
This commit: 8ef7e14a85
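For orientation, a minimal usage sketch of the loader this PR adds (the feed URL and OPML path below are placeholders; the API follows the RSSFeedLoader source and notebook in this diff):

from langchain.document_loaders import RSSFeedLoader

# Load every article from a list of RSS feed URLs (placeholder URL).
loader = RSSFeedLoader(urls=["https://www.engadget.com/rss.xml"])
docs = loader.load()

# Or load the feeds listed in an OPML export (placeholder path).
# Pass either urls or opml, not both.
with open("sample_rss_feeds.opml") as f:
    loader = RSSFeedLoader(opml=f.read())
docs = loader.load()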
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>

<opml version="1.0">
  <head>
    <title>Sample RSS feed subscriptions</title>
  </head>
  <body>
    <outline text="Tech" title="Tech">
      <outline type="rss" text="Engadget" title="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml" htmlUrl="http://www.engadget.com"/>
      <outline type="rss" text="Ars Technica - All content" title="Ars Technica - All content" xmlUrl="http://feeds.arstechnica.com/arstechnica/index/" htmlUrl="https://arstechnica.com"/>
    </outline>
  </body>
</opml>
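As a hedged aside (not part of the diff): the loader consumes an OPML file like the one above via listparser, mirroring the _get_urls property in rss.py further down. A minimal sketch, assuming the file is saved locally as sample_rss_feeds.opml:

import listparser  # optional dependency; only needed for the opml argument

with open("sample_rss_feeds.opml") as f:
    rss = listparser.parse(f.read())

# Each <outline type="rss" xmlUrl="..."> entry becomes an item in rss.feeds;
# the loader collects feed.url for every item.
print([feed.url for feed in rss.feeds])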
docs/extras/integrations/document_loaders/rss.ipynb (Normal file, 170 lines added)
@@ -0,0 +1,170 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2dfc4698",
   "metadata": {},
   "source": [
    "# RSS Feeds\n",
    "\n",
    "This covers how to load HTML news articles from a list of RSS feed URLs into a document format that we can use downstream."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16c3699e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import RSSFeedLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "836fbac1",
   "metadata": {},
   "outputs": [],
   "source": [
    "urls = [\"https://www.engadget.com/rss.xml\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "33089aba-ff74-4d00-8f40-9449c29587cc",
   "metadata": {},
   "source": [
    "Pass in URLs to load them into Documents."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00f46fda",
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = RSSFeedLoader(urls=urls)\n",
    "data = loader.load()\n",
    "print(len(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "data[0]"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "b447468cc42266d0"
  },
  {
   "cell_type": "markdown",
   "source": [
    "You can pass arguments to the NewsURLLoader, which it uses to load each article."
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "c36d3b0d329faf2a"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "loader = RSSFeedLoader(urls=urls, nlp=True)\n",
    "data = loader.load()\n",
    "print(len(data))"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "5fdada62470d3019"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "data[0].metadata['keywords']"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "11d71963f7735c1d"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "data[0].metadata['summary']"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "9fb64ba0e8780966"
  },
  {
   "cell_type": "markdown",
   "source": [
    "You can also use an OPML file, such as a Feedly export. Pass in either a URL or the OPML contents."
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "98ac26c488315bff"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b6f07ae526a897c",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"example_data/sample_rss_feeds.opml\", \"r\") as f:\n",
    "    loader = RSSFeedLoader(opml=f.read())\n",
    "data = loader.load()\n",
    "print(len(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "data[0]"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "b68a26b3"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -128,6 +128,7 @@ from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.document_loaders.reddit import RedditPostsLoader
from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.rocksetdb import RocksetLoader
from langchain.document_loaders.rss import RSSFeedLoader
from langchain.document_loaders.rst import UnstructuredRSTLoader
from langchain.document_loaders.rtf import UnstructuredRTFLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
@@ -280,6 +281,7 @@ __all__ = [
    "RedditPostsLoader",
    "RoamLoader",
    "RocksetLoader",
    "RSSFeedLoader",
    "S3DirectoryLoader",
    "S3FileLoader",
    "SRTLoader",
libs/langchain/langchain/document_loaders/rss.py (Normal file, 133 lines added)
@@ -0,0 +1,133 @@
"""Loader that loads news articles from RSS feeds."""
import logging
from typing import Any, Iterator, List, Optional, Sequence

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.news import NewsURLLoader

logger = logging.getLogger(__name__)


class RSSFeedLoader(BaseLoader):
    """Loader that uses newspaper to load news articles from RSS feeds.

    Args:
        urls: URLs of RSS feeds to load. Each article in a feed is loaded into its own document.
        opml: OPML file to load feed urls from. Only one of urls or opml should be provided. The value
            can be a URL string, or OPML markup contents as bytes or a string.
        continue_on_failure: If True, continue loading documents even if
            loading fails for a particular URL.
        show_progress_bar: If True, use tqdm to show a loading progress bar. Requires
            tqdm to be installed, ``pip install tqdm``.
        **newsloader_kwargs: Any additional named arguments to pass to
            NewsURLLoader.

    Example:
        .. code-block:: python

            from langchain.document_loaders import RSSFeedLoader

            loader = RSSFeedLoader(
                urls=["<url-1>", "<url-2>"],
            )
            docs = loader.load()

    The loader uses feedparser to parse RSS feeds. The feedparser library is not installed by
    default, so you should install it if using this loader:
    https://pythonhosted.org/feedparser/

    If you use OPML, you should also install listparser:
    https://pythonhosted.org/listparser/

    Finally, newspaper is used to process each article:
    https://newspaper.readthedocs.io/en/latest/
    """  # noqa: E501

    def __init__(
        self,
        urls: Optional[Sequence[str]] = None,
        opml: Optional[str] = None,
        continue_on_failure: bool = True,
        show_progress_bar: bool = False,
        **newsloader_kwargs: Any,
    ) -> None:
        """Initialize with urls or OPML."""
        if (urls is None) == (
            opml is None
        ):  # This is True if both are None or neither is None
            raise ValueError(
                "Provide either the urls or the opml argument, but not both."
            )
        self.urls = urls
        self.opml = opml
        self.continue_on_failure = continue_on_failure
        self.show_progress_bar = show_progress_bar
        self.newsloader_kwargs = newsloader_kwargs

    def load(self) -> List[Document]:
        iter = self.lazy_load()
        if self.show_progress_bar:
            try:
                from tqdm import tqdm
            except ImportError as e:
                raise ImportError(
                    "Package tqdm must be installed if show_progress_bar=True. "
                    "Please install with 'pip install tqdm' or set "
                    "show_progress_bar=False."
                ) from e
            iter = tqdm(iter)
        return list(iter)

    @property
    def _get_urls(self) -> Sequence[str]:
        if self.urls:
            return self.urls
        try:
            import listparser
        except ImportError as e:
            raise ImportError(
                "Package listparser must be installed if the opml arg is used. "
                "Please install with 'pip install listparser' or use the "
                "urls arg instead."
            ) from e
        rss = listparser.parse(self.opml)
        return [feed.url for feed in rss.feeds]

    def lazy_load(self) -> Iterator[Document]:
        try:
            import feedparser  # noqa:F401
        except ImportError:
            raise ImportError(
                "feedparser package not found, please install it with "
                "`pip install feedparser`"
            )

        for url in self._get_urls:
            try:
                feed = feedparser.parse(url)
                if getattr(feed, "bozo", False):
                    raise ValueError(
                        f"Error fetching {url}, exception: {feed.bozo_exception}"
                    )
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching {url}, exception: {e}")
                    continue
                else:
                    raise e
            try:
                for entry in feed.entries:
                    loader = NewsURLLoader(
                        urls=[entry.link],
                        **self.newsloader_kwargs,
                    )
                    article = loader.load()[0]
                    article.metadata["feed"] = url
                    yield article
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error processing entry {entry.link}, exception: {e}")
                    continue
                else:
                    raise e
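A short sketch (not part of the diff) of the behavior defined above: the urls/opml exclusivity check in __init__ and the per-article metadata set in lazy_load. The feed URL is a placeholder:

from langchain.document_loaders import RSSFeedLoader

# __init__ requires exactly one of urls or opml; passing both raises ValueError.
try:
    RSSFeedLoader(urls=["https://example.com/rss.xml"], opml="<opml/>")
except ValueError as err:
    print(err)

# lazy_load() yields one Document per article and stamps metadata["feed"]
# with the feed URL the article came from.
loader = RSSFeedLoader(urls=["https://example.com/rss.xml"])
for doc in loader.lazy_load():
    print(doc.metadata.get("feed"), doc.metadata.get("title"))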
libs/langchain/poetry.lock (generated, 125 lines changed)
@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.

[[package]]
name = "absl-py"
@@ -2158,6 +2158,18 @@ ssh = ["bcrypt (>=3.1.5)"]
test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
test-randomorder = ["pytest-randomly"]

[[package]]
name = "cssselect"
version = "1.2.0"
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
category = "main"
optional = true
python-versions = ">=3.7"
files = [
    {file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"},
    {file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"},
]

[[package]]
name = "cycler"
version = "0.11.0"
@@ -2838,6 +2850,22 @@ files = [
[package.extras]
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]

[[package]]
name = "feedfinder2"
version = "0.0.4"
description = "Find the feed URLs for a website."
category = "main"
optional = true
python-versions = "*"
files = [
    {file = "feedfinder2-0.0.4.tar.gz", hash = "sha256:3701ee01a6c85f8b865a049c30ba0b4608858c803fe8e30d1d289fdbe89d0efe"},
]

[package.dependencies]
beautifulsoup4 = "*"
requests = "*"
six = "*"

[[package]]
name = "feedparser"
version = "6.0.10"
@@ -4378,6 +4406,17 @@ docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alab
qa = ["flake8 (==3.8.3)", "mypy (==0.782)"]
testing = ["Django (<3.1)", "attrs", "colorama", "docopt", "pytest (<7.0.0)"]

[[package]]
name = "jieba3k"
version = "0.35.1"
description = "Chinese Words Segementation Utilities"
category = "main"
optional = true
python-versions = "*"
files = [
    {file = "jieba3k-0.35.1.zip", hash = "sha256:980a4f2636b778d312518066be90c7697d410dd5a472385f5afced71a2db1c10"},
]

[[package]]
name = "jina"
version = "3.14.1"
@@ -6472,6 +6511,33 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.2)", "pydata-sphinx-
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"]
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]

[[package]]
name = "newspaper3k"
version = "0.2.8"
description = "Simplified python article discovery & extraction."
category = "main"
optional = true
python-versions = "*"
files = [
    {file = "newspaper3k-0.2.8-py3-none-any.whl", hash = "sha256:44a864222633d3081113d1030615991c3dbba87239f6bbf59d91240f71a22e3e"},
    {file = "newspaper3k-0.2.8.tar.gz", hash = "sha256:9f1bd3e1fb48f400c715abf875cc7b0a67b7ddcd87f50c9aeeb8fcbbbd9004fb"},
]

[package.dependencies]
beautifulsoup4 = ">=4.4.1"
cssselect = ">=0.9.2"
feedfinder2 = ">=0.0.4"
feedparser = ">=5.2.1"
jieba3k = ">=0.35.1"
lxml = ">=3.6.0"
nltk = ">=3.2.1"
Pillow = ">=3.3.0"
python-dateutil = ">=2.5.3"
PyYAML = ">=3.11"
requests = ">=2.10.0"
tinysegmenter = "0.3"
tldextract = ">=2.0.1"

[[package]]
name = "nlpcloud"
version = "1.0.42"
@@ -10001,6 +10067,22 @@ urllib3 = ">=1.21.1,<1.27"
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]

[[package]]
name = "requests-file"
version = "1.5.1"
description = "File transport adapter for Requests"
category = "main"
optional = true
python-versions = "*"
files = [
    {file = "requests-file-1.5.1.tar.gz", hash = "sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e"},
    {file = "requests_file-1.5.1-py2.py3-none-any.whl", hash = "sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953"},
]

[package.dependencies]
requests = ">=1.0.0"
six = "*"

[[package]]
name = "requests-oauthlib"
version = "1.3.1"
@@ -11708,6 +11790,35 @@ webencodings = ">=0.4"
doc = ["sphinx", "sphinx_rtd_theme"]
test = ["flake8", "isort", "pytest"]

[[package]]
name = "tinysegmenter"
version = "0.3"
description = "Very compact Japanese tokenizer"
category = "main"
optional = true
python-versions = "*"
files = [
    {file = "tinysegmenter-0.3.tar.gz", hash = "sha256:ed1f6d2e806a4758a73be589754384cbadadc7e1a414c81a166fc9adf2d40c6d"},
]

[[package]]
name = "tldextract"
version = "3.4.4"
description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well."
category = "main"
optional = true
python-versions = ">=3.7"
files = [
    {file = "tldextract-3.4.4-py3-none-any.whl", hash = "sha256:581e7dbefc90e7bb857bb6f768d25c811a3c5f0892ed56a9a2999ddb7b1b70c2"},
    {file = "tldextract-3.4.4.tar.gz", hash = "sha256:5fe3210c577463545191d45ad522d3d5e78d55218ce97215e82004dcae1e1234"},
]

[package.dependencies]
filelock = ">=3.0.8"
idna = "*"
requests = ">=2.1.0"
requests-file = ">=1.4"

[[package]]
name = "tokenizers"
version = "0.13.3"
@@ -11950,7 +12061,7 @@ files = [
]

[package.dependencies]
accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\""}
accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\" or extra == \"torch\""}
filelock = "*"
huggingface-hub = ">=0.14.1,<1.0"
numpy = ">=1.17"
@@ -13422,15 +13533,15 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]

[extras]
all = ["anthropic", "clarifai", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "marqo", "pymongo", "weaviate-client", "redis", "google-api-python-client", "google-auth", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "libdeeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "langkit", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml", "requests-toolbelt", "neo4j", "openlm", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "momento", "singlestoredb", "tigrisdb", "nebula3-python", "awadb", "esprima", "octoai-sdk", "rdflib", "amadeus", "xinference", "librosa", "python-arango"]
azure = ["azure-identity", "azure-cosmos", "openai", "azure-core", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-search-documents"]
all = ["O365", "aleph-alpha-client", "amadeus", "anthropic", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "libdeeplake", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "octoai-sdk", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "spacy", "steamship", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha", "xinference"]
azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"]
clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "jq", "pdfminer-six", "pgvector", "pypdf", "pymupdf", "pypdfium2", "tqdm", "lxml", "atlassian-python-api", "mwparserfromhell", "mwxml", "pandas", "telethon", "psychicapi", "zep-python", "gql", "requests-toolbelt", "html2text", "py-trello", "scikit-learn", "streamlit", "pyspark", "openai", "sympy", "rapidfuzz", "openai", "rank-bm25", "geopandas", "jinja2", "xinference", "gitpython"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "openai", "openllm", "openlm", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers", "xinference"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
openai = ["openai", "tiktoken"]
qdrant = ["qdrant-client"]
text-helpers = ["chardet"]
@@ -13438,4 +13549,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "84ededcf21a742653863c033dd31e1b24af7562d479c179cd58ba22b2a9805e9"
content-hash = "0708c3b45f59eea36919ff9ff99fa6eddc81bccb654cce183641ef8396ea5290"
@@ -128,6 +128,8 @@ xinference = {version = "^0.0.6", optional = true}
python-arango = {version = "^7.5.9", optional = true}
gitpython = {version = "^3.1.32", optional = true}
librosa = {version="^0.10.0.post2", optional = true }
feedparser = {version = "^6.0.10", optional = true}
newspaper3k = {version = "^0.2.8", optional = true}

[tool.poetry.group.test.dependencies]
# The only dependencies that should be added are
@@ -363,6 +365,8 @@ extended_testing = [
    "jinja2",
    "xinference",
    "gitpython",
    "newspaper3k",
    "feedparser",
]

[tool.ruff]
@@ -0,0 +1,42 @@
from pathlib import Path

from langchain.document_loaders.rss import RSSFeedLoader


def test_rss_loader() -> None:
    loader = RSSFeedLoader(urls=["https://www.engadget.com/rss.xml"])
    docs = loader.load()

    assert docs[0] is not None
    assert hasattr(docs[0], "page_content")
    assert hasattr(docs[0], "metadata")

    metadata = docs[0].metadata
    assert "feed" in metadata
    assert "title" in metadata
    assert "link" in metadata
    assert "authors" in metadata
    assert "language" in metadata
    assert "description" in metadata
    assert "publish_date" in metadata


def test_rss_loader_with_opml() -> None:
    file_path = Path(__file__).parent.parent / "examples"
    with open(file_path.joinpath("sample_rss_feeds.opml"), "r") as f:
        loader = RSSFeedLoader(opml=f.read())

    docs = loader.load()

    assert docs[0] is not None
    assert hasattr(docs[0], "page_content")
    assert hasattr(docs[0], "metadata")

    metadata = docs[0].metadata
    assert "feed" in metadata
    assert "title" in metadata
    assert "link" in metadata
    assert "authors" in metadata
    assert "language" in metadata
    assert "description" in metadata
    assert "publish_date" in metadata
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>

<opml version="1.0">
  <head>
    <title>Sample RSS feed subscriptions</title>
  </head>
  <body>
    <outline text="Tech" title="Tech">
      <outline type="rss" text="Engadget" title="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml" htmlUrl="http://www.engadget.com"/>
      <outline type="rss" text="Ars Technica - All content" title="Ars Technica - All content" xmlUrl="http://feeds.arstechnica.com/arstechnica/index/" htmlUrl="https://arstechnica.com"/>
    </outline>
  </body>
</opml>
libs/langchain/tests/unit_tests/document_loaders/test_rss.py (Normal file, 18 lines added)
@@ -0,0 +1,18 @@
import pytest

from langchain.document_loaders import RSSFeedLoader


@pytest.mark.requires("feedparser", "newspaper")
def test_continue_on_failure_true() -> None:
    """Test exception is not raised when continue_on_failure=True."""
    loader = RSSFeedLoader(["badurl.foobar"])
    loader.load()


@pytest.mark.requires("feedparser", "newspaper")
def test_continue_on_failure_false() -> None:
    """Test exception is raised when continue_on_failure=False."""
    loader = RSSFeedLoader(["badurl.foobar"], continue_on_failure=False)
    with pytest.raises(Exception):
        loader.load()
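For context, a hedged sketch (not part of the diff) of the behavior these tests pin down, using the same dummy bad URL; with the default continue_on_failure=True the failed feed is logged and skipped, while continue_on_failure=False re-raises:

from langchain.document_loaders import RSSFeedLoader

# Default: the unreachable feed is logged via logger.error and skipped,
# so load() is expected to return no documents here.
docs = RSSFeedLoader(["badurl.foobar"]).load()
print(len(docs))

# Strict mode: the same failure propagates as an exception.
try:
    RSSFeedLoader(["badurl.foobar"], continue_on_failure=False).load()
except Exception as exc:
    print(type(exc).__name__, exc)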
poetry.lock (generated, 199 lines changed)
(File diff suppressed because it is too large.)