community[patch]: fix bilibili loader handling of multi-page content (#30283)

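Previously the loader always requested subtitles using the video's
top-level `cid`, so it would only extract subtitles from the first page
of multi-page videos. The loader now reads the `p=<n>` query parameter
from the URL (Bilibili page indices start at 1) and fetches subtitles
for that page's `cid`; URLs without `p=` behave as before.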
This commit is contained in:
homeffjy 2025-03-15 02:53:03 +08:00 committed by GitHub
parent 0b80bec015
commit 2c99f12062
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 19 additions and 4 deletions

View File

@ -11,6 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
# Pre-compile regular expressions for video ID extraction
BV_PATTERN = re.compile(r"BV\w+")
AV_PATTERN = re.compile(r"av[0-9]+")
PAGE_INDEX_PATTERN = re.compile(r"p=(\d+)")
class BiliBiliLoader(BaseLoader):
@ -93,8 +94,17 @@ class BiliBiliLoader(BaseLoader):
if not self.credential:
return "", video_info
cid = 0
page_match = PAGE_INDEX_PATTERN.search(url)
if page_match:
cid = video_info["pages"][int(page_match.group(1)) - 1][
"cid"
] # Bilibili page index starts from 1
else:
cid = video_info["cid"]
# Fetching and processing subtitles
sub = sync(v.get_subtitle(video_info["cid"]))
sub = sync(v.get_subtitle(cid))
sub_list = sub.get("subtitles", [])
if sub_list:
sub_url = sub_list[0].get("subtitle_url", "")

View File

@ -7,9 +7,12 @@ def test_bilibili_loader() -> None:
[
"https://www.bilibili.com/video/BV1xt411o7Xu/",
"https://www.bilibili.com/video/av330407025/",
"https://www.bilibili.com/video/BV16b4y1R7wP/?p=5",
]
)
docs = loader.load()
assert len(docs) == 2
assert len(docs) == 3
assert docs[0].metadata["aid"] == 34218168
assert docs[1].metadata["videos"] == 1
assert docs[2].metadata["pages"][5 - 1]["cid"] == 300059803
assert docs[2].metadata["cid"] == 300048569

View File

@ -1,4 +1,5 @@
version = 1
revision = 1
requires-python = ">=3.9, <4.0"
resolution-markers = [
"python_full_version >= '3.12.4' and platform_python_implementation == 'PyPy'",
@ -1530,6 +1531,7 @@ requires-dist = [
{ name = "requests", specifier = ">=2,<3" },
{ name = "sqlalchemy", specifier = ">=1.4,<3" },
]
provides-extras = ["community", "anthropic", "openai", "cohere", "google-vertexai", "google-genai", "fireworks", "ollama", "together", "mistralai", "huggingface", "groq", "aws", "deepseek", "xai"]
[package.metadata.requires-dev]
codespell = [{ name = "codespell", specifier = ">=2.2.0,<3.0.0" }]
@ -1744,7 +1746,7 @@ typing = [
[[package]]
name = "langchain-core"
version = "0.3.41"
version = "0.3.45"
source = { editable = "../core" }
dependencies = [
{ name = "jsonpatch" },
@ -1802,7 +1804,7 @@ typing = [
[[package]]
name = "langchain-tests"
version = "0.3.13"
version = "0.3.14"
source = { editable = "../standard-tests" }
dependencies = [
{ name = "httpx" },