mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-29 09:58:44 +00:00
community[patch]: fix bilibili loader handling of multi-page content (#30283)
Previously, the loader extracted subtitles only from the first page of a multi-page video, even when the URL selected a different page via the `p=` query parameter.
This commit is contained in:
parent
0b80bec015
commit
2c99f12062
@ -11,6 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
|
||||
# Pre-compile regular expressions for video ID and page index extraction
|
||||
BV_PATTERN = re.compile(r"BV\w+")
|
||||
AV_PATTERN = re.compile(r"av[0-9]+")
|
||||
PAGE_INDEX_PATTERN = re.compile(r"p=(\d+)")
|
||||
|
||||
|
||||
class BiliBiliLoader(BaseLoader):
|
||||
@ -93,8 +94,17 @@ class BiliBiliLoader(BaseLoader):
|
||||
if not self.credential:
|
||||
return "", video_info
|
||||
|
||||
cid = 0
|
||||
page_match = PAGE_INDEX_PATTERN.search(url)
|
||||
if page_match:
|
||||
cid = video_info["pages"][int(page_match.group(1)) - 1][
|
||||
"cid"
|
||||
] # Bilibili page index starts from 1
|
||||
else:
|
||||
cid = video_info["cid"]
|
||||
|
||||
# Fetching and processing subtitles
|
||||
sub = sync(v.get_subtitle(video_info["cid"]))
|
||||
sub = sync(v.get_subtitle(cid))
|
||||
sub_list = sub.get("subtitles", [])
|
||||
if sub_list:
|
||||
sub_url = sub_list[0].get("subtitle_url", "")
|
||||
|
@ -7,9 +7,12 @@ def test_bilibili_loader() -> None:
|
||||
[
|
||||
"https://www.bilibili.com/video/BV1xt411o7Xu/",
|
||||
"https://www.bilibili.com/video/av330407025/",
|
||||
"https://www.bilibili.com/video/BV16b4y1R7wP/?p=5",
|
||||
]
|
||||
)
|
||||
docs = loader.load()
|
||||
assert len(docs) == 2
|
||||
assert len(docs) == 3
|
||||
assert docs[0].metadata["aid"] == 34218168
|
||||
assert docs[1].metadata["videos"] == 1
|
||||
assert docs[2].metadata["pages"][5 - 1]["cid"] == 300059803
|
||||
assert docs[2].metadata["cid"] == 300048569
|
||||
|
@ -1,4 +1,5 @@
|
||||
version = 1
|
||||
revision = 1
|
||||
requires-python = ">=3.9, <4.0"
|
||||
resolution-markers = [
|
||||
"python_full_version >= '3.12.4' and platform_python_implementation == 'PyPy'",
|
||||
@ -1530,6 +1531,7 @@ requires-dist = [
|
||||
{ name = "requests", specifier = ">=2,<3" },
|
||||
{ name = "sqlalchemy", specifier = ">=1.4,<3" },
|
||||
]
|
||||
provides-extras = ["community", "anthropic", "openai", "cohere", "google-vertexai", "google-genai", "fireworks", "ollama", "together", "mistralai", "huggingface", "groq", "aws", "deepseek", "xai"]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
codespell = [{ name = "codespell", specifier = ">=2.2.0,<3.0.0" }]
|
||||
@ -1744,7 +1746,7 @@ typing = [
|
||||
|
||||
[[package]]
|
||||
name = "langchain-core"
|
||||
version = "0.3.41"
|
||||
version = "0.3.45"
|
||||
source = { editable = "../core" }
|
||||
dependencies = [
|
||||
{ name = "jsonpatch" },
|
||||
@ -1802,7 +1804,7 @@ typing = [
|
||||
|
||||
[[package]]
|
||||
name = "langchain-tests"
|
||||
version = "0.3.13"
|
||||
version = "0.3.14"
|
||||
source = { editable = "../standard-tests" }
|
||||
dependencies = [
|
||||
{ name = "httpx" },
|
||||
|
Loading…
Reference in New Issue
Block a user