mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-29 09:58:44 +00:00
community[patch]: fix bilibili loader handling of multi-page content (#30283)
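Previously the loader would only extract subtitles from the first page of multi-page videos. The loader now reads the 1-based `p=<n>` page index from the video URL and fetches subtitles for that page's `cid`, falling back to the video's top-level `cid` when no page index is present.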
This commit is contained in:
parent
0b80bec015
commit
2c99f12062
@ -11,6 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
|
|||||||
# Pre-compile regular expressions for video ID extraction
|
# Pre-compile regular expressions for video ID extraction
|
||||||
BV_PATTERN = re.compile(r"BV\w+")
|
BV_PATTERN = re.compile(r"BV\w+")
|
||||||
AV_PATTERN = re.compile(r"av[0-9]+")
|
AV_PATTERN = re.compile(r"av[0-9]+")
|
||||||
|
PAGE_INDEX_PATTERN = re.compile(r"p=(\d+)")
|
||||||
|
|
||||||
|
|
||||||
class BiliBiliLoader(BaseLoader):
|
class BiliBiliLoader(BaseLoader):
|
||||||
@ -93,8 +94,17 @@ class BiliBiliLoader(BaseLoader):
|
|||||||
if not self.credential:
|
if not self.credential:
|
||||||
return "", video_info
|
return "", video_info
|
||||||
|
|
||||||
|
cid = 0
|
||||||
|
page_match = PAGE_INDEX_PATTERN.search(url)
|
||||||
|
if page_match:
|
||||||
|
cid = video_info["pages"][int(page_match.group(1)) - 1][
|
||||||
|
"cid"
|
||||||
|
] # Bilibili page index starts from 1
|
||||||
|
else:
|
||||||
|
cid = video_info["cid"]
|
||||||
|
|
||||||
# Fetching and processing subtitles
|
# Fetching and processing subtitles
|
||||||
sub = sync(v.get_subtitle(video_info["cid"]))
|
sub = sync(v.get_subtitle(cid))
|
||||||
sub_list = sub.get("subtitles", [])
|
sub_list = sub.get("subtitles", [])
|
||||||
if sub_list:
|
if sub_list:
|
||||||
sub_url = sub_list[0].get("subtitle_url", "")
|
sub_url = sub_list[0].get("subtitle_url", "")
|
||||||
|
@ -7,9 +7,12 @@ def test_bilibili_loader() -> None:
|
|||||||
[
|
[
|
||||||
"https://www.bilibili.com/video/BV1xt411o7Xu/",
|
"https://www.bilibili.com/video/BV1xt411o7Xu/",
|
||||||
"https://www.bilibili.com/video/av330407025/",
|
"https://www.bilibili.com/video/av330407025/",
|
||||||
|
"https://www.bilibili.com/video/BV16b4y1R7wP/?p=5",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
assert len(docs) == 2
|
assert len(docs) == 3
|
||||||
assert docs[0].metadata["aid"] == 34218168
|
assert docs[0].metadata["aid"] == 34218168
|
||||||
assert docs[1].metadata["videos"] == 1
|
assert docs[1].metadata["videos"] == 1
|
||||||
|
assert docs[2].metadata["pages"][5 - 1]["cid"] == 300059803
|
||||||
|
assert docs[2].metadata["cid"] == 300048569
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
version = 1
|
version = 1
|
||||||
|
revision = 1
|
||||||
requires-python = ">=3.9, <4.0"
|
requires-python = ">=3.9, <4.0"
|
||||||
resolution-markers = [
|
resolution-markers = [
|
||||||
"python_full_version >= '3.12.4' and platform_python_implementation == 'PyPy'",
|
"python_full_version >= '3.12.4' and platform_python_implementation == 'PyPy'",
|
||||||
@ -1530,6 +1531,7 @@ requires-dist = [
|
|||||||
{ name = "requests", specifier = ">=2,<3" },
|
{ name = "requests", specifier = ">=2,<3" },
|
||||||
{ name = "sqlalchemy", specifier = ">=1.4,<3" },
|
{ name = "sqlalchemy", specifier = ">=1.4,<3" },
|
||||||
]
|
]
|
||||||
|
provides-extras = ["community", "anthropic", "openai", "cohere", "google-vertexai", "google-genai", "fireworks", "ollama", "together", "mistralai", "huggingface", "groq", "aws", "deepseek", "xai"]
|
||||||
|
|
||||||
[package.metadata.requires-dev]
|
[package.metadata.requires-dev]
|
||||||
codespell = [{ name = "codespell", specifier = ">=2.2.0,<3.0.0" }]
|
codespell = [{ name = "codespell", specifier = ">=2.2.0,<3.0.0" }]
|
||||||
@ -1744,7 +1746,7 @@ typing = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "langchain-core"
|
name = "langchain-core"
|
||||||
version = "0.3.41"
|
version = "0.3.45"
|
||||||
source = { editable = "../core" }
|
source = { editable = "../core" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "jsonpatch" },
|
{ name = "jsonpatch" },
|
||||||
@ -1802,7 +1804,7 @@ typing = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "langchain-tests"
|
name = "langchain-tests"
|
||||||
version = "0.3.13"
|
version = "0.3.14"
|
||||||
source = { editable = "../standard-tests" }
|
source = { editable = "../standard-tests" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "httpx" },
|
{ name = "httpx" },
|
||||||
|
Loading…
Reference in New Issue
Block a user