diff --git a/libs/community/langchain_community/document_loaders/bilibili.py b/libs/community/langchain_community/document_loaders/bilibili.py index 192311f59da..a8d4d36b405 100644 --- a/libs/community/langchain_community/document_loaders/bilibili.py +++ b/libs/community/langchain_community/document_loaders/bilibili.py @@ -11,6 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader # Pre-compile regular expressions for video ID extraction BV_PATTERN = re.compile(r"BV\w+") AV_PATTERN = re.compile(r"av[0-9]+") +PAGE_INDEX_PATTERN = re.compile(r"p=(\d+)") class BiliBiliLoader(BaseLoader): @@ -93,8 +94,17 @@ class BiliBiliLoader(BaseLoader): if not self.credential: return "", video_info + cid = 0 + page_match = PAGE_INDEX_PATTERN.search(url) + if page_match: + cid = video_info["pages"][int(page_match.group(1)) - 1][ + "cid" + ] # Bilibili page index starts from 1 + else: + cid = video_info["cid"] + # Fetching and processing subtitles - sub = sync(v.get_subtitle(video_info["cid"])) + sub = sync(v.get_subtitle(cid)) sub_list = sub.get("subtitles", []) if sub_list: sub_url = sub_list[0].get("subtitle_url", "") diff --git a/libs/community/tests/integration_tests/document_loaders/test_bilibili.py b/libs/community/tests/integration_tests/document_loaders/test_bilibili.py index b1cfc6b2b56..01cd9e0fefe 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_bilibili.py +++ b/libs/community/tests/integration_tests/document_loaders/test_bilibili.py @@ -7,9 +7,12 @@ def test_bilibili_loader() -> None: [ "https://www.bilibili.com/video/BV1xt411o7Xu/", "https://www.bilibili.com/video/av330407025/", + "https://www.bilibili.com/video/BV16b4y1R7wP/?p=5", ] ) docs = loader.load() - assert len(docs) == 2 + assert len(docs) == 3 assert docs[0].metadata["aid"] == 34218168 assert docs[1].metadata["videos"] == 1 + assert docs[2].metadata["pages"][5 - 1]["cid"] == 300059803 + assert docs[2].metadata["cid"] == 300048569 diff --git a/libs/community/uv.lock b/libs/community/uv.lock index 636a3af67f0..776605b46ea 100644 --- a/libs/community/uv.lock +++ b/libs/community/uv.lock @@ -1,4 +1,5 @@ version = 1 +revision = 1 requires-python = ">=3.9, <4.0" resolution-markers = [ "python_full_version >= '3.12.4' and platform_python_implementation == 'PyPy'", @@ -1530,6 +1531,7 @@ requires-dist = [ { name = "requests", specifier = ">=2,<3" }, { name = "sqlalchemy", specifier = ">=1.4,<3" }, ] +provides-extras = ["community", "anthropic", "openai", "cohere", "google-vertexai", "google-genai", "fireworks", "ollama", "together", "mistralai", "huggingface", "groq", "aws", "deepseek", "xai"] [package.metadata.requires-dev] codespell = [{ name = "codespell", specifier = ">=2.2.0,<3.0.0" }] @@ -1744,7 +1746,7 @@ typing = [ [[package]] name = "langchain-core" -version = "0.3.41" +version = "0.3.45" source = { editable = "../core" } dependencies = [ { name = "jsonpatch" }, @@ -1802,7 +1804,7 @@ typing = [ [[package]] name = "langchain-tests" -version = "0.3.13" +version = "0.3.14" source = { editable = "../standard-tests" } dependencies = [ { name = "httpx" },