mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-10 13:27:36 +00:00
community[patch]: fix bugs for bilibili Loader (#18036)
- **Description:** 1. Fix the BiliBiliLoader so that it can receive cookie parameters; it requires 3 additional parameters to run. The change is backward compatible. 2. Add a test. 3. Add an example in the docs. - **Issue:** [#14213] Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
parent
1ef3fa0411
commit
3d3cc71287
File diff suppressed because one or more lines are too long
@ -8,20 +8,55 @@ from langchain_core.documents import Document
|
|||||||
|
|
||||||
from langchain_community.document_loaders.base import BaseLoader
|
from langchain_community.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
# Pre-compile regular expressions for video ID extraction
|
||||||
|
BV_PATTERN = re.compile(r"BV\w+")
|
||||||
|
AV_PATTERN = re.compile(r"av[0-9]+")
|
||||||
|
|
||||||
|
|
||||||
class BiliBiliLoader(BaseLoader):
|
class BiliBiliLoader(BaseLoader):
|
||||||
"""Load `BiliBili` video transcripts."""
|
"""
|
||||||
|
Loader for fetching transcripts from BiliBili videos.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, video_urls: List[str]):
|
def __init__(
|
||||||
"""Initialize with bilibili url.
|
self,
|
||||||
|
video_urls: List[str],
|
||||||
|
sessdata: str = "",
|
||||||
|
bili_jct: str = "",
|
||||||
|
buvid3: str = "",
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize the loader with BiliBili video URLs and authentication cookies.
|
||||||
|
If no authentication cookies are provided, the loader can't get transcripts
|
||||||
|
and will only fetch video info.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
video_urls: List of bilibili urls.
|
video_urls (List[str]): List of BiliBili video URLs.
|
||||||
|
sessdata (str): SESSDATA cookie value for authentication.
|
||||||
|
bili_jct (str): BILI_JCT cookie value for authentication.
|
||||||
|
buvid3 (str): BUVID3 cookie value for authentication.
|
||||||
"""
|
"""
|
||||||
self.video_urls = video_urls
|
self.video_urls = video_urls
|
||||||
|
self.credential = None
|
||||||
|
try:
|
||||||
|
from bilibili_api import video
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"requests package not found, please install it with "
|
||||||
|
"`pip install bilibili-api-python`"
|
||||||
|
)
|
||||||
|
if sessdata and bili_jct and buvid3:
|
||||||
|
self.credential = video.Credential(
|
||||||
|
sessdata=sessdata, bili_jct=bili_jct, buvid3=buvid3
|
||||||
|
)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load Documents from bilibili url."""
|
"""
|
||||||
|
Load and return a list of documents containing video transcripts.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List[Document]: List of Document objects containing transcripts and metadata.
|
||||||
|
"""
|
||||||
results = []
|
results = []
|
||||||
for url in self.video_urls:
|
for url in self.video_urls:
|
||||||
transcript, video_info = self._get_bilibili_subs_and_info(url)
|
transcript, video_info = self._get_bilibili_subs_and_info(url)
|
||||||
@ -31,6 +66,10 @@ class BiliBiliLoader(BaseLoader):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]:
|
def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]:
|
||||||
|
"""
|
||||||
|
Retrieve video information and transcript for a given BiliBili URL.
|
||||||
|
"""
|
||||||
|
bvid = BV_PATTERN.search(url)
|
||||||
try:
|
try:
|
||||||
from bilibili_api import sync, video
|
from bilibili_api import sync, video
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -38,46 +77,50 @@ class BiliBiliLoader(BaseLoader):
|
|||||||
"requests package not found, please install it with "
|
"requests package not found, please install it with "
|
||||||
"`pip install bilibili-api-python`"
|
"`pip install bilibili-api-python`"
|
||||||
)
|
)
|
||||||
|
if bvid:
|
||||||
bvid = re.search(r"BV\w+", url)
|
v = video.Video(bvid=bvid.group(), credential=self.credential)
|
||||||
if bvid is not None:
|
|
||||||
v = video.Video(bvid=bvid.group())
|
|
||||||
else:
|
else:
|
||||||
aid = re.search(r"av[0-9]+", url)
|
aid = AV_PATTERN.search(url)
|
||||||
if aid is not None:
|
if aid:
|
||||||
try:
|
v = video.Video(aid=int(aid.group()[2:]), credential=self.credential)
|
||||||
v = video.Video(aid=int(aid.group()[2:]))
|
|
||||||
except AttributeError:
|
|
||||||
raise ValueError(f"{url} is not bilibili url.")
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"{url} is not bilibili url.")
|
raise ValueError(f"Unable to find a valid video ID in URL: {url}")
|
||||||
|
|
||||||
video_info = sync(v.get_info())
|
video_info = sync(v.get_info())
|
||||||
video_info.update({"url": url})
|
video_info.update({"url": url})
|
||||||
sub = sync(v.get_subtitle(video_info["cid"]))
|
|
||||||
|
|
||||||
# Get subtitle url
|
# Return if no credential is provided
|
||||||
sub_list = sub["subtitles"]
|
if not self.credential:
|
||||||
|
return "", video_info
|
||||||
|
|
||||||
|
# Fetching and processing subtitles
|
||||||
|
sub = sync(v.get_subtitle(video_info["cid"]))
|
||||||
|
sub_list = sub.get("subtitles", [])
|
||||||
if sub_list:
|
if sub_list:
|
||||||
sub_url = sub_list[0]["subtitle_url"]
|
sub_url = sub_list[0].get("subtitle_url", "")
|
||||||
if not sub_url.startswith("http"):
|
if not sub_url.startswith("http"):
|
||||||
sub_url = "https:" + sub_url
|
sub_url = "https:" + sub_url
|
||||||
result = requests.get(sub_url)
|
|
||||||
raw_sub_titles = json.loads(result.content)["body"]
|
|
||||||
raw_transcript = " ".join([c["content"] for c in raw_sub_titles])
|
|
||||||
|
|
||||||
raw_transcript_with_meta_info = (
|
response = requests.get(sub_url)
|
||||||
f"Video Title: {video_info['title']},"
|
if response.status_code == 200:
|
||||||
f"description: {video_info['desc']}\n\n"
|
raw_sub_titles = json.loads(response.content).get("body", [])
|
||||||
f"Transcript: {raw_transcript}"
|
raw_transcript = " ".join([c["content"] for c in raw_sub_titles])
|
||||||
)
|
|
||||||
return raw_transcript_with_meta_info, video_info
|
raw_transcript_with_meta_info = (
|
||||||
|
f"Video Title: {video_info['title']}, "
|
||||||
|
f"description: {video_info['desc']}\n\n"
|
||||||
|
f"Transcript: {raw_transcript}"
|
||||||
|
)
|
||||||
|
return raw_transcript_with_meta_info, video_info
|
||||||
|
else:
|
||||||
|
warnings.warn(
|
||||||
|
f"Failed to fetch subtitles for {url}. "
|
||||||
|
f"HTTP Status Code: {response.status_code}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raw_transcript = ""
|
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
f"""
|
f"No subtitles found for video: {url}. Returning empty transcript."
|
||||||
No subtitles found for video: {url}.
|
|
||||||
Return Empty transcript.
|
|
||||||
"""
|
|
||||||
)
|
)
|
||||||
return raw_transcript, video_info
|
|
||||||
|
# Return empty transcript if no subtitles are found
|
||||||
|
return "", video_info
|
||||||
|
@ -10,11 +10,6 @@ def test_bilibili_loader() -> None:
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
|
||||||
assert len(docs) == 2
|
assert len(docs) == 2
|
||||||
|
assert docs[0].metadata["aid"] == 34218168
|
||||||
assert len(docs[0].page_content) > 0
|
assert docs[1].metadata["videos"] == 1
|
||||||
assert docs[1].metadata["owner"]["mid"] == 398095160
|
|
||||||
|
|
||||||
assert docs[1].page_content == ""
|
|
||||||
assert docs[1].metadata["owner"]["mid"] == 398095160
|
|
||||||
|
Loading…
Reference in New Issue
Block a user