community[patch]: fix bugs for bilibili Loader (#18036)

- **Description:** 
  1. Fix `BiliBiliLoader` so that it can receive cookie parameters; fetching
     transcripts requires three additional parameters (`sessdata`, `bili_jct`,
     `buvid3`). The change is backward compatible.
  2. Add a test.
  3. Add an example to the docs.

- **Issue:** [#14213]

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
Jiaming 2024-03-29 07:39:38 +08:00 committed by GitHub
parent 1ef3fa0411
commit 3d3cc71287
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 149 additions and 57 deletions

File diff suppressed because one or more lines are too long

View File

@ -8,20 +8,55 @@ from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader from langchain_community.document_loaders.base import BaseLoader
# Pre-compile regular expressions for video ID extraction
BV_PATTERN = re.compile(r"BV\w+")
AV_PATTERN = re.compile(r"av[0-9]+")
class BiliBiliLoader(BaseLoader): class BiliBiliLoader(BaseLoader):
"""Load `BiliBili` video transcripts.""" """
Loader for fetching transcripts from BiliBili videos.
"""
def __init__(self, video_urls: List[str]): def __init__(
"""Initialize with bilibili url. self,
video_urls: List[str],
sessdata: str = "",
bili_jct: str = "",
buvid3: str = "",
):
"""
Initialize the loader with BiliBili video URLs and authentication cookies.
Unless all three authentication cookies are provided, the loader cannot fetch
transcripts and will only fetch video info.
Args: Args:
video_urls: List of bilibili urls. video_urls (List[str]): List of BiliBili video URLs.
sessdata (str): SESSDATA cookie value for authentication.
bili_jct (str): BILI_JCT cookie value for authentication.
buvid3 (str): BUVID3 cookie value for authentication.
""" """
self.video_urls = video_urls self.video_urls = video_urls
self.credential = None
try:
from bilibili_api import video
except ImportError:
raise ImportError(
"requests package not found, please install it with "
"`pip install bilibili-api-python`"
)
if sessdata and bili_jct and buvid3:
self.credential = video.Credential(
sessdata=sessdata, bili_jct=bili_jct, buvid3=buvid3
)
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load Documents from bilibili url.""" """
Load and return a list of documents containing video transcripts.
Returns:
List[Document]: List of Document objects containing transcripts and metadata.
"""
results = [] results = []
for url in self.video_urls: for url in self.video_urls:
transcript, video_info = self._get_bilibili_subs_and_info(url) transcript, video_info = self._get_bilibili_subs_and_info(url)
@ -31,6 +66,10 @@ class BiliBiliLoader(BaseLoader):
return results return results
def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]: def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]:
"""
Retrieve video information and transcript for a given BiliBili URL.
"""
bvid = BV_PATTERN.search(url)
try: try:
from bilibili_api import sync, video from bilibili_api import sync, video
except ImportError: except ImportError:
@ -38,46 +77,50 @@ class BiliBiliLoader(BaseLoader):
"requests package not found, please install it with " "requests package not found, please install it with "
"`pip install bilibili-api-python`" "`pip install bilibili-api-python`"
) )
if bvid:
bvid = re.search(r"BV\w+", url) v = video.Video(bvid=bvid.group(), credential=self.credential)
if bvid is not None:
v = video.Video(bvid=bvid.group())
else: else:
aid = re.search(r"av[0-9]+", url) aid = AV_PATTERN.search(url)
if aid is not None: if aid:
try: v = video.Video(aid=int(aid.group()[2:]), credential=self.credential)
v = video.Video(aid=int(aid.group()[2:]))
except AttributeError:
raise ValueError(f"{url} is not bilibili url.")
else: else:
raise ValueError(f"{url} is not bilibili url.") raise ValueError(f"Unable to find a valid video ID in URL: {url}")
video_info = sync(v.get_info()) video_info = sync(v.get_info())
video_info.update({"url": url}) video_info.update({"url": url})
sub = sync(v.get_subtitle(video_info["cid"]))
# Get subtitle url # Return if no credential is provided
sub_list = sub["subtitles"] if not self.credential:
return "", video_info
# Fetching and processing subtitles
sub = sync(v.get_subtitle(video_info["cid"]))
sub_list = sub.get("subtitles", [])
if sub_list: if sub_list:
sub_url = sub_list[0]["subtitle_url"] sub_url = sub_list[0].get("subtitle_url", "")
if not sub_url.startswith("http"): if not sub_url.startswith("http"):
sub_url = "https:" + sub_url sub_url = "https:" + sub_url
result = requests.get(sub_url)
raw_sub_titles = json.loads(result.content)["body"]
raw_transcript = " ".join([c["content"] for c in raw_sub_titles])
raw_transcript_with_meta_info = ( response = requests.get(sub_url)
f"Video Title: {video_info['title']}," if response.status_code == 200:
f"description: {video_info['desc']}\n\n" raw_sub_titles = json.loads(response.content).get("body", [])
f"Transcript: {raw_transcript}" raw_transcript = " ".join([c["content"] for c in raw_sub_titles])
)
return raw_transcript_with_meta_info, video_info raw_transcript_with_meta_info = (
f"Video Title: {video_info['title']}, "
f"description: {video_info['desc']}\n\n"
f"Transcript: {raw_transcript}"
)
return raw_transcript_with_meta_info, video_info
else:
warnings.warn(
f"Failed to fetch subtitles for {url}. "
f"HTTP Status Code: {response.status_code}"
)
else: else:
raw_transcript = ""
warnings.warn( warnings.warn(
f""" f"No subtitles found for video: {url}. Returning empty transcript."
No subtitles found for video: {url}.
Return Empty transcript.
"""
) )
return raw_transcript, video_info
# Return empty transcript if no subtitles are found
return "", video_info

View File

@ -10,11 +10,6 @@ def test_bilibili_loader() -> None:
] ]
) )
docs = loader.load() docs = loader.load()
assert len(docs) == 2 assert len(docs) == 2
assert docs[0].metadata["aid"] == 34218168
assert len(docs[0].page_content) > 0 assert docs[1].metadata["videos"] == 1
assert docs[1].metadata["owner"]["mid"] == 398095160
assert docs[1].page_content == ""
assert docs[1].metadata["owner"]["mid"] == 398095160