Mirror of https://github.com/hwchase17/langchain.git (synced 2025-06-01 20:49:17 +00:00)
Add HTML Title and Page Language into metadata for AsyncHtmlLoader (#11326)
**Description:** Revise `libs/langchain/langchain/document_loaders/async_html.py` to store the HTML Title and Page Language in the `metadata` of `AsyncHtmlLoader`.
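For context, a sketch of the metadata shape before and after this change (URL and values are illustrative; `title`, `description`, and `language` are only set when the page defines the corresponding tags):

```python
# Before this change, AsyncHtmlLoader documents carried only the source URL:
metadata_before = {"source": "https://example.com"}

# After it, assuming the fetched page defines <title>, a description meta tag,
# and <html lang="...">, the metadata looks like this (values illustrative):
metadata_after = {
    "source": "https://example.com",
    "title": "Example Domain",
    "description": "An illustrative page description",
    "language": "en",
}
```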
parent 4b16601d33
commit c08b622b2d
```diff
@@ -24,6 +24,18 @@ default_header_template = {
 }
 
 
+def _build_metadata(soup: Any, url: str) -> dict:
+    """Build metadata from BeautifulSoup output."""
+    metadata = {"source": url}
+    if title := soup.find("title"):
+        metadata["title"] = title.get_text()
+    if description := soup.find("meta", attrs={"name": "description"}):
+        metadata["description"] = description.get("content", "No description found.")
+    if html := soup.find("html"):
+        metadata["language"] = html.get("lang", "No language found.")
+    return metadata
+
+
 class AsyncHtmlLoader(BaseLoader):
     """Load `HTML` asynchronously."""
 
```
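The helper uses walrus-operator guards, so each key is set only when the corresponding tag exists, and falls back to a placeholder string when the attribute is missing. A self-contained sketch of the same logic, with an invented HTML snippet and placeholder URL:

```python
from bs4 import BeautifulSoup

html = (
    '<html lang="en"><head><title>Async loading</title>'
    '<meta name="description" content="Demo page"></head><body></body></html>'
)
soup = BeautifulSoup(html, "html.parser")

metadata = {"source": "https://example.com"}  # placeholder URL
if title := soup.find("title"):
    metadata["title"] = title.get_text()
if description := soup.find("meta", attrs={"name": "description"}):
    # tag present but missing a content attribute -> "No description found."
    metadata["description"] = description.get("content", "No description found.")
if html_tag := soup.find("html"):
    # <html> without a lang attribute -> "No language found."
    metadata["language"] = html_tag.get("lang", "No language found.")

print(metadata)
# {'source': 'https://example.com', 'title': 'Async loading',
#  'description': 'Demo page', 'language': 'en'}
```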
```diff
@@ -33,6 +45,9 @@ class AsyncHtmlLoader(BaseLoader):
         header_template: Optional[dict] = None,
         verify_ssl: Optional[bool] = True,
         proxies: Optional[dict] = None,
+        autoset_encoding: bool = True,
+        encoding: Optional[str] = None,
+        default_parser: str = "html.parser",
         requests_per_second: int = 2,
         requests_kwargs: Optional[Dict[str, Any]] = None,
         raise_for_status: bool = False,
```
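A hedged example of the three new constructor knobs; the values are illustrative, not defaults (`default_parser` already defaults to "html.parser" and `autoset_encoding` to True):

```python
from langchain.document_loaders import AsyncHtmlLoader

loader = AsyncHtmlLoader(
    ["https://example.com"],   # placeholder URL
    encoding="utf-8",          # force a codec instead of auto-detection
    autoset_encoding=False,    # ignored here, since encoding is explicit
    default_parser="lxml",     # requires the lxml package to be installed
)
```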
```diff
@@ -68,8 +83,46 @@ class AsyncHtmlLoader(BaseLoader):
             self.session.proxies.update(proxies)
 
         self.requests_per_second = requests_per_second
+        self.default_parser = default_parser
         self.requests_kwargs = requests_kwargs or {}
         self.raise_for_status = raise_for_status
+        self.autoset_encoding = autoset_encoding
+        self.encoding = encoding
 
+    @staticmethod
+    def _check_parser(parser: str) -> None:
+        """Check that parser is valid for bs4."""
+        valid_parsers = ["html.parser", "lxml", "xml", "lxml-xml", "html5lib"]
+        if parser not in valid_parsers:
+            raise ValueError(
+                "`parser` must be one of " + ", ".join(valid_parsers) + "."
+            )
+
+    def _scrape(
+        self,
+        url: str,
+        parser: Union[str, None] = None,
+        bs_kwargs: Optional[dict] = None,
+    ) -> Any:
+        from bs4 import BeautifulSoup
+
+        if parser is None:
+            if url.endswith(".xml"):
+                parser = "xml"
+            else:
+                parser = self.default_parser
+
+        self._check_parser(parser)
+
+        html_doc = self.session.get(url, **self.requests_kwargs)
+        if self.raise_for_status:
+            html_doc.raise_for_status()
+
+        if self.encoding is not None:
+            html_doc.encoding = self.encoding
+        elif self.autoset_encoding:
+            html_doc.encoding = html_doc.apparent_encoding
+        return BeautifulSoup(html_doc.text, parser, **(bs_kwargs or {}))
+
     async def _fetch(
         self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
```
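The encoding branch in `_scrape` follows the standard `requests` idiom: an explicit codec wins, otherwise `apparent_encoding` re-detects the charset from the response body rather than trusting the HTTP headers. The same pattern in isolation (URL is a placeholder):

```python
import requests

encoding = None          # set to e.g. "utf-8" to force a codec
autoset_encoding = True  # mirrors the constructor default

resp = requests.get("https://example.com")  # placeholder URL
if encoding is not None:
    resp.encoding = encoding
elif autoset_encoding:
    # apparent_encoding sniffs the body bytes (charset detection) instead of
    # relying on the Content-Type header guess
    resp.encoding = resp.apparent_encoding

print(resp.encoding)
```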
```diff
@@ -142,7 +195,8 @@ class AsyncHtmlLoader(BaseLoader):
         results = asyncio.run(self.fetch_all(self.web_paths))
         docs = []
         for i, text in enumerate(cast(List[str], results)):
-            metadata = {"source": self.web_paths[i]}
+            soup = self._scrape(self.web_paths[i])
+            metadata = _build_metadata(soup, self.web_paths[i])
             docs.append(Document(page_content=text, metadata=metadata))
 
         return docs
```
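An end-to-end sketch of the reworked `load()` (URLs are placeholders; note that `_scrape` issues a second, synchronous request per URL to build the soup used for metadata, separate from the async fetch that produced the page text):

```python
from langchain.document_loaders import AsyncHtmlLoader

loader = AsyncHtmlLoader(["https://example.com", "https://example.org"])
for doc in loader.load():
    # "source" is always set; the other keys appear only when the page
    # defines the corresponding tags.
    print(doc.metadata["source"], "->", doc.metadata.get("title", "<no title>"))
```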