mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-08 12:31:49 +00:00
community: Implement lazy_load() for PlaywrightURLLoader (#18676)
Integration tests: `tests/integration_tests/document_loaders/test_url_playwright.py`
This commit is contained in:
parent
c092db862e
commit
db8db6faae
@ -2,7 +2,7 @@
|
|||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import TYPE_CHECKING, Dict, List, Optional
|
from typing import TYPE_CHECKING, AsyncIterator, Dict, Iterator, List, Optional
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@ -160,16 +160,14 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
# Use the provided evaluator, if any, otherwise, use the default.
|
# Use the provided evaluator, if any, otherwise, use the default.
|
||||||
self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors)
|
self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Load the specified URLs using Playwright and create Document instances.
|
"""Load the specified URLs using Playwright and create Document instances.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[Document]: A list of Document instances with loaded content.
|
An iterator of Document instances with loaded content, yielded one per URL.
|
||||||
"""
|
"""
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
docs: List[Document] = list()
|
|
||||||
|
|
||||||
with sync_playwright() as p:
|
with sync_playwright() as p:
|
||||||
browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
@ -181,7 +179,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
|
|
||||||
text = self.evaluator.evaluate(page, browser, response)
|
text = self.evaluator.evaluate(page, browser, response)
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
yield Document(page_content=text, metadata=metadata)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.continue_on_failure:
|
if self.continue_on_failure:
|
||||||
logger.error(
|
logger.error(
|
||||||
@ -190,19 +188,25 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
else:
|
else:
|
||||||
raise e
|
raise e
|
||||||
browser.close()
|
browser.close()
|
||||||
return docs
|
|
||||||
|
|
||||||
async def aload(self) -> List[Document]:
|
async def aload(self) -> List[Document]:
|
||||||
"""Load the specified URLs with Playwright and create Documents asynchronously.
|
"""Load the specified URLs with Playwright and create Documents asynchronously.
|
||||||
Use this function when in a jupyter notebook environment.
|
Use this function when in a jupyter notebook environment.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[Document]: A list of Document instances with loaded content.
|
A list of Document instances with loaded content.
|
||||||
|
"""
|
||||||
|
return [doc async for doc in self.alazy_load()]
|
||||||
|
|
||||||
|
async def alazy_load(self) -> AsyncIterator[Document]:
|
||||||
|
"""Load the specified URLs with Playwright and create Documents asynchronously.
|
||||||
|
Use this function when in a jupyter notebook environment.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
An async iterator of Document instances with loaded content, yielded one per URL.
|
||||||
"""
|
"""
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
docs: List[Document] = list()
|
|
||||||
|
|
||||||
async with async_playwright() as p:
|
async with async_playwright() as p:
|
||||||
browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
@ -214,7 +218,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
|
|
||||||
text = await self.evaluator.evaluate_async(page, browser, response)
|
text = await self.evaluator.evaluate_async(page, browser, response)
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
yield Document(page_content=text, metadata=metadata)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.continue_on_failure:
|
if self.continue_on_failure:
|
||||||
logger.error(
|
logger.error(
|
||||||
@ -223,4 +227,3 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
else:
|
else:
|
||||||
raise e
|
raise e
|
||||||
await browser.close()
|
await browser.close()
|
||||||
return docs
|
|
||||||
|
Loading…
Reference in New Issue
Block a user