From d2f882158fe73a55909895f916197e18e623c41a Mon Sep 17 00:00:00 2001
From: Roy Williams
Date: Thu, 26 Jan 2023 22:37:31 -0500
Subject: [PATCH] Add type information for crawler.py (#738)

Added type information to `crawler.py` to make it safer to use and understand.
---
 docs/modules/agents/implementations/natbot.py |   2 +-
 langchain/chains/natbot/crawler.py            | 172 ++++++++++--------
 2 files changed, 100 insertions(+), 74 deletions(-)

diff --git a/docs/modules/agents/implementations/natbot.py b/docs/modules/agents/implementations/natbot.py
index 59c696ac732..d9d24af1533 100644
--- a/docs/modules/agents/implementations/natbot.py
+++ b/docs/modules/agents/implementations/natbot.py
@@ -2,7 +2,7 @@
 import time
 
 from langchain.chains.natbot.base import NatBotChain
-from langchain.chains.natbot.crawler import Crawler  # type: ignore
+from langchain.chains.natbot.crawler import Crawler
 
 
 def run_cmd(cmd: str, _crawler: Crawler) -> None:
diff --git a/langchain/chains/natbot/crawler.py b/langchain/chains/natbot/crawler.py
index 6be687c1962..6fcf9b4b2de 100644
--- a/langchain/chains/natbot/crawler.py
+++ b/langchain/chains/natbot/crawler.py
@@ -1,9 +1,23 @@
 # flake8: noqa
-# type: ignore
 import time
 from sys import platform
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    TypedDict,
+    Union,
+)
 
-black_listed_elements = {
+if TYPE_CHECKING:
+    from playwright.sync_api import Browser, CDPSession, Page, sync_playwright
+
+black_listed_elements: Set[str] = {
     "html",
     "head",
     "title",
@@ -19,8 +33,21 @@ black_listed_elements = {
 }
 
 
+class ElementInViewPort(TypedDict):
+    node_index: str
+    backend_node_id: int
+    node_name: Optional[str]
+    node_value: Optional[str]
+    node_meta: List[str]
+    is_clickable: bool
+    origin_x: int
+    origin_y: int
+    center_x: int
+    center_y: int
+
+
 class Crawler:
-    def __init__(self):
+    def __init__(self) -> None:
         try:
             from playwright.sync_api import sync_playwright
         except ImportError:
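The `if TYPE_CHECKING:` block above is what lets the annotations reference playwright types while keeping playwright an optional dependency: those names are imported only when a type checker runs, and the runtime import still happens inside `__init__`. A minimal standalone sketch of the same pattern (hypothetical class name, not part of the patch):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers (mypy, pyright); never executed at runtime,
    # so the package remains an optional dependency.
    from playwright.sync_api import Browser


class NeedsPlaywright:  # hypothetical example class
    def __init__(self) -> None:
        try:
            # Runtime import, deferred so this module can be imported
            # even when playwright is not installed.
            from playwright.sync_api import sync_playwright
        except ImportError:
            raise ValueError("Please install it with `pip install playwright`.")
        # The string annotation resolves against the TYPE_CHECKING import.
        self.browser: "Browser" = sync_playwright().start().chromium.launch()
```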
@@ -28,16 +55,20 @@ class Crawler:
             raise ValueError(
                 "Could not import playwright python package. "
                 "Please install it with `pip install playwright`."
             )
-        self.browser = sync_playwright().start().chromium.launch(headless=False)
-        self.page = self.browser.new_page()
+        self.browser: Browser = (
+            sync_playwright().start().chromium.launch(headless=False)
+        )
+        self.page: Page = self.browser.new_page()
         self.page.set_viewport_size({"width": 1280, "height": 1080})
+        self.page_element_buffer: Dict[int, ElementInViewPort]
+        self.client: CDPSession
 
-    def go_to_page(self, url):
+    def go_to_page(self, url: str) -> None:
         self.page.goto(url=url if "://" in url else "http://" + url)
         self.client = self.page.context.new_cdp_session(self.page)
         self.page_element_buffer = {}
 
-    def scroll(self, direction):
+    def scroll(self, direction: str) -> None:
         if direction == "up":
             self.page.evaluate(
                 "(document.scrollingElement || document.body).scrollTop = (document.scrollingElement || document.body).scrollTop - window.innerHeight;"
@@ -47,7 +78,7 @@ class Crawler:
                 "(document.scrollingElement || document.body).scrollTop = (document.scrollingElement || document.body).scrollTop + window.innerHeight;"
             )
 
-    def click(self, id):
+    def click(self, id: Union[str, int]) -> None:
         # Inject javascript into the page which removes the target= attribute from all links
         js = """
         links = document.getElementsByTagName("a");
@@ -59,41 +90,37 @@ class Crawler:
         element = self.page_element_buffer.get(int(id))
 
         if element:
-            x = element.get("center_x")
-            y = element.get("center_y")
+            x: float = element["center_x"]
+            y: float = element["center_y"]
             self.page.mouse.click(x, y)
         else:
             print("Could not find element")
 
-    def type(self, id, text):
+    def type(self, id: Union[str, int], text: str) -> None:
         self.click(id)
         self.page.keyboard.type(text)
 
-    def enter(self):
+    def enter(self) -> None:
         self.page.keyboard.press("Enter")
 
-    def crawl(self):
+    def crawl(self) -> List[str]:
         page = self.page
         page_element_buffer = self.page_element_buffer
         start = time.time()
 
         page_state_as_text = []
 
-        device_pixel_ratio = page.evaluate("window.devicePixelRatio")
+        device_pixel_ratio: float = page.evaluate("window.devicePixelRatio")
         if platform == "darwin" and device_pixel_ratio == 1:  # lies
             device_pixel_ratio = 2
 
-        win_scroll_x = page.evaluate("window.scrollX")
-        win_scroll_y = page.evaluate("window.scrollY")
-        win_upper_bound = page.evaluate("window.pageYOffset")
-        win_left_bound = page.evaluate("window.pageXOffset")
-        win_width = page.evaluate("window.screen.width")
-        win_height = page.evaluate("window.screen.height")
-        win_right_bound = win_left_bound + win_width
-        win_lower_bound = win_upper_bound + win_height
-        document_offset_height = page.evaluate("document.body.offsetHeight")
-        document_scroll_height = page.evaluate("document.body.scrollHeight")
+        win_upper_bound: float = page.evaluate("window.pageYOffset")
+        win_left_bound: float = page.evaluate("window.pageXOffset")
+        win_width: float = page.evaluate("window.screen.width")
+        win_height: float = page.evaluate("window.screen.height")
+        win_right_bound: float = win_left_bound + win_width
+        win_lower_bound: float = win_upper_bound + win_height
 
         # percentage_progress_start = (win_upper_bound / document_scroll_height) * 100
         # percentage_progress_end = (
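For reference, the typed surface above is driven roughly like this (a hypothetical snippet, assuming `pip install playwright` and `playwright install` have been run; element ids come from the `crawl()` output):

```python
from langchain.chains.natbot.crawler import Crawler

crawler = Crawler()                 # launches headful Chromium (headless=False)
crawler.go_to_page("example.com")   # "http://" is prepended when no scheme is given
for line in crawler.crawl():        # crawl() -> List[str], one entry per visible element
    print(line)
crawler.type(0, "query")            # id: Union[str, int]; int()-coerced internally
crawler.enter()
crawler.scroll("down")              # direction: "up" or "down"
```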
@@ -116,40 +143,35 @@ class Crawler:
             "DOMSnapshot.captureSnapshot",
             {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
         )
-        strings = tree["strings"]
-        document = tree["documents"][0]
-        nodes = document["nodes"]
-        backend_node_id = nodes["backendNodeId"]
-        attributes = nodes["attributes"]
-        node_value = nodes["nodeValue"]
-        parent = nodes["parentIndex"]
-        node_types = nodes["nodeType"]
-        node_names = nodes["nodeName"]
-        is_clickable = set(nodes["isClickable"]["index"])
+        strings: Dict[int, str] = tree["strings"]
+        document: Dict[str, Any] = tree["documents"][0]
+        nodes: Dict[str, Any] = document["nodes"]
+        backend_node_id: Dict[int, int] = nodes["backendNodeId"]
+        attributes: Dict[int, Dict[int, Any]] = nodes["attributes"]
+        node_value: Dict[int, int] = nodes["nodeValue"]
+        parent: Dict[int, int] = nodes["parentIndex"]
+        node_names: Dict[int, int] = nodes["nodeName"]
+        is_clickable: Set[int] = set(nodes["isClickable"]["index"])
 
-        text_value = nodes["textValue"]
-        text_value_index = text_value["index"]
-        text_value_values = text_value["value"]
+        input_value: Dict[str, Any] = nodes["inputValue"]
+        input_value_index: List[int] = input_value["index"]
+        input_value_values: List[int] = input_value["value"]
 
-        input_value = nodes["inputValue"]
-        input_value_index = input_value["index"]
-        input_value_values = input_value["value"]
+        layout: Dict[str, Any] = document["layout"]
+        layout_node_index: List[int] = layout["nodeIndex"]
+        bounds: Dict[int, List[float]] = layout["bounds"]
 
-        input_checked = nodes["inputChecked"]
-        layout = document["layout"]
-        layout_node_index = layout["nodeIndex"]
-        bounds = layout["bounds"]
+        cursor: int = 0
 
-        cursor = 0
-        html_elements_text = []
+        child_nodes: Dict[str, List[Dict[str, Any]]] = {}
+        elements_in_view_port: List[ElementInViewPort] = []
 
-        child_nodes = {}
-        elements_in_view_port = []
+        anchor_ancestry: Dict[str, Tuple[bool, Optional[int]]] = {"-1": (False, None)}
+        button_ancestry: Dict[str, Tuple[bool, Optional[int]]] = {"-1": (False, None)}
 
-        anchor_ancestry = {"-1": (False, None)}
-        button_ancestry = {"-1": (False, None)}
-
-        def convert_name(node_name, has_click_handler):
+        def convert_name(
+            node_name: Optional[str], has_click_handler: Optional[bool]
+        ) -> str:
             if node_name == "a":
                 return "link"
             if node_name == "input":
@@ -163,7 +185,9 @@ class Crawler:
             else:
                 return "text"
 
-        def find_attributes(attributes, keys):
+        def find_attributes(
+            attributes: Dict[int, Any], keys: List[str]
+        ) -> Dict[str, str]:
             values = {}
 
             for [key_index, value_index] in zip(*(iter(attributes),) * 2):
@@ -181,7 +205,13 @@ class Crawler:
 
             return values
 
-        def add_to_hash_tree(hash_tree, tag, node_id, node_name, parent_id):
+        def add_to_hash_tree(
+            hash_tree: Dict[str, Tuple[bool, Optional[int]]],
+            tag: str,
+            node_id: int,
+            node_name: Optional[str],
+            parent_id: int,
+        ) -> Tuple[bool, Optional[int]]:
             parent_id_str = str(parent_id)
             if not parent_id_str in hash_tree:
                 parent_name = strings[node_names[parent_id]].lower()
@@ -195,7 +225,7 @@ class Crawler:
 
             # even if the anchor is nested in another anchor, we set the "root" for all descendants to be ::Self
             if node_name == tag:
-                value = (True, node_id)
+                value: Tuple[bool, Optional[int]] = (True, node_id)
             elif (
                 is_parent_desc_anchor
             ):  # reuse the parent's anchor_id (which could be much higher in the tree)
@@ -212,7 +242,7 @@ class Crawler:
 
         for index, node_name_index in enumerate(node_names):
             node_parent = parent[index]
-            node_name = strings[node_name_index].lower()
+            node_name: Optional[str] = strings[node_name_index].lower()
 
             is_ancestor_of_anchor, anchor_id = add_to_hash_tree(
                 anchor_ancestry, "a", index, node_name, node_parent
             )
@@ -253,7 +283,7 @@ class Crawler:
             if not partially_is_in_viewport:
                 continue
 
-            meta_data = []
+            meta_data: List[str] = []
 
             # inefficient to grab the same set of keys for kinds of objects, but it's fine for now
             element_attributes = find_attributes(
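`find_attributes` above relies on `zip(*(iter(attributes),) * 2)` to walk DOMSnapshot's flat `[key, value, key, value, ...]` attribute list two entries at a time. The idiom works because the tuple repeats one shared iterator, so `zip` draws consecutive items from it; a tiny illustration with invented indices:

```python
from typing import List

# DOMSnapshot encodes attributes as alternating string-table indices.
flat_attributes: List[int] = [12, 34, 56, 78]  # hypothetical key/value indices

# One iterator, referenced twice: zip pulls items from it pairwise.
for key_index, value_index in zip(*(iter(flat_attributes),) * 2):
    print(key_index, value_index)
# -> 12 34
# -> 56 78
```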
@@ -274,7 +304,7 @@ class Crawler:
                 else child_nodes.setdefault(str(ancestor_node_key), [])
             )
 
-            if node_name == "#text" and ancestor_exception:
+            if node_name == "#text" and ancestor_exception and ancestor_node:
                 text = strings[node_value[index]]
                 if text == "|" or text == "•":
                     continue
@@ -289,7 +319,7 @@ class Crawler:
                 )  # prevent [button ... (button)..]
 
                 for key in element_attributes:
-                    if ancestor_exception:
+                    if ancestor_exception and ancestor_node:
                         ancestor_node.append(
                             {
                                 "type": "attribute",
@@ -344,36 +374,32 @@ class Crawler:
         for element in elements_in_view_port:
             node_index = element.get("node_index")
             node_name = element.get("node_name")
-            node_value = element.get("node_value")
-            is_clickable = element.get("is_clickable")
-            origin_x = element.get("origin_x")
-            origin_y = element.get("origin_y")
-            center_x = element.get("center_x")
-            center_y = element.get("center_y")
-            meta_data = element.get("node_meta")
+            element_node_value = element.get("node_value")
+            node_is_clickable = element.get("is_clickable")
+            node_meta_data: Optional[List[str]] = element.get("node_meta")
 
-            inner_text = f"{node_value} " if node_value else ""
+            inner_text = f"{element_node_value} " if element_node_value else ""
             meta = ""
 
             if node_index in child_nodes:
-                for child in child_nodes.get(node_index):
+                for child in child_nodes[node_index]:
                     entry_type = child.get("type")
                     entry_value = child.get("value")
 
-                    if entry_type == "attribute":
+                    if entry_type == "attribute" and node_meta_data:
                         entry_key = child.get("key")
-                        meta_data.append(f'{entry_key}="{entry_value}"')
+                        node_meta_data.append(f'{entry_key}="{entry_value}"')
                     else:
                         inner_text += f"{entry_value} "
 
-            if meta_data:
-                meta_string = " ".join(meta_data)
+            if node_meta_data:
+                meta_string = " ".join(node_meta_data)
                 meta = f" {meta_string}"
 
             if inner_text != "":
                 inner_text = f"{inner_text.strip()}"
 
-            converted_node_name = convert_name(node_name, is_clickable)
+            converted_node_name = convert_name(node_name, node_is_clickable)
 
             # not very elegant, more like a placeholder
             if (
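The render loop above is where `ElementInViewPort` pays off: every entry buffered by `crawl()` must carry exactly these keys, so the `element.get(...)` accesses type-check. A sketch of one such entry with invented values (the TypedDict definition is repeated from the top of the patch):

```python
from typing import List, Optional, TypedDict

class ElementInViewPort(TypedDict):
    node_index: str
    backend_node_id: int
    node_name: Optional[str]
    node_value: Optional[str]
    node_meta: List[str]
    is_clickable: bool
    origin_x: int
    origin_y: int
    center_x: int
    center_y: int

# Hypothetical entry crawl() might buffer for a link:
element: ElementInViewPort = {
    "node_index": "5",
    "backend_node_id": 1042,
    "node_name": "a",
    "node_value": None,
    "node_meta": ['href="https://example.com"'],
    "is_clickable": True,
    "origin_x": 10,
    "origin_y": 20,
    "center_x": 60,
    "center_y": 35,
}

# A type checker now flags typos such as element["centre_x"]
# or assigning a str to "backend_node_id".
```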