community[patch]: doc loaders mypy fixes (#17368)

**Description:** Removed or corrected `type: ignore` comments flagged by mypy in several
document loaders.
**Issue:** [Remove "type: ignore" comments #17048](https://github.com/langchain-ai/langchain/issues/17048)

---------

Co-authored-by: Robby <h0rv@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Robby 2024-02-12 19:51:06 -05:00 committed by GitHub
parent 0653aa469a
commit ece4b43a81
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 37 additions and 24 deletions

View File

@ -41,7 +41,7 @@ class ArcGISLoader(BaseLoader):
) from e
try:
from bs4 import BeautifulSoup # type: ignore
from bs4 import BeautifulSoup
self.BEAUTIFULSOUP = BeautifulSoup
except ImportError:

View File

@ -123,7 +123,9 @@ class AssemblyAIAudioLoaderById(BaseLoader):
"""
def __init__(self, transcript_id, api_key, transcript_format): # type: ignore[no-untyped-def]
def __init__(
self, transcript_id: str, api_key: str, transcript_format: TranscriptFormat
):
"""
Initializes the AssemblyAI AssemblyAIAudioLoaderById.

View File

@ -65,7 +65,7 @@ class AstraDBLoader(BaseLoader):
return list(self.lazy_load())
def lazy_load(self) -> Iterator[Document]:
queue = Queue(self.nb_prefetched) # type: ignore[var-annotated]
queue = Queue(self.nb_prefetched) # type: ignore
t = threading.Thread(target=self.fetch_results, args=(queue,))
t.start()
while True:

View File

@ -12,6 +12,7 @@ def _make_iterator(
length_func: Callable[[], int], show_progress: bool = False
) -> Callable[[Iterable[T]], Iterator[T]]:
"""Create a function that optionally wraps an iterable in tqdm."""
iterator: Callable[[Iterable[T]], Iterator[T]]
if show_progress:
try:
from tqdm.auto import tqdm
@ -29,7 +30,7 @@ def _make_iterator(
iterator = _with_tqdm
else:
iterator = iter # type: ignore
iterator = iter
return iterator

View File

@ -90,7 +90,7 @@ class CSVLoader(BaseLoader):
def __read_file(self, csvfile: TextIOWrapper) -> List[Document]:
docs = []
csv_reader = csv.DictReader(csvfile, **self.csv_args) # type: ignore
csv_reader = csv.DictReader(csvfile, **self.csv_args)
for i, row in enumerate(csv_reader):
try:
source = (

View File

@ -122,10 +122,10 @@ class DirectoryLoader(BaseLoader):
if self.sample_size > 0:
if self.randomize_sample:
randomizer = (
random.Random(self.sample_seed) if self.sample_seed else random
randomizer = random.Random(
self.sample_seed if self.sample_seed else None
)
randomizer.shuffle(items) # type: ignore
randomizer.shuffle(items)
items = items[: min(len(items), self.sample_size)]
pbar = None

View File

@ -41,7 +41,7 @@ class GitLoader(BaseLoader):
def load(self) -> List[Document]:
try:
from git import Blob, Repo # type: ignore
from git import Blob, Repo
except ImportError as ex:
raise ImportError(
"Could not import git python package. "
@ -76,7 +76,7 @@ class GitLoader(BaseLoader):
file_path = os.path.join(self.repo_path, item.path)
ignored_files = repo.ignored([file_path]) # type: ignore
ignored_files = repo.ignored([file_path])
if len(ignored_files):
continue

View File

@ -1,4 +1,4 @@
from typing import Any, List
from typing import Any, List, Tuple
from langchain_community.document_loaders.parsers.language.code_segmenter import (
CodeSegmenter,
@ -55,15 +55,18 @@ class JavaScriptSegmenter(CodeSegmenter):
tree = esprima.parseScript(self.code, loc=True)
simplified_lines = self.source_lines[:]
indices_to_del: List[Tuple[int, int]] = []
for node in tree.body:
if isinstance(
node,
(esprima.nodes.FunctionDeclaration, esprima.nodes.ClassDeclaration),
):
start = node.loc.start.line - 1
start, end = node.loc.start.line - 1, node.loc.end.line
simplified_lines[start] = f"// Code for: {simplified_lines[start]}"
for line_num in range(start + 1, node.loc.end.line):
simplified_lines[line_num] = None # type: ignore
indices_to_del.append((start + 1, end))
return "\n".join(line for line in simplified_lines if line is not None)
for start, end in reversed(indices_to_del):
del simplified_lines[start + 0 : end]
return "\n".join(line for line in simplified_lines)

View File

@ -1,5 +1,5 @@
import ast
from typing import Any, List
from typing import Any, List, Tuple
from langchain_community.document_loaders.parsers.language.code_segmenter import (
CodeSegmenter,
@ -39,13 +39,15 @@ class PythonSegmenter(CodeSegmenter):
tree = ast.parse(self.code)
simplified_lines = self.source_lines[:]
indices_to_del: List[Tuple[int, int]] = []
for node in ast.iter_child_nodes(tree):
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
start = node.lineno - 1
start, end = node.lineno - 1, node.end_lineno
simplified_lines[start] = f"# Code for: {simplified_lines[start]}"
assert isinstance(end, int)
indices_to_del.append((start + 1, end))
assert isinstance(node.end_lineno, int)
for line_num in range(start + 1, node.end_lineno):
simplified_lines[line_num] = None # type: ignore
for start, end in reversed(indices_to_del):
del simplified_lines[start + 0 : end]
return "\n".join(line for line in simplified_lines if line is not None)
return "\n".join(simplified_lines)

View File

@ -318,7 +318,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
output_string = StringIO()
with open_filename(self.file_path, "rb") as fp:
extract_text_to_fp(
fp, # type: ignore[arg-type]
fp,
output_string,
codec="",
laparams=LAParams(),

View File

@ -215,6 +215,11 @@ class RecursiveUrlLoader(BaseLoader):
visited: A set of visited URLs.
depth: To reach the current url, how many pages have been visited.
"""
if not self.use_async or not self._lock:
raise ValueError(
"Async functions forbidden when not initialized with `use_async`"
)
try:
import aiohttp
except ImportError:
@ -237,7 +242,7 @@ class RecursiveUrlLoader(BaseLoader):
headers=self.headers,
)
)
async with self._lock: # type: ignore
async with self._lock:
visited.add(url)
try:
async with session.get(url) as response:
@ -277,7 +282,7 @@ class RecursiveUrlLoader(BaseLoader):
# Recursively call the function to get the children of the children
sub_tasks = []
async with self._lock: # type: ignore
async with self._lock:
to_visit = set(sub_links).difference(visited)
for link in to_visit:
sub_tasks.append(