community[patch]: doc loaders mypy fixes (#17368)

**Description:** Fixed `type: ignore` comments for mypy in some of the
document_loaders.
**Issue:** [Remove "type: ignore" comments #17048](https://github.com/langchain-ai/langchain/issues/17048)

---------

Co-authored-by: Robby <h0rv@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Authored by Robby on 2024-02-12 19:51:06 -05:00, committed by GitHub
commit ece4b43a81 (parent 0653aa469a)
11 changed files with 37 additions and 24 deletions

@@ -41,7 +41,7 @@ class ArcGISLoader(BaseLoader):
             ) from e

         try:
-            from bs4 import BeautifulSoup  # type: ignore
+            from bs4 import BeautifulSoup
             self.BEAUTIFULSOUP = BeautifulSoup
         except ImportError:

@@ -123,7 +123,9 @@ class AssemblyAIAudioLoaderById(BaseLoader):
     """

-    def __init__(self, transcript_id, api_key, transcript_format):  # type: ignore[no-untyped-def]
+    def __init__(
+        self, transcript_id: str, api_key: str, transcript_format: TranscriptFormat
+    ):
         """
         Initializes the AssemblyAI AssemblyAIAudioLoaderById.

@@ -65,7 +65,7 @@ class AstraDBLoader(BaseLoader):
         return list(self.lazy_load())

     def lazy_load(self) -> Iterator[Document]:
-        queue = Queue(self.nb_prefetched)  # type: ignore[var-annotated]
+        queue = Queue(self.nb_prefetched)  # type: ignore
         t = threading.Thread(target=self.fetch_results, args=(queue,))
         t.start()
         while True:
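
An ignore remains on this line because mypy cannot infer what the unannotated `Queue` holds. As a side note (not part of this change), an explicit variable annotation is the other common way to satisfy the `var-annotated` check; a minimal sketch, where the helper name, the `maxsize` argument, and the `Optional[List[Document]]` element type are illustrative assumptions rather than the loader's actual internals:

```python
from queue import Queue
from typing import List, Optional

from langchain_core.documents import Document


def make_prefetch_queue(nb_prefetched: int) -> "Queue[Optional[List[Document]]]":
    # Annotating the variable (or the return type) tells mypy the element
    # type up front, so no "# type: ignore" is needed for var-annotated.
    queue: "Queue[Optional[List[Document]]]" = Queue(maxsize=nb_prefetched)
    return queue
```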

@@ -12,6 +12,7 @@ def _make_iterator(
     length_func: Callable[[], int], show_progress: bool = False
 ) -> Callable[[Iterable[T]], Iterator[T]]:
     """Create a function that optionally wraps an iterable in tqdm."""
+    iterator: Callable[[Iterable[T]], Iterator[T]]
     if show_progress:
         try:
             from tqdm.auto import tqdm
@@ -29,7 +30,7 @@ def _make_iterator(
         iterator = _with_tqdm
     else:
-        iterator = iter  # type: ignore
+        iterator = iter
     return iterator
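
The pre-declared annotation is what makes the bare `iterator = iter` acceptable: without it, mypy infers the variable's type from whichever branch assigns first and then rejects the other branch. A minimal sketch of the same pattern, where `pick_iterator` and `_verbose` are illustrative names rather than anything in `langchain_community`:

```python
from typing import Callable, Iterable, Iterator, TypeVar

T = TypeVar("T")


def pick_iterator(show_progress: bool) -> Callable[[Iterable[T]], Iterator[T]]:
    # Declaring the variable's type first lets both branches assign to it
    # without a "# type: ignore" on either assignment.
    iterator: Callable[[Iterable[T]], Iterator[T]]
    if show_progress:

        def _verbose(items: Iterable[T]) -> Iterator[T]:
            for item in items:
                print(".", end="")  # stand-in for a real tqdm progress bar
                yield item

        iterator = _verbose
    else:
        iterator = iter
    return iterator
```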

@@ -90,7 +90,7 @@ class CSVLoader(BaseLoader):
     def __read_file(self, csvfile: TextIOWrapper) -> List[Document]:
         docs = []
-        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
+        csv_reader = csv.DictReader(csvfile, **self.csv_args)
         for i, row in enumerate(csv_reader):
             try:
                 source = (

@@ -122,10 +122,10 @@ class DirectoryLoader(BaseLoader):
         if self.sample_size > 0:
             if self.randomize_sample:
-                randomizer = (
-                    random.Random(self.sample_seed) if self.sample_seed else random
-                )
-                randomizer.shuffle(items)  # type: ignore
+                randomizer = random.Random(
+                    self.sample_seed if self.sample_seed else None
+                )
+                randomizer.shuffle(items)
             items = items[: min(len(items), self.sample_size)]

         pbar = None
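
The old code could leave `randomizer` bound to the `random` module itself, which is why `shuffle` needed an ignore; always constructing a `random.Random` (which seeds from OS entropy when given `None`) gives mypy one concrete type. A small sketch of the same idea, with an illustrative function name and parameters:

```python
import random
from typing import List, Optional


def shuffled_sample(
    items: List[str], size: int, seed: Optional[int] = None
) -> List[str]:
    # random.Random(None) seeds from system entropy, so one Random instance
    # covers both the seeded and unseeded cases and shuffle() type-checks.
    randomizer = random.Random(seed)
    result = list(items)
    randomizer.shuffle(result)
    return result[: min(len(result), size)]
```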

@@ -41,7 +41,7 @@ class GitLoader(BaseLoader):
     def load(self) -> List[Document]:
         try:
-            from git import Blob, Repo  # type: ignore
+            from git import Blob, Repo
         except ImportError as ex:
             raise ImportError(
                 "Could not import git python package. "
@@ -76,7 +76,7 @@ class GitLoader(BaseLoader):
             file_path = os.path.join(self.repo_path, item.path)
-            ignored_files = repo.ignored([file_path])  # type: ignore
+            ignored_files = repo.ignored([file_path])
             if len(ignored_files):
                 continue

@@ -1,4 +1,4 @@
-from typing import Any, List
+from typing import Any, List, Tuple

 from langchain_community.document_loaders.parsers.language.code_segmenter import (
     CodeSegmenter,
@@ -55,15 +55,18 @@ class JavaScriptSegmenter(CodeSegmenter):
         tree = esprima.parseScript(self.code, loc=True)
         simplified_lines = self.source_lines[:]

+        indices_to_del: List[Tuple[int, int]] = []
         for node in tree.body:
             if isinstance(
                 node,
                 (esprima.nodes.FunctionDeclaration, esprima.nodes.ClassDeclaration),
             ):
-                start = node.loc.start.line - 1
+                start, end = node.loc.start.line - 1, node.loc.end.line
                 simplified_lines[start] = f"// Code for: {simplified_lines[start]}"
-                for line_num in range(start + 1, node.loc.end.line):
-                    simplified_lines[line_num] = None  # type: ignore
+                indices_to_del.append((start + 1, end))

-        return "\n".join(line for line in simplified_lines if line is not None)
+        for start, end in reversed(indices_to_del):
+            del simplified_lines[start + 0 : end]
+
+        return "\n".join(line for line in simplified_lines)

@@ -1,5 +1,5 @@
 import ast
-from typing import Any, List
+from typing import Any, List, Tuple

 from langchain_community.document_loaders.parsers.language.code_segmenter import (
     CodeSegmenter,
@@ -39,13 +39,15 @@ class PythonSegmenter(CodeSegmenter):
         tree = ast.parse(self.code)
         simplified_lines = self.source_lines[:]

+        indices_to_del: List[Tuple[int, int]] = []
         for node in ast.iter_child_nodes(tree):
             if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
-                start = node.lineno - 1
+                start, end = node.lineno - 1, node.end_lineno
                 simplified_lines[start] = f"# Code for: {simplified_lines[start]}"
-                assert isinstance(node.end_lineno, int)
-                for line_num in range(start + 1, node.end_lineno):
-                    simplified_lines[line_num] = None  # type: ignore
+                assert isinstance(end, int)
+                indices_to_del.append((start + 1, end))

-        return "\n".join(line for line in simplified_lines if line is not None)
+        for start, end in reversed(indices_to_del):
+            del simplified_lines[start + 0 : end]
+
+        return "\n".join(simplified_lines)

@@ -318,7 +318,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
         output_string = StringIO()
         with open_filename(self.file_path, "rb") as fp:
             extract_text_to_fp(
-                fp,  # type: ignore[arg-type]
+                fp,
                 output_string,
                 codec="",
                 laparams=LAParams(),

@@ -215,6 +215,11 @@ class RecursiveUrlLoader(BaseLoader):
             visited: A set of visited URLs.
             depth: To reach the current url, how many pages have been visited.
         """
+        if not self.use_async or not self._lock:
+            raise ValueError(
+                "Async functions forbidden when not initialized with `use_async`"
+            )
         try:
             import aiohttp
         except ImportError:
@@ -237,7 +242,7 @@ class RecursiveUrlLoader(BaseLoader):
                         headers=self.headers,
                     )
                 )
-        async with self._lock:  # type: ignore
+        async with self._lock:
             visited.add(url)
         try:
             async with session.get(url) as response:
@@ -277,7 +282,7 @@ class RecursiveUrlLoader(BaseLoader):
         # Recursively call the function to get the children of the children
         sub_tasks = []
-        async with self._lock:  # type: ignore
+        async with self._lock:
             to_visit = set(sub_links).difference(visited)
             for link in to_visit:
                 sub_tasks.append(
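
Beyond failing fast on misuse, the added guard narrows `self._lock` from `Optional[asyncio.Lock]` to `asyncio.Lock` for mypy, which is why the later `async with self._lock:` blocks no longer need ignores. A minimal sketch of that narrowing pattern, assuming a lock that only exists in async mode; the class and method names here are illustrative, not the loader's API:

```python
import asyncio
from typing import Optional, Set


class AsyncOnlyVisitor:
    def __init__(self, use_async: bool = False) -> None:
        self.use_async = use_async
        # The lock is only created in async mode, hence Optional.
        self._lock: Optional[asyncio.Lock] = asyncio.Lock() if use_async else None

    async def visit(self, url: str, visited: Set[str]) -> None:
        # Raising when the lock is missing narrows self._lock to asyncio.Lock,
        # so the "async with" below needs no "# type: ignore".
        if not self.use_async or not self._lock:
            raise ValueError(
                "Async functions forbidden when not initialized with `use_async`"
            )
        async with self._lock:
            visited.add(url)
```

Calling `visit` on an instance created with `use_async=False` raises immediately instead of failing later inside `async with None`.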