diff --git a/libs/community/langchain_community/document_loaders/arcgis_loader.py b/libs/community/langchain_community/document_loaders/arcgis_loader.py index 4958ef2ce6a..24264b0add7 100644 --- a/libs/community/langchain_community/document_loaders/arcgis_loader.py +++ b/libs/community/langchain_community/document_loaders/arcgis_loader.py @@ -41,7 +41,7 @@ class ArcGISLoader(BaseLoader): ) from e try: - from bs4 import BeautifulSoup # type: ignore + from bs4 import BeautifulSoup self.BEAUTIFULSOUP = BeautifulSoup except ImportError: diff --git a/libs/community/langchain_community/document_loaders/assemblyai.py b/libs/community/langchain_community/document_loaders/assemblyai.py index ee040ab513d..3b1b4060b98 100644 --- a/libs/community/langchain_community/document_loaders/assemblyai.py +++ b/libs/community/langchain_community/document_loaders/assemblyai.py @@ -123,7 +123,9 @@ class AssemblyAIAudioLoaderById(BaseLoader): """ - def __init__(self, transcript_id, api_key, transcript_format): # type: ignore[no-untyped-def] + def __init__( + self, transcript_id: str, api_key: str, transcript_format: TranscriptFormat + ): """ Initializes the AssemblyAI AssemblyAIAudioLoaderById. diff --git a/libs/community/langchain_community/document_loaders/astradb.py b/libs/community/langchain_community/document_loaders/astradb.py index 4cc1621a39d..8a78e287a6e 100644 --- a/libs/community/langchain_community/document_loaders/astradb.py +++ b/libs/community/langchain_community/document_loaders/astradb.py @@ -65,7 +65,7 @@ class AstraDBLoader(BaseLoader): return list(self.lazy_load()) def lazy_load(self) -> Iterator[Document]: - queue = Queue(self.nb_prefetched) # type: ignore[var-annotated] + queue = Queue(self.nb_prefetched) # type: ignore t = threading.Thread(target=self.fetch_results, args=(queue,)) t.start() while True: diff --git a/libs/community/langchain_community/document_loaders/blob_loaders/file_system.py b/libs/community/langchain_community/document_loaders/blob_loaders/file_system.py index d9c8ebf8833..ee756f32ed7 100644 --- a/libs/community/langchain_community/document_loaders/blob_loaders/file_system.py +++ b/libs/community/langchain_community/document_loaders/blob_loaders/file_system.py @@ -12,6 +12,7 @@ def _make_iterator( length_func: Callable[[], int], show_progress: bool = False ) -> Callable[[Iterable[T]], Iterator[T]]: """Create a function that optionally wraps an iterable in tqdm.""" + iterator: Callable[[Iterable[T]], Iterator[T]] if show_progress: try: from tqdm.auto import tqdm @@ -29,7 +30,7 @@ def _make_iterator( iterator = _with_tqdm else: - iterator = iter # type: ignore + iterator = iter return iterator diff --git a/libs/community/langchain_community/document_loaders/csv_loader.py b/libs/community/langchain_community/document_loaders/csv_loader.py index 92198ac5bac..28d340e5fbd 100644 --- a/libs/community/langchain_community/document_loaders/csv_loader.py +++ b/libs/community/langchain_community/document_loaders/csv_loader.py @@ -90,7 +90,7 @@ class CSVLoader(BaseLoader): def __read_file(self, csvfile: TextIOWrapper) -> List[Document]: docs = [] - csv_reader = csv.DictReader(csvfile, **self.csv_args) # type: ignore + csv_reader = csv.DictReader(csvfile, **self.csv_args) for i, row in enumerate(csv_reader): try: source = ( diff --git a/libs/community/langchain_community/document_loaders/directory.py b/libs/community/langchain_community/document_loaders/directory.py index 3837133a693..7cfa456487c 100644 --- a/libs/community/langchain_community/document_loaders/directory.py +++ b/libs/community/langchain_community/document_loaders/directory.py @@ -122,10 +122,10 @@ class DirectoryLoader(BaseLoader): if self.sample_size > 0: if self.randomize_sample: - randomizer = ( - random.Random(self.sample_seed) if self.sample_seed else random + randomizer = random.Random( + self.sample_seed if self.sample_seed else None ) - randomizer.shuffle(items) # type: ignore + randomizer.shuffle(items) items = items[: min(len(items), self.sample_size)] pbar = None diff --git a/libs/community/langchain_community/document_loaders/git.py b/libs/community/langchain_community/document_loaders/git.py index 97c02b2111b..8d97ecd3df4 100644 --- a/libs/community/langchain_community/document_loaders/git.py +++ b/libs/community/langchain_community/document_loaders/git.py @@ -41,7 +41,7 @@ class GitLoader(BaseLoader): def load(self) -> List[Document]: try: - from git import Blob, Repo # type: ignore + from git import Blob, Repo except ImportError as ex: raise ImportError( "Could not import git python package. " @@ -76,7 +76,7 @@ class GitLoader(BaseLoader): file_path = os.path.join(self.repo_path, item.path) - ignored_files = repo.ignored([file_path]) # type: ignore + ignored_files = repo.ignored([file_path]) if len(ignored_files): continue diff --git a/libs/community/langchain_community/document_loaders/parsers/language/javascript.py b/libs/community/langchain_community/document_loaders/parsers/language/javascript.py index 0f2fea68fa2..27a360a2e6c 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/javascript.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/javascript.py @@ -1,4 +1,4 @@ -from typing import Any, List +from typing import Any, List, Tuple from langchain_community.document_loaders.parsers.language.code_segmenter import ( CodeSegmenter, @@ -55,15 +55,18 @@ class JavaScriptSegmenter(CodeSegmenter): tree = esprima.parseScript(self.code, loc=True) simplified_lines = self.source_lines[:] + indices_to_del: List[Tuple[int, int]] = [] for node in tree.body: if isinstance( node, (esprima.nodes.FunctionDeclaration, esprima.nodes.ClassDeclaration), ): - start = node.loc.start.line - 1 + start, end = node.loc.start.line - 1, node.loc.end.line simplified_lines[start] = f"// Code for: {simplified_lines[start]}" - for line_num in range(start + 1, node.loc.end.line): - simplified_lines[line_num] = None # type: ignore + indices_to_del.append((start + 1, end)) - return "\n".join(line for line in simplified_lines if line is not None) + for start, end in reversed(indices_to_del): + del simplified_lines[start + 0 : end] + + return "\n".join(line for line in simplified_lines) diff --git a/libs/community/langchain_community/document_loaders/parsers/language/python.py b/libs/community/langchain_community/document_loaders/parsers/language/python.py index ca810946bd4..52dbc68352a 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/python.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/python.py @@ -1,5 +1,5 @@ import ast -from typing import Any, List +from typing import Any, List, Tuple from langchain_community.document_loaders.parsers.language.code_segmenter import ( CodeSegmenter, @@ -39,13 +39,15 @@ class PythonSegmenter(CodeSegmenter): tree = ast.parse(self.code) simplified_lines = self.source_lines[:] + indices_to_del: List[Tuple[int, int]] = [] for node in ast.iter_child_nodes(tree): if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): - start = node.lineno - 1 + start, end = node.lineno - 1, node.end_lineno simplified_lines[start] = f"# Code for: {simplified_lines[start]}" + assert isinstance(end, int) + indices_to_del.append((start + 1, end)) - assert isinstance(node.end_lineno, int) - for line_num in range(start + 1, node.end_lineno): - simplified_lines[line_num] = None # type: ignore + for start, end in reversed(indices_to_del): + del simplified_lines[start + 0 : end] - return "\n".join(line for line in simplified_lines if line is not None) + return "\n".join(simplified_lines) diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 60c2e7fe482..ea086054110 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -318,7 +318,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader): output_string = StringIO() with open_filename(self.file_path, "rb") as fp: extract_text_to_fp( - fp, # type: ignore[arg-type] + fp, output_string, codec="", laparams=LAParams(), diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py index c24ab1730fd..08887659841 100644 --- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py +++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py @@ -215,6 +215,11 @@ class RecursiveUrlLoader(BaseLoader): visited: A set of visited URLs. depth: To reach the current url, how many pages have been visited. """ + if not self.use_async or not self._lock: + raise ValueError( + "Async functions forbidden when not initialized with `use_async`" + ) + try: import aiohttp except ImportError: @@ -237,7 +242,7 @@ class RecursiveUrlLoader(BaseLoader): headers=self.headers, ) ) - async with self._lock: # type: ignore + async with self._lock: visited.add(url) try: async with session.get(url) as response: @@ -277,7 +282,7 @@ class RecursiveUrlLoader(BaseLoader): # Recursively call the function to get the children of the children sub_tasks = [] - async with self._lock: # type: ignore + async with self._lock: to_visit = set(sub_links).difference(visited) for link in to_visit: sub_tasks.append(