community[patch]: doc loaders mypy fixes (#17368)

**Description:** Fixed `type: ignore` comments for mypy in some document loaders.
**Issue:** [Remove "type: ignore" comments #17048](https://github.com/langchain-ai/langchain/issues/17048)

Co-authored-by: Robby <h0rv@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>

commit ece4b43a81 (parent 0653aa469a)
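The pattern throughout this commit is to replace blanket `# type: ignore` comments with real annotations, so mypy can actually check call sites. A minimal before/after sketch of that pattern (the class and parameters below are illustrative, not taken from the diff):

```python
from typing import Optional


class ExampleLoader:  # hypothetical loader, for illustration only
    # Before: def __init__(self, transcript_id, api_key):  # type: ignore[no-untyped-def]
    # After: explicit parameter and return annotations, no ignore needed.
    def __init__(self, transcript_id: str, api_key: Optional[str] = None) -> None:
        self.transcript_id = transcript_id
        self.api_key = api_key


loader = ExampleLoader("abc123")
print(loader.transcript_id)
```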
```diff
@@ -41,7 +41,7 @@ class ArcGISLoader(BaseLoader):
             ) from e
 
         try:
-            from bs4 import BeautifulSoup  # type: ignore
+            from bs4 import BeautifulSoup
 
             self.BEAUTIFULSOUP = BeautifulSoup
         except ImportError:
```
```diff
@@ -123,7 +123,9 @@ class AssemblyAIAudioLoaderById(BaseLoader):
     """
 
-    def __init__(self, transcript_id, api_key, transcript_format):  # type: ignore[no-untyped-def]
+    def __init__(
+        self, transcript_id: str, api_key: str, transcript_format: TranscriptFormat
+    ):
        """
        Initializes the AssemblyAI AssemblyAIAudioLoaderById.
 
```
```diff
@@ -65,7 +65,7 @@ class AstraDBLoader(BaseLoader):
         return list(self.lazy_load())
 
     def lazy_load(self) -> Iterator[Document]:
-        queue = Queue(self.nb_prefetched)  # type: ignore[var-annotated]
+        queue = Queue(self.nb_prefetched)  # type: ignore
         t = threading.Thread(target=self.fetch_results, args=(queue,))
         t.start()
         while True:
```
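Note that this hunk broadens rather than removes the ignore: `Queue()` without an annotation trips mypy's `var-annotated` error because the element type cannot be inferred from the constructor call. A sketch of the annotation that would silence it without any ignore (the `str` element type is an assumed placeholder, not specified by the diff):

```python
from queue import Queue

# An explicit annotation gives mypy the element type it cannot infer
# from Queue(maxsize) alone.
queue: "Queue[str]" = Queue(maxsize=20)
queue.put("doc-1")
print(queue.get())
```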
```diff
@@ -12,6 +12,7 @@ def _make_iterator(
     length_func: Callable[[], int], show_progress: bool = False
 ) -> Callable[[Iterable[T]], Iterator[T]]:
     """Create a function that optionally wraps an iterable in tqdm."""
+    iterator: Callable[[Iterable[T]], Iterator[T]]
     if show_progress:
         try:
             from tqdm.auto import tqdm
```
```diff
@@ -29,7 +30,7 @@ def _make_iterator(
 
         iterator = _with_tqdm
     else:
-        iterator = iter  # type: ignore
+        iterator = iter
 
     return iterator
 
```
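The `iterator = iter` branch can drop its ignore because of the `iterator: Callable[...]` declaration added in the previous hunk: declaring the unifying type up front stops mypy from inferring a narrower type from whichever branch assigns first. The same pattern in a self-contained sketch:

```python
from typing import Callable, Iterable, Iterator, TypeVar

T = TypeVar("T")


def make_iterator(show_progress: bool) -> Callable[[Iterable[T]], Iterator[T]]:
    # Declare the type once; both branches then assign into it cleanly.
    iterator: Callable[[Iterable[T]], Iterator[T]]
    if show_progress:

        def _verbose(items: Iterable[T]) -> Iterator[T]:
            for item in items:
                print("item:", item)
                yield item

        iterator = _verbose
    else:
        iterator = iter
    return iterator


print(list(make_iterator(False)([1, 2, 3])))  # [1, 2, 3]
```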
```diff
@@ -90,7 +90,7 @@ class CSVLoader(BaseLoader):
     def __read_file(self, csvfile: TextIOWrapper) -> List[Document]:
         docs = []
 
-        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
+        csv_reader = csv.DictReader(csvfile, **self.csv_args)
         for i, row in enumerate(csv_reader):
             try:
                 source = (
```
```diff
@@ -122,10 +122,10 @@ class DirectoryLoader(BaseLoader):
 
         if self.sample_size > 0:
             if self.randomize_sample:
-                randomizer = (
-                    random.Random(self.sample_seed) if self.sample_seed else random
+                randomizer = random.Random(
+                    self.sample_seed if self.sample_seed else None
                 )
-                randomizer.shuffle(items)  # type: ignore
+                randomizer.shuffle(items)
             items = items[: min(len(items), self.sample_size)]
 
         pbar = None
```
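The old version bound `randomizer` to either a `random.Random` instance or the `random` module itself, a union that forced the ignore on `shuffle`. Passing the seed (or `None`) into the constructor keeps the behavior, since `random.Random(None)` seeds from system entropy exactly like the module-level functions, while giving `randomizer` one concrete type:

```python
import random

sample_seed = 42  # or None for entropy-based seeding, as in the loader

# One concrete type either way: Random(42) is reproducible,
# Random(None) self-seeds like the bare module functions.
randomizer = random.Random(sample_seed if sample_seed else None)

items = list(range(10))
randomizer.shuffle(items)
print(items)
```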
```diff
@@ -41,7 +41,7 @@ class GitLoader(BaseLoader):
 
     def load(self) -> List[Document]:
         try:
-            from git import Blob, Repo  # type: ignore
+            from git import Blob, Repo
         except ImportError as ex:
             raise ImportError(
                 "Could not import git python package. "
```
```diff
@@ -76,7 +76,7 @@ class GitLoader(BaseLoader):
 
             file_path = os.path.join(self.repo_path, item.path)
 
-            ignored_files = repo.ignored([file_path])  # type: ignore
+            ignored_files = repo.ignored([file_path])
             if len(ignored_files):
                 continue
 
```
```diff
@@ -1,4 +1,4 @@
-from typing import Any, List
+from typing import Any, List, Tuple
 
 from langchain_community.document_loaders.parsers.language.code_segmenter import (
     CodeSegmenter,
```
```diff
@@ -55,15 +55,18 @@ class JavaScriptSegmenter(CodeSegmenter):
         tree = esprima.parseScript(self.code, loc=True)
         simplified_lines = self.source_lines[:]
 
+        indices_to_del: List[Tuple[int, int]] = []
         for node in tree.body:
             if isinstance(
                 node,
                 (esprima.nodes.FunctionDeclaration, esprima.nodes.ClassDeclaration),
             ):
-                start = node.loc.start.line - 1
+                start, end = node.loc.start.line - 1, node.loc.end.line
                 simplified_lines[start] = f"// Code for: {simplified_lines[start]}"
 
-                for line_num in range(start + 1, node.loc.end.line):
-                    simplified_lines[line_num] = None  # type: ignore
+                indices_to_del.append((start + 1, end))
 
-        return "\n".join(line for line in simplified_lines if line is not None)
+        for start, end in reversed(indices_to_del):
+            del simplified_lines[start + 0 : end]
+
+        return "\n".join(line for line in simplified_lines)
```
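Overwriting skipped lines with `None` forced `simplified_lines` to hold `Optional` elements, hence the old ignore. The rewrite instead records `(start, end)` ranges and deletes them afterwards, walking the ranges in reverse so each deletion leaves earlier indices intact. The idiom in isolation:

```python
from typing import List, Tuple

lines: List[str] = ["a", "b", "c", "d", "e", "f"]
indices_to_del: List[Tuple[int, int]] = [(1, 3), (4, 6)]  # half-open ranges

# Deleting back-to-front keeps earlier ranges valid despite
# the index shifts each deletion causes.
for start, end in reversed(indices_to_del):
    del lines[start:end]

print(lines)  # ['a', 'd']
```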
```diff
@@ -1,5 +1,5 @@
 import ast
-from typing import Any, List
+from typing import Any, List, Tuple
 
 from langchain_community.document_loaders.parsers.language.code_segmenter import (
     CodeSegmenter,
```
```diff
@@ -39,13 +39,15 @@ class PythonSegmenter(CodeSegmenter):
         tree = ast.parse(self.code)
         simplified_lines = self.source_lines[:]
 
+        indices_to_del: List[Tuple[int, int]] = []
         for node in ast.iter_child_nodes(tree):
             if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
-                start = node.lineno - 1
+                start, end = node.lineno - 1, node.end_lineno
                 simplified_lines[start] = f"# Code for: {simplified_lines[start]}"
+                assert isinstance(end, int)
+                indices_to_del.append((start + 1, end))
 
-                assert isinstance(node.end_lineno, int)
-                for line_num in range(start + 1, node.end_lineno):
-                    simplified_lines[line_num] = None  # type: ignore
+        for start, end in reversed(indices_to_del):
+            del simplified_lines[start + 0 : end]
 
-        return "\n".join(line for line in simplified_lines if line is not None)
+        return "\n".join(simplified_lines)
```
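The surviving `assert isinstance(end, int)` is not dead weight: typeshed declares `end_lineno` as `Optional[int]`, and the assert narrows it to `int` for mypy before the tuple is stored. The same narrowing in a standalone snippet:

```python
import ast
from typing import Optional

node = ast.parse("def f() -> None:\n    pass\n").body[0]

end: Optional[int] = node.end_lineno
# Without the assert, mypy flags Optional[int] wherever int is required.
assert isinstance(end, int)
line_count: int = end - (node.lineno - 1)
print(line_count)  # 2
```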
```diff
@@ -318,7 +318,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
         output_string = StringIO()
         with open_filename(self.file_path, "rb") as fp:
             extract_text_to_fp(
-                fp,  # type: ignore[arg-type]
+                fp,
                 output_string,
                 codec="",
                 laparams=LAParams(),
```
```diff
@@ -215,6 +215,11 @@ class RecursiveUrlLoader(BaseLoader):
             visited: A set of visited URLs.
             depth: To reach the current url, how many pages have been visited.
         """
+        if not self.use_async or not self._lock:
+            raise ValueError(
+                "Async functions forbidden when not initialized with `use_async`"
+            )
+
         try:
             import aiohttp
         except ImportError:
```
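The added guard is what lets the two `async with self._lock:` ignores below disappear: raising early when the lock is unset narrows `self._lock` from `Optional` for the rest of the method. A compact sketch of the pattern (the class and method here are illustrative, not the loader's real API):

```python
import asyncio
from typing import Optional, Set


class Crawler:  # hypothetical, for illustration only
    def __init__(self, use_async: bool = True) -> None:
        self.use_async = use_async
        self._lock: Optional[asyncio.Lock] = asyncio.Lock() if use_async else None

    async def visit(self, url: str, visited: Set[str]) -> None:
        # Guard once; mypy narrows self._lock to asyncio.Lock below,
        # so the "async with" needs no "# type: ignore".
        if not self.use_async or not self._lock:
            raise ValueError("async use requires use_async=True")
        async with self._lock:
            visited.add(url)


async def main() -> None:
    visited: Set[str] = set()
    await Crawler().visit("https://example.com", visited)
    print(visited)


asyncio.run(main())
```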
```diff
@@ -237,7 +242,7 @@ class RecursiveUrlLoader(BaseLoader):
                 headers=self.headers,
             )
         )
-        async with self._lock:  # type: ignore
+        async with self._lock:
             visited.add(url)
         try:
             async with session.get(url) as response:
```
```diff
@@ -277,7 +282,7 @@ class RecursiveUrlLoader(BaseLoader):
 
         # Recursively call the function to get the children of the children
         sub_tasks = []
-        async with self._lock:  # type: ignore
+        async with self._lock:
             to_visit = set(sub_links).difference(visited)
             for link in to_visit:
                 sub_tasks.append(
```