mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-31 02:11:09 +00:00
community: Lazy load Wikipedia dump file (#15111)
**Description:** The MWDumpLoader implementation does not currently support the `lazy_load` method, even though the dump files it reads are usually very large. We propose refactoring the `load` function by extracting two private functions, one that loads the dump file and one that parses a single page, so that the same code can be reused in the `lazy_load` implementation.
This commit is contained in:
committed by
GitHub
parent
619cd3ce54
commit
be578f32be
@@ -1,6 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Sequence, Union
|
from typing import Iterator, List, Optional, Sequence, Union
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@@ -60,37 +60,55 @@ class MWDumpLoader(BaseLoader):
|
|||||||
self.skip_redirects = skip_redirects
|
self.skip_redirects = skip_redirects
|
||||||
self.stop_on_error = stop_on_error
|
self.stop_on_error = stop_on_error
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def _load_dump_file(self):
|
||||||
"""Load from a file path."""
|
|
||||||
try:
|
try:
|
||||||
import mwparserfromhell
|
|
||||||
import mwxml
|
import mwxml
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Unable to import 'mwparserfromhell' or 'mwxml'. Please install with"
|
"Unable to import 'mwxml'. Please install with" " `pip install mwxml`."
|
||||||
" `pip install mwparserfromhell mwxml`."
|
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
dump = mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))
|
return mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))
|
||||||
|
|
||||||
|
def _load_single_page_from_dump(self, page) -> Document:
|
||||||
|
"""Parse a single page."""
|
||||||
|
try:
|
||||||
|
import mwparserfromhell
|
||||||
|
except ImportError as e:
|
||||||
|
raise ImportError(
|
||||||
|
"Unable to import 'mwparserfromhell'. Please install with"
|
||||||
|
" `pip install mwparserfromhell`."
|
||||||
|
) from e
|
||||||
|
for revision in page:
|
||||||
|
code = mwparserfromhell.parse(revision.text)
|
||||||
|
text = code.strip_code(
|
||||||
|
normalize=True, collapse=True, keep_template_params=False
|
||||||
|
)
|
||||||
|
metadata = {"source": page.title}
|
||||||
|
return Document(page_content=text, metadata=metadata)
|
||||||
|
|
||||||
|
def load(self) -> List[Document]:
|
||||||
|
"""Load from a file path."""
|
||||||
|
|
||||||
|
return [doc for doc in self.lazy_load()]
|
||||||
|
|
||||||
|
def lazy_load(
|
||||||
|
self,
|
||||||
|
) -> Iterator[Document]:
|
||||||
|
"""Lazy load from a file path."""
|
||||||
|
|
||||||
|
dump = self._load_dump_file()
|
||||||
|
|
||||||
docs = []
|
|
||||||
for page in dump.pages:
|
for page in dump.pages:
|
||||||
if self.skip_redirects and page.redirect:
|
if self.skip_redirects and page.redirect:
|
||||||
continue
|
continue
|
||||||
if self.namespaces and page.namespace not in self.namespaces:
|
if self.namespaces and page.namespace not in self.namespaces:
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
for revision in page:
|
yield self._load_single_page_from_dump(page)
|
||||||
code = mwparserfromhell.parse(revision.text)
|
|
||||||
text = code.strip_code(
|
|
||||||
normalize=True, collapse=True, keep_template_params=False
|
|
||||||
)
|
|
||||||
metadata = {"source": page.title}
|
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Parsing error: {}".format(e))
|
logger.error("Parsing error: {}".format(e))
|
||||||
if self.stop_on_error:
|
if self.stop_on_error:
|
||||||
raise e
|
raise e
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
return docs
|
|
||||||
|
Reference in New Issue
Block a user