mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-26 21:11:25 +00:00
Fix review
This commit is contained in:
parent
b76e9bd579
commit
496a933f68
@ -1470,6 +1470,17 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
|
|
||||||
return extract_from_images_with_rapidocr(images)
|
return extract_from_images_with_rapidocr(images)
|
||||||
|
|
||||||
|
_map_extract_tables: Dict[Literal["markdown", "html", None], str] = {
|
||||||
|
"markdown": "",
|
||||||
|
"html": "But, use html syntax for convert all tables. ",
|
||||||
|
}
|
||||||
|
_map_extract_images = {
|
||||||
|
RapidOCRBlobParser: "",
|
||||||
|
TesseractBlobParser: "",
|
||||||
|
LLMImageBlobParser: "If you come across a picture, "
|
||||||
|
"diagram or other illustration, "
|
||||||
|
"describe it. ",
|
||||||
|
}
|
||||||
|
|
||||||
class ZeroxPDFParser(BaseBlobParser):
|
class ZeroxPDFParser(BaseBlobParser):
|
||||||
"""Parse a blob from a PDF using `py-zerox` library.
|
"""Parse a blob from a PDF using `py-zerox` library.
|
||||||
@ -1520,25 +1531,8 @@ class ZeroxPDFParser(BaseBlobParser):
|
|||||||
print(docs[0].page_content[:100])
|
print(docs[0].page_content[:100])
|
||||||
print(docs[0].metadata)
|
print(docs[0].metadata)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
warnings.filterwarnings(
|
|
||||||
"ignore",
|
|
||||||
module=r"^pyzerox.models.modellitellm$",
|
|
||||||
message=r"\s*Custom system prompt was provided which.*",
|
|
||||||
)
|
|
||||||
_warn_images_to_text = False
|
_warn_images_to_text = False
|
||||||
_warn_creator = False
|
_warn_creator = False
|
||||||
_map_extract_tables: Dict[Literal["markdown", "html", None], str] = {
|
|
||||||
"markdown": "",
|
|
||||||
"html": "But, use html syntax for convert all tables. ",
|
|
||||||
}
|
|
||||||
_map_extract_images = {
|
|
||||||
RapidOCRBlobParser: "",
|
|
||||||
TesseractBlobParser: "",
|
|
||||||
LLMImageBlobParser: "If you come across a picture, "
|
|
||||||
"diagram or other illustration, "
|
|
||||||
"describe it. ",
|
|
||||||
}
|
|
||||||
_prompt = (
|
_prompt = (
|
||||||
"Convert the following PDF page to markdown. "
|
"Convert the following PDF page to markdown. "
|
||||||
"{prompt_tables}"
|
"{prompt_tables}"
|
||||||
@ -1680,10 +1674,10 @@ class ZeroxPDFParser(BaseBlobParser):
|
|||||||
zerox_prompt = self.custom_system_prompt
|
zerox_prompt = self.custom_system_prompt
|
||||||
|
|
||||||
if not zerox_prompt and self.images_parser or self.extract_tables:
|
if not zerox_prompt and self.images_parser or self.extract_tables:
|
||||||
prompt_tables = ZeroxPDFParser._map_extract_tables[self.extract_tables]
|
prompt_tables = _map_extract_tables[self.extract_tables]
|
||||||
clazz = self.images_parser.__class__
|
clazz = self.images_parser.__class__
|
||||||
if clazz in ZeroxPDFParser._map_extract_images:
|
if clazz in _map_extract_images:
|
||||||
prompt_images = ZeroxPDFParser._map_extract_images[clazz]
|
prompt_images = _map_extract_images[clazz]
|
||||||
else:
|
else:
|
||||||
if not ZeroxPDFParser._warn_creator:
|
if not ZeroxPDFParser._warn_creator:
|
||||||
ZeroxPDFParser._warn_creator = True
|
ZeroxPDFParser._warn_creator = True
|
||||||
|
Loading…
Reference in New Issue
Block a user