mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-25 08:03:39 +00:00
Update key convention strategy in metadata.
This commit is contained in:
parent
cdd366f66d
commit
f30ac5ddeb
@ -1375,48 +1375,6 @@ class PyPDFium2Parser(BaseBlobParser):
|
|||||||
return _FORMAT_IMAGE_STR.format(image_text=_JOIN_IMAGES.join(str_images))
|
return _FORMAT_IMAGE_STR.format(image_text=_JOIN_IMAGES.join(str_images))
|
||||||
|
|
||||||
|
|
||||||
# The legacy PDFPlumberParser use key with upper case.
|
|
||||||
# This is not aligned with the new convention, which requires the key to be in
|
|
||||||
# lower case.
|
|
||||||
class _PDFPlumberParserMetadata(dict):
|
|
||||||
_warning_keys: set[str] = set()
|
|
||||||
|
|
||||||
def __init__(self, d: dict[str, Any]):
|
|
||||||
super().__init__({k.lower(): v for k, v in d.items()})
|
|
||||||
self._pdf_metadata_keys = set(d.keys())
|
|
||||||
|
|
||||||
def _lower(self, k: object) -> object:
|
|
||||||
assert isinstance(k, str)
|
|
||||||
if k in self._pdf_metadata_keys:
|
|
||||||
lk = k.lower()
|
|
||||||
if lk != k:
|
|
||||||
if k not in _PDFPlumberParserMetadata._warning_keys:
|
|
||||||
_PDFPlumberParserMetadata._warning_keys.add(str(k))
|
|
||||||
logger.warning(
|
|
||||||
'The key "%s" with uppercase is deprecated. '
|
|
||||||
"Update your code and vectorstore.",
|
|
||||||
k,
|
|
||||||
)
|
|
||||||
return lk
|
|
||||||
else:
|
|
||||||
return k
|
|
||||||
|
|
||||||
def __contains__(self, k: object) -> bool:
|
|
||||||
return super().__contains__(self._lower(k))
|
|
||||||
|
|
||||||
def __delitem__(self, k: object) -> None:
|
|
||||||
super().__delitem__(self._lower(k))
|
|
||||||
|
|
||||||
def __getitem__(self, k: object) -> Any:
|
|
||||||
return super().__getitem__(self._lower(k))
|
|
||||||
|
|
||||||
def get(self, k: object, default: Any = None) -> Any:
|
|
||||||
return super().get(self._lower(str(k)), default)
|
|
||||||
|
|
||||||
def __setitem__(self, k: object, v: Any) -> None:
|
|
||||||
super().__setitem__(self._lower(str(k)), v)
|
|
||||||
|
|
||||||
|
|
||||||
class PDFPlumberParser(BaseBlobParser):
|
class PDFPlumberParser(BaseBlobParser):
|
||||||
"""Parse a blob from a PDF using `pdfplumber` library.
|
"""Parse a blob from a PDF using `pdfplumber` library.
|
||||||
|
|
||||||
@ -1564,7 +1522,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
from pdfplumber.utils import geometry # import WordExctractor, TextMap
|
from pdfplumber.utils import geometry # import WordExctractor, TextMap
|
||||||
|
|
||||||
contents = []
|
contents = []
|
||||||
doc_metadata = _purge_metadata(
|
doc_metadata = doc.metadata | _purge_metadata(
|
||||||
(
|
(
|
||||||
doc.metadata
|
doc.metadata
|
||||||
| {
|
| {
|
||||||
@ -1574,6 +1532,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
for page in doc.pages:
|
for page in doc.pages:
|
||||||
tables_bbox: list[tuple[float, float, float, float]] = (
|
tables_bbox: list[tuple[float, float, float, float]] = (
|
||||||
self._extract_tables_bbox_from_page(page)
|
self._extract_tables_bbox_from_page(page)
|
||||||
@ -1618,12 +1577,10 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
yield Document(
|
yield Document(
|
||||||
page_content=all_text,
|
page_content=all_text,
|
||||||
metadata=_validate_metadata(
|
metadata=_validate_metadata(
|
||||||
_PDFPlumberParserMetadata(
|
|
||||||
doc_metadata
|
doc_metadata
|
||||||
| {
|
| {
|
||||||
"page": page.page_number - 1,
|
"page": page.page_number - 1,
|
||||||
}
|
}
|
||||||
)
|
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@ -1638,9 +1595,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
if self.mode == "single":
|
if self.mode == "single":
|
||||||
yield Document(
|
yield Document(
|
||||||
page_content=self.pages_delimiter.join(contents),
|
page_content=self.pages_delimiter.join(contents),
|
||||||
metadata=_validate_metadata(
|
metadata=_validate_metadata(doc_metadata),
|
||||||
_PDFPlumberParserMetadata(doc_metadata)
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _process_page_content(self, page: pdfplumber.page.Page) -> str:
|
def _process_page_content(self, page: pdfplumber.page.Page) -> str:
|
||||||
@ -2073,7 +2028,8 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
|||||||
for idx, page in enumerate(document.pages):
|
for idx, page in enumerate(document.pages):
|
||||||
yield Document(
|
yield Document(
|
||||||
page_content=page.get_text(config=self.linearization_config),
|
page_content=page.get_text(config=self.linearization_config),
|
||||||
metadata={"source": blob.source, "page": idx + 1}, # type: ignore[attr-defined]
|
metadata={"source": blob.source, "page": idx + 1},
|
||||||
|
# type: ignore[attr-defined]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user