mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-13 13:36:15 +00:00
feat: add UnstructuredExcelLoader
for .xlsx
and .xls
files (#5617)
# Unstructured Excel Loader Adds an `UnstructuredExcelLoader` class for `.xlsx` and `.xls` files. Works with `unstructured>=0.6.7`. A plain text representation of the Excel file will be available under the `page_content` attribute in the doc. If you use the loader in `"elements"` mode, an HTML representation of the Excel file will be available under the `text_as_html` metadata key. Each sheet in the Excel document is its own document. ### Testing ```python from langchain.document_loaders import UnstructuredExcelLoader loader = UnstructuredExcelLoader( "example_data/stanley-cups.xlsx", mode="elements" ) docs = loader.load() ``` ## Who can review? @hwchase17 @eyurtsev
This commit is contained in:
15
tests/integration_tests/document_loaders/test_excel.py
Normal file
15
tests/integration_tests/document_loaders/test_excel.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.document_loaders import UnstructuredExcelLoader
|
||||
|
||||
EXAMPLE_DIRECTORY = file_path = Path(__file__).parent.parent / "examples"
|
||||
|
||||
|
||||
def test_unstructured_excel_loader() -> None:
|
||||
"""Test unstructured loader."""
|
||||
file_path = os.path.join(EXAMPLE_DIRECTORY, "stanley-cups.xlsx")
|
||||
loader = UnstructuredExcelLoader(str(file_path))
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
BIN
tests/integration_tests/examples/stanley-cups.xlsx
Normal file
BIN
tests/integration_tests/examples/stanley-cups.xlsx
Normal file
Binary file not shown.
Reference in New Issue
Block a user