mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-13 13:36:15 +00:00
feat: Add UnstructuredXMLLoader
for .xml
files (#5955)
# Unstructured XML Loader

Adds an `UnstructuredXMLLoader` class for .xml files. Works with unstructured>=0.6.7. A plain text representation of the text with the XML tags will be available under the `page_content` attribute in the doc.

### Testing

```python
from langchain.document_loaders import UnstructuredXMLLoader

loader = UnstructuredXMLLoader(
    "example_data/factbook.xml",
)
docs = loader.load()
```

## Who can review?

@hwchase17 @eyurtsev
This commit is contained in:
15
tests/integration_tests/document_loaders/test_xml.py
Normal file
15
tests/integration_tests/document_loaders/test_xml.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.document_loaders import UnstructuredXMLLoader
|
||||
|
||||
# Directory containing the example fixtures (e.g. ``factbook.xml``); resolved
# relative to this file so the test does not depend on the working directory.
EXAMPLE_DIRECTORY = Path(__file__).parent.parent / "examples"
|
||||
|
||||
|
||||
def test_unstructured_xml_loader() -> None:
    """Load the ``factbook.xml`` example and check it yields one document."""
    xml_path = os.path.join(EXAMPLE_DIRECTORY, "factbook.xml")
    documents = UnstructuredXMLLoader(str(xml_path)).load()

    assert len(documents) == 1
|
27
tests/integration_tests/examples/factbook.xml
Normal file
27
tests/integration_tests/examples/factbook.xml
Normal file
@@ -0,0 +1,27 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<factbook>
|
||||
<country>
|
||||
<name>United States</name>
|
||||
<capital>Washington, DC</capital>
|
||||
<leader>Joe Biden</leader>
|
||||
<sport>Baseball</sport>
|
||||
</country>
|
||||
<country>
|
||||
<name>Canada</name>
|
||||
<capital>Ottawa</capital>
|
||||
<leader>Justin Trudeau</leader>
|
||||
<sport>Hockey</sport>
|
||||
</country>
|
||||
<country>
|
||||
<name>France</name>
|
||||
<capital>Paris</capital>
|
||||
<leader>Emmanuel Macron</leader>
|
||||
<sport>Soccer</sport>
|
||||
</country>
|
||||
<country>
|
||||
<name>Trinidad &amp; Tobago</name>
|
||||
<capital>Port of Spain</capital>
|
||||
<leader>Keith Rowley</leader>
|
||||
<sport>Track &amp; Field</sport>
|
||||
</country>
|
||||
</factbook>
|
Reference in New Issue
Block a user