diff --git a/docs/modules/indexes/document_loaders/examples/csv.ipynb b/docs/modules/indexes/document_loaders/examples/csv.ipynb
index 6b62950ba68..e6555437fc2 100644
--- a/docs/modules/indexes/document_loaders/examples/csv.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/csv.ipynb
@@ -29,7 +29,6 @@
"cell_type": "code",
"execution_count": 26,
"metadata": {
- "collapsed": false,
"jupyter": {
"outputs_hidden": false
}
@@ -45,7 +44,6 @@
"cell_type": "code",
"execution_count": 27,
"metadata": {
- "collapsed": false,
"jupyter": {
"outputs_hidden": false
}
@@ -76,7 +74,6 @@
"cell_type": "code",
"execution_count": 28,
"metadata": {
- "collapsed": false,
"jupyter": {
"outputs_hidden": false
}
@@ -96,7 +93,6 @@
"cell_type": "code",
"execution_count": 29,
"metadata": {
- "collapsed": false,
"jupyter": {
"outputs_hidden": false
}
@@ -152,6 +148,211 @@
"source": [
"print(data)"
]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## `UnstructuredCSVLoader`\n",
+ "\n",
+ "You can also load the table using the `UnstructuredCSVLoader`. One advantage of using `UnstructuredCSVLoader` is that if you use it in `\"elements\"` mode, an HTML representation of the table will be available in the metadata."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders.csv_loader import UnstructuredCSVLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = UnstructuredCSVLoader(file_path='example_data/mlb_teams_2012.csv', mode=\"elements\")\n",
+ "docs = loader.load()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "
\n",
+ " \n",
+ " \n",
+ " Nationals | \n",
+ " 81.34 | \n",
+ " 98 | \n",
+ "
\n",
+ " \n",
+ " Reds | \n",
+ " 82.20 | \n",
+ " 97 | \n",
+ "
\n",
+ " \n",
+ " Yankees | \n",
+ " 197.96 | \n",
+ " 95 | \n",
+ "
\n",
+ " \n",
+ " Giants | \n",
+ " 117.62 | \n",
+ " 94 | \n",
+ "
\n",
+ " \n",
+ " Braves | \n",
+ " 83.31 | \n",
+ " 94 | \n",
+ "
\n",
+ " \n",
+ " Athletics | \n",
+ " 55.37 | \n",
+ " 94 | \n",
+ "
\n",
+ " \n",
+ " Rangers | \n",
+ " 120.51 | \n",
+ " 93 | \n",
+ "
\n",
+ " \n",
+ " Orioles | \n",
+ " 81.43 | \n",
+ " 93 | \n",
+ "
\n",
+ " \n",
+ " Rays | \n",
+ " 64.17 | \n",
+ " 90 | \n",
+ "
\n",
+ " \n",
+ " Angels | \n",
+ " 154.49 | \n",
+ " 89 | \n",
+ "
\n",
+ " \n",
+ " Tigers | \n",
+ " 132.30 | \n",
+ " 88 | \n",
+ "
\n",
+ " \n",
+ " Cardinals | \n",
+ " 110.30 | \n",
+ " 88 | \n",
+ "
\n",
+ " \n",
+ " Dodgers | \n",
+ " 95.14 | \n",
+ " 86 | \n",
+ "
\n",
+ " \n",
+ " White Sox | \n",
+ " 96.92 | \n",
+ " 85 | \n",
+ "
\n",
+ " \n",
+ " Brewers | \n",
+ " 97.65 | \n",
+ " 83 | \n",
+ "
\n",
+ " \n",
+ " Phillies | \n",
+ " 174.54 | \n",
+ " 81 | \n",
+ "
\n",
+ " \n",
+ " Diamondbacks | \n",
+ " 74.28 | \n",
+ " 81 | \n",
+ "
\n",
+ " \n",
+ " Pirates | \n",
+ " 63.43 | \n",
+ " 79 | \n",
+ "
\n",
+ " \n",
+ " Padres | \n",
+ " 55.24 | \n",
+ " 76 | \n",
+ "
\n",
+ " \n",
+ " Mariners | \n",
+ " 81.97 | \n",
+ " 75 | \n",
+ "
\n",
+ " \n",
+ " Mets | \n",
+ " 93.35 | \n",
+ " 74 | \n",
+ "
\n",
+ " \n",
+ " Blue Jays | \n",
+ " 75.48 | \n",
+ " 73 | \n",
+ "
\n",
+ " \n",
+ " Royals | \n",
+ " 60.91 | \n",
+ " 72 | \n",
+ "
\n",
+ " \n",
+ " Marlins | \n",
+ " 118.07 | \n",
+ " 69 | \n",
+ "
\n",
+ " \n",
+ " Red Sox | \n",
+ " 173.18 | \n",
+ " 69 | \n",
+ "
\n",
+ " \n",
+ " Indians | \n",
+ " 78.43 | \n",
+ " 68 | \n",
+ "
\n",
+ " \n",
+ " Twins | \n",
+ " 94.08 | \n",
+ " 66 | \n",
+ "
\n",
+ " \n",
+ " Rockies | \n",
+ " 78.06 | \n",
+ " 64 | \n",
+ "
\n",
+ " \n",
+ " Cubs | \n",
+ " 88.19 | \n",
+ " 61 | \n",
+ "
\n",
+ " \n",
+ " Astros | \n",
+ " 60.65 | \n",
+ " 55 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(docs[0].metadata[\"text_as_html\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
@@ -170,7 +371,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.8.13"
}
},
"nbformat": 4,
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index 3ec4db3534a..83b6330adc8 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -19,7 +19,7 @@ from langchain.document_loaders.chatgpt import ChatGPTLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.confluence import ConfluenceLoader
from langchain.document_loaders.conllu import CoNLLULoader
-from langchain.document_loaders.csv_loader import CSVLoader
+from langchain.document_loaders.csv_loader import CSVLoader, UnstructuredCSVLoader
from langchain.document_loaders.dataframe import DataFrameLoader
from langchain.document_loaders.diffbot import DiffbotLoader
from langchain.document_loaders.directory import DirectoryLoader
@@ -222,6 +222,7 @@ __all__ = [
"TwitterTweetLoader",
"UnstructuredAPIFileIOLoader",
"UnstructuredAPIFileLoader",
+ "UnstructuredCSVLoader",
"UnstructuredEPubLoader",
"UnstructuredEmailLoader",
"UnstructuredExcelLoader",
diff --git a/langchain/document_loaders/csv_loader.py b/langchain/document_loaders/csv_loader.py
index a844f94b1db..3d5e47b1a65 100644
--- a/langchain/document_loaders/csv_loader.py
+++ b/langchain/document_loaders/csv_loader.py
@@ -1,8 +1,12 @@
import csv
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import (
+ UnstructuredFileLoader,
+ validate_unstructured_version,
+)
class CSVLoader(BaseLoader):
@@ -61,3 +65,18 @@ class CSVLoader(BaseLoader):
docs.append(doc)
return docs
+
+
+class UnstructuredCSVLoader(UnstructuredFileLoader):
+    """Load CSV files with unstructured's ``partition_csv`` (min version 0.6.8)."""
+
+    def __init__(
+        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+    ):
+        validate_unstructured_version(min_unstructured_version="0.6.8")
+        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
+
+    def _get_elements(self) -> List:
+        from unstructured.partition.csv import partition_csv
+        # Function-local import: partition_csv is only needed once elements are read.
+        return partition_csv(filename=self.file_path, **self.unstructured_kwargs)
diff --git a/tests/integration_tests/document_loaders/test_csv_loader.py b/tests/integration_tests/document_loaders/test_csv_loader.py
new file mode 100644
index 00000000000..ffce01cf17b
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_csv_loader.py
@@ -0,0 +1,15 @@
+import os
+from pathlib import Path
+
+from langchain.document_loaders import UnstructuredCSVLoader
+
+EXAMPLE_DIRECTORY = Path(__file__).parent.parent / "examples"  # ../examples
+
+
+def test_unstructured_csv_loader() -> None:
+    """Load the sample CSV with the loader defaults and expect one document."""
+    # os.path.join accepts the Path directory and already returns a str.
+    csv_path = os.path.join(EXAMPLE_DIRECTORY, "stanley-cups.csv")
+    documents = UnstructuredCSVLoader(csv_path).load()
+
+    assert len(documents) == 1
diff --git a/tests/integration_tests/examples/stanley-cups.csv b/tests/integration_tests/examples/stanley-cups.csv
new file mode 100644
index 00000000000..4414023f005
--- /dev/null
+++ b/tests/integration_tests/examples/stanley-cups.csv
@@ -0,0 +1,5 @@
+Stanley Cups,,
+Team,Location,Stanley Cups
+Blues,STL,1
+Flyers,PHI,2
+Maple Leafs,TOR,13
\ No newline at end of file