Harrison/doc2txt (#3772)

Co-authored-by: rishni ratnam <rishniratnam@gmail.com>
This commit is contained in:
Harrison Chase
2023-04-28 21:54:16 -07:00
committed by GitHub
parent ce4fea983b
commit c494ca3ad2
3 changed files with 137 additions and 3 deletions

View File

@@ -10,9 +10,78 @@
"This notebook covers how to load Word documents into a document format that we can use downstream."
]
},
{
"cell_type": "markdown",
"id": "9438686b",
"metadata": {},
"source": [
"## Using Docx2txt\n",
"\n",
"Load a `.docx` file into a document using `Docx2txtLoader`."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "7b80ea89",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import Docx2txtLoader"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "99a12031",
"metadata": {},
"outputs": [],
"source": [
"loader = Docx2txtLoader(\"example_data/fake.docx\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b92f68b0",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d83dd755",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "markdown",
"id": "8d40727d",
"metadata": {},
"source": [
"## Using Unstructured"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "721c48aa",
"metadata": {},
"outputs": [],
@@ -129,7 +198,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
"version": "3.9.1"
}
},
"nbformat": 4,