diff --git a/docs/docs/how_to/document_loader_html.ipynb b/docs/docs/how_to/document_loader_html.ipynb index d23cc54101f..4fdeecbe490 100644 --- a/docs/docs/how_to/document_loader_html.ipynb +++ b/docs/docs/how_to/document_loader_html.ipynb @@ -23,12 +23,12 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install \"unstructured[html]\"" + "%pip install unstructured" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "7d167ca3-c7c7-4ef0-b509-080629f0f482", "metadata": {}, "outputs": [ @@ -36,14 +36,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Document(page_content='My First Heading\\n\\nMy first paragraph.', metadata={'source': '../../../docs/integrations/document_loaders/example_data/fake-content.html'})]\n" + "[Document(page_content='My First Heading\\n\\nMy first paragraph.', metadata={'source': '../../docs/integrations/document_loaders/example_data/fake-content.html'})]\n" ] } ], "source": [ "from langchain_community.document_loaders import UnstructuredHTMLLoader\n", "\n", - "file_path = \"../../../docs/integrations/document_loaders/example_data/fake-content.html\"\n", + "file_path = \"../../docs/integrations/document_loaders/example_data/fake-content.html\"\n", "\n", "loader = UnstructuredHTMLLoader(file_path)\n", "data = loader.load()\n", @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "0a2050a8-6df6-4696-9889-ba367d6f9caa", "metadata": {}, "outputs": [ @@ -81,7 +81,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Document(page_content='\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n', metadata={'source': '../../../docs/integrations/document_loaders/example_data/fake-content.html', 'title': 'Test Title'})]\n" + "[Document(page_content='\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n', metadata={'source': '../../docs/integrations/document_loaders/example_data/fake-content.html', 'title': 'Test 
Title'})]\n" ] } ], @@ -111,7 +111,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/how_to/document_loader_markdown.ipynb b/docs/docs/how_to/document_loader_markdown.ipynb index ac5688c97cc..b68d4eadc14 100644 --- a/docs/docs/how_to/document_loader_markdown.ipynb +++ b/docs/docs/how_to/document_loader_markdown.ipynb @@ -21,12 +21,12 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "c8b147fb-6877-4f7a-b2ee-ee971c7bc662", "metadata": {}, "outputs": [], "source": [ - "# !pip install \"unstructured[md]\"" + "%pip install \"unstructured[md]\"" ] }, { @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "80c50cc4-7ce9-4418-81b9-29c52c7b3627", "metadata": {}, "outputs": [ @@ -62,7 +62,7 @@ "from langchain_community.document_loaders import UnstructuredMarkdownLoader\n", "from langchain_core.documents import Document\n", "\n", - "markdown_path = \"../../../../README.md\"\n", + "markdown_path = \"../../../README.md\"\n", "loader = UnstructuredMarkdownLoader(markdown_path)\n", "\n", "data = loader.load()\n", @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "a986bbce-7fd3-41d1-bc47-49f9f57c7cd1", "metadata": {}, "outputs": [ @@ -92,11 +92,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of documents: 65\n", + "Number of documents: 66\n", "\n", - "page_content='šŸ¦œļøšŸ”— LangChain' metadata={'source': '../../../../README.md', 'last_modified': '2024-04-29T13:40:19', 'page_number': 1, 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': '../../../..', 'filename': 'README.md', 'category': 'Title'}\n", + "page_content='šŸ¦œļøšŸ”— LangChain' metadata={'source': '../../../README.md', 'category_depth': 0, 'last_modified': '2024-06-28T15:20:01', 'languages': ['eng'], 'filetype': 'text/markdown', 
'file_directory': '../../..', 'filename': 'README.md', 'category': 'Title'}\n", "\n", - "page_content='⚔ Build context-aware reasoning applications ⚔' metadata={'source': '../../../../README.md', 'last_modified': '2024-04-29T13:40:19', 'page_number': 1, 'languages': ['eng'], 'parent_id': 'c3223b6f7100be08a78f1e8c0c28fde1', 'filetype': 'text/markdown', 'file_directory': '../../../..', 'filename': 'README.md', 'category': 'NarrativeText'}\n", + "page_content='⚔ Build context-aware reasoning applications ⚔' metadata={'source': '../../../README.md', 'last_modified': '2024-06-28T15:20:01', 'languages': ['eng'], 'parent_id': '200b8a7d0dd03f66e4f13456566d2b3a', 'filetype': 'text/markdown', 'file_directory': '../../..', 'filename': 'README.md', 'category': 'NarrativeText'}\n", "\n" ] } @@ -121,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "id": "75abc139-3ded-4e8e-9f21-d0c8ec40fdfc", "metadata": {}, "outputs": [ @@ -129,13 +129,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'Title', 'NarrativeText', 'ListItem'}\n" + "{'ListItem', 'NarrativeText', 'Title'}\n" ] } ], "source": [ "print(set(document.metadata[\"category\"] for document in data))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "223b4c11", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -154,7 +162,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/how_to/document_loader_pdf.ipynb b/docs/docs/how_to/document_loader_pdf.ipynb index 4a5275e9812..3b879bdce04 100644 --- a/docs/docs/how_to/document_loader_pdf.ipynb +++ b/docs/docs/how_to/document_loader_pdf.ipynb @@ -25,22 +25,22 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install pypdf" + "%pip install --upgrade --quiet pypdf" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": 
"7d8ccd0b-8415-4916-af32-0e6d30b9496b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content='LayoutParser : A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1( \\x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1Allen Institute for AI\\nshannons@allenai.org\\n2Brown University\\nruochen zhang@brown.edu\\n3Harvard University\\n{melissadell,jacob carlson }@fas.harvard.edu\\n4University of Washington\\nbcgl@cs.washington.edu\\n5University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportant innovations by a wide audience. Though there have been on-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser , an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. 
We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io .\\nKeywords: Document Image Analysis Ā·Deep Learning Ā·Layout Analysis\\nĀ·Character Recognition Ā·Open Source library Ā·Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classification [ 11,arXiv:2103.15348v2 [cs.CV] 21 Jun 2021', metadata={'source': '../../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf', 'page': 0})" + "Document(page_content='LayoutParser : A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1( \\x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1Allen Institute for AI\\nshannons@allenai.org\\n2Brown University\\nruochen zhang@brown.edu\\n3Harvard University\\n{melissadell,jacob carlson }@fas.harvard.edu\\n4University of Washington\\nbcgl@cs.washington.edu\\n5University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportant innovations by a wide audience. 
Though there have been on-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser , an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io .\\nKeywords: Document Image Analysis Ā·Deep Learning Ā·Layout Analysis\\nĀ·Character Recognition Ā·Open Source library Ā·Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classification [ 11,arXiv:2103.15348v2 [cs.CV] 21 Jun 2021', metadata={'source': '../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf', 'page': 0})" ] }, - "execution_count": 1, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -49,7 +49,7 @@ "from langchain_community.document_loaders import PyPDFLoader\n", "\n", "file_path = (\n", - " \"../../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", + " \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", 
")\n", "loader = PyPDFLoader(file_path)\n", "pages = loader.load_and_split()\n", @@ -76,13 +76,13 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install faiss-cpu \n", + "%pip install --upgrade --quiet faiss-cpu \n", "# use `pip install faiss-gpu` for CUDA GPU support" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "7ba35f1c-0a85-4f2f-a56e-3a994c69180d", "metadata": {}, "outputs": [], @@ -95,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "e0eaec77-f5cf-4172-8e39-41e1520eabba", "metadata": {}, "outputs": [ @@ -139,7 +139,7 @@ "source": [ "### Extract text from images\n", "\n", - "Some PDFs contain images of text-- e.g., within scanned documents, or figures. Using the `rapidocr-onnxruntime` package we can extract images as text as well:" + "Some PDFs contain images of text -- e.g., within scanned documents, or figures. Using the `rapidocr-onnxruntime` package we can extract images as text as well:" ] }, { @@ -149,12 +149,12 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install rapidocr-onnxruntime" + "%pip install --upgrade --quiet rapidocr-onnxruntime" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "babc138a-2188-49f7-a8d6-3570fa3ad802", "metadata": {}, "outputs": [ @@ -164,7 +164,7 @@ "'LayoutParser : A Unified Toolkit for DL-Based DIA 5\\nTable 1: Current layout detection models in the LayoutParser model zoo\\nDataset Base Model1Large Model Notes\\nPubLayNet [38] F / M M Layouts of modern scientific documents\\nPRImA [3] M - Layouts of scanned modern magazines and scientific reports\\nNewspaper [17] F - Layouts of scanned US newspapers from the 20th century\\nTableBank [18] F F Table region on modern scientific and business document\\nHJDataset [31] F / M - Layouts of history Japanese documents\\n1For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy\\nvs. computational cost). 
For ā€œbase modelā€ and ā€œlarge modelā€, we refer to using the ResNet 50 or ResNet 101\\nbackbones [ 13], respectively. One can train models of different architectures, like Faster R-CNN [ 28] (F) and Mask\\nR-CNN [ 12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained\\nusing the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model\\nzoo in coming months.\\nlayout data structures , which are optimized for efficiency and versatility. 3) When\\nnecessary, users can employ existing or customized OCR models via the unified\\nAPI provided in the OCR module . 4)LayoutParser comes with a set of utility\\nfunctions for the visualization and storage of the layout data. 5) LayoutParser\\nis also highly customizable, via its integration with functions for layout data\\nannotation and model training . We now provide detailed descriptions for each\\ncomponent.\\n3.1 Layout Detection Models\\nInLayoutParser , a layout model takes a document image as an input and\\ngenerates a list of rectangular boxes for the target content regions. Different\\nfrom traditional methods, it relies on deep convolutional neural networks rather\\nthan manually curated rules to identify content regions. It is formulated as an\\nobject detection problem and state-of-the-art models like Faster R-CNN [ 28] and\\nMask R-CNN [ 12] are used. This yields prediction results of high accuracy and\\nmakes it possible to build a concise, generalized interface for layout detection.\\nLayoutParser , built upon Detectron2 [ 35], provides a minimal API that can\\nperform layout detection with only four lines of code in Python:\\n1import layoutparser as lp\\n2image = cv2. imread (\" image_file \") # load images\\n3model = lp. Detectron2LayoutModel (\\n4 \"lp :// PubLayNet / faster_rcnn_R_50_FPN_3x / config \")\\n5layout = model . 
detect ( image )\\nLayoutParser provides a wealth of pre-trained model weights using various\\ndatasets covering different languages, time periods, and document types. Due to\\ndomain shift [ 7], the prediction performance can notably drop when models are ap-\\nplied to target samples that are significantly different from the training dataset. As\\ndocument structures and layouts vary greatly in different domains, it is important\\nto select models trained on a dataset similar to the test samples. A semantic syntax\\nis used for initializing the model weights in LayoutParser , using both the dataset\\nname and model name lp:/// .'" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -182,19 +182,42 @@ "source": [ "## Using PyMuPDF\n", "\n", - "This is the fastest of the PDF parsing options, and contains detailed metadata about the PDF and its pages, as well as returns one document per page." + "`PyMuPDF` is optimized for speed, and contains detailed metadata about the PDF and its pages. It returns one document per page:" ] }, { "cell_type": "code", "execution_count": null, - "id": "1be9463c-e08b-432e-be46-dc41f6d0ec28", + "id": "34dab6cd", "metadata": {}, "outputs": [], + "source": [ + "%pip install --upgrade --quiet pymupdf" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1be9463c-e08b-432e-be46-dc41f6d0ec28", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 (\\x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\nshannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\n{melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n5 University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. 
Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportant innovations by a wide audience. Though there have been on-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. 
We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: Document Image Analysis Ā· Deep Learning Ā· Layout Analysis\\nĀ· Character Recognition Ā· Open Source library Ā· Toolkit.\\n1\\nIntroduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classification [11,\\narXiv:2103.15348v2 [cs.CV] 21 Jun 2021\\n', metadata={'source': '../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf', 'file_path': '../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.40.21', 'creationDate': 'D:20210622012710Z', 'modDate': 'D:20210622012710Z', 'trapped': ''})" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from langchain_community.document_loaders import PyMuPDFLoader\n", "\n", - "loader = PyMuPDFLoader(\"example_data/layout-parser-paper.pdf\")\n", + "loader = PyMuPDFLoader(\n", + " \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", + ")\n", "data = loader.load()\n", "data[0]" ] @@ -208,7 +231,7 @@ "\n", "## Using MathPix\n", "\n", - "Inspired by Daniel Gross's [https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21](https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21)" + "Inspired by Daniel Gross's snippet here: [https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21](https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21)" ] }, { @@ -221,10 +244,11 @@ "from langchain_community.document_loaders import MathpixPDFLoader\n", "\n", "file_path = (\n", - " 
\"../../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", + " \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", ")\n", "loader = MathpixPDFLoader(file_path)\n", - "data = loader.load()" + "data = loader.load()\n", + "data[0]" ] }, { @@ -234,23 +258,47 @@ "source": [ "## Using Unstructured\n", "\n", - "[Unstructured](https://unstructured-io.github.io/unstructured/) supports a common interface for working with unstructured or semi-structured file formats, such as Markdown or PDF. LangChain's [UnstructuredPDFLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.UnstructuredPDFLoader.html) integrates with Unstructured to parse PDF documents into LangChain [Document](https://api.python.langchain.com/en/latest/documents/langchain_core.documents.base.Document.html) objects." + "[Unstructured](https://unstructured-io.github.io/unstructured/) supports a common interface for working with unstructured or semi-structured file formats, such as Markdown or PDF. LangChain's [UnstructuredPDFLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.UnstructuredPDFLoader.html) integrates with Unstructured to parse PDF documents into LangChain [Document](https://api.python.langchain.com/en/latest/documents/langchain_core.documents.base.Document.html) objects.\n", + "\n", + "Please see [this page](/docs/integrations/providers/unstructured/) for more information on installing system requirements." 
] }, { "cell_type": "code", - "execution_count": 12, - "id": "c6a15bd3-aaa4-49dc-935a-f18617a7dbdd", + "execution_count": null, + "id": "b82aaf68", "metadata": {}, "outputs": [], + "source": [ + "%pip install --upgrade --quiet unstructured" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c6a15bd3-aaa4-49dc-935a-f18617a7dbdd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='1 2 0 2\\n\\nn u J\\n\\n1 2\\n\\n]\\n\\nV C . s c [\\n\\n2 v 8 4 3 5 1 . 3 0 1 2 : v i X r a\\n\\nLayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis\\n\\nZejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain Lee4, Jacob Carlson3, and Weining Li5\\n\\n1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca\\n\\nAbstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. 
The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.\\n\\nKeywords: Document Image Analysis Ā· Deep Learning Ā· Layout Analysis Ā· Character Recognition Ā· Open Source library Ā· Toolkit.\\n\\n1\\n\\nIntroduction\\n\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classification [11,\\n\\n2\\n\\nZ. Shen et al.\\n\\n37], layout detection [38, 22], table detection [26], and scene text detection [4]. A generalized learning-based framework dramatically reduces the need for the manual specification of complicated rules, which is the status quo with traditional methods. DL has the potential to transform DIA pipelines and benefit a broad spectrum of large-scale document digitization projects.\\n\\nHowever, there are several practical difficulties for taking advantages of re- cent advances in DL-based methods: 1) DL models are notoriously convoluted for reuse and extension. Existing models are developed using distinct frame- works like TensorFlow [1] or PyTorch [24], and the high-level parameters can be obfuscated by implementation details [8]. It can be a time-consuming and frustrating experience to debug, reproduce, and adapt existing models for DIA, and many researchers who would benefit the most from using these methods lack the technical background to implement them from scratch. 
2) Document images contain diverse and disparate patterns across domains, and customized training is often required to achieve a desirable detection accuracy. Currently there is no full-fledged infrastructure for easily curating the target document image datasets and fine-tuning or re-training the models. 3) DIA usually requires a sequence of models and other processing to obtain the final outputs. Often research teams use DL models and then perform further document analyses in separate processes, and these pipelines are not documented in any central location (and often not documented at all). This makes it difficult for research teams to learn about how full pipelines are implemented and leads them to invest significant resources in reinventing the DIA wheel.\\n\\nLayoutParser provides a unified toolkit to support DL-based document image analysis and processing. To address the aforementioned challenges, LayoutParser is built with the following components:\\n\\n1. An off-the-shelf toolkit for applying DL models for layout detection, character recognition, and other DIA tasks (Section 3)\\n\\n2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the off-the-shelf usage\\n\\n3. Comprehensive tools for efficient document image data annotation and model tuning to support different levels of customization\\n\\n4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)\\n\\nThe library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. 
With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research.\\n\\nLayoutParser is well aligned with recent efforts for improving DL model reusability in other disciplines like natural language processing [8, 34] and com- puter vision [35], but with a focus on unique challenges in DIA. We show LayoutParser can be applied in sophisticated and large-scale digitization projects\\n\\nLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\nthat require precision, efficiency, and robustness, as well as simple and light- weight document processing tasks focusing on efficacy and flexibility (Section 5). LayoutParser is being actively maintained, and support for more deep learning models and novel methods in text-based layout analysis methods [37, 34] is planned.\\n\\nThe rest of the paper is organized as follows. Section 2 provides an overview of related work. The core LayoutParser library, DL Model Zoo, and customized model training are described in Section 3, and the DL model hub and commu- nity platform are detailed in Section 4. Section 5 shows two examples of how LayoutParser can be used in practical DIA projects, and Section 6 concludes.\\n\\n2 Related Work\\n\\nRecently, various DL models and datasets have been developed for layout analysis tasks. The dhSegment [22] utilizes fully convolutional networks [20] for segmen- tation tasks on historical documents. Object detection-based methods like Faster R-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38] and detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also been used in table detection [27]. 
However, these models are usually implemented individually and there is no unified framework to load and use such models.\\n\\nThere has been a surge of interest in creating open-source tools for document image processing: a search of document image analysis in Github leads to 5M relevant code pieces 6; yet most of them rely on traditional rule-based methods or provide limited functionalities. The closest prior research to our work is the OCR-D project7, which also tries to build a complete toolkit for DIA. However, similar to the platform developed by Neudecker et al. [21], it is designed for analyzing historical documents, and provides no supports for recent DL models. The DocumentLayoutAnalysis project8 focuses on processing born-digital PDF documents via analyzing the stored PDF data. Repositories like DeepLayout9 and Detectron2-PubLayNet10 are individual deep learning models trained on layout analysis datasets without support for the full DIA pipeline. The Document Analysis and Exploitation (DAE) platform [15] and the DeepDIVA project [2] aim to improve the reproducibility of DIA methods (or DL models), yet they are not actively maintained. OCR engines like Tesseract [14], easyOCR11 and paddleOCR12 usually do not come with comprehensive functionalities for other DIA tasks like layout analysis.\\n\\nRecent years have also seen numerous efforts to create libraries for promoting reproducibility and reusability in the field of DL. Libraries like Dectectron2 [35],\\n\\n6 The number shown is obtained by specifying the search type as ā€˜code’. 7 https://ocr-d.de/en/about 8 https://github.com/BobLd/DocumentLayoutAnalysis 9 https://github.com/leonlulu/DeepLayout 10 https://github.com/hpanwar08/detectron2 11 https://github.com/JaidedAI/EasyOCR 12 https://github.com/PaddlePaddle/PaddleOCR\\n\\n3\\n\\n4\\n\\nZ. 
Shen et al.\\n\\nLayout Data Structure\\n\\nDocument Images\\n\\nDIA Pipeline Sharing\\n\\nCustomized Model Training\\n\\nEfficient Data Annotation\\n\\nDIA Model Hub\\n\\nModel Customization\\n\\nStorage & Visualization\\n\\nCommunity Platform\\n\\nLayout Detection Models\\n\\nThe Core LayoutParser Library\\n\\nOCR Module\\n\\nFig. 1: The overall architecture of LayoutParser. For an input document image, the core LayoutParser library provides a set of off-the-shelf tools for layout detection, OCR, visualization, and storage, backed by a carefully designed layout data structure. LayoutParser also supports high level customization via efficient layout annotation and model training functions. These improve model accuracy on the target samples. The community platform enables the easy sharing of DIA models and whole digitization pipelines to promote reusability and reproducibility. A collection of detailed documentation, tutorials and exemplar projects make LayoutParser easy to learn and use.\\n\\nAllenNLP [8] and transformers [34] have provided the community with complete DL-based support for developing and deploying models for general computer vision and natural language processing problems. LayoutParser, on the other hand, specializes specifically in DIA tasks. LayoutParser is also equipped with a community platform inspired by established model hubs such as Torch Hub [23] and TensorFlow Hub [1]. It enables the sharing of pretrained models as well as full document processing pipelines that are unique to DIA tasks.\\n\\nThere have been a variety of document data collections to facilitate the development of DL models. Some examples include PRImA [3](magazine layouts), PubLayNet [38](academic paper layouts), Table Bank [18](tables in academic papers), Newspaper Navigator Dataset [16, 17](newspaper figure layouts) and HJDataset [31](historical Japanese document layouts). 
A spectrum of models trained on these datasets are currently available in the LayoutParser model zoo to support different use cases.\\n\\n3 The Core LayoutParser Library\\n\\nAt the core of LayoutParser is an off-the-shelf toolkit that streamlines DL- based document image analysis. Five components support a simple interface with comprehensive functionalities: 1) The layout detection models enable using pre-trained or self-trained DL models for layout detection with just four lines of code. 2) The detected layout information is stored in carefully engineered\\n\\nLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\nTable 1: Current layout detection models in the LayoutParser model zoo\\n\\nDataset\\n\\nBase Model1 Large Model Notes\\n\\nPubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31]\\n\\nF / M M F F F / M\\n\\nM - - F -\\n\\nLayouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents\\n\\n1 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy vs. computational cost). For ā€œbase modelā€ and ā€œlarge modelā€, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model zoo in coming months.\\n\\nlayout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 
4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component.\\n\\n3.1 Layout Detection Models\\n\\nIn LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:\\n\\n1 import layoutparser as lp 2 image = cv2 . imread ( \" image_file \" ) # load images 3 model = lp . De t e c tro n2 Lay outM odel (\\n\\n\" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config \" )\\n\\n4 5 layout = model . detect ( image )\\n\\nLayoutParser provides a wealth of pre-trained model weights using various datasets covering different languages, time periods, and document types. Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are significantly different from the training dataset. As document structures and layouts vary greatly in different domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp:///.\\n\\n5\\n\\n6\\n\\nZ. Shen et al.\\n\\nFig. 
2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum flexibility.\\n\\nShown in Table 1, LayoutParser currently hosts 9 pre-trained models trained on 5 different datasets. Description of the training dataset is provided alongside with the trained models such that users can quickly identify the most suitable models for their tasks. Additionally, when such a model is not readily available, LayoutParser also supports training customized layout models and community sharing of the models (detailed in Section 3.5).\\n\\n3.2 Layout Data Structures\\n\\nA critical feature of LayoutParser is the implementation of a series of data structures and operations that can be used to efficiently process and manipulate the layout elements. In document image analysis pipelines, various post-processing on the layout analysis model outputs is usually required to obtain the final outputs. Traditionally, this requires exporting DL model outputs and then loading the results into other pipelines. All model outputs from LayoutParser will be stored in carefully engineered data types optimized for further processing, which makes it possible to build an end-to-end document digitization pipeline within LayoutParser. There are three key components in the data structure, namely the Coordinate system, the TextBlock, and the Layout. They provide different levels of abstraction for the layout data, and a set of APIs are supported for transformations or operations on these classes.\\n\\nLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\nCoordinates are the cornerstones for storing layout information. 
Currently, three types of Coordinate data structures are provided in LayoutParser, shown in Figure 2. Interval and Rectangle are the most common data types and support specifying 1D or 2D regions within a document. They are parameterized with 2 and 4 parameters. A Quadrilateral class is also implemented to support a more generalized representation of rectangular regions when the document is skewed or distorted, where the 4 corner points can be specified and a total of 8 degrees of freedom are supported. A wide collection of transformations like shift, pad, and scale, and operations like intersect, union, and is_in, are supported for these classes. Notably, it is common to separate a segment of the image and analyze it individually. LayoutParser provides full support for this scenario via image cropping operations crop_image and coordinate transformations like relative_to and condition_on that transform coordinates to and from their relative representations. We refer readers to Table 2 for a more detailed description of these operations13.\\n\\nBased on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent field to the index of the parent object. A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment effort.\\n\\n3.3 OCR\\n\\nLayoutParser provides a unified interface for existing OCR tools. Though there are many OCR tools available, they are usually configured differently with distinct APIs or protocols for using them. 
It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to find the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:\\n\\n1 ocr_agent = lp . TesseractAgent () 2 # Can be easily switched to other OCR software 3 tokens = ocr_agent . detect ( image )\\n\\nThe OCR outputs will also be stored in the aforementioned layout data structures and can be seamlessly incorporated into the digitization pipeline. Currently LayoutParser supports the Tesseract and Google Cloud Vision OCR engines.\\n\\nLayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained with the Connectionist Temporal Classification (CTC) loss [10]. It can be used like the other OCR modules, and can be easily trained on customized datasets.\\n\\n13 This is also available in the LayoutParser documentation pages.\\n\\n7\\n\\n8\\n\\nZ. Shen et al.\\n\\nTable 2: All operations supported by the layout elements. The same APIs are supported across different layout element classes including Coordinate types, TextBlock and Layout.\\n\\nOperation Name\\n\\nDescription\\n\\nblock.pad(top, bottom, right, left) Enlarge the current block according to the input\\n\\nblock.scale(fx, fy)\\n\\nScale the current block given the ratio in x and y direction\\n\\nblock.shift(dx, dy)\\n\\nMove the current block with the shift distances in x and y direction\\n\\nblock1.is in(block2)\\n\\nWhether block1 is inside of block2\\n\\nblock1.intersect(block2)\\n\\nReturn the intersection region of block1 and block2. Coordinate type to be determined based on the inputs.\\n\\nblock1.union(block2)\\n\\nReturn the union region of block1 and block2. 
Coordinate type to be determined based on the inputs.\\n\\nblock1.relative to(block2)\\n\\nConvert the absolute coordinates of block1 to relative coordinates to block2\\n\\nblock1.condition on(block2)\\n\\nCalculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates\\n\\nblock.crop image(image)\\n\\nObtain the image segments in the block region\\n\\n3.4 Storage and visualization\\n\\nThe end goal of DIA is to transform the image-based document data into a structured database. LayoutParser supports exporting layout data into different formats like JSON, csv, and will add the support for the METS/ALTO XML format 14 . It can also load datasets from layout analysis-specific formats like COCO [38] and the Page Format [25] for training layout models (Section 3.5). Visualization of the layout detection results is critical for both presentation and debugging. LayoutParser is built with an integrated API for displaying the layout information along with the original document image. Shown in Figure 3, it enables presenting layout data with rich meta information and features in different modes. More detailed information can be found in the online LayoutParser documentation page.\\n\\n3.5 Customized Model Training\\n\\nBesides the off-the-shelf library, LayoutParser is also highly customizable with supports for highly unique and challenging document analysis tasks. Target document images can be vastly different from the existing datasets for train- ing layout models, which leads to low layout detection accuracy. Training data\\n\\n14 https://altoxml.github.io\\n\\nLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\nFig. 3: Layout detection and OCR results visualization generated by the LayoutParser APIs. Mode I directly overlays the layout region bounding boxes and categories over the original image. Mode II recreates the original document via drawing the OCR’d texts at their corresponding positions on the image canvas. 
In this figure, tokens in textual regions are filtered using the API and then displayed.\\n\\ncan also be highly sensitive and not sharable publicly. To overcome these chal- lenges, LayoutParser is built with rich features for efficient data annotation and customized model training.\\n\\nLayoutParser incorporates a toolkit optimized for annotating document lay- outs using object-level active learning [32]. With the help from a layout detection model trained along with labeling, only the most important layout objects within each image, rather than the whole image, are required for labeling. The rest of the regions are automatically annotated with high confidence predictions from the layout detection model. This allows a layout dataset to be created more efficiently with only around 60% of the labeling budget.\\n\\nAfter the training dataset is curated, LayoutParser supports different modes for training the layout models. Fine-tuning can be used for training models on a small newly-labeled dataset by initializing the model with existing pre-trained weights. Training from scratch can be helpful when the source dataset and target are significantly different and a large training set is available. However, as suggested in Studer et al.’s work[33], loading pre-trained weights on large-scale datasets like ImageNet [5], even from totally different domains, can still boost model performance. Through the integrated API provided by LayoutParser, users can easily compare model performances on the benchmark datasets.\\n\\n9\\n\\n10\\n\\nZ. Shen et al.\\n\\nFig. 4: Illustration of (a) the original historical Japanese document with layout detection results and (b) a recreated version of the document image that achieves much better character recognition recall. 
The reorganization algorithm rearranges the tokens based on the their detected bounding boxes given a maximum allowed height.\\n\\n4 LayoutParser Community Platform\\n\\nAnother focus of LayoutParser is promoting the reusability of layout detection models and full digitization pipelines. Similar to many existing deep learning libraries, LayoutParser comes with a community model hub for distributing layout models. End-users can upload their self-trained models to the model hub, and these models can be loaded into a similar interface as the currently available LayoutParser pre-trained models. For example, the model trained on the News Navigator dataset [17] has been incorporated in the model hub.\\n\\nBeyond DL models, LayoutParser also promotes the sharing of entire doc- ument digitization pipelines. For example, sometimes the pipeline requires the combination of multiple DL models to achieve better accuracy. Currently, pipelines are mainly described in academic papers and implementations are often not pub- licly available. To this end, the LayoutParser community platform also enables the sharing of layout pipelines to promote the discussion and reuse of techniques. For each shared pipeline, it has a dedicated project page, with links to the source code, documentation, and an outline of the approaches. A discussion panel is provided for exchanging ideas. Combined with the core LayoutParser library, users can easily build reusable components based on the shared pipelines and apply them to solve their unique problems.\\n\\n5 Use Cases\\n\\nThe core objective of LayoutParser is to make it easier to create both large-scale and light-weight document digitization pipelines. Large-scale document processing\\n\\nLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\nfocuses on precision, efficiency, and robustness. The target documents may have complicated structures, and may require training multiple layout detection models to achieve the optimal accuracy. 
Light-weight pipelines are built for relatively simple documents, with an emphasis on development ease, speed and flexibility. Ideally one only needs to use existing resources, and model training should be avoided. Through two exemplar projects, we show how practitioners in both academia and industry can easily build such pipelines using LayoutParser and extract high-quality structured document data for their downstream tasks. The source code for these projects will be publicly available in the LayoutParser community hub.\\n\\n5.1 A Comprehensive Historical Document Digitization Pipeline\\n\\nThe digitization of historical documents can unlock valuable data that can shed light on many important social, economic, and historical questions. Yet due to scan noises, page wearing, and the prevalence of complicated layout structures, ob- taining a structured representation of historical document scans is often extremely complicated. In this example, LayoutParser was used to develop a comprehensive pipeline, shown in Figure 5, to gener- ate high-quality structured data from historical Japanese firm financial ta- bles with complicated layouts. The pipeline applies two layout models to identify different levels of document structures and two customized OCR engines for optimized character recog- nition accuracy.\\n\\nAs shown in Figure 4 (a), the document contains columns of text written vertically 15, a common style in Japanese. Due to scanning noise and archaic printing technology, the columns can be skewed or have vari- able widths, and hence cannot be eas- ily identified via rule-based methods. Within each column, words are sepa- rated by white spaces of variable size, and the vertical positions of objects can be an indicator of their layout type.\\n\\nFig. 5: Illustration of how LayoutParser helps with the historical document digi- tization pipeline.\\n\\n15 A document page consists of eight rows like this. 
For simplicity we skip the row\\n\\nsegmentation discussion and refer readers to the source code when available.\\n\\n11\\n\\n12\\n\\nZ. Shen et al.\\n\\nTo decipher the complicated layout\\n\\nstructure, two object detection models have been trained to recognize individual columns and tokens, respectively. A small training set (400 images with approxi- mately 100 annotations each) is curated via the active learning based annotation tool [32] in LayoutParser. The models learn to identify both the categories and regions for each token or column via their distinct visual features. The layout data structure enables easy grouping of the tokens within each column, and rearranging columns to achieve the correct reading orders based on the horizontal position. Errors are identified and rectified via checking the consistency of the model predictions. Therefore, though trained on a small dataset, the pipeline achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19] score across 5 categories for the column detection model, and a 89.23 AP across 4 categories for the token detection model.\\n\\nA combination of character recognition methods is developed to tackle the unique challenges in this document. In our experiments, we found that irregular spacing between the tokens led to a low character recognition recall rate, whereas existing OCR models tend to perform better on densely-arranged texts. To overcome this challenge, we create a document reorganization algorithm that rearranges the text based on the token bounding boxes detected in the layout analysis step. Figure 4 (b) illustrates the generated image of dense text, which is sent to the OCR APIs as a whole to reduce the transaction costs. 
The flexible coordinate system in LayoutParser is used to transform the OCR results relative to their original positions on the page.\\n\\nAdditionally, it is common for historical documents to use unique fonts with different glyphs, which significantly degrades the accuracy of OCR models trained on modern texts. In this document, a special flat font is used for printing numbers and could not be detected by off-the-shelf OCR engines. Using the highly flexible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal effort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identifies characters within them using a self-trained OCR model based on a CNN-RNN [6]. The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set.\\n\\nOverall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR.\\n\\n16 This measures the overlap between the detected and ground-truth characters, and\\n\\nthe maximum is 1.\\n\\n17 This measures the number of edits from the ground-truth text to the predicted text,\\n\\nand lower is better.\\n\\nLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\nFig. 6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in different locations on a page. 
In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.\\n\\n5.2 A light-weight Visual Table Extractor\\n\\nDetecting tables and parsing their structures (table extraction) are of central im- portance for many document digitization tasks. Many previous works [26, 30, 27] and tools 18 have been developed to identify and parse table structures. Yet they might require training complicated models from scratch, or are only applicable for born-digital PDF documents. In this section, we show how LayoutParser can help build a light-weight accurate visual table extractor for legal docket tables using the existing resources with minimal effort.\\n\\nThe extractor uses a pre-trained layout detection model for identifying the table regions and some simple rules for pairing the rows and the columns in the PDF image. Mask R-CNN [12] trained on the PubLayNet dataset [38] from the LayoutParser Model Zoo can be used for detecting table regions. By filtering out model predictions of low confidence and removing overlapping predictions, LayoutParser can identify the tabular regions on each page, which significantly simplifies the subsequent steps. By applying the line detection functions within the tabular segments, provided in the utility module from LayoutParser, the pipeline can identify the three distinct columns in the tables. A row clustering method is then applied via analyzing the y coordinates of token bounding boxes in the left-most column, which are obtained from the OCR engines. A non-maximal suppression algorithm is used to remove duplicated rows with extremely small gaps. Shown in Figure 6, the built pipeline can detect tables at different positions on a page accurately. Continued tables from different pages are concatenated, and a structured table representation has been easily created.\\n\\n18 https://github.com/atlanhq/camelot, https://github.com/tabulapdf/tabula\\n\\n13\\n\\n14\\n\\nZ. 
Shen et al.\\n\\n6 Conclusion\\n\\nLayoutParser provides a comprehensive toolkit for deep learning-based document image analysis. The off-the-shelf library is easy to install, and can be used to build flexible and accurate pipelines for processing documents with complicated structures. It also supports high-level customization and enables easy labeling and training of DL models on unique document image datasets. The LayoutParser community platform facilitates sharing DL models and DIA pipelines, inviting discussion and promoting code reproducibility and reusability. The LayoutParser team is committed to keeping the library updated continuously and bringing the most recent advances in DL-based DIA, such as multi-modal document modeling [37, 36, 9] (an upcoming priority), to a diverse audience of end-users.\\n\\nAcknowledgements We thank the anonymous reviewers for their comments and suggestions. This project is supported in part by NSF Grant OIA-2033558 and funding from the Harvard Data Science Initiative and Harvard Catalyst. Zejiang Shen thanks Doug Downey for suggestions.\\n\\nReferences\\n\\n[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., ManĀ“e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., ViĀ“egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), https://www.tensorflow.org/, software available from tensorflow.org\\n\\n[2] Alberti, M., Pondenkandath, V., WĀØursch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 
423–428. IEEE (2018)\\n\\n[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296–300. IEEE (2009)\\n\\n[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365–9374 (2019)\\n\\n[5] Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A Large-Scale\\n\\nHierarchical Image Database. In: CVPR09 (2009)\\n\\n[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)\\n\\n[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)\\n\\nLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\n[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) (cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P., GraliĀ“nski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)\\n\\n[9]\\n\\n[10] Graves, A., FernĀ“andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)\\n\\n[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. 
IEEE (2015) [12] He, K., Gkioxari, G., DollĀ“ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the\\n\\nIEEE international conference on computer vision. pp. 2961–2969 (2017)\\n\\n[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)\\n\\n[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J.\\n\\n2007(159), 2 (Jul 2007)\\n\\n[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. IEEE (2011)\\n\\n[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143\\n\\n[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767\\n\\n[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)\\n\\n[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., DollĀ“ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. Springer (2014)\\n\\n[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431–3440 (2015)\\n\\n[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)\\n\\n[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)\\n\\n15\\n\\n16\\n\\nZ. Shen et al.\\n\\n[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) [24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) [25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257–260. IEEE (2010)\\n\\n[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572–573 (2020)\\n\\n[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142–147. IEEE (2019)\\n\\n[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 
91–99 (2015)\\n\\n[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61–80 (2008) [30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162–1167. IEEE (2017)\\n\\n[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548–549 (2020)\\n\\n[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning\\n\\nbased layout annotation. arXiv preprint arXiv:2010.01762 (2020)\\n\\n[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720–725. IEEE (2019)\\n\\n[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019) [35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. https://\\n\\ngithub.com/facebookresearch/detectron2 (2019)\\n\\n[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. 
arXiv preprint arXiv:2012.14740 (2020)\\n\\n[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of\\n\\ntext and layout for document image understanding (2019)\\n\\n[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:\\n\\nlargest dataset ever for doc- In: 2019 International Conference on Document IEEE (Sep 2019).\\n\\nument Analysis and Recognition (ICDAR). pp. 1015–1022. https://doi.org/10.1109/ICDAR.2019.00166\\n\\nlayout analysis.', metadata={'source': '../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf'})" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from langchain_community.document_loaders import UnstructuredPDFLoader\n", "\n", "file_path = (\n", - " \"../../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", + " \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", ")\n", "loader = UnstructuredPDFLoader(file_path)\n", - "data = loader.load()" + "data = loader.load()\n", + "data[0]" ] }, { @@ -265,24 +313,24 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "id": "efd80620-0bb8-4298-ab3b-07d7ef9c0085", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content='1 2 0 2', metadata={'source': '../../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': '../../../docs/integrations/document_loaders/example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-03-18T13:22:22', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText'})" + "Document(page_content='1 2 0 2', metadata={'source': '../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf', 
'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': '../../docs/integrations/document_loaders/example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText'})" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "file_path = (\n", - " \"../../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", + " \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", ")\n", "loader = UnstructuredPDFLoader(file_path, mode=\"elements\")\n", "\n", @@ -300,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "3c40d9e8-5bf7-466d-b2bb-ce2ae08bea35", "metadata": {}, "outputs": [ @@ -310,7 +358,7 @@ "{'ListItem', 'NarrativeText', 'Title', 'UncategorizedText'}" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -336,12 +384,24 @@ "execution_count": 18, "id": "54737607-072e-4eb9-aac8-6615472fefc1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='3 2 0 2\\n\\nb e F 7\\n\\n]\\n\\nG A . h t a m\\n\\n[\\n\\n1 v 3 0 8 3 0 . 2 0 3 2 : v i X r a\\n\\nA WEAK (k, k)-LEFSCHETZ THEOREM FOR PROJECTIVE TORIC ORBIFOLDS\\n\\nWilliam D. 
Montoya\\n\\nInstituto de MatemĀ“atica, Estat“ıstica e ComputaĀøc˜ao Cient“ıfica, Universidade Estadual de Campinas (UNICAMP),\\n\\nRua SĀ“ergio Buarque de Holanda 651, 13083-859, Campinas, SP, Brazil\\n\\nFebruary 9, 2023\\n\\nAbstract\\n\\nFirstly we show a generalization of the (1, 1)-Lefschetz theorem for projective toric orbifolds and secondly we prove that on 2k-dimensional quasi-smooth hyper- surfaces coming from quasi-smooth intersection surfaces, under the Cayley trick, every rational (k, k)-cohomology class is algebraic, i.e., the Hodge conjecture holds on them.\\n\\n1\\n\\nIntroduction\\n\\nIn [3] we proved that, under suitable conditions, on a very general codimension s quasi- smooth intersection subvariety X in a projective toric orbifold Pd Ī£ with d + s = 2(k + 1) the Hodge conjecture holds, that is, every (p, p)-cohomology class, under the PoincarĀ“e duality is a rational linear combination of fundamental classes of algebraic subvarieties of X. The proof of the above-mentioned result relies, for p ≠ d + 1 āˆ’ s, on a Lefschetz\\n\\nDate: February 9, 2023 2020 Mathematics Subject Classification: 14C30, 14M10, 14J70, 14M25 Keywords: (1,1)- Lefschetz theorem, Hodge conjecture, toric varieties, complete intersection Email: wmontoya@ime.unicamp.br\\n\\n1\\n\\ntheorem ([7]) and the Hard Lefschetz theorem for projective orbifolds ([11]). When p = d + 1 āˆ’ s the proof relies on the Cayley trick, a trick which associates to X a quasi-smooth hypersurface Y in a projective vector bundle, and the Cayley Proposition (4.3) which gives an isomorphism of some primitive cohomologies (4.2) of X and Y . The Cayley trick, following the philosophy of Mavlyutov in [7], reduces results known for quasi-smooth hypersurfaces to quasi-smooth intersection subvarieties. The idea in this paper goes the other way around, we translate some results for quasi-smooth intersection subvarieties to quasi-smooth hypersurfaces, mainly the (1, 1)-Lefschetz theorem.\\n\\nAcknowledgement. 
I thank Prof. Ugo Bruzzo and Tiago Fonseca for useful discus-\\n\\nsions. I also acknowledge support from FAPESP postdoctoral grant No. 2019/23499-7.\\n\\n2 Preliminaries and Notation\\n\\n2.1 Toric varieties\\n\\nLet M be a free abelian group of rank d, let N = Hom(M, Z), and NR = N āŠ—Z R.\\n\\nA convex subset σ āŠ‚ NR is a rational k-dimensional simplicial cone if there exist k linearly independent primitive elements e1, . . . , ek ∈ N such that σ = {µ1e1 + ⋯ + µkek}.\\n\\nDefinition 2.1.\\n\\nThe generators ei are integral if for every i and any nonnegative rational number µ the product µei is in N only if µ is an integer.\\n\\nGiven two rational simplicial cones σ, Ļƒā€² one says that Ļƒā€² is a face of σ (Ļƒā€² < σ) if the set of integral generators of Ļƒā€² is a subset of the set of integral generators of σ.\\n\\nA finite set Ī£ = {σ1, . . . , σt} of rational simplicial cones is called a rational simplicial complete d-dimensional fan if:\\n\\n1. all faces of cones in Ī£ are in Ī£;\\n\\n2. if σ, Ļƒā€² ∈ Ī£ then σ ∩ Ļƒā€² < σ and σ ∩ Ļƒā€² < Ļƒā€²;\\n\\n3. NR = σ1 ∪ ā‹… ā‹… ā‹… ∪ σt.\\n\\nA rational simplicial complete d-dimensional fan Ī£ defines a d-dimensional toric variety Ī£ having only orbifold singularities which we assume to be projective. Moreover, T ∶= Pd N āŠ—Z Cāˆ— ā‰ƒ (Cāˆ—)d is the torus action on Pd Ī£. We denote by Ī£(i) the i-dimensional cones\\n\\n2\\n\\nof Ī£ and each ρ ∈ Ī£ corresponds to an irreducible T -invariant Weil divisor Dρ on Pd Cl(Ī£) be the group of Weil divisors on Pd\\n\\nĪ£ module rational equivalences.\\n\\nĪ£. 
Let\\n\\nThe total coordinate ring of Pd\\n\\nĪ£ is the polynomial ring S = C[xρ ∣ ρ ∈ Ī£(1)], S has the ρ ∈\\n\\nCl(Ī£)-grading, a Weil divisor D = āˆ‘ĻāˆˆĪ£(1) uρDρ determines the monomial xu ∶= āˆĻāˆˆĪ£(1) xuρ S and conversely deg(xu) = [D] ∈ Cl(Ī£).\\n\\nFor a cone σ ∈ Ī£, Ė†Ļƒ is the set of 1-dimensional cone in Ī£ that are not contained in σ\\n\\nand xĖ†Ļƒ ∶= āˆĻāˆˆĖ†Ļƒ xρ is the associated monomial in S.\\n\\nĪ£ is the monomial ideal BĪ£ ∶=< xĖ†Ļƒ ∣ σ ∈ Ī£ > and\\n\\nDefinition 2.2. The irrelevant ideal of Pd the zero locus Z(Ī£) ∶= V(BĪ£) in the affine space Ad ∶= Spec(S) is the irrelevant locus.\\n\\nProposition 2.3 (Theorem 5.1.11 [5]). The toric variety Pd Ī£ is a categorical quotient Ad āˆ– Z(Ī£) by the group Hom(Cl(Ī£), Cāˆ—) and the group action is induced by the Cl(Ī£)- grading of S.\\n\\n2.2 Orbifolds\\n\\nNow we give a brief introduction to complex orbifolds and we mention the needed theorems for the next section. Namely: de Rham theorem and Dolbeault theorem for complex orbifolds.\\n\\nDefinition 2.4. A complex orbifold of complex dimension d is a singular complex space whose singularities are locally isomorphic to quotient singularities Cd/G, for finite sub- groups G āŠ‚ Gl(d, C).\\n\\nDefinition 2.5. A differential form on a complex orbifold Z is defined locally at z ∈ Z as a G-invariant differential form on Cd where G āŠ‚ Gl(d, C) and Z is locally isomorphic to Cd/G around z.\\n\\nRoughly speaking the local geometry of orbifolds reduces to local G-invariant geometry. We have a complex of differential forms (Aā—(Z), d) and a double complex (Aā—,ā—(Z), āˆ‚, ĀÆāˆ‚) of bigraded differential forms which define the de Rham and the Dolbeault cohomology groups (for a fixed p ∈ N) respectively:\\n\\ndR(Z, C) ∶=\\n\\nH ā—\\n\\nker d im d\\n\\nand H p,ā—(Z, ĀÆāˆ‚) ∶=\\n\\nker ĀÆāˆ‚ im ĀÆāˆ‚\\n\\nTheorem 2.6 (Theorem 3.4.4 in [4] and Theorem 1.2 in [1] ). Let Z be a compact complex orbifold. 
There are natural isomorphisms:\\n\\n3\\n\\nH ā—\\n\\ndR(Z, C) ā‰ƒ H ā—(Z, C)\\n\\nH p,ā—(Z, ĀÆāˆ‚) ā‰ƒ H ā—(X, Ωp Z )\\n\\n3\\n\\n(1,1)-Lefschetz theorem for projective toric orbifolds\\n\\nDefinition 3.1. A subvariety X āŠ‚ Pd Z(Ī£).\\n\\nĪ£ is quasi-smooth if V(IX ) āŠ‚ A#Ī£(1) is smooth outside\\n\\nExample 3.2. Quasi-smooth hypersurfaces or more generally quasi-smooth intersection sub- varieties are quasi-smooth subvarieties (see [2] or [7] for more details).\\n\\nRemark 3.3. Quasi-smooth subvarieties are suborbifolds of Pd Ī£ in the sense of Satake in [8]. Intuitively speaking they are subvarieties whose only singularities come from the ambient space.\\n\\nTheorem 3.4. Let X āŠ‚ Pd class Ī» ∈ H 1,1(X) ∩ H 2(X, Z) is algebraic\\n\\nĪ£ be a quasi-smooth subvariety. Then every (1, 1)-cohomology\\n\\nProof. From the exponential short exact sequence\\n\\n0 → Z → OX → Oāˆ— X\\n\\n→ 0\\n\\nwe have a long exact sequence in cohomology\\n\\nX ) → H 2(X, Z) → H 2(OX ) ā‰ƒ H 0,2(X)\\n\\nH 1(Oāˆ—\\n\\nwhere the last isomorphisms is due to Steenbrink in [9]. Now, it is enough to prove the commutativity of the next diagram\\n\\nH 2(X, Z)\\n\\nH 2(X, OX )\\n\\nH 2(X, C)\\n\\nā‰ƒ Dolbeault\\n\\nde Rham ā‰ƒ\\n\\n(cid:15)\\n\\n(cid:15)\\n\\nH 2\\n\\ndR(X, C)\\n\\n/\\n\\n/ H 0,2\\n\\nĀÆāˆ‚ (X)\\n\\n4\\n\\nā–³\\n\\nā–³\\n\\nThe key points are the de Rham and Dolbeault’s isomorphisms for orbifolds. The rest\\n\\nof the proof follows as the (1, 1)-Lefschetz theorem in [6].\\n\\nRemark 3.5. For k = 1 and Pd Lefschetz theorem.\\n\\nĪ£ as the projective space, we recover the classical (1, 1)-\\n\\nBy the Hard Lefschetz Theorem for projective orbifolds (see [11] for details) we get an\\n\\nisomorphism of cohomologies :\\n\\nH ā—(X, Q) ā‰ƒ H 2 dim Xāˆ’ā—(X, Q)\\n\\ngiven by the Lefschetz morphism and since it is a morphism of Hodge structures, we have:\\n\\nH 1,1(X, Q) ā‰ƒ H dim Xāˆ’1,dim Xāˆ’1(X, Q)\\n\\nFor X as before:\\n\\nCorollary 3.6. If the dimension of X is 1, 2 or 3. 
The Hodge conjecture holds on X.\\n\\nProof. If the dimCX = 1 the result is clear by the Hard Lefschetz theorem for projective orbifolds. The dimension 2 and 3 cases are covered by Theorem 3.5 and the Hard Lefschetz. theorem.\\n\\n4 Cayley trick and Cayley proposition\\n\\nThe Cayley trick is a way to associate to a quasi-smooth intersection subvariety a quasi- smooth hypersurface. Let L1, . . . , Ls be line bundles on Pd Ī£ be the projective space bundle associated to the vector bundle E = L1 āŠ• ⋯ āŠ• Ls. It is known that P(E) is a (d + s āˆ’ 1)-dimensional simplicial toric variety whose fan depends on the degrees of the line bundles and the fan Ī£. Furthermore, if the Cox ring, without considering the grading, of Pd\\n\\nĪ£ and let Ļ€ ∶ P(E) → Pd\\n\\nĪ£ is C[x1, . . . , xm] then the Cox ring of P(E) is\\n\\nC[x1, . . . , xm, y1, . . . , ys]\\n\\nMoreover for X a quasi-smooth intersection subvariety cut off by f1, . . . , fs with deg(fi) = [Li] we relate the hypersurface Y cut off by F = y1f1 + ā‹… ā‹… ā‹… + ysfs which turns out to be quasi-smooth. For more details see Section 2 in [7].\\n\\n5\\n\\nā–³\\n\\nWe will denote P(E) as Pd+sāˆ’1\\n\\nĪ£,X to keep track of its relation with X and Pd Ī£.\\n\\nThe following is a key remark.\\n\\nRemark 4.1. There is a morphism ι ∶ X → Y āŠ‚ Pd+sāˆ’1 with y ≠ 0 has a preimage. Hence for any subvariety W = V(IW ) āŠ‚ X āŠ‚ Pd W ′ āŠ‚ Y āŠ‚ Pd+sāˆ’1 Ī£,X such that Ļ€(W ′) = W , i.e., W ′ = {z = (x, y) ∣ x ∈ W }.\\n\\nĪ£,X . Moreover every point z ∶= (x, y) ∈ Y Ī£ there exists\\n\\nā–³\\n\\nFor X āŠ‚ Pd\\n\\nĪ£ a quasi-smooth intersection variety the morphism in cohomology induced\\n\\nby the inclusion iāˆ— ∶ H dāˆ’s(Pd\\n\\nĪ£, C) → H dāˆ’s(X, C) is injective by Proposition 1.4 in [7].\\n\\nDefinition 4.2. 
The primitive cohomology of H dāˆ’s and H dāˆ’s prim(X, Q) with rational coefficients.\\n\\nprim(X) is the quotient H dāˆ’s(X, C)/iāˆ—(H dāˆ’s(Pd\\n\\nH dāˆ’s(Pd\\n\\nĪ£, C) and H dāˆ’s(X, C) have pure Hodge structures, and the morphism iāˆ— is com-\\n\\npatible with them, so that H dāˆ’s\\n\\nprim(X) gets a pure Hodge structure.\\n\\nThe next Proposition is the Cayley proposition.\\n\\nProposition 4.3. [Proposition 2.3 in [3] ] Let X = X1 āˆ©ā‹… ā‹… ā‹…āˆ©Xs be a quasi-smooth intersec- , d+sāˆ’3 tion subvariety in Pd 2\\n\\nĪ£ cut off by homogeneous polynomials f1 . . . fs. Then for p ≠ d+sāˆ’1\\n\\n2\\n\\nH pāˆ’1,d+sāˆ’1āˆ’p\\n\\nprim\\n\\n(Y ) ā‰ƒ H pāˆ’s,dāˆ’p\\n\\nprim (X).\\n\\nCorollary 4.4. If d + s = 2(k + 1),\\n\\nH k+1āˆ’s,k+1āˆ’s\\n\\nprim\\n\\n(X) ā‰ƒ H k,k\\n\\nprim(Y )\\n\\nRemark 4.5. The above isomorphisms are also true with rational coefficients since H ā—(X, C) = H ā—(X, Q) āŠ—Q C. See the beginning of Section 7.1 in [10] for more details.\\n\\nā–³\\n\\n5 Main result\\n\\nTheorem 5.1. Let Y = {F = y1f1 + ⋯ + ykfk = 0} āŠ‚ P2k+1 associated to the quasi-smooth intersection surface X = Xf1 ∩ ā‹… ā‹… ā‹… ∩ Xfk āŠ‚ Pk+2 the Hodge conjecture holds.\\n\\nĪ£,X be the quasi-smooth hypersurface Ī£ . Then on Y\\n\\nProof. If H k,k proposition H k,k\\n\\nprim(X, Q) = 0 we are done. So let us assume H k,k\\n\\nprim(X, Q) ≠ 0. By the Cayley prim(X, Q) and by the (1, 1)-Lefschetz theorem for projective\\n\\nprim(Y, Q) ā‰ƒ H 1,1\\n\\n6\\n\\nĪ£, C))\\n\\ntoric orbifolds there is a non-zero algebraic basis Ī»C1, . . . , Ī»Cn with rational coefficients of H 1,1 prim(X, Q) algebraic curves C1, . . . , Cn in X such that under the PoincarĀ“e duality the class in homology [Ci] goes to Ī»Ci, [Ci] ↦ Ī»Ci. Recall that the Cox ring of Pk+2 is contained in the Cox ring of P2k+1 Ī£,X without considering the Ī£ ) then (α, 0) ∈ Cl(P2k+1 grading. Considering the grading we have that if α ∈ Cl(Pk+2 Ī£,X ). So the polynomials defining Ci āŠ‚ Pk+2 X,Ī£ but with different degree. 
Moreover, by Remark 4.1 each Ci is contained in Y = {F = y1f1 + ⋯ + ykfk = 0} and furthermore it has codimension k.\\n\\nprim(X, Q), that is, there are n ∶= h1,1\\n\\ncan be interpreted in P2k+1\\n\\nĪ£\\n\\ni=1 is a basis of H k,k It is enough to prove that Ī»Ci is different from zero in H k,k prim(Y, Q) or equivalently that the cohomology classes {Ī»Ci}n i=1 do not come from the ambient space. By contradiction, let us assume that there exists a j and C āŠ‚ P2k+1 Ī£,X , Q) with iāˆ—(Ī»C) = Ī»Cj or in terms of homology there exists a (k + 2)-dimensional algebraic subvariety V āŠ‚ P2k+1 Ī£,X such that V ∩ Y = Cj so they are equal as a homology class of P2k+1 Ī£,X ,i.e., [V ∩ Y ] = [Cj] . Ī£ where Ļ€ ∶ (x, y) ↦ x. Hence It is easy to check that Ļ€(V ) ∩ X = Cj as a subvariety of Pk+2 [Ļ€(V ) ∩ X] = [Cj] which is equivalent to say that Ī»Cj comes from Pk+2 Ī£ which contradicts the choice of [Cj].\\n\\nClaim: {Ī»Ci}n\\n\\nprim(Y, Q).\\n\\nĪ£,X such that Ī»C ∈ H k,k(P2k+1\\n\\nRemark 5.2. Into the proof of the previous theorem, the key fact was that on X the Hodge conjecture holds and we translate it to Y by contradiction. So, using an analogous argument we have:\\n\\nProposition 5.3. Let Y = {F = y1fs+⋯+ysfs = 0} āŠ‚ P2k+1 associated to a quasi-smooth intersection subvariety X = Xf1 ∩ ā‹… ā‹… ā‹… ∩ Xfs āŠ‚ Pd d + s = 2(k + 1). If the Hodge conjecture holds on X then it holds as well on Y .\\n\\nĪ£,X be the quasi-smooth hypersurface Ī£ such that\\n\\nCorollary 5.4. If the dimension of Y is 2s āˆ’ 1, 2s or 2s + 1 then the Hodge conjecture holds on Y .\\n\\nProof. By Proposition 5.3 and Corollary 3.6.\\n\\n7\\n\\nā–³\\n\\nReferences\\n\\n[1] Angella, D. Cohomologies of certain orbifolds. Journal of Geometry and Physics\\n\\n71 (2013), 117–126.\\n\\n[2] Batyrev, V. V., and Cox, D. A. On the Hodge structure of projective hypersur-\\n\\nfaces in toric varieties. Duke Mathematical Journal 75, 2 (Aug 1994).\\n\\n[3] Bruzzo, U., and Montoya, W. 
On the Hodge conjecture for quasi-smooth in- tersections in toric varieties. S˜ao Paulo J. Math. Sci. Special Section: Geometry in Algebra and Algebra in Geometry (2021).\\n\\n[4] Caramello Jr, F. C. Introduction to orbifolds. arXiv:1909.08699v6 (2019).\\n\\n[5] Cox, D., Little, J., and Schenck, H. Toric varieties, vol. 124. American Math-\\n\\nematical Soc., 2011.\\n\\n[6] Griffiths, P., and Harris, J. Principles of Algebraic Geometry. John Wiley &\\n\\nSons, Ltd, 1978.\\n\\n[7] Mavlyutov, A. R. Cohomology of complete intersections in toric varieties. Pub-\\n\\nlished in Pacific J. of Math. 191 No. 1 (1999), 133–144.\\n\\n[8] Satake, I. On a Generalization of the Notion of Manifold. Proceedings of the National Academy of Sciences of the United States of America 42, 6 (1956), 359–363.\\n\\n[9] Steenbrink, J. H. M. Intersection form for quasi-homogeneous singularities. Com-\\n\\npositio Mathematica 34, 2 (1977), 211–223.\\n\\n[10] Voisin, C. Hodge Theory and Complex Algebraic Geometry I, vol. 1 of Cambridge\\n\\nStudies in Advanced Mathematics. Cambridge University Press, 2002.\\n\\n[11] Wang, Z. Z., and Zaffran, D. A remark on the Hard Lefschetz theorem for KĀØahler orbifolds. 
Proceedings of the American Mathematical Society 137, 08 (Aug 2009).\\n\\n8', metadata={'source': '/var/folders/z4/1qk27d6n7w59z2h3r31hwxgr0000gn/T/tmpgjpunfou/tmp.pdf'})" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from langchain_community.document_loaders import OnlinePDFLoader\n", "\n", "loader = OnlinePDFLoader(\"https://arxiv.org/pdf/2302.03803.pdf\")\n", - "data = loader.load()" + "data = loader.load()\n", + "data[0]" ] }, { @@ -362,10 +422,11 @@ "from langchain_community.document_loaders import PyPDFium2Loader\n", "\n", "file_path = (\n", - " \"../../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", + " \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", ")\n", "loader = PyPDFium2Loader(file_path)\n", - "data = loader.load()" + "data = loader.load()\n", + "data[0]" ] }, { @@ -378,18 +439,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "4f465592-15be-4b8f-8f8c-0ffe207d0e4d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='1\\n2\\n0\\n2\\n\\nn\\nu\\nJ\\n\\n1\\n2\\n\\n]\\n\\nV\\nC\\n.\\ns\\nc\\n[\\n\\n2\\nv\\n8\\n4\\n3\\n5\\n1\\n.\\n3\\n0\\n1\\n2\\n:\\nv\\ni\\nX\\nr\\na\\n\\nLayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\n\\nZejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n\\n1 Allen Institute for AI\\nshannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\n{melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n5 University of Waterloo\\nw422li@uwaterloo.ca\\n\\nAbstract. Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. 
Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportant innovations by a wide audience. Though there have been on-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\n\\nKeywords: Document Image Analysis Ā· Deep Learning Ā· Layout Analysis\\nĀ· Character Recognition Ā· Open Source library Ā· Toolkit.\\n\\n1\\n\\nIntroduction\\n\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classification [11,\\n\\n \\n \\n \\n \\n \\n \\n\\x0c2\\n\\nZ. 
Shen et al.\\n\\n37], layout detection [38, 22], table detection [26], and scene text detection [4].\\nA generalized learning-based framework dramatically reduces the need for the\\nmanual specification of complicated rules, which is the status quo with traditional\\nmethods. DL has the potential to transform DIA pipelines and benefit a broad\\nspectrum of large-scale document digitization projects.\\n\\nHowever, there are several practical difficulties for taking advantages of re-\\ncent advances in DL-based methods: 1) DL models are notoriously convoluted\\nfor reuse and extension. Existing models are developed using distinct frame-\\nworks like TensorFlow [1] or PyTorch [24], and the high-level parameters can\\nbe obfuscated by implementation details [8]. It can be a time-consuming and\\nfrustrating experience to debug, reproduce, and adapt existing models for DIA,\\nand many researchers who would benefit the most from using these methods lack\\nthe technical background to implement them from scratch. 2) Document images\\ncontain diverse and disparate patterns across domains, and customized training\\nis often required to achieve a desirable detection accuracy. Currently there is no\\nfull-fledged infrastructure for easily curating the target document image datasets\\nand fine-tuning or re-training the models. 3) DIA usually requires a sequence of\\nmodels and other processing to obtain the final outputs. Often research teams use\\nDL models and then perform further document analyses in separate processes,\\nand these pipelines are not documented in any central location (and often not\\ndocumented at all). This makes it difficult for research teams to learn about how\\nfull pipelines are implemented and leads them to invest significant resources in\\nreinventing the DIA wheel.\\n\\nLayoutParser provides a unified toolkit to support DL-based document image\\nanalysis and processing. 
To address the aforementioned challenges, LayoutParser\\nis built with the following components:\\n\\n1. An off-the-shelf toolkit for applying DL models for layout detection, character\\n\\nrecognition, and other DIA tasks (Section 3)\\n\\n2. A rich repository of pre-trained neural network models (Model Zoo) that\\n\\nunderlies the off-the-shelf usage\\n\\n3. Comprehensive tools for efficient document image data annotation and model\\n\\ntuning to support different levels of customization\\n\\n4. A DL model hub and community platform for the easy sharing, distribu-\\ntion, and discussion of DIA models and pipelines, to promote reusability,\\nreproducibility, and extensibility (Section 4)\\n\\nThe library implements simple and intuitive Python APIs without sacrificing\\ngeneralizability and versatility, and can be easily installed via pip. Its convenient\\nfunctions for handling document image data can be seamlessly integrated with\\nexisting DIA pipelines. With detailed documentations and carefully curated\\ntutorials, we hope this tool will benefit a variety of end-users, and will lead to\\nadvances in applications in both industry and academic research.\\n\\nLayoutParser is well aligned with recent efforts for improving DL model\\nreusability in other disciplines like natural language processing [8, 34] and com-\\nputer vision [35], but with a focus on unique challenges in DIA. We show\\nLayoutParser can be applied in sophisticated and large-scale digitization projects\\n\\n\\x0cLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\n3\\n\\nthat require precision, efficiency, and robustness, as well as simple and light-\\nweight document processing tasks focusing on efficacy and flexibility (Section 5).\\nLayoutParser is being actively maintained, and support for more deep learning\\nmodels and novel methods in text-based layout analysis methods [37, 34] is\\nplanned.\\n\\nThe rest of the paper is organized as follows. 
Section 2 provides an overview\\nof related work. The core LayoutParser library, DL Model Zoo, and customized\\nmodel training are described in Section 3, and the DL model hub and commu-\\nnity platform are detailed in Section 4. Section 5 shows two examples of how\\nLayoutParser can be used in practical DIA projects, and Section 6 concludes.\\n\\n2 Related Work\\n\\nRecently, various DL models and datasets have been developed for layout analysis\\ntasks. The dhSegment [22] utilizes fully convolutional networks [20] for segmen-\\ntation tasks on historical documents. Object detection-based methods like Faster\\nR-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38]\\nand detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also\\nbeen used in table detection [27]. However, these models are usually implemented\\nindividually and there is no unified framework to load and use such models.\\n\\nThere has been a surge of interest in creating open-source tools for document\\nimage processing: a search of document image analysis in Github leads to 5M\\nrelevant code pieces 6; yet most of them rely on traditional rule-based methods\\nor provide limited functionalities. The closest prior research to our work is the\\nOCR-D project7, which also tries to build a complete toolkit for DIA. However,\\nsimilar to the platform developed by Neudecker et al. [21], it is designed for\\nanalyzing historical documents, and provides no supports for recent DL models.\\nThe DocumentLayoutAnalysis project8 focuses on processing born-digital PDF\\ndocuments via analyzing the stored PDF data. Repositories like DeepLayout9\\nand Detectron2-PubLayNet10 are individual deep learning models trained on\\nlayout analysis datasets without support for the full DIA pipeline. 
The Document\\nAnalysis and Exploitation (DAE) platform [15] and the DeepDIVA project [2]\\naim to improve the reproducibility of DIA methods (or DL models), yet they\\nare not actively maintained. OCR engines like Tesseract [14], easyOCR11 and\\npaddleOCR12 usually do not come with comprehensive functionalities for other\\nDIA tasks like layout analysis.\\n\\nRecent years have also seen numerous efforts to create libraries for promoting\\nreproducibility and reusability in the field of DL. Libraries like Dectectron2 [35],\\n\\n6 The number shown is obtained by specifying the search type as ā€˜code’.\\n7 https://ocr-d.de/en/about\\n8 https://github.com/BobLd/DocumentLayoutAnalysis\\n9 https://github.com/leonlulu/DeepLayout\\n10 https://github.com/hpanwar08/detectron2\\n11 https://github.com/JaidedAI/EasyOCR\\n12 https://github.com/PaddlePaddle/PaddleOCR\\n\\n\\x0c4\\n\\nZ. Shen et al.\\n\\nFig. 1: The overall architecture of LayoutParser. For an input document image,\\nthe core LayoutParser library provides a set of off-the-shelf tools for layout\\ndetection, OCR, visualization, and storage, backed by a carefully designed layout\\ndata structure. LayoutParser also supports high level customization via efficient\\nlayout annotation and model training functions. These improve model accuracy\\non the target samples. The community platform enables the easy sharing of DIA\\nmodels and whole digitization pipelines to promote reusability and reproducibility.\\nA collection of detailed documentation, tutorials and exemplar projects make\\nLayoutParser easy to learn and use.\\n\\nAllenNLP [8] and transformers [34] have provided the community with complete\\nDL-based support for developing and deploying models for general computer\\nvision and natural language processing problems. LayoutParser, on the other\\nhand, specializes specifically in DIA tasks. 
LayoutParser is also equipped with a\\ncommunity platform inspired by established model hubs such as Torch Hub [23]\\nand TensorFlow Hub [1]. It enables the sharing of pretrained models as well as\\nfull document processing pipelines that are unique to DIA tasks.\\n\\nThere have been a variety of document data collections to facilitate the\\ndevelopment of DL models. Some examples include PRImA [3](magazine layouts),\\nPubLayNet [38](academic paper layouts), Table Bank [18](tables in academic\\npapers), Newspaper Navigator Dataset [16, 17](newspaper figure layouts) and\\nHJDataset [31](historical Japanese document layouts). A spectrum of models\\ntrained on these datasets are currently available in the LayoutParser model zoo\\nto support different use cases.\\n\\n3 The Core LayoutParser Library\\n\\nAt the core of LayoutParser is an off-the-shelf toolkit that streamlines DL-\\nbased document image analysis. Five components support a simple interface\\nwith comprehensive functionalities: 1) The layout detection models enable using\\npre-trained or self-trained DL models for layout detection with just four lines\\nof code. 
2) The detected layout information is stored in carefully engineered\\n\\nEfficient Data AnnotationCustomized Model TrainingModel CustomizationDIA Model HubDIA Pipeline SharingCommunity PlatformLayout Detection ModelsDocument Images The Core LayoutParser LibraryOCR ModuleStorage & VisualizationLayout Data Structure\\x0cLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\n5\\n\\nTable 1: Current layout detection models in the LayoutParser model zoo\\n\\nDataset\\n\\nBase Model1 Large Model Notes\\n\\nPubLayNet [38]\\nPRImA [3]\\nNewspaper [17]\\nTableBank [18]\\nHJDataset [31]\\n\\nF / M\\nM\\nF\\nF\\nF / M\\n\\nM\\n-\\n-\\nF\\n-\\n\\nLayouts of modern scientific documents\\nLayouts of scanned modern magazines and scientific reports\\nLayouts of scanned US newspapers from the 20th century\\nTable region on modern scientific and business document\\nLayouts of history Japanese documents\\n\\n1 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy\\nvs. computational cost). For ā€œbase modelā€ and ā€œlarge modelā€, we refer to using the ResNet 50 or ResNet 101\\nbackbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask\\nR-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained\\nusing the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model\\nzoo in coming months.\\n\\nlayout data structures, which are optimized for efficiency and versatility. 3) When\\nnecessary, users can employ existing or customized OCR models via the unified\\nAPI provided in the OCR module. 4) LayoutParser comes with a set of utility\\nfunctions for the visualization and storage of the layout data. 5) LayoutParser\\nis also highly customizable, via its integration with functions for layout data\\nannotation and model training. 
We now provide detailed descriptions for each\\ncomponent.\\n\\n3.1 Layout Detection Models\\n\\nIn LayoutParser, a layout model takes a document image as an input and\\ngenerates a list of rectangular boxes for the target content regions. Different\\nfrom traditional methods, it relies on deep convolutional neural networks rather\\nthan manually curated rules to identify content regions. It is formulated as an\\nobject detection problem and state-of-the-art models like Faster R-CNN [28] and\\nMask R-CNN [12] are used. This yields prediction results of high accuracy and\\nmakes it possible to build a concise, generalized interface for layout detection.\\nLayoutParser, built upon Detectron2 [35], provides a minimal API that can\\nperform layout detection with only four lines of code in Python:\\n\\n1 import layoutparser as lp\\n2 image = cv2 . imread ( \" image_file \" ) # load images\\n3 model = lp . De t e c tro n2 Lay outM odel (\\n\\n\" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config \" )\\n\\n4\\n5 layout = model . detect ( image )\\n\\nLayoutParser provides a wealth of pre-trained model weights using various\\ndatasets covering different languages, time periods, and document types. Due to\\ndomain shift [7], the prediction performance can notably drop when models are ap-\\nplied to target samples that are significantly different from the training dataset. As\\ndocument structures and layouts vary greatly in different domains, it is important\\nto select models trained on a dataset similar to the test samples. A semantic syntax\\nis used for initializing the model weights in LayoutParser, using both the dataset\\nname and model name lp:///.\\n\\n\\x0c6\\n\\nZ. Shen et al.\\n\\nFig. 
2: The relationship between the three types of layout data structures.\\nCoordinate supports three kinds of variation; TextBlock consists of the co-\\nordinate information and extra features like block text, types, and reading orders;\\na Layout object is a list of all possible layout elements, including other Layout\\nobjects. They all support the same set of transformation and operation APIs for\\nmaximum flexibility.\\n\\nShown in Table 1, LayoutParser currently hosts 9 pre-trained models trained\\non 5 different datasets. Description of the training dataset is provided alongside\\nwith the trained models such that users can quickly identify the most suitable\\nmodels for their tasks. Additionally, when such a model is not readily available,\\nLayoutParser also supports training customized layout models and community\\nsharing of the models (detailed in Section 3.5).\\n\\n3.2 Layout Data Structures\\n\\nA critical feature of LayoutParser is the implementation of a series of data\\nstructures and operations that can be used to efficiently process and manipulate\\nthe layout elements. In document image analysis pipelines, various post-processing\\non the layout analysis model outputs is usually required to obtain the final\\noutputs. Traditionally, this requires exporting DL model outputs and then loading\\nthe results into other pipelines. All model outputs from LayoutParser will be\\nstored in carefully engineered data types optimized for further processing, which\\nmakes it possible to build an end-to-end document digitization pipeline within\\nLayoutParser. There are three key components in the data structure, namely\\nthe Coordinate system, the TextBlock, and the Layout. They provide different\\nlevels of abstraction for the layout data, and a set of APIs are supported for\\ntransformations or operations on these classes.\\n\\n\\x0cLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\n7\\n\\nCoordinates are the cornerstones for storing layout information. 
Currently,\\nthree types of Coordinate data structures are provided in LayoutParser, shown\\nin Figure 2. Interval and Rectangle are the most common data types and\\nsupport specifying 1D or 2D regions within a document. They are parameterized\\nwith 2 and 4 parameters. A Quadrilateral class is also implemented to support\\na more generalized representation of rectangular regions when the document\\nis skewed or distorted, where the 4 corner points can be specified and a total\\nof 8 degrees of freedom are supported. A wide collection of transformations\\nlike shift, pad, and scale, and operations like intersect, union, and is_in,\\nare supported for these classes. Notably, it is common to separate a segment\\nof the image and analyze it individually. LayoutParser provides full support\\nfor this scenario via image cropping operations crop_image and coordinate\\ntransformations like relative_to and condition_on that transform coordinates\\nto and from their relative representations. We refer readers to Table 2 for a more\\ndetailed description of these operations13.\\n\\nBased on Coordinates, we implement the TextBlock class that stores both\\nthe positional and extra features of individual layout elements. It also supports\\nspecifying the reading orders via setting the parent field to the index of the parent\\nobject. A Layout class is built that takes in a list of TextBlocks and supports\\nprocessing the elements in batch. Layout can also be nested to support hierarchical\\nlayout structures. They support the same operations and transformations as the\\nCoordinate classes, minimizing both learning and deployment effort.\\n\\n3.3 OCR\\n\\nLayoutParser provides a unified interface for existing OCR tools. Though there\\nare many OCR tools available, they are usually configured differently with distinct\\nAPIs or protocols for using them. 
It can be inefficient to add new OCR tools into\\nan existing pipeline, and difficult to make direct comparisons among the available\\ntools to find the best option for a particular project. To this end, LayoutParser\\nbuilds a series of wrappers among existing OCR engines, and provides nearly\\nthe same syntax for using them. It supports a plug-and-play style of using OCR\\nengines, making it effortless to switch, evaluate, and compare different OCR\\nmodules:\\n\\n1 ocr_agent = lp . TesseractAgent ()\\n2 # Can be easily switched to other OCR software\\n3 tokens = ocr_agent . detect ( image )\\n\\nThe OCR outputs will also be stored in the aforementioned layout data\\nstructures and can be seamlessly incorporated into the digitization pipeline.\\nCurrently LayoutParser supports the Tesseract and Google Cloud Vision OCR\\nengines.\\n\\nLayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained\\nwith the Connectionist Temporal Classification (CTC) loss [10]. It can be used\\nlike the other OCR modules, and can be easily trained on customized datasets.\\n\\n13 This is also available in the LayoutParser documentation pages.\\n\\n\\x0c8\\n\\nZ. Shen et al.\\n\\nTable 2: All operations supported by the layout elements. 
The same APIs are\\nsupported across different layout element classes including Coordinate types,\\nTextBlock and Layout.\\n\\nOperation Name\\n\\nDescription\\n\\nblock.pad(top, bottom, right, left) Enlarge the current block according to the input\\n\\nblock.scale(fx, fy)\\n\\nblock.shift(dx, dy)\\n\\nScale the current block given the ratio\\nin x and y direction\\n\\nMove the current block with the shift\\ndistances in x and y direction\\n\\nblock1.is in(block2)\\n\\nWhether block1 is inside of block2\\n\\nblock1.intersect(block2)\\n\\nblock1.union(block2)\\n\\nblock1.relative to(block2)\\n\\nblock1.condition on(block2)\\n\\nReturn the intersection region of block1 and block2.\\nCoordinate type to be determined based on the inputs.\\n\\nReturn the union region of block1 and block2.\\nCoordinate type to be determined based on the inputs.\\n\\nConvert the absolute coordinates of block1 to\\nrelative coordinates to block2\\n\\nCalculate the absolute coordinates of block1 given\\nthe canvas block2’s absolute coordinates\\n\\nblock.crop image(image)\\n\\nObtain the image segments in the block region\\n\\n3.4 Storage and visualization\\n\\nThe end goal of DIA is to transform the image-based document data into a\\nstructured database. LayoutParser supports exporting layout data into different\\nformats like JSON, csv, and will add the support for the METS/ALTO XML\\nformat 14 . It can also load datasets from layout analysis-specific formats like\\nCOCO [38] and the Page Format [25] for training layout models (Section 3.5).\\nVisualization of the layout detection results is critical for both presentation\\nand debugging. LayoutParser is built with an integrated API for displaying the\\nlayout information along with the original document image. Shown in Figure 3, it\\nenables presenting layout data with rich meta information and features in different\\nmodes. 
More detailed information can be found in the online LayoutParser\\ndocumentation page.\\n\\n3.5 Customized Model Training\\n\\nBesides the off-the-shelf library, LayoutParser is also highly customizable with\\nsupports for highly unique and challenging document analysis tasks. Target\\ndocument images can be vastly different from the existing datasets for train-\\ning layout models, which leads to low layout detection accuracy. Training data\\n\\n14 https://altoxml.github.io\\n\\n\\x0cLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\n9\\n\\nFig. 3: Layout detection and OCR results visualization generated by the\\nLayoutParser APIs. Mode I directly overlays the layout region bounding boxes\\nand categories over the original image. Mode II recreates the original document\\nvia drawing the OCR’d texts at their corresponding positions on the image\\ncanvas. In this figure, tokens in textual regions are filtered using the API and\\nthen displayed.\\n\\ncan also be highly sensitive and not sharable publicly. To overcome these chal-\\nlenges, LayoutParser is built with rich features for efficient data annotation and\\ncustomized model training.\\n\\nLayoutParser incorporates a toolkit optimized for annotating document lay-\\nouts using object-level active learning [32]. With the help from a layout detection\\nmodel trained along with labeling, only the most important layout objects within\\neach image, rather than the whole image, are required for labeling. The rest of\\nthe regions are automatically annotated with high confidence predictions from\\nthe layout detection model. This allows a layout dataset to be created more\\nefficiently with only around 60% of the labeling budget.\\n\\nAfter the training dataset is curated, LayoutParser supports different modes\\nfor training the layout models. Fine-tuning can be used for training models on a\\nsmall newly-labeled dataset by initializing the model with existing pre-trained\\nweights. 
Training from scratch can be helpful when the source dataset and\\ntarget are significantly different and a large training set is available. However, as\\nsuggested in Studer et al.’s work[33], loading pre-trained weights on large-scale\\ndatasets like ImageNet [5], even from totally different domains, can still boost\\nmodel performance. Through the integrated API provided by LayoutParser,\\nusers can easily compare model performances on the benchmark datasets.\\n\\n\\x0c10\\n\\nZ. Shen et al.\\n\\nFig. 4: Illustration of (a) the original historical Japanese document with layout\\ndetection results and (b) a recreated version of the document image that achieves\\nmuch better character recognition recall. The reorganization algorithm rearranges\\nthe tokens based on the their detected bounding boxes given a maximum allowed\\nheight.\\n\\n4 LayoutParser Community Platform\\n\\nAnother focus of LayoutParser is promoting the reusability of layout detection\\nmodels and full digitization pipelines. Similar to many existing deep learning\\nlibraries, LayoutParser comes with a community model hub for distributing\\nlayout models. End-users can upload their self-trained models to the model hub,\\nand these models can be loaded into a similar interface as the currently available\\nLayoutParser pre-trained models. For example, the model trained on the News\\nNavigator dataset [17] has been incorporated in the model hub.\\n\\nBeyond DL models, LayoutParser also promotes the sharing of entire doc-\\nument digitization pipelines. For example, sometimes the pipeline requires the\\ncombination of multiple DL models to achieve better accuracy. Currently, pipelines\\nare mainly described in academic papers and implementations are often not pub-\\nlicly available. 
To this end, the LayoutParser community platform also enables\\nthe sharing of layout pipelines to promote the discussion and reuse of techniques.\\nFor each shared pipeline, it has a dedicated project page, with links to the source\\ncode, documentation, and an outline of the approaches. A discussion panel is\\nprovided for exchanging ideas. Combined with the core LayoutParser library,\\nusers can easily build reusable components based on the shared pipelines and\\napply them to solve their unique problems.\\n\\n5 Use Cases\\n\\nThe core objective of LayoutParser is to make it easier to create both large-scale\\nand light-weight document digitization pipelines. Large-scale document processing\\n\\n\\x0cLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\n11\\n\\nfocuses on precision, efficiency, and robustness. The target documents may have\\ncomplicated structures, and may require training multiple layout detection models\\nto achieve the optimal accuracy. Light-weight pipelines are built for relatively\\nsimple documents, with an emphasis on development ease, speed and flexibility.\\nIdeally one only needs to use existing resources, and model training should be\\navoided. Through two exemplar projects, we show how practitioners in both\\nacademia and industry can easily build such pipelines using LayoutParser and\\nextract high-quality structured document data for their downstream tasks. The\\nsource code for these projects will be publicly available in the LayoutParser\\ncommunity hub.\\n\\n5.1 A Comprehensive Historical Document Digitization Pipeline\\n\\nThe digitization of historical documents can unlock valuable data that can shed\\nlight on many important social, economic, and historical questions. 
Yet due to\\nscan noises, page wearing, and the prevalence of complicated layout structures, ob-\\ntaining a structured representation of historical document scans is often extremely\\ncomplicated.\\nIn this example, LayoutParser was\\nused to develop a comprehensive\\npipeline, shown in Figure 5, to gener-\\nate high-quality structured data from\\nhistorical Japanese firm financial ta-\\nbles with complicated layouts. The\\npipeline applies two layout models to\\nidentify different levels of document\\nstructures and two customized OCR\\nengines for optimized character recog-\\nnition accuracy.\\n\\nAs shown in Figure 4 (a), the\\ndocument contains columns of text\\nwritten vertically 15, a common style\\nin Japanese. Due to scanning noise\\nand archaic printing technology, the\\ncolumns can be skewed or have vari-\\nable widths, and hence cannot be eas-\\nily identified via rule-based methods.\\nWithin each column, words are sepa-\\nrated by white spaces of variable size,\\nand the vertical positions of objects\\ncan be an indicator of their layout\\ntype.\\n\\nFig. 5: Illustration of how LayoutParser\\nhelps with the historical document digi-\\ntization pipeline.\\n\\n15 A document page consists of eight rows like this. For simplicity we skip the row\\n\\nsegmentation discussion and refer readers to the source code when available.\\n\\n\\x0c12\\n\\nZ. Shen et al.\\n\\nTo decipher the complicated layout\\n\\nstructure, two object detection models have been trained to recognize individual\\ncolumns and tokens, respectively. A small training set (400 images with approxi-\\nmately 100 annotations each) is curated via the active learning based annotation\\ntool [32] in LayoutParser. The models learn to identify both the categories and\\nregions for each token or column via their distinct visual features. 
The layout\\ndata structure enables easy grouping of the tokens within each column, and\\nrearranging columns to achieve the correct reading orders based on the horizontal\\nposition. Errors are identified and rectified via checking the consistency of the\\nmodel predictions. Therefore, though trained on a small dataset, the pipeline\\nachieves a high level of layout detection accuracy: it achieves a 96.97 AP [19]\\nscore across 5 categories for the column detection model, and a 89.23 AP across\\n4 categories for the token detection model.\\n\\nA combination of character recognition methods is developed to tackle the\\nunique challenges in this document. In our experiments, we found that irregular\\nspacing between the tokens led to a low character recognition recall rate, whereas\\nexisting OCR models tend to perform better on densely-arranged texts. To\\novercome this challenge, we create a document reorganization algorithm that\\nrearranges the text based on the token bounding boxes detected in the layout\\nanalysis step. Figure 4 (b) illustrates the generated image of dense text, which is\\nsent to the OCR APIs as a whole to reduce the transaction costs. The flexible\\ncoordinate system in LayoutParser is used to transform the OCR results relative\\nto their original positions on the page.\\n\\nAdditionally, it is common for historical documents to use unique fonts\\nwith different glyphs, which significantly degrades the accuracy of OCR models\\ntrained on modern texts. In this document, a special flat font is used for printing\\nnumbers and could not be detected by off-the-shelf OCR engines. Using the highly\\nflexible functionalities from LayoutParser, a pipeline approach is constructed\\nthat achieves a high recognition accuracy with minimal effort. As the characters\\nhave unique visual structures and are usually clustered together, we train the\\nlayout model to identify number regions with a dedicated category. 
Subsequently,\\nLayoutParser crops images within these regions, and identifies characters within\\nthem using a self-trained OCR model based on a CNN-RNN [6]. The model\\ndetects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and\\na 0.17 average Levinstein distances17 for token prediction on the test set.\\n\\nOverall, it is possible to create an intricate and highly accurate digitization\\npipeline for large-scale digitization using LayoutParser. The pipeline avoids\\nspecifying the complicated rules used in traditional methods, is straightforward\\nto develop, and is robust to outliers. The DL models also generate fine-grained\\nresults that enable creative approaches like page reorganization for OCR.\\n\\n16 This measures the overlap between the detected and ground-truth characters, and\\n\\nthe maximum is 1.\\n\\n17 This measures the number of edits from the ground-truth text to the predicted text,\\n\\nand lower is better.\\n\\n\\x0cLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\n13\\n\\nFig. 6: This lightweight table detector can identify tables (outlined in red) and\\ncells (shaded in blue) in different locations on a page. In very few cases (d), it\\nmight generate minor error predictions, e.g, failing to capture the top text line of\\na table.\\n\\n5.2 A light-weight Visual Table Extractor\\n\\nDetecting tables and parsing their structures (table extraction) are of central im-\\nportance for many document digitization tasks. Many previous works [26, 30, 27]\\nand tools 18 have been developed to identify and parse table structures. Yet they\\nmight require training complicated models from scratch, or are only applicable\\nfor born-digital PDF documents. 
In this section, we show how LayoutParser can\\nhelp build a light-weight accurate visual table extractor for legal docket tables\\nusing the existing resources with minimal effort.\\n\\nThe extractor uses a pre-trained layout detection model for identifying the\\ntable regions and some simple rules for pairing the rows and the columns in the\\nPDF image. Mask R-CNN [12] trained on the PubLayNet dataset [38] from the\\nLayoutParser Model Zoo can be used for detecting table regions. By filtering\\nout model predictions of low confidence and removing overlapping predictions,\\nLayoutParser can identify the tabular regions on each page, which significantly\\nsimplifies the subsequent steps. By applying the line detection functions within\\nthe tabular segments, provided in the utility module from LayoutParser, the\\npipeline can identify the three distinct columns in the tables. A row clustering\\nmethod is then applied via analyzing the y coordinates of token bounding boxes in\\nthe left-most column, which are obtained from the OCR engines. A non-maximal\\nsuppression algorithm is used to remove duplicated rows with extremely small\\ngaps. Shown in Figure 6, the built pipeline can detect tables at different positions\\non a page accurately. Continued tables from different pages are concatenated,\\nand a structured table representation has been easily created.\\n\\n18 https://github.com/atlanhq/camelot, https://github.com/tabulapdf/tabula\\n\\n\\x0c14\\n\\nZ. Shen et al.\\n\\n6 Conclusion\\n\\nLayoutParser provides a comprehensive toolkit for deep learning-based document\\nimage analysis. The off-the-shelf library is easy to install, and can be used to\\nbuild flexible and accurate pipelines for processing documents with complicated\\nstructures. It also supports high-level customization and enables easy labeling and\\ntraining of DL models on unique document image datasets. 
The LayoutParser\\ncommunity platform facilitates sharing DL models and DIA pipelines, inviting\\ndiscussion and promoting code reproducibility and reusability. The LayoutParser\\nteam is committed to keeping the library updated continuously and bringing\\nthe most recent advances in DL-based DIA, such as multi-modal document\\nmodeling [37, 36, 9] (an upcoming priority), to a diverse audience of end-users.\\n\\nAcknowledgements We thank the anonymous reviewers for their comments\\nand suggestions. This project is supported in part by NSF Grant OIA-2033558\\nand funding from the Harvard Data Science Initiative and Harvard Catalyst.\\nZejiang Shen thanks Doug Downey for suggestions.\\n\\nReferences\\n\\n[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado,\\nG.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A.,\\nIrving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg,\\nJ., ManĀ“e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J.,\\nSteiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V.,\\nViĀ“egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng,\\nX.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015),\\nhttps://www.tensorflow.org/, software available from tensorflow.org\\n\\n[2] Alberti, M., Pondenkandath, V., WĀØursch, M., Ingold, R., Liwicki, M.: Deepdiva: a\\nhighly-functional python framework for reproducible experiments. In: 2018 16th\\nInternational Conference on Frontiers in Handwriting Recognition (ICFHR). pp.\\n423–428. IEEE (2018)\\n\\n[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic\\ndataset for performance evaluation of document layout analysis. In: 2009 10th\\nInternational Conference on Document Analysis and Recognition. pp. 
296–300.\\nIEEE (2009)\\n\\n[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text\\ndetection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and\\nPattern Recognition. pp. 9365–9374 (2019)\\n\\n[5] Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A Large-Scale\\n\\nHierarchical Image Database. In: CVPR09 (2009)\\n\\n[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with\\ncoarse-to-fine attention. In: International Conference on Machine Learning. pp.\\n980–989. PMLR (2017)\\n\\n[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation.\\nIn: International conference on machine learning. pp. 1180–1189. PMLR (2015)\\n\\n\\x0cLayoutParser: A Unified Toolkit for DL-Based DIA\\n\\n15\\n\\n[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters,\\nM., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language\\nprocessing platform. arXiv preprint arXiv:1803.07640 (2018)\\n(cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P.,\\nGraliĀ“nski, F.: Lambert: Layout-aware (language) modeling using bert for in-\\nformation extraction (2020)\\n\\n[9]\\n\\n[10] Graves, A., FernĀ“andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal\\nclassification: labelling unsegmented sequence data with recurrent neural networks.\\nIn: Proceedings of the 23rd international conference on Machine learning. pp.\\n369–376 (2006)\\n\\n[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for\\ndocument image classification and retrieval. In: 2015 13th International Conference\\non Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015)\\n[12] He, K., Gkioxari, G., DollĀ“ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the\\n\\nIEEE international conference on computer vision. pp. 
2961–2969 (2017)\\n\\n[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition.\\nIn: Proceedings of the IEEE conference on computer vision and pattern recognition.\\npp. 770–778 (2016)\\n\\n[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J.\\n\\n2007(159), 2 (Jul 2007)\\n\\n[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis\\nbenchmarking. In: 2011 International Conference on Document Analysis and\\nRecognition. pp. 42–47. IEEE (2011)\\n\\n[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5\\nmillion images. In: Adjunct Publication of the 33rd Annual ACM Sym-\\nposium on User\\nInterface Software and Technology. p. 120–122. UIST\\n’20 Adjunct, Association for Computing Machinery, New York, NY, USA\\n(2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus.\\nlib.washington.edu/10.1145/3379350.3416143\\n\\n[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N.,\\nThomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting\\nHeadlines and Visual Content from 16 Million Historic Newspaper Pages in\\nChronicling America, p. 3055–3062. Association for Computing Machinery, New\\nYork, NY, USA (2020), https://doi.org/10.1145/3340531.3412767\\n\\n[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark\\nfor image-based table detection and recognition. arXiv preprint arXiv:1903.01949\\n(2019)\\n\\n[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., DollĀ“ar, P.,\\nZitnick, C.L.: Microsoft coco: Common objects in context. In: European conference\\non computer vision. pp. 740–755. Springer (2014)\\n\\n[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic\\nsegmentation. In: Proceedings of the IEEE conference on computer vision and\\npattern recognition. pp. 
3431–3440 (2015)\\n\\n[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten-\\ncroft, K.: An experimental workflow development platform for historical document\\ndigitisation and analysis. In: Proceedings of the 2011 workshop on historical\\ndocument imaging and processing. pp. 161–168 (2011)\\n\\n[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach\\nfor document segmentation. In: 2018 16th International Conference on Frontiers\\nin Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)\\n\\n\\x0c16\\n\\nZ. Shen et al.\\n\\n[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z.,\\nDesmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017)\\n[24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen,\\nT., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style,\\nhigh-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019)\\n[25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth\\nelements) format framework. In: 2010 20th International Conference on Pattern\\nRecognition. pp. 257–260. IEEE (2010)\\n\\n[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet:\\nAn approach for end to end table detection and structure recognition from image-\\nbased documents. In: Proceedings of the IEEE/CVF Conference on Computer\\nVision and Pattern Recognition Workshops. pp. 572–573 (2020)\\n\\n[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph\\nneural networks. In: 2019 International Conference on Document Analysis and\\nRecognition (ICDAR). pp. 142–147. IEEE (2019)\\n\\n[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object\\ndetection with region proposal networks. In: Advances in neural information\\nprocessing systems. pp. 
91–99 (2015)\\n\\n[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph\\nneural network model. IEEE transactions on neural networks 20(1), 61–80 (2008)\\n[30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning\\nfor detection and structure recognition of tables in document images. In: 2017 14th\\nIAPR international conference on document analysis and recognition (ICDAR).\\nvol. 1, pp. 1162–1167. IEEE (2017)\\n\\n[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents\\nwith complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer\\nVision and Pattern Recognition Workshops. pp. 548–549 (2020)\\n\\n[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning\\n\\nbased layout annotation. arXiv preprint arXiv:2010.01762 (2020)\\n\\n[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer,\\nA., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for\\nhistorical document image analysis. In: 2019 International Conference on Document\\nAnalysis and Recognition (ICDAR). pp. 720–725. IEEE (2019)\\n\\n[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P.,\\nRault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of-\\nthe-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)\\n[35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. https://\\n\\ngithub.com/facebookresearch/detectron2 (2019)\\n\\n[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C.,\\nChe, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document\\nunderstanding. 
arXiv preprint arXiv:2012.14740 (2020)\\n\\n[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of\\n\\ntext and layout for document image understanding (2019)\\n\\n[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:\\n\\nlayout analysis.\\n\\nument\\nAnalysis and Recognition (ICDAR). pp. 1015–1022.\\nhttps://doi.org/10.1109/ICDAR.2019.00166\\n\\nlargest dataset ever for doc-\\nIn: 2019 International Conference on Document\\nIEEE (Sep 2019).\\n\\n\\x0c', metadata={'source': '../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf'})" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from langchain_community.document_loaders import PDFMinerLoader\n", "\n", "file_path = (\n", - " \"../../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", + " \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", ")\n", "loader = PDFMinerLoader(file_path)\n", - "data = loader.load()" + "data = loader.load()\n", + "data[0]" ] }, { @@ -404,36 +477,48 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 35, "id": "2d39159e-61a5-4ac2-a6c2-3981c3aa6f4d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='\\n\\n\\n\\n\\n
1\\n
2\\n
0\\n
2\\n
n\\n
u\\n
J\\n
1\\n
2\\n
]\\n
V\\n
C\\n
.\\n
s\\n
c\\n
[\\n
2\\n
v\\n
8\\n
4\\n
3\\n
5\\n
1\\n
.\\n
3\\n
0\\n
1\\n
2\\n
:\\n
v\\n
i\\n
X\\n
r\\n
a\\n
LayoutParser: A Unified Toolkit for Deep\\n
Learning Based Document Image Analysis\\n
Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\n
Lee
4, Jacob Carlson3, and Weining Li5\\n
1 Allen Institute for AI\\n
shannons@allenai.org\\n
2 Brown University\\n
ruochen zhang@brown.edu\\n
3 Harvard University\\n
{melissadell,jacob carlson}@fas.harvard.edu\\n
4 University of Washington\\n
bcgl@cs.washington.edu\\n
5 University of Waterloo\\n
w422li@uwaterloo.ca\\n
Abstract. Recent advances in document image analysis (DIA) have been\\n
primarily driven by the application of neural networks. Ideally, research\\n
outcomes could be easily deployed in production and extended for further\\n
investigation. However, various factors like loosely organized codebases\\n
and sophisticated model configurations complicate the easy reuse of im-\\n
portant innovations by a wide audience. Though there have been on-going\\n
efforts to improve reusability and simplify deep learning (DL) model\\n
development in disciplines like natural language processing and computer\\n
vision, none of them are optimized for challenges in the domain of DIA.\\n
This represents a major gap in the existing toolkit, as DIA is central to\\n
academic research across a wide range of disciplines in the social sciences\\n
and humanities. This paper introduces
LayoutParser, an open-source\\n
library for streamlining the usage of DL in DIA research and applica-\\n
tions. The core
LayoutParser library comes with a set of simple and\\n
intuitive interfaces for applying and customizing DL models for layout de-\\n
tection, character recognition, and many other document processing tasks.\\n
To promote extensibility,
LayoutParser also incorporates a community\\n
platform for sharing both pre-trained models and full document digiti-\\n
zation pipelines. We demonstrate that LayoutParser is helpful for both\\n
lightweight and large-scale digitization pipelines in real-word use cases.\\n
The library is publicly available at
https://layout-parser.github.io.\\n
Keywords: Document Image Analysis · Deep Learning · Layout Analysis\\n
· Character Recognition · Open Source library · Toolkit.\\n
1\\n
Introduction\\n
Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\n
document image analysis (DIA) tasks including document image classification [11,\\n
\\n\\n \\n
\\n
\\n
\\n
\\n
\\n
\\n\\n
2\\n
Z. Shen et al.\\n
37], layout detection [38, 22], table detection [26], and scene text detection [4].\\n
A generalized learning-based framework dramatically reduces the need for the\\n
manual specification of complicated rules, which is the status quo with traditional\\n
methods. DL has the potential to transform DIA pipelines and benefit a broad\\n
spectrum of large-scale document digitization projects.\\n
However, there are several practical difficulties for taking advantages of re-\\n
cent advances in DL-based methods: 1) DL models are notoriously convoluted\\n
for reuse and extension. Existing models are developed using distinct frame-\\n
works like TensorFlow [1] or PyTorch [24], and the high-level parameters can\\n
be obfuscated by implementation details [8]. It can be a time-consuming and\\n
frustrating experience to debug, reproduce, and adapt existing models for DIA,\\n
and
many researchers who would benefit the most from using these methods lack\\n
the technical background to implement them from scratch.
2) Document images\\n
contain diverse and disparate patterns across domains, and customized training\\n
is often required to achieve a desirable detection accuracy. Currently there is no\\n
full-fledged infrastructure for easily curating the target document image datasets\\n
and fine-tuning or re-training the models.
3) DIA usually requires a sequence of\\n
models and other processing to obtain the final outputs. Often research teams use\\n
DL models and then perform further document analyses in separate processes,\\n
and these pipelines are not documented in any central location (and often not\\n
documented at all). This makes it
difficult for research teams to learn about how\\n
full pipelines are implemented
and leads them to invest significant resources in\\n
reinventing the DIA wheel
.\\n
LayoutParser provides a unified toolkit to support DL-based document image\\n
analysis and processing. To address the aforementioned challenges,
LayoutParser\\n
is built with the following components:\\n
1. An off-the-shelf toolkit for applying DL models for layout detection, character\\n
recognition, and other DIA tasks (Section 3)\\n
2. A rich repository of pre-trained neural network models (Model Zoo) that\\n
underlies the off-the-shelf usage\\n
3. Comprehensive tools for efficient document image data annotation and model\\n
tuning to support different levels of customization\\n
4. A DL model hub and community platform for the easy sharing, distribu-\\n
tion, and discussion of DIA models and pipelines, to promote reusability,\\n
reproducibility, and extensibility (Section 4)\\n
The library implements simple and intuitive Python APIs without sacrificing\\n
generalizability and versatility, and can be easily installed via pip. Its convenient\\n
functions for handling document image data can be seamlessly integrated with\\n
existing DIA pipelines. With detailed documentations and carefully curated\\n
tutorials, we hope this tool will benefit a variety of end-users, and will lead to\\n
advances in applications in both industry and academic research.\\n
LayoutParser is well aligned with recent efforts for improving DL model\\n
reusability in other disciplines like natural language processing [8, 34] and com-\\n
puter vision [35], but with a focus on unique challenges in DIA. We show\\n
LayoutParser can be applied in sophisticated and large-scale digitization projects\\n
\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
3\\n
that require precision, efficiency, and robustness, as well as simple and light-\\n
weight document processing tasks focusing on efficacy and flexibility (Section 5).\\n
LayoutParser is being actively maintained, and support for more deep learning\\n
models and novel methods in text-based layout analysis methods [37, 34] is\\n
planned.\\n
The rest of the paper is organized as follows. Section 2 provides an overview\\n
of related work. The core
LayoutParser library, DL Model Zoo, and customized\\n
model training are described in Section 3, and the DL model hub and commu-\\n
nity platform are detailed in Section 4. Section 5 shows two examples of how\\n
LayoutParser can be used in practical DIA projects, and Section 6 concludes.\\n
2 Related Work\\n
Recently, various DL models and datasets have been developed for layout analysis\\n
tasks. The dhSegment [22] utilizes fully convolutional networks [20] for segmen-\\n
tation tasks on historical documents. Object detection-based methods like Faster\\n
R-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38]\\n
and detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also\\n
been used in table detection [27]. However, these models are usually implemented\\n
individually and there is no unified framework to load and use such models.\\n
There has been a surge of interest in creating open-source tools for document\\n
image processing: a search of
document image analysis in Github leads to 5M\\n
relevant code pieces
6; yet most of them rely on traditional rule-based methods\\n
or provide limited functionalities. The closest prior research to our work is the\\n
OCR-D project
7, which also tries to build a complete toolkit for DIA. However,\\n
similar to the platform developed by Neudecker et al. [21], it is designed for\\n
analyzing historical documents, and provides no supports for recent DL models.\\n
The
DocumentLayoutAnalysis project8 focuses on processing born-digital PDF\\n
documents via analyzing the stored PDF data. Repositories like
DeepLayout9\\n
and Detectron2-PubLayNet10 are individual deep learning models trained on\\n
layout analysis datasets without support for the full DIA pipeline. The Document\\n
Analysis and Exploitation (DAE) platform [15] and the DeepDIVA project [2]\\n
aim to improve the reproducibility of DIA methods (or DL models), yet they\\n
are not actively maintained. OCR engines like
Tesseract [14], easyOCR11 and\\n
paddleOCR12 usually do not come with comprehensive functionalities for other\\n
DIA tasks like layout analysis.\\n
Recent years have also seen numerous efforts to create libraries for promoting\\n
reproducibility and reusability in the field of DL. Libraries like Dectectron2 [35],\\n
6 The number shown is obtained by specifying the search type as ‘code’.\\n
7 https://ocr-d.de/en/about\\n
8 https://github.com/BobLd/DocumentLayoutAnalysis\\n
9 https://github.com/leonlulu/DeepLayout\\n
10 https://github.com/hpanwar08/detectron2\\n
11 https://github.com/JaidedAI/EasyOCR\\n
12 https://github.com/PaddlePaddle/PaddleOCR\\n
\\n\\n\\n
4\\n
Z. Shen et al.\\n
Fig. 1: The overall architecture of LayoutParser. For an input document image,\\n
the core LayoutParser library provides a set of off-the-shelf tools for layout\\n
detection, OCR, visualization, and storage, backed by a carefully designed layout\\n
data structure.
LayoutParser also supports high level customization via efficient\\n
layout annotation and model training functions. These improve model accuracy\\n
on the target samples. The community platform enables the easy sharing of DIA\\n
models and whole digitization pipelines to promote reusability and reproducibility.\\n
A collection of detailed documentation, tutorials and exemplar projects make\\n
LayoutParser easy to learn and use.\\n
AllenNLP [8] and transformers [34] have provided the community with complete\\n
DL-based support for developing and deploying models for general computer\\n
vision and natural language processing problems.
LayoutParser, on the other\\n
hand, specializes specifically in DIA tasks.
LayoutParser is also equipped with a\\n
community platform inspired by established model hubs such as
Torch Hub [23]\\n
and
TensorFlow Hub [1]. It enables the sharing of pretrained models as well as\\n
full document processing pipelines that are unique to DIA tasks.\\n
There have been a variety of document data collections to facilitate the\\n
development of DL models. Some examples include PRImA [3](magazine layouts),\\n
PubLayNet [38](academic paper layouts), Table Bank [18](tables in academic\\n
papers), Newspaper Navigator Dataset [16, 17](newspaper figure layouts) and\\n
HJDataset [31](historical Japanese document layouts). A spectrum of models\\n
trained on these datasets are currently available in the LayoutParser model zoo\\n
to support different use cases.\\n
3 The Core LayoutParser Library\\n
At the core of LayoutParser is an off-the-shelf toolkit that streamlines DL-\\n
based document image analysis. Five components support a simple interface\\n
with comprehensive functionalities: 1) The
layout detection models enable using\\n
pre-trained or self-trained DL models for layout detection with just four lines\\n
of code. 2) The detected layout information is stored in carefully engineered\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nEfficient Data Annotation\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCustomized Model Training\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nModel Customization\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nDIA Model Hub\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nDIA Pipeline Sharing\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCommunity Platform\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLayout Detection Models\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nDocument Images \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nThe Core LayoutParser Library\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nOCR Module\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nStorage & 
Visualization\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLayout Data Structure\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
5\\n
Table 1: Current layout detection models in the LayoutParser model zoo\\n
Dataset\\n
Base Model1 Large Model Notes\\n
PubLayNet [38]\\n
PRImA [3]\\n
Newspaper [17]\\n
TableBank [18]\\n
HJDataset [31]\\n
F / M\\n
M\\n
F\\n
F\\n
F / M\\n
M\\n
-\\n
-\\n
F\\n
-\\n
Layouts of modern scientific documents\\n
Layouts of scanned modern magazines and scientific reports\\n
Layouts of scanned US newspapers from the 20th century\\n
Table region on modern scientific and business document\\n
Layouts of history Japanese documents\\n
1 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy\\n
vs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101\\n
backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask\\n
R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained\\n
using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model\\n
zoo in coming months.\\n
layout data structures, which are optimized for efficiency and versatility. 3) When\\n
necessary, users can employ existing or customized OCR models via the unified\\n
API provided in the
OCR module. 4) LayoutParser comes with a set of utility\\n
functions for the
visualization and storage of the layout data. 5) LayoutParser\\n
is also highly customizable, via its integration with functions for layout data\\n
annotation and model training
. We now provide detailed descriptions for each\\n
component.\\n
3.1 Layout Detection Models\\n
In LayoutParser, a layout model takes a document image as an input and\\n
generates a list of rectangular boxes for the target content regions. Different\\n
from traditional methods, it relies on deep convolutional neural networks rather\\n
than manually curated rules to identify content regions. It is formulated as an\\n
object detection problem and state-of-the-art models like Faster R-CNN [28] and\\n
Mask R-CNN [12] are used. This yields prediction results of high accuracy and\\n
makes it possible to build a concise, generalized interface for layout detection.\\n
LayoutParser, built upon Detectron2 [35], provides a minimal API that can\\n
perform layout detection with only four lines of code in Python:\\n
1 import layoutparser as lp\\n
2 image = cv2 . imread ( " image_file " ) # load images\\n
3 model = lp . De t e c tro n2 Lay outM odel (\\n
" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config " )\\n
4\\n
5
layout = model . detect ( image )\\n
LayoutParser provides a wealth of pre-trained model weights using various\\n
datasets covering different languages, time periods, and document types. Due to\\n
domain shift [7], the prediction performance can notably drop when models are ap-\\n
plied to target samples that are significantly different from the training dataset. As\\n
document structures and layouts vary greatly in different domains, it is important\\n
to select models trained on a dataset similar to the test samples. A semantic syntax\\n
is used for initializing the model weights in
LayoutParser, using both the dataset\\n
name and model name lp://<dataset-name>/<model-architecture-name>.\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
6\\n
Z. Shen et al.\\n
Fig. 2: The relationship between the three types of layout data structures.\\n
Coordinate supports three kinds of variation; TextBlock consists of the co-\\n
ordinate information and extra features like block text, types, and reading orders;\\n
a
Layout object is a list of all possible layout elements, including other Layout\\n
objects. They all support the same set of transformation and operation APIs for\\n
maximum flexibility.\\n
Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained\\n
on 5 different datasets. Description of the training dataset is provided alongside\\n
with the trained models such that users can quickly identify the most suitable\\n
models for their tasks. Additionally, when such a model is not readily available,\\n
LayoutParser also supports training customized layout models and community\\n
sharing of the models (detailed in Section 3.5).\\n
3.2 Layout Data Structures\\n
A critical feature of LayoutParser is the implementation of a series of data\\n
structures and operations that can be used to efficiently process and manipulate\\n
the layout elements. In document image analysis pipelines, various post-processing\\n
on the layout analysis model outputs is usually required to obtain the final\\n
outputs. Traditionally, this requires exporting DL model outputs and then loading\\n
the results into other pipelines. All model outputs from
LayoutParser will be\\n
stored in carefully engineered data types optimized for further processing, which\\n
makes it possible to build an end-to-end document digitization pipeline within\\n
LayoutParser. There are three key components in the data structure, namely\\n
the
Coordinate system, the TextBlock, and the Layout. They provide different\\n
levels of abstraction for the layout data, and a set of APIs are supported for\\n
transformations or operations on these classes.\\n
\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
7\\n
Coordinates are the cornerstones for storing layout information. Currently,\\n
three types of
Coordinate data structures are provided in LayoutParser, shown\\n
in Figure 2.
Interval and Rectangle are the most common data types and\\n
support specifying 1D or 2D regions within a document. They are parameterized\\n
with 2 and 4 parameters. A
Quadrilateral class is also implemented to support\\n
a more generalized representation of rectangular regions when the document\\n
is skewed or distorted, where the 4 corner points can be specified and a total\\n
of 8 degrees of freedom are supported. A wide collection of transformations\\n
like
shift, pad, and scale, and operations like intersect, union, and is_in,\\n
are supported for these classes. Notably, it is common to separate a segment\\n
of the image and analyze it individually.
LayoutParser provides full support\\n
for this scenario via image cropping operations
crop_image and coordinate\\n
transformations like
relative_to and condition_on that transform coordinates\\n
to and from their relative representations. We refer readers to Table 2 for a more\\n
detailed description of these operations13.\\n
Based on Coordinates, we implement the TextBlock class that stores both\\n
the positional and extra features of individual layout elements. It also supports\\n
specifying the reading orders via setting the
parent field to the index of the parent\\n
object. A
Layout class is built that takes in a list of TextBlocks and supports\\n
processing the elements in batch.
Layout can also be nested to support hierarchical\\n
layout structures. They support the same operations and transformations as the\\n
Coordinate classes, minimizing both learning and deployment effort.\\n
3.3 OCR\\n
LayoutParser provides a unified interface for existing OCR tools. Though there\\n
are many OCR tools available, they are usually configured differently with distinct\\n
APIs or protocols for using them. It can be inefficient to add new OCR tools into\\n
an existing pipeline, and difficult to make direct comparisons among the available\\n
tools to find the best option for a particular project. To this end,
LayoutParser\\n
builds a series of wrappers among existing OCR engines, and provides nearly\\n
the same syntax for using them. It supports a plug-and-play style of using OCR\\n
engines, making it effortless to switch, evaluate, and compare different OCR\\n
modules:\\n
1 ocr_agent = lp . TesseractAgent ()\\n
2 # Can be easily switched to other OCR software\\n
3 tokens = ocr_agent . detect ( image )\\n
The OCR outputs will also be stored in the aforementioned layout data\\n
structures and can be seamlessly incorporated into the digitization pipeline.\\n
Currently
LayoutParser supports the Tesseract and Google Cloud Vision OCR\\n
engines.\\n
LayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained\\n
with the Connectionist Temporal Classification (CTC) loss [10]. It can be used\\n
like the other OCR modules, and can be easily trained on customized datasets.\\n
13 This is also available in the LayoutParser documentation pages.\\n
\\n\\n\\n\\n\\n\\n
8\\n
Z. Shen et al.\\n
Table 2: All operations supported by the layout elements. The same APIs are\\n
supported across different layout element classes including
Coordinate types,\\n
TextBlock and Layout.\\n
Operation Name\\n
Description\\n
block.pad(top, bottom, right, left) Enlarge the current block according to the input\\n
block.scale(fx, fy)\\n
block.shift(dx, dy)\\n
Scale the current block given the ratio\\n
in x and y direction\\n
Move the current block with the shift\\n
distances in x and y direction\\n
block1.is in(block2)\\n
Whether block1 is inside of block2\\n
block1.intersect(block2)\\n
block1.union(block2)\\n
block1.relative to(block2)\\n
block1.condition on(block2)\\n
Return the intersection region of block1 and block2.\\n
Coordinate type to be determined based on the inputs.\\n
Return the union region of block1 and block2.\\n
Coordinate type to be determined based on the inputs.\\n
Convert the absolute coordinates of block1 to\\n
relative coordinates to block2\\n
Calculate the absolute coordinates of block1 given\\n
the canvas block2’s absolute coordinates\\n
block.crop image(image)\\n
Obtain the image segments in the block region\\n
3.4 Storage and visualization\\n
The end goal of DIA is to transform the image-based document data into a\\n
structured database.
LayoutParser supports exporting layout data into different\\n
formats like
JSON, csv, and will add the support for the METS/ALTO XML\\n
format
14 . It can also load datasets from layout analysis-specific formats like\\n
COCO [38] and the Page Format [25] for training layout models (Section 3.5).\\n
Visualization of the layout detection results is critical for both presentation\\n
and debugging.
LayoutParser is built with an integrated API for displaying the\\n
layout information along with the original document image. Shown in Figure 3, it\\n
enables presenting layout data with rich meta information and features in different\\n
modes. More detailed information can be found in the online
LayoutParser\\n
documentation page.\\n
3.5 Customized Model Training\\n
Besides the off-the-shelf library, LayoutParser is also highly customizable with\\n
supports for highly unique and challenging document analysis tasks. Target\\n
document images can be vastly different from the existing datasets for train-\\n
ing layout models, which leads to low layout detection accuracy. Training data\\n
14 https://altoxml.github.io\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
9\\n
Fig. 3: Layout detection and OCR results visualization generated by the\\n
LayoutParser APIs. Mode I directly overlays the layout region bounding boxes\\n
and categories over the original image. Mode II recreates the original document\\n
via drawing the OCR’d texts at their corresponding positions on the image\\n
canvas. In this figure, tokens in textual regions are filtered using the API and\\n
then displayed.\\n
can also be highly sensitive and not sharable publicly. To overcome these chal-\\n
lenges,
LayoutParser is built with rich features for efficient data annotation and\\n
customized model training.\\n
LayoutParser incorporates a toolkit optimized for annotating document lay-\\n
outs using object-level active learning [32]. With the help from a layout detection\\n
model trained along with labeling, only the most important layout objects within\\n
each image, rather than the whole image, are required for labeling. The rest of\\n
the regions are automatically annotated with high confidence predictions from\\n
the layout detection model. This allows a layout dataset to be created more\\n
efficiently with only around 60% of the labeling budget.\\n
After the training dataset is curated, LayoutParser supports different modes\\n
for training the layout models.
Fine-tuning can be used for training models on a\\n
small newly-labeled dataset by initializing the model with existing pre-trained\\n
weights.
Training from scratch can be helpful when the source dataset and\\n
target are significantly different and a large training set is available. However, as\\n
suggested in Studer et al.’s work[33], loading pre-trained weights on large-scale\\n
datasets like ImageNet [5], even from totally different domains, can still boost\\n
model performance. Through the integrated API provided by
LayoutParser,\\n
users can easily compare model performances on the benchmark datasets.\\n
\\n\\n
10\\n
Z. Shen et al.\\n
Fig. 4: Illustration of (a) the original historical Japanese document with layout\\n
detection results and (b) a recreated version of the document image that achieves\\n
much better character recognition recall. The reorganization algorithm rearranges\\n
the tokens based on the their detected bounding boxes given a maximum allowed\\n
height.\\n
4 LayoutParser Community Platform\\n
Another focus of LayoutParser is promoting the reusability of layout detection\\n
models and full digitization pipelines. Similar to many existing deep learning\\n
libraries,
LayoutParser comes with a community model hub for distributing\\n
layout models. End-users can upload their self-trained models to the model hub,\\n
and these models can be loaded into a similar interface as the currently available\\n
LayoutParser pre-trained models. For example, the model trained on the News\\n
Navigator dataset [17] has been incorporated in the model hub.\\n
Beyond DL models, LayoutParser also promotes the sharing of entire doc-\\n
ument digitization pipelines. For example, sometimes the pipeline requires the\\n
combination of multiple DL models to achieve better accuracy. Currently, pipelines\\n
are mainly described in academic papers and implementations are often not pub-\\n
licly available. To this end, the
LayoutParser community platform also enables\\n
the sharing of layout pipelines to promote the discussion and reuse of techniques.\\n
For each shared pipeline, it has a dedicated project page, with links to the source\\n
code, documentation, and an outline of the approaches. A discussion panel is\\n
provided for exchanging ideas. Combined with the core
LayoutParser library,\\n
users can easily build reusable components based on the shared pipelines and\\n
apply them to solve their unique problems.\\n
5 Use Cases\\n
The core objective of LayoutParser is to make it easier to create both large-scale\\n
and light-weight document digitization pipelines. Large-scale document processing\\n
\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
11\\n
focuses on precision, efficiency, and robustness. The target documents may have\\n
complicated structures, and may require training multiple layout detection models\\n
to achieve the optimal accuracy. Light-weight pipelines are built for relatively\\n
simple documents, with an emphasis on development ease, speed and flexibility.\\n
Ideally one only needs to use existing resources, and model training should be\\n
avoided. Through two exemplar projects, we show how practitioners in both\\n
academia and industry can easily build such pipelines using
LayoutParser and\\n
extract high-quality structured document data for their downstream tasks. The\\n
source code for these projects will be publicly available in the
LayoutParser\\n
community hub.\\n
5.1 A Comprehensive Historical Document Digitization Pipeline\\n
The digitization of historical documents can unlock valuable data that can shed\\n
light on many important social, economic, and historical questions. Yet due to\\n
scan noises, page wearing, and the prevalence of complicated layout structures, ob-\\n
taining a structured representation of historical document scans is often extremely\\n
complicated.\\n
In this example,
LayoutParser was\\n
used to develop a comprehensive\\n
pipeline, shown in Figure 5, to gener-\\n
ate high-quality structured data from\\n
historical Japanese firm financial ta-\\n
bles with complicated layouts. The\\n
pipeline applies two layout models to\\n
identify different levels of document\\n
structures and two customized OCR\\n
engines for optimized character recog-\\n
nition accuracy.\\n
As shown in Figure 4 (a), the\\n
document contains columns of text\\n
written vertically
15, a common style\\n
in Japanese. Due to scanning noise\\n
and archaic printing technology, the\\n
columns can be skewed or have vari-\\n
able widths, and hence cannot be eas-\\n
ily identified via rule-based methods.\\n
Within each column, words are sepa-\\n
rated by white spaces of variable size,\\n
and the vertical positions of objects\\n
can be an indicator of their layout\\n
type.\\n
Fig. 5: Illustration of how LayoutParser\\n
helps with the historical document digi-\\n
tization pipeline.\\n
15 A document page consists of eight rows like this. For simplicity we skip the row\\n
segmentation discussion and refer readers to the source code when available.\\n
\\n\\n\\n
12\\n
Z. Shen et al.\\n
To decipher the complicated layout\\n
structure, two object detection models have been trained to recognize individual\\n
columns and tokens, respectively. A small training set (400 images with approxi-\\n
mately 100 annotations each) is curated via the active learning based annotation\\n
tool [32] in
LayoutParser. The models learn to identify both the categories and\\n
regions for each token or column via their distinct visual features. The layout\\n
data structure enables easy grouping of the tokens within each column, and\\n
rearranging columns to achieve the correct reading orders based on the horizontal\\n
position. Errors are identified and rectified via checking the consistency of the\\n
model predictions. Therefore, though trained on a small dataset, the pipeline\\n
achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19]\\n
score across 5 categories for the column detection model, and a 89.23 AP across\\n
4 categories for the token detection model.\\n
A combination of character recognition methods is developed to tackle the\\n
unique challenges in this document. In our experiments, we found that irregular\\n
spacing between the tokens led to a low character recognition recall rate, whereas\\n
existing OCR models tend to perform better on densely-arranged texts. To\\n
overcome this challenge, we create a document reorganization algorithm that\\n
rearranges the text based on the token bounding boxes detected in the layout\\n
analysis step. Figure 4 (b) illustrates the generated image of dense text, which is\\n
sent to the OCR APIs as a whole to reduce the transaction costs. The flexible\\n
coordinate system in
LayoutParser is used to transform the OCR results relative\\n
to their original positions on the page.\\n
Additionally, it is common for historical documents to use unique fonts\\n
with different glyphs, which significantly degrades the accuracy of OCR models\\n
trained on modern texts. In this document, a special flat font is used for printing\\n
numbers and could not be detected by off-the-shelf OCR engines. Using the highly\\n
flexible functionalities from
LayoutParser, a pipeline approach is constructed\\n
that achieves a high recognition accuracy with minimal effort. As the characters\\n
have unique visual structures and are usually clustered together, we train the\\n
layout model to identify number regions with a dedicated category. Subsequently,\\n
LayoutParser crops images within these regions, and identifies characters within\\n
them using a self-trained OCR model based on a CNN-RNN [6]. The model\\n
detects a total of 15 possible categories, and achieves a 0.98 Jaccard score
16 and\\n
a 0.17 average Levinstein distances17 for token prediction on the test set.\\n
Overall, it is possible to create an intricate and highly accurate digitization\\n
pipeline for large-scale digitization using
LayoutParser. The pipeline avoids\\n
specifying the complicated rules used in traditional methods, is straightforward\\n
to develop, and is robust to outliers. The DL models also generate fine-grained\\n
results that enable creative approaches like page reorganization for OCR.\\n
16 This measures the overlap between the detected and ground-truth characters, and\\n
the maximum is 1.\\n
17 This measures the number of edits from the ground-truth text to the predicted text,\\n
and lower is better.\\n
\\n\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
13\\n
Fig. 6: This lightweight table detector can identify tables (outlined in red) and\\n
cells (shaded in blue) in different locations on a page. In very few cases (d), it\\n
might generate minor error predictions, e.g, failing to capture the top text line of\\n
a table.\\n
5.2 A light-weight Visual Table Extractor\\n
Detecting tables and parsing their structures (table extraction) are of central im-\\n
portance for many document digitization tasks. Many previous works [26, 30, 27]\\n
and tools
18 have been developed to identify and parse table structures. Yet they\\n
might require training complicated models from scratch, or are only applicable\\n
for born-digital PDF documents. In this section, we show how
LayoutParser can\\n
help build a light-weight accurate visual table extractor for legal docket tables\\n
using the existing resources with minimal effort.\\n
The extractor uses a pre-trained layout detection model for identifying the\\n
table regions and some simple rules for pairing the rows and the columns in the\\n
PDF image. Mask R-CNN [12] trained on the PubLayNet dataset [38] from the\\n
LayoutParser Model Zoo can be used for detecting table regions. By filtering\\n
out model predictions of low confidence and removing overlapping predictions,\\n
LayoutParser can identify the tabular regions on each page, which significantly\\n
simplifies the subsequent steps. By applying the line detection functions within\\n
the tabular segments, provided in the utility module from LayoutParser, the\\n
pipeline can identify the three distinct columns in the tables. A row clustering\\n
method is then applied via analyzing the y coordinates of token bounding boxes in\\n
the left-most column, which are obtained from the OCR engines. A non-maximal\\n
suppression algorithm is used to remove duplicated rows with extremely small\\n
gaps. Shown in Figure 6, the built pipeline can detect tables at different positions\\n
on a page accurately. Continued tables from different pages are concatenated,\\n
and a structured table representation has been easily created.\\n
18 https://github.com/atlanhq/camelot, https://github.com/tabulapdf/tabula\\n
\\n\\n\\n
14\\n
Z. Shen et al.\\n
6 Conclusion\\n
LayoutParser provides a comprehensive toolkit for deep learning-based document\\n
image analysis. The off-the-shelf library is easy to install, and can be used to\\n
build flexible and accurate pipelines for processing documents with complicated\\n
structures. It also supports high-level customization and enables easy labeling and\\n
training of DL models on unique document image datasets. The
LayoutParser\\n
community platform facilitates sharing DL models and DIA pipelines, inviting\\n
discussion and promoting code reproducibility and reusability. The
LayoutParser\\n
team is committed to keeping the library updated continuously and bringing\\n
the most recent advances in DL-based DIA, such as multi-modal document\\n
modeling [37, 36, 9] (an upcoming priority), to a diverse audience of end-users.\\n
Acknowledgements We thank the anonymous reviewers for their comments\\n
and suggestions. This project is supported in part by NSF Grant OIA-2033558\\n
and funding from the Harvard Data Science Initiative and Harvard Catalyst.\\n
Zejiang Shen thanks Doug Downey for suggestions.\\n
References\\n
[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado,\\n
G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A.,\\n
Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg,\\n
J., Man“e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J.,\\n
Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V.,\\n
Vi“egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng,\\n
X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015),\\n
https://www.tensorflow.org/, software available from tensorflow.org\\n
[2] Alberti, M., Pondenkandath, V., WĀØursch, M., Ingold, R., Liwicki, M.: Deepdiva: a\\n
highly-functional python framework for reproducible experiments. In: 2018 16th\\n
International Conference on Frontiers in Handwriting Recognition (ICFHR). pp.\\n
423–428. IEEE (2018)\\n
[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic\\n
dataset for performance evaluation of document layout analysis. In: 2009 10th\\n
International Conference on Document Analysis and Recognition. pp. 296–300.\\n
IEEE (2009)\\n
[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text\\n
detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and\\n
Pattern Recognition. pp. 9365–9374 (2019)\\n
[5] Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A Large-Scale\\n
Hierarchical Image Database. In: CVPR09 (2009)\\n
[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with\\n
coarse-to-fine attention. In: International Conference on Machine Learning. pp.\\n
980–989. PMLR (2017)\\n
[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation.\\n
In: International conference on machine learning. pp. 1180–1189. PMLR (2015)\\n
\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
15\\n
[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters,\\n
M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language\\n
processing platform. arXiv preprint arXiv:1803.07640 (2018)\\n
(cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P.,\\n
Grali“nski, F.: Lambert: Layout-aware (language) modeling using bert for in-\\n
formation extraction (2020)\\n
[9]\\n
[10] Graves, A., Fern“andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal\\n
classification: labelling unsegmented sequence data with recurrent neural networks.\\n
In: Proceedings of the 23rd international conference on Machine learning. pp.\\n
369–376 (2006)\\n
[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for\\n
document image classification and retrieval. In: 2015 13th International Conference\\n
on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015)\\n
[12] He, K., Gkioxari, G., Doll“ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the\\n
IEEE international conference on computer vision. pp. 2961–2969 (2017)\\n
[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition.\\n
In: Proceedings of the IEEE conference on computer vision and pattern recognition.\\n
pp. 770–778 (2016)\\n
[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J.\\n
2007(159), 2 (Jul 2007)\\n
[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis\\n
benchmarking. In: 2011 International Conference on Document Analysis and\\n
Recognition. pp. 42–47. IEEE (2011)\\n
[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5\\n
million images. In: Adjunct Publication of the 33rd Annual ACM Sym-\\n
posium on User\\n
Interface Software and Technology. p. 120–122. UIST\\n
’20 Adjunct, Association for Computing Machinery, New York, NY, USA\\n
(2020). https://doi.org/10.1145/3379350.3416143,
https://doi-org.offcampus.\\n
lib.washington.edu/10.1145/3379350.3416143\\n
[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N.,\\n
Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting\\n
Headlines and Visual Content from 16 Million Historic Newspaper Pages in\\n
Chronicling America, p. 3055–3062. Association for Computing Machinery, New\\n
York, NY, USA (2020),
https://doi.org/10.1145/3340531.3412767\\n
[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark\\n
for image-based table detection and recognition. arXiv preprint arXiv:1903.01949\\n
(2019)\\n
[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll“ar, P.,\\n
Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference\\n
on computer vision. pp. 740–755. Springer (2014)\\n
[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic\\n
segmentation. In: Proceedings of the IEEE conference on computer vision and\\n
pattern recognition. pp. 3431–3440 (2015)\\n
[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten-\\n
croft, K.: An experimental workflow development platform for historical document\\n
digitisation and analysis. In: Proceedings of the 2011 workshop on historical\\n
document imaging and processing. pp. 161–168 (2011)\\n
[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach\\n
for document segmentation. In: 2018 16th International Conference on Frontiers\\n
in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)\\n
\\n\\n
16\\n
Z. Shen et al.\\n
[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z.,\\n
Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017)\\n
[24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen,\\n
T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style,\\n
high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019)\\n
[25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth\\n
elements) format framework. In: 2010 20th International Conference on Pattern\\n
Recognition. pp. 257–260. IEEE (2010)\\n
[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet:\\n
An approach for end to end table detection and structure recognition from image-\\n
based documents. In: Proceedings of the IEEE/CVF Conference on Computer\\n
Vision and Pattern Recognition Workshops. pp. 572–573 (2020)\\n
[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph\\n
neural networks. In: 2019 International Conference on Document Analysis and\\n
Recognition (ICDAR). pp. 142–147. IEEE (2019)\\n
[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object\\n
detection with region proposal networks. In: Advances in neural information\\n
processing systems. pp. 91–99 (2015)\\n
[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph\\n
neural network model. IEEE transactions on neural networks
20(1), 61–80 (2008)\\n
[30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning\\n
for detection and structure recognition of tables in document images. In: 2017 14th\\n
IAPR international conference on document analysis and recognition (ICDAR).\\n
vol. 1, pp. 1162–1167. IEEE (2017)\\n
[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents\\n
with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer\\n
Vision and Pattern Recognition Workshops. pp. 548–549 (2020)\\n
[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning\\n
based layout annotation. arXiv preprint arXiv:2010.01762 (2020)\\n
[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer,\\n
A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for\\n
historical document image analysis. In: 2019 International Conference on Document\\n
Analysis and Recognition (ICDAR). pp. 720–725. IEEE (2019)\\n
[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P.,\\n
Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of-\\n
the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)\\n
[35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2.
https://\\n
github.com/facebookresearch/detectron2 (2019)\\n
[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C.,\\n
Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document\\n
understanding. arXiv preprint arXiv:2012.14740 (2020)\\n
[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of\\n
text and layout for document image understanding (2019)\\n
[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:\\n
layout analysis.\\n
ument\\n
Analysis and Recognition (ICDAR). pp. 1015–1022.\\n
https://doi.org/10.1109/ICDAR.2019.00166\\n
largest dataset ever for doc-\\n
In: 2019 International Conference on Document\\n
IEEE (Sep 2019).\\n
Page: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
\\n\\n', metadata={'source': '../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf'})" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader\n", "\n", "file_path = (\n", - " \"../../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", + " \"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\"\n", ")\n", "loader = PDFMinerPDFasHTMLLoader(file_path)\n", - "data = loader.load()[0]" + "docs = loader.load()\n", + "docs[0]" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 36, "id": "2f18fc1e-988f-4778-ab79-4fac739bec8f", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "\n", - "soup = BeautifulSoup(data.page_content, \"html.parser\")\n", + "soup = BeautifulSoup(docs[0].page_content, \"html.parser\")\n", "content = soup.find_all(\"div\")" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 37, "id": "0b40f5bd-631e-4444-b79e-ef55e088807e", "metadata": {}, "outputs": [], @@ -469,10 +554,18 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 38, "id": "953b168f-4ae1-4279-b370-c21961206c0a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='Recently, various DL models and datasets have been developed for layout analysis\\ntasks. The dhSegment [22] utilizes fully convolutional networks [20] for segmen-\\ntation tasks on historical documents. Object detection-based methods like Faster\\nR-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38]\\nand detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also\\nbeen used in table detection [27]. 
However, these models are usually implemented\\nindividually and there is no unified framework to load and use such models.\\nThere has been a surge of interest in creating open-source tools for document\\nimage processing: a search of document image analysis in Github leads to 5M\\nrelevant code pieces 6; yet most of them rely on traditional rule-based methods\\nor provide limited functionalities. The closest prior research to our work is the\\nOCR-D project7, which also tries to build a complete toolkit for DIA. However,\\nsimilar to the platform developed by Neudecker et al. [21], it is designed for\\nanalyzing historical documents, and provides no supports for recent DL models.\\nThe DocumentLayoutAnalysis project8 focuses on processing born-digital PDF\\ndocuments via analyzing the stored PDF data. Repositories like DeepLayout9\\nand Detectron2-PubLayNet10 are individual deep learning models trained on\\nlayout analysis datasets without support for the full DIA pipeline. The Document\\nAnalysis and Exploitation (DAE) platform [15] and the DeepDIVA project [2]\\naim to improve the reproducibility of DIA methods (or DL models), yet they\\nare not actively maintained. OCR engines like Tesseract [14], easyOCR11 and\\npaddleOCR12 usually do not come with comprehensive functionalities for other\\nDIA tasks like layout analysis.\\nRecent years have also seen numerous efforts to create libraries for promoting\\nreproducibility and reusability in the field of DL. Libraries like Dectectron2 [35],\\n6 The number shown is obtained by specifying the search type as ā€˜code’.\\n7 https://ocr-d.de/en/about\\n8 https://github.com/BobLd/DocumentLayoutAnalysis\\n9 https://github.com/leonlulu/DeepLayout\\n10 https://github.com/hpanwar08/detectron2\\n11 https://github.com/JaidedAI/EasyOCR\\n12 https://github.com/PaddlePaddle/PaddleOCR\\n4\\nZ. Shen et al.\\nFig. 1: The overall architecture of LayoutParser. 
For an input document image,\\nthe core LayoutParser library provides a set of off-the-shelf tools for layout\\ndetection, OCR, visualization, and storage, backed by a carefully designed layout\\ndata structure. LayoutParser also supports high level customization via efficient\\nlayout annotation and model training functions. These improve model accuracy\\non the target samples. The community platform enables the easy sharing of DIA\\nmodels and whole digitization pipelines to promote reusability and reproducibility.\\nA collection of detailed documentation, tutorials and exemplar projects make\\nLayoutParser easy to learn and use.\\nAllenNLP [8] and transformers [34] have provided the community with complete\\nDL-based support for developing and deploying models for general computer\\nvision and natural language processing problems. LayoutParser, on the other\\nhand, specializes specifically in DIA tasks. LayoutParser is also equipped with a\\ncommunity platform inspired by established model hubs such as Torch Hub [23]\\nand TensorFlow Hub [1]. It enables the sharing of pretrained models as well as\\nfull document processing pipelines that are unique to DIA tasks.\\nThere have been a variety of document data collections to facilitate the\\ndevelopment of DL models. Some examples include PRImA [3](magazine layouts),\\nPubLayNet [38](academic paper layouts), Table Bank [18](tables in academic\\npapers), Newspaper Navigator Dataset [16, 17](newspaper figure layouts) and\\nHJDataset [31](historical Japanese document layouts). 
A spectrum of models\\ntrained on these datasets are currently available in the LayoutParser model zoo\\nto support different use cases.\\n' metadata={'heading': '2 Related Work\\n', 'content_font': 9, 'heading_font': 11, 'source': '../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf'}\n" + ] + } + ], "source": [ "from langchain_core.documents import Document\n", "\n", @@ -486,7 +579,7 @@ " or s[1] > semantic_snippets[cur_idx].metadata[\"heading_font\"]\n", " ):\n", " metadata = {\"heading\": s[0], \"content_font\": 0, \"heading_font\": s[1]}\n", - " metadata.update(data.metadata)\n", + " metadata.update(docs[0].metadata)\n", " semantic_snippets.append(Document(page_content=\"\", metadata=metadata))\n", " cur_idx += 1\n", " continue\n", @@ -506,30 +599,11 @@ " # if current snippet's font size > previous section's content but less than previous section's heading than also make a new\n", " # section (e.g. title of a PDF will have the highest font size but we don't want it to subsume all sections)\n", " metadata = {\"heading\": s[0], \"content_font\": 0, \"heading_font\": s[1]}\n", - " metadata.update(data.metadata)\n", + " metadata.update(docs[0].metadata)\n", " semantic_snippets.append(Document(page_content=\"\", metadata=metadata))\n", - " cur_idx += 1" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "9bf28b73-dad4-4f51-9238-4af523fa7225", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Document(page_content='Recently, various DL models and datasets have been developed for layout analysis\\ntasks. The dhSegment [22] utilizes fully convolutional networks [20] for segmen-\\ntation tasks on historical documents. Object detection-based methods like Faster\\nR-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38]\\nand detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also\\nbeen used in table detection [27]. 
However, these models are usually implemented\\nindividually and there is no unified framework to load and use such models.\\nThere has been a surge of interest in creating open-source tools for document\\nimage processing: a search of document image analysis in Github leads to 5M\\nrelevant code pieces 6; yet most of them rely on traditional rule-based methods\\nor provide limited functionalities. The closest prior research to our work is the\\nOCR-D project7, which also tries to build a complete toolkit for DIA. However,\\nsimilar to the platform developed by Neudecker et al. [21], it is designed for\\nanalyzing historical documents, and provides no supports for recent DL models.\\nThe DocumentLayoutAnalysis project8 focuses on processing born-digital PDF\\ndocuments via analyzing the stored PDF data. Repositories like DeepLayout9\\nand Detectron2-PubLayNet10 are individual deep learning models trained on\\nlayout analysis datasets without support for the full DIA pipeline. The Document\\nAnalysis and Exploitation (DAE) platform [15] and the DeepDIVA project [2]\\naim to improve the reproducibility of DIA methods (or DL models), yet they\\nare not actively maintained. OCR engines like Tesseract [14], easyOCR11 and\\npaddleOCR12 usually do not come with comprehensive functionalities for other\\nDIA tasks like layout analysis.\\nRecent years have also seen numerous efforts to create libraries for promoting\\nreproducibility and reusability in the field of DL. Libraries like Dectectron2 [35],\\n6 The number shown is obtained by specifying the search type as ā€˜code’.\\n7 https://ocr-d.de/en/about\\n8 https://github.com/BobLd/DocumentLayoutAnalysis\\n9 https://github.com/leonlulu/DeepLayout\\n10 https://github.com/hpanwar08/detectron2\\n11 https://github.com/JaidedAI/EasyOCR\\n12 https://github.com/PaddlePaddle/PaddleOCR\\n4\\nZ. Shen et al.\\nFig. 1: The overall architecture of LayoutParser. 
For an input document image,\\nthe core LayoutParser library provides a set of off-the-shelf tools for layout\\ndetection, OCR, visualization, and storage, backed by a carefully designed layout\\ndata structure. LayoutParser also supports high level customization via efficient\\nlayout annotation and model training functions. These improve model accuracy\\non the target samples. The community platform enables the easy sharing of DIA\\nmodels and whole digitization pipelines to promote reusability and reproducibility.\\nA collection of detailed documentation, tutorials and exemplar projects make\\nLayoutParser easy to learn and use.\\nAllenNLP [8] and transformers [34] have provided the community with complete\\nDL-based support for developing and deploying models for general computer\\nvision and natural language processing problems. LayoutParser, on the other\\nhand, specializes specifically in DIA tasks. LayoutParser is also equipped with a\\ncommunity platform inspired by established model hubs such as Torch Hub [23]\\nand TensorFlow Hub [1]. It enables the sharing of pretrained models as well as\\nfull document processing pipelines that are unique to DIA tasks.\\nThere have been a variety of document data collections to facilitate the\\ndevelopment of DL models. Some examples include PRImA [3](magazine layouts),\\nPubLayNet [38](academic paper layouts), Table Bank [18](tables in academic\\npapers), Newspaper Navigator Dataset [16, 17](newspaper figure layouts) and\\nHJDataset [31](historical Japanese document layouts). 
A spectrum of models\\ntrained on these datasets are currently available in the LayoutParser model zoo\\nto support different use cases.\\n', metadata={'heading': '2 Related Work\\n', 'content_font': 9, 'heading_font': 11, 'source': '../../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf'})" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "semantic_snippets[4]" + " cur_idx += 1\n", + "\n", + "print(semantic_snippets[4])" ] }, { @@ -544,26 +618,32 @@ }, { "cell_type": "code", - "execution_count": 30, - "id": "78e5a485-ff53-4b0c-ba5f-9f442079b529", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import PyPDFDirectoryLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 31, + "execution_count": 39, "id": "51b2fe13-3755-4031-b7ce-84d9983db71c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='\\n\\n\\n\\n\\n
1\\n
2\\n
0\\n
2\\n
n\\n
u\\n
J\\n
1\\n
2\\n
]\\n
V\\n
C\\n
.\\n
s\\n
c\\n
[\\n
2\\n
v\\n
8\\n
4\\n
3\\n
5\\n
1\\n
.\\n
3\\n
0\\n
1\\n
2\\n
:\\n
v\\n
i\\n
X\\n
r\\n
a\\n
LayoutParser: A Unified Toolkit for Deep\\n
Learning Based Document Image Analysis\\n
Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\n
Lee
4, Jacob Carlson3, and Weining Li5\\n
1 Allen Institute for AI\\n
shannons@allenai.org\\n
2 Brown University\\n
ruochen zhang@brown.edu\\n
3 Harvard University\\n
{melissadell,jacob carlson}@fas.harvard.edu\\n
4 University of Washington\\n
bcgl@cs.washington.edu\\n
5 University of Waterloo\\n
w422li@uwaterloo.ca\\n
Abstract. Recent advances in document image analysis (DIA) have been\\n
primarily driven by the application of neural networks. Ideally, research\\n
outcomes could be easily deployed in production and extended for further\\n
investigation. However, various factors like loosely organized codebases\\n
and sophisticated model configurations complicate the easy reuse of im-\\n
portant innovations by a wide audience. Though there have been on-going\\n
efforts to improve reusability and simplify deep learning (DL) model\\n
development in disciplines like natural language processing and computer\\n
vision, none of them are optimized for challenges in the domain of DIA.\\n
This represents a major gap in the existing toolkit, as DIA is central to\\n
academic research across a wide range of disciplines in the social sciences\\n
and humanities. This paper introduces
LayoutParser, an open-source\\n
library for streamlining the usage of DL in DIA research and applica-\\n
tions. The core
LayoutParser library comes with a set of simple and\\n
intuitive interfaces for applying and customizing DL models for layout de-\\n
tection, character recognition, and many other document processing tasks.\\n
To promote extensibility,
LayoutParser also incorporates a community\\n
platform for sharing both pre-trained models and full document digiti-\\n
zation pipelines. We demonstrate that LayoutParser is helpful for both\\n
lightweight and large-scale digitization pipelines in real-word use cases.\\n
The library is publicly available at
https://layout-parser.github.io.\\n
Keywords: Document Image Analysis Ā· Deep Learning Ā· Layout Analysis\\n
Ā· Character Recognition Ā· Open Source library Ā· Toolkit.\\n
1\\n
Introduction\\n
Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\n
document image analysis (DIA) tasks including document image classification [11,\\n
\\n\\n \\n
\\n
\\n
\\n
\\n
\\n
\\n\\n
2\\n
Z. Shen et al.\\n
37], layout detection [38, 22], table detection [26], and scene text detection [4].\\n
A generalized learning-based framework dramatically reduces the need for the\\n
manual specification of complicated rules, which is the status quo with traditional\\n
methods. DL has the potential to transform DIA pipelines and benefit a broad\\n
spectrum of large-scale document digitization projects.\\n
However, there are several practical difficulties for taking advantages of re-\\n
cent advances in DL-based methods: 1) DL models are notoriously convoluted\\n
for reuse and extension. Existing models are developed using distinct frame-\\n
works like TensorFlow [1] or PyTorch [24], and the high-level parameters can\\n
be obfuscated by implementation details [8]. It can be a time-consuming and\\n
frustrating experience to debug, reproduce, and adapt existing models for DIA,\\n
and
many researchers who would benefit the most from using these methods lack\\n
the technical background to implement them from scratch.
2) Document images\\n
contain diverse and disparate patterns across domains, and customized training\\n
is often required to achieve a desirable detection accuracy. Currently there is no\\n
full-fledged infrastructure for easily curating the target document image datasets\\n
and fine-tuning or re-training the models.
3) DIA usually requires a sequence of\\n
models and other processing to obtain the final outputs. Often research teams use\\n
DL models and then perform further document analyses in separate processes,\\n
and these pipelines are not documented in any central location (and often not\\n
documented at all). This makes it
difficult for research teams to learn about how\\n
full pipelines are implemented
and leads them to invest significant resources in\\n
reinventing the DIA wheel
.\\n
LayoutParser provides a unified toolkit to support DL-based document image\\n
analysis and processing. To address the aforementioned challenges,
LayoutParser\\n
is built with the following components:\\n
1. An off-the-shelf toolkit for applying DL models for layout detection, character\\n
recognition, and other DIA tasks (Section 3)\\n
2. A rich repository of pre-trained neural network models (Model Zoo) that\\n
underlies the off-the-shelf usage\\n
3. Comprehensive tools for efficient document image data annotation and model\\n
tuning to support different levels of customization\\n
4. A DL model hub and community platform for the easy sharing, distribu-\\n
tion, and discussion of DIA models and pipelines, to promote reusability,\\n
reproducibility, and extensibility (Section 4)\\n
The library implements simple and intuitive Python APIs without sacrificing\\n
generalizability and versatility, and can be easily installed via pip. Its convenient\\n
functions for handling document image data can be seamlessly integrated with\\n
existing DIA pipelines. With detailed documentations and carefully curated\\n
tutorials, we hope this tool will benefit a variety of end-users, and will lead to\\n
advances in applications in both industry and academic research.\\n
LayoutParser is well aligned with recent efforts for improving DL model\\n
reusability in other disciplines like natural language processing [8, 34] and com-\\n
puter vision [35], but with a focus on unique challenges in DIA. We show\\n
LayoutParser can be applied in sophisticated and large-scale digitization projects\\n
\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
3\\n
that require precision, efficiency, and robustness, as well as simple and light-\\n
weight document processing tasks focusing on efficacy and flexibility (Section 5).\\n
LayoutParser is being actively maintained, and support for more deep learning\\n
models and novel methods in text-based layout analysis methods [37, 34] is\\n
planned.\\n
The rest of the paper is organized as follows. Section 2 provides an overview\\n
of related work. The core
LayoutParser library, DL Model Zoo, and customized\\n
model training are described in Section 3, and the DL model hub and commu-\\n
nity platform are detailed in Section 4. Section 5 shows two examples of how\\n
LayoutParser can be used in practical DIA projects, and Section 6 concludes.\\n
2 Related Work\\n
Recently, various DL models and datasets have been developed for layout analysis\\n
tasks. The dhSegment [22] utilizes fully convolutional networks [20] for segmen-\\n
tation tasks on historical documents. Object detection-based methods like Faster\\n
R-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38]\\n
and detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also\\n
been used in table detection [27]. However, these models are usually implemented\\n
individually and there is no unified framework to load and use such models.\\n
There has been a surge of interest in creating open-source tools for document\\n
image processing: a search of
document image analysis in Github leads to 5M\\n
relevant code pieces
6; yet most of them rely on traditional rule-based methods\\n
or provide limited functionalities. The closest prior research to our work is the\\n
OCR-D project
7, which also tries to build a complete toolkit for DIA. However,\\n
similar to the platform developed by Neudecker et al. [21], it is designed for\\n
analyzing historical documents, and provides no supports for recent DL models.\\n
The
DocumentLayoutAnalysis project8 focuses on processing born-digital PDF\\n
documents via analyzing the stored PDF data. Repositories like
DeepLayout9\\n
and Detectron2-PubLayNet10 are individual deep learning models trained on\\n
layout analysis datasets without support for the full DIA pipeline. The Document\\n
Analysis and Exploitation (DAE) platform [15] and the DeepDIVA project [2]\\n
aim to improve the reproducibility of DIA methods (or DL models), yet they\\n
are not actively maintained. OCR engines like
Tesseract [14], easyOCR11 and\\n
paddleOCR12 usually do not come with comprehensive functionalities for other\\n
DIA tasks like layout analysis.\\n
Recent years have also seen numerous efforts to create libraries for promoting\\n
reproducibility and reusability in the field of DL. Libraries like Detectron2 [35],\\n
6 The number shown is obtained by specifying the search type as ‘code’.\\n
7 https://ocr-d.de/en/about\\n
8 https://github.com/BobLd/DocumentLayoutAnalysis\\n
9 https://github.com/leonlulu/DeepLayout\\n
10 https://github.com/hpanwar08/detectron2\\n
11 https://github.com/JaidedAI/EasyOCR\\n
12 https://github.com/PaddlePaddle/PaddleOCR\\n
\\n\\n\\n
4\\n
Z. Shen et al.\\n
Fig. 1: The overall architecture of LayoutParser. For an input document image,\\n
the core LayoutParser library provides a set of off-the-shelf tools for layout\\n
detection, OCR, visualization, and storage, backed by a carefully designed layout\\n
data structure.
LayoutParser also supports high level customization via efficient\\n
layout annotation and model training functions. These improve model accuracy\\n
on the target samples. The community platform enables the easy sharing of DIA\\n
models and whole digitization pipelines to promote reusability and reproducibility.\\n
A collection of detailed documentation, tutorials and exemplar projects make\\n
LayoutParser easy to learn and use.\\n
AllenNLP [8] and transformers [34] have provided the community with complete\\n
DL-based support for developing and deploying models for general computer\\n
vision and natural language processing problems.
LayoutParser, on the other\\n
hand, specializes specifically in DIA tasks.
LayoutParser is also equipped with a\\n
community platform inspired by established model hubs such as
Torch Hub [23]\\n
and
TensorFlow Hub [1]. It enables the sharing of pretrained models as well as\\n
full document processing pipelines that are unique to DIA tasks.\\n
There have been a variety of document data collections to facilitate the\\n
development of DL models. Some examples include PRImA [3](magazine layouts),\\n
PubLayNet [38](academic paper layouts), Table Bank [18](tables in academic\\n
papers), Newspaper Navigator Dataset [16, 17](newspaper figure layouts) and\\n
HJDataset [31](historical Japanese document layouts). A spectrum of models\\n
trained on these datasets are currently available in the LayoutParser model zoo\\n
to support different use cases.\\n
3 The Core LayoutParser Library\\n
At the core of LayoutParser is an off-the-shelf toolkit that streamlines DL-\\n
based document image analysis. Five components support a simple interface\\n
with comprehensive functionalities: 1) The
layout detection models enable using\\n
pre-trained or self-trained DL models for layout detection with just four lines\\n
of code. 2) The detected layout information is stored in carefully engineered\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nEfficient Data Annotation\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCustomized Model Training\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nModel Customization\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nDIA Model Hub\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nDIA Pipeline Sharing\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCommunity Platform\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLayout Detection Models\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nDocument Images \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nThe Core LayoutParser Library\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nOCR Module\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nStorage & 
Visualization\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nLayout Data Structure\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
5\\n
Table 1: Current layout detection models in the LayoutParser model zoo\\n
Dataset | Base Model1 | Large Model | Notes\\n
PubLayNet [38] | F / M | M | Layouts of modern scientific documents\\n
PRImA [3] | M | - | Layouts of scanned modern magazines and scientific reports\\n
Newspaper [17] | F | - | Layouts of scanned US newspapers from the 20th century\\n
TableBank [18] | F | F | Table region on modern scientific and business document\\n
HJDataset [31] | F / M | - | Layouts of historical Japanese documents\\n
1 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy\\n
vs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101\\n
backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask\\n
R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained\\n
using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model\\n
zoo in coming months.\\n
layout data structures, which are optimized for efficiency and versatility. 3) When\\n
necessary, users can employ existing or customized OCR models via the unified\\n
API provided in the
OCR module. 4) LayoutParser comes with a set of utility\\n
functions for the
visualization and storage of the layout data. 5) LayoutParser\\n
is also highly customizable, via its integration with functions for layout data\\n
annotation and model training
. We now provide detailed descriptions for each\\n
component.\\n
3.1 Layout Detection Models\\n
In LayoutParser, a layout model takes a document image as an input and\\n
generates a list of rectangular boxes for the target content regions. Different\\n
from traditional methods, it relies on deep convolutional neural networks rather\\n
than manually curated rules to identify content regions. It is formulated as an\\n
object detection problem and state-of-the-art models like Faster R-CNN [28] and\\n
Mask R-CNN [12] are used. This yields prediction results of high accuracy and\\n
makes it possible to build a concise, generalized interface for layout detection.\\n
LayoutParser, built upon Detectron2 [35], provides a minimal API that can\\n
perform layout detection with only four lines of code in Python:\\n
1 import layoutparser as lp\\n
2 image = cv2.imread("image_file") # load images\\n
3 model = lp.Detectron2LayoutModel(\\n
4 "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config")\\n
5 layout = model.detect(image)\\n
LayoutParser provides a wealth of pre-trained model weights using various\\n
datasets covering different languages, time periods, and document types. Due to\\n
domain shift [7], the prediction performance can notably drop when models are ap-\\n
plied to target samples that are significantly different from the training dataset. As\\n
document structures and layouts vary greatly in different domains, it is important\\n
to select models trained on a dataset similar to the test samples. A semantic syntax\\n
is used for initializing the model weights in
LayoutParser, using both the dataset\\n
name and model name lp://<dataset-name>/<model-architecture-name>.\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
6\\n
Z. Shen et al.\\n
Fig. 2: The relationship between the three types of layout data structures.\\n
Coordinate supports three kinds of variation; TextBlock consists of the co-\\n
ordinate information and extra features like block text, types, and reading orders;\\n
a
Layout object is a list of all possible layout elements, including other Layout\\n
objects. They all support the same set of transformation and operation APIs for\\n
maximum flexibility.\\n
Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained\\n
on 5 different datasets. Description of the training dataset is provided alongside\\n
with the trained models such that users can quickly identify the most suitable\\n
models for their tasks. Additionally, when such a model is not readily available,\\n
LayoutParser also supports training customized layout models and community\\n
sharing of the models (detailed in Section 3.5).\\n
3.2 Layout Data Structures\\n
A critical feature of LayoutParser is the implementation of a series of data\\n
structures and operations that can be used to efficiently process and manipulate\\n
the layout elements. In document image analysis pipelines, various post-processing\\n
on the layout analysis model outputs is usually required to obtain the final\\n
outputs. Traditionally, this requires exporting DL model outputs and then loading\\n
the results into other pipelines. All model outputs from
LayoutParser will be\\n
stored in carefully engineered data types optimized for further processing, which\\n
makes it possible to build an end-to-end document digitization pipeline within\\n
LayoutParser. There are three key components in the data structure, namely\\n
the
Coordinate system, the TextBlock, and the Layout. They provide different\\n
levels of abstraction for the layout data, and a set of APIs are supported for\\n
transformations or operations on these classes.\\n
\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
7\\n
Coordinates are the cornerstones for storing layout information. Currently,\\n
three types of
Coordinate data structures are provided in LayoutParser, shown\\n
in Figure 2.
Interval and Rectangle are the most common data types and\\n
support specifying 1D or 2D regions within a document. They are parameterized\\n
with 2 and 4 parameters. A
Quadrilateral class is also implemented to support\\n
a more generalized representation of rectangular regions when the document\\n
is skewed or distorted, where the 4 corner points can be specified and a total\\n
of 8 degrees of freedom are supported. A wide collection of transformations\\n
like
shift, pad, and scale, and operations like intersect, union, and is_in,\\n
are supported for these classes. Notably, it is common to separate a segment\\n
of the image and analyze it individually.
LayoutParser provides full support\\n
for this scenario via image cropping operations
crop_image and coordinate\\n
transformations like
relative_to and condition_on that transform coordinates\\n
to and from their relative representations. We refer readers to Table 2 for a more\\n
detailed description of these operations13.\\n
Based on Coordinates, we implement the TextBlock class that stores both\\n
the positional and extra features of individual layout elements. It also supports\\n
specifying the reading orders via setting the
parent field to the index of the parent\\n
object. A
Layout class is built that takes in a list of TextBlocks and supports\\n
processing the elements in batch.
Layout can also be nested to support hierarchical\\n
layout structures. They support the same operations and transformations as the\\n
Coordinate classes, minimizing both learning and deployment effort.\\n
3.3 OCR\\n
LayoutParser provides a unified interface for existing OCR tools. Though there\\n
are many OCR tools available, they are usually configured differently with distinct\\n
APIs or protocols for using them. It can be inefficient to add new OCR tools into\\n
an existing pipeline, and difficult to make direct comparisons among the available\\n
tools to find the best option for a particular project. To this end,
LayoutParser\\n
builds a series of wrappers among existing OCR engines, and provides nearly\\n
the same syntax for using them. It supports a plug-and-play style of using OCR\\n
engines, making it effortless to switch, evaluate, and compare different OCR\\n
modules:\\n
1 ocr_agent = lp.TesseractAgent()\\n
2 # Can be easily switched to other OCR software\\n
3 tokens = ocr_agent.detect(image)\\n
The OCR outputs will also be stored in the aforementioned layout data\\n
structures and can be seamlessly incorporated into the digitization pipeline.\\n
Currently
LayoutParser supports the Tesseract and Google Cloud Vision OCR\\n
engines.\\n
LayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained\\n
with the Connectionist Temporal Classification (CTC) loss [10]. It can be used\\n
like the other OCR modules, and can be easily trained on customized datasets.\\n
13 This is also available in the LayoutParser documentation pages.\\n
\\n\\n\\n\\n\\n\\n
8\\n
Z. Shen et al.\\n
Table 2: All operations supported by the layout elements. The same APIs are\\n
supported across different layout element classes including
Coordinate types,\\n
TextBlock and Layout.\\n
Operation Name | Description\\n
block.pad(top, bottom, right, left) | Enlarge the current block according to the input\\n
block.scale(fx, fy) | Scale the current block given the ratio in x and y direction\\n
block.shift(dx, dy) | Move the current block with the shift distances in x and y direction\\n
block1.is_in(block2) | Whether block1 is inside of block2\\n
block1.intersect(block2) | Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs.\\n
block1.union(block2) | Return the union region of block1 and block2. Coordinate type to be determined based on the inputs.\\n
block1.relative_to(block2) | Convert the absolute coordinates of block1 to relative coordinates to block2\\n
block1.condition_on(block2) | Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates\\n
block.crop_image(image) | Obtain the image segments in the block region\\n
3.4 Storage and visualization\\n
The end goal of DIA is to transform the image-based document data into a\\n
structured database.
LayoutParser supports exporting layout data into different\\n
formats like
JSON, csv, and will add the support for the METS/ALTO XML\\n
format
14 . It can also load datasets from layout analysis-specific formats like\\n
COCO [38] and the Page Format [25] for training layout models (Section 3.5).\\n
Visualization of the layout detection results is critical for both presentation\\n
and debugging.
LayoutParser is built with an integrated API for displaying the\\n
layout information along with the original document image. Shown in Figure 3, it\\n
enables presenting layout data with rich meta information and features in different\\n
modes. More detailed information can be found in the online
LayoutParser\\n
documentation page.\\n
3.5 Customized Model Training\\n
Besides the off-the-shelf library, LayoutParser is also highly customizable with\\n
supports for highly unique and challenging document analysis tasks. Target\\n
document images can be vastly different from the existing datasets for train-\\n
ing layout models, which leads to low layout detection accuracy. Training data\\n
14 https://altoxml.github.io\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
9\\n
Fig. 3: Layout detection and OCR results visualization generated by the\\n
LayoutParser APIs. Mode I directly overlays the layout region bounding boxes\\n
and categories over the original image. Mode II recreates the original document\\n
via drawing the OCR’d texts at their corresponding positions on the image\\n
canvas. In this figure, tokens in textual regions are filtered using the API and\\n
then displayed.\\n
can also be highly sensitive and not sharable publicly. To overcome these chal-\\n
lenges,
LayoutParser is built with rich features for efficient data annotation and\\n
customized model training.\\n
LayoutParser incorporates a toolkit optimized for annotating document lay-\\n
outs using object-level active learning [32]. With the help from a layout detection\\n
model trained along with labeling, only the most important layout objects within\\n
each image, rather than the whole image, are required for labeling. The rest of\\n
the regions are automatically annotated with high confidence predictions from\\n
the layout detection model. This allows a layout dataset to be created more\\n
efficiently with only around 60% of the labeling budget.\\n
After the training dataset is curated, LayoutParser supports different modes\\n
for training the layout models.
Fine-tuning can be used for training models on a\\n
small newly-labeled dataset by initializing the model with existing pre-trained\\n
weights.
Training from scratch can be helpful when the source dataset and\\n
target are significantly different and a large training set is available. However, as\\n
suggested in Studer et al.’s work [33], loading pre-trained weights on large-scale\\n
datasets like ImageNet [5], even from totally different domains, can still boost\\n
model performance. Through the integrated API provided by
LayoutParser,\\n
users can easily compare model performances on the benchmark datasets.\\n
\\n\\n
10\\n
Z. Shen et al.\\n
Fig. 4: Illustration of (a) the original historical Japanese document with layout\\n
detection results and (b) a recreated version of the document image that achieves\\n
much better character recognition recall. The reorganization algorithm rearranges\\n
the tokens based on their detected bounding boxes given a maximum allowed\\n
height.\\n
4 LayoutParser Community Platform\\n
Another focus of LayoutParser is promoting the reusability of layout detection\\n
models and full digitization pipelines. Similar to many existing deep learning\\n
libraries,
LayoutParser comes with a community model hub for distributing\\n
layout models. End-users can upload their self-trained models to the model hub,\\n
and these models can be loaded into a similar interface as the currently available\\n
LayoutParser pre-trained models. For example, the model trained on the News\\n
Navigator dataset [17] has been incorporated in the model hub.\\n
Beyond DL models, LayoutParser also promotes the sharing of entire doc-\\n
ument digitization pipelines. For example, sometimes the pipeline requires the\\n
combination of multiple DL models to achieve better accuracy. Currently, pipelines\\n
are mainly described in academic papers and implementations are often not pub-\\n
licly available. To this end, the
LayoutParser community platform also enables\\n
the sharing of layout pipelines to promote the discussion and reuse of techniques.\\n
For each shared pipeline, it has a dedicated project page, with links to the source\\n
code, documentation, and an outline of the approaches. A discussion panel is\\n
provided for exchanging ideas. Combined with the core
LayoutParser library,\\n
users can easily build reusable components based on the shared pipelines and\\n
apply them to solve their unique problems.\\n
5 Use Cases\\n
The core objective of LayoutParser is to make it easier to create both large-scale\\n
and light-weight document digitization pipelines. Large-scale document processing\\n
\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
11\\n
focuses on precision, efficiency, and robustness. The target documents may have\\n
complicated structures, and may require training multiple layout detection models\\n
to achieve the optimal accuracy. Light-weight pipelines are built for relatively\\n
simple documents, with an emphasis on development ease, speed and flexibility.\\n
Ideally one only needs to use existing resources, and model training should be\\n
avoided. Through two exemplar projects, we show how practitioners in both\\n
academia and industry can easily build such pipelines using
LayoutParser and\\n
extract high-quality structured document data for their downstream tasks. The\\n
source code for these projects will be publicly available in the
LayoutParser\\n
community hub.\\n
5.1 A Comprehensive Historical Document Digitization Pipeline\\n
The digitization of historical documents can unlock valuable data that can shed\\n
light on many important social, economic, and historical questions. Yet due to\\n
scan noises, page wearing, and the prevalence of complicated layout structures, ob-\\n
taining a structured representation of historical document scans is often extremely\\n
complicated.\\n
In this example,
LayoutParser was\\n
used to develop a comprehensive\\n
pipeline, shown in Figure 5, to gener-\\n
ate high-quality structured data from\\n
historical Japanese firm financial ta-\\n
bles with complicated layouts. The\\n
pipeline applies two layout models to\\n
identify different levels of document\\n
structures and two customized OCR\\n
engines for optimized character recog-\\n
nition accuracy.\\n
As shown in Figure 4 (a), the\\n
document contains columns of text\\n
written vertically
15, a common style\\n
in Japanese. Due to scanning noise\\n
and archaic printing technology, the\\n
columns can be skewed or have vari-\\n
able widths, and hence cannot be eas-\\n
ily identified via rule-based methods.\\n
Within each column, words are sepa-\\n
rated by white spaces of variable size,\\n
and the vertical positions of objects\\n
can be an indicator of their layout\\n
type.\\n
Fig. 5: Illustration of how LayoutParser\\n
helps with the historical document digi-\\n
tization pipeline.\\n
15 A document page consists of eight rows like this. For simplicity we skip the row\\n
segmentation discussion and refer readers to the source code when available.\\n
\\n\\n\\n
12\\n
Z. Shen et al.\\n
To decipher the complicated layout\\n
structure, two object detection models have been trained to recognize individual\\n
columns and tokens, respectively. A small training set (400 images with approxi-\\n
mately 100 annotations each) is curated via the active learning based annotation\\n
tool [32] in
LayoutParser. The models learn to identify both the categories and\\n
regions for each token or column via their distinct visual features. The layout\\n
data structure enables easy grouping of the tokens within each column, and\\n
rearranging columns to achieve the correct reading orders based on the horizontal\\n
position. Errors are identified and rectified via checking the consistency of the\\n
model predictions. Therefore, though trained on a small dataset, the pipeline\\n
achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19]\\n
score across 5 categories for the column detection model, and a 89.23 AP across\\n
4 categories for the token detection model.\\n
A combination of character recognition methods is developed to tackle the\\n
unique challenges in this document. In our experiments, we found that irregular\\n
spacing between the tokens led to a low character recognition recall rate, whereas\\n
existing OCR models tend to perform better on densely-arranged texts. To\\n
overcome this challenge, we create a document reorganization algorithm that\\n
rearranges the text based on the token bounding boxes detected in the layout\\n
analysis step. Figure 4 (b) illustrates the generated image of dense text, which is\\n
sent to the OCR APIs as a whole to reduce the transaction costs. The flexible\\n
coordinate system in
LayoutParser is used to transform the OCR results relative\\n
to their original positions on the page.\\n
Additionally, it is common for historical documents to use unique fonts\\n
with different glyphs, which significantly degrades the accuracy of OCR models\\n
trained on modern texts. In this document, a special flat font is used for printing\\n
numbers and could not be detected by off-the-shelf OCR engines. Using the highly\\n
flexible functionalities from
LayoutParser, a pipeline approach is constructed\\n
that achieves a high recognition accuracy with minimal effort. As the characters\\n
have unique visual structures and are usually clustered together, we train the\\n
layout model to identify number regions with a dedicated category. Subsequently,\\n
LayoutParser crops images within these regions, and identifies characters within\\n
them using a self-trained OCR model based on a CNN-RNN [6]. The model\\n
detects a total of 15 possible categories, and achieves a 0.98 Jaccard score
16 and\\n
a 0.17 average Levenshtein distance17 for token prediction on the test set.\\n
Overall, it is possible to create an intricate and highly accurate digitization\\n
pipeline for large-scale digitization using
LayoutParser. The pipeline avoids\\n
specifying the complicated rules used in traditional methods, is straightforward\\n
to develop, and is robust to outliers. The DL models also generate fine-grained\\n
results that enable creative approaches like page reorganization for OCR.\\n
16 This measures the overlap between the detected and ground-truth characters, and\\n
the maximum is 1.\\n
17 This measures the number of edits from the ground-truth text to the predicted text,\\n
and lower is better.\\n
\\n\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
13\\n
Fig. 6: This lightweight table detector can identify tables (outlined in red) and\\n
cells (shaded in blue) in different locations on a page. In very few cases (d), it\\n
might generate minor error predictions, e.g., failing to capture the top text line of\\n
a table.\\n
5.2 A light-weight Visual Table Extractor\\n
Detecting tables and parsing their structures (table extraction) are of central im-\\n
portance for many document digitization tasks. Many previous works [26, 30, 27]\\n
and tools
18 have been developed to identify and parse table structures. Yet they\\n
might require training complicated models from scratch, or are only applicable\\n
for born-digital PDF documents. In this section, we show how
LayoutParser can\\n
help build a light-weight accurate visual table extractor for legal docket tables\\n
using the existing resources with minimal effort.\\n
The extractor uses a pre-trained layout detection model for identifying the\\n
table regions and some simple rules for pairing the rows and the columns in the\\n
PDF image. Mask R-CNN [12] trained on the PubLayNet dataset [38] from the\\n
LayoutParser Model Zoo can be used for detecting table regions. By filtering\\n
out model predictions of low confidence and removing overlapping predictions,\\n
LayoutParser can identify the tabular regions on each page, which significantly\\n
simplifies the subsequent steps. By applying the line detection functions within\\n
the tabular segments, provided in the utility module from LayoutParser, the\\n
pipeline can identify the three distinct columns in the tables. A row clustering\\n
method is then applied via analyzing the y coordinates of token bounding boxes in\\n
the left-most column, which are obtained from the OCR engines. A non-maximal\\n
suppression algorithm is used to remove duplicated rows with extremely small\\n
gaps. Shown in Figure 6, the built pipeline can detect tables at different positions\\n
on a page accurately. Continued tables from different pages are concatenated,\\n
and a structured table representation has been easily created.\\n
18 https://github.com/atlanhq/camelot, https://github.com/tabulapdf/tabula\\n
\\n\\n\\n
14\\n
Z. Shen et al.\\n
6 Conclusion\\n
LayoutParser provides a comprehensive toolkit for deep learning-based document\\n
image analysis. The off-the-shelf library is easy to install, and can be used to\\n
build flexible and accurate pipelines for processing documents with complicated\\n
structures. It also supports high-level customization and enables easy labeling and\\n
training of DL models on unique document image datasets. The
LayoutParser\\n
community platform facilitates sharing DL models and DIA pipelines, inviting\\n
discussion and promoting code reproducibility and reusability. The
LayoutParser\\n
team is committed to keeping the library updated continuously and bringing\\n
the most recent advances in DL-based DIA, such as multi-modal document\\n
modeling [37, 36, 9] (an upcoming priority), to a diverse audience of end-users.\\n
Acknowledgements We thank the anonymous reviewers for their comments\\n
and suggestions. This project is supported in part by NSF Grant OIA-2033558\\n
and funding from the Harvard Data Science Initiative and Harvard Catalyst.\\n
Zejiang Shen thanks Doug Downey for suggestions.\\n
References\\n
[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado,\\n
G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A.,\\n
Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg,\\n
J., Man“e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J.,\\n
Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V.,\\n
Vi“egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng,\\n
X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015),\\n
https://www.tensorflow.org/, software available from tensorflow.org\\n
[2] Alberti, M., Pondenkandath, V., WĀØursch, M., Ingold, R., Liwicki, M.: Deepdiva: a\\n
highly-functional python framework for reproducible experiments. In: 2018 16th\\n
International Conference on Frontiers in Handwriting Recognition (ICFHR). pp.\\n
423–428. IEEE (2018)\\n
[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic\\n
dataset for performance evaluation of document layout analysis. In: 2009 10th\\n
International Conference on Document Analysis and Recognition. pp. 296–300.\\n
IEEE (2009)\\n
[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text\\n
detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and\\n
Pattern Recognition. pp. 9365–9374 (2019)\\n
[5] Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A Large-Scale\\n
Hierarchical Image Database. In: CVPR09 (2009)\\n
[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with\\n
coarse-to-fine attention. In: International Conference on Machine Learning. pp.\\n
980–989. PMLR (2017)\\n
[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation.\\n
In: International conference on machine learning. pp. 1180–1189. PMLR (2015)\\n
\\n\\n
LayoutParser: A Unified Toolkit for DL-Based DIA\\n
15\\n
[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters,\\n
M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language\\n
processing platform. arXiv preprint arXiv:1803.07640 (2018)\\n
(cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P.,\\n
Grali“nski, F.: Lambert: Layout-aware (language) modeling using bert for in-\\n
formation extraction (2020)\\n
[9]\\n
[10] Graves, A., Fern“andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal\\n
classification: labelling unsegmented sequence data with recurrent neural networks.\\n
In: Proceedings of the 23rd international conference on Machine learning. pp.\\n
369–376 (2006)\\n
[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for\\n
document image classification and retrieval. In: 2015 13th International Conference\\n
on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015)\\n
[12] He, K., Gkioxari, G., Doll“ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the\\n
IEEE international conference on computer vision. pp. 2961–2969 (2017)\\n
[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition.\\n
In: Proceedings of the IEEE conference on computer vision and pattern recognition.\\n
pp. 770–778 (2016)\\n
[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J.\\n
2007(159), 2 (Jul 2007)\\n
[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis\\n
benchmarking. In: 2011 International Conference on Document Analysis and\\n
Recognition. pp. 42–47. IEEE (2011)\\n
[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5\\n
million images. In: Adjunct Publication of the 33rd Annual ACM Sym-\\n
posium on User\\n
Interface Software and Technology. p. 120–122. UIST\\n
’20 Adjunct, Association for Computing Machinery, New York, NY, USA\\n
(2020). https://doi.org/10.1145/3379350.3416143,
https://doi-org.offcampus.\\n
lib.washington.edu/10.1145/3379350.3416143\\n
[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N.,\\n
Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting\\n
Headlines and Visual Content from 16 Million Historic Newspaper Pages in\\n
Chronicling America, p. 3055–3062. Association for Computing Machinery, New\\n
York, NY, USA (2020),
https://doi.org/10.1145/3340531.3412767\\n
[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark\\n
for image-based table detection and recognition. arXiv preprint arXiv:1903.01949\\n
(2019)\\n
[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll“ar, P.,\\n
Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference\\n
on computer vision. pp. 740–755. Springer (2014)\\n
[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic\\n
segmentation. In: Proceedings of the IEEE conference on computer vision and\\n
pattern recognition. pp. 3431–3440 (2015)\\n
[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten-\\n
croft, K.: An experimental workflow development platform for historical document\\n
digitisation and analysis. In: Proceedings of the 2011 workshop on historical\\n
document imaging and processing. pp. 161–168 (2011)\\n
[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach\\n
for document segmentation. In: 2018 16th International Conference on Frontiers\\n
in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)\\n
\\n\\n
16\\n
Z. Shen et al.\\n
[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z.,\\n
Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017)\\n
[24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen,\\n
T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style,\\n
high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019)\\n
[25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth\\n
elements) format framework. In: 2010 20th International Conference on Pattern\\n
Recognition. pp. 257–260. IEEE (2010)\\n
[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet:\\n
An approach for end to end table detection and structure recognition from image-\\n
based documents. In: Proceedings of the IEEE/CVF Conference on Computer\\n
Vision and Pattern Recognition Workshops. pp. 572–573 (2020)\\n
[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph\\n
neural networks. In: 2019 International Conference on Document Analysis and\\n
Recognition (ICDAR). pp. 142–147. IEEE (2019)\\n
[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object\\n
detection with region proposal networks. In: Advances in neural information\\n
processing systems. pp. 91–99 (2015)\\n
[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph\\n
neural network model. IEEE transactions on neural networks
20(1), 61–80 (2008)\\n
[30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning\\n
for detection and structure recognition of tables in document images. In: 2017 14th\\n
IAPR international conference on document analysis and recognition (ICDAR).\\n
vol. 1, pp. 1162–1167. IEEE (2017)\\n
[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents\\n
with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer\\n
Vision and Pattern Recognition Workshops. pp. 548–549 (2020)\\n
[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning\\n
based layout annotation. arXiv preprint arXiv:2010.01762 (2020)\\n
[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer,\\n
A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for\\n
historical document image analysis. In: 2019 International Conference on Document\\n
Analysis and Recognition (ICDAR). pp. 720–725. IEEE (2019)\\n
[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P.,\\n
Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of-\\n
the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)\\n
[35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2.
https://\\n
github.com/facebookresearch/detectron2 (2019)\\n
[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C.,\\n
Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document\\n
understanding. arXiv preprint arXiv:2012.14740 (2020)\\n
[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of\\n
text and layout for document image understanding (2019)\\n
[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:\\n
layout analysis.\\n
ument\\n
Analysis and Recognition (ICDAR). pp. 1015–1022.\\n
https://doi.org/10.1109/ICDAR.2019.00166\\n
largest dataset ever for doc-\\n
In: 2019 International Conference on Document\\n
IEEE (Sep 2019).\\n
Page: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
\\n\\n', metadata={'source': '../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf'})" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "directory_path = \"../../../docs/integrations/document_loaders/example_data/\"\n", + "from langchain_community.document_loaders import PyPDFDirectoryLoader\n", + "\n", + "directory_path = (\n", + " \"../../docs/integrations/document_loaders/example_data/\"\n", + ")\n", "loader = PyPDFDirectoryLoader(\"example_data/\")\n", "\n", + "docs = loader.load()\n", "\n", - "docs = loader.load()" + "docs[0]" ] }, { @@ -585,6 +665,8 @@ "source": [ "from langchain_community.document_loaders import PDFPlumberLoader\n", "\n", + "loader = PDFPlumberLoader(\"../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf\")\n", + "\n", "data = loader.load()\n", "data[0]" ] @@ -613,7 +695,9 @@ "from langchain_community.document_loaders import AmazonTextractPDFLoader\n", "\n", "loader = AmazonTextractPDFLoader(\"example_data/alejandro_rosalez_sample-small.jpeg\")\n", - "documents = loader.load()" + "documents = loader.load()\n", + "\n", + "documents[0]" ] }, { @@ -660,7 +744,9 @@ " api_endpoint=endpoint, api_key=key, file_path=file_path, api_model=\"prebuilt-layout\"\n", ")\n", "\n", - "documents = loader.load()" + "documents = loader.load()\n", + "\n", + "documents[0]" ] } ], @@ -680,7 +766,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/csv.ipynb b/docs/docs/integrations/document_loaders/csv.ipynb index 3c1424bb1b4..ab004160f3d 100644 --- a/docs/docs/integrations/document_loaders/csv.ipynb +++ b/docs/docs/integrations/document_loaders/csv.ipynb @@ -13,36 +13,7 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, - 
"outputs": [], - "source": [ - "from langchain_community.document_loaders.csv_loader import CSVLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "loader = CSVLoader(file_path=\"./example_data/mlb_teams_2012.csv\")\n", - "\n", - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, + "execution_count": 1, "metadata": { "jupyter": { "outputs_hidden": false @@ -53,11 +24,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Document(page_content='Team: Nationals\\n\"Payroll (millions)\": 81.34\\n\"Wins\": 98', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 0}, lookup_index=0), Document(page_content='Team: Reds\\n\"Payroll (millions)\": 82.20\\n\"Wins\": 97', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 1}, lookup_index=0), Document(page_content='Team: Yankees\\n\"Payroll (millions)\": 197.96\\n\"Wins\": 95', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 2}, lookup_index=0), Document(page_content='Team: Giants\\n\"Payroll (millions)\": 117.62\\n\"Wins\": 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 3}, lookup_index=0), Document(page_content='Team: Braves\\n\"Payroll (millions)\": 83.31\\n\"Wins\": 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 4}, lookup_index=0), Document(page_content='Team: Athletics\\n\"Payroll (millions)\": 55.37\\n\"Wins\": 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 5}, lookup_index=0), Document(page_content='Team: Rangers\\n\"Payroll (millions)\": 120.51\\n\"Wins\": 93', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 6}, lookup_index=0), Document(page_content='Team: Orioles\\n\"Payroll (millions)\": 81.43\\n\"Wins\": 93', lookup_str='', 
metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 7}, lookup_index=0), Document(page_content='Team: Rays\\n\"Payroll (millions)\": 64.17\\n\"Wins\": 90', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 8}, lookup_index=0), Document(page_content='Team: Angels\\n\"Payroll (millions)\": 154.49\\n\"Wins\": 89', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 9}, lookup_index=0), Document(page_content='Team: Tigers\\n\"Payroll (millions)\": 132.30\\n\"Wins\": 88', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 10}, lookup_index=0), Document(page_content='Team: Cardinals\\n\"Payroll (millions)\": 110.30\\n\"Wins\": 88', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 11}, lookup_index=0), Document(page_content='Team: Dodgers\\n\"Payroll (millions)\": 95.14\\n\"Wins\": 86', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 12}, lookup_index=0), Document(page_content='Team: White Sox\\n\"Payroll (millions)\": 96.92\\n\"Wins\": 85', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 13}, lookup_index=0), Document(page_content='Team: Brewers\\n\"Payroll (millions)\": 97.65\\n\"Wins\": 83', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 14}, lookup_index=0), Document(page_content='Team: Phillies\\n\"Payroll (millions)\": 174.54\\n\"Wins\": 81', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 15}, lookup_index=0), Document(page_content='Team: Diamondbacks\\n\"Payroll (millions)\": 74.28\\n\"Wins\": 81', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 16}, lookup_index=0), Document(page_content='Team: Pirates\\n\"Payroll (millions)\": 63.43\\n\"Wins\": 79', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 17}, lookup_index=0), Document(page_content='Team: 
Padres\\n\"Payroll (millions)\": 55.24\\n\"Wins\": 76', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 18}, lookup_index=0), Document(page_content='Team: Mariners\\n\"Payroll (millions)\": 81.97\\n\"Wins\": 75', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 19}, lookup_index=0), Document(page_content='Team: Mets\\n\"Payroll (millions)\": 93.35\\n\"Wins\": 74', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 20}, lookup_index=0), Document(page_content='Team: Blue Jays\\n\"Payroll (millions)\": 75.48\\n\"Wins\": 73', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 21}, lookup_index=0), Document(page_content='Team: Royals\\n\"Payroll (millions)\": 60.91\\n\"Wins\": 72', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 22}, lookup_index=0), Document(page_content='Team: Marlins\\n\"Payroll (millions)\": 118.07\\n\"Wins\": 69', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 23}, lookup_index=0), Document(page_content='Team: Red Sox\\n\"Payroll (millions)\": 173.18\\n\"Wins\": 69', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 24}, lookup_index=0), Document(page_content='Team: Indians\\n\"Payroll (millions)\": 78.43\\n\"Wins\": 68', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 25}, lookup_index=0), Document(page_content='Team: Twins\\n\"Payroll (millions)\": 94.08\\n\"Wins\": 66', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 26}, lookup_index=0), Document(page_content='Team: Rockies\\n\"Payroll (millions)\": 78.06\\n\"Wins\": 64', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 27}, lookup_index=0), Document(page_content='Team: Cubs\\n\"Payroll (millions)\": 88.19\\n\"Wins\": 61', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 
'row': 28}, lookup_index=0), Document(page_content='Team: Astros\\n\"Payroll (millions)\": 60.65\\n\"Wins\": 55', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 29}, lookup_index=0)]\n" + "[Document(page_content='Team: Nationals\\n\"Payroll (millions)\": 81.34\\n\"Wins\": 98', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 0}), Document(page_content='Team: Reds\\n\"Payroll (millions)\": 82.20\\n\"Wins\": 97', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 1}), Document(page_content='Team: Yankees\\n\"Payroll (millions)\": 197.96\\n\"Wins\": 95', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 2}), Document(page_content='Team: Giants\\n\"Payroll (millions)\": 117.62\\n\"Wins\": 94', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 3}), Document(page_content='Team: Braves\\n\"Payroll (millions)\": 83.31\\n\"Wins\": 94', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 4}), Document(page_content='Team: Athletics\\n\"Payroll (millions)\": 55.37\\n\"Wins\": 94', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 5}), Document(page_content='Team: Rangers\\n\"Payroll (millions)\": 120.51\\n\"Wins\": 93', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 6}), Document(page_content='Team: Orioles\\n\"Payroll (millions)\": 81.43\\n\"Wins\": 93', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 7}), Document(page_content='Team: Rays\\n\"Payroll (millions)\": 64.17\\n\"Wins\": 90', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 8}), Document(page_content='Team: Angels\\n\"Payroll (millions)\": 154.49\\n\"Wins\": 89', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 9}), Document(page_content='Team: Tigers\\n\"Payroll (millions)\": 132.30\\n\"Wins\": 88', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 10}), Document(page_content='Team: Cardinals\\n\"Payroll (millions)\": 
110.30\\n\"Wins\": 88', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 11}), Document(page_content='Team: Dodgers\\n\"Payroll (millions)\": 95.14\\n\"Wins\": 86', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 12}), Document(page_content='Team: White Sox\\n\"Payroll (millions)\": 96.92\\n\"Wins\": 85', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 13}), Document(page_content='Team: Brewers\\n\"Payroll (millions)\": 97.65\\n\"Wins\": 83', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 14}), Document(page_content='Team: Phillies\\n\"Payroll (millions)\": 174.54\\n\"Wins\": 81', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 15}), Document(page_content='Team: Diamondbacks\\n\"Payroll (millions)\": 74.28\\n\"Wins\": 81', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 16}), Document(page_content='Team: Pirates\\n\"Payroll (millions)\": 63.43\\n\"Wins\": 79', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 17}), Document(page_content='Team: Padres\\n\"Payroll (millions)\": 55.24\\n\"Wins\": 76', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 18}), Document(page_content='Team: Mariners\\n\"Payroll (millions)\": 81.97\\n\"Wins\": 75', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 19}), Document(page_content='Team: Mets\\n\"Payroll (millions)\": 93.35\\n\"Wins\": 74', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 20}), Document(page_content='Team: Blue Jays\\n\"Payroll (millions)\": 75.48\\n\"Wins\": 73', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 21}), Document(page_content='Team: Royals\\n\"Payroll (millions)\": 60.91\\n\"Wins\": 72', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 22}), Document(page_content='Team: Marlins\\n\"Payroll (millions)\": 118.07\\n\"Wins\": 69', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 23}), 
Document(page_content='Team: Red Sox\\n\"Payroll (millions)\": 173.18\\n\"Wins\": 69', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 24}), Document(page_content='Team: Indians\\n\"Payroll (millions)\": 78.43\\n\"Wins\": 68', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 25}), Document(page_content='Team: Twins\\n\"Payroll (millions)\": 94.08\\n\"Wins\": 66', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 26}), Document(page_content='Team: Rockies\\n\"Payroll (millions)\": 78.06\\n\"Wins\": 64', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 27}), Document(page_content='Team: Cubs\\n\"Payroll (millions)\": 88.19\\n\"Wins\": 61', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 28}), Document(page_content='Team: Astros\\n\"Payroll (millions)\": 60.65\\n\"Wins\": 55', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 29})]\n" ] } ], "source": [ + "from langchain_community.document_loaders.csv_loader import CSVLoader\n", + "\n", + "loader = CSVLoader(file_path=\"./example_data/mlb_teams_2012.csv\")\n", + "\n", + "data = loader.load()\n", + "\n", "print(data)" ] }, @@ -72,29 +49,7 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "loader = CSVLoader(\n", - " file_path=\"./example_data/mlb_teams_2012.csv\",\n", - " csv_args={\n", - " \"delimiter\": \",\",\n", - " \"quotechar\": '\"',\n", - " \"fieldnames\": [\"MLB Team\", \"Payroll in millions\", \"Wins\"],\n", - " },\n", - ")\n", - "\n", - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, + "execution_count": 4, "metadata": { "jupyter": { "outputs_hidden": false @@ -105,11 +60,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "[Document(page_content='MLB Team: Team\\nPayroll in millions: \"Payroll (millions)\"\\nWins: \"Wins\"', lookup_str='', metadata={'source': 
'./example_data/mlb_teams_2012.csv', 'row': 0}, lookup_index=0), Document(page_content='MLB Team: Nationals\\nPayroll in millions: 81.34\\nWins: 98', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 1}, lookup_index=0), Document(page_content='MLB Team: Reds\\nPayroll in millions: 82.20\\nWins: 97', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 2}, lookup_index=0), Document(page_content='MLB Team: Yankees\\nPayroll in millions: 197.96\\nWins: 95', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 3}, lookup_index=0), Document(page_content='MLB Team: Giants\\nPayroll in millions: 117.62\\nWins: 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 4}, lookup_index=0), Document(page_content='MLB Team: Braves\\nPayroll in millions: 83.31\\nWins: 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 5}, lookup_index=0), Document(page_content='MLB Team: Athletics\\nPayroll in millions: 55.37\\nWins: 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 6}, lookup_index=0), Document(page_content='MLB Team: Rangers\\nPayroll in millions: 120.51\\nWins: 93', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 7}, lookup_index=0), Document(page_content='MLB Team: Orioles\\nPayroll in millions: 81.43\\nWins: 93', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 8}, lookup_index=0), Document(page_content='MLB Team: Rays\\nPayroll in millions: 64.17\\nWins: 90', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 9}, lookup_index=0), Document(page_content='MLB Team: Angels\\nPayroll in millions: 154.49\\nWins: 89', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 10}, lookup_index=0), Document(page_content='MLB Team: Tigers\\nPayroll in millions: 132.30\\nWins: 88', lookup_str='', 
metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 11}, lookup_index=0), Document(page_content='MLB Team: Cardinals\\nPayroll in millions: 110.30\\nWins: 88', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 12}, lookup_index=0), Document(page_content='MLB Team: Dodgers\\nPayroll in millions: 95.14\\nWins: 86', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 13}, lookup_index=0), Document(page_content='MLB Team: White Sox\\nPayroll in millions: 96.92\\nWins: 85', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 14}, lookup_index=0), Document(page_content='MLB Team: Brewers\\nPayroll in millions: 97.65\\nWins: 83', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 15}, lookup_index=0), Document(page_content='MLB Team: Phillies\\nPayroll in millions: 174.54\\nWins: 81', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 16}, lookup_index=0), Document(page_content='MLB Team: Diamondbacks\\nPayroll in millions: 74.28\\nWins: 81', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 17}, lookup_index=0), Document(page_content='MLB Team: Pirates\\nPayroll in millions: 63.43\\nWins: 79', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 18}, lookup_index=0), Document(page_content='MLB Team: Padres\\nPayroll in millions: 55.24\\nWins: 76', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 19}, lookup_index=0), Document(page_content='MLB Team: Mariners\\nPayroll in millions: 81.97\\nWins: 75', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 20}, lookup_index=0), Document(page_content='MLB Team: Mets\\nPayroll in millions: 93.35\\nWins: 74', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 21}, lookup_index=0), Document(page_content='MLB Team: Blue Jays\\nPayroll in 
millions: 75.48\\nWins: 73', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 22}, lookup_index=0), Document(page_content='MLB Team: Royals\\nPayroll in millions: 60.91\\nWins: 72', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 23}, lookup_index=0), Document(page_content='MLB Team: Marlins\\nPayroll in millions: 118.07\\nWins: 69', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 24}, lookup_index=0), Document(page_content='MLB Team: Red Sox\\nPayroll in millions: 173.18\\nWins: 69', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 25}, lookup_index=0), Document(page_content='MLB Team: Indians\\nPayroll in millions: 78.43\\nWins: 68', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 26}, lookup_index=0), Document(page_content='MLB Team: Twins\\nPayroll in millions: 94.08\\nWins: 66', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 27}, lookup_index=0), Document(page_content='MLB Team: Rockies\\nPayroll in millions: 78.06\\nWins: 64', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 28}, lookup_index=0), Document(page_content='MLB Team: Cubs\\nPayroll in millions: 88.19\\nWins: 61', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 29}, lookup_index=0), Document(page_content='MLB Team: Astros\\nPayroll in millions: 60.65\\nWins: 55', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 30}, lookup_index=0)]\n" + "[Document(page_content='MLB Team: Team\\nPayroll in millions: \"Payroll (millions)\"\\nWins: \"Wins\"', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 0}), Document(page_content='MLB Team: Nationals\\nPayroll in millions: 81.34\\nWins: 98', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 1}), Document(page_content='MLB Team: Reds\\nPayroll in millions: 
82.20\\nWins: 97', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 2}), Document(page_content='MLB Team: Yankees\\nPayroll in millions: 197.96\\nWins: 95', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 3}), Document(page_content='MLB Team: Giants\\nPayroll in millions: 117.62\\nWins: 94', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 4}), Document(page_content='MLB Team: Braves\\nPayroll in millions: 83.31\\nWins: 94', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 5}), Document(page_content='MLB Team: Athletics\\nPayroll in millions: 55.37\\nWins: 94', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 6}), Document(page_content='MLB Team: Rangers\\nPayroll in millions: 120.51\\nWins: 93', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 7}), Document(page_content='MLB Team: Orioles\\nPayroll in millions: 81.43\\nWins: 93', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 8}), Document(page_content='MLB Team: Rays\\nPayroll in millions: 64.17\\nWins: 90', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 9}), Document(page_content='MLB Team: Angels\\nPayroll in millions: 154.49\\nWins: 89', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 10}), Document(page_content='MLB Team: Tigers\\nPayroll in millions: 132.30\\nWins: 88', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 11}), Document(page_content='MLB Team: Cardinals\\nPayroll in millions: 110.30\\nWins: 88', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 12}), Document(page_content='MLB Team: Dodgers\\nPayroll in millions: 95.14\\nWins: 86', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 13}), Document(page_content='MLB Team: White Sox\\nPayroll in millions: 96.92\\nWins: 85', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 14}), Document(page_content='MLB Team: Brewers\\nPayroll in millions: 97.65\\nWins: 
83', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 15}), Document(page_content='MLB Team: Phillies\\nPayroll in millions: 174.54\\nWins: 81', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 16}), Document(page_content='MLB Team: Diamondbacks\\nPayroll in millions: 74.28\\nWins: 81', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 17}), Document(page_content='MLB Team: Pirates\\nPayroll in millions: 63.43\\nWins: 79', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 18}), Document(page_content='MLB Team: Padres\\nPayroll in millions: 55.24\\nWins: 76', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 19}), Document(page_content='MLB Team: Mariners\\nPayroll in millions: 81.97\\nWins: 75', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 20}), Document(page_content='MLB Team: Mets\\nPayroll in millions: 93.35\\nWins: 74', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 21}), Document(page_content='MLB Team: Blue Jays\\nPayroll in millions: 75.48\\nWins: 73', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 22}), Document(page_content='MLB Team: Royals\\nPayroll in millions: 60.91\\nWins: 72', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 23}), Document(page_content='MLB Team: Marlins\\nPayroll in millions: 118.07\\nWins: 69', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 24}), Document(page_content='MLB Team: Red Sox\\nPayroll in millions: 173.18\\nWins: 69', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 25}), Document(page_content='MLB Team: Indians\\nPayroll in millions: 78.43\\nWins: 68', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 26}), Document(page_content='MLB Team: Twins\\nPayroll in millions: 94.08\\nWins: 66', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 27}), Document(page_content='MLB Team: Rockies\\nPayroll in millions: 78.06\\nWins: 64', 
metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 28}), Document(page_content='MLB Team: Cubs\\nPayroll in millions: 88.19\\nWins: 61', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 29}), Document(page_content='MLB Team: Astros\\nPayroll in millions: 60.65\\nWins: 55', metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 30})]\n" ] } ], "source": [ + "loader = CSVLoader(\n", + " file_path=\"./example_data/mlb_teams_2012.csv\",\n", + " csv_args={\n", + " \"delimiter\": \",\",\n", + " \"quotechar\": '\"',\n", + " \"fieldnames\": [\"MLB Team\", \"Payroll in millions\", \"Wins\"],\n", + " },\n", + ")\n", + "\n", + "data = loader.load()\n", + "\n", "print(data)" ] }, @@ -126,29 +92,22 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "loader = CSVLoader(file_path=\"./example_data/mlb_teams_2012.csv\", source_column=\"Team\")\n", - "\n", - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[Document(page_content='Team: Nationals\\n\"Payroll (millions)\": 81.34\\n\"Wins\": 98', lookup_str='', metadata={'source': 'Nationals', 'row': 0}, lookup_index=0), Document(page_content='Team: Reds\\n\"Payroll (millions)\": 82.20\\n\"Wins\": 97', lookup_str='', metadata={'source': 'Reds', 'row': 1}, lookup_index=0), Document(page_content='Team: Yankees\\n\"Payroll (millions)\": 197.96\\n\"Wins\": 95', lookup_str='', metadata={'source': 'Yankees', 'row': 2}, lookup_index=0), Document(page_content='Team: Giants\\n\"Payroll (millions)\": 117.62\\n\"Wins\": 94', lookup_str='', metadata={'source': 'Giants', 'row': 3}, lookup_index=0), Document(page_content='Team: Braves\\n\"Payroll (millions)\": 83.31\\n\"Wins\": 94', lookup_str='', metadata={'source': 'Braves', 'row': 4}, lookup_index=0), Document(page_content='Team: Athletics\\n\"Payroll 
(millions)\": 55.37\\n\"Wins\": 94', lookup_str='', metadata={'source': 'Athletics', 'row': 5}, lookup_index=0), Document(page_content='Team: Rangers\\n\"Payroll (millions)\": 120.51\\n\"Wins\": 93', lookup_str='', metadata={'source': 'Rangers', 'row': 6}, lookup_index=0), Document(page_content='Team: Orioles\\n\"Payroll (millions)\": 81.43\\n\"Wins\": 93', lookup_str='', metadata={'source': 'Orioles', 'row': 7}, lookup_index=0), Document(page_content='Team: Rays\\n\"Payroll (millions)\": 64.17\\n\"Wins\": 90', lookup_str='', metadata={'source': 'Rays', 'row': 8}, lookup_index=0), Document(page_content='Team: Angels\\n\"Payroll (millions)\": 154.49\\n\"Wins\": 89', lookup_str='', metadata={'source': 'Angels', 'row': 9}, lookup_index=0), Document(page_content='Team: Tigers\\n\"Payroll (millions)\": 132.30\\n\"Wins\": 88', lookup_str='', metadata={'source': 'Tigers', 'row': 10}, lookup_index=0), Document(page_content='Team: Cardinals\\n\"Payroll (millions)\": 110.30\\n\"Wins\": 88', lookup_str='', metadata={'source': 'Cardinals', 'row': 11}, lookup_index=0), Document(page_content='Team: Dodgers\\n\"Payroll (millions)\": 95.14\\n\"Wins\": 86', lookup_str='', metadata={'source': 'Dodgers', 'row': 12}, lookup_index=0), Document(page_content='Team: White Sox\\n\"Payroll (millions)\": 96.92\\n\"Wins\": 85', lookup_str='', metadata={'source': 'White Sox', 'row': 13}, lookup_index=0), Document(page_content='Team: Brewers\\n\"Payroll (millions)\": 97.65\\n\"Wins\": 83', lookup_str='', metadata={'source': 'Brewers', 'row': 14}, lookup_index=0), Document(page_content='Team: Phillies\\n\"Payroll (millions)\": 174.54\\n\"Wins\": 81', lookup_str='', metadata={'source': 'Phillies', 'row': 15}, lookup_index=0), Document(page_content='Team: Diamondbacks\\n\"Payroll (millions)\": 74.28\\n\"Wins\": 81', lookup_str='', metadata={'source': 'Diamondbacks', 'row': 16}, lookup_index=0), Document(page_content='Team: Pirates\\n\"Payroll (millions)\": 63.43\\n\"Wins\": 79', lookup_str='', 
metadata={'source': 'Pirates', 'row': 17}, lookup_index=0), Document(page_content='Team: Padres\\n\"Payroll (millions)\": 55.24\\n\"Wins\": 76', lookup_str='', metadata={'source': 'Padres', 'row': 18}, lookup_index=0), Document(page_content='Team: Mariners\\n\"Payroll (millions)\": 81.97\\n\"Wins\": 75', lookup_str='', metadata={'source': 'Mariners', 'row': 19}, lookup_index=0), Document(page_content='Team: Mets\\n\"Payroll (millions)\": 93.35\\n\"Wins\": 74', lookup_str='', metadata={'source': 'Mets', 'row': 20}, lookup_index=0), Document(page_content='Team: Blue Jays\\n\"Payroll (millions)\": 75.48\\n\"Wins\": 73', lookup_str='', metadata={'source': 'Blue Jays', 'row': 21}, lookup_index=0), Document(page_content='Team: Royals\\n\"Payroll (millions)\": 60.91\\n\"Wins\": 72', lookup_str='', metadata={'source': 'Royals', 'row': 22}, lookup_index=0), Document(page_content='Team: Marlins\\n\"Payroll (millions)\": 118.07\\n\"Wins\": 69', lookup_str='', metadata={'source': 'Marlins', 'row': 23}, lookup_index=0), Document(page_content='Team: Red Sox\\n\"Payroll (millions)\": 173.18\\n\"Wins\": 69', lookup_str='', metadata={'source': 'Red Sox', 'row': 24}, lookup_index=0), Document(page_content='Team: Indians\\n\"Payroll (millions)\": 78.43\\n\"Wins\": 68', lookup_str='', metadata={'source': 'Indians', 'row': 25}, lookup_index=0), Document(page_content='Team: Twins\\n\"Payroll (millions)\": 94.08\\n\"Wins\": 66', lookup_str='', metadata={'source': 'Twins', 'row': 26}, lookup_index=0), Document(page_content='Team: Rockies\\n\"Payroll (millions)\": 78.06\\n\"Wins\": 64', lookup_str='', metadata={'source': 'Rockies', 'row': 27}, lookup_index=0), Document(page_content='Team: Cubs\\n\"Payroll (millions)\": 88.19\\n\"Wins\": 61', lookup_str='', metadata={'source': 'Cubs', 'row': 28}, lookup_index=0), Document(page_content='Team: Astros\\n\"Payroll (millions)\": 60.65\\n\"Wins\": 55', lookup_str='', metadata={'source': 'Astros', 'row': 29}, lookup_index=0)]\n" + 
"[Document(page_content='Team: Nationals\\n\"Payroll (millions)\": 81.34\\n\"Wins\": 98', metadata={'source': 'Nationals', 'row': 0}), Document(page_content='Team: Reds\\n\"Payroll (millions)\": 82.20\\n\"Wins\": 97', metadata={'source': 'Reds', 'row': 1}), Document(page_content='Team: Yankees\\n\"Payroll (millions)\": 197.96\\n\"Wins\": 95', metadata={'source': 'Yankees', 'row': 2}), Document(page_content='Team: Giants\\n\"Payroll (millions)\": 117.62\\n\"Wins\": 94', metadata={'source': 'Giants', 'row': 3}), Document(page_content='Team: Braves\\n\"Payroll (millions)\": 83.31\\n\"Wins\": 94', metadata={'source': 'Braves', 'row': 4}), Document(page_content='Team: Athletics\\n\"Payroll (millions)\": 55.37\\n\"Wins\": 94', metadata={'source': 'Athletics', 'row': 5}), Document(page_content='Team: Rangers\\n\"Payroll (millions)\": 120.51\\n\"Wins\": 93', metadata={'source': 'Rangers', 'row': 6}), Document(page_content='Team: Orioles\\n\"Payroll (millions)\": 81.43\\n\"Wins\": 93', metadata={'source': 'Orioles', 'row': 7}), Document(page_content='Team: Rays\\n\"Payroll (millions)\": 64.17\\n\"Wins\": 90', metadata={'source': 'Rays', 'row': 8}), Document(page_content='Team: Angels\\n\"Payroll (millions)\": 154.49\\n\"Wins\": 89', metadata={'source': 'Angels', 'row': 9}), Document(page_content='Team: Tigers\\n\"Payroll (millions)\": 132.30\\n\"Wins\": 88', metadata={'source': 'Tigers', 'row': 10}), Document(page_content='Team: Cardinals\\n\"Payroll (millions)\": 110.30\\n\"Wins\": 88', metadata={'source': 'Cardinals', 'row': 11}), Document(page_content='Team: Dodgers\\n\"Payroll (millions)\": 95.14\\n\"Wins\": 86', metadata={'source': 'Dodgers', 'row': 12}), Document(page_content='Team: White Sox\\n\"Payroll (millions)\": 96.92\\n\"Wins\": 85', metadata={'source': 'White Sox', 'row': 13}), Document(page_content='Team: Brewers\\n\"Payroll (millions)\": 97.65\\n\"Wins\": 83', metadata={'source': 'Brewers', 'row': 14}), Document(page_content='Team: Phillies\\n\"Payroll 
(millions)\": 174.54\\n\"Wins\": 81', metadata={'source': 'Phillies', 'row': 15}), Document(page_content='Team: Diamondbacks\\n\"Payroll (millions)\": 74.28\\n\"Wins\": 81', metadata={'source': 'Diamondbacks', 'row': 16}), Document(page_content='Team: Pirates\\n\"Payroll (millions)\": 63.43\\n\"Wins\": 79', metadata={'source': 'Pirates', 'row': 17}), Document(page_content='Team: Padres\\n\"Payroll (millions)\": 55.24\\n\"Wins\": 76', metadata={'source': 'Padres', 'row': 18}), Document(page_content='Team: Mariners\\n\"Payroll (millions)\": 81.97\\n\"Wins\": 75', metadata={'source': 'Mariners', 'row': 19}), Document(page_content='Team: Mets\\n\"Payroll (millions)\": 93.35\\n\"Wins\": 74', metadata={'source': 'Mets', 'row': 20}), Document(page_content='Team: Blue Jays\\n\"Payroll (millions)\": 75.48\\n\"Wins\": 73', metadata={'source': 'Blue Jays', 'row': 21}), Document(page_content='Team: Royals\\n\"Payroll (millions)\": 60.91\\n\"Wins\": 72', metadata={'source': 'Royals', 'row': 22}), Document(page_content='Team: Marlins\\n\"Payroll (millions)\": 118.07\\n\"Wins\": 69', metadata={'source': 'Marlins', 'row': 23}), Document(page_content='Team: Red Sox\\n\"Payroll (millions)\": 173.18\\n\"Wins\": 69', metadata={'source': 'Red Sox', 'row': 24}), Document(page_content='Team: Indians\\n\"Payroll (millions)\": 78.43\\n\"Wins\": 68', metadata={'source': 'Indians', 'row': 25}), Document(page_content='Team: Twins\\n\"Payroll (millions)\": 94.08\\n\"Wins\": 66', metadata={'source': 'Twins', 'row': 26}), Document(page_content='Team: Rockies\\n\"Payroll (millions)\": 78.06\\n\"Wins\": 64', metadata={'source': 'Rockies', 'row': 27}), Document(page_content='Team: Cubs\\n\"Payroll (millions)\": 88.19\\n\"Wins\": 61', metadata={'source': 'Cubs', 'row': 28}), Document(page_content='Team: Astros\\n\"Payroll (millions)\": 60.65\\n\"Wins\": 55', metadata={'source': 'Astros', 'row': 29})]\n" ] } ], "source": [ + "loader = CSVLoader(file_path=\"./example_data/mlb_teams_2012.csv\", 
source_column=\"Team\")\n", + "\n", + "data = loader.load()\n", + "\n", "print(data)" ] }, @@ -163,28 +122,7 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredCSVLoader(\n", - " file_path=\"example_data/mlb_teams_2012.csv\", mode=\"elements\"\n", - ")\n", - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -194,6 +132,11 @@ "\n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -349,6 +292,13 @@ } ], "source": [ + "from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader\n", + "\n", + "loader = UnstructuredCSVLoader(\n", + " file_path=\"example_data/mlb_teams_2012.csv\", mode=\"elements\"\n", + ")\n", + "docs = loader.load()\n", + "\n", "print(docs[0].metadata[\"text_as_html\"])" ] }, @@ -376,7 +326,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/email.ipynb b/docs/docs/integrations/document_loaders/email.ipynb index 28190812397..18220d85dc7 100644 --- a/docs/docs/integrations/document_loaders/email.ipynb +++ b/docs/docs/integrations/document_loaders/email.ipynb @@ -7,7 +7,9 @@ "source": [ "# Email\n", "\n", - "This notebook shows how to load email (`.eml`) or `Microsoft Outlook` (`.msg`) files." + "This notebook shows how to load email (`.eml`) or `Microsoft Outlook` (`.msg`) files.\n", + "\n", + "Please see [this guide](/docs/integrations/providers/unstructured/) for more instructions on setting up Unstructured locally, including setting up required system dependencies." 
] }, { @@ -27,49 +29,13 @@ }, "outputs": [], "source": [ - "%pip install --upgrade --quiet unstructured" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "40cd9806", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredEmailLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "2d20b852", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "loader = UnstructuredEmailLoader(\"example_data/fake-email.eml\")" + "%pip install --upgrade --quiet unstructured" ] }, { "cell_type": "code", "execution_count": 3, - "id": "579fa702", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "90c1d899", + "id": "2d20b852", "metadata": { "tags": [] }, @@ -77,15 +43,21 @@ { "data": { "text/plain": [ - "[Document(page_content='This is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', metadata={'source': 'example_data/fake-email.eml'})]" + "[Document(page_content='This is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', metadata={'source': './example_data/fake-email.eml'})]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "from langchain_community.document_loaders import UnstructuredEmailLoader\n", + "\n", + "loader = UnstructuredEmailLoader(\"./example_data/fake-email.eml\")\n", + "\n", + "data = loader.load()\n", + "\n", "data" ] }, @@ -101,42 +73,26 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "b9592eaf", "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredEmailLoader(\"example_data/fake-email.eml\", mode=\"elements\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "0b16d03f", - "metadata": {}, - 
"outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d7bdc5e5", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content='This is a test email to use for unit tests.', metadata={'source': 'example_data/fake-email.eml', 'filename': 'fake-email.eml', 'file_directory': 'example_data', 'date': '2022-12-16T17:04:16-05:00', 'filetype': 'message/rfc822', 'sent_from': ['Matthew Robinson '], 'sent_to': ['Matthew Robinson '], 'subject': 'Test Email', 'category': 'NarrativeText'})" + "Document(page_content='This is a test email to use for unit tests.', metadata={'source': 'example_data/fake-email.eml', 'file_directory': 'example_data', 'filename': 'fake-email.eml', 'last_modified': '2022-12-16T17:04:16-05:00', 'sent_from': ['Matthew Robinson '], 'sent_to': ['Matthew Robinson '], 'subject': 'Test Email', 'languages': ['eng'], 'filetype': 'message/rfc822', 'category': 'NarrativeText'})" ] }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "loader = UnstructuredEmailLoader(\"example_data/fake-email.eml\", mode=\"elements\")\n", + "\n", + "data = loader.load()\n", + "\n", "data[0]" ] }, @@ -152,46 +108,30 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": "6539f166", "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredEmailLoader(\n", - " \"example_data/fake-email.eml\",\n", - " mode=\"elements\",\n", - " process_attachments=True,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "aebead38", - "metadata": {}, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "ddeb60f4", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content='This is a test email to use for unit tests.', metadata={'source': 'example_data/fake-email.eml', 'filename': 
'fake-email.eml', 'file_directory': 'example_data', 'date': '2022-12-16T17:04:16-05:00', 'filetype': 'message/rfc822', 'sent_from': ['Matthew Robinson '], 'sent_to': ['Matthew Robinson '], 'subject': 'Test Email', 'category': 'NarrativeText'})" + "Document(page_content='This is a test email to use for unit tests.', metadata={'source': 'example_data/fake-email.eml', 'file_directory': 'example_data', 'filename': 'fake-email.eml', 'last_modified': '2022-12-16T17:04:16-05:00', 'sent_from': ['Matthew Robinson '], 'sent_to': ['Matthew Robinson '], 'subject': 'Test Email', 'languages': ['eng'], 'filetype': 'message/rfc822', 'category': 'NarrativeText'})" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "loader = UnstructuredEmailLoader(\n", + " \"example_data/fake-email.eml\",\n", + " mode=\"elements\",\n", + " process_attachments=True,\n", + ")\n", + "\n", + "data = loader.load()\n", + "\n", "data[0]" ] }, @@ -210,57 +150,33 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade --quiet extract_msg" + "%pip install --upgrade --quiet extract_msg" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "1e7a8444", "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import OutlookMessageLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "77a055e6", - "metadata": {}, - "outputs": [], - "source": [ - "loader = OutlookMessageLoader(\"example_data/fake-email.msg\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "789882de", - "metadata": {}, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "46aa0632", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content='This is a test email to experiment with the MS Outlook MSG Extractor\\r\\n\\r\\n\\r\\n-- \\r\\n\\r\\n\\r\\nKind 
regards\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nBrian Zhou\\r\\n\\r\\n', metadata={'subject': 'Test for TIF files', 'sender': 'Brian Zhou ', 'date': 'Mon, 18 Nov 2013 16:26:24 +0800'})" + "Document(page_content='This is a test email to experiment with the MS Outlook MSG Extractor\\r\\n\\r\\n\\r\\n-- \\r\\n\\r\\n\\r\\nKind regards\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nBrian Zhou\\r\\n\\r\\n', metadata={'source': 'example_data/fake-email.msg', 'subject': 'Test for TIF files', 'sender': 'Brian Zhou ', 'date': datetime.datetime(2013, 11, 18, 0, 26, 24, tzinfo=zoneinfo.ZoneInfo(key='America/Los_Angeles'))})" ] }, - "execution_count": 11, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "from langchain_community.document_loaders import OutlookMessageLoader\n", + "\n", + "loader = OutlookMessageLoader(\"example_data/fake-email.msg\")\n", + "\n", + "data = loader.load()\n", + "\n", "data[0]" ] }, @@ -289,7 +205,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/epub.ipynb b/docs/docs/integrations/document_loaders/epub.ipynb index 76df43e44ca..ffd48041a2e 100644 --- a/docs/docs/integrations/document_loaders/epub.ipynb +++ b/docs/docs/integrations/document_loaders/epub.ipynb @@ -9,7 +9,9 @@ "\n", ">[EPUB](https://en.wikipedia.org/wiki/EPUB) is an e-book file format that uses the \".epub\" file extension. The term is short for electronic publication and is sometimes styled ePub. `EPUB` is supported by many e-readers, and compatible software is available for most smartphones, tablets, and computers.\n", "\n", - "This covers how to load `.epub` documents into the Document format that we can use downstream. You'll need to install the [`pandoc`](https://pandoc.org/installing.html) package for this loader to work." + "This covers how to load `.epub` documents into the Document format that we can use downstream. 
You'll need to install the [`pandoc`](https://pandoc.org/installing.html) package for this loader to work with e.g. `brew install pandoc` for OSX.\n", + "\n", + "Please see [this guide](/docs/integrations/providers/unstructured/) for more instructions on setting up Unstructured locally, including setting up required system dependencies." ] }, { @@ -19,43 +21,36 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade --quiet pandoc" + "%pip install --upgrade --quiet unstructured" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "721c48aa", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='Guide\\n\\nTable of Contents\\n\\nBegin\\nReading\\n\\nPages\\n\\n169\\n\\n170\\n\\n171\\n\\n172\\n\\n173\\n\\n174\\n\\n175\\n\\n176\\n\\n177\\n\\n178\\n\\n179\\n\\n180\\n\\n181\\n\\n182\\n\\n183\\n\\n184\\n\\n185\\n\\n186\\n\\n187\\n\\n188\\n\\n189\\n\\n190\\n\\n191\\n\\n192\\n\\n193\\n\\n194\\n\\n195\\n\\n196\\n\\n197\\n\\n198\\n\\n199\\n\\n200\\n\\n201\\n\\n202\\n\\n203\\n\\n204\\n\\n205\\n\\n206\\n\\n207\\n\\n208\\n\\n209\\n\\n210\\n\\n211\\n\\n212\\n\\n213\\n\\n214\\n\\n215\\n\\n216\\n\\n217\\n\\n218\\n\\n219\\n\\n220\\n\\n221\\n\\n222\\n\\n223\\n\\n224\\n\\n225\\n\\n226\\n\\n227\\n\\n228\\n\\n229\\n\\n230\\n\\n231\\n\\n232\\n\\n233\\n\\n234\\n\\n235\\n\\n236\\n\\n237\\n\\n238\\n\\n239\\n\\n240\\n\\n241\\n\\n242\\n\\n243\\n\\n244\\n\\n245\\n\\n246\\n\\n247\\n\\n248\\n\\n249\\n\\n250\\n\\n251\\n\\n252\\n\\n253\\n\\n254\\n\\n255\\n\\n256\\n\\n257\\n\\n258\\n\\n259\\n\\n260\\n\\n169\\n\\nSECTION IV FAIRY STORIES—MODERN FANTASTIC\\nTALES\\n\\n170\\n\\nBIBLIOGRAPHY\\n\\nAlden, Raymond Macdonald, Why the Chimes Rang, and Other\\nStories.\\n\\nAndersen, Hans Christian, Fairy Tales.\\n\\nBarrie, Sir James Matthew, The Little White Bird.\\n[Peter Pan.]\\n\\nBaum, L. Frank, The Wizard of Oz.\\n\\nBenson, A. C., David Blaize and the Blue Door.\\n\\nBeston, H. 
B., The Firelight Fairy Book.\\n\\nBrown, Abbie Farwell, The Lonesomest Doll.\\n\\nBrowne, Frances, Granny\\'s Wonderful Chair.\\n\\nCarryl, Charles E., Davy and the Goblin.\\n\\n\"Carroll, Lewis,\" Alice\\'s Adventures in\\nWonderland.\\n\\n\"Carroll, Lewis,\" Through the Looking-Glass and What Alice\\nFound There.\\n\\nChamisso, Adelbert von, The Wonderful History of Peter\\nSchlemihl.\\n\\n\"Collodi, C.,\" The Adventures of Pinocchio.\\n\\nCox, Palmer, The Brownies: Their Book.\\n\\nCraik, Dinah Mulock, Adventures of a Brownie.\\n\\nCraik, Dinah Mulock, The Little Lame Prince and His\\nTraveling-Cloak.\\n\\nCrothers, Samuel McChord, Miss Muffet\\'s Christmas\\nParty.\\n\\nDickens, Charles, A Christmas Carol.\\n\\nEwald, Carl, Two-Legs, and Other Stories.\\n\\nGrahame, Kenneth, The Wind in the Willows.\\n\\nHarris, Joel Chandler, Nights with Uncle Remus.\\n\\nHawthorne, Nathaniel, \"The Snow Image,\" \"Little Daffydowndilly,\" \"A\\nRill from the Town Pump.\"\\n\\nIngelow, Jean, Mopsa the Fairy.\\n\\nIngelow, Jean, Stories Told to a Child. 2 vols.\\n\\nJordan, David Starr, The Book of Knight and\\nBarbara.\\n\\nLagerlof, Selma, The Wonderful Adventures of Nils.\\n\\nLa Motte-Fouqué, F. 
de, Undine.\\n\\nLang, Andrew, Prince Prigio.\\n\\nKingsley, Charles, The Water Babies.\\n\\nMaeterlinck, Maurice, The Blue Bird.\\n\\nMacdonald, George, The Princess and the Goblin.\\n\\nMacdonald, George, At the Back of the North Wind.\\n\\nPyle, Katherine, In the Green Forest.\\n\\nRaspe, Rudolph Erich, Baron Munchausen\\'s\\nNarrative.\\n\\nRichards, Laura E., The Story of Toto.\\n\\nRichards, Laura E., The Pig Brother.\\n\\nRuskin, John, The King of the Golden River.\\n\\nStockton, Frank R., Fanciful Tales.\\n\\nSwift, Jonathan, Gulliver\\'s Travels.\\n\\nThackeray, William Makepeace, The Rose and the\\nRing.\\n\\nWilde, Oscar, The Happy Prince, and Other Stories.\\n\\nWilkins, Mary E., The Pot of Gold.\\n\\n171\\n\\nINTRODUCTORY\\n\\nThe difficulties of classification are very apparent here, and once\\nmore it must be noted that illustrative and practical purposes rather\\nthan logical ones are served by the arrangement adopted. The modern\\nfanciful story is here placed next to the real folk story instead of\\nafter all the groups of folk products. The Hebrew stories at the\\nbeginning belong quite as well, perhaps even better, in Section V, while\\nthe stories at the end of Section VI shade off into the more modern\\ntypes of short tales. Then the fact that other groups of modern stories\\nare to follow later, illustrating more realistic studies of life and the\\nvery recent and remarkably numerous writings centering around animal\\nlife, limits the list here. Many of the animal stories might, with equal\\npropriety, be placed under the head of the fantastic.\\n\\nThe child\\'s natural literature. The\\nworld has lost certain secrets as the price of an advancing\\ncivilization. It is a commonplace of observation that no one can\\nduplicate the success of Mother Goose, whether she be thought of as the\\nmaker of jingles or the teller of tales. 
The conditions of modern life\\npreclude the generally naïve attitude that produced the folk rhymes,\\nballads, tales, proverbs, fables, and myths. The folk saw things simply\\nand directly. The complex, analytic, questioning mind is not yet, either\\nin or out of stories. The motives from which people act are to them\\nplain and not mixed. Characters are good or bad. They feel no need of\\nelaborately explaining their joys and sorrows. Such experiences come\\nwith the day\\'s work. \"To-morrow to fresh woods, and pastures new.\" The\\nzest of life with them is emphatic. Their humor is fresh, unbounded,\\nsincere; there is no trace of cynicism. In folk literature we do not\\nfeel the presence of a \"writer\" who is mightily concerned about\\nmaintaining his reputation for wisdom, originality, or style. Hence the\\nfreedom from any note of straining after effect, of artificiality. In\\nthe midst of a life limited to fundamental needs, their literature deals\\nwith fundamentals. On the whole, it was a literature for entertainment.\\nA more learned upper class may have concerned itself then about\\n\"problems\" and \"purposes,\" as the whole world does now, but the\\nliterature of the folk had no such interests.\\n\\nWithout discussing the limits of the culture-epoch theory of human\\ndevelopment as a complete guide in education, it is clear that the young\\nchild passes through a period when his mind looks out upon the world in\\na manner analogous to that of the folk as expressed in their literature.\\nQuarrel with the fact as we may, it still remains a fact that his nature\\ncraves these old stories and will not be satisfied with something \"just\\nas good.\"\\n\\nThe modern fairy story. The advance\\nof civilization has been accompanied by a wistful longing for the\\nsimplicities left by the way. In some periods this interest in the past\\nhas been more marked than in others. 
When the machinery of life has\\nweighed too heavily on the human spirit, men have turned for relief to a\\ncontemplation of the \"good old times\" and have preached crusades of a\\n\"return to nature.\" 172 Many modern writers have tried to recapture some\\nof the power of the folk tale by imitating its method. In many cases\\nthey have had a fair degree of success: in one case, that of Hans\\nChristian Andersen, the success is admittedly very complete. As a rule,\\nhowever, the sharpness of the sense of wonder has been blunted, and many\\nimitators of the old fairy tale succeed in keeping only the shell.\\nAnother class of modern fantastic tale is that of the pourquoi story, which has the explanation of something\\nas its object. Such tales grow out of the attempt to use the charm of\\nold stories as a means of conveying instruction, somewhat after the\\nmethod of those parents who covered up our bitter medicine with some of\\nour favorite jam. Even \"Little Red Riding Hood,\" as we saw, has been\\nturned into a flower myth. So compelling is this pedagogical motive that\\nso-called nature myths have been invented or made from existing stories\\nin great numbers. The practical results please many teachers, but it may\\nbe questioned whether the gain is sufficient to compensate children for\\nthe distorting results upon masterpieces.\\n\\nWide range of the modern fairy tale.\\nThe bibliography will suggest something of the treasures in the field of\\nthe modern fanciful story. From the delightful nonsense of Alice\\nin Wonderland and the \"travelers\\' tales\" of Baron\\nMunchausen to the profound seriousness of The King of the\\nGolden River and Why the Chimes Rang is a far cry.\\nThere are the rich fancies of Barrie and Maeterlinck, at the same time\\ndelicate as the promises of spring and brilliant as the fruitions of\\nsummer. 
One may be blown away to the land of Oz, he may lose his shadow\\nwith Peter Schlemihl, he may outdo the magic carpet with his\\nTraveling-Cloak, he may visit the courts of kings with his Wonderful\\nChair; Miss Muffet will invite us to her Christmas party, Lemuel\\nGulliver will lead us to lands not marked in the school atlas; on every\\nside is a world of wonder.\\n\\nSome qualities of these modern tales.\\nEvery age produces after its own fashion, and we must expect to find the\\nmodern user of the fairy-story method expressing through it the\\nqualities of his own outlook upon the world. Interest in the picturesque\\naspects of landscape will be emphasized, as in the early portions of\\n\"The Story of Fairyfoot\" and, with especial magnificence of style,\\nthroughout The King of the Golden River. There will appear\\nthe saddened mood of the modern in the face of the human miseries that\\nmake happiness a mockery, as in \"The Happy Prince.\" The destructive\\neffects of the possessive instinct upon all that is finest in human\\nnature is reflected in \"The Prince\\'s Dream.\" That the most valuable\\nefforts are often those performed with least spectacular settings may be\\ndiscerned in \"The Knights of the Silver Shield,\" while the lesson of\\nkindly helpfulness is the burden of \"Old Pipes and the Dryad.\" In many\\nmodern stories the reader is too much aware of the conscious efforts of\\nstyle and structure. The thoughtful child will sometimes be too much\\ndistressed by the more somber modern story, and should not hear too many\\nof the gloomy type.\\n\\nAndersen the consummate master. Hans\\nChristian Andersen is the acknowledged master of the modern story for\\nchildren. What are the sources of his success? Genius is always\\nunexplainable except in terms of itself, but some things are clear. To\\nbegin, he makes a mark—drives down a peg: \"There came a soldier marching\\nalong 173 the high road—one, two! one, two!\" and\\nyou are off. 
No backing and filling, no jockeying for position, no\\nelaborate setting of the stage. The story\\'s the thing! Next, the\\nlanguage is the language of common oral speech, free and unrestrained.\\nThe rigid forms of the grammar are eschewed. There is no beating around\\nthe bush. Seeing through the eyes of the child, he uses the language\\nthat is natural to such sight: \"Aha! there sat the dog with eyes as big\\nas mill-wheels.\" In quick dramatic fashion the story unrolls before your\\nvision: \"So the soldier cut the witch\\'s head off. There she lay!\" No\\nagonizing over the cruelty of it, the lack of sympathy. It is a joke\\nafter the child\\'s own heart, and with a hearty laugh at this end to an\\nimpostor, the listener is on with the story. The logic is the logic of\\nchildhood: \"And everyone could see she was a real princess, for she was\\nso lovely.\" When Andersen deals with some of the deeper truths of\\nexistence, as in \"The Nightingale\" or \"The Ugly Duckling,\" he still\\nmanages to throw it all into the form that is natural and convincing and\\nsimple to the child. He never mounts a pedestal and becomes a grown-up\\nphilosopher. Perhaps Andersen\\'s secret lay in the fact that some fairy\\ngodmother invested him at birth with a power to see things so completely\\nas a child sees them that he never questioned the dignity of the method.\\nIn few of his stories is there any evidence of a constraint due to a\\nconscious attempt to write down to the understandings of children.\\n\\nSUGGESTIONS FOR READING\\n\\nThe most valuable discussion of the difficulties to be mastered in\\nwriting the literary fairy tale, and the story of the only very complete\\nmastery yet made, will be found in the account of Hans Christian\\nAndersen in Eminent Authors of the Nineteenth Century, by\\nGeorg Brandes. 
Now and then hints of importance on such stories and\\ntheir value for children may be found in biographies of the more\\nprominent writers represented in the section and mentioned in the\\nbibliography, and in magazine articles and reviews. These latter may be\\nlocated by use of the periodical indexes found in most libraries. For\\nthe proper attitude which the schools should have toward fiction and\\nfanciful writing in general, nothing could be better than two lectures\\non \"Children\\'s Reading,\" in On the Art of Reading, by Sir\\nArthur Quiller-Couch.\\n\\n174\\n\\n190\\n\\nThe rabbis of old were good story-tellers. They were essentially\\nteachers and they understood that the best sermon is a story. \"They were\\nfond of the parable, the anecdote, the apt illustration, and their\\nlegends that have been transmitted to us, all aglow with the light and\\nlife of the Orient, possess perennial charm.\" It is possible to find in\\nrabbinical sources a large number of brief stories that have the power\\nof entertaining as well as of emphasizing some qualities of character\\nthat are important in all ages. The plan of this book does not include\\nthe wonderful stories of the Old Testament, which are easy of access to\\nany teacher and may be used as experience directs. The Hebrew stories\\nfollowing correspond very nearly to the folk anecdote and are placed in\\nthis section because of their literary form.\\n\\nDr. Abram S. Isaacs (1851—) is a professor in New York University and\\nis also a rabbi. The selection that follows is from his Stories\\nfrom the Rabbis. (Copyrighted. Used by special permission of The\\nBloch Publishing Company, New York.) Taking advantage of the popular\\nsuperstition that a four-leaved clover is a sign of good luck, Dr.\\nIsaacs has grouped together four parable-like stories, each of which\\ndeals with wealth as a subject. The editors are responsible for the\\nspecial titles given. 
The messages of these stories might be summarized\\nas follows: If you would be lucky, (1) be honest because it is right to\\nbe honest, (2) value good friends more highly than gold, (3) let love\\naccompany each gift of charity, and (4) use common sense in your\\nbusiness ventures.\\n\\nA FOUR-LEAVED CLOVER\\n\\nABRAM S. ISAACS\\n\\n1. The Rabbi and The Diadem\\n\\nGreat was the alarm in the palace of Rome, which soon spread\\nthroughout the entire city. The Empress had lost her costly diadem, and\\nit could not be found. They searched in every direction, but it was all\\nin vain. Half distracted, for the mishap boded no good to her or her\\nhouse, the Empress redoubled her exertions to regain her precious\\npossession, but without result. As a last resource it was proclaimed in\\nthe public streets:\\n\\n\"The Empress has lost a priceless diadem. Whoever restores it within\\nthirty days shall receive a princely reward. But he who delays, and\\nbrings it after thirty days, shall lose his head.\"\\n\\nIn those times all nationalities flocked toward Rome; all classes and\\ncreeds could be met in its stately halls and crowded thoroughfares.\\nAmong the rest was a rabbi, a learnèd sage from the East, who loved\\ngoodness and lived a righteous life, in the stir and turmoil of the\\nWestern world. It chanced one night as he was strolling up and down, in\\nbusy meditation, beneath the clear, moonlit sky, he saw the diadem\\nsparkling at his feet. He seized it quickly, brought it to his dwelling,\\nwhere he guarded it carefully until the thirty days had expired, when he\\nresolved to return it to the owner.\\n\\nHe proceeded to the palace, and, undismayed at sight of long lines of\\nsoldiery and officials, asked for an audience with the Empress.\\n\\n\"What dost thou mean by this?\" she inquired, when he told her his\\nstory and gave her the diadem. \"Why didst thou delay until this hour?\\nDost thou know the penalty? 
Thy head must be forfeited.\"\\n\\n\"I delayed until now,\" the rabbi answered calmly, \"so that thou\\nmightst know that I return thy diadem, not for the sake of the reward,\\nstill less out of fear of punishment; but solely to comply 175 with\\nthe Divine command not to withhold from another the property which\\nbelongs to him.\"\\n\\n\"Blessed be thy God!\" the Empress answered, and dismissed the rabbi\\nwithout further reproof; for had he not done right for right\\'s sake?\\n\\n2. Friendship\\n\\nA certain father was doubly blessed—he had reached a good old age,\\nand had ten sons. One day he called them to his side, and after repeated\\nexpressions of affection, told them that he had acquired a fortune by\\nindustry and economy, and would give them one hundred gold pieces each\\nbefore his death, so that they might begin business for themselves, and\\nnot be obliged to wait until he had passed away. It happened, however,\\nthat, soon after, he lost a portion of his property, much to his regret,\\nand had only nine hundred and fifty gold pieces left. So he gave one\\nhundred to each of his nine sons. When his youngest son, whom he loved\\nmost of all, asked naturally what was to be his share, the father\\nreplied:\\n\\n\"My son, I promised to give each of thy brothers one hundred gold\\npieces. I shall keep my word to them. I have fifty left. Thirty I shall\\nreserve for my funeral expenses, and twenty will be thy portion. But\\nunderstand this—I possess, in addition, ten friends, whom I give over to\\nthee as compensation for the loss of the eighty gold pieces. Believe me,\\nthey are worth more than all the gold and silver.\"\\n\\nThe youth tenderly embraced his parent, and assured him that he was\\ncontent, such was his confidence and affection. In a few days the father\\ndied, and the nine sons took their money, and without a thought of their\\nyoungest brother and the small amount he had received, followed each his\\nown fancy. 
But the youngest son, although his portion was the least,\\nresolved to heed his father\\'s words, and hold fast to the ten friends.\\nWhen a short time had elapsed he prepared a simple feast, went to the\\nten friends of his father, and said to them: \"My father, almost in his\\nlast words, asked me to keep you, his friends, in honor. Before I leave\\nthis place to seek my fortune elsewhere, will you not share with me a\\nfarewell meal, and aid me thus to comply with his dying request?\"\\n\\nThe ten friends, stirred by his earnestness and cordiality, accepted\\nhis invitation with pleasure, and enjoyed the repast, although they were\\nused to richer fare. When the moment for parting arrived, however, one\\nof them rose and spoke: \"My friends, it seems to me that of all the sons\\nof our dear friend that has gone, the youngest alone is mindful of his\\nfather\\'s friendship for us, and reverences his memory. Let us, then, be\\ntrue friends to him, for his own sake as well, and provide for him a\\ngenerous sum, that he may begin business here, and not be forced to live\\namong strangers.\"\\n\\nThe proposal, so unexpected and yet so merited, was received with\\napplause. The youth, proud of their friendship, soon became a prosperous\\nmerchant, who never forgot that faithful friends were more valuable than\\ngold or silver, and left an honored name to his descendants.\\n\\n3. True Charity\\n\\nThere lived once a very wealthy man, who cared little for money,\\nexcept as 176 a means for helping others. He used to adopt a\\npeculiar plan in his method of charitable relief. He had three boxes\\nmade for the three different classes of people whom he desired to\\nassist. In one box he put gold pieces, which he distributed among\\nartists and scholars, for he honored knowledge and learning as the\\nhighest possession. In the second box he placed silver pieces for widows\\nand orphans, for whom his sympathies were readily awakened. 
In the third\\nwere copper coins for the general poor and beggars—no one was turned\\naway from his dwelling without some gift, however small.\\n\\nThat the man was beloved by all, need hardly be said. He rejoiced\\nthat he was enabled to do so much good, retained his modest bearing, and\\ncontinued to regard his wealth as only an incentive to promote the\\nhappiness of mankind, without distinction of creed or nationality.\\nUnhappily, his wife was just the opposite. She rarely gave food or\\nraiment to the poor, and felt angry at her husband\\'s liberality, which\\nshe considered shameless extravagance.\\n\\nThe day came when in the pressure of various duties he had to leave\\nhis house, and could not return until the morrow. Unaware of his sudden\\ndeparture, the poor knocked at the door as usual for his kind gifts; but\\nwhen they found him absent, they were about to go away or remain in the\\nstreet, being terrified at the thought of asking his wife for alms.\\nVexed at their conduct, she exclaimed impetuously: \"I will give to the\\npoor according to my husband\\'s method.\"\\n\\nShe seized the keys of the boxes, and first opened the box of gold.\\nBut how great was her terror when she gazed at its contents—frogs\\njumping here and there. Then she went to the silver box, and it was full\\nof ants. With troubled heart, she opened the copper box, and it was\\ncrowded with creeping bugs. Loud then were her complaints, and bitter\\nher tears, at the deception, and she kept her room until her husband\\nreturned.\\n\\nNo sooner did the man enter the room, annoyed that so many poor\\npeople were kept waiting outside, than she asked him: \"Why did you give\\nme keys to boxes of frogs, ants, and bugs, instead of gold, silver, and\\ncopper? Was it right thus to deceive your wife, and disappoint the\\npoor?\"\\n\\n\"Not so,\" rejoined her husband. \"The mistake must be yours, not mine.\\nI have given you the right keys. I do not know what you have done with\\nthem. 
Come, let me have them. I am guiltless of any deception.\" He took\\nthe keys, quickly opened the boxes, and found the coins as he had left\\nthem. \"Ah, dear wife,\" said he, when she had regained her composure,\\n\"your heart, I fear, was not in the gift, when you wished to give to the\\npoor. It is the feeling that prompts us to aid, not the mere money,\\nwhich is the chief thing after all.\"\\n\\nAnd ever after, her heart was changed. Her gifts blessed the poor of\\nthe land, and aroused their love and reverence.\\n\\n4. An Eastern Garden\\n\\nIn an Eastern city a lovely garden flourished, whose beauty and\\nluxuriance awakened much admiration. It was the owner\\'s greatest\\npleasure to watch its growth, as leaf, flower, and tree seemed daily to\\nunfold to brighter bloom. One morning, while taking his usual stroll\\nthrough the well-kept paths, 177 he was surprised to find that\\nsome blossoms were picked to pieces. The next day he noticed more signs\\nof mischief, and rendered thus more observant he gave himself no rest\\nuntil he had discovered the culprit. It was a little trembling bird,\\nwhom he managed to capture, and was about to kill in his anger, when it\\nexclaimed: \"Do not kill me, I beg you, kind sir. I am only a wee, tiny\\nbird. My flesh is too little to satisfy you. I would not furnish\\none-hundredth of a meal to a man of your size. Let me free without any\\nhesitation, and I shall teach you something that will be of much use to\\nyou and your friends.\"\\n\\n\"I would dearly like to put an end to you,\" replied the man, \"for you\\nwere rapidly putting an end to my garden. It is a good thing to rid the\\nworld of such annoyances. But as I am not revengeful, and am always glad\\nto learn something useful, I shall set you free this time.\" And he\\nopened his hand to give the bird more air.\\n\\n\"Attention!\" cried the bird. 
\"Here are three rules which should guide\\nyou through life, and if you observe them you will find your path made\\neasier: Do not cry over spilt milk; do not desire what is unattainable,\\nand do not believe what is impossible.\"\\n\\nThe man was satisfied with the advice, and let the bird escape; but\\nit had scarcely regained its liberty, when, from a high tree opposite,\\nit exclaimed:\\n\\n\"What a silly man! The idea of letting me escape! If you only knew\\nwhat you have lost! But it is too late now.\"\\n\\n\"What have I lost?\" the man asked, angrily.\\n\\n\"Why, if you had killed me, as you intended, you would have found\\ninside of me a huge pearl, as large as a goose\\'s egg, and you would have\\nbeen a wealthy man forever.\"\\n\\n\"Dear little bird,\" the man said in his blandest tones; \"sweet little\\nbird, I will not harm you. Only come down to me, and I will treat you as\\nif you were my own child, and give you fruit and flowers all day. I\\nassure you of this most sacredly.\"\\n\\nBut the bird shook its head sagely, and replied: \"What a silly man,\\nto forget so soon the advice which was given him in all seriousness. I\\ntold you not to cry over spilt milk, and here you are, worrying over\\nwhat has happened. I urged you not to desire the unattainable, and now\\nyou wish to capture me again. And, finally, I asked you not to believe\\nwhat is impossible, and you are rashly imagining that I have a huge\\npearl inside of me, when a goose\\'s egg is larger than my whole body. You\\nought to learn your lessons better in the future, if you would become\\nwise,\" added the bird, as with another twist of its head it flew away,\\nand was lost in the distance.\\n\\n191\\n\\nA classic collection of short stories from the ancient Hebrew sages\\nis the little book, Hebrew Tales, published in London in\\n1826 by the noted Jewish scholar Hyman Hurwitz (1770-1844). A modern\\nhandy edition of this book (about sixty tales) is published as Vol. 
II\\nof the Library of Jewish Classics. Of special interest is the fact that\\nit contained three stories by the poet Samuel Taylor Coleridge, who had\\npublished them first in his periodical, The Friend.\\nColeridge was much interested in Hebrew literature, and especially fond\\nof speaking in parables, as those who know \"The Ancient Mariner\" will\\nreadily recall. The 178 following is one of the three stories referred\\nto, and it had prefixed to it the significant text, \"The Lord helpeth\\nman and beast.\" (Psalm XXXVI, 6.)\\n\\nTHE LORD HELPETH MAN AND BEAST\\n\\nSAMUEL TAYLOR COLERIDGE\\n\\nDuring his march to conquer the world, Alexander, the Macedonian,\\ncame to a people in Africa who dwelt in a remote and secluded corner, in\\npeaceful huts, and knew neither war nor conqueror. They led him to the\\nhut of their chief, who received him hospitably, and placed before him\\ngolden dates, golden figs, and bread of gold.\\n\\n\"Do you eat gold in this country?\" said Alexander.\\n\\n\"I take it for granted,\" replied the chief, \"that thou wert able to\\nfind eatable food in thine own country. For what reason, then, art thou\\ncome amongst us?\"\\n\\n\"Your gold has not tempted me hither,\" said Alexander, \"but I would\\nbecome acquainted with your manners and customs.\"\\n\\n\"So be it,\" rejoined the other: \"sojourn among us as long as it\\npleaseth thee.\"\\n\\nAt the close of this conversation, two citizens entered, as into\\ntheir court of justice. The plaintiff said, \"I bought of this man a\\npiece of land, and as I was making a deep drain through it, I found a\\ntreasure. This is not mine, for I only bargained for the land, and not\\nfor any treasure that might be concealed beneath it; and yet the former\\nowner of the land will not receive it.\" The defendant answered, \"I hope\\nI have a conscience, as well as my fellow citizen. 
I sold him the land\\nwith all its contingent, as well as existing advantages, and\\nconsequently, the treasure inclusively.\"\\n\\nThe chief, who was at the same time their supreme judge,\\nrecapitulated their words, in order that the parties might see whether\\nor not he understood them aright. Then, after some reflection, said:\\n\"Thou hast a son, friend, I believe?\"\\n\\n\"Yes.\"\\n\\n\"And thou,\" addressing the other, \"a daughter?\"\\n\\n\"Yes.\"\\n\\n\"Well, then, let thy son marry thy daughter, and bestow the\\ntreasure on the young couple for a marriage portion.\" Alexander seemed\\nsurprised and perplexed. \"Think you my sentence unjust?\" the chief asked\\nhim.\\n\\n\"Oh, no!\" replied Alexander; \"but it astonishes me.\"\\n\\n\"And how, then,\" rejoined the chief, \"would the case have been\\ndecided in your country?\"\\n\\n\"To confess the truth,\" said Alexander, \"we should have taken both\\nparties into custody, and have seized the treasure for the king\\'s\\nuse.\"\\n\\n\"For the king\\'s use!\" exclaimed the chief; \"does the sun shine on\\nthat country?\"\\n\\n\"Oh, yes!\"\\n\\n\"Does it rain there?\"\\n\\n\"Assuredly.\"\\n\\n\"Wonderful! But are there tame animals in the country, that live on\\nthe grass and green herbs?\"\\n\\n\"Very many, and of many kinds.\"\\n\\n\"Ay, that must, then, be the cause,\" said the chief: \"for the sake of\\nthose innocent animals the All-gracious Being continues to let the sun\\nshine, and the rain drop down on your country; since its inhabitants are\\nunworthy of such blessings.\" 179\\n\\n192\\n\\nBy almost common consent Hans Christian Andersen (1805-1875), the\\nDanish author, is the acknowledged master of all modern writers of fairy\\ntales. He was born in poverty, the son of a poor shoemaker. 
With a\\nnaturally keen dramatic sense, his imagination was stirred by stories\\nfrom the Arabian Nights and La Fontaine\\'s\\nFables, by French and Spanish soldiers marching through his\\nnative city, and by listening to the wonderful folk tales of his\\ncountry. On a toy stage and with toy actors, these vivid impressions\\ntook actual form. The world continued a dramatic spectacle to him\\nthroughout his existence. His consuming ambition was for the stage, but\\nhe had none of the personal graces so necessary for success. He was\\nungainly and awkward, like his \"ugly duckling.\" But when at last he\\nbegan to write, he had the power to transfer to the page the vivid\\ndramas in his mind, and this power culminated in the creation of fairy\\nstories for children which he began to publish in 1835. It is usual to\\nsay that Andersen, like Peter Pan, \"never grew up,\" and it is certain\\nthat he never lost the power of seeing things as children see them. Like\\nmany great writers whose fame now rests on the suffrages of child\\nreaders, Andersen seems at first to have felt that the\\nTales were slight and beneath his dignity. They are not all\\nof the same high quality. Occasionally one of them becomes \"too\\nsentimental and sickly sweet,\" but the best of them have a sturdiness\\nthat is thoroughly refreshing.\\n\\nThe most acute analysis of the elements of Andersen\\'s greatness as\\nthe ideal writer for children is that made by his fellow-countryman\\nGeorg Brandes in Eminent Authors of the Nineteenth Century.\\nA briefer account on similar lines will be found in H. J. Boyesen\\'s\\nScandinavian Literature. A still briefer account, eminently\\nsatisfactory for an introduction to Andersen, by Benjamin W. Wells, is\\nin Warner\\'s Library of the World\\'s Best Literature. The\\ninterested student cannot, of course, afford to neglect Andersen\\'s own\\nThe Story of My Life. Among the more elaborate biographies\\nthe Life of Hans Christian Andersen by R. 
Nisbet Bain is\\nprobably the best. The first translation of the Tales into\\nEnglish was made by Mary Howitt in 1846 and, as far as it goes, is still\\nregarded as one of the finest. However, Andersen has been very fortunate\\nin his many translators. The version by H. W. Dulcken has been published\\nin many cheap forms and perhaps more widely read than any other. In\\naddition to the stories in the following pages, some of those most\\nsuitable for use are \"The Little Match Girl,\" \"The Silver Shilling,\"\\n\"Five Peas in the Pod,\" \"Hans Clodhopper,\" and \"The Snow Queen.\" The\\nlatter is one of the longest and an undoubted masterpiece.\\n\\nThe first two stories following are taken from Mrs. Henderson\\'s\\nAndersen\\'s Best Fairy Tales. (Copyright. Rand McNally &\\nCo.) This little book contains thirteen stories in a very simple\\ntranslation and also an excellent story of Andersen\\'s life in a form\\nmost attractive to children. \"The Princess and the Pea\" is a story for\\nthe story\\'s sake. The humor, perhaps slightly satirical, is based upon\\nthe notion so common in the old folk tales that royal personages are\\ndecidedly more delicate than the person of low degree. However, the\\ntendency to think oneself of more consequence than another is not\\nconfined to any one class.\\n\\nTHE REAL PRINCESS\\n\\n\\nHANS CHRISTIAN ANDERSEN\\n\\n(Version by Alice Corbin Henderson)\\n\\nThere was once a Prince who wanted to marry a Princess. But it was\\nonly a real Princess that he wanted to marry.\\n\\nHe traveled all over the world to find a real one. But, although\\nthere were 180 plenty of princesses, whether they were\\nreal princesses he could never discover. There was always\\nsomething that did not seem quite right about them.\\n\\nAt last he had to come home again. But he was very sad, because he\\nwanted to marry a real Princess.\\n\\nOne night there was a terrible storm. It thundered and lightened and\\nthe rain poured down in torrents. 
In the middle of the storm there came\\na knocking, knocking, knocking at the castle gate. The kind old King\\nhimself went down to open the castle gate.\\n\\nIt was a young Princess that stood outside the gate. The wind and the\\nrain had almost blown her to pieces. Water streamed out of her hair and\\nout of her clothes. Water ran in at the points of her shoes and out\\nagain at the heels. Yet she said that she was a real\\nPrincess.\\n\\n\"Well, we will soon find out about that!\" thought the Queen.\\n\\nShe said nothing, but went into the bedroom, took off all the\\nbedding, and put a small dried pea on the bottom of the bedstead. Then\\nshe piled twenty mattresses on top of the pea, and on top of these she\\nput twenty feather beds. This was where the Princess had to sleep that\\nnight.\\n\\nIn the morning they asked her how she had slept through the\\nnight.\\n\\n\"Oh, miserably!\" said the Princess. \"I hardly closed my eyes the\\nwhole night long! Goodness only knows what was in my bed! I slept upon\\nsomething so hard that I am black and blue all over. It was\\ndreadful!\"\\n\\nSo then they knew that she was a real Princess. For, through\\nthe twenty mattresses and the twenty feather beds, she had still felt\\nthe pea. No one but a real Princess could have had such a\\ntender skin.\\n\\nSo the Prince took her for his wife. He knew now that he had a\\nreal Princess.\\n\\nAs for the pea, it was put in a museum where it may still be seen if\\nno one has carried it away.\\n\\nNow this is a true story!\\n\\n193\\n\\nWith some dozen exceptions, all of Andersen\\'s Tales are\\nbased upon older stories, either upon some old folk tale or upon\\nsomething that he ran across in his reading. Dr. Brandes, in his\\nEminent Authors, shows in detail how \"The Emperor\\'s New Clothes\"\\ncame into being. 
\"One day in turning over the leaves of Don Manuel\\'s\\nCount Lucanor, Andersen became charmed by the homely wisdom\\nof the old Spanish story, with the delicate flavor of the Middle Ages\\npervading it, and he lingered over chapter vii, which treats of how a\\nking was served by three rogues.\" But Andersen\\'s story is a very\\ndifferent one in many ways from his Spanish original. For one thing, the\\nmeaning is so universal that no one can miss it. Most of us have, in all\\nlikelihood, at some time pretended to know what we do not know or to be\\nwhat we are not in order to save our face, to avoid the censure or\\nridicule of others. \"There is much concerning which people dare not\\nspeak the truth, through cowardice, through fear of acting otherwise\\nthan \\'all the world,\\' through anxiety lest they should appear stupid.\\nAnd the story is eternally new and it never ends. It has its grave side,\\nbut just because of its endlessness it has also its humorous side.\" When\\nthe absurd bubble of the grand procession is punctured by the child,\\nwhose mental honesty has not yet been spoiled by the pressure of\\nconvention, the Emperor \"held himself stiffer than ever, and the\\nchamberlains carried the invisible train.\" For it would never do to hold\\nup the procession!\\n\\nTranscriber\\'s Note: original reads \\'Emporer\\'s\\'\\n\\n181\\n\\nTHE EMPEROR\\'S NEW CLOTHES\\n\\n\\nHANS CHRISTIAN ANDERSEN\\n\\n(Version by Alice Corbin Henderson)\\n\\nMany years ago there lived an Emperor who thought so much of new\\nclothes that he spent all his money on them. He did not care for his\\nsoldiers; he did not care to go to the theater. He liked to drive out in\\nthe park only that he might show off his new clothes. He had a coat for\\nevery hour of the day. They usually say of a king, \"He is in the council\\nchamber.\" But of the Emperor they said, \"He is in the clothes\\ncloset!\"\\n\\nIt was a gay city in which the Emperor lived. 
And many strangers came\\nto visit it every day. Among these, one day, there came two rogues who\\nset themselves up as weavers. They said they knew how to weave the most\\nbeautiful cloths imaginable. And not only were the colors and patterns\\nused remarkably beautiful, but clothes made from this cloth could not be\\nseen by any one who was unfit for the office he held or was too stupid\\nfor any use.\\n\\n\"Those would be fine clothes!\" thought the Emperor. \"If I wore those\\nI could find out what men in my empire were not fit for the places they\\nheld. I could tell the clever men from the dunces! I must have some\\nclothes woven for me at once!\"\\n\\nSo he gave the two rogues a great deal of money that they might begin\\ntheir work at once.\\n\\nThe rogues immediately put up two looms and pretended to be working.\\nBut there was nothing at all on their looms. They called for the finest\\nsilks and the brightest gold, but this they put into their pockets. At\\nthe empty looms they worked steadily until late into the night.\\n\\n\"I should like to know how the weavers are getting on with my\\nclothes,\" thought the Emperor.\\n\\nBut he felt a little uneasy when he thought that any one who was\\nstupid or was not fit for his office would be unable to see the cloth.\\nOf course he had no fears for himself; but still he thought he would\\nsend some one else first, just to see how matters stood.\\n\\n\"I will send my faithful old Minister to the weavers,\" thought the\\nEmperor. \"He can see how the stuff looks, for he is a clever man, and no\\none is so careful in fulfilling duties as he is!\"\\n\\nSo the good old Minister went into the room where the two rogues sat\\nworking at the empty looms.\\n\\n\"Mercy on us!\" thought the old Minister, opening his eyes wide, \"I\\ncan\\'t see a thing!\" But he didn\\'t care to say so.\\n\\nBoth the rascals begged him to be good enough to step a little\\nnearer. 
They pointed to the empty looms and asked him if he did not\\nthink the pattern and the coloring wonderful. The poor old Minister\\nstared and stared as hard as he could, but he could not see anything,\\nfor, of course, there was nothing to see!\\n\\n\"Mercy!\" he said to himself. \"Is it possible that I am a dunce? I\\nnever thought so! Certainly no one must know it. Am I unfit for office?\\nIt will never do to say that I cannot see the stuff!\"\\n\\n\"Well, sir, why do you say nothing of it?\" asked the rogue who was\\npretending to weave.\\n\\n\"Oh, it is beautiful—charming!\" said the old Minister, peering\\nthrough his spectacles. \"What a fine pattern, and what wonderful colors!\\nI shall tell the Emperor that I am very much pleased with it.\" 182\\n\\n\"Well, we are glad to hear you say so,\" answered the two\\nswindlers.\\n\\nThen they named all the colors of the invisible cloth upon the looms,\\nand described the peculiar pattern. The old Minister listened intently,\\nso that he could repeat all that was said of it to the Emperor.\\n\\nThe rogues now began to demand more money, more silk, and more gold\\nthread in order to proceed with the weaving. All of this, of course,\\nwent into their pockets. Not a single strand was ever put on the empty\\nlooms at which they went on working.\\n\\nThe Emperor soon sent another faithful friend to see how soon the new\\nclothes would be ready. But he fared no better than the Minister. He\\nlooked and looked and looked, but still saw nothing but the empty\\nlooms.\\n\\n\"Isn\\'t that a pretty piece of stuff?\" asked both rogues, showing and\\nexplaining the handsome pattern which was not there at all.\\n\\n\"I am not stupid!\" thought the man. \"It must be that I am not worthy\\nof my good position. That is, indeed, strange. But I must not let it be\\nknown!\"\\n\\nSo he praised the cloth he did not see, and expressed his approval of\\nthe color and the design that were not there. 
To the Emperor he said,\\n\"It is charming!\"\\n\\nSoon everybody in town was talking about the wonderful cloth that the\\ntwo rogues were weaving.\\n\\nThe Emperor began to think now that he himself would like to see the\\nwonderful cloth while it was still on the looms. Accompanied by a number\\nof his friends, among whom were the two faithful officers who had\\nalready beheld the imaginary stuff, he went to visit the two men who\\nwere weaving, might and main, without any fiber and without any\\nthread.\\n\\n\"Isn\\'t it splendid!\" cried the two statesmen who had already been\\nthere, and who thought the others would see something upon the empty\\nlooms. \"Look, your Majesty! What colors! And what a design!\"\\n\\n\"What\\'s this?\" thought the Emperor. \"I see nothing at all! Am I a\\ndunce? Am I not fit to be Emperor? That would be the worst thing that\\ncould happen to me, if it were true.\"\\n\\n\"Oh, it is very pretty!\" said the Emperor aloud. \"It has my highest\\napproval!\"\\n\\nHe nodded his head happily, and stared at the empty looms. Never\\nwould he say that he could see nothing!\\n\\nHis friends, too, gazed and gazed, but saw no more than had the\\nothers. Yet they all cried out, \"It is beautiful!\" and advised the\\nEmperor to wear a suit made of this cloth in a great procession that was\\nsoon to take place.\\n\\n\"It is magnificent, gorgeous!\" was the cry that went from mouth to\\nmouth. The Emperor gave each of the rogues a royal ribbon to wear in his\\nbuttonhole, and called them the Imperial Court Weavers.\\n\\nThe rogues were up the whole night before the morning of the\\nprocession. They kept more than sixteen candles burning. 
The people\\ncould see them hard at work, completing the new clothes of the Emperor.\\nThey took yards of stuff down from the empty looms; they made cuts in\\nthe air with big scissors; they sewed with needles without thread; and,\\nat last, they said, \"The clothes are ready!\"\\n\\nThe Emperor himself, with his grandest courtiers, went to put on his\\nnew suit. 183\\n\\n\"See!\" said the rogues, lifting their arms as if holding something.\\n\"Here are the trousers! Here is the coat! Here is the cape!\" and so on.\\n\"It is as light as a spider\\'s web. One might think one had nothing on.\\nBut that is just the beauty of it!\"\\n\\n\"Very nice,\" said the courtiers. But they could see nothing; for\\nthere was nothing!\\n\\n\"Will your Imperial Majesty be graciously pleased to take off your\\nclothes,\" asked the rogues, \"so that we may put on the new ones before\\nthis long mirror?\"\\n\\nThe Emperor took off all his own clothes, and the two rogues\\npretended to put on each new garment as it was ready. They wrapped him\\nabout, and they tied and they buttoned. The Emperor turned round and\\nround before the mirror.\\n\\n\"How well his Majesty looks in his new clothes!\" said the people.\\n\"How becoming they are! What a pattern! What colors! It is a beautiful\\ndress!\"\\n\\n\"They are waiting outside with the canopy which is to be carried over\\nyour Majesty in the procession,\" said the master of ceremonies.\\n\\n\"I am ready,\" said the Emperor. \"Don\\'t the clothes fit well?\" he\\nasked, giving a last glance into the mirror as though he were looking at\\nall his new finery.\\n\\nThe men who were to carry the train of the Emperor\\'s cloak stooped\\ndown to the floor as if picking up the train, and then held it high in\\nthe air. They did not dare let it be known that they could see\\nnothing.\\n\\nSo the Emperor marched along under the bright canopy. Everybody in\\nthe streets and at the windows cried out: \"How beautiful the Emperor\\'s\\nnew clothes are! 
What a fine train! And they fit to perfection!\"\\n\\nNo one would let it be known that he could see nothing, for that\\nwould have proved that he was unfit for office or that he was very, very\\nstupid. None of the Emperor\\'s clothes had ever been as successful as\\nthese.\\n\\n\"But he has nothing on!\" said a little child.\\n\\n\"Just listen to the innocent!\" said its father.\\n\\nBut one person whispered to another what the child had said. \"He has\\nnothing on! A child says he has nothing on!\"\\n\\n\"But he has nothing on!\" at last cried all the people.\\n\\nThe Emperor writhed, for he knew that this was true. But he realized\\nthat it would never do to stop the procession. So he held himself\\nstiffer than ever, and the chamberlains carried the invisible train.\\n\\n194\\n\\nIn his story \"The Nightingale,\" Andersen suggests that the so-called\\nupper class of society may become so conventionalized as to be unable to\\nappreciate true beauty. Poor fishermen and the little kitchen girl in\\nthe story recognize the beauty of the exquisite song of the nightingale,\\nand Andersen shows his regard for royalty by having the emperor\\nappreciate it twice. The last part of the story is especially\\nimpressive. When Death approached the emperor and took from him the\\nsymbols that had made him rank above his fellows, the emperor saw the\\nrealities of life and again perceived the beauty of the nightingale\\'s\\nsong. This contact with real life made Death shrink away. Then the\\nemperor learned Andersen\\'s message to artificial society: If you would\\nbehold true beauty, you must have it in your own heart.\\n\\n184\\n\\nTHE NIGHTINGALE\\n\\nHANS CHRISTIAN ANDERSEN\\n\\nIn China, you must know, the Emperor is a Chinaman, and all whom he\\nhas about him are Chinamen too. It happened a good many years ago, but\\nthat\\'s just why it\\'s worth while to hear the story before it is\\nforgotten. 
The Emperor\\'s palace was the most splendid in the world; it\\nwas made entirely of porcelain, very costly, but so delicate and brittle\\nthat one had to take care how one touched it. In the garden were to be\\nseen the most wonderful flowers, and to the costliest of them silver\\nbells were tied, which sounded, so that nobody should pass by without\\nnoticing the flowers. Yes, everything in the Emperor\\'s garden was\\nadmirably arranged. And it extended so far that the gardener himself did\\nnot know where the end was. If a man went on and on, he came into a\\nglorious forest with high trees and deep lakes. The wood extended\\nstraight down to the sea, which was blue and deep; great ships could\\nsail, too, beneath the branches of the trees; and in the trees lived a\\nNightingale, which sang so splendidly that even the poor fisherman, who\\nhad many other things to do, stopped still and listened, when he had\\ngone out at night to throw out his nets, and heard the Nightingale.\\n\\n\"How beautiful that is!\" he said; but he was obliged to attend to his\\nproperty, and thus forgot the bird. But when the next night the bird\\nsang again, and the fisherman heard it, he exclaimed again, \"How\\nbeautiful that is!\"\\n\\nFrom all the countries of the world travelers came to the city of the\\nEmperor, and admired it, and the palace and the garden, but when they\\nheard the Nightingale, they said, \"That is the best of all!\"\\n\\nAnd the travelers told of it when they came home; and the learnèd men\\nwrote many books about the town, the palace, and the garden. But they\\ndid not forget the Nightingale; that was placed highest of all; and\\nthose who were poets wrote most magnificent poems about the Nightingale\\nin the wood by the deep lake.\\n\\nThe books went through all the world, and a few of them once came to\\nthe Emperor. 
He sat in his golden chair, and read, and read: every\\nmoment he nodded his head, for it pleased him to peruse the masterly\\ndescriptions of the city, the palace, and the garden. \"But the\\nNightingale is the best of all,\" it stood written there.\\n\\n\"What\\'s that?\" exclaimed the Emperor. \"I don\\'t know the Nightingale\\nat all! Is there such a bird in my empire, and even in my garden? I\\'ve\\nnever heard of that. To think that I should have to learn such a thing\\nfor the first time from books!\"\\n\\nAnd hereupon he called his cavalier. This cavalier was so grand that\\nif anyone lower in rank than himself dared to speak to him, or to ask\\nhim any question, he answered nothing but \"P!\"—and that meant\\nnothing.\\n\\n\"There is said to be a wonderful bird here called a Nightingale,\"\\nsaid the Emperor. \"They say it is the best thing in all my great empire.\\nWhy have I never heard anything about it?\"\\n\\n\"I have never heard him named,\" replied the cavalier. \"He has never\\nbeen introduced at Court.\"\\n\\n\"I command that he shall appear this evening, and sing before me,\"\\nsaid the Emperor. \"All the world knows what I possess, and I do not know\\nit myself!\"\\n\\n\"I have never heard him mentioned,\" said the cavalier. \"I will seek\\nfor him. I will find him.\"\\n\\nBut where was he to be found? The cavalier ran up and down all the\\nstaircases, through halls and passages, but no one among all those whom\\nhe met had heard talk of the Nightingale. And the cavalier ran back to\\nthe Emperor, and said that it must be a fable invented by the writers of\\nbooks.\\n\\n\"Your Imperial Majesty cannot believe how much is written that is\\nfiction, besides something that they call the black art.\"\\n\\n\"But the book in which I read this,\" said the Emperor, \"was sent to\\nme by the high and mighty Emperor of Japan and therefore it cannot be a\\nfalsehood. I will hear the Nightingale! It must be here this\\nevening! 
It has my imperial favor; and if it does not come, all the\\nCourt shall be trampled upon after the Court has supped!\"\\n\\n\"Tsing-pe!\" said the cavalier; and again he ran up and down all the\\nstaircases, and through all the halls and corridors; and half the Court\\nran with him, for the courtiers did not like being trampled upon.\\n\\nThen there was a great inquiry after the wonderful Nightingale, which\\nall the world knew excepting the people at Court.\\n\\nAt last they met with a poor little girl in the kitchen, who\\nsaid:\\n\\n\"The Nightingale? I know it well; yes, it can sing gloriously. Every\\nevening I get leave to carry my poor sick mother the scraps from the\\ntable. She lives down by the strand; and when I get back and am tired,\\nand rest in the wood, then I hear the Nightingale sing. And then the\\nwater comes into my eyes, and it is just as if my mother kissed me.\"\\n\\n\"Little kitchen girl,\" said the cavalier, \"I will get you a place in\\nthe Court kitchen, with permission to see the Emperor dine, if you will\\nbut lead us to the Nightingale, for it is announced for this\\nevening.\"\\n\\nSo they all went out into the wood where the Nightingale was\\naccustomed to sing; half the Court went forth. When they were in the\\nmidst of their journey a cow began to low.\\n\\n\"Oh!\" cried the Court pages, \"now we have it! That shows a wonderful\\npower in so small a creature! I have certainly heard it before.\"\\n\\n\"No, those are cows lowing,\" said the little kitchen girl. \"We are a\\nlong way from the place yet.\"\\n\\nNow the frogs began to croak in the marsh.\\n\\n\"Glorious!\" said the Chinese Court preacher. \"Now I hear it—it sounds\\njust like little church bells.\"\\n\\n\"No, those are frogs,\" said the little kitchen maid. \"But now I think\\nwe shall soon hear it.\"\\n\\nAnd then the Nightingale began to sing.\\n\\n\"That is it!\" exclaimed the little girl. \"Listen, listen! 
and yonder\\nit sits.\"\\n\\nAnd she pointed to a little gray bird up in the boughs.\\n\\n\"Is it possible?\" cried the cavalier. \"I should never have thought it\\nlooked like that! How simple it looks! It must certainly have lost its\\ncolor at seeing such grand people around.\"\\n\\n\"Little Nightingale!\" called the little kitchen maid, quite loudly,\\n\"our gracious Emperor wishes you to sing before him.\"\\n\\n\"With the greatest pleasure!\" replied the Nightingale, and began to\\nsing most delightfully.\\n\\n\"It sounds just like glass bells!\" said the cavalier. \"And look at\\nits little throat, how it\\'s working! It\\'s wonderful that we should never\\nhave heard it before. That bird will be a great success at Court.\"\\n\\n\"Shall I sing once more before the Emperor?\" inquired the\\nNightingale, for it thought the Emperor was present.\\n\\n\"My excellent little Nightingale,\" said the cavalier, \"I have great\\npleasure in inviting you to a Court festival this evening, when you\\nshall charm his Imperial Majesty with your beautiful singing.\"\\n\\n\"My song sounds best in the green wood,\" replied the Nightingale;\\nstill it came willingly when it heard what the Emperor wished.\\n\\nThe palace was festively adorned. The walls and the flooring, which\\nwere of porcelain, gleamed in the rays of thousands of golden lamps. The\\nmost glorious flowers, which could ring clearly, had been placed in the\\npassages. There was a running to and fro, and a thorough draught, and\\nall the bells rang so loudly that one could not hear one\\'s self\\nspeak.\\n\\nIn the midst of the great hall, where the Emperor sat, a golden perch\\nhad been placed, on which the Nightingale was to sit. The whole Court\\nwas there, and the little cook-maid had got leave to stand behind the\\ndoor, as she had now received the title of a real Court cook. 
All were\\nin full dress, and all looked at the little gray bird, to which the\\nEmperor nodded.\\n\\nAnd the Nightingale sang so gloriously that the tears came into the\\nEmperor\\'s eyes, and the tears ran down over his cheeks; then the\\nNightingale sang still more sweetly, that went straight to the heart.\\nThe Emperor was so much pleased that he said the Nightingale should have\\nhis golden slipper to wear round its neck. But the Nightingale declined\\nthis with thanks, saying it had already received a sufficient\\nreward.\\n\\n\"I have seen tears in the Emperor\\'s eyes—that is the real treasure to\\nme. An Emperor\\'s tears have a peculiar power. I am rewarded enough!\" And\\nthen it sang again with a sweet, glorious voice.\\n\\n\"That\\'s the most amiable coquetry I ever saw!\" said the ladies who\\nstood round about, and then they took water in their mouths to gurgle\\nwhen anyone spoke to them. They thought they should be nightingales too.\\nAnd the lackeys and chambermaids reported that they were satisfied also;\\nand that was saying a good deal, for they are the most difficult to\\nplease. In short, the Nightingale achieved a real success.\\n\\nIt was now to remain at Court, to have its own cage, with liberty to\\ngo out twice every day and once at night. Twelve servants were appointed\\nwhen the Nightingale went out, each of whom had a silken string fastened\\nto the bird\\'s legs, which they held very tight. There was really no\\npleasure in an excursion of that kind.\\n\\nThe whole city spoke of the wonderful bird, and whenever two people\\nmet, one said nothing but \"Nightin,\" and the other said \"gale\"; and then\\nthey both sighed, and understood one another. Eleven pedlars\\' children\\nwere named after the bird, but not one of them could sing a note.\\n\\nOne day the Emperor received a large parcel, on which was written,\\n\"The Nightingale.\"\\n\\n\"There we have a new book about this celebrated bird,\" said the\\nEmperor. 
187\\n\\nBut it was not a book, but a little work of art, contained in a\\nbox—an artificial nightingale, which was to sing like a natural one, and\\nwas brilliantly ornamented with diamonds, sapphires, and rubies. So soon\\nas the artificial bird was wound up, he could sing one of the pieces\\nthat he really sang, and then his tail moved up and down, and shone with\\nsilver and gold. Round his neck hung a little ribbon, and on that was\\nwritten, \"The Emperor of China\\'s nightingale is poor compared to that of\\nthe Emperor of Japan.\"\\n\\n\"That is capital!\" said they all, and he who had brought the\\nartificial bird immediately received the title, Imperial\\nHead-Nightingale-Bringer.\\n\\n\"Now they must sing together; what a duet that will be!\" cried the\\ncourtiers.\\n\\nAnd so they had to sing together; but it did not sound very well, for\\nthe real Nightingale sang its own way, and the artificial bird sang\\nwaltzes.\\n\\n\"That\\'s not his fault,\" said the playmaster; \"he\\'s quite perfect, and\\nvery much in my style.\"\\n\\nNow the artificial bird was to sing alone. It had just as much\\nsuccess as the real one, and then it was much handsomer to look at—it\\nshone like bracelets and breastpins.\\n\\nThree and thirty times over did it sing the same piece, and yet was\\nnot tired. The people would gladly have heard it again, but the Emperor\\nsaid that the living Nightingale ought to sing something now. But where\\nwas it? No one had noticed that it had flown away out of the open\\nwindow, back to the green wood.\\n\\n\"But what has become of that?\" asked the Emperor.\\n\\nAnd all the courtiers abused the Nightingale, and declared that it\\nwas a very ungrateful creature.\\n\\n\"We have the best bird after all,\" said they.\\n\\nAnd so the artificial bird had to sing again, and that was the\\nthirty-fourth time that they listened to the same piece. For all that\\nthey did not know it quite by heart, for it was so very difficult. 
And\\nthe playmaster praised the bird particularly; yes, he declared that it\\nwas better than a nightingale, not only with regard to its plumage and\\nthe many beautiful diamonds, but inside as well.\\n\\n\"For you see, ladies and gentlemen, and above all, your Imperial\\nMajesty, with a real nightingale one can never calculate what is coming,\\nbut in this artificial bird, everything is settled. One can explain it;\\none can open it and make people understand where the waltzes come from,\\nhow they go, and how one follows up another.\"\\n\\n\"Those are quite our own ideas,\" they all said.\\n\\nAnd the speaker received permission to show the bird to the people on\\nthe next Sunday. The people were to hear it sing too, the Emperor\\ncommanded: and they did hear it, and were as much pleased as if they had\\nall got tipsy upon tea, for that\\'s quite the Chinese fashion, and they\\nall said, \"Oh!\" and held up their forefingers and nodded. But the poor\\nfisherman, who had heard the real Nightingale, said:\\n\\n\"It sounds pretty enough, and the melodies resemble each other, but\\nthere\\'s something wanting, though I know not what!\"\\n\\nThe real Nightingale was banished from the country and empire. 
The\\nartificial bird had its place on a silken 188 cushion close to the Emperor\\'s\\nbed; all the presents it had received, gold and precious stones, were\\nranged about it; in title it had advanced to be the High Imperial\\nAfter-Dinner-Singer, and in rank to Number One on the left hand; for the\\nEmperor considered that side the most important on which the heart is\\nplaced, and even in an Emperor the heart is on the left side; and the\\nplaymaster wrote a work of five and twenty volumes about the artificial\\nbird; it was very learnèd and very long, full of the most difficult\\nChinese words; but yet all the people declared that they had read it and\\nunderstood it, for fear of being considered stupid, and having their\\nbodies trampled on.\\n\\nSo a whole year went by. The Emperor, the Court, and all the other\\nChinese knew every little twitter in the artificial bird\\'s song by\\nheart. But just for that reason it pleased them best—they could sing\\nwith it themselves, and they did so. The street boys sang,\\n\"Tsi-tsi-tsi-glug-glug!\" and the Emperor himself sang it too. Yes, that\\nwas certainly famous.\\n\\nBut one evening, when the artificial bird was singing its best, and\\nthe Emperor lay in bed listening to it, something inside the bird said,\\n\"Whizz!\" Something cracked. \"Whir-r-r!\" All the wheels ran round, and\\nthen the music stopped.\\n\\nThe Emperor immediately sprang out of bed, and caused his body\\nphysician to be called; but what could he do? Then they sent\\nfor a watchmaker, and after a good deal of talking and investigation,\\nthe bird was put into something like order, but the watchmaker said that\\nthe bird must be carefully treated, for the barrels were worn, and it\\nwould be impossible to put new ones in in such a manner that the music\\nwould go. There was a great lamentation; only once in the year was it\\npermitted to let the bird sing, and that was almost too much. 
But then\\nthe playmaster made a little speech full of heavy words, and said this\\nwas just as good as before—and so of course it was as good as\\nbefore.\\n\\nNow five years had gone by, and a real grief came upon the whole\\nnation. The Chinese were really fond of their Emperor, and now he was\\nill, and could not, it was said, live much longer. Already a new Emperor\\nhad been chosen, and the people stood out in the street and asked the\\ncavalier how the Emperor did.\\n\\n\"P!\" said he, and shook his head.\\n\\nCold and pale lay the Emperor in his great, gorgeous bed; the whole\\nCourt thought him dead, and each one ran to pay homage to the new ruler.\\nThe chamberlains ran out to talk it over, and the ladies\\' maids had a\\ngreat coffee party. All about, in all the halls and passages, cloth had\\nbeen laid down so that no footstep could be heard, and therefore it was\\nquiet there, quite quiet. But the Emperor was not dead yet; stiff and\\npale he lay on the gorgeous bed, with the long velvet curtains and the\\nheavy gold tassels; high up, a window stood open, and the moon shone in\\nupon the Emperor and the artificial bird.\\n\\nThe poor Emperor could scarcely breathe; it was just as if something\\nlay upon his chest; he opened his eyes, and then he saw that it was\\nDeath who sat upon his chest, and had put on his golden crown, and held\\nin one hand the Emperor\\'s sword, in the other his beautiful banner. And\\nall around, from among the folds of the splendid velvet curtains,\\nstrange heads peered forth; a few very 189 ugly, the rest quite lovely and\\nmild. These were all the Emperor\\'s bad and good deeds, that stood before\\nhim now that Death sat upon his heart.\\n\\n\"Do you remember this?\" whispered one to the other. \"Do you remember\\nthat?\" and then they told him so much that the perspiration ran from his\\nforehead.\\n\\n\"I did not know that!\" said the Emperor. \"Music! music! 
the great\\nChinese drum!\" he cried, \"so that I need not hear all they say!\"\\n\\nAnd they continued speaking, and Death nodded like a Chinaman to all\\nthey said.\\n\\n\"Music! music!\" cried the Emperor. \"You little precious golden bird,\\nsing, sing! I have given you gold and costly presents; I have even hung\\nmy golden slipper around your neck—sing now, sing!\"\\n\\nBut the bird stood still; no one was there to wind him up, and he\\ncould not sing without that; but Death continued to stare at the Emperor\\nwith his great, hollow eyes, and it was quiet, fearfully quiet.\\n\\nThen there sounded from the window, suddenly, the most lovely song.\\nIt was the little live Nightingale, that sat outside on a spray. It had\\nheard of the Emperor\\'s sad plight, and had come to sing to him of\\ncomfort and hope. As it sang the specters grew paler and paler; the\\nblood ran quicker and more quickly through the Emperor\\'s weak limbs; and\\neven Death listened, and said:\\n\\n\"Go on, little Nightingale, go on!\"\\n\\n\"But will you give me that splendid golden sword? Will you give me\\nthat rich banner? Will you give me the Emperor\\'s crown?\"\\n\\nAnd Death gave up each of these treasures for a song. And the\\nNightingale sang on and on; and it sang of the quiet churchyard where\\nthe white roses grow, where the elder blossoms smell sweet, and where\\nthe fresh grass is moistened by the tears of survivors. Then Death felt\\na longing to see his garden, and floated out at the window in the form\\nof a cold white mist.\\n\\n\"Thanks! thanks!\" said the Emperor. \"You heavenly little bird; I know\\nyou well. I banished you from my country and empire, and yet you have\\ncharmed away the evil faces from my couch, and banished Death from my\\nheart! How can I reward you?\"\\n\\n\"You have rewarded me!\" replied the Nightingale. 
\"I have drawn tears\\nfrom your eyes, when I sang the first time—I shall never forget that.\\nThose are the jewels that rejoice a singer\\'s heart. But now sleep, and\\ngrow fresh and strong again. I will sing you something.\"\\n\\nAnd it sang, and the Emperor fell into a sweet slumber. Ah! how mild\\nand refreshing that sleep was! The sun shone upon him through the\\nwindows when he awoke refreshed and restored: not one of his servants\\nhad yet returned, for they all thought he was dead; only the Nightingale\\nstill sat beside him and sang.\\n\\n\"You must always stay with me,\" said the Emperor. \"You shall sing as\\nyou please; and I\\'ll break the artificial bird into a thousand\\npieces.\"\\n\\n\"Not so,\" replied the Nightingale. \"It did well as long as it could;\\nkeep it as you have done till now. I cannot build my nest in the palace\\nto dwell in it, but let me come when I feel the wish; then I will sit in\\nthe evening on the spray yonder by the window, and sing you something,\\nso that you may be glad and thoughtful at once. I will sing of those who\\nare happy and of those who suffer. 190 I will sing of good and of evil\\nthat remains hidden round about you. The little singing bird flies far\\naround, to the poor fisherman, to the peasant\\'s roof, to everyone who\\ndwells far away from you and from your Court. I love your heart more\\nthan your crown, and yet the crown has an air of sanctity about it. I\\nwill come and sing to you—but one thing you must promise me.\"\\n\\n\"Every thing!\" said the Emperor; and he stood there in his imperial\\nrobes, which he had put on himself, and pressed the sword which was\\nheavy with gold to his heart.\\n\\n\"One thing I beg of you: tell no one that you have a little bird who\\ntells you everything. 
Then it will go all the better.\"\\n\\nAnd the Nightingale flew away.\\n\\nThe servants came in to look at their dead Emperor, and—yes, there he\\nstood, and the Emperor said, \"Good-morning!\"\\n\\n195\\n\\nThis story is a favorite for the Christmas season. It is loosely\\nconstructed, and rambles along for some time after it might have been\\nexpected to finish. Such rambling is often very attractive to childish\\nlisteners, as it allows the introduction of unexpected incidents. Miss\\nKready has some interesting suggestions about dramatizing this story in\\nher Study of Fairy Tales, pp. 151-153. The translation is\\nDulcken\\'s.\\n\\nTHE FIR TREE\\n\\nHANS CHRISTIAN ANDERSEN\\n\\nOut in the forest stood a pretty little Fir Tree. It had a good\\nplace; it could have sunlight, air there was in plenty, and all around\\ngrew many larger comrades—pines as well as firs. But the little Fir Tree\\nwished ardently to become greater. It did not care for the warm sun and\\nthe fresh air; it took no notice of the peasant children, who went about\\ntalking together, when they had come out to look for strawberries and\\nraspberries. Often they came with a whole pot-full, or had strung\\nberries on a straw; then they would sit down by the little Fir Tree and\\nsay, \"How pretty and small that one is!\" and the Fir Tree did not like\\nto hear that at all.\\n\\nNext year he had grown a great joint, and the following year he was\\nlonger still, for in fir trees one can always tell by the number of\\nrings they have how many years they have been growing.\\n\\n\"Oh, if I were only as great a tree as the other!\" sighed the little\\nFir, \"then I would spread my branches far around, and look out from my\\ncrown into the wide world. 
The birds would then build nests in my\\nboughs, and when the wind blew I could nod just as grandly as the others\\nyonder.\"\\n\\nIt took no pleasure in the sunshine, in the birds, and in the red\\nclouds that went sailing over him morning and evening.\\n\\nWhen it was winter, and the snow lay all around, white and sparkling,\\na hare would often come jumping along, and spring right over the little\\nFir Tree. Oh! this made him so angry. But two winters went by, and when\\nthe third came the little Tree had grown so tall that the hare was\\nobliged to run round it.\\n\\n\"Oh! to grow, to grow, and become old; that\\'s the only fine thing in\\nthe world,\" thought the Tree.\\n\\nIn the autumn woodcutters always came and felled a few of the largest\\ntrees; that was done this year too, and the little Fir Tree, that was\\nnow quite well grown, shuddered with fear, for the 191 great\\nstately trees fell to the ground with a crash, and their branches were\\ncut off, so that the trees looked quite naked, long, and slender—they\\ncould hardly be recognized. But then they were laid upon wagons, and\\nhorses dragged them away out of the wood. Where were they going? What\\ndestiny awaited them?\\n\\nIn the spring, when the Swallows and the Stork came, the Tree asked\\nthem, \"Do you know where they were taken? Did you not meet them?\"\\n\\nThe Swallows knew nothing about it, but the Stork looked thoughtful,\\nnodded his head, and said:\\n\\n\"Yes, I think so. I met many new ships when I flew out of Egypt; on\\nthe ships were stately masts; I fancy these were the trees. They smelt\\nlike fir. I can assure you they\\'re stately—very stately.\"\\n\\n\"Oh that I were only big enough to go over the sea! 
What kind of\\nthing is this sea, and how does it look?\"\\n\\n\"It would take too long to explain all that,\" said the Stork, and he\\nwent away.\\n\\n\"Rejoice in thy youth,\" said the Sunbeams; \"rejoice in thy fresh\\ngrowth, and in the young life that is within thee.\"\\n\\nAnd the wind kissed the Tree, and the dew wept tears upon it; but the\\nFir Tree did not understand that.\\n\\nWhen Christmas-time approached, quite young trees were felled,\\nsometimes trees which were neither so old nor so large as this Fir Tree,\\nthat never rested, but always wanted to go away. These young trees,\\nwhich were always the most beautiful, kept all their branches; they were\\nput upon wagons, and horses dragged them away out of the wood.\\n\\n\"Where are they all going?\" asked the Fir Tree. \"They are not greater\\nthan I—indeed, one of them was much smaller. Why do they keep all their\\nbranches? Whither are they taken?\"\\n\\n\"We know that! We know that!\" chirped the Sparrows. \"Yonder in the\\ntown we looked in at the windows. We know where they go. Oh! they are\\ndressed up in the greatest pomp and splendor that can be imagined. We\\nhave looked in at the windows, and have perceived that they are planted\\nin the middle of a warm room, and adorned with the most beautiful\\nthings—gilt apples, honey-cakes, playthings, and many hundred\\ncandles.\"\\n\\n\"And then?\" asked the Fir Tree, and trembled through all its\\nbranches. \"And then? What happens then?\"\\n\\n\"Why, we have not seen anything more. But it was incomparable.\"\\n\\n\"Perhaps I may be destined to tread this glorious path one day!\"\\ncried the Fir Tree, rejoicingly. \"That is even better than traveling\\nacross the sea. How painfully I long for it! If it were only Christmas\\nnow! Now I am great and grown up, like the rest who were led away last\\nyear. Oh, if I were only on the carriage! If I were only in the warm\\nroom, among all the pomp and splendor! And then? 
Yes, then something\\neven better will come, something far more charming, or else why should\\nthey adorn me so? There must be something grander, something greater\\nstill to come; but what? Oh! I\\'m suffering, I\\'m longing! I don\\'t know\\nmyself what is the matter with me!\"\\n\\n\"Rejoice in us,\" said Air and Sunshine. \"Rejoice in thy fresh youth\\nhere in the woodland.\"\\n\\nBut the Fir Tree did not rejoice at all, but it grew and grew; winter\\nand summer it stood there, green, dark green. The 192 people\\nwho saw it said, \"That\\'s a handsome tree!\" and at Christmas time it was\\nfelled before any one of the others. The ax cut deep into its marrow,\\nand the tree fell to the ground with a sigh; it felt a pain, a sensation\\nof faintness, and could not think at all of happiness, for it was sad at\\nparting from its home, from the place where it had grown up; it knew\\nthat it should never again see the dear old companions, the little\\nbushes and flowers all around—perhaps not even the birds. The parting\\nwas not at all agreeable.\\n\\nThe Tree only came to itself when it was unloaded in a yard, with\\nother trees, and heard a man say:\\n\\n\"This one is famous; we want only this one!\"\\n\\nNow two servants came in gay liveries, and carried the Fir Tree into\\na large, beautiful saloon. All around the walls hung pictures, and by\\nthe great stove stood large Chinese vases with lions on the covers;\\nthere were rocking-chairs, silken sofas, great tables covered with\\npicture books, and toys worth a hundred times a hundred dollars, at\\nleast the children said so. And the Fir Tree was put into a great tub\\nfilled with sand; but no one could see that it was a tub, for it was\\nhung round with green cloth, and stood on a large, many-colored carpet.\\nOh, how the Tree trembled! What was to happen now? The servants, and the\\nyoung ladies also, decked it out. 
On one branch they hung little nets,\\ncut out of colored paper; every net was filled with sweetmeats; golden\\napples and walnuts hung down, as if they grew there, and more than a\\nhundred little candles, red, white, and blue, were fastened to the\\ndifferent boughs. Dolls that looked exactly like real people—the tree\\nhad never seen such before—swung among the foliage, and high on the\\nsummit of the Tree was fixed a tinsel star. It was splendid,\\nparticularly splendid.\\n\\n\"This evening,\" said all, \"this evening it will shine.\"\\n\\n\"Oh,\" thought the Tree, \"that it were evening already! Oh, that the\\nlights may be soon lit up! When may that be done? I wonder if trees will\\ncome out of the forest to look at me? Will the sparrows fly against the\\npanes? Shall I grow fast here, and stand adorned in summer and\\nwinter?\"\\n\\nYes, he did not guess badly. But he had a complete backache from mere\\nlonging, and the backache is just as bad for a Tree as the headache for\\na person.\\n\\nAt last the candles were lighted. What a brilliance, what splendor!\\nThe Tree trembled so in all its branches that one of the candles set\\nfire to a green twig, and it was scorched.\\n\\n\"Heaven preserve us!\" cried the young ladies; and they hastily put\\nthe fire out.\\n\\nNow the Tree might not even tremble. Oh, that was terrible! It was so\\nafraid of setting fire to some of its ornaments, and it was quite\\nbewildered with all the brilliance. And now the folding doors were\\nthrown open, and a number of children rushed in as if they would have\\noverturned the whole Tree; the older people followed more deliberately.\\nThe little ones stood quite silent, but only for a minute; then they\\nshouted till the room rang: they danced gleefully round the Tree, and\\none present after another was plucked from it.\\n\\n\"What are they about?\" thought the Tree. 
\"What\\'s going to be\\ndone?\"\\n\\nAnd the candles burned down to the twigs, and as they burned down\\nthey 193 were extinguished, and then the children received\\npermission to plunder the Tree. Oh! they rushed in upon it, so that\\nevery branch cracked again: if it had not been fastened by the top and\\nby the golden star to the ceiling, it would have fallen down.\\n\\nThe children danced about with their pretty toys. No one looked at\\nthe Tree except one old man, who came up and peeped among the branches,\\nbut only to see if a fig or an apple had not been forgotten.\\n\\n\"A story! A story!\" shouted the children; and they drew a little fat\\nman toward the tree; and he sat down just beneath it—\"for then we shall\\nbe in the green wood,\" said he, \"and the tree may have the advantage of\\nlistening to my tale. But I can only tell one. Will you hear the story\\nof Ivede-Avede, or of Klumpey-Dumpey, who fell downstairs, and still was\\nraised up to honor and married the Princess?\"\\n\\n\"Ivede-Avede!\" cried some, \"Klumpey-Dumpey!\" cried others, and there\\nwas a great crying and shouting. Only the Fir Tree was quite silent, and\\nthought, \"Shall I not be in it? Shall I have nothing to do in it?\" But\\nhe had been in the evening\\'s amusement, and had done what was required\\nof him.\\n\\nAnd the fat man told about Klumpey-Dumpey who fell downstairs, and\\nyet was raised to honor and married the Princess. And the children\\nclapped their hands, and cried, \"Tell another! tell another!\" for they\\nwanted to hear about Ivede-Avede; but they only got the story of\\nKlumpey-Dumpey. The Fir Tree stood quite silent and thoughtful; never\\nhad the birds in the wood told such a story as that. Klumpey-Dumpey fell\\ndownstairs, and yet came to honor and married the Princess!\\n\\n\"Yes, so it happens in the world!\" thought the Fir Tree, and believed\\nit must be true, because that was such a nice man who told it. \"Well,\\nwho can know? 
Perhaps I shall fall downstairs, too, and marry a\\nPrincess!\" And it looked forward with pleasure to being adorned again,\\nthe next evening, with candles and toys, gold and fruit. \"To-morrow I\\nshall not tremble,\" it thought.\\n\\n\"I will rejoice in all my splendor. To-morrow I shall hear the story\\nof Klumpey-Dumpey again, and perhaps that of Ivede-Avede, too.\"\\n\\nAnd the Tree stood all night quiet and thoughtful.\\n\\nIn the morning the servants and the chambermaid came in.\\n\\n\"Now my splendor will begin afresh,\" thought the Tree. But they\\ndragged him out of the room, and upstairs to the garret, and here they\\nput him in a dark corner where no daylight shone.\\n\\n\"What\\'s the meaning of this?\" thought the Tree. \"What am I to do\\nhere? What is to happen?\"\\n\\nAnd he leaned against the wall, and thought, and thought. And he had\\ntime enough, for days and nights went by, and nobody came up; and when\\nat length someone came, it was only to put some great boxes in a corner.\\nNow the Tree stood quite hidden away, and the supposition is that it was\\nquite forgotten.\\n\\n\"Now it\\'s winter outside,\" thought the Tree. \"The earth is hard and\\ncovered with snow, and people cannot plant me; therefore I suppose I\\'m\\nto be sheltered here until spring comes. How considerate that is! How\\ngood people are! If it were only not so dark 194 here,\\nand so terribly solitary!—not even a little hare? That was pretty out\\nthere in the wood, when the snow lay thick and the hare sprang past;\\nyes, even when he jumped over me; but then I did not like it. It is\\nterribly lonely up here!\"\\n\\n\"Piep! piep!\" said a little Mouse, and crept forward, and then came\\nanother little one. They smelt at the Fir Tree, and then slipped among\\nthe branches.\\n\\n\"It\\'s horribly cold,\" said the two little Mice, \"or else it would be\\ncomfortable here. Don\\'t you think so, you old Fir Tree?\"\\n\\n\"I\\'m not old at all,\" said the Fir Tree. 
\"There are many much older\\nthan I.\"\\n\\n\"Where do you come from?\" asked the Mice. \"And what do you know?\"\\nThey were dreadfully inquisitive. \"Tell us about the most beautiful spot\\non earth. Have you been there? Have you been in the store room, where\\ncheeses lie on the shelves, and hams hang from the ceiling, where one\\ndances on tallow candles, and goes in thin and comes out fat?\"\\n\\n\"I don\\'t know that,\" replied the Tree; \"but I know the wood, where\\nthe sun shines and the birds sing.\"\\n\\nAnd then it told all about its youth.\\n\\nAnd the little Mice had never heard anything of the kind; and they\\nlistened and said:\\n\\n\"What a number of things you have seen! How happy you must have\\nbeen!\"\\n\\n\"I?\" replied the Fir Tree; and it thought about what it had told.\\n\"Yes, those were really quite happy times.\" But then he told of the\\nChristmas Eve, when he had been hung with sweetmeats and candles.\\n\\n\"Oh!\" said the little Mice, \"how happy you have been, you old Fir\\nTree!\"\\n\\n\"I\\'m not old at all,\" said the Tree. \"I only came out of the wood\\nthis winter. I\\'m only rather backward in my growth.\"\\n\\n\"What splendid stories you can tell!\" said the little Mice.\\n\\nAnd next night they came with four other little Mice, to hear what\\nthe Tree had to relate; and the more it said, the more clearly did it\\nremember everything, and thought, \"Those were quite merry days! But they\\nmay come again. Klumpey-Dumpey fell downstairs, and yet he married the\\nPrincess. Perhaps I may marry a Princess too!\" And the Fir Tree thought\\nof a pretty little Birch Tree that grew out in the forest; for the Fir\\nTree, that Birch was a real Princess.\\n\\n\"Who\\'s Klumpey-Dumpey?\" asked the little Mice.\\n\\nAnd then the Fir Tree told the whole story. It could remember every\\nsingle word; and the little Mice were ready to leap to the very top of\\nthe tree with pleasure. 
Next night a great many more Mice came, and on\\nSunday two Rats even appeared; but these thought the story was not\\npretty, and the little Mice were sorry for that, for now they also did\\nnot like it so much as before.\\n\\n\"Do you only know one story?\" asked the Rats.\\n\\n\"Only that one,\" replied the Tree. \"I heard that on the happiest\\nevening of my life; I did not think then how happy I was.\"\\n\\n\"That\\'s a very miserable story. Don\\'t you know any about bacon and\\ntallow candles—a store-room story?\"\\n\\n\"No,\" said the Tree.\\n\\n\"Then we\\'d rather not hear you,\" said the Rats.\\n\\nAnd they went back to their own people. The little Mice at last\\nstayed 195 away also; and then the Tree sighed and said:\\n\\n\"It was very nice when they sat round me, the merry little Mice, and\\nlistened when I spoke to them. Now that\\'s past too. But I shall remember\\nto be pleased when they take me out.\"\\n\\nBut when did that happen? Why, it was one morning that people came\\nand rummaged in the garret: the boxes were put away, and the Tree\\nbrought out; they certainly threw him rather roughly on the floor, but a\\nservant dragged him away at once to the stairs, where the daylight\\nshone.\\n\\n\"Now life is beginning again!\" thought the Tree.\\n\\nIt felt the fresh air and the first sunbeams, and now it was out in\\nthe courtyard. Everything passed so quickly that the Tree quite forgot\\nto look at itself, there was so much to look at all round. The courtyard\\nwas close to a garden, and here everything was blooming; the roses hung\\nfresh and fragrant over the little paling, the linden trees were in\\nblossom, and the swallows cried, \"Quinze-wit! quinze-wit! my husband\\'s\\ncome!\" But it was not the Fir Tree that they meant.\\n\\n\"Now I shall live!\" said the Tree, rejoicingly, and spread its\\nbranches far out; but, alas! they were all withered and yellow; and it\\nlay in the corner among nettles and weeds. 
The tinsel star was still\\nupon it, and shone in the bright sunshine.\\n\\nIn the courtyard a couple of the merry children were playing who had\\ndanced round the tree at Christmas time, and had rejoiced over it. One\\nof the youngest ran up and tore off the golden star.\\n\\n\"Look what is sticking to the ugly old fir tree!\" said the child, and\\nhe trod upon the branches till they cracked again under his boots.\\n\\nAnd the Tree looked at all the blooming flowers and the splendor of\\nthe garden, and then looked at itself, and wished it had remained in the\\ndark corner of the garret; it thought of its fresh youth in the wood, of\\nthe merry Christmas Eve, and of the little Mice which had listened so\\npleasantly to the story of Klumpey-Dumpey.\\n\\n\"Past! past!\" said the old Tree. \"Had I but rejoiced when I could\\nhave done so! Past! past!\"\\n\\nAnd the servant came and chopped the Tree into little pieces; a whole\\nbundle lay there; it blazed brightly under the great brewing copper, and\\nit sighed deeply, and each sigh was like a little shot; and the children\\nwho were at play there ran up and seated themselves at the fire, looked\\ninto it, and cried \"Puff! puff!\" But at each explosion, which was a deep\\nsigh, the Tree thought of a summer day in the woods, or of a winter\\nnight there, when the stars beamed; he thought of Christmas Eve and of\\nKlumpey-Dumpey, the only story he had ever heard or knew how to tell;\\nand then the Tree was burned.\\n\\nThe boys played in the garden, and the youngest had on his breast a\\ngolden star, which the Tree had worn on its happiest evening. Now that\\nwas past, and the Tree\\'s life was past, and the story is past too: past!\\npast!—and that\\'s the way with all stories.\\n\\n196\\n\\nThe tale that follows was one of the author\\'s earliest stories,\\npublished in 1835. It is clearly based upon an old folk tale, one\\nvariant of which is \"The Blue Light\" from the Grimm collection (No.\\n174). 
\"It was 196 a lucky stroke,\" says Brandes, \"that made\\nAndersen the poet of children. After long fumbling, after unsuccessful\\nefforts, which must necessarily throw a false and ironic light on the\\nself-consciousness of a poet whose pride based its justification mainly\\non the expectancy of a future which he felt slumbering within his soul,\\nafter wandering about for long years, Andersen … one evening found\\nhimself in front of a little insignificant yet mysterious door, the door\\nof the nursery story. He touched it, it yielded, and he saw, burning in\\nthe obscurity within, the little \\'Tinder-Box\\' that became his Aladdin\\'s\\nlamp. He struck fire with it, and the spirits of the lamp—the dogs with\\neyes as large as tea-cups, as mill-wheels, as the round tower in\\nCopenhagen—stood before him and brought him the three giant chests,\\ncontaining all the copper, silver, and gold treasure stories of the\\nnursery story. The first story had sprung into existence, and the\\n\\'Tinder-Box\\' drew all the others onward in its train. Happy is he who\\nhas found his \\'tinder-box.\\'\" The translation is by H. W. Dulcken.\\n\\nTHE TINDER-BOX\\n\\nHANS CHRISTIAN ANDERSEN\\n\\nThere came a soldier marching along the high road—one, two! one,\\ntwo! He had his knapsack on his back and a saber by his side, for\\nhe had been in the wars, and now he wanted to go home. And on the way he\\nmet with an old witch; she was very hideous, and her under lip hung down\\nupon her breast. She said, \"Good evening, soldier. What a fine sword you\\nhave, and what a big knapsack! You\\'re a proper soldier! Now you shall\\nhave as much money as you like to have.\"\\n\\n\"I thank you, you old witch!\" said the soldier.\\n\\n\"Do you see that great tree?\" quoth the witch; and she pointed to a\\ntree which stood beside them. \"It\\'s quite hollow inside. You must climb\\nto the top, and then you\\'ll see a hole, through which you can let\\nyourself down and get deep into the tree. 
I\\'ll tie a rope round your\\nbody, so that I can pull you up again when you call me.\"\\n\\n\"What am I to do down in the tree?\" asked the soldier.\\n\\n\"Get money,\" replied the witch. \"Listen to me. When you come down to\\nthe earth under the tree, you will find yourself in a great hall: it is\\nquite light, for above three hundred lamps are burning there. Then you\\nwill see three doors; those you can open, for the keys are hanging\\nthere. If you go into the first chamber, you\\'ll see a great chest in the\\nmiddle of the floor; on this chest sits a dog, and he\\'s got a pair of\\neyes as big as two tea-cups. But you need not care for that. I\\'ll give\\nyou my blue-checked apron, and you can spread it out upon the floor;\\nthen go up quickly and take the dog, and set him on my apron; then open\\nthe chest, and take as many shillings as you like. They are of copper:\\nif you prefer silver, you must go into the second chamber. But there\\nsits a dog with a pair of eyes as big as mill-wheels. But do not you\\ncare for that. Set him upon my apron, and take some of the money. And if\\nyou want gold, you can have that too—in fact, as much as you can\\ncarry—if you go into the third chamber. But the dog that sits on the\\nmoney-chest there has two eyes as big as round towers. He is a fierce\\ndog, you may be sure; but you needn\\'t be afraid, for all that. Only set\\nhim on my apron, and he won\\'t hurt you; and take out of the chest as\\nmuch gold as you like.\"\\n\\n\"That\\'s not so bad,\" said the soldier. \"But what am I to give you,\\nold witch? for you will not do it for nothing, I fancy.\"\\n\\n\"No,\" replied the witch, \"not a single shilling will I have. 
You\\nshall only bring me an old tinder-box which my grandmother forgot when\\nshe was down there last.\"\\n\\n\"Then tie the rope round my body,\" cried the soldier.\\n\\n\"Here it is,\" said the witch, \"and here\\'s my blue-checked apron.\"\\n\\nThen the soldier climbed up into the tree, let himself slip down into\\nthe hole, and stood, as the witch had said, in the great hall where the\\nthree hundred lamps were burning.\\n\\nNow he opened the first door. Ugh! there sat the dog with eyes as big\\nas tea-cups, staring at him. \"You\\'re a nice fellow!\" exclaimed the\\nsoldier; and he set him on the witch\\'s apron, and took as many copper\\nshillings as his pockets would hold, and then locked the chest, set the\\ndog on it again, and went into the second chamber. Aha! there sat the\\ndog with eyes as big as mill-wheels.\\n\\n\"You should not stare so hard at me,\" said the soldier; \"you might\\nstrain your eyes.\" And he set the dog upon the witch\\'s apron. And when\\nhe saw the silver money in the chest, he threw away all the copper money\\nhe had, and filled his pocket and his knapsack with silver only. Then he\\nwent into the third chamber. Oh, but that was horrid! The dog there\\nreally had eyes as big as towers, and they turned round and round in his\\nhead like wheels.\\n\\n\"Good evening!\" said the soldier; and he touched his cap, for he had\\nnever seen such a dog as that before. When he had looked at him a little\\nmore closely, he thought, \"That will do,\" and lifted him down to the\\nfloor, and opened the chest. Mercy! what a quantity of gold was there!\\nHe could buy with it the whole town, and the sugar sucking-pigs of the\\ncake woman, and all the tin soldiers, whips, and rocking-horses in the\\nwhole world. Yes, that was a quantity of money! 
Now the soldier threw\\naway all the silver coin with which he had filled his pockets and his\\nknapsack, and took gold instead: yes, all his pockets, his knapsack, his\\nboots, and his cap were filled, so that he could scarcely walk. Now\\nindeed he had plenty of money. He put the dog on the chest, shut the\\ndoor, and then called up through the tree, \"Now pull me up, you old\\nwitch.\"\\n\\n\"Have you the tinder-box?\" asked the witch.\\n\\n\"Plague on it!\" exclaimed the soldier, \"I had clean forgotten that.\"\\nAnd he went and brought it.\\n\\nThe witch drew him up, and he stood on the high road again, with\\npockets, boots, knapsack, and cap full of gold.\\n\\n\"What are you going to do with the tinder-box?\" asked the\\nsoldier.\\n\\n\"That\\'s nothing to you,\" retorted the witch. \"You\\'ve had your\\nmoney—just give me the tinder-box.\"\\n\\n\"Nonsense!\" said the soldier. \"Tell me directly what you\\'re going to\\ndo with it, or I\\'ll draw my sword and cut off your head.\"\\n\\n\"No!\" cried the witch.\\n\\nSo the soldier cut off her head. There she lay! But he tied up all\\nhis money in her apron, took it on his back like a bundle, put the\\ntinder-box in his pocket, and went straight off toward the town.\\n\\nThat was a splendid town! And he put up at the very best inn and\\nasked for the finest rooms, and ordered his 198\\nfavorite dishes, for now he was rich, as he had so much money. The\\nservant who had to clean his boots certainly thought them a remarkably\\nold pair for such a rich gentleman; but he had not bought any new ones\\nyet. The next day he procured proper boots and handsome clothes. 
Now our\\nsoldier had become a fine gentleman; and the people told him of all the\\nsplendid things which were in their city, and about the King, and what a\\npretty Princess the King\\'s daughter was.\\n\\n\"Where can one get to see her?\" asked the soldier.\\n\\n\"She is not to be seen at all,\" said they, all together; \"she lives\\nin a great copper castle, with a great many walls and towers round about\\nit; no one but the King may go in and out there, for it has been\\nprophesied that she shall marry a common soldier, and the King can\\'t\\nbear that.\"\\n\\n\"I should like to see her,\" thought the soldier; but he could not get\\nleave to do so. Now he lived merrily, went to the theater, drove in the\\nKing\\'s garden, and gave much money to the poor; and this was very kind\\nof him, for he knew from old times how hard it is when one has not a\\nshilling. Now he was rich, had fine clothes, and gained many friends,\\nwho all said he was a rare one, a true cavalier; and that pleased the\\nsoldier well. But as he spent money every day and never earned any, he\\nhad at last only two shillings left; and he was obliged to turn out of\\nthe fine rooms in which he had dwelt, and had to live in a little garret\\nunder the roof, and clean his boots for himself, and mend them with a\\ndarning-needle. None of his friends came to see him, for there were too\\nmany stairs to climb.\\n\\nIt was quite dark one evening, and he could not even buy himself a\\ncandle, when it occurred to him that there was a candle-end in the\\ntinder-box which he had taken out of the hollow tree into which the\\nwitch had helped him. He brought out the tinder-box and the candle-end;\\nbut as soon as he struck fire and the sparks rose up from the flint, the\\ndoor flew open, and the dog who had eyes as big as a couple of tea-cups,\\nand whom he had seen in the tree, stood before him, and said:\\n\\n\"What are my lord\\'s commands?\"\\n\\n\"What is this?\" said the soldier. 
\"That\\'s a famous tinder-box, if I\\ncan get everything with it that I want! Bring me some money,\" said he to\\nthe dog: and whisk! the dog was gone, and whisk! he\\nwas back again, with a great bag full of shillings in his mouth.\\n\\nNow the soldier knew what a capital tinder-box this was. If he struck\\nit once, the dog came who sat upon the chest of copper money; if he\\nstruck it twice, the dog came who had the silver; and if he struck it\\nthree times, then appeared the dog who had the gold. Now the soldier\\nmoved back into the fine rooms, and appeared again in handsome clothes;\\nand all his friends knew him again, and cared very much for him\\nindeed.\\n\\nOnce he thought to himself, \"It is a very strange thing that one\\ncannot get to see the Princess. They all say she is very beautiful; but\\nwhat is the use of that, if she has always to sit in the great copper\\ncastle with the many towers? Can I not get to see her at all? Where is\\nmy tinder-box?\" And so he struck a light, and whisk! came the\\ndog with eyes as big as tea-cups.\\n\\n\"It is midnight, certainly,\" said the soldier, \"but I should very\\nmuch like 199 to see the Princess, only for one little\\nmoment.\"\\n\\nAnd the dog was outside the door directly, and, before the soldier\\nthought it, came back with the Princess. She sat upon the dog\\'s back and\\nslept; and everyone could see she was a real Princess, for she was so\\nlovely. The soldier could not refrain from kissing her, for he was a\\nthorough soldier. Then the dog ran back again with the Princess. 
But\\nwhen morning came, and the King and Queen were drinking tea, the\\nPrincess said she had had a strange dream, the night before, about a dog\\nand a soldier—that she had ridden upon the dog, and the soldier had\\nkissed her.\\n\\n\"That would be a fine history!\" said the Queen.\\n\\nSo one of the old Court ladies had to watch the next night by the\\nPrincess\\'s bed, to see if this was really a dream, or what it might\\nbe.\\n\\nThe soldier had a great longing to see the lovely Princess again; so\\nthe dog came in the night, took her away, and ran as fast as he could.\\nBut the old lady put on water-boots, and ran just as fast after him.\\nWhen she saw that they both entered a great house, she thought, \"Now I\\nknow where it is\"; and with a bit of chalk she drew a great cross on the\\ndoor. Then she went home and lay down, and the dog came up with the\\nPrincess; but when he saw that there was a cross drawn on the door where\\nthe soldier lived, he took a piece of chalk too, and drew crosses on all\\nthe doors in the town. And that was cleverly done, for now the lady\\ncould not find the right door, because all the doors had crosses upon\\nthem.\\n\\nIn the morning early came the King and the Queen, the old Court lady\\nand all the officers, to see where it was the Princess had been. \"Here\\nit is!\" said the King, when he saw the first door with a cross upon it.\\n\"No, my dear husband, it is there!\" said the Queen, who descried another\\ndoor which also showed a cross. \"But there is one, and there is one!\"\\nsaid all, for wherever they looked there were crosses on the doors. So\\nthey saw that it would avail them nothing if they searched on.\\n\\nBut the Queen was an exceedingly clever woman, who could do more than\\nride in a coach. 
She took her great gold scissors, cut a piece of silk\\ninto pieces, and made a neat little bag: this bag she filled with fine\\nwheat flour, and tied it on the Princess\\'s back; and when that was done,\\nshe cut a little hole in the bag, so that the flour would be scattered\\nalong all the way which the Princess should take.\\n\\nIn the night the dog came again, took the Princess on his back, and\\nran with her to the soldier, who loved her very much, and would gladly\\nhave been a prince, so that he might have her for his wife. The dog did\\nnot notice at all how the flour ran out in a stream from the castle to\\nthe windows of the soldier\\'s house, where he ran up the wall with the\\nPrincess. In the morning the King and Queen saw well enough where their\\ndaughter had been, and they took the soldier and put him in prison.\\n\\nThere he sat. Oh, but it was dark and disagreeable there! And they\\nsaid to him, \"To-morrow you shall be hanged.\" That was not amusing to\\nhear, and he had left his tinder-box at the inn. In the morning he could\\nsee, through the iron grating of the little window, how the people were\\nhurrying out of the town to see him hanged. He heard the 200 drums\\nbeat and saw the soldiers marching. All the people were running out, and\\namong them was a shoemaker\\'s boy with leather apron and slippers, and he\\ngalloped so fast that one of his slippers flew off, and came right\\nagainst the wall where the soldier sat looking through the iron\\ngrating.\\n\\n\"Halloo, you shoemaker\\'s boy! you needn\\'t be in such a hurry,\" cried\\nthe soldier to him: \"it will not begin till I come. 
But if you will run\\nto where I lived, and bring me my tinder-box, you shall have four\\nshillings; but you must put your best leg foremost.\"\\n\\nThe shoemaker\\'s boy wanted to get the four shillings, so he went and\\nbrought the tinder-box, and—well, we shall hear now what happened.\\n\\nOutside the town a great gallows had been built, and around it stood\\nthe soldiers and many hundred thousand people. The King and Queen sat on\\na splendid throne, opposite to the Judges and the whole Council. The\\nsoldier already stood upon the ladder; but as they were about to put the\\nrope round his neck, he said that before a poor criminal suffered his\\npunishment an innocent request was always granted to him. He wanted very\\nmuch to smoke a pipe of tobacco, as it would be the last pipe he should\\nsmoke in this world. The King would not say \"No\" to this; so the soldier\\ntook his tinder-box and struck fire. One—two—three—! and there suddenly\\nstood all the dogs—the one with eyes as big as tea-cups, the one with\\neyes as large as mill-wheels, and the one whose eyes were as big as\\nround towers.\\n\\n\"Help me now, so that I may not be hanged,\" said the soldier. And the\\ndogs fell upon the Judge and all the Council, seized one by the leg and\\nanother by the nose, and tossed them all many feet into the air, so that\\nthey fell down and were all broken to pieces.\\n\\n\"I won\\'t!\" cried the King; but the biggest dog took him and the Queen\\nand threw them after the others. Then the soldiers were afraid, and the\\npeople cried, \"Little soldier, you shall be our King, and marry the\\nbeautiful Princess!\"\\n\\nSo they put the soldier into the King\\'s coach, and all the three dogs\\ndarted on in front and cried \"Hurrah!\" and the boys whistled through\\ntheir fingers, and the soldiers presented arms. The Princess came out of\\nthe copper castle, and became Queen, and she liked that well enough. 
The\\nwedding lasted a week, and the three dogs sat at the table too, and\\nopened their eyes wider than ever at all they saw.\\n\\n197\\n\\nThe following is one of Andersen\\'s early stories, published in 1838.\\nIt has always been a great favorite. Whimsically odd couples, in this\\ncase so constant in their devotion to each other, seemed to appeal to\\nAndersen. The romance of the Whip Top and the Ball in the little story\\n\"The Lovers\" deals with another odd couple. \"Constant\" or \"steadfast\"\\nare terms sometimes used in the different versions instead of \"hardy,\"\\nand, if they seem better to carry the meaning intended, teachers should\\nfeel free to substitute one of them in telling or reading the story. The\\ntranslation is by H. W. Dulcken.\\n\\nTHE HARDY TIN SOLDIER\\n\\nHANS CHRISTIAN ANDERSEN\\n\\nThere were once five-and-twenty tin soldiers; they were all brothers,\\nfor they had all been born of one old tin spoon. 201 They\\nshouldered their muskets, and looked straight before them; their uniform\\nwas red and blue, and very splendid. The first thing they had heard in\\nthe world, when the lid was taken off their box, had been the words,\\n\"Tin soldiers!\" These words were uttered by a little boy, clapping his\\nhands: the soldiers had been given to him, for it was his birthday; and\\nnow he put them upon the table. Each soldier was exactly like the rest;\\nbut one of them had been cast last of all, and there had not been enough\\ntin to finish him; but he stood as firmly upon his one leg as the others\\non their two; and it was just this Soldier who became remarkable.\\n\\nOn the table on which they had been placed stood many other\\nplaythings, but the toy that attracted most attention was a neat castle\\nof cardboard. Through the little windows one could see straight into the\\nhall. Before the castle some little trees were placed round a little\\nlooking-glass, which was to represent a clear lake. 
Waxen swans swam on\\nthis lake, and were mirrored in it. This was all very pretty; but the\\nprettiest of all was a little lady, who stood at the open door of the\\ncastle; she was also cut out in paper, but she had a dress of the\\nclearest gauze, and a little narrow blue ribbon over her shoulders, that\\nlooked like a scarf; and in the middle of this ribbon was a shining\\ntinsel rose as big as her whole face. The little lady stretched out both\\nher arms, for she was a dancer; and then she lifted one leg so high that\\nthe Tin Soldier could not see it at all, and thought that, like himself,\\nshe had but one leg.\\n\\n\"That would be the wife for me,\" thought he; \"but she is very grand.\\nShe lives in a castle, and I have only a box, and there are\\nfive-and-twenty of us in that. It is no place for her. But I must try to\\nmake acquaintance with her.\"\\n\\nAnd then he lay down at full length behind a snuff-box which was on\\nthe table; there he could easily watch the little dainty lady, who\\ncontinued to stand upon one leg without losing her balance.\\n\\nWhen the evening came all the other tin soldiers were put into their\\nbox, and the people in the house went to bed. Now the toys began to play\\nat \"visiting,\" and at \"war,\" and \"giving balls.\" The tin soldiers\\nrattled in their box, for they wanted to join, but could not lift the\\nlid. The nutcracker threw somersaults, and the pencil amused itself on\\nthe table; there was so much noise that the canary woke up, and began to\\nspeak too, and even in verse. The only two who did not stir from their\\nplaces were the Tin Soldier and the Dancing Lady: she stood straight up\\non the point of one of her toes, and stretched out both her arms; and he\\nwas just as enduring on his one leg; and he never turned his eyes away\\nfrom her.\\n\\nNow the clock struck twelve—and, bounce! 
the lid flew off the\\nsnuff-box; but there was no snuff in it, but a little black Goblin: you\\nsee, it was a trick.\\n\\n\"Tin Soldier!\" said the Goblin, \"don\\'t stare at things that don\\'t\\nconcern you.\"\\n\\nBut the Tin Soldier pretended not to hear him.\\n\\n\"Just you wait till to-morrow!\" said the Goblin.\\n\\nBut when the morning came, and the children got up, the Tin Soldier\\nwas placed in the window; and whether it was the Goblin or the draught\\nthat did it, all at once the window flew open, 202 and the\\nSoldier fell head over heels out of the third story. That was a terrible\\npassage! He put his leg straight up, and stuck with helmet downward and\\nhis bayonet between the paving-stones.\\n\\nThe servant-maid and the little boy came down directly to look for\\nhim, but though they almost trod upon him, they could not see him. If\\nthe Soldier had cried out \"Here I am!\" they would have found him; but he\\ndid not think it fitting to call out loudly, because he was in\\nuniform.\\n\\nNow it began to rain; the drops soon fell thicker, and at last it\\ncame down into a complete stream. When the rain was past, two street\\nboys came by.\\n\\n\"Just look!\" said one of them, \"there lies a Tin Soldier. He must\\ncome out and ride in the boat.\"\\n\\nAnd they made a boat out of a newspaper, and put the Tin Soldier in\\nthe middle of it, and so he sailed down the gutter, and the two boys ran\\nbeside him and clapped their hands. Goodness preserve us! how the waves\\nrose in that gutter, and how fast the stream ran! But then it had been a\\nheavy rain. The paper boat rocked up and down, and sometimes turned\\nround so rapidly that the Tin Soldier trembled; but he remained firm,\\nand never changed countenance, and looked straight before him, and\\nshouldered his musket.\\n\\nAll at once the boat went into a long drain, and it became as dark as\\nif he had been in his box.\\n\\n\"Where am I going now?\" he thought. 
\"Yes, yes, that\\'s the Goblin\\'s\\nfault. Ah! if the little lady only sat here with me in the boat, it\\nmight be twice as dark for what I should care.\"\\n\\nSuddenly there came a great Water Rat, which lived under the\\ndrain.\\n\\n\"Have you a passport?\" said the Rat. \"Give me your passport.\"\\n\\nBut the Tin Soldier kept silence, and held his musket tighter than\\never.\\n\\nThe boat went on, but the Rat came after it. Hu! how he gnashed his\\nteeth, and called out to the bits of straw and wood:\\n\\n\"Hold him! hold him! He hasn\\'t paid toll—he hasn\\'t shown his\\npassport!\"\\n\\nBut the stream became stronger and stronger. The Tin Soldier could\\nsee the bright daylight where the arch ended; but he heard a roaring\\nnoise which might well frighten a bolder man. Only think—just where the\\ntunnel ended, the drain ran into a great canal; and for him that would\\nhave been as dangerous as for us to be carried down a great\\nwaterfall.\\n\\nNow he was already so near it that he could not stop. The boat was\\ncarried out, the poor Tin Soldier stiffening himself as much as he\\ncould, and no one could say that he moved an eyelid. The boat whirled\\nround three or four times, and was full of water to the very edge—it\\nmust sink. The Tin Soldier stood up to his neck in water, and the boat\\nsank deeper and deeper, and the paper was loosened more and more; and\\nnow the water closed over the soldier\\'s head. Then he thought of the\\npretty little Dancer, and how he should never see her again; and it\\nsounded in the soldier\\'s ears:\\n\\nFarewell, farewell, thou warrior brave,\\n\\nFor this day\\nthou must die!\\n\\nTranscriber\\'s Note: original reads \\'warrier\\'\\n\\nAnd now the paper parted, and the Tin Soldier fell out; but at that\\nmoment he was snapped up by a great fish.\\n\\nOh, how dark it was in that fish\\'s body! 
It was darker yet than in\\nthe 203 drain tunnel; and then it was very narrow too.\\nBut the Tin Soldier remained unmoved, and lay at full length shouldering\\nhis musket.\\n\\nThe fish swam to and fro; he made the most wonderful movements, and\\nthen became quite still. At last something flashed through him like\\nlightning. The daylight shone quite clear, and a voice said aloud, \"The\\nTin Soldier!\" The fish had been caught, carried to market, bought, and\\ntaken into the kitchen, where the cook cut him open with a large knife.\\nShe seized the Soldier round the body with both her hands and carried\\nhim into the room, where all were anxious to see the remarkable man who\\nhad traveled about in the inside of a fish; but the Tin Soldier was not\\nat all proud. They placed him on the table, and there—no! What curious\\nthings may happen in the world. The Tin Soldier was in the very room in\\nwhich he had been before! He saw the same children, and the same toys\\nstood on the table; and there was the pretty castle with the graceful\\nlittle Dancer. She was still balancing herself on one leg, and held the\\nother extended in the air. She was hardy too. That moved the Tin\\nSoldier; he was very nearly weeping tin tears, but that would not have\\nbeen proper. He looked at her, but they said nothing to each other.\\n\\nThen one of the little boys took the Tin Soldier and flung him into\\nthe stove. He gave no reason for doing this. It must have been the fault\\nof the Goblin in the snuff-box.\\n\\nThe Tin Soldier stood there quite illuminated, and felt a heat that\\nwas terrible; but whether this heat proceeded from the real fire or from\\nlove he did not know. The colors had quite gone off from him; but\\nwhether that had happened on the journey, or had been caused by grief,\\nno one could say. He looked at the little lady, she looked at him, and\\nhe felt that he was melting; but he still stood firm, shouldering his\\nmusket. 
Then suddenly the door flew open, and the draught of air caught\\nthe Dancer, and she flew like a sylph just into the stove to the Tin\\nSoldier, and flashed up in a flame, and she was gone. Then the Tin\\nSoldier melted down into a lump; and when the servant-maid took the\\nashes out next day, she found him in the shape of a little tin heart.\\nBut of the Dancer nothing remained but the tinsel rose, and that was\\nburned as black as a coal.\\n\\n198\\n\\n\"The Ugly Duckling\" has always been regarded as one of Andersen\\'s\\nmost exquisite stories. No one can fail to notice the parallel that\\nsuggests itself between the successive stages in the duckling\\'s history\\nand those in Andersen\\'s own life. In this story, remarks Dr. Brandes,\\n\"there is the quintessence of the author\\'s entire life (melancholy,\\nhumor, martyrdom, triumph) and of his whole nature: the gift of\\nobservation and the sparkling intellect which he used to avenge himself\\nupon folly and wickedness, the varied faculties which constitute his\\ngenius.\" The standards of judgment used by the ducks, the turkey, the\\nhen, and the cat are all delightfully and humorously satirical of human\\nstupidity and shortsightedness. The translation used is by H. W.\\nDulcken.\\n\\nTHE UGLY DUCKLING\\n\\nHANS CHRISTIAN ANDERSEN\\n\\nIt was glorious out in the country. It was summer, and the cornfields\\nwere yellow, and the oats were green; the hay 204 had\\nbeen put up in stacks in the green meadows, and the stork went about on\\nhis long red legs, and chattered Egyptian, for this was the language he\\nhad learned from his good mother. All around the fields and meadows were\\ngreat forests, and in the midst of these forests lay deep lakes. Yes, it\\nwas really glorious out in the country. 
In the midst of the sunshine\\nthere lay an old farm, surrounded by deep canals, and from the wall down\\nto the water grew great burdocks, so high that little children could\\nstand upright under the loftiest of them. It was just as wild there as\\nin the deepest wood. Here sat a Duck upon her nest, for she had to hatch\\nher young ones; but she was almost tired out before the little ones\\ncame; and then she so seldom had visitors. The other ducks liked better\\nto swim about in the canals than to run up to sit down under a burdock\\nand cackle with her.\\n\\nAt last one eggshell after another burst open. \"Piep! piep!\" it\\ncried, and in all the eggs there were little creatures that stuck out\\ntheir heads.\\n\\n\"Rap! rap!\" they said; and they all came rapping out as fast as they\\ncould, looking all round them under the green leaves; and the mother let\\nthem look as much as they chose, for green is good for the eyes.\\n\\n\"How wide the world is!\" said the young ones, for they certainly had\\nmuch more room now than when they were in the eggs.\\n\\n\"Do you think this is all the world!\" asked the mother. \"That extends\\nfar across the other side of the garden, quite into the parson\\'s field,\\nbut I have never been there yet. I hope you are all together,\" she\\ncontinued, and stood up. \"No, I have not all. The largest egg still lies\\nthere. How long is that to last? I am really tired of it.\" And she sat\\ndown again.\\n\\n\"Well, how goes it?\" asked an old Duck who had come to pay her a\\nvisit.\\n\\n\"It lasts a long time with that one egg,\" said the Duck who sat\\nthere. \"It will not burst. Now, only look at the others; are they not\\nthe prettiest ducks one could possibly see? They are all like their\\nfather; the bad fellow never comes to see me.\"\\n\\n\"Let me see the egg which will not burst,\" said the old visitor.\\n\"Believe me, it is a turkey\\'s egg. 
I was once cheated in that way, and\\nhad much anxiety and trouble with the young ones, for they are afraid of\\nthe water. I could not get them to venture in. I quacked and clucked,\\nbut it was of no use. Let me see the egg. Yes, that\\'s a turkey\\'s egg!\\nLet it lie there, and you teach the other children to swim.\"\\n\\n\"I think I will sit on it a little longer,\" said the Duck. \"I\\'ve sat\\nso long now that I can sit a few days more.\"\\n\\n\"Just as you please,\" said the old Duck; and she went away.\\n\\nAt last the great egg burst. \"Piep! piep!\" said the little one, and\\ncrept forth. It was very large and very ugly. The Duck looked at it.\\n\\n\"It\\'s a very large duckling,\" said she; \"none of the others look like\\nthat; can it really be a turkey chick? Now we shall soon find out. It\\nmust go into the water, even if I have to thrust it in myself.\"\\n\\nThe next day the weather was splendidly bright, and the sun shone on\\nall the green trees. The Mother-Duck went down to the water with all her\\nlittle ones. Splash! she jumped into the water. \"Quack! quack!\" she\\nsaid, and then one duckling after another plunged in. The 205 water\\nclosed over their heads, but they came up in an instant, and swam\\ncapitally; their legs went of themselves, and there they were, all in\\nthe water. The ugly gray Duckling swam with them.\\n\\n\"No, it\\'s not a turkey,\" said she; \"look how well it can use its\\nlegs, and how upright it holds itself. It is my own child! On the whole\\nit\\'s quite pretty, if one looks at it rightly. Quack! quack! come with\\nme, and I\\'ll lead you out into the great world, and present you in the\\npoultry-yard; but keep close to me, so that no one may tread on you; and\\ntake care of the cats!\"\\n\\nAnd so they came into the poultry-yard. 
There was a terrible riot\\ngoing on in there, for two families were quarreling about an eel\\'s head,\\nand the cat got it after all.\\n\\n\"See, that\\'s how it goes in the world!\" said the Mother-Duck; and she\\nwhetted her beak, for she, too, wanted the eel\\'s head. \"Only use your\\nlegs,\" she said. \"See that you bustle about, and bow your heads before\\nthe old Duck yonder. She\\'s the grandest of all here; she\\'s of Spanish\\nblood—that\\'s why she\\'s so fat; and do you see, she has a red rag round\\nher leg; that\\'s something particularly fine, and the greatest\\ndistinction a duck can enjoy; it signifies that one does not want to\\nlose her, and that she\\'s to be recognized by man and beast. Shake\\nyourselves—don\\'t turn in your toes; a well-brought-up Duck turns its\\ntoes quite out, just like father and mother, so! Now bend your necks and\\nsay \\'Rap!\\'\"\\n\\nAnd they did so; but the other Ducks round about looked at them, and\\nsaid quite boldly:\\n\\n\"Look there! now we\\'re to have these hanging on, as if there were not\\nenough of us already! And—fie—! how that Duckling yonder looks; we won\\'t\\nstand that!\" And one duck flew up immediately, and bit it in the\\nneck.\\n\\n\"Let it alone,\" said the mother; \"it does no harm to anyone.\"\\n\\n\"Yes, but it\\'s too large and peculiar,\" said the Duck who had bitten\\nit; \"and therefore it must be buffeted.\"\\n\\n\"Those are pretty children that the mother has there,\" said the old\\nDuck with the rag round her leg. \"They\\'re all pretty but that one; that\\nwas a failure. I wish she could alter it.\"\\n\\n\"That cannot be done, my lady,\" replied the Mother-Duck. \"It is not\\npretty, but it has a really good disposition, and swims as well as any\\nother; I may even say it swims better. 
I think it will grow up pretty,\\nand become smaller in time; it has lain too long in the egg, and\\ntherefore is not properly shaped.\" And then she pinched it in the neck,\\nand smoothed its feathers. \"Moreover, it is a drake,\" she said, \"and\\ntherefore it is not of so much consequence. I think he will be very\\nstrong; he makes his way already.\"\\n\\n\"The other ducklings are graceful enough,\" said the old Duck. \"Make\\nyourself at home; and if you find an eel\\'s head, you may bring it\\nme.\"\\n\\nAnd now they were at home. But the poor Duckling which had crept last\\nout of the egg, and looked so ugly, was bitten and pushed and jeered, as\\nmuch by the ducks as by the chickens.\\n\\n\"It is too big!\" they all said. And the turkey-cock, who had been\\nborn with spurs, and therefore thought himself an Emperor, blew himself\\nup like a ship in full sail, and bore straight down upon it; then he\\ngobbled, and grew quite red in the face. The poor Duckling did not know\\nwhere it should stand or walk; 206 it was quite melancholy,\\nbecause it looked ugly and was scoffed at by the whole yard.\\n\\nSo it went on the first day; and afterward it became worse and worse.\\nThe poor Duckling was hunted about by every one; even its brothers and\\nsisters were quite angry with it, and said, \"If the cat would only catch\\nyou, you ugly creature!\" And the mother said, \"If you were only far\\naway!\" And the ducks bit it, and the chickens beat it, and the girl who\\nhad to feed the poultry kicked at it with her foot.\\n\\nThen it ran and flew over the fence, and the little birds in the\\nbushes flew up in fear.\\n\\n\"That is because I am so ugly!\" thought the Duckling; and it shut its\\neyes, but flew no farther; thus it came out into the great moor, where\\nthe Wild Ducks lived. 
Here it lay the whole night long; and it was weary\\nand downcast.\\n\\nToward morning the Wild Ducks flew up, and looked at their new\\ncompanion.\\n\\n\"What sort of a one are you?\" they asked; and the Duckling turned in\\nevery direction, and bowed as well as it could. \"You are remarkably\\nugly!\" said the Wild Ducks. \"But that is very indifferent to us, so long\\nas you do not marry into our family.\"\\n\\nPoor thing! It certainly did not think of marrying, and only hoped to\\nobtain leave to lie among the reeds and drink some of the\\nswamp-water.\\n\\nThus it lay two whole days; then came thither two Wild Geese, or,\\nproperly speaking, two wild ganders. It was not long since each had\\ncrept out of an egg, and that\\'s why they were so saucy.\\n\\n\"Listen, comrade,\" said one of them. \"You\\'re so ugly that I like you.\\nWill you go with us, and become a bird of passage? Near here, in another\\nmoor, there are a few sweet lovely wild geese, all unmarried, and all\\nable to say, \\'Rap!\\' You\\'ve a chance of making your fortune, ugly as you\\nare!\"\\n\\n\"Piff! paff!\" resounded through the air; and the two ganders fell\\ndown dead in the swamp, and the water became blood-red. \"Piff! paff!\" it\\nsounded again, and whole flocks of wild geese rose up from the reeds.\\nAnd then there was another report. A great hunt was going on. The\\nhunters were lying in wait all round the moor, and some were even\\nsitting up in the branches of the trees, which spread far over the\\nreeds. The blue smoke rose up like clouds among the dark trees, and was\\nwafted far away across the water; and the hunting dogs came—splash,\\nsplash!—into the swamp, and the rushes and the reeds bent down on every\\nside. That was a fright for the poor Duckling! It turned its head, and\\nput it under its wing; but at that moment a frightful great dog stood\\nclose by the Duckling. 
His tongue hung far out of his mouth and his eyes\\ngleamed horrible and ugly; he thrust out his nose close against the\\nDuckling, showed his sharp teeth, and—splash, splash!—on he went without\\nseizing it.\\n\\n\"Oh, Heaven be thanked!\" sighed the Duckling. \"I am so ugly that even\\nthe dog does not like to bite me!\"\\n\\nAnd so it lay quite quiet, while the shots rattled through the reeds\\nand gun after gun was fired. At last, late in the day, silence was\\nrestored; but the poor Duckling did not dare to rise up; it waited\\nseveral hours before it looked round, and then hastened away out of the\\nmoor as fast as it could. It ran on over field and meadow; there was\\nsuch a storm raging that it was difficult to get from one place to\\nanother. 207\\n\\nToward evening the Duck came to a little miserable peasant\\'s hut.\\nThis hut was so dilapidated that it did not know on which side it should\\nfall; and that\\'s why it remained standing. The storm whistled round the\\nDuckling in such a way that the poor creature was obliged to sit down,\\nto stand against it; and the tempest grew worse and worse. Then the\\nDuckling noticed that one of the hinges of the door had given way, and\\nthe door hung so slanting that the Duckling could slip through the crack\\ninto the room; and it did so.\\n\\nHere lived a woman with her Tom Cat and her Hen. And the Tom Cat,\\nwhom she called Sonnie, could arch his back and purr. He could even give\\nout sparks; but for that one had to stroke his fur the wrong way. The\\nHen had quite little short legs, and therefore she was called\\nChickabiddy-shortshanks; she laid good eggs, and the woman loved her as\\nher own child.\\n\\nIn the morning the strange Duckling was at once noticed, and the Tom\\nCat began to purr, and the Hen to cluck.\\n\\n\"What\\'s this?\" said the woman, and looked all round; but she could\\nnot see well, and therefore she thought the Duckling was a fat duck that\\nhad strayed. 
\"This is a rare prize,\" she said. \"Now I shall have duck\\'s\\neggs. I hope it is not a drake. We must try that.\"\\n\\nAnd so the Duckling was admitted on trial for three weeks; but no\\neggs came. And the Tom Cat was master of the house, and the Hen was the\\nlady, and they always said, \"We and the world!\" for they thought they\\nwere half the world, and by far the better half. The Duckling thought\\none might have a different opinion, but the Hen would not allow it.\\n\\n\"Can you lay eggs?\" she asked.\\n\\n\"No.\"\\n\\n\"Then you\\'ll have the goodness to hold your tongue.\"\\n\\nAnd the Tom Cat said, \"Can you curve your back, and purr, and give\\nout sparks?\"\\n\\n\"No.\"\\n\\n\"Then you cannot have any opinion of your own when sensible people\\nare speaking.\"\\n\\nAnd the Duckling sat in a corner and was melancholy; then the fresh\\nair and the sunshine streamed in; and it was seized with such a strange\\nlonging to swim on the water that it could not help telling the Hen of\\nit.\\n\\n\"What are you thinking of?\" cried the Hen. \"You have nothing to do;\\nthat\\'s why you have these fancies. Purr or lay eggs, and they will pass\\nover.\"\\n\\n\"But it is so charming to swim on the water!\" said the Duckling, \"so\\nrefreshing to let it close above one\\'s head, and to dive down to the\\nbottom.\"\\n\\n\"Yes, that must be a mighty pleasure, truly,\" quoth the Hen. \"I fancy\\nyou must have gone crazy. Ask the Cat about it—he\\'s the cleverest animal\\nI know—ask him if he likes to swim on the water, or to dive down: I\\nwon\\'t speak about myself. Ask our mistress, the old woman; no one in the\\nworld is cleverer than she. Do you think she has any desire to swim, and\\nto let the water close above her head?\"\\n\\n\"You don\\'t understand me,\" said the Duckling.\\n\\n\"We don\\'t understand you? Then pray who is to understand you? You\\nsurely don\\'t pretend to be cleverer than the Tom Cat and the old woman—I\\nwon\\'t say anything of myself. 
Don\\'t be conceited, child, and be grateful\\nfor all the kindness you have received. Did 208 you not\\nget into a warm room, and have you not fallen into company from which\\nyou may learn something? But you are a chatterer, and it is not pleasant\\nto associate with you. You may believe me, I speak for your good. I tell\\nyou disagreeable things, and by that one may always know one\\'s true\\nfriends. Only take care that you learn to lay eggs, or to purr and give\\nout sparks!\"\\n\\n\"I think I will go out into the wide world,\" said the Duckling.\\n\\n\"Yes, do go,\" replied the Hen.\\n\\nAnd the Duckling went away. It swam on the water, and dived, but it\\nwas slighted by every creature because of its ugliness.\\n\\nNow came the autumn. The leaves in the forest turned yellow and\\nbrown; the wind caught them so that they danced about, and up in the air\\nit was very cold. The clouds hung low, heavy with hail and snow-flakes,\\nand on the fence stood the raven, crying, \"Croak! croak!\" for mere cold;\\nyes, it was enough to make one feel cold to think of this. The poor\\nlittle Duckling certainly had not a good time. One evening—the sun was\\njust setting in his beauty—there came a whole flock of great handsome\\nbirds out of the bushes; they were dazzlingly white, with long flexible\\nnecks; they were swans. They uttered a very peculiar cry, spread forth\\ntheir glorious great wings, and flew away from that cold region to\\nwarmer lands, to fair open lakes. They mounted so high, so high! and the\\nugly little Duckling felt quite strange as it watched them. It turned\\nround and round in the water like a wheel, stretched out its neck toward\\nthem, and uttered such a strange loud cry as frightened itself. Oh! it\\ncould not forget those beautiful, happy birds; and so soon as it could\\nsee them no longer, it dived down to the very bottom, and when it came\\nup again, it was quite beside itself. 
It knew not the name of those\\nbirds, and knew not whither they were flying; but it loved them more\\nthan it had ever loved anyone. It was not at all envious of them. How\\ncould it think of wishing to possess such loveliness as they had? It\\nwould have been glad if only the ducks would have endured its\\ncompany—the poor ugly creature!\\n\\nAnd the winter grew cold, very cold! The Duckling was forced to swim\\nabout in the water, to prevent the surface from freezing entirely; but\\nevery night the hole in which it swam about became smaller and smaller.\\nIt froze so hard that the icy covering crackled again; and the Duckling\\nwas obliged to use its legs continually to prevent the hole from\\nfreezing up. At last it became exhausted, and lay quite still, and thus\\nfroze fast into the ice.\\n\\nEarly in the morning a peasant came by, and when he saw what had\\nhappened, he took his wooden shoe, broke the ice-crust to pieces, and\\ncarried the Duckling home to his wife. Then it came to itself again. The\\nchildren wanted to play with it; but the Duckling thought they would do\\nit an injury, and in its terror fluttered up into the milk-pan, so that\\nthe milk spurted down into the room. The woman clapped her hands, at\\nwhich the Duckling flew down into the butter-tub, and then into the\\nmeal-barrel and out again. How it looked then! The woman screamed, and\\nstruck at it with the fire-tongs; the children tumbled over one another\\nin their efforts to catch the Duckling; and they laughed and screamed\\nfinely. Happily the door stood open, and the poor creature was able to\\nslip out between the shrubs into the newly- 209fallen\\nsnow; and there it lay quite exhausted.\\n\\nBut it would be too melancholy if I were to tell all the misery and\\ncare which the Duckling had to endure in the hard winter. 
It lay out on\\nthe moor among the reeds when the sun began to shine again and the larks\\nto sing; it was a beautiful spring.\\n\\nThen all at once the Duckling could flap its wings; they beat the air\\nmore strongly than before, and bore it strongly away; and before it well\\nknew how all this had happened, it found itself in a great garden, where\\nthe elder trees smelt sweet, and bent their long green branches down to\\nthe canal that wound through the region. Oh, here it was so beautiful,\\nsuch a gladness of spring! and from the thicket came three glorious\\nwhite swans; they rustled their wings, and swam lightly on the water.\\nThe Duckling knew the splendid creatures, and felt oppressed by a\\npeculiar sadness.\\n\\n\"I will fly away to them, to the royal birds! and they will kill me,\\nbecause I, that am so ugly, dare to approach them. But it is of no\\nconsequence! Better to be killed by them than to be pursued by\\nducks, and beaten by fowls, and pushed about by the girl who takes care\\nof the poultry-yard, and to suffer hunger in winter!\" And it flew out\\ninto the water, and swam toward the beautiful swans: these looked at it,\\nand came sailing down upon it with outspread wings. \"Kill me!\" said the\\npoor creature, and bent its head down upon the water, expecting nothing\\nbut death. But what was this that it saw in the clear water? It beheld\\nits own image—and, lo! 
it was no longer a clumsy dark-gray bird, ugly\\nand hateful to look at, but—a swan.\\n\\nIt matters nothing if one was born in a duck-yard, if one has only\\nlain in a swan\\'s egg.\\n\\nIt felt quite glad at all the need and misfortune it had suffered,\\nnow it realized its happiness in all the splendor that surrounded it.\\nAnd the great swans swam round it, and stroked it with their beaks.\\n\\nInto the garden came little children, who threw bread and corn into\\nthe water; the youngest cried, \"There is a new one!\" and the other\\nchildren shouted joyously, \"Yes, a new one has arrived!\" And they\\nclapped their hands and danced about, and ran to their father and\\nmother; and bread and cake were thrown into the water; and they all\\nsaid, \"The new one is the most beautiful of all! so young and handsome!\"\\nand the old swans bowed their heads before him.\\n\\nThen he felt quite ashamed, and hid his head under his wing, for he\\ndid not know what to do; he was so happy, and yet not at all proud. He\\nthought how he had been persecuted and despised; and now he heard them\\nsaying that he was the most beautiful of all the birds. Even the elder\\ntree bent its branches straight down into the water before him, and the\\nsun shone warm and mild. Then his wings rustled, he lifted his slender\\nneck, and cried rejoicingly from the depths of his heart:\\n\\n\"I never dreamed of so much happiness when I was still the Ugly\\nDuckling!\"\\n\\n199\\n\\nOne of the really successful modern attempts at telling new fairy\\nstories was Granny\\'s Wonderful Chair (1857) by the blind\\npoet Frances Browne (1816-1887). In spite of the obstacles due to\\nblindness, poverty, and ill-health, she succeeded in educating herself,\\nand after achieving some fame as a poet left her mountain village 210 in\\ncounty Donegal, Ireland, to make a literary career in Edinburgh and\\nLondon. She published many volumes of poems, novels, and children\\'s\\nbooks. 
Only one of these is now much read or remembered, but it has\\ntaken a firm place in the affections of children. In Granny\\'s\\nWonderful Chair there are seven stories, set in an interesting\\nframework which tells of the adventures of the little girl Snowflower\\nand her chair at the court of King Winwealth. This chair had magic power\\nto transport Snowflower wherever she wished to go, like the magic carpet\\nin the Arabian Nights. When she laid down her head and\\nsaid, \"Chair of my grandmother, tell me a story,\" a clear voice from\\nunder the cushion would at once begin to speak. Besides the story that\\nfollows, two of the most satisfactory in the collection are \"The Greedy\\nShepherd\" and \"The Story of Merrymind.\" Perhaps one of the secrets of\\ntheir charm is in the power of visualization which the author possessed.\\nThe pictures are all clear and definite, yet touched with the glamor of\\nfairyland.\\n\\nTHE STORY OF FAIRYFOOT\\n\\nFRANCES BROWNE\\n\\nOnce upon a time there stood far away in the west country a town\\ncalled Stumpinghame. It contained seven windmills, a royal palace, a\\nmarket place, and a prison, with every other convenience befitting the\\ncapital of a kingdom. A capital city was Stumpinghame, and its\\ninhabitants thought it the only one in the world. It stood in the midst\\nof a great plain, which for three leagues round its walls was covered\\nwith corn, flax, and orchards. Beyond that lay a great circle of pasture\\nland, seven leagues in breadth, and it was bounded on all sides by a\\nforest so thick and old that no man in Stumpinghame knew its extent; and\\nthe opinion of the learned was that it reached to the end of the\\nworld.\\n\\nThere were strong reasons for this opinion. First, that forest was\\nknown to be inhabited time out of mind by the fairies, and no hunter\\ncared to go beyond its border—so all the west country believed it to be\\nsolidly full of old trees to the heart. 
Secondly, the people of\\nStumpinghame were no travelers—man, woman, and child had feet so large\\nand heavy that it was by no means convenient to carry them far. Whether\\nit was the nature of the place or the people, I cannot tell, but great\\nfeet had been the fashion there time immemorial, and the higher the\\nfamily the larger were they. It was, therefore, the aim of everybody\\nabove the degree of shepherds, and such-like rustics, to swell out and\\nenlarge their feet by way of gentility; and so successful were they in\\nthese undertakings that, on a pinch, respectable people\\'s slippers would\\nhave served for panniers.\\n\\nStumpinghame had a king of its own, and his name was Stiffstep; his\\nfamily was very ancient and large-footed. His subjects called him Lord\\nof the World, and he made a speech to them every year concerning the\\ngrandeur of his mighty empire. His queen, Hammerheel, was the greatest\\nbeauty in Stumpinghame. Her majesty\\'s shoe was not much less than a\\nfishing-boat; their six children promised to be quite as handsome, and\\nall went well with them till the birth of their seventh son.\\n\\nFor a long time nobody about the palace could understand what was the\\nmatter—the ladies-in-waiting looked so astonished, and the king so\\nvexed; but at last it was whispered through the city that the queen\\'s\\nseventh child had been born with such miserably small feet that 211 they\\nresembled nothing ever seen or heard of in Stumpinghame, except the feet\\nof the fairies.\\n\\nThe chronicles furnished no example of such an affliction ever before\\nhappening in the royal family. The common people thought it portended\\nsome great calamity to the city; the learnèd men began to write books\\nabout it; and all the relations of the king and queen assembled at the\\npalace to mourn with them over their singular misfortune. 
The whole\\ncourt and most of the citizens helped in this mourning, but when it had\\nlasted seven days they all found out it was of no use. So the relations\\nwent to their homes, and the people took to their work. If the learnèd\\nmen\\'s books were written, nobody ever read them; and to cheer up the\\nqueen\\'s spirits, the young prince was sent privately out to the pasture\\nlands, to be nursed among the shepherds.\\n\\nThe chief man there was called Fleecefold, and his wife\\'s name was\\nRough Ruddy. They lived in a snug cottage with their son Blackthorn and\\ntheir daughter Brownberry, and were thought great people, because they\\nkept the king\\'s sheep. Moreover, Fleecefold\\'s family were known to be\\nancient; and Rough Ruddy boasted that she had the largest feet in all\\nthe pastures. The shepherds held them in high respect, and it grew still\\nhigher when the news spread that the king\\'s seventh son had been sent to\\ntheir cottage. People came from all quarters to see the young prince,\\nand great were the lamentations over his misfortune in having such small\\nfeet.\\n\\nThe king and queen had given him fourteen names, beginning with\\nAugustus—such being the fashion in that royal family; but the honest\\ncountry people could not remember so many; besides, his feet were the\\nmost remarkable thing about the child, so with one accord they called\\nhim Fairyfoot. At first it was feared this might be high treason, but\\nwhen no notice was taken by the king or his ministers, the shepherds\\nconcluded it was no harm, and the boy never had another name throughout\\nthe pastures. At court it was not thought polite to speak of him at all.\\nThey did not keep his birthday, and he was never sent for at Christmas,\\nbecause the queen and her ladies could not bear the sight. 
Once a year\\nthe undermost scullion was sent to see how he did, with a bundle of his\\nnext brother\\'s cast-off clothes; and, as the king grew old and cross, it\\nwas said he had thoughts of disowning him.\\n\\nSo Fairyfoot grew in Fleecefold\\'s cottage. Perhaps the country air\\nmade him fair and rosy—for all agreed that he would have been a handsome\\nboy but for his small feet, with which nevertheless he learned to walk,\\nand in time to run and to jump, thereby amazing everybody, for such\\ndoings were not known among the children of Stumpinghame. The news of\\ncourt, however, traveled to the shepherds, and Fairyfoot was despised\\namong them. The old people thought him unlucky; the children refused to\\nplay with him. Fleecefold was ashamed to have him in his cottage, but he\\ndurst not disobey the king\\'s orders. Moreover, Blackthorn wore most of\\nthe clothes brought by the scullion. At last, Rough Ruddy found out that\\nthe sight of such horrid jumping would make her children vulgar; and, as\\nsoon as he was old enough, she sent Fairyfoot every day to watch some\\nsickly sheep that grazed on a wild, weedy pasture, hard by the forest.\\n\\nPoor Fairyfoot was often lonely and sorrowful; many a time he wished\\nhis feet would grow larger, or that people wouldn\\'t notice them so much;\\nand all the comfort he had was running and jumping by himself in the\\nwild pasture, and thinking that none of the shepherds\\' children could do\\nthe like, for all their pride of their great feet.\\n\\nTired of this sport, he was lying in the shadow of a mossy rock one\\nwarm summer\\'s noon, with the sheep feeding around, when a robin, pursued\\nby a great hawk, flew into the old velvet cap which lay on the ground\\nbeside him. 
Fairyfoot covered it up, and the hawk, frightened by his\\nshout, flew away.\\n\\n\"Now you may go, poor robin!\" he said, opening the cap: but instead\\nof the bird, out sprang a little man dressed in russet-brown, and\\nlooking as if he were an hundred years old. Fairyfoot could not speak\\nfor astonishment, but the little man said—\\n\\n\"Thank you for your shelter, and be sure I will do as much for you.\\nCall on me if you are ever in trouble; my name is Robin Goodfellow\"; and\\ndarting off, he was out of sight in an instant. For days the boy\\nwondered who that little man could be, but he told nobody, for the\\nlittle man\\'s feet were as small as his own, and it was clear he would be\\nno favorite in Stumpinghame. Fairyfoot kept the story to himself, and at\\nlast midsummer came. That evening was a feast among the shepherds. There\\nwere bonfires on the hills, and fun in the villages. But Fairyfoot sat\\nalone beside his sheepfold, for the children of his village had refused\\nto let him dance with them about the bonfire, and he had gone there to\\nbewail the size of his feet, which came between him and so many good\\nthings. Fairyfoot had never felt so lonely in all his life, and\\nremembering the little man, he plucked up spirit, and cried—\\n\\n\"Ho! Robin Goodfellow!\"\\n\\n\"Here I am,\" said a shrill voice at his elbow; and there stood the\\nlittle man himself.\\n\\n\"I am very lonely, and no one will play with me, because my feet are\\nnot large enough,\" said Fairyfoot.\\n\\n\"Come then and play with us,\" said the little man. 
\"We lead the\\nmerriest lives in the world, and care for nobody\\'s feet; but all\\ncompanies have their own manners, and there are two things you must mind\\namong us: first, do as you see the rest doing; and secondly, never speak\\nof anything you may hear or see, for we and the people of this country\\nhave had no friendship ever since large feet came in fashion.\"\\n\\n\"I will do that, and anything more you like,\" said Fairyfoot; and the\\nlittle man, taking his hand, led him over the pasture into the forest\\nand along a mossy path among old trees wreathed with ivy (he never knew\\nhow far), till they heard the sound of music and came upon a meadow\\nwhere the moon shone as bright as day, and all the flowers of the\\nyear—snowdrops, violets, primroses, and cowslips—bloomed together in the\\nthick grass. There were a crowd of little men and women, some clad in\\nrusset color, but far more in green, dancing round a little well as\\nclear as crystal. And under great rose-trees which grew here and there\\nin the meadow, companies were sitting round low tables covered with cups\\nof milk, dishes of honey, and carved wooden flagons filled with clear\\nred wine. The little man led Fairyfoot up to the nearest table, handed\\nhim one of the flagons, and said 213—\\n\\n\"Drink to the good company.\"\\n\\nWine was not very common among the shepherds of Stumpinghame, and the\\nboy had never tasted such drink as that before; for scarcely had it gone\\ndown when he forgot all his troubles—how Blackthorn and Brownberry wore\\nhis clothes, how Rough Ruddy sent him to keep the sickly sheep, and the\\nchildren would not dance with him: in short, he forgot the whole\\nmisfortune of his feet, and it seemed to his mind that he was a king\\'s\\nson, and all was well with him. All the little people about the well\\ncried—\"Welcome! 
welcome!\" and every one said—\"Come and dance with me!\"\\nSo Fairyfoot was as happy as a prince, and drank milk and ate honey till\\nthe moon was low in the sky, and then the little man took him by the\\nhand, and never stopped nor stayed till he was at his own bed of straw\\nin the cottage corner.\\n\\nNext morning Fairyfoot was not tired for all his dancing. Nobody in\\nthe cottage had missed him, and he went out with the sheep as usual; but\\nevery night all that summer, when the shepherds were safe in bed, the\\nlittle man came and took him away to dance in the forest. Now he did not\\ncare to play with the shepherds\\' children, nor grieve that his father\\nand mother had forgotten him, but watched the sheep all day, singing to\\nhimself or plaiting rushes; and when the sun went down, Fairyfoot\\'s\\nheart rejoiced at the thought of meeting that merry company.\\n\\nThe wonder was that he was never tired nor sleepy, as people are apt\\nto be who dance all night; but before the summer was ended Fairyfoot\\nfound out the reason. One night, when the moon was full, and the last of\\nthe ripe corn rustling in the fields, Robin Goodfellow came for him as\\nusual, and away they went to the flowery green. The fun there was high,\\nand Robin was in haste. So he only pointed to the carved cup from which\\nFairyfoot every night drank the clear red wine.\\n\\n\"I am not thirsty, and there is no use losing time,\" thought the boy\\nto himself, and he joined the dance; but never in all his life did\\nFairyfoot find such hard work as to keep pace with the company. Their\\nfeet seemed to move like lightning, the swallows did not fly so fast or\\nturn so quickly. Fairyfoot did his best, for he never gave in easily,\\nbut at length, his breath and strength being spent, the boy was glad to\\nsteal away and sit down behind a mossy oak, where his eyes closed for\\nvery weariness. 
When he awoke the dance was nearly over, but two little\\nladies clad in green talked close beside him.\\n\\n\"What a beautiful boy!\" said one of them. \"He is worthy to be a\\nking\\'s son. Only see what handsome feet he has!\"\\n\\n\"Yes,\" said the other, with a laugh, that sounded spiteful; \"they are\\njust like the feet Princess Maybloom had before she washed them in the\\nGrowing Well. Her father has sent far and wide throughout the whole\\ncountry searching for a doctor to make them small again, but nothing in\\nthis world can do it except the water of the Fair Fountain, and none but\\nI and the nightingales know where it is.\"\\n\\n\"One would not care to let the like be known,\" said the first little\\nlady: \"there would come such crowds of these great coarse creatures of\\nmankind, nobody would have peace for leagues round. But you will surely\\nsend word to the sweet princess!—she was so kind to our birds and\\nbutterflies, and danced so like one of ourselves!\" 214\\n\\n\"Not I, indeed!\" said the spiteful fairy. \"Her old skinflint of a\\nfather cut down the cedar which I loved best in the whole forest, and\\nmade a chest of it to hold his money in; besides, I never liked the\\nprincess—everybody praised her so. But come, we shall be too late for\\nthe last dance.\"\\n\\nWhen they were gone, Fairyfoot could sleep no more with astonishment.\\nHe did not wonder at the fairies admiring his feet, because their own\\nwere much the same; but it amazed him that Princess Maybloom\\'s father\\nshould be troubled at hers growing large. 
Moreover, he wished to see\\nthat same princess and her country, since there were really other places\\nin the world than Stumpinghame.\\n\\nWhen Robin Goodfellow came to take him home as usual he durst not let\\nhim know that he had overheard anything; but never was the boy so\\nunwilling to get up as on that morning, and all day he was so weary that\\nin the afternoon Fairyfoot fell asleep, with his head on a clump of\\nrushes. It was seldom that any one thought of looking after him and the\\nsickly sheep; but it so happened that towards evening the old shepherd,\\nFleecefold, thought he would see how things went on in the pastures. The\\nshepherd had a bad temper and a thick staff, and no sooner did he catch\\nsight of Fairyfoot sleeping, and his flock straying away, than shouting\\nall the ill names he could remember, in a voice which woke up the boy,\\nhe ran after him as fast as his great feet would allow; while Fairyfoot,\\nseeing no other shelter from his fury, fled into the forest, and never\\nstopped nor stayed till he reached the banks of a little stream.\\n\\nThinking it might lead him to the fairies\\' dancing-ground, he\\nfollowed that stream for many an hour, but it wound away into the heart\\nof the forest, flowing through dells, falling over mossy rocks, and at\\nlast leading Fairyfoot, when he was tired and the night had fallen, to a\\ngrove of great rose-trees, with the moon shining on it as bright as day,\\nand thousands of nightingales singing in the branches. In the midst of\\nthat grove was a clear spring, bordered with banks of lilies, and\\nFairyfoot sat down by it to rest himself and listen. The singing was so\\nsweet he could have listened for ever, but as he sat the nightingales\\nleft off their songs, and began to talk together in the silence of the\\nnight.\\n\\n\"What boy is that,\" said one on a branch above him, \"who sits so\\nlonely by the Fair Fountain? 
He cannot have come from Stumpinghame with\\nsuch small and handsome feet.\"\\n\\n\"No, I\\'ll warrant you,\" said another, \"he has come from the west\\ncountry. How in the world did he find the way?\"\\n\\n\"How simple you are!\" said a third nightingale. \"What had he to do\\nbut follow the ground-ivy which grows over height and hollow, bank and\\nbush, from the lowest gate of the king\\'s kitchen garden to the root of\\nthis rose-tree? He looks a wise boy, and I hope he will keep the secret,\\nor we shall have all the west country here, dabbling in our fountain,\\nand leaving us no rest to either talk or sing.\"\\n\\nFairyfoot sat in great astonishment at this discourse, but by and by,\\nwhen the talk ceased and the songs began, he thought it might be as well\\nfor him to follow the ground-ivy, and see the Princess Maybloom, not to\\nspeak of getting rid of Rough Ruddy, the sickly sheep, and the crusty\\nold shepherd. It 215 was a long journey; but he went on, eating wild\\nberries by day, sleeping in the hollows of old trees by night, and never\\nlosing sight of the ground-ivy, which led him over height and hollow,\\nbank and bush, out of the forest, and along a noble high road, with\\nfields and villages on every side, to a great city, and a low\\nold-fashioned gate of the king\\'s kitchen-garden, which was thought too\\nmean for the scullions, and had not been opened for seven years.\\n\\nThere was no use knocking—the gate was overgrown with tall weeds and\\nmoss; so, being an active boy, he climbed over, and walked through the\\ngarden, till a white fawn came frisking by, and he heard a soft voice\\nsaying sorrowfully—\\n\\n\"Come back, come back, my fawn! 
I cannot run and play with you now,\\nmy feet have grown so heavy\"; and looking round he saw the loveliest\\nyoung princess in the world, dressed in snow-white, and wearing a wreath\\nof roses on her golden hair; but walking slowly, as the great people did\\nin Stumpinghame, for her feet were as large as the best of them.\\n\\nAfter her came six young ladies, dressed in white and walking slowly,\\nfor they could not go before the princess; but Fairyfoot was amazed to\\nsee that their feet were as small as his own. At once he guessed that\\nthis must be the Princess Maybloom, and made her an humble bow,\\nsaying—\\n\\n\"Royal princess, I have heard of your trouble because your feet have\\ngrown large; in my country that\\'s all the fashion. For seven years past\\nI have been wondering what would make mine grow, to no purpose; but I\\nknow of a certain fountain that will make yours smaller and finer than\\never they were, if the king, your father, gives you leave to come with\\nme, accompanied by two of your maids that are the least given to\\ntalking, and the most prudent officer in all his household; for it would\\ngrievously offend the fairies and the nightingales to make that fountain\\nknown.\"\\n\\nWhen the princess heard that, she danced for joy in spite of her\\nlarge feet, and she and her six maids brought Fairyfoot before the king\\nand queen, where they sat in their palace hall, with all the courtiers\\npaying their morning compliments. The lords were very much astonished to\\nsee a ragged, bare-footed boy brought in among them, and the ladies\\nthought Princess Maybloom must have gone mad; but Fairyfoot, making an\\nhumble reverence, told his message to the king and queen, and offered to\\nset out with the princess that very day. At first the king would not\\nbelieve that there could be any use in his offer, because so many great\\nphysicians had failed to give any relief. 
The courtiers laughed\\nFairyfoot to scorn, the pages wanted to turn him out for an impudent\\nimpostor, and the prime minister said he ought to be put to death for\\nhigh treason.\\n\\nFairyfoot wished himself safe in the forest again, or even keeping\\nthe sickly sheep; but the queen, being a prudent woman, said—\\n\\n\"I pray your majesty to notice what fine feet this boy has. There may\\nbe some truth in his story. For the sake of our only daughter, I will\\nchoose two maids who talk the least of all our train, and my\\nchamberlain, who is the most discreet officer in our household. Let them\\ngo with the princess; who knows but our sorrow may be lessened?\"\\n\\nAfter some persuasion the king consented, though all his councillors\\nadvised 216 the contrary. So the two silent maids, the\\ndiscreet chamberlain, and her fawn, which would not stay behind, were\\nsent with Princess Maybloom, and they all set out after dinner.\\nFairyfoot had hard work guiding them along the track of the ground-ivy.\\nThe maids and the chamberlain did not like the brambles and rough roots\\nof the forest—they thought it hard to eat berries and sleep in hollow\\ntrees; but the princess went on with good courage, and at last they\\nreached the grove of rose-trees, and the spring bordered with\\nlilies.\\n\\nThe chamberlain washed—and though his hair had been grey, and his\\nface wrinkled, the young courtiers envied his beauty for years after.\\nThe maids washed—and from that day they were esteemed the fairest in all\\nthe palace. Lastly, the princess washed also—it could make her no\\nfairer, but the moment her feet touched the water they grew less, and\\nwhen she had washed and dried them three times, they were as small and\\nfinely-shaped as Fairyfoot\\'s own. There was great joy among them, but\\nthe boy said sorrowfully—\\n\\n\"Oh! 
if there had been a well in the world to make my feet large, my\\nfather and mother would not have cast me off, nor sent me to live among\\nthe shepherds.\"\\n\\n\"Cheer up your heart,\" said the Princess Maybloom; \"if you want large\\nfeet, there is a well in this forest that will do it. Last summer time I\\ncame with my father and his foresters to see a great cedar cut down, of\\nwhich he meant to make a money chest. While they were busy with the\\ncedar, I saw a bramble branch covered with berries. Some were ripe and\\nsome were green, but it was the longest bramble that ever grew; for the\\nsake of the berries, I went on and on to its root, which grew hard by a\\nmuddy-looking well, with banks of dark green moss, in the deepest part\\nof the forest. The day was warm and dry and my feet were sore with the\\nrough ground, so I took off my scarlet shoes and washed my feet in the\\nwell; but as I washed they grew larger every minute, and nothing could\\never make them less again. I have seen the bramble this day; it is not\\nfar off, and as you have shown me the Fair Fountain, I will show you the\\nGrowing Well.\"\\n\\nUp rose Fairyfoot and Princess Maybloom, and went together till they\\nfound the bramble, and came to where its root grew, hard by the\\nmuddy-looking well, with banks of dark green moss in the deepest dell of\\nthe forest. Fairyfoot sat down to wash, but at that minute he heard a\\nsound of music, and knew it was the fairies going to their dancing\\nground.\\n\\n\"If my feet grow large,\" said the boy to himself, \"how shall I dance\\nwith them?\" So, rising quickly, he took the Princess Maybloom by the\\nhand. The fawn followed them; the maids and the chamberlain followed it,\\nand all followed the music through the forest. At last they came to the\\nflowery green. Robin Goodfellow welcomed the company for Fairyfoot\\'s\\nsake, and gave every one a drink of the fairies\\' wine. 
So they danced\\nthere from sunset till the grey morning, and nobody was tired; but\\nbefore the lark sang, Robin Goodfellow took them all safe home, as he\\nused to take Fairyfoot.\\n\\nThere was great joy that day in the palace because Princess\\nMaybloom\\'s feet were made small again. The king gave Fairyfoot all\\nmanner of fine clothes and rich jewels; and when they heard his\\nwonderful story, he and the queen asked 217 him to live with them and be\\ntheir son. In process of time Fairyfoot and Princess Maybloom were\\nmarried, and still live happily. When they go to visit at Stumpinghame,\\nthey always wash their feet in the Growing Well, lest the royal family\\nmight think them a disgrace, but when they come back, they make haste to\\nthe Fair Fountain; and the fairies and the nightingales are great\\nfriends to them, as well as the maids and the chamberlain, because they\\nhave told nobody about it, and there is peace and quiet yet in the grove\\nof rose-trees.\\n\\n200\\n\\nThe ill-fated Oscar Wilde (1856-1900) was born in Ireland, was\\neducated at Oxford, came into great notoriety as the reputed leader of\\nthe \"aesthetic movement,\" was prominent in the London literary world\\nfrom 1885 to 1895, fell under the obloquy of most of his countrymen, and\\ndied in distressing circumstances in Paris. In addition to some\\nremarkable plays, poems, and prose books, he wrote a number of unusual\\nstories especially fascinating to children, which were collected under\\nthe title The Happy Prince, and Other Tales. These stories\\nwere at once recognized as classic in quality. While they contain much\\nimplied criticism of certain features of modern civilization, the whole\\ntone is so idealistic and the workmanship so fine that they convey no\\nstrong note of bitterness to the child. 
\"The Happy Prince\" suggests that\\nWilde saw on the one hand \"the white faces of starving children looking\\nout listlessly at the black streets\"; while on the other hand he saw the\\nPyramids, marble angels sculptured on the cathedral tower, and the\\ngold-covered statue of the Prince of the Palace of the Care-Free. Wilde\\nalso suggests a remedy for the starvation and wretchedness that exist,\\nespecially among children, in most cities where great wealth is\\ndisplayed. The important thing in presenting this story to children is\\nto get the full sympathetic response due to the sacrifice made by the\\nHappy Prince and the little swallow. So much of the effect depends upon\\nthe wonderful beauty of the language that teachers will, as a rule, get\\nbetter results from reading or reciting than from any kind of oral\\nparaphrase. Another story in this same volume widely and successfully\\nused by teachers is the one called \"The Selfish Giant.\"\\n\\nTHE HAPPY PRINCE\\n\\nOSCAR WILDE\\n\\nHigh above the city, on a tall column, stood the statue of the Happy\\nPrince. He was gilded all over with thin leaves of fine gold, for eyes\\nhe had two bright sapphires, and a large red ruby glowed on his\\nsword-hilt.\\n\\nHe was very much admired indeed. \"He is as beautiful as a\\nweathercock,\" remarked one of the Town Councillors who wished to gain a\\nreputation for having artistic tastes; \"only not quite so useful,\" he\\nadded, fearing lest people should think him unpractical, which he really\\nwas not.\\n\\n\"Why can\\'t you be like the Happy Prince?\" asked a sensible mother of\\nher little boy who was crying for the moon. 
\"The Happy Prince never\\ndreams of crying for anything.\"\\n\\n\"I am glad there is some one in the world who is quite happy,\"\\nmuttered a disappointed man as he gazed at the wonderful statue.\\n\\n\"He looks just like an angel,\" said the Charity Children as they came\\nout of the cathedral in their bright scarlet cloaks and their clean\\nwhite pinafores.\\n\\n\"How do you know?\" said the Mathematical Master; \"you have never seen\\none.\" 218\\n\\n\"Ah! but we have, in our dreams,\" answered the children; and the\\nMathematical Master frowned and looked very severe, for he did not\\napprove of children dreaming.\\n\\nOne night there flew over the city a Little Swallow. His friends had\\ngone away to Egypt six weeks before, but he had stayed behind, for he\\nwas in love with the most beautiful Reed. He had met her early in the\\nspring as he was flying down the river after a big yellow moth, and had\\nbeen so attracted by her slender waist that he had stopped to talk to\\nher.\\n\\n\"Shall I love you?\" said the Swallow, who liked to come to the point\\nat once, and the Reed made him a low bow. So he flew round and round\\nher, touching the water with his wings, and making silver ripples. This\\nwas his courtship, and it lasted all through the summer.\\n\\n\"It is a ridiculous attachment,\" twittered the other Swallows; \"she\\nhas no money, and far too many relations\"; and indeed the river was\\nquite full of Reeds. Then when the autumn came they all flew away.\\n\\nAfter they had gone he felt lonely, and began to tire of his\\nlady-love. \"She has no conversation,\" he said, \"and I am afraid that she\\nis a coquette, for she is always flirting with the wind.\" And certainly,\\nwhenever the wind blew, the Reed made the most graceful curtseys. 
\"I\\nadmit that she is domestic,\" he continued, \"but I love traveling, and my\\nwife, consequently, should love traveling also.\"\\n\\n\"Will you come away with me?\" he said finally to her; but the Reed\\nshook her head, she was so attached to her home.\\n\\n\"You have been trifling with me,\" he cried. \"I am off to the\\nPyramids. Good-bye!\" and he flew away.\\n\\nAll day long he flew, and at night-time he arrived at the city.\\n\"Where shall I put up?\" he said; \"I hope the town has made\\npreparations.\"\\n\\nThen he saw the statue on the tall column.\\n\\n\"I will put up there,\" he cried; \"it is a fine position, with plenty\\nof fresh air.\" So he alighted just between the feet of the Happy\\nPrince.\\n\\n\"I have a golden bedroom,\" he said softly to himself as he looked\\nround, and he prepared to go to sleep; but just as he was putting his\\nhead under his wing a large drop of water fell on him. \"What a curious\\nthing!\" he cried; \"there is not a single cloud in the sky, the stars are\\nquite clear and bright, and yet it is raining. The climate in the north\\nof Europe is really dreadful. The Reed used to like the rain, but that\\nwas merely her selfishness.\"\\n\\nThen another drop fell.\\n\\n\"What is the use of a statue if it cannot keep the rain off?\" he\\nsaid; \"I must look for a good chimney-pot,\" and he determined to fly\\naway.\\n\\nBut before he had opened his wings, a third drop fell, and he looked\\nup, and saw—Ah! what did he see?\\n\\nThe eyes of the Happy Prince were filled with tears, and tears were\\nrunning down his golden cheeks. 
His face was so beautiful in the\\nmoonlight that the little Swallow was filled with pity.\\n\\n\"Who are you?\" he said.\\n\\n\"I am the Happy Prince.\"\\n\\n\"Why are you weeping then?\" asked the Swallow; \"you have quite\\ndrenched me.\"\\n\\n\"When I was alive and had a human heart,\" answered the statue, \"I did\\nnot know what tears were, for I lived in the Palace of Sans-Souci, where\\nsorrow is not allowed to enter. In the daytime I 219 played\\nwith my companions in the garden, and in the evening I led the dance in\\nthe Great Hall. Round the garden ran a very lofty wall, but I never\\ncared to ask what lay beyond it, everything about me was so beautiful.\\nMy courtiers called me the Happy Prince, and happy indeed I was, if\\npleasure be happiness. So I lived, and so I died. And now that I am dead\\nthey have set me up here so high that I can see all the ugliness and all\\nthe misery of my city, and though my heart is made of lead yet I cannot\\nchoose but weep.\"\\n\\n\"What! is he not solid gold?\" said the Swallow to himself. He was too\\npolite to make any personal remarks out loud.\\n\\n\"Far away,\" continued the statue in a low musical voice, \"far away in\\na little street there is a poor house. One of the windows is open, and\\nthrough it I can see a woman seated at a table. Her face is thin and\\nworn, and she has coarse, red hands, all pricked by the needle, for she\\nis a seamstress. She is embroidering passion-flowers on a satin gown for\\nthe loveliest of the Queen\\'s maids-of-honor to wear at the next\\nCourt-ball. In a bed in the corner of the room her little boy is lying\\nill. He has a fever, and is asking for oranges. His mother has nothing\\nto give him but river water, so he is crying. Swallow, Swallow, little\\nSwallow, will you not take her the ruby out of my sword-hilt? My feet\\nare fastened to this pedestal and I cannot move.\"\\n\\n\"I am waited for in Egypt,\" said the Swallow. 
\"My friends are flying\\nup and down the Nile, and talking to the large lotus-flowers. Soon they\\nwill go to sleep in the tomb of the great King. The King is there\\nhimself in his painted coffin. He is wrapped in yellow linen, and\\nembalmed with spices. Round his neck is a chain of pale green jade, and\\nhis hands are like withered leaves.\"\\n\\n\"Swallow, Swallow, little Swallow,\" said the Prince, \"will you not\\nstay with me for one night, and be my messenger? The boy is so thirsty,\\nand the mother so sad.\"\\n\\n\"I don\\'t think I like boys,\" answered the Swallow. \"Last summer, when\\nI was staying on the river, there were two rude boys, the miller\\'s sons,\\nwho were always throwing stones at me. They never hit me, of course; we\\nswallows fly far too well for that, and besides, I come of a family\\nfamous for its agility; but still, it was a mark of disrespect.\"\\n\\nBut the Happy Prince looked so sad that the little Swallow was sorry.\\n\"It is very cold here,\" he said; \"but I will stay with you for one\\nnight, and be your messenger.\"\\n\\n\"Thank you, little Swallow,\" said the Prince.\\n\\nSo the Swallow picked out the great ruby from the Prince\\'s sword, and\\nflew away with it in his beak over the roofs of the town.\\n\\nHe passed by the cathedral tower, where the white marble angels were\\nsculptured. He passed by the palace and heard the sound of dancing. A\\nbeautiful girl came out on the balcony with her lover. \"How wonderful\\nthe stars are,\" he said to her, \"and how wonderful is the power of\\nlove!\"\\n\\n\"I hope my dress will be ready in time for the State-ball,\" she\\nanswered; \"I have ordered passion-flowers to be embroidered on it; but\\nthe seamstresses are so lazy.\"\\n\\nHe passed over the river, and saw the lanterns hanging to the masts\\nof the ships. 
He passed over the Ghetto, and saw the old Jews bargaining\\nwith each 220 other, and weighing out money in copper scales.\\nAt last he came to the poor house and looked in. The boy was tossing\\nfeverishly on his bed, and the mother had fallen asleep, she was so\\ntired. In he hopped, and laid the great ruby on the table beside the\\nwoman\\'s thimble. Then he flew gently round the bed, fanning the boy\\'s\\nforehead with his wings. \"How cool I feel,\" said the boy. \"I must be\\ngetting better\"; and he sank into a delicious slumber.\\n\\nThen the Swallow flew back to the Happy Prince, and told him what he\\nhad done. \"It is curious,\" he remarked, \"but I feel quite warm now,\\nalthough it is so cold.\"\\n\\n\"That is because you have done a good action,\" said the Prince. And\\nthe little Swallow began to think, and then he fell asleep. Thinking\\nalways made him sleepy.\\n\\nWhen day broke he flew down to the river and had a bath. \"What a\\nremarkable phenomenon,\" said the Professor of Ornithology as he was\\npassing over the bridge. \"A swallow in winter!\" And he wrote a long\\nletter about it to the local newspaper. Every one quoted it, it was full\\nof so many words that they could not understand.\\n\\n\"To-night I go to Egypt,\" said the Swallow, and he was in high\\nspirits at the prospect. He visited all the public monuments, and sat a\\nlong time on top of the church steeple. Wherever he went the Sparrows\\nchirruped, and said to each other, \"What a distinguished stranger!\" so\\nhe enjoyed himself very much.\\n\\nWhen the moon rose he flew back to the Happy Prince. \"Have you any\\ncommissions for Egypt?\" he cried; \"I am just starting.\"\\n\\n\"Swallow, Swallow, little Swallow,\" said the Prince, \"will you not\\nstay with me one night longer?\"\\n\\n\"I am waited for in Egypt,\" answered the Swallow. \"To-morrow my\\nfriends will fly up to the Second Cataract. 
The river-horse couches\\nthere among the bulrushes, and on a great granite throne sits the God\\nMemnon. All night long he watches the stars, and when the morning star\\nshines he utters one cry of joy, and then he is silent. At noon the\\nyellow lions come down to the water\\'s edge to drink. They have eyes like\\ngreen beryls, and their roar is louder than the roar of the\\ncataract.\"\\n\\n\"Swallow, Swallow, little Swallow,\" said the Prince, \"far away across\\nthe city I see a young man in a garret. He is leaning over a desk\\ncovered with papers, and in a tumbler by his side there is a bunch of\\nwithered violets. His hair is brown and crisp, and his lips are red as a\\npomegranate, and he has large and dreamy eyes. He is trying to finish a\\nplay for the Director of the Theatre, but he is too cold to write any\\nmore. There is no fire in the grate, and hunger has made him faint.\"\\n\\n\"I will wait with you one night longer,\" said the Swallow, who really\\nhad a good heart. \"Shall I take him another ruby?\"\\n\\n\"Alas! I have no ruby now,\" said the Prince; \"my eyes are all that I\\nhave left. They are made of rare sapphires, which were brought out of\\nIndia a thousand years ago. Pluck out one of them and take it to him. He\\nwill sell it to the jeweller, and buy food and firewood, and finish his\\nplay.\"\\n\\n\"Dear Prince,\" said the Swallow, \"I cannot do that\"; and he began to\\nweep.\\n\\n\"Swallow, Swallow, little Swallow,\" said the Prince, \"do as I command\\nyou.\"\\n\\nSo the Swallow plucked out the 221 Prince\\'s eye, and flew away to\\nthe student\\'s garret. It was easy enough to get in, as there was a hole\\nin the roof. Through this he darted, and came into the room. 
The young\\nman had his head buried in his hands, so he did not hear the flutter of\\nthe bird\\'s wings, and when he looked up he found the beautiful sapphire\\nlying on the withered violets.\\n\\n\"I am beginning to be appreciated,\" he cried; \"this is from some\\ngreat admirer. Now I can finish my play,\" and he looked quite happy.\\n\\nThe next day the Swallow flew down to the harbor. He sat on the mast\\nof a large vessel and watched the sailors hauling big chests out of the\\nhold with ropes. \"Heave a-hoy!\" they shouted as each chest came up. \"I\\nam going to Egypt!\" cried the Swallow, but nobody minded, and when the\\nmoon rose he flew back to the Happy Prince.\\n\\n\"I am come to bid you good-bye,\" he cried.\\n\\n\"Swallow, Swallow, little Swallow,\" said the Prince, \"will you not\\nstay with me one night longer?\"\\n\\n\"It is winter,\" answered the Swallow, \"and the chill snow will soon\\nbe here. In Egypt the sun is warm on the green palm-trees, and the\\ncrocodiles lie in the mud and look lazily about them. My companions are\\nbuilding a nest in the Temple of Baalbec, and the pink and white doves\\nare watching them, and cooing to each other. Dear Prince, I must leave\\nyou, but I will never forget you, and next spring I will bring you back\\ntwo beautiful jewels in place of those you have given away. The ruby\\nshall be redder than a red rose, and the sapphire shall be as blue as\\nthe great sea.\"\\n\\n\"In the square below,\" said the Happy Prince, \"there stands a little\\nmatch-girl. She has let her matches fall in the gutter, and they are all\\nspoiled. Her father will beat her if she does not bring home some money,\\nand she is crying. She has no shoes or stockings, and her little head is\\nbare. Pluck out my other eye, and give it to her, and her father will\\nnot beat her.\"\\n\\n\"I will stay with you one night longer,\" said the Swallow, \"but I\\ncannot pluck out your eye. 
You would be quite blind then.\"\\n\\n\"Swallow, Swallow, little Swallow,\" said the Prince, \"do as I command\\nyou.\"\\n\\nSo he plucked out the Prince\\'s other eye, and darted down with it. He\\nswooped past the match-girl, and slipped the jewel into the palm of her\\nhand. \"What a lovely bit of glass,\" cried the little girl; and she ran\\nhome, laughing.\\n\\nThen the Swallow came back to the Prince. \"You are blind now,\" he\\nsaid, \"so I will stay with you always.\"\\n\\n\"No, little Swallow,\" said the poor Prince, \"you must go away to\\nEgypt.\"\\n\\n\"I will stay with you always,\" said the Swallow, and he slept at the\\nPrince\\'s feet.\\n\\nAll the next day he sat on the Prince\\'s shoulder, and told him\\nstories of what he had seen in strange lands. He told him of the red\\nibises, who stand in long rows on the banks of the Nile, and catch\\ngoldfish in their beaks; of the Sphinx, who is as old as the world\\nitself, and lives in the desert, and knows everything; of the merchants,\\nwho walk slowly by the side of their camels, and carry amber beads in\\ntheir hands; of the King of the Mountains of the Moon, who is as black\\nas ebony, and worships a large crystal; of the great green snake that\\nsleeps in a palm-tree, and has twenty priests to feed it with\\nhoney-cakes; and of the pygmies who sail over a big lake on large flat\\n222\\nleaves, and are always at war with the butterflies.\\n\\n\"Dear little Swallow,\" said the Prince, \"you tell me of marvelous\\nthings, but more marvelous than anything is the suffering of men and of\\nwomen. There is no Mystery so great as Misery. Fly over my city, little\\nSwallow, and tell me what you see there.\"\\n\\nSo the Swallow flew over the great city, and saw the rich making\\nmerry in their beautiful houses, while the beggars were sitting at the\\ngates. He flew into dark lanes, and saw the white faces of starving\\nchildren looking out listlessly at the black streets. 
Under the archway\\nof a bridge two little boys were lying in one another\\'s arms to try to\\nkeep themselves warm. \"How hungry we are!\" they said. \"You must not lie\\nhere,\" shouted the Watchman, and they wandered out into the rain.\\n\\nThen he flew back and told the Prince what he had seen.\\n\\n\"I am covered with fine gold,\" said the Prince; \"you must take it\\noff, leaf by leaf, and give it to my poor; the living always think that\\ngold can make them happy.\"\\n\\nLeaf after leaf of the fine gold the Swallow picked off, till the\\nHappy Prince looked quite dull and grey. Leaf after leaf of the fine\\ngold he brought to the poor, and the children\\'s faces grew rosier, and\\nthey laughed and played games in the street. \"We have bread now!\" they\\ncried.\\n\\nThen the snow came, and after the snow came the frost. The streets\\nlooked as if they were made of silver, they were so bright and\\nglistening; long icicles like crystal daggers hung down from the eaves\\nof the houses, everybody went about in furs, and the little boys wore\\nscarlet caps and skated on the ice.\\n\\nThe poor little Swallow grew colder and colder, but he would not\\nleave the Prince; he loved him too well. He picked up crumbs outside the\\nbaker\\'s door when the baker was not looking, and tried to keep himself\\nwarm by flapping his wings.\\n\\nBut at last he knew that he was going to die. He had just strength to\\nfly up to the Prince\\'s shoulder once more. \"Good-bye, dear Prince!\" he\\nmurmured, \"will you let me kiss your hand?\"\\n\\n\"I am glad that you are going to Egypt at last, little Swallow,\" said\\nthe Prince. \"You have stayed too long here; but you must kiss me on the\\nlips, for I love you.\"\\n\\n\"It is not to Egypt that I am going,\" said the Swallow. \"I am going\\nto the House of Death. 
Death is the brother of Sleep, is he not?\"\\n\\nAnd he kissed the Happy Prince on the lips, and fell down dead at his\\nfeet.\\n\\nAt that moment a curious crack sounded inside the statue, as if\\nsomething had suddenly broken. The fact is that the leaden heart had\\nsnapped right in two. It certainly was a dreadfully hard frost.\\n\\nEarly the next morning the Mayor was walking in the square below in\\ncompany with the Town Councillors. As they passed the column he looked\\nup at the statue: \"Dear me! how shabby the Happy Prince looks!\" he\\nsaid.\\n\\n\"How shabby indeed!\" cried the Town Councillors, who always agreed\\nwith the Mayor; and they went up to look at it.\\n\\n\"The ruby has fallen out of his sword, his eyes are gone, and he is\\ngolden no longer,\" said the Mayor; \"in fact, he is little better than a\\nbeggar!\"\\n\\n\"Little better than a beggar,\" said the Town Councillors.\\n\\n\"And here is actually a dead bird at his feet!\" continued the Mayor.\\n\"We 223 must really issue a proclamation that birds are\\nnot to be allowed to die here.\" And the Town Clerk made a note of the\\nsuggestion.\\n\\nSo they pulled down the statue of the Happy Prince. \"As he is no\\nlonger beautiful he is no longer useful,\" said the Art Professor at the\\nUniversity.\\n\\nThen they melted the statue in a furnace, and the Mayor held a\\nmeeting of the Corporation to decide what was to be done with the metal.\\n\"We must have another statue, of course,\" he said, \"and it shall be a\\nstatue of myself.\"\\n\\n\"Of myself,\" said each of the Town Councillors, and they quarrelled.\\nWhen I last heard of them they were quarreling still.\\n\\n\"What a strange thing!\" said the overseer of the workmen at the\\nfoundry. \"This broken lead heart will not melt in the furnace. 
We must\\nthrow it away.\" So they threw it on a dustheap where the dead Swallow\\nwas also lying.\\n\\n\"Bring me the two most precious things in the city,\" said God to one\\nof His Angels; and the Angel brought Him the leaden heart and the dead\\nbird.\\n\\n\"You have rightly chosen,\" said God, \"for in my garden of Paradise\\nthis little bird shall sing for evermore, and in my city of gold the\\nHappy Prince shall praise me.\"\\n\\n201\\n\\nTwo stories of unusual interest and charm for children are found in\\nthe collection of eleven by Raymond M. Alden (1873—), Why the\\nChimes Rang. One is the title story of the volume; the other is\\n\"The Knights of the Silver Shield.\" The latter follows by permission of\\nthe publishers, The Bobbs-Merrill Co., Indianapolis. (Copyright, 1906,\\n1908.) It is of striking dramatic interest and emphasizes a much-needed\\nquality of character, the importance of a loyal performance of the\\nlowlier duties of life. The salvation of a nation may depend upon the\\nhumble guardian of the gate quite as much as upon those who are engaged\\nin the more spectacular struggle with giants. Mr. Alden is a scholarly\\nprofessor of literature in Leland Stanford Jr. 
University, and it may\\ninterest the reader to know that he is the son of the author of the\\nPansy Books, a type of religious or Sunday-school fiction\\nwidely read throughout the country by a generation or two of young\\npeople.\\n\\nTHE KNIGHTS OF THE SILVER SHIELD\\n\\nRAYMOND MACDONALD ALDEN\\n\\nThere was once a splendid castle in a forest, with great stone walls\\nand a high gateway, and turrets that rose away above the tallest trees.\\nThe forest was dark and dangerous, and many cruel giants lived in it;\\nbut in the castle was a company of knights, who were kept there by the\\nking of the country, to help travelers who might be in the forest and to\\nfight with the giants whenever they could.\\n\\nEach of these knights wore a beautiful suit of armor and carried a\\nlong spear, while over his helmet there floated a great red plume that\\ncould be seen a long way off by any one in distress. But the most\\nwonderful thing about the knights\\' armor was their shields. They were\\nnot like those of other knights, but had been made by a great magician\\nwho had lived in the castle many years before. They were made of silver,\\nand sometimes shone in the sunlight with dazzling brightness; but at\\nother times the surface of the shields would be clouded as though by a\\nmist, and one could not see his face 224 reflected there as he could\\nwhen they shone brightly.\\n\\nNow, when each young knight received his spurs and his armor, a new\\nshield was also given him from among those that the magician had made;\\nand when the shield was new its surface was always cloudy and dull. But\\nas the knight began to do service against the giants, or went on\\nexpeditions to help poor travelers in the forest, his shield grew\\nbrighter and brighter, so that he could see his face clearly reflected\\nin it. 
But if he proved to be a lazy or cowardly knight, and let the\\ngiants get the better of him, or did not care what became of the\\ntravelers, then the shield grew more and more cloudy, until the knight\\nbecame ashamed to carry it.\\n\\nBut this was not all. When any one of the knights fought a\\nparticularly hard battle, and won the victory, or when he went on some\\nhard errand for the lord of the castle, and was successful, not only did\\nhis silver shield grow brighter, but when one looked into the center of\\nit he could see something like a golden star shining in its very heart.\\nThis was the greatest honor that a knight could achieve, and the other\\nknights always spoke of such a one as having \"won his star.\" It was\\nusually not till he was pretty old and tried as a soldier that he could\\nwin it. At the time when this story begins, the lord of the castle\\nhimself was the only one of the knights whose shield bore the golden\\nstar.\\n\\nThere came a time when the worst of the giants in the forest gathered\\nthemselves together to have a battle against the knights. They made a\\ncamp in a dark hollow not far from the castle, and gathered all their\\nbest warriors together, and all the knights made ready to fight them.\\nThe windows of the castle were closed and barred; the air was full of\\nthe noise of armor being made ready for use; and the knights were so\\nexcited that they could scarcely rest or eat.\\n\\nNow there was a young knight in the castle, named Sir Roland, who was\\namong those most eager for the battle. He was a splendid warrior, with\\neyes that shone like stars whenever there was anything to do in the way\\nof knightly deeds. And although he was still quite young, his shield had\\nbegun to shine enough to show plainly that he had done bravely in some\\nof his errands through the forest. This battle, he thought, would be the\\ngreat opportunity of his life. 
And on the morning of the day when they\\nwere to go forth to it, and all the knights assembled in the great hall\\nof the castle to receive the commands of their leaders, Sir Roland hoped\\nthat he would be put in the most dangerous place of all, so that he\\ncould show what knightly stuff he was made of.\\n\\nBut when the lord of the castle came to him, as he went about in full\\narmor giving his commands, he said: \"One brave knight must stay behind\\nand guard the gateway of the castle, and it is you, Sir Roland, being\\none of the youngest, whom I have chosen for this.\"\\n\\nAt these words Sir Roland was so disappointed that he bit his lip and\\nclosed his helmet over his face so that the other knights might not see\\nit. For a moment he felt as if he must reply angrily to the commander\\nand tell him that it was not right to leave so sturdy a knight behind\\nwhen he was eager to fight. But he struggled against this feeling and\\nwent quietly to look after his duties at the gate. The gateway was high\\nand narrow, and was reached from outside by a high, narrow bridge that\\ncrossed the moat, which 225 surrounded the castle on every side. When an\\nenemy approached, the knight on guard rang a great bell just inside the\\ngate, and the bridge was drawn up against the castle wall, so that no\\none could come across the moat. So the giants had long ago given up\\ntrying to attack the castle itself.\\n\\nTo-day the battle was to be in the dark hollow in the forest, and it\\nwas not likely that there would be anything to do at the castle gate,\\nexcept to watch it like a common doorkeeper. It was not strange that Sir\\nRoland thought some one else might have done this.\\n\\nPresently all the other knights marched out in their flashing armor,\\ntheir red plumes waving over their heads, and their spears in their\\nhands. 
The lord of the castle stopped only to tell Sir Roland to keep\\nguard over the gate until they had all returned and to let no one enter.\\nThen they went into the shadows of the forest and were soon lost to\\nsight.\\n\\nSir Roland stood looking after them long after they had gone,\\nthinking how happy he would be if he were on the way to battle like\\nthem. But after a little he put this out of his mind and tried to think\\nof pleasanter things. It was a long time before anything happened, or\\nany word came from the battle.\\n\\nAt last Sir Roland saw one of the knights come limping down the path\\nto the castle, and he went out on the bridge to meet him. Now this\\nknight was not a brave one, and he had been frightened away as soon as\\nhe was wounded.\\n\\n\"I have been hurt,\" he said, \"so that I can not fight any more. But I\\ncould watch the gate for you, if you would like to go back in my\\nplace.\"\\n\\nAt first Sir Roland\\'s heart leaped with joy at this, but then he\\nremembered what the commander had told him on going away, and he\\nsaid:\\n\\n\"I should like to go, but a knight belongs where his commander has\\nput him. My place is here at the gate, and I can not open it even for\\nyou. Your place is at the battle.\"\\n\\nThe knight was ashamed when he heard this, and he presently turned\\nabout and went into the forest again.\\n\\nSo Sir Roland kept guard silently for another hour. Then there came\\nan old beggar woman down the path to the castle and asked Sir Roland if\\nshe might come in and have some food. He told her that no one could\\nenter the castle that day, but that he would send a servant out to her\\nwith food, and that she might sit and rest as long as she would.\\n\\n\"I have been past the hollow in the forest where the battle is going\\non,\" said the old woman, while she was waiting for her food.\\n\\n\"And how do you think it is going?\" asked Sir Roland.\\n\\n\"Badly for the knights, I am afraid,\" said the old woman. 
\"The giants\\nare fighting as they have never fought before. I should think you had\\nbetter go and help your friends.\"\\n\\n\"I should like to, indeed,\" said Sir Roland. \"But I am set to guard\\nthe gateway of the castle and can not leave.\"\\n\\n\"One fresh knight would make a great difference when they are all\\nweary with fighting,\" said the old woman. \"I should think that, while\\nthere are no enemies about, you would be much more useful there.\"\\n\\n\"You may well think so,\" said Sir Roland, \"and so may I; but it is\\nneither you nor I that is commander here.\"\\n\\n\"I suppose,\" said the old woman then, \"that you are one of the kind\\nof 226 knights who like to keep out of fighting. You are\\nlucky to have so good an excuse for staying at home.\" And she laughed a\\nthin and taunting laugh.\\n\\nThen Sir Roland was very angry, and thought that if it were only a\\nman instead of a woman, he would show him whether he liked fighting or\\nno. But as it was a woman, he shut his lips and set his teeth hard\\ntogether, and as the servant came just then with the food he had sent\\nfor, he gave it to the old woman quickly and shut the gate that she\\nmight not talk to him any more.\\n\\nIt was not very long before he heard some one calling outside. Sir\\nRoland opened the gate and saw standing at the other end of the\\ndrawbridge a little old man in a long black cloak. \"Why are you knocking\\nhere?\" he said. \"The castle is closed to-day.\"\\n\\n\"Are you Sir Roland?\" said the little old man.\\n\\n\"Yes,\" said Sir Roland.\\n\\n\"Then you ought not to be staying here when your commander and his\\nknights are having so hard a struggle with the giants, and when you have\\nthe chance to make of yourself the greatest knight in this kingdom.\\nListen to me! 
I have brought you a magic sword.\"\\n\\nAs he said this, the old man drew from under his coat a wonderful\\nsword that flashed in the sunlight as if it were covered with diamonds.\\n\"This is the sword of all swords,\" he said, \"and it is for you, if you\\nwill leave your idling here by the castle gate and carry it to the\\nbattle. Nothing can stand before it. When you lift it the giants will\\nfall back, your master will be saved, and you will be crowned the\\nvictorious knight—the one who will soon take his commander\\'s place as\\nlord of the castle.\"\\n\\nNow Sir Roland believed that it was a magician who was speaking to\\nhim, for it certainly appeared to be a magic sword. It seemed so\\nwonderful that the sword should be brought to him, that he reached out\\nhis hand as though he would take it, and the little old man came\\nforward, as though he would cross the drawbridge into the castle. But as\\nhe did so, it came to Sir Roland\\'s mind again that that bridge and the\\ngateway had been intrusted to him, and he called out \"No!\" to the old\\nman, so that he stopped where he was standing. But he waved the shining\\nsword in the air again, and said: \"It is for you! Take it, and win the\\nvictory!\"\\n\\nSir Roland was really afraid that if he looked any longer at the\\nsword or listened to any more words of the old man, he would not be able\\nto hold himself within the castle. For this reason he struck the great\\nbell at the gateway, which was the signal for the servants inside to\\npull in the chains of the drawbridge, and instantly they began to pull,\\nand the drawbridge came up, so that the old man could not cross it to\\nenter the castle, nor Sir Roland to go out.\\n\\nThen, as he looked across the moat, Sir Roland saw a wonderful thing.\\nThe little old man threw off his black cloak, and as he did so he began\\nto grow bigger and bigger, until in a minute more he was a giant as tall\\nas any in the forest. 
At first Sir Roland could scarcely believe his\\neyes. Then he realized that this must be one of their giant enemies, who\\nhad changed himself to a little old man through some magic power, that\\nhe might make his way into the castle while all the knights were away.\\nSir Roland shuddered to think what might have happened if he had taken\\nthe sword and left the gate unguarded. The giant shook his 227 fist\\nacross the moat that lay between them, and then, knowing that he could\\ndo nothing more, he went angrily back into the forest.\\n\\nSir Roland now resolved not to open the gate again, and to pay no\\nattention to any other visitor. But it was not long before he heard a\\nsound that made him spring forward in joy. It was the bugle of the lord\\nof the castle, and there came sounding after it the bugles of many of\\nthe knights that were with him, pealing so joyfully that Sir Roland was\\nsure they were safe and happy. As they came nearer, he could hear their\\nshouts of victory. So he gave the signal to let down the drawbridge\\nagain, and went out to meet them. They were dusty and bloodstained and\\nweary, but they had won the battle with the giants; and it had been such\\na great victory that there had never been a happier home-coming.\\n\\nSir Roland greeted them all as they passed in over the bridge, and\\nthen, when he had closed the gate and fastened it, he followed them into\\nthe great hall of the castle. The lord of the castle took his place on\\nthe highest seat, with the other knights about him, and Sir Roland came\\nforward with the key of the gate, to give his account of what he had\\ndone in the place to which the commander had appointed him. The lord of\\nthe castle bowed to him as a sign for him to begin, but just as he\\nopened his mouth to speak, one of the knights cried out:\\n\\n\"The shield! the shield! Sir Roland\\'s shield!\"\\n\\nEvery one turned and looked at the shield which Sir Roland carried on\\nhis left arm. 
He himself could see only the top of it and did not know\\nwhat they could mean. But what they saw was the golden star of\\nknighthood, shining brightly from the center of Sir Roland\\'s shield.\\nThere had never been such amazement in the castle before.\\n\\nSir Roland knelt before the lord of the castle to receive his\\ncommands. He still did not know why every one was looking at him so\\nexcitedly, and wondered if he had in some way done wrong.\\n\\n\"Speak, Sir Knight,\" said the commander, as soon as he could find his\\nvoice after his surprise, \"and tell us all that has happened to-day at\\nthe castle. Have you been attacked? Have any giants come hither? Did you\\nfight them alone?\"\\n\\n\"No, my Lord,\" said Sir Roland. \"Only one giant has been here, and he\\nwent away silently when he found he could not enter.\"\\n\\nThen he told all that had happened through the day.\\n\\nWhen he had finished, the knights all looked at one another, but no\\none spoke a word. Then they looked again at Sir Roland\\'s shield, to make\\nsure that their eyes had not deceived them, and there the golden star\\nwas still shining.\\n\\nAfter a little silence the lord of the castle spoke.\\n\\n\"Men make mistakes,\" he said, \"but our silver shields are never\\nmistaken. Sir Roland has fought and won the hardest battle of all\\nto-day.\"\\n\\nThen the others all rose and saluted Sir Roland, who was the youngest\\nknight that ever carried the golden star.\\n\\n202\\n\\nJean Ingelow (1820-1897) was an English poet, novelist, and writer of\\nstories for children, who lived in the fen district of Lincolnshire. Her\\nmost noted poem deals with a terrible catastrophe that happened there\\nmore than three centuries ago. It is called \"The High Tide on the\\nCoast of Lincolnshire.\" Many reading books for the third or fourth grade\\ncontain her dainty and melodious \"Seven Times One,\" in which a little\\ngirl expresses the joy and sense of power felt on reaching a seventh\\nbirthday. 
Of her children\\'s books, the favorite is Mopsa the\\nFairy, which some one has called a \"delightful succession of\\nbreezy impossibilities.\" Her shorter stories for children are collected\\nunder the title Stories Told to a Child (two series), from\\nwhich \"The Prince\\'s Dream\" is taken. It is somewhat old fashioned in\\nmethod and style, reminding one of the stories of the days of Addison\\nand Steele. Its seriousness is in striking contrast with the more\\nflippant note in much modern writing for children, and it is sure to\\nsuggest some questions on the dangers and advantages of great\\npossessions in their effects on labor, liberty, and human happiness in\\ngeneral. However, the moral will take care of itself, and the attention\\nshould rest on the means used by the old man to teach the young prince\\nthe things he is shut out from learning by experience. The children will\\neasily see that it is an anticipation of the moving-picture method. Some\\nother good stories in the collection mentioned are \"I Have a Right,\"\\n\"The Fairy Who Judged Her Neighbors,\" and \"Anselmo.\"\\n\\nTHE PRINCE\\'S DREAM\\n\\nJEAN INGELOW\\n\\nIf we may credit the fable, there is a tower in the midst of a great\\nAsiatic plain, wherein is confined a prince who was placed there in his\\nearliest infancy, with many slaves and attendants, and all the luxuries\\nthat are compatible with imprisonment.\\n\\nWhether he was brought there from some motive of state, whether to\\nconceal him from enemies, or to deprive him of rights, has not\\ntranspired; but it is certain that up to the date of this little history\\nhe had never set his foot outside the walls of that high tower, and that\\nof the vast world without he knew only the green plains which surrounded\\nit; the flocks and the birds of that region were all his experience of\\nliving creatures, and all the men he saw outside were shepherds.\\n\\nAnd yet he was not utterly deprived of change, for sometimes one of\\nhis 
attendants would be ordered away, and his place would be supplied by\\na new one. This fresh companion the prince would never weary of\\nquestioning, and letting him talk of cities, of ships, of forests, of\\nmerchandise, of kings; but though in turns they all tried to satisfy his\\ncuriosity, they could not succeed in conveying very distinct notions to\\nhis mind; partly because there was nothing in the tower to which they\\ncould compare the external world, partly because, having chiefly lived\\nlives of seclusion and indolence in Eastern palaces, they knew it only\\nby hearsay themselves.\\n\\nAt length, one day, a venerable man of a noble presence was brought\\nto the tower, with soldiers to guard him and slaves to attend him. The\\nprince was glad of his presence, though at first he seldom opened his\\nlips, and it was manifest that confinement made him miserable. With\\nrestless feet he would wander from window to window of the stone tower,\\nand mount from story to story; but mount as high as he would there was\\nstill nothing to be seen but the vast unvarying plain, clothed with\\nscanty grass, and flooded with the glaring sunshine; flocks and herds,\\nand shepherds, moved across it sometimes, but nothing else, not even a\\nshadow, for there was no cloud in the sky to cast one.\\n\\nThe old man, however, always treated the prince with respect, and\\nanswered his questions with a great deal of patience, till at length he\\nfound a pleasure in satisfying his curiosity, which so much pleased the\\nyoung prisoner, that, as a great condescension, he invited him to come\\nout on the roof of the tower and drink sherbet with him in the cool of\\nthe evening, and tell him of the country beyond the desert, and what\\nseas are like, and mountains, and towns.\\n\\n\"I have learnt much from my attendants, and know this world pretty\\nwell by hearsay,\" said the prince, as they reclined on the rich carpet\\nwhich was spread on the roof.\\n\\nThe old man smiled, but did 
not answer; perhaps because he did not\\ncare to undeceive his young companion, perhaps because so many slaves\\nwere present, some of whom were serving them with fruit, and others\\nburning rich odors on a little chafing-dish that stood between them.\\n\\n\"But there are some words to which I never could attach any\\nparticular meaning,\" proceeded the prince, as the slaves began to\\nretire, \"and three in particular that my attendants cannot satisfy me\\nupon, or are reluctant to do so.\"\\n\\n\"What words are those, my prince?\" asked the old man. The prince\\nturned on his elbow to be sure that the last slave had descended the\\ntower stairs, then replied—\\n\\n\"O man of much knowledge, the words are these—Labor, and Liberty, and\\nGold.\"\\n\\n\"Prince,\" said the old man, \"I do not wonder that it has been hard to\\nmake thee understand the first, the nature of it, and the cause why most\\nmen are born to it; as for the second, it would be treason for thee and\\nme to do more than whisper it here, and sigh for it when none are\\nlistening; but the third need hardly puzzle thee, thy hookah is bright\\nwith it; all thy jewels are set in it; gold is inlaid in the ivory of\\nthy bath; thy cup and thy dish are of gold, and golden threads are\\nwrought into thy raiment.\"\\n\\n\"That is true,\" replied the prince, \"and if I had not seen and\\nhandled this gold, perhaps I might not find its merits so hard to\\nunderstand; but I possess it in abundance, and it does not feed me, nor\\nmake music for me, nor fan me when the sun is hot, nor cause me to sleep\\nwhen I am weary; therefore when my slaves have told me how merchants go\\nout and brave the perilous wind and sea, and live in the unstable ships,\\nand run risks from shipwreck and pirates, and when, having asked them\\nwhy they have done this, they have answered, \\'For gold,\\' I have found it\\nhard to believe them; and when they have told me how men have lied, and\\nrobbed, and deceived; how they have 
murdered one another, and leagued\\ntogether to depose kings, to oppress provinces, and all for gold; then I\\nhave said to myself, either my slaves have combined to make me believe\\nthat which is not, or this gold must be very different from the yellow\\nstuff that this coin is made of, this coin which is of no use but to\\nhave a hole pierced through it and hang to my girdle, that it may tinkle\\nwhen I walk.\"\\n\\n\"Notwithstanding,\" said the old man, \"nothing can be done without\\ngold; for look you, prince, it is better than bread, and fruit, and\\nmusic, for it can buy them all, since men love it, and have agreed to\\nexchange it for whatever they may need.\"\\n\\n\"How so?\" asked the prince.\\n\\n\"If a man has many loaves he cannot eat them all,\" answered the old\\nman; \"therefore he goes to his neighbor and 230 says,\\n\\'I have bread and thou hast a coin of gold—let us change\\'; so he\\nreceives the gold and goes to another man, saying, \\'Thou hast two houses\\nand I have none; lend me one of thy houses to live in, and I will give\\nthee my gold\\'; thus again they change, and he that has the gold says, \\'I\\nhave food enough and goods enough, but I want a wife, I will go to the\\nmerchant and get a marriage gift for her father, and for it I will give\\nhim this gold.\\'\"\\n\\n\"It is well,\" said the prince; \"but in time of drought, if there is\\nno bread in a city, can they make it of gold?\"\\n\\n\"Not so,\" answered the old man, \"but they must send their gold to a\\ncity where there is food, and bring that back instead of it.\"\\n\\n\"But if there was a famine all over the world,\" asked the prince,\\n\"what would they do then?\"\\n\\n\"Why then, and only then,\" said the old man, \"they must starve, and\\nthe gold would be nought, for it can only be changed for that which\\nis; it cannot make that which is not.\"\\n\\n\"And where do they get gold?\" asked the prince; \"is it the precious\\nfruit of some rare tree, or have they whereby 
they can draw it down from\\nthe sky at sunset?\"\\n\\n\"Some of it,\" said the old man, \"they dig out of the ground.\"\\n\\nThen he told the prince of ancient rivers running through terrible\\ndeserts, whose sands glitter, with golden grains and are yellow in the\\nfierce heat of the sun, and of dreary mines where the Indian slaves work\\nin gangs tied together, never seeing the light of day; and lastly (for\\nhe was a man of much knowledge, and had traveled far), he told him of\\nthe valley of the Sacramento in the New World, and of those mountains\\nwhere the people of Europe send their criminals, and where now their\\nfree men pour forth to gather gold, and dig for it as hard as if for\\nlife; sitting up by it at night lest any should take it from them,\\ngiving up houses and country, and wife and children, for the sake of a\\nfew feet of mud, whence they dig clay that glitters as they wash it; and\\nhow they sift it and rock it as patiently as if it were their own\\nchildren in the cradle, and afterwards carry it in their bosoms, and\\nforego on account of it safety and rest.\\n\\n\"But, prince,\" he proceeded, observing that the young man was\\nabsorbed in his narrative, \"if you would pass your word to me never to\\nbetray me, I would procure for you a sight of the external world, and in\\na trance you should see those places where gold is dug, and traverse\\nthose regions forbidden to your mortal footsteps.\"\\n\\nUpon this, the prince threw himself at the old man\\'s feet, and\\npromised heartily to observe the secrecy required, and entreated that,\\nfor however short time, he might be suffered to see this wonderful\\nworld.\\n\\nThen, if we may credit the story, the old man drew nearer to the\\nchafing-dish which stood between them, and having fanned the dying\\nembers in it, cast upon them a certain powder and some herbs, from\\nwhence as they burnt a peculiar smoke arose. 
As their vapors spread, he\\ndesired the prince to draw near and inhale them, and then (says the\\nfable) when he should sleep he should find himself, in his dream, at\\nwhatever place he might desire, with this strange advantage, that he\\nshould see things in their truth and reality as well as in their outward\\nshows.\\n\\nSo the prince, not without some fear, prepared to obey; but first he\\ndrank his 231 sherbet, and handed over the golden cup to the\\nold man by way of recompense; then he reclined beside the chafing-dish\\nand inhaled the heavy perfume till he became overpowered with sleep, and\\nsank down upon the carpet in a dream.\\n\\nThe prince knew not where he was, but a green country was floating\\nbefore him, and he found himself standing in a marshy valley, where a\\nfew wretched cottages were scattered here and there with no means of\\ncommunication. There was a river, but it had overflowed its banks and\\nmade the central land impassable, the fences had been broken down by it,\\nand the fields of corn laid low; a few wretched peasants were wandering\\nabout there; they looked half clad and half starved. \"A miserable valley\\nindeed!\" exclaimed the prince; but as he said it a man came down from\\nthe hills with a great bag of gold in his hand.\\n\\n\"This valley is mine,\" said he to the people; \"I have bought it for\\ngold. Now make banks that the river may not overflow, and I will give\\nyou gold; also make fences and plant fields, and cover in the roofs of\\nyour houses, and buy yourselves richer clothing.\" So the people did so,\\nand as the gold got lower in the bag the valley grew fairer and greener,\\ntill the prince exclaimed, \"O gold, I see your value now! 
O wonderful,\\nbeneficent gold!\"\\n\\nBut presently the valley melted away like a mist, and the prince saw\\nan army besieging a city; he heard a general haranguing his soldiers to\\nurge them on, and the soldiers shouting and battering the walls; but\\nshortly, when the city was well-nigh taken, he saw some men secretly\\nthrowing gold among the soldiers, so much of it that they threw down\\ntheir arms to pick it up, and said that the walls were so strong that\\nthey could not throw them down. \"O powerful gold!\" thought the prince;\\n\"thou art stronger than the city walls!\"\\n\\nAfter that it seemed to himself that he was walking about in a desert\\ncountry, and in his dream he thought, \"Now I know what labor is, for I\\nhave seen it, and its benefits; and I know what liberty is, for I have\\ntasted it; I can wander where I will, and no man questions me; but gold\\nis more strange to me than ever, for I have seen it buy both liberty and\\nlabor.\" Shortly after this he saw a great crowd digging upon a barren\\nhill, and when he drew near he understood that he had reached the summit\\nof his wishes, and that he was to see the place where the gold came\\nfrom.\\n\\nHe came up and stood a long time watching the people as they toiled\\nready to faint in the sun, so great was the labor of digging the\\ngold.\\n\\nHe saw who had much and could not trust any one to help them to carry\\nit, binding it in bundles over their shoulders, and bending and groaning\\nunder its weight; he saw others hide it in the ground, and watch the\\nplace clothed in rags, that none might suspect that they were rich; but\\nsome, on the contrary, who had dug up an unusual quantity, he saw\\ndancing and singing, and vaunting their success, till robbers waylaid\\nthem when they slept, and rifled their bundles and carried their golden\\nsand away.\\n\\n\"All these men are mad,\" thought the prince, \"and this pernicious\\ngold has made them so.\"\\n\\nAfter this, as he wandered here and 
there, he saw groups of people\\nsmelting the gold under the shadow of the trees, and he observed that a\\ndancing, quivering vapor rose up from it, which dazzled their 232 eyes,\\nand distorted everything that they looked at; arraying it also in\\ndifferent colors from the true one. He observed that this vapor from the\\ngold caused all things to rock and reel before the eyes of those who\\nlooked through it, and also, by some strange affinity, it drew their\\nhearts towards those that carried much gold on their persons, so that\\nthey called them good and beautiful; it also caused them to see darkness\\nand dullness in the faces of those who carried none. \"This,\" thought the\\nprince, \"is very strange\"; but not being able to explain it, he went\\nstill further, and there he saw more people. Each of these had adorned\\nhimself with a broad golden girdle, and was sitting in the shade, while\\nother men waited on them.\\n\\n\"What ails these people?\" he inquired of one who was looking on, for\\nhe observed a peculiar air of weariness and dullness in their faces. 
He\\nwas answered that the girdles were very tight and heavy, and being bound\\nover the regions of the heart, were supposed to impede its action, and\\nprevent it from beating high, and also to chill the wearer, as being of\\nopaque material, the warm sunshine of the earth could not get through to\\nwarm him.\\n\\n\"Why, then, do they not break them asunder,\" exclaimed the prince,\\n\"and fling them away?\"\\n\\n\"Break them asunder!\" cried the man; \"why what a madman you must be;\\nthey are made of the purest gold!\"\\n\\n\"Forgive my ignorance,\" replied the prince; \"I am a stranger.\"\\n\\nSo he walked on, for feelings of delicacy prevented him from gazing\\nany longer at the men with the golden girdles; but as he went he\\npondered on the misery he had seen, and thought to himself that this\\ngolden sand did more mischief than all the poisons of the apothecary;\\nfor it dazzled the eyes of some, it strained the hearts of others, it\\nbowed down the heads of many to the earth with its weight; it was a sore\\nlabor to gather it, and when it was gathered, the robber might carry it\\naway; it would be a good thing, he thought, if there were none of\\nit.\\n\\nAfter this he came to a place where were sitting some aged widows and\\nsome orphan children of the gold-diggers, who were helpless and\\ndestitute; they were weeping and bemoaning themselves, but stopped at\\nthe approach of a man, whose appearance attracted the prince, for he had\\na very great bundle of gold on his back, and yet it did not bow him down\\nat all; his apparel was rich but he had no girdle on, and his face was\\nanything but sad.\\n\\n\"Sir,\" said the prince to him, \"you have a great burden; you are\\nfortunate to be able to stand under it.\"\\n\\n\"I could not do so,\" he replied, \"only that as I go on I keep\\nlightening it\"; and as he passed each of the widows, he threw gold to\\nher, and stooping down, hid pieces of it in the bosoms of the\\nchildren.\\n\\n\"You have no girdle,\" 
said the prince.\\n\\n\"I once had one,\" answered the gold gatherer; \"but it was so tight\\nover my breast that my very heart grew cold under it, and almost ceased\\nto beat. Having a great quantity of gold on my back, I felt almost at\\nthe last gasp; so I threw off my girdle and being on the bank of a\\nriver, which I knew not how to cross, I was about to fling it in, I was\\nso vexed! \\'But no,\\' thought I, \\'there are many people waiting here to\\ncross besides myself. I will make my girdle into a bridge, and we will\\ncross over on it.\\'\"\\n\\n\"Turn your girdle into a bridge!\" exclaimed the prince doubtfully,\\nfor he did not quite understand.\\n\\nThe man explained himself. 233\\n\\n\"And then, sir, after that,\" he continued, \"I turned one half of my\\nburden into bread, and gave it to these poor people. Since then I have\\nnot been oppressed by its weight, however heavy it may have been; for\\nfew men have a heavier one. In fact, I gather more from day to day.\"\\n\\nAs the man kept speaking, he scattered his gold right and left with a\\ncheerful countenance, and the prince was about to reply, when suddenly a\\ngreat trembling under his feet made him fall to the ground. The refining\\nfires of the gold gatherers sprang up into flames, and then went out;\\nnight fell over everything on the earth, and nothing was visible in the\\nsky but the stars of the southern cross, which were glittering above\\nhim.\\n\\n\"It is past midnight,\" thought the prince, \"for the stars of the\\ncross begin to bend.\"\\n\\nHe raised himself upon his elbow, and tried to pierce the darkness,\\nbut could not. At length a slender blue flame darted out, as from ashes\\nin a chafing-dish, and by the light of it he saw the strange pattern of\\nhis carpet and the cushions lying about. 
He did not recognise them at\\nfirst, but presently he knew that he was lying in his usual place, at\\nthe top of his tower.\\n\\n\"Wake up, prince,\" said the old man.\\n\\nThe prince sat up and sighed, and the old man inquired what he had\\nseen.\\n\\n\"O man of much learning!\" answered the prince, \"I have seen that this\\nis a wonderful world; I have seen the value of labor, and I know the\\nuses of it; I have tasted the sweetness of liberty, and am grateful,\\nthough it was but in a dream; but as for that other word that was so\\ngreat a mystery to me, I only know this, that it must remain a mystery\\nforever, since I am fain to believe that all men are bent on getting it;\\nthough, once gotten, it causeth them endless disquietude, only second to\\ntheir discomfort that are without it. I am fain to believe that they can\\nprocure with it whatever they most desire, and yet that it cankers their\\nhearts and dazzles their eyes; that it is their nature and their duty to\\ngather it; and yet that, when once gathered, the best thing they can do\\nis to scatter it!\"\\n\\nAlas! the prince visited this wonderful world no more; for the next\\nmorning, when he awoke, the old man was gone. He had taken with him the\\ngolden cup which the prince had given him. And the sentinel was also\\ngone, none knew whither. Perhaps the old man had turned his golden cup\\ninto a golden key.\\n\\n203\\n\\nFew modern writers have given their readers more genuine delight than\\nFrank R. Stockton (1834-1902). The most absurd and illogical situations\\nand characters are presented with an air of such quiet sincerity that\\none refuses to question the reality of it all. Rudder\\nGrange established his reputation in 1879, and was followed by a\\nlong list of stories of delightfully impossible events. For several\\nyears Stockton was one of the editors of St. 
Nicholas, and\\nsome of his stories for children, of first quality in both form and\\ncontent, deserve to be better known than they are. Five of the best of\\nthem for school use have been brought together in a little volume called\\nFanciful Tales. One of these, \"Old Pipes and the Dryad,\" is\\ngiven here by permission of the publishers, Charles Scribner\\'s Sons, New\\nYork. (Copyright, 1894.) This story is based upon the old mythical\\nbelief that the trees are inhabited by guardian deities known as dryads,\\nor hamadryads. To injure a tree meant to injure its guardian spirit and\\nwas almost certain to insure 234 disaster for the guilty person.\\nOn the other hand, to protect a tree would bring some token of\\nappreciation from the dryad. A good introduction to the story would be\\nthe telling of one or two of these tree myths as found in Gayley\\'s\\nClassic Myths or Bulfinch\\'s Age of Fable. A\\nfine literary version of one of them is in Lowell\\'s \"Rhoecus.\" But the\\nbeautiful and kindly helpfulness of Old Pipes will carry its own message\\nwhether one knows any mythology or not.\\n\\nOLD PIPES AND THE DRYAD\\n\\nFRANK R. STOCKTON\\n\\nA Mountain brook ran through a little village. Over the brook there\\nwas a narrow bridge, and from the bridge a foot-path led out from the\\nvillage and up the hill-side, to the cottage of Old Pipes and his\\nmother.\\n\\nFor many, many years Old Pipes had been employed by the villagers to\\npipe the cattle down from the hills. Every afternoon, an hour before\\nsunset, he would sit on a rock in front of his cottage and play on his\\npipes. 
Then all the flocks and herds that were grazing on the mountains\\nwould hear him, wherever they might happen to be, and would come down to\\nthe village—the cows by the easiest paths, the sheep by those not quite\\nso easy, and the goats by the steep and rocky ways that were hardest of\\nall.\\n\\nBut now, for a year or more, Old Pipes had not piped the cattle home.\\nIt is true that every afternoon he sat upon the rock and played upon his\\npipes; but the cattle did not hear him. He had grown old, and his breath\\nwas feeble. The echoes of his cheerful notes, which used to come from\\nthe rocky hill on the other side of the valley, were heard no more; and\\ntwenty yards from Old Pipes one could scarcely tell what tune he was\\nplaying. He had become somewhat deaf, and did not know that the sound of\\nhis pipes was so thin and weak, and that the cattle did not hear him.\\nThe cows, the sheep, and the goats came down every afternoon as before;\\nbut this was because two boys and a girl were sent up after them. The\\nvillagers did not wish the good old man to know that his piping was no\\nlonger of any use; so they paid him his little salary every month, and\\nsaid nothing about the two boys and the girl.\\n\\nOld Pipes\\'s mother was, of course, a great deal older than he was,\\nand was as deaf as a gate—post, latch, hinges, and all—and she never\\nknew that the sound of her son\\'s pipe did not spread over all the\\nmountain-side and echo back strong and clear from the opposite hills.\\nShe was very fond of Old Pipes, and proud of his piping; and as he was\\nso much younger than she was, she never thought of him as being very\\nold. She cooked for him, and made his bed, and mended his clothes; and\\nthey lived very comfortably on his little salary.\\n\\nOne afternoon, at the end of the month, when Old Pipes had finished\\nhis piping, he took his stout staff and went down the hill to the\\nvillage to receive the money for his month\\'s work. 
The path seemed a\\ngreat deal steeper and more difficult than it used to be; and Old Pipes\\nthought that it must have been washed by the rains and greatly damaged.\\nHe remembered it as a path that was quite easy to traverse either up or\\ndown. But Old Pipes had been a very active man, and as his mother was so\\nmuch older than he was, he never thought of himself as aged and infirm.\\n235\\n\\nWhen the Chief Villager had paid him, and he had talked a little with\\nsome of his friends, Old Pipes started to go home. But when he had\\ncrossed the bridge over the brook, and gone a short distance up the\\nhill-side, he became very tired, and sat down upon a stone. He had not\\nbeen sitting there half a minute, when along came two boys and a\\ngirl.\\n\\n\"Children,\" said Old Pipes, \"I\\'m very tired to-night, and I don\\'t\\nbelieve I can climb up this steep path to my home. I think I shall have\\nto ask you to help me.\"\\n\\n\"We will do that,\" said the boys and the girl, quite cheerfully; and\\none boy took him by the right hand and the other by the left, while the\\ngirl pushed him in the back. In this way he went up the hill quite\\neasily, and soon reached his cottage door. Old Pipes gave each of the\\nthree children a copper coin, and then they sat down for a few minutes\\'\\nrest before starting back to the village.\\n\\n\"I\\'m sorry that I tired you so much,\" said Old Pipes.\\n\\n\"Oh, that would not have tired us,\" said one of the boys, \"if we had\\nnot been so far to-day after the cows, the sheep, and the goats. They\\nrambled high up on the mountain, and we never before had such a time in\\nfinding them.\"\\n\\n\"Had to go after the cows, the sheep, and the goats!\" exclaimed Old\\nPipes. 
\"What do you mean by that?\"\\n\\nThe girl, who stood behind the old man, shook her head, put her hand\\non her mouth, and made all sorts of signs to the boy to stop talking on\\nthis subject; but he did not notice her, and promptly answered Old\\nPipes.\\n\\n\"Why, you see, good sir,\" said he, \"that as the cattle can\\'t hear\\nyour pipes now, somebody has to go after them every evening to drive\\nthem down from the mountain, and the Chief Villager has hired us three\\nto do it. Generally it is not very hard work, but to-night the cattle\\nhad wandered far.\"\\n\\n\"How long have you been doing this?\" asked the old man.\\n\\nThe girl shook her head and clapped her hand on her mouth as before,\\nbut the boy went on.\\n\\n\"I think it is about a year now,\" he said, \"since the people first\\nfelt sure that the cattle could not hear your pipes; and from that time\\nwe\\'ve been driving them down. But we are rested now, and will go home.\\nGood-night, sir.\"\\n\\nThe three children then went down the hill, the girl scolding the boy\\nall the way home. Old Pipes stood silent a few moments, and then he went\\ninto his cottage.\\n\\n\"Mother,\" he shouted, \"did you hear what those children said?\"\\n\\n\"Children!\" exclaimed the old woman; \"I did not hear them. I did not\\nknow there were any children here.\"\\n\\nThen Old Pipes told his mother—shouting very loudly to make her\\nhear—how the two boys and the girl had helped him up the hill, and what\\nhe had heard about his piping and the cattle.\\n\\n\"They can\\'t hear you?\" cried his mother. \"Why, what\\'s the matter with\\nthe cattle?\"\\n\\n\"Ah, me!\" said Old Pipes; \"I don\\'t believe there\\'s anything the\\nmatter with the cattle. It must be with me and my pipes that there is\\nsomething the matter. But one thing is certain: if I do not earn the\\nwages the Chief Villager pays me, I shall not take them. 
I shall go\\nstraight down to the village and give back the money I received to-day.\"\\n236\\n\\n\"Nonsense!\" cried his mother. \"I\\'m sure you\\'ve piped as well as you\\ncould, and no more can be expected. And what are we to do without the\\nmoney?\"\\n\\n\"I don\\'t know,\" said Old Pipes; \"but I\\'m going down to the village to\\npay it back.\"\\n\\nThe sun had now set; but the moon was shining very brightly on the\\nhill-side, and Old Pipes could see his way very well. He did not take\\nthe same path by which he had gone before, but followed another, which\\nled among the trees upon the hill-side, and, though longer, was not so\\nsteep.\\n\\nWhen he had gone about half-way, the old man sat down to rest,\\nleaning his back against a great oak tree. As he did so, he heard a\\nsound like knocking inside the tree, and then a voice said:\\n\\n\"Let me out! let me out!\"\\n\\nOld Pipes instantly forgot that he was tired, and sprang to his feet.\\n\"This must be a Dryad tree!\" he exclaimed. \"If it is, I\\'ll let her\\nout.\"\\n\\nOld Pipes had never, to his knowledge, seen a Dryad tree, but he knew\\nthere were such trees on the hill-sides and the mountains, and that\\nDryads lived in them. He knew, too, that in the summer time, on those\\ndays when the moon rose before the sun went down, a Dryad could come out\\nof her tree if any one could find the key which locked her in, and turn\\nit. Old Pipes closely examined the trunk of the tree, which stood in the\\nfull moonlight. \"If I see that key,\" he said, \"I shall surely turn it.\"\\nBefore long he found a piece of bark standing out from the tree, which\\nlooked to him very much like the handle of a key. He took hold of it,\\nand found he could turn it quite around. 
As he did so, a large part of\\nthe side of the tree was pushed open, and a beautiful Dryad stepped quickly\\nout.\\n\\nTranscriber\\'s Note: original reads \\'Dyrad\\'\\n\\nFor a moment she stood motionless, gazing on the scene before her—the\\ntranquil valley, the hills, the forest, and the mountain-side, all lying\\nin the soft clear light of the moon. \"Oh, lovely! lovely!\" she\\nexclaimed. \"How long it is since I have seen anything like this!\" And\\nthen, turning to Old Pipes, she said: \"How good of you to let me out! I\\nam so happy, and so thankful, that I must kiss you, you dear old man!\"\\nAnd she threw her arms around the neck of Old Pipes, and kissed him on\\nboth cheeks.\\n\\n\"You don\\'t know,\" she then went on to say, \"how doleful it is to be\\nshut up so long in a tree. I don\\'t mind it in the winter, for then I am\\nglad to be sheltered, but in summer it is a rueful thing not to be able\\nto see all the beauties of the world. And it\\'s ever so long since I\\'ve\\nbeen let out. People so seldom come this way; and when they do come at\\nthe right time, they either don\\'t hear me or they are frightened and run\\naway. But you, you dear old man, you were not frightened, and you looked\\nand looked for the key, and you let me out; and now I shall not have to\\ngo back till winter has come, and the air grows cold. Oh, it is\\nglorious! What can I do for you, to show you how grateful I am?\"\\n\\n\"I am very glad,\" said Old Pipes, \"that I let you out, since I see\\nthat it makes you so happy; but I must admit that I tried to find the\\nkey because I had a great desire to see a Dryad. But, if you wish to do\\nsomething for me, you can, if you happen to be going down toward the\\nvillage.\"\\n\\n\"To the village!\" exclaimed the Dryad. 
237 \"I will go anywhere for you, my\\nkind old benefactor.\"\\n\\n\"Well, then,\" said Old Pipes, \"I wish you would take this little bag\\nof money to the Chief Villager and tell him that Old Pipes cannot\\nreceive pay for the services which he does not perform. It is now more\\nthan a year that I have not been able to make the cattle hear me, when I\\npiped to call them home. I did not know this until to-night; but now\\nthat I know it, I cannot keep the money, and so I send it back.\" And,\\nhanding the little bag to the Dryad, he bade her good-night, and turned\\ntoward his cottage.\\n\\n\"Good-night,\" said the Dryad. \"And I thank you over, and over, and\\nover again, you good old man!\"\\n\\nOld Pipes walked toward his home, very glad to be saved the fatigue\\nof going all the way down to the village and back again. \"To be sure,\"\\nhe said to himself, \"this path does not seem at all steep, and I can\\nwalk along it very easily; but it would have tired me dreadfully to come\\nup all the way from the village, especially as I could not have expected\\nthose children to help me again.\" When he reached home his mother was\\nsurprised to see him returning so soon.\\n\\n\"What!\" she exclaimed; \"have you already come back? What did the\\nChief Villager say? Did he take the money?\"\\n\\nOld Pipes was just about to tell her that he had sent the money to\\nthe village by a Dryad, when he suddenly reflected that his mother would\\nbe sure to disapprove such a proceeding, and so he merely said he had\\nsent it by a person whom he had met.\\n\\n\"And how do you know that the person will ever take it to the Chief\\nVillager?\" cried his mother. \"You will lose it, and the villagers will\\nnever get it. Oh, Pipes! Pipes! 
when will you be old enough to have\\nordinary common-sense?\"\\n\\nOld Pipes considered that, as he was already seventy years of age, he\\ncould scarcely expect to grow any wiser; but he made no remark on this\\nsubject, and, saying that he doubted not that the money would go safely\\nto its destination, he sat down to his supper. His mother scolded him\\nroundly, but he did not mind it; and after supper he went out and sat on\\na rustic chair in front of the cottage to look at the moonlit village,\\nand to wonder whether or not the Chief Villager really received the\\nmoney. While he was doing these two things, he went fast asleep.\\n\\nWhen Old Pipes left the Dryad, she did not go down to the village\\nwith the little bag of money. She held it in her hand, and thought about\\nwhat she had heard. \"This is a good and honest old man,\" she said; \"and\\nit is a shame that he should lose this money. He looked as if he needed\\nit, and I don\\'t believe the people in the village will take it from one\\nwho has served them so long. Often, when in my tree, have I heard the\\nsweet notes of his pipes. I am going to take the money back to him.\" She\\ndid not start immediately, because there were so many beautiful things\\nto look at; but after awhile she went up to the cottage, and, finding\\nOld Pipes asleep in his chair, she slipped the little bag into his\\ncoat-pocket, and silently sped away.\\n\\nThe next day Old Pipes told his mother that he would go up the\\nmountain and cut some wood. He had a right to get wood from the\\nmountain, but for a long time he had been content to pick up the dead\\nbranches which lay about his cottage. To-day, however, he felt so strong\\nand vigorous that he thought he would 238 go and cut some fuel that would\\nbe better than this. 
He worked all the morning, and when he came back he\\ndid not feel at all tired, and he had a very good appetite for his\\ndinner.\\n\\nNow, Old Pipes knew a good deal about Dryads; but there was one thing\\nwhich, although he had heard, he had forgotten. This was, that a kiss\\nfrom a Dryad made a person ten years younger.\\n\\nThe people of the village knew this, and they were very careful not\\nto let any child of ten years or younger go into the woods where the\\nDryads were supposed to be; for, if they should chance to be kissed by\\none of these tree-nymphs, they would be set back so far that they would\\ncease to exist.\\n\\nA story was told in the village that a very bad boy of eleven once\\nran away into the woods, and had an adventure of this kind; and when his\\nmother found him he was a little baby of one year old. Taking advantage\\nof her opportunity, she brought him up more carefully than she had done\\nbefore, and he grew to be a very good boy indeed.\\n\\nNow Old Pipes had been kissed twice by the Dryad, once on each cheek,\\nand he therefore felt as vigorous and active as when he was a hale man\\nof fifty. His mother noticed how much work he was doing, and told him\\nthat he need not try in that way to make up for the loss of his piping\\nwages; for he would only tire himself out, and get sick. But her son\\nanswered that he had not felt so well for years, and that he was quite\\nable to work.\\n\\nIn the course of the afternoon, Old Pipes, for the first time that\\nday, put his hand in his coat-pocket, and there, to his amazement, he\\nfound the little bag of money. \"Well, well!\" he exclaimed, \"I am stupid,\\nindeed! I really thought that I had seen a Dryad; but when I sat down by\\nthat big oak tree I must have gone to sleep and dreamed it all; and then\\nI came home, thinking I had given the money to a Dryad, when it was in\\nmy pocket all the time. But the Chief Villager shall have the money. 
I\\nshall not take it to him to-day, but to-morrow I wish to go to the\\nvillage to see some of my old friends; and then I shall give up the\\nmoney.\"\\n\\nToward the close of the afternoon, Old Pipes, as had been his custom\\nfor so many years, took his pipes from the shelf on which they lay, and\\nwent out to the rock in front of the cottage.\\n\\n\"What are you going to do?\" cried his mother. \"If you will not\\nconsent to be paid, why do you pipe?\"\\n\\n\"I am going to pipe for my own pleasure,\" said her son. \"I am used to\\nit, and I do not wish to give it up. It does not matter now whether the\\ncattle hear me or not, and I am sure that my piping will injure no\\none.\"\\n\\nWhen the good man began to play upon his favorite instrument he was\\nastonished at the sound that came from it. The beautiful notes of the\\npipes sounded clear and strong down into the valley, and spread over the\\nhills, and up the sides of the mountain beyond, while, after a little\\ninterval, an echo came back from the rocky hill on the other side of the\\nvalley.\\n\\n\"Ha! ha!\" he cried, \"what has happened to my pipes? They must have\\nbeen stopped up of late, but now they are as clear and good as\\never.\"\\n\\nAgain the merry notes went sounding far and wide. The cattle on the\\nmountain heard them, and those that were old enough remembered how these\\nnotes had called them from their pastures 239 every evening, and so they\\nstarted down the mountain-side, the others following.\\n\\nThe merry notes were heard in the village below, and the people were\\nmuch astonished thereby. \"Why, who can be blowing the pipes of Old\\nPipes?\" they said. But, as they were all very busy, no one went up to\\nsee. One thing, however, was plain enough: the cattle were coming down\\nthe mountain. 
And so the two boys and the girl did not have to go after\\nthem, and had an hour for play, for which they were very glad.\\n\\nThe next morning Old Pipes started down to the village with his\\nmoney, and on the way he met the Dryad. \"Oh, ho!\" he cried, \"is that\\nyou? Why, I thought my letting you out of the tree was nothing but a\\ndream.\"\\n\\n\"A dream!\" cried the Dryad; \"if you only knew how happy you have made\\nme, you would not think it merely a dream. And has it not benefited you?\\nDo you not feel happier? Yesterday I heard you playing beautifully on\\nyour pipes.\"\\n\\n\"Yes, yes,\" cried he. \"I did not understand it before, but I see it\\nall now. I have really grown younger. I thank you, I thank you, good\\nDryad, from the bottom of my heart. It was the finding of the money in\\nmy pocket that made me think it was a dream.\"\\n\\n\"Oh, I put it in when you were asleep,\" she said, laughing, \"because\\nI thought you ought to keep it. Good-by, kind, honest man. May you live\\nlong, and be as happy as I am now.\"\\n\\nOld Pipes was greatly delighted when he understood that he was really\\na younger man; but that made no difference about the money, and he kept\\non his way to the village. As soon as he reached it, he was eagerly\\nquestioned as to who had been playing his pipes the evening before, and\\nwhen the people heard that it was himself they were very much surprised.\\nThereupon Old Pipes told what had happened to him, and then there was\\ngreater wonder, with hearty congratulations and hand-shakes; for Old\\nPipes was liked by everyone. 
The Chief Villager refused to take his\\nmoney; and although Old Pipes said that he had not earned it, everyone\\npresent insisted that, as he would now play on his pipes as before, he\\nshould lose nothing because, for a time, he was unable to perform his\\nduty.\\n\\nSo Old Pipes was obliged to keep his money, and after an hour or two\\nspent in conversation with his friends he returned to his cottage.\\n\\nThere was one person, however, who was not pleased with what had\\nhappened to Old Pipes. This was an Echo-dwarf who lived on the hills\\nacross the valley. It was his work to echo back the notes of the pipes\\nwhenever they could be heard.\\n\\nA great many other Echo-dwarfs lived on these hills. They all worked,\\nbut in different ways. Some echoed back the songs of maidens, some the\\nshouts of children, and others the music that was often heard in the\\nvillage. But there was only one who could send back the strong notes of\\nthe pipes of Old Pipes, and this had been his sole duty for many years.\\nBut when the old man grew feeble, and the notes of his pipes could not\\nbe heard on the opposite hills, this Echo-dwarf had nothing to do, and\\nhe spent his time in delightful idleness; and he slept so much and grew\\nso fat that it made his companions laugh to see him walk. 240\\n\\nOn the afternoon on which, after so long an interval, the sound of\\nthe pipes was heard on the echo hills, this dwarf was fast asleep behind\\na rock. As soon as the first notes reached them, some of his companions\\nran to wake him up. Rolling to his feet, he echoed back the merry tune\\nof Old Pipes.\\n\\nNaturally, he was very angry at being thus obliged to give up his\\nlife of comfort, and he hoped very much that this pipe-playing would not\\noccur again. The next afternoon he was awake and listening, and, sure\\nenough, at the usual hour, along came the notes of the pipes as clear\\nand strong as they ever had been; and he was obliged to work as long as\\nOld Pipes played. 
The Echo-dwarf was very angry. He had supposed, of\\ncourse, that the pipe-playing had ceased forever, and he felt that he\\nhad a right to be indignant at being thus deceived. He was so much\\ndisturbed that he made up his mind to go and try to find out how long\\nthis was to last. He had plenty of time, as the pipes were played but\\nonce a day, and he set off early in the morning for the hill on which\\nOld Pipes lived. It was hard work for the fat little fellow, and when he\\nhad crossed the valley and had gone some distance into the woods on the\\nhill-side, he stopped to rest, and in a few minutes the Dryad came\\ntripping along.\\n\\n\"Ho, ho!\" exclaimed the dwarf; \"what are you doing here? and how did\\nyou get out of your tree?\"\\n\\n\"Doing!\" cried the Dryad; \"I am being happy; that\\'s what I am doing.\\nAnd I was let out of my tree by the good old man who plays the pipes to\\ncall the cattle down from the mountain. And it makes me happier to think\\nthat I have been of service to him. I gave him two kisses of gratitude,\\nand now he is young enough to play his pipes as well as ever.\"\\n\\nThe Echo-dwarf stepped forward, his face pale with passion. \"Am I to\\nbelieve,\" he said, \"that you are the cause of this great evil that has\\ncome upon me? and that you are the wicked creature who has again started\\nthis old man upon his career of pipe-playing? What have I ever done to\\nyou that you should have condemned me for years and years to echo back\\nthe notes of those wretched pipes?\"\\n\\nAt this the Dryad laughed loudly.\\n\\n\"What a funny little fellow you are!\" she said. \"Anyone would think\\nyou had been condemned to toil from morning till night; while what you\\nreally have to do is merely to imitate for half an hour every day the\\nmerry notes of Old Pipes\\'s piping. Fie upon you, Echo-dwarf! You are\\nlazy and selfish; and that is what is the matter with you. 
Instead of\\ngrumbling at being obliged to do a little wholesome work, which is less,\\nI am sure, than that of any other echo-dwarf upon the rocky hill-side,\\nyou should rejoice at the good fortune of the old man who has regained\\nso much of his strength and vigor. Go home and learn to be just and\\ngenerous; and then, perhaps, you may be happy. Good-by.\"\\n\\n\"Insolent creature!\" shouted the dwarf, as he shook his fat little\\nfist at her. \"I\\'ll make you suffer for this. You shall find out what it\\nis to heap injury and insult upon one like me, and to snatch from him\\nthe repose that he has earned by long years of toil.\" And, shaking his\\nhead savagely, he hurried back to the rocky hill-side.\\n\\nEvery afternoon the merry notes of 241 the pipes of Old Pipes sounded\\ndown into the valley and over the hills and up the mountain-side; and\\nevery afternoon when he had echoed them back, the little dwarf grew more\\nand more angry with the Dryad. Each day, from early morning till it was\\ntime for him to go back to his duties upon the rocky hill-side, he\\nsearched the woods for her. He intended, if he met her, to pretend to be\\nvery sorry for what he had said, and he thought he might be able to play\\na trick upon her which would avenge him well.\\n\\nOne day, while thus wandering among the trees, he met Old Pipes. The\\nEcho-dwarf did not generally care to see or speak to ordinary people;\\nbut now he was so anxious to find the object of his search, that he\\nstopped and asked Old Pipes if he had seen the Dryad. 
The piper had not\\nnoticed the little fellow, and he looked down on him with some\\nsurprise.\\n\\n\"No,\" he said; \"I have not seen her, and I have been looking\\neverywhere for her.\"\\n\\n\"You!\" cried the dwarf, \"what do you wish with her?\"\\n\\nOld Pipes then sat down on a stone, so that he should be nearer the\\near of his small companion, and he told what the Dryad had done for\\nhim.\\n\\nWhen the Echo-dwarf heard that this was the man whose pipes he was\\nobliged to echo back every day, he would have slain him on the spot, had\\nhe been able; but, as he was not able, he merely ground his teeth and\\nlistened to the rest of the story.\\n\\n\"I am looking for the Dryad now,\" Old Pipes continued, \"on account of\\nmy aged mother. When I was old myself, I did not notice how very old my\\nmother was; but now it shocks me to see how feeble her years have caused\\nher to become; and I am looking for the Dryad to ask her to make my\\nmother younger, as she made me.\"\\n\\nThe eyes of the Echo-dwarf glistened. Here was a man who might help\\nhim in his plans.\\n\\n\"Your idea is a good one,\" he said to Old Pipes, \"and it does you\\nhonor. But you should know that a Dryad can make no person younger but\\none who lets her out of her tree. However, you can manage the affair\\nvery easily. All you need do is to find the Dryad, tell her what you\\nwant, and request her to step into her tree and be shut up for a short\\ntime. Then you will go and bring your mother to the tree; she will open\\nit, and everything will be as you wish. Is not this a good plan?\"\\n\\n\"Excellent!\" cried Old Pipes; \"and I will go instantly and search\\nmore diligently for the Dryad.\"\\n\\n\"Take me with you,\" said the Echo-dwarf. 
\"You can easily carry me on\\nyour strong shoulders; and I shall be glad to help you in any way that I\\ncan.\"\\n\\n\"Now then,\" said the little fellow to himself, as Old Pipes carried\\nhim rapidly along, \"if he persuades the Dryad to get into a tree,—and\\nshe is quite foolish enough to do it,—and then goes away to bring his\\nmother, I shall take a stone or a club and I will break off the key of\\nthat tree, so that nobody can ever turn it again. Then Mistress Dryad\\nwill see what she has brought upon herself by her behavior to me.\"\\n\\nBefore long they came to the great oak tree in which the Dryad had\\nlived, and at a distance they saw that beautiful creature herself coming\\ntoward them.\\n\\n\"How excellently well everything happens!\" said the dwarf. \"Put me\\ndown, 242 and I will go. Your business with the Dryad is\\nmore important than mine; and you need not say anything about my having\\nsuggested your plan to you. I am willing that you should have all the\\ncredit of it yourself.\"\\n\\nOld Pipes put the Echo-dwarf upon the ground, but the little rogue\\ndid not go away. He hid himself between some low, mossy rocks, and he\\nwas so much like them in color that you would not have noticed him if\\nyou had been looking straight at him.\\n\\nWhen the Dryad came up, Old Pipes lost no time in telling her about\\nhis mother, and what he wished her to do. At first, the Dryad answered\\nnothing, but stood looking very sadly at Old Pipes.\\n\\n\"Do you really wish me to go into my tree again?\" she said. \"I should\\ndreadfully dislike to do it, for I don\\'t know what might happen. It is\\nnot at all necessary, for I could make your mother younger at any time\\nif she would give me the opportunity. I had already thought of making\\nyou still happier in this way, and several times I have waited about\\nyour cottage, hoping to meet your aged mother, but she never comes\\noutside, and you know a Dryad cannot enter a house. 
I cannot imagine\\nwhat put this idea into your head. Did you think of it yourself?\"\\n\\n\"No, I cannot say that I did,\" answered Old Pipes. \"A little dwarf\\nwhom I met in the woods proposed it to me.\"\\n\\n\"Oh!\" cried the Dryad; \"now I see through it all. It is the scheme of\\nthat vile Echo-dwarf—your enemy and mine. Where is he? I should like to\\nsee him.\"\\n\\n\"I think he has gone away,\" said Old Pipes.\\n\\n\"No, he has not,\" said the Dryad, whose quick eyes perceived the\\nEcho-dwarf among the rocks, \"there he is. Seize him and drag him out, I\\nbeg of you.\"\\n\\nOld Pipes saw the dwarf as soon as he was pointed out to him; and\\nrunning to the rocks, he caught the little fellow by the arm and pulled\\nhim out.\\n\\n\"Now, then,\" cried the Dryad, who had opened the door of the great\\noak, \"just stick him in there, and we will shut him up. Then I shall be\\nsafe from his mischief for the rest of the time I am free.\"\\n\\nOld Pipes thrust the Echo-dwarf into the tree; the Dryad pushed the\\ndoor shut; there was a clicking sound of bark and wood, and no one would\\nhave noticed that the big oak had ever had an opening in it.\\n\\n\"There,\" said the Dryad; \"now we need not be afraid of him. And I\\nassure you, my good piper, that I shall be very glad to make your mother\\nyounger as soon as I can. Will you not ask her to come out and meet\\nme?\"\\n\\n\"Of course I will,\" cried Old Pipes; \"and I will do it without\\ndelay.\"\\n\\nAnd then, the Dryad by his side, he hurried to his cottage. But when\\nhe mentioned the matter to his mother, the old woman became very angry\\nindeed. She did not believe in Dryads; and, if they really did exist,\\nshe knew they must be witches and sorceresses, and she would have\\nnothing to do with them. If her son had ever allowed himself to be\\nkissed by one of them, he ought to be ashamed of himself. As to its\\ndoing him the least bit of good, she did not believe a word of it. 
He\\nfelt better than he used to feel, but that was very common. She had\\nsometimes felt that way herself, and she forbade him ever to mention a\\nDryad to her again.\\n\\nThat afternoon, Old Pipes, feeling 243 very sad that his plan in\\nregard to his mother had failed, sat down upon the rock and played upon\\nhis pipes. The pleasant sounds went down the valley and up the hills and\\nmountain, but, to the great surprise of some persons who happened to\\nnotice the fact, the notes were not echoed back from the rocky\\nhill-side, but from the woods on the side of the valley on which Old\\nPipes lived. The next day many of the villagers stopped in their work to\\nlisten to the echo of the pipes coming from the woods. The sound was not\\nas clear and strong as it used to be when it was sent back from the\\nrocky hill-side, but it certainly came from among the trees. Such a\\nthing as an echo changing its place in this way had never been heard of\\nbefore, and nobody was able to explain how it could have happened. Old\\nPipes, however, knew very well that the sound came from the Echo-dwarf\\nshut up in the great oak tree. The sides of the tree were thin, and the\\nsound of the pipes could be heard through them, and the dwarf was\\nobliged by the laws of his being to echo back those notes whenever they\\ncame to him. But Old Pipes thought he might get the Dryad in trouble if\\nhe let anyone know that the Echo-dwarf was shut up in the tree, and so\\nhe wisely said nothing about it.\\n\\nOne day the two boys and the girl who had helped Old Pipes up the\\nhill were playing in the woods. Stopping near the great oak tree, they\\nheard a sound of knocking within it, and then a voice plainly said:\\n\\n\"Let me out! let me out!\"\\n\\nFor a moment the children stood still in astonishment, and then one\\nof the boys exclaimed:\\n\\n\"Oh, it is a Dryad, like the one Old Pipes found! Let\\'s let her\\nout!\"\\n\\n\"What are you thinking of?\" cried the girl. 
\"I am the oldest of all,\\nand I am only thirteen. Do you wish to be turned into crawling babies?\\nRun! run! run!\"\\n\\nAnd the two boys and the girl dashed down into the valley as fast as\\ntheir legs could carry them. There was no desire in their youthful\\nhearts to be made younger than they were, and for fear that their\\nparents might think it well that they should commence their careers\\nanew, they never said a word about finding the Dryad tree.\\n\\nAs the summer days went on, Old Pipes\\'s mother grew feebler and\\nfeebler. One day when her son was away, for he now frequently went into\\nthe woods to hunt or fish, or down into the valley to work, she arose\\nfrom her knitting to prepare the simple dinner. But she felt so weak and\\ntired that she was not able to do the work to which she had been so long\\naccustomed. \"Alas! alas!\" she said, \"the time has come when I am too old\\nto work. My son will have to hire some one to come here and cook his\\nmeals, make his bed, and mend his clothes. Alas! alas! I had hoped that\\nas long as I lived I should be able to do these things. But it is not\\nso. I have grown utterly worthless, and some one else must prepare the\\ndinner for my son. I wonder where he is.\" And tottering to the door, she\\nwent outside to look for him. She did not feel able to stand, and\\nreaching the rustic chair, she sank into it, quite exhausted, and soon\\nfell asleep.\\n\\nThe Dryad, who had often come to the cottage to see if she could find\\nan opportunity of carrying out Old Pipes\\'s affectionate design, now\\nhappened by; 244 and seeing that the much-desired occasion had\\ncome, she stepped up quietly behind the old woman and gently kissed her\\non each cheek, and then as quietly disappeared.\\n\\nIn a few minutes the mother of Old Pipes awoke, and looking up at the\\nsun, she exclaimed: \"Why, it is almost dinner-time! 
My son will be here\\ndirectly, and I am not ready for him.\" And rising to her feet, she\\nhurried into the house, made the fire, set the meat and vegetables to\\ncook, laid the cloth, and by the time her son arrived the meal was on\\nthe table.\\n\\n\"How a little sleep does refresh one,\" she said to herself, as she\\nwas bustling about. She was a woman of very vigorous constitution, and\\nat seventy had been a great deal stronger and more active than her son\\nwas at that age. The moment Old Pipes saw his mother, he knew that the\\nDryad had been there; but, while he felt as happy as a king, he was too\\nwise to say anything about her.\\n\\n\"It is astonishing how well I feel to-day,\" said his mother; \"and\\neither my hearing has improved or you speak much more plainly than you\\nhave done of late.\"\\n\\nThe summer days went on and passed away, the leaves were falling from\\nthe trees, and the air was becoming cold.\\n\\n\"Nature has ceased to be lovely,\" said the Dryad, \"and the night\\nwinds chill me. It is time for me to go back into my comfortable\\nquarters in the great oak. But first I must pay another visit to the\\ncottage of Old Pipes.\"\\n\\nShe found the piper and his mother sitting side by side on the rock\\nin front of the door. The cattle were not to go to the mountain any more\\nthat season, and he was piping them down for the last time. 
Loud and\\nmerrily sounded the pipes of Old Pipes, and down the mountain-side came\\nthe cattle, the cows by the easiest paths, the sheep by those not quite\\nso easy, and the goats by the most difficult ones among the rocks; while\\nfrom the great oak tree were heard the echoes of the cheerful music.\\n\\n\"How happy they look, sitting there together,\" said the Dryad; \"and I\\ndon\\'t believe it will do them a bit of harm to be still younger.\" And\\nmoving quietly up behind them, she first kissed Old Pipes on his cheek\\nand then kissed his mother.\\n\\nOld Pipes, who had stopped playing, knew what it was, but he did not\\nmove, and said nothing. His mother, thinking that her son had kissed\\nher, turned to him with a smile and kissed him in return. And then she\\narose and went into the cottage, a vigorous woman of sixty, followed by\\nher son, erect and happy, and twenty years younger than herself.\\n\\nThe Dryad sped away to the woods, shrugging her shoulders as she felt\\nthe cool evening wind.\\n\\nWhen she reached the great oak, she turned the key and opened the\\ndoor. \"Come out,\" said she to the Echo-dwarf, who sat blinking within.\\n\"Winter is coming on, and I want the comfortable shelter of my tree for\\nmyself. The cattle have come down from the mountain for the last time\\nthis year, the pipes will no longer sound, and you can go to your rocks\\nand have a holiday until next spring.\"\\n\\nUpon hearing these words the dwarf skipped quickly out, and the Dryad\\nentered the tree and pulled the door shut after her. \"Now, then,\" she\\nsaid to herself, \"he can break off the key if he likes. It does not\\nmatter to me. 245 Another will grow out next spring. And although\\nthe good piper made me no promise, I know that when the warm days arrive\\nnext year, he will come and let me out again.\"\\n\\nThe Echo-dwarf did not stop to break the key of the tree. 
He was too\\nhappy to be released to think of anything else, and he hastened as fast\\nas he could to his home on the rocky hill-side.\\n\\nThe Dryad was not mistaken when she trusted in the piper. When the\\nwarm days came again he went to the oak tree to let her out. But, to his\\nsorrow and surprise, he found the great tree lying upon the ground. A\\nwinter storm had blown it down, and it lay with its trunk shattered and\\nsplit. And what became of the Dryad no one ever knew.\\n\\n204\\n\\nJohn Ruskin (1819-1900), the most eloquent of English prose writers,\\nwas much interested in the question of literature for both grown-ups and\\nchildren. He edited a reissue of Taylor\\'s translation of Grimms\\'\\nPopular Stories, issued \"Dame Wiggins of Lee and Her Seven\\nWonderful Cats\" (see No. 143), and wrote that masterpiece among modern\\nstories for children, The King of the Golden River. Its\\nfine idealism, splendidly imagined structure, wonderful word-paintings,\\nand perfect English all combine to justify the high place assigned to\\nit. Ruskin wrote the story in 1841, at a \"couple of sittings,\" though it\\nwas not published until ten years later. Speaking of it later in life,\\nhe said that it \"was written to amuse a little girl; and being a fairly\\ngood imitation of Grimm and Dickens, mixed with a little true Alpine\\nfeeling of my own, it has been rightly pleasing to nice children, and\\ngood for them. But it is totally valueless, for all that. I can no more\\nwrite a story than compose a picture.\" The final statement may be taken\\nfor what it is worth, written as it was at a time of disillusionment.\\nThe first part of Ruskin\\'s analysis is certainly true and has been thus\\nexpanded by his biographer, Sir E. T. Cook: \"The grotesque and the\\nGerman setting of the tale were taken from Grimm; from Dickens it took\\nits tone of pervading kindliness and geniality. 
The Alpine ecstasy and\\nthe eager pressing of the moral were Ruskin\\'s own; and so also is the\\nstyle, delicately poised between poetry and comedy.\"\\n\\nTHE KING OF THE GOLDEN RIVER\\n\\nOR\\n\\nTHE BLACK BROTHERS\\n\\nJOHN RUSKIN\\n\\nCHAPTER I\\n\\nHOW THE AGRICULTURAL SYSTEM OF THE BLACK BROTHERS WAS INTERFERED WITH BY\\nSOUTH-WEST WIND, ESQUIRE\\n\\nIn a secluded and mountainous part of Stiria there was, in old time,\\na valley of the most surprising and luxuriant fertility. It was\\nsurrounded, on all sides, by steep and rocky mountains, rising into\\npeaks, which were always covered with snow, and from which a number of\\ntorrents descended in constant cataracts. One of these fell westward,\\nover the face of a crag so high, that, when the sun had set to\\neverything else, and all below was darkness, his beams still shone full\\nupon this waterfall, so that it looked like a shower of gold. It was,\\ntherefore, called by the people of the neighborhood, the Golden River.\\nIt was strange that none of these streams fell into the valley itself.\\nThey all descended on the other side of the mountains, and wound away\\nthrough broad plains and by populous cities. But the clouds were drawn\\nso constantly to the snowy hills, and rested so softly 246 in the\\ncircular hollow, that in time of drought and heat, when all the country\\nround was burnt up, there was still rain in the little valley; and its\\ncrops were so heavy, and its hay so high, and its apples so red, and its\\ngrapes so blue, and its wine so rich, and its honey so sweet, that it\\nwas a marvel to every one who beheld it, and was commonly called the\\nTreasure Valley.\\n\\nThe whole of this little valley belonged to three brothers, called\\nSchwartz, Hans, and Gluck. Schwartz and Hans, the two elder brothers,\\nwere very ugly men, with overhanging eyebrows and small dull eyes, which\\nwere always half shut, so that you couldn\\'t see into them, and\\nalways fancied they saw very far into you. 
They lived by\\nfarming the Treasure Valley, and very good farmers they were. They\\nkilled everything that did not pay for its eating. They shot the\\nblackbirds because they pecked the fruit; and killed the hedgehogs, lest\\nthey should suck the cows; they poisoned the crickets for eating the\\ncrumbs in the kitchen; and smothered the cicadas, which used to sing all\\nsummer in the lime trees. They worked their servants without any wages,\\ntill they would not work any more, and then quarreled with them, and\\nturned them out of doors without paying them. It would have been very\\nodd if, with such a farm, and such a system of farming, they hadn\\'t got\\nvery rich; and very rich they did get. They generally contrived\\nto keep their corn by them till it was very dear, and then sell it for\\ntwice its value; they had heaps of gold lying about on their floors, yet\\nit was never known that they had given so much as a penny or a crust in\\ncharity; they never went to mass; grumbled perpetually at paying tithes;\\nand were, in a word, of so cruel and grinding a temper as to receive\\nfrom all those with whom they had any dealings the nickname of the\\n\"Black Brothers.\"\\n\\nThe youngest brother, Gluck, was as completely opposed, in both\\nappearance and character, to his seniors as could possibly be imagined\\nor desired. He was not above twelve years old, fair, blue-eyed, and kind\\nin temper to every living thing. He did not, of course, agree\\nparticularly well with his brothers, or rather, they did not agree with\\nhim. He was usually appointed to the honorable office of\\nturnspit, when there was anything to roast, which was not often; for, to\\ndo the brothers justice, they were hardly less sparing upon themselves\\nthan upon other people. 
At other times he used to clean the shoes,\\nfloors, and sometimes the plates, occasionally getting what was left on\\nthem, by way of encouragement, and a wholesome quantity of dry blows, by\\nway of education.\\n\\nThings went on in this manner for a long time. At last came a very\\nwet summer, and everything went wrong in the country around. The hay had\\nhardly been got in, when the haystacks were floated bodily down to the\\nsea by an inundation; the vines were cut to pieces with the hail; the\\ncorn was all killed by a black blight; only in the Treasure Valley, as\\nusual, all was safe. As it had rain when there was rain nowhere else, so\\nit had sun when there was sun nowhere else. Everybody came to buy corn\\nat the farm, and went away pouring maledictions on the Black Brothers.\\nThey asked what they liked, and got it, except from the poor people, who\\ncould only beg, and several of whom were starved at their very door,\\nwithout the slightest regard or notice. 247\\n\\nIt was drawing towards winter, and very cold weather, when one day\\nthe two elder brothers had gone out, with their usual warning to little\\nGluck, who was left to mind the roast, that he was to let nobody in, and\\ngive nothing out. Gluck sat down quite close to the fire, for it was\\nraining very hard, and the kitchen walls were by no means dry or\\ncomfortable looking. He turned and turned, and the roast got nice and\\nbrown. \"What a pity,\" thought Gluck, \"my brothers never ask anybody to\\ndinner. 
I\\'m sure, when they\\'ve got such a nice piece of mutton as this,\\nand nobody else has got so much as a piece of dry bread, it would do\\ntheir hearts good to have somebody to eat it with them.\"\\n\\nJust as he spoke, there came a double knock at the house door, yet\\nheavy and dull, as though the knocker had been tied up—more like a puff\\nthan a knock.\\n\\n\"It must be the wind,\" said Gluck; \"nobody else would venture to\\nknock double knocks at our door.\"\\n\\nNo; it wasn\\'t the wind; there it came again very hard, and what was\\nparticularly astounding, the knocker seemed to be in a hurry, and not to\\nbe in the least afraid of the consequences. Gluck went to the window,\\nopened it, and put his head out to see who it was.\\n\\nIt was the most extraordinary looking little gentleman he had ever\\nseen in his life. He had a very large nose, slightly brass-colored; his\\ncheeks were very round, and very red, and might have warranted a\\nsupposition that he had been blowing a refractory fire for the last\\neight-and-forty hours; his eyes twinkled merrily through long silky\\neyelashes, his mustaches curled twice round like a corkscrew on each\\nside of his mouth, and his hair, of a curious mixed pepper-and-salt\\ncolor, descended far over his shoulders. He was about four-feet-six in\\nheight, and wore a conical pointed cap of nearly the same altitude,\\ndecorated with a black feather some three feet long. 
His doublet was\\nprolonged behind into something resembling a violent exaggeration of\\nwhat is now termed a \"swallowtail,\" but was much obscured by the\\nswelling folds of an enormous black, glossy-looking cloak, which must\\nhave been very much too long in calm weather, as the wind, whistling\\nround the old house, carried it clear out from the wearer\\'s shoulders to\\nabout four times his own length.\\n\\nGluck was so perfectly paralyzed by the singular appearance of his\\nvisitor, that he remained fixed without uttering a word, until the old\\ngentleman, having performed another, and a more energetic concerto on\\nthe knocker, turned round to look after his fly-away cloak. In so doing\\nhe caught sight of Gluck\\'s little yellow head jammed in the window, with\\nits mouth and eyes very wide open indeed.\\n\\n\"Hollo!\" said the little gentleman, \"that\\'s not the way to answer the\\ndoor: I\\'m wet; let me in!\"\\n\\nTo do the little gentleman justice, he was wet. His feather\\nhung down between his legs like a beaten puppy\\'s tail, dripping like an\\numbrella; and from the ends of his mustaches the water was running into\\nhis waistcoat pockets, and out again like a mill stream.\\n\\n\"I beg pardon, sir,\" said Gluck, \"I\\'m very sorry, but I really\\ncan\\'t.\"\\n\\n\"Can\\'t what?\" said the old gentleman.\\n\\n\"I can\\'t let you in, sir,—I can\\'t indeed; my brothers would beat me\\nto death, sir, if I thought of such a thing. What do you want, sir?\"\\n248\\n\\n\"Want?\" said the old gentleman, petulantly. \"I want fire, and\\nshelter; and there\\'s your great fire there blazing, crackling, and\\ndancing on the walls, with nobody to feel it. 
Let me in, I say; I only\\nwant to warm myself.\"\\n\\nGluck had had his head, by this time, so long out of the window, that\\nhe began to feel it was really unpleasantly cold, and when he turned,\\nand saw the beautiful fire rustling and roaring, and throwing long\\nbright tongues up the chimney, as if it were licking its chops at the\\nsavory smell of the leg of mutton, his heart melted within him that it\\nshould be burning away for nothing. \"He does look very wet,\"\\nsaid little Gluck; \"I\\'ll just let him in for a quarter of an hour.\"\\nRound he went to the door, and opened it; and as the little gentleman\\nwalked in, there came a gust of wind through the house that made the old\\nchimneys totter.\\n\\n\"That\\'s a good boy,\" said the little gentleman. \"Never mind your\\nbrothers. I\\'ll talk to them.\"\\n\\n\"Pray, sir, don\\'t do any such thing,\" said Gluck. \"I can\\'t let you\\nstay till they come; they\\'d be the death of me.\"\\n\\n\"Dear me,\" said the old gentleman, \"I\\'m very sorry to hear that. How\\nlong may I stay?\"\\n\\n\"Only till the mutton\\'s done, sir,\" replied Gluck, \"and it\\'s very\\nbrown.\"\\n\\nThen the old gentleman walked into the kitchen, and sat himself down\\non the hob, with the top of his cap accommodated up the chimney, for it\\nwas a great deal too high for the roof.\\n\\n\"You\\'ll soon dry there, sir,\" said Gluck, and sat down again to turn\\nthe mutton. 
But the old gentleman did not dry there, but went\\non drip, drip, dripping among the cinders, and the fire fizzed and\\nsputtered, and began to look very black and uncomfortable; never was\\nsuch a cloak; every fold in it ran like a gutter.\\n\\n\"I beg pardon, sir,\" said Gluck at length, after watching the water\\nspreading in long, quicksilver-like streams over the floor for a quarter\\nof an hour; \"mayn\\'t I take your cloak?\"\\n\\n\"No thank you,\" said the old gentleman.\\n\\n\"Your cap, sir?\"\\n\\n\"I am all right, thank you,\" said the old gentleman rather\\ngruffly.\\n\\n\"But—sir—I\\'m very sorry,\" said Gluck hesitatingly; \"but—really,\\nsir—you\\'re—putting the fire out.\"\\n\\n\"It\\'ll take longer to do the mutton, then,\" replied his visitor\\ndryly.\\n\\nGluck was very much puzzled by the behavior of his guest; it was such\\na strange mixture of coolness and humility. He turned away at the string\\nmeditatively for another five minutes.\\n\\n\"That mutton looks very nice,\" said the old gentleman at length.\\n\"Can\\'t you give me a little bit?\"\\n\\n\"Impossible, sir,\" said Gluck.\\n\\n\"I\\'m very hungry,\" continued the old gentleman; \"I\\'ve had nothing to\\neat yesterday nor to-day. They surely couldn\\'t miss a bit from the\\nknuckle!\"\\n\\nHe spoke in so very melancholy a tone that it quite melted Gluck\\'s\\nheart. \"They promised me one slice to-day, sir,\" said he; \"I can give\\nyou that, but not a bit more.\"\\n\\n\"That\\'s a good boy,\" said the old gentleman again.\\n\\nThen Gluck warmed a plate, and sharpened a knife. \"I don\\'t care if I\\ndo get beaten for it,\" thought he. Just as he had cut a large slice out\\nof the mutton, there came a tremendous rap at the door. The old\\ngentleman jumped 249 off the hob, as if it had suddenly become\\ninconveniently warm. 
Gluck fitted the slice into the mutton again, with\\ndesperate efforts at exactitude, and ran to open the door.\\n\\n\"What did you keep us waiting in the rain for?\" said Schwartz, as he\\nwalked in, throwing his umbrella in Gluck\\'s face. \"Ay! what for, indeed,\\nyou little vagabond?\" said Hans, administering an educational box on the\\near, as he followed his brother into the kitchen.\\n\\n\"Bless my soul!\" said Schwartz when he opened the door.\\n\\n\"Amen,\" said the little gentleman, who had taken his cap off and was\\nstanding in the middle of the kitchen, bowing with the utmost possible\\nvelocity.\\n\\n\"Who\\'s that?\" said Schwartz, catching up a rolling-pin, and turning\\nto Gluck with a fierce frown.\\n\\n\"I don\\'t know, indeed, brother,\" said Gluck in great terror.\\n\\n\"How did he get in?\" roared Schwartz.\\n\\n\"My dear brother,\" said Gluck, deprecatingly, \"he was so\\nvery wet!\"\\n\\nThe rolling-pin was descending on Gluck\\'s head; but, at the instant,\\nthe old gentleman interposed his conical cap, on which it crashed with a\\nshock that shook the water out of it all over the room. What was very\\nodd, the rolling pin no sooner touched the cap, than it flew out of\\nSchwartz\\'s hand, spinning like a straw in a high wind, and fell into the\\ncorner at the farther end of the room.\\n\\n\"Who are you, sir?\" demanded Schwartz, turning upon him.\\n\\n\"What\\'s your business?\" snarled Hans.\\n\\n\"I\\'m a poor old man, sir,\" the little gentleman began very modestly,\\n\"and I saw your fire through the window, and begged shelter for a\\nquarter of an hour.\"\\n\\n\"Have the goodness to walk out again, then,\" said Schwartz. 
\"We\\'ve\\nquite enough water in our kitchen, without making it a drying\\nhouse.\"\\n\\n\"It is a cold day to turn an old man out in, sir; look at my gray\\nhairs.\" They hung down to his shoulders, as I told you before.\\n\\n\"Ay!\" said Hans, \"there are enough of them to keep you warm.\\nWalk!\"\\n\\n\"I\\'m very, very hungry, sir; couldn\\'t you spare me a bit of bread\\nbefore I go?\"\\n\\n\"Bread, indeed!\" said Schwartz; \"do you suppose we\\'ve nothing to do\\nwith our bread but to give it to such red-nosed fellows as you?\"\\n\\n\"Why don\\'t you sell your feather?\" said Hans, sneeringly. \"Out with\\nyou!\"\\n\\n\"A little bit,\" said the old gentleman.\\n\\n\"Be off!\" said Schwartz.\\n\\n\"Pray, gentlemen—\"\\n\\n\"Off, and be hanged!\" cried Hans, seizing him by the collar. But he\\nhad no sooner touched the old gentleman\\'s collar, than away he went\\nafter the rolling-pin, spinning round and round, till he fell into the\\ncorner on the top of it. Then Schwartz was very angry, and ran at the\\nold gentleman to turn him out; but he also had hardly touched him, when\\naway he went after Hans and the rolling-pin, and hit his head against\\nthe wall as he tumbled into the corner. And so there they lay, all\\nthree.\\n\\nThen the old gentleman spun himself round with velocity in the\\nopposite direction; continued to spin until his long cloak was all wound\\nneatly about him, clapped his cap on his head, very much on one side\\n(for it could not stand upright without going through the ceiling), gave\\nan additional twist to his corkscrew mustaches, and replied with perfect\\ncoolness: \"Gentlemen, I wish you 250 a very good morning. 
At twelve\\no\\'clock to-night I\\'ll call again; after such a refusal of hospitality as\\nI have just experienced, you will not be surprised if that visit is the\\nlast I ever pay you.\"\\n\\n\"If ever I catch you here again,\" muttered Schwartz, coming, half\\nfrightened, out of the corner—but, before he could finish his sentence,\\nthe old gentleman had shut the house door behind him with a great bang:\\nand there drove past the window, at the same instant, a wreath of ragged\\ncloud that whirled and rolled away down the valley in all manner of\\nshapes; turning over and over in the air, and melting away at last in a\\ngush of rain.\\n\\n\"A very pretty business, indeed, Mr. Gluck!\" said Schwartz. \"Dish the\\nmutton, sir. If ever I catch you at such a trick again—bless me, why the\\nmutton\\'s been cut!\"\\n\\n\"You promised me one slice, brother, you know,\" said Gluck.\\n\\n\"Oh! and you were cutting it hot, I suppose, and going to catch all\\nthe gravy. It\\'ll be long before I promise you such a thing again. Leave\\nthe room, sir; and have the kindness to wait in the coal-cellar till I\\ncall you.\"\\n\\nGluck left the room melancholy enough. The brothers ate as much\\nmutton as they could, locked the rest in the cupboard, and proceeded to\\nget very drunk after dinner.\\n\\nSuch a night as it was! Howling wind and rushing rain, without\\nintermission! The brothers had just sense enough left to put up all the\\nshutters, and double bar the door, before they went to bed. They usually\\nslept in the same room. As the clock struck twelve, they were both\\nawakened by a tremendous crash. Their door burst open with a violence\\nthat shook the house from top to bottom.\\n\\n\"What\\'s that?\" cried Schwartz, starting up in his bed.\\n\\n\"Only I,\" said the little gentleman.\\n\\nThe two brothers sat up on their bolster and stared into the\\ndarkness. 
The room was full of water, and by a misty moonbeam, which\\nfound its way through a hole in the shutter, they could see in the midst\\nof it an enormous foam globe, spinning round, and bobbing up and down\\nlike a cork, on which, as on a most luxurious cushion, reclined the\\nlittle old gentleman, cap and all. There was plenty of room for it now,\\nfor the roof was off.\\n\\n\"Sorry to incommode you,\" said their visitor, ironically. \"I\\'m afraid\\nyour beds are dampish; perhaps you had better go to your brother\\'s room;\\nI\\'ve left the ceiling on, there.\"\\n\\nThey required no second admonition, but rushed into Gluck\\'s room, wet\\nthrough, and in an agony of terror.\\n\\n\"You\\'ll find my card on the kitchen table,\" the old gentleman called\\nafter them. \"Remember, the last visit.\"\\n\\n\"Pray Heaven it may!\" said Schwartz, shuddering. And the foam globe\\ndisappeared.\\n\\nDawn came at last, and the two brothers looked out of Gluck\\'s little\\nwindow in the morning. The Treasure Valley was one mass of ruin and\\ndesolation. The inundation had swept away trees, crops, and cattle, and\\nleft in their stead a waste of red sand and gray mud. The two brothers\\ncrept shivering and horror-struck into the kitchen. The water had gutted\\nthe whole first floor; corn, money, almost every movable thing had been\\nswept away, and there was left only a small white card on the kitchen\\ntable. On it, in large, breezy 251 long-legged letters, were\\nengraved the words:—\\n\\nSouth-West Wind, Esquire.\\n\\nCHAPTER II\\n\\nOF THE PROCEEDINGS OF THE THREE BROTHERS AFTER THE VISIT OF SOUTH-WEST\\nWIND, ESQUIRE; AND HOW LITTLE GLUCK HAD AN INTERVIEW WITH THE KING OF\\nTHE GOLDEN RIVER\\n\\nSouth-West Wind, Esquire, was as good as his word. 
After the\\nmomentous visit above related, he entered the Treasure Valley no more;\\nand, what was worse, he had so much influence with his relations, the\\nWest Winds in general, and used it so effectually, that they all adopted\\na similar line of conduct. So no rain fell in the valley from one year\\'s\\nend to another. Though everything remained green and flourishing in the\\nplains below, the inheritance of the Three Brothers was a desert. What\\nhad once been the richest soil in the kingdom, became a shifting heap of\\nred sand; and the brothers, unable longer to contend with the adverse\\nskies, abandoned their valueless patrimony in despair, to seek some\\nmeans of gaining a livelihood among the cities and people of the plains.\\nAll their money was gone, and they had nothing left but some curious,\\nold-fashioned pieces of gold plates, the last remnants of their\\nill-gotten wealth.\\n\\n\"Suppose we turn goldsmiths?\" said Schwartz to Hans, as they entered\\nthe large city. \"It is a good knave\\'s trade; we can put a great deal of\\ncopper into the gold, without any one\\'s finding it out.\"\\n\\nThe thought was agreed to be a very good one; they hired a furnace,\\nand turned goldsmiths. But two slight circumstances affected their\\ntrade; the first, that people did not approve of the coppered gold; the\\nsecond, that the two elder brothers, whenever they had sold anything,\\nused to leave little Gluck to mind the furnace, and go and drink out the\\nmoney in the ale-house next door. So they melted all their gold, without\\nmaking money enough to buy more, and were at last reduced to one large\\ndrinking mug, which an uncle of his had given to little Gluck, and which\\nhe was very fond of, and would not have parted with for the world;\\nthough he never drank anything out of it but milk and water. The mug was\\na very odd mug to look at. 
The handle was formed of two wreaths of\\nflowing golden hair, so finely spun that it looked more like silk than\\nmetal, and these wreaths descended into, and mixed with, a beard and\\nwhiskers of the same exquisite workmanship, which surrounded and\\ndecorated a very fierce little face, of the reddest gold imaginable,\\nright in the front of the mug, with a pair of eyes in it which seemed to\\ncommand its whole circumference. It was impossible to drink out of the\\nmug without being subjected to an intense gaze out of the side of these\\neyes; and Schwartz positively averred that once, after emptying it, full\\nof Rhenish, seventeen times, he had seen them wink! When it came to the\\nmug\\'s turn to be made into spoons, it half broke poor little Gluck\\'s\\nheart; but the brothers only laughed at him, tossed the mug into the\\nmelting-pot, and staggered out to the ale-house; leaving him, as usual,\\nto pour the gold into bars, when it was all ready.\\n\\nWhen they were gone, Gluck took a farewell look at his old friend in\\nthe melting-pot. The flowing hair was all gone; nothing remained but the\\nred nose, 252 and the sparkling eyes, which looked more\\nmalicious than ever. \"And no wonder,\" thought Gluck, \"after being\\ntreated in that way.\" He sauntered disconsolately to the window, and sat\\nhimself down to catch the fresh evening air, and escape the hot breath\\nof the furnace. Now this window commanded a direct view of the range of\\nmountains, which, as I told you before, overhung the Treasure Valley,\\nand more especially of the peak from which fell the Golden River. 
It was\\njust at the close of the day, and when Gluck sat down at the window, he\\nsaw the rocks of the mountain tops, all crimson and purple with the\\nsunset; and there were bright tongues of fiery cloud burning and\\nquivering about them; and the river, brighter than all, fell, in a\\nwaving column of pure gold, from precipice to precipice, with the double\\narch of a broad purple rainbow stretched across it, flushing and fading\\nalternately in the wreaths of spray.\\n\\n\"Ah!\" said Gluck aloud, after he had looked at it for a while, \"if\\nthat river were really all gold, what a nice thing it would be.\"\\n\\n\"No, it wouldn\\'t, Gluck,\" said a clear metallic voice, close at his\\near.\\n\\n\"Bless me! what\\'s that?\" exclaimed Gluck, jumping up. There was\\nnobody there. He looked round the room, and under the table, and a great\\nmany times behind him, but there was certainly nobody there, and he sat\\ndown again at the window. This time he didn\\'t speak, but he couldn\\'t\\nhelp thinking again that it would be very convenient if the river were\\nreally all gold.\\n\\n\"Not at all, my boy,\" said the same voice, louder than before.\\n\\n\"Bless me!\" said Gluck again; \"what is that?\" He looked\\nagain into all the corners, and cupboards, and then began turning round,\\nand round, as fast as he could in the middle of the room, thinking there\\nwas somebody behind him, when the same voice struck again on his ear. It\\nwas singing now very merrily, \"Lala-lira-la\"; no words, only a soft\\nrunning effervescent melody, something like that of a kettle on the\\nboil. Gluck looked out of the window. No, it was certainly in the house.\\nUpstairs, and downstairs. No, it was certainly in that very room, coming\\nin quicker time, and clearer notes, every moment. \"Lala-lira-la.\" All at\\nonce it struck Gluck that it sounded louder near the furnace. 
He ran to\\nthe opening, and looked in; yes, he saw right, it seemed to be coming,\\nnot only out of the furnace, but out of the pot. He uncovered it, and\\nran back in a great fright, for the pot was certainly singing! He stood\\nin the farthest corner of the room, with his hands up, and his mouth\\nopen, for a minute or two, when the singing stopped, and the voice\\nbecame clear, and pronunciative.\\n\\n\"Hollo!\" said the voice.\\n\\nGluck made no answer.\\n\\n\"Hollo! Gluck, my boy,\" said the pot again.\\n\\nGluck summoned all his energies, walked straight up to the crucible,\\ndrew it out of the furnace, and looked in. The gold was all melted, and\\nits surface as smooth and polished as a river; but instead of reflecting\\nlittle Gluck\\'s head, as he looked in, he saw, meeting his glance from\\nbeneath the gold, the red nose and sharp eyes of his old friend of the\\nmug, a thousand times redder and sharper than ever he had seen them in\\nhis life.\\n\\n\"Come, Gluck, my boy,\" said the voice out of the pot again, \"I\\'m all\\nright; pour me out.\" 253\\n\\nBut Gluck was too much astonished to do anything of the kind.\\n\\n\"Pour me out, I say,\" said the voice rather gruffly.\\n\\nStill Gluck couldn\\'t move.\\n\\n\"Will you pour me out?\" said the voice passionately. \"I\\'m\\ntoo hot.\"\\n\\nBy a violent effort, Gluck recovered the use of his limbs, took hold\\nof the crucible, and sloped it so as to pour out the gold. 
But instead\\nof a liquid stream, there came out, first, a pair of pretty little\\nyellow legs, then some coat tails, then a pair of arms stuck a-kimbo,\\nand, finally, the well-known head of his friend the mug; all which\\narticles, uniting as they rolled out, stood up energetically on the\\nfloor, in the shape of a little golden dwarf, about a foot and a half\\nhigh.\\n\\n\"That\\'s right!\" said the dwarf, stretching out first his legs and\\nthen his arms, and then shaking his head up and down, and as far round\\nas it would go, for five minutes, without stopping; apparently with the\\nview of ascertaining if he were quite correctly put together, while\\nGluck stood contemplating him in speechless amazement. He was dressed in\\na slashed doublet of spun gold, so fine in its texture that the\\nprismatic colors gleamed over it, as if on a surface of mother of pearl;\\nand, over this brilliant doublet, his hair and beard fell full halfway\\nto the ground in waving curls so exquisitely delicate that Gluck could\\nhardly tell where they ended; they seemed to melt into air. The features\\nof the face, however, were by no means finished with the same delicacy;\\nthey were rather coarse, slightly inclining to coppery in complexion,\\nand indicative, in expression, of a very pertinacious and intractable\\ndisposition in their small proprietor. When the dwarf had finished his\\nself-examination, he turned his small sharp eyes full on Gluck and\\nstared at him deliberately for a minute or two. \"No, it wouldn\\'t, Gluck,\\nmy boy,\" said the little man.\\n\\nThis was certainly rather an abrupt and unconnected mode of\\ncommencing conversation. 
It might indeed be supposed to refer to the\\ncourse of Gluck\\'s thoughts, which had first produced the dwarf\\'s\\nobservations out of the pot; but whatever it referred to, Gluck had no\\ninclination to dispute the dictum.\\n\\n\"Wouldn\\'t it, sir?\" said Gluck, very mildly and submissively\\nindeed.\\n\\n\"No,\" said the dwarf, conclusively. \"No, it wouldn\\'t.\" And with that,\\nthe dwarf pulled his cap hard over his brows, and took two turns, of\\nthree feet long, up and down the room, lifting his legs up very high,\\nand setting them down very hard. This pause gave time for Gluck to\\ncollect his thoughts a little, and, seeing no great reason to view his\\ndiminutive visitor with dread, and feeling his curiosity overcome his\\namazement, he ventured on a question of peculiar delicacy.\\n\\n\"Pray, sir,\" said Gluck rather hesitatingly, \"were you my mug?\"\\n\\nOn which the little man turned sharp round, walked straight up to\\nGluck, and drew himself up to his full height. \"I,\" said the little man,\\n\"am the King of the Golden River.\" Whereupon he turned about again, and\\ntook two more turns, some six feet long, in order to allow time for the\\nconsternation which this announcement produced in his auditor to\\nevaporate. After which, he again walked up to Gluck and stood still, as\\nif expecting some comment on his communication.\\n\\nGluck determined to say something at all events. \"I hope your Majesty\\nis very well,\" said Gluck. 254\\n\\n\"Listen!\" said the little man, deigning no reply to this polite\\ninquiry. \"I am the King of what you mortals call the Golden River. The\\nshape you saw me in, was owing to the malice of a stronger king, from\\nwhose enchantments you have this instant freed me. What I have seen of\\nyou, and your conduct to your wicked brothers, renders me willing to\\nserve you; therefore, attend to what I tell you. 
Whoever shall climb to\\nthe top of that mountain from which you see the Golden River issue, and\\nshall cast into the stream at its source three drops of holy water, for\\nhim, and for him only, the river shall turn to gold. But no one failing\\nin his first, can succeed in a second attempt; and if any one shall cast\\nunholy water into the river, it will overwhelm him, and he will become a\\nblack stone.\" So saying, the King of the Golden River turned away and\\ndeliberately walked into the center of the hottest flame of the furnace.\\nHis figure became red, white, transparent, dazzling—a blaze of intense\\nlight—rose, trembled, and disappeared. The King of the Golden River had\\nevaporated.\\n\\n\"Oh!\" cried poor Gluck, running to look up the chimney after him;\\n\"Oh, dear, dear, dear me! My mug! my mug! my mug!\"\\n\\nCHAPTER III\\n\\nHOW MR. HANS SET OFF ON AN EXPEDITION TO THE GOLDEN RIVER, AND HOW HE\\nPROSPERED THEREIN\\n\\nThe King of the Golden River had hardly made the extraordinary exit,\\nrelated in the last chapter, before Hans and Schwartz came roaring into\\nthe house, very savagely drunk. The discovery of the total loss of their\\nlast piece of plate had the effect of sobering them just enough to\\nenable them to stand over Gluck, beating him very steadily for a quarter\\nof an hour; at the expiration of which period they dropped into a couple\\nof chairs, and requested to know what he had got to say for himself.\\nGluck told them his story, of which, of course, they did not believe a\\nword. They beat him again, till their arms were tired, and staggered to\\nbed. In the morning, however, the steadiness with which he adhered to\\nhis story obtained him some degree of credence; the immediate\\nconsequence of which was, that the two brothers, after wrangling a long\\ntime on the knotty question, which of them should try his fortune first,\\ndrew their swords and began fighting. 
The noise of the fray alarmed the\\nneighbors, who, finding they could not pacify the combatants, sent for\\nthe constable.\\n\\nHans, on hearing this, contrived to escape, and hid himself; but\\nSchwartz was taken before the magistrate, fined for breaking the peace,\\nand, having drunk out his last penny the evening before, was thrown into\\nprison till he should pay.\\n\\nWhen Hans heard this, he was much delighted, and determined to set\\nout immediately for the Golden River. How to get the holy water was the\\nquestion. He went to the priest, but the priest could not give any holy\\nwater to so abandoned a character. So Hans went to vespers in the\\nevening for the first time in his life, and, under pretense of crossing\\nhimself, stole a cupful, and returned home in triumph.\\n\\nNext morning he got up before the sun rose, put the holy water into a\\nstrong flask, and two bottles of wine and some meat in a basket, slung\\nthem over his back, took his alpine staff in his hand, and set off for\\nthe mountains.\\n\\nOn his way out of the town he had to pass the prison, and as he\\nlooked in at the windows, whom should he see but Schwartz himself\\npeeping out of the bars, and looking very disconsolate.\\n\\n\"Good morning, brother,\" said Hans; \"have you any message for the\\nKing of the Golden River?\"\\n\\nSchwartz gnashed his teeth with rage, and shook the bars with all his\\nstrength; but Hans only laughed at him, and advising him to make himself\\ncomfortable till he came back again, shouldered his basket, shook the\\nbottle of holy water in Schwartz\\'s face till it frothed again, and\\nmarched off in the highest spirits in the world.\\n\\nIt was, indeed, a morning that might have made any one happy, even\\nwith no Golden River to seek for. 
Level lines of dewy mist lay stretched\\nalong the valley, out of which rose the massy mountains—their lower\\ncliffs in pale gray shadow, hardly distinguishable from the floating\\nvapor, but gradually ascending till they caught the sunlight, which ran\\nin sharp touches of ruddy color along the angular crags, and pierced, in\\nlong level rays, through their fringes of spear-like pine. Far above,\\nshot up red splintered masses of castellated rock, jagged and shivered\\ninto myriads of fantastic forms, with here and there a streak of sunlit\\nsnow, traced down their chasms like a line of forked lightning; and, far\\nbeyond, and far above all these, fainter than the morning cloud, but\\npurer and changeless, slept, in the blue sky, the utmost peaks of the\\neternal snow.\\n\\nThe Golden River, which sprang from one of the lower and snowless\\nelevations, was now nearly in shadow; all but the uppermost jets of\\nspray, which rose like slow smoke above the undulating line of the\\ncataract, and floated away in feeble wreaths upon the morning wind.\\n\\nOn this object, and on this alone, Hans\\'s eyes and thoughts were\\nfixed; forgetting the distance he had to traverse, he set off at an\\nimprudent rate of walking, which greatly exhausted him before he had\\nscaled the first range of the green and low hills. He was, moreover,\\nsurprised, on surmounting them, to find that a large glacier, of whose\\nexistence, notwithstanding his previous knowledge of the mountains, he\\nhad been absolutely ignorant, lay between him and the source of the\\nGolden River. He entered on it with the boldness of a practised\\nmountaineer; yet he thought he had never traversed so strange or so\\ndangerous a glacier in his life. 
The ice was excessively slippery, and\\nout of all its chasms came wild sounds of gushing water; not monotonous\\nor low, but changeful and loud, rising occasionally into drifting\\npassages of wild melody; then breaking off into short melancholy tones,\\nor sudden shrieks, resembling those of human voices in distress or pain.\\nThe ice was broken into thousands of confused shapes, but none, Hans\\nthought, like the ordinary forms of splintered ice. There seemed a\\ncurious expression about all their outlines—a perpetual\\nresemblance to living features, distorted and scornful. Myriads of\\ndeceitful shadows, and lurid lights, played and floated about and\\nthrough the pale blue pinnacles, dazzling and confusing the sight of the\\ntraveler; while his ears grew dull and his head giddy with the constant\\ngush and roar of the concealed waters. These painful circumstances\\nincreased upon him as he advanced; the ice crashed and yawned into fresh\\nchasms at his feet, tottering spires nodded around him, and 256 fell\\nthundering across his path; and though he had repeatedly faced these\\ndangers on the most terrific glaciers, and in the wildest weather, it\\nwas with a new and oppressive feeling of panic terror that he leaped the\\nlast chasm, and flung himself, exhausted and shuddering, on the firm\\nturf of the mountain.\\n\\nHe had been compelled to abandon his basket of food, which became a\\nperilous encumbrance on the glacier, and had now no means of refreshing\\nhimself but by breaking off and eating some of the pieces of ice. This,\\nhowever, relieved his thirst; an hour\\'s repose recruited his hardy\\nframe, and with the indomitable spirit of avarice, he resumed his\\nlaborious journey.\\n\\nHis way now lay straight up a ridge of bare red rocks, without a\\nblade of grass to ease the foot, or a projecting angle to afford an inch\\nof shade from the south sun. 
It was past noon, and the rays beat\\nintensely upon the steep path, while the whole atmosphere was motionless\\nand penetrated with heat. Intense thirst was soon added to the bodily\\nfatigue with which Hans was now afflicted; glance after glance he cast\\non the flask of water which hung at his belt. \"Three drops are enough,\"\\nat last thought he; \"I may, at least, cool my lips with it.\"\\n\\nHe opened the flask, and was raising it to his lips, when his eye\\nfell on an object lying on the rock beside him; he thought it moved. It\\nwas a small dog, apparently in the last agony of death from thirst. Its\\ntongue was out, its jaws dry, its limbs extended lifelessly, and a swarm\\nof black ants were crawling about its lips and throat. Its eye moved to\\nthe bottle which Hans held in his hand. He raised it, drank, spurned the\\nanimal with his foot, and passed on. And he did not know how it was, but\\nhe thought that a strange shadow had suddenly come across the blue\\nsky.\\n\\nThe path became steeper and more rugged every moment; and the high\\nhill air, instead of refreshing him, seemed to throw his blood into a\\nfever. The noise of the hill cataracts sounded like mockery in his ears;\\nthey were all distant, and his thirst increased every moment. Another\\nhour passed, and he again looked down to the flask at his side; it was\\nhalf empty, but there was much more than three drops in it. He stopped\\nto open it; and again, as he did so, something moved in the path above\\nhim. It was a fair child, stretched nearly lifeless on the rock, its\\nbreast heaving with thirst, its eyes closed, and its lips parched and\\nburning. Hans eyed it deliberately, drank, and passed on. And a dark\\ngray cloud came over the sun, and long, snake-like shadows crept up\\nalong the mountain sides. Hans struggled on. The sun was sinking, but\\nits descent seemed to bring no coolness; the leaden weight of the dead\\nair pressed upon his brow and heart, but the goal was near. 
He saw the\\ncataract of the Golden River springing from the hillside, scarcely five\\nhundred feet above him. He paused for a moment to breathe, and sprang on\\nto complete his task.\\n\\nAt this instant a faint cry fell on his ear. He turned, and saw a\\ngray-haired old man extended on the rocks. His eyes were sunk, his\\nfeatures deadly pale, and gathered into an expression of despair.\\n\"Water!\" he stretched his arms to Hans, and cried feebly, \"Water! I am\\ndying.\"\\n\\n\"I have none,\" replied Hans; \"thou hast had thy share of life.\" He\\nstrode over the prostrate body, and darted on. And a flash of blue\\nlightning rose out of the East, shaped like a sword; it shook 257 thrice\\nover the whole heaven, and left it dark with one heavy, impenetrable\\nshade. The sun was setting; it plunged toward the horizon like a red-hot\\nball.\\n\\nThe roar of the Golden River rose on Hans\\'s ear. He stood at the\\nbrink of the chasm through which it ran. Its waves were filled with the\\nred glory of the sunset; they shook their crests like tongues of fire,\\nand flashes of bloody light gleamed along their foam. Their sound came\\nmightier and mightier on his senses; his brain grew giddy with the\\nprolonged thunder. Shuddering he drew the flask from his girdle, and\\nhurled it into the center of the torrent. As he did so, an icy chill\\nshot through his limbs; he staggered, shrieked, and fell. The waters\\nclosed over his cry. And the moaning of the river rose wildly into the\\nnight, as it gushed over\\n\\nThe Black Stone.\\n\\nCHAPTER IV\\n\\nHOW MR. SCHWARTZ SET OFF ON AN EXPEDITION TO THE GOLDEN RIVER, AND HOW\\nHE PROSPERED THEREIN\\n\\nPoor little Gluck waited very anxiously alone in the house for Hans\\'s\\nreturn. Finding he did not come back, he was terribly frightened and\\nwent and told Schwartz in the prison, all that had happened. 
Then\\nSchwartz was very much pleased, and said that Hans must certainly have\\nbeen turned into a black stone, and he should have all the gold to\\nhimself. But Gluck was very sorry, and cried all night. When he got up\\nin the morning there was no bread in the house, nor any money; so Gluck\\nwent and hired himself to another goldsmith, and he worked so hard, and\\nso neatly, and so long every day, that he soon got money enough together\\nto pay his brother\\'s fine, and he went and gave it all to Schwartz, and\\nSchwartz got out of prison. Then Schwartz was quite pleased, and said he\\nshould have some of the gold of the river. But Gluck only begged he\\nwould go and see what had become of Hans.\\n\\nNow when Schwartz had heard that Hans had stolen the holy water, he\\nthought to himself that such a proceeding might not be considered\\naltogether correct by the King of the Golden River, and determined to\\nmanage matters better. So he took some more of Gluck\\'s money, and went\\nto a bad priest, who gave him some holy water very readily for it. Then\\nSchwartz was sure it was all quite right. So Schwartz got up early in\\nthe morning before the sun rose, and took some bread and wine, in a\\nbasket, and put his holy water in a flask, and set off for the\\nmountains. Like his brother, he was much surprised at the sight of the\\nglacier, and had great difficulty in crossing it, even after leaving his\\nbasket behind him. The day was cloudless, but not bright; there was a\\nheavy purple haze hanging over the sky, and the hills looked lowering\\nand gloomy. And as Schwartz climbed the steep rock path, the thirst came\\nupon him, as it had upon his brother, until he lifted his flask to his\\nlips to drink. Then he saw the fair child lying near him on the rocks,\\nand it cried to him, and moaned for water.\\n\\n\"Water, indeed,\" said Schwartz; \"I haven\\'t half enough for myself,\"\\nand passed on. 
And as he went he thought the sunbeams grew more dim, and\\nhe saw a low bank of black cloud rising out of the West; and, when he\\nhad climbed for another hour the thirst overcame him again, and he would\\nhave drunk. 258 Then he saw the old man lying before him on the\\npath, and heard him cry out for water. \"Water, indeed,\" said Schwartz,\\n\"I haven\\'t enough for myself,\" and on he went.\\n\\nThen again the light seemed to fade before his eyes, and he looked\\nup, and, behold, a mist, of the color of blood, had come over the sun;\\nand the bank of black cloud had risen very high, and its edges were\\ntossing and tumbling like the waves of the angry sea. And they cast long\\nshadows, which flickered over Schwartz\\'s path.\\n\\nThen Schwartz climbed for another hour, and again his thirst\\nreturned; and as he lifted his flask to his lips, he thought he saw his\\nbrother Hans lying exhausted on the path before him, and, as he gazed,\\nthe figure stretched its arms to him, and cried for water. \"Ha, ha,\"\\nlaughed Schwartz, \"are you there? Remember the prison bars, my boy.\\nWater, indeed! do you suppose I carried it all the way up here for\\nyou?\" And he strode over the figure; yet, as he passed, he\\nthought he saw a strange expression of mockery about its lips. And, when\\nhe had gone a few yards farther, he looked back; but the figure was not\\nthere.\\n\\nAnd a sudden horror came over Schwartz, he knew not why; but the\\nthirst for gold prevailed over his fear, and he rushed on. And the bank\\nof black cloud rose to the zenith, and out of it came bursts of spiry\\nlightning, and waves of darkness seemed to heave and float between their\\nflashes over the whole heavens. And the sky where the sun was setting\\nwas all level, and like a lake of blood; and a strong wind came out of\\nthat sky, tearing its crimson cloud into fragments, and scattering them\\nfar into the darkness. 
And when Schwartz stood by the brink of the\\nGolden River, its waves were black, like thunder clouds, but their foam\\nwas like fire; and the roar of the waters below, and the thunder above,\\nmet, as he cast the flask into the stream. And, as he did so, the\\nlightning glared into his eyes, and the earth gave way beneath him, and\\nthe waters closed over his cry. And the moaning of the river rose wildly\\ninto the night, as it gushed over the\\n\\nTwo Black Stones.\\n\\nCHAPTER V\\n\\nHOW LITTLE GLUCK SET OFF ON AN EXPEDITION TO THE GOLDEN RIVER, AND HOW\\nHE PROSPERED THEREIN; WITH OTHER MATTERS OF INTEREST\\n\\nWhen Gluck found that Schwartz did not come back he was very sorry,\\nand did not know what to do. He had no money, and was obliged to go and\\nhire himself again to the goldsmith, who worked him very hard, and gave\\nhim very little money. So, after a month or two, Gluck grew tired, and\\nmade up his mind to go and try his fortune with the Golden River. \"The\\nlittle King looked very kind,\" thought he. \"I don\\'t think he will turn\\nme into a black stone.\" So he went to the priest, and the priest gave\\nhim some holy water as soon as he asked for it. Then Gluck took some\\nbread in his basket, and the bottle of water, and set off very early for\\nthe mountains.\\n\\nIf the glacier had occasioned a great deal of fatigue to his\\nbrothers, it was twenty times worse for him, who was neither so strong\\nnor so practised on the mountains. He had several bad falls, lost his\\nbasket and bread, and was very much frightened at the strange noises\\nunder the ice. He lay a long time to rest on the grass, after he had got\\nover, 259 and began to climb the hill just in the hottest\\npart of the day. When he had climbed for an hour, he got dreadfully\\nthirsty, and was going to drink like his brothers, when he saw an old\\nman coming down the path above him, looking very feeble, and leaning on\\na staff. \"My son,\" said the old man, \"I am faint with thirst. 
Give me\\nsome of that water.\" Then Gluck looked at him, and when he saw that he\\nwas pale and weary, he gave him the water; \"Only pray don\\'t drink it\\nall,\" said Gluck. But the old man drank a great deal, and gave him back\\nthe bottle two-thirds empty. Then he bade him good speed, and Gluck went\\non again merrily. And the path became easier to his feet, and two or\\nthree blades of grass appeared upon it, and some grasshoppers began\\nsinging on the bank beside it; and Gluck thought he had never heard such\\nmerry singing.\\n\\nThen he went on for another hour, and the thirst increased on him so\\nthat he thought he should be forced to drink. But, as he raised the\\nflask, he saw a little child lying panting by the road-side, and it\\ncried out piteously for water. Then Gluck struggled with himself, and\\ndetermined to bear the thirst a little longer; and he put the bottle to\\nthe child\\'s lips, and it drank it all but a few drops. Then it smiled on\\nhim, and got up and ran down the hill; and Gluck looked after it, till\\nit became as small as a little star, and then turned and began climbing\\nagain. And then there were all kinds of sweet flowers growing on the\\nrocks, bright green moss with pale pink starry flowers, and soft belled\\ngentians, more blue than the sky at its deepest, and pure white\\ntransparent lilies. And crimson and purple butterflies darted hither and\\nthither, and the sky sent down such pure light that Gluck had never felt\\nso happy in his life.\\n\\nYet, when he had climbed for another hour, his thirst became\\nintolerable again; and, when he looked at his bottle, he saw that there\\nwere only five or six drops left in it, and he could not venture to\\ndrink. And, as he was hanging the flask to his belt again, he saw a\\nlittle dog lying on the rocks, gasping for breath—just as Hans had seen\\nit on the day of his ascent. 
And Gluck stopped and looked at it, and\\nthen at the Golden River, not five hundred yards above him; and he\\nthought of the dwarf\\'s words, \"that no one could succeed, except in his\\nfirst attempt\"; and he tried to pass the dog, but it whined piteously,\\nand Gluck stopped again. \"Poor beastie,\" said Gluck, \"it\\'ll be dead when\\nI come down again, if I don\\'t help it.\" Then he looked closer and closer\\nat it, and its eye turned on him so mournfully that he could not stand\\nit. \"Confound the King and his gold, too,\" said Gluck; and he opened the\\nflask, and poured all the water into the dog\\'s mouth.\\n\\nThe dog sprang up and stood on its hind legs. Its tail disappeared,\\nits ears became long, longer, silky, golden; its nose became very red,\\nits eyes became very twinkling; in three seconds the dog was gone, and\\nbefore Gluck stood his old acquaintance, the King of the Golden\\nRiver.\\n\\n\"Thank you,\" said the monarch; \"but don\\'t be frightened, it\\'s all\\nright\"; for Gluck showed manifest symptoms of consternation at this\\nunlooked-for reply to his last observation. 
\"Why didn\\'t you come\\nbefore,\" continued the dwarf, \"instead of sending me those rascally\\nbrothers of yours, for me to have the trouble of turning into stones?\\nVery hard stones they make, too.\"\\n\\n\"Oh, dear me!\" said Gluck, \"have you really been so cruel?\"\\n\\n\"Cruel!\" said the dwarf: \"they poured unholy water into my stream; do\\nyou suppose I\\'m going to allow that?\"\\n\\n\"Why,\" said Gluck, \"I am sure, sir—your Majesty, I mean,—they got the\\nwater out of the church font.\"\\n\\n\"Very probably,\" replied the dwarf; \"but,\" and his countenance grew\\nstern as he spoke, \"the water which has been refused to the cry of the\\nweary and dying is unholy, though it had been blessed by every saint in\\nheaven; and the water which is found in the vessel of mercy is holy,\\nthough it had been defiled with corpses.\"\\n\\nSo saying, the dwarf stooped and plucked a lily that grew at his\\nfeet. On its white leaves there hung three drops of clear dew. And the\\ndwarf shook them into the flask which Gluck held in his hand. \"Cast\\nthese into the river,\" he said, \"and descend on the other side of the\\nmountains into the Treasure Valley, and so good speed.\"\\n\\nAs he spoke, the figure of the dwarf became indistinct. The playing\\ncolors of his robe formed themselves into a prismatic mist of dewy\\nlight: he stood for an instant veiled with them as with the belt of a\\nbroad rainbow. The colors grew faint, the mist rose into the air; the\\nmonarch had evaporated.\\n\\nAnd Gluck climbed to the brink of the Golden River and its waves were\\nas clear as crystal, and as brilliant as the sun. And, when he cast the\\nthree drops of dew into the stream, there opened where they fell, a\\nsmall circular whirlpool, into which the waters descended with a musical\\nnoise.\\n\\nGluck stood watching it for some time, very much disappointed,\\nbecause not only the river was not turned into gold but its waters\\nseemed much diminished in quantity. 
Yet he obeyed his friend the dwarf,\\nand descended the other side of the mountains, towards the Treasure\\nValley; and, as he went, he thought he heard the noise of water working\\nits way under the ground. And when he came in sight of the Treasure\\nValley, behold, a river, like the Golden River, was springing from a new\\ncleft of the rocks above it, and was flowing in innumerable streams\\namong the dry heaps of red sand.\\n\\nAnd, as Gluck gazed, fresh grass sprang beside the new streams, and\\ncreeping plants grew, and climbed among the moistening soil. Young\\nflowers opened suddenly along the river sides, as stars leap out when\\ntwilight is deepening, and thickets of myrtle, and tendrils of vine,\\ncast lengthening shadows over the valley as they grew. And thus the\\nTreasure Valley became a garden again, and the inheritance, which had\\nbeen lost by cruelty, was regained by love.\\n\\nAnd Gluck went and dwelt in the valley, and the poor were never\\ndriven from his door; so that his barns became full of corn, and his\\nhouse of treasure. And for him, the river had, according to the dwarf\\'s\\npromise, become a River of Gold.\\n\\nAnd, to this day, the inhabitants of the valley point out the place\\nwhere the three drops of holy dew were cast into the stream, and trace\\nthe course of the Golden River under the ground, until it emerges in the\\nTreasure Valley. 
And at the top of the cataract of the Golden River are\\nstill to be seen two black stones, round\\nwhich the waters howl mournfully every day at sunset; and these stones\\nare still called by the people of the valley\\n\\nThe Black Brothers.', metadata={'source': './example_data/childrens-literature.epub'})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from langchain_community.document_loaders import UnstructuredEPubLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9d3d0e35", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "loader = UnstructuredEPubLoader(\"winter-sports.epub\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "06073f91", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data = loader.load()" + "from langchain_community.document_loaders import UnstructuredEPubLoader\n", + "\n", + "loader = UnstructuredEPubLoader(\"./example_data/childrens-literature.epub\")\n", + "\n", + "data = loader.load()\n", + "\n", + "data[0]" ] }, { @@ -70,46 +65,30 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "id": "064f9162", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "loader = UnstructuredEPubLoader(\"winter-sports.epub\", mode=\"elements\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "abefbbdb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "a547c534", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content='The Project Gutenberg eBook of Winter Sports in\\nSwitzerland, by E. F. 
Benson', lookup_str='', metadata={'source': 'winter-sports.epub', 'page_number': 1, 'category': 'Title'}, lookup_index=0)" + "Document(page_content='Guide', metadata={'source': './example_data/childrens-literature.epub', 'category_depth': 1, 'last_modified': '2024-07-01T11:12:08', 'languages': ['eng'], 'filetype': 'application/epub', 'file_directory': './example_data', 'filename': 'childrens-literature.epub', 'category': 'Title'})" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "loader = UnstructuredEPubLoader(\n", + " \"./example_data/childrens-literature.epub\", mode=\"elements\"\n", + ")\n", + "\n", + "data = loader.load()\n", + "\n", "data[0]" ] }, @@ -138,7 +117,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/example_data/childrens-literature.epub b/docs/docs/integrations/document_loaders/example_data/childrens-literature.epub new file mode 100644 index 00000000000..ba84a643994 Binary files /dev/null and b/docs/docs/integrations/document_loaders/example_data/childrens-literature.epub differ diff --git a/docs/docs/integrations/document_loaders/example_data/layout-parser-paper-screenshot.png b/docs/docs/integrations/document_loaders/example_data/layout-parser-paper-screenshot.png new file mode 100644 index 00000000000..9b6d0ffa62d Binary files /dev/null and b/docs/docs/integrations/document_loaders/example_data/layout-parser-paper-screenshot.png differ diff --git a/docs/docs/integrations/document_loaders/example_data/state_of_the_union.txt b/docs/docs/integrations/document_loaders/example_data/state_of_the_union.txt new file mode 100644 index 00000000000..b453aacdae3 --- /dev/null +++ b/docs/docs/integrations/document_loaders/example_data/state_of_the_union.txt @@ -0,0 +1,723 @@ +Madam Speaker, Madam Vice President, our First Lady and 
Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. + +Last year COVID-19 kept us apart. This year we are finally together again. + +Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. + +With a duty to one another to the American people to the Constitution. + +And with an unwavering resolve that freedom will always triumph over tyranny. + +Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. + +He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. + +He met the Ukrainian people. + +From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. + +Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. + +In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight. + +Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world. + +Please rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people. + +Throughout our history we’ve learned this lesson when dictators do not pay a price for their aggression they cause more chaos. + +They keep moving. + +And the costs and the threats to America and the world keep rising. + +That’s why the NATO Alliance was created to secure peace and stability in Europe after World War 2. + +The United States is a member along with 29 other nations. + +It matters. American diplomacy matters. American resolve matters. + +Putin’s latest attack on Ukraine was premeditated and unprovoked.
+ +He rejected repeated efforts at diplomacy. + +He thought the West and NATO wouldn’t respond. And he thought he could divide us at home. Putin was wrong. We were ready. Here is what we did. + +We prepared extensively and carefully. + +We spent months building a coalition of other freedom-loving nations from Europe and the Americas to Asia and Africa to confront Putin. + +I spent countless hours unifying our European allies. We shared with the world in advance what we knew Putin was planning and precisely how he would try to falsely justify his aggression. + +We countered Russia’s lies with truth. + +And now that he has acted the free world is holding him accountable. + +Along with twenty-seven members of the European Union including France, Germany, Italy, as well as countries like the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland. + +We are inflicting pain on Russia and supporting the people of Ukraine. Putin is now isolated from the world more than ever. + +Together with our allies –we are right now enforcing powerful economic sanctions. + +We are cutting off Russia’s largest banks from the international financial system. + +Preventing Russia’s central bank from defending the Russian Ruble making Putin’s $630 Billion “war fund” worthless. + +We are choking off Russia’s access to technology that will sap its economic strength and weaken its military for years to come. + +Tonight I say to the Russian oligarchs and corrupt leaders who have bilked billions of dollars off this violent regime no more. + +The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs. + +We are joining with our European allies to find and seize your yachts your luxury apartments your private jets. We are coming for your ill-begotten gains.
+ +And tonight I am announcing that we will join our allies in closing off American air space to all Russian flights – further isolating Russia – and adding an additional squeeze –on their economy. The Ruble has lost 30% of its value. + +The Russian stock market has lost 40% of its value and trading remains suspended. Russia’s economy is reeling and Putin alone is to blame. + +Together with our allies we are providing support to the Ukrainians in their fight for freedom. Military assistance. Economic assistance. Humanitarian assistance. + +We are giving more than $1 Billion in direct assistance to Ukraine. + +And we will continue to aid the Ukrainian people as they defend their country and to help ease their suffering. + +Let me be clear, our forces are not engaged and will not engage in conflict with Russian forces in Ukraine. + +Our forces are not going to Europe to fight in Ukraine, but to defend our NATO Allies – in the event that Putin decides to keep moving west. + +For that purpose we’ve mobilized American ground forces, air squadrons, and ship deployments to protect NATO countries including Poland, Romania, Latvia, Lithuania, and Estonia. + +As I have made crystal clear the United States and our Allies will defend every inch of territory of NATO countries with the full force of our collective power. + +And we remain clear-eyed. The Ukrainians are fighting back with pure courage. But the next few days weeks, months, will be hard on them. + +Putin has unleashed violence and chaos. But while he may make gains on the battlefield – he will pay a continuing high price over the long run. + +And a proud Ukrainian people, who have known 30 years of independence, have repeatedly shown that they will not tolerate anyone who tries to take their country backwards. + +To all Americans, I will be honest with you, as I’ve always promised. A Russian dictator, invading a foreign country, has costs around the world.
+ +And I’m taking robust action to make sure the pain of our sanctions is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers. + +Tonight, I can announce that the United States has worked with 30 other countries to release 60 Million barrels of oil from reserves around the world. + +America will lead that effort, releasing 30 Million barrels from our own Strategic Petroleum Reserve. And we stand ready to do more if necessary, unified with our allies. + +These steps will help blunt gas prices here at home. And I know the news about what’s happening can seem alarming. + +But I want you to know that we are going to be okay. + +When the history of this era is written Putin’s war on Ukraine will have left Russia weaker and the rest of the world stronger. + +While it shouldn’t have taken something so terrible for people around the world to see what’s at stake now everyone sees it clearly. + +We see the unity among leaders of nations and a more unified Europe a more unified West. And we see unity among the people who are gathering in cities in large crowds around the world even in Russia to demonstrate their support for Ukraine. + +In the battle between democracy and autocracy, democracies are rising to the moment, and the world is clearly choosing the side of peace and security. + +This is a real test. It’s going to take time. So let us continue to draw inspiration from the iron will of the Ukrainian people. + +To our fellow Ukrainian Americans who forge a deep bond that connects our two nations we stand with you. + +Putin may circle Kyiv with tanks, but he will never gain the hearts and souls of the Ukrainian people. + +He will never extinguish their love of freedom. He will never weaken the resolve of the free world. + +We meet tonight in an America that has lived through two of the hardest years this nation has ever faced. + +The pandemic has been punishing.
+ +And so many families are living paycheck to paycheck, struggling to keep up with the rising cost of food, gas, housing, and so much more. + +I understand. + +I remember when my Dad had to leave our home in Scranton, Pennsylvania to find work. I grew up in a family where if the price of food went up, you felt it. + +That’s why one of the first things I did as President was fight to pass the American Rescue Plan. + +Because people were hurting. We needed to act, and we did. + +Few pieces of legislation have done more in a critical moment in our history to lift us out of crisis. + +It fueled our efforts to vaccinate the nation and combat COVID-19. It delivered immediate economic relief for tens of millions of Americans. + +Helped put food on their table, keep a roof over their heads, and cut the cost of health insurance. + +And as my Dad used to say, it gave people a little breathing room. + +And unlike the $2 Trillion tax cut passed in the previous administration that benefitted the top 1% of Americans, the American Rescue Plan helped working people—and left no one behind. + +And it worked. It created jobs. Lots of jobs. + +In fact—our economy created over 6.5 Million new jobs just last year, more jobs created in one year +than ever before in the history of America. + +Our economy grew at a rate of 5.7% last year, the strongest growth in nearly 40 years, the first step in bringing fundamental change to an economy that hasn’t worked for the working people of this nation for too long. + +For the past 40 years we were told that if we gave tax breaks to those at the very top, the benefits would trickle down to everyone else. + +But that trickle-down theory led to weaker economic growth, lower wages, bigger deficits, and the widest gap between those at the top and everyone else in nearly a century. + +Vice President Harris and I ran for office with a new economic vision for America. + +Invest in America. Educate Americans. Grow the workforce.
Build the economy from the bottom up +and the middle out, not from the top down. + +Because we know that when the middle class grows, the poor have a ladder up and the wealthy do very well. + +America used to have the best roads, bridges, and airports on Earth. + +Now our infrastructure is ranked 13th in the world. + +We won’t be able to compete for the jobs of the 21st Century if we don’t fix that. + +That’s why it was so important to pass the Bipartisan Infrastructure Law—the most sweeping investment to rebuild America in history. + +This was a bipartisan effort, and I want to thank the members of both parties who worked to make it happen. + +We’re done talking about infrastructure weeks. + +We’re going to have an infrastructure decade. + +It is going to transform America and put us on a path to win the economic competition of the 21st Century that we face with the rest of the world—particularly with China. + +As I’ve told Xi Jinping, it is never a good bet to bet against the American people. + +We’ll create good jobs for millions of Americans, modernizing roads, airports, ports, and waterways all across America. + +And we’ll do it all to withstand the devastating effects of the climate crisis and promote environmental justice. + +We’ll build a national network of 500,000 electric vehicle charging stations, begin to replace poisonous lead pipes—so every child—and every American—has clean water to drink at home and at school, provide affordable high-speed internet for every American—urban, suburban, rural, and tribal communities. + +4,000 projects have already been announced. + +And tonight, I’m announcing that this year we will start fixing over 65,000 miles of highway and 1,500 bridges in disrepair. + +When we use taxpayer dollars to rebuild America – we are going to Buy American: buy American products to support American jobs. + +The federal government spends about $600 Billion a year to keep the country safe and secure.
+ +There’s been a law on the books for almost a century +to make sure taxpayers’ dollars support American jobs and businesses. + +Every Administration says they’ll do it, but we are actually doing it. + +We will buy American to make sure everything from the deck of an aircraft carrier to the steel on highway guardrails are made in America. + +But to compete for the best jobs of the future, we also need to level the playing field with China and other competitors. + +That’s why it is so important to pass the Bipartisan Innovation Act sitting in Congress that will make record investments in emerging technologies and American manufacturing. + +Let me give you one example of why it’s so important to pass it. + +If you travel 20 miles east of Columbus, Ohio, you’ll find 1,000 empty acres of land. + +It won’t look like much, but if you stop and look closely, you’ll see a “Field of dreams,” the ground on which America’s future will be built. + +This is where Intel, the American company that helped build Silicon Valley, is going to build its $20 billion semiconductor “mega site”. + +Up to eight state-of-the-art factories in one place. 10,000 new good-paying jobs. + +Some of the most sophisticated manufacturing in the world to make computer chips the size of a fingertip that power the world and our everyday lives. + +Smartphones. The Internet. Technology we have yet to invent. + +But that’s just the beginning. + +Intel’s CEO, Pat Gelsinger, who is here tonight, told me they are ready to increase their investment from +$20 billion to $100 billion. + +That would be one of the biggest investments in manufacturing in American history. + +And all they’re waiting for is for you to pass this bill. + +So let’s not wait any longer. Send it to my desk. I’ll sign it. + +And we will really take off. + +And Intel is not alone. + +There’s something happening in America. + +Just look around and you’ll see an amazing story.
+ +The rebirth of the pride that comes from stamping products “Made In America.” The revitalization of American manufacturing. + +Companies are choosing to build new factories here, when just a few years ago, they would have built them overseas. + +That’s what is happening. Ford is investing $11 billion to build electric vehicles, creating 11,000 jobs across the country. + +GM is making the largest investment in its history—$7 billion to build electric vehicles, creating 4,000 jobs in Michigan. + +All told, we created 369,000 new manufacturing jobs in America just last year. + +Powered by people I’ve met like JoJo Burgess, from generations of union steelworkers from Pittsburgh, who’s here with us tonight. + +As Ohio Senator Sherrod Brown says, “It’s time to bury the label “Rust Belt.” + +It’s time. + +But with all the bright spots in our economy, record job growth and higher wages, too many families are struggling to keep up with the bills. + +Inflation is robbing them of the gains they might otherwise feel. + +I get it. That’s why my top priority is getting prices under control. + +Look, our economy roared back faster than most predicted, but the pandemic meant that businesses had a hard time hiring enough workers to keep up production in their factories. + +The pandemic also disrupted global supply chains. + +When factories close, it takes longer to make goods and get them from the warehouse to the store, and prices go up. + +Look at cars. + +Last year, there weren’t enough semiconductors to make all the cars that people wanted to buy. + +And guess what, prices of automobiles went up. + +So—we have a choice. + +One way to fight inflation is to drive down wages and make Americans poorer. + +I have a better plan to fight inflation. + +Lower your costs, not your wages. + +Make more cars and semiconductors in America. + +More infrastructure and innovation in America. + +More goods moving faster and cheaper in America.
+ +More jobs where you can earn a good living in America. + +And instead of relying on foreign supply chains, let’s make it in America. + +Economists call it “increasing the productive capacity of our economy.” + +I call it building a better America. + +My plan to fight inflation will lower your costs and lower the deficit. + +17 Nobel laureates in economics say my plan will ease long-term inflationary pressures. Top business leaders and most Americans support my plan. And here’s the plan: + +First – cut the cost of prescription drugs. Just look at insulin. One in ten Americans has diabetes. In Virginia, I met a 13-year-old boy named Joshua Davis. + +He and his Dad both have Type 1 diabetes, which means they need insulin every day. Insulin costs about $10 a vial to make. + +But drug companies charge families like Joshua and his Dad up to 30 times more. I spoke with Joshua’s mom. + +Imagine what it’s like to look at your child who needs insulin and have no idea how you’re going to pay for it. + +What it does to your dignity, your ability to look your child in the eye, to be the parent you expect to be. + +Joshua is here with us tonight. Yesterday was his birthday. Happy birthday, buddy. + +For Joshua, and for the 200,000 other young people with Type 1 diabetes, let’s cap the cost of insulin at $35 a month so everyone can afford it. + +Drug companies will still do very well. And while we’re at it let Medicare negotiate lower prices for prescription drugs, like the VA already does. + +Look, the American Rescue Plan is helping millions of families on Affordable Care Act plans save $2,400 a year on their health care premiums. Let’s close the coverage gap and make those savings permanent. + +Second – cut energy costs for families an average of $500 a year by combatting climate change.
+ +Let’s provide investments and tax credits to weatherize your homes and businesses to be energy efficient and you get a tax credit; double America’s clean energy production in solar, wind, and so much more; lower the price of electric vehicles, saving you another $80 a month because you’ll never have to pay at the gas pump again. + +Third – cut the cost of child care. Many families pay up to $14,000 a year for child care per child. + +Middle-class and working families shouldn’t have to pay more than 7% of their income for care of young children. + +My plan will cut the cost in half for most families and help parents, including millions of women, who left the workforce during the pandemic because they couldn’t afford child care, to be able to get back to work. + +My plan doesn’t stop there. It also includes home and long-term care. More affordable housing. And Pre-K for every 3- and 4-year-old. + +All of these will lower costs. + +And under my plan, nobody earning less than $400,000 a year will pay an additional penny in new taxes. Nobody. + +The one thing all Americans agree on is that the tax system is not fair. We have to fix it. + +I’m not looking to punish anyone. But let’s make sure corporations and the wealthiest Americans start paying their fair share. + +Just last year, 55 Fortune 500 corporations earned $40 billion in profits and paid zero dollars in federal income tax. + +That’s simply not fair. That’s why I’ve proposed a 15% minimum tax rate for corporations. + +We got more than 130 countries to agree on a global minimum tax rate so companies can’t get out of paying their taxes at home by shipping jobs and factories overseas. + +That’s why I’ve proposed closing loopholes so the very wealthy don’t pay a lower tax rate than a teacher or a firefighter. + +So that’s my plan. It will grow the economy and lower costs for families. + +So what are we waiting for? Let’s get this done.
And while you’re at it, confirm my nominees to the Federal Reserve, which plays a critical role in fighting inflation. + +My plan will not only lower costs to give families a fair shot, it will lower the deficit. + +The previous Administration not only ballooned the deficit with tax cuts for the very wealthy and corporations, it undermined the watchdogs whose job was to keep pandemic relief funds from being wasted. + +But in my administration, the watchdogs have been welcomed back. + +We’re going after the criminals who stole billions in relief money meant for small businesses and millions of Americans. + +And tonight, I’m announcing that the Justice Department will name a chief prosecutor for pandemic fraud. + +By the end of this year, the deficit will be down to less than half what it was before I took office. + +The only president ever to cut the deficit by more than one trillion dollars in a single year. + +Lowering your costs also means demanding more competition. + +I’m a capitalist, but capitalism without competition isn’t capitalism. + +It’s exploitation—and it drives up prices. + +When corporations don’t have to compete, their profits go up, your prices go up, and small businesses and family farmers and ranchers go under. + +We see it happening with ocean carriers moving goods in and out of America. + +During the pandemic, these foreign-owned companies raised prices by as much as 1,000% and made record profits. + +Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. + +And as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. + +That ends on my watch. + +Medicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect.
+ +We’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. + +Let’s pass the Paycheck Fairness Act and paid leave. + +Raise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. + +Let’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges. + +And let’s pass the PRO Act when a majority of workers want to form a union—they shouldn’t be stopped. + +When we invest in our workers, when we build the economy from the bottom up and the middle out together, we can do something we haven’t done in a long time: build a better America. + +For more than two years, COVID-19 has impacted every decision in our lives and the life of the nation. + +And I know you’re tired, frustrated, and exhausted. + +But I also know this. + +Because of the progress we’ve made, because of your resilience and the tools we have, tonight I can say +we are moving forward safely, back to more normal routines. + +We’ve reached a new moment in the fight against COVID-19, with severe cases down to a level not seen since last July. + +Just a few days ago, the Centers for Disease Control and Prevention—the CDC—issued new mask guidelines. + +Under these new guidelines, most Americans in most of the country can now be mask free. + +And based on the projections, more of the country will reach that point across the next couple of weeks. + +Thanks to the progress we have made this past year, COVID-19 need no longer control our lives. + +I know some are talking about “living with COVID-19”. Tonight – I say that we will never just accept living with COVID-19. + +We will continue to combat the virus as we do other diseases. And because this is a virus that mutates and spreads, we will stay on guard.
+ +Here are four common sense steps as we move forward safely. + +First, stay protected with vaccines and treatments. We know how incredibly effective vaccines are. If you’re vaccinated and boosted you have the highest degree of protection. + +We will never give up on vaccinating more Americans. Now, I know parents with kids under 5 are eager to see a vaccine authorized for their children. + +The scientists are working hard to get that done and we’ll be ready with plenty of vaccines when they do. + +We’re also ready with anti-viral treatments. If you get COVID-19, the Pfizer pill reduces your chances of ending up in the hospital by 90%. + +We’ve ordered more of these pills than anyone in the world. And Pfizer is working overtime to get us 1 Million pills this month and more than double that next month. + +And we’re launching the “Test to Treat” initiative so people can get tested at a pharmacy, and if they’re positive, receive antiviral pills on the spot at no cost. + +If you’re immunocompromised or have some other vulnerability, we have treatments and free high-quality masks. + +We’re leaving no one behind or ignoring anyone’s needs as we move forward. + +And on testing, we have made hundreds of millions of tests available for you to order for free. + +Even if you already ordered free tests tonight, I am announcing that you can order more from covidtests.gov starting next week. + +Second – we must prepare for new variants. Over the past year, we’ve gotten much better at detecting new variants. + +If necessary, we’ll be able to deploy new vaccines within 100 days instead of many more months or years. + +And, if Congress provides the funds we need, we’ll have new stockpiles of tests, masks, and pills ready if needed. + +I cannot promise a new variant won’t come. But I can promise you we’ll do everything within our power to be ready if it does. + +Third – we can end the shutdown of schools and businesses. We have the tools we need.
+ +It’s time for Americans to get back to work and fill our great downtowns again. People working from home can feel safe to begin to return to the office. + +We’re doing that here in the federal government. The vast majority of federal workers will once again work in person. + +Our schools are open. Let’s keep it that way. Our kids need to be in school. + +And with 75% of adult Americans fully vaccinated and hospitalizations down by 77%, most Americans can remove their masks, return to work, stay in the classroom, and move forward safely. + +We achieved this because we provided free vaccines, treatments, tests, and masks. + +Of course, continuing this costs money. + +I will soon send Congress a request. + +The vast majority of Americans have used these tools and may want to again, so I expect Congress to pass it quickly. + +Fourth, we will continue vaccinating the world. + +We’ve sent 475 Million vaccine doses to 112 countries, more than any other nation. + +And we won’t stop. + +We have lost so much to COVID-19. Time with one another. And worst of all, so much loss of life. + +Let’s use this moment to reset. Let’s stop looking at COVID-19 as a partisan dividing line and see it for what it is: A God-awful disease. + +Let’s stop seeing each other as enemies, and start seeing each other for who we really are: Fellow Americans. + +We can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. + +I recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. + +They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. + +Officer Mora was 27 years old. + +Officer Rivera was 22. + +Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers.
+ +I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. + +I’ve worked on these issues a long time. + +I know what works: Investing in crime prevention and community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety. + +So let’s not abandon our streets. Or choose between safety and equal justice. + +Let’s come together to protect our communities, restore trust, and hold law enforcement accountable. + +That’s why the Justice Department required body cameras, banned chokeholds, and restricted no-knock warrants for its officers. + +That’s why the American Rescue Plan provided $350 Billion that cities, states, and counties can use to hire more police and invest in proven strategies like community violence interruption—trusted messengers breaking the cycle of violence and trauma and giving young people hope. + +We should all agree: The answer is not to Defund the police. The answer is to FUND the police with the resources and training they need to protect our communities. + +I ask Democrats and Republicans alike: Pass my budget and keep our neighborhoods safe. + +And I will keep doing everything in my power to crack down on gun trafficking and ghost guns you can buy online and make at home—they have no serial numbers and can’t be traced. + +And I ask Congress to pass proven measures to reduce gun violence. Pass universal background checks. Why should anyone on a terrorist list be able to purchase a weapon? + +Ban assault weapons and high-capacity magazines. + +Repeal the liability shield that makes gun manufacturers the only industry in America that can’t be sued. + +These laws don’t infringe on the Second Amendment. They save lives. + +The most fundamental right in America is the right to vote – and to have it counted. And it’s under assault.
+ +In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. + +We cannot let this happen. + +Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. + +Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. + +One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. + +And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. + +A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. + +And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. + +We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. + +We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. + +We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. + +We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.
+ +We can do all this while keeping lit the torch of liberty that has led generations of immigrants to this land—my forefathers and so many of yours. + +Provide a pathway to citizenship for Dreamers, those on temporary status, farm workers, and essential workers. + +Revise our laws so businesses have the workers they need and families don’t wait decades to reunite. + +It’s not only the right thing to do—it’s the economically smart thing to do. + +That’s why immigration reform is supported by everyone from labor unions to religious leaders to the U.S. Chamber of Commerce. + +Let’s get it done once and for all. + +Advancing liberty and justice also requires protecting the rights of women. + +The constitutional right affirmed in Roe v. Wade—standing precedent for half a century—is under attack as never before. + +If we want to go forward—not backward—we must protect access to health care. Preserve a woman’s right to choose. And let’s continue to advance maternal health care in America. + +And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. + +As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. + +While it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. + +And soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. + +So tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. + +First, beat the opioid epidemic. + +There is so much we can do. 
Increase funding for prevention, treatment, harm reduction, and recovery. + +Get rid of outdated rules that stop doctors from prescribing treatments. And stop the flow of illicit drugs by working with state and local law enforcement to go after traffickers. + +If you’re suffering from addiction, know you are not alone. I believe in recovery, and I celebrate the 23 million Americans in recovery. + +Second, let’s take on mental health. Especially among our children, whose lives and education have been turned upside down. + +The American Rescue Plan gave schools money to hire teachers and help students make up for lost learning. + +I urge every parent to make sure your school does just that. And we can all play a part—sign up to be a tutor or a mentor. + +Children were also struggling before the pandemic. Bullying, violence, trauma, and the harms of social media. + +As Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. + +It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children. + +And let’s get all Americans the mental health services they need. More people they can turn to for help, and full parity between physical and mental health care. + +Third, support our veterans. + +Veterans are the best of us. + +I’ve always believed that we have a sacred obligation to equip all those we send to war and care for them and their families when they come home. + +My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. + +Our troops in Iraq and Afghanistan faced many dangers. + +One was stationed at bases and breathing in toxic smoke from “burn pits” that incinerated wastes of war—medical and hazard material, jet fuel, and more. 
+ +When they came home, many of the world’s fittest and best trained warriors were never the same. + +Headaches. Numbness. Dizziness. + +A cancer that would put them in a flag-draped coffin. + +I know. + +One of those soldiers was my son Major Beau Biden. + +We don’t know for sure if a burn pit was the cause of his brain cancer, or the diseases of so many of our troops. + +But I’m committed to finding out everything we can. + +Committed to military families like Danielle Robinson from Ohio. + +The widow of Sergeant First Class Heath Robinson. + +He was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. + +Stationed near Baghdad, just yards from burn pits the size of football fields. + +Heath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter. + +But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. + +Danielle says Heath was a fighter to the very end. + +He didn’t know how to stop fighting, and neither did she. + +Through her pain she found purpose to demand we do better. + +Tonight, Danielle—we are. + +The VA is pioneering new ways of linking toxic exposures to diseases, already helping more veterans get benefits. + +And tonight, I’m announcing we’re expanding eligibility to veterans suffering from nine respiratory cancers. + +I’m also calling on Congress: pass a law to make sure veterans devastated by toxic exposures in Iraq and Afghanistan finally get the benefits and comprehensive health care they deserve. + +And fourth, let’s end cancer as we know it. + +This is personal to me and Jill, to Kamala, and to so many of you. + +Cancer is the #2 cause of death in America–second only to heart disease. + +Last month, I announced our plan to supercharge +the Cancer Moonshot that President Obama asked me to lead six years ago. 
+ +Our goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases. + +More support for patients and families. + +To get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. + +It’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. + +ARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. + +A unity agenda for the nation. + +We can do this. + +My fellow Americans—tonight, we have gathered in a sacred space—the citadel of our democracy. + +In this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. + +We have fought for freedom, expanded liberty, defeated totalitarianism and terror. + +And built the strongest, freest, and most prosperous nation the world has ever known. + +Now is the hour. + +Our moment of responsibility. + +Our test of resolve and conscience, of history itself. + +It is in this moment that our character is formed. Our purpose is found. Our future is forged. + +Well I know this nation. + +We will meet the test. + +To protect freedom and liberty, to expand fairness and opportunity. + +We will save democracy. + +As hard as these times have been, I am more optimistic about America today than I have been my whole life. + +Because I see the future that is within our grasp. + +Because I know there is simply nothing beyond our capacity. + +We are the only nation on Earth that has always turned every crisis we have faced into an opportunity. + +The only nation that can be defined by a single word: possibilities. + +So on this night, in our 245th year as a nation, I have come to report on the State of the Union. + +And my report is this: the State of the Union is strong—because you, the American people, are strong. + +We are stronger today than we were a year ago. 
+ +And we will be stronger a year from now than we are today. + +Now is our moment to meet and overcome the challenges of our time. + +And we will, as one people. + +One America. + +The United States of America. + +May God bless you all. May God protect our troops. \ No newline at end of file diff --git a/docs/docs/integrations/document_loaders/image.ipynb b/docs/docs/integrations/document_loaders/image.ipynb index 8060d3bd485..be45e01d973 100644 --- a/docs/docs/integrations/document_loaders/image.ipynb +++ b/docs/docs/integrations/document_loaders/image.ipynb @@ -7,7 +7,9 @@ "source": [ "# Images\n", "\n", - "This covers how to load images such as `JPG` or `PNG` into a document format that we can use downstream." + "This covers how to load images into a document format that we can use downstream with other LangChain modules.\n", + "\n", + "It uses [Unstructured](https://unstructured.io/) to handle a wide variety of image formats, such as `.jpg` and `.png`. Please see [this guide](/docs/integrations/providers/unstructured/) for more instructions on setting up Unstructured locally, including setting up required system dependencies." 
] }, { @@ -27,63 +29,35 @@ }, "outputs": [], "source": [ - "%pip install --upgrade --quiet pdfminer" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "0cc0cd42", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain_community.document_loaders.image import UnstructuredImageLoader" + "%pip install --upgrade --quiet \"unstructured[all-docs]\"" ] }, { "cell_type": "code", "execution_count": 2, - "id": "082d557c", + "id": "0cc0cd42", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "loader = UnstructuredImageLoader(\"layout-parser-paper-fast.jpg\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df11c953", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "4284d44c", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content=\"LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\n\\n\\nā€˜Zxjiang Shen' (F3}, Ruochen Zhangā€, Melissa Dell*, Benjamin Charles Germain\\nLeet, Jacob Carlson, and Weining LiF\\n\\n\\nsugehen\\n\\nshangthrows, et\\n\\nā€œAbstract. Recent advanocs in document image analysis (DIA) have been\\nā€˜pimarliy driven bythe application of neural networks dell roar\\n{uteomer could be aly deployed in production and extended fo farther\\n[nvetigtion. However, various factory ke lcely organize codebanee\\nsnd sophisticated modal cnigurations compat the ey ree of\\nā€˜erin! 
innovation by wide sence, Though there have been sng\\nā€˜Hors to improve reuablty and simplify deep lees (DL) mode\\nā€˜aon, sone of them ae optimized for challenge inthe demain of DIA,\\nThis roprscte a major gap in the extng fol, sw DIA i eal to\\nscademic research acon wie range of dpi in the social ssencee\\n[rary for streamlining the sage of DL in DIA research and appicn\\nā€˜tons The core LayoutFaraer brary comes with a sch of simple and\\nIntative interfaee or applying and eutomiing DI. odel fr Inyo de\\npltfom for sharing both protrined modes an fal document dist\\n{ation pipeline We demonutate that LayootPareer shea fr both\\nlightweight and lrgeseledgtieation pipelines in eal-word uae ces\\nThe leary pblely smal at Btspe://layost-pareergsthab So\\n\\n\\n\\nā€˜Keywords: Document Image AnalysisĀ» Deep Learning Layout Analysis\\nā€˜Character Renguition - Open Serres dary Ā« Tol\\n\\n\\nIntroduction\\n\\n\\nā€˜Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndoctiment image analysis (DIA) tea including document image clasiffeation [I]\\n\", lookup_str='', metadata={'source': 'layout-parser-paper-fast.jpg'}, lookup_index=0)" + "Document(page_content='2021\\n\\n2103.15348v2 [cs.CV] 21 Jun\\n\\narXiv\\n\\nLayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis\\n\\nZejiang Shen! (&4), Ruochen Zhang?, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson?, and Weining Li?\\n\\n1\\n\\nAllen Institute for AI shannons@allenai.org ? Brown University ruochen_zhang@brown. edu 3 Harvard University {melissadell, jacob_carlson}@fas.harvard.edu 4 University of Washington begl@cs.washington.edu 5 University of Waterloo w4221i@uwaterloo.ca\\n\\nAbstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. 
However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https: //layout-parser.github. 
io.\\n\\nKeywords: Document Image Analysis - Deep Learning - Layout Analysis - Character Recognition - Open Source library - Toolkit.\\n\\n1 Introduction\\n\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classification [11,', metadata={'source': './example_data/layout-parser-paper-screenshot.png'})" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "from langchain_community.document_loaders.image import UnstructuredImageLoader\n", + "\n", + "loader = UnstructuredImageLoader(\"./example_data/layout-parser-paper-screenshot.png\")\n", + "\n", + "data = loader.load()\n", + "\n", "data[0]" ] }, @@ -94,47 +68,33 @@ "source": [ "### Retain Elements\n", "\n", - "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." + "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can keep that separation by specifying `mode=\"elements\"`." 
] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "id": "0fab833b", "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredImageLoader(\"layout-parser-paper-fast.jpg\", mode=\"elements\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c3e8ff1b", - "metadata": {}, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "43c23d2d", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\n', lookup_str='', metadata={'source': 'layout-parser-paper-fast.jpg', 'filename': 'layout-parser-paper-fast.jpg', 'page_number': 1, 'category': 'Title'}, lookup_index=0)" + "Document(page_content='2021', metadata={'source': './example_data/layout-parser-paper-screenshot.png', 'coordinates': {'points': ((47.0, 492.0), (47.0, 591.0), (83.0, 591.0), (83.0, 492.0)), 'system': 'PixelSpace', 'layout_width': 1624, 'layout_height': 1920}, 'last_modified': '2024-07-01T10:38:29', 'filetype': 'PNG', 'languages': ['eng'], 'page_number': 1, 'file_directory': './example_data', 'filename': 'layout-parser-paper-screenshot.png', 'category': 'UncategorizedText'})" ] }, - "execution_count": 7, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "loader = UnstructuredImageLoader(\n", + " \"./example_data/layout-parser-paper-screenshot.png\", mode=\"elements\"\n", + ")\n", + "\n", + "data = loader.load()\n", + "\n", "data[0]" ] } @@ -155,7 +115,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/microsoft_excel.ipynb b/docs/docs/integrations/document_loaders/microsoft_excel.ipynb index d7cc5e8083c..a0139b81a48 100644 --- 
a/docs/docs/integrations/document_loaders/microsoft_excel.ipynb +++ b/docs/docs/integrations/document_loaders/microsoft_excel.ipynb @@ -7,7 +7,9 @@ "source": [ "# Microsoft Excel\n", "\n", - "The `UnstructuredExcelLoader` is used to load `Microsoft Excel` files. The loader works with both `.xlsx` and `.xls` files. The page content will be the raw text of the Excel file. If you use the loader in `\"elements\"` mode, an HTML representation of the Excel file will be available in the document metadata under the `text_as_html` key." + "The `UnstructuredExcelLoader` is used to load `Microsoft Excel` files. The loader works with both `.xlsx` and `.xls` files. The page content will be the raw text of the Excel file. If you use the loader in `\"elements\"` mode, an HTML representation of the Excel file will be available in the document metadata under the `text_as_html` key.\n", + "\n", + "Please see [this guide](/docs/integrations/providers/unstructured/) for more instructions on setting up Unstructured locally, including setting up required system dependencies." 
] }, { @@ -22,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "a654e4d9", "metadata": {}, "outputs": [ @@ -36,13 +38,13 @@ { "data": { "text/plain": [ - "[Document(page_content='Stanley Cups', metadata={'source': 'example_data/stanley-cups.xlsx', 'file_directory': 'example_data', 'filename': 'stanley-cups.xlsx', 'last_modified': '2023-12-19T13:42:18', 'page_name': 'Stanley Cups', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Title'}),\n", - " Document(page_content='\\n\\n\\nTeam\\nLocation\\nStanley Cups\\n\\n\\nBlues\\nSTL\\n1\\n\\n\\nFlyers\\nPHI\\n2\\n\\n\\nMaple Leafs\\nTOR\\n13\\n\\n\\n', metadata={'source': 'example_data/stanley-cups.xlsx', 'file_directory': 'example_data', 'filename': 'stanley-cups.xlsx', 'last_modified': '2023-12-19T13:42:18', 'page_name': 'Stanley Cups', 'page_number': 1, 'text_as_html': '
Team\"Payroll (millions)\"\"Wins\"
Nationals81.3498
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
', 'languages': ['eng'], 'parent_id': '17e9a90f9616f2abed8cf32b5bd3810d', 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Table'}),\n", - " Document(page_content='Stanley Cups Since 67', metadata={'source': 'example_data/stanley-cups.xlsx', 'file_directory': 'example_data', 'filename': 'stanley-cups.xlsx', 'last_modified': '2023-12-19T13:42:18', 'page_name': 'Stanley Cups Since 67', 'page_number': 2, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Title'}),\n", - " Document(page_content='\\n\\n\\nTeam\\nLocation\\nStanley Cups\\n\\n\\nBlues\\nSTL\\n1\\n\\n\\nFlyers\\nPHI\\n2\\n\\n\\nMaple Leafs\\nTOR\\n0\\n\\n\\n', metadata={'source': 'example_data/stanley-cups.xlsx', 'file_directory': 'example_data', 'filename': 'stanley-cups.xlsx', 'last_modified': '2023-12-19T13:42:18', 'page_name': 'Stanley Cups Since 67', 'page_number': 2, 'text_as_html': '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
', 'languages': ['eng'], 'parent_id': 'ee34bd8c186b57e3530d5443ffa58122', 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Table'})]" + "[Document(page_content='Stanley Cups', metadata={'source': './example_data/stanley-cups.xlsx', 'file_directory': './example_data', 'filename': 'stanley-cups.xlsx', 'last_modified': '2023-12-19T13:42:18', 'page_name': 'Stanley Cups', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Title'}),\n", + " Document(page_content='\\n\\n\\nTeam\\nLocation\\nStanley Cups\\n\\n\\nBlues\\nSTL\\n1\\n\\n\\nFlyers\\nPHI\\n2\\n\\n\\nMaple Leafs\\nTOR\\n13\\n\\n\\n', metadata={'source': './example_data/stanley-cups.xlsx', 'file_directory': './example_data', 'filename': 'stanley-cups.xlsx', 'last_modified': '2023-12-19T13:42:18', 'page_name': 'Stanley Cups', 'page_number': 1, 'text_as_html': '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
', 'languages': ['eng'], 'parent_id': '17e9a90f9616f2abed8cf32b5bd3810d', 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Table'}),\n", + " Document(page_content='Stanley Cups Since 67', metadata={'source': './example_data/stanley-cups.xlsx', 'file_directory': './example_data', 'filename': 'stanley-cups.xlsx', 'last_modified': '2023-12-19T13:42:18', 'page_name': 'Stanley Cups Since 67', 'page_number': 2, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Title'}),\n", + " Document(page_content='\\n\\n\\nTeam\\nLocation\\nStanley Cups\\n\\n\\nBlues\\nSTL\\n1\\n\\n\\nFlyers\\nPHI\\n2\\n\\n\\nMaple Leafs\\nTOR\\n0\\n\\n\\n', metadata={'source': './example_data/stanley-cups.xlsx', 'file_directory': './example_data', 'filename': 'stanley-cups.xlsx', 'last_modified': '2023-12-19T13:42:18', 'page_name': 'Stanley Cups Since 67', 'page_number': 2, 'text_as_html': '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
', 'languages': ['eng'], 'parent_id': 'ee34bd8c186b57e3530d5443ffa58122', 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Table'})]" ] }, - "execution_count": 6, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -50,7 +52,7 @@ "source": [ "from langchain_community.document_loaders import UnstructuredExcelLoader\n", "\n", - "loader = UnstructuredExcelLoader(\"example_data/stanley-cups.xlsx\", mode=\"elements\")\n", + "loader = UnstructuredExcelLoader(\"./example_data/stanley-cups.xlsx\", mode=\"elements\")\n", "docs = loader.load()\n", "\n", "print(len(docs))\n", diff --git a/docs/docs/integrations/document_loaders/microsoft_powerpoint.ipynb b/docs/docs/integrations/document_loaders/microsoft_powerpoint.ipynb index 7d463b1e259..b80558050d1 100644 --- a/docs/docs/integrations/document_loaders/microsoft_powerpoint.ipynb +++ b/docs/docs/integrations/document_loaders/microsoft_powerpoint.ipynb @@ -9,7 +9,9 @@ "\n", ">[Microsoft PowerPoint](https://en.wikipedia.org/wiki/Microsoft_PowerPoint) is a presentation program by Microsoft.\n", "\n", - "This covers how to load `Microsoft PowerPoint` documents into a document format that we can use downstream." + "This covers how to load `Microsoft PowerPoint` documents into a document format that we can use downstream.\n", + "\n", + "Please see [this guide](/docs/integrations/providers/unstructured/) for more instructions on setting up Unstructured locally, including setting up required system dependencies." 
] }, { @@ -25,46 +27,10 @@ "%pip install python-pptx" ] }, - { - "cell_type": "code", - "execution_count": 1, - "id": "721c48aa", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredPowerPointLoader" - ] - }, { "cell_type": "code", "execution_count": 2, - "id": "9d3d0e35", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "06073f91", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c9adc5cb", + "id": "721c48aa", "metadata": { "tags": [] }, @@ -72,15 +38,21 @@ { "data": { "text/plain": [ - "[Document(page_content='Adding a Bullet Slide\\n\\nFind the bullet slide layout\\n\\nUse _TextFrame.text for first bullet\\n\\nUse _TextFrame.add_paragraph() for subsequent bullets\\n\\nHere is a lot of text!\\n\\nHere is some text in a text box!', metadata={'source': 'example_data/fake-power-point.pptx'})]" + "[Document(page_content='Adding a Bullet Slide\\n\\nFind the bullet slide layout\\n\\nUse _TextFrame.text for first bullet\\n\\nUse _TextFrame.add_paragraph() for subsequent bullets\\n\\nHere is a lot of text!\\n\\nHere is some text in a text box!', metadata={'source': './example_data/fake-power-point.pptx'})]" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "from langchain_community.document_loaders import UnstructuredPowerPointLoader\n", + "\n", + "loader = UnstructuredPowerPointLoader(\"./example_data/fake-power-point.pptx\")\n", + "\n", + "data = loader.load()\n", + "\n", "data" ] }, @@ -94,38 +66,16 @@ "Under the hood, `Unstructured` creates different \"elements\" for different chunks of text. 
By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "064f9162", - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredPowerPointLoader(\n", - " \"example_data/fake-power-point.pptx\", mode=\"elements\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "abefbbdb", - "metadata": {}, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, { "cell_type": "code", "execution_count": 4, - "id": "a547c534", + "id": "064f9162", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content='Adding a Bullet Slide', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)" + "Document(page_content='Adding a Bullet Slide', metadata={'source': './example_data/fake-power-point.pptx', 'category_depth': 0, 'file_directory': './example_data', 'filename': 'fake-power-point.pptx', 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', 'category': 'Title'})" ] }, "execution_count": 4, @@ -134,6 +84,12 @@ } ], "source": [ + "loader = UnstructuredPowerPointLoader(\n", + " \"./example_data/fake-power-point.pptx\", mode=\"elements\"\n", + ")\n", + "\n", + "data = loader.load()\n", + "\n", "data[0]" ] }, @@ -209,7 +165,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/microsoft_word.ipynb b/docs/docs/integrations/document_loaders/microsoft_word.ipynb index 7600725029a..6139d71a6de 100644 --- a/docs/docs/integrations/document_loaders/microsoft_word.ipynb +++ b/docs/docs/integrations/document_loaders/microsoft_word.ipynb @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 2, + 
"execution_count": null, "id": "7b80ea891", "metadata": {}, "outputs": [], @@ -34,52 +34,28 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "7b80ea89", "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import Docx2txtLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "99a12031", - "metadata": {}, - "outputs": [], - "source": [ - "loader = Docx2txtLoader(\"example_data/fake.docx\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b92f68b0", - "metadata": {}, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d83dd755", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})]" + "[Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': './example_data/fake.docx'})]" ] }, - "execution_count": 7, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "from langchain_community.document_loaders import Docx2txtLoader\n", + "\n", + "loader = Docx2txtLoader(\"./example_data/fake.docx\")\n", + "\n", + "data = loader.load()\n", + "\n", "data" ] }, @@ -88,57 +64,35 @@ "id": "8d40727d", "metadata": {}, "source": [ - "## Using Unstructured" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "721c48aa", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredWordDocumentLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9d3d0e35", - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredWordDocumentLoader(\"example_data/fake.docx\")" + "## Using Unstructured\n", + "\n", + "Please see [this guide](/docs/integrations/providers/unstructured/) for more instructions on setting up Unstructured locally, including setting 
up required system dependencies." ] }, { "cell_type": "code", "execution_count": 3, - "id": "06073f91", - "metadata": {}, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c9adc5cb", + "id": "721c48aa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx'}, lookup_index=0)]" + "[Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "from langchain_community.document_loaders import UnstructuredWordDocumentLoader\n", + "\n", + "loader = UnstructuredWordDocumentLoader(\"example_data/fake.docx\")\n", + "\n", + "data = loader.load()\n", + "\n", "data" ] }, @@ -157,39 +111,23 @@ "execution_count": 5, "id": "064f9162", "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredWordDocumentLoader(\"example_data/fake.docx\", mode=\"elements\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "abefbbdb", - "metadata": {}, - "outputs": [], - "source": [ - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "a547c534", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx', 'filename': 'fake.docx', 'category': 'Title'}, lookup_index=0)" + "Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': './example_data/fake.docx', 'category_depth': 0, 'file_directory': './example_data', 'filename': 'fake.docx', 'last_modified': '2023-12-19T13:42:18', 'languages': ['por', 'cat'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title'})" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": 
{}, "output_type": "execute_result" } ], "source": [ + "loader = UnstructuredWordDocumentLoader(\"./example_data/fake.docx\", mode=\"elements\")\n", + "\n", + "data = loader.load()\n", + "\n", "data[0]" ] }, @@ -263,7 +201,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/odt.ipynb b/docs/docs/integrations/document_loaders/odt.ipynb index f36ace1da6b..ea339bce06a 100644 --- a/docs/docs/integrations/document_loaders/odt.ipynb +++ b/docs/docs/integrations/document_loaders/odt.ipynb @@ -19,29 +19,21 @@ "execution_count": 1, "id": "e6616e3a", "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredODTLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a654e4d9", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.odt', 'filename': 'example_data/fake.odt', 'category': 'Title'})" + "Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.odt', 'category_depth': 0, 'file_directory': 'example_data', 'filename': 'fake.odt', 'last_modified': '2023-12-19T13:42:18', 'languages': ['por', 'cat'], 'filetype': 'application/vnd.oasis.opendocument.text', 'category': 'Title'})" ] }, - "execution_count": 2, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "from langchain_community.document_loaders import UnstructuredODTLoader\n", + "\n", "loader = UnstructuredODTLoader(\"example_data/fake.odt\", mode=\"elements\")\n", "docs = loader.load()\n", "docs[0]" @@ -72,7 +64,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/org_mode.ipynb 
b/docs/docs/integrations/document_loaders/org_mode.ipynb index 39bab739458..754963c6707 100644 --- a/docs/docs/integrations/document_loaders/org_mode.ipynb +++ b/docs/docs/integrations/document_loaders/org_mode.ipynb @@ -22,35 +22,23 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredOrgModeLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredOrgModeLoader(file_path=\"example_data/README.org\", mode=\"elements\")\n", - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "page_content='Example Docs' metadata={'source': 'example_data/README.org', 'filename': 'README.org', 'file_directory': 'example_data', 'filetype': 'text/org', 'page_number': 1, 'category': 'Title'}\n" + "page_content='Example Docs' metadata={'source': './example_data/README.org', 'category_depth': 0, 'last_modified': '2023-12-19T13:42:18', 'languages': ['eng'], 'filetype': 'text/org', 'file_directory': './example_data', 'filename': 'README.org', 'category': 'Title'}\n" ] } ], "source": [ + "from langchain_community.document_loaders import UnstructuredOrgModeLoader\n", + "\n", + "loader = UnstructuredOrgModeLoader(\n", + " file_path=\"./example_data/README.org\", mode=\"elements\"\n", + ")\n", + "docs = loader.load()\n", + "\n", "print(docs[0])" ] }, @@ -78,7 +66,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/rst.ipynb b/docs/docs/integrations/document_loaders/rst.ipynb index f7cff53fac5..e69678bbc92 100644 --- a/docs/docs/integrations/document_loaders/rst.ipynb +++ b/docs/docs/integrations/document_loaders/rst.ipynb @@ -22,35 +22,21 @@ 
"cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredRSTLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredRSTLoader(file_path=\"example_data/README.rst\", mode=\"elements\")\n", - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "page_content='Example Docs' metadata={'source': 'example_data/README.rst', 'filename': 'README.rst', 'file_directory': 'example_data', 'filetype': 'text/x-rst', 'page_number': 1, 'category': 'Title'}\n" + "page_content='Example Docs' metadata={'source': './example_data/README.rst', 'category_depth': 0, 'last_modified': '2023-12-19T13:42:18', 'languages': ['eng'], 'filetype': 'text/x-rst', 'file_directory': './example_data', 'filename': 'README.rst', 'category': 'Title'}\n" ] } ], "source": [ + "from langchain_community.document_loaders import UnstructuredRSTLoader\n", + "\n", + "loader = UnstructuredRSTLoader(file_path=\"./example_data/README.rst\", mode=\"elements\")\n", + "docs = loader.load()\n", + "\n", "print(docs[0])" ] }, @@ -78,7 +64,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/tsv.ipynb b/docs/docs/integrations/document_loaders/tsv.ipynb index 9d8e192c43d..1213e8cda7b 100644 --- a/docs/docs/integrations/document_loaders/tsv.ipynb +++ b/docs/docs/integrations/document_loaders/tsv.ipynb @@ -22,27 +22,6 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders.tsv import UnstructuredTSVLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - 
"source": [ - "loader = UnstructuredTSVLoader(\n", - " file_path=\"example_data/mlb_teams_2012.csv\", mode=\"elements\"\n", - ")\n", - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [ { "name": "stdout", @@ -51,6 +30,9 @@ "\n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -146,6 +128,13 @@ } ], "source": [ + "from langchain_community.document_loaders.tsv import UnstructuredTSVLoader\n", + "\n", + "loader = UnstructuredTSVLoader(\n", + " file_path=\"./example_data/mlb_teams_2012.csv\", mode=\"elements\"\n", + ")\n", + "docs = loader.load()\n", + "\n", "print(docs[0].metadata[\"text_as_html\"])" ] }, @@ -173,7 +162,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/unstructured_file.ipynb b/docs/docs/integrations/document_loaders/unstructured_file.ipynb index 9664ab67fc5..bd171c13b8e 100644 --- a/docs/docs/integrations/document_loaders/unstructured_file.ipynb +++ b/docs/docs/integrations/document_loaders/unstructured_file.ipynb @@ -7,18 +7,31 @@ "source": [ "# Unstructured File\n", "\n", - "This notebook covers how to use `Unstructured` package to load files of many types. `Unstructured` currently supports loading of text files, powerpoints, html, pdfs, images, and more." + "This notebook covers how to use `Unstructured` package to load files of many types. `Unstructured` currently supports loading of text files, powerpoints, html, pdfs, images, and more.\n", + "\n", + "Please see [this guide](/docs/integrations/providers/unstructured/) for more instructions on setting up Unstructured locally, including setting up required system dependencies." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "2886982e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.1.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "# # Install package\n", - "%pip install --upgrade --quiet \"unstructured[all-docs]\"" + "%pip install --upgrade --quiet \"unstructured[all-docs]\"" ] }, { @@ -51,39 +64,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "79d3e549", "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredFileLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2593d1dc", - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredFileLoader(\"./example_data/state_of_the_union.txt\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "fe34e941", - "metadata": {}, - "outputs": [], - "source": [ - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "ee449788", - "metadata": {}, "outputs": [ { "data": { @@ -91,12 +74,18 @@ "'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\\n\\nLast year COVID-19 kept us apart. This year we are finally together again.\\n\\nTonight, we meet as Democrats Republicans and Independents. 
But most importantly as Americans.\\n\\nWith a duty to one another to the American people to the Constit'" ] }, - "execution_count": 7, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "from langchain_community.document_loaders import UnstructuredFileLoader\n", + "\n", + "loader = UnstructuredFileLoader(\"./example_data/state_of_the_union.txt\")\n", + "\n", + "docs = loader.load()\n", + "\n", "docs[0].page_content[:400]" ] }, @@ -110,41 +99,28 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "092d9a0b", "metadata": {}, - "outputs": [], - "source": [ - "files = [\"./example_data/whatsapp_chat.txt\", \"./example_data/layout-parser-paper.pdf\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f841c4f8", - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredFileLoader(files)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "993c240b", - "metadata": {}, - "outputs": [], - "source": [ - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ce4ff07", - "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'1/22/23, 6:30 PM - User 1: Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!\\n\\n1/22/23, 8:24 PM - User 2: Goodmorning! 
$50 is too low.\\n\\n1/23/23, 2:59 AM - User 1: How much do you want?\\n\\n1/23/23, 3:00 AM - User 2: Online is at least $100\\n\\n1/23/23, 3:01 AM - User 2: Here is $129\\n\\n1/23/23, 3:01 AM - User 2: \\n\\n1/23/23, 3:01 AM - User 1: Im not int'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "files = [\"./example_data/whatsapp_chat.txt\", \"./example_data/layout-parser-paper.pdf\"]\n", + "\n", + "loader = UnstructuredFileLoader(files)\n", + "\n", + "docs = loader.load()\n", + "\n", "docs[0].page_content[:400]" ] }, @@ -160,48 +136,32 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": "ff5b616d", "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredFileLoader(\n", - " \"./example_data/state_of_the_union.txt\", mode=\"elements\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "feca3b6c", - "metadata": {}, - "outputs": [], - "source": [ - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "fec5bbac", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n", - " Document(page_content='Last year COVID-19 kept us apart. This year we are finally together again.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n", - " Document(page_content='Tonight, we meet as Democrats Republicans and Independents. 
But most importantly as Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n", - " Document(page_content='With a duty to one another to the American people to the Constitution.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n", - " Document(page_content='And with an unwavering resolve that freedom will always triumph over tyranny.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0)]" + "[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.', metadata={'source': './example_data/state_of_the_union.txt', 'file_directory': './example_data', 'filename': 'state_of_the_union.txt', 'last_modified': '2024-07-01T11:18:22', 'languages': ['eng'], 'filetype': 'text/plain', 'category': 'NarrativeText'}),\n", + " Document(page_content='Last year COVID-19 kept us apart. This year we are finally together again.', metadata={'source': './example_data/state_of_the_union.txt', 'file_directory': './example_data', 'filename': 'state_of_the_union.txt', 'last_modified': '2024-07-01T11:18:22', 'languages': ['eng'], 'filetype': 'text/plain', 'category': 'NarrativeText'}),\n", + " Document(page_content='Tonight, we meet as Democrats Republicans and Independents. 
But most importantly as Americans.', metadata={'source': './example_data/state_of_the_union.txt', 'file_directory': './example_data', 'filename': 'state_of_the_union.txt', 'last_modified': '2024-07-01T11:18:22', 'languages': ['eng'], 'filetype': 'text/plain', 'category': 'NarrativeText'}),\n", + " Document(page_content='With a duty to one another to the American people to the Constitution.', metadata={'source': './example_data/state_of_the_union.txt', 'file_directory': './example_data', 'filename': 'state_of_the_union.txt', 'last_modified': '2024-07-01T11:18:22', 'languages': ['eng'], 'filetype': 'text/plain', 'category': 'UncategorizedText'}),\n", + " Document(page_content='And with an unwavering resolve that freedom will always triumph over tyranny.', metadata={'source': './example_data/state_of_the_union.txt', 'file_directory': './example_data', 'filename': 'state_of_the_union.txt', 'last_modified': '2024-07-01T11:18:22', 'languages': ['eng'], 'filetype': 'text/plain', 'category': 'NarrativeText'})]" ] }, - "execution_count": 12, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "loader = UnstructuredFileLoader(\n", + " \"./example_data/state_of_the_union.txt\", mode=\"elements\"\n", + ")\n", + "\n", + "docs = loader.load()\n", + "\n", "docs[:5]" ] }, @@ -217,59 +177,35 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "id": "767238a4", "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredFileLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9518b425", - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredFileLoader(\n", - " \"layout-parser-paper-fast.pdf\", strategy=\"fast\", mode=\"elements\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "645f29e9", - "metadata": {}, - "outputs": [], - "source": [ - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - 
"execution_count": 4, - "id": "60685353", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='1', lookup_str='', metadata={'source': 'layout-parser-paper-fast.pdf', 'filename': 'layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'UncategorizedText'}, lookup_index=0),\n", - " Document(page_content='2', lookup_str='', metadata={'source': 'layout-parser-paper-fast.pdf', 'filename': 'layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'UncategorizedText'}, lookup_index=0),\n", - " Document(page_content='0', lookup_str='', metadata={'source': 'layout-parser-paper-fast.pdf', 'filename': 'layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'UncategorizedText'}, lookup_index=0),\n", - " Document(page_content='2', lookup_str='', metadata={'source': 'layout-parser-paper-fast.pdf', 'filename': 'layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'UncategorizedText'}, lookup_index=0),\n", - " Document(page_content='n', lookup_str='', metadata={'source': 'layout-parser-paper-fast.pdf', 'filename': 'layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'Title'}, lookup_index=0)]" + "[Document(page_content='2 v 8 4 3 5 1 . 
3 0 1 2 : v i X r a', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 393.9), (16.34, 560.0), (36.34, 560.0), (36.34, 393.9)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'parent_id': '89565df026a24279aaea20dc08cedbec', 'filetype': 'application/pdf', 'category': 'UncategorizedText'}),\n", + " Document(page_content='LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((157.62199999999999, 114.23496279999995), (157.62199999999999, 146.5141628), (457.7358962799999, 146.5141628), (457.7358962799999, 114.23496279999995)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'Title'}),\n", + " Document(page_content='Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain Lee4, Jacob Carlson3, and Weining Li5', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((134.809, 168.64029940800003), (134.809, 192.2517444), (480.5464199080001, 192.2517444), (480.5464199080001, 168.64029940800003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'parent_id': 'bde0b230a1aa488e3ce837d33015181b', 'filetype': 'application/pdf', 'category': 'UncategorizedText'}),\n", + " Document(page_content='1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University 
{melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((207.23000000000002, 202.57205439999996), (207.23000000000002, 311.8195408), (408.12676, 311.8195408), (408.12676, 202.57205439999996)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'parent_id': 'bde0b230a1aa488e3ce837d33015181b', 'filetype': 'application/pdf', 'category': 'UncategorizedText'}),\n", + " Document(page_content='Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. 
To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((162.779, 338.45008160000003), (162.779, 566.8455408), (454.0372021523199, 566.8455408), (454.0372021523199, 338.45008160000003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'links': [{'text': ':// layout - parser . github . io', 'url': 'https://layout-parser.github.io', 'start_index': 1477}], 'page_number': 1, 'parent_id': 'bde0b230a1aa488e3ce837d33015181b', 'filetype': 'application/pdf', 'category': 'NarrativeText'})]" ] }, - "execution_count": 4, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "docs[:5]" + "from langchain_community.document_loaders import UnstructuredFileLoader\n", + "\n", + "loader = UnstructuredFileLoader(\n", + " \"./example_data/layout-parser-paper.pdf\", strategy=\"fast\", mode=\"elements\"\n", + ")\n", + "\n", + "docs = loader.load()\n", + "\n", + "docs[5:10]" ] }, { @@ -287,59 +223,33 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "8ca8a648", - "metadata": {}, - "outputs": [], - "source": [ - "!wget https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/example-docs/layout-parser-paper.pdf -P \"../../\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "id": "686e5eb4", "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredFileLoader(\n", - " \"./example_data/layout-parser-paper.pdf\", mode=\"elements\"\n", - ")" - ] - 
}, - { - "cell_type": "code", - "execution_count": null, - "id": "c90f0e94", - "metadata": {}, - "outputs": [], - "source": [ - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "6ec859d8", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis', lookup_str='', metadata={'source': '../../layout-parser-paper.pdf'}, lookup_index=0),\n", - " Document(page_content='Zejiang Shen 1 ( (ea)\\n ), Ruochen Zhang 2 , Melissa Dell 3 , Benjamin Charles Germain Lee 4 , Jacob Carlson 3 , and Weining Li 5', lookup_str='', metadata={'source': '../../layout-parser-paper.pdf'}, lookup_index=0),\n", - " Document(page_content='Allen Institute for AI shannons@allenai.org', lookup_str='', metadata={'source': '../../layout-parser-paper.pdf'}, lookup_index=0),\n", - " Document(page_content='Brown University ruochen zhang@brown.edu', lookup_str='', metadata={'source': '../../layout-parser-paper.pdf'}, lookup_index=0),\n", - " Document(page_content='Harvard University { melissadell,jacob carlson } @fas.harvard.edu', lookup_str='', metadata={'source': '../../layout-parser-paper.pdf'}, lookup_index=0)]" + "[Document(page_content='2 v 8 4 3 5 1 . 
3 0 1 2 : v i X r a', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 393.9), (16.34, 560.0), (36.34, 560.0), (36.34, 393.9)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'parent_id': '89565df026a24279aaea20dc08cedbec', 'filetype': 'application/pdf', 'category': 'UncategorizedText'}),\n", + " Document(page_content='LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((157.62199999999999, 114.23496279999995), (157.62199999999999, 146.5141628), (457.7358962799999, 146.5141628), (457.7358962799999, 114.23496279999995)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'Title'}),\n", + " Document(page_content='Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain Lee4, Jacob Carlson3, and Weining Li5', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((134.809, 168.64029940800003), (134.809, 192.2517444), (480.5464199080001, 192.2517444), (480.5464199080001, 168.64029940800003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'parent_id': 'bde0b230a1aa488e3ce837d33015181b', 'filetype': 'application/pdf', 'category': 'UncategorizedText'}),\n", + " Document(page_content='1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University 
{melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((207.23000000000002, 202.57205439999996), (207.23000000000002, 311.8195408), (408.12676, 311.8195408), (408.12676, 202.57205439999996)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'parent_id': 'bde0b230a1aa488e3ce837d33015181b', 'filetype': 'application/pdf', 'category': 'UncategorizedText'}),\n", + " Document(page_content='Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. 
To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((162.779, 338.45008160000003), (162.779, 566.8455408), (454.0372021523199, 566.8455408), (454.0372021523199, 338.45008160000003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'links': [{'text': ':// layout - parser . github . io', 'url': 'https://layout-parser.github.io', 'start_index': 1477}], 'page_number': 1, 'parent_id': 'bde0b230a1aa488e3ce837d33015181b', 'filetype': 'application/pdf', 'category': 'NarrativeText'})]" ] }, - "execution_count": 1, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "docs[:5]" + "loader = UnstructuredFileLoader(\n", + " \"./example_data/layout-parser-paper.pdf\", mode=\"elements\"\n", + ")\n", + "\n", + "docs = loader.load()\n", + "\n", + "docs[5:10]" ] }, { @@ -352,62 +262,38 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 14, "id": "112e5538", "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredFileLoader\n", - "from unstructured.cleaners.core import clean_extra_whitespace" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b9c5ac8d", - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredFileLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"elements\",\n", - " post_processors=[clean_extra_whitespace],\n", - ")" - ] - }, - { - "cell_type": "code", - 
"execution_count": 4, - "id": "c44d5def", - "metadata": {}, - "outputs": [], - "source": [ - "docs = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "b6f27929", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((157.62199999999999, 114.23496279999995), (157.62199999999999, 146.5141628), (457.7358962799999, 146.5141628), (457.7358962799999, 114.23496279999995)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'}),\n", - " Document(page_content='Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain Lee4, Jacob Carlson3, and Weining Li5', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((134.809, 168.64029940800003), (134.809, 192.2517444), (480.5464199080001, 192.2517444), (480.5464199080001, 168.64029940800003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n", - " Document(page_content='1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((207.23000000000002, 202.57205439999996), (207.23000000000002, 311.8195408), (408.12676, 311.8195408), (408.12676, 202.57205439999996)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 
'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n", - " Document(page_content='1 2 0 2', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n", - " Document(page_content='n u J', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 258.36), (16.34, 286.14), (36.34, 286.14), (36.34, 258.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'})]" + "[Document(page_content='2 v 8 4 3 5 1 . 
3 0 1 2 : v i X r a', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 393.9), (16.34, 560.0), (36.34, 560.0), (36.34, 393.9)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'parent_id': '89565df026a24279aaea20dc08cedbec', 'filetype': 'application/pdf', 'category': 'UncategorizedText'}),\n", + " Document(page_content='LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((157.62199999999999, 114.23496279999995), (157.62199999999999, 146.5141628), (457.7358962799999, 146.5141628), (457.7358962799999, 114.23496279999995)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'Title'}),\n", + " Document(page_content='Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain Lee4, Jacob Carlson3, and Weining Li5', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((134.809, 168.64029940800003), (134.809, 192.2517444), (480.5464199080001, 192.2517444), (480.5464199080001, 168.64029940800003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'parent_id': 'bde0b230a1aa488e3ce837d33015181b', 'filetype': 'application/pdf', 'category': 'UncategorizedText'}),\n", + " Document(page_content='1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University 
{melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((207.23000000000002, 202.57205439999996), (207.23000000000002, 311.8195408), (408.12676, 311.8195408), (408.12676, 202.57205439999996)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'page_number': 1, 'parent_id': 'bde0b230a1aa488e3ce837d33015181b', 'filetype': 'application/pdf', 'category': 'UncategorizedText'}),\n", + " Document(page_content='Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. 
To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((162.779, 338.45008160000003), (162.779, 566.8455408), (454.0372021523199, 566.8455408), (454.0372021523199, 338.45008160000003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2023-12-19T13:42:18', 'links': [{'text': ':// layout - parser . github . io', 'url': 'https://layout-parser.github.io', 'start_index': 1477}], 'page_number': 1, 'parent_id': 'bde0b230a1aa488e3ce837d33015181b', 'filetype': 'application/pdf', 'category': 'NarrativeText'})]" ] }, - "execution_count": 5, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "docs[:5]" + "from langchain_community.document_loaders import UnstructuredFileLoader\n", + "from unstructured.cleaners.core import clean_extra_whitespace\n", + "\n", + "loader = UnstructuredFileLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"elements\",\n", + " post_processors=[clean_extra_whitespace],\n", + ")\n", + "\n", + "docs = loader.load()\n", + "\n", + "docs[5:10]" ] }, { @@ -420,39 +306,6 @@ "If you want to get up and running with less set up, you can simply run `pip install unstructured` and use `UnstructuredAPIFileLoader` or `UnstructuredAPIFileIOLoader`. That will process your document using the hosted Unstructured API. You can generate a free Unstructured API key [here](https://www.unstructured.io/api-key/). 
The [Unstructured documentation](https://unstructured-io.github.io/unstructured/) page will have instructions on how to generate an API key once they’re available. Check out the instructions [here](https://github.com/Unstructured-IO/unstructured-api#dizzy-instructions-for-using-the-docker-image) if you’d like to self-host the Unstructured API or run it locally." ] }, - { - "cell_type": "code", - "execution_count": 1, - "id": "b50c70bc", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredAPIFileLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "12b6d2cf", - "metadata": {}, - "outputs": [], - "source": [ - "filenames = [\"example_data/fake.docx\", \"example_data/fake-email.eml\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "39a9894d", - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredAPIFileLoader(\n", - " file_path=filenames[0],\n", - " api_key=\"FAKE_API_KEY\",\n", - ")" - ] - }, { "cell_type": "code", "execution_count": 4, @@ -471,6 +324,15 @@ } ], "source": [ + "from langchain_community.document_loaders import UnstructuredAPIFileLoader\n", + "\n", + "filenames = [\"example_data/fake.docx\", \"example_data/fake-email.eml\"]\n", + "\n", + "loader = UnstructuredAPIFileLoader(\n", + " file_path=filenames[0],\n", + " api_key=\"FAKE_API_KEY\",\n", + ")\n", + "\n", "docs = loader.load()\n", "docs[0]" ] @@ -483,19 +345,6 @@ "You can also batch multiple files through the Unstructured API in a single API call using `UnstructuredAPIFileLoader`." 
] }, - { - "cell_type": "code", - "execution_count": 5, - "id": "79a18e7e", - "metadata": {}, - "outputs": [], - "source": [ - "loader = UnstructuredAPIFileLoader(\n", - " file_path=filenames,\n", - " api_key=\"FAKE_API_KEY\",\n", - ")" - ] - }, { "cell_type": "code", "execution_count": 6, @@ -514,6 +363,11 @@ } ], "source": [ + "loader = UnstructuredAPIFileLoader(\n", + " file_path=filenames,\n", + " api_key=\"FAKE_API_KEY\",\n", + ")\n", + "\n", "docs = loader.load()\n", "docs[0]" ] @@ -543,7 +397,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.0" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/url.ipynb b/docs/docs/integrations/document_loaders/url.ipynb index bc26f369618..71747c7d5cd 100644 --- a/docs/docs/integrations/document_loaders/url.ipynb +++ b/docs/docs/integrations/document_loaders/url.ipynb @@ -7,25 +7,11 @@ "source": [ "# URL\n", "\n", - "This example covers how to load `HTML` documents from a list of `URLs` into the `Document` format that we can use downstream." 
- ] - }, - { - "cell_type": "markdown", - "id": "5ccca101-b167-43bc-849e-9d456b16a123", - "metadata": { - "execution": { - "iopub.execute_input": "2024-04-02T00:13:43.279309Z", - "iopub.status.busy": "2024-04-02T00:13:43.278977Z", - "iopub.status.idle": "2024-04-02T00:13:43.282230Z", - "shell.execute_reply": "2024-04-02T00:13:43.281907Z", - "shell.execute_reply.started": "2024-04-02T00:13:43.279282Z" - } - }, - "source": [ + "This example covers how to load `HTML` documents from a list of `URLs` into the `Document` format that we can use downstream.\n", + "\n", "## Unstructured URL Loader\n", "\n", - "You have to install the `unstructured` library:" + "For the examples below, please install the `unstructured` library and see [this guide](/docs/integrations/providers/unstructured/) for more instructions on setting up Unstructured locally, including setting up required system dependencies:" ] }, { @@ -35,26 +21,18 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -U unstructured" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "16c3699e", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredURLLoader" + "%pip install --upgrade --quiet unstructured" ] }, { "cell_type": "code", "execution_count": 2, - "id": "836fbac1", + "id": "16c3699e", "metadata": {}, "outputs": [], "source": [ + "from langchain_community.document_loaders import UnstructuredURLLoader\n", + "\n", "urls = [\n", " \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023\",\n", " \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023\",\n", @@ -66,7 +44,7 @@ "id": "33089aba-ff74-4d00-8f40-9449c29587cc", "metadata": {}, "source": [ - "Pass in ssl_verify=False with headers=headers to get past ssl_verification error." + "Pass in ssl_verify=False with headers=headers to get past ssl_verification errors." 
] }, { @@ -74,19 +52,24 @@ "execution_count": 3, "id": "00f46fda", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='Skip to main content\\n\\nSearch form\\n\\nHome\\n\\nWho We Are\\n\\nResearch\\n\\nPublications\\n\\nGet Involved\\n\\nPlanned Giving\\n\\nDonate\\n\\nRussian Offensive Campaign Assessment, February 8, 2023\\n\\nFeb 8, 2023 - ISW Press\\n\\nDownload the PDF\\n\\nKarolina Hird, Riley Bailey, George Barros, Layne Philipson, Nicole Wolkov, and Mason Clark\\n\\nFebruary 8, 8:30pm ET\\n\\nClick\\xa0here\\xa0to see ISW’s interactive map of the Russian invasion of Ukraine. This map is updated daily alongside the static maps present in this report.\\n\\nRussian forces have regained the initiative in Ukraine and have begun their next major offensive in Luhansk Oblast.\\xa0The pace of Russian operations along the Svatove-Kreminna line in western Luhansk Oblast has increased markedly over the past week, and Russian sources are widely reporting that conventional Russian troops are attacking Ukrainian defensive lines and making marginal advances along the Kharkiv-Luhansk Oblast border, particularly northwest of Svatove near Kupyansk and west of Kreminna.[1]\\xa0Geolocated combat footage has confirmed Russian gains in the Dvorichne area northwest of Svatove.[2]\\xa0Russian military command additionally appears to have fully committed elements of several conventional divisions to decisive offensive operations along the Svatove-Kreminna line, as ISW previously reported.[3]\\xa0Elements of several regiments of the 144th\\xa0and 3rd\\xa0Motor Rifle Division (20th\\xa0Combined Arms Army, Western Military District) and a regiment of the 90th\\xa0Tank Division (Central Military District), supported by elements of the 76th\\xa0Airborne Division and unspecified Southern Military District elements, are conducting offensive operations along the entire Svatove-Kreminna line and are reportedly advancing against 
Ukrainian defenses.[4]\\n\\nThe commitment of significant elements of at least three major Russian divisions to offensive operations in this sector indicates the Russian offensive has begun, even if Ukrainian forces are so far preventing Russian forces from securing significant gains.\\xa0The Russian offensive likely has not yet reached its full tempo; Russian command has not yet committed elements of the 2nd\\xa0Motorized Rifle Division (1st\\xa0Guards Tank Army, Western Military District), which deployed to Luhansk Oblast in January after deploying to Belarus.[5]\\xa0Russian forces are gradually beginning an offensive, but its success is not inherent or predetermined. While Russian forces in Luhansk Oblast now have the initiative (in that Russian forces are setting the terms of battle, ending the period of Ukrainian initiative from August 2022), the full commitment of these forces could lead to their eventual culmination along the Svatove-Kreminna line without achieving their objectives of capturing all of Luhansk and Donetsk oblasts. 
That culmination would likely provide a window of opportunity for Ukrainian forces to exploit with their own counteroffensive.[6]\\n\\nDonetsk People’s Republic (DNR) People’s Militia command reportedly assumed control over a Russian artillery battalion, likely in support of an effort to strengthen degraded DNR forces ahead of an imminent Russian offensive.\\xa0A Russian source published a video appeal from mobilized personnel of the 640th\\xa0howitzer battalion from Saratov Oblast on February 8 in which they stated that Russian military officials sent them to join DNR units and that DNR commanders are now trying to transfer them to infantry assault units.[7]\\xa0ISW has not previously observed Russian personnel subordinated to a DNR formation and this claim, if true, would suggest that Russian forces may be reinforcing degraded DNR formations with mobilized personnel from Russia itself because DNR formations are unable to replenish losses themselves. The reported subordination of Russian military personnel to DNR formations may portend a Russian effort to prepare DNR formations for an expanded role in their zone of responsibility along the western outskirts of Donetsk City, and the transfer of remaining conventional Russian forces from this area to the Bakhmut area and Luhansk Oblast, where Russian forces are conducting an increased pace of offensive operations.\\n\\nThe reported subordination of Russian mobilized personnel to DNR formations could also suggest that Russian military command may be continuing efforts to integrate ad hoc DNR and Luhansk Peopleā€˜s Republic (LNR) formations into the Russian Armed Forces, but will likely face significant difficulties.\\xa0The Russian Southern Military District formally controls the armed forces of the DNR and LNR through the 1st\\xa0and 2nd\\xa0Army Corps, respectively. However, many DNR and LNR formations remain ad hoc units and are not fully integrated into Russian MoD structures. 
ISW previously assessed that the Russian Ministry of Defense (MoD) appears to be rushing to integrate irregular conventional forces into a more traditional structure and may be creating new formations from DNR/LNR units in support of Russian Defense Minister Sergei Shoigu’s proposals to create new maneuver divisions.[8]\\xa0Russian forces would likely need to temporarily remove these irregular forces from frontline positions to integrate them into new Russian formations, a prospect that would not be operationally sound ahead of increased Russan offensive operations in Ukraine. Russian officials therefore may be attempting to gradually integrate these irregular formations through subordinating mobilized personnel under them without disrupting the command structures and existing personnel operating at front line positions. The mobilized personnel of the 640th\\xa0howitzer battalion claimed that DNR command is retraining assault units for artillery purposes yet still committing their artillery battalion to infantry roles, indicating a breakdown in command and the proper utilization of personnel among DNR formations.[9]\\xa0The Russian MoD will likely struggle to correct the poor effectiveness of DNR/LNR forces through the rapid integration of Russian personnel.\\n\\nRussian officials continue to propose measures to prepare Russia’s military industry for a protracted war in Ukraine while also likely setting further conditions for sanctions evasion.\\xa0Russian Prime Minister Mikhail Mishustin stated on February 8 that the Russian government will subsidize investment projects for the modernization of enterprises operating in the interests of the Russian military and will allocate significant funds for manufacturing new military equipment.[10]\\xa0Mishustin also stated that the Russian government would extend benefits to Russian entrepreneurs who support the Russian military, including extended payment periods on rented federal property.[11]\\xa0The Kremlin likely 
intends these measures to augment its overarching effort to gradually prepare Russia’s military industry for a protracted war in Ukraine while avoiding a wider economic mobilization that would create further domestic economic disruptions and corresponding discontent.[12]\\n\\nRussian officials also likely proposed these measures in coordination with a recent decree excluding Russian officials from requirements to list income declarations and proposals to repeal federal procurement procedures. The Kremlin may be creating a system of subsidies and benefits designed to have little oversight or accounting. This lack of oversight and accounting would likely allow Russian firms to better evade international sanctions regimes targeting Russia’s military industry.[13]\\xa0The United Kingdom announced a new list of sanctioned entities on February 8 focused on Russia’s military industry.[14]\\xa0ISW previously reported that 82% of Iranian-made drones downed in Ukraine had chips, semiconductors, and other components from the United States, suggesting that Russia and Iran are likely exploiting loopholes to transfer Western-produced arms components to Russia via proxy actors.[15]\\xa0The Kremlin’s effort to prepare the Russian military industry for a protracted war in Ukraine in part relies on the ability of Russian military industry to have consistent access to multiple secure supply chains of key foreign components that it otherwise cannot produce.\\n\\nKey Takeaways\\n\\nRussian forces have regained the initiative in Ukraine and have begun their next major offensive in Luhansk Oblast.\\n\\nThe commitment of significant elements of at least three major Russian divisions to offensive operations in this sector indicates the Russian offensive has begun, even if Ukrainian forces are so far preventing Russian forces from securing significant gains.\\n\\nDonetsk People’s Republic (DNR) People’s Militia command reportedly assumed control over a Russian artillery battalion, likely in 
support of an effort to strengthen degraded DNR forces ahead of an imminent Russian offensive.\\n\\nThe reported subordination of Russian mobilized personnel to DNR formations could also suggest that Russian military command may be continuing efforts to integrate ad hoc DNR and Luhansk Peopleā€˜s Republic (LNR) formations into the Russian Armed Forces, but will likely face significant difficulties.\\n\\nRussian officials continue to propose measures to prepare Russia’s military industry for a protracted war in Ukraine while also likely setting further conditions for sanctions evasion.\\n\\nRussian forces conducted ground attacks around Bakhmut and continued making tactical advances.\\n\\nRussian forces continued offensive actions northwest of Svatove and intensified offensive operations near Kreminna.\\n\\nRussian forces conducted limited ground attacks in the Avdiivka-Donetsk City area and western Donetsk Oblast.\\n\\nRussian and Ukrainian forces reportedly continue small-scale skirmishes and reconnaissance activity in the Dnipro River delta and on the Kinburn Spit.\\n\\nThe Wagner Group is reportedly resorting to more coercive tactics in its prison recruitment campaign, possibly in response to the campaign’s declining effectiveness.\\n\\nWe do not report in detail on Russian war crimes because those activities are well-covered in Western media and do not directly affect the military operations we are assessing and forecasting. We will continue to evaluate and report on the effects of these criminal activities on the Ukrainian military and population and specifically on combat in Ukrainian urban areas. 
We utterly condemn these Russian violations of the laws of armed conflict, Geneva Conventions, and humanity even though we do not describe them in these reports.\\n\\nUkrainian Counteroffensives—Eastern Ukraine\\n\\nRussian Main Effort—Eastern Ukraine (comprised of two subordinate main efforts);\\n\\nRussia Subordinate Main Effort #1—Capture the remainder of Luhansk Oblast and push westward into eastern Kharkiv Oblast and encircle northern Donetsk Oblast\\n\\nRussian Subordinate Main Effort #2—Capture the entirety of Donetsk Oblast\\n\\nRussian Supporting Effort—Southern Axis\\n\\nRussian Mobilization and Force Generation Efforts\\n\\nActivities in Russian-occupied Areas\\n\\nRussian Main Effort—Eastern Ukraine\\n\\nRussian Subordinate Main Effort #1- Luhansk Oblast (Russian objective: Capture the remainder of Luhansk Oblast and continue offensive operations into eastern Kharkiv Oblast and\\xa0\\xa0northern Donetsk Oblast)\\n\\nISW continues to assess the current Russian most likely course of action (MLCOA) is an imminent offensive effort in Luhansk Oblast and is therefore adjusting the structure of the daily campaign assessments. We will no longer include the Eastern Kharkiv and Western Luhansk Oblast area as part of Ukrainian counteroffensives and will assess this area as a subordinate part of the Russian main effort in Eastern Ukraine. The assessment of Luhansk Oblast as part of the Russian main effort does not preclude the possibility of continued Ukrainian counteroffensive actions here or anywhere else in theater in the future. ISW will report out on Ukrainian counteroffensive efforts as they occur.\\n\\nRussian forces continued offensive actions northwest of Svatove on February 8. 
Kharkiv Oblast Head Oleh Synehubov reported on February 8 that Russian forces are increasing their presence northwest of Svatove in the Kupyansk and Dvorichna areas.[16]\\xa0A former Luhansk Peopleā€˜s Republic (LNR) deputy claimed that fierce fighting is ongoing 7km from the Kupyansk area, likely referring to areas near Synkivka, which Russian sources claimed Russian forces captured on February 6.[17]\\xa0The Ukrainian General Staff reported that Russian forces conducted a limited ground attack near Novoselivske, about 15km northwest of Svatove.[18]\\xa0\\xa0Former Russian militant commander and nationalist milblogger Igor Girkin denied that Russian forces have made any significant territorial gains in Kharkiv Oblast, particularly in the Kupyansk direction, as of February 8.[19]\\n\\nRussian forces also reportedly intensified offensive operations in the Kreminna area. Luhansk Oblast Head Serhiy Haidai stated on February 8 that there has been a ā€maximum escalationā€ in the Kreminna direction and that Russian forces are attempting to break through Ukrainian defenses in this area.[20]\\xa0The Ukrainian General Staff reported that Russian forces attacked near Chervonopopivka (5km north of Kreminna).[21]\\xa0Several Russian milbloggers circulated unconfirmed footage of unspecified Central and Western Military District elements which crossed the Zherebets River running north to south in western Luhansk Oblast, roughly parallel to the Svatove-Kreminna line) and captured Ukrainian positions in an unspecified location around February 6.[22]\\xa0Russian sources also reported that elements of the 3rd\\xa0Motor Rifle Division (20th\\xa0Combined Arms Army, Western Military District) are approaching the Zherebets River and are threatening Ukrainian positions in the area.[23]\\xa0A prominent Russian milblogger posted footage of the 59th\\xa0Tank Regiment of the 144th\\xa0Motor Rifle Division (20th\\xa0Combined Arms Army, Western Military District) attacking towards Torske 
(13km west of Kreminna) and claimed the unit pushed Ukrainian forces back to secondary lines of defense.[24]\\n\\nRussian forces continued offensive operations south of Kreminna on February 8. The Ukrainian General Staff reported Russian troops attacked near Shepilove (7km south of Kreminna) and Bilohorivka (10km south of Kreminna).[25]\\xa0Chechen Head Ramzan Kadyrov claimed that elements of the Chechen ā€Akhmatā€ special forces and 2nd\\xa0Brigade of the Luhansk People’s Republic 2nd\\xa0Army Corps captured Ukrainian positions near Berestove, 30km south of Kreminna.[26]\\xa0Russian forces appear to be pushing northeast of the Bakhmut area towards Siversk (17km southwest of Kreminna) to provide a supporting line of advance to the likely main Russian push directly westward toward Kreminna.\\n\\nRussian Subordinate Main Effort #2—Donetsk Oblast\\xa0(Russian objective: Capture the entirety of Donetsk Oblast, the claimed territory of Russia’s proxies in Donbas)\\n\\nRussian forces conducted ground attacks around Bakhmut and continued making tactical advances on February 8. 
Geolocated footage posted between February 4 and 8 confirms that Russian forces have made marginal advances north of Bakhmut near Krasna Hora and Zaliznyanske (10km north of Bakhmut), in the Stupky area of northern Bakhmut, and southwest of Bakhmut near Ivanivske.[27]\\xa0Russian forces are visually confirmed to be within 2.5 km of the E40 Bakhmut-Slovyansk highway.[28]\\xa0The Ukrainian General Staff also reported that Ukrainian troops repelled Russian attacks on Bakhmut itself; northeast of Bakhmut near Verkhnokamyanske (30km northeast), Fedorivka (15km northeast), Spirne (27km northeast), and Vymika (20km northeast); north of Bakhmut near Paraskoviivka (5km north) and Krasna Hora (4km north); northwest of Bakhmut near Orikhovo-Vasylivka (12km northwest) and Dubovo-Vasylivka (7km northwest); and west of Bakhmut near Ivanivske (5km west) and Chasiv Yar (10km west).[29]\\xa0\\xa0The Ukrainian General Staff’s report that Russian forces are attacking towards Orikhovo-Vasylivka and Dubovo-Vasylivka is consistent with geolocated combat footage and indicates that Russian forces seek to encircle Bakhmut by cutting off Ukrainian forces’ access to the E40. Similarly, the report of a Russian attack on Chasiv Yar indicates that Russian forces have likely advanced closer to the T0504 Kostyantynivka-Chasiv Yar-Bakhmut highway southwest of Bakhmut.\\xa0\\xa0Russian sources claimed that Wagner Group fighters took control of Krasna Hora and are fighting northeast of Bakhmut.[30]\\xa0Russian milbloggers also claimed that Wagner Group forces established fire control over a section of the T0504 highway between Stupochky and Ivanivske.[31]\\n\\nRussian forces conducted limited ground attacks in the Avdiivka-Donetsk City area on February 8. 
The Ukrainian General Staff reported that Ukrainian forces repelled Russian assaults near Avdiivka, north of Avdiivka near Kamianka, and along the western outskirts of Donetsk City near Vodyane, Pervomaiske, and Krasnohorivka.[32]\\xa0Former Russian officer and prominent milblogger Igor Girkin claimed that Russian forces did not advance near Avdiivka and took heavy losses.[33]\\xa0Another milblogger claimed that fighting is ongoing in western Marinka (on the southwestern outskirts of Donetsk City) and that unspecified elements of the Southern Military District (SMD) advanced through urban areas of Marinka on February 8.[34]\\xa0The milblogger also stated that Russian forces were able to gain a foothold in positions near a tire repair plant in Marinka.[35]\\xa0Videos posted by milbloggers on February 8 reportedly show SMD tank units attacking a Ukrainian position in Marinka and Russian tanks operating in western Marinka.[36]\\xa0Former Deputy LNR Interior Minister Vitaly Kiselev posted a video on February 8 purportedly showing Russian elements of the 150th Motorized Rifle Division (8th Combined Arms Army, SMD) attacking Marinka and claimed that Russian forces had cleared all Ukrainian fortifications there.[37]\\xa0The deployment of valuable Russian conventional military units (as opposed to DNR proxy forces) in the area is notable, if confirmed. Girkin, however, claimed that the situation in Marinka has not changed and continues at a sluggish pace.[38]\\n\\nRussian forces conducted a limited ground attack in western Donetsk Oblast on February 8. The Ukrainian General Staff reported that Ukrainian forces repelled Russian assaults near Bohoyavlenka (25km southwest of Donetsk City).[39]\\xa0Russian sources made conflicting claims about the status of operations in this area. 
One milblogger claimed that fierce fighting is ongoing near Vuhledar (30km southwest of Donetsk City), while other milbloggers stated that there is no active fighting in the area.[40]\\xa0Girkin claimed that Ukrainian forces repelled Russian assaults near\\xa0Vuhledar and inflicted heavy losses.[41]\\xa0Odesa Military Administration Spokesman Serhiy Bratchuk shared a video on February 8 of Ukrainian forces attacking and halting a disorganized Russian mechanized column near Vuhledar.[42]\\n\\nSupporting Effort—Southern Axis\\xa0(Russian objective: Maintain frontline positions and secure rear areas against Ukrainian strikes)\\n\\nRussian and Ukrainian forces reportedly continued small scale skirmishes and reconnaissance activity in the Dnipro River delta and on the Kinburn Spit on February 8. The United Kingdom Ministry of Defense (UK MoD) reported that Russian forces are using small boats to try to maintain a presence on islands in the Dnipro River delta south of Kherson City and that Ukrainian forces have deployed long-range artillery to strike several Russian outposts on the islands.[43]\\xa0The UK MoD reported that Russian and Ukrainian forces have likely deployed small groups on the Kinburn Spit in Mykolaiv Oblast, aiming to control the Dnipro Gulf.[44]\\xa0Ukraine’s Southern Operational Command Spokesperson Natalia Humenyuk previously stated that Russian forces are increasing the number of reconnaissance and sabotage attempts in the area of the Dnipro River delta as part of an information operation to create a perceived threat against Kherson City.[45]\\n\\nRussian forces continue to construct defensive fortifications in Zaporizhia Oblast. 
Satellite imagery collected between January 26 and February 7 shows Russian forces expanding trench and field fortifications near Tarasivka, Zaporizhia Oblast.[46]\\xa0Russian forces likely constructed these fortifications to further strengthen Russian positions along the T0401 highway between Polohy and Tokmak.\\xa0\\xa0Russian forces are likely establishing long defensive lines along critical grounds lines of communication (GLOCs) in Zaporizhia Oblast in preparation to defend against possible future Ukrainian counteroffensive operations along the Zaporizhia frontline. However, ISW has not observed Russian forces constructing defenses intended to halt a cross-country Ukrainian attack on a large front, and defensive positions remain limited to main roads.\\n\\nRussian forces continued routine fire west of Hulyaipole and in Dnipropetrovsk, Kherson, and Mykolaiv Oblasts on February 8.[47]\\xa0Ukrainian sources reported that Russian forces struck Kherson City and in the vicinity of Ochakiv, Mykolaiv Oblast.[48]\\n\\nMobilization and Force Generation Efforts\\xa0(Russian objective: Expand combat power without conducting general mobilization)\\n\\nRussian officials continued attempts to extend social benefits held by regular Russian servicemembers to volunteer formations serving in Ukraine. 
Russian Prime Minister Mikhail Mishustin stated on February 8 that the Russian government has prepared new measures to support volunteers, including increasing pensions and social assistance payments related to injuries and disabilities.[49]\\xa0The Russian State Duma is reportedly drafting a bill to include provisions against discrediting volunteer detachments assisting the Russian military in Ukraine, as Wagner Group Financier Yevgeny Prigozhin previously demanded.[50]\\xa0The Kremlin is likely pursuing efforts to more formally recognize volunteer formations in order to mitigate continued criticism of the Russian Ministry of Defense (MoD) over the unclear status of volunteer formations.[51]\\n\\nThe Wagner Group is reportedly resorting to more coercive tactics in its campaign to recruit prisoners, possibly in response to declining numbers of recruits since autumn 2022. Independent Russian outlet\\xa0Agentstvo\\xa0reported on February 8 that Russian lawyers and human rights activists stated that Wagner Group representatives and Russian Ministry of Internal Affairs and Federal Security Service (FSB) officials are threatening prisoners in Samara and Rostov oblasts, Krasnodar Krai, and the North Caucasus with new criminal cases if they refuse to volunteer with the Wagner Group in Ukraine.[52]\\xa0One of the lawyers reportedly stated that fewer convicts have agreed to volunteer with the Wagner Group in an unspecified recent period because of information about its high casualties, supporting ISW’s previous assessment that Russian convicts’ resistance may have caused a decline in the Wagner Group’s prison recruitment campaign.[53]\\xa0The Wagner Group will likely continue these more coercive practices as it seeks to replenish its forces in Ukraine with more convict recruits following months of highly attritional human wave attacks in eastern Ukraine.\\n\\nRussian officials continue to promote incremental efforts to fix longstanding personnel issues associated with 
mobilization. Russian Deputy Chairman of the Federation Council (and head of the mobilization working group) Andrey Turchak claimed that the mobilization working group has received appeals from 22,000 Russian servicemembers and their family members since holding its first meeting on December 29, 2022, addressing issues like the incorrect accrual of payments and the wrongful mobilization of fathers with many children who should be exempt.[54]\\xa0Turchak stated that the working group has heavily focused on solving poor recordkeeping issues through efforts to digitize military registration information from military recruitment offices.[55]\\xa0Turchak claimed that the working group sent a report to Russian President Vladimir Putin with recommendations to establish comprehensive rehabilitation centers, a minimum set of measures to support family members, a reduced term for recognizing a Russian soldier as missing, and a guarantee for receiving pensions.[56]\\xa0These proposals and efforts are likely meant primarily to placate ultranationalist figures that criticized the numerous issues associated with mobilization and hedge against further domestic discontent ahead of a likely second wave of mobilization.\\n\\nActivity in Russian-occupied Areas\\xa0(Russian objective: consolidate administrative control of and annexed areas; forcibly integrate Ukrainian civilians into Russian sociocultural, economic, military, and governance systems)\\n\\nRussian occupation authorities are continuing efforts to increase connectivity between Russia and southern Ukraine. 
Kherson Occupation Head Vladimir Saldo claimed on February 8 that Russian occupation authorities have approved design and research works on a new highway that will run from Crimea, north of the Sea of Azov, to Rostov-on-Don, Russia.[57]\\xa0Saldo also claimed that the construction of a new town in the Arabat Spit in northeast Crimea has begun.[58]\\xa0ISW has previously assessed that Russian occupation authorities likely seek to increase the population in the deep rear of occupied territories to strengthen production capabilities and support logistics related to Russia’s invasion of Ukraine.[59]\\n\\nRussian occupation authorities continue to lean on patronage-like partnerships with Russian federal subjects to restore infrastructure in occupied territories. Donetsk People’s Republic (DNR) Head Denis Pushilin claimed on February 8 that he held a meeting with Sakhalin Oblast Governor Valery Limarenko in which they discussed Sakhalin Oblast’s plans to help repair kindergartens, stadiums, schools, and playgrounds in occupied Shakhtarsk, Donetsk Oblast.[60]\\xa0Luhansk People’s Republic (LNR) Head Leonid Pasechnik held a meeting with Voronezh Oblast Governor Aleksandr Gusev on February 8 during which Gusev claimed that Voronezh Oblast hopes to develop occupied Luhansk Oblast to not only extract raw materials, but also to develop a processing industry.[61]\\xa0Gusev claimed that Voronezh Oblast will double the amount of aid it previously provided occupied Luhansk Oblast in 2022 to bring living standards in occupied Luhansk Oblast to those of Russia’s ā€œnationalā€ level.[62]\\n\\nSignificant activity in Belarus\\xa0(ISW assesses that a Russian or Belarusian attack into northern Ukraine in early 2023 is extraordinarily unlikely and has thus restructured this section of the update. 
It will no longer include counter-indicators for such an offensive.\\n\\nISW will continue to report daily observed Russian and Belarusian military activity in Belarus, but these are not indicators that Russian and Belarusian forces are preparing for an imminent attack on Ukraine from Belarus. ISW will revise this text and its assessment if it observes any unambiguous indicators that Russia or Belarus is preparing to attack northern Ukraine.)\\n\\nBelarusian airborne forces may be conducting tactical force-on-force exercises with Russian airborne elements in Belarus. The Belarusian Ministry of Defense announced on February 8 that unspecified airborne infantry companies — likely of the Belarusian 38th Air Assault Brigade — conducted a force-on-force company tactical exercise at the Brest Training Ground, emphasizing using unmanned aerial vehicles, urban warfare, small unit tactics, and tactical medicine.[63]\\xa0It is unclear if Russian airborne forces participated in this exercise. The Belarusian 38th Air Assault Brigade has historically conducted joint exercises with elements of the Russian 76th Air Assault Division, 106th Airborne Division, and the 31st Air Assault Brigade - all units that have taken casualties in Ukraine and require regeneration.[64]\\n\\nBelarusian maneuver elements continue conducting exercises in Belarus. Unspecified elements of the Belarusian 19th Separate Guards Mechanized Brigade conducted tactical readiness exercises at the Lepelsky Training Ground in Vitebsk Oblast, Belarus, on February 8.[65]\\n\\nNote: ISW does not receive any classified material from any source, uses only publicly available information, and draws extensively on Russian, Ukrainian, and Western reporting and social media as well as commercially available satellite imagery and other geospatial data as the basis for these reports. 
References to all sources used are provided in the endnotes of each update.\\n\\n[1]\\xa0https://isw.pub/UkrWar020623;\\xa0https://isw.pub/UkrWar020423; https://isw.pub/UkrWar020223\\n\\n[2]\\xa0https://t.me/DeepStateUA/15451\\n\\n[3]\\xa0https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-january-25-2023\\n\\n[4]\\xa0https://t.me/rybar/43387;\\xa0https://t.me/notes_veterans/7845;\\xa0https://t.me/russkiy_opolchenec/35783;\\xa0https://t.me/RVvoenkor/37711;\\xa0https://t.me/wargonzo/10782;\\xa0https://t.me/vysokygovorit/10725;\\xa0https://t.me/vysokygovorit/10704;\\xa0https://t.me/vysokygovorit/10706;\\xa0https://t.me/vysokygovorit/10703;\\xa0https://t.me/wargonzo/10726;\\xa0https://t.me/kommunist/15515;\\xa0https://t.me/rybar/43256;\\xa0https://t.me/rybar/43257;\\xa0https://t.me/dva_majors/8678;\\xa0https://t.me/rybar/43256;\\xa0https://t.me/vysokygovorit/10684;\\xa0https://t.me/rybar/43220;\\xa0https://t.me/rybar/43218;\\xa0https://t.me/wargonzo/10658;\\xa0https://t.me/rybar/43197;\\xa0https://t.me/boris_rozhin/76787;\\xa0https://t.me/Z4LPR/445;\\xa0https://t.me/m0sc0wcalling/18752;\\xa0https://t.me/komdiv_76/311;\\xa0https://t.me/harry_homolsky/3185\\n\\n[5]https://www.understandingwar.org/sites/default/files/Russian%20Operations%20Assessments%20January%2027%202023.pdf\\n\\n[6]\\xa0https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-january-15-2023\\n\\n[7]\\xa0https://t.me/ostorozhno_novosti/14167\\n\\n[8]\\xa0https://isw.pub/UkrWar020323\\xa0;\\xa0https://isw.pub/UkrWar020423\\n\\n[9]\\xa0https://t.me/ostorozhno_novosti/14167\\n\\n[10]\\xa0https://podolyaka\\xa0dot ru/2023/02/08/zayavleniya-premer-ministra-rf-mihaila-mishustina-o-podderzhke-uchastnikov-svo-i-voennoy-promyshlennosti/;\\xa0https://stolica-s\\xa0dot su/archives/366231;\\xa0https://t.me/rybar/43402\\n\\n[11]\\xa0https://podolyaka\\xa0dot 
ru/2023/02/08/zayavleniya-premer-ministra-rf-mihaila-mishustina-o-podderzhke-uchastnikov-svo-i-voennoy-promyshlennosti/;\\xa0https://stolica-s\\xa0dot su/archives/366231;\\xa0https://t.me/rybar/43402\\n\\n[12]\\xa0https://isw.pub/UkrWar011823\\xa0;\\xa0https://isw.pub/UkrWar010723\\xa0;\\n\\n[13]\\xa0https://isw.pub/UkrWar020623\\xa0;\\xa0https://isw.pub/UkrWar020123\\n\\n[14]\\xa0https://www.gov.uk/government/news/new-sanctions-target-putins-war-machine-and-financial-networks-as-uk-accelerates-economic-pressure-on-russia\\n\\n[15]\\xa0https://www.cbsnews.com/news/ukraine-war-russia-iranian-drones-us-made-technology-chips/\\xa0;\\xa0https://isw.pub/UkrWar020623\\n\\n[16]\\xa0https://suspilne\\xa0dot media/amp/378863-de-okupanti-posiluut-prisutnist-na-harkivsini-dani-sinegubova/\\n\\n[17]\\xa0https://t.me/kommunist/15598; https://understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-6-2023\\n\\n[18]https://www.facebook.com/GeneralStaff.ua/posts/pfbid0FuH223o7wLNSiJSNdCXigpCmwEnYbhGorMX4DoRP98heCwqkGax2jw7LAfJod8mgl\\n\\n[19]\\xa0https://t.me/strelkovii/3896\\n\\n[20]\\xa0https://t.me/luhanskaVTSA/8438\\n\\n[21]https://www.facebook.com/GeneralStaff.ua/posts/pfbid0FuH223o7wLNSiJSNdCXigpCmwEnYbhGorMX4DoRP98heCwqkGax2jw7LAfJod8mgl\\n\\n[22]\\xa0https://t.me/russkiy_opolchenec/35783; https://t.me/RVvoenkor/37711\\n\\n[23]\\xa0https://t.me/rybar/43387;\\xa0https://t.me/notes_veterans/7845\\n\\n[24]\\xa0https://t.me/wargonzo/10782\\n\\n[25]https://www.facebook.com/GeneralStaff.ua/posts/pfbid0FuH223o7wLNSiJSNdCXigpCmwEnYbhGorMX4DoRP98heCwqkGax2jw7LAfJod8mgl\\n\\n[26]\\xa0https://t.me/RKadyrov_95/3332\\n\\n[27]\\xa0https://twitter.com/fdov21/status/1623368452667805701\\n\\n[28]\\xa0https://twitter.com/Militarylandnet/status/1623071883988987905; https://twitter.com/EerikMatero/status/1623076900548517892; https://twitter.com/neonhandrail/status/1623206937134497792; 
https://twitter.com/neonhandrail/status/1623207358888558593\\n\\n[29]https://www.facebook.com/GeneralStaff.ua/posts/pfbid0FuH223o7wLNSiJSNdCXigpCmwEnYbhGorMX4DoRP98heCwqkGax2jw7LAfJod8mgl\\n\\n[30]\\xa0https://t.me/wargonzo/10773;\\xa0https://t.me/strelkovii/3896\\n\\n[31]\\xa0https://t.me/DonbassYasinovatayanaliniiOgnia/36445;\\xa0https://t.me/NeoficialniyBeZsonoV/22062;\\xa0https://t.me/wargonzo/10773;\\xa0https://t.me/strelkovii/3896\\n\\n[32]https://www.facebook.com/GeneralStaff.ua/posts/pfbid0FuH223o7wLNSiJSNdCXigpCmwEnYbhGorMX4DoRP98heCwqkGax2jw7LAfJod8mgl\\n\\n[33]\\xa0https://t.me/strelkovii/3896\\n\\n[34]\\xa0https://t.me/rybar/43405\\n\\n[35]\\xa0https://t.me/rybar/43405\\n\\n[36]\\xa0https://t.me/boris_rozhin/77568;\\xa0https://t.me/sashakots/38439\\n\\n[37]\\xa0https://t.me/kommunist/15635\\n\\n[38]\\xa0https://t.me/strelkovii/3896\\n\\n[39]https://www.facebook.com/GeneralStaff.ua/posts/pfbid0FuH223o7wLNSiJSNdCXigpCmwEnYbhGorMX4DoRP98heCwqkGax2jw7LAfJod8mgl\\n\\n[40]\\xa0https://t.me/boris_rozhin/77574;\\xa0https://t.me/wargonzo/10773\\n\\n[41]\\xa0https://t.me/strelkovii/3896\\n\\n[42]\\xa0https://t.me/Bratchuk_Sergey/29230\\n\\n[43]\\xa0https://twitter.com/DefenceHQ/status/1623199796352745475/photo/1\\n\\n[44]\\xa0https://twitter.com/DefenceHQ/status/1623199796352745475/photo/1\\n\\n[45]\\xa0https://armyinform dot\\xa0com.ua/2023/02/01/zbilshennya-kilkosti-rozviduvalnyh-grup-voroga-v-gyrli-dnipra-mozhe-buty-oznakoyu-nagnitannya-sytuacziyi-gumenyuk/\\n\\n[46]\\xa0https://twitter.com/bradyafr/status/1623082928283746304?s=20&t=ETx-WeYabYcAhBWPxEz2BA\\n\\n[47]\\xa0https://t.me/mykolaivskaODA/4236\\xa0;\\xa0\\xa0https://www.facebook.com/GeneralStaff.ua/posts/pfbid0FuH223o7wLNSiJSNdCXigpCmwEnYbhGorMX4DoRP98heCwqkGax2jw7LAfJod8mgl\\xa0;\\xa0\\xa0https://www.facebook.com/GeneralStaff.ua/posts/pfbid02kL8XZwXNsSUphpcF5SAsVUR92f3mJJevFsSWvEZnRJsXKEr7LQzhWmWxXFubauUml\\xa0;\\xa0https://t.me/khersonskaODA/3607\\xa0; https://t.me/khersonskaODA/3616; 
https://t.me/khersonskaODA/3613;\\xa0https://t.me/khersonskaODA/3615\\xa0;\\xa0https://www.facebook.com/sergey.khlan/posts/pfbid02L4QqnKMLM3QLzY1pvQrr5CxtPVxrEu55qdBouZ7dB3jqdrfGhBHuXQyW5tavq4d3l\\xa0;\\xa0https://t.me/mykolaivskaODA/4236\\xa0\\xa0;\\xa0https://t.me/zoda_gov_ua/16505\\xa0\\xa0;\\xa0https://t.me/vilkul/2680\\xa0\\xa0;\\xa0https://t.me/Yevtushenko_E/2419\\n\\n[48]\\xa0https://t.me/mykolaivskaODA/4236\\xa0;\\xa0\\xa0https://www.facebook.com/GeneralStaff.ua/posts/pfbid0FuH223o7wLNSiJSNdCXigpCmwEnYbhGorMX4DoRP98heCwqkGax2jw7LAfJod8mgl\\xa0;\\xa0\\xa0https://www.facebook.com/GeneralStaff.ua/posts/pfbid02kL8XZwXNsSUphpcF5SAsVUR92f3mJJevFsSWvEZnRJsXKEr7LQzhWmWxXFubauUml\\xa0;\\xa0https://t.me/khersonskaODA/3607\\n\\n[49]\\xa0https://podolyaka dot ru/2023/02/08/zayavleniya-premer-ministra-rf-mihaila-mishustina-o-podderzhke-uchastnikov-svo-i-voennoy-promyshlennosti/; https://stolica-s dot su/archives/366231; https://t.me/rybar/43402\\n\\n[50]\\xa0https://isw.pub/UkrWar020723; https://t.me/concordgroup_official/425; 
https://t.me/Prigozhin_hat/2612\\n\\n[51]\\xa0https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-1-2023\\n\\n[52]\\xa0https://t.me/agentstvonews/2482\\n\\n[53]\\xa0https://t.me/agentstvonews/2482\\xa0;\\xa0https://isw.pub/UkrWar020123\\n\\n[54]\\xa0https://t.me/turchak_andrey/1032\\xa0;\\xa0\\xa0https://isw.pub/UkrWar122922\\n\\n[55]\\xa0https://t.me/turchak_andrey/1032\\n\\n[56]\\xa0https://t.me/turchak_andrey/1032\\n\\n[57]\\xa0https://t.me/SALDO_VGA/363;\\xa0https://t.me/SALDO_VGA/365\\n\\n[58]\\xa0https://t.me/SALDO_VGA/363;\\xa0https://t.me/SALDO_VGA/365\\n\\n[59]\\xa0https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-january-5-2023\\n\\n[60]\\xa0https://t.me/pushilindenis/3165\\n\\n[61]\\xa0https://t.me/glava_lnr_info/745;\\xa0https://t.me/glava_lnr_info/748\\n\\n[62]\\xa0https://t.me/glava_lnr_info/745;\\xa0https://t.me/glava_lnr_info/748\\n\\n[63]\\xa0https://t.me/modmilby/22455; https://t.me/modmilby/22419\\n\\n[64]\\xa0https://www.osw.waw.pl/sites/default/files/OSW-Report_Russia%E2%80%99s-Belarusian-army_net.pdf\\n\\n[65]\\xa0https://t.me/modmilby/22470\\n\\nTags\\n\\nUkraine Project\\n\\nFile Attachments:\\n\\nZaporizhia Battle Map Draft February 08,2023.png\\n\\nKherson-Mykolaiv Battle Map Draft February 08,2023.png\\n\\nDonetsk Battle Map Draft February 08,2023.png\\n\\nDraftUkraineCoTFebruary08,2023.png\\n\\nKharkiv Battle Map Draft February 08,2023.png\\n\\n1400 16th Street NW, Suite 515 Washington, DC 20036\\n\\nph (202) 293-5550\\n\\nĀ©2007 – 2024 THE INSTITUTE FOR THE STUDY OF WAR', metadata={'source': 'https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023'})" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "loader = UnstructuredURLLoader(urls=urls)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b68a26b3", - "metadata": {}, - "outputs": [], - 
"source": [ - "data = loader.load()" + "loader = UnstructuredURLLoader(urls=urls)\n", + "\n", + "data = loader.load()\n", + "\n", + "data[0]" ] }, { @@ -112,50 +95,39 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -U selenium unstructured" + "%pip install --upgrade --quiet selenium unstructured" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "5fc50835", "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import SeleniumURLLoader" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24e896ce", - "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='Menu\\n\\nSearch\\n\\nClose\\n\\nCollapse side panel\\n\\n323,527 photos\\n\\nCN Tower\\n\\n4.6\\n\\n(71,071)\\n\\nTourist attraction\\n\\nOverview\\n\\nTickets\\n\\nReviews\\n\\nAbout\\n\\nDirections\\n\\nSave\\n\\nNearby\\n\\nSend to phone\\n\\nShare\\n\\nLandmark, over 553-metre tower featuring a glass floor & a revolving eatery with panoramic views.\\n\\nSponsoredBy CityPASS\\n\\nSave 42% at 5 top Toronto attractions.$92\\xa0Ā·\\xa04.6(9k+)Entry included\\n\\nAdmission\\n\\nAbout these results\\n\\nGives you entry to this place\\n\\nCN Tower Official site$32.87\\ue315Instant confirmation Ā· Mobile ticket\\n\\nEvendo $60.43\\ue315Multi-attraction pass Ā· Free cancellation\\n\\nCityPASS $92.04\\ue315Multi-attraction pass Ā· Mobile ticket\\n\\nMore\\n\\n290 Bremner Blvd, Toronto, ON M5V 3L9, Canada\\n\\nOpen ā‹… Closes 9:30\\u202fPM\\n\\nCanada Day hours\\n\\nMonday (Canada Day) 9:30\\u202fAM–9:30\\u202fPM Holiday hours Tuesday 9:30\\u202fAM–9:30\\u202fPM Wednesday 9:30\\u202fAM–9:30\\u202fPM Thursday 9:30\\u202fAM–9:30\\u202fPM Friday 9:30\\u202fAM–9:30\\u202fPM Saturday 9:30\\u202fAM–9:30\\u202fPM Sunday 9:30\\u202fAM–9:30\\u202fPM\\n\\nSuggest new hours\\n\\n\\ue80bcntower.ca\\n\\n+1 416-868-6937\\n\\nJJV7+25 Toronto, Ontario, Canada\\n\\nSend to your 
phone\\n\\nLGBTQ+ friendly\\n\\n\\ue3c9Suggest an edit\\n\\nTours & Activities\\n\\nAbout these results\\n\\nGives you more ways to discover this place\\n\\nGetYourGuide $98Guided Night Tour with CN Tower Entry4.7(227) • 2h 30m\\n\\nEvendo $95Toronto Small Group Night Tour with CN Tower4.7(40) • 2h 30m\\n\\nViator $101Toronto Small Group Night Tour with CN Tower4.7(41) • 2h 30m\\n\\nMore\\n\\nUpdates from customers\\n\\nNice place to have city view. Plan in advance to avoid crowd during weekends\\n\\na day ago\\n\\nPopular times\\n\\nMondays\\n\\nBased on visits to this place.\\n\\n6a\\n\\n9a\\n\\n12p\\n\\n3p\\n\\n6p\\n\\n9p\\n\\nLive\\n\\nBusier than usual\\n\\n6a\\n\\n9a\\n\\n12p\\n\\n3p\\n\\n6p\\n\\n9p\\n\\n6a\\n\\n9a\\n\\n12p\\n\\n3p\\n\\n6p\\n\\n9p\\n\\n6a\\n\\n9a\\n\\n12p\\n\\n3p\\n\\n6p\\n\\n9p\\n\\n6a\\n\\n9a\\n\\n12p\\n\\n3p\\n\\n6p\\n\\n9p\\n\\n6a\\n\\n9a\\n\\n12p\\n\\n3p\\n\\n6p\\n\\n9p\\n\\n6a\\n\\n9a\\n\\n12p\\n\\n3p\\n\\n6p\\n\\n9p\\n\\nPhotos & videos\\n\\nAll\\n\\nLatest\\n\\nToday\\n\\nVideos\\n\\nEdgeWalk at the\\n\\nSHARPSHOOTR – SIXSHOOTR\\n\\nBy owner\\n\\nStreet View & 360°\\n\\nAdd photos & videos\\n\\nAt this place\\n\\n360 The Restaurant at the CN Tower\\n\\n4.1(6,819)\\n\\n$$$$\\n\\nCanadian restaurant\\n\\nFloor 1\\n\\nOpen ā‹… Closes 10\\u202fPM\\n\\nEdgeWalk at the CN Tower\\n\\n4.8(652)\\n\\nTourist attraction\\n\\nFloor 1\\n\\nOpen ā‹… Closes 9\\u202fPM\\n\\nSkypod CN Tower\\n\\n4.7(387)\\n\\nObservation deck\\n\\nFloor 1\\n\\nOpen ā‹… Closes 10:30\\u202fPM\\n\\nCN TOWER Restaurant\\n\\n4.3(98)\\n\\nBuffet restaurant\\n\\nView all\\n\\nQuestions and answers\\n\\nWhat’s CN means\\n\\nCN stands for Canadian National, it was built by the Canadian National Railway (correct me if I\\'m wrong)\\n\\nSee 23 answers\\n\\n6 years ago\\n\\nMore questions\\n\\nReview summary\\n\\nReviews are automatically processed to detect inappropriate content like fake reviews and spam. 
We may take down reviews that are flagged in order to comply with \\n\\nGoogle policies or legal obligations.\\n\\n5 4 3 2 1\\n\\n4.6\\n\\n71,071 reviews\\n\\n\"Fantastic View ,\\n\\nFood\\n\\nService\\n\\n, Great Family outing\\n\\nlocation\\n\\n\"Could be cheaper a\\n\\nbit\\n\\nlol\\n\\notherwise great\\n\\nplace\\n\\nto be at anytime of the\\n\\nyear\\n\\n\"Not one of us had a\\n\\nthing\\n\\nleft on our\\n\\nplate\\n\\nand every\\n\\npiece\\n\\nof\\n\\nbread\\n\\nwas eaten.\"\\n\\nWrite a review\\n\\nReviews\\n\\nSort\\n\\nAll\\n\\nelevator\\n\\n1436\\n\\nglass floor\\n\\n1192\\n\\nobservation deck\\n\\n706\\n\\nrevolving restaurant\\n\\n309\\n\\nqueue\\n\\n308\\n\\naquarium\\n\\n203\\n\\nskyscrapers\\n\\n100\\n\\nblue jays\\n\\n68\\n\\nskywalk\\n\\n53\\n\\nmetal detector\\n\\n25\\n\\n+6\\n\\nRohan Kulkarni\\n\\nLocal Guide Ā· 97 reviews Ā· 206 photos\\n\\n\\ue838\\ue838\\ue838\\ue838\\ue838\\n\\na week ago\\n\\nNew\\n\\nDuring summer weekends the wait times are usually longer around 30-45 minutes. Especially during evening time as people tend to visit to catch the sunset view of the city. This is a must visit if you’re a tourist or never visited any other …\\n\\nMore\\n\\n0:30\\n\\n+12\\n\\nLike2\\n\\nShare\\n\\nP Can**l**!a (PbTC)\\n\\nLocal Guide Ā· 43 reviews Ā· 116 photos\\n\\n\\ue838\\ue838\\ue838\\ue838\\ue838\\n\\na week ago\\n\\nNew\\n\\nCome and visit CN Tower they have 360° view of the city, high rises and airport. I recommend buying ticket in advance and come before 11:00 am. I don’t really like the elevator as I have motion sickness, it runs 22km/hr to get up there. …\\n\\nMore\\n\\n0:13\\n\\n0:09\\n\\n0:16\\n\\n0:15\\n\\n+3\\n\\nLike2\\n\\nShare\\n\\nManan Pandya\\n\\nLocal Guide Ā· 72 reviews Ā· 139 photos\\n\\n\\ue838\\ue838\\ue838\\ue838\\ue838\\n\\n4 weeks ago\\n\\nOne of Toronto\\'s tourism attractions. The tower provides a lovely perspective of Lake Ontario and the city, which is home to a number of remarkable high-rise buildings. 
The lower deck\\'s glass floor is wonderful, but it\\'s only for people who …\\n\\nMore\\n\\n0:08\\n\\nLike3\\n\\nShare\\n\\nMore reviews (71,068)\\n\\nPeople also search for\\n\\nToronto Sign\\n\\n4.7(14,202)\\n\\nTourist attraction\\n\\nCN Tower Photo Spot\\n\\n4.6(21)\\n\\nTourist attraction\\n\\nClock Tower\\n\\n4.8(16)\\n\\nHistorical landmark\\n\\nRoundhouse Park\\n\\n4.5(5,289)\\n\\nPark\\n\\nCasa Loma\\n\\n4.5(28,386)\\n\\nMuseum\\n\\nWeb results\\n\\nAbout this data\\n\\nCollapse side panel\\n\\nCollapse side panel\\n\\n When you have eliminated the \\n\\nEnable JavaScript to see Google Maps.', metadata={'source': 'https://goo.gl/maps/NDSHwePEyaHMFGwh8', 'title': 'CN Tower - Google Maps', 'description': 'No description found.', 'language': 'en'})" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "from langchain_community.document_loaders import SeleniumURLLoader\n", + "\n", "urls = [\n", " \"https://www.youtube.com/watch?v=dQw4w9WgXcQ\",\n", " \"https://goo.gl/maps/NDSHwePEyaHMFGwh8\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60a29397", - "metadata": {}, - "outputs": [], - "source": [ - "loader = SeleniumURLLoader(urls=urls)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0090cd57", - "metadata": {}, - "outputs": [], - "source": [ - "data = loader.load()" + "]\n", + "\n", + "loader = SeleniumURLLoader(urls=urls)\n", + "\n", + "data = loader.load()\n", + "\n", + "data[1]" ] }, { @@ -182,12 +154,12 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -U playwright unstructured" + "%pip install --upgrade --quiet playwright unstructured" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "53158417", "metadata": {}, "outputs": [], @@ -196,46 +168,43 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "0ab4e115", + "cell_type": "markdown", + "id": "2ec62600", "metadata": {}, - 
"outputs": [], "source": [ - "from langchain_community.document_loaders import PlaywrightURLLoader" + "Currently, only the async method is supported:" ] }, { "cell_type": "code", - "execution_count": null, - "id": "ce5a9a0a", + "execution_count": 14, + "id": "0ab4e115", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content=\"Rick Astley - Never Gonna Give You Up (Official Music Video)\\n\\nSearch\\n\\nWatch later\\n\\nShare\\n\\nCopy link\\n\\nInfo\\n\\nShopping\\n\\nTap to unmute\\n\\n2x\\n\\nIf playback doesn't begin shortly, try restarting your device.\\n\\nUp next\\n\\nLiveUpcoming\\n\\nPlay Now\\n\\nRick Astley\\n\\nSubscribe\\n\\nSubscribed\\n\\nThe new album, 'Are We There Yet?' out now!\\n\\nRick Astley - Forever and More (Official Video)3:47\\n\\nThis video is unavailable\\n\\nAre We There Yet?15 videos\\n\\nRick Astley\\n\\nSubscribe\\n\\nSubscribed\\n\\nYou're signed out\\n\\nVideos you watch may be added to the TV's watch history and influence TV recommendations. To avoid this, cancel and sign in to YouTube on your computer.\\n\\nShare\\n\\nAn error occurred while retrieving sharing information. 
Please try again later.\\n\\n0:00\\n\\n0:00 / 3:32\\n\\nWatch full video\\n\\n•\\n\\nScroll for details\\n\\n•\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n NaN / NaN\\n\\nNaN / NaN\\n\\nNaN / NaN\\n\\nNaN / NaN\\n\\nNaN / NaN\\n\\nSearch\", metadata={'source': 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'})" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "from langchain_community.document_loaders import PlaywrightURLLoader\n", + "\n", "urls = [\n", " \"https://www.youtube.com/watch?v=dQw4w9WgXcQ\",\n", " \"https://goo.gl/maps/NDSHwePEyaHMFGwh8\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2dc3e0bc", - "metadata": {}, - "outputs": [], - "source": [ - "loader = PlaywrightURLLoader(urls=urls, remove_selectors=[\"header\", \"footer\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10b79f80", - "metadata": {}, - "outputs": [], - "source": [ - "data = loader.load()" + "]\n", + "\n", + "loader = PlaywrightURLLoader(urls=urls, remove_selectors=[\"header\", \"footer\"])\n", + "\n", + "data = await loader.aload()\n", + "\n", + "data[0]" ] } ], @@ -255,7 +224,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.5" } }, "nbformat": 4, diff --git a/docs/docs/integrations/document_loaders/xml.ipynb b/docs/docs/integrations/document_loaders/xml.ipynb index 19135b2dfe0..55f2f14c640 100644 --- a/docs/docs/integrations/document_loaders/xml.ipynb +++ b/docs/docs/integrations/document_loaders/xml.ipynb @@ -17,29 +17,10 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_community.document_loaders import UnstructuredXMLLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a654e4d9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Document(page_content='United States\\n\\nWashington, DC\\n\\nJoe 
Biden\\n\\nBaseball\\n\\nCanada\\n\\nOttawa\\n\\nJustin Trudeau\\n\\nHockey\\n\\nFrance\\n\\nParis\\n\\nEmmanuel Macron\\n\\nSoccer\\n\\nTrinidad & Tobado\\n\\nPort of Spain\\n\\nKeith Rowley\\n\\nTrack & Field', metadata={'source': 'example_data/factbook.xml'})" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ + "from langchain_community.document_loaders import UnstructuredXMLLoader\n", + "\n", "loader = UnstructuredXMLLoader(\n", - " \"example_data/factbook.xml\",\n", + " \"./example_data/factbook.xml\",\n", ")\n", "docs = loader.load()\n", "docs[0]" diff --git a/docs/docs/integrations/providers/unstructured.mdx b/docs/docs/integrations/providers/unstructured.mdx index 1a85b180c0f..83c1584f30e 100644 --- a/docs/docs/integrations/providers/unstructured.mdx +++ b/docs/docs/integrations/providers/unstructured.mdx @@ -14,14 +14,18 @@ its dependencies running locally. - Install the Python SDK with `pip install unstructured`. - You can install document specific dependencies with extras, i.e. `pip install "unstructured[docx]"`. - To install the dependencies for all document types, use `pip install "unstructured[all-docs]"`. -- Install the following system dependencies if they are not already available on your system. +- Install the following system dependencies if they are not already available on your system with e.g. `brew install` for Mac. Depending on what document types you're parsing, you may not need all of these. - `libmagic-dev` (filetype detection) - `poppler-utils` (images and PDFs) - `tesseract-ocr`(images and PDFs) + - `qpdf` (PDFs) - `libreoffice` (MS Office docs) - `pandoc` (EPUBs) +When running locally, Unstructured also recommends using Docker [by following this guide](https://docs.unstructured.io/open-source/installation/docker-installation) +to ensure all system dependencies are installed correctly. 
+ If you want to get up and running with less setup, you can simply run `pip install unstructured` and use `UnstructuredAPIFileLoader` or `UnstructuredAPIFileIOLoader`. That will process your document using the hosted Unstructured API.
Team, \"Payroll (millions)\", \"Wins\"
Nationals, 81.34, 98