From dfb93dd2b537bcdd8a7bc092a88fee66cf75a076 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 10 Aug 2023 23:47:22 +0900 Subject: [PATCH] Improved grobid documentation (#9025) - Description: Improvement in the Grobid loader documentation, typos and suggesting to use the docker image instead of installing Grobid in local (the documentation was also limited to Mac, while docker allow running in any platform) - Tag maintainer: @rlancemartin, @eyurtsev - Twitter handle: @whitenoise --- .../document_loaders/grobid.ipynb | 60 ++----------------- docs/extras/integrations/providers/grobid.mdx | 30 +++++----- 2 files changed, 21 insertions(+), 69 deletions(-) diff --git a/docs/extras/integrations/document_loaders/grobid.ipynb b/docs/extras/integrations/document_loaders/grobid.ipynb index 96bf6b8ddb4..83ffffcc29b 100644 --- a/docs/extras/integrations/document_loaders/grobid.ipynb +++ b/docs/extras/integrations/document_loaders/grobid.ipynb @@ -9,66 +9,16 @@ "\n", "GROBID is a machine learning library for extracting, parsing, and re-structuring raw documents.\n", "\n", - "It is particularly good for sturctured PDFs, like academic papers.\n", + "It is designed and expected to be used to parse academic papers, where it works particularly well. Note: if the articles supplied to Grobid are large documents (e.g. dissertations) exceeding a certain number of elements, they might not be processed. \n", "\n", - "This loader uses GROBIB to parse PDFs into `Documents` that retain metadata associated with the section of text.\n", + "This loader uses Grobid to parse PDFs into `Documents` that retain metadata associated with the section of text.\n", "\n", "---\n", + "The best approach is to install Grobid via docker, see https://grobid.readthedocs.io/en/latest/Grobid-docker/. 
\n", "\n", - "For users on `Mac` - \n", + "(Note: additional instructions can be found [here](https://python.langchain.com/docs/extras/integrations/providers/grobid.mdx).)\n", "\n", - "(Note: additional instructions can be found [here](https://python.langchain.com/docs/ecosystem/integrations/grobid.mdx).)\n", - "\n", - "Install Java (Apple Silicon):\n", - "```\n", - "$ arch -arm64 brew install openjdk@11\n", - "$ brew --prefix openjdk@11\n", - "/opt/homebrew/opt/openjdk@ 11\n", - "```\n", - "\n", - "In `~/.zshrc`:\n", - "```\n", - "export JAVA_HOME=/opt/homebrew/opt/openjdk@11\n", - "export PATH=$JAVA_HOME/bin:$PATH\n", - "```\n", - "\n", - "Then, in Terminal:\n", - "```\n", - "$ source ~/.zshrc\n", - "```\n", - "\n", - "Confirm install:\n", - "```\n", - "$ which java\n", - "/opt/homebrew/opt/openjdk@11/bin/java\n", - "$ java -version \n", - "openjdk version \"11.0.19\" 2023-04-18\n", - "OpenJDK Runtime Environment Homebrew (build 11.0.19+0)\n", - "OpenJDK 64-Bit Server VM Homebrew (build 11.0.19+0, mixed mode)\n", - "```\n", - "\n", - "Then, get [Grobid](https://grobid.readthedocs.io/en/latest/Install-Grobid/#getting-grobid):\n", - "```\n", - "$ curl -LO https://github.com/kermitt2/grobid/archive/0.7.3.zip\n", - "$ unzip 0.7.3.zip\n", - "```\n", - " \n", - "Build\n", - "```\n", - "$ ./gradlew clean install\n", - "```\n", - "\n", - "Then, run the server:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "2d8992fc", - "metadata": {}, - "outputs": [], - "source": [ - "! get_ipython().system_raw('nohup ./gradlew run > grobid.log 2>&1 &')" + "Once grobid is up-and-running you can interact as described below. 
\n" ] }, { diff --git a/docs/extras/integrations/providers/grobid.mdx b/docs/extras/integrations/providers/grobid.mdx index 6a24e68baa2..4fd52abe234 100644 --- a/docs/extras/integrations/providers/grobid.mdx +++ b/docs/extras/integrations/providers/grobid.mdx @@ -1,22 +1,23 @@ # Grobid +GROBID is a machine learning library for extracting, parsing, and re-structuring raw documents. + +It is designed and expected to be used to parse academic papers, where it works particularly well. + +*Note*: if the articles supplied to Grobid are large documents (e.g. dissertations) exceeding a certain number +of elements, they might not be processed. + This page covers how to use the Grobid to parse articles for LangChain. -It is separated into two parts: installation and running the server -## Installation and Setup -#Ensure You have Java installed -!apt-get install -y openjdk-11-jdk -q -!update-alternatives --set java /usr/lib/jvm/java-11-openjdk-amd64/bin/java +## Installation +The grobid installation is described in detail in https://grobid.readthedocs.io/en/latest/Install-Grobid/. +However, it is probably easier and less troublesome to run grobid through a docker container, +as documented [here](https://grobid.readthedocs.io/en/latest/Grobid-docker/). -#Clone and install the Grobid Repo -import os -!git clone https://github.com/kermitt2/grobid.git -os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64" -os.chdir('grobid') -!./gradlew clean install +## Use Grobid with LangChain -#Run the server, -get_ipython().system_raw('nohup ./gradlew run > grobid.log 2>&1 &') +Once grobid is installed and up and running (you can check by accessing http://localhost:8070), +you're ready to go. 
You can now use the GrobidParser to produce documents ```python @@ -41,4 +42,5 @@ loader = GenericLoader.from_filesystem( ) docs = loader.load() ``` -Chunk metadata will include bboxes although these are a bit funky to parse, see https://grobid.readthedocs.io/en/latest/Coordinates-in-PDF/ +Chunk metadata will include bounding boxes. Although these are a bit funky to parse, +they are explained in https://grobid.readthedocs.io/en/latest/Coordinates-in-PDF/ \ No newline at end of file