Docugami DataLoader (#4727)

### Adds a document loader for Docugami

Specifically:

1. Adds a data loader that talks to the [Docugami](http://docugami.com) API to download processed documents as semantic XML
2. Parses the semantic XML into chunks, with additional metadata capturing chunk semantics
3. Adds a detailed notebook showing how you can use additional metadata returned by Docugami for techniques like the [self-querying retriever](https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/self_query_retriever.html)
4. Adds an integration test, and related documentation

Here is an example of a result that is not possible without the capabilities added by Docugami (from the notebook):

<img width="1585" alt="image" src="https://github.com/hwchase17/langchain/assets/749277/bb6c1ce3-13dc-4349-a53b-de16681fdd5b">

---------

Co-authored-by: Taqi Jaffri <tjaffri@docugami.com>
Co-authored-by: Taqi Jaffri <tjaffri@gmail.com>
2025-09-07 05:52:15 +00:00 · 2023-05-15 10:53:00 -04:00
parent c2761aa8f4
commit 3c490b5ba3
10 changed files with 1269 additions and 8 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,6 +82,7 @@ pdfminer-six = {version = "^20221105", optional = true}
 docarray = {version="^0.31.0", optional=true}
 protobuf = {version="3.19", optional=true}
 hnswlib = {version="^0.7.0", optional=true}
+lxml = {version = "^4.9.2", optional = true}


 [tool.poetry.group.docs.dependencies]
@@ -170,8 +171,14 @@ embeddings = ["sentence-transformers"]
 azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"]
 all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib", "steamship", "pdfminer-six"]
 # An extra used to be able to add extended testing.
+# Please list one package per line so that new packages can be added without
+# causing merge conflicts.
 extended_testing = [
-  "pypdf", "pdfminer.six", "tqdm", "jq"
+ "jq",
+ "pdfminer.six",
+ "pypdf",
+ "tqdm",
+ "lxml",
 ]

 [tool.ruff]