Docugami DataLoader (#4727)

### Adds a document loader for Docugami

Specifically:

1. Adds a data loader that talks to the [Docugami](http://docugami.com)
API to download processed documents as semantic XML
2. Parses the semantic XML into chunks, with additional metadata
capturing chunk semantics
3. Adds a detailed notebook showing how you can use additional metadata
returned by Docugami for techniques like the [self-querying
retriever](https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/self_query_retriever.html)
4. Adds an integration test, and related documentation

Here is an example of a result that is not possible without the
capabilities added by Docugami (from the notebook):

<img width="1585" alt="image"
src="https://github.com/hwchase17/langchain/assets/749277/bb6c1ce3-13dc-4349-a53b-de16681fdd5b">

---------

Co-authored-by: Taqi Jaffri <tjaffri@docugami.com>
Co-authored-by: Taqi Jaffri <tjaffri@gmail.com>
This commit is contained in:
Eugene Yurtsev
2023-05-15 10:53:00 -04:00
committed by GitHub
parent c2761aa8f4
commit 3c490b5ba3
10 changed files with 1269 additions and 8 deletions

View File

@@ -82,6 +82,7 @@ pdfminer-six = {version = "^20221105", optional = true}
docarray = {version="^0.31.0", optional=true}
protobuf = {version="3.19", optional=true}
hnswlib = {version="^0.7.0", optional=true}
lxml = {version = "^4.9.2", optional = true}
[tool.poetry.group.docs.dependencies]
@@ -170,8 +171,14 @@ embeddings = ["sentence-transformers"]
azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"]
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib", "steamship", "pdfminer-six"]
# An extra used to be able to add extended testing.
# Please use new-line on formatting to make it easier to add new packages without
# merge-conflicts
extended_testing = [
"pypdf", "pdfminer.six", "tqdm", "jq"
"jq",
"pdfminer.six",
"pypdf",
"tqdm",
"lxml",
]
[tool.ruff]