mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-07 05:52:15 +00:00
Docugami DataLoader (#4727)
### Adds a document loader for Docugami

Specifically:

1. Adds a data loader that talks to the [Docugami](http://docugami.com) API to download processed documents as semantic XML.
2. Parses the semantic XML into chunks, with additional metadata capturing chunk semantics.
3. Adds a detailed notebook showing how you can use the additional metadata returned by Docugami for techniques like the [self-querying retriever](https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/self_query_retriever.html).
4. Adds an integration test and related documentation.

Here is an example of a result that is not possible without the capabilities added by Docugami (from the notebook):

<img width="1585" alt="image" src="https://github.com/hwchase17/langchain/assets/749277/bb6c1ce3-13dc-4349-a53b-de16681fdd5b">

---------

Co-authored-by: Taqi Jaffri <tjaffri@docugami.com>
Co-authored-by: Taqi Jaffri <tjaffri@gmail.com>
This commit is contained in:
@@ -82,6 +82,7 @@ pdfminer-six = {version = "^20221105", optional = true}
|
||||
docarray = {version="^0.31.0", optional=true}
|
||||
protobuf = {version="3.19", optional=true}
|
||||
hnswlib = {version="^0.7.0", optional=true}
|
||||
lxml = {version = "^4.9.2", optional = true}
|
||||
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
@@ -170,8 +171,14 @@ embeddings = ["sentence-transformers"]
|
||||
azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"]
|
||||
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib", "steamship", "pdfminer-six"]
|
||||
# An extra used to be able to add extended testing.
|
||||
# Please use new-line on formatting to make it easier to add new packages without
|
||||
# merge-conflicts
|
||||
extended_testing = [
|
||||
"pypdf", "pdfminer.six", "tqdm", "jq"
|
||||
"jq",
|
||||
"pdfminer.six",
|
||||
"pypdf",
|
||||
"tqdm",
|
||||
"lxml",
|
||||
]
|
||||
|
||||
[tool.ruff]
|
||||
|
Reference in New Issue
Block a user