[unstructured][security] Bump unstructured version (#25364)

This ensures that unstructured 0.15.7 or later is pulled.
That release of unstructured uses NLTK >= 3.8.2, which includes a
fix for a critical CVE:
https://github.com/advisories/GHSA-cgvx-9447-vcch
This commit is contained in:
Christophe Bornet 2024-08-21 21:25:24 +02:00 committed by GitHub
parent 39c44817ae
commit b71ae52e65
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 31 additions and 41 deletions

View File

@ -1231,7 +1231,7 @@ files = [
[[package]]
name = "langchain-core"
version = "0.2.23"
version = "0.2.30"
description = "Building applications with LLMs through composability"
optional = false
python-versions = ">=3.8.1,<4.0"
@ -1248,6 +1248,7 @@ pydantic = [
]
PyYAML = ">=5.3"
tenacity = "^8.1.0,!=8.4.0"
typing-extensions = ">=4.7"
[package.source]
type = "directory"
@ -1397,9 +1398,13 @@ files = [
{file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"},
{file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"},
{file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"},
{file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"},
@ -1931,6 +1936,7 @@ description = "Nvidia JIT LTO Library"
optional = false
python-versions = ">=3"
files = [
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"},
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
]
@ -2911,21 +2917,6 @@ files = [
{file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"},
]
[[package]]
name = "pytesseract"
version = "0.3.10"
description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
{file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
]
[package.dependencies]
packaging = ">=21.3"
Pillow = ">=8.0.0"
[[package]]
name = "pytest"
version = "7.4.4"
@ -3066,18 +3057,19 @@ typing-extensions = ">=4.9.0"
[[package]]
name = "python-pptx"
version = "0.6.23"
description = "Generate and manipulate Open XML PowerPoint (.pptx) files"
version = "1.0.2"
description = "Create, read, and update PowerPoint 2007+ (.pptx) files."
optional = false
python-versions = "*"
python-versions = ">=3.8"
files = [
{file = "python-pptx-0.6.23.tar.gz", hash = "sha256:587497ff28e779ab18dbb074f6d4052893c85dedc95ed75df319364f331fedee"},
{file = "python_pptx-0.6.23-py3-none-any.whl", hash = "sha256:dd0527194627a2b7cc05f3ba23ecaa2d9a0d5ac9b6193a28ed1b7a716f4217d4"},
{file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"},
{file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"},
]
[package.dependencies]
lxml = ">=3.1.0"
Pillow = ">=3.3.2"
typing-extensions = ">=4.9.0"
XlsxWriter = ">=0.5.7"
[[package]]
@ -3139,7 +3131,6 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@ -4081,13 +4072,13 @@ files = [
[[package]]
name = "unstructured"
version = "0.15.0"
version = "0.15.7"
description = "A library that prepares raw documents for downstream ML tasks."
optional = false
python-versions = "<3.13,>=3.9.0"
files = [
{file = "unstructured-0.15.0-py3-none-any.whl", hash = "sha256:43538e0463aec3741eef06eaf69fc27f6fdaffa7376816c797cb25abb9b55b45"},
{file = "unstructured-0.15.0.tar.gz", hash = "sha256:b75deb8e46dffbe1c6f82936810a31ab9516923c3a60bd01821d7701fa13b628"},
{file = "unstructured-0.15.7-py3-none-any.whl", hash = "sha256:9b176f18776142feed1f058f11d16046ae24d077fa96648979ae9c474819f56c"},
{file = "unstructured-0.15.7.tar.gz", hash = "sha256:ac55bf31b1d4c19c33c0e2ec5f615d96d03a2bf49a784f23b29d5530b90d6830"},
]
[package.dependencies]
@ -4115,12 +4106,11 @@ pillow-heif = {version = "*", optional = true, markers = "extra == \"all-docs\""
psutil = "*"
pypandoc = {version = "*", optional = true, markers = "extra == \"all-docs\""}
pypdf = {version = "*", optional = true, markers = "extra == \"all-docs\""}
pytesseract = {version = "*", optional = true, markers = "extra == \"all-docs\""}
python-docx = {version = ">=1.1.2", optional = true, markers = "extra == \"all-docs\""}
python-iso639 = "*"
python-magic = "*"
python-oxmsg = {version = "*", optional = true, markers = "extra == \"all-docs\""}
python-pptx = {version = "<=0.6.23", optional = true, markers = "extra == \"all-docs\""}
python-pptx = {version = ">=1.0.1", optional = true, markers = "extra == \"all-docs\""}
rapidfuzz = "*"
requests = "*"
tabulate = "*"
@ -4134,14 +4124,14 @@ xlrd = {version = "*", optional = true, markers = "extra == \"all-docs\""}
[package.extras]
airtable = ["pyairtable"]
all-docs = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypandoc", "pypdf", "pytesseract", "python-docx (>=1.1.2)", "python-oxmsg", "python-pptx (<=0.6.23)", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)", "xlrd"]
astra = ["astrapy"]
all-docs = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-oxmsg", "python-pptx (>=1.0.1)", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)", "xlrd"]
astradb = ["astrapy"]
azure = ["adlfs", "fsspec"]
azure-cognitive-search = ["azure-search-documents"]
bedrock = ["boto3", "langchain-community"]
biomed = ["bs4"]
box = ["boxfs", "fsspec"]
chroma = ["chromadb", "importlib-metadata (>=7.1.0)", "typer (<=0.9.0)"]
chroma = ["chromadb", "importlib-metadata (>=8.2.0)", "tenacity (==8.5.0)", "typer (<=0.9.0)"]
clarifai = ["clarifai"]
confluence = ["atlassian-python-api"]
csv = ["pandas"]
@ -4152,7 +4142,7 @@ doc = ["python-docx (>=1.1.2)"]
docx = ["python-docx (>=1.1.2)"]
dropbox = ["dropboxdrivefs", "fsspec"]
elasticsearch = ["elasticsearch[async]"]
embed-huggingface = ["huggingface", "langchain-community", "sentence-transformers"]
embed-huggingface = ["langchain-huggingface"]
embed-octoai = ["openai", "tiktoken"]
embed-vertexai = ["langchain", "langchain-community", "langchain-google-vertexai"]
embed-voyageai = ["langchain", "langchain-voyageai"]
@ -4163,26 +4153,26 @@ gitlab = ["python-gitlab"]
google-drive = ["google-api-python-client"]
hubspot = ["hubspot-api-client", "urllib3"]
huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"]
image = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypdf", "pytesseract", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)"]
image = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypdf", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)"]
jira = ["atlassian-python-api"]
kafka = ["confluent-kafka"]
local-inference = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypandoc", "pypdf", "pytesseract", "python-docx (>=1.1.2)", "python-oxmsg", "python-pptx (<=0.6.23)", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)", "xlrd"]
local-inference = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-oxmsg", "python-pptx (>=1.0.1)", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)", "xlrd"]
md = ["markdown"]
mongodb = ["pymongo"]
msg = ["python-oxmsg"]
notion = ["htmlBuilder", "notion-client"]
odt = ["pypandoc", "python-docx (>=1.1.2)"]
onedrive = ["Office365-REST-Python-Client", "bs4", "msal"]
openai = ["langchain-community", "openai", "tiktoken"]
openai = ["langchain-openai"]
opensearch = ["opensearch-py"]
org = ["pypandoc"]
outlook = ["Office365-REST-Python-Client", "msal"]
paddleocr = ["unstructured.paddleocr (==2.8.0.1)"]
pdf = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypdf", "pytesseract", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)"]
paddleocr = ["paddlepaddle (==3.0.0b1)", "unstructured.paddleocr (==2.8.0.1)"]
pdf = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pikepdf", "pillow-heif", "pypdf", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)"]
pinecone = ["pinecone-client (>=3.7.1)"]
postgres = ["psycopg2-binary"]
ppt = ["python-pptx (<=0.6.23)"]
pptx = ["python-pptx (<=0.6.23)"]
ppt = ["python-pptx (>=1.0.1)"]
pptx = ["python-pptx (>=1.0.1)"]
qdrant = ["qdrant-client"]
reddit = ["praw"]
rst = ["pypandoc"]
@ -4416,4 +4406,4 @@ local = ["unstructured"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<4.0"
content-hash = "250df0f667fbb8eafc622dfe771541a9a25e718cc786f6adb0c13081c4b77745"
content-hash = "d95a01d052e3f6175a45c5a589692274300a88782938ed71f835c5f68842d821"

View File

@ -15,7 +15,7 @@ license = "MIT"
python = ">=3.9,<4.0"
langchain-core = "^0.2.23"
unstructured-client = { version = "^0.24.1" }
unstructured = { version = "^0.15.0", optional = true, python = "<3.13", extras = [
unstructured = { version = "^0.15.7", optional = true, python = "<3.13", extras = [
"all-docs",
] }
@ -50,7 +50,7 @@ ruff = "^0.1.8"
[tool.poetry.group.typing.dependencies]
mypy = "^1.7.1"
unstructured = { version = "^0.15.0", python = "<3.13", extras = ["all-docs"] }
unstructured = { version = "^0.15.7", python = "<3.13", extras = ["all-docs"] }
langchain-core = { path = "../../core", develop = true }
[tool.poetry.group.dev]