From 11e8457606fcf78b4db454ab3e92d8438abf1235 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 10 May 2023 15:16:13 -0400 Subject: [PATCH] Revert "Add workflow for testing with all deps (#4410)" This reverts commit 80558b5b278a9d3262ca7b4c6f0878e31d5fa459. --- .github/actions/poetry_setup/action.yml | 64 ---------- .github/workflows/test.yml | 10 +- .github/workflows/test_all.yml | 33 ----- poetry.lock | 119 +----------------- pyproject.toml | 3 - tests/data.py | 10 -- .../parsers/test_pdf_parsers.py | 79 ------------ .../parsers/test_public_api.py | 11 -- 8 files changed, 7 insertions(+), 322 deletions(-) delete mode 100644 .github/actions/poetry_setup/action.yml delete mode 100644 .github/workflows/test_all.yml delete mode 100644 tests/data.py delete mode 100644 tests/unit_tests/document_loader/parsers/test_pdf_parsers.py delete mode 100644 tests/unit_tests/document_loader/parsers/test_public_api.py diff --git a/.github/actions/poetry_setup/action.yml b/.github/actions/poetry_setup/action.yml deleted file mode 100644 index 3e2ef52fb62..00000000000 --- a/.github/actions/poetry_setup/action.yml +++ /dev/null @@ -1,64 +0,0 @@ -# An action for setting up poetry install with caching. -# Using a custom action since the default action does not -# take poetry install groups into account. -# Action code from: -# https://github.com/actions/setup-python/issues/505#issuecomment-1273013236 -name: poetry-install-with-caching -description: Poetry install with support for caching of dependency groups. - -inputs: - python-version: - description: Python version, supporting MAJOR.MINOR only - required: true - - poetry-version: - description: Poetry version - required: true - - install-command: - description: Command run for installing dependencies - required: false - default: poetry install - - cache-key: - description: Cache key to use for manual handling of caching - required: true - - working-directory: - description: Directory to run install-command in - required: false - default: "" - -runs: - using: composite - steps: - - uses: actions/setup-python@v4 - with: - python-version: ${{ inputs.python-version }} - - - uses: actions/cache@v3 - id: cache-pip - env: - SEGMENT_DOWNLOAD_TIMEOUT_MIN: "15" - with: - path: | - ~/.cache/pip - key: pip-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }} - - - run: pipx install poetry==${{ inputs.poetry-version }} --python python${{ inputs.python-version }} - shell: bash - - - uses: actions/cache@v3 - id: cache-poetry - env: - SEGMENT_DOWNLOAD_TIMEOUT_MIN: "15" - with: - path: | - ~/.cache/pypoetry/virtualenvs - ~/.cache/pypoetry/cache - ~/.cache/pypoetry/artifacts - key: poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-poetry-${{ inputs.poetry-version }}-${{ inputs.cache-key }}-${{ hashFiles('poetry.lock') }} - - - run: ${{ inputs.install-command }} - working-directory: ${{ inputs.working-directory }} - shell: bash diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ec960bb411e..ed46af8aece 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,13 +20,15 @@ jobs: - "3.11" steps: - uses: actions/checkout@v3 + - name: Install poetry + run: pipx install poetry==$POETRY_VERSION - name: Set up Python ${{ matrix.python-version }} - uses: "./.github/actions/poetry_setup" + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - poetry-version: "1.4.2" - cache-key: "main" - install-command: "poetry install" + cache: "poetry" + - name: Install dependencies + run: poetry install - name: Run unit tests run: | make test diff --git a/.github/workflows/test_all.yml b/.github/workflows/test_all.yml deleted file mode 100644 index d914a16f740..00000000000 --- a/.github/workflows/test_all.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Run unit tests with all optional packages installed. -name: test_all - -on: - push: - branches: [master] - pull_request: - -env: - POETRY_VERSION: "1.4.2" - -jobs: - build: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: - - "3.8" - - "3.9" - - "3.10" - - "3.11" - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: "./.github/actions/poetry_setup" - with: - python-version: ${{ matrix.python-version }} - poetry-version: "1.4.2" - cache-key: "extended" - install-command: "poetry install -E extended_testing" - - name: Run unit tests - run: | - make test diff --git a/poetry.lock b/poetry.lock index d29d7498a89..0c0b0b722eb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5362,27 +5362,6 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] s3 = ["boto3"] test = ["mock", "pytest", "pytest-coverage", "typer-cli"] -[[package]] -name = "pdfminer-six" -version = "20221105" -description = "PDF parser and analyzer" -category = "main" -optional = true -python-versions = ">=3.6" -files = [ - {file = "pdfminer.six-20221105-py3-none-any.whl", hash = "sha256:1eaddd712d5b2732f8ac8486824533514f8ba12a0787b3d5fe1e686cd826532d"}, - {file = "pdfminer.six-20221105.tar.gz", hash = "sha256:8448ab7b939d18b64820478ecac5394f482d7a79f5f7eaa7703c6c959c175e1d"}, -] - -[package.dependencies] -charset-normalizer = ">=2.0.0" -cryptography = ">=36.0.0" - -[package.extras] -dev = ["black", "mypy (==0.931)", "nox", "pytest"] -docs = ["sphinx", "sphinx-argparse"] -image = ["Pillow"] - [[package]] name = "pexpect" version = "4.8.0" @@ -6244,101 +6223,6 @@ pyarrow = ">=10" [package.extras] tests = ["duckdb", "polars[pandas,pyarrow]", "pytest"] -[[package]] -name = "pymongo" -version = "4.3.3" -description = "Python driver for MongoDB " -category = "dev" -optional = false -python-versions = ">=3.7" -files = [ - {file = "pymongo-4.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:74731c9e423c93cbe791f60c27030b6af6a948cef67deca079da6cd1bb583a8e"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux1_i686.whl", hash = "sha256:66413c50d510e5bcb0afc79880d1693a2185bcea003600ed898ada31338c004e"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:9b87b23570565a6ddaa9244d87811c2ee9cffb02a753c8a2da9c077283d85845"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_i686.whl", hash = "sha256:695939036a320f4329ccf1627edefbbb67cc7892b8222d297b0dd2313742bfee"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_ppc64le.whl", hash = "sha256:ffcc8394123ea8d43fff8e5d000095fe7741ce3f8988366c5c919c4f5eb179d3"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_s390x.whl", hash = "sha256:943f208840777f34312c103a2d1caab02d780c4e9be26b3714acf6c4715ba7e1"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:01f7cbe88d22440b6594c955e37312d932fd632ffed1a86d0c361503ca82cc9d"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdb87309de97c63cb9a69132e1cb16be470e58cffdfbad68fdd1dc292b22a840"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d86c35d94b5499689354ccbc48438a79f449481ee6300f3e905748edceed78e7"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a966d5304b7d90c45c404914e06bbf02c5bf7e99685c6c12f0047ef2aa837142"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be1d2ce7e269215c3ee9a215e296b7a744aff4f39233486d2c4d77f5f0c561a6"}, - {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:55b6163dac53ef1e5d834297810c178050bd0548a4136cd4e0f56402185916ca"}, - {file = "pymongo-4.3.3-cp310-cp310-win32.whl", hash = "sha256:dc0cff74cd36d7e1edba91baa09622c35a8a57025f2f2b7a41e3f83b1db73186"}, - {file = "pymongo-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:cafa52873ae12baa512a8721afc20de67a36886baae6a5f394ddef0ce9391f91"}, - {file = "pymongo-4.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:599d3f6fbef31933b96e2d906b0f169b3371ff79ea6aaf6ecd76c947a3508a3d"}, - {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0640b4e9d008e13956b004d1971a23377b3d45491f87082161c92efb1e6c0d6"}, - {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:341221e2f2866a5960e6f8610f4cbac0bb13097f3b1a289aa55aba984fc0d969"}, - {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e7fac06a539daef4fcf5d8288d0d21b412f9b750454cd5a3cf90484665db442a"}, - {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3a51901066696c4af38c6c63a1f0aeffd5e282367ff475de8c191ec9609b56d"}, - {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3055510fdfdb1775bc8baa359783022f70bb553f2d46e153c094dfcb08578ff"}, - {file = "pymongo-4.3.3-cp311-cp311-win32.whl", hash = "sha256:524d78673518dcd352a91541ecd2839c65af92dc883321c2109ef6e5cd22ef23"}, - {file = "pymongo-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b8a03af1ce79b902a43f5f694c4ca8d92c2a4195db0966f08f266549e2fc49bc"}, - {file = "pymongo-4.3.3-cp37-cp37m-macosx_10_6_intel.whl", hash = "sha256:39b03045c71f761aee96a12ebfbc2f4be89e724ff6f5e31c2574c1a0e2add8bd"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6fcfbf435eebf8a1765c6d1f46821740ebe9f54f815a05c8fc30d789ef43cb12"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7d43ac9c7eeda5100fb0a7152fab7099c9cf9e5abd3bb36928eb98c7d7a339c6"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3b93043b14ba7eb08c57afca19751658ece1cfa2f0b7b1fb5c7a41452fbb8482"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:c09956606c08c4a7c6178a04ba2dd9388fcc5db32002ade9c9bc865ab156ab6d"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:b0cfe925610f2fd59555bb7fc37bd739e4b197d33f2a8b2fae7b9c0c6640318c"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:4d00b91c77ceb064c9b0459f0d6ea5bfdbc53ea9e17cf75731e151ef25a830c7"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:c6258a3663780ae47ba73d43eb63c79c40ffddfb764e09b56df33be2f9479837"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c29e758f0e734e1e90357ae01ec9c6daf19ff60a051192fe110d8fb25c62600e"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f3621a46cdc7a9ba8080422262398a91762a581d27e0647746588d3f995c88"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:47f7aa217b25833cd6f0e72b0d224be55393c2692b4f5e0561cb3beeb10296e9"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2fdc855149efe7cdcc2a01ca02bfa24761c640203ea94df467f3baf19078be"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5effd87c7d363890259eac16c56a4e8da307286012c076223997f8cc4a8c435b"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6dd1cf2995fdbd64fc0802313e8323f5fa18994d51af059b5b8862b73b5e53f0"}, - {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:bb869707d8e30645ed6766e44098600ca6cdf7989c22a3ea2b7966bb1d98d4b2"}, - {file = "pymongo-4.3.3-cp37-cp37m-win32.whl", hash = "sha256:49210feb0be8051a64d71691f0acbfbedc33e149f0a5d6e271fddf6a12493fed"}, - {file = "pymongo-4.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:54c377893f2cbbffe39abcff5ff2e917b082c364521fa079305f6f064e1a24a9"}, - {file = "pymongo-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c184ec5be465c0319440734491e1aa4709b5f3ba75fdfc9dbbc2ae715a7f6829"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:dca34367a4e77fcab0693e603a959878eaf2351585e7d752cac544bc6b2dee46"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cd6a4afb20fb3c26a7bfd4611a0bbb24d93cbd746f5eb881f114b5e38fd55501"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0c466710871d0026c190fc4141e810cf9d9affbf4935e1d273fbdc7d7cda6143"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:d07d06dba5b5f7d80f9cc45501456e440f759fe79f9895922ed486237ac378a8"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:711bc52cb98e7892c03e9b669bebd89c0a890a90dbc6d5bb2c47f30239bac6e9"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:34b040e095e1671df0c095ec0b04fc4ebb19c4c160f87c2b55c079b16b1a6b00"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:4ed00f96e147f40b565fe7530d1da0b0f3ab803d5dd5b683834500fa5d195ec4"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef888f48eb9203ee1e04b9fb27429017b290fb916f1e7826c2f7808c88798394"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:316498b642c00401370b2156b5233b256f9b33799e0a8d9d0b8a7da217a20fca"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa7e202feb683dad74f00dea066690448d0cfa310f8a277db06ec8eb466601b5"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52896e22115c97f1c829db32aa2760b0d61839cfe08b168c2b1d82f31dbc5f55"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c051fe37c96b9878f37fa58906cb53ecd13dcb7341d3a85f1e2e2f6b10782d9"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5134d33286c045393c7beb51be29754647cec5ebc051cf82799c5ce9820a2ca2"}, - {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a9c2885b4a8e6e39db5662d8b02ca6dcec796a45e48c2de12552841f061692ba"}, - {file = "pymongo-4.3.3-cp38-cp38-win32.whl", hash = "sha256:a6cd6f1db75eb07332bd3710f58f5fce4967eadbf751bad653842750a61bda62"}, - {file = "pymongo-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:d5571b6978750601f783cea07fb6b666837010ca57e5cefa389c1d456f6222e2"}, - {file = "pymongo-4.3.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:81d1a7303bd02ca1c5be4aacd4db73593f573ba8e0c543c04c6da6275fd7a47e"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:016c412118e1c23fef3a1eada4f83ae6e8844fd91986b2e066fc1b0013cdd9ae"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:8fd6e191b92a10310f5a6cfe10d6f839d79d192fb02480bda325286bd1c7b385"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:e2961b05f9c04a53da8bfc72f1910b6aec7205fcf3ac9c036d24619979bbee4b"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:b38a96b3eed8edc515b38257f03216f382c4389d022a8834667e2bc63c0c0c31"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:c1a70c51da9fa95bd75c167edb2eb3f3c4d27bc4ddd29e588f21649d014ec0b7"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:8a06a0c02f5606330e8f2e2f3b7949877ca7e4024fa2bff5a4506bec66c49ec7"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:6c2216d8b6a6d019c6f4b1ad55f890e5e77eb089309ffc05b6911c09349e7474"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eac0a143ef4f28f49670bf89cb15847eb80b375d55eba401ca2f777cd425f338"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:08fc250b5552ee97ceeae0f52d8b04f360291285fc7437f13daa516ce38fdbc6"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704d939656e21b073bfcddd7228b29e0e8a93dd27b54240eaafc0b9a631629a6"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1074f1a6f23e28b983c96142f2d45be03ec55d93035b471c26889a7ad2365db3"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b16250238de8dafca225647608dddc7bbb5dce3dd53b4d8e63c1cc287394c2f"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7761cacb8745093062695b11574effea69db636c2fd0a9269a1f0183712927b4"}, - {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fd7bb378d82b88387dc10227cfd964f6273eb083e05299e9b97cbe075da12d11"}, - {file = "pymongo-4.3.3-cp39-cp39-win32.whl", hash = "sha256:dc24d245026a72d9b4953729d31813edd4bd4e5c13622d96e27c284942d33f24"}, - {file = "pymongo-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:fc28e8d85d392a06434e9a934908d97e2cf453d69488d2bcd0bfb881497fd975"}, - {file = "pymongo-4.3.3.tar.gz", hash = "sha256:34e95ffb0a68bffbc3b437f2d1f25fc916fef3df5cdeed0992da5f42fae9b807"}, -] - -[package.dependencies] -dnspython = ">=1.16.0,<3.0.0" - -[package.extras] -aws = ["pymongo-auth-aws (<2.0.0)"] -encryption = ["pymongo-auth-aws (<2.0.0)", "pymongocrypt (>=1.3.0,<2.0.0)"] -gssapi = ["pykerberos"] -ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"] -snappy = ["python-snappy"] -zstd = ["zstandard"] - [[package]] name = "pyowm" version = "3.3.0" @@ -9802,7 +9686,6 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] -extended-testing = ["pdfminer-six", "pypdf"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] @@ -9810,4 +9693,4 @@ qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "63324f95fc04b48b631353c5e1a1a2ecc1b9dc757441181d4920e9d77cd8951f" +content-hash = "2352db14ae75227c4d1ab34d48c74da3a16ceaeb5c5fa5df1a1dfcc5ae8e69e6" diff --git a/pyproject.toml b/pyproject.toml index 737552380a5..ff5cbebb81c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,6 @@ pexpect = {version = "^4.8.0", optional = true} pyvespa = {version = "^0.33.0", optional = true} O365 = {version = "^2.0.26", optional = true} jq = {version = "^1.4.1", optional = true} -pdfminer-six = {version = "^20221105", optional = true} [tool.poetry.group.docs.dependencies] @@ -162,8 +161,6 @@ cohere = ["cohere"] embeddings = ["sentence-transformers"] azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"] all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq"] -# An extra used to be able to add extended testing. -extended_testing = ["pypdf", "pdfminer.six"] [tool.ruff] select = [ diff --git a/tests/data.py b/tests/data.py deleted file mode 100644 index 228a9b212e1..00000000000 --- a/tests/data.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Module defines common test data.""" -from pathlib import Path - -_THIS_DIR = Path(__file__).parent - -_EXAMPLES_DIR = _THIS_DIR / "integration_tests" / "examples" - -# Paths to test PDF files -HELLO_PDF = _EXAMPLES_DIR / "hello.pdf" -LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf" diff --git a/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py b/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py deleted file mode 100644 index 4063cece835..00000000000 --- a/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Tests for the various PDF parsers.""" -from typing import Iterator - -import pytest - -from langchain.document_loaders.base import BaseBlobParser -from langchain.document_loaders.blob_loaders import Blob -from langchain.document_loaders.parsers.pdf import ( - PDFMinerParser, - PyMuPDFParser, - PyPDFium2Parser, - PyPDFParser, -) -from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF - - -def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None: - """Standard tests to verify that the given parser works. - - Args: - parser (BaseBlobParser): The parser to test. - splits_by_page (bool): Whether the parser splits by page or not by default. - """ - blob = Blob.from_path(HELLO_PDF) - doc_generator = parser.lazy_parse(blob) - assert isinstance(doc_generator, Iterator) - docs = list(doc_generator) - assert len(docs) == 1 - page_content = docs[0].page_content - assert isinstance(page_content, str) - # The different parsers return different amount of whitespace, so using - # startswith instead of equals. - assert docs[0].page_content.startswith("Hello world!") - - blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF) - doc_generator = parser.lazy_parse(blob) - assert isinstance(doc_generator, Iterator) - docs = list(doc_generator) - - if splits_by_page: - assert len(docs) == 16 - else: - assert len(docs) == 1 - # Test is imprecise since the parsers yield different parse information depending - # on configuration. Each parser seems to yield a slightly different result - # for this page! - assert "LayoutParser" in docs[0].page_content - metadata = docs[0].metadata - - assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF) - - if splits_by_page: - assert metadata["page"] == 0 - - -@pytest.mark.requires("fitz") -def test_pymupdf_loader() -> None: - """Test PyMuPDF loader.""" - _assert_with_parser(PyMuPDFParser()) - - -@pytest.mark.requires("pypdf") -def test_pypdf_parser() -> None: - """Test PyPDF parser.""" - _assert_with_parser(PyPDFParser()) - - -@pytest.mark.requires("pdfminer") -def test_pdfminer_parser() -> None: - """Test PDFMiner parser.""" - # Does not follow defaults to split by page. - _assert_with_parser(PDFMinerParser(), splits_by_page=False) - - -@pytest.mark.requires("pypdfium2") -def test_pypdfium2_parser() -> None: - """Test PyPDFium2 parser.""" - # Does not follow defaults to split by page. - _assert_with_parser(PyPDFium2Parser()) diff --git a/tests/unit_tests/document_loader/parsers/test_public_api.py b/tests/unit_tests/document_loader/parsers/test_public_api.py deleted file mode 100644 index 52ce7e8e3e4..00000000000 --- a/tests/unit_tests/document_loader/parsers/test_public_api.py +++ /dev/null @@ -1,11 +0,0 @@ -from langchain.document_loaders.parsers import __all__ - - -def test_parsers_public_api_correct() -> None: - """Test public API of parsers for breaking changes.""" - assert set(__all__) == { - "PyPDFParser", - "PDFMinerParser", - "PyMuPDFParser", - "PyPDFium2Parser", - }