From 80558b5b278a9d3262ca7b4c6f0878e31d5fa459 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 10 May 2023 09:35:07 -0400 Subject: [PATCH] Add workflow for testing with all deps (#4410) # Add action to test with all dependencies installed PR adds a custom action for setting up poetry that allows specifying a cache key: https://github.com/actions/setup-python/issues/505#issuecomment-1273013236 This makes it possible to run 2 types of unit tests: (1) unit tests with only core dependencies (2) unit tests with extended dependencies (e.g., those that rely on an optional pdf parsing library) As part of this PR, we're moving some pdf parsing tests into the unit-tests section and making sure that these unit tests get executed when running with extended dependencies. --- .github/actions/poetry_setup/action.yml | 64 ++++++++++ .github/workflows/test.yml | 10 +- .github/workflows/test_all.yml | 33 +++++ poetry.lock | 119 +++++++++++++++++- pyproject.toml | 3 + tests/data.py | 10 ++ .../parsers/test_pdf_parsers.py | 79 ++++++++++++ .../parsers/test_public_api.py | 11 ++ 8 files changed, 322 insertions(+), 7 deletions(-) create mode 100644 .github/actions/poetry_setup/action.yml create mode 100644 .github/workflows/test_all.yml create mode 100644 tests/data.py create mode 100644 tests/unit_tests/document_loader/parsers/test_pdf_parsers.py create mode 100644 tests/unit_tests/document_loader/parsers/test_public_api.py diff --git a/.github/actions/poetry_setup/action.yml b/.github/actions/poetry_setup/action.yml new file mode 100644 index 00000000000..3e2ef52fb62 --- /dev/null +++ b/.github/actions/poetry_setup/action.yml @@ -0,0 +1,64 @@ +# An action for setting up poetry install with caching. +# Using a custom action since the default action does not +# take poetry install groups into account. +# Action code from: +# https://github.com/actions/setup-python/issues/505#issuecomment-1273013236 +name: poetry-install-with-caching +description: Poetry install with support for caching of dependency groups. + +inputs: + python-version: + description: Python version, supporting MAJOR.MINOR only + required: true + + poetry-version: + description: Poetry version + required: true + + install-command: + description: Command run for installing dependencies + required: false + default: poetry install + + cache-key: + description: Cache key to use for manual handling of caching + required: true + + working-directory: + description: Directory to run install-command in + required: false + default: "" + +runs: + using: composite + steps: + - uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + + - uses: actions/cache@v3 + id: cache-pip + env: + SEGMENT_DOWNLOAD_TIMEOUT_MIN: "15" + with: + path: | + ~/.cache/pip + key: pip-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }} + + - run: pipx install poetry==${{ inputs.poetry-version }} --python python${{ inputs.python-version }} + shell: bash + + - uses: actions/cache@v3 + id: cache-poetry + env: + SEGMENT_DOWNLOAD_TIMEOUT_MIN: "15" + with: + path: | + ~/.cache/pypoetry/virtualenvs + ~/.cache/pypoetry/cache + ~/.cache/pypoetry/artifacts + key: poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-poetry-${{ inputs.poetry-version }}-${{ inputs.cache-key }}-${{ hashFiles('poetry.lock') }} + + - run: ${{ inputs.install-command }} + working-directory: ${{ inputs.working-directory }} + shell: bash diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ed46af8aece..ec960bb411e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,15 +20,13 @@ jobs: - "3.11" steps: - uses: actions/checkout@v3 - - name: Install poetry - run: pipx install poetry==$POETRY_VERSION - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: "./.github/actions/poetry_setup" with: python-version: ${{ matrix.python-version }} - cache: "poetry" - - name: Install dependencies - run: poetry install + poetry-version: "1.4.2" + cache-key: "main" + install-command: "poetry install" - name: Run unit tests run: | make test diff --git a/.github/workflows/test_all.yml b/.github/workflows/test_all.yml new file mode 100644 index 00000000000..d914a16f740 --- /dev/null +++ b/.github/workflows/test_all.yml @@ -0,0 +1,33 @@ +# Run unit tests with all optional packages installed. +name: test_all + +on: + push: + branches: [master] + pull_request: + +env: + POETRY_VERSION: "1.4.2" + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: "./.github/actions/poetry_setup" + with: + python-version: ${{ matrix.python-version }} + poetry-version: "1.4.2" + cache-key: "extended" + install-command: "poetry install -E extended_testing" + - name: Run unit tests + run: | + make test diff --git a/poetry.lock b/poetry.lock index 0c0b0b722eb..d29d7498a89 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5362,6 +5362,27 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] s3 = ["boto3"] test = ["mock", "pytest", "pytest-coverage", "typer-cli"] +[[package]] +name = "pdfminer-six" +version = "20221105" +description = "PDF parser and analyzer" +category = "main" +optional = true +python-versions = ">=3.6" +files = [ + {file = "pdfminer.six-20221105-py3-none-any.whl", hash = "sha256:1eaddd712d5b2732f8ac8486824533514f8ba12a0787b3d5fe1e686cd826532d"}, + {file = "pdfminer.six-20221105.tar.gz", hash = "sha256:8448ab7b939d18b64820478ecac5394f482d7a79f5f7eaa7703c6c959c175e1d"}, +] + +[package.dependencies] +charset-normalizer = ">=2.0.0" +cryptography = ">=36.0.0" + +[package.extras] +dev = ["black", "mypy (==0.931)", "nox", "pytest"] +docs = ["sphinx", "sphinx-argparse"] +image = ["Pillow"] + [[package]] name = "pexpect" version = "4.8.0" @@ -6223,6 +6244,101 @@ pyarrow = ">=10" [package.extras] tests = ["duckdb", "polars[pandas,pyarrow]", "pytest"] +[[package]] +name = "pymongo" +version = "4.3.3" +description = "Python driver for MongoDB " +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pymongo-4.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:74731c9e423c93cbe791f60c27030b6af6a948cef67deca079da6cd1bb583a8e"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux1_i686.whl", hash = "sha256:66413c50d510e5bcb0afc79880d1693a2185bcea003600ed898ada31338c004e"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:9b87b23570565a6ddaa9244d87811c2ee9cffb02a753c8a2da9c077283d85845"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_i686.whl", hash = "sha256:695939036a320f4329ccf1627edefbbb67cc7892b8222d297b0dd2313742bfee"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_ppc64le.whl", hash = "sha256:ffcc8394123ea8d43fff8e5d000095fe7741ce3f8988366c5c919c4f5eb179d3"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_s390x.whl", hash = "sha256:943f208840777f34312c103a2d1caab02d780c4e9be26b3714acf6c4715ba7e1"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:01f7cbe88d22440b6594c955e37312d932fd632ffed1a86d0c361503ca82cc9d"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdb87309de97c63cb9a69132e1cb16be470e58cffdfbad68fdd1dc292b22a840"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d86c35d94b5499689354ccbc48438a79f449481ee6300f3e905748edceed78e7"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a966d5304b7d90c45c404914e06bbf02c5bf7e99685c6c12f0047ef2aa837142"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be1d2ce7e269215c3ee9a215e296b7a744aff4f39233486d2c4d77f5f0c561a6"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:55b6163dac53ef1e5d834297810c178050bd0548a4136cd4e0f56402185916ca"}, + {file = "pymongo-4.3.3-cp310-cp310-win32.whl", hash = "sha256:dc0cff74cd36d7e1edba91baa09622c35a8a57025f2f2b7a41e3f83b1db73186"}, + {file = "pymongo-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:cafa52873ae12baa512a8721afc20de67a36886baae6a5f394ddef0ce9391f91"}, + {file = "pymongo-4.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:599d3f6fbef31933b96e2d906b0f169b3371ff79ea6aaf6ecd76c947a3508a3d"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0640b4e9d008e13956b004d1971a23377b3d45491f87082161c92efb1e6c0d6"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:341221e2f2866a5960e6f8610f4cbac0bb13097f3b1a289aa55aba984fc0d969"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e7fac06a539daef4fcf5d8288d0d21b412f9b750454cd5a3cf90484665db442a"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3a51901066696c4af38c6c63a1f0aeffd5e282367ff475de8c191ec9609b56d"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3055510fdfdb1775bc8baa359783022f70bb553f2d46e153c094dfcb08578ff"}, + {file = "pymongo-4.3.3-cp311-cp311-win32.whl", hash = "sha256:524d78673518dcd352a91541ecd2839c65af92dc883321c2109ef6e5cd22ef23"}, + {file = "pymongo-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b8a03af1ce79b902a43f5f694c4ca8d92c2a4195db0966f08f266549e2fc49bc"}, + {file = "pymongo-4.3.3-cp37-cp37m-macosx_10_6_intel.whl", hash = "sha256:39b03045c71f761aee96a12ebfbc2f4be89e724ff6f5e31c2574c1a0e2add8bd"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6fcfbf435eebf8a1765c6d1f46821740ebe9f54f815a05c8fc30d789ef43cb12"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7d43ac9c7eeda5100fb0a7152fab7099c9cf9e5abd3bb36928eb98c7d7a339c6"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3b93043b14ba7eb08c57afca19751658ece1cfa2f0b7b1fb5c7a41452fbb8482"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:c09956606c08c4a7c6178a04ba2dd9388fcc5db32002ade9c9bc865ab156ab6d"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:b0cfe925610f2fd59555bb7fc37bd739e4b197d33f2a8b2fae7b9c0c6640318c"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:4d00b91c77ceb064c9b0459f0d6ea5bfdbc53ea9e17cf75731e151ef25a830c7"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:c6258a3663780ae47ba73d43eb63c79c40ffddfb764e09b56df33be2f9479837"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c29e758f0e734e1e90357ae01ec9c6daf19ff60a051192fe110d8fb25c62600e"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f3621a46cdc7a9ba8080422262398a91762a581d27e0647746588d3f995c88"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:47f7aa217b25833cd6f0e72b0d224be55393c2692b4f5e0561cb3beeb10296e9"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2fdc855149efe7cdcc2a01ca02bfa24761c640203ea94df467f3baf19078be"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5effd87c7d363890259eac16c56a4e8da307286012c076223997f8cc4a8c435b"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6dd1cf2995fdbd64fc0802313e8323f5fa18994d51af059b5b8862b73b5e53f0"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:bb869707d8e30645ed6766e44098600ca6cdf7989c22a3ea2b7966bb1d98d4b2"}, + {file = "pymongo-4.3.3-cp37-cp37m-win32.whl", hash = "sha256:49210feb0be8051a64d71691f0acbfbedc33e149f0a5d6e271fddf6a12493fed"}, + {file = "pymongo-4.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:54c377893f2cbbffe39abcff5ff2e917b082c364521fa079305f6f064e1a24a9"}, + {file = "pymongo-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c184ec5be465c0319440734491e1aa4709b5f3ba75fdfc9dbbc2ae715a7f6829"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:dca34367a4e77fcab0693e603a959878eaf2351585e7d752cac544bc6b2dee46"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cd6a4afb20fb3c26a7bfd4611a0bbb24d93cbd746f5eb881f114b5e38fd55501"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0c466710871d0026c190fc4141e810cf9d9affbf4935e1d273fbdc7d7cda6143"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:d07d06dba5b5f7d80f9cc45501456e440f759fe79f9895922ed486237ac378a8"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:711bc52cb98e7892c03e9b669bebd89c0a890a90dbc6d5bb2c47f30239bac6e9"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:34b040e095e1671df0c095ec0b04fc4ebb19c4c160f87c2b55c079b16b1a6b00"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:4ed00f96e147f40b565fe7530d1da0b0f3ab803d5dd5b683834500fa5d195ec4"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef888f48eb9203ee1e04b9fb27429017b290fb916f1e7826c2f7808c88798394"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:316498b642c00401370b2156b5233b256f9b33799e0a8d9d0b8a7da217a20fca"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa7e202feb683dad74f00dea066690448d0cfa310f8a277db06ec8eb466601b5"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52896e22115c97f1c829db32aa2760b0d61839cfe08b168c2b1d82f31dbc5f55"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c051fe37c96b9878f37fa58906cb53ecd13dcb7341d3a85f1e2e2f6b10782d9"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5134d33286c045393c7beb51be29754647cec5ebc051cf82799c5ce9820a2ca2"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a9c2885b4a8e6e39db5662d8b02ca6dcec796a45e48c2de12552841f061692ba"}, + {file = "pymongo-4.3.3-cp38-cp38-win32.whl", hash = "sha256:a6cd6f1db75eb07332bd3710f58f5fce4967eadbf751bad653842750a61bda62"}, + {file = "pymongo-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:d5571b6978750601f783cea07fb6b666837010ca57e5cefa389c1d456f6222e2"}, + {file = "pymongo-4.3.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:81d1a7303bd02ca1c5be4aacd4db73593f573ba8e0c543c04c6da6275fd7a47e"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:016c412118e1c23fef3a1eada4f83ae6e8844fd91986b2e066fc1b0013cdd9ae"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:8fd6e191b92a10310f5a6cfe10d6f839d79d192fb02480bda325286bd1c7b385"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:e2961b05f9c04a53da8bfc72f1910b6aec7205fcf3ac9c036d24619979bbee4b"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:b38a96b3eed8edc515b38257f03216f382c4389d022a8834667e2bc63c0c0c31"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:c1a70c51da9fa95bd75c167edb2eb3f3c4d27bc4ddd29e588f21649d014ec0b7"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:8a06a0c02f5606330e8f2e2f3b7949877ca7e4024fa2bff5a4506bec66c49ec7"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:6c2216d8b6a6d019c6f4b1ad55f890e5e77eb089309ffc05b6911c09349e7474"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eac0a143ef4f28f49670bf89cb15847eb80b375d55eba401ca2f777cd425f338"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:08fc250b5552ee97ceeae0f52d8b04f360291285fc7437f13daa516ce38fdbc6"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704d939656e21b073bfcddd7228b29e0e8a93dd27b54240eaafc0b9a631629a6"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1074f1a6f23e28b983c96142f2d45be03ec55d93035b471c26889a7ad2365db3"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b16250238de8dafca225647608dddc7bbb5dce3dd53b4d8e63c1cc287394c2f"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7761cacb8745093062695b11574effea69db636c2fd0a9269a1f0183712927b4"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fd7bb378d82b88387dc10227cfd964f6273eb083e05299e9b97cbe075da12d11"}, + {file = "pymongo-4.3.3-cp39-cp39-win32.whl", hash = "sha256:dc24d245026a72d9b4953729d31813edd4bd4e5c13622d96e27c284942d33f24"}, + {file = "pymongo-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:fc28e8d85d392a06434e9a934908d97e2cf453d69488d2bcd0bfb881497fd975"}, + {file = "pymongo-4.3.3.tar.gz", hash = "sha256:34e95ffb0a68bffbc3b437f2d1f25fc916fef3df5cdeed0992da5f42fae9b807"}, +] + +[package.dependencies] +dnspython = ">=1.16.0,<3.0.0" + +[package.extras] +aws = ["pymongo-auth-aws (<2.0.0)"] +encryption = ["pymongo-auth-aws (<2.0.0)", "pymongocrypt (>=1.3.0,<2.0.0)"] +gssapi = ["pykerberos"] +ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"] +snappy = ["python-snappy"] +zstd = ["zstandard"] + [[package]] name = "pyowm" version = "3.3.0" @@ -9686,6 +9802,7 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] +extended-testing = ["pdfminer-six", "pypdf"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] @@ -9693,4 +9810,4 @@ qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "2352db14ae75227c4d1ab34d48c74da3a16ceaeb5c5fa5df1a1dfcc5ae8e69e6" +content-hash = "63324f95fc04b48b631353c5e1a1a2ecc1b9dc757441181d4920e9d77cd8951f" diff --git a/pyproject.toml b/pyproject.toml index a4dc14ac6cf..18e4ef7e597 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,7 @@ pexpect = {version = "^4.8.0", optional = true} pyvespa = {version = "^0.33.0", optional = true} O365 = {version = "^2.0.26", optional = true} jq = {version = "^1.4.1", optional = true} +pdfminer-six = {version = "^20221105", optional = true} [tool.poetry.group.docs.dependencies] @@ -161,6 +162,8 @@ cohere = ["cohere"] embeddings = ["sentence-transformers"] azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"] all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq"] +# An extra used to be able to add extended testing. +extended_testing = ["pypdf", "pdfminer.six"] [tool.ruff] select = [ diff --git a/tests/data.py b/tests/data.py new file mode 100644 index 00000000000..228a9b212e1 --- /dev/null +++ b/tests/data.py @@ -0,0 +1,10 @@ +"""Module defines common test data.""" +from pathlib import Path + +_THIS_DIR = Path(__file__).parent + +_EXAMPLES_DIR = _THIS_DIR / "integration_tests" / "examples" + +# Paths to test PDF files +HELLO_PDF = _EXAMPLES_DIR / "hello.pdf" +LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf" diff --git a/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py b/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py new file mode 100644 index 00000000000..4063cece835 --- /dev/null +++ b/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py @@ -0,0 +1,79 @@ +"""Tests for the various PDF parsers.""" +from typing import Iterator + +import pytest + +from langchain.document_loaders.base import BaseBlobParser +from langchain.document_loaders.blob_loaders import Blob +from langchain.document_loaders.parsers.pdf import ( + PDFMinerParser, + PyMuPDFParser, + PyPDFium2Parser, + PyPDFParser, +) +from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF + + +def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None: + """Standard tests to verify that the given parser works. + + Args: + parser (BaseBlobParser): The parser to test. + splits_by_page (bool): Whether the parser splits by page or not by default. + """ + blob = Blob.from_path(HELLO_PDF) + doc_generator = parser.lazy_parse(blob) + assert isinstance(doc_generator, Iterator) + docs = list(doc_generator) + assert len(docs) == 1 + page_content = docs[0].page_content + assert isinstance(page_content, str) + # The different parsers return different amount of whitespace, so using + # startswith instead of equals. + assert docs[0].page_content.startswith("Hello world!") + + blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF) + doc_generator = parser.lazy_parse(blob) + assert isinstance(doc_generator, Iterator) + docs = list(doc_generator) + + if splits_by_page: + assert len(docs) == 16 + else: + assert len(docs) == 1 + # Test is imprecise since the parsers yield different parse information depending + # on configuration. Each parser seems to yield a slightly different result + # for this page! + assert "LayoutParser" in docs[0].page_content + metadata = docs[0].metadata + + assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF) + + if splits_by_page: + assert metadata["page"] == 0 + + +@pytest.mark.requires("fitz") +def test_pymupdf_loader() -> None: + """Test PyMuPDF loader.""" + _assert_with_parser(PyMuPDFParser()) + + +@pytest.mark.requires("pypdf") +def test_pypdf_parser() -> None: + """Test PyPDF parser.""" + _assert_with_parser(PyPDFParser()) + + +@pytest.mark.requires("pdfminer") +def test_pdfminer_parser() -> None: + """Test PDFMiner parser.""" + # Does not follow defaults to split by page. + _assert_with_parser(PDFMinerParser(), splits_by_page=False) + + +@pytest.mark.requires("pypdfium2") +def test_pypdfium2_parser() -> None: + """Test PyPDFium2 parser.""" + # Does not follow defaults to split by page. + _assert_with_parser(PyPDFium2Parser()) diff --git a/tests/unit_tests/document_loader/parsers/test_public_api.py b/tests/unit_tests/document_loader/parsers/test_public_api.py new file mode 100644 index 00000000000..52ce7e8e3e4 --- /dev/null +++ b/tests/unit_tests/document_loader/parsers/test_public_api.py @@ -0,0 +1,11 @@ +from langchain.document_loaders.parsers import __all__ + + +def test_parsers_public_api_correct() -> None: + """Test public API of parsers for breaking changes.""" + assert set(__all__) == { + "PyPDFParser", + "PDFMinerParser", + "PyMuPDFParser", + "PyPDFium2Parser", + }