From ef365543cb2806df31e977dffa9e4f7b3768e06a Mon Sep 17 00:00:00 2001
From: Jan Heimes <45521680+JANHMS@users.noreply.github.com>
Date: Tue, 3 Dec 2024 23:06:25 +0100
Subject: [PATCH] community: add Needle retriever and document loader integration (#28157)

- [x] **PR title**: "community: add Needle retriever and document loader integration"
- [x] **PR message**:
  - **Description:** This PR adds a new integration for Needle, which includes:
    - **NeedleRetriever**: A retriever for fetching documents from Needle collections.
    - **NeedleLoader**: A document loader for managing and loading documents into Needle collections.
    - Example notebooks demonstrating usage have been added in:
      - `docs/docs/integrations/retrievers/needle.ipynb`
      - `docs/docs/integrations/document_loaders/needle.ipynb`
  - **Dependencies:** The `needle-python` package is required as an external dependency for accessing Needle's API. It has been added to the extended testing dependencies list.
  - **Twitter handle:** Feel free to mention me if this PR gets announced: [needlexai](https://x.com/NeedlexAI).
- [x] **Add tests and docs**:
  1. Unit tests have been added for both `NeedleRetriever` and `NeedleLoader` in `libs/community/tests/unit_tests`. These tests mock API calls to avoid relying on network access.
  2. Example notebooks have been added to `docs/docs/integrations/`, showcasing both retriever and loader functionality.
- [x] **Lint and test**: Ran `make format`, `make lint`, and `make test` from the root of the modified package (see the contribution guidelines: https://python.langchain.com/docs/contributing/).
  - `make format`: Passed
  - `make lint`: Passed
  - `make test`: Passed (requires `needle-python` to be installed locally; this package is not added to LangChain dependencies).

Additional guidelines:
- [x] Optional dependencies are imported only within functions.
- [x] No dependencies have been added to pyproject.toml files except for those required for unit tests.
- [x] The PR does not touch more than one package.
- [x] Changes are fully backwards compatible.
- [x] Community additions are not re-imported into LangChain core.
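As a quick reference for reviewers, here is a minimal end-to-end sketch of the two new components, mirroring the example notebooks in this PR. The collection ID and file URL are placeholders, and it assumes `NEEDLE_API_KEY` is set in the environment:

```python
import os

from langchain_community.document_loaders.needle import NeedleLoader
from langchain_community.retrievers.needle import NeedleRetriever

# Store a document in a Needle collection
# ("your-collection-id" and the file URL below are placeholders)
loader = NeedleLoader(
    needle_api_key=os.getenv("NEEDLE_API_KEY"),
    collection_id="your-collection-id",
)
loader.add_files(files={"example.pdf": "https://example.com/example.pdf"})

# Fetch relevant chunks from the same collection
retriever = NeedleRetriever(
    needle_api_key=os.getenv("NEEDLE_API_KEY"),
    collection_id="your-collection-id",
)
docs = retriever.invoke("example query")
```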
--------- Co-authored-by: Erick Friis --- .../document_loaders/needle.ipynb | 253 ++++++++++++++++++ .../docs/integrations/retrievers/needle.ipynb | 235 ++++++++++++++++ libs/community/extended_testing_deps.txt | 1 + .../document_loaders/__init__.py | 5 + .../document_loaders/needle.py | 164 ++++++++++++ .../retrievers/__init__.py | 3 + .../langchain_community/retrievers/needle.py | 96 +++++++ .../document_loaders/test_imports.py | 1 + .../document_loaders/test_needle.py | 75 ++++++ .../unit_tests/retrievers/test_imports.py | 1 + .../unit_tests/retrievers/test_needle.py | 72 +++++ 11 files changed, 906 insertions(+) create mode 100644 docs/docs/integrations/document_loaders/needle.ipynb create mode 100644 docs/docs/integrations/retrievers/needle.ipynb create mode 100644 libs/community/langchain_community/document_loaders/needle.py create mode 100644 libs/community/langchain_community/retrievers/needle.py create mode 100644 libs/community/tests/unit_tests/document_loaders/test_needle.py create mode 100644 libs/community/tests/unit_tests/retrievers/test_needle.py diff --git a/docs/docs/integrations/document_loaders/needle.ipynb b/docs/docs/integrations/document_loaders/needle.ipynb new file mode 100644 index 00000000000..1ea13845611 --- /dev/null +++ b/docs/docs/integrations/document_loaders/needle.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Needle Document Loader\n", + "[Needle](https://needle-ai.com) makes it easy to create your RAG pipelines with minimal effort. \n", + "\n", + "For more details, refer to our [API documentation](https://docs.needle-ai.com/docs/api-reference/needle-api)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "The Needle Document Loader is a utility for integrating Needle collections with LangChain. It enables seamless storage, retrieval, and utilization of documents for Retrieval-Augmented Generation (RAG) workflows.\n", + "\n", + "This example demonstrates:\n", + "\n", + "* Storing documents into a Needle collection.\n", + "* Setting up a retriever to fetch documents.\n", + "* Building a Retrieval-Augmented Generation (RAG) pipeline." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup\n", + "Before starting, ensure you have the following environment variables set:\n", + "\n", + "* NEEDLE_API_KEY: Your API key for authenticating with Needle.\n", + "* OPENAI_API_KEY: Your OpenAI API key for language model operations." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"NEEDLE_API_KEY\"] = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialization\n", + "To initialize the NeedleLoader, you need the following parameters:\n", + "\n", + "* needle_api_key: Your Needle API key (or set it as an environment variable).\n", + "* collection_id: The ID of the Needle collection to work with." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders.needle import NeedleLoader\n", + "\n", + "collection_id = \"clt_01J87M9T6B71DHZTHNXYZQRG5H\"\n", + "\n", + "# Initialize NeedleLoader to store documents to the collection\n", + "document_loader = NeedleLoader(\n", + " needle_api_key=os.getenv(\"NEEDLE_API_KEY\"),\n", + " collection_id=collection_id,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load\n", + "To add files to the Needle collection:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = {\n", + " \"tech-radar-30.pdf\": \"https://www.thoughtworks.com/content/dam/thoughtworks/documents/radar/2024/04/tr_technology_radar_vol_30_en.pdf\"\n", + "}\n", + "\n", + "document_loader.add_files(files=files)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the documents in the collection\n", + "# collections_documents = document_loader.load()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lazy Load\n", + "The lazy_load method allows you to iteratively load documents from the Needle collection, yielding each document as it is fetched:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the documents in the collection\n", + "# collections_documents = document_loader.lazy_load()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage\n", + "### Use within a chain\n", + "Below is a complete example of setting up a RAG pipeline with Needle within a chain:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input': 'Did RAG move to accepted?',\n", + " 'context': [Document(metadata={}, page_content='New Moved in/out No change\\n\\n© Thoughtworks, Inc. All Rights Reserved. 12\\n\\nTechniques\\n\\n1. Retrieval-augmented generation (RAG)\\nAdopt\\n\\nRetrieval-augmented generation (RAG) is the preferred pattern for our teams to improve the quality of \\nresponses generated by a large language model (LLM). We’ve successfully used it in several projects, \\nincluding the popular Jugalbandi AI Platform. With RAG, information about relevant and trustworthy \\ndocuments — in formats like HTML and PDF — are stored in databases that supports a vector data \\ntype or efficient document search, such as pgvector, Qdrant or Elasticsearch Relevance Engine. For \\na given prompt, the database is queried to retrieve relevant documents, which are then combined \\nwith the prompt to provide richer context to the LLM. This results in higher quality output and greatly \\nreduced hallucinations. The context window — which determines the maximum size of the LLM input \\n— is limited, which means that selecting the most relevant documents is crucial. We improve the \\nrelevancy of the content that is added to the prompt by reranking. Similarly, the documents are usually \\ntoo large to calculate an embedding, which means they must be split into smaller chunks. 
This is often \\na difficult problem, and one approach is to have the chunks overlap to a certain extent.'),\n", + " Document(metadata={}, page_content='New Moved in/out No change\\n\\n© Thoughtworks, Inc. All Rights Reserved. 12\\n\\nTechniques\\n\\n1. Retrieval-augmented generation (RAG)\\nAdopt\\n\\nRetrieval-augmented generation (RAG) is the preferred pattern for our teams to improve the quality of \\nresponses generated by a large language model (LLM). We’ve successfully used it in several projects, \\nincluding the popular Jugalbandi AI Platform. With RAG, information about relevant and trustworthy \\ndocuments — in formats like HTML and PDF — are stored in databases that supports a vector data \\ntype or efficient document search, such as pgvector, Qdrant or Elasticsearch Relevance Engine. For \\na given prompt, the database is queried to retrieve relevant documents, which are then combined \\nwith the prompt to provide richer context to the LLM. This results in higher quality output and greatly \\nreduced hallucinations. The context window — which determines the maximum size of the LLM input \\n— is limited, which means that selecting the most relevant documents is crucial. We improve the \\nrelevancy of the content that is added to the prompt by reranking. Similarly, the documents are usually \\ntoo large to calculate an embedding, which means they must be split into smaller chunks. This is often \\na difficult problem, and one approach is to have the chunks overlap to a certain extent.'),\n", + " Document(metadata={}, page_content='New Moved in/out No change\\n\\n© Thoughtworks, Inc. All Rights Reserved. 12\\n\\nTechniques\\n\\n1. Retrieval-augmented generation (RAG)\\nAdopt\\n\\nRetrieval-augmented generation (RAG) is the preferred pattern for our teams to improve the quality of \\nresponses generated by a large language model (LLM). We’ve successfully used it in several projects, \\nincluding the popular Jugalbandi AI Platform. With RAG, information about relevant and trustworthy \\ndocuments — in formats like HTML and PDF — are stored in databases that supports a vector data \\ntype or efficient document search, such as pgvector, Qdrant or Elasticsearch Relevance Engine. For \\na given prompt, the database is queried to retrieve relevant documents, which are then combined \\nwith the prompt to provide richer context to the LLM. This results in higher quality output and greatly \\nreduced hallucinations. The context window — which determines the maximum size of the LLM input \\n— is limited, which means that selecting the most relevant documents is crucial. We improve the \\nrelevancy of the content that is added to the prompt by reranking. Similarly, the documents are usually \\ntoo large to calculate an embedding, which means they must be split into smaller chunks. This is often \\na difficult problem, and one approach is to have the chunks overlap to a certain extent.'),\n", + " Document(metadata={}, page_content='New Moved in/out No change\\n\\n© Thoughtworks, Inc. All Rights Reserved. 12\\n\\nTechniques\\n\\n1. Retrieval-augmented generation (RAG)\\nAdopt\\n\\nRetrieval-augmented generation (RAG) is the preferred pattern for our teams to improve the quality of \\nresponses generated by a large language model (LLM). We’ve successfully used it in several projects, \\nincluding the popular Jugalbandi AI Platform. 
With RAG, information about relevant and trustworthy \\ndocuments — in formats like HTML and PDF — are stored in databases that supports a vector data \\ntype or efficient document search, such as pgvector, Qdrant or Elasticsearch Relevance Engine. For \\na given prompt, the database is queried to retrieve relevant documents, which are then combined \\nwith the prompt to provide richer context to the LLM. This results in higher quality output and greatly \\nreduced hallucinations. The context window — which determines the maximum size of the LLM input \\n— is limited, which means that selecting the most relevant documents is crucial. We improve the \\nrelevancy of the content that is added to the prompt by reranking. Similarly, the documents are usually \\ntoo large to calculate an embedding, which means they must be split into smaller chunks. This is often \\na difficult problem, and one approach is to have the chunks overlap to a certain extent.'),\n", + " Document(metadata={}, page_content='New Moved in/out No change\\n\\n© Thoughtworks, Inc. All Rights Reserved. 12\\n\\nTechniques\\n\\n1. Retrieval-augmented generation (RAG)\\nAdopt\\n\\nRetrieval-augmented generation (RAG) is the preferred pattern for our teams to improve the quality of \\nresponses generated by a large language model (LLM). We’ve successfully used it in several projects, \\nincluding the popular Jugalbandi AI Platform. With RAG, information about relevant and trustworthy \\ndocuments — in formats like HTML and PDF — are stored in databases that supports a vector data \\ntype or efficient document search, such as pgvector, Qdrant or Elasticsearch Relevance Engine. For \\na given prompt, the database is queried to retrieve relevant documents, which are then combined \\nwith the prompt to provide richer context to the LLM. This results in higher quality output and greatly \\nreduced hallucinations. The context window — which determines the maximum size of the LLM input \\n— is limited, which means that selecting the most relevant documents is crucial. We improve the \\nrelevancy of the content that is added to the prompt by reranking. Similarly, the documents are usually \\ntoo large to calculate an embedding, which means they must be split into smaller chunks. This is often \\na difficult problem, and one approach is to have the chunks overlap to a certain extent.')],\n", + " 'answer': 'Yes, RAG has been adopted as the preferred pattern for improving the quality of responses generated by a large language model.'}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "\n", + "from langchain.chains import create_retrieval_chain\n", + "from langchain.chains.combine_documents import create_stuff_documents_chain\n", + "from langchain_community.retrievers.needle import NeedleRetriever\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(temperature=0)\n", + "\n", + "# Initialize the Needle retriever (make sure your Needle API key is set as an environment variable)\n", + "retriever = NeedleRetriever(\n", + " needle_api_key=os.getenv(\"NEEDLE_API_KEY\"),\n", + " collection_id=\"clt_01J87M9T6B71DHZTHNXYZQRG5H\",\n", + ")\n", + "\n", + "# Define system prompt for the assistant\n", + "system_prompt = \"\"\"\n", + " You are an assistant for question-answering tasks. 
\n", + " Use the following pieces of retrieved context to answer the question.\n", + " If you don't know, say so concisely.\\n\\n{context}\n", + " \"\"\"\n", + "\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [(\"system\", system_prompt), (\"human\", \"{input}\")]\n", + ")\n", + "\n", + "# Define the question-answering chain using a document chain (stuff chain) and the retriever\n", + "question_answer_chain = create_stuff_documents_chain(llm, prompt)\n", + "\n", + "# Create the RAG (Retrieval-Augmented Generation) chain by combining the retriever and the question-answering chain\n", + "rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n", + "\n", + "# Define the input query\n", + "query = {\"input\": \"Did RAG move to accepted?\"}\n", + "\n", + "response = rag_chain.invoke(query)\n", + "\n", + "response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all `Needle` features and configurations head to the API reference: https://docs.needle-ai.com" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/docs/integrations/retrievers/needle.ipynb b/docs/docs/integrations/retrievers/needle.ipynb new file mode 100644 index 00000000000..616b6c0a896 --- /dev/null +++ b/docs/docs/integrations/retrievers/needle.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Needle Retriever\n", + "[Needle](https://needle-ai.com) makes it easy to create your RAG pipelines with minimal effort. \n", + "\n", + "For more details, refer to our [API documentation](https://docs.needle-ai.com/docs/api-reference/needle-api)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "The Needle Retriever fetches relevant documents from Needle collections for use in LangChain. Together with the Needle Document Loader, it enables seamless storage, retrieval, and utilization of documents for Retrieval-Augmented Generation (RAG) workflows.\n", + "\n", + "This example demonstrates:\n", + "\n", + "* Storing documents into a Needle collection.\n", + "* Setting up a retriever to fetch documents.\n", + "* Building a Retrieval-Augmented Generation (RAG) pipeline." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup\n", + "Before starting, ensure you have the following environment variables set:\n", + "\n", + "* NEEDLE_API_KEY: Your API key for authenticating with Needle.\n", + "* OPENAI_API_KEY: Your OpenAI API key for language model operations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialization\n", + "To initialize the NeedleLoader, you need the following parameters:\n", + "\n", + "* needle_api_key: Your Needle API key (or set it as an environment variable).\n", + "* collection_id: The ID of the Needle collection to work with."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"NEEDLE_API_KEY\"] = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders.needle import NeedleLoader\n", + "\n", + "collection_id = \"clt_01J87M9T6B71DHZTHNXYZQRG5H\"\n", + "\n", + "# Initialize NeedleLoader to store documents to the collection\n", + "document_loader = NeedleLoader(\n", + " needle_api_key=os.getenv(\"NEEDLE_API_KEY\"),\n", + " collection_id=collection_id,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load\n", + "To add files to the Needle collection:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = {\n", + " \"tech-radar-30.pdf\": \"https://www.thoughtworks.com/content/dam/thoughtworks/documents/radar/2024/04/tr_technology_radar_vol_30_en.pdf\"\n", + "}\n", + "\n", + "document_loader.add_files(files=files)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the documents in the collection\n", + "# collections_documents = document_loader.load()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage\n", + "### Use within a chain\n", + "Below is a complete example of setting up a RAG pipeline with Needle within a chain:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input': 'Did RAG move to accepted?',\n", + " 'context': [Document(metadata={}, page_content='New Moved in/out No change\\n\\n© Thoughtworks, Inc. All Rights Reserved. 12\\n\\nTechniques\\n\\n1. Retrieval-augmented generation (RAG)\\nAdopt\\n\\nRetrieval-augmented generation (RAG) is the preferred pattern for our teams to improve the quality of \\nresponses generated by a large language model (LLM). We’ve successfully used it in several projects, \\nincluding the popular Jugalbandi AI Platform. With RAG, information about relevant and trustworthy \\ndocuments — in formats like HTML and PDF — are stored in databases that supports a vector data \\ntype or efficient document search, such as pgvector, Qdrant or Elasticsearch Relevance Engine. For \\na given prompt, the database is queried to retrieve relevant documents, which are then combined \\nwith the prompt to provide richer context to the LLM. This results in higher quality output and greatly \\nreduced hallucinations. The context window — which determines the maximum size of the LLM input \\n— is limited, which means that selecting the most relevant documents is crucial. We improve the \\nrelevancy of the content that is added to the prompt by reranking. Similarly, the documents are usually \\ntoo large to calculate an embedding, which means they must be split into smaller chunks. 
This is often \\na difficult problem, and one approach is to have the chunks overlap to a certain extent.'),\n", + " Document(metadata={}, page_content='New Moved in/out No change\\n\\n© Thoughtworks, Inc. All Rights Reserved. 12\\n\\nTechniques\\n\\n1. Retrieval-augmented generation (RAG)\\nAdopt\\n\\nRetrieval-augmented generation (RAG) is the preferred pattern for our teams to improve the quality of \\nresponses generated by a large language model (LLM). We’ve successfully used it in several projects, \\nincluding the popular Jugalbandi AI Platform. With RAG, information about relevant and trustworthy \\ndocuments — in formats like HTML and PDF — are stored in databases that supports a vector data \\ntype or efficient document search, such as pgvector, Qdrant or Elasticsearch Relevance Engine. For \\na given prompt, the database is queried to retrieve relevant documents, which are then combined \\nwith the prompt to provide richer context to the LLM. This results in higher quality output and greatly \\nreduced hallucinations. The context window — which determines the maximum size of the LLM input \\n— is limited, which means that selecting the most relevant documents is crucial. We improve the \\nrelevancy of the content that is added to the prompt by reranking. Similarly, the documents are usually \\ntoo large to calculate an embedding, which means they must be split into smaller chunks. This is often \\na difficult problem, and one approach is to have the chunks overlap to a certain extent.'),\n", + " Document(metadata={}, page_content='New Moved in/out No change\\n\\n© Thoughtworks, Inc. All Rights Reserved. 12\\n\\nTechniques\\n\\n1. Retrieval-augmented generation (RAG)\\nAdopt\\n\\nRetrieval-augmented generation (RAG) is the preferred pattern for our teams to improve the quality of \\nresponses generated by a large language model (LLM). We’ve successfully used it in several projects, \\nincluding the popular Jugalbandi AI Platform. With RAG, information about relevant and trustworthy \\ndocuments — in formats like HTML and PDF — are stored in databases that supports a vector data \\ntype or efficient document search, such as pgvector, Qdrant or Elasticsearch Relevance Engine. For \\na given prompt, the database is queried to retrieve relevant documents, which are then combined \\nwith the prompt to provide richer context to the LLM. This results in higher quality output and greatly \\nreduced hallucinations. The context window — which determines the maximum size of the LLM input \\n— is limited, which means that selecting the most relevant documents is crucial. We improve the \\nrelevancy of the content that is added to the prompt by reranking. Similarly, the documents are usually \\ntoo large to calculate an embedding, which means they must be split into smaller chunks. This is often \\na difficult problem, and one approach is to have the chunks overlap to a certain extent.'),\n", + " Document(metadata={}, page_content='New Moved in/out No change\\n\\n© Thoughtworks, Inc. All Rights Reserved. 12\\n\\nTechniques\\n\\n1. Retrieval-augmented generation (RAG)\\nAdopt\\n\\nRetrieval-augmented generation (RAG) is the preferred pattern for our teams to improve the quality of \\nresponses generated by a large language model (LLM). We’ve successfully used it in several projects, \\nincluding the popular Jugalbandi AI Platform. 
With RAG, information about relevant and trustworthy \\ndocuments — in formats like HTML and PDF — are stored in databases that supports a vector data \\ntype or efficient document search, such as pgvector, Qdrant or Elasticsearch Relevance Engine. For \\na given prompt, the database is queried to retrieve relevant documents, which are then combined \\nwith the prompt to provide richer context to the LLM. This results in higher quality output and greatly \\nreduced hallucinations. The context window — which determines the maximum size of the LLM input \\n— is limited, which means that selecting the most relevant documents is crucial. We improve the \\nrelevancy of the content that is added to the prompt by reranking. Similarly, the documents are usually \\ntoo large to calculate an embedding, which means they must be split into smaller chunks. This is often \\na difficult problem, and one approach is to have the chunks overlap to a certain extent.'),\n", + " Document(metadata={}, page_content='https://www.thoughtworks.com/radar/tools/nemo-guardrails\\nhttps://www.thoughtworks.com/radar/platforms/langfuse\\nhttps://www.thoughtworks.com/radar/techniques/retrieval-augmented-generation-rag\\nhttps://cruisecontrol.sourceforge.net/\\nhttps://martinfowler.com/articles/continuousIntegration.html\\nhttps://www.thoughtworks.com/radar/techniques/peer-review-equals-pull-request\\nhttps://martinfowler.com/bliki/ContinuousIntegrationCertification.html\\nhttps://linearb.io/platform/gitstream\\nhttps://www.thoughtworks.com/radar/tools/github-merge-queue\\nhttps://stacking.dev/\\n\\n© Thoughtworks, Inc. All Rights Reserved. 8\\n\\nHold HoldAssess AssessTrial TrialAdopt Adopt\\n\\n18\\n\\n8\\n\\n24\\n\\n29\\n\\n30\\n31\\n\\n32\\n33\\n\\n34 35\\n\\n36\\n37\\n\\n38 39\\n\\n40\\n41\\n\\n42\\n43\\n\\n26\\n\\n2\\n\\n3\\n\\n4\\n\\n5\\n\\n6 7\\n\\n9\\n\\n15\\n\\n16\\n\\n17\\n\\n10\\n\\n11\\n\\n12\\n\\n13 14\\n\\n44\\n\\n47\\n49\\n\\n50\\n\\n65\\n66\\n\\n67 68\\n69\\n\\n70\\n71\\n\\n72\\n\\n73 74\\n\\n75\\n\\n76 77\\n\\n78\\n79\\n\\n80\\n81\\n\\n82\\n\\n83\\n\\n51\\n\\n52 54\\n\\n59\\n\\n53\\n56\\n\\n58\\n\\n61\\n\\n62\\n63\\n\\n64\\n\\n85\\n\\n88 89\\n\\n90 91\\n\\n92\\n93\\n\\n94\\n95 96\\n\\n97\\n\\n98 99\\n\\n100\\n\\n101\\n102\\n\\n103\\n\\n104\\n\\n86\\n\\n87\\n1921\\n\\n22\\n\\n20\\n28\\n\\n25\\n\\n27\\n\\n23\\n\\n84\\n\\n105\\n\\n1\\n45\\n\\n46\\n\\n48\\n\\n55\\n57')],\n", + " 'answer': 'Yes, RAG has moved to the \"Adopt\" status.'}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "\n", + "from langchain.chains import create_retrieval_chain\n", + "from langchain.chains.combine_documents import create_stuff_documents_chain\n", + "from langchain_community.retrievers.needle import NeedleRetriever\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(temperature=0)\n", + "\n", + "# Initialize the Needle retriever (make sure your Needle API key is set as an environment variable)\n", + "retriever = NeedleRetriever(\n", + " needle_api_key=os.getenv(\"NEEDLE_API_KEY\"),\n", + " collection_id=\"clt_01J87M9T6B71DHZTHNXYZQRG5H\",\n", + ")\n", + "\n", + "# Define system prompt for the assistant\n", + "system_prompt = \"\"\"\n", + " You are an assistant for question-answering tasks. 
\n", + " Use the following pieces of retrieved context to answer the question.\n", + " If you don't know, say so concisely.\\n\\n{context}\n", + " \"\"\"\n", + "\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [(\"system\", system_prompt), (\"human\", \"{input}\")]\n", + ")\n", + "\n", + "# Define the question-answering chain using a document chain (stuff chain) and the retriever\n", + "question_answer_chain = create_stuff_documents_chain(llm, prompt)\n", + "\n", + "# Create the RAG (Retrieval-Augmented Generation) chain by combining the retriever and the question-answering chain\n", + "rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n", + "\n", + "# Define the input query\n", + "query = {\"input\": \"Did RAG move to accepted?\"}\n", + "\n", + "response = rag_chain.invoke(query)\n", + "\n", + "response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all `Needle` features and configurations head to the API reference: https://docs.needle-ai.com" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt index d331fb66e85..fc208a71ef2 100644 --- a/libs/community/extended_testing_deps.txt +++ b/libs/community/extended_testing_deps.txt @@ -46,6 +46,7 @@ motor>=3.3.1,<4 msal>=1.25.0,<2 mwparserfromhell>=0.6.4,<0.7 mwxml>=0.3.3,<0.4 +needle-python>=0.4 networkx>=3.2.1,<4 newspaper3k>=0.2.8,<0.3 numexpr>=2.8.6,<3 diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 2576093d3d4..8a56f918ab6 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -299,6 +299,9 @@ if TYPE_CHECKING: from langchain_community.document_loaders.mongodb import ( MongodbLoader, ) + from langchain_community.document_loaders.needle import ( + NeedleLoader, + ) from langchain_community.document_loaders.news import ( NewsURLLoader, ) @@ -631,6 +634,7 @@ _module_lookup = { "MergedDataLoader": "langchain_community.document_loaders.merge", "ModernTreasuryLoader": "langchain_community.document_loaders.modern_treasury", "MongodbLoader": "langchain_community.document_loaders.mongodb", + "NeedleLoader": "langchain_community.document_loaders.needle", "NewsURLLoader": "langchain_community.document_loaders.news", "NotebookLoader": "langchain_community.document_loaders.notebook", "NotionDBLoader": "langchain_community.document_loaders.notiondb", @@ -837,6 +841,7 @@ __all__ = [ "MergedDataLoader", "ModernTreasuryLoader", "MongodbLoader", + "NeedleLoader", "NewsURLLoader", "NotebookLoader", "NotionDBLoader", diff --git a/libs/community/langchain_community/document_loaders/needle.py b/libs/community/langchain_community/document_loaders/needle.py new file mode 100644 index 00000000000..03b9ee0c0e0 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/needle.py @@ -0,0 +1,164 @@ +from typing import Dict, Iterator, List, Optional + +from 
langchain_core.document_loaders.base import BaseLoader +from langchain_core.documents import Document + + +class NeedleLoader(BaseLoader): + """ + NeedleLoader is a document loader for managing documents stored in a collection. + + Setup: + Install the `needle-python` library and set your Needle API key. + + .. code-block:: bash + + pip install needle-python + export NEEDLE_API_KEY="your-api-key" + + Key init args: + - `needle_api_key` (Optional[str]): API key for authenticating with Needle. + - `collection_id` (str): Needle collection to load documents from. + + Usage: + .. code-block:: python + + from langchain_community.document_loaders.needle import NeedleLoader + + loader = NeedleLoader( + needle_api_key="your-api-key", + collection_id="your-collection-id" + ) + + # Load documents + documents = loader.load() + for doc in documents: + print(doc.metadata) + + # Lazy load documents + for doc in loader.lazy_load(): + print(doc.metadata) + """ + + def __init__( + self, + needle_api_key: Optional[str] = None, + collection_id: Optional[str] = None, + ) -> None: + """ + Initializes the NeedleLoader with API key and collection ID. + + Args: + needle_api_key (Optional[str]): API key for authenticating with Needle. + collection_id (Optional[str]): Identifier for the Needle collection. + + Raises: + ImportError: If the `needle-python` library is not installed. + ValueError: If the collection ID is not provided. + """ + try: + from needle.v1 import NeedleClient + except ImportError: + raise ImportError( + "Please install with `pip install needle-python` to use NeedleLoader." + ) + + super().__init__() + self.needle_api_key = needle_api_key + self.collection_id = collection_id + self.client: Optional[NeedleClient] = None + + if self.needle_api_key: + self.client = NeedleClient(api_key=self.needle_api_key) + + if not self.collection_id: + raise ValueError("Collection ID must be provided.") + + def _get_collection(self) -> None: + """ + Ensures the Needle collection is set and the client is initialized. + + Raises: + ValueError: If the Needle client is not initialized or + if the collection ID is missing. + """ + if self.client is None: + raise ValueError( + "NeedleClient is not initialized. Provide a valid API key." + ) + if not self.collection_id: + raise ValueError("Collection ID must be provided.") + + def add_files(self, files: Dict[str, str]) -> None: + """ + Adds files to the Needle collection. + + Args: + files (Dict[str, str]): Dictionary where keys are file names and values + are file URLs. + + Raises: + ImportError: If the `needle-python` library is not installed. + ValueError: If the collection is not properly initialized. + """ + try: + from needle.v1.models import FileToAdd + except ImportError: + raise ImportError( + "Please install with `pip install needle-python` to add files." + ) + + self._get_collection() + assert self.client is not None, "NeedleClient must be initialized." + + files_to_add = [FileToAdd(name=name, url=url) for name, url in files.items()] + + self.client.collections.files.add( + collection_id=self.collection_id, files=files_to_add + ) + + def _fetch_documents(self) -> List[Document]: + """ + Fetches metadata for documents from the Needle collection. + + Returns: + List[Document]: A list of documents with metadata. Content is excluded. + + Raises: + ValueError: If the collection is not properly initialized. + """ + self._get_collection() + assert self.client is not None, "NeedleClient must be initialized." 
+ + files = self.client.collections.files.list(self.collection_id) + docs = [ + Document( + page_content="", # Needle doesn't provide file content fetching + metadata={ + "source": file.url, + "title": file.name, + "size": getattr(file, "size", None), + }, + ) + for file in files + if file.status == "indexed" + ] + return docs + + def load(self) -> List[Document]: + """ + Loads all documents from the Needle collection. + + Returns: + List[Document]: A list of documents from the collection. + """ + return self._fetch_documents() + + def lazy_load(self) -> Iterator[Document]: + """ + Lazily loads documents from the Needle collection. + + Yields: + Iterator[Document]: An iterator over the documents. + """ + yield from self._fetch_documents() diff --git a/libs/community/langchain_community/retrievers/__init__.py b/libs/community/langchain_community/retrievers/__init__.py index 75600eab6d5..ce4ac731bde 100644 --- a/libs/community/langchain_community/retrievers/__init__.py +++ b/libs/community/langchain_community/retrievers/__init__.py @@ -93,6 +93,7 @@ if TYPE_CHECKING: MilvusRetriever, ) from langchain_community.retrievers.nanopq import NanoPQRetriever + from langchain_community.retrievers.needle import NeedleRetriever from langchain_community.retrievers.outline import ( OutlineRetriever, ) @@ -173,6 +174,7 @@ _module_lookup = { "MetalRetriever": "langchain_community.retrievers.metal", "MilvusRetriever": "langchain_community.retrievers.milvus", "NanoPQRetriever": "langchain_community.retrievers.nanopq", + "NeedleRetriever": "langchain_community.retrievers.needle", "OutlineRetriever": "langchain_community.retrievers.outline", "PineconeHybridSearchRetriever": "langchain_community.retrievers.pinecone_hybrid_search", # noqa: E501 "PubMedRetriever": "langchain_community.retrievers.pubmed", @@ -229,6 +231,7 @@ __all__ = [ "MetalRetriever", "MilvusRetriever", "NanoPQRetriever", + "NeedleRetriever", "NeuralDBRetriever", "OutlineRetriever", "PineconeHybridSearchRetriever", diff --git a/libs/community/langchain_community/retrievers/needle.py b/libs/community/langchain_community/retrievers/needle.py new file mode 100644 index 00000000000..201d617fda4 --- /dev/null +++ b/libs/community/langchain_community/retrievers/needle.py @@ -0,0 +1,96 @@ +from typing import Any, List, Optional # noqa: I001 + +from langchain_core.callbacks import CallbackManagerForRetrieverRun +from langchain_core.documents import Document +from langchain_core.retrievers import BaseRetriever +from pydantic import BaseModel, Field + + +class NeedleRetriever(BaseRetriever, BaseModel): + """ + NeedleRetriever retrieves relevant documents or context from a Needle collection + based on a search query. + + Setup: + Install the `needle-python` library and set your Needle API key. + + .. code-block:: bash + + pip install needle-python + export NEEDLE_API_KEY="your-api-key" + + Key init args: + - `needle_api_key` (Optional[str]): The API key for authenticating with Needle. + - `collection_id` (str): The ID of the Needle collection to search in. + - `client` (Optional[NeedleClient]): An optional instance of the NeedleClient. + + Usage: + .. 
code-block:: python + + from langchain_community.retrievers.needle import NeedleRetriever + + retriever = NeedleRetriever( + needle_api_key="your-api-key", + collection_id="your-collection-id" + ) + + results = retriever.invoke("example query") + for doc in results: + print(doc.page_content) + """ + + client: Optional[Any] = None + """Optional instance of NeedleClient.""" + needle_api_key: Optional[str] = Field(None, description="Needle API Key") + collection_id: Optional[str] = Field( + ..., description="The ID of the Needle collection to search in" + ) + + def _initialize_client(self) -> None: + """ + Initialize the NeedleClient with the provided API key. + + If a client instance is already provided, this method does nothing. + """ + try: + from needle.v1 import NeedleClient + except ImportError: + raise ImportError("Please install with `pip install needle-python`.") + + if not self.client: + self.client = NeedleClient(api_key=self.needle_api_key) + + def _search_collection(self, query: str) -> List[Document]: + """ + Search the Needle collection for relevant documents. + + Args: + query (str): The search query used to find relevant documents. + + Returns: + List[Document]: A list of documents matching the search query. + """ + self._initialize_client() + if self.client is None: + raise ValueError("NeedleClient is not initialized. Provide an API key.") + + results = self.client.collections.search( + collection_id=self.collection_id, text=query + ) + docs = [Document(page_content=result.content) for result in results] + return docs + + def _get_relevant_documents( + self, query: str, *, run_manager: CallbackManagerForRetrieverRun + ) -> List[Document]: + """ + Retrieve relevant documents based on the query. + + Args: + query (str): The query string used to search the collection. + + Returns: + List[Document]: A list of documents relevant to the query. + """ + # The `run_manager` parameter is included to match the superclass signature, + # but it is not used in this implementation. + return self._search_collection(query) diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index b49a1b7cc4a..ddeaf734b0f 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -105,6 +105,7 @@ EXPECTED_ALL = [ "MergedDataLoader", "ModernTreasuryLoader", "MongodbLoader", + "NeedleLoader", "NewsURLLoader", "NotebookLoader", "NotionDBLoader", diff --git a/libs/community/tests/unit_tests/document_loaders/test_needle.py b/libs/community/tests/unit_tests/document_loaders/test_needle.py new file mode 100644 index 00000000000..d8f7a22fb0f --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/test_needle.py @@ -0,0 +1,75 @@ +import pytest +from pytest_mock import MockerFixture + + +@pytest.mark.requires("needle") +def test_add_and_fetch_files(mocker: MockerFixture) -> None: + """ + Test adding and fetching files using the NeedleLoader with a mock.
+ """ + from langchain_community.document_loaders.needle import NeedleLoader # noqa: I001 + from needle.v1.models import CollectionFile # noqa: I001 + + # Create mock instances using mocker + mock_files = mocker.Mock() + mock_files.add.return_value = [ + CollectionFile( + id="mock_id", + name="tech-radar-30.pdf", + url="https://example.com/", + status="indexed", + type="mock_type", + user_id="mock_user_id", + connector_id="mock_connector_id", + size=1234, + md5_hash="mock_md5_hash", + created_at="2024-01-01T00:00:00Z", + updated_at="2024-01-01T00:00:00Z", + ) + ] + mock_files.list.return_value = [ + CollectionFile( + id="mock_id", + name="tech-radar-30.pdf", + url="https://example.com/", + status="indexed", + type="mock_type", + user_id="mock_user_id", + connector_id="mock_connector_id", + size=1234, + md5_hash="mock_md5_hash", + created_at="2024-01-01T00:00:00Z", + updated_at="2024-01-01T00:00:00Z", + ) + ] + + mock_collections = mocker.Mock() + mock_collections.files = mock_files + + mock_needle_client = mocker.Mock() + mock_needle_client.collections = mock_collections + + # Patch the NeedleClient to return the mock client + mocker.patch("needle.v1.NeedleClient", return_value=mock_needle_client) + + # Initialize NeedleLoader with mock API key and collection ID + document_store = NeedleLoader( + needle_api_key="fake_api_key", + collection_id="fake_collection_id", + ) + + # Define files to add + files = { + "tech-radar-30.pdf": "https://www.thoughtworks.com/content/dam/thoughtworks/documents/radar/2024/04/tr_technology_radar_vol_30_en.pdf" + } + + # Add files to the collection using the mock client + document_store.add_files(files=files) + + # Fetch the added files using the mock client + added_files = document_store._fetch_documents() + + # Assertions to verify that the file was added and fetched correctly + assert isinstance(added_files[0].metadata["title"], str) + assert isinstance(added_files[0].metadata["source"], str) diff --git a/libs/community/tests/unit_tests/retrievers/test_imports.py b/libs/community/tests/unit_tests/retrievers/test_imports.py index f5b791139a8..dde08e2f817 100644 --- a/libs/community/tests/unit_tests/retrievers/test_imports.py +++ b/libs/community/tests/unit_tests/retrievers/test_imports.py @@ -26,6 +26,7 @@ EXPECTED_ALL = [ "MetalRetriever", "MilvusRetriever", "NanoPQRetriever", + "NeedleRetriever", "OutlineRetriever", "PineconeHybridSearchRetriever", "PubMedRetriever", diff --git a/libs/community/tests/unit_tests/retrievers/test_needle.py b/libs/community/tests/unit_tests/retrievers/test_needle.py new file mode 100644 index 00000000000..853250d409f --- /dev/null +++ b/libs/community/tests/unit_tests/retrievers/test_needle.py @@ -0,0 +1,72 @@ +from typing import Any + +import pytest +from pytest_mock import MockerFixture + + +# Mock class to simulate search results from Needle API +class MockSearchResult: + def __init__(self, content: str) -> None: + self.content = content + + +# Mock class to simulate NeedleClient and its collections behavior +class MockNeedleClient: + def __init__(self, api_key: str) -> None: + self.api_key = api_key + self.collections = self.MockCollections() + + class MockCollections: + def search(self, collection_id: str, text: str) -> list[MockSearchResult]: + return [ + MockSearchResult(content=f"Result for query: {text}"), + MockSearchResult(content=f"Another result for query: {text}"), + ] + + +@pytest.mark.requires("needle") +def test_needle_retriever_initialization() -> None: + """ + Test that
the NeedleRetriever is initialized correctly. + """ + from langchain_community.retrievers.needle import NeedleRetriever # noqa: I001 + + retriever = NeedleRetriever( + needle_api_key="mock_api_key", + collection_id="mock_collection_id", + ) + + assert retriever.needle_api_key == "mock_api_key" + assert retriever.collection_id == "mock_collection_id" + + +@pytest.mark.requires("needle") +def test_get_relevant_documents(mocker: MockerFixture) -> None: + """ + Test that the retriever correctly fetches documents. + """ + from langchain_community.retrievers.needle import NeedleRetriever # noqa: I001 + + # Patch the actual NeedleClient import path used in the NeedleRetriever + mocker.patch("needle.v1.NeedleClient", new=MockNeedleClient) + + # Initialize the retriever with mocked API key and collection ID + retriever = NeedleRetriever( + needle_api_key="mock_api_key", + collection_id="mock_collection_id", + ) + + mock_run_manager: Any = None + + # Perform the search + query = "What is RAG?" + retrieved_documents = retriever._get_relevant_documents( + query, run_manager=mock_run_manager + ) + + # Validate the results + assert len(retrieved_documents) == 2 + assert retrieved_documents[0].page_content == "Result for query: What is RAG?" + assert ( + retrieved_documents[1].page_content == "Another result for query: What is RAG?" + )
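For completeness, the test above calls the private `_get_relevant_documents` directly. A sketch (not part of the patch) of exercising the same behavior through the public Runnable API, assuming the `MockNeedleClient` patch from `test_get_relevant_documents` is active:

```python
# Sketch under the same mock setup as test_get_relevant_documents:
# mocker.patch("needle.v1.NeedleClient", new=MockNeedleClient)
retriever = NeedleRetriever(
    needle_api_key="mock_api_key",
    collection_id="mock_collection_id",
)

# BaseRetriever is a Runnable, so invoke() dispatches to _get_relevant_documents
documents = retriever.invoke("What is RAG?")
assert documents[0].page_content == "Result for query: What is RAG?"
```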