From a349fce8802661ceaabb65d65b48e97b9c325066 Mon Sep 17 00:00:00 2001 From: Zheng Robert Jia Date: Thu, 20 Jun 2024 14:36:49 -0500 Subject: [PATCH] docs[minor],community[patch]: Minor tutorial docs improvement, minor import error quick fix. (#22725) minor changes to module import error handling and minor issues in tutorial documents. --------- Co-authored-by: Bagatur Co-authored-by: Eugene Yurtsev Co-authored-by: Eugene Yurtsev --- docs/docs/how_to/document_loader_pdf.ipynb | 11 +++++++++++ .../document_loaders/microsoft_powerpoint.ipynb | 13 +++++++++++++ docs/docs/tutorials/qa_chat_history.ipynb | 10 ++-------- docs/scripts/generate_api_reference_links.py | 3 ++- .../document_loaders/parsers/pdf.py | 16 ++++++++++++++-- 5 files changed, 42 insertions(+), 11 deletions(-) diff --git a/docs/docs/how_to/document_loader_pdf.ipynb b/docs/docs/how_to/document_loader_pdf.ipynb index a27f9383cf8..4a5275e9812 100644 --- a/docs/docs/how_to/document_loader_pdf.ipynb +++ b/docs/docs/how_to/document_loader_pdf.ipynb @@ -69,6 +69,17 @@ "Once we have loaded PDFs into LangChain `Document` objects, we can index them (e.g., a RAG application) in the usual way:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3b932bb", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install faiss-cpu \n", + "# use `pip install faiss-gpu` for CUDA GPU support" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/docs/docs/integrations/document_loaders/microsoft_powerpoint.ipynb b/docs/docs/integrations/document_loaders/microsoft_powerpoint.ipynb index 670d5c70ee7..7d463b1e259 100644 --- a/docs/docs/integrations/document_loaders/microsoft_powerpoint.ipynb +++ b/docs/docs/integrations/document_loaders/microsoft_powerpoint.ipynb @@ -12,6 +12,19 @@ "This covers how to load `Microsoft PowerPoint` documents into a document format that we can use downstream." 
 ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "aef1500f", + "metadata": {}, + "outputs": [], + "source": [ + "# Install packages\n", + "%pip install unstructured\n", + "%pip install python-magic\n", + "%pip install python-pptx" + ] + }, { "cell_type": "code", "execution_count": 1, diff --git a/docs/docs/tutorials/qa_chat_history.ipynb b/docs/docs/tutorials/qa_chat_history.ipynb index cb032f8d59e..aa1cae52651 100644 --- a/docs/docs/tutorials/qa_chat_history.ipynb +++ b/docs/docs/tutorials/qa_chat_history.ipynb @@ -322,7 +322,7 @@ "\n", "Now we can build our full QA chain. This is as simple as updating the retriever to be our new `history_aware_retriever`.\n", "\n", - "Again, we will use [create_stuff_documents_chain](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.create_stuff_documents_chain.html) to generate a `question_answer_chain`, with input keys `context`, `chat_history`, and `input`-- it accepts the retrieved context alongside the conversation history and query to generate an answer.\n", + "Again, we will use [create_stuff_documents_chain](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.create_stuff_documents_chain.html) to generate a `question_answer_chain`, with input keys `context`, `chat_history`, and `input`-- it accepts the retrieved context alongside the conversation history and query to generate an answer. A more detailed explanation is over [here](/docs/tutorials/rag/#built-in-chains).\n", "\n", "We build our final `rag_chain` with [create_retrieval_chain](https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval.create_retrieval_chain.html). This chain applies the `history_aware_retriever` and `question_answer_chain` in sequence, retaining intermediate outputs such as the retrieved context for convenience. 
It has input keys `input` and `chat_history`, and includes `input`, `chat_history`, `context`, and `answer` in its output." ] @@ -760,13 +760,6 @@ "id": "931c4fe3-c603-4efb-9b37-5f7cbbb1cbbd", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Error in LangChainTracer.on_tool_end callback: TracerException(\"Found chain run at ID 0ec120e2-b1fc-4593-9fee-2dd4f4cae256, but expected {'tool'} run.\")\n" - ] - }, { "data": { "text/plain": [ @@ -1030,6 +1023,7 @@ "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n", "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "from langgraph.checkpoint.sqlite import SqliteSaver\n", + "from langgraph.prebuilt import create_react_agent\n", "\n", "memory = SqliteSaver.from_conn_string(\":memory:\")\n", "llm = ChatOpenAI(model=\"gpt-3.5-turbo\", temperature=0)\n", diff --git a/docs/scripts/generate_api_reference_links.py b/docs/scripts/generate_api_reference_links.py index f05cc9abc93..ff3eb53a2c9 100644 --- a/docs/scripts/generate_api_reference_links.py +++ b/docs/scripts/generate_api_reference_links.py @@ -24,7 +24,7 @@ _IMPORT_RE = re.compile( _CURRENT_PATH = Path(__file__).parent.absolute() # Directory where generated markdown files are stored -_DOCS_DIR = _CURRENT_PATH / "docs" +_DOCS_DIR = _CURRENT_PATH.parent.parent / "docs" def find_files(path): @@ -75,6 +75,7 @@ def main(): for file in find_files(args.docs_dir): file_imports = replace_imports(file) + print(file) if file_imports: # Use relative file path as key diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 32b0a0d1335..76cf101791d 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -89,7 +89,13 @@ class PyPDFParser(BaseBlobParser): def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: 
ignore[valid-type] """Lazily parse the blob.""" - import pypdf + try: + import pypdf + except ImportError: + raise ImportError( + "`pypdf` package not found, please install it with " + "`pip install pypdf`" + ) with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined] pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password) @@ -144,7 +150,13 @@ class PDFMinerParser(BaseBlobParser): """Lazily parse the blob.""" if not self.extract_images: - from pdfminer.high_level import extract_text + try: + from pdfminer.high_level import extract_text + except ImportError: + raise ImportError( + "`pdfminer` package not found, please install it with " + "`pip install pdfminer.six`" + ) with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined] if self.concatenate_pages: