docs: run how-to guides in CI (#27615)

Add how-to guides to [Run notebooks
job](https://github.com/langchain-ai/langchain/actions/workflows/run_notebooks.yml)
and fix existing notebooks.

- As with tutorials, cassettes must be updated when HTTP calls in guides
change (by running existing
[script](https://github.com/langchain-ai/langchain/blob/master/docs/scripts/update_cassettes.sh)).
- Cassettes now total ~62 MB across 474 files.
- `docs/scripts/prepare_notebooks_for_ci.py` lists a number of notebooks
that do not run (e.g., due to requiring additional infra, slowness,
requiring `input()`, etc.).
This commit is contained in:
ccurme
2024-10-30 12:35:38 -04:00
committed by GitHub
parent 88bfd60b03
commit 595dc592c9
420 changed files with 2333 additions and 321 deletions

View File

@@ -0,0 +1,14 @@
import tiktoken
from unstructured.nlp.tokenize import download_nltk_packages


def download_tiktoken_data():
    # Requesting an encoding makes tiktoken fetch and cache its data files,
    # so later notebook runs in CI do not need network access for them.
    for model in ("gpt2", "gpt-3.5-turbo", "gpt-4o-mini"):
        _ = tiktoken.encoding_for_model(model)


if __name__ == "__main__":
    download_tiktoken_data()
    download_nltk_packages()

View File

@@ -1,4 +0,0 @@
# CI helper (removed in this commit in favor of cache_data.py): pre-download
# tiktoken's cached data so notebook execution does not hit the network.
import tiktoken
# This will trigger the download and caching of the necessary files
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

View File

@@ -9,9 +9,11 @@ WORKING_DIRECTORY=$1
# Function to execute a single notebook
# NOTE(review): this excerpt is a unified-diff hunk with the +/- markers
# stripped, so the pre-change and post-change versions of some lines both
# appear below (the two "Starting execution" echoes and the two nbconvert
# invocations). Only the second of each pair survives after this commit.
execute_notebook() {
file="$1"
# Pre-change progress line: no position indicator.
echo "Starting execution of $file"
# New in this commit: this notebook's position within the full run.
index="$2"
total="$3"
# Post-change progress line: includes "(index/total)".
echo "Starting execution of $file ($index/$total)"
start_time=$(date +%s)
# Pre-change invocation: uses whatever kernel the notebook declares.
if ! output=$(time poetry run jupyter nbconvert --to notebook --execute $file 2>&1); then
# Post-change invocation: pins the kernel to python3 for reproducible CI runs.
# NOTE(review): $file is unquoted in both variants -- breaks on paths with
# spaces; harmless here only if all notebook paths are space-free.
if ! output=$(time poetry run jupyter nbconvert --to notebook --execute --ExecutePreprocessor.kernel_name=python3 $file 2>&1); then
end_time=$(date +%s)
execution_time=$((end_time - start_time))
echo "Error in $file. Execution time: $execution_time seconds"
@@ -27,12 +29,18 @@ export -f execute_notebook
# Determine the list of notebooks to execute
# NOTE(review): diff hunk with markers stripped -- old and new versions of the
# "all" branch and of the execution loop both appear below.
if [ "$WORKING_DIRECTORY" == "all" ]; then
# Pre-change: tutorials only.
notebooks=$(find docs/docs/tutorials -name "*.ipynb" | grep -v ".ipynb_checkpoints" | grep -vFf <(echo "$SKIP_NOTEBOOKS"))
# Post-change: how-to guides are now executed alongside tutorials.
notebooks=$(find docs/docs/tutorials docs/docs/how_to -name "*.ipynb" | grep -v ".ipynb_checkpoints" | grep -vFf <(echo "$SKIP_NOTEBOOKS"))
else
notebooks=$(find "$WORKING_DIRECTORY" -name "*.ipynb" | grep -v ".ipynb_checkpoints" | grep -vFf <(echo "$SKIP_NOTEBOOKS"))
fi
# Execute notebooks sequentially
# Pre-change loop: no progress indication, single argument.
for file in $notebooks; do
execute_notebook "$file"
# Convert the list of notebooks to an array
# NOTE(review): unquoted expansion word-splits $notebooks on whitespace; fine
# for space-free paths, but `mapfile -t` would be safer in general.
notebooks_array=($notebooks)
total_notebooks=${#notebooks_array[@]}
# Execute notebooks sequentially with progress indication
for i in "${!notebooks_array[@]}"; do
file="${notebooks_array[$i]}"
# Arrays are 0-indexed; report 1-based positions to the log.
index=$((i + 1))
execute_notebook "$file" "$index" "$total_notebooks"
done

View File

@@ -8,20 +8,39 @@ import click
import nbformat
logger = logging.getLogger(__name__)
NOTEBOOK_DIRS = ("docs/docs/tutorials",)
NOTEBOOK_DIRS = ("docs/docs/how_to", "docs/docs/tutorials")
DOCS_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CASSETTES_PATH = os.path.join(DOCS_PATH, "cassettes")
# TODO: populate if needed
NOTEBOOKS_NO_CASSETTES = [
"docs/docs/tutorials/retrievers.ipynb", # TODO: fix non-determinism
"docs/docs/how_to/multi_vector.ipynb", # Non-determinism due to batch
]
NOTEBOOKS_NO_EXECUTION = [
"docs/docs/how_to/add_scores_retriever.ipynb", # Requires Pinecone instance
"docs/docs/how_to/chat_model_rate_limiting.ipynb", # Slow (demonstrates rate limiting)
"docs/docs/how_to/document_loader_directory.ipynb", # Deliberately raises error
"docs/docs/how_to/document_loader_pdf.ipynb", # Local parsing section is slow
"docs/docs/how_to/example_selectors_langsmith.ipynb", # TODO: add langchain-benchmarks; fix cassette issue
"docs/docs/how_to/extraction_long_text.ipynb", # Non-determinism due to batch
"docs/docs/how_to/graph_constructing.ipynb", # Requires local neo4j
"docs/docs/how_to/graph_mapping.ipynb", # Requires local neo4j
"docs/docs/how_to/graph_prompting.ipynb", # Requires local neo4j
"docs/docs/how_to/graph_semantic.ipynb", # Requires local neo4j
"docs/docs/how_to/hybrid.ipynb", # Requires AstraDB instance
"docs/docs/how_to/indexing.ipynb", # Requires local Elasticsearch
"docs/docs/how_to/local_llms.ipynb", # Local LLMs
"docs/docs/how_to/migrate_agent.ipynb", # TODO: resolve issue with asyncio / exception handling
"docs/docs/how_to/qa_per_user.ipynb", # Requires Pinecone instance
"docs/docs/how_to/query_high_cardinality.ipynb", # Heavy
"docs/docs/how_to/split_by_token.ipynb", # TODO: requires Korean document, also heavy deps
"docs/docs/how_to/tools_error.ipynb", # Deliberately raises error
"docs/docs/how_to/tools_human.ipynb", # Requires human input()
"docs/docs/how_to/tools_prompting.ipynb", # Local LLMs
"docs/docs/tutorials/graph.ipynb", # Requires local graph db running
"docs/docs/tutorials/local_rag.ipynb", # Local LLMs
"docs/docs/tutorials/query_analysis.ipynb", # Requires youtube_transcript_api
"docs/docs/tutorials/sql_qa.ipynb", # Requires Chinook db locally
"docs/docs/tutorials/summarization.ipynb", # TODO: source of non-determinism somewhere, fix or add to no cassettes
]

View File

@@ -1,5 +1,24 @@
#!/bin/bash
# Use this script to update cassettes for a notebook. The script does the following:
#
# 1. Delete existing cassettes for the specified notebook
# 2. Pre-download and cache nltk and tiktoken files
# 3. Modify the notebook to generate cassettes for each cell.
# 4. Execute the notebook.
#
# Important: make sure the notebook is in a clean state, with any desired changes
# staged or committed. The script will modify the notebook in place, and these
# modifications should be discarded after the cassettes are generated.
#
# Usage:
# In monorepo env, `poetry install --with dev,test`
# `./docs/scripts/update_cassettes.sh path/to/notebook`
# e.g., `./docs/scripts/update_cassettes.sh docs/docs/how_to/tool_choice.ipynb`
#
# Make sure to set any env vars required by the notebook.
# Get the working directory from the input argument, default to 'all' if not provided
WORKING_DIRECTORY=${1:-all}
# (diff hunk header; the body of delete_cassettes() sits above this excerpt)
@@ -21,8 +40,8 @@ delete_cassettes() {
# Remove any stale cassettes for the selected notebook(s) before re-recording.
delete_cassettes "$WORKING_DIRECTORY"
# Pre-download tiktoken files
# NOTE(review): diff hunk with markers stripped -- the next two lines are the
# pre-change version (tiktoken only) and the pair after them is the
# post-change version (nltk + tiktoken via the consolidated cache_data.py).
echo "Pre-downloading tiktoken files..."
poetry run python docs/scripts/download_tiktoken.py
echo "Pre-downloading nltk and tiktoken files..."
poetry run python docs/scripts/cache_data.py
# Prepare notebooks
echo "Preparing notebooks for CI..."