docs: run how-to guides in CI (#27615)

Add how-to guides to [Run notebooks
job](https://github.com/langchain-ai/langchain/actions/workflows/run_notebooks.yml)
and fix existing notebooks.

- As with tutorials, cassettes must be updated when HTTP calls in guides
change (by running existing
[script](https://github.com/langchain-ai/langchain/blob/master/docs/scripts/update_cassettes.sh)).
- Cassettes now total ~62 MB across 474 files.
- `docs/scripts/prepare_notebooks_for_ci.py` lists a number of notebooks
that do not run (e.g., due to requiring additional infra, slowness,
requiring `input()`, etc.).
This commit is contained in:
ccurme
2024-10-30 12:35:38 -04:00
committed by GitHub
parent 88bfd60b03
commit 595dc592c9
420 changed files with 2333 additions and 321 deletions

View File

@@ -0,0 +1,14 @@
import tiktoken
from unstructured.nlp.tokenize import download_nltk_packages


def download_tiktoken_data():
    # Requesting an encoding makes tiktoken fetch and cache its data files,
    # so later notebook runs in CI do not need network access for them.
    for model in ("gpt2", "gpt-3.5-turbo", "gpt-4o-mini"):
        _ = tiktoken.encoding_for_model(model)


if __name__ == "__main__":
    download_tiktoken_data()
    download_nltk_packages()

View File

@@ -1,4 +0,0 @@
# CI helper (removed in this commit in favor of cache_data.py): pre-download
# tiktoken's cached data so notebook execution does not hit the network.
import tiktoken
# This will trigger the download and caching of the necessary files
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

View File

@@ -9,9 +9,11 @@ WORKING_DIRECTORY=$1
# Function to execute a single notebook
# NOTE(review): this excerpt is a unified-diff hunk with the +/- markers
# stripped, so the pre-change and post-change versions of some lines both
# appear below (the two "Starting execution" echoes and the two nbconvert
# invocations). Only the second of each pair survives after this commit.
execute_notebook() {
file="$1"
# Pre-change progress line: no position indicator.
echo "Starting execution of $file"
# New in this commit: this notebook's position within the full run.
index="$2"
total="$3"
# Post-change progress line: includes "(index/total)".
echo "Starting execution of $file ($index/$total)"
start_time=$(date +%s)
# Pre-change invocation: uses whatever kernel the notebook declares.
if ! output=$(time poetry run jupyter nbconvert --to notebook --execute $file 2>&1); then
# Post-change invocation: pins the kernel to python3 for reproducible CI runs.
# NOTE(review): $file is unquoted in both variants -- breaks on paths with
# spaces; harmless here only if all notebook paths are space-free.
if ! output=$(time poetry run jupyter nbconvert --to notebook --execute --ExecutePreprocessor.kernel_name=python3 $file 2>&1); then
end_time=$(date +%s)
execution_time=$((end_time - start_time))
echo "Error in $file. Execution time: $execution_time seconds"
@@ -27,12 +29,18 @@ export -f execute_notebook
# Determine the list of notebooks to execute
# NOTE(review): diff hunk with markers stripped -- old and new versions of the
# "all" branch and of the execution loop both appear below.
if [ "$WORKING_DIRECTORY" == "all" ]; then
# Pre-change: tutorials only.
notebooks=$(find docs/docs/tutorials -name "*.ipynb" | grep -v ".ipynb_checkpoints" | grep -vFf <(echo "$SKIP_NOTEBOOKS"))
# Post-change: how-to guides are now executed alongside tutorials.
notebooks=$(find docs/docs/tutorials docs/docs/how_to -name "*.ipynb" | grep -v ".ipynb_checkpoints" | grep -vFf <(echo "$SKIP_NOTEBOOKS"))
else
notebooks=$(find "$WORKING_DIRECTORY" -name "*.ipynb" | grep -v ".ipynb_checkpoints" | grep -vFf <(echo "$SKIP_NOTEBOOKS"))
fi
# Execute notebooks sequentially
# Pre-change loop: no progress indication, single argument.
for file in $notebooks; do
execute_notebook "$file"
# Convert the list of notebooks to an array
# NOTE(review): unquoted expansion word-splits $notebooks on whitespace; fine
# for space-free paths, but `mapfile -t` would be safer in general.
notebooks_array=($notebooks)
total_notebooks=${#notebooks_array[@]}
# Execute notebooks sequentially with progress indication
for i in "${!notebooks_array[@]}"; do
file="${notebooks_array[$i]}"
# Arrays are 0-indexed; report 1-based positions to the log.
index=$((i + 1))
execute_notebook "$file" "$index" "$total_notebooks"
done

View File

@@ -8,20 +8,39 @@ import click
import nbformat
logger = logging.getLogger(__name__)
NOTEBOOK_DIRS = ("docs/docs/tutorials",)
NOTEBOOK_DIRS = ("docs/docs/how_to", "docs/docs/tutorials")
DOCS_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CASSETTES_PATH = os.path.join(DOCS_PATH, "cassettes")
# TODO: populate if needed
NOTEBOOKS_NO_CASSETTES = [
"docs/docs/tutorials/retrievers.ipynb", # TODO: fix non-determinism
"docs/docs/how_to/multi_vector.ipynb", # Non-determinism due to batch
]
NOTEBOOKS_NO_EXECUTION = [
"docs/docs/how_to/add_scores_retriever.ipynb", # Requires Pinecone instance
"docs/docs/how_to/chat_model_rate_limiting.ipynb", # Slow (demonstrates rate limiting)
"docs/docs/how_to/document_loader_directory.ipynb", # Deliberately raises error
"docs/docs/how_to/document_loader_pdf.ipynb", # Local parsing section is slow
"docs/docs/how_to/example_selectors_langsmith.ipynb", # TODO: add langchain-benchmarks; fix cassette issue
"docs/docs/how_to/extraction_long_text.ipynb", # Non-determinism due to batch
"docs/docs/how_to/graph_constructing.ipynb", # Requires local neo4j
"docs/docs/how_to/graph_mapping.ipynb", # Requires local neo4j
"docs/docs/how_to/graph_prompting.ipynb", # Requires local neo4j
"docs/docs/how_to/graph_semantic.ipynb", # Requires local neo4j
"docs/docs/how_to/hybrid.ipynb", # Requires AstraDB instance
"docs/docs/how_to/indexing.ipynb", # Requires local Elasticsearch
"docs/docs/how_to/local_llms.ipynb", # Local LLMs
"docs/docs/how_to/migrate_agent.ipynb", # TODO: resolve issue with asyncio / exception handling
"docs/docs/how_to/qa_per_user.ipynb", # Requires Pinecone instance
"docs/docs/how_to/query_high_cardinality.ipynb", # Heavy
"docs/docs/how_to/split_by_token.ipynb", # TODO: requires Korean document, also heavy deps
"docs/docs/how_to/tools_error.ipynb", # Deliberately raises error
"docs/docs/how_to/tools_human.ipynb", # Requires human input()
"docs/docs/how_to/tools_prompting.ipynb", # Local LLMs
"docs/docs/tutorials/graph.ipynb", # Requires local graph db running
"docs/docs/tutorials/local_rag.ipynb", # Local LLMs
"docs/docs/tutorials/query_analysis.ipynb", # Requires youtube_transcript_api
"docs/docs/tutorials/sql_qa.ipynb", # Requires Chinook db locally
"docs/docs/tutorials/summarization.ipynb", # TODO: source of non-determinism somewhere, fix or add to no cassettes
]

View File

@@ -1,5 +1,24 @@
#!/bin/bash
# Use this script to update cassettes for a notebook. The script does the following:
#
# 1. Delete existing cassettes for the specified notebook
# 2. Pre-download and cache nltk and tiktoken files
# 3. Modify the notebook to generate cassettes for each cell.
# 4. Execute the notebook.
#
# Important: make sure the notebook is in a clean state, with any desired changes
# staged or committed. The script will modify the notebook in place, and these
# modifications should be discarded after the cassettes are generated.
#
# Usage:
# In monorepo env, `poetry install --with dev,test`
# `./docs/scripts/update_cassettes.sh path/to/notebook`
# e.g., `./docs/scripts/update_cassettes.sh docs/docs/how_to/tool_choice.ipynb`
#
# Make sure to set any env vars required by the notebook.
# Get the working directory from the input argument, default to 'all' if not provided
WORKING_DIRECTORY=${1:-all}
# (diff hunk header; the body of delete_cassettes() sits above this excerpt)
@@ -21,8 +40,8 @@ delete_cassettes() {
# Remove any stale cassettes for the selected notebook(s) before re-recording.
delete_cassettes "$WORKING_DIRECTORY"
# Pre-download tiktoken files
# NOTE(review): diff hunk with markers stripped -- the next two lines are the
# pre-change version (tiktoken only) and the pair after them is the
# post-change version (nltk + tiktoken via the consolidated cache_data.py).
echo "Pre-downloading tiktoken files..."
poetry run python docs/scripts/download_tiktoken.py
echo "Pre-downloading nltk and tiktoken files..."
poetry run python docs/scripts/cache_data.py
# Prepare notebooks
echo "Preparing notebooks for CI..."