docs: run how-to guides in CI (#27615)
Add how-to guides to the [Run notebooks job](https://github.com/langchain-ai/langchain/actions/workflows/run_notebooks.yml) and fix existing notebooks.

- As with tutorials, cassettes must be updated whenever the HTTP calls in a guide change, by running the existing [script](https://github.com/langchain-ai/langchain/blob/master/docs/scripts/update_cassettes.sh).
- Cassettes now total ~62 MB across 474 files.
- `docs/scripts/prepare_notebooks_for_ci.py` lists a number of notebooks that are not executed (e.g., because they require extra infrastructure, are slow, or call `input()`).
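For context, a cassette is a recorded HTTP exchange that later runs replay from disk, so executing a guide does not depend on live API calls. A minimal sketch of the record/replay pattern, assuming `vcrpy`; the URL and cassette path are illustrative:

```python
import urllib.request

import vcr

# First execution records the HTTP exchange into the YAML cassette;
# subsequent executions replay it from disk with no live network call.
with vcr.use_cassette("docs/cassettes/example.yaml"):
    body = urllib.request.urlopen("https://example.com").read()
```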
**docs/scripts/cache_data.py** (new file, +14 lines)
```diff
@@ -0,0 +1,14 @@
+import tiktoken
+from unstructured.nlp.tokenize import download_nltk_packages
+
+
+def download_tiktoken_data():
+    # This will trigger the download and caching of the necessary files
+    _ = tiktoken.encoding_for_model("gpt2")
+    _ = tiktoken.encoding_for_model("gpt-3.5-turbo")
+    _ = tiktoken.encoding_for_model("gpt-4o-mini")
+
+
+if __name__ == "__main__":
+    download_tiktoken_data()
+    download_nltk_packages()
```
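Pre-warming these caches matters because tiktoken fetches its BPE files over HTTP on first use, and that traffic should neither leak into cassettes nor flake in CI. A quick sanity check, assuming tiktoken's `TIKTOKEN_CACHE_DIR` override; the directory path is illustrative:

```python
import os

# Point tiktoken at a known cache directory before any encoding is loaded.
# TIKTOKEN_CACHE_DIR is tiktoken's cache-location override; path is illustrative.
os.environ["TIKTOKEN_CACHE_DIR"] = "/tmp/tiktoken_cache"

import tiktoken

# The first call downloads and caches the BPE files; later calls are served
# from the cache, so notebook runs never re-fetch them over the network.
enc = tiktoken.encoding_for_model("gpt-4o-mini")
assert enc.decode(enc.encode("cassette")) == "cassette"
```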
**docs/scripts/download_tiktoken.py** (deleted; its role is taken over by `cache_data.py` above)

```diff
@@ -1,4 +0,0 @@
-import tiktoken
-
-# This will trigger the download and caching of the necessary files
-encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
```
**docs/scripts/execute_notebooks.sh**

```diff
@@ -9,9 +9,11 @@ WORKING_DIRECTORY=$1
 # Function to execute a single notebook
 execute_notebook() {
     file="$1"
-    echo "Starting execution of $file"
+    index="$2"
+    total="$3"
+    echo "Starting execution of $file ($index/$total)"
     start_time=$(date +%s)
-    if ! output=$(time poetry run jupyter nbconvert --to notebook --execute $file 2>&1); then
+    if ! output=$(time poetry run jupyter nbconvert --to notebook --execute --ExecutePreprocessor.kernel_name=python3 $file 2>&1); then
         end_time=$(date +%s)
         execution_time=$((end_time - start_time))
         echo "Error in $file. Execution time: $execution_time seconds"
@@ -27,12 +29,18 @@ export -f execute_notebook
 
 # Determine the list of notebooks to execute
 if [ "$WORKING_DIRECTORY" == "all" ]; then
-    notebooks=$(find docs/docs/tutorials -name "*.ipynb" | grep -v ".ipynb_checkpoints" | grep -vFf <(echo "$SKIP_NOTEBOOKS"))
+    notebooks=$(find docs/docs/tutorials docs/docs/how_to -name "*.ipynb" | grep -v ".ipynb_checkpoints" | grep -vFf <(echo "$SKIP_NOTEBOOKS"))
 else
     notebooks=$(find "$WORKING_DIRECTORY" -name "*.ipynb" | grep -v ".ipynb_checkpoints" | grep -vFf <(echo "$SKIP_NOTEBOOKS"))
 fi
 
-# Execute notebooks sequentially
-for file in $notebooks; do
-    execute_notebook "$file"
+# Convert the list of notebooks to an array
+notebooks_array=($notebooks)
+total_notebooks=${#notebooks_array[@]}
+
+# Execute notebooks sequentially with progress indication
+for i in "${!notebooks_array[@]}"; do
+    file="${notebooks_array[$i]}"
+    index=$((i + 1))
+    execute_notebook "$file" "$index" "$total_notebooks"
 done
```
**docs/scripts/prepare_notebooks_for_ci.py**

```diff
@@ -8,20 +8,39 @@ import click
 import nbformat
 
 logger = logging.getLogger(__name__)
-NOTEBOOK_DIRS = ("docs/docs/tutorials",)
+NOTEBOOK_DIRS = ("docs/docs/how_to", "docs/docs/tutorials")
 DOCS_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 CASSETTES_PATH = os.path.join(DOCS_PATH, "cassettes")
 
-# TODO: populate if needed
 NOTEBOOKS_NO_CASSETTES = [
     "docs/docs/tutorials/retrievers.ipynb",  # TODO: fix non-determinism
+    "docs/docs/how_to/multi_vector.ipynb",  # Non-determinism due to batch
 ]
 
 NOTEBOOKS_NO_EXECUTION = [
+    "docs/docs/how_to/add_scores_retriever.ipynb",  # Requires Pinecone instance
+    "docs/docs/how_to/chat_model_rate_limiting.ipynb",  # Slow (demonstrates rate limiting)
+    "docs/docs/how_to/document_loader_directory.ipynb",  # Deliberately raises error
+    "docs/docs/how_to/document_loader_pdf.ipynb",  # Local parsing section is slow
+    "docs/docs/how_to/example_selectors_langsmith.ipynb",  # TODO: add langchain-benchmarks; fix cassette issue
+    "docs/docs/how_to/extraction_long_text.ipynb",  # Non-determinism due to batch
+    "docs/docs/how_to/graph_constructing.ipynb",  # Requires local neo4j
+    "docs/docs/how_to/graph_mapping.ipynb",  # Requires local neo4j
+    "docs/docs/how_to/graph_prompting.ipynb",  # Requires local neo4j
+    "docs/docs/how_to/graph_semantic.ipynb",  # Requires local neo4j
+    "docs/docs/how_to/hybrid.ipynb",  # Requires AstraDB instance
+    "docs/docs/how_to/indexing.ipynb",  # Requires local Elasticsearch
+    "docs/docs/how_to/local_llms.ipynb",  # Local LLMs
+    "docs/docs/how_to/migrate_agent.ipynb",  # TODO: resolve issue with asyncio / exception handling
+    "docs/docs/how_to/qa_per_user.ipynb",  # Requires Pinecone instance
+    "docs/docs/how_to/query_high_cardinality.ipynb",  # Heavy
+    "docs/docs/how_to/split_by_token.ipynb",  # TODO: requires Korean document, also heavy deps
+    "docs/docs/how_to/tools_error.ipynb",  # Deliberately raises error
+    "docs/docs/how_to/tools_human.ipynb",  # Requires human input()
+    "docs/docs/how_to/tools_prompting.ipynb",  # Local LLMs
     "docs/docs/tutorials/graph.ipynb",  # Requires local graph db running
     "docs/docs/tutorials/local_rag.ipynb",  # Local LLMs
     "docs/docs/tutorials/query_analysis.ipynb",  # Requires youtube_transcript_api
     "docs/docs/tutorials/sql_qa.ipynb",  # Requires Chinook db locally
+    "docs/docs/tutorials/summarization.ipynb",  # TODO: source of non-determinism somewhere, fix or add to no cassettes
 ]
```
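For readers unfamiliar with the prep step: per the usage notes in `update_cassettes.sh` below, preparing a notebook means modifying it so that each cell generates (or replays) its own cassette. A simplified sketch of that kind of transformation, not the script's actual logic; the function name and cassette naming scheme are illustrative:

```python
import nbformat


def wrap_cells_in_cassettes(path: str, cassette_prefix: str) -> None:
    """Wrap every code cell in a VCR cassette context (illustrative helper)."""
    nb = nbformat.read(path, as_version=4)
    for i, cell in enumerate(nb.cells):
        if cell.cell_type != "code" or not cell.source.strip():
            continue
        # Indent the original cell body under a `with` block so any HTTP
        # calls it makes are captured in (or replayed from) its own cassette.
        indented = "\n".join(f"    {line}" for line in cell.source.splitlines())
        cell.source = (
            "import vcr\n"
            f'with vcr.use_cassette("{cassette_prefix}_cell_{i}.yaml"):\n'
            f"{indented}"
        )
    nbformat.write(nb, path)
```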
**docs/scripts/update_cassettes.sh**

```diff
@@ -1,5 +1,24 @@
 #!/bin/bash
 
+# Use this script to update cassettes for a notebook. The script does the following:
+#
+# 1. Delete existing cassettes for the specified notebook
+# 2. Pre-download and cache nltk and tiktoken files
+# 3. Modify the notebook to generate cassettes for each cell.
+# 4. Execute the notebook.
+#
+# Important: make sure the notebook is in a clean state, with any desired changes
+# staged or committed. The script will modify the notebook in place, and these
+# modifications should be discarded after the cassettes are generated.
+#
+# Usage:
+# In monorepo env, `poetry install --with dev,test`
+# `./docs/scripts/update_cassettes.sh path/to/notebook`
+# e.g., `./docs/scripts/update_cassettes.sh docs/docs/how_to/tool_choice.ipynb`
+#
+# Make sure to set any env vars required by the notebook.
+
+
 # Get the working directory from the input argument, default to 'all' if not provided
 WORKING_DIRECTORY=${1:-all}
@@ -21,8 +40,8 @@ delete_cassettes() {
 delete_cassettes "$WORKING_DIRECTORY"
 
 # Pre-download tiktoken files
-echo "Pre-downloading tiktoken files..."
-poetry run python docs/scripts/download_tiktoken.py
+echo "Pre-downloading nltk and tiktoken files..."
+poetry run python docs/scripts/cache_data.py
 
 # Prepare notebooks
 echo "Preparing notebooks for CI..."
```
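On the CI side the flow is the reverse of updating: cassettes should only ever be read. A sketch of strict replay, assuming vcrpy's `record_mode="none"`; paths are illustrative:

```python
import urllib.request

import vcr

# record_mode="none" forbids new recordings: a request with no match in the
# cassette raises an error instead of silently hitting the live API, which
# is the failure mode you want in CI.
strict_vcr = vcr.VCR(record_mode="none")
with strict_vcr.use_cassette("docs/cassettes/example.yaml"):
    body = urllib.request.urlopen("https://example.com").read()
```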