mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-03 03:59:42 +00:00
infra: add CI job for running tutorial notebooks (#26944)
This commit is contained in:
4
docs/scripts/download_tiktoken.py
Normal file
4
docs/scripts/download_tiktoken.py
Normal file
@@ -0,0 +1,4 @@
|
||||
import tiktoken
|
||||
|
||||
# This will trigger the download and caching of the necessary files
|
||||
# The returned encoding object is not used anywhere else in this script; the
# lookup is performed only for the download-and-cache side effect noted above.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
31
docs/scripts/execute_notebooks.sh
Executable file
31
docs/scripts/execute_notebooks.sh
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/bin/bash

# Read the list of notebooks to skip from the JSON file (a JSON array of
# notebook paths). The entries are joined with newlines so that they can be
# used further down as one fixed-string grep pattern per line.
#
# Abort if the file cannot be read or parsed. Otherwise SKIP_NOTEBOOKS would be
# empty, the `grep -vFf` filter below would receive an empty pattern (which
# matches every line) and discard all notebooks, and the script would exit
# successfully without having executed anything.
if ! SKIP_NOTEBOOKS=$(python -c "import json; print('\n'.join(json.load(open('docs/notebooks_no_execution.json'))))"); then
    echo "Failed to read the skip list from docs/notebooks_no_execution.json" >&2
    exit 1
fi
|
||||
|
||||
# Execute a single notebook with `jupyter nbconvert --to notebook --execute`,
# run through poetry.
#
# Arguments:
#   $1 - path of the notebook to execute
#
# Prints a start message and a finish message with the wall-clock duration in
# seconds. If nbconvert fails, prints the duration and the captured output
# (stdout and stderr of the command) and terminates the whole script with
# exit status 1.
execute_notebook() {
    # Keep the working variables local so they do not leak into (or clobber
    # variables of) the calling scope.
    local file="$1"
    local start_time end_time execution_time output

    echo "Starting execution of $file"
    start_time=$(date +%s)
    # "$file" is quoted so that a path containing spaces or glob characters is
    # passed to nbconvert as exactly one argument.
    if ! output=$(time poetry run jupyter nbconvert --to notebook --execute "$file" 2>&1); then
        end_time=$(date +%s)
        execution_time=$((end_time - start_time))
        echo "Error in $file. Execution time: $execution_time seconds"
        echo "Error details: $output"
        # Fail fast: `exit` (not `return`) stops the script at the first
        # failing notebook.
        exit 1
    fi
    end_time=$(date +%s)
    execution_time=$((end_time - start_time))
    echo "Finished $file. Execution time: $execution_time seconds"
}
|
||||
|
||||
# Make the function available to child bash processes.
# NOTE(review): the sequential loop below calls it in the current shell, so the
# export looks unnecessary here - confirm before removing.
export -f execute_notebook

# Find all notebooks, dropping Jupyter checkpoint copies.
notebooks=$(find docs/docs/tutorials -name "*.ipynb" | grep -v ".ipynb_checkpoints")

# Filter out the notebooks in the skip list. The filter is bypassed when the
# list is empty: an empty pattern given to `grep -F` matches every line, so
# `grep -v` would otherwise discard all notebooks.
if [ -n "$SKIP_NOTEBOOKS" ]; then
    notebooks=$(grep -vFf <(echo "$SKIP_NOTEBOOKS") <<< "$notebooks")
fi

# Execute notebooks sequentially. The list is read line by line on file
# descriptor 3 so that paths containing spaces stay intact and the executed
# commands keep the script's own stdin.
while IFS= read -r -u 3 file; do
    # The here-string yields one empty line when no notebook was found.
    [ -n "$file" ] || continue
    execute_notebook "$file"
done 3<<< "$notebooks"
|
181
docs/scripts/prepare_notebooks_for_ci.py
Normal file
181
docs/scripts/prepare_notebooks_for_ci.py
Normal file
@@ -0,0 +1,181 @@
|
||||
"""Preprocess notebooks for CI. Currently adds VCR cassettes and optionally comments out pip install cells."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
||||
import click
|
||||
import nbformat
|
||||
|
||||
logger = logging.getLogger(__name__)

# Directories walked for notebooks. The paths are relative, so they resolve
# against the current working directory (presumably the repository root).
NOTEBOOK_DIRS = ("docs/docs/tutorials",)

# Absolute path of the directory one level above the one holding this file.
DOCS_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
# Cassette files are addressed under this directory.
CASSETTES_PATH = os.path.join(DOCS_PATH, "cassettes")

# Notebooks that are processed without injecting VCR cassettes.
NOTEBOOKS_NO_CASSETTES = [
    "docs/docs/tutorials/retrievers.ipynb",  # TODO: fix non-determinism
]

# Notebooks that get a warning cell and a "no_execution" tag, and whose paths
# are written to notebooks_no_execution.json.
NOTEBOOKS_NO_EXECUTION = [
    "docs/docs/tutorials/graph.ipynb",  # Requires local graph db running
    "docs/docs/tutorials/local_rag.ipynb",  # Local LLMs
    "docs/docs/tutorials/query_analysis.ipynb",  # Requires youtube_transcript_api
    "docs/docs/tutorials/sql_qa.ipynb",  # Requires Chinook db locally
    "docs/docs/tutorials/summarization.ipynb",  # TODO: source of non-determinism somewhere, fix or add to no cassettes
]
|
||||
|
||||
|
||||
def comment_install_cells(notebook: nbformat.NotebookNode) -> nbformat.NotebookNode:
    """Comment out every code cell whose source mentions ``pip install``.

    In such a cell each non-blank line is prefixed with ``# `` (the whole
    cell, not just the install line); blank lines are kept unchanged. Cells
    of other types and code cells without ``pip install`` are left alone.

    Args:
        notebook: The notebook to modify; its cells are updated in place.

    Returns:
        The same notebook object that was passed in.
    """
    for cell in notebook.cells:
        if cell.cell_type == "code" and "pip install" in cell.source:
            commented_lines = []
            for line in cell.source.splitlines():
                # Whitespace-only lines stay as they are.
                commented_lines.append(f"# {line}" if line.strip() else line)
            cell.source = "\n".join(commented_lines)

    return notebook
|
||||
|
||||
|
||||
def is_magic_command(code: str) -> bool:
    """Return True if ``code``, ignoring surrounding whitespace, starts with ``%`` or ``!``."""
    stripped = code.strip()
    return stripped.startswith(("%", "!"))
|
||||
|
||||
|
||||
def is_comment(code: str) -> bool:
    """Return True if the first non-whitespace character of ``code`` is ``#``."""
    return code.lstrip()[:1] == "#"
|
||||
|
||||
|
||||
def add_vcr_to_notebook(
    notebook: nbformat.NotebookNode, cassette_prefix: str
) -> nbformat.NotebookNode:
    """Inject `with vcr.cassette` into each code cell of the notebook.

    Every code cell that contains real code is rewritten so that its original
    lines run, indented, inside a ``with custom_vcr.use_cassette(...)`` block.
    The cassette file name is ``{cassette_prefix}_{cell id}.msgpack.zlib``,
    falling back to the cell's index when the cell has no ``id``. A setup cell
    defining ``custom_vcr`` and its serializer is inserted as the first cell.

    Cells are left untouched when they are not code cells, when they contain
    only blank lines and/or comments, or when all of their code lines are
    magic commands (as decided by ``is_magic_command``).

    Args:
        notebook: The notebook to modify; its cells are updated in place.
        cassette_prefix: Path prefix used to build each cell's cassette name.

    Returns:
        The same notebook object that was passed in.

    Raises:
        ValueError: If a cell mixes magic commands with regular code, since
            such a cell cannot be wrapped in a ``with`` block.
    """

    # Inject VCR context manager into each code cell
    for idx, cell in enumerate(notebook.cells):
        if cell.cell_type != "code":
            continue

        lines = cell.source.splitlines()
        # Blank lines and comments carry no code, so they must not count as
        # "non-magic code" when deciding whether a cell mixes magics and code.
        code_lines = [
            line for line in lines if line.strip() and not is_comment(line)
        ]

        # skip if empty cell or just comments
        if not code_lines:
            continue

        are_magic_lines = [is_magic_command(line) for line in code_lines]

        # skip if all magic
        if all(are_magic_lines):
            continue

        if any(are_magic_lines):
            raise ValueError(
                "Cannot process code cells with mixed magic and non-magic code."
            )

        cell_id = cell.get("id", idx)
        cassette_name = f"{cassette_prefix}_{cell_id}.msgpack.zlib"
        # `!r` emits a correctly quoted and escaped Python string literal, so
        # prefixes containing backslashes or quote characters still produce
        # valid generated code.
        cell.source = (
            f"with custom_vcr.use_cassette({cassette_name!r}, filter_headers=['x-api-key', 'authorization'], record_mode='once', serializer='advanced_compressed'):\n"
            + "\n".join(f"    {line}" for line in lines)
        )

    # Add import statement
    # Source of the setup cell: it applies nest_asyncio, creates `custom_vcr`
    # and registers the 'advanced_compressed' serializer (msgpack -> zlib ->
    # base64) that the injected `use_cassette` calls refer to.
    vcr_import_lines = [
        "import nest_asyncio",
        "nest_asyncio.apply()",
        "import vcr",
        "import msgpack",
        "import base64",
        "import zlib",
        "custom_vcr = vcr.VCR()",
        "",
        "def compress_data(data, compression_level=9):",
        "    packed = msgpack.packb(data, use_bin_type=True)",
        "    compressed = zlib.compress(packed, level=compression_level)",
        "    return base64.b64encode(compressed).decode('utf-8')",
        "",
        "def decompress_data(compressed_string):",
        "    decoded = base64.b64decode(compressed_string)",
        "    decompressed = zlib.decompress(decoded)",
        "    return msgpack.unpackb(decompressed, raw=False)",
        "",
        "class AdvancedCompressedSerializer:",
        "    def serialize(self, cassette_dict):",
        "        return compress_data(cassette_dict)",
        "",
        "    def deserialize(self, cassette_string):",
        "        return decompress_data(cassette_string)",
        "",
        "custom_vcr.register_serializer('advanced_compressed', AdvancedCompressedSerializer())",
        "custom_vcr.serializer = 'advanced_compressed'",
    ]
    import_cell = nbformat.v4.new_code_cell(source="\n".join(vcr_import_lines))
    # Drop the id of the freshly created cell, if it has one.
    # NOTE(review): presumably so the written notebook does not depend on a
    # generated id - confirm the intent.
    import_cell.pop("id", None)
    notebook.cells.insert(0, import_cell)
    return notebook
|
||||
|
||||
|
||||
def process_notebooks(should_comment_install_cells: bool) -> None:
    """Rewrite every notebook under ``NOTEBOOK_DIRS`` in place for CI.

    For each ``.ipynb`` file outside ``ipynb_checkpoints`` directories:

    * optionally comment out cells containing ``pip install``;
    * inject VCR cassettes, unless the path is in ``NOTEBOOKS_NO_CASSETTES``;
    * for paths in ``NOTEBOOKS_NO_EXECUTION``, prepend a markdown warning cell
      and tag the cell that follows it with ``no_execution`` when that cell
      is a code cell.

    A failure while processing one notebook is logged and does not stop the
    remaining notebooks; that notebook is not written back. Finally,
    ``NOTEBOOKS_NO_EXECUTION`` is dumped to ``notebooks_no_execution.json``
    in ``DOCS_PATH``.

    Args:
        should_comment_install_cells: Whether to comment out install cells.
    """
    for directory in NOTEBOOK_DIRS:
        for root, _, files in os.walk(directory):
            for file in files:
                if not file.endswith(".ipynb") or "ipynb_checkpoints" in root:
                    continue

                # NOTE(review): this path is compared verbatim with the entries
                # of NOTEBOOKS_NO_CASSETTES / NOTEBOOKS_NO_EXECUTION below, so
                # the lookups only match when the walked path is spelled
                # exactly like those entries - confirm the expected cwd.
                notebook_path = os.path.join(root, file)
                try:
                    notebook = nbformat.read(notebook_path, as_version=4)

                    if should_comment_install_cells:
                        notebook = comment_install_cells(notebook)

                    base_filename = os.path.splitext(os.path.basename(file))[0]
                    cassette_prefix = os.path.join(CASSETTES_PATH, base_filename)
                    if notebook_path not in NOTEBOOKS_NO_CASSETTES:
                        notebook = add_vcr_to_notebook(
                            notebook, cassette_prefix=cassette_prefix
                        )

                    if notebook_path in NOTEBOOKS_NO_EXECUTION:
                        # Add a cell at the beginning to indicate that this notebook should not be executed
                        warning_cell = nbformat.v4.new_markdown_cell(
                            source="**Warning:** This notebook is not meant to be executed automatically."
                        )
                        notebook.cells.insert(0, warning_cell)

                        # Add a special tag to the cell following the warning.
                        # The list always holds the warning cell at this point,
                        # so check the length explicitly: a notebook that had
                        # no cells has nothing at index 1.
                        if (
                            len(notebook.cells) > 1
                            and notebook.cells[1].cell_type == "code"
                        ):
                            first_cell = notebook.cells[1]
                            first_cell.metadata["tags"] = first_cell.metadata.get(
                                "tags", []
                            ) + ["no_execution"]

                    nbformat.write(notebook, notebook_path)
                    logger.info(f"Processed: {notebook_path}")
                except Exception as e:
                    # Best effort: report the failure and continue with the
                    # remaining notebooks.
                    logger.error(f"Error processing {notebook_path}: {e}")

    with open(os.path.join(DOCS_PATH, "notebooks_no_execution.json"), "w") as f:
        json.dump(NOTEBOOKS_NO_EXECUTION, f)
|
||||
|
||||
|
||||
# Command-line entry point: preprocess the notebooks, optionally commenting out
# install cells when --comment-install-cells is passed.
# Documented with comments rather than a docstring because click would surface
# a docstring as the command's --help text.
@click.command()
@click.option(
    "--comment-install-cells",
    is_flag=True,
    default=False,
    help="Whether to comment out install cells",
)
def main(comment_install_cells):
    process_notebooks(should_comment_install_cells=comment_install_cells)
    # process_notebooks logs per-notebook failures and carries on, so reaching
    # this line only means the run completed, not that every notebook
    # succeeded.
    logger.info("Finished processing notebooks.")


if __name__ == "__main__":
    # Without any logging configuration the effective level is WARNING and the
    # INFO progress messages of this script would be dropped. basicConfig does
    # nothing if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)
    main()
|
Reference in New Issue
Block a user