diff --git a/docs/docs/integrations/toolkits/github.ipynb b/docs/docs/integrations/toolkits/github.ipynb index a14a5542cf2..783b83c4840 100644 --- a/docs/docs/integrations/toolkits/github.ipynb +++ b/docs/docs/integrations/toolkits/github.ipynb @@ -76,18 +76,17 @@ "* Pull requests (read and write)\n", "\n", "\n", - "\n", - "Once the app has been registered, add it to the repository you wish the bot to act upon.\n", + "Once the app has been registered, you must give your app permission to access each of the repositories you wish it to act upon. Use the App settings on [github.com here](https://github.com/settings/installations).\n", "\n", "### 3. Set Environmental Variables\n", "\n", "Before initializing your agent, the following environmental variables need to be set:\n", "\n", "* **GITHUB_APP_ID**- A six digit number found in your app's general settings\n", - "* **GITHUB_APP_PRIVATE_KEY**- The location of your app's private key .pem file\n", - "* **GITHUB_REPOSITORY**- The name of the Github repository you want your bot to act upon. Must follow the format {username}/{repo-name}. Make sure the app has been added to this repository first!\n", - "* **GITHUB_BRANCH**- The branch where the bot will make its commits. Defaults to 'master.'\n", - "* **GITHUB_BASE_BRANCH**- The base branch of your repo, usually either 'main' or 'master.' This is where pull requests will base from. Defaults to 'master.'\n" + "* **GITHUB_APP_PRIVATE_KEY**- The location of your app's private key .pem file, or the full text of that file as a string.\n", + "* **GITHUB_REPOSITORY**- The name of the Github repository you want your bot to act upon. Must follow the format {username}/{repo-name}. *Make sure the app has been added to this repository first!*\n", + "* Optional: **GITHUB_BRANCH**- The branch where the bot will make its commits. Defaults to `repo.default_branch`.\n", + "* Optional: **GITHUB_BASE_BRANCH**- The base branch of your repo upon which PRs will be based. 
Defaults to `repo.default_branch`.\n" ] }, { @@ -99,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -107,13 +106,13 @@ "\n", "from langchain.agents import AgentType, initialize_agent\n", "from langchain.agents.agent_toolkits.github.toolkit import GitHubToolkit\n", - "from langchain.llms import OpenAI\n", + "from langchain.chat_models import ChatOpenAI\n", "from langchain.utilities.github import GitHubAPIWrapper" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -130,16 +129,54 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available tools:\n", + "\tGet Issues\n", + "\tGet Issue\n", + "\tComment on Issue\n", + "\tList open pull requests (PRs)\n", + "\tGet Pull Request\n", + "\tOverview of files included in PR\n", + "\tCreate Pull Request\n", + "\tList Pull Requests' Files\n", + "\tCreate File\n", + "\tRead File\n", + "\tUpdate File\n", + "\tDelete File\n", + "\tOverview of existing files in Main branch\n", + "\tOverview of files in current working branch\n", + "\tList branches in this repository\n", + "\tSet active branch\n", + "\tCreate a new branch\n", + "\tGet files from a directory\n", + "\tSearch issues and pull requests\n", + "\tSearch code\n", + "\tCreate review request\n" + ] + } + ], "source": [ - "llm = OpenAI(temperature=0)\n", + "llm = ChatOpenAI(temperature=0, model=\"gpt-4-1106-preview\")\n", "github = GitHubAPIWrapper()\n", "toolkit = GitHubToolkit.from_github_api_wrapper(github)\n", + "tools = toolkit.get_tools()\n", + "\n", + "# STRUCTURED_CHAT includes args_schema for each tool, which helps avoid tool args parsing errors.\n", "agent = initialize_agent(\n", - " toolkit.get_tools(), llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True\n", - ")" + " 
tools,\n", + " llm,\n", + " agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,\n", + " verbose=True,\n", + ")\n", + "print(\"Available tools:\")\n", + "for tool in tools:\n", + " print(\"\\t\" + tool.name)" ] }, { @@ -219,7 +256,428 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Example: Advanced Agent\n", + "## Example: Read an issue, open a pull request\n", + "\n", + "Workflow: \n", + "1. Read issues, either a specific one or just ask it to look at recent ones. \n", + "2. Write code, commit it to a new branch.\n", + "3. Open a PR\n", + "4. \"Request review\" on the PR from the original author of the issue.\n", + "\n", + "\n", + "### Input data and LangSmith Trace\n", + "* LangSmith trace for this run: https://smith.langchain.com/public/fee6643c-b214-42d0-967b-d24dcdd690fe/r\n", + "* Input issue: https://github.com/KastanDay/ML4Bio/issues/33\n", + "* Final PR created by bot: https://github.com/KastanDay/ML4Bio/pull/40" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please implement these changes by creating or editing the necessary files. \n", + "\n", + "1. First use read_file to read any files in the repo that seem relevant. \n", + "2. Then, when you're ready, start implementing changes by creating and updating files. Implement any and all remaining code to make the project work as the commenter intended. \n", + "2. The last step is to create a PR with a clear and concise title and description, list any concerns or final changes necessary in the PR body.\n", + "3. After opening the PR, comment on the original issue and mention the new PR your just opened, you must comment \"I opened a PR for you to review here #\" (it'll be something like #30). That hashtag syntax will automatically link to the PR, as necessary. Thanks.\n", + "4. 
If you feel the PR is satisfactory for completing your assignment, create a review request for the original user that opened the issue. Use their username to tag them.\n", + "\n", + "Feel free to ask for help or leave a comment on the Issue or PR if you're stuck.\n", + "\n", + "Here's your latest assignment: {issue_description}\n" + ] + } + ], + "source": [ + "from langchain import hub\n", + "\n", + "gh_issue_prompt_template = hub.pull(\"kastanday/new-github-issue\")\n", + "print(gh_issue_prompt_template.template)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please implement these changes by creating or editing the necessary files. \n", + "\n", + "1. First use read_file to read any files in the repo that seem relevant. \n", + "2. Then, when you're ready, start implementing changes by creating and updating files. Implement any and all remaining code to make the project work as the commenter intended. \n", + "2. The last step is to create a PR with a clear and concise title and description, list any concerns or final changes necessary in the PR body.\n", + "3. After opening the PR, comment on the original issue and mention the new PR your just opened, you must comment \"I opened a PR for you to review here #\" (it'll be something like #30). That hashtag syntax will automatically link to the PR, as necessary. Thanks.\n", + "4. If you feel the PR is satisfactory for completing your assignment, create a review request for the original user that opened the issue. Use their username to tag them.\n", + "\n", + "Feel free to ask for help or leave a comment on the Issue or PR if you're stuck.\n", + "\n", + "Here's your latest assignment: Title: Create a full command line executable workflow for RNA-Seq on PBMC Samples. 
Open a new pull request (on a separate branch) and comment the PR number here when you're done..\n", + "Opened by user: KastanDay\n", + "Body: Experiment Type:\n", + "RNA-Seq\n", + "Sequencing of total cellular RNA\n", + "\n", + "Workflow Management:\n", + "Bash/SLURM\n", + "Scripting and job scheduling\n", + "\n", + "Software Stack:\n", + "FastQC\n", + "MultiQC\n", + "STAR\n", + "RSEM\n", + "samtools\n", + "DESeq2\n", + "\n", + "What else to know about the pipeline?\n", + "I am working PBMC samples collected from patients that are undergoing immunotherapy.\n", + "\n", + "Use the data files existing in [Report_WholeBrain](https://github.com/KastanDay/ML4Bio/tree/main/Report_WholeBrain) as input for this workflow.\n", + "\n", + "You should write a series of bash scripts and R scripts that can accomplish this task. Open a PR with those scripts when you're done.\n" + ] + } + ], + "source": [ + "def format_issue(issue):\n", + " title = f\"Title: {issue.get('title')}.\"\n", + " opened_by = f\"Opened by user: {issue.get('opened_by')}\"\n", + " body = f\"Body: {issue.get('body')}\"\n", + " comments = issue.get(\"comments\") # often too long\n", + " return \"\\n\".join([title, opened_by, body])\n", + "\n", + "\n", + "issue = github.get_issue(33) # task to implement a RNA-seq pipeline (bioinformatics)\n", + "final_gh_issue_prompt = gh_issue_prompt_template.format(\n", + " issue_description=format_issue(issue)\n", + ")\n", + "print(final_gh_issue_prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.memory.summary_buffer import ConversationSummaryBufferMemory\n", + "from langchain_core.prompts.chat import MessagesPlaceholder\n", + "\n", + "summarizer_llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo\") # type: ignore\n", + "chat_history = MessagesPlaceholder(variable_name=\"chat_history\")\n", + "memory = ConversationSummaryBufferMemory(\n", + " memory_key=\"chat_history\",\n", + " 
return_messages=True,\n", + " llm=summarizer_llm,\n", + " max_token_limit=2_000,\n", + ")\n", + "\n", + "agent = initialize_agent(\n", + " tools,\n", + " llm,\n", + " agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,\n", + " verbose=True,\n", + " handle_parsing_errors=True, # or pass a function that accepts the error and returns a string\n", + " max_iterations=30,\n", + " max_execution_time=None,\n", + " early_stopping_method=\"generate\",\n", + " memory=memory,\n", + " # trim_intermediate_steps=fancier_trim_intermediate_steps,\n", + " agent_kwargs={\n", + " \"memory_prompts\": [chat_history],\n", + " \"input_variables\": [\"input\", \"agent_scratchpad\", \"chat_history\"],\n", + " \"prefix\": final_gh_issue_prompt,\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m```json\n", + "{\n", + " \"action\": \"Get files from a directory\",\n", + " \"action_input\": \"ML4Bio/tree/main/Report_WholeBrain\"\n", + "}\n", + "```\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mError: status code 404, None\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe previous action to get files from a directory failed because the path provided does not exist or is not accessible. I need to correct the path to access the files in the `Report_WholeBrain` directory. Let's try to fetch the list of files from the correct directory path. 
\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Get files from a directory\",\n", + " \"action_input\": \"Report_WholeBrain\"\n", + "}\n", + "```\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3m['Report_WholeBrain/MDSclustering_WholeBrain.html', 'Report_WholeBrain/MDSclustering_WholeBrain_RUVremoved.html', 'Report_WholeBrain/Report_Antonson_WholeBrain_2022Mar.Rmd', 'Report_WholeBrain/Report_WholeBrain_files/figure-html/Figure 1-1.png', 'Report_WholeBrain/Report_WholeBrain_files/figure-html/Figure 2-1.png', 'Report_WholeBrain/Report_WholeBrain_files/figure-html/Figure 3-1.png', 'Report_WholeBrain/Report_WholeBrain_files/figure-html/Figure 4-1.png', 'Report_WholeBrain/Report_WholeBrain_files/figure-html/Figure 6-1.png', 'Report_WholeBrain/Report_WholeBrain_files/figure-html/Figure 7-1.png', 'Report_WholeBrain/Report_WholeBrain_files/figure-html/Figure 8-1.png', 'Report_WholeBrain/Report_WholeBrain_files/figure-html/Figure 9-1.png', 'Report_WholeBrain/SalmonSummarizedOutput.RData', 'Report_WholeBrain/SampleInfo_RUVvariables_WholeBrain_2022-05-12.csv', 'Report_WholeBrain/Targets_Final.txt', 'Report_WholeBrain/WholeBrain_GeneResults_2022-05-12.xlsx', 'Report_WholeBrain/WholeBrain_GeneResults_RUV_2022-05-12.xlsx', 'Report_WholeBrain/WholeBrain_Gene_level_counts_2022-05-12.xlsx', 'Report_WholeBrain/WholeBrain_RUV_FDR0.1.html', 'Report_WholeBrain/WholeBrain_logCPMValues_RUVcorrected_2022-05-12.xlsx', 'Report_WholeBrain/WholeBrain_logCPMvalues_2022-05-12.xlsx', 'Report_WholeBrain/WholeBrain_rawP05.html', 'Report_WholeBrain/getGO.R', 'Report_WholeBrain/getPath.R', 'Report_WholeBrain/interactive_plots/css/glimma.min.css', 'Report_WholeBrain/interactive_plots/css/src/images/animated-overlay.gif', 'Report_WholeBrain/interactive_plots/css/src/images/favicon.ico', 'Report_WholeBrain/interactive_plots/css/src/images/sort_asc.png', 'Report_WholeBrain/interactive_plots/css/src/images/sort_asc_disabled.png', 
'Report_WholeBrain/interactive_plots/css/src/images/sort_both.png', 'Report_WholeBrain/interactive_plots/css/src/images/sort_desc.png', 'Report_WholeBrain/interactive_plots/css/src/images/sort_desc_disabled.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-bg_flat_0_aaaaaa_40x100.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-bg_flat_75_ffffff_40x100.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-bg_glass_55_fbf9ee_1x400.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-bg_glass_65_ffffff_1x400.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-bg_glass_75_dadada_1x400.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-bg_glass_75_e6e6e6_1x400.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-bg_glass_95_fef1ec_1x400.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-bg_highlight-soft_75_cccccc_1x100.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-icons_222222_256x240.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-icons_2e83ff_256x240.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-icons_454545_256x240.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-icons_888888_256x240.png', 'Report_WholeBrain/interactive_plots/css/src/images/ui-icons_cd0a0a_256x240.png', 'Report_WholeBrain/interactive_plots/js/glimma.min.js', 'Report_WholeBrain/interactive_plots/js/old_MDSclustering_Microglia.js', 'Report_WholeBrain/interactive_plots/js/old_MDSclustering_Microglia_RUV.js', 'Report_WholeBrain/interactive_plots/js/old_MDSclustering_WholeBrain.js', 'Report_WholeBrain/interactive_plots/js/old_MDSclustering_WholeBrain_RUV.js', 'Report_WholeBrain/interactive_plots/js/old_MDSclustering_WholeBrain_noOUT.js', 'Report_WholeBrain/interactive_plots/js/old_Microglia_rawP05.js', 'Report_WholeBrain/interactive_plots/js/old_WholeBrain_RUV_FDR0.1.js', 'Report_WholeBrain/interactive_plots/js/old_WholeBrain_rawP05.js', 
'Report_WholeBrain/interactive_plots/old_MDSclustering_Microglia.html', 'Report_WholeBrain/interactive_plots/old_MDSclustering_Microglia_RUV.html', 'Report_WholeBrain/interactive_plots/old_MDSclustering_WholeBrain.html', 'Report_WholeBrain/interactive_plots/old_MDSclustering_WholeBrain_RUV.html', 'Report_WholeBrain/interactive_plots/old_MDSclustering_WholeBrain_noOUT.html', 'Report_WholeBrain/interactive_plots/old_Microglia_rawP05.html', 'Report_WholeBrain/interactive_plots/old_WholeBrain_RUV_FDR0.1.html', 'Report_WholeBrain/interactive_plots/old_WholeBrain_rawP05.html', 'Report_WholeBrain/trx_EGids_combined.txt', 'Report_WholeBrain/⭐️____Report_Antonson_WholeBrain_2022Mar.html']\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe files in the `Report_WholeBrain` directory seem to be mostly reports, images, and some R scripts. However, none of these files appear to be raw RNA-Seq data files or scripts directly related to the RNA-Seq workflow requested by the user. To proceed with creating the workflow, I will need to create new bash and R scripts that will handle the RNA-Seq analysis using the software stack mentioned (FastQC, MultiQC, STAR, RSEM, samtools, DESeq2).\n", + "\n", + "Before creating the scripts, I should create a new branch to work on this feature.\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Create a new branch\",\n", + " \"action_input\": \"rna-seq-workflow\"\n", + "}\n", + "```\u001b[0m\n", + "Observation: \u001b[33;1m\u001b[1;3mBranch 'rna-seq-workflow_v1' created successfully, and set as current active branch.\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mNow that the new branch `rna-seq-workflow_v1` is created and active, I can start creating the necessary bash and R scripts for the RNA-Seq workflow. 
I will begin by creating a bash script for quality control using FastQC and MultiQC.\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Create File\",\n", + " \"action_input\": \"rna-seq-workflow/01_quality_control.sh\\n#!/bin/bash\\n#PBS -N quality_control\\n#PBS -l walltime=04:00:00\\n#PBS -l nodes=1:ppn=8\\n#PBS -q batch\\n\\n# Load modules for FastQC and MultiQC\\nmodule load FastQC/0.11.8\\nmodule load MultiQC/1.9\\n\\n# Define input and output directories\\nINPUT_DIR='/path/to/raw_data'\\nOUTPUT_DIR='/path/to/output/quality_control'\\n\\n# Create output directory if it doesn't exist\\nmkdir -p $OUTPUT_DIR\\n\\n# Run FastQC on all files in the input directory\\nfastqc -o $OUTPUT_DIR -t 8 $INPUT_DIR/*.fastq.gz\\n\\n# Run MultiQC to aggregate results\\nmultiqc -o $OUTPUT_DIR $OUTPUT_DIR\\n\"\n", + "}\n", + "```\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mCreated file rna-seq-workflow/01_quality_control.sh\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe quality control bash script has been created successfully. 
Next, I will create a bash script for the alignment step using STAR.\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Create File\",\n", + " \"action_input\": \"rna-seq-workflow/02_alignment.sh\\n#!/bin/bash\\n#PBS -N alignment\\n#PBS -l walltime=08:00:00\\n#PBS -l nodes=1:ppn=16\\n#PBS -q batch\\n\\n# Load module for STAR\\nmodule load STAR/2.7.3a\\n\\n# Define input and output directories\\nINPUT_DIR='/path/to/quality_control'\\nOUTPUT_DIR='/path/to/output/alignment'\\nGENOME_DIR='/path/to/genome_indices'\\n\\n# Create output directory if it doesn't exist\\nmkdir -p $OUTPUT_DIR\\n\\n# Run STAR for each sample\\nfor sample in $INPUT_DIR/*.fastq.gz\\ndo\\n # Define output file names\\n prefix=$(basename $sample .fastq.gz)\\n output_prefix=$OUTPUT_DIR/$prefix\\n\\n # Run STAR\\n STAR --runThreadN 16 --genomeDir $GENOME_DIR --readFilesIn $sample --readFilesCommand zcat --outFileNamePrefix $output_prefix --outSAMtype BAM SortedByCoordinate\\n\\ndone\\n\"\n", + "}\n", + "```\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mCreated file rna-seq-workflow/02_alignment.sh\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe alignment bash script has been created successfully. 
Next, I will create a bash script for transcript quantification using RSEM.\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Create File\",\n", + " \"action_input\": \"rna-seq-workflow/03_transcript_quantification.sh\\n#!/bin/bash\\n#PBS -N transcript_quantification\\n#PBS -l walltime=12:00:00\\n#PBS -l nodes=1:ppn=8\\n#PBS -q batch\\n\\n# Load module for RSEM\\nmodule load RSEM/1.3.3\\n\\n# Define input and output directories\\nINPUT_DIR='/path/to/alignment'\\nOUTPUT_DIR='/path/to/output/transcript_quantification'\\nREFERENCE_DIR='/path/to/reference'\\n\\n# Create output directory if it doesn't exist\\nmkdir -p $OUTPUT_DIR\\n\\n# Run RSEM for each sample\\nfor bam in $INPUT_DIR/*.bam\\ndo\\n # Define output file names\\n prefix=$(basename $bam .bam)\\n output_prefix=$OUTPUT_DIR/$prefix\\n\\n # Run RSEM\\n rsem-calculate-expression --bam --no-bam-output --estimate-rspd --append-names --output-genome-bam -p 8 $bam $REFERENCE_DIR $output_prefix\\n\\ndone\\n\"\n", + "}\n", + "```\n", + "\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mCreated file rna-seq-workflow/03_transcript_quantification.sh\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe transcript quantification bash script has been created successfully. 
Now, I will create a bash script for sorting and indexing BAM files using samtools, which is a common step before differential expression analysis.\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Create File\",\n", + " \"action_input\": \"rna-seq-workflow/04_sort_index_bam.sh\\n#!/bin/bash\\n#PBS -N sort_index_bam\\n#PBS -l walltime=02:00:00\\n#PBS -l nodes=1:ppn=4\\n#PBS -q batch\\n\\n# Load module for samtools\\nmodule load samtools/1.10\\n\\n# Define input and output directories\\nINPUT_DIR='/path/to/alignment'\\nOUTPUT_DIR='/path/to/output/sorted_bams'\\n\\n# Create output directory if it doesn't exist\\nmkdir -p $OUTPUT_DIR\\n\\n# Sort and index BAM files\\nfor bam in $INPUT_DIR/*.bam\\ndo\\n # Define output file names\\n sorted_bam=$OUTPUT_DIR/$(basename $bam .bam).sorted.bam\\n\\n # Sort BAM file\\n samtools sort -@ 4 -o $sorted_bam $bam\\n\\n # Index sorted BAM file\\n samtools index $sorted_bam\\ndone\\n\"\n", + "}\n", + "```\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mCreated file rna-seq-workflow/04_sort_index_bam.sh\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe sorting and indexing of BAM files script has been created successfully. The next step is to create an R script for differential expression analysis using DESeq2. 
This script will be executed after the previous steps in the workflow have been completed.\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Create File\",\n", + " \"action_input\": \"rna-seq-workflow/05_differential_expression.R\\n# Load DESeq2 library\\nlibrary(DESeq2)\\n\\n# Set input directory\\ninput_dir <- '/path/to/transcript_quantification'\\n\\n# Read in the count matrix and the sample information\\n# Assuming countData is a matrix with count data and colData is a DataFrame with sample information\\ncountData <- as.matrix(read.csv(file.path(input_dir, 'count_matrix.csv'), row.names=1))\\ncolData <- read.csv(file.path(input_dir, 'colData.csv'))\\n\\n# Create a DESeqDataSet object\\ndds <- DESeqDataSetFromMatrix(countData = countData,\\n colData = colData,\\n design = ~ condition)\\n\\n# Run the differential expression analysis\\ndds <- DESeq(dds)\\n\\n# Get the results\\nres <- results(dds)\\n\\n# Order the results by the adjusted p-value\\nresOrdered <- res[order(res$padj),]\\n\\n# Write the results to a CSV file\\nwrite.csv(as.data.frame(resOrdered), file.path(input_dir, 'differential_expression_results.csv'))\\n\"\n", + "}\n", + "```\n", + "\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mCreated file rna-seq-workflow/05_differential_expression.R\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe R script for differential expression analysis using DESeq2 has been created successfully. 
Now that all the necessary scripts for the RNA-Seq workflow have been created, I should create a master bash script that will call each of these scripts in sequence to run the entire workflow.\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Create File\",\n", + " \"action_input\": \"rna-seq-workflow/run_workflow.sh\\n#!/bin/bash\\n# Master script to run the RNA-Seq workflow\\n\\n# Run quality control\\nbash 01_quality_control.sh\\n\\n# Run alignment\\nbash 02_alignment.sh\\n\\n# Run transcript quantification\\nbash 03_transcript_quantification.sh\\n\\n# Sort and index BAM files\\nbash 04_sort_index_bam.sh\\n\\n# Run differential expression analysis\\nRscript 05_differential_expression.R\\n\\n# Workflow completed\\necho \\\"RNA-Seq workflow completed successfully.\\\"\\n\"\n", + "}\n", + "```\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mCreated file rna-seq-workflow/run_workflow.sh\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe master bash script to run the entire RNA-Seq workflow has been created successfully. The next step is to create a pull request with all the scripts that have been created in the `rna-seq-workflow_v1` branch.\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Create Pull Request\",\n", + " \"action_input\": \"Create a full command line executable workflow for RNA-Seq on PBMC Samples\\n\\nThis pull request includes a series of bash and R scripts to perform RNA-Seq analysis on PBMC samples. 
The workflow includes steps for quality control, alignment, transcript quantification, sorting and indexing BAM files, and differential expression analysis using the following software stack: FastQC, MultiQC, STAR, RSEM, samtools, and DESeq2.\\n\\nPlease review the scripts and provide feedback or request any additional changes as necessary.\"\n", + "}\n", + "```\u001b[0m\n", + "Observation: \u001b[36;1m\u001b[1;3mSuccessfully created PR number 40\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe pull request has been successfully created. Now, I need to comment on the original issue to inform the user about the new PR.\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Comment on Issue\",\n", + " \"action_input\": \"2\\n\\nI opened a PR for you to review here #40\"\n", + "}\n", + "```\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mCommented on issue 2\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe comment has been posted on the original issue. Now, I will create a review request for the user KastanDay who opened the issue.\n", + "\n", + "Action:\n", + "```json\n", + "{\n", + " \"action\": \"Create review request\",\n", + " \"action_input\": \"KastanDay\"\n", + "}\n", + "```\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mReview request created for user KastanDay on PR #40\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mAction:\n", + "```json\n", + "{\n", + " \"action\": \"Final Answer\",\n", + " \"action_input\": \"The tasks have been completed: a new branch was created, the necessary scripts for the RNA-Seq workflow were written, a pull request was opened, the original issue was commented on with the PR number, and a review request was sent to the user KastanDay.\"\n", + "}\n", + "```\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + } + ], + "source": [ + "from langchain_core.tracers.context import tracing_v2_enabled\n", + "\n", + "# To use langsmith (recommended for these long tasks):\n", + 
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", + "os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n", + "os.environ[\"LANGCHAIN_API_KEY\"] = \"ls__......\"\n", + "os.environ[\"LANGCHAIN_PROJECT\"] = \"Github_Demo_PR\"\n", + "os.environ[\"LANGCHAIN_WANDB_TRACING\"] = \"false\"\n", + "\n", + "\n", + "with tracing_v2_enabled(project_name=\"Github_Demo_PR\", tags=[\"PR_bot\"]) as cb:\n", + " agent.run(final_gh_issue_prompt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Full text of tools\n", + "\n", + "When using or building tools, it's always helpful to inspect what the model sees.\n", + "\n", + "On OpenAI models, tool descriptions are part of the `SystemPrompt`.\n", + "\n", + "The `args` are added to the prompt in structured chats, e.g. `AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION`, but not in `AgentType.ZERO_SHOT_REACT_DESCRIPTION`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get Issues: \n", + "This tool will fetch a list of the repository's issues. It will return the title, and issue number of 5 issues. It takes no input., args: {'no_input': {'title': 'No Input', 'description': 'No input required, e.g. `` (empty string).', 'default': '', 'type': 'string'}}\n", + "Get Issue: \n", + "This tool will fetch the title, body, and comment thread of a specific issue. **VERY IMPORTANT**: You must specify the issue number as an integer., args: {'issue_number': {'title': 'Issue Number', 'description': 'Issue number as an integer, e.g. `42`', 'default': 0, 'type': 'integer'}}\n", + "Comment on Issue: \n", + "This tool is useful when you need to comment on a GitHub issue. Simply pass in the issue number and the comment you would like to make. Please use this sparingly as we don't want to clutter the comment threads. 
**VERY IMPORTANT**: Your input to this tool MUST strictly follow these rules:\n", + "\n", + "- First you must specify the issue number as an integer\n", + "- Then you must place two newlines\n", + "- Then you must specify your comment, args: {'input': {'title': 'Input', 'description': 'Follow the required formatting.', 'type': 'string'}}\n", + "List open pull requests (PRs): \n", + "This tool will fetch a list of the repository's Pull Requests (PRs). It will return the title, and PR number of 5 PRs. It takes no input., args: {'no_input': {'title': 'No Input', 'description': 'No input required, e.g. `` (empty string).', 'default': '', 'type': 'string'}}\n", + "Get Pull Request: \n", + "This tool will fetch the title, body, comment thread and commit history of a specific Pull Request (by PR number). **VERY IMPORTANT**: You must specify the PR number as an integer., args: {'pr_number': {'title': 'Pr Number', 'description': 'The PR number as an integer, e.g. `12`', 'default': 0, 'type': 'integer'}}\n", + "Overview of files included in PR: \n", + "This tool will fetch the full text of all files in a pull request (PR) given the PR number as an input. This is useful for understanding the code changes in a PR or contributing to it. **VERY IMPORTANT**: You must specify the PR number as an integer input parameter., args: {'pr_number': {'title': 'Pr Number', 'description': 'The PR number as an integer, e.g. `12`', 'default': 0, 'type': 'integer'}}\n", + "Create Pull Request: \n", + "This tool is useful when you need to create a new pull request in a GitHub repository. 
**VERY IMPORTANT**: Your input to this tool MUST strictly follow these rules:\n", + "\n", + "- First you must specify the title of the pull request\n", + "- Then you must place two newlines\n", + "- Then you must write the body or description of the pull request\n", + "\n", + "When appropriate, always reference relevant issues in the body by using the syntax `closes #>>> OLD\n", + "- Then you must specify the new contents which you would like to replace the old contents with wrapped in NEW <<<< and >>>> NEW\n", + "\n", + "For example, if you would like to replace the contents of the file /test/test.txt from \"old contents\" to \"new contents\", you would pass in the following string:\n", + "\n", + "test/test.txt\n", + "\n", + "This is text that will not be changed\n", + "OLD <<<<\n", + "old contents\n", + ">>>> OLD\n", + "NEW <<<<\n", + "new contents\n", + ">>>> NEW, args: {'formatted_file_update': {'title': 'Formatted File Update', 'description': 'Strictly follow the provided rules.', 'type': 'string'}}\n", + "Delete File: \n", + "This tool is a wrapper for the GitHub API, useful when you need to delete a file in a GitHub repository. Simply pass in the full file path of the file you would like to delete. **IMPORTANT**: the path must not start with a slash, args: {'formatted_filepath': {'title': 'Formatted Filepath', 'description': 'The full file path of the file you would like to delete where the path must NOT start with a slash, e.g. `some_dir/my_file.py`. Only input a string, not the param name.', 'type': 'string'}}\n", + "Overview of existing files in Main branch: \n", + "This tool will provide an overview of all existing files in the main branch of the repository. It will list the file names, their respective paths, and a brief summary of their contents. This can be useful for understanding the structure and content of the repository, especially when navigating through large codebases. 
No input parameters are required., args: {'no_input': {'title': 'No Input', 'description': 'No input required, e.g. `` (empty string).', 'default': '', 'type': 'string'}}\n", + "Overview of files in current working branch: \n", + "This tool will provide an overview of all files in your current working branch where you should implement changes. This is great for getting a high level overview of the structure of your code. No input parameters are required., args: {'no_input': {'title': 'No Input', 'description': 'No input required, e.g. `` (empty string).', 'default': '', 'type': 'string'}}\n", + "List branches in this repository: \n", + "This tool will fetch a list of all branches in the repository. It will return the name of each branch. No input parameters are required., args: {'no_input': {'title': 'No Input', 'description': 'No input required, e.g. `` (empty string).', 'default': '', 'type': 'string'}}\n", + "Set active branch: \n", + "This tool will set the active branch in the repository, similar to `git checkout ` and `git switch -c `. **VERY IMPORTANT**: You must specify the name of the branch as a string input parameter., args: {'branch_name': {'title': 'Branch Name', 'description': 'The name of the branch, e.g. `my_branch`.', 'type': 'string'}}\n", + "Create a new branch: \n", + "This tool will create a new branch in the repository. **VERY IMPORTANT**: You must specify the name of the new branch as a string input parameter., args: {'branch_name': {'title': 'Branch Name', 'description': 'The name of the branch, e.g. `my_branch`.', 'type': 'string'}}\n", + "Get files from a directory: \n", + "This tool will fetch a list of all files in a specified directory. **VERY IMPORTANT**: You must specify the path of the directory as a string input parameter., args: {'input': {'title': 'Input', 'description': 'The path of the directory, e.g. `some_dir/inner_dir`. 
Only input a string, do not include the parameter name.', 'default': '', 'type': 'string'}}\n", + "Search issues and pull requests: \n", + "This tool will search for issues and pull requests in the repository. **VERY IMPORTANT**: You must specify the search query as a string input parameter., args: {'search_query': {'title': 'Search Query', 'description': 'Natural language search query, e.g. `My issue title or topic`.', 'type': 'string'}}\n", + "Search code: \n", + "This tool will search for code in the repository. **VERY IMPORTANT**: You must specify the search query as a string input parameter., args: {'search_query': {'title': 'Search Query', 'description': 'A keyword-focused natural language search query for code, e.g. `MyFunctionName()`.', 'type': 'string'}}\n", + "Create review request: \n", + "This tool will create a review request on the open pull request that matches the current active branch. **VERY IMPORTANT**: You must specify the username of the person who is being requested as a string input parameter., args: {'username': {'title': 'Username', 'description': 'GitHub username of the user being requested, e.g. `my_username`.', 'type': 'string'}}\n" + ] + } + ], + "source": [ + "from langchain.tools.render import render_text_description_and_args\n", + "\n", + "print(render_text_description_and_args(tools))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example: Agent with Search\n", "\n", "If your agent does not need to use all 8 tools, you can build tools individually to use. For this example, we'll make an agent that does not use the create_file, delete_file or create_pull_request tools, but can also use duckduckgo-search." 
] @@ -382,7 +840,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/libs/langchain/langchain/agents/agent_toolkits/github/toolkit.py b/libs/langchain/langchain/agents/agent_toolkits/github/toolkit.py index 390866ac886..fb437b7a4f8 100644 --- a/libs/langchain/langchain/agents/agent_toolkits/github/toolkit.py +++ b/libs/langchain/langchain/agents/agent_toolkits/github/toolkit.py @@ -1,22 +1,129 @@ """GitHub Toolkit.""" from typing import Dict, List +from langchain_core.pydantic_v1 import BaseModel + from langchain.agents.agent_toolkits.base import BaseToolkit +from langchain.pydantic_v1 import Field from langchain.tools import BaseTool from langchain.tools.github.prompt import ( COMMENT_ON_ISSUE_PROMPT, + CREATE_BRANCH_PROMPT, CREATE_FILE_PROMPT, CREATE_PULL_REQUEST_PROMPT, + CREATE_REVIEW_REQUEST_PROMPT, DELETE_FILE_PROMPT, + GET_FILES_FROM_DIRECTORY_PROMPT, GET_ISSUE_PROMPT, GET_ISSUES_PROMPT, + GET_PR_PROMPT, + LIST_BRANCHES_IN_REPO_PROMPT, + LIST_PRS_PROMPT, + LIST_PULL_REQUEST_FILES, + OVERVIEW_EXISTING_FILES_BOT_BRANCH, + OVERVIEW_EXISTING_FILES_IN_MAIN, READ_FILE_PROMPT, + SEARCH_CODE_PROMPT, + SEARCH_ISSUES_AND_PRS_PROMPT, + SET_ACTIVE_BRANCH_PROMPT, UPDATE_FILE_PROMPT, ) from langchain.tools.github.tool import GitHubAction from langchain.utilities.github import GitHubAPIWrapper +class NoInput(BaseModel): + no_input: str = Field("", description="No input required, e.g. `` (empty string).") + + +class GetIssue(BaseModel): + issue_number: int = Field(0, description="Issue number as an integer, e.g. `42`") + + +class CommentOnIssue(BaseModel): + input: str = Field(..., description="Follow the required formatting.") + + +class GetPR(BaseModel): + pr_number: int = Field(0, description="The PR number as an integer, e.g. 
`12`") + + +class CreatePR(BaseModel): + formatted_pr: str = Field(..., description="Follow the required formatting.") + + +class CreateFile(BaseModel): + formatted_file: str = Field(..., description="Follow the required formatting.") + + +class ReadFile(BaseModel): + formatted_filepath: str = Field( + ..., + description=( + "The full file path of the file you would like to read where the " + "path must NOT start with a slash, e.g. `some_dir/my_file.py`." + ), + ) + + +class UpdateFile(BaseModel): + formatted_file_update: str = Field( + ..., description="Strictly follow the provided rules." + ) + + +class DeleteFile(BaseModel): + formatted_filepath: str = Field( + ..., + description=( + "The full file path of the file you would like to delete" + " where the path must NOT start with a slash, e.g." + " `some_dir/my_file.py`. Only input a string," + " not the param name." + ), + ) + + +class DirectoryPath(BaseModel): + input: str = Field( + "", + description=( + "The path of the directory, e.g. `some_dir/inner_dir`." + " Only input a string, do not include the parameter name." + ), + ) + + +class BranchName(BaseModel): + branch_name: str = Field( + ..., description="The name of the branch, e.g. `my_branch`." + ) + + +class SearchCode(BaseModel): + search_query: str = Field( + ..., + description=( + "A keyword-focused natural language search" + "query for code, e.g. `MyFunctionName()`." + ), + ) + + +class CreateReviewRequest(BaseModel): + username: str = Field( + ..., + description="GitHub username of the user being requested, e.g. `my_username`.", + ) + + +class SearchIssuesAndPRs(BaseModel): + search_query: str = Field( + ..., + description="Natural language search query, e.g. `My issue title or topic`.", + ) + + class GitHubToolkit(BaseToolkit): """GitHub Toolkit. 
@@ -41,41 +148,127 @@ class GitHubToolkit(BaseToolkit): "mode": "get_issues", "name": "Get Issues", "description": GET_ISSUES_PROMPT, + "args_schema": NoInput, }, { "mode": "get_issue", "name": "Get Issue", "description": GET_ISSUE_PROMPT, + "args_schema": GetIssue, }, { "mode": "comment_on_issue", "name": "Comment on Issue", "description": COMMENT_ON_ISSUE_PROMPT, + "args_schema": CommentOnIssue, + }, + { + "mode": "list_open_pull_requests", + "name": "List open pull requests (PRs)", + "description": LIST_PRS_PROMPT, + "args_schema": NoInput, + }, + { + "mode": "get_pull_request", + "name": "Get Pull Request", + "description": GET_PR_PROMPT, + "args_schema": GetPR, + }, + { + "mode": "list_pull_request_files", + "name": "Overview of files included in PR", + "description": LIST_PULL_REQUEST_FILES, + "args_schema": GetPR, }, { "mode": "create_pull_request", "name": "Create Pull Request", "description": CREATE_PULL_REQUEST_PROMPT, + "args_schema": CreatePR, + }, + { + "mode": "list_pull_request_files", + "name": "List Pull Requests' Files", + "description": LIST_PULL_REQUEST_FILES, + "args_schema": GetPR, }, { "mode": "create_file", "name": "Create File", "description": CREATE_FILE_PROMPT, + "args_schema": CreateFile, }, { "mode": "read_file", "name": "Read File", "description": READ_FILE_PROMPT, + "args_schema": ReadFile, }, { "mode": "update_file", "name": "Update File", "description": UPDATE_FILE_PROMPT, + "args_schema": UpdateFile, }, { "mode": "delete_file", "name": "Delete File", "description": DELETE_FILE_PROMPT, + "args_schema": DeleteFile, + }, + { + "mode": "list_files_in_main_branch", + "name": "Overview of existing files in Main branch", + "description": OVERVIEW_EXISTING_FILES_IN_MAIN, + "args_schema": NoInput, + }, + { + "mode": "list_files_in_bot_branch", + "name": "Overview of files in current working branch", + "description": OVERVIEW_EXISTING_FILES_BOT_BRANCH, + "args_schema": NoInput, + }, + { + "mode": "list_branches_in_repo", + "name": "List 
branches in this repository", + "description": LIST_BRANCHES_IN_REPO_PROMPT, + "args_schema": NoInput, + }, + { + "mode": "set_active_branch", + "name": "Set active branch", + "description": SET_ACTIVE_BRANCH_PROMPT, + "args_schema": BranchName, + }, + { + "mode": "create_branch", + "name": "Create a new branch", + "description": CREATE_BRANCH_PROMPT, + "args_schema": BranchName, + }, + { + "mode": "get_files_from_directory", + "name": "Get files from a directory", + "description": GET_FILES_FROM_DIRECTORY_PROMPT, + "args_schema": DirectoryPath, + }, + { + "mode": "search_issues_and_prs", + "name": "Search issues and pull requests", + "description": SEARCH_ISSUES_AND_PRS_PROMPT, + "args_schema": SearchIssuesAndPRs, + }, + { + "mode": "search_code", + "name": "Search code", + "description": SEARCH_CODE_PROMPT, + "args_schema": SearchCode, + }, + { + "mode": "create_review_request", + "name": "Create review request", + "description": CREATE_REVIEW_REQUEST_PROMPT, + "args_schema": CreateReviewRequest, }, ] tools = [ @@ -84,6 +277,7 @@ class GitHubToolkit(BaseToolkit): description=action["description"], mode=action["mode"], api_wrapper=github_api_wrapper, + args_schema=action.get("args_schema", None), ) for action in operations ] diff --git a/libs/langchain/langchain/tools/github/prompt.py b/libs/langchain/langchain/tools/github/prompt.py index e0e72a808b8..3d66713e02b 100644 --- a/libs/langchain/langchain/tools/github/prompt.py +++ b/libs/langchain/langchain/tools/github/prompt.py @@ -1,19 +1,17 @@ # flake8: noqa GET_ISSUES_PROMPT = """ -This tool will fetch a list of the repository's issues. It will return the title, and issue number of 5 issues. It takes no input. -""" +This tool will fetch a list of the repository's issues. It will return the title, and issue number of 5 issues. It takes no input.""" GET_ISSUE_PROMPT = """ -This tool will fetch the title, body, and comment thread of a specific issue. 
**VERY IMPORTANT**: You must specify the issue number as an integer. -""" +This tool will fetch the title, body, and comment thread of a specific issue. **VERY IMPORTANT**: You must specify the issue number as an integer.""" COMMENT_ON_ISSUE_PROMPT = """ This tool is useful when you need to comment on a GitHub issue. Simply pass in the issue number and the comment you would like to make. Please use this sparingly as we don't want to clutter the comment threads. **VERY IMPORTANT**: Your input to this tool MUST strictly follow these rules: - First you must specify the issue number as an integer - Then you must place two newlines -- Then you must specify your comment -""" +- Then you must specify your comment""" + CREATE_PULL_REQUEST_PROMPT = """ This tool is useful when you need to create a new pull request in a GitHub repository. **VERY IMPORTANT**: Your input to this tool MUST strictly follow these rules: @@ -21,13 +19,13 @@ This tool is useful when you need to create a new pull request in a GitHub repos - Then you must place two newlines - Then you must write the body or description of the pull request -To reference an issue in the body, put its issue number directly after a #. -For example, if you would like to create a pull request called "README updates" with contents "added contributors' names, closes issue #3", you would pass in the following string: +When appropriate, always reference relevant issues in the body by using the syntax `closes #>>> OLD NEW <<<< new contents ->>>> NEW -""" +>>>> NEW""" DELETE_FILE_PROMPT = """ -This tool is a wrapper for the GitHub API, useful when you need to delete a file in a GitHub repository. Simply pass in the full file path of the file you would like to delete. **IMPORTANT**: the path must not start with a slash -""" +This tool is a wrapper for the GitHub API, useful when you need to delete a file in a GitHub repository. Simply pass in the full file path of the file you would like to delete. 
**IMPORTANT**: the path must not start with a slash""" + +GET_PR_PROMPT = """ +This tool will fetch the title, body, comment thread and commit history of a specific Pull Request (by PR number). **VERY IMPORTANT**: You must specify the PR number as an integer.""" + +LIST_PRS_PROMPT = """ +This tool will fetch a list of the repository's Pull Requests (PRs). It will return the title, and PR number of 5 PRs. It takes no input.""" + +LIST_PULL_REQUEST_FILES = """ +This tool will fetch the full text of all files in a pull request (PR) given the PR number as an input. This is useful for understanding the code changes in a PR or contributing to it. **VERY IMPORTANT**: You must specify the PR number as an integer input parameter.""" + +OVERVIEW_EXISTING_FILES_IN_MAIN = """ +This tool will provide an overview of all existing files in the main branch of the repository. It will list the file names, their respective paths, and a brief summary of their contents. This can be useful for understanding the structure and content of the repository, especially when navigating through large codebases. No input parameters are required.""" + +OVERVIEW_EXISTING_FILES_BOT_BRANCH = """ +This tool will provide an overview of all files in your current working branch where you should implement changes. This is great for getting a high level overview of the structure of your code. No input parameters are required.""" + +SEARCH_ISSUES_AND_PRS_PROMPT = """ +This tool will search for issues and pull requests in the repository. **VERY IMPORTANT**: You must specify the search query as a string input parameter.""" + +SEARCH_CODE_PROMPT = """ +This tool will search for code in the repository. **VERY IMPORTANT**: You must specify the search query as a string input parameter.""" + +CREATE_REVIEW_REQUEST_PROMPT = """ +This tool will create a review request on the open pull request that matches the current active branch. 
**VERY IMPORTANT**: You must specify the username of the person who is being requested as a string input parameter.""" + +LIST_BRANCHES_IN_REPO_PROMPT = """ +This tool will fetch a list of all branches in the repository. It will return the name of each branch. No input parameters are required.""" + +SET_ACTIVE_BRANCH_PROMPT = """ +This tool will set the active branch in the repository, similar to `git checkout ` and `git switch -c `. **VERY IMPORTANT**: You must specify the name of the branch as a string input parameter.""" + +CREATE_BRANCH_PROMPT = """ +This tool will create a new branch in the repository. **VERY IMPORTANT**: You must specify the name of the new branch as a string input parameter.""" + +GET_FILES_FROM_DIRECTORY_PROMPT = """ +This tool will fetch a list of all files in a specified directory. **VERY IMPORTANT**: You must specify the path of the directory as a string input parameter.""" diff --git a/libs/langchain/langchain/tools/github/tool.py b/libs/langchain/langchain/tools/github/tool.py index 6099ea47d8b..71761f63587 100644 --- a/libs/langchain/langchain/tools/github/tool.py +++ b/libs/langchain/langchain/tools/github/tool.py @@ -7,9 +7,9 @@ To use this tool, you must first set as environment variables: GITHUB_REPOSITORY -> format: {owner}/{repo} """ -from typing import Optional +from typing import Optional, Type -from langchain_core.pydantic_v1 import Field +from langchain_core.pydantic_v1 import BaseModel, Field from langchain.callbacks.manager import CallbackManagerForToolRun from langchain.tools.base import BaseTool @@ -23,11 +23,15 @@ class GitHubAction(BaseTool): mode: str name: str = "" description: str = "" + args_schema: Optional[Type[BaseModel]] = None def _run( self, - instructions: str, + instructions: Optional[str] = "", run_manager: Optional[CallbackManagerForToolRun] = None, ) -> str: """Use the GitHub API to run an operation.""" + if not instructions or instructions == "{}": + # Catch other forms of empty input that GPT-4 likes 
to send. + instructions = "" return self.api_wrapper.run(self.mode, instructions) diff --git a/libs/langchain/langchain/utilities/github.py b/libs/langchain/langchain/utilities/github.py index 234ec87b1e9..94070871833 100644 --- a/libs/langchain/langchain/utilities/github.py +++ b/libs/langchain/langchain/utilities/github.py @@ -4,12 +4,15 @@ from __future__ import annotations import json from typing import TYPE_CHECKING, Any, Dict, List, Optional +import requests +import tiktoken from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator from langchain.utils import get_from_dict_or_env if TYPE_CHECKING: from github.Issue import Issue + from github.PullRequest import PullRequest class GitHubAPIWrapper(BaseModel): @@ -20,7 +23,7 @@ class GitHubAPIWrapper(BaseModel): github_repository: Optional[str] = None github_app_id: Optional[str] = None github_app_private_key: Optional[str] = None - github_branch: Optional[str] = None + active_branch: Optional[str] = None github_base_branch: Optional[str] = None class Config: @@ -41,13 +44,6 @@ class GitHubAPIWrapper(BaseModel): values, "github_app_private_key", "GITHUB_APP_PRIVATE_KEY" ) - github_branch = get_from_dict_or_env( - values, "github_branch", "GITHUB_BRANCH", default="master" - ) - github_base_branch = get_from_dict_or_env( - values, "github_base_branch", "GITHUB_BASE_BRANCH", default="master" - ) - try: from github import Auth, GithubIntegration @@ -57,8 +53,13 @@ class GitHubAPIWrapper(BaseModel): "Please install it with `pip install PyGithub`" ) - with open(github_app_private_key, "r") as f: - private_key = f.read() + try: + # interpret the key as a file path + # fallback to interpreting as the key itself + with open(github_app_private_key, "r") as f: + private_key = f.read() + except Exception: + private_key = github_app_private_key auth = Auth.AppAuth( github_app_id, @@ -69,13 +70,28 @@ class GitHubAPIWrapper(BaseModel): # create a GitHub instance: g = installation.get_github_for_installation() + 
repo = g.get_repo(github_repository) + + github_base_branch = get_from_dict_or_env( + values, + "github_base_branch", + "GITHUB_BASE_BRANCH", + default=repo.default_branch, + ) + + active_branch = get_from_dict_or_env( + values, + "active_branch", + "ACTIVE_BRANCH", + default=repo.default_branch, + ) values["github"] = g - values["github_repo_instance"] = g.get_repo(github_repository) + values["github_repo_instance"] = repo values["github_repository"] = github_repository values["github_app_id"] = github_app_id values["github_app_private_key"] = github_app_private_key - values["github_branch"] = github_branch + values["active_branch"] = active_branch values["github_base_branch"] = github_base_branch return values @@ -92,19 +108,45 @@ class GitHubAPIWrapper(BaseModel): for issue in issues: title = issue.title number = issue.number - parsed.append({"title": title, "number": number}) + opened_by = issue.user.login if issue.user else None + issue_dict = {"title": title, "number": number} + if opened_by is not None: + issue_dict["opened_by"] = opened_by + parsed.append(issue_dict) + return parsed + + def parse_pull_requests(self, pull_requests: List[PullRequest]) -> List[dict]: + """ + Extracts title and number from each Issue and puts them in a dictionary + Parameters: + issues(List[Issue]): A list of Github Issue objects + Returns: + List[dict]: A dictionary of issue titles and numbers + """ + parsed = [] + for pr in pull_requests: + parsed.append( + { + "title": pr.title, + "number": pr.number, + "commits": str(pr.commits), + "comments": str(pr.comments), + } + ) return parsed def get_issues(self) -> str: """ - Fetches all open issues from the repo + Fetches all open issues from the repo excluding pull requests Returns: str: A plaintext report containing the number of issues and each issue's title and number. 
""" issues = self.github_repo_instance.get_issues(state="open") - if issues.totalCount > 0: + # Filter out pull requests (part of GH issues object) + issues = [issue for issue in issues if not issue.pull_request] + if issues: parsed_issues = self.parse_issues(issues) parsed_issues_str = ( "Found " + str(len(parsed_issues)) + " issues:\n" + str(parsed_issues) @@ -113,14 +155,201 @@ class GitHubAPIWrapper(BaseModel): else: return "No open issues available" + def list_open_pull_requests(self) -> str: + """ + Fetches all open PRs from the repo + + Returns: + str: A plaintext report containing the number of PRs + and each PR's title and number. + """ + # issues = self.github_repo_instance.get_issues(state="open") + pull_requests = self.github_repo_instance.get_pulls(state="open") + if pull_requests.totalCount > 0: + parsed_prs = self.parse_pull_requests(pull_requests) + parsed_prs_str = ( + "Found " + str(len(parsed_prs)) + " pull requests:\n" + str(parsed_prs) + ) + return parsed_prs_str + else: + return "No open pull requests available" + + def list_files_in_main_branch(self) -> str: + """ + Fetches all files in the main branch of the repo. + + Returns: + str: A plaintext report containing the paths and names of the files. + """ + files: List[str] = [] + try: + contents = self.github_repo_instance.get_contents( + "", ref=self.github_base_branch + ) + for content in contents: + if content.type == "dir": + files.extend(self.get_files_from_directory(content.path)) + else: + files.append(content.path) + + if files: + files_str = "\n".join(files) + return f"Found {len(files)} files in the main branch:\n{files_str}" + else: + return "No files found in the main branch" + except Exception as e: + return str(e) + + def set_active_branch(self, branch_name: str) -> str: + """Equivalent to `git checkout branch_name` for this Agent. + Clones formatting from Github. + + Returns an Error (as a string) if branch doesn't exist. 
+ """ + curr_branches = [ + branch.name for branch in self.github_repo_instance.get_branches() + ] + if branch_name in curr_branches: + self.active_branch = branch_name + return f"Switched to branch `{branch_name}`" + else: + return ( + f"Error {branch_name} does not exist," + f"in repo with current branches: {str(curr_branches)}" + ) + + def list_branches_in_repo(self) -> str: + """ + Fetches a list of all branches in the repository. + + Returns: + str: A plaintext report containing the names of the branches. + """ + try: + branches = [ + branch.name for branch in self.github_repo_instance.get_branches() + ] + if branches: + branches_str = "\n".join(branches) + return ( + f"Found {len(branches)} branches in the repository:" + f"\n{branches_str}" + ) + else: + return "No branches found in the repository" + except Exception as e: + return str(e) + + def create_branch(self, proposed_branch_name: str) -> str: + """ + Create a new branch, and set it as the active bot branch. + Equivalent to `git switch -c proposed_branch_name` + If the proposed branch already exists, we append _v1 then _v2... + until a unique name is found. + + Returns: + str: A plaintext success message. + """ + from github import GithubException + + i = 0 + new_branch_name = proposed_branch_name + base_branch = self.github_repo_instance.get_branch( + self.github_repo_instance.default_branch + ) + for i in range(1000): + try: + self.github_repo_instance.create_git_ref( + ref=f"refs/heads/{new_branch_name}", sha=base_branch.commit.sha + ) + self.active_branch = new_branch_name + return ( + f"Branch '{new_branch_name}' " + "created successfully, and set as current active branch." + ) + except GithubException as e: + if e.status == 422 and "Reference already exists" in e.data["message"]: + i += 1 + new_branch_name = f"{proposed_branch_name}_v{i}" + else: + # Handle any other exceptions + print(f"Failed to create branch. 
Error: {e}") + raise Exception( + "Unable to create branch name from proposed_branch_name: " + f"{proposed_branch_name}" + ) + return ( + "Unable to create branch. " + "At least 1000 branches exist with named derived from " + f"proposed_branch_name: `{proposed_branch_name}`" + ) + + def list_files_in_bot_branch(self) -> str: + """ + Fetches all files in the active branch of the repo, + the branch the bot uses to make changes. + + Returns: + str: A plaintext list containing the the filepaths in the branch. + """ + files: List[str] = [] + try: + contents = self.github_repo_instance.get_contents( + "", ref=self.active_branch + ) + for content in contents: + if content.type == "dir": + files.extend(self.get_files_from_directory(content.path)) + else: + files.append(content.path) + + if files: + files_str = "\n".join(files) + return ( + f"Found {len(files)} files in branch `{self.active_branch}`:\n" + f"{files_str}" + ) + else: + return f"No files found in branch: `{self.active_branch}`" + except Exception as e: + return f"Error: {e}" + + def get_files_from_directory(self, directory_path: str) -> str: + """ + Recursively fetches files from a directory in the repo. + + Parameters: + directory_path (str): Path to the directory + + Returns: + str: List of file paths, or an error message. 
+ """ + from github import GithubException + + files: List[str] = [] + try: + contents = self.github_repo_instance.get_contents( + directory_path, ref=self.active_branch + ) + except GithubException as e: + return f"Error: status code {e.status}, {e.message}" + + for content in contents: + if content.type == "dir": + files.extend(self.get_files_from_directory(content.path)) + else: + files.append(content.path) + return str(files) + def get_issue(self, issue_number: int) -> Dict[str, Any]: """ Fetches a specific issue and its first 10 comments Parameters: issue_number(int): The number for the github issue Returns: - dict: A doctionary containing the issue's title, - body, and comments as a string + dict: A dictionary containing the issue's title, + body, comments as a string, and the username of the user + who opened the issue """ issue = self.github_repo_instance.get_issue(number=issue_number) page = 0 @@ -133,12 +362,142 @@ class GitHubAPIWrapper(BaseModel): comments.append({"body": comment.body, "user": comment.user.login}) page += 1 + opened_by = None + if issue.user and issue.user.login: + opened_by = issue.user.login + return { + "number": issue_number, "title": issue.title, "body": issue.body, "comments": str(comments), + "opened_by": str(opened_by), } + def list_pull_request_files(self, pr_number: int) -> List[Dict[str, Any]]: + """Fetches the full text of all files in a PR. Truncates after first 3k tokens. + # TODO: Enhancement to summarize files with ctags if they're getting long. 
+ + Args: + pr_number(int): The number of the pull request on Github + + Returns: + dict: A dictionary containing the issue's title, + body, and comments as a string + """ + MAX_TOKENS_FOR_FILES = 3_000 + pr_files = [] + pr = self.github_repo_instance.get_pull(number=int(pr_number)) + total_tokens = 0 + page = 0 + while True: # or while (total_tokens + tiktoken()) < MAX_TOKENS_FOR_FILES: + files_page = pr.get_files().get_page(page) + if len(files_page) == 0: + break + for file in files_page: + try: + file_metadata_response = requests.get(file.contents_url) + if file_metadata_response.status_code == 200: + download_url = json.loads(file_metadata_response.text)[ + "download_url" + ] + else: + print(f"Failed to download file: {file.contents_url}, skipping") + continue + + file_content_response = requests.get(download_url) + if file_content_response.status_code == 200: + # Save the content as a UTF-8 string + file_content = file_content_response.text + else: + print( + "Failed downloading file content " + f"(Error {file_content_response.status_code}). Skipping" + ) + continue + + file_tokens = len( + tiktoken.get_encoding("cl100k_base").encode( + file_content + file.filename + "file_name file_contents" + ) + ) + if (total_tokens + file_tokens) < MAX_TOKENS_FOR_FILES: + pr_files.append( + { + "filename": file.filename, + "contents": file_content, + "additions": file.additions, + "deletions": file.deletions, + } + ) + total_tokens += file_tokens + except Exception as e: + print(f"Error when reading files from a PR on github. {e}") + page += 1 + return pr_files + + def get_pull_request(self, pr_number: int) -> Dict[str, Any]: + """ + Fetches a specific pull request and its first 10 comments, + limited by max_tokens. 
+ + Parameters: + pr_number(int): The number for the Github pull + max_tokens(int): The maximum number of tokens in the response + Returns: + dict: A dictionary containing the pull's title, body, + and comments as a string + """ + max_tokens = 2_000 + pull = self.github_repo_instance.get_pull(number=pr_number) + total_tokens = 0 + + def get_tokens(text: str) -> int: + return len(tiktoken.get_encoding("cl100k_base").encode(text)) + + def add_to_dict(data_dict: Dict[str, Any], key: str, value: str) -> None: + nonlocal total_tokens # Declare total_tokens as nonlocal + tokens = get_tokens(value) + if total_tokens + tokens <= max_tokens: + data_dict[key] = value + total_tokens += tokens # Now this will modify the outer variable + + response_dict: Dict[str, str] = {} + add_to_dict(response_dict, "title", pull.title) + add_to_dict(response_dict, "number", str(pr_number)) + add_to_dict(response_dict, "body", pull.body) + + comments: List[str] = [] + page = 0 + while len(comments) <= 10: + comments_page = pull.get_issue_comments().get_page(page) + if len(comments_page) == 0: + break + for comment in comments_page: + comment_str = str({"body": comment.body, "user": comment.user.login}) + if total_tokens + get_tokens(comment_str) > max_tokens: + break + comments.append(comment_str) + total_tokens += get_tokens(comment_str) + page += 1 + add_to_dict(response_dict, "comments", str(comments)) + + commits: List[str] = [] + page = 0 + while len(commits) <= 10: + commits_page = pull.get_commits().get_page(page) + if len(commits_page) == 0: + break + for commit in commits_page: + commit_str = str({"message": commit.commit.message}) + if total_tokens + get_tokens(commit_str) > max_tokens: + break + commits.append(commit_str) + total_tokens += get_tokens(commit_str) + page += 1 + add_to_dict(response_dict, "commits", str(commits)) + return response_dict + def create_pull_request(self, pr_query: str) -> str: """ Makes a pull request from the bot's branch to the base branch @@ -150,9 
+509,9 @@ class GitHubAPIWrapper(BaseModel): Returns: str: A success or failure message """ - if self.github_base_branch == self.github_branch: + if self.github_base_branch == self.active_branch: return """Cannot make a pull request because - commits are already in the master branch""" + commits are already in the main or master branch.""" else: try: title = pr_query.split("\n")[0] @@ -160,7 +519,7 @@ class GitHubAPIWrapper(BaseModel): pr = self.github_repo_instance.create_pull( title=title, body=body, - head=self.github_branch, + head=self.active_branch, base=self.github_base_branch, ) return f"Successfully created PR number {pr.number}" @@ -198,33 +557,60 @@ class GitHubAPIWrapper(BaseModel): Returns: str: A success or failure message """ + if self.active_branch == self.github_base_branch: + return ( + "You're attempting to commit to the directly to the" + f"{self.github_base_branch} branch, which is protected. " + "Please create a new branch and try again." + ) + file_path = file_query.split("\n")[0] file_contents = file_query[len(file_path) + 2 :] + try: - exists = self.github_repo_instance.get_contents(file_path) - if exists is None: - self.github_repo_instance.create_file( - path=file_path, - message="Create " + file_path, - content=file_contents, - branch=self.github_branch, + try: + file = self.github_repo_instance.get_contents( + file_path, ref=self.active_branch ) - return "Created file " + file_path - else: - return f"File already exists at {file_path}. Use update_file instead" + if file: + return ( + f"File already exists at `{file_path}` " + f"on branch `{self.active_branch}`. You must use " + "`update_file` to modify it." 
+ ) + except Exception: + # expected behavior, file shouldn't exist yet + pass + + self.github_repo_instance.create_file( + path=file_path, + message="Create " + file_path, + content=file_contents, + branch=self.active_branch, + ) + return "Created file " + file_path except Exception as e: return "Unable to make file due to error:\n" + str(e) def read_file(self, file_path: str) -> str: """ - Reads a file from the github repo + Read a file from this agent's branch, defined by self.active_branch, + which supports PR branches. Parameters: file_path(str): the file path Returns: - str: The file decoded as a string + str: The file decoded as a string, or an error message if not found """ - file = self.github_repo_instance.get_contents(file_path) - return file.decoded_content.decode("utf-8") + try: + file = self.github_repo_instance.get_contents( + file_path, ref=self.active_branch + ) + return file.decoded_content.decode("utf-8") + except Exception as e: + return ( + f"File not found `{file_path}` on branch" + f"`{self.active_branch}`. Error: {str(e)}" + ) def update_file(self, file_query: str) -> str: """ @@ -244,8 +630,14 @@ class GitHubAPIWrapper(BaseModel): Returns: A success or failure message """ + if self.active_branch == self.github_base_branch: + return ( + "You're attempting to commit to the directly" + f"to the {self.github_base_branch} branch, which is protected. " + "Please create a new branch and try again." 
+ ) try: - file_path = file_query.split("\n")[0] + file_path: str = file_query.split("\n")[0] old_file_contents = ( file_query.split("OLD <<<<")[1].split(">>>> OLD")[0].strip() ) @@ -267,12 +659,14 @@ class GitHubAPIWrapper(BaseModel): self.github_repo_instance.update_file( path=file_path, - message="Update " + file_path, + message="Update " + str(file_path), content=updated_file_content, - branch=self.github_branch, - sha=self.github_repo_instance.get_contents(file_path).sha, + branch=self.active_branch, + sha=self.github_repo_instance.get_contents( + file_path, ref=self.active_branch + ).sha, ) - return "Updated file " + file_path + return "Updated file " + str(file_path) except Exception as e: return "Unable to update file due to error:\n" + str(e) @@ -284,23 +678,119 @@ class GitHubAPIWrapper(BaseModel): Returns: str: Success or failure message """ + if self.active_branch == self.github_base_branch: + return ( + "You're attempting to commit to the directly" + f"to the {self.github_base_branch} branch, which is protected. " + "Please create a new branch and try again." + ) try: - file = self.github_repo_instance.get_contents(file_path) self.github_repo_instance.delete_file( path=file_path, message="Delete " + file_path, - branch=self.github_branch, - sha=file.sha, + branch=self.active_branch, + sha=self.github_repo_instance.get_contents( + file_path, ref=self.active_branch + ).sha, ) return "Deleted file " + file_path except Exception as e: return "Unable to delete file due to error:\n" + str(e) + def search_issues_and_prs(self, query: str) -> str: + """ + Searches issues and pull requests in the repository. 
+ + Parameters: + query(str): The search query + + Returns: + str: A string containing the first 5 issues and pull requests + """ + search_result = self.github.search_issues(query, repo=self.github_repository) + max_items = min(5, len(search_result)) + results = [f"Top {max_items} results:"] + for issue in search_result[:max_items]: + results.append( + f"Title: {issue.title}, Number: {issue.number}, State: {issue.state}" + ) + return "\n".join(results) + + def search_code(self, query: str) -> str: + """ + Searches code in the repository. + # Todo: limit total tokens returned... + + Parameters: + query(str): The search query + + Returns: + str: A string containing, at most, the top 5 search results + """ + search_result = self.github.search_code( + query=query, repo=self.github_repository + ) + if search_result.totalCount == 0: + return "0 results found." + max_results = min(5, search_result.totalCount) + results = [f"Showing top {max_results} of {search_result.totalCount} results:"] + count = 0 + for code in search_result: + if count >= max_results: + break + # Get the file content using the PyGithub get_contents method + file_content = self.github_repo_instance.get_contents( + code.path, ref=self.active_branch + ).decoded_content.decode() + results.append( + f"Filepath: `{code.path}`\nFile contents: " + f"{file_content}\n" + ) + count += 1 + return "\n".join(results) + + def create_review_request(self, reviewer_username: str) -> str: + """ + Creates a review request on *THE* open pull request + that matches the current active_branch. 
+ + Parameters: + reviewer_username(str): The username of the person who is being requested + + Returns: + str: A message confirming the creation of the review request + """ + pull_requests = self.github_repo_instance.get_pulls( + state="open", sort="created" + ) + # find PR against active_branch + pr = next( + (pr for pr in pull_requests if pr.head.ref == self.active_branch), None + ) + if pr is None: + return ( + "No open pull request found for the " + f"current branch `{self.active_branch}`" + ) + + try: + pr.create_review_request(reviewers=[reviewer_username]) + return ( + f"Review request created for user {reviewer_username} " + f"on PR #{pr.number}" + ) + except Exception as e: + return f"Failed to create a review request with error {e}" + def run(self, mode: str, query: str) -> str: - if mode == "get_issues": - return self.get_issues() - elif mode == "get_issue": + if mode == "get_issue": return json.dumps(self.get_issue(int(query))) + elif mode == "get_pull_request": + return json.dumps(self.get_pull_request(int(query))) + elif mode == "list_pull_request_files": + return json.dumps(self.list_pull_request_files(int(query))) + elif mode == "get_issues": + return self.get_issues() elif mode == "comment_on_issue": return self.comment_on_issue(query) elif mode == "create_file": @@ -313,5 +803,25 @@ class GitHubAPIWrapper(BaseModel): return self.update_file(query) elif mode == "delete_file": return self.delete_file(query) + elif mode == "list_open_pull_requests": + return self.list_open_pull_requests() + elif mode == "list_files_in_main_branch": + return self.list_files_in_main_branch() + elif mode == "list_files_in_bot_branch": + return self.list_files_in_bot_branch() + elif mode == "list_branches_in_repo": + return self.list_branches_in_repo() + elif mode == "set_active_branch": + return self.set_active_branch(query) + elif mode == "create_branch": + return self.create_branch(query) + elif mode == "get_files_from_directory": + return 
self.get_files_from_directory(query) + elif mode == "search_issues_and_prs": + return self.search_issues_and_prs(query) + elif mode == "search_code": + return self.search_code(query) + elif mode == "create_review_request": + return self.create_review_request(query) else: raise ValueError("Invalid mode" + mode)