# langchain/scripts/clean_file.py

import argparse
from typing import Optional, Sequence

from langchain.base_language import BaseLanguageModel
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatAnthropic, ChatOpenAI


def parse_args(src: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line arguments of the proofreading script.

    Args:
        src: Argument strings to parse. When ``None``, argparse reads the
            arguments from ``sys.argv``.

    Returns:
        A namespace with ``file`` (path of the file to proofread) and
        ``model`` (one of ``"anthropic"``, ``"openai"`` or ``"auto"``;
        defaults to ``"auto"``).
    """
    description = (
        "Apply an LLM to a code file to proofread docstrings and edit grammar."
    )
    model_choices = ["anthropic", "openai", "auto"]

    cli = argparse.ArgumentParser(description=description)
    # Positional argument: the file whose docstrings will be reviewed.
    cli.add_argument("file", help="File to proofread.", type=str)
    # Optional backend selector; "auto" defers the decision to select_model.
    cli.add_argument(
        "--model",
        choices=model_choices,
        default="auto",
        help="Model to use.",
        type=str,
    )
    namespace = cli.parse_args(src)
    return namespace


def select_model(text: str, model: str) -> BaseLanguageModel:
    """Build the chat model that will process ``text``.

    Args:
        text: Contents of the file to be sent to the model. Only inspected
            when ``model`` is ``"auto"``.
        model: ``"anthropic"``, ``"openai"`` or ``"auto"``. With ``"auto"``
            the text is tokenized with the ``gpt-3.5-turbo-16k`` encoding and
            the Anthropic model is chosen when it exceeds 15800 tokens,
            otherwise the OpenAI model is used.

    Returns:
        A ``ChatAnthropic`` (``claude-v1-100k``) or ``ChatOpenAI``
        (``gpt-3.5-turbo-16k``) instance, both with ``temperature=0``.

    Raises:
        ValueError: If ``model`` is not one of the supported values.
    """
    # Guard clause: reject unknown selectors before doing any work.
    if model not in ("anthropic", "openai", "auto"):
        raise ValueError(f"Invalid model {model}")

    provider = model
    if provider == "auto":
        # Imported lazily so tiktoken is only needed for automatic selection.
        import tiktoken

        tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")
        token_count = len(tokenizer.encode(text))
        # NOTE(review): 15800 presumably leaves headroom below the 16k
        # context window for the prompt text — confirm.
        provider = "anthropic" if token_count > 15800 else "openai"

    if provider == "anthropic":
        return ChatAnthropic(model="claude-v1-100k", temperature=0)
    return ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def _strip_code_fence(response: str) -> str:
    """Remove a Markdown code fence wrapped around a model response.

    Args:
        response: Raw text returned by the model.

    Returns:
        ``response`` with surrounding whitespace removed, the opening fence
        line (backticks plus an optional language tag such as ``python``)
        dropped when present, and any leading/trailing backticks stripped.
    """
    body = response.strip()
    if body.startswith("```"):
        first_line, newline, remainder = body.partition("\n")
        # Treat the first line as a fence header only when it holds nothing
        # but the backticks and an optional language tag; a one-line
        # response such as ```code``` falls through to the plain strip.
        if newline and "`" not in first_line.lstrip("`"):
            body = remainder
    return body.strip("`")


def main(file: str, model: str) -> str:
    """Run the LLM over a file and return the proofread source.

    Args:
        file: Path of the file to proofread.
        model: Model selector forwarded to ``select_model``.

    Returns:
        The text produced by the chain, with any wrapping code fence removed.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # which differs between platforms.
    with open(file, encoding="utf-8") as f:
        text = f.read()
    model_ = select_model(text, model)
    # NOTE(review): the fence and {code} lines carry four leading spaces from
    # this literal's indentation, so the first line of the file is sent to
    # the model indented — confirm whether that is intended.
    template = """Please review the following code and improve the docstrings to make our documentation clean. Update the docstrings and descriptions as needed in the format of scikit-learn. Provide examples if necessary.
    ```
    {code}
    ```"""
    chain = LLMChain.from_string(llm=model_, template=template)
    cleaned = chain(text, return_only_outputs=True)["text"]
    # A plain strip("`") would leave a language tag (e.g. "python") behind
    # when the model answers with a tagged fence.
    return _strip_code_fence(cleaned)


if __name__ == "__main__":
    # Script entry point: parse the CLI arguments, proofread the requested
    # file, and write the result to stdout.
    cli_args = parse_args()
    print(main(cli_args.file, cli_args.model))