mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-05 04:55:14 +00:00
feat (documents): add a source code loader based on AST manipulation (#6486)
#### Summary

A new approach to loading source code is implemented: each top-level function and class in the code is loaded into a separate document. Then, an additional document is created with the top-level code, but without the already loaded functions and classes.

This could improve the accuracy of QA chains over source code.

For instance, having this script:

```
class MyClass:
    def __init__(self, name):
        self.name = name

    def greet(self):
        print(f"Hello, {self.name}!")


def main():
    name = input("Enter your name: ")
    obj = MyClass(name)
    obj.greet()


if __name__ == '__main__':
    main()
```

The loader will create three documents with this content:

First document:

```
class MyClass:
    def __init__(self, name):
        self.name = name

    def greet(self):
        print(f"Hello, {self.name}!")
```

Second document:

```
def main():
    name = input("Enter your name: ")
    obj = MyClass(name)
    obj.greet()
```

Third document:

```
# Code for: class MyClass:

# Code for: def main():

if __name__ == '__main__':
    main()
```

A threshold parameter is added to control whether small scripts are split in this way or not.

At this moment, only Python and JavaScript are supported. The appropriate parser is determined by examining the file extension.

#### Tests

This PR adds:

- Unit tests
- Integration tests

#### Dependencies

Only one dependency was added as optional (needed for the JavaScript parser).

#### Documentation

A notebook is added showing how the loader can be used.

#### Who can review?

@eyurtsev @hwchase17

---------

Co-authored-by: rlm <pexpresss31@gmail.com>
This commit is contained in:
committed by
GitHub
parent
da462d9dd4
commit
e494b0a09f
@@ -0,0 +1,17 @@
|
||||
/**
 * Greeter that remembers a name and logs a greeting for it.
 */
class MyClass {
  /**
   * @param name - Value interpolated into the greeting by `greet()`.
   */
  constructor(name) {
    this.name = name;
  }

  /**
   * Log `Hello, <name>!` to the console.
   */
  greet() {
    console.log(`Hello, ${this.name}!`);
  }
}
|
||||
|
||||
/**
 * Ask for a name with `prompt()` and greet it through `MyClass`.
 */
function main() {
  const name = prompt("Enter your name:");
  const obj = new MyClass(name);
  obj.greet();
}

// Script entry point: run the prompt as soon as the file is evaluated.
main();
|
@@ -0,0 +1,16 @@
|
||||
class MyClass:
    """Greeter that remembers a name and prints a greeting for it."""

    def __init__(self, name):
        """Store ``name`` so that ``greet`` can use it later."""
        self.name = name

    def greet(self):
        """Print ``Hello, <name>!`` to standard output."""
        print(f"Hello, {self.name}!")
|
||||
|
||||
|
||||
def main():
    """Read a name from standard input and greet it through ``MyClass``."""
    name = input("Enter your name: ")
    obj = MyClass(name)
    obj.greet()


if __name__ == "__main__":
    # Run the prompt only when executed as a script, not when imported.
    main()
|
@@ -0,0 +1,419 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "213a38a2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Source Code\n",
|
||||
"\n",
|
||||
"This notebook covers how to load source code files using a special approach with language parsing: each top-level function and class in the code is loaded into separate documents. Any remaining top-level code outside the already loaded functions and classes will be loaded into a separate document.\n",
|
||||
"\n",
|
||||
"This approach can potentially improve the accuracy of QA models over source code. Currently, the supported languages for code parsing are Python and JavaScript. The language used for parsing can be configured, along with the minimum number of lines required to activate the splitting based on syntax."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7fa47b2e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install esprima"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "beb55c2f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')\n",
|
||||
"from pprint import pprint\n",
|
||||
"from langchain.text_splitter import Language\n",
|
||||
"from langchain.document_loaders.generic import GenericLoader\n",
|
||||
"from langchain.document_loaders.parsers import LanguageParser"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "64056e07",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = GenericLoader.from_filesystem(\n",
|
||||
" \"./example_data/source_code\",\n",
|
||||
" glob=\"*\",\n",
|
||||
" suffixes=[\".py\", \".js\"],\n",
|
||||
" parser=LanguageParser()\n",
|
||||
")\n",
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8af79bd7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"6"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "85edf3fc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'content_type': 'functions_classes',\n",
|
||||
" 'language': <Language.PYTHON: 'python'>,\n",
|
||||
" 'source': 'example_data/source_code/example.py'}\n",
|
||||
"{'content_type': 'functions_classes',\n",
|
||||
" 'language': <Language.PYTHON: 'python'>,\n",
|
||||
" 'source': 'example_data/source_code/example.py'}\n",
|
||||
"{'content_type': 'simplified_code',\n",
|
||||
" 'language': <Language.PYTHON: 'python'>,\n",
|
||||
" 'source': 'example_data/source_code/example.py'}\n",
|
||||
"{'content_type': 'functions_classes',\n",
|
||||
" 'language': <Language.JS: 'js'>,\n",
|
||||
" 'source': 'example_data/source_code/example.js'}\n",
|
||||
"{'content_type': 'functions_classes',\n",
|
||||
" 'language': <Language.JS: 'js'>,\n",
|
||||
" 'source': 'example_data/source_code/example.js'}\n",
|
||||
"{'content_type': 'simplified_code',\n",
|
||||
" 'language': <Language.JS: 'js'>,\n",
|
||||
" 'source': 'example_data/source_code/example.js'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for document in docs:\n",
|
||||
" pprint(document.metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "f44e3e37",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"class MyClass:\n",
|
||||
" def __init__(self, name):\n",
|
||||
" self.name = name\n",
|
||||
"\n",
|
||||
" def greet(self):\n",
|
||||
" print(f\"Hello, {self.name}!\")\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"def main():\n",
|
||||
" name = input(\"Enter your name: \")\n",
|
||||
" obj = MyClass(name)\n",
|
||||
" obj.greet()\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"# Code for: class MyClass:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Code for: def main():\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" main()\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"class MyClass {\n",
|
||||
" constructor(name) {\n",
|
||||
" this.name = name;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" greet() {\n",
|
||||
" console.log(`Hello, ${this.name}!`);\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"function main() {\n",
|
||||
" const name = prompt(\"Enter your name:\");\n",
|
||||
" const obj = new MyClass(name);\n",
|
||||
" obj.greet();\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"// Code for: class MyClass {\n",
|
||||
"\n",
|
||||
"// Code for: function main() {\n",
|
||||
"\n",
|
||||
"main();\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"\\n\\n--8<--\\n\\n\".join([document.page_content for document in docs]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "69aad0ed",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The parser can be disabled for small files. \n",
|
||||
"\n",
|
||||
"The parameter `parser_threshold` indicates the minimum number of lines that the source code file must have to be segmented using the parser."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "ae024794",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = GenericLoader.from_filesystem(\n",
|
||||
" \"./example_data/source_code\",\n",
|
||||
" glob=\"*\",\n",
|
||||
" suffixes=[\".py\"],\n",
|
||||
" parser=LanguageParser(language=Language.PYTHON, parser_threshold=1000)\n",
|
||||
")\n",
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "5d3b372a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "89e546ad",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"class MyClass:\n",
|
||||
" def __init__(self, name):\n",
|
||||
" self.name = name\n",
|
||||
"\n",
|
||||
" def greet(self):\n",
|
||||
" print(f\"Hello, {self.name}!\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def main():\n",
|
||||
" name = input(\"Enter your name: \")\n",
|
||||
" obj = MyClass(name)\n",
|
||||
" obj.greet()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" main()\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c9c71e61",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Splitting\n",
|
||||
"\n",
|
||||
"Additional splitting could be needed for those functions, classes, or scripts that are too big."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "adbaa79f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = GenericLoader.from_filesystem(\n",
|
||||
" \"./example_data/source_code\",\n",
|
||||
" glob=\"*\",\n",
|
||||
" suffixes=[\".js\"],\n",
|
||||
" parser=LanguageParser(language=Language.JS)\n",
|
||||
")\n",
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "c44c0d3f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.text_splitter import (\n",
|
||||
" RecursiveCharacterTextSplitter,\n",
|
||||
" Language,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "b1e0053d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"js_splitter = RecursiveCharacterTextSplitter.from_language(\n",
|
||||
" language=Language.JS, chunk_size=60, chunk_overlap=0\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "7dbe6188",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"result = js_splitter.split_documents(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "8a80d089",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"7"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "000a6011",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"class MyClass {\n",
|
||||
" constructor(name) {\n",
|
||||
" this.name = name;\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"greet() {\n",
|
||||
" console.log(`Hello, ${this.name}!`);\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"function main() {\n",
|
||||
" const name = prompt(\"Enter your name:\");\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"const obj = new MyClass(name);\n",
|
||||
" obj.greet();\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"// Code for: class MyClass {\n",
|
||||
"\n",
|
||||
"// Code for: function main() {\n",
|
||||
"\n",
|
||||
"--8<--\n",
|
||||
"\n",
|
||||
"main();\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"\\n\\n--8<--\\n\\n\".join([document.page_content for document in result]))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Reference in New Issue
Block a user