mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-01 10:54:15 +00:00
Add PythonLoader which auto-detects encoding of Python files (#3311)
This PR contributes a `PythonLoader`, which inherits from `TextLoader` but detects and sets the encoding automatically.
This commit is contained in:
parent
1ecbeec24e
commit
aa9d5707e0
@ -11,7 +11,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 3,
|
||||
"id": "019d8520",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -128,10 +128,69 @@
|
||||
"len(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "598a2805",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you need to load Python source code files, use the `PythonLoader`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "c558bd73",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import PythonLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "a3cfaba7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = DirectoryLoader('../../../../../', glob=\"**/*.py\", loader_cls=PythonLoader)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "e2e1e26a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "ffb8ff36",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"691"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "984c8429",
|
||||
"id": "7f6e0eae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
@ -153,7 +212,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.10.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -55,6 +55,7 @@ from langchain.document_loaders.pdf import (
|
||||
UnstructuredPDFLoader,
|
||||
)
|
||||
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
||||
from langchain.document_loaders.python import PythonLoader
|
||||
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||
from langchain.document_loaders.roam import RoamLoader
|
||||
from langchain.document_loaders.rtf import UnstructuredRTFLoader
|
||||
@ -156,4 +157,5 @@ __all__ = [
|
||||
"ImageCaptionLoader",
|
||||
"DiscordChatLoader",
|
||||
"ConfluenceLoader",
|
||||
"PythonLoader",
|
||||
]
|
||||
|
14
langchain/document_loaders/python.py
Normal file
14
langchain/document_loaders/python.py
Normal file
@ -0,0 +1,14 @@
|
||||
import tokenize
|
||||
|
||||
from langchain.document_loaders.text import TextLoader
|
||||
|
||||
|
||||
class PythonLoader(TextLoader):
    """Load a Python source file using the encoding the file itself declares.

    The encoding is determined with ``tokenize.detect_encoding`` and then
    passed, together with the path, to ``TextLoader``.
    """

    def __init__(self, file_path: str):
        """Detect the encoding of ``file_path`` and initialise the text loader.

        Args:
            file_path: Path of the Python file to load.
        """
        # Binary mode is required: detect_encoding consumes raw byte lines.
        with open(file_path, "rb") as source_file:
            detected = tokenize.detect_encoding(source_file.readline)
        # detect_encoding returns (encoding, lines_read); only the name is used.
        super().__init__(file_path=file_path, encoding=detected[0])
|
@ -148,6 +148,9 @@ select = [
|
||||
"F", # pyflakes
|
||||
"I", # isort
|
||||
]
|
||||
exclude = [
|
||||
"tests/integration_tests/examples/non-utf8-encoding.py",
|
||||
]
|
||||
|
||||
[tool.mypy]
|
||||
ignore_missing_imports = "True"
|
||||
|
19
tests/integration_tests/document_loaders/test_python.py
Normal file
19
tests/integration_tests/document_loaders/test_python.py
Normal file
@ -0,0 +1,19 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders.python import PythonLoader
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", ["default-encoding.py", "non-utf8-encoding.py"])
def test_python_loader(filename: str) -> None:
    """Check that PythonLoader returns one document tagged with its source path."""
    examples_dir = Path(__file__).parent.parent / "examples"
    expected_source = str(examples_dir / filename)

    documents = PythonLoader(expected_source).load()

    # One file in, exactly one document out.
    assert len(documents) == 1
    # The document must record the path it was loaded from.
    assert documents[0].metadata["source"] == expected_source
|
1
tests/integration_tests/examples/default-encoding.py
Normal file
1
tests/integration_tests/examples/default-encoding.py
Normal file
@ -0,0 +1 @@
|
||||
u = "🦜🔗"
|
3
tests/integration_tests/examples/non-utf8-encoding.py
Normal file
3
tests/integration_tests/examples/non-utf8-encoding.py
Normal file
@ -0,0 +1,3 @@
|
||||
# coding: iso-8859-5
|
||||
# ±¶ÿàáâãäåæçèéêëìíîï <- Cyrillic characters
|
||||
u = "®âðÄ"
|
Loading…
Reference in New Issue
Block a user