mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 08:33:49 +00:00
community: Add PHP language parser to document_loaders (#19850)
**Description:** Added a PHP language parser to document_loaders **Issue:** N/A **Dependencies:** N/A **Twitter handle:** N/A --------- Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
parent
2f03bc397e
commit
e1a24d09c5
@ -612,6 +612,63 @@
|
||||
"haskell_docs = haskell_splitter.create_documents([HASKELL_CODE])\n",
|
||||
"haskell_docs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4a11f7cd-cd85-430c-b307-5b5b5f07f8db",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## PHP\n",
|
||||
"Here's an example using the PHP text splitter:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "90c66e7e-87a5-4a81-bece-7949aabf2369",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='<?php\\nnamespace foo;'),\n",
|
||||
" Document(page_content='class Hello {'),\n",
|
||||
" Document(page_content='public function __construct() { }\\n}'),\n",
|
||||
" Document(page_content='function hello() {\\n echo \"Hello World!\";\\n}'),\n",
|
||||
" Document(page_content='interface Human {\\n public function breath();\\n}'),\n",
|
||||
" Document(page_content='trait Foo { }\\nenum Color\\n{\\n case Red;'),\n",
|
||||
" Document(page_content='case Blue;\\n}')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"PHP_CODE = \"\"\"<?php\n",
|
||||
"namespace foo;\n",
|
||||
"class Hello {\n",
|
||||
" public function __construct() { }\n",
|
||||
"}\n",
|
||||
"function hello() {\n",
|
||||
" echo \"Hello World!\";\n",
|
||||
"}\n",
|
||||
"interface Human {\n",
|
||||
" public function breath();\n",
|
||||
"}\n",
|
||||
"trait Foo { }\n",
|
||||
"enum Color\n",
|
||||
"{\n",
|
||||
" case Red;\n",
|
||||
" case Blue;\n",
|
||||
"}\"\"\"\n",
|
||||
"php_splitter = RecursiveCharacterTextSplitter.from_language(\n",
|
||||
" language=Language.PHP, chunk_size=50, chunk_overlap=0\n",
|
||||
")\n",
|
||||
"php_docs = php_splitter.create_documents([PHP_CODE])\n",
|
||||
"php_docs"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@ -630,7 +687,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.10.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -18,6 +18,7 @@ from langchain_community.document_loaders.parsers.language.javascript import (
|
||||
from langchain_community.document_loaders.parsers.language.kotlin import KotlinSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.lua import LuaSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.perl import PerlSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.php import PHPSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.ruby import RubySegmenter
|
||||
from langchain_community.document_loaders.parsers.language.rust import RustSegmenter
|
||||
@ -42,6 +43,7 @@ LANGUAGE_EXTENSIONS: Dict[str, str] = {
|
||||
"pl": "perl",
|
||||
"ts": "ts",
|
||||
"java": "java",
|
||||
"php": "php",
|
||||
}
|
||||
|
||||
LANGUAGE_SEGMENTERS: Dict[str, Any] = {
|
||||
@ -60,6 +62,7 @@ LANGUAGE_SEGMENTERS: Dict[str, Any] = {
|
||||
"perl": PerlSegmenter,
|
||||
"ts": TypeScriptSegmenter,
|
||||
"java": JavaSegmenter,
|
||||
"php": PHPSegmenter,
|
||||
}
|
||||
|
||||
Language = Literal[
|
||||
|
@ -0,0 +1,35 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Query text returned by PHPSegmenter.get_chunk_query(). It is a bracketed
# list of alternatives, one per PHP node type (function, class, interface,
# trait, enum, namespace), each tagged with an @capture name.
# Surrounding whitespace is removed by the trailing .strip().
CHUNK_QUERY = """
|
||||
[
|
||||
(function_definition) @function
|
||||
(class_declaration) @class
|
||||
(interface_declaration) @interface
|
||||
(trait_declaration) @trait
|
||||
(enum_declaration) @enum
|
||||
(namespace_definition) @namespace
|
||||
]
|
||||
""".strip()
|
||||
|
||||
|
||||
class PHPSegmenter(TreeSitterSegmenter):
    """Tree-sitter based code segmenter for PHP source."""

    def get_language(self) -> "Language":
        """Return the grammar that tree_sitter_languages registers as "php"."""
        # Function-scope import: tree_sitter_languages is only needed once
        # this method is actually called.
        import tree_sitter_languages

        php_grammar = tree_sitter_languages.get_language("php")
        return php_grammar

    def get_chunk_query(self) -> str:
        """Return the module-level query naming the PHP constructs to chunk."""
        query = CHUNK_QUERY
        return query

    def make_line_comment(self, text: str) -> str:
        """Return ``text`` rendered as a ``//`` line comment."""
        marker = "//"
        return f"{marker} {text}"
|
@ -0,0 +1,68 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.php import PHPSegmenter
|
||||
|
||||
|
||||
# Unit tests for PHPSegmenter: validity detection, chunk extraction and code
# simplification. The marker below names the tree_sitter and
# tree_sitter_languages packages as requirements (presumably the suite is
# skipped when they are missing -- confirm against the project's pytest config).
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
|
||||
class TestPHPSegmenter(unittest.TestCase):
|
||||
# Builds the shared fixtures: one PHP sample plus the outputs expected from it.
def setUp(self) -> None:
|
||||
# PHP sample containing a namespace, class, function, interface, trait and enum.
self.example_code = """<?php
|
||||
namespace foo;
|
||||
|
||||
class Hello {
|
||||
public function __construct() { }
|
||||
}
|
||||
|
||||
function hello() {
|
||||
echo "Hello World!";
|
||||
}
|
||||
|
||||
interface Human {
|
||||
public function breath();
|
||||
}
|
||||
|
||||
trait Foo { }
|
||||
|
||||
enum Color
|
||||
{
|
||||
case Red;
|
||||
case Blue;
|
||||
}"""
|
||||
|
||||
# Expected simplify_code() output: the opening tag is kept and each construct
# is reduced to a "// Code for: <first line>" comment.
self.expected_simplified_code = """<?php
|
||||
// Code for: namespace foo;
|
||||
|
||||
// Code for: class Hello {
|
||||
|
||||
// Code for: function hello() {
|
||||
|
||||
// Code for: interface Human {
|
||||
|
||||
// Code for: trait Foo { }
|
||||
|
||||
// Code for: enum Color"""
|
||||
|
||||
# Expected extract_functions_classes() output: one string per construct,
# in the order they appear in the sample.
self.expected_extracted_code = [
|
||||
"namespace foo;",
|
||||
"class Hello {\n public function __construct() { }\n}",
|
||||
'function hello() {\n echo "Hello World!";\n}',
|
||||
"interface Human {\n public function breath();\n}",
|
||||
"trait Foo { }",
|
||||
"enum Color\n{\n case Red;\n case Blue;\n}",
|
||||
]
|
||||
|
||||
# is_valid() must be True for a well-formed snippet and False for garbage.
def test_is_valid(self) -> None:
|
||||
self.assertTrue(PHPSegmenter("<?php $a = 0;").is_valid())
|
||||
# Deliberately malformed input that is expected to be rejected.
self.assertFalse(PHPSegmenter("<?php a ?b}+ c 1 2 3").is_valid())
|
||||
|
||||
# extract_functions_classes() must return the expected list of chunks.
def test_extract_functions_classes(self) -> None:
|
||||
segmenter = PHPSegmenter(self.example_code)
|
||||
extracted_code = segmenter.extract_functions_classes()
|
||||
self.assertEqual(extracted_code, self.expected_extracted_code)
|
||||
|
||||
# simplify_code() must return the expected placeholder-comment form.
def test_simplify_code(self) -> None:
|
||||
segmenter = PHPSegmenter(self.example_code)
|
||||
simplified_code = segmenter.simplify_code()
|
||||
self.assertEqual(simplified_code, self.expected_simplified_code)
|
Loading…
Reference in New Issue
Block a user