community: Add PHP language parser to document_loaders (#19850)

**Description:**
Added a PHP language parser (`PHPSegmenter`) to `document_loaders` and registered it for the `php` file extension.
**Issue:** N/A
**Dependencies:** N/A
**Twitter handle:** N/A

---------

Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
david02871
2024-04-08 16:30:28 +01:00
committed by GitHub
parent 2f03bc397e
commit e1a24d09c5
4 changed files with 164 additions and 1 deletions

View File

@@ -18,6 +18,7 @@ from langchain_community.document_loaders.parsers.language.javascript import (
from langchain_community.document_loaders.parsers.language.kotlin import KotlinSegmenter
from langchain_community.document_loaders.parsers.language.lua import LuaSegmenter
from langchain_community.document_loaders.parsers.language.perl import PerlSegmenter
from langchain_community.document_loaders.parsers.language.php import PHPSegmenter
from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
from langchain_community.document_loaders.parsers.language.ruby import RubySegmenter
from langchain_community.document_loaders.parsers.language.rust import RustSegmenter
@@ -42,6 +43,7 @@ LANGUAGE_EXTENSIONS: Dict[str, str] = {
"pl": "perl",
"ts": "ts",
"java": "java",
"php": "php",
}
LANGUAGE_SEGMENTERS: Dict[str, Any] = {
@@ -60,6 +62,7 @@ LANGUAGE_SEGMENTERS: Dict[str, Any] = {
"perl": PerlSegmenter,
"ts": TypeScriptSegmenter,
"java": JavaSegmenter,
"php": PHPSegmenter,
}
Language = Literal[

View File

@@ -0,0 +1,35 @@
from typing import TYPE_CHECKING
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
TreeSitterSegmenter,
)
if TYPE_CHECKING:
from tree_sitter import Language
# Tree-sitter query returned by PHPSegmenter.get_chunk_query().
# It is a single alternation ([...]) over the PHP node types listed below:
# function, class, interface, trait, enum and namespace definitions, each
# tagged with its own capture name (@function, @class, ...).
# Surrounding whitespace is removed with .strip() when the module is imported.
CHUNK_QUERY = """
[
(function_definition) @function
(class_declaration) @class
(interface_declaration) @interface
(trait_declaration) @trait
(enum_declaration) @enum
(namespace_definition) @namespace
]
""".strip()
class PHPSegmenter(TreeSitterSegmenter):
    """Tree-sitter based code segmenter for PHP sources.

    Provides the three PHP-specific pieces used by ``TreeSitterSegmenter``:
    the grammar, the chunk query and the line-comment syntax.
    """

    def get_language(self) -> "Language":
        """Return the tree-sitter ``Language`` object for PHP.

        ``tree_sitter_languages`` is imported inside the method so that the
        module can be imported without the optional dependency installed.
        """
        import tree_sitter_languages

        return tree_sitter_languages.get_language("php")

    def get_chunk_query(self) -> str:
        """Return the module-level tree-sitter query string ``CHUNK_QUERY``."""
        query: str = CHUNK_QUERY
        return query

    def make_line_comment(self, text: str) -> str:
        """Return ``text`` as a PHP single-line comment (``// text``)."""
        return "// {}".format(text)

View File

@@ -0,0 +1,68 @@
import unittest
import pytest
from langchain_community.document_loaders.parsers.language.php import PHPSegmenter
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestPHPSegmenter(unittest.TestCase):
    """Unit tests for ``PHPSegmenter``.

    The tests need the optional ``tree_sitter`` and ``tree_sitter_languages``
    packages, as declared by the ``requires`` marker on the class.
    """

    def setUp(self) -> None:
        """Build the PHP fixture and the outputs expected from the segmenter."""
        # The fixture contains one instance of every construct named in the
        # PHP chunk query: namespace, class, function, interface, trait, enum.
        # Body lines are indented with four spaces; the expected chunks below
        # reproduce the fixture text verbatim, so their indentation must
        # agree with it character for character.
        self.example_code = """<?php
namespace foo;
class Hello {
    public function __construct() { }
}
function hello() {
    echo "Hello World!";
}
interface Human {
    public function breath();
}
trait Foo { }
enum Color
{
    case Red;
    case Blue;
}"""

        # Each top-level chunk is expected to collapse to a single
        # "// Code for: <first line of the chunk>" comment.
        self.expected_simplified_code = """<?php
// Code for: namespace foo;
// Code for: class Hello {
// Code for: function hello() {
// Code for: interface Human {
// Code for: trait Foo { }
// Code for: enum Color"""

        # Expected chunks, in source order, as exact text of the fixture.
        self.expected_extracted_code = [
            "namespace foo;",
            "class Hello {\n    public function __construct() { }\n}",
            'function hello() {\n    echo "Hello World!";\n}',
            "interface Human {\n    public function breath();\n}",
            "trait Foo { }",
            "enum Color\n{\n    case Red;\n    case Blue;\n}",
        ]

    def test_is_valid(self) -> None:
        """Well-formed PHP is reported valid, malformed PHP is not."""
        self.assertTrue(PHPSegmenter("<?php $a = 0;").is_valid())
        self.assertFalse(PHPSegmenter("<?php a ?b}+ c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Every top-level construct of the fixture is extracted verbatim."""
        segmenter = PHPSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Each extracted chunk is replaced by a one-line ``// Code for:`` stub."""
        segmenter = PHPSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)