From e1a24d09c52b5684057d507b86b0e55cbf30f060 Mon Sep 17 00:00:00 2001 From: david02871 Date: Mon, 8 Apr 2024 16:30:28 +0100 Subject: [PATCH] community: Add PHP language parser to document_loaders (#19850) **Description:** Added a PHP language parser to document_loaders **Issue:** N/A **Dependencies:** N/A **Twitter handle:** N/A --------- Co-authored-by: Chester Curme --- .../document_transformers/code_splitter.ipynb | 59 +++++++++++++++- .../parsers/language/language_parser.py | 3 + .../document_loaders/parsers/language/php.py | 35 ++++++++++ .../parsers/language/test_php.py | 68 +++++++++++++++++++ 4 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/php.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_php.py diff --git a/docs/docs/modules/data_connection/document_transformers/code_splitter.ipynb b/docs/docs/modules/data_connection/document_transformers/code_splitter.ipynb index 1090eb17ea6..43b2fb8ab61 100644 --- a/docs/docs/modules/data_connection/document_transformers/code_splitter.ipynb +++ b/docs/docs/modules/data_connection/document_transformers/code_splitter.ipynb @@ -612,6 +612,63 @@ "haskell_docs = haskell_splitter.create_documents([HASKELL_CODE])\n", "haskell_docs" ] + }, + { + "cell_type": "markdown", + "id": "4a11f7cd-cd85-430c-b307-5b5b5f07f8db", + "metadata": {}, + "source": [ + "## PHP\n", + "Here's an example using the PHP text splitter:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "90c66e7e-87a5-4a81-bece-7949aabf2369", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content=' "Language": + from tree_sitter_languages import get_language + + return get_language("php") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"// {text}" diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_php.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_php.py new file mode 100644 index 00000000000..c54df82dc7c --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_php.py @@ -0,0 +1,68 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.php import PHPSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestPHPSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """ None: + self.assertTrue(PHPSegmenter(" None: + segmenter = PHPSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = PHPSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code)