From ef082c77b14399b186512cd9f220d1c5d65ac4e8 Mon Sep 17 00:00:00 2001
From: Frank
Date: Wed, 7 Feb 2024 01:42:33 +0800
Subject: [PATCH] =?UTF-8?q?community[minor]:=20add=20github=20file=20loade?=
 =?UTF-8?q?r=20to=20load=20any=20github=20file=20content=20b=E2=80=A6=20(#?=
 =?UTF-8?q?15305)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Description
Support loading the content of any GitHub file, selected by file extension.

Why not use the [git loader](https://python.langchain.com/docs/integrations/document_loaders/git#load-existing-repository-from-disk)?
The git loader clones the whole repository even when you are only interested in a subset of the files, which is too heavy. This GithubFileLoader downloads only the files you are interested in.

### Twitter handle
My Twitter: @shufanhaotop

---------

Co-authored-by: Hao Fan
Co-authored-by: Bagatur
---
 .../document_loaders/github.ipynb             | 152 +++++++-----------
 .../document_loaders/__init__.py              |   6 +-
 .../document_loaders/github.py                |  63 +++++++-
 .../document_loaders/test_github.py           | 109 ++++++++++++-
 .../document_loaders/test_imports.py          |   1 +
 5 files changed, 232 insertions(+), 99 deletions(-)

diff --git a/docs/docs/integrations/document_loaders/github.ipynb b/docs/docs/integrations/document_loaders/github.ipynb
index 3d9f57243f0..4b7bb7cdb28 100644
--- a/docs/docs/integrations/document_loaders/github.ipynb
+++ b/docs/docs/integrations/document_loaders/github.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "# GitHub\n",
     "\n",
-    "This notebooks shows how you can load issues and pull requests (PRs) for a given repository on [GitHub](https://github.com/). We will use the LangChain Python repository as an example."
+    "This notebook shows how you can load issues and pull requests (PRs) for a given repository on [GitHub](https://github.com/). It also shows how you can load GitHub files for a given repository on [GitHub](https://github.com/). We will use the LangChain Python repository as an example."
   ]
  },
 {
@@ -46,7 +46,7 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 10,
+  "execution_count": null,
  "metadata": {
   "tags": []
  },
@@ -57,7 +57,7 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 11,
+  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -91,7 +91,7 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 12,
+  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -100,27 +100,9 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 13,
+  "execution_count": null,
  "metadata": {},
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "# Creates GitHubLoader (#5257)\r\n",
-     "\r\n",
-     "GitHubLoader is a DocumentLoader that loads issues and PRs from GitHub.\r\n",
-     "\r\n",
-     "Fixes #5257\r\n",
-     "\r\n",
-     "Community members can review the PR once tests pass. 
Tag maintainers/contributors who might be interested:\r\n",
-     "DataLoaders\r\n",
-     "- @eyurtsev\r\n",
-     "\n",
-     "{'url': 'https://github.com/langchain-ai/langchain/pull/5408', 'title': 'DocumentLoader for GitHub', 'creator': 'UmerHA', 'created_at': '2023-05-29T14:50:53Z', 'comments': 0, 'state': 'open', 'labels': ['enhancement', 'lgtm', 'doc loader'], 'assignee': None, 'milestone': None, 'locked': False, 'number': 5408, 'is_pull_request': True}\n"
-    ]
-   }
-  ],
+  "outputs": [],
  "source": [
   "print(docs[0].page_content)\n",
   "print(docs[0].metadata)"
  ]
 },
@@ -142,7 +124,7 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 14,
+  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -157,84 +139,68 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 15,
+  "execution_count": null,
  "metadata": {},
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "### System Info\n",
-     "\n",
-     "LangChain version = 0.0.167\r\n",
-     "Python version = 3.11.0\r\n",
-     "System = Windows 11 (using Jupyter)\n",
-     "\n",
-     "### Who can help?\n",
-     "\n",
-     "- @hwchase17\r\n",
-     "- @agola11\r\n",
-     "- @UmerHA (I have a fix ready, will submit a PR)\n",
-     "\n",
-     "### Information\n",
-     "\n",
-     "- [ ] The official example notebooks/scripts\n",
-     "- [X] My own modified scripts\n",
-     "\n",
-     "### Related Components\n",
-     "\n",
-     "- [X] LLMs/Chat Models\n",
-     "- [ ] Embedding Models\n",
-     "- [X] Prompts / Prompt Templates / Prompt Selectors\n",
-     "- [ ] Output Parsers\n",
-     "- [ ] Document Loaders\n",
-     "- [ ] Vector Stores / Retrievers\n",
-     "- [ ] Memory\n",
-     "- [ ] Agents / Agent Executors\n",
-     "- [ ] Tools / Toolkits\n",
-     "- [ ] Chains\n",
-     "- [ ] Callbacks/Tracing\n",
-     "- [ ] Async\n",
-     "\n",
-     "### Reproduction\n",
-     "\n",
-     "```\r\n",
-     "import os\r\n",
-     "os.environ[\"OPENAI_API_KEY\"] = \"...\"\r\n",
-     "\r\n",
-     "from langchain.chains import LLMChain\r\n",
-     "from langchain_openai import ChatOpenAI\r\n",
-     "from langchain.prompts import PromptTemplate\r\n",
-     "from langchain.prompts.chat import ChatPromptTemplate\r\n",
-     "from langchain.schema import messages_from_dict\r\n",
-     "\r\n",
-     "role_strings = [\r\n",
-     " (\"system\", \"you are a bird expert\"), \r\n",
-     " (\"human\", \"which bird has a point beak?\")\r\n",
-     "]\r\n",
-     "prompt = ChatPromptTemplate.from_role_strings(role_strings)\r\n",
-     "chain = LLMChain(llm=ChatOpenAI(), prompt=prompt)\r\n",
-     "chain.run({})\r\n",
-     "```\n",
-     "\n",
-     "### Expected behavior\n",
-     "\n",
-     "Chain should run\n",
-     "{'url': 'https://github.com/langchain-ai/langchain/issues/5027', 'title': \"ChatOpenAI models don't work with prompts created via ChatPromptTemplate.from_role_strings\", 'creator': 'UmerHA', 'created_at': '2023-05-20T10:39:18Z', 'comments': 1, 'state': 'open', 'labels': [], 'assignee': None, 'milestone': None, 'locked': False, 'number': 5027, 'is_pull_request': False}\n"
-    ]
-   }
-  ],
+  "outputs": [],
  "source": [
   "print(docs[0].page_content)\n",
   "print(docs[0].metadata)"
  ]
 },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "## Load GitHub File Content\n",
+   "\n",
+   "The code below loads all markdown files in the repo `langchain-ai/langchain`."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 1,
+  "metadata": {
+   "tags": []
+  },
+  "outputs": [],
+  "source": [
+   "from langchain.document_loaders import GithubFileLoader"
+  ]
+ },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
- "source": []
+ "source": [
+  "loader = GithubFileLoader(\n",
+  "    repo=\"langchain-ai/langchain\",  # the 
repo name\n",
+  "    access_token=ACCESS_TOKEN,\n",
+  "    github_api_url=\"https://api.github.com\",\n",
+  "    file_filter=lambda file_path: file_path.endswith(\n",
+  "        \".md\"\n",
+  "    ),  # load all markdown files.\n",
+  ")\n",
+  "documents = loader.load()"
+ ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Example output for one of the documents:\n",
+   "\n",
+   "```json\n",
+   "documents[0].metadata: \n",
+   "    {\n",
+   "      \"path\": \"README.md\",\n",
+   "      \"sha\": \"82f1c4ea88ecf8d2dfsfx06a700e84be4\",\n",
+   "      \"source\": \"https://github.com/langchain-ai/langchain/blob/master/README.md\"\n",
+   "    }\n",
+   "documents[0].page_content:\n",
+   "    mock content\n",
+   "```"
+  ]
+ }
 ],
 "metadata": {
@@ -253,7 +219,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py
index b06f416d4da..869b0dca032 100644
--- a/libs/community/langchain_community/document_loaders/__init__.py
+++ b/libs/community/langchain_community/document_loaders/__init__.py
@@ -102,7 +102,10 @@ from langchain_community.document_loaders.gcs_file import GCSFileLoader
 from langchain_community.document_loaders.geodataframe import GeoDataFrameLoader
 from langchain_community.document_loaders.git import GitLoader
 from langchain_community.document_loaders.gitbook import GitbookLoader
-from langchain_community.document_loaders.github import GitHubIssuesLoader
+from langchain_community.document_loaders.github import (
+    GithubFileLoader,
+    GitHubIssuesLoader,
+)
 from langchain_community.document_loaders.google_speech_to_text import (
     GoogleSpeechToTextLoader,
 )
@@ -296,6 +299,7 @@ __all__ = [
     "GCSDirectoryLoader",
     "GCSFileLoader",
     "GeoDataFrameLoader",
+    "GithubFileLoader",
     "GitHubIssuesLoader",
     "GitLoader",
     "GitbookLoader",
diff --git a/libs/community/langchain_community/document_loaders/github.py b/libs/community/langchain_community/document_loaders/github.py
index 77bdf7da6c7..8a361a46e2b 100644
--- a/libs/community/langchain_community/document_loaders/github.py
+++ b/libs/community/langchain_community/document_loaders/github.py
@@ -1,6 +1,7 @@
+import base64
 from abc import ABC
 from datetime import datetime
-from typing import Dict, Iterator, List, Literal, Optional, Union
+from typing import Callable, Dict, Iterator, List, Literal, Optional, Union
 
 import requests
 from langchain_core.documents import Document
@@ -20,7 +21,7 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
     github_api_url: str = "https://api.github.com"
     """URL of GitHub API"""
 
-    @root_validator(pre=True)
+    @root_validator(pre=True, allow_reuse=True)
     def validate_environment(cls, values: Dict) -> Dict:
         """Validate that access token exists in environment."""
         values["access_token"] = get_from_dict_or_env(
@@ -65,7 +66,7 @@ class GitHubIssuesLoader(BaseGitHubLoader):
     """Only show notifications updated after the given time. 
This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ.""" - @validator("since") + @validator("since", allow_reuse=True) def validate_since(cls, v: Optional[str]) -> Optional[str]: if v: try: @@ -186,3 +187,59 @@ class GitHubIssuesLoader(BaseGitHubLoader): def url(self) -> str: """Create URL for GitHub API.""" return f"{self.github_api_url}/repos/{self.repo}/issues?{self.query_params}" + + +class GithubFileLoader(BaseGitHubLoader, ABC): + """Load GitHub File""" + + file_extension: str = ".md" + branch: str = "main" + + file_filter: Optional[Callable[[str], bool]] + + def get_file_paths(self) -> List[Dict]: + base_url = ( + f"{self.github_api_url}/api/v3/repos/{self.repo}/git/trees/" + f"{self.branch}?recursive=1" + ) + response = requests.get(base_url, headers=self.headers) + response.raise_for_status() + all_files = response.json()["tree"] + """ one element in all_files + { + 'path': '.github', + 'mode': '040000', + 'type': 'tree', + 'sha': '89a2ae046e8b59eb96531f123c0c6d4913885df1', + 'url': 'https://github.com/api/v3/repos/shufanhao/langchain/git/trees/89a2ae046e8b59eb96531f123c0c6d4913885dxxx' + } + """ + return [ + f + for f in all_files + if not (self.file_filter and not self.file_filter(f["path"])) + ] + + def get_file_content_by_path(self, path: str) -> str: + base_url = f"{self.github_api_url}/api/v3/repos/{self.repo}/contents/{path}" + response = requests.get(base_url, headers=self.headers) + response.raise_for_status() + + content_encoded = response.json()["content"] + return base64.b64decode(content_encoded).decode("utf-8") + + def load(self) -> List[Document]: + documents = [] + + files = self.get_file_paths() + for file in files: + content = self.get_file_content_by_path(file["path"]) + metadata = { + "path": file["path"], + "sha": file["sha"], + "source": f"{self.github_api_url}/{self.repo}/{file['type']}/" + f"{self.branch}/{file['path']}", + } + documents.append(Document(page_content=content, metadata=metadata)) + + return documents diff --git a/libs/community/tests/unit_tests/document_loaders/test_github.py b/libs/community/tests/unit_tests/document_loaders/test_github.py index 5641836dcd6..f9f74be6975 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_github.py +++ b/libs/community/tests/unit_tests/document_loaders/test_github.py @@ -1,8 +1,13 @@ +import base64 + import pytest from langchain_core.documents import Document from pytest_mock import MockerFixture -from langchain_community.document_loaders.github import GitHubIssuesLoader +from langchain_community.document_loaders.github import ( + GithubFileLoader, + GitHubIssuesLoader, +) def test_initialization() -> None: @@ -48,7 +53,7 @@ def test_invalid_initialization() -> None: GitHubIssuesLoader(since="not_a_date") -def test_load(mocker: MockerFixture) -> None: +def test_load_github_issue(mocker: MockerFixture) -> None: mocker.patch( "requests.get", return_value=mocker.MagicMock(json=lambda: [], links=None) ) @@ -127,3 +132,103 @@ def test_url() -> None: "&assignee=user1&creator=user2&mentioned=user3&labels=bug,ui,@high" "&sort=comments&direction=asc&since=2023-05-26T00:00:00Z" ) + + +def test_github_file_content_get_file_paths(mocker: MockerFixture) -> None: + # Mock the requests.get method to simulate the API response + mocker.patch( + "requests.get", + return_value=mocker.MagicMock( + json=lambda: { + "tree": [ + { + "path": "readme.md", + "mode": "100644", + "type": "blob", + "sha": "789", + "size": 37, + "url": "https://github.com/api/v3/repos/shufanhao/langchain/git/blobs/789", + } + ] + }, 
+ status_code=200, + ), + ) + + # case1: add file_filter + loader = GithubFileLoader( + repo="shufanhao/langchain", + access_token="access_token", + github_api_url="https://github.com", + file_filter=lambda file_path: file_path.endswith(".md"), + ) + + # Call the load method + files = loader.get_file_paths() + + # Assert the results + assert len(files) == 1 + assert files[0]["path"] == "readme.md" + + # case2: didn't add file_filter + loader = GithubFileLoader( + repo="shufanhao/langchain", + access_token="access_token", + github_api_url="https://github.com", + ) + + # Call the load method + files = loader.get_file_paths() + assert len(files) == 1 + assert files[0]["path"] == "readme.md" + + # case3: add file_filter with a non-exist file path + loader = GithubFileLoader( + repo="shufanhao/langchain", + access_token="access_token", + github_api_url="https://github.com", + file_filter=lambda file_path: file_path.endswith(".py"), + ) + + # Call the load method + files = loader.get_file_paths() + assert len(files) == 0 + + +def test_github_file_content_loader(mocker: MockerFixture) -> None: + # Mock the requests.get method to simulate the API response + file_path_res = mocker.MagicMock( + json=lambda: { + "tree": [ + { + "path": "readme.md", + "mode": "100644", + "type": "blob", + "sha": "789", + "size": 37, + "url": "https://github.com/api/v3/repos/shufanhao/langchain/git/blobs/789", + } + ] + }, + status_code=200, + ) + file_content_res = mocker.MagicMock( + json=lambda: {"content": base64.b64encode("Mocked content".encode("utf-8"))}, + status_code=200, + ) + + mocker.patch("requests.get", side_effect=[file_path_res, file_content_res]) + + # case1: file_extension=".md" + loader = GithubFileLoader( + repo="shufanhao/langchain", + access_token="access_token", + github_api_url="https://github.com", + ) + + # Call the load method + docs = loader.load() + + assert len(docs) == 1 + assert docs[0].page_content == "Mocked content" + assert docs[0].metadata["sha"] == "789" diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index f4511686f7a..d22f81aa19b 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -65,6 +65,7 @@ EXPECTED_ALL = [ "GCSDirectoryLoader", "GCSFileLoader", "GeoDataFrameLoader", + "GithubFileLoader", "GitHubIssuesLoader", "GitLoader", "GitbookLoader",
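
For reviewers who want to try the new loader end to end, below is a minimal usage sketch that mirrors the notebook cell added in this patch. It assumes the patched `langchain_community` package is installed and that `ACCESS_TOKEN` holds a GitHub personal access token with read access to the repository; the `branch="master"` argument is an assumption about the target repository's default branch (the loader itself defaults to `"main"`), and the token value shown is a placeholder.

```python
# Minimal usage sketch of the new GithubFileLoader (assumptions noted above).
from langchain_community.document_loaders import GithubFileLoader

ACCESS_TOKEN = "ghp_..."  # placeholder: a GitHub personal access token

loader = GithubFileLoader(
    repo="langchain-ai/langchain",  # "owner/repo", as in the notebook example
    access_token=ACCESS_TOKEN,
    github_api_url="https://api.github.com",
    branch="master",  # assumed default branch of the target repository
    file_filter=lambda file_path: file_path.endswith(".md"),  # keep only markdown files
)

# load() walks the git tree, downloads each matching file, and returns Documents
# whose metadata carries the file path, blob sha, and a source URL.
documents = loader.load()
print(len(documents))
print(documents[0].metadata["path"], documents[0].metadata["sha"])
```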