From 42591be4f6d6efa634deaad1df2aea318b877d3c Mon Sep 17 00:00:00 2001 From: RadhikaBansal97 <32077248+RadhikaBansal97@users.noreply.github.com> Date: Fri, 1 Mar 2024 23:06:31 +0530 Subject: [PATCH] community[patch]: Change github endpoint in GithubLoader (#17622) Description: - Changed the GitHub endpoint, as the existing one was not working and returned a 404 Not Found error. - The existing function also failed when file_filter was not passed: the tree API returns all paths, including directories, and when get_file_content_by_path was called on a directory path, the API returned a list of the files inside that directory instead of a single file object. Added a condition to skip paths that are directories. - Fixes this issue: https://github.com/langchain-ai/langchain/issues/17453 Co-authored-by: Radhika Bansal --- .../document_loaders/github.py | 18 ++++++++++++------ .../unit_tests/document_loaders/test_github.py | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/github.py b/libs/community/langchain_community/document_loaders/github.py index 65f327d948f..c912c1b59de 100644 --- a/libs/community/langchain_community/document_loaders/github.py +++ b/libs/community/langchain_community/document_loaders/github.py @@ -211,7 +211,7 @@ class GithubFileLoader(BaseGitHubLoader, ABC): def get_file_paths(self) -> List[Dict]: base_url = ( - f"{self.github_api_url}/api/v3/repos/{self.repo}/git/trees/" + f"{self.github_api_url}/repos/{self.repo}/git/trees/" f"{self.branch}?recursive=1" ) response = requests.get(base_url, headers=self.headers) @@ -222,8 +222,8 @@ class GithubFileLoader(BaseGitHubLoader, ABC): 'path': '.github', 'mode': '040000', 'type': 'tree', - 'sha': '89a2ae046e8b59eb96531f123c0c6d4913885df1', - 'url': 'https://github.com/api/v3/repos/shufanhao/langchain/git/trees/89a2ae046e8b59eb96531f123c0c6d4913885dxxx' + 'sha': '5dc46e6b38b22707894ced126270b15e2f22f64e', + 'url': 
'https://api.github.com/repos/langchain-ai/langchain/git/blobs/5dc46e6b38b22707894ced126270b15e2f22f64e' } """ return [ @@ -233,12 +233,15 @@ class GithubFileLoader(BaseGitHubLoader, ABC): ] def get_file_content_by_path(self, path: str) -> str: - base_url = f"{self.github_api_url}/api/v3/repos/{self.repo}/contents/{path}" + base_url = f"{self.github_api_url}/repos/{self.repo}/contents/{path}" response = requests.get(base_url, headers=self.headers) response.raise_for_status() - content_encoded = response.json()["content"] - return base64.b64decode(content_encoded).decode("utf-8") + if isinstance(response.json(), dict): + content_encoded = response.json()["content"] + return base64.b64decode(content_encoded).decode("utf-8") + + return "" def load(self) -> List[Document]: documents = [] @@ -246,6 +249,9 @@ class GithubFileLoader(BaseGitHubLoader, ABC): files = self.get_file_paths() for file in files: content = self.get_file_content_by_path(file["path"]) + if content == "": + continue + metadata = { "path": file["path"], "sha": file["sha"], diff --git a/libs/community/tests/unit_tests/document_loaders/test_github.py b/libs/community/tests/unit_tests/document_loaders/test_github.py index f9f74be6975..bc00d375449 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_github.py +++ b/libs/community/tests/unit_tests/document_loaders/test_github.py @@ -147,7 +147,7 @@ def test_github_file_content_get_file_paths(mocker: MockerFixture) -> None: "type": "blob", "sha": "789", "size": 37, - "url": "https://github.com/api/v3/repos/shufanhao/langchain/git/blobs/789", + "url": "https://github.com/repos/shufanhao/langchain/git/blobs/789", } ] }, @@ -206,7 +206,7 @@ def test_github_file_content_loader(mocker: MockerFixture) -> None: "type": "blob", "sha": "789", "size": 37, - "url": "https://github.com/api/v3/repos/shufanhao/langchain/git/blobs/789", + "url": "https://github.com/repos/shufanhao/langchain/git/blobs/789", } ] },