community[patch]: Change github endpoint in GithubLoader (#17622)

Description- 
- Changed the GitHub endpoint, as the existing one was not working and
returned a 404 Not Found error
- Also, the existing function was failing if file_filter was not passed, as
the tree API returns all paths, including directories. When load called
get_file_content_by_path for each of these paths, the function failed
for directories because the API returns a list of the files inside the
directory, so a condition was added to ignore a path if it is a directory
- Fixes this issue -
https://github.com/langchain-ai/langchain/issues/17453

Co-authored-by: Radhika Bansal <Radhika.Bansal@veritas.com>
This commit is contained in:
RadhikaBansal97 2024-03-01 23:06:31 +05:30 committed by GitHub
parent 2b93206f02
commit 8bafd2df5e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 14 additions and 8 deletions

View File

@ -211,7 +211,7 @@ class GithubFileLoader(BaseGitHubLoader, ABC):
def get_file_paths(self) -> List[Dict]: def get_file_paths(self) -> List[Dict]:
base_url = ( base_url = (
f"{self.github_api_url}/api/v3/repos/{self.repo}/git/trees/" f"{self.github_api_url}/repos/{self.repo}/git/trees/"
f"{self.branch}?recursive=1" f"{self.branch}?recursive=1"
) )
response = requests.get(base_url, headers=self.headers) response = requests.get(base_url, headers=self.headers)
@ -222,8 +222,8 @@ class GithubFileLoader(BaseGitHubLoader, ABC):
'path': '.github', 'path': '.github',
'mode': '040000', 'mode': '040000',
'type': 'tree', 'type': 'tree',
'sha': '89a2ae046e8b59eb96531f123c0c6d4913885df1', 'sha': '5dc46e6b38b22707894ced126270b15e2f22f64e',
'url': 'https://github.com/api/v3/repos/shufanhao/langchain/git/trees/89a2ae046e8b59eb96531f123c0c6d4913885dxxx' 'url': 'https://api.github.com/repos/langchain-ai/langchain/git/blobs/5dc46e6b38b22707894ced126270b15e2f22f64e'
} }
""" """
return [ return [
@ -233,12 +233,15 @@ class GithubFileLoader(BaseGitHubLoader, ABC):
] ]
def get_file_content_by_path(self, path: str) -> str: def get_file_content_by_path(self, path: str) -> str:
base_url = f"{self.github_api_url}/api/v3/repos/{self.repo}/contents/{path}" base_url = f"{self.github_api_url}/repos/{self.repo}/contents/{path}"
response = requests.get(base_url, headers=self.headers) response = requests.get(base_url, headers=self.headers)
response.raise_for_status() response.raise_for_status()
content_encoded = response.json()["content"] if isinstance(response.json(), dict):
return base64.b64decode(content_encoded).decode("utf-8") content_encoded = response.json()["content"]
return base64.b64decode(content_encoded).decode("utf-8")
return ""
def load(self) -> List[Document]: def load(self) -> List[Document]:
documents = [] documents = []
@ -246,6 +249,9 @@ class GithubFileLoader(BaseGitHubLoader, ABC):
files = self.get_file_paths() files = self.get_file_paths()
for file in files: for file in files:
content = self.get_file_content_by_path(file["path"]) content = self.get_file_content_by_path(file["path"])
if content == "":
continue
metadata = { metadata = {
"path": file["path"], "path": file["path"],
"sha": file["sha"], "sha": file["sha"],

View File

@ -147,7 +147,7 @@ def test_github_file_content_get_file_paths(mocker: MockerFixture) -> None:
"type": "blob", "type": "blob",
"sha": "789", "sha": "789",
"size": 37, "size": 37,
"url": "https://github.com/api/v3/repos/shufanhao/langchain/git/blobs/789", "url": "https://github.com/repos/shufanhao/langchain/git/blobs/789",
} }
] ]
}, },
@ -206,7 +206,7 @@ def test_github_file_content_loader(mocker: MockerFixture) -> None:
"type": "blob", "type": "blob",
"sha": "789", "sha": "789",
"size": 37, "size": 37,
"url": "https://github.com/api/v3/repos/shufanhao/langchain/git/blobs/789", "url": "https://github.com/repos/shufanhao/langchain/git/blobs/789",
} }
] ]
}, },