mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-10-31 16:08:59 +00:00 
			
		
		
		
	community[minor]: add github file loader to load any github file content b… (#15305)
### Description Support loading the content of any GitHub file based on its file extension. Why not use the [git loader](https://python.langchain.com/docs/integrations/document_loaders/git#load-existing-repository-from-disk)? The git loader clones the whole repository even when only some of its files are of interest, which is too heavy. This GithubFileLoader downloads only the files you are interested in. ### Twitter handle My Twitter: @shufanhaotop --------- Co-authored-by: Hao Fan <h_fan@apple.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
		| @@ -1,6 +1,7 @@ | ||||
| import base64 | ||||
| from abc import ABC | ||||
| from datetime import datetime | ||||
| from typing import Dict, Iterator, List, Literal, Optional, Union | ||||
| from typing import Callable, Dict, Iterator, List, Literal, Optional, Union | ||||
|  | ||||
| import requests | ||||
| from langchain_core.documents import Document | ||||
| @@ -20,7 +21,7 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC): | ||||
|     github_api_url: str = "https://api.github.com" | ||||
|     """URL of GitHub API""" | ||||
|  | ||||
|     @root_validator(pre=True) | ||||
|     @root_validator(pre=True, allow_reuse=True) | ||||
|     def validate_environment(cls, values: Dict) -> Dict: | ||||
|         """Validate that access token exists in environment.""" | ||||
|         values["access_token"] = get_from_dict_or_env( | ||||
| @@ -65,7 +66,7 @@ class GitHubIssuesLoader(BaseGitHubLoader): | ||||
|     """Only show notifications updated after the given time. | ||||
|         This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ.""" | ||||
|  | ||||
|     @validator("since") | ||||
|     @validator("since", allow_reuse=True) | ||||
|     def validate_since(cls, v: Optional[str]) -> Optional[str]: | ||||
|         if v: | ||||
|             try: | ||||
| @@ -186,3 +187,59 @@ class GitHubIssuesLoader(BaseGitHubLoader): | ||||
|     def url(self) -> str: | ||||
|         """Create URL for GitHub API.""" | ||||
|         return f"{self.github_api_url}/repos/{self.repo}/issues?{self.query_params}" | ||||
|  | ||||
|  | ||||
class GithubFileLoader(BaseGitHubLoader, ABC):
    """Load the contents of individual files from a GitHub repository.

    The repository's git tree is listed recursively for ``branch``; every
    file (blob) whose path is accepted by ``file_filter`` is then downloaded
    through the contents endpoint and wrapped in a ``Document``.
    """

    # Not consulted by any method of this class; kept so that callers who
    # pass it keep working. Use ``file_filter`` to select by extension.
    file_extension: str = ".md"
    # Branch (git ref) whose tree is listed and whose file contents are read.
    branch: str = "main"

    # Optional predicate on the repository-relative path. When it is unset,
    # no tree entry is filtered out.
    file_filter: Optional[Callable[[str], bool]] = None

    def get_file_paths(self) -> List[Dict]:
        """Return the git tree entries of ``branch`` accepted by ``file_filter``.

        Returns:
            The raw tree entries (dicts with ``path``, ``mode``, ``type``,
            ``sha`` and ``url`` keys). Directory entries are included.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        # Same URL layout as GitHubIssuesLoader.url: the API root is taken
        # from ``github_api_url`` as-is, with no extra path prefix inserted.
        tree_url = (
            f"{self.github_api_url}/repos/{self.repo}/git/trees/"
            f"{self.branch}?recursive=1"
        )
        response = requests.get(tree_url, headers=self.headers)
        response.raise_for_status()
        all_files = response.json()["tree"]
        # One element of all_files looks like:
        # {
        #     'path': '.github',
        #     'mode': '040000',
        #     'type': 'tree',
        #     'sha': '89a2ae046e8b59eb96531f123c0c6d4913885df1',
        #     'url': 'https://github.com/api/v3/repos/shufanhao/langchain/git/trees/89a2ae046e8b59eb96531f123c0c6d4913885dxxx'
        # }
        if not self.file_filter:
            return list(all_files)
        return [f for f in all_files if self.file_filter(f["path"])]

    def get_file_content_by_path(self, path: str) -> str:
        """Download one file of ``branch`` and return its text.

        Args:
            path: Repository-relative path of the file.

        Returns:
            The UTF-8 decoded file content, or ``""`` when ``path`` does not
            denote a single file (the API then answers with a list).

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        from urllib.parse import quote

        # Percent-encode the path so characters such as '#', '?' or spaces
        # cannot alter the request URL; '/' separators are left untouched.
        content_url = (
            f"{self.github_api_url}/repos/{self.repo}/contents/{quote(path)}"
        )
        # Ask for the configured branch explicitly; without ``ref`` the API
        # serves the repository's default branch.
        params = {"ref": self.branch} if self.branch else None
        response = requests.get(content_url, headers=self.headers, params=params)
        response.raise_for_status()

        payload = response.json()
        if not isinstance(payload, dict):
            # A directory listing is returned as a JSON array, not a file.
            return ""
        return base64.b64decode(payload["content"]).decode("utf-8")

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per selected file, fetching contents lazily."""
        for file in self.get_file_paths():
            # Tree listings also contain directories and submodules; only
            # blobs have file content to download.
            if file["type"] != "blob":
                continue
            content = self.get_file_content_by_path(file["path"])
            metadata = {
                "path": file["path"],
                "sha": file["sha"],
                "source": f"{self.github_api_url}/{self.repo}/{file['type']}/"
                f"{self.branch}/{file['path']}",
            }
            yield Document(page_content=content, metadata=metadata)

    def load(self) -> List[Document]:
        """Return a ``Document`` for every selected file of the repository."""
        return list(self.lazy_load())
|   | ||||
		Reference in New Issue
	
	Block a user