community[minor]: add github file loader to load any github file content b… (#15305)

### Description Support loading any GitHub file content based on file extension. Why not use the [git loader](https://python.langchain.com/docs/integrations/document_loaders/git#load-existing-repository-from-disk)? The git loader clones the whole repo even when you are only interested in some of its files, which is too heavy. This GithubFileLoader downloads only the files you are interested in. ### Twitter handle My Twitter: @shufanhaotop --------- Co-authored-by: Hao Fan <h_fan@apple.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
2025-11-01 16:52:51 +00:00 · 2024-02-07 01:42:33 +08:00
parent ac662b3698
commit ef082c77b1
5 changed files with 232 additions and 99 deletions
--- a/libs/community/langchain_community/document_loaders/github.py
+++ b/libs/community/langchain_community/document_loaders/github.py
@@ -1,6 +1,7 @@
+import base64
 from abc import ABC
 from datetime import datetime
-from typing import Dict, Iterator, List, Literal, Optional, Union
+from typing import Callable, Dict, Iterator, List, Literal, Optional, Union

 import requests
 from langchain_core.documents import Document
@@ -20,7 +21,7 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
    github_api_url: str = "https://api.github.com"
    """URL of GitHub API"""

-    @root_validator(pre=True)
+    @root_validator(pre=True, allow_reuse=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that access token exists in environment."""
        values["access_token"] = get_from_dict_or_env(
@@ -65,7 +66,7 @@ class GitHubIssuesLoader(BaseGitHubLoader):
    """Only show notifications updated after the given time.
        This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ."""

-    @validator("since")
+    @validator("since", allow_reuse=True)
    def validate_since(cls, v: Optional[str]) -> Optional[str]:
        if v:
            try:
@@ -186,3 +187,59 @@ class GitHubIssuesLoader(BaseGitHubLoader):
    def url(self) -> str:
        """Create URL for GitHub API."""
        return f"{self.github_api_url}/repos/{self.repo}/issues?{self.query_params}"
+
+
+class GithubFileLoader(BaseGitHubLoader, ABC):
+    """Load GitHub File"""
+
+    file_extension: str = ".md"
+    branch: str = "main"
+
+    file_filter: Optional[Callable[[str], bool]]
+
+    def get_file_paths(self) -> List[Dict]:
+        base_url = (
+            f"{self.github_api_url}/api/v3/repos/{self.repo}/git/trees/"
+            f"{self.branch}?recursive=1"
+        )
+        response = requests.get(base_url, headers=self.headers)
+        response.raise_for_status()
+        all_files = response.json()["tree"]
+        """ one element in all_files
+        {
+            'path': '.github', 
+            'mode': '040000', 
+            'type': 'tree', 
+            'sha': '89a2ae046e8b59eb96531f123c0c6d4913885df1', 
+            'url': 'https://github.com/api/v3/repos/shufanhao/langchain/git/trees/89a2ae046e8b59eb96531f123c0c6d4913885dxxx'
+        }
+        """
+        return [
+            f
+            for f in all_files
+            if not (self.file_filter and not self.file_filter(f["path"]))
+        ]
+
+    def get_file_content_by_path(self, path: str) -> str:
+        base_url = f"{self.github_api_url}/api/v3/repos/{self.repo}/contents/{path}"
+        response = requests.get(base_url, headers=self.headers)
+        response.raise_for_status()
+
+        content_encoded = response.json()["content"]
+        return base64.b64decode(content_encoded).decode("utf-8")
+
+    def load(self) -> List[Document]:
+        documents = []
+
+        files = self.get_file_paths()
+        for file in files:
+            content = self.get_file_content_by_path(file["path"])
+            metadata = {
+                "path": file["path"],
+                "sha": file["sha"],
+                "source": f"{self.github_api_url}/{self.repo}/{file['type']}/"
+                f"{self.branch}/{file['path']}",
+            }
+            documents.append(Document(page_content=content, metadata=metadata))
+
+        return documents