From ef082c77b14399b186512cd9f220d1c5d65ac4e8 Mon Sep 17 00:00:00 2001
From: Frank
Date: Wed, 7 Feb 2024 01:42:33 +0800
Subject: [PATCH] =?UTF-8?q?community[minor]:=20add=20github=20file=20loade?=
 =?UTF-8?q?r=20to=20load=20any=20github=20file=20content=20b=E2=80=A6=20(#?=
 =?UTF-8?q?15305)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Description
Support loading the content of any GitHub file, selected by file extension.

Why not use the [git loader](https://python.langchain.com/docs/integrations/document_loaders/git#load-existing-repository-from-disk)?
The git loader clones the whole repository even when you are only interested in a subset of the files, which is too heavy. This GithubFileLoader downloads only the files you are interested in.

### Twitter handle
My Twitter: @shufanhaotop

---------

Co-authored-by: Hao Fan
Co-authored-by: Bagatur
---
 .../document_loaders/github.ipynb             | 152 +++++++-----------
 .../document_loaders/__init__.py              |   6 +-
 .../document_loaders/github.py                |  63 +++++++-
 .../document_loaders/test_github.py           | 109 ++++++++++++-
 .../document_loaders/test_imports.py          |   1 +
 5 files changed, 232 insertions(+), 99 deletions(-)

diff --git a/docs/docs/integrations/document_loaders/github.ipynb b/docs/docs/integrations/document_loaders/github.ipynb
index 3d9f57243f0..4b7bb7cdb28 100644
--- a/docs/docs/integrations/document_loaders/github.ipynb
+++ b/docs/docs/integrations/document_loaders/github.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "# GitHub\n",
     "\n",
-    "This notebooks shows how you can load issues and pull requests (PRs) for a given repository on [GitHub](https://github.com/). We will use the LangChain Python repository as an example."
+    "This notebook shows how you can load issues and pull requests (PRs) for a given repository on [GitHub](https://github.com/). It also shows how you can load GitHub files for a given repository on [GitHub](https://github.com/). We will use the LangChain Python repository as an example."
   ]
  },
 {
@@ -46,7 +46,7 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 10,
+  "execution_count": null,
  "metadata": {
   "tags": []
  },
@@ -57,7 +57,7 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 11,
+  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -91,7 +91,7 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 12,
+  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -100,27 +100,9 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 13,
+  "execution_count": null,
  "metadata": {},
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "# Creates GitHubLoader (#5257)\r\n",
-     "\r\n",
-     "GitHubLoader is a DocumentLoader that loads issues and PRs from GitHub.\r\n",
-     "\r\n",
-     "Fixes #5257\r\n",
-     "\r\n",
-     "Community members can review the PR once tests pass. 
Tag maintainers/contributors who might be interested:\r\n",
-     "DataLoaders\r\n",
-     "- @eyurtsev\r\n",
-     "\n",
-     "{'url': 'https://github.com/langchain-ai/langchain/pull/5408', 'title': 'DocumentLoader for GitHub', 'creator': 'UmerHA', 'created_at': '2023-05-29T14:50:53Z', 'comments': 0, 'state': 'open', 'labels': ['enhancement', 'lgtm', 'doc loader'], 'assignee': None, 'milestone': None, 'locked': False, 'number': 5408, 'is_pull_request': True}\n"
-    ]
-   }
-  ],
+  "outputs": [],
  "source": [
   "print(docs[0].page_content)\n",
   "print(docs[0].metadata)"
  ]
 },
@@ -142,7 +124,7 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 14,
+  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -157,84 +139,68 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 15,
+  "execution_count": null,
  "metadata": {},
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "### System Info\n",
-     "\n",
-     "LangChain version = 0.0.167\r\n",
-     "Python version = 3.11.0\r\n",
-     "System = Windows 11 (using Jupyter)\n",
-     "\n",
-     "### Who can help?\n",
-     "\n",
-     "- @hwchase17\r\n",
-     "- @agola11\r\n",
-     "- @UmerHA (I have a fix ready, will submit a PR)\n",
-     "\n",
-     "### Information\n",
-     "\n",
-     "- [ ] The official example notebooks/scripts\n",
-     "- [X] My own modified scripts\n",
-     "\n",
-     "### Related Components\n",
-     "\n",
-     "- [X] LLMs/Chat Models\n",
-     "- [ ] Embedding Models\n",
-     "- [X] Prompts / Prompt Templates / Prompt Selectors\n",
-     "- [ ] Output Parsers\n",
-     "- [ ] Document Loaders\n",
-     "- [ ] Vector Stores / Retrievers\n",
-     "- [ ] Memory\n",
-     "- [ ] Agents / Agent Executors\n",
-     "- [ ] Tools / Toolkits\n",
-     "- [ ] Chains\n",
-     "- [ ] Callbacks/Tracing\n",
-     "- [ ] Async\n",
-     "\n",
-     "### Reproduction\n",
-     "\n",
-     "```\r\n",
-     "import os\r\n",
-     "os.environ[\"OPENAI_API_KEY\"] = \"...\"\r\n",
-     "\r\n",
-     "from langchain.chains import LLMChain\r\n",
-     "from langchain_openai import ChatOpenAI\r\n",
-     "from langchain.prompts import PromptTemplate\r\n",
-     "from langchain.prompts.chat import ChatPromptTemplate\r\n",
-     "from langchain.schema import messages_from_dict\r\n",
-     "\r\n",
-     "role_strings = [\r\n",
-     " (\"system\", \"you are a bird expert\"), \r\n",
-     " (\"human\", \"which bird has a point beak?\")\r\n",
-     "]\r\n",
-     "prompt = ChatPromptTemplate.from_role_strings(role_strings)\r\n",
-     "chain = LLMChain(llm=ChatOpenAI(), prompt=prompt)\r\n",
-     "chain.run({})\r\n",
-     "```\n",
-     "\n",
-     "### Expected behavior\n",
-     "\n",
-     "Chain should run\n",
-     "{'url': 'https://github.com/langchain-ai/langchain/issues/5027', 'title': \"ChatOpenAI models don't work with prompts created via ChatPromptTemplate.from_role_strings\", 'creator': 'UmerHA', 'created_at': '2023-05-20T10:39:18Z', 'comments': 1, 'state': 'open', 'labels': [], 'assignee': None, 'milestone': None, 'locked': False, 'number': 5027, 'is_pull_request': False}\n"
-    ]
-   }
-  ],
+  "outputs": [],
  "source": [
   "print(docs[0].page_content)\n",
   "print(docs[0].metadata)"
  ]
 },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "## Load GitHub File Content\n",
+   "\n",
+   "The code below loads all markdown files in the repo `langchain-ai/langchain`."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 1,
+  "metadata": {
+   "tags": []
+  },
+  "outputs": [],
+  "source": [
+   "from langchain.document_loaders import GithubFileLoader"
+  ]
+ },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
- "source": []
+ "source": [
+  "loader = GithubFileLoader(\n",
+  "    repo=\"langchain-ai/langchain\",  # the 
repo name\n",
+  "    access_token=ACCESS_TOKEN,\n",
+  "    github_api_url=\"https://api.github.com\",\n",
+  "    file_filter=lambda file_path: file_path.endswith(\n",
+  "        \".md\"\n",
+  "    ),  # load all markdown files.\n",
+  ")\n",
+  "documents = loader.load()"
+ ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Example output for one of the documents:\n",
+   "\n",
+   "```json\n",
+   "documents[0].metadata: \n",
+   "    {\n",
+   "      \"path\": \"README.md\",\n",
+   "      \"sha\": \"82f1c4ea88ecf8d2dfsfx06a700e84be4\",\n",
+   "      \"source\": \"https://github.com/langchain-ai/langchain/blob/master/README.md\"\n",
+   "    }\n",
+   "documents[0].page_content:\n",
+   "    mock content\n",
+   "```"
+  ]
+ }
 ],
 "metadata": {
@@ -253,7 +219,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py
index b06f416d4da..869b0dca032 100644
--- a/libs/community/langchain_community/document_loaders/__init__.py
+++ b/libs/community/langchain_community/document_loaders/__init__.py
@@ -102,7 +102,10 @@ from langchain_community.document_loaders.gcs_file import GCSFileLoader
 from langchain_community.document_loaders.geodataframe import GeoDataFrameLoader
 from langchain_community.document_loaders.git import GitLoader
 from langchain_community.document_loaders.gitbook import GitbookLoader
-from langchain_community.document_loaders.github import GitHubIssuesLoader
+from langchain_community.document_loaders.github import (
+    GithubFileLoader,
+    GitHubIssuesLoader,
+)
 from langchain_community.document_loaders.google_speech_to_text import (
     GoogleSpeechToTextLoader,
 )
@@ -296,6 +299,7 @@ __all__ = [
     "GCSDirectoryLoader",
     "GCSFileLoader",
     "GeoDataFrameLoader",
+    "GithubFileLoader",
     "GitHubIssuesLoader",
     "GitLoader",
     "GitbookLoader",
diff --git a/libs/community/langchain_community/document_loaders/github.py b/libs/community/langchain_community/document_loaders/github.py
index 77bdf7da6c7..8a361a46e2b 100644
--- a/libs/community/langchain_community/document_loaders/github.py
+++ b/libs/community/langchain_community/document_loaders/github.py
@@ -1,6 +1,7 @@
+import base64
 from abc import ABC
 from datetime import datetime
-from typing import Dict, Iterator, List, Literal, Optional, Union
+from typing import Callable, Dict, Iterator, List, Literal, Optional, Union
 
 import requests
 from langchain_core.documents import Document
@@ -20,7 +21,7 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
     github_api_url: str = "https://api.github.com"
     """URL of GitHub API"""
 
-    @root_validator(pre=True)
+    @root_validator(pre=True, allow_reuse=True)
     def validate_environment(cls, values: Dict) -> Dict:
         """Validate that access token exists in environment."""
         values["access_token"] = get_from_dict_or_env(
@@ -65,7 +66,7 @@ class GitHubIssuesLoader(BaseGitHubLoader):
     """Only show notifications updated after the given time. 
This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ.""" - @validator("since") + @validator("since", allow_reuse=True) def validate_since(cls, v: Optional[str]) -> Optional[str]: if v: try: @@ -186,3 +187,59 @@ class GitHubIssuesLoader(BaseGitHubLoader): def url(self) -> str: """Create URL for GitHub API.""" return f"{self.github_api_url}/repos/{self.repo}/issues?{self.query_params}" + + +class GithubFileLoader(BaseGitHubLoader, ABC): + """Load GitHub File""" + + file_extension: str = ".md" + branch: str = "main" + + file_filter: Optional[Callable[[str], bool]] + + def get_file_paths(self) -> List[Dict]: + base_url = ( + f"{self.github_api_url}/api/v3/repos/{self.repo}/git/trees/" + f"{self.branch}?recursive=1" + ) + response = requests.get(base_url, headers=self.headers) + response.raise_for_status() + all_files = response.json()["tree"] + """ one element in all_files + { + 'path': '.github', + 'mode': '040000', + 'type': 'tree', + 'sha': '89a2ae046e8b59eb96531f123c0c6d4913885df1', + 'url': 'https://github.com/api/v3/repos/shufanhao/langchain/git/trees/89a2ae046e8b59eb96531f123c0c6d4913885dxxx' + } + """ + return [ + f + for f in all_files + if not (self.file_filter and not self.file_filter(f["path"])) + ] + + def get_file_content_by_path(self, path: str) -> str: + base_url = f"{self.github_api_url}/api/v3/repos/{self.repo}/contents/{path}" + response = requests.get(base_url, headers=self.headers) + response.raise_for_status() + + content_encoded = response.json()["content"] + return base64.b64decode(content_encoded).decode("utf-8") + + def load(self) -> List[Document]: + documents = [] + + files = self.get_file_paths() + for file in files: + content = self.get_file_content_by_path(file["path"]) + metadata = { + "path": file["path"], + "sha": file["sha"], + "source": f"{self.github_api_url}/{self.repo}/{file['type']}/" + f"{self.branch}/{file['path']}", + } + documents.append(Document(page_content=content, metadata=metadata)) + + return documents diff --git a/libs/community/tests/unit_tests/document_loaders/test_github.py b/libs/community/tests/unit_tests/document_loaders/test_github.py index 5641836dcd6..f9f74be6975 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_github.py +++ b/libs/community/tests/unit_tests/document_loaders/test_github.py @@ -1,8 +1,13 @@ +import base64 + import pytest from langchain_core.documents import Document from pytest_mock import MockerFixture -from langchain_community.document_loaders.github import GitHubIssuesLoader +from langchain_community.document_loaders.github import ( + GithubFileLoader, + GitHubIssuesLoader, +) def test_initialization() -> None: @@ -48,7 +53,7 @@ def test_invalid_initialization() -> None: GitHubIssuesLoader(since="not_a_date") -def test_load(mocker: MockerFixture) -> None: +def test_load_github_issue(mocker: MockerFixture) -> None: mocker.patch( "requests.get", return_value=mocker.MagicMock(json=lambda: [], links=None) ) @@ -127,3 +132,103 @@ def test_url() -> None: "&assignee=user1&creator=user2&mentioned=user3&labels=bug,ui,@high" "&sort=comments&direction=asc&since=2023-05-26T00:00:00Z" ) + + +def test_github_file_content_get_file_paths(mocker: MockerFixture) -> None: + # Mock the requests.get method to simulate the API response + mocker.patch( + "requests.get", + return_value=mocker.MagicMock( + json=lambda: { + "tree": [ + { + "path": "readme.md", + "mode": "100644", + "type": "blob", + "sha": "789", + "size": 37, + "url": "https://github.com/api/v3/repos/shufanhao/langchain/git/blobs/789", + } + ] + }, 
+ status_code=200, + ), + ) + + # case1: add file_filter + loader = GithubFileLoader( + repo="shufanhao/langchain", + access_token="access_token", + github_api_url="https://github.com", + file_filter=lambda file_path: file_path.endswith(".md"), + ) + + # Call the load method + files = loader.get_file_paths() + + # Assert the results + assert len(files) == 1 + assert files[0]["path"] == "readme.md" + + # case2: didn't add file_filter + loader = GithubFileLoader( + repo="shufanhao/langchain", + access_token="access_token", + github_api_url="https://github.com", + ) + + # Call the load method + files = loader.get_file_paths() + assert len(files) == 1 + assert files[0]["path"] == "readme.md" + + # case3: add file_filter with a non-exist file path + loader = GithubFileLoader( + repo="shufanhao/langchain", + access_token="access_token", + github_api_url="https://github.com", + file_filter=lambda file_path: file_path.endswith(".py"), + ) + + # Call the load method + files = loader.get_file_paths() + assert len(files) == 0 + + +def test_github_file_content_loader(mocker: MockerFixture) -> None: + # Mock the requests.get method to simulate the API response + file_path_res = mocker.MagicMock( + json=lambda: { + "tree": [ + { + "path": "readme.md", + "mode": "100644", + "type": "blob", + "sha": "789", + "size": 37, + "url": "https://github.com/api/v3/repos/shufanhao/langchain/git/blobs/789", + } + ] + }, + status_code=200, + ) + file_content_res = mocker.MagicMock( + json=lambda: {"content": base64.b64encode("Mocked content".encode("utf-8"))}, + status_code=200, + ) + + mocker.patch("requests.get", side_effect=[file_path_res, file_content_res]) + + # case1: file_extension=".md" + loader = GithubFileLoader( + repo="shufanhao/langchain", + access_token="access_token", + github_api_url="https://github.com", + ) + + # Call the load method + docs = loader.load() + + assert len(docs) == 1 + assert docs[0].page_content == "Mocked content" + assert docs[0].metadata["sha"] == "789" diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index f4511686f7a..d22f81aa19b 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -65,6 +65,7 @@ EXPECTED_ALL = [ "GCSDirectoryLoader", "GCSFileLoader", "GeoDataFrameLoader", + "GithubFileLoader", "GitHubIssuesLoader", "GitLoader", "GitbookLoader",
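
For reviewers who want to try the new loader end to end, below is a minimal usage sketch that mirrors the notebook cell added in this patch. It assumes the patched `langchain_community` package is installed and that `ACCESS_TOKEN` holds a GitHub personal access token with read access to the repository; the `branch="master"` argument is an assumption about the target repository's default branch (the loader itself defaults to `"main"`), and the token value shown is a placeholder.

```python
# Minimal usage sketch of the new GithubFileLoader (assumptions noted above).
from langchain_community.document_loaders import GithubFileLoader

ACCESS_TOKEN = "ghp_..."  # placeholder: a GitHub personal access token

loader = GithubFileLoader(
    repo="langchain-ai/langchain",  # "owner/repo", as in the notebook example
    access_token=ACCESS_TOKEN,
    github_api_url="https://api.github.com",
    branch="master",  # assumed default branch of the target repository
    file_filter=lambda file_path: file_path.endswith(".md"),  # keep only markdown files
)

# load() walks the git tree, downloads each matching file, and returns Documents
# whose metadata carries the file path, blob sha, and a source URL.
documents = loader.load()
print(len(documents))
print(documents[0].metadata["path"], documents[0].metadata["sha"])
```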