mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-17 02:03:44 +00:00
community[minor]: add github file loader to load any github file content b… (#15305)
### Description Support loading any GitHub file's content based on its file extension. Why not use the [git loader](https://python.langchain.com/docs/integrations/document_loaders/git#load-existing-repository-from-disk)? The git loader clones the whole repo even when only some of the files are of interest — that's too heavy. This GithubFileLoader downloads only the files you are interested in. ### Twitter handle my twitter: @shufanhaotop --------- Co-authored-by: Hao Fan <h_fan@apple.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
ac662b3698
commit
ef082c77b1
@ -6,7 +6,7 @@
|
||||
"source": [
|
||||
"# GitHub\n",
|
||||
"\n",
|
||||
"This notebooks shows how you can load issues and pull requests (PRs) for a given repository on [GitHub](https://github.com/). We will use the LangChain Python repository as an example."
|
||||
"This notebook shows how you can load issues and pull requests (PRs) for a given repository on [GitHub](https://github.com/). It also shows how you can load GitHub files for a given repository on [GitHub](https://github.com/). We will use the LangChain Python repository as an example."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -46,7 +46,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
@ -57,7 +57,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -91,7 +91,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -100,27 +100,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Creates GitHubLoader (#5257)\r\n",
|
||||
"\r\n",
|
||||
"GitHubLoader is a DocumentLoader that loads issues and PRs from GitHub.\r\n",
|
||||
"\r\n",
|
||||
"Fixes #5257\r\n",
|
||||
"\r\n",
|
||||
"Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested:\r\n",
|
||||
"DataLoaders\r\n",
|
||||
"- @eyurtsev\r\n",
|
||||
"\n",
|
||||
"{'url': 'https://github.com/langchain-ai/langchain/pull/5408', 'title': 'DocumentLoader for GitHub', 'creator': 'UmerHA', 'created_at': '2023-05-29T14:50:53Z', 'comments': 0, 'state': 'open', 'labels': ['enhancement', 'lgtm', 'doc loader'], 'assignee': None, 'milestone': None, 'locked': False, 'number': 5408, 'is_pull_request': True}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(docs[0].page_content)\n",
|
||||
"print(docs[0].metadata)"
|
||||
@ -142,7 +124,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -157,84 +139,68 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"### System Info\n",
|
||||
"\n",
|
||||
"LangChain version = 0.0.167\r\n",
|
||||
"Python version = 3.11.0\r\n",
|
||||
"System = Windows 11 (using Jupyter)\n",
|
||||
"\n",
|
||||
"### Who can help?\n",
|
||||
"\n",
|
||||
"- @hwchase17\r\n",
|
||||
"- @agola11\r\n",
|
||||
"- @UmerHA (I have a fix ready, will submit a PR)\n",
|
||||
"\n",
|
||||
"### Information\n",
|
||||
"\n",
|
||||
"- [ ] The official example notebooks/scripts\n",
|
||||
"- [X] My own modified scripts\n",
|
||||
"\n",
|
||||
"### Related Components\n",
|
||||
"\n",
|
||||
"- [X] LLMs/Chat Models\n",
|
||||
"- [ ] Embedding Models\n",
|
||||
"- [X] Prompts / Prompt Templates / Prompt Selectors\n",
|
||||
"- [ ] Output Parsers\n",
|
||||
"- [ ] Document Loaders\n",
|
||||
"- [ ] Vector Stores / Retrievers\n",
|
||||
"- [ ] Memory\n",
|
||||
"- [ ] Agents / Agent Executors\n",
|
||||
"- [ ] Tools / Toolkits\n",
|
||||
"- [ ] Chains\n",
|
||||
"- [ ] Callbacks/Tracing\n",
|
||||
"- [ ] Async\n",
|
||||
"\n",
|
||||
"### Reproduction\n",
|
||||
"\n",
|
||||
"```\r\n",
|
||||
"import os\r\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = \"...\"\r\n",
|
||||
"\r\n",
|
||||
"from langchain.chains import LLMChain\r\n",
|
||||
"from langchain_openai import ChatOpenAI\r\n",
|
||||
"from langchain.prompts import PromptTemplate\r\n",
|
||||
"from langchain.prompts.chat import ChatPromptTemplate\r\n",
|
||||
"from langchain.schema import messages_from_dict\r\n",
|
||||
"\r\n",
|
||||
"role_strings = [\r\n",
|
||||
" (\"system\", \"you are a bird expert\"), \r\n",
|
||||
" (\"human\", \"which bird has a point beak?\")\r\n",
|
||||
"]\r\n",
|
||||
"prompt = ChatPromptTemplate.from_role_strings(role_strings)\r\n",
|
||||
"chain = LLMChain(llm=ChatOpenAI(), prompt=prompt)\r\n",
|
||||
"chain.run({})\r\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"### Expected behavior\n",
|
||||
"\n",
|
||||
"Chain should run\n",
|
||||
"{'url': 'https://github.com/langchain-ai/langchain/issues/5027', 'title': \"ChatOpenAI models don't work with prompts created via ChatPromptTemplate.from_role_strings\", 'creator': 'UmerHA', 'created_at': '2023-05-20T10:39:18Z', 'comments': 1, 'state': 'open', 'labels': [], 'assignee': None, 'milestone': None, 'locked': False, 'number': 5027, 'is_pull_request': False}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(docs[0].page_content)\n",
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Github File Content\n",
|
||||
"\n",
|
||||
"The code below loads all markdown files in the repo `langchain-ai/langchain`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import GithubFileLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"loader = GithubFileLoader(\n",
|
||||
" repo=\"langchain-ai/langchain\", # the repo name\n",
|
||||
" access_token=ACCESS_TOKEN,\n",
|
||||
" github_api_url=\"https://api.github.com\",\n",
|
||||
" file_filter=lambda file_path: file_path.endswith(\n",
|
||||
" \".md\"\n",
|
||||
" ), # load all markdowns files.\n",
|
||||
")\n",
|
||||
"documents = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Example output of one document: \n",
|
||||
"\n",
|
||||
"```json\n",
|
||||
"documents.metadata: \n",
|
||||
" {\n",
|
||||
" \"path\": \"README.md\",\n",
|
||||
" \"sha\": \"82f1c4ea88ecf8d2dfsfx06a700e84be4\",\n",
|
||||
" \"source\": \"https://github.com/langchain-ai/langchain/blob/master/README.md\"\n",
|
||||
" }\n",
|
||||
"documents.content:\n",
|
||||
" mock content\n",
|
||||
"```"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@ -253,7 +219,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.3"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -102,7 +102,10 @@ from langchain_community.document_loaders.gcs_file import GCSFileLoader
|
||||
from langchain_community.document_loaders.geodataframe import GeoDataFrameLoader
|
||||
from langchain_community.document_loaders.git import GitLoader
|
||||
from langchain_community.document_loaders.gitbook import GitbookLoader
|
||||
from langchain_community.document_loaders.github import GitHubIssuesLoader
|
||||
from langchain_community.document_loaders.github import (
|
||||
GithubFileLoader,
|
||||
GitHubIssuesLoader,
|
||||
)
|
||||
from langchain_community.document_loaders.google_speech_to_text import (
|
||||
GoogleSpeechToTextLoader,
|
||||
)
|
||||
@ -296,6 +299,7 @@ __all__ = [
|
||||
"GCSDirectoryLoader",
|
||||
"GCSFileLoader",
|
||||
"GeoDataFrameLoader",
|
||||
"GithubFileLoader",
|
||||
"GitHubIssuesLoader",
|
||||
"GitLoader",
|
||||
"GitbookLoader",
|
||||
|
@ -1,6 +1,7 @@
|
||||
import base64
|
||||
from abc import ABC
|
||||
from datetime import datetime
|
||||
from typing import Dict, Iterator, List, Literal, Optional, Union
|
||||
from typing import Callable, Dict, Iterator, List, Literal, Optional, Union
|
||||
|
||||
import requests
|
||||
from langchain_core.documents import Document
|
||||
@ -20,7 +21,7 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
|
||||
github_api_url: str = "https://api.github.com"
|
||||
"""URL of GitHub API"""
|
||||
|
||||
@root_validator(pre=True)
|
||||
@root_validator(pre=True, allow_reuse=True)
|
||||
def validate_environment(cls, values: Dict) -> Dict:
|
||||
"""Validate that access token exists in environment."""
|
||||
values["access_token"] = get_from_dict_or_env(
|
||||
@ -65,7 +66,7 @@ class GitHubIssuesLoader(BaseGitHubLoader):
|
||||
"""Only show notifications updated after the given time.
|
||||
This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ."""
|
||||
|
||||
@validator("since")
|
||||
@validator("since", allow_reuse=True)
|
||||
def validate_since(cls, v: Optional[str]) -> Optional[str]:
|
||||
if v:
|
||||
try:
|
||||
@ -186,3 +187,59 @@ class GitHubIssuesLoader(BaseGitHubLoader):
|
||||
def url(self) -> str:
|
||||
"""Create URL for GitHub API."""
|
||||
return f"{self.github_api_url}/repos/{self.repo}/issues?{self.query_params}"
|
||||
|
||||
|
||||
class GithubFileLoader(BaseGitHubLoader, ABC):
    """Load file contents from a GitHub repository via the GitHub REST API.

    Unlike ``GitLoader``, which clones the whole repository, this loader
    downloads only the files selected by ``file_filter``.
    """

    # NOTE(review): currently unused — selection is done via ``file_filter``.
    # Kept for backward compatibility with callers that set it.
    file_extension: str = ".md"
    # Git ref (branch name) whose tree is listed and whose files are fetched.
    branch: str = "main"

    # Predicate over the repo-relative file path; only paths for which it
    # returns True are loaded. ``None`` means "load every file in the tree".
    file_filter: Optional[Callable[[str], bool]]

    def get_file_paths(self) -> List[Dict]:
        """Return the tree entries of ``branch`` that pass ``file_filter``.

        Each entry is the raw dict from the git "trees" API, e.g.::

            {
                'path': '.github',
                'mode': '040000',
                'type': 'tree',
                'sha': '89a2ae046e8b59eb96531f123c0c6d4913885df1',
                'url': '.../repos/<owner>/<repo>/git/trees/<sha>'
            }
        """
        # The REST path is ``{api_base}/repos/...``.  A GitHub Enterprise
        # ``/api/v3`` prefix, when needed, belongs in ``github_api_url``
        # itself — hard-coding it here broke the default
        # ``https://api.github.com`` base URL.
        base_url = (
            f"{self.github_api_url}/repos/{self.repo}/git/trees/"
            f"{self.branch}?recursive=1"
        )
        response = requests.get(base_url, headers=self.headers)
        response.raise_for_status()

        all_files = response.json()["tree"]
        if self.file_filter is None:
            return list(all_files)
        return [f for f in all_files if self.file_filter(f["path"])]

    def get_file_content_by_path(self, path: str) -> str:
        """Download one file via the contents API and return its decoded text.

        Raises ``requests.HTTPError`` on a non-2xx response.
        """
        base_url = f"{self.github_api_url}/repos/{self.repo}/contents/{path}"
        response = requests.get(base_url, headers=self.headers)
        response.raise_for_status()

        # The contents API returns the file body base64-encoded.
        content_encoded = response.json()["content"]
        return base64.b64decode(content_encoded).decode("utf-8")

    def load(self) -> List[Document]:
        """Load every matching file as a ``Document``.

        Metadata keys: ``path`` (repo-relative path), ``sha`` (git object
        sha) and ``source`` (URL derived from ``github_api_url``).
        """
        documents = []

        for file in self.get_file_paths():
            content = self.get_file_content_by_path(file["path"])
            metadata = {
                "path": file["path"],
                "sha": file["sha"],
                "source": f"{self.github_api_url}/{self.repo}/{file['type']}/"
                f"{self.branch}/{file['path']}",
            }
            documents.append(Document(page_content=content, metadata=metadata))

        return documents
|
||||
|
@ -1,8 +1,13 @@
|
||||
import base64
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from langchain_community.document_loaders.github import GitHubIssuesLoader
|
||||
from langchain_community.document_loaders.github import (
|
||||
GithubFileLoader,
|
||||
GitHubIssuesLoader,
|
||||
)
|
||||
|
||||
|
||||
def test_initialization() -> None:
|
||||
@ -48,7 +53,7 @@ def test_invalid_initialization() -> None:
|
||||
GitHubIssuesLoader(since="not_a_date")
|
||||
|
||||
|
||||
def test_load(mocker: MockerFixture) -> None:
|
||||
def test_load_github_issue(mocker: MockerFixture) -> None:
|
||||
mocker.patch(
|
||||
"requests.get", return_value=mocker.MagicMock(json=lambda: [], links=None)
|
||||
)
|
||||
@ -127,3 +132,103 @@ def test_url() -> None:
|
||||
"&assignee=user1&creator=user2&mentioned=user3&labels=bug,ui,@high"
|
||||
"&sort=comments&direction=asc&since=2023-05-26T00:00:00Z"
|
||||
)
|
||||
|
||||
|
||||
def test_github_file_content_get_file_paths(mocker: MockerFixture) -> None:
    """``get_file_paths`` honours ``file_filter`` (and its absence)."""
    tree_entry = {
        "path": "readme.md",
        "mode": "100644",
        "type": "blob",
        "sha": "789",
        "size": 37,
        "url": "https://github.com/api/v3/repos/shufanhao/langchain/git/blobs/789",
    }
    # Simulate the git "trees" API returning a single markdown file.
    mocker.patch(
        "requests.get",
        return_value=mocker.MagicMock(
            json=lambda: {"tree": [tree_entry]},
            status_code=200,
        ),
    )

    def make_loader(file_filter=None):
        # Small factory so each case builds an identical loader.
        kwargs = dict(
            repo="shufanhao/langchain",
            access_token="access_token",
            github_api_url="https://github.com",
        )
        if file_filter is not None:
            kwargs["file_filter"] = file_filter
        return GithubFileLoader(**kwargs)

    # case1: a matching file_filter keeps the markdown file
    files = make_loader(lambda file_path: file_path.endswith(".md")).get_file_paths()
    assert len(files) == 1
    assert files[0]["path"] == "readme.md"

    # case2: no file_filter loads everything
    files = make_loader().get_file_paths()
    assert len(files) == 1
    assert files[0]["path"] == "readme.md"

    # case3: a non-matching file_filter excludes the file
    files = make_loader(lambda file_path: file_path.endswith(".py")).get_file_paths()
    assert len(files) == 0
|
||||
|
||||
|
||||
def test_github_file_content_loader(mocker: MockerFixture) -> None:
    """``load`` fetches the tree, downloads each file, and builds Documents."""
    # First response: the git "trees" listing with one markdown file.
    tree_response = mocker.MagicMock(
        json=lambda: {
            "tree": [
                {
                    "path": "readme.md",
                    "mode": "100644",
                    "type": "blob",
                    "sha": "789",
                    "size": 37,
                    "url": "https://github.com/api/v3/repos/shufanhao/langchain/git/blobs/789",
                }
            ]
        },
        status_code=200,
    )
    # Second response: the contents API body, base64-encoded as GitHub does.
    content_response = mocker.MagicMock(
        json=lambda: {"content": base64.b64encode("Mocked content".encode("utf-8"))},
        status_code=200,
    )
    mocker.patch("requests.get", side_effect=[tree_response, content_response])

    # case1: file_extension=".md"
    loader = GithubFileLoader(
        repo="shufanhao/langchain",
        access_token="access_token",
        github_api_url="https://github.com",
    )
    docs = loader.load()

    assert len(docs) == 1
    assert docs[0].page_content == "Mocked content"
    assert docs[0].metadata["sha"] == "789"
||||
|
@ -65,6 +65,7 @@ EXPECTED_ALL = [
|
||||
"GCSDirectoryLoader",
|
||||
"GCSFileLoader",
|
||||
"GeoDataFrameLoader",
|
||||
"GithubFileLoader",
|
||||
"GitHubIssuesLoader",
|
||||
"GitLoader",
|
||||
"GitbookLoader",
|
||||
|
Loading…
Reference in New Issue
Block a user