Merge pull request #18539

* Implement lazy_load() for GitLoader
This commit is contained in:
Christophe Bornet 2024-03-06 19:25:14 +01:00 committed by GitHub
parent 9a6f7e213b
commit 5985454269
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,5 +1,5 @@
import os import os
from typing import Callable, List, Optional from typing import Callable, Iterator, Optional
from langchain_core.documents import Document from langchain_core.documents import Document
@@ -39,7 +39,7 @@ class GitLoader(BaseLoader):
self.branch = branch self.branch = branch
self.file_filter = file_filter self.file_filter = file_filter
def load(self) -> List[Document]: def lazy_load(self) -> Iterator[Document]:
try: try:
from git import Blob, Repo from git import Blob, Repo
except ImportError as ex: except ImportError as ex:
@@ -68,8 +68,6 @@ class GitLoader(BaseLoader):
repo = Repo(self.repo_path) repo = Repo(self.repo_path)
repo.git.checkout(self.branch) repo.git.checkout(self.branch)
docs: List[Document] = []
for item in repo.tree().traverse(): for item in repo.tree().traverse():
if not isinstance(item, Blob): if not isinstance(item, Blob):
continue continue
@@ -102,9 +100,6 @@ class GitLoader(BaseLoader):
"file_name": item.name, "file_name": item.name,
"file_type": file_type, "file_type": file_type,
} }
doc = Document(page_content=text_content, metadata=metadata) yield Document(page_content=text_content, metadata=metadata)
docs.append(doc)
except Exception as e: except Exception as e:
print(f"Error reading file {file_path}: {e}") # noqa: T201 print(f"Error reading file {file_path}: {e}") # noqa: T201
return docs