mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-04-27 19:28:38 +00:00
feat(bulk-ingest): Add --ignored Flag to Exclude Specific Files and Directories During Ingestion (#1432)
This commit is contained in:
parent
24fae660e6
commit
b178b51451
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,4 +1,6 @@
|
||||
.venv
|
||||
.env
|
||||
venv
|
||||
|
||||
settings-me.yaml
|
||||
|
||||
|
17
Makefile
17
Makefile
@ -56,3 +56,20 @@ wipe:
|
||||
|
||||
setup:
|
||||
poetry run python scripts/setup
|
||||
|
||||
list:
|
||||
@echo "Available commands:"
|
||||
@echo " test : Run tests using pytest"
|
||||
@echo " test-coverage : Run tests with coverage report"
|
||||
@echo " black : Check code format with black"
|
||||
@echo " ruff : Check code with ruff"
|
||||
@echo " format : Format code with black and ruff"
|
||||
@echo " mypy : Run mypy for type checking"
|
||||
@echo " check : Run format and mypy commands"
|
||||
@echo " run : Run the application"
|
||||
@echo " dev-windows : Run the application in development mode on Windows"
|
||||
@echo " dev : Run the application in development mode"
|
||||
@echo " api-docs : Generate API documentation"
|
||||
@echo " ingest : Ingest data using specified script"
|
||||
@echo " wipe : Wipe data using specified script"
|
||||
@echo " setup : Setup the application"
|
||||
|
@ -20,20 +20,20 @@ class LocalIngestWorker:
|
||||
|
||||
self._files_under_root_folder: list[Path] = list()
|
||||
|
||||
def _find_all_files_in_folder(self, root_path: Path) -> None:
|
||||
def _find_all_files_in_folder(self, root_path: Path, ignored: list[str]) -> None:
|
||||
"""Search all files under the root folder recursively.
|
||||
Count them at the same time
|
||||
"""
|
||||
for file_path in root_path.iterdir():
|
||||
if file_path.is_file():
|
||||
if file_path.is_file() and file_path.name not in ignored:
|
||||
self.total_documents += 1
|
||||
self._files_under_root_folder.append(file_path)
|
||||
elif file_path.is_dir():
|
||||
self._find_all_files_in_folder(file_path)
|
||||
elif file_path.is_dir() and file_path.name not in ignored:
|
||||
self._find_all_files_in_folder(file_path, ignored)
|
||||
|
||||
def ingest_folder(self, folder_path: Path) -> None:
|
||||
def ingest_folder(self, folder_path: Path, ignored: list[str]) -> None:
|
||||
# Count total documents before ingestion
|
||||
self._find_all_files_in_folder(folder_path)
|
||||
self._find_all_files_in_folder(folder_path, ignored)
|
||||
self._ingest_all(self._files_under_root_folder)
|
||||
|
||||
def _ingest_all(self, files_to_ingest: list[Path]) -> None:
|
||||
@ -64,12 +64,19 @@ parser.add_argument(
|
||||
action=argparse.BooleanOptionalAction,
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ignored",
|
||||
nargs="*",
|
||||
help="List of files/directories to ignore",
|
||||
default=[],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log-file",
|
||||
help="Optional path to a log file. If provided, logs will be written to this file.",
|
||||
type=str,
|
||||
default=None,
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set up logging to a file if a path is provided
|
||||
@ -91,9 +98,17 @@ if __name__ == "__main__":
|
||||
|
||||
ingest_service = global_injector.get(IngestService)
|
||||
worker = LocalIngestWorker(ingest_service)
|
||||
worker.ingest_folder(root_path)
|
||||
worker.ingest_folder(root_path, args.ignored)
|
||||
|
||||
if args.ignored:
|
||||
logger.info(f"Skipping following files and directories: {args.ignored}")
|
||||
|
||||
if args.watch:
|
||||
logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
|
||||
directories_to_watch = [
|
||||
dir
|
||||
for dir in root_path.iterdir()
|
||||
if dir.is_dir() and dir.name not in args.ignored
|
||||
]
|
||||
watcher = IngestWatcher(args.folder, worker.ingest_on_watch)
|
||||
watcher.start()
|
||||
|
Loading…
Reference in New Issue
Block a user