mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-07-04 02:56:41 +00:00
feat(bulk-ingest): Add --ignored Flag to Exclude Specific Files and Directories During Ingestion (#1432)
This commit is contained in:
parent
24fae660e6
commit
b178b51451
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,4 +1,6 @@
|
|||||||
.venv
|
.venv
|
||||||
|
.env
|
||||||
|
venv
|
||||||
|
|
||||||
settings-me.yaml
|
settings-me.yaml
|
||||||
|
|
||||||
|
17
Makefile
17
Makefile
@ -56,3 +56,20 @@ wipe:
|
|||||||
|
|
||||||
setup:
|
setup:
|
||||||
poetry run python scripts/setup
|
poetry run python scripts/setup
|
||||||
|
|
||||||
|
list:
|
||||||
|
@echo "Available commands:"
|
||||||
|
@echo " test : Run tests using pytest"
|
||||||
|
@echo " test-coverage : Run tests with coverage report"
|
||||||
|
@echo " black : Check code format with black"
|
||||||
|
@echo " ruff : Check code with ruff"
|
||||||
|
@echo " format : Format code with black and ruff"
|
||||||
|
@echo " mypy : Run mypy for type checking"
|
||||||
|
@echo " check : Run format and mypy commands"
|
||||||
|
@echo " run : Run the application"
|
||||||
|
@echo " dev-windows : Run the application in development mode on Windows"
|
||||||
|
@echo " dev : Run the application in development mode"
|
||||||
|
@echo " api-docs : Generate API documentation"
|
||||||
|
@echo " ingest : Ingest data using specified script"
|
||||||
|
@echo " wipe : Wipe data using specified script"
|
||||||
|
@echo " setup : Setup the application"
|
||||||
|
@ -20,20 +20,20 @@ class LocalIngestWorker:
|
|||||||
|
|
||||||
self._files_under_root_folder: list[Path] = list()
|
self._files_under_root_folder: list[Path] = list()
|
||||||
|
|
||||||
def _find_all_files_in_folder(self, root_path: Path) -> None:
|
def _find_all_files_in_folder(self, root_path: Path, ignored: list[str]) -> None:
|
||||||
"""Search all files under the root folder recursively.
|
"""Search all files under the root folder recursively.
|
||||||
Count them at the same time
|
Count them at the same time
|
||||||
"""
|
"""
|
||||||
for file_path in root_path.iterdir():
|
for file_path in root_path.iterdir():
|
||||||
if file_path.is_file():
|
if file_path.is_file() and file_path.name not in ignored:
|
||||||
self.total_documents += 1
|
self.total_documents += 1
|
||||||
self._files_under_root_folder.append(file_path)
|
self._files_under_root_folder.append(file_path)
|
||||||
elif file_path.is_dir():
|
elif file_path.is_dir() and file_path.name not in ignored:
|
||||||
self._find_all_files_in_folder(file_path)
|
self._find_all_files_in_folder(file_path, ignored)
|
||||||
|
|
||||||
def ingest_folder(self, folder_path: Path) -> None:
|
def ingest_folder(self, folder_path: Path, ignored: list[str]) -> None:
|
||||||
# Count total documents before ingestion
|
# Count total documents before ingestion
|
||||||
self._find_all_files_in_folder(folder_path)
|
self._find_all_files_in_folder(folder_path, ignored)
|
||||||
self._ingest_all(self._files_under_root_folder)
|
self._ingest_all(self._files_under_root_folder)
|
||||||
|
|
||||||
def _ingest_all(self, files_to_ingest: list[Path]) -> None:
|
def _ingest_all(self, files_to_ingest: list[Path]) -> None:
|
||||||
@ -64,12 +64,19 @@ parser.add_argument(
|
|||||||
action=argparse.BooleanOptionalAction,
|
action=argparse.BooleanOptionalAction,
|
||||||
default=False,
|
default=False,
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--ignored",
|
||||||
|
nargs="*",
|
||||||
|
help="List of files/directories to ignore",
|
||||||
|
default=[],
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--log-file",
|
"--log-file",
|
||||||
help="Optional path to a log file. If provided, logs will be written to this file.",
|
help="Optional path to a log file. If provided, logs will be written to this file.",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Set up logging to a file if a path is provided
|
# Set up logging to a file if a path is provided
|
||||||
@ -91,9 +98,17 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
ingest_service = global_injector.get(IngestService)
|
ingest_service = global_injector.get(IngestService)
|
||||||
worker = LocalIngestWorker(ingest_service)
|
worker = LocalIngestWorker(ingest_service)
|
||||||
worker.ingest_folder(root_path)
|
worker.ingest_folder(root_path, args.ignored)
|
||||||
|
|
||||||
|
if args.ignored:
|
||||||
|
logger.info(f"Skipping following files and directories: {args.ignored}")
|
||||||
|
|
||||||
if args.watch:
|
if args.watch:
|
||||||
logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
|
logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
|
||||||
|
directories_to_watch = [
|
||||||
|
dir
|
||||||
|
for dir in root_path.iterdir()
|
||||||
|
if dir.is_dir() and dir.name not in args.ignored
|
||||||
|
]
|
||||||
watcher = IngestWatcher(args.folder, worker.ingest_on_watch)
|
watcher = IngestWatcher(args.folder, worker.ingest_on_watch)
|
||||||
watcher.start()
|
watcher.start()
|
||||||
|
Loading…
Reference in New Issue
Block a user