feat(bulk-ingest): Add --ignored Flag to Exclude Specific Files and Directories During Ingestion (#1432)

This commit is contained in:
Nick Smirnov 2024-02-07 21:59:32 +03:00 committed by GitHub
parent 24fae660e6
commit b178b51451
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 41 additions and 7 deletions

2
.gitignore vendored
View File

@@ -1,4 +1,6 @@
.venv
.env
venv
settings-me.yaml

View File

@@ -56,3 +56,20 @@ wipe:
setup:
poetry run python scripts/setup
list:
@echo "Available commands:"
@echo " test : Run tests using pytest"
@echo " test-coverage : Run tests with coverage report"
@echo " black : Check code format with black"
@echo " ruff : Check code with ruff"
@echo " format : Format code with black and ruff"
@echo " mypy : Run mypy for type checking"
@echo " check : Run format and mypy commands"
@echo " run : Run the application"
@echo " dev-windows : Run the application in development mode on Windows"
@echo " dev : Run the application in development mode"
@echo " api-docs : Generate API documentation"
@echo " ingest : Ingest data using specified script"
@echo " wipe : Wipe data using specified script"
@echo " setup : Setup the application"

View File

@@ -20,20 +20,20 @@ class LocalIngestWorker:
self._files_under_root_folder: list[Path] = list()
def _find_all_files_in_folder(self, root_path: Path) -> None:
def _find_all_files_in_folder(self, root_path: Path, ignored: list[str]) -> None:
"""Search all files under the root folder recursively.
Count them at the same time
"""
for file_path in root_path.iterdir():
if file_path.is_file():
if file_path.is_file() and file_path.name not in ignored:
self.total_documents += 1
self._files_under_root_folder.append(file_path)
elif file_path.is_dir():
self._find_all_files_in_folder(file_path)
elif file_path.is_dir() and file_path.name not in ignored:
self._find_all_files_in_folder(file_path, ignored)
def ingest_folder(self, folder_path: Path) -> None:
def ingest_folder(self, folder_path: Path, ignored: list[str]) -> None:
# Count total documents before ingestion
self._find_all_files_in_folder(folder_path)
self._find_all_files_in_folder(folder_path, ignored)
self._ingest_all(self._files_under_root_folder)
def _ingest_all(self, files_to_ingest: list[Path]) -> None:
@@ -64,12 +64,19 @@ parser.add_argument(
action=argparse.BooleanOptionalAction,
default=False,
)
parser.add_argument(
"--ignored",
nargs="*",
help="List of files/directories to ignore",
default=[],
)
parser.add_argument(
"--log-file",
help="Optional path to a log file. If provided, logs will be written to this file.",
type=str,
default=None,
)
args = parser.parse_args()
# Set up logging to a file if a path is provided
@@ -91,9 +98,17 @@ if __name__ == "__main__":
ingest_service = global_injector.get(IngestService)
worker = LocalIngestWorker(ingest_service)
worker.ingest_folder(root_path)
worker.ingest_folder(root_path, args.ignored)
if args.ignored:
logger.info(f"Skipping following files and directories: {args.ignored}")
if args.watch:
logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
directories_to_watch = [
dir
for dir in root_path.iterdir()
if dir.is_dir() and dir.name not in args.ignored
]
watcher = IngestWatcher(args.folder, worker.ingest_on_watch)
watcher.start()