From df591aa8c23575d77124425b76153886a43a1d89 Mon Sep 17 00:00:00 2001 From: feiniks <36756310+feiniks@users.noreply.github.com> Date: Sat, 12 Feb 2022 09:57:53 +0800 Subject: [PATCH] Add --rm-fs option for gc (#541) * Add --rm-fs option for gc * Use local variables and print fs number * Modify return value to gint64 --- server/gc/gc-core.c | 216 +++++++++++++++++++++++++++++++--------- server/gc/gc-core.h | 2 +- server/gc/seafserv-gc.c | 10 +- 3 files changed, 177 insertions(+), 51 deletions(-) diff --git a/server/gc/gc-core.c b/server/gc/gc-core.c index 891c884..c3a8e7a 100644 --- a/server/gc/gc-core.c +++ b/server/gc/gc-core.c @@ -12,11 +12,6 @@ #define MAX_BF_SIZE (((size_t)1) << 29) /* 64 MB */ -/* Total number of blocks to be scanned. */ -static guint64 total_blocks; -static guint64 removed_blocks; -static guint64 reachable_blocks; - /* * The number of bits in the bloom filter is 4 times the number of all blocks. * Let m be the bits in the bf, n be the number of blocks to be added to the bf @@ -36,11 +31,11 @@ static guint64 reachable_blocks; * So we set the minimal size of the bf to 1KB. */ static Bloom * -alloc_gc_index () +alloc_gc_index (guint64 total_objs) { size_t size; - size = (size_t) MAX(total_blocks << 2, 1 << 13); + size = (size_t) MAX(total_objs << 2, 1 << 13); size = MIN (size, MAX_BF_SIZE); seaf_message ("GC index size is %u Byte.\n", (int)size >> 3); @@ -50,7 +45,8 @@ alloc_gc_index () typedef struct { SeafRepo *repo; - Bloom *index; + Bloom *blocks_index; + Bloom *fs_index; GHashTable *visited; /* > 0: keep a period of history; @@ -71,7 +67,7 @@ static int add_blocks_to_index (SeafFSManager *mgr, GCData *data, const char *file_id) { SeafRepo *repo = data->repo; - Bloom *index = data->index; + Bloom *blocks_index = data->blocks_index; Seafile *seafile; int i; @@ -82,7 +78,7 @@ add_blocks_to_index (SeafFSManager *mgr, GCData *data, const char *file_id) } for (i = 0; i < seafile->n_blocks; ++i) { - bloom_add (index, seafile->blk_sha1s[i]); + bloom_add (blocks_index, seafile->blk_sha1s[i]); ++data->traversed_blocks; } @@ -91,6 +87,16 @@ add_blocks_to_index (SeafFSManager *mgr, GCData *data, const char *file_id) return 0; } +static void +add_fs_to_index(GCData *data, const char *file_id) +{ + Bloom *fs_index = data->fs_index; + if (fs_index) { + bloom_add (fs_index, file_id); + } + ++(data->traversed_fs_objs); +} + static gboolean fs_callback (SeafFSManager *mgr, const char *store_id, @@ -112,7 +118,7 @@ fs_callback (SeafFSManager *mgr, g_hash_table_replace (data->visited, key, key); } - ++(data->traversed_fs_objs); + add_fs_to_index(data, obj_id); if (type == SEAF_METADATA_TYPE_FILE && add_blocks_to_index (mgr, data, obj_id) < 0) @@ -169,8 +175,8 @@ traverse_commit (SeafCommit *commit, void *vdata, gboolean *stop) return TRUE; } -static int -populate_gc_index_for_repo (SeafRepo *repo, Bloom *index, int verbose) +static gint64 +populate_gc_index_for_repo (SeafRepo *repo, Bloom *blocks_index, Bloom *fs_index, int verbose) { GList *branches, *ptr; SeafBranch *branch; @@ -190,7 +196,8 @@ populate_gc_index_for_repo (SeafRepo *repo, Bloom *index, int verbose) data = g_new0(GCData, 1); data->repo = repo; - data->index = index; + data->blocks_index = blocks_index; + data->fs_index = fs_index; data->visited = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL); data->verbose = verbose; @@ -232,7 +239,7 @@ populate_gc_index_for_repo (SeafRepo *repo, Bloom *index, int verbose) seaf_message ("Traversed %d commits, %"G_GINT64_FORMAT" blocks.\n", data->traversed_commits, data->traversed_blocks); - reachable_blocks += data->traversed_blocks; + ret = data->traversed_blocks; g_list_free (branches); g_hash_table_destroy (data->visited); @@ -244,6 +251,7 @@ populate_gc_index_for_repo (SeafRepo *repo, Bloom *index, int verbose) typedef struct { Bloom *index; int dry_run; + guint64 removed_blocks; } CheckBlocksData; static gboolean @@ -254,7 +262,7 @@ check_block_liveness (const char *store_id, int version, Bloom *index = data->index; if (!bloom_test (index, block_id)) { - ++removed_blocks; + data->removed_blocks++; if (!data->dry_run) seaf_block_manager_remove_block (seaf->block_mgr, store_id, version, @@ -264,13 +272,52 @@ check_block_liveness (const char *store_id, int version, return TRUE; } -static int -populate_gc_index_for_virtual_repos (SeafRepo *repo, Bloom *index, int verbose) +#define MAX_THREADS 10 + +static gint64 +check_existing_fs (char *store_id, int repo_version, GHashTable *exist_fs, + Bloom *fs_index, int dry_run) +{ + GHashTableIter iter; + gpointer key, value; + gint64 ret = 0; + + g_hash_table_iter_init (&iter, exist_fs); + + while (g_hash_table_iter_next (&iter, &key, &value)) { + if (!bloom_test (fs_index, (char *)key)) { + ret++; + if (dry_run) + continue; + seaf_fs_manager_delete_object(seaf->fs_mgr, + store_id, repo_version, + (char *)key); + } + } + + return ret; +} + +static gboolean +collect_exist_fs (const char *store_id, int version, + const char *fs_id, void *vdata) +{ + GHashTable *exist_fs = vdata; + int dummy; + + g_hash_table_replace (exist_fs, g_strdup (fs_id), &dummy); + + return TRUE; +} + +static gint64 +populate_gc_index_for_virtual_repos (SeafRepo *repo, Bloom *blocks_index, Bloom *fs_index, int verbose) { GList *vrepo_ids = NULL, *ptr; char *repo_id; SeafRepo *vrepo; - int ret = 0; + gint64 ret = 0; + gint64 scan_ret = 0; vrepo_ids = seaf_repo_manager_get_virtual_repo_ids_by_origin (seaf->repo_mgr, repo->id); @@ -283,10 +330,13 @@ populate_gc_index_for_virtual_repos (SeafRepo *repo, Bloom *index, int verbose) goto out; } - ret = populate_gc_index_for_repo (vrepo, index, verbose); + scan_ret = populate_gc_index_for_repo (vrepo, blocks_index, fs_index, verbose); seaf_repo_unref (vrepo); - if (ret < 0) + if (scan_ret < 0) { + ret = -1; goto out; + } + ret += scan_ret; } out: @@ -294,15 +344,21 @@ out: return ret; } -int -gc_v1_repo (SeafRepo *repo, int dry_run, int verbose) +gint64 +gc_v1_repo (SeafRepo *repo, int dry_run, int verbose, int rm_fs) { - Bloom *index; - int ret; + Bloom *blocks_index = NULL; + Bloom *fs_index = NULL; + GHashTable *exist_fs = NULL; + guint64 total_blocks; + guint64 removed_blocks; + guint64 reachable_blocks; + guint64 total_fs = 0; + gint64 removed_fs = 0; + gint64 ret; total_blocks = seaf_block_manager_get_block_number (seaf->block_mgr, repo->store_id, repo->version); - removed_blocks = 0; reachable_blocks = 0; if (total_blocks == 0) { @@ -310,7 +366,25 @@ gc_v1_repo (SeafRepo *repo, int dry_run, int verbose) return 0; } - seaf_message ("GC started. Total block number is %"G_GUINT64_FORMAT".\n", total_blocks); + if (rm_fs) { + exist_fs = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL); + ret = seaf_obj_store_foreach_obj (seaf->fs_mgr->obj_store, + repo->store_id, repo->version, + collect_exist_fs, + exist_fs); + if (ret < 0) { + seaf_warning ("Failed to collect existing fs for repo %.8s, stop GC.\n\n", + repo->id); + goto out; + } + + total_fs = g_hash_table_size (exist_fs); + } + + if (rm_fs) + seaf_message ("GC started. Total block number is %"G_GUINT64_FORMAT", total fs number is %"G_GUINT64_FORMAT".\n", total_blocks, total_fs); + else + seaf_message ("GC started. Total block number is %"G_GUINT64_FORMAT".\n", total_blocks); /* * Store the index of live blocks in bloom filter to save memory. @@ -318,33 +392,49 @@ gc_v1_repo (SeafRepo *repo, int dry_run, int verbose) * may skip some garbage blocks, but we won't delete * blocks that are still alive. */ - index = alloc_gc_index (); - if (!index) { - seaf_warning ("GC: Failed to allocate index.\n"); - return -1; + blocks_index = alloc_gc_index (total_blocks); + if (!blocks_index) { + seaf_warning ("GC: Failed to allocate blocks_index.\n"); + ret = -1; + goto out; + } + + if (rm_fs && total_fs > 0) { + fs_index = alloc_gc_index (total_fs); + if (!fs_index) { + seaf_warning ("GC: Failed to allocate fs index for repo %.8s, stop gc.\n", + repo->id); + ret = -1; + goto out; + } } seaf_message ("Populating index.\n"); - ret = populate_gc_index_for_repo (repo, index, verbose); + ret = populate_gc_index_for_repo (repo, blocks_index, fs_index, verbose); if (ret < 0) goto out; + + reachable_blocks += ret; /* Since virtual repos share fs and block store with the origin repo, * it's necessary to do GC for them together. */ - ret = populate_gc_index_for_virtual_repos (repo, index, verbose); + ret = populate_gc_index_for_virtual_repos (repo, blocks_index, fs_index, verbose); if (ret < 0) goto out; + reachable_blocks += ret; + if (!dry_run) seaf_message ("Scanning and deleting unused blocks.\n"); else seaf_message ("Scanning unused blocks.\n"); CheckBlocksData data; - data.index = index; + data.index = blocks_index; data.dry_run = dry_run; + data.removed_blocks = 0; ret = seaf_block_manager_foreach_block (seaf->block_mgr, repo->store_id, repo->version, @@ -355,23 +445,53 @@ gc_v1_repo (SeafRepo *repo, int dry_run, int verbose) goto out; } + removed_blocks = data.removed_blocks; ret = removed_blocks; - if (!dry_run) - seaf_message ("GC finished. %"G_GUINT64_FORMAT" blocks total, " - "about %"G_GUINT64_FORMAT" reachable blocks, " - "%"G_GUINT64_FORMAT" blocks are removed.\n", - total_blocks, reachable_blocks, removed_blocks); - else - seaf_message ("GC finished. %"G_GUINT64_FORMAT" blocks total, " - "about %"G_GUINT64_FORMAT" reachable blocks, " - "%"G_GUINT64_FORMAT" blocks can be removed.\n", - total_blocks, reachable_blocks, removed_blocks); + if (rm_fs && total_fs > 0) { + removed_fs = check_existing_fs(repo->store_id, repo->version, exist_fs, + fs_index, dry_run); + if (removed_fs < 0) { + goto out; + } + } + + if (!dry_run) { + if (rm_fs) + seaf_message ("GC finished for repo %.8s. %"G_GUINT64_FORMAT" blocks total, " + "about %"G_GUINT64_FORMAT" reachable blocks, " + "%"G_GUINT64_FORMAT" blocks are removed. " + "%"G_GUINT64_FORMAT" fs are removed.\n", + repo->id, total_blocks, reachable_blocks, removed_blocks, removed_fs); + else + seaf_message ("GC finished. %"G_GUINT64_FORMAT" blocks total, " + "about %"G_GUINT64_FORMAT" reachable blocks, " + "%"G_GUINT64_FORMAT" blocks are removed.\n", + total_blocks, reachable_blocks, removed_blocks); + } else { + if (rm_fs) + seaf_message ("GC finished for repo %.8s. %"G_GUINT64_FORMAT" blocks total, " + "about %"G_GUINT64_FORMAT" reachable blocks, " + "%"G_GUINT64_FORMAT" blocks can be removed. " + "%"G_GUINT64_FORMAT" fs can be removed.\n", + repo->id, total_blocks, reachable_blocks, removed_blocks, removed_fs); + else + seaf_message ("GC finished. %"G_GUINT64_FORMAT" blocks total, " + "about %"G_GUINT64_FORMAT" reachable blocks, " + "%"G_GUINT64_FORMAT" blocks can be removed.\n", + total_blocks, reachable_blocks, removed_blocks); + } out: printf ("\n"); - bloom_destroy (index); + if (exist_fs) + g_hash_table_destroy (exist_fs); + + if (blocks_index) + bloom_destroy (blocks_index); + if (fs_index) + bloom_destroy (fs_index); return ret; } @@ -406,14 +526,14 @@ delete_garbaged_repos (int dry_run) } int -gc_core_run (GList *repo_id_list, int dry_run, int verbose) +gc_core_run (GList *repo_id_list, int dry_run, int verbose, int rm_fs) { GList *ptr; SeafRepo *repo; GList *corrupt_repos = NULL; GList *del_block_repos = NULL; gboolean del_garbage = FALSE; - int gc_ret; + gint64 gc_ret; char *repo_id; if (repo_id_list == NULL) { @@ -438,7 +558,7 @@ gc_core_run (GList *repo_id_list, int dry_run, int verbose) if (!repo->is_virtual) { seaf_message ("GC version %d repo %s(%s)\n", repo->version, repo->name, repo->id); - gc_ret = gc_v1_repo (repo, dry_run, verbose); + gc_ret = gc_v1_repo (repo, dry_run, verbose, rm_fs); if (gc_ret < 0) { corrupt_repos = g_list_prepend (corrupt_repos, g_strdup(repo->id)); } else if (dry_run && gc_ret) { diff --git a/server/gc/gc-core.h b/server/gc/gc-core.h index 88a1f1f..b4e9b21 100644 --- a/server/gc/gc-core.h +++ b/server/gc/gc-core.h @@ -1,7 +1,7 @@ #ifndef GC_CORE_H #define GC_CORE_H -int gc_core_run (GList *repo_id_list, int dry_run, int verbose); +int gc_core_run (GList *repo_id_list, int dry_run, int verbose, int rm_fs); void delete_garbaged_repos (int dry_run); diff --git a/server/gc/seafserv-gc.c b/server/gc/seafserv-gc.c index 951bec3..d816eac 100644 --- a/server/gc/seafserv-gc.c +++ b/server/gc/seafserv-gc.c @@ -15,7 +15,7 @@ static char *central_config_dir = NULL; SeafileSession *seaf; -static const char *short_opts = "hvc:d:VDrF:"; +static const char *short_opts = "hvc:d:VDrRF:"; static const struct option long_opts[] = { { "help", no_argument, NULL, 'h', }, { "version", no_argument, NULL, 'v', }, @@ -25,6 +25,7 @@ static const struct option long_opts[] = { { "verbose", no_argument, NULL, 'V' }, { "dry-run", no_argument, NULL, 'D' }, { "rm-deleted", no_argument, NULL, 'r' }, + { "rm-fs", no_argument, NULL, 'R' }, { 0, 0, 0, 0 }, }; @@ -35,6 +36,7 @@ static void usage () "[repo_id_1 [repo_id_2 ...]]\n" "Additional options:\n" "-r, --rm-deleted: remove garbaged repos\n" + "-R, --rm-fs: remove fs object\n" "-D, --dry-run: report blocks that can be remove, but not remove them\n" "-V, --verbose: verbose output messages\n"); } @@ -72,6 +74,7 @@ main(int argc, char *argv[]) int verbose = 0; int dry_run = 0; int rm_garbage = 0; + int rm_fs = 0; #ifdef WIN32 argv = get_argv_utf8 (&argc); @@ -106,6 +109,9 @@ main(int argc, char *argv[]) case 'r': rm_garbage = 1; break; + case 'R': + rm_fs = 1; + break; default: usage(); exit(-1); @@ -140,7 +146,7 @@ main(int argc, char *argv[]) for (i = optind; i < argc; i++) repo_id_list = g_list_append (repo_id_list, g_strdup(argv[i])); - gc_core_run (repo_id_list, dry_run, verbose); + gc_core_run (repo_id_list, dry_run, verbose, rm_fs); return 0; }