Pass the last paging flag to storage drivers

Storage drivers may be able to take advantage of the hint to start
their walk more efficiently.

For S3: The API takes a start-after parameter. Registries with many
repositories can drastically reduce calls to s3 by telling s3 to only
list results lexographically after the last parameter.

For the fallback: We can start deeper in the tree and avoid statting
the files and directories before the hint in a walk. For a filesystem
this improves performance a little, but many of the API based drivers
are currently treated like a filesystem, so this drastically improves
the performance of GCP and Azure blob.

Signed-off-by: James Hewitt <james.hewitt@uk.ibm.com>
This commit is contained in:
James Hewitt
2022-07-10 03:04:50 +01:00
parent 3a44c2e10e
commit e22f7cbc73
13 changed files with 390 additions and 63 deletions

View File

@@ -1040,21 +1040,21 @@ func (d *driver) URLFor(ctx context.Context, path string, options map[string]int
// Walk traverses a filesystem defined within driver, starting
// from the given path, calling f on each file
func (d *driver) Walk(ctx context.Context, from string, f storagedriver.WalkFn) error {
var objectCount int64
if err := d.doWalk(ctx, &objectCount, from, f); err != nil {
return err
func (d *driver) Walk(ctx context.Context, from string, f storagedriver.WalkFn, options ...func(*storagedriver.WalkOptions)) error {
walkOptions := &storagedriver.WalkOptions{}
for _, o := range options {
o(walkOptions)
}
// S3 doesn't have the concept of empty directories, so it'll return path not found if there are no objects
if objectCount == 0 {
return storagedriver.PathNotFoundError{Path: from}
var objectCount int64
if err := d.doWalk(ctx, &objectCount, from, walkOptions.StartAfterHint, f); err != nil {
return err
}
return nil
}
func (d *driver) doWalk(parentCtx context.Context, objectCount *int64, from string, f storagedriver.WalkFn) error {
func (d *driver) doWalk(parentCtx context.Context, objectCount *int64, from string, startAfter string, f storagedriver.WalkFn) error {
var (
retError error
// the most recent directory walked for de-duping
@@ -1075,13 +1075,14 @@ func (d *driver) doWalk(parentCtx context.Context, objectCount *int64, from stri
}
listObjectsInput := &s3.ListObjectsV2Input{
Bucket: aws.String(d.Bucket),
Prefix: aws.String(d.s3Path(path)),
MaxKeys: aws.Int64(listMax),
Bucket: aws.String(d.Bucket),
Prefix: aws.String(d.s3Path(path)),
MaxKeys: aws.Int64(listMax),
StartAfter: aws.String(d.s3Path(startAfter)),
}
ctx, done := dcontext.WithTrace(parentCtx)
defer done("s3aws.ListObjectsV2Pages(%s)", path)
defer done("s3aws.ListObjectsV2PagesWithContext(%s)", listObjectsInput)
// When the "delimiter" argument is omitted, the S3 list API will list all objects in the bucket
// recursively, omitting directory paths. Objects are listed in sorted, depth-first order so we
@@ -1133,11 +1134,10 @@ func (d *driver) doWalk(parentCtx context.Context, objectCount *int64, from stri
if err != nil {
if err == storagedriver.ErrSkipDir {
if walkInfo.IsDir() {
prevSkipDir = walkInfo.Path()
continue
}
// is file, stop gracefully
prevSkipDir = walkInfo.Path()
continue
}
if err == storagedriver.ErrFilledBuffer {
return false
}
retError = err
@@ -1187,7 +1187,7 @@ func directoryDiff(prev, current string) []string {
parent := current
for {
parent = filepath.Dir(parent)
if parent == "/" || parent == prev || strings.HasPrefix(prev, parent) {
if parent == "/" || parent == prev || strings.HasPrefix(prev+"/", parent+"/") {
break
}
paths = append(paths, parent)

View File

@@ -491,6 +491,7 @@ func TestWalk(t *testing.T) {
fileset := []string{
"/file1",
"/folder1-suffix/file1",
"/folder1/file1",
"/folder2/file1",
"/folder3/subfolder1/subfolder1/file1",
@@ -524,18 +525,23 @@ func TestWalk(t *testing.T) {
}
}()
noopFn := func(fileInfo storagedriver.FileInfo) error { return nil }
tcs := []struct {
name string
fn storagedriver.WalkFn
from string
options []func(*storagedriver.WalkOptions)
expected []string
err bool
}{
{
name: "walk all",
fn: func(fileInfo storagedriver.FileInfo) error { return nil },
fn: noopFn,
expected: []string{
"/file1",
"/folder1-suffix",
"/folder1-suffix/file1",
"/folder1",
"/folder1/file1",
"/folder2",
@@ -564,6 +570,8 @@ func TestWalk(t *testing.T) {
},
expected: []string{
"/file1",
"/folder1-suffix",
"/folder1-suffix/file1",
"/folder1",
"/folder1/file1",
"/folder2",
@@ -574,22 +582,101 @@ func TestWalk(t *testing.T) {
"/folder4/file1",
},
},
{
name: "start late without from",
fn: noopFn,
options: []func(*storagedriver.WalkOptions){
storagedriver.WithStartAfterHint("/folder3/subfolder1/subfolder1/file1"),
},
expected: []string{
// start late
"/folder3",
"/folder3/subfolder2",
"/folder3/subfolder2/subfolder1",
"/folder3/subfolder2/subfolder1/file1",
"/folder4",
"/folder4/file1",
},
err: false,
},
{
name: "start late with from",
fn: noopFn,
from: "/folder3",
options: []func(*storagedriver.WalkOptions){
storagedriver.WithStartAfterHint("/folder3/subfolder1/subfolder1/file1"),
},
expected: []string{
// start late
"/folder3/subfolder2",
"/folder3/subfolder2/subfolder1",
"/folder3/subfolder2/subfolder1/file1",
},
err: false,
},
{
name: "start after from",
fn: noopFn,
from: "/folder1",
options: []func(*storagedriver.WalkOptions){
storagedriver.WithStartAfterHint("/folder2"),
},
expected: []string{},
err: false,
},
{
name: "start matches from",
fn: noopFn,
from: "/folder3",
options: []func(*storagedriver.WalkOptions){
storagedriver.WithStartAfterHint("/folder3"),
},
expected: []string{
"/folder3/subfolder1",
"/folder3/subfolder1/subfolder1",
"/folder3/subfolder1/subfolder1/file1",
"/folder3/subfolder2",
"/folder3/subfolder2/subfolder1",
"/folder3/subfolder2/subfolder1/file1",
},
err: false,
},
{
name: "start doesn't exist",
fn: noopFn,
from: "/folder3",
options: []func(*storagedriver.WalkOptions){
storagedriver.WithStartAfterHint("/folder3/notafolder/notafile"),
},
expected: []string{
"/folder3/subfolder1",
"/folder3/subfolder1/subfolder1",
"/folder3/subfolder1/subfolder1/file1",
"/folder3/subfolder2",
"/folder3/subfolder2/subfolder1",
"/folder3/subfolder2/subfolder1/file1",
},
err: false,
},
{
name: "stop early",
fn: func(fileInfo storagedriver.FileInfo) error {
if fileInfo.Path() == "/folder1/file1" {
return storagedriver.ErrSkipDir
return storagedriver.ErrFilledBuffer
}
return nil
},
expected: []string{
"/file1",
"/folder1-suffix",
"/folder1-suffix/file1",
"/folder1",
"/folder1/file1",
// stop early
},
err: false,
},
{
name: "error",
fn: func(fileInfo storagedriver.FileInfo) error {
@@ -602,7 +689,7 @@ func TestWalk(t *testing.T) {
},
{
name: "from folder",
fn: func(fileInfo storagedriver.FileInfo) error { return nil },
fn: noopFn,
expected: []string{
"/folder1/file1",
},
@@ -619,7 +706,7 @@ func TestWalk(t *testing.T) {
err := drvr.Walk(context.Background(), tc.from, func(fileInfo storagedriver.FileInfo) error {
walked = append(walked, fileInfo.Path())
return tc.fn(fileInfo)
})
}, tc.options...)
if tc.err && err == nil {
t.Fatalf("expected err")
}