diff --git a/lib/file_storage/disk.rb b/lib/file_storage/disk.rb index f976914..16543f4 100644 --- a/lib/file_storage/disk.rb +++ b/lib/file_storage/disk.rb @@ -33,18 +33,20 @@ def download(bucket:, key:) end end - def list(bucket:, key:) + def list(bucket:, key:, page_size:) root = Pathname.new(bucket_root(bucket)) - matching_keys = Dir["#{root}/**/*"]. + Dir["#{root}/**/*"]. reject { |absolute_path| File.directory?(absolute_path) }. map { |full_path| Pathname.new(full_path).relative_path_from(root).to_s }. - select { |f| f.start_with?(key) } - - { - bucket: bucket, - keys: matching_keys, - } + select { |f| f.start_with?(key) }. + each_slice(page_size). + map do |keys| + { + bucket: bucket, + keys: keys, + } + end.to_enum end def delete!(bucket:, key:) diff --git a/lib/file_storage/gcs.rb b/lib/file_storage/gcs.rb index 3dcdf8f..4d8c969 100644 --- a/lib/file_storage/gcs.rb +++ b/lib/file_storage/gcs.rb @@ -42,12 +42,22 @@ def download(bucket:, key:) } end - def list(bucket:, key:) - matching_keys = get_bucket(bucket).files(prefix: key).map(&:name) - { - bucket: bucket, - keys: matching_keys, - } + def list(bucket:, key:, page_size:) + Enumerator.new do |yielder| + token = nil + + loop do + page = get_bucket(bucket).files(prefix: key, max: page_size, token: token) + yielder.yield({ + bucket: bucket, + keys: page.map(&:name), + }) + + break if page.token.nil? + + token = page.token + end + end end def delete!(bucket:, key:) diff --git a/lib/file_storage/in_memory.rb b/lib/file_storage/in_memory.rb index 47fb9fe..58759a2 100644 --- a/lib/file_storage/in_memory.rb +++ b/lib/file_storage/in_memory.rb @@ -41,12 +41,16 @@ def download(bucket:, key:) } end - def list(bucket:, key:) - matching_keys = @buckets[bucket].keys.select { |k| k.start_with?(key) } - { - bucket: bucket, - keys: matching_keys, - } + def list(bucket:, key:, page_size:) + @buckets[bucket].keys. + select { |k| k.start_with?(key) }. + each_slice(page_size). + map do |keys| + { + bucket: bucket, + keys: keys, + } + end.to_enum end def delete!(bucket:, key:) diff --git a/lib/file_storage/key_storage.rb b/lib/file_storage/key_storage.rb index 8666ad4..c8180ae 100644 --- a/lib/file_storage/key_storage.rb +++ b/lib/file_storage/key_storage.rb @@ -81,27 +81,44 @@ def upload!(content) # Lists all keys for the current adapter that have the reference key as prefix # - # This will return a list of valid keys in the format of `adapter://bucket/key`. The keys in - # the list will share the reference key as a prefix. + # Internally, this method will paginate through the result set. The default page size + # for the underlying adapter can be controlled via the `page_size` argument. # - # @return [Array] A list of keys in the format of `adapter://bucket/key` + # This will return a enumerator of valid keys in the format of `adapter://bucket/key`. + # The keys in the list will share the reference key as a prefix. Underlying adapters will + # paginate the result set as the enumerable is consumed. The number of items per page + # can be controlled by the `page_size` argument. # - # @example List all files under a given prefix - # FileStorage.for("inmemory://bucket/prefix").list - def list + # @param [Integer] page_size + # the max number of items to fetch for each page of results + def list(page_size: 1000) FileStorage.logger.info(event: "key_storage.list_started") start = FileStorage::Timing.monotonic_now - result = adapter.list( + pages = adapter.list( bucket: bucket, key: key, + page_size: page_size, ) - FileStorage.logger.info(resource_count: result[:keys].count, - event: "key_storage.list_finished", - duration: FileStorage::Timing.monotonic_now - start) - - result[:keys].map { |key| "#{adapter_type}://#{result[:bucket]}/#{key}" } + page_count = 0 + Enumerator.new do |yielder| + pages.each do |page| + page_count += 1 + keys = page.fetch(:keys, []).map { |key| "#{adapter_type}://#{page[:bucket]}/#{key}" } + + FileStorage.logger.info( + event: "key_storage.list_page_fetched", + resource_count: keys.count, + page: page_count, + duration: FileStorage::Timing.monotonic_now - start, + ) + + keys.each do |key| + yielder.yield(key) + end + end + end end # Deletes the referenced key. diff --git a/spec/file_storage/disk_spec.rb b/spec/file_storage/disk_spec.rb index e3663e0..c9b2ab3 100644 --- a/spec/file_storage/disk_spec.rb +++ b/spec/file_storage/disk_spec.rb @@ -95,10 +95,7 @@ describe "#list" do context "when the bucket is empty" do it "returns an empty list" do - expect(instance.list(bucket: bucket, key: "whatever")).to eq( - bucket: bucket, - keys: [], - ) + expect(instance.list(bucket: bucket, key: "whatever", page_size: 1000).to_a).to eq([]) end end @@ -113,7 +110,7 @@ context "and we provide a matching prefix" do it "returns only the matching items" do - expect(instance.list(bucket: bucket, key: "2019-01")).to match( + expect(instance.list(bucket: bucket, key: "2019-01", page_size: 1000).first).to match( bucket: bucket, keys: match_array(%w[2019-01/hello1 2019-01/hello2 2019-01/hello3]), ) @@ -122,10 +119,28 @@ context "when the prefix doesn't match anything" do it "returns an empty list" do - expect(instance.list(bucket: bucket, key: "YOLO")).to match( - bucket: bucket, - keys: [], - ) + expect(instance.list(bucket: bucket, key: "YOLO", page_size: 1000).to_a).to eq([]) + end + end + + it "returns a subset of the matching keys" do + expect(instance.list(bucket: bucket, key: "2019-01", page_size: 2).first).to match( + bucket: bucket, + keys: have_attributes(length: 2), + ) + end + + context "when there are multiple pages of results available" do + it "returns an enumerable" do + expect(instance.list(bucket: bucket, key: "2019-01", page_size: 1)). + to be_a_kind_of(Enumerable) + end + + it "enumerates through all the pages" do + expect(instance.list(bucket: bucket, key: "2019-01", page_size: 2).to_a).to match_array([ + { bucket: bucket, keys: have_attributes(length: 2) }, + { bucket: bucket, keys: have_attributes(length: 1) }, + ]) end end end diff --git a/spec/file_storage/in_memory_spec.rb b/spec/file_storage/in_memory_spec.rb index aa5feba..6588143 100644 --- a/spec/file_storage/in_memory_spec.rb +++ b/spec/file_storage/in_memory_spec.rb @@ -55,11 +55,8 @@ end context "but we try to fetch it from a different bucket" do - it "raises an error" do - expect(instance.list(bucket: bucket, key: "whatever")).to eq( - bucket: bucket, - keys: [], - ) + it "returns an empty list" do + expect(instance.list(bucket: bucket, key: "whatever", page_size: 1000).to_a).to eq([]) end end end @@ -68,10 +65,7 @@ describe "#list" do context "when the bucket is empty" do it "returns an empty list" do - expect(instance.list(bucket: bucket, key: "whatever")).to eq( - bucket: bucket, - keys: [], - ) + expect(instance.list(bucket: bucket, key: "whatever", page_size: 1000).to_a).to eq([]) end end @@ -86,7 +80,7 @@ context "and we provide a matching prefix" do it "returns only the matching items" do - expect(instance.list(bucket: bucket, key: "2019-01")).to match( + expect(instance.list(bucket: bucket, key: "2019-01", page_size: 1000).first).to match( bucket: bucket, keys: match_array(%w[2019-01/hello1 2019-01/hello2 2019-01/hello3]), ) @@ -95,12 +89,32 @@ context "when the prefix doesn't match anything" do it "returns an empty list" do - expect(instance.list(bucket: bucket, key: "YOLO")).to match( + expect(instance.list(bucket: bucket, key: "YOLO", page_size: 1000).to_a).to eq([]) + end + end + + context "and we request fewer keys than they are available" do + it "returns a subset of the matching keys" do + expect(instance.list(bucket: bucket, key: "2019-01", page_size: 2).first).to match( bucket: bucket, - keys: [], + keys: have_attributes(length: 2), ) end end + + context "when there are multiple pages of results available" do + it "returns an enumerable" do + expect(instance.list(bucket: bucket, key: "2019-01", page_size: 1)). + to be_a_kind_of(Enumerable) + end + + it "enumerates through all the pages" do + expect(instance.list(bucket: bucket, key: "2019-01", page_size: 2).to_a).to match_array([ + { bucket: bucket, keys: have_attributes(length: 2) }, + { bucket: bucket, keys: have_attributes(length: 1) }, + ]) + end + end end end diff --git a/spec/file_storage/key_storage_spec.rb b/spec/file_storage/key_storage_spec.rb index 181046f..f111b2d 100644 --- a/spec/file_storage/key_storage_spec.rb +++ b/spec/file_storage/key_storage_spec.rb @@ -46,10 +46,10 @@ def build_for(key) hash_including(event: "key_storage.list_started"), ) expect(FileStorage.logger).to receive(:info).with( - hash_including(event: "key_storage.list_finished"), + hash_including(event: "key_storage.list_page_fetched"), ) - build_for("inmemory://bucket").list + build_for("inmemory://bucket").list.to_a end context "but the URI does not have a trailing /" do