From 530d473667457e7d9ff5641856b80fb97a886484 Mon Sep 17 00:00:00 2001
From: Fred Hebert
Date: Sat, 6 Apr 2024 19:15:32 +0000
Subject: [PATCH] do streaming file hashing on disk backing for large files

---
 apps/revault/src/revault_file_disk.erl        | 33 +++++++++++++++++--
 apps/revault/test/revault_file_disk_SUITE.erl | 25 +++++++++++++-
 2 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/apps/revault/src/revault_file_disk.erl b/apps/revault/src/revault_file_disk.erl
index 0be9cf5..4b20e7b 100644
--- a/apps/revault/src/revault_file_disk.erl
+++ b/apps/revault/src/revault_file_disk.erl
@@ -1,6 +1,7 @@
 -module(revault_file_disk).
 
 -include_lib("kernel/include/file.hrl").
+-compile({no_auto_import,[size/1]}).
 
 -export([hash/1, hash_bin/1,
          copy/2,
@@ -20,9 +21,26 @@
 %% it is SHA256.
 -spec hash(file:filename_all()) -> hash().
 hash(Path) ->
-    %% TODO: support large files on this too
-    {ok, Bin} = read_file(Path),
-    hash_bin(Bin).
+    {ok, Size} = size(Path),
+    case application:get_env(revault, multipart_size) of
+        {ok, Threshold} when is_integer(Threshold), Threshold > 0,
+                             Size > Threshold ->
+            %% Too large to load at once: stream it through an incremental
+            %% SHA256 state, Threshold bytes at a time.
+            {ok, Fd} = file:open(Path, [read, raw, binary]),
+            try
+                HashState = crypto:hash_init(sha256),
+                NewHashState = hash_fd(Fd, 0, Size, Threshold, HashState),
+                crypto:hash_final(NewHashState)
+            after
+                %% close the descriptor even when a read or match crashes
+                file:close(Fd)
+            end;
+        _ ->
+            %% small file, or no usable positive integer threshold is set
+            {ok, Bin} = read_file(Path),
+            hash_bin(Bin)
+    end.
 
 %% @doc takes a binary and computes a hash for it as used to track changes
 %% and validate payloads in ReVault. This hash is not guaranteed to be
@@ -252,3 +270,12 @@ detect_tmpdir() ->
 randname() ->
     float_to_list(rand:uniform()).
 
+%% Folds the bytes from Offset up to Size of the file open on Fd into
+%% HashState, reading at most Threshold bytes per call to file:pread/3.
+hash_fd(_Fd, Offset, Size, _Threshold, HashState) when Offset =:= Size ->
+    HashState;
+hash_fd(Fd, Offset, Size, Threshold, HashState) when Offset < Size ->
+    Bytes = min(Size-Offset, Threshold),
+    {ok, Bin} = file:pread(Fd, Offset, Bytes),
+    hash_fd(Fd, Offset+Bytes, Size, Threshold,
+            crypto:hash_update(HashState, Bin)).
diff --git a/apps/revault/test/revault_file_disk_SUITE.erl b/apps/revault/test/revault_file_disk_SUITE.erl
index cc36f06..0832894 100644
--- a/apps/revault/test/revault_file_disk_SUITE.erl
+++ b/apps/revault/test/revault_file_disk_SUITE.erl
@@ -4,7 +4,8 @@
 -include_lib("common_test/include/ct.hrl").
 
 all() ->
-    [read_range, multipart, multipart_hash].
+    [read_range, multipart, multipart_hash,
+     hash_large_file].
 
 read_range() ->
     [{doc, "general checks on reading subsets of files"}].
@@ -69,3 +70,25 @@ multipart_hash(Config) ->
     ?assertError(invalid_hash,
                  revault_file_disk:multipart_final(State, File, Parts, Hash)),
     ok.
+
+hash_large_file() ->
+    [{doc, "hashing large files can be done in an iterative manner"}].
+hash_large_file(Config) ->
+    WidthBytes = 100,
+    WidthBits = 8*WidthBytes,
+    %% ten full-width parts plus one trailing byte: 1001 bytes in total
+    Bin = <<0:WidthBits, 1:WidthBits, 2:WidthBits, 3:WidthBits, 4:WidthBits,
+            5:WidthBits, 6:WidthBits, 7:WidthBits, 8:WidthBits, 9:WidthBits, 10>>,
+    File = filename:join([?config(priv_dir, Config), "multipart.scratch"]),
+    ok = file:write_file(File, Bin),
+    Hash = revault_file_disk:hash_bin(Bin),
+    %% force the streaming path by lowering the threshold, then restore it
+    OldSize = application:get_env(revault, multipart_size),
+    application:set_env(revault, multipart_size, WidthBytes),
+    HashDisk = revault_file_disk:hash(File),
+    case OldSize of
+        undefined -> application:unset_env(revault, multipart_size);
+        {ok, Val} -> application:set_env(revault, multipart_size, Val)
+    end,
+    ?assertEqual(Hash, HashDisk),
+    ok.