do streaming file hashing on disk backing for large files
ferd committed Apr 6, 2024
1 parent 5fede9f commit 530d473
Showing 2 changed files with 44 additions and 4 deletions.
23 changes: 20 additions & 3 deletions apps/revault/src/revault_file_disk.erl
@@ -1,6 +1,7 @@
 -module(revault_file_disk).

 -include_lib("kernel/include/file.hrl").
+-compile({no_auto_import,[size/1]}).

 -export([hash/1, hash_bin/1,
          copy/2,
@@ -20,9 +21,18 @@
 %% it is SHA256.
 -spec hash(file:filename_all()) -> hash().
 hash(Path) ->
-    %% TODO: support large files on this too
-    {ok, Bin} = read_file(Path),
-    hash_bin(Bin).
+    {ok, Size} = size(Path),
+    case application:get_env(revault, multipart_size) of
+        {ok, Threshold} when Size > Threshold ->
+            {ok, Fd} = file:open(Path, [read, raw, binary]),
+            HashState = crypto:hash_init(sha256),
+            NewHashState = hash_fd(Fd, 0, Size, Threshold, HashState),
+            file:close(Fd),
+            crypto:hash_final(NewHashState);
+        _ ->
+            {ok, Bin} = read_file(Path),
+            hash_bin(Bin)
+    end.

 %% @doc takes a binary and computes a hash for it as used to track changes
 %% and validate payloads in ReVault. This hash is not guaranteed to be
@@ -252,3 +262,10 @@ detect_tmpdir() ->
 randname() ->
     float_to_list(rand:uniform()).

+hash_fd(_Fd, Offset, Size, _Threshold, HashState) when Offset =:= Size ->
+    HashState;
+hash_fd(Fd, Offset, Size, Threshold, HashState) when Offset < Size ->
+    Bytes = min(Size-Offset, Threshold),
+    {ok, Bin} = file:pread(Fd, Offset, Bytes),
+    hash_fd(Fd, Offset+Bytes, Size, Threshold,
+            crypto:hash_update(HashState, Bin)).
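
The streaming path is opt-in through the revault application's multipart_size setting, which also serves as the chunk size passed to hash_fd/5. A minimal sketch of exercising it follows; the 8 MiB threshold and the file path are illustrative values, not anything defined in this commit:

%% Illustrative only: files larger than the configured threshold are
%% hashed chunk by chunk via hash_fd/5; smaller files keep using the
%% read_file/1 + hash_bin/1 path.
ok = application:set_env(revault, multipart_size, 8 * 1024 * 1024),
Hash = revault_file_disk:hash("/tmp/some-large-file"),
32 = byte_size(Hash).   %% raw SHA-256 digest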
25 changes: 24 additions & 1 deletion apps/revault/test/revault_file_disk_SUITE.erl
@@ -4,7 +4,8 @@
-include_lib("common_test/include/ct.hrl").

all() ->
[read_range, multipart, multipart_hash].
[read_range, multipart, multipart_hash,
hash_large_file].

read_range() ->
[{doc, "general checks on reading subsets of files"}].
@@ -69,3 +70,25 @@ multipart_hash(Config) ->
     ?assertError(invalid_hash,
                  revault_file_disk:multipart_final(State, File, Parts, Hash)),
     ok.
+
+hash_large_file() ->
+    [{doc, "hashing large files can be done in an iterative manner"}].
+hash_large_file(Config) ->
+    WidthBytes = 100,
+    WidthBits = 8*WidthBytes,
+    Parts = 11,
+    Bin = <<0:WidthBits, 1:WidthBits, 2:WidthBits, 3:WidthBits, 4:WidthBits,
+            5:WidthBits, 6:WidthBits, 7:WidthBits, 8:WidthBits, 9:WidthBits, 10>>,
+    File = filename:join([?config(priv_dir, Config), "multipart.scratch"]),
+    ok = file:write_file(File, Bin),
+    Hash = revault_file_disk:hash_bin(<<Bin/binary>>),
+    %% do the streaming hash read
+    Multipart = application:get_env(revault, multipart_size),
+    application:set_env(revault, multipart_size, WidthBytes),
+    HashDisk = revault_file_disk:hash(File),
+    case Multipart of
+        undefined -> application:unset_env(revault, multipart_size);
+        {ok, Multipart} -> application:set_env(revault, multipart_size, Multipart)
+    end,
+    ?assertEqual(HashDisk, Hash),
+    ok.
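
The property this test exercises — hashing a payload in chunks with crypto:hash_init/1, crypto:hash_update/2 and crypto:hash_final/1 yields the same digest as hashing the whole binary at once — can also be checked in isolation with a sketch along these lines (payload and chunk sizes are arbitrary):

%% Standalone check of the streaming-hash equivalence, independent of
%% revault_file_disk: ten 100-byte chunks hash to the same SHA-256
%% digest as the 1000-byte payload hashed in a single call.
Payload = crypto:strong_rand_bytes(1000),
Chunks = [binary:part(Payload, Off, 100) || Off <- lists:seq(0, 900, 100)],
Streamed = crypto:hash_final(
             lists:foldl(fun(Chunk, S) -> crypto:hash_update(S, Chunk) end,
                         crypto:hash_init(sha256), Chunks)),
Streamed = crypto:hash(sha256, Payload).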
