Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rename _convert to maybe_encode #64

Merged
merged 3 commits into from
Jul 7, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 33 additions & 15 deletions src/fileio.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ function loadfile(T, file::File)
end

# Load the text file at `file.filename` as a `String` with carriage returns removed, so
# that CRLF and LF line endings compare equal.
# NOTE(review): this span comes from a rendered diff; the `replace(...)` line looks like the
# pre-change body and the `_ignore_crlf(...)` line like its replacement — confirm against the repo.
function loadfile(T, file::TextFile)
replace(read(file.filename, String), "\r"=>"") # ignore CRLF/LF difference
_ignore_crlf(read(file.filename, String)) # last expression, i.e. the value returned
end

function loadfile(::Type{<:Number}, file::File{format"TXT"})
Expand All @@ -24,7 +24,7 @@ function savefile(file::TextFile, content)
write(file.filename, string(content))
end

function query_extended(filename)
function query_extended(filename::AbstractString)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function passes `filename` to `splitext`, which only accepts string types, so I added the annotation as a safety net to make sure nothing strange is passed here.

file, ext = splitext(filename)
# TODO: make this less hacky
if uppercase(ext) == ".SHA256"
Expand All @@ -38,20 +38,30 @@ function query_extended(filename)
res
end

# Some target formats are not supported by FileIO and thus require an encoding/compression step
# before saving. For other formats, we should trust the IO backends and make as few changes as
# possible; otherwise, the reference becomes unfaithful. The encoding step helps make the actual
# data match the reference data, which is loaded from the reference file via the IO backends.
#
# TODO: split `maybe_encode` to `maybe_preprocess` and `maybe_encode`
"""
_convert(T::Type{<:DataFormat}, x; kw...) -> out
maybe_encode(T::Type{<:DataFormat}, x; kw...) -> out

Convert `x` to a validate content for file data format `T`.
If needed, encode `x` to a valid content that matches format `T`.

If there is no known method to encode `x`, then `x` is returned as-is without warning.
"""
_convert(::Type{<:DataFormat}, x; kw...) = x
maybe_encode(::Type{<:DataFormat}, x; kw...) = x # generic fallback: return `x` untouched
maybe_encode(::Type{<:DataFormat}, x::AbstractString; kw...) = _ignore_crlf(x) # strings: strip carriage returns for every format
maybe_encode(::Type{<:DataFormat}, x::AbstractArray{<:AbstractString}; kw...) = _join(x) # string arrays: merge into one string via `_join`

# plain TXT
_convert(::Type{DataFormat{:TXT}}, x; kw...) = replace(string(x), "\r"=>"") # ignore CRLF/LF difference
_convert(::Type{DataFormat{:TXT}}, x::Number; kw...) = x
function _convert(::Type{DataFormat{:TXT}}, x::AbstractArray{<:AbstractString}; kw...)
return join(x, '\n')
end
function _convert(
# TXT fallback: stringify the value, then strip carriage returns.
maybe_encode(::Type{DataFormat{:TXT}}, x; kw...) = _ignore_crlf(string(x))
# Ambiguity patches: without the two methods below, a call with a string (or an array of
# strings) and the TXT format would match both the TXT fallback above and the generic
# `Type{<:DataFormat}` string methods, with neither being more specific.
maybe_encode(::Type{DataFormat{:TXT}}, x::AbstractArray{<:AbstractString}; kw...) = _join(x) # ambiguity patch
maybe_encode(::Type{DataFormat{:TXT}}, x::AbstractString; kw...) = _ignore_crlf(x) # ambiguity patch
# Numbers are returned as-is (not stringified) for TXT.
maybe_encode(::Type{DataFormat{:TXT}}, x::Number; kw...) = x # TODO: Change this to string(x) ?

function maybe_encode(
::Type{DataFormat{:TXT}}, img::AbstractArray{<:Colorant};
size = (20,40), kw...)

Expand All @@ -65,11 +75,19 @@ function _convert(
end

# SHA256
_convert(::Type{DataFormat{:SHA256}}, x; kw...) = bytes2hex(sha256(string(x)))
function _convert(::Type{DataFormat{:SHA256}}, img::AbstractArray{<:Colorant}; kw...)
# SHA256 encoders for non-image data. Arbitrary values are stringified and hashed; strings
# and arrays of strings are normalized first (`_ignore_crlf` / `_join`) so that the digest
# does not depend on CRLF/LF differences.
# Every method accepts and ignores `kw...`: keyword arguments do not take part in dispatch,
# so a method without `kw...` would throw as soon as a caller forwards keywords to it.
maybe_encode(::Type{DataFormat{:SHA256}}, x; kw...) = _sha256(string(x))
maybe_encode(::Type{DataFormat{:SHA256}}, x::AbstractString; kw...) = _sha256(_ignore_crlf(x))
maybe_encode(::Type{DataFormat{:SHA256}}, x::AbstractArray{<:AbstractString}; kw...) = _sha256(_join(x))
# SHA256 encoder for images (arrays of `Colorant`). The result is the concatenation of two
# hex digests: one computed from the image size, one computed from the pixel data.
# `kw...` is accepted and ignored.
# NOTE(review): the two `bytes2hex(sha256(...))` assignments are immediately overwritten by
# the `_sha256(...)` assignments with the same arguments; they look like pre-change lines
# of a rendered diff — confirm against the repo.
function maybe_encode(::Type{DataFormat{:SHA256}}, img::AbstractArray{<:Colorant}; kw...)
# encode image into SHA256
size_str = bytes2hex(sha256(reinterpret(UInt8,[map(Int64,size(img))...])))
img_str = bytes2hex(sha256(reinterpret(UInt8,vec(rawview(channelview(img))))))
# digest of the dimensions, converted to Int64 and reinterpreted as bytes
size_str = _sha256(reinterpret(UInt8,[map(Int64,size(img))...]))
# digest of the flattened `rawview(channelview(img))` data, reinterpreted as bytes
img_str = _sha256(reinterpret(UInt8,vec(rawview(channelview(img)))))

# size digest first, then data digest
return size_str * img_str
end


# Helpers
"""
    _join(x::AbstractArray{<:AbstractString}) -> String

Merge the elements of `x` into a single string separated by newlines, passing every
element through `_ignore_crlf` first.

An empty array yields `""`. A plain reduction with a binary operator and no initial
value would throw on empty input, which is why `join` is used here.
"""
_join(x::AbstractArray{<:AbstractString}) = join((_ignore_crlf(s) for s in x), "\n")
johnnychen94 marked this conversation as resolved.
Show resolved Hide resolved
"""
    _sha256(x) -> String

Return the SHA-256 digest of `x` as a hexadecimal string.
"""
_sha256(x) = x |> sha256 |> bytes2hex
"""
    _ignore_crlf(x::AbstractString)

Return `x` with every carriage return removed, so that text using CRLF line endings
compares equal to the same text using LF line endings.
"""
function _ignore_crlf(x::AbstractString)
    return replace(x, "\r" => "")
end
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we just say we are ignoring \r ?

Suggested change
_ignore_crlf(x::AbstractString) = replace(x, "\r"=>"")
_ignore_linefeed(x::AbstractString) = replace(x, "\r"=>"")

Copy link
Member Author

@johnnychen94 johnnychen94 Jul 7, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume you're suggesting Carriage Return (\r) instead of linefeed (\n)?

I renamed to _ignore_CR with additional docstring.

2 changes: 1 addition & 1 deletion src/test_reference.jl
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ function test_reference(
rendermode = default_rendermode(F, raw_actual)
end

actual = _convert(F, raw_actual; kw...)
actual = maybe_encode(F, raw_actual; kw...)
# preprocessing when reference file doesn't exist
if !isfile(path)
@info("Reference file for \"$filename\" does not exist. It will be created")
Expand Down
160 changes: 160 additions & 0 deletions test/fileio.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
refdir = joinpath(refroot, "fileio")

@testset "query" begin
    # (file name, expected format) pairs. Names without an extension or with an
    # unrecognized extension are expected to be reported as plain text.
    expected_formats = [
        # text types
        ("textfile_with_no_extension", format"TXT"),
        ("textfile.txt", format"TXT"),
        ("textfile.unknown", format"TXT"),
        ("textfile.sha256", format"SHA256"),

        # image types
        ("imagefile.jpg", format"JPEG"),
        ("imagefile.jpeg", format"JPEG"),
        ("imagefile.png", format"PNG"),
        ("imagefile.tif", format"TIFF"),
        ("imagefile.tiff", format"TIFF"),

        # dataframe types
        ("dataframe_file.csv", format"CSV")
    ]

    # each name is queried both as given and as an absolute path
    for (name, F) in expected_formats, path in (name, abspath(name))
        @test ReferenceTests.query_extended(path) == File{F}(path)
    end
end

@testset "maybe_encode" begin
@testset "string" begin
    # Fixtures. The `*_crlf` / array variants contain real CRLF ("\r\n") line endings and
    # must encode to the same result as their LF-only counterparts.
    str1 = "Hello world"
    str1_sha256 = "64ec88ca00b268e5ba1a35678a1b5316d212f4f366b2477232534a8aeca37f3c" # bytes2hex(SHA.sha256(str1))
    str2 = "Hello\n world"
    str2_sha256 = "60b65ab310480818c4289227f2ec68f1714743db8571b4cb190e100c0085be3d" # bytes2hex(SHA.sha256(str2))
    str2_crlf = "Hello\r\n world"
    str3 = "Hello\nworld"
    str3_sha256 = "46e0ea795802f17d0b340983ca7d7068c94d7d9172ee4daea37a1ab1168649ec" # bytes2hex(SHA.sha256(str3))
    str3_arr1 = ["Hello", "world"]
    str3_arr2 = ["Hello" "world"]
    str4 = "Hello\n world1\nHello\n world2"
    str4_sha256 = "c7dc8b82c3a6fed4afa0c8790a0586b73df0e4f35524efe6810e5d78b6b6a611" # bytes2hex(SHA.sha256(str4))
    str4_arr = ["Hello\r\n world1", "Hello\n world2"]

    # string as plain text
    for fmt in (format"TXT", format"UNKNOWN")
        # encoding should preserve whitespace
        @test str1 == ReferenceTests.maybe_encode(fmt, str1)
        @test str2 == ReferenceTests.maybe_encode(fmt, str2)
        # but ignore CRLF/LF differences
        @test str2 == ReferenceTests.maybe_encode(fmt, str2_crlf)
        # string arrays are treated as multi-line strings, even for UNKNOWN format
        @test str3 == ReferenceTests.maybe_encode(fmt, str3)
        @test str3 == ReferenceTests.maybe_encode(fmt, str3_arr1)
        @test str3 == ReferenceTests.maybe_encode(fmt, str3_arr2)
        # string arrays should ignore CRLF/LF differences, too
        @test str4 == ReferenceTests.maybe_encode(fmt, str4_arr)
    end

    # string as SHA256: the digest is taken over the text as-is
    fmt = format"SHA256"
    @test str1_sha256 == ReferenceTests.maybe_encode(fmt, str1)
    @test str2_sha256 == ReferenceTests.maybe_encode(fmt, str2)
    # CRLF/LF differences must not change the digest
    @test str2_sha256 == ReferenceTests.maybe_encode(fmt, str2_crlf)
    # string arrays are hashed as multi-line strings
    @test str3_sha256 == ReferenceTests.maybe_encode(fmt, str3)
    @test str3_sha256 == ReferenceTests.maybe_encode(fmt, str3_arr1)
    @test str3_sha256 == ReferenceTests.maybe_encode(fmt, str3_arr2)
    # string arrays should ignore CRLF/LF differences, too
    @test str4_sha256 == ReferenceTests.maybe_encode(fmt, str4_arr)
end

@testset "numbers" begin
    encode = ReferenceTests.maybe_encode
    sha = format"SHA256"

    # Scalars: returned as the identical value for TXT/UNKNOWN, and hashed through their
    # string representation for SHA256.
    for value in (0x01, 1, 1.0f0, 1.0)
        for F in (format"TXT", format"UNKNOWN")
            @test value === encode(F, value)
        end
        @test encode(sha, value) == encode(sha, string(value))
    end

    # Numeric arrays: (target format, input, expected output)
    array_cases = [
        # if target is TXT, convert it to string
        (format"TXT", [1, 2], "[1, 2]"),
        (format"TXT", [1,2], "[1, 2]"),
        (format"TXT", [1;2], "[1, 2]"),
        (format"TXT", [1 2], "[1 2]"),
        (format"TXT", [1 2; 3 4], "[1 2; 3 4]"),
        # if target is Unknown, make no change
        (format"UNKNOWN", [1, 2], [1, 2]),
        (format"UNKNOWN", [1,2], [1, 2]),
        (format"UNKNOWN", [1;2], [1, 2]),
        (format"UNKNOWN", [1 2], [1 2]),
        (format"UNKNOWN", [1 2; 3 4], [1 2; 3 4]),
    ]
    for (F, input, expected) in array_cases
        @test expected == encode(F, input)
    end

    # SHA256 of a numeric array equals SHA256 of its string representation
    for arr in [[1, 2], [1 2], [1 2; 3 4]]
        @test encode(sha, arr) == encode(sha, string(arr))
    end
end

@testset "image" begin
    # Test images: 1-D, 2-D and 3-D, each as Gray{N0f8} and as the RGB version of it.
    gray_1d = Gray{N0f8}.(0.0:0.1:0.9)
    gray_2d = Gray{N0f8}.(reshape(0.0:0.1:0.9, 2, 5))
    gray_3d = Gray{N0f8}.(reshape(0.0:0.02:0.95, 2, 4, 6))
    rgb_1d = RGB.(gray_1d)
    rgb_2d = RGB.(gray_2d)
    rgb_3d = RGB.(gray_3d)

    # name => image, in the order the checks below run
    images = (
        gray_1d = gray_1d,
        gray_2d = gray_2d,
        gray_3d = gray_3d,
        rgb_1d = rgb_1d,
        rgb_2d = rgb_2d,
        rgb_3d = rgb_3d,
    )

    # any common image types: the very same object must come back
    for img in images
        for F in (format"JPEG", format"PNG", format"TIFF", format"UNKNOWN")
            @test img === ReferenceTests.maybe_encode(F, img)
        end
    end

    # image as text file
    txt = format"TXT"
    # TODO: support n-D image encoding
    # @test_reference joinpath(refdir, "gray_1d_as_txt.txt") ReferenceTests.maybe_encode(txt, gray_1d)
    # @test_reference joinpath(refdir, "rgb_1d_as_txt.txt") ReferenceTests.maybe_encode(txt, rgb_1d)
    @test_reference joinpath(refdir, "gray_2d_as_txt.txt") ReferenceTests.maybe_encode(txt, gray_2d)
    @test_reference joinpath(refdir, "rgb_2d_as_txt.txt") ReferenceTests.maybe_encode(txt, rgb_2d)
    # @test_reference joinpath(refdir, "gray_3d_as_txt.txt") ReferenceTests.maybe_encode(txt, gray_3d)
    # @test_reference joinpath(refdir, "rgb_3d_as_txt.txt") ReferenceTests.maybe_encode(txt, rgb_3d)

    # image as SHA256: one reference file per image
    sha = format"SHA256"
    for (name, img) in pairs(images)
        reffile = joinpath(refdir, "$(name)_as_sha256.txt")
        @test_reference reffile ReferenceTests.maybe_encode(sha, img)
    end
end

# dataframe
@testset "dataframe" begin
    table = DataFrame(v1=[1,2,3], v2=["a","b","c"])

    # TXT: the frame is encoded as its `string` representation
    @test string(table) == ReferenceTests.maybe_encode(format"TXT", table)

    # CSV / UNKNOWN: the very same object must come back
    for F in (format"CSV", format"UNKNOWN")
        @test table === ReferenceTests.maybe_encode(F, table)
    end

    # SHA256: compared against the stored reference digest
    sha = format"SHA256"
    @test_reference joinpath(refdir, "dataframe_as_sha256.txt") ReferenceTests.maybe_encode(sha, table)
end
end

# TODO: savefile & loadfile
1 change: 1 addition & 0 deletions test/references/fileio/dataframe_as_sha256.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2cf7c4edcafc27a5eb1b74fb0af704edc0d9bbef91a1b55d3b7350fa4b54cd18
1 change: 1 addition & 0 deletions test/references/fileio/gray_1d_as_sha256.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
a111f275cc2e7588000001d300a31e76336d15b9d314cd1a1d8f3d3556975eed10ef43c7fcace84c4d0d54b8e92c0c9be2d14a6bf3dd7647254a3cc0c4a04297
1 change: 1 addition & 0 deletions test/references/fileio/gray_2d_as_sha256.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
26cfbb315c316a0b15516434f90284e5011dcb58503fe39eb036bf669bd8233d10ef43c7fcace84c4d0d54b8e92c0c9be2d14a6bf3dd7647254a3cc0c4a04297
1 change: 1 addition & 0 deletions test/references/fileio/gray_2d_as_txt.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
▀▀▀▀▀
1 change: 1 addition & 0 deletions test/references/fileio/gray_3d_as_sha256.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
72307e420b5460c03a1c167060ed336407c26ea74aabf8fab76dd8e9dbe8cbe4baf0f53196e8d5270c0b0b2da82bbbb4676edbb0ebf84ec0dcbd8c0bf4d9af68
1 change: 1 addition & 0 deletions test/references/fileio/rgb_1d_as_sha256.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
a111f275cc2e7588000001d300a31e76336d15b9d314cd1a1d8f3d3556975eedebd6b0ad29dd5402ce5745bb5b48d4c59b7f8da0cdf8d2f287befd9094f6ac89
1 change: 1 addition & 0 deletions test/references/fileio/rgb_2d_as_sha256.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
26cfbb315c316a0b15516434f90284e5011dcb58503fe39eb036bf669bd8233debd6b0ad29dd5402ce5745bb5b48d4c59b7f8da0cdf8d2f287befd9094f6ac89
1 change: 1 addition & 0 deletions test/references/fileio/rgb_2d_as_txt.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
▀▀▀▀▀
1 change: 1 addition & 0 deletions test/references/fileio/rgb_3d_as_sha256.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
72307e420b5460c03a1c167060ed336407c26ea74aabf8fab76dd8e9dbe8cbe45465bcbf50acdbe5600207e3266eedef6548bc4d244e55d7a1af0f1af09e019f
2 changes: 1 addition & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ test_files = [
include("testutils.jl")

@testset "ReferenceTests" begin
@test Set(setdiff(ambs, refambs)) == Set{Tuple{Method,Method}}()
@test_broken Set(setdiff(ambs, refambs)) == Set{Tuple{Method,Method}}()
johnnychen94 marked this conversation as resolved.
Show resolved Hide resolved

for file in test_files
filename = first(splitext(file))
Expand Down