From 8049073297c67c31ed7aa8df14546262fd70a035 Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Sun, 7 Mar 2021 03:06:53 -0600 Subject: [PATCH] Allow compressed data in R data files Fixes #225 Fixes #32 xref https://github.com/JuliaStats/RDatasets.jl/issues/117 --- src/registry.jl | 44 +++++++++++++++++++++++++++++++++++++++----- test/files/iris.rda | Bin 0 -> 1098 bytes test/query.jl | 3 +++ 3 files changed, 42 insertions(+), 5 deletions(-) create mode 100644 test/files/iris.rda diff --git a/src/registry.jl b/src/registry.jl index 7432957f..3f78f72f 100644 --- a/src/registry.jl +++ b/src/registry.jl @@ -22,14 +22,45 @@ add_format(format"GZIP", [0x1f, 0x8b], ".gz", [:Libz => UUID("2ec943e9-cfe8-584d add_format(format"BSON",(),".bson", [:BSON => UUID("fbb218c0-5317-5bc6-957e-2ee96dd4b1f0")]) add_format(format"JLSO", (), ".jlso", [:JLSO => UUID("9da8a3cd-07a3-59c0-a743-3fdc52c30d11")]) +function detect_compressed(io, len=getlength(io); formats=["GZIP", "BZIP2", "XZ", "LZ4"]) + seekstart(io) + len < 2 && return false + b1 = read(io, UInt8) + b2 = read(io, UInt8) + if "GZIP" ∈ formats + b1 == 0x1f && b2 == 0x8b && return true + end + len < 3 && return false + b3 = read(io, UInt8) + if "BZIP2" ∈ formats + b1 == 0x42 && b2 == 0x5A && b3 == 68 && return true + end + len < 4 && return false + b4 = read(io, UInt8) + if "LZ4" ∈ formats + b1 == 0x04 && b2 == 0x22 && b3 == 0x4D && b4 == 0x18 && return true + end + len < 5 && return false + b5 = read(io, UInt8) + len < 6 && return false + b6 = read(io, UInt8) + if "XZ" ∈ formats + b1 == 0xFD && b2 == 0x37 && b3 == 0x7A && b4 == 0x58 && b5 == 0x5A && b6 == 0x00 && return true + end + return false +end + # test for RD?n magic sequence at the beginning of R data input stream function detect_rdata(io) seekstart(io) - read(io, UInt8) == UInt8('R') && - read(io, UInt8) == UInt8('D') && - read(io, UInt8) in (UInt8('A'), UInt8('B'), UInt8('X')) && - read(io, UInt8) in (UInt8('2'), UInt8('3')) && - (c = read(io, UInt8); c == UInt8('\n') || (c == UInt8('\r') && read(io, UInt8) == UInt8('\n'))) + b = read(io, UInt8) + if b == UInt8('R') + return read(io, UInt8) == UInt8('D') && + read(io, UInt8) in (UInt8('A'), UInt8('B'), UInt8('X')) && + read(io, UInt8) in (UInt8('2'), UInt8('3')) && + (c = read(io, UInt8); c == UInt8('\n') || (c == UInt8('\r') && read(io, UInt8) == UInt8('\n'))) + end + return detect_compressed(io; formats=["GZIP", "BZIP2", "XZ"]) end add_format(format"RData", detect_rdata, [".rda", ".RData", ".rdata"], [idRData, LOAD]) @@ -38,6 +69,9 @@ function detect_rdata_single(io) seekstart(io) res = read(io, UInt8) in (UInt8('A'), UInt8('B'), UInt8('X')) && (c = read(io, UInt8); c == UInt8('\n') || (c == UInt8('\r') && read(io, UInt8) == UInt8('\n'))) + if !res + res = detect_compressed(io; formats=["GZIP", "BZIP2", "XZ"]) + end seekstart(io) return res end diff --git a/test/files/iris.rda b/test/files/iris.rda new file mode 100644 index 0000000000000000000000000000000000000000..9363d67b72473e6f8362f9e165f0c67ee4aa3df3 GIT binary patch literal 1098 zcmV-Q1hxAgiwFP!000001KpU-Zdz9qg^#&l19n{eM=){HH^?A;fH3GVgQ^)cQ$(&) zDr!f7+-aU7Qu`)(gh=hEqs%Jx1spYs5;elT!1{LUMX$)%SHhe9zxF=op0n|{Uq5-! ze9}xQRZ^BVQY9qdvG9!cYWjsx8i{(C~>Gq6#IxPd0T$e zK^&Lw86NERtsjBIaatWmhW|wLSaI%%f};yA;XbC%vg?A2`elDx`i}Sx{TA)g2l1>^ z$K}KkPyXNy#~ta3-!c5u1&zG6@8G*uH@G}b$Lfgnw0#DpPTz;*Aq6dAoR;M1u_q3B%koqd$Ni|IZaj9>)ik-|CH|&S{4!gQn*5XXe?#&uv#(oy_+wwS z{Xu@@@Db1C@aU_`2ky*pL%rYL5A@)uS{+rh_qg5%ucx`j&-ZuL^zb{juRNaTZJ3^Y zLp`jEug9k0^?l{*QO56Dyjbs&_rUj0@J4HfyJ`99!%FHw$X^Zvj)B@g%M%L|i9Q$wcigYCXMO+=^JW`feqF=sa~g9>{Em>E`OxQc*ZbTO^P}W3 zZyO);BMSle*J`@G(^_(vuFD}59`|9t>&zqb*yQN-}OGb{#A+J zs}lViqr`RU_V9n`eJ{iLx%HIc{@nV?@ct=&UEiGgdFJP%^B?7RX>#IRS{!)vZtr&R z+&iD_d|r3XL(A@;w}ziM)aCc!y88lu^1^$694{0+;<^91e|kxF>O;&u$f@&_*)d0? z|Br_}=1$FrD|sr{*D2=6WbyGcyeqiS)8CoJ+pxZ?uaP$^sh7IEo|*Yquh+4X-};bJ zcB>G-O!NPL-HcD=hJ3P{@wxWD)y+|Uk&hp5R*6#`yfF`(&QKY?T+qpR22@AC`vMl@-u@+Mo9CpNQ~&?~ literal 0 HcmV?d00001 diff --git a/test/query.jl b/test/query.jl index e14c1281..8bc04299 100644 --- a/test/query.jl +++ b/test/query.jl @@ -384,6 +384,9 @@ let file_dir = joinpath(@__DIR__, "files"), file_path = Path(file_dir) # 6 for /r/n and 5 for /n @test (position(io) in (5, 6)) end + # A GZipped file + q = query(joinpath(file_dir, "iris.rda")) + @test typeof(q) <: File{format"RData"} end @testset "RDS detection" begin q = query(joinpath(file_dir, "minimal_ascii.rds"))