Skip to content

Commit

Permalink
Merge pull request #117 from srbcheema1/demo_read_compressed_file
Browse files Browse the repository at this point in the history
Read compressed input
  • Loading branch information
Cristina Yenyxe Gonzalez Garcia authored Apr 6, 2018
2 parents d844d2d + e954cb2 commit c0e3189
Show file tree
Hide file tree
Showing 16 changed files with 151 additions and 39 deletions.
7 changes: 7 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ matrix:
- clang-3.9
- libboost1.55-dev
- libboost-filesystem1.55-dev
- libboost-iostreams1.55-dev
- libboost-program-options1.55-dev
- libboost-regex1.55-dev
- libboost-log1.55-dev
Expand All @@ -40,6 +41,7 @@ matrix:
- clang-4.0
- libboost1.55-dev
- libboost-filesystem1.55-dev
- libboost-iostreams1.55-dev
- libboost-program-options1.55-dev
- libboost-regex1.55-dev
- libboost-log1.55-dev
Expand All @@ -58,6 +60,7 @@ matrix:
- clang-5.0
- libboost1.55-dev
- libboost-filesystem1.55-dev
- libboost-iostreams1.55-dev
- libboost-program-options1.55-dev
- libboost-regex1.55-dev
- libboost-log1.55-dev
Expand All @@ -76,6 +79,7 @@ matrix:
- g++-4.8
- libboost1.55-dev
- libboost-filesystem1.55-dev
- libboost-iostreams1.55-dev
- libboost-program-options1.55-dev
- libboost-regex1.55-dev
- libboost-log1.55-dev
Expand All @@ -94,6 +98,7 @@ matrix:
- g++-4.9
- libboost1.55-dev
- libboost-filesystem1.55-dev
- libboost-iostreams1.55-dev
- libboost-program-options1.55-dev
- libboost-regex1.55-dev
- libboost-log1.55-dev
Expand All @@ -112,6 +117,7 @@ matrix:
- g++-5
- libboost1.55-dev
- libboost-filesystem1.55-dev
- libboost-iostreams1.55-dev
- libboost-program-options1.55-dev
- libboost-regex1.55-dev
- libboost-log1.55-dev
Expand All @@ -130,6 +136,7 @@ matrix:
- g++-6
- libboost1.55-dev
- libboost-filesystem1.55-dev
- libboost-iostreams1.55-dev
- libboost-program-options1.55-dev
- libboost-regex1.55-dev
- libboost-log1.55-dev
Expand Down
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ else (BUILD_STATIC)
endif (BUILD_STATIC)

# Dependency libraries
find_package (Boost COMPONENTS filesystem program_options regex log thread system REQUIRED )
find_package (Boost COMPONENTS filesystem iostreams program_options regex log thread system REQUIRED )
include_directories (${Boost_INCLUDE_DIR} )

add_library(sqlite3 lib/sqlite/sqlite3.c)
Expand All @@ -139,6 +139,8 @@ if (BUILD_STATIC)
mod_vcf
mod_odb
${Boost_LIBRARIES}
bz2
z
${ODB_PATH}/libodb-sqlite.a
${ODB_PATH}/libodb.a
sqlite3
Expand Down
3 changes: 3 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends \
g++ \
make \
cmake \
libbz2-dev \
libboost-dev \
libboost-filesystem-dev \
libboost-iostreams-dev \
libboost-program-options-dev \
libboost-regex-dev \
libboost-log-dev \
libsqlite3-dev \
zlib1g-dev \
ragel \
# Clean up to reduce layer size
&& apt-get clean \
Expand Down
8 changes: 5 additions & 3 deletions inc/vcf/string_constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,14 @@ namespace ebi

// Compressed file extensions
const std::string BZ2 = ".bz2";
const std::string NO_EXT = "";
const std::string RAR = ".rar";
const std::string TAR = ".tar";
const std::string TAR_GZ = ".gz";
const std::string TAR_XZ = ".xz";
const std::string TAR_Z = ".Z";
const std::string GZ = ".gz";
const std::string XZ = ".xz";
const std::string Z = ".Z";
const std::string ZIP = ".zip";
const std::string ZLIB = ".z";

}
}
Expand Down
13 changes: 9 additions & 4 deletions inc/vcf/validator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@

#include <boost/filesystem.hpp>
#include <boost/log/trivial.hpp>
#include <boost/iostreams/filter/bzip2.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/filter/zlib.hpp>
#include <boost/iostreams/filtering_stream.hpp>

#include "file_structure.hpp"
#include "error_policy.hpp"
Expand Down Expand Up @@ -185,12 +189,13 @@ namespace ebi
ValidationLevel validationLevel,
std::vector<std::unique_ptr<ebi::vcf::ReportWriter>> &outputs);

bool is_compressed_file(const std::string &source,
const std::vector<char> &line);
std::string get_compression_from_extension(std::string const & source);

bool is_compressed_extension(std::string const & source);
std::string get_compression_from_magic_num(const std::vector<char> &line);

bool is_compressed_magic_num(const std::vector<char> &line);
void create_uncompressed_stream(std::istream & input,
const std::string & sourceName,
boost::iostreams::filtering_istream & uncompressed_input);

}
}
Expand Down
2 changes: 1 addition & 1 deletion src/validator_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ int main(int argc, char** argv)
BOOST_LOG_TRIVIAL(error) << ex.what();
return 1;
} catch (std::runtime_error const & ex) {
BOOST_LOG_TRIVIAL(error) << "The input file is not valid: " << ex.what();
BOOST_LOG_TRIVIAL(error) << "The validation could not be completed: " << ex.what();
return 1;
} catch (std::exception const &ex) {
BOOST_LOG_TRIVIAL(error) << ex.what();
Expand Down
120 changes: 97 additions & 23 deletions src/vcf/validator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,17 @@ namespace ebi
ebi::vcf::Parser &validator,
std::vector<std::unique_ptr<ebi::vcf::ReportWriter>> &outputs);

std::string get_compression(std::string const & source,
const std::vector<char> &line);

void get_magic_num(std::istream & stream, std::vector<char> & container);

void compressed_file_warning(std::string const & file_extension);

void check_readability_of_file(const std::string & file_ext);

void check_readability_of_stream(const std::vector<char> & line);

void write_errors(const Parser &validator, const std::vector<std::unique_ptr<ReportWriter>> &outputs);

ParserImpl::ParserImpl(std::shared_ptr<Source> source)
Expand Down Expand Up @@ -135,12 +144,15 @@ namespace ebi
ValidationLevel validationLevel,
std::vector<std::unique_ptr<ebi::vcf::ReportWriter>> &outputs)
{
boost::iostreams::filtering_istream uncompressed_input;
boost::iostreams::filtering_istream input_stream(input);
create_uncompressed_stream(input_stream, sourceName, uncompressed_input);

std::vector<char> line;
ebi::util::readline(input, line);
ebi::util::readline(uncompressed_input, line);
check_readability_of_stream(line);

ebi::vcf::Version version;
if (ebi::vcf::is_compressed_file(sourceName, line)) {
throw std::invalid_argument{"Input file should not be compressed"};
}
try {
version = detect_version(line);
} catch (FileformatError * error) {
Expand All @@ -150,54 +162,116 @@ namespace ebi
return false;
}
std::unique_ptr<Parser> validator = build_parser(sourceName, validationLevel, version);
return validate(line, input, *validator, outputs);
return validate(line, uncompressed_input, *validator, outputs);
}

bool is_compressed_file(const std::string &source,
const std::vector<char> &line)
void create_uncompressed_stream(std::istream & input,
const std::string & sourceName,
boost::iostreams::filtering_istream & uncompressed_input)
{
if (source != ebi::vcf::STDIN && is_compressed_extension(source)) {
return true;
std::vector<char> line;
get_magic_num(input, line);

std::string file_ext = get_compression(sourceName, line);
check_readability_of_file(file_ext);

if(file_ext == BZ2) {
uncompressed_input.push(boost::iostreams::bzip2_decompressor());
} else if(file_ext == GZ) {
uncompressed_input.push(boost::iostreams::gzip_decompressor());
} else if(file_ext == ZLIB) {
uncompressed_input.push(boost::iostreams::zlib_decompressor());
}
return is_compressed_magic_num(line);

uncompressed_input.push(input);
}

bool is_compressed_extension(std::string const & source)
/**
 * Peeks at the leading bytes of a stream without consuming them.
 *
 * Reads at most five bytes (stopping early after a newline or at end of input), stores
 * them in `container`, and then puts them back so the next reader sees the stream from
 * its original position.
 *
 * @param stream input stream to inspect; it must support putting back as many bytes as were read
 * @param container receives the bytes that were read; any previous content is discarded
 * @throws std::runtime_error if the bytes can not be put back into the stream
 */
void get_magic_num(std::istream & stream, std::vector<char> & container)
{
    const std::size_t max_magic_length = 5;
    container.clear();

    char c;
    while (container.size() < max_magic_length && stream.get(c)) {
        container.push_back(c);
        if (c == '\n') {
            break;
        }
    }

    // Inputs shorter than the limit leave eofbit/failbit set, and putback() refuses to work
    // on a stream that is not good(). Reset the state when it was only caused by reaching
    // the end of the input; a genuinely broken (badbit) stream is left untouched.
    if (!container.empty() && stream.eof() && !stream.bad()) {
        stream.clear();
    }

    // Restore the bytes in reverse order so the stream content is unchanged
    for (auto it = container.rbegin(); it != container.rend(); ++it) {
        stream.putback(*it);
        if (stream.fail()) {
            throw std::runtime_error("Stream failed while putting the magic numbers back into stream");
        }
    }
}

/**
 * Decides which compression, if any, applies to the input.
 *
 * The extension of `source` takes precedence when the input is not the standard input
 * and the extension is a recognised compressed one; otherwise the decision falls back
 * to the leading bytes in `line`.
 *
 * @param source name of the input (file path or the standard input marker)
 * @param line leading bytes of the input
 * @return the extension constant describing the compression, or NO_EXT
 */
std::string get_compression(std::string const & source,
                            const std::vector<char> &line)
{
    const std::string extension_hint = get_compression_from_extension(source);
    const bool trust_extension = !(source == ebi::vcf::STDIN || extension_hint == NO_EXT);

    if (!trust_extension) {
        return get_compression_from_magic_num(line);
    }
    return extension_hint;
}

std::string get_compression_from_extension(std::string const & source)
{
boost::filesystem::path source_name(source);
std::string file_extension = source_name.extension().string();

if (file_extension == BZ2 || file_extension == RAR || file_extension == TAR || file_extension == TAR_GZ ||
file_extension == TAR_XZ || file_extension == TAR_Z || file_extension == ZIP) {
if (file_extension == BZ2 || file_extension == RAR || file_extension == TAR || file_extension == GZ ||
file_extension == XZ || file_extension == Z || file_extension == ZIP || file_extension == ZLIB) {
compressed_file_warning(file_extension);
return true;
return file_extension;
}
return false;
return NO_EXT;
}

void compressed_file_warning(std::string const & file_extension)
{
BOOST_LOG_TRIVIAL(warning) << "detected " << file_extension
BOOST_LOG_TRIVIAL(warning) << "Detected " << file_extension
<< " compression";
}

bool is_compressed_magic_num(const std::vector<char> &line)
std::string get_compression_from_magic_num(const std::vector<char> &line)
{
std::vector<std::pair<std::vector<char>, std::string>> types = {
{ { 66, 90, 104 }, BZ2 },
{ { 31, -117 }, TAR_GZ },
{ { -3, 55, 122, 88, 90 }, TAR_XZ },
{ { 31, -99 }, TAR_Z },
{ { 80, 75, 3, 4 }, ZIP }
{ { 31, -117 }, GZ },
{ { -3, 55, 122, 88, 90 }, XZ },
{ { 31, -99 }, Z },
{ { 80, 75, 3, 4 }, ZIP },
{ { 120, -100 }, ZLIB }
};

for (auto & type : types) {
if (std::equal(type.first.begin(), type.first.end(), line.begin())) {
compressed_file_warning(type.second);
return true;
return type.second;
}
}
return false;
return NO_EXT;
}

/**
 * Rejects inputs whose compression can not be read.
 *
 * Only BZ2, GZ, ZLIB and uncompressed (NO_EXT) inputs are accepted.
 *
 * @param file_ext extension constant describing the detected compression
 * @throws std::invalid_argument if `file_ext` is none of the accepted values
 */
void check_readability_of_file(const std::string & file_ext)
{
    const bool is_readable = file_ext == BZ2
                          || file_ext == GZ
                          || file_ext == ZLIB
                          || file_ext == NO_EXT;

    if (is_readable) {
        return;
    }
    throw std::invalid_argument{"Input file should not be compressed"};
}

/**
 * Verifies that the given bytes do not start with a known compression signature.
 *
 * @param line leading bytes to inspect
 * @throws std::invalid_argument if a compression signature is detected in `line`
 */
void check_readability_of_stream(const std::vector<char> &line)
{
    if (ebi::vcf::get_compression_from_magic_num(line) == NO_EXT) {
        return;
    }
    throw std::invalid_argument{"Input file should not be compressed twice"};
}

Version detect_version(const std::vector<char> &vector_line)
Expand Down
Binary file not shown.
Binary file not shown.
16 changes: 10 additions & 6 deletions test/vcf/compressed_file_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,25 @@ namespace ebi
{
SECTION("File with extensions of compressed files")
{
auto folder = boost::filesystem::path("test/input_files/v4.3/compressed_files");
auto folder = boost::filesystem::path("test/input_files/v4.3/compressed_files/non_readable");
std::vector<boost::filesystem::path> v;
copy(boost::filesystem::directory_iterator(folder), boost::filesystem::directory_iterator(), back_inserter(v));
folder = boost::filesystem::path("test/input_files/v4.3/compressed_files/readable");
copy(boost::filesystem::directory_iterator(folder), boost::filesystem::directory_iterator(), back_inserter(v));

for (auto path : v)
{
CHECK(vcf::is_compressed_extension(path.string()));
CHECK(vcf::get_compression_from_extension(path.string()) != vcf::NO_EXT);
}
}

SECTION("Compressed file streams")
{
auto folder = boost::filesystem::path("test/input_files/v4.3/compressed_files");
auto folder = boost::filesystem::path("test/input_files/v4.3/compressed_files/non_readable");
std::vector<boost::filesystem::path> v;
copy(boost::filesystem::directory_iterator(folder), boost::filesystem::directory_iterator(), back_inserter(v));
folder = boost::filesystem::path("test/input_files/v4.3/compressed_files/readable");
copy(boost::filesystem::directory_iterator(folder), boost::filesystem::directory_iterator(), back_inserter(v));

std::vector<char> line;
line.reserve(ebi::vcf::default_line_buffer_size);
Expand All @@ -50,7 +54,7 @@ namespace ebi
{
std::ifstream input{path.string()};
ebi::util::readline(input, line);
CHECK(vcf::is_compressed_magic_num(line));
CHECK(vcf::get_compression_from_magic_num(line) != vcf::NO_EXT);
}
}
}
Expand All @@ -67,7 +71,7 @@ namespace ebi

for (auto path : v)
{
CHECK_FALSE(vcf::is_compressed_extension(path.string()));
CHECK(vcf::get_compression_from_extension(path.string()) == vcf::NO_EXT);
}
}

Expand All @@ -86,7 +90,7 @@ namespace ebi
{
std::ifstream input{path.string()};
ebi::util::readline(input, line);
CHECK_FALSE(vcf::is_compressed_magic_num(line));
CHECK(vcf::get_compression_from_magic_num(line) == vcf::NO_EXT);
}
}
}
Expand Down
Loading

0 comments on commit c0e3189

Please sign in to comment.