Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(core-clp): Add BoundedReader to prevent out-of-bound reads in segmented input streams. #624

Merged
merged 15 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,8 @@ set(SOURCE_FILES_unitTest
src/clp/aws/AwsAuthenticationSigner.cpp
src/clp/aws/AwsAuthenticationSigner.hpp
src/clp/aws/constants.hpp
src/clp/BoundedReader.cpp
src/clp/BoundedReader.hpp
src/clp/BufferedFileReader.cpp
src/clp/BufferedFileReader.hpp
src/clp/BufferReader.cpp
Expand Down Expand Up @@ -548,6 +550,7 @@ set(SOURCE_FILES_unitTest
submodules/sqlite3/sqlite3ext.h
tests/LogSuppressor.hpp
tests/test-Array.cpp
tests/test-BoundedReader.cpp
tests/test-BufferedFileReader.cpp
tests/test-clp_s-end_to_end.cpp
tests/test-EncodedVariableInterpreter.cpp
Expand Down
39 changes: 39 additions & 0 deletions components/core/src/clp/BoundedReader.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#include "BoundedReader.hpp"
LinZhihao-723 marked this conversation as resolved.
Show resolved Hide resolved

namespace clp {
auto BoundedReader::try_seek_from_begin(size_t pos) -> ErrorCode {
auto const next_pos = pos > m_bound ? m_bound : pos;
if (auto const rc = m_reader->try_seek_from_begin(next_pos); ErrorCode_Success != rc) {
m_curr_pos = ErrorCode_EndOfFile == rc ? next_pos : m_curr_pos;
return rc;
}
m_curr_pos = next_pos;
if (m_curr_pos >= m_bound) {
return ErrorCode_EndOfFile;
}
return ErrorCode_Success;
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved
}

auto BoundedReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read)
-> ErrorCode {
if (m_curr_pos == m_bound) {
num_bytes_read = 0;
return ErrorCode_EndOfFile;
}

if ((m_curr_pos + num_bytes_to_read) > m_bound) {
num_bytes_to_read = m_bound - m_curr_pos;
}
LinZhihao-723 marked this conversation as resolved.
Show resolved Hide resolved

auto const rc = m_reader->try_read(buf, num_bytes_to_read, num_bytes_read);
m_curr_pos += num_bytes_read;
if (ErrorCode_EndOfFile == rc) {
if (0 == num_bytes_read) {
return ErrorCode_EndOfFile;
}
} else if (ErrorCode_Success != rc) {
return rc;
}
return ErrorCode_Success;
}
} // namespace clp
73 changes: 73 additions & 0 deletions components/core/src/clp/BoundedReader.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#ifndef CLP_BOUNDEDREADER_HPP
#define CLP_BOUNDEDREADER_HPP

#include "ReaderInterface.hpp"
LinZhihao-723 marked this conversation as resolved.
Show resolved Hide resolved

namespace clp {
/**
* BoundedReader is a ReaderInterface designed to wrap other ReaderInterfaces and prevent users
* from reading or seeking beyond a certain point in the underlying input stream.
*
* This is useful when the underlying input stream is divided into several logical segments and we
* want to prevent a reader for an earlier segment consuming any bytes from a later segment. In
* particular, reading part of a later segment may force the reader for that later segment to seek
* backwards, which can be either inefficient or impossible for certain kinds of input streams.
*/
class BoundedReader : public ReaderInterface {
public:
// Constructor
explicit BoundedReader(ReaderInterface* reader, size_t bound)
: m_reader{reader},
m_bound{bound} {
if (nullptr == m_reader) {
throw ReaderInterface::OperationFailed(ErrorCode_BadParam, __FILE__, __LINE__);
}
m_curr_pos = m_reader->get_pos();
if (m_curr_pos > m_bound) {
throw ReaderInterface::OperationFailed(ErrorCode_BadParam, __FILE__, __LINE__);
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved
}
}

// Methods implementing the ReaderInterface
/**
* Tries to get the current position of the read head in the underlying reader.
* @param pos Returns the position of the underlying reader's head
* @return ErrorCode_Success on success
* @return ErrorCode_errno on failure
*/
auto try_get_pos(size_t& pos) -> ErrorCode override { return m_reader->try_get_pos(pos); }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add [[nodiscard]] for any non-void returns


/**
* Tries to seek to the given position, limited by the bound.
* @param pos
* @return ErrorCode_Success on success
* @return ErrorCode_EndOfFile on EOF or if trying to seek beyond the checkpoint
* @return ErrorCode_errno on failure
*/
auto try_seek_from_begin(size_t pos) -> ErrorCode override;

/**
* Tries to read up to a given number of bytes from the file, limited by the bound.
* @param buf
* @param num_bytes_to_read The number of bytes to try and read
* @param num_bytes_read The actual number of bytes read
* @return ErrorCode_errno on error
* @return ErrorCode_EndOfFile on EOF or trying to read after hitting checkpoint
* @return ErrorCode_Success on success
*/
auto
try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) -> ErrorCode override;

auto try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we add a doc string to explain why we override the default implementation?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reason is really just that BoundedReader can't delegate to a potentially more efficient implementation in the underlying reader (since it won't respect the bounds), and most code really shouldn't use this interface anyway since its a performance trap.

Can add a docstring saying as much.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, let's add a doc string

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since parameters are not used, shall we add [[maybe_unused]] to silence clang-tidy warnings?

-> ErrorCode override {
return ErrorCode_Unsupported;
}

private:
ReaderInterface* m_reader{nullptr};
LinZhihao-723 marked this conversation as resolved.
Show resolved Hide resolved
size_t m_bound{};
size_t m_curr_pos{};
};
} // namespace clp

#endif // CLP_BOUNDEDREADER_HPP
4 changes: 4 additions & 0 deletions components/core/src/clp/StringReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ ErrorCode StringReader::try_read(char* buf, size_t num_bytes_to_read, size_t& nu
}

ErrorCode StringReader::try_seek_from_begin(size_t pos) {
if (pos > input_string.size()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this only for supporting a new test case?

What you intend to do makes sense to me, but It's bit annoying that the behavior of standard seek interface allows seeking beyond the file ending position, so I feel we need some justification when we decide to change the behavior.

@kirkrodrigues any comment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would classify this as a bug I'm fixing in the StringReader class so that I can use it for tests. Every other reader we have will EOF if you seek past the end from what I've seen.

Actually, the way the rest of this class is written if you first seek past the end of the input buffer it will happily let you read data beyond the end of the buffer. I.e. it is very explicitly a bug.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, The FileReader internally calls fseeko, which I believe would allow seeking beyond the end of file.

BufferedFileReader and BufferReader would return ErrorCode_Truncated, and won't update the pos at all if the pos is greater than the maximum length,

From the consistency point of view, maybe we should let it return ErrorCode_Truncated. But also wonder if the rest of your code is dependent on the current behavior.

Copy link
Member

@kirkrodrigues kirkrodrigues Dec 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • The implementations of ReaderInterface have inconsistent behaviour.
    • Some (e.g., FileReader) will rely on the lower implementation to return an error if the seek fails.
    • Some (e.g. BufferReader) will return an error if the seek is past the end, but they won't modify m_pos.
    • Some (e.g., NetworkReader) will return an error if the seek is past the end, and they will modify m_pos.
    • Each of the above returns different error codes.

I think the practical implementation is for try_seek_from_begin to:

  • try to seek until pos
  • if that's past the end of the medium (file/buffer/etc.), m_pos should be updated to just past the last byte.
  • the method should return ErrorCode_EndOfFile.

The reason to update m_pos even though we get to pos is because for some implementations like FileReader, we can't easily check what the last byte is until we seek, and if we seek up to the end of the file, we may not be able to seek backwards to the original m_pos.

If y'all agree, we should open a GH issue to refactor the existing implementations. And for this implementation, we implement the proposal above (basically what Devin's implemented---although on error, we're still updating m_cur_pos, even tough the error may not be EOF?).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, the standard behavior sounds good to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense to me.

Also went and updated BoundedReader to only update m_curr_pos on error if that error is EOF.

this->pos = input_string.size();
return ErrorCode_EndOfFile;
}
this->pos = pos;
return ErrorCode_Success;
}
Expand Down
99 changes: 99 additions & 0 deletions components/core/tests/test-BoundedReader.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#include <string>
LinZhihao-723 marked this conversation as resolved.
Show resolved Hide resolved

#include <Catch2/single_include/catch2/catch.hpp>

#include "../src/clp/BoundedReader.hpp"
#include "../src/clp/ErrorCode.hpp"
#include "../src/clp/StringReader.hpp"

TEST_CASE("Test Bounded Reader", "[BoundedReader]") {
constexpr char cTestString[]{"0123456789"};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can use constexpr std::string_view for const strings

Copy link
Contributor Author

@gibber9809 gibber9809 Dec 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm using constexpr char[] because StringReader takes std::string const& and I don't want to manually initialize an std::string every time I open a StringReader.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I know, I've read the coding guidelines, and I understand that both ways of doing it are functionally the same. What I'm saying is that I don't like the explicit std::string{} initialization that you have to do when passing string_view in this specific case since it wastes horizontal space and reads worse.

I'll make the change to avoid wasting time, but I think it makes the code less readable.

constexpr size_t cTestStringLen{std::char_traits<char>::length(cTestString)};

SECTION("BoundedReader does not support try_read_to_delimiter") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::BoundedReader bounded_reader{&string_reader, cTestStringLen};
std::string tmp;
REQUIRE(clp::ErrorCode_Unsupported
== bounded_reader.try_read_to_delimiter('0', false, false, tmp));
}

SECTION("BoundedReader does not allow reads beyond end of underlying stream.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::BoundedReader bounded_reader{&string_reader, cTestStringLen + 1};
char buf[cTestStringLen + 1];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about using std::array to:

  • Silence clang-tidy warnings
  • Enforce initialization on the allocated array memory

size_t num_bytes_read{};
auto rc = bounded_reader.try_read(buf, cTestStringLen + 1, num_bytes_read);
REQUIRE(clp::ErrorCode_Success == rc);
REQUIRE(num_bytes_read == cTestStringLen);
REQUIRE(cTestStringLen == string_reader.get_pos());
REQUIRE(cTestStringLen == bounded_reader.get_pos());
}

SECTION("BoundedReader does not allow reads beyond checkpoint.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::BoundedReader bounded_reader{&string_reader, 1};
char buf[cTestStringLen];
size_t num_bytes_read{};
auto rc = bounded_reader.try_read(buf, cTestStringLen, num_bytes_read);
REQUIRE(clp::ErrorCode_Success == rc);
REQUIRE(1 == num_bytes_read);
REQUIRE(1 == string_reader.get_pos());
REQUIRE(1 == bounded_reader.get_pos());
rc = bounded_reader.try_read(buf, 1, num_bytes_read);
REQUIRE(clp::ErrorCode_EndOfFile == rc);
REQUIRE(0 == num_bytes_read);
REQUIRE(1 == string_reader.get_pos());
REQUIRE(1 == bounded_reader.get_pos());
}

SECTION("BoundedReader does allow reads before checkpoint.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::BoundedReader bounded_reader{&string_reader, 1};
char buf{};
size_t num_bytes_read{};
auto rc = bounded_reader.try_read(&buf, 1, num_bytes_read);
REQUIRE(clp::ErrorCode_Success == rc);
REQUIRE(1 == num_bytes_read);
REQUIRE(1 == string_reader.get_pos());
REQUIRE(1 == bounded_reader.get_pos());
}

SECTION("BoundedReader does not allow seeks beyond end of underlying stream.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::BoundedReader bounded_reader{&string_reader, cTestStringLen + 1};
auto rc = bounded_reader.try_seek_from_begin(cTestStringLen + 1);
REQUIRE(clp::ErrorCode_EndOfFile == rc);
REQUIRE(cTestStringLen == string_reader.get_pos());
REQUIRE(cTestStringLen == bounded_reader.get_pos());
}

SECTION("BoundedReader does not allow seeks beyond checkpoint.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::BoundedReader bounded_reader{&string_reader, 1};
char buf[cTestStringLen];
size_t num_bytes_read{};
auto rc = bounded_reader.try_seek_from_begin(cTestStringLen);
REQUIRE(clp::ErrorCode_EndOfFile == rc);
REQUIRE(1 == string_reader.get_pos());
REQUIRE(1 == bounded_reader.get_pos());
}

SECTION("BoundedReader does allow seeks before checkpoint.") {
clp::StringReader string_reader;
string_reader.open(cTestString);
clp::BoundedReader bounded_reader{&string_reader, 2};
char buf[cTestStringLen];
size_t num_bytes_read{};
auto rc = bounded_reader.try_seek_from_begin(1);
REQUIRE(clp::ErrorCode_Success == rc);
REQUIRE(1 == string_reader.get_pos());
REQUIRE(1 == bounded_reader.get_pos());
}
}
Loading