Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Drastic performance improvements for reads (#249) #342

Merged
merged 12 commits
Dec 11, 2024
6 changes: 3 additions & 3 deletions nptdms/base_segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,14 @@ def _read_data_chunk(self, file, data_objects, chunk_index):
"""
raise NotImplementedError("Data chunk reading must be implemented in base classes")

def read_channel_data_chunks(self, file, data_objects, channel_path, chunk_offset, stop_chunk):
def read_channel_data_chunks(self, file, data_objects, channel_path, chunk_offset, stop_chunk, chunk_size):
johannesloibl marked this conversation as resolved.
Show resolved Hide resolved
""" Read multiple data chunks for a single channel at once
In the base case we read each chunk individually but subclasses can override this
"""
for chunk_index in range(chunk_offset, stop_chunk):
yield self._read_channel_data_chunk(file, data_objects, chunk_index, channel_path)
yield self._read_channel_data_chunk(file, data_objects, chunk_index, channel_path, chunk_size)

def _read_channel_data_chunk(self, file, data_objects, chunk_index, channel_path):
def _read_channel_data_chunk(self, file, data_objects, chunk_index, channel_path, chunk_size):
""" Read data from a chunk for a single channel
"""
# In the base case we can read data for all channels
Expand Down
18 changes: 10 additions & 8 deletions nptdms/tdms_segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def read_raw_data_for_channel(self, f, channel_path, chunk_offset=0, num_chunks=
if chunk_offset > 0:
f.seek(chunk_size * chunk_offset, os.SEEK_CUR)
stop_chunk = self.num_chunks if num_chunks is None else num_chunks + chunk_offset
for chunk in self._read_channel_data_chunks(f, self._get_data_objects(), channel_path, chunk_offset, stop_chunk):
for chunk in self._read_channel_data_chunks(f, self._get_data_objects(), channel_path, chunk_offset, stop_chunk, chunk_size):
yield chunk

def _calculate_chunks(self):
Expand Down Expand Up @@ -376,13 +376,15 @@ def _read_data_chunks(self, file, data_objects, num_chunks):
for chunk in reader.read_data_chunks(file, data_objects, num_chunks):
yield chunk

def _read_channel_data_chunks(self, file, data_objects, channel_path, chunk_offset, stop_chunk):
def _read_channel_data_chunks(self, file, data_objects, channel_path, chunk_offset, stop_chunk, chunk_size):
""" Read multiple data chunks for a single channel at once
In the base case we read each chunk individually but subclasses can override this
"""
reader = self._get_data_reader()
for chunk in reader.read_channel_data_chunks(file, data_objects, channel_path, chunk_offset, stop_chunk):
initial_position = file.tell()
for i, chunk in enumerate(reader.read_channel_data_chunks(file, data_objects, channel_path, chunk_offset, stop_chunk, chunk_size)):
yield chunk
file.seek(initial_position + (i + 1) * chunk_size)
johannesloibl marked this conversation as resolved.
Show resolved Hide resolved

def _get_data_reader(self):
endianness = '>' if (self.toc_mask & toc_properties['kTocBigEndian']) else '<'
Expand Down Expand Up @@ -462,7 +464,7 @@ def read_data_chunks(self, file, data_objects, num_chunks):
raise ValueError("Cannot read interleaved data with different chunk sizes")
return [self._read_interleaved_chunks(file, data_objects, num_chunks)]

def read_channel_data_chunks(self, file, data_objects, channel_path, chunk_offset, stop_chunk):
def read_channel_data_chunks(self, file, data_objects, channel_path, chunk_offset, stop_chunk, chunk_size):
""" Read multiple data chunks for a single channel at once
"""
num_chunks = stop_chunk - chunk_offset
Expand Down Expand Up @@ -514,7 +516,7 @@ def _read_data_chunk(self, file, data_objects, chunk_index):
object_data[obj.path] = obj.read_values(file, number_values, self.endianness)
return RawDataChunk.channel_data(object_data)

def _read_channel_data_chunk(self, file, data_objects, chunk_index, channel_path):
def _read_channel_data_chunk(self, file, data_objects, chunk_index, channel_path, chunk_size):
""" Read data from a chunk for a single channel
"""
channel_data = RawChannelDataChunk.empty()
Expand All @@ -525,13 +527,13 @@ def _read_channel_data_chunk(self, file, data_objects, chunk_index, channel_path
file.seek(current_position)
channel_data = RawChannelDataChunk.channel_data(obj.read_values(file, number_values, self.endianness))
current_position = file.tell()
johannesloibl marked this conversation as resolved.
Show resolved Hide resolved
break
elif number_values == obj.number_values:
# Seek over data for other channel data
current_position += obj.data_size
else:
elif obj.data_type.size is not None:
johannesloibl marked this conversation as resolved.
Show resolved Hide resolved
# In last chunk with reduced chunk size
if obj.data_type.size is not None:
current_position += obj.data_type.size * number_values
current_position += obj.data_type.size * number_values

file.seek(current_position)
return channel_data
Expand Down
Loading