Skip to content

Commit

Permalink
fix: fall-back to resource size estimation via S3 download header
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed Dec 9, 2024
1 parent 3e6abbc commit be43f4a
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 14 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
0.16.2
- fix: convert resource file extensions to lower-case (#81)
- fix: escape html characters in logging panel
- fix: fall-back to resource size estimation via S3 download header
- enh: improve error message when registered dataset ID does not exist (#82)
- setup: bump dclab to 0.62.7
0.16.1
Expand Down
36 changes: 22 additions & 14 deletions dcoraid/download/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,26 @@ def from_download_job_state(dj_state, api):

@property
def file_size(self):
    """Size of the file to be downloaded, in bytes

    For regular resources, the size is read from the resource
    dictionary. If no size is available there, or if this is a
    condensed download, the size is taken from the ``Content-Length``
    header returned for the download URL.

    Raises
    ------
    KeyError
        If the download response has no ``Content-Length`` header.
    """
    size = None
    if not self.condensed:
        # Try to get the file size from the resource dictionary.
        # Note that the file size is set only after upload. So, if you
        # are downloading immediately after upload, the "size"
        # attribute might not be set yet. Use `.get` so that a missing
        # key triggers the fall-back below instead of a KeyError.
        size = self.get_resource_dict().get("size")
    if size is None:
        # Fetch the file size from S3.
        # This is the only option for condensed downloads (because they
        # are not a resource) and the fall-back for actual resources.
        url = self.get_resource_url()
        # With `stream=True`, only the headers are fetched up front.
        # The context manager closes the response, so the connection
        # is released even though the body is never read.
        with requests.get(url,
                          stream=True,
                          headers=self.api.headers,
                          verify=self.api.verify,
                          timeout=29.9,
                          ) as req:
            size = int(req.headers["Content-length"])
    return size

@property
def id(self):
Expand Down Expand Up @@ -323,19 +342,8 @@ def task_download_resource(self):
# set-up temporary path
self.path_temp = self.path.with_name(self.path.name + "~")
# check for disk space
if self.condensed:
# get the size from the server
url = self.get_resource_url()
req = requests.get(url,
stream=True,
headers=self.api.headers,
verify=self.api.verify,
timeout=29.9,
)
size = int(req.headers["Content-length"])
else:
size = self.get_resource_dict()["size"]
if shutil.disk_usage(self.path_temp.parent).free < size:
if shutil.disk_usage(
self.path_temp.parent).free < self.file_size:
# there is not enough space on disk for the download
self.set_state("wait-disk")
time.sleep(1)
Expand Down

0 comments on commit be43f4a

Please sign in to comment.