diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..15aa589
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.log
+settings.py
+secret.py
diff --git a/README.md b/README.md
index deab6b0..8eea20f 100644
--- a/README.md
+++ b/README.md
@@ -6,35 +6,20 @@ You need to....
 * ...know what you want to download, look at the folder structure using an sftp browser before using this script
 
 ### Prerequisites
-* Python 3 (tested with 3.7)
+* Python 3 (tested with 3.7+)
 * pysftp
 
 ### How-To
-The downloader assumes the following folder structure:
+
+#### Folder structure
+The downloader assumes the following folder structure on the sftp server:
 ```
 remote_path/{time_dirs}/{overall_dirs}
 or
 remote_path/{time_dirs}/intermed_dir/{subjs}/{dirs}/{subj_files}
 ```
-
-The paths need to be specified in the script, e.g.
-
-```
-remote_path = "/data/imagen/2.7"
-time_dirs = ["BL","FU1"]
-# time_dirs = ["BL","FU1","FU2","FU3"]
-intermed_dir1 = "imaging/spm_first_level"
-subjs = ["000099616225","000085724167"]
-dirs = ["EPI_stop_signal/","EPI_short_MID/"]
-subj_files =["con_0006_stop_failure_-_stop_success.nii.gz",
-             "con_0005_stop_success_-_stop_failure.nii.gz"]
-# overall mode:
-# "overall": all directories, subdirectories and files within a folder remote_path/{time_dirs}/{overall_dirs}
-overall_dirs = ["dawba/", "geolocation/","cantab/", "meta_data/", "psytools/"]
-```
-
-
+#### Download modes
 There are four modes that help to do different things
 
 ```
 mode = "dirs" # "files" or "dirs" or "subjects" or "overall"
@@ -42,12 +27,25 @@ mode = "dirs" # "files" or "dirs" or "subjects" or "overall"
 1. "overall" mode: download a set of given folders recursively (including all subdirectories and files)
 2. "subjects" mode: download given subject folders recursively (including all subdirectories and files)
 3. "dirs" mode: download specific subdirectories within subject folders recursively
-4. "files" mode: download specific files within specific subdirectories within subject folders
-
-
-### ToDos
-* simple switch to download all subjects in given folders that are found on the server
-* better logging to check what might have gone wrong
+4. "files" mode: download files that match specific patterns within specific subdirectories of subject folders
+
+#### Steps
+ * clone the repository
+ * use secret_template.py to create a new file secret.py
+ * enter your login information in secret.py
+ * use settings_template.py to create a new file settings.py
+ * enter your local path settings and your download definitions in settings.py
+   (some examples are given in settings_template.py)
+ * start the script:
+   ```
+   python get_data.py
+   ```
+ * log files are created with basic information (info_logger*) and debugging information (debug_logger*) so you can check whether anything went wrong.
+
+
+### Limitations
+* pysftp may not work properly on Windows (e.g. recursive downloads may be buggy)
+* downloads are not particularly fast (in our experience, roughly 1.5 GB per hour)
 
 ### Caution
 * not extensively tested for all use-cases, use at your own risk.
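Note on the steps above: secret_template.py itself is not shown in this patch, but from the import in get_data.py (`from secret import login_credentials`) and the commented-out class there, secret.py only needs to provide a `login_credentials` class exposing `host`, `user` and `pswd`. A minimal sketch, with all three values as placeholders:

```python
# secret.py -- listed in .gitignore, so real credentials never end up in the repository
class login_credentials():
    def __init__(self):
        self.host = "sftp.example.org"   # placeholder: address of the sftp server
        self.user = "myusername"         # placeholder: your login name
        self.pswd = "mypassword"         # placeholder: your password
```

get_data.py instantiates this class and passes the three attributes straight to `pysftp.Connection(host=..., username=..., password=...)`, so nothing else is required in the file.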
diff --git a/get_data.py b/get_data.py index c9f245c..9e7a592 100644 --- a/get_data.py +++ b/get_data.py @@ -1,8 +1,14 @@ import pysftp import os import traceback +import logging +import fnmatch +from datetime import datetime from secret import login_credentials # secret login info in separate file +#from settings import local_settings # local settings info in separate file + +from settings import * # alternatively: enter here # class login_credentials(): # def __init__(self): @@ -10,118 +16,238 @@ # self.user = "myusername" # self.pswd = "mypassword" +##### functions +def setup_logger(name, log_file, level): + """To setup as many loggers as you want""" + handler = logging.FileHandler(log_file) + handler.setFormatter(formatter) + logger = logging.getLogger(name) + logger.setLevel(level) + logger.addHandler(handler) + if level >= logging.INFO: + handler = logging.StreamHandler() + handler.setFormatter(formatter) + logger.addHandler(handler) + return logger + def make_dir(local_path): if not os.path.exists(local_path): os.makedirs(local_path) - print("directory created: " +local_path) + try: + log_it("directory created: " +local_path, logging.INFO) + except: + pass + +def log_it(msg,level): + for lg in logger: + lg.log(level,msg) lc = login_credentials() -remote_path = "/data/imagen/2.7" -local_path ="/Users/martin/Projects/datasets/IMAGEN/neurospin" -make_dir(local_path) - -time_dirs = ["BL","FU1"] -# time_dirs = ["BL","FU1","FU2","FU3"] - -#mode = "dirs" # "files" or "dirs" or "subjects" -mode = "subjects" # "files" or "dirs" or "subjects" or "overall" -# "overall": all directories, subdirectories and files within a folder remote_path/{time_dirs}/{overall_dirs} -overall_dirs = ["dawba/", "geolocation/","cantab/", "meta_data/", "psytools/"] -# "dirs": n directories per subject are downloaded remote_path/{time_dirs}/intermed_dir/{subjs}/{dirs} -# "files": one file per subject is downloaded: remote_path/{time_dirs}/intermed_dir/{subjs}/{dirs}/{subj_files} -intermed_dir1 = "imaging/spm_first_level" -subjs = ["000099616225","000085724167"] -dirs = ["EPI_stop_signal/","EPI_short_MID/"] -# if mode = "dir" subj_files is ignored -# if dir2 has more than 1 element and mode="files" subj_files are expected in each directory -subj_files =["con_0006_stop_failure_-_stop_success.nii.gz", - "con_0005_stop_success_-_stop_failure.nii.gz"] - - - - -with pysftp.Connection(host=lc.host, username=lc.user, password=lc.pswd) as sftp: - print("Connection successfully established ... 
") - for td in time_dirs: - if mode == "overall": - print("downloading complete folders (one folder containing data of many subjects)") - try: - base_remote_dir = os.path.join(remote_path, td) - sftp.chdir(base_remote_dir) - local_dir = os.path.join(local_path, td) - make_dir(local_dir) - for o_dir in overall_dirs: - print("remote dir:" + os.path.join(base_remote_dir, o_dir)) - print("local dir:" + os.path.join(local_dir, o_dir)) - try: - sftp.get_r(o_dir, local_dir) - print("overall folder download successful!") - except: - print("overall folder download not successful!") - traceback.print_exc(limit=1) +ls = local_settings() +make_dir(ls.local_path) +make_dir(ls.log_path) - except: - print("problem with directory " + base_remote_dir) - traceback.print_exc(limit=1) - else: +now_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") +formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') +#logging.basicConfig(filename='get_data'+now_str+'.log', level=logging.WARNING) +#logging.basicConfig(filename='get_data'+now_str+'.log', level=logging.WARNING) + +# construct to separate loggers for debugging and warning +logger_info = [{"name":'info_logger',"level":logging.INFO}, + {"name":'debug_logger',"level":logging.DEBUG}] +logger=[] +for li in logger_info: + logger.append(setup_logger(li["name"], os.path.join(ls.log_path,li["name"]+'_get_data_'+now_str+'.log'), li["level"])) + +# info_logger = setup_logger('info_logger', 'info_get_data_'+now_str+'.log', logging.INFO) +# info_logger.info("###logfile for download process (info and warnings)") +# # detailed debug log +# debug_logger = setup_logger('debug_logger', 'debug_get_data_'+now_str+'.log', logging.DEBUG) +# debug_logger.info("###logfile for download process (detailled debugging log)") + +try: + with pysftp.Connection(host=lc.host, username=lc.user, password=lc.pswd) as sftp: + log_it("Connection to " + lc.host + " successfully established ... ", logging.INFO) + for dl_i, dl in enumerate(dl_tasks): try: - base_remote_dir = os.path.join(remote_path, td, intermed_dir1) - sftp.chdir(base_remote_dir) - for subj in subjs: - if mode == "subjects": - local_dir = os.path.join(local_path, td, intermed_dir1) - make_dir(local_dir) - print("remote dir:" + os.path.join(base_remote_dir, subj)) - print("local dir:" + os.path.join(local_dir,subj)) + log_it("working on: download setting " + str(dl_i+1) +"/" + str(len(dl_tasks))+" : " + dl.description, logging.INFO) + for td in dl.time_dirs: + if dl.mode == "overall": + log_it("downloading complete folders (one folder containing data of many subjects)", logging.INFO) try: - sftp.get_r(subj, local_dir) - print("recursive subject download successful!") + base_remote_dir = os.path.join(ls.remote_path, td) + if sftp.exists(base_remote_dir): + + sftp.chdir(base_remote_dir) + local_dir = os.path.join(ls.local_path, td) + make_dir(local_dir) + for o_dir in dl.overall_dirs: + log_it("downloading folder " + os.path.join(base_remote_dir, o_dir) + + " --> " + os.path.join(local_dir, o_dir), logging.INFO) + try: + if os.path.exists(os.path.join(local_dir,o_dir)): + log_it("local folder " + local_dir +" exists, download is skipped! 
Check if you already have tha data, delete local folder if you want to re-download ", logging.WARN) + elif not sftp.exists(o_dir): + log_it(os.path.join(base_remote_dir,o_dir) + " does not exist on " + lc.host, logging.WARN) + else: + sftp.get_r(o_dir, local_dir) + log_it("overall folder download successful!",logging.INFO) + except: + log_it("overall folder download not successful!", logging.WARN) + exc_str = traceback.format_exc(limit=3) + log_it(exc_str, logging.DEBUG) + else: + log_it(base_remote_dir + " does not exist on " + lc.host, logging.WARN) except: - print("subject download not successful!") - traceback.print_exc(limit=1) + log_it("problem with directory " + base_remote_dir, logging.WARN) + exc_str = traceback.format_exc(limit=3) + log_it(exc_str, logging.DEBUG) else: try: - base_remote_dir = os.path.join(remote_path, td, intermed_dir1, subj) - sftp.chdir(base_remote_dir) - local_dir = os.path.join(local_path, td, intermed_dir1, subj) - make_dir(local_dir) - for dir2 in dirs: - if mode == "dirs": - #l_path = os.path.join(local_path, td, intermed_dir1, subj, dir2) - #make_dir(l_path) - # - #local_dir = os.path.join(local_path) - print("remote dir:" + os.path.join(base_remote_dir,dir2)) - print("local dir:" + os.path.join(local_dir,dir2)) - try: - sftp.get_r(dir2, local_dir) - print("recursive folder download successful!") - except: - print("folder download not successful!") - traceback.print_exc(limit=1) - if mode == "files": - for subj_file in subj_files: - local_dir = os.path.join(local_path, td, intermed_dir1, subj, dir2) + base_remote_dir = os.path.join(ls.remote_path, td, dl.intermed_dir1) + if sftp.exists(base_remote_dir): + sftp.chdir(base_remote_dir) + if len(dl.subjs) == 0: + log_it("getting available subjects from folder " + base_remote_dir, logging.INFO) + subjs = sftp.listdir() + log_it("downloading from " + str(len(subjs)) + " subject directories in " + base_remote_dir, logging.INFO) + else: + subjs = dl.subjs + for subj in subjs: + if dl.mode == "subjects": + local_dir = os.path.join(ls.local_path, td, dl.intermed_dir1) make_dir(local_dir) - dl_dir = os.path.join(remote_path, td, intermed_dir1, subj, dir2) - l_path = os.path.join(local_path, td, intermed_dir1, subj, dir2) - dl_file = os.path.join(dl_dir, subj_file) - local_file = os.path.join(local_dir, subj_file) + log_it("downloading folder " + os.path.join(base_remote_dir, subj) + + " --> " + os.path.join(local_dir,subj), logging.INFO) + try: + + if os.path.exists(os.path.join(local_dir, subj)): + log_it("local folder " + os.path.join(local_dir, subj) + " exists, download is skipped! Check if you already have tha data, delete the local folder if you want to re-download ", + logging.WARN) + elif not sftp.exists(subj): + log_it(os.path.join(base_remote_dir,subj) + " does not exist on " + lc.host, + logging.WARN) + else: + sftp.get_r(subj, local_dir) + log_it("recursive subject download successful! (subject " +subj + ")", logging.INFO) + + except: + log_it("subject download not successful! 
(subject " +subj + ")", logging.WARNING) + exc_str = traceback.format_exc(limit=3) + log_it(exc_str, logging.DEBUG) + #traceback.print_exc(limit=1) + else: try: - if mode == "files": - print("remote file:" + dl_file) - print("local file:" + local_file) - sftp.get(dl_file, local_file) - print("file download successful!") + base_remote_dir = os.path.join(ls.remote_path, td, dl.intermed_dir1, subj) + try: + sftp.chdir(base_remote_dir) + log_it("downloading selected content from subject " + subj + " in " + + os.path.join(ls.remote_path, td, dl.intermed_dir1), + logging.INFO) + local_dir = os.path.join(ls.local_path, td, dl.intermed_dir1, subj) + make_dir(local_dir) + except: + log_it("folder " + base_remote_dir + " does not exist on " + lc.host, + logging.WARN) + for dir2 in dl.dirs: + if dl.mode == "dirs": + #l_path = os.path.join(dl.local_path, td, dl.intermed_dir1, subj, dir2) + #make_dir(l_path) + # + #local_dir = os.path.join(dl.local_path) + log_it("downloading folder " + os.path.join(base_remote_dir,dir2) + + " --> " + os.path.join(local_dir,dir2), logging.INFO) + + if os.path.exists(os.path.join(local_dir, dir2)): + log_it( + "local folder " + os.path.join(local_dir, dir2) + " exists, download is skipped! Check if you already have tha data, delete the local folder if you want to re-download ", + logging.WARN) + elif not sftp.exists(dir2): + log_it(os.path.join(base_remote_dir, dir2) + " does not exist on " + lc.host, + logging.WARN) + else: + try: + sftp.get_r(dir2, local_dir) + log_it( + "recursive subject download successful! (directory " + dir2 + ")", + logging.INFO) + except: + log_it("folder download not successful!", logging.WARN) + exc_str = traceback.format_exc(limit=3) + log_it(exc_str, logging.DEBUG) + if dl.mode == "files": + dl_dir = os.path.join(ls.remote_path, td, dl.intermed_dir1, subj, dir2) + l_path = os.path.join(ls.local_path, td, dl.intermed_dir1, subj, dir2) + if not sftp.exists(dl_dir): + log_it(dl_dir + " does not exist on " + lc.host, + logging.WARN) + else: + make_dir(l_path) + #get a list of files that satisfies either of dl.subj_files patterns + dl_file_list = set() + for subj_file in dl.subj_files: + dl_file_list.update(set(fnmatch.filter(sftp.listdir(dl_dir), subj_file))) + dl_file_list = list(dl_file_list) + log_it("attempt to download all files matching " +str(dl.subj_files) + " in folder " + dir2, logging.INFO) + for m_file in dl_file_list: + try: + dl_file = os.path.join(dl_dir,m_file) + local_file = os.path.join(l_path, m_file) + log_it("downloading files " + dl_file + + " --> " + local_file, logging.INFO) + #print("remote file:" + dl_file) + #print("local file:" + local_file) + + if os.path.exists(local_file): + log_it( + "local folder " + local_file + " exists, download is skipped! 
Check if you already have tha data, delete the local folder if you want to re-download ", + logging.WARN) + elif not sftp.exists(dl_file): + log_it(dl_file + " does not exist on " + lc.host, + logging.WARN) + else: + try: + sftp.get(dl_file, local_file) + log_it( + "file download successful!", + logging.INFO) + except: + log_it("file download not successful!", + logging.WARN) + exc_str = traceback.format_exc(limit=3) + log_it(exc_str, logging.DEBUG) + except: + log_it("problem downloading file!",logging.WARN) + exc_str = traceback.format_exc(limit=3) + log_it(exc_str, logging.DEBUG) + ############### work in progress except: - print("Download not successful!") - traceback.print_exc(limit=1) + log_it("problem with directory " + os.path.join(ls.remote_path, td, dl.intermed_dir1, subj),logging.WARN) + exc_str = traceback.format_exc(limit=3) + log_it(exc_str, logging.DEBUG) + + else: + log_it(base_remote_dir + " does not exist on " + lc.host, + logging.WARN) except: - print("problem with directory " + os.path.join(remote_path, td, intermed_dir1, subj)) - traceback.print_exc(limit=1) + log_it("problem with directory " + os.path.join(ls.remote_path, td, dl.intermed_dir1), logging.WARN) + exc_str = traceback.format_exc(limit=3) + log_it(exc_str, logging.DEBUG) except: - print("problem with directory " + os.path.join(remote_path, td, intermed_dir1)) - traceback.print_exc(limit=1) + log_it("CAUTION: download setting " + str(dl_i + 1) + "/" + str(len(dl_tasks)) + " : " + dl.description + "did not finish without errors!", + logging.WARN) + exc_str = traceback.format_exc(limit=3) + log_it(exc_str, logging.DEBUG) -# connection closed automatically -print("connection closed") \ No newline at end of file + # connection closed automatically + if sftp._sftp_live == False: + log_it("connection closed", logging.INFO) +except: + log_it("Connection to " + lc.host + " (user:" + lc.user + ") could not be established", logging.WARN) + exc_str = traceback.format_exc(limit=3) + log_it(exc_str, logging.WARN) +#close logging handlers for files to become immediately available +for lgr in logger: + handler = lgr.handlers[0] + lgr.removeHandler(handler) + handler.close() \ No newline at end of file diff --git a/settings_template.py b/settings_template.py new file mode 100644 index 0000000..503c8c8 --- /dev/null +++ b/settings_template.py @@ -0,0 +1,118 @@ +class local_settings(): + def __init__(self): + self.remote_path = "/data/imagen/2.7" #the remote path on the sftp server + self.local_path ="/my/local/datapath/" #your local path where the data will be downloaded + self.log_path ="/my/local/download_logs" #give a path where logs can be saved + +class dl_settings(): + def __init__(self, description, mode, time_dirs, overall_dirs, intermed_dir1, subjs, dirs, subj_files): + self.description = description + # string, short description what the given settings will do + # e.g. "download available data from all timepoints and subjects for imaging/spm_first_level/.../EPI_short_MID/", + self.mode = mode # string, "files" or "dirs" or "subjects" or "overall" + # the folder structure of the server is: remote_path/{time_dirs}/intermed_dirs/ + # some of the intermed_dirs (e.g. 
imaging/spm*) are organised as follows: remote_path/{time_dirs}/intermed_dir/{subjs}/{dirs}
+        # the "mode" setting provides several ways of downloading parts of the data, using different hierarchies
+        # "overall": download all directories, subdirectories and files within the given folders remote_path/{time_dirs}/{overall_dirs}
+        # "dirs": download n directories per subject within remote_path/{time_dirs}/intermed_dir/{subjs}/{dirs}
+        # "files": download n files from folder remote_path/{time_dirs}/intermed_dir/{subjs}/{dirs}/{subj_files}
+        # dirs should have only one entry in this case, because files are expected to be present within each dir
+        # "subjects": download n subject folders from: remote_path/{time_dirs}/intermed_dir/
+        self.time_dirs = time_dirs
+        # list of strings, example: ["BL","FU1","FU2","FU3"]
+        self.overall_dirs = overall_dirs
+        # list of strings, e.g. ["dawba/", "geolocation/", "cantab/", "meta_data/", "psytools/"]
+        self.intermed_dir1 = intermed_dir1
+        # string, e.g. "imaging/spm_first_level"
+        self.subjs = subjs
+        # list of strings, e.g. ["000099616225", "000085724167"]
+        # leave empty (self.subjs = []) to download all available subj-subdirectories within intermed_dir1
+        self.dirs = dirs
+        # list of strings, e.g. ["EPI_stop_signal/", "EPI_short_MID/"]
+        self.subj_files = subj_files
+        # list of strings, e.g. ["con_0006_stop_failure_-_stop_success.nii.gz",
+        #                        "con_0005_stop_success_-_stop_failure.nii.gz"]
+        # if mode = "dirs" subj_files is ignored
+
+dl_fMRI_MID = dl_settings(
+    ## commented example
+    description = "download available data from baseline of all subjects for imaging/spm_first_level/.../EPI_short_MID/, just contrast images",
+    mode = "files", # string, "files" or "dirs" or "subjects" or "overall"
+    # the folder structure of the server is: remote_path/{time_dirs}/intermed_dirs/
+    # some of the intermed_dirs (e.g. imaging/spm*) are organised as follows: remote_path/{time_dirs}/intermed_dir/{subjs}/{dirs}
+    # the "mode" setting provides several ways of downloading parts of the data, using different hierarchies
+    # "overall": download all directories, subdirectories and files within the given folders remote_path/{time_dirs}/{overall_dirs}
+    # "dirs": download n directories per subject within remote_path/{time_dirs}/intermed_dir/{subjs}/{dirs}
+    # "files": download n files from folder remote_path/{time_dirs}/intermed_dir/{subjs}/{dirs}/{subj_files}
+    # Unix-style wildcards can be used
+    # "subjects": download n subject folders from: remote_path/{time_dirs}/intermed_dir/
+    time_dirs = ["BL"],
+    # list of strings, example: ["BL","FU1","FU2","FU3"]
+    overall_dirs = [],
+    # list of strings, e.g. ["dawba/", "geolocation/", "cantab/", "meta_data/", "psytools/"]
+    intermed_dir1 = "imaging/spm_first_level",
+    # string, e.g. "imaging/spm_first_level"
+    subjs = [],
+    # list of strings, e.g. ["000099616225", "000085724167"]
+    # leave empty (self.subjs = []) to download all available subj-subdirectories within intermed_dir1
+    dirs = ["EPI_short_MID/"],
+    # list of strings, e.g. ["EPI_stop_signal/", "EPI_short_MID/"]
+    subj_files = ['con*'],
+    #subj_files = ["rp_nuisance_extended.txt"]
+    # list of strings, e.g. ["con_0006_stop_failure_-_stop_success.nii.gz",
+    #                        "con_0005_stop_success_-_stop_failure.nii.gz"]
+    # UNIX-style wildcards can be used to download all matching files (e.g. ['con*','*.txt*'])
+    # if mode = "dirs" subj_files is ignored
+)
+
+#further examples
+dl_fMRI_all_FU2 = dl_settings(
+    description = "download available data from all subjects at FU2 for imaging/spm_first_level/",
+    mode = "subjects",
+    time_dirs = ["FU2"],
+    overall_dirs = [],
+    intermed_dir1 = "imaging/spm_first_level",
+    subjs = [],
+    dirs = [],
+    subj_files = []
+)
+
+dl_fMRI_Face = dl_settings(
+    description = "download data of subjects 000099616225, 000085724167 from all timepoints for imaging/spm_first_level/EPI_faces",
+    mode = "dirs",
+    time_dirs = ["BL","FU1"],
+    overall_dirs = [],
+    intermed_dir1 = "imaging/spm_first_level",
+    subjs = ["000099616225", "000085724167"],
+    dirs = ["EPI_faces"],
+    subj_files = []
+)
+
+dl_overall_data = dl_settings(
+    description = "download available data from overall folders (dawba,geolocation,cantab,meta_data,psytools)",
+    mode = "overall",
+    time_dirs = ["BL","FU1","FU2","FU3"],
+    overall_dirs = ["dawba/", "geolocation/", "cantab/", "meta_data/", "psytools/"],
+    intermed_dir1 = "",
+    subjs = [],
+    dirs = [],
+    subj_files = []
+)
+
+#put all definitions into a list named dl_tasks, and they will be downloaded sequentially
+dl_tasks = [
+    dl_fMRI_MID,
+    dl_overall_data
+]
+
+#dl_tasks = [
+# dl_overall_data
+#]
+
+#dl_tasks =[
+# dl_fMRI_all_FU2
+#]
+
+#dl_tasks =[
+# dl_fMRI_Face
+#]
\ No newline at end of file
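For orientation, here is a minimal sketch of what a working settings.py might look like once adapted from settings_template.py. The local paths, the task name dl_con_images, and the idea of importing the dl_settings class from settings_template.py (instead of keeping a full copy of it) are illustrative assumptions, not part of the repository; in practice settings.py is simply an edited copy of the template.

```python
# settings.py -- adapted from settings_template.py; listed in .gitignore, so it stays local
from settings_template import dl_settings   # reuse the class; alternatively keep the full copy

class local_settings():
    def __init__(self):
        self.remote_path = "/data/imagen/2.7"         # the remote path on the sftp server
        self.local_path = "/home/me/imagen_data/"     # placeholder: where downloads are stored
        self.log_path = "/home/me/imagen_logs/"       # placeholder: where log files are written

# one download task: baseline contrast images for two example subjects
dl_con_images = dl_settings(
    description = "baseline con images for two subjects from EPI_short_MID",
    mode = "files",
    time_dirs = ["BL"],
    overall_dirs = [],
    intermed_dir1 = "imaging/spm_first_level",
    subjs = ["000099616225", "000085724167"],
    dirs = ["EPI_short_MID/"],
    subj_files = ["con*"],   # Unix-style wildcards are allowed in "files" mode
)

# get_data.py imports this module with "from settings import *" and iterates over dl_tasks
dl_tasks = [dl_con_images]
```

With this in place, running `python get_data.py` works through the tasks in dl_tasks one after another and writes info_logger/debug_logger files into log_path.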