diff --git a/CHANGELOG.md b/CHANGELOG.md
index d33ea27..9d6d610 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,12 +1,25 @@
 # Changelog
 
-## [UNRELEASED](https://github.com/UAL-ODIS/LD-Cool-P/tree/HEAD) (YYYY-MM-DD)
+## [v1.1.3](https://github.com/UAL-ODIS/LD-Cool-P/tree/v1.1.3) (2021-06-29)
 
 **Implemented enhancements:**
- - Enhancement: Double digit for version [#225](http://github.com/UAL-ODIS/LD-Cool-P/pull/225)
+ - Enhancement: Double digit for version [#225](https://github.com/UAL-ODIS/LD-Cool-P/pull/225)
 
 **Closed issues:**
- - Enhancement: Double digit for version [#224](http://github.com/UAL-ODIS/LD-Cool-P/issues/224)
+ - Enhancement: Double digit for version [#224](https://github.com/UAL-ODIS/LD-Cool-P/issues/224)
+
+
+## [v1.1.2](https://github.com/UAL-ODIS/LD-Cool-P/tree/v1.1.2) (2021-06-28)
+
+**Implemented enhancements:**
+ - Allow for downloading only the metadata
+   [#223](https://github.com/UAL-ODIS/LD-Cool-P/pull/223)
+ - Enhancement: Dump JSON metadata from Qualtrics API
+   [#226](https://github.com/UAL-ODIS/LD-Cool-P/pull/226)
+
+**Closed issues:**
+ - Enhancement: Dump JSON metadata from Qualtrics API
+   [#160](https://github.com/UAL-ODIS/LD-Cool-P/issues/160)
 
 ## [v1.1.1](https://github.com/UAL-ODIS/LD-Cool-P/tree/v1.1.1) (2021-06-10)
diff --git a/README.md b/README.md
index 42e241b..a8f39d3 100644
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ You can confirm installation via `conda list`
 (curation) $ conda list ldcoolp
 ```
 
-You should see that the version is `1.1.1`.
+You should see that the version is `1.1.2`.
 
 ### Configuration Settings
diff --git a/ldcoolp/__init__.py b/ldcoolp/__init__.py
index 31f8f15..df6d6da 100644
--- a/ldcoolp/__init__.py
+++ b/ldcoolp/__init__.py
@@ -1,6 +1,6 @@
 from os import path
 
-__version__ = "1.1.1"
+__version__ = "1.1.2"
 
 CODE_NAME = "LD-Cool-P"
diff --git a/ldcoolp/curation/api/qualtrics.py b/ldcoolp/curation/api/qualtrics.py
index f0c8b36..d9b11d7 100644
--- a/ldcoolp/curation/api/qualtrics.py
+++ b/ldcoolp/curation/api/qualtrics.py
@@ -24,6 +24,7 @@
 
 # Convert single-entry DataFrame to dictionary
 from ldcoolp.curation import df_to_dict_single
+from ldcoolp.curation import metadata
 
 # Logging
 from redata.commons.logger import log_stdout
@@ -33,6 +34,7 @@
 from figshare.figshare import issue_request
 
 # Read in default configuration settings
+from ..depositor_name import DepositorName
 from ...config import config_default_dict
 
 # for quote and urlencode
@@ -52,13 +54,7 @@ class Qualtrics:
     A Python interface for interaction with Qualtrics API for Deposit
     Agreement form survey
 
-    :param qualtrics_dict: Dict that contains Qualtrics configuration.
-        This should include:
-        - survey_id
-        - token
-        - datacenter
-        - download_url
-        - generate_url
+    :param config_dict: Dict that contains LD-Cool-P configuration.
         Default: config_default_dict from config/default.ini
 
@@ -95,27 +91,28 @@ class Qualtrics:
         Retrieve pandas DataFrame containing responses for a survey
         See: https://api.qualtrics.com/docs/getting-survey-responses-via-the-new-export-apis
 
-    find_deposit_agreement(dn_dict)
+    find_deposit_agreement(dn)
       Call get_survey_responses() and identify response that matches
      based on depositor name (implemented) and deposit title (to be
      implemented). Returns ResponseID if a unique match is available
 
-    retrieve_deposit_agreement(dn_dict=, ResponseId=, browser=True)
+    retrieve_deposit_agreement(dn=, ResponseId=, browser=True)
      Opens up web browser to an HTML page containing the deposit agreement.
      It will call find_deposit_agreement() with DepositorName dict
      if ResponseId is not provided. Otherwise, it will use the provided
-      ResponseId. Note that either dn_dict or ResponseId must be provided
+      ResponseId. Note that either dn or ResponseId must be provided
 
     generate_url(dn_dict)
       Generate URL with customized query strings based on Figshare metadata
     """
 
-    def __init__(self, qualtrics_dict=config_default_dict['qualtrics'], log=None,
+    def __init__(self, config_dict=config_default_dict, log=None,
                  interactive=True):
 
         self.interactive = interactive
 
-        self.dict = qualtrics_dict
+        self.curation_dict = config_dict['curation']
+        self.dict = config_dict['qualtrics']
 
         self.token = self.dict['token']
         self.data_center = self.dict['datacenter']
@@ -249,11 +246,13 @@ def lookup_survey_shortname(self, lookup_survey_id):
         except KeyError:
             self.log.warn("survey_id not found among list")
 
-    def find_deposit_agreement(self, dn_dict):
+    def find_deposit_agreement(self, dn: DepositorName):
         """Get Response ID based on a match search for depositor name"""
 
         merged_df = self.merge_survey()
 
+        dn_dict = dn.name_dict
+
         # First perform search via article_id or curation_id
         self.log.info("Attempting to identify using article_id or curation_id ...")
         article_id = str(dn_dict['article_id'])
@@ -296,6 +295,8 @@ def find_deposit_agreement(self, dn_dict):
         else:
             if response_df.shape[0] == 1:
                 response_dict = df_to_dict_single(response_df)
+                self.save_metadata(response_dict, dn, out_file_prefix=
+                                   f'deposit_agreement_original_{article_id}')
                 self.pandas_write_buffer(response_df[cols_order])
                 self.log.info("Only one entry found!")
                 self.log.info(f"Survey completed on {response_dict['Date Completed']}")
@@ -313,7 +314,7 @@ def find_deposit_agreement(self, dn_dict):
 
         raise ValueError
 
-    def retrieve_deposit_agreement(self, dn_dict=None, ResponseId=None, out_path='',
+    def retrieve_deposit_agreement(self, dn=None, ResponseId=None, out_path='',
                                    browser=True):
         """Opens web browser to navigate to a page with Deposit Agreement Form"""
 
@@ -322,7 +323,7 @@ def retrieve_deposit_agreement(self, dn_dict=None, ResponseId=None, out_path='',
 
         if isinstance(ResponseId, type(None)):
             try:
-                ResponseId, SurveyId = self.find_deposit_agreement(dn_dict)
+                ResponseId, SurveyId = self.find_deposit_agreement(dn)
                 self.log.info(f"Qualtrics ResponseID : {ResponseId}")
                 self.log.info(f"Qualtrics SurveyID : {SurveyId}")
             except ValueError:
@@ -340,7 +341,7 @@ def retrieve_deposit_agreement(self, dn_dict=None, ResponseId=None, out_path='',
                 SurveyId = ''
 
         if ResponseId == '' or SurveyId == '':
-            custom_url = self.generate_url(dn_dict)
+            custom_url = self.generate_url(dn.name_dict)
             self.log.info("CUSTOM URL BELOW : ")
             self.log.info(custom_url)
             ResponseId = None
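Reviewer note: a minimal sketch of the new call pattern above (not part of the diff). The finder methods now take the `DepositorName` object itself and unpack `dn.name_dict` internally; `fs_admin` is assumed to be a `FigshareInstituteAdmin`, built as in `generate_qualtrics_links` below, and the article ID is made up.

```python
from ldcoolp.config import config_default_dict
from ldcoolp.curation.api.qualtrics import Qualtrics
from ldcoolp.curation.depositor_name import DepositorName


def lookup_agreement(article_id: str, fs_admin):
    """fs_admin: assumed FigshareInstituteAdmin client."""
    q = Qualtrics(config_dict=config_default_dict)  # was: qualtrics_dict=...
    dn = DepositorName(article_id, fs_admin, verbose=False)
    # Finders now receive the DepositorName object, not dn.name_dict
    return q.find_deposit_agreement(dn)
```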
@@ -505,9 +506,10 @@ def generate_readme_url(self, dn):
 
         return full_url
 
-    def find_qualtrics_readme(self, dn_dict):
+    def find_qualtrics_readme(self, dn: DepositorName):
         """Get Response ID based on a article_id,curation_id search"""
 
+        dn_dict = dn.name_dict
         qualtrics_df = self.get_survey_responses(self.readme_survey_id)
 
         # First perform search via article_id or curation_id
@@ -538,15 +540,15 @@ def find_qualtrics_readme(self, dn_dict):
             self.log.warn("Multiple entries found")
             raise ValueError
 
-    def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True):
+    def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True,
+                                  save_metadata: bool = False):
         """Retrieve response to Qualtrics README form"""
 
-        dn_dict = dn.name_dict
         if ResponseId:
             response_df = \
                 self.get_survey_response(self.readme_survey_id, ResponseId)
         else:
             try:
-                ResponseId, response_df = self.find_qualtrics_readme(dn_dict)
+                ResponseId, response_df = self.find_qualtrics_readme(dn)
                 self.log.info(f"Qualtrics README ResponseID : {ResponseId}")
             except ValueError:
                 self.log.warn("Error with retrieving ResponseId")
@@ -594,7 +596,7 @@ def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True):
             self.log.info("Appending Deposit Agreement's Corresponding Author metadata")
             if not self.da_response_id:
                 self.log.info("NO METADATA - Retrieving Deposit Agreement metadata")
-                self.find_deposit_agreement(dn_dict)
+                self.find_deposit_agreement(dn)
             else:
                 self.log.info(f"Parsed ResponseId : {self.da_response_id}")
                 self.log.info(f"Parsed SurveyID : {self.da_survey_id}")
@@ -605,4 +607,28 @@ def retrieve_qualtrics_readme(self, dn=None, ResponseId='', browser=True):
             qualtrics_dict['corr_author_email'] = DA_dict['Q6_2']
             qualtrics_dict['corr_author_affil'] = DA_dict['Q6_3']
 
+        # Save Qualtrics README metadata
+        if save_metadata:
+            out_file_prefix = "qualtrics_readme_original_" + \
+                              f"{dn.name_dict['article_id']}"
+            self.save_metadata(qualtrics_dict, dn,
+                               out_file_prefix=out_file_prefix)
+
         return qualtrics_dict
+
+    def save_metadata(self, response_dict: dict, dn: DepositorName,
+                      out_file_prefix: str = 'qualtrics'):
+        """Save Qualtrics metadata to JSON file"""
+
+        root_directory = join(
+            self.curation_dict[self.curation_dict['parent_dir']],
+            self.curation_dict['folder_todo'],
+            dn.folderName
+        )
+        metadata_directory = self.curation_dict['folder_metadata']
+
+        metadata.save_metadata(response_dict, out_file_prefix,
+                               metadata_source='QUALTRICS',
+                               root_directory=root_directory,
+                               metadata_directory=metadata_directory,
+                               log=self.log)
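Reviewer note: the new `Qualtrics.save_metadata()` assembles its output location from the curation config via a double lookup, which is easy to miss. A runnable sketch of that path logic, with made-up values:

```python
from os.path import join

# All values hypothetical; the keys mirror the curation config used above.
curation_dict = {
    'parent_dir': 'root_dir',      # names the key that holds the parent path
    'root_dir': '/data/curation',
    'folder_todo': '1.ToDo',
    'folder_metadata': 'METADATA',
}

folder_name = 'Smith_12345678'     # stands in for dn.folderName

root_directory = join(
    curation_dict[curation_dict['parent_dir']],  # double lookup via 'parent_dir'
    curation_dict['folder_todo'],
    folder_name,
)

# metadata.save_metadata() would then write, for example:
#   /data/curation/1.ToDo/Smith_12345678/METADATA/deposit_agreement_original_<article_id>.json
print(join(root_directory, curation_dict['folder_metadata']))
```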
Not overwriting with template!") @@ -427,6 +433,12 @@ def update(self): f = open(self.readme_file_path, 'w') f.writelines(content_list) f.close() + + # Saving Qualtrics README for metadata for updated README.txt + cur_time = datetime.now() + out_file_prefix = f"readme_revised_{self.article_id}_" + \ + f"{cur_time.isoformat(timespec='seconds').replace(':', '')}" + self.save_metadata(out_file_prefix=out_file_prefix) else: self.log.info("README.txt does not exist. Creating new one") @@ -447,6 +459,27 @@ def main(self): else: raise SystemExit("SKIPPING README.txt CONSTRUCTION") + def save_metadata(self, out_file_prefix: str = 'readme'): + """Save README metadata to JSON file""" + + response_dict = { + 'figshare': self.figshare_readme_dict, + 'qualtrics': self.qualtrics_readme_dict, + } + + root_directory = join( + self.curation_dict[self.curation_dict['parent_dir']], + self.curation_dict['folder_todo'], + self.dn.folderName + ) + metadata_directory = self.curation_dict['folder_metadata'] + + metadata.save_metadata(response_dict, out_file_prefix, + metadata_source='README', + root_directory=root_directory, + metadata_directory=metadata_directory, + log=self.log) + def walkthrough(data_path, ignore='', log=None): """ diff --git a/ldcoolp/curation/main.py b/ldcoolp/curation/main.py index 0154a82..d82d072 100644 --- a/ldcoolp/curation/main.py +++ b/ldcoolp/curation/main.py @@ -38,7 +38,8 @@ class PrerequisiteWorkflow: """ def __init__(self, article_id, log=None, url_open=False, - config_dict=config_default_dict): + config_dict=config_default_dict, + metadata_only=False): # If log is not defined, then output log to stdout if isinstance(log, type(None)): @@ -71,6 +72,8 @@ def __init__(self, article_id, log=None, url_open=False, self.url_open = url_open + self.metadata_only = metadata_only + # Check if dataset has been retrieved try: source_stage = self.mc.get_source_stage(self.dn.folderName, verbose=False) @@ -117,7 +120,8 @@ def download_data(self): root_directory=self.root_directory, data_directory=self.data_directory, metadata_directory=self.metadata_directory, - log=self.log, url_open=self.url_open) + log=self.log, url_open=self.url_open, + metadata_only=self.metadata_only) def download_report(self): if self.new_set: @@ -129,7 +133,7 @@ def move_to_next(self): def workflow(article_id, url_open=False, browser=True, log=None, - config_dict=config_default_dict): + config_dict=config_default_dict, metadata_only=False): """ Purpose: This function follows our initial set-up to: @@ -145,6 +149,7 @@ def workflow(article_id, url_open=False, browser=True, log=None, :param log: logger.LogClass object. Default is stdout via python logging :param config_dict: dict of dict with hierarchy of sections (figshare, curation, qualtrics) follow by options + :param metadata_only: When True, only downloads the item metadata. 
""" # If log is not defined, then output log to stdout @@ -152,7 +157,8 @@ def workflow(article_id, url_open=False, browser=True, log=None, log = log_stdout() pw = PrerequisiteWorkflow(article_id, url_open=url_open, log=log, - config_dict=config_dict) + config_dict=config_dict, + metadata_only=metadata_only) # Perform prerequisite workflow if dataset is entirely new if pw.new_set: @@ -174,8 +180,8 @@ def workflow(article_id, url_open=False, browser=True, log=None, curation_dict['folder_ual_rdm'], ) log.debug(f"out_path: {out_path}") - q = Qualtrics(qualtrics_dict=config_dict['qualtrics'], log=log) - q.retrieve_deposit_agreement(pw.dn.name_dict, out_path=out_path, + q = Qualtrics(config_dict=config_dict, log=log) + q.retrieve_deposit_agreement(pw.dn, out_path=out_path, browser=browser) # Check for README file and create one if it does not exist diff --git a/ldcoolp/curation/metadata.py b/ldcoolp/curation/metadata.py index d596e0c..77a084f 100644 --- a/ldcoolp/curation/metadata.py +++ b/ldcoolp/curation/metadata.py @@ -10,9 +10,11 @@ def save_metadata(json_response: Union[list, dict], out_file_prefix: str, + metadata_source: str = 'CURATION', root_directory: str = '', metadata_directory: str = '', save_csv: bool = False, + overwrite: bool = False, log=None): """ @@ -20,9 +22,11 @@ def save_metadata(json_response: Union[list, dict], :param json_response: Content in list or dict :param out_file_prefix: Filename prefix. Appends .json and .csv + :param metadata_source: Source of metadata, :param root_directory: Full path containing the working directory :param metadata_directory: Metadata path :param save_csv: Save a CSV file. Default: False + :param overwrite: Overwrite file if it exists. Default: False :param log: LogClass or logging object. Default: log_stdout() """ @@ -31,7 +35,7 @@ def save_metadata(json_response: Union[list, dict], log.debug("starting ...") log.info("") - log.info("** SAVING CURATION METADATA **") + log.info(f"** SAVING {metadata_source} METADATA **") if not root_directory: root_directory = os.getcwd() @@ -43,17 +47,30 @@ def save_metadata(json_response: Union[list, dict], # Write JSON file json_out_file = f"{out_file_prefix}.json" if not os.path.exists(json_out_file): - log.info(f"Writing: {json_out_file}") - with open(json_out_file, 'w') as f: - json.dump(json_response, f, indent=4) + write_json(json_out_file, json_response, log) else: - log.info(f"File exists: {out_file_prefix}") + log.info(f"File exists: {json_out_file}") + if overwrite: + log.info("Overwriting!") + write_json(json_out_file, json_response, log) # Write CSV file if save_csv: - csv_out_file = f"{out_file_prefix}.csv" df = pd.DataFrame.from_dict(json_response, orient='columns') - log.info(f"Writing: {csv_out_file}") - df.to_csv(csv_out_file, index=False) + csv_out_file = f"{out_file_prefix}.csv" + if not os.path.exists(csv_out_file): + log.info(f"Writing: {csv_out_file}") + df.to_csv(csv_out_file, index=False) + else: + log.info(f"File exists: {csv_out_file}") + if overwrite: + log.info("Overwriting!") + df.to_csv(csv_out_file, index=False) log.debug("finished.") + + +def write_json(json_out_file, json_response, log): + log.info(f"Writing: {json_out_file}") + with open(json_out_file, 'w') as f: + json.dump(json_response, f, indent=4) diff --git a/ldcoolp/curation/retrieve.py b/ldcoolp/curation/retrieve.py index b51bcc2..85a604e 100644 --- a/ldcoolp/curation/retrieve.py +++ b/ldcoolp/curation/retrieve.py @@ -63,7 +63,8 @@ def private_file_retrieve(url, filename=None, token=None, url_open=False, def 
diff --git a/ldcoolp/curation/retrieve.py b/ldcoolp/curation/retrieve.py
index b51bcc2..85a604e 100644
--- a/ldcoolp/curation/retrieve.py
+++ b/ldcoolp/curation/retrieve.py
@@ -63,7 +63,8 @@ def private_file_retrieve(url, filename=None, token=None, url_open=False,
 
 def download_files(article_id, fs, root_directory=None, data_directory=None,
-                   metadata_directory=None, log=None, url_open=False):
+                   metadata_directory=None, log=None, url_open=False,
+                   metadata_only=False):
     """
     Purpose:
       Retrieve data for a Figshare deposit following data curation workflow
@@ -75,13 +76,18 @@ def download_files(article_id, fs, root_directory=None, data_directory=None,
     :param metadata_directory: Relative folder path for primary location of metadata (str)
     :param log: logger.LogClass object. Default is stdout via python logging
     :param url_open: bool indicates using urlopen over urlretrieve. Default: False
+    :param metadata_only: bool indicates whether to retrieve only metadata. Default: False
+           If set, no files are downloaded
     """
 
     if isinstance(log, type(None)):
         log = log_stdout()
 
     log.info("")
-    log.info("** DOWNLOADING DATA **")
+    if metadata_only:
+        log.info(f"** NO FILE RETRIEVAL: metadata_only={metadata_only} **")
+    else:
+        log.info("** DOWNLOADING DATA **")
 
     if root_directory is None:
         root_directory = os.getcwd()
@@ -107,49 +113,52 @@ def download_files(article_id, fs, root_directory=None, data_directory=None,
                   metadata_directory=metadata_directory,
                   save_csv=True, log=log)
 
-    for n, file_dict in zip(range(n_files), file_list):
-        log.info(f"Retrieving {n+1} of {n_files} : {file_dict['name']} ({file_dict['size']})")
-        log.info(f"URL: {file_dict['download_url']}")
-        filename = os.path.join(dir_path, file_dict['name'])
-        retrieve_cnt = 0
-        checksum_flag = False
-        if not exists(filename):
-            while retrieve_cnt < N_TRIES_MD5:
-                log.info(f"Retrieval attempt #{retrieve_cnt + 1}")
-                try:
-                    private_file_retrieve(file_dict['download_url'],
-                                          filename=filename, token=fs.token,
-                                          url_open=url_open, log=log)
-                    log.info("Download successful!")
-                    retrieve_cnt += 1
-                except HTTPError:
-                    log.info(f"URL might be public: {file_dict['download_url']}")
-                    log.info("Attempting retrieval without token")
+    if not metadata_only:
+        for n, file_dict in zip(range(n_files), file_list):
+            log.info(f"Retrieving {n+1} of {n_files} : "
+                     f"{file_dict['name']} ({file_dict['size']})")
+            log.info(f"URL: {file_dict['download_url']}")
+            filename = os.path.join(dir_path, file_dict['name'])
+            retrieve_cnt = 0
+            checksum_flag = False
+            if not exists(filename):
+                while retrieve_cnt < N_TRIES_MD5:
+                    log.info(f"Retrieval attempt #{retrieve_cnt + 1}")
                     try:
                         private_file_retrieve(file_dict['download_url'],
-                                              filename=filename,
+                                              filename=filename, token=fs.token,
                                               url_open=url_open, log=log)
                         log.info("Download successful!")
+                        retrieve_cnt += 1
                     except HTTPError:
-                        log.warning(f"Failed to retrieve: {filename}")
-                        retrieve_cnt += 1
-
-        # Perform checksum
-        if exists(filename):
-            if not file_dict['is_link_only']:
-                checksum_flag = check_md5(filename,
-                                          file_dict['supplied_md5'])
-                if checksum_flag:
+                        log.info(f"URL might be public: "
+                                 f"{file_dict['download_url']}")
+                        log.info("Attempting retrieval without token")
+                        try:
+                            private_file_retrieve(file_dict['download_url'],
+                                                  filename=filename,
+                                                  url_open=url_open, log=log)
+                            log.info("Download successful!")
+                        except HTTPError:
+                            log.warning(f"Failed to retrieve: {filename}")
+                        retrieve_cnt += 1
+
+                    # Perform checksum
+                    if exists(filename):
+                        if not file_dict['is_link_only']:
+                            checksum_flag = check_md5(filename,
+                                                      file_dict['supplied_md5'])
+                            if checksum_flag:
+                                break
+                        else:
+                            log.info("Not performing checksum on linked-only record")
                             break
-                else:
-                    log.info("Not performing checksum on linked-only record")
-                    break
+                else:
+                    if not checksum_flag:
+                        log.warning("File retrieval unsuccessful! "
+                                    f"Aborted after {N_TRIES_MD5} tries")
             else:
-                if not checksum_flag:
-                    log.warning("File retrieval unsuccessful! "
-                                f"Aborted after {N_TRIES_MD5} tries")
-        else:
-            log.info("File exists! Not overwriting!")
+                log.info("File exists! Not overwriting!")
 
     # Change permissions on folders and files
     # permissions.curation(dir_path)
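Reviewer note: a sketch of a metadata-only call at the `download_files()` level. Here `fs` is assumed to be the configured Figshare client that `PrerequisiteWorkflow` passes in, and the paths are made up:

```python
from ldcoolp.curation.retrieve import download_files


def fetch_metadata_only(article_id: int, fs) -> None:
    """fs: assumed Figshare client, as used by PrerequisiteWorkflow."""
    download_files(article_id, fs,
                   root_directory='/data/curation/1.ToDo/Smith_12345678',
                   data_directory='DATA',
                   metadata_directory='METADATA',
                   metadata_only=True)  # logs "** NO FILE RETRIEVAL ... **"
```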
" + f"Aborted after {N_TRIES_MD5} tries") else: - if not checksum_flag: - log.warning("File retrieval unsuccessful! " - f"Aborted after {N_TRIES_MD5} tries") - else: - log.info("File exists! Not overwriting!") + log.info("File exists! Not overwriting!") # Change permissions on folders and files # permissions.curation(dir_path) diff --git a/ldcoolp/scripts/generate_qualtrics_links b/ldcoolp/scripts/generate_qualtrics_links index 1dce0d2..2345943 100755 --- a/ldcoolp/scripts/generate_qualtrics_links +++ b/ldcoolp/scripts/generate_qualtrics_links @@ -79,8 +79,7 @@ if __name__ == '__main__': fs_dict = config_dict['figshare'] fs_admin = FigshareInstituteAdmin(**fs_dict, log=log) - q_dict = config_dict['qualtrics'] - q = qualtrics.Qualtrics(qualtrics_dict=q_dict, log=log) + q = qualtrics.Qualtrics(config_dict=config_dict, log=log) dn = depositor_name.DepositorName(args.article_id, fs_admin, verbose=False) diff --git a/ldcoolp/scripts/prereq_script b/ldcoolp/scripts/prereq_script index 7fb4b28..f731928 100755 --- a/ldcoolp/scripts/prereq_script +++ b/ldcoolp/scripts/prereq_script @@ -31,6 +31,7 @@ if __name__ == '__main__': parser.add_argument('--article_id', required=True, help='Figshare article ID') parser.add_argument('--url_open', action='store_true', help='Whether to use urlopen') parser.add_argument('--browser', action='store_true', help='Whether to use urlopen') + parser.add_argument('--metadata_only', action='store_true', help='Do not retrieve data, only metadata') # parser.add_argument('--api_token', required=True, help='Figshare API token') args = parser.parse_args() @@ -94,8 +95,9 @@ if __name__ == '__main__': log.info(f"Retrieving: {articles[ii]} ...") # ... {ii+1} / {num_articles}") # Run pre-req steps - main.workflow(articles[ii], url_open=args.url_open, browser=args.browser, - log=log, config_dict=config_dict) + main.workflow(articles[ii], url_open=args.url_open, + browser=args.browser, log=log, config_dict=config_dict, + metadata_only=args.metadata_only) count += 1 log.info(f"Completed: {articles[ii]} ...") diff --git a/setup.py b/setup.py index ff3ddd7..906b4df 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name='ldcoolp', - version='1.1.1', + version='1.1.2', packages=['ldcoolp'], url='https://github.com/UAL-ODIS/LD-Cool-P', license='MIT License',