From d4723080d643bef6088011a6c434aee6847d97ec Mon Sep 17 00:00:00 2001 From: s2010515 Date: Mon, 10 Jul 2023 13:40:28 +0000 Subject: [PATCH] Update code to add elsevier_api_key and wiley_api_key, changes are reflected in the readme and version 0.3.13 --- README.md | 38 ++- cadmus/__init__.py | 1 + cadmus/main/bioscraping.py | 216 +++++++++--------- cadmus/main/retrieval.py | 29 ++- .../get_crossref_links_and_licenses.py | 4 +- cadmus/retrieval/HTTP_setup.py | 14 +- cadmus/retrieval/HTTP_setup_elsevier.py | 37 +++ cadmus/retrieval/get_request.py | 60 +++-- cadmus/retrieval/parse_link_retrieval.py | 6 +- setup.py | 2 +- 10 files changed, 261 insertions(+), 146 deletions(-) create mode 100644 cadmus/retrieval/HTTP_setup_elsevier.py diff --git a/README.md b/README.md index a5da998..1970915 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,13 @@ You need to git clone the project and install it. An API key from NCBI (this is used to search PubMed for articles using a search string or list of PubMed IDs, you can find more information [here](https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/)). +**Recommended requirements:** + +An API key from Wiley: this key will allow you to access the OA publications and the publications you or your institution have the right to access from Wiley. You can find more information [here](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) + +An API key from Elsevier: this key will allow you to access the OA publications and the publications you or your institution have the right to access from Elsevier. You can find more information [here](https://dev.elsevier.com/) + + ## Installation Cadmus has a number of dependencies on other Python packages, it is recommended to install it in an isolated environment. @@ -31,18 +38,20 @@ In order to create your corpora you are going to use the function called `bioscr The function can also receive optional parameters. -1. The "start" parameter tells the function at which service we were at before failure (e.g. crossref, doi, PubMed Central API. . .). -2. The "idx" parameter tells the function what is the last saved row index (article). +1. The "wiley_api_key" parameter allows Wiley to identify which publications you or your institution have the right to access. It also gives you access to OA publications that you could not retrieve without a key. RECOMMENDED +2. The "elsevier_api_key" parameter allows Elsevier to identify which publications you or your institution have the right to access. It also gives you access to OA publications that you could not retrieve without a key. RECOMMENDED +3. The "start" parameter tells the function which service we were at before failure (e.g. crossref, doi, PubMed Central API. . .). +4. The "idx" parameter tells the function the last saved row index (article). Start and idx are designed to use when restarting cadmus after a program failure. When Cadmus is running, there is a repeated output feed at the top of the live output. This line will show you the stage and index that your output dataframe was last saved in case of failure for whatever reason. By using these optional parameters, the programme will take off where it left off, saving you starting the process from the beginning again. -1. "full_search", in case you want to check if a document became available since the last time you tried. +5. "full_search", in case you want to check if a document became available since the last time you tried.
"full_search" has three predefined values: - The default Value 'None', the function only looks for the new articles since the last run. - 'light', the function looks for the new articles since the last run and re-tried the row where we did not get any format. - 'heavy', the function looks for the new articles since the last run and re-tried the row where it did not retrieve at least one tagged version (i.e. html or xml) in combination with the pdf format. -2. The "keep_abstract" parameter has the default value 'True' and can be changed to 'False'. When set to 'True', our parsing will load any format from the begining of the document. If change to 'False', our parsing is trying to identify the abstract from any format and start to extract the text after it. We are offering the option of removing the abstract but we can not guarantee that our approach is the more realiable for doing so. In case you would like to apply your own parsing method for removing the abstract feel free to load any file saved during the retrieval availble in the output folder: +6. The "keep_abstract" parameter has the default value 'True' and can be changed to 'False'. When set to 'True', our parsing will load any format from the begining of the document. If change to 'False', our parsing is trying to identify the abstract from any format and start to extract the text after it. We are offering the option of removing the abstract but we can not guarantee that our approach is the more realiable for doing so. In case you would like to apply your own parsing method for removing the abstract feel free to load any file saved during the retrieval availble in the output folder: ```"output/formats/{format}s/{index}.{suffix}.zip"```. You need to set the export path before every use so that cadmus is able to retrieve more than 10 000 records from NCBI. For that we offer a function called `display_export_path`. You just need to call this function and copy past the result into your terminal before calling `bioscraping`. @@ -59,12 +68,24 @@ export PATH=${PATH}:YOUR_WORKING_DIRECTORY/output/medline/edirect After copying and paste the above export into your terminal you can now run `bioscraping` with the following example: +**Minimum requirements:** +```python +from cadmus import bioscraping +bioscraping( + INPUT, #type str + EMAIL, #type str + NCBI_API_KEY #type str + ) +``` +**Minimum recommended requirements:** ```python from cadmus import bioscraping bioscraping( - INPUT, - EMAIL, - NCBI_APY_KEY + INPUT, #type str + EMAIL, #type str + NCBI_API_KEY, #type str + wiley_api_key = YOUR_WILEY_API_KEY, #type str + elsevier_api_key = YOUR_ELSEVIER_API_KEY #type str ) ``` @@ -251,6 +272,9 @@ A: It seems that you are on a shared computer, you need to identify who is the o ## Version +### Version 0.3.13 +-> Since Crossref retired the API key feature to let Elsevier and Wiley identified the author of the publication request. wiley_api_key and elsevier_api_key optional parameters have been added as input parameters. These are not mandatory parameters but increase greatly the retrieval rate as they give access to Wiley and Elsevier publications respectively. + ### Version 0.3.12 -> Applied some changes in clean_up_dir.py. 
diff --git a/cadmus/__init__.py b/cadmus/__init__.py index 034d7d4..c0a9037 100644 --- a/cadmus/__init__.py +++ b/cadmus/__init__.py @@ -5,6 +5,7 @@ from cadmus.pre_retrieval.creation_retrieved_df import creation_retrieved_df from cadmus.pre_retrieval.ncbi_id_converter_batch import ncbi_id_converter_batch from cadmus.retrieval.HTTP_setup import HTTP_setup +from cadmus.retrieval.HTTP_setup_elsevier import HTTP_setup_elsevier from cadmus.retrieval.get_request import get_request from cadmus.retrieval.get_tdm_links import get_tdm_links from cadmus.pre_retrieval.key_fields import key_fields diff --git a/cadmus/main/bioscraping.py b/cadmus/main/bioscraping.py index 86180bc..bd83f96 100644 --- a/cadmus/main/bioscraping.py +++ b/cadmus/main/bioscraping.py @@ -38,7 +38,7 @@ from cadmus.pre_retrieval.add_mesh_remove_preprint import add_mesh_remove_preprint from cadmus.pre_retrieval.change_output_structure import change_output_structure -def bioscraping(input_function, email, api_key, start = None, idx = None , full_search = None, keep_abstract = True, click_through_api_key = 'XXXXXXXX-XXXXXXXX-XXXXXXXX-XXXXXXXX'): +def bioscraping(input_function, email, api_key, wiley_api_key = None, elsevier_api_key = None, start = None, idx = None , full_search = None, keep_abstract = True, click_through_api_key = 'XXXXXXXX-XXXXXXXX-XXXXXXXX-XXXXXXXX'): # first bioscraping checks whether this is an update of a previous search or a new search. # create all the output directories if they do not already exist update = check_for_retrieved_df() @@ -182,7 +182,7 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ retrieved_df = ncbi_id_converter_batch(retrieved_df, email) # set up the crossref metadata http request ('base') - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'base') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'base') #create a new column to note whether there is a crossref metadata record available - default - 0 (NO). 
retrieved_df['crossref'] = 0 @@ -191,7 +191,7 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ retrieved_df['licenses'] = [{} for val in retrieved_df.index] # work through the retrieved_df for every available doi and query crossref for full text links - retrieved_df = get_crossref_links_and_licenses(retrieved_df, http, base_url, headers) + retrieved_df = get_crossref_links_and_licenses(retrieved_df, http, base_url, headers, elsevier_api_key) # now time to download some fulltexts, will need to create some new columns to show success or failure for each format @@ -241,14 +241,14 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ #this project is not trigered by a save if start == None and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'crossref') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'crossref') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'crossref', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'crossref', keep_abstract, elsevier_api_key, mail = email) #We skip all the previous step to start at the crossref step elif start == 'crossref' and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'crossref') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'crossref') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'crossref', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'crossref', keep_abstract, elsevier_api_key, mail = email) start = None #we run the code only on crossref elif start == 'crossref_only': @@ -280,19 +280,19 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ finish = retrieved_df2[divide_at:] # row that have not been done yet done = retrieved_df2[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'crossref') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'crossref') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'crossref', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'crossref', keep_abstract, elsevier_api_key, done = done, mail = email) retrieved_df2 = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'crossref') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'crossref') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'crossref', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'crossref', keep_abstract, elsevier_api_key, mail = email) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'crossref') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'crossref') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'crossref', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, 
base_url, headers, 'crossref', keep_abstract, elsevier_api_key, mail = email) # we start at the crossref step and at a specific index, could be related to a previous failled attempt elif start == 'crossref' and idx != None: try: @@ -303,30 +303,30 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df[divide_at:] done = retrieved_df[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'crossref') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'crossref') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'crossref', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'crossref', keep_abstract, elsevier_api_key, done = done, mail = email) retrieved_df = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) #change the start and the idx to none to complete all the next step with all the row start = None idx = None else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'crossref') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'crossref') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'crossref', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'crossref', keep_abstract, elsevier_api_key, mail = email) start = None idx = None else: pass # After crossref, we are going on doi.org - this uses the doi provided and redirection to see if we land on the full text html page if start == None and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'doiorg') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'doiorg') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'doiorg', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'doiorg', keep_abstract, elsevier_api_key) elif start == 'doiorg' and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'doiorg') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'doiorg') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'doiorg', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'doiorg', keep_abstract, elsevier_api_key) start = None elif start == 'doiorg_only': try: @@ -352,19 +352,19 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df2[divide_at:] done = retrieved_df2[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'doiorg') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'doiorg') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'doiorg', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'doiorg', keep_abstract, elsevier_api_key, done = done) retrieved_df2 = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) else: - http, base_url, headers = HTTP_setup(email, 
click_through_api_key, 'doiorg') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'doiorg') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'doiorg', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'doiorg', keep_abstract, elsevier_api_key) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'doiorg') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'doiorg') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'doiorg', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'doiorg', keep_abstract, elsevier_api_key) elif start == 'doiorg' and idx != None: try: divide_at = retrieved_df.index.get_loc(idx) @@ -374,29 +374,29 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df[divide_at:] done = retrieved_df[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'doiorg') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'doiorg') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'doiorg', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'doiorg', keep_abstract, elsevier_api_key, done = done) retrieved_df = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) start = None idx = None else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'doiorg') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'doiorg') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'doiorg', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'doiorg', keep_abstract, elsevier_api_key) start = None idx = None else: pass #we continue by sending requests to europe pmc, looking for xml format if start == None and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'epmcxml') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'epmcxml') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'epmcxml', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'epmcxml', keep_abstract, elsevier_api_key) elif start == 'epmcxml' and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'epmcxml') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'epmcxml') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'epmcxml', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'epmcxml', keep_abstract, elsevier_api_key) start = None elif start == 'epmcxml_only': try: @@ -422,19 +422,19 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df2[divide_at:] done = retrieved_df2[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 
'epmcxml') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'epmcxml') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'epmcxml', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'epmcxml', keep_abstract, elsevier_api_key, done = done) retrieved_df2 = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'epmcxml') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'epmcxml') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'epmcxml', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'epmcxml', keep_abstract, elsevier_api_key) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'epmcxml') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'epmcxml') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'epmcxml', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'epmcxml', keep_abstract, elsevier_api_key) elif start == 'epmcxml' and idx != None: try: divide_at = retrieved_df.index.get_loc(idx) @@ -444,29 +444,29 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df[divide_at:] done = retrieved_df[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'epmcxml') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'epmcxml') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'epmcxml', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'epmcxml', keep_abstract, elsevier_api_key, done = done) retrieved_df = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) start = None idx = None else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'epmcxml') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'epmcxml') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'epmcxml', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'epmcxml', keep_abstract, elsevier_api_key) start = None idx = None else: pass #pmc, xml format if start == None and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcxmls') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcxmls') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pmcxmls', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pmcxmls', keep_abstract, elsevier_api_key) elif start == 'pmcxmls' and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcxmls') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcxmls') # now use the http request set up to request for each of the retrieved_df - retrieved_df = 
retrieval(retrieved_df, http, base_url, headers, 'pmcxmls', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pmcxmls', keep_abstract, elsevier_api_key) start = None elif start == 'pmcxmls_only': try: @@ -492,19 +492,19 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df2[divide_at:] done = retrieved_df2[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcxmls') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcxmls') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'pmcxmls', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'pmcxmls', keep_abstract, elsevier_api_key, done = done) retrieved_df2 = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcxmls') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcxmls') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pmcxmls', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pmcxmls', keep_abstract, elsevier_api_key) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcxmls') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcxmls') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pmcxmls', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pmcxmls', keep_abstract, elsevier_api_key) elif start == 'pmcxmls' and idx != None: try: divide_at = retrieved_df.index.get_loc(idx) @@ -514,29 +514,29 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df[divide_at:] done = retrieved_df[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcxmls') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcxmls') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'pmcxmls', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'pmcxmls', keep_abstract, elsevier_api_key, done = done) retrieved_df = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) start = None idx = None else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcxmls') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcxmls') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pmcxmls', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pmcxmls', keep_abstract, elsevier_api_key) start = None idx = None else: pass #pmc tgz, these zip files contain pdf and xml if start == None and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmctgz') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmctgz') # now use the http request set up to request for each of the 
retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pmctgz', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pmctgz', keep_abstract, elsevier_api_key) elif start == 'pmctgz' and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmctgz') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmctgz') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pmctgz', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pmctgz', keep_abstract, elsevier_api_key) start = None elif start == 'pmctgz_only': try: @@ -562,19 +562,19 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df2[divide_at:] done = retrieved_df2[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmctgz') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmctgz') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'pmctgz', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'pmctgz', keep_abstract, elsevier_api_key, done = done) retrieved_df2 = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmctgz') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmctgz') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pmctgz', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pmctgz', keep_abstract, elsevier_api_key) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmctgz') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmctgz') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pmctgz', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pmctgz', keep_abstract, elsevier_api_key) elif start == 'pmctgz' and idx != None: try: divide_at = retrieved_df.index.get_loc(idx) @@ -584,29 +584,29 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df[divide_at:] done = retrieved_df[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmctgz') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmctgz') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'pmctgz', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'pmctgz', keep_abstract, elsevier_api_key, done = done) retrieved_df = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) start = None idx = None else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmctgz') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmctgz') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, 
headers, 'pmctgz', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pmctgz', keep_abstract, elsevier_api_key) start = None idx = None else: pass #pmc, pdf format if start == None and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcpdfs') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcpdfs') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, '', 'pmcpdfs', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, '', 'pmcpdfs', keep_abstract, elsevier_api_key) elif start == 'pmcpdfs' and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcpdfs') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcpdfs') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, '', 'pmcpdfs', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, '', 'pmcpdfs', keep_abstract, elsevier_api_key) start = None elif start == 'pmcpdfs_only': try: @@ -632,19 +632,19 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df2[divide_at:] done = retrieved_df2[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcpdfs') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcpdfs') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, '', 'pmcpdfs', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, '', 'pmcpdfs', keep_abstract, elsevier_api_key, done = done) retrieved_df2 = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcpdfs') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcpdfs') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, '', 'pmcpdfs', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, '', 'pmcpdfs', keep_abstract, elsevier_api_key) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcpdfs') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcpdfs') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, '', 'pmcpdfs', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, '', 'pmcpdfs', keep_abstract, elsevier_api_key) elif start == 'pmcpdfs' and idx != None: try: divide_at = retrieved_df.index.get_loc(idx) @@ -654,29 +654,29 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df[divide_at:] done = retrieved_df[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcpdfs') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcpdfs') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, '', 'pmcpdfs', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, '', 'pmcpdfs', keep_abstract, elsevier_api_key, done = done) 
retrieved_df = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) start = None idx = None else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pmcpdfs') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pmcpdfs') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, '', 'pmcpdfs', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, '', 'pmcpdfs', keep_abstract, elsevier_api_key) start = None idx = None else: pass # we are now scraping PubMed abstract page to see if we can identify candidate full text links if start == None and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pubmed') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pubmed') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pubmed', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pubmed', keep_abstract, elsevier_api_key) elif start == 'pubmed' and idx == None: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pubmed') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pubmed') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pubmed', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pubmed', keep_abstract, elsevier_api_key) start = None elif start == 'pubmed_only': try: @@ -702,19 +702,19 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df2[divide_at:] done = retrieved_df2[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pubmed') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pubmed') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'pubmed', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'pubmed', keep_abstract, elsevier_api_key, done = done) retrieved_df2 = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pubmed') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pubmed') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pubmed', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pubmed', keep_abstract, elsevier_api_key) else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pubmed') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pubmed') # now use the http request set up to request for each of the retrieved_df - retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pubmed', keep_abstract) + retrieved_df2 = retrieval(retrieved_df2, http, base_url, headers, 'pubmed', keep_abstract, elsevier_api_key) elif start == 'pubmed' and idx != None: try: divide_at = retrieved_df.index.get_loc(idx) @@ -724,16 +724,16 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if 
divide_at != 0: finish = retrieved_df[divide_at:] done = retrieved_df[:divide_at] - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pubmed') + http, base_url, headers = HTTP_setup(email, click_through_api_key, wiley_api_key, 'pubmed') # now use the http request set up to request for each of the retrieved_df - finish = retrieval(finish, http, base_url, headers, 'pubmed', keep_abstract, done = done) + finish = retrieval(finish, http, base_url, headers, 'pubmed', keep_abstract, elsevier_api_key, done = done) retrieved_df = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) start = None idx = None else: - http, base_url, headers = HTTP_setup(email, click_through_api_key, 'pubmed') + http, base_url, headers = HTTP_setup(email, click_through_api_key,wiley_api_key, 'pubmed') # now use the http request set up to request for each of the retrieved_df - retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pubmed', keep_abstract) + retrieved_df = retrieval(retrieved_df, http, base_url, headers, 'pubmed', keep_abstract, elsevier_api_key) start = None idx = None else: @@ -774,10 +774,10 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if start == None and idx == None: # updating the retrieved df with the candidate links that we extracted during the previous steps - retrieved_df2 = parse_link_retrieval(retrieved_df, email, click_through_api_key, keep_abstract) + retrieved_df2 = parse_link_retrieval(retrieved_df, email, click_through_api_key, keep_abstract, wiley_api_key, elsevier_api_key) elif start == 'retrieved2' and idx == None: # restart from this step - retrieved_df2 = parse_link_retrieval(retrieved_df, email, click_through_api_key, keep_abstract) + retrieved_df2 = parse_link_retrieval(retrieved_df, email, click_through_api_key, keep_abstract, wiley_api_key, elsevier_api_key) start = None elif start == 'retrieved2_only': try: @@ -803,13 +803,13 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df2[divide_at:] done = retrieved_df2[:divide_at] - finish = parse_link_retrieval(finish, email, click_through_api_key, keep_abstract, done = done) + finish = parse_link_retrieval(finish, email, click_through_api_key, keep_abstract, wiley_api_key, elsevier_api_key, done = done) retrieved_df2 = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) else: - retrieved_df2 = parse_link_retrieval(retrieved_df2, email, click_through_api_key, keep_abstract) + retrieved_df2 = parse_link_retrieval(retrieved_df2, email, click_through_api_key, keep_abstract, wiley_api_key, elsevier_api_key) else: - retrieved_df2 = parse_link_retrieval(retrieved_df2, email, click_through_api_key, keep_abstract) + retrieved_df2 = parse_link_retrieval(retrieved_df2, email, click_through_api_key, keep_abstract, elsevier_api_key) elif start == 'retrieved2' and idx != None: try: divide_at = retrieved_df.index.get_loc(idx) @@ -819,12 +819,12 @@ def bioscraping(input_function, email, api_key, start = None, idx = None , full_ if divide_at != 0: finish = retrieved_df[divide_at:] done = retrieved_df[:divide_at] - finish = parse_link_retrieval(finish, email, click_through_api_key, keep_abstract, done = done) + finish = parse_link_retrieval(finish, email, click_through_api_key, keep_abstract, wiley_api_key, elsevier_api_key, done = done) retrieved_df2 = pd.concat([done, finish], axis=0, join='outer', ignore_index=False, copy=True) start = None idx = None else: - 
retrieved_df2 = parse_link_retrieval(retrieved_df, email, click_through_api_key, keep_abstract) + retrieved_df2 = parse_link_retrieval(retrieved_df, email, click_through_api_key, keep_abstract, wiley_api_key, elsevier_api_key) start = None idx = None else: diff --git a/cadmus/main/retrieval.py b/cadmus/main/retrieval.py index 333465e..ba0e742 100644 --- a/cadmus/main/retrieval.py +++ b/cadmus/main/retrieval.py @@ -28,7 +28,7 @@ import os import glob -def retrieval(retrieval_df, http, base_url, headers, stage, keep_abstract, done = None): +def retrieval(retrieval_df, http, base_url, headers, stage, keep_abstract, elsevier_api_key, done = None, mail = ''): # the input will be the retrieved_df and each process will be subset so that the required input is always available (doi or pmid or pmcid) #the counter variable keep track on when to save the current result, every 100 rows or when a step is completed counter = 0 @@ -117,7 +117,7 @@ def retrieval(retrieval_df, http, base_url, headers, stage, keep_abstract, done print(f'trying to download from: \n{link}') try: #requesting the document by creatng the header and the request - response_d, response = get_request('', http, link, headers, 'crossref') + response_d, response = get_request('', http, link, headers, 'crossref', elsevier_api_key, mail = mail) except: pass @@ -210,6 +210,19 @@ def retrieval(retrieval_df, http, base_url, headers, stage, keep_abstract, done zip_file.writestr(f"{index}.txt", data=p_text) zip_file.testzip() zip_file.close() + '''elif 'api' in response_d.get('url') and 'elsevier' in response_d.get('url'): + with zipfile.ZipFile(f"./output/formats/xmls/{index}.xml.zip", mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file: + zip_file.writestr(f"{index}.xml", data=response.text.encode('ascii', 'ignore').decode()) + zip_file.testzip() + zip_file.close() + # saving the file to a pre-defines directory as we identified it as TP + # changing the value to one for future references + retrieval_df.loc[index,'xml'] = 1 + retrieval_df.loc[index,'xml_parse_d'].update(xml_d) + with zipfile.ZipFile(f"./output/retrieved_parsed_files/xmls/{index}.txt.zip", mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file: + zip_file.writestr(f"{index}.txt", data=p_text) + zip_file.testzip() + zip_file.close()''' else: pass @@ -293,7 +306,7 @@ def retrieval(retrieval_df, http, base_url, headers, stage, keep_abstract, done if retrieval_df.xml.loc[index] != 1 and retrieval_df.pmcid.loc[index] != None: try: #creating the header and the protocol to retreive the file from epmc API - response_d, response = get_request(pmcid, http, base_url, headers, 'epmcxml') + response_d, response = get_request(pmcid, http, base_url, headers, 'epmcxml', elsevier_api_key) except: pass #if the code status we get from the server is 429, we notifiy the user and stop the process to give some time to rest @@ -338,7 +351,7 @@ def retrieval(retrieval_df, http, base_url, headers, stage, keep_abstract, done if retrieval_df.xml.loc[index] != 1 and retrieval_df.pmcid.loc[index] != None: try: #creating the header and protocol to retreive the document from PMC API - response_d, response = get_request(pmcid, http, base_url, headers, 'pmcxmls') + response_d, response = get_request(pmcid, http, base_url, headers, 'pmcxmls', elsevier_api_key) except: pass #if the error code is 429 stoping the process to give time to rest @@ -383,7 +396,7 @@ def retrieval(retrieval_df, http, base_url, headers, stage, keep_abstract, done if retrieval_df.pdf.loc[index] != 1 
and retrieval_df.pmcid.loc[index] != None: try: #creating the header and the protocol to request the docuemnt from PMC API - response_d, response = get_request(pmcid, http, base_url, headers, 'pmcpdfs') + response_d, response = get_request(pmcid, http, base_url, headers, 'pmcpdfs', elsevier_api_key) except: pass #stop the process in case of 429 status code @@ -443,7 +456,7 @@ def retrieval(retrieval_df, http, base_url, headers, stage, keep_abstract, done if retrieval_df.pmc_tgz.loc[index] != 1 and retrieval_df.pmcid.loc[index] != None: try: #creating the header and protocol to request the tgz from PMC - response_d, response = get_request(pmcid, http, base_url, headers, 'pmctgz') + response_d, response = get_request(pmcid, http, base_url, headers, 'pmctgz', elsevier_api_key) except: pass #stop the process in case of status code 429 @@ -524,7 +537,7 @@ def retrieval(retrieval_df, http, base_url, headers, stage, keep_abstract, done if retrieval_df.doi.loc[index] != None: try: #creating the header and the protocol - response_d, response = get_request(doi, http, base_url, headers, 'doiorg') + response_d, response = get_request(doi, http, base_url, headers, 'doiorg', elsevier_api_key) except: pass # check the response status @@ -679,7 +692,7 @@ def retrieval(retrieval_df, http, base_url, headers, stage, keep_abstract, done if ((retrieval_df.html.loc[index] == 0) and (retrieval_df.xml.loc[index] == 0)) or (retrieval_df.pdf.loc[index]) == 0: # send the request to pubmed using the base url and pmid try: - response_d, response = get_request(pmid, http, base_url, headers, 'pubmed') + response_d, response = get_request(pmid, http, base_url, headers, 'pubmed', elsevier_api_key) except: pass # check the resonse code diff --git a/cadmus/pre_retrieval/get_crossref_links_and_licenses.py b/cadmus/pre_retrieval/get_crossref_links_and_licenses.py index 37bf038..df81f9b 100644 --- a/cadmus/pre_retrieval/get_crossref_links_and_licenses.py +++ b/cadmus/pre_retrieval/get_crossref_links_and_licenses.py @@ -5,7 +5,7 @@ import zipfile # use this function when we already have a retrieved_df with indexes and all available ids -def get_crossref_links_and_licenses(retrieved_df, http, base_url, headers): +def get_crossref_links_and_licenses(retrieved_df, http, base_url, headers, elsevier_api_key): # we send the doi to the crossref API server as a GET request using the function defined above # lets simplify the retrieved_df to only have rows with dois available @@ -17,7 +17,7 @@ def get_crossref_links_and_licenses(retrieved_df, http, base_url, headers): count +=1 # send the request using our function - response_d, response = get_request(row['doi'], http, base_url, headers, 'base') + response_d, response = get_request(row['doi'], http, base_url, headers, 'base', elsevier_api_key) # check the status code if response_d['status_code'] == 200: diff --git a/cadmus/retrieval/HTTP_setup.py b/cadmus/retrieval/HTTP_setup.py index ff82ae5..cd8408f 100644 --- a/cadmus/retrieval/HTTP_setup.py +++ b/cadmus/retrieval/HTTP_setup.py @@ -4,7 +4,7 @@ from requests.exceptions import ConnectionError, HTTPError, Timeout from urllib3.util.retry import Retry -def HTTP_setup(email, click_through_api_key, stage): +def HTTP_setup(email, click_through_api_key, wiley_api_key, stage): #each stage modifies the base url and parameters for that part of the process whilst the general set up and exceptions remain the same # set the headers as a mailto @@ -16,7 +16,17 @@ def HTTP_setup(email, click_through_api_key, stage): elif stage == 'crossref': # 
crossref is used for downloading full texts from links provided by crossref. # this stage requires a clickthrough API key to be provided and there is no base URL - headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36", + if wiley_api_key != None: + headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36", + 'Accept-Language': "en,en-US;q=0,5", + 'Accept': "text/html,application/pdf,application/xhtml+xml,application/xml,text/plain,text/xml", + 'mailto':email, + 'Wiley-TDM-Client-Token': wiley_api_key, + 'CR-Clickthrough-Client-Token': click_through_api_key, + 'Accept-Encoding': 'gzip, deflate, compress', + 'Accept-Charset': 'ascii, iso-8859-1;q=0.5, *;q=0.1'} + else: + headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36", 'Accept-Language': "en,en-US;q=0,5", 'Accept': "text/html,application/pdf,application/xhtml+xml,application/xml,text/plain,text/xml", 'mailto':email, diff --git a/cadmus/retrieval/HTTP_setup_elsevier.py b/cadmus/retrieval/HTTP_setup_elsevier.py new file mode 100644 index 0000000..4fe9994 --- /dev/null +++ b/cadmus/retrieval/HTTP_setup_elsevier.py @@ -0,0 +1,37 @@ +import urllib.request as request +import requests +from requests.adapters import HTTPAdapter +from requests.exceptions import ConnectionError, HTTPError, Timeout +from urllib3.util.retry import Retry + +def HTTP_setup_elsevier(mail): + + headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36", + 'Accept-Language': "en,en-US;q=0,5", + 'Accept': "text/html,application/pdf,application/xhtml+xml,application/xml,text/plain,text/xml", + 'mailto':mail, + 'Accept-Encoding': 'gzip, deflate, compress', + 'Accept-Charset': 'ascii, iso-8859-1;q=0.5, *;q=0.1'} + base_url = "" + + # initiate a requests.session so we can send multiple requests with the same parameters and cookies persist + http = requests.Session() + + # set the base url, in this case it is the works url for the crossref api + http.headers.update(headers) + + # set up a retry strategy + retry_strategy = Retry( + total=3, + status_forcelist = [429, 500, 502, 503, 504], + method_whitelist = ["GET"], + backoff_factor = 1 + ) + + # add the retry strategy to the adapter for a session + adapter = HTTPAdapter(max_retries=retry_strategy) + + # mount these settings to our session + http.mount(base_url, adapter) + + return http, headers \ No newline at end of file diff --git a/cadmus/retrieval/get_request.py b/cadmus/retrieval/get_request.py index 077247c..d3d14fe 100644 --- a/cadmus/retrieval/get_request.py +++ b/cadmus/retrieval/get_request.py @@ -4,8 +4,11 @@ from requests.exceptions import ConnectionError, HTTPError, Timeout from urllib3.util import Retry from urllib3.exceptions import NewConnectionError +import time -def get_request(input_id, http, base_url, headers, stage): +from cadmus.retrieval.HTTP_setup_elsevier import HTTP_setup_elsevier + +def get_request(input_id, http, base_url, headers, stage, elsevier_api_key, mail = ''): # for text retrieval its best to clear cookies before each request @@ -16,27 +19,46 @@ def get_request(input_id, http, base_url, headers, stage): exception = None attempt = 1 need_to_back_off = False + is_elsevier = False while attempt <3: # we're going to set up a try except system so that we deal with the most 
common errors try: # send the request to the different APIs website if stage == 'base' or stage == 'doiorg' or stage == 'pubmed': - r = http.get(url = f'{base_url}{input_id}', headers=headers, timeout = (20,120)) + base_url = f'{base_url}{input_id}' + r = http.get(url = f'{base_url}', headers=headers, timeout = (20,120)) elif stage == 'crossref': - r = http.get(f'{base_url}', headers=headers, timeout = (20,120)) + if 'api' in base_url and 'elsevier' in base_url: + if elsevier_api_key != None: + http, headers = HTTP_setup_elsevier(mail) + previous = base_url + base_url = f'{base_url.split("?")[0]}?APIKey={elsevier_api_key}&{base_url.split("?")[1]}' + r = http.get(f'{base_url}', headers=headers, timeout = (20,120)) + is_elsevier = True + base_url = previous + time.sleep(0.3) + else: + r = http.get(f'{base_url}', headers=headers, timeout = (20,120)) + else: + r = http.get(f'{base_url}', headers=headers, timeout = (20,120)) elif stage == 'epmcxml': - r = http.get(f'{base_url}{input_id}/fullTextXML', headers=headers, timeout = (20,120)) + base_url = f'{base_url}{input_id}/fullTextXML' + r = http.get(f'{base_url}', headers=headers, timeout = (20,120)) elif stage == 'epmcsupp': - r = http.get(f'{base_url}{input_id}/supplementaryFiles', headers=headers, timeout = (20,120), stream=True) + base_url = f'{base_url}{input_id}/supplementaryFiles' + r = http.get(f'{base_url}', headers=headers, timeout = (20,120), stream=True) elif stage == 'pmcxmls': - r = http.get(f'{base_url}{input_id}&metadataPrefix=pmc', headers=headers, timeout = (20,120)) + base_url = f'{base_url}{input_id}&metadataPrefix=pmc' + r = http.get(f'{base_url}', headers=headers, timeout = (20,120)) elif stage == 'pmcpdfs' or stage == 'pmctgz': - r = http.get(f'{base_url}{input_id}&format=', headers=headers, timeout = (20,120), stream=True) + base_url = f'{base_url}{input_id}&format=' + r = http.get(f'{base_url}', headers=headers, timeout = (20,120), stream=True) else: pass # check for 200 response and raise exception if not so. - r.raise_for_status() + if r.status_code != 200: + print(f'Error {r.status_code} for {base_url}') #now we have a set of multiple exceptions that might occur except HTTPError as error: @@ -90,13 +112,21 @@ def get_request(input_id, http, base_url, headers, stage): else: pass - # build the output dictionary and return - r_d.update({'status_code':status_code, - 'headers':headers, - 'content':content, - 'text':text, - 'url':r_url, - 'error':exception}) + if is_elsevier == False: + # build the output dictionary and return + r_d.update({'status_code':status_code, + 'headers':headers, + 'content':content, + 'text':text, + 'url':r_url, + 'error':exception}) + else: + r_d.update({'status_code':status_code, + 'headers':headers, + 'content':content, + 'text':text, + 'url':base_url, + 'error':exception}) # now we close the response objects to keep the number of open files to a minimum if r != None: diff --git a/cadmus/retrieval/parse_link_retrieval.py b/cadmus/retrieval/parse_link_retrieval.py index a3ba1e6..ba881ca 100644 --- a/cadmus/retrieval/parse_link_retrieval.py +++ b/cadmus/retrieval/parse_link_retrieval.py @@ -23,7 +23,7 @@ # once we get to this stage we have tried quite a few approaches to get a full text document for each article. # we can pull out the records that do not have a tagged version and a pdf version to keep trying for. # we will now go through the dataframe and sequentially try the untried links in the full_text_links dictionary. 
-def parse_link_retrieval(retrieval_df, email, click_through_api_key, keep_abstract, done = None): +def parse_link_retrieval(retrieval_df, email, click_through_api_key, keep_abstract, wiley_api_key, elsevier_api_key, done = None): counter = 0 stage = 'retrieved2' for index, row in retrieval_df.iterrows(): @@ -73,11 +73,11 @@ def parse_link_retrieval(retrieval_df, email, click_through_api_key, keep_abstra count +=1 # we need to send each link in a get request to determine the response format type. # we can use the same settings as the doi.org step but provide an empty base_url for input along with the link - http, base_url, headers= HTTP_setup(email, click_through_api_key, 'doiorg') + http, base_url, headers= HTTP_setup(email, click_through_api_key, wiley_api_key, 'doiorg') # we send the request using our generic function - response_d, response = get_request(link, http, '', headers, 'doiorg') + response_d, response = get_request(link, http, '', headers, 'doiorg', elsevier_api_key) # check the response status if response_d['status_code'] == 429: diff --git a/setup.py b/setup.py index 73e842f..531d0f4 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setuptools.setup( name="cadmus", - version="0.3.12", + version="0.3.13", author="Jamie Campbell, Ian Simpson, Antoine Lain", author_email="Jamie.campbell@igmm.ed.ac.uk, Ian.Simpson@ed.ac.uk, Antoine.Lain@ed.ac.uk", description="This projects is to build full text retrieval system setup for generation of large biomedical corpora from published literature.",
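As a quick reference for reviewers, the sketch below condenses how the two new credentials travel with a request after this patch: the Wiley token is sent as the `Wiley-TDM-Client-Token` header added to the 'crossref' stage in HTTP_setup.py, while the Elsevier key is spliced into the query string of Elsevier API links as in get_request.py. The links and key values are placeholders, and the snippet is illustrative rather than part of the package.

```python
import requests

WILEY_API_KEY = "YOUR_WILEY_API_KEY"        # placeholder
ELSEVIER_API_KEY = "YOUR_ELSEVIER_API_KEY"  # placeholder

# Wiley: the token rides in a request header alongside the usual browser-like headers.
headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "text/html,application/pdf,application/xml,text/plain",
    "Wiley-TDM-Client-Token": WILEY_API_KEY,
}

# Elsevier: the key is inserted into the link's query string before the request is sent,
# mirroring the base_url.split("?") logic used for Elsevier API links.
link = "https://api.elsevier.com/content/article/doi/10.1016/EXAMPLE?httpAccept=text/xml"  # placeholder link
elsevier_link = f"{link.split('?')[0]}?APIKey={ELSEVIER_API_KEY}&{link.split('?')[1]}"

with requests.Session() as http:
    wiley_r = http.get("https://onlinelibrary.wiley.com/doi/pdf/10.1002/EXAMPLE",  # placeholder full-text link
                       headers=headers, timeout=(20, 120))
    elsevier_r = http.get(elsevier_link, headers=headers, timeout=(20, 120))
```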