-
Notifications
You must be signed in to change notification settings - Fork 1
/
gpcp.py
175 lines (149 loc) · 6.11 KB
/
gpcp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env python
"""
Copyright 2021 ARC Centre of Excellence for Climate Extremes
author: Paola Petrelli <[email protected]>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This script is used to download, checksum and update the GPCP dataset on
the NCI server
The dataset is stored in /g/data/ia39/aus-ref-clim-data-nci/gpcp/data
The code log files are currently in /g/data/ia39/aus-ref-clim-data-nci/gpcp/code/update_log.txt
Created:
2018-01-30
Last change:
2022-05-31
Usage:
Inputs are:
y - year to check/download/update, required
v - version to process (v1-2, v2-3 or v3-2), required
t - timestep, either monthly or daily, default is daily
Uses the following modules:
import requests to download files and html via http
import beautifulsoup4 to parse html
import time and calendar to convert timestamp in filename
to day number from 1-366 for each year
import subprocess to run cksum as a shell command
import argparse to manage inputs
"""
import os, sys
import time, calendar
import argparse
import subprocess
import requests
import re
from datetime import datetime
from bs4 import BeautifulSoup
def parse_input():
    '''Parse the command line arguments of the script.

    Options:
        -y/--year     year to process, required
        -v/--version  dataset version to process, required
        -t/--tstep    timestep, "daily" (default) or "monthly"

    Returns:
        dict with the keys 'year', 'version' and 'tstep'; all values
        are strings.

    Raises:
        SystemExit: raised by argparse when a required option is missing.
    '''
    # the usage line lists -v as well, because the option is required
    parser = argparse.ArgumentParser(description='''
Download GPCP daily and monthly data from the NOAA server
https://www.ncei.noaa.gov/data/global-precipitation-climatology-project-gpcp-{tstep}/access/
using requests to download file and BeautifulSoup to find links in webpage.
Usage: python gpcp.py -y <year> -v <version> -t <tstep> ''',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-y', '--year', type=str, required=True,
                        help="year to process")
    parser.add_argument('-v', '--version', type=str, required=True,
                        help="version to process")
    parser.add_argument('-t', '--tstep', default="daily", required=False,
                        help="timestep either monthly or daily, daily is default")
    return vars(parser.parse_args())
def download_file(url, fname):
    '''Download url to the local file fname using requests.

    The response is streamed to a temporary "<fname>.part" file which is
    renamed to fname only after the transfer has finished, so a failed
    download never leaves behind a file that looks complete.

    Args:
        url: address of the remote file
        fname: path of the local file to write

    Raises:
        requests.HTTPError: if the server answers with an error status
        requests.RequestException: on connection problems or timeout
        OSError: if the local file cannot be written
    '''
    partial = f"{fname}.part"
    try:
        # stream so the whole file is never held in memory; the timeout
        # stops the script hanging forever on an unresponsive server
        with requests.get(url, stream=True, timeout=120) as r:
            # without this check an HTML error page would be saved as data
            r.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial, fname)
    finally:
        # only still present if the download or the rename failed
        if os.path.exists(partial):
            os.remove(partial)
    return
def parse_dir(syr, url, data_dir):
    '''Parse main page for year and download new files.

    Find the link(s) whose text starts with "<syr>/" on the main page,
    open each of them and look for links to netcdf files named
    "gpcp_*.nc". A file is downloaded only if it does not exist locally
    yet; existing files are neither compared with the remote copy nor
    downloaded again.

    Args:
        syr: year to process, as a string
        url: address of the remote page listing the year directories
        data_dir: local directory, files are saved in <data_dir>/<year>/

    Raises:
        requests.HTTPError: if a listing page returns an error status
    '''
    # compile once, outside the loops; raw string so that "\." is a
    # regex escape and not an invalid string escape
    year_link = re.compile('^%s/' % syr)
    nc_link = re.compile(r'^gpcp_.*\.nc$')
    # drop trailing slashes to avoid "//" when joining the url pieces
    base = url.rstrip('/')

    r = requests.get(url, timeout=120)
    r.raise_for_status()
    main_page = BeautifulSoup(r.content, 'html.parser')
    for link in main_page.find_all('a', string=year_link):
        subdir = link.get('href')
        year_url = "/".join([base, subdir.strip('/')]) + "/"
        r2 = requests.get(year_url, timeout=120)
        r2.raise_for_status()
        year_page = BeautifulSoup(r2.content, 'html.parser')
        for flink in year_page.find_all('a', string=nc_link):
            href = flink.get('href')
            # the first 4 characters of the sub-directory link are the year
            local_name = "/".join([data_dir, subdir[:4], href])
            if not os.path.exists(local_name):
                print(local_name, 'new')
                download_file(year_url + href, local_name)
    return
def extra_files(yr, data_dir):
    '''Check if there is more than one file for a date in the year directory.

    File names are split on "_": the second last field is used as the
    date of the data and the last field as the creation stamp of the
    file. When several files share the same date, the one with the
    highest creation stamp stays in <data_dir>/<yr> and all the others
    are moved to <data_dir>/redundant (created if missing).

    Args:
        yr: year sub-directory to check
        data_dir: directory containing the year sub-directories

    Raises:
        OSError: if the year directory cannot be listed or a file cannot
            be moved
    '''
    year_dir = os.path.join(data_dir, str(yr))
    redundant_dir = os.path.join(data_dir, "redundant")

    # group the file names by date field
    by_date = {}
    for fname in sorted(os.listdir(year_dir)):
        fields = fname.split("_")
        if len(fields) < 2:
            # not a "<...>_<date>_<creation>" name, nothing to compare
            continue
        by_date.setdefault(fields[-2], []).append(fname)

    for fdate, names in by_date.items():
        if len(names) < 2:
            continue
        print(f"Found extra file for {fdate}")
        # oldest creation stamp first, the last one is the file to keep
        names.sort(key=lambda name: name.split("_")[-1])
        os.makedirs(redundant_dir, exist_ok=True)
        for old in names[:-1]:
            # os.listdir returns bare names, so build the full paths here
            os.rename(os.path.join(year_dir, old),
                      os.path.join(redundant_dir, old))
            print(f"Moved {old} to redundant directory")
    return
def main():
    '''Download the GPCP files of one year and set aside redundant ones.

    Reads year, timestep and version from the command line, works out
    the local data directory and the remote url for them, downloads the
    files which are not yet available locally and finally checks if
    any date ended up with more than one file.

    The root of the local collection is read from the AUSREFDIR
    environment variable, with /g/data/ia39/aus-ref-clim-data-nci as
    default.

    Raises:
        SystemExit: if the requested version has no url defined
    '''
    # read year, timestep and version as external arguments
    inputs = parse_input()
    yr = inputs['year']
    tstep = inputs['tstep']
    version = inputs['version']

    # define data_dir for local collection
    today = datetime.today().strftime('%Y-%m-%d')
    user = os.getenv("USER")
    root_dir = os.getenv("AUSREFDIR", "/g/data/ia39/aus-ref-clim-data-nci")
    # anything different from "daily" is treated as monthly
    if tstep == "daily":
        data_dir = f"{root_dir}/gpcp/data/day/{version}/tmp/"
        tstep2 = 'DAY'
    else:
        data_dir = f"{root_dir}/gpcp/data/mon/{version}/tmp/"
        tstep2 = 'MON'

    # choose url based on version
    url_ncei = ("https://www.ncei.noaa.gov/data/global-precipitation-"
                f"climatology-project-gpcp-{tstep}/access/")
    url_gesdisc = ("https://measures.gesdisc.eosdis.nasa.gov/data/GPCP/"
                   f"GPCP{tstep2}.3.2/")
    # set up to complete
    url_dict = {'v1-2': url_ncei, 'v2-3': url_ncei, 'v3-2': url_gesdisc}
    if version not in url_dict:
        sys.exit(f"Unknown version {version}, available versions are: "
                 + ", ".join(url_dict))
    url = url_dict[version]

    # make sure the year directory exists (parents included) and move there
    year_dir = os.path.join(data_dir, yr)
    os.makedirs(year_dir, exist_ok=True)
    os.chdir(year_dir)

    # download/update the selected year
    print(f"Updated on {today} by {user}")
    print(f"Downloading files for {yr}")
    parse_dir(yr, url, data_dir)

    # check if there are more than one file for each date
    print("Checking for redundant files")
    extra_files(yr, data_dir)
    print("Download is complete")
#new v3.2 daily: https://measures.gesdisc.eosdis.nasa.gov/data/GPCP/GPCPDAY.3.2/
#new v3.2 monthly: https://measures.gesdisc.eosdis.nasa.gov/data/GPCP/GPCPMON.3.2/2023/
# run the download only when the file is executed as a script
if __name__ == "__main__":
    main()