#####################################################################
# Utilities for file reading and writing, both NetCDF and binary
#####################################################################
import numpy as np
import sys
import os
import datetime
from .utils import days_per_month, real_dir, is_depth_dependent, average_12_months
from .constants import months_per_year, days_per_year
# Read a single variable from a NetCDF file. The default behaviour is to read and return the entire record (all time indices), but you can also select a subset of time indices, and/or time-average - see optional keyword arguments.
# Arguments:
# file_path: path to NetCDF file to read
# var_name: name of variable in NetCDF file
# Optional keyword arguments:
# time_index: integer (0-based) containing a time index. If set, the variable will be read for this specific time index, rather than for the entire record.
# t_start: integer (0-based) containing the time index to start reading at. Default is 0 (beginning of the record).
# t_end: integer (0-based) containing the time index to stop reading before (i.e. the first index not read, following python conventions). Default is the length of the record.
# time_average: boolean indicating to time-average the record before returning (will honour t_start and t_end if set, otherwise will average over the entire record). Default False.
# return_info: boolean indicating to also return the 'description' (or 'long_name') and 'units' attributes. Default False.
# return_minmax: boolean indicating to return the 'vmin' and 'vmax' attributes. Default False.
# Output: numpy array containing the variable
# Examples:
# Read the entire record:
# temp = read_netcdf('temp.nc', 'temp')
# Read just the first time index:
# temp = read_netcdf('temp.nc', 'temp', time_index=0)
# Read the first 12 time indices:
# temp = read_netcdf('temp.nc', 'temp', t_end=12)
# Read the entire record and time-average:
# temp = read_netcdf('temp.nc', 'temp', time_average=True)
# Read the last 12 time indices and time-average:
# temp = read_netcdf('temp.nc', 'temp', t_start=-12, time_average=True)
def read_netcdf (file_path, var_name, time_index=None, t_start=None, t_end=None, time_average=False, return_info=False, return_minmax=False):

    import netCDF4 as nc

    # Check for conflicting arguments
    if time_index is not None and time_average:
        print('Error (function read_netcdf): you selected a specific time index (time_index=' + str(time_index) + '), and also want time averaging (time_average=True). Choose one or the other.')
        sys.exit()

    # Open the file
    id = nc.Dataset(file_path, 'r')

    # Figure out if this variable is time-dependent. We consider this to be the case if the name of its first dimension clearly looks like a time variable (not case sensitive) or if its first dimension is unlimited.
    first_dim = id.variables[var_name].dimensions[0]
    if first_dim.upper() in ['T', 'TIME', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'SECOND', 'TIME_INDEX', 'DELTAT'] or id.dimensions[first_dim].isunlimited():
        # Time-dependent
        num_time = id.dimensions[first_dim].size
        # Check for 1D timeseries variables
        timeseries = len(id.variables[var_name].shape) == 1
        # Choose the range of time values to consider.
        # If t_start and/or t_end are already set, use those bounds.
        # Otherwise, start at the first time index and/or end at the last time index in the file.
        if t_start is None:
            t_start = 0
        if t_end is None:
            t_end = num_time
        # Now read the variable
        if time_index is not None:
            if timeseries:
                data = id.variables[var_name][time_index]
            else:
                data = id.variables[var_name][time_index,:]
        else:
            if timeseries:
                data = id.variables[var_name][t_start:t_end]
            else:
                data = id.variables[var_name][t_start:t_end,:]
        # Time-average if necessary
        if time_average:
            data = np.mean(data, axis=0)
    else:
        # Not time-dependent
        if time_index is not None or time_average or t_start is not None or t_end is not None:
            print('Error (function read_netcdf): you want to do something fancy with the time dimension of variable ' + var_name + ' in file ' + file_path + ', but this does not appear to be a time-dependent variable.')
            sys.exit()
        # Read the variable
        data = id.variables[var_name][:]

    # Remove any one-dimensional entries
    data = np.squeeze(data)

    # Read any requested attributes before closing the file
    if return_info:
        try:
            description = id.variables[var_name].description
        except AttributeError:
            description = id.variables[var_name].long_name
        units = id.variables[var_name].units
    if return_minmax:
        vmin = id.variables[var_name].vmin
        vmax = id.variables[var_name].vmax
    id.close()

    if return_info and return_minmax:
        return data, description, units, vmin, vmax
    elif return_info:
        return data, description, units
    elif return_minmax:
        return data, vmin, vmax
    else:
        return data

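# Read a variable along with its metadata, assuming the file defines these attributes (hypothetical file 'temp.nc'):
# temp, desc, units = read_netcdf('temp.nc', 'temp', return_info=True)
# temp, vmin, vmax = read_netcdf('temp.nc', 'temp', return_minmax=True)
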
# Read the time axis from a NetCDF file. The default behaviour is to read and return the entire axis as Date objects, but you can also select a subset of time indices, and/or return as scalars - see optional keyword arguments.
# Arguments:
# file_path: path to NetCDF file to read
# Optional keyword arguments:
# var_name: name of time axis. Default 'time'.
# t_start, t_end: as in function read_netcdf
# return_date: boolean indicating to return the time axis as Date objects (so you can easily get year, month, day as attributes). Default True. If False, will just return the axis as scalars.
# monthly: boolean indicating that the output is monthly averaged, so each record is stamped with the first day of the next month. If True (default), the function will subtract one month from each timestamp, so it falls at the beginning of the correct month.
# return_units: boolean indicating to also return the units and calendar strings of the time axis. Default False.
# Output: 1D numpy array containing the time values (either scalars or Date objects)
def netcdf_time (file_path, var_name='time', t_start=None, t_end=None, return_date=True, monthly=True, return_units=False):

    import netCDF4 as nc

    # Open the file and get the length of the record
    id = nc.Dataset(file_path, 'r')
    time_id = id.variables[var_name]
    units = time_id.units
    try:
        calendar = time_id.calendar
    except AttributeError:
        calendar = 'standard'
    num_time = time_id.size

    # Choose the range of time values to consider.
    # If t_start and/or t_end are already set, use those bounds.
    # Otherwise, start at the first time index and/or end at the last time index in the file.
    if t_start is None:
        t_start = 0
    if t_end is None:
        t_end = num_time

    # Read the variable
    if return_date:
        # Return as handy Date objects
        time = nc.num2date(time_id[t_start:t_end], units=units, calendar=calendar)
    else:
        # Return just as scalar values
        time = time_id[t_start:t_end]
    id.close()

    if return_date:
        # Want to convert to a datetime object
        if monthly:
            # Back up to the previous month
            for t in range(time.size):
                month = time[t].month-1
                year = time[t].year
                if month < 1:
                    month += 12
                    year -= 1
                time[t] = datetime.datetime(year, month, 1)
        else:
            for t in range(time.size):
                time[t] = datetime.datetime(time[t].year, time[t].month, time[t].day)

    if return_units:
        return time, units, calendar
    else:
        return time

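# Example: read the time axis of a hypothetical monthly-averaged file, with each timestamp shifted back to the start of the month it averages over:
# time = netcdf_time('output.nc', monthly=True)
# years = [t.year for t in time]
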
# Given two NetCDF files, figure out which one the given variable is in.
def find_variable (file_path_1, file_path_2, var_name):

    import netCDF4 as nc

    if var_name in nc.Dataset(file_path_1).variables:
        return file_path_1
    elif var_name in nc.Dataset(file_path_2).variables:
        return file_path_2
    else:
        print('Error (find_variable): variable ' + var_name + ' not in ' + file_path_1 + ' or ' + file_path_2)
        sys.exit()

# Given time parameters, make sure we will end up with a single record in time.
def check_single_time (time_index, time_average):
    if time_index is None and not time_average:
        print('Error (check_single_time): either specify time_index or set time_average=True.')
        sys.exit()

# Helper function for read_binary and write_binary. Given a precision (32 or 64) and endian-ness ('big' or 'little'), construct the numpy data type string.
def set_dtype (prec, endian):

    if endian == 'big':
        dtype = '>'
    elif endian == 'little':
        dtype = '<'
    else:
        print('Error (set_dtype): invalid endianness')
        sys.exit()
    if prec == 32:
        dtype += 'f4'
    elif prec == 64:
        dtype += 'f8'
    else:
        print('Error (set_dtype): invalid precision')
        sys.exit()
    return dtype

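# For example, 32-bit big-endian data corresponds to the numpy dtype string '>f4':
# dtype = set_dtype(32, 'big')  # returns '>f4'
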
# Read an array from a binary file and reshape to the correct dimensions. If it's an MITgcm array, use rdmds (built into MITgcmutils) instead.
# Arguments:
# filename: path to binary file
# grid_sizes: list of length 2 or 3 containing [nx, ny] or [nx, ny, nz] grid sizes
# dimensions: string containing dimension characters ('x', 'y', 'z', 't') in any order, e.g. 'xyt'
# Optional keyword arguments:
# prec: precision of data: 32 (default) or 64
# endian: endian-ness of data: 'big' (default) or 'little'
def read_binary (filename, grid_sizes, dimensions, prec=32, endian='big'):

    print('Reading ' + filename)

    dtype = set_dtype(prec, endian)

    # Extract grid sizes
    nx = grid_sizes[0]
    ny = grid_sizes[1]
    if len(grid_sizes) == 3:
        # It's a 3D grid
        nz = grid_sizes[2]
    elif 'z' in dimensions:
        print('Error (read_binary): ' + dimensions + ' is depth-dependent, but your grid sizes are 2D.')
        sys.exit()

    # Read data
    data = np.fromfile(filename, dtype=dtype)

    # Expected shape of data
    shape = []
    # Expected size of data, not counting time
    size0 = 1
    # Now update these initial values
    if 'x' in dimensions:
        shape = [nx] + shape
        size0 *= nx
    if 'y' in dimensions:
        shape = [ny] + shape
        size0 *= ny
    if 'z' in dimensions:
        shape = [nz] + shape
        size0 *= nz

    if 't' in dimensions:
        # Time-dependent field; figure out how many timesteps
        if np.mod(data.size, size0) != 0:
            print('Error (read_binary): incorrect dimensions or precision')
            sys.exit()
        num_time = data.size//size0
        shape = [num_time] + shape
    else:
        # Time-independent field; just do error checking
        if data.size != size0:
            print('Error (read_binary): incorrect dimensions or precision')
            sys.exit()

    # Reshape the data and return
    return np.reshape(data, shape)

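# Example: read a time-dependent 3D field from a hypothetical file 'theta.bin' on a 100x50x25 grid; the result has shape (num_time, 25, 50, 100):
# theta = read_binary('theta.bin', [100, 50, 25], 'xyzt')
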
# Write an array ("data"), of any dimension, to a binary file ("file_path"). Optional keyword arguments ("prec" and "endian") are as in function read_binary.
def write_binary (data, file_path, prec=32, endian='big'):

    print('Writing ' + file_path)

    if isinstance(data, np.ma.MaskedArray):
        # Need to remove the mask
        data = data.data
    dtype = set_dtype(prec, endian)
    # Make sure data is in the right precision
    data = data.astype(dtype)

    # Write to file (binary mode, since tofile writes raw bytes)
    id = open(file_path, 'wb')
    data.tofile(id)
    id.close()

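# Example: round-trip a 2D field through a hypothetical file 'data.bin' (nx, ny as above):
# write_binary(data, 'data.bin')
# data2 = read_binary('data.bin', [nx, ny], 'xy')
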
# NCfile object to simplify writing of NetCDF files.
class NCfile:

    # Initialisation arguments:
    # filename: name for desired NetCDF file
    # grid: Grid object
    # dimensions: string containing dimension characters in any order, eg 'xyz' or 'xyt'. Include all the dimensions (from x, y, z, t) that any of the variables in the file will need.
    def __init__ (self, filename, grid, dimensions, rho=None):

        import netCDF4 as nc

        # Open the file
        self.id = nc.Dataset(filename, 'w')

        # Set up the grid
        if 't' in dimensions:
            self.id.createDimension('time', None)
        if 'z' in dimensions:
            self.id.createDimension('Z', grid.nz)
            self.id.createVariable('Z', 'f8', ('Z'))
            self.id.variables['Z'].long_name = 'vertical coordinate of cell center'
            self.id.variables['Z'].units = 'm'
            self.id.variables['Z'][:] = grid.z
            self.id.createDimension('Zl', grid.nz)
            self.id.createVariable('Zl', 'f8', ('Zl'))
            self.id.variables['Zl'].long_name = 'vertical coordinate of upper cell interface'
            self.id.variables['Zl'].units = 'm'
        if 'r' in dimensions:
            if rho is None:
                print('Error (NCfile): need to specify rho axis')
                sys.exit()
            self.id.createDimension('rho', rho.size)
            self.id.createVariable('rho', 'f8', ('rho'))
            self.id.variables['rho'].long_name = 'potential density'
            self.id.variables['rho'].units = 'kg/m^3'
            self.id.variables['rho'][:] = rho
        if 'y' in dimensions:
            self.id.createDimension('Y', grid.ny)
            self.id.createVariable('Y', 'f8', ('Y'))
            self.id.variables['Y'].long_name = 'latitude at cell center'
            self.id.variables['Y'].units = 'degrees_north'
            self.id.variables['Y'][:] = grid.lat_1d
            self.id.createDimension('Yp1', grid.ny)
            self.id.createVariable('Yp1', 'f8', ('Yp1'))
            self.id.variables['Yp1'].long_name = 'latitude at SW corner'
            self.id.variables['Yp1'].units = 'degrees_north'
            self.id.variables['Yp1'][:] = grid.lat_corners_1d
        if 'x' in dimensions:
            self.id.createDimension('X', grid.nx)
            self.id.createVariable('X', 'f8', ('X'))
            self.id.variables['X'].long_name = 'longitude at cell center'
            self.id.variables['X'].units = 'degrees_east'
            self.id.variables['X'][:] = grid.lon_1d
            self.id.createDimension('Xp1', grid.nx)
            self.id.createVariable('Xp1', 'f8', ('Xp1'))
            self.id.variables['Xp1'].long_name = 'longitude at SW corner'
            self.id.variables['Xp1'].units = 'degrees_east'
            self.id.variables['Xp1'][:] = grid.lon_corners_1d
    # Create and write a variable.
    # Arguments:
    # var_name: desired name for variable
    # data: array of data for that variable
    # dimensions: as in initialisation
    # Optional keyword arguments:
    # gtype: as in function cell_boundaries (plus 'w' for w-grid)
    # long_name: descriptor for this variable
    # units: units for this variable
    # calendar: calendar attribute, for time variables
    # vmin, vmax: optional attributes
    # dtype: data type of variable (default 'f8' which is float)
    def add_variable (self, var_name, data, dimensions, gtype='t', long_name=None, units=None, calendar=None, vmin=None, vmax=None, dtype='f8'):

        # Sort out dimensions
        shape = []
        if 't' in dimensions:
            shape.append('time')
        if 'z' in dimensions:
            if gtype == 'w':
                shape.append('Zl')
            else:
                shape.append('Z')
        if 'r' in dimensions:
            shape.append('rho')
        if 'y' in dimensions:
            if gtype in ['v', 'psi']:
                shape.append('Yp1')
            else:
                shape.append('Y')
        if 'x' in dimensions:
            if gtype in ['u', 'psi']:
                shape.append('Xp1')
            else:
                shape.append('X')
        shape = tuple(shape)

        # Initialise the variable
        self.id.createVariable(var_name, dtype, shape)
        if long_name is not None:
            self.id.variables[var_name].long_name = long_name
        if units is not None:
            self.id.variables[var_name].units = units
        if calendar is not None:
            self.id.variables[var_name].calendar = calendar
        if vmin is not None:
            self.id.variables[var_name].vmin = vmin
        if vmax is not None:
            self.id.variables[var_name].vmax = vmax

        # Fill data
        self.id.variables[var_name][:] = data

    # Special case to simplify writing the time variable.
    # Argument:
    # time: time values (either numeric values or DateTime objects)
    # Optional keyword arguments:
    # units: units of time (eg 'seconds since 1979-01-01 00:00:00')
    # calendar: calendar to use when converting DateTime objects (default 'gregorian')
    def add_time (self, time, units=None, calendar='gregorian'):

        import netCDF4 as nc

        if isinstance(time[0], datetime.datetime):
            # These are DateTime objects
            if units is None:
                # Set some default units
                units = 'seconds since 1979-01-01 00:00:00'
            # Convert to numeric values
            time = nc.date2num(time, units, calendar=calendar)
        self.add_variable('time', time, 't', units=units, calendar=calendar)

    # Call this function when you're ready to close the file.
    def close (self):
        self.id.close()

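# Example usage of NCfile: write a time-dependent 3D temperature field, given a Grid object "grid" and arrays "time" and "temp" (hypothetical names):
# ncfile = NCfile('output.nc', grid, 'xyzt')
# ncfile.add_time(time)
# ncfile.add_variable('temp', temp, 'xyzt', long_name='potential temperature', units='degC')
# ncfile.close()
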
# Basic version of NCfile for a simple lat-lon file on any regular grid (eg intermediate domain generation steps, see make_domain.py).
class NCfile_basiclatlon:

    # Initialisation arguments:
    # filename: name for desired NetCDF file
    # lon, lat: 1D longitude and latitude arrays
    def __init__ (self, filename, lon, lat):

        import netCDF4 as nc

        self.id = nc.Dataset(filename, 'w')
        self.id.createDimension('lat', lat.size)
        self.id.createVariable('lat', 'f8', ('lat'))
        self.id.variables['lat'].long_name = 'latitude at cell centre'
        self.id.variables['lat'].units = 'degrees_north'
        self.id.variables['lat'][:] = lat
        self.id.createDimension('lon', lon.size)
        self.id.createVariable('lon', 'f8', ('lon'))
        self.id.variables['lon'].long_name = 'longitude at cell centre'
        self.id.variables['lon'].units = 'degrees_east'
        self.id.variables['lon'][:] = lon

    # Create and write a lat-lon variable.
    def add_variable (self, var_name, data, long_name=None, units=None, dtype='f8'):

        self.id.createVariable(var_name, dtype, ('lat', 'lon'))
        if long_name is not None:
            self.id.variables[var_name].long_name = long_name
        if units is not None:
            self.id.variables[var_name].units = units
        self.id.variables[var_name][:] = data

    # Call this function when you're ready to close the file.
    def close (self):
        self.id.close()

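# Example: save a 2D field on a regular lat-lon grid (hypothetical names):
# ncfile = NCfile_basiclatlon('bathy.nc', lon, lat)
# ncfile.add_variable('bathy', bathy, units='m')
# ncfile.close()
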
# Save a super-basic NetCDF file with one variable and no information about the axes. Must be xy with possible time and depth dimensions.
def write_netcdf_basic (data, var_name, filename, time_dependent=True, units=None):

    import netCDF4 as nc

    id = nc.Dataset(filename, 'w')
    depth_dependent = is_depth_dependent(data, time_dependent=time_dependent)
    if time_dependent:
        id.createDimension('time', None)
    if depth_dependent:
        id.createDimension('Z', data.shape[-3])
    id.createDimension('Y', data.shape[-2])
    id.createDimension('X', data.shape[-1])
    if time_dependent:
        id.createVariable('time', 'f8', ['time'])
        id.variables['time'][:] = np.arange(data.shape[0])+1
    # Build the dimension list to match the shape of the data
    dims = []
    if time_dependent:
        dims.append('time')
    if depth_dependent:
        dims.append('Z')
    dims += ['Y', 'X']
    id.createVariable(var_name, 'f8', dims)
    if units is not None:
        id.variables[var_name].units = units
    id.variables[var_name][:] = data
    id.close()

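# Example: quickly dump a (time, y, x) array to a hypothetical file for inspection:
# write_netcdf_basic(data, 'data', 'dump.nc')
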
# Save a very basic NetCDF file with one variable, as an error dump from the discard_and_fill function.
def write_netcdf_very_basic (data, var_name, filename, use_3d=False):

    import netCDF4 as nc

    id = nc.Dataset(filename, 'w')
    if use_3d:
        id.createDimension('Z', data.shape[-3])
        id.createDimension('Y', data.shape[-2])
        id.createDimension('X', data.shape[-1])
        id.createVariable(var_name, 'f8', ['Z', 'Y', 'X'])
    else:
        id.createDimension('Y', data.shape[-2])
        id.createDimension('X', data.shape[-1])
        id.createVariable(var_name, 'f8', ['Y', 'X'])
    id.variables[var_name][:] = data
    id.close()

# Given a list of output files (chronological, could concatenate to make the entire simulation) and a time index we want relative to the beginning of the simulation (0-indexed), find the individual file that time index falls within, and what that time index is relative to the beginning of that file.
def find_time_index (file_list, time_index):

    for file_path in file_list:
        num_time = netcdf_time(file_path).size
        if num_time > time_index:
            return file_path, time_index
        else:
            time_index -= num_time
    # If we're still here, we didn't find it
    print("Error (find_time_index): this simulation isn't long enough to contain time_index=" + str(time_index))
    sys.exit()

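# Example: locate the 50th record (0-indexed) across two hypothetical chronological files:
# file_path, t = find_time_index(['output_001.nc', 'output_002.nc'], 50)
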
# Given information about a CMIP6 dataset (path to model directory, ensemble member, experiment, variable, and time code eg 'day' or 'Omon'), return a list of the files containing this data, and the years covered by each file.
# This assumes there are 30-day months in the simulation (true for UKESM1).
# Input arguments:
# model_path: path to directory containing output from that model and the given MIP, eg '/badc/cmip6/data/CMIP6/CMIP/MOHC/UKESM1-0-LL/'
# expt: experiment name, eg 'piControl'
# ensemble_member: ensemble member name, eg 'r1i1p1f2'
# var: variable name, eg 'tas' or 'thetao'
# time_code: name of subdirectory that mentions the frequency and sometimes the domain. eg 'day' for daily atmospheric, 'Omon' for monthly ocean.
# Output:
# in_files: list of files containing this data, in chronological order
# start_years, end_years: lists of integers containing the starting and ending years (inclusive) of each file. The code will check to make sure there are no gaps.
def find_cmip6_files (model_path, expt, ensemble_member, var, time_code):

    # Construct the path to the directory containing all the data files, and make sure it exists
    in_dir = real_dir(model_path) + expt + '/' + ensemble_member + '/' + time_code + '/' + var + '/gn/latest/'
    if not os.path.isdir(in_dir):
        print('Error (find_cmip6_files): no such directory ' + in_dir)
        sys.exit()

    # Get the names of all the data files in this directory, in chronological order
    in_files = []
    for fname in os.listdir(in_dir):
        if fname.endswith('.nc'):
            in_files.append(in_dir+fname)
    in_files.sort()

    # Work out the start and end years for each file
    start_years = []
    end_years = []
    for file_path in in_files:
        # Dates encoded in file names
        if time_code.endswith('day'):
            start_date = file_path[-20:-12]
            end_date = file_path[-11:-3]
        elif time_code.endswith('mon'):
            start_date = file_path[-16:-10]
            end_date = file_path[-9:-3]
        start_year = start_date[:4]
        end_year = end_date[:4]
        # Make sure they are 30-day months and complete years
        if (time_code.endswith('day') and start_date[4:] != '0101') or (time_code.endswith('mon') and start_date[4:] != '01'):
            print('Error (find_cmip6_files): ' + file_path + ' does not start at the beginning of January')
            sys.exit()
        if (time_code.endswith('day') and end_date[4:] != '1230') or (time_code.endswith('mon') and end_date[4:] != '12'):
            print('Error (find_cmip6_files): ' + file_path + ' does not end at the end of December')
            sys.exit()
        # Save the start and end years
        start_years.append(int(start_year))
        end_years.append(int(end_year))

    # Now make sure there are no missing years
    for t in range(1, len(in_files)):
        if start_years[t] != end_years[t-1]+1:
            print('Error (find_cmip6_files): there are missing years in ' + in_dir)
            sys.exit()

    return in_files, start_years, end_years

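# Example: find all monthly ocean potential temperature files for the UKESM1-0-LL piControl run (paths as in the comments above):
# in_files, start_years, end_years = find_cmip6_files('/badc/cmip6/data/CMIP6/CMIP/MOHC/UKESM1-0-LL/', 'piControl', 'r1i1p1f2', 'thetao', 'Omon')
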
# Read a list of variables from the same NetCDF file. They all must have the same time index / averaging / etc.
def read_netcdf_list (file_path, var_list, time_index=None, t_start=None, t_end=None, time_average=False):

    data = []
    for var in var_list:
        data.append(read_netcdf(file_path, var, time_index=time_index, t_start=t_start, t_end=t_end, time_average=time_average))
    return data

# Read the long name and units of the given variable in the NetCDF file.
def read_title_units (file_path, var_name):

    import netCDF4 as nc

    id = nc.Dataset(file_path)
    return id.variables[var_name].long_name, id.variables[var_name].units

# Read sea ice production, which is the sum of 4 variables. An interface to read_netcdf which adds the four components.
def read_iceprod (file_path, time_index=None, t_start=None, t_end=None, time_average=False):

    data = None
    for var_name in ['SIdHbOCN', 'SIdHbATC', 'SIdHbATO', 'SIdHbFLO']:
        data_tmp = read_netcdf(file_path, var_name, time_index=time_index, t_start=t_start, t_end=t_end, time_average=time_average)
        if data is None:
            data = data_tmp
        else:
            data += data_tmp
    return data

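# Example: time-averaged sea ice production over the whole record of a hypothetical file:
# iceprod = read_iceprod('output.nc', time_average=True)
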
# Calculate annual averages of the given variable in the given (chronological) list of files.
def read_annual_average (var_name, file_paths, return_years=False):

    # Inner function to calculate the average and save to data_annual
    def update_data_annual (data_read, t0, year, data_annual, calendar):
        data_avg = average_12_months(data_read, t0, calendar=calendar, year=year)
        # Add a dummy time dimension
        data_avg = np.expand_dims(data_avg, axis=0)
        if data_annual is None:
            data_annual = data_avg
        else:
            data_annual = np.concatenate((data_annual, data_avg), axis=0)
        return data_annual

    # Now read all the data
    data_tmp = None
    data_annual = None
    years = []
    for f in file_paths:
        time, units, calendar = netcdf_time(f, return_units=True)
        data = read_netcdf(f, var_name)
        if time.size == 1:
            # Single time record in this file: add a time dimension
            data = np.expand_dims(data, axis=0)
        if data_tmp is not None:
            # There is a partial year from last time - complete it
            num_months = 12 - data_tmp.shape[0]
            if data.shape[0] < num_months:
                print('Error (read_annual_average): ' + f + ' has only ' + str(data.shape[0]) + ' time indices. This is too short. Concatenate it with the next one and re-run.')
                sys.exit()
            data_tmp2 = data[:num_months,...]
            data_year = np.concatenate((data_tmp, data_tmp2), axis=0)
            data_annual = update_data_annual(data_year, 0, time[0].year, data_annual, calendar)
            t_start = num_months
        else:
            # This file starts at the beginning of a year
            t_start = 0
        # Loop over complete years
        t = t_start - 12  # fallback so the check below still works if this file contains no complete year
        for t in range(t_start, (time.size-t_start)//12*12, 12):
            print(time[t].year)
            years.append(time[t].year)
            data_annual = update_data_annual(data, t, time[t].year, data_annual, calendar)
        if t+12 < time.size:
            # Read partial year from end
            data_tmp = data[t+12:,...]
            print(time[t+12].year)
            years.append(time[t+12].year)
        else:
            # Reset
            data_tmp = None

    if return_years:
        return data_annual, years
    else:
        return data_annual

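# Example: annual averages of SALT over two hypothetical consecutive output files:
# salt_annual, years = read_annual_average('SALT', ['output_0001.nc', 'output_0002.nc'], return_years=True)
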
# Generate the file name and starting/ending index for a CESM variable for the given experiment, year and ensemble member.
def find_cesm_file (expt, var_name, domain, freq, ens, year, base_dir='/data/oceans_input/raw_input_data/CESM/'):

    if expt not in ['LENS', 'PPACE', 'MENS', 'LW1.5', 'LW2.0']:
        print('Error (find_cesm_file): invalid experiment ' + expt)
        sys.exit()
    if expt in ['LW1.5', 'LW2.0']:
        expt_string = 'lowarm'
    else:
        expt_string = expt
    file_path = real_dir(base_dir) + '/' + expt_string + '/' + freq + '/' + var_name + '/'

    # Work out the case name and the years covered by the file containing this year
    if expt == 'LENS':
        if year < 2006:
            file_path += 'b.e11.B20TRC5CNBDRD.f09_g16.'
        else:
            file_path += 'b.e11.BRCP85C5CNBDRD.f09_g16.'
        if ens < 36:
            file_path += str(ens).zfill(3)
        else:
            file_path += str(ens-35+100).zfill(3)
        if year < 2006:
            if ens == 1:
                start_year = 1850
            else:
                start_year = 1920
            end_year = 2005
        elif year < 2081:
            start_year = 2006
            if ens < 34:
                end_year = 2080
            else:
                end_year = 2100
        else:
            if ens >= 34:
                start_year = 2006
            else:
                start_year = 2081
            end_year = 2100
    elif expt == 'PPACE':
        if year < 2006:
            file_path += 'b.e11.B20TRLENS.f09_g16.SST.restoring.ens'
        else:
            file_path += 'b.e11.BRCP85LENS.f09_g16.SST.restoring.ens'
        file_path += str(ens).zfill(2)
        if year < 2006:
            start_year = 1920
            end_year = 2005
        else:
            start_year = 2006
            end_year = 2013
    elif expt == 'MENS':
        file_path += 'b.e11.BRCP45C5CNBDRD.f09_g16.' + str(ens).zfill(3)
        start_year = 2006
        end_year = 2080
    elif expt == 'LW1.5':
        file_path += 'b.e11.BRCP26C5CNBDRD.f09_g16.1pt5degC.' + str(ens).zfill(3)
        start_year = 2006
        end_year = 2100
    elif expt == 'LW2.0':
        file_path += 'b.e11.BRCP26C5CNBDRD.f09_g16.2pt0degC.' + str(ens).zfill(3)
        start_year = 2006
        end_year = 2100

    # Add the model component and history stream
    if domain == 'atm':
        if freq == 'monthly':
            file_path += '.cam.h0.'
        elif freq == 'daily':
            file_path += '.cam.h1.'
    elif domain in ['oce', 'ocn']:
        file_path += '.pop.h.'
    elif domain == 'ice':
        file_path += '.cice.h.'
    file_path += var_name
    if domain == 'ice':
        file_path += '_sh'
    if freq == 'daily':
        file_path += '.' + str(start_year) + '0101-' + str(end_year) + '1231.nc'
        per_year = days_per_year
    elif freq == 'monthly':
        file_path += '.' + str(start_year) + '01-' + str(end_year) + '12.nc'
        per_year = months_per_year

    t0 = (year-start_year)*per_year
    if not os.path.isfile(file_path):
        print('Error (find_cesm_file): invalid file formulation for ' + var_name + ', ' + domain + ', ' + freq + ', ' + str(ens) + ', ' + str(year) + '. Maybe the file does not exist on the server?')
        sys.exit()
    return file_path, t0, t0+per_year

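# Example: locate monthly ocean temperature for LENS ensemble member 1 in 1990, assuming the archive is mounted at the default base_dir:
# file_path, t0, t1 = find_cesm_file('LENS', 'TEMP', 'ocn', 'monthly', 1, 1990)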