Skip to content

Commit 1970444

Browse files
committed
Always override default NaNs when loading from CSV and include all defaults from pandas apart from NA
1 parent a996aff commit 1970444

File tree

2 files changed

+58
-14
lines changed

2 files changed

+58
-14
lines changed

huracanpy/_data/_csv.py

+36-2
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,35 @@
77

88
from .. import utils
99

10+
# All values recognised as NaN by pandas.read_csv, except "NA" which we want to load
11+
# normally because it is a basin, and added "" to interpret empty entries as NaN
12+
pandas_na_values = [
13+
" ",
14+
"#N/A",
15+
"#N/A N/A",
16+
"#NA",
17+
"-1.#IND",
18+
"-1.#QNAN",
19+
"-NaN",
20+
"-nan",
21+
"1.#IND",
22+
"1.#QNAN",
23+
"<NA>",
24+
"N/A",
25+
"NULL",
26+
"NaN",
27+
"None",
28+
"n/a",
29+
"nan",
30+
"null ",
31+
"",
32+
]
33+
1034

1135
def load(
1236
filename,
1337
load_function=pd.read_csv,
14-
read_csv_kws=dict(),
38+
**kwargs,
1539
):
1640
"""Load csv tracks data as an xarray.Dataset
1741
These tracks may come from TempestExtremes StitchNodes, or any other source.
@@ -24,13 +48,23 @@ def load(
2448
- time must be defined by a single `time` column or by four columns: year, month, day, hour
2549
- track ID must be within a column named track_id.
2650
51+
load_function : callable
52+
One of the load functions in pandas
53+
54+
**kwargs
55+
Remaining keywords are passed to `load_function` (:func:`pandas.read_csv` by default)
56+
2757
Returns
2858
-------
2959
xarray.Dataset
3060
"""
61+
# Update keywords with extra defaults for dealing with "NA" as basin not nan
62+
# Put kwargs second in this statement, so it can override defaults
63+
if load_function is pd.read_csv:
64+
kwargs = {**dict(na_values=pandas_na_values, keep_default_na=False), **kwargs}
3165

3266
## Read file
33-
tracks = load_function(filename, **read_csv_kws)
67+
tracks = load_function(filename, **kwargs)
3468
if (
3569
tracks.columns.str[0][1] == " "
3670
): # Sometimes column names are read starting with a space, which we remove

huracanpy/_data/_load.py

+22-12
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,10 @@ def load(
116116
* CSV file - :func:`pandas.read_csv`
117117
* parquet file - :func:`pandas.read_parquet`
118118
119+
For CSV files pandas interprets "NA" as `nan` by default, which is overridden in
120+
this function. To restore the pandas default behaviour set
121+
:code:`keep_default_na=True` and :code:`na_values=[]`
122+
119123
Returns
120124
-------
121125
xarray.Dataset
@@ -145,7 +149,7 @@ def load(
145149
calendar=track_calendar,
146150
)
147151
elif source.lower() in ["csv", "uz"]:
148-
data = _csv.load(filename)
152+
data = _csv.load(filename, **kwargs)
149153
elif source.lower() in ["te", "tempest", "tempestextremes"]:
150154
data = _tempestextremes.load(
151155
filename,
@@ -163,13 +167,13 @@ def load(
163167
filename = "ibtracs.csv"
164168

165169
with ibtracs.online(ibtracs_subset, filename, ibtracs_clean) as f:
166-
data = _csv.load(
167-
f,
168-
read_csv_kws=dict(
170+
# Put IBTrACS-specific arguments to read_csv second, so they
171+
# overwrite any arguments passed
172+
kwargs = {
173+
**kwargs,
174+
**dict(
169175
header=0,
170176
skiprows=[1],
171-
na_values=["", " "],
172-
keep_default_na=False,
173177
converters={
174178
"SID": str,
175179
"SEASON": int,
@@ -179,14 +183,20 @@ def load(
179183
"LAT": float,
180184
},
181185
),
186+
}
187+
return load(
188+
filename=f,
189+
source="csv",
190+
rename=rename,
191+
add_info=add_info,
192+
**kwargs,
182193
)
183194
else:
184-
data = _csv.load(
185-
ibtracs.offline(ibtracs_subset),
186-
read_csv_kws=dict(
187-
na_values=["", " "],
188-
keep_default_na=False,
189-
),
195+
return load(
196+
filename=ibtracs.offline(ibtracs_subset),
197+
rename=rename,
198+
add_info=add_info,
199+
**kwargs,
190200
)
191201
else:
192202
raise ValueError(f"Source {source} unsupported or misspelled")

0 commit comments

Comments
 (0)