From 71668d7f7789ed297e00610674f5acd350b966da Mon Sep 17 00:00:00 2001 From: Jan Jurgen Griesfeller Date: Thu, 24 Aug 2023 22:20:46 +0200 Subject: [PATCH] WIP: optimise RAM usage #2 --- pyaerocom/io/read_airnow.py | 45 ++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/pyaerocom/io/read_airnow.py b/pyaerocom/io/read_airnow.py index 6e9db1ab4..7138beb02 100644 --- a/pyaerocom/io/read_airnow.py +++ b/pyaerocom/io/read_airnow.py @@ -353,15 +353,34 @@ def _read_files(self, files, vars_to_retrieve): arrs = [] # 1 for pandas, 0 for Python read_flag = 1 + unique_stat_ids = None for i in tqdm(range(len(files))): fp = files[i] + # print(fp) if read_flag == 1: filedata = self._read_file(fp) for i, filevar in enumerate(file_vars_to_retrieve): - try: - arrs.append(filedata[filedata["variable"] == filevar].values) - except: - pass + # try: + arrs.append(filedata[filedata["variable"] == filevar].values) + if unique_stat_ids is None: + unique_stat_ids = np.unique( + (arrs[-1][:, self.FILE_COL_NAMES.index("station_id")]).astype(str) + ) + else: + try: + unique_stat_ids = np.union1d( + unique_stat_ids, + np.unique( + (arrs[-1][:, self.FILE_COL_NAMES.index("station_id")]).astype( + str + ) + ), + ) + except (ValueError, TypeError): + print(arrs[-1][:, self.FILE_COL_NAMES.index("station_id")]) + raise DataRetrievalError( + f"file {fp}: error in creating unique stationlist" + ) else: filedata = self.read_file(fp, vars_to_retrieve=vars_to_retrieve) for i, var in enumerate(vars_to_retrieve): @@ -383,9 +402,9 @@ def _read_files(self, files, vars_to_retrieve): # arrs.append(vardata) if len(arrs) == 0: raise DataRetrievalError("None of the input variables could be found in input list") - return self._filedata_to_statlist(arrs, vars_to_retrieve) + return self._filedata_to_statlist(arrs, vars_to_retrieve, unique_stat_ids=unique_stat_ids) - def _filedata_to_statlist(self, arrs, vars_to_retrieve): + def _filedata_to_statlist(self, arrs, vars_to_retrieve, unique_stat_ids=None): """ Convert loaded filedata into list of StationData objects @@ -426,11 +445,15 @@ def _filedata_to_statlist(self, arrs, vars_to_retrieve): subset = data[mask] dtime_subset = dtime[mask] # not all stations seems to provide the station id as string... - try: - statlist = np.unique(subset[:, statcol]) - except TypeError: - tmp_str = [subset[:, statcol][x] for x in range(len(subset[:, statcol]))] - statlist = np.unique(tmp_str) + if unique_stat_ids is None: + try: + statlist = np.unique((subset[:, statcol]).astype(str)) + except TypeError: + raise DataRetrievalError("error in creating an unique station list") + # tmp_str = [subset[:, statcol][x] for x in range(len(subset[:, statcol]))] + # statlist = np.unique(tmp_str) + else: + statlist = unique_stat_ids for stat_id in tqdm(statlist, desc=var): if not stat_id in stat_ids: continue