Skip to content

Commit

Permalink
just try a couple of file encodings before determining it
Browse files Browse the repository at this point in the history
  • Loading branch information
Jan Jurgen Griesfeller committed Aug 15, 2023
1 parent 477eb1a commit 88c4f53
Showing 1 changed file with 33 additions and 9 deletions.
42 changes: 33 additions & 9 deletions pyaerocom/io/read_airnow.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,15 +282,39 @@ def _read_file(self, file):
"""

# determine file encoding first and provide that to pandas
encoding = self.get_file_encoding(file)
df = pd.read_csv(
file,
sep=self.FILE_COL_DELIM,
names=self.FILE_COL_NAMES,
encoding=encoding,
on_bad_lines="skip",
)
# try utf_8 and cp863 reading first, then
# determine file encoding and provide that to pandas
# always determining the encoding is too slow given the number of files
# Airbase consists of;
# trying a couple of encodings first, instead of determining the encoding
# every time, speeds up reading by a factor of 5
try:
encoding = "utf_8"
df = pd.read_csv(
file,
sep=self.FILE_COL_DELIM,
names=self.FILE_COL_NAMES,
encoding=encoding,
on_bad_lines="skip",
)
except UnicodeDecodeError:
encoding = "cp863"
df = pd.read_csv(
file,
sep=self.FILE_COL_DELIM,
names=self.FILE_COL_NAMES,
encoding=encoding,
on_bad_lines="skip",
)
except:
encoding = self.get_file_encoding(file)
df = pd.read_csv(
file,
sep=self.FILE_COL_DELIM,
names=self.FILE_COL_NAMES,
encoding=encoding,
on_bad_lines="skip",
)
return df

def _read_files(self, files, vars_to_retrieve):
Expand Down

0 comments on commit 88c4f53

Please sign in to comment.