
Merge pull request #79 from mgckind/memory_input
Adding reading by chunks of memory
mgckind committed May 24, 2016
2 parents ac05171 + 6abdaac commit 11c641f
Showing 4 changed files with 385 additions and 39 deletions.
84 changes: 51 additions & 33 deletions easyaccess/easyaccess.py
@@ -1757,25 +1757,7 @@ def do_show_index(self, arg):
def complete_show_index(self, text, line, begidx, lastidx):
return self._complete_tables(text)

def get_filename(self, line):
# Good to move some of this into eautils.fileio
line = line.replace(';', '')
if line == "":
print('\nMust include table filename!\n')
return
if line.find('.') == -1:
print(colored('\nError in filename\n', "red"))
return

filename = "".join(line.split())
basename = os.path.basename(filename)
alls = basename.split('.')
if len(alls) > 2:
# Oracle tables cannot contain a '.'
print("\nDo not use extra '.' in filename\n")
return

return filename

def check_table_exists(self, table):
# check table first
@@ -1915,11 +1897,11 @@ def insert_data(self, table, columns, values, dtypes=None, niter = 0):
niter+1, len(values), len(columns), table.upper(), t2 - t1), "green"))


def do_load_table(self, line, name=None, chunksize=None):
def do_load_table(self, line, name=None, chunksize=None, memsize=None):
"""
DB:Loads a table from a file (csv or fits) taking name from filename and columns from header
Usage: load_table <filename> [--tablename NAME] [--chunksize CHUNK]
Usage: load_table <filename> [--tablename NAME] [--chunksize CHUNK] [--memsize MEMCHUNK]
Ex: example.csv has the following content
RA,DEC,MAG
1.23,0.13,23
@@ -1932,6 +1914,8 @@ def do_load_table(self, line, name=None, chunksize=None):
--tablename NAME given name for the table, default is taken from filename
--chunksize CHUNK Number of rows to be inserted at a time. Useful for large files
that do not fit in memory
--memsize MEMCHUNK The size in MB to be read in chunks. If both options are specified,
the smaller resulting number of rows is used (the stricter limit wins)
Note: - For csv or tab files, first line must have the column names (without # or any other comment) and same format
as data (using ',' or space)
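For instance, a hypothetical session combining both limits (file and table names are illustrative):

    DESDB ~> load_table example.csv --tablename mytable --chunksize 50000 --memsize 200

Each insert block then uses min(50000, the number of rows that fit in 200 MB), i.e. the stricter of the two limits.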
@@ -1943,7 +1927,9 @@ def do_load_table(self, line, name=None, chunksize=None):
load_parser.add_argument('filename', help='name for the file', action='store', default=None)
load_parser.add_argument('--tablename', help='name for the table', action='store', default=None)
load_parser.add_argument('--chunksize', help='number of rows to read in blocks to avoid memory '
'issues', action='store', type=int, default=None)
load_parser.add_argument('--memsize', help='size of the chunks to be read in MB',
action='store', type=int, default=None)
load_parser.add_argument('-h', '--help', help='print help', action='store_true')
try:
load_args = load_parser.parse_args(line.split())
@@ -1953,11 +1939,20 @@ def do_load_table(self, line, name=None, chunksize=None):
if load_args.help:
self.do_help('load_table')
return
filename = self.get_filename(load_args.filename)
filename = eafile.get_filename(load_args.filename)
table = load_args.tablename
chunk = load_args.chunksize
memchunk = load_args.memsize
if chunksize is not None:
chunk = chunksize
if memsize is not None:
memchunk = memsize
if memchunk is not None:
memchunk_rows = eafile.get_chunksize(filename, memory=memchunk)
if chunk is not None:
chunk = min(chunk, memchunk_rows)
else:
chunk = memchunk_rows
if filename is None: return
base, ext = os.path.splitext(os.path.basename(filename))

@@ -1978,7 +1973,7 @@ def do_load_table(self, line, name=None, chunksize=None):
return

try:
data, iterator = self.load_data(filename)
data, iterator = eafile.read_file(filename)
except:
print_exception()
return
@@ -2070,11 +2065,11 @@ def complete_load_table(self, text, line, start_idx, end_idx):
return _complete_path(line)


def do_append_table(self, line, name=None, chunksize=None):
def do_append_table(self, line, name=None, chunksize=None, memsize=None):
"""
DB:Appends a table from a file (csv or fits) taking name from filename and columns from header.
Usage: append_table <filename> [--tablename NAME] [--chunksize CHUNK]
Usage: append_table <filename> [--tablename NAME] [--chunksize CHUNK] [--memsize MEMCHUNK]
Ex: example.csv has the following content
RA,DEC,MAG
1.23,0.13,23
@@ -2087,7 +2082,9 @@ def do_append_table(self, line, name=None, chunksize=None):
--tablename NAME given name for the table, default is taken from filename
--chunksize CHUNK Number of rows to be inserted at a time. Useful for large files
that do not fit in memory
--memsize MEMCHUNK The size in MB to be read in chunks. If both options are specified,
the smaller resulting number of rows is used (the stricter limit wins)
Note: - For csv or tab files, first line must have the column names (without # or any other comment) and same format
as data (using ',' or space)
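As with load_table, both limits can be combined; an illustrative call (names are hypothetical):

    DESDB ~> append_table example.csv --tablename mytable --memsize 200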
@@ -2100,6 +2097,8 @@ def do_append_table(self, line, name=None, chunksize=None):
append_parser.add_argument('--tablename', help='name for the table to append to', action='store', default=None)
append_parser.add_argument('--chunksize', help='number of rows to read in blocks to avoid memory '
'issues', action='store', default=None, type=int)
append_parser.add_argument('--memsize', help='size of the chunks to be read in MB', action='store',
type=int, default=None)
append_parser.add_argument('-h', '--help', help='print help', action='store_true')
try:
append_args = append_parser.parse_args(line.split())
@@ -2109,11 +2108,21 @@ def do_append_table(self, line, name=None, chunksize=None):
if append_args.help:
self.do_help('append_table')
return
filename = self.get_filename(append_args.filename)
filename = eafile.get_filename(append_args.filename)
table = append_args.tablename
chunk = append_args.chunksize
memchunk = append_args.memsize
if chunksize is not None:
chunk = chunksize
if memsize is not None:
memchunk = memsize
if memchunk is not None:
memchunk_rows = eafile.get_chunksize(filename, memory=memchunk)
if chunk is not None:
chunk = min(chunk, memchunk_rows)
else:
chunk = memchunk_rows

if filename is None: return
base, ext = os.path.splitext(os.path.basename(filename))

@@ -2134,7 +2143,7 @@ def do_append_table(self, line, name=None, chunksize=None):
'\n DESDB ~> CREATE TABLE %s (COL1 TYPE1(SIZE), ..., COLN TYPEN(SIZE));\n' % table.upper())
return
try:
data, iterator = self.load_data(filename)
data, iterator = eafile.read_file(filename)
except:
print_exception()
return
@@ -2508,7 +2517,7 @@ def myquota(self):
"""
self.do_myquota('')

def load_table(self, table_file, name=None, chunksize=None):
def load_table(self, table_file, name=None, chunksize=None, memsize=None):
"""
Loads and creates a table in the DB. If name is not passed, it is taken from
the filename. Supported formats are 'fits', 'csv' and 'tab' files
@@ -2518,21 +2527,22 @@ def load_table(self, table_file, name=None, chunksize=None):
table_file : Filename to be uploaded as table (.csv, .fits, .tab)
name : Name of the table to be created
chunksize : Number of rows to upload at a time to avoid memory issues
memsize : Size in MB of each chunk to be read. If both chunksize and memsize are specified, the smaller resulting number of rows is used
Returns:
--------
True if successful, otherwise False
"""
try:
self.do_load_table(table_file, name=name, chunksize=chunksize)
self.do_load_table(table_file, name=name, chunksize=chunksize, memsize=memsize)
return True
except:
# exception
return False


def append_table(self, table_file, name=None, chunksize=None):
def append_table(self, table_file, name=None, chunksize=None, memsize=None):
"""
Appends data to a table in the DB. If name is not passed, it is taken from
the filename. Supported formats are 'fits', 'csv' and 'tab' files
@@ -2542,13 +2552,14 @@ def append_table(self, table_file, name=None, chunksize=None):
table_file : Filename to be uploaded as table (.csv, .fits, .tab)
name : Name of the table to be created
chunksize : Number of rows to upload at a time to avoid memory issues
memsize : Size in MB of each chunk to be read. If both chunksize and memsize are specified, the smaller resulting number of rows is used
Returns:
--------
True if successful, otherwise False
"""
try:
self.do_append_table(table_file, name=name, chunksize=chunksize)
self.do_append_table(table_file, name=name, chunksize=chunksize, memsize=memsize)
return True
except:
return False
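A minimal sketch of these wrappers in use, assuming an easyaccess connection object (the connect() helper and file names below are illustrative, not part of this diff):

    import easyaccess as ea
    conn = ea.connect()  # illustrative; any easyaccess connection instance works
    # create the table, reading the file in ~500 MB chunks
    ok = conn.load_table('example.csv', name='MY_TABLE', memsize=500)
    # append more rows; each block is capped at 50000 rows or 500 MB, whichever is smaller
    if ok:
        ok = conn.append_table('example2.csv', name='MY_TABLE', chunksize=50000, memsize=500)

Both wrappers swallow exceptions and report failure only through the returned boolean.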
@@ -2664,7 +2675,10 @@ def initial_message(quiet=False, clear=True):
or --append_table")
parser.add_argument("--chunksize", dest='chunksize', type=int, default = None,
help="Number of rows to be inserted at a time. Useful for large files \
that do not fit in memory. Use with --load_table")
that do not fit in memory. Use with --load_table or --append_table")
parser.add_argument("--memsize", dest='memsize', type=int, default = None,
help=" Size of chunk to be read at a time in Mb. Use with --load_table or "
"--append_table")
parser.add_argument("-s", "--db",dest='db', #choices=[...]?
help="Override database name [dessci,desoper,destest]")
parser.add_argument("-q", "--quiet", action="store_true", dest='quiet',
@@ -2785,6 +2799,8 @@ def colored(line, color): return line
linein += ' --tablename ' + args.tablename
if args.chunksize is not None:
linein += ' --chunksize ' + str(args.chunksize)
if args.memsize is not None:
linein += ' --memsize ' + str(args.memsize)
cmdinterp.onecmd(linein)
os._exit(0)
elif args.appendtable is not None:
Expand All @@ -2795,6 +2811,8 @@ def colored(line, color): return line
linein += ' --tablename ' + args.tablename
if args.chunksize is not None:
linein += ' --chunksize ' + str(args.chunksize)
if args.memsize is not None:
linein += ' --memsize ' + str(args.memsize)
cmdinterp.onecmd(linein)
os._exit(0)
else:
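In non-interactive use the new flag is forwarded exactly like --chunksize; an illustrative invocation, assuming the installed easyaccess entry point:

    easyaccess --load_table example.csv --tablename MY_TABLE --memsize 250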
62 changes: 62 additions & 0 deletions easyaccess/eautils/fileio.py
@@ -21,6 +21,11 @@
import eautils.dtypes as eatypes
import version

try:
from termcolor import colored
except:
def colored(line, color): return line

PANDAS_DEFS = ('comma separated text', 'space separated text', 'HDF5 format')
PANDAS_EXTS = ('.csv', '.tab', '.h5')

@@ -31,6 +36,63 @@
FILE_EXTS = PANDAS_EXTS + FITS_EXTS


def get_filename(line):
"""
Return filename after checking it has the right structure (no extra periods)
"""
line = line.replace(';', '')
if line == "":
print('\nMust include table filename!\n')
return
if line.find('.') == -1:
print(colored('\nError in filename\n', "red"))
return

filename = "".join(line.split())
basename = os.path.basename(filename)
alls = basename.split('.')
if len(alls) > 2:
# Oracle tables cannot contain a '.'
print("\nDo not use extra '.' in filename\n")
return

return filename
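A quick sketch of what this validation accepts and rejects (file names are hypothetical):

    get_filename('data.csv;')    # -> 'data.csv' (trailing ';' is stripped)
    get_filename('data')         # -> None, prints an error (no extension found)
    get_filename('my.data.csv')  # -> None, prints a warning (extra '.' not allowed)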


def get_chunksize(filename, memory=500):
"""
Get the approximate number of rows to be read given a memory constraint
Parameters:
-----------
filename : File name
memory : Memory in MB to compute the approximate number of rows
Returns:
--------
The number of rows to read per chunk so that each chunk fits within the given memory
"""
base, ext = os.path.splitext(filename)
check_filetype(ext, FILE_EXTS)

if ext in PANDAS_EXTS:
if ext == '.csv': sepa = ','
elif ext == '.tab': sepa = None
elif ext == '.h5':
    raise IOError('\nReading HDF5 files by chunks is not supported yet\n')
temp = pd.read_csv(filename, sep=sepa, nrows=100)
bytes_per_row = temp.memory_usage(index=True).sum()/100.
del temp
elif ext in FITS_EXTS:
temp = fitsio.FITS(filename)
temp_data = temp[1][0:100]
bytes_per_row = temp_data.nbytes/100.
temp.close()
del temp_data

return int(memory*1024**2/bytes_per_row)
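A worked sketch of the estimate (numbers are illustrative): the first 100 rows are sampled to get the bytes per row, and the row budget scales linearly with the requested memory.

    # suppose the 100 sampled rows of example.csv occupy 8,000 bytes -> 80 bytes per row
    # a 500 MB budget then allows int(500 * 1024**2 / 80) = 6,553,600 rows per chunk
    nrows = get_chunksize('example.csv', memory=500)
    # do_load_table/do_append_table take min(chunksize, nrows) when both limits are given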


def cutquery(query, length):
"""
Return query in a list of fixed sized character strings