Skip to content

Commit

Permalink
Consistent setup of MXNet contexts. Added warning if CUDA_VISIBLE_DEVICES is set (#468)
Browse files Browse the repository at this point in the history

Intermediate solution as proposed in #391
  • Loading branch information
fhieber authored Jul 11, 2018
1 parent da02cbd commit aa4c736
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 102 deletions.
12 changes: 8 additions & 4 deletions sockeye/image_captioning/captioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,8 @@
batching, read_list_file, extract_features_forward
from ..lexicon import TopKLexicon
from ..log import setup_main_logger
from ..translate import read_and_translate, _setup_context
from ..utils import check_condition
from ..utils import log_basic_info
from ..translate import read_and_translate
from ..utils import check_condition, log_basic_info, determine_context

logger = setup_main_logger(__name__, file_logging=False)

Expand Down Expand Up @@ -129,7 +128,12 @@ def main():
args.sure_align_threshold)

with ExitStack() as exit_stack:
context = _setup_context(args, exit_stack)
context = determine_context(device_ids=args.device_ids,
use_cpu=args.use_cpu,
disable_device_locking=args.disable_device_locking,
lock_dir=args.lock_dir,
exit_stack=exit_stack)[0]
logger.info("Captioning Device: %s", context)

if not image_preextracted_features:
# Extract features and override input and source_root with tmp location of features
Expand Down
9 changes: 7 additions & 2 deletions sockeye/image_captioning/extract_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from . import utils
from .. import constants as C
from ..log import setup_main_logger
from ..translate import _setup_context
from ..utils import check_condition, determine_context

# Temporary logger, the real one (logging to a file probably, will be created
# in the main function)
Expand Down Expand Up @@ -114,7 +114,12 @@ def main():

# Get pretrained net module (already bind)
with ExitStack() as exit_stack:
context = _setup_context(args, exit_stack)
check_condition(len(args.device_ids) == 1, "extract_features only supports single device for now")
context = determine_context(device_ids=args.device_ids,
use_cpu=args.use_cpu,
disable_device_locking=args.disable_device_locking,
lock_dir=args.lock_dir,
exit_stack=exit_stack)[0]
module, _ = get_pretrained_net(args, context)

# Extract features
Expand Down
43 changes: 21 additions & 22 deletions sockeye/image_captioning/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,26 @@
import mxnet as mx
import numpy as np

from ..config import Config
from ..log import setup_main_logger
from ..train import check_resume, check_arg_compatibility, \
determine_context, create_decoder_config, \
create_optimizer_config, create_training_model
from ..utils import check_condition
# Sockeye captioner
from . import arguments as arguments_image
from . import checkpoint_decoder
from . import data_io as data_io_image
from . import encoder as encoder_image
from .. import constants as C
# Sockeye
from .. import arguments
from .. import constants as C
from .. import data_io
from .. import encoder
from .. import loss
from .. import model
from .. import training
from .. import utils
from .. import vocab
from ..config import Config
from ..log import setup_main_logger
from ..train import check_resume, check_arg_compatibility, create_decoder_config, \
create_optimizer_config, create_training_model
from ..utils import check_condition

# Temporary logger, the real one (logging to a file probably, will be created in the main function)
logger = setup_main_logger(__name__, file_logging=False, console=True)
Expand Down Expand Up @@ -81,20 +80,11 @@ def create_checkpoint_decoder(args: argparse.Namespace,
if args.use_cpu or args.decode_and_evaluate_use_cpu:
context = mx.cpu()
elif args.decode_and_evaluate_device_id is not None:
# decode device is defined from the commandline
num_gpus = utils.get_num_gpus()
check_condition(num_gpus >= 1,
"No GPUs found, consider running on the CPU with --use-cpu "
"(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
"binary isn't on the path).")

if args.disable_device_locking:
context = utils.expand_requested_device_ids([args.decode_and_evaluate_device_id])
else:
context = exit_stack.enter_context(utils.acquire_gpus([args.decode_and_evaluate_device_id],
lock_dir=args.lock_dir))
context = mx.gpu(context[0])

context = utils.determine_context(device_ids=args.decode_and_evaluate_device_id,
use_cpu=False,
disable_device_locking=args.disable_device_locking,
lock_dir=args.lock_dir,
exit_stack=exit_stack)[0]
else:
# default decode context is the last training device
context = train_context[-1]
Expand Down Expand Up @@ -297,7 +287,16 @@ def main():
max_seq_len_source, max_seq_len_target)

with ExitStack() as exit_stack:
context = determine_context(args, exit_stack)
context = utils.determine_context(device_ids=args.device_ids,
use_cpu=args.use_cpu,
disable_device_locking=args.disable_device_locking,
lock_dir=args.lock_dir,
exit_stack=exit_stack)
if args.batch_type == C.BATCH_TYPE_SENTENCE:
check_condition(args.batch_size % len(context) == 0, "When using multiple devices the batch size must be "
"divisible by the number of devices. Choose a batch "
"size that is a multiple of %d." % len(context))
logger.info("Training Device(s): %s", ", ".join(str(c) for c in context))

# Read feature size
if args.image_preextracted_features:
Expand Down
60 changes: 15 additions & 45 deletions sockeye/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,36 +155,6 @@ def check_resume(args: argparse.Namespace, output_folder: str) -> bool:
return resume_training


def determine_context(args: argparse.Namespace, exit_stack: ExitStack) -> List[mx.Context]:
    """
    Determine the context(s) to train on: the CPU, or one or more GPUs.

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib; acquired GPU locks are
            released when it unwinds.
    :return: A list with the context(s) to run on.
    """
    # CPU training needs no device discovery or locking.
    if args.use_cpu:
        logger.info("Training Device: CPU")
        return [mx.cpu()]

    check_condition(utils.get_num_gpus() >= 1,
                    "No GPUs found, consider running on the CPU with --use-cpu "
                    "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                    "binary isn't on the path).")

    # Resolve the requested device ids, optionally holding lock files so that
    # concurrent Sockeye processes do not grab the same GPUs.
    if args.disable_device_locking:
        gpu_ids = utils.expand_requested_device_ids(args.device_ids)
    else:
        gpu_ids = exit_stack.enter_context(utils.acquire_gpus(args.device_ids, lock_dir=args.lock_dir))

    # Sentence-based batches are sharded across devices, so the batch size must
    # split evenly over them.
    if args.batch_type == C.BATCH_TYPE_SENTENCE:
        check_condition(args.batch_size % len(gpu_ids) == 0, "When using multiple devices the batch size must be "
                                                            "divisible by the number of devices. Choose a batch "
                                                            "size that is a multiple of %d." % len(gpu_ids))
    logger.info("Training Device(s): GPU %s", gpu_ids)
    return [mx.gpu(gpu_id) for gpu_id in gpu_ids]


def create_checkpoint_decoder(args: argparse.Namespace,
exit_stack: ExitStack,
train_context: List[mx.Context]) -> Optional[checkpoint_decoder.CheckpointDecoder]:
Expand All @@ -209,20 +179,11 @@ def create_checkpoint_decoder(args: argparse.Namespace,
if args.use_cpu or args.decode_and_evaluate_use_cpu:
context = mx.cpu()
elif args.decode_and_evaluate_device_id is not None:
# decode device is defined from the commandline
num_gpus = utils.get_num_gpus()
check_condition(num_gpus >= 1,
"No GPUs found, consider running on the CPU with --use-cpu "
"(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
"binary isn't on the path).")

if args.disable_device_locking:
context = utils.expand_requested_device_ids([args.decode_and_evaluate_device_id])
else:
context = exit_stack.enter_context(utils.acquire_gpus([args.decode_and_evaluate_device_id],
lock_dir=args.lock_dir))
context = mx.gpu(context[0])

context = utils.determine_context(device_ids=args.decode_and_evaluate_device_id,
use_cpu=False,
disable_device_locking=args.disable_device_locking,
lock_dir=args.lock_dir,
exit_stack=exit_stack)[0]
else:
# default decode context is the last training device
context = train_context[-1]
Expand Down Expand Up @@ -827,7 +788,16 @@ def train(args: argparse.Namespace):
max_seq_len_source, max_seq_len_target)

with ExitStack() as exit_stack:
context = determine_context(args, exit_stack)
context = utils.determine_context(device_ids=args.device_ids,
use_cpu=args.use_cpu,
disable_device_locking=args.disable_device_locking,
lock_dir=args.lock_dir,
exit_stack=exit_stack)
if args.batch_type == C.BATCH_TYPE_SENTENCE:
check_condition(args.batch_size % len(context) == 0, "When using multiple devices the batch size must be "
"divisible by the number of devices. Choose a batch "
"size that is a multiple of %d." % len(context))
logger.info("Training Device(s): %s", ", ".join(str(c) for c in context))

train_iter, eval_iter, config_data, source_vocabs, target_vocab = create_data_iters_and_vocabs(
args=args,
Expand Down
37 changes: 9 additions & 28 deletions sockeye/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,13 @@
import sys
import time
from contextlib import ExitStack
from typing import Generator, Optional, List

import mxnet as mx
from math import ceil
from typing import Generator, Optional, List

from sockeye.lexicon import TopKLexicon
from sockeye.log import setup_main_logger
from sockeye.output_handler import get_output_handler, OutputHandler
from sockeye.utils import acquire_gpus, get_num_gpus, log_basic_info, check_condition, grouper
from sockeye.utils import determine_context, log_basic_info, check_condition, grouper
from . import arguments
from . import constants as C
from . import data_io
Expand Down Expand Up @@ -65,7 +63,13 @@ def run_translate(args: argparse.Namespace):
args.sure_align_threshold)

with ExitStack() as exit_stack:
context = _setup_context(args, exit_stack)
check_condition(len(args.device_ids) == 1, "translate only supports single device for now")
context = determine_context(device_ids=args.device_ids,
use_cpu=args.use_cpu,
disable_device_locking=args.disable_device_locking,
lock_dir=args.lock_dir,
exit_stack=exit_stack)[0]
logger.info("Translate Device: %s", context)

if args.override_dtype == C.DTYPE_FP16:
logger.warning('Experimental feature \'--override-dtype float16\' has been used. '
Expand Down Expand Up @@ -217,28 +221,5 @@ def translate(output_handler: OutputHandler,
return total_time


def _setup_context(args, exit_stack):
    """
    Return a single context (CPU or exactly one GPU) for translation.

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib; a GPU lock acquired here
            is released when it unwinds.
    :return: An mx.Context to run on.
    """
    # CPU requested: nothing to discover or lock.
    if args.use_cpu:
        return mx.cpu()

    check_condition(get_num_gpus() >= 1,
                    "No GPUs found, consider running on the CPU with --use-cpu "
                    "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                    "binary isn't on the path).")
    check_condition(len(args.device_ids) == 1, "cannot run on multiple devices for now")

    gpu_id = args.device_ids[0]
    if args.disable_device_locking:
        # Without locking, a negative (wildcard) device id just means the first device.
        if gpu_id < 0:
            gpu_id = 0
    else:
        acquired_ids = exit_stack.enter_context(acquire_gpus([gpu_id], lock_dir=args.lock_dir))
        gpu_id = acquired_ids[0]
    return mx.gpu(gpu_id)


if __name__ == '__main__':
main()
35 changes: 34 additions & 1 deletion sockeye/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,37 @@ def log_gpu_memory_usage(memory_data: Dict[int, Tuple[int, int]]):
logger.info(log_str)


def determine_context(device_ids: List[int],
                      use_cpu: bool,
                      disable_device_locking: bool,
                      lock_dir: str,
                      exit_stack: ExitStack) -> List[mx.Context]:
    """
    Determine the MXNet context to run on (CPU or GPU).

    :param device_ids: List of device ids as defined from the CLI, or a single
            device id (normalized to a one-element list).
    :param use_cpu: Whether to use the CPU instead of GPU(s).
    :param disable_device_locking: Disable Sockeye's device locking feature.
    :param lock_dir: Directory to place device lock files in.
    :param exit_stack: An ExitStack from contextlib; GPU locks acquired here are
            released when it unwinds.
    :return: A list with the context(s) to run on.
    """
    if use_cpu:
        return [mx.cpu()]
    # Some call sites pass a bare int (e.g. --decode-and-evaluate-device-id)
    # rather than a list; expand_requested_device_ids/acquire_gpus require a
    # list, so normalize here to keep both styles working.
    if isinstance(device_ids, int):
        device_ids = [device_ids]
    num_gpus = get_num_gpus()
    check_condition(num_gpus >= 1,
                    "No GPUs found, consider running on the CPU with --use-cpu "
                    "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                    "binary isn't on the path).")
    if disable_device_locking:
        context = expand_requested_device_ids(device_ids)
    else:
        # Locks prevent concurrent Sockeye processes from grabbing the same GPUs.
        context = exit_stack.enter_context(acquire_gpus(device_ids, lock_dir=lock_dir))
    return [mx.gpu(gpu_id) for gpu_id in context]


def expand_requested_device_ids(requested_device_ids: List[int]) -> List[int]:
"""
Transform a list of device id requests to concrete device ids. For example on a host with 8 GPUs when requesting
Expand All @@ -493,6 +524,8 @@ def expand_requested_device_ids(requested_device_ids: List[int]) -> List[int]:
:return: A list of device ids.
"""
num_gpus_available = get_num_gpus()
if "CUDA_VISIBLE_DEVICES" in os.environ:
logger.warning("Sockeye currently does not respect CUDA_VISIBLE_DEVICE settings when locking GPU devices.")
return _expand_requested_device_ids(requested_device_ids, num_gpus_available)


Expand All @@ -519,7 +552,7 @@ def _expand_requested_device_ids(requested_device_ids: List[int], num_gpus_avail
@contextmanager
def acquire_gpus(requested_device_ids: List[int], lock_dir: str = "/tmp",
retry_wait_min: int = 10, retry_wait_rand: int = 60,
num_gpus_available: Optional[int]=None):
num_gpus_available: Optional[int] = None):
"""
Acquire a number of GPUs in a transactional way. This method should be used inside a `with` statement.
Will try to acquire all the requested number of GPUs. If currently
Expand Down

0 comments on commit aa4c736

Please sign in to comment.