From 20efe276e2ed859ccd8db8cbe80670cbe4073c1d Mon Sep 17 00:00:00 2001 From: Mike Ranzinger Date: Mon, 26 Oct 2020 13:32:43 -0700 Subject: [PATCH 01/14] Adding support for sampling from distributions for random hyperopt --- README.md | 50 ++++++++++------- examples/mnist.yml | 12 ++++- runx/distributions.py | 122 ++++++++++++++++++++++++++++++++++++++++++ runx/runx.py | 51 +++++++++++++++--- runx/utils.py | 2 +- 5 files changed, 207 insertions(+), 30 deletions(-) create mode 100644 runx/distributions.py diff --git a/README.md b/README.md index 1289121..bcae11c 100755 --- a/README.md +++ b/README.md @@ -8,17 +8,26 @@ * unique, per-run, directory creation # Table of Contents -- [Quick-start Installation](#quick-start-installation) -- [Introduction: a simple example](#introduction--a-simple-example) -- [runx Architecture](#runx-architecture) -- [Create a project-specific configuration file](#create-a-project-specific-configuration-file) -- [Run directory, logfiles](#run-directory--logfiles) -- [Experiment yaml details](#experiment-yaml-details) - * [Booleans](#booleans) - * [Lists, Inheritance](#lists--inheritance) -- [logx - logging, tensorboarding, checkpointing](#logx---logging--tensorboarding--checkpointing) -- [sumx - summarizing your runs](#sumx---summarizing-your-runs) -- [NGC Support](#ngc-support) +- [runx - An experiment management tool](#runx---an-experiment-management-tool) +- [Table of Contents](#table-of-contents) + - [Quick-start Installation](#quick-start-installation) + - [Introduction example](#introduction-example) + - [runx is especially useful to launch batch jobs to a farm.](#runx-is-especially-useful-to-launch-batch-jobs-to-a-farm) + - [Unique run directories](#unique-run-directories) + - [Summarization with sumx](#summarization-with-sumx) + - [runx Architecture](#runx-architecture) + - [Create a project-specific configuration file](#create-a-project-specific-configuration-file) + - [Run directory, logfiles](#run-directory-logfiles) + - [Staging of code](#staging-of-code) + - [Experiment yaml details](#experiment-yaml-details) + - [Special variables](#special-variables) + - [HPARAMS](#hparams) + - [A simple example of HPARAMS is:](#a-simple-example-of-hparams-is) + - [Booleans](#booleans) + - [Lists, Inheritance](#lists-inheritance) + - [logx - logging, tensorboarding, checkpointing](#logx---logging-tensorboarding-checkpointing) + - [sumx - summarizing your runs](#sumx---summarizing-your-runs) + - [NGC Support](#ngc-support) ## Quick-start Installation @@ -59,7 +68,7 @@ Now you can run the sweep with runx: python train.py --lr 0.01 --solver sgd python train.py --lr 0.01 --solver adam python train.py --lr 0.02 --solver sgd -python train.py --lr 0.02 --solver adam +python train.py --lr 0.02 --solver adam ``` You can see that runx automatically computes the cross product of all hyperparameters, which in this case results in 4 runs. It then builds commandlines by concatenating the hyperparameters with the training command. @@ -68,7 +77,7 @@ A few useful runx options: -n don't run, just print the command -i interactive mode (as opposed to submitting jobs to a farm) ``` -### runx is especially useful to launch batch jobs to a farm. +### runx is especially useful to launch batch jobs to a farm. Farm support is simple. First create a .runx file that configures the farm: ```yaml @@ -83,8 +92,11 @@ bigfarm: mem: 128 ``` **LOGROOT**: this is where the output of runs should go + **FARM**: you can define multiple farm targets. 
This selects which one to use + **SUBMIT_CMD**: the script you use to launch jobs to a farm + **RESOURCES**: the arguments to present to SUBMIT_CMD Now when you run runx, it will generate commands that will attempt to launch jobs to a farm using your SUBMIT_CMD. Notice that we left out the `-i` cmdline arg because now we want to submit jobs and not run them interactively. @@ -99,7 +111,7 @@ submit_job --gpu 2 --cpu 16 --mem 128 -c "python train.py --lr 0.02 --solver ada ``` ### Unique run directories -We want the results for each training run to go into a unique output/log directory. We don't want things like tensorboard files or logfiles to write over each other. `runx` solves this problem by automatically generating a unique output directory per run. +We want the results for each training run to go into a unique output/log directory. We don't want things like tensorboard files or logfiles to write over each other. `runx` solves this problem by automatically generating a unique output directory per run. You have access to this unique directory name within your experiment yaml via the special variable: `LOGDIR`. Your training script may use this path and write its output there. @@ -225,7 +237,7 @@ If you include the RUNX.TAG field in your experiment yaml or if you supply the - `runx` actually makes a copy of your code within each run's log directory. This is done for a number of reasons: * If you wish to continue modifying your code, while a training run is going on, you may do so without worry whether it will affect the running job(s) * In case your job dies and you must restart it, the code and training environment is self-contained within the logdir of a run. -* This is also useful for documentation purposes: in case you ever want to know exactly the state of the code for a given run. +* This is also useful for documentation purposes: in case you ever want to know exactly the state of the code for a given run. ## Experiment yaml details @@ -233,7 +245,7 @@ If you include the RUNX.TAG field in your experiment yaml or if you supply the - ### Special variables **CMD** - Your base training command. You typically don't include any args here. **HPARAMS** - All hyperparmeters. This is a datastructure that may either be a simple dict of params or may be a list of dicts. Furthermore, each hyperparameter may be a scalar or list or boolean. -**PYTHONPATH** - This is field optional. For the purpose of altering the default PYTHONPATH which is simply LOGDIR/code. Can be a colon-separated list of paths. May include LOGDIR special variable. +**PYTHONPATH** - This is field optional. For the purpose of altering the default PYTHONPATH which is simply LOGDIR/code. Can be a colon-separated list of paths. May include LOGDIR special variable. ### HPARAMS @@ -249,7 +261,7 @@ HPARAMS: epochs: 10 RUNX.TAG: 'alexnet' ``` -Here, there will be 2 runs that will be created. +Here, there will be 2 runs that will be created. ### Booleans If you want to specify that a boolean flag should be on or off, this is done using `true` and `false` keywords: @@ -302,7 +314,7 @@ HPARAMS: [ } ] ``` -You might observe that hparams is now a list of two dicts. +You might observe that hparams is now a list of two dicts. The nice thing is that runx assumes inheritance from the first item in the list to all remaining dicts, so that you don't have to re-type all the redundant hyperparms. 
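The inheritance rule is easy to sketch. The helper below is illustrative only (written for this README, not taken from runx's source): every dict after the first starts from a copy of the first dict, then overrides whatever keys it redefines.
```python
import copy

def expand_inherited(hparam_dicts):
    """Expand a list of hparam dicts, inheriting missing keys from the first."""
    base = hparam_dicts[0]
    expanded = [base]
    for override in hparam_dicts[1:]:
        merged = copy.deepcopy(base)
        merged.update(override)  # keys in later dicts win over inherited ones
        expanded.append(merged)
    return expanded
```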
When you pass this yaml to runx, you'll get the following out: @@ -347,7 +359,7 @@ Then, substitute the following logx calls into your code: | writer.add_image() | logx.add_image() | tensorboard image writes | | | logx.save_model() | save latest/best models | -Finally, in order for sumx to be able to read the results of your run, you have to push your metrics to logx. You should definitely push the 'val' metrics, but can push 'train' metrics if you like (sumx doesn't consume them at the moment). +Finally, in order for sumx to be able to read the results of your run, you have to push your metrics to logx. You should definitely push the 'val' metrics, but can push 'train' metrics if you like (sumx doesn't consume them at the moment). ```python # define which metrics to record diff --git a/examples/mnist.yml b/examples/mnist.yml index e9fd690..94ecdc9 100755 --- a/examples/mnist.yml +++ b/examples/mnist.yml @@ -1,6 +1,14 @@ CMD: "python mnist.py" +NUM_TRIALS: 10 HPARAMS: - lr: [0.01, 0.02] - momentum: 0.5 + lr: + distribution: log_uniform + low: 0.0001 + high: 0.1 + base: 10 + momentum: + distribution: uniform + low: 0.5 + high: 0.999 logdir: LOGDIR diff --git a/runx/distributions.py b/runx/distributions.py new file mode 100644 index 0000000..3b37f42 --- /dev/null +++ b/runx/distributions.py @@ -0,0 +1,122 @@ +import copy +import itertools +import math +import random +from typing import List, Union, Any, Iterable, Dict + + +Primitive = Union[int, float, str] + + +class NotSupportedException(Exception): + pass + + +class BaseDistribution: + def sample(self) -> Primitive: + """Returns a single sample from the distribution.""" + raise ValueError("Not implemented!") + + @property + def is_discrete(self) -> bool: + """Returns whether the set of possible values can be enumerated.""" + return False + + def enumerate(self) -> Iterable[Primitive]: + raise NotSupportedException(f'The distribution "{type(self)}" is not discrete!') + + +class CategoricalDistribution(BaseDistribution): + def __init__(self, categories: List[Primitive]): + self.categories = categories + + def sample(self): + return random.choice(self.categories) + + @property + def is_discrete(self): + return True + + def enumerate(self): + return self.categories + + +class UniformDistribution(BaseDistribution): + def __init__(self, low: float, high: float): + self.low = low + self.high = high + + def sample(self): + return random.uniform(self.low, self.high) + + +class NormalDistribution(BaseDistribution): + def __init__(self, mean: float=0, std: float=1): + self.mean = mean + self.std = std + + def sample(self): + return random.normalvariate(self.mean, self.std) + + +class LogUniformDistribution(BaseDistribution): + def __init__(self, low: float, high: float, base: float=None): + self.low = low + self.high = high + self.base = base + + def sample(self): + log_low = math.log(self.low, self.base) + log_high = math.log(self.high, self.base) + + v = random.uniform(log_low, log_high) + + if self.base is None: + return math.exp(v) + else: + return self.base ** v + + +def convert_to_distribution(val: Union[Primitive, List[Primitive], BaseDistribution]) -> BaseDistribution: + if isinstance(val, BaseDistribution): + return val + if isinstance(val, (list, tuple)): + return CategoricalDistribution(val) + + return CategoricalDistribution([val]) + + +def can_enumerate(distributions: List[BaseDistribution]) -> bool: + return all(d.is_discrete for d in distributions) + + +def enumerate_dists(distributions: List[BaseDistribution]) -> 
List[List[Primitive]]: + all_prims = [list(d.enumerate()) for d in distributions] + + expanded_prims = itertools.product(*all_prims) + + realizations = list(expanded_prims) + + return realizations + + +def sample_dists(distributions: List[BaseDistribution]) -> List[Primitive]: + return [d.sample() for d in distributions] + +_FACTORIES = { + 'uniform': UniformDistribution, + 'normal': NormalDistribution, + 'log_uniform': LogUniformDistribution, +} + +def load_config(cfg: Union[Any, Dict[str, Primitive]]): + if isinstance(cfg, dict) and 'distribution' in cfg: + cfg = copy.copy(cfg) + dist_name = cfg['distribution'] + del cfg['distribution'] + + v = _FACTORIES[dist_name](**cfg) + else: + v = cfg + + return convert_to_distribution(v) \ No newline at end of file diff --git a/runx/runx.py b/runx/runx.py index b874ed1..af18522 100755 --- a/runx/runx.py +++ b/runx/runx.py @@ -34,6 +34,7 @@ from shutil import copytree, ignore_patterns import os +import random import re import sys import subprocess @@ -41,9 +42,14 @@ import argparse import itertools -from .utils import read_config, save_hparams, exec_cmd from .config import cfg +from .distributions import ( + load_config, convert_to_distribution, + can_enumerate, enumerate_dists, + sample_dists +) from .farm import build_farm_cmd, upload_to_ngc +from .utils import read_config, save_hparams, exec_cmd parser = argparse.ArgumentParser(description='Experiment runner') @@ -79,7 +85,7 @@ def expand_hparams(hparams): def construct_cmd(cmd, hparams, logdir): """ Build training command by starting with user-supplied 'CMD' - and then adding in hyperparameters, which came from expanding the + and then adding in hyperparameters, which came from expanding the cross-product of all permutations from the experiment yaml file. We always copy the code to the target logdir and run from there. 
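    # As a reference point for construct_cmd, here is a rough model of how an
    # hparams dict becomes a commandline fragment. This is a hand-written
    # sketch, not runx's expand_hparams itself; the boolean convention
    # (true emits a bare flag, false omits it) is taken from the README's
    # Booleans section above.
    #
    #     def hparams_to_cli(hparams):
    #         parts = []
    #         for key, value in hparams.items():
    #             if value is True:
    #                 parts.append(f'--{key}')          # true -> bare flag
    #             elif value is False:
    #                 continue                          # false -> flag omitted
    #             else:
    #                 parts.append(f'--{key} {value}')
    #         return ' '.join(parts)
    #
    #     # {'lr': 0.01, 'solver': 'sgd'} -> '--lr 0.01 --solver sgd'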
@@ -102,7 +108,7 @@ def construct_cmd(cmd, hparams, logdir): exec_str = '' if 'submit_job' in cfg.SUBMIT_CMD: exec_str = 'exec' - + cmd = f'cd {logdir}/code; PYTHONPATH={pythonpath} {exec_str} {cmd}' return cmd @@ -149,13 +155,39 @@ def cross_product_hparams(hparams): expanded_hparams = itertools.product(*hparam_values) # have to do this in order to know length - expanded_hparams, dup_expanded = itertools.tee(expanded_hparams, 2) expanded_hparams = list(expanded_hparams) - num_cases = len(list(dup_expanded)) + num_cases = len(expanded_hparams) return expanded_hparams, num_cases +def enumerate_hparams(hparams, num_trials): + for k in hparams: + hparams[k] = load_config(hparams[k]) + + if can_enumerate(hparams.values()): + realizations = enumerate_dists(hparams.values()) + + if num_trials == 0 or num_trials > len(realizations): + return realizations + else: + return random.choices(realizations, k=num_trials) + else: + if num_trials == 0: + raise ValueError("The number of trials must be specified" + " when optimizing over continuous" + " distributions") + + realizations = [ + sample_dists(hparams.values()) + for _ in range(num_trials) + ] + + return realizations + + # # TODO: Determine if full grid search, or smarter + # return cross_product_hparams(hparams) + def get_field(adict, f, required=True): if required: assert f in adict, 'expected {} to be defined in experiment'.format(f) @@ -283,8 +315,11 @@ def run_yaml(experiment, runroot): for k, v in experiment['HPARAMS'].items(): yaml_hparams[k] = v + num_trials = experiment.get('NUM_TRIALS', 0) + # Calculate cross-product of hyperparams - expanded_hparams, num_cases = cross_product_hparams(yaml_hparams) + expanded_hparams = enumerate_hparams(yaml_hparams, num_trials) + num_cases = len(expanded_hparams) # Run each permutation for i, hparam_vals in enumerate(expanded_hparams): @@ -328,7 +363,7 @@ def run_yaml(experiment, runroot): if not args.interactive: cmd = build_farm_cmd(cmd, job_name, resource_copy, logdir) - + if args.no_run: print(cmd) continue @@ -351,7 +386,7 @@ def run_yaml(experiment, runroot): else: print('Submitting job {}'.format(job_name)) exec_cmd(cmd) - + def run_experiment(exp_fn): """ diff --git a/runx/utils.py b/runx/utils.py index c392827..6360026 100755 --- a/runx/utils.py +++ b/runx/utils.py @@ -81,7 +81,7 @@ def read_config_file(): elif os.path.exists(global_config_fn): config_fn = global_config_fn else: - raise('can\'t find file ./.runx or ~/.config/runx.yml config files') + raise FileNotFoundError('Can\'t find file ./.runx or ~/.config/runx.yml config files') if 'FullLoader' in dir(yaml): global_config = yaml.load(open(config_fn), Loader=yaml.FullLoader) else: From 3306d36e29897b6c63ab8796e41953d256a975c1 Mon Sep 17 00:00:00 2001 From: Mike Ranzinger Date: Mon, 26 Oct 2020 13:47:04 -0700 Subject: [PATCH 02/14] Bug fixes --- examples/mnist_multi.yml | 13 +++++++++---- runx/distributions.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/mnist_multi.yml b/examples/mnist_multi.yml index 2612610..cfa532d 100755 --- a/examples/mnist_multi.yml +++ b/examples/mnist_multi.yml @@ -1,17 +1,22 @@ CMD: "python mnist.py" +NUM_TRIALS: 10 HPARAMS: [ { - RUNX.TAG: 'foo', + RUNX.TAG: 'sgd', lr: 0.01, momentum: [0.5, 0.25], logdir: LOGDIR, }, { - RUNX.TAG: 'bar', - lr: 0.02, + RUNX.TAG: 'adam', + lr: { + distribution: log_uniform, + low: 0.0001, + high: 0.1 + }, momentum: [0.25, 0.12], logdir: LOGDIR, }, ] - + diff --git a/runx/distributions.py b/runx/distributions.py index 3b37f42..13970e5 100644 --- 
a/runx/distributions.py +++ b/runx/distributions.py @@ -60,7 +60,7 @@ def sample(self): class LogUniformDistribution(BaseDistribution): - def __init__(self, low: float, high: float, base: float=None): + def __init__(self, low: float, high: float, base: float=math.e): self.low = low self.high = high self.base = base From 0875eb6da34f7c874dac57f290c18a4499f8d4ba Mon Sep 17 00:00:00 2001 From: Mike Ranzinger Date: Tue, 27 Oct 2020 12:42:53 -0700 Subject: [PATCH 03/14] Refactoring. Explicit categories are guaranteed to be sampled fairly. --- examples/mnist.yml | 1 + examples/mnist_multi.yml | 1 + runx/distributions.py | 100 +++++++++++++++++++++++++++++++++++---- runx/runx.py | 73 ++-------------------------- 4 files changed, 99 insertions(+), 76 deletions(-) diff --git a/examples/mnist.yml b/examples/mnist.yml index 94ecdc9..a13ba13 100755 --- a/examples/mnist.yml +++ b/examples/mnist.yml @@ -11,4 +11,5 @@ HPARAMS: distribution: uniform low: 0.5 high: 0.999 + optim: ['sgd', 'adam', 'radam'] logdir: LOGDIR diff --git a/examples/mnist_multi.yml b/examples/mnist_multi.yml index cfa532d..911a752 100755 --- a/examples/mnist_multi.yml +++ b/examples/mnist_multi.yml @@ -15,6 +15,7 @@ HPARAMS: [ low: 0.0001, high: 0.1 }, + # TODO: Discrete distributions should be sampled evenly! momentum: [0.25, 0.12], logdir: LOGDIR, }, diff --git a/runx/distributions.py b/runx/distributions.py index 13970e5..59a9ff2 100644 --- a/runx/distributions.py +++ b/runx/distributions.py @@ -2,7 +2,7 @@ import itertools import math import random -from typing import List, Union, Any, Iterable, Dict +from typing import List, Union, Any, Iterable, Dict, Tuple Primitive = Union[int, float, str] @@ -27,20 +27,32 @@ def enumerate(self) -> Iterable[Primitive]: class CategoricalDistribution(BaseDistribution): - def __init__(self, categories: List[Primitive]): + def __init__(self, categories: List[Primitive], literal=False): self.categories = categories + self.literal = literal def sample(self): return random.choice(self.categories) @property def is_discrete(self): - return True + return self.literal def enumerate(self): return self.categories +class MultinomialDistribution(CategoricalDistribution): + def __init__(self, categories: List[Primitive], weights: List[float]): + super().__init__(categories) + + total_weight = sum(weights) + self.weights = [w / total_weight for w in weights] + + def sample(self): + return random.choices(self.categories, self.weights, k=1)[0] + + class UniformDistribution(BaseDistribution): def __init__(self, low: float, high: float): self.low = low @@ -81,13 +93,21 @@ def convert_to_distribution(val: Union[Primitive, List[Primitive], BaseDistribut if isinstance(val, BaseDistribution): return val if isinstance(val, (list, tuple)): - return CategoricalDistribution(val) + return CategoricalDistribution(val, literal=True) - return CategoricalDistribution([val]) + return CategoricalDistribution([val], literal=True) -def can_enumerate(distributions: List[BaseDistribution]) -> bool: - return all(d.is_discrete for d in distributions) +def discrete_continuous_split(distributions: List[BaseDistribution]) \ + -> Tuple[List[Tuple[int, BaseDistribution]], List[Tuple[int, BaseDistribution]]]: + disc = [] + cont = [] + for i, dist in enumerate(distributions): + if dist.is_discrete: + disc.append((i, dist)) + else: + cont.append((i, dist)) + return disc, cont def enumerate_dists(distributions: List[BaseDistribution]) -> List[List[Primitive]]: @@ -107,6 +127,7 @@ def sample_dists(distributions: List[BaseDistribution]) -> 
List[Primitive]: 'uniform': UniformDistribution, 'normal': NormalDistribution, 'log_uniform': LogUniformDistribution, + 'categorical': CategoricalDistribution, } def load_config(cfg: Union[Any, Dict[str, Primitive]]): @@ -119,4 +140,67 @@ def load_config(cfg: Union[Any, Dict[str, Primitive]]): else: v = cfg - return convert_to_distribution(v) \ No newline at end of file + return convert_to_distribution(v) + + +def enumerate_hparams(hparams, num_trials) -> List[List[Primitive]]: + for k in hparams: + hparams[k] = load_config(hparams[k]) + + discrete_dists, continuous_dists = discrete_continuous_split(hparams.values()) + + if discrete_dists and not continuous_dists: + realizations = enumerate_dists([d[1] for d in discrete_dists]) + + if num_trials == 0 or num_trials > len(realizations): + return realizations + else: + return random.choices(realizations, k=num_trials) + + if num_trials == 0: + raise ValueError("The number of trials must be specified" + " when optimizing over continuous" + " distributions") + + if continuous_dists and not discrete_dists: + continuous_dists = [d[1] for d in continuous_dists] + realizations = [ + sample_dists(continuous_dists) + for _ in range(num_trials) + ] + + return realizations + + discrete_realizations = enumerate_dists([d[1] for d in discrete_dists]) + + required_trials = math.ceil(num_trials / len(discrete_realizations)) * len(discrete_realizations) + + if required_trials > 2 * num_trials: + raise ValueError(f"The number of required trials - {required_trials} - is more than" + f" double the number of allotted trials - {num_trials}" + f" in order to satisfy the grid and sampling constraints." + f" Please increase the number of trials, or explicitly define" + f" one or more of the discrete sets using the 'categorical'" + f" distribution.") + elif required_trials > num_trials: + print(f'Warning: Requiring {required_trials} total trials instead of {num_trials} to satisfy constraints.') + + num_trials = required_trials + num_trials_per_disc = num_trials // len(discrete_realizations) + + cont_dists = [d[1] for d in continuous_dists] + + realizations = [] + for disc_realization in discrete_realizations: + for _ in range(num_trials_per_disc): + ret = [None for _ in range(len(hparams))] + for k, r in enumerate(disc_realization): + ret[discrete_dists[k][0]] = r + + cont_realization = sample_dists(cont_dists) + for k, r in enumerate(cont_realization): + ret[continuous_dists[k][0]] = r + + realizations.append(ret) + + return realizations \ No newline at end of file diff --git a/runx/runx.py b/runx/runx.py index af18522..d15bede 100755 --- a/runx/runx.py +++ b/runx/runx.py @@ -33,21 +33,19 @@ from datetime import datetime from shutil import copytree, ignore_patterns +import argparse +import itertools import os +import math import random import re import sys import subprocess +from typing import Iterable, List import yaml -import argparse -import itertools from .config import cfg -from .distributions import ( - load_config, convert_to_distribution, - can_enumerate, enumerate_dists, - sample_dists -) +from .distributions import enumerate_hparams from .farm import build_farm_cmd, upload_to_ngc from .utils import read_config, save_hparams, exec_cmd @@ -127,67 +125,6 @@ def islist(elem): return type(elem) is list or type(elem) is tuple -def cross_product_hparams(hparams): - """ - This function takes in just the hyperparameters for the target script, - such as your main.py. 
- - inputs: - hparams is a dict, where each key is the name of a commandline arg and - the value is the target value of the arg. - - However any arg can also be a list and so this function will calculate - the cross product for all combinations of all args. - - output: - The return value is a sequence of lists. Each list is one of the - permutations of argument values. - """ - hparam_values = [] - - # turn every hyperparam into a list, to prep for itertools.product - for elem in hparams.values(): - if islist(elem): - hparam_values.append(elem) - else: - hparam_values.append([elem]) - - expanded_hparams = itertools.product(*hparam_values) - - # have to do this in order to know length - expanded_hparams = list(expanded_hparams) - num_cases = len(expanded_hparams) - - return expanded_hparams, num_cases - - -def enumerate_hparams(hparams, num_trials): - for k in hparams: - hparams[k] = load_config(hparams[k]) - - if can_enumerate(hparams.values()): - realizations = enumerate_dists(hparams.values()) - - if num_trials == 0 or num_trials > len(realizations): - return realizations - else: - return random.choices(realizations, k=num_trials) - else: - if num_trials == 0: - raise ValueError("The number of trials must be specified" - " when optimizing over continuous" - " distributions") - - realizations = [ - sample_dists(hparams.values()) - for _ in range(num_trials) - ] - - return realizations - - # # TODO: Determine if full grid search, or smarter - # return cross_product_hparams(hparams) - def get_field(adict, f, required=True): if required: assert f in adict, 'expected {} to be defined in experiment'.format(f) From 93437d9535c847322bc7b82cdfc6e5f9876a957d Mon Sep 17 00:00:00 2001 From: Mike Ranzinger Date: Tue, 27 Oct 2020 13:12:03 -0700 Subject: [PATCH 04/14] Updating the readme --- README.md | 67 ++++++++++++++++++++++++++++++++++++++++++- runx/distributions.py | 1 + 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index bcae11c..fb3ba5a 100755 --- a/README.md +++ b/README.md @@ -14,6 +14,8 @@ - [Introduction example](#introduction-example) - [runx is especially useful to launch batch jobs to a farm.](#runx-is-especially-useful-to-launch-batch-jobs-to-a-farm) - [Unique run directories](#unique-run-directories) + - [Hyperopt Sampling](#hyperopt-sampling) + - [Supported Distributions](#supported-distributions) - [Summarization with sumx](#summarization-with-sumx) - [runx Architecture](#runx-architecture) - [Create a project-specific configuration file](#create-a-project-specific-configuration-file) @@ -70,7 +72,7 @@ python train.py --lr 0.01 --solver adam python train.py --lr 0.02 --solver sgd python train.py --lr 0.02 --solver adam ``` -You can see that runx automatically computes the cross product of all hyperparameters, which in this case results in 4 runs. It then builds commandlines by concatenating the hyperparameters with the training command. +You can see that runx automatically computes the cross product of all hyperparameters, which in this case results in 4 runs. It then builds commandlines by concatenating the hyperparameters with the training command. runx also supports more advanced search, such as sampling values from distributions. 
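As a taste of what sampling buys you, here is a minimal standalone sketch that mirrors `LogUniformDistribution.sample` from `runx/distributions.py` above (base 10 chosen to match the mnist example):
```python
import math
import random

def log_uniform(low, high, base=10):
    # Draw uniformly in log space, then map back -- mirrors
    # LogUniformDistribution.sample in runx/distributions.py.
    v = random.uniform(math.log(low, base), math.log(high, base))
    return base ** v

lrs = [log_uniform(1e-4, 1e-1) for _ in range(4)]  # e.g. candidate learning rates
```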
A few useful runx options: ``` @@ -136,6 +138,69 @@ submit_job --gpu 2 --cpu 16 --mem 128 -c "python train.py --lr 0.02 --solver sgd submit_job --gpu 2 --cpu 16 --mem 128 -c "python train.py --lr 0.02 --solver adam --logdir /home/logs/vengeful-jaguar_2020.02.06_14.19" ``` +### Hyperopt Sampling +In addition to performing an exhaustive search over the space of hyperparameters, runx also supports sampling parameter values from some common distributions. Currently, the only supported sweep method when using distributions is random sampling. While unintuitive, random sampling theoretically leads to better models using fewer trials than exhaustive grid searching. Refer to this [tutorial](https://blog.usejournal.com/a-comparison-of-grid-search-and-randomized-search-using-scikit-learn-29823179bc85) for some insight into this. + +An example usage of distributions can be found in the `mnist*.yml` examples. + +```yml +CMD: "python mnist.py" + +NUM_TRIALS: 10 +HPARAMS: + lr: + distribution: log_uniform + low: 0.0001 + high: 0.1 + base: 10 + momentum: + distribution: uniform + low: 0.5 + high: 0.999 + optim: ['sgd', 'adam', 'radam'] + use_bn: [true, false] + logdir: LOGDIR +``` + +Here, there are some key changes from the simple enumeration shown before. In particular, `NUM_TRIALS` specifies the number of jobs that will be launched, and **must** be specified when using distributions. For the distribution parameters, instead of providing explicit values for a parameter, you instead provide the parameters to the distribution from which to sample. + +Next, you'll notice that `lr` and `momentum` are sampled from distributions, but `optim`, `use_bn` and `logdir` have an explicit set of values. The behavior of runx is such that concrete values will be uniformly realized, guaranteed. What this means is that 6 different sets of runs will be launched. We arrive at 6 because of the number of combinations of `(optim, use_bn, logdir)` is 6. + +Next, runx finds the minimum number of samples per set that is closest to `NUM_TRIALS` but results in each concrete realization being evenly sampled. In this case, this requires 12 trials, where each set gets 2 samples values from the `lr` and `momentum` distributions. + +**NOTE**: If the number of actual trials is `> 2 * NUM_TRIALS` runx will emit an error instead of executing the trials. To fix this, either increase the number of trials, or make some of your concrete parameters be the `categorical` distribution in the config instead. + + +#### Supported Distributions + +```yml +some_param: + distribution: uniform + low: + high: +some_param2: + distribution: log_uniform + low: + high: + base: +some_param3: + distribution: normal + mean: + std: +some_param4: + # NOTE: While this seems similar to providing explicit values, + # instead of the combinations being realized, this will instead be + # sampled from. + distribution: categorical + categories: [a, b, c, 1, 2, 3] +some_param5: + distribution: multinomial + categories: [x, y, z, 1.5, 3, 7] + # NOTE: It's not necessary for the weights to sum to 1 + weights: [1, 2, 3, 4, 5, 6] +``` + + ## Summarization with sumx After you've run your experiment, you will likely want to summarize the results. You might want to know: * Which training run was best? 
diff --git a/runx/distributions.py b/runx/distributions.py index 59a9ff2..f4e8034 100644 --- a/runx/distributions.py +++ b/runx/distributions.py @@ -128,6 +128,7 @@ def sample_dists(distributions: List[BaseDistribution]) -> List[Primitive]: 'normal': NormalDistribution, 'log_uniform': LogUniformDistribution, 'categorical': CategoricalDistribution, + 'multinomial': MultinomialDistribution, } def load_config(cfg: Union[Any, Dict[str, Primitive]]): From 229d90157cd777c2f55c705c031cc355c8b1f3e1 Mon Sep 17 00:00:00 2001 From: Mike Ranzinger Date: Wed, 24 Mar 2021 13:27:00 -0700 Subject: [PATCH 05/14] Quality of life improvements --- runx/config.py | 1 + runx/runx.py | 185 ++++++++++++++++++++++++++++--------------------- runx/utils.py | 107 +++++++++++++++++++++------- 3 files changed, 189 insertions(+), 104 deletions(-) diff --git a/runx/config.py b/runx/config.py index a1e6abf..831600f 100644 --- a/runx/config.py +++ b/runx/config.py @@ -9,3 +9,4 @@ __C.FARM = None __C.LOGROOT = None __C.EXP_NAME = None +__C.YML_PARAMS = False \ No newline at end of file diff --git a/runx/runx.py b/runx/runx.py index d15bede..180dae8 100755 --- a/runx/runx.py +++ b/runx/runx.py @@ -63,6 +63,8 @@ help='no coolname, no datestring') parser.add_argument('--farm', type=str, default=None, help='Select farm for workstation submission') +parser.add_argument('--yml_params', action='store_true', + help='Hyperparameters are specified via a config yaml as opposed to through the command line.') args = parser.parse_args() @@ -91,8 +93,10 @@ def construct_cmd(cmd, hparams, logdir): :cmd: farm submission command :hparams: hyperparams for training command """ - # First, add hyperparameters - cmd += ' ' + expand_hparams(hparams) + # Only add to the command line if YML_PARAMS isn't specified + if not cfg.YML_PARAMS: + # First, add hyperparameters + cmd += ' ' + expand_hparams(hparams) # Expand PYTHONPATH, if necessary if cfg.PYTHONPATH is not None: @@ -135,18 +139,21 @@ def do_keyword_expansion(alist, pairs): """ Substitute a string in place of certain keywords """ - if type(alist) is list or type(alist) is tuple: + if isinstance(alist, (list, tuple)): for i, v in enumerate(alist): - if type(v) == str: - for k, v in pairs: - alist[i] = alist[i].replace(k, v) - elif type(alist) is dict: + alist[i] = do_keyword_expansion(v, pairs) + return alist + elif isinstance(alist, dict): for a_k, a_v in alist.items(): - if type(a_v) == str: - for k, v in pairs: - alist[a_k] = alist[a_k].replace(k, v) + alist[a_k] = do_keyword_expansion(a_v, pairs) + return alist + elif isinstance(alist, str): + ret = alist + for k, v in pairs: + ret = ret.replace(k, v) + return ret else: - raise + return alist def make_cool_names(): @@ -180,10 +187,17 @@ def copy_code(logdir, runroot, code_ignore_patterns): copytree(runroot, tgt_code_dir, ignore=code_ignore_patterns) -def hacky_substitutions(hparams, resource_copy, logdir, runroot): +def write_yml_params(logdir, hparams): + with open(os.path.join(logdir, 'hparams.yml'), 'w') as fd: + yaml.dump(hparams, fd) + + +def hacky_substitutions(cmd, hparams, resource_copy, logdir, runroot, replica): + replace_list = [('LOGDIR', logdir), ('REPLICA', str(replica))] # Substitute the true logdir in for the magic variable LOGDIR - do_keyword_expansion(hparams, [('LOGDIR', logdir)]) - do_keyword_expansion(resource_copy, [('LOGDIR', logdir)]) + hparams = do_keyword_expansion(hparams, replace_list) + resource_copy = do_keyword_expansion(resource_copy, replace_list) + cmd = do_keyword_expansion(cmd, replace_list) # Build 
hparams to save out after LOGDIR but before deleting # the key 'SUBMIT_JOB.NODES', so that it is part of the hparams saved @@ -203,7 +217,7 @@ def hacky_substitutions(hparams, resource_copy, logdir, runroot): # Record the directory from whence the experiments were launched hparams_out['srcdir'] = runroot - return hparams_out + return cmd, hparams_out def get_tag(hparams): @@ -253,76 +267,87 @@ def run_yaml(experiment, runroot): yaml_hparams[k] = v num_trials = experiment.get('NUM_TRIALS', 0) + num_replications = experiment.get('NUM_REPLICAS', 1) # Calculate cross-product of hyperparams expanded_hparams = enumerate_hparams(yaml_hparams, num_trials) num_cases = len(expanded_hparams) # Run each permutation - for i, hparam_vals in enumerate(expanded_hparams): - hparam_vals = list(hparam_vals) - hparam_keys = list(yaml_hparams.keys()) - - # hparams to use for experiment - hparams = {k: v for k, v in zip(hparam_keys, hparam_vals)} - if skip_run(hparams): - continue - get_tag(hparams) - - job_name, logdir, coolname, expdir = make_cool_names() - resource_copy = resources.copy() - - """ - A few different modes of operation: - 1. interactive runs - a. copy local code to logdir under LOGROOT - b. cd to logdir, execute cmd - - 2. farm submission: non-NGC - In this regime, the LOGROOT is expected to be visible to the farm's compute nodes - a. copy local code to logdir under LOGROOT - b. call cmd, which should invoke whatever you have specified for SUBMIT_JOB - - 3. farm submission: NGC - a. copy local code to logdir under LOGROOT - b. ngc workspace upload the logdir to NGC_WORKSPACE - c. call cmd, which should invoke SUBMIT_JOB==`ngc batch run` - """ - if ngc_batch: - ngc_logdir = logdir.replace(cfg.LOGROOT, cfg.NGC_LOGROOT) - hparams_out = hacky_substitutions( - hparams, resource_copy, ngc_logdir, runroot) - cmd = construct_cmd(experiment_cmd, hparams, ngc_logdir) - else: - hparams_out = hacky_substitutions( - hparams, resource_copy, logdir, runroot) - cmd = construct_cmd(experiment_cmd, hparams, logdir) - - if not args.interactive: - cmd = build_farm_cmd(cmd, job_name, resource_copy, logdir) - - if args.no_run: - print(cmd) - continue - - # copy code to NFS-mounted share - copy_code(logdir, runroot, code_ignore_patterns) - - # save some meta-data from run - save_cmd(cmd, logdir) - - # upload to remote farm - if ngc_batch: - upload_to_ngc(logdir) - - subprocess.call(['chmod', '-R', 'a+rw', expdir]) - os.chdir(logdir) - - if args.interactive: - print('Running job {}'.format(job_name)) - else: - print('Submitting job {}'.format(job_name)) - exec_cmd(cmd) + for hparam_vals in expanded_hparams: + g_job_name, g_logdir, coolname, expdir = make_cool_names() + + for replica in range(num_replications): + if num_replications > 1: + job_name = f'{g_job_name}_run_{replica}' + logdir = f'{g_logdir}/run_{replica}' + else: + job_name = g_job_name + logdir = g_logdir + + hparam_vals = list(hparam_vals) + hparam_keys = list(yaml_hparams.keys()) + + # hparams to use for experiment + hparams = {k: v for k, v in zip(hparam_keys, hparam_vals)} + if skip_run(hparams): + continue + get_tag(hparams) + + resource_copy = resources.copy() + """ + A few different modes of operation: + 1. interactive runs + a. copy local code to logdir under LOGROOT + b. cd to logdir, execute cmd + + 2. farm submission: non-NGC + In this regime, the LOGROOT is expected to be visible to the farm's compute nodes + a. copy local code to logdir under LOGROOT + b. call cmd, which should invoke whatever you have specified for SUBMIT_JOB + + 3. 
farm submission: NGC + a. copy local code to logdir under LOGROOT + b. ngc workspace upload the logdir to NGC_WORKSPACE + c. call cmd, which should invoke SUBMIT_JOB==`ngc batch run` + """ + if ngc_batch: + ngc_logdir = logdir.replace(cfg.LOGROOT, cfg.NGC_LOGROOT) + cmd, hparams_out = hacky_substitutions( + experiment_cmd, hparams, resource_copy, ngc_logdir, runroot, replica) + cmd = construct_cmd(cmd, hparams, ngc_logdir) + else: + cmd, hparams_out = hacky_substitutions( + experiment_cmd, hparams, resource_copy, logdir, runroot, replica) + cmd = construct_cmd(cmd, hparams, logdir) + + if not args.interactive: + cmd = build_farm_cmd(cmd, job_name, resource_copy, logdir) + + if args.no_run: + print(cmd) + continue + + # copy code to NFS-mounted share + copy_code(logdir, runroot, code_ignore_patterns) + if cfg.YML_PARAMS: + write_yml_params(logdir, hparams) + + # save some meta-data from run + save_cmd(cmd, logdir) + + # upload to remote farm + if ngc_batch: + upload_to_ngc(logdir) + + subprocess.call(['chmod', '-R', 'a+rw', expdir]) + os.chdir(logdir) + + if args.interactive: + print('Running job {}'.format(job_name)) + else: + print('Submitting job {}'.format(job_name)) + exec_cmd(cmd) def run_experiment(exp_fn): @@ -330,7 +355,7 @@ def run_experiment(exp_fn): Run an experiment, given a global config file + an experiment file. The global config sets defaults that are inherited by the experiment. """ - experiment = read_config(args.farm, args.exp_yml) + experiment = read_config(args.farm, args.exp_yml, args.yml_params) assert 'HPARAMS' in experiment, 'experiment file is missing hparams' diff --git a/runx/utils.py b/runx/utils.py index 6360026..142572e 100755 --- a/runx/utils.py +++ b/runx/utils.py @@ -32,9 +32,12 @@ import shlex import json import socket +import re import subprocess from subprocess import call, getoutput, DEVNULL + +from yaml.loader import FullLoader from .config import cfg @@ -72,50 +75,106 @@ def read_config_item(config, key, optional=False): raise f'can\'t find {key} in config' +def merge_recursive(a, b): + if isinstance(a, dict): + assert isinstance(b, dict) + ret = {**a} + for k, v in b.items(): + if k in ret: + ret[k] = merge_recursive(ret[k], v) + else: + ret[k] = v + return ret + else: + return b + + +def expand_environment(config): + ''' + Converts dictionary values that begin with '$' into their corresponding environment values + ''' + if isinstance(config, dict): + for k, v in config.items(): + config[k] = expand_environment(v) + return config + elif isinstance(config, (list, tuple)): + for i, v in enumerate(config): + config[i] = expand_environment(v) + elif isinstance(config, str): + split_vars = re.split(r'(\$\w+)', config, flags=re.ASCII) + for i, part in enumerate(split_vars): + if part.startswith('$'): + env = os.environ.get(part[1:], None) + if env is None: + raise ValueError(f'The environment variable ${part} doesn\'t exist!') + split_vars[i] = env + + config = ''.join(split_vars) + + return config + return config + + +def load_yaml(file_name): + ''' + Loads the yml file and returns the python object + ''' + with open(file_name, 'r') as fd: + if 'FullLoader' in dir(yaml): + return yaml.load(fd, Loader=yaml.FullLoader) + else: + return yaml.load(fd) + def read_config_file(): + ''' + Loads the user wide configuration (at ~/.config/runx.yml) and the project wide configuration (at ./.runx). + If both files exist, they are merged following the precendence that project settings overwrite global settings. 
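+    For example, if LOGROOT is set in both files, the value from ./.runx wins.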
+ ''' local_config_fn = './.runx' home = os.path.expanduser('~') global_config_fn = '{}/.config/runx.yml'.format(home) + + config = dict() + + if os.path.isfile(global_config_fn): + config = merge_recursive(config, load_yaml(global_config_fn)) if os.path.isfile(local_config_fn): - config_fn = local_config_fn - elif os.path.exists(global_config_fn): - config_fn = global_config_fn - else: - raise FileNotFoundError('Can\'t find file ./.runx or ~/.config/runx.yml config files') - if 'FullLoader' in dir(yaml): - global_config = yaml.load(open(config_fn), Loader=yaml.FullLoader) - else: - global_config = yaml.load(open(config_fn)) - return global_config + config = merge_recursive(config, load_yaml(local_config_fn)) + if not config: + raise FileNotFoundError(f'Can\'t find file "{global_config_fn}" or "{local_config_fn}" config files!') -def read_config(args_farm, args_exp_yml): + return config + + +def read_config(args_farm, args_exp_yml, args_yml_params): ''' Merge the global and experiment config files into a single config ''' - global_config = read_config_file() + experiment = read_config_file() + + if args_exp_yml is not None: + experiment = merge_recursive(experiment, load_yaml(args_exp_yml)) + + experiment = expand_environment(experiment) if args_farm is not None: - global_config['FARM'] = args_farm + experiment['FARM'] = args_farm + if args_yml_params: + experiment['YML_PARAMS'] = True - farm_name = read_config_item(global_config, 'FARM') - assert farm_name in global_config, f'Can\'t find farm {farm_name} defined in .runx' + farm_name = read_config_item(experiment, 'FARM') + assert farm_name in experiment, f'Can\'t find farm {farm_name} defined in .runx' # Dereference the farm config items - for k, v in global_config[farm_name].items(): - global_config[k] = v - - # Inherit global config into experiment config: - experiment = global_config - if args_exp_yml is not None: - exp_config = yaml.load(open(args_exp_yml), Loader=yaml.FullLoader) - for k, v in exp_config.items(): - experiment[k] = v + for k, v in experiment[farm_name].items(): + experiment[k] = v cfg.FARM = read_config_item(experiment, 'FARM') cfg.LOGROOT = read_config_item(experiment, 'LOGROOT') cfg.SUBMIT_CMD = read_config_item(experiment, 'SUBMIT_CMD') cfg.PYTHONPATH = read_config_item(experiment, 'PYTHONPATH', optional=True) + cfg.YML_PARAMS = read_config_item(experiment, 'YML_PARAMS') if args_exp_yml is not None: cfg.EXP_NAME = os.path.splitext(os.path.basename(args_exp_yml))[0] if 'ngc' in cfg.FARM: From d01a792ce091fea2210dcdc4cffcde48307e0cfe Mon Sep 17 00:00:00 2001 From: Mike Ranzinger Date: Wed, 24 Mar 2021 13:29:57 -0700 Subject: [PATCH 06/14] Adding ignore file --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ From bb76b0cad38eb06d469baa36e23f477ea7d783cd Mon Sep 17 00:00:00 2001 From: Mike Ranzinger Date: Thu, 25 Mar 2021 07:24:23 -0700 Subject: [PATCH 07/14] Added some checkpoint loading stuff that got missed a year ago --- runx/logx.py | 61 ++++++++++++++++++++++++++++++++++++++++++++++++---- runx/runx.py | 2 +- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/runx/logx.py b/runx/logx.py index 290015c..13f7b21 100755 --- a/runx/logx.py +++ b/runx/logx.py @@ -317,10 +317,32 @@ def save_model(self, save_dict, metric, epoch, higher_better=True, copyfile(self.save_ckpt_fn, self.best_ckpt_fn) return is_better - def 
get_best_checkpoint(self): + def get_best_checkpoint(self, checkpoint_dir=None): """ Finds the checkpoint in `self.logdir` that is considered best. + If `checkpoint_dir` is supplied, then the best checkpoint in that + directory is found instead of `self.logdir`. + + If, for some reason, there are multiple best checkpoint files, then + the one with the highest epoch will be preferred. + + Returns: + None - If there is no best checkpoint file + path (str) - The full path to the best checkpoint otherwise. + """ + if not checkpoint_dir: + checkpoint_dir = self.logdir + + return self.find_best_checkpoint(checkpoint_dir) + + def get_checkpoint(self, checkpoint_dir=None, find_best=True): + """ + Finds the checkpoint in `self.logdir` that is considered best. + + If `checkpoint_dir` is supplied, then the best checkpoint in that + directory is found instead of `self.logdir`. + If, for some reason, there are multiple best checkpoint files, then the one with the highest epoch will be preferred. @@ -328,10 +350,41 @@ def get_best_checkpoint(self): None - If there is no best checkpoint file path (str) - The full path to the best checkpoint otherwise. """ - match_str = r'^best_checkpoint_ep([0-9]+).pth$' + if not checkpoint_dir: + checkpoint_dir = self.logdir + + return self.find_checkpoint(checkpoint_dir, find_best=find_best) + + @staticmethod + def find_best_checkpoint(checkpoint_dir): + return LogX.find_checkpoint(checkpoint_dir) + + @staticmethod + def find_last_checkpoint(checkpoint_dir): + return LogX.find_checkpoint(checkpoint_dir, find_best=False) + + @staticmethod + def find_checkpoint(checkpoint_dir, find_best=True): + """ + Searches the specified directory for a latest checkpoint. + + If `find_best == True` then it will search for the best checkpoint + based on the stored metric value. Otherwise, it will load the latest + saved checkpoint regardless of metric. + + If no checkpoint is found, returns None + """ + if os.path.isfile(checkpoint_dir): + return checkpoint_dir + + if find_best: + match_str = r'^best_checkpoint_ep([0-9]+).pth$' + else: + match_str = r'^last_checkpoint_ep([0-9]+).pth$' + best_epoch = -1 best_checkpoint = None - for filename in os.listdir(self.logdir): + for filename in os.listdir(checkpoint_dir): match = re.fullmatch(match_str, filename) if match is not None: # Extract the epoch number @@ -342,7 +395,7 @@ def get_best_checkpoint(self): if best_checkpoint is None: return None - return os.path.join(self.logdir, best_checkpoint) + return os.path.join(checkpoint_dir, best_checkpoint) def load_model(self, path): """Restore a model and return a dict with any meta data included in diff --git a/runx/runx.py b/runx/runx.py index 180dae8..e4ab279 100755 --- a/runx/runx.py +++ b/runx/runx.py @@ -188,7 +188,7 @@ def copy_code(logdir, runroot, code_ignore_patterns): def write_yml_params(logdir, hparams): - with open(os.path.join(logdir, 'hparams.yml'), 'w') as fd: + with open(os.path.join(logdir, 'hyper_parameters.yml'), 'w') as fd: yaml.dump(hparams, fd) From 3dafe9a4aa39208a9b4898c6d9dc63c3082d4006 Mon Sep 17 00:00:00 2001 From: Mike Ranzinger Date: Thu, 25 Mar 2021 09:45:02 -0700 Subject: [PATCH 08/14] Bug fix. Added integer support to the uniform distribution. 
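The diff below adds an `is_integer` flag to `UniformDistribution`: the draw stays uniform and is then rounded to the nearest integer. A minimal standalone sketch of the resulting behavior (an illustrative copy, not the patched class itself):
```python
import random

def uniform_sample(low, high, is_integer=False):
    # Mirrors the patched UniformDistribution.sample below.
    val = random.uniform(low, high)
    if is_integer:
        val = int(round(val))  # round-to-nearest, as in the diff
    return val
```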
--- runx/distributions.py | 8 ++++++-- runx/runx.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/runx/distributions.py b/runx/distributions.py index f4e8034..0f726d7 100644 --- a/runx/distributions.py +++ b/runx/distributions.py @@ -54,12 +54,16 @@ def sample(self): class UniformDistribution(BaseDistribution): - def __init__(self, low: float, high: float): + def __init__(self, low: float, high: float, is_integer: bool=False): self.low = low self.high = high + self.is_integer = is_integer def sample(self): - return random.uniform(self.low, self.high) + val = random.uniform(self.low, self.high) + if self.is_integer: + val = int(round(val)) + return val class NormalDistribution(BaseDistribution): diff --git a/runx/runx.py b/runx/runx.py index e4ab279..397d4c0 100755 --- a/runx/runx.py +++ b/runx/runx.py @@ -188,7 +188,7 @@ def copy_code(logdir, runroot, code_ignore_patterns): def write_yml_params(logdir, hparams): - with open(os.path.join(logdir, 'hyper_parameters.yml'), 'w') as fd: + with open(os.path.join(logdir, 'code', 'hyper_parameters.yml'), 'w') as fd: yaml.dump(hparams, fd) From 554d8dbeffbb072bba7c9ffbb066a498715f2591 Mon Sep 17 00:00:00 2001 From: Mike Ranzinger Date: Wed, 31 Mar 2021 07:13:49 -0700 Subject: [PATCH 09/14] Working on bayesian hyperopt support --- runx/hyperopt_wrapper.py | 136 +++++++++++++++++++++++++++++++++++++++ runx/runx.py | 20 ++++-- 2 files changed, 150 insertions(+), 6 deletions(-) create mode 100644 runx/hyperopt_wrapper.py diff --git a/runx/hyperopt_wrapper.py b/runx/hyperopt_wrapper.py new file mode 100644 index 0000000..fe1bdac --- /dev/null +++ b/runx/hyperopt_wrapper.py @@ -0,0 +1,136 @@ +import argparse +import os +from functools import partial +import subprocess +import signal +import sys + +import numpy as np +from hyperopt import Trials, fmin, tpe +import hyperopt +import pickle as pkl +import datetime + +import torch +import torch.multiprocessing as mp + +from .utils import load_yaml + +SPACE_FILE = 'hyperopt_space.pkl' +TRIAL_FILE = 'hyperopt_trial.pkl' + +def main(): + parser = argparse.ArgumentParser(description='Wrapper script for invoking bayesian hyper-parameter search') + parser.add_argument('--hopt_sweep_root', type=str, required=True, + help='Path to the root of the hyper-parameter search') + parser.add_argument('--hopt_exp_root', type=str, required=True, + help='Path to the experiment root directory. 
Trials will be loaded from there.') + parser.add_argument('--hopt_num_trials', type=int, required=True, + help='The maximum number of hyperopt iterations') + + args, rest = parser.parse_args() + + trials = [] + + for dirpath, dirs, files in os.walk(args.hopt_sweep_root): + # Find the experiment directories by looking for the metric file + if TRIAL_FILE in files: + with open(os.path.join(dirpath, TRIAL_FILE), 'rb') as fd: + trial = pkl.load(fd) + trials.append(trial) + + if len(trials) >= args.hopt_num_trials: + return + + # Gather all of the disparate trials + history = Trials() + for t in trials: + merge_trials(history, t) + history.refresh() + + eval_fn = partial(evaluate, rest) + + with open(os.path.join(args.hopt_sweep_root, SPACE_FILE), 'rb') as fd: + space = pkl.load(fd) + + # Run the next hyper step + fmin(eval_fn, space=space, + algo=tpe.suggest, + trials=history, + max_evals=len(trials) + 1, + ) + + last_trial = get_last_trial(history) + + with open(os.path.join(args.hopt_exp_root, TRIAL_FILE), 'wb') as fd: + pkl.dump(last_trial, fd) + + if len(history) < args.hopt_num_trials: + queue_next_job() + + +def merge_trials(trials_accum: Trials, other_trials: Trials): + max_tid = max([trial['tid'] for trial in trials_accum.trials]) + + for trial in other_trials: + tid = trial['tid'] + max_tid + 1 + hyperopt_trial = Trials().new_trial_docs( + tids=[None], + specs=[None], + results=[None], + miscs=[None]) + hyperopt_trial[0] = trial + hyperopt_trial[0]['tid'] = tid + hyperopt_trial[0]['misc']['tid'] = tid + for key in hyperopt_trial[0]['misc']['idxs'].keys(): + hyperopt_trial[0]['misc']['idxs'][key] = [tid] + trials_accum.insert_trial_docs(hyperopt_trial) + trials_accum.refresh() + return trials_accum + + +def get_last_trial(history: Trials): + max_tid = max(history.tids) + + for trial in history: + if trial['tid'] == max_tid: + ret = Trials() + ret.insert_trial_doc(trial) + return ret + assert False, "Shouldn't reach this!" + + +class SignalHandler: + def __init__(self, child_procs): + self.child_procs = child_procs + + def __call__(self, incoming_signal, frame): + print("Signal %d detected in process %d " % ( incoming_signal, os.getpid() )) + print("Forwarding to children " ) + for child in self.child_procs: + print("Will now pass the signal %d to child process %d" % ( incoming_signal, child.pid ) ) + os.kill( child.pid, incoming_signal) + if incoming_signal in [ signal.SIGUSR1,signal.SIGUSR2 ]: + # This is the most important part - we return silently and will be allowed to keep running. 
+ return + else: + sys.exit(1) + + +def _set_signal_handlers(child_procs): + signal_handler = SignalHandler(child_procs) + print("Setting signal handlers in process %d" % os.getpid()) + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGUSR1, signal_handler) + signal.signal(signal.SIGUSR2, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + +def evaluate(command_args, hopt_args): + pass + + +def queue_next_job(): + pass + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/runx/runx.py b/runx/runx.py index 397d4c0..ea9cc1c 100755 --- a/runx/runx.py +++ b/runx/runx.py @@ -61,6 +61,8 @@ help='run interactively instead of submitting to farm') parser.add_argument('--no_cooldir', action='store_true', help='no coolname, no datestring') +parser.add_argument('--no_date', default=False, action='store_true', + help='Don\'t include a date in the name suffix') parser.add_argument('--farm', type=str, default=None, help='Select farm for workstation submission') parser.add_argument('--yml_params', action='store_true', @@ -155,14 +157,20 @@ def do_keyword_expansion(alist, pairs): else: return alist +used_names = set() def make_cool_names(): - tagname = args.tag + '_' if args.tag else '' - datestr = datetime.now().strftime("_%Y.%m.%d_%H.%M") - if args.no_cooldir: - coolname = tagname - else: - coolname = tagname + generate_slug(2) + datestr + valid_name = False + while not valid_name: + tagname = args.tag + '_' if args.tag else '' + datestr = '' if args.no_date else datetime.now().strftime("_%Y.%m.%d_%H.%M") + if args.no_cooldir: + coolname = tagname + else: + coolname = tagname + generate_slug(2) + datestr + valid_name = coolname not in used_names + + used_names.add(coolname) # Experiment directory is the parent of N runs expdir = os.path.join(cfg.LOGROOT, cfg.EXP_NAME) From 68a2ca045b3950fd1eaab0fc80f28e5ca1bc367e Mon Sep 17 00:00:00 2001 From: Mike Ranzinger Date: Fri, 12 Nov 2021 10:30:27 -0800 Subject: [PATCH 10/14] Adding outer init for vscode parsing --- __init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 __init__.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 From ccb77cddd9c0021596a1e853f6cd9a52a0f0d8f8 Mon Sep 17 00:00:00 2001 From: mranzinger Date: Mon, 6 Jun 2022 16:51:20 -0700 Subject: [PATCH 11/14] More usability --- runx/runx.py | 19 +++++++++++-------- runx/utils.py | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/runx/runx.py b/runx/runx.py index ea9cc1c..050c9dc 100755 --- a/runx/runx.py +++ b/runx/runx.py @@ -170,6 +170,9 @@ def make_cool_names(): coolname = tagname + generate_slug(2) + datestr valid_name = coolname not in used_names + if coolname.endswith('_'): + coolname = coolname[:-1] + used_names.add(coolname) # Experiment directory is the parent of N runs @@ -283,15 +286,7 @@ def run_yaml(experiment, runroot): # Run each permutation for hparam_vals in expanded_hparams: - g_job_name, g_logdir, coolname, expdir = make_cool_names() - for replica in range(num_replications): - if num_replications > 1: - job_name = f'{g_job_name}_run_{replica}' - logdir = f'{g_logdir}/run_{replica}' - else: - job_name = g_job_name - logdir = g_logdir hparam_vals = list(hparam_vals) hparam_keys = list(yaml_hparams.keys()) @@ -302,6 +297,14 @@ def run_yaml(experiment, runroot): continue get_tag(hparams) + g_job_name, g_logdir, coolname, expdir = make_cool_names() + if num_replications > 1: + job_name = f'{g_job_name}_run_{replica}' + logdir = 
f'{g_logdir}/run_{replica}' + else: + job_name = g_job_name + logdir = g_logdir + resource_copy = resources.copy() """ A few different modes of operation: diff --git a/runx/utils.py b/runx/utils.py index 142572e..d79d2ca 100755 --- a/runx/utils.py +++ b/runx/utils.py @@ -106,7 +106,8 @@ def expand_environment(config): if part.startswith('$'): env = os.environ.get(part[1:], None) if env is None: - raise ValueError(f'The environment variable ${part} doesn\'t exist!') + # raise ValueError(f'The environment variable {part} doesn\'t exist!') + env = part split_vars[i] = env config = ''.join(split_vars) From c7e4abfdc7bcf414548783266dacd3cf10a6dfc0 Mon Sep 17 00:00:00 2001 From: mranzinger Date: Tue, 7 Jun 2022 13:33:23 -0700 Subject: [PATCH 12/14] Added a lazy option --- .gitignore | 1 + runx/runx.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/.gitignore b/.gitignore index bee8a64..8d35cb3 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ __pycache__ +*.pyc diff --git a/runx/runx.py b/runx/runx.py index 050c9dc..682b97b 100755 --- a/runx/runx.py +++ b/runx/runx.py @@ -67,6 +67,8 @@ help='Select farm for workstation submission') parser.add_argument('--yml_params', action='store_true', help='Hyperparameters are specified via a config yaml as opposed to through the command line.') +parser.add_argument('--lazy', action='store_true', + help='Don\'t run jobs that point to an existing output directory') args = parser.parse_args() @@ -305,6 +307,9 @@ def run_yaml(experiment, runroot): job_name = g_job_name logdir = g_logdir + if args.lazy and os.path.exists(logdir): + continue + resource_copy = resources.copy() """ A few different modes of operation: @@ -339,6 +344,8 @@ def run_yaml(experiment, runroot): print(cmd) continue + + # copy code to NFS-mounted share copy_code(logdir, runroot, code_ignore_patterns) if cfg.YML_PARAMS: From d2e27a66adb041af4d139b03b43e44bcd1074502 Mon Sep 17 00:00:00 2001 From: mranzinger Date: Wed, 8 Jun 2022 07:17:39 -0700 Subject: [PATCH 13/14] Fixed a crash --- runx/logx.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/runx/logx.py b/runx/logx.py index 13f7b21..1d9dc75 100755 --- a/runx/logx.py +++ b/runx/logx.py @@ -377,6 +377,9 @@ def find_checkpoint(checkpoint_dir, find_best=True): if os.path.isfile(checkpoint_dir): return checkpoint_dir + if not os.path.isdir(checkpoint_dir): + return None + if find_best: match_str = r'^best_checkpoint_ep([0-9]+).pth$' else: From 5749fcfbe4effb0bd2f05babd7231e51307d3f5b Mon Sep 17 00:00:00 2001 From: mranzinger Date: Thu, 30 Jun 2022 11:17:24 -0700 Subject: [PATCH 14/14] Use symlink for code replicas --- runx/runx.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/runx/runx.py b/runx/runx.py index 682b97b..e13e760 100755 --- a/runx/runx.py +++ b/runx/runx.py @@ -198,6 +198,7 @@ def copy_code(logdir, runroot, code_ignore_patterns): if code_ignore_patterns is not None: code_ignore_patterns = ignore_patterns(*code_ignore_patterns) copytree(runroot, tgt_code_dir, ignore=code_ignore_patterns) + return tgt_code_dir def write_yml_params(logdir, hparams): @@ -288,6 +289,7 @@ def run_yaml(experiment, runroot): # Run each permutation for hparam_vals in expanded_hparams: + prev_codepath = None for replica in range(num_replications): hparam_vals = list(hparam_vals) @@ -345,9 +347,14 @@ def run_yaml(experiment, runroot): continue + if prev_codepath is None: + # copy code to NFS-mounted share + prev_codepath = copy_code(logdir, runroot, code_ignore_patterns) + else: + os.makedirs(logdir, 
exist_ok=True) + tgt = os.path.join(logdir, 'code') + os.symlink(prev_codepath, tgt) - # copy code to NFS-mounted share - copy_code(logdir, runroot, code_ignore_patterns) if cfg.YML_PARAMS: write_yml_params(logdir, hparams) @@ -358,7 +365,7 @@ def run_yaml(experiment, runroot): if ngc_batch: upload_to_ngc(logdir) - subprocess.call(['chmod', '-R', 'a+rw', expdir]) + # subprocess.call(['chmod', '-R', 'a+rw', expdir]) os.chdir(logdir) if args.interactive:
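Taken together, the series supports invocations along these lines (the flags come from the patches above; the experiment file name, module path, and log layout are illustrative):
```
# Pass hyperparameters via a generated hyper_parameters.yml instead of the
# commandline, and skip any trial whose logdir already exists:
python -m runx.runx sweep.yml --yml_params --lazy
```
With `NUM_REPLICAS: 3` in the experiment yaml, each trial is replicated under `run_0` .. `run_2` of a shared run directory; after the final patch only `run_0` receives a real copy of the code, and the remaining replicas get a `code` symlink pointing at it.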