diff --git a/REQUIREMENTS-DOCKER.txt b/REQUIREMENTS-DOCKER.txt index 5952b70..3884f08 100644 --- a/REQUIREMENTS-DOCKER.txt +++ b/REQUIREMENTS-DOCKER.txt @@ -4,6 +4,6 @@ scipy tables pandas pyro-ppl>=0.3.2 -torch +torch>=1.9.0 scikit-learn matplotlib diff --git a/cellbender/remove_background/argparse.py b/cellbender/remove_background/argparse.py index c1b2781..a52a3eb 100644 --- a/cellbender/remove_background/argparse.py +++ b/cellbender/remove_background/argparse.py @@ -139,11 +139,11 @@ def add_subparser_args(subparsers: argparse) -> argparse: "do not exceed 1e-3). (default: %(default)s)") subparser.add_argument("--final-elbo-fail-fraction", type=float, help="Training is considered to have failed if " - "final_training_ELBO >= best_training_ELBO*(1+FINAL_ELBO_FAIL_FRACTION). " + "(best_test_ELBO - final_test_ELBO)/(best_test_ELBO - initial_train_ELBO) > FINAL_ELBO_FAIL_FRACTION. " "(default: do not fail training based on final_training_ELBO)") subparser.add_argument("--epoch-elbo-fail-fraction", type=float, help="Training is considered to have failed if " - "current_epoch_training_ELBO >= previous_epoch_training_ELBO*(1+EPOCH_ELBO_FAIL_FRACTION). " + "(previous_epoch_test_ELBO - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO) > EPOCH_ELBO_FAIL_FRACTION. " "(default: do not fail training based on epoch_training_ELBO)") subparser.add_argument("--num-training-tries", type=int, default=1, help="Number of times to attempt to train the model. 
Each subsequent " diff --git a/cellbender/remove_background/train.py b/cellbender/remove_background/train.py index 4b289c0..0f47f18 100644 --- a/cellbender/remove_background/train.py +++ b/cellbender/remove_background/train.py @@ -167,10 +167,12 @@ def run_training(model: RemoveBackgroundPyroModel, logging.info("[epoch %03d] average test loss: %.4f" % (epoch, total_epoch_loss_test)) if epoch_elbo_fail_fraction is not None and len(test_elbo) > 1 and \ - -test_elbo[-1] >= -test_elbo[-2] * (1 + epoch_elbo_fail_fraction): + test_elbo[-1] < test_elbo[-2] and \ + (test_elbo[-2] - test_elbo[-1])/(test_elbo[-2] - train_elbo[0]) > epoch_elbo_fail_fraction: logging.info( - "Training failed because this test loss (%.4f) exceeds previous test loss(%.4f) by >= %.2f%%" % - (test_elbo[-1], test_elbo[-2], 100*epoch_elbo_fail_fraction)) + "Training failed because this test loss (%.4f) exceeds previous test loss(%.4f) by >= %.2f%%, " + "relative to initial train loss %.4f" , + test_elbo[-1], test_elbo[-2], 100*epoch_elbo_fail_fraction, train_elbo[0]) succeeded = False break @@ -178,11 +180,12 @@ def run_training(model: RemoveBackgroundPyroModel, if succeeded and final_elbo_fail_fraction is not None and len(test_elbo) > 1: best_test_elbo = max(test_elbo) - if -test_elbo[-1] >= -best_test_elbo * (1 + final_elbo_fail_fraction): + if test_elbo[-1] < best_test_elbo and \ + (best_test_elbo - test_elbo[-1])/(best_test_elbo - train_elbo[0]) > final_elbo_fail_fraction: logging.info( "Training failed because final test loss (%.4f) exceeds " - "best test loss(%.4f) by >= %.2f%%" % - (test_elbo[-1], best_test_elbo, 100*final_elbo_fail_fraction)) + "best test loss(%.4f) by >= %.2f%%, relative to initial train loss %.4f", + test_elbo[-1], best_test_elbo, 100*final_elbo_fail_fraction, train_elbo[0]) succeeded = False # Exception allows program to continue after ending inference prematurely. 
diff --git a/docker/Dockerfile b/docker/Dockerfile index 0211fe7..9b35e0f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,7 +20,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ENV PATH=/home/user/miniconda/bin:$PATH ENV CONDA_AUTO_UPDATE_CONDA=false -RUN conda install -y pytorch torchvision cudatoolkit -c pytorch \ +RUN conda install -y "pytorch>=1.9.0" torchvision cudatoolkit -c pytorch \ && conda install -y -c anaconda pytables \ && conda clean -ya