You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
When training on 16 or 64 nodes, the following TimeoutError occurs. This issue does not arise when using 4 or 8 nodes.
Traceback (most recent call last):
File "/opt/NeMo/examples/multimodal/multimodal_llm/neva/neva_pretrain.py", line 19, in <module>
from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel
File "/opt/NeMo/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py", line 25, in <module>
from pytorch_lightning.trainer.trainer import Trainer
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/__init__.py", line 27, in <module>
from pytorch_lightning.callbacks import Callback # noqa: E402
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/__init__.py", line 14, in <module>
from pytorch_lightning.callbacks.batch_size_finder import BatchSizeFinder
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/batch_size_finder.py", line 26, in <module>
from pytorch_lightning.callbacks.callback import Callback
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/callback.py", line 22, in <module>
from pytorch_lightning.utilities.types import STEP_OUTPUT
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/utilities/types.py", line 41, in <module>
from torchmetrics import Metric
File "/usr/local/lib/python3.10/dist-packages/torchmetrics/__init__.py", line 26, in <module>
from torchmetrics import functional # noqa: E402
File "/usr/local/lib/python3.10/dist-packages/torchmetrics/functional/__init__.py", line 14, in <module>
from torchmetrics.functional.audio._deprecated import _permutation_invariant_training as permutation_invariant_training
File "/usr/local/lib/python3.10/dist-packages/torchmetrics/functional/audio/__init__.py", line 14, in <module>
from torchmetrics.functional.audio.pit import permutation_invariant_training, pit_permutate
File "/usr/local/lib/python3.10/dist-packages/torchmetrics/functional/audio/pit.py", line 22, in <module>
from torchmetrics.utilities import rank_zero_warn
File "/usr/local/lib/python3.10/dist-packages/torchmetrics/utilities/__init__.py", line 14, in <module>
from torchmetrics.utilities.checks import check_forward_full_state_property
File "/usr/local/lib/python3.10/dist-packages/torchmetrics/utilities/checks.py", line 25, in <module>
from torchmetrics.metric import Metric
File "/usr/local/lib/python3.10/dist-packages/torchmetrics/metric.py", line 42, in <module>
from torchmetrics.utilities.plot import _AX_TYPE, _PLOT_OUT_TYPE, plot_single_or_multi_val
File "/usr/local/lib/python3.10/dist-packages/torchmetrics/utilities/plot.py", line 26, in <module>
import matplotlib.axes
File "/usr/local/lib/python3.10/dist-packages/matplotlib/axes/__init__.py", line 1, in <module>
from . import _base
File "/usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.py", line 13, in <module>
from matplotlib import _api, cbook, _docstring, offsetbox
File "/usr/local/lib/python3.10/dist-packages/matplotlib/offsetbox.py", line 33, in <module>
import matplotlib.text as mtext
File "/usr/local/lib/python3.10/dist-packages/matplotlib/text.py", line 16, in <module>
from .font_manager import FontProperties
File "/usr/local/lib/python3.10/dist-packages/matplotlib/font_manager.py", line 1582, in <module>
fontManager = _load_fontmanager()
File "/usr/local/lib/python3.10/dist-packages/matplotlib/font_manager.py", line 1577, in _load_fontmanager
json_dump(fm, fm_path)
File "/usr/local/lib/python3.10/dist-packages/matplotlib/font_manager.py", line 963, in json_dump
with cbook._lock_path(filename), open(filename, 'w') as fh:
File "/usr/lib/python3.10/contextlib.py", line 135, in __enter__
return next(self.gen)
File "/usr/local/lib/python3.10/dist-packages/matplotlib/cbook.py", line 1826, in _lock_path
raise TimeoutError("""\
TimeoutError: Lock error: Matplotlib failed to acquire the following lock file:
/root/.cache/matplotlib/fontlist-v330.json.matplotlib-lock
This maybe due to another process holding this lock file. If you are sure no
other Matplotlib process is running, remove this file and try again.
My code is running with
Megatron-LM: from /opt/megatron-lm in NeMo Framework Container dev version (nvcr.io/nvidia/nemo:dev)
When training on 16 or 64 nodes, the following TimeoutError occurs. This issue does not arise when using 4 or 8 nodes.
My code is running with
/opt/megatron-lm
in NeMo Framework Container dev version (nvcr.io/nvidia/nemo:dev)
The text was updated successfully, but these errors were encountered: