Add support for BF16 optim state in SkipStepAdamW #148

Draft · wants to merge 2 commits into base: main
CHANGELOG.md (1 change: 1 addition & 0 deletions)
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added the option to throttle checkpoint uploads to one rank from each node at a time.
- Added `unshard_strategy` parameter to `unshard_checkpoint()` function in `olmo_core.distributed.checkpoint`.
- Added function `load_keys()` to `olmo_core.distributed.checkpoint`.
- Added support for low precision optim state in `SkipStepAdamW`.

### Changed

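Usage sketch (an addition for illustration, not part of this PR's diff), based on the config and test changes below: the low-precision state is opted into through the new `dtype` field on `SkipStepAdamWConfig`. A CUDA device is assumed here, since the test below skips bf16 on CPU.

```python
import torch.nn as nn

from olmo_core.config import DType
from olmo_core.optim import SkipStepAdamWConfig

# Hypothetical model; any nn.Module works with config.build().
model = nn.Linear(16, 16).cuda()

# Keep the AdamW moment buffers (exp_avg / exp_avg_sq) in bfloat16.
config = SkipStepAdamWConfig(dtype=DType.bfloat16)
optim = config.build(model)
```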
src/olmo_core/optim/adamw.py (14 changes: 10 additions & 4 deletions)
@@ -1,9 +1,10 @@
from dataclasses import dataclass
from typing import Optional, Tuple, Type
from typing import Optional, Tuple, Type, Union

import torch
import torch.nn as nn

from ..config import DType
from .config import OptimConfig
from .skip_step_optimizer import SkipStepOptimizer

@@ -29,7 +30,7 @@ def adamw_step(
p.mul_(1 - step_factor * (lr * weight_decay))

# Decay the first and second moment running average coefficient.
exp_avg.lerp_(p.grad, step_factor * (1 - beta1))
exp_avg.lerp_(p.grad.type_as(exp_avg), (step_factor * (1 - beta1)).type_as(exp_avg))
exp_avg_sq.mul_(1 - step_factor * (1 - beta2))
exp_avg_sq.add_(step_factor * p.grad * p.grad, alpha=1 - beta2)

@@ -61,6 +62,7 @@ def __init__(
fused: Optional[bool] = None,
rolling_interval_length: int = 128,
sigma_factor: int = 6,
dtype: Optional[Union[torch.dtype, DType]] = None,
) -> None:
assert lr > 0.0
assert all([0.0 <= beta <= 1.0 for beta in betas])
@@ -73,6 +75,9 @@ def __init__(
rolling_interval_length=rolling_interval_length,
sigma_factor=sigma_factor,
)
if isinstance(dtype, DType):
dtype = dtype.as_pt()
self.dtype = dtype
self._step_skipped: Optional[torch.Tensor] = None

@property
@@ -98,8 +103,8 @@ def step(self, closure=None) -> None:
state = self.state[p]
if len(state) == 0:
state["step"] = torch.tensor(0.0, dtype=torch.float32, device=p.device)
state["exp_avg"] = torch.zeros_like(p)
state["exp_avg_sq"] = torch.zeros_like(p)
state["exp_avg"] = torch.zeros_like(p, dtype=self.dtype)
state["exp_avg_sq"] = torch.zeros_like(p, dtype=self.dtype)

adamw_step(
p,
@@ -144,6 +149,7 @@ class SkipStepAdamWConfig(OptimConfig):
weight_decay: float = 1e-2
rolling_interval_length: int = 128
sigma_factor: int = 6
dtype: Optional[DType] = None

@classmethod
def optimizer(cls) -> Type[SkipStepAdamW]:
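A minimal standalone sketch (an illustration, not part of the PR) of why the `type_as` casts in `adamw_step` matter once the state can live in bfloat16: an in-place PyTorch op cannot downcast a float32 result into a bfloat16 buffer, so both the gradient and the tensor-valued lerp weight have to be cast to the state's dtype first.

```python
import torch

# Mixed-precision first-moment update: bf16 state, fp32 gradient (assumed shapes/values).
exp_avg = torch.zeros(4, dtype=torch.bfloat16)  # bf16 optimizer state
grad = torch.randn(4)                           # fp32 gradient
step_factor = torch.tensor(1.0)                 # fp32 tensor, as in the skip-step logic
beta1 = 0.9

# Without the casts, `exp_avg.lerp_(grad, step_factor * (1 - beta1))` raises a
# RuntimeError: the promoted fp32 result cannot be written into the bf16 buffer.
exp_avg.lerp_(grad.type_as(exp_avg), (step_factor * (1 - beta1)).type_as(exp_avg))
print(exp_avg.dtype)  # torch.bfloat16
```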
src/test/optim/adamw_test.py (13 changes: 10 additions & 3 deletions)
@@ -1,11 +1,14 @@
from test.utils import DEVICES
from typing import Optional

import pytest
import torch
import torch.nn as nn

from olmo_core.config import DType
from olmo_core.optim import AdamWConfig, OptimGroupOverride, SkipStepAdamWConfig

from ..utils import DEVICES


class MyModel(nn.Module):
def __init__(self):
@@ -64,8 +67,12 @@ def test_adamw(device: torch.device):


@pytest.mark.parametrize("device", DEVICES)
def test_skip_step_adamw(device: torch.device):
config = SkipStepAdamWConfig()
@pytest.mark.parametrize("dtype", [None, DType.bfloat16])
def test_skip_step_adamw(device: torch.device, dtype: Optional[DType]):
if dtype == DType.bfloat16 and device.type == "cpu":
pytest.skip("bfloat16 dtype requires cuda")

config = SkipStepAdamWConfig(dtype=dtype)
model = MyModel().train().to(device)
optim = config.build(model)

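As a follow-up sketch (hypothetical, not part of this diff): one way to confirm the new behavior outside the test harness is to build the optimizer with `DType.bfloat16`, drive one step, and check the dtype of the moment buffers. This assumes a CUDA device (the test skips bf16 on CPU) and assumes the skip-step optimizer is fed the latest loss via a `latest_loss` attribute before `step()`.

```python
import torch
import torch.nn as nn

from olmo_core.config import DType
from olmo_core.optim import SkipStepAdamWConfig

device = torch.device("cuda")
model = nn.Linear(8, 8).to(device)
optim = SkipStepAdamWConfig(dtype=DType.bfloat16).build(model)

loss = model(torch.randn(2, 8, device=device)).sum()
loss.backward()
optim.latest_loss = loss.detach()  # assumption: the skip-step logic needs the loss to decide whether to skip
optim.step()

# After the first step, the moment buffers should be allocated in bfloat16.
for group in optim.param_groups:
    for p in group["params"]:
        state = optim.state[p]
        assert state["exp_avg"].dtype == torch.bfloat16
        assert state["exp_avg_sq"].dtype == torch.bfloat16
```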