tensorflow · LINYV0719 · Dec 29, 2025
@@ -273,15 +273,35 @@ def train(self, steps: int, checkpoint_at_completion: bool = True):
     """
     self._require("trainer", for_method="train")
 
-    # TODO(momernick): Support steps=None or -1 (training to exhaustion).
     current_step = self.global_step.numpy()  # Cache, since this is expensive.
-    _log(f"train | step: {current_step: 6d} | training until step {steps}...")
-    while current_step < steps:
-      # Calculates steps to run for the next train loop.
-      num_steps = min(steps - current_step, self.steps_per_loop)
-      self._train_n_steps(num_steps)
+
+    if steps == -1:
+      _log(f"train | step: {current_step: 6d} | training until exhaustion...")
+    else:
+      _log(f"train | step: {current_step: 6d} | training until step {steps}...")
+
+    while steps == -1 or current_step < steps:
+    # Calculates steps to run for the next train loop.
+      if steps == -1:
+        num_steps = self.steps_per_loop
+      else:
+        num_steps = min(steps - current_step, self.steps_per_loop)
+
+      try:
+        self._train_n_steps(num_steps)
+      except (tf.errors.OutOfRangeError, StopIteration):
+        _log("Training stopped because the underlying iterator is exhausted.")
+        break
+
       self._maybe_save_checkpoint()
-      current_step = self.global_step.numpy()
+
+      new_step = self.global_step.numpy()
+      # Stop if the iterator is exhausted (step count didn't increase as expected).
+      if new_step < current_step + num_steps:
+        _log("Training stopped because the underlying iterator is exhausted.")
+        break
+
+      current_step = new_step
 
     if checkpoint_at_completion:
       self._maybe_save_checkpoint(check_interval=False)

@@ -827,6 +827,30 @@ def __call__(self, output):
       self.assertIn("eval_loss", output)
       self.assertGreaterEqual(output["eval_loss"], 0)
 
+  def test_train_until_exhaustion(self):
+      test_runner = TestRunner()
+
+      # Create a finite dataset with only 10 elements (batch size 10) -> 1 step.
+      def finite_dataset_fn(ctx):
+          del ctx
+          inputs = np.zeros((10, 3), dtype=np.float32)
+          targets = np.ones((10, 4), dtype=np.float32)
+          dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
+          dataset = dataset.batch(1)
+          return dataset
+
+      test_runner.train_dataset = (
+          test_runner.strategy.distribute_datasets_from_function(finite_dataset_fn))
+
+      test_controller = controller.Controller(
+          trainer=test_runner,
+          global_step=test_runner.global_step,
+          steps_per_loop=1)
+
+      # Passing -1 should run until the dataset is exhausted (1 step).
+      test_controller.train(steps=-1)
+      self.assertEqual(test_runner.global_step.numpy(), 11)
+
   def test_step_per_loop_callable(self):
     test_runner = TestRunner()