Error during training: "Expected dtype float for end but got dtype c10::BFloat16"

## System Info

I'm running on an Amazon SageMaker-managed p4d.24xlarge instance, so I'll do my best to provide comprehensive system info below.

### Training image
`763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04`

### Dependencies
```
accelerate==1.1.1
alembic==1.14.0
antlr4-python3-runtime==4.9.3
blinker==1.9.0
cachetools==5.5.0
databricks-sdk==0.38.0
datasets==3.1.0
deprecated==1.2.15
Flask==3.1.0
gitdb==4.0.11
gitpython==3.1.43
google-auth==2.36.0
graphene==3.4.3
graphql-core==3.2.5
graphql-relay==3.2.0
gunicorn==23.0.0
huggingface_hub==0.26.3
itsdangerous==2.2.0
Mako==1.3.7
mlflow==2.18.0
mlflow-skinny==2.18.0
omegaconf==2.3.0
opentelemetry-api==1.28.2
opentelemetry-sdk==1.28.2
opentelemetry-semantic-conventions==0.49b2
peft==0.13.2
pyasn1-modules==0.4.1
sagemaker-mlflow==0.1.0
smmap==5.0.1
sqlalchemy==2.0.36
sqlparse==0.5.2
tokenizers==0.20.3
transformers==4.46.3
trl==0.12.1
Werkzeug==3.1.3
```

### SageMaker training config
```
{
    "additional_framework_parameters": {
        "sagemaker_instance_type": "ml.p4d.24xlarge",
        "sagemaker_torch_distributed_enabled": true
    },
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-1",
    "current_instance_group": "homogeneousCluster",
    "current_instance_group_hosts": [
        "algo-1"
    ],
    "current_instance_type": "ml.p4d.24xlarge",
    "distribution_hosts": [
        "algo-1"
    ],
    "distribution_instance_groups": [
        "homogeneousCluster"
    ],
    "framework_module": "sagemaker_pytorch_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {},
    "input_config_dir": "/opt/ml/input/config",
    "input_data_config": {
        "test": {
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None"
        },
        "train": {
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None"
        }
    },
    "input_dir": "/opt/ml/input",
    "instance_groups": [
        "homogeneousCluster"
    ],
    "instance_groups_dict": {
        "homogeneousCluster": {
            "instance_group_name": "homogeneousCluster",
            "instance_type": "ml.p4d.24xlarge",
            "hosts": [
                "algo-1"
            ]
        }
    },
    "is_hetero": false,
    "is_master": true,
    "is_modelparallel_enabled": null,
    "is_smddpmprun_installed": false,
    "is_smddprun_installed": true,
    "job_name": "***",
    "log_level": 20,
    "master_hostname": "algo-1",
    "model_dir": "/opt/ml/model",
    "module_dir": "s3://***/sourcedir.tar.gz",
    "module_name": "train",
    "network_interface_name": "eth0",
    "num_cpus": 96,
    "num_gpus": 8,
    "num_neurons": 0,
    "output_data_dir": "/opt/ml/output/data",
    "output_dir": "/opt/ml/output",
    "output_intermediate_dir": "/opt/ml/output/intermediate",
    "resource_config": {
        "current_host": "algo-1",
        "current_instance_type": "ml.p4d.24xlarge",
        "current_group_name": "homogeneousCluster",
        "hosts": [
            "algo-1"
        ],
        "instance_groups": [
            {
                "instance_group_name": "homogeneousCluster",
                "instance_type": "ml.p4d.24xlarge",
                "hosts": [
                    "algo-1"
                ]
            }
        ],
        "network_interface_name": "eth0"
    },
    "user_entry_point": "train.py"
}
```

### Environment variables
```
SM_HOSTS=["algo-1"]
SM_NETWORK_INTERFACE_NAME=eth0
SM_HPS=
{}
SM_USER_ENTRY_POINT=train.py
SM_FRAMEWORK_PARAMS=
{
    "sagemaker_instance_type": "ml.p4d.24xlarge",
    "sagemaker_torch_distributed_enabled": true
}
SM_RESOURCE_CONFIG=
{
    "current_group_name": "homogeneousCluster",
    "current_host": "algo-1",
    "current_instance_type": "ml.p4d.24xlarge",
    "hosts": [
        "algo-1"
    ],
    "instance_groups": [
        {
            "hosts": [
                "algo-1"
            ],
            "instance_group_name": "homogeneousCluster",
            "instance_type": "ml.p4d.24xlarge"
        }
    ],
    "network_interface_name": "eth0"
}
SM_INPUT_DATA_CONFIG=
{
    "test": {
        "RecordWrapperType": "None",
        "S3DistributionType": "FullyReplicated",
        "TrainingInputMode": "File"
    },
    "train": {
        "RecordWrapperType": "None",
        "S3DistributionType": "FullyReplicated",
        "TrainingInputMode": "File"
    }
}
SM_OUTPUT_DATA_DIR=/opt/ml/output/data
SM_CHANNELS=["test","train"]
SM_CURRENT_HOST=algo-1
SM_CURRENT_INSTANCE_TYPE=ml.p4d.24xlarge
SM_CURRENT_INSTANCE_GROUP=homogeneousCluster
SM_CURRENT_INSTANCE_GROUP_HOSTS=["algo-1"]
SM_INSTANCE_GROUPS=["homogeneousCluster"]
SM_INSTANCE_GROUPS_DICT=
{
    "homogeneousCluster": {
        "hosts": [
            "algo-1"
        ],
        "instance_group_name": "homogeneousCluster",
        "instance_type": "ml.p4d.24xlarge"
    }
}
SM_DISTRIBUTION_INSTANCE_GROUPS=["homogeneousCluster"]
SM_IS_HETERO=false
SM_MODULE_NAME=train
SM_LOG_LEVEL=20
SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main
SM_INPUT_DIR=/opt/ml/input
SM_INPUT_CONFIG_DIR=/opt/ml/input/config
SM_OUTPUT_DIR=/opt/ml/output
SM_NUM_CPUS=96
SM_NUM_GPUS=8
SM_NUM_NEURONS=0
SM_MODEL_DIR=/opt/ml/model
SM_MODULE_DIR=s3://***/sourcedir.tar.gz
SM_TRAINING_ENV=
{
    "additional_framework_parameters": {
        "sagemaker_instance_type": "ml.p4d.24xlarge",
        "sagemaker_torch_distributed_enabled": true
    },
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-1",
    "current_instance_group": "homogeneousCluster",
    "current_instance_group_hosts": [
        "algo-1"
    ],
    "current_instance_type": "ml.p4d.24xlarge",
    "distribution_hosts": [
        "algo-1"
    ],
    "distribution_instance_groups": [
        "homogeneousCluster"
    ],
    "framework_module": "sagemaker_pytorch_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {},
    "input_config_dir": "/opt/ml/input/config",
    "input_data_config": {
        "test": {
            "RecordWrapperType": "None",
            "S3DistributionType": "FullyReplicated",
            "TrainingInputMode": "File"
        },
        "train": {
            "RecordWrapperType": "None",
            "S3DistributionType": "FullyReplicated",
            "TrainingInputMode": "File"
        }
    },
    "input_dir": "/opt/ml/input",
    "instance_groups": [
        "homogeneousCluster"
    ],
    "instance_groups_dict": {
        "homogeneousCluster": {
            "hosts": [
                "algo-1"
            ],
            "instance_group_name": "homogeneousCluster",
            "instance_type": "ml.p4d.24xlarge"
        }
    },
    "is_hetero": false,
    "is_master": true,
    "is_modelparallel_enabled": null,
    "is_smddpmprun_installed": false,
    "is_smddprun_installed": true,
    "job_name": "***",
    "log_level": 20,
    "master_hostname": "algo-1",
    "model_dir": "/opt/ml/model",
    "module_dir": "s3://***/sourcedir.tar.gz",
    "module_name": "train",
    "network_interface_name": "eth0",
    "num_cpus": 96,
    "num_gpus": 8,
    "num_neurons": 0,
    "output_data_dir": "/opt/ml/output/data",
    "output_dir": "/opt/ml/output",
    "output_intermediate_dir": "/opt/ml/output/intermediate",
    "resource_config": {
        "current_group_name": "homogeneousCluster",
        "current_host": "algo-1",
        "current_instance_type": "ml.p4d.24xlarge",
        "hosts": [
            "algo-1"
        ],
        "instance_groups": [
            {
                "hosts": [
                    "algo-1"
                ],
                "instance_group_name": "homogeneousCluster",
                "instance_type": "ml.p4d.24xlarge"
            }
        ],
        "network_interface_name": "eth0"
    },
    "user_entry_point": "train.py"
}
SM_USER_ARGS=[]
SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
SM_CHANNEL_TEST=/opt/ml/input/data/test
SM_CHANNEL_TRAIN=/opt/ml/input/data/train
PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python310.zip:/opt/conda/lib/python3.10:/opt/conda/lib/python3.10/lib-dynload:/opt/conda/lib/python3.10/site-packages
```
### SFT Config
```
SFTConfig(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
attn_implementation=flash_attention_2,
auto_find_batch_size=False,
average_tokens_across_devices=False,
aws_profile=,
batch_eval_metrics=False,
bf16=auto,
bf16_full_eval=False,
chars_per_token=<CHARS_PER_TOKEN>,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
dataset_batch_size=1000,
dataset_kwargs=None,
dataset_name=,
dataset_num_proc=8,
dataset_test_split=,
dataset_text_field=,
dataset_train_split=,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
delete_ckpts=False,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
early_stopping_patience=10,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_exampleset_info_path=,
eval_exampleset_path=,
eval_on_start=True,
eval_packing=False,
eval_steps=10,
eval_strategy=steps,
eval_use_gather_object=False,
evaluation_strategy=None,
exampleset_info_path=,
exampleset_path=,
force_tokenize_data=False,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[<FSDPOption.FULL_SHARD: 'full_shard'>, <FSDPOption.AUTO_WRAP: 'auto_wrap'>, <FSDPOption.OFFLOAD: 'offload'>],
fsdp_config={'limit_all_gathers': True, 'backward_prefetch': 'backward_pre', 'forward_prefetch': 'false', 'use_orig_params': 'false', 'min_num_params': 0, 'activation_checkpointing': 'true', 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointing=False,
gradient_checkpointing_kwargs={'use_reentrant': True},
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_for_metrics=[],
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=2e-05,
length_column_name=length,
load_best_model_at_end=True,
load_hf_data=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/opt/ml/model/runs/Dec04_23-27-49_algo-1,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=1,
logging_strategy=steps,
lora_alpha=32,
lora_dropout=0.05,
lora_r=16,
lora_target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj'],
lr_scheduler_kwargs=
{}
,
lr_scheduler_type=cosine,
mask_instructions=True,
max_grad_norm=1.0,
max_seq_length=4096,
max_steps=1000,
meta_data=
{}
,
metric_for_best_model=loss,
mlflow_experiment_name=***,
mlflow_run_name=***,
mlflow_tracking_uri=***,
model_name_or_path=Qwen/Qwen2.5-7B-Instruct,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_of_sequences=1024,
num_train_epochs=3,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=/opt/ml/model,
overwrite_output_dir=False,
packing=False,
past_index=-1,
per_device_eval_batch_size=2,
per_device_train_batch_size=2,
prediction_loss_only=False,
preprocessed_data_path=,
preprocessed_eval_data_path=,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['mlflow'],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=/opt/ml/model,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=20,
save_strategy=steps,
save_total_limit=None,
seed=42,
skip_memory_metrics=True,
smoke_test=False,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torch_dtype=bfloat16,
torch_empty_cache_steps=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_liger_kernel=False,
use_mps_device=False,
use_peft=False,
val_set_size=0.0,
warmup_ratio=0.1,
warmup_steps=100,
weight_decay=0.0,
)
```

## Who can help?

@muellerz @SunMarc @ArthurZucker 

## Information

- [ ] The official example scripts
- [x] My own modified scripts

## Tasks

- [ ] An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- [x] My own task or dataset (give details below)

## Reproduction

Below are my training script, the SageMaker estimator object, and the stack trace showing the error that I'm getting. A curious (but perhaps irrelevant) detail is that I've reproduced this twice, and both times the error occurred during training on the 257th step (with `max_steps=1000`).

### Training script
```
import os
import glob
import json
import logging

from datasets import load_dataset
from accelerate import Accelerator
from accelerate.logging import get_logger
from datasets import load_dataset
import mlflow
from omegaconf import DictConfig, OmegaConf
import pprint
from peft.tuners.lora import LoraConfig
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, EarlyStoppingCallback


def main(cfg: DictConfig):
    # Entry point of the reproduction script: loads the model and tokenizer,
    # tokenizes the train/eval datasets, builds an SFTTrainer, trains, and saves.
    # NOTE(review): `sft_config` is used throughout but never defined in this
    # excerpt; presumably it is built from `cfg` in code the reporter omitted.

    # No PEFT config is passed to the trainer (the SFT config dump in this
    # report also shows use_peft=False).
    peft_config = None

    # Keyword arguments forwarded to from_pretrained below.
    # NOTE(review): the SFT config dump shows torch_dtype=bfloat16, and the
    # stack trace fails in exp_avg.lerp_(grad, ...) with a float vs BFloat16
    # mismatch. Loading the weights directly in bf16 under FSDP looks like a
    # plausible contributor -- confirm before relying on this.
    model_kwargs = dict(
        attn_implementation=sft_config.attn_implementation,
        torch_dtype=sft_config.torch_dtype,
        #use_cache=not sft_config.gradient_checkpointing,
        # use_cache is turned off whenever either checkpointing flag is truthy.
        # NOTE(review): the config dump prints fsdp_config as a dict whose
        # 'activation_checkpointing' value is the string 'true'; attribute
        # access here presumably relies on code not shown -- verify.
        use_cache=not (sft_config.gradient_checkpointing or sft_config.fsdp_config.activation_checkpointing)
    )
    model = AutoModelForCausalLM.from_pretrained(sft_config.model_name_or_path, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(sft_config.model_name_or_path, use_fast=True)
    # Reuse the EOS token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    
    # Marker strings handed to the completion-only collator; presumably they
    # delimit user and assistant turns in the rendered chat template so that
    # only assistant text contributes to the loss -- confirm against TRL docs.
    instruction_template = "user\n"
    response_template = "assistant\n"
    data_collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)
    # Dataset arguments were redacted by the reporter (`*` is a placeholder).
    train_dataset = load_dataset(*)
    eval_dataset = load_dataset(*)

    
    def tokenize(examples):
        # Batched map function: renders each conversation in the "messages"
        # column to text with the tokenizer's chat template, then tokenizes
        # the whole batch.
        conversations = examples["messages"]
        input_text = []
        for conv in conversations:
            # tokenize=False: get the rendered string, not token ids.
            text = tokenizer.apply_chat_template(conv, tokenize=False)
            input_text.append(text)
        # Every example is truncated and padded to max_seq_length.
        tokenized = tokenizer(
            input_text,
            truncation=True,
            max_length=sft_config.max_seq_length,
            padding='max_length',
            return_tensors=None,
        )
        # Labels start as a copy of the input ids (padding positions included).
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized
    

    # Replace the original columns with the tokenized fields.
    train_dataset = train_dataset.map(
        tokenize,
        remove_columns=train_dataset.column_names,
        batched=True,
        desc="Tokenizing train dataset",
    )
    
    # Arguments redacted by the reporter; presumably mirrors the train mapping.
    eval_dataset = eval_dataset.map(*)
    
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=sft_config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="messages",
        peft_config=peft_config,
        dataset_kwargs=sft_config.dataset_kwargs,
        data_collator=data_collator,
    )

    # Register early stopping only when a positive patience is configured.
    if hasattr(sft_config, "early_stopping_patience") and sft_config.early_stopping_patience > 0:
        callback = EarlyStoppingCallback(sft_config.early_stopping_patience)
        trainer.add_callback(callback)

    # Per the stack trace in this report, the error is raised inside this call.
    trainer.train()
    trainer.save_model()
    
# Script entry point: load the OmegaConf YAML config from the working
# directory and hand it to main().
if __name__ == "__main__":
    cfg = OmegaConf.load('train_config.yaml')
    main(cfg)
```

### SageMaker / HuggingFace estimator
```
# SageMaker HuggingFace estimator that launches train.py on a single
# ml.p4d.24xlarge instance.
huggingface_estimator = HuggingFace(
    entry_point          = 'train.py',      
    dependencies=['requirements.txt'],         
    source_dir           = './',           
    instance_type        = 'ml.p4d.24xlarge', 
    instance_count       = 1,              
    # 2*24*60*60 = 172800 (two days if the unit is seconds -- confirm).
    max_run              = 2*24*60*60,       
    base_job_name        = job_name,         
    role                 = role,      
    volume_size          = 1024,         
    # These versions match the training image named in this report
    # (2.1.0-transformers4.36.0). The dependency list pins
    # transformers==4.46.3, presumably installed on top via
    # requirements.txt -- confirm.
    transformers_version = '4.36.0',          
    pytorch_version      = '2.1.0',          
    py_version           = 'py310',          
    disable_output_compression = True,  
    # Corresponds to "sagemaker_torch_distributed_enabled": true in the
    # training config shown in this report.
    distribution={"torch_distributed": {"enabled": True}},  
    # Environment variables passed to the training container. The
    # ACCELERATE_USE_FSDP / FSDP_* entries presumably configure Accelerate's
    # FSDP plugin -- verify against the Accelerate documentation.
    environment  = {
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache", 
        "HF_TOKEN": HfFolder.get_token(),   
        "ACCELERATE_USE_FSDP": "1",       
        "FSDP_CPU_RAM_EFFICIENT_LOADING": "0",   
        "FSDP_AUTO_WRAP_POLICY": "TRANSFORMER_BASED_WRAP",
        "FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
        "FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
        "NCCL_TIMEOUT": "3600", 
        "NCCL_DEBUG": "WARN",  
        "NCCL_IB_TIMEOUT": "3600",
        "NCCL_SOCKET_TIMEOUT": "3600",
        "NCCL_ASYNC_ERROR_HANDLING": "1",
        "NCCL_P2P_LEVEL": "NVL",
        "CUDA_DEVICE_MAX_CONNECTIONS": "1",        
        "MAX_JOBS": "1",                           
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
        "TORCH_DISTRIBUTED_DEBUG": "DETAIL",   
        "HF_DATASETS_CACHE": "/opt/ml/input",
        "TRANSFORMERS_CACHE": "/opt/ml/input"
    },
    # Bucket name redacted by the reporter (`***` is a placeholder).
    checkpoint_s3_uri=f's3://{***}/checkpoints'
)
```



### Stack trace
```
Traceback (most recent call last):
  File "/opt/ml/code/train.py", line 259, in <module>
    main(cfg)
  File "/opt/ml/code/train.py", line 242, in main
    trainer.train()
  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2123, in train
    return inner_training_loop(
  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2534, in _inner_training_loop
    self.optimizer.step()
  File "/opt/conda/lib/python3.10/site-packages/accelerate/optimizer.py", line 171, in step
    self.optimizer.step(closure)
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
    return wrapped(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
    out = func(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
    ret = func(self, *args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 184, in step
    adamw(
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 335, in adamw
    func(
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 412, in _single_tensor_adamw
    exp_avg.lerp_(grad, 1 - beta1)
RuntimeError: expected dtype float for `end` but got dtype c10::BFloat16
```

## Expected behavior

Training should complete successfully without encountering errors.

## Related issues
1. https://github.com/huggingface/transformers/issues/34702 - this seems to be the exact same issue and was active as recently as 3 weeks ago. It was closed only two days ago.
2. https://github.com/SqueezeAILab/LLM2LLM/issues/5
3. https://discuss.huggingface.co/t/errors-when-using-gradient-accumulation-with-fsdp-peft-lora-sfttrainer/105006

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Error during training: "Expected dtype float for end but got dtype c10::BFloat16" #35106

System Info

Training image

Dependencies

SageMaker training config

Environment variables

SFT Config

Who can help?

Information

Tasks

Reproduction

Training script

SageMaker / HuggingFace estimator

Stack trace

Expected behavior

Related issues

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Error during training: "Expected dtype float for end but got dtype c10::BFloat16" #35106

Description

System Info

Training image

Dependencies

SageMaker training config

Environment variables

SFT Config

Who can help?

Information

Tasks

Reproduction

Training script

SageMaker / HuggingFace estimator

Stack trace

Expected behavior

Related issues

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions