Skip to content

Error during training: "Expected dtype float for end but got dtype c10::BFloat16" #35106

@jjbuck

Description

@jjbuck

System Info

I'm running on an Amazon SageMaker-managed p4d.24xlarge instance, so I'll do my best to provide comprehensive system info below.

Training image

763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04

Dependencies

accelerate==1.1.1
alembic==1.14.0
antlr4-python3-runtime==4.9.3
blinker==1.9.0
cachetools==5.5.0
databricks-sdk==0.38.0
datasets==3.1.0
deprecated==1.2.15
Flask==3.1.0
gitdb==4.0.11
gitpython==3.1.43
google-auth==2.36.0
graphene==3.4.3
graphql-core==3.2.5
graphql-relay==3.2.0
gunicorn==23.0.0
huggingface_hub==0.26.3
itsdangerous==2.2.0
Mako==1.3.7
mlflow==2.18.0
mlflow-skinny==2.18.0
omegaconf==2.3.0
opentelemetry-api==1.28.2
opentelemetry-sdk==1.28.2
opentelemetry-semantic-conventions==0.49b2
peft==0.13.2
pyasn1-modules==0.4.1
sagemaker-mlflow==0.1.0
smmap==5.0.1
sqlalchemy==2.0.36
sqlparse==0.5.2
tokenizers==0.20.3
transformers==4.46.3
trl==0.12.1
Werkzeug==3.1.3

SageMaker training config

{
    "additional_framework_parameters": {
        "sagemaker_instance_type": "ml.p4d.24xlarge",
        "sagemaker_torch_distributed_enabled": true
    },
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-1",
    "current_instance_group": "homogeneousCluster",
    "current_instance_group_hosts": [
        "algo-1"
    ],
    "current_instance_type": "ml.p4d.24xlarge",
    "distribution_hosts": [
        "algo-1"
    ],
    "distribution_instance_groups": [
        "homogeneousCluster"
    ],
    "framework_module": "sagemaker_pytorch_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {},
    "input_config_dir": "/opt/ml/input/config",
    "input_data_config": {
        "test": {
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None"
        },
        "train": {
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None"
        }
    },
    "input_dir": "/opt/ml/input",
    "instance_groups": [
        "homogeneousCluster"
    ],
    "instance_groups_dict": {
        "homogeneousCluster": {
            "instance_group_name": "homogeneousCluster",
            "instance_type": "ml.p4d.24xlarge",
            "hosts": [
                "algo-1"
            ]
        }
    },
    "is_hetero": false,
    "is_master": true,
    "is_modelparallel_enabled": null,
    "is_smddpmprun_installed": false,
    "is_smddprun_installed": true,
    "job_name": "***",
    "log_level": 20,
    "master_hostname": "algo-1",
    "model_dir": "/opt/ml/model",
    "module_dir": "s3://***/sourcedir.tar.gz",
    "module_name": "train",
    "network_interface_name": "eth0",
    "num_cpus": 96,
    "num_gpus": 8,
    "num_neurons": 0,
    "output_data_dir": "/opt/ml/output/data",
    "output_dir": "/opt/ml/output",
    "output_intermediate_dir": "/opt/ml/output/intermediate",
    "resource_config": {
        "current_host": "algo-1",
        "current_instance_type": "ml.p4d.24xlarge",
        "current_group_name": "homogeneousCluster",
        "hosts": [
            "algo-1"
        ],
        "instance_groups": [
            {
                "instance_group_name": "homogeneousCluster",
                "instance_type": "ml.p4d.24xlarge",
                "hosts": [
                    "algo-1"
                ]
            }
        ],
        "network_interface_name": "eth0"
    },
    "user_entry_point": "train.py"
}

Environment variables

SM_HOSTS=["algo-1"]
SM_NETWORK_INTERFACE_NAME=eth0
SM_HPS=
{}
SM_USER_ENTRY_POINT=train.py
SM_FRAMEWORK_PARAMS=
{
    "sagemaker_instance_type": "ml.p4d.24xlarge",
    "sagemaker_torch_distributed_enabled": true
}
SM_RESOURCE_CONFIG=
{
    "current_group_name": "homogeneousCluster",
    "current_host": "algo-1",
    "current_instance_type": "ml.p4d.24xlarge",
    "hosts": [
        "algo-1"
    ],
    "instance_groups": [
        {
            "hosts": [
                "algo-1"
            ],
            "instance_group_name": "homogeneousCluster",
            "instance_type": "ml.p4d.24xlarge"
        }
    ],
    "network_interface_name": "eth0"
}
SM_INPUT_DATA_CONFIG=
{
    "test": {
        "RecordWrapperType": "None",
        "S3DistributionType": "FullyReplicated",
        "TrainingInputMode": "File"
    },
    "train": {
        "RecordWrapperType": "None",
        "S3DistributionType": "FullyReplicated",
        "TrainingInputMode": "File"
    }
}
SM_OUTPUT_DATA_DIR=/opt/ml/output/data
SM_CHANNELS=["test","train"]
SM_CURRENT_HOST=algo-1
SM_CURRENT_INSTANCE_TYPE=ml.p4d.24xlarge
SM_CURRENT_INSTANCE_GROUP=homogeneousCluster
SM_CURRENT_INSTANCE_GROUP_HOSTS=["algo-1"]
SM_INSTANCE_GROUPS=["homogeneousCluster"]
SM_INSTANCE_GROUPS_DICT=
{
    "homogeneousCluster": {
        "hosts": [
            "algo-1"
        ],
        "instance_group_name": "homogeneousCluster",
        "instance_type": "ml.p4d.24xlarge"
    }
}
SM_DISTRIBUTION_INSTANCE_GROUPS=["homogeneousCluster"]
SM_IS_HETERO=false
SM_MODULE_NAME=train
SM_LOG_LEVEL=20
SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main
SM_INPUT_DIR=/opt/ml/input
SM_INPUT_CONFIG_DIR=/opt/ml/input/config
SM_OUTPUT_DIR=/opt/ml/output
SM_NUM_CPUS=96
SM_NUM_GPUS=8
SM_NUM_NEURONS=0
SM_MODEL_DIR=/opt/ml/model
SM_MODULE_DIR=s3://***/sourcedir.tar.gz
SM_TRAINING_ENV=
{
    "additional_framework_parameters": {
        "sagemaker_instance_type": "ml.p4d.24xlarge",
        "sagemaker_torch_distributed_enabled": true
    },
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-1",
    "current_instance_group": "homogeneousCluster",
    "current_instance_group_hosts": [
        "algo-1"
    ],
    "current_instance_type": "ml.p4d.24xlarge",
    "distribution_hosts": [
        "algo-1"
    ],
    "distribution_instance_groups": [
        "homogeneousCluster"
    ],
    "framework_module": "sagemaker_pytorch_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {},
    "input_config_dir": "/opt/ml/input/config",
    "input_data_config": {
        "test": {
            "RecordWrapperType": "None",
            "S3DistributionType": "FullyReplicated",
            "TrainingInputMode": "File"
        },
        "train": {
            "RecordWrapperType": "None",
            "S3DistributionType": "FullyReplicated",
            "TrainingInputMode": "File"
        }
    },
    "input_dir": "/opt/ml/input",
    "instance_groups": [
        "homogeneousCluster"
    ],
    "instance_groups_dict": {
        "homogeneousCluster": {
            "hosts": [
                "algo-1"
            ],
            "instance_group_name": "homogeneousCluster",
            "instance_type": "ml.p4d.24xlarge"
        }
    },
    "is_hetero": false,
    "is_master": true,
    "is_modelparallel_enabled": null,
    "is_smddpmprun_installed": false,
    "is_smddprun_installed": true,
    "job_name": "***",
    "log_level": 20,
    "master_hostname": "algo-1",
    "model_dir": "/opt/ml/model",
    "module_dir": "s3://***/sourcedir.tar.gz",
    "module_name": "train",
    "network_interface_name": "eth0",
    "num_cpus": 96,
    "num_gpus": 8,
    "num_neurons": 0,
    "output_data_dir": "/opt/ml/output/data",
    "output_dir": "/opt/ml/output",
    "output_intermediate_dir": "/opt/ml/output/intermediate",
    "resource_config": {
        "current_group_name": "homogeneousCluster",
        "current_host": "algo-1",
        "current_instance_type": "ml.p4d.24xlarge",
        "hosts": [
            "algo-1"
        ],
        "instance_groups": [
            {
                "hosts": [
                    "algo-1"
                ],
                "instance_group_name": "homogeneousCluster",
                "instance_type": "ml.p4d.24xlarge"
            }
        ],
        "network_interface_name": "eth0"
    },
    "user_entry_point": "train.py"
}
SM_USER_ARGS=[]
SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
SM_CHANNEL_TEST=/opt/ml/input/data/test
SM_CHANNEL_TRAIN=/opt/ml/input/data/train
PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python310.zip:/opt/conda/lib/python3.10:/opt/conda/lib/python3.10/lib-dynload:/opt/conda/lib/python3.10/site-packages

SFT Config

SFTConfig(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
attn_implementation=flash_attention_2,
auto_find_batch_size=False,
average_tokens_across_devices=False,
aws_profile=,
batch_eval_metrics=False,
bf16=auto,
bf16_full_eval=False,
chars_per_token=<CHARS_PER_TOKEN>,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
dataset_batch_size=1000,
dataset_kwargs=None,
dataset_name=,
dataset_num_proc=8,
dataset_test_split=,
dataset_text_field=,
dataset_train_split=,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
delete_ckpts=False,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
early_stopping_patience=10,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_exampleset_info_path=,
eval_exampleset_path=,
eval_on_start=True,
eval_packing=False,
eval_steps=10,
eval_strategy=steps,
eval_use_gather_object=False,
evaluation_strategy=None,
exampleset_info_path=,
exampleset_path=,
force_tokenize_data=False,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[<FSDPOption.FULL_SHARD: 'full_shard'>, <FSDPOption.AUTO_WRAP: 'auto_wrap'>, <FSDPOption.OFFLOAD: 'offload'>],
fsdp_config={'limit_all_gathers': True, 'backward_prefetch': 'backward_pre', 'forward_prefetch': 'false', 'use_orig_params': 'false', 'min_num_params': 0, 'activation_checkpointing': 'true', 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointing=False,
gradient_checkpointing_kwargs={'use_reentrant': True},
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_for_metrics=[],
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=2e-05,
length_column_name=length,
load_best_model_at_end=True,
load_hf_data=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/opt/ml/model/runs/Dec04_23-27-49_algo-1,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=1,
logging_strategy=steps,
lora_alpha=32,
lora_dropout=0.05,
lora_r=16,
lora_target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj'],
lr_scheduler_kwargs=
{}
,
lr_scheduler_type=cosine,
mask_instructions=True,
max_grad_norm=1.0,
max_seq_length=4096,
max_steps=1000,
meta_data=
{}
,
metric_for_best_model=loss,
mlflow_experiment_name=***,
mlflow_run_name=***,
mlflow_tracking_uri=***,
model_name_or_path=Qwen/Qwen2.5-7B-Instruct,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_of_sequences=1024,
num_train_epochs=3,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=/opt/ml/model,
overwrite_output_dir=False,
packing=False,
past_index=-1,
per_device_eval_batch_size=2,
per_device_train_batch_size=2,
prediction_loss_only=False,
preprocessed_data_path=,
preprocessed_eval_data_path=,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['mlflow'],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=/opt/ml/model,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=20,
save_strategy=steps,
save_total_limit=None,
seed=42,
skip_memory_metrics=True,
smoke_test=False,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torch_dtype=bfloat16,
torch_empty_cache_steps=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_liger_kernel=False,
use_mps_device=False,
use_peft=False,
val_set_size=0.0,
warmup_ratio=0.1,
warmup_steps=100,
weight_decay=0.0,
)

Who can help?

@muellerz @SunMarc @ArthurZucker

Information

  • The official example scripts
  • My own modified scripts

Tasks

  • An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
  • My own task or dataset (give details below)

Reproduction

Below are my training script, the SageMaker estimator object, and the stack trace showing the error that I'm getting. A curious (but perhaps irrelevant) detail is that I've reproduced this twice, and both times the error occurred on the 257th training step (with max_steps=1000).

Training script

import os
import glob
import json
import logging

from datasets import load_dataset
from accelerate import Accelerator
from accelerate.logging import get_logger
from datasets import load_dataset
import mlflow
from omegaconf import DictConfig, OmegaConf
import pprint
from peft.tuners.lora import LoraConfig
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, EarlyStoppingCallback


def main(cfg: DictConfig):
    """Load a causal LM and tokenizer, tokenize the datasets, then train with SFTTrainer and save.

    Args:
        cfg: Configuration object supplied by the caller. It is not referenced
            anywhere in the body shown here.

    Notes:
        * ``sft_config`` is read throughout but is not defined in this excerpt;
          presumably it is built from ``cfg`` in code that was elided -- TODO confirm.
        * The stack trace in this report places ``trainer.train()`` at train.py
          line 242, so the script as actually run is longer than what is shown.
    """

    # No PEFT/LoRA adapter config is handed to the trainer in this excerpt.
    peft_config = None

    model_kwargs = dict(
        attn_implementation=sft_config.attn_implementation,
        torch_dtype=sft_config.torch_dtype,
        # Earlier variant, kept by the author for reference:
        #use_cache=not sft_config.gradient_checkpointing,
        # The cache is turned off when either checkpointing option is truthy.
        # NOTE(review): the SFT Config dump in this report prints fsdp_config as a dict whose
        # 'activation_checkpointing' value is the string 'true'; confirm that attribute
        # access works on the real object and that a string value is intended here.
        use_cache=not (sft_config.gradient_checkpointing or sft_config.fsdp_config.activation_checkpointing)
    )
    model = AutoModelForCausalLM.from_pretrained(sft_config.model_name_or_path, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(sft_config.model_name_or_path, use_fast=True)
    # Reuse the EOS token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    
    # Marker strings handed to the completion-only collator as its
    # instruction / response templates.
    instruction_template = "user\n"
    response_template = "assistant\n"
    data_collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)
    # The load_dataset arguments were elided ("*") by the reporter.
    train_dataset = load_dataset(*)
    eval_dataset = load_dataset(*)

    
    def tokenize(examples):
        """Render and tokenize a batch of conversations.

        Each entry of ``examples["messages"]`` is rendered to text with the
        tokenizer's chat template, the rendered texts are tokenized together
        (truncated and padded to ``sft_config.max_seq_length``), and a
        ``"labels"`` entry is added as a copy of ``"input_ids"``.

        Args:
            examples: Batch mapping that provides a ``"messages"`` entry.

        Returns:
            The tokenizer output with the extra ``"labels"`` entry.
        """
        conversations = examples["messages"]
        input_text = []
        for conv in conversations:
            # tokenize=False: get the rendered string back, not token ids.
            text = tokenizer.apply_chat_template(conv, tokenize=False)
            input_text.append(text)
        tokenized = tokenizer(
            input_text,
            truncation=True,
            max_length=sft_config.max_seq_length,
            padding='max_length',
            return_tensors=None,
        )
        # Labels start out as a plain copy of the input ids.
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized
    

    # Replace the original columns with the tokenized ones; tokenize() receives batches.
    train_dataset = train_dataset.map(
        tokenize,
        remove_columns=train_dataset.column_names,
        batched=True,
        desc="Tokenizing train dataset",
    )
    
    # The map arguments were elided ("*") by the reporter.
    eval_dataset = eval_dataset.map(*)
    
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=sft_config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="messages",
        peft_config=peft_config,
        dataset_kwargs=sft_config.dataset_kwargs,
        data_collator=data_collator,
    )

    # Attach early stopping only when a positive patience is configured.
    if hasattr(sft_config, "early_stopping_patience") and sft_config.early_stopping_patience > 0:
        callback = EarlyStoppingCallback(sft_config.early_stopping_patience)
        trainer.add_callback(callback)

    # Per the stack trace in this report, the error is raised from inside this call.
    trainer.train()
    trainer.save_model()
    
# Script entry point: load train_config.yaml with OmegaConf and pass it to main().
if __name__ == "__main__":
    cfg = OmegaConf.load('train_config.yaml')
    main(cfg)

SageMaker / HuggingFace estimator

# SageMaker HuggingFace estimator used to launch train.py for this report.
# The framework versions below (transformers 4.36.0 / PyTorch 2.1.0 / py310) match the
# training image tag listed under "Training image". The "Dependencies" list pins
# transformers==4.46.3, presumably installed on top of that image via
# requirements.txt -- TODO confirm.
huggingface_estimator = HuggingFace(
    entry_point          = 'train.py',      
    dependencies=['requirements.txt'],         
    source_dir           = './',           
    instance_type        = 'ml.p4d.24xlarge', 
    instance_count       = 1,              
    # 2*24*60*60 = 172800; presumably a two-day limit expressed in seconds -- verify
    # against the estimator's max_run documentation.
    max_run              = 2*24*60*60,       
    base_job_name        = job_name,         
    role                 = role,      
    volume_size          = 1024,         
    transformers_version = '4.36.0',          
    pytorch_version      = '2.1.0',          
    py_version           = 'py310',          
    disable_output_compression = True,  
    # Corresponds to "sagemaker_torch_distributed_enabled": true in the training config dump.
    distribution={"torch_distributed": {"enabled": True}},  
    # Values passed through the estimator's `environment` argument. Only some of these
    # names appear in the "Environment variables" dump in this report.
    environment  = {
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache", 
        "HF_TOKEN": HfFolder.get_token(),   
        # FSDP-related settings; presumably read by Accelerate -- verify.
        "ACCELERATE_USE_FSDP": "1",       
        "FSDP_CPU_RAM_EFFICIENT_LOADING": "0",   
        "FSDP_AUTO_WRAP_POLICY": "TRANSFORMER_BASED_WRAP",
        "FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
        "FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
        # NCCL-related settings.
        "NCCL_TIMEOUT": "3600", 
        "NCCL_DEBUG": "WARN",  
        "NCCL_IB_TIMEOUT": "3600",
        "NCCL_SOCKET_TIMEOUT": "3600",
        "NCCL_ASYNC_ERROR_HANDLING": "1",
        "NCCL_P2P_LEVEL": "NVL",
        "CUDA_DEVICE_MAX_CONNECTIONS": "1",        
        "MAX_JOBS": "1",                           
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
        "TORCH_DISTRIBUTED_DEBUG": "DETAIL",   
        # Both cache variables point at the same directory.
        "HF_DATASETS_CACHE": "/opt/ml/input",
        "TRANSFORMERS_CACHE": "/opt/ml/input"
    },
    # Bucket name redacted ("***") by the reporter.
    checkpoint_s3_uri=f's3://{***}/checkpoints'
)

Stack trace

Traceback (most recent call last):
  File "/opt/ml/code/train.py", line 259, in <module>
    main(cfg)
  File "/opt/ml/code/train.py", line 242, in main
    trainer.train()
  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2123, in train
    return inner_training_loop(
  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2534, in _inner_training_loop
    self.optimizer.step()
  File "/opt/conda/lib/python3.10/site-packages/accelerate/optimizer.py", line 171, in step
    self.optimizer.step(closure)
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
    return wrapped(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
    out = func(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
    ret = func(self, *args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 184, in step
    adamw(
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 335, in adamw
    func(
  File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 412, in _single_tensor_adamw
    exp_avg.lerp_(grad, 1 - beta1)
RuntimeError: expected dtype float for `end` but got dtype c10::BFloat16

Expected behavior

Training should complete successfully without encountering errors.

Related issues

  1. FSDP with SFTrainer: expected dtype float for end but got dtype c10::BFloat16 #34702 - this seems to be the exact same issue and was active as recently as 3 weeks ago. It was closed only two days ago.
  2. expected dtype float for end but got dtype c10::BFloat16 SqueezeAILab/LLM2LLM#5
  3. https://discuss.huggingface.co/t/errors-when-using-gradient-accumulation-with-fsdp-peft-lora-sfttrainer/105006

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions