-
Notifications
You must be signed in to change notification settings - Fork 31.6k
Closed
Closed
Copy link
Labels
Description
System Info
I'm running on an Amazon SageMaker-managed p4d.24xlarge instance, so I'll do my best to provide comprehensive system info below.
Training image
763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04
Dependencies
accelerate==1.1.1
alembic==1.14.0
antlr4-python3-runtime==4.9.3
blinker==1.9.0
cachetools==5.5.0
databricks-sdk==0.38.0
datasets==3.1.0
deprecated==1.2.15
Flask==3.1.0
gitdb==4.0.11
gitpython==3.1.43
google-auth==2.36.0
graphene==3.4.3
graphql-core==3.2.5
graphql-relay==3.2.0
gunicorn==23.0.0
huggingface_hub==0.26.3
itsdangerous==2.2.0
Mako==1.3.7
mlflow==2.18.0
mlflow-skinny==2.18.0
omegaconf==2.3.0
opentelemetry-api==1.28.2
opentelemetry-sdk==1.28.2
opentelemetry-semantic-conventions==0.49b2
peft==0.13.2
pyasn1-modules==0.4.1
sagemaker-mlflow==0.1.0
smmap==5.0.1
sqlalchemy==2.0.36
sqlparse==0.5.2
tokenizers==0.20.3
transformers==4.46.3
trl==0.12.1
Werkzeug==3.1.3
SageMaker training config
{
"additional_framework_parameters": {
"sagemaker_instance_type": "ml.p4d.24xlarge",
"sagemaker_torch_distributed_enabled": true
},
"channel_input_dirs": {
"test": "/opt/ml/input/data/test",
"train": "/opt/ml/input/data/train"
},
"current_host": "algo-1",
"current_instance_group": "homogeneousCluster",
"current_instance_group_hosts": [
"algo-1"
],
"current_instance_type": "ml.p4d.24xlarge",
"distribution_hosts": [
"algo-1"
],
"distribution_instance_groups": [
"homogeneousCluster"
],
"framework_module": "sagemaker_pytorch_container.training:main",
"hosts": [
"algo-1"
],
"hyperparameters": {},
"input_config_dir": "/opt/ml/input/config",
"input_data_config": {
"test": {
"TrainingInputMode": "File",
"S3DistributionType": "FullyReplicated",
"RecordWrapperType": "None"
},
"train": {
"TrainingInputMode": "File",
"S3DistributionType": "FullyReplicated",
"RecordWrapperType": "None"
}
},
"input_dir": "/opt/ml/input",
"instance_groups": [
"homogeneousCluster"
],
"instance_groups_dict": {
"homogeneousCluster": {
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.p4d.24xlarge",
"hosts": [
"algo-1"
]
}
},
"is_hetero": false,
"is_master": true,
"is_modelparallel_enabled": null,
"is_smddpmprun_installed": false,
"is_smddprun_installed": true,
"job_name": "***",
"log_level": 20,
"master_hostname": "algo-1",
"model_dir": "/opt/ml/model",
"module_dir": "s3://***/sourcedir.tar.gz",
"module_name": "train",
"network_interface_name": "eth0",
"num_cpus": 96,
"num_gpus": 8,
"num_neurons": 0,
"output_data_dir": "/opt/ml/output/data",
"output_dir": "/opt/ml/output",
"output_intermediate_dir": "/opt/ml/output/intermediate",
"resource_config": {
"current_host": "algo-1",
"current_instance_type": "ml.p4d.24xlarge",
"current_group_name": "homogeneousCluster",
"hosts": [
"algo-1"
],
"instance_groups": [
{
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.p4d.24xlarge",
"hosts": [
"algo-1"
]
}
],
"network_interface_name": "eth0"
},
"user_entry_point": "train.py"
}
Environment variables
SM_HOSTS=["algo-1"]
SM_NETWORK_INTERFACE_NAME=eth0
SM_HPS=
{}
SM_USER_ENTRY_POINT=train.py
SM_FRAMEWORK_PARAMS=
{
"sagemaker_instance_type": "ml.p4d.24xlarge",
"sagemaker_torch_distributed_enabled": true
}
SM_RESOURCE_CONFIG=
{
"current_group_name": "homogeneousCluster",
"current_host": "algo-1",
"current_instance_type": "ml.p4d.24xlarge",
"hosts": [
"algo-1"
],
"instance_groups": [
{
"hosts": [
"algo-1"
],
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.p4d.24xlarge"
}
],
"network_interface_name": "eth0"
}
SM_INPUT_DATA_CONFIG=
{
"test": {
"RecordWrapperType": "None",
"S3DistributionType": "FullyReplicated",
"TrainingInputMode": "File"
},
"train": {
"RecordWrapperType": "None",
"S3DistributionType": "FullyReplicated",
"TrainingInputMode": "File"
}
}
SM_OUTPUT_DATA_DIR=/opt/ml/output/data
SM_CHANNELS=["test","train"]
SM_CURRENT_HOST=algo-1
SM_CURRENT_INSTANCE_TYPE=ml.p4d.24xlarge
SM_CURRENT_INSTANCE_GROUP=homogeneousCluster
SM_CURRENT_INSTANCE_GROUP_HOSTS=["algo-1"]
SM_INSTANCE_GROUPS=["homogeneousCluster"]
SM_INSTANCE_GROUPS_DICT=
{
"homogeneousCluster": {
"hosts": [
"algo-1"
],
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.p4d.24xlarge"
}
}
SM_DISTRIBUTION_INSTANCE_GROUPS=["homogeneousCluster"]
SM_IS_HETERO=false
SM_MODULE_NAME=train
SM_LOG_LEVEL=20
SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main
SM_INPUT_DIR=/opt/ml/input
SM_INPUT_CONFIG_DIR=/opt/ml/input/config
SM_OUTPUT_DIR=/opt/ml/output
SM_NUM_CPUS=96
SM_NUM_GPUS=8
SM_NUM_NEURONS=0
SM_MODEL_DIR=/opt/ml/model
SM_MODULE_DIR=s3://***/sourcedir.tar.gz
SM_TRAINING_ENV=
{
"additional_framework_parameters": {
"sagemaker_instance_type": "ml.p4d.24xlarge",
"sagemaker_torch_distributed_enabled": true
},
"channel_input_dirs": {
"test": "/opt/ml/input/data/test",
"train": "/opt/ml/input/data/train"
},
"current_host": "algo-1",
"current_instance_group": "homogeneousCluster",
"current_instance_group_hosts": [
"algo-1"
],
"current_instance_type": "ml.p4d.24xlarge",
"distribution_hosts": [
"algo-1"
],
"distribution_instance_groups": [
"homogeneousCluster"
],
"framework_module": "sagemaker_pytorch_container.training:main",
"hosts": [
"algo-1"
],
"hyperparameters": {},
"input_config_dir": "/opt/ml/input/config",
"input_data_config": {
"test": {
"RecordWrapperType": "None",
"S3DistributionType": "FullyReplicated",
"TrainingInputMode": "File"
},
"train": {
"RecordWrapperType": "None",
"S3DistributionType": "FullyReplicated",
"TrainingInputMode": "File"
}
},
"input_dir": "/opt/ml/input",
"instance_groups": [
"homogeneousCluster"
],
"instance_groups_dict": {
"homogeneousCluster": {
"hosts": [
"algo-1"
],
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.p4d.24xlarge"
}
},
"is_hetero": false,
"is_master": true,
"is_modelparallel_enabled": null,
"is_smddpmprun_installed": false,
"is_smddprun_installed": true,
"job_name": "***",
"log_level": 20,
"master_hostname": "algo-1",
"model_dir": "/opt/ml/model",
"module_dir": "s3://***/sourcedir.tar.gz",
"module_name": "train",
"network_interface_name": "eth0",
"num_cpus": 96,
"num_gpus": 8,
"num_neurons": 0,
"output_data_dir": "/opt/ml/output/data",
"output_dir": "/opt/ml/output",
"output_intermediate_dir": "/opt/ml/output/intermediate",
"resource_config": {
"current_group_name": "homogeneousCluster",
"current_host": "algo-1",
"current_instance_type": "ml.p4d.24xlarge",
"hosts": [
"algo-1"
],
"instance_groups": [
{
"hosts": [
"algo-1"
],
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.p4d.24xlarge"
}
],
"network_interface_name": "eth0"
},
"user_entry_point": "train.py"
}
SM_USER_ARGS=[]
SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
SM_CHANNEL_TEST=/opt/ml/input/data/test
SM_CHANNEL_TRAIN=/opt/ml/input/data/train
PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python310.zip:/opt/conda/lib/python3.10:/opt/conda/lib/python3.10/lib-dynload:/opt/conda/lib/python3.10/site-packages
SFT Config
SFTConfig(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
attn_implementation=flash_attention_2,
auto_find_batch_size=False,
average_tokens_across_devices=False,
aws_profile=,
batch_eval_metrics=False,
bf16=auto,
bf16_full_eval=False,
chars_per_token=<CHARS_PER_TOKEN>,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
dataset_batch_size=1000,
dataset_kwargs=None,
dataset_name=,
dataset_num_proc=8,
dataset_test_split=,
dataset_text_field=,
dataset_train_split=,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
delete_ckpts=False,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
early_stopping_patience=10,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_exampleset_info_path=,
eval_exampleset_path=,
eval_on_start=True,
eval_packing=False,
eval_steps=10,
eval_strategy=steps,
eval_use_gather_object=False,
evaluation_strategy=None,
exampleset_info_path=,
exampleset_path=,
force_tokenize_data=False,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[<FSDPOption.FULL_SHARD: 'full_shard'>, <FSDPOption.AUTO_WRAP: 'auto_wrap'>, <FSDPOption.OFFLOAD: 'offload'>],
fsdp_config={'limit_all_gathers': True, 'backward_prefetch': 'backward_pre', 'forward_prefetch': 'false', 'use_orig_params': 'false', 'min_num_params': 0, 'activation_checkpointing': 'true', 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointing=False,
gradient_checkpointing_kwargs={'use_reentrant': True},
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_for_metrics=[],
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=2e-05,
length_column_name=length,
load_best_model_at_end=True,
load_hf_data=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/opt/ml/model/runs/Dec04_23-27-49_algo-1,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=1,
logging_strategy=steps,
lora_alpha=32,
lora_dropout=0.05,
lora_r=16,
lora_target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj'],
lr_scheduler_kwargs=
{}
,
lr_scheduler_type=cosine,
mask_instructions=True,
max_grad_norm=1.0,
max_seq_length=4096,
max_steps=1000,
meta_data=
{}
,
metric_for_best_model=loss,
mlflow_experiment_name=***,
mlflow_run_name=***,
mlflow_tracking_uri=***,
model_name_or_path=Qwen/Qwen2.5-7B-Instruct,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_of_sequences=1024,
num_train_epochs=3,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=/opt/ml/model,
overwrite_output_dir=False,
packing=False,
past_index=-1,
per_device_eval_batch_size=2,
per_device_train_batch_size=2,
prediction_loss_only=False,
preprocessed_data_path=,
preprocessed_eval_data_path=,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['mlflow'],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=/opt/ml/model,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=20,
save_strategy=steps,
save_total_limit=None,
seed=42,
skip_memory_metrics=True,
smoke_test=False,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torch_dtype=bfloat16,
torch_empty_cache_steps=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_liger_kernel=False,
use_mps_device=False,
use_peft=False,
val_set_size=0.0,
warmup_ratio=0.1,
warmup_steps=100,
weight_decay=0.0,
)
Who can help?
@muellerz @SunMarc @ArthurZucker
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
Below are my training script, the SageMaker estimator object, and the stack trace showing the error that I'm getting. A curious (but perhaps irrelevant) detail is that I've reproduced this twice, and both times the error occurred during training on the 257th step (with max_steps=1000).
Training script
import os
import glob
import json
import logging
from datasets import load_dataset
from accelerate import Accelerator
from accelerate.logging import get_logger
from datasets import load_dataset
import mlflow
from omegaconf import DictConfig, OmegaConf
import pprint
from peft.tuners.lora import LoraConfig
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, EarlyStoppingCallback
# Load the causal LM and tokenizer named by sft_config.model_name_or_path,
# tokenize the train/eval datasets, then train with TRL's SFTTrainer and save
# the resulting model.
# NOTE(review): `sft_config` is read throughout but never assigned in this
# excerpt, and `cfg` is not used below; presumably the omitted part of the
# script builds `sft_config` from `cfg` -- confirm. The traceback in this
# report points at train.py lines 242 and 259, so this listing is abridged.
def main(cfg: DictConfig):
# No PEFT/LoRA config is handed to the trainer in this run.
peft_config = None
model_kwargs = dict(
attn_implementation=sft_config.attn_implementation,
torch_dtype=sft_config.torch_dtype,
#use_cache=not sft_config.gradient_checkpointing,
# Turn the cache off when either checkpointing flag is truthy.
# NOTE(review): the SFTConfig dump in this report prints fsdp_config as a
# dict whose 'activation_checkpointing' value is the string 'true'; attribute
# access would fail if it is a plain dict -- confirm the actual type.
use_cache=not (sft_config.gradient_checkpointing or sft_config.fsdp_config.activation_checkpointing)
)
model = AutoModelForCausalLM.from_pretrained(sft_config.model_name_or_path, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(sft_config.model_name_or_path, use_fast=True)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Marker strings passed to the completion-only collator as its instruction
# and response templates.
instruction_template = "user\n"
response_template = "assistant\n"
data_collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)
# NOTE(review): `load_dataset(*)` looks like a redaction placeholder; the real
# arguments are not shown in this report.
train_dataset = load_dataset(*)
eval_dataset = load_dataset(*)
# Batched `map` callback: renders every conversation in examples["messages"]
# to text with the tokenizer's chat template, tokenizes the batch with
# truncation and padding to sft_config.max_seq_length, and copies input_ids
# into a "labels" field.
def tokenize(examples):
conversations = examples["messages"]
input_text = []
for conv in conversations:
# tokenize=False: get the rendered string, not token ids.
text = tokenizer.apply_chat_template(conv, tokenize=False)
input_text.append(text)
tokenized = tokenizer(
input_text,
truncation=True,
max_length=sft_config.max_seq_length,
padding='max_length',
return_tensors=None,
)
# Labels start as a copy of the input ids.
tokenized["labels"] = tokenized["input_ids"].copy()
return tokenized
# Replace the original columns with the tokenizer output.
train_dataset = train_dataset.map(
tokenize,
remove_columns=train_dataset.column_names,
batched=True,
desc="Tokenizing train dataset",
)
# NOTE(review): `map(*)` looks like a redaction placeholder; presumably it
# mirrors the train_dataset.map call above -- confirm.
eval_dataset = eval_dataset.map(*)
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
args=sft_config,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
dataset_text_field="messages",
peft_config=peft_config,
dataset_kwargs=sft_config.dataset_kwargs,
data_collator=data_collator,
)
# Register early stopping only when a positive patience is configured.
if hasattr(sft_config, "early_stopping_patience") and sft_config.early_stopping_patience > 0:
callback = EarlyStoppingCallback(sft_config.early_stopping_patience)
trainer.add_callback(callback)
# The reported RuntimeError is raised from inside this call (see the stack
# trace in this report).
trainer.train()
trainer.save_model()
# Script entry point: load the OmegaConf configuration from train_config.yaml
# and pass it to main().
if __name__ == "__main__":
cfg = OmegaConf.load('train_config.yaml')
main(cfg)
SageMaker / HuggingFace estimator
# SageMaker HuggingFace estimator that runs train.py on a single
# ml.p4d.24xlarge instance with torch_distributed enabled.
# NOTE(review): `job_name`, `role` and `HfFolder` are not defined in this
# excerpt; they must come from the surrounding launcher code.
huggingface_estimator = HuggingFace(
entry_point = 'train.py',
dependencies=['requirements.txt'],
source_dir = './',
instance_type = 'ml.p4d.24xlarge',
instance_count = 1,
# 2*24*60*60 = 172800, i.e. two days if the unit is seconds.
max_run = 2*24*60*60,
base_job_name = job_name,
role = role,
volume_size = 1024,
# NOTE(review): these versions match the training image named in this report
# (pytorch 2.1.0 / transformers 4.36.0), while the dependency list pins
# transformers==4.46.3 -- presumably installed on top of the image; confirm
# that this combination is intended.
transformers_version = '4.36.0',
pytorch_version = '2.1.0',
py_version = 'py310',
disable_output_compression = True,
distribution={"torch_distributed": {"enabled": True}},
# Environment variables handed to the estimator: cache locations, the Hub
# token, FSDP settings, NCCL settings and debug switches.
environment = {
"HUGGINGFACE_HUB_CACHE": "/tmp/.cache",
"HF_TOKEN": HfFolder.get_token(),
"ACCELERATE_USE_FSDP": "1",
"FSDP_CPU_RAM_EFFICIENT_LOADING": "0",
"FSDP_AUTO_WRAP_POLICY": "TRANSFORMER_BASED_WRAP",
"FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
"FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
"NCCL_TIMEOUT": "3600",
"NCCL_DEBUG": "WARN",
"NCCL_IB_TIMEOUT": "3600",
"NCCL_SOCKET_TIMEOUT": "3600",
"NCCL_ASYNC_ERROR_HANDLING": "1",
"NCCL_P2P_LEVEL": "NVL",
"CUDA_DEVICE_MAX_CONNECTIONS": "1",
"MAX_JOBS": "1",
"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
"TORCH_DISTRIBUTED_DEBUG": "DETAIL",
"HF_DATASETS_CACHE": "/opt/ml/input",
"TRANSFORMERS_CACHE": "/opt/ml/input"
},
# The bucket name is redacted (`***`) in this report.
checkpoint_s3_uri=f's3://{***}/checkpoints'
)
Stack trace
Traceback (most recent call last):
File "/opt/ml/code/train.py", line 259, in <module>
main(cfg)
File "/opt/ml/code/train.py", line 242, in main
trainer.train()
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2123, in train
return inner_training_loop(
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2534, in _inner_training_loop
self.optimizer.step()
File "/opt/conda/lib/python3.10/site-packages/accelerate/optimizer.py", line 171, in step
self.optimizer.step(closure)
File "/opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
out = func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
ret = func(self, *args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 184, in step
adamw(
File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 335, in adamw
func(
File "/opt/conda/lib/python3.10/site-packages/torch/optim/adamw.py", line 412, in _single_tensor_adamw
exp_avg.lerp_(grad, 1 - beta1)
RuntimeError: expected dtype float for `end` but got dtype c10::BFloat16
Expected behavior
Training should complete successfully without encountering errors.
Related issues
- FSDP with SFTrainer: expected dtype float for `end` but got dtype c10::BFloat16 #34702 - this seems to be the exact same issue and was active as recently as 3 weeks ago. It was closed only two days ago.
- expected dtype float for `end` but got dtype c10::BFloat16 SqueezeAILab/LLM2LLM#5
- https://discuss.huggingface.co/t/errors-when-using-gradient-accumulation-with-fsdp-peft-lora-sfttrainer/105006