Commit c404973 (v0.2.5)

Signed-off-by: ssbuild <[email protected]>
ssbuild committed Oct 7, 2023
1 parent 203f71b commit c404973
Showing 9 changed files with 539 additions and 12 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -2,6 +2,7 @@
- [deep_training](https://github.com/ssbuild/deep_training)

```text
10-07 support colossalai trainer
09-26 support transformers trainer
08-02 add multi lora infer example; upgrade aigc_zoo manually: pip install -U git+https://github.com/ssbuild/aigc_zoo.git --force-reinstall --no-deps
06-13 support resize_token_embeddings
@@ -104,8 +105,18 @@ a answer must
Note: num_process_worker is the number of worker processes used to build the dataset; for large datasets, increase it up to the number of CPU cores.
dataHelper.make_dataset_with_args(data_args.train_file,mixed_data=False, shuffle=True,mode='train',num_process_worker=0)
# training
# pl training
python train.py
# hf training
python -m torch.distributed.launch --nproc_per_node=1 train_hf.py
# multi-node, multi-GPU
python -m torch.distributed.launch --nproc_per_node=1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT train_hf.py
# colossalai training
colossalai run --nproc_per_node 1 --num_nodes 1 train_cl.py
```

## Training parameters
119 changes: 119 additions & 0 deletions config/colossalai_config.py
@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
# @Time: 21:55
# @Author: tk
# @File:colossalai_config


colossalai_strategy = {
"ddp": dict(
name="ddp",
broadcast_buffers= True,
bucket_cap_mb = 25,
find_unused_parameters = False,
check_reduction = False,
gradient_as_bucket_view = False,
static_graph = False,
),
"gemini":dict(
name="gemini",
chunk_config_dict = None,
chunk_init_device= None,
placement_policy = "static",
shard_param_frac = 1.0, # only for static placement
offload_optim_frac = 0.0, # only for static placement
offload_param_frac = 0.0, # only for static placement
warmup_non_model_data_ratio = 0.8, # only for auto placement
steady_cuda_cap_ratio = 0.9, # only for auto placement
precision = "fp16",
pin_memory = False,
force_outputs_fp32 = False,
strict_ddp_mode = False,
search_range_m = 32,
hidden_dim = None,
min_chunk_size_m = 32,
memstats = None,
gpu_margin_mem_ratio = 0.0,
initial_scale = 2 ** 16,
min_scale = 1,
growth_factor = 2,
backoff_factor = 0.5,
growth_interval = 1000,
hysteresis = 2,
max_scale = 2 ** 32,
max_norm = 1.0,
norm_type = 2.0,
verbose = False,
),
"zero2" : dict(
name="zero2",
stage = 2,
precision = "fp16",
initial_scale = 2 ** 32,
min_scale = 1,
growth_factor = 2,
backoff_factor = 0.5,
growth_interval = 1000,
hysteresis = 2,
max_scale = 2 ** 32,
max_norm = 1.0,
norm_type = 2.0,
reduce_bucket_size_in_m = 12,
communication_dtype= None,
overlap_communication = True,
cpu_offload = False,
verbose = False,
),
"zero2_cpu" : dict(
name="zero2_cpu",
stage = 2,
precision = "fp16",
initial_scale = 2 ** 32,
min_scale = 1,
growth_factor = 2,
backoff_factor = 0.5,
growth_interval = 1000,
hysteresis = 2,
max_scale = 2 ** 32,
max_norm = 1.0,
norm_type = 2.0,
reduce_bucket_size_in_m = 12,
communication_dtype= None,
overlap_communication = True,
cpu_offload=True,
verbose = False,
),
"3d": dict(
name="3d",
tp_size =1,
pp_size = 1,
precision = "fp16",
zero_stage = 0,
enable_all_optimization = False,
enable_fused_normalization = False,
enable_flash_attention = False,
enable_jit_fused = False,
enable_sequence_parallelism = False,
enable_sequence_overlap = False,
num_microbatches = None,
microbatch_size = None,
initial_scale = 2 ** 16,
min_scale = 1,
growth_factor = 2,
backoff_factor = 0.5,
growth_interval = 1000,
hysteresis = 2,
max_scale = 2 ** 32,
max_norm = 0,
broadcast_buffers = True,
ddp_bucket_cap_mb = 25,
find_unused_parameters = False,
check_reduction = False,
gradient_as_bucket_view = False,
static_graph = False,
zero_bucket_size_in_m = 12,
cpu_offload = False,
communication_dtype= None,
overlap_communication = True,
custom_policy = None,
)
}
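
As a reading aid, here is a minimal sketch (not part of this commit) of how one of these presets could be picked and adjusted before being passed on as a trainer's strategy value. The choice of the "zero2" key and the cpu_offload override are purely illustrative, and the import assumes the repo's config package layout.

```python
# Illustrative only: copy a preset so the shared template stays untouched,
# then apply a per-run override before handing it to the trainer.
from copy import deepcopy

from config.colossalai_config import colossalai_strategy

strategy = deepcopy(colossalai_strategy["zero2"])
strategy["cpu_offload"] = True  # hypothetical per-run override
print(strategy["name"], strategy["stage"], strategy["cpu_offload"])
```
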
17 changes: 9 additions & 8 deletions config/main.py
@@ -10,7 +10,7 @@
global_args = {
# training configuration
**dict(
trainer_backend ='pl', # one of pl , hf
trainer_backend ='pl', # one of pl , hf , cl
enable_deepspeed = False,
enable_ptv2 = False,
enable_lora = True,
@@ -36,15 +36,18 @@


if global_args["enable_lora"]:
from config.sft_config_lora import train_info_args,train_info_args_hf,train_model_config
from config.sft_config_lora import train_info_args,train_info_args_hf,train_info_args_colossalai,train_model_config
elif global_args["enable_ptv2"]:
from config.sft_config_ptv2 import train_info_args,train_info_args_hf,train_model_config
from config.sft_config_ptv2 import train_info_args,train_info_args_hf,train_info_args_colossalai,train_model_config
else:
from config.sft_config import train_info_args,train_info_args_hf,train_model_config
from config.sft_config import train_info_args,train_info_args_hf,train_info_args_colossalai,train_model_config

assert global_args["trainer_backend"] in ["pl","hf","cl"]

if global_args["trainer_backend"] == "hf":
train_info_args = train_info_args_hf
elif global_args["trainer_backend"] == "cl":
train_info_args = train_info_args_colossalai



@@ -61,8 +64,6 @@ def patch_args(train_info_args):

if global_args["enable_lora"]:
# check whether lora or adalora is enabled
if 'lora' not in train_info_args and 'adalora' not in train_info_args:
raise ValueError('please config lora or adalora')
assert train_info_args.get('lora',{}).get('with_lora',False) + \
train_info_args.get('adalora',{}).get('with_lora',False) + \
train_info_args.get('ia3',{}).get('with_lora',False) == 1 , ValueError('lora adalora ia3 can set one at same time !')
@@ -72,8 +73,8 @@ def patch_args(train_info_args):
train_info_args.pop('lora', None)
train_info_args.pop('adalora', None)
train_info_args.pop('ia3', None)
if hasattr(train_info_args,"gradient_checkpointing"):
train_info_args.gradient_checkpointing = False
if "gradient_checkpointing" in train_info_args:
train_info_args[ "gradient_checkpointing" ] = False
else:
train_info_args.pop('lora',None)
train_info_args.pop('adalora', None)
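
For clarity, the adapter check that patch_args asserts above can be restated as a small sketch; the helper below is hypothetical and only mirrors the rule that exactly one of lora, adalora and ia3 may have with_lora enabled at a time.

```python
# Hypothetical restatement of the assert in patch_args: exactly one of the
# lora / adalora / ia3 sections may have with_lora set to True.
def check_single_adapter(train_info_args: dict) -> None:
    enabled = sum(
        bool(train_info_args.get(name, {}).get('with_lora', False))
        for name in ('lora', 'adalora', 'ia3')
    )
    if enabled != 1:
        raise ValueError('exactly one of lora, adalora, ia3 may be enabled')

check_single_adapter({'lora': {'with_lora': True}, 'adalora': {'with_lora': False}})  # passes
```
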
71 changes: 71 additions & 0 deletions config/sft_config.py
@@ -3,6 +3,7 @@
import json
import os

from config.colossalai_config import colossalai_strategy
from config.constant_map import train_model_config


@@ -128,3 +129,73 @@














train_info_args_colossalai = {
'data_backend': 'parquet', # one of record, lmdb, arrow_stream, arrow_file, parquet; very large datasets can use lmdb (note: lmdb takes more storage than record)
# pretrained model configuration
**train_model_config,

# currently only ddp supports lora
"strategy": colossalai_strategy["ddp"], # ddp,gemini,zero2,zero2_cpu,3d
"output_dir": "./outputs_cl",
"overwrite_output_dir": True,
"num_train_epochs": 20,
"max_steps": -1,
"save_safetensors": False,
"save_strategy": "steps",
"save_steps": 1000,
"save_total_limit": 10,
"seed": 42,
"fp16": True,
'do_train': True,
'train_file': [ './data/finetune_train_examples.json' ],
'do_eval': False,
'do_predict': False,
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 2,
"gradient_accumulation_steps": 1, # colossalai不支持梯度积累
"evaluation_strategy": "no",
"eval_steps": 100,
# Optimizer: if the strategy is gemini, optim must be one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl
# for non-gemini strategies, optim can be one of the following:
# one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl,lamb,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,
# adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion,lion_8bit,lion_32bit,
# paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,
# lamb_fused_dp adagrad_cpu_dp adam_cpu_dp adam_fused_dp
"optim": "adam_hybrid_cl", # 推荐 one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl
"lr_scheduler_type": "cosine", # one of linear,cosine,cosine_with_restarts,polynomial,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau
"torch_compile": False,
"learning_rate": 2e-5,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_epsilon": 1e-8,
"max_grad_norm": 1.0,
"weight_decay": 0.,
"warmup_ratio": 0.03,
"logging_strategy": "steps",
"logging_steps": 10,
"tf32": False,
"gradient_checkpointing": True,
'max_seq_length': 512, #
'max_target_length': 100, # maximum prediction length (reserved field)
'use_fast_tokenizer': False,
# 'do_lower_case': False,
"dataloader_drop_last": True,
"dataloader_pin_memory": True,
"dataloader_num_workers": 0,

"log_level": "info", # 'info', 'warning', 'error' and 'critical , passive',


}
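
The optimizer comments above tie the gemini strategy to ColossalAI's own Adam variants. A hedged sketch of that rule, assuming the strategy dicts and optim strings are shaped exactly as configured here (check_optim itself is hypothetical):

```python
# Hypothetical guard for the rule in the comments: with the gemini strategy,
# only the ColossalAI hybrid/CPU/fused Adam optimizers are expected to work.
GEMINI_OPTIMS = {"adam_hybrid_cl", "adam_cpu_cl", "adam_fused_cl"}

def check_optim(strategy_name: str, optim: str) -> None:
    if strategy_name == "gemini" and optim not in GEMINI_OPTIMS:
        raise ValueError(f"gemini requires one of {sorted(GEMINI_OPTIMS)}, got {optim!r}")

check_optim("ddp", "adam_hybrid_cl")     # fine: non-gemini strategies accept more optimizers
check_optim("gemini", "adam_fused_cl")   # fine
# check_optim("gemini", "adamw_torch")   # would raise ValueError
```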

73 changes: 73 additions & 0 deletions config/sft_config_lora.py
@@ -2,6 +2,8 @@
# @Time : 2023/5/24 15:53
import json
import os

from config.colossalai_config import colossalai_strategy
from config.constant_map import (train_model_config,
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING,
@@ -196,3 +198,74 @@
}











train_info_args_colossalai = {
'data_backend': 'parquet', # one of record, lmdb, arrow_stream, arrow_file, parquet; very large datasets can use lmdb (note: lmdb takes more storage than record)
# pretrained model configuration
**train_model_config,

# currently only ddp supports lora
"strategy": colossalai_strategy[ "ddp" ], # ddp,gemini,zero2,zero2_cpu,3d
"output_dir": "./outputs_cl",
"overwrite_output_dir": True,
"num_train_epochs": 20,
"max_steps": -1,
"save_safetensors": False,
"save_strategy": "steps",
"save_steps": 1000,
"save_total_limit": 10,
"seed": 42,
"fp16": True,
'do_train': True,
'train_file': [ './data/finetune_train_examples.json' ],
'do_eval': False,
'do_predict': False,
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 2,
"gradient_accumulation_steps": 1, # colossalai不支持梯度积累
"evaluation_strategy": "no",
"eval_steps": 100,

# Optimizer: if the strategy is gemini, optim must be one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl
# for non-gemini strategies, optim can be one of the following:
# one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl,lamb,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,
# adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion,lion_8bit,lion_32bit,
# paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,
# lamb_fused_dp adagrad_cpu_dp adam_cpu_dp adam_fused_dp
"optim": "adam_hybrid_cl",
"lr_scheduler_type": "cosine", # one of linear,cosine,cosine_with_restarts,polynomial,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau
"torch_compile": False,
"learning_rate": 2e-5,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_epsilon": 1e-8,
"max_grad_norm": 1.0,
"weight_decay": 0.,
"warmup_ratio": 0.03,
"logging_strategy": "steps",
"logging_steps": 10,
"tf32": False,
"gradient_checkpointing": True,
'max_seq_length': 512, #
'max_target_length': 100, # maximum prediction length (reserved field)
'use_fast_tokenizer': False,
# 'do_lower_case': False,
"dataloader_drop_last": True,
"dataloader_pin_memory": True,
"dataloader_num_workers": 0,

"log_level": "info", # 'info', 'warning', 'error' and 'critical , passive',
############## lora模块
'lora': lora_info_args,
'adalora': adalora_info_args,
"ia3": ia3_info_args,

}
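
Because the comment above notes that only the ddp strategy currently supports lora, here is a hedged sketch of that constraint; the helper and its call are assumptions for illustration, not code from this repo.

```python
# Hypothetical check: if any adapter (lora / adalora / ia3) is enabled,
# the ColossalAI strategy is expected to be ddp.
def check_adapter_strategy(args: dict) -> None:
    uses_adapter = any(
        args.get(name, {}).get('with_lora', False)
        for name in ('lora', 'adalora', 'ia3')
    )
    if uses_adapter and args.get("strategy", {}).get("name") != "ddp":
        raise ValueError("lora/adalora/ia3 currently require the ddp strategy")

check_adapter_strategy({"strategy": {"name": "ddp"}, "lora": {"with_lora": True}})  # passes
```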

