support accelerator
Signed-off-by: ssbuild <[email protected]>
ssbuild committed Oct 9, 2023
1 parent c404973 commit 22bd651
Showing 7 changed files with 384 additions and 10 deletions.
8 changes: 5 additions & 3 deletions README.md
@@ -2,6 +2,7 @@
- [deep_training](https://github.com/ssbuild/deep_training)

```text
10-09 support accelerator trainer
10-07 support colossalai trainer
09-26 support transformers trainer
08-02 added a multi lora infer example; upgrade aigc_zoo manually: pip install -U git+https://github.com/ssbuild/aigc_zoo.git --force-reinstall --no-deps
@@ -109,12 +110,13 @@ a answer must
python train.py
# hf training
python -m torch.distributed.launch --nproc_per_node=1 train_hf.py
torchrun --nproc_per_node=1 train_hf.py
# multi-node, multi-GPU
python -m torch.distributed.launch --nproc_per_node=1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT train_hf.py
torchrun --nproc_per_node=1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT train_hf.py
# ac (accelerator) training: same launch recipe as hf training (see the sketch after this block)
# colossalai training
colossalai run --nproc_per_node 1 --num_nodes 1 train_cl.py
```
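In the block above, the new accelerator ("ac") backend is launched the same way as the hf backend. The commit does not spell out the exact command, so the following is a rough sketch only; both the use of HuggingFace Accelerate's launcher and the reuse of train_hf.py are assumptions here:

```
# Illustrative sketch, not from the commit: assumes trainer_backend = 'ac' is set
# in config/main.py and that the HuggingFace Accelerate launcher drives train_hf.py.
accelerate launch --num_processes 1 train_hf.py
```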
13 changes: 8 additions & 5 deletions config/main.py
@@ -10,7 +10,7 @@
global_args = {
# training configuration
**dict(
trainer_backend ='pl', # one of pl , hf , cl
trainer_backend ='pl', # one of pl , hf , cl , ac
enable_deepspeed = False,
enable_ptv2 = False,
enable_lora = True,
@@ -36,18 +36,21 @@


if global_args["enable_lora"]:
from config.sft_config_lora import train_info_args,train_info_args_hf,train_info_args_colossalai,train_model_config
from config.sft_config_lora import train_info_args,train_info_args_hf,train_info_args_colossalai,train_info_args_ac,train_model_config
elif global_args["enable_ptv2"]:
from config.sft_config_ptv2 import train_info_args,train_info_args_hf,train_info_args_colossalai,train_model_config
from config.sft_config_ptv2 import train_info_args,train_info_args_hf,train_info_args_colossalai,train_info_args_ac,train_model_config
else:
from config.sft_config import train_info_args,train_info_args_hf,train_info_args_colossalai,train_model_config
from config.sft_config import train_info_args,train_info_args_hf,train_info_args_colossalai,train_info_args_ac,train_model_config

assert global_args["trainer_backend"] in ["pl","hf","cl"]
assert global_args["trainer_backend"] in ["pl","hf","cl","ac"]

if global_args["trainer_backend"] == "hf":
train_info_args = train_info_args_hf
elif global_args["trainer_backend"] == "cl":
train_info_args = train_info_args_colossalai
elif global_args["trainer_backend"] == "ac":
train_info_args = train_info_args_ac
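Taken together, the additions to config/main.py keep one argument dict per backend and let `trainer_backend` select among them. Purely as an illustration (this table-driven form is not in the commit), the if/elif chain above is equivalent to a small lookup, assuming the same imports at the top of config/main.py:

```python
# Illustrative equivalent of the if/elif chain in config/main.py (not in the commit).
# The four dicts come from the config.sft_config* imports above; "pl" keeps the
# base train_info_args unchanged, just as the original code does.
_BACKEND_ARGS = {
    "pl": train_info_args,
    "hf": train_info_args_hf,
    "cl": train_info_args_colossalai,
    "ac": train_info_args_ac,  # new accelerator backend added by this commit
}
train_info_args = _BACKEND_ARGS[global_args["trainer_backend"]]
```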




54 changes: 54 additions & 0 deletions config/sft_config.py
@@ -199,3 +199,57 @@

}






train_info_args_ac = {
'data_backend': 'parquet', # one of record, lmdb, arrow_stream, arrow_file, parquet; very large datasets can use lmdb (note: lmdb uses more storage than record)
# pretrained model configuration
**train_model_config,

"output_dir": "./outputs_ac",
"overwrite_output_dir": True,
"num_train_epochs": 20,
"max_steps": -1,
"save_safetensors": False,
"save_strategy": "steps",
"save_steps": 1000,
"save_total_limit": 10,
"seed": 42,
"fp16": True,
'do_train': True,
'train_file': [ './data/finetune_train_examples.json' ],
'do_eval': False,
'do_predict': False,
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 2,
"gradient_accumulation_steps": 1,
"evaluation_strategy": "no",
"eval_steps": 100,
"optim": "adamw_torch",
"lr_scheduler_type": "cosine", # one of linear,cosine,cosine_with_restarts,polynomial,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau
"torch_compile": False,
"learning_rate": 2e-5,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_epsilon": 1e-8,
"max_grad_norm": 1.0,
"weight_decay": 0.,
"warmup_ratio": 0.03,
"logging_strategy": "steps",
"logging_steps": 10,
"tf32": False,
"gradient_checkpointing": True,
'max_seq_length': 512, #
'max_target_length': 100, # maximum prediction length (reserved field)
'use_fast_tokenizer': False,
# 'do_lower_case': False,
"dataloader_drop_last": True,
"dataloader_pin_memory": True,
"dataloader_num_workers": 0,

"log_level": "info", # 'info', 'warning', 'error' and 'critical , passive',

}
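Downstream, train_info_args_ac is fed to the same argument-parsing path as the other backends. A minimal sketch of that flow follows; the parser call mirrors the new 'ac' branch added to data_utils.py later in this commit, while the direct import of train_info_args_ac and the final print are illustrative assumptions.

```python
# Minimal sketch: parse the accelerator config into dataclasses, mirroring the
# new 'ac' branch in data_utils.py (not verbatim code from the commit).
from transformers import HfArgumentParser
from deep_training.data_helper import ModelArguments, DataArguments, TrainingArgumentsAC
from aigc_zoo.model_zoo.t5.llm_model import PetlArguments, PromptArguments

from config.sft_config import train_info_args_ac

parser = HfArgumentParser(
    (ModelArguments, TrainingArgumentsAC, DataArguments, PetlArguments, PromptArguments),
    conflict_handler="resolve",  # avoid clashes when dataclasses share field names
)
model_args, training_args, data_args, lora_args, prompt_args = parser.parse_dict(
    train_info_args_ac,
    allow_extra_keys=True,  # tolerate keys that none of these dataclasses declare
)
print(training_args.output_dir)  # expected: "./outputs_ac"
```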
65 changes: 65 additions & 0 deletions config/sft_config_lora.py
@@ -268,4 +268,69 @@
'adalora': adalora_info_args,
"ia3": ia3_info_args,

}











train_info_args_ac = {
'data_backend': 'parquet',
# one of record, lmdb, arrow_stream, arrow_file, parquet; very large datasets can use lmdb (note: lmdb uses more storage than record)
# pretrained model configuration
**train_model_config,

"output_dir": "./outputs_ac",
"overwrite_output_dir": True,
"num_train_epochs": 20,
"max_steps": -1,
"save_safetensors": False,
"save_strategy": "steps",
"save_steps": 1000,
"save_total_limit": 10,
"seed": 42,
"fp16": True,
'do_train': True,
'train_file': [ './data/finetune_train_examples.json'],
'do_eval': False,
'do_predict': False,
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 2,
"gradient_accumulation_steps": 1,
"evaluation_strategy": "no",
"eval_steps": 100,
"optim": "adamw_torch",
"lr_scheduler_type": "cosine", # one of linear,cosine,cosine_with_restarts,polynomial,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau
"torch_compile": False,
"learning_rate": 2e-5,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_epsilon": 1e-8,
"max_grad_norm": 1.0,
"weight_decay": 0.,
"warmup_ratio": 0.03,
"logging_strategy": "steps",
"logging_steps": 10,
"tf32": False,
"gradient_checkpointing": True,
'max_seq_length': 512, #
'max_target_length': 100, # maximum prediction length (reserved field)
'use_fast_tokenizer': False,
# 'do_lower_case': False,
"dataloader_drop_last": True,
"dataloader_pin_memory": True,
"dataloader_num_workers": 0,

"log_level": "info", # 'info', 'warning', 'error' and 'critical , passive',
############## lora模块
'lora': lora_info_args,
'adalora': adalora_info_args,
"ia3": ia3_info_args,

}
63 changes: 63 additions & 0 deletions config/sft_config_ptv2.py
@@ -204,6 +204,69 @@

"log_level": "info", # 'info', 'warning', 'error' and 'critical , passive',

############## lora module
'prompt': prompt_info_args,
}












train_info_args_ac = {
'data_backend': 'parquet',
# one of record, lmdb, arrow_stream, arrow_file, parquet; very large datasets can use lmdb (note: lmdb uses more storage than record)
# pretrained model configuration
**train_model_config,

"output_dir": "./outputs_ac",
"overwrite_output_dir": True,
"num_train_epochs": 20,
"max_steps": -1,
"save_safetensors": False,
"save_strategy": "steps",
"save_steps": 1000,
"save_total_limit": 10,
"seed": 42,
"fp16": True,
'do_train': True,
'train_file': [ './data/finetune_train_examples.json'],
'do_eval': False,
'do_predict': False,
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 2,
"gradient_accumulation_steps": 1,
"evaluation_strategy": "no",
"eval_steps": 100,
"optim": "adamw_torch",
"lr_scheduler_type": "cosine",# one of linear,cosine,cosine_with_restarts,polynomial,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau
"torch_compile": False,
"learning_rate": 2e-5,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_epsilon": 1e-8,
"max_grad_norm": 1.0,
"weight_decay": 0.,
"warmup_ratio": 0.03,
"logging_strategy": "steps",
"logging_steps": 10,
"tf32": False,
"gradient_checkpointing": True,
'max_seq_length': 512, #
'max_target_length': 100, # maximum prediction length (reserved field)
'use_fast_tokenizer': False,
# 'do_lower_case': False,
"dataloader_drop_last": True,
"dataloader_pin_memory": True,
"dataloader_num_workers": 0,

"log_level": "info", # 'info', 'warning', 'error' and 'critical , passive',
############## lora模块
'prompt': prompt_info_args,
}
9 changes: 7 additions & 2 deletions data_utils.py
@@ -10,7 +10,7 @@
import numpy as np
import torch
from deep_training.data_helper import DataHelper, ModelArguments, TrainingArguments, DataArguments, TrainingArgumentsHF, \
TrainingArgumentsCL
TrainingArgumentsCL, TrainingArgumentsAC
from aigc_zoo.model_zoo.t5.llm_model import PetlArguments,PromptArguments
from fastdatasets.record import load_dataset as Loader, RECORD, WriterObject, gfile
from tqdm import tqdm
@@ -271,11 +271,16 @@ def make_dataset_all(self):
elif global_args[ "trainer_backend" ] == "pl":
parser = HfArgumentParser((ModelArguments, TrainingArguments, DataArguments, PetlArguments, PromptArguments))
model_args, training_args, data_args, _, _ = parser.parse_dict(train_info_args)
else:
elif global_args[ "trainer_backend" ] == "cl":
parser = HfArgumentParser((ModelArguments, TrainingArgumentsCL, DataArguments, PetlArguments, PromptArguments),
conflict_handler='resolve')
model_args, training_args, data_args, lora_args, prompt_args = parser.parse_dict(train_info_args,
allow_extra_keys=True, )
else:
parser = HfArgumentParser((ModelArguments, TrainingArgumentsAC, DataArguments, PetlArguments, PromptArguments),
conflict_handler='resolve')
model_args, training_args, data_args, lora_args, prompt_args = parser.parse_dict(train_info_args,
allow_extra_keys=True, )

dataHelper = NN_DataHelper(model_args, training_args, data_args)
tokenizer, config, label2id, id2label = dataHelper.load_tokenizer_and_config()
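The new branch follows the same shape as the existing hf and cl cases; only the TrainingArguments dataclass changes. As a compact summary of the backend-to-dataclass mapping this function now implements (illustrative only, not code from the commit):

```python
# Which training-argument dataclass each trainer_backend uses in make_dataset_all
# (summary of the if/elif chain above; import path taken from this file's header).
from deep_training.data_helper import (
    TrainingArguments,      # 'pl'
    TrainingArgumentsHF,    # 'hf'
    TrainingArgumentsCL,    # 'cl'
    TrainingArgumentsAC,    # 'ac' (added by this commit)
)

BACKEND_TRAINING_ARGS = {
    "pl": TrainingArguments,
    "hf": TrainingArgumentsHF,
    "cl": TrainingArgumentsCL,
    "ac": TrainingArgumentsAC,
}
```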