Commit c404973 (v0.2.5)

Signed-off-by: ssbuild <[email protected]>
ssbuild committed Oct 7, 2023
1 parent 203f71b commit c404973
Showing 9 changed files with 539 additions and 12 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -2,6 +2,7 @@
- [deep_training](https://github.com/ssbuild/deep_training)

```text
10-07 support colossalai trainer
09-26 support transformers trainer
08-02 add multi lora infer example; upgrade aigc_zoo manually: pip install -U git+https://github.com/ssbuild/aigc_zoo.git --force-reinstall --no-deps
06-13 support resize_token_embeddings
@@ -104,8 +105,18 @@ a answer must
Note: num_process_worker is the number of worker processes used to build the dataset; for large datasets, increase it up to the number of CPU cores.
dataHelper.make_dataset_with_args(data_args.train_file,mixed_data=False, shuffle=True,mode='train',num_process_worker=0)
# training
# pl training
python train.py
# hf training
python -m torch.distributed.launch --nproc_per_node=1 train_hf.py
# multi-node, multi-GPU
python -m torch.distributed.launch --nproc_per_node=1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT train_hf.py
# colossalai training
colossalai run --nproc_per_node 1 --num_nodes 1 train_cl.py
```

## Training parameters
119 changes: 119 additions & 0 deletions config/colossalai_config.py
@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
# @Time: 21:55
# @Author: tk
# @File:colossalai_config


colossalai_strategy = {
"ddp": dict(
name="ddp",
broadcast_buffers= True,
bucket_cap_mb = 25,
find_unused_parameters = False,
check_reduction = False,
gradient_as_bucket_view = False,
static_graph = False,
),
"gemini":dict(
name="gemini",
chunk_config_dict = None,
chunk_init_device= None,
placement_policy = "static",
shard_param_frac = 1.0, # only for static placement
offload_optim_frac = 0.0, # only for static placement
offload_param_frac = 0.0, # only for static placement
warmup_non_model_data_ratio = 0.8, # only for auto placement
steady_cuda_cap_ratio = 0.9, # only for auto placement
precision = "fp16",
pin_memory = False,
force_outputs_fp32 = False,
strict_ddp_mode = False,
search_range_m = 32,
hidden_dim = None,
min_chunk_size_m = 32,
memstats = None,
gpu_margin_mem_ratio = 0.0,
initial_scale = 2 ** 16,
min_scale = 1,
growth_factor = 2,
backoff_factor = 0.5,
growth_interval = 1000,
hysteresis = 2,
max_scale = 2 ** 32,
max_norm = 1.0,
norm_type = 2.0,
verbose = False,
),
"zero2" : dict(
name="zero2",
stage = 2,
precision = "fp16",
initial_scale = 2 ** 32,
min_scale = 1,
growth_factor = 2,
backoff_factor = 0.5,
growth_interval = 1000,
hysteresis = 2,
max_scale = 2 ** 32,
max_norm = 1.0,
norm_type = 2.0,
reduce_bucket_size_in_m = 12,
communication_dtype= None,
overlap_communication = True,
cpu_offload = False,
verbose = False,
),
"zero2_cpu" : dict(
name="zero2_cpu",
stage = 2,
precision = "fp16",
initial_scale = 2 ** 32,
min_scale = 1,
growth_factor = 2,
backoff_factor = 0.5,
growth_interval = 1000,
hysteresis = 2,
max_scale = 2 ** 32,
max_norm = 1.0,
norm_type = 2.0,
reduce_bucket_size_in_m = 12,
communication_dtype= None,
overlap_communication = True,
cpu_offload=True,
verbose = False,
),
"3d": dict(
name="3d",
tp_size =1,
pp_size = 1,
precision = "fp16",
zero_stage = 0,
enable_all_optimization = False,
enable_fused_normalization = False,
enable_flash_attention = False,
enable_jit_fused = False,
enable_sequence_parallelism = False,
enable_sequence_overlap = False,
num_microbatches = None,
microbatch_size = None,
initial_scale = 2 ** 16,
min_scale = 1,
growth_factor = 2,
backoff_factor = 0.5,
growth_interval = 1000,
hysteresis = 2,
max_scale = 2 ** 32,
max_norm = 0,
broadcast_buffers = True,
ddp_bucket_cap_mb = 25,
find_unused_parameters = False,
check_reduction = False,
gradient_as_bucket_view = False,
static_graph = False,
zero_bucket_size_in_m = 12,
cpu_offload = False,
communication_dtype= None,
overlap_communication = True,
custom_policy = None,
)
}
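
As a reading aid, here is a minimal sketch (not part of this commit) of how one of these presets could be picked and adjusted before being passed on as a trainer's strategy value. The choice of the "zero2" key and the cpu_offload override are purely illustrative, and the import assumes the repo's config package layout.

```python
# Illustrative only: copy a preset so the shared template stays untouched,
# then apply a per-run override before handing it to the trainer.
from copy import deepcopy

from config.colossalai_config import colossalai_strategy

strategy = deepcopy(colossalai_strategy["zero2"])
strategy["cpu_offload"] = True  # hypothetical per-run override
print(strategy["name"], strategy["stage"], strategy["cpu_offload"])
```
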
17 changes: 9 additions & 8 deletions config/main.py
@@ -10,7 +10,7 @@
global_args = {
# training configuration
**dict(
trainer_backend ='pl', # one of pl , hf
trainer_backend ='pl', # one of pl , hf , cl
enable_deepspeed = False,
enable_ptv2 = False,
enable_lora = True,
@@ -36,15 +36,18 @@


if global_args["enable_lora"]:
from config.sft_config_lora import train_info_args,train_info_args_hf,train_model_config
from config.sft_config_lora import train_info_args,train_info_args_hf,train_info_args_colossalai,train_model_config
elif global_args["enable_ptv2"]:
from config.sft_config_ptv2 import train_info_args,train_info_args_hf,train_model_config
from config.sft_config_ptv2 import train_info_args,train_info_args_hf,train_info_args_colossalai,train_model_config
else:
from config.sft_config import train_info_args,train_info_args_hf,train_model_config
from config.sft_config import train_info_args,train_info_args_hf,train_info_args_colossalai,train_model_config

assert global_args["trainer_backend"] in ["pl","hf","cl"]

if global_args["trainer_backend"] == "hf":
train_info_args = train_info_args_hf
elif global_args["trainer_backend"] == "cl":
train_info_args = train_info_args_colossalai



@@ -61,8 +64,6 @@ def patch_args(train_info_args):

if global_args["enable_lora"]:
# check whether lora or adalora is enabled
if 'lora' not in train_info_args and 'adalora' not in train_info_args:
raise ValueError('please config lora or adalora')
assert train_info_args.get('lora',{}).get('with_lora',False) + \
train_info_args.get('adalora',{}).get('with_lora',False) + \
train_info_args.get('ia3',{}).get('with_lora',False) == 1 , ValueError('lora adalora ia3 can set one at same time !')
@@ -72,8 +73,8 @@ def patch_args(train_info_args):
train_info_args.pop('lora', None)
train_info_args.pop('adalora', None)
train_info_args.pop('ia3', None)
if hasattr(train_info_args,"gradient_checkpointing"):
train_info_args.gradient_checkpointing = False
if "gradient_checkpointing" in train_info_args:
train_info_args[ "gradient_checkpointing" ] = False
else:
train_info_args.pop('lora',None)
train_info_args.pop('adalora', None)
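
For clarity, the adapter check that patch_args asserts above can be restated as a small sketch; the helper below is hypothetical and only mirrors the rule that exactly one of lora, adalora and ia3 may have with_lora enabled at a time.

```python
# Hypothetical restatement of the assert in patch_args: exactly one of the
# lora / adalora / ia3 sections may have with_lora set to True.
def check_single_adapter(train_info_args: dict) -> None:
    enabled = sum(
        bool(train_info_args.get(name, {}).get('with_lora', False))
        for name in ('lora', 'adalora', 'ia3')
    )
    if enabled != 1:
        raise ValueError('exactly one of lora, adalora, ia3 may be enabled')

check_single_adapter({'lora': {'with_lora': True}, 'adalora': {'with_lora': False}})  # passes
```
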
71 changes: 71 additions & 0 deletions config/sft_config.py
@@ -3,6 +3,7 @@
import json
import os

from config.colossalai_config import colossalai_strategy
from config.constant_map import train_model_config


@@ -128,3 +129,73 @@














train_info_args_colossalai = {
'data_backend': 'parquet', # one of record, lmdb, arrow_stream, arrow_file, parquet; very large datasets can use lmdb (note: lmdb takes more storage than record)
# pretrained model configuration
**train_model_config,

# currently only ddp supports lora
"strategy": colossalai_strategy["ddp"], # ddp,gemini,zero2,zero2_cpu,3d
"output_dir": "./outputs_cl",
"overwrite_output_dir": True,
"num_train_epochs": 20,
"max_steps": -1,
"save_safetensors": False,
"save_strategy": "steps",
"save_steps": 1000,
"save_total_limit": 10,
"seed": 42,
"fp16": True,
'do_train': True,
'train_file': [ './data/finetune_train_examples.json' ],
'do_eval': False,
'do_predict': False,
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 2,
"gradient_accumulation_steps": 1, # colossalai不支持梯度积累
"evaluation_strategy": "no",
"eval_steps": 100,
# Optimizer: if the strategy is gemini, optim must be one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl
# for non-gemini strategies, optim can be one of the following:
# one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl,lamb,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,
# adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion,lion_8bit,lion_32bit,
# paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,
# lamb_fused_dp adagrad_cpu_dp adam_cpu_dp adam_fused_dp
"optim": "adam_hybrid_cl", # 推荐 one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl
"lr_scheduler_type": "cosine", # one of linear,cosine,cosine_with_restarts,polynomial,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau
"torch_compile": False,
"learning_rate": 2e-5,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_epsilon": 1e-8,
"max_grad_norm": 1.0,
"weight_decay": 0.,
"warmup_ratio": 0.03,
"logging_strategy": "steps",
"logging_steps": 10,
"tf32": False,
"gradient_checkpointing": True,
'max_seq_length': 512, #
'max_target_length': 100, # maximum prediction length (reserved field)
'use_fast_tokenizer': False,
# 'do_lower_case': False,
"dataloader_drop_last": True,
"dataloader_pin_memory": True,
"dataloader_num_workers": 0,

"log_level": "info", # 'info', 'warning', 'error' and 'critical , passive',


}
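
The optimizer comments above tie the gemini strategy to ColossalAI's own Adam variants. A hedged sketch of that rule, assuming the strategy dicts and optim strings are shaped exactly as configured here (check_optim itself is hypothetical):

```python
# Hypothetical guard for the rule in the comments: with the gemini strategy,
# only the ColossalAI hybrid/CPU/fused Adam optimizers are expected to work.
GEMINI_OPTIMS = {"adam_hybrid_cl", "adam_cpu_cl", "adam_fused_cl"}

def check_optim(strategy_name: str, optim: str) -> None:
    if strategy_name == "gemini" and optim not in GEMINI_OPTIMS:
        raise ValueError(f"gemini requires one of {sorted(GEMINI_OPTIMS)}, got {optim!r}")

check_optim("ddp", "adam_hybrid_cl")     # fine: non-gemini strategies accept more optimizers
check_optim("gemini", "adam_fused_cl")   # fine
# check_optim("gemini", "adamw_torch")   # would raise ValueError
```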

73 changes: 73 additions & 0 deletions config/sft_config_lora.py
@@ -2,6 +2,8 @@
# @Time : 2023/5/24 15:53
import json
import os

from config.colossalai_config import colossalai_strategy
from config.constant_map import (train_model_config,
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING,
@@ -196,3 +198,74 @@
}











train_info_args_colossalai = {
'data_backend': 'parquet', # one of record, lmdb, arrow_stream, arrow_file, parquet; very large datasets can use lmdb (note: lmdb takes more storage than record)
# pretrained model configuration
**train_model_config,

# currently only ddp supports lora
"strategy": colossalai_strategy[ "ddp" ], # ddp,gemini,zero2,zero2_cpu,3d
"output_dir": "./outputs_cl",
"overwrite_output_dir": True,
"num_train_epochs": 20,
"max_steps": -1,
"save_safetensors": False,
"save_strategy": "steps",
"save_steps": 1000,
"save_total_limit": 10,
"seed": 42,
"fp16": True,
'do_train': True,
'train_file': [ './data/finetune_train_examples.json' ],
'do_eval': False,
'do_predict': False,
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 2,
"gradient_accumulation_steps": 1, # colossalai不支持梯度积累
"evaluation_strategy": "no",
"eval_steps": 100,

# Optimizer: if the strategy is gemini, optim must be one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl
# for non-gemini strategies, optim can be one of the following:
# one of adam_hybrid_cl,adam_cpu_cl,adam_fused_cl,lamb,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,
# adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion,lion_8bit,lion_32bit,
# paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,
# lamb_fused_dp adagrad_cpu_dp adam_cpu_dp adam_fused_dp
"optim": "adam_hybrid_cl",
"lr_scheduler_type": "cosine", # one of linear,cosine,cosine_with_restarts,polynomial,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau
"torch_compile": False,
"learning_rate": 2e-5,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_epsilon": 1e-8,
"max_grad_norm": 1.0,
"weight_decay": 0.,
"warmup_ratio": 0.03,
"logging_strategy": "steps",
"logging_steps": 10,
"tf32": False,
"gradient_checkpointing": True,
'max_seq_length': 512, #
'max_target_length': 100, # maximum prediction length (reserved field)
'use_fast_tokenizer': False,
# 'do_lower_case': False,
"dataloader_drop_last": True,
"dataloader_pin_memory": True,
"dataloader_num_workers": 0,

"log_level": "info", # 'info', 'warning', 'error' and 'critical , passive',
############## lora模块
'lora': lora_info_args,
'adalora': adalora_info_args,
"ia3": ia3_info_args,

}
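
Because the comment above notes that only the ddp strategy currently supports lora, here is a hedged sketch of that constraint; the helper and its call are assumptions for illustration, not code from this repo.

```python
# Hypothetical check: if any adapter (lora / adalora / ia3) is enabled,
# the ColossalAI strategy is expected to be ddp.
def check_adapter_strategy(args: dict) -> None:
    uses_adapter = any(
        args.get(name, {}).get('with_lora', False)
        for name in ('lora', 'adalora', 'ia3')
    )
    if uses_adapter and args.get("strategy", {}).get("name") != "ddp":
        raise ValueError("lora/adalora/ia3 currently require the ddp strategy")

check_adapter_strategy({"strategy": {"name": "ddp"}, "lora": {"with_lora": True}})  # passes
```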

