peft merge option
jstzwj committed Jun 3, 2024
1 parent 81c959f commit aedb056
Showing 24 changed files with 192 additions and 29 deletions.
2 changes: 1 addition & 1 deletion hparams/benchmark_chat_llama2_7b_lora.json
@@ -5,7 +5,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "meta-llama/Llama-2-7b-hf",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 4,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 1,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_baichuan2_7b_lora.json
@@ -5,7 +5,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "baichuan-inc/Baichuan2-7B-Base",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 8,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_chatglm3_7b_lora.json
@@ -5,7 +5,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "THUDM/chatglm3-6b",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 16,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 8,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_deepseek_7b.json
@@ -6,7 +6,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "deepseek-ai/deepseek-llm-7b-base",
"flash_attention_2": false,
"atten_class": "eager",
"transformer_adamw": true,
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 8,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_deepseek_7b_chat_lora.json
@@ -6,7 +6,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "deepseek-ai/deepseek-llm-7b-chat",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 128,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_deepseek_7b_lora.json
@@ -6,7 +6,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "deepseek-ai/deepseek-llm-7b-base",
"flash_attention_2": true,
"atten_class": "flash",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 128,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_deepseek_coder_6.7b_lora.json
@@ -6,7 +6,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "deepseek-ai/deepseek-coder-6.7b-base",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 8,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_llama2_13b_lora.json
@@ -132,7 +132,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "meta-llama/Llama-2-13b-hf",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 4,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 64,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_llama2_7b_lora.json
@@ -5,7 +5,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "meta-llama/Llama-2-7b-hf",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 16,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 8,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_ningyu_13b.json
@@ -132,7 +132,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "meta-llama/Llama-2-13b-hf",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 32,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_openbuddy_deepseek_67b_lora.json
@@ -5,7 +5,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "OpenBuddy/openbuddy-deepseek-67b-v15.2",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 8,
2 changes: 1 addition & 1 deletion hparams/hparams_chat_openbuddy_llama2_13b_lora.json
@@ -5,7 +5,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "OpenBuddy/openbuddy-llama2-13b-v8.1-fp16",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 8,
32 changes: 32 additions & 0 deletions hparams/hparams_chat_qwen1.5_7b_chat_cj.json
@@ -0,0 +1,32 @@
{
"conv_format": "qwen",
"end_of_conversation": 151643,
"data_path": [
"/data/wangjun/github/cangjie/"
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "/data/wangjun/models/Qwen1.5-7B-Chat",
"atten_class": "eager",
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 64,
"max_seq_len": 2048,
"checkpoint_every_n_train_steps": 100,
"log_every_n_steps": 1,
"val_check_interval": 0.25,
"limit_val_batches": 0.1,
"learning_rate": 8e-6,
"betas": [0.9, 0.95],
"eps": 8e-6,
"lr_decay": 0.999875,
"lr_scheduler_type": "cosine",
"num_warmup_steps": 100,
"max_epochs": 300,
"disable_dropout": true,
"model_torch_dtype": "auto",
"bf16": true,
"gradient_checkpointing": true,
"weight_decay": 0.0,
"gradient_clip_algorithm": "norm",
"gradient_clip_val": 1.0
}
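
The new full-finetune config above combines AdamW-style hyperparameters (learning_rate, betas, eps, weight_decay) with a cosine schedule and warmup. A rough, illustrative sketch of how these keys could be wired together; the repository's actual optimizer setup lives in its Lightning modules and may differ:

```python
# Illustrative only: builds an optimizer/scheduler pair from the hparams above.
# num_training_steps is a placeholder; the real value depends on the dataloader.
import torch
from transformers import get_cosine_schedule_with_warmup

def configure_optimization(model, hparams, num_training_steps: int = 10_000):
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=hparams["learning_rate"],           # 8e-6
        betas=tuple(hparams["betas"]),         # (0.9, 0.95)
        eps=hparams["eps"],                    # 8e-6
        weight_decay=hparams["weight_decay"],  # 0.0
    )
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=hparams["num_warmup_steps"],  # 100
        num_training_steps=num_training_steps,
    )
    return optimizer, scheduler
```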
38 changes: 38 additions & 0 deletions hparams/hparams_chat_qwen1.5_7b_chat_cj_lora.json
@@ -0,0 +1,38 @@
{
"conv_format": "qwen",
"end_of_conversation": 151643,
"data_path": [
"/data/wangjun/github/cangjie/"
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "/data/wangjun/models/Qwen1.5-7B-Chat",
"atten_class": "sdpa",
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 64,
"max_seq_len": 2048,
"checkpoint_every_n_train_steps": 100,
"log_every_n_steps": 1,
"val_check_interval": 0.25,
"limit_val_batches": 0.1,
"learning_rate": 8e-6,
"betas": [0.9, 0.95],
"eps": 8e-6,
"lr_decay": 0.999875,
"lr_scheduler_type": "cosine",
"num_warmup_steps": 100,
"max_epochs": 300,
"disable_dropout": true,
"model_torch_dtype": "auto",
"bf16": true,
"gradient_checkpointing": true,
"weight_decay": 0.0,
"gradient_clip_algorithm": "norm",
"gradient_clip_val": 1.0,
"strategy": null,
"lora": {
"r": 128,
"target_modules": ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"]
},
"peft_merge": false
}
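
The "lora" block and the new "peft_merge" flag control adapter training and checkpoint export. A hedged sketch of how a config like this maps onto the PEFT library; LoraConfig and get_peft_model are real PEFT APIs, but the exact translation used by the repository is an assumption, and the public Hub id below stands in for the local model path in the config:

```python
# Assumption: the "lora" hparams block is translated into a peft.LoraConfig
# roughly like this. With "peft_merge": false, checkpoints would keep the
# adapter weights separate rather than folding them into the base model.
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

lora_hparams = {
    "r": 128,
    "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj",
                       "gate_proj", "up_proj", "down_proj", "lm_head"],
}

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-7B-Chat")
peft_model = get_peft_model(
    base_model,
    LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=lora_hparams["r"],
        target_modules=lora_hparams["target_modules"],
    ),
)
peft_model.print_trainable_parameters()
```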
37 changes: 37 additions & 0 deletions hparams/hparams_chat_qwen1.5_7b_cj_deepspeed.json
@@ -0,0 +1,37 @@
{
"conv_format": "qwen",
"end_of_conversation": 151643,
"data_path": [
"/data/wangjun/github/cangjie/"
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "/data/wangjun/models/Qwen1.5-7B",
"atten_class": "eager",
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 64,
"max_seq_len": 2048,
"checkpoint_every_n_train_steps": 100,
"log_every_n_steps": 1,
"val_check_interval": 0.25,
"limit_val_batches": 0.1,
"learning_rate": 8e-6,
"betas": [0.9, 0.95],
"eps": 8e-6,
"lr_decay": 0.999875,
"lr_scheduler_type": "cosine",
"num_warmup_steps": 100,
"max_epochs": 300,
"disable_dropout": true,
"model_torch_dtype": "auto",
"bf16": true,
"gradient_checkpointing": true,
"weight_decay": 0.0,
"gradient_clip_algorithm": "norm",
"gradient_clip_val": 1.0,
"strategy": "deepspeed",
"strategy_params": {
"offload": false,
"zero_stage": 2
}
}
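
This config selects the "deepspeed" strategy with ZeRO stage 2 and no offload. A sketch under the assumption that strategy and strategy_params feed a Lightning DeepSpeedStrategy; the repository's exact wiring (and whether it imports lightning.pytorch or pytorch_lightning) is not shown in this diff:

```python
# Assumption: "strategy": "deepspeed" plus strategy_params is forwarded to
# Lightning's DeepSpeedStrategy roughly like this.
import lightning.pytorch as pl
from lightning.pytorch.strategies import DeepSpeedStrategy

strategy_params = {"offload": False, "zero_stage": 2}

trainer = pl.Trainer(
    strategy=DeepSpeedStrategy(
        stage=strategy_params["zero_stage"],
        offload_optimizer=strategy_params["offload"],
        offload_parameters=strategy_params["offload"],
    ),
    precision="bf16-mixed",       # matches "bf16": true
    accumulate_grad_batches=64,   # from the config above
    gradient_clip_val=1.0,
)
```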
37 changes: 37 additions & 0 deletions hparams/hparams_chat_qwen1.5_7b_cj_lora.json
@@ -0,0 +1,37 @@
{
"conv_format": "qwen",
"end_of_conversation": 151643,
"data_path": [
"/data/wangjun/github/cangjie/"
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "/data/wangjun/models/Qwen1.5-7B",
"atten_class": "eager",
"per_device_train_batch_size": 4,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 32,
"max_seq_len": 2048,
"checkpoint_every_n_train_steps": 100,
"log_every_n_steps": 1,
"val_check_interval": 0.25,
"limit_val_batches": 0.1,
"learning_rate": 8e-6,
"betas": [0.9, 0.95],
"eps": 8e-6,
"lr_decay": 0.999875,
"lr_scheduler_type": "cosine",
"num_warmup_steps": 100,
"max_epochs": 300,
"disable_dropout": true,
"model_torch_dtype": "auto",
"bf16": true,
"gradient_checkpointing": true,
"weight_decay": 0.0,
"gradient_clip_algorithm": "norm",
"gradient_clip_val": 1.0,
"strategy": null,
"lora": {
"r": 128,
"target_modules": ["q_proj", "v_proj"]
}
}
2 changes: 1 addition & 1 deletion hparams/hparams_chat_qwen_7b_lora.json
@@ -6,7 +6,7 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "Qwen/Qwen-7B",
"flash_attention_2": false,
"atten_class": "eager",
"per_device_train_batch_size": 16,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 8,
3 changes: 2 additions & 1 deletion katheryne/light_modules/models/chat_model.py
@@ -89,7 +89,8 @@ def on_save_checkpoint(self, checkpoint):
self.model,
tokenizer=None,
output_dir=save_path,
sub_folder=f"checkpoint-step-{self.global_step}"
sub_folder=f"checkpoint-step-{self.global_step}",
peft_merge=self.hparams.get("peft_merge", False),
)

def configure_optimizers(self):
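
on_save_checkpoint now forwards a peft_merge flag into the save helper. A minimal sketch of what merging at save time can look like with the PEFT API; merge_and_unload is a real PeftModel method, but save_hf_format's internals are not part of this diff, so treat the helper below as an assumption:

```python
# Sketch of an optional adapter merge at checkpoint time; the repository's
# save_hf_format may behave differently.
import os
from peft import PeftModel

def save_checkpoint_hf(model, output_dir: str, sub_folder: str, peft_merge: bool = False):
    save_path = os.path.join(output_dir, sub_folder)
    if peft_merge and isinstance(model, PeftModel):
        # Fold the LoRA deltas into the base weights and save a plain HF model.
        merged = model.merge_and_unload()
        merged.save_pretrained(save_path)
    else:
        # Save the adapter weights only (for PEFT models) or the full model.
        model.save_pretrained(save_path)
```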
3 changes: 2 additions & 1 deletion katheryne/light_modules/models/instruction_model.py
@@ -90,7 +90,8 @@ def on_save_checkpoint(self, checkpoint):
self.model,
tokenizer=None,
output_dir="./lightning_logs/huggingface_format",
sub_folder=f"checkpoint-step-{self.global_step}"
sub_folder=f"checkpoint-step-{self.global_step}",
peft_merge=self.hparams.get("peft_merge", False),
)
if self.hparams.params.get("lora_dim", 0) > 0:
unfuse_linear_layer(self.model, self.deepspeed)
3 changes: 2 additions & 1 deletion katheryne/light_modules/models/pretrain_model.py
@@ -90,7 +90,8 @@ def on_save_checkpoint(self, checkpoint):
self.model,
tokenizer=None,
output_dir=save_path,
sub_folder=f"checkpoint-step-{self.global_step}"
sub_folder=f"checkpoint-step-{self.global_step}",
peft_merge=self.hparams.get("peft_merge", False),
)

def configure_optimizers(self):
12 changes: 8 additions & 4 deletions katheryne/tools/chatbot.py
@@ -23,11 +23,11 @@ def __init__(self, model, tokenizer, device) -> None:
self.tokenizer = tokenizer
self.device = device

def __call__(self, input_text: str, max_new_tokens: int=256) -> Any:
def __call__(self, input_text: str, max_new_tokens: int=256, skip_special_tokens=False) -> Any:
input_ids = self.tokenizer([input_text], return_tensors="pt", padding='longest', max_length=2048, truncation=True)["input_ids"].to(self.device)
outputs_ids = self.model.generate(inputs=input_ids, max_new_tokens=max_new_tokens,
eos_token_id=self.tokenizer.eos_token_id, pad_token_id=self.tokenizer.eos_token_id)
outputs = self.tokenizer.batch_decode(outputs_ids, skip_special_tokens=True)
outputs = self.tokenizer.batch_decode(outputs_ids, skip_special_tokens=skip_special_tokens)
output_text = [{"generated_text": each_text} for each_text in outputs]
return output_text

@@ -72,6 +72,10 @@ def load_local_model(path: str):
base_model_name = model_json_file["base_model_name_or_path"]
model_config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, from_tf=bool(".ckpt" in path), config=model_config, trust_remote_code=True)

# base_model.load_adapter(path)
# base_model.enable_adapters()
# model = base_model
model = PeftModelForCausalLM.from_pretrained(base_model, path)
else:
model_config = AutoConfig.from_pretrained(path, trust_remote_code=True)
@@ -83,7 +87,7 @@ def load_remote_model(path: str):
model = AutoModelForCausalLM.from_pretrained(path, from_tf=bool(".ckpt" in path), config=model_config, trust_remote_code=True)
return model

def get_generator(path, settings):
def get_generator(path, settings, device="cuda"):
if os.path.exists(path):
tokenizer = load_local_tokenizer(path)
else:
@@ -100,7 +104,7 @@ def get_generator(path, settings):
# model.config.pad_token_id = model.config.eos_token_id
# model.resize_token_embeddings(len(tokenizer))
# generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device="cuda", eos_token_id=tokenizer.eos_token_id)
generator = ChatPipeline(model=model, tokenizer=tokenizer, device="cuda")
generator = ChatPipeline(model=model, tokenizer=tokenizer, device=device)
return generator
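
With the new device argument on get_generator and skip_special_tokens on ChatPipeline.__call__, usage could look like the sketch below; the checkpoint path is a placeholder and settings=None stands in for whatever the tool's argument parsing normally supplies:

```python
# Hypothetical usage; the path and settings values are placeholders, not from the repo.
generator = get_generator(
    "./lightning_logs/huggingface_format/checkpoint-step-100",
    settings=None,
    device="cuda",
)
outputs = generator("Hello, who are you?", max_new_tokens=128, skip_special_tokens=True)
print(outputs[0]["generated_text"])
```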


4 changes: 2 additions & 2 deletions katheryne/train/train.py
@@ -116,7 +116,7 @@ def train(create_dataset, lightning_module_class):
tokenizer=tokenizer,
dtype=torch_dtype,
disable_dropout=hparams.disable_dropout,
use_flash_atten=hparams.get("flash_attention_2", False),
atten_class=hparams.get("atten_class", "eager"),
)

# Setup LORA
@@ -140,7 +140,7 @@ def train(create_dataset, lightning_module_class):
model.gradient_checkpointing_enable()

# Save Model
# save_hf_format(model, tokenizer, "./lightning_logs/huggingface_format", sub_folder=f"checkpoint-step-0")
# save_hf_format(model, tokenizer, "./lightning_logs/huggingface_format", sub_folder=f"checkpoint-step-0", peft_merge=hparams.get("peft_merge", False))

# Prepare the data
print("***** Prepare Dataset *****")