
Commit

update benchmark settings
jstzwj committed Mar 1, 2024
1 parent cdb57fa commit c6caefe
Showing 5 changed files with 28 additions and 5 deletions.
21 changes: 21 additions & 0 deletions README.md
@@ -1,2 +1,23 @@
# katheryne
Easy Language Model Trainer


## Let's benchmark LLM training on GPUs with katheryne

```
Stage: Pretrain
Model: meta-llama/Llama-2-7b-hf
Dataset: bigscience-data/roots_zh-cn_wikipedia
per_device_train_batch_size: 2
accumulate_grad_batches: 64
max_seq_len: 512
max_steps: 100
gradient_checkpointing: true
dtype: bf16
lora: {"r": 16, "target_modules": ["q_proj", "v_proj"]}
```
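For reference, a minimal sketch of the effective batch size these settings imply (derived only from the values above; single-GPU training is assumed):

```python
# Effective batch size per optimizer step, from the benchmark settings above.
per_device_train_batch_size = 2
accumulate_grad_batches = 64
max_seq_len = 512
num_devices = 1  # assumption: one GPU per benchmark run

sequences_per_step = per_device_train_batch_size * accumulate_grad_batches * num_devices  # 128
tokens_per_step = sequences_per_step * max_seq_len  # 65,536
print(f"{sequences_per_step} sequences / {tokens_per_step} tokens per optimizer step")
```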

| GPU | Time | Memory |
|-------------|-------|--------------|
| RTX 3090 | | 14,928MiB |
| Tesla A800 | | |
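The `lora` line above corresponds to rank-16 adapters on the attention query/value projections only. A minimal sketch of an equivalent peft-style configuration (whether katheryne applies LoRA through peft internally is an assumption, not something this commit shows):

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# Load the base model and wrap it with the LoRA settings from the benchmark config.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype="auto")
lora_config = LoraConfig(r=16, target_modules=["q_proj", "v_proj"])
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the q_proj/v_proj adapters are trainable
```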
4 changes: 2 additions & 2 deletions hparams/benchmark_chat_llama2_7b_lora.json
@@ -8,7 +8,7 @@
"flash_attention_2": false,
"per_device_train_batch_size": 4,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 8,
"accumulate_grad_batches": 1,
"max_seq_len": 512,
"checkpoint_every_n_train_steps": 100,
"log_every_n_steps": 1,
@@ -20,7 +20,7 @@
"lr_decay": 0.999875,
"lr_scheduler_type": "cosine",
"num_warmup_steps": 100,
"max_epochs": 300,
"max_steps": 100,
"disable_dropout": true,
"model_torch_dtype": "auto",
"bf16": true,
4 changes: 2 additions & 2 deletions hparams/benchmark_pretrain_llama2_7b_lora.json
@@ -6,7 +6,7 @@
"model_name_or_path": "meta-llama/Llama-2-7b-hf",
"per_device_train_batch_size": 2,
"per_device_eval_batch_size": 4,
"accumulate_grad_batches": 64,
"accumulate_grad_batches": 1,
"max_seq_len": 512,
"checkpoint_every_n_train_steps": 1000,
"log_every_n_steps": 1,
@@ -18,7 +18,7 @@
"lr_decay": 0.999875,
"lr_scheduler_type": "cosine",
"num_warmup_steps": 100,
"max_epochs": 3,
"max_steps": 100,
"disable_dropout": true,
"model_torch_dtype": "auto",
"bf16": true,
3 changes: 2 additions & 1 deletion katheryne/train/train.py
@@ -234,7 +234,8 @@ def train(create_dataset, lightning_module_class):
trainer_params["gradient_clip_algorithm"] = hparams.get("gradient_clip_algorithm", "norm")
trainer_params["gradient_clip_val"] = hparams.get("gradient_clip_val", None)

trainer_params["max_epochs"] = hparams.get("max_epochs", 1000)
trainer_params["max_epochs"] = hparams.get("max_epochs", None)
trainer_params["max_steps"] = hparams.get("max_steps", -1)
trainer_params["accumulate_grad_batches"] = hparams.get("accumulate_grad_batches", 1)

# Profiler
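For context, a minimal sketch of how these three settings behave once forwarded to a PyTorch Lightning `Trainer` (the `Trainer(**trainer_params)` call and the import path are assumptions; only the `hparams.get(...)` lines appear in this diff):

```python
import pytorch_lightning as pl  # assumption: katheryne may use `lightning.pytorch` instead

hparams = {"max_steps": 100, "accumulate_grad_batches": 64}  # e.g. values from the benchmark configs

trainer_params = {}
# max_epochs=None together with max_steps=-1 leaves Lightning's own epoch-based default in place;
# setting max_steps stops training after a fixed number of optimizer steps, which is what the
# benchmark configs rely on (max_steps: 100) regardless of dataset size.
trainer_params["max_epochs"] = hparams.get("max_epochs", None)
trainer_params["max_steps"] = hparams.get("max_steps", -1)
trainer_params["accumulate_grad_batches"] = hparams.get("accumulate_grad_batches", 1)

trainer = pl.Trainer(**trainer_params)
```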
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,3 +9,4 @@ datasets==2.15.0
blosc==1.11.1
tensorboardX==2.6.2.2
tensorboard==2.15.1
pydantic==2.6.3
