add conversation based dataset
jstzwj committed Jun 23, 2024
1 parent b6c1abe commit 75ea886
Showing 13 changed files with 446 additions and 481 deletions.
32 changes: 2 additions & 30 deletions README.md
@@ -1,33 +1,5 @@
# katheryne
Easy Language Model Trainer


## Let's benchmark LLM training on GPUs with katheryne

### Training settings - 7B Llama (100 steps)
```
Stage: Pretrain
Model: meta-llama/Llama-2-7b-hf
Dataset: bigscience-data/roots_zh-cn_wikipedia
per_device_train_batch_size: 2
accumulate_grad_batches: 1
max_seq_len: 512
max_steps: 100
gradient_checkpointing: true
dtype: bf16
lora: {"r": 16, "target_modules": ["q_proj", "v_proj"]}
```

| GPU | Time (mm:ss) | Total VRAM | Peak Memory Used |
|------------------------------|---------|----------------|----------------|
| NVIDIA L40S | 00:38 | 48G | 15,134MiB |
| NVIDIA RTX 6000 Ada Generation | 00:38 | 48G | 15,134MiB |
| NVIDIA A800 80GB PCIe | 00:41 | 80G | 14,850MiB |
| NVIDIA A100 40GB PCIe | 00:44 | 40G | 14,863MiB |
| NVIDIA GeForce RTX 4090 | 01:02 | 24G | 15,078MiB |
| Iluvatar BI-V150 | 01:09 | 32G | 22,798MiB |
| NVIDIA RTX A6000 | 01:13 | 48G | 14,944MiB |
| NVIDIA A40 | 01:16 | 48G | 15,809MiB |
| NVIDIA GeForce RTX 3090 | 01:36 | 24G | 14,928MiB |


## Model Support
* LLaMA, Llama 2, Llama 3, GLM, BLOOM, OPT, GPT-2, GPT-Neo, GPT-BigCode, Qwen, Baichuan, and so on.
27 changes: 27 additions & 0 deletions docs/benchmark.md
@@ -0,0 +1,27 @@
## Let's benchmark LLM training on GPUs with katheryne

### Training settings - 7B Llama (100 steps)
```
Stage: Pretrain
Model: meta-llama/Llama-2-7b-hf
Dataset: bigscience-data/roots_zh-cn_wikipedia
per_device_train_batch_size: 2
accumulate_grad_batches: 1
max_seq_len: 512
max_steps: 100
gradient_checkpointing: true
dtype: bf16
lora: {"r": 16, "target_modules": ["q_proj", "v_proj"]}
```
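
The `lora` entry above presumably maps onto a PEFT-style LoRA configuration. A minimal sketch of that mapping (assuming the trainer wraps the base model with the `peft` library, which this diff does not show):

```python
# Hypothetical illustration of the benchmark's LoRA setting; whether katheryne
# uses peft internally is an assumption, not something shown in this commit.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
lora_config = LoraConfig(
    r=16,                                 # rank from the settings above
    target_modules=["q_proj", "v_proj"],  # attention projections to adapt
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)  # only the LoRA adapters stay trainable
```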

| GPU | Time (mm:ss) | Total VRAM | Peak Memory Used |
|------------------------------|---------|----------------|----------------|
| NVIDIA L40S | 00:38 | 48G | 15,134MiB |
| NVIDIA RTX 6000 Ada Generation | 00:38 | 48G | 15,134MiB |
| NVIDIA A800 80GB PCIe | 00:41 | 80G | 14,850MiB |
| NVIDIA A100 40GB PCIe | 00:44 | 40G | 14,863MiB |
| NVIDIA GeForce RTX 4090 | 01:02 | 24G | 15,078MiB |
| Iluvatar BI-V150 | 01:09 | 32G | 22,798MiB |
| NVIDIA RTX A6000 | 01:13 | 48G | 14,944MiB |
| NVIDIA A40 | 01:16 | 48G | 15,809MiB |
| NVIDIA GeForce RTX 3090 | 01:36 | 24G | 14,928MiB |
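
The ~15 GiB peak usage across cards is dominated by the frozen bf16 base weights; a back-of-the-envelope check (an estimate under stated assumptions, not a measurement from this repository):

```python
# Rough memory estimate for the benchmark above (assumes ~7e9 parameters, bf16).
params = 7e9                # approximate Llama-2-7B parameter count
bytes_per_param = 2         # bf16 weights
weights_gib = params * bytes_per_param / 2**30
print(f"frozen base weights: {weights_gib:.1f} GiB")  # ~13.0 GiB
# The remaining ~1.5-2.5 GiB of the observed ~15 GiB plausibly covers activations
# (reduced by gradient checkpointing), the LoRA adapters and their optimizer
# states, and CUDA/framework overhead.
```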
6 changes: 3 additions & 3 deletions hparams/hparams_chat_ningyu_13b.json
@@ -133,9 +133,9 @@
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "meta-llama/Llama-2-13b-hf",
"atten_class": "eager",
- "per_device_train_batch_size": 8,
- "per_device_eval_batch_size": 8,
- "accumulate_grad_batches": 32,
+ "per_device_train_batch_size": 2,
+ "per_device_eval_batch_size": 2,
+ "accumulate_grad_batches": 2,
"max_seq_len": 2048,
"checkpoint_every_n_train_steps": 50,
"log_every_n_steps": 1,
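For context on the batch-size change above: the effective (global) batch size per optimizer step is `per_device_train_batch_size × accumulate_grad_batches × number of devices`, so on a single device this commit lowers it from 256 to 4 samples per step. A quick check:

```python
# Effective batch size per optimizer step (single-device assumption).
def effective_batch_size(per_device: int, accumulate: int, devices: int = 1) -> int:
    return per_device * accumulate * devices

print(effective_batch_size(8, 32))  # before this commit: 256
print(effective_batch_size(2, 2))   # after this commit: 4
```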
22 changes: 15 additions & 7 deletions katheryne/data/loader/chat.py
@@ -1,3 +1,9 @@
# coding=utf-8
# Copyright 2024 XiaHan
#
# Use of this source code is governed by an MIT-style
# license that can be found in the LICENSE file or at
# https://opensource.org/licenses/MIT.

import hashlib
import os
@@ -68,7 +74,7 @@ def create_chat_dataset(hparams: HParams, data_path: List[Union[str, DatasetPath
Creates the chat dataset
"""
tokenizer = load_hf_tokenizer(tokenizer_path, fast_tokenizer=True)
- data_path_obj = []
+ data_path_obj: List[DatasetPath] = []
for d_path in data_path:
if isinstance(d_path, str):
d_path_obj = DatasetPath.model_validate({
@@ -135,16 +141,18 @@ def create_chat_dataset(hparams: HParams, data_path: List[Union[str, DatasetPath
# eval_dataset = datasets.load_from_disk(eval_fname)

# torch.distributed.barrier()
- train_dataset = ChatDataset(tokenizer_path,
- max_seq_len,
- train_dataset,
+ train_dataset = ChatDataset(
+ train_dataset,
+ tokenizer_path,
+ max_seq_len,
tokenizer.pad_token_id,
conv_format=conv_format,
end_of_conversation=hparams.get("end_of_conversation", None)
)
- eval_dataset = ChatDataset(tokenizer_path,
- max_seq_len,
- eval_dataset,
+ eval_dataset = ChatDataset(
+ eval_dataset,
+ tokenizer_path,
+ max_seq_len,
tokenizer.pad_token_id,
conv_format=conv_format,
end_of_conversation=hparams.get("end_of_conversation", None)
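
The reordered call sites above imply that `ChatDataset` now takes the dataset as its first argument, followed by the tokenizer path, maximum sequence length, and pad token id. A minimal sketch of the constructor signature this suggests (inferred from the call sites only; the actual class is defined elsewhere in the repository and is not shown in this diff):

```python
# Inferred sketch: parameter names are read off the call sites above, not copied
# from katheryne's real ChatDataset implementation.
from typing import Optional

from torch.utils.data import Dataset


class ChatDataset(Dataset):
    def __init__(
        self,
        dataset,                    # conversation dataset built by create_chat_dataset
        tokenizer_path: str,        # path or hub id used to (re)load the tokenizer
        max_seq_len: int,           # truncation length per conversation
        pad_token_id: int,          # tokenizer.pad_token_id passed by the caller
        conv_format: Optional[str] = None,          # conversation template name (assumed type)
        end_of_conversation: Optional[str] = None,  # optional end-of-conversation marker
    ) -> None:
        super().__init__()
        ...
```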
(diffs for the remaining changed files not shown)
