
Commit

dataset loader bug fix
jstzwj committed Jun 25, 2024
1 parent d87d9bc commit f1ab227
Showing 24 changed files with 64 additions and 36 deletions.
2 changes: 1 addition & 1 deletion hparams/hparams_chat_qwen1.5_4b.json
@@ -9,7 +9,7 @@
 
     ],
     "data_output_path": "./tmp/data_files/",
-    "model_name_or_path": "/data/wangjun/models/Qwen1.5-4B",
+    "model_name_or_path": "Qwen/Qwen1.5-4B",
     "atten_class": "eager",
     "per_device_train_batch_size": 2,
     "per_device_eval_batch_size": 8,
1 change: 1 addition & 0 deletions hparams/hparams_chat_qwen_7b_lora.json
@@ -1,4 +1,5 @@
 {
+    "train_stage": "chat",
     "conv_format": "qwen",
     "end_of_conversation": 151643,
     "data_path": [
5 changes: 3 additions & 2 deletions hparams/hparams_pretrain_bloom_1b1.json
@@ -1,18 +1,19 @@
 {
+    "train_stage": "pretrain",
     "data_path": [
         "bigscience-data/roots_zh-cn_wikipedia"
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "bigscience/bloom-1b1",
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
     "max_seq_len": 512,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 2e-5,
+    "learning_rate": 8e-6,
     "betas": [0.9, 0.95],
     "eps": 1e-6,
     "lr_decay": 0.999875,
5 changes: 3 additions & 2 deletions hparams/hparams_pretrain_bloom_1b1_lora.json
@@ -1,18 +1,19 @@
 {
+    "train_stage": "pretrain",
     "data_path": [
         "bigscience-data/roots_zh-cn_wikipedia"
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "bigscience/bloom-1b1",
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
     "max_seq_len": 512,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 2e-5,
+    "learning_rate": 8e-6,
     "betas": [0.9, 0.95],
     "eps": 1e-6,
     "lr_decay": 0.999875,
5 changes: 3 additions & 2 deletions hparams/hparams_pretrain_bloom_560m.json
@@ -1,18 +1,19 @@
 {
+    "train_stage": "pretrain",
     "data_path": [
         "bigscience-data/roots_zh-cn_wikipedia"
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "bigscience/bloom-560m",
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
     "max_seq_len": 512,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 2e-5,
+    "learning_rate": 8e-6,
     "betas": [0.9, 0.95],
     "eps": 1e-6,
     "lr_decay": 0.999875,
5 changes: 3 additions & 2 deletions hparams/hparams_pretrain_bloom_560m_dora.json
@@ -1,18 +1,19 @@
 {
+    "train_stage": "pretrain",
     "data_path": [
         "bigscience-data/roots_zh-cn_wikipedia"
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "bigscience/bloom-560m",
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
     "max_seq_len": 512,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 2e-5,
+    "learning_rate": 8e-6,
     "betas": [0.9, 0.95],
     "eps": 1e-6,
     "lr_decay": 0.999875,
5 changes: 3 additions & 2 deletions hparams/hparams_pretrain_bloom_560m_lora.json
@@ -1,18 +1,19 @@
 {
+    "train_stage": "pretrain",
     "data_path": [
         "bigscience-data/roots_zh-cn_wikipedia"
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "bigscience/bloom-560m",
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
     "max_seq_len": 512,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 2e-5,
+    "learning_rate": 8e-6,
     "betas": [0.9, 0.95],
     "eps": 1e-6,
     "lr_decay": 0.999875,
4 changes: 2 additions & 2 deletions hparams/hparams_pretrain_llama2_7b_ddp_lora.json
@@ -8,7 +8,7 @@
     "per_device_train_batch_size": 2,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
-    "max_seq_len": 1024,
+    "max_seq_len": 2048,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
@@ -35,5 +35,5 @@
         "lora_dropout": 0.2,
         "bias": "all",
         "target_modules": "all-linear"
-    },
+    }
 }
2 changes: 1 addition & 1 deletion hparams/hparams_reward_qwen1.5_4b.json
@@ -10,7 +10,7 @@
 
     ],
     "data_output_path": "./tmp/data_files/",
-    "model_name_or_path": "/data/wangjun/models/Qwen1.5-4B",
+    "model_name_or_path": "Qwen/Qwen1.5-4B",
     "atten_class": "eager",
     "per_device_train_batch_size": 2,
     "per_device_eval_batch_size": 8,
6 changes: 6 additions & 0 deletions katheryne/data/loader/chat.py
@@ -106,6 +106,8 @@ def create_chat_dataset(hparams: HParams, data_path: List[Union[str, DatasetPath
 
         if d_path.shuffle:
             train_dataset = train_dataset.shuffle(seed=hparams.get("seed", 43))
+
+        print(f"{d_path} - Dataset size: {len(train_dataset)}")
 
         if isinstance(d_path.sample, int):
             sample_size = d_path.sample
@@ -114,9 +116,13 @@ def create_chat_dataset(hparams: HParams, data_path: List[Union[str, DatasetPath
             if d_path.sample != 1.0:
                 sample_size = int(d_path.sample * len(train_dataset))
                 train_dataset = train_dataset.select(list(range(sample_size)))
+            else:
+                sample_size = len(train_dataset)
         else:
             raise TypeError("Invalid sample number of dataset path object, need int or float.")
 
+        print(f"{d_path} - Selected size: {sample_size}")
+
         if train_dataset is not None:
             train_datasets.append(train_dataset)
         if eval_dataset is not None:
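Note on the hunk above: the dataset loader bug named in the commit message appears to be the float branch of the sampling logic. Before this change, d_path.sample == 1.0 assigned no sample_size at all, so the value was undefined for any later use (including the size logging added here); the new else branch keeps the whole dataset and defines sample_size explicitly. The same fix and logging are applied to the instruction and pretrain loaders below. A minimal standalone sketch of the corrected selection logic, using a plain Python list in place of the Hugging Face dataset (the helper name and log format are illustrative, not the project's API):

from typing import List, Union

def select_subset(rows: List[dict], sample: Union[int, float]) -> List[dict]:
    # Resolve the requested sample into a concrete size; every branch defines sample_size.
    if isinstance(sample, int):
        sample_size = sample
        rows = rows[:sample_size]
    elif isinstance(sample, float):
        if sample != 1.0:
            sample_size = int(sample * len(rows))
            rows = rows[:sample_size]
        else:
            # The pre-fix code had no else here, leaving sample_size unbound when sample == 1.0.
            sample_size = len(rows)
    else:
        raise TypeError("Invalid sample number of dataset path object, need int or float.")
    print(f"Selected size: {sample_size}")
    return rows

With sample=0.5 this keeps the first half of the rows; with sample=1.0 it now keeps everything and still has a defined sample_size to report.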
6 changes: 6 additions & 0 deletions katheryne/data/loader/instruction.py
@@ -102,6 +102,8 @@ def create_instruction_dataset(hparams: HParams, data_path: List[Union[str, Data
 
         if d_path.shuffle:
             train_dataset = train_dataset.shuffle(seed=hparams.get("seed", 43))
+
+        print(f"{d_path} - Dataset size: {len(train_dataset)}")
 
         if isinstance(d_path.sample, int):
             sample_size = d_path.sample
@@ -110,9 +112,13 @@ def create_instruction_dataset(hparams: HParams, data_path: List[Union[str, Data
             if d_path.sample != 1.0:
                 sample_size = int(d_path.sample * len(train_dataset))
                 train_dataset = train_dataset.select(list(range(sample_size)))
+            else:
+                sample_size = len(train_dataset)
         else:
             raise TypeError("Invalid sample number of dataset path object, need int or float.")
 
+        print(f"{d_path} - Selected size: {sample_size}")
+
         if train_dataset is not None:
             train_datasets.append(train_dataset)
         if eval_dataset is not None:
6 changes: 6 additions & 0 deletions katheryne/data/loader/pretrain.py
@@ -115,6 +115,8 @@ def create_pretrain_dataset(hparams: HParams, data_path: List[Union[str, Dataset
 
         if d_path.shuffle:
             train_dataset = train_dataset.shuffle(seed=hparams.get("seed", 43))
+
+        print(f"{d_path} - Dataset size: {len(train_dataset)}")
 
         if isinstance(d_path.sample, int):
             sample_size = d_path.sample
@@ -123,9 +125,13 @@ def create_pretrain_dataset(hparams: HParams, data_path: List[Union[str, Dataset
             if d_path.sample != 1.0:
                 sample_size = int(d_path.sample * len(train_dataset))
                 train_dataset = train_dataset.select(list(range(sample_size)))
+            else:
+                sample_size = len(train_dataset)
         else:
             raise TypeError("Invalid sample number of dataset path object, need int or float.")
 
+        print(f"{d_path} - Selected size: {sample_size}")
+
         if train_dataset is not None:
             train_datasets.append(train_dataset)
         if eval_dataset is not None:
33 changes: 19 additions & 14 deletions katheryne/datasets/conversation_dataset.py
@@ -34,14 +34,9 @@ def __init__(self, tokenizer_path: str, max_seq_len: int, pad_token_id: int,
             self.settings = get_conv_settings(conv_format)
         else:
             self.settings = conv_format
-        if end_of_conversation is None:
-            if self.tokenizer.eos_token_id is None:
-                self.end_of_conversation = self.tokenizer.pad_token_id
-            else:
-                self.end_of_conversation = self.tokenizer.eos_token_id
-        else:
-            self.end_of_conversation = end_of_conversation
+
+        self.end_of_conversation = end_of_conversation
 
         self.skip_space = False
 
     def tokenize(self, text: str, add_special_tokens: bool=True):
@@ -54,7 +49,9 @@ def tokenize(self, text: str, add_special_tokens: bool=True):
         )
         return encoded_text
 
-    def add_end_of_conv(self, input_ids, attention_mask, end_of_conversation: Union[str, int]):
+    def add_end_of_conv(self, input_ids, attention_mask, end_of_conversation: Optional[Union[str, int]]):
+        if end_of_conversation is None:
+            return input_ids, attention_mask
         if isinstance(end_of_conversation, int):
             last_token_id = input_ids[-1]
             if last_token_id == self.tokenizer.eos_token_id:
@@ -98,12 +95,13 @@ def mask_label(self, prompt: str, target: torch.Tensor, indices: List[Tuple[int,
             target[cur_len:start] = IGNORE_TOKEN_ID
             cur_len = end
 
-        if isinstance(self.end_of_conversation, str):
-            end_conv = np.searchsorted(text_offset, len(prompt)-len(self.end_of_conversation))
-        elif isinstance(self.end_of_conversation, int):
-            end_conv = np.searchsorted(text_offset, len(prompt)-1)
-        else:
-            raise Exception(f"Type of end_of_conversation is {type(self.end_of_conversation)}, which is not supported.")
+        if self.end_of_conversation is not None:
+            if isinstance(self.end_of_conversation, str):
+                end_conv = np.searchsorted(text_offset, len(prompt)-len(self.end_of_conversation))
+            elif isinstance(self.end_of_conversation, int):
+                end_conv = np.searchsorted(text_offset, len(prompt)-1)
+            else:
+                raise Exception(f"Type of end_of_conversation is {type(self.end_of_conversation)}, which is not supported.")
         target[end_conv:end] = IGNORE_TOKEN_ID
         if False: # Inspect and check the correctness of masking
             z = target.clone()
@@ -150,3 +148,10 @@ def _init_tokenizer(self):
         if self.tokenizer is None:
             self.tokenizer = load_hf_tokenizer(self.tokenizer_path, fast_tokenizer=True)
             self.skip_space = is_merge_prefix_space(self.tokenizer)
+
+            if self.end_of_conversation is None:
+                if self.tokenizer.eos_token_id is None:
+                    self.end_of_conversation = self.tokenizer.pad_token_id
+                else:
+                    self.end_of_conversation = self.tokenizer.eos_token_id
+
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion katheryne/train/chat.py → katheryne/stages/chat.py
@@ -4,7 +4,7 @@
 # Use of this source code is governed by an MIT-style
 # license that can be found in the LICENSE file or at
 # https://opensource.org/licenses/MIT.
-from katheryne.train.base import train, parse_args
+from katheryne.stages.base import train, parse_args
 from katheryne.utils.hparams import HParams
 
 def chat():
File renamed without changes.
@@ -4,7 +4,7 @@
 # Use of this source code is governed by an MIT-style
 # license that can be found in the LICENSE file or at
 # https://opensource.org/licenses/MIT.
-from katheryne.train.base import train, parse_args
+from katheryne.stages.base import train, parse_args
 from katheryne.utils.hparams import HParams
 
 def instruction():
File renamed without changes.
File renamed without changes.
@@ -4,7 +4,7 @@
 # Use of this source code is governed by an MIT-style
 # license that can be found in the LICENSE file or at
 # https://opensource.org/licenses/MIT.
-from katheryne.train.base import train, parse_args
+from katheryne.stages.base import train, parse_args
 from katheryne.utils.hparams import HParams
 
 def pretrain():
2 changes: 1 addition & 1 deletion katheryne/train/reward.py → katheryne/stages/reward.py
@@ -4,7 +4,7 @@
 # Use of this source code is governed by an MIT-style
 # license that can be found in the LICENSE file or at
 # https://opensource.org/licenses/MIT.
-from katheryne.train.base import train, parse_args
+from katheryne.stages.base import train, parse_args
 from katheryne.utils.hparams import HParams
 
 def reward():
4 changes: 2 additions & 2 deletions katheryne/train/train.py → katheryne/stages/train.py
@@ -4,7 +4,7 @@
 # Use of this source code is governed by an MIT-style
 # license that can be found in the LICENSE file or at
 # https://opensource.org/licenses/MIT.
-from katheryne.train.base import train, parse_args
+from katheryne.stages.base import train, parse_args
 from katheryne.utils.hparams import HParams
 
 def auto_train_stage():
@@ -33,4 +33,4 @@ def auto_train_stage():
         raise NotImplementedError("The train stage has not been implemented.")
 
 if __name__ == "__main__":
-    auto_train_stage()
+    auto_train_stage()
3 changes: 1 addition & 2 deletions train.sh
@@ -1,2 +1 @@
-python -m katheryne.train.pretrain
-python -m katheryne.train.chat --hparams hparams/hparams_chat_qwen1.5_7b_cj_lora.json --device 0
+python -m katheryne.stages.train --hparams hparams/hparams_chat_qwen1.5_7b_cj_lora.json --device 0
