
Commit

dataset loader bug fix
jstzwj committed Jun 25, 2024
1 parent d87d9bc commit f1ab227
Showing 24 changed files with 64 additions and 36 deletions.
2 changes: 1 addition & 1 deletion hparams/hparams_chat_qwen1.5_4b.json
@@ -9,7 +9,7 @@
 
     ],
     "data_output_path": "./tmp/data_files/",
-    "model_name_or_path": "/data/wangjun/models/Qwen1.5-4B",
+    "model_name_or_path": "Qwen/Qwen1.5-4B",
     "atten_class": "eager",
     "per_device_train_batch_size": 2,
     "per_device_eval_batch_size": 8,
1 change: 1 addition & 0 deletions hparams/hparams_chat_qwen_7b_lora.json
@@ -1,4 +1,5 @@
 {
+    "train_stage": "chat",
     "conv_format": "qwen",
     "end_of_conversation": 151643,
     "data_path": [
5 changes: 3 additions & 2 deletions hparams/hparams_pretrain_bloom_1b1.json
@@ -1,18 +1,19 @@
 {
+    "train_stage": "pretrain",
     "data_path": [
         "bigscience-data/roots_zh-cn_wikipedia"
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "bigscience/bloom-1b1",
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
     "max_seq_len": 512,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 2e-5,
+    "learning_rate": 8e-6,
     "betas": [0.9, 0.95],
     "eps": 1e-6,
     "lr_decay": 0.999875,
5 changes: 3 additions & 2 deletions hparams/hparams_pretrain_bloom_1b1_lora.json
@@ -1,18 +1,19 @@
 {
+    "train_stage": "pretrain",
     "data_path": [
         "bigscience-data/roots_zh-cn_wikipedia"
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "bigscience/bloom-1b1",
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
     "max_seq_len": 512,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 2e-5,
+    "learning_rate": 8e-6,
     "betas": [0.9, 0.95],
     "eps": 1e-6,
     "lr_decay": 0.999875,
5 changes: 3 additions & 2 deletions hparams/hparams_pretrain_bloom_560m.json
@@ -1,18 +1,19 @@
 {
+    "train_stage": "pretrain",
     "data_path": [
         "bigscience-data/roots_zh-cn_wikipedia"
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "bigscience/bloom-560m",
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
     "max_seq_len": 512,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 2e-5,
+    "learning_rate": 8e-6,
     "betas": [0.9, 0.95],
     "eps": 1e-6,
     "lr_decay": 0.999875,
5 changes: 3 additions & 2 deletions hparams/hparams_pretrain_bloom_560m_dora.json
@@ -1,18 +1,19 @@
 {
+    "train_stage": "pretrain",
     "data_path": [
         "bigscience-data/roots_zh-cn_wikipedia"
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "bigscience/bloom-560m",
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
     "max_seq_len": 512,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 2e-5,
+    "learning_rate": 8e-6,
     "betas": [0.9, 0.95],
     "eps": 1e-6,
     "lr_decay": 0.999875,
5 changes: 3 additions & 2 deletions hparams/hparams_pretrain_bloom_560m_lora.json
@@ -1,18 +1,19 @@
 {
+    "train_stage": "pretrain",
     "data_path": [
         "bigscience-data/roots_zh-cn_wikipedia"
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "bigscience/bloom-560m",
-    "per_device_train_batch_size": 1,
+    "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
     "max_seq_len": 512,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 2e-5,
+    "learning_rate": 8e-6,
     "betas": [0.9, 0.95],
     "eps": 1e-6,
     "lr_decay": 0.999875,
4 changes: 2 additions & 2 deletions hparams/hparams_pretrain_llama2_7b_ddp_lora.json
@@ -8,7 +8,7 @@
     "per_device_train_batch_size": 2,
     "per_device_eval_batch_size": 4,
     "accumulate_grad_batches": 64,
-    "max_seq_len": 1024,
+    "max_seq_len": 2048,
     "checkpoint_every_n_train_steps": 1000,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
@@ -35,5 +35,5 @@
         "lora_dropout": 0.2,
         "bias": "all",
         "target_modules": "all-linear"
-    },
+    }
 }
2 changes: 1 addition & 1 deletion hparams/hparams_reward_qwen1.5_4b.json
@@ -10,7 +10,7 @@
 
     ],
     "data_output_path": "./tmp/data_files/",
-    "model_name_or_path": "/data/wangjun/models/Qwen1.5-4B",
+    "model_name_or_path": "Qwen/Qwen1.5-4B",
     "atten_class": "eager",
     "per_device_train_batch_size": 2,
     "per_device_eval_batch_size": 8,
6 changes: 6 additions & 0 deletions katheryne/data/loader/chat.py
@@ -106,6 +106,8 @@ def create_chat_dataset(hparams: HParams, data_path: List[Union[str, DatasetPath
 
         if d_path.shuffle:
             train_dataset = train_dataset.shuffle(seed=hparams.get("seed", 43))
+
+        print(f"{d_path} - Dataset size: {len(train_dataset)}")
 
         if isinstance(d_path.sample, int):
             sample_size = d_path.sample
@@ -114,9 +116,13 @@ def create_chat_dataset(hparams: HParams, data_path: List[Union[str, DatasetPath
             if d_path.sample != 1.0:
                 sample_size = int(d_path.sample * len(train_dataset))
                 train_dataset = train_dataset.select(list(range(sample_size)))
+            else:
+                sample_size = len(train_dataset)
         else:
             raise TypeError("Invalid sample number of dataset path object, need int or float.")
 
+        print(f"{d_path} - Selected size: {sample_size}")
+
         if train_dataset is not None:
             train_datasets.append(train_dataset)
         if eval_dataset is not None:
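Note on the hunk above: the dataset loader bug named in the commit message appears to be the float branch of the sampling logic. Before this change, d_path.sample == 1.0 assigned no sample_size at all, so the value was undefined for any later use (including the size logging added here); the new else branch keeps the whole dataset and defines sample_size explicitly. The same fix and logging are applied to the instruction and pretrain loaders below. A minimal standalone sketch of the corrected selection logic, using a plain Python list in place of the Hugging Face dataset (the helper name and log format are illustrative, not the project's API):

from typing import List, Union

def select_subset(rows: List[dict], sample: Union[int, float]) -> List[dict]:
    # Resolve the requested sample into a concrete size; every branch defines sample_size.
    if isinstance(sample, int):
        sample_size = sample
        rows = rows[:sample_size]
    elif isinstance(sample, float):
        if sample != 1.0:
            sample_size = int(sample * len(rows))
            rows = rows[:sample_size]
        else:
            # The pre-fix code had no else here, leaving sample_size unbound when sample == 1.0.
            sample_size = len(rows)
    else:
        raise TypeError("Invalid sample number of dataset path object, need int or float.")
    print(f"Selected size: {sample_size}")
    return rows

With sample=0.5 this keeps the first half of the rows; with sample=1.0 it now keeps everything and still has a defined sample_size to report.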
6 changes: 6 additions & 0 deletions katheryne/data/loader/instruction.py
@@ -102,6 +102,8 @@ def create_instruction_dataset(hparams: HParams, data_path: List[Union[str, Data
 
         if d_path.shuffle:
             train_dataset = train_dataset.shuffle(seed=hparams.get("seed", 43))
+
+        print(f"{d_path} - Dataset size: {len(train_dataset)}")
 
         if isinstance(d_path.sample, int):
             sample_size = d_path.sample
@@ -110,9 +112,13 @@ def create_instruction_dataset(hparams: HParams, data_path: List[Union[str, Data
             if d_path.sample != 1.0:
                 sample_size = int(d_path.sample * len(train_dataset))
                 train_dataset = train_dataset.select(list(range(sample_size)))
+            else:
+                sample_size = len(train_dataset)
         else:
             raise TypeError("Invalid sample number of dataset path object, need int or float.")
 
+        print(f"{d_path} - Selected size: {sample_size}")
+
         if train_dataset is not None:
             train_datasets.append(train_dataset)
         if eval_dataset is not None:
6 changes: 6 additions & 0 deletions katheryne/data/loader/pretrain.py
@@ -115,6 +115,8 @@ def create_pretrain_dataset(hparams: HParams, data_path: List[Union[str, Dataset
 
         if d_path.shuffle:
             train_dataset = train_dataset.shuffle(seed=hparams.get("seed", 43))
+
+        print(f"{d_path} - Dataset size: {len(train_dataset)}")
 
         if isinstance(d_path.sample, int):
             sample_size = d_path.sample
@@ -123,9 +125,13 @@ def create_pretrain_dataset(hparams: HParams, data_path: List[Union[str, Dataset
             if d_path.sample != 1.0:
                 sample_size = int(d_path.sample * len(train_dataset))
                 train_dataset = train_dataset.select(list(range(sample_size)))
+            else:
+                sample_size = len(train_dataset)
         else:
             raise TypeError("Invalid sample number of dataset path object, need int or float.")
 
+        print(f"{d_path} - Selected size: {sample_size}")
+
         if train_dataset is not None:
             train_datasets.append(train_dataset)
         if eval_dataset is not None:
33 changes: 19 additions & 14 deletions katheryne/datasets/conversation_dataset.py
@@ -34,14 +34,9 @@ def __init__(self, tokenizer_path: str, max_seq_len: int, pad_token_id: int,
             self.settings = get_conv_settings(conv_format)
         else:
             self.settings = conv_format
-        if end_of_conversation is None:
-            if self.tokenizer.eos_token_id is None:
-                self.end_of_conversation = self.tokenizer.pad_token_id
-            else:
-                self.end_of_conversation = self.tokenizer.eos_token_id
-        else:
-            self.end_of_conversation = end_of_conversation
+
+        self.end_of_conversation = end_of_conversation
 
         self.skip_space = False
 
     def tokenize(self, text: str, add_special_tokens: bool=True):
@@ -54,7 +49,9 @@ def tokenize(self, text: str, add_special_tokens: bool=True):
         )
         return encoded_text
 
-    def add_end_of_conv(self, input_ids, attention_mask, end_of_conversation: Union[str, int]):
+    def add_end_of_conv(self, input_ids, attention_mask, end_of_conversation: Optional[Union[str, int]]):
+        if end_of_conversation is None:
+            return input_ids, attention_mask
         if isinstance(end_of_conversation, int):
             last_token_id = input_ids[-1]
             if last_token_id == self.tokenizer.eos_token_id:
@@ -98,12 +95,13 @@ def mask_label(self, prompt: str, target: torch.Tensor, indices: List[Tuple[int,
             target[cur_len:start] = IGNORE_TOKEN_ID
             cur_len = end
 
-        if isinstance(self.end_of_conversation, str):
-            end_conv = np.searchsorted(text_offset, len(prompt)-len(self.end_of_conversation))
-        elif isinstance(self.end_of_conversation, int):
-            end_conv = np.searchsorted(text_offset, len(prompt)-1)
-        else:
-            raise Exception(f"Type of end_of_conversation is {type(self.end_of_conversation)}, which is not supported.")
+        if self.end_of_conversation is not None:
+            if isinstance(self.end_of_conversation, str):
+                end_conv = np.searchsorted(text_offset, len(prompt)-len(self.end_of_conversation))
+            elif isinstance(self.end_of_conversation, int):
+                end_conv = np.searchsorted(text_offset, len(prompt)-1)
+            else:
+                raise Exception(f"Type of end_of_conversation is {type(self.end_of_conversation)}, which is not supported.")
         target[end_conv:end] = IGNORE_TOKEN_ID
         if False: # Inspect and check the correctness of masking
             z = target.clone()
@@ -150,3 +148,10 @@ def _init_tokenizer(self):
         if self.tokenizer is None:
             self.tokenizer = load_hf_tokenizer(self.tokenizer_path, fast_tokenizer=True)
             self.skip_space = is_merge_prefix_space(self.tokenizer)
+
+            if self.end_of_conversation is None:
+                if self.tokenizer.eos_token_id is None:
+                    self.end_of_conversation = self.tokenizer.pad_token_id
+                else:
+                    self.end_of_conversation = self.tokenizer.eos_token_id
+
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion katheryne/train/chat.py → katheryne/stages/chat.py
@@ -4,7 +4,7 @@
 # Use of this source code is governed by an MIT-style
 # license that can be found in the LICENSE file or at
 # https://opensource.org/licenses/MIT.
-from katheryne.train.base import train, parse_args
+from katheryne.stages.base import train, parse_args
 from katheryne.utils.hparams import HParams
 
 def chat():
File renamed without changes.
@@ -4,7 +4,7 @@
 # Use of this source code is governed by an MIT-style
 # license that can be found in the LICENSE file or at
 # https://opensource.org/licenses/MIT.
-from katheryne.train.base import train, parse_args
+from katheryne.stages.base import train, parse_args
 from katheryne.utils.hparams import HParams
 
 def instruction():
File renamed without changes.
File renamed without changes.
@@ -4,7 +4,7 @@
 # Use of this source code is governed by an MIT-style
 # license that can be found in the LICENSE file or at
 # https://opensource.org/licenses/MIT.
-from katheryne.train.base import train, parse_args
+from katheryne.stages.base import train, parse_args
 from katheryne.utils.hparams import HParams
 
 def pretrain():
2 changes: 1 addition & 1 deletion katheryne/train/reward.py → katheryne/stages/reward.py
@@ -4,7 +4,7 @@
 # Use of this source code is governed by an MIT-style
 # license that can be found in the LICENSE file or at
 # https://opensource.org/licenses/MIT.
-from katheryne.train.base import train, parse_args
+from katheryne.stages.base import train, parse_args
 from katheryne.utils.hparams import HParams
 
 def reward():
4 changes: 2 additions & 2 deletions katheryne/train/train.py → katheryne/stages/train.py
@@ -4,7 +4,7 @@
 # Use of this source code is governed by an MIT-style
 # license that can be found in the LICENSE file or at
 # https://opensource.org/licenses/MIT.
-from katheryne.train.base import train, parse_args
+from katheryne.stages.base import train, parse_args
 from katheryne.utils.hparams import HParams
 
 def auto_train_stage():
@@ -33,4 +33,4 @@ def auto_train_stage():
         raise NotImplementedError("The train stage has not been implemented.")
 
 if __name__ == "__main__":
-    auto_train_stage()
+    auto_train_stage()
3 changes: 1 addition & 2 deletions train.sh
@@ -1,2 +1 @@
-python -m katheryne.train.pretrain
-python -m katheryne.train.chat --hparams hparams/hparams_chat_qwen1.5_7b_cj_lora.json --device 0
+python -m katheryne.stages.train --hparams hparams/hparams_chat_qwen1.5_7b_cj_lora.json --device 0
