
Commit

get_text_offset bug fix
jstzwj committed Jun 15, 2024
1 parent 617adb2 commit b2d60ec
Showing 8 changed files with 52 additions and 18 deletions.
hparams/hparams_chat_codeqwen1.5_7b_deepspeed_cj.json (2 changes: 1 addition & 1 deletion)
@@ -19,7 +19,7 @@
     "per_device_eval_batch_size": 8,
     "accumulate_grad_batches": 64,
     "max_seq_len": 1024,
-    "checkpoint_every_n_train_steps": 1,
+    "checkpoint_every_n_train_steps": 100,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
hparams/hparams_chat_qwen1.5_4b.json (6 changes: 3 additions & 3 deletions)
@@ -3,7 +3,7 @@
     "end_of_conversation": 151643,
     "data_path": [
         {
-            "path": "/data/wangjun/github/cangjie/",
+            "path": "/data/wangjun/github/cangjie_data/",
             "sample": 1.0
         },
         {
@@ -18,12 +18,12 @@
     "per_device_train_batch_size": 2,
     "per_device_eval_batch_size": 8,
     "accumulate_grad_batches": 64,
-    "max_seq_len": 1024,
+    "max_seq_len": 2048,
     "checkpoint_every_n_train_steps": 100,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 8e-6,
+    "learning_rate": 4e-5,
     "betas": [0.9, 0.95],
     "eps": 8e-6,
     "lr_decay": 0.999875,
hparams/hparams_chat_qwen1.5_7b_cj.json (14 changes: 11 additions & 3 deletions)
@@ -2,20 +2,28 @@
     "conv_format": "qwen",
     "end_of_conversation": 151643,
     "data_path": [
-        "/data/wangjun/github/cangjie/"
+        {
+            "path": "/data/wangjun/github/cangjie_data/",
+            "sample": 1.0
+        },
+        {
+            "path": "Vtuber-plan/sharegpt-cleaned",
+            "sample": 0.01
+        }
+
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "/data/wangjun/models/Qwen1.5-7B",
     "atten_class": "eager",
     "per_device_train_batch_size": 2,
     "per_device_eval_batch_size": 8,
     "accumulate_grad_batches": 64,
-    "max_seq_len": 512,
+    "max_seq_len": 2048,
     "checkpoint_every_n_train_steps": 100,
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 8e-6,
+    "learning_rate": 2e-5,
     "betas": [0.9, 0.95],
     "eps": 8e-6,
     "lr_decay": 0.999875,
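
Note: the hparams files in this commit replace a bare data path with a list of {path, sample} entries, so each source carries its own sampling ratio. The mixing code itself is not part of this diff; the sketch below is only one plausible reading, assuming "sample" is the fraction of each source kept in the training mix, with purely illustrative stand-in data.

import random

def subsample(examples, fraction, seed=42):
    # Keep roughly `fraction` of the examples (illustrative only, not katheryne's code).
    rng = random.Random(seed)
    return [ex for ex in examples if rng.random() < fraction]

# Stand-in lists; per the configs these would be the cangjie_data corpus and
# the Vtuber-plan/sharegpt-cleaned dataset.
cangjie = [{"text": f"cangjie example {i}"} for i in range(1000)]
sharegpt = [{"text": f"sharegpt example {i}"} for i in range(1000)]

mixed = subsample(cangjie, 1.0) + subsample(sharegpt, 0.01)  # roughly 1000 + 10 examples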
hparams/hparams_chat_qwen1.5_7b_deepspeed_cj.json (25 changes: 21 additions & 4 deletions)
@@ -2,7 +2,15 @@
     "conv_format": "qwen",
     "end_of_conversation": 151643,
     "data_path": [
-        "/data/wangjun/github/cangjie_data/"
+        {
+            "path": "/data/wangjun/github/cangjie_data/",
+            "sample": 1.0
+        },
+        {
+            "path": "Vtuber-plan/sharegpt-cleaned",
+            "sample": 0.01
+        }
+
     ],
     "data_output_path": "./tmp/data_files/",
     "model_name_or_path": "/data/wangjun/models/Qwen1.5-7B",
@@ -15,7 +23,7 @@
     "log_every_n_steps": 1,
     "val_check_interval": 0.25,
     "limit_val_batches": 0.1,
-    "learning_rate": 8e-6,
+    "learning_rate": 2e-5,
     "betas": [0.9, 0.95],
     "eps": 8e-6,
     "lr_decay": 0.999875,
@@ -31,7 +39,16 @@
     "gradient_clip_val": 1.0,
     "strategy": "deepspeed",
     "strategy_params": {
-        "offload": false,
-        "zero_stage": 2
+        "zero_stage": 3,
+        "remote_device": null,
+        "offload_optimizer": true,
+        "offload_optimizer_device": "cpu",
+        "offload_parameters": false,
+        "offload_params_device": "cpu",
+        "cpu_checkpointing": true,
+        "nvme_path": "./nvme_offload",
+        "params_buffer_count": 5,
+        "params_buffer_size": 1000000000,
+        "contiguous_memory_optimization": false
     }
 }
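
The new strategy_params move DeepSpeed from plain ZeRO stage 2 to stage 3 with optimizer offload to CPU and CPU activation checkpointing. How katheryne forwards these keys is not visible in this diff; the following is a minimal sketch, assuming they map more or less directly onto Lightning's DeepSpeedStrategy keyword arguments, with "zero_stage" corresponding to the "stage" parameter.

from lightning.pytorch import Trainer
from lightning.pytorch.strategies import DeepSpeedStrategy

# Sketch only: mirrors the JSON above under the assumption stated in the note.
strategy = DeepSpeedStrategy(
    stage=3,                              # ZeRO-3: shard parameters, gradients and optimizer states
    remote_device=None,
    offload_optimizer=True,               # keep optimizer states in CPU memory
    offload_optimizer_device="cpu",
    offload_parameters=False,
    offload_params_device="cpu",
    cpu_checkpointing=True,               # store activation checkpoints on CPU
    nvme_path="./nvme_offload",
    params_buffer_count=5,
    params_buffer_size=1_000_000_000,
    contiguous_memory_optimization=False,
)
trainer = Trainer(strategy=strategy)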
katheryne/datasets/chat_dataset.py (11 changes: 6 additions & 5 deletions)
@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import List, Optional, Tuple, Union
 import numpy as np
 import torch
 import datasets
@@ -42,13 +42,14 @@ def __len__(self):
     def tokenize(self, text: str, add_special_tokens: bool=True):
         encoded_text = self.tokenizer(text,
             max_length=self.max_seq_len,
+            padding="longest",
             truncation=True,
             return_tensors="pt",
             add_special_tokens=add_special_tokens,
         )
         return encoded_text

-    def get_prompt(self, messages, ignore_last:bool=False):
+    def get_prompt(self, messages, ignore_last:bool=False) -> Tuple[str, List[Tuple[int, int]]]:
         system = None
         for i, item in enumerate(messages):
             role, content = item["role"], item["content"]
@@ -77,9 +78,9 @@ def get_prompt(self, messages, ignore_last:bool=False):
             history.messages.append((real_role, content))
         return history.get_prompt_and_indices()

-    def mask_label(self, prompt: str, target, indices):
-        tokens = self.tokenizer.convert_ids_to_tokens(target, skip_special_tokens=True)
-        text_offset = get_text_offset(self.tokenizer, prompt, tokens)
+    def mask_label(self, prompt: str, target: torch.Tensor, indices: List[Tuple[int, int]]):
+        tokens = self.tokenizer.convert_ids_to_tokens(target, skip_special_tokens=False)
+        text_offset = get_text_offset(self.tokenizer, prompt, tokens, has_special_tokens=True)

         cur_len = 1
         target[:cur_len] = IGNORE_TOKEN_ID
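
The mask_label change is the bug the commit title refers to: target still contains special tokens, so converting its ids with skip_special_tokens=True yielded a token list shorter than target, and the character offsets returned by get_text_offset no longer lined up with the ids being masked. A small illustration of the alignment point; the tokenizer here is only an example, not necessarily what the configs load.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-4B")
prompt = "<|im_start|>user\nhi<|im_end|>\n"
target = tokenizer(prompt, add_special_tokens=True)["input_ids"]

kept = tokenizer.convert_ids_to_tokens(target, skip_special_tokens=False)
dropped = tokenizer.convert_ids_to_tokens(target, skip_special_tokens=True)
# len(kept) == len(target), while len(dropped) can be smaller, so per-token
# offsets computed from `dropped` would point at the wrong positions in `target`.
print(len(target), len(kept), len(dropped))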
katheryne/datasets/instruction_dataset.py (1 change: 1 addition & 0 deletions)
@@ -23,6 +23,7 @@ def __len__(self):
     def tokenize(self, text: str, add_special_tokens: bool=True):
         encoded_text = self.tokenizer(text,
             max_length=self.max_seq_len,
+            padding="longest",
             truncation=True,
             return_tensors="pt",
             add_special_tokens=add_special_tokens,
katheryne/datasets/pretrain_dataset.py (1 change: 1 addition & 0 deletions)
@@ -21,6 +21,7 @@ def __len__(self):
     def tokenize(self, text):
         encoded_text = self.tokenizer(text,
             max_length=self.max_seq_len,
+            padding="longest",
             truncation=True,
             return_tensors="pt"
         )
katheryne/utils/model/tokenizer_utils.py (10 changes: 8 additions & 2 deletions)
@@ -45,10 +45,16 @@ def pad_tokenizer(tokenizer: PreTrainedTokenizer, pad_to: int=64) -> PreTrainedT
         add_token_num = padded_vocab_len - current_vocab_len
         tokenizer.add_tokens([f"TOKENIZER_PAD_TOKEN_{i}" for i in range(add_token_num)])

-def get_text_offset(tokenizer: PreTrainedTokenizerBase, text: str, tokens: List[str]):
+def get_text_offset(tokenizer: PreTrainedTokenizerBase, text: str, tokens: List[str], has_special_tokens: bool=False):
     if tokenizer.is_fast:
         text_offset = [-1] * len(tokens)
-        batch_encoding = tokenizer([text])
+        batch_encoding = tokenizer(
+            [text],
+            max_length=len(tokens),
+            padding=False,
+            truncation=True,
+            add_special_tokens=has_special_tokens
+        )
         for token_i in range(len(tokens)):
             span = batch_encoding.token_to_chars(0, token_i)
             if span is None:
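
With the new signature, callers that encode with special tokens (as chat_dataset.py now does) can pass has_special_tokens=True so the re-encoding inside get_text_offset produces the same token sequence it is asked to map. A usage sketch; the tokenizer name is illustrative.

from transformers import AutoTokenizer
from katheryne.utils.model.tokenizer_utils import get_text_offset

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B")  # any fast tokenizer works here
prompt = "<|im_start|>user\nhello<|im_end|>\n"
ids = tokenizer(prompt, add_special_tokens=True)["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=False)

# has_special_tokens=True keeps the internal encoding aligned with `tokens`,
# so the returned offsets point at the right characters in `prompt`.
text_offset = get_text_offset(tokenizer, prompt, tokens, has_special_tokens=True)
print(list(zip(tokens, text_offset)))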
