dataset ratio control
jstzwj committed Jan 2, 2024
1 parent 0c9e2f5 commit 3b73d13
Showing 11 changed files with 268 additions and 23 deletions.
6 changes: 3 additions & 3 deletions hparams/hparams_chat_deepseek_7b_chat_lora.json
@@ -9,13 +9,13 @@
"flash_attention_2": false,
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 8,
"accumulate_grad_batches": 128,
"max_seq_len": 2048,
"checkpoint_every_n_train_steps": 100,
"log_every_n_steps": 1,
"val_check_interval": 0.25,
"val_check_interval": 400,
"limit_val_batches": 0.1,
"learning_rate": 1e-4,
"learning_rate": 2e-5,
"betas": [0.9, 0.999],
"eps": 1e-6,
"lr_decay": 0.999875,
8 changes: 4 additions & 4 deletions hparams/hparams_chat_deepseek_7b_lora.json
@@ -6,16 +6,16 @@
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "deepseek-ai/deepseek-llm-7b-base",
"flash_attention_2": false,
"flash_attention_2": true,
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 8,
"accumulate_grad_batches": 128,
"max_seq_len": 2048,
"checkpoint_every_n_train_steps": 100,
"log_every_n_steps": 1,
"val_check_interval": 0.25,
"val_check_interval": 400,
"limit_val_batches": 0.1,
"learning_rate": 1e-4,
"learning_rate": 1e-5,
"betas": [0.9, 0.999],
"eps": 1e-6,
"lr_scheduler_type": "cosine",
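For context on the updated batch settings in these two configs: with gradient accumulation, the effective optimizer batch size is the per-device batch size times the number of accumulation steps times the number of devices. A quick check of the new values (the device count is an assumption for illustration, not something set in these files):

# Effective optimizer batch size implied by the new hparams values.
# num_devices is an assumed value; it is not specified in these configs.
per_device_train_batch_size = 8
accumulate_grad_batches = 128
num_devices = 1
effective_batch_size = per_device_train_batch_size * accumulate_grad_batches * num_devices
print(effective_batch_size)  # 1024 samples per optimizer step on a single device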
163 changes: 163 additions & 0 deletions hparams/hparams_chat_llama2_13b_lora.json
@@ -0,0 +1,163 @@
{
"conv_format": "llama2",
"data_path": [
{
"path": "../../datasets/HanChat/ACGN/",
"sample": 0.05
},
{
"path": "../../datasets/HanChat/AncientPoem/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/Brainstorming/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/Classification/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/ClosedQA/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/CodeGeneration/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/Composition/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/COT/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/Couplet/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/Dialogue/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/Dictionary/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/DomainExpert/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/InformationExtraction/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/KeywordRecognition/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/LanguageDetection/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/LyricGeneration/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/MRC/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/MusicComment/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/NER/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/NLI/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/OpenQA/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/Other/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/ProductDesc/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/Punctuation/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/RolePlaying/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/SentimentAnalyze/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/StoryGeneration/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/Summary/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/TextCorrection/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/TextMatching/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/ToxicityDetection/",
"sample": 1.0
},
{
"path": "../../datasets/HanChat/Translation/",
"sample": 0.05
}
],
"data_output_path": "./tmp/data_files/",
"model_name_or_path": "meta-llama/Llama-2-13b-hf",
"flash_attention_2": false,
"per_device_train_batch_size": 4,
"per_device_eval_batch_size": 8,
"accumulate_grad_batches": 64,
"max_seq_len": 2048,
"checkpoint_every_n_train_steps": 50,
"log_every_n_steps": 1,
"val_check_interval": 0.1,
"limit_val_batches": 0.1,
"learning_rate": 2e-5,
"betas": [0.9, 0.95],
"eps": 1e-6,
"lr_decay": 0.999875,
"lr_scheduler_type": "cosine",
"num_warmup_steps": 100,
"max_epochs": 300,
"disable_dropout": true,
"model_torch_dtype": "auto",
"bf16": true,
"gradient_checkpointing": true,
"weight_decay": 0.0,
"gradient_clip_algorithm": "norm",
"gradient_clip_val": 1.0,
"strategy": null,
"lora": {
"r": 128,
"target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"]
}
}
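The "sample" field on each data_path entry above is consumed by the loader changes later in this commit: a float is treated as a fraction of the dataset's rows, an int as an absolute row count, and "shuffle" (default false) shuffles the dataset before the cut is taken. A rough sketch of the arithmetic, using a made-up dataset size:

# Illustration only: dataset_rows is an assumed size, not taken from the HanChat data.
dataset_rows = 200_000
sample = 0.05                                  # the ratio used for ACGN and Translation above
if isinstance(sample, float):
    sample_size = int(sample * dataset_rows)   # 10_000 rows kept
else:                                          # an int is an absolute row count
    sample_size = sample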
10 changes: 10 additions & 0 deletions katheryne/data/loader/__init__.py
@@ -0,0 +1,10 @@
from typing import Union
from pydantic import BaseModel, Field

class DatasetPath(BaseModel):
    path: str
    sample: Union[int, float] = 1.0
    shuffle: bool = False

    def __str__(self) -> str:
        return self.path
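A minimal usage sketch of the new DatasetPath model, mirroring how the loaders below normalize plain-string and dict entries; the entry values are taken from the configs in this commit, but the snippet itself is not repository code:

from katheryne.data.loader import DatasetPath

entries = [
    "../../datasets/HanChat/OpenQA/",                          # plain string form
    {"path": "../../datasets/HanChat/ACGN/", "sample": 0.05},  # dict form from the JSON config
]

normalized = []
for entry in entries:
    if isinstance(entry, str):
        # sample and shuffle fall back to their defaults (1.0 and False)
        normalized.append(DatasetPath.model_validate({"path": entry}))
    else:
        normalized.append(DatasetPath.model_validate(entry))

print([(p.path, p.sample, p.shuffle) for p in normalized])
# [('../../datasets/HanChat/OpenQA/', 1.0, False), ('../../datasets/HanChat/ACGN/', 0.05, False)]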
36 changes: 32 additions & 4 deletions katheryne/data/loader/chat.py
@@ -9,6 +9,7 @@
import torch
from torch.utils.data import Dataset, Subset, ConcatDataset
from tqdm import tqdm
+from katheryne.data.loader import DatasetPath
from katheryne.datasets.chat_dataset import ChatDataset
from katheryne.datasets.pretrain_dataset import PretrainDataset

@@ -61,13 +62,27 @@ def create_dataset(dataset_name, output_path, seed) -> Tuple[datasets.Dataset, d
    return train_dataset, eval_dataset


-def create_chat_dataset(hparams: HParams, data_path: str, output_path: str, seed: int, tokenizer, max_seq_len: int):
+def create_chat_dataset(hparams: HParams, data_path: List[Union[str, DatasetPath]], output_path: str, seed: int, tokenizer, max_seq_len: int):
    """
    Creates the chat dataset
    """
+    data_path_obj = []
+    for d_path in data_path:
+        if isinstance(d_path, str):
+            d_path_obj = DatasetPath.model_validate({
+                "path": d_path,
+                "sample": 1.0,
+                "shuffle": False
+            })
+        elif isinstance(d_path, dict):
+            d_path_obj = DatasetPath.model_validate(d_path)
+        else:
+            raise TypeError("Invalid dataset path object, need str or dict.")
+        data_path_obj.append(d_path_obj)

    conv_format = hparams.get("conv_format", "openbuddy")
    os.makedirs(output_path, exist_ok=True)
-    data_path_list = ("_".join(data_path)).replace("/", "_").replace("\\", "_")
+    data_path_list = ("_".join([str(p) for p in data_path_obj])).replace("/", "_").replace("\\", "_")
    tokenizer_name = tokenizer.init_kwargs["name_or_path"].replace("/", "_")
    fname = f"{data_path_list}_tokenizer{tokenizer_name}_seqlen{max_seq_len}_seed{seed}" # _tokenizer{tokenizer_name}_seqlen{max_seq_len}
    fname = "_".join(fname.split("/"))
@@ -85,12 +100,25 @@ def create_chat_dataset(hparams: HParams, data_path: str, output_path: str, seed
    if not cache_found:
        train_datasets = []
        eval_datasets = []
-        for di, d_path in enumerate(data_path):
+        for di, d_path in enumerate(data_path_obj):
            print(f"Creating dataset: {d_path}")
-            train_dataset, eval_dataset = create_dataset(d_path, output_path, seed)
+            train_dataset, eval_dataset = create_dataset(d_path.path, output_path, seed)
            train_dataset = train_dataset.cast(CHAT_FEATURES)
            eval_dataset = eval_dataset.cast(CHAT_FEATURES)

+            if d_path.shuffle:
+                train_dataset = train_dataset.shuffle(seed=hparams.get("seed", 43))
+
+            if isinstance(d_path.sample, int):
+                sample_size = d_path.sample
+                train_dataset = train_dataset[:sample_size]
+            elif isinstance(d_path.sample, float):
+                if d_path.sample != 1.0:
+                    sample_size = int(d_path.sample * len(train_dataset))
+                    train_dataset = train_dataset[:sample_size]
+            else:
+                raise TypeError("Invalid sample number of dataset path object, need int or float.")

            if train_dataset is not None:
                train_datasets.append(train_dataset)
            if eval_dataset is not None:
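The per-dataset ratio logic added above can also be written as a small standalone helper against the Hugging Face datasets API. This is a sketch, not repository code: sample_rows is a hypothetical name, and it uses Dataset.select rather than slicing.

from typing import Union

import datasets


def sample_rows(ds: datasets.Dataset, sample: Union[int, float],
                shuffle: bool = False, seed: int = 43) -> datasets.Dataset:
    # Optionally shuffle before taking rows from the front of the dataset.
    if shuffle:
        ds = ds.shuffle(seed=seed)
    # A float is a fraction of the rows; an int is an absolute row count.
    if isinstance(sample, float):
        if sample == 1.0:
            return ds
        sample = int(sample * len(ds))
    return ds.select(range(min(sample, len(ds))))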
39 changes: 34 additions & 5 deletions katheryne/data/loader/pretrain.py
@@ -4,13 +4,14 @@
import hashlib
import os
import shutil
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, TypeAlias, Union
import numpy as np

import datasets
import torch
from torch.utils.data import Dataset, Subset, ConcatDataset
from tqdm import tqdm
+from katheryne.data.loader import DatasetPath
from katheryne.datasets.pretrain_dataset import PretrainDataset
# from katheryne.data.datasets.pretrain_datasets import get_raw_dataset
# from katheryne.data.datasets import PretrainDataset, PretrainUniformDataset
@@ -83,12 +84,26 @@ def create_dataset(dataset_name, output_path, seed):
    return train_dataset, eval_dataset


-def create_pretrain_dataset(hparams: HParams, data_path: str, output_path: str, seed: int, tokenizer, max_seq_len: int):
+def create_pretrain_dataset(hparams: HParams, data_path: List[Union[str, DatasetPath]], output_path: str, seed: int, tokenizer, max_seq_len: int):
    """
    Creates the pretrain dataset
    """
+    data_path_obj = []
+    for d_path in data_path:
+        if isinstance(d_path, str):
+            d_path_obj = DatasetPath.model_validate({
+                "path": d_path,
+                "sample": 1.0,
+                "shuffle": False
+            })
+        elif isinstance(d_path, dict):
+            d_path_obj = DatasetPath.model_validate(d_path)
+        else:
+            raise TypeError("Invalid dataset path object, need str or dict.")
+        data_path_obj.append(d_path_obj)

    os.makedirs(output_path, exist_ok=True)
-    data_path_list = ("_".join(data_path)).replace("/", "_").replace("\\", "_")
+    data_path_list = ("_".join([str(p) for p in data_path_obj])).replace("/", "_").replace("\\", "_")
    tokenizer_name = tokenizer.init_kwargs["name_or_path"].replace("/", "_")
    fname = f"{data_path_list}_tokenizer{tokenizer_name}_seqlen{max_seq_len}_seed{seed}" # _tokenizer{tokenizer_name}_seqlen{max_seq_len}
    fname = "_".join(fname.split("/"))
@@ -101,9 +116,23 @@ def create_pretrain_dataset(hparams: HParams, data_path: str, output_path: str,
    if not cache_found:
        train_datasets = []
        eval_datasets = []
-        for d_path in data_path:
+        for d_path in data_path_obj:
            print(f"Creating dataset: {d_path}")
-            train_dataset, eval_dataset = create_dataset(d_path, output_path, seed)
+            train_dataset, eval_dataset = create_dataset(d_path.path, output_path, seed)

+            if d_path.shuffle:
+                train_dataset = train_dataset.shuffle(seed=hparams.get("seed", 43))
+
+            if isinstance(d_path.sample, int):
+                sample_size = d_path.sample
+                train_dataset = train_dataset[:sample_size]
+            elif isinstance(d_path.sample, float):
+                if d_path.sample != 1.0:
+                    sample_size = int(d_path.sample * len(train_dataset))
+                    train_dataset = train_dataset[:sample_size]
+            else:
+                raise TypeError("Invalid sample number of dataset path object, need int or float.")

            if train_dataset is not None:
                train_datasets.append(train_dataset)
            if eval_dataset is not None:
5 changes: 3 additions & 2 deletions katheryne/light_modules/models/chat_model.py
@@ -76,18 +76,19 @@ def validation_step(self, batch, batch_idx):
        self.log('val_loss', loss, on_step=True, on_epoch=True, sync_dist=False)

    def on_save_checkpoint(self, checkpoint):
+        save_path = f"{self.trainer.logger.log_dir}/huggingface_format"
        if self.deepspeed and self.hparams.params.get("zero_stage", 0) == 3:
            # For zero stage 3, each gpu only has a part of the model, so we need a special save function
            save_zero_three_model(self.model, self.global_rank,
-                os.path.join("./lightning_logs/huggingface_format", f"checkpoint-step-{self.global_step}"),
+                os.path.join(save_path, f"checkpoint-step-{self.global_step}"),
                zero_stage=3
            )
        else:
            if self.global_rank == 0:
                save_hf_format(
                    self.model,
                    tokenizer=None,
-                    output_dir="./lightning_logs/huggingface_format",
+                    output_dir=save_path,
                    sub_folder=f"checkpoint-step-{self.global_step}"
                )

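With this change the Hugging Face-format checkpoints follow the active logger's directory instead of a hard-coded ./lightning_logs path. Assuming Lightning's default TensorBoardLogger, whose log_dir is typically lightning_logs/version_N, the resulting layout looks roughly like this (version and step numbers are illustrative):

import os

log_dir = "lightning_logs/version_0"   # assumed value of self.trainer.logger.log_dir
global_step = 400                      # illustrative step count
save_path = f"{log_dir}/huggingface_format"
print(os.path.join(save_path, f"checkpoint-step-{global_step}"))
# lightning_logs/version_0/huggingface_format/checkpoint-step-400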
5 changes: 3 additions & 2 deletions katheryne/light_modules/models/pretrain_model.py
@@ -75,18 +75,19 @@ def validation_step(self, batch, batch_idx):
        self.log('val_loss', loss, on_step=True, on_epoch=True, sync_dist=False)

    def on_save_checkpoint(self, checkpoint):
+        save_path = f"{self.trainer.logger.log_dir}/huggingface_format"
        if self.deepspeed and self.hparams.params.get("zero_stage", 0) == 3:
            # For zero stage 3, each gpu only has a part of the model, so we need a special save function
            save_zero_three_model(self.model, self.global_rank,
-                os.path.join("./lightning_logs/huggingface_format", f"checkpoint-step-{self.global_step}"),
+                os.path.join(save_path, f"checkpoint-step-{self.global_step}"),
                zero_stage=3
            )
        else:
            if self.global_rank == 0:
                save_hf_format(
                    self.model,
                    tokenizer=None,
-                    output_dir="./lightning_logs/huggingface_format",
+                    output_dir=save_path,
                    sub_folder=f"checkpoint-step-{self.global_step}"
                )

(Diffs for the remaining 3 changed files were not loaded and are not shown.)
