Skip to content

Commit

Permalink
logger configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
jstzwj committed Mar 4, 2024
1 parent 98fe3ce commit 40ad795
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 30 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,4 @@ cython_debug/

/lightning_logs/
/lightning_logs_*/
/llm_trainer/
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ lora: {"r": 16, "target_modules": ["q_proj", "v_proj"]}

| GPU | Time | GPU Memory | Memory Usage |
|------------------------------|---------|----------------|----------------|
| NVIDIA L40S | 00:38 | 48G | 15,134MiB |
| NVIDIA RTX 6000 Ada Generation | 00:38 | 48G | 15,134MiB |
| NVIDIA A800 80GB PCIe | 00:41 | 80G | 14,850MiB |
| NVIDIA A100 40GB PCIe | 00:44 | 40G | 14,863MiB |
| NVIDIA GeForce RTX 4090 | 01:02 | 24G | 15,078MiB |
| Iluvatar BI-V150 | 01:09 | 32G | 22,798MiB |
| NVIDIA RTX A6000 | 01:13 | 48G | 14,944MiB |
Expand Down
2 changes: 2 additions & 0 deletions katheryne/light_modules/models/pretrain_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def validation_step(self, batch, batch_idx):
self.log('val_loss', loss, on_step=True, on_epoch=True, sync_dist=False)

def on_save_checkpoint(self, checkpoint):
if self.trainer.logger is None:
return
save_path = f"{self.trainer.logger.log_dir}/huggingface_format"
if self.deepspeed and self.hparams.params.get("zero_stage", 0) == 3:
# For zero stage 3, each gpu only has a part of the model, so we need a special save function
Expand Down
Empty file.
86 changes: 64 additions & 22 deletions katheryne/train/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor, GradientAccumulationScheduler
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

from lightning.pytorch import loggers as pl_loggers
from lightning.pytorch.profilers import SimpleProfiler, AdvancedProfiler

import lightning_fabric
Expand All @@ -51,6 +51,7 @@ def parse_args():
parser.add_argument('--accelerator', type=str, default="gpu", help='training device')
parser.add_argument('--device', type=str, default="", help='training device ids')
parser.add_argument('--seed', type=int, default=43, help='model seed')
parser.add_argument('--path', type=str, default="llm_trainer", help='experiment save path')

args = parser.parse_args()
return args
Expand Down Expand Up @@ -159,38 +160,79 @@ def train(create_dataset, lightning_module_class):
hparams,
)

# Checkpoint Settings
checkpoint_every_n_train_steps = 100
if "checkpoint_every_n_train_steps" in hparams:
checkpoint_every_n_train_steps = hparams.checkpoint_every_n_train_steps

checkpoint_callback = ModelCheckpoint(
dirpath=None, save_last=True, every_n_train_steps=checkpoint_every_n_train_steps,
save_weights_only=False, save_on_train_epoch_end=True, save_top_k=-1
)

# Earlystop Settings
# monitor="val_loss", mode="min", save_top_k=5
# earlystop_callback = EarlyStopping(monitor="valid/loss_mel_epoch", mode="min", patience=13)

# Learning rate monitor
learning_rate_callback = LearningRateMonitor(logging_interval="step")

# GradientAccumulationScheduler
# accumulator_callback = GradientAccumulationScheduler(scheduling={4: 2})
if len(args.device) == 0:
devices = [i for i in range(torch.cuda.device_count())]
else:
devices = [int(n.strip()) for n in args.device.split(",")]
trainer_params = {
"accelerator": args.accelerator,
"callbacks": [checkpoint_callback, learning_rate_callback],
"callbacks": [],
}

# Logger Settings
loggers = hparams.get("logger", [{"logger_type": "tb", "save_dir": "lightning_logs"}])
if isinstance(loggers, list) or isinstance(loggers, tuple):
loggers = loggers
elif isinstance(loggers, dict):
loggers = [loggers]
else:
raise Exception("Unsupported type in logger field")

tb_logger = []
for logger in loggers:
logger_type = logger.get("logger_type", "tb")
save_dir = logger.get("save_dir", "logs")
logger_save_dir = os.path.join(args.path, save_dir)

if logger_type.lower() in ["tb", "tensorboard"]:
tb_logger.append(pl_loggers.TensorBoardLogger(save_dir=logger_save_dir, name=logger.get("name", None), version=logger.get("version", None)))
elif logger_type.lower() in ["comet"]:
tb_logger.append(pl_loggers.CometLogger(save_dir=logger_save_dir))
elif logger_type.lower() in ["csv"]:
tb_logger.append(pl_loggers.CSVLogger(save_dir=logger_save_dir))
elif logger_type.lower() in ["mlflow"]:
tb_logger.append(pl_loggers.MLFlowLogger(save_dir=logger_save_dir))
elif logger_type.lower() in ["neptune"]:
tb_logger.append(pl_loggers.NeptuneLogger(save_dir=logger_save_dir))
elif logger_type.lower() in ["wandb"]:
tb_logger.append(pl_loggers.WandbLogger(save_dir=logger_save_dir))

trainer_params["logger"] = tb_logger
trainer_params["log_every_n_steps"] = hparams.get("log_every_n_steps", 50)

# Learning rate monitor
if len(tb_logger)> 0:
learning_rate_callback = LearningRateMonitor(logging_interval="step")
trainer_params["callbacks"].append(learning_rate_callback)

# Checkpoint Settings
if hparams.get("enable_checkpoints", True):
checkpoint_every_n_train_steps = 100
if "checkpoint_every_n_train_steps" in hparams:
checkpoint_every_n_train_steps = hparams.checkpoint_every_n_train_steps

dirpath = None
if len(tb_logger) == 0:
dirpath = os.path.join(args.path, "checkpoints")
checkpoint_callback = ModelCheckpoint(
dirpath=dirpath, save_last=True, every_n_train_steps=checkpoint_every_n_train_steps,
save_weights_only=False, save_on_train_epoch_end=True, save_top_k=-1
)
trainer_params["callbacks"].append(checkpoint_callback)

# Earlystop Settings
if hparams.get("enable_earlystop", False):
# monitor="val_loss", mode="min", save_top_k=5
earlystop_callback = EarlyStopping(monitor="val_loss", mode="min", patience=13)
trainer_params["callbacks"].append(earlystop_callback)

# GradientAccumulationScheduler
# accumulator_callback = GradientAccumulationScheduler(scheduling={4: 2})

# Validation Settings
trainer_params["val_check_interval"] = hparams.get("val_check_interval", 1.0)

# Step limit settings
trainer_params["limit_train_batches"] = hparams.get("limit_train_batches", None)
trainer_params["limit_val_batches"] = hparams.get("limit_val_batches", None)
trainer_params["limit_test_batches"] = hparams.get("limit_test_batches", None)
Expand Down Expand Up @@ -258,7 +300,7 @@ def train(create_dataset, lightning_module_class):

trainer = pl.Trainer(**trainer_params) # , profiler=profiler, max_steps=200
# Resume training
ckpt_path = get_lastest_checkpoint("./lightning_logs", "checkpoints")
ckpt_path = get_lastest_checkpoint(os.path.join(args.path, "lightning_logs"), "checkpoints")

trainer.fit(model=model, train_dataloaders=train_dataloader, val_dataloaders=valid_dataloader, ckpt_path=ckpt_path)

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ dependencies = [
dev = ["black==23.3.0", "pylint==2.8.2"]

[project.urls]
"Homepage" = "https://github.com/vtuber-plan/chatproto"
"Bug Tracker" = "https://github.com/vtuber-plan/chatproto/issues"
"Homepage" = "https://github.com/vtuber-plan/katheryne"
"Bug Tracker" = "https://github.com/vtuber-plan/katheryne/issues"

[tool.setuptools.packages.find]
exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
Expand Down
12 changes: 6 additions & 6 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
numpy==1.24.3
tenacity==8.2.2
torch==2.1.1+cu118
lightning==2.1.3
accelerate==0.25.0
transformers==4.36.2
peft==0.7.1
datasets==2.15.0
torch==2.2.1
lightning==2.2.0.post0
accelerate==0.27.2
transformers==4.38.2
peft==0.9.0
datasets==2.18.0
blosc==1.11.1
tensorboardX==2.6.2.2
tensorboard==2.15.1
Expand Down

0 comments on commit 40ad795

Please sign in to comment.