Created
November 4, 2024 13:00
-
-
Save kashifulhaque/73a3f65dc203687498f186af5a54c516 to your computer and use it in GitHub Desktop.
Revisions
-
kashifulhaque created this gist
Nov 4, 2024. There are no files selected for viewing
"""Print CUDA device capabilities and a matching training configuration hint."""

import torch
import numpy as np

# Number of elements in the tensors used by the timing test.
_BENCHMARK_SIZE = 100_000_000

# NOTE(review): TrainingArguments looks like the Hugging Face `transformers`
# class rather than a PyTorch API -- confirm the option names against the
# installed version before copying these snippets.
_FP16_RECOMMENDATION = """
Recommended PyTorch training configuration:

training_args = TrainingArguments(
    fp16=True,  # Enable mixed precision training
    fp16_opt_level='O1',  # Mixed precision optimization level
    fp16_backend='auto',  # Let PyTorch choose the backend
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
)
"""

_FP32_RECOMMENDATION = """
Recommended PyTorch training configuration:

training_args = TrainingArguments(
    fp16=False,  # FP16 not supported, using FP32
    per_device_train_batch_size=4,  # Smaller batch size due to memory constraints
    gradient_accumulation_steps=8,
)
"""


def _supports_fp16(major, minor):
    """Return True when compute capability ``major.minor`` is at least 5.3."""
    return major >= 6 or (major == 5 and minor >= 3)


def _time_elementwise_add(size, dtype, device):
    """Return the milliseconds CUDA events report for one ``x + x`` on ``device``.

    Must be called while ``device`` is the current CUDA device: the events are
    recorded on the current stream, so they only bracket the kernel when that
    stream belongs to the device holding the tensor.
    """
    x = torch.randn(size, dtype=dtype, device=device)
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    _ = x + x
    end.record()
    # elapsed_time() requires both events to have completed.
    torch.cuda.synchronize()
    # x and the result are released on return, so the next run does not have
    # to fit alongside them in device memory.
    return start.elapsed_time(end)


def _report_device(index):
    """Print properties, FP16/BF16 support and timing results for one device."""
    properties = torch.cuda.get_device_properties(index)
    device_name = f"cuda:{index}"

    print(f"\nDevice {index}: {properties.name}")
    print(f"Compute Capability: {properties.major}.{properties.minor}")
    print(f"Total Memory: {properties.total_memory / 1024**3:.2f} GB")

    fp16_supported = _supports_fp16(properties.major, properties.minor)
    print(f"FP16 Support: {'✓' if fp16_supported else '❌'}")

    bf16_supported = properties.major >= 8
    print(f"BF16 Support: {'✓' if bf16_supported else '❌'}")

    if fp16_supported:
        try:
            a = torch.ones((2, 2), dtype=torch.float16, device=device_name)
            _ = a + a
        except RuntimeError as e:
            print(f"❌ FP16 Operations failed: {str(e)}")
        else:
            print("✓ FP16 Operations verified")

    print("\nRunning quick memory bandwidth test...")
    torch.cuda.empty_cache()

    # Make `index` the current device so that the timing events and the
    # synchronize calls target the same device as the tensors.
    with torch.cuda.device(index):
        fp32_time = _time_elementwise_add(
            _BENCHMARK_SIZE, torch.float32, device_name
        )
        fp16_time = None
        if fp16_supported:
            fp16_time = _time_elementwise_add(
                _BENCHMARK_SIZE, torch.float16, device_name
            )

    print(f"FP32 operation time: {fp32_time:.2f}ms")
    if fp16_time is not None:
        print(f"FP16 operation time: {fp16_time:.2f}ms")
        if fp16_time > 0:
            print(f"FP16 speedup: {fp32_time/fp16_time:.2f}x")
        else:
            # Avoid ZeroDivisionError if the timer reports no elapsed time.
            print("FP16 speedup: n/a")


def check_gpu_capabilities():
    """Print a capability report for every visible CUDA device.

    Prints a notice and returns early when CUDA is not available. Otherwise,
    for each device, prints its name, compute capability, total memory,
    FP16/BF16 support flags and the time taken by an element-wise add in
    FP32 (and in FP16 when supported).

    Returns:
        None. All results are written to standard output.
    """
    print("=== GPU Information ===")

    if not torch.cuda.is_available():
        print("❌ CUDA is not available on this system")
        return

    device_count = torch.cuda.device_count()
    print(f"✓ Found {device_count} CUDA device(s)")

    for i in range(device_count):
        _report_device(i)


def get_training_recommendations():
    """Print a suggested training configuration for the current CUDA device.

    Does nothing when CUDA is not available. Otherwise prints an FP16
    configuration when the current device passes the FP16 capability check,
    and an FP32 configuration when it does not.

    Returns:
        None. The recommendation is written to standard output.
    """
    if not torch.cuda.is_available():
        return

    device = torch.cuda.current_device()
    properties = torch.cuda.get_device_properties(device)

    print("\n=== Training Recommendations ===")

    if _supports_fp16(properties.major, properties.minor):
        print(_FP16_RECOMMENDATION)
    else:
        print(_FP32_RECOMMENDATION)


if __name__ == "__main__":
    check_gpu_capabilities()
    get_training_recommendations()