@kashifulhaque
Created November 4, 2024 13:00
GPU compatibility checker — reports CUDA device capabilities (FP16/BF16 support), runs a quick FP16 vs FP32 timing test, and prints suggested training settings.
import torch


def check_gpu_capabilities():
    print("=== GPU Information ===")
    if not torch.cuda.is_available():
        print("❌ CUDA is not available on this system")
        return

    device_count = torch.cuda.device_count()
    print(f"✓ Found {device_count} CUDA device(s)")

    for i in range(device_count):
        # get_device_properties accepts a device index directly;
        # torch.cuda.device(i) is a context manager, not a device handle
        properties = torch.cuda.get_device_properties(i)
        print(f"\nDevice {i}: {properties.name}")
        print(f"Compute Capability: {properties.major}.{properties.minor}")
        print(f"Total Memory: {properties.total_memory / 1024**3:.2f} GB")

        # FP16 needs compute capability 5.3+; BF16 needs 8.0+ (Ampere or newer)
        fp16_supported = properties.major >= 6 or (properties.major == 5 and properties.minor >= 3)
        print(f"FP16 Support: {'✓' if fp16_supported else '❌'}")
        bf16_supported = properties.major >= 8
        print(f"BF16 Support: {'✓' if bf16_supported else '❌'}")

        if fp16_supported:
            try:
                a = torch.ones((2, 2), dtype=torch.float16, device=f'cuda:{i}')
                b = a + a
                print("✓ FP16 Operations verified")
            except RuntimeError as e:
                print(f"❌ FP16 Operations failed: {str(e)}")

        print("\nRunning quick memory bandwidth test...")
        torch.cuda.empty_cache()
        size = 100_000_000  # 100M elements (~400 MB in FP32)

        x = torch.randn(size, dtype=torch.float32, device=f'cuda:{i}')
        torch.cuda.synchronize()

        # CUDA events measure GPU-side time without host-side overhead
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)

        start.record()
        y = x + x
        end.record()
        torch.cuda.synchronize()
        fp32_time = start.elapsed_time(end)
        print(f"FP32 operation time: {fp32_time:.2f}ms")

        if fp16_supported:
            x_half = torch.randn(size, dtype=torch.float16, device=f'cuda:{i}')
            torch.cuda.synchronize()

            start.record()
            y_half = x_half + x_half
            end.record()
            torch.cuda.synchronize()
            fp16_time = start.elapsed_time(end)

            print(f"FP16 operation time: {fp16_time:.2f}ms")
            print(f"FP16 speedup: {fp32_time/fp16_time:.2f}x")

def get_training_recommendations():
    if not torch.cuda.is_available():
        return

    device = torch.cuda.current_device()
    properties = torch.cuda.get_device_properties(device)

    print("\n=== Training Recommendations ===")
    if properties.major >= 6 or (properties.major == 5 and properties.minor >= 3):
        # TrainingArguments comes from Hugging Face Transformers, not core PyTorch
        print("""
Recommended Hugging Face TrainingArguments configuration:

training_args = TrainingArguments(
    fp16=True,                      # Enable mixed precision training
    fp16_opt_level='O1',            # AMP optimization level (used by the apex backend)
    fp16_backend='auto',            # Let the Trainer choose the backend
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
)
""")
    else:
        print("""
Recommended Hugging Face TrainingArguments configuration:

training_args = TrainingArguments(
    fp16=False,                     # FP16 not supported, use full FP32
    per_device_train_batch_size=4,  # Smaller batch size due to memory constraints
    gradient_accumulation_steps=8,
)
""")

if __name__ == "__main__":
    check_gpu_capabilities()
    get_training_recommendations()
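The fp16 recommendation above can also be applied in plain PyTorch via automatic mixed precision, without the Transformers Trainer. The loop below is a minimal, hedged sketch of that pattern using torch.cuda.amp; the model, optimizer, and batch are placeholders, not part of the original gist.

import torch
import torch.nn as nn

model = nn.Linear(512, 512).cuda()           # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()         # scales the loss to avoid FP16 gradient underflow

for _ in range(10):                          # placeholder training loop
    x = torch.randn(8, 512, device='cuda')   # dummy batch
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():          # forward pass runs in mixed precision
        loss = model(x).pow(2).mean()
    scaler.scale(loss).backward()            # backward on the scaled loss
    scaler.step(optimizer)                   # unscales gradients, then steps the optimizer
    scaler.update()                          # adjusts the scale factor for the next iteration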