kashifulhaque · November 4, 2024 13:00 · Nov 4, 2024
diff --git a/gpu-compatibility-checker.py b/gpu-compatibility-checker.py
@@ -0,0 +1,97 @@
+import torch
+import numpy as np
+
+def check_gpu_capabilities():
+  """Print specs and FP16/BF16 support for each CUDA GPU, then time x + x on it.
+
+  Prints a notice and returns early when CUDA is unavailable. Returns None.
+  """
+  print("=== GPU Information ===")
+  if not torch.cuda.is_available():
+    print("❌ CUDA is not available on this system")
+    return
+
+  def timed_add(dtype, size=100_000_000):
+    """Time one x + x over `size` elements on the current GPU, in milliseconds."""
+    x = torch.randn(size, dtype=dtype, device="cuda")
+    torch.add(x, x)  # warm-up so one-time CUDA start-up cost is not measured
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    torch.cuda.synchronize()
+    start.record()
+    torch.add(x, x)
+    end.record()
+    torch.cuda.synchronize()
+    return start.elapsed_time(end)
+
+  device_count = torch.cuda.device_count()
+  print(f"✓ Found {device_count} CUDA device(s)")
+
+  for i in range(device_count):
+    properties = torch.cuda.get_device_properties(i)
+    print(f"\nDevice {i}: {properties.name}")
+    print(f"Compute Capability: {properties.major}.{properties.minor}")
+    print(f"Total Memory: {properties.total_memory / 1024**3:.2f} GB")
+
+    fp16_supported = (properties.major, properties.minor) >= (5, 3)
+    print(f"FP16 Support: {'✓' if fp16_supported else '❌'}")
+    bf16_supported = properties.major >= 8
+    print(f"BF16 Support: {'✓' if bf16_supported else '❌'}")
+
+    # Events and synchronize() act on the current device, so select device i.
+    with torch.cuda.device(i):
+      if fp16_supported:
+        try:
+          a = torch.ones((2, 2), dtype=torch.float16, device="cuda")
+          torch.add(a, a)
+          print("✓ FP16 Operations verified")
+        except RuntimeError as e:
+          print(f"❌ FP16 Operations failed: {str(e)}")
+
+      print("\nRunning quick memory bandwidth test...")
+      torch.cuda.empty_cache()
+      try:
+        fp32_time = timed_add(torch.float32)
+        print(f"FP32 operation time: {fp32_time:.2f}ms")
+        if fp16_supported:
+          fp16_time = timed_add(torch.float16)
+          print(f"FP16 operation time: {fp16_time:.2f}ms")
+          if fp16_time > 0:  # guard the ratio against a zero reading
+            print(f"FP16 speedup: {fp32_time/fp16_time:.2f}x")
+      except RuntimeError as e:  # e.g. CUDA out of memory on a small GPU
+        print(f"❌ Bandwidth test failed: {str(e)}")
+def get_training_recommendations():
+  """Print a Hugging Face-style `TrainingArguments` suggestion for the current GPU.
+
+  Recommends fp16 when the current CUDA device has compute capability >= 5.3,
+  an FP32 setup otherwise; prints nothing when CUDA is unavailable.
+  """
+  if not torch.cuda.is_available():
+    return
+
+  properties = torch.cuda.get_device_properties(torch.cuda.current_device())
+  print("\n=== Training Recommendations ===")
+
+  if (properties.major, properties.minor) >= (5, 3):
+    options = (
+      "fp16=True,                      # Enable mixed precision training",
+      "fp16_opt_level='O1',            # Mixed precision optimization level",
+      # fp16_backend is deprecated in transformers; this is its replacement.
+      "half_precision_backend='auto',  # Let PyTorch choose the backend",
+      "per_device_train_batch_size=8,",
+      "gradient_accumulation_steps=4,",
+    )
+  else:
+    options = (
+      "fp16=False,                     # FP16 not supported, using FP32",
+      "per_device_train_batch_size=4,  # Smaller batch size due to memory constraints",
+      "gradient_accumulation_steps=8,",
+    )
+  # Emit the snippet at column 0 so it can be pasted without an IndentationError.
+  arguments = "\n".join(f"  {option}" for option in options)
+  print("\nRecommended PyTorch training configuration:\n")
+  print(f"training_args = TrainingArguments(\n{arguments}\n)\n")
+
+if __name__ == "__main__":  # script entry point; skipped when imported
+  for report in (check_gpu_capabilities, get_training_recommendations):
+    report()