@lhl
Created January 13, 2025 05:58
2025-01 vLLM/Llama 3.3 70B FP8 tokens/joule
# Power Usage Calculator for AI Workloads
'''
# Serving
$ vllm serve meta-llama/Llama-3.3-70B-Instruct --tensor-parallel-size 4 --num-scheduler-steps 20 --quantization=fp8 --gpu-memory-utilization=0.97
INFO 01-13 04:59:05 api_server.py:712] vLLM API server version 0.6.6.post2.dev5+g5ce4627a
# Benchmark - we run bs=64 to match the setup in https://arxiv.org/pdf/2310.03003
# Snippet from run-benchmark.py; variable values filled in from the log below
import os

host = "ip-10-1-33-173"
model = "meta-llama/Llama-3.3-70B-Instruct"
dataset_path = "/fsx/ubuntu/vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json"
num_prompts = 1024
concurrency = 64  # also run at 128

cmd = [
    "python", os.path.expanduser("~/vllm/benchmarks/benchmark_serving.py"),
    "--backend", "openai-chat",
    "--host", host,
    "--port", "8000",
    "--endpoint", "/v1/chat/completions",
    "--model", model,
    "--dataset-name", "sharegpt",
    "--dataset-path", dataset_path,
    "--num-prompts", str(num_prompts),
    "--max-concurrency", str(concurrency),
    "--seed", "42",
]
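A minimal sketch of how run-benchmark.py likely drives this command for both concurrencies; the loop itself is an assumption, only the command and the concurrency values come from the output below:

import datetime
import subprocess

start = datetime.datetime.now()
for concurrency in (64, 128):
    # reuse the cmd list above, swapping only the --max-concurrency value
    run_cmd = list(cmd)
    run_cmd[run_cmd.index("--max-concurrency") + 1] = str(concurrency)
    print(f"Running benchmark with concurrency {concurrency}...")
    print("Command:", " ".join(run_cmd))
    subprocess.run(run_cmd, check=True)
print(f"All benchmarks completed in {datetime.datetime.now() - start}")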
nvitop readings:
- 480W/GPU @ bs=64; 520W/GPU @ bs=128
- 5% CPU usage
Improvements:
- Slurm could automate the full test matrix; vLLM takes about 10 minutes to load the model, so automation would pay off quickly
- use nvidia-smi (and CPU MSRs) to capture real-time power draw and average it, instead of spot readings from nvitop (see the sketch after this list)
- vLLM tuning: add speculative decoding and offline quantization
- SGLang will probably give better bs=64 results
- we don't account for prefix caching, which matters for real-world usage: https://www.amd.com/en/developer/resources/technical-articles/vllm-x-amd-highly-efficient-llm-inference-on-amd-instinct-mi300x-gpus-part1.html
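A minimal sketch of the nvidia-smi sampling mentioned above; the polling interval, window, and averaging are assumptions, not what produced the 480/520 W spot readings:

import subprocess
import time

def sample_gpu_power(duration_s=60, interval_s=1.0):
    """Poll nvidia-smi and return the average power draw (watts) per GPU over the window."""
    samples = []  # each entry: list of per-GPU readings from one poll
    deadline = time.time() + duration_s
    while time.time() < deadline:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=power.draw", "--format=csv,noheader,nounits"],
            text=True,
        )
        samples.append([float(w) for w in out.strip().splitlines()])
        time.sleep(interval_s)
    num_gpus = len(samples[0])
    return [sum(s[i] for s in samples) / len(samples) for i in range(num_gpus)]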
These benchmarks run at tp=4 (half of an 8-GPU node), so the measured token throughput is doubled for the per-node calculation.
Selected node: ip-10-1-33-173
Model: meta-llama/Llama-3.3-70B-Instruct
Running benchmarks with:
- Concurrencies: [64, 128]
- Number of prompts: 1024
- Dataset: /fsx/ubuntu/vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
Running benchmark with concurrency 64...
Command: python /fsx/ubuntu/vllm/benchmarks/benchmark_serving.py --backend openai-chat --host ip-10-1-33-173 --port 8000 --endpoint /v1/chat/completions --model meta-llama/Llama-3.3-70B-Instruct --dataset-name sharegpt --dataset-path /fsx/ubuntu/vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1024 --max-concurrency 64 --seed 42
Namespace(backend='openai-chat', base_url=None, host='ip-10-1-33-173', port=8000, endpoint='/v1/chat/completions', dataset=None, dataset_name='sharegpt', dataset_path='/fsx/ubuntu/vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json', max_concurrency=64, model='meta-llama/Llama-3.3-70B-Instruct', tokenizer=None, best_of=1, use_beam_search=False, num_prompts=1024, logprobs=None, request_rate=inf, burstiness=1.0, seed=42, trust_remote_code=False, disable_tqdm=False, profile=False, save_result=False, metadata=None, result_dir=None, result_filename=None, ignore_eos=False, percentile_metrics='ttft,tpot,itl', metric_percentiles='99', goodput=None, sonnet_input_len=550, sonnet_output_len=150, sonnet_prefix_len=200, sharegpt_output_len=None, random_input_len=1024, random_output_len=128, random_range_ratio=1.0, random_prefix_len=0, hf_subset=None, hf_split=None, hf_output_len=None, tokenizer_mode='auto')
Starting initial single prompt test run...
Initial test run completed. Starting main benchmark run...
Traffic request rate: inf
Burstiness factor: 1.0 (Poisson process)
Maximum request concurrency: 64
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [01:19<00:00, 12.85it/s]
============ Serving Benchmark Result ============
Successful requests: 1024
Benchmark duration (s): 79.70
Total input tokens: 226878
Total generated tokens: 190963
Request throughput (req/s): 12.85
Output token throughput (tok/s): 2396.03
Total Token throughput (tok/s): 5242.70
---------------Time to First Token----------------
Mean TTFT (ms): 373.88
Median TTFT (ms): 327.20
P99 TTFT (ms): 1198.93
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 21.24
Median TPOT (ms): 21.92
P99 TPOT (ms): 26.01
---------------Inter-token Latency----------------
Mean ITL (ms): 22.08
Median ITL (ms): 17.47
P99 ITL (ms): 110.51
==================================================
Running benchmark with concurrency 128...
Command: python /fsx/ubuntu/vllm/benchmarks/benchmark_serving.py --backend openai-chat --host ip-10-1-33-173 --port 8000 --endpoint /v1/chat/completions --model meta-llama/Llama-3.3-70B-Instruct --dataset-name sharegpt --dataset-path /fsx/ubuntu/vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1024 --max-concurrency 128 --seed 42
Namespace(backend='openai-chat', base_url=None, host='ip-10-1-33-173', port=8000, endpoint='/v1/chat/completions', dataset=None, dataset_name='sharegpt', dataset_path='/fsx/ubuntu/vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json', max_concurrency=128, model='meta-llama/Llama-3.3-70B-Instruct', tokenizer=None, best_of=1, use_beam_search=False, num_prompts=1024, logprobs=None, request_rate=inf, burstiness=1.0, seed=42, trust_remote_code=False, disable_tqdm=False, profile=False, save_result=False, metadata=None, result_dir=None, result_filename=None, ignore_eos=False, percentile_metrics='ttft,tpot,itl', metric_percentiles='99', goodput=None, sonnet_input_len=550, sonnet_output_len=150, sonnet_prefix_len=200, sharegpt_output_len=None, random_input_len=1024, random_output_len=128, random_range_ratio=1.0, random_prefix_len=0, hf_subset=None, hf_split=None, hf_output_len=None, tokenizer_mode='auto')
Starting initial single prompt test run...
Initial test run completed. Starting main benchmark run...
Traffic request rate: inf
Burstiness factor: 1.0 (Poisson process)
Maximum request concurrency: 128
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:55<00:00, 18.33it/s]
============ Serving Benchmark Result ============
Successful requests: 1024
Benchmark duration (s): 55.86
Total input tokens: 226878
Total generated tokens: 189959
Request throughput (req/s): 18.33
Output token throughput (tok/s): 3400.87
Total Token throughput (tok/s): 7462.71
---------------Time to First Token----------------
Mean TTFT (ms): 672.30
Median TTFT (ms): 487.93
P99 TTFT (ms): 2095.38
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 27.11
Median TPOT (ms): 28.53
P99 TPOT (ms): 39.71
---------------Inter-token Latency----------------
Mean ITL (ms): 28.24
Median ITL (ms): 19.94
P99 ITL (ms): 199.01
==================================================
All benchmarks completed in 0:03:39.396252
[ble: elapsed 222.450s (CPU 23.2%)] python run-benchmark.py --concurrencies 64 128
# Power Usage
bs=64
--- Power Usage Results ---
total_gpu_power_watts: 3840.00
total_system_power_watts: 4500.00
total_data_center_power_watts: 5400.00
adjusted_tokens_per_second: 10500.00
joules_per_token: 0.51
bs=128
--- Power Usage Results ---
total_gpu_power_watts: 4160.00
total_system_power_watts: 4847.83
total_data_center_power_watts: 5817.39
adjusted_tokens_per_second: 15000.00
joules_per_token: 0.39
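Where the tokens_per_second inputs come from: per the tp=4 note above, presumably the measured total token throughput doubled for a full 8-GPU node and rounded, i.e. 2 x 5242.70 ≈ 10,500 for bs=64 and 2 x 7462.71 ≈ 15,000 for bs=128. The calls below reproduce the two result blocks:

calculate_power_usage(gpu_power_per_unit_watts=480, num_gpus=8,
                      additional_system_power_watts=300, power_supply_efficiency=0.92,
                      pue=1.2, tokens_per_second=10500)   # -> 0.51 J/token (bs=64)
calculate_power_usage(gpu_power_per_unit_watts=520, num_gpus=8,
                      additional_system_power_watts=300, power_supply_efficiency=0.92,
                      pue=1.2, tokens_per_second=15000)   # -> 0.39 J/token (bs=128)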
'''
def calculate_power_usage(
    gpu_power_per_unit_watts=500,
    num_gpus=8,
    additional_system_power_watts=300,
    power_supply_efficiency=0.92,
    pue=1.2,
    tokens_per_second=1500,
    performance_multiplier=1.0
):
    """
    Calculate power usage and joules per token.

    Parameters:
    - gpu_power_per_unit_watts: Power usage per GPU (in watts)
    - num_gpus: Number of GPUs
    - additional_system_power_watts: Power usage for non-GPU components (in watts)
    - power_supply_efficiency: Efficiency of the power supply (0 < efficiency <= 1)
    - pue: Power Usage Effectiveness of the data center
    - tokens_per_second: Throughput in tokens per second
    - performance_multiplier: Factor to adjust for improved performance (e.g., FP8, speculative decode)

    Returns:
    A dictionary containing intermediate calculations and the final joules per token.
    """
    # Total GPU power
    total_gpu_power = gpu_power_per_unit_watts * num_gpus

    # System power including power supply inefficiency
    total_system_power = (total_gpu_power + additional_system_power_watts) / power_supply_efficiency

    # Total power with PUE included
    total_data_center_power = total_system_power * pue

    # Adjust tokens per second for performance multiplier
    adjusted_tokens_per_second = tokens_per_second * performance_multiplier

    # Joules per token
    joules_per_token = total_data_center_power / adjusted_tokens_per_second

    return {
        "total_gpu_power_watts": total_gpu_power,
        "total_system_power_watts": total_system_power,
        "total_data_center_power_watts": total_data_center_power,
        "adjusted_tokens_per_second": adjusted_tokens_per_second,
        "joules_per_token": joules_per_token,
    }
# Example usage
if __name__ == "__main__":
    # Modify these values as needed for testing
    params = {
        "gpu_power_per_unit_watts": 520,
        "num_gpus": 8,
        "additional_system_power_watts": 300,
        "power_supply_efficiency": 0.92,
        "pue": 1.2,
        "tokens_per_second": 15000,
        "performance_multiplier": 1.0,  # raise above 1.0 for e.g. FP8 + speculative decode
    }
    results = calculate_power_usage(**params)

    print("--- Power Usage Results ---")
    for key, value in results.items():
        print(f"{key}: {value:.2f}")