14 changes: 14 additions & 0 deletions README.md
@@ -66,6 +66,20 @@ Run the extension with output from a command. This uses single-shot mode.
cat README.md | gh models run openai/gpt-4o-mini "summarize this text"
```

#### Evaluating prompts

Run evaluation tests against a model using a `.prompt.yml` file:
```shell
gh models eval my_prompt.prompt.yml
```

The evaluation runs the test cases defined in the prompt file and displays the results in a human-readable format. For programmatic use, you can output the results as JSON instead:
```shell
gh models eval my_prompt.prompt.yml --json
```

The JSON output includes detailed test results, evaluation scores, and summary statistics that can be processed by other tools or CI/CD pipelines.
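As an illustrative sketch only (assuming `jq` is available; the file name `eval-results.json` is arbitrary), a CI step could parse the summary object and fail the build when any test case fails:
```shell
# Run the evaluation and save the structured results to a file.
gh models eval my_prompt.prompt.yml --json > eval-results.json

# Exit non-zero if any test failed; the field names follow the summary object in the JSON output.
jq -e '.summary.failedTests == 0' eval-results.json > /dev/null
```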

## Notice

Remember when interacting with a model you are experimenting with AI, so content mistakes are possible. The feature is
193 changes: 125 additions & 68 deletions cmd/eval/eval.go
@@ -3,6 +3,7 @@ package eval

import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
@@ -15,9 +16,22 @@ import (
"github.com/spf13/cobra"
)

// EvaluationPromptFile represents the structure of a prompt.yml file for evaluation
// It extends the base prompt.File with evaluation-specific fields
type EvaluationPromptFile = prompt.File
// EvaluationSummary represents the overall evaluation summary
type EvaluationSummary struct {
Name string `json:"name"`
Description string `json:"description"`
Model string `json:"model"`
TestResults []TestResult `json:"testResults"`
Summary Summary `json:"summary"`
}

// Summary represents the evaluation summary statistics
type Summary struct {
TotalTests int `json:"totalTests"`
PassedTests int `json:"passedTests"`
FailedTests int `json:"failedTests"`
PassRate float64 `json:"passRate"`
}

// TestResult represents the result of running a test case
type TestResult struct {
@@ -61,12 +75,23 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {
- name: contains-hello
string:
contains: "hello"

By default, results are displayed in a human-readable format. Use the --json flag
to output structured JSON data for programmatic use or integration with CI/CD pipelines.

See https://docs.github.com/github-models/use-github-models/storing-prompts-in-github-repositories#supported-file-format for more information.
`),
Example: "gh models eval my_prompt.prompt.yml",
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
promptFilePath := args[0]

// Get the json flag
jsonOutput, err := cmd.Flags().GetBool("json")
if err != nil {
return err
}

// Load the evaluation prompt file
evalFile, err := loadEvaluationPromptFile(promptFilePath)
if err != nil {
@@ -75,25 +100,28 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {

// Run evaluation
handler := &evalCommandHandler{
cfg: cfg,
client: cfg.Client,
evalFile: evalFile,
cfg: cfg,
client: cfg.Client,
evalFile: evalFile,
jsonOutput: jsonOutput,
}

return handler.runEvaluation(cmd.Context())
},
}

cmd.Flags().Bool("json", false, "Output results in JSON format")
return cmd
}

type evalCommandHandler struct {
cfg *command.Config
client azuremodels.Client
evalFile *EvaluationPromptFile
cfg *command.Config
client azuremodels.Client
evalFile *prompt.File
jsonOutput bool
}

func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) {
func loadEvaluationPromptFile(filePath string) (*prompt.File, error) {
evalFile, err := prompt.LoadFromFile(filePath)
if err != nil {
return nil, fmt.Errorf("failed to load prompt file: %w", err)
@@ -103,23 +131,31 @@ func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) {
}

func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {
h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name))
h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description))
h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model))
h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData)))
h.cfg.WriteToOut("\n")
// Print header info only for human-readable output
if !h.jsonOutput {
h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name))
h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description))
h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model))
h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData)))
h.cfg.WriteToOut("\n")
}

var testResults []TestResult
passedTests := 0
totalTests := len(h.evalFile.TestData)

for i, testCase := range h.evalFile.TestData {
h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests))
if !h.jsonOutput {
h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests))
}

result, err := h.runTestCase(ctx, testCase)
if err != nil {
return fmt.Errorf("test case %d failed: %w", i+1, err)
}

testResults = append(testResults, result)

// Check if all evaluators passed
testPassed := true
for _, evalResult := range result.EvaluationResults {
@@ -131,48 +167,91 @@ func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {

if testPassed {
passedTests++
h.cfg.WriteToOut(" ✓ PASSED\n")
} else {
h.cfg.WriteToOut(" ✗ FAILED\n")
// Show the first 100 characters of the model response when test fails
preview := result.ModelResponse
if len(preview) > 100 {
preview = preview[:100] + "..."
}
h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview))
}

// Show evaluation details
for _, evalResult := range result.EvaluationResults {
status := "✓"
if !evalResult.Passed {
status = "✗"
}
h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n",
status, evalResult.EvaluatorName, evalResult.Score))
if evalResult.Details != "" {
h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details))
}
if !h.jsonOutput {
h.printTestResult(result, testPassed)
}
h.cfg.WriteToOut("\n")
}

// Calculate pass rate
passRate := 100.0
if totalTests > 0 {
passRate = float64(passedTests) / float64(totalTests) * 100
}

if h.jsonOutput {
// Output JSON format
summary := EvaluationSummary{
Name: h.evalFile.Name,
Description: h.evalFile.Description,
Model: h.evalFile.Model,
TestResults: testResults,
Summary: Summary{
TotalTests: totalTests,
PassedTests: passedTests,
FailedTests: totalTests - passedTests,
PassRate: passRate,
},
}

jsonData, err := json.MarshalIndent(summary, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal JSON: %w", err)
}

h.cfg.WriteToOut(string(jsonData) + "\n")
} else {
// Output human-readable format summary
h.printSummary(passedTests, totalTests, passRate)
}

return nil
}

func (h *evalCommandHandler) printTestResult(result TestResult, testPassed bool) {
if testPassed {
h.cfg.WriteToOut(" ✓ PASSED\n")
} else {
h.cfg.WriteToOut(" ✗ FAILED\n")
// Show the first 100 characters of the model response when test fails
preview := result.ModelResponse
if len(preview) > 100 {
preview = preview[:100] + "..."
}
h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview))
}

// Show evaluation details
for _, evalResult := range result.EvaluationResults {
status := "✓"
if !evalResult.Passed {
status = "✗"
}
h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n",
status, evalResult.EvaluatorName, evalResult.Score))
if evalResult.Details != "" {
h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details))
}
}
h.cfg.WriteToOut("\n")
}

func (h *evalCommandHandler) printSummary(passedTests, totalTests int, passRate float64) {
// Summary
h.cfg.WriteToOut("Evaluation Summary:\n")
if totalTests == 0 {
h.cfg.WriteToOut("Passed: 0/0 (0.0%)\n")
h.cfg.WriteToOut("Passed: 0/0 (0.00%)\n")
} else {
h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.1f%%)\n",
passedTests, totalTests, float64(passedTests)/float64(totalTests)*100))
h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.2f%%)\n",
passedTests, totalTests, passRate))
}

if passedTests == totalTests {
h.cfg.WriteToOut("🎉 All tests passed!\n")
} else {
h.cfg.WriteToOut("❌ Some tests failed.\n")
}

return nil
}

func (h *evalCommandHandler) runTestCase(ctx context.Context, testCase map[string]interface{}) (TestResult, error) {
@@ -210,16 +289,9 @@ func (h *evalCommandHandler) templateMessages(testCase map[string]interface{}) (
return nil, fmt.Errorf("failed to template message content: %w", err)
}

var role azuremodels.ChatMessageRole
switch strings.ToLower(msg.Role) {
case "system":
role = azuremodels.ChatMessageRoleSystem
case "user":
role = azuremodels.ChatMessageRoleUser
case "assistant":
role = azuremodels.ChatMessageRoleAssistant
default:
return nil, fmt.Errorf("unknown message role: %s", msg.Role)
role, err := prompt.GetAzureChatMessageRole(msg.Role)
if err != nil {
return nil, err
}

messages = append(messages, azuremodels.ChatMessage{
@@ -236,22 +308,7 @@ func (h *evalCommandHandler) templateString(templateStr string, data map[string]
}

func (h *evalCommandHandler) callModel(ctx context.Context, messages []azuremodels.ChatMessage) (string, error) {
req := azuremodels.ChatCompletionOptions{
Messages: messages,
Model: h.evalFile.Model,
Stream: false,
}

// Apply model parameters
if h.evalFile.ModelParameters.MaxTokens != nil {
req.MaxTokens = h.evalFile.ModelParameters.MaxTokens
}
if h.evalFile.ModelParameters.Temperature != nil {
req.Temperature = h.evalFile.ModelParameters.Temperature
}
if h.evalFile.ModelParameters.TopP != nil {
req.TopP = h.evalFile.ModelParameters.TopP
}
req := h.evalFile.BuildChatCompletionOptions(messages)

resp, err := h.client.GetChatCompletionStream(ctx, req)
if err != nil {