JSON mode for evals
sgoedecke committed Jun 5, 2025
commit 0b65416de4805b14cd40c3af4c7b6d3aed690129
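
This commit adds a --json flag to the eval command: when set, the per-test progress output is suppressed and a single JSON summary document is written instead. As a rough sketch of how that output might be consumed from a CI script — the gh models eval invocation, prompt file name, and mirror struct below are illustrative assumptions, not part of this change; only the JSON field names come from the structs introduced in the diff:

    package main

    import (
        "encoding/json"
        "fmt"
        "os"
        "os/exec"
    )

    // evalSummary mirrors the JSON shape produced by the new EvaluationSummary/Summary
    // structs; testResults is omitted here since json.Unmarshal ignores unknown fields.
    type evalSummary struct {
        Name    string `json:"name"`
        Model   string `json:"model"`
        Summary struct {
            TotalTests  int     `json:"totalTests"`
            PassedTests int     `json:"passedTests"`
            FailedTests int     `json:"failedTests"`
            PassRate    float64 `json:"passRate"`
        } `json:"summary"`
    }

    func main() {
        // Placeholder invocation of the eval command with the new --json flag.
        out, err := exec.Command("gh", "models", "eval", "my_prompt.prompt.yml", "--json").Output()
        if err != nil {
            fmt.Fprintln(os.Stderr, "eval run failed:", err)
            os.Exit(1)
        }

        var s evalSummary
        if err := json.Unmarshal(out, &s); err != nil {
            fmt.Fprintln(os.Stderr, "could not parse JSON summary:", err)
            os.Exit(1)
        }

        // Fail the pipeline if any test case failed; passRate is on a 0-100 scale.
        if s.Summary.FailedTests > 0 {
            fmt.Printf("%s (%s): %d/%d passed (%.1f%%)\n",
                s.Name, s.Model, s.Summary.PassedTests, s.Summary.TotalTests, s.Summary.PassRate)
            os.Exit(1)
        }
        fmt.Printf("%s (%s): all %d tests passed\n", s.Name, s.Model, s.Summary.TotalTests)
    }
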
150 changes: 114 additions & 36 deletions cmd/eval/eval.go
@@ -3,6 +3,7 @@ package eval

import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
@@ -15,6 +16,23 @@ import (
"github.com/spf13/cobra"
)

// EvaluationSummary represents the overall evaluation summary
type EvaluationSummary struct {
Name string `json:"name"`
Description string `json:"description"`
Model string `json:"model"`
TestResults []TestResult `json:"testResults"`
Summary Summary `json:"summary"`
}

// Summary represents the evaluation summary statistics
type Summary struct {
TotalTests int `json:"totalTests"`
PassedTests int `json:"passedTests"`
FailedTests int `json:"failedTests"`
PassRate float64 `json:"passRate"`
}
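
For reference (illustrative values, not part of the diff): with, say, 3 of 4 test cases passing, the json.MarshalIndent call added later in this change would serialize these structs roughly as

    {
      "name": "my-eval",
      "description": "Example evaluation",
      "model": "example-model",
      "testResults": [ ... ],
      "summary": {
        "totalTests": 4,
        "passedTests": 3,
        "failedTests": 1,
        "passRate": 75
      }
    }

where the entries under testResults follow the TestResult struct defined below.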

// TestResult represents the result of running a test case
type TestResult struct {
TestCase map[string]interface{} `json:"testCase"`
@@ -64,6 +82,12 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
promptFilePath := args[0]

// Get the json flag
jsonOutput, err := cmd.Flags().GetBool("json")
if err != nil {
return err
}

// Load the evaluation prompt file
evalFile, err := loadEvaluationPromptFile(promptFilePath)
@@ -73,22 +97,25 @@

// Run evaluation
handler := &evalCommandHandler{
cfg: cfg,
client: cfg.Client,
evalFile: evalFile,
cfg: cfg,
client: cfg.Client,
evalFile: evalFile,
jsonOutput: jsonOutput,
}

return handler.runEvaluation(cmd.Context())
},
}

cmd.Flags().Bool("json", false, "Output results in JSON format")
return cmd
}

type evalCommandHandler struct {
cfg *command.Config
client azuremodels.Client
evalFile *prompt.File
cfg *command.Config
client azuremodels.Client
evalFile *prompt.File
jsonOutput bool
}

func loadEvaluationPromptFile(filePath string) (*prompt.File, error) {
@@ -101,22 +128,30 @@ func loadEvaluationPromptFile(filePath string) (*prompt.File, error) {
}

func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {
h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name))
h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description))
h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model))
h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData)))
h.cfg.WriteToOut("\n")
// Print header info only for human-readable output
if !h.jsonOutput {
h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name))
h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description))
h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model))
h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData)))
h.cfg.WriteToOut("\n")
}

var testResults []TestResult
passedTests := 0
totalTests := len(h.evalFile.TestData)

for i, testCase := range h.evalFile.TestData {
h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests))
if !h.jsonOutput {
h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests))
}

result, err := h.runTestCase(ctx, testCase)
if err != nil {
return fmt.Errorf("test case %d failed: %w", i+1, err)
}

testResults = append(testResults, result)

// Check if all evaluators passed
testPassed := true
@@ -129,48 +164,91 @@ func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {

if testPassed {
passedTests++
h.cfg.WriteToOut(" ✓ PASSED\n")
} else {
h.cfg.WriteToOut(" ✗ FAILED\n")
// Show the first 100 characters of the model response when test fails
preview := result.ModelResponse
if len(preview) > 100 {
preview = preview[:100] + "..."
}
h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview))
}

// Show evaluation details
for _, evalResult := range result.EvaluationResults {
status := "✓"
if !evalResult.Passed {
status = "✗"
}
h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n",
status, evalResult.EvaluatorName, evalResult.Score))
if evalResult.Details != "" {
h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details))
}
if !h.jsonOutput {
h.printTestResult(result, testPassed)
}
h.cfg.WriteToOut("\n")
}

// Calculate pass rate
passRate := 0.0
if totalTests > 0 {
passRate = float64(passedTests) / float64(totalTests) * 100
}

if h.jsonOutput {
// Output JSON format
summary := EvaluationSummary{
Name: h.evalFile.Name,
Description: h.evalFile.Description,
Model: h.evalFile.Model,
TestResults: testResults,
Summary: Summary{
TotalTests: totalTests,
PassedTests: passedTests,
FailedTests: totalTests - passedTests,
PassRate: passRate,
},
}

jsonData, err := json.MarshalIndent(summary, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal JSON: %w", err)
}

h.cfg.WriteToOut(string(jsonData) + "\n")
} else {
// Output human-readable format summary
h.printSummary(passedTests, totalTests, passRate)
}

return nil
}

func (h *evalCommandHandler) printTestResult(result TestResult, testPassed bool) {
if testPassed {
h.cfg.WriteToOut(" ✓ PASSED\n")
} else {
h.cfg.WriteToOut(" ✗ FAILED\n")
// Show the first 100 characters of the model response when test fails
preview := result.ModelResponse
if len(preview) > 100 {
preview = preview[:100] + "..."
}
h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview))
}

// Show evaluation details
for _, evalResult := range result.EvaluationResults {
status := "✓"
if !evalResult.Passed {
status = "✗"
}
h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n",
status, evalResult.EvaluatorName, evalResult.Score))
if evalResult.Details != "" {
h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details))
}
}
h.cfg.WriteToOut("\n")
}

func (h *evalCommandHandler) printSummary(passedTests, totalTests int, passRate float64) {
// Summary
h.cfg.WriteToOut("Evaluation Summary:\n")
if totalTests == 0 {
h.cfg.WriteToOut("Passed: 0/0 (0.0%)\n")
} else {
h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.1f%%)\n",
passedTests, totalTests, float64(passedTests)/float64(totalTests)*100))
passedTests, totalTests, passRate))
}

if passedTests == totalTests {
h.cfg.WriteToOut("🎉 All tests passed!\n")
} else {
h.cfg.WriteToOut("❌ Some tests failed.\n")
}

return nil
}

func (h *evalCommandHandler) runTestCase(ctx context.Context, testCase map[string]interface{}) (TestResult, error) {