14 changes: 14 additions & 0 deletions README.md
@@ -66,6 +66,20 @@ Run the extension with output from a command. This uses single-shot mode.
cat README.md | gh models run openai/gpt-4o-mini "summarize this text"
```

#### Evaluating prompts

Run evaluation tests against a model using a `.prompt.yml` file:
```shell
gh models eval my_prompt.prompt.yml
```

The evaluation runs the test cases defined in the prompt file and displays the results in a human-readable format. For programmatic use, you can output the results as JSON instead:
```shell
gh models eval my_prompt.prompt.yml --json
```

The JSON output includes detailed test results, evaluation scores, and summary statistics that can be processed by other tools or CI/CD pipelines.
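As an illustrative sketch only (assuming `jq` is available; the file name `eval-results.json` is arbitrary), a CI step could parse the summary object and fail the build when any test case fails:
```shell
# Run the evaluation and save the structured results to a file.
gh models eval my_prompt.prompt.yml --json > eval-results.json

# Exit non-zero if any test failed; the field names follow the summary object in the JSON output.
jq -e '.summary.failedTests == 0' eval-results.json > /dev/null
```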

## Notice

Remember when interacting with a model you are experimenting with AI, so content mistakes are possible. The feature is
193 changes: 125 additions & 68 deletions cmd/eval/eval.go
@@ -3,6 +3,7 @@ package eval

import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
@@ -15,9 +16,22 @@ import (
"github.com/spf13/cobra"
)

// EvaluationPromptFile represents the structure of a prompt.yml file for evaluation
// It extends the base prompt.File with evaluation-specific fields
type EvaluationPromptFile = prompt.File
// EvaluationSummary represents the overall evaluation summary
type EvaluationSummary struct {
Name string `json:"name"`
Description string `json:"description"`
Model string `json:"model"`
TestResults []TestResult `json:"testResults"`
Summary Summary `json:"summary"`
}

// Summary represents the evaluation summary statistics
type Summary struct {
TotalTests int `json:"totalTests"`
PassedTests int `json:"passedTests"`
FailedTests int `json:"failedTests"`
PassRate float64 `json:"passRate"`
}

// TestResult represents the result of running a test case
type TestResult struct {
@@ -61,12 +75,23 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {
- name: contains-hello
string:
contains: "hello"

By default, results are displayed in a human-readable format. Use the --json flag
to output structured JSON data for programmatic use or integration with CI/CD pipelines.

See https://docs.github.com/github-models/use-github-models/storing-prompts-in-github-repositories#supported-file-format for more information.
`),
Example: "gh models eval my_prompt.prompt.yml",
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
promptFilePath := args[0]

// Get the json flag
jsonOutput, err := cmd.Flags().GetBool("json")
if err != nil {
return err
}

// Load the evaluation prompt file
evalFile, err := loadEvaluationPromptFile(promptFilePath)
if err != nil {
@@ -75,25 +100,28 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command {

// Run evaluation
handler := &evalCommandHandler{
cfg: cfg,
client: cfg.Client,
evalFile: evalFile,
cfg: cfg,
client: cfg.Client,
evalFile: evalFile,
jsonOutput: jsonOutput,
}

return handler.runEvaluation(cmd.Context())
},
}

cmd.Flags().Bool("json", false, "Output results in JSON format")
return cmd
}

type evalCommandHandler struct {
cfg *command.Config
client azuremodels.Client
evalFile *EvaluationPromptFile
cfg *command.Config
client azuremodels.Client
evalFile *prompt.File
jsonOutput bool
}

func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) {
func loadEvaluationPromptFile(filePath string) (*prompt.File, error) {
evalFile, err := prompt.LoadFromFile(filePath)
if err != nil {
return nil, fmt.Errorf("failed to load prompt file: %w", err)
@@ -103,23 +131,31 @@ func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) {
}

func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {
h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name))
h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description))
h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model))
h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData)))
h.cfg.WriteToOut("\n")
// Print header info only for human-readable output
if !h.jsonOutput {
h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name))
h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description))
h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model))
h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData)))
h.cfg.WriteToOut("\n")
}

var testResults []TestResult
passedTests := 0
totalTests := len(h.evalFile.TestData)

for i, testCase := range h.evalFile.TestData {
h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests))
if !h.jsonOutput {
h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests))
}

result, err := h.runTestCase(ctx, testCase)
if err != nil {
return fmt.Errorf("test case %d failed: %w", i+1, err)
}

testResults = append(testResults, result)

// Check if all evaluators passed
testPassed := true
for _, evalResult := range result.EvaluationResults {
@@ -131,48 +167,91 @@ func (h *evalCommandHandler) runEvaluation(ctx context.Context) error {

if testPassed {
passedTests++
h.cfg.WriteToOut(" ✓ PASSED\n")
} else {
h.cfg.WriteToOut(" ✗ FAILED\n")
// Show the first 100 characters of the model response when test fails
preview := result.ModelResponse
if len(preview) > 100 {
preview = preview[:100] + "..."
}
h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview))
}

// Show evaluation details
for _, evalResult := range result.EvaluationResults {
status := "✓"
if !evalResult.Passed {
status = "✗"
}
h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n",
status, evalResult.EvaluatorName, evalResult.Score))
if evalResult.Details != "" {
h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details))
}
if !h.jsonOutput {
h.printTestResult(result, testPassed)
}
h.cfg.WriteToOut("\n")
}

// Calculate pass rate
passRate := 100.0
if totalTests > 0 {
passRate = float64(passedTests) / float64(totalTests) * 100
}

if h.jsonOutput {
// Output JSON format
summary := EvaluationSummary{
Name: h.evalFile.Name,
Description: h.evalFile.Description,
Model: h.evalFile.Model,
TestResults: testResults,
Summary: Summary{
TotalTests: totalTests,
PassedTests: passedTests,
FailedTests: totalTests - passedTests,
PassRate: passRate,
},
}

jsonData, err := json.MarshalIndent(summary, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal JSON: %w", err)
}

h.cfg.WriteToOut(string(jsonData) + "\n")
} else {
// Output human-readable format summary
h.printSummary(passedTests, totalTests, passRate)
}

return nil
}

func (h *evalCommandHandler) printTestResult(result TestResult, testPassed bool) {
if testPassed {
h.cfg.WriteToOut(" ✓ PASSED\n")
} else {
h.cfg.WriteToOut(" ✗ FAILED\n")
// Show the first 100 characters of the model response when test fails
preview := result.ModelResponse
if len(preview) > 100 {
preview = preview[:100] + "..."
}
h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview))
}

// Show evaluation details
for _, evalResult := range result.EvaluationResults {
status := "✓"
if !evalResult.Passed {
status = "✗"
}
h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n",
status, evalResult.EvaluatorName, evalResult.Score))
if evalResult.Details != "" {
h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details))
}
}
h.cfg.WriteToOut("\n")
}

func (h *evalCommandHandler) printSummary(passedTests, totalTests int, passRate float64) {
// Summary
h.cfg.WriteToOut("Evaluation Summary:\n")
if totalTests == 0 {
h.cfg.WriteToOut("Passed: 0/0 (0.0%)\n")
h.cfg.WriteToOut("Passed: 0/0 (0.00%)\n")
} else {
h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.1f%%)\n",
passedTests, totalTests, float64(passedTests)/float64(totalTests)*100))
h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.2f%%)\n",
passedTests, totalTests, passRate))
}

if passedTests == totalTests {
h.cfg.WriteToOut("🎉 All tests passed!\n")
} else {
h.cfg.WriteToOut("❌ Some tests failed.\n")
}

return nil
}

func (h *evalCommandHandler) runTestCase(ctx context.Context, testCase map[string]interface{}) (TestResult, error) {
@@ -210,16 +289,9 @@ func (h *evalCommandHandler) templateMessages(testCase map[string]interface{}) (
return nil, fmt.Errorf("failed to template message content: %w", err)
}

var role azuremodels.ChatMessageRole
switch strings.ToLower(msg.Role) {
case "system":
role = azuremodels.ChatMessageRoleSystem
case "user":
role = azuremodels.ChatMessageRoleUser
case "assistant":
role = azuremodels.ChatMessageRoleAssistant
default:
return nil, fmt.Errorf("unknown message role: %s", msg.Role)
role, err := prompt.GetAzureChatMessageRole(msg.Role)
if err != nil {
return nil, err
}

messages = append(messages, azuremodels.ChatMessage{
@@ -236,22 +308,7 @@ func (h *evalCommandHandler) templateString(templateStr string, data map[string]
}

func (h *evalCommandHandler) callModel(ctx context.Context, messages []azuremodels.ChatMessage) (string, error) {
req := azuremodels.ChatCompletionOptions{
Messages: messages,
Model: h.evalFile.Model,
Stream: false,
}

// Apply model parameters
if h.evalFile.ModelParameters.MaxTokens != nil {
req.MaxTokens = h.evalFile.ModelParameters.MaxTokens
}
if h.evalFile.ModelParameters.Temperature != nil {
req.Temperature = h.evalFile.ModelParameters.Temperature
}
if h.evalFile.ModelParameters.TopP != nil {
req.TopP = h.evalFile.ModelParameters.TopP
}
req := h.evalFile.BuildChatCompletionOptions(messages)

resp, err := h.client.GetChatCompletionStream(ctx, req)
if err != nil {