DSPy LLM evaluation with a metric, using llama.cpp
# A gist for using a `llama.cpp` model with the `dspy` library.
#
# DSPy features used in this gist:
# - `dspy.Predict`
# - `dspy.Signature`
# - `dspy.context`
# - `dspy.evaluate.Evaluate`
#
# The script prompts the model to answer example questions, then assesses the
# correctness and engagingness of each answer using an evaluator.
#
# Install `llama.cpp` from Homebrew (it ships with a built-in OpenAI-compatible server):
#   brew install ggerganov/ggerganov/llama.cpp
#   llama-server --hf-repo TheBloke/Mistral-7B-Instruct-v0.2-GGUF --model mistral-7b-instruct-v0.2.Q4_K_M.gguf --hf-file mistral-7b-instruct-v0.2.Q4_K_M.gguf
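#
# Optional sanity check before running the script (assumes llama-server's usual
# OpenAI-compatible endpoints; exact routes may differ between llama.cpp versions):
#   curl http://localhost:8080/health
#   curl http://localhost:8080/v1/models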
import dspy
from dspy.evaluate import Evaluate

# Optional: for displaying the results on stdout as tables
from rich import print
from rich.table import Table
# The `llama.cpp` model, exposed through llama-server's OpenAI-compatible API
llama_cpp_model = dspy.OpenAI(
    # assumes llama-server is running on localhost:8080
    api_base="http://localhost:8080/v1/",
    # placeholder; the client raises an error if no key is provided
    api_key="none",
    # for some reason, an error is raised if this is set to `text` (llama-server issue?)
    model_type="chat",
    # stop word for mistral-7b-instruct-v0.2
    stop="\n\n",
    # max number of tokens to generate
    max_tokens=250,
)
dspy.settings.configure(lm=llama_cpp_model)
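
# Note: `dspy.settings.configure` sets the default LM for all DSPy modules; the
# `dspy.context(lm=...)` block inside the metric below overrides it locally.
# Here both point at the same model, so the context block is shown mainly for
# illustration of the feature.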
# The example question-answer pairs; we already know the answers and want to
# assess the correctness and engagingness of the generated answers in the evaluator
examples = [
    dspy.Example(
        question="Are both Nehi and Nectar d.o.o. part of the beverage industry?",
        answer="yes",
    ).with_inputs("question"),
    dspy.Example(
        question="Angela Diniz was born in Belo Horizonte which is the capital of which Brazilian state",
        answer="Minas Gerais",
    ).with_inputs("question"),
    dspy.Example(
        question="Which Indian symphonic conductor serves on the Anotonin Dvorak Music Festival named after the world-renowned Czech composer?",
        answer="Debashish Chaudhuri",
    ).with_inputs("question"),
]
# A DSPy signature for automatic assessment of a question-answer pair
class Assess(dspy.Signature):
    """Assess the quality of an answer to a question."""

    assessed_text = dspy.InputField()
    assessment_question = dspy.InputField()
    assessment_answer = dspy.OutputField(desc="Yes or No")


# The predict module built from the assessment signature,
# used in the correct_engaging_metric function below
assess_pred = dspy.Predict(Assess)
# A metric returning a score between 0 and 1, combining the correctness and the
# engagingness of the generated answer
def correct_engaging_metric(gold, pred, trace=None):
    question, answer, gen_answer = gold.question, gold.answer, pred.answer
    engaging = "Is the assessed text self-contained and informative?"
    correct = f"The text should answer `{question}` with `{answer}`. Does the assessed text contain this answer?"
    with dspy.context(lm=llama_cpp_model):
        correct = assess_pred(assessed_text=gen_answer, assessment_question=correct)
        engaging = assess_pred(assessed_text=gen_answer, assessment_question=engaging)
    correct, engaging = [
        "yes" in m.assessment_answer.lower() for m in [correct, engaging]
    ]
    score = correct + engaging
    # during compilation/optimization (trace is not None), require both checks to pass
    if trace is not None:
        return score >= 2
    return score / 2.0
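

# Illustrative only (this is not how Evaluate invokes it, just a manual sanity check):
# the metric takes a gold example and a prediction and returns a float in [0, 1], e.g.
#   correct_engaging_metric(examples[0], dspy.Prediction(answer="yes"))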
# A predict module accepts a signature (either a string or a `dspy.Signature` class).
# The following are example signature strings:
#   question -> answer
#   sentence -> sentiment
#   document -> summary
#   text -> gist
#   long_context -> tldr
#   content, question -> answer
#   question, choices -> reasoning, selection
#
# example:
#   predict_module = dspy.Predict('document -> summary')

# A predict module for answering questions
qa_predict_module = dspy.Predict("question -> answer")
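
# Optional: call the predict module directly to see a single generation before
# running the full evaluation (this uses the LM configured above), e.g.
#   pred = qa_predict_module(question=examples[0].question)
#   print(pred.answer)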
# Initialize the evaluator with our dataset and the metric
evaluator = Evaluate(
    metric=correct_engaging_metric,
    devset=examples,
    num_threads=4,
    display_progress=True,
)
score = evaluator(qa_predict_module)

# Render the result as a table on stdout
table = Table(title="Evaluate")
table.add_column("Number of Examples")
table.add_column("Evaluate Score (%)", style="green")
table.add_row(str(len(examples)), str(score))
print(table)
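
# Optional debugging aid (assumes the DSPy LM client exposes `inspect_history`,
# which prints the most recent prompts/completions sent to llama-server):
#   llama_cpp_model.inspect_history(n=1)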