
Update evaluator_modified.py
Nahush26 authored Feb 6, 2024
1 parent f09a522 commit 4dfde6f
Showing 1 changed file with 141 additions and 3 deletions.
144 changes: 141 additions & 3 deletions evaluator_modified.py
@@ -147,6 +147,139 @@ def simple_evaluate_chunk(
    return results


@positional_deprecated
def full_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=0,
    batch_size=None,
    max_batch_size=None,
    device=None,
    no_cache=False,
    limit=None,
    bootstrap_iters=100000,
    description_dict=None,
    check_integrity=False,
    decontamination_ngrams_path=None,
    write_out=False,
    output_base_path=None,
    reduce=None,
):
"""Instantiate and evaluate a model on a list of tasks.
:param model: Union[str, LM]
Name of model, transformers.PreTrainedModel object, or LM object, see lm_eval.models.get_model
:param model_args: Optional[str]
String arguments for each model class, see LM.create_from_arg_string.
Ignored if `model` argument is a LM object.
:param tasks: list[Union[str, Task]]
List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
:param num_fewshot: int
Number of examples in few-shot context
:param batch_size: int or str, optional
Batch size for model
:param max_batch_size: int, optional
Maximal batch size to try with automatic batch size detection
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache
:param limit: int or float, optional
Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description`
:param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks
:param write_out: bool
If True, write details about prompts and logits to json for all tasks
:param output_base_path: str, optional
Directory to which detailed eval info will be written. Defaults to present working dir.
:return
Dictionary of results
"""
    random.seed(1234)
    np.random.seed(1234)

    assert tasks != [], "No tasks specified"

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.models.get_model(model).create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
                "device": device,
            },
        )
    elif isinstance(model, transformers.PreTrainedModel):
        lm = lm_eval.models.get_model("hf-causal")(
            pretrained=model,
            batch_size=batch_size,
            max_batch_size=max_batch_size,
        )
        no_cache = True
    else:
        assert isinstance(model, lm_eval.base.LM)
        lm = model

    if not no_cache:
        lm = lm_eval.base.CachingLM(
            lm,
            "lm_cache/"
            + (model if isinstance(model, str) else model.model.config._name_or_path)
            + "_"
            + model_args.replace("=", "-").replace(",", "_").replace("/", "-")
            + ".db",
        )

    task_dict = lm_eval.tasks.get_task_dict(tasks)

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        chunk_num=None,  # None selects the "remaining docs" branch added to evaluate() below
        num_fewshot=num_fewshot,
        limit=0.2,  # hard-coded to 20%; the `limit` argument is recorded in the config but not applied here
        bootstrap_iters=bootstrap_iters,
        description_dict=description_dict,
        decontamination_ngrams_path=decontamination_ngrams_path,
        write_out=write_out,
        output_base_path=output_base_path,
        reduce=reduce,
    )

    # add info about the model and few shot config
    model_name = None
    if isinstance(model, str):
        model_name = model
    elif isinstance(model, transformers.PreTrainedModel):
        model_name = "pretrained=" + model.config._name_or_path
    results["config"] = {
        "model": model_name,
        "model_args": model_args,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
        "batch_sizes": list(lm.batch_sizes.values())
        if hasattr(lm, "batch_sizes")
        else [],
        "device": device,
        "no_cache": no_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
        "description_dict": description_dict,
    }

    return results


decontaminate_suffix = "_decontaminate"
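
Before the diff continues, a minimal usage sketch of the new entry point; the import path, model string, and task names are illustrative assumptions, not part of this commit:

    from evaluator_modified import full_evaluate  # hypothetical import path

    # Evaluate a Hugging Face causal LM on two benchmark tasks. Because
    # full_evaluate calls evaluate() with chunk_num=None and limit=0.2,
    # roughly the last 80% of each task's shuffled documents are scored.
    results = full_evaluate(
        model="hf-causal",
        model_args="pretrained=gpt2",
        tasks=["hellaswag", "piqa"],
        num_fewshot=0,
        batch_size=8,
        device="cuda:0",
    )

The CachingLM branch above sanitizes `model_args` into a cache filename; a small illustration of that mapping, with hypothetical values:

    model = "hf-causal"
    model_args = "pretrained=EleutherAI/gpt-neo-125M,dtype=float16"
    cache_path = ("lm_cache/" + model + "_"
                  + model_args.replace("=", "-").replace(",", "_").replace("/", "-")
                  + ".db")
    print(cache_path)
    # lm_cache/hf-causal_pretrained-EleutherAI-gpt-neo-125M_dtype-float16.db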


@@ -263,9 +396,12 @@ def evaluate(
)
        if limit is not None:
            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
-
-            lower_bound = int(chunk_num*limit)
-            upper_bound = int(lower_bound + limit)
+            if(chunk_num is not None):
+                lower_bound = int(chunk_num*limit)
+                upper_bound = int(lower_bound + limit)
+            else:
+                lower_bound = int(1*limit)
+                upper_bound = len(task_docs) - 1
        print(f"From index {lower_bound} to index {upper_bound}")
        for doc_id, doc in enumerate(itertools.islice(task_docs, lower_bound, upper_bound)):
            if decontaminate and task.should_decontaminate():
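
A standalone sketch of the new bounds logic, assuming `limit` has already been resolved to an integer chunk size. Note that `itertools.islice` treats its stop argument as exclusive, so in the `chunk_num is None` branch (`upper_bound = len(task_docs) - 1`) the final document is never scored:

    import itertools

    # Hypothetical helper mirroring the branch added above.
    def chunk_bounds(chunk_num, limit, n_docs):
        if chunk_num is not None:
            lower = int(chunk_num * limit)   # start of the requested chunk
            upper = int(lower + limit)       # exclusive stop
        else:
            lower = int(1 * limit)           # skip the first chunk
            upper = n_docs - 1               # exclusive stop: last doc skipped
        return lower, upper

    docs = list(range(10))                       # stand-in for task_docs
    print(chunk_bounds(2, 2, len(docs)))         # (4, 6) -> docs 4 and 5
    print(chunk_bounds(None, 2, len(docs)))      # (2, 9) -> docs 2..8
    print(list(itertools.islice(docs, 2, 9)))    # [2, 3, 4, 5, 6, 7, 8]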
@@ -459,3 +595,5 @@ def make_table(result_dict):
    # print(latex_writer.dumps())

    return md_writer.dumps()
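
End to end, the results dictionary produced by `full_evaluate` feeds straight into `make_table`, matching the upstream harness convention (a sketch; argument values are illustrative):

    results = full_evaluate(
        model="hf-causal",
        model_args="pretrained=gpt2",
        tasks=["hellaswag"],
    )
    print(make_table(results))  # Markdown summary, one row per task/metric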

