
Update evaluator_modified.py
Nahush26 authored Feb 6, 2024
1 parent f09a522 commit 4dfde6f
Showing 1 changed file with 141 additions and 3 deletions.
144 changes: 141 additions & 3 deletions evaluator_modified.py
@@ -147,6 +147,139 @@ def simple_evaluate_chunk(
    return results


@positional_deprecated
def full_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=0,
    batch_size=None,
    max_batch_size=None,
    device=None,
    no_cache=False,
    limit=None,
    bootstrap_iters=100000,
    description_dict=None,
    check_integrity=False,
    decontamination_ngrams_path=None,
    write_out=False,
    output_base_path=None,
    reduce=None,
):
"""Instantiate and evaluate a model on a list of tasks.
:param model: Union[str, LM]
Name of model, transformers.PreTrainedModel object, or LM object, see lm_eval.models.get_model
:param model_args: Optional[str]
String arguments for each model class, see LM.create_from_arg_string.
Ignored if `model` argument is a LM object.
:param tasks: list[Union[str, Task]]
List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
:param num_fewshot: int
Number of examples in few-shot context
:param batch_size: int or str, optional
Batch size for model
:param max_batch_size: int, optional
Maximal batch size to try with automatic batch size detection
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache
:param limit: int or float, optional
Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description`
:param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks
:param write_out: bool
If True, write details about prompts and logits to json for all tasks
:param output_base_path: str, optional
Directory to which detailed eval info will be written. Defaults to present working dir.
:return
Dictionary of results
"""
    random.seed(1234)
    np.random.seed(1234)

    assert tasks != [], "No tasks specified"

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.models.get_model(model).create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
                "device": device,
            },
        )
    elif isinstance(model, transformers.PreTrainedModel):
        lm = lm_eval.models.get_model("hf-causal")(
            pretrained=model,
            batch_size=batch_size,
            max_batch_size=max_batch_size,
        )
        no_cache = True
    else:
        assert isinstance(model, lm_eval.base.LM)
        lm = model

    if not no_cache:
        lm = lm_eval.base.CachingLM(
            lm,
            "lm_cache/"
            + (model if isinstance(model, str) else model.model.config._name_or_path)
            + "_"
            + model_args.replace("=", "-").replace(",", "_").replace("/", "-")
            + ".db",
        )

    task_dict = lm_eval.tasks.get_task_dict(tasks)

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        chunk_num=None,  # None selects the "remaining docs" branch added to evaluate() below
        num_fewshot=num_fewshot,
        limit=0.2,  # hard-coded to 20%; the `limit` argument is recorded in the config but not applied here
        bootstrap_iters=bootstrap_iters,
        description_dict=description_dict,
        decontamination_ngrams_path=decontamination_ngrams_path,
        write_out=write_out,
        output_base_path=output_base_path,
        reduce=reduce,
    )

    # add info about the model and few shot config
    model_name = None
    if isinstance(model, str):
        model_name = model
    elif isinstance(model, transformers.PreTrainedModel):
        model_name = "pretrained=" + model.config._name_or_path
    results["config"] = {
        "model": model_name,
        "model_args": model_args,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
        "batch_sizes": list(lm.batch_sizes.values())
        if hasattr(lm, "batch_sizes")
        else [],
        "device": device,
        "no_cache": no_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
        "description_dict": description_dict,
    }

    return results


decontaminate_suffix = "_decontaminate"
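
Before the diff continues, a minimal usage sketch of the new entry point; the import path, model string, and task names are illustrative assumptions, not part of this commit:

    from evaluator_modified import full_evaluate  # hypothetical import path

    # Evaluate a Hugging Face causal LM on two benchmark tasks. Because
    # full_evaluate calls evaluate() with chunk_num=None and limit=0.2,
    # roughly the last 80% of each task's shuffled documents are scored.
    results = full_evaluate(
        model="hf-causal",
        model_args="pretrained=gpt2",
        tasks=["hellaswag", "piqa"],
        num_fewshot=0,
        batch_size=8,
        device="cuda:0",
    )

The CachingLM branch above sanitizes `model_args` into a cache filename; a small illustration of that mapping, with hypothetical values:

    model = "hf-causal"
    model_args = "pretrained=EleutherAI/gpt-neo-125M,dtype=float16"
    cache_path = ("lm_cache/" + model + "_"
                  + model_args.replace("=", "-").replace(",", "_").replace("/", "-")
                  + ".db")
    print(cache_path)
    # lm_cache/hf-causal_pretrained-EleutherAI-gpt-neo-125M_dtype-float16.db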


@@ -263,9 +396,12 @@ def evaluate(
)
        if limit is not None:
            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
-
-            lower_bound = int(chunk_num*limit)
-            upper_bound = int(lower_bound + limit)
+            if(chunk_num is not None):
+                lower_bound = int(chunk_num*limit)
+                upper_bound = int(lower_bound + limit)
+            else:
+                lower_bound = int(1*limit)
+                upper_bound = len(task_docs) - 1
        print(f"From index {lower_bound} to index {upper_bound}")
        for doc_id, doc in enumerate(itertools.islice(task_docs, lower_bound, upper_bound)):
            if decontaminate and task.should_decontaminate():
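
A standalone sketch of the new bounds logic, assuming `limit` has already been resolved to an integer chunk size. Note that `itertools.islice` treats its stop argument as exclusive, so in the `chunk_num is None` branch (`upper_bound = len(task_docs) - 1`) the final document is never scored:

    import itertools

    # Hypothetical helper mirroring the branch added above.
    def chunk_bounds(chunk_num, limit, n_docs):
        if chunk_num is not None:
            lower = int(chunk_num * limit)   # start of the requested chunk
            upper = int(lower + limit)       # exclusive stop
        else:
            lower = int(1 * limit)           # skip the first chunk
            upper = n_docs - 1               # exclusive stop: last doc skipped
        return lower, upper

    docs = list(range(10))                       # stand-in for task_docs
    print(chunk_bounds(2, 2, len(docs)))         # (4, 6) -> docs 4 and 5
    print(chunk_bounds(None, 2, len(docs)))      # (2, 9) -> docs 2..8
    print(list(itertools.islice(docs, 2, 9)))    # [2, 3, 4, 5, 6, 7, 8]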
@@ -459,3 +595,5 @@ def make_table(result_dict):
    # print(latex_writer.dumps())

    return md_writer.dumps()
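
End to end, the results dictionary produced by `full_evaluate` feeds straight into `make_table`, matching the upstream harness convention (a sketch; argument values are illustrative):

    results = full_evaluate(
        model="hf-causal",
        model_args="pretrained=gpt2",
        tasks=["hellaswag"],
    )
    print(make_table(results))  # Markdown summary, one row per task/metric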

