Source code for cerebras.modelzoo.trainer.extensions.bigcode.bigcode_eval_harness

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module provides a callback class to run BigCode's Evaluation Harness.
"""

import inspect
import json
import os
from collections import defaultdict
from copy import deepcopy
from dataclasses import asdict, dataclass
from functools import cached_property
from math import ceil
from typing import Any, Dict, List, Optional, Tuple, Union
from warnings import warn

from bigcode_eval import tasks as bigcode_tasks
from bigcode_eval.base import Task
from bigcode_eval.evaluator import Evaluator
from bigcode_eval.utils import update_code_gens
from lm_eval.utils import pattern_match

import cerebras.pytorch as cstorch
from cerebras.appliance.environment import appliance_environ
from cerebras.appliance.log import ClassLogger, named_class_logger
from cerebras.modelzoo.data.nlp.gpt.InferenceDataProcessor import (
    RequestType,
    tokenize_stop_words,
)
from cerebras.modelzoo.trainer import Trainer
from cerebras.modelzoo.trainer.callbacks import (
    Callback,
    ValidationCallback,
    ValidationLoop,
)
from cerebras.modelzoo.trainer.callbacks.flags import _ScopedFlags
from cerebras.modelzoo.trainer.extensions.eval_harness_adapter import (
    CSEvalHarnessAdapter,
    EvalHarnessProgress,
)


@dataclass
class BigCodeCLIArgs:
    r"""Captures BigCode EH's CLI arguments with defaults.

    Fields:
        prefix: Prefix to add to the prompt. For example, InCoder needs
            prefix='<| file ext=.py |>\n'
        do_sample: Sample from the language model's output distribution.
        temperature: Sampling temperature used for generation.
        top_k: Top-k parameter used for generation.
        top_p: Top-p parameter used for nucleus sampling.
        n_samples: Number of completions to generate for each sample.
        seed: Random seed used for evaluation.
        tasks: List of tasks to evaluate.
        instruction_tokens: A series of instruction tokens used for
            instruction-tuning benchmarks, separated by commas, e.g.
            <user_message>,<end_user_message>,<assistant_message>
        max_length_generation: Maximum length of the generated sequence
            (prompt + generation).
        limit: Number of samples to solve and evaluate from the benchmark.
        limit_start: Optional offset to start from when limiting the number
            of samples.
        save_every_k_tasks: Optional saving after every k tasks.
        postprocess: Postprocess model outputs before execution; always on
            except during generation tests.
        allow_code_execution: Allow code evaluation to execute external/untrusted
            Python code on your machine.
        generation_only: Do code generation but no evaluation.
        load_generations_path: Path of a file with previously generated solutions.
            If provided, generation is skipped and only evaluation is done.
        load_data_path: Path of additional data to load for the tasks.
        metric_output_path: Path to save the results.
        save_generations: Whether to save code generations.
        load_generations_intermediate_paths: List of paths for saving the
            intermediate code generations.
        save_generations_path: Path for saving the code generations.
        save_references: Whether to save reference solutions/tests.
        save_references_path: Path for saving the reference solutions/tests.
        prompt: Prompt type to use for generation in HumanEvalPack tasks.
        check_references: Don't run generation but benchmark the ground truth
            (useful for debugging).
    """

    # BigCode `EvalArguments` dataclass fields injected into the CLI
    prefix: str = ""
    do_sample: bool = True
    temperature: Optional[float] = None
    top_k: Optional[int] = None
    top_p: Optional[float] = None
    n_samples: int = 1
    seed: int = 0

    # Other BigCode CLI arguments
    tasks: Optional[Union[str, List[str]]] = None
    instruction_tokens: Optional[str] = None
    max_length_generation: int = 512
    limit: Optional[int] = None
    limit_start: int = 0
    save_every_k_tasks: int = -1
    postprocess: bool = True
    allow_code_execution: bool = False
    generation_only: bool = True  # We only run this flow by default
    load_generations_path: Optional[str] = None
    load_data_path: Optional[str] = None
    metric_output_path: str = "evaluation_results.json"
    save_generations: bool = (
        True  # We always save for the separate code execution flow
    )
    load_generations_intermediate_paths: Optional[List[str]] = None
    save_generations_path: str = "generations.json"
    save_references: bool = True
    save_references_path: str = "references.json"
    prompt: str = "prompt"
    check_references: bool = False
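
# Illustrative sketch (not part of the original module): constructing the CLI
# args above for a generation-only run of a single task. The task name
# "humaneval" is an assumption; any key from bigcode_eval's TASK_REGISTRY works.
#
#     args = BigCodeCLIArgs(
#         tasks="humaneval",
#         n_samples=1,
#         max_length_generation=512,
#         temperature=0.2,
#     )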
@named_class_logger("BigCodeEvalHarnessRunner") class BigCodeEvalHarnessRunner(ClassLogger): """Util class for invoking BigCode's run script with CSX-specific components.""" def __init__( self, bigcode_args: BigCodeCLIArgs, ): """Constructs a `BigCodeEvalHarnessRunner` instance. Args: bigcode_args: `BigCodeCLIArgs` dataclass object capturing BCEH's CLI args """ super().__init__() self.args = deepcopy(bigcode_args) # Validate user-specified tasks if not self.task_names: raise ValueError( f"Task not found: {self.args.tasks}.\n" f"Available tasks: {','.join(bigcode_tasks.TASK_REGISTRY.keys())}" ) elif len(self.task_names) > 1: raise RuntimeError( "Running multiple generative big code eval harness tasks is not supported. " "Please specify only a single task at a time." ) @cached_property def task_names(self) -> List[str]: """Returns the task names list for the specified tasks.""" if self.args.tasks is None: raise ValueError( "Need to specify a bigcode task to evaluate.\n" f"Available tasks: {','.join(bigcode_tasks.TASK_REGISTRY.keys())}" ) else: return pattern_match( self.args.tasks.split(","), bigcode_tasks.ALL_TASKS ) def evaluate(self, trainer: Trainer, evaluator: Evaluator) -> None: # pylint: disable=line-too-long """Invoke's logic from BigCode's run script on the `bigcode evaluator <bigcode_evaluator>`_. .. bigcode_evaluator: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e/main.py#L372 Args: trainer: Trainer object evaluator: The evaluator object (subclass of BigCode's Evaluator class) """ self.logger.info( f"Starting BigCode evaluation harness on selected tasks: {self.task_names}" ) load_generations_intermediate_paths = ( self.args.load_generations_intermediate_paths ) if load_generations_intermediate_paths and len( load_generations_intermediate_paths ) != len(self.task_names): raise ValueError( "If passing --load_generations_intermediate_paths, " "must pass equal number of files as number of tasks" ) results = {} for idx, task in enumerate(self.task_names): if self.args.load_generations_path: raise RuntimeError( "Code evaluation mode is not yet supported. " "Please specify `--generation_only` flag to run " "bigcode's generation flow on CSX." ) elif self.args.generation_only: self.logger.info("Running with generation-only mode") intermediate_generations = None if load_generations_intermediate_paths: with open( load_generations_intermediate_paths[idx], "r" ) as f_in: # intermediate_generations: list[list[str | None]] of len n_tasks # where list[i] = generated codes or empty intermediate_generations = json.load(f_in) generations, references = evaluator.generate_text( task, intermediate_generations=intermediate_generations ) save_generations_path = os.path.splitext( self.args.save_generations_path )[0] save_generations_path = ( f"{save_generations_path}_{task}_{trainer.global_step}.json" ) save_references_path = os.path.splitext( self.args.save_references_path )[0] save_references_path = ( f"{save_references_path}_{task}_{trainer.global_step}.json" ) evaluator.save_json_files( generations, references, save_generations_path, save_references_path, ) else: raise RuntimeError( f"Code evaluation mode is not yet supported. " "Please specify `--generation_only` flag to run " "bigcode's generation flow on CSX." 
                )

        # Save all args to config
        results["config"] = asdict(self.args)
        if not self.args.generation_only:
            dumped = json.dumps(results, indent=2)
            self.logger.info(dumped)

            with open(self.args.metric_output_path, "w") as f:
                f.write(dumped)


class BigCodeEvaluator(CSEvalHarnessAdapter, Evaluator):
    """
    Subclasses BigCode's `Evaluator` base class, overriding
    the `generate_text` method.
    """

    def __init__(
        self,
        trainer,
        bigcode_args: BigCodeCLIArgs,
        dataloader_args: Dict[str, Any],
    ):
        """
        Args:
            trainer: Trainer object
            bigcode_args: `BigCodeCLIArgs` dataclass object capturing BCEH's CLI args
            dataloader_args: Dict of dataloader args.
        """
        self.args: BigCodeCLIArgs
        self.dataloader_args: Dict[str, Any]

        Evaluator.__init__(self, None, None, None, args=bigcode_args)
        CSEvalHarnessAdapter.__init__(
            self, trainer=trainer, dataloader_args=dataloader_args
        )

    def evaluate(
        self,
        task_name: str,
        intermediate_generations: Optional[
            List[Optional[List[Optional[str]]]]
        ] = None,
    ):
        """Override of BCEH's Evaluator class' method.

        Note: The code evaluation flow is not yet supported.
        """
        raise NotImplementedError("Code evaluation flow is not yet supported.")

    def _construct_prompts(
        self,
        task: Any,
        dataset: Any,
        n_tasks: int,
        limit_start: int = 0,
        n_copies: int = 1,
        instruction_tokens: Optional[List[str]] = None,
    ) -> List[str]:
        """Helper from BigCode's implementation to preprocess the task dataset
        into a list of raw text samples.
        """

        def _make_infill_prompt(prefix, suffix, preprefix=""):
            """Make a prompt for infilling.

            Currently supported only for official InCoder and SantaCoder
            implementations.
            """
            model_id = self.tokenizer.name_or_path
            if model_id in ["facebook/incoder-1B", "facebook/incoder-6B"]:
                self.tokenizer.add_special_tokens({"pad_token": "<pad>"})
                return f"{preprefix}{prefix}<|mask:0|>{suffix}<|mask:0|>"
            elif model_id in ["bigcode/santacoder"]:
                return f"<fim-prefix>{preprefix}{prefix}<fim-suffix>{suffix}<fim-middle>"
            elif model_id in ["bigcode/starcoder", "bigcode/starcoderbase"]:
                return f"<fim_prefix>{preprefix}{prefix}<fim_suffix>{suffix}<fim_middle>"
            else:
                raise ValueError(f"Infilling not yet supported for: {model_id}")

        def _make_instruction_prompt(instruction, context, prefix=""):
            """Make a prompt for instruction-tuning. Delimit instruction and
            context with specific tokens if provided.
            """
            if not instruction_tokens:
                warn(
                    "Instruction-tuning tokens are not provided for an "
                    "instruction-tuning task, we will leave them empty."
                )
                user_token, end_token, assistant_token = "", "", "\n"
            else:
                user_token, end_token, assistant_token = instruction_tokens
                if not user_token or not assistant_token or not end_token:
                    warn(
                        "Instruction-tuning tokens provided but one or more are empty. "
                        "Ignore this warning if that was intended."
                    )
            return (
                prefix
                + user_token
                + instruction
                + end_token
                + assistant_token
                + context
            )

        prompts = []
        infill = False
        instruction = False
        mixed_error_log = (
            "Mixing tasks with infill/instruction "
            "and completion prompts is not supported."
        )
        for sample in range(limit_start, limit_start + n_tasks):
            prompt_contents = task.get_prompt(dataset[sample])
            if isinstance(prompt_contents, str):
                # Normal code completion mode
                if infill:
                    raise ValueError(mixed_error_log)
                instruction = True
                prompt = self.args.prefix + prompt_contents
            elif isinstance(prompt_contents, dict):
                if instruction:
                    raise ValueError(mixed_error_log)
                infill = True
                if set(prompt_contents.keys()) == {"prefix", "suffix"}:
                    # Infilling mode (currently supported only for official
                    # InCoder and SantaCoder implementations)
                    prompt = _make_infill_prompt(
                        **prompt_contents, preprefix=self.args.prefix
                    )
                elif set(prompt_contents.keys()) == {"instruction", "context"}:
                    # Instruction-tuning mode
                    prompt = _make_instruction_prompt(
                        **prompt_contents, prefix=self.args.prefix
                    )
            else:
                raise ValueError(
                    f"Unsupported prompt format: {type(prompt_contents)}"
                )
            prompts.append(prompt)
        return prompts

    def generate_on_csx(
        self,
        task: Any,
        prompts: List[str],
        gen_kwargs: Dict[str, Any],
        n_tasks: int,
        limit_start: int = 0,
        intermediate_generations: Optional[
            List[Optional[List[Optional[str]]]]
        ] = None,
        instruction_tokens: Optional[str] = None,
    ) -> List[List[Optional[str]]]:
        """Generate code samples on CSX from the given prompts.

        Args:
            task: Code evaluation task object
            prompts: List of raw text prompts as processed by BCEH's script
            gen_kwargs: Dict specifying settings for generative inference
            n_tasks: Number of data samples
            limit_start: Offset to limit the number of samples. Defaults to 0.
            intermediate_generations: List of previously loaded generations.
                Defaults to None.
            instruction_tokens: List of instruction tokens used for
                instruction-tuning benchmarks.

        Returns:
            List of generated code samples
        """
        (
            tokenizer,
            samples_file_list,
            dataset_size,
            metadata,
        ) = self.preprocess_dataset(
            prompts, request_type=RequestType.bigcode_eh
        )

        stop_sequences = tokenize_stop_words(
            stop_words=gen_kwargs.pop("stopping_criteria", []),
            tokenizer=tokenizer,
        )
        stop_sequences.append([self.dataloader_args["eos_id"]])  # Handle EOS ID

        # Keep track of the list of generated codes, where
        # len(code_gens) = n_tasks and
        # len(code_gens[0]) = number of generated code samples
        code_gens: List[List[Optional[str]]] = [[] for _ in range(n_tasks)]
        generations = (
            [] if not intermediate_generations else intermediate_generations
        )

        # Generate tokens on appliance
        with GenerateTokens(metadata, stop_sequences, gen_kwargs) as gen:
            self.trainer.validate(
                val_dataloader=cstorch.utils.data.DataLoader(
                    self.input_fn,
                    self.dataloader_args,
                    samples_file_list,
                    dataset_size,
                    RequestType.bigcode_eh.value,
                ),
                loop=BigCodeEvalHarnessLoop(),
                ckpt_path=None,
            )

        self.logger.debug(f"Output results: {gen.gen_token_dict}")

        code_gens = update_code_gens(
            task,
            tokenizer,
            limit_start,
            self.args.prefix,
            instruction_tokens,
            self.args.postprocess,
            code_gens,
            gen.gen_token_dict,
        )

        generations.extend(code_gens)
        return generations

    def generate_text(
        self,
        task_name: str,
        intermediate_generations: Optional[
            List[Optional[List[Optional[str]]]]
        ] = None,
    ) -> Tuple[List[List[str]], List[str]]:
        """Override of BCEH's Evaluator class' method.

        Args:
            task_name: Name of the BigCode task to evaluate
            intermediate_generations: List of intermediate generations, if loaded

        Returns:
            Tuple of the list of generated code samples and the list of references
        """
        task: Task = bigcode_tasks.get_task(task_name, self.args)
        if (
            hasattr(task, "max_length_multiplier")
            and task.max_length_multiplier
        ):
            raise RuntimeError(
                f"BigCode task {task_name} specifies a max_length_multiplier "
                f"stopping criterion, which is currently not supported. Please "
                f"choose a different task."
            )

        dataset = task.get_dataset()
        # if args.limit is None, use all samples
        # if args.limit is used, make sure args.limit_start + args.limit <= len(dataset)
        n_tasks = (
            min(self.args.limit, len(dataset) - self.args.limit_start)
            if self.args.limit
            else len(dataset)
        )
        # when args.limit is None,
        # adjust n_tasks by args.limit_start to prevent out of bounds issues
        if not self.args.limit:
            n_tasks -= self.args.limit_start
        references = [
            task.get_reference(dataset[i])
            for i in range(
                self.args.limit_start, self.args.limit_start + n_tasks
            )
        ]

        if self.args.check_references:
            if (
                "get_solution"
                in inspect.signature(task.get_reference).parameters
            ):
                solutions = [
                    [task.get_reference(dataset[i], get_solution=True)]
                    for i in range(
                        self.args.limit_start,
                        self.args.limit_start + n_tasks,
                    )
                ]
            else:
                solutions = [[ref] for ref in references]
            return solutions, references

        curr_generations = []  # list[list[str | None] | None]
        if intermediate_generations:
            curr_generations = [gen for gen in intermediate_generations if gen]
            n_tasks -= len(curr_generations)
        curr_sample_idx = len(curr_generations)

        self.logger.info(f"Number of problems for this task is {n_tasks}")
        n_copies = ceil(self.args.n_samples / self.batch_size)

        limit_start = self.args.limit_start + curr_sample_idx
        if self.args.instruction_tokens:
            instruction_tokens = self.args.instruction_tokens.split(",")
            if len(instruction_tokens) != 3:
                raise ValueError(
                    "Instruction tokens should contain exactly 3 tokens "
                    "separated by a comma. If a token is empty, represent it as ''"
                )
            for token in instruction_tokens:
                if token.strip() != "":
                    task.stop_words.append(token)
        else:
            instruction_tokens = None

        # Set up generation settings
        gen_kwargs = {
            "do_sample": self.args.do_sample,
            "temperature": self.args.temperature,
            "top_p": self.args.top_p,
            "top_k": self.args.top_k,
            "max_tokens": self.args.max_length_generation,
        }
        stopping_criteria = []
        if task.stop_words:
            for stop_word in task.stop_words:
                stopping_criteria.append(stop_word)
        if stopping_criteria:
            gen_kwargs["stopping_criteria"] = stopping_criteria

        # Fetch list of prompts
        prompts = self._construct_prompts(
            task,
            dataset,
            n_tasks=n_tasks,
            limit_start=limit_start,
            n_copies=n_copies,
            instruction_tokens=instruction_tokens,
        )

        # Generate tokens on CSX for the given prompts data
        generations = self.generate_on_csx(
            task,
            prompts,
            gen_kwargs=gen_kwargs,
            intermediate_generations=curr_generations,
            n_tasks=n_tasks,
            limit_start=limit_start,
            instruction_tokens=instruction_tokens,
        )

        if len(generations[0]) > self.args.n_samples:
            generations = [g[: self.args.n_samples] for g in generations]
            warn(
                f"Number of tasks wasn't proportional to number of devices, we "
                f"removed extra predictions to only keep nsamples={self.args.n_samples}"
            )

        return generations, references


class BigCodeEvalHarnessLoop(ValidationLoop):
    """Subclass of `ValidationLoop` to run BigCode's Evaluation Harness."""

    def __init__(self):
        """Initializes the BigCodeEvalHarnessLoop object."""
        super().__init__(hook="bigcode_eval_harness")

    def on_bigcode_eval_harness_start(
        self, trainer, model, val_dataloader, loop
    ):
        """
        Run ValidationLoop's `on_validate_start` method to ensure that
        eval_steps is being computed correctly.
        """
        model.eval()
        self.on_validate_start(trainer, model, val_dataloader, loop)


class GenerateTokens(Callback):
    """
    Callback class to post-process model output tokens.
""" def __init__( self, metadata: List[Tuple[int, int]], stop_sequences: List[List[int]], gen_kwargs: Dict[str, Any], ): """ Args: metadata: List of tuples of (sample idx, prompt encoding length) for each sample in the batch stop_sequences: List of stop token sequences for stopping generation gen_kwargs: Dict specifying settings for generative inference. """ self.metadata = metadata self.start_token = None self.sample_idx = 0 self.gen_token_dict = defaultdict( list ) # dict of list of generated tokens # Generation settings self.stop_sequences = stop_sequences self.temperature = gen_kwargs.get("temperature") self.top_p = gen_kwargs.get("top_p") self.top_k = gen_kwargs.get("top_k") self.max_tokens = gen_kwargs.get("max_tokens") self.progress = EvalHarnessProgress("BigCode Generative Eval") def on_bigcode_eval_harness_start( self, trainer, model, val_dataloader, loop ): """Runs before the BigCode Evaluation Harness starts.""" self.start_token = getattr(model, "start_token", None) if self.start_token is None: raise RuntimeError( "No start token specified under `model.start_token`. " "Please specify a start token for generative tasks." ) model.stop_sequences = self.stop_sequences if self.max_tokens is not None: model.max_tokens = self.max_tokens if self.temperature is not None: model.temperature = self.temperature if self.top_p is not None: model.top_p = self.top_p if self.top_k is not None: model.top_k = self.top_k def on_bigcode_eval_harness_batch_end( self, trainer, model, outputs, batch, batch_idx ): """Runs after every batch is processed.""" self.progress.print(trainer, batch_idx) def on_before_forward(self, trainer, model, batch, args, kwargs): kwargs["autoregressive"] = True def on_after_forward(self, trainer, model, outputs, batch): self.post_process(predictions=outputs["output"]) @cstorch.step_closure def post_process(self, predictions): """ Post-processes the model generated output tokens. Args: predictions: Tensor of shape (batch_size, max_seq_len) containing the model's predictions """ for gen_tokens in predictions: if not self.metadata[self.sample_idx]: continue sample_idx, _ = self.metadata[self.sample_idx] assert sample_idx == self.sample_idx, "Mismatching sample indices" # Grab generation tokens try: start_token_idx = gen_tokens.tolist().index(self.start_token) gen_tokens = gen_tokens[:start_token_idx].numpy() except ValueError: # Generated string spans msl pass self.gen_token_dict[sample_idx].append(gen_tokens) self.sample_idx += 1


class BigCodeEvalHarness(ValidationCallback):
    """
    ValidationCallback class to run BigCode's Evaluation Harness.
    """

    id = 0

    def __init__(
        self,
        # BigCode Args
        bigcode_args: Union[BigCodeCLIArgs, Dict[str, Any]],
        # Cerebras-specific args
        keep_data_dir: bool = False,
        every_n_vals: int = 1,
        flags: Optional[dict] = None,
        name_scope: Optional[str] = None,
        # Data Args
        batch_size: Optional[int] = None,
        data_dir: Optional[str] = None,
        max_sequence_length: Optional[int] = None,
        tokenizer_file_path: Optional[str] = None,
        eos_id: Optional[int] = None,
        **dataloader_args,
    ):
        """
        Args:
            bigcode_args: `BigCodeCLIArgs` dataclass or dict capturing BCEH's CLI args
            keep_data_dir: Specifies whether dumped data samples should be kept for
                reuse. Defaults to False, i.e. data samples are deleted after the run.
            every_n_vals: Run the BigCode eval harness script every N validations.
                e.g. if eval_frequency is set to 200 and N=2, then the BigCode eval
                harness runs every 400 training steps. The BigCode eval harness
                script will also always run after the final training iteration.
            flags: An optional dictionary of scoped global flags to set during
                the BigCode eval harness run.
            name_scope: An optional string that gets added to the trainer's name scope.
            batch_size: Batch size used by BigCodeEvalHarness to preprocess input
                data samples from the specified eval harness tasks.
            data_dir: Path to the data directory
            max_sequence_length: Maximum sequence length
            tokenizer_file_path: Path to the tokenizer file
            eos_id: End of sentence token id
            dataloader_args: Any additional dataloader args, e.g. num_workers.
        """
        # Handle parsing when the trainer is created from a YAML config
        if isinstance(bigcode_args, dict):
            self.bigcode_args = BigCodeCLIArgs(**bigcode_args)
        else:
            self.bigcode_args = bigcode_args

        self.bceh_runner = BigCodeEvalHarnessRunner(
            bigcode_args=self.bigcode_args
        )

        self.dataloader_args = dict(
            batch_size=batch_size,
            data_dir=os.path.realpath(data_dir),
            keep_data_dir=keep_data_dir,
            max_sequence_length=max_sequence_length,
            tokenizer_file_path=tokenizer_file_path,
            eos_id=eos_id,
            **dataloader_args,
        )

        # Removes annoying logs relating to process forking
        appliance_environ["TOKENIZERS_PARALLELISM"] = "false"

        self.every_n_vals = every_n_vals

        self.scoped_flags = ScopedBigCodeEvalHarnessFlags(**(flags or {}))

        self._id = BigCodeEvalHarness.id
        BigCodeEvalHarness.id += 1

        if name_scope is None:
            name_scope = f"bigcode_{self._id}"
        self.name_scope = name_scope

    def run(self, trainer):
        """Run BigCode Eval Harness.

        Args:
            trainer: the Trainer object
        """
        trainer.logger.info("Running BigCode Eval Harness")

        # If no absolute file paths for output dumps are provided, dump inside model_dir
        if not os.path.isabs(self.bceh_runner.args.save_generations_path):
            self.bceh_runner.args.save_generations_path = os.path.join(
                trainer.summary_dir,
                trainer.name_scope_path,
                self.bceh_runner.args.save_generations_path,
            )
            os.makedirs(
                os.path.dirname(self.bceh_runner.args.save_generations_path),
                exist_ok=True,
            )
        if not os.path.isabs(self.bceh_runner.args.save_references_path):
            self.bceh_runner.args.save_references_path = os.path.join(
                trainer.summary_dir,
                trainer.name_scope_path,
                self.bceh_runner.args.save_references_path,
            )
            os.makedirs(
                os.path.dirname(self.bceh_runner.args.save_references_path),
                exist_ok=True,
            )

        bc_evaluator = BigCodeEvaluator(
            trainer,
            deepcopy(self.bigcode_args),
            deepcopy(self.dataloader_args),
        )

        with self.scoped_flags:
            self.bceh_runner.evaluate(trainer=trainer, evaluator=bc_evaluator)

    def run_validation(self, trainer, loop_idx, is_last):
        if not is_last and (loop_idx + 1) % self.every_n_vals != 0:
            return

        with trainer.name_scope(self.name_scope):
            self.run(trainer)


class ScopedBigCodeEvalHarnessFlags(_ScopedFlags):
    """
    Class to set and restore global flags during the BigCode Evaluation
    Harness run.
    """

    def on_bigcode_eval_harness_start(
        self, trainer, model, val_dataloader, loop
    ):
        """Sets the global flags before the BigCode Evaluation Harness run."""
        self._set_all_flags()

    def on_bigcode_eval_harness_end(self, trainer, model, loop):
        """Restores the global flags after the BigCode Evaluation Harness run."""
        self._restore_all_flags()
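
# Illustrative usage sketch (not part of the original module). The dataloader
# keyword arguments mirror `BigCodeEvalHarness.__init__` above; how the callback
# is attached to the Trainer (a Python `callbacks` list or the callbacks section
# of a YAML trainer config) is an assumption, not an exact recipe.
#
#     harness = BigCodeEvalHarness(
#         bigcode_args=BigCodeCLIArgs(tasks="humaneval", n_samples=1),
#         batch_size=4,
#         data_dir="./bigcode_eh_data",
#         max_sequence_length=2048,
#         tokenizer_file_path="/path/to/tokenizer.json",
#         eos_id=0,
#     )
#
# Once registered with the trainer, the callback runs BigCode generation every
# `every_n_vals` validations and dumps generations/references JSON files under
# the trainer's summary directory unless absolute output paths are provided.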