Source code for cerebras.modelzoo.common.mlops

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from contextlib import contextmanager


[docs]def check_presence_of_wandb_dir(rundir): # Ensure that the wandb directory is not already present or empty wandb_dir = os.path.join(rundir, "wandb") if os.path.exists(wandb_dir): # Ensure there are no run-* folders in the wandb directory. run_dirs = [ d for d in os.listdir(wandb_dir) if os.path.isdir(os.path.join(wandb_dir, d)) and d.startswith('run-') ] if run_dirs: raise FileExistsError( f"A previous run seems to already exist in {wandb_dir}. " "Please specify a different 'model_dir'." )
[docs]@contextmanager def mlops_run(params): if "wandb" not in params: yield None return try: import wandb except ImportError: raise RuntimeError( "wandb is an optional dependency of modelzoo. " "In order to use it, 'pip install wandb==0.16.2' into this venv" ) runconfig = params.get("runconfig") wandb_config = params["wandb"] wandb_project = wandb_config.get("project", None) group = wandb_config.get("group", None) run_id = wandb_config.get("id", None) run_name = wandb_config.get("name", None) job_type = wandb_config.get("job_type", runconfig["mode"]) tags = wandb_config.get("tags", None) resume = wandb_config.get("resume", "auto") rundir = runconfig.get("model_dir", "./model_dir") previous_run_id = None import glob run_files = glob.glob(os.path.join(rundir, "wandb", "run-*")) if run_files: previous_run_id = run_files[0].split('-')[-1] if resume == "never": if ( run_id is not None and previous_run_id is not None and run_id == previous_run_id ): raise ValueError( f"The specified run_id ({run_id}) matches with a previous_run_id ({previous_run_id}) " "but 'never' mode requires them to be different." ) check_presence_of_wandb_dir(rundir) elif resume in ["allow", "auto"]: if run_id is not None and previous_run_id is not None: if run_id == previous_run_id: # Log into this previous run as it's the same run pass else: ## Raise an error if a wandb run already exists inside the specified run dir. check_presence_of_wandb_dir(rundir) elif previous_run_id is not None: # No new run ID provided, so default to the previous run ID run_id = previous_run_id elif resume == "must" and previous_run_id: if run_id is not None and run_id != previous_run_id: raise ValueError( f"The specified run_id ({run_id}) does not match previous_run_id ({previous_run_id}) " "but resume mode 'must' requires them to be the same." ) else: run_id = previous_run_id with wandb.init( dir=rundir, job_type=job_type, config=params, project=wandb_project, group=group, tags=tags, name=run_name, id=run_id, resume=resume, sync_tensorboard=True, ) as run: yield run