Source code for modelzoo.vision.pytorch.dit.input.scripts.create_imagenet_latents

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# isort: off
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../../"))
# isort: on

import argparse
import glob
import json
import logging
import os
import re
import shutil
from datetime import datetime

import numpy as np
import torch
import torch.distributed as dist
import yaml
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torchvision import datasets, transforms

import cerebras_pytorch as cstorch
from modelzoo.vision.pytorch.dit.layers.vae.VAEModel import AutoencoderKL
from modelzoo.vision.pytorch.input.transforms import (
    LambdaWithParam,
    resize_center_crop_pil_image,
)

"""
torchrun --nnodes 1 --nproc_per_node 1 modelzoo/vision/pytorch/dit/input/scripts/create_imagenet_latents.py --image_height 256 --image_width 256 --src_dir=/pathto/imagenet1k_ilsvrc2012 --dest_dir=<path to store created latent files> --log_steps=10 --dataset_split=val --checkpoint_path=<path to vae trained ckpt>
"""

# Layout of every log record: timestamp, level, "file:line" of the call site,
# then the message itself.
LOGFORMAT = "%(asctime)s %(levelname)-4s[%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=LOGFORMAT, level=logging.INFO)

# TF32 backend flags, chosen to follow
# https://github.com/facebookresearch/DiT/blob/main/train.py#L11-L13
# The matmul flag was False when this script was originally tested, but
# setting it to True makes A100 training a lot faster.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# Location of the pretrained Stable Diffusion VAE weights on the HuggingFace hub.
_VAE_CHECKPOINT = (
    "https://huggingface.co/stabilityai/sd-vae-ft-mse/"
    "blob/main/diffusion_pytorch_model.bin"
)


def get_parser_args(argv=None):
    """Build the command-line parser for this script and parse the arguments.

    Args:
        argv (list[str] | None): Argument strings to parse. When ``None``
            (the default) ``sys.argv[1:]`` is parsed, which is the behaviour
            of calling this function with no arguments.

    Returns:
        argparse.Namespace: Parsed arguments; attribute names match the
        option names without the leading dashes.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # --- VAE model ---------------------------------------------------------
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        required=False,
        default=None,
        help="Path to VAE model checkpoint",
    )
    parser.add_argument(
        "--params_path",
        type=str,
        required=False,
        default=os.path.abspath(
            os.path.join(
                os.path.dirname(__file__),
                "../../configs/params_dit_small_patchsize_2x2.yaml",
            )
        ),
        help="Path to VAE model params yaml",
    )

    # --- Image preprocessing -----------------------------------------------
    parser.add_argument(
        "--horizontal_flip",
        action="store_true",
        help="If passed, flip image horizontally",
    )
    parser.add_argument(
        "--image_height",
        type=int,
        required=True,
        help="Height of the resized image",
    )
    parser.add_argument(
        "--image_width",
        type=int,
        required=True,
        help="Width of the resized image",
    )

    # --- Data locations ----------------------------------------------------
    parser.add_argument(
        "--src_dir", type=str, required=True, help="source data location"
    )
    parser.add_argument(
        "--dest_dir", type=str, required=True, help="Latent data location"
    )

    # --- Resuming / logging ------------------------------------------------
    parser.add_argument(
        "--resume",
        action="store_true",
        help=(
            "If specified, resumes previous generation process. "
            "The dest_dir should point to previous generation and have "
            "log_checkpoint saved."
        ),
    )
    parser.add_argument(
        "--resume_ckpt",
        type=str,
        default=None,
        help=(
            "log ckpt to resume data generation from. "
            "If None, picks latest from log dir"
        ),
    )
    parser.add_argument(
        "--log_steps",
        type=int,
        default=1000,
        help="Generation process ckpt and logging frequency",
    )

    # --- Dataloader --------------------------------------------------------
    parser.add_argument(
        "--batch_size_per_gpu",
        type=int,
        required=False,
        default=64,
        help="batch size of input to be passed to VAE model for encoding",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        required=False,
        default=4,
        help="Number of pytorch dataloader workers",
    )
    parser.add_argument(
        "--dataset_split",
        type=str,
        required=False,
        default="train",
        choices=["train", "val"],
        # The original help text here was a copy of the --num_workers help.
        help="Dataset split to create latents for",
    )

    return parser.parse_args(argv)
class ImageNet(datasets.ImageNet):
    """torchvision ImageNet dataset whose samples are dicts that also carry
    the path of the source image file."""

    def __init__(self, root: str, split: str = "train", **kwargs):
        """Forward ``root``, ``split`` and any extra keyword arguments
        unchanged to ``datasets.ImageNet``."""
        super().__init__(root, split, **kwargs)

    def __getitem__(self, index):
        """Return sample ``index`` as a dict with keys ``image``, ``label``
        and ``path`` (first element of ``self.samples[index]``)."""
        image, label = super().__getitem__(index)
        sample = dict(image=image, label=label)
        sample["path"] = self.samples[index][0]
        return sample
class LatentImageNetProcessor:
    """Encode an ImageNet split with a VAE and store the outputs as ``.npz``.

    Every command-line argument becomes an instance attribute. ``run`` drives
    the whole process: it sets up ``torch.distributed``, optionally resumes
    from a previously written log checkpoint, encodes the images batch by
    batch and periodically writes a JSON log checkpoint from rank 0.
    """

    def __init__(self, args):
        """Copy the parsed arguments onto the instance and load the VAE params.

        Args:
            args (argparse.Namespace): Parsed command-line arguments. Must
                provide ``params_path``; the yaml file it points to must
                contain a ``model.vae`` section.
        """
        # Set class attributes from args
        self.args_dict = vars(args)
        for k, val in self.args_dict.items():
            setattr(self, k, val)

        # vae params
        with open(self.params_path, "r") as fh:
            self.vae_params = yaml.safe_load(fh)["model"]["vae"]

    def get_latest_latent_checkpoint(self, log_path):
        """
        Get the latest saved log checkpoint
        Args:
            log_path (str): Path to log dir
        Returns:
            latest_filepath (str): Path to the log ckpt with the highest
                sample count in its file name, or None if there is none
        """
        ckpt_pattern = re.compile(r'.*logckpt.*\.(\d+)\.json')
        candidates = []
        for filepath in glob.glob(f"{log_path}/logckpt*.json"):
            found = ckpt_pattern.match(filepath)
            if found is None:
                # File matches the glob but carries no ".<count>.json"
                # suffix, so it cannot be ranked; ignore it instead of
                # crashing on `None.groups()`.
                continue
            candidates.append((int(found.group(1)), filepath))

        if not candidates:
            return None
        # Highest sample count == most recent checkpoint.
        return max(candidates, key=lambda t: t[0])[1]

    def get_resume_data(self, latent_ckpt_path):
        """
        Get data from log ckpt to resume data generation process
        Args:
            latent_ckpt_path (str): Path to log ckpt
        Returns:
            resume_index (int): Index of sample to restart process
            resume_batches (int): Number of batches processed previously

        Both values are 0 when the checkpoint was written with settings that
        differ from the current ones (generation then restarts from scratch).
        """
        with open(latent_ckpt_path, "r") as fh:
            ckpt_data = json.load(fh)

        resume_index = ckpt_data["num_samples_processed"]
        resume_batches = ckpt_data["total_batches_processed"]

        # Check if process checkpoint saved can actually be used and data generation resumed
        current_settings = {
            "src_dir": self.src_dir,
            "image_height": self.image_height,
            "image_width": self.image_width,
            "dataset_split": self.dataset_split,
            "vae_params": self.vae_params,
            "checkpoint_path": self.checkpoint_path,
            "params_path": self.params_path,
        }
        for k, val in current_settings.items():
            # A key missing from the checkpoint counts as a mismatch.
            if k not in ckpt_data or ckpt_data[k] != val:
                logging.info(
                    f"{k} differs between input args passed and ckpt saved. Starting from the beginning"
                )
                # Reset both counters together; keeping the old batch count
                # with a zero index would corrupt later log checkpoints.
                return 0, 0

        return resume_index, resume_batches

    def _setup_folders(self):
        """
        Create data and log directories
        """
        split_dir = os.path.join(self.dest_dir, self.dataset_split)

        # Destination directory setup
        if os.path.exists(self.dest_dir):
            _str = f"\nFolder {self.dest_dir} exists, do you want to delete folder(Y) or not(N)? "
            # Keep asking until the answer is exactly "Y" or "N".
            while True:
                folder_act = input(_str)
                if folder_act == "N":
                    os.makedirs(split_dir, exist_ok=True)
                    break
                elif folder_act == "Y":
                    logging.info(f"Deleting {self.dest_dir} and creating again")
                    shutil.rmtree(self.dest_dir)
                    os.makedirs(self.dest_dir)
                    os.makedirs(split_dir)
                    break

        # Log setup
        log_path = os.path.join(self.dest_dir, "logs", self.dataset_split)
        os.makedirs(log_path, exist_ok=True)

    def _save_vae_params(self):
        """
        Save VAE model params to a timestamped yaml file inside `dest_dir`
        """
        # Local import: the file-level import only brings in `datetime`.
        from datetime import timezone

        # Same UTC timestamp as before, without the deprecated `utcnow()`.
        curr_time = datetime.now(timezone.utc).strftime('%m%d%Y_%H%M%S')
        with open(
            os.path.join(self.dest_dir, f"params_vae_{curr_time}.yaml"), "w"
        ) as fh:
            data = {"model": {"vae": self.vae_params}}
            yaml.dump(data, fh)

    def set_data_transforms(self, horizontal_flip, image_height, image_width):
        """
        Data transforms used for dataset creation
        Args:
            horizontal_flip (bool): If True, flip the image horizontally
            image_height (int): Height of resized image
            image_width (int): Width of resized image
        Returns:
            transform : torchvision.transforms composition to be applied to image
            target_transform : torchvision.transforms composition to be applied to target label
        """
        transform = [
            transforms.Lambda(
                lambda pil_image: resize_center_crop_pil_image(
                    pil_image,
                    image_height,
                    image_width,
                )
            ),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=False
            ),
        ]

        if horizontal_flip:
            # flip always (p=1.0); inserted right after the resize/crop step
            transform.insert(1, transforms.RandomHorizontalFlip(p=1.0))

        transform = transforms.Compose(transform)

        # Target label transform
        def _get_target_transform(x, *args, **kwargs):
            return np.int32(x)

        target_transform = LambdaWithParam(_get_target_transform)

        return transform, target_transform

    def create_dataloader(self, split):
        """
        Build ImageNet Dataloader
        Args:
            split (str): The dataset split, can be one of `train` or `val`
        Returns:
            dataloader: torch.utils.data.Dataloader object that
                reads from ImageNet dataset
        """
        transform, target_transform = self.set_data_transforms(
            self.horizontal_flip,
            self.image_height,
            self.image_width,
        )
        logging.info(
            f"The following transforms are used for image: {transform} \n"
        )
        logging.info(
            f"The following transforms are used for label: {target_transform}"
        )

        dataset = ImageNet(
            root=self.src_dir,
            split=split,
            transform=transform,
            target_transform=target_transform,
        )

        # Skip the samples a previous run already handled. `resume_index` is
        # set by `run`; default to 0 when this method is used on its own.
        first_index = getattr(self, "resume_index", 0)
        subdataset = torch.utils.data.Subset(
            dataset, indices=range(first_index, len(dataset))
        )

        # drop_last set to False inorder to preserve all samples.
        # Some samples maybe written twice on account of this since
        # Distributed Sampler pads the last incomplete batch
        # with samples from the beginning to make the data
        # evenly divisible across the replicas
        _sampler = DistributedSampler(
            dataset=subdataset, shuffle=False, drop_last=False
        )

        loader_kwargs = {
            "batch_size": self.batch_size_per_gpu,
            "num_workers": self.num_workers,
            "drop_last": False,
            "sampler": _sampler,
        }
        if self.num_workers > 0:
            # DataLoader rejects these two options when loading happens in
            # the main process (num_workers == 0), so only pass them when
            # worker processes are actually used.
            loader_kwargs["prefetch_factor"] = 10
            loader_kwargs["persistent_workers"] = True

        return DataLoader(subdataset, **loader_kwargs)

    def setup_dist(self):
        """Initialize the default process group with the NCCL backend."""
        dist.init_process_group("nccl")

    def cleanup_dist(self):
        """Destroy the default process group."""
        dist.destroy_process_group()

    def _latent_dest_path(self, src_path):
        """Map an image path under `src_dir` to its `.npz` path under `dest_dir`.

        Args:
            src_path (str): Path of a source image located under `src_dir`
        Returns:
            str: `dest_dir` joined with the path of the image relative to
                `src_dir`, with the file extension replaced by `.npz`
        Raises:
            ValueError: if `src_path` is not located under `src_dir`
        """
        # Expand "~" so that `src_dir` can be compared with dataset paths
        # built from an already expanded root (assumption: the dataset class
        # expands the user directory of its root - TODO confirm).
        src_root = os.path.expanduser(self.src_dir)
        rel_path = os.path.relpath(src_path, src_root)
        if rel_path == os.pardir or rel_path.startswith(os.pardir + os.sep):
            # Refuse to write outside of dest_dir (e.g. next to the sources).
            raise ValueError(
                f"Image path {src_path} is not located under src_dir {self.src_dir}"
            )
        return os.path.join(
            self.dest_dir, os.path.splitext(rel_path)[0] + ".npz"
        )

    def save_latent_tensors(self, vae_output, label, src_paths):
        """
        Save the output latent tensors from VAE encoder to npz file,
        one file per sample, mirroring the directory layout of `src_dir`
        inside `dest_dir`.
        Args:
            vae_output (torch.Tensor): Batch of VAE outputs; entry `i` is the
                concatenation of mean and logvar outputs for the image at
                `src_paths[i]`, per-sample
                shape=(2 * latent_size, latent_height, latent_width)
            label (torch.Tensor): Target labels of the images in the batch
            src_paths List[str]: Paths of the images in the batch
        """
        label = label.cpu().numpy()
        vae_output = vae_output.cpu().numpy()

        for i, sample_output in enumerate(vae_output):
            path = src_paths[i]
            dest_path = self._latent_dest_path(path)
            os.makedirs(os.path.dirname(dest_path), exist_ok=True)

            np.savez(
                dest_path,
                src_path=path,
                dest_path=dest_path,
                label=label[i],
                # includes concat of mean and logvar outputs from VAE
                vae_output=sample_output,
            )

    def save_logs(
        self, log_path, global_rank, iter_num, total_num_batches,
    ):
        """
        Save data generation log checkpoints to resume process later if needed.
        Args:
            log_path (str): Path to save log ckpt used for data generation resume
            global_rank (int): GPU global rank
            iter_num (int): Current iteration of dataloader on GPU with rank = `global_rank`
            total_num_batches (int): Total number of batches processed so far
                across all GPUs during the current data generation process
        """
        # Samples handled by earlier runs. Prefer the exact sample index the
        # run resumed from: the earlier batches may have used another batch
        # size, so "batches * current batch size" can be wrong for them.
        resumed_samples = getattr(self, "resume_index", None)
        if resumed_samples is None:
            resumed_samples = self.resume_num_batches * self.batch_size_per_gpu

        num_samples = (
            resumed_samples + total_num_batches * self.batch_size_per_gpu
        )
        total_num_batches += self.resume_num_batches

        logging.info(
            f"[GPU{global_rank}] | iteration num: {iter_num} | total_batches_processed across gpus @ bsz={self.batch_size_per_gpu}: {total_num_batches} | total_samples across gpus: {num_samples}"
        )

        log_dict = {
            "total_batches_processed": total_num_batches,
            "num_samples_processed": num_samples,
            "batch_size_per_gpu": self.batch_size_per_gpu,
            "image_height": self.image_height,
            "image_width": self.image_width,
            "dataset_split": self.dataset_split,
            "src_dir": self.src_dir,
            "dest_dir": self.dest_dir,
            "checkpoint_path": self.checkpoint_path,
            "params_path": self.params_path,
            "vae_params": self.vae_params,
        }
        with open(
            os.path.join(log_path, f"logckpt_numsamples.{num_samples}.json"),
            "w",
        ) as fh:
            json.dump(log_dict, fh)

    def run(self):
        """
        MAIN function: encode the configured ImageNet split and write the
        latents plus periodic log checkpoints. Expects to be launched with
        one process per GPU (reads `LOCAL_RANK` from the environment).
        """
        assert torch.cuda.is_available(), "Requires at least one GPU."

        # Set up dist process group
        self.setup_dist()

        # Get ranks and device
        local_rank = int(os.environ["LOCAL_RANK"])
        global_rank = dist.get_rank()
        device = global_rank % torch.cuda.device_count()
        world_size = dist.get_world_size()
        torch.cuda.set_device(device)

        log_path = os.path.join(self.dest_dir, "logs", self.dataset_split)

        # Only rank 0 touches the folder layout; the others wait at the barrier.
        if global_rank == 0:
            logging.info(f"Command line args passed: {self.args_dict} \n")
            logging.info(f"VAE model params: {self.vae_params} \n")
            self._setup_folders()
            self._save_vae_params()
        dist.barrier()

        # Resume index
        self.resume_index = 0
        self.resume_num_batches = 0
        if self.resume:
            if self.resume_ckpt is None:
                last_latent_ckpt = self.get_latest_latent_checkpoint(log_path)
            else:
                last_latent_ckpt = self.resume_ckpt

            logging.info(
                f"Resume set to True, restarting from {last_latent_ckpt}"
            )
            if last_latent_ckpt is not None:
                (
                    self.resume_index,
                    self.resume_num_batches,
                ) = self.get_resume_data(last_latent_ckpt)
        dist.barrier()

        logging.info(
            f"Starting [GPU{global_rank}] rank={global_rank}, world_size={world_size}, resume_index: {self.resume_index}"
        )

        # Initialize VAE Model, wrap with DDP
        self.vae_model = AutoencoderKL(**self.vae_params)

        # Load state dict for VAE
        if self.checkpoint_path:
            vae_state_dict = cstorch.load(self.checkpoint_path)
            self.vae_model.load_state_dict(vae_state_dict)
            logging.info(f"Initializing VAE model with {self.checkpoint_path}")
        else:
            logging.info("Initializing VAE model with RANDOM weights")

        self.vae_model.eval()
        model = DDP(self.vae_model.to(device), device_ids=[local_rank])

        # Initialize dataloader
        dataloader = self.create_dataloader(self.dataset_split)
        len_dataloader = len(dataloader)

        if len_dataloader == 0:
            logging.info("All examples written already, nothing to write")

        total_num_batches = 0
        for i, data in enumerate(dataloader):
            image = data["image"].to(device)
            label = data["label"].to(device)
            path = data["path"]

            # VAE module forward pass
            with torch.no_grad():
                latent = model.module.encode(image).latent_dist
                vae_output = latent.parameters

            self.save_latent_tensors(vae_output, label, path)

            # Every rank must have written its batch before it is counted.
            dist.barrier()
            total_num_batches += 1

            # Log on every `log_steps`-th iteration and on the last one.
            should_log = i % self.log_steps == 0 or i == len_dataloader - 1
            if should_log:
                # Collect overall batches processed
                total = torch.tensor(total_num_batches).to(device)
                dist.reduce(total, dst=0, op=dist.ReduceOp.SUM)

                # Save data generation checkpoints
                if global_rank == 0:
                    self.save_logs(
                        log_path, global_rank, i, total.cpu().item()
                    )

        dist.barrier()

        # Clean up dist processes
        self.cleanup_dist()
if __name__ == "__main__":
    # Script entry point: parse the command-line flags, build the processor
    # from them and run the latent generation.
    args = get_parser_args()
    processor = LatentImageNetProcessor(args)
    processor.run()