TensorFlow Estimator to CerebrasEstimator

This section presents a step-by-step walk-through showing how to convert your TensorFlow code from using TensorFlow Estimator to using CerebrasEstimator.
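The code snippets below assume the following imports. The Policy import path is an assumption (the Keras mixed-precision API moved between TensorFlow releases); adjust it to your TensorFlow version.

import numpy as np
import tensorflow as tf

# Keras mixed-precision policy used by the model functions below; in
# TF 1.14 it lives under the experimental namespace (assumption).
from tensorflow.keras.mixed_precision.experimental import Policy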

Assume that your TensorFlow code already uses TensorFlow Estimator and looks similar to the following:

def train_input_fn(batch_size):
  iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
  data = np.genfromtxt("./data/iris_training.csv", dtype=iris_dtype, delimiter=",")
  dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
  dataset = dataset.shuffle(1000).repeat(30).batch(batch_size)
  return dataset

def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):

  """ Two layer fully connected model """

  policy = Policy("infer_float32_vars") if params.get("mixed_precision", True) else None
  net = tf.keras.layers.Dense(256, activation=tf.nn.relu, dtype=policy)(features)
  tf.summary.tensor_summary(name="summary1", tensor=net)
  net = tf.keras.layers.Dense(128, activation=tf.nn.relu, dtype=policy)(net)
  tf.summary.tensor_summary(name="summary2", tensor=net)
  logits = tf.keras.layers.Dense(params["num_classes"], dtype=policy)(net)
  learning_rate = tf.constant(params["lr"])
  if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      # Labels are integer class indices, so use the sparse cross-entropy op.
      loss_op = tf.reduce_mean(
          tf.nn.sparse_softmax_cross_entropy_with_logits(
              labels=tf.squeeze(labels, axis=1), logits=logits
          )
      )
      train_op = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate
      ).minimize(loss_op, global_step=tf.train.get_global_step())
      spec = tf.estimator.EstimatorSpec(
          mode=mode,
          loss=loss_op,
          train_op=train_op,
          eval_metric_ops={"metric": tf.metrics.mean(values=logits)},
      )
      return spec

hook = tf.train.ProfilerHook(save_steps=100)
est = tf.estimator.Estimator(model_fn, params=params, model_dir="./out")
est.train(lambda: train_input_fn(batch_size), hooks=[hook], steps=None)

Step 1: Model function

Go through your code (model_fn and the train call) and remove all summaries, eval_metric_ops, and hooks: delete the tf.summary.tensor_summary calls, the eval_metric_ops argument to EstimatorSpec, and the ProfilerHook together with the hooks argument to train.

def train_input_fn(batch_size):
  iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
  data = np.genfromtxt("./data/iris_training.csv", dtype=iris_dtype, delimiter=",")
  dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
  dataset = dataset.shuffle(1000).repeat(30).batch(batch_size)
  return dataset


def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):

  """ Two layer fully connected model """

  policy = Policy("infer_float32_vars") if params.get("mixed_precision", True) else None
  net = tf.keras.layers.Dense(256, activation=tf.nn.relu, dtype=policy)(features)
  net = tf.keras.layers.Dense(128, activation=tf.nn.relu, dtype=policy)(net)
  logits = tf.keras.layers.Dense(params["num_classes"], dtype=policy)(net)
  learning_rate = tf.constant(params["lr"])

  if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      loss_op = tf.reduce_mean(
          tf.nn.sparse_softmax_cross_entropy_with_logits(
              labels=tf.squeeze(labels, axis=1), logits=logits
          )
      )
      train_op = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate
      ).minimize(loss_op, global_step=tf.train.get_global_step())
      spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
      return spec

est = tf.estimator.Estimator(model_fn, params=params, model_dir="./out")
est.train(lambda: train_input_fn(batch_size), steps=None)

Step 2: Input function

Ensure that the input_fn satisfies the conditions described in Input function differences. In particular, make sure that the following are true:

  • The only input into the function is params.

  • The batches it outputs have a fixed batch_size (batch with drop_remainder=True).

  • The dataset yields an unbounded stream of samples (call repeat() without a count).

  • Your train call uses the new input_fn and trains for the desired number of steps.

The modified code should now look something like this:

def train_input_fn(params):
  batch_size = params["batch_size"]
  iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
  data = np.genfromtxt("./data/iris_training.csv", dtype=iris_dtype, delimiter=",")
  dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
  dataset = dataset.shuffle(1000).repeat().batch(batch_size, drop_remainder=True)
  return dataset
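
As a quick sanity check outside the Estimator, you can pull one batch and confirm its static shape. This sketch assumes eager execution (tf.enable_eager_execution() under TF 1.x); in graph mode you would read a batch through an iterator and a session instead:

params = {"batch_size": 32}
for features, labels in train_input_fn(params).take(1):
  # drop_remainder=True guarantees these static shapes.
  print(features.shape)  # (32, 4)
  print(labels.shape)    # (32, 1)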


def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):

  """ Two layer fully connected model """
  policy = Policy("infer_float32_vars") if params.get("mixed_precision", True) else None
  net = tf.keras.layers.Dense(256, activation=tf.nn.relu, dtype=policy)(features)
  net = tf.keras.layers.Dense(128, activation=tf.nn.relu, dtype=policy)(net)
  logits = tf.keras.layers.Dense(params["num_classes"], dtype=policy)(net)
  learning_rate = tf.constant(params["lr"])
  if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      loss_op = tf.reduce_mean(
          tf.nn.sparse_softmax_cross_entropy_with_logits(
              labels=tf.squeeze(labels, axis=1), logits=logits
          )
      )
      train_op = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate
      ).minimize(loss_op, global_step=tf.train.get_global_step())
      spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
      return spec


est = tf.estimator.Estimator(model_fn, params=params, model_dir="./out")
est.train(train_input_fn, steps=10000)

Step 3: Use CerebrasEstimator

Next, replace tf.estimator.Estimator with CerebrasEstimator. Note the two additional arguments, config and use_cs; the CSRunConfig object passed as config is constructed in the next step.

def train_input_fn(params):
  batch_size = params["batch_size"]
  iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
  data = np.genfromtxt("./data/iris_training.csv", dtype=iris_dtype, delimiter=",")
  dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
  dataset = dataset.shuffle(1000).repeat().batch(batch_size, drop_remainder=True)
  return dataset


def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):

  """ Two layer fully connected model """
  policy = Policy("infer_float32_vars") if params.get("mixed_precision", True) else None
  net = tf.keras.layers.Dense(256, activation=tf.nn.relu, dtype=policy)(features)
  net = tf.keras.layers.Dense(128, activation=tf.nn.relu, dtype=policy)(net)
  logits = tf.keras.layers.Dense(params["num_classes"], dtype=policy)(net)
  learning_rate = tf.constant(params["lr"])
  if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      loss_op = tf.reduce_mean(
          tf.nn.sparse_softmax_cross_entropy_with_logits(
              labels=tf.squeeze(labels, axis=1), logits=logits
          )
      )
      train_op = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate
      ).minimize(loss_op, global_step=tf.train.get_global_step())
      spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
      return spec


est = CerebrasEstimator(
  model_fn, config=config, params=params, model_dir="./out", use_cs=True
)
est.train(train_input_fn, steps=10000)
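
Here use_cs=True requests execution on the CS system. If your CerebrasEstimator version also accepts use_cs=False for plain CPU or GPU execution (an assumption; confirm against your release's documentation), a convenient pattern is to make the flag configurable so the same script can be debugged off-system:

# Assumption: use_cs=False falls back to standard TensorFlow execution.
est = CerebrasEstimator(
  model_fn, config=config, params=params, model_dir="./out",
  use_cs=params.get("use_cs", True),
)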

Step 4: Edit RunConfig

Replace your RunConfig with CSRunConfig (adding one if you did not use a RunConfig before), and ensure that the three Cerebras-specific import statements are present.

from cerebras.models.common.estimator.tf.cs_estimator import CerebrasEstimator
from cerebras.models.common.estimator.tf.run_config import CSRunConfig
from cerebras.tf.cs_slurm_cluster_resolver import CSSlurmClusterResolver

config = CSRunConfig(cs_ip=ip, save_checkpoints_steps=1000, log_step_count_steps=10000)
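
Here ip is the address of the target CS system, as an "IP" or "IP:port" string. One way to supply it without hard-coding it (illustrative only; the CS_IP environment-variable name is an assumption, not a Cerebras convention):

import os

# Hypothetical source for the CS system address; any mechanism that
# yields the address string works here.
ip = os.environ["CS_IP"]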


def train_input_fn(params):
  batch_size = params["batch_size"]
  iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
  data = np.genfromtxt("./data/iris_training.csv", dtype=iris_dtype, delimiter=",")
  dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
  dataset = dataset.shuffle(1000).repeat().batch(batch_size, drop_remainder=True)
  return dataset


def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):

  """ Two layer fully connected model """
  policy = Policy("infer_float32_vars") if params.get("mixed_precision", True) else None
  net = tf.keras.layers.Dense(256, activation=tf.nn.relu, dtype=policy)(features)
  net = tf.keras.layers.Dense(128, activation=tf.nn.relu, dtype=policy)(net)
  logits = tf.keras.layers.Dense(params["num_classes"], dtype=policy)(net)
  learning_rate = tf.constant(params["lr"])
  if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      loss_op = tf.reduce_mean(
          tf.nn.sparse_softmax_cross_entropy_with_logits(
              labels=tf.squeeze(labels, axis=1), logits=logits
          )
      )
      train_op = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate
      ).minimize(loss_op, global_step=tf.train.get_global_step())
      spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
      return spec


est = CerebrasEstimator(
  model_fn, config=config, params=params, model_dir="./out", use_cs=True
)
est.train(input_fn, steps=10000)

Step 5: Ensure mixed precision

Finally, ensure that your model runs in mixed precision, using the tf.keras mixed-precision Policy.

def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):

  """ Model definition """

  if params.get("mixed_precision", True):
      policy = Policy("infer_float32_vars")
      tf.keras.backend.set_floatx("float16")
  else:
      policy = None

  # build_model should apply `policy` to its layers' dtype argument,
  # as the Dense layers did in the earlier steps.
  logits = build_model(features, params)
  learning_rate = tf.constant(params["lr"])

  if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      loss_op = tf.cast(
          tf.reduce_mean(
              tf.nn.sparse_softmax_cross_entropy_with_logits(
                  labels=tf.squeeze(labels, axis=1), logits=logits
              )
          ),
          dtype=tf.float16 if params.get("mixed_precision", True) else tf.float32,
      )

      train_op = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate
      ).minimize(loss_op, global_step=tf.train.get_global_step())

      spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
      return spec
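
Putting it all together, the driver code is unchanged from Step 4; only params gains the mixed_precision key read by model_fn above. The key name is this walk-through's convention, and the values below are illustrative:

params = {
  "batch_size": 32,
  "lr": 0.01,
  "num_classes": 3,         # the iris dataset has three classes
  "mixed_precision": True,  # consumed by model_fn above
}
est = CerebrasEstimator(
  model_fn, config=config, params=params, model_dir="./out", use_cs=True
)
est.train(train_input_fn, steps=10000)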