.. _example-walk-through-cs-estimator:

TensorFlow Estimator to CerebrasEstimator
=========================================

This section presents a step-by-step walk-through showing how to convert your TensorFlow code from using TensorFlow Estimator to using CerebrasEstimator.

Consider that your TensorFlow code already uses TensorFlow Estimator and looks similar to the following:

.. code-block:: python

    import numpy as np
    import tensorflow as tf

    # "Policy" below is the TF 1.14-era mixed-precision policy class.
    Policy = tf.keras.mixed_precision.experimental.Policy


    def train_input_fn(batch_size):
        iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
        data = np.genfromtxt(
            "./data/iris_training.csv", dtype=iris_dtype, delimiter=","
        )
        dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
        dataset = dataset.shuffle(1000).repeat(30).batch(batch_size)
        return dataset


    def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):
        """ Two layer fully connected model """
        mixed_precision = params.get("mixed_precision", True)
        policy = Policy("infer_float32_vars") if mixed_precision else None
        net = tf.keras.layers.Dense(256, activation=tf.nn.relu, dtype=policy)(features)
        tf.summary.tensor_summary(name="summary1", tensor=net)
        net = tf.keras.layers.Dense(128, activation=tf.nn.relu, dtype=policy)(net)
        tf.summary.tensor_summary(name="summary2", tensor=net)
        logits = tf.keras.layers.Dense(params["num_classes"], dtype=policy)(net)

        learning_rate = tf.constant(params["lr"])
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            loss_op = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
            )
            train_op = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate
            ).minimize(loss_op, global_step=tf.train.get_global_step())

        spec = tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss_op,
            train_op=train_op,
            eval_metric_ops={"metric": tf.metrics.mean(values=logits)},
        )
        return spec


    hook = tf.train.ProfilerHook(save_steps=100)
    est = tf.estimator.Estimator(model_fn, params=params, model_dir="./out")
    est.train(lambda: train_input_fn(batch_size), hooks=[hook], steps=None)

.. _step1-example-walk-through-cs-estimator:

Step 1: Model function
----------------------

Go through your code (``model_fn`` and the ``train`` call) and remove all summaries, eval metrics, and hooks.

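Concretely, the two ``tf.summary.tensor_summary`` calls go away, ``EstimatorSpec`` loses its ``eval_metric_ops`` argument, and the ``ProfilerHook`` is no longer created or passed to ``train``. A minimal sketch of just the changed lines, reusing the names defined above:

.. code-block:: python

    # EstimatorSpec without eval_metric_ops:
    spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)

    # train() without hooks:
    est.train(lambda: train_input_fn(batch_size), steps=None)

The full modified code then looks like this:
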
.. code-block:: python

    def train_input_fn(batch_size):
        iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
        data = np.genfromtxt(
            "./data/iris_training.csv", dtype=iris_dtype, delimiter=","
        )
        dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
        dataset = dataset.shuffle(1000).repeat(30).batch(batch_size)
        return dataset


    def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):
        """ Two layer fully connected model """
        mixed_precision = params.get("mixed_precision", True)
        policy = Policy("infer_float32_vars") if mixed_precision else None
        net = tf.keras.layers.Dense(256, activation=tf.nn.relu, dtype=policy)(features)
        net = tf.keras.layers.Dense(128, activation=tf.nn.relu, dtype=policy)(net)
        logits = tf.keras.layers.Dense(params["num_classes"], dtype=policy)(net)

        learning_rate = tf.constant(params["lr"])
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            loss_op = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
            )
            train_op = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate
            ).minimize(loss_op, global_step=tf.train.get_global_step())

        spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
        return spec


    est = tf.estimator.Estimator(model_fn, params=params, model_dir="./out")
    est.train(lambda: train_input_fn(batch_size), steps=None)

.. _step2-example-walk-through-cs-estimator:

Step 2: Input function
----------------------

Ensure that the ``input_fn`` satisfies the conditions listed in :ref:`cs-estimator-input-function`. In particular, make sure that the following are true:

- The only input to the function is ``params``.
- The batches it produces have a fixed ``batch_size`` (use ``drop_remainder=True``).
- The dataset yields an infinite stream of samples (using ``repeat``).

Also make sure that your ``train`` call uses the new ``input_fn`` and trains for the desired number of steps. The modified code should now look something like this:

.. code-block:: python

    def train_input_fn(params):
        batch_size = params["batch_size"]
        iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
        data = np.genfromtxt(
            "./data/iris_training.csv", dtype=iris_dtype, delimiter=","
        )
        dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
        dataset = dataset.shuffle(1000).repeat().batch(batch_size, drop_remainder=True)
        return dataset


    def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):
        """ Two layer fully connected model """
        mixed_precision = params.get("mixed_precision", True)
        policy = Policy("infer_float32_vars") if mixed_precision else None
        net = tf.keras.layers.Dense(256, activation=tf.nn.relu, dtype=policy)(features)
        net = tf.keras.layers.Dense(128, activation=tf.nn.relu, dtype=policy)(net)
        logits = tf.keras.layers.Dense(params["num_classes"], dtype=policy)(net)

        learning_rate = tf.constant(params["lr"])
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            loss_op = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
            )
            train_op = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate
            ).minimize(loss_op, global_step=tf.train.get_global_step())

        spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
        return spec


    est = tf.estimator.Estimator(model_fn, params=params, model_dir="./out")
    est.train(train_input_fn, steps=10000)

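Before moving on to the Cerebras-specific changes, it can help to sanity-check the new input function on CPU. The sketch below uses an illustrative batch size of 32 and TF 1.x graph-mode iteration; with ``drop_remainder=True`` the batch dimension of the resulting tensors is static rather than ``None``:

.. code-block:: python

    # Pull one batch out of the params-driven input function and inspect
    # its static shape.
    dataset = train_input_fn({"batch_size": 32})
    features, labels = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
    print(features.shape)  # (32, 4): fixed batch size, 4 features per sample
    print(labels.shape)    # (32, 1)
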
.. _step3-example-walk-through-cs-estimator:

Step 3: Use CerebrasEstimator
-----------------------------

Next, replace ``tf.estimator.Estimator`` with ``CerebrasEstimator``. (The ``config`` object passed below is constructed in Step 4.)

.. code-block:: python

    def train_input_fn(params):
        batch_size = params["batch_size"]
        iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
        data = np.genfromtxt(
            "./data/iris_training.csv", dtype=iris_dtype, delimiter=","
        )
        dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
        dataset = dataset.shuffle(1000).repeat().batch(batch_size, drop_remainder=True)
        return dataset


    def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):
        """ Two layer fully connected model """
        mixed_precision = params.get("mixed_precision", True)
        policy = Policy("infer_float32_vars") if mixed_precision else None
        net = tf.keras.layers.Dense(256, activation=tf.nn.relu, dtype=policy)(features)
        net = tf.keras.layers.Dense(128, activation=tf.nn.relu, dtype=policy)(net)
        logits = tf.keras.layers.Dense(params["num_classes"], dtype=policy)(net)

        learning_rate = tf.constant(params["lr"])
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            loss_op = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
            )
            train_op = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate
            ).minimize(loss_op, global_step=tf.train.get_global_step())

        spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
        return spec


    est = CerebrasEstimator(
        model_fn, config=config, params=params, model_dir="./out", use_cs=True
    )
    est.train(train_input_fn, steps=10000)

.. _step4-example-walk-through-cs-estimator:

Step 4: Edit RunConfig
----------------------

Add or replace ``RunConfig`` with ``CSRunConfig``, and ensure that the three Cerebras-specific ``import`` statements are present.

.. code-block:: python

    from cerebras.models.common.estimator.tf.cs_estimator import CerebrasEstimator
    from cerebras.models.common.estimator.tf.run_config import CSRunConfig
    from cerebras.tf.cs_slurm_cluster_resolver import CSSlurmClusterResolver

    # "ip" holds the IP address of the CS system.
    config = CSRunConfig(
        cs_ip=ip, save_checkpoints_steps=1000, log_step_count_steps=10000
    )


    def train_input_fn(params):
        batch_size = params["batch_size"]
        iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
        data = np.genfromtxt(
            "./data/iris_training.csv", dtype=iris_dtype, delimiter=","
        )
        dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
        dataset = dataset.shuffle(1000).repeat().batch(batch_size, drop_remainder=True)
        return dataset


    def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):
        """ Two layer fully connected model """
        mixed_precision = params.get("mixed_precision", True)
        policy = Policy("infer_float32_vars") if mixed_precision else None
        net = tf.keras.layers.Dense(256, activation=tf.nn.relu, dtype=policy)(features)
        net = tf.keras.layers.Dense(128, activation=tf.nn.relu, dtype=policy)(net)
        logits = tf.keras.layers.Dense(params["num_classes"], dtype=policy)(net)

        learning_rate = tf.constant(params["lr"])
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            loss_op = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
            )
            train_op = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate
            ).minimize(loss_op, global_step=tf.train.get_global_step())

        spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
        return spec


    est = CerebrasEstimator(
        model_fn, config=config, params=params, model_dir="./out", use_cs=True
    )
    est.train(train_input_fn, steps=10000)

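The ``use_cs`` flag shown above also provides a convenient escape hatch while porting: the assumption here is that ``use_cs=False`` makes the same script fall back to ordinary CPU/GPU execution, which is useful for debugging the input and model functions. A minimal sketch:

.. code-block:: python

    # Hypothetical debug run on CPU/GPU: identical call, but with use_cs=False
    # (assumed semantics; check your release's CerebrasEstimator documentation).
    est = CerebrasEstimator(
        model_fn, config=config, params=params, model_dir="./out", use_cs=False
    )
    est.train(train_input_fn, steps=100)
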
.. _walkthrough-mixed-precision:

Step 5: Ensure mixed precision
------------------------------

Finally, ensure that your model runs in mixed precision, using the `tf.keras mixed precision Policy <https://www.tensorflow.org/api_docs/python/tf/keras/mixed_precision/Policy>`_.

.. code-block:: python

    def model_fn(features, labels, mode=tf.estimator.ModeKeys.TRAIN, params=None):
        """ Model definition """
        mixed_precision = params.get("mixed_precision", True)
        policy = Policy("infer_float32_vars") if mixed_precision else None
        if mixed_precision:
            tf.keras.backend.set_floatx("float16")

        # build_model stands in for the stacked Dense layers defined earlier.
        logits = build_model(features, params)

        learning_rate = tf.constant(params["lr"])
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            loss_op = tf.cast(
                tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=labels, logits=logits
                    )
                ),
                dtype=tf.float16 if mixed_precision else tf.float32,
            )
            train_op = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate
            ).minimize(loss_op, global_step=tf.train.get_global_step())

        spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
        return spec

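With the ``infer_float32_vars`` policy, each layer computes in the dtype of its inputs while keeping its trainable variables in float32, so the features themselves must arrive as ``float16``. One way to arrange that (a sketch, not the only option) is to cast in the input function:

.. code-block:: python

    def train_input_fn(params):
        batch_size = params["batch_size"]
        iris_dtype = np.dtype([("img", "float32", 4), ("lbl", "int32", 1)])
        data = np.genfromtxt(
            "./data/iris_training.csv", dtype=iris_dtype, delimiter=","
        )
        dataset = tf.data.Dataset.from_tensor_slices((data["img"][:], data["lbl"][:]))
        # Cast features to float16 so the Dense layers run in half precision;
        # "infer_float32_vars" still keeps the variables in float32.
        dataset = dataset.map(lambda img, lbl: (tf.cast(img, tf.float16), lbl))
        dataset = dataset.shuffle(1000).repeat().batch(batch_size, drop_remainder=True)
        return dataset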