diff --git a/ndsl/dsl/stencil.py b/ndsl/dsl/stencil.py index 75ef28ea..6a70b4d8 100644 --- a/ndsl/dsl/stencil.py +++ b/ndsl/dsl/stencil.py @@ -32,7 +32,7 @@ from ndsl.dsl.typing import Float, Index3D, cast_to_index3d from ndsl.initialization.sizer import GridSizer, SubtileGridSizer from ndsl.quantity import Quantity -from ndsl.testing import comparison +from ndsl.testing.comparison import LegacyMetric try: @@ -68,40 +68,14 @@ def report_difference(args, kwargs, args_copy, kwargs_copy, function_name, gt_id def report_diff(arg: np.ndarray, numpy_arg: np.ndarray, label) -> str: - metric_err = comparison.compare_arr(arg, numpy_arg) - nans_match = np.logical_and(np.isnan(arg), np.isnan(numpy_arg)) - n_points = np.product(arg.shape) - failures_14 = n_points - np.sum( - np.logical_or( - nans_match, - metric_err < 1e-14, - ) - ) - failures_10 = n_points - np.sum( - np.logical_or( - nans_match, - metric_err < 1e-10, - ) + metric = LegacyMetric( + reference_values=arg, + computed_values=numpy_arg, + eps=1e-13, + ignore_near_zero_errors=False, + near_zero=0, ) - failures_8 = n_points - np.sum( - np.logical_or( - nans_match, - metric_err < 1e-8, - ) - ) - greatest_error = np.max(metric_err[~np.isnan(metric_err)]) - if greatest_error == 0.0 and failures_14 == 0: - report = "" - else: - report = f"\n {label}: " - report += f"max_err={greatest_error}" - if failures_14 > 0: - report += f" 1e-14 failures: {failures_14}" - if failures_10 > 0: - report += f" 1e-10 failures: {failures_10}" - if failures_8 > 0: - report += f" 1e-8 failures: {failures_8}" - return report + return metric.__repr__() @dataclasses.dataclass diff --git a/ndsl/stencils/testing/conftest.py b/ndsl/stencils/testing/conftest.py index bd8b2faf..af5bb6a6 100644 --- a/ndsl/stencils/testing/conftest.py +++ b/ndsl/stencils/testing/conftest.py @@ -82,6 +82,12 @@ def pytest_addoption(parser): default="cubed-sphere", help='Topology of the grid. "cubed-sphere" means a 6-faced grid, "doubly-periodic" means a 1 tile grid. Default to "cubed-sphere".', ) + parser.addoption( + "--multimodal_metric", + action="store_true", + default=False, + help="Use the multi-modal float metric. 
Default to False.", + ) def pytest_configure(config): @@ -389,6 +395,11 @@ def failure_stride(pytestconfig): return int(pytestconfig.getoption("failure_stride")) +@pytest.fixture() +def multimodal_metric(pytestconfig): + return bool(pytestconfig.getoption("multimodal_metric")) + + @pytest.fixture() def grid(pytestconfig): return pytestconfig.getoption("grid") diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index d2564868..55ee1001 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -15,7 +15,7 @@ from ndsl.quantity import Quantity from ndsl.restart._legacy_restart import RESTART_PROPERTIES from ndsl.stencils.testing.savepoint import SavepointCase, dataset_to_dict -from ndsl.testing.comparison import compare_scalar, success, success_array +from ndsl.testing.comparison import LegacyMetric, MultiModalFloatMetric from ndsl.testing.perturbation import perturb @@ -32,92 +32,6 @@ def platform(): return "docker" if in_docker else "metal" -def sample_wherefail( - computed_data, - ref_data, - eps, - print_failures, - failure_stride, - test_name, - ignore_near_zero_errors, - near_zero, - xy_indices=False, -): - found_indices = np.where( - np.logical_not( - success_array( - computed_data, ref_data, eps, ignore_near_zero_errors, near_zero - ) - ) - ) - computed_failures = computed_data[found_indices] - reference_failures = ref_data[found_indices] - - # List all errors - return_strings = [] - bad_indices_count = len(found_indices[0]) - # Determine worst result - worst_metric_err = 0.0 - for b in range(bad_indices_count): - full_index = [f[b] for f in found_indices] - metric_err = compare_scalar(computed_failures[b], reference_failures[b]) - abs_err = abs(computed_failures[b] - reference_failures[b]) - if print_failures and b % failure_stride == 0: - return_strings.append( - f"index: {full_index}, computed {computed_failures[b]}, " - f"reference {reference_failures[b]}, " - f"absolute diff {abs_err:.3e}, " - f"metric diff: {metric_err:.3e}" - ) - if np.isnan(metric_err) or (metric_err > worst_metric_err): - worst_metric_err = metric_err - worst_full_idx = full_index - worst_abs_err = abs_err - computed_worst = computed_failures[b] - reference_worst = reference_failures[b] - # Summary and worst result - fullcount = len(ref_data.flatten()) - return_strings.append( - f"Failed count: {bad_indices_count}/{fullcount} " - f"({round(100.0 * (bad_indices_count / fullcount), 2)}%),\n" - f"Worst failed index {worst_full_idx}\n" - f"\tcomputed:{computed_worst}\n" - f"\treference: {reference_worst}\n" - f"\tabsolute diff: {worst_abs_err:.3e}\n" - f"\tmetric diff: {worst_metric_err:.3e}\n" - ) - - if xy_indices: - if len(computed_data.shape) == 3: - axis = 2 - any = np.any - elif len(computed_data.shape) == 4: - axis = (2, 3) - any = np.any - else: - axis = None - - def any(array, axis): - return array - - found_xy_indices = np.where( - any( - np.logical_not( - success_array( - computed_data, ref_data, eps, ignore_near_zero_errors, near_zero - ) - ), - axis=axis, - ) - ) - - return_strings.append( - "failed horizontal indices:" + str(list(zip(*found_xy_indices))) - ) - - return "\n".join(return_strings) - - def process_override(threshold_overrides, testobj, test_name, backend): override = threshold_overrides.get(test_name, None) if override is not None: @@ -229,6 +143,7 @@ def test_sequential_savepoint( subtests, caplog, threshold_overrides, + multimodal_metric, xy_indices=True, ): if case.testobj is None: @@ -257,7 
+172,12 @@ def test_sequential_savepoint( case.testobj.serialnames(case.testobj.in_vars["data_vars"]) + case.testobj.in_vars["parameters"] ) - input_data = {name: input_data[name] for name in input_names} + try: + input_data = {name: input_data[name] for name in input_names} + except KeyError as e: + raise KeyError( + f"Variable {e} was described in the translate test but cannot be found in the NetCDF" + ) original_input_data = copy.deepcopy(input_data) # run python version of functionality output = case.testobj.compute(input_data) @@ -273,23 +193,24 @@ def test_sequential_savepoint( with subtests.test(varname=varname): failing_names.append(varname) output_data = gt_utils.asarray(output[varname]) - assert success( - output_data, - ref_data, - case.testobj.max_error, - ignore_near_zero, - case.testobj.near_zero, - ), sample_wherefail( - output_data, - ref_data, - case.testobj.max_error, - print_failures, - failure_stride, - case.savepoint_name, - ignore_near_zero_errors=ignore_near_zero, - near_zero=case.testobj.near_zero, - xy_indices=xy_indices, - ) + if multimodal_metric: + metric = MultiModalFloatMetric( + reference_values=ref_data, + computed_values=output_data, + eps=case.testobj.max_error, + ignore_near_zero_errors=ignore_near_zero, + near_zero=case.testobj.near_zero, + ) + else: + metric = LegacyMetric( + reference_values=ref_data, + computed_values=output_data, + eps=case.testobj.max_error, + ignore_near_zero_errors=ignore_near_zero, + near_zero=case.testobj.near_zero, + ) + if not metric.check: + pytest.fail(str(metric), pytrace=False) passing_names.append(failing_names.pop()) ref_data_out[varname] = [ref_data] if len(failing_names) > 0: @@ -307,8 +228,12 @@ def test_sequential_savepoint( failing_names, out_filename, ) - assert failing_names == [], f"only the following variables passed: {passing_names}" - assert len(passing_names) > 0, "No tests passed" + if failing_names != []: + pytest.fail( + f"Only the following variables passed: {passing_names}", pytrace=False + ) + if len(passing_names) == 0: + pytest.fail("No tests passed") def state_from_savepoint(serializer, savepoint, name_to_std_name): @@ -354,6 +279,7 @@ def test_parallel_savepoint( caplog, threshold_overrides, grid, + multimodal_metric, xy_indices=True, ): if MPI.COMM_WORLD.Get_size() % 6 != 0: @@ -410,23 +336,24 @@ def test_parallel_savepoint( with subtests.test(varname=varname): failing_names.append(varname) output_data = gt_utils.asarray(output[varname]) - assert success( - output_data, - ref_data[varname][0], - case.testobj.max_error, - ignore_near_zero, - case.testobj.near_zero, - ), sample_wherefail( - output_data, - ref_data[varname][0], - case.testobj.max_error, - print_failures, - failure_stride, - case.savepoint_name, - ignore_near_zero, - case.testobj.near_zero, - xy_indices, - ) + if multimodal_metric: + metric = MultiModalFloatMetric( + reference_values=ref_data[varname][0], + computed_values=output_data, + eps=case.testobj.max_error, + ignore_near_zero_errors=ignore_near_zero, + near_zero=case.testobj.near_zero, + ) + else: + metric = LegacyMetric( + reference_values=ref_data[varname][0], + computed_values=output_data, + eps=case.testobj.max_error, + ignore_near_zero_errors=ignore_near_zero, + near_zero=case.testobj.near_zero, + ) + if not metric.check: + pytest.fail(str(metric), pytrace=False) passing_names.append(failing_names.pop()) if len(failing_names) > 0: os.makedirs(OUTDIR, exist_ok=True) @@ -447,8 +374,12 @@ def test_parallel_savepoint( ) except Exception as error: print(f"TestParallel 
SaveNetCDF Error: {error}") - assert failing_names == [], f"only the following variables passed: {passing_names}" - assert len(passing_names) > 0, "No tests passed" + if failing_names != []: + pytest.fail( + f"Only the following variables passed: {passing_names}", pytrace=False + ) + if len(passing_names) == 0: + pytest.fail("No tests passed") def save_netcdf( @@ -464,6 +395,7 @@ def save_netcdf( data_vars = {} for i, varname in enumerate(failing_names): + # Read in dimensions and attributes if hasattr(testobj, "outputs"): dims = [dim_name + f"_{i}" for dim_name in testobj.outputs[varname]["dims"]] attrs = {"units": testobj.outputs[varname]["units"]} @@ -472,27 +404,33 @@ def save_netcdf( f"dim_{varname}_{j}" for j in range(len(ref_data[varname][0].shape)) ] attrs = {"units": "unknown"} + + # Try to save inputs try: - data_vars[f"{varname}_in"] = xr.DataArray( + data_vars[f"{varname}_input"] = xr.DataArray( np.stack([in_data[varname] for in_data in inputs_list]), dims=("rank",) + tuple([f"{d}_in" for d in dims]), attrs=attrs, ) except KeyError as error: print(f"No input data found for {error}") - data_vars[f"{varname}_ref"] = xr.DataArray( + + # Reference, computed and error computation + data_vars[f"{varname}_reference"] = xr.DataArray( np.stack(ref_data[varname]), dims=("rank",) + tuple([f"{d}_out" for d in dims]), attrs=attrs, ) - data_vars[f"{varname}_out"] = xr.DataArray( + data_vars[f"{varname}_computed"] = xr.DataArray( np.stack([output[varname] for output in output_list]), dims=("rank",) + tuple([f"{d}_out" for d in dims]), attrs=attrs, ) - data_vars[f"{varname}_error"] = ( - data_vars[f"{varname}_ref"] - data_vars[f"{varname}_out"] + absolute_errors = ( + data_vars[f"{varname}_reference"] - data_vars[f"{varname}_computed"] ) - data_vars[f"{varname}_error"].attrs = attrs + data_vars[f"{varname}_absolute_error"] = absolute_errors + data_vars[f"{varname}_absolute_error"].attrs = attrs + print(f"File saved to {out_filename}") xr.Dataset(data_vars=data_vars).to_netcdf(out_filename) diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py index 0ffe55ed..9812e064 100644 --- a/ndsl/testing/comparison.py +++ b/ndsl/testing/comparison.py @@ -1,68 +1,265 @@ from typing import Union import numpy as np +import numpy.typing as npt -def compare_arr(computed_data, ref_data): - """ - Smooth error near zero values. - Inputs are arrays. +class BaseMetric: + def __init__( + self, + reference_values: np.ndarray, + computed_values: np.ndarray, + ): + self.references = np.atleast_1d(reference_values) + self.computed = np.atleast_1d(computed_values) + self.check = False + + def __str__(self) -> str: + ... + + def __repr__(self) -> str: + ... + + +class LegacyMetric(BaseMetric): + """Legacy (AI2) metric used for original FV3 port. + + This metric attempts to smooth error comparison around 0. + It further tries to deal with close-to-0 breakdown of absolute + error by allowing `near_zero` threshold to be specified by hand. """ - if ref_data.dtype in (np.float64, np.int64, np.float32, np.int32): - denom = np.abs(ref_data) + np.abs(computed_data) - compare = np.asarray(2.0 * np.abs(computed_data - ref_data) / denom) - compare[denom == 0] = 0.0 - return compare - elif ref_data.dtype in (np.bool,): - return np.logical_xor(computed_data, ref_data) - else: - raise TypeError(f"recieved data with unexpected dtype {ref_data.dtype}") - - -def compare_scalar(computed_data: np.float64, ref_data: np.float64) -> np.float64: - """Smooth error near zero values. 
Scalar versions.""" - err_as_array = compare_arr(np.atleast_1d(computed_data), np.atleast_1d(ref_data)) - return err_as_array[0] - - -def success_array( - computed_data: np.ndarray, - ref_data: np.ndarray, - eps: float, - ignore_near_zero_errors: Union[dict, bool], - near_zero: float, -): - success = np.logical_or( - np.logical_and(np.isnan(computed_data), np.isnan(ref_data)), - compare_arr(computed_data, ref_data) < eps, - ) - if isinstance(ignore_near_zero_errors, dict): - if ignore_near_zero_errors.keys(): - near_zero = ignore_near_zero_errors["near_zero"] + + def __init__( + self, + reference_values: np.ndarray, + computed_values: np.ndarray, + eps: float, + ignore_near_zero_errors: Union[dict, bool], + near_zero: float, + ): + super().__init__(reference_values, computed_values) + self.eps = eps + self.success = self._compute_errors( + ignore_near_zero_errors, + near_zero, + ) + self.check = np.all(self.success) + self._calculated_metric = np.empty_like(self.references) + + def _compute_errors( + self, + ignore_near_zero_errors, + near_zero, + ) -> npt.NDArray[np.bool_]: + if self.references.dtype in (np.float64, np.int64, np.float32, np.int32): + denom = np.abs(self.references) + np.abs(self.computed) + self._calculated_metric = np.asarray( + 2.0 * np.abs(self.computed - self.references) / denom + ) + self._calculated_metric[denom == 0] = 0.0 + elif self.references.dtype in (np.bool_, bool): + self._calculated_metric = np.logical_xor(self.computed, self.references) + else: + raise TypeError( + f"recieved data with unexpected dtype {self.references.dtype}" + ) + success = np.logical_or( + np.logical_and(np.isnan(self.computed), np.isnan(self.references)), + self._calculated_metric < self.eps, + ) + if isinstance(ignore_near_zero_errors, dict): + if ignore_near_zero_errors.keys(): + near_zero = ignore_near_zero_errors["near_zero"] + success = np.logical_or( + success, + np.logical_and( + np.abs(self.computed) < near_zero, + np.abs(self.references) < near_zero, + ), + ) + elif ignore_near_zero_errors: success = np.logical_or( success, np.logical_and( - np.abs(computed_data) < near_zero, - np.abs(ref_data) < near_zero, + np.abs(self.computed) < near_zero, + np.abs(self.references) < near_zero, ), ) - elif ignore_near_zero_errors: - success = np.logical_or( - success, - np.logical_and( - np.abs(computed_data) < near_zero, np.abs(ref_data) < near_zero - ), - ) - return success + return success + def __str__(self) -> str: + return self.__repr__() -def success(computed_data, ref_data, eps, ignore_near_zero_errors, near_zero=0.0): - return np.all( - success_array( - np.asarray(computed_data), - np.asarray(ref_data), - eps, - ignore_near_zero_errors, - near_zero, + def __repr__(self) -> str: + if self.check: + return "✅ No numerical differences" + + report = [] + report.append("❌ Numerical failures") + + found_indices = np.logical_not(self.success).nonzero() + computed_failures = self.computed[found_indices] + reference_failures = self.references[found_indices] + + # List all errors + bad_indices_count = len(found_indices[0]) + # Determine worst result + worst_metric_err = 0.0 + abs_errs = [] + details = [ + "All failures:", + "Index Computed Reference Absloute E Metric E", + ] + for b in range(bad_indices_count): + full_index = tuple([f[b] for f in found_indices]) + + metric_err = self._calculated_metric[full_index] + + absolute_distance = abs(computed_failures[b] - reference_failures[b]) + abs_errs.append(absolute_distance) + + details.append( + f"{full_index} {computed_failures[b]} " + 
f"{reference_failures[b]} {abs_errs[-1]:.3e} {metric_err:.3e}" + ) + + if np.isnan(metric_err) or (metric_err > worst_metric_err): + worst_metric_err = metric_err + worst_full_idx = full_index + worst_abs_err = abs_errs[-1] + computed_worst = computed_failures[b] + reference_worst = reference_failures[b] + # Try to quantify noisy errors + unique_errors = len(np.unique(np.array(abs_errs))) + # Summary and worst result + fullcount = len(self.references.flatten()) + report.append( + f"Failed count: {bad_indices_count}/{fullcount} " + f"({round(100.0 * (bad_indices_count / fullcount), 2)}%),\n" + f"Worst failed index {worst_full_idx}\n" + f" Computed:{computed_worst}\n" + f" Reference: {reference_worst}\n" + f" Absolute diff: {worst_abs_err:.3e}\n" + f" Metric diff: {worst_metric_err:.3e}\n" + f" Metric threshold: {self.eps}\n" + f" Noise quantification:\n" + f" Reference dtype: {type(reference_worst)}\n" + f" Unique errors: {unique_errors}/{bad_indices_count}" ) - ) + report.extend(details) + + return "\n".join(report) + + +class MultiModalFloatMetric(BaseMetric): + """Combination of absolute, relative & ULP comparison for floats + + This metric attempts to combine well known comparison on floats + to leverage a robust 32/64 bit float comparison on large accumulating + floating errors. + + ULP is used to clear noise (ULP<=1.0 passes) + Absolute errors for large amplitute + """ + + _f32_absolute_eps = 1e-10 + _f64_absolute_eps = 1e-13 + relative_fraction = 0.000001 # 0.0001% + ulp_threshold = 1.0 + + def __init__( + self, + reference_values: np.ndarray, + computed_values: np.ndarray, + eps: float, + **kwargs, + ): + super().__init__(reference_values, computed_values) + self.absolute_distance = np.empty_like(self.references) + self.absolute_distance_metric = np.empty_like(self.references, dtype=np.bool_) + self.relative_distance = np.empty_like(self.references) + self.relative_distance_metric = np.empty_like(self.references, dtype=np.bool_) + self.ulp_distance = np.empty_like(self.references) + self.ulp_distance_metric = np.empty_like(self.references, dtype=np.bool_) + + if self.references.dtype is (np.float32, np.int32): + self.absolute_eps = max(eps, self._f32_absolute_eps) + else: + self.absolute_eps = max(eps, self._f64_absolute_eps) + + self.success = self._compute_all_metrics() + self.check = np.all(self.success) + + def _compute_all_metrics( + self, + ) -> npt.NDArray[np.bool_]: + if self.references.dtype in (np.float64, np.int64, np.float32, np.int32): + max_values = np.maximum( + np.absolute(self.computed), np.absolute(self.references) + ) + # Absolute distance + self.absolute_distance = np.absolute(self.computed - self.references) + self.absolute_distance_metric = self.absolute_distance < self.absolute_eps + # Relative distance (in pct) + self.relative_distance = np.divide(self.absolute_distance, max_values) + self.relative_distance_metric = ( + self.absolute_distance < self.relative_fraction * max_values + ) + # ULP distance + self.ulp_distance = np.divide( + self.absolute_distance, np.spacing(max_values) + ) + self.ulp_distance_metric = self.ulp_distance <= self.ulp_threshold + + # Combine all distances into sucess or failure + # Success = no NANs & ( abs or rel or ulp ) + naninf_success = not np.logical_and( + np.isnan(self.computed), np.isnan(self.references) + ).all() + metric_success = np.logical_or( + self.relative_distance_metric, self.absolute_distance_metric + ) + metric_success = np.logical_or(metric_success, self.ulp_distance_metric) + success = 
np.logical_and(naninf_success, metric_success) + return success + elif self.references.dtype in (np.bool_, bool): + success = np.logical_xor(self.computed, self.references) + return success + else: + raise TypeError( + f"recieved data with unexpected dtype {self.references.dtype}" + ) + + def __str__(self) -> str: + return self.__repr__() + + def __repr__(self) -> str: + if self.check: + return "✅ No numerical differences" + + report = [] + report.append("❌ Numerical failures") + + found_indices = np.logical_not(self.success).nonzero() + # List all errors + bad_indices_count = len(found_indices[0]) + full_count = len(self.references.flatten()) + failures_pct = round(100.0 * (bad_indices_count / full_count), 2) + report = [ + f"All failures ({bad_indices_count}/{full_count}) ({failures_pct}%),\n", + f"Index Computed Reference " + f"Absolute E(<{self.absolute_eps:.2e}) " + f"Relative E(<{self.relative_fraction*100:.2e}%) " + f"ULP E(<{self.ulp_threshold})", + ] + # Summary and worst result + for iBad in range(bad_indices_count): + fi = tuple([f[iBad] for f in found_indices]) + report.append( + f"({fi[0]:02}, {fi[1]:02}, {fi[2]:02}) {self.computed[fi]:.16e} {self.references[fi]:.16e} " + f"{self.absolute_distance[fi]:.2e} {'✅' if self.absolute_distance_metric[fi] else '❌'} " + f"{self.relative_distance[fi] * 100:.2e} {'✅' if self.relative_distance_metric[fi] else '❌'} " + f"{int(self.ulp_distance[fi]):02} {'✅' if self.ulp_distance_metric[fi] else '❌'} " + ) + + return "\n".join(report)
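
Usage sketch (not part of the patch): a minimal example of how the two metric classes added in ndsl/testing/comparison.py are exercised, mirroring what test_translate.py now does when the new --multimodal_metric flag is or is not passed. The array shapes, seed, and injected error magnitudes below are illustrative assumptions only.

# minimal_metric_demo.py -- assumes this patch is applied
import numpy as np

from ndsl.testing.comparison import LegacyMetric, MultiModalFloatMetric

# Illustrative 3D fields (the multimodal report formatting assumes 3D indices).
rng = np.random.default_rng(0)
reference = rng.random((4, 4, 3), dtype=np.float64)
computed = reference.copy()
computed[0, 0, 0] += 1e-9  # tiny drift: fails the legacy relative metric at eps=1e-13
computed[1, 2, 0] += 1e-3  # large error: fails the absolute, relative and ULP checks

legacy = LegacyMetric(
    reference_values=reference,
    computed_values=computed,
    eps=1e-13,
    ignore_near_zero_errors=False,
    near_zero=0.0,
)
multimodal = MultiModalFloatMetric(
    reference_values=reference,
    computed_values=computed,
    eps=1e-13,
    ignore_near_zero_errors=False,
    near_zero=0.0,
)

# .check is True only when every point passes; str(metric) is the report that
# test_translate.py now hands to pytest.fail().
print(legacy.check, multimodal.check)
print(multimodal)

# From the test suite, the same choice is made per run:
#   pytest --multimodal_metric ...  -> MultiModalFloatMetric
#   pytest ...                      -> LegacyMetric (default)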