diff --git a/ndsl/dsl/stencil.py b/ndsl/dsl/stencil.py index 75ef28ea..6a70b4d8 100644 --- a/ndsl/dsl/stencil.py +++ b/ndsl/dsl/stencil.py @@ -32,7 +32,7 @@ from ndsl.dsl.typing import Float, Index3D, cast_to_index3d from ndsl.initialization.sizer import GridSizer, SubtileGridSizer from ndsl.quantity import Quantity -from ndsl.testing import comparison +from ndsl.testing.comparison import LegacyMetric try: @@ -68,40 +68,14 @@ def report_difference(args, kwargs, args_copy, kwargs_copy, function_name, gt_id def report_diff(arg: np.ndarray, numpy_arg: np.ndarray, label) -> str: - metric_err = comparison.compare_arr(arg, numpy_arg) - nans_match = np.logical_and(np.isnan(arg), np.isnan(numpy_arg)) - n_points = np.product(arg.shape) - failures_14 = n_points - np.sum( - np.logical_or( - nans_match, - metric_err < 1e-14, - ) - ) - failures_10 = n_points - np.sum( - np.logical_or( - nans_match, - metric_err < 1e-10, - ) + metric = LegacyMetric( + reference_values=arg, + computed_values=numpy_arg, + eps=1e-13, + ignore_near_zero_errors=False, + near_zero=0, ) - failures_8 = n_points - np.sum( - np.logical_or( - nans_match, - metric_err < 1e-8, - ) - ) - greatest_error = np.max(metric_err[~np.isnan(metric_err)]) - if greatest_error == 0.0 and failures_14 == 0: - report = "" - else: - report = f"\n {label}: " - report += f"max_err={greatest_error}" - if failures_14 > 0: - report += f" 1e-14 failures: {failures_14}" - if failures_10 > 0: - report += f" 1e-10 failures: {failures_10}" - if failures_8 > 0: - report += f" 1e-8 failures: {failures_8}" - return report + return metric.__repr__() @dataclasses.dataclass diff --git a/ndsl/stencils/testing/conftest.py b/ndsl/stencils/testing/conftest.py index bd8b2faf..af5bb6a6 100644 --- a/ndsl/stencils/testing/conftest.py +++ b/ndsl/stencils/testing/conftest.py @@ -82,6 +82,12 @@ def pytest_addoption(parser): default="cubed-sphere", help='Topology of the grid. "cubed-sphere" means a 6-faced grid, "doubly-periodic" means a 1 tile grid. Default to "cubed-sphere".', ) + parser.addoption( + "--multimodal_metric", + action="store_true", + default=False, + help="Use the multi-modal float metric. 
Default to False.", + ) def pytest_configure(config): @@ -389,6 +395,11 @@ def failure_stride(pytestconfig): return int(pytestconfig.getoption("failure_stride")) +@pytest.fixture() +def multimodal_metric(pytestconfig): + return bool(pytestconfig.getoption("multimodal_metric")) + + @pytest.fixture() def grid(pytestconfig): return pytestconfig.getoption("grid") diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index d2564868..55ee1001 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -15,7 +15,7 @@ from ndsl.quantity import Quantity from ndsl.restart._legacy_restart import RESTART_PROPERTIES from ndsl.stencils.testing.savepoint import SavepointCase, dataset_to_dict -from ndsl.testing.comparison import compare_scalar, success, success_array +from ndsl.testing.comparison import LegacyMetric, MultiModalFloatMetric from ndsl.testing.perturbation import perturb @@ -32,92 +32,6 @@ def platform(): return "docker" if in_docker else "metal" -def sample_wherefail( - computed_data, - ref_data, - eps, - print_failures, - failure_stride, - test_name, - ignore_near_zero_errors, - near_zero, - xy_indices=False, -): - found_indices = np.where( - np.logical_not( - success_array( - computed_data, ref_data, eps, ignore_near_zero_errors, near_zero - ) - ) - ) - computed_failures = computed_data[found_indices] - reference_failures = ref_data[found_indices] - - # List all errors - return_strings = [] - bad_indices_count = len(found_indices[0]) - # Determine worst result - worst_metric_err = 0.0 - for b in range(bad_indices_count): - full_index = [f[b] for f in found_indices] - metric_err = compare_scalar(computed_failures[b], reference_failures[b]) - abs_err = abs(computed_failures[b] - reference_failures[b]) - if print_failures and b % failure_stride == 0: - return_strings.append( - f"index: {full_index}, computed {computed_failures[b]}, " - f"reference {reference_failures[b]}, " - f"absolute diff {abs_err:.3e}, " - f"metric diff: {metric_err:.3e}" - ) - if np.isnan(metric_err) or (metric_err > worst_metric_err): - worst_metric_err = metric_err - worst_full_idx = full_index - worst_abs_err = abs_err - computed_worst = computed_failures[b] - reference_worst = reference_failures[b] - # Summary and worst result - fullcount = len(ref_data.flatten()) - return_strings.append( - f"Failed count: {bad_indices_count}/{fullcount} " - f"({round(100.0 * (bad_indices_count / fullcount), 2)}%),\n" - f"Worst failed index {worst_full_idx}\n" - f"\tcomputed:{computed_worst}\n" - f"\treference: {reference_worst}\n" - f"\tabsolute diff: {worst_abs_err:.3e}\n" - f"\tmetric diff: {worst_metric_err:.3e}\n" - ) - - if xy_indices: - if len(computed_data.shape) == 3: - axis = 2 - any = np.any - elif len(computed_data.shape) == 4: - axis = (2, 3) - any = np.any - else: - axis = None - - def any(array, axis): - return array - - found_xy_indices = np.where( - any( - np.logical_not( - success_array( - computed_data, ref_data, eps, ignore_near_zero_errors, near_zero - ) - ), - axis=axis, - ) - ) - - return_strings.append( - "failed horizontal indices:" + str(list(zip(*found_xy_indices))) - ) - - return "\n".join(return_strings) - - def process_override(threshold_overrides, testobj, test_name, backend): override = threshold_overrides.get(test_name, None) if override is not None: @@ -229,6 +143,7 @@ def test_sequential_savepoint( subtests, caplog, threshold_overrides, + multimodal_metric, xy_indices=True, ): if case.testobj is None: @@ -257,7 
+172,12 @@ def test_sequential_savepoint( case.testobj.serialnames(case.testobj.in_vars["data_vars"]) + case.testobj.in_vars["parameters"] ) - input_data = {name: input_data[name] for name in input_names} + try: + input_data = {name: input_data[name] for name in input_names} + except KeyError as e: + raise KeyError( + f"Variable {e} was described in the translate test but cannot be found in the NetCDF" + ) original_input_data = copy.deepcopy(input_data) # run python version of functionality output = case.testobj.compute(input_data) @@ -273,23 +193,24 @@ def test_sequential_savepoint( with subtests.test(varname=varname): failing_names.append(varname) output_data = gt_utils.asarray(output[varname]) - assert success( - output_data, - ref_data, - case.testobj.max_error, - ignore_near_zero, - case.testobj.near_zero, - ), sample_wherefail( - output_data, - ref_data, - case.testobj.max_error, - print_failures, - failure_stride, - case.savepoint_name, - ignore_near_zero_errors=ignore_near_zero, - near_zero=case.testobj.near_zero, - xy_indices=xy_indices, - ) + if multimodal_metric: + metric = MultiModalFloatMetric( + reference_values=ref_data, + computed_values=output_data, + eps=case.testobj.max_error, + ignore_near_zero_errors=ignore_near_zero, + near_zero=case.testobj.near_zero, + ) + else: + metric = LegacyMetric( + reference_values=ref_data, + computed_values=output_data, + eps=case.testobj.max_error, + ignore_near_zero_errors=ignore_near_zero, + near_zero=case.testobj.near_zero, + ) + if not metric.check: + pytest.fail(str(metric), pytrace=False) passing_names.append(failing_names.pop()) ref_data_out[varname] = [ref_data] if len(failing_names) > 0: @@ -307,8 +228,12 @@ def test_sequential_savepoint( failing_names, out_filename, ) - assert failing_names == [], f"only the following variables passed: {passing_names}" - assert len(passing_names) > 0, "No tests passed" + if failing_names != []: + pytest.fail( + f"Only the following variables passed: {passing_names}", pytrace=False + ) + if len(passing_names) == 0: + pytest.fail("No tests passed") def state_from_savepoint(serializer, savepoint, name_to_std_name): @@ -354,6 +279,7 @@ def test_parallel_savepoint( caplog, threshold_overrides, grid, + multimodal_metric, xy_indices=True, ): if MPI.COMM_WORLD.Get_size() % 6 != 0: @@ -410,23 +336,24 @@ def test_parallel_savepoint( with subtests.test(varname=varname): failing_names.append(varname) output_data = gt_utils.asarray(output[varname]) - assert success( - output_data, - ref_data[varname][0], - case.testobj.max_error, - ignore_near_zero, - case.testobj.near_zero, - ), sample_wherefail( - output_data, - ref_data[varname][0], - case.testobj.max_error, - print_failures, - failure_stride, - case.savepoint_name, - ignore_near_zero, - case.testobj.near_zero, - xy_indices, - ) + if multimodal_metric: + metric = MultiModalFloatMetric( + reference_values=ref_data[varname][0], + computed_values=output_data, + eps=case.testobj.max_error, + ignore_near_zero_errors=ignore_near_zero, + near_zero=case.testobj.near_zero, + ) + else: + metric = LegacyMetric( + reference_values=ref_data[varname][0], + computed_values=output_data, + eps=case.testobj.max_error, + ignore_near_zero_errors=ignore_near_zero, + near_zero=case.testobj.near_zero, + ) + if not metric.check: + pytest.fail(str(metric), pytrace=False) passing_names.append(failing_names.pop()) if len(failing_names) > 0: os.makedirs(OUTDIR, exist_ok=True) @@ -447,8 +374,12 @@ def test_parallel_savepoint( ) except Exception as error: print(f"TestParallel 
SaveNetCDF Error: {error}") - assert failing_names == [], f"only the following variables passed: {passing_names}" - assert len(passing_names) > 0, "No tests passed" + if failing_names != []: + pytest.fail( + f"Only the following variables passed: {passing_names}", pytrace=False + ) + if len(passing_names) == 0: + pytest.fail("No tests passed") def save_netcdf( @@ -464,6 +395,7 @@ def save_netcdf( data_vars = {} for i, varname in enumerate(failing_names): + # Read in dimensions and attributes if hasattr(testobj, "outputs"): dims = [dim_name + f"_{i}" for dim_name in testobj.outputs[varname]["dims"]] attrs = {"units": testobj.outputs[varname]["units"]} @@ -472,27 +404,33 @@ def save_netcdf( f"dim_{varname}_{j}" for j in range(len(ref_data[varname][0].shape)) ] attrs = {"units": "unknown"} + + # Try to save inputs try: - data_vars[f"{varname}_in"] = xr.DataArray( + data_vars[f"{varname}_input"] = xr.DataArray( np.stack([in_data[varname] for in_data in inputs_list]), dims=("rank",) + tuple([f"{d}_in" for d in dims]), attrs=attrs, ) except KeyError as error: print(f"No input data found for {error}") - data_vars[f"{varname}_ref"] = xr.DataArray( + + # Reference, computed and error computation + data_vars[f"{varname}_reference"] = xr.DataArray( np.stack(ref_data[varname]), dims=("rank",) + tuple([f"{d}_out" for d in dims]), attrs=attrs, ) - data_vars[f"{varname}_out"] = xr.DataArray( + data_vars[f"{varname}_computed"] = xr.DataArray( np.stack([output[varname] for output in output_list]), dims=("rank",) + tuple([f"{d}_out" for d in dims]), attrs=attrs, ) - data_vars[f"{varname}_error"] = ( - data_vars[f"{varname}_ref"] - data_vars[f"{varname}_out"] + absolute_errors = ( + data_vars[f"{varname}_reference"] - data_vars[f"{varname}_computed"] ) - data_vars[f"{varname}_error"].attrs = attrs + data_vars[f"{varname}_absolute_error"] = absolute_errors + data_vars[f"{varname}_absolute_error"].attrs = attrs + print(f"File saved to {out_filename}") xr.Dataset(data_vars=data_vars).to_netcdf(out_filename) diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py index 0ffe55ed..9812e064 100644 --- a/ndsl/testing/comparison.py +++ b/ndsl/testing/comparison.py @@ -1,68 +1,265 @@ from typing import Union import numpy as np +import numpy.typing as npt -def compare_arr(computed_data, ref_data): - """ - Smooth error near zero values. - Inputs are arrays. +class BaseMetric: + def __init__( + self, + reference_values: np.ndarray, + computed_values: np.ndarray, + ): + self.references = np.atleast_1d(reference_values) + self.computed = np.atleast_1d(computed_values) + self.check = False + + def __str__(self) -> str: + ... + + def __repr__(self) -> str: + ... + + +class LegacyMetric(BaseMetric): + """Legacy (AI2) metric used for original FV3 port. + + This metric attempts to smooth error comparison around 0. + It further tries to deal with close-to-0 breakdown of absolute + error by allowing `near_zero` threshold to be specified by hand. """ - if ref_data.dtype in (np.float64, np.int64, np.float32, np.int32): - denom = np.abs(ref_data) + np.abs(computed_data) - compare = np.asarray(2.0 * np.abs(computed_data - ref_data) / denom) - compare[denom == 0] = 0.0 - return compare - elif ref_data.dtype in (np.bool,): - return np.logical_xor(computed_data, ref_data) - else: - raise TypeError(f"recieved data with unexpected dtype {ref_data.dtype}") - - -def compare_scalar(computed_data: np.float64, ref_data: np.float64) -> np.float64: - """Smooth error near zero values. 
Scalar versions.""" - err_as_array = compare_arr(np.atleast_1d(computed_data), np.atleast_1d(ref_data)) - return err_as_array[0] - - -def success_array( - computed_data: np.ndarray, - ref_data: np.ndarray, - eps: float, - ignore_near_zero_errors: Union[dict, bool], - near_zero: float, -): - success = np.logical_or( - np.logical_and(np.isnan(computed_data), np.isnan(ref_data)), - compare_arr(computed_data, ref_data) < eps, - ) - if isinstance(ignore_near_zero_errors, dict): - if ignore_near_zero_errors.keys(): - near_zero = ignore_near_zero_errors["near_zero"] + + def __init__( + self, + reference_values: np.ndarray, + computed_values: np.ndarray, + eps: float, + ignore_near_zero_errors: Union[dict, bool], + near_zero: float, + ): + super().__init__(reference_values, computed_values) + self.eps = eps + self.success = self._compute_errors( + ignore_near_zero_errors, + near_zero, + ) + self.check = np.all(self.success) + self._calculated_metric = np.empty_like(self.references) + + def _compute_errors( + self, + ignore_near_zero_errors, + near_zero, + ) -> npt.NDArray[np.bool_]: + if self.references.dtype in (np.float64, np.int64, np.float32, np.int32): + denom = np.abs(self.references) + np.abs(self.computed) + self._calculated_metric = np.asarray( + 2.0 * np.abs(self.computed - self.references) / denom + ) + self._calculated_metric[denom == 0] = 0.0 + elif self.references.dtype in (np.bool_, bool): + self._calculated_metric = np.logical_xor(self.computed, self.references) + else: + raise TypeError( + f"recieved data with unexpected dtype {self.references.dtype}" + ) + success = np.logical_or( + np.logical_and(np.isnan(self.computed), np.isnan(self.references)), + self._calculated_metric < self.eps, + ) + if isinstance(ignore_near_zero_errors, dict): + if ignore_near_zero_errors.keys(): + near_zero = ignore_near_zero_errors["near_zero"] + success = np.logical_or( + success, + np.logical_and( + np.abs(self.computed) < near_zero, + np.abs(self.references) < near_zero, + ), + ) + elif ignore_near_zero_errors: success = np.logical_or( success, np.logical_and( - np.abs(computed_data) < near_zero, - np.abs(ref_data) < near_zero, + np.abs(self.computed) < near_zero, + np.abs(self.references) < near_zero, ), ) - elif ignore_near_zero_errors: - success = np.logical_or( - success, - np.logical_and( - np.abs(computed_data) < near_zero, np.abs(ref_data) < near_zero - ), - ) - return success + return success + def __str__(self) -> str: + return self.__repr__() -def success(computed_data, ref_data, eps, ignore_near_zero_errors, near_zero=0.0): - return np.all( - success_array( - np.asarray(computed_data), - np.asarray(ref_data), - eps, - ignore_near_zero_errors, - near_zero, + def __repr__(self) -> str: + if self.check: + return "✅ No numerical differences" + + report = [] + report.append("❌ Numerical failures") + + found_indices = np.logical_not(self.success).nonzero() + computed_failures = self.computed[found_indices] + reference_failures = self.references[found_indices] + + # List all errors + bad_indices_count = len(found_indices[0]) + # Determine worst result + worst_metric_err = 0.0 + abs_errs = [] + details = [ + "All failures:", + "Index Computed Reference Absloute E Metric E", + ] + for b in range(bad_indices_count): + full_index = tuple([f[b] for f in found_indices]) + + metric_err = self._calculated_metric[full_index] + + absolute_distance = abs(computed_failures[b] - reference_failures[b]) + abs_errs.append(absolute_distance) + + details.append( + f"{full_index} {computed_failures[b]} " + 
f"{reference_failures[b]} {abs_errs[-1]:.3e} {metric_err:.3e}" + ) + + if np.isnan(metric_err) or (metric_err > worst_metric_err): + worst_metric_err = metric_err + worst_full_idx = full_index + worst_abs_err = abs_errs[-1] + computed_worst = computed_failures[b] + reference_worst = reference_failures[b] + # Try to quantify noisy errors + unique_errors = len(np.unique(np.array(abs_errs))) + # Summary and worst result + fullcount = len(self.references.flatten()) + report.append( + f"Failed count: {bad_indices_count}/{fullcount} " + f"({round(100.0 * (bad_indices_count / fullcount), 2)}%),\n" + f"Worst failed index {worst_full_idx}\n" + f" Computed:{computed_worst}\n" + f" Reference: {reference_worst}\n" + f" Absolute diff: {worst_abs_err:.3e}\n" + f" Metric diff: {worst_metric_err:.3e}\n" + f" Metric threshold: {self.eps}\n" + f" Noise quantification:\n" + f" Reference dtype: {type(reference_worst)}\n" + f" Unique errors: {unique_errors}/{bad_indices_count}" ) - ) + report.extend(details) + + return "\n".join(report) + + +class MultiModalFloatMetric(BaseMetric): + """Combination of absolute, relative & ULP comparison for floats + + This metric attempts to combine well known comparison on floats + to leverage a robust 32/64 bit float comparison on large accumulating + floating errors. + + ULP is used to clear noise (ULP<=1.0 passes) + Absolute errors for large amplitute + """ + + _f32_absolute_eps = 1e-10 + _f64_absolute_eps = 1e-13 + relative_fraction = 0.000001 # 0.0001% + ulp_threshold = 1.0 + + def __init__( + self, + reference_values: np.ndarray, + computed_values: np.ndarray, + eps: float, + **kwargs, + ): + super().__init__(reference_values, computed_values) + self.absolute_distance = np.empty_like(self.references) + self.absolute_distance_metric = np.empty_like(self.references, dtype=np.bool_) + self.relative_distance = np.empty_like(self.references) + self.relative_distance_metric = np.empty_like(self.references, dtype=np.bool_) + self.ulp_distance = np.empty_like(self.references) + self.ulp_distance_metric = np.empty_like(self.references, dtype=np.bool_) + + if self.references.dtype is (np.float32, np.int32): + self.absolute_eps = max(eps, self._f32_absolute_eps) + else: + self.absolute_eps = max(eps, self._f64_absolute_eps) + + self.success = self._compute_all_metrics() + self.check = np.all(self.success) + + def _compute_all_metrics( + self, + ) -> npt.NDArray[np.bool_]: + if self.references.dtype in (np.float64, np.int64, np.float32, np.int32): + max_values = np.maximum( + np.absolute(self.computed), np.absolute(self.references) + ) + # Absolute distance + self.absolute_distance = np.absolute(self.computed - self.references) + self.absolute_distance_metric = self.absolute_distance < self.absolute_eps + # Relative distance (in pct) + self.relative_distance = np.divide(self.absolute_distance, max_values) + self.relative_distance_metric = ( + self.absolute_distance < self.relative_fraction * max_values + ) + # ULP distance + self.ulp_distance = np.divide( + self.absolute_distance, np.spacing(max_values) + ) + self.ulp_distance_metric = self.ulp_distance <= self.ulp_threshold + + # Combine all distances into sucess or failure + # Success = no NANs & ( abs or rel or ulp ) + naninf_success = not np.logical_and( + np.isnan(self.computed), np.isnan(self.references) + ).all() + metric_success = np.logical_or( + self.relative_distance_metric, self.absolute_distance_metric + ) + metric_success = np.logical_or(metric_success, self.ulp_distance_metric) + success = 
np.logical_and(naninf_success, metric_success) + return success + elif self.references.dtype in (np.bool_, bool): + success = np.logical_xor(self.computed, self.references) + return success + else: + raise TypeError( + f"recieved data with unexpected dtype {self.references.dtype}" + ) + + def __str__(self) -> str: + return self.__repr__() + + def __repr__(self) -> str: + if self.check: + return "✅ No numerical differences" + + report = [] + report.append("❌ Numerical failures") + + found_indices = np.logical_not(self.success).nonzero() + # List all errors + bad_indices_count = len(found_indices[0]) + full_count = len(self.references.flatten()) + failures_pct = round(100.0 * (bad_indices_count / full_count), 2) + report = [ + f"All failures ({bad_indices_count}/{full_count}) ({failures_pct}%),\n", + f"Index Computed Reference " + f"Absolute E(<{self.absolute_eps:.2e}) " + f"Relative E(<{self.relative_fraction*100:.2e}%) " + f"ULP E(<{self.ulp_threshold})", + ] + # Summary and worst result + for iBad in range(bad_indices_count): + fi = tuple([f[iBad] for f in found_indices]) + report.append( + f"({fi[0]:02}, {fi[1]:02}, {fi[2]:02}) {self.computed[fi]:.16e} {self.references[fi]:.16e} " + f"{self.absolute_distance[fi]:.2e} {'✅' if self.absolute_distance_metric[fi] else '❌'} " + f"{self.relative_distance[fi] * 100:.2e} {'✅' if self.relative_distance_metric[fi] else '❌'} " + f"{int(self.ulp_distance[fi]):02} {'✅' if self.ulp_distance_metric[fi] else '❌'} " + ) + + return "\n".join(report)
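
Usage sketch (not part of the patch): a minimal example of how the two metric classes added in ndsl/testing/comparison.py are exercised, mirroring what test_translate.py now does when the new --multimodal_metric flag is or is not passed. The array shapes, seed, and injected error magnitudes below are illustrative assumptions only.

# minimal_metric_demo.py -- assumes this patch is applied
import numpy as np

from ndsl.testing.comparison import LegacyMetric, MultiModalFloatMetric

# Illustrative 3D fields (the multimodal report formatting assumes 3D indices).
rng = np.random.default_rng(0)
reference = rng.random((4, 4, 3), dtype=np.float64)
computed = reference.copy()
computed[0, 0, 0] += 1e-9  # tiny drift: fails the legacy relative metric at eps=1e-13
computed[1, 2, 0] += 1e-3  # large error: fails the absolute, relative and ULP checks

legacy = LegacyMetric(
    reference_values=reference,
    computed_values=computed,
    eps=1e-13,
    ignore_near_zero_errors=False,
    near_zero=0.0,
)
multimodal = MultiModalFloatMetric(
    reference_values=reference,
    computed_values=computed,
    eps=1e-13,
    ignore_near_zero_errors=False,
    near_zero=0.0,
)

# .check is True only when every point passes; str(metric) is the report that
# test_translate.py now hands to pytest.fail().
print(legacy.check, multimodal.check)
print(multimodal)

# From the test suite, the same choice is made per run:
#   pytest --multimodal_metric ...  -> MultiModalFloatMetric
#   pytest ...                      -> LegacyMetric (default)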