Merge pull request #55 from smart-on-fhir/mikix/option
feat: support --option output-mode:aggregate in addition to env var
mikix authored Jul 10, 2024
2 parents 57b6237 + fd71835 commit 00da4f8
Showing 5 changed files with 45 additions and 35 deletions.
6 changes: 2 additions & 4 deletions README.md
@@ -93,11 +93,9 @@ cumulus-library export \
This study generates `CUBE` output by default.
If it's easier to work with simple aggregate counts of every value combination
(that is, without the partial value combinations that `CUBE()` generates),
- run the build step with `DATA_METRICS_OUTPUT_MODE=aggregate` in your environment.
+ run the build step with `--option output-mode:aggregate`.

That is, run it like:
```sh
- env \
- DATA_METRICS_OUTPUT_MODE=aggregate \
- cumulus-library build ...
+ cumulus-library build --option output-mode:aggregate ...
```
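
To make the CUBE-vs-aggregate distinction above concrete, here is a small DuckDB illustration (not part of this diff; the table and column names are made up): `CUBE()` emits partial roll-up rows with NULLs for the rolled-up columns plus a grand total, while the aggregate mode keeps only the full value combinations.

```python
import duckdb

# Toy stand-in for a metric's value combinations.
con = duckdb.connect()
con.execute("CREATE TABLE vals (category TEXT, status TEXT)")
con.execute(
    "INSERT INTO vals VALUES "
    "('laboratory', 'final'), ('laboratory', 'amended'), ('vital-signs', 'final')"
)

# CUBE output mode: every subset of the grouping columns, so partial
# combinations appear as rows with NULLs, plus a grand-total row.
cube_rows = con.execute(
    "SELECT category, status, COUNT(*) FROM vals GROUP BY CUBE (category, status)"
).fetchall()

# Aggregate output mode: plain GROUP BY, only the combinations that occur.
agg_rows = con.execute(
    "SELECT category, status, COUNT(*) FROM vals GROUP BY category, status"
).fetchall()

print(len(cube_rows))  # 8 rows: 3 full combos + 4 partial roll-ups + 1 grand total
print(len(agg_rows))   # 3 rows
```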
19 changes: 14 additions & 5 deletions cumulus_library_data_metrics/base.py
@@ -2,6 +2,7 @@

import copy
import os.path
+ import sys
from typing import ClassVar

import jinja2
@@ -18,6 +19,7 @@ class MetricMixin:
def __init__(self):
super().__init__()
self.display_text = f"Creating {self.name} tables…"
self.output_mode = "cube"
self.summary_entries = {}
# A "group" is a value in the second column - a secondary characteristic like "field"
# (group examples: "code", "valueCodeableConcept") or a stratifier like "profile"
@@ -103,16 +105,23 @@ def prepare_queries(
config: base_utils.StudyConfig,
**kwargs,
) -> None:
+ self.output_mode = self.get_output_mode(config)
self._query_schema(config)
self.extra_schema_checks(config)
self.add_metric_queries()

- def get_output_mode(self) -> str:
- # TODO: add the ability for cumulus-library to take study args like
- # --study-option=output-mode:cube (or whatever)
- output_mode = os.environ.get("DATA_METRICS_OUTPUT_MODE")
+ def get_output_mode(self, config: base_utils.StudyConfig) -> str:
+ output_mode = (
+ (config.options and config.options.get("output-mode"))
+ # Deprecated approach (before --option existed) -- let it lie for now
+ or os.environ.get("DATA_METRICS_OUTPUT_MODE")
+ )
if output_mode not in {"aggregate", "cube"}:
output_mode = "cube"
+ print(
+ f"Did not understand output mode '{output_mode}'. Using 'cube' instead.",
+ file=sys.stderr,
+ )
return output_mode

def render_sql(self, template: str, **kwargs) -> str:
@@ -125,7 +134,7 @@ def render_sql(self, template: str, **kwargs) -> str:
kwargs.update(resource_info.CATEGORIES.get(src, {}))

# See how we should combine counts.
kwargs["output_mode"] = self.get_output_mode()
kwargs["output_mode"] = self.output_mode

with open(f"{path}/{self.name}/{template}.jinja") as file:
template = file.read()
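
Not part of this diff, but as a usage illustration of the precedence above: a minimal standalone sketch in which a plain dict stands in for `config.options`. The `--option` value wins, the deprecated env var is the fallback, and anything missing or unrecognized drops to `cube` (the warning handling here is simplified and may not match the committed code word for word). Bogus and missing values both land on `cube`, which is what the updated test further down verifies.

```python
import os
import sys


def resolve_output_mode(options: dict | None) -> str:
    """Sketch of the precedence: CLI --option first, then the deprecated env var."""
    requested = (options and options.get("output-mode")) or os.environ.get(
        "DATA_METRICS_OUTPUT_MODE"
    )
    if requested in {"aggregate", "cube"}:
        return requested
    if requested:  # a value was supplied, but it isn't one we recognize
        print(f"Did not understand output mode '{requested}'. Using 'cube' instead.", file=sys.stderr)
    return "cube"


print(resolve_output_mode({"output-mode": "aggregate"}))  # aggregate (new --option path)
print(resolve_output_mode({"output-mode": "bogus"}))      # cube, with a warning
print(resolve_output_mode(None))                          # cube, unless the env var is set
```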
2 changes: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ def make_table(self, **kwargs) -> None:
# at a metric like this one with both mandatory and must-support fields.
kwargs["skip_duplicated_mandatory_checks"] = True

- if self.get_output_mode() == "cube" and "mandatory_split" in kwargs:
+ if self.output_mode == "cube" and "mandatory_split" in kwargs:
kwargs["table_max"] = kwargs["mandatory_split"]
self.queries += [
self.render_sql("mandatory", table_num=i + 1, **kwargs)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -2,7 +2,7 @@
name = "cumulus-library-data-metrics"
requires-python = ">= 3.10"
dependencies = [
"cumulus-library >= 2.2, < 3",
"cumulus-library >= 2.3, < 3",
]
description = "Data quality and characterization metrics for Cumulus"
readme = "README.md"
51 changes: 27 additions & 24 deletions tests/test_metrics.py
@@ -13,8 +13,6 @@
from cumulus_library import cli


- # Use aggregate mode by default because it produces less noisy CSVs
- @mock.patch.dict(os.environ, {"DATA_METRICS_OUTPUT_MODE": "aggregate"})
@ddt.ddt
class MetricsTestCase(unittest.TestCase):
"""Test case for data metrics"""
@@ -28,16 +26,19 @@ def test_c_pt_count(self):
def test_c_pt_count_no_ext(self):
self.run_study("c_pt_count", test="no-ext", prefix="count_")

- def test_c_pt_count_cubed(self):
- # Test directly asking for cube mode
+ def test_cubed_output_mode(self):
+ # Test directly asking for cube mode via env var
with mock.patch.dict(os.environ, {"DATA_METRICS_OUTPUT_MODE": "cube"}):
self.run_study("c_pt_count", test="cubed", prefix="count_")
self.run_study("c_pt_count", test="cubed", prefix="count_", output=None)

- # Now do the same test but without any env var, to confirm the default is cube
- env = dict(os.environ)
- del env["DATA_METRICS_OUTPUT_MODE"]
- with mock.patch.dict(os.environ, env, clear=True):
- self.run_study("c_pt_count", test="cubed", prefix="count_")
+ # Test directly asking for cube mode via CLI option
+ self.run_study("c_pt_count", test="cubed", prefix="count_", output="cube")
+
+ # Now do the same test but with a bogus arg, confirm it falls back to cube
+ self.run_study("c_pt_count", test="cubed", prefix="count_", output="bogus")
+
+ # Now do the same test but without any input, to confirm the default is cube
+ self.run_study("c_pt_count", test="cubed", prefix="count_", output=None)

def test_c_pt_deceased_count(self):
self.run_study("c_pt_deceased_count", prefix="count_")
@@ -58,8 +59,7 @@ def test_c_us_core_v4_count(self):
def test_c_us_core_v4_count_cubed(self):
# We have special support for cutting up observation profiles into multiple
# tables in cube mode.
- with mock.patch.dict(os.environ, {"DATA_METRICS_OUTPUT_MODE": "cube"}):
- self.run_study("c_us_core_v4_count", test="cubed", prefix="count_")
+ self.run_study("c_us_core_v4_count", test="cubed", prefix="count_", output="cube")

def test_q_date_recent(self):
self.run_study("q_date_recent")
@@ -98,7 +98,9 @@ def setUp(self):
super().setUp()
self.maxDiff = None

- def run_study(self, metric: str, test: str = "general", prefix: str = "") -> None:
+ def run_study(
+ self, metric: str, test: str = "general", prefix: str = "", output="aggregate"
+ ) -> None:
"""Runs a single test case"""
test_dir = os.path.dirname(__file__)
root_dir = os.path.dirname(test_dir)
@@ -142,17 +144,18 @@ def run_study(self, metric: str, test: str = "general", prefix: str = "") -> None:
"""
)

- cli.main(
- [
- "build",
- # "--verbose",
- "--target=data_metrics",
- f"--study-dir={tmpdir}/cumulus_library_data_metrics",
- "--db-type=duckdb",
- f"--database={tmpdir}/duck.db",
- f"--load-ndjson-dir={data_dir}",
- ]
- )
+ args = [
+ "build",
+ # "--verbose",
+ "--target=data_metrics",
+ f"--study-dir={tmpdir}/cumulus_library_data_metrics",
+ "--db-type=duckdb",
+ f"--database={tmpdir}/duck.db",
+ f"--load-ndjson-dir={data_dir}",
+ ]
+ if output:
+ args.append(f"--option=output-mode:{output}")
+ cli.main(args)
db = duckdb.connect(f"{tmpdir}/duck.db")

# Uncomment this for extra debugging