Merge pull request #55 from smart-on-fhir/mikix/option
feat: support --option output-mode:aggregate in addition to env var
mikix authored Jul 10, 2024
2 parents 57b6237 + fd71835 commit 00da4f8
Showing 5 changed files with 45 additions and 35 deletions.
6 changes: 2 additions & 4 deletions README.md
@@ -93,11 +93,9 @@ cumulus-library export \
This study generates `CUBE` output by default.
If it's easier to work with simple aggregate counts of every value combination
(that is, without the partial value combinations that `CUBE()` generates),
- run the build step with `DATA_METRICS_OUTPUT_MODE=aggregate` in your environment.
+ run the build step with `--option output-mode:aggregate`.

That is, run it like:
```sh
- env \
- DATA_METRICS_OUTPUT_MODE=aggregate \
- cumulus-library build ...
+ cumulus-library build --option output-mode:aggregate ...
```
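
To make the CUBE-vs-aggregate distinction above concrete, here is a small DuckDB illustration (not part of this diff; the table and column names are made up): `CUBE()` emits partial roll-up rows with NULLs for the rolled-up columns plus a grand total, while the aggregate mode keeps only the full value combinations.

```python
import duckdb

# Toy stand-in for a metric's value combinations.
con = duckdb.connect()
con.execute("CREATE TABLE vals (category TEXT, status TEXT)")
con.execute(
    "INSERT INTO vals VALUES "
    "('laboratory', 'final'), ('laboratory', 'amended'), ('vital-signs', 'final')"
)

# CUBE output mode: every subset of the grouping columns, so partial
# combinations appear as rows with NULLs, plus a grand-total row.
cube_rows = con.execute(
    "SELECT category, status, COUNT(*) FROM vals GROUP BY CUBE (category, status)"
).fetchall()

# Aggregate output mode: plain GROUP BY, only the combinations that occur.
agg_rows = con.execute(
    "SELECT category, status, COUNT(*) FROM vals GROUP BY category, status"
).fetchall()

print(len(cube_rows))  # 8 rows: 3 full combos + 4 partial roll-ups + 1 grand total
print(len(agg_rows))   # 3 rows
```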
19 changes: 14 additions & 5 deletions cumulus_library_data_metrics/base.py
@@ -2,6 +2,7 @@

import copy
import os.path
+ import sys
from typing import ClassVar

import jinja2
@@ -18,6 +19,7 @@ class MetricMixin:
def __init__(self):
super().__init__()
self.display_text = f"Creating {self.name} tables…"
self.output_mode = "cube"
self.summary_entries = {}
# A "group" is a value in the second column - a secondary characteristic like "field"
# (group examples: "code", "valueCodeableConcept") or a stratifier like "profile"
@@ -103,16 +105,23 @@ def prepare_queries(
config: base_utils.StudyConfig,
**kwargs,
) -> None:
+ self.output_mode = self.get_output_mode(config)
self._query_schema(config)
self.extra_schema_checks(config)
self.add_metric_queries()

- def get_output_mode(self) -> str:
- # TODO: add the ability for cumulus-library to take study args like
- # --study-option=output-mode:cube (or whatever)
- output_mode = os.environ.get("DATA_METRICS_OUTPUT_MODE")
+ def get_output_mode(self, config: base_utils.StudyConfig) -> str:
+ output_mode = (
+ (config.options and config.options.get("output-mode"))
+ # Deprecated approach (before --option existed) -- let it lie for now
+ or os.environ.get("DATA_METRICS_OUTPUT_MODE")
+ )
if output_mode not in {"aggregate", "cube"}:
output_mode = "cube"
+ print(
+ f"Did not understand output mode '{output_mode}'. Using 'cube' instead.",
+ file=sys.stderr,
+ )
return output_mode

def render_sql(self, template: str, **kwargs) -> str:
@@ -125,7 +134,7 @@ def render_sql(self, template: str, **kwargs) -> str:
kwargs.update(resource_info.CATEGORIES.get(src, {}))

# See how we should combine counts.
kwargs["output_mode"] = self.get_output_mode()
kwargs["output_mode"] = self.output_mode

with open(f"{path}/{self.name}/{template}.jinja") as file:
template = file.read()
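
Not part of this diff, but as a usage illustration of the precedence above: a minimal standalone sketch in which a plain dict stands in for `config.options`. The `--option` value wins, the deprecated env var is the fallback, and anything missing or unrecognized drops to `cube` (the warning handling here is simplified and may not match the committed code word for word). Bogus and missing values both land on `cube`, which is what the updated test further down verifies.

```python
import os
import sys


def resolve_output_mode(options: dict | None) -> str:
    """Sketch of the precedence: CLI --option first, then the deprecated env var."""
    requested = (options and options.get("output-mode")) or os.environ.get(
        "DATA_METRICS_OUTPUT_MODE"
    )
    if requested in {"aggregate", "cube"}:
        return requested
    if requested:  # a value was supplied, but it isn't one we recognize
        print(f"Did not understand output mode '{requested}'. Using 'cube' instead.", file=sys.stderr)
    return "cube"


print(resolve_output_mode({"output-mode": "aggregate"}))  # aggregate (new --option path)
print(resolve_output_mode({"output-mode": "bogus"}))      # cube, with a warning
print(resolve_output_mode(None))                          # cube, unless the env var is set
```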
2 changes: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ def make_table(self, **kwargs) -> None:
# at a metric like this one with both mandatory and must-support fields.
kwargs["skip_duplicated_mandatory_checks"] = True

- if self.get_output_mode() == "cube" and "mandatory_split" in kwargs:
+ if self.output_mode == "cube" and "mandatory_split" in kwargs:
kwargs["table_max"] = kwargs["mandatory_split"]
self.queries += [
self.render_sql("mandatory", table_num=i + 1, **kwargs)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -2,7 +2,7 @@
name = "cumulus-library-data-metrics"
requires-python = ">= 3.10"
dependencies = [
"cumulus-library >= 2.2, < 3",
"cumulus-library >= 2.3, < 3",
]
description = "Data quality and characterization metrics for Cumulus"
readme = "README.md"
51 changes: 27 additions & 24 deletions tests/test_metrics.py
@@ -13,8 +13,6 @@
from cumulus_library import cli


- # Use aggregate mode by default because it produces less noisy CSVs
- @mock.patch.dict(os.environ, {"DATA_METRICS_OUTPUT_MODE": "aggregate"})
@ddt.ddt
class MetricsTestCase(unittest.TestCase):
"""Test case for data metrics"""
@@ -28,16 +26,19 @@ def test_c_pt_count(self):
def test_c_pt_count_no_ext(self):
self.run_study("c_pt_count", test="no-ext", prefix="count_")

- def test_c_pt_count_cubed(self):
- # Test directly asking for cube mode
+ def test_cubed_output_mode(self):
+ # Test directly asking for cube mode via env var
with mock.patch.dict(os.environ, {"DATA_METRICS_OUTPUT_MODE": "cube"}):
self.run_study("c_pt_count", test="cubed", prefix="count_")
self.run_study("c_pt_count", test="cubed", prefix="count_", output=None)

- # Now do the same test but without any env var, to confirm the default is cube
- env = dict(os.environ)
- del env["DATA_METRICS_OUTPUT_MODE"]
- with mock.patch.dict(os.environ, env, clear=True):
- self.run_study("c_pt_count", test="cubed", prefix="count_")
+ # Test directly asking for cube mode via CLI option
+ self.run_study("c_pt_count", test="cubed", prefix="count_", output="cube")
+
+ # Now do the same test but with a bogus arg, confirm it falls back to cube
+ self.run_study("c_pt_count", test="cubed", prefix="count_", output="bogus")
+
+ # Now do the same test but without any input, to confirm the default is cube
+ self.run_study("c_pt_count", test="cubed", prefix="count_", output=None)

def test_c_pt_deceased_count(self):
self.run_study("c_pt_deceased_count", prefix="count_")
@@ -58,8 +59,7 @@ def test_c_us_core_v4_count(self):
def test_c_us_core_v4_count_cubed(self):
# We have special support for cutting up observation profiles into multiple
# tables in cube mode.
- with mock.patch.dict(os.environ, {"DATA_METRICS_OUTPUT_MODE": "cube"}):
- self.run_study("c_us_core_v4_count", test="cubed", prefix="count_")
+ self.run_study("c_us_core_v4_count", test="cubed", prefix="count_", output="cube")

def test_q_date_recent(self):
self.run_study("q_date_recent")
@@ -98,7 +98,9 @@ def setUp(self):
super().setUp()
self.maxDiff = None

- def run_study(self, metric: str, test: str = "general", prefix: str = "") -> None:
+ def run_study(
+ self, metric: str, test: str = "general", prefix: str = "", output="aggregate"
+ ) -> None:
"""Runs a single test case"""
test_dir = os.path.dirname(__file__)
root_dir = os.path.dirname(test_dir)
@@ -142,17 +144,18 @@ def run_study(self, metric: str, test: str = "general", prefix: str = "") -> None:
"""
)

- cli.main(
- [
- "build",
- # "--verbose",
- "--target=data_metrics",
- f"--study-dir={tmpdir}/cumulus_library_data_metrics",
- "--db-type=duckdb",
- f"--database={tmpdir}/duck.db",
- f"--load-ndjson-dir={data_dir}",
- ]
- )
+ args = [
+ "build",
+ # "--verbose",
+ "--target=data_metrics",
+ f"--study-dir={tmpdir}/cumulus_library_data_metrics",
+ "--db-type=duckdb",
+ f"--database={tmpdir}/duck.db",
+ f"--load-ndjson-dir={data_dir}",
+ ]
+ if output:
+ args.append(f"--option=output-mode:{output}")
+ cli.main(args)
db = duckdb.connect(f"{tmpdir}/duck.db")

# Uncomment this for extra debugging