From 46ab43110f0659571a0a9e3bfc11ac56d84f4215 Mon Sep 17 00:00:00 2001 From: Devanshu Kumar Singh <43195822+Devanshusisodiya@users.noreply.github.com> Date: Sun, 11 Feb 2024 15:29:04 +0000 Subject: [PATCH 1/7] fix for warning if column is multi categorical in h2o.cor() --- h2o-py/h2o/frame.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py index 59dc1d053547..c13ea7c1d89d 100644 --- a/h2o-py/h2o/frame.py +++ b/h2o-py/h2o/frame.py @@ -3208,8 +3208,21 @@ def cor(self, y=None, na_rm=False, use=None, method="Pearson"): assert_is_type(y, H2OFrame, None) assert_is_type(na_rm, bool) assert_is_type(use, None, "everything", "all.obs", "complete.obs") + y_categorical = any(y[col_name].isfactor() for col_name in y) + if y is None: y = self + + if y_categorical: + num_unique_levels = {col: self[col].nlevels()[0] for col in y} + multi_categorical = any(num_levels > 2 for num_levels in num_unique_levels.values()) + + if multi_categorical: + import warnings + for col, nlevel in num_unique_levels.items(): + if nlevel > 2: + warnings.warn("Column {} contains {} levels. Only numerical and categorical columns are supported.".format(col, nlevel)) + if use is None: use = "complete.obs" if na_rm else "everything" if self.nrow == 1 or (self.ncol == 1 and y.ncol == 1): return ExprNode("cor", self, y, use, method)._eager_scalar() return H2OFrame._expr(expr=ExprNode("cor", self, y, use, method))._frame() From b328502e864f0e0a7ef83f25192bc5c62ca3e240 Mon Sep 17 00:00:00 2001 From: Devanshu Kumar Singh <43195822+Devanshusisodiya@users.noreply.github.com> Date: Mon, 12 Feb 2024 03:38:07 +0000 Subject: [PATCH 2/7] added fix for warning if column is multi categorical in R --- h2o-r/h2o-package/R/frame.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index c1e46f9ac5f0..951706c226c6 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -2902,6 +2902,13 @@ h2o.cor <- function(x, y=NULL,na.rm = FALSE, use, method="Pearson"){ if (is.null(method) || is.na(method)) { stop("Correlation method must be specified.") } + + # Check for categorical columns in x and y + x_categorical <- any(h2o.isfactor(x)) + y_categorical <- any(h2o.isfactor(y)) + + if ((x_categorical && length(unique(h2o.levels(x))) > 2) || (y_categorical && length(unique(h2o.levels(y))) > 2)) { + warning("Column x or y contains more than 2 levels. Only numerical and categorical columns are supported.") # Eager, mostly to match prior semantics but no real reason it need to be expr <- .newExpr("cor",x,y,.quote(use), .quote(method)) From af62fa4a23bfbcfc4e5a18ed6dbd36655529ba0e Mon Sep 17 00:00:00 2001 From: Devanshusisodiya Date: Tue, 13 Feb 2024 15:21:29 +0530 Subject: [PATCH 3/7] fix for issue #15947 --- h2o-py/h2o/frame.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py index c13ea7c1d89d..a86b62ec5adb 100644 --- a/h2o-py/h2o/frame.py +++ b/h2o-py/h2o/frame.py @@ -146,6 +146,12 @@ def _upload_python_object(self, python_obj, destination_frame=None, header=0, se if not column_names: column_names = col_header + if skipped_columns: + column_names = [column_name for column_name in column_names[:len(column_names)-len(skipped_columns)]] + + if not column_names: + raise H2OValueError("skipped_columns cannot contain all columns") + # create a temporary file that will be written to tmp_handle, tmp_path = tempfile.mkstemp(suffix=".csv") tmp_file = os.fdopen(tmp_handle, 'w', **H2OFrame.__fdopen_kwargs) From d012d77a14f865402c60828051826c08ae399f86 Mon Sep 17 00:00:00 2001 From: Devanshusisodiya Date: Tue, 13 Feb 2024 15:39:21 +0530 Subject: [PATCH 4/7] wrote test for fix #15947 --- .../Data_Manipulation/pyunit_h2oH2OFrame.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py index b9f101dda2a7..7dfbe120c831 100644 --- a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py +++ b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py @@ -125,10 +125,16 @@ def H2OFrame_from_H2OFrame(): assert dupl4.columns == ["n1", "s1"] -def H2OFrame_skipped_columns_is_BUGGY(): +def H2OFrame_skipped_columns(): try: - h2o.H2OFrame(data, skipped_columns=[1]) - assert False, "skipped_columns handling may be fixed now" # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 + fr = h2o.H2OFrame([ + [1, "a", 1], + [2, "b", 0], + [3, "", 1], + ] + ) + skipped_columns_frame = h2o.H2OFrame(data, skipped_columns=[1]) + assert skipped_columns_frame == fr # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 except ValueError as e: assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e) From 68e7d13c032f63155caff7c2fff3c9734caa1498 Mon Sep 17 00:00:00 2001 From: Devanshusisodiya Date: Tue, 13 Feb 2024 15:44:19 +0530 Subject: [PATCH 5/7] undo commit --- .../Data_Manipulation/pyunit_h2oH2OFrame.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py index 7dfbe120c831..b9f101dda2a7 100644 --- a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py +++ b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py @@ -125,16 +125,10 @@ def H2OFrame_from_H2OFrame(): assert dupl4.columns == ["n1", "s1"] -def H2OFrame_skipped_columns(): +def H2OFrame_skipped_columns_is_BUGGY(): try: - fr = h2o.H2OFrame([ - [1, "a", 1], - [2, "b", 0], - [3, "", 1], - ] - ) - skipped_columns_frame = h2o.H2OFrame(data, skipped_columns=[1]) - assert skipped_columns_frame == fr # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 + h2o.H2OFrame(data, skipped_columns=[1]) + assert False, "skipped_columns handling may be fixed now" # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 except ValueError as e: assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e) From 2358b35ec02b7f322eb82f5139b1fe705a1fb2b5 Mon Sep 17 00:00:00 2001 From: Devanshusisodiya Date: Tue, 13 Feb 2024 15:49:26 +0530 Subject: [PATCH 6/7] undo changes --- h2o-py/h2o/frame.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py index a86b62ec5adb..59dc1d053547 100644 --- a/h2o-py/h2o/frame.py +++ b/h2o-py/h2o/frame.py @@ -146,12 +146,6 @@ def _upload_python_object(self, python_obj, destination_frame=None, header=0, se if not column_names: column_names = col_header - if skipped_columns: - column_names = [column_name for column_name in column_names[:len(column_names)-len(skipped_columns)]] - - if not column_names: - raise H2OValueError("skipped_columns cannot contain all columns") - # create a temporary file that will be written to tmp_handle, tmp_path = tempfile.mkstemp(suffix=".csv") tmp_file = os.fdopen(tmp_handle, 'w', **H2OFrame.__fdopen_kwargs) @@ -3214,21 +3208,8 @@ def cor(self, y=None, na_rm=False, use=None, method="Pearson"): assert_is_type(y, H2OFrame, None) assert_is_type(na_rm, bool) assert_is_type(use, None, "everything", "all.obs", "complete.obs") - y_categorical = any(y[col_name].isfactor() for col_name in y) - if y is None: y = self - - if y_categorical: - num_unique_levels = {col: self[col].nlevels()[0] for col in y} - multi_categorical = any(num_levels > 2 for num_levels in num_unique_levels.values()) - - if multi_categorical: - import warnings - for col, nlevel in num_unique_levels.items(): - if nlevel > 2: - warnings.warn("Column {} contains {} levels. Only numerical and categorical columns are supported.".format(col, nlevel)) - if use is None: use = "complete.obs" if na_rm else "everything" if self.nrow == 1 or (self.ncol == 1 and y.ncol == 1): return ExprNode("cor", self, y, use, method)._eager_scalar() return H2OFrame._expr(expr=ExprNode("cor", self, y, use, method))._frame() From 6d2d82aa10f644b2504a5b9b414404e507a85f9b Mon Sep 17 00:00:00 2001 From: Devanshusisodiya Date: Tue, 13 Feb 2024 15:59:49 +0530 Subject: [PATCH 7/7] undo changes --- h2o-r/h2o-package/R/frame.R | 7 ------- 1 file changed, 7 deletions(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 951706c226c6..b8f3c5b67d72 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -2903,13 +2903,6 @@ h2o.cor <- function(x, y=NULL,na.rm = FALSE, use, method="Pearson"){ stop("Correlation method must be specified.") } - # Check for categorical columns in x and y - x_categorical <- any(h2o.isfactor(x)) - y_categorical <- any(h2o.isfactor(y)) - - if ((x_categorical && length(unique(h2o.levels(x))) > 2) || (y_categorical && length(unique(h2o.levels(y))) > 2)) { - warning("Column x or y contains more than 2 levels. Only numerical and categorical columns are supported.") - # Eager, mostly to match prior semantics but no real reason it need to be expr <- .newExpr("cor",x,y,.quote(use), .quote(method)) if( (nrow(x)==1L || (ncol(x)==1L && ncol(y)==1L)) ) .eval.scalar(expr)