diff --git a/c/datatablemodule.cc b/c/datatablemodule.cc index e84a8a2ad9..7d2143da6c 100644 --- a/c/datatablemodule.cc +++ b/c/datatablemodule.cc @@ -13,7 +13,7 @@ #include "csv/writer.h" #include "expr/py_expr.h" #include "extras/aggregator.h" -#include "extras/ftrl.h" +#include "extras/py_ftrl.h" #include "frame/py_frame.h" #include "options.h" #include "py_column.h" @@ -199,7 +199,6 @@ void DatatableModule::init_methods() { add(METHODv(expr_unaryop)); add(METHOD0(is_debug_mode)); add(METHOD0(has_omp_support)); - init_methods_ftrl(); init_methods_aggregate(); init_methods_str(); init_methods_options(); @@ -234,6 +233,8 @@ PyInit__datatable() try { py::Frame::Type::init(m); + py::Ftrl::Type::init(m); + } catch (const std::exception& e) { exception_to_python(e); return nullptr; diff --git a/c/datatablemodule.h b/c/datatablemodule.h index 78a8cdb96b..8547314c8c 100644 --- a/c/datatablemodule.h +++ b/c/datatablemodule.h @@ -29,7 +29,6 @@ class DatatableModule : public py::ExtModule { void init_methods(); void init_methods_aggregate();// extra/aggergate.cc - void init_methods_ftrl(); // extra/ftrl.cc void init_methods_str(); // str/py_str.cc void init_methods_options(); // options.cc void init_methods_sets(); // set_funcs.cc diff --git a/c/extras/aggregator.cc b/c/extras/aggregator.cc index 9157b5bc0d..f128386c46 100644 --- a/c/extras/aggregator.cc +++ b/c/extras/aggregator.cc @@ -45,9 +45,9 @@ namespace py { // dt changes in-place with a new column added to the end of it DataTable* dt_members = agg.aggregate(dt).release(); - py::Frame* frame_members = py::Frame::from_datatable(dt_members); + py::oobj df_members = py::oobj::from_new_reference(py::Frame::from_datatable(dt_members)); - return frame_members; + return df_members; } ); } diff --git a/c/extras/ftrl.cc b/c/extras/ftrl.cc index b0df7f29c1..94f2749c07 100644 --- a/c/extras/ftrl.cc +++ b/c/extras/ftrl.cc @@ -25,86 +25,65 @@ #include "utils/parallel.h" #include "datatablemodule.h" + +/* +* Set column names for 
`dt_model`. +*/ +const std::vector FtrlModel::model_cols = {"z", "n"}; +const FtrlModelParams FtrlModel::fmp_default = {0.005, 1.0, 0.0, 1.0, 1000000, 1, 1, 0, false}; + /* -* Read data from Python, train FTRL model and make predictions. +* Set up FTRL parameters and initialize weights. */ -namespace py { - static PKArgs ftrl( - 11, 0, 0, false, false, - {"df_train", "df_test", "a", "b", "l1", "l2", "d", "n_epochs", "inter", - "hash_type", "seed"}, "ftrl", "", - [](const py::PKArgs& args) -> py::oobj { - DataTable* dt_train = args[0].to_frame(); - DataTable* dt_test = args[1].to_frame(); +FtrlModel::FtrlModel(FtrlModelParams fmp_in) +{ + // Set model parameters + fmp = fmp_in; + // Create and initialize model datatable and weight vector. + create_model(); + init_model(); +} + - double a = args[2].to_double(); - double b = args[3].to_double(); - double l1 = args[4].to_double(); - double l2 = args[5].to_double(); +void FtrlModel::init_model() { + model_trained = false; + z = static_cast(dt_model->columns[0]->data_w()); + n = static_cast(dt_model->columns[1]->data_w()); + std::memset(z, 0, fmp.d * sizeof(double)); + std::memset(n, 0, fmp.d * sizeof(double)); +} - uint64_t d = static_cast(args[6].to_size_t()); - size_t n_epochs = args[7].to_size_t(); - bool inter = args[8].to_bool_strict(); - unsigned int hash_type = static_cast(args[9].to_size_t()); - unsigned int seed = static_cast(args[10].to_size_t()); - Ftrl ft(a, b, l1, l2, d, n_epochs, inter, hash_type, seed); - ft.train(dt_train); - DataTable* dt_target = ft.test(dt_test).release(); - py::Frame* frame_target = py::Frame::from_datatable(dt_target); +void FtrlModel::create_model() { + w = DoublePtr(new double[fmp.d]()); - return frame_target; - } - ); + Column* col_z = Column::new_data_column(SType::FLOAT64, fmp.d); + Column* col_n = Column::new_data_column(SType::FLOAT64, fmp.d); + dt_model = dtptr(new DataTable({col_z, col_n}, model_cols)); } -/* -* Set up FTRL parameters and initialize weights. 
-*/ -Ftrl::Ftrl(double a_in, double b_in, double l1_in, double l2_in, - uint64_t d_in, size_t nepochs_in, bool inter_in, - unsigned int hash_type_in, unsigned int seed_in) : - a(a_in), - b(b_in), - l1(l1_in), - l2(l2_in), - d(d_in), - n_epochs(nepochs_in), - hash_type(hash_type_in), - seed(seed_in), - inter(inter_in) -{ - n = DoublePtr(new double[d]()); - w = DoublePtr(new double[d]()); - - // Initialize weights with random [0; 1] numbers - z = DoublePtr(new double[d]); - srand(seed); - for (uint64_t i = 0; i < d; ++i){ - z[i] = static_cast(rand()) / RAND_MAX; - } +bool FtrlModel::is_trained() { + return model_trained; } /* * Train FTRL model on a training dataset. */ -void Ftrl::train(const DataTable* dt) { - // Define number of features that equal to one bias term - // plus `dt->ncols - 1` columns. We assume that the target column - // is the last one. - n_features = dt->ncols; +void FtrlModel::fit(const DataTable* dt) { + // Define number of features assuming that the target column is the last one. + n_features = dt->ncols - 1; // Define number of feature interactions. - n_inter_features = (inter)? (n_features - 1) * (n_features - 2) / 2 : 0; + n_inter_features = (fmp.inter)? n_features * (n_features - 1) / 2 : 0; // Get the target column. auto c_target = static_cast(dt->columns[dt->ncols - 1]); auto d_target = c_target->elements_r(); // Do training for `n_epochs`. - for (size_t i = 0; i < n_epochs; ++i) { + for (size_t i = 0; i < fmp.n_epochs; ++i) { double total_loss = 0; int32_t nth = config::nthreads; @@ -112,8 +91,6 @@ void Ftrl::train(const DataTable* dt) { { // Array to store hashed features and feature interactions. Uint64Ptr x = Uint64Ptr(new uint64_t[n_features + n_inter_features]); - // Bias term, do we need it? Results are quite similar with/without bias. 
- x[0] = 0; int32_t ith = omp_get_thread_num(); nth = omp_get_num_threads(); @@ -123,29 +100,30 @@ void Ftrl::train(const DataTable* dt) { bool target = d_target[j]; hash_row(x, dt, j); - double p = predict(x, n_features + n_inter_features); - double loss = logloss(p, target); + double p = predict_row(x, n_features + n_inter_features); + update(x, n_features + n_inter_features, p, target); + double loss = logloss(p, target); #pragma omp atomic update total_loss += loss; - if ((j+1) % REPORT_FREQUENCY == 0) { printf("Training epoch: %zu\tRow: %zu\tPrediction: %f\t" "Current loss: %f\tAverage loss: %f\n", i, j+1, p, loss, total_loss / (j+1)); } - update(x, n_features + n_inter_features, p, target); } } } + model_trained = true; } /* * Make predictions for a test dataset and return targets as a new datatable. +* We assume that all the validation is done in `py_ftrl.cc`. */ -dtptr Ftrl::test(const DataTable* dt) { - // Create a datatable to store targets. +dtptr FtrlModel::predict(const DataTable* dt) { + // Create a target datatable. dtptr dt_target = nullptr; Column* col_target = Column::new_data_column(SType::FLOAT64, dt->nrows); dt_target = dtptr(new DataTable({col_target}, {"target"})); @@ -156,7 +134,6 @@ dtptr Ftrl::test(const DataTable* dt) { #pragma omp parallel num_threads(nth) { Uint64Ptr x = Uint64Ptr(new uint64_t[n_features + n_inter_features]); - x[0] = 0; int32_t ith = omp_get_thread_num(); nth = omp_get_num_threads(); @@ -165,13 +142,12 @@ dtptr Ftrl::test(const DataTable* dt) { j+= static_cast(nth)) { hash_row(x, dt, j); - d_target[j] = predict(x, n_features + n_inter_features); + d_target[j] = predict_row(x, n_features + n_inter_features); if ((j+1) % REPORT_FREQUENCY == 0) { printf("Row: %zu\tPrediction: %f\n", j+1, d_target[j]); } } } - return dt_target; } @@ -179,14 +155,15 @@ dtptr Ftrl::test(const DataTable* dt) { /* * Make a prediction for an array of hashed features. 
*/ -double Ftrl::predict(const Uint64Ptr& x, size_t x_size) { +double FtrlModel::predict_row(const Uint64Ptr& x, size_t x_size) { double wTx = 0; for (size_t j = 0; j < x_size; ++j) { size_t i = x[j]; - if (fabs(z[i]) <= l1) { + if (fabs(z[i]) <= fmp.l1) { w[i] = 0; } else { - w[i] = (signum(z[i]) * l1 - z[i]) / ((b + sqrt(n[i])) / a + l2); + w[i] = (signum(z[i]) * fmp.l1 - z[i]) / + ((fmp.b + sqrt(n[i])) / fmp.a + fmp.l2); } wTx += w[i]; } @@ -198,7 +175,7 @@ double Ftrl::predict(const Uint64Ptr& x, size_t x_size) { /* * Sigmoid function. */ -inline double Ftrl::sigmoid(double x) { +inline double FtrlModel::sigmoid(double x) { double res = 1.0 / (1.0 + exp(-x)); return res; @@ -208,7 +185,7 @@ inline double Ftrl::sigmoid(double x) { /* * Bounded sigmoid function. */ -inline double Ftrl::bsigmoid(double x, double b) { +inline double FtrlModel::bsigmoid(double x, double b) { double res = 1 / (1 + exp(-std::max(std::min(x, b), -b))); return res; @@ -218,12 +195,12 @@ inline double Ftrl::bsigmoid(double x, double b) { /* * Update weights based on prediction and the actual target. */ -void Ftrl::update(const Uint64Ptr& x, size_t x_size, double p, bool target) { +void FtrlModel::update(const Uint64Ptr& x, size_t x_size, double p, bool target) { double g = p - target; for (size_t j = 0; j < x_size; ++j) { size_t i = x[j]; - double sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / a; + double sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / fmp.a; z[i] += g - sigma * w[i]; n[i] += g * g; } @@ -235,9 +212,9 @@ void Ftrl::update(const Uint64Ptr& x, size_t x_size, double p, bool target) { * for production we will remove this method and stick to the hash function, * that demonstrates the best performance. 
*/ -uint64_t Ftrl::hash_string(const char * key, size_t len) { +uint64_t FtrlModel::hash_string(const char * key, size_t len) { uint64_t res; - switch (hash_type) { + switch (fmp.hash_type) { // `std::hash` is kind of slow, because we need to convert `char*` to // `std::string`, as `std::hash` doesn't hash // the actual data. @@ -249,16 +226,16 @@ uint64_t Ftrl::hash_string(const char * key, size_t len) { } // 64 bits Murmur2 hash function. The best performer so far, // need to test it for the memory alignment issues. - case 1: res = hash_murmur2(key, len, seed); break; + case 1: res = hash_murmur2(key, len, fmp.seed); break; // 128 bits Murmur3 hash function, similar performance to `hash_murmur2`. case 2: { uint64_t h[2]; - hash_murmur3(key, len, seed, h); + hash_murmur3(key, len, fmp.seed, h); res = h[0]; break; } - default: res = hash_murmur2(key, len, seed); + default: res = hash_murmur2(key, len, fmp.seed); } return res; } @@ -267,11 +244,11 @@ uint64_t Ftrl::hash_string(const char * key, size_t len) { /* * Hash each element of the datatable row, do feature interaction is requested. */ -void Ftrl::hash_row(Uint64Ptr& x, const DataTable* dt, size_t row_id) { +void FtrlModel::hash_row(Uint64Ptr& x, const DataTable* dt, size_t row_id) { std::vector c_names = dt->get_names(); uint64_t index; - for (size_t i = 0; i < n_features - 1; ++i) { + for (size_t i = 0; i < n_features; ++i) { std::string str; Column* c = dt->columns[i]; LType ltype = info(c->stype()).ltype(); @@ -318,20 +295,21 @@ void Ftrl::hash_row(Uint64Ptr& x, const DataTable* dt, size_t row_id) { } // Add the column name hash to the hashed value, so that the same value // in different columns will result in different hashes. + // TODO: pre-hash all the column names only once. uint64_t h = hash_string(c_names[i].c_str(), c_names[i].length() * sizeof(char)); index += h; - x[i+1] = index % d; + x[i] = index % fmp.d; } // Do feature interaction if required. 
We may also want to test
-  // just a simple `h = x[i+1] + x[j+1]` approach.
+  // just a simple `h = x[i] + x[j]` approach.
   size_t count = 0;
-  if (inter) {
-    for (size_t i = 0; i < n_features - 1; ++i) {
-      for (size_t j = i + 1; j < n_features - 1; ++j) {
-        std::string s = std::to_string(x[i+1]) + std::to_string(x[j+1]);
+  if (fmp.inter) {
+    for (size_t i = 0; i + 1 < n_features; ++i) {  // safe for 0 features
+      for (size_t j = i + 1; j < n_features; ++j) {
+        std::string s = std::to_string(x[i]) + std::to_string(x[j]);
         uint64_t h = hash_string(s.c_str(), s.length() * sizeof(char));
-        x[n_features + count] = h % d;
+        x[n_features + count] = h % fmp.d;
         count++;
       }
     }
@@ -342,7 +320,7 @@ void Ftrl::hash_row(Uint64Ptr& x, const DataTable* dt, size_t row_id) {
 /*
 * Calculate logloss based on a prediction and the actual target.
 */
-double Ftrl::logloss(double p, bool target) {
+double FtrlModel::logloss(double p, bool target) {
   double epsilon = std::numeric_limits<double>::epsilon();
   p = std::max(std::min(p, 1 - epsilon), epsilon);
   if (target) {
@@ -356,7 +334,7 @@ double Ftrl::logloss(double p, bool target) {
 /*
 * Calculate signum.
 */
-inline double Ftrl::signum(double x) {
+inline double FtrlModel::signum(double x) {
   if (x > 0) return 1;
   if (x < 0) return -1;
   return 0;
@@ -366,12 +344,152 @@ inline double Ftrl::signum(double x) {
 /*
 * Hash `double` to `uint64_t` based on its bit representation.
 */
-inline uint64_t Ftrl::hash_double(double x) {
+inline uint64_t FtrlModel::hash_double(double x) {
   uint64_t* h = reinterpret_cast<uint64_t*>(&x);
   return *h;
 }
 
 
-void DatatableModule::init_methods_ftrl() {
-  ADDFN(py::ftrl);
+/*
+* Get a shallow copy of an FTRL model if available.
+*/
+DataTable* FtrlModel::get_model() {
+  if (dt_model != nullptr) {
+    return dt_model->copy();
+  } else {
+    return nullptr;
+  }
+}
+
+
+/*
+* Set an FTRL model, assuming all the validation is done in `py_ftrl.cc`.
+* `z` and `n` are re-pointed at the new datatable, otherwise they would keep
+* referring to the columns of the datatable that has just been released.
+*/
+void FtrlModel::set_model(DataTable* dt_model_in) {
+  dt_model = dtptr(dt_model_in->copy());
+  z = static_cast<double*>(dt_model->columns[0]->data_w());
+  n = static_cast<double*>(dt_model->columns[1]->data_w());
+}
+
+
+/*
+* Other getters and setters.
+* Here we assume that all the validation is done in `py_ftrl.cc`.
+*/ +double FtrlModel::get_a() { + return fmp.a; +} + + +double FtrlModel::get_b() { + return fmp.b; +} + + +double FtrlModel::get_l1() { + return fmp.l1; +} + + +double FtrlModel::get_l2() { + return fmp.l2; +} + + +uint64_t FtrlModel::get_d() { + return fmp.d; +} + + +bool FtrlModel::get_inter() { + return fmp.inter; +} + + +unsigned int FtrlModel::get_hash_type() { + return fmp.hash_type; +} + + +unsigned int FtrlModel::get_seed() { + return fmp.seed; +} + + +size_t FtrlModel::get_n_epochs() { + return fmp.n_epochs; +} + + +void FtrlModel::set_a(double a_in) { + if (fmp.a != a_in) { + fmp.a = a_in; + init_model(); + } +} + + +void FtrlModel::set_b(double b_in) { + if (fmp.b != b_in) { + fmp.b = b_in; + init_model(); + } +} + + +void FtrlModel::set_l1(double l1_in) { + if (fmp.l1 != l1_in) { + fmp.l1 = l1_in; + init_model(); + } +} + + +void FtrlModel::set_l2(double l2_in) { + if (fmp.l2 != l2_in) { + fmp.l2 = l2_in; + init_model(); + } +} + + +void FtrlModel::set_d(uint64_t d_in) { + if (fmp.d != d_in) { + fmp.d = d_in; + create_model(); + init_model(); + } +} + + +void FtrlModel::set_inter(bool inter_in) { + if (fmp.inter != inter_in) { + fmp.inter = inter_in; + init_model(); + } +} + + +void FtrlModel::set_hash_type(unsigned int hash_type_in) { + if (fmp.hash_type != hash_type_in) { + fmp.hash_type = hash_type_in; + init_model(); + } +} + + +void FtrlModel::set_seed(unsigned int seed_in) { + if (fmp.seed != seed_in) { + fmp.seed = seed_in; + init_model(); + } +} + + +void FtrlModel::set_n_epochs(size_t n_epochs_in) { + if (fmp.n_epochs != n_epochs_in) { + fmp.n_epochs = n_epochs_in; + } } diff --git a/c/extras/ftrl.h b/c/extras/ftrl.h index ac2d4d7d62..676b830822 100644 --- a/c/extras/ftrl.h +++ b/c/extras/ftrl.h @@ -25,39 +25,83 @@ typedef std::unique_ptr DoublePtr; typedef std::unique_ptr Uint64Ptr; #define REPORT_FREQUENCY 1000 -class Ftrl { +struct FtrlModelParams { + double a; + double b; + double l1; + double l2; + uint64_t d; + size_t n_epochs; + 
unsigned int hash_type; + unsigned int seed; + bool inter; + size_t : 56; +}; + + +class FtrlModel { + private: + // Datatable with `z` and `n` model values. + dtptr dt_model; + double* z; + double* n; + + // Input to the model. + FtrlModelParams fmp; + + // Calculated during the learning process. + size_t n_features; + size_t n_inter_features; + DoublePtr w; + bool model_trained; + size_t : 56; + public: - Ftrl(double, double, double, double, uint64_t, size_t, bool, - unsigned int, unsigned int); + FtrlModel(FtrlModelParams); + + static const std::vector model_cols; + static const FtrlModelParams fmp_default; - void train(const DataTable*); - dtptr test(const DataTable*); - double predict(const Uint64Ptr&, size_t); + // Learning and predicting methods. + bool is_trained(); + void fit(const DataTable*); + dtptr predict(const DataTable*); + double predict_row(const Uint64Ptr&, size_t); void update(const Uint64Ptr&, size_t, double, bool); + void init_model(); + void create_model(); - double logloss(double, bool); + // Learning helper methods. + static double logloss(double, bool); static double signum(double); static double sigmoid(double); static double bsigmoid(double, double); + // Hashing methods. uint64_t hash_string(const char *, size_t); static uint64_t hash_double(double); void hash_row(Uint64Ptr&, const DataTable*, size_t); - private: - double a; - double b; - double l1; - double l2; - size_t n_features; - size_t n_inter_features; - uint64_t d; - size_t n_epochs; - unsigned int hash_type; - unsigned int seed; - DoublePtr n; - DoublePtr z; - DoublePtr w; - bool inter; - uint64_t : 56; + // Getters and setters, some will invalidate the learning results. 
+ DataTable* get_model(); + double get_a(); + double get_b(); + double get_l1(); + double get_l2(); + uint64_t get_d(); + size_t get_n_epochs(); + unsigned int get_hash_type(); + unsigned int get_seed(); + bool get_inter(); + void set_model(DataTable*); + void set_a(double); + void set_b(double); + void set_l1(double); + void set_l2(double); + void set_d(uint64_t); + void set_n_epochs(size_t); + void set_inter(bool); + void set_hash_type(unsigned int); + void set_seed(unsigned int); }; + diff --git a/c/extras/py_ftrl.cc b/c/extras/py_ftrl.cc new file mode 100644 index 0000000000..81be7cb44d --- /dev/null +++ b/c/extras/py_ftrl.cc @@ -0,0 +1,409 @@ +//------------------------------------------------------------------------------ +// Copyright 2018 H2O.ai +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. 
+//------------------------------------------------------------------------------ +#include "extras/py_ftrl.h" +#include "frame/py_frame.h" +#include "python/float.h" +#include "python/int.h" + +namespace py { + +PKArgs Ftrl::Type::args___init__(0, 0, 9, false, false, + {"a", "b", "l1", "l2", "d", "n_epochs", + "inter", "hash_type", "seed"}, + "__init__", nullptr); + +void Ftrl::m__init__(PKArgs& args) { + FtrlModelParams fmp = FtrlModel::fmp_default; + + if (!(args[0].is_undefined() || args[0].is_none())) { + fmp.a = args[0].to_double(); + } + + if (!(args[1].is_undefined() || args[1].is_none())) { + fmp.b = args[1].to_double(); + } + + if (!(args[2].is_undefined() || args[2].is_none())) { + fmp.l1 = args[2].to_double(); + } + + if (!(args[3].is_undefined() || args[3].is_none())) { + fmp.l2 = args[3].to_double(); + } + + if (!(args[4].is_undefined() || args[4].is_none())) { + fmp.d = static_cast(args[4].to_size_t()); + } + + if (!(args[5].is_undefined() || args[5].is_none())) { + fmp.n_epochs = args[5].to_size_t(); + } + + if (!(args[6].is_undefined() || args[6].is_none())) { + fmp.inter = args[6].to_bool_strict(); + } + + if (!(args[7].is_undefined() || args[7].is_none())) { + fmp.hash_type = static_cast(args[7].to_size_t()); + } + + if (!(args[8].is_undefined() || args[8].is_none())) { + fmp.seed = static_cast(args[8].to_size_t()); + } + + fm = new FtrlModel(fmp); +} + + +void Ftrl::m__dealloc__() { + delete fm; +} + + +const char* Ftrl::Type::classname() { + return "datatable.core.Ftrl"; +} + + +const char* Ftrl::Type::classdoc() { + return R"(Follow the Regularized Leader (FTRL) model with hashing trick. + +See this reference for more details: +https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf + +Parameters +---------- +a : float + `alpha` in per-coordinate learning rate formula. +b : float + `beta` in per-coordinate learning rate formula. +l1 : float + L1 regularization parameter. +l2 : float + L2 regularization parameter. 
+d : int + Number of bins to be used after the hashing trick. +n_epochs : int + Number of epochs to train for. +inter : bool + If feature interactions to be used or not. +hash_type : int + Hashing method to use for strings: + `0` - std::hash; + `1` - Murmur2; + `2` - Murmur3. +seed: int + Seed to be used for Murmur hash functions. +)"; +} + + +void Ftrl::Type::init_methods_and_getsets(Methods& mm, GetSetters& gs) { + gs.add<&Ftrl::get_model, &Ftrl::set_model>("model", + "Frame having two columns, i.e. `z` and `n`, and `d` rows,\n" + "where `d` is a number of bins set for modeling. Both column types\n" + "must be `FLOAT64`.\n" + "NB: as the model trains, this frame will be changed in-place.\n"); + + gs.add<&Ftrl::get_a, &Ftrl::set_a>("a", "`alpha` in per-coordinate learning rate formula.\n"); + gs.add<&Ftrl::get_b, &Ftrl::set_b>("b", "`beta` in per-coordinate learning rate formula.\n"); + gs.add<&Ftrl::get_l1, &Ftrl::set_l1>("l1", "L1 regularization parameter.\n"); + gs.add<&Ftrl::get_l2, &Ftrl::set_l2>("l2", "L2 regularization parameter.\n"); + gs.add<&Ftrl::get_d, &Ftrl::set_d>("d", "Number of bins to be used after the hashing trick.\n"); + gs.add<&Ftrl::get_n_epochs, &Ftrl::set_n_epochs>("n_epochs", "Number of epochs to train for.\n"); + gs.add<&Ftrl::get_inter, &Ftrl::set_inter>("inter", "If feature interactions to be used or not.\n"); + gs.add<&Ftrl::get_hash_type, &Ftrl::set_hash_type>("hash_type", "Hashing method to use for strings.\n" + "`0` - std::hash;\n" + "`1` - Murmur2;\n" + "`2` - Murmur3.\n"); + gs.add<&Ftrl::get_seed, &Ftrl::set_seed>("seed", "Seed to be used for Murmur hash functions.\n"); + + mm.add<&Ftrl::fit, args_fit>(); + mm.add<&Ftrl::predict, args_predict>(); + mm.add<&Ftrl::reset, args_reset>(); +} + + +PKArgs Ftrl::Type::args_fit(1, 0, 0, false, false, {"frame"}, "fit", +R"(fit(self, frame) +-- + +Train an FTRL model on a dataset. + +Parameters +---------- +frame: Frame + Frame to be trained on, last column is treated as `target`. 
+ +Returns +---------- + None +)"); + + +void Ftrl::fit(const PKArgs& args) { + DataTable* dt_train = args[0].to_frame(); + fm->fit(dt_train); +} + + +PKArgs Ftrl::Type::args_predict(1, 0, 0, false, false, {"frame"}, "predict", +R"(predict(self, frame) +-- + +Make predictions for a dataset. + +Parameters +---------- +frame: Frame + Frame of shape `(nrows, ncols)` to make predictions for. It must have one + column less than the training dataset. + +Returns +---------- + A new `Frame` of shape `(nrows, 1)` with a prediction for each row. +)"); + + +oobj Ftrl::predict(const PKArgs& args) { + if (fm->is_trained()) { + DataTable* dt_test = args[0].to_frame(); + DataTable* dt_target = fm->predict(dt_test).release(); + py::oobj df_target = py::oobj::from_new_reference(py::Frame::from_datatable(dt_target)); + return df_target; + } else { + throw ValueError() << "Cannot make any predictions, because the model was not trained"; + } +} + + +PKArgs Ftrl::Type::args_reset(0, 0, 0, false, false, {}, "reset", +R"(reset(self) +-- + +Reset an FTRL model. + +Parameters +---------- + None + +Returns +---------- + None +)"); + + +void Ftrl::reset(const PKArgs&) { + fm->init_model(); +} + +/* +* Getter and setter for the model datatable. 
+*/
+oobj Ftrl::get_model() const {
+  if (fm->is_trained()) {
+    DataTable* dt_model = fm->get_model();
+    py::oobj df_model = py::oobj::from_new_reference(py::Frame::from_datatable(dt_model));
+    return df_model;
+  } else {
+    return py::None();  // nothing is exposed until the model is trained
+  }
+}
+
+
+void Ftrl::set_model(robj model) {
+  DataTable* dt_model_in = model.to_frame();
+  const std::vector<std::string>& model_cols_in = dt_model_in->get_names();
+
+  if (dt_model_in->nrows != fm->get_d() || dt_model_in->ncols != 2) {
+    throw ValueError() << "FTRL model frame must have " << fm->get_d() << " rows "
+                       << "and 2 columns, whereas your frame has " << dt_model_in->nrows
+                       << " rows and " << dt_model_in->ncols << " columns";
+  }
+
+  if (model_cols_in != FtrlModel::model_cols) {  // exactly two names here
+    throw ValueError() << "FTRL model frame must have columns named `z` and `n`, "
+                       << "whereas your frame has the following column names `" << model_cols_in[0]
+                       << "` and `" << model_cols_in[1] << "`";
+  }
+
+  if (dt_model_in->columns[0]->stype() != SType::FLOAT64 ||
+      dt_model_in->columns[1]->stype() != SType::FLOAT64) {
+    throw ValueError() << "FTRL model frame must have both column types as `float64`, "
+                       << "whereas your frame has the following column types: `"
+                       << dt_model_in->columns[0]->stype()
+                       << "` and `" << dt_model_in->columns[1]->stype() << "`";
+  }
+
+  fm->set_model(dt_model_in);  // shape, names and types are validated above
+}
+
+
+/*
+* All other getters and setters.
+*/ +oobj Ftrl::get_a() const { + return py::ofloat(fm->get_a()); +} + + +oobj Ftrl::get_b() const { + return py::ofloat(fm->get_b()); +} + + +oobj Ftrl::get_l1() const { + return py::ofloat(fm->get_l1()); +} + + +oobj Ftrl::get_l2() const { + return py::ofloat(fm->get_l2()); +} + + +oobj Ftrl::get_d() const { + return py::oint(static_cast(fm->get_d())); +} + + +oobj Ftrl::get_n_epochs() const { + return py::oint(fm->get_n_epochs()); +} + + +oobj Ftrl::get_inter() const { + return py::oint(static_cast(fm->get_inter())); +} + + +oobj Ftrl::get_hash_type() const { + return py::oint(static_cast(fm->get_hash_type())); +} + + +oobj Ftrl::get_seed() const { + return py::oint(static_cast(fm->get_seed())); +} + + +void Ftrl::set_a(robj a) { + if (!a.is_numeric()) { + throw TypeError() << "`a` must be numeric, not " + << a.typeobj(); + } + fm->set_a(a.to_double()); +} + + +void Ftrl::set_b(robj b) { + if (!b.is_numeric()) { + throw TypeError() << "`b` must be numeric, not " + << b.typeobj(); + } + fm->set_b(b.to_double()); +} + + +void Ftrl::set_l1(robj l1) { + if (!l1.is_numeric()) { + throw TypeError() << "`l1` must be numeric, not " + << l1.typeobj(); + } + fm->set_l1(l1.to_double()); +} + + +void Ftrl::set_l2(robj l2) { + if (!l2.is_numeric()) { + throw TypeError() << "`l2` must be numeric, not " + << l2.typeobj(); + } + fm->set_l2(l2.to_double()); +} + + +void Ftrl::set_d(robj d) { + if (!d.is_int()) { + throw TypeError() << "`d` must be integer, not " + << d.typeobj(); + } + int64_t d_in = d.to_int64_strict(); + if (d_in < 0) { + throw ValueError() << "`d` cannot be negative"; + } + fm->set_d(static_cast(d_in)); +} + + +void Ftrl::set_n_epochs(robj n_epochs) { + if (!n_epochs.is_int()) { + throw TypeError() << "`n_epochs` must be integer, not " + << n_epochs.typeobj(); + } + int64_t n_epochs_in = n_epochs.to_int64_strict(); + if (n_epochs_in < 0) { + throw ValueError() << "`n_epochs` cannot be negative"; + } + fm->set_n_epochs(static_cast(n_epochs_in)); +} + + +void 
Ftrl::set_inter(robj inter) {
+  if (!inter.is_int()) {
+    throw TypeError() << "`inter` must be integer, not "
+                      << inter.typeobj();
+  }
+  int64_t inter_in = inter.to_int64_strict();
+  if (inter_in != 0 && inter_in != 1) {
+    throw ValueError() << "`inter` must be either `0` or `1`";
+  }
+  fm->set_inter(static_cast<bool>(inter_in));  // must not go to `set_d()`
+}
+
+
+void Ftrl::set_hash_type(robj hash_type) {
+  if (!hash_type.is_int()) {
+    throw TypeError() << "`hash_type` must be integer, not "
+                      << hash_type.typeobj();
+  }
+  int64_t hash_type_in = hash_type.to_int64_strict();
+  if (hash_type_in != 0 && hash_type_in != 1 && hash_type_in != 2) {
+    throw ValueError() << "`hash_type` must be either `0` or `1` or `2`";
+  }
+  fm->set_hash_type(static_cast<unsigned int>(hash_type_in));
+}
+
+
+void Ftrl::set_seed(robj seed) {
+  if (!seed.is_int()) {
+    throw TypeError() << "`seed` must be integer, not "
+                      << seed.typeobj();
+  }
+  int32_t seed_in = seed.to_int32_strict();
+  if (seed_in < 0) {
+    throw ValueError() << "`seed` cannot be negative";
+  }
+  fm->set_seed(static_cast<unsigned int>(seed_in));  // non-negative here
+}
+
+} // namespace py
diff --git a/c/extras/py_ftrl.h b/c/extras/py_ftrl.h
new file mode 100644
index 0000000000..05be3b6f7b
--- /dev/null
+++ b/c/extras/py_ftrl.h
@@ -0,0 +1,71 @@
+//------------------------------------------------------------------------------
+// Copyright 2018 H2O.ai
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. +//------------------------------------------------------------------------------ +#include "python/ext_type.h" +#include "extras/ftrl.h" + +namespace py { + class Ftrl : public PyObject { + private: + FtrlModel* fm; + + public: + class Type : public ExtType { + public: + static PKArgs args___init__; + static PKArgs args_fit; + static PKArgs args_predict; + static PKArgs args_reset; + static const char* classname(); + static const char* classdoc(); + static bool is_subclassable() { return true; } + static void init_methods_and_getsets(Methods&, GetSetters&); + }; + void m__init__(PKArgs&); + void m__dealloc__(); + void fit(const PKArgs&); + oobj predict(const PKArgs&); + void reset(const PKArgs&); + + // Getters and setters. + oobj get_model() const; + oobj get_a() const; + oobj get_b() const; + oobj get_l1() const; + oobj get_l2() const; + oobj get_d() const; + oobj get_inter() const; + oobj get_n_epochs() const; + oobj get_hash_type() const; + oobj get_seed() const; + void set_model(robj); + void set_a(robj); + void set_b(robj); + void set_l1(robj); + void set_l2(robj); + void set_d(robj); + void set_n_epochs(robj); + void set_inter(robj); + void set_hash_type(robj); + void set_seed(robj); + + }; +} diff --git a/datatable/extras/__init__.py b/datatable/extras/__init__.py index 6797bf6892..3bc6f053b6 100644 --- a/datatable/extras/__init__.py +++ b/datatable/extras/__init__.py @@ -3,4 +3,4 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. 
If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -#------------------------------------------------------------------------------- \ No newline at end of file +#------------------------------------------------------------------------------- diff --git a/datatable/extras/ftrl.py b/datatable/extras/ftrl.py deleted file mode 100644 index db8d48e14e..0000000000 --- a/datatable/extras/ftrl.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python -#------------------------------------------------------------------------------- -# Copyright 2018 H2O.ai -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
-#------------------------------------------------------------------------------- -import datatable as dt -from datatable.lib import core - - -def ftrl(df_train, df_test, a=0.01, b=1.0, l1=0.0, l2=1.0, - d=10**7, n_epochs=1, inter=False, hash_type=1, - seed=0): - """ - Implementation of Follow the Regularized Leader (FTRL) algorithm. - - For more details see this reference: - https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf - - Parameters - ---------- - df_train: datatable - Frame to be trained on, last column is treated as `target`. - df_test: datatable - Frame to be tested on, must have one column less than `df_train`. - a : float - \alpha in per-coordinate learning rate formula. - b : float - \beta in per-coordinate learning rate formula. - l1 : float - L1 regularization parameter. - l2 : float - L2 regularization parameter. - d : int - Number of bins to be used after the hashing trick. - n_epochs : int - Number of epochs to train for. - inter : boolean - If feature interactions to be used or not. - hash_type : int - Hashing method to use for strings: - `0` - std::hash; - `1` - Murmur2; - `2` - Murmur3. - seed: unsigned int - Seed to be used for initialization and Murmur hash functions. - - Returns - ------- - A new datatable of shape (nrows, 1) containing target values for each - row from `df_test`. - """ - - df_target = core.ftrl(df_train, df_test, a, b, l1, l2, - d, n_epochs, inter, hash_type, seed) - return df_target