# PB20061254.py

#!/usr/bin/env python3
import os
import json
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

DATA_PATH = "./data/test_data_sample.json"

## For the TA's test:
## rename the class below to your student ID.
## You also need to implement the `predict` method, which reads the .json file,
## calls your trained model, and returns the predicted labels as an ndarray.

class PB20061254():
    """Grader entry point: loads the saved preprocessor and models, then predicts."""

    def predict(self, data_path) -> np.ndarray:
        """Predict a label for every record of a JSON data file.

        Args:
            data_path: Location of the JSON file; handed to ``pandas.read_json``.

        Returns:
            ndarray holding one label per row of the transformed data: the
            majority vote of all loaded models, shifted by +1.

        Raises:
            ValueError: If the loaded model collection is empty.
        """
        #### This function is generated by `scripts/generate.py`
        import sys
        # presumably './src' holds the modules the pickled objects were defined
        # in -- TODO confirm. Guarded so repeated calls do not grow sys.path.
        if './src' not in sys.path:
            sys.path.append('./src')
        in_dir = './models'  # resolved relative to the current working directory
        #### Predict function begins here
        from torch import load
        # NOTE(review): torch.load unpickles arbitrary Python objects, so these
        # files must come from a trusted source. Recent PyTorch releases default
        # to `weights_only=True`, which may refuse such objects -- confirm
        # against the installed version.
        prep = load(f'{in_dir}/preprocessor.pt')
        models = load(f'{in_dir}/models.pt')

        # Load data from disk
        test_df = pd.read_json(data_path)

        # Cleanse and transform data
        test_df = prep.cleanse(test_df)
        test_df_prep = prep.transform(test_df)

        # Feature matrix only: the target is not needed for prediction, so a
        # frame without a 'fit' column is accepted instead of raising KeyError.
        X = test_df_prep.drop(columns='fit', errors='ignore').values

        # To tackle class imbalance, we split the majority class (True to Size) into 3 folds,
        # and train the model on each fold separately. When predicting on the test set,
        # we aggregate the predictions from all 3 trained models and take the majority vote.
        y_pred = self._majority_vote([model.predict(X) for model in models])
        # Our OrdinalEncoder maps the labels to 0, 1, 2, so we need to plus 1
        return y_pred + 1

    @staticmethod
    def _majority_vote(predictions) -> np.ndarray:
        """Return the most frequent label per sample across several models.

        Args:
            predictions: Sequence with one equally long array-like of labels
                per model.

        Returns:
            1-D ndarray with the winning label of each sample. When several
            labels share the highest count, the smallest label wins.

        Raises:
            ValueError: If ``predictions`` is empty.
        """
        per_model = [np.ravel(p) for p in predictions]
        if not per_model:
            raise ValueError("at least one model prediction is required")
        stacked = np.stack(per_model)  # shape: (n_models, n_samples)

        winners = []
        for votes in stacked.T:
            # np.unique returns labels sorted ascending and argmax picks the
            # first maximum, which makes the tie-break deterministic.
            labels, counts = np.unique(votes, return_counts=True)
            winners.append(labels[np.argmax(counts)])
        return np.array(winners, dtype=stacked.dtype)


## for local validation
if __name__ == '__main__':
    # Ground-truth labels are read directly from the raw JSON records.
    with open(DATA_PATH, "r") as handle:
        records = json.load(handle)
    expected = np.array([int(record["fit"]) for record in records])

    # Run the submission class on the same file.
    predicted = PB20061254().predict(DATA_PATH)

    # Report the macro-averaged F1 score of the predictions.
    print(f1_score(y_true=expected, y_pred=predicted, average="macro"))