{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adam Patyk\n",
    "# Clemson University\n",
    "# MS Thesis: Daily Pattern Classifier\n",
    "# Summer 2021\n",
    "\n",
    "# DailyPatternRNN.ipynb\n",
    "# Purpose: Train and test daily pattern classifier with k-fold cross validation\n",
    "\n",
    "import random\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from tqdm import tqdm\n",
    "from datetime import datetime\n",
    "from sklearn.model_selection import KFold\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences \n",
    "\n",
    "# prepare for multi-GPU workflow\n",
    "gpus = tf.config.list_physical_devices('GPU')\n",
    "for gpu in gpus:\n",
    "    tf.config.experimental.set_memory_growth(gpu, True)\n",
    "logical_gpus = tf.config.list_logical_devices('GPU')\n",
    "print(len(gpus), \"Physical GPU,\", len(logical_gpus), \"Logical GPUs\")\n",
    "\n",
    "seed = 42\n",
    "random.seed(seed)\n",
    "np.random.seed(seed)\n",
    "tf.random.set_seed(seed)\n",
    "\n",
    "# get all filenames (e.g. P2247)\n",
    "filenames = []\n",
    "with open('../common/batch-unix.txt', 'r') as f:\n",
    "    for r in f:\n",
    "        c = r.split('\\t')\n",
    "        filenames.append(c[0].strip()[-9:-4])\n",
    "\n",
    "len_threshold = 850\n",
    "k = 5\n",
    "\n",
    "# load numpy arrays from binary .npy files (created from .txt samples in LoadFiles script)\n",
    "print('Loading files...')\n",
    "raw_samples = np.load('../GenerateSamples/compressed-samples/daily-samples.npy', allow_pickle=True)\n",
    "raw_labels = np.load('../GenerateSamples/compressed-samples/daily-labels.npy', allow_pickle=True)\n",
    "all_filenames = np.load('../GenerateSamples/compressed-samples/daily-filenames.npy').astype(int)\n",
    "original_sample_lengths = np.array([len(sample) for sample in raw_samples])\n",
    "\n",
    "# pad or truncate data sequences accordingly\n",
    "print('Padding data...')\n",
    "all_samples = pad_sequences(raw_samples, len_threshold, dtype='float64', padding='post', truncating='post', value=-1)\n",
    "all_labels = pad_sequences(raw_labels, len_threshold, dtype='int32', padding='post', truncating='post', value=-1)\n",
    "\n",
    "print('Data ready.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import sklearn\n",
    "\n",
    "sys.path.append('../') # for .py files in ../common/\n",
    "import common.testing as testing\n",
    "\n",
    "save_data = False\n",
    "epochs = 50\n",
    "batch_size = 64\n",
    "num_units = 16\n",
    "num_subjects = 354\n",
    "n_timesteps = len_threshold\n",
    "\n",
    "# prepare k-fold cross validation\n",
    "kfold = KFold(k, shuffle=True, random_state=seed)\n",
    "# randomly shuffle array of indices\n",
    "x = range(num_subjects)\n",
    "subjects = np.array(random.sample(x, num_subjects), copy=False)\n",
    "\n",
    "total_TPR, total_TNR, total_F1, total_Prec, total_WAcc = [], [], [], [], []\n",
    "total_ep_TPR, total_ep_F1, total_ep_FP_TP = [], [], []\n",
    "\n",
    "start_time = datetime.now()\n",
    "\n",
    "for i, (training_subjects, testing_subjects) in enumerate(kfold.split(subjects)):\n",
    "    print(f'FOLD {i+1}')\n",
    "    !mkdir -p models\n",
    "    model_path = f'models/daily-pattern-b{batch_size}-u{num_units}-e{epochs}-fold{i+1}'\n",
    "    # TRAINING\n",
    "    print('Training...')\n",
    "    # retrieve only samples/labels corresponding to training fold\n",
    "    training_bool = np.isin(all_filenames, training_subjects)\n",
    "    training_samples = tf.convert_to_tensor(all_samples[training_bool], np.float32)\n",
    "    training_labels = tf.convert_to_tensor(all_labels[training_bool], np.int8)\n",
    "    \n",
    "    training_samples = tf.reshape(training_samples, (-1, n_timesteps, 1))\n",
    "    training_labels = tf.reshape(training_labels, (-1, n_timesteps, 1))\n",
    "    \n",
    "    tf.keras.backend.clear_session()\n",
    "    mcp_save = tf.keras.callbacks.ModelCheckpoint(model_path, save_best_only=True, monitor='accuracy')\n",
    "\n",
    "    # define model\n",
    "    model = tf.keras.models.Sequential([\n",
    "        tf.keras.layers.Masking(mask_value=-1,\n",
    "                                input_shape=(n_timesteps, 1)),\n",
    "        tf.keras.layers.Bidirectional(\n",
    "            tf.keras.layers.GRU(units=num_units, \n",
    "                                 return_sequences=True,\n",
    "                                 kernel_initializer='glorot_normal', # Xavier normal initialization\n",
    "                                 bias_initializer='zeros'),\n",
    "            merge_mode='sum'\n",
    "        ),\n",
    "        tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1, activation='sigmoid'))\n",
    "    ])\n",
    "\n",
    "    model.compile(optimizer='adam',\n",
    "                  loss='binary_crossentropy',\n",
    "                  metrics=['accuracy'])\n",
    "\n",
    "    history = model.fit(x=training_samples, y=training_labels,\n",
    "                        epochs=epochs, batch_size=batch_size, verbose=1,\n",
    "                        callbacks=[mcp_save])\n",
    "    \n",
    "    # TESTING\n",
    "    print('Testing...', flush=True) # buffer flush needed with tqdm\n",
    "    total_TP, total_FP, total_TN, total_FN = 0, 0, 0, 0\n",
    "    total_ep_TP, total_ep_FP, total_ep_FN = 0, 0, 0\n",
    "\n",
    "    # retrieve only samples/labels corresponding to testing fold\n",
    "    testing_bool = np.isin(all_filenames, testing_subjects)\n",
    "    testing_samples = tf.convert_to_tensor(all_samples[testing_bool], np.float32)\n",
    "    testing_labels = tf.convert_to_tensor(all_labels[testing_bool], np.int8)\n",
    "    testing_sample_lengths = original_sample_lengths[testing_bool]\n",
    "    \n",
    "    testing_samples = tf.reshape(testing_samples, (-1, n_timesteps, 1))\n",
    "    testing_labels = tf.reshape(testing_labels, (-1, n_timesteps, 1))\n",
    "    \n",
    "    # inference for all testing data using best model from training\n",
    "    model = tf.keras.models.load_model(model_path)\n",
    "    testing_probs = model.predict(testing_samples, batch_size=4096)\n",
    "    \n",
    "    # save data to files for post-hoc threshold testing\n",
    "    if save_data:\n",
    "        !mkdir -p testing\n",
    "        np.save(f'testing/testing_lengths_{epochs}epochs_fold{i+1}.npy', testing_sample_lengths)\n",
    "        np.save(f'testing/testing_probs_{epochs}epochs_fold{i+1}.npy', testing_probs)\n",
    "        np.save(f'testing/testing_samples_{epochs}epochs_fold{i+1}.npy', tf.squeeze(testing_samples).numpy())\n",
    "        np.save(f'testing/testing_labels_{epochs}epochs_fold{i+1}.npy', tf.squeeze(testing_labels).numpy())\n",
    "\n",
    "    # get time and episode metrics on testing dataset\n",
    "    for i in tqdm(range(len(testing_labels))):\n",
    "        probs = testing_probs[i,:testing_sample_lengths[i]]\n",
    "        gt_labels = testing_labels[i,:testing_sample_lengths[i]]\n",
    "        # thresholding segmentation\n",
    "        results = testing.single_threshold(probs, gt_labels, winmin=6, stepsec=100, threshold=0.1)\n",
    "        # time-based metrics\n",
    "        TN, FP, FN, TP = sklearn.metrics.confusion_matrix(gt_labels, results['predictions'][0], labels=[0,1]).ravel()\n",
    "        total_TP += TP\n",
    "        total_FP += FP\n",
    "        total_TN += TN\n",
    "        total_FN += FN\n",
    "        # episode-based metrics\n",
    "        ep_TP, ep_FP, ep_FN = testing.calc_episode_metrics(results, gt_labels)\n",
    "        total_ep_TP += ep_TP\n",
    "        total_ep_FP += ep_FP\n",
    "        total_ep_FN += ep_FN\n",
    "\n",
    "    # calculate and report overall metrics\n",
    "    TPR = testing.true_positive_rate(total_TP, total_FN)\n",
    "    TNR = testing.true_negative_rate(total_TN, total_FP)\n",
    "    F1 = testing.f1_score(total_TP, total_FP, total_FN)\n",
    "    Prec = testing.precision(total_TP, total_FP)\n",
    "    WAcc = testing.weighted_accuracy(total_TP, total_FP, total_TN, total_FN)\n",
    "\n",
    "    print('--- Time Metrics ---')\n",
    "    print(f'WAcc: {WAcc:.3f}\\tTPR: {TPR:.3f}\\tTNR: {TNR:.3f}\\tF1: {F1:.3f}\\tPrecision: {Prec:.3f}')\n",
    "\n",
    "    ep_TPR = testing.true_positive_rate(total_ep_TP, total_ep_FN)\n",
    "    ep_F1 = testing.f1_score(total_ep_TP, total_ep_FP, total_ep_FN)\n",
    "    ep_FP_TP = -1 if total_ep_TP == 0 else total_ep_FP / total_ep_TP\n",
    "\n",
    "    print('--- Episode Metrics ---')\n",
    "    print(f'TPR: {ep_TPR:.3f}\\tF1: {ep_F1:.3f}\\tFP/TP: {ep_FP_TP:.3f}')\n",
    "    \n",
    "    total_TPR.append(TPR)\n",
    "    total_TNR.append(TNR)\n",
    "    total_F1.append(F1)\n",
    "    total_Prec.append(Prec)\n",
    "    total_WAcc.append(WAcc)\n",
    "    total_ep_TPR.append(ep_TPR)\n",
    "    total_ep_F1.append(ep_F1)\n",
    "    total_ep_FP_TP.append(ep_FP_TP)\n",
    "    \n",
    "    del model\n",
    "    print(\"*****************************************************************\", flush=True)\n",
    "    \n",
    "    \n",
    "end_time = datetime.now()\n",
    "print(f'Duration: {end_time - start_time}')\n",
    "\n",
    "print('AVERAGE:')\n",
    "print('--- Time Metrics ---')\n",
    "print(f'WAcc: {np.mean(total_WAcc):.3f}\\tTPR: {np.mean(total_TPR):.3f}\\tTNR: {np.mean(total_TNR):.3f}\\tF1: {np.mean(total_F1):.3f}\\tPrecision: {np.mean(total_Prec):.3f}')\n",
    "\n",
    "print('--- Episode Metrics ---')\n",
    "print(f'TPR: {np.mean(total_ep_TPR):.3f}\\tF1: {np.mean(total_ep_F1):.3f}\\tFP/TP: {np.mean(total_ep_FP_TP):.3f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "TensorFlow GPU 2.2.0",
   "language": "python",
   "name": "tf_gpu_env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}