{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adam Patyk\n",
    "# Clemson University\n",
    "# MS Thesis: Daily Pattern Classifier\n",
    "# Summer 2021\n",
    "\n",
    "# LoadFiles.ipynb\n",
    "# Purpose: Consolidate daily sample data from text files into NumPy arrays\n",
    "\n",
    "import os\n",
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import glob\n",
    "import multiprocessing as mp\n",
    "from tqdm import tqdm\n",
    "from datetime import datetime\n",
    "\n",
    "### LOAD FILES FROM MULTIPLE DIRECTORIES IN PARALLEL\n",
    "\n",
    "# window sizes to load; reader() maps each value to a `W{n}/` subdirectory of data_dir\n",
    "W = [2, 4, 6, 8, 10]\n",
    "# NOTE(review): these two lists are never referenced again in this notebook -- confirm they can be dropped\n",
    "samples, labels = [], []\n",
    "\n",
    "# reader function for parallelized text file reading\n",
    "def reader(window_size):\n",
    "    \"\"\"Load every sample text file for one window size.\n",
    "\n",
    "    Reads each `*.txt` file under `<data_dir>/W<window_size>/` with `np.loadtxt`\n",
    "    and prepends a column holding the numeric ID parsed from the file path.\n",
    "\n",
    "    Args:\n",
    "        window_size: value used to build the `W<window_size>` subdirectory name.\n",
    "\n",
    "    Returns:\n",
    "        A list with one 2-D array per file, in sorted path order. Column 0 is\n",
    "        the file ID; the remaining columns are the values loaded from the file.\n",
    "    \"\"\"\n",
    "    # join the parts separately so data_dir works with or without a trailing slash\n",
    "    pattern = os.path.join(data_dir, f'W{window_size}', '*.txt')\n",
    "    data = []\n",
    "    # sort so the output order does not depend on filesystem listing order\n",
    "    for filename in tqdm(sorted(glob.glob(pattern))):\n",
    "        # ndmin=2 keeps a single-row file 2-D, so len(d) counts rows rather than columns\n",
    "        d = np.loadtxt(filename, ndmin=2)\n",
    "        # the ID is the three characters starting 12 places from the end of the path\n",
    "        f = np.full((len(d), 1), int(filename[-12:-9]))\n",
    "        data.append(np.hstack((f, d)))\n",
    "    return data\n",
    "    \n",
    "data_dir = 'samples/'\n",
    "\n",
    "start_time = datetime.now()\n",
    "\n",
    "print('Loading data in parallel...', flush=True) # buffer flush needed with tqdm\n",
    "# read text files in parallel, one worker process per window size;\n",
    "# the with-block shuts the workers down once the blocking map() call has returned\n",
    "with mp.Pool(len(W)) as pool:\n",
    "    data_ls = pool.map(reader, W)\n",
    "\n",
    "end_time = datetime.now()\n",
    "print(f'Duration: {end_time - start_time}', flush=True) # buffer flush needed with tqdm\n",
    "\n",
    "# arrange data into samples, labels and filenames for saving\n",
    "# data_ls holds one list of per-file arrays per window size; flatten it explicitly,\n",
    "# because np.hstack on nested lists of differently sized arrays raises on recent\n",
    "# NumPy releases and concatenates along the wrong axis when the sizes happen to match\n",
    "all_data = [file_data for window_data in data_ls for file_data in window_data]\n",
    "# column 0 is the file ID added by reader(); columns 1 and 2 come from the text file\n",
    "all_samples = [x[:, 2] for x in all_data]\n",
    "all_labels = [x[:, 1] for x in all_data]\n",
    "all_filenames = [x[0, 0] for x in all_data]\n",
    "\n",
    "# save data to numpy arrays\n",
    "os.makedirs('compressed-samples', exist_ok=True)\n",
    "# per-file arrays can differ in length, so samples and labels are both stored as\n",
    "# object arrays (loading them back requires np.load(..., allow_pickle=True))\n",
    "np.save('compressed-samples/daily-samples.npy', np.array(all_samples, dtype=object))\n",
    "np.save('compressed-samples/daily-labels.npy', np.array(all_labels, dtype=object))\n",
    "np.save('compressed-samples/daily-filenames.npy', all_filenames)\n",
    "\n",
    "print('Done.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import glob\n",
    "from tqdm import tqdm\n",
    "\n",
    "### LOAD FILES FROM SINGLE DIRECTORY (SERIAL)\n",
    "\n",
    "data_dir = 'samples/'\n",
    "\n",
    "# one 2-D array per text file: column 0 is the file ID, the rest is the file content\n",
    "all_data = []\n",
    "# sort so the output order does not depend on filesystem listing order\n",
    "for filename in tqdm(sorted(glob.glob(os.path.join(data_dir, '*.txt')))):\n",
    "    # ndmin=2 keeps a single-row file 2-D, so len(d) counts rows rather than columns\n",
    "    d = np.loadtxt(filename, ndmin=2)\n",
    "    # the ID is the three characters starting 12 places from the end of the path\n",
    "    f = np.full((len(d), 1), int(filename[-12:-9]))\n",
    "    all_data.append(np.hstack((f, d)))\n",
    "\n",
    "all_samples = [x[:, 2] for x in all_data]\n",
    "all_labels = [x[:, 1] for x in all_data]\n",
    "all_filenames = [x[0, 0] for x in all_data]\n",
    "\n",
    "os.makedirs('compressed-samples', exist_ok=True)\n",
    "# per-file arrays can differ in length, so samples and labels are both stored as\n",
    "# object arrays (loading them back requires np.load(..., allow_pickle=True))\n",
    "np.save('compressed-samples/daily-samples.npy', np.array(all_samples, dtype=object))\n",
    "np.save('compressed-samples/daily-labels.npy', np.array(all_labels, dtype=object))\n",
    "np.save('compressed-samples/daily-filenames.npy', all_filenames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "TensorFlow GPU 2.2.0",
   "language": "python",
   "name": "tf_gpu_env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
