Source code for hypnettorch.data.timeseries.smnist_data

#!/usr/bin/env python3
# Copyright 2020 Benjamin Ehret
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# title           :data/timeseries/smnist_data.py
# author          :be
# contact         :behret@ethz.ch
# created         :23/03/2020
# version         :1.0
# python_version  :3.7
"""
Stroke MNIST (SMNIST) Dataset
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A data handler for the stroke mnist data as described here:

    https://github.com/edwin-de-jong/mnist-digits-stroke-sequence-data/

The data was preprocessed with the script
:mod:`data.timeseries.preprocess_smnist` and then uploaded to
`dropbox <https://www.dropbox.com/s/sadzc8qvjvexdtx/ss_mnist_data?dl=1>`__. If
this link becomes invalid, the data has to be preprocessed from scratch.
"""
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os 
import urllib.request

from hypnettorch.data.sequential_dataset import SequentialDataset

class SMNISTData(SequentialDataset):
    """Datahandler for stroke MNIST.

    Note:
        The outputs are always provided as one-hot encodings of duration
        equal to one. One can decide to make these targets span the entirety
        of the sequence (by repeating it over timesteps) by setting
        ``target_per_timestep`` to ``True``.

    Args:
        data_path (str): Where should the dataset be read from? If not
            existing, the dataset will be downloaded into this folder.
        use_one_hot (bool): Whether the class labels should be represented in
            a one-hot encoding.
        validation_size (int): The number of validation samples. Validation
            samples will be taken from the training set (the first
            :math:`n` samples).
        target_per_timestep (bool): If activated, the one-hot encoding of the
            current image will be copied across the entire sequence. Else,
            there is a single target for the entire sequence (rather than
            one per timestep).
    """
    def __init__(self, data_path, use_one_hot=False, validation_size=0,
                 target_per_timestep=True):
        super().__init__()

        self.target_per_timestep = target_per_timestep

        # If dataset does not exist in dataset folder, download it from
        # dropbox.
        # FIXME Dropbox link might become invalid in the near future.
        data_path = os.path.join(data_path,
                                 'sequential/smnist/ss_mnist_data.pickle')
        if not os.path.exists(data_path):
            data_dir = os.path.dirname(data_path)
            if not os.path.exists(data_dir):
                os.makedirs(data_dir)
            url = 'https://www.dropbox.com/s/sadzc8qvjvexdtx/' + \
                'ss_mnist_data?dl=1'
            try:
                u = urllib.request.urlopen(url)
                data = u.read()
                u.close()
            # Note, a bare ``except`` would also swallow ``KeyboardInterrupt``
            # and hide the original error; catch ``Exception`` and chain it.
            except Exception as e:
                raise RuntimeError('SMNIST data cannot be downloaded. ' +
                    'If you are working on the cluster, please manually ' +
                    'copy the pickled dataset into the following location: ' +
                    '%s. \n' % data_path +
                    'If the dropbox link (%s) ' % url +
                    'is invalid, please rebuild the dataset using the ' +
                    'script "preprocess_smnist.py".') from e
            with open(data_path, 'wb') as f:
                f.write(data)

        with open(data_path, 'rb') as f:
            data = pickle.load(f)

        # Concatenate train and test samples from original data set.
        # Partitioning will be redone below.
        x_data = data[0] + data[2]
        y_data = data[1] + data[3]

        if not 0 <= validation_size < 60000:
            raise ValueError('Invalid validation set size.')
        num_train = 60000 - validation_size
        num_test = 10000
        num_val = validation_size

        # Specify internal data structure.
        self._data['classification'] = True
        self._data['sequence'] = True
        self._data['num_classes'] = 10
        # Quadruple per timestep: (dx, dy, eos, eod).
        self._data['in_shape'] = [4]
        self._data['out_shape'] = [10 if use_one_hot else 1]
        # Maximum number of timesteps; sequences are padded to this length.
        self._data['num_time_steps'] = 117
        self._data['is_one_hot'] = use_one_hot

        self._data['in_data'], seq_lengths = \
            self._structure_input_data(x_data)
        self._data['out_data'] = self._structure_output_data(y_data)
        # ``_structure_output_data`` returns 1-hot targets; undo the encoding
        # if the user asked for plain labels.
        if not use_one_hot:
            self._data['out_data'] = self._to_one_hot(self._data['out_data'],
                                                      reverse=True)

        if validation_size > 0:
            self._data['val_inds'] = np.arange(num_val)
        self._data['train_inds'] = np.arange(num_val, num_val + num_train)
        self._data['test_inds'] = np.arange(num_val + num_train,
                                            num_val + num_train + num_test)

        self._data['in_seq_lengths'] = seq_lengths
        if target_per_timestep:
            self._data['out_seq_lengths'] = seq_lengths
        else:
            self._data['out_seq_lengths'] = np.ones_like(seq_lengths)

    def _structure_input_data(self, in_data):
        """Restructure the sample input data to the format expected by the
        dataset class.

        Args:
            in_data (list): List of length ``n_samples`` (total number of
                samples in the dataset). Each sample is a 2D array of size
                ``[seq_len, 4]``, where ``seq_len`` is different for every
                sample. To have a common data structure, from here on every
                sample has the same length and time steps that are not used
                are set to 0 (padded).

        Returns:
            (tuple): Tuple containing:

            - **seq_data** (numpy.ndarray): Numpy array of shape
              ``[n_samples, max_num_time_steps * 4]``.
            - **seq_lengths** (numpy.ndarray): The original unpadded
              sequence lengths.
        """
        n_samples = len(in_data)
        out_mat = np.zeros((n_samples, 4 * self._data['num_time_steps']))
        seq_lengths = np.zeros(n_samples)

        for i in range(n_samples):
            assert in_data[i].shape[1] == 4
            sample_len = in_data[i].shape[0] * in_data[i].shape[1]
            seq_lengths[i] = in_data[i].shape[0]
            out_mat[i, :sample_len] = in_data[i].flatten(order='C')

            # Sanity check: the end-of-digit flag (4th feature) must be set
            # exactly once, at the last timestep of the sequence.
            eod = np.argwhere(in_data[i][:, 3])
            assert eod.size == 1 and eod.squeeze() == seq_lengths[i] - 1

        assert seq_lengths.max() <= self._data['num_time_steps']

        return out_mat, seq_lengths

    def _structure_output_data(self, out_data):
        """Restructure the sample output data to the format expected by the
        dataset class.

        The task has one global target (for all timesteps), given as a 1-hot
        encoding. However this can be changed using the constructor option
        ``target_per_timestep``.

        Args:
            out_data (list): List of length ``n_samples``. Each sample is a
                1D array of size ``[10]``.

        Returns:
            (numpy.ndarray): Numpy array of shape
            ``[n_samples, 10 * max_num_time_steps]`` if
            ``target_per_timestep`` is active, else ``[n_samples, 10]``.
        """
        out_mat = np.asarray(out_data)

        if self.target_per_timestep:
            # Copy the per-sample 1-hot target across all timesteps.
            # Note, ``np.matlib.repmat`` was used here before, but
            # ``numpy.matlib`` is deprecated and not imported by this module;
            # ``np.tile`` is the drop-in equivalent.
            out_mat = np.tile(out_mat, (1, self._data['num_time_steps']))

        return out_mat

    def get_identifier(self):
        """Returns the name of the dataset."""
        return 'SMNIST'

    def _plot_sample(self, fig, inner_grid, num_inner_plots, ind, inputs,
                     outputs=None, predictions=None, sample_ids=None,
                     is_one_hot=None):
        """Implementation of abstract method
        :meth:`data.dataset.Dataset._plot_sample`.

        Args:
            (....): See docstring of method
                :meth:`data.dataset.Dataset._plot_sample`.
            sample_ids (numpy.ndarray): See option ``sample_ids`` of method
                :meth:`get_out_pattern_bounds`. Only required if
                ``predictions`` is not ``None`` but provided as a sequence of
                labels (note, this method will consider the label at the end
                of the input sequence as predicted label).
            is_one_hot (bool, optional): Whether ``outputs`` and
                ``predictions`` are provided as 1-hot encodings. If not
                specified, we will assume the value specified by attribute
                :attr:`data.dataset.Dataset.is_one_hot`.
        """
        if is_one_hot is None:
            is_one_hot = self.is_one_hot

        # Bring the data into a proper form.
        x = self._flatten_array(inputs, ts_dim_first=True, reverse=True,
                                feature_shape=self.in_shape)
        # Sanity check.
        if sample_ids is not None:
            eod = np.argwhere(x[:, 0, 3])
            assert eod.size == 1 and eod.squeeze() == \
                self.get_in_seq_lengths(sample_ids[[ind]]).squeeze() - 1

        if outputs is not None:
            # Note, the base class already removed 1-hot encoding from ground-
            # truth data.
            t = self._flatten_array(outputs, ts_dim_first=True, reverse=True,
                                    feature_shape=[1])
            if t.shape[0] > 1: # Multiple timesteps.
                # Note, the label should be the same across all timesteps,
                # as this is a ground-truth output.
                t = t[0, :, :]
        if predictions is not None:
            fs = [self.num_classes] if is_one_hot else [1]
            y = self._flatten_array(predictions, ts_dim_first=True,
                                    reverse=True, feature_shape=fs)
            if y.shape[0] > 1: # Multiple timesteps.
                # Note, we consider the correct label the one that is
                # predicted at the end of the input sequence.
                if sample_ids is None:
                    raise ValueError('Option "sample_ids" must be ' +
                        'specified when providing timeseries predictions.')
                sl = self.get_in_seq_lengths(sample_ids[[ind]])
                y = y[int(sl[0])-1, :, :]

        ax = plt.Subplot(fig, inner_grid[0])
        if outputs is None:
            ax.set_title("SMNIST Sample")
        else:
            assert np.size(t) == 1
            # Note, ``np.asscalar`` was removed from NumPy; ``.item()`` is
            # the supported replacement.
            label = t.item()
            if predictions is None:
                ax.set_title('SMNIST sample with\nlabel: %d' % label)
            else:
                if np.size(y) == self.num_classes:
                    pred_label = np.argmax(y)
                else:
                    pred_label = y.item()
                ax.set_title('SMNIST sample with\nlabel: %d (prediction: %d)'
                             % (label, pred_label))

        # Build image from stroke data.
        image = np.zeros((28, 28))
        eos = True
        for i in range(x.shape[0]):
            if x[i, 0, 3] == 1: # end-of-digit
                break
            if eos:
                eos = False
                x_idx = int(x[i, 0, 0]) - 1
                y_idx = int(x[i, 0, 1]) - 1
            else:
                x_idx += int(x[i, 0, 0])
                y_idx += int(x[i, 0, 1])
            # This doesn't seem to matter. Seems only the first position is
            # absolute.
            #if x[i, 0, 2] == 1: # end-of-stroke
            #    eos = True
            x_idx = 0 if x_idx < 0 else x_idx
            y_idx = 0 if y_idx < 0 else y_idx
            x_idx = 27 if x_idx > 27 else x_idx
            y_idx = 27 if y_idx > 27 else y_idx
            image[x_idx, y_idx] = 255

        ax.set_axis_off()
        ax.imshow(image.transpose())
        fig.add_subplot(ax)

        if num_inner_plots == 2:
            ax = plt.Subplot(fig, inner_grid[1])
            ax.set_title('Predictions')
            bars = ax.bar(range(self.num_classes), np.squeeze(y))
            ax.set_xticks(range(self.num_classes))
            if outputs is not None:
                bars[int(label)].set_color('r')
            fig.add_subplot(ax)

    def _plot_config(self, inputs, outputs=None, predictions=None):
        """Re-Implementation of method
        :meth:`data.dataset.Dataset._plot_config`.

        This method has been overriden to ensure, that there are 2 subplots,
        in case the predictions are given.
        """
        plot_configs = super()._plot_config(inputs, outputs=outputs,
                                            predictions=predictions)

        if predictions is not None and self.is_one_hot and \
                np.shape(predictions)[1] == self._data['out_data'].shape[1]:
            plot_configs['outer_hspace'] = 0.6
            plot_configs['inner_hspace'] = 0.4
            plot_configs['num_inner_rows'] = 2
            #plot_configs['num_inner_cols'] = 1
            plot_configs['num_inner_plots'] = 2

        return plot_configs

    def __str__(self):
        """Print major characteristics of the current dataset."""
        return 'Data handler for stroke MNIST'
if __name__ == '__main__': pass