Source code for hypnettorch.data.timeseries.seq_smnist

#!/usr/bin/env python3
# Copyright 2020 Benjamin Ehret
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# title           :data/timeseries/seq_smnist.py
# author          :be
# contact         :behret@ethz.ch
# created         :14/04/2020
# version         :1.0
# python_version  :3.7
"""
Sequence of Stroke MNIST Samples (SeqSMNIST) Dataset
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A data handler to generate a set of sequential stroke MNIST tasks for continual
learning. The stroke MNIST data used here has already been preprocessed with
the script :mod:`data.timeseries.preprocess_smnist` (see also the
corresponding data handler in :mod:`data.timeseries.smnist_data`).

**The task**

Given a sequence of length ``n`` composed of two SMNIST digits (e.g.,
``2,5,5,2,2`` with ``n=5``), classify which of the ``2**n`` possible binary
sequences (classes) the presented sequence belongs to. E.g., for ``n=3`` the
number of classes would be 8, corresponding to all possible sequences over the
two digits (``0`` and ``1`` here): ``000, 001, 010, 011, 100, 101, 110, 111``.

The individual tasks of the task family differ in which pair of digits is used
to generate the binary sequences. Considering all possible pairs of distinct
digits, we can generate ``(10**2 - 10) / 2 = 45`` tasks.
"""
import numpy as np
import numpy.matlib as npm
import pickle
import os
import urllib.request
import itertools
from sklearn.preprocessing import OneHotEncoder

from hypnettorch.data.sequential_dataset import SequentialDataset


class SeqSMNIST(SequentialDataset):
    """Data handler for one sequential stroke MNIST task (as described
    above).

    Note:
        The outputs are always provided as one-hot encodings with a duration
        of one timestep. One can decide to make these targets span the
        entirety of the sequence (by repeating them over timesteps) by
        setting ``target_per_timestep`` to ``True``.

    Args:
        data_path (str): Where should the dataset be read from? If not
            existing, the dataset will be downloaded into this folder.
        use_one_hot (bool): Whether the class labels should be represented
            in a one-hot encoding.
        num_train (int): Number of training samples to be generated.
        num_test (int): Number of test samples to be generated.
        num_val (int): Number of validation samples to be generated.
        target_per_timestep (bool): If activated, the one-hot encoding of
            the current sequence class will be copied across the entire
            sequence. Else, there is a single target for the entire sequence
            (rather than one per timestep).
        sequence_length (int): The length of the binary sequence to be
            classified. This also determines the number of classes, which
            is ``2**sequence_length``.
        digits (tuple): The two digits that shall be used for generating the
            binary sequence.
        two_class (bool): If ``True``, instead of classifying each possible
            sequence individually, sequences are randomly grouped into two
            classes. This makes the number of classes (and therefore the
            chance level) independent of the sequence length.
        upsample_control (bool): If ``True``, instead of building sequences
            of digits, single digits are upsampled by a factor given by
            ``sequence_length``.
        fix_class_partition (bool): If ``True``, the grouping of sequences
            into two classes is drawn with a fixed seed, such that all tasks
            of the task family share the same partition.
        rseed (int): Seed for numpy random state.
    """
    def __init__(self, data_path, use_one_hot=True, num_train=1600,
                 num_test=400, num_val=0, target_per_timestep=True,
                 sequence_length=4, digits=(0, 1), two_class=False,
                 upsample_control=False, fix_class_partition=False,
                 rseed=None):
        super().__init__()

        # Set random state.
        if rseed is not None:
            self._rstate = np.random.RandomState(rseed)
        else:
            self._rstate = np.random

        self.target_per_timestep = target_per_timestep
        self.two_class = two_class

        # If the dataset does not exist in the dataset folder, download it
        # from dropbox.
        # FIXME Dropbox link might become invalid in the near future.
        data_path = os.path.join(data_path,
                                 'sequential/smnist/ss_mnist_data.pickle')
        if not os.path.exists(data_path):
            data_dir = os.path.dirname(data_path)
            if not os.path.exists(data_dir):
                os.makedirs(data_dir)

            url = 'https://www.dropbox.com/s/sadzc8qvjvexdtx/' + \
                'ss_mnist_data?dl=1'
            try:
                u = urllib.request.urlopen(url)
                data = u.read()
                u.close()
            except Exception:
                raise RuntimeError('SMNIST data cannot be downloaded. ' +
                    'If you are working on the cluster, please manually ' +
                    'copy the pickled dataset into the following ' +
                    'location: %s. ' % data_path +
                    'If the dropbox link (%s) is invalid, ' % url +
                    'please rebuild the dataset using the script ' +
                    '"preprocess_smnist.py".')
            with open(data_path, 'wb') as f:
                f.write(data)

        with open(data_path, 'rb') as f:
            data = pickle.load(f)

        # The preprocessed SMNIST training set is split into train and
        # validation data.
        smnist_n_train = 50000
        x_data_train = data[0][:smnist_n_train]
        y_data_train = data[1][:smnist_n_train]
        x_data_test = data[2]
        y_data_test = data[3]
        x_data_val = data[0][smnist_n_train:]
        y_data_val = data[1][smnist_n_train:]

        # Generate data.
        if two_class:
            num_classes = 2**sequence_length
            if sequence_length == 1:
                class_partition = np.asarray([1])
            else:
                # Randomly group sequences into 2 classes.
                class_partition = self._rstate.choice(num_classes,
                    int(num_classes/2), replace=False)
            if fix_class_partition:
                # Use the same random partition for all tasks.
                rstate_partition = np.random.RandomState(42)
                class_partition = rstate_partition.choice(num_classes,
                    int(num_classes/2), replace=False)
        else:
            class_partition = None

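        # Presumably, 117 is the maximum number of timesteps of a single
        # preprocessed digit, such that a sequence of `sequence_length`
        # digits never exceeds `max_seq_len`.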
        max_seq_len = 117 * sequence_length

        x_train, y_train, sample_lengths_train = \
            self._generate_data(x_data_train, y_data_train, max_seq_len,
                digits, sequence_length, num_train, use_one_hot,
                class_partition, upsample_control)
        x_test, y_test, sample_lengths_test = \
            self._generate_data(x_data_test, y_data_test, max_seq_len,
                digits, sequence_length, num_test, use_one_hot,
                class_partition, upsample_control)
        x_val, y_val, sample_lengths_val = \
            self._generate_data(x_data_val, y_data_val, max_seq_len,
                digits, sequence_length, num_val, use_one_hot,
                class_partition, upsample_control)

        in_data = np.vstack((x_val, x_train, x_test))
        out_data = np.vstack((y_val, y_train, y_test))
        sample_lengths = np.hstack((sample_lengths_val, sample_lengths_train,
                                    sample_lengths_test))

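        # Note, samples are concatenated in the order [val, train, test];
        # the index arrays defined below rely on this ordering.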
        # Specify internal data structure.
        self._data['classification'] = True
        self._data['sequence'] = True
        if two_class:
            self._data['num_classes'] = 2
        else:
            self._data['num_classes'] = 2**sequence_length
        # Quadruple per timestep: (dx, dy, eos, eod).
        self._data['in_shape'] = [4]
        self._data['out_shape'] = [self._data['num_classes'] \
                                   if use_one_hot else 1]
        # Maximum number of timesteps; sequences will be padded to this
        # length.
        self._data['num_time_steps'] = max_seq_len
        self._data['is_one_hot'] = use_one_hot
        self._data['in_data'] = in_data
        self._data['out_data'] = out_data
        if num_val > 0:
            self._data['val_inds'] = np.arange(num_val)
        self._data['train_inds'] = np.arange(num_val, num_val + num_train)
        self._data['test_inds'] = np.arange(num_val + num_train,
                                            num_val + num_train + num_test)

        self._data['in_seq_lengths'] = sample_lengths
        if target_per_timestep:
            self._data['out_seq_lengths'] = sample_lengths
        else:
            self._data['out_seq_lengths'] = np.ones_like(sample_lengths)

    def _generate_data(self, x_data, y_data, max_seq_len, digits, seq_len,
                       n_samples, use_one_hot, class_partition,
                       upsample_control):
        """Generate the data for a single sequential stroke MNIST task with
        the specified sequence length and digits.

        Args:
            x_data (list): Original stroke MNIST input data, with every list
                entry being a numpy.ndarray of shape
                ``[stroke_seq_len, 4]``.
            y_data (list): Original stroke MNIST labels, with every list
                entry being a numpy.ndarray of shape ``[10]``.
            max_seq_len (int): The maximum length of a sequence (i.e., the
                number of timesteps of a sample).
            digits (tuple): The two digits used to build the sequences.
            seq_len (int): The length of the sequence of digits to build.
            n_samples (int): The number of samples that should be generated.
            use_one_hot (bool): Whether or not to use one-hot encodings.
            class_partition (list): If sequences should be grouped into 2
                different classes, this list specifies the class partition.
            upsample_control (bool): See constructor docstring.

        Returns:
            (tuple): Tuple containing:

            - **in_data** (numpy.ndarray): Numpy array of shape
              ``[n_samples, max_seq_len * 4]``.
            - **out_data** (numpy.ndarray): Numpy array containing the
              (possibly one-hot encoded, possibly per-timestep) targets.
            - **sample_lengths** (numpy.ndarray): The original unpadded
              sequence lengths.
        """
        # Modify `seq_len` in case we do the upsampling control.
        if upsample_control:
            upsample_factor = seq_len
            seq_len = 1
            if not self.two_class:
                raise NotImplementedError()

        # Construct all possible classes.
        classes = ["".join(seq) for seq in \
                   itertools.product("01", repeat=seq_len)]

        # Get the right number of samples per class to obtain a balanced
        # dataset with the desired `n_samples`.
        num = n_samples
        div = len(classes)
        n_samples_per_class = [num // div + (1 if x < num % div else 0) \
                               for x in range(div)]

        # Find the indices of samples showing the wanted digits.
        y_data = [np.argmax(y) for y in y_data]
        digit_idx = []
        digit_idx.append(np.where(np.asarray(y_data) == digits[0])[0])
        digit_idx.append(np.where(np.asarray(y_data) == digits[1])[0])

        # Generate samples for every class.
        samples = []
        labels = []
        for i, c in enumerate(classes):
            this_label = i
            digits_to_sample = [int(ch) for ch in c]
            for s in range(n_samples_per_class[i]):
                this_sample = None
                for d in digits_to_sample:
                    rand_idx = self._rstate.randint(len(digit_idx[d]))
                    sample_idx = digit_idx[d][rand_idx]
                    digit_sample = x_data[sample_idx]
                    if this_sample is None:
                        this_sample = digit_sample
                    else:
                        this_sample = np.vstack((this_sample, digit_sample))
                samples.append(this_sample)
                labels.append(this_label)

        # If configured, sort the labels into 2 classes.
        labels = np.asarray(labels)
        if self.two_class and not upsample_control:
            lbl_mask = np.isin(labels, class_partition)
            labels[~lbl_mask] = 0
            labels[lbl_mask] = 1

        if upsample_control:
            for i, s in enumerate(samples):
                # The initial timestep is the absolute start position of the
                # digit. To translate to a higher resolution image, we can
                # just multiply the absolute position by the scaling factor.
                upsample = s[0, :] * upsample_factor
                for t in np.arange(1, s.shape[0]):
                    # Don't do upsampling at the end of strokes or the end
                    # of digits.
                    if all((s[t, 2] == 0, s[t, 3] == 0)):
                        # Repeat the original stroke `upsample_factor` times,
                        # such that the relative stroke length is identical
                        # if images are normalized to the same resolution.
                        for k in range(upsample_factor):
                            upsample = np.vstack((upsample, s[t, :]))
                    else:
                        upsample = np.vstack((upsample, s[t, :]))
                samples[i] = upsample

        # Structure the output data.
        out_data = labels.reshape(-1, 1)
        if use_one_hot:
            n_classes = 2**seq_len
            if self.two_class:
                n_classes = 2
            # FIXME We shouldn't call this method if the validation set size
            # is zero.
            if out_data.size == 0:
                out_data = npm.repmat(out_data, 1, n_classes)
            else:
                # FIXME Use internal method `_to_one_hot` and set the
                # required class attributes beforehand.
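                # Note, fixing `categories` ensures that all `n_classes`
                # classes are encoded, even if some of them do not occur
                # among the generated labels.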
                one_hot_encoder = OneHotEncoder( \
                    categories=[range(n_classes)])
                one_hot_encoder.fit(npm.repmat(np.arange(n_classes), 1, 1).T)
                out_data = one_hot_encoder.transform(out_data).toarray()

        if self.target_per_timestep:
            out_data = npm.repmat(np.asarray(out_data), 1, max_seq_len)

        # Structure the input data.
        in_data = np.zeros((n_samples, max_seq_len, 4))
        sample_lengths = np.zeros(n_samples)
        for i, s in enumerate(samples):
            in_data[i, :s.shape[0], :] = s
            sample_lengths[i] = s.shape[0]
        in_data = self._flatten_array(in_data)

        return in_data, out_data, sample_lengths
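
    # Note on shapes (added for clarity): `_flatten_array` collapses the
    # timestep and feature dimensions, so the returned `in_data` has shape
    # `[n_samples, max_seq_len * 4]`; with `target_per_timestep`, `out_data`
    # has shape `[n_samples, max_seq_len * n_classes]`.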
    def get_identifier(self):
        """Returns the name of the dataset."""
        return 'Sequence SMNIST'
    def _plot_sample(self, fig, inner_grid, num_inner_plots, ind, inputs,
                     outputs=None, predictions=None, sample_ids=None,
                     is_one_hot=None):
        raise NotImplementedError()

    def _plot_config(self, inputs, outputs=None, predictions=None):
        raise NotImplementedError()

    def __str__(self):
        """Print major characteristics of the current dataset."""
        return 'Data handler for sequential SMNIST'
if __name__ == '__main__':
    pass
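    # Minimal usage sketch (added for illustration, not part of the original
    # module). Note, `./datasets` is a hypothetical data folder; the
    # preprocessed SMNIST pickle is downloaded into it if not yet present.
    # `num_classes` and `num_train_samples` are properties of the `Dataset`
    # base class.
    dhandler = SeqSMNIST('./datasets', sequence_length=3, digits=(2, 5),
                         num_train=80, num_test=20, num_val=20, rseed=42)
    print(dhandler.get_identifier())  # Sequence SMNIST
    print(dhandler.num_classes)       # 2**3 = 8 classes
    print(dhandler.num_train_samples) # 80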