# Source code for hypnettorch.data.timeseries.cognitive_tasks.cognitive_data

#!/usr/bin/env python3
# Copyright 2019 Benjamin Ehret
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# title           :data/timeseries/cognitive_tasks/cognitive_data.py
# author          :be
# contact         :behret@ethz.ch
# created         :29/10/2019
# version         :1.0
# python_version  :3.7
"""
Set of cognitive tasks
^^^^^^^^^^^^^^^^^^^^^^

A data handler for cognitive tasks as implemented in Masse et al. (PNAS). The
user can construct individual datasets with this data handler and use each of
these datasets to train a model in a continual learning setting.
"""
import numpy as np
from  torch import from_numpy

# from Masse et al. code base, needed for task generation
import hypnettorch.data.timeseries.cognitive_tasks.stimulus as stim_masse
import hypnettorch.data.timeseries.cognitive_tasks.parameters as params_masse

from hypnettorch.data.dataset import Dataset

# TODO Use `SequentialDataset` as baseclass.
class CognitiveTasks(Dataset):
    """An instance of this class shall represent one of the 20 cognitive
    tasks.

    The trials of the selected task are generated once in the constructor and
    stored as flattened 2D matrices (one row per trial). The methods
    :meth:`input_to_torch_tensor` and :meth:`output_to_torch_tensor` restore
    the sequential layout ``[time_steps, batch_size, features]``.
    """
    def __init__(self, task_id=0, num_train=80, num_test=20, num_val=None,
                 rstate=None):
        """Generate a new dataset.

        We use the ``MultiStimulus`` class from Masse et al. to generate
        the inputs and outputs of different cognitive tasks in accordance with
        the data handling structures of the hnet code base.

        Note that masks (part of the Masse et al. trial generator) will be
        handled independently of this data handler.

        Args:
            task_id (int): Identifier of the task whose trials are generated.
                It is passed on to ``MultiStimulus.generate_trial``.
            num_train (int): Number of training samples.
            num_test (int): Number of test samples.
            num_val (int, optional): Number of validation samples. If
                ``None``, no validation set is created.
            rstate: Random state handed to the trial generator. If ``None``,
                the current random state of numpy (``numpy.random``) is used
                to generate the data.
        """
        super().__init__()

        # set random state
        self._rstate = rstate if rstate is not None else np.random

        # TODO: generate task library and load train / test data instead of
        # generating them for every call. Keeping this version as a quick fix
        # for now.

        # The splits are generated strictly in the order train, test, val, so
        # that the random state is always consumed in the same sequence.
        split_sizes = [num_train, num_test]
        if num_val is not None:
            split_sizes.append(num_val)
        splits = [self._generate_trial_samples(n, task_id)
                  for n in split_sizes]

        # Stack all splits row-wise; the index arrays below select the rows
        # that belong to each split.
        in_data = np.vstack([x for x, _ in splits])
        out_data = np.vstack([y for _, y in splits])

        # Specify internal data structure.
        self._data['classification'] = True
        self._data['sequence'] = True
        self._data['in_shape'] = [68]
        self._data['out_shape'] = [9]
        self._data['is_one_hot'] = True
        self._data['num_classes'] = 9
        self._data['task_id'] = task_id
        self._data['in_data'] = in_data
        self._data['out_data'] = out_data
        self._data['train_inds'] = np.arange(num_train)
        self._data['test_inds'] = np.arange(num_train, num_train + num_test)

        if num_val is not None:
            n_start = num_train + num_test
            self._data['val_inds'] = np.arange(n_start, n_start + num_val)

    def _generate_trial_samples(self, n_samples, task_id):
        """Generate a certain number of trials.

        Note:
            This method updates the ``batch_size`` entry of the global
            parameter dictionary of the Masse et al. code base.

        Args:
            n_samples (int): Number of trials to generate.
            task_id (int): Identifier of the task, passed to
                ``MultiStimulus.generate_trial``.

        Returns:
            (tuple): Tuple containing:

            - **x**: Matrix of trial inputs of shape
              ``[batch_size, in_size*time_steps]``.
            - **y**: Matrix of trial targets of shape
              ``[batch_size, out_size*time_steps]``.
        """
        # update batch_size in their parameter dict to get desired number of
        # trials, then create stim object
        params_masse.update_parameters({'batch_size': n_samples})
        # create new stim object with the updated parameters
        stim = stim_masse.MultiStimulus(self._rstate)
        # generate trials and reshape; only inputs (2nd return value) and
        # targets (3rd return value) are used here
        _, x, y, _, _ = stim.generate_trial(task_id)

        return self._flatten_tensor(x), self._flatten_tensor(y)

    def _flatten_tensor(self, in_tensor):
        """Flattens the trial data tensors to the format expected by the
        dataset class.

        Within a flattened row, all time steps of the first feature come
        first, followed by all time steps of the second feature, and so on.

        Args:
            in_tensor: Numpy array of shape
                ``[time_steps, batch_size, in_size]``.

        Returns:
            out_mat: Numpy array of shape ``[batch_size, in_size*time_steps]``.
        """
        (time_steps, batch_size, in_size) = in_tensor.shape
        # [time_steps, batch_size, in_size] -> [batch_size, in_size, time_steps]
        in_tensor = np.moveaxis(in_tensor, [0, 1, 2], [2, 0, 1])
        out_mat = np.reshape(in_tensor, [batch_size, in_size*time_steps])

        return out_mat

    def _unflatten_to_torch(self, flat, stored, feature_shape, device):
        """Invert :meth:`_flatten_tensor` and convert the result to PyTorch.

        Shared implementation of :meth:`input_to_torch_tensor` and
        :meth:`output_to_torch_tensor`.

        Args:
            flat: Numpy array of shape ``[batch_size, features*time_steps]``.
            stored: The data matrix held by this dataset (inputs or outputs).
                Its second dimension is used to infer the number of time
                steps.
            feature_shape (list): Shape of a single time step (i.e.,
                ``in_shape`` or ``out_shape``).
            device: PyTorch device the returned tensor is moved to.

        Returns:
            (torch.Tensor): Float tensor of shape
            ``[time_steps, batch_size, features]``.
        """
        num_features = np.prod(feature_shape)
        assert(stored.shape[1] % num_features == 0)
        num_time_steps = stored.shape[1] // num_features

        # Rows were flattened from [batch_size, features, time_steps].
        seq = np.reshape(flat, [flat.shape[0], feature_shape[0],
                                num_time_steps])
        # [batch_size, features, time_steps] ->
        # [time_steps, batch_size, features]
        seq = np.moveaxis(seq, [0, 1, 2], [1, 2, 0])

        return from_numpy(seq).float().to(device)

    def input_to_torch_tensor(self, x, device, mode='inference',
                              force_no_preprocessing=False, sample_ids=None):
        """This method can be used to map the internal numpy arrays to PyTorch
        tensors.

        The arguments ``mode``, ``force_no_preprocessing`` and ``sample_ids``
        are part of the base class interface and are not used by this
        implementation.

        Args:
            (....): See docstring of method
                :meth:`data.dataset.Dataset.input_to_torch_tensor`.

        Returns:
            (torch.Tensor): The given input ``x`` as 3D PyTorch tensor. It has
            dimensions ``[T, B, N]``, where ``T`` is the number of time steps
            per stimulus, ``B`` is the batch size and ``N`` the number of input
            units.
        """
        return self._unflatten_to_torch(x, self._data['in_data'],
                                        self.in_shape, device)

    def output_to_torch_tensor(self, y, device, mode='inference',
                               force_no_preprocessing=False, sample_ids=None):
        """Similar to method :meth:`input_to_torch_tensor`, just for dataset
        outputs.

        The arguments ``mode``, ``force_no_preprocessing`` and ``sample_ids``
        are part of the base class interface and are not used by this
        implementation.

        Args:
            (....): See docstring of method
                :meth:`data.dataset.Dataset.output_to_torch_tensor`.

        Returns:
            (torch.Tensor): A tensor of shape ``[T, B, C]``, where ``T`` is the
            number of time steps per stimulus, ``B`` is the batch size and ``C``
            the number of classes.
        """
        return self._unflatten_to_torch(y, self._data['out_data'],
                                        self.out_shape, device)

    def get_identifier(self):
        """Returns the name of the dataset."""
        return 'Cognitive'

    def _plot_sample(self, fig, inner_grid, num_inner_plots, ind, inputs,
                     outputs=None, predictions=None):
        """Not implemented.

        Raises:
            NotImplementedError: Always, plotting is not supported yet.
        """
        raise NotImplementedError('TODO implement')

# Script entry point: intentionally a no-op. Running this module directly
# does nothing; it is only meant to be imported.
if __name__ == '__main__':
    pass