Source code for hypnettorch.data.ilsvrc2012_data

#!/usr/bin/env python3
# Copyright 2019 Christian Henning
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# @title           :data/ilsvrc2012_data.py
# @author          :ch
# @contact         :henningc@ethz.ch
# @created         :05/13/2019
# @version         :1.0
# @python_version  :3.6.8
"""
ILSVRC2012 Dataset
------------------

The module :mod:`data.ilsvrc2012_data` contains a  handler for the Imagenet
Large Scale Visual Recognition Challenge 2012 (ILSVRC2012) dataset, a subset of
the ImageNet dataset:

    http://www.image-net.org/challenges/LSVRC/2012/index

For more details on the dataset, please refer to:

    Olga Russakovsky et al. ImageNet Large Scale Visual Recognition Challenge.
    *International Journal of Computer Vision 115*, no. 3 (December 1, 2015):
    211–52, https://doi.org/10.1007/s11263-015-0816-y

.. note::
    In the current implementation, this handler will not download and extract
    the dataset for you. You have to do this manually by following the
    instructions of the README file (which is located in the same folder as this
    file).

.. note::
    We use the validation set as test set. A new (custom) validation set will
    be created by taking the first :math:`n` samples from each training class as
    validation samples, where :math:`n` is configured by the user.

.. note::
    This dataset has not yet been prepared for Tensorflow use!

When using PyTorch, this class will create dataset classes
(:class:`torch.utils.data.Dataset`) for you for the training, testing and
validation set. Afterwards, you can use these dataset instances to create data
loaders:

.. code-block:: python

    train_loader = torch.utils.data.DataLoader(
        ilsvrc2012_data.torch_train, batch_size=256, shuffle=True,
        num_workers=4, pin_memory=True)

You should then use these Pytorch data loaders rather than class internal
methods to work with the dataset.

PyTorch data augmentation is applied as defined by the method
:meth:`ILSVRC2012Data.torch_input_transforms`. Images will be resized and
cropped to size 224 x 224.
"""
# FIXME We currently rely too much on the internals of class ImageFolder.
import torchvision
import warnings
from packaging import version
if version.parse(torchvision.__version__) < version.parse('0.2.2'):
    # FIXME Probably not necessary to enforce, just ignore non-existing
    # "targets" field.
    raise Exception('Code requires torchvision to have at least version ' +
                    '"0.2.2" (current version: %s).' % torchvision.__version__)
elif version.parse(torchvision.__version__) != version.parse('0.2.2'):
    warnings.warn('Code not been tested with torchvision version %s!'
                  % torchvision.__version__)

import os
import time
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import warnings

from hypnettorch.data.large_img_dataset import LargeImgDataset

[docs]class ILSVRC2012Data(LargeImgDataset):
    """An instance of the class shall represent the ILSVRC2012 dataset.

    The input data of the dataset will be strings to image files. The output
    data corresponds to object labels according to the ``ILSVRC2012_ID`` - 1.

    Note:
        This is different from many other ILSVRC2012 data handlers, where the
        labels are computed based on the order of the training folder names
        (which correspond to WordNet IDs (``WNID``)).

    Note:
        The dataset has to be already downloaded and extracted before
        this method can be called. See the local README file for details.

    Args:
        data_path (str): Where should the dataset be read from? If not existing,
            the dataset will be downloaded into this folder.
        use_one_hot (bool): Whether the class labels should be
            represented in a one-hot encoding. Note, class labels
            correspond to the ``ILSVRC2012_ID`` minus 1 (from 0 to 999).

            .. note::
                This option does not influence the internal PyTorch
                Dataset classes (e.g., cmp.
                :attr:`data.large_img_dataset.LargeImgDataset.torch_train`),
                that can be used in conjunction with PyTorch data loaders.
        num_val_per_class (int): The number of validation samples per class.

            .. note::
                The actual ILSVRC2012 validation set is used as test set
                by this data handler. Therefore, a new validation set is
                constructed (if value greater than 0), using the same amount of
                samples per class.
                For instance: If value 50 is given, a validation set of size
                50 * 1000 = 50,000 is constructed (these samples will be removed
                from the training set).

            .. note::
                Validation samples use the same data augmentation pipeline
                as test samples.
    """
    _TRAIN_FOLDER = 'train'
    _VAL_FOLDER = 'val'
    _META_FILE = os.path.join('meta', 'data', 'meta.mat')

    def __init__(self, data_path, use_one_hot=False, num_val_per_class=0):
        # 732 is the minimum number of training samples per class in
        # ILSVRC2012.
        assert(num_val_per_class < 732)
        # We keep the full path to each image in memory, so we don't need to
        # tell the super class the root path to each image (i.e., samples
        # contain absolute not relative paths).
        super().__init__('')

        start = time.time()

        print('Reading ILSVRC2012 dataset ...')

        meta_fn = os.path.join(data_path, ILSVRC2012Data._META_FILE)
        train_dir = os.path.join(data_path, ILSVRC2012Data._TRAIN_FOLDER)
        val_dir = os.path.join(data_path, ILSVRC2012Data._VAL_FOLDER)

        err_msg = 'Please follow the steps described in the file ' + \
            'data/README.md to download and extract the data.'
        if not os.path.exists(train_dir):
            raise FileNotFoundError('Training images not found in directory ' +
                train_dir + '.\n' + err_msg)
        elif not os.path.exists(val_dir):
            raise FileNotFoundError('Validation images not found in ' +
                'directory ' + val_dir + '.\n' + err_msg)
        elif not os.path.exists(meta_fn):
            raise FileNotFoundError('Meta file not found: ' +
                meta_fn + '.\n' + err_msg)

        # Read meta file.
        self._data['meta'] = dict()
        self._read_meta(meta_fn)

        # Read dataset.
        self._process_dataset(train_dir, val_dir, use_one_hot,
                              num_val_per_class)

        # Translate everything into the internal structure of this class.
        num_train = len(self._torch_ds_train)
        num_test = len(self._torch_ds_test)
        num_val = 0 if self._torch_ds_val is None else \
            len(self._torch_ds_val)
        num_samples = num_train + num_test + num_val
        # Just a sanity check, as these numbers should be fixed whenever the
        # full dataset is loaded.
        if num_test != 50000:
            warnings.warn('ILSVRC2012 should contain 50,000 test samples, ' +
                          'but %d samples were found!' % num_test)
        if num_train + num_val != 1281167:
            warnings.warn('ILSVRC2012 should contain 1,281,167 training ' +
                          'samples, but %d samples were found!'
                          % (num_train + num_val))

        # Maximum string length of an image path.
        max_path_len = len(max(self._torch_ds_train.samples +
            ([] if num_val == 0 else self._torch_ds_val.samples) +
            self._torch_ds_test.samples, key=lambda t : len(t[0]))[0])

        self._data['classification'] = True
        self._data['sequence'] = False
        self._data['num_classes'] = 1000
        self._data['is_one_hot'] = use_one_hot

        self._data['in_shape'] = [224, 224, 3]
        self._data['out_shape'] = [1000 if use_one_hot else 1]

        self._data['in_data'] = np.chararray([num_samples, 1],
            itemsize=max_path_len, unicode=True)
        for i, (img_path, _) in enumerate(self._torch_ds_train.samples +
                ([] if num_val == 0 else self._torch_ds_val.samples) +
                self._torch_ds_test.samples):
            self._data['in_data'][i, :] = img_path

        labels = np.array(self._torch_ds_train.targets +
                          ([] if num_val == 0 else self._torch_ds_val.targets) +
                          self._torch_ds_test.targets).reshape(-1, 1)
        if use_one_hot:
            labels = self._to_one_hot(labels)
        self._data['out_data'] = labels

        self._data['train_inds'] = np.arange(num_train)
        self._data['test_inds'] = np.arange(num_train + num_val, num_samples)
        if num_val == 0:
            self._data['val_inds'] = None
        else:
            self._data['val_inds'] = np.arange(num_train, num_train + num_val)

        print('Dataset consists of %d training, %d validation and %d test '
              % (num_train, num_val, num_test) + 'samples.')

        end = time.time()
        print('Elapsed time to read dataset: %f sec' % (end-start))

[docs]    def tf_input_map(self, mode='inference'):
        """Not impemented."""
        # Confirm, whether you wanna process data as in the baseclass or
        # implement a new image loader.
        raise NotImplementedError('Not implemented yet!')

[docs]    def get_identifier(self):
        """Returns the name of the dataset."""
        return 'ILSVRC2012'

    def _read_meta(self, meta_fn):
        """Read the meta file to know how to translate WNID to ILSVRC2012_ID.

        The following attributes are added (dictionaries):
            _imid_to_wnid: ILSVRC2012_ID to WNID.
            _wnid_to_imid: WNID to ILSVRC2012_ID.
            _imid_to_words: ILSVRC2012_ID to set of words (textual description
                of label).

        Args:
            meta_fn: Path to meta file.
        """
        meta = loadmat(meta_fn)['synsets']

        # ILSVRC2012_ID -> WNID
        imid2wnid = dict()
        # ILSVRC2012_ID -> Example Words
        imid2words = dict()

        for i in range(meta.size):
            imid = meta[i][0][0][0][0] # ILSVRC2012_ID
            wnid = meta[i][0][1][0] # WNID
            words = meta[i][0][2][0] # words
            num_children = meta[i][0][4][0][0]
            if num_children != 0:
                # We don't care about non-leaf nodes.
                assert(imid >= 1000)
                continue
            assert(imid >= 1 and imid <= 1000)

            # NOTE internally, we subtract 1 from all ILSVRC2012_ID to have
            # labels between 0 and 999.
            imid2wnid[imid-1] = wnid
            imid2words[imid-1] = words

        assert(len(imid2wnid.keys()) == 1000)

        wnid2imid = {v: k for k, v in imid2wnid.items()}
        assert(len(wnid2imid.keys()) == 1000)

        self._imid_to_wnid = imid2wnid
        self._wnid_to_imid = wnid2imid
        self._imid_to_words = imid2words

    def _process_dataset(self, train_dir, val_dir, use_one_hot,
                         num_val_per_class):
        """Read and process the datasets using PyTorch its ImageFolder class.

        The labels used by the ImageFolder class are changed to match the
        ILSVRC2012_ID labels (where 1 is subtracted to get labels between 0 and
        999).

        Additionally, this method splits the Imagenet training set into train
        and validation set. The original ImageNet validation set is used as test
        set.

        The following attributes are added to the class:
            _torch_ds_train: A PyTorch Dataset class representing the training
                set.
            _torch_ds_test: A PyTorch Dataset class representing the validation
                set (corresponds to the dataset in "val_dir").
            _torch_ds_val: A PyTorch Dataset class representing the validation
                set (A subset of the training set).
            _wnid_to_clbl: A dictionary translating WNID to the "common label",
                that is used by data loaders that simply use the "ImageFolder"
                class. For instance, the pretrained ImageNet classifiers in
                the the PyTorch model zoo:
                    https://pytorch.org/docs/stable/torchvision/models.html

        Args:
            See docstring of constructor.
            train_dir: Path to ILSVRC2012 training images.
            val_dir: Path to ILSVRC2012 validation images.
        """
        # Read raw dataset using the PyTorch ImageFolder class.
        train_transform, test_transform = \
            ILSVRC2012Data.torch_input_transforms()
        ds_train = datasets.ImageFolder(train_dir, train_transform)
        ds_test = datasets.ImageFolder(val_dir, test_transform)
        ds_val = None

        ### Translate targets to ILSVRC2012_ID labels.
        wnid2lbl = ds_train.class_to_idx
        # Sanity check.
        assert(len(wnid2lbl.keys()) == len(ds_test.class_to_idx.keys()))
        for k in wnid2lbl.keys():
            assert(k in ds_test.class_to_idx.keys())
            assert(wnid2lbl[k] == ds_test.class_to_idx[k])

        lbl2wnid = {v: k for k, v in wnid2lbl.items()}

        for ds_obj in [ds_train, ds_test]:
            for s in range(len(ds_obj.samples)):
                img_path, lbl = ds_obj.samples[s]
                assert(ds_obj.targets[s] == lbl)

                wnid = lbl2wnid[lbl]
                # We assume a folder structure where images are stored under
                # their corresponding WNID.
                assert(wnid in img_path)

                imid = self._wnid_to_imid[wnid]

                ds_obj.samples[s] = (img_path, imid)
                ds_obj.targets[s] = imid

                assert(ds_obj.imgs[s][1] == imid)

            # The mapping from class name (WNID) to label has changed!
            ds_obj.class_to_idx = self._wnid_to_imid

        ### Split training set into train/val set.
        if num_val_per_class > 0:
            orig_samples = ds_train.samples
            ds_train.samples = None
            ds_train.imgs = None
            ds_train.targets = None

            ds_val = deepcopy(ds_train)
            ds_val.transform = test_transform
            assert(ds_val.target_transform is None)

            num_classes = len(self._imid_to_wnid.keys())
            assert(num_classes == 1000)
            val_counts = np.zeros(num_classes, dtype=np.int)

            ds_train.samples = []
            ds_train.imgs = ds_train.samples
            ds_val.samples = []
            ds_val.imgs = ds_val.samples

            for img_path, img_lbl in orig_samples:
                if val_counts[img_lbl] >= num_val_per_class: # train sample
                    ds_train.samples.append((img_path, img_lbl))
                else: # validation sample
                    val_counts[img_lbl] += 1
                    ds_val.samples.append((img_path, img_lbl))

            ds_train.targets = [s[1] for s in ds_train.samples]
            ds_val.targets = [s[1] for s in ds_val.samples]

            for ds_obj in [ds_train, ds_val]:
                assert(len(ds_obj.samples) == len(ds_obj.imgs) and \
                       len(ds_obj.samples) == len(ds_obj.targets))

        self._torch_ds_train = ds_train
        self._torch_ds_test = ds_test
        self._torch_ds_val = ds_val
        
        self._wnid_to_clbl = wnid2lbl

[docs]    def to_common_labels(self, outputs):
        """Translate between label conventions.

        Translate a given set of labels (that correspond to the
        ``ILSVRC2012_ID`` (minus one) of their images) back to the labels
        provided by the :class:`torchvision.datasets.ImageFolder` class.

        Note:
            This would be the label convention for ImageNet used by
            PyTorch examples.

        Args:
            outputs: Targets (as integers or 1-hot encodings).

        Returns:
            The translated targets (if the targets where given as 1-hot
            encodings, then this method also returns 1-hot encodings).
        """
        is_np = False
        # We don't want to do inplace modifications.
        if isinstance(outputs, np.ndarray):
            is_np = True
            outputs = np.copy(outputs)
        else:
            assert(isinstance(outputs, torch.Tensor))
            outputs = outputs.clone()

        is_1_hot = False
        if len(outputs.shape) == 2 and outputs.shape[1] == self.num_classes:
            if not is_np:
                raise NotImplementedError('Method can\'t deal with 1-hot ' +
                    'encodings provided as Torch tensors yet!')
            is_1_hot = True
            outputs = self._to_one_hot(outputs, reverse=True)

        for i in range(outputs.shape[0]):
            wnid = self._imid_to_wnid[int(outputs[i])]
            outputs[i] = self._wnid_to_clbl[wnid]

        if is_1_hot:
            outputs = self._to_one_hot(outputs, reverse=False)

        return outputs

[docs]    @staticmethod
    def torch_input_transforms():
        """Get data augmentation pipelines for ILSVRC2012 inputs.

        Note, the augmentation is inspired by the augmentation proposed in:
            https://git.io/fjWPZ

        Returns:
            (tuple): Tuple containing:

                - **train_transform**: A transforms pipeline that applies random
                  transformations, normalizes the image and resizes/crops it
                  to a final size of 224 x 224 pixels.
                - **test_transform**: Similar to train_transform, but no random
                  transformations are applied.
        """
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])

        test_transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])

        return train_transform, test_transform

    def _plot_sample(self, fig, inner_grid, num_inner_plots, ind, inputs,
                     outputs=None, predictions=None):
        """Implementation of abstract method
        :meth:`data.dataset.Dataset._plot_sample`.
        
        Note, label ID in the plot correspond to ``ILSVRC2012_ID`` minus 1.
        """
        ax = plt.Subplot(fig, inner_grid[0])

        if outputs is None:
            ax.set_title("ILSVRC2012 Sample")
        else:
            assert(np.size(outputs) == 1)
            label = np.asscalar(outputs)
            label_name = self._imid_to_words[label]

            if predictions is None:
                ax.set_title('Label of shown sample:\n%s (%d)' % \
                             (label_name, label))
            else:
                if np.size(predictions) == self.num_classes:
                    pred_label = np.argmax(predictions)
                else:
                    pred_label = np.asscalar(predictions)
                pred_label_name = self._imid_to_words[pred_label]

                ax.set_title('Label of shown sample:\n%s (%d)' % \
                             (label_name, label) + '\nPrediction: %s (%d)' % \
                             (pred_label_name, pred_label))

        if inputs.size == 1:
            img = self.read_images(inputs)
        else:
            img = inputs

        ax.set_axis_off()
        ax.imshow(np.squeeze(np.reshape(img, self.in_shape)))
        fig.add_subplot(ax)

if __name__ == '__main__':
    pass