# Source code for hypnettorch.data.timeseries.audioset_data

#!/usr/bin/env python3
# Copyright 2020 Benjamin Ehret
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# title           :data/timeseries/audioset_data.py
# author          :be
# contact         :behret@ethz.ch
# created         :02/04/2020
# version         :1.0
# python_version  :3.7
"""
Dataset for the Audioset task
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A data handler for the audioset dataset taken from:

    https://research.google.com/audioset/download.html

Data were preprocessed with the script
:mod:`data.timeseries.structure_audioset` and then uploaded to
`dropbox <https://www.dropbox.com/s/07dfeeuf5aq4w1h/\
audioset_data_balanced?dl=1>`__. If this link becomes invalid, the data has to
be preprocessed from scratch.
"""
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
import urllib.request

from hypnettorch.data.sequential_dataset import SequentialDataset

class AudiosetData(SequentialDataset):
    """Datahandler for the audioset task.

    Args:
        data_path (str): Where should the dataset be read from? If not
            existing, the dataset will be downloaded into this folder.
        use_one_hot (bool): Whether the class labels should be represented in
            a one-hot encoding.
        validation_size (int): The number of validation samples.
        target_per_timestep (bool, optional): If activated, the one-hot
            encoding of the current image will be copied across the entire
            sequence. Else, there is a single target for the entire sequence
            (rather than one per timestep).
        rseed (int, optional): If ``None``, the current random state of numpy
            is used to select a validation set from the training data.
            Otherwise, a new random state with the given seed is generated.
    """
    def __init__(self, data_path, use_one_hot=True, validation_size=0,
                 target_per_timestep=True, rseed=None):
        super().__init__()

        self.target_per_timestep = target_per_timestep

        # Use a dedicated random state when a seed is given, so the
        # train/validation split is reproducible.
        if rseed is not None:
            rstate = np.random.RandomState(rseed)
        else:
            rstate = np.random

        # If dataset does not exist in dataset folder, download it from
        # dropbox.
        # FIXME Dropbox link might become invalid in the near future.
        data_path = os.path.join(data_path,
            'sequential/audioset/audioset_data_balanced.pickle')
        if not os.path.exists(data_path):
            data_dir = os.path.dirname(data_path)
            if not os.path.exists(data_dir):
                os.makedirs(data_dir)

            url = 'https://www.dropbox.com/s/07dfeeuf5aq4w1h/' + \
                'audioset_data_balanced?dl=1'
            try:
                u = urllib.request.urlopen(url)
                data = u.read()
                u.close()
            except Exception as e:
                # Chain the original exception so the download failure cause
                # stays visible (previously a bare `except:` discarded it).
                raise RuntimeError('Audioset data cannot be downloaded. ' +
                    'If you are working on the cluster, please manually ' +
                    'copy the pickled dataset into the following location: '
                    '%s. ' % (data_path) + 'If the dropbox link (%s) ' % url +
                    'is invalid, please rebuild the dataset using the script ' +
                    '"preprocess_audioset.py".') from e
            with open(data_path, "wb") as f:
                f.write(data)

        # Load data.
        with open(data_path, 'rb') as f:
            data = pickle.load(f)
        [x_data, y_data, train_ind, test_ind] = data

        # Specify internal data structure.
        self._data['sequence'] = True
        self._data['classification'] = True
        self._data['num_classes'] = 100
        self._data['in_shape'] = [128]
        self._data['out_shape'] = [self._data['num_classes'] \
            if use_one_hot else 1]
        self._data['num_time_steps'] = 10
        self._data['is_one_hot'] = use_one_hot
        self._data['in_data'] = self._flatten_array(x_data)
        self._data['out_data'] = \
            self._structure_output_data(y_data.reshape(-1, 1))
        if use_one_hot:
            self._data['out_data'] = self._to_one_hot(self._data['out_data'])

        if not (validation_size >= 0 and validation_size < len(train_ind)):
            raise ValueError('Invalid validation set size.')
        if validation_size > 0:
            # Note, the data is not shuffled! I.e., consecutive indices belong
            # to the same class.
            train_ind, val_ind = train_test_split(train_ind,
                test_size=validation_size, shuffle=True, random_state=rstate,
                stratify=y_data[train_ind])
            self._data['val_inds'] = val_ind

        self._data['train_inds'] = train_ind
        self._data['test_inds'] = test_ind

        # Note, all sequences in this dataset have the same length.
        num_samples = self._data['in_data'].shape[0]
        self._data['in_seq_lengths'] = np.ones(num_samples) * \
            self._data['num_time_steps']
        if target_per_timestep:
            self._data['out_seq_lengths'] = self._data['in_seq_lengths']
        else:
            self._data['out_seq_lengths'] = \
                np.ones_like(self._data['in_seq_lengths'])

    def _structure_output_data(self, out_data):
        """Restructures the sample output data to the format expected by the
        dataset class.

        The task has one global target (for all timesteps), given as a one hot
        encoding. However this can be changed using the option
        `target_per_timestep`.

        Args:
            out_data (list): List of length ``n_samples`` (total number of
                samples in the dataset). Each sample is a 1D array of size
                ``[10]``.

        Returns:
            (numpy.ndarray): Numpy array of shape
            ``[n_samples, self._data['out_shape']]``.
        """
        out_mat = out_data
        if self.target_per_timestep:
            # Repeat the per-sample target across all timesteps. Note,
            # `np.tile` replaces the former `np.matlib.repmat` call:
            # `numpy.matlib` is deprecated and was never imported here
            # (`import numpy as np` does not expose `np.matlib`).
            out_mat = np.tile(np.asarray(out_mat),
                              (1, self._data['num_time_steps']))

        return out_mat

    def _plot_config(self, inputs, outputs=None, predictions=None):
        """Defines properties, used by the method :meth:`plot_samples`.

        This method can be overwritten, if these configs need to be different
        for a certain dataset.

        Args:
            (....): See docstring of method
                :meth:`data.dataset.Dataset._plot_config`.

        Returns:
            (dict): A dictionary with the plot configs.
        """
        plot_configs = super()._plot_config(inputs, outputs=outputs,
                                            predictions=predictions)

        # One extra inner row per additional quantity to display.
        if outputs is not None:
            plot_configs['num_inner_rows'] += 1
        if predictions is not None:
            plot_configs['num_inner_rows'] += 1
        plot_configs['num_inner_plots'] = plot_configs['num_inner_rows']

        return plot_configs

    def _plot_sample(self, fig, inner_grid, num_inner_plots, ind, inputs,
                     outputs=None, predictions=None, is_one_hot=None):
        """Implementation of abstract method
        :meth:`data.dataset.Dataset._plot_sample`.

        Args:
            (....): See docstring of method
                :meth:`data.dataset.Dataset._plot_sample`.
            is_one_hot (bool, optional): Whether ``outputs`` and
                ``predictions`` are provided as 1-hot encodings. If not
                specified, we will assume the value specified by attribute
                :attr:`data.dataset.Dataset.is_one_hot`.
        """
        if is_one_hot is None:
            is_one_hot = self.is_one_hot

        # Bring the data into a proper form.
        x = self._flatten_array(inputs, ts_dim_first=True, reverse=True,
                                feature_shape=self.in_shape)

        pdata = [x]
        plabel = ['inputs']
        if outputs is not None:
            # We want to display outputs as one-hot encoding.
            raise NotImplementedError
            # Dead code below kept as a sketch for a future implementation:
            # t = ...
            # pdata.append(t)
            # plabel.append('outputs')
        if predictions is not None:
            fs = [self.num_classes] if is_one_hot else [1]
            y = self._flatten_array(predictions, ts_dim_first=True,
                                    reverse=True, feature_shape=fs)
            pdata.append(y)
            plabel.append('predictions')

        for i, d in enumerate(pdata):
            ax = plt.Subplot(fig, inner_grid[i])
            # Note, we can't use `set_axis_off`, if we wanna keep the y-label.
            ax.set_ylabel(plabel[i])
            ax.set_xticks([])
            ax.set_yticks([])
            ax.imshow(d.squeeze().transpose())
            fig.add_subplot(ax)
[docs] def get_identifier(self): """Returns the name of the dataset.""" return 'audioset'
def __str__(self): """Print major characteristics of the current dataset.""" return 'Data handler for the audioset dataset.'
# Module is a library; nothing to run when executed as a script.
if __name__ == '__main__':
    pass