# EpyNN/epynn/embedding/dataset.py
# Standard library imports
import warnings
# Related third party imports
import numpy as np
# Local application/library specific imports
from epynn.commons.io import (
encode_dataset,
scale_features,
index_elements_auto,
)
from epynn.commons.models import dataSet
def embedding_check(X_data, Y_data=None, X_scale=False):
    """Convert sample features and label to arrays, optionally scaling features.

    :param X_data: Set of sample features.
    :type X_data: list[list] or :class:`numpy.ndarray`

    :param Y_data: Set of samples label.
    :type Y_data: list[list[int] or int] or :class:`numpy.ndarray`, optional

    :param X_scale: Set to True to pass features through
        :func:`epynn.commons.io.scale_features` before conversion.
    :type X_scale: bool, optional

    :return: Sample features and label, each wrapped by :func:`numpy.array`.
    :rtype: tuple[:class:`numpy.ndarray`]
    """
    # Scaling, when requested, happens before the array conversion
    features = scale_features(X_data) if X_scale else X_data

    # Note: the label is converted even when left to its None default
    return np.array(features), np.array(Y_data)
def embedding_encode(layer, X_data, Y_data, X_encode, Y_encode):
    """One-hot encoding for samples features and label.

    :param layer: An instance of the :class:`epynn.embedding.models.Embedding`
    :type layer: :class:`epynn.embedding.models.Embedding`

    :param X_data: Set of sample features.
    :type X_data: list[list] or :class:`numpy.ndarray`

    :param Y_data: Set of samples label.
    :type Y_data: list[list[int] or int] or :class:`numpy.ndarray`

    :param X_encode: Set to True to one-hot encode features.
    :type X_encode: bool

    :param Y_encode: Set to True to one-hot encode labels.
    :type Y_encode: bool

    :return: Encoded set of sample features (if ``X_encode``) and encoded
        set of sample label (if ``Y_encode``); inputs are returned unchanged
        otherwise.
    :rtype: tuple[:class:`numpy.ndarray`]
    """
    # Features one-hot encoding
    if X_encode:
        # The element/index lookups and vocabulary size are kept on the layer
        layer.e2i, layer.i2e, layer.d['e'] = index_elements_auto(X_data)
        X_data = encode_dataset(X_data, layer.e2i, layer.d['e'])

    # Label one-hot encoding
    if Y_encode:
        labels = np.asarray(Y_data)

        # Map every label to the rank of its value among the sorted distinct
        # labels. For labels already equal to 0..K-1 this mapping is the
        # identity; for any other label set (e.g. {0, 2} or {1, 2}) it avoids
        # indexing the identity matrix out of bounds with the raw values.
        classes, class_index = np.unique(labels.ravel(), return_inverse=True)

        # Restore the original label layout before indexing so the output
        # shape is labels.shape + (number of classes,)
        class_index = class_index.reshape(labels.shape)

        Y_data = np.eye(len(classes))[class_index]

    return X_data, Y_data
def embedding_prepare(layer, X_data, Y_data):
    """Split paired features and label into training, validation and testing sets.

    :param layer: An instance of the :class:`epynn.embedding.models.Embedding`
    :type layer: :class:`epynn.embedding.models.Embedding`

    :param X_data: Set of sample features.
    :type X_data: list[list] or :class:`numpy.ndarray`

    :param Y_data: Set of samples label.
    :type Y_data: list[list[int] or int] or :class:`numpy.ndarray`

    :return: Training, validation and testing sets, in that order.
    :rtype: tuple[:class:`epynn.commons.models.dataSet`]
    """
    # Settings driving the split are read from the layer
    settings = layer.se_dataset

    # One (features, label) pair per sample
    pairs = list(zip(X_data, Y_data))

    train_pairs, val_pairs, test_pairs = split_dataset(pairs, settings)

    # Un-pair each subset back into separate features and label tuples
    X_train, Y_train = zip(*train_pairs)

    # Validation and testing subsets may be empty: fall back to empty tuples
    if val_pairs:
        X_val, Y_val = zip(*val_pairs)
    else:
        X_val, Y_val = (), ()

    if test_pairs:
        X_test, Y_test = zip(*test_pairs)
    else:
        X_test, Y_test = (), ()

    # Wrap each subset into a named dataSet object
    return (
        dataSet(X_data=X_train, Y_data=Y_train, name='dtrain'),
        dataSet(X_data=X_val, Y_data=Y_val, name='dval'),
        dataSet(X_data=X_test, Y_data=Y_test, name='dtest'),
    )
def split_dataset(dataset, se_dataset):
    """Split dataset in training, validation and testing sets.

    Training and validation sizes are the rounded share of the dataset given
    by their relative weight; the testing set receives whatever remains after
    the first two slices.

    :param dataset: Dataset containing sample features and label
    :type dataset: tuple[list or :class:`numpy.ndarray`]

    :param se_dataset: Settings for sets preparation, read through the keys
        ``dtrain_relative``, ``dval_relative`` and ``dtest_relative``.
    :type se_dataset: dict[str: int]

    :return: Training, validation and testing sets, in that order.
    :rtype: tuple[list]
    """
    train_weight = se_dataset['dtrain_relative']
    val_weight = se_dataset['dval_relative']
    test_weight = se_dataset['dtest_relative']

    total_weight = train_weight + val_weight + test_weight
    n_samples = len(dataset)

    # Slice boundaries: end of training set, then end of validation set
    train_end = round(train_weight / total_weight * n_samples)
    val_end = train_end + round(val_weight / total_weight * n_samples)

    return dataset[:train_end], dataset[train_end:val_end], dataset[val_end:]
def mini_batches(layer):
    """Shuffle and divide dataset in batches for each training epoch.

    ``layer.dtrain_zip`` is shuffled in place, truncated to a whole number
    of batches and split into equally sized batches.

    :param layer: An instance of the :class:`epynn.embedding.models.Embedding`
    :type layer: :class:`epynn.embedding.models.Embedding`

    :return: Batches made from dataset with respect to batch_size
    :rtype: list[:class:`epynn.commons.models.dataSet`]
    """
    # Retrieve pair-wise features-label training set and batch size
    dtrain_zip = layer.dtrain_zip
    batch_size = layer.se_dataset['batch_size']

    # Shuffle dataset (in place)
    if hasattr(layer, 'np_rng'):
        # NumPy 2.0 no longer exposes VisibleDeprecationWarning in the main
        # namespace; it lives in numpy.exceptions (available since 1.25).
        # Fall back to the legacy location for older NumPy releases.
        try:
            category = np.exceptions.VisibleDeprecationWarning
        except AttributeError:
            category = np.VisibleDeprecationWarning

        # Installs a process-wide "ignore" filter for this warning category
        warnings.filterwarnings("ignore", category=category)
        layer.np_rng.shuffle(dtrain_zip)
    else:
        np.random.shuffle(dtrain_zip)

    # Compute number of batches w.r.t. batch_size
    if not batch_size:
        # A falsy batch_size means a single batch holding the whole set
        batch_size = len(dtrain_zip)

    n_batch = len(dtrain_zip) // batch_size

    if not n_batch:
        # batch_size exceeds the training set size: keep one batch
        n_batch = 1

    # Drop trailing samples so that np.split results in equal division
    dtrain_zip = dtrain_zip[: n_batch * batch_size]

    X_train, Y_train = zip(*dtrain_zip)

    X_train = np.split(np.array(X_train), n_batch, axis=0)
    Y_train = np.split(np.array(Y_train), n_batch, axis=0)

    # Set into dataSet objects named after their batch index
    batch_dtrain = [dataSet(X_data=X_batch, Y_data=Y_batch, name=str(i))
                    for i, (X_batch, Y_batch) in enumerate(zip(X_train, Y_train))]

    return batch_dtrain