import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
print('TensorFlow version:', tf.__version__)
When memory cannot hold the entire dataset together with the computation, the input pipeline has to be carefully designed to avoid overloading memory. The practical solution is to feed only a small batch into the training model at a time.
In this demo, I will be sharing my experience on building tf.data input pipelines for different dataset directory structures, file counts, and data types.
| Dataset directory structure | Case of usage | Data size | Preferable data type |
|---|---|---|---|
| Unsorted | single data file with the entire data | mostly small | structured data |
| Sorted by class | multiple data files | mostly large | unstructured data |
| Sorted by train/test/val split and class | isolated train/test/val data | mostly large | unstructured data |
Time expense comes from two sources: extracting (reading) the data and transforming (mapping) it. The table below summarizes the tf.data methods that reduce these costs by trading CPU or RAM for time.
| Method for improving pipeline capacity | Operation | Resource traded for time | Waiting time saved during data preparation |
|---|---|---|---|
| tf.data.Dataset.interleave(num_parallel_calls=tf.data.AUTOTUNE) | Parallelizing data extraction | CPU | looping over file reads |
| tf.data.Dataset.map(num_parallel_calls=tf.data.AUTOTUNE) | Parallelizing data transformation | CPU | applying the preprocessing mapping |
| tf.data.Dataset.cache() | Storing the result in RAM during the first epoch | RAM | repeating the same preprocessing mapping in later epochs |
| tf.data.Dataset.prefetch(buffer_size=tf.data.AUTOTUNE) | Preparing data for later steps while the current step is running | CPU, RAM | waiting for reading and mapping before training |
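These methods chain directly on a tf.data.Dataset. Below is a minimal sketch of a pipeline that combines all four; the shard file names and the parse_file function are made-up placeholders, not part of the datasets used later in this demo.

# A minimal sketch combining interleave, map, cache and prefetch.
# file_paths and parse_file are hypothetical placeholders for illustration.
import tensorflow as tf

file_paths = ["data/part_0.csv", "data/part_1.csv"]   # assumed shard files

def parse_file(line):
    # assumed per-line preprocessing: decode four float columns from a csv line
    return tf.io.decode_csv(line, record_defaults=[0.0, 0.0, 0.0, 0.0])

dataset = (
    tf.data.Dataset.from_tensor_slices(file_paths)
    # parallelize extraction: read several files concurrently
    .interleave(tf.data.TextLineDataset, num_parallel_calls=tf.data.AUTOTUNE)
    # parallelize transformation: apply the mapping on multiple CPU cores
    .map(parse_file, num_parallel_calls=tf.data.AUTOTUNE)
    # keep the transformed elements in RAM after the first epoch
    .cache()
    .batch(32)
    # prepare the next batches while the current one is training
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)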
| Dataset directory structure | Number of files | API | Input | Special usage |
|---|---|---|---|---|
| unsorted | single | tf.data.Dataset.from_tensors() / tf.data.Dataset.from_tensor_slices() | csv or filepath list | numpy: undistinguished features; pandas: distinguished features |
| unsorted | multiple | tf.data.TextLineDataset() | csv/txt | to quickly create a dataset from text lines |
| unsorted | multiple | tf.data.experimental.make_csv_dataset() | filepath list | to create a dataset with shuffling and batching |
| unsorted | multiple | tf.data.Dataset.list_files() | data folder directory | to create a dataset of all files matching a pattern |
| sorted | multiple | tf.keras.preprocessing.image.ImageDataGenerator().flow_from_directory() / tf.keras.preprocessing.text_dataset_from_directory() | data folder directory | to create a dataset whose labels are generated from sub-directory names, with data augmentation |
| sorted | multiple | tf.data.Dataset.from_generator() | data generator | to create a dataset with high customizability using Python logic |
Unsorted:

.
├── data
├── code.py
└── model
Sorted by class:

.
├── data
│   ├── label_1
│   ├── label_2
│   └── label_3
├── code.py
└── model
Sorted by train/test/validation split and class:

.
├── data
│   ├── train
│   │   ├── label_1
│   │   ├── label_2
│   │   └── label_3
│   ├── test
│   │   ├── label_1
│   │   ├── label_2
│   │   └── label_3
│   └── validation
│       ├── label_1
│       ├── label_2
│       └── label_3
├── code.py
└── model
Class tf.data contains the following sub-classes:

tf.data.TFRecordDataset
: TFRecord is TensorFlow's dataset storage format. When we organize a dataset into the TFRecord format, TensorFlow can read and process it efficiently, which helps us conduct large-scale model training.

tf.data.TextLineDataset
: Provides an easy way to extract lines from one or more text files. Each line of text is one element, a string-type tensor.

tf.data.Dataset
: A higher-level wrapper for the data input pipeline. An instance of this class can be seen as an ordered, iterable list of "elements" of the same type. Its main construction methods are:

tf.data.Dataset.from_tensors()
tf.data.Dataset.from_tensor_slices()
tf.data.Dataset.from_generator()
tf.data.Dataset.list_files()
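TFRecordDataset is not demonstrated in the cells below, so here is a minimal sketch of writing one record and reading it back; the file name demo.tfrecord and the feature key 'value' are made up for illustration.

# Write one serialized tf.train.Example, then read it back with TFRecordDataset.
# 'demo.tfrecord' and the feature key 'value' are illustrative placeholders.
import tensorflow as tf

with tf.io.TFRecordWriter('demo.tfrecord') as writer:
    example = tf.train.Example(features=tf.train.Features(feature={
        'value': tf.train.Feature(float_list=tf.train.FloatList(value=[1.0, 2.0, 3.0]))
    }))
    writer.write(example.SerializeToString())

def parse(serialized):
    # describe the expected layout of each serialized example
    return tf.io.parse_single_example(
        serialized, {'value': tf.io.FixedLenFeature([3], tf.float32)})

dataset = tf.data.TFRecordDataset(['demo.tfrecord']).map(parse)
for record in dataset.take(1):
    print(record['value'])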
class tf.keras.preprocessing
contains three higher-level ETL APIs that build tf.data.Dataset objects before the data is fed to the model:

text_dataset_from_directory()
: Generates a tf.data.Dataset from text files in a directory.

image_dataset_from_directory()
: Generates a tf.data.Dataset from image files in a directory.

timeseries_dataset_from_array()
: Creates a dataset of sliding windows over a timeseries provided as an array. (Not included in this demo)

To cover examples of the various combinations of dataset directory structure, number of files, and data type, I will be using the following open-source datasets.
| Dataset directory structure | Number of files | Data type | Dataset | File type | Number of samples (train/test) | Number of attributes (tensor shape) | Number of classes |
|---|---|---|---|---|---|---|---|
| unsorted | single | structured | Iris Data Set | .csv | 150 | (4,) | 3 |
| unsorted | single | timeseries | House_Property_Sales | .csv | 29581 | (3,) | numeric |
| unsorted | single | text | Corporate_messaging_DFE | .csv | 3119 | (9,) | 4 |
| unsorted | multiple | audio | free_spoken_digit_dataset | .wav | 3000 | no identical length | 10 |
| sorted | multiple | text | stack_overflow_16k | .txt | 8000/8000 | no identical length | 4 |
| sorted | multiple | image | MNIST database of handwritten digits | .png | 60,000/10,000 | (28, 28, 1) | 10 |
reference: 30天吃掉那只TensorFlow2 (Eat That TensorFlow2 in 30 Days)
# unsorted / single file / structured data: Iris as a NumPy array (undistinguished features)
df = pd.read_csv('dataset/Iris/Iris.csv').to_numpy()
X = df[:, 1:5].astype("float32")   # the four measurement columns as features
y = df[:, -1]                      # the species column as label
dataset = tf.data.Dataset.from_tensor_slices((X, y))
for features, label in dataset.take(1):
    print('Feature: \n', features, '\n')
    print('label: \n', label)
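For comparison, from_tensors() (listed in the API table but not used elsewhere in this demo) wraps the whole array into a single element instead of slicing it along the first axis; a quick check using the X array from the cell above:

# from_tensor_slices yields 150 elements of shape (4,);
# from_tensors yields a single element holding the whole (150, 4) array.
whole = tf.data.Dataset.from_tensors(X)
sliced = tf.data.Dataset.from_tensor_slices(X)
print(whole.element_spec)    # TensorSpec(shape=(150, 4), dtype=tf.float32, ...)
print(sliced.element_spec)   # TensorSpec(shape=(4,), dtype=tf.float32, ...)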
# the same Iris data, but with named (distinguished) features via pandas
df = pd.read_csv('dataset/Iris/Iris.csv')
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].to_dict("list")
y = df[['Species']]
dataset = tf.data.Dataset.from_tensor_slices((X, y))
for features, label in dataset.take(1):
    print('Feature: \n', features, '\n')
    print('label: \n', label)
# make_csv_dataset reads one or more csv files and returns a shuffled, batched dataset
dataset = tf.data.experimental.make_csv_dataset(
    file_pattern=["dataset/Iris/Iris.csv", "dataset/Iris/Iris.csv"],
    select_columns=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species'],
    label_name="Species",
    batch_size=1,
    na_value="",
    num_epochs=1,
    ignore_errors=True)
for features, label in dataset.take(1):
    print('Feature: \n', features, '\n')
    print('label: \n', label)
# TextLineDataset reads each line of the csv as a string tensor
dataset = tf.data.TextLineDataset(
    filenames=["Dataset/Corporate_messaging_DFE/Corporate_messaging_DFE.csv"]
).skip(1)  # skip the header line
for line in dataset.take(1):
    print('line: \n', line, '\n')
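The free_spoken_digit_dataset from the dataset table has no dedicated cell below, so here is a minimal sketch of reading it with tf.data.Dataset.list_files; the directory path is an assumption about where the .wav files are stored.

# List every .wav file matching the pattern, then decode each file.
# The path './Dataset/free_spoken_digit_dataset/*.wav' is an assumed location.
def load_wav(path):
    # read the raw bytes and decode them into a float waveform tensor
    audio, sample_rate = tf.audio.decode_wav(tf.io.read_file(path))
    return audio

dataset = tf.data.Dataset.list_files('./Dataset/free_spoken_digit_dataset/*.wav')
dataset = dataset.map(load_wav, num_parallel_calls=tf.data.AUTOTUNE)
for audio in dataset.take(1):
    print('Audio shape: \n', audio.shape)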
dataset = tf.keras.preprocessing.text_dataset_from_directory(
    './Dataset/stack_overflow_16k/train',
    batch_size=1,
    validation_split=0.2,
    subset='training',
    seed=42)
for features, label in dataset.take(1):
    print('Feature: \n', features, '\n')
    print('label: \n', label)
dataset = tf.keras.preprocessing.image_dataset_from_directory(
    './Dataset/mnist_png/training',
    batch_size=1,
    image_size=(28, 28),
    color_mode="grayscale",
    label_mode='categorical',
    subset='training',
    validation_split=0.2,
    seed=42)
for features, label in dataset.take(1):
    print('Feature shape: \n', features.shape, '\n')
    print('label: \n', label)
# ImageDataGenerator.flow_from_directory yields rescaled image batches labelled by sub-directory
image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1.0/255).flow_from_directory(
    "./Dataset/mnist_png/training",
    target_size=(28, 28),
    batch_size=5,
    color_mode="grayscale",
    class_mode='categorical')
# model.fit(image_generator)
# for demo purposes, wrap the generator so tf.data.Dataset.from_generator can consume it
def generator():
    for features, label in image_generator:
        yield (features, label)

dataset = tf.data.Dataset.from_generator(generator, output_types=(tf.float32, tf.int32))
for features, label in dataset.take(1):
    print('Feature shape: \n', features.shape, '\n')
    print('label: \n', label)
reference: https://bbs.cvmart.net/topics/1545
import os, pathlib, cv2, random
from tensorflow.keras.utils import Sequence  # public API path for the Sequence base class
from sklearn.preprocessing import OneHotEncoder
class ImageDataFeeder(Sequence):
    def __init__(self, filepath, batch_size=8, imgshape=(28, 28),
                 n_channels=3, n_classes=13, shuffle=True):
        # store configuration and collect all .png paths under the data folder
        self.filepath = filepath
        self.pathlist = [str(pathlib.Path(path)) for path in pathlib.Path(self.filepath).rglob('*.png')]
        self.batch_size = batch_size
        self.imgshape = imgshape
        self.n_channels = n_channels
        self.label_dict = self._generate_label_dict(self.filepath)
        self.shuffle = shuffle
        self.on_epoch_end()

    def __getitem__(self, index):
        # generate the sample indexes of one batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        # collect the file paths of the batch
        batch_pathlist = [self.pathlist[k] for k in indexes]
        # generate data
        images = self._generate_images(batch_pathlist)
        labels = self._generate_labels(batch_pathlist)
        return images, labels

    def __len__(self):
        # return the number of batches per epoch
        return int(np.floor(len(self.pathlist) / self.batch_size))

    def _generate_label_dict(self, filepath):
        # map each sub-directory name (class label) to a one-hot vector
        label_names = sorted(set(os.listdir(filepath)))
        label_to_onehot = OneHotEncoder(sparse=False)
        label_dict = dict(zip(label_names,
                              label_to_onehot.fit_transform(np.array(label_names).reshape(-1, 1))))
        return label_dict

    def _load_image(self, image_path):
        def gasuss_noise(image, mean=0, var=0.01):
            # add Gaussian noise and clip back into the valid range
            noise = np.random.normal(mean, var ** 0.5, image.shape)
            out = image + noise
            if out.min() < 0:
                low_clip = -1.
            else:
                low_clip = 0.
            out = np.clip(out, low_clip, 1.0)
            return out

        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE) / 255  # grayscale, 1 channel
        img = cv2.flip(img, flipCode=random.randint(-1, 1))       # random flip
        # img = rotate(img, angle=random.randint(-5, 5), mode='reflect')  # rotate
        img = cv2.warpAffine(img,                                  # random translation
                             M=np.float32([[1, 0, random.randint(-28, 28)],
                                           [0, 1, random.randint(-28, 28)]]),
                             dsize=img.shape)
        img = gasuss_noise(img, var=random.randint(1, 10) / 1000)
        if self.imgshape != img.shape:
            img = cv2.resize(img, self.imgshape)
        return np.expand_dims(img, -1)

    def _generate_images(self, batch_pathlist):
        # generate the images of one batch
        images = np.empty((self.batch_size, *self.imgshape, self.n_channels))
        for i, path in enumerate(batch_pathlist):
            images[i,] = self._load_image(path)
        return images

    def _generate_labels(self, batch_pathlist):
        # generate the one-hot labels of one batch from the parent directory name
        labels = np.empty((self.batch_size, len(self.label_dict)), dtype=int)
        for i, path in enumerate(batch_pathlist):
            labels[i,] = self.label_dict.get(pathlib.Path(path).parent.name)
        return labels

    def on_epoch_end(self):
        # reshuffle the sample order at the end of each epoch
        self.indexes = np.arange(len(self.pathlist))
        if self.shuffle:
            np.random.shuffle(self.indexes)
# Parameters
params = {'batch_size': 5,
          'n_channels': 1,
          'n_classes': 10,
          'shuffle': True,
          'imgshape': (28, 28)}
image_generator = ImageDataFeeder('./Dataset/mnist_png/training', **params)
# model.fit(image_generator)
# for demo purposes, wrap the Sequence in a plain generator and build a tf.data.Dataset from it
def generator():
    for features, label in image_generator:
        yield (features, label)

dataset = tf.data.Dataset.from_generator(generator, output_types=(tf.float32, tf.int32))
for features, label in dataset.take(1):
    print('Feature shape: \n', features.shape, '\n')
    print('label: \n', label)
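Whichever construction method is used, the resulting tf.data.Dataset (or Sequence) plugs directly into model.fit. As a wrap-up, here is a minimal sketch that rebuilds the Iris dataset from the first cell and trains a toy classifier on it; the layer sizes and hyper-parameters are arbitrary illustrative choices.

# Train a toy classifier on the Iris dataset used at the beginning of the demo.
# Layer sizes and hyper-parameters here are arbitrary illustrative choices.
df = pd.read_csv('dataset/Iris/Iris.csv')
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].to_numpy().astype('float32')
y = df['Species'].astype('category').cat.codes.to_numpy()   # species names -> integer ids

train_ds = (tf.data.Dataset.from_tensor_slices((X, y))
            .shuffle(len(X), seed=42)
            .batch(16)
            .prefetch(tf.data.AUTOTUNE))

model = keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=(4,)),
    layers.Dense(3, activation='softmax')])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(train_ds, epochs=5)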