import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
print("Version: TensorFlow", tf.__version__)
tf.feature_column is the bridge that maps raw columns in a CSV to the features used to train a TensorFlow model. The workflow is summarized in the following table; a minimal CSV-loading sketch follows it.
feature_column subclass | input data/tensor type | process | output data/tensor type |
---|---|---|---|
numeric_column | float or integer | wrap a regular numeric value into the given dtype | float or integer Dense Tensor |
bucketized_column | Dense Tensor from numeric_column | bucketize by boundaries | integer (one-hot) Dense Tensor |
categorical_column_with_identity | integer (unique key) | encode integer keys | integer (one-hot) Sparse Tensor |
categorical_column_with_vocabulary_list | string | encode each word's index in an in-memory vocabulary list | integer (one-hot) Sparse Tensor |
categorical_column_with_vocabulary_file | string | encode each word's index in a vocabulary file | integer (one-hot) Sparse Tensor |
categorical_column_with_hash_bucket | string, Sparse Tensor | hash-encode words into buckets | integer (approximate one-hot) Sparse Tensor |
crossed_column | string, Sparse Tensor | cross combinations of features | integer (hashed) Sparse Tensor |
indicator_column | Sparse Tensor | wrap a Sparse Tensor into a Dense Tensor | integer Dense Tensor |
embedding_column | Sparse Tensor, Dense Tensor | map a feature from sparse to dense form; dimensionality reduction | float Dense Tensor |
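To make the CSV-to-features step concrete, here is a minimal loading sketch; the file name data.csv, the label column name, and the batch size are illustrative assumptions, and the examples below instead use a small in-memory batch.
# Hypothetical sketch: load a CSV into a tf.data.Dataset that yields
# {column_name: tensor} dicts, which is the format the feature columns below consume.
# 'data.csv', label_name='label', and batch_size=32 are assumptions.
csv_dataset = tf.data.experimental.make_csv_dataset(
    'data.csv', batch_size=32, label_name='label', num_epochs=1)
for feature_batch, label_batch in csv_dataset.take(1):
    print(list(feature_batch.keys()))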
# A utility method to build a feature layer from a feature column and apply it to a batch of data
def demo(feature_column):
    # reference: https://www.tensorflow.org/tutorials/structured_data/feature_columns?hl=zh-cn
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch))
# example batch of data: a dict mapping feature names to value arrays
example_batch = {
    'var_numeric': np.arange(1, 7),
    'var_categorical': np.tile(['A', 'B', 'C'], 2),
}
example_batch
The feature columns fed to a DenseFeatures layer (and hence to the model) must be instances of classes derived from DenseColumn, such as numeric_column, bucketized_column, indicator_column, and embedding_column:
print("numeric_column:")
layer = tf.feature_column.numeric_column("var_numeric", default_value=None)
demo(layer)
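numeric_column also accepts a normalizer_fn for lightweight preprocessing; the min-max style scaling below is only an illustrative assumption about how one might use it, not something the batch above requires:
# Illustrative assumption: scale the raw values 1..6 into [0, 1] via normalizer_fn
# before they reach the model.
scaled = tf.feature_column.numeric_column(
    'var_numeric',
    normalizer_fn=lambda x: (tf.cast(x, tf.float32) - 1.0) / 5.0)
demo(scaled)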
print("bucketized_column:")
boundaries = [2,4]
layer = tf.feature_column.numeric_column('var_numeric', default_value=None)
layer = tf.feature_column.bucketized_column(layer, boundaries=boundaries)
demo(layer)
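The boundaries [2, 4] create three buckets: (-inf, 2), [2, 4), and [4, +inf), and each value is one-hot encoded by its bucket index. Assuming np.digitize with these bins follows the same left-inclusive convention, the expected indices can be checked directly:
# Sanity check (assumes np.digitize matches bucketized_column's left-inclusive
# boundaries): values 1..6 should land in buckets [0 1 1 2 2 2]
print(np.digitize(example_batch['var_numeric'], bins=boundaries))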
print("categorical_column_with_identity:")
layer = tf.feature_column.categorical_column_with_identity("var_numeric", num_buckets=4, default_value=None)
layer = tf.feature_column.indicator_column(layer)
demo(layer)
print("categorical_column_with_vocabulary_list / categorical_column_with_vocabulary_file:")
layer = tf.feature_column.categorical_column_with_vocabulary_list('var_categorical', ['A', 'B', 'C'])
layer = tf.feature_column.indicator_column(layer)
demo(layer)
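categorical_column_with_vocabulary_file behaves the same way but reads the vocabulary from disk; vocab.txt below is a throwaway file written only for this sketch:
# Hypothetical vocab.txt with one category per line, mirroring the list above
with open('vocab.txt', 'w') as f:
    f.write('\n'.join(['A', 'B', 'C']))
layer = tf.feature_column.categorical_column_with_vocabulary_file(
    'var_categorical', vocabulary_file='vocab.txt', vocabulary_size=3)
layer = tf.feature_column.indicator_column(layer)
demo(layer)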
print("categorical_column_with_hash_bucket:")
layer = tf.feature_column.categorical_column_with_hash_bucket('var_categorical', hash_bucket_size=2, dtype=tf.string)
layer = tf.feature_column.indicator_column(layer)
demo(layer)
print("crossed_column:")
col_1 = tf.feature_column.categorical_column_with_vocabulary_list('var_categorical', ['A', 'B', 'C'], dtype=tf.string)
col_2 = tf.feature_column.categorical_column_with_identity('var_numeric', num_buckets=2, default_value=0)
layer = tf.feature_column.crossed_column([col_1,col_2], 16)
layer = tf.feature_column.indicator_column(layer)
demo(layer)
If you have categorical features, you need to wrap them with an indicator_column (or an embedding_column) before passing them to a DenseFeatures layer:
print("indicator_column:")
layer = tf.feature_column.categorical_column_with_hash_bucket('var_categorical', hash_bucket_size=2, dtype=tf.string)
layer = tf.feature_column.indicator_column(layer)
demo(layer)
print("embedding_column:")
layer = tf.feature_column.categorical_column_with_vocabulary_list('var_categorical', ['A', 'B', 'C'], dtype=tf.string)
layer = tf.feature_column.embedding_column(layer, 2, combiner='sqrtn')
demo(layer)
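To close the loop from feature columns back to training a TensorFlow model, here is a minimal sketch that feeds two of the columns above into a Keras model; the toy labels and the training settings are illustrative assumptions:
# Minimal sketch (assumed labels and hyperparameters): DenseFeatures turns the
# feature columns into one dense input vector per example.
feature_columns = [
    tf.feature_column.numeric_column('var_numeric'),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'var_categorical', ['A', 'B', 'C'])),
]
model = keras.Sequential([
    layers.DenseFeatures(feature_columns),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')
labels = np.array([0, 1, 0, 1, 0, 1])  # assumed toy labels, one per example
dataset = tf.data.Dataset.from_tensor_slices((example_batch, labels)).batch(2)
model.fit(dataset, epochs=1)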