Commit 921da9f (0 parents): 115 changed files with 4,853 additions and 0 deletions.
@@ -0,0 +1,5 @@
'''
# Time : 2020/12/9 16:22
# Author : junchaoli
# File : __init__.py
'''
@@ -0,0 +1,88 @@
'''
# Time : 2020/12/9 16:31
# Author : junchaoli
# File : layer.py
'''

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Embedding, Dense

class Interaction_layer(Layer):
    '''
    Pair-wise interaction layer.
    # input shape:  [None, field, k]
    # output shape: [None, field*(field-1)/2, k]
    '''
    def __init__(self):
        super().__init__()

    def call(self, inputs, **kwargs):   # [None, field, k]
        if K.ndim(inputs) != 3:
            raise ValueError("Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs)))

        # Element-wise product of every pair of field embeddings
        element_wise_product_list = []
        for i in range(inputs.shape[1]):
            for j in range(i+1, inputs.shape[1]):
                element_wise_product_list.append(tf.multiply(inputs[:, i], inputs[:, j]))   # [t, None, k]
        element_wise_product = tf.transpose(tf.convert_to_tensor(element_wise_product_list), [1, 0, 2])  # [None, t, k]
        return element_wise_product

class Attention_layer(Layer):
    '''
    Attention-based pooling over the interaction vectors.
    # input shape:  [None, n, k]
    # output shape: [None, k]
    '''
    def __init__(self):
        super().__init__()

    def build(self, input_shape):       # [None, field, k]
        self.attention_w = Dense(input_shape[1], activation='relu')
        self.attention_h = Dense(1, activation=None)
        super().build(input_shape)

    def call(self, inputs, **kwargs):   # [None, field, k]
        if K.ndim(inputs) != 3:
            raise ValueError("Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs)))

        x = self.attention_w(inputs)    # [None, field, field]
        x = self.attention_h(x)         # [None, field, 1]
        # Normalize over the interaction axis (axis=1); the default axis=-1
        # has size 1 here and would make every attention score exactly 1.0
        a_score = tf.nn.softmax(x, axis=1)
        a_score = tf.transpose(a_score, [0, 2, 1])  # [None, 1, field]
        output = tf.reshape(tf.matmul(a_score, inputs), shape=(-1, inputs.shape[2]))  # (None, k)
        return output

class AFM_layer(Layer):
    def __init__(self, feature_columns, mode):
        super(AFM_layer, self).__init__()
        self.dense_feature_columns, self.sparse_feature_columns = feature_columns
        self.mode = mode
        self.embed_layer = {"emb_"+str(i): Embedding(feat['feat_onehot_dim'], feat['embed_dim'])
                            for i, feat in enumerate(self.sparse_feature_columns)}
        self.interaction_layer = Interaction_layer()
        if self.mode == 'att':
            self.attention_layer = Attention_layer()
        self.output_layer = Dense(1)

    def call(self, inputs, **kwargs):
        if K.ndim(inputs) != 2:
            raise ValueError("Unexpected inputs dimensions %d, expect to be 2 dimensions" % (K.ndim(inputs)))

        # First 13 columns are dense features, the remaining 26 are sparse
        dense_inputs, sparse_inputs = inputs[:, :13], inputs[:, 13:]
        embed = [self.embed_layer['emb_'+str(i)](sparse_inputs[:, i])
                 for i in range(sparse_inputs.shape[1])]    # list
        embed = tf.convert_to_tensor(embed)
        embed = tf.transpose(embed, [1, 0, 2])              # [None, 26, k]

        # Pair-wise interaction
        embed = self.interaction_layer(embed)

        if self.mode == 'avg':
            x = tf.reduce_mean(embed, axis=1)   # (None, k)
        elif self.mode == 'max':
            x = tf.reduce_max(embed, axis=1)    # (None, k)
        else:
            x = self.attention_layer(embed)     # (None, k)

        output = tf.nn.sigmoid(self.output_layer(x))
        return output
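A quick shape check of the two layers above (a minimal sketch; `batch`, `field`, and `k` are illustrative values, not part of the commit):

    import tensorflow as tf

    batch, field, k = 4, 5, 8
    x = tf.random.normal((batch, field, k))
    pairs = Interaction_layer()(x)      # [4, 10, 8]: field*(field-1)/2 = 10 pairs
    pooled = Attention_layer()(pairs)   # [4, 8]: attention-weighted sum over the 10 pairs
    print(pairs.shape, pooled.shape)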
@@ -0,0 +1,17 @@
'''
# Time : 2020/12/9 17:25
# Author : junchaoli
# File : model.py
'''

from layer import AFM_layer
from tensorflow.keras.models import Model

class AFM(Model):
    def __init__(self, feature_columns, mode):
        super().__init__()
        self.afm_layer = AFM_layer(feature_columns, mode)

    def call(self, inputs, training=None, mask=None):
        output = self.afm_layer(inputs)
        return output
@@ -0,0 +1,43 @@
'''
# Time : 2020/12/9 17:28
# Author : junchaoli
# File : train.py
'''

from model import AFM
from utils import create_criteo_dataset

import tensorflow as tf
from tensorflow.keras import optimizers, losses
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    file = 'E:\\PycharmProjects\\推荐算法\\data\\criteo_sample.txt'
    test_size = 0.2
    feature_columns, (X_train, y_train), (X_test, y_test) = \
        create_criteo_dataset(file, test_size=test_size)

    model = AFM(feature_columns, 'att')
    optimizer = optimizers.SGD(0.01)

    # dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    # dataset = dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)
    #
    # model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    # model.fit(dataset, epochs=100)
    # pre = model.predict(X_test)

    summary = tf.summary.create_file_writer('E:\\PycharmProjects\\tensorboard')
    for i in range(100):
        with tf.GradientTape() as tape:
            pre = model(X_train)
            # squeeze [None, 1] -> [None] so predictions align with y_train
            loss = tf.reduce_mean(losses.binary_crossentropy(y_train, tf.squeeze(pre, axis=-1)))
            print(loss.numpy())
        # with summary.as_default():
        #     tf.summary.scalar('loss', loss, i)
        grad = tape.gradient(loss, model.variables)
        optimizer.apply_gradients(grads_and_vars=zip(grad, model.variables))
    pre = model(X_test)

    pre = [1 if x > 0.5 else 0 for x in pre]
    print("Accuracy: ", accuracy_score(y_test, pre))
@@ -0,0 +1,40 @@
'''
# Time : 2020/12/9 20:53
# Author : junchaoli
# File : utils.py
'''

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

def sparseFeature(feat, feat_onehot_dim, embed_dim):
    return {'feat': feat, 'feat_onehot_dim': feat_onehot_dim, 'embed_dim': embed_dim}

def denseFeature(feat):
    return {'feat': feat}

def create_criteo_dataset(file_path, embed_dim=8, test_size=0.2):
    data = pd.read_csv(file_path)

    dense_features = ['I' + str(i) for i in range(1, 14)]
    sparse_features = ['C' + str(i) for i in range(1, 27)]

    # Fill missing values
    data[dense_features] = data[dense_features].fillna(0)
    data[sparse_features] = data[sparse_features].fillna('-1')

    # Min-max normalization of the dense features
    data[dense_features] = MinMaxScaler().fit_transform(data[dense_features])
    # Label-encode the sparse features
    for col in sparse_features:
        data[col] = LabelEncoder().fit_transform(data[col]).astype(int)

    feature_columns = [[denseFeature(feat) for feat in dense_features]] + \
                      [[sparseFeature(feat, data[feat].nunique(), embed_dim) for feat in sparse_features]]

    X = data.drop(['label'], axis=1).values
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    return feature_columns, (X_train, y_train), (X_test, y_test)
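A quick sanity check of the helper above (a sketch; 'criteo_sample.txt' stands in for any Criteo-format CSV with a `label` column):

    feature_columns, (X_train, y_train), (X_test, y_test) = create_criteo_dataset('criteo_sample.txt')
    dense_cols, sparse_cols = feature_columns
    print(len(dense_cols), len(sparse_cols))  # 13 dense, 26 sparse feature dicts
    print(sparse_cols[0])                     # e.g. {'feat': 'C1', 'feat_onehot_dim': <nunique>, 'embed_dim': 8}
    print(X_train.shape, X_test.shape)        # 80/20 row split over the 39 feature columns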
@@ -0,0 +1,5 @@
'''
# Time : 2021/1/4 11:51
# Author : junchaoli
# File : __init__.py
'''
@@ -0,0 +1,88 @@
'''
# Time : 2021/1/4 11:51
# Author : junchaoli
# File : layer.py
'''

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer, Dense, Dropout

class Dense_layer(Layer):
    def __init__(self, hidden_units, activation='relu', dropout=0.0):
        super(Dense_layer, self).__init__()
        self.dense_layer = [Dense(i, activation=activation) for i in hidden_units]
        self.dropout = Dropout(dropout)

    def call(self, inputs, **kwargs):
        x = inputs
        for layer in self.dense_layer:
            x = layer(x)
        x = self.dropout(x)
        return x

class DotProductAttention(Layer):
    def __init__(self, dropout=0.0):
        super(DotProductAttention, self).__init__()
        self._dropout = dropout
        self._masking_num = -2**32 + 1   # reserved for masking; unused in this implementation

    def call(self, inputs):
        # queries: [None, n, k]
        # keys:    [None, n, k]
        # values:  [None, n, k]
        queries, keys, values = inputs
        score = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1]))  # [None, n, n]
        score = score / int(queries.shape[-1])**0.5                  # scale by sqrt(d_k)
        score = K.softmax(score)                                     # softmax over keys
        score = K.dropout(score, self._dropout)                      # dropout on attention weights
        outputs = K.batch_dot(score, values)                         # [None, n, k]
        return outputs

class MultiHeadAttention(Layer):
    def __init__(self, n_heads=4, head_dim=64, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        self._n_heads = n_heads
        self._head_dim = head_dim
        self._dropout = dropout
        self._att_layer = DotProductAttention(dropout=self._dropout)

    def build(self, input_shape):
        super(MultiHeadAttention, self).build(input_shape)
        self._weights_queries = self.add_weight(
            shape=(input_shape[0][-1], self._n_heads*self._head_dim),
            initializer='glorot_uniform',
            trainable=True,
            name='weights_queries')
        self._weights_keys = self.add_weight(
            shape=(input_shape[1][-1], self._n_heads*self._head_dim),
            initializer='glorot_uniform',
            trainable=True,
            name='weights_keys')
        self._weights_values = self.add_weight(
            shape=(input_shape[2][-1], self._n_heads*self._head_dim),
            initializer='glorot_uniform',
            trainable=True,
            name='weights_values')

    def call(self, inputs):
        # queries: [None, n, k]
        # keys:    [None, n, k]
        # values:  [None, n, k]
        queries, keys, values = inputs
        # The residual add in the model requires the concatenated head size
        # to match the input embedding dim
        if self._n_heads*self._head_dim != queries.shape[-1]:
            raise ValueError("n_head * head_dim not equal embedding dim {}".format(queries.shape[-1]))

        queries_linear = K.dot(queries, self._weights_queries)   # [None, n, k]
        keys_linear = K.dot(keys, self._weights_keys)            # [None, n, k]
        values_linear = K.dot(values, self._weights_values)      # [None, n, k]

        queries_multi_heads = tf.concat(tf.split(queries_linear, self._n_heads, axis=2), axis=0)  # [None*n_head, n, k/n_head]
        keys_multi_heads = tf.concat(tf.split(keys_linear, self._n_heads, axis=2), axis=0)        # [None*n_head, n, k/n_head]
        values_multi_heads = tf.concat(tf.split(values_linear, self._n_heads, axis=2), axis=0)    # [None*n_head, n, k/n_head]

        att_out = self._att_layer([queries_multi_heads, keys_multi_heads, values_multi_heads])    # [None*n_head, n, k/n_head]
        outputs = tf.concat(tf.split(att_out, self._n_heads, axis=0), axis=2)                     # [None, n, k]
        return outputs
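A minimal self-attention shape check for MultiHeadAttention (illustrative sizes; as the guard above enforces, n_heads*head_dim must equal the input embedding dim):

    import tensorflow as tf

    x = tf.random.normal((2, 39, 64))                  # [batch, fields, k]
    mha = MultiHeadAttention(n_heads=4, head_dim=16)   # 4 * 16 == 64
    out = mha([x, x, x])                               # self-attention: queries = keys = values
    print(out.shape)                                   # (2, 39, 64)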
@@ -0,0 +1,55 @@
'''
# Time : 2021/1/4 12:11
# Author : junchaoli
# File : model.py
'''

from layer import Dense_layer, MultiHeadAttention

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding

class AutoInt(Model):
    def __init__(self, feature_columns, hidden_units, activation='relu',
                 dnn_dropout=0.0, n_heads=4, head_dim=64, att_dropout=0.1):
        super(AutoInt, self).__init__()
        self.dense_feature_columns, self.sparse_feature_columns = feature_columns
        self.dense_emb_layers = [Embedding(feat['feat_onehot_dim'], feat['embed_dim'])
                                 for feat in self.dense_feature_columns]
        self.sparse_emb_layers = [Embedding(feat['feat_onehot_dim'], feat['embed_dim'])
                                  for feat in self.sparse_feature_columns]
        self.dense_layer = Dense_layer(hidden_units, activation, dnn_dropout)
        self.multi_head_att = MultiHeadAttention(n_heads, head_dim, att_dropout)
        self.out_layer = Dense(1, activation=None)
        k = self.dense_feature_columns[0]['embed_dim']
        # Residual projection W_res: [k, k]
        self.W_res = self.add_weight(name='W_res', shape=(k, k),
                                     trainable=True,
                                     initializer=tf.initializers.glorot_normal(),
                                     regularizer=tf.keras.regularizers.l1_l2(1e-5))

    def call(self, inputs, training=None, mask=None):
        dense_inputs, sparse_inputs = inputs[:, :13], inputs[:, 13:]
        # A value of exactly 1.0 would make the embedding lookup go out of range
        dense_inputs = tf.where(tf.equal(dense_inputs, 1), 0.9999999, dense_inputs)
        dense_emb = [layer(dense_inputs[:, i]) for i, layer in enumerate(self.dense_emb_layers)]      # [13, None, k]
        sparse_emb = [layer(sparse_inputs[:, i]) for i, layer in enumerate(self.sparse_emb_layers)]   # [26, None, k]
        emb = tf.concat([tf.convert_to_tensor(dense_emb), tf.convert_to_tensor(sparse_emb)], axis=0)  # [39, None, k]
        emb = tf.transpose(emb, [1, 0, 2])  # [None, 39, k]

        # DNN
        dnn_input = tf.reshape(emb, shape=(-1, emb.shape[1]*emb.shape[2]))  # [None, 39*k]
        dnn_out = self.dense_layer(dnn_input)                               # [None, out_dim]

        # AutoInt interacting layer with residual connection
        att_out = self.multi_head_att([emb, emb, emb])  # [None, 39, k]
        att_out_res = tf.matmul(emb, self.W_res)        # [None, 39, k]
        att_out = att_out + att_out_res
        att_out = tf.reshape(att_out, [-1, att_out.shape[1]*att_out.shape[2]])  # [None, 39*k]

        # output
        x = tf.concat([dnn_out, att_out], axis=-1)
        output = self.out_layer(x)
        return tf.nn.sigmoid(output)
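Because of the residual add in call, n_heads*head_dim must equal the embedding dim k. A hedged construction sketch (the feature-column dicts mirror the utils.py convention; all sizes are illustrative assumptions):

    dense_cols = [{'feat': 'I%d' % i, 'feat_onehot_dim': 100, 'embed_dim': 64} for i in range(1, 14)]
    sparse_cols = [{'feat': 'C%d' % i, 'feat_onehot_dim': 1000, 'embed_dim': 64} for i in range(1, 27)]
    model = AutoInt([dense_cols, sparse_cols], hidden_units=[128, 64], n_heads=4, head_dim=16)  # 4*16 == 64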
@@ -0,0 +1,45 @@
'''
# Time : 2021/1/4 12:53
# Author : junchaoli
# File : train.py
'''

from model import AutoInt
from utils import create_criteo_dataset

import numpy as np
import tensorflow as tf
from tensorflow.keras import losses, optimizers
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    file = 'E:\\PycharmProjects\\推荐算法\\data\\train.txt'
    test_size = 0.1
    hidden_units = [256, 128, 64]
    feature_columns, (X_train, y_train), (X_test, y_test) = create_criteo_dataset(file, test_size=test_size)

    model = AutoInt(feature_columns, hidden_units, dnn_dropout=0.2, n_heads=4, head_dim=16, att_dropout=0.2)
    optimizer = optimizers.SGD(0.01)

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

    summary_writer = tf.summary.create_file_writer('E:\\PycharmProjects\\tensorboard')
    for epoch in range(50):
        sum_loss = []
        for batch, data_batch in enumerate(train_dataset):
            X_batch, y_batch = data_batch[0], data_batch[1]
            with tf.GradientTape() as tape:
                pre = model(X_batch)
                # squeeze [None, 1] -> [None] so predictions align with y_batch
                loss = tf.reduce_mean(losses.binary_crossentropy(y_batch, tf.squeeze(pre, axis=-1)))
            grad = tape.gradient(loss, model.variables)
            optimizer.apply_gradients(zip(grad, model.variables))
            sum_loss.append(loss.numpy())
            if batch % 10 == 0:
                print("epoch: {} batch: {} loss: {}".format(epoch, batch, np.mean(sum_loss)))
        with summary_writer.as_default():
            tf.summary.scalar('loss', np.mean(sum_loss), epoch)

    pre = model(X_test)
    pre = [1 if x > 0.5 else 0 for x in pre]
    print("Accuracy: ", accuracy_score(y_test, pre))