Commit

deepctr

jc-LeeHub committed Jan 7, 2021
0 parents commit 921da9f
Showing 115 changed files with 4,853 additions and 0 deletions.
5 changes: 5 additions & 0 deletions AFM/__init__.py
@@ -0,0 +1,5 @@
'''
# Time : 2020/12/9 16:22
# Author : junchaoli
# File    : __init__.py
'''
Binary file added AFM/__pycache__/layer.cpython-36.pyc
Binary file added AFM/__pycache__/model.cpython-36.pyc
Binary file added AFM/__pycache__/utils.cpython-36.pyc
88 changes: 88 additions & 0 deletions AFM/layer.py
@@ -0,0 +1,88 @@
'''
# Time : 2020/12/9 16:31
# Author : junchaoli
# File : layer.py
'''

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Embedding, Dense, Dropout

class Interaction_layer(Layer):
'''
# input shape: [None, field, k]
# output shape: [None, field*(field-1)/2, k]
'''
def __init__(self):
super().__init__()

def call(self, inputs, **kwargs): # [None, field, k]
        if K.ndim(inputs) != 3:
            raise ValueError("Unexpected input dimensions %d, expected 3" % K.ndim(inputs))

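        # All pairwise Hadamard products of field embeddings:
        # t = field*(field-1)/2 interaction vectors, each of length k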
element_wise_product_list = []
for i in range(inputs.shape[1]):
for j in range(i+1, inputs.shape[1]):
element_wise_product_list.append(tf.multiply(inputs[:, i], inputs[:, j])) #[t, None, k]
element_wise_product = tf.transpose(tf.convert_to_tensor(element_wise_product_list), [1, 0, 2]) #[None, t, k]
return element_wise_product

class Attention_layer(Layer):
'''
# input shape: [None, n, k]
# output shape: [None, k]
'''
def __init__(self):
super().__init__()

    def build(self, input_shape):                 # [None, field, k]
        super().build(input_shape)
        self.attention_w = Dense(input_shape[1], activation='relu')
        self.attention_h = Dense(1, activation=None)

def call(self, inputs, **kwargs): # [None, field, k]
        if K.ndim(inputs) != 3:
            raise ValueError("Unexpected input dimensions %d, expected 3" % K.ndim(inputs))

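        # Attention pooling: score each interaction vector with softmax(h^T ReLU(W x)),
        # then take the weighted sum, reducing [None, n, k] to [None, k]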
x = self.attention_w(inputs) # [None, field, field]
x = self.attention_h(x) # [None, field, 1]
a_score = tf.nn.softmax(x)
a_score = tf.transpose(a_score, [0, 2, 1]) # [None, 1, field]
output = tf.reshape(tf.matmul(a_score, inputs), shape=(-1, inputs.shape[2])) # (None, k)
return output

class AFM_layer(Layer):
def __init__(self, feature_columns, mode):
super(AFM_layer, self).__init__()
self.dense_feature_columns, self.sparse_feature_columns = feature_columns
self.mode = mode
self.embed_layer = {"emb_"+str(i): Embedding(feat['feat_onehot_dim'], feat['embed_dim'])
for i, feat in enumerate(self.sparse_feature_columns)}
self.interaction_layer = Interaction_layer()
if self.mode=='att':
self.attention_layer = Attention_layer()
self.output_layer = Dense(1)

def call(self, inputs, **kwargs):
        if K.ndim(inputs) != 2:
            raise ValueError("Unexpected input dimensions %d, expected 2" % K.ndim(inputs))

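        # Criteo layout: the first 13 columns are dense features (I1-I13), the
        # remaining 26 are label-encoded sparse features (C1-C26). AFM models only
        # sparse-feature interactions, so dense_inputs is left unused here.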
dense_inputs, sparse_inputs = inputs[:, :13], inputs[:, 13:]
embed = [self.embed_layer['emb_'+str(i)](sparse_inputs[:, i])
for i in range(sparse_inputs.shape[1])] # list
embed = tf.convert_to_tensor(embed)
embed = tf.transpose(embed, [1, 0, 2]) #[None, 26,k]

# Pair-wise Interaction
embed = self.interaction_layer(embed)

if self.mode == 'avg':
x = tf.reduce_mean(embed, axis=1) # (None, k)
elif self.mode == 'max':
x = tf.reduce_max(embed, axis=1) # (None, k)
else:
x = self.attention_layer(embed) # (None, k)

output = tf.nn.sigmoid(self.output_layer(x))
return output

17 changes: 17 additions & 0 deletions AFM/model.py
@@ -0,0 +1,17 @@
'''
# Time : 2020/12/9 17:25
# Author : junchaoli
# File : model.py
'''

from layer import AFM_layer
from tensorflow.keras.models import Model

class AFM(Model):
def __init__(self, feature_columns, mode):
super().__init__()
self.afm_layer = AFM_layer(feature_columns, mode)

def call(self, inputs, training=None, mask=None):
output = self.afm_layer(inputs)
return output
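
# Example usage (a sketch; assumes create_criteo_dataset from AFM/utils.py):
#   feature_columns, (X_train, y_train), _ = create_criteo_dataset('criteo_sample.txt')
#   model = AFM(feature_columns, mode='att')   # mode: 'att', 'avg', or 'max'
#   y_pred = model(X_train[:2])                # shape (2, 1), sigmoid probabilities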
43 changes: 43 additions & 0 deletions AFM/train.py
@@ -0,0 +1,43 @@
'''
# Time : 2020/12/9 17:28
# Author : junchaoli
# File : train.py
'''

from model import AFM
from utils import create_criteo_dataset

import tensorflow as tf
from tensorflow.keras import optimizers, losses, metrics
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
file = 'E:\\PycharmProjects\\推荐算法\\data\\criteo_sample.txt'
test_size = 0.2
feature_columns, (X_train, y_train), (X_test, y_test) = \
create_criteo_dataset(file, test_size=test_size)

model = AFM(feature_columns, 'att')
optimizer = optimizers.SGD(0.01)

# dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
# dataset = dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)
#
# model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
# model.fit(dataset, epochs=100)
# pre = model.predict(X_test)

summary = tf.summary.create_file_writer('E:\\PycharmProjects\\tensorboard')
for i in range(100):
with tf.GradientTape() as tape:
            pre = model(X_train)
            pre = tf.reshape(pre, [-1])   # (None, 1) -> (None,), aligns with y_train in the loss
            loss = tf.reduce_mean(losses.binary_crossentropy(y_train, pre))
print(loss.numpy())
# with summary.as_default():
# tf.summary.scalar('loss', loss, i)
grad = tape.gradient(loss, model.variables)
optimizer.apply_gradients(grads_and_vars=zip(grad, model.variables))
pre = model(X_test)

    pre = [1 if x > 0.5 else 0 for x in pre]
    print("Accuracy: ", accuracy_score(y_test, pre))   # accuracy_score computes accuracy, not AUC
40 changes: 40 additions & 0 deletions AFM/utils.py
@@ -0,0 +1,40 @@
'''
# Time : 2020/12/9 20:53
# Author : junchaoli
# File    : utils.py
'''

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

def sparseFeature(feat, feat_onehot_dim, embed_dim):
return {'feat': feat, 'feat_onehot_dim': feat_onehot_dim, 'embed_dim': embed_dim}

def denseFeature(feat):
return {'feat': feat}

def create_criteo_dataset(file_path, embed_dim=8, test_size=0.2):
data = pd.read_csv(file_path)

dense_features = ['I' + str(i) for i in range(1, 14)]
sparse_features = ['C' + str(i) for i in range(1, 27)]

    # Fill missing values
data[dense_features] = data[dense_features].fillna(0)
data[sparse_features] = data[sparse_features].fillna('-1')

    # Min-max normalization
data[dense_features] = MinMaxScaler().fit_transform(data[dense_features])
    # Label encoding
for col in sparse_features:
data[col] = LabelEncoder().fit_transform(data[col]).astype(int)

feature_columns = [[denseFeature(feat) for feat in dense_features]] + \
[[sparseFeature(feat, data[feat].nunique(), embed_dim) for feat in sparse_features]]
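    # feature_columns is a two-element list: [[dense feature dicts], [sparse feature dicts]];
    # model layers unpack it as (dense_feature_columns, sparse_feature_columns)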

X = data.drop(['label'], axis=1).values
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

return feature_columns, (X_train, y_train), (X_test, y_test)
5 changes: 5 additions & 0 deletions AutoInt/__init__.py
@@ -0,0 +1,5 @@
'''
# Time : 2021/1/4 11:51
# Author : junchaoli
# File    : __init__.py
'''
Binary file added AutoInt/__pycache__/layer.cpython-36.pyc
Binary file added AutoInt/__pycache__/model.cpython-36.pyc
Binary file added AutoInt/__pycache__/utils.cpython-36.pyc
88 changes: 88 additions & 0 deletions AutoInt/layer.py
@@ -0,0 +1,88 @@
'''
# Time : 2021/1/4 11:51
# Author : junchaoli
# File : layer.py
'''

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer, Dense, Dropout

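# Dense_layer: a plain MLP over the flattened field embeddings;
# dropout is applied after every hidden layer.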
class Dense_layer(Layer):
def __init__(self, hidden_units, activation='relu', dropout=0.0):
super(Dense_layer, self).__init__()
self.dense_layer = [Dense(i, activation=activation) for i in hidden_units]
self.dropout = Dropout(dropout)

def call(self, inputs, **kwargs):
x = inputs
for layer in self.dense_layer:
x = layer(x)
x = self.dropout(x)
return x

class DotProductAttention(Layer):
def __init__(self, dropout=0.0):
super(DotProductAttention, self).__init__()
self._dropout = dropout
        self._masking_num = -2**32 + 1   # large negative value for masked positions; unused in this implementation

def call(self, inputs):
# queries: [None, n, k]
# keys: [None, n, k]
# values: [None, n, k]
queries, keys, values = inputs
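        # Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V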
score = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1])) # [None, n, n]
        score = score / int(queries.shape[-1])**0.5          # scale by sqrt(d_k)
score = K.softmax(score) # SoftMax
score = K.dropout(score, self._dropout) # dropout
outputs = K.batch_dot(score, values) # [None, n, k]
return outputs

class MultiHeadAttention(Layer):
def __init__(self, n_heads=4, head_dim=64, dropout=0.1):
super(MultiHeadAttention, self).__init__()
self._n_heads = n_heads
self._head_dim = head_dim
self._dropout = dropout
self._att_layer = DotProductAttention(dropout=self._dropout)

def build(self, input_shape):
super(MultiHeadAttention, self).build(input_shape)
self._weights_queries = self.add_weight(
shape=(input_shape[0][-1], self._n_heads*self._head_dim),
initializer='glorot_uniform',
trainable=True,
name='weights_queries')
self._weights_keys = self.add_weight(
shape=(input_shape[1][-1], self._n_heads*self._head_dim),
initializer='glorot_uniform',
trainable=True,
name='weights_keys')
self._weights_values = self.add_weight(
shape=(input_shape[2][-1], self._n_heads*self._head_dim),
initializer='glorot_uniform',
trainable=True,
name='weights_values')

def call(self, inputs):
# queries: [None, n, k]
# keys: [None, n, k]
# values: [None, n, k]
queries, keys, values = inputs
        if self._n_heads * self._head_dim != queries.shape[-1]:
            raise ValueError("n_heads * head_dim must equal the embedding dim {}".format(queries.shape[-1]))

queries_linear = K.dot(queries, self._weights_queries) # [None, n, k]
keys_linear = K.dot(keys, self._weights_keys) # [None, n, k]
values_linear = K.dot(values, self._weights_values) # [None, n, k]

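        # Split the last dimension into n_heads chunks and stack them along the
        # batch axis, so one batched matmul attends over all heads at once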
queries_multi_heads = tf.concat(tf.split(queries_linear, self._n_heads, axis=2), axis=0) # [None*n_head, n, k/n_head]
keys_multi_heads = tf.concat(tf.split(keys_linear, self._n_heads, axis=2), axis=0) # [None*n_head, n, k/n_head]
values_multi_heads = tf.concat(tf.split(values_linear, self._n_heads, axis=2), axis=0) # [None*n_head, n, k/n_head]

att_out = self._att_layer([queries_multi_heads, keys_multi_heads, values_multi_heads]) # [None*n_head, n, k/n_head]
outputs = tf.concat(tf.split(att_out, self._n_heads, axis=0), axis=2) # [None, n, k]
return outputs
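
# Note: MultiHeadAttention requires n_heads * head_dim == embedding dim k
# (e.g. n_heads=4 with head_dim=16 requires k == 64), as checked in call().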

55 changes: 55 additions & 0 deletions AutoInt/model.py
@@ -0,0 +1,55 @@
'''
# Time : 2021/1/4 12:11
# Author : junchaoli
# File : model.py
'''

from layer import Dense_layer, DotProductAttention, MultiHeadAttention

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding

class AutoInt(Model):
def __init__(self, feature_columns, hidden_units, activation='relu',
dnn_dropout=0.0, n_heads=4, head_dim=64, att_dropout=0.1):
super(AutoInt, self).__init__()
self.dense_feature_columns, self.sparse_feature_columns = feature_columns
self.dense_emb_layers = [Embedding(feat['feat_onehot_dim'], feat['embed_dim'])
for feat in self.dense_feature_columns]
self.sparse_emb_layers = [Embedding(feat['feat_onehot_dim'], feat['embed_dim'])
for feat in self.sparse_feature_columns]
self.dense_layer = Dense_layer(hidden_units, activation, dnn_dropout)
self.multi_head_att = MultiHeadAttention(n_heads, head_dim, att_dropout)
self.out_layer = Dense(1, activation=None)
k = self.dense_feature_columns[0]['embed_dim']
self.W_res = self.add_weight(name='W_res', shape=(k, k),
trainable=True,
initializer=tf.initializers.glorot_normal(),
regularizer=tf.keras.regularizers.l1_l2(1e-5))

def call(self, inputs, training=None, mask=None):
dense_inputs, sparse_inputs = inputs[:, :13], inputs[:, 13:]
        # A dense value of exactly 1.0 would make the embedding lookup raise an error
        dense_inputs = tf.where(tf.equal(dense_inputs, 1), 0.9999999, dense_inputs)
dense_emb = [layer(dense_inputs[:, i]) for i, layer in enumerate(self.dense_emb_layers)] # [13, None, k]
sparse_emb = [layer(sparse_inputs[:, i]) for i, layer in enumerate(self.sparse_emb_layers)] # [26, None, k]
emb = tf.concat([tf.convert_to_tensor(dense_emb), tf.convert_to_tensor(sparse_emb)], axis=0) # [39, None, k]
emb = tf.transpose(emb, [1, 0, 2]) # [None, 39, k]

# DNN
dnn_input = tf.reshape(emb, shape=(-1, emb.shape[1]*emb.shape[2])) # [None, 39*k]
dnn_out = self.dense_layer(dnn_input) # [None, out_dim]

# AutoInt
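        # Interacting layer: multi-head self-attention over the 39 field embeddings,
        # plus a linear residual connection through W_res (as in the AutoInt paper)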
att_out = self.multi_head_att([emb, emb, emb]) # [None, 39, k]
att_out_res = tf.matmul(emb, self.W_res) # [None, 39, k]
att_out = att_out + att_out_res
att_out = tf.reshape(att_out, [-1, att_out.shape[1]*att_out.shape[2]]) # [None, 39*k]

# output
x = tf.concat([dnn_out, att_out], axis=-1)
output = self.out_layer(x)
return tf.nn.sigmoid(output)


45 changes: 45 additions & 0 deletions AutoInt/train.py
@@ -0,0 +1,45 @@
'''
# Time : 2021/1/4 12:53
# Author : junchaoli
# File : train.py
'''

from model import AutoInt
from utils import create_criteo_dataset

import numpy as np
import tensorflow as tf
from tensorflow.keras import losses, optimizers
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
file = 'E:\\PycharmProjects\\推荐算法\\data\\train.txt'
test_size = 0.1
hidden_units = [256, 128, 64]
feature_columns, (X_train, y_train), (X_test, y_test) = create_criteo_dataset(file, test_size=test_size)

model = AutoInt(feature_columns, hidden_units, dnn_dropout=0.2, n_heads=4, head_dim=16, att_dropout=0.2)
optimizer = optimizers.SGD(0.01)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

summary_writer = tf.summary.create_file_writer('E:\\PycharmProjects\\tensorboard')
for epoch in range(50):
sum_loss = []
for batch, data_batch in enumerate(train_dataset):
            X_batch, y_batch = data_batch[0], data_batch[1]   # avoid shadowing the full training set
            with tf.GradientTape() as tape:
                pre = model(X_batch)
                pre = tf.reshape(pre, [-1])   # (None, 1) -> (None,), aligns with y_batch in the loss
                loss = tf.reduce_mean(losses.binary_crossentropy(y_batch, pre))
grad = tape.gradient(loss, model.variables)
optimizer.apply_gradients(zip(grad, model.variables))
sum_loss.append(loss.numpy())
if batch%10==0:
print("epoch: {} batch: {} loss: {}".format(epoch, batch, np.mean(sum_loss)))
with summary_writer.as_default():
tf.summary.scalar('loss', np.mean(sum_loss), epoch)

pre = model(X_test)
    pre = [1 if x > 0.5 else 0 for x in pre]
    print("Accuracy: ", accuracy_score(y_test, pre))   # accuracy_score computes accuracy, not AUC