Commit 921da9f (0 parents): 115 changed files with 4,853 additions and 0 deletions.
@@ -0,0 +1,5 @@
'''
# Time : 2020/12/9 16:22
# Author : junchaoli
# File : __init__.py
'''
@@ -0,0 +1,88 @@
'''
# Time : 2020/12/9 16:31
# Author : junchaoli
# File : layer.py
'''

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Embedding, Dense

class Interaction_layer(Layer):
    '''
    Pair-wise interaction layer.
    # input shape:  [None, field, k]
    # output shape: [None, field*(field-1)/2, k]
    '''
    def __init__(self):
        super().__init__()

    def call(self, inputs, **kwargs):   # [None, field, k]
        if K.ndim(inputs) != 3:
            raise ValueError("Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs)))

        # Element-wise product of every pair of field embeddings
        element_wise_product_list = []
        for i in range(inputs.shape[1]):
            for j in range(i+1, inputs.shape[1]):
                element_wise_product_list.append(tf.multiply(inputs[:, i], inputs[:, j]))   # [t, None, k]
        element_wise_product = tf.transpose(tf.convert_to_tensor(element_wise_product_list), [1, 0, 2])  # [None, t, k]
        return element_wise_product

class Attention_layer(Layer):
    '''
    Attention-based pooling over the interaction vectors.
    # input shape:  [None, n, k]
    # output shape: [None, k]
    '''
    def __init__(self):
        super().__init__()

    def build(self, input_shape):       # [None, field, k]
        self.attention_w = Dense(input_shape[1], activation='relu')
        self.attention_h = Dense(1, activation=None)
        super().build(input_shape)

    def call(self, inputs, **kwargs):   # [None, field, k]
        if K.ndim(inputs) != 3:
            raise ValueError("Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs)))

        x = self.attention_w(inputs)    # [None, field, field]
        x = self.attention_h(x)         # [None, field, 1]
        # Normalize over the interaction axis (axis=1); the default axis=-1
        # has size 1 here and would make every attention score exactly 1.0
        a_score = tf.nn.softmax(x, axis=1)
        a_score = tf.transpose(a_score, [0, 2, 1])  # [None, 1, field]
        output = tf.reshape(tf.matmul(a_score, inputs), shape=(-1, inputs.shape[2]))  # (None, k)
        return output

class AFM_layer(Layer):
    def __init__(self, feature_columns, mode):
        super(AFM_layer, self).__init__()
        self.dense_feature_columns, self.sparse_feature_columns = feature_columns
        self.mode = mode
        self.embed_layer = {"emb_"+str(i): Embedding(feat['feat_onehot_dim'], feat['embed_dim'])
                            for i, feat in enumerate(self.sparse_feature_columns)}
        self.interaction_layer = Interaction_layer()
        if self.mode == 'att':
            self.attention_layer = Attention_layer()
        self.output_layer = Dense(1)

    def call(self, inputs, **kwargs):
        if K.ndim(inputs) != 2:
            raise ValueError("Unexpected inputs dimensions %d, expect to be 2 dimensions" % (K.ndim(inputs)))

        # First 13 columns are dense features, the remaining 26 are sparse
        dense_inputs, sparse_inputs = inputs[:, :13], inputs[:, 13:]
        embed = [self.embed_layer['emb_'+str(i)](sparse_inputs[:, i])
                 for i in range(sparse_inputs.shape[1])]    # list
        embed = tf.convert_to_tensor(embed)
        embed = tf.transpose(embed, [1, 0, 2])              # [None, 26, k]

        # Pair-wise interaction
        embed = self.interaction_layer(embed)

        if self.mode == 'avg':
            x = tf.reduce_mean(embed, axis=1)   # (None, k)
        elif self.mode == 'max':
            x = tf.reduce_max(embed, axis=1)    # (None, k)
        else:
            x = self.attention_layer(embed)     # (None, k)

        output = tf.nn.sigmoid(self.output_layer(x))
        return output
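A quick shape check of the two layers above (a minimal sketch; `batch`, `field`, and `k` are illustrative values, not part of the commit):

    import tensorflow as tf

    batch, field, k = 4, 5, 8
    x = tf.random.normal((batch, field, k))
    pairs = Interaction_layer()(x)      # [4, 10, 8]: field*(field-1)/2 = 10 pairs
    pooled = Attention_layer()(pairs)   # [4, 8]: attention-weighted sum over the 10 pairs
    print(pairs.shape, pooled.shape)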
@@ -0,0 +1,17 @@
'''
# Time : 2020/12/9 17:25
# Author : junchaoli
# File : model.py
'''

from layer import AFM_layer
from tensorflow.keras.models import Model

class AFM(Model):
    def __init__(self, feature_columns, mode):
        super().__init__()
        self.afm_layer = AFM_layer(feature_columns, mode)

    def call(self, inputs, training=None, mask=None):
        output = self.afm_layer(inputs)
        return output
@@ -0,0 +1,43 @@
'''
# Time : 2020/12/9 17:28
# Author : junchaoli
# File : train.py
'''

from model import AFM
from utils import create_criteo_dataset

import tensorflow as tf
from tensorflow.keras import optimizers, losses
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    file = 'E:\\PycharmProjects\\推荐算法\\data\\criteo_sample.txt'
    test_size = 0.2
    feature_columns, (X_train, y_train), (X_test, y_test) = \
        create_criteo_dataset(file, test_size=test_size)

    model = AFM(feature_columns, 'att')
    optimizer = optimizers.SGD(0.01)

    # dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    # dataset = dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)
    #
    # model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    # model.fit(dataset, epochs=100)
    # pre = model.predict(X_test)

    summary = tf.summary.create_file_writer('E:\\PycharmProjects\\tensorboard')
    for i in range(100):
        with tf.GradientTape() as tape:
            pre = model(X_train)
            # squeeze [None, 1] -> [None] so predictions align with y_train
            loss = tf.reduce_mean(losses.binary_crossentropy(y_train, tf.squeeze(pre, axis=-1)))
            print(loss.numpy())
        # with summary.as_default():
        #     tf.summary.scalar('loss', loss, i)
        grad = tape.gradient(loss, model.variables)
        optimizer.apply_gradients(grads_and_vars=zip(grad, model.variables))
    pre = model(X_test)

    pre = [1 if x > 0.5 else 0 for x in pre]
    print("Accuracy: ", accuracy_score(y_test, pre))
@@ -0,0 +1,40 @@
'''
# Time : 2020/12/9 20:53
# Author : junchaoli
# File : utils.py
'''

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

def sparseFeature(feat, feat_onehot_dim, embed_dim):
    return {'feat': feat, 'feat_onehot_dim': feat_onehot_dim, 'embed_dim': embed_dim}

def denseFeature(feat):
    return {'feat': feat}

def create_criteo_dataset(file_path, embed_dim=8, test_size=0.2):
    data = pd.read_csv(file_path)

    dense_features = ['I' + str(i) for i in range(1, 14)]
    sparse_features = ['C' + str(i) for i in range(1, 27)]

    # Fill missing values
    data[dense_features] = data[dense_features].fillna(0)
    data[sparse_features] = data[sparse_features].fillna('-1')

    # Min-max normalization of the dense features
    data[dense_features] = MinMaxScaler().fit_transform(data[dense_features])
    # Label-encode the sparse features
    for col in sparse_features:
        data[col] = LabelEncoder().fit_transform(data[col]).astype(int)

    feature_columns = [[denseFeature(feat) for feat in dense_features]] + \
                      [[sparseFeature(feat, data[feat].nunique(), embed_dim) for feat in sparse_features]]

    X = data.drop(['label'], axis=1).values
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    return feature_columns, (X_train, y_train), (X_test, y_test)
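A quick sanity check of the helper above (a sketch; 'criteo_sample.txt' stands in for any Criteo-format CSV with a `label` column):

    feature_columns, (X_train, y_train), (X_test, y_test) = create_criteo_dataset('criteo_sample.txt')
    dense_cols, sparse_cols = feature_columns
    print(len(dense_cols), len(sparse_cols))  # 13 dense, 26 sparse feature dicts
    print(sparse_cols[0])                     # e.g. {'feat': 'C1', 'feat_onehot_dim': <nunique>, 'embed_dim': 8}
    print(X_train.shape, X_test.shape)        # 80/20 row split over the 39 feature columns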
@@ -0,0 +1,5 @@
'''
# Time : 2021/1/4 11:51
# Author : junchaoli
# File : __init__.py
'''
@@ -0,0 +1,88 @@
'''
# Time : 2021/1/4 11:51
# Author : junchaoli
# File : layer.py
'''

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer, Dense, Dropout

class Dense_layer(Layer):
    def __init__(self, hidden_units, activation='relu', dropout=0.0):
        super(Dense_layer, self).__init__()
        self.dense_layer = [Dense(i, activation=activation) for i in hidden_units]
        self.dropout = Dropout(dropout)

    def call(self, inputs, **kwargs):
        x = inputs
        for layer in self.dense_layer:
            x = layer(x)
        x = self.dropout(x)
        return x

class DotProductAttention(Layer):
    def __init__(self, dropout=0.0):
        super(DotProductAttention, self).__init__()
        self._dropout = dropout
        self._masking_num = -2**32 + 1   # reserved for masking; unused in this implementation

    def call(self, inputs):
        # queries: [None, n, k]
        # keys:    [None, n, k]
        # values:  [None, n, k]
        queries, keys, values = inputs
        score = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1]))  # [None, n, n]
        score = score / int(queries.shape[-1])**0.5                  # scale by sqrt(d_k)
        score = K.softmax(score)                                     # softmax over keys
        score = K.dropout(score, self._dropout)                      # dropout on attention weights
        outputs = K.batch_dot(score, values)                         # [None, n, k]
        return outputs

class MultiHeadAttention(Layer):
    def __init__(self, n_heads=4, head_dim=64, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        self._n_heads = n_heads
        self._head_dim = head_dim
        self._dropout = dropout
        self._att_layer = DotProductAttention(dropout=self._dropout)

    def build(self, input_shape):
        super(MultiHeadAttention, self).build(input_shape)
        self._weights_queries = self.add_weight(
            shape=(input_shape[0][-1], self._n_heads*self._head_dim),
            initializer='glorot_uniform',
            trainable=True,
            name='weights_queries')
        self._weights_keys = self.add_weight(
            shape=(input_shape[1][-1], self._n_heads*self._head_dim),
            initializer='glorot_uniform',
            trainable=True,
            name='weights_keys')
        self._weights_values = self.add_weight(
            shape=(input_shape[2][-1], self._n_heads*self._head_dim),
            initializer='glorot_uniform',
            trainable=True,
            name='weights_values')

    def call(self, inputs):
        # queries: [None, n, k]
        # keys:    [None, n, k]
        # values:  [None, n, k]
        queries, keys, values = inputs
        # The residual add in the model requires the concatenated head size
        # to match the input embedding dim
        if self._n_heads*self._head_dim != queries.shape[-1]:
            raise ValueError("n_head * head_dim not equal embedding dim {}".format(queries.shape[-1]))

        queries_linear = K.dot(queries, self._weights_queries)   # [None, n, k]
        keys_linear = K.dot(keys, self._weights_keys)            # [None, n, k]
        values_linear = K.dot(values, self._weights_values)      # [None, n, k]

        queries_multi_heads = tf.concat(tf.split(queries_linear, self._n_heads, axis=2), axis=0)  # [None*n_head, n, k/n_head]
        keys_multi_heads = tf.concat(tf.split(keys_linear, self._n_heads, axis=2), axis=0)        # [None*n_head, n, k/n_head]
        values_multi_heads = tf.concat(tf.split(values_linear, self._n_heads, axis=2), axis=0)    # [None*n_head, n, k/n_head]

        att_out = self._att_layer([queries_multi_heads, keys_multi_heads, values_multi_heads])    # [None*n_head, n, k/n_head]
        outputs = tf.concat(tf.split(att_out, self._n_heads, axis=0), axis=2)                     # [None, n, k]
        return outputs
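A minimal self-attention shape check for MultiHeadAttention (illustrative sizes; as the guard above enforces, n_heads*head_dim must equal the input embedding dim):

    import tensorflow as tf

    x = tf.random.normal((2, 39, 64))                  # [batch, fields, k]
    mha = MultiHeadAttention(n_heads=4, head_dim=16)   # 4 * 16 == 64
    out = mha([x, x, x])                               # self-attention: queries = keys = values
    print(out.shape)                                   # (2, 39, 64)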
@@ -0,0 +1,55 @@
'''
# Time : 2021/1/4 12:11
# Author : junchaoli
# File : model.py
'''

from layer import Dense_layer, MultiHeadAttention

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding

class AutoInt(Model):
    def __init__(self, feature_columns, hidden_units, activation='relu',
                 dnn_dropout=0.0, n_heads=4, head_dim=64, att_dropout=0.1):
        super(AutoInt, self).__init__()
        self.dense_feature_columns, self.sparse_feature_columns = feature_columns
        self.dense_emb_layers = [Embedding(feat['feat_onehot_dim'], feat['embed_dim'])
                                 for feat in self.dense_feature_columns]
        self.sparse_emb_layers = [Embedding(feat['feat_onehot_dim'], feat['embed_dim'])
                                  for feat in self.sparse_feature_columns]
        self.dense_layer = Dense_layer(hidden_units, activation, dnn_dropout)
        self.multi_head_att = MultiHeadAttention(n_heads, head_dim, att_dropout)
        self.out_layer = Dense(1, activation=None)
        k = self.dense_feature_columns[0]['embed_dim']
        # Residual projection W_res: [k, k]
        self.W_res = self.add_weight(name='W_res', shape=(k, k),
                                     trainable=True,
                                     initializer=tf.initializers.glorot_normal(),
                                     regularizer=tf.keras.regularizers.l1_l2(1e-5))

    def call(self, inputs, training=None, mask=None):
        dense_inputs, sparse_inputs = inputs[:, :13], inputs[:, 13:]
        # A value of exactly 1.0 would make the embedding lookup go out of range
        dense_inputs = tf.where(tf.equal(dense_inputs, 1), 0.9999999, dense_inputs)
        dense_emb = [layer(dense_inputs[:, i]) for i, layer in enumerate(self.dense_emb_layers)]      # [13, None, k]
        sparse_emb = [layer(sparse_inputs[:, i]) for i, layer in enumerate(self.sparse_emb_layers)]   # [26, None, k]
        emb = tf.concat([tf.convert_to_tensor(dense_emb), tf.convert_to_tensor(sparse_emb)], axis=0)  # [39, None, k]
        emb = tf.transpose(emb, [1, 0, 2])  # [None, 39, k]

        # DNN
        dnn_input = tf.reshape(emb, shape=(-1, emb.shape[1]*emb.shape[2]))  # [None, 39*k]
        dnn_out = self.dense_layer(dnn_input)                               # [None, out_dim]

        # AutoInt interacting layer with residual connection
        att_out = self.multi_head_att([emb, emb, emb])  # [None, 39, k]
        att_out_res = tf.matmul(emb, self.W_res)        # [None, 39, k]
        att_out = att_out + att_out_res
        att_out = tf.reshape(att_out, [-1, att_out.shape[1]*att_out.shape[2]])  # [None, 39*k]

        # output
        x = tf.concat([dnn_out, att_out], axis=-1)
        output = self.out_layer(x)
        return tf.nn.sigmoid(output)
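Because of the residual add in call, n_heads*head_dim must equal the embedding dim k. A hedged construction sketch (the feature-column dicts mirror the utils.py convention; all sizes are illustrative assumptions):

    dense_cols = [{'feat': 'I%d' % i, 'feat_onehot_dim': 100, 'embed_dim': 64} for i in range(1, 14)]
    sparse_cols = [{'feat': 'C%d' % i, 'feat_onehot_dim': 1000, 'embed_dim': 64} for i in range(1, 27)]
    model = AutoInt([dense_cols, sparse_cols], hidden_units=[128, 64], n_heads=4, head_dim=16)  # 4*16 == 64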
@@ -0,0 +1,45 @@
'''
# Time : 2021/1/4 12:53
# Author : junchaoli
# File : train.py
'''

from model import AutoInt
from utils import create_criteo_dataset

import numpy as np
import tensorflow as tf
from tensorflow.keras import losses, optimizers
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    file = 'E:\\PycharmProjects\\推荐算法\\data\\train.txt'
    test_size = 0.1
    hidden_units = [256, 128, 64]
    feature_columns, (X_train, y_train), (X_test, y_test) = create_criteo_dataset(file, test_size=test_size)

    model = AutoInt(feature_columns, hidden_units, dnn_dropout=0.2, n_heads=4, head_dim=16, att_dropout=0.2)
    optimizer = optimizers.SGD(0.01)

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

    summary_writer = tf.summary.create_file_writer('E:\\PycharmProjects\\tensorboard')
    for epoch in range(50):
        sum_loss = []
        for batch, data_batch in enumerate(train_dataset):
            X_batch, y_batch = data_batch[0], data_batch[1]
            with tf.GradientTape() as tape:
                pre = model(X_batch)
                # squeeze [None, 1] -> [None] so predictions align with y_batch
                loss = tf.reduce_mean(losses.binary_crossentropy(y_batch, tf.squeeze(pre, axis=-1)))
            grad = tape.gradient(loss, model.variables)
            optimizer.apply_gradients(zip(grad, model.variables))
            sum_loss.append(loss.numpy())
            if batch % 10 == 0:
                print("epoch: {} batch: {} loss: {}".format(epoch, batch, np.mean(sum_loss)))
        with summary_writer.as_default():
            tf.summary.scalar('loss', np.mean(sum_loss), epoch)

    pre = model(X_test)
    pre = [1 if x > 0.5 else 0 for x in pre]
    print("Accuracy: ", accuracy_score(y_test, pre))