lab-pc 2 years ago
parent 057d1a0eb1
commit b8f8707e4d
  1. 2
      __init__.py
  2. 195
      constructor.py
  3. BIN
      data/ind.citeseer.allx
  4. BIN
      data/ind.citeseer.ally
  5. BIN
      data/ind.citeseer.graph
  6. 1000
      data/ind.citeseer.test.index
  7. BIN
      data/ind.citeseer.tx
  8. BIN
      data/ind.citeseer.ty
  9. BIN
      data/ind.citeseer.x
  10. BIN
      data/ind.citeseer.y
  11. BIN
      data/ind.cora.allx
  12. BIN
      data/ind.cora.ally
  13. BIN
      data/ind.cora.graph
  14. 1000
      data/ind.cora.test.index
  15. BIN
      data/ind.cora.tx
  16. BIN
      data/ind.cora.ty
  17. BIN
      data/ind.cora.x
  18. BIN
      data/ind.cora.y
  19. BIN
      data/ind.pubmed.allx
  20. BIN
      data/ind.pubmed.ally
  21. BIN
      data/ind.pubmed.graph
  22. 1000
      data/ind.pubmed.test.index
  23. BIN
      data/ind.pubmed.tx
  24. BIN
      data/ind.pubmed.ty
  25. BIN
      data/ind.pubmed.x
  26. BIN
      data/ind.pubmed.y
  27. 12
      initializations.py
  28. 124
      input_data.py
  29. 163
      layers.py
  30. 113
      link_prediction.py
  31. 27
      load_data.py
  32. 112
      metrics.py
  33. 131
      optimizer.py
  34. 135
      preprocessing.py
  35. 14
      run.py
  36. 48
      settings.py
  37. 294
      src/model.py
  38. 105
      src/train.py

@ -1,2 +0,0 @@
from __future__ import print_function
from __future__ import division

@ -1,195 +0,0 @@
import tensorflow as tf
import numpy as np
from load_data import load_data_1
from model import GCN, Generator_z2g, Discriminator, D_graph
from optimizer import OptimizerAE, OptimizerCycle
import scipy.sparse as sp
from input_data import load_data
import inspect
from preprocessing import preprocess_graph, sparse_to_tuple, mask_test_edges, construct_feed_dict
flags = tf.app.flags
FLAGS = flags.FLAGS
def get_placeholder(adj, num_features):
    """Create the TensorFlow placeholders that are fed on every training step.

    When feeding a ``tf.sparse_placeholder`` either pass an
    ``(indices, values, shape)`` tuple directly or use a
    ``tf.SparseTensorValue``.

    Args:
        adj: Adjacency matrix; only ``adj.shape[0]`` is read, to fix the
            row count of the two dense placeholders.
        num_features: Column count of the ``features_dense`` placeholder.

    Returns:
        dict mapping the keys ``features``, ``features_dense``, ``adj``,
        ``adj_orig``, ``dropout`` and ``real_distribution`` to placeholders.
    """
    num_rows = adj.shape[0]
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        # Previously this op was also named 'real_distribution' (copy-paste),
        # which made TensorFlow rename the real one to 'real_distribution_1'.
        'features_dense': tf.placeholder(tf.float32, shape=[num_rows, num_features],
                                         name='features_dense'),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'real_distribution': tf.placeholder(dtype=tf.float32, shape=[num_rows, FLAGS.hidden2],
                                            name='real_distribution'),
    }
    return placeholders
def get_model(model_str, placeholders, num_features, num_nodes, features_nonzero):
    """Build the discriminators and the model(s) selected by ``model_str``.

    Args:
        model_str: ``'arga_ae'`` builds only the GCN encoder; ``'DBGAN'``
            builds the GCN encoder and the ``Generator_z2g`` generator.
        placeholders: dict returned by ``get_placeholder``.
        num_features: Feature dimension passed to the models.
        num_nodes: Unused here; kept for interface compatibility.
        features_nonzero: Number of non-zero feature entries.

    Returns:
        Tuple ``(d_real, discriminator, model, model_z2g, D_Graph, GD_real)``.
        ``model`` and/or ``model_z2g`` are ``None`` when ``model_str`` does
        not request them.
    """
    # Build the computation graph.
    discriminator = Discriminator()
    D_Graph = D_graph(num_features)
    d_real = discriminator.construct(placeholders['real_distribution'])
    GD_real = D_Graph.construct(placeholders['features_dense'])

    model = None
    # Must be pre-initialised: the 'arga_ae' branch never assigns it, which
    # used to make the return statement raise UnboundLocalError.
    model_z2g = None
    if model_str == 'arga_ae':
        model = GCN(placeholders, num_features, features_nonzero)
    elif model_str == 'DBGAN':
        model = GCN(placeholders, num_features, features_nonzero)
        model_z2g = Generator_z2g(placeholders, num_features, features_nonzero)
    return d_real, discriminator, model, model_z2g, D_Graph, GD_real
def format_data(data_name):
    """Load the graph, split its edges and collect everything training needs.

    NOTE(review): ``data_name`` is currently ignored -- the dataset is
    hard-coded to ``"luo"`` in the ``load_data_1`` call below (the previous
    ``load_data(data_name)`` call was commented out). Confirm this is intended.

    Returns:
        dict with the keys ``adj``, ``num_features``, ``num_nodes``,
        ``features_nonzero``, ``pos_weight``, ``norm``, ``adj_norm``,
        ``adj_label``, ``features``, ``true_labels``, ``train_edges``,
        ``val_edges``, ``val_edges_false``, ``test_edges``,
        ``test_edges_false``, ``adj_orig``, ``features_dense`` and
        ``adj_dense``.
    """
    # Load data
    adj, features, y_test, tx, ty, test_maks, true_labels = load_data_1("luo")  # e ic gpcr nr luo

    # Store the original adjacency matrix (without diagonal entries) for later.
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train
    adj_dense = adj.toarray()

    if FLAGS.features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    num_nodes = adj.shape[0]
    features_dense = features.tocoo().toarray()
    features = sparse_to_tuple(features.tocoo())
    # features is now (coords, values, shape); shape[1] is the feature dimension.
    num_features = features[2][1]
    # Number of stored (non-zero) feature entries.
    features_nonzero = features[1].shape[0]

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    print('num_features is:', num_features)
    print('num_nodes is:', num_nodes)
    print('features_nonzero is:', features_nonzero)
    print('pos_weight is:', pos_weight)
    print('norm is:', norm)

    # Keys are spelled out explicitly. The previous identity-based lookup of
    # variable names stored values under the wrong key whenever two locals
    # were the same object (e.g. true_labels == 0 ended up under 'y_test').
    feas = {
        'adj': adj,
        'num_features': num_features,
        'num_nodes': num_nodes,
        'features_nonzero': features_nonzero,
        'pos_weight': pos_weight,
        'norm': norm,
        'adj_norm': adj_norm,
        'adj_label': adj_label,
        'features': features,
        'true_labels': true_labels,
        'train_edges': train_edges,
        'val_edges': val_edges,
        'val_edges_false': val_edges_false,
        'test_edges': test_edges,
        'test_edges_false': test_edges_false,
        'adj_orig': adj_orig,
        'features_dense': features_dense,
        'adj_dense': adj_dense,
    }
    return feas
def get_optimizer(model_str, model, model_z2g, D_Graph, discriminator, placeholders, pos_weight, norm, d_real, num_nodes, GD_real):
    """Wire the model outputs into the optimizer matching ``model_str``.

    Args:
        model_str: ``'arga_ae'`` (uses ``OptimizerAE``) or ``'DBGAN'``
            (uses ``OptimizerCycle``).
        model: Encoder exposing ``construct()``.
        model_z2g: Generator exposing ``construct()``; used only for 'DBGAN'.
        D_Graph: Discriminator applied to ``z2g`` outputs; 'DBGAN' only.
        discriminator: Discriminator applied to the embeddings.
        placeholders: dict returned by ``get_placeholder``.
        pos_weight: Positive-class weight forwarded to the optimizer.
        norm: Loss normalisation factor forwarded to the optimizer.
        d_real: Discriminator output on the ``real_distribution`` placeholder.
        num_nodes: Unused here; kept for interface compatibility.
        GD_real: ``D_Graph`` output on the ``features_dense`` placeholder.

    Returns:
        The constructed optimizer object.

    Raises:
        ValueError: If ``model_str`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError on return).
    """
    if model_str == 'arga_ae':
        output = model.construct()
        embeddings = output[0]
        reconstructions = output[1]
        d_fake = discriminator.construct(embeddings, reuse=True)
        opt = OptimizerAE(preds=reconstructions,
                          labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                      validate_indices=False), [-1]),
                          pos_weight=pos_weight,
                          norm=norm,
                          d_real=d_real,
                          d_fake=d_fake)
    elif model_str == 'DBGAN':
        z2g = model_z2g.construct()
        hidden = z2g[1]
        z2g = z2g[0]
        preds_z2g = model.construct(hidden=hidden, reuse=True)[0]
        g2z = model.construct()
        embeddings = g2z[0]
        reconstructions = g2z[1]
        d_fake = discriminator.construct(embeddings, reuse=True)
        GD_fake = D_Graph.construct(z2g, reuse=True)

        # Gradients of each discriminator at a random interpolation between
        # its "real" input and its "fake" input.
        epsilon = tf.random_uniform(shape=[1], minval=0.0, maxval=1.0)
        interpolated_input = epsilon * placeholders['real_distribution'] + (1 - epsilon) * embeddings
        gradient = tf.gradients(discriminator.construct(interpolated_input, reuse=True), [interpolated_input])[0]
        epsilon = tf.random_uniform(shape=[1], minval=0.0, maxval=1.0)
        interpolated_input = epsilon * placeholders['features_dense'] + (1 - epsilon) * z2g
        gradient_z = tf.gradients(D_Graph.construct(interpolated_input, reuse=True), [interpolated_input])[0]

        opt = OptimizerCycle(preds=reconstructions,
                             labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                         validate_indices=False), [-1]),
                             pos_weight=pos_weight,
                             norm=norm,
                             d_real=d_real,
                             d_fake=d_fake,
                             GD_real=GD_real,
                             GD_fake=GD_fake,
                             preds_z2g=preds_z2g,
                             labels_z2g=placeholders['real_distribution'],
                             preds_cycle=model_z2g.construct(embeddings, reuse=True)[0],
                             labels_cycle=placeholders['features_dense'],
                             gradient=gradient,
                             gradient_z=gradient_z)
    else:
        raise ValueError("Unsupported model_str {!r}; expected 'arga_ae' or 'DBGAN'".format(model_str))
    return opt
def update(model, opt, sess, adj_norm, adj_label, features, placeholders, adj, distribution, adj_dense):
    """Run one training round and return the embeddings plus the losses.

    Args:
        model: Object exposing the ``z_mean`` tensor.
        opt: Optimizer object exposing the loss tensors and train ops run below.
        sess: TensorFlow session.
        adj_norm, adj_label, features: Forwarded to ``construct_feed_dict``.
        placeholders: dict returned by ``get_placeholder``.
        adj: Adjacency matrix; only ``adj.shape[0]`` is read.
        distribution: Object whose ``sample(n)`` yields the values fed to the
            ``real_distribution`` placeholder.
        adj_dense: Array fed to the ``features_dense`` placeholder.

    Returns:
        ``(emb, avg_cost)`` where ``avg_cost`` is
        ``[reconstruct_loss, d_loss, g_loss, GD_loss, GG_loss]``.
    """
    # Construct feed dictionary
    feed = construct_feed_dict(adj_norm, adj_label, features, placeholders)
    feed[placeholders['dropout']] = FLAGS.dropout
    feed[placeholders['features_dense']] = adj_dense
    # NOTE(review): this overrides the dropout rate set two lines above, so
    # every step below runs with dropout 0 -- confirm this is intended.
    feed[placeholders['dropout']] = 0

    row_count = adj.shape[0]
    # The Gaussian draw is discarded (the sample below replaces it); the call
    # is kept so NumPy's global RNG advances as before.
    np.random.randn(row_count, FLAGS.hidden2)
    feed[placeholders['real_distribution']] = distribution.sample(row_count)

    # NOTE(review): only the reconstruction step is taken to be repeated
    # five times -- confirm the loop extent against the upstream file.
    for _step in range(5):
        _, reconstruct_loss = sess.run([opt.opt_op, opt.cost], feed_dict=feed)
    g_loss, _ = sess.run([opt.generator_loss, opt.generator_optimizer], feed_dict=feed)
    d_loss, _ = sess.run([opt.dc_loss, opt.discriminator_optimizer], feed_dict=feed)
    GD_loss, _ = sess.run([opt.GD_loss, opt.discriminator_optimizer_z2g], feed_dict=feed)
    GG_loss, _ = sess.run([opt.generator_loss_z2g, opt.generator_optimizer_z2g], feed_dict=feed)

    emb = sess.run(model.z_mean, feed_dict=feed)
    return emb, [reconstruct_loss, d_loss, g_loss, GD_loss, GG_loss]
def retrieve_name(var):
    """Return the name of the first caller-local variable bound to ``var``.

    Matching is by identity (``is``), so two names bound to the same object
    cannot be told apart. All matching names are printed before returning.

    Raises:
        IndexError: If no local variable of the caller is bound to ``var``.
    """
    caller_locals = inspect.currentframe().f_back.f_locals
    matching_names = []
    for local_name, local_value in caller_locals.items():
        if local_value is var:
            matching_names.append(local_name)
    print(matching_names)
    return matching_names[0]

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -1,12 +0,0 @@
import tensorflow as tf
import numpy as np
def weight_variable_glorot(input_dim, output_dim, name=""):
    """Return a ``tf.Variable`` of shape ``[input_dim, output_dim]``.

    Values are drawn uniformly from ``[-r, r)`` with
    ``r = sqrt(6 / (input_dim + output_dim))`` (Glorot & Bengio, AISTATS 2010).
    """
    bound = np.sqrt(6.0 / (input_dim + output_dim))
    uniform_init = tf.random_uniform(
        [input_dim, output_dim],
        minval=-bound,
        maxval=bound,
        dtype=tf.float32,
    )
    return tf.Variable(uniform_init, name=name)

@ -1,124 +0,0 @@
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import sys
def parse_index_file(filename):
    """Read a text file containing one integer per line.

    The file is closed deterministically (it used to be left open), and
    blank lines are skipped instead of raising ``ValueError``.

    Args:
        filename: Path of the index file.

    Returns:
        List of the integers, in file order.
    """
    with open(filename) as handle:
        return [int(line.strip()) for line in handle if line.strip()]
def sample_mask(idx, l):
    """Return a boolean vector of length ``l`` that is True at ``idx``.

    Args:
        idx: Index or sequence of indices to mark.
        l: Length of the mask.
    """
    mask = np.zeros(l)
    mask[idx] = 1
    # The builtin ``bool`` replaces the ``np.bool`` alias, which was removed
    # in NumPy 1.24 and raised AttributeError there.
    return np.array(mask, dtype=bool)
def load_data(dataset):
    """Load one of the ``data/ind.<dataset>.*`` citation datasets.

    The pickled parts read from disk are:
        x     -- feature vectors of the training instances
        tx    -- feature vectors of the test instances
        allx  -- feature vectors of all (labelled and unlabelled) training
                 instances
        y     -- labels of the training instances
        ty    -- labels of the test instances
        ally  -- labels matching ``allx``
        graph -- mapping ``{index: [index_of_neighbor_nodes]}``
    plus ``data/ind.<dataset>.test.index`` with the ids of the test instances.

    Returns:
        ``(adj, features, y_test, tx, ty, test_mask, labels)`` where
        ``labels`` is ``np.argmax`` over the label rows.
    """
    part_names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    is_py3 = sys.version_info > (3, 0)
    parts = []
    for part in part_names:
        with open("data/ind.{}.{}".format(dataset, part), 'rb') as f:
            if is_py3:
                parts.append(pkl.load(f, encoding='latin1'))
            else:
                parts.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = parts

    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph):
        # find isolated nodes and add them as zero-vecs at the right position.
        full_span = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        row_offsets = test_idx_range - min(test_idx_range)

        padded_tx = sp.lil_matrix((len(full_span), x.shape[1]))
        padded_tx[row_offsets, :] = tx
        tx = padded_tx

        padded_ty = np.zeros((len(full_span), y.shape[1]))
        padded_ty[row_offsets, :] = ty
        ty = padded_ty

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    n_rows = labels.shape[0]
    train_mask = sample_mask(range(len(y)), n_rows)
    val_mask = sample_mask(range(len(y), len(y) + 500), n_rows)
    test_mask = sample_mask(test_idx_range.tolist(), n_rows)

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_test, tx, ty, test_mask, np.argmax(labels, 1)
def load_alldata(dataset_str):
    """Load a ``data/ind.<dataset_str>.*`` dataset with all label splits.

    The pickles are opened in binary mode inside ``with`` blocks and, on
    Python 3, decoded with ``encoding='latin1'`` -- the same way ``load_data``
    reads them. The previous text-mode ``open`` could not be unpickled on
    Python 3 and never closed the files.

    Returns:
        ``(adj, features, y_train, y_val, y_test, train_mask, val_mask,
        test_mask, labels)`` where ``labels`` is ``np.argmax`` over the
        label rows.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("data/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, np.argmax(labels, 1)

@ -1,163 +0,0 @@
from initializations import *
import tensorflow as tf
flags = tf.app.flags
FLAGS = flags.FLAGS
# Module-wide counter: layer name -> number of IDs handed out so far.
_LAYER_UIDS = {}


def get_layer_uid(layer_name=''):
    """Return the next unique integer ID for ``layer_name``.

    The first call for a given name returns 1; every further call for the
    same name returns the previous value plus one.
    """
    next_uid = _LAYER_UIDS.get(layer_name, 0) + 1
    _LAYER_UIDS[layer_name] = next_uid
    return next_uid
def dropout_sparse(x, keep_prob, num_nonzero_elems):
    """Apply dropout to the stored entries of a sparse tensor.

    Currently fails for very large sparse tensors (>1M elements).

    Args:
        x: Sparse input tensor.
        keep_prob: Probability of keeping each stored entry.
        num_nonzero_elems: Number of non-zero entries in ``x``; one random
            draw is made per entry.

    Returns:
        ``x`` with the dropped entries removed and the kept ones scaled by
        ``1 / keep_prob``.
    """
    # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0.
    keep_draw = keep_prob + tf.random_uniform([num_nonzero_elems])
    keep_mask = tf.cast(tf.floor(keep_draw), dtype=tf.bool)
    retained = tf.sparse_retain(x, keep_mask)
    return retained * (1. / keep_prob)
class Layer(object):
    """Base class shared by all layers.

    # Properties
        name: String used as the name scope of the layer. When not given it
            is derived from the lower-cased class name plus a unique ID.
        vars: dict of the layer's variables (empty in the base class).
        logging: Boolean flag taken from the ``logging`` keyword.
        issparse: Boolean, False in the base class.

    # Methods
        _call(inputs): Defines the computation of the layer (identity here).
        __call__(inputs): Runs ``_call`` inside ``tf.name_scope(self.name)``.
    """

    def __init__(self, **kwargs):
        for key in kwargs.keys():
            assert key in {'name', 'logging'}, 'Invalid keyword argument: ' + key

        explicit_name = kwargs.get('name')
        if explicit_name:
            self.name = explicit_name
        else:
            class_key = self.__class__.__name__.lower()
            self.name = class_key + '_' + str(get_layer_uid(class_key))

        self.vars = {}
        self.logging = kwargs.get('logging', False)
        self.issparse = False

    def _call(self, inputs):
        return inputs

    def __call__(self, inputs):
        with tf.name_scope(self.name):
            return self._call(inputs)
class GraphConvolution(Layer):
    """Graph convolution on dense inputs: ``act(adj @ (dropout(x) @ W))``."""

    def __init__(self, input_dim, output_dim, adj, dropout=0., act=tf.nn.relu, **kwargs):
        super(GraphConvolution, self).__init__(**kwargs)
        self.dropout = dropout
        self.adj = adj
        self.act = act
        with tf.variable_scope(self.name + '_vars'):
            self.vars['weights'] = weight_variable_glorot(input_dim, output_dim, name="weights")

    def _call(self, inputs):
        # tf.nn.dropout takes the keep probability, hence 1 - dropout.
        dropped = tf.nn.dropout(inputs, 1 - self.dropout)
        projected = tf.matmul(dropped, self.vars['weights'])
        propagated = tf.sparse_tensor_dense_matmul(self.adj, projected)
        return self.act(propagated)
class GraphConvolutionSparse(Layer):
    """Graph convolution for sparse inputs.

    Same computation as the dense layer, except that dropout is applied with
    ``dropout_sparse`` (which needs ``features_nonzero``) and the weight
    product is a sparse-dense matmul.
    """

    def __init__(self, input_dim, output_dim, adj, features_nonzero, dropout=0., act=tf.nn.relu, **kwargs):
        super(GraphConvolutionSparse, self).__init__(**kwargs)
        self.issparse = True
        self.features_nonzero = features_nonzero
        self.dropout = dropout
        self.adj = adj
        self.act = act
        with tf.variable_scope(self.name + '_vars'):
            self.vars['weights'] = weight_variable_glorot(input_dim, output_dim, name="weights")

    def _call(self, inputs):
        dropped = dropout_sparse(inputs, 1 - self.dropout, self.features_nonzero)
        projected = tf.sparse_tensor_dense_matmul(dropped, self.vars['weights'])
        propagated = tf.sparse_tensor_dense_matmul(self.adj, projected)
        return self.act(propagated)
class InnerProductDecoder(Layer):
    """Decoder for link prediction: ``act(flatten(z @ z^T))``."""

    def __init__(self, input_dim, dropout=0., act=tf.nn.sigmoid, **kwargs):
        super(InnerProductDecoder, self).__init__(**kwargs)
        self.act = act
        self.dropout = dropout

    def _call(self, inputs):
        """Multiply the (dropped-out) input by its own transpose and flatten."""
        dropped = tf.nn.dropout(inputs, 1 - self.dropout)
        transposed = tf.transpose(dropped)
        gram = tf.matmul(dropped, transposed)
        flat = tf.reshape(gram, [-1])
        return self.act(flat)
class GraphConvolution_z2g(Layer):
    """Basic graph convolution layer for undirected graph without edge labels.

    Computes ``act(adj @ (dropout(x) @ W))`` on dense inputs.
    """

    def __init__(self, input_dim, output_dim, adj, dropout=0., act=tf.nn.relu, **kwargs):
        # Must name this class: ``super(GraphConvolution, self)`` raised
        # TypeError because this class derives from Layer, not GraphConvolution.
        super(GraphConvolution_z2g, self).__init__(**kwargs)
        with tf.variable_scope(self.name + '_vars'):
            self.vars['weights'] = weight_variable_glorot(input_dim, output_dim, name="weights")
        self.dropout = dropout
        self.adj = adj
        self.act = act

    def _call(self, inputs):
        # A second ``_call`` pasted from the sparse layer used to follow and
        # silently replace this one; it read ``self.features_nonzero``, which
        # this class never sets, so it could only fail. It has been removed.
        x = inputs
        x = tf.nn.dropout(x, 1 - self.dropout)
        x = tf.matmul(x, self.vars['weights'])
        x = tf.sparse_tensor_dense_matmul(self.adj, x)
        outputs = self.act(x)
        return outputs

@ -1,113 +0,0 @@
from __future__ import division
from __future__ import print_function
import os
# NOTE: CUDA_VISIBLE_DEVICES = "0" exposes GPU 0 to TensorFlow; set it to "" to hide all GPUs and train on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import tensorflow as tf
import settings
from constructor import get_placeholder, get_model, format_data, get_optimizer, update
from metrics import linkpred_metrics
from sklearn.neighbors import KernelDensity
from dppy.finite_dpps import FiniteDPP
from sklearn.decomposition import PCA
import numpy as np
import scipy.io as scio
# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
class Link_pred_Runner():
    """Train a model and report link-prediction scores."""

    def __init__(self, settings):
        """Read ``data_name``, ``iterations`` and ``model`` from ``settings``."""
        self.data_name = settings['data_name']
        self.iteration = settings['iterations']
        self.model = settings['model']

    def erun(self):
        """Build the graph, train for ``self.iteration`` epochs and print scores.

        Validation scores are printed every epoch and test scores every tenth
        epoch; at the end the best test record by ROC and by AUPR is printed.

        Raises:
            ValueError: If ``self.data_name`` has no anchor-node selection
                below (previously an UnboundLocalError further down).
        """
        model_str = self.model

        # formatted data
        feas = format_data(self.data_name)

        # Define placeholders; only adj.shape is needed from the adjacency.
        placeholders = get_placeholder(feas['adj'], feas['num_features'])

        # Mixture prior: anchor nodes are picked by a DPP (or a fixed list)
        # and a kernel density estimate is fitted on their PCA features.
        DPP = FiniteDPP('correlation', **{'K': feas['adj'].toarray()})
        pca = PCA(n_components=FLAGS.hidden2)

        if self.data_name == 'cora':
            DPP.sample_exact_k_dpp(size=21)  # e 21 ic 6 gpcr 3
            index = DPP.list_of_samples[0]
        elif self.data_name == 'citeseer':
            index = np.array([1782, 741, 3258, 3189, 3112, 2524, 2895, 1780, 1100, 2735, 1318,
                              2944, 1825, 18, 987, 2564, 463, 6, 3173, 701, 1901, 2349,
                              2786, 2412, 646, 2626, 2648, 1793, 432, 538, 1729, 1217, 1397,
                              1932, 2850, 458, 2129, 702, 2934, 2030, 2882, 1393, 308, 1271,
                              1106, 2688, 629, 1145, 3251, 1903, 1004, 1149, 1385, 285, 858,
                              2977, 844, 335, 532, 404, 3174, 528])
        elif self.data_name == 'pubmed':
            index = np.array([842, 3338, 5712, 17511, 10801, 2714, 6970, 13296, 5466,
                              2230])
        else:
            raise ValueError(
                "No anchor-node selection defined for data_name {!r}; "
                "expected 'cora', 'citeseer' or 'pubmed'".format(self.data_name))

        feature_sample = feas['features_dense']
        feature_sample = pca.fit_transform(feature_sample)
        featuresCompress = np.array([feature_sample[i] for i in index])
        kde = KernelDensity(bandwidth=0.7).fit(featuresCompress)

        # construct model
        d_real, discriminator, ae_model, model_z2g, D_Graph, GD_real = get_model(model_str, placeholders, feas['num_features'], feas['num_nodes'], feas['features_nonzero'])

        # Optimizer
        opt = get_optimizer(model_str, ae_model, model_z2g, D_Graph, discriminator, placeholders, feas['pos_weight'], feas['norm'], d_real, feas['num_nodes'], GD_real)

        # Initialize session
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        val_roc_score = []
        record = []
        record_emb = []

        # Train model
        for epoch in range(self.iteration):
            emb, avg_cost = update(ae_model, opt, sess, feas['adj_norm'], feas['adj_label'], feas['features'], placeholders, feas['adj'], kde, feas['features_dense'])
            lm_train = linkpred_metrics(feas['val_edges'], feas['val_edges_false'])
            roc_curr, ap_curr, _, aupr_score = lm_train.get_roc_score(emb, feas)
            val_roc_score.append(roc_curr)
            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss= {:.5f}, d_loss= {:.5f}, g_loss= {:.5f}, GD_loss= {:.5f}, GG_loss= {:.5f}".format(avg_cost[0], avg_cost[1], avg_cost[2], avg_cost[3], avg_cost[4]), "val_roc=",
                  "{:.5f}".format(val_roc_score[-1]), "val_ap=", "{:.5f}".format(ap_curr), "val_aupr=", "{:.5f}".format(aupr_score))

            if (epoch + 1) % 10 == 0:
                lm_test = linkpred_metrics(feas['test_edges'], feas['test_edges_false'])
                roc_score, ap_score, _, aupr_score = lm_test.get_roc_score(emb, feas)
                print('Test ROC score: ' + str(roc_score), 'Test AUPR score: ' + str(aupr_score), 'Test AP score: ' + str(ap_score))
                record.append([roc_score, aupr_score, ap_score])
                record_emb.append(emb)

        if not record:
            # Fewer than 10 epochs were run, so no test score was recorded;
            # indexing the empty record array used to raise IndexError.
            return

        rec = np.array(record)
        roc_column = rec[:, 0].tolist()
        aupr_column = rec[:, 1].tolist()
        best_roc_idx = roc_column.index(max(roc_column))
        best_aupr_idx = aupr_column.index(max(aupr_column))
        emb = record_emb[best_roc_idx]
        ana = record[best_roc_idx]
        ana_pr = record[best_aupr_idx]
        print('The peak [auc] test_roc=%f, aupr=%f, ap = %f' % (ana[0], ana[1], ana[2]))
        print('The peak [aupr] test_roc=%f, aupr=%f, ap = %f' % (ana_pr[0], ana_pr[1], ana_pr[2]))

@ -1,27 +0,0 @@
import pickle
import numpy as np
import scipy.sparse as sp
def load_data_1(dataset):
    """Load the adjacency matrix and pickled features of a partitioned dataset.

    Reads ``./data/partitioned_data/<dataset>/orig/<dataset>_adj_orig.txt``
    and ``data/partitioned_data/<dataset>/feature/<dataset>_feature.pkl``.

    Args:
        dataset: Dataset directory name, e.g. ``'e'``.

    Returns:
        ``(adj, features, y_test, tx, ty, test_mask, labels)``. ``adj`` is a
        CSR matrix and ``features`` is whatever the pickle holds; the five
        remaining items are the placeholder value ``0``.
    """
    adj = np.loadtxt('./data/partitioned_data/{0}/orig/{0}_adj_orig.txt'.format(dataset), dtype=int)
    adj = sp.csr_matrix(adj)
    # ``with`` closes the pickle file; it used to be left open.
    with open("data/partitioned_data/{0}/feature/{0}_feature.pkl".format(dataset), 'rb') as f:
        features = pickle.load(f)
    y_test = 0
    tx = 0
    ty = 0
    test_mask = 0
    labels = 0
    return adj, features, y_test, tx, ty, test_mask, labels
if __name__ == "__main__":
load_data_1('e')

@ -1,112 +0,0 @@
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score,precision_recall_curve, auc
from sklearn.metrics import average_precision_score
from sklearn import metrics
from munkres import Munkres, print_matrix
import numpy as np
class linkpred_metrics():
    """Score an embedding on a set of positive and negative edges."""

    def __init__(self, edges_pos, edges_neg):
        """Store the positive and negative ``(i, j)`` edge collections."""
        self.edges_pos = edges_pos
        self.edges_neg = edges_neg

    def get_roc_score(self, emb, feas):
        """Return ROC-AUC, average precision and AUPR of ``sigmoid(emb @ emb.T)``.

        Args:
            emb: 2-D embedding array with one row per node.
            feas: No longer read; kept for interface compatibility. It was
                only used for per-edge lookups whose results were discarded.

        Returns:
            ``(roc_score, ap_score, emb, aupr_score)``.
        """
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        # Predict on the stored edges
        adj_rec = np.dot(emb, emb.T)
        preds = [sigmoid(adj_rec[e[0], e[1]]) for e in self.edges_pos]
        preds_neg = [sigmoid(adj_rec[e[0], e[1]]) for e in self.edges_neg]

        preds_all = np.hstack([preds, preds_neg])
        # The zero block must match the number of negative edges; it used to
        # be sized with len(preds), which is only right for balanced sets.
        labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])

        roc_score = roc_auc_score(labels_all, preds_all)
        ap_score = average_precision_score(labels_all, preds_all)
        precision, recall, _thresholds = metrics.precision_recall_curve(labels_all, preds_all)
        aupr_score = auc(recall, precision)
        return roc_score, ap_score, emb, aupr_score
class clustering_metrics():
    """Compare predicted cluster labels against ground-truth labels."""

    def __init__(self, true_label, predict_label):
        """Store the ground-truth and the predicted label sequences."""
        self.true_label = true_label
        self.pred_label = predict_label

    def clusteringAcc(self):
        """Match predicted clusters to true classes and score the result.

        Returns:
            ``(acc, f1_macro, precision_macro, recall_macro, f1_micro,
            precision_micro, recall_micro)``, or the integer ``0`` when the
            number of distinct true and predicted labels differs.
        """
        # best mapping between true_label and predict label
        l1 = list(set(self.true_label))
        numclass1 = len(l1)
        l2 = list(set(self.pred_label))
        numclass2 = len(l2)
        if numclass1 != numclass2:
            print('Class Not equal, Error!!!!')
            return 0

        # cost[i][j] = number of samples with true class l1[i] and predicted
        # cluster l2[j]
        cost = np.zeros((numclass1, numclass2), dtype=int)
        for i, c1 in enumerate(l1):
            mps = [i1 for i1, e1 in enumerate(self.true_label) if e1 == c1]
            for j, c2 in enumerate(l2):
                mps_d = [i1 for i1 in mps if self.pred_label[i1] == c2]
                cost[i][j] = len(mps_d)

        # match two clustering results by Munkres algorithm; the counts are
        # negated because Munkres minimises the total cost
        m = Munkres()
        cost = (-cost).tolist()
        indexes = m.compute(cost)

        # get the match results
        new_predict = np.zeros(len(self.pred_label))
        for i, c in enumerate(l1):
            # corresponding label in l2:
            c2 = l2[indexes[i][1]]
            # ai is the index with label==c2 in the pred_label list
            ai = [ind for ind, elm in enumerate(self.pred_label) if elm == c2]
            new_predict[ai] = c

        acc = metrics.accuracy_score(self.true_label, new_predict)
        f1_macro = metrics.f1_score(self.true_label, new_predict, average='macro')
        precision_macro = metrics.precision_score(self.true_label, new_predict, average='macro')
        recall_macro = metrics.recall_score(self.true_label, new_predict, average='macro')
        f1_micro = metrics.f1_score(self.true_label, new_predict, average='micro')
        precision_micro = metrics.precision_score(self.true_label, new_predict, average='micro')
        recall_micro = metrics.recall_score(self.true_label, new_predict, average='micro')
        return acc, f1_macro, precision_macro, recall_macro, f1_micro, precision_micro, recall_micro

    def evaluationClusterModelFromLabel(self):
        """Print the clustering scores and append them to ``recoder.txt``.

        Returns:
            ``(acc, nmi, adjscore)``.

        Raises:
            ValueError: If ``clusteringAcc`` reported a class-count mismatch
                (previously an opaque TypeError from unpacking ``0``).
        """
        nmi = metrics.normalized_mutual_info_score(self.true_label, self.pred_label)
        adjscore = metrics.adjusted_rand_score(self.true_label, self.pred_label)

        scores = self.clusteringAcc()
        if not isinstance(scores, tuple):
            raise ValueError('Cannot evaluate clustering: the number of predicted clusters '
                             'differs from the number of true classes')
        acc, f1_macro, precision_macro, recall_macro, f1_micro, precision_micro, recall_micro = scores

        summary = 'ACC=%f, f1_macro=%f, precision_macro=%f, recall_macro=%f, f1_micro=%f, precision_micro=%f, recall_micro=%f, NMI=%f, ADJ_RAND_SCORE=%f' % (
            acc, f1_macro, precision_macro, recall_macro, f1_micro, precision_micro, recall_micro, nmi, adjscore)
        print(summary)

        # ``with`` closes the file even if a write fails.
        with open('recoder.txt', 'a') as fh:
            fh.write(summary)
            fh.write('\r\n')

        return acc, nmi, adjscore

@ -1,131 +0,0 @@
import tensorflow as tf
flags = tf.app.flags
FLAGS = flags.FLAGS
class OptimizerAE(object):
    """Losses and train ops for the adversarial auto-encoder."""

    def __init__(self, preds, labels, pos_weight, norm, d_real, d_fake, gradient=None):
        """Build the loss tensors and optimizer ops.

        Args:
            preds: Reconstruction logits.
            labels: Reconstruction targets.
            pos_weight: Weight of positive targets in the weighted
                cross-entropy.
            norm: Scalar factor applied to the reconstruction loss.
            d_real: Discriminator output on samples of the prior.
            d_fake: Discriminator output on the embeddings.
            gradient: Optional gradient of the discriminator at interpolated
                inputs. When given, a gradient-penalty term is added to the
                discriminator loss. The body used to reference an undefined
                name ``gradient`` and raised NameError; with the default
                ``None`` the penalty is simply omitted.
        """
        preds_sub = preds
        labels_sub = labels
        self.real = d_real

        # Discriminator loss
        self.dc_loss_real = - tf.reduce_mean(self.real)
        self.dc_loss_fake = tf.reduce_mean(d_fake)
        self.dc_loss = self.dc_loss_fake + self.dc_loss_real
        if gradient is not None:
            GP_loss = tf.reduce_mean(tf.square(tf.sqrt(tf.reduce_mean(tf.square(gradient), axis=[0, 1])) - 1))
            self.dc_loss = self.dc_loss + GP_loss

        # Generator loss
        generator_loss = -self.dc_loss_fake

        # pos_weight trades recall against precision by up- or down-weighting
        # the cost of positive errors relative to negative errors.
        self.cost = norm * tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(logits=preds_sub, targets=labels_sub, pos_weight=pos_weight))
        self.generator_loss = generator_loss + self.cost

        all_variables = tf.trainable_variables()
        dc_var = [var for var in all_variables if 'dc_' in var.name]
        en_var = [var for var in all_variables if 'e_' in var.name]

        with tf.variable_scope(tf.get_variable_scope()):
            self.discriminator_optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.discriminator_learning_rate,
                                                                  beta1=0.9, name='adam1').minimize(self.dc_loss,
                                                                                                    var_list=dc_var)
            self.generator_optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.discriminator_learning_rate,
                                                              beta1=0.9, name='adam2').minimize(self.generator_loss, var_list=en_var)

        # Besides the adversarial updates, the plain reconstruction cost gets
        # its own optimizer and train op.
        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)  # Adam Optimizer
        self.opt_op = self.optimizer.minimize(self.cost)
        self.grads_vars = self.optimizer.compute_gradients(self.cost)
class OptimizerCycle(object):
    """Losses and train ops for the two-discriminator cycle model."""

    def __init__(self, preds, labels, pos_weight, norm, d_real, d_fake, GD_real, GD_fake, preds_z2g, labels_z2g, preds_cycle, labels_cycle, gradient, gradient_z):
        """Build the loss tensors and the five train ops.

        ``gradient`` and ``gradient_z`` are accepted but not used: the
        gradient-penalty terms are disabled in this optimizer.
        """
        def mean_bce(logits, target_like, op_name):
            # Mean sigmoid cross-entropy of `logits` against a constant
            # target built by `target_like` (tf.ones_like / tf.zeros_like).
            return tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(labels=target_like(logits), logits=logits, name=op_name))

        self.real = d_real

        # Discriminator losses: embedding discriminator first, then the
        # discriminator on the z2g output.
        self.dc_loss_real = mean_bce(self.real, tf.ones_like, 'dclreal')
        self.dc_loss_fake = mean_bce(d_fake, tf.zeros_like, 'dcfake')
        self.GD_loss_real = mean_bce(GD_real, tf.ones_like, 'GD_real')
        self.GD_loss_fake = mean_bce(GD_fake, tf.zeros_like, 'GD_fake')
        self.dc_loss = self.dc_loss_fake + self.dc_loss_real
        self.GD_loss = self.GD_loss_fake + self.GD_loss_real

        # Generator losses: the fake outputs are scored against all-ones.
        adversarial_term = mean_bce(d_fake, tf.ones_like, 'gl')
        adversarial_term_z2g = mean_bce(GD_fake, tf.ones_like, 'G_z2g')

        # pos_weight trades recall against precision by up- or down-weighting
        # the cost of positive errors relative to negative errors.
        reconstruction_cost = norm * tf.reduce_mean(
            tf.nn.weighted_cross_entropy_with_logits(logits=preds, targets=labels, pos_weight=pos_weight))
        cost_cycle = norm * tf.reduce_mean(tf.square(preds_cycle - labels_cycle))
        # Built but not part of any loss that is minimised below.
        cost_z2g = norm * tf.reduce_mean(tf.square(preds_z2g - labels_z2g))

        self.cost = reconstruction_cost + cost_cycle
        self.generator_loss = adversarial_term + self.cost
        self.generator_loss_z2g = adversarial_term_z2g

        # Variables are routed to optimizers by substring of their name.
        trainables = tf.trainable_variables()

        def tagged(tag):
            return [v for v in trainables if tag in v.name]

        dc_var = tagged('dc_')
        en_var = tagged('e_')
        GG_var = tagged('GG')
        GD_var = tagged('GD')

        adversarial_lr = FLAGS.discriminator_learning_rate
        with tf.variable_scope(tf.get_variable_scope()):
            self.discriminator_optimizer = tf.train.AdamOptimizer(
                learning_rate=adversarial_lr, beta1=0.9, name='adam1').minimize(self.dc_loss, var_list=dc_var)
            self.generator_optimizer = tf.train.AdamOptimizer(
                learning_rate=adversarial_lr, beta1=0.9, name='adam2').minimize(self.generator_loss, var_list=en_var)
            self.discriminator_optimizer_z2g = tf.train.AdamOptimizer(
                learning_rate=adversarial_lr, beta1=0.9, name='adam1').minimize(self.GD_loss, var_list=GD_var)
            self.generator_optimizer_z2g = tf.train.AdamOptimizer(
                learning_rate=adversarial_lr, beta1=0.9, name='adam2').minimize(self.generator_loss_z2g, var_list=GG_var)

        # Besides the adversarial updates, the combined cost gets its own
        # optimizer and train op.
        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)  # Adam Optimizer
        self.opt_op = self.optimizer.minimize(self.cost)

@ -1,135 +0,0 @@
import numpy as np
import scipy.sparse as sp
def sparse_to_tuple(sparse_mx):
    """Unpack a SciPy sparse matrix into ``(coords, values, shape)``.

    Args:
        sparse_mx: A SciPy sparse matrix; converted to COO format when it
            is not already COO.

    Returns:
        A tuple ``(coords, values, shape)`` where ``coords`` is an
        ``(nnz, 2)`` array of ``[row, col]`` pairs, ``values`` is the
        matching data array and ``shape`` is the matrix shape.
    """
    # The row/col/data attributes used below only exist on the COO format.
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    coords = np.vstack([coo.row, coo.col]).T
    return coords, coo.data, coo.shape
def preprocess_graph(adj):
    """Add self-loops to ``adj`` and scale it by the inverse-sqrt degrees.

    With ``A_ = adj + I`` and ``d = diag(rowsum(A_) ** -0.5)`` the result is
    ``(A_ . d)^T . d``, which equals ``d . A_ . d`` when ``adj`` is symmetric.

    Args:
        adj: Adjacency matrix (anything ``scipy.sparse.coo_matrix`` accepts).

    Returns:
        The normalised matrix as the ``(coords, values, shape)`` tuple
        produced by ``sparse_to_tuple``.
    """
    adj = sp.coo_matrix(adj)
    # A_ = A + I: give every node a self-connection.
    adj_with_loops = adj + sp.eye(adj.shape[0])
    # Row sums of A_ are the node degrees (including the self-loop).
    degrees = np.array(adj_with_loops.sum(1))
    inv_sqrt_degree = sp.diags(np.power(degrees, -0.5).flatten())
    column_scaled = adj_with_loops.dot(inv_sqrt_degree)
    adj_normalized = column_scaled.transpose().dot(inv_sqrt_degree).tocoo()
    return sparse_to_tuple(adj_normalized)
def construct_feed_dict(adj_normalized, adj, features, placeholders):
    """Build the feed dictionary mapping placeholders to their values.

    Args:
        adj_normalized: Value fed to ``placeholders['adj']``.
        adj: Value fed to ``placeholders['adj_orig']``.
        features: Value fed to ``placeholders['features']``.
        placeholders: Mapping that provides the ``'features'``, ``'adj'``
            and ``'adj_orig'`` placeholder objects used as keys.

    Returns:
        A new ``dict`` keyed by the three placeholders.
    """
    return {
        placeholders['features']: features,
        placeholders['adj']: adj_normalized,
        placeholders['adj_orig']: adj,
    }
def mask_test_edges(adj):
    """Split the edges of ``adj`` into train / validation / test sets.

    One tenth of the upper-triangular edges becomes the test set and one
    twentieth the validation set; the rest are training edges.  An equal
    number of non-edges ("false" edges) is sampled for the validation and
    test sets.  Splits are drawn from NumPy's global random state, so
    results depend on the current seed.

    Args:
        adj: SciPy sparse adjacency matrix.  Presumably symmetric
            (undirected graph): only the upper triangle is split -- confirm
            against the caller.

    Returns:
        ``(adj_train, train_edges, val_edges, val_edges_false, test_edges,
        test_edges_false)``.  ``adj_train`` is a sparse matrix rebuilt from
        the training edges plus its transpose; the ``*_edges`` values are
        ``(n, 2)`` arrays and the ``*_false`` values are lists of
        ``[i, j]`` pairs.  Each edge list holds one direction per edge.
    """
    def _edge_set(edge_array):
        # Hashable (row, col) pairs give O(1) membership tests.
        return set(map(tuple, edge_array.tolist()))

    # Remove diagonal elements (self-loops) and drop the explicit zeros left behind.
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()
    # Check that the diagonal is empty without densifying the whole matrix.
    assert adj.diagonal().sum() == 0

    # The upper triangle lists each undirected edge once, e.g. only (4, 6)
    # and not (6, 4); ``edges_all`` keeps every stored entry.
    edges = sparse_to_tuple(sp.triu(adj))[0]
    edges_all = sparse_to_tuple(adj)[0]

    # 10% of the edges for testing, 5% for validation.
    num_test = int(np.floor(edges.shape[0] / 10.))
    num_val = int(np.floor(edges.shape[0] / 20.))

    all_edge_idx = list(range(edges.shape[0]))
    np.random.shuffle(all_edge_idx)
    val_edge_idx = all_edge_idx[:num_val]
    test_edge_idx = all_edge_idx[num_val:(num_val + num_test)]
    test_edges = edges[test_edge_idx]
    val_edges = edges[val_edge_idx]
    train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0)

    edges_all_set = _edge_set(edges_all)
    train_set = _edge_set(train_edges)
    val_set = _edge_set(val_edges)
    test_set = _edge_set(test_edges)
    num_nodes = adj.shape[0]

    # Negative test edges: node pairs that are not stored in the graph and
    # have not been drawn before in either orientation.
    test_edges_false = []
    test_false_set = set()
    while len(test_edges_false) < len(test_edges):
        idx_i = np.random.randint(0, num_nodes)
        idx_j = np.random.randint(0, num_nodes)
        if idx_i == idx_j:
            continue
        if (idx_i, idx_j) in edges_all_set:
            continue
        if (idx_j, idx_i) in test_false_set or (idx_i, idx_j) in test_false_set:
            continue
        test_edges_false.append([idx_i, idx_j])
        test_false_set.add((idx_i, idx_j))

    # Negative validation edges: pairs absent from the train and validation
    # edges in both orientations.
    # NOTE: pairs are not checked against the test edges, so a validation
    # negative may coincide with a held-out test edge.
    val_edges_false = []
    val_false_set = set()
    while len(val_edges_false) < len(val_edges):
        idx_i = np.random.randint(0, num_nodes)
        idx_j = np.random.randint(0, num_nodes)
        if idx_i == idx_j:
            continue
        if (idx_i, idx_j) in train_set or (idx_j, idx_i) in train_set:
            continue
        if (idx_i, idx_j) in val_set or (idx_j, idx_i) in val_set:
            continue
        if (idx_j, idx_i) in val_false_set or (idx_i, idx_j) in val_false_set:
            continue
        val_edges_false.append([idx_i, idx_j])
        val_false_set.add((idx_i, idx_j))

    # Sanity checks: negatives are real non-edges and the splits are disjoint.
    assert edges_all_set.isdisjoint(test_false_set)
    assert val_set.isdisjoint(train_set)
    assert test_set.isdisjoint(train_set)
    assert val_set.isdisjoint(test_set)

    # Re-build the adjacency matrix from the training edges only, then
    # mirror it so both directions of every training edge are present.
    data = np.ones(train_edges.shape[0])
    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
    adj_train = adj_train + adj_train.T

    # NOTE: these edge lists only contain a single direction of each edge!
    return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false

@ -1,14 +0,0 @@
import settings
from link_prediction import Link_pred_Runner
# Experiment configuration.
dataname = 'cora'  # 'cora' or 'citeseer' or 'pubmed'
model = 'DBGAN'  # 'arga_ae' or 'DBGAN'
task = 'link_prediction'

# Bind the result to its own name so the imported ``settings`` module is
# not replaced by the dictionary it returns.
run_settings = settings.get_settings(dataname, model, task)

# Only the link-prediction task has a runner in this script.
if task == 'link_prediction':
    runner = Link_pred_Runner(run_settings)
    runner.erun()

@ -1,48 +0,0 @@
import tensorflow as tf
import numpy as np
# Command-line flags shared by the whole project, registered on
# TensorFlow's global flag registry at import time.
flags = tf.app.flags
FLAGS = flags.FLAGS
# Layer sizes and learning rates.
flags.DEFINE_integer('hidden3', 64, 'Number of units in hidden layer 3.')
flags.DEFINE_integer('discriminator_out', 0, 'discriminator_out.')
flags.DEFINE_float('discriminator_learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_float('learning_rate', .6 * 0.001, 'Initial learning rate.')
flags.DEFINE_integer('hidden1', 32, 'Number of units in hidden layer 1.') # 64 for Citeseer and Pubmed
flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.') # 64 for Citeseer and Pubmed
# Regularisation and run-control options.
flags.DEFINE_float('weight_decay', 0., 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_float('dropout', 0., 'Dropout rate (1 - keep probability).')
flags.DEFINE_integer('features', 1, 'Whether to use features (1) or not (0).')
# NOTE: the 'seed' flag is only declared here; the seeding at the bottom of
# this block uses the hard-coded module-level ``seed`` instead.
flags.DEFINE_integer('seed', 50, 'seed for fixing the results.')
flags.DEFINE_integer('iterations', 60, 'number of iterations.')
'''
infor: number of clusters
'''
# Maps each dataset name to its number of clusters.
infor = {'cora': 7, 'citeseer': 6, 'pubmed': 3}
'''
We did not set any seed when we conducted the experiments described in the paper;
We set a seed here to steadily reveal better performance of ARGA
'''
# Seed both NumPy and TensorFlow as an import-time side effect.
seed = 7
np.random.seed(seed)
tf.set_random_seed(seed)
def get_settings(dataname, model, task):
    """Build the settings dictionary for one experiment run.

    Args:
        dataname: Dataset name: ``'citeseer'``, ``'cora'`` or ``'pubmed'``.
        model: Model identifier, stored unchanged under ``'model'``.
        task: ``'clustering'`` or ``'link_prediction'``.

    Returns:
        A dict with ``'data_name'``, ``'iterations'`` and ``'model'``.
        Clustering runs also get ``'clustering_num'`` from ``infor``;
        link-prediction runs use four times ``FLAGS.iterations``.

    Raises:
        ValueError: If ``dataname`` or ``task`` is not a supported value.
            Previously this was only printed, and an unknown task then
            failed with ``UnboundLocalError`` on return.
    """
    if dataname not in ('citeseer', 'cora', 'pubmed'):
        raise ValueError('error: wrong data set name')
    if task not in ('clustering', 'link_prediction'):
        raise ValueError('error: wrong task name')

    if task == 'clustering':
        return {
            'data_name': dataname,
            'iterations': FLAGS.iterations,
            'clustering_num': infor[dataname],
            'model': model,
        }

    # Link prediction trains for four times the configured iterations.
    iterations = 4 * FLAGS.iterations
    print('epoch is', iterations)
    return {'data_name': dataname, 'iterations': iterations, 'model': model}

@ -1,9 +1,6 @@
from layers import GraphConvolution, GraphConvolutionSparse, InnerProductDecoder
import numpy as np
import tensorflow as tf
flags = tf.app.flags
FLAGS = flags.FLAGS
class Model(object):
def __init__(self, **kwargs):
@ -40,22 +37,166 @@ class Model(object):
pass
# Per-layer-name counters backing get_layer_uid().
_LAYER_UIDS = {}


def get_layer_uid(layer_name=''):
    """Return the next unique, 1-based ID for ``layer_name``.

    The first call for a given name returns 1; each later call for the
    same name returns the previous value plus one.
    """
    next_uid = _LAYER_UIDS.get(layer_name, 0) + 1
    _LAYER_UIDS[layer_name] = next_uid
    return next_uid
class Layer(object):
    """Base class that defines the common API of all layer objects.

    # Properties
        name: String, used as the name scope of the layer.  When omitted,
            it is generated from the lower-cased class name and a unique
            ID.
        vars: Dict holding the layer's variables.
        logging: Boolean flag taken from the ``logging`` keyword.
        issparse: Boolean, ``False`` in the base class.

    # Methods
        _call(inputs): Defines the computation of the layer
            (takes input, returns output); identity in the base class.
        __call__(inputs): Runs ``_call`` inside a name scope.
    """

    def __init__(self, **kwargs):
        for key in kwargs:
            assert key in {'name', 'logging'}, 'Invalid keyword argument: ' + key

        layer_name = kwargs.get('name')
        if not layer_name:
            # Fall back to "<classname>_<uid>".
            prefix = type(self).__name__.lower()
            layer_name = '{}_{}'.format(prefix, get_layer_uid(prefix))

        self.name = layer_name
        self.vars = {}
        self.logging = kwargs.get('logging', False)
        self.issparse = False

    def _call(self, inputs):
        return inputs

    def __call__(self, inputs):
        with tf.name_scope(self.name):
            return self._call(inputs)
def weight_variable_glorot(input_dim, output_dim, name=""):
    """Create a weight variable with Glorot & Bengio (AISTATS 2010)
    initialization.

    Values are drawn uniformly from ``[-r, r)`` with
    ``r = sqrt(6 / (input_dim + output_dim))`` and wrapped in a
    ``tf.Variable`` of shape ``[input_dim, output_dim]``.
    """
    limit = np.sqrt(6.0 / (input_dim + output_dim))
    initial_value = tf.random_uniform(
        [input_dim, output_dim],
        minval=-limit,
        maxval=limit,
        dtype=tf.float32,
    )
    return tf.Variable(initial_value, name=name)
def dropout_sparse(x, keep_prob, num_nonzero_elems):
    """
    Dropout for sparse tensors. Currently fails for very large sparse tensors (>1M elements)
    num_nonzero_elems: The number of non-zero elements in the sparse matrix
    keep_prob: probability of keeping each non-zero element
    x: input sparse tensor
    """
    # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0.
    uniform_noise = tf.random_uniform([num_nonzero_elems])
    keep_mask = tf.cast(tf.floor(keep_prob + uniform_noise), dtype=tf.bool)
    retained = tf.sparse_retain(x, keep_mask)
    # Rescale the surviving entries by 1 / keep_prob.
    return retained * (1. / keep_prob)
class GraphConvolutionSparse(Layer):
    """
    Graph convolution layer for sparse inputs.

    Compared with GraphConvolution it additionally takes
    ``features_nonzero``, which is passed to ``dropout_sparse`` as the
    number of non-zero input elements.
    """

    def __init__(self, input_dim, output_dim, adj, features_nonzero, dropout=0., act=tf.nn.relu, **kwargs):
        super(GraphConvolutionSparse, self).__init__(**kwargs)
        with tf.variable_scope('{}_vars'.format(self.name)):
            self.vars['weights'] = weight_variable_glorot(input_dim, output_dim, name="weights")
        self.features_nonzero = features_nonzero
        self.issparse = True
        self.adj = adj
        self.act = act
        self.dropout = dropout

    def _call(self, inputs):
        # dropout -> X . W -> A . (X . W) -> activation
        kept = dropout_sparse(inputs, 1 - self.dropout, self.features_nonzero)
        projected = tf.sparse_tensor_dense_matmul(kept, self.vars['weights'])
        propagated = tf.sparse_tensor_dense_matmul(self.adj, projected)
        return self.act(propagated)
def gaussian_noise_layer(input_layer, std):
    """Return ``input_layer`` plus zero-mean Gaussian noise of stddev ``std``."""
    return input_layer + tf.random_normal(
        shape=tf.shape(input_layer),
        mean=0.0,
        stddev=std,
        dtype=tf.float32,
    )
class GraphConvolution(Layer):
    """Basic graph convolution layer for undirected graph without edge labels."""

    def __init__(self, input_dim, output_dim, adj, dropout=0., act=tf.nn.relu, **kwargs):
        super(GraphConvolution, self).__init__(**kwargs)
        with tf.variable_scope('{}_vars'.format(self.name)):
            self.vars['weights'] = weight_variable_glorot(input_dim, output_dim, name="weights")
        self.adj = adj
        self.act = act
        self.dropout = dropout

    def _call(self, inputs):
        # dropout -> X . W -> A . (X . W) -> activation
        kept = tf.nn.dropout(inputs, 1 - self.dropout)
        projected = tf.matmul(kept, self.vars['weights'])
        propagated = tf.sparse_tensor_dense_matmul(self.adj, projected)
        return self.act(propagated)
class InnerProductDecoder(Layer):
    """Decoder model layer for link prediction."""

    def __init__(self, input_dim, dropout=0., act=tf.nn.sigmoid, **kwargs):
        # ``input_dim`` is accepted for interface compatibility but not used.
        super(InnerProductDecoder, self).__init__(**kwargs)
        self.act = act
        self.dropout = dropout

    def _call(self, inputs):
        """
        The decoder is simply the input multiplied by its own transpose,
        flattened to one dimension and passed through the activation.
        """
        kept = tf.nn.dropout(inputs, 1 - self.dropout)
        scores = tf.matmul(kept, tf.transpose(kept))
        flat_scores = tf.reshape(scores, [-1])
        return self.act(flat_scores)
class GCN(Model):
def __init__(self, placeholders, num_features, features_nonzero, **kwargs):
def __init__(self, placeholders, num_features, features_nonzero, settings, **kwargs):
super(GCN, self).__init__(**kwargs)
"""
inputs:输入
input_dim:feature的数量即input的维度
feature_nonzero非0的特征
adj:邻接矩阵
inputs: Input features
input_dim: dimensionality
feature_nonzeroNon-zero feature number
adj: adjacency matrix
dropoutdropout
"""
self.inputs = placeholders['features']
self.input_dim = num_features
self.features_nonzero = features_nonzero
self.adj = placeholders['adj']
self.dropout = placeholders['dropout']
self.settings = settings
def construct(self, inputs=None, hidden=None, reuse=False):
if inputs == None:
@ -63,7 +204,7 @@ class GCN(Model):
with tf.variable_scope('Encoder', reuse=reuse):
self.hidden1 = GraphConvolutionSparse(input_dim=self.input_dim,
output_dim=FLAGS.hidden1,
output_dim=self.settings.hidden1,
adj=self.adj,
features_nonzero=self.features_nonzero,
act=tf.nn.relu,
@ -74,63 +215,20 @@ class GCN(Model):
self.noise = gaussian_noise_layer(self.hidden1, 0.1)
if hidden == None:
hidden = self.hidden1
self.embeddings = GraphConvolution(input_dim=FLAGS.hidden1,
output_dim=FLAGS.hidden2,
self.embeddings = GraphConvolution(input_dim=self.settings.hidden1,
output_dim=self.settings.hidden2,
adj=self.adj,
act=lambda x: x,
dropout=self.dropout,
logging=self.logging,
name='e_dense_2')(hidden)
self.z_mean = self.embeddings
self.reconstructions = InnerProductDecoder(input_dim=FLAGS.hidden2,
self.reconstructions = InnerProductDecoder(input_dim=self.settings.hidden2,
act=lambda x: x,
logging=self.logging)(self.embeddings)
return self.z_mean, self.reconstructions
class Generator_z2g(Model):
    """Two-layer graph-convolutional generator built in the 'Decoder' scope.

    The first layer maps inputs of width ``FLAGS.hidden2`` to
    ``FLAGS.hidden1`` with ReLU; the second maps to ``num_features`` with an
    identity activation.

    Args:
        placeholders: Mapping providing the ``'real_distribution'`` (default
            input), ``'adj'`` and ``'dropout'`` placeholders.
        num_features: Output width of the second layer.
        features_nonzero: Stored on the instance; not used by ``construct``.
        **kwargs: Forwarded to ``Model.__init__``.
    """

    def __init__(self, placeholders, num_features, features_nonzero, **kwargs):
        super(Generator_z2g, self).__init__(**kwargs)
        self.inputs = placeholders['real_distribution']
        self.input_dim = num_features
        self.features_nonzero = features_nonzero
        self.adj = placeholders['adj']
        self.dropout = placeholders['dropout']

    def construct(self, inputs=None, reuse=False):
        """Build the generator graph.

        Args:
            inputs: Input tensor; defaults to the ``'real_distribution'``
                placeholder captured at construction time.
            reuse: Passed to ``tf.variable_scope('Decoder', reuse=...)``.

        Returns:
            ``(z_mean, hidden1)``: the second-layer output and the
            first-layer activations.
        """
        # Identity test: ``== None`` on a tensor-like object dispatches to
        # its ``__eq__`` instead of checking for the missing-argument case.
        if inputs is None:
            inputs = self.inputs
        with tf.variable_scope('Decoder', reuse=reuse):
            self.hidden1 = GraphConvolution(input_dim=FLAGS.hidden2,
                                            output_dim=FLAGS.hidden1,
                                            adj=self.adj,
                                            act=tf.nn.relu,
                                            dropout=self.dropout,
                                            logging=self.logging,
                                            name='GG_dense_1')(inputs)
            self.embeddings = GraphConvolution(input_dim=FLAGS.hidden1,
                                               output_dim=self.input_dim,
                                               adj=self.adj,
                                               act=lambda x: x,
                                               dropout=self.dropout,
                                               logging=self.logging,
                                               name='GG_dense_2')(self.hidden1)
            self.z_mean = self.embeddings
            return self.z_mean, self.hidden1
def dense(x, n1, n2, name):
"""
Used to create a dense layer.
@ -150,10 +248,26 @@ def dense(x, n1, n2, name):
return out
class Discriminator(Model):
    """Three-layer dense discriminator built in the 'Discriminator' scope.

    Layer widths come from ``settings``: ``hidden2 -> hidden3 -> hidden1 -> 1``.
    """

    def __init__(self, settings, **kwargs):
        super(Discriminator, self).__init__(**kwargs)
        self.settings = settings
        self.act = tf.nn.relu

    def construct(self, inputs, reuse=False):
        """Build the discriminator on ``inputs`` and return its raw output.

        When ``reuse`` is true the variables already created in the
        'Discriminator' scope are reused.
        """
        cfg = self.settings
        with tf.variable_scope('Discriminator'):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            tf.set_random_seed(1)
            first = tf.nn.relu(dense(inputs, cfg.hidden2, cfg.hidden3, name='dc_den1'))
            second = tf.nn.relu(dense(first, cfg.hidden3, cfg.hidden1, name='dc_den2'))
            return dense(second, cfg.hidden1, 1, name='dc_output')
class D_graph(Model):
def __init__(self, num_features, **kwargs):
super(D_graph, self).__init__(**kwargs)
self.act = tf.nn.relu
self.num_features = num_features
@ -170,25 +284,53 @@ class D_graph(Model):
return output
def gaussian_noise_layer(input_layer, std):
    """Add zero-mean Gaussian noise with standard deviation ``std`` to ``input_layer``."""
    perturbation = tf.random_normal(
        shape=tf.shape(input_layer),
        mean=0.0,
        stddev=std,
        dtype=tf.float32,
    )
    return input_layer + perturbation
class Generator_z2g(Model):
def __init__(self, placeholders, num_features, features_nonzero, settings, **kwargs):
super(Generator_z2g, self).__init__(**kwargs)
"""
inputs:输入
input_dim:feature的数量即input的维度
feature_nonzero非0的特征
adj:邻接矩阵
dropoutdropout
"""
self.inputs = placeholders['real_distribution']
self.input_dim = num_features
self.features_nonzero = features_nonzero
self.adj = placeholders['adj']
self.dropout = placeholders['dropout']
self.settings = settings
class Discriminator(Model):
def __init__(self, **kwargs):
super(Discriminator, self).__init__(**kwargs)
def construct(self, inputs=None, reuse=False):
if inputs == None:
inputs = self.inputs
with tf.variable_scope('Decoder', reuse=reuse):
self.hidden1 = GraphConvolution(input_dim=self.settings.hidden2,
output_dim=self.settings.hidden1,
adj=self.adj,
act=tf.nn.relu,
dropout=self.dropout,
logging=self.logging,
name='GG_dense_1')(inputs)
self.act = tf.nn.relu
self.embeddings = GraphConvolution(input_dim=self.settings.hidden1,
output_dim=self.input_dim,
adj=self.adj,
act=lambda x: x,
dropout=self.dropout,
logging=self.logging,
name='GG_dense_2')(self.hidden1)
def construct(self, inputs, reuse=False):
# with tf.name_scope('Discriminator'):
with tf.variable_scope('Discriminator'):
if reuse:
tf.get_variable_scope().reuse_variables()
# np.random.seed(1)
tf.set_random_seed(1)
dc_den1 = tf.nn.relu(dense(inputs, FLAGS.hidden2, FLAGS.hidden3, name='dc_den1'))
dc_den2 = tf.nn.relu(dense(dc_den1, FLAGS.hidden3, FLAGS.hidden1, name='dc_den2'))
output = dense(dc_den2, FLAGS.hidden1, 1, name='dc_output')
return output
self.z_mean = self.embeddings
return self.z_mean, self.hidden1
class BGAN(object):
    """Container that instantiates and wires the sub-models.

    Attributes set by ``__init__``:
        discriminator: ``Discriminator`` built from ``settings``.
        D_Graph: ``D_graph`` built from ``num_features``.
        d_real: ``discriminator`` output on the 'real_distribution' placeholder.
        GD_real: ``D_Graph`` output on the 'features_dense' placeholder.
        ae_model: ``GCN`` model.
        model_z2g: ``Generator_z2g`` model.
    """

    def __init__(self, placeholders, num_features, num_nodes, features_nonzero, settings):
        # ``num_nodes`` is accepted but not used here.
        real_dist_ph = placeholders['real_distribution']
        dense_feat_ph = placeholders['features_dense']

        self.discriminator = Discriminator(settings)
        self.D_Graph = D_graph(num_features)

        self.d_real = self.discriminator.construct(real_dist_ph)
        self.GD_real = self.D_Graph.construct(dense_feat_ph)

        self.ae_model = GCN(placeholders, num_features, features_nonzero, settings)
        self.model_z2g = Generator_z2g(placeholders, num_features, features_nonzero, settings)

@ -0,0 +1,105 @@
import argparse
import numpy as np
import tensorflow as tf
from dppy.finite_dpps import FiniteDPP
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity
from input import get_data
from src.evaluation import Evaluator
from src.model import BGAN
from src.optimizer import Optimizer, update
# 解析参数
def parse_args():
    """Parse the command-line options of the training script.

    Returns:
        The ``argparse.Namespace`` holding layer sizes, learning rates,
        epoch count, seed, feature switch, dropout, weight decay and
        dataset name.
    """
    # (flag, type, default, help) in the order they are registered.
    option_table = (
        ('--hidden1', int, 32, '隐藏层1神经元数量.'),
        ('--hidden2', int, 32, '隐藏层2神经元数量.'),
        ('--hidden3', int, 64, '隐藏层3神经元数量.'),
        ('--learning_rate', float, .6 * 0.001, '学习率'),
        ('--discriminator_learning_rate', float, 0.001, '判别器学习率'),
        ('--epoch', int, 20, '迭代次数'),
        ('--seed', int, 50, '用来打乱数据集'),
        ('--features', int, 1, '是(1)否(0)使用特征'),
        ('--dropout', float, 0., 'Dropout rate (1 - keep probability).'),
        ('--weight_decay', float, 0., 'Weight for L2 loss on embedding matrix.'),
        ('--dataset', str, 'e', '使用的数据集'),
    )
    parser = argparse.ArgumentParser(description='BGANDTI')
    for flag, value_type, default_value, help_text in option_table:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)
    return parser.parse_args()
if __name__ == "__main__":
    settings = parse_args()

    # Load the data.
    feas = get_data(settings.dataset)

    # DPP sampling and PCA dimensionality reduction: fit a kernel density
    # estimate on the PCA-reduced features of the DPP-sampled nodes.
    DPP = FiniteDPP('correlation', **{'K': feas['adj'].toarray()})
    pca = PCA(n_components=settings.hidden2)
    DPP.sample_exact_k_dpp(size=21)  # e 21 ic 6 gpcr 3
    index = DPP.list_of_samples[0]
    feature_sample = feas['features_dense']
    feature_sample = pca.fit_transform(feature_sample)
    kde = KernelDensity(bandwidth=0.7).fit(np.array([feature_sample[i] for i in index]))

    # Inputs of the computation graph.
    # NOTE(review): 'features_dense' is also created with
    # name='real_distribution'; left unchanged in case other code depends on
    # the resulting op names -- confirm and rename if not.
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'features_dense': tf.placeholder(tf.float32, shape=[feas['adj'].shape[0], feas['num_features']], name='real_distribution'),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'real_distribution': tf.placeholder(dtype=tf.float32, shape=[feas['adj'].shape[0], settings.hidden2], name='real_distribution')
    }

    # Build the model.
    model = BGAN(placeholders, feas['num_features'], feas['num_nodes'], feas['features_nonzero'], settings)

    # Define the optimizer.
    optimizer = Optimizer(model.ae_model, model.model_z2g, model.D_Graph, model.discriminator, placeholders, feas['pos_weight'], feas['norm'], model.d_real, feas['num_nodes'], model.GD_real,
                          settings)

    # Create the session with GPU memory growth enabled and initialise the weights.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Results collected at the periodic test evaluations.
    val_roc_score = []
    record = []
    record_emb = []

    try:
        sess.run(tf.global_variables_initializer())

        # Train model
        for epoch in range(settings.epoch):
            emb, avg_cost = update(model.ae_model, optimizer.opt, sess, feas['adj_norm'], feas['adj_label'], feas['features'], placeholders, feas['adj'], kde, feas['features_dense'], settings)

            lm_train = Evaluator(feas['val_edges'], feas['val_edges_false'])
            roc_curr, ap_curr, _, aupr_score = lm_train.get_roc_score(emb, feas)
            val_roc_score.append(roc_curr)
            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss={:.5f}, d_loss={:.5f}, g_loss={:.5f}, GD_loss={:.5f}, GG_loss={:.5f}".format(avg_cost[0], avg_cost[1], avg_cost[2], avg_cost[3], avg_cost[4]),
                  "val_roc={:.5f}".format(val_roc_score[-1]), "val_ap=", "{:.5f}".format(ap_curr), "val_aupr=", "{:.5f}".format(aupr_score))

            # Evaluate on the test edges every 10 epochs.
            if (epoch + 1) % 10 == 0:
                lm_test = Evaluator(feas['test_edges'], feas['test_edges_false'])
                roc_score, ap_score, _, aupr_score = lm_test.get_roc_score(emb, feas)
                print('Test ROC score: ' + str(roc_score), 'Test AUPR score: ' + str(aupr_score), 'Test AP score: ' + str(ap_score))
                record.append([roc_score, aupr_score, ap_score])
                record_emb.append(emb)
    finally:
        # Release the session's resources even when training fails.
        sess.close()

    if record:
        rec = np.array(record)
        # np.argmax returns the first index of the maximum.
        best_auc = int(np.argmax(rec[:, 0]))
        best_aupr = int(np.argmax(rec[:, 1]))
        emb = record_emb[best_auc]
        ana = record[best_auc]
        ana_pr = record[best_aupr]
        print('The peak [auc] test_roc={:.7f}, aupr={:.7f}, ap={:.7f}'.format(ana[0], ana[1], ana[2]))
        print('The peak [aupr] test_roc={:.7f}, aupr={:.7f}, ap={:.7f}'.format(ana_pr[0], ana_pr[1], ana_pr[2]))
    else:
        # Test evaluation only runs every 10th epoch, so short runs record nothing.
        print('No test evaluation was recorded (fewer than 10 epochs).')
Loading…
Cancel
Save