diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..bfa83a0 --- /dev/null +++ b/__init__.py @@ -0,0 +1,2 @@ +from __future__ import print_function +from __future__ import division diff --git a/constructor.py b/constructor.py new file mode 100644 index 0000000..e54aaaa --- /dev/null +++ b/constructor.py @@ -0,0 +1,191 @@ +import tensorflow as tf +import numpy as np +from model import GCN, Generator_z2g, Discriminator, D_graph +from optimizer import OptimizerAE, OptimizerCycle +import scipy.sparse as sp +from input_data import load_data +import inspect +from preprocessing import preprocess_graph, sparse_to_tuple, mask_test_edges, construct_feed_dict + +flags = tf.app.flags +FLAGS = flags.FLAGS + + +def get_placeholder(adj, num_features): + # 给tf.sparse_placeholder喂数据时: + # 1.应该直接填充 (indices, values, shape) + # 2.或者使用 tf.SparseTensorValue + + placeholders = { + 'features': tf.sparse_placeholder(tf.float32), + 'features_dense': tf.placeholder(tf.float32, shape=[adj.shape[0], num_features], + name='real_distribution'), + 'adj': tf.sparse_placeholder(tf.float32), + 'adj_orig': tf.sparse_placeholder(tf.float32), + 'dropout': tf.placeholder_with_default(0., shape=()), + 'real_distribution': tf.placeholder(dtype=tf.float32, shape=[adj.shape[0], FLAGS.hidden2], + name='real_distribution') + + } + + return placeholders + + +def get_model(model_str, placeholders, num_features, num_nodes, features_nonzero): + # 计算图构建 + discriminator = Discriminator() + D_Graph = D_graph(num_features) + d_real = discriminator.construct(placeholders['real_distribution']) + GD_real = D_Graph.construct(placeholders['features_dense']) + model = None + if model_str == 'arga_ae': + model = GCN(placeholders, num_features, features_nonzero) + + elif model_str == 'DBGAN': + model = GCN(placeholders, num_features, features_nonzero) + model_z2g = Generator_z2g(placeholders, num_features, features_nonzero) + + return d_real, discriminator, model, model_z2g, D_Graph, GD_real + + +def format_data(data_name): + # Load data + + adj, features, y_test, tx, ty, test_maks, true_labels = load_data(data_name) + + # Store original adjacency matrix (without diagonal entries) for later + adj_orig = adj + # 删除对角线元素 + adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) + adj_orig.eliminate_zeros() + + adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj) + adj = adj_train + adj_dense = adj.toarray() + + if FLAGS.features == 0: + features = sp.identity(features.shape[0]) # featureless + + # Some preprocessing + adj_norm = preprocess_graph(adj) + + num_nodes = adj.shape[0] + features_dense = features.tocoo().toarray() + + features = sparse_to_tuple(features.tocoo()) + # num_features是feature的维度 + num_features = features[2][1] + # features_nonzero就是非零feature的个数 + features_nonzero = features[1].shape[0] + + pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum() + norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2) + + adj_label = adj_train + sp.eye(adj_train.shape[0]) + adj_label = sparse_to_tuple(adj_label) + items = [ + adj, num_features, num_nodes, features_nonzero, + pos_weight, norm, adj_norm, adj_label, + features, true_labels, train_edges, val_edges, + val_edges_false, test_edges, test_edges_false, adj_orig, features_dense, adj_dense, features_dense + ] + + feas = {} + + print('num_features is:', num_features) + print('num_nodes is:', num_nodes) + print('features_nonzero 
is:', features_nonzero) + print('pos_weight is:', pos_weight) + print('norm is:', norm) + + for item in items: + #item_name = [ k for k,v in locals().iteritems() if v == item][0] + feas[retrieve_name(item)] = item + + return feas + + +def get_optimizer(model_str, model, model_z2g, D_Graph, discriminator, placeholders, pos_weight, norm, d_real, num_nodes, GD_real): + if model_str == 'arga_ae': + output = model.construct() + embeddings = output[0] + reconstructions = output[1] + d_fake = discriminator.construct(embeddings, reuse=True) + opt = OptimizerAE(preds=reconstructions, + labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'], + validate_indices=False), [-1]), + pos_weight=pos_weight, + norm=norm, + d_real=d_real, + d_fake=d_fake) + elif model_str == 'DBGAN': + + z2g = model_z2g.construct() + hidden = z2g[1] + z2g = z2g[0] + preds_z2g = model.construct(hidden=hidden, reuse=True)[0] + g2z = model.construct() + + embeddings = g2z[0] + reconstructions = g2z[1] + d_fake = discriminator.construct(embeddings, reuse=True) + GD_fake = D_Graph.construct(z2g, reuse=True) + + epsilon = tf.random_uniform(shape=[1], minval=0.0, maxval=1.0) + interpolated_input = epsilon * placeholders['real_distribution'] + (1 - epsilon) * embeddings + gradient = tf.gradients(discriminator.construct(interpolated_input, reuse=True), [interpolated_input])[0] + + epsilon = tf.random_uniform(shape=[1], minval=0.0, maxval=1.0) + interpolated_input = epsilon * placeholders['features_dense'] + (1 - epsilon) * z2g + gradient_z = tf.gradients(D_Graph.construct(interpolated_input, reuse=True), [interpolated_input])[0] + + opt = OptimizerCycle(preds=reconstructions, + labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'], + validate_indices=False), [-1]), + pos_weight=pos_weight, + norm=norm, + d_real=d_real, + d_fake=d_fake, + GD_real=GD_real, + GD_fake=GD_fake, + preds_z2g=preds_z2g, + labels_z2g=placeholders['real_distribution'], + preds_cycle=model_z2g.construct(embeddings, reuse=True)[0], + labels_cycle=placeholders['features_dense'], + gradient=gradient, + gradient_z=gradient_z) + return opt + + +def update(model, opt, sess, adj_norm, adj_label, features, placeholders, adj, distribution, adj_dense): + # Construct feed dictionary + feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders) + feed_dict.update({placeholders['dropout']: FLAGS.dropout}) + feed_dict.update({placeholders['features_dense']: adj_dense}) + feed_dict.update({placeholders['dropout']: 0}) + z_real_dist = np.random.randn(adj.shape[0], FLAGS.hidden2) + z_real_dist = distribution.sample(adj.shape[0]) + feed_dict.update({placeholders['real_distribution']: z_real_dist}) + + for j in range(5): + _, reconstruct_loss = sess.run([opt.opt_op, opt.cost], feed_dict=feed_dict) + g_loss, _ = sess.run([opt.generator_loss, opt.generator_optimizer], feed_dict=feed_dict) + d_loss, _ = sess.run([opt.dc_loss, opt.discriminator_optimizer], feed_dict=feed_dict) + + GD_loss, _ = sess.run([opt.GD_loss, opt.discriminator_optimizer_z2g], feed_dict=feed_dict) + GG_loss, _ = sess.run([opt.generator_loss_z2g, opt.generator_optimizer_z2g], feed_dict=feed_dict) + # GD_loss = sess.run(opt.GD_loss, feed_dict=feed_dict) + # GG_loss = sess.run(opt.generator_loss_z2g, feed_dict=feed_dict) + # g_loss = sess.run(opt.generator_loss, feed_dict=feed_dict) + # d_loss = sess.run(opt.dc_loss, feed_dict=feed_dict) + emb = sess.run(model.z_mean, feed_dict=feed_dict) + avg_cost = [reconstruct_loss, d_loss, g_loss, GD_loss, GG_loss] + + return emb, 
avg_cost
+
+
+def retrieve_name(var):
+    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
+    print([var_name for var_name, var_val in callers_local_vars if var_val is var])
+    return [var_name for var_name, var_val in callers_local_vars if var_val is var][0]
+
diff --git a/initializations.py b/initializations.py
new file mode 100644
index 0000000..cc2ec8f
--- /dev/null
+++ b/initializations.py
@@ -0,0 +1,11 @@
+import tensorflow as tf
+import numpy as np
+
+def weight_variable_glorot(input_dim, output_dim, name=""):
+    """Create a weight variable with Glorot & Bengio (AISTATS 2010)
+    initialization.
+    """
+    init_range = np.sqrt(6.0 / (input_dim + output_dim))
+    initial = tf.random_uniform([input_dim, output_dim], minval=-init_range,
+                                maxval=init_range, dtype=tf.float32)
+    return tf.Variable(initial, name=name)
diff --git a/input_data.py b/input_data.py
new file mode 100644
index 0000000..9782d31
--- /dev/null
+++ b/input_data.py
@@ -0,0 +1,122 @@
+import numpy as np
+import pickle as pkl
+import networkx as nx
+import scipy.sparse as sp
+import sys
+
+
+def parse_index_file(filename):
+    index = []
+    for line in open(filename):
+        index.append(int(line.strip()))
+    return index
+
+def sample_mask(idx, l):
+    """Create mask."""
+    mask = np.zeros(l)
+    mask[idx] = 1
+    return np.array(mask, dtype=np.bool)
+
+def load_data(dataset):
+    # load the data: x, tx, allx, graph
+    # x => feature vectors of the training instances (e.g. a scipy.sparse.csr.csr_matrix instance)
+    # tx => feature vectors of the test instances (e.g. a scipy.sparse.csr.csr_matrix instance)
+    # allx => feature vectors of the labeled + unlabeled training instances, a superset of ind.dataset_str.x
+    # y => one-hot labels of the training instances (numpy.ndarray)
+    # ty => one-hot labels of the test instances (numpy.ndarray)
+    # ally => one-hot labels of the labeled + unlabeled training instances (numpy.ndarray)
+    # graph => graph data, a collections.defaultdict in the format {index: [index_of_neighbor_nodes]}
+    # index => ids of the test instances
+    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
+    objects = []
+    for i in range(len(names)):
+        with open("data/ind.{}.{}".format(dataset, names[i]), 'rb') as f:
+            if sys.version_info > (3, 0):
+                objects.append(pkl.load(f, encoding='latin1'))
+            else:
+                objects.append(pkl.load(f))
+    x, y, tx, ty, allx, ally, graph = tuple(objects)
+    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset))
+    test_idx_range = np.sort(test_idx_reorder)
+
+    if dataset == 'citeseer':
+        # Fix citeseer dataset (there are some isolated nodes in the graph)
+        # Find isolated nodes, add them as zero-vecs into the right position
+        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
+        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
+        tx_extended[test_idx_range-min(test_idx_range), :] = tx
+        tx = tx_extended
+        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
+        ty_extended[test_idx_range - min(test_idx_range), :] = ty
+        ty = ty_extended
+
+    features = sp.vstack((allx, tx)).tolil()
+    features[test_idx_reorder, :] = features[test_idx_range, :]
+    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
+
+    labels = np.vstack((ally, ty))
+    labels[test_idx_reorder, :] = labels[test_idx_range, :]
+
+    idx_test = test_idx_range.tolist()
+    idx_train = range(len(y))
+    idx_val = range(len(y), len(y) + 500)
+
+    train_mask = sample_mask(idx_train, labels.shape[0])
+    val_mask = sample_mask(idx_val, labels.shape[0])
+    test_mask = sample_mask(idx_test, labels.shape[0])
+
+    y_train = np.zeros(labels.shape)
+    y_val = np.zeros(labels.shape)
+    y_test = np.zeros(labels.shape)
+    y_train[train_mask, :] = labels[train_mask, :]
+    y_val[val_mask, :] = labels[val_mask, :]
+    y_test[test_mask, :] = 
labels[test_mask, :] + + return adj, features, y_test, tx, ty, test_mask, np.argmax(labels,1) + + +def load_alldata(dataset_str): + """Load data.""" + names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] + objects = [] + for i in range(len(names)): + objects.append(pkl.load(open("data/ind.{}.{}".format(dataset_str, names[i])))) + + x, y, tx, ty, allx, ally, graph = tuple(objects) + test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str)) + test_idx_range = np.sort(test_idx_reorder) + + if dataset_str == 'citeseer': + # Fix citeseer dataset (there are some isolated nodes in the graph) + # Find isolated nodes, add them as zero-vecs into the right position + test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) + tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) + tx_extended[test_idx_range-min(test_idx_range), :] = tx + tx = tx_extended + ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) + ty_extended[test_idx_range-min(test_idx_range), :] = ty + ty = ty_extended + + features = sp.vstack((allx, tx)).tolil() + features[test_idx_reorder, :] = features[test_idx_range, :] + adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) + + labels = np.vstack((ally, ty)) + labels[test_idx_reorder, :] = labels[test_idx_range, :] + + idx_test = test_idx_range.tolist() + idx_train = range(len(y)) + idx_val = range(len(y), len(y)+500) + + train_mask = sample_mask(idx_train, labels.shape[0]) + val_mask = sample_mask(idx_val, labels.shape[0]) + test_mask = sample_mask(idx_test, labels.shape[0]) + + y_train = np.zeros(labels.shape) + y_val = np.zeros(labels.shape) + y_test = np.zeros(labels.shape) + y_train[train_mask, :] = labels[train_mask, :] + y_val[val_mask, :] = labels[val_mask, :] + y_test[test_mask, :] = labels[test_mask, :] + + return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, np.argmax(labels, 1) diff --git a/layers.py b/layers.py new file mode 100644 index 0000000..8610eed --- /dev/null +++ b/layers.py @@ -0,0 +1,158 @@ +from initializations import * +import tensorflow as tf + +flags = tf.app.flags +FLAGS = flags.FLAGS + +# global unique layer ID dictionary for layer name assignment +_LAYER_UIDS = {} + + +def get_layer_uid(layer_name=''): + """Helper function, assigns unique layer IDs + 分配唯一的层ID + """ + if layer_name not in _LAYER_UIDS: + _LAYER_UIDS[layer_name] = 1 + return 1 + else: + _LAYER_UIDS[layer_name] += 1 + return _LAYER_UIDS[layer_name] + + +def dropout_sparse(x, keep_prob, num_nonzero_elems): + """ + Dropout for sparse tensors. Currently fails for very large sparse tensors (>1M elements) + num_nonzero_elems: 稀疏矩阵中的非零元素个数 + keep_prob: + x: input + """ + noise_shape = [num_nonzero_elems] + random_tensor = keep_prob + random_tensor += tf.random_uniform(noise_shape) + dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool) + pre_out = tf.sparse_retain(x, dropout_mask) + return pre_out * (1./keep_prob) + + +class Layer(object): + """Base layer class. Defines basic API for all layer objects. + + # Properties + name: String, defines the variable scope of the layer. + + # Methods + _call(inputs): Defines computation graph of layer + (i.e. 
takes input, returns output)
+        __call__(inputs): Wrapper for _call()
+    """
+    def __init__(self, **kwargs):
+        allowed_kwargs = {'name', 'logging'}
+        for kwarg in kwargs.keys():
+            assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg
+        name = kwargs.get('name')
+        if not name:
+            layer = self.__class__.__name__.lower()
+            name = layer + '_' + str(get_layer_uid(layer))
+        self.name = name
+        self.vars = {}
+        logging = kwargs.get('logging', False)
+        self.logging = logging
+        self.issparse = False
+
+    def _call(self, inputs):
+        return inputs
+
+    def __call__(self, inputs):
+        with tf.name_scope(self.name):
+            outputs = self._call(inputs)
+            return outputs
+
+
+class GraphConvolution(Layer):
+    """Basic graph convolution layer for undirected graph without edge labels."""
+    def __init__(self, input_dim, output_dim, adj, dropout=0., act=tf.nn.relu, **kwargs):
+        super(GraphConvolution, self).__init__(**kwargs)
+        with tf.variable_scope(self.name + '_vars'):
+            self.vars['weights'] = weight_variable_glorot(input_dim, output_dim, name="weights")
+        self.dropout = dropout
+        self.adj = adj
+        self.act = act
+
+    def _call(self, inputs):
+        x = inputs
+        x = tf.nn.dropout(x, 1-self.dropout)
+        x = tf.matmul(x, self.vars['weights'])
+        x = tf.sparse_tensor_dense_matmul(self.adj, x)
+        outputs = self.act(x)
+        return outputs
+
+
+class GraphConvolutionSparse(Layer):
+    """
+    Graph convolution layer for sparse inputs.
+    Same as GraphConvolution, but takes an extra features_nonzero argument for sparse dropout.
+    """
+    def __init__(self, input_dim, output_dim, adj, features_nonzero, dropout=0., act=tf.nn.relu, **kwargs):
+        super(GraphConvolutionSparse, self).__init__(**kwargs)
+        with tf.variable_scope(self.name + '_vars'):
+            self.vars['weights'] = weight_variable_glorot(input_dim, output_dim, name="weights")
+        self.dropout = dropout
+        self.adj = adj
+        self.act = act
+        self.issparse = True
+        self.features_nonzero = features_nonzero
+
+    def _call(self, inputs):
+        x = inputs
+        x = dropout_sparse(x, 1-self.dropout, self.features_nonzero)
+        x = tf.sparse_tensor_dense_matmul(x, self.vars['weights'])
+        x = tf.sparse_tensor_dense_matmul(self.adj, x)
+        outputs = self.act(x)
+        return outputs
+
+
+class InnerProductDecoder(Layer):
+    """Decoder model layer for link prediction."""
+    def __init__(self, input_dim, dropout=0., act=tf.nn.sigmoid, **kwargs):
+        super(InnerProductDecoder, self).__init__(**kwargs)
+        self.dropout = dropout
+        self.act = act
+
+    def _call(self, inputs):
+        """
+        The decoder is simply the inner product of the inputs with their own transpose.
+        """
+        inputs = tf.nn.dropout(inputs, 1-self.dropout)
+        x = tf.transpose(inputs)
+        x = tf.matmul(inputs, x)
+        x = tf.reshape(x, [-1])
+        outputs = self.act(x)
+        return outputs
+
+
+class GraphConvolution_z2g(Layer):
+    """Basic graph convolution layer for undirected graph without edge labels."""
+    def __init__(self, input_dim, output_dim, adj, dropout=0., act=tf.nn.relu, **kwargs):
+        super(GraphConvolution_z2g, self).__init__(**kwargs)
+        with tf.variable_scope(self.name + '_vars'):
+            self.vars['weights'] = weight_variable_glorot(input_dim, output_dim, name="weights")
+        self.dropout = dropout
+        self.adj = adj
+        self.act = act
+
+    def _call(self, inputs):
+        x = inputs
+        x = tf.nn.dropout(x, 1-self.dropout)
+        x = tf.matmul(x, self.vars['weights'])
+        x = tf.sparse_tensor_dense_matmul(self.adj, x)
+        outputs = self.act(x)
+        return outputs
diff 
--git a/link_prediction.py b/link_prediction.py new file mode 100644 index 0000000..13fa156 --- /dev/null +++ b/link_prediction.py @@ -0,0 +1,110 @@ +from __future__ import division +from __future__ import print_function +import os + +# Train on CPU (hide GPU) due to memory constraints +os.environ['CUDA_VISIBLE_DEVICES'] = " 0,4,2,3" + +import tensorflow as tf +import settings +from constructor import get_placeholder, get_model, format_data, get_optimizer, update +from metrics import linkpred_metrics +from sklearn.neighbors import KernelDensity +from dppy.finite_dpps import FiniteDPP +from sklearn.decomposition import PCA +import numpy as np +import scipy.io as scio + +# Settings +flags = tf.app.flags +FLAGS = flags.FLAGS + + +class Link_pred_Runner(): + def __init__(self, settings): + self.data_name = settings['data_name'] + self.iteration = settings['iterations'] + self.model = settings['model'] + + def erun(self): + model_str = self.model + # formatted data + feas = format_data(self.data_name) + + # Define placeholders + # 定义placeholders,get_placeholder函数中只需要传入一个参数,即adj,函数中需要用到adj.shape + placeholders = get_placeholder(feas['adj'], feas['num_features']) + + # 定义由Dpp和密度估计出来的混合高斯 + DPP = FiniteDPP('correlation', **{'K': feas['adj'].toarray()}) + # DPP.sample_exact_k_dpp(size=4) + pca = PCA(n_components=FLAGS.hidden2) + + # index = DPP.list_of_samples[0] + + if self.data_name == 'cora': + DPP.sample_exact_k_dpp(size=21) + index = DPP.list_of_samples[0] + pass + elif self.data_name == 'citeseer': + + index = np.array([1782, 741, 3258, 3189, 3112, 2524, 2895, 1780, 1100, 2735, 1318, + 2944, 1825, 18, 987, 2564, 463, 6, 3173, 701, 1901, 2349, + 2786, 2412, 646, 2626, 2648, 1793, 432, 538, 1729, 1217, 1397, + 1932, 2850, 458, 2129, 702, 2934, 2030, 2882, 1393, 308, 1271, + 1106, 2688, 629, 1145, 3251, 1903, 1004, 1149, 1385, 285, 858, + 2977, 844, 335, 532, 404, 3174, 528]) + + elif self.data_name == 'pubmed': + index = np.array([842, 3338, 5712, 17511, 10801, 2714, 6970, 13296, 5466, + 2230]) + feature_sample = feas['features_dense'] + feature_sample = pca.fit_transform(feature_sample) + + featuresCompress = np.array([feature_sample[i] for i in index]) + # featuresCompress = np.array(feature_sample) + kde = KernelDensity(bandwidth=0.7).fit(featuresCompress) + + # construct model + d_real, discriminator, ae_model, model_z2g, D_Graph, GD_real = get_model(model_str, placeholders, feas['num_features'], feas['num_nodes'], feas['features_nonzero']) + + # Optimizer + opt = get_optimizer(model_str, ae_model, model_z2g, D_Graph, discriminator, placeholders, feas['pos_weight'], feas['norm'], d_real, feas['num_nodes'], GD_real) + + # Initialize session + + # config = tf.ConfigProto() + # config.gpu_options.allow_growth = True + # sess = tf.Session(config = config) + sess = tf.Session() + sess.run(tf.global_variables_initializer()) + + val_roc_score = [] + record = [] + record_emb = [] + # Train model + for epoch in range(self.iteration): + + emb, avg_cost = update(ae_model, opt, sess, feas['adj_norm'], feas['adj_label'], feas['features'], placeholders, feas['adj'], kde, feas['features_dense']) + + lm_train = linkpred_metrics(feas['val_edges'], feas['val_edges_false']) + roc_curr, ap_curr, _ = lm_train.get_roc_score(emb, feas) + val_roc_score.append(roc_curr) + print("Epoch:", '%04d' % (epoch + 1), + "train_loss= {:.5f}, d_loss= {:.5f}, g_loss= {:.5f}, GD_loss= {:.5f}, GG_loss= {:.5f}".format(avg_cost[0], avg_cost[1], avg_cost[2], avg_cost[3], avg_cost[4]), "val_roc=", + 
"{:.5f}".format(val_roc_score[-1]), "val_ap=", "{:.5f}".format(ap_curr)) + + if (epoch + 1) % 10 == 0: + lm_test = linkpred_metrics(feas['test_edges'], feas['test_edges_false']) + roc_score, ap_score, _ = lm_test.get_roc_score(emb, feas) + print('Test ROC score: ' + str(roc_score)) + print('Test AP score: ' + str(ap_score)) + record.append([roc_score, ap_score]) + record_emb.append(emb) + rec = np.array(record) + index = rec[:, 0].tolist().index(max(rec[:, 0].tolist())) + emb = record_emb[index] + ana = record[index] + scio.savemat('result/{}_link_64_64_new.mat'.format(self.data_name), {'embedded': emb, + 'labels': feas['true_labels']}) + print('The peak val_roc=%f, ap = %f' % (ana[0], ana[1])) diff --git a/metrics.py b/metrics.py new file mode 100644 index 0000000..dd4563d --- /dev/null +++ b/metrics.py @@ -0,0 +1,108 @@ +from sklearn.metrics import f1_score +from sklearn.metrics import roc_auc_score +from sklearn.metrics import average_precision_score +from sklearn import metrics +from munkres import Munkres, print_matrix +import numpy as np + +class linkpred_metrics(): + def __init__(self, edges_pos, edges_neg): + self.edges_pos = edges_pos + self.edges_neg = edges_neg + + def get_roc_score(self, emb, feas): + # if emb is None: + # feed_dict.update({placeholders['dropout']: 0}) + # emb = sess.run(model.z_mean, feed_dict=feed_dict) + + def sigmoid(x): + return 1 / (1 + np.exp(-x)) + + # Predict on test set of edges + adj_rec = np.dot(emb, emb.T) + preds = [] + pos = [] + for e in self.edges_pos: + preds.append(sigmoid(adj_rec[e[0], e[1]])) + pos.append(feas['adj_orig'][e[0], e[1]]) + + preds_neg = [] + neg = [] + for e in self.edges_neg: + preds_neg.append(sigmoid(adj_rec[e[0], e[1]])) + neg.append(feas['adj_orig'][e[0], e[1]]) + + preds_all = np.hstack([preds, preds_neg]) + labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds))]) + roc_score = roc_auc_score(labels_all, preds_all) + ap_score = average_precision_score(labels_all, preds_all) + + return roc_score, ap_score, emb + + +class clustering_metrics(): + def __init__(self, true_label, predict_label): + self.true_label = true_label + self.pred_label = predict_label + + + def clusteringAcc(self): + # best mapping between true_label and predict label + l1 = list(set(self.true_label)) + numclass1 = len(l1) + + l2 = list(set(self.pred_label)) + numclass2 = len(l2) + if numclass1 != numclass2: + print('Class Not equal, Error!!!!') + return 0 + + cost = np.zeros((numclass1, numclass2), dtype=int) + for i, c1 in enumerate(l1): + mps = [i1 for i1, e1 in enumerate(self.true_label) if e1 == c1] + for j, c2 in enumerate(l2): + mps_d = [i1 for i1 in mps if self.pred_label[i1] == c2] + + cost[i][j] = len(mps_d) + + # match two clustering results by Munkres algorithm + m = Munkres() + cost = cost.__neg__().tolist() + + indexes = m.compute(cost) + + # get the match results + new_predict = np.zeros(len(self.pred_label)) + for i, c in enumerate(l1): + # correponding label in l2: + c2 = l2[indexes[i][1]] + + # ai is the index with label==c2 in the pred_label list + ai = [ind for ind, elm in enumerate(self.pred_label) if elm == c2] + new_predict[ai] = c + + acc = metrics.accuracy_score(self.true_label, new_predict) + f1_macro = metrics.f1_score(self.true_label, new_predict, average='macro') + precision_macro = metrics.precision_score(self.true_label, new_predict, average='macro') + recall_macro = metrics.recall_score(self.true_label, new_predict, average='macro') + f1_micro = metrics.f1_score(self.true_label, new_predict, average='micro') + 
precision_micro = metrics.precision_score(self.true_label, new_predict, average='micro') + recall_micro = metrics.recall_score(self.true_label, new_predict, average='micro') + return acc, f1_macro, precision_macro, recall_macro, f1_micro, precision_micro, recall_micro + + def evaluationClusterModelFromLabel(self): + nmi = metrics.normalized_mutual_info_score(self.true_label, self.pred_label) + adjscore = metrics.adjusted_rand_score(self.true_label, self.pred_label) + acc, f1_macro, precision_macro, recall_macro, f1_micro, precision_micro, recall_micro = self.clusteringAcc() + + print('ACC=%f, f1_macro=%f, precision_macro=%f, recall_macro=%f, f1_micro=%f, precision_micro=%f, recall_micro=%f, NMI=%f, ADJ_RAND_SCORE=%f' % (acc, f1_macro, precision_macro, recall_macro, f1_micro, precision_micro, recall_micro, nmi, adjscore)) + + fh = open('recoder.txt', 'a') + + fh.write('ACC=%f, f1_macro=%f, precision_macro=%f, recall_macro=%f, f1_micro=%f, precision_micro=%f, recall_micro=%f, NMI=%f, ADJ_RAND_SCORE=%f' % (acc, f1_macro, precision_macro, recall_macro, f1_micro, precision_micro, recall_micro, nmi, adjscore) ) + fh.write('\r\n') + fh.flush() + fh.close() + + return acc, nmi, adjscore + diff --git a/model.py b/model.py new file mode 100644 index 0000000..05f0b73 --- /dev/null +++ b/model.py @@ -0,0 +1,200 @@ +from layers import GraphConvolution, GraphConvolutionSparse, InnerProductDecoder +import tensorflow as tf + +flags = tf.app.flags +FLAGS = flags.FLAGS + +class Model(object): + def __init__(self, **kwargs): + allowed_kwargs = {'name', 'logging'} + for kwarg in kwargs.keys(): + assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg + + for kwarg in kwargs.keys(): + assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg + name = kwargs.get('name') + if not name: + name = self.__class__.__name__.lower() + self.name = name + + logging = kwargs.get('logging', False) + self.logging = logging + + self.vars = {} + + def _build(self): + raise NotImplementedError + + def build(self): + """ Wrapper for _build() """ + with tf.variable_scope(self.name): + self._build() + variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) + self.vars = {var.name: var for var in variables} + + def fit(self): + pass + + def predict(self): + pass + + +class GCN(Model): + def __init__(self, placeholders, num_features, features_nonzero, **kwargs): + super(GCN, self).__init__(**kwargs) + """ + inputs:输入 + input_dim:feature的数量,即input的维度? 
+ feature_nonzero:非0的特征 + adj:邻接矩阵 + dropout:dropout + """ + + self.inputs = placeholders['features'] + self.input_dim = num_features + self.features_nonzero = features_nonzero + self.adj = placeholders['adj'] + self.dropout = placeholders['dropout'] + + def construct(self, inputs = None, hidden = None, reuse = False): + if inputs == None : + inputs = self.inputs + + with tf.variable_scope('Encoder', reuse=reuse): + self.hidden1 = GraphConvolutionSparse(input_dim=self.input_dim, + output_dim=FLAGS.hidden1, + adj=self.adj, + features_nonzero = self.features_nonzero, + act=tf.nn.relu, + dropout=self.dropout, + logging=self.logging, + name='e_dense_1')(inputs) + + + self.noise = gaussian_noise_layer(self.hidden1, 0.1) + if hidden == None: + hidden = self.hidden1 + self.embeddings = GraphConvolution(input_dim=FLAGS.hidden1, + output_dim=FLAGS.hidden2, + adj=self.adj, + act=lambda x: x, + dropout=self.dropout, + logging=self.logging, + name='e_dense_2')(hidden) + + + self.z_mean = self.embeddings + + self.reconstructions = InnerProductDecoder(input_dim=FLAGS.hidden2, + act=lambda x: x, + logging=self.logging)(self.embeddings) + return self.z_mean, self.reconstructions + + + + +class Generator_z2g(Model): + def __init__(self, placeholders, num_features, features_nonzero, **kwargs): + super(Generator_z2g, self).__init__(**kwargs) + """ + inputs:输入 + input_dim:feature的数量,即input的维度? + feature_nonzero:非0的特征 + adj:邻接矩阵 + dropout:dropout + """ + + self.inputs = placeholders['real_distribution'] + self.input_dim = num_features + self.features_nonzero = features_nonzero + self.adj = placeholders['adj'] + self.dropout = placeholders['dropout'] + + + def construct(self, inputs = None, reuse = False): + if inputs == None: + inputs = self.inputs + with tf.variable_scope('Decoder', reuse=reuse): + + self.hidden1 = GraphConvolution(input_dim=FLAGS.hidden2, + output_dim=FLAGS.hidden1, + adj=self.adj, + act=tf.nn.relu, + dropout=self.dropout, + logging=self.logging, + name='GG_dense_1')(inputs) + + + + self.embeddings = GraphConvolution(input_dim=FLAGS.hidden1, + output_dim=self.input_dim, + adj=self.adj, + act=lambda x: x, + dropout=self.dropout, + logging=self.logging, + name='GG_dense_2')(self.hidden1) + + + self.z_mean = self.embeddings + return self.z_mean,self.hidden1 + + +def dense(x, n1, n2, name): + """ + Used to create a dense layer. + :param x: input tensor to the dense layer + :param n1: no. of input neurons + :param n2: no. of output neurons + :param name: name of the entire dense layer.i.e, variable scope name. 
+ :return: tensor with shape [batch_size, n2] + """ + with tf.variable_scope(name, reuse=None): + # np.random.seed(1) + tf.set_random_seed(1) + weights = tf.get_variable("weights", shape=[n1, n2], + initializer=tf.random_normal_initializer(mean=0., stddev=0.01)) + bias = tf.get_variable("bias", shape=[n2], initializer=tf.constant_initializer(0.0)) + out = tf.add(tf.matmul(x, weights), bias, name='matmul') + return out + + +class D_graph(Model): + def __init__(self, num_features, **kwargs): + super(D_graph, self).__init__(**kwargs) + + self.act = tf.nn.relu + self.num_features = num_features + + def construct(self, inputs, reuse = False): + # input是一张Graph的adj,把每一列当成一个通道,所以input的通道数是num_nodes + with tf.variable_scope('D_Graph'): + if reuse: + tf.get_variable_scope().reuse_variables() + # np.random.seed(1) + #tf.set_random_seed(1) + dc_den1 = tf.nn.relu(dense(inputs, self.num_features, 512, name='GD_den1'))#(bs,num_nodes,512) + dc_den2 = tf.nn.relu(dense(dc_den1, 512, 128, name='GD_den2'))#(bs, num_nodes, 128) + output = dense(dc_den2, 128, 1, name='GD_output')#(bs,num_nodes,1) + return output + +def gaussian_noise_layer(input_layer, std): + noise = tf.random_normal(shape=tf.shape(input_layer), mean=0.0, stddev=std, dtype=tf.float32) + return input_layer + noise + +class Discriminator(Model): + def __init__(self, **kwargs): + super(Discriminator, self).__init__(**kwargs) + + self.act = tf.nn.relu + + def construct(self, inputs, reuse = False): + # with tf.name_scope('Discriminator'): + with tf.variable_scope('Discriminator'): + if reuse: + tf.get_variable_scope().reuse_variables() + # np.random.seed(1) + tf.set_random_seed(1) + dc_den1 = tf.nn.relu(dense(inputs, FLAGS.hidden2, FLAGS.hidden3, name='dc_den1')) + dc_den2 = tf.nn.relu(dense(dc_den1, FLAGS.hidden3, FLAGS.hidden1, name='dc_den2')) + output = dense(dc_den2, FLAGS.hidden1, 1, name='dc_output') + return output \ No newline at end of file diff --git a/optimizer.py b/optimizer.py new file mode 100644 index 0000000..08a5757 --- /dev/null +++ b/optimizer.py @@ -0,0 +1,138 @@ +import tensorflow as tf + +flags = tf.app.flags +FLAGS = flags.FLAGS + + +class OptimizerAE(object): + def __init__(self, preds, labels, pos_weight, norm, d_real, d_fake): + preds_sub = preds + labels_sub = labels + + self.real = d_real + + # Discrimminator Loss + #self.dc_loss_real = tf.reduce_mean( + # tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(self.real), logits=self.real,name='dclreal')) + self.dc_loss_real = - tf.reduce_mean(self.real) + + #self.dc_loss_fake = tf.reduce_mean( + # tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(d_fake), logits=d_fake,name='dcfake')) + self.dc_loss_fake = tf.reduce_mean(d_fake) + GP_loss = tf.reduce_mean(tf.square(tf.sqrt(tf.reduce_mean(tf.square(gradient), axis = [0, 1])) - 1)) + self.dc_loss = self.dc_loss_fake + self.dc_loss_real + GP_loss + #self.dc_loss = self.dc_loss_fake + self.dc_loss_real + + # Generator loss + #generator_loss = tf.reduce_mean( + # tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_fake), logits=d_fake, name='gl')) + generator_loss = -self.dc_loss_fake + + + # pos_weight,允许人们通过向上或向下加权相对于负误差的正误差的成本来权衡召回率和精确度 + self.cost = norm * tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(logits=preds_sub, targets=labels_sub, pos_weight=pos_weight)) + self.generator_loss = generator_loss + self.cost + + + all_variables = tf.trainable_variables() + dc_var = [var for var in all_variables if 'dc_' in var.name] + en_var = [var for var in all_variables if 'e_' in var.name] + 
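+        # Split the trainable variables by name prefix so that each optimizer below only
+        # updates its own sub-network: 'dc_' matches the discriminator MLP defined in
+        # model.py (dc_den1/dc_den2/dc_output) and 'e_' matches the GCN encoder layers
+        # (e_dense_1/e_dense_2).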
+ + with tf.variable_scope(tf.get_variable_scope()): + self.discriminator_optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.discriminator_learning_rate, + beta1=0.9, name='adam1').minimize(self.dc_loss, var_list=dc_var) #minimize(dc_loss_real, var_list=dc_var) + + self.generator_optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.discriminator_learning_rate, + beta1=0.9, name='adam2').minimize(self.generator_loss, var_list=en_var) + + + # 值得注意的是,这个地方,除了对抗优化之外, + # 还单纯用cost损失又优化了一遍, + # 待会儿看训练的时候注意看是在哪部分进行的这部分优化操作 + self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) # Adam Optimizer + self.opt_op = self.optimizer.minimize(self.cost) + self.grads_vars = self.optimizer.compute_gradients(self.cost) + + +class OptimizerCycle(object): + def __init__(self, preds, labels, pos_weight, norm, d_real, d_fake, GD_real, GD_fake, preds_z2g, labels_z2g, preds_cycle,labels_cycle,gradient, gradient_z): + + preds_sub = preds + labels_sub = labels + + self.real = d_real + + # Discrimminator Loss + self.dc_loss_real = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(self.real), logits=self.real,name='dclreal')) + #self.dc_loss_real = - tf.reduce_mean(self.real) + self.dc_loss_fake = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(d_fake), logits=d_fake,name='dcfake')) + #self.dc_loss_fake = tf.reduce_mean(d_fake) + #GP_loss = tf.reduce_mean(tf.square(tf.sqrt(tf.reduce_mean(tf.square(gradient), axis = [0, 1])) - 1)) + #GP_loss_z = tf.reduce_mean(tf.square(tf.sqrt(tf.reduce_mean(tf.square(gradient_z), axis = [0, 1])) - 1)) + #self.dc_loss = self.dc_loss_fake + self.dc_loss_real + 10.0 * GP_loss + + self.GD_loss_real = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(GD_real), logits=GD_real,name='GD_real')) + #self.GD_loss_real = - tf.reduce_mean(GD_real) + self.GD_loss_fake = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(GD_fake), logits=GD_fake,name='GD_fake')) + #self.GD_loss_fake = tf.reduce_mean(GD_fake) + + self.dc_loss = self.dc_loss_fake + self.dc_loss_real + self.GD_loss = self.GD_loss_fake + self.GD_loss_real + + # Generator loss + generator_loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_fake), logits=d_fake, name='gl')) + #generator_loss = -self.dc_loss_fake + generator_loss_z2g = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(GD_fake), logits=GD_fake, name='G_z2g')) + #generator_loss_z2g = -self.GD_loss_fake + # pos_weight,允许人们通过向上或向下加权相对于负误差的正误差的成本来权衡召回率和精确度 + self.cost = norm * tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(logits=preds_sub, targets=labels_sub, pos_weight=pos_weight)) + + + cost_cycle = norm * tf.reduce_mean(tf.square(preds_cycle - labels_cycle)) + + cost_z2g = norm * tf.reduce_mean(tf.square(preds_z2g-labels_z2g)) + #with tf.device("/gpu:1"): + #self.cost = 0.00001*self.cost + cost_cycle #for citseer cluster + self.cost = self.cost + cost_cycle + self.generator_loss = generator_loss + self.cost + self.generator_loss_z2g = generator_loss_z2g + + + all_variables = tf.trainable_variables() + dc_var = [var for var in all_variables if 'dc_' in var.name] + en_var = [var for var in all_variables if 'e_' in var.name] + GG_var = [var for var in all_variables if 'GG' in var.name] + GD_var = [var for var in all_variables if 'GD' in var.name] + + + with tf.variable_scope(tf.get_variable_scope()): + self.discriminator_optimizer = 
tf.train.AdamOptimizer(learning_rate=FLAGS.discriminator_learning_rate, + beta1=0.9, name='adam1').minimize(self.dc_loss, var_list=dc_var) #minimize(dc_loss_real, var_list=dc_var) + + self.generator_optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.discriminator_learning_rate, + beta1=0.9, name='adam2').minimize(self.generator_loss, var_list=en_var) + + self.discriminator_optimizer_z2g = tf.train.AdamOptimizer(learning_rate=FLAGS.discriminator_learning_rate, + beta1=0.9, name='adam1').minimize(self.GD_loss, var_list=GD_var) + + self.generator_optimizer_z2g = tf.train.AdamOptimizer(learning_rate=FLAGS.discriminator_learning_rate, + beta1=0.9, name='adam2').minimize(self.generator_loss_z2g, var_list=GG_var) + + + # 值得注意的是,这个地方,除了对抗优化之外, + # 还单纯用cost损失又优化了一遍, + # 待会儿看训练的时候注意看是在哪部分进行的这部分优化操作 + self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) # Adam Optimizer + self.opt_op = self.optimizer.minimize(self.cost) + #self.grads_vars = self.optimizer.compute_gradients(self.cost) + + #self.optimizer_z2g = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) # Adam Optimizer + #self.opt_op_z2g = self.optimizer.minimize(cost_z2g) + #self.grads_vars_z2g = self.optimizer.compute_gradients(cost_z2g) diff --git a/preprocessing.py b/preprocessing.py new file mode 100644 index 0000000..fc8fdf5 --- /dev/null +++ b/preprocessing.py @@ -0,0 +1,138 @@ +import numpy as np +import scipy.sparse as sp + + +def sparse_to_tuple(sparse_mx): + #判断是否是coo_matrix,不是的话就转成coo_matrix + if not sp.isspmatrix_coo(sparse_mx): + sparse_mx = sparse_mx.tocoo() + coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose() + values = sparse_mx.data + shape = sparse_mx.shape + return coords, values, shape + + +def preprocess_graph(adj): + # A.sum(axis=1):计算矩阵的每一行元素之和,得到节点的度矩阵D + # np.power(x, n):数组元素求n次方,得到D^(-1/2) + # sp.diags()函数根据给定的对象创建对角矩阵,对角线上的元素为给定对象中的元素 + adj = sp.coo_matrix(adj) + adj_ = adj + sp.eye(adj.shape[0])#A* = A+I,即对邻接矩阵加入自连接 + + rowsum = np.array(adj_.sum(1))#对行求和,即得到节点的度 + degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())#得到D的-1/2次方矩阵d + adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()#这一步的实质是做归一化,即A* × d转置 × d + return sparse_to_tuple(adj_normalized) + + +def construct_feed_dict(adj_normalized, adj, features, placeholders): + # construct feed dictionary + # .update()用法就是将()内的字段增加到dict当中 + feed_dict = dict()#创建一个空字典 + feed_dict.update({placeholders['features']: features}) + feed_dict.update({placeholders['adj']: adj_normalized}) + feed_dict.update({placeholders['adj_orig']: adj}) + return feed_dict + + +def mask_test_edges(adj): + # Function to build test set with 10% positive links + # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper. + # TODO: Clean up. 
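+    # Overall procedure: remove self-loops, keep only the upper-triangular edges,
+    # hold out 10% of them as test edges and 5% as validation edges, sample an equal
+    # number of non-existing edges for each split, and rebuild a symmetric training
+    # adjacency matrix from the remaining edges.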
+    # sp.dia_matrix(data, offsets) places each column of data on the diagonal given by the
+    # corresponding entry of offsets (the offsets are diagonal offsets);
+    # see e.g. https://blog.csdn.net/ChenglinBen/article/details/84424379 for details.
+    # .diagonal() extracts the diagonal elements.
+    # Remove diagonal elements
+    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
+    # Drop the explicit zero entries
+    adj.eliminate_zeros()
+    # Check that diag is zero:
+    # np.diag(matrix) extracts the diagonal of matrix; todense() is like toarray(), except that
+    # one converts the sparse matrix to a dense matrix and the other converts it to an array.
+    # The assert checks that every diagonal entry has indeed been removed.
+    assert np.diag(adj.todense()).sum() == 0
+
+    # sp.triu(matrix) returns the upper-triangular part of matrix; correspondingly, tril() returns the lower-triangular part.
+    adj_triu = sp.triu(adj)
+    adj_tuple = sparse_to_tuple(adj_triu)
+    # edges behaves like combinations: because it comes from the upper-triangular matrix, each
+    # edge appears only once, e.g. (4, 6) is kept while (6, 4) is not.
+    # edges_all behaves like permutations and contains both directions.
+    edges = adj_tuple[0]
+    edges_all = sparse_to_tuple(adj)[0]
+    # Use 10% of the edges as the test set
+    # and 5% of the edges as the validation set
+    num_test = int(np.floor(edges.shape[0] / 10.))
+    num_val = int(np.floor(edges.shape[0] / 20.))
+
+    # Randomly pick the test and validation edges
+    all_edge_idx = list(range(edges.shape[0]))
+    np.random.shuffle(all_edge_idx)
+    val_edge_idx = all_edge_idx[:num_val]
+    test_edge_idx = all_edge_idx[num_val:(num_val + num_test)]
+    test_edges = edges[test_edge_idx]
+    val_edges = edges[val_edge_idx]
+    train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0)
+
+    # See the gae repository on GitHub for this helper; it has been updated there, and the
+    # commented-out return below is arguably wrong (or at least incompatible with Python 3).
+    # Its return should probably just be np.any(rows_close), as used here.
+    def ismember(a, b, tol=5):
+        # Returns whether element a is a member of the set b
+        rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1)
+        return np.any(rows_close)
+        # return (np.all(np.any(rows_close, axis=-1), axis=-1) and
+        #         np.all(np.any(rows_close, axis=0), axis=0))
+
+    # test_edges_false samples edges that do not exist in the graph
+    test_edges_false = []
+    while len(test_edges_false) < len(test_edges):
+        idx_i = np.random.randint(0, adj.shape[0])
+        idx_j = np.random.randint(0, adj.shape[0])
+        if idx_i == idx_j:
+            continue
+        if ismember([idx_i, idx_j], edges_all):
+            continue
+        if test_edges_false:
+            if ismember([idx_j, idx_i], np.array(test_edges_false)):
+                continue
+            if ismember([idx_i, idx_j], np.array(test_edges_false)):
+                continue
+        test_edges_false.append([idx_i, idx_j])
+
+    # val_edges_false samples edges that appear in neither the training nor the validation edges
+    val_edges_false = []
+    while len(val_edges_false) < len(val_edges):
+        idx_i = np.random.randint(0, adj.shape[0])
+        idx_j = np.random.randint(0, adj.shape[0])
+        if idx_i == idx_j:
+            continue
+        if ismember([idx_i, idx_j], train_edges):
+            continue
+        if ismember([idx_j, idx_i], train_edges):
+            continue
+        if ismember([idx_i, idx_j], val_edges):
+            continue
+        if ismember([idx_j, idx_i], val_edges):
+            continue
+        if val_edges_false:
+            if ismember([idx_j, idx_i], np.array(val_edges_false)):
+                continue
+            if ismember([idx_i, idx_j], np.array(val_edges_false)):
+                continue
+        val_edges_false.append([idx_i, idx_j])
+
+    assert ~ismember(test_edges_false, edges_all)
+    assert ~ismember(val_edges_false, edges_all)
+    assert ~ismember(val_edges, train_edges)
+    assert ~ismember(test_edges, train_edges)
+    assert ~ismember(val_edges, test_edges)
+
+    data = np.ones(train_edges.shape[0])
+
+    # Re-build adj matrix
+    # As the English comment says, rebuild adj_train from the processed train_edges
+    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
+    adj_train = adj_train + adj_train.T
+
+    # NOTE: these edge lists only contain single direction of edge!
+ return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false + + diff --git a/run.py b/run.py new file mode 100644 index 0000000..229d4f6 --- /dev/null +++ b/run.py @@ -0,0 +1,15 @@ +import settings + +from link_prediction import Link_pred_Runner + +dataname = 'cora' # 'cora' or 'citeseer' or 'pubmed' +model = 'DBGAN' # 'arga_ae' or 'DBGAN' +task = 'link_prediction' + +settings = settings.get_settings(dataname, model, task) + +if task == 'link_prediction': + runner = Link_pred_Runner(settings) + +runner.erun() + diff --git a/settings.py b/settings.py new file mode 100644 index 0000000..482724e --- /dev/null +++ b/settings.py @@ -0,0 +1,49 @@ +import tensorflow as tf +import numpy as np +flags = tf.app.flags +FLAGS = flags.FLAGS + + +flags.DEFINE_integer('hidden3', 64, 'Number of units in hidden layer 3.') +flags.DEFINE_integer('discriminator_out', 0, 'discriminator_out.') +flags.DEFINE_float('discriminator_learning_rate', 0.001, 'Initial learning rate.') +flags.DEFINE_float('learning_rate', .6*0.001, 'Initial learning rate.') +flags.DEFINE_integer('hidden1', 32, 'Number of units in hidden layer 1.')#64 for Citeseer and Pubmed +flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')#64 for Citeseer and Pubmed +flags.DEFINE_float('weight_decay', 0., 'Weight for L2 loss on embedding matrix.') +flags.DEFINE_float('dropout', 0., 'Dropout rate (1 - keep probability).') +flags.DEFINE_integer('features', 1, 'Whether to use features (1) or not (0).') +flags.DEFINE_integer('seed', 50, 'seed for fixing the results.') +flags.DEFINE_integer('iterations', 60, 'number of iterations.') + +''' +infor: number of clusters +''' +infor = {'cora': 7, 'citeseer': 6, 'pubmed':3} + + +''' +We did not set any seed when we conducted the experiments described in the paper; +We set a seed here to steadily reveal better performance of ARGA +''' +seed = 7 +np.random.seed(seed) +tf.set_random_seed(seed) + +def get_settings(dataname, model, task): + if dataname != 'citeseer' and dataname != 'cora' and dataname != 'pubmed': + print('error: wrong data set name') + if task != 'clustering' and task != 'link_prediction': + print('error: wrong task name') + + if task == 'clustering': + iterations = FLAGS.iterations + clustering_num = infor[dataname] + re = {'data_name': dataname, 'iterations' : iterations, 'clustering_num' :clustering_num, 'model' : model} + elif task == 'link_prediction': + iterations = 4 * FLAGS.iterations + print('epoch is', iterations) + re = {'data_name': dataname, 'iterations' : iterations,'model' : model} + + return re +
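For reference, the normalization computed by preprocess_graph in preprocessing.py is the symmetric renormalization D^{-1/2} (A + I) D^{-1/2} used by GCN-style models. A minimal NumPy/SciPy sketch of the same computation on a toy 3-node path graph (the toy adjacency matrix is an assumed example, not one of the repo's datasets):

    import numpy as np
    import scipy.sparse as sp

    # Toy 3-node path graph (assumed example, not part of the patch)
    A = sp.coo_matrix(np.array([[0., 1., 0.],
                                [1., 0., 1.],
                                [0., 1., 0.]]))

    A_hat = A + sp.eye(A.shape[0])                      # A* = A + I (add self-loops)
    degrees = np.array(A_hat.sum(1)).flatten()          # row sums = node degrees of A*
    D_inv_sqrt = sp.diags(np.power(degrees, -0.5))      # D^{-1/2}
    A_norm = A_hat.dot(D_inv_sqrt).transpose().dot(D_inv_sqrt)   # D^{-1/2} A* D^{-1/2}

    print(A_norm.toarray())   # same matrix that preprocess_graph(A) passes to sparse_to_tuple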