From 98798b412da5ecf953e2c95cd199afcf44edbee8 Mon Sep 17 00:00:00 2001 From: lab-pc Date: Fri, 18 Nov 2022 20:20:22 +0800 Subject: [PATCH] first version --- .idea/.gitignore | 8 + .idea/BGANDTI-main.iml | 12 ++ .idea/codeStyles/codeStyleConfig.xml | 5 + .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/other.xml | 6 + .idea/vcs.xml | 6 + evaluation.py | 42 ++++ input.py | 197 ++++++++++++++++++ src/model.py => model.py | 0 optimizer.py | 158 ++++++++++++++ p1_preprocessing_data/__init__.py | 0 p1_preprocessing_data/load_data.py | 94 --------- p1_preprocessing_data/process_data.py | 88 -------- p1_preprocessing_data/utils.py | 130 ------------ p2_preprocessing_feature/__init__.py | 2 - p2_preprocessing_feature/load_feature.py | 18 -- p2_preprocessing_feature/process_feature.py | 20 -- src/train.py => train.py | 6 +- 20 files changed, 455 insertions(+), 355 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/BGANDTI-main.iml create mode 100644 .idea/codeStyles/codeStyleConfig.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/other.xml create mode 100644 .idea/vcs.xml create mode 100644 evaluation.py create mode 100644 input.py rename src/model.py => model.py (100%) create mode 100644 optimizer.py delete mode 100644 p1_preprocessing_data/__init__.py delete mode 100644 p1_preprocessing_data/load_data.py delete mode 100644 p1_preprocessing_data/process_data.py delete mode 100644 p1_preprocessing_data/utils.py delete mode 100644 p2_preprocessing_feature/__init__.py delete mode 100644 p2_preprocessing_feature/load_feature.py delete mode 100644 p2_preprocessing_feature/process_feature.py rename src/train.py => train.py (98%) diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/BGANDTI-main.iml b/.idea/BGANDTI-main.iml new file mode 100644 index 0000000..9d03c45 --- /dev/null +++ b/.idea/BGANDTI-main.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 0000000..a55e7a1 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ -0,0 +1,5 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..7420b09 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..5ba2128 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/other.xml b/.idea/other.xml new file mode 100644 index 0000000..a708ec7 --- /dev/null +++ b/.idea/other.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ 
b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/evaluation.py b/evaluation.py new file mode 100644 index 0000000..1d8f48c --- /dev/null +++ b/evaluation.py @@ -0,0 +1,42 @@ +import numpy as np +from sklearn import metrics +from sklearn.metrics import average_precision_score +from sklearn.metrics import roc_auc_score, auc + + +class Evaluator(object): + def __init__(self, edges_pos, edges_neg): + self.edges_pos = edges_pos + self.edges_neg = edges_neg + + def get_roc_score(self, emb, feas): + # if emb is None: + # feed_dict.update({placeholders['dropout']: 0}) + # emb = sess.run(model.z_mean, feed_dict=feed_dict) + + def sigmoid(x): + return 1 / (1 + np.exp(-x)) + + # Predict on test set of edges + adj_rec = np.dot(emb, emb.T) + preds = [] + pos = [] + for e in self.edges_pos: + preds.append(sigmoid(adj_rec[e[0], e[1]])) + pos.append(feas['adj_orig'][e[0], e[1]]) + + preds_neg = [] + neg = [] + for e in self.edges_neg: + preds_neg.append(sigmoid(adj_rec[e[0], e[1]])) + neg.append(feas['adj_orig'][e[0], e[1]]) + + preds_all = np.hstack([preds, preds_neg]) + labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))]) + roc_score = roc_auc_score(labels_all, preds_all) + ap_score = average_precision_score(labels_all, preds_all) + + precision, recall, _thresholds = metrics.precision_recall_curve(labels_all, preds_all) + aupr_score = auc(recall, precision) + + return roc_score, ap_score, emb, aupr_score diff --git a/input.py b/input.py new file mode 100644 index 0000000..894ecc1 --- /dev/null +++ b/input.py @@ -0,0 +1,197 @@ +import inspect +import pickle + +import numpy as np +import scipy.sparse as sp + + +def sparse_to_tuple(sparse_mx): + if not sp.isspmatrix_coo(sparse_mx): + sparse_mx = sparse_mx.tocoo() + coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose() + values = sparse_mx.data + shape = sparse_mx.shape + return coords, values, shape + + +def preprocess_graph(adj): + adj = sp.coo_matrix(adj) + adj_ = adj + sp.eye(adj.shape[0]) # A* = A + I, i.e. add self-connections to the adjacency matrix + + rowsum = np.array(adj_.sum(1)) # row sums, i.e. the node degrees + degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten()) # d = D^(-1/2), the inverse square root of the degree matrix + adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo() # this step is the symmetric normalization, i.e. (A* x d) transposed x d = D^-1/2 A* D^-1/2 + return sparse_to_tuple(adj_normalized) + + +def load_data(dataset): + adj = np.loadtxt('../data/partitioned_data/{0}/orig/{0}_adj_orig.txt'.format(dataset), dtype=int) + adj = sp.csr_matrix(adj) + features = pickle.load(open("../data/partitioned_data/{0}/feature/{0}_feature.pkl".format(dataset), 'rb')) + y_test = 0 + tx = 0 + ty = 0 + test_mask = 0 + labels = 0 + return adj, features, y_test, tx, ty, test_mask, labels
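# --- Editor's illustrative sketch (not part of the patch) ---------------------
# A minimal example of what preprocess_graph() above returns, on a made-up
# 3-node graph. It relies only on the numpy/scipy imports and the functions
# defined above in input.py; the toy adjacency matrix is hypothetical.
if __name__ == '__main__':
    toy_adj = sp.csr_matrix(np.array([[0, 1, 0],
                                      [1, 0, 1],
                                      [0, 1, 0]]))
    coords, values, shape = preprocess_graph(toy_adj)  # D^-1/2 (A + I) D^-1/2 as a COO tuple
    print(shape)   # (3, 3)
    print(coords)  # coordinates of the non-zero entries of the normalized matrix
# -------------------------------------------------------------------------------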
+ + +def mask_test_edges(adj): + # Function to build test set with 10% positive links + # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper. + # TODO: Clean up. + # sp.dia_matrix((data, offsets)) places each row of data on the diagonal given by the corresponding offset + # see https://blog.csdn.net/ChenglinBen/article/details/84424379 for details + # .diagonal() extracts the diagonal elements + # Remove diagonal elements + adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) + # drop any explicit zero entries + adj.eliminate_zeros() + # Check that diag is zero: + # np.diag(matrix) extracts the diagonal of matrix; todense() is like toarray(), except one returns a dense matrix and the other an ndarray + # the assert checks that every diagonal element has indeed been cleared + assert np.diag(adj.todense()).sum() == 0 + + # sp.triu(matrix) returns the upper triangle of matrix; correspondingly, tril() returns the lower triangle + adj_triu = sp.triu(adj) + adj_tuple = sparse_to_tuple(adj_triu) + # edges acts like combinations: taken from the upper triangle, each undirected edge appears only once, so (4,6) is kept while (6,4) is not + # edges_all acts like permutations: both directions are included + edges = adj_tuple[0] + edges_all = sparse_to_tuple(adj)[0] + # one tenth of the edges is used for test + # one twentieth of the edges is used for val + num_test = int(np.floor(edges.shape[0] / 10.)) + num_val = int(np.floor(edges.shape[0] / 20.)) + + # randomly pick the test and val subsets + all_edge_idx = list(range(edges.shape[0])) + np.random.shuffle(all_edge_idx) + val_edge_idx = all_edge_idx[:num_val] + test_edge_idx = all_edge_idx[num_val:(num_val + num_test)] + test_edges = edges[test_edge_idx] + val_edges = edges[val_edge_idx] + train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0) + + # See the gae repository on GitHub for this function; it has since been updated there. The original version appears wrong, or at least incompatible with Python 3, + # and its return should probably be np.any(rows_close), as done below. + def ismember(a, b, tol=5): + # checks whether row a occurs among the rows of b + rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1) + return np.any(rows_close) + # return (np.all(np.any(rows_close, axis=-1), axis=-1) and + # np.all(np.any(rows_close, axis=0), axis=0)) + + # test_edges_false: sample edges that do not exist in the graph + test_edges_false = [] + while len(test_edges_false) < len(test_edges): + idx_i = np.random.randint(0, adj.shape[0]) + idx_j = np.random.randint(0, adj.shape[0]) + if idx_i == idx_j: + continue + if ismember([idx_i, idx_j], edges_all): + continue + if test_edges_false: + if ismember([idx_j, idx_i], np.array(test_edges_false)): + continue + if ismember([idx_i, idx_j], np.array(test_edges_false)): + continue + test_edges_false.append([idx_i, idx_j]) + + # val_edges_false: sample edges that appear in neither train nor val + val_edges_false = [] + while len(val_edges_false) < len(val_edges): + idx_i = np.random.randint(0, adj.shape[0]) + idx_j = np.random.randint(0, adj.shape[0]) + if idx_i == idx_j: + continue + if ismember([idx_i, idx_j], train_edges): + continue + if ismember([idx_j, idx_i], train_edges): + continue + if ismember([idx_i, idx_j], val_edges): + continue + if ismember([idx_j, idx_i], val_edges): + continue + if val_edges_false: + if ismember([idx_j, idx_i], np.array(val_edges_false)): + continue + if ismember([idx_i, idx_j], np.array(val_edges_false)): + continue + val_edges_false.append([idx_i, idx_j]) + + assert ~ismember(test_edges_false, edges_all) + # assert ~ismember(val_edges_false, edges_all) + assert ~ismember(val_edges, train_edges) + assert ~ismember(test_edges, train_edges) + assert ~ismember(val_edges, test_edges) + + data = np.ones(train_edges.shape[0]) + + # Re-build adj matrix + # as the comment above says, rebuild adj_train from the processed train_edges + adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape) + adj_train = adj_train + adj_train.T + + # NOTE: these edge lists only contain single direction of edge! + return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false
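# --- Editor's illustrative sketch (not part of the patch) ---------------------
# The corrected ismember() used inside mask_test_edges() answers "does row a
# occur among the rows of the 2-D array b?". A toy check of that behaviour,
# using the numpy import from the top of input.py; the values are made up.
if __name__ == '__main__':
    def _ismember_demo(a, b, tol=5):
        rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1)
        return np.any(rows_close)

    demo_edges = np.array([[0, 1], [2, 3]])
    print(_ismember_demo([2, 3], demo_edges))  # True  -- the row is present
    print(_ismember_demo([3, 2], demo_edges))  # False -- direction matters
# -------------------------------------------------------------------------------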
+ +def retrieve_name(var): + callers_local_vars = inspect.currentframe().f_back.f_locals.items() + print([var_name for var_name, var_val in callers_local_vars if var_val is var]) + return [var_name for var_name, var_val in callers_local_vars if var_val is var][0] + +def get_data(dataset): + # Load data + # adj, features, y_test, tx, ty, test_mask, true_labels = load_data(data_name) + adj, features, y_test, tx, ty, test_mask, true_labels = load_data(dataset) # e ic gpcr nr luo + + # Store original adjacency matrix (without diagonal entries) for later + adj_orig = adj + # remove the diagonal elements + adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) + adj_orig.eliminate_zeros() + + adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj) + adj = adj_train + adj_dense = adj.toarray() + + # Some preprocessing + adj_norm = preprocess_graph(adj) + + num_nodes = adj.shape[0] + features_dense = features.tocoo().toarray() + + features = sparse_to_tuple(features.tocoo()) + # num_features is the feature dimensionality + num_features = features[2][1] + # features_nonzero is the number of non-zero feature entries + features_nonzero = features[1].shape[0] + + pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum() + norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2) + + adj_label = adj_train + sp.eye(adj_train.shape[0]) + adj_label = sparse_to_tuple(adj_label) + items = [ + adj, num_features, num_nodes, features_nonzero, + pos_weight, norm, adj_norm, adj_label, + features, true_labels, train_edges, val_edges, + val_edges_false, test_edges, test_edges_false, adj_orig, features_dense, adj_dense, features_dense + ] + + feas = {} + + print('num_features is:', num_features) + print('num_nodes is:', num_nodes) + print('features_nonzero is:', features_nonzero) + print('pos_weight is:', pos_weight) + print('norm is:', norm) + + for item in items: + # item_name = [ k for k,v in locals().iteritems() if v == item][0] + feas[retrieve_name(item)] = item + + feas['num_features'] = num_features + feas['num_nodes'] = num_nodes + return feas
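# --- Editor's illustrative sketch (not part of the patch) ---------------------
# How the pos_weight / norm values computed in get_data() behave on a made-up
# 4-node adjacency matrix (same formulas as above, toy numbers for illustration;
# relies on the numpy/scipy imports at the top of input.py):
if __name__ == '__main__':
    toy = sp.csr_matrix(np.array([[0, 1, 0, 0],
                                  [1, 0, 1, 0],
                                  [0, 1, 0, 0],
                                  [0, 0, 0, 0]]))
    n = toy.shape[0]
    toy_pos_weight = float(n * n - toy.sum()) / toy.sum()    # (16 - 4) / 4 = 3.0 -> positive entries are up-weighted
    toy_norm = n * n / float((n * n - toy.sum()) * 2)         # 16 / 24 ~= 0.667
    print(toy_pos_weight, toy_norm)
# -------------------------------------------------------------------------------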
diff --git a/src/model.py b/model.py similarity index 100% rename from src/model.py rename to model.py diff --git a/optimizer.py b/optimizer.py new file mode 100644 index 0000000..6ad165d --- /dev/null +++ b/optimizer.py @@ -0,0 +1,158 @@ +import numpy as np +import tensorflow as tf + + +class OptimizerCycle(object): + def __init__(self, preds, labels, pos_weight, norm, d_real, d_fake, GD_real, GD_fake, preds_z2g, labels_z2g, preds_cycle, labels_cycle, gradient, gradient_z, settings): + preds_sub = preds + labels_sub = labels + + self.real = d_real + self.settings = settings + + # Discriminator Loss + self.dc_loss_real = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(self.real), logits=self.real, name='dclreal')) + # self.dc_loss_real = - tf.reduce_mean(self.real) + self.dc_loss_fake = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(d_fake), logits=d_fake, name='dcfake')) + # self.dc_loss_fake = tf.reduce_mean(d_fake) + # GP_loss = tf.reduce_mean(tf.square(tf.sqrt(tf.reduce_mean(tf.square(gradient), axis = [0, 1])) - 1)) + # GP_loss_z = tf.reduce_mean(tf.square(tf.sqrt(tf.reduce_mean(tf.square(gradient_z), axis = [0, 1])) - 1)) + # self.dc_loss = self.dc_loss_fake + self.dc_loss_real + 10.0 * GP_loss + + self.GD_loss_real = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(GD_real), logits=GD_real, name='GD_real')) + # self.GD_loss_real = - tf.reduce_mean(GD_real) + self.GD_loss_fake = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(GD_fake), logits=GD_fake, name='GD_fake')) + # self.GD_loss_fake = tf.reduce_mean(GD_fake) + + self.dc_loss = self.dc_loss_fake + self.dc_loss_real + self.GD_loss = self.GD_loss_fake + self.GD_loss_real + + # Generator loss + generator_loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_fake), logits=d_fake, name='gl')) + # generator_loss = -self.dc_loss_fake + generator_loss_z2g = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(GD_fake), logits=GD_fake, name='G_z2g')) + # generator_loss_z2g = -self.GD_loss_fake + # pos_weight trades off recall and precision by up- or down-weighting the cost of positive errors relative to negative errors + self.cost = norm * tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(logits=preds_sub, targets=labels_sub, pos_weight=pos_weight)) + + cost_cycle = norm * tf.reduce_mean(tf.square(preds_cycle - labels_cycle)) + + cost_z2g = norm * tf.reduce_mean(tf.square(preds_z2g - labels_z2g)) + + self.cost = self.cost + cost_cycle + self.generator_loss = generator_loss + self.cost + self.generator_loss_z2g = generator_loss_z2g + + all_variables = tf.trainable_variables() + dc_var = [var for var in all_variables if 'dc_' in var.name] + en_var = [var for var in all_variables if 'e_' in var.name] + GG_var = [var for var in all_variables if 'GG' in var.name] + GD_var = [var for var in all_variables if 'GD' in var.name] + + with tf.variable_scope(tf.get_variable_scope()): + self.discriminator_optimizer = tf.train.AdamOptimizer(learning_rate=self.settings.discriminator_learning_rate, + beta1=0.9, name='adam1').minimize(self.dc_loss, + var_list=dc_var) # minimize(dc_loss_real, var_list=dc_var) + + self.generator_optimizer = tf.train.AdamOptimizer(learning_rate=self.settings.discriminator_learning_rate, + beta1=0.9, name='adam2').minimize(self.generator_loss, var_list=en_var) + + self.discriminator_optimizer_z2g = tf.train.AdamOptimizer(learning_rate=self.settings.discriminator_learning_rate, + beta1=0.9, name='adam1').minimize(self.GD_loss, var_list=GD_var) + + self.generator_optimizer_z2g = tf.train.AdamOptimizer(learning_rate=self.settings.discriminator_learning_rate, + beta1=0.9, name='adam2').minimize(self.generator_loss_z2g, var_list=GG_var) + + # Note that, in addition to the adversarial optimization, + # the plain cost loss is optimized once more on its own here; + # when reading the training code, watch where this extra optimization step is actually run. + self.optimizer = tf.train.AdamOptimizer(learning_rate=self.settings.learning_rate) # Adam Optimizer + self.opt_op = self.optimizer.minimize(self.cost) + # self.grads_vars = self.optimizer.compute_gradients(self.cost) + + # self.optimizer_z2g = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) # Adam Optimizer + # self.opt_op_z2g = self.optimizer.minimize(cost_z2g) + # self.grads_vars_z2g = self.optimizer.compute_gradients(cost_z2g)
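# --- Editor's note (not part of the patch) -------------------------------------
# OptimizerCycle above combines four ingredients:
#   * self.cost: the pos_weight-ed cross-entropy reconstruction loss on the adjacency,
#     plus the cycle-consistency term ||preds_cycle - labels_cycle||^2;
#   * self.dc_loss / generator_loss: the adversarial game between the discriminator on
#     embeddings and the encoder acting as its generator;
#   * self.GD_loss / generator_loss_z2g: the same game for the graph-side discriminator
#     on generated features (the z -> graph direction);
#   * one Adam optimizer per component (opt_op plus the four adversarial optimizers),
#     each restricted to its own variable group via the dc_, e_, GD, GG name prefixes.
# --------------------------------------------------------------------------------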
+ + +class Optimizer(object): + def __init__(self, model, model_z2g, D_Graph, discriminator, placeholders, pos_weight, norm, d_real, num_nodes, GD_real, settings): + self.opt = self.construct_optimizer(model, model_z2g, D_Graph, discriminator, placeholders, pos_weight, norm, d_real, num_nodes, GD_real, settings) + + def construct_optimizer(self, model, model_z2g, D_Graph, discriminator, placeholders, pos_weight, norm, d_real, num_nodes, GD_real, settings): + z2g = model_z2g.construct() + hidden = z2g[1] + z2g = z2g[0] + preds_z2g = model.construct(hidden=hidden, reuse=True)[0] + g2z = model.construct() + + embeddings = g2z[0] + reconstructions = g2z[1] + d_fake = discriminator.construct(embeddings, reuse=True) + GD_fake = D_Graph.construct(z2g, reuse=True) + + epsilon = tf.random_uniform(shape=[1], minval=0.0, maxval=1.0) + interpolated_input = epsilon * placeholders['real_distribution'] + (1 - epsilon) * embeddings + gradient = tf.gradients(discriminator.construct(interpolated_input, reuse=True), [interpolated_input])[0] + + epsilon = tf.random_uniform(shape=[1], minval=0.0, maxval=1.0) + interpolated_input = epsilon * placeholders['features_dense'] + (1 - epsilon) * z2g + gradient_z = tf.gradients(D_Graph.construct(interpolated_input, reuse=True), [interpolated_input])[0] + + opt = OptimizerCycle(preds=reconstructions, + labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'], validate_indices=False), [-1]), + pos_weight=pos_weight, + norm=norm, + d_real=d_real, + d_fake=d_fake, + GD_real=GD_real, + GD_fake=GD_fake, + preds_z2g=preds_z2g, + labels_z2g=placeholders['real_distribution'], + preds_cycle=model_z2g.construct(embeddings, reuse=True)[0], + labels_cycle=placeholders['features_dense'], + gradient=gradient, + gradient_z=gradient_z, + settings=settings) + return opt + +def construct_feed_dict(adj_normalized, adj, features, placeholders): + # construct feed dictionary + # .update() adds the given entries to the dict + feed_dict = dict() # create an empty dict + feed_dict.update({placeholders['features']: features}) + feed_dict.update({placeholders['adj']: adj_normalized}) + feed_dict.update({placeholders['adj_orig']: adj}) + return feed_dict + +def update(model, opt, sess, adj_norm, adj_label, features, placeholders, adj, distribution, adj_dense, settings): + # Construct feed dictionary + feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders) + feed_dict.update({placeholders['dropout']: settings.dropout}) + feed_dict.update({placeholders['features_dense']: adj_dense}) + feed_dict.update({placeholders['dropout']: 0}) + z_real_dist = np.random.randn(adj.shape[0], settings.hidden2) + z_real_dist = distribution.sample(adj.shape[0]) + feed_dict.update({placeholders['real_distribution']: z_real_dist}) + + for j in range(5): + _, reconstruct_loss = sess.run([opt.opt_op, opt.cost], feed_dict=feed_dict) + g_loss, _ = sess.run([opt.generator_loss, opt.generator_optimizer], feed_dict=feed_dict) + d_loss, _ = sess.run([opt.dc_loss, opt.discriminator_optimizer], feed_dict=feed_dict) + + GD_loss, _ = sess.run([opt.GD_loss, opt.discriminator_optimizer_z2g], feed_dict=feed_dict) + GG_loss, _ = sess.run([opt.generator_loss_z2g, opt.generator_optimizer_z2g], feed_dict=feed_dict) + # GD_loss = sess.run(opt.GD_loss, feed_dict=feed_dict) + # GG_loss = sess.run(opt.generator_loss_z2g, feed_dict=feed_dict) + # g_loss = sess.run(opt.generator_loss, feed_dict=feed_dict) + # d_loss = sess.run(opt.dc_loss, feed_dict=feed_dict) + emb = sess.run(model.z_mean, feed_dict=feed_dict) + avg_cost = [reconstruct_loss, d_loss, g_loss, GD_loss, GG_loss] + + return emb, avg_cost \ No newline at end of file
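# --- Editor's illustrative sketch (not part of the patch) ---------------------
# One way the pieces above might be wired together for a single training epoch.
# The model / opt / sess / placeholders / distribution / settings objects are
# assumed to be built as in train.py (not shown in this hunk); `feas` is the dict
# returned by input.get_data().
from evaluation import Evaluator
from optimizer import update

def run_one_epoch(model, opt, sess, feas, placeholders, distribution, settings):
    emb, avg_cost = update(model, opt, sess,
                           feas['adj_norm'], feas['adj_label'], feas['features'],
                           placeholders, feas['adj'], distribution,
                           feas['adj_dense'], settings)
    evaluator = Evaluator(feas['test_edges'], feas['test_edges_false'])
    roc, ap, _, aupr = evaluator.get_roc_score(emb, feas)
    return avg_cost, roc, ap, aupr
# -------------------------------------------------------------------------------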
diff --git a/p1_preprocessing_data/__init__.py b/p1_preprocessing_data/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/p1_preprocessing_data/load_data.py b/p1_preprocessing_data/load_data.py deleted file mode 100644 index 4ec58cc..0000000 --- a/p1_preprocessing_data/load_data.py +++ /dev/null @@ -1,94 +0,0 @@ -import os -import random - -import numpy as np -import scipy.sparse as sp - -from src import config - - -def load_luo_data(dataset): - dp = np.loadtxt('../../data/RawData/luo/mat_drug_protein.txt'.format(dataset), dtype=int) - dd = np.loadtxt('../../data/RawData/luo/mat_drug_drug.txt'.format(dataset), dtype=int) - pp = np.loadtxt('../../data/RawData/luo/mat_protein_protein.txt'.format(dataset), dtype=int) - adj = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp)))) - return sp.csr_matrix(adj + sp.eye(adj.shape[0])), dd.shape[0] - - -def load_yam_data(dataset): - dp = np.loadtxt('../../data/RawData/Yamanishi/{}_admat_dgc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(np.int).T - dd = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(np.float) - pp = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dg.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(np.float) - dd = np.where(dd < 0.5, 0, 1) - pp = np.where(pp < 0.5, 0, 1) - adj = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp)))) - return sp.csr_matrix(adj), dd.shape[0] - - -def is_symmetry(adj): - for i in range(adj.shape[0]): - for j in range(adj.shape[1]): - if adj[i][j] != adj[j][i]: - return False - return True - - -def is_1_diag(adj): - if sum(np.diagonal(adj)) != adj.shape[0]: - return False - return True - - -def change_unbalanced(adj, percent, dp_line, dataset): - """ - note: percent controls the fraction of known associations that are masked out - :param adj: - :param percent: - :return: the adjacency matrix with part of the known associations removed - """ - # check symmetry - # assert is_symmetry(adj.A) - adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) + sp.eye(adj.shape[0]) - # check that the diagonal is all ones - assert is_1_diag(adj.A) - adj = (sp.triu(adj) + sp.triu(adj).T - sp.eye(adj.shape[0])).A - - row = list(range(0, dp_line)) - col = list(range(dp_line, adj.shape[0])) - - idx = [] - for i in row: - for j in col: - if i != j and adj[i][j] == 1: - idx.append((i, j)) - num = int(np.floor(percent * len(idx))) - count = 0 - # random.seed(config.seed) - while count < num: - row, col = random.choice(idx) - idx.remove((row, col)) - adj[row][col] = 0 - adj[col][row] = 0 - count += 1 - - # idx = [] - # for i in range(adj.shape[0]): - # for j in range(i + 1, adj.shape[0]): - # if adj[i][j] == 1: - # idx.append((i, j)) - # num = int(np.floor(percent * len(idx))) - # count = 0 - # # random.seed(config.seed) - # while count < num: - # row, col = random.choice(idx) - # idx.remove((row, col)) - # adj[row][col] = 0 - # adj[col][row] = 0 - # count += 1 - - # save the new dp matrix after changing the imbalance - new_dp = adj[0:dp_line, dp_line:] - # if not os.path.exists('../../data/partitioned_data/{0}/feature'.format(dataset)): - # os.mkdir('../../data/partitioned_data/{0}/feature'.format(dataset)) - # np.savetxt('../../data/partitioned_data/{0}/feature/{0}_new_admat_dgc.txt'.format(dataset), new_dp, fmt='%d', delimiter='\t') - return sp.csr_matrix(adj.astype(np.int)) diff --git a/p1_preprocessing_data/process_data.py b/p1_preprocessing_data/process_data.py deleted file mode 100644 index 1138359..0000000 --- a/p1_preprocessing_data/process_data.py +++ /dev/null @@ -1,88 +0,0 @@ -import os -import pickle - -import numpy as np - -from src import config -import scipy.sparse as sp - -from load_data import load_yam_data, change_unbalanced, load_luo_data -from utils import divide_vgae_datasets, sparse_to_tuple, divide_datasets - -for dataset in config.datasets: - g = os.walk(r"../../data/partitioned_data/{}".format(dataset)) - for path, dir_list, file_list in g: - for file_name in file_list: - os.remove(os.path.join(path, file_name)) - print("Cache cleared!") - - # Load data: produces an adjacency matrix with bidirectional edges - if dataset == 'luo':
- adj, dp_line = load_luo_data(dataset) - else: - adj, dp_line = load_yam_data(dataset) - - if not os.path.exists("../../data/partitioned_data"): - os.mkdir("../../data/partitioned_data") - if not os.path.exists("../../data/partitioned_data/{}".format(dataset)): - os.mkdir("../../data/partitioned_data/{}".format(dataset)) - if not os.path.exists("../../data/partitioned_data/{}/orig".format(dataset)): - os.mkdir("../../data/partitioned_data/{}/orig/".format(dataset)) - np.savetxt("../../data/partitioned_data/{}/orig/dp_line.txt".format(dataset), np.array([dataset, str(dp_line)]), fmt='%s') - - # obtain data with a different degree of imbalance - adj = change_unbalanced(adj, config.percent, dp_line, dataset) - - # Store original adjacency matrix (without diagonal entries) for later - adj_orig = adj - adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) # remove the diagonal, in case it has entries - adj_orig.eliminate_zeros() # remove any explicit zeros from the matrix - path = "../../data/partitioned_data/{}/orig/".format(dataset) - if not os.path.exists(path): - os.makedirs(path) - pickle.dump(adj_orig, open(path + dataset + "_adj_orig.pkl", 'wb')) - np.savetxt(path + dataset + "_adj_orig.txt", adj_orig.A, fmt='%d') - - # split the data for embedding learning, partition the datasets, and record the edges - for i in range(10): - # Remove diagonal elements - adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) # done once more - adj.eliminate_zeros() - # Check that diag is zero: - assert np.diag(adj.todense()).sum() == 0 - - # split data for graphgan - g_adj = adj[0:dp_line, dp_line:] - g_edges = sparse_to_tuple(g_adj)[0] - g_num_test = int(np.floor(g_edges.shape[0] / 10.)) # np.floor() rounds down; one tenth of the edges for test, one twentieth for validation - g_num_val = int(np.floor(g_edges.shape[0] / 20.)) - - adj_pd, train_edges, test_edges, test_edges_false = divide_datasets(g_adj, g_edges, g_num_test, i, dp_line) - adj[0:dp_line, dp_line:] = adj_pd - - # hand the training set to the vgae - edges = sparse_to_tuple(sp.triu(adj))[0] - edges_all = sparse_to_tuple(adj)[0] # convert the adjacency matrix to a tuple and keep only the coordinates, i.e. all edges - num_test = int(np.floor(edges.shape[0] / 10.)) # np.floor() rounds down; one tenth of the edges for test, one twentieth for validation - num_val = int(np.floor(edges.shape[0] / 20.)) - - adj_train, vgae_train_edges, vgae_test_edges, vgae_test_edges_false = divide_vgae_datasets(adj, edges, edges_all, num_test, num_val, - i) # val_edges, val_edges_false, - - # save the partitioned data - path = "../../data/partitioned_data/{}/{}fold/".format(dataset, i) - if not os.path.exists(path): - os.makedirs(path) - - pickle.dump(adj_train, open(path + dataset + "_adj_train.pkl", 'wb')) - - np.savetxt(path + dataset + "_vgae_train.txt", vgae_train_edges, fmt='%d') - np.savetxt(path + dataset + "_vgae_test.txt", vgae_test_edges, fmt='%d') - np.savetxt(path + dataset + "_vgae_test_neg.txt", vgae_test_edges_false, fmt='%d') - - np.savetxt(path + dataset + "_train.txt", vgae_train_edges, fmt='%d') - np.savetxt(path + dataset + "_pd_train.txt", train_edges, fmt='%d') - np.savetxt(path + dataset + "_test.txt", test_edges, fmt='%d') - np.savetxt(path + dataset + "_test_neg.txt", test_edges_false, fmt='%d') - - print("OK") diff --git a/p1_preprocessing_data/utils.py b/p1_preprocessing_data/utils.py deleted file mode 100644 index a5dc3b4..0000000 --- a/p1_preprocessing_data/utils.py +++ /dev/null @@ -1,130 +0,0 @@ -import numpy as np -import scipy.sparse as sp - -from src import config - - -def sparse_to_tuple(sparse_mx): - if not sp.isspmatrix_coo(sparse_mx): - sparse_mx = sparse_mx.tocoo() - coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose() - values = sparse_mx.data - shape = sparse_mx.shape -
return coords, values, shape - - -def divide_vgae_datasets(adj, edges, edges_all, num_test, num_val, i): - # Function to build a test set containing 10% of the positive links - # NOTE: the split is randomized; results may deviate slightly from the numbers reported in the paper. - - if i == 9: - start_test = num_test * i - end_test = edges.shape[0] - start_val = 0 - end_val = num_val - else: - start_test = num_test * i - end_test = num_test * (i + 1) - start_val = end_test - end_val = end_test + num_val - - all_edge_idx = list(range(edges.shape[0])) - np.random.seed(config.seed) - np.random.shuffle(edges) - # val_edge_idx = all_edge_idx[start_val:end_val] - test_edge_idx = all_edge_idx[start_test:end_test] - test_edges = edges[test_edge_idx] - # val_edges = edges[val_edge_idx] - train_edges = np.delete(edges, np.hstack([test_edge_idx]), axis=0) # , val_edge_idx - - def ismember(a: list, b, tol=5): - rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1) - return np.any(rows_close) - - test_edges_false = [] - while len(test_edges_false) < len(test_edges): - idx_i = np.random.randint(0, adj.shape[0]) # random row index - idx_j = np.random.randint(0, adj.shape[0]) # random column index - if idx_i == idx_j: # skip diagonal entries - continue - if ismember([idx_i, idx_j], edges_all): # skip known edges - continue - if test_edges_false: # skip already chosen negative edges, in either direction (a-b or b-a) - if ismember([idx_j, idx_i], np.array(test_edges_false)): - continue - if ismember([idx_i, idx_j], np.array(test_edges_false)): - continue - test_edges_false.append([idx_i, idx_j]) - - # val_edges_false = [] - # while len(val_edges_false) < len(val_edges): - # idx_i = np.random.randint(0, adj.shape[0]) - # idx_j = np.random.randint(0, adj.shape[0]) - # if idx_i == idx_j: # skip diagonal entries - # continue - # if ismember([idx_i, idx_j], edges_all): # skip known edges - # continue - # if val_edges_false: - # if ismember([idx_j, idx_i], np.array(val_edges_false)): - # continue - # if ismember([idx_i, idx_j], np.array(val_edges_false)): - # continue - # val_edges_false.append([idx_i, idx_j]) - - assert ~ismember(test_edges_false, edges_all) - # assert ~ismember(val_edges_false, edges_all) - # assert ~ismember(val_edges, train_edges) - assert ~ismember(test_edges, train_edges) - # assert ~ismember(val_edges, test_edges) - - # Re-build adj matrix - adj_train = sp.csr_matrix((np.ones(train_edges.shape[0]), (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape) - adj_train = adj_train + adj_train.T # train_edges only stores one direction, so symmetrize the matrix - - # NOTE: these edge lists only contain single direction of edge!
- return adj, train_edges, test_edges, np.array(test_edges_false) # , val_edges, np.array(val_edges_false) - - -def divide_datasets(adj, edges, num_test, i, dp_line): - if i == 9: - start_test = num_test * i - end_test = edges.shape[0] - else: - start_test = num_test * i - end_test = num_test * (i + 1) - - all_edge_idx = list(range(edges.shape[0])) - np.random.seed(config.seed) - np.random.shuffle(edges) - test_edge_idx = all_edge_idx[start_test:end_test] - test_edges = edges[test_edge_idx] - train_edges = np.delete(edges, np.hstack([test_edge_idx]), axis=0) # , val_edge_idx - - def ismember(a: list, b, tol=5): - rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1) - return np.any(rows_close) - - test_edges_false = [] - while len(test_edges_false) < len(test_edges): - idx_i = np.random.randint(0, adj.shape[0]) # random row index - idx_j = np.random.randint(0, adj.shape[1]) # random column index - if idx_i == idx_j: # skip self-pairs - continue - if ismember([idx_i, idx_j], edges): # skip known edges - continue - test_edges_false.append([idx_i, idx_j]) - - adj_pd = sp.csr_matrix((np.ones(train_edges.shape[0]), (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape) - - # shift the column indices by dp_line - def add_index(edges): - edges = np.array(edges) - colu = edges[:, 1] + dp_line - edges[:, 1] = colu - return edges - - train_edges = add_index(train_edges) - test_edges = add_index(test_edges) - test_edges_false = add_index(test_edges_false) - - return adj_pd, train_edges, test_edges, test_edges_false diff --git a/p2_preprocessing_feature/__init__.py b/p2_preprocessing_feature/__init__.py deleted file mode 100644 index bfa83a0..0000000 --- a/p2_preprocessing_feature/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from __future__ import print_function -from __future__ import division diff --git a/p2_preprocessing_feature/load_feature.py b/p2_preprocessing_feature/load_feature.py deleted file mode 100644 index 17e98f5..0000000 --- a/p2_preprocessing_feature/load_feature.py +++ /dev/null @@ -1,18 +0,0 @@ -import numpy as np -import scipy.sparse as sp - - -def load_yam_feature(dataset): - dp = np.loadtxt('../../data/RawData/Yamanishi/{}_admat_dgc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(np.float).T - dd = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(np.float) - pp = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dg.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(np.float) - feature = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp)))) - return sp.lil_matrix(feature) - - -def load_luo_feature(dataset): - dp = np.loadtxt('../../data/RawData/luo/mat_drug_protein.txt'.format(dataset), dtype=float) - dd = np.loadtxt('../../data/RawData/luo/Similarity_Matrix_Drugs.txt'.format(dataset), dtype=float) - pp = np.loadtxt('../../data/RawData/luo/Similarity_Matrix_Proteins.txt'.format(dataset), dtype=float) / 100 - feature = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp)))) - return sp.lil_matrix(feature) diff --git a/p2_preprocessing_feature/process_feature.py b/p2_preprocessing_feature/process_feature.py deleted file mode 100644 index 727e488..0000000 --- a/p2_preprocessing_feature/process_feature.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -import pickle - -from src import config -from src.p2_preprocessing_feature.load_feature import load_yam_feature, load_luo_feature - -for dataset in config.datasets: - # feature: lil_matrix - if dataset == 'luo': - feature = load_luo_feature(dataset) - else: - feature =
load_yam_feature(dataset) - - # save the features - path = "../../data/partitioned_data/{}/feature/".format(dataset) - if not os.path.exists(path): - os.makedirs(path) - pickle.dump(feature, open(path + dataset + "_feature.pkl", 'wb')) - - print("ok") diff --git a/src/train.py b/train.py similarity index 98% rename from src/train.py rename to train.py index 705e769..c9fd9bd 100644 --- a/src/train.py +++ b/train.py @@ -7,9 +7,9 @@ from sklearn.decomposition import PCA from sklearn.neighbors import KernelDensity from input import get_data -from src.evaluation import Evaluator -from src.model import BGAN -from src.optimizer import Optimizer, update +from evaluation import Evaluator +from model import BGAN +from optimizer import Optimizer, update # parse arguments def parse_args():