From c2c081e0dfd8f44ef76da4a11ff412cce6c32110 Mon Sep 17 00:00:00 2001
From: lab-pc
Date: Tue, 11 Oct 2022 13:16:39 +0800
Subject: [PATCH] --

---
 p1_preprocessing_data/__init__.py           |   0
 p1_preprocessing_data/load_data.py          |  94 ++++++++++++++
 p1_preprocessing_data/process_data.py       |  88 +++++++++++++
 p1_preprocessing_data/utils.py              | 130 ++++++++++++++++++++
 p2_preprocessing_feature/__init__.py        |   2 +
 p2_preprocessing_feature/load_feature.py    |  18 +++
 p2_preprocessing_feature/process_feature.py |  20 +++
 7 files changed, 352 insertions(+)
 create mode 100644 p1_preprocessing_data/__init__.py
 create mode 100644 p1_preprocessing_data/load_data.py
 create mode 100644 p1_preprocessing_data/process_data.py
 create mode 100644 p1_preprocessing_data/utils.py
 create mode 100644 p2_preprocessing_feature/__init__.py
 create mode 100644 p2_preprocessing_feature/load_feature.py
 create mode 100644 p2_preprocessing_feature/process_feature.py

diff --git a/p1_preprocessing_data/__init__.py b/p1_preprocessing_data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/p1_preprocessing_data/load_data.py b/p1_preprocessing_data/load_data.py
new file mode 100644
index 0000000..4ec58cc
--- /dev/null
+++ b/p1_preprocessing_data/load_data.py
@@ -0,0 +1,94 @@
+import os
+import random
+
+import numpy as np
+import scipy.sparse as sp
+
+from src import config
+
+
+def load_luo_data(dataset):
+    dp = np.loadtxt('../../data/RawData/luo/mat_drug_protein.txt', dtype=int)
+    dd = np.loadtxt('../../data/RawData/luo/mat_drug_drug.txt', dtype=int)
+    pp = np.loadtxt('../../data/RawData/luo/mat_protein_protein.txt', dtype=int)
+    adj = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp))))
+    return sp.csr_matrix(adj + sp.eye(adj.shape[0])), dd.shape[0]
+
+
+def load_yam_data(dataset):
+    dp = np.loadtxt('../../data/RawData/Yamanishi/{}_admat_dgc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(int).T
+    dd = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(float)
+    pp = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dg.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(float)
+    dd = np.where(dd < 0.5, 0, 1)
+    pp = np.where(pp < 0.5, 0, 1)
+    adj = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp))))
+    return sp.csr_matrix(adj), dd.shape[0]
+
+
+def is_symmetry(adj):
+    for i in range(adj.shape[0]):
+        for j in range(adj.shape[1]):
+            if adj[i][j] != adj[j][i]:
+                return False
+    return True
+
+
+def is_1_diag(adj):
+    if sum(np.diagonal(adj)) != adj.shape[0]:
+        return False
+    return True
+
+
+def change_unbalanced(adj, percent, dp_line, dataset):
+    """
+    Note: `percent` controls the fraction of known associations that are masked out.
+    :param adj: adjacency matrix
+    :param percent: fraction of known drug-protein associations to remove
+    :return: adjacency matrix with part of the known associations removed
+    """
+    # Optionally check that the matrix is symmetric
+    # assert is_symmetry(adj.A)
+    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) + sp.eye(adj.shape[0])
+    # Make sure the diagonal is all ones
+    assert is_1_diag(adj.A)
+    adj = (sp.triu(adj) + sp.triu(adj).T - sp.eye(adj.shape[0])).A
+
+    row = list(range(0, dp_line))
+    col = list(range(dp_line, adj.shape[0]))
+
+    idx = []
+    for i in row:
+        for j in col:
+            if i != j and adj[i][j] == 1:
+                idx.append((i, j))
+    num = int(np.floor(percent * len(idx)))
+    count = 0
+    # random.seed(config.seed)
+    while count < num:
+        row, col = random.choice(idx)
+        idx.remove((row, col))
+        adj[row][col] = 0
+        adj[col][row] = 0
+        count += 1
+
+    # Alternative: sample the removed edges from the whole matrix instead of only the drug-protein block
+    # idx = []
+    # for i in range(adj.shape[0]):
+    #     for j in range(i + 1, adj.shape[0]):
+    #         if adj[i][j] == 1:
+    #             idx.append((i, j))
+    # num = int(np.floor(percent * len(idx)))
+    # count = 0
+    # # random.seed(config.seed)
+    # while count < num:
+    #     row, col = random.choice(idx)
+    #     idx.remove((row, col))
+    #     adj[row][col] = 0
+    #     adj[col][row] = 0
+    #     count += 1
+
+    # New drug-protein block after changing the imbalance
+    new_dp = adj[0:dp_line, dp_line:]
+    # if not os.path.exists('../../data/partitioned_data/{0}/feature'.format(dataset)):
+    #     os.mkdir('../../data/partitioned_data/{0}/feature'.format(dataset))
+    # np.savetxt('../../data/partitioned_data/{0}/feature/{0}_new_admat_dgc.txt'.format(dataset), new_dp, fmt='%d', delimiter='\t')
+    return sp.csr_matrix(adj.astype(int))
diff --git a/p1_preprocessing_data/process_data.py b/p1_preprocessing_data/process_data.py
new file mode 100644
index 0000000..1138359
--- /dev/null
+++ b/p1_preprocessing_data/process_data.py
@@ -0,0 +1,88 @@
+import os
+import pickle
+
+import numpy as np
+
+from src import config
+import scipy.sparse as sp
+
+from load_data import load_yam_data, change_unbalanced, load_luo_data
+from utils import divide_vgae_datasets, sparse_to_tuple, divide_datasets
+
+for dataset in config.datasets:
+    g = os.walk(r"../../data/partitioned_data/{}".format(dataset))
+    for path, dir_list, file_list in g:
+        for file_name in file_list:
+            os.remove(os.path.join(path, file_name))
+    print("Cache cleared!")
+
+    # Load data as one adjacency matrix with bidirectional edges
+    if dataset == 'luo':
+        adj, dp_line = load_luo_data(dataset)
+    else:
+        adj, dp_line = load_yam_data(dataset)
+
+    if not os.path.exists("../../data/partitioned_data"):
+        os.mkdir("../../data/partitioned_data")
+    if not os.path.exists("../../data/partitioned_data/{}".format(dataset)):
+        os.mkdir("../../data/partitioned_data/{}".format(dataset))
+    if not os.path.exists("../../data/partitioned_data/{}/orig".format(dataset)):
+        os.mkdir("../../data/partitioned_data/{}/orig/".format(dataset))
+    np.savetxt("../../data/partitioned_data/{}/orig/dp_line.txt".format(dataset), np.array([dataset, str(dp_line)]), fmt='%s')
+
+    # Obtain data with the requested level of imbalance
+    adj = change_unbalanced(adj, config.percent, dp_line, dataset)
+
+    # Store the original adjacency matrix (without diagonal entries) for later use
+    adj_orig = adj
+    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)  # remove the diagonal, in case it has entries
+    adj_orig.eliminate_zeros()  # remove explicit zeros, if any
+    path = "../../data/partitioned_data/{}/orig/".format(dataset)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    pickle.dump(adj_orig, open(path + dataset + "_adj_orig.pkl", 'wb'))
+    np.savetxt(path + dataset + "_adj_orig.txt", adj_orig.A, fmt='%d')
+
+    # Partition the data for embedding learning and record the edges
+    for i in range(10):
+        # Remove diagonal elements
+        adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)  # same as above
+        adj.eliminate_zeros()
+        # Check that the diagonal is zero:
+        assert np.diag(adj.todense()).sum() == 0
+
+        # Partition data for GraphGAN
+        g_adj = adj[0:dp_line, dp_line:]
+        g_edges = sparse_to_tuple(g_adj)[0]
+        g_num_test = int(np.floor(g_edges.shape[0] / 10.))  # np.floor rounds down: 1/10 of the edges for testing, 1/20 for validation
+        g_num_val = int(np.floor(g_edges.shape[0] / 20.))
+
+        adj_pd, train_edges, test_edges, test_edges_false = divide_datasets(g_adj, g_edges, g_num_test, i, dp_line)
+        adj[0:dp_line, dp_line:] = adj_pd
+
+        # Hand the training split to the VGAE
+        edges = sparse_to_tuple(sp.triu(adj))[0]
+        edges_all = sparse_to_tuple(adj)[0]  # convert the adjacency matrix to triples and keep only the coordinates, i.e. all edges
+        num_test = int(np.floor(edges.shape[0] / 10.))  # 1/10 of the edges for testing, 1/20 for validation
+        num_val = int(np.floor(edges.shape[0] / 20.))
+
+        adj_train, vgae_train_edges, vgae_test_edges, vgae_test_edges_false = divide_vgae_datasets(adj, edges, edges_all, num_test, num_val,
+                                                                                                   i)  # val_edges, val_edges_false,
+
+        # Save the partitioned data
+        path = "../../data/partitioned_data/{}/{}fold/".format(dataset, i)
+        if not os.path.exists(path):
+            os.makedirs(path)
+
+        pickle.dump(adj_train, open(path + dataset + "_adj_train.pkl", 'wb'))
+
+        np.savetxt(path + dataset + "_vgae_train.txt", vgae_train_edges, fmt='%d')
+        np.savetxt(path + dataset + "_vgae_test.txt", vgae_test_edges, fmt='%d')
+        np.savetxt(path + dataset + "_vgae_test_neg.txt", vgae_test_edges_false, fmt='%d')
+
+        np.savetxt(path + dataset + "_train.txt", vgae_train_edges, fmt='%d')
+        np.savetxt(path + dataset + "_pd_train.txt", train_edges, fmt='%d')
+        np.savetxt(path + dataset + "_test.txt", test_edges, fmt='%d')
+        np.savetxt(path + dataset + "_test_neg.txt", test_edges_false, fmt='%d')
+
+    print("OK")
diff --git a/p1_preprocessing_data/utils.py b/p1_preprocessing_data/utils.py
new file mode 100644
index 0000000..a5dc3b4
--- /dev/null
+++ b/p1_preprocessing_data/utils.py
@@ -0,0 +1,130 @@
+import numpy as np
+import scipy.sparse as sp
+
+from src import config
+
+
+def sparse_to_tuple(sparse_mx):
+    if not sp.isspmatrix_coo(sparse_mx):
+        sparse_mx = sparse_mx.tocoo()
+    coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
+    values = sparse_mx.data
+    shape = sparse_mx.shape
+    return coords, values, shape
+
+
+def divide_vgae_datasets(adj, edges, edges_all, num_test, num_val, i):
+    # Build a test set containing 10% of the positive links.
+    # Note: the split is random, so results may deviate slightly from the numbers reported in the paper.
+
+    if i == 9:
+        start_test = num_test * i
+        end_test = edges.shape[0]
+        start_val = 0
+        end_val = num_val
+    else:
+        start_test = num_test * i
+        end_test = num_test * (i + 1)
+        start_val = end_test
+        end_val = end_test + num_val
+
+    all_edge_idx = list(range(edges.shape[0]))
+    np.random.seed(config.seed)
+    np.random.shuffle(edges)
+    # val_edge_idx = all_edge_idx[start_val:end_val]
+    test_edge_idx = all_edge_idx[start_test:end_test]
+    test_edges = edges[test_edge_idx]
+    # val_edges = edges[val_edge_idx]
+    train_edges = np.delete(edges, np.hstack([test_edge_idx]), axis=0)  # , val_edge_idx
+
+    def ismember(a: list, b, tol=5):
+        rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1)
+        return np.any(rows_close)
+
+    test_edges_false = []
+    while len(test_edges_false) < len(test_edges):
+        idx_i = np.random.randint(0, adj.shape[0])  # random row index
+        idx_j = np.random.randint(0, adj.shape[0])  # random column index
+        if idx_i == idx_j:  # skip diagonal entries
+            continue
+        if ismember([idx_i, idx_j], edges_all):  # skip known edges
+            continue
+        if test_edges_false:  # skip negatives already sampled, in either direction (a-b or b-a)
+            if ismember([idx_j, idx_i], np.array(test_edges_false)):
+                continue
+            if ismember([idx_i, idx_j], np.array(test_edges_false)):
+                continue
+        test_edges_false.append([idx_i, idx_j])
+
+    # val_edges_false = []
+    # while len(val_edges_false) < len(val_edges):
+    #     idx_i = np.random.randint(0, adj.shape[0])
+    #     idx_j = np.random.randint(0, adj.shape[0])
+    #     if idx_i == idx_j:  # skip diagonal entries
+    #         continue
+    #     if ismember([idx_i, idx_j], edges_all):  # skip known edges
+    #         continue
+    #     if val_edges_false:
+    #         if ismember([idx_j, idx_i], np.array(val_edges_false)):
+    #             continue
+    #         if ismember([idx_i, idx_j], np.array(val_edges_false)):
+    #             continue
+    #     val_edges_false.append([idx_i, idx_j])
+
+    assert ~ismember(test_edges_false, edges_all)
+    # assert ~ismember(val_edges_false, edges_all)
+    # assert ~ismember(val_edges, train_edges)
+    assert ~ismember(test_edges, train_edges)
+    # assert ~ismember(val_edges, test_edges)
+
+    # Re-build the adjacency matrix
+    adj_train = sp.csr_matrix((np.ones(train_edges.shape[0]), (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
+    adj_train = adj_train + adj_train.T  # train_edges only stores one direction, so symmetrize the matrix
+
+    # NOTE: these edge lists only contain a single direction of each edge!
+    return adj_train, train_edges, test_edges, np.array(test_edges_false)  # , val_edges, np.array(val_edges_false)
+
+
+def divide_datasets(adj, edges, num_test, i, dp_line):
+    if i == 9:
+        start_test = num_test * i
+        end_test = edges.shape[0]
+    else:
+        start_test = num_test * i
+        end_test = num_test * (i + 1)
+
+    all_edge_idx = list(range(edges.shape[0]))
+    np.random.seed(config.seed)
+    np.random.shuffle(edges)
+    test_edge_idx = all_edge_idx[start_test:end_test]
+    test_edges = edges[test_edge_idx]
+    train_edges = np.delete(edges, np.hstack([test_edge_idx]), axis=0)  # , val_edge_idx
+
+    def ismember(a: list, b, tol=5):
+        rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1)
+        return np.any(rows_close)
+
+    test_edges_false = []
+    while len(test_edges_false) < len(test_edges):
+        idx_i = np.random.randint(0, adj.shape[0])  # random row index (drug)
+        idx_j = np.random.randint(0, adj.shape[1])  # random column index (protein)
+        if idx_i == idx_j:  # skip i == j
+            continue
+        if ismember([idx_i, idx_j], edges):  # skip known edges
+            continue
+        test_edges_false.append([idx_i, idx_j])
+
+    adj_pd = sp.csr_matrix((np.ones(train_edges.shape[0]), (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
+
+    # Shift the column indices by dp_line (protein columns follow the drug rows)
+    def add_index(edges):
+        edges = np.array(edges)
+        colu = edges[:, 1] + dp_line
+        edges[:, 1] = colu
+        return edges
+
+    train_edges = add_index(train_edges)
+    test_edges = add_index(test_edges)
+    test_edges_false = add_index(test_edges_false)
+
+    return adj_pd, train_edges, test_edges, test_edges_false
diff --git a/p2_preprocessing_feature/__init__.py b/p2_preprocessing_feature/__init__.py
new file mode 100644
index 0000000..bfa83a0
--- /dev/null
+++ b/p2_preprocessing_feature/__init__.py
@@ -0,0 +1,2 @@
+from __future__ import print_function
+from __future__ import division
diff --git a/p2_preprocessing_feature/load_feature.py b/p2_preprocessing_feature/load_feature.py
new file mode 100644
index 0000000..17e98f5
--- /dev/null
+++ b/p2_preprocessing_feature/load_feature.py
@@ -0,0 +1,18 @@
+import numpy as np
+import scipy.sparse as sp
+
+
+def load_yam_feature(dataset):
+    dp = np.loadtxt('../../data/RawData/Yamanishi/{}_admat_dgc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(float).T
+    dd = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(float)
+    pp = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dg.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(float)
+    feature = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp))))
+    return sp.lil_matrix(feature)
+
+
+def load_luo_feature(dataset):
+    dp = np.loadtxt('../../data/RawData/luo/mat_drug_protein.txt', dtype=float)
+    dd = np.loadtxt('../../data/RawData/luo/Similarity_Matrix_Drugs.txt', dtype=float)
+    pp = np.loadtxt('../../data/RawData/luo/Similarity_Matrix_Proteins.txt', dtype=float) / 100
+    feature = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp))))
+    return sp.lil_matrix(feature)
diff --git a/p2_preprocessing_feature/process_feature.py b/p2_preprocessing_feature/process_feature.py
new file mode 100644
index 0000000..727e488
--- /dev/null
+++ b/p2_preprocessing_feature/process_feature.py
@@ -0,0 +1,20 @@
+import os
+import pickle
+
+from src import config
+from src.p2_preprocessing_feature.load_feature import load_yam_feature, load_luo_feature
+
+for dataset in config.datasets:
+    # feature: lil_matrix
+    if dataset == 'luo':
+        feature = load_luo_feature(dataset)
+    else:
+        feature = load_yam_feature(dataset)
+
+    # Save the features
+    path = "../../data/partitioned_data/{}/feature/".format(dataset)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    pickle.dump(feature, open(path + dataset + "_feature.pkl", 'wb'))
+
+    print("ok")