From c2c081e0dfd8f44ef76da4a11ff412cce6c32110 Mon Sep 17 00:00:00 2001
From: lab-pc
Date: Tue, 11 Oct 2022 13:16:39 +0800
Subject: [PATCH] --

---
 p1_preprocessing_data/__init__.py           |   0
 p1_preprocessing_data/load_data.py          |  94 ++++++++++++++
 p1_preprocessing_data/process_data.py       |  88 +++++++++++++
 p1_preprocessing_data/utils.py              | 130 ++++++++++++++++++++
 p2_preprocessing_feature/__init__.py        |   2 +
 p2_preprocessing_feature/load_feature.py    |  18 +++
 p2_preprocessing_feature/process_feature.py |  20 +++
 7 files changed, 352 insertions(+)
 create mode 100644 p1_preprocessing_data/__init__.py
 create mode 100644 p1_preprocessing_data/load_data.py
 create mode 100644 p1_preprocessing_data/process_data.py
 create mode 100644 p1_preprocessing_data/utils.py
 create mode 100644 p2_preprocessing_feature/__init__.py
 create mode 100644 p2_preprocessing_feature/load_feature.py
 create mode 100644 p2_preprocessing_feature/process_feature.py

diff --git a/p1_preprocessing_data/__init__.py b/p1_preprocessing_data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/p1_preprocessing_data/load_data.py b/p1_preprocessing_data/load_data.py
new file mode 100644
index 0000000..4ec58cc
--- /dev/null
+++ b/p1_preprocessing_data/load_data.py
@@ -0,0 +1,94 @@
+import os
+import random
+
+import numpy as np
+import scipy.sparse as sp
+
+from src import config
+
+
+def load_luo_data(dataset):
+    dp = np.loadtxt('../../data/RawData/luo/mat_drug_protein.txt', dtype=int)
+    dd = np.loadtxt('../../data/RawData/luo/mat_drug_drug.txt', dtype=int)
+    pp = np.loadtxt('../../data/RawData/luo/mat_protein_protein.txt', dtype=int)
+    adj = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp))))
+    return sp.csr_matrix(adj + sp.eye(adj.shape[0])), dd.shape[0]
+
+
+def load_yam_data(dataset):
+    dp = np.loadtxt('../../data/RawData/Yamanishi/{}_admat_dgc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(int).T
+    dd = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(float)
+    pp = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dg.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(float)
+    dd = np.where(dd < 0.5, 0, 1)
+    pp = np.where(pp < 0.5, 0, 1)
+    adj = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp))))
+    return sp.csr_matrix(adj), dd.shape[0]
+
+
+def is_symmetry(adj):
+    for i in range(adj.shape[0]):
+        for j in range(adj.shape[1]):
+            if adj[i][j] != adj[j][i]:
+                return False
+    return True
+
+
+def is_1_diag(adj):
+    if sum(np.diagonal(adj)) != adj.shape[0]:
+        return False
+    return True
+
+
+def change_unbalanced(adj, percent, dp_line, dataset):
+    """
+    Note: `percent` controls the fraction of known associations that are masked out.
+    :param adj: adjacency matrix
+    :param percent: fraction of known drug-protein associations to remove
+    :return: adjacency matrix with part of the known associations removed
+    """
+    # Optionally check that the matrix is symmetric
+    # assert is_symmetry(adj.A)
+    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) + sp.eye(adj.shape[0])
+    # Make sure the diagonal is all ones
+    assert is_1_diag(adj.A)
+    adj = (sp.triu(adj) + sp.triu(adj).T - sp.eye(adj.shape[0])).A
+
+    row = list(range(0, dp_line))
+    col = list(range(dp_line, adj.shape[0]))
+
+    idx = []
+    for i in row:
+        for j in col:
+            if i != j and adj[i][j] == 1:
+                idx.append((i, j))
+    num = int(np.floor(percent * len(idx)))
+    count = 0
+    # random.seed(config.seed)
+    while count < num:
+        row, col = random.choice(idx)
+        idx.remove((row, col))
+        adj[row][col] = 0
+        adj[col][row] = 0
+        count += 1
+
+    # Alternative: sample the removed edges from the whole matrix instead of only the drug-protein block
+    # idx = []
+    # for i in range(adj.shape[0]):
+    #     for j in range(i + 1, adj.shape[0]):
+    #         if adj[i][j] == 1:
+    #             idx.append((i, j))
+    # num = int(np.floor(percent * len(idx)))
+    # count = 0
+    # # random.seed(config.seed)
+    # while count < num:
+    #     row, col = random.choice(idx)
+    #     idx.remove((row, col))
+    #     adj[row][col] = 0
+    #     adj[col][row] = 0
+    #     count += 1
+
+    # New drug-protein block after changing the imbalance
+    new_dp = adj[0:dp_line, dp_line:]
+    # if not os.path.exists('../../data/partitioned_data/{0}/feature'.format(dataset)):
+    #     os.mkdir('../../data/partitioned_data/{0}/feature'.format(dataset))
+    # np.savetxt('../../data/partitioned_data/{0}/feature/{0}_new_admat_dgc.txt'.format(dataset), new_dp, fmt='%d', delimiter='\t')
+    return sp.csr_matrix(adj.astype(int))
diff --git a/p1_preprocessing_data/process_data.py b/p1_preprocessing_data/process_data.py
new file mode 100644
index 0000000..1138359
--- /dev/null
+++ b/p1_preprocessing_data/process_data.py
@@ -0,0 +1,88 @@
+import os
+import pickle
+
+import numpy as np
+
+from src import config
+import scipy.sparse as sp
+
+from load_data import load_yam_data, change_unbalanced, load_luo_data
+from utils import divide_vgae_datasets, sparse_to_tuple, divide_datasets
+
+for dataset in config.datasets:
+    g = os.walk(r"../../data/partitioned_data/{}".format(dataset))
+    for path, dir_list, file_list in g:
+        for file_name in file_list:
+            os.remove(os.path.join(path, file_name))
+    print("Cache cleared!")
+
+    # Load data as one adjacency matrix with bidirectional edges
+    if dataset == 'luo':
+        adj, dp_line = load_luo_data(dataset)
+    else:
+        adj, dp_line = load_yam_data(dataset)
+
+    if not os.path.exists("../../data/partitioned_data"):
+        os.mkdir("../../data/partitioned_data")
+    if not os.path.exists("../../data/partitioned_data/{}".format(dataset)):
+        os.mkdir("../../data/partitioned_data/{}".format(dataset))
+    if not os.path.exists("../../data/partitioned_data/{}/orig".format(dataset)):
+        os.mkdir("../../data/partitioned_data/{}/orig/".format(dataset))
+    np.savetxt("../../data/partitioned_data/{}/orig/dp_line.txt".format(dataset), np.array([dataset, str(dp_line)]), fmt='%s')
+
+    # Obtain data with the requested level of imbalance
+    adj = change_unbalanced(adj, config.percent, dp_line, dataset)
+
+    # Store the original adjacency matrix (without diagonal entries) for later use
+    adj_orig = adj
+    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)  # remove the diagonal, in case it has entries
+    adj_orig.eliminate_zeros()  # remove explicit zeros, if any
+    path = "../../data/partitioned_data/{}/orig/".format(dataset)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    pickle.dump(adj_orig, open(path + dataset + "_adj_orig.pkl", 'wb'))
+    np.savetxt(path + dataset + "_adj_orig.txt", adj_orig.A, fmt='%d')
+
+    # Partition the data for embedding learning and record the edges
+    for i in range(10):
+        # Remove diagonal elements
+        adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)  # same as above
+        adj.eliminate_zeros()
+        # Check that the diagonal is zero:
+        assert np.diag(adj.todense()).sum() == 0
+
+        # Partition data for GraphGAN
+        g_adj = adj[0:dp_line, dp_line:]
+        g_edges = sparse_to_tuple(g_adj)[0]
+        g_num_test = int(np.floor(g_edges.shape[0] / 10.))  # np.floor rounds down: 1/10 of the edges for testing, 1/20 for validation
+        g_num_val = int(np.floor(g_edges.shape[0] / 20.))
+
+        adj_pd, train_edges, test_edges, test_edges_false = divide_datasets(g_adj, g_edges, g_num_test, i, dp_line)
+        adj[0:dp_line, dp_line:] = adj_pd
+
+        # Hand the training split to the VGAE
+        edges = sparse_to_tuple(sp.triu(adj))[0]
+        edges_all = sparse_to_tuple(adj)[0]  # convert the adjacency matrix to triples and keep only the coordinates, i.e. all edges
+        num_test = int(np.floor(edges.shape[0] / 10.))  # 1/10 of the edges for testing, 1/20 for validation
+        num_val = int(np.floor(edges.shape[0] / 20.))
+
+        adj_train, vgae_train_edges, vgae_test_edges, vgae_test_edges_false = divide_vgae_datasets(adj, edges, edges_all, num_test, num_val,
+                                                                                                   i)  # val_edges, val_edges_false,
+
+        # Save the partitioned data
+        path = "../../data/partitioned_data/{}/{}fold/".format(dataset, i)
+        if not os.path.exists(path):
+            os.makedirs(path)
+
+        pickle.dump(adj_train, open(path + dataset + "_adj_train.pkl", 'wb'))
+
+        np.savetxt(path + dataset + "_vgae_train.txt", vgae_train_edges, fmt='%d')
+        np.savetxt(path + dataset + "_vgae_test.txt", vgae_test_edges, fmt='%d')
+        np.savetxt(path + dataset + "_vgae_test_neg.txt", vgae_test_edges_false, fmt='%d')
+
+        np.savetxt(path + dataset + "_train.txt", vgae_train_edges, fmt='%d')
+        np.savetxt(path + dataset + "_pd_train.txt", train_edges, fmt='%d')
+        np.savetxt(path + dataset + "_test.txt", test_edges, fmt='%d')
+        np.savetxt(path + dataset + "_test_neg.txt", test_edges_false, fmt='%d')
+
+    print("OK")
diff --git a/p1_preprocessing_data/utils.py b/p1_preprocessing_data/utils.py
new file mode 100644
index 0000000..a5dc3b4
--- /dev/null
+++ b/p1_preprocessing_data/utils.py
@@ -0,0 +1,130 @@
+import numpy as np
+import scipy.sparse as sp
+
+from src import config
+
+
+def sparse_to_tuple(sparse_mx):
+    if not sp.isspmatrix_coo(sparse_mx):
+        sparse_mx = sparse_mx.tocoo()
+    coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
+    values = sparse_mx.data
+    shape = sparse_mx.shape
+    return coords, values, shape
+
+
+def divide_vgae_datasets(adj, edges, edges_all, num_test, num_val, i):
+    # Build a test set containing 10% of the positive links.
+    # Note: the split is random, so results may deviate slightly from the numbers reported in the paper.
+
+    if i == 9:
+        start_test = num_test * i
+        end_test = edges.shape[0]
+        start_val = 0
+        end_val = num_val
+    else:
+        start_test = num_test * i
+        end_test = num_test * (i + 1)
+        start_val = end_test
+        end_val = end_test + num_val
+
+    all_edge_idx = list(range(edges.shape[0]))
+    np.random.seed(config.seed)
+    np.random.shuffle(edges)
+    # val_edge_idx = all_edge_idx[start_val:end_val]
+    test_edge_idx = all_edge_idx[start_test:end_test]
+    test_edges = edges[test_edge_idx]
+    # val_edges = edges[val_edge_idx]
+    train_edges = np.delete(edges, np.hstack([test_edge_idx]), axis=0)  # , val_edge_idx
+
+    def ismember(a: list, b, tol=5):
+        rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1)
+        return np.any(rows_close)
+
+    test_edges_false = []
+    while len(test_edges_false) < len(test_edges):
+        idx_i = np.random.randint(0, adj.shape[0])  # random row index
+        idx_j = np.random.randint(0, adj.shape[0])  # random column index
+        if idx_i == idx_j:  # skip diagonal entries
+            continue
+        if ismember([idx_i, idx_j], edges_all):  # skip known edges
+            continue
+        if test_edges_false:  # skip negatives already sampled, in either direction (a-b or b-a)
+            if ismember([idx_j, idx_i], np.array(test_edges_false)):
+                continue
+            if ismember([idx_i, idx_j], np.array(test_edges_false)):
+                continue
+        test_edges_false.append([idx_i, idx_j])
+
+    # val_edges_false = []
+    # while len(val_edges_false) < len(val_edges):
+    #     idx_i = np.random.randint(0, adj.shape[0])
+    #     idx_j = np.random.randint(0, adj.shape[0])
+    #     if idx_i == idx_j:  # skip diagonal entries
+    #         continue
+    #     if ismember([idx_i, idx_j], edges_all):  # skip known edges
+    #         continue
+    #     if val_edges_false:
+    #         if ismember([idx_j, idx_i], np.array(val_edges_false)):
+    #             continue
+    #         if ismember([idx_i, idx_j], np.array(val_edges_false)):
+    #             continue
+    #     val_edges_false.append([idx_i, idx_j])
+
+    assert ~ismember(test_edges_false, edges_all)
+    # assert ~ismember(val_edges_false, edges_all)
+    # assert ~ismember(val_edges, train_edges)
+    assert ~ismember(test_edges, train_edges)
+    # assert ~ismember(val_edges, test_edges)
+
+    # Re-build the adjacency matrix
+    adj_train = sp.csr_matrix((np.ones(train_edges.shape[0]), (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
+    adj_train = adj_train + adj_train.T  # train_edges only stores one direction, so symmetrize the matrix
+
+    # NOTE: these edge lists only contain a single direction of each edge!
+    return adj_train, train_edges, test_edges, np.array(test_edges_false)  # , val_edges, np.array(val_edges_false)
+
+
+def divide_datasets(adj, edges, num_test, i, dp_line):
+    if i == 9:
+        start_test = num_test * i
+        end_test = edges.shape[0]
+    else:
+        start_test = num_test * i
+        end_test = num_test * (i + 1)
+
+    all_edge_idx = list(range(edges.shape[0]))
+    np.random.seed(config.seed)
+    np.random.shuffle(edges)
+    test_edge_idx = all_edge_idx[start_test:end_test]
+    test_edges = edges[test_edge_idx]
+    train_edges = np.delete(edges, np.hstack([test_edge_idx]), axis=0)  # , val_edge_idx
+
+    def ismember(a: list, b, tol=5):
+        rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1)
+        return np.any(rows_close)
+
+    test_edges_false = []
+    while len(test_edges_false) < len(test_edges):
+        idx_i = np.random.randint(0, adj.shape[0])  # random row index (drug)
+        idx_j = np.random.randint(0, adj.shape[1])  # random column index (protein)
+        if idx_i == idx_j:  # skip i == j
+            continue
+        if ismember([idx_i, idx_j], edges):  # skip known edges
+            continue
+        test_edges_false.append([idx_i, idx_j])
+
+    adj_pd = sp.csr_matrix((np.ones(train_edges.shape[0]), (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
+
+    # Shift the column indices by dp_line (protein columns follow the drug rows)
+    def add_index(edges):
+        edges = np.array(edges)
+        colu = edges[:, 1] + dp_line
+        edges[:, 1] = colu
+        return edges
+
+    train_edges = add_index(train_edges)
+    test_edges = add_index(test_edges)
+    test_edges_false = add_index(test_edges_false)
+
+    return adj_pd, train_edges, test_edges, test_edges_false
diff --git a/p2_preprocessing_feature/__init__.py b/p2_preprocessing_feature/__init__.py
new file mode 100644
index 0000000..bfa83a0
--- /dev/null
+++ b/p2_preprocessing_feature/__init__.py
@@ -0,0 +1,2 @@
+from __future__ import print_function
+from __future__ import division
diff --git a/p2_preprocessing_feature/load_feature.py b/p2_preprocessing_feature/load_feature.py
new file mode 100644
index 0000000..17e98f5
--- /dev/null
+++ b/p2_preprocessing_feature/load_feature.py
@@ -0,0 +1,18 @@
+import numpy as np
+import scipy.sparse as sp
+
+
+def load_yam_feature(dataset):
+    dp = np.loadtxt('../../data/RawData/Yamanishi/{}_admat_dgc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(float).T
+    dd = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dc.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(float)
+    pp = np.loadtxt('../../data/RawData/Yamanishi/{}_simmat_dg.txt'.format(dataset), dtype=str, delimiter='\t')[1:, 1:].astype(float)
+    feature = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp))))
+    return sp.lil_matrix(feature)
+
+
+def load_luo_feature(dataset):
+    dp = np.loadtxt('../../data/RawData/luo/mat_drug_protein.txt', dtype=float)
+    dd = np.loadtxt('../../data/RawData/luo/Similarity_Matrix_Drugs.txt', dtype=float)
+    pp = np.loadtxt('../../data/RawData/luo/Similarity_Matrix_Proteins.txt', dtype=float) / 100
+    feature = np.vstack((np.hstack((dd, dp)), np.hstack((dp.T, pp))))
+    return sp.lil_matrix(feature)
diff --git a/p2_preprocessing_feature/process_feature.py b/p2_preprocessing_feature/process_feature.py
new file mode 100644
index 0000000..727e488
--- /dev/null
+++ b/p2_preprocessing_feature/process_feature.py
@@ -0,0 +1,20 @@
+import os
+import pickle
+
+from src import config
+from src.p2_preprocessing_feature.load_feature import load_yam_feature, load_luo_feature
+
+for dataset in config.datasets:
+    # feature: lil_matrix
+    if dataset == 'luo':
+        feature = load_luo_feature(dataset)
+    else:
+        feature = load_yam_feature(dataset)
+
+    # Save the features
+    path = "../../data/partitioned_data/{}/feature/".format(dataset)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    pickle.dump(feature, open(path + dataset + "_feature.pkl", 'wb'))
+
+    print("ok")