# Data-partitioning script: for every dataset listed in config.datasets it
# writes the original adjacency matrix and ten train/test edge splits under
# ../../data/partitioned_data/.
import os
import pickle

import numpy as np
import scipy.sparse as sp

from src import config
from load_data import load_yam_data, change_unbalanced, load_luo_data
from utils import divide_vgae_datasets, sparse_to_tuple, divide_datasets
|
|
|
# For every configured dataset: clear previously generated files, load the
# adjacency matrix, store the diagonal-free original matrix, then write ten
# train/test edge splits (one "<i>fold" directory per split).
for dataset in config.datasets:
    # Clear cached output of a previous run: delete every file below this
    # dataset's output tree (the directories themselves are left in place).
    dataset_dir = r"../../data/partitioned_data/{}".format(dataset)
    for root, _dir_names, file_names in os.walk(dataset_dir):
        for file_name in file_names:
            os.remove(os.path.join(root, file_name))
    print("清除缓存完成!")

    # Load data: yields an adjacency matrix with bidirectional edges, plus
    # dp_line (used below as the row/column index that splits the matrix).
    if dataset == 'luo':
        adj, dp_line = load_luo_data(dataset)
    else:
        adj, dp_line = load_yam_data(dataset)

    # makedirs creates any missing parent directory and tolerates an
    # already-existing one, replacing the exists()/mkdir() chain.
    orig_dir = "../../data/partitioned_data/{}/orig/".format(dataset)
    os.makedirs(orig_dir, exist_ok=True)
    np.savetxt(orig_dir + "dp_line.txt", np.array([dataset, str(dp_line)]), fmt='%s')

    # Obtain data with the configured degree of imbalance.
    adj = change_unbalanced(adj, config.percent, dp_line, dataset)

    # Store the original adjacency matrix (without diagonal entries) for later.
    adj_orig = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj_orig.eliminate_zeros()  # drop explicitly stored zeros
    with open(orig_dir + dataset + "_adj_orig.pkl", 'wb') as pkl_file:
        pickle.dump(adj_orig, pkl_file)
    # toarray() is the supported spelling of the removed `.A` shorthand.
    np.savetxt(orig_dir + dataset + "_adj_orig.txt", adj_orig.toarray(), fmt='%d')

    # Partition the data for embedding training and record the edges.
    # NOTE(review): adj is modified in place inside the loop and is not reset
    # between folds, so fold i+1 starts from fold i's matrix - confirm intended.
    for i in range(10):
        # Remove diagonal elements (same removal as above, repeated per fold).
        adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
        adj.eliminate_zeros()
        # Sanity check that the diagonal is zero (no need to densify the matrix).
        assert adj.diagonal().sum() == 0

        # Split the data for GraphGAN: only the block of rows before dp_line
        # and columns from dp_line onwards is used.
        g_adj = adj[0:dp_line, dp_line:]
        g_edges = sparse_to_tuple(g_adj)[0]
        # Test set is one tenth of the edges (rounded down).
        g_num_test = int(np.floor(g_edges.shape[0] / 10.))

        adj_pd, train_edges, test_edges, test_edges_false = divide_datasets(
            g_adj, g_edges, g_num_test, i, dp_line)
        adj[0:dp_line, dp_line:] = adj_pd

        # Hand the training set to VGAE.
        edges = sparse_to_tuple(sp.triu(adj))[0]
        # Convert the adjacency matrix to tuple form and keep only the
        # coordinates, i.e. all edges.
        edges_all = sparse_to_tuple(adj)[0]
        # Test set is one tenth, validation set one twentieth (rounded down).
        num_test = int(np.floor(edges.shape[0] / 10.))
        num_val = int(np.floor(edges.shape[0] / 20.))

        adj_train, vgae_train_edges, vgae_test_edges, vgae_test_edges_false = divide_vgae_datasets(
            adj, edges, edges_all, num_test, num_val, i)

        # Save the partitioned data for this fold.
        fold_dir = "../../data/partitioned_data/{}/{}fold/".format(dataset, i)
        os.makedirs(fold_dir, exist_ok=True)

        with open(fold_dir + dataset + "_adj_train.pkl", 'wb') as pkl_file:
            pickle.dump(adj_train, pkl_file)

        np.savetxt(fold_dir + dataset + "_vgae_train.txt", vgae_train_edges, fmt='%d')
        np.savetxt(fold_dir + dataset + "_vgae_test.txt", vgae_test_edges, fmt='%d')
        np.savetxt(fold_dir + dataset + "_vgae_test_neg.txt", vgae_test_edges_false, fmt='%d')

        # "_train.txt" intentionally mirrors the VGAE training edges, as in
        # the original script; "_pd_train.txt" holds the divide_datasets split.
        np.savetxt(fold_dir + dataset + "_train.txt", vgae_train_edges, fmt='%d')
        np.savetxt(fold_dir + dataset + "_pd_train.txt", train_edges, fmt='%d')
        np.savetxt(fold_dir + dataset + "_test.txt", test_edges, fmt='%d')
        np.savetxt(fold_dir + dataset + "_test_neg.txt", test_edges_false, fmt='%d')

print("OK")