You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
123 lines
4.9 KiB
123 lines
4.9 KiB
2 years ago
|
import numpy as np
|
||
|
import pickle as pkl
|
||
|
import networkx as nx
|
||
|
import scipy.sparse as sp
|
||
|
import sys
|
||
|
|
||
|
|
||
|
def parse_index_file(filename):
    """Read a text file holding one integer per line.

    Args:
        filename: Path of the index file to read.

    Returns:
        A list of ints, in file order.

    Raises:
        ValueError: If a non-blank line cannot be parsed as an integer.
    """
    # Use a context manager so the handle is closed even when int() raises.
    with open(filename) as handle:
        # Whitespace-only lines (e.g. a trailing blank line) carry no index,
        # so they are skipped instead of crashing on int('').
        return [int(line.strip()) for line in handle if line.strip()]
|
||
|
|
||
|
def sample_mask(idx, l):
    """Create a boolean mask of length ``l`` that is True at ``idx``.

    Args:
        idx: Index or collection of indices (list, range, integer array)
            to switch on.
        l: Length of the mask.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``bool`` with ``l`` entries.
    """
    # The ``np.bool`` alias was removed from NumPy (1.24); the builtin
    # ``bool`` is the equivalent dtype and works on every NumPy version.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask
|
||
|
|
||
|
def load_data(dataset):
    """Load the pickled dataset files ``data/ind.<dataset>.<name>``.

    The following objects are read (one pickle file each):

    * ``x``     - feature vectors of the training instances
                  (e.g. a ``scipy.sparse.csr.csr_matrix``)
    * ``tx``    - feature vectors of the test instances
    * ``allx``  - feature vectors of the labeled and unlabeled training
                  instances (a superset of ``x``)
    * ``y``     - one-hot labels of the training instances (``numpy.ndarray``)
    * ``ty``    - one-hot labels of the test instances
    * ``ally``  - one-hot labels of the instances in ``allx``
    * ``graph`` - ``collections.defaultdict`` of the form
                  ``{index: [index_of_neighbor_nodes]}``

    ``data/ind.<dataset>.test.index`` lists the ids of the test instances.

    Args:
        dataset: Dataset name used to build the file paths; the value
            ``'citeseer'`` additionally triggers the isolated-node fix.

    Returns:
        A tuple ``(adj, features, y_test, tx, ty, test_mask, labels_idx)``:
        the adjacency matrix built by networkx from ``graph``; the stacked
        feature matrix in LIL format; an array shaped like the label matrix
        holding the labels on the test rows and zeros elsewhere; the test
        features and test labels (zero-padded for citeseer); the boolean
        test-row mask; and ``argmax`` of the label matrix along axis 1.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    for name in names:
        with open("data/ind.{}.{}".format(dataset, name), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph).
        # Find isolated nodes, add them as zero-vecs into the right position.
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    # Move each test row from its sorted position to the position given by
    # the index file.
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    # Only the test split is returned, so the train/validation masks and
    # label matrices the function used to build (and discard) are omitted.
    idx_test = test_idx_range.tolist()
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_test = np.zeros(labels.shape)
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_test, tx, ty, test_mask, np.argmax(labels, 1)
|
||
|
|
||
|
|
||
|
def load_alldata(dataset_str):
    """Load the dataset ``data/ind.<dataset_str>.*`` with all three splits.

    Reads the pickled objects ``x``, ``y``, ``tx``, ``ty``, ``allx``,
    ``ally`` and ``graph`` plus the test index file
    ``data/ind.<dataset_str>.test.index``.

    Args:
        dataset_str: Dataset name used to build the file paths; the value
            ``'citeseer'`` additionally triggers the isolated-node fix.

    Returns:
        A tuple ``(adj, features, y_train, y_val, y_test, train_mask,
        val_mask, test_mask, labels_idx)``: the adjacency matrix built by
        networkx from ``graph``; the stacked feature matrix in LIL format;
        three arrays shaped like the label matrix that hold the labels on
        the rows of their split and zeros elsewhere; the three boolean row
        masks; and ``argmax`` of the label matrix along axis 1.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    for name in names:
        # Pickle data must be read from a binary-mode file on Python 3, and
        # the handle is closed as soon as the object has been loaded.
        with open("data/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            if sys.version_info > (3, 0):
                # 'latin1' matches the decoding used by load_data in this file.
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph).
        # Find isolated nodes, add them as zero-vecs into the right position.
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    # Move each test row from its sorted position to the position given by
    # the index file.
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    # Validation rows are the 500 rows that follow the training rows.
    idx_val = range(len(y), len(y) + 500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    # Per-split label matrices: labels on the split's rows, zeros elsewhere.
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, np.argmax(labels, 1)
|