You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
122 lines
4.9 KiB
122 lines
4.9 KiB
import numpy as np |
|
import pickle as pkl |
|
import networkx as nx |
|
import scipy.sparse as sp |
|
import sys |
|
|
|
|
|
def parse_index_file(filename): |
|
index = [] |
|
for line in open(filename): |
|
index.append(int(line.strip())) |
|
return index |
|
|
|
def sample_mask(idx, l): |
|
"""Create mask.""" |
|
mask = np.zeros(l) |
|
mask[idx] = 1 |
|
return np.array(mask, dtype=np.bool) |
|
|
|
def load_data(dataset): |
|
# load the data: x, tx, allx, graph |
|
# x => 训练实例的特征向量,如scipy.sparse.csr.csr_matrix类的实例 |
|
# tx => 测试实例的特征向量,如scipy.sparse.csr.csr_matrix类的实例 |
|
# allx => 有标签的+无无标签训练实例的特征向量,是ind.dataset_str.x的超集 |
|
# y => 训练实例的标签,独热编码,numpy.ndarray类的实例 |
|
# ty => 测试实例的标签,独热编码,numpy.ndarray类的实例 |
|
# ally => 有标签的+无无标签训练实例的标签,独热编码,numpy.ndarray类的实例 |
|
# graph => 图数据,collections.defaultdict类的实例,格式为 {index:[index_of_neighbor_nodes]} |
|
# index => 测试实例的id |
|
names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] |
|
objects = [] |
|
for i in range(len(names)): |
|
with open("data/ind.{}.{}".format(dataset, names[i]), 'rb') as f: |
|
if sys.version_info > (3, 0): |
|
objects.append(pkl.load(f, encoding='latin1')) |
|
else: |
|
objects.append(pkl.load(f)) |
|
x, y, tx, ty, allx, ally, graph = tuple(objects) |
|
test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset)) |
|
test_idx_range = np.sort(test_idx_reorder) |
|
|
|
if dataset == 'citeseer': |
|
# Fix citeseer dataset (there are some isolated nodes in the graph) |
|
# Find isolated nodes, add them as zero-vecs into the right position、 |
|
test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) |
|
tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) |
|
tx_extended[test_idx_range-min(test_idx_range), :] = tx |
|
tx = tx_extended |
|
ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) |
|
ty_extended[test_idx_range - min(test_idx_range), :] = ty |
|
ty = ty_extended |
|
|
|
features = sp.vstack((allx, tx)).tolil() |
|
features[test_idx_reorder, :] = features[test_idx_range, :] |
|
adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) |
|
|
|
labels = np.vstack((ally, ty)) |
|
labels[test_idx_reorder, :] = labels[test_idx_range, :] |
|
|
|
idx_test = test_idx_range.tolist() |
|
idx_train = range(len(y)) |
|
idx_val = range(len(y), len(y) + 500) |
|
|
|
train_mask = sample_mask(idx_train, labels.shape[0]) |
|
val_mask = sample_mask(idx_val, labels.shape[0]) |
|
test_mask = sample_mask(idx_test, labels.shape[0]) |
|
|
|
y_train = np.zeros(labels.shape) |
|
y_val = np.zeros(labels.shape) |
|
y_test = np.zeros(labels.shape) |
|
y_train[train_mask, :] = labels[train_mask, :] |
|
y_val[val_mask, :] = labels[val_mask, :] |
|
y_test[test_mask, :] = labels[test_mask, :] |
|
|
|
return adj, features, y_test, tx, ty, test_mask, np.argmax(labels,1) |
|
|
|
|
|
def load_alldata(dataset_str): |
|
"""Load data.""" |
|
names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] |
|
objects = [] |
|
for i in range(len(names)): |
|
objects.append(pkl.load(open("data/ind.{}.{}".format(dataset_str, names[i])))) |
|
|
|
x, y, tx, ty, allx, ally, graph = tuple(objects) |
|
test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str)) |
|
test_idx_range = np.sort(test_idx_reorder) |
|
|
|
if dataset_str == 'citeseer': |
|
# Fix citeseer dataset (there are some isolated nodes in the graph) |
|
# Find isolated nodes, add them as zero-vecs into the right position |
|
test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) |
|
tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) |
|
tx_extended[test_idx_range-min(test_idx_range), :] = tx |
|
tx = tx_extended |
|
ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) |
|
ty_extended[test_idx_range-min(test_idx_range), :] = ty |
|
ty = ty_extended |
|
|
|
features = sp.vstack((allx, tx)).tolil() |
|
features[test_idx_reorder, :] = features[test_idx_range, :] |
|
adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) |
|
|
|
labels = np.vstack((ally, ty)) |
|
labels[test_idx_reorder, :] = labels[test_idx_range, :] |
|
|
|
idx_test = test_idx_range.tolist() |
|
idx_train = range(len(y)) |
|
idx_val = range(len(y), len(y)+500) |
|
|
|
train_mask = sample_mask(idx_train, labels.shape[0]) |
|
val_mask = sample_mask(idx_val, labels.shape[0]) |
|
test_mask = sample_mask(idx_test, labels.shape[0]) |
|
|
|
y_train = np.zeros(labels.shape) |
|
y_val = np.zeros(labels.shape) |
|
y_test = np.zeros(labels.shape) |
|
y_train[train_mask, :] = labels[train_mask, :] |
|
y_val[val_mask, :] = labels[val_mask, :] |
|
y_test[test_mask, :] = labels[test_mask, :] |
|
|
|
return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, np.argmax(labels, 1)
|
|
|