Added Cora.

williamleif 2017-12-19 14:20:30 -08:00
parent 65cb65c9c0
commit 10e1084116


@@ -31,6 +31,75 @@ class SupervisedGraphSage(nn.Module):
        scores = self.forward(nodes)
        return self.xent(scores, labels.squeeze())
def load_cora():
    # Hard-coded sizes for the Cora citation network: 2708 papers with
    # 1433-dim binary bag-of-words features.
    num_nodes = 2708
    num_feats = 1433
    feat_data = np.zeros((num_nodes, num_feats))
    labels = np.empty((num_nodes, 1), dtype=np.int64)
    node_map = {}   # paper id -> row index
    label_map = {}  # class name -> integer label
    with open("cora/cora.content") as fp:
        for i, line in enumerate(fp):
            info = line.strip().split()
            # list(...) is needed under Python 3, where map() returns an iterator
            feat_data[i, :] = list(map(float, info[1:-1]))
            node_map[info[0]] = i
            if info[-1] not in label_map:
                label_map[info[-1]] = len(label_map)
            labels[i] = label_map[info[-1]]

    # Build an undirected adjacency structure from the citation edges.
    adj_lists = defaultdict(set)
    with open("cora/cora.cites") as fp:
        for line in fp:
            info = line.strip().split()
            paper1 = node_map[info[0]]
            paper2 = node_map[info[1]]
            adj_lists[paper1].add(paper2)
            adj_lists[paper2].add(paper1)
    return feat_data, labels, adj_lists
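
# For reference, load_cora() expects the standard Cora citation-network
# distribution on disk; the sample lines below are illustrative, not taken
# from this commit:
#
#   cora/cora.content -- one paper per line:
#       <paper_id> <1433 binary word indicators> <class_label>
#       e.g.  31336 0 0 ... 0 0 Neural_Networks
#   cora/cora.cites -- one directed citation per line:
#       <cited_paper_id> <citing_paper_id>
#       e.g.  35 1033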
def run_cora():
    np.random.seed(1)
    random.seed(1)
    num_nodes = 2708
    feat_data, labels, adj_lists = load_cora()
    # Frozen embedding table holding the raw node features.
    features = nn.Embedding(2708, 1433)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)
    # features.cuda()

    # Two-layer GraphSAGE: layer 1 aggregates raw features into 128-dim
    # embeddings; layer 2 aggregates those embeddings again.
    agg1 = MeanAggregator(features, cuda=True)
    enc1 = Encoder(features, 1433, 128, adj_lists, agg1, gcn=True, cuda=False)
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=False)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(), enc1.embed_dim, 128, adj_lists, agg2,
                   base_model=enc1, gcn=True, cuda=False)
    # Sample 5 neighbors per node at each layer.
    enc1.num_samples = 5
    enc2.num_samples = 5

    graphsage = SupervisedGraphSage(7, enc2)  # Cora has 7 classes
    # graphsage.cuda()

    # 1000 test / 500 validation nodes; the rest are used for training.
    rand_indices = np.random.permutation(num_nodes)
    test = rand_indices[:1000]
    val = rand_indices[1000:1500]
    train = list(rand_indices[1500:])

    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, graphsage.parameters()), lr=0.7)
    times = []
    for batch in range(100):
        # Take the first 256 nodes, then reshuffle so the next batch differs.
        batch_nodes = train[:256]
        random.shuffle(train)
        start_time = time.time()
        optimizer.zero_grad()
        loss = graphsage.loss(batch_nodes,
                              Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        times.append(end_time - start_time)
        print(batch, loss.item())

    val_output = graphsage.forward(val)
    print("Validation F1:", f1_score(labels[val], val_output.data.numpy().argmax(axis=1), average="micro"))
    print("Average batch time:", np.mean(times))
def load_pubmed():
    #hardcoded for simplicity...
    num_nodes = 19717
@@ -60,11 +129,10 @@ def load_pubmed():
            adj_lists[paper2].add(paper1)
    return feat_data, labels, adj_lists
-if __name__ == "__main__":
+def run_pubmed():
    np.random.seed(1)
    random.seed(1)
    num_nodes = 19717
-    num_feats = 500
    feat_data, labels, adj_lists = load_pubmed()
    features = nn.Embedding(19717, 500)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)
@@ -103,3 +171,6 @@ if __name__ == "__main__":
    val_output = graphsage.forward(val)
    print("Validation F1:", f1_score(labels[val], val_output.data.numpy().argmax(axis=1), average="micro"))
    print("Average batch time:", np.mean(times))
if __name__ == "__main__":
    run_cora()
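
# run_pubmed() is defined above but no longer invoked from __main__; to
# reproduce the PubMed run one would presumably call it instead (an
# assumption, not part of this commit):
#
#   if __name__ == "__main__":
#       run_pubmed()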