diff --git a/eval_scripts/ppi_eval.py b/eval_scripts/ppi_eval.py
index 88b7a9e..a72c05d 100644
--- a/eval_scripts/ppi_eval.py
+++ b/eval_scripts/ppi_eval.py
@@ -5,6 +5,13 @@ import numpy as np
 from networkx.readwrite import json_graph
 from argparse import ArgumentParser
 
+''' To evaluate the embeddings, we run a logistic regression.
+Run this script after running unsupervised training.
+Baseline of using features-only can be run by setting data_dir as 'feat'
+Example:
+  python eval_scripts/ppi_eval.py ../data/ppi unsup-ppi/n2v_big_0.000010 test
+'''
+
 def run_regression(train_embeds, train_labels, test_embeds, test_labels):
     np.random.seed(1)
     from sklearn.linear_model import SGDClassifier
@@ -15,8 +22,12 @@ def run_regression(train_embeds, train_labels, test_embeds, test_labels):
     dummy.fit(train_embeds, train_labels)
     log = MultiOutputClassifier(SGDClassifier(loss="log"), n_jobs=10)
     log.fit(train_embeds, train_labels)
-    print("F1 score", f1_score(test_labels, log.predict(test_embeds), average="micro"))
-    print("Random baseline F1 score", f1_score(test_labels, dummy.predict(test_embeds), average="micro"))
+
+    f1 = 0
+    for i in range(test_labels.shape[1]):
+        print("F1 score", f1_score(test_labels[:,i], log.predict(test_embeds)[:,i], average="micro"))
+    for i in range(test_labels.shape[1]):
+        print("Random baseline F1 score", f1_score(test_labels[:,i], dummy.predict(test_embeds)[:,i], average="micro"))
 
 if __name__ == '__main__':
     parser = ArgumentParser("Run evaluation on PPI data.")
@@ -30,12 +41,14 @@ if __name__ == '__main__':
 
     print("Loading data...")
     G = json_graph.node_link_graph(json.load(open(dataset_dir + "/ppi-G.json")))
-    labels = json.load(open("/dfs/scratch0/graphnet/ppi/ppi-class_map.json"))
+    labels = json.load(open(dataset_dir + "/ppi-class_map.json"))
     labels = {int(i):l for i, l in labels.iteritems()}
 
     train_ids = [n for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']]
     test_ids = [n for n in G.nodes() if G.node[n][setting]]
     train_labels = np.array([labels[i] for i in train_ids])
+    if train_labels.ndim == 1:
+        train_labels = np.expand_dims(train_labels, 1)
     test_labels = np.array([labels[i] for i in test_ids])
     print("running", data_dir)
 
@@ -45,7 +58,7 @@ if __name__ == '__main__':
         ## Logistic gets thrown off by big counts, so log transform num comments and score
         feats[:,0] = np.log(feats[:,0]+1.0)
         feats[:,1] = np.log(feats[:,1]-min(np.min(feats[:,1]), -1))
-        feat_id_map = json.load(open("/dfs/scratch0/graphnet/ppi/ppi-id_map.json"))
+        feat_id_map = json.load(open(dataset_dir + "/ppi-id_map.json"))
         feat_id_map = {int(id):val for id,val in feat_id_map.iteritems()}
         train_feats = feats[[feat_id_map[id] for id in train_ids]]
         test_feats = feats[[feat_id_map[id] for id in test_ids]]
diff --git a/graphsage/minibatch.py b/graphsage/minibatch.py
index 180648d..a480e15 100644
--- a/graphsage/minibatch.py
+++ b/graphsage/minibatch.py
@@ -125,6 +125,9 @@ class EdgeMinibatchIterator(object):
         batch_edges = self.train_edges[start : start + self.batch_size]
         return self.batch_feed_dict(batch_edges)
 
+    def num_training_batches(self):
+        return len(self.train_edges) // self.batch_size + 1
+
     def val_feed_dict(self, size=None):
         edge_list = self.val_edges
         if size is None:
@@ -287,6 +290,9 @@ class NodeMinibatchIterator(object):
         ret_val = self.batch_feed_dict(val_node_subset)
         return ret_val[0], ret_val[1], (iter_num+1)*size >= len(val_nodes), val_node_subset
 
+    def num_training_batches(self):
+        return len(self.train_nodes) // self.batch_size + 1
+
     def next_minibatch_feed_dict(self):
         start = self.batch_num * self.batch_size
         self.batch_num += 1
diff --git a/graphsage/utils.py b/graphsage/utils.py
index c15f568..400b95e 100644
--- a/graphsage/utils.py
+++ b/graphsage/utils.py
@@ -81,4 +81,4 @@ if __name__ == "__main__":
     G = G.subgraph(nodes)
     pairs = run_random_walks(G, nodes)
     with open(out_file, "w") as fp:
-        fp.write("\n".join([p[0] + "\t" + p[1] for p in pairs]))
+        fp.write("\n".join([str(p[0]) + "\t" + str(p[1]) for p in pairs]))
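
Standalone sketch of the per-label evaluation that the updated run_regression now prints: one micro-F1 per output column instead of a single score over all labels. The data, shapes, and variable names below are made up purely for illustration; the scikit-learn calls mirror the script's (SGDClassifier(loss="log") assumes the older scikit-learn API used by this repo, newer releases renamed the loss to "log_loss").

# Per-label micro-F1, as in the updated eval_scripts/ppi_eval.py, on random toy data.
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.multioutput import MultiOutputClassifier

rng = np.random.RandomState(1)
train_embeds, test_embeds = rng.randn(200, 16), rng.randn(50, 16)
train_labels = (rng.rand(200, 3) > 0.5).astype(int)  # 3 binary labels per node
test_labels = (rng.rand(50, 3) > 0.5).astype(int)

log = MultiOutputClassifier(SGDClassifier(loss="log"), n_jobs=1)
log.fit(train_embeds, train_labels)
pred = log.predict(test_embeds)
for i in range(test_labels.shape[1]):
    # one score per label column, matching the script's new print loop
    print("F1 score", f1_score(test_labels[:, i], pred[:, i], average="micro"))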