diff --git a/eval_scripts/citation_eval.py b/eval_scripts/citation_eval.py index 8c5093e..3707a53 100644 --- a/eval_scripts/citation_eval.py +++ b/eval_scripts/citation_eval.py @@ -30,9 +30,9 @@ def run_regression(train_embeds, train_labels, test_embeds, test_labels): if __name__ == '__main__': parser = ArgumentParser("Run evaluation on citation data.") - parser.add_argument("dataset_dir", "Path to directory containing the dataset.") - parser.add_argument("data_dir", "Path to directory containing the learned node embeddings.") - parser.add_argument("setting", "Either val or test.") + parser.add_argument("dataset_dir", help="Path to directory containing the dataset.") + parser.add_argument("data_dir", help="Path to directory containing the learned node embeddings.") + parser.add_argument("setting", help="Either val or test.") args = parser.parse_args() dataset_dir = args.dataset_dir data_dir = args.data_dir diff --git a/eval_scripts/ppi_eval.py b/eval_scripts/ppi_eval.py index 2d8e8a1..88b7a9e 100644 --- a/eval_scripts/ppi_eval.py +++ b/eval_scripts/ppi_eval.py @@ -11,7 +11,7 @@ def run_regression(train_embeds, train_labels, test_embeds, test_labels): from sklearn.dummy import DummyClassifier from sklearn.metrics import f1_score from sklearn.multioutput import MultiOutputClassifier - dummy = MultiOutputClassifier(DummyClassifier(strategy='uniform')) + dummy = MultiOutputClassifier(DummyClassifier()) dummy.fit(train_embeds, train_labels) log = MultiOutputClassifier(SGDClassifier(loss="log"), n_jobs=10) log.fit(train_embeds, train_labels) @@ -20,9 +20,9 @@ def run_regression(train_embeds, train_labels, test_embeds, test_labels): if __name__ == '__main__': parser = ArgumentParser("Run evaluation on PPI data.") - parser.add_argument("dataset_dir", "Path to directory containing the dataset.") - parser.add_argument("data_dir", "Path to directory containing the learned node embeddings. 
Set to 'feat' for raw features.") - parser.add_argument("setting", "Either val or test.") + parser.add_argument("dataset_dir", help="Path to directory containing the dataset.") + parser.add_argument("data_dir", help="Path to directory containing the learned node embeddings. Set to 'feat' for raw features.") + parser.add_argument("setting", help="Either val or test.") args = parser.parse_args() dataset_dir = args.dataset_dir data_dir = args.data_dir @@ -41,8 +41,8 @@ if __name__ == '__main__': if data_dir == "feat": print("Using only features..") - feats = np.load(data_dir + "/ppi-feats.npy") - ## Logistic gets through off by big counts, so log transform num comments and score + feats = np.load(dataset_dir + "/ppi-feats.npy") + ## Logistic gets thrown off by big counts, so log transform num comments and score feats[:,0] = np.log(feats[:,0]+1.0) feats[:,1] = np.log(feats[:,1]-min(np.min(feats[:,1]), -1)) feat_id_map = json.load(open("/dfs/scratch0/graphnet/ppi/ppi-id_map.json")) diff --git a/eval_scripts/reddit_eval.py b/eval_scripts/reddit_eval.py index 09dd331..a0f68c6 100644 --- a/eval_scripts/reddit_eval.py +++ b/eval_scripts/reddit_eval.py @@ -12,23 +12,20 @@ def run_regression(train_embeds, train_labels, test_embeds, test_labels): from sklearn.metrics import f1_score dummy = DummyClassifier() dummy.fit(train_embeds, train_labels) - log = SGDClassifier(loss="log", n_jobs=55, n_iter=50) + log = SGDClassifier(loss="log", n_jobs=55) log.fit(train_embeds, train_labels) print("Test scores") print(f1_score(test_labels, log.predict(test_embeds), average="micro")) - print(f1_score(test_labels, log.predict(test_embeds), average="macro")) print("Train scores") print(f1_score(train_labels, log.predict(train_embeds), average="micro")) - print(f1_score(train_labels, log.predict(train_embeds), average="macro")) print("Random baseline") print(f1_score(test_labels, dummy.predict(test_embeds), average="micro")) - print(f1_score(test_labels, dummy.predict(test_embeds), 
average="macro")) if __name__ == '__main__': parser = ArgumentParser("Run evaluation on Reddit data.") - parser.add_argument("dataset_dir", "Path to directory containing the dataset.") - parser.add_argument("data_dir", "Path to directory containing the learned node embeddings. Set to 'feat' for raw features.") - parser.add_argument("setting", "Either val or test.") + parser.add_argument("dataset_dir", help="Path to directory containing the dataset.") + parser.add_argument("data_dir", help="Path to directory containing the learned node embeddings. Set to 'feat' for raw features.") + parser.add_argument("setting", help="Either val or test.") args = parser.parse_args() dataset_dir = args.dataset_dir data_dir = args.data_dir @@ -37,8 +34,6 @@ if __name__ == '__main__': print("Loading data...") G = json_graph.node_link_graph(json.load(open(dataset_dir + "/reddit-G.json"))) labels = json.load(open(dataset_dir + "/reddit-class_map.json")) - data_dir = sys.argv[1] - setting = sys.argv[2] train_ids = [n for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']] test_ids = [n for n in G.nodes() if G.node[n][setting]] @@ -48,7 +43,7 @@ if __name__ == '__main__': if data_dir == "feat": print("Using only features..") feats = np.load(dataset_dir + "/reddit-feats.npy") - ## Logistic gets through off by big counts, so log transform num comments and score + ## Logistic gets thrown off by big counts, so log transform num comments and score feats[:,0] = np.log(feats[:,0]+1.0) feats[:,1] = np.log(feats[:,1]-min(np.min(feats[:,1]), -1)) feat_id_map = json.load(open(dataset_dir + "reddit-id_map.json")) diff --git a/graphsage/utils.py b/graphsage/utils.py index 58f7500..c15f568 100644 --- a/graphsage/utils.py +++ b/graphsage/utils.py @@ -1,6 +1,9 @@ +from __future__ import print_function + import numpy as np import random import json +import sys from networkx.readwrite import json_graph @@ -52,7 +55,6 @@ def load_data(prefix, normalize=True): return G, feats, id_map, walks, 
class_map def run_random_walks(G, nodes, num_walks=N_WALKS): - print("Subgraph for walks is of size", len(G)) pairs = [] for count, node in enumerate(nodes): if G.degree(node) == 0: @@ -66,5 +68,17 @@ def run_random_walks(G, nodes, num_walks=N_WALKS): pairs.append((node,curr_node)) curr_node = next_node if count % 1000 == 0: - print(count) + print("Done walks for", count, "nodes") return pairs + +if __name__ == "__main__": + """ Run random walks over the nodes not flagged val/test and write one tab-separated pair per line. Usage: <graph_file> <out_file> """ + graph_file = sys.argv[1] + out_file = sys.argv[2] + G_data = json.load(open(graph_file)) + G = json_graph.node_link_graph(G_data) + nodes = [n for n in G.nodes() if not G.node[n]["val"] and not G.node[n]["test"]] + G = G.subgraph(nodes) + pairs = run_random_walks(G, nodes) + with open(out_file, "w") as fp: + fp.write("\n".join(["%s\t%s" % (p[0], p[1]) for p in pairs]))