First commit.
commit 65cb65c9c0
.gitignore (vendored, new file, 110 lines)
@@ -0,0 +1,110 @@
# Custom
*.csv
*.idea
*.png
*.pdf
tmp/
*.txt
*swp*
*.sw?
gcn_back
.DS_STORE
*.aux
*.log
*.out
*.bbl
*.synctex.gz

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

*.pickle
*.pkl
graphsage/__init__.py (new file, empty)
graphsage/aggregators.py (new file, 58 lines)
@@ -0,0 +1,58 @@
import torch
import torch.nn as nn
from torch.autograd import Variable

import random

"""
Set of modules for aggregating embeddings of neighbors.
These modules take as input embeddings of neighbors.
"""


class MeanAggregator(nn.Module):
    """
    Aggregates a node's embeddings using the mean of its neighbors' embeddings
    """
    def __init__(self, features, cuda=False, gcn=False):
        """
        Initializes the aggregator for a specific graph.

        features -- function mapping (node_list, features, offset) to feature values
                    see torch.nn.EmbeddingBag and forward function below docs for offset meaning.
        """
        super(MeanAggregator, self).__init__()

        self.features = features
        self.cuda = cuda
        self.gcn = gcn

    def forward(self, nodes, to_neighs, num_sample=10):
        """
        nodes -- list of nodes in a batch
        to_neighs -- list of sets, each the set of neighbors for a node in the batch
        num_sample -- number of neighbors to sample; no sampling if None
        """
        # Local pointers to functions (speed hack)
        _set = set
        if num_sample is not None:
            _sample = random.sample
            # random.sample needs a sequence, so materialize each neighbor set.
            samp_neighs = [_set(_sample(list(to_neigh), num_sample))
                           if len(to_neigh) >= num_sample else to_neigh
                           for to_neigh in to_neighs]
        else:
            samp_neighs = to_neighs

        if self.gcn:
            # Add a self-loop; sets do not support +, so union each set
            # with the node itself.
            samp_neighs = [samp_neigh | {nodes[i]} for i, samp_neigh in enumerate(samp_neighs)]
        unique_nodes_list = list(set.union(*samp_neighs))
        unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}
        # Row-normalized membership mask: mask.mm(embed_matrix) then yields
        # the mean of each node's sampled neighbors in one matrix multiply.
        mask = Variable(torch.zeros(len(samp_neighs), len(unique_nodes)))
        column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh]
        row_indices = [i for i in range(len(samp_neighs)) for j in range(len(samp_neighs[i]))]
        mask[row_indices, column_indices] = 1
        if self.cuda:
            mask = mask.cuda()
        num_neigh = mask.sum(1, keepdim=True)
        mask = mask.div(num_neigh)
        if self.cuda:
            embed_matrix = self.features(torch.LongTensor(unique_nodes_list).cuda())
        else:
            embed_matrix = self.features(torch.LongTensor(unique_nodes_list))
        to_feats = mask.mm(embed_matrix)
        return to_feats
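The core of MeanAggregator is that row-normalized membership mask: mask.mm(embed_matrix) computes the mean of each node's sampled neighbors in a single matrix multiply. Below is a minimal standalone sketch, with hypothetical toy values rather than real data, that rebuilds the mask the same way and checks it against a direct per-node average.

import torch

# Hypothetical toy values, for illustration only: three nodes in the
# batch with their sampled neighbor sets, and a 5-node feature table.
samp_neighs = [{1, 2}, {0, 3}, {4}]
features = torch.arange(10.0).view(5, 2)

unique_nodes_list = list(set.union(*samp_neighs))
unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}

# Same membership mask MeanAggregator.forward builds, row-normalized
# so each row sums to 1.
mask = torch.zeros(len(samp_neighs), len(unique_nodes))
for row, neighs in enumerate(samp_neighs):
    for n in neighs:
        mask[row, unique_nodes[n]] = 1
mask = mask.div(mask.sum(1, keepdim=True))

to_feats = mask.mm(features[torch.LongTensor(unique_nodes_list)])

# Direct per-node neighbor average for comparison.
direct = torch.stack([features[list(n)].mean(0) for n in samp_neighs])
assert torch.allclose(to_feats, direct)
print(to_feats)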
graphsage/encoders.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F


class Encoder(nn.Module):
    """
    Encodes a node's features using the 'convolutional' GraphSage approach
    """
    def __init__(self, features, feature_dim,
                 embed_dim, adj_lists, aggregator,
                 num_sample=10,
                 base_model=None, gcn=False, cuda=False,
                 feature_transform=False):
        super(Encoder, self).__init__()

        self.features = features
        self.feat_dim = feature_dim
        self.adj_lists = adj_lists
        self.aggregator = aggregator
        self.num_sample = num_sample
        if base_model is not None:
            self.base_model = base_model

        self.gcn = gcn
        self.embed_dim = embed_dim
        self.cuda = cuda
        self.aggregator.cuda = cuda
        # GCN mode folds self features into the neighbor mean, so the weight
        # sees feat_dim inputs; otherwise self and neighbor features are
        # concatenated, doubling the input dimension.
        self.weight = nn.Parameter(
            torch.FloatTensor(embed_dim, self.feat_dim if self.gcn else 2 * self.feat_dim))
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        """
        Generates embeddings for a batch of nodes.

        nodes -- list of nodes
        """
        neigh_feats = self.aggregator.forward(nodes,
                                              [self.adj_lists[node] for node in nodes],
                                              self.num_sample)
        if not self.gcn:
            if self.cuda:
                self_feats = self.features(torch.LongTensor(nodes).cuda())
            else:
                self_feats = self.features(torch.LongTensor(nodes))
            combined = torch.cat([self_feats, neigh_feats], dim=1)
        else:
            combined = neigh_feats
        combined = F.relu(self.weight.mm(combined.t()))
        return combined
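As a sanity check on the shapes: in non-GCN mode the encoder concatenates self and neighbor features into a (batch, 2 * feat_dim) matrix, applies the (embed_dim, 2 * feat_dim) weight, and returns the result transposed as (embed_dim, batch). A minimal sketch on a hypothetical 4-node toy graph, assuming the graphsage package from this commit is importable; num_sample exceeds every degree here, so no sampling kicks in.

import torch.nn as nn

from graphsage.aggregators import MeanAggregator
from graphsage.encoders import Encoder

# Hypothetical toy graph: 4 nodes on a cycle with 6-dim learned features.
adj_lists = {0: {1, 3}, 1: {0, 2}, 2: {1, 3}, 3: {0, 2}}
features = nn.Embedding(4, 6)

agg = MeanAggregator(features)
# num_sample exceeds every degree here, so the aggregator keeps all neighbors.
enc = Encoder(features, 6, 8, adj_lists, agg, num_sample=5)

out = enc([0, 2])
# Non-GCN mode concatenates self and neighbor features (2 * feat_dim = 12
# columns) before the (embed_dim x 12) weight, then transposes.
print(out.size())  # torch.Size([8, 2]), i.e. (embed_dim, batch)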
graphsage/model.py (new file, 105 lines)
@@ -0,0 +1,105 @@
import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable

import numpy as np
import time
import random
from sklearn.metrics import f1_score
from collections import defaultdict

from graphsage.encoders import Encoder
from graphsage.aggregators import MeanAggregator


class SupervisedGraphSage(nn.Module):

    def __init__(self, num_classes, enc):
        super(SupervisedGraphSage, self).__init__()
        self.enc = enc
        self.xent = nn.CrossEntropyLoss()

        self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        embeds = self.enc(nodes)
        scores = self.weight.mm(embeds)
        return scores.t()

    def loss(self, nodes, labels):
        scores = self.forward(nodes)
        return self.xent(scores, labels.squeeze())


def load_pubmed():
    # hardcoded for simplicity...
    num_nodes = 19717
    num_feats = 500
    feat_data = np.zeros((num_nodes, num_feats))
    labels = np.empty((num_nodes, 1), dtype=np.int64)
    node_map = {}
    with open("pubmed-data/Pubmed-Diabetes.NODE.paper.tab") as fp:
        fp.readline()
        feat_map = {entry.split(":")[1]: i - 1 for i, entry in enumerate(fp.readline().split("\t"))}
        for i, line in enumerate(fp):
            info = line.split("\t")
            node_map[info[0]] = i
            labels[i] = int(info[1].split("=")[1]) - 1
            for word_info in info[2:-1]:
                word_info = word_info.split("=")
                feat_data[i][feat_map[word_info[0]]] = float(word_info[1])
    adj_lists = defaultdict(set)
    with open("pubmed-data/Pubmed-Diabetes.DIRECTED.cites.tab") as fp:
        fp.readline()
        fp.readline()
        for line in fp:
            info = line.strip().split("\t")
            paper1 = node_map[info[1].split(":")[1]]
            paper2 = node_map[info[-1].split(":")[1]]
            adj_lists[paper1].add(paper2)
            adj_lists[paper2].add(paper1)
    return feat_data, labels, adj_lists


if __name__ == "__main__":
    np.random.seed(1)
    random.seed(1)
    num_nodes = 19717
    feat_data, labels, adj_lists = load_pubmed()
    features = nn.Embedding(19717, 500)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)
    # features.cuda()

    # Two-layer model: enc1 aggregates raw features, enc2 aggregates enc1's
    # outputs. Encoder resets its aggregator's cuda flag to match its own
    # cuda argument, so agg1 effectively runs with cuda=False here.
    agg1 = MeanAggregator(features, cuda=True)
    enc1 = Encoder(features, 500, 128, adj_lists, agg1, gcn=True, cuda=False)
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=False)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(), enc1.embed_dim, 128, adj_lists, agg2,
                   base_model=enc1, gcn=True, cuda=False)
    enc1.num_sample = 10
    enc2.num_sample = 25

    graphsage = SupervisedGraphSage(3, enc2)
    # graphsage.cuda()
    rand_indices = np.random.permutation(num_nodes)
    test = rand_indices[:1000]
    val = rand_indices[1000:1500]
    train = list(rand_indices[1500:])

    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, graphsage.parameters()), lr=0.7)
    times = []
    for batch in range(200):
        batch_nodes = train[:1024]
        random.shuffle(train)
        start_time = time.time()
        optimizer.zero_grad()
        loss = graphsage.loss(batch_nodes,
                              Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        times.append(end_time - start_time)
        print(batch, loss.item())

    val_output = graphsage.forward(val)
    print("Validation F1:", f1_score(labels[val], val_output.data.numpy().argmax(axis=1), average="micro"))
    print("Average batch time:", np.mean(times))
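The script holds out a 1000-node test split but only reports validation F1. A minimal follow-up sketch, reusing the variables from the __main__ block above, scores the test split the same way the script scores the validation split.

# Assumes graphsage, test, and labels from the __main__ block are in scope.
test_output = graphsage.forward(test)
print("Test F1:", f1_score(labels[test],
                           test_output.data.numpy().argmax(axis=1),
                           average="micro"))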
pubmed-data/Pubmed-Diabetes.DIRECTED.cites.tab (new file, 44340 lines)
(File diff suppressed because it is too large.)
pubmed-data/Pubmed-Diabetes.GRAPH.pubmed.tab (new file, 3 lines)
@@ -0,0 +1,3 @@
GRAPH pubmed
NO_FEATURES
Pubmed-Diabetes
pubmed-data/Pubmed-Diabetes.NODE.paper.tab (new file, 19719 lines)
(File diff suppressed because one or more lines are too long.)