import numpy as np
from scipy import linalg
from collections import defaultdict


def test():
    # get_vec_seq() is assumed to be defined elsewhere in the project;
    # it is not imported in this file.
    sample_vec_seq = get_vec_seq()
    print(sample_vec_seq)
    sample_vec_seq = sample_vec_seq[0]
    print(len(sample_vec_seq))     # number of one-hot vectors in the sequence
    print(len(sample_vec_seq[0]))  # dimension of each one-hot vector (V)
    return sample_vec_seq


def sigmoid(z):
    # logistic sigmoid function
    return 1.0 / (1.0 + np.exp(-z))


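# Example:
#
#   sigmoid(0.0)                          # -> 0.5
#   sigmoid(np.array([-1.0, 0.0, 1.0]))   # applies elementwise

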
def get_idx(words, word2Ind):
    # map each word to its vocabulary index
    return [word2Ind[word] for word in words]


def pack_idx_with_frequency(context_words, word2Ind):
    # count how often each word appears in the context window
    freq_dict = defaultdict(int)
    for word in context_words:
        freq_dict[word] += 1
    idxs = get_idx(context_words, word2Ind)
    # pair each word's vocabulary index with its in-window frequency
    packed = []
    for i, idx in enumerate(idxs):
        freq = freq_dict[context_words[i]]
        packed.append((idx, freq))
    return packed


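# Example usage (illustrative sketch; the toy vocabulary below is an
# assumption, not part of the original code):
#
#   word2Ind = {'a': 0, 'b': 1}
#   pack_idx_with_frequency(['a', 'b', 'a'], word2Ind)
#   # -> [(0, 2), (1, 1), (0, 2)]  (repeated words repeat their pair)

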
def get_vectors1(sample_vec_seq, V, C):
    # Batch (non-generator) variant of get_vectors below: walks the whole
    # sequence once and returns all (x, y) CBOW training pairs as lists.
    i = C
    x_batch = []
    y_batch = []
    while True:
        x = np.zeros(V)
        center_word = sample_vec_seq[i]

        # y is the one-hot vector of the center word
        y = np.array(center_word)

        # context_words are the one-hot vectors in the window around i;
        # C is the window size (up to C words on each side)
        context_words = sample_vec_seq[(i - C):i] + sample_vec_seq[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)

        # x is the average of the context one-hot vectors
        for item in context_words:
            x += np.array(item)
        x = x / num_ctx_words

        x_batch.append(x)
        y_batch.append(y)

        i += 1
        if i >= len(sample_vec_seq):
            return x_batch, y_batch


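# Example usage (sketch; the toy one-hot sequence is an assumption):
#
#   V, C = 4, 1
#   seq = [np.eye(V)[i % V].tolist() for i in range(6)]
#   xs, ys = get_vectors1(seq, V, C)
#   # len(xs) == len(ys) == len(seq) - C   (positions C .. len(seq) - 1)

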
def get_vectors(sample_vec_seq, V, C):
    # Generator that yields one CBOW training pair (x, y) at a time,
    # starting at position C so a full left context is available.
    i = C
    while True:
        x = np.zeros(V)

        # y is the one-hot vector of the center word
        center_word = sample_vec_seq[i]
        y = np.array(center_word)

        # context_words are the one-hot vectors in the window around i;
        # C is the window size (up to C words on each side)
        context_words = sample_vec_seq[(i - C):i] + sample_vec_seq[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)

        # x is the average of the context one-hot vectors
        for item in context_words:
            x += np.array(item)
        x = x / num_ctx_words

        yield x, y
        i += 1
        if i >= len(sample_vec_seq):
            break


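# Example usage (sketch; the toy one-hot sequence is an assumption):
#
#   V, C = 4, 1
#   seq = [np.eye(V)[i % V].tolist() for i in range(6)]
#   x, y = next(get_vectors(seq, V, C))
#   # y is the one-hot vector at position C; x is the average of the
#   # surrounding one-hot vectors in the window

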
def get_batches(sample_vec_seq, V, C, batch_size):
    # Group the (x, y) pairs from get_vectors into batches of batch_size;
    # each training pair is appended exactly once, and any incomplete
    # final batch is dropped.
    batch_x = []
    batch_y = []
    for x, y in get_vectors(sample_vec_seq, V, C):
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            # transpose so each column is one example: shapes (V, batch_size)
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []


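# Example usage (sketch; the toy one-hot sequence is an assumption):
#
#   V, C, batch_size = 5, 2, 4
#   seq = [np.eye(V)[i % V].tolist() for i in range(10)]
#   for x, y in get_batches(seq, V, C, batch_size):
#       print(x.shape, y.shape)  # (5, 4) (5, 4)

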
def compute_pca(data, n_components=2):
    """
    Input:
        data: 2D NumPy array of dimension (m, n) where each row
              corresponds to a word vector
        n_components: number of principal components to keep
    Output:
        X_reduced: the data projected onto the first n_components
                   principal components, of dimension (m, n_components)
    """
    m, n = data.shape

    # mean-center the data (as a copy, so the caller's array is not modified)
    data = data - data.mean(axis=0)
    # calculate the covariance matrix
    R = np.cov(data, rowvar=False)
    # calculate eigenvectors & eigenvalues of the covariance matrix;
    # use 'eigh' rather than 'eig' since R is symmetric --
    # the performance gain is substantial
    evals, evecs = linalg.eigh(R)
    # sort eigenvalues in decreasing order and reorder the
    # eigenvectors with the same indices
    idx = np.argsort(evals)[::-1]
    evecs = evecs[:, idx]
    evals = evals[idx]
    # select the first n_components eigenvectors (the desired
    # dimension of the rescaled data)
    evecs = evecs[:, :n_components]
    # project the centered data onto the principal components
    return np.dot(evecs.T, data.T).T


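# Example usage (sketch; the random input is an assumption):
#
#   X = np.random.rand(10, 5)               # ten 5-dimensional word vectors
#   X_reduced = compute_pca(X, n_components=2)
#   # X_reduced.shape -> (10, 2)

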
def get_dict(data):
    """
    Input:
        data: a list (or other iterable) of words
    Output:
        word2Ind: dictionary mapping each word to its index
        Ind2word: dictionary mapping each index to its word
    """
    # sort the unique words so the index assignment is deterministic
    words = sorted(list(set(data)))
    word2Ind = {}
    Ind2word = {}
    for idx, k in enumerate(words):
        word2Ind[k] = idx
        Ind2word[idx] = k
    return word2Ind, Ind2word


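# Example usage (sketch; the toy word list is an assumption):
#
#   word2Ind, Ind2word = get_dict(['i', 'like', 'nlp', 'i', 'like'])
#   # word2Ind -> {'i': 0, 'like': 1, 'nlp': 2}
#   # Ind2word -> {0: 'i', 1: 'like', 2: 'nlp'}

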
if __name__ == '__main__':
    # quick smoke test of compute_pca on three 9-dimensional vectors
    b = np.array([[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
                  [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]])
    print(b.shape)
    a = compute_pca(b)
    print(a)