detect_rep/ASM2VEC_base_scripts/utils2.py

import numpy as np
from scipy import linalg
from collections import defaultdict
def test():
    # get_vec_seq() is expected to be provided elsewhere in this project;
    # it is not defined or imported in this module.
    sample_vec_seq = get_vec_seq()
    print(sample_vec_seq)
    sample_vec_seq = sample_vec_seq[0]
    print(len(sample_vec_seq))
    print(len(sample_vec_seq[0]))
    return sample_vec_seq
# sample_vec_seq = get_vec_seq()[0]
def sigmoid(z):
    # sigmoid function
    return 1.0 / (1.0 + np.exp(-z))
def get_idx(words, word2Ind):
    idx = []
    for word in words:
        idx = idx + [word2Ind[word]]
    return idx
def pack_idx_with_frequency(context_words, word2Ind):
    freq_dict = defaultdict(int)
    for word in context_words:
        freq_dict[word] += 1
    idxs = get_idx(context_words, word2Ind)
    packed = []
    for i in range(len(idxs)):
        idx = idxs[i]
        freq = freq_dict[context_words[i]]
        packed.append((idx, freq))
    return packed
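# A minimal usage sketch (toy vocabulary, not from this project) showing how get_idx
# and pack_idx_with_frequency pair each context word with its in-window frequency.
def _demo_pack_idx_with_frequency():
    word2Ind = {'mov': 0, 'push': 1, 'pop': 2}  # hypothetical vocabulary
    context = ['mov', 'push', 'mov']
    # 'mov' appears twice in the window, 'push' once:
    # expected output: [(0, 2), (1, 1), (0, 2)]
    print(pack_idx_with_frequency(context, word2Ind))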
def get_vectors1(sample_vec_seq, V, C):
    i = C
    x_batch = []
    y_batch = []
    while True:
        y = np.zeros(V)
        x = np.zeros(V)
        center_word = sample_vec_seq[i]
        # y is the one-hot vector of the center word
        y = np.array(center_word)
        # the context is an array of one-hot vectors; C is the window size
        context_words = sample_vec_seq[(i - C):i] + sample_vec_seq[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)
        # x is the average of the context (input) vectors
        for item in context_words:
            x += np.array(item)
        x = x / num_ctx_words
        x_batch.append(x)
        y_batch.append(y)
        i += 1
        if i >= len(sample_vec_seq):
            return x_batch, y_batch
def get_vectors(sample_vec_seq, V, C):
    i = C
    while True:
        y = np.zeros(V)
        x = np.zeros(V)
        # center_word = data[i]
        center_word = sample_vec_seq[i]
        # y is the one-hot vector of the center word
        # y[word2Ind[center_word]] = 1
        y = np.array(center_word)
        # the context is an array of one-hot vectors; C is the window size
        # context_words = data[(i - C):i] + data[(i+1):(i+C+1)]
        context_words = sample_vec_seq[(i - C):i] + sample_vec_seq[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)
        # x is the average of the context (input) vectors
        # for idx, freq in pack_idx_with_frequency(context_words, word2Ind):
        #     x[idx] = freq/num_ctx_words
        for item in context_words:
            x += np.array(item)
        x = x / num_ctx_words
        yield x, y
        i += 1
        # stop at the end of the sequence (the infinite-generator reset,
        # "i is being set to 0", is no longer reachable once break is used)
        if i >= len(sample_vec_seq):
            break
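# A minimal sketch (toy one-hot vectors, assumed shapes) of consuming the
# get_vectors generator: V is the vector length and C the half-window size.
def _demo_get_vectors():
    V, C = 4, 1
    seq = [
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ]
    for x, y in get_vectors(seq, V, C):
        # x is the average of the neighbouring one-hot vectors, y the center vector
        print(x, y)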
def get_batches(sample_vec_seq, V, C, batch_size):
    batch_x = []
    batch_y = []
    for x, y in get_vectors(sample_vec_seq, V, C):
        # collect one (x, y) pair per step; the earlier "while len(batch_x) < batch_size"
        # loop here filled the whole batch with copies of the same pair, which is why
        # batch_x[0] and batch_x[127] compared equal in the old debug checks
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []
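# A minimal sketch (toy data, assumed shapes) of iterating over get_batches:
# each yielded pair is transposed, so x_batch and y_batch have shape (V, batch_size).
def _demo_get_batches():
    V, C, batch_size = 4, 1, 2
    seq = [
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ]
    for x_batch, y_batch in get_batches(seq, V, C, batch_size):
        print(x_batch.shape, y_batch.shape)  # (4, 2) (4, 2)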
def compute_pca(data, n_components=2):
    """
    Input:
        data: 2D NumPy array of shape (m, n), one word vector per row
        n_components: number of principal components to keep
    Output:
        X_reduced: the data projected onto the first n_components principal
        components, shape (m, n_components)
    """
    m, n = data.shape
    ### START CODE HERE ###
    # mean center the data
    data -= data.mean(axis=0)
    # calculate the covariance matrix
    R = np.cov(data, rowvar=False)
    # calculate eigenvectors & eigenvalues of the covariance matrix
    # use 'eigh' rather than 'eig' since R is symmetric,
    # the performance gain is substantial
    evals, evecs = linalg.eigh(R)
    # sort eigenvalues in decreasing order
    # this returns the corresponding indices of evals and evecs
    idx = np.argsort(evals)[::-1]
    # sort eigenvectors according to the same index
    evecs = evecs[:, idx]
    evals = evals[idx]
    # select the first n_components eigenvectors (the desired dimension
    # of the rescaled data)
    evecs = evecs[:, :n_components]
    ### END CODE HERE ###
    return np.dot(evecs.T, data.T).T
def get_dict(data):
    """
    Input:
        data: an iterable of tokens (words) to build the vocabulary from
    Output:
        word2Ind: dictionary mapping each word to its index
        Ind2word: dictionary mapping each index back to its word
    """
    # words = nltk.word_tokenize(data)
    words = sorted(list(set(data)))
    n = len(words)
    idx = 0
    # return these correctly
    word2Ind = {}
    Ind2word = {}
    for k in words:
        word2Ind[k] = idx
        Ind2word[idx] = k
        idx += 1
    return word2Ind, Ind2word
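# A minimal usage sketch (toy token list, not from this project): the vocabulary
# is sorted, so indices are assigned in alphabetical order.
def _demo_get_dict():
    word2Ind, Ind2word = get_dict(['push', 'mov', 'mov', 'pop'])
    print(word2Ind)   # {'mov': 0, 'pop': 1, 'push': 2}
    print(Ind2word)   # {0: 'mov', 1: 'pop', 2: 'push'}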
if __name__ == '__main__':
    b = np.array([[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
                  [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]])
    print(b.shape)
    a = compute_pca(b)
    print(a)