detect_rep/ASM2VEC_plus_scripts/utils2.py
2023-04-05 10:04:49 +08:00

222 lines
5.9 KiB
Python

from collections import Counter
from collections import defaultdict

import numpy as np
from scipy import linalg
def test():
    """Print a loaded vector sequence and return its first entry.

    Prints everything returned by ``get_vec_seq()``, then the length of its
    first element and the length of that element's first item.

    NOTE(review): ``get_vec_seq`` is not defined in the visible part of this
    module; it must be supplied elsewhere for this function to run.

    Returns:
        The first element of the value returned by ``get_vec_seq()``.
    """
    all_sequences = get_vec_seq()
    print(all_sequences)
    first_sequence = all_sequences[0]
    print(len(first_sequence))
    print(len(first_sequence[0]))
    return first_sequence
# sample_vec_seq=get_vec_seq()[0]
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of z.

    Args:
        z: A number or NumPy array; arrays are processed element-wise.

    Returns:
        The logistic value(s), with the same shape as z.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator
def get_idx(words, word2Ind):
    """Translate words into their indices.

    Args:
        words: Iterable of words to look up.
        word2Ind: Mapping from word to index.

    Returns:
        list: The index of each word, in the order the words were given.

    Raises:
        KeyError: Propagated from ``word2Ind`` when a word has no entry.
    """
    # A comprehension builds the list in one pass; repeated ``list + [item]``
    # would copy the whole list on every iteration.
    return [word2Ind[word] for word in words]
def pack_idx_with_frequency(context_words, word2Ind):
    """Pair each context word's index with that word's count in the window.

    Args:
        context_words: Sequence of hashable words; duplicates are allowed.
        word2Ind: Mapping from word to index.

    Returns:
        list: One ``(index, frequency)`` tuple per entry of
        ``context_words``, in the same order. A word that occurs k times
        contributes k identical tuples, each carrying frequency k.

    Raises:
        KeyError: Propagated from ``word2Ind`` when a word has no entry.
    """
    frequencies = Counter(context_words)
    return [(word2Ind[word], frequencies[word]) for word in context_words]
def get_vectors1(sample_vec_seq, V, C):
    """Build every (context average, center vector) pair of a sequence.

    Positions ``C`` through ``len(sample_vec_seq) - 1`` are used as centers.
    For each one, the input is the mean of the up-to-``C`` vectors before it
    and the up-to-``C`` vectors after it (fewer near the end of the
    sequence), and the target is the center vector itself.

    Args:
        sample_vec_seq: Sequence of numeric vectors. The two context slices
            are joined with ``+``, so this is expected to be a list.
            NOTE(review): a NumPy array would be added element-wise
            instead of concatenated; confirm callers pass lists.
        V: Length of each vector; sizes the accumulator for the average.
        C: Window half-width, i.e. how many neighbours are taken per side.

    Returns:
        tuple: ``(x_batch, y_batch)``, two equally long lists of NumPy
        arrays. Both are empty when the sequence has no element at index
        ``C``.
    """
    x_batch = []
    y_batch = []
    for center in range(C, len(sample_vec_seq)):
        # Neighbours on both sides of the center; the center is excluded.
        context_words = (sample_vec_seq[center - C:center]
                         + sample_vec_seq[center + 1:center + C + 1])
        # x is the average of the context vectors.
        x = np.zeros(V)
        for item in context_words:
            x += np.array(item)
        x_batch.append(x / len(context_words))
        # y is the vector of the center word.
        y_batch.append(np.array(sample_vec_seq[center]))
    return x_batch, y_batch
def get_vectors(sample_vec_seq, V, C):
    """Yield one (context average, center vector) pair per center position.

    Positions ``C`` through ``len(sample_vec_seq) - 1`` are visited once, in
    order, and the generator then stops; it does not wrap around to the
    start of the sequence. For each center, the input is the mean of the
    up-to-``C`` vectors before it and the up-to-``C`` vectors after it
    (fewer near the end of the sequence).

    Args:
        sample_vec_seq: Sequence of numeric vectors. The two context slices
            are joined with ``+``, so this is expected to be a list.
            NOTE(review): a NumPy array would be added element-wise
            instead of concatenated; confirm callers pass lists.
        V: Length of each vector; sizes the accumulator for the average.
        C: Window half-width, i.e. how many neighbours are taken per side.

    Yields:
        tuple: ``(x, y)`` where ``x`` is the averaged context vector and
        ``y`` is the center vector as a NumPy array. Nothing is yielded
        when the sequence has no element at index ``C``.
    """
    for center in range(C, len(sample_vec_seq)):
        # y is the vector of the center word.
        y = np.array(sample_vec_seq[center])
        # Neighbours on both sides of the center; the center is excluded.
        context_words = (sample_vec_seq[center - C:center]
                         + sample_vec_seq[center + 1:center + C + 1])
        # x is the average of the context vectors.
        x = np.zeros(V)
        for item in context_words:
            x += np.array(item)
        yield x / len(context_words), y
def get_batches(sample_vec_seq, V, C, batch_size):
    """Group the training pairs from ``get_vectors`` into column batches.

    Each batch collects up to ``batch_size`` *distinct* consecutive
    ``(x, y)`` pairs. Filling the batch inside a ``while`` loop would append
    the same pair ``batch_size`` times, making every batch a single sample
    repeated, so pairs are appended one per iteration instead.

    Args:
        sample_vec_seq: Sequence of vectors, passed through to
            ``get_vectors``.
        V: Vector length, passed through to ``get_vectors``.
        C: Window half-width, passed through to ``get_vectors``.
        batch_size: Maximum number of training pairs per batch.

    Yields:
        tuple: ``(batch_x, batch_y)`` NumPy arrays, transposed so that each
        column is one training pair. Every batch has ``batch_size`` columns
        except possibly the last, which holds whatever pairs remain.
    """
    batch_x = []
    batch_y = []
    for x, y in get_vectors(sample_vec_seq, V, C):
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) >= batch_size:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []
    # Flush the remainder so that sequences shorter than a full batch (and
    # the tail of longer ones) still contribute their samples.
    if batch_x:
        yield np.array(batch_x).T, np.array(batch_y).T
def compute_pca(data, n_components=2):
    """Project data onto its leading principal components.

    The input is mean-centered on a copy, so the caller's array is left
    unchanged and integer input is accepted.

    Args:
        data: 2-D array-like of shape (m, n); each row is one observation
            (e.g. a word vector) and each column is one feature.
        n_components: Number of principal components to keep.

    Returns:
        numpy.ndarray: Array of shape (m, n_components) holding the centered
        data expressed in the basis of the ``n_components`` eigenvectors of
        the covariance matrix with the largest eigenvalues. The sign of each
        column depends on the eigenvector returned by the solver.

    Raises:
        ValueError: If ``data`` is not two-dimensional.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError(
            "data must be a 2-D array, got %d dimension(s)" % data.ndim
        )
    # Mean-center each column without writing into the caller's array.
    centered = data - data.mean(axis=0)
    # Covariance between columns (features).
    covariance = np.cov(centered, rowvar=False)
    # The covariance matrix is symmetric, so 'eigh' is used rather than 'eig'.
    evals, evecs = linalg.eigh(covariance)
    # Column indices of the eigenvectors, largest eigenvalue first.
    order = np.argsort(evals)[::-1]
    components = evecs[:, order[:n_components]]
    return np.dot(centered, components)
def get_dict(data):
    """Build word-to-index and index-to-word lookups for the items in data.

    Indices are assigned from 0 upwards in the sorted order of the distinct
    items.

    Args:
        data: Iterable of hashable, mutually comparable items (e.g. tokens).

    Returns:
        tuple: ``(word2Ind, Ind2word)`` where ``word2Ind`` maps each distinct
        item to its index and ``Ind2word`` is the inverse mapping.
    """
    vocabulary = sorted(set(data))
    word2Ind = {word: index for index, word in enumerate(vocabulary)}
    Ind2word = dict(enumerate(vocabulary))
    return word2Ind, Ind2word
if __name__ == '__main__':
    # Manual check: run compute_pca on three 9-dimensional sample rows and
    # print the input shape followed by the projected result.
    sample_rows = np.array([
        [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
    ])
    print(sample_rows.shape)
    projected = compute_pca(sample_rows)
    print(projected)