import numpy as np
from scipy import linalg
from collections import defaultdict


def test():
    # get_vec_seq() is assumed to be defined elsewhere in the project;
    # it is not imported in this file.
    sample_vec_seq = get_vec_seq()
    print(sample_vec_seq)
    sample_vec_seq = sample_vec_seq[0]
    print(len(sample_vec_seq))     # number of one-hot vectors in the sequence
    print(len(sample_vec_seq[0]))  # dimension of each one-hot vector (V)
    return sample_vec_seq


def sigmoid(z):
    # logistic sigmoid function
    return 1.0 / (1.0 + np.exp(-z))


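# Example:
#
#   sigmoid(0.0)                          # -> 0.5
#   sigmoid(np.array([-1.0, 0.0, 1.0]))   # applies elementwise

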
def get_idx(words, word2Ind):
    # map each word to its vocabulary index
    return [word2Ind[word] for word in words]


def pack_idx_with_frequency(context_words, word2Ind):
    # count how often each word appears in the context window
    freq_dict = defaultdict(int)
    for word in context_words:
        freq_dict[word] += 1
    idxs = get_idx(context_words, word2Ind)
    # pair each word's vocabulary index with its in-window frequency
    packed = []
    for i, idx in enumerate(idxs):
        freq = freq_dict[context_words[i]]
        packed.append((idx, freq))
    return packed


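# Example usage (illustrative sketch; the toy vocabulary below is an
# assumption, not part of the original code):
#
#   word2Ind = {'a': 0, 'b': 1}
#   pack_idx_with_frequency(['a', 'b', 'a'], word2Ind)
#   # -> [(0, 2), (1, 1), (0, 2)]  (repeated words repeat their pair)

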
def get_vectors1(sample_vec_seq, V, C):
    # Batch (non-generator) variant of get_vectors below: walks the whole
    # sequence once and returns all (x, y) CBOW training pairs as lists.
    i = C
    x_batch = []
    y_batch = []
    while True:
        x = np.zeros(V)
        center_word = sample_vec_seq[i]

        # y is the one-hot vector of the center word
        y = np.array(center_word)

        # context_words are the one-hot vectors in the window around i;
        # C is the window size (up to C words on each side)
        context_words = sample_vec_seq[(i - C):i] + sample_vec_seq[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)

        # x is the average of the context one-hot vectors
        for item in context_words:
            x += np.array(item)
        x = x / num_ctx_words

        x_batch.append(x)
        y_batch.append(y)

        i += 1
        if i >= len(sample_vec_seq):
            return x_batch, y_batch


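# Example usage (sketch; the toy one-hot sequence is an assumption):
#
#   V, C = 4, 1
#   seq = [np.eye(V)[i % V].tolist() for i in range(6)]
#   xs, ys = get_vectors1(seq, V, C)
#   # len(xs) == len(ys) == len(seq) - C   (positions C .. len(seq) - 1)

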
def get_vectors(sample_vec_seq, V, C):
    # Generator that yields one CBOW training pair (x, y) at a time,
    # starting at position C so a full left context is available.
    i = C
    while True:
        x = np.zeros(V)

        # y is the one-hot vector of the center word
        center_word = sample_vec_seq[i]
        y = np.array(center_word)

        # context_words are the one-hot vectors in the window around i;
        # C is the window size (up to C words on each side)
        context_words = sample_vec_seq[(i - C):i] + sample_vec_seq[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)

        # x is the average of the context one-hot vectors
        for item in context_words:
            x += np.array(item)
        x = x / num_ctx_words

        yield x, y
        i += 1
        if i >= len(sample_vec_seq):
            break


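# Example usage (sketch; the toy one-hot sequence is an assumption):
#
#   V, C = 4, 1
#   seq = [np.eye(V)[i % V].tolist() for i in range(6)]
#   x, y = next(get_vectors(seq, V, C))
#   # y is the one-hot vector at position C; x is the average of the
#   # surrounding one-hot vectors in the window

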
def get_batches(sample_vec_seq, V, C, batch_size):
    # Group the (x, y) pairs from get_vectors into batches of batch_size;
    # each training pair is appended exactly once, and any incomplete
    # final batch is dropped.
    batch_x = []
    batch_y = []
    for x, y in get_vectors(sample_vec_seq, V, C):
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            # transpose so each column is one example: shapes (V, batch_size)
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []


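# Example usage (sketch; the toy one-hot sequence is an assumption):
#
#   V, C, batch_size = 5, 2, 4
#   seq = [np.eye(V)[i % V].tolist() for i in range(10)]
#   for x, y in get_batches(seq, V, C, batch_size):
#       print(x.shape, y.shape)  # (5, 4) (5, 4)

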
def compute_pca(data, n_components=2):
    """
    Input:
        data: 2D NumPy array of dimension (m, n) where each row
              corresponds to a word vector
        n_components: number of principal components to keep
    Output:
        X_reduced: the data projected onto the first n_components
                   principal components, of dimension (m, n_components)
    """
    m, n = data.shape

    # mean-center the data (as a copy, so the caller's array is not modified)
    data = data - data.mean(axis=0)
    # calculate the covariance matrix
    R = np.cov(data, rowvar=False)
    # calculate eigenvectors & eigenvalues of the covariance matrix;
    # use 'eigh' rather than 'eig' since R is symmetric --
    # the performance gain is substantial
    evals, evecs = linalg.eigh(R)
    # sort eigenvalues in decreasing order and reorder the
    # eigenvectors with the same indices
    idx = np.argsort(evals)[::-1]
    evecs = evecs[:, idx]
    evals = evals[idx]
    # select the first n_components eigenvectors (the desired
    # dimension of the rescaled data)
    evecs = evecs[:, :n_components]
    # project the centered data onto the principal components
    return np.dot(evecs.T, data.T).T


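# Example usage (sketch; the random input is an assumption):
#
#   X = np.random.rand(10, 5)               # ten 5-dimensional word vectors
#   X_reduced = compute_pca(X, n_components=2)
#   # X_reduced.shape -> (10, 2)

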
def get_dict(data):
    """
    Input:
        data: a list (or other iterable) of words
    Output:
        word2Ind: dictionary mapping each word to its index
        Ind2word: dictionary mapping each index to its word
    """
    # sort the unique words so the index assignment is deterministic
    words = sorted(list(set(data)))
    word2Ind = {}
    Ind2word = {}
    for idx, k in enumerate(words):
        word2Ind[k] = idx
        Ind2word[idx] = k
    return word2Ind, Ind2word


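# Example usage (sketch; the toy word list is an assumption):
#
#   word2Ind, Ind2word = get_dict(['i', 'like', 'nlp', 'i', 'like'])
#   # word2Ind -> {'i': 0, 'like': 1, 'nlp': 2}
#   # Ind2word -> {0: 'i', 1: 'like', 2: 'nlp'}

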
if __name__ == '__main__':
    # quick smoke test of compute_pca on three 9-dimensional vectors
    b = np.array([[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
                  [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]])
    print(b.shape)
    a = compute_pca(b)
    print(a)