import numpy as np
from scipy import linalg
from collections import defaultdict


def test():
    # get_vec_seq() is assumed to be defined elsewhere in the project; it
    # should return a list of sequences of one-hot word vectors.
    sample_vec_seq = get_vec_seq()
    print(sample_vec_seq)
    sample_vec_seq = sample_vec_seq[0]
    print(len(sample_vec_seq))
    print(len(sample_vec_seq[0]))
    return sample_vec_seq


def sigmoid(z):
    # sigmoid function
    return 1.0 / (1.0 + np.exp(-z))


def get_idx(words, word2Ind):
    # map a list of words to their vocabulary indices
    return [word2Ind[word] for word in words]


def pack_idx_with_frequency(context_words, word2Ind):
    # pair each context word's vocabulary index with its frequency
    # inside the context window
    freq_dict = defaultdict(int)
    for word in context_words:
        freq_dict[word] += 1
    idxs = get_idx(context_words, word2Ind)
    packed = []
    for i, idx in enumerate(idxs):
        freq = freq_dict[context_words[i]]
        packed.append((idx, freq))
    return packed


def get_vectors1(sample_vec_seq, V, C):
    # List-returning variant of get_vectors: builds all
    # (context average, center one-hot) pairs in one pass.
    i = C
    x_batch = []
    y_batch = []
    while True:
        # y is the one-hot vector of the center word
        y = np.array(sample_vec_seq[i])
        # the context is the C one-hot vectors on each side of the
        # center word (C is the window size)
        context_words = sample_vec_seq[(i - C):i] + sample_vec_seq[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)
        # x is the average of the context vectors
        x = np.zeros(V)
        for item in context_words:
            x += np.array(item)
        x = x / num_ctx_words
        x_batch.append(x)
        y_batch.append(y)
        i += 1
        if i >= len(sample_vec_seq):
            return x_batch, y_batch


def get_vectors(sample_vec_seq, V, C):
    # Generator variant: yields one (context average, center one-hot)
    # pair at a time while sliding the window over the sequence.
    i = C
    while True:
        # y is the one-hot vector of the center word
        y = np.array(sample_vec_seq[i])
        # the context is the C one-hot vectors on each side of the
        # center word (C is the window size)
        context_words = sample_vec_seq[(i - C):i] + sample_vec_seq[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)
        # x is the average of the context vectors
        x = np.zeros(V)
        for item in context_words:
            x += np.array(item)
        x = x / num_ctx_words
        yield x, y
        i += 1
        if i >= len(sample_vec_seq):
            break


def get_batches(sample_vec_seq, V, C, batch_size):
    # Group the (x, y) pairs from get_vectors into fixed-size batches;
    # each pair is appended exactly once, and an incomplete final batch
    # is dropped. Batches are yielded with shape (V, batch_size).
    batch_x = []
    batch_y = []
    for x, y in get_vectors(sample_vec_seq, V, C):
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []
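
# Minimal sanity-check sketch (not part of the original pipeline): the
# helper name _demo_batches and the toy sequence are illustrative
# assumptions. It feeds a toy list of V-dimensional one-hot vectors
# through get_batches and prints the resulting batch shapes.
def _demo_batches(V=5, C=1, batch_size=2):
    # toy "sentence": six one-hot word vectors over a vocabulary of size V
    toy_seq = [list(np.eye(V)[i % V]) for i in range(6)]
    for x, y in get_batches(toy_seq, V, C, batch_size):
        # x: (V, batch_size) averaged context vectors
        # y: (V, batch_size) one-hot center words
        print(x.shape, y.shape)
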
def compute_pca(data, n_components=2):
    """
    Input:
        data: 2D NumPy array of dimension (m, n) where each row is a word vector
        n_components: number of components to keep
    Output:
        X_reduced: the data transformed into n_components dimensions/columns
    """
    # mean-center the data (use a copy so the caller's array is not mutated)
    data = data - data.mean(axis=0)
    # calculate the covariance matrix
    R = np.cov(data, rowvar=False)
    # calculate eigenvectors & eigenvalues of the covariance matrix;
    # use 'eigh' rather than 'eig' since R is symmetric,
    # and the performance gain is substantial
    evals, evecs = linalg.eigh(R)
    # sort the eigenvalues in decreasing order and reorder the
    # eigenvectors to match
    idx = np.argsort(evals)[::-1]
    evecs = evecs[:, idx]
    evals = evals[idx]
    # keep the first n_components eigenvectors (the desired
    # dimensionality of the rescaled data)
    evecs = evecs[:, :n_components]
    # project the centered data onto the principal components
    return np.dot(evecs.T, data.T).T


def get_dict(data):
    """
    Input:
        data: a list of words (the corpus to index)
    Output:
        word2Ind: dictionary mapping each word to its index
        Ind2word: dictionary mapping each index to its word
    """
    words = sorted(list(set(data)))
    word2Ind = {}
    Ind2word = {}
    for idx, k in enumerate(words):
        word2Ind[k] = idx
        Ind2word[idx] = k
    return word2Ind, Ind2word


if __name__ == '__main__':
    b = np.array([[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
                  [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]])
    print(b.shape)
    a = compute_pca(b)
    print(a)
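
    # Illustrative end-to-end sketch (the toy corpus and the
    # identity-matrix one-hot encoding are assumptions): get_dict builds
    # the word/index mappings, and np.eye converts each word into the
    # one-hot format that get_vectors/get_batches above consume.
    corpus = ['i', 'like', 'word', 'embeddings', 'because', 'i', 'like', 'nlp']
    word2Ind, Ind2word = get_dict(corpus)
    V = len(word2Ind)
    vec_seq = [list(np.eye(V)[word2Ind[w]]) for w in corpus]
    for x, y in get_batches(vec_seq, V, C=2, batch_size=4):
        print(x.shape, y.shape)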