# Inst2Vec/process_data/count_word_for_vocab.py

import os
from multiprocessing import Pool, Process, Queue
from tqdm import tqdm
from utils import ORIGINAL_DATA_BASE, read_file
# Shared channel through which counter_worker() hands its partial token set
# back to counter(); created with a capacity bound of 128 items.
q = Queue(128)
# Number of sentences in each slice that counter() gives to one worker task.
BASE = 4600000
def counter_worker(sents):
    """Collect the distinct whitespace-separated tokens of a chunk of sentences.

    The resulting set is not returned; it is pushed onto the module-level
    queue ``q``, so the caller has to read exactly one item from ``q`` for
    every invocation of this function.

    Args:
        sents: Iterable of sentence strings.
    """
    cnt = set()
    for sent in tqdm(sents):
        # split() without arguments splits on any run of whitespace (tabs
        # included) and ignores leading/trailing whitespace, so neither a tab
        # replacement nor newline trimming is needed.  Blindly slicing off the
        # last character would truncate the final token of a line that has no
        # trailing newline.
        # update() grows the set in place; union() would copy the whole
        # accumulated set once per sentence.
        cnt.update(sent.split())
    print("Process {} get {} words".format(os.getpid(), len(cnt)))
    q.put(cnt)
def counter(filename):
    """Return the set of distinct tokens found in ``filename``.

    The sentences returned by ``read_file`` are cut into slices of ``BASE``
    sentences; each slice is processed by ``counter_worker`` in a pool of 36
    processes.  Workers deliver their partial sets through the module-level
    queue ``q`` and the partial sets are merged here.

    Args:
        filename: Path of the file to read with ``read_file``.

    Returns:
        A set containing every distinct token in the file.

    Raises:
        Exception: Whatever exception a worker task raised is re-raised here
            instead of leaving this function waiting on the queue forever.
    """
    from queue import Empty  # raised by q.get(timeout=...) when nothing arrived

    sents = read_file(filename)
    pool = Pool(36)
    try:
        # One task per BASE-sized slice: covers the whole input regardless of
        # its length and submits no empty tasks.
        # NOTE(review): assumes read_file() returns a sized sequence (it is
        # already sliced here) - confirm.
        pending = [
            pool.apply_async(counter_worker, args=(sents[start : start + BASE],))
            for start in range(0, len(sents), BASE)
        ]
        pool.close()
        print("Waiting for all sub-processes done...")

        cnt = set()
        # Drain the queue BEFORE joining the pool: a process that has put
        # data on a multiprocessing queue does not exit until that data has
        # been flushed to the pipe, so joining first can deadlock.
        for _ in tqdm(range(len(pending))):
            while True:
                try:
                    cnt.update(q.get(timeout=1))
                    break
                except Empty:
                    # Nothing arrived yet; make sure no task died before it
                    # could report, otherwise we would wait forever.
                    for res in pending:
                        if res.ready() and not res.successful():
                            res.get()  # re-raises the worker's exception
        pool.join()
    except BaseException:
        # Do not leave worker processes behind on error or interruption.
        pool.terminate()
        raise
    print("All subprocess done.")
    print("There are {} charcters in {}".format(len(cnt), filename))
    return cnt
def main():
    """Print the number of distinct tokens across all cleaned input files.

    Visits ``inst.<i>.<group>.txt.clean`` under ``ORIGINAL_DATA_BASE`` for
    ``i`` in 0..9 and ``group`` in ``pos``/``neg``, merging the token set that
    ``counter`` returns for each file.
    """
    cnt = set()
    for i in range(10):
        for group in ["pos", "neg"]:
            filename = os.path.join(
                ORIGINAL_DATA_BASE, "inst.{}.{}.txt.clean".format(i, group)
            )
            # Merge in place instead of building a new set for every file.
            cnt.update(counter(filename))
    print("There are {} charcters in all files".format(len(cnt)))
# Run the vocabulary count only when executed as a script, not on import
# (worker processes may import this module).
if __name__ == "__main__":
    main()