55 lines
1.3 KiB
Python
55 lines
1.3 KiB
Python
import os
|
|
from multiprocessing import Pool, Process, Queue
|
|
|
|
from tqdm import tqdm
|
|
|
|
from utils import ORIGINAL_DATA_BASE, read_file
|
|
|
|
q = Queue(128)
|
|
BASE = 4600000
|
|
|
|
|
|
def counter_worker(sents):
|
|
cnt = set()
|
|
for sent in tqdm(sents):
|
|
cnt = cnt.union(set(sent[:-1].replace("\t", " ").split()))
|
|
print("Process {} get {} words".format(os.getpid(), len(cnt)))
|
|
q.put(cnt)
|
|
return
|
|
|
|
|
|
def counter(filename):
|
|
sents = read_file(filename)
|
|
|
|
p = Pool(36)
|
|
|
|
for i in range(64):
|
|
p.apply_async(counter_worker, args=(sents[i * BASE : (i + 1) * BASE],))
|
|
print("Waiting for all sub-processes done...")
|
|
p.close()
|
|
p.join()
|
|
print("All subprocess done.")
|
|
cnt = set()
|
|
# for sent in tqdm(sents):
|
|
# cnt += set(sent[-1].replace("\t", " ").split())
|
|
for _ in tqdm(range(64)):
|
|
cnt = cnt.union(q.get())
|
|
print("There are {} charcters in {}".format(len(cnt), filename))
|
|
return cnt
|
|
|
|
|
|
def main():
|
|
cnt = set()
|
|
# for i in range(6):
|
|
for i in range(10):
|
|
for group in ["pos", "neg"]:
|
|
filename = os.path.join(
|
|
ORIGINAL_DATA_BASE, "inst.{}.{}.txt.clean".format(i, group)
|
|
)
|
|
cnt = cnt.union(counter(filename))
|
|
print("There are {} charcters in all files".format(len(cnt)))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|