# Source: Inst2Vec/process_data/merge_examples_to_json.py (Python)

import gc
import json
import os
from multiprocessing import Pool, Process, Queue
from tqdm import tqdm
from utils import CURRENT_DATA_BASE, ORIGINAL_DATA_BASE, read_file
2024-04-11 16:43:57 +08:00
# Number of input lines handed to each write_worker call: merge_to_json
# slices its input as sents[i * BASE : (i + 1) * BASE] for i in range(6).
# Previous value: BASE = 4600000
BASE = 46000
def write_worker(sents, json_file, index):
    """Convert one shard of labelled lines to JSON and write it to disk.

    Every line is split on tab characters: the first field is parsed as an
    integer label ("is_next") and the remaining fields are stored as the
    example's "text". The shard is written as ``{"data": [...]}``.

    Args:
        sents: Iterable of raw text lines, with or without a trailing
            newline.
        json_file: Path prefix of the output file; the shard is written to
            ``json_file + "<index>.json"``.
        index: Shard number appended to ``json_file``.

    Raises:
        ValueError: If the first field of a line is not an integer.
    """
    examples = []
    for sent in tqdm(sents):
        # Strip only the line terminator. Slicing with [:-1] would eat the
        # last real character of a line that has no trailing newline (for
        # example the final line of a file).
        label, *text = sent.rstrip("\n").split("\t")
        # json serialises tuples and lists identically (as arrays), so no
        # tuple conversion is needed before dumping.
        examples.append({"text": text, "is_next": int(label)})

    out_path = json_file + "{}.json".format(index)
    print("Writing to {}...".format(out_path))
    with open(out_path, "w") as f:
        json.dump({"data": examples}, f)
def _dump_shards(label_file, json_file, first_index, workers=6):
    """Write the first ``workers * BASE`` lines of *label_file* as JSON shards.

    Shard ``i`` holds lines ``[i * BASE, (i + 1) * BASE)`` and is written by
    ``write_worker`` with shard number ``first_index + i``.

    Raises:
        Exception: Whatever a worker raised, re-raised in the parent once
            all workers have finished.
    """
    sents = read_file(label_file)
    # The context manager terminates the pool even if submitting a task fails.
    with Pool(workers) as pool:
        pending = [
            pool.apply_async(
                write_worker,
                args=(sents[i * BASE : (i + 1) * BASE], json_file, first_index + i),
            )
            for i in range(workers)
        ]
        print("Waiting for all sub-processes done...")
        pool.close()
        pool.join()
    # apply_async parks a worker's exception inside its AsyncResult; without
    # get() a failed shard would pass unnoticed and the success message below
    # would be printed regardless.
    for result in pending:
        result.get()
    print("All subprocess done.")


def merge_to_json(pos, neg, json_file):
    """Split the positive and negative example files into JSON shards.

    The positive file is written to shards 0-5 and the negative file to
    shards 6-11, each shard at ``json_file + "<n>.json"``.

    NOTE: only the first ``6 * BASE`` lines of each input file are written;
    any lines beyond that are ignored.

    Args:
        pos: Path of the positive example file, loaded with ``read_file``.
        neg: Path of the negative example file, loaded with ``read_file``.
        json_file: Path prefix shared by all output shards.

    Raises:
        Exception: Any error raised inside a worker process.
    """
    shards_per_file = 6
    _dump_shards(pos, json_file, 0, shards_per_file)
    # Reclaim the positive lines before the negative file is loaded.
    gc.collect()
    _dump_shards(neg, json_file, shards_per_file, shards_per_file)
def main():
    """Build the input/output paths for the merged data set and convert it.

    The inputs are the cleaned positive and negative label files under
    ``ORIGINAL_DATA_BASE/all_clean``; the JSON shards are written under
    ``CURRENT_DATA_BASE/json`` with the prefix ``inst.all.``.

    An earlier version looped over six separate inputs named
    ``inst.<i>.pos.label.txt`` / ``inst.<i>.neg.label.txt`` and wrote them
    with the prefix ``inst.<i>.``; that loop is no longer used.
    """
    clean_dir = os.path.join(ORIGINAL_DATA_BASE, 'all_clean')
    out_prefix = os.path.join(CURRENT_DATA_BASE, 'json', "inst.all.")
    merge_to_json(
        os.path.join(clean_dir, "inst.all.pos.txt.clean.label"),
        os.path.join(clean_dir, "inst.all.neg.txt.clean.label"),
        out_prefix,
    )
# Run the conversion only when executed as a script. The guard also matters
# for multiprocessing: with the "spawn" start method each worker re-imports
# this module, and an unguarded main() call would be re-run in every child.
if __name__ == "__main__":
    main()