Inst2Vec/process_data/clean.py

58 lines
1.8 KiB
Python
Raw Permalink Normal View History

2021-06-30 19:20:12 +08:00
from utils import ORIGINAL_DATA_BASE, read_file, write_file
from tqdm import tqdm
import os
2024-04-11 16:43:57 +08:00
from my_utils import multi_thread, setup_logger
import concurrent.futures
2021-06-30 19:20:12 +08:00
2024-04-11 16:43:57 +08:00
def remove(neg_list, pos_file):
    """Return the entries of ``neg_list`` that are not contained in ``pos_file``.

    Args:
        neg_list: Iterable of candidate entries to filter.
        pos_file: Any container supporting the ``in`` operator; entries found
            in it are dropped from the result.

    Returns:
        A new list holding the surviving entries in their original order.
    """
    return [candidate for candidate in neg_list if candidate not in pos_file]
def split_list_evenly(lst, n):
    """Split ``lst`` into at most ``n`` contiguous chunks of near-equal size.

    Every element of ``lst`` ends up in exactly one chunk and the original
    order is preserved, so concatenating the chunks reproduces ``lst``.
    Chunk sizes differ by at most one; the larger chunks come first.

    Args:
        lst: Sequence to split (anything supporting ``len`` and slicing).
        n: Desired number of chunks; must be a positive integer.

    Returns:
        A list of slices of ``lst``. When ``n`` exceeds ``len(lst)`` only
        ``len(lst)`` single-element chunks are returned (no empty chunks),
        and an empty ``lst`` yields an empty list.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer")

    # The first `remainder` chunks take one extra element so that nothing is
    # left over after n chunks.
    base_size, remainder = divmod(len(lst), n)

    chunks = []
    start = 0
    for index in range(n):
        size = base_size + (1 if index < remainder else 0)
        if size == 0:
            # More chunks requested than elements remain: stop instead of
            # emitting empty chunks.
            break
        chunks.append(lst[start:start + size])
        start += size
    return chunks
2021-06-30 19:20:12 +08:00
def main():
    """Drop every negative sample that also appears among the positives.

    Reads ``inst.all.pos.txt.clean`` and ``inst.all.neg.txt.clean`` from
    ``../dataset/all/all_clean``, keeps the negative entries that are not
    present in the positive data, and writes the survivors back over the
    negative file in their original order.

    The filtering is a single ordered pass against a hash-based lookup. It
    does not fan work out to threads: the membership test is pure-Python
    CPU work, and a single pass keeps the output order deterministic.

    Raises:
        Exception: Any error raised while reading or filtering is logged and
            re-raised *before* the negative file is rewritten, so a failure
            never replaces the input with partial results.
    """
    data_dir = os.path.join('../dataset/all/all_clean')
    pos_path = os.path.join(data_dir, "inst.all.pos.txt.clean")
    neg_path = os.path.join(data_dir, "inst.all.neg.txt.clean")
    logger = setup_logger('remove', '../out/remove.log')

    try:
        pos_file = read_file(pos_path)
        neg_file = read_file(neg_path)
        print(len(neg_file))

        # NOTE(review): read_file presumably returns a list of lines -- confirm.
        # Only in that case is the positive data turned into a frozenset for
        # O(1) membership tests; any other container is used unchanged so the
        # meaning of ``in`` stays exactly what it was.
        pos_lookup = pos_file
        if isinstance(pos_file, (list, tuple)):
            try:
                pos_lookup = frozenset(pos_file)
            except TypeError:
                # Unhashable entries: fall back to the original container.
                pos_lookup = pos_file

        print('start run task.')
        result = remove(tqdm(neg_file), pos_lookup)
    except Exception as exc:
        logger.error(exc)
        raise

    write_file(result, neg_path)
2021-06-30 19:20:12 +08:00
# Run the cleaning step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()