"""Drop every negative instance that also appears in the positive set.

Reads the cleaned positive and negative instance files, filters the
negatives in a thread pool, and overwrites the negative file with the result.
"""
import concurrent.futures
import os

from tqdm import tqdm

from utils import ORIGINAL_DATA_BASE, read_file, write_file
from my_utils import multi_thread, setup_logger


def remove(neg_list, pos_file):
    """Return the entries of *neg_list* that are not contained in *pos_file*.

    Args:
        neg_list: Iterable of candidate negative entries.
        pos_file: Any container supporting ``in`` (list, set, ...) that holds
            the positive entries.

    Returns:
        A new list with the surviving entries, in their original order.
    """
    return [neg for neg in neg_list if neg not in pos_file]


def split_list_evenly(lst, n):
    """Split *lst* into at most *n* contiguous chunks of near-equal size.

    Every element of *lst* ends up in exactly one chunk, and chunk sizes
    differ by at most one. Empty chunks are never produced, so fewer than
    *n* chunks are returned when ``len(lst) < n``.

    Args:
        lst: Sequence supporting ``len`` and slicing.
        n: Maximum number of chunks; must be positive.

    Returns:
        A list of slices of *lst*, in order.

    Raises:
        ValueError: If *n* is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer")

    # The first `extra` chunks take one additional element so that the
    # remainder is spread out instead of being dropped or lumped together.
    base_size, extra = divmod(len(lst), n)
    chunks = []
    start = 0
    for index in range(n):
        size = base_size + (1 if index < extra else 0)
        if size == 0:
            # Only happens when len(lst) < n: nothing is left to hand out.
            break
        chunks.append(lst[start:start + size])
        start += size
    return chunks


def _build_lookup(items):
    """Return a frozenset of *items* when that preserves ``in`` semantics.

    Only lists/tuples of hashable entries are converted; anything else is
    returned unchanged so membership tests behave exactly as before.
    """
    if isinstance(items, (list, tuple)):
        try:
            return frozenset(items)
        except TypeError:
            # Unhashable entries: keep the linear-scan container.
            return items
    return items


def main():
    """Filter the negative instance file against the positive one in place."""
    data_dir = os.path.join('../dataset/all/all_clean')
    neg_path = os.path.join(data_dir, "inst.all.neg.txt.clean")

    # os.cpu_count() may return None when the count cannot be determined.
    worker_count = os.cpu_count() or 1

    # read_file presumably returns a list of lines - TODO confirm. A set
    # makes each membership test O(1) instead of a scan over all positives.
    pos_file = _build_lookup(
        read_file(os.path.join(data_dir, "inst.all.pos.txt.clean"))
    )
    neg_file = split_list_evenly(read_file(neg_path), int(worker_count * 1000))
    print(len(neg_file))

    logger = setup_logger('remove', '../out/remove.log')
    result = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
        print('start build task.')
        futures_to_args = {
            executor.submit(remove, neg_list, pos_file): neg_list
            for neg_list in neg_file
        }
        print('start run task.')
        # Chunks are collected in completion order, so the output order of
        # chunks is not guaranteed to match the input file.
        for future in tqdm(concurrent.futures.as_completed(futures_to_args),
                           total=len(futures_to_args)):
            try:
                result.extend(future.result())
            except Exception as exc:
                # NOTE(review): a failed chunk is only logged; its entries
                # are absent from the file written below.
                logger.error(exc)

    write_file(result, neg_path)


if __name__ == "__main__":
    main()