2021-06-30 19:20:12 +08:00
|
|
|
from utils import ORIGINAL_DATA_BASE, read_file, write_file
|
|
|
|
from tqdm import tqdm
|
|
|
|
import os
|
2024-04-11 16:43:57 +08:00
|
|
|
from my_utils import multi_thread, setup_logger
|
|
|
|
import concurrent.futures
|
2021-06-30 19:20:12 +08:00
|
|
|
|
2024-04-11 16:43:57 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def remove(neg_list, pos_file):
    """Return the entries of ``neg_list`` that do not occur in ``pos_file``.

    Membership is decided with the ``in`` operator, so the exact semantics
    follow the type of ``pos_file`` (element lookup for a list/set,
    substring search for a string). The relative order of the surviving
    entries is preserved and ``neg_list`` itself is not modified.

    Args:
        neg_list: Iterable of candidate entries to filter.
        pos_file: Container the candidates are checked against.

    Returns:
        A new list holding every entry of ``neg_list`` not found in
        ``pos_file``.
    """
    return [candidate for candidate in neg_list if candidate not in pos_file]
|
|
|
|
|
|
|
|
def split_list_evenly(lst, n):
    """Split ``lst`` into at most ``n`` contiguous chunks of near-equal size.

    Every element of ``lst`` ends up in exactly one chunk and the original
    order is kept, so concatenating the chunks reproduces ``lst``. Chunk
    sizes differ by at most one; the larger chunks come first. When ``lst``
    has fewer than ``n`` elements, one single-element chunk is produced per
    element (no empty chunks are returned), and an empty ``lst`` yields an
    empty result.

    Args:
        lst: Sequence to split (must support ``len`` and slicing).
        n: Desired number of chunks; must be a positive integer.

    Returns:
        A list of slices of ``lst``.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer")

    # base_size elements per chunk, with the first `remainder` chunks
    # taking one extra element so nothing is left over.
    base_size, remainder = divmod(len(lst), n)

    chunks = []
    start = 0
    for index in range(n):
        size = base_size + (1 if index < remainder else 0)
        if size == 0:
            # Fewer elements than requested chunks: stop instead of
            # emitting empty chunks.
            break
        chunks.append(lst[start:start + size])
        start += size

    return chunks
|
2021-06-30 19:20:12 +08:00
|
|
|
|
|
|
|
def main():
    """Filter the negative-instance file against the positive-instance file.

    Reads ``inst.all.pos.txt.clean`` and ``inst.all.neg.txt.clean`` from
    ``../dataset/all/all_clean``, removes from the negative data every entry
    that ``remove`` finds in the positive data, and overwrites the negative
    file with the surviving entries. The work is split into chunks that are
    filtered on a thread pool; the filtered chunks are written back in their
    original input order so the output is deterministic.

    Errors raised while filtering a chunk are logged to
    ``../out/remove.log`` and that chunk is omitted from the output.
    """
    data_dir = '../dataset/all/all_clean'
    neg_path = os.path.join(data_dir, "inst.all.neg.txt.clean")

    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single worker instead of failing on `None * 1000`.
    worker_count = os.cpu_count() or 1

    # NOTE(review): read_file presumably returns a list of lines - confirm
    # in utils. If so, converting pos_file to a set would make each
    # membership test in remove() O(1).
    pos_file = read_file(os.path.join(data_dir, "inst.all.pos.txt.clean"))
    neg_file = split_list_evenly(read_file(neg_path), worker_count * 1000)
    print(len(neg_file))
    logger = setup_logger('remove', '../out/remove.log')

    # One slot per chunk so results can be reassembled in input order,
    # independent of the order in which the futures complete.
    filtered_chunks = [None] * len(neg_file)
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
        print('start build task.')
        future_to_index = {
            executor.submit(remove, neg_list, pos_file): index
            for index, neg_list in enumerate(neg_file)
        }
        print('start run task.')
        for future in tqdm(concurrent.futures.as_completed(future_to_index),
                           total=len(future_to_index)):
            try:
                filtered_chunks[future_to_index[future]] = future.result()
            except Exception as exc:
                # NOTE(review): a failed chunk is dropped from the output
                # even though the input file is overwritten below; confirm
                # this best-effort behaviour is intended.
                logger.error(exc)

    result = []
    for chunk in filtered_chunks:
        if chunk is not None:
            result.extend(chunk)
    write_file(result, neg_path)
|
2021-06-30 19:20:12 +08:00
|
|
|
|
|
|
|
# Run the filtering job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|