# Inst2Vec/process_data/check_length.py
# (repository web-viewer metadata: 30 lines, 751 B, Python)

import os
# (history timestamp from the web view: 2021-06-30 19:20:12 +08:00)
import pdb
from utils import ORIGINAL_DATA_BASE, read_file
def check(filename):
    """Print and return the word count of the longest sentence in a file.

    Every item returned by ``read_file(filename)`` is treated as one
    sentence (presumably one line of text -- confirm against ``utils``),
    and its words are its whitespace-separated tokens.

    Args:
        filename: Path handed to ``read_file``.

    Returns:
        The largest word count found, or 0 when there are no sentences.
    """
    sents = read_file(filename)
    # str.split() with no separator splits on any run of whitespace (spaces,
    # tabs, newlines) and ignores leading/trailing whitespace, so there is no
    # need to chop the last character or to replace tabs beforehand.
    # Chopping unconditionally removed a real character from any line that
    # had no trailing newline, which could undercount that line's words.
    result = max((len(sent.split()) for sent in sents), default=0)
    print("The longest sentence in {} has {} words".format(filename, result))
    return result
def main(num_files=32, groups=("pos", "neg")):
    """Report the longest sentence across all cleaned data files.

    For every index ``i`` in ``range(num_files)`` and every ``group``, the
    file ``<ORIGINAL_DATA_BASE>/<group>_clean/inst.<i>.<group>.txt.clean``
    is passed to ``check``; the overall maximum is printed at the end.

    Args:
        num_files: Number of file indices to scan per group. Defaults to
            32, the value that was previously hard-coded here.
        groups: Group names used in the directory and file names.
            Defaults to ``("pos", "neg")``, as previously hard-coded.
    """
    longest = 0
    for i in range(num_files):
        for group in groups:
            filename = os.path.join(
                ORIGINAL_DATA_BASE,
                f"{group}_clean",
                f"inst.{i}.{group}.txt.clean",
            )
            longest = max(check(filename), longest)
    print("The longest sentence in all files has {} words.".format(longest))
# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()