# -*- coding: utf-8 -*-
"""
@Time    : 2023/2/14 14:19
@Author  :
@FileName:
@Software:
@Describe: Read every .docx file under ``path``, split each paragraph into
           sentence-sized lines (at most 120 characters per line for long
           sentences) and write the lines to a .txt file of the same name.
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator


def is_chinese(uchar):
    """Return whether a character lies in the range U+4E00..U+9FA5.

    :param uchar: the character (a one-character string) to classify
    :return: True if ``uchar`` is within U+4E00..U+9FA5, otherwise False
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'


def snetence(text):
    """Return whether ``text`` consists only of Chinese characters and
    the punctuation marks listed in the module-level ``fuhao``.

    :param text: the string to inspect
    :return: True if every character passes ``is_chinese`` or is in
             ``fuhao`` (an empty string yields True), otherwise False
    """
    return all(is_chinese(ch) or ch in fuhao for ch in text)


# Punctuation accepted by snetence().
# NOTE(review): the ASCII comma appears twice; one entry was presumably meant
# to be the full-width comma - confirm against the intended input.
fuhao = [",","。",",","、"]

# Directory that holds the input .docx files.
path = '../data/11篇'
path_list = os.listdir(path)


def chulichangju_2(text, chulipangban_return_list, max_len=120):
    """Split a long sentence into chunks of at most ``max_len`` characters.

    Each chunk ends at the last punctuation mark found within the next
    ``max_len`` characters; when that window contains no punctuation mark,
    the chunk is a hard cut of exactly ``max_len`` characters. The final
    piece (whatever is left once it fits in ``max_len``) is emitted whole.

    :param text: the sentence to split
    :param chulipangban_return_list: list the chunks are appended to
           (mutated in place)
    :param max_len: maximum chunk length in characters, must be >= 1
    :return: the same list object passed as ``chulipangban_return_list``
    :raises ValueError: if ``max_len`` is smaller than 1
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    # Marks at which a long sentence may be broken.
    # NOTE(review): these are ASCII marks; full-width forms were presumably
    # intended for Chinese text - confirm before changing.
    fuhao = [",","?","!","…"]

    remaining = text
    # Only split while the text is too long. A text that already fits is
    # emitted whole below: cutting it at its last punctuation mark would
    # silently discard everything after that mark.
    while len(remaining) > max_len:
        window = remaining[:max_len]
        # Index of the last punctuation mark in the window, -1 if none.
        cut = max(window.rfind(mark) for mark in fuhao)
        if cut == -1:
            # No break point available: fall back to a hard cut.
            chulipangban_return_list.append(window)
            remaining = remaining[max_len:]
        else:
            # Keep the punctuation mark at the end of the chunk.
            chulipangban_return_list.append(window[:cut + 1])
            remaining = remaining[cut + 1:]

    chulipangban_return_list.append(remaining)
    return chulipangban_return_list


def _split_paragraph(paragraph_text):
    """Split one paragraph into output lines, each terminated by "。"."""
    lines = []
    for sentence in str(paragraph_text).split("。"):
        if sentence == "":
            continue
        if len(sentence) > 120:
            # Too long for one line: break it up and close the last piece
            # with the full stop that split() removed.
            pieces = chulichangju_2(sentence, [])
            pieces[-1] += "。"
            lines.extend(pieces)
        else:
            lines.append(sentence + "。")
    return lines


# Make sure the output directory exists before any file is written.
os.makedirs("../data/11篇txt", exist_ok=True)

for docx_name in path_list:
    # splitext copes with names that contain no dot or several dots.
    file_name, file_type = os.path.splitext(docx_name)
    if file_type.lower() != ".docx":
        continue
    document = docx.Document(path + "/" + docx_name)

    # Collect the text of every paragraph in the document.
    data = [paragraph.text for paragraph in document.paragraphs]

    # Empty paragraphs produce no lines. (The snetence() filter is not
    # applied here.)
    data_new = []
    for data_dan in data:
        data_new.extend(_split_paragraph(data_dan))

    with open("../data/11篇txt/{}.txt".format(file_name), "w", encoding='utf-8') as file:
        file.writelines(line + '\n' for line in data_new)