# -*- coding: utf-8 -*-
"""
@Time    : 2023/2/14 14:19
@Author  :
@FileName:
@Software:
@Describe: Extract sentences from every .docx file in a folder and write
           them, sorted by length, to one CSV file per document.
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator

# Paragraphs shorter than this are skipped entirely.
MIN_PARAGRAPH_LEN = 15
# Only sentences whose length lies in this inclusive range are kept.
MIN_SENTENCE_LEN = 10
MAX_SENTENCE_LEN = 120
# Paragraphs containing this character ("chapter") are skipped.
CHAPTER_MARK = "章"
# Sentences are split on the Chinese full stop.
SENTENCE_DELIMITER = "。"


def is_chinese(uchar: str) -> bool:
    """
    Return whether a Unicode character is a Chinese character.

    :param uchar: the character to test
    :return: True when ``uchar`` lies in the range U+4E00..U+9FA5
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'


def snetence(text: str) -> bool:
    """
    Return whether ``text`` consists only of Chinese characters and the
    punctuation marks listed in ``fuhao``.

    :param text: the string to test
    :return: True when every character passes the check (an empty string
             therefore yields True), False otherwise
    """
    return all(is_chinese(ch) or ch in fuhao for ch in text)


def _extract_sentences(paragraphs):
    """Return the sentences of ``paragraphs`` that pass the filters, shortest first."""
    kept = []
    # Sorting the paragraphs first fixes the relative order of equally long
    # sentences in the final (stable) sort.
    for paragraph in sorted(paragraphs, key=len):
        if len(paragraph) < MIN_PARAGRAPH_LEN:
            continue
        # The original per-character loop only `continue`d itself and so never
        # skipped anything; a membership test implements the intended filter.
        # Sentences of a kept paragraph cannot contain the mark, so no second
        # check is needed below.
        if CHAPTER_MARK in paragraph:
            continue
        kept.extend(
            sentence
            for sentence in paragraph.split(SENTENCE_DELIMITER)
            if MIN_SENTENCE_LEN <= len(sentence) <= MAX_SENTENCE_LEN
        )
    return sorted(kept, key=len)


# Punctuation accepted by snetence().
# NOTE(review): the ASCII comma is listed twice; one entry was possibly meant
# to be the full-width comma - confirm.
fuhao = [",", "。", ",", "、"]

path = '../data/11篇'
output_dir = "../data/11篇csv/"
path_list = os.listdir(path)

# Create the output folder up front so to_csv does not fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

for docx_name in path_list:
    # splitext copes with names that have no dot (e.g. sub-directories) or
    # several dots, where split(".")[1] raised IndexError or picked the wrong part.
    file_name, file_type = os.path.splitext(docx_name)
    if file_type.lower() != ".docx":
        continue
    # "~$name.docx" files are Word's lock files, not real documents.
    if docx_name.startswith("~$"):
        continue

    document = docx.Document(os.path.join(path, docx_name))
    # Collect the text of every paragraph in the document.
    data = [paragraph.text for paragraph in document.paragraphs]
    data_new = _extract_sentences(data)

    # One sentence per row, in a single column.
    data_df = [[sentence] for sentence in data_new]
    pd.DataFrame(data_df).to_csv(
        os.path.join(output_dir, file_name + ".csv"), index=False
    )