# -*- coding: utf-8 -*-

"""
@Time    :  2023/2/14 14:19
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator


def is_chinese(uchar):
    """
    Tell whether a unicode character is a Chinese character.
    :param uchar: the character to test
    :return: True when uchar lies between U+4E00 and U+9FA5 (both inclusive),
             False otherwise
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'


def snetence(text):
    """
    Check that a text is made up only of Chinese characters and allowed punctuation.
    :param text: the characters to inspect (iterated one by one)
    :return: False as soon as a character is found that is neither a Chinese
             character (per is_chinese) nor listed in the module-level
             ``fuhao``; True otherwise, including for an empty text
    """
    for char in text:
        # Anything that is neither Chinese nor whitelisted punctuation
        # disqualifies the whole text.
        if not is_chinese(char) and char not in fuhao:
            return False
    return True


# Punctuation marks that snetence() accepts in addition to Chinese characters.
# NOTE(review): "," is listed twice; one of the two may have been meant to be
# a different comma variant -- confirm against the original data.
fuhao = [",", "。", ",", "、"]
# Directory that holds the source documents.
path = '../data/11篇'
# Names of every entry directly under `path`. os.listdir already returns a
# fresh list, so no copy loop is needed (and no loop variable leaks into the
# module namespace).
path_list = os.listdir(path)


def chulichangju_2(text, chulipangban_return_list, max_len=120):
    """
    Split a long sentence into pieces of at most ``max_len`` characters.

    Each piece is cut right after the last punctuation mark found within the
    next ``max_len`` characters; when that window contains no punctuation mark
    the piece is cut at exactly ``max_len`` characters. Once the remaining text
    fits within ``max_len`` it is appended unsplit, so concatenating the
    appended pieces always reproduces ``text`` (nothing is dropped).

    :param text: the sentence to split
    :param chulipangban_return_list: list the pieces are appended to (mutated in place)
    :param max_len: maximum length of one piece, must be >= 1 (default 120)
    :return: the same list object that was passed in
    :raises ValueError: if max_len is smaller than 1
    """
    if max_len < 1:
        # A non-positive limit could never make progress through the text.
        raise ValueError("max_len must be at least 1")

    fuhao = (",", "?", "!", "…")
    total = len(text)
    start = 0
    # Iterative and index-based on purpose: one recursive call per piece would
    # hit the interpreter's recursion limit on very long input, and re-slicing
    # the remaining text on every step would be quadratic.
    while total - start > max_len:
        window_end = start + max_len
        # Default to a hard cut at the window end; prefer the position right
        # after the last punctuation mark inside the window if there is one.
        cut = window_end
        for i in range(window_end - 1, start - 1, -1):
            if text[i] in fuhao:
                cut = i + 1
                break
        chulipangban_return_list.append(text[start:cut])
        start = cut
    # The remainder already fits, so keep it whole. Splitting it at its last
    # punctuation mark would discard whatever follows that mark.
    chulipangban_return_list.append(text[start:])
    return chulipangban_return_list


# Convert every .docx document listed in `path_list` into a UTF-8 text file
# with one sentence (or sentence fragment of at most 120 characters) per line.
for docx_name in path_list:
    # splitext keeps dots inside the base name intact and does not raise on
    # entries that have no extension at all (e.g. sub-directories).
    file_name, file_ext = os.path.splitext(docx_name)
    if file_ext != ".docx":
        continue
    document = docx.Document(os.path.join(path, docx_name))

    data_new = []
    for paragraph in document.paragraphs:
        # Sentences are delimited by "。". split() removes the delimiter, so it
        # is appended again to every non-empty sentence below. Empty
        # paragraphs yield a single empty piece and are skipped the same way.
        for data_dan_short in str(paragraph.text).split("。"):
            if data_dan_short == "":
                continue
            if len(data_dan_short) > 120:
                # Over-long sentences are broken into shorter fragments; only
                # the last fragment gets the sentence-ending "。".
                dan_list = chulichangju_2(data_dan_short, [])
                dan_list[-1] += "。"
                data_new.extend(dan_list)
            else:
                data_new.append(data_dan_short + "。")

    # The with-statement closes the file; no explicit close() is needed.
    with open("../data/11篇txt/{}.txt".format(file_name), "w", encoding='utf-8') as file:
        file.writelines(line + '\n' for line in data_new)