drop_weight_rewrite/data_do/处理11篇顺序输入.py

# -*- coding: utf-8 -*-

"""
@Time    :  2023/2/14 14:19
@Author  :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator


def is_chinese(uchar):
    """
    Check whether a unicode character is a Chinese character.

    :param uchar: the string to test; it is compared as a whole against the
        bounds U+4E00 and U+9FA5.
    :return: True when uchar lies between U+4E00 and U+9FA5 (both inclusive),
        otherwise False.
    """
    lower_bound, upper_bound = u'\u4e00', u'\u9fa5'
    return lower_bound <= uchar <= upper_bound


def snetence(text):
    """
    Tell whether text consists only of Chinese characters and allowed punctuation.

    :param text: iterable of characters to inspect.
    :return: True when every character either passes is_chinese() or is listed
        in the module-level ``fuhao``; False as soon as one character is
        neither. An empty text yields True.
    """
    return all(is_chinese(char) or char in fuhao for char in text)


# Punctuation marks that snetence() accepts in addition to Chinese characters.
fuhao = ["，","。",",","、"]
# Input directory; every entry name found in it is collected into path_list.
path = '../data/11篇'
path_list = list(os.listdir(path))
# print(path_list)


def chulichangju_2(text, chulipangban_return_list, max_len=120):
    """
    Split a long piece of text into chunks of at most ``max_len`` characters.

    While more than ``max_len`` characters remain, the first ``max_len``
    characters are searched backwards for a break character
    ("，", "？", "！" or "…"). The chunk ends right after the last break
    character found; if there is none, the chunk is cut at exactly ``max_len``
    characters. Whatever remains once it fits within ``max_len`` is appended
    as the final chunk without further splitting, so no text is lost:
    ``"".join(result)`` always equals ``text`` when the list starts empty.

    :param text: the text to split.
    :param chulipangban_return_list: list that receives the chunks; it is
        mutated in place (chunks are appended in order).
    :param max_len: maximum chunk length in characters, 120 by default.
    :return: the same list object passed as ``chulipangban_return_list``.
    :raises ValueError: if ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        # A non-positive limit could never consume any text.
        raise ValueError("max_len must be a positive integer")

    break_chars = ("，", "？", "！", "…")
    remaining = text
    while len(remaining) > max_len:
        window = remaining[:max_len]
        # Position of the right-most break character in the window, -1 if none.
        cut = max(window.rfind(char) for char in break_chars)
        # Keep the break character at the end of its chunk; with no break
        # character available, fall back to a hard cut at max_len.
        end = cut + 1 if cut != -1 else max_len
        chulipangban_return_list.append(remaining[:end])
        remaining = remaining[end:]

    # The tail already fits, so it is kept whole. Splitting it again at a
    # break character would discard everything after that character.
    chulipangban_return_list.append(remaining)
    return chulipangban_return_list


# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
# Convert every .docx entry of `path` into a UTF-8 text file that holds one
# sentence (or one fragment of an over-long sentence) per line.
for docx_name in path_list:
    # os.path.splitext copes with names that contain several dots and with
    # names that have no dot at all (e.g. sub-directories returned by
    # os.listdir); indexing the result of split(".") would read the wrong
    # part or raise IndexError for those.
    file_name, file_type = os.path.splitext(docx_name)
    if file_type != ".docx":
        continue
    document = docx.Document(path + "/" + docx_name)

    data_new = []
    for paragraph in document.paragraphs:
        # "。" is the sentence delimiter. Empty pieces (from empty paragraphs
        # or from a trailing/repeated delimiter) are skipped.
        for data_dan_short in paragraph.text.split("。"):
            if data_dan_short == "":
                continue
            if len(data_dan_short) > 120:
                # Over-long sentence: break it into fragments and restore the
                # delimiter on the last fragment only.
                dan_list = chulichangju_2(data_dan_short, [])
                dan_list[-1] += "。"
                data_new.extend(dan_list)
            else:
                data_new.append(data_dan_short + "。")

    # The with-statement closes the file; no explicit close() is needed.
    with open("../data/11篇txt/{}.txt".format(file_name), "w", encoding='utf-8') as file:
        file.writelines(line + '\n' for line in data_new)
第一次提交，非batch预测版本 3 years ago			`# -- coding: utf-8 --`

			`"""`
			`@Time : 2023/2/14 14:19`
			`@Author :`
			`@FileName:`
			`@Software:`
			`@Describe:`
			`"""`
			`import os`
			`import pandas as pd`
			`import docx`
			`import win32com.client as wc`
			`import operator`


			`def is_chinese(uchar):`
			`"""`
			`判断一个unicode是否是汉字`
			`:param uchar:`
			`:return:`
			`"""`
			`if uchar >= u'\u4e00' and uchar<=u'\u9fa5':`
			`return True`
			`else:`
			`return False`


			`def snetence(text):`
			`bool_ = True`
			`for i in text:`
			`bool_1 = is_chinese(i)`
			`if bool_1 == True:`
			`continue`
			`else:`
			`if i in fuhao:`
			`continue`
			`else:`
			`bool_ = False`
			`break`
			`return bool_`


			`fuhao = ["，","。",",","、"]`
			`path = '../data/11篇'`
			`path_list = []`
			`for file_name in os.listdir(path):`
			`path_list.append(file_name)`
			`# print(path_list)`


			`def chulichangju_2(text, chulipangban_return_list):`
			`fuhao = ["，","？","！","…"]`
			`text_1 = text[:120]`
			`text_2 = text[120:]`
			`text_1_new = ""`
			`for i in range(len(text_1)-1, -1, -1):`
			`if text_1[i] in fuhao:`
			`text_1_new = text_1[:i]`
			`text_1_new += text_1[i]`
			`chulipangban_return_list.append(text_1_new)`
			`if text_2 != "":`
			`if i+1 != 120:`
			`text_2 = text_1[i+1:] + text_2`
			`break`
			`# else:`
			`# chulipangban_return_list.append(text_1)`
			`if text_1_new == "":`
			`chulipangban_return_list.append(text_1)`
			`if text_2 != "":`
			`chulipangban_return_list = chulichangju_2(text_2, chulipangban_return_list)`
			`return chulipangban_return_list`


			`# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"`
			`for docx_name in path_list:`
			`data = []`
			`data_new = []`
			`file_name = docx_name.split(".")[0]`
			`file_type = docx_name.split(".")[1]`
			`if file_type == "docx":`
			`document = docx.Document(path + "/" + docx_name)`
			`else:`
			`continue`
			`#获取所有段落`
			`all_paragraphs = document.paragraphs`
			`for paragraph in all_paragraphs:`
			`#打印每一个段落的文字`
			`data.append(paragraph.text)`
			`for data_dan in data:`
			`if data_dan == "":`
			`continue`
			`# else:`
			`# bool_ = snetence(data_dan)`
			`# if bool_ == True:`
			`# data_new.append(data_dan)`
			`else:`
			`data_list = str(data_dan).split("。")`
			`for data_dan_short in data_list:`
			`if data_dan_short == "":`
			`continue`
			`if len(data_dan_short) > 120:`
			`dan_list = chulichangju_2(data_dan_short, [])`
			`dan_list[-1] += "。"`
			`data_new.extend(dan_list)`
			`else:`
			`data_dan_short += "。"`
			`data_new.append(data_dan_short)`


			`with open("../data/11篇txt/{}.txt".format(file_name), "w", encoding='utf-8') as file:`
			`for i in data_new:`
			`file.write(i + '\n')`
			`file.close()`