You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
115 lines
2.9 KiB
115 lines
2.9 KiB
![]()
2 years ago
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
"""
|
||
|
@Time : 2023/2/14 14:19
|
||
|
@Author :
|
||
|
@FileName:
|
||
|
@Software:
|
||
|
@Describe:
|
||
|
"""
|
||
|
import os
|
||
|
import pandas as pd
|
||
|
import docx
|
||
|
import win32com.client as wc
|
||
|
import operator
|
||
|
|
||
|
|
||
|
def is_chinese(uchar):
    """Tell whether a unicode character is a Chinese character.

    :param uchar: the character to classify.
    :return: True when ``uchar`` compares within the inclusive range
        U+4E00 .. U+9FA5, otherwise False.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'
|
||
|
|
||
|
|
||
|
def snetence(text):
    """Check that every character of *text* is acceptable.

    A character is acceptable when ``is_chinese`` reports True for it or
    when it is one of the marks listed in the module-level ``fuhao``.

    :param text: the string to inspect.
    :return: True when all characters are acceptable (including when
        *text* is empty), False as soon as one is not.
    """
    return all(is_chinese(char) or char in fuhao for char in text)
|
||
|
|
||
|
|
||
|
# Marks that snetence() accepts in addition to Chinese characters.
fuhao = [",", "。", ",", "、"]

# Directory that holds the input documents.
path = '../data/11篇'

# Name of every entry found directly inside `path`.
path_list = list(os.listdir(path))
|
||
|
|
||
|
|
||
|
def chulichangju_2(text, chulipangban_return_list, max_len=120):
    """Split a long sentence into pieces of at most *max_len* characters.

    Pieces are taken from the front of the remaining text. Inside the first
    *max_len* characters the cut is made right after the last mark from
    ``fuhao``; when that window contains no such mark, the whole window
    becomes the piece. Everything after the cut is carried over to the next
    round, so joining the pieces always reproduces *text*.

    An empty *text* appends a single empty string, so the list is never
    left without an entry for this call.

    :param text: the sentence to split.
    :param chulipangban_return_list: list that receives the pieces; it is
        extended in place.
    :param max_len: maximum length of one piece, at least 1.
    :return: the same list object that was passed in.
    :raises ValueError: if *max_len* is smaller than 1.
    """
    if max_len < 1:
        # A non-positive window could never consume any text.
        raise ValueError("max_len must be at least 1")

    # NOTE(review): these marks are ASCII as written; if the input uses
    # full-width punctuation they will not match -- confirm against the data.
    fuhao = (",", "?", "!", "…")

    pending = text
    while True:
        window = pending[:max_len]
        # Index of the right-most mark in the window, or -1 when there is none.
        cut = max(window.rfind(mark) for mark in fuhao)
        piece = window if cut == -1 else window[:cut + 1]
        chulipangban_return_list.append(piece)

        # Always keep what follows the cut. Previously the tail was dropped
        # when nothing lay beyond the window, which lost text.
        pending = pending[len(piece):]
        if not pending:
            return chulipangban_return_list
|
||
|
|
||
|
|
||
|
# path = "../data/11篇/13139551_于丰源_在线考试系统-原文.docx"
|
||
|
# Convert every .docx file listed in `path_list` into a UTF-8 text file that
# holds one sentence (or one fragment of an over-long sentence) per line.
for docx_name in path_list:
    # splitext copes with names that contain no dot or several dots;
    # indexing the result of split(".") raised IndexError on the former
    # and misread the extension of the latter.
    file_name, file_ext = os.path.splitext(docx_name)
    if file_ext != ".docx":
        continue

    document = docx.Document(path + "/" + docx_name)

    data_new = []
    # Walk all paragraphs of the document.
    for paragraph in document.paragraphs:
        # Sentences are delimited by "。"; blank paragraphs and trailing or
        # repeated delimiters produce empty pieces, which are skipped.
        for data_dan_short in paragraph.text.split("。"):
            if data_dan_short == "":
                continue
            if len(data_dan_short) > 120:
                # Break over-long sentences up and put the sentence
                # terminator back on the final fragment only.
                dan_list = chulichangju_2(data_dan_short, [])
                dan_list[-1] += "。"
                data_new.extend(dan_list)
            else:
                data_new.append(data_dan_short + "。")

    # The with-statement closes the file; no explicit close() is needed.
    with open("../data/11篇txt/{}.txt".format(file_name), "w", encoding='utf-8') as file:
        file.writelines(line + '\n' for line in data_new)
|