普通版降重
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

115 lines
2.9 KiB

# -*- coding: utf-8 -*-
"""
@Time : 2023/2/14 14:19
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator
def is_chinese(uchar):
    """Return whether ``uchar`` falls in the range U+4E00..U+9FA5.

    The check is a plain string comparison against the two boundary
    characters, both of which are included in the range.

    :param uchar: the string to test; ``snetence`` in this file passes one
        character at a time.
    :return: ``True`` if ``uchar`` compares within the range, else ``False``.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'
def snetence(text):
    """Return whether every character of ``text`` is an accepted character.

    A character is accepted when ``is_chinese`` returns true for it or when it
    is a member of the module-level ``fuhao`` list.

    :param text: the string whose characters are checked one by one.
    :return: ``True`` if all characters are accepted (including when ``text``
        is empty), ``False`` as soon as one character is rejected.
    """
    # all() stops at the first rejected character, like the explicit
    # flag-and-break loop it replaces.
    return all(is_chinese(ch) or ch in fuhao for ch in text)
# Characters that snetence() accepts in addition to the ones is_chinese() accepts.
# NOTE(review): three of the four entries are empty strings, which never equal
# a single character, so only "," has any effect here -- possibly other
# punctuation marks were lost from this literal; confirm against the author.
fuhao = ["","",",",""]
# Directory that is scanned for input documents.
path = '../data/11篇'
# os.listdir() already returns a fresh list of entry names, so it is used
# directly instead of being copied element by element.
path_list = os.listdir(path)
def chulichangju_2(text, chulipangban_return_list, max_len=120, delimiters=None):
    """Split ``text`` into pieces of at most ``max_len`` characters.

    The text is consumed from left to right.  While more than ``max_len``
    characters remain, the next ``max_len`` characters form a window; the
    window is cut right after the last delimiter found in it (the delimiter
    stays at the end of the emitted piece), or at the window end when it holds
    no delimiter.  Whatever follows the cut is carried over into the next
    window.  Once the remainder fits in a single window it is emitted whole,
    so no text is ever dropped.

    The work is done iteratively, so the length of ``text`` is not limited by
    the interpreter's recursion depth.

    :param text: the string to split.
    :param chulipangban_return_list: list that receives the pieces; it is
        extended in place.
    :param max_len: maximum length of a piece (default 120).
    :param delimiters: container of single characters after which a cut is
        preferred; ``None`` selects the built-in default list.
    :return: the same list object that was passed in.
    :raises ValueError: if ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")
    if delimiters is None:
        # NOTE(review): every entry of this default is an empty string, which
        # never equals a single character, so with the default the text is
        # simply cut every max_len characters -- presumably punctuation marks
        # were lost from this literal; confirm and restore them.
        delimiters = ["","","",""]

    total = len(text)
    start = 0
    # Loop only while something would be left over after the current window.
    while total - start > max_len:
        window_end = start + max_len
        cut = window_end  # hard cut unless a delimiter is found below
        # Scan the window backwards so the piece is as long as possible.
        for i in range(window_end - 1, start - 1, -1):
            if text[i] in delimiters:
                cut = i + 1  # keep the delimiter with the piece it ends
                break
        chulipangban_return_list.append(text[start:cut])
        start = cut
    # The remainder fits in one window: emit it unchanged (an empty ``text``
    # therefore yields a single empty piece).
    chulipangban_return_list.append(text[start:])
    return chulipangban_return_list
# NOTE(review): both literals below are empty strings in this file.  An empty
# separator makes str.split() raise ValueError, and appending an empty suffix
# is a no-op -- presumably a sentence-ending punctuation mark was lost from
# each of them; confirm and restore the intended characters.
SENTENCE_SEP = ""
SENTENCE_SUFFIX = ""
# Directory that receives one .txt file per converted document.
OUTPUT_DIR = "../data/11篇txt"


def _split_paragraph(paragraph_text):
    """Turn one paragraph into output lines.

    The paragraph is split on ``SENTENCE_SEP``; empty sentences are skipped,
    sentences longer than 120 characters are broken up by ``chulichangju_2``,
    and ``SENTENCE_SUFFIX`` is appended to the end of every sentence.

    :param paragraph_text: text of a single paragraph.
    :return: list of lines for this paragraph (empty for an empty paragraph).
    """
    paragraph_text = str(paragraph_text)
    if SENTENCE_SEP:
        sentences = paragraph_text.split(SENTENCE_SEP)
    else:
        # str.split() rejects an empty separator, so the whole paragraph is
        # treated as a single sentence instead of crashing.
        sentences = [paragraph_text]

    lines = []
    for sentence in sentences:
        if sentence == "":
            continue
        if len(sentence) > 120:
            pieces = chulichangju_2(sentence, [])
            # Only the last piece is the real end of the sentence.
            pieces[-1] += SENTENCE_SUFFIX
            lines.extend(pieces)
        else:
            lines.append(sentence + SENTENCE_SUFFIX)
    return lines


# Make sure the output directory exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for docx_name in path_list:
    # splitext() copes with names that contain no dot (no IndexError) or
    # several dots (only the last suffix counts as the extension).
    file_name, file_ext = os.path.splitext(docx_name)
    if file_ext != ".docx":
        continue
    document = docx.Document(os.path.join(path, docx_name))

    # Collect the lines of every paragraph in document order.
    data_new = []
    for paragraph in document.paragraphs:
        data_new.extend(_split_paragraph(paragraph.text))

    # The with-block closes the file; no explicit close() is needed.
    with open("{}/{}.txt".format(OUTPUT_DIR, file_name), "w", encoding='utf-8') as file:
        file.writelines(line + '\n' for line in data_new)