普通版降重
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

102 lines
2.4 KiB

# -*- coding: utf-8 -*-
"""
@Time : 2023/2/14 14:19
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import pandas as pd
import docx
import win32com.client as wc
import operator
def is_chinese(uchar):
    """
    Report whether a character lies in the range U+4E00..U+9FA5.

    :param uchar: the string to test. It is compared against the two
        bounds with ordinary ``str`` ordering, so a single character is
        the expected input; longer strings compare lexicographically.
    :return: True when uchar is between U+4E00 and U+9FA5 inclusive,
        otherwise False.
    """
    # The chained comparison already yields a bool, so no if/else is needed.
    return u'\u4e00' <= uchar <= u'\u9fa5'
def snetence(text):
    """
    Check that every character of text is a Chinese character or an
    allowed symbol.

    :param text: iterable of characters (normally a str) to validate.
    :return: True when each character either passes ``is_chinese`` or is
        contained in the module-level ``fuhao`` list; False as soon as one
        character satisfies neither. Empty text yields True.
    """
    # all() short-circuits at the first character that fails both checks.
    return all(is_chinese(ch) or ch in fuhao for ch in text)
# Symbols that snetence() accepts in addition to Chinese characters.
# NOTE(review): three of the entries are empty strings, which can never equal
# a single character; they look like punctuation lost in transit — confirm
# the intended symbols before relying on snetence().
fuhao = ["","",",",""]

# Directory that is scanned for .docx input files.
path = '../data/11篇'
path_list = os.listdir(path)

for docx_name in path_list:
    # splitext keeps dots that occur inside the base name and also copes with
    # entries that have no extension at all (e.g. sub-directories), where
    # indexing the result of split(".") would raise IndexError.
    file_name, file_type = os.path.splitext(docx_name)
    if file_type != ".docx":
        continue
    document = docx.Document(os.path.join(path, docx_name))

    # Collect the text of every paragraph, shortest first. This ordering
    # decides how equal-length sentences are ordered after the final
    # (stable) sort below.
    data = sorted((paragraph.text for paragraph in document.paragraphs), key=len)

    data_new = []
    for data_dan in data:
        # Paragraphs shorter than 15 characters (including empty ones) are
        # not split into sentences at all.
        if len(data_dan) < 15:
            continue
        # NOTE(review): the separator arrived here as an empty string, which
        # makes str.split raise ValueError("empty separator"). The Chinese
        # full stop is assumed to be the intended sentence delimiter —
        # confirm against the original script.
        for data_dan_short in data_dan.split("。"):
            # Keep only sentences of 10 to 120 characters.
            if 10 <= len(data_dan_short) <= 120:
                data_new.append(data_dan_short)

    data_new.sort(key=len)

    # One sentence per row, single column, no index column in the CSV.
    data_df = [[sentence] for sentence in data_new]
    pd.DataFrame(data_df).to_csv(
        os.path.join("../data/11篇csv", file_name + ".csv"), index=False
    )