You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
102 lines
2.4 KiB
102 lines
2.4 KiB
![]()
2 years ago
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
"""
|
||
|
@Time : 2023/2/14 14:19
|
||
|
@Author :
|
||
|
@FileName:
|
||
|
@Software:
|
||
|
@Describe:
|
||
|
"""
|
||
|
import os
|
||
|
import pandas as pd
|
||
|
import docx
|
||
|
import win32com.client as wc
|
||
|
import operator
|
||
|
|
||
|
|
||
|
def is_chinese(uchar):
    """
    Tell whether a unicode character is a Chinese character.

    The check is a plain range test against U+4E00..U+9FA5 (inclusive).

    :param uchar: the character (string) to test
    :return: True when ``uchar`` compares inside the range, otherwise False
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'
|
||
|
|
||
|
|
||
|
def snetence(text):
    """
    Check that a text consists only of Chinese characters and allowed marks.

    Each character must either satisfy ``is_chinese`` or be listed in the
    module-level ``fuhao`` punctuation list.

    :param text: the string to inspect
    :return: True when every character is accepted (also for an empty
        string), False as soon as one character is rejected
    """
    return all(is_chinese(char) or char in fuhao for char in text)
|
||
|
|
||
|
|
||
|
# Punctuation marks that snetence() accepts in addition to Chinese characters.
# NOTE(review): "," is listed twice; one entry was possibly meant to be the
# full-width comma -- confirm against the original file.
fuhao = [",", "。", ",", "、"]

# Input folder holding the .docx files and output folder for the CSV files.
path = '../data/11篇'
csv_dir = "../data/11篇csv/"

path_list = os.listdir(path)
# print(path_list)

# to_csv() does not create missing folders, so make sure the target exists.
os.makedirs(csv_dir, exist_ok=True)

for docx_name in path_list:
    # splitext() copes with entries that have no dot (e.g. sub-folders) and
    # with names that contain several dots; split(".")[1] raised IndexError
    # on the former and picked the wrong piece for the latter.
    file_name, file_ext = os.path.splitext(docx_name)
    if file_ext != ".docx":
        continue

    document = docx.Document(os.path.join(path, docx_name))

    # Collect the text of every paragraph, shortest paragraphs first.
    data = sorted(
        (paragraph.text for paragraph in document.paragraphs),
        key=len,
    )

    # NOTE(review): the original looped over the characters of each paragraph
    # and sentence looking for "章", but its `continue` only affected that
    # inner loop, so nothing was ever filtered. That (non-)behaviour is kept
    # here; add an explicit `"章" in text` check if such text should be
    # dropped.
    data_new = []
    for data_dan in data:
        # Paragraphs shorter than 15 characters (including empty ones) are
        # skipped.
        if len(data_dan) < 15:
            continue
        # Split the paragraph on the Chinese full stop and keep only the
        # pieces whose length is between 10 and 120 characters.
        data_new.extend(
            data_dan_short
            for data_dan_short in data_dan.split("。")
            if 10 <= len(data_dan_short) <= 120
        )

    # Stable sort: sentences of equal length keep their collection order.
    data_new.sort(key=len)

    # One sentence per row, written as a single-column CSV named after the
    # source document.
    data_df = [[sentence] for sentence in data_new]
    pd.DataFrame(data_df).to_csv(csv_dir + file_name + ".csv", index=False)
|