You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
46 lines
1.4 KiB
46 lines
1.4 KiB
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
@Time : 2023/3/15 10:38
|
|
@Author :
|
|
@FileName:
|
|
@Software:
|
|
@Describe: Read a .docx file, split its text into sentences on "。", and save them to a single-column CSV.
|
|
"""
|
|
import os
|
|
import docx
|
|
import pandas as pd
|
|
|
|
|
|
def read_docx(rawpath):
    """Read a .docx file and split its text into sentences on "。".

    The text of all paragraphs is concatenated without a separator, so a
    paragraph break is not treated as a sentence boundary; the combined text
    is then cut after every Chinese full stop ("。").

    Args:
        rawpath: Path of the .docx file, passed to ``docx.Document``.

    Returns:
        A list of str in document order. Every sentence that ends with "。"
        in the document keeps that terminator. Text after the final "。" is
        returned unchanged (no terminator is invented for it). Empty
        segments are dropped, so an empty document yields ``[]``.
    """
    document = docx.Document(rawpath)

    # Collect the text of every paragraph. Empty paragraphs contribute
    # nothing to the joined string, so no explicit filtering is needed.
    full_text = "".join(paragraph.text for paragraph in document.paragraphs)

    segments = full_text.split("。")
    # str.split always returns at least one item; the last one is whatever
    # follows the final "。" (an empty string when the text ends with "。").
    tail = segments.pop()

    # Every remaining segment was followed by a "。" in the text: restore it.
    sentences = [segment + "。" for segment in segments if segment]
    if tail:
        # Unterminated trailing text: keep it, but do not append a "。" that
        # the document does not contain.
        sentences.append(tail)
    return sentences
|
|
|
|
|
|
if __name__ == '__main__':
    # Input thesis (.docx) and the CSV file that receives its sentences.
    pathls = r"E:\pycharm_workspace\duplicate_check\data\10235513_大型商业建筑人员疏散设计研究_沈福禹\10235513_沈福禹_大型商业建筑人员疏散设计研究.docx"
    path_csv = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文.csv"

    sentences = read_docx(pathls)

    # One sentence per row, in a single column.
    rows = [[sentence] for sentence in sentences]

    # index=False is the documented way to leave the row index out of the
    # CSV; the previous index=None only worked because None is falsy.
    pd.DataFrame(rows).to_csv(path_csv, index=False)
|
|
|