You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

46 lines
1.4 KiB

# -*- coding: utf-8 -*-
"""
@Time : 2023/3/15 10:38
@Author :
@FileName:
@Software:
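@Describe: Reads the paragraph text of a .docx file and writes it, one piece per row, to a single-column CSV file.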
"""
import os
import docx
import pandas as pd
def read_docx(rawpath, sep="。"):
    """Read a .docx file and return its text split into sentences.

    The text of all paragraphs is concatenated with nothing between
    paragraphs (blank paragraphs therefore contribute nothing), and the
    result is split on ``sep``. Empty pieces are dropped and ``sep`` is
    appended back onto every remaining piece, so each returned string ends
    with the delimiter -- including a trailing fragment that was not
    terminated by ``sep`` in the document.

    Args:
        rawpath: Location of the .docx file; passed unchanged to
            ``docx.Document``.
        sep: Sentence delimiter. Defaults to the Chinese full stop "。".
            It must be non-empty: ``str.split`` raises ``ValueError`` for
            an empty separator.

    Returns:
        A list of sentence strings in document order, each ending in
        ``sep``. The list is empty when the document has no text.

    Raises:
        ValueError: If ``sep`` is the empty string.
    """
    document = docx.Document(rawpath)

    # Join paragraph texts directly so a sentence that continues across a
    # paragraph break is kept as one piece.
    text = "".join(paragraph.text for paragraph in document.paragraphs)

    # split() removes the delimiter, so add it back to keep each sentence
    # terminated; skip the empty pieces produced by consecutive or trailing
    # delimiters.
    return [piece + sep for piece in text.split(sep) if piece != ""]
if __name__ == '__main__':
    # Source thesis document and the CSV file that receives its text.
    docx_path = r"E:\pycharm_workspace\duplicate_check\data\10235513_大型商业建筑人员疏散设计研究_沈福禹\10235513_沈福禹_大型商业建筑人员疏散设计研究.docx"
    csv_path = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文.csv"

    # Wrap every extracted string in its own one-element row so the
    # resulting table has exactly one column.
    rows = [[piece] for piece in read_docx(docx_path)]

    # The row index is not written; the default header row is.
    pd.DataFrame(rows).to_csv(csv_path, index=None)