# -*- coding: utf-8 -*-
"""
@Time    : 2023/3/15 10:38
@Author  :
@FileName:
@Software:
@Describe: Split the text of a .docx document into sentences and export
           them to a single-column CSV file.
"""
import os

import docx
import pandas as pd


def _split_sentences(paragraphs):
    """Join non-empty paragraph strings and split the result into sentences.

    Paragraphs that are exactly the empty string are dropped; the remaining
    ones are concatenated with no separator, so text of a paragraph that does
    not end in "。" runs on into the next paragraph.

    The joined text is split on the full stop "。". Every non-empty fragment
    is returned with "。" appended.

    NOTE: a trailing fragment that had no "。" in the document also gets one
    appended. This matches the long-standing output format and is kept
    deliberately.

    :param paragraphs: iterable of paragraph text strings.
    :return: list of sentence strings, each ending with "。".
    """
    text = "".join(part for part in paragraphs if part)
    return [fragment + "。" for fragment in text.split("。") if fragment]


def read_docx(rawpath):
    """Read a .docx file and return its body text as a list of sentences.

    Only the document's top-level paragraphs (``Document.paragraphs``) are
    read. See ``_split_sentences`` for how the text is split.

    :param rawpath: path of the .docx file to open.
    :return: list of sentence strings, each ending with "。".
    """
    document = docx.Document(rawpath)
    # Collect the text of every paragraph in document order.
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return _split_sentences(paragraph_texts)


def main():
    """Convert the configured .docx file into a one-sentence-per-row CSV."""
    pathls = r"E:\pycharm_workspace\duplicate_check\data\10235513_大型商业建筑人员疏散设计研究_沈福禹\10235513_沈福禹_大型商业建筑人员疏散设计研究.docx"
    path_csv = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文.csv"

    sentences = read_docx(pathls)
    # One sentence per row, in a single column.
    rows = [[sentence] for sentence in sentences]

    # The output path is relative to the working directory; make sure its
    # directory exists so that to_csv does not fail on a fresh checkout.
    out_dir = os.path.dirname(path_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # index=False: do not write the DataFrame row index as an extra column.
    pd.DataFrame(rows).to_csv(path_csv, index=False)


if __name__ == '__main__':
    main()