# -*- coding: utf-8 -*-

"""
@Time    :  2023/3/15 10:38
@Author  : 
@FileName: 
@Software: 
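@Describe: Read a .docx file, split its text into sentences on the Chinese
           full stop "。", and export the sentences to a CSV file.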
"""
import os
import docx
import pandas as pd


def read_docx(rawpath):
    """Read a .docx file and split its text into sentences.

    The text of all paragraphs is concatenated with no separator (so
    paragraph boundaries are not preserved) and then split on the Chinese
    full stop "。".

    Args:
        rawpath: Location of the .docx document; passed unchanged to
            ``docx.Document``.

    Returns:
        A list of non-empty sentence strings, each ending with "。". Note
        that a trailing fragment that had no closing "。" in the document
        still gets one appended, so every returned item is terminated the
        same way.
    """
    document = docx.Document(rawpath)

    # Empty paragraphs contribute nothing to the joined text, so no explicit
    # filtering is needed before concatenating.
    full_text = "".join(paragraph.text for paragraph in document.paragraphs)

    # split() drops the delimiter; re-attach it and skip the empty pieces
    # produced by consecutive or trailing "。".
    return [sentence + "。" for sentence in full_text.split("。") if sentence]


if __name__ == '__main__':
    # Input document (absolute path) and output CSV (relative to the current
    # working directory).
    pathls = r"E:\pycharm_workspace\duplicate_check\data\10235513_大型商业建筑人员疏散设计研究_沈福禹\10235513_沈福禹_大型商业建筑人员疏散设计研究.docx"
    path_csv = "data/10235513_大型商业建筑人员疏散设计研究_沈福禹/大型商业建筑人员疏散设计研究_全文.csv"

    sentences = read_docx(pathls)

    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    out_dir = os.path.dirname(path_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # A flat list of strings becomes a single column (named 0) with one
    # sentence per row; index=False omits the row-index column.
    pd.DataFrame(sentences).to_csv(path_csv, index=False)