# -*- coding: utf-8 -*-

"""
@Time    :  2023/2/10 12:06
@Author  : 
@FileName: 
@Software: 
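@Describe: Pick rows whose first column contains a left curly quote (U+201C), replace each quoted span with ##### and save the results to ./data/data_jinghoa.txt.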
"""
# -*- coding: utf-8 -*-

"""
@Time    :  2023/2/9 18:53
@Author  : 
@FileName: 
@Software: 
@Describe:
"""

import pandas as pd
# Load the comparison sheet as a list of rows; column 0 is the text to inspect.
data = pd.read_excel("./data/700条效果对比.xlsx").values.tolist()

# Order rows by the length of their first cell. Blank Excel cells are read
# back by pandas as float NaN, which has no len(), so any non-string cell is
# ranked as length 0 instead of aborting the whole run with a TypeError.
data_new = sorted(
    data,
    key=lambda row: len(row[0]) if isinstance(row[0], str) else 0,
)

# Keep only the texts that contain a left curly quote; non-string cells are
# skipped because they cannot contain one.
data_yinhao = [
    row[0]
    for row in data_new
    if isinstance(row[0], str) and "“" in row[0]
]


import re
# sentence = '但相对于传统“二房东”,轻资产分散式长租公寓更专业'
# dialog_sentence = re.findall(r'“.*?”', sentence)
# print(dialog_sentence)
# if dialog_sentence:
#     for i_sentence in dialog_sentence:
#         j_sentence = i_sentence
#         j_sentence = j_sentence.replace('他', '$$$').replace('她', '$$$$').replace('它', '$$$$$')
#         sentence = sentence.replace(i_sentence, j_sentence)
# sentence = sentence.replace('她', '她(確確確確)').replace('他', '他(確確確確)').replace('它', '它(確確確確)')
# sentence = sentence.replace('*****','它们').replace('****','她们').replace('***','他们')
# sentence = sentence.replace('$$$$$','它').replace('$$$$','她').replace('$$$','他')
# # new_list = line_list[0]+'/'+sentence+'/'+line_list[2]
# # unknown_speaker_index.append(i)
# # new_lines.append(new_list)


# Matches one quoted span: "...", '...' or “...”. The quantifier is
# non-greedy, so each span ends at the nearest closing quote, and "." does
# not match a newline, so a span never crosses a line break.
RE_DIALOG = re.compile(r"\".*?\"|\'.*?\'|“.*?”")
# Presumably the punctuation marks that end a sentence (going by the name);
# not referenced by the code visible in this file.
fuhao_end_sentence = ["。",",","?","!","…"]


def get_dialogs_index(line: str):
    """
    Find the quoted spans in ``line`` and classify every character index.

    :param line: text to scan
    :return: a 3-tuple ``(dialogs_text, dialogs_index, other_index)``
        dialogs_text: the matched spans (quotes included), in order of
            appearance
        dialogs_index: ascending indices of the characters inside a span
        other_index: ascending indices of all remaining characters
    """
    dialogs_text = []
    dialogs_index = []
    # One pass over the matches yields both the span text and its positions.
    for dialog in RE_DIALOG.finditer(line):
        dialogs_text.append(dialog.group())
        dialogs_index.extend(range(dialog.start(), dialog.end()))

    # Membership is tested against a set so that computing the complement
    # stays linear in len(line) rather than quadratic.
    inside_dialog = set(dialogs_index)
    other_index = [i for i in range(len(line)) if i not in inside_dialog]

    return dialogs_text, dialogs_index, other_index

# text = "但相对于传统“二房东”,轻资产分散式长租公寓“二房东”更专业"
# get_dialogs_index(text)
# dialogs_text, dialogs_index, other_index = get_dialogs_index(text)
# for i in dialogs_text:
#     text = text.replace(i, "#####")
# print(text)

# Replace every quoted span in the selected texts with a fixed placeholder.
data_jinghao = []

for text in data_yinhao:
    # Only the matched span texts are needed here; the index lists are unused.
    dialogs_text, _, _ = get_dialogs_index(text)
    for dialog in dialogs_text:
        text = text.replace(dialog, "#####")
    data_jinghao.append(text)

print(len(data_jinghao))
# The with-statement closes the file on exit, so no explicit close() is needed.
with open("./data/data_jinghoa.txt", "w", encoding='utf-8') as file:
    file.writelines(masked + '\n' for masked in data_jinghao)


# text = "但相对于传统“二房东”,轻资产分散式长租公寓更专业"
# print(get_dialogs_index(text))
# dialogs_text, dialogs_index, other_index = get_dialogs_index(text)
# if len(dialogs_text) != 0:
#     sep = dialogs_text[0]
#     text_list = text.split(sep)
#     text_1 = text_list[0]
#     if text_1 != "":
#         text_1 = chulipangban_test_1(text_1)
#         text_1 = "。".join(text_1)
#         text_new.append(text_1)
#         text_new.append(sep)
#     else:
#         text_new.append(sep)
#     text_2 = str(sep).join(text_list[1:])
#     text_new = paragraph_test(text_2, text_new)