drop_weight_rewrite/筛选引号数据.py


								# -*- coding: utf-8 -*-


								"""

								@Time    :  2023/2/10 12:06

								@Author  :

								@FileName:

								@Software:

								@Describe:

								"""

								# -*- coding: utf-8 -*-


								"""

								@Time    :  2023/2/9 18:53

								@Author  :

								@FileName:

								@Software:

								@Describe:

								"""


								import pandas as pd

								# Load the comparison spreadsheet as a list of rows (each row is a list of cell values).
								data = pd.read_excel("./data/700条效果对比.xlsx").values.tolist()

								# Order rows by the length of their first column, shortest first.
								# NOTE(review): assumes every first-column cell is a string; an empty
								# cell (read by pandas as NaN) would make len() raise -- confirm the
								# sheet has no blanks in that column.
								data_new = sorted(data,key=lambda x:len(x[0]))

								# Keep only the first-column texts that contain an opening curly quote.
								# A substring membership test replaces the manual per-character scan
								# with a boolean flag; the selected texts and their order are unchanged.
								data_yinhao = [row[0] for row in data_new if "“" in row[0]]


								import re

								# sentence = '但相对于传统“二房东”,轻资产分散式长租公寓更专业'

								# dialog_sentence = re.findall(r'“.*?”', sentence)

								# print(dialog_sentence)

								# if dialog_sentence:

								#     for i_sentence in dialog_sentence:

								#         j_sentence = i_sentence

								#         j_sentence = j_sentence.replace('他', '$$$').replace('她', '$$$$').replace('它', '$$$$$')

								#         sentence = sentence.replace(i_sentence, j_sentence)

								# sentence = sentence.replace('她', '她（確確確確）').replace('他', '他（確確確確）').replace('它', '它（確確確確）')

								# sentence = sentence.replace('*****','它们').replace('****','她们').replace('***','他们')

								# sentence = sentence.replace('$$$$$','它').replace('$$$$','她').replace('$$$','他')

								# # new_list = line_list[0]+'/'+sentence+'/'+line_list[2]

								# # unknown_speaker_index.append(i)

								# # new_lines.append(new_list)


								# Matches the shortest span enclosed in straight double quotes, straight
								# single quotes, or curly double quotes. "." does not match newlines, so
								# a quoted span must open and close on the same line.
								RE_DIALOG = re.compile(r"\".*?\"|\'.*?\'|“.*?”")

								# Punctuation marks; not referenced by get_dialogs_index below.
								# NOTE(review): the name suggests sentence/clause terminators -- confirm usage elsewhere.
								fuhao_end_sentence = ["。","，","？","！","…"]


								def get_dialogs_index(line: str):
								    """Find quoted spans in ``line`` and classify every character index.

								    :param line: text to scan
								    :return: a 3-tuple ``(dialogs_text, dialogs_index, other_index)``:
								        dialogs_text: matched quoted spans (quotes included), in order
								        dialogs_index: character indices covered by any quoted span
								        other_index: the remaining character indices of ``line``
								    """
								    dialogs_text = []
								    dialogs_index = []
								    # One pass over the matches yields both the text and the positions,
								    # instead of running the pattern twice (finditer + findall).
								    for match in RE_DIALOG.finditer(line):
								        dialogs_text.append(match.group())
								        dialogs_index.extend(range(match.start(), match.end()))

								    # Set membership keeps the complement computation linear; testing
								    # against the list made it quadratic in the length of the line.
								    quoted_positions = set(dialogs_index)
								    other_index = [i for i in range(len(line)) if i not in quoted_positions]

								    return dialogs_text, dialogs_index, other_index


								# text = "但相对于传统“二房东”,轻资产分散式长租公寓“二房东”更专业"

								# get_dialogs_index(text)

								# dialogs_text, dialogs_index, other_index = get_dialogs_index(text)

								# for i in dialogs_text:

								#     text = text.replace(i, "#####")

								# print(text)


								# Texts from data_yinhao with every quoted span masked by "#####".
								data_jinghao = []

								for text in data_yinhao:
								    # A single call is enough; the earlier extra call discarded its result.
								    dialogs_text, dialogs_index, other_index = get_dialogs_index(text)
								    # str.replace substitutes every occurrence of each quoted span in the text.
								    for i in dialogs_text:
								        text = text.replace(i, "#####")
								    data_jinghao.append(text)

								print(len(data_jinghao))

								# Write one masked text per line. The with-statement closes the file on
								# exit, so no explicit close() is needed.
								with open("./data/data_jinghoa.txt", "w", encoding='utf-8') as file:
								    for i in data_jinghao:
								        file.write(i + '\n')


								# text = "但相对于传统“二房东”,轻资产分散式长租公寓更专业"

								# print(get_dialogs_index(text))

								# dialogs_text, dialogs_index, other_index = get_dialogs_index(text)

								# if len(dialogs_text) != 0:

								#     sep = dialogs_text[0]

								#     text_list = text.split(sep)

								#     text_1 = text_list[0]

								#     if text_1 != "":

								#         text_1 = chulipangban_test_1(text_1)

								#         text_1 = "。".join(text_1)

								#         text_new.append(text_1)

								#         text_new.append(sep)

								#     else:

								#         text_new.append(sep)

								#     text_2 = str(sep).join(text_list[1:])

								#     text_new = paragraph_test(text_2, text_new)