# -*- coding: utf-8 -*-
"""
@Time    : 2023/2/10 12:06 (an earlier copy of this header was dated 2023/2/9 18:53)
@Author  :
@FileName:
@Software:
@Describe: Load the comparison spreadsheet, keep the first-column texts that
           contain a left curly quote, and provide ``get_dialogs_index`` for
           locating quoted spans inside a text.
"""
import re

import pandas as pd

# Matches the shortest span enclosed in straight double quotes, straight
# single quotes, or curly double quotes.
RE_DIALOG = re.compile(r"\".*?\"|\'.*?\'|“.*?”")

# Punctuation marks listed as sentence terminators; not referenced in the
# visible part of this file.
fuhao_end_sentence = ["。", ",", "?", "!", "…"]


def get_dialogs_index(line: str) -> tuple:
    """Return the quoted spans of *line* together with their character indices.

    :param line: the text to scan.
    :return: a 3-tuple ``(dialogs_text, dialogs_index, other_index)`` where
        ``dialogs_text`` is the list of quoted substrings (quotes included) in
        order of appearance, ``dialogs_index`` is the list of character
        positions covered by those substrings, and ``other_index`` is the list
        of all remaining character positions of ``line``.

    Example::

        >>> get_dialogs_index('a“b”c')
        (['“b”'], [1, 2, 3], [0, 4])
    """
    dialogs_text = []
    dialogs_index = []
    # A single pass yields both the matched text and its span.
    for match in RE_DIALOG.finditer(line):
        dialogs_text.append(match.group())
        dialogs_index.extend(range(match.start(), match.end()))

    # Set membership keeps the complement computation linear in len(line).
    covered = set(dialogs_index)
    other_index = [pos for pos in range(len(line)) if pos not in covered]
    return dialogs_text, dialogs_index, other_index


# Rows of the spreadsheet as plain lists, ordered by the length of the text in
# the first column.
data = pd.read_excel("./data/700条效果对比.xlsx").values.tolist()
data_new = sorted(data, key=lambda x: len(x[0]))

# First-column texts that contain at least one left curly quote.
data_yinhao = [row[0] for row in data_new if "“" in row[0]]
# Replace every quoted span found by get_dialogs_index with the placeholder
# "#####" in each text of data_yinhao, then write the results one per line.
data_jinghao = []
for text in data_yinhao:
    dialogs_text, _, _ = get_dialogs_index(text)
    # Sequential str.replace is kept on purpose: it also replaces later
    # occurrences of the same quoted string that the regex did not match.
    for dialog in dialogs_text:
        text = text.replace(dialog, "#####")
    data_jinghao.append(text)

print(len(data_jinghao))

# The with-statement closes the file; no explicit close() is needed.
with open("./data/data_jinghoa.txt", "w", encoding='utf-8') as file:
    file.writelines(masked + '\n' for masked in data_jinghao)