You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
115 lines
3.4 KiB
115 lines
3.4 KiB
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/10 12:06
@Author :
@FileName:
@Software:
@Describe:
"""
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/9 18:53
@Author :
@FileName:
@Software:
@Describe:
"""
|
|
|
|
import pandas as pd
|
|
# Load the comparison spreadsheet as a list of rows. Only column 0 of each
# row (the text) is used below.
data = pd.read_excel("./data/700条效果对比.xlsx").values.tolist()

# Order the rows by the length of their column-0 text, shortest first.
# sorted() is stable, so rows with equal-length text keep their sheet order.
data_new = sorted(data, key=lambda x: len(x[0]))

# Keep only the texts that contain a left curly quotation mark (“).
# NOTE(review): len() and the membership test both assume column 0 is always
# a string; an empty cell (NaN) would raise TypeError — confirm the sheet has
# no blanks in that column.
data_yinhao = [row[0] for row in data_new if "“" in row[0]]
|
|
|
|
|
|
import re
|
|
# sentence = '但相对于传统“二房东”,轻资产分散式长租公寓更专业'
# dialog_sentence = re.findall(r'“.*?”', sentence)
# print(dialog_sentence)
# if dialog_sentence:
# for i_sentence in dialog_sentence:
# j_sentence = i_sentence
# j_sentence = j_sentence.replace('他', '$$$').replace('她', '$$$$').replace('它', '$$$$$')
# sentence = sentence.replace(i_sentence, j_sentence)
# sentence = sentence.replace('她', '她(確確確確)').replace('他', '他(確確確確)').replace('它', '它(確確確確)')
# sentence = sentence.replace('*****','它们').replace('****','她们').replace('***','他们')
# sentence = sentence.replace('$$$$$','它').replace('$$$$','她').replace('$$$','他')
# # new_list = line_list[0]+'/'+sentence+'/'+line_list[2]
# # unknown_speaker_index.append(i)
# # new_lines.append(new_list)
|
|
|
|
|
|
# Matches a quoted span: "..." , '...' or “...” (non-greedy, within one line).
RE_DIALOG = re.compile(r"\".*?\"|\'.*?\'|“.*?”")
# Presumably the sentence-ending punctuation set (going by the name); it is
# not referenced by any code visible in this file.
# NOTE(review): the comma, question mark and exclamation mark here are ASCII —
# confirm whether the full-width forms were intended.
fuhao_end_sentence = ["。",",","?","!","…"]


def get_dialogs_index(line: str):
    """
    Find the quoted spans ("dialogs") in a line and classify every character
    position as inside or outside such a span.

    :param line: text to scan
    :return: a 3-tuple ``(dialogs_text, dialogs_index, other_index)``:
        dialogs_text: matched spans, quotes included, in order of appearance
        dialogs_index: ascending character positions covered by any match
        other_index: ascending character positions not covered by any match
    """
    dialogs_text = []
    dialogs_index = []
    # One pass over the matches yields both the text and the positions;
    # the pattern has no capture groups, so group() is the whole match.
    for match in RE_DIALOG.finditer(line):
        dialogs_text.append(match.group())
        dialogs_index.extend(range(match.start(), match.end()))

    # Set lookup keeps the complement linear in len(line) instead of
    # re-scanning the dialogs_index list for every position.
    covered = set(dialogs_index)
    other_index = [pos for pos in range(len(line)) if pos not in covered]

    return dialogs_text, dialogs_index, other_index
|
|
|
|
# text = "但相对于传统“二房东”,轻资产分散式长租公寓“二房东”更专业"
# get_dialogs_index(text)
# dialogs_text, dialogs_index, other_index = get_dialogs_index(text)
# for i in dialogs_text:
# text = text.replace(i, "#####")
# print(text)
|
|
|
|
# Mask every quoted span in each selected text with the fixed placeholder
# "#####". RE_DIALOG.sub replaces exactly the spans the pattern matches, so a
# quote string that also occurs inside a later match cannot be masked early
# and leave that later match unmasked (which chained str.replace calls on the
# already-modified text could do).
data_jinghao = [RE_DIALOG.sub("#####", text) for text in data_yinhao]

print(len(data_jinghao))

# Write one masked text per line. The with-statement closes the file on exit,
# so no explicit close() is needed.
with open("./data/data_jinghoa.txt", "w", encoding='utf-8') as file:
    file.writelines(masked + '\n' for masked in data_jinghao)
|
|
|
|
|
|
# text = "但相对于传统“二房东”,轻资产分散式长租公寓更专业"
# print(get_dialogs_index(text))
# dialogs_text, dialogs_index, other_index = get_dialogs_index(text)
# if len(dialogs_text) != 0:
# sep = dialogs_text[0]
# text_list = text.split(sep)
# text_1 = text_list[0]
# if text_1 != "":
# text_1 = chulipangban_test_1(text_1)
# text_1 = "。".join(text_1)
# text_new.append(text_1)
# text_new.append(sep)
# else:
# text_new.append(sep)
# text_2 = str(sep).join(text_list[1:])
# text_new = paragraph_test(text_2, text_new)
|
|
|
|
|