普通版降重
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

115 lines
3.4 KiB

# -*- coding: utf-8 -*-
"""
@Time : 2023/2/10 12:06
@Author :
@FileName:
@Software:
@Describe:
"""
# -*- coding: utf-8 -*-
"""
@Time : 2023/2/9 18:53
@Author :
@FileName:
@Software:
@Describe:
"""
import pandas as pd
# Load the comparison sheet as a list of rows and order the rows by the
# length of the value in their first column.
data = pd.read_excel("./data/700条效果对比.xlsx").values.tolist()
data_new = sorted(data, key=lambda x: len(x[0]))

# Keep only the first-column texts that contain an opening quotation mark.
# NOTE(review): the previous code compared every character of the text
# against "" (the empty string). A character taken from a string is never
# empty, so the filter never matched and data_yinhao was always empty.
# "“" is assumed to be the intended marker, based on the full-width quote
# pattern in RE_DIALOG -- confirm against the original file.
data_yinhao = [row[0] for row in data_new if "“" in row[0]]
import re
# sentence = '但相对于传统“二房东”,轻资产分散式长租公寓更专业'
# dialog_sentence = re.findall(r'“.*?”', sentence)
# print(dialog_sentence)
# if dialog_sentence:
# for i_sentence in dialog_sentence:
# j_sentence = i_sentence
# j_sentence = j_sentence.replace('他', '$$$').replace('她', '$$$$').replace('它', '$$$$$')
# sentence = sentence.replace(i_sentence, j_sentence)
# sentence = sentence.replace('她', '她(確確確確)').replace('他', '他(確確確確)').replace('它', '它(確確確確)')
# sentence = sentence.replace('*****','它们').replace('****','她们').replace('***','他们')
# sentence = sentence.replace('$$$$$','它').replace('$$$$','她').replace('$$$','他')
# # new_list = line_list[0]+'/'+sentence+'/'+line_list[2]
# # unknown_speaker_index.append(i)
# # new_lines.append(new_list)
# Matches one quoted span: ASCII double quotes, ASCII single quotes, or
# full-width Chinese quotes. The quantifier is non-greedy, so a match ends at
# the nearest closing quote.
RE_DIALOG = re.compile(r"\".*?\"|\'.*?\'|“.*?”")
# NOTE(review): every entry is an empty string and nothing in the visible code
# reads this list; presumably it held sentence-ending punctuation -- confirm.
fuhao_end_sentence = ["", "", "", "", ""]


def get_dialogs_index(line: str):
    """Locate the quoted spans in *line*.

    :param line: text to scan
    :return: a 3-tuple ``(dialogs_text, dialogs_index, other_index)``:
        dialogs_text -- the matched substrings, quotes included, in order
        dialogs_index -- every character position covered by a match, ascending
        other_index -- every remaining character position, ascending
    """
    dialogs_text = []
    dialogs_index = []
    # A single scan yields both the matched text and its span.
    for dialog in RE_DIALOG.finditer(line):
        dialogs_text.append(dialog.group())
        dialogs_index.extend(range(dialog.start(), dialog.end()))
    # Set membership keeps the complement computation linear in len(line).
    covered = set(dialogs_index)
    other_index = [i for i in range(len(line)) if i not in covered]
    return dialogs_text, dialogs_index, other_index
# text = "但相对于传统“二房东”,轻资产分散式长租公寓“二房东”更专业"
# get_dialogs_index(text)
# dialogs_text, dialogs_index, other_index = get_dialogs_index(text)
# for i in dialogs_text:
# text = text.replace(i, "#####")
# print(text)
# Replace every quoted span in each selected text with a "#####" placeholder.
data_jinghao = []
for text in data_yinhao:
    # Only the matched substrings are needed here; get_dialogs_index is
    # called once per text and its two index lists are not used.
    dialogs_text = get_dialogs_index(text)[0]
    for quoted in dialogs_text:
        text = text.replace(quoted, "#####")
    data_jinghao.append(text)

print(len(data_jinghao))

# Write one masked text per line. The with-block closes the file on exit, so
# no explicit close() call is needed.
with open("./data/data_jinghoa.txt", "w", encoding='utf-8') as file:
    for masked in data_jinghao:
        file.write(masked + '\n')
# text = "但相对于传统“二房东”,轻资产分散式长租公寓更专业"
# print(get_dialogs_index(text))
# dialogs_text, dialogs_index, other_index = get_dialogs_index(text)
# if len(dialogs_text) != 0:
# sep = dialogs_text[0]
# text_list = text.split(sep)
# text_1 = text_list[0]
# if text_1 != "":
# text_1 = chulipangban_test_1(text_1)
# text_1 = "。".join(text_1)
# text_new.append(text_1)
# text_new.append(sep)
# else:
# text_new.append(sep)
# text_2 = str(sep).join(text_list[1:])
# text_new = paragraph_test(text_2, text_new)