Browse Source

增加英文降重

master
majiahui@haimaqingfan.com 10 months ago
parent
commit
5279d9f9a7
  1. 164
      flask_drop_rewrite_request.py

164
flask_drop_rewrite_request.py

@ -11,6 +11,9 @@ import re
import logging
import concurrent.futures
import socket
from sentence_spliter.logic_graph_en import long_cuter_en
from sentence_spliter.automata.state_machine import StateMachine
from sentence_spliter.automata.sequence import EnSequence #调取英文 Sequence
logging.basicConfig(level=logging.DEBUG, # 控制台打印的日志级别
@ -41,6 +44,8 @@ pantten_biaoti_0 = '^[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ]
pantten_biaoti_1 = '^第[一二三四五六七八九]章\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
pantten_biaoti_2 = '^[0-9.]+\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
pantten_biaoti_3 = '^[((][1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][)_)][、.]{0,}?\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
pantten_biaoti_4 = '(摘要)'
pantten_biaoti_5 = '(致谢)'
def get_host_ip():
@ -293,6 +298,7 @@ def get_multiple_urls(urls):
def chulipangban_test_1(snetence_id, text):
# 引号处理
dialogs_text, dialogs_index, other_index = get_dialogs_index(text)
for dialogs_text_dan in dialogs_text:
text_dan_list = text.split(dialogs_text_dan)
@ -300,40 +306,57 @@ def chulipangban_test_1(snetence_id, text):
# text_new_str = "".join(text_new)
sentence_list = text.split("")
# sentence_list_new = []
# for i in sentence_list:
# if i != "":
# sentence_list_new.append(i)
# sentence_list = sentence_list_new
if has_chinese(text) == False:
spilt_word = ". "
spilt_sen_len = 1e9
is_chinese = False
else:
spilt_word = ""
spilt_sen_len = 120
is_chinese = True
# 存放整理完的数据
sentence_batch_list = []
sentence_batch_one = []
sentence_batch_length = 0
return_list = []
for sentence in sentence_list[:-1]:
if len(sentence) < 120:
sentence_batch_length += len(sentence)
sentence_batch_list.append([sentence + "", snetence_id, 0])
# sentence_pre = autotitle.gen_synonyms_short(sentence)
# return_list.append(sentence_pre)
else:
sentence_split_list = chulichangju_1(sentence, snetence_id, [], 0)
for sentence_short in sentence_split_list[:-1]:
sentence_batch_list.append(sentence_short)
sentence_split_list[-1][0] = sentence_split_list[-1][0] + ""
sentence_batch_list.append(sentence_split_list[-1])
if sentence_list[-1] != "":
if len(sentence_list[-1]) < 120:
sentence_batch_length += len(sentence_list[-1])
sentence_batch_list.append([sentence_list[-1], snetence_id, 0])
# sentence_pre = autotitle.gen_synonyms_short(sentence)
# return_list.append(sentence_pre)
else:
sentence_split_list = chulichangju_1(sentence_list[-1], snetence_id, [], 0)
for sentence_short in sentence_split_list:
sentence_batch_list.append(sentence_short)
if is_chinese == False:
__long_machine_en = StateMachine(long_cuter_en(max_len=20, min_len=3))
m_input = EnSequence(text)
__long_machine_en.run(m_input)
for v in m_input.sentence_list():
sentence_batch_list.append([v, snetence_id, 0])
else:
sentence_list = text.split(spilt_word)
# sentence_list_new = []
# for i in sentence_list:
# if i != "":
# sentence_list_new.append(i)
# sentence_list = sentence_list_new
sentence_batch_length = 0
for sentence in sentence_list[:-1]:
if len(sentence) < spilt_sen_len:
sentence_batch_length += len(sentence)
sentence_batch_list.append([sentence + spilt_word, snetence_id, 0])
# sentence_pre = autotitle.gen_synonyms_short(sentence)
# return_list.append(sentence_pre)
else:
sentence_split_list = chulichangju_1(sentence, snetence_id, [], 0)
for sentence_short in sentence_split_list[:-1]:
sentence_batch_list.append(sentence_short)
sentence_split_list[-1][0] = sentence_split_list[-1][0] + spilt_word
sentence_batch_list.append(sentence_split_list[-1])
if sentence_list[-1] != "":
if len(sentence_list[-1]) < spilt_sen_len:
sentence_batch_length += len(sentence_list[-1])
sentence_batch_list.append([sentence_list[-1], snetence_id, 0])
# sentence_pre = autotitle.gen_synonyms_short(sentence)
# return_list.append(sentence_pre)
else:
sentence_split_list = chulichangju_1(sentence_list[-1], snetence_id, [], 0)
for sentence_short in sentence_split_list:
sentence_batch_list.append(sentence_short)
return sentence_batch_list
@ -341,6 +364,7 @@ def chulipangban_test_1(snetence_id, text):
def paragraph_test(texts: dict):
text_new = []
for i, text in texts.items():
print("text", text)
text_list = chulipangban_test_1(i, text)
text_new.extend(text_list)
@ -462,22 +486,38 @@ def has_chinese(s):
return bool(re.search('[\u4e00-\u9fa5]', s))
def pre_sentence_ulit(sentence):
'''
预处理
:param sentence:
:return:
'''
def english_ulit(sentence):
    """Build the LLM rewrite prompt for an all-English sentence.

    :param sentence: raw sentence text (any type; coerced to str and stripped)
    :return: tuple (text, if_change) where
             - text: the "User: ... Assistant:" prompt to send to the model
             - if_change: False when the sentence is empty and no rewrite
               should be applied, True otherwise
    """
    print("sentence", sentence)
    sentence = str(sentence).strip()
    if_change = True
    # Only build a rewrite prompt for a non-empty sentence.
    # BUGFIX: the original condition was the constant-false `if "" != "":`
    # (an extraction-mangled emptiness check), which made every branch
    # except the "Hello" fallback unreachable.
    if sentence != "":
        if sentence[-1] != ".":
            # No terminal period: treat as a sentence fragment whose edges
            # must still connect with the surrounding sentences.
            text = f"User: Rewrite the following sentence fragment, ensuring that the meaning remains similar but with significant changes. The length of the rewritten sentence must be greater, not less. Additionally, the words in the short sentences must connect seamlessly with the preceding and following sentences:\n{sentence}\nAssistant:"
        else:
            # Complete sentence: plain rewrite prompt.
            text = f"User: Rewrite the following sentence, requiring the meaning to be similar but the change to be larger, and the number of words can only be more but not less:\n{sentence}\nAssistant:"
    else:
        # Empty input: emit a harmless placeholder prompt and flag no-change.
        if_change = False
        text = "User: Hello\nAssistant:"
    return text, if_change
def chinese_ulit(sentence):
max_length = 25
sentence = str(sentence).strip()
if_change = True
# 判断句子长度
if len(sentence) > 9:
if sentence[-1] != "":
text = f"User: 改写下面半这句话,要求意思接近但是改动幅度比较大,字数只能多不能少,短句前后词跟上下句衔接不能有错误:\n{sentence}\nAssistant:"
text = f"User: 任务:改写句子\n改写下面半这句话,要求意思接近但是改动幅度比较大,字数只能多不能少,短句前后词跟上下句衔接不能有错误:\n{sentence}\nAssistant:"
else:
text = f"User: 改写下面这句话,要求意思接近但是改动幅度比较大,字数只能多不能少:\n{sentence}\nAssistant:"
text = f"User: 任务:改写句子\n改写下面这句话,要求意思接近但是改动幅度比较大,字数只能多不能少:\n{sentence}\nAssistant:"
else:
text = f"下面词不做任何变化:\n{sentence}"
@ -485,23 +525,39 @@ def pre_sentence_ulit(sentence):
return text, if_change
# 判断标题
# result_biaoti_list_0 = re.findall(pantten_biaoti_0, sentence)
# result_biaoti_list_1 = re.findall(pantten_biaoti_1, sentence)
# result_biaoti_list_2 = re.findall(pantten_biaoti_2, sentence)
# result_biaoti_list_3 = re.findall(pantten_biaoti_3, sentence)
# if list(set(result_biaoti_list_0 + result_biaoti_list_1 + result_biaoti_list_2 + result_biaoti_list_3)) != []:
# if_change = False
# return text, if_change
# 判断是否去全英文
if has_chinese(sentence) == False:
result_biaoti_list_0 = re.findall(pantten_biaoti_0, sentence)
result_biaoti_list_1 = re.findall(pantten_biaoti_1, sentence)
result_biaoti_list_2 = re.findall(pantten_biaoti_2, sentence)
result_biaoti_list_3 = re.findall(pantten_biaoti_3, sentence)
result_biaoti_list_4 = re.findall(pantten_biaoti_4, sentence)
result_biaoti_list_5 = re.findall(pantten_biaoti_5, sentence)
if list(set(result_biaoti_list_0 + result_biaoti_list_1 + result_biaoti_list_2 + result_biaoti_list_3)) != []:
if len(sentence) < max_length:
if_change = False
return text, if_change
elif list(set(result_biaoti_list_4 + result_biaoti_list_5 )) != []:
if_change = False
return text, if_change
return text, if_change
def pre_sentence_ulit(sentence):
    """Preprocess a sentence before sending it to the rewrite model.

    Dispatches to the language-specific prompt builder: sentences that
    contain at least one Chinese character go to ``chinese_ulit``;
    everything else is treated as English and goes to ``english_ulit``.

    :param sentence: raw sentence text
    :return: tuple (text, if_change) as produced by the chosen builder
    """
    # Idiom fix: test has_chinese() directly instead of comparing `== False`.
    if has_chinese(sentence):
        text, if_change = chinese_ulit(sentence)
    else:
        text, if_change = english_ulit(sentence)
    return text, if_change
def main(texts: dict):
text_list = paragraph_test(texts)
@ -644,10 +700,10 @@ def classify(): # 调用模型,设置最大batch_size
if text_type == 'focus':
texts_list = main(texts)
elif text_type == 'chapter':
try:
# try:
texts_list = main(texts)
except:
texts_list = []
# except:
# texts_list = []
else:
texts_list = []
if texts_list != []:

Loading…
Cancel
Save