Browse Source

增加英文降重

master
majiahui@haimaqingfan.com 10 months ago
parent
commit
5279d9f9a7
  1. 164
      flask_drop_rewrite_request.py

164
flask_drop_rewrite_request.py

@ -11,6 +11,9 @@ import re
import logging import logging
import concurrent.futures import concurrent.futures
import socket import socket
from sentence_spliter.logic_graph_en import long_cuter_en
from sentence_spliter.automata.state_machine import StateMachine
from sentence_spliter.automata.sequence import EnSequence #调取英文 Sequence
logging.basicConfig(level=logging.DEBUG, # 控制台打印的日志级别 logging.basicConfig(level=logging.DEBUG, # 控制台打印的日志级别
@ -41,6 +44,8 @@ pantten_biaoti_0 = '^[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ]
pantten_biaoti_1 = '^第[一二三四五六七八九]章\s{0,}?[\u4e00-\u9fa5a-zA-Z]+' pantten_biaoti_1 = '^第[一二三四五六七八九]章\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
pantten_biaoti_2 = '^[0-9.]+\s{0,}?[\u4e00-\u9fa5a-zA-Z]+' pantten_biaoti_2 = '^[0-9.]+\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
pantten_biaoti_3 = '^[((][1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][)_)][、.]{0,}?\s{0,}?[\u4e00-\u9fa5a-zA-Z]+' pantten_biaoti_3 = '^[((][1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][)_)][、.]{0,}?\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
pantten_biaoti_4 = '(摘要)'
pantten_biaoti_5 = '(致谢)'
def get_host_ip(): def get_host_ip():
@ -293,6 +298,7 @@ def get_multiple_urls(urls):
def chulipangban_test_1(snetence_id, text): def chulipangban_test_1(snetence_id, text):
# 引号处理 # 引号处理
dialogs_text, dialogs_index, other_index = get_dialogs_index(text) dialogs_text, dialogs_index, other_index = get_dialogs_index(text)
for dialogs_text_dan in dialogs_text: for dialogs_text_dan in dialogs_text:
text_dan_list = text.split(dialogs_text_dan) text_dan_list = text.split(dialogs_text_dan)
@ -300,40 +306,57 @@ def chulipangban_test_1(snetence_id, text):
# text_new_str = "".join(text_new) # text_new_str = "".join(text_new)
sentence_list = text.split("") if has_chinese(text) == False:
# sentence_list_new = [] spilt_word = ". "
# for i in sentence_list: spilt_sen_len = 1e9
# if i != "": is_chinese = False
# sentence_list_new.append(i) else:
# sentence_list = sentence_list_new spilt_word = ""
spilt_sen_len = 120
is_chinese = True
# 存放整理完的数据
sentence_batch_list = [] sentence_batch_list = []
sentence_batch_one = []
sentence_batch_length = 0
return_list = []
for sentence in sentence_list[:-1]: if is_chinese == False:
if len(sentence) < 120: __long_machine_en = StateMachine(long_cuter_en(max_len=20, min_len=3))
sentence_batch_length += len(sentence) m_input = EnSequence(text)
sentence_batch_list.append([sentence + "", snetence_id, 0]) __long_machine_en.run(m_input)
# sentence_pre = autotitle.gen_synonyms_short(sentence) for v in m_input.sentence_list():
# return_list.append(sentence_pre) sentence_batch_list.append([v, snetence_id, 0])
else:
sentence_split_list = chulichangju_1(sentence, snetence_id, [], 0) else:
for sentence_short in sentence_split_list[:-1]: sentence_list = text.split(spilt_word)
sentence_batch_list.append(sentence_short) # sentence_list_new = []
sentence_split_list[-1][0] = sentence_split_list[-1][0] + "" # for i in sentence_list:
sentence_batch_list.append(sentence_split_list[-1]) # if i != "":
# sentence_list_new.append(i)
if sentence_list[-1] != "": # sentence_list = sentence_list_new
if len(sentence_list[-1]) < 120: sentence_batch_length = 0
sentence_batch_length += len(sentence_list[-1])
sentence_batch_list.append([sentence_list[-1], snetence_id, 0]) for sentence in sentence_list[:-1]:
# sentence_pre = autotitle.gen_synonyms_short(sentence) if len(sentence) < spilt_sen_len:
# return_list.append(sentence_pre) sentence_batch_length += len(sentence)
else: sentence_batch_list.append([sentence + spilt_word, snetence_id, 0])
sentence_split_list = chulichangju_1(sentence_list[-1], snetence_id, [], 0) # sentence_pre = autotitle.gen_synonyms_short(sentence)
for sentence_short in sentence_split_list: # return_list.append(sentence_pre)
sentence_batch_list.append(sentence_short) else:
sentence_split_list = chulichangju_1(sentence, snetence_id, [], 0)
for sentence_short in sentence_split_list[:-1]:
sentence_batch_list.append(sentence_short)
sentence_split_list[-1][0] = sentence_split_list[-1][0] + spilt_word
sentence_batch_list.append(sentence_split_list[-1])
if sentence_list[-1] != "":
if len(sentence_list[-1]) < spilt_sen_len:
sentence_batch_length += len(sentence_list[-1])
sentence_batch_list.append([sentence_list[-1], snetence_id, 0])
# sentence_pre = autotitle.gen_synonyms_short(sentence)
# return_list.append(sentence_pre)
else:
sentence_split_list = chulichangju_1(sentence_list[-1], snetence_id, [], 0)
for sentence_short in sentence_split_list:
sentence_batch_list.append(sentence_short)
return sentence_batch_list return sentence_batch_list
@ -341,6 +364,7 @@ def chulipangban_test_1(snetence_id, text):
def paragraph_test(texts: dict): def paragraph_test(texts: dict):
text_new = [] text_new = []
for i, text in texts.items(): for i, text in texts.items():
print("text", text)
text_list = chulipangban_test_1(i, text) text_list = chulipangban_test_1(i, text)
text_new.extend(text_list) text_new.extend(text_list)
@ -462,22 +486,38 @@ def has_chinese(s):
return bool(re.search('[\u4e00-\u9fa5]', s)) return bool(re.search('[\u4e00-\u9fa5]', s))
def pre_sentence_ulit(sentence): def english_ulit(sentence):
''' print("sentence", sentence)
预处理
:param sentence:
:return:
'''
sentence = str(sentence).strip() sentence = str(sentence).strip()
if_change = True if_change = True
# 判断句子长度 # 判断句子长度
if "" != "":
if sentence[-1] != ".":
text = f"User: Rewrite the following sentence fragment, ensuring that the meaning remains similar but with significant changes. The length of the rewritten sentence must be greater, not less. Additionally, the words in the short sentences must connect seamlessly with the preceding and following sentences:\n{sentence}\nAssistant:"
else:
text = f"User: Rewrite the following sentence, requiring the meaning to be similar but the change to be larger, and the number of words can only be more but not less:\n{sentence}\nAssistant:"
else:
if_change = False
text = f"User: Hello\nAssistant:"
return text, if_change
def chinese_ulit(sentence):
max_length = 25
sentence = str(sentence).strip()
if_change = True
# 判断句子长度
if len(sentence) > 9: if len(sentence) > 9:
if sentence[-1] != "": if sentence[-1] != "":
text = f"User: 改写下面半这句话,要求意思接近但是改动幅度比较大,字数只能多不能少,短句前后词跟上下句衔接不能有错误:\n{sentence}\nAssistant:" text = f"User: 任务:改写句子\n改写下面半这句话,要求意思接近但是改动幅度比较大,字数只能多不能少,短句前后词跟上下句衔接不能有错误:\n{sentence}\nAssistant:"
else: else:
text = f"User: 改写下面这句话,要求意思接近但是改动幅度比较大,字数只能多不能少:\n{sentence}\nAssistant:" text = f"User: 任务:改写句子\n改写下面这句话,要求意思接近但是改动幅度比较大,字数只能多不能少:\n{sentence}\nAssistant:"
else: else:
text = f"下面词不做任何变化:\n{sentence}" text = f"下面词不做任何变化:\n{sentence}"
@ -485,23 +525,39 @@ def pre_sentence_ulit(sentence):
return text, if_change return text, if_change
# 判断标题 # 判断标题
# result_biaoti_list_0 = re.findall(pantten_biaoti_0, sentence) result_biaoti_list_0 = re.findall(pantten_biaoti_0, sentence)
# result_biaoti_list_1 = re.findall(pantten_biaoti_1, sentence) result_biaoti_list_1 = re.findall(pantten_biaoti_1, sentence)
# result_biaoti_list_2 = re.findall(pantten_biaoti_2, sentence) result_biaoti_list_2 = re.findall(pantten_biaoti_2, sentence)
# result_biaoti_list_3 = re.findall(pantten_biaoti_3, sentence) result_biaoti_list_3 = re.findall(pantten_biaoti_3, sentence)
result_biaoti_list_4 = re.findall(pantten_biaoti_4, sentence)
# if list(set(result_biaoti_list_0 + result_biaoti_list_1 + result_biaoti_list_2 + result_biaoti_list_3)) != []: result_biaoti_list_5 = re.findall(pantten_biaoti_5, sentence)
# if_change = False
# return text, if_change if list(set(result_biaoti_list_0 + result_biaoti_list_1 + result_biaoti_list_2 + result_biaoti_list_3)) != []:
if len(sentence) < max_length:
# 判断是否去全英文 if_change = False
if has_chinese(sentence) == False: return text, if_change
elif list(set(result_biaoti_list_4 + result_biaoti_list_5 )) != []:
if_change = False if_change = False
return text, if_change return text, if_change
return text, if_change return text, if_change
def pre_sentence_ulit(sentence):
'''
预处理
:param sentence:
:return:
'''
# 判断是否为全英文
if has_chinese(sentence) == False:
text, if_change = english_ulit(sentence)
else:
text, if_change = chinese_ulit(sentence)
return text, if_change
def main(texts: dict): def main(texts: dict):
text_list = paragraph_test(texts) text_list = paragraph_test(texts)
@ -644,10 +700,10 @@ def classify(): # 调用模型,设置最大batch_size
if text_type == 'focus': if text_type == 'focus':
texts_list = main(texts) texts_list = main(texts)
elif text_type == 'chapter': elif text_type == 'chapter':
try: # try:
texts_list = main(texts) texts_list = main(texts)
except: # except:
texts_list = [] # texts_list = []
else: else:
texts_list = [] texts_list = []
if texts_list != []: if texts_list != []:

Loading…
Cancel
Save