|
|
@@ -317,65 +317,9 @@ def get_multiple_urls(text_info):
def chulipangban_test_1(snetence_id, text):
    # Quote handling
    dialogs_text, dialogs_index, other_index = get_dialogs_index(text)
    for dialogs_text_dan in dialogs_text:
        text_dan_list = text.split(dialogs_text_dan)
        text = dialogs_text_dan.join(text_dan_list)
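    # NOTE: `sep.join(s.split(sep))` reproduces `s`, so the loop above leaves
    # `text` unchanged; the quoted segments found by get_dialogs_index stay in place.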

    # text_new_str = "".join(text_new)
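
    # Choose the sentence delimiter and the length threshold for further
    # splitting, depending on whether the text contains Chinese characters;
    # for English the threshold is effectively infinite.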
    if has_chinese(text) == False:
        spilt_word = ". "
        spilt_sen_len = 1e9
        is_chinese = False
    else:
        spilt_word = "。"
        spilt_sen_len = 120
        is_chinese = True

    # Holds the processed sentence data
    sentence_batch_list = []
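
    # English text is segmented by the long_cuter_en state machine;
    # Chinese text is split on the "。" delimiter in the else branch.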
    if is_chinese == False:
        __long_machine_en = StateMachine(long_cuter_en(max_len=25, min_len=3))
        m_input = EnSequence(text)
        __long_machine_en.run(m_input)
        for v in m_input.sentence_list():
            sentence_batch_list.append([v, snetence_id, 0])
    else:
        sentence_list = text.split(spilt_word)
        # sentence_list_new = []
        # for i in sentence_list:
        #     if i != "":
        #         sentence_list_new.append(i)
        # sentence_list = sentence_list_new
        sentence_batch_length = 0
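
        # All segments except the last get the delimiter re-attached; segments
        # of spilt_sen_len characters or more are split further by chulichangju_1.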
        for sentence in sentence_list[:-1]:
            if len(sentence) < spilt_sen_len:
                sentence_batch_length += len(sentence)
                sentence_batch_list.append([sentence + spilt_word, snetence_id, 0])
                # sentence_pre = autotitle.gen_synonyms_short(sentence)
                # return_list.append(sentence_pre)
            else:
                sentence_split_list = chulichangju_1(sentence, snetence_id, [], 0)
                for sentence_short in sentence_split_list[:-1]:
                    sentence_batch_list.append(sentence_short)
                sentence_split_list[-1][0] = sentence_split_list[-1][0] + spilt_word
                sentence_batch_list.append(sentence_split_list[-1])
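
        # Handle the trailing segment after the final delimiter; no delimiter
        # is re-attached to it.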
        if sentence_list[-1] != "":
            if len(sentence_list[-1]) < spilt_sen_len:
                sentence_batch_length += len(sentence_list[-1])
                sentence_batch_list.append([sentence_list[-1], snetence_id, 0])
                # sentence_pre = autotitle.gen_synonyms_short(sentence)
                # return_list.append(sentence_pre)
            else:
                sentence_split_list = chulichangju_1(sentence_list[-1], snetence_id, [], 0)
                for sentence_short in sentence_split_list:
                    sentence_batch_list.append(sentence_short)

    sentence_batch_list.append([text, snetence_id, 0])

    return sentence_batch_list