You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
69 lines
1.7 KiB
69 lines
1.7 KiB
![]()
2 years ago
|
import json
|
||
|
import re
|
||
|
import math
|
||
|
import numpy as np
|
||
|
from tqdm import tqdm
|
||
|
|
||
|
# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'
|
||
|
# Regex for a numbered heading: one numbering character (digit 1-9, a Chinese
# numeral, or a Roman-numeral glyph), then "、" or ".", optional whitespace,
# and a run of CJK or Latin letters.
# Written as a raw string so that "\s" is not an invalid string escape; the
# "\uXXXX" escapes are interpreted by the `re` module itself, so the set of
# matched strings is unchanged.
pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'

# Prompt template with two positional slots: paper title, then outline.
zhaiyao_prompt = "论文题目是“{}”,目录是“{}”,生成论文摘要,要求生成的字数在600字左右"

# Section-name literals ("acknowledgements", "references", "appendix").
thanks = "致谢"
references = "参考文献"
excursus = "附录"

u = 3.5  # mean (mu) used by normal_distribution
sig = math.sqrt(6.0)  # standard deviation (sigma) used by normal_distribution

# NOTE(review): the next two values are not used in the visible part of the
# file; their meaning is presumably "number of gradient steps" and "target
# paper length in characters" — confirm before relying on that.
zong_gradient = 6
paper_word_count = 12000

# Regex whose single group captures the text that precedes the fixed
# "generate an outline" instruction, i.e. the paper title in a title prompt.
pantten_title = "(.*?)”生成目录,要求只有一级标题和二级标题,"
|
||
|
|
||
|
|
||
|
|
||
|
# Read the whole title/outline prompt dump into memory as a single string.
path = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt"
with open(path, mode="r", encoding="utf-8") as source_file:
    text = source_file.read()
|
||
|
|
||
|
|
||
|
def normal_distribution(x):
    """Return the Gaussian probability density evaluated at ``x``.

    The mean and standard deviation are taken from the module-level
    ``u`` and ``sig``. The exponential is computed with ``np.exp``.
    """
    exponent = -(x - u) ** 2 / (2 * sig ** 2)
    normaliser = math.sqrt(2 * math.pi) * sig
    return np.exp(exponent) / normaliser
|
||
|
|
||
|
# Split the dump into records; every record begins with this fixed marker.
text_list = text.split("为论文题目“")

# NOTE(review): these three lists are never filled in the visible code; they
# are kept because code outside this view may still reference them.
ner_lable = []
text_zong = []
train_references_list = []

# Abstract-generation prompts built from each (title, outline) pair.
train_list = []

for text_dan in tqdm(text_list):
    # A usable record has exactly one separator between the title prompt and
    # the outline; anything else fails to unpack and is skipped.
    try:
        title_prompt, mulu = text_dan.split("**************")
    except ValueError:
        continue

    # Extract the paper title; report and skip records where it is missing.
    result_biaoti_list = re.findall(pantten_title, title_prompt)
    if not result_biaoti_list:
        print(title_prompt)
        continue

    title = result_biaoti_list[0].strip("\n")
    mulu = mulu.strip("\n")
    train_list.append(zhaiyao_prompt.format(title, mulu))
|
||
|
|
||
|
|
||
|
import random

# Randomise the order of the generated prompts in place.
random.shuffle(train_list)

# NOTE(review): this 10000-item slice is never used — the loop below writes
# the full train_list. Confirm which of the two was intended.
train_list_shuffle = train_list[:10000]

# Emit one JSON-encoded prompt per line, keeping non-ASCII text readable.
with open("./data/title_mulu_to_/zhaiyao_prompt.txt", mode="w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(prompt, ensure_ascii=False) + "\n" for prompt in train_list
    )
|
||
|
|