"""Build abstract-generation prompts from a title/outline prompt dump.

Reads a text file of concatenated "title prompt + outline" records, extracts
the paper title and the outline from each record, formats them into the
abstract prompt template, shuffles the prompts, and writes them as one JSON
string per line.
"""
import json
import math
import random
import re

import numpy as np
from tqdm import tqdm

# Unused alternative pattern kept for reference:
# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'

# Raw string so that `\s` reaches the regex engine instead of being treated as
# an (invalid) string escape; `re` resolves the `\uXXXX` escapes itself.
pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
zhaiyao_prompt = "论文题目是“{}”,目录是“{}”,生成论文摘要,要求生成的字数在600字左右"
thanks = "致谢"
references = "参考文献"
excursus = "附录"
u = 3.5  # mean (mu) used by normal_distribution
sig = math.sqrt(6.0)  # standard deviation (sigma) used by normal_distribution
zong_gradient = 6
paper_word_count = 12000
pantten_title = "(.*?)”生成目录,要求只有一级标题和二级标题,"

path = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt"
with open(path, encoding="utf-8") as f:
    text = f.read()


def normal_distribution(x):
    """Return the Gaussian density at *x* for mean ``u`` and std ``sig``."""
    y = np.exp(-(x - u) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig)
    return y


# Each record starts with this marker; the chunk before the first marker (and
# any chunk lacking the separator) is skipped in the loop below.
text_list = text.split("为论文题目“")

ner_lable = []
text_zong = []
train_list = []
train_references_list = []

# Compile once instead of re-resolving the pattern on every iteration.
title_regex = re.compile(pantten_title)

for text_dan in tqdm(text_list):
    # A valid record has exactly one separator between the title prompt and
    # the outline; anything else fails the two-way unpack and is skipped.
    try:
        title_prompt, mulu = text_dan.split("**************")
    except ValueError:
        continue

    result_biaoti_list = title_regex.findall(title_prompt)
    if not result_biaoti_list:
        # No title could be extracted: show the offending prompt and move on.
        print(title_prompt)
        continue

    title = str(result_biaoti_list[0]).strip("\n")
    mulu = str(mulu).strip("\n")
    paper_prompt = zhaiyao_prompt.format(title, mulu)
    train_list.append(paper_prompt)

random.shuffle(train_list)

# NOTE(review): this 10000-item slice is never written; the loop below dumps
# the full shuffled `train_list`. Confirm which of the two is intended.
train_list_shuffle = train_list[:10000]

with open("./data/title_mulu_to_/zhaiyao_prompt.txt", mode="w", encoding="utf-8") as f:
    for i in train_list:
        # One JSON-encoded prompt string per line (JSON Lines style).
        f.write(json.dumps(i, ensure_ascii=False))
        f.write("\n")