# 数据处理代码,为了生成 ChatGPT 训练数据
# (Data-processing script: builds {"content": prompt, "summary": toc}
# training pairs for thesis table-of-contents generation, reading raw
# records from ../data/title.txt.)
import json
import re
import math
import numpy as np
from tqdm import tqdm
# --- heading patterns -------------------------------------------------------
# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'
# First-level heading: Chinese numeral + "、", e.g. "一、绪论".
pantten_biaoti = '[一二三四五六七八九][、]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
# Third-level heading, e.g. "1.1.1 xxx".  The original pattern used bare '.',
# which matches ANY character; escaped here so it only matches literal dots.
pantten_biaoti_1 = r'[1-9]\.[1-9]\.[1-9](.*)'

# --- prompt templates (runtime strings, kept verbatim) ----------------------
first_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的大标题“{}”的内容补充完整,补充内容字数在{}字左右"
small_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的小标题“{}”的内容补充完整,补充内容字数在{}字左右"
mulu_prompt = "请帮我根据题目为“{}”生成一个论文目录其中只含有一级标题和二级标题"

# Section names stripped from the tail of a table of contents.
thanks = "致谢"          # acknowledgements
references = "参考文献"  # references
excursus = "附录"        # appendix

u = 3.5                   # mean (μ) of the normal distribution below
sig = math.sqrt(6.0)      # standard deviation (σ)
zong_gradient = 6         # number of word-count gradient steps
paper_word_count = 12000  # target total word count per paper

# Raw input: one file containing all title + TOC records.
path = "../data/title.txt"
with open(path, encoding="utf-8") as f:
    text = f.read()
def normal_distribution(x, mu=None, sigma=None):
    """Return the normal (Gaussian) probability density at ``x``.

    Generalized from the original hard-coded version: ``mu`` and ``sigma``
    default to the module-level constants ``u`` and ``sig`` (resolved at call
    time), so existing single-argument callers behave identically.

    :param x: point (scalar or numpy array) at which to evaluate the pdf
    :param mu: mean of the distribution; defaults to module global ``u``
    :param sigma: standard deviation; defaults to module global ``sig``
    :return: pdf value(s), same shape as ``x``
    """
    mu = u if mu is None else mu
    sigma = sig if sigma is None else sigma
    return np.exp(-(x - mu) ** 2 / (2 * sigma ** 2)) / (math.sqrt(2 * math.pi) * sigma)
# Containers populated by the main loop below.
train_list = []  # {"content": prompt, "summary": toc} training pairs
text_zong = []   # unused in this chunk; kept for compatibility
ner_lable = []   # unused in this chunk; original (misspelled) name kept

# One entry per paper: records in title.txt are separated by a "+" divider.
text_list = text.split("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
# ---------------------------------------------------------------------------
# Main loop: parse each "<title>***...***<toc>" record, validate the TOC
# shape, and emit one {"content": prompt, "summary": toc} training pair.
# ---------------------------------------------------------------------------
for text_dan in tqdm(text_list):
    tiaoguo = False  # "skip record" flag, set when a third-level heading appears
    try:
        title, mulu = text_dan.split("**********************************************")
    except ValueError:
        # Record does not contain exactly one divider -> malformed, skip it.
        continue
    title = str(title).strip("\n")
    mulu = str(mulu).strip("\n")
    paper_text = "题目:{}@目录:".format(title)  # currently unused downstream
    nerlable_list = []                           # currently unused downstream
    mulu_list = str(mulu).split("\n")
    mulu_list = [i.strip() for i in mulu_list if i != ""]

    # Classify every TOC line as a first- or second-level heading.
    mulu_list_bool = []
    for line in mulu_list:
        if re.findall(pantten_biaoti, line):
            mulu_list_bool.append((line, "一级标题"))
        elif re.findall(pantten_biaoti_1, line):
            # Third-level heading ("1.1.1 ...") -> drop the whole record.
            # NOTE(review): the original re-tested `pantten_biaoti` here, which
            # can never match in this branch (it just failed above), so records
            # with third-level headings were never skipped; `pantten_biaoti_1`
            # is the evident intent (cf. the original `..._sanji_...` name).
            tiaoguo = True
            break
        else:
            mulu_list_bool.append((line, "二级标题"))
    if tiaoguo:
        continue

    # Sanity checks on the TOC shape.  The original indexed [0], [1] and [-1]
    # of the first three entries unconditionally; guard against short TOCs.
    if len(mulu_list_bool) < 3:
        continue
    mulu_list_bool_part = mulu_list_bool[:3]
    if mulu_list_bool_part[0][1] != "一级标题":
        continue  # TOC must open with a first-level heading
    if mulu_list_bool_part[0][1] == mulu_list_bool_part[1][1] == "一级标题":
        continue  # two consecutive first-level headings at the top
    if mulu_list_bool_part[-1][1] == "一级标题":
        continue  # third entry must not be first-level either

    # Strip trailing boilerplate sections (acknowledgements, references,
    # appendix) by scanning a copy of the last five entries.
    thanks_references_bool_table = mulu_list_bool[-5:]
    for entry in thanks_references_bool_table:
        try:
            if references in entry[0]:
                mulu_list_bool.remove(entry)
            if thanks in entry[0]:
                mulu_list_bool.remove(entry)
            if excursus in entry[0]:
                mulu_list_bool.remove(entry)
        except ValueError:
            # Entry already removed (duplicate line) -- log and keep going.
            print(thanks_references_bool_table)
            continue

    # Group second-level headings under their preceding first-level heading.
    # (Safe: the checks above guarantee the first entry is first-level.)
    table_of_contents = []
    for heading, level in mulu_list_bool:
        if level == "一级标题":
            table_of_contents.append({
                "title": "@@" + heading,
                "small_title": [heading],
                "word_count": 0,
            })
        else:
            table_of_contents[-1]["small_title"].append(heading)

    # Render the grouped TOC back to text: headings joined by "\n" within a
    # chapter, chapters separated by a blank line.
    table_of_contents_new = ["\n".join(chapter["small_title"]) for chapter in table_of_contents]
    b = "\n\n".join(table_of_contents_new)
    title_p = mulu_prompt.format(title)
    train_list.append({"content": str(title_p), "summary": str(b)})
print(train_list)  # debug dump of the assembled training set
import random

# Shuffle in place, then keep a 500-sample subset of the same order.
random.shuffle(train_list)
train_list_shuffle = train_list[:500]

# Full shuffled training set, one JSON object per line (JSONL).
with open("../data/mulu_prompt.txt", mode="w", encoding="utf-8") as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in train_list)

# 500-sample subset, same JSONL format.
with open("../data/mulu_prompt_shuffle.json", mode="w", encoding="utf-8") as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in train_list_shuffle)