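# Data-processing code for generating ChatGPT data.
# (The lines below are repository web-page residue, not part of the script.)
# You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 59 lines
# 1.5 KiB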

import json
import math
import os
import random
import re

import numpy as np
from tqdm import tqdm
# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'
# Heading pattern: a digit / Chinese numeral / Roman numeral, then "、" or ".",
# optional whitespace, then CJK or Latin letters. Written as a raw string so
# that "\s" is not treated as an (invalid) string escape; the "\uXXXX" escapes
# are interpreted by the `re` module itself.
pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
# Template for the abstract-generation prompt: filled with (title, outline).
zhaiyao_prompt = "论文题目是《{}》,目录是“{}”,生成论文摘要,要求生成的字数在600字左右"
# Captures the paper title from the outline-generation prompt text.
pantten_title = "《(.*?)》生成目录,要求只有一级标题和二级标题,"
path_chatgpt_output = "chatgpt_data_v1"
path = "./data/{}/paper_prompt_title_1/title_mulu_prompt_data.txt".format(path_chatgpt_output)
output_path = "./data/title_mulu_to_/zhaiyao_prompt.txt"

# Records in the input file are separated by 20 "@" characters; inside each
# record the title prompt and the outline are separated by 20 "*" characters.
_RECORD_SEPARATOR = "@" * 20
_FIELD_SEPARATOR = "*" * 20


def build_prompts(records):
    """Turn raw "title prompt / outline" records into abstract prompts.

    Args:
        records: Iterable of strings, each expected to contain exactly one
            run of 20 "*" separating the title prompt from the outline.

    Returns:
        A list of prompts built from ``zhaiyao_prompt``, one per usable
        record, in input order.

    Records that do not split into exactly two fields are skipped silently.
    Records whose title prompt does not match ``pantten_title`` are skipped
    after printing the offending title prompt.
    """
    prompts = []
    for record in records:
        fields = record.split(_FIELD_SEPARATOR)
        if len(fields) != 2:
            # Malformed record (missing or repeated separator): skip it.
            continue
        title_prompt, mulu = fields

        match = re.search(pantten_title, title_prompt)
        if match is None:
            # Show the unparseable prompt so bad input can be inspected.
            print(title_prompt)
            continue

        title = match.group(1).strip("\n")
        prompts.append(zhaiyao_prompt.format(title, mulu.strip("\n")))
    return prompts


def main(limit=None):
    """Read the outline data, build abstract prompts, shuffle and write them.

    Args:
        limit: Optional maximum number of prompts to write after shuffling.
            ``None`` (the default) writes every prompt.

    Each prompt is written to ``output_path`` as one JSON-encoded string per
    line.
    """
    with open(path, encoding="utf-8") as f:
        text = f.read()

    train_list = build_prompts(tqdm(text.split(_RECORD_SEPARATOR)))
    random.shuffle(train_list)

    # NOTE(review): the previous version computed a 10000-item slice of the
    # shuffled list but still wrote the full list. Writing everything stays
    # the default; pass ``limit=10000`` if the cap was actually intended.
    if limit is not None:
        train_list = train_list[:limit]

    # Create the output directory if it is missing instead of failing on open.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, mode="w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(prompt, ensure_ascii=False) + "\n" for prompt in train_list
        )


if __name__ == "__main__":
    main()