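# Data-processing code for generating ChatGPT data
# (original description: 数据处理代码,为了生成chatgpt数据).
# NOTE: the remaining text that stood here ("You can not select more than 25
# topics ...", "133 lines", "3.8 KiB", "2 years ago") was repository web-page
# residue captured with the file; it is not part of the program.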
import os
from tqdm import tqdm
import re
# Path of the ChatGPT-generated data. The file is read, filtered, and then
# overwritten in place with the records that survive the filters below.
file = "data/chatgpt_data_v1/paper_prompt_title_1/title_mulu_prompt_data.txt"

# Every record in the file is terminated by 20 "@"; inside a record the prompt
# and the generated table of contents are separated by 20 "*".
RECORD_SEPARATOR = "@" * 20
FIELD_SEPARATOR = "*" * 20

# Prompt fragment that carries the paper title between 《 and 》.
pantten_title = "《(.*?)》生成目录,要求只有一级标题和二级标题,"
# A line is a first-level heading when it contains an Arabic digit 1-9, a
# Chinese numeral or a Roman numeral, followed by "、" or ".", optional
# whitespace, and at least one Chinese character or ASCII letter.
# Raw string: "\s" is not a valid escape in a normal string literal, and the
# re module resolves "\u4e00" / "\u9fa5" itself.
pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'

# Trailing sections that are stripped from every table of contents.
thanks = "致谢"
references = "参考文献"
excursus = "附录"

# Compiled once instead of on every line of every record.
_TITLE_RE = re.compile(pantten_title)
_HEADING_RE = re.compile(pantten_biaoti)

_LEVEL_ONE = "一级标题"
_LEVEL_TWO = "二级标题"

# Only the last few lines are searched for thanks/references/appendix entries.
_TAIL_WINDOW = 5
# A record is kept only if it has at least this many first-level headings ...
_MIN_TOP_LEVEL = 6
# ... and every first-level heading except the last has this many sub-headings.
_MIN_SUB_HEADINGS = 2
_MAX_SUB_HEADINGS = 5


def _classify_lines(mulu):
    """Split a table of contents into ``(line, level)`` pairs.

    Lines are stripped, blank (including whitespace-only) lines are dropped,
    and a leading "目录" header line is removed. A line matching
    ``pantten_biaoti`` anywhere is a first-level heading; every other line is
    a second-level heading.
    """
    lines = [line.strip() for line in mulu.split("\n")]
    lines = [line for line in lines if line]
    if lines and lines[0] == "目录":
        lines = lines[1:]
    return [
        (line, _LEVEL_ONE if _HEADING_RE.search(line) else _LEVEL_TWO)
        for line in lines
    ]


def _drop_trailing_sections(entries):
    """Return *entries* without thanks/references/appendix lines in the tail.

    Only the last ``_TAIL_WINDOW`` entries are candidates. Filtering by
    position (rather than ``list.remove``) guarantees that the tail entry
    itself is dropped, never an identical earlier line, and that a line
    containing several keywords is handled without an error.
    """
    keywords = (references, thanks, excursus)
    tail_start = max(len(entries) - _TAIL_WINDOW, 0)
    return [
        entry
        for index, entry in enumerate(entries)
        if index < tail_start or not any(word in entry[0] for word in keywords)
    ]


def _build_outline(entries):
    """Group entries into ``(heading, [sub_headings])`` pairs.

    Returns ``None`` when a second-level line appears before any first-level
    heading, because such a line has no parent to attach to.
    """
    outline = []
    for line, level in entries:
        if level == _LEVEL_ONE:
            outline.append((line, []))
        elif outline:
            outline[-1][1].append(line)
        else:
            return None
    return outline


def clean_record(text_dan):
    """Validate and normalise one ``prompt + FIELD_SEPARATOR + outline`` record.

    Args:
        text_dan: Raw record text (without the trailing record separator).

    Returns:
        The cleaned record (prompt, ``FIELD_SEPARATOR``, rebuilt outline), or
        ``None`` when the record must be discarded because:

        * it does not consist of exactly two ``FIELD_SEPARATOR`` parts;
        * the prompt does not match ``pantten_title`` (the prompt is printed);
        * the outline has fewer than three lines, does not start with a
          first-level heading, or starts with three first-level headings;
        * after removing trailing thanks/references/appendix lines there are
          fewer than ``_MIN_TOP_LEVEL`` first-level headings;
        * any first-level heading except the last has fewer than
          ``_MIN_SUB_HEADINGS`` or more than ``_MAX_SUB_HEADINGS`` sub-headings.
    """
    parts = text_dan.split(FIELD_SEPARATOR)
    if len(parts) != 2:
        return None
    title_prompt, mulu = parts

    if _TITLE_RE.search(title_prompt) is None:
        print(title_prompt)
        return None

    entries = _classify_lines(mulu)
    # Fewer than three lines can never reach _MIN_TOP_LEVEL headings; bailing
    # out here also keeps the look-ahead below from indexing past the end.
    if len(entries) < 3:
        return None
    leading_levels = [level for _, level in entries[:3]]
    if leading_levels[0] != _LEVEL_ONE:
        return None
    if all(level == _LEVEL_ONE for level in leading_levels):
        return None

    outline = _build_outline(_drop_trailing_sections(entries))
    if outline is None or len(outline) < _MIN_TOP_LEVEL:
        return None
    # The last first-level heading is exempt from the sub-heading count check.
    for _, sub_headings in outline[:-1]:
        if not _MIN_SUB_HEADINGS <= len(sub_headings) <= _MAX_SUB_HEADINGS:
            return None

    mulu_txt = "\n\n".join(
        "\n".join([heading] + sub_headings) for heading, sub_headings in outline
    )

    # Literal "\n" sequences in the data become real newlines, and the heading
    # count requested in the prompt is rewritten to the actual count.
    # NOTE(review): the replacement drops the measure word "个" that follows
    # the number in the original prompt - confirm this is intended.
    title_prompt = title_prompt.replace("\\n", "\n").replace(
        "一级标题不少于7个", "一级标题不少于{}".format(len(outline))
    )
    mulu_txt = mulu_txt.replace("\\n", "\n")

    return FIELD_SEPARATOR.join([title_prompt, mulu_txt])


def main():
    """Filter the records stored in ``file`` and write the survivors back."""
    with open(file, encoding="utf-8") as f:
        text = f.read()

    # Everything after the final record separator is discarded.
    text_list = text.split(RECORD_SEPARATOR)[:-1]

    mulu_list_new = []
    for text_dan in tqdm(text_list):
        record = clean_record(text_dan)
        if record is not None:
            mulu_list_new.append(record)

    # The input file is overwritten; the whole input is already in memory.
    with open(file, mode="w", encoding="utf-8") as f:
        for record in mulu_list_new:
            f.write(record)
            f.write(RECORD_SEPARATOR)


if __name__ == "__main__":
    main()