You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
131 lines
3.7 KiB
131 lines
3.7 KiB
![]()
2 years ago
|
import os
|
||
|
from tqdm import tqdm
|
||
|
import re
|
||
|
|
||
|
# Raw samples generated by ChatGPT.
file = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt"

# Samples in the dump are separated by a run of twenty "@" characters.
sample_delimiter = "@" * 20

with open(file, mode="r", encoding="utf-8") as f:
    text = f.read()

# The last piece produced by the split is dropped (presumably the remainder
# after the final delimiter -- confirm against the data file).
text_list = text.split(sample_delimiter)
del text_list[-1]
|
||
|
|
||
|
# print(text_list1
|
||
|
|
||
|
# Captures the paper title placed between 《 》 in the generation prompt.
pantten_title = "《(.*?)》生成目录,要求只有一级标题和二级标题,"

# Matches a first-level heading marker: one numeral (Arabic 1-9, Chinese 一-九
# or Roman Ⅰ-Ⅸ), then "、" or ".", optional whitespace, then at least one CJK
# or Latin letter. A raw string is used so that "\s" is not an invalid
# string-literal escape; the "\uXXXX" escapes are resolved by `re` itself.
pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'

# Keywords identifying back-matter entries (acknowledgements, references,
# appendix) in an outline.
thanks = "致谢"
references = "参考文献"
excursus = "附录"
|
||
|
|
||
|
|
||
|
def _split_outline_lines(mulu):
    """Return the stripped, non-blank lines of an outline.

    A leading "目录" header line is discarded. Whitespace-only lines are
    dropped so they cannot turn into empty second-level headings.
    """
    lines = [line.strip() for line in str(mulu).strip("\n").split("\n")]
    lines = [line for line in lines if line]
    if lines and lines[0] == "目录":
        lines = lines[1:]
    return lines


def _label_headings(lines):
    """Pair every outline line with its level tag ("一级标题" / "二级标题")."""
    return [
        (line, "一级标题" if re.search(pantten_biaoti, line) else "二级标题")
        for line in lines
    ]


def _drop_back_matter(entries):
    """Remove reference/acknowledgement/appendix entries from the last five.

    Filtering is positional, so an identical heading earlier in the outline
    is never removed by mistake, and a line containing several keywords is
    removed exactly once.
    """
    keywords = (references, thanks, excursus)
    head, tail = entries[:-5], entries[-5:]
    kept_tail = [
        entry for entry in tail
        if not any(keyword in entry[0] for keyword in keywords)
    ]
    return head + kept_tail


def _group_sections(entries):
    """Group labelled lines into ``(heading, [sub_headings])`` pairs.

    Returns None when a second-level line appears before any first-level
    heading, because such an outline cannot be grouped.
    """
    sections = []
    for line, level in entries:
        if level == "一级标题":
            sections.append((line, []))
        elif sections:
            sections[-1][1].append(line)
        else:
            return None
    return sections


def _clean_sample(text_dan):
    """Validate one raw sample and return its single-line training record.

    Returns None when the sample must be skipped:
      * it does not split into exactly one prompt and one outline;
      * the prompt carries no 《title》 (the prompt is printed for inspection);
      * the outline has fewer than three lines, does not start with a
        first-level heading, or its first three lines are all first-level;
      * it has fewer than six first-level sections after back matter is
        removed;
      * any section except the last has fewer than 2 or more than 5
        sub-headings.
    """
    parts = text_dan.split("**************")
    if len(parts) != 2:
        return None
    title_prompt, mulu = parts

    if re.search(pantten_title, title_prompt) is None:
        print(title_prompt)
        return None

    entries = _label_headings(_split_outline_lines(mulu))
    # Fewer than three lines can never yield six sections; rejecting here
    # also keeps the look-ahead below from indexing past the end.
    if len(entries) < 3:
        return None
    leading_levels = [level for _, level in entries[:3]]
    if leading_levels[0] != "一级标题":
        return None
    if all(level == "一级标题" for level in leading_levels):
        return None

    sections = _group_sections(_drop_back_matter(entries))
    if sections is None or len(sections) < 6:
        return None
    # The last section is exempt from the sub-heading count limits.
    if any(not 2 <= len(subs) <= 5 for _, subs in sections[:-1]):
        return None

    mulu_txt = "\n\n".join(
        "\n".join([heading] + subs) for heading, subs in sections
    )
    # Newlines are escaped so that every record occupies exactly one line.
    return "**************".join([
        title_prompt.replace("\n", "\\n"),
        mulu_txt.replace("\n", "\\n"),
    ])


# Cleaned "prompt**************outline" records, in input order.
mulu_list_new = []

for text_dan in tqdm(text_list):
    cleaned_sample = _clean_sample(text_dan)
    if cleaned_sample is not None:
        mulu_list_new.append(cleaned_sample)
|
||
|
|
||
|
|
||
|
# Write the cleaned records to the training file, one record per line.
with open("./data/训练数据集合/generate_mulu.txt", mode="w", encoding="utf-8") as f:
    f.writelines(record + "\n" for record in mulu_list_new)
|