
1 changed files with 131 additions and 0 deletions
@ -0,0 +1,131 @@ |
|||||
|
import os |
||||
|
from tqdm import tqdm |
||||
|
import re |
||||
|
|
||||
|
# Input data generated by ChatGPT.  The file is a sequence of records, each
# terminated by a run of 20 "@" characters; a record has the form
# "<title prompt>**************<generated table of contents>".
file = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt"
output_file = "./data/训练数据集合/generate_mulu.txt"

record_separator = "@" * 20
field_separator = "**************"

# The prompt must contain this phrase for the record to be used.
pantten_title = "《(.*?)》生成目录,要求只有一级标题和二级标题,"
# A line containing "<digit / Chinese numeral / Roman numeral><、 or .>" followed
# (after optional whitespace) by CJK or Latin letters is a first-level heading.
# Raw string so that "\s" reaches the regex engine untouched.
pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
thanks = "致谢"
references = "参考文献"
excursus = "附录"

# Compiled once instead of being re-parsed for every line of every record.
_TITLE_RE = re.compile(pantten_title)
_HEADING_RE = re.compile(pantten_biaoti)
_BACK_MATTER = (references, thanks, excursus)

_TAIL_WINDOW = 5        # back matter is only looked for in the last N lines
_MIN_CHAPTERS = 6       # outlines with fewer first-level headings are rejected
_MIN_SUBHEADINGS = 2    # per chapter, last chapter exempt
_MAX_SUBHEADINGS = 5    # per chapter, last chapter exempt


def _classify_lines(outline):
    """Split *outline* into ``(line, is_first_level)`` pairs.

    Lines are stripped and blank (including whitespace-only) lines are
    dropped, then a leading "目录" caption line is discarded.
    """
    lines = [line.strip() for line in outline.split("\n")]
    lines = [line for line in lines if line]
    if lines and lines[0] == "目录":
        lines = lines[1:]
    return [(line, _HEADING_RE.search(line) is not None) for line in lines]


def _strip_back_matter(entries):
    """Drop references/acknowledgements/appendix lines from the tail.

    Only the last ``_TAIL_WINDOW`` entries are examined.  Entries are removed
    by position, exactly once each, so a line naming two markers or a line
    that duplicates an earlier one cannot affect the rest of the outline.
    """
    split_at = max(len(entries) - _TAIL_WINDOW, 0)
    kept_tail = [
        entry for entry in entries[split_at:]
        if not any(marker in entry[0] for marker in _BACK_MATTER)
    ]
    return entries[:split_at] + kept_tail


def _group_chapters(entries):
    """Group entries into ``[heading, sub_heading, ...]`` lists.

    Returns ``None`` when a second-level line appears before any first-level
    heading, since it has no chapter to belong to.
    """
    chapters = []
    for line, is_first_level in entries:
        if is_first_level:
            chapters.append([line])
        elif chapters:
            chapters[-1].append(line)
        else:
            return None
    return chapters


def _is_well_formed(chapters):
    """Check the chapter count and the sub-heading count of each chapter.

    The final chapter is exempt from the sub-heading limits.
    """
    if len(chapters) < _MIN_CHAPTERS:
        return False
    return all(
        _MIN_SUBHEADINGS <= len(chapter) - 1 <= _MAX_SUBHEADINGS
        for chapter in chapters[:-1]
    )


def convert_record(text_dan):
    """Convert one raw record into one training line.

    *text_dan* is "<title prompt>**************<outline>".  Returns the
    prompt and the cleaned outline joined by the same separator, with real
    newlines replaced by the two characters ``\\n`` so the result fits on a
    single line.  Returns ``None`` when the record is rejected:

    * it does not split into exactly two fields;
    * the prompt lacks the expected title phrase (the prompt is printed);
    * the outline has fewer than three lines, does not start with a
      first-level heading, or starts with three first-level headings in a row;
    * after removing back matter it has fewer than six chapters, or a chapter
      other than the last has fewer than 2 or more than 5 sub-headings.
    """
    fields = text_dan.split(field_separator)
    if len(fields) != 2:
        return None
    title_prompt, mulu = fields

    if _TITLE_RE.search(title_prompt) is None:
        print(title_prompt)
        return None

    entries = _classify_lines(mulu)
    # The three-line look-ahead below needs three entries; anything shorter
    # could never reach the minimum chapter count anyway.
    if len(entries) < 3:
        return None
    if not entries[0][1]:
        return None
    if entries[1][1] and entries[2][1]:
        # Three first-level headings in a row: no second-level structure.
        return None

    chapters = _group_chapters(_strip_back_matter(entries))
    if chapters is None or not _is_well_formed(chapters):
        return None

    mulu_txt = "\n\n".join("\n".join(chapter) for chapter in chapters)
    return field_separator.join(
        [title_prompt.replace("\n", "\\n"), mulu_txt.replace("\n", "\\n")]
    )


def main():
    """Read the raw records, convert them and write one result per line."""
    with open(file, encoding="utf-8") as f:
        text = f.read()

    # Every record is followed by the separator, so the final piece of the
    # split is not a record and is dropped.
    text_list = text.split(record_separator)[:-1]

    mulu_list_new = []
    for text_dan in tqdm(text_list):
        converted = convert_record(text_dan)
        if converted is not None:
            mulu_list_new.append(converted)

    with open(output_file, mode="w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in mulu_list_new)


if __name__ == "__main__":
    main()
Loading…
Reference in new issue