
1 changed files with 131 additions and 0 deletions
@ -0,0 +1,131 @@ |
|||
import os |
|||
from tqdm import tqdm |
|||
import re |
|||
|
|||
# Data generated by ChatGPT.
file = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt"

with open(file, encoding="utf-8") as f:
    text = f.read()

# Samples are separated by a run of twenty "@" characters; whatever follows
# the final separator is discarded.
*text_list, _ = text.split("@" * 20)
|||
|
|||
# print(text_list1 |
|||
|
|||
# Captures the paper title quoted in 《...》 inside the generation prompt.
pantten_title = "《(.*?)》生成目录,要求只有一级标题和二级标题,"

# Matches a first-level heading: a single Arabic, Chinese or Roman numeral,
# followed by "、" or ".", optional whitespace, then Chinese/Latin letters.
# Raw string so that "\s" reaches the regex engine instead of being an
# invalid Python string escape; `re` resolves "\uXXXX" itself.
pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'

# Keywords of trailing sections (acknowledgements, references, appendix).
thanks = "致谢"
references = "参考文献"
excursus = "附录"
|||
|
|||
|
|||
# Collects one "<prompt>**************<outline>" line per accepted sample.
mulu_list_new = []

# Compiled once: both patterns are applied to every sample / outline line.
title_re = re.compile(pantten_title)
heading_re = re.compile(pantten_biaoti)
# Trailing sections that are stripped from the end of an outline.
tail_keywords = (references, thanks, excursus)

for text_dan in tqdm(text_list):
    # A sample must be exactly "<prompt>**************<outline>".
    parts = text_dan.split("**************")
    if len(parts) != 2:
        continue
    title_prompt, mulu = parts

    # The prompt must contain the expected 《title》 phrase; report and skip
    # samples that do not.
    if title_re.search(title_prompt) is None:
        print(title_prompt)
        continue

    mulu_list = mulu.strip("\n").split("\n")
    if mulu_list[0] == "目录":
        # Drop the leading "table of contents" caption line.
        mulu_list = mulu_list[1:]
    # Strip before filtering so whitespace-only lines are dropped instead of
    # turning into empty sub-headings.
    mulu_list = [line for line in (raw.strip() for raw in mulu_list) if line]

    # Tag every line as first-level ("一级标题") or second-level ("二级标题").
    mulu_list_bool = [
        (line, "一级标题" if heading_re.search(line) else "二级标题")
        for line in mulu_list
    ]

    # Fewer than three lines can never satisfy the six-chapter minimum below,
    # and the checks on the first three entries would raise IndexError.
    if len(mulu_list_bool) < 3:
        continue
    # The outline has to open with a first-level heading ...
    if mulu_list_bool[0][1] != "一级标题":
        continue
    # ... and its first three lines must not all be first-level headings.
    if all(level == "一级标题" for _, level in mulu_list_bool[:3]):
        continue

    # Strip references / acknowledgements / appendix entries, looking only at
    # the last five lines. Filtering by position removes each entry once and
    # never touches an identical line earlier in the outline.
    tail_start = max(len(mulu_list_bool) - 5, 0)
    mulu_list_bool = mulu_list_bool[:tail_start] + [
        entry
        for entry in mulu_list_bool[tail_start:]
        if not any(keyword in entry[0] for keyword in tail_keywords)
    ]

    # Group second-level headings under the preceding first-level heading.
    table_of_contents = []
    for heading, level in mulu_list_bool:
        if level == "一级标题":
            table_of_contents.append(
                {
                    # The "@@" prefix is removed again when rendering below.
                    "title": "@@" + heading,
                    "small_title": [],
                    "word_count": 0,
                }
            )
        elif table_of_contents:
            table_of_contents[-1]["small_title"].append(heading)
        # A sub-heading with no chapter before it can only occur when the
        # opening heading was stripped above, i.e. the outline had at most
        # five lines; such outlines fail the chapter-count check below.

    # Require at least six chapters ...
    if len(table_of_contents) < 6:
        continue
    # ... and 2-5 sub-headings in every chapter except the last one.
    if any(
        not 2 <= len(chapter["small_title"]) <= 5
        for chapter in table_of_contents[:-1]
    ):
        continue

    # Render: heading and its sub-headings one per line, chapters separated
    # by a blank line.
    table_of_contents_new = [
        "\n".join([chapter["title"][2:]] + chapter["small_title"])
        for chapter in table_of_contents
    ]
    mulu_txt = "\n\n".join(table_of_contents_new)

    # Escape newlines so that each sample occupies a single output line.
    title_prompt = title_prompt.replace("\n", "\\n")
    mulu_txt = mulu_txt.replace("\n", "\\n")

    mulu_list_new.append("**************".join([title_prompt, mulu_txt]))
|||
|
|||
|
|||
# Write the accepted samples, one per line.
with open("./data/训练数据集合/generate_mulu.txt", mode="w", encoding="utf-8") as f:
    f.writelines(sample + "\n" for sample in mulu_list_new)
Loading…
Reference in new issue