You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
80 lines
2.6 KiB
80 lines
2.6 KiB
![]()
2 years ago
|
import os
|
||
|
from tqdm import tqdm
|
||
|
import re
|
||
|
|
||
|
|
||
|
# Regex capturing the outline (table of contents) quoted inside a prompt: the
# text between `目录是“` and `”,请把其中的`.  Entries inside the capture are
# separated by a literal backslash followed by "n", not by real newlines.
patten = "目录是“(.*)”,请把其中的"
# Marker that precedes the prompt section of a sample file.
p0 = "@@@@@@@@@@@@@@@@@@"
# Fixed length hint in the prompt; replaced by one derived from the real
# length of the generated text.
p1 = "补充内容字数在1500字左右"
# Marker that separates the prompt from the generated text.
p2 = "**************"

# Directories walked (recursively) for sample files, in this order.
INPUT_DIRS = (
    r"./data/paper_prompt_title_3_2/small_title_prompt_shuffle_1",
    r"./data/paper_prompt_title_3_2/small_title_prompt_shuffle_2",
    r"./data/paper_prompt_title_3_2_10000_40000/small_title_prompt_2_10000_40000",
)
# File that receives the concatenation of all kept, rewritten samples.
OUTPUT_PATH = "./data/paper_prompt_title_3_1/title_mulu_small_title_prompt_shuffle_data.txt"


def collect_files(directories):
    """Return the path of every file below each directory, in ``os.walk`` order.

    Directories that do not exist contribute nothing (``os.walk`` yields no
    entries for them).
    """
    paths = []
    for directory in directories:
        for root, _subdirs, names in os.walk(directory):
            paths.extend(os.path.join(root, name) for name in names)
    return paths


def split_sample(text):
    """Split the raw text of one sample file into ``(prompt, generation)``.

    The prompt is the text between the first ``p0`` marker and the following
    ``p2`` marker; the generation is the text after that ``p2`` up to the next
    ``p2`` (or the end of the ``p0`` section).  Returns ``None`` when either
    marker is missing instead of raising ``IndexError``.
    """
    sections = text.split(p0)
    if len(sections) < 2:
        return None
    parts = sections[1].split(p2)
    if len(parts) < 2:
        return None
    return parts[0], parts[1]


def parse_outline(prompt):
    """Return the outline headings quoted in ``prompt``.

    Returns ``None`` when the prompt does not match ``patten``.  Otherwise
    returns the stripped, non-empty headings in order (possibly an empty list).
    """
    match = re.search(patten, prompt)
    if match is None:
        return None
    stripped = (entry.strip() for entry in match.group(1).split("\\n"))
    # Filter after stripping: a whitespace-only entry would otherwise become
    # "" and "\n" + "" is found in practically every generation.
    return [title for title in stripped if title]


def repeats_outline(titles, generation):
    """Tell whether ``generation`` contains two consecutive outline headings.

    A heading counts as present when it occurs directly after a newline.

    NOTE(review): only pairs starting at index 0 .. len(titles) - 3 are
    examined, so the last consecutive pair is never checked.  This mirrors the
    long-standing ``range(len(titles) - 2)`` bound; confirm whether
    ``len(titles) - 1`` was intended before changing it, because it alters
    which samples are dropped.
    """
    return any(
        "\n" + first in generation and "\n" + second in generation
        for first, second in zip(titles[:-2], titles[1:])
    )


def rewrite_sample(prompt, generation):
    """Return the serialized sample with the length hint matched to the text.

    The ``p1`` hint in the prompt is replaced by the generation length rounded
    down to a multiple of 100; the result is ``p0 + prompt + p2 + generation``.
    """
    rounded = len(generation) // 100 * 100
    hint = "".join(["补充内容字数在", str(rounded), "字左右"])
    return p0 + prompt.replace(p1, hint) + p2 + generation


def main():
    """Filter and rewrite every sample under ``INPUT_DIRS`` into ``OUTPUT_PATH``.

    Samples whose generation repeats consecutive outline headings are dropped
    and counted; prompts without an outline are printed and dropped; files
    lacking the section markers are reported and dropped.
    """
    data_path_list = collect_files(INPUT_DIRS)
    print(data_path_list)

    dropped = 0
    kept = []
    for path in tqdm(data_path_list):
        with open(path, encoding="utf-8") as f:
            sample = split_sample(f.read())
        if sample is None:
            print("skipping file without p0/p2 markers:", path)
            continue
        prompt, generation = sample

        titles = parse_outline(prompt)
        if titles is None:
            print(prompt)
            continue

        if repeats_outline(titles, generation):
            dropped += 1
            continue

        kept.append(rewrite_sample(prompt, generation))

    data_str = "".join(kept)
    print(dropped)
    print(data_str)

    # Create the target directory up front so a missing folder cannot discard
    # the finished result.
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, mode="w", encoding="utf-8") as f:
        f.write(data_str)


if __name__ == "__main__":
    main()
|
||
|
|