数据处理代码,为了生成chatgpt数据
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

80 lines
2.6 KiB

2 years ago
import os
from tqdm import tqdm
import re
# Regex applied to each prompt with re.findall; its single group captures the
# outline text quoted after "目录是". No flags are passed, so "." cannot cross
# a real line break.
patten = "目录是“(.*)”,请把其中的"
# Marker on which each raw file is split; the prompt is read from the text
# that follows its first occurrence.
p0 = "@@@@@@@@@@@@@@@@@@"
# Fixed word-count phrase looked up in the prompt; it is replaced by a phrase
# built from the real length of the generated text.
p1 = "补充内容字数在1500字左右"
# Marker that separates the prompt from the generated text inside a file.
p2 = "**************"
def collect_file_paths(directories):
    """Return the path of every file found below the given directories.

    Each directory is walked recursively with ``os.walk``, in the order
    given, and every file name is joined to the directory it was found in.
    A directory that does not exist contributes no paths, because
    ``os.walk`` ignores listing errors by default.

    Args:
        directories: iterable of directory paths to scan.

    Returns:
        A list of file paths, grouped by input directory in input order.
    """
    paths = []
    for directory in directories:
        for root, _dirs, files in os.walk(directory):
            for file in files:
                paths.append(os.path.join(root, file))
    return paths


# Every file below these three directories is one raw sample to process.
data_path_list = collect_file_paths([
    r"./data/paper_prompt_title_3_2/small_title_prompt_shuffle_1",
    r"./data/paper_prompt_title_3_2/small_title_prompt_shuffle_2",
    r"./data/paper_prompt_title_3_2_10000_40000/small_title_prompt_2_10000_40000",
])
print(data_path_list)
# Heading pattern: a digit, Chinese numeral or Roman numeral, then "、" or ".",
# optional whitespace, then CJK/Latin letters. Written as a raw string so that
# "\s" reaches the regex engine untouched (the re module itself understands
# the "\uXXXX" escapes), and compiled once instead of on every outline entry.
pantten_biaoti = re.compile(
    r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
)

jishu = 0  # samples dropped because the generated text repeats outline entries
kept_samples = []  # accepted "p0 + prompt + p2 + generated text" records

for data_path in tqdm(data_path_list):
    with open(data_path, encoding="utf-8") as f:
        data_dan = f.read()

    # Expected layout: <ignored> p0 <prompt> p2 <generated text> [p2 ...].
    # A file without both markers is reported and skipped rather than
    # aborting the whole run with an IndexError.
    segments = data_dan.split(p0)
    data_dan_list = segments[1].split(p2) if len(segments) > 1 else []
    if len(data_dan_list) < 2:
        print("skip, delimiters not found:", data_path)
        continue
    tishi = data_dan_list[0]
    gen = data_dan_list[1]

    # Pull the quoted outline out of the prompt; prompts without one are
    # printed for inspection and skipped.
    result_biaoti_list = re.findall(patten, tishi)
    if not result_biaoti_list:
        print(tishi)
        continue
    mulu = str(result_biaoti_list[0])

    # Outline entries are separated by a literal backslash followed by "n"
    # (two characters), not by a real line break. Entries are stripped first
    # and filtered afterwards, so whitespace-only entries cannot survive as
    # "" and trivially match the generated text below.
    mulu_list = [entry.strip() for entry in mulu.split("\\n")]
    mulu_list = [entry for entry in mulu_list if entry]

    # Tag each entry as first-level ("一级标题") when it matches the heading
    # pattern, otherwise as second-level ("二级标题").
    mulu_list_bool = [
        (entry, "一级标题" if pantten_biaoti.search(entry) else "二级标题")
        for entry in mulu_list
    ]

    # Drop the sample when two adjacent outline entries both appear at the
    # start of a line in the generated text.
    # NOTE(review): the range stops at len - 2, so the final adjacent pair is
    # never tested; kept as in the original -- confirm whether intended.
    dayin = any(
        "\n" + mulu_list_bool[k][0] in gen
        and "\n" + mulu_list_bool[k + 1][0] in gen
        for k in range(len(mulu_list) - 2)
    )
    if dayin:
        jishu += 1
        continue

    # Replace the fixed word-count phrase with the generated text's real
    # length, rounded down to a multiple of 100.
    zishu = str(len(gen) // 100 * 100)
    prompt = tishi.replace(p1, "".join(["补充内容字数在", zishu, "字左右"]))
    kept_samples.append(p0 + prompt + p2 + gen)

# Join once at the end instead of growing a string inside the loop.
data_str = "".join(kept_samples)
print(jishu)
print(data_str)
with open("./data/paper_prompt_title_3_1/title_mulu_small_title_prompt_shuffle_data.txt", mode="w", encoding="utf-8") as f:
    f.write(data_str)