
First commit

master
majiahui@haimaqingfan.com committed 2 years ago
commit a8031664e4
  1. .idea/.gitignore (+8)
  2. 1500字小标题论文整合文件.py (+79)
  3. 5369885a-0354-11ee-a85d-aaaa001aad2e_data.txt (+15)
  4. README.md (+0)
  5. a.py (+6)
  6. api_key.txt (+187)
  7. chatgpt_post.py (+43)
  8. chatgpt_request.py (+28)
  9. chinese_keyword转化为提示.py (+41)
  10. data_do/处理chatgpt生成的数据到标注数据.py (+201)
  11. data_do/生成目录训练数据.py (+133)
  12. generate_en_keyword.py (+119)
  13. generate_mulu.py (+188)
  14. generate_mulu_only.py (+140)
  15. generate_small_title.py (+93)
  16. generate_small_title_new.py (+125)
  17. generate_small_title_zengliang.py (+126)
  18. generate_small_title_zhaiyao.py (+125)
  19. generate_small_title_zishuzengzhang.py (+125)
  20. jianjie转化为tishi.py (+71)
  21. main.py (+16)
  22. mulu转化为提示文本.py (+221)
  23. mulu转化为提示文本_只针对小标题切无字数控制.py (+210)
  24. mulu转化为提示文本生成摘要.py (+68)
  25. read_train_json.py (+12)
  26. train.py (+0)
  27. train_novel.py (+225)
  28. zhaiyao_chinese_keyword_prompt_data.txt (+1470)
  29. zhaiyoa转化为提示.py (+86)
  30. 以核心内容生成开题报告目录等内容.py (+158)
  31. 以核心内容生成开题报告目录等内容_问题补充.py (+155)
  32. 取小标题数据数据.py (+16)
  33. 合并数据.py (+23)
  34. 提示文本总结 (+17)
  35. 摘要小文件整合.py (+32)
  36. 数据筛选llama.py (+108)
  37. 数据筛选rwkv.py (+91)
  38. 数据统计.py (+33)
  39. 测试正则.py (+14)
  40. 测试生成效果.py (+38)
  41. 生成chatglm训练数据.py (+159)
  42. 生成chatglm训练数据包含prompt.py (+165)
  43. 生成文本核心内容提示.py (+127)
  44. 生成训练文件.py (+74)
  45. 统计数据.py (+11)
  46. 统计标题.py (+6)
  47. 计算器.py (+11)
  48. 训练数据筛选.py (+83)
  49. 训练数据筛选_prompt.py (+89)
  50. 题目去重.py (+41)

8
.idea/.gitignore

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

79
1500字小标题论文整合文件.py

@@ -0,0 +1,79 @@
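# Merge the section ("small title") completions from three result directories
# into one training file: drop samples where the model echoed the outline
# headings instead of writing section content, and rewrite each prompt's
# word-count hint to match the actual generated length.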
import os
from tqdm import tqdm
import re

patten = "目录是“(.*)”,请把其中的"
p0 = "@@@@@@@@@@@@@@@@@@"
p1 = "补充内容字数在1500字左右"
p2 = "**************"

# Collect the result files from the three generation directories.
data_path_list = []
for root, dirs, files in os.walk(r"./data/paper_prompt_title_3_2/small_title_prompt_shuffle_1"):
    for file in files:
        data_path_list.append(os.path.join(root, file))
for root, dirs, files in os.walk(r"./data/paper_prompt_title_3_2/small_title_prompt_shuffle_2"):
    for file in files:
        data_path_list.append(os.path.join(root, file))
for root, dirs, files in os.walk(r"./data/paper_prompt_title_3_2_10000_40000/small_title_prompt_2_10000_40000"):
    for file in files:
        data_path_list.append(os.path.join(root, file))
print(data_path_list)

jishu = 0  # number of discarded samples
data_str = ""
for path in tqdm(data_path_list):
    dayin = False
    with open(path, encoding="utf-8") as f:
        data_dan = f.read()
    data_dan_list = data_dan.split(p0)[1].split(p2)
    tishi = data_dan_list[0]  # the prompt
    gen = data_dan_list[1]    # the generated completion
    gen_len = len(gen)
    result_biaoti_list = re.findall(patten, tishi)
    try:
        mulu = str(result_biaoti_list[0])
    except IndexError:
        print(tishi)
        continue
    # The outline embedded in the prompt uses the literal two-character
    # sequence "\n" as its line separator.
    mulu_list = mulu.split("\\n")
    mulu_list = [line.strip() for line in mulu_list if line != ""]
    mulu_list_bool = []
    pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
    for line in mulu_list:
        result_biaoti_list = re.findall(pantten_biaoti, line)
        if result_biaoti_list != []:
            mulu_list_bool.append((line, "一级标题"))  # first-level heading
        else:
            mulu_list_bool.append((line, "二级标题"))  # second-level heading
    # If the generation repeats two consecutive outline headings, the model
    # reproduced the outline instead of expanding one section: discard it.
    for idx in range(len(mulu_list) - 2):
        if "\n" + mulu_list_bool[idx][0] in gen and "\n" + mulu_list_bool[idx + 1][0] in gen:
            dayin = True
            break
    if dayin == True:
        jishu += 1
        continue
    # Rewrite the word-count hint to the actual generated length, rounded
    # down to the nearest hundred characters.
    zishu = str(int(gen_len / 100) * 100)
    prompt = tishi.replace(p1, "".join(["补充内容字数在", zishu, "字左右"]))
    data_str += p0 + prompt + p2 + gen
print(jishu)
print(data_str)

with open("./data/paper_prompt_title_3_1/title_mulu_small_title_prompt_shuffle_data.txt", mode="w", encoding="utf-8") as f:
    f.write(data_str)

15
5369885a-0354-11ee-a85d-aaaa001aad2e_data.txt

@@ -0,0 +1,15 @@
@@@@@@@@@@@@@@@@@@论文题目是“不同有机肥料对猕猴桃土壤微生态及产量、品质的影响研究”,目录是“一、研究背景和目的\n 1.1 研究背景\n 1.2 研究目的\n\n二、相关理论和研究现状\n 2.1 有机肥料的种类和特点\n 2.2 猕猴桃的土壤微生态及其影响因素\n 2.3 有机肥料对猕猴桃产量和品质的影响研究现状\n\n三、材料与方法\n 3.1 实验地点和时间\n 3.2 实验设计\n 3.3 实验材料和处理\n\n四、结果与分析\n 4.1 土壤微生态指标的变化\n 4.2 猕猴桃产量的变化\n 4.3 猕猴桃品质的变化\n\n五、结论与展望\n 5.1 结论\n 5.2 展望\n\n六、参考文献”,请把其中的小标题“3.3 实验材料和处理”的内容补充完整,补充内容字数在1500字左右**************3.3 实验材料和处理
3.3.1 实验材料
本实验选取了猕猴桃栽培常用的有机肥料,包括牛粪、鸡粪、豆渣、蚯蚓粪和堆肥。这些有机肥料均为当地市场上常见的肥料,选取时要求肥料来源可靠、质量稳定。同时,为了保证实验的可比性,选取的有机肥料均为新鲜的,未经过发酵处理。
3.3.2 实验处理
本实验共设置了五个有机肥料处理,分别为牛粪、鸡粪、豆渣、蚯蚓粪和堆肥,以及一个对照组,即不施加有机肥料。每个处理设置了三个重复,共计18个试验单位。
在实验开始前,先将试验地点进行清理和平整,确保试验地点的土壤质量和水分状况均匀。接着,根据实验设计,在试验地点上设置了18个试验单位。每个试验单位的大小为2 m × 2 m,其中中央1 m × 1 m 的区域为猕猴桃树的生长区域,周围1 m 的区域为控制区域。
在实验开始前,先对试验地点的土壤进行了全面的化验分析,以了解土壤的基本性质和养分状况。根据化验结果,确定了每个试验单位所需的施肥量。在实验开始前,先将有机肥料进行筛选和清洗,去除杂质和异物。然后将有机肥料均匀地撒在每个试验单位的中央1 m × 1 m 的区域内,厚度约为5 cm。接着,将有机肥料与土壤混合均匀,然后进行翻耕和平整,使有机肥料与土壤充分接触。
在猕猴桃生长期间,根据实验设计,对每个试验单位进行了相同的管理和维护。包括定期浇水、除草、松土、修剪等。同时,对每个试验单位进行了相同的采样和分析,以了解土壤微生态指标、猕猴桃产量和品质的变化。

0
README.md

6
a.py

@@ -0,0 +1,6 @@
title = "大自然"
b = f"你好,{title}"
print(b)

187
api_key.txt

@@ -0,0 +1,187 @@
suppcopu@tersnetz.best----5w9qoEQr----POCELccB2F----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
dusomulti@tersnetz.best----aDNO7BcM----dus2OaZ----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
bensuba@tersnetz.best----CB2s90DNo----Z2VY4G2----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
pokari@tersnetz.best----XKkVzDbnfB----DipqKfn----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
paisa@tersnetz.best----UDYF6prHog----SvY63GaE----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
nichin@tersnetz.best----kLOEJtYZS----kolaE6oWH9----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
saeburficudi@tersnetz.best----7SP9lWnA----JpHWCCNgk----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
seubercadi@tersnetz.best----xw3Y4ntl----gSykWdYNBT----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
daiguri@tersnetz.best----rdHfvLan----yAre6Ya3----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
regakuzuhi@tersnetz.best----SjI0de3NY----3OY1hXrt----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
demagvice@tersnetz.best----9xpzk8wCt3----mE1DjozAP----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
vilule@tersnetz.best----f9YRSo61s----ycBciuDA----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
kokuji@tersnetz.best----IiNfcoZrh----N3MSAzzA----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
woiba@tersnetz.best----1UGfuJIYnA----gS3yJekxc4----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
terpgerfipasio@tersnetz.best----LV8ITFXwt----5nEDpOLcLQ----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
inimep@tersnetz.best----BKOpyYvVqm----dvpRZwD----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
ligby@tersnetz.best----y9O75hW4nR----8YhM9TZdr----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
guruni@tersnetz.best----mq9lAGye----DruCe1b8----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
sylimica@tersnetz.best----mGfSWZkb----fuI8nr7----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
shino@tersnetz.best----pGPOj2KfC----hAmMwcV7Vk----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
glutorpatwallsket@tersnetz.best----osxXYZAGyJ----MjoYp9wfm1----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
causenfors@tersnetz.best----EFxOBnofYm----umcHLmc----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
waru@tersnetz.best----aR6tA3vdCl----vaskmpqY----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
rikobake@tersnetz.best----hjf7oReFW----OjYJKLMn----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
panbancformi@tersnetz.best----DKjavVqN23----oURpmte6m----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
reacmecu@tersnetz.best----JPeYc64H----OBh4UyP6qk----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
bjorerde@tersnetz.best----mzNnraysu----Bbue3w6D----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
golddetarmars@tersnetz.best----YXE3CWUz9D----4zAjuwoHX----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
chiraino@tersnetz.best----YDNodsQU----Rd5XK1Vr----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
dense@tersnetz.best----yNIag2GUm----rhjEzVm----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
confrafab@tersnetz.best----WOaehfloc----8cXwgXIx----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
trumtheruvolking@tersnetz.best----UdPX17bys----tcocPmNjLm----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
ctivexum@tersnetz.best----3fqU0skQ----F7QwHRSh----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
tsukufuri@tersnetz.best----XHY4CSOrie----sVuobsY----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
langgemugo@tersnetz.best----Gl23t1zMoS----wpxJrJj7D----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
hukssunbcordia@tersnetz.best----rGMLBXZuHl----A1syOOwQXC----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
imcutel@tersnetz.best----N0J34mXC----IhYiYZRR----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
fasttubedstapo@tersnetz.best----AKeLUZ7Sl----bW7JRmKU----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
sonbeau@tersnetz.best----sjkgJ8hdv----zkSEgLRDI----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
kitsu@tersnetz.best----UtKNwdAifP----cnz7VcvF----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
flavduoda@tersnetz.best----LcfnjQHkoO----KuFG3gIi----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
dergduffsket@tersnetz.best----KpVU9hFv1X----vCsa6gJ----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
terdena@tersnetz.best----sVO8fUrM7----kOKYgIm----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
quilect@tersnetz.best----QVK6TINboy----RkiK55D----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
sturavupan@tersnetz.best----dGIXpM5nmE----GzC5Aug26s----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
daesaecicom@tersnetz.best----gQr5I0i4q3----OvyaeAy----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
billracfi@tersnetz.best----8PzN6TB3i9----XD5IkYoZ----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
baachieblogjugg@tersnetz.best----kCD9x8qcE----DhzxEcCJw----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
mabtipurpja@tersnetz.best----THetxRiaz----FEFkkRz----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
zenpuku@tersnetz.best----W0q14FOJnP----4z77pOeCm----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
pikiriha@tersnetz.best----8l5absUI----KSSz4dr----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
cunccommos@tersnetz.best----RhCdwaj7----OpoL6FX----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
seikake@tersnetz.best----g8fqrkmL----kuHCPxH----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
deshi@tersnetz.best----ROblT421----q3hrr1qSZ----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
okain@tersnetz.best----0KowvLYHp----lgwCnrQT----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
degai@tersnetz.best----5l43u9CrA----KlazC33DrS----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
nshimanji@tersnetz.best----RxyM369Xr----F54qcLz----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
robanatate@tersnetz.best----HlByp0Z1G----5WUHk7sM----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
mishime@tersnetz.best----fzSri4WlY----g8vbRwjnYn----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
praninanad@tersnetz.best----gsrVwMbf----3rJx4Ao2VS----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
mulsuhita@tersnetz.best----yMN0ZXWb----gU2H8iFCk----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
bionakupirs@tersnetz.best----EAuXa6qx1----lSd4VZKzg----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
neku@tersnetz.best----P2XtwnLa----usA2ApwEB----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
honyu@tersnetz.best----gBxXrPcn----Ql8b6IcrZ----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
latyvo@tersnetz.best----SGmqURdn----zAqTZLOzui----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
abuto@tersnetz.best----ct9CBRJE----bSxaQeGI----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
biconlen@tersnetz.best----09yJBwGQai----sHKTcGv----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
raslybe@tersnetz.best----68iZkI7YU----BGXgxiAaJV----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
sustcacuna@tersnetz.best----CH24eNyR----nYjc4yjSCT----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
wordhealthli@tersnetz.best----NIuJSdzR----WabZUQiOtZ----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
momeruoma@tersnetz.best----ezRkGBuQg----1JtY1NHm----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
shikiminka@tersnetz.best----DMP0CZ2j----3KPnPBibwJ----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
plosinumpel@tersnetz.best----T4vwFyLzg----Us78iwrH----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
torcato@tersnetz.best----A7158bEXp----WedQeSI6p----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
poscoaniema@tersnetz.best----FPk5eXZ0----2YZ5MHZx4H----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
riebo@tersnetz.best----V815QYeNZ----Jdxc1cTaR----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
zlatradu@tersnetz.best----ejtdwoUP7----W6KjGnskk----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
nujobco@tersnetz.best----AOhCKSn1o----jJ1laVg----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
todonahizu@tersnetz.best----XgyPS7JNfo----4QevFd6z----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
cetabethei@tersnetz.best----B4tuo9f0k----arUWRjZ6----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
tofenci@tersnetz.best----RefaiozD1----PDaZ7B2----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
jinna@tersnetz.best----bCjupdci75----nS3gR8T1----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
sconilin@tersnetz.best----zPaoiMfjJ----axYMSHydh----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
enomai@tersnetz.best----3NIJlUqM----pbdju7Fq8----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
senro@tersnetz.best----Xk0B8ltdG----JjtGAWEl----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
iltecounro@tersnetz.best----JT2q59vyLg----ywN9iO88c----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
itrefu@tersnetz.best----aftKNicH4----AVItDJiBG2----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
wakai@tersnetz.best----WlFeJUCsaR----YaYzzOKlqP----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
gokoshi@tersnetz.best----RV86XJyp----jt5WFHLVt----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
lefwwisibhuntdi@tersnetz.best----OreVGSgI----sbg7im4ps----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
rocomvilin@tersnetz.best----AlfviK3dra----4nt2W2rF----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
reilogphalac@tersnetz.best----m3LkYClJX----GUmlR3p----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
sculidenter@tersnetz.best----tUlAosJXdV----KWBznnO----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
mamsa@tersnetz.best----eVUj4g3c----Ab2QltdMhg----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
graphictecda@tersnetz.best----erh0nZuI----hsm3DSB----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
norekiri@tersnetz.best----dBxAEGtoc----U4kbwU6GHP----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
torsrurolis@tersnetz.best----ovThfDl8JO----QK7V9bn8----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
gurumarana@tersnetz.best----zkTD90NWHG----Ckt6z85j----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
giokett@tersnetz.best----VK0uAHi37----HId8LZxy----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
suin@tersnetz.best----fgIxcEVX----mx9Aaa2----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
wodohanke@tersnetz.best----bPFSgQDh9----qszZRmsyfV----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
taimuemite@tersnetz.best----mYQSrztN----mVdzeEOcsz----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
cuico@tersnetz.best----PVyOkW0r9----l2L7DkPc4o----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
bicaunocos@tersnetz.best----J5gbQ07mPs----RKxxjqW----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
cecafeta@tersnetz.best----Z5KrLsT79----9e1jfyi2ED----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
abtotiomil@tersnetz.best----fxegFI3S----1acH8JSy2----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
abutin@tersnetz.best----O53BR7k1F----6F3JvapCg----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
porliopersan@tersnetz.best----9YNfpI7Ang----ZLMFxUl----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
brocwinthocom@tersnetz.best----51fnDpNz----mDyqSsqm8----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
banta@tersnetz.best----K8Mmx9ov----yLHsnIIT----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
forcaucada@tersnetz.best----q8JGDtTv----Foz6594Z----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
ruimanquadna@tersnetz.best----Vbfcp5usd----puAUVk84If----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
chingo@tersnetz.best----4wFJyBaCu----RZbfOKqME----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
heie@tersnetz.best----zYcBJORnr----p7bzYgwMxx----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
genza@tersnetz.best----luaJBP40----k3vygXx----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
rabanse@tersnetz.best----NubZKUjy----ymPAm8s73----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
naistabidim@tersnetz.best----cmEWhaSx----GOrvLMtaS----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
jinshi@tersnetz.best----QKblfGOVZw----8fxWnagX----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
compmoperqui@tersnetz.best----UeRm2g9d5W----1cKdgmoI----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
faulealiamul@tersnetz.best----0MJeW8EO----cj2dvFXU----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
kioto@tersnetz.best----lU7S4ERexn----gV13ioHYh----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
ashin@tersnetz.best----aotDG8LsB----Y32jaNE----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
biolecpartu@tersnetz.best----DYxPqhrHJ----2KWQbGEGS----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
holosaperback@tersnetz.best----69qgaiAG----qpvWxtrlP----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
zumeishi@tersnetz.best----XMh3UbO7z----8X8WvxLr----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
dietuter@tersnetz.best----WRf64VsTxD----wfjFDrFFu----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
missniraga@tersnetz.best----NXbBHk8WQI----b71FlC38----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
inmistan@tersnetz.best----BwuVYCWJmA----YH1VU2iQsC----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
bergworllidu@tersnetz.best----5BFbjOeI4----N6nOfN3----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
keruma@tersnetz.best----ehMG1nTx----arVsSQj----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
acnisven@tersnetz.best----J4q9HrKo----Pj7EzUfYT----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
kelchace@tersnetz.best----tY1kVjHe----aYJ2njW----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
awai@tersnetz.best----S7siMGPCV----Nmp1c1Nk----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
sugari@tersnetz.best----HShEdOYprU----qSkanC9kJC----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
grytunmatbouhot@tersnetz.best----xfeScnXT1J----3Hof885QP----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
sebilucdenlo@tersnetz.best----V3wAWzLPeZ----RK2lneqJ2----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
zukika@tersnetz.best----wqB4PdVA----lTmeFVR----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
statabingran@tersnetz.best----arHep8fL----xLQujXy----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
bildjoundebt@tersnetz.best----l5cPtQmYzo----tAg5U9bkr----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
lanacepreai@tersnetz.best----bVKWQXBpv3----GIqo2nz----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
kokushina@tersnetz.best----w1jDxe23cq----rHQgkLUg2----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
claronince@tersnetz.best----lQpgBuFzwk----drsk7ZeTp----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
lustboconvavi@tersnetz.best----XyA7vlpN----2tov7Fi9----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
oyabu@tersnetz.best----Rf9MGtl4a----gG8xSqf1----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
suguriko@tersnetz.best----eQ2tumyB----FVpGjtQr----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
dialitendebe@tersnetz.best----1QvDTk6LE----fEJ8qfa----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
kisoa@tersnetz.best----WLEwqpsRH2----jUlIxOPBU----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
bokosa@tersnetz.best----baEg3hy9xR----dAYimP5J----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
crysinsubmen@tersnetz.best----F4Wn69G7A----NsLoccR5u----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
feporvo@tersnetz.best----3hLkfOmdA----1fdyQeu----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
mennici@tersnetz.best----CU8aqBbPKD----tpucc87uF----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
unnemeli@tersnetz.best----aiSPQOTs----zZD217o----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
cusini@tersnetz.best----4mgIb50KjW----gN7uk7UIo----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
steepkeebeantwoodcwor@tersnetz.best----iRvBhSqFOX----Y4WaziiS----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
geni@tersnetz.best----QU2qnLJSR----keNcmCED----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
pyperdoli@tersnetz.best----FVmjYPCASH----UcYetSh7----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
inolov@tersnetz.best----OBaiZE4NI----btTCxwzs----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
ofen@tersnetz.best----WD68NJKoBx----Jj9HnAA6QG----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
issimixmos@tersnetz.best----kdgtIWAx----O9MMnrNBN----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
herthotire@tersnetz.best----mPYbLI6pdy----uZPGtLUJ----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
tomono@tersnetz.best----ZJQUPCNt----178G8Xw2u----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
liepussomu@tersnetz.best----0DXB8yuR----wnaDucodqj----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
togara@tersnetz.best----mbLEGV8t----4jBGDXmncu----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
tasecnylpputhe@tersnetz.best----20sGYJWQl----L5s5GmFfL----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
goriserrisp@tersnetz.best----qaCnKUJf----mrh93NKAei----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
nungiotrus@tersnetz.best----mUyxdNoBz4----1FaqrgKyQO----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
nuki@tersnetz.best----aOcQN84jn----eE5yr5UB----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
slinjan@tersnetz.best----VXWxrEni----9YC4kFK----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
deibelsil@tersnetz.best----g8D21sQn----d6rdBoe----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
dendhia@nethi.shop----b9XokT1rp----CE14smI----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
haemyomusi@nethi.shop----3ovJzfBtO----DvGs9Im----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
okumo@nethi.shop----3dDcMbxQa----IGlcvBj----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
bozuriko@nethi.shop----W79q53NI----gTEzuh5S----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
keefithigh@nethi.shop----KeuSACYm----D1mthaj1MX----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
swelmenstrig@nethi.shop----Fpbkiorafg----JRHoMOBG----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
porode@nethi.shop----HodnTKIfi9----BSOVaMVzcY----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
watchsandllamur@nethi.shop----SLnsXCju----14rTgXRRnA----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
rakibaya@nethi.shop----JTye84ZlD----E6MsMvg----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
prositin@nethi.shop----Bw9OokM6----DXxj2ReT----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
gin@nethi.shop----fpGw5hlHtE----c4ZKdZD----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
saythalys@nethi.shop----RUmZ6D0I----4XTfor5T----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
mifu@nethi.shop----xHJ6n8y05----FMoYYFx----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
flicum@nethi.shop----ovGkuLURZq----27r7t5m----sk-uhviZQFakEdAhaOd43PQT3BlbkFJQmQz3VUgDvOvbKonrskS
paevul@nethi.shop----Viuk2EKUcp----V2yiKrM----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
connieten@nethi.shop----421zLxAsOu----FWkYXQW----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
perrezabor@nethi.shop----HhoX0kiEU----4Uw6kC5----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
trucdes@nethi.shop----A7n9v0Nt----wbS9wKlvE----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

43
chatgpt_post.py

@@ -0,0 +1,43 @@
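# Smoke-test every key in api_key.txt: send one outline-generation request
# per key and print each raw response.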
import requests
import json

with open("api_key.txt", "r") as f:
    a = f.read()
a = a.split("\n")

# The key is the last "----"-separated field on each line of api_key.txt.
api_key_list = []
for i in a:
    api_key_list.append(str(i.split("----")[-1]))

for i in api_key_list:
    OPENAI_API_KEY = i
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }
    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": "请帮我根据题目为“初中英语写作教学的现状分析及应对策略”生成一个论文目录其中只含有一级标题和二级标题"},
        ],
        "temperature": 0.7
    }
    response = requests.post(url,
                             headers=headers,
                             data=json.dumps(data),
                             timeout=240)
    res = response.json()
    print(res)
    print(res["choices"][0]["message"]["content"])

28
chatgpt_request.py

@@ -0,0 +1,28 @@
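# Minimal single-request example: a short multi-turn rewrite conversation
# sent to the Chat Completions API with one hard-coded key.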
import requests
import json

OPENAI_API_KEY = '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'
url = "https://api.openai.com/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {OPENAI_API_KEY}"
}
data = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "user", "content": "请帮我改写这句话:在城市发展进程当中,逐渐呈现出一些综合性的大型建筑群。"},
        {"role": "assistant", "content": "随着城市的发展,综合性大型建筑群正在逐渐出现。"},
        {"role": "user", "content": "这句话我不满意,再改一下帮我"}
    ],
    "temperature": 0.7
}
response = requests.post(url,
                         headers=headers,
                         data=json.dumps(data),
                         timeout=1000)
res = response.json()
print(res)
print(res["choices"][0]["message"]["content"])

41
chinese_keyword转化为提示.py

@@ -0,0 +1,41 @@
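# Extract the Chinese keywords from the abstract-keyword results and wrap
# each set in a "translate these keywords into English" prompt, written out
# shuffled, one JSON string per line.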
import json
import re
import math
import numpy as np
from tqdm import tqdm
import random

prompt = "请把“{}”这几个关键字翻译成英文"
pantten_title = "(.*?)》为题目生成论文摘要,要求生成的字数在"

path = "./data/paper_prompt_title_3_1/zhaiyao_chinese_keyword_prompt_data.txt"
with open(path, encoding="utf-8") as f:
    text = f.read()

# Each record starts with the marker below; the fragment before the first
# marker is header noise, hence text_list[1:].
text_list = text.split("\n\"请为“")
data_list = []
chinese_keyword_data_list = []
for text_dan in tqdm(text_list[1:]):
    try:
        _, chinese_keyword = text_dan.split("**************")
    except ValueError:
        continue
    chinese_keyword = str(chinese_keyword).strip("\n")
    data_list.append(prompt.format(chinese_keyword))

random.shuffle(data_list)
with open("./data/chinese_keyword_to_/chinese_keyword_en_prompt.txt", mode="w", encoding="utf-8") as f:
    for i in data_list:
        f.write(json.dumps(i, ensure_ascii=False))
        f.write("\n")

201
data_do/处理chatgpt生成的数据到标注数据.py

@@ -0,0 +1,201 @@
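# Parse the title/outline pairs generated by ChatGPT, classify outline lines
# into first- and second-level headings, spread a 12000-character budget over
# the chapters with a normal curve, and emit section-completion prompts as
# training data.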
import json
import re
import math
import numpy as np
from tqdm import tqdm
import random

# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'
pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
first_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的大标题“{}”的内容补充完整,补充内容字数在{}字左右"
small_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的小标题“{}”的内容补充完整,补充内容字数在{}字左右"
thanks = "致谢"          # acknowledgements
references = "参考文献"  # references
excursus = "附录"        # appendix

u = 3.5  # mean μ of the normal curve used to spread word counts
sig = math.sqrt(6.0)
zong_gradient = 6
paper_word_count = 12000

path = "../data/paper_prompt_title_3/title_mulu_prompt_data.txt"
with open(path, encoding="utf-8") as f:
    text = f.read()

def normal_distribution(x):
    y = np.exp(-(x - u) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig)
    return y

text_list = text.split("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
ner_lable = []
text_zong = []
train_list = []
for text_dan in tqdm(text_list):
    try:
        title, mulu = text_dan.split("**********************************************")
    except ValueError:
        continue
    title = str(title).strip("\n")
    mulu = str(mulu).strip("\n")
    paper_text = "题目:{}@目录:".format(title)
    table_of_contents = []
    nerlable_list = []
    mulu_list = str(mulu).split("\n")
    mulu_list = [i.strip() for i in mulu_list if i != ""]
    mulu_str = "@".join(mulu_list)
    # Classify outline lines: numbered lines are first-level headings
    # ("一级标题"), everything else second-level ("二级标题").
    mulu_list_bool = []
    for line in mulu_list:
        result_biaoti_list = re.findall(pantten_biaoti, line)
        if result_biaoti_list != []:
            mulu_list_bool.append((line, "一级标题"))
        else:
            mulu_list_bool.append((line, "二级标题"))
    # Discard outlines that do not open with a first-level heading, or whose
    # first three lines are all first-level headings.
    mulu_list_bool_part = mulu_list_bool[:3]
    if mulu_list_bool_part[0][1] != "一级标题":
        continue
    if mulu_list_bool_part[0][1] == mulu_list_bool_part[1][1] == mulu_list_bool_part[2][1] == "一级标题":
        continue
    # Strip references / acknowledgements / appendix entries near the end.
    thanks_references_bool_table = mulu_list_bool[-5:]
    for i in thanks_references_bool_table:
        try:
            if references in i[0]:
                mulu_list_bool.remove(i)
            if thanks in i[0]:
                mulu_list_bool.remove(i)
            if excursus in i[0]:
                mulu_list_bool.remove(i)
        except ValueError:
            print(thanks_references_bool_table)
            continue
    # Group second-level headings under their chapter; "@@" marks chapters.
    for i in mulu_list_bool:
        if i[1] == "一级标题":
            paper_dan = {
                "title": "@@" + i[0],
                "small_title": [],
                "word_count": 0
            }
            table_of_contents.append(paper_dan)
        else:
            table_of_contents[-1]["small_title"].append(i[0])
    # Sample the normal curve once per chapter, then scale so the chapter
    # budgets sum to paper_word_count.
    x_list = [0]
    y_list = [normal_distribution(0)]
    gradient = zong_gradient / len(table_of_contents)
    for i in range(len(table_of_contents) - 1):
        x_gradient = x_list[-1] + gradient
        x_list.append(x_gradient)
        y_list.append(normal_distribution(x_list[-1]))
    dan_gradient = paper_word_count / sum(y_list)
    for i in range(len(y_list)):
        table_of_contents[i]["word_count"] = dan_gradient * y_list[i]
    # Flatten to (heading, word_count), splitting each chapter's budget evenly
    # across its subsections.
    table_of_contents_new = []
    for dabiaoti_index in range(len(table_of_contents)):
        dabiaoti_dict = table_of_contents[dabiaoti_index]
        table_of_contents_new.append([dabiaoti_dict["title"], 0])
        for xiaobiaoti in dabiaoti_dict["small_title"]:
            table_of_contents_new.append([xiaobiaoti, int(dabiaoti_dict["word_count"] / len(dabiaoti_dict["small_title"]))])
    small_task_list = []
    content_index = 0
    while True:
        if content_index == len(table_of_contents_new):
            break
        subtitle, word_count = table_of_contents_new[content_index]
        prompt = small_title_prompt
        # The opening and closing chapter headings (marked "@@") get the
        # first-level prompt with a fixed 800-character budget.
        if content_index == 0 and table_of_contents_new[1][0][:2] == "@@" and subtitle[:2] == "@@":
            subtitle, prompt, word_count = subtitle[2:], first_title_prompt, 800
        if content_index == len(table_of_contents_new) - 1 and subtitle[:2] == "@@":
            subtitle, prompt, word_count = subtitle[2:], first_title_prompt, 800
        paper_content = [
            content_index,
            title,
            mulu,
            subtitle,
            prompt,
            word_count
        ]
        small_task_list.append(paper_content)
        content_index += 1
    # Keep only subsection tasks with a sane word budget and prompt length.
    for i in small_task_list:
        if i[3][:2] == "@@":
            continue
        elif i[5] > 1280:
            continue
        else:
            paper_prompt = i[4].format(i[1], i[2], i[3], i[5])
            if len(paper_prompt) < 768:
                train_list.append(paper_prompt)

random.shuffle(train_list)
train_list_shuffle = train_list[:100000]
with open("../data/title_to_/prompt.txt", mode="w", encoding="utf-8") as f:
    for i in train_list:
        f.write(json.dumps(i, ensure_ascii=False))
        f.write("\n")
with open("../data/title_to_/prompt_shuffle.txt", mode="w", encoding="utf-8") as f:
    for i in train_list_shuffle:
        f.write(json.dumps(i, ensure_ascii=False))
        f.write("\n")

# Earlier NER-labelling variant, kept commented out for reference:
# for lable in table_of_contents:
# text_len = len(paper_text)
# dan_nerlable = [text_len, text_len + len(lable[0]), lable[1]]
# nerlable_list.append(dan_nerlable)
# paper_text += lable[0]
# paper_text += "@"
#
# paper_dan = {"text": paper_text, "label": nerlable_list}
#
# ner_lable.append(str(table_of_contents))
# text_zong.append(paper_dan)
#
# with open("../data/train.txt", mode="w", encoding="utf-8") as f:
# for i in text_zong:
# f.write(json.dumps(i, ensure_ascii=False))
# f.write("\n")
#
#
# with open("../data/train_lable.txt", mode="w") as f:
# for i in ner_lable:
# f.write(json.dumps(i, ensure_ascii=False))
# f.write("\n")

133
data_do/生成目录训练数据.py

@@ -0,0 +1,133 @@
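# Build outline-generation training pairs: for each title/outline sample,
# validate the heading structure, strip references/acknowledgements/appendix
# entries, and emit {"content": prompt, "summary": outline} records.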
import json
import re
import math
import numpy as np
from tqdm import tqdm
import random

# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'
pantten_biaoti = r'[一二三四五六七八九][、]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
# Third-level "x.y.z" numbering. Note two fixes: the dots are escaped, and the
# pattern is actually used below; the original re-tested pantten_biaoti in the
# third-level branch, so this check never fired.
pantten_biaoti_1 = r'[1-9]\.[1-9]\.[1-9](.*)'
first_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的大标题“{}”的内容补充完整,补充内容字数在{}字左右"
small_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的小标题“{}”的内容补充完整,补充内容字数在{}字左右"
mulu_prompt = "请帮我根据题目为“{}”生成一个论文目录其中只含有一级标题和二级标题"
thanks = "致谢"
references = "参考文献"
excursus = "附录"

u = 3.5  # mean μ of the normal curve
sig = math.sqrt(6.0)
zong_gradient = 6
paper_word_count = 12000

path = "../data/title.txt"
with open(path, encoding="utf-8") as f:
    text = f.read()

def normal_distribution(x):
    y = np.exp(-(x - u) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig)
    return y

text_list = text.split("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
ner_lable = []
text_zong = []
train_list = []
for text_dan in tqdm(text_list):
    tiaoguo = False  # skip flag
    try:
        title, mulu = text_dan.split("**********************************************")
    except ValueError:
        continue
    title = str(title).strip("\n")
    mulu = str(mulu).strip("\n")
    paper_text = "题目:{}@目录:".format(title)
    nerlable_list = []
    mulu_list = str(mulu).split("\n")
    mulu_list = [i.strip() for i in mulu_list if i != ""]
    mulu_list_bool = []
    for line in mulu_list:
        result_biaoti_list = re.findall(pantten_biaoti, line)
        if result_biaoti_list != []:
            mulu_list_bool.append((line, "一级标题"))
        else:
            # Drop the whole sample if the outline contains third-level headings.
            result_biaoti_sanji_list = re.findall(pantten_biaoti_1, line)
            if result_biaoti_sanji_list != []:
                tiaoguo = True
                break
            else:
                mulu_list_bool.append((line, "二级标题"))
    if tiaoguo == True:
        continue
    # Shape checks: the outline must open with a first-level heading, must not
    # open with two in a row, and the third line must not be one either.
    mulu_list_bool_part = mulu_list_bool[:3]
    if mulu_list_bool_part[0][1] != "一级标题":
        continue
    if mulu_list_bool_part[0][1] == mulu_list_bool_part[1][1] == "一级标题":
        continue
    if mulu_list_bool_part[-1][1] == "一级标题":
        continue
    thanks_references_bool_table = mulu_list_bool[-5:]
    for i in thanks_references_bool_table:
        try:
            if references in i[0]:
                mulu_list_bool.remove(i)
            if thanks in i[0]:
                mulu_list_bool.remove(i)
            if excursus in i[0]:
                mulu_list_bool.remove(i)
        except ValueError:
            print(thanks_references_bool_table)
            continue
    table_of_contents = []
    for i in mulu_list_bool:
        if i[1] == "一级标题":
            paper_dan = {
                "title": "@@" + i[0],
                "small_title": [i[0]],
                "word_count": 0
            }
            table_of_contents.append(paper_dan)
        else:
            table_of_contents[-1]["small_title"].append(i[0])
    # Rebuild the outline text with a blank line between chapters.
    table_of_contents_new = []
    for i in table_of_contents:
        a = "\n".join(i["small_title"])
        table_of_contents_new.append(a)
    b = "\n\n".join(table_of_contents_new)
    title_p = mulu_prompt.format(title)
    train_list.append({"content": str(title_p), "summary": str(b)})

print(train_list)
random.shuffle(train_list)
train_list_shuffle = train_list[:500]
with open("../data/mulu_prompt.txt", mode="w", encoding="utf-8") as f:
    for i in train_list:
        f.write(json.dumps(i, ensure_ascii=False))
        f.write("\n")
with open("../data/mulu_prompt_shuffle.json", mode="w", encoding="utf-8") as f:
    for i in train_list_shuffle:
        f.write(json.dumps(i, ensure_ascii=False))
        f.write("\n")

119
generate_en_keyword.py

@@ -0,0 +1,119 @@
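# Producer/consumer driver for the keyword-translation prompts: the task list
# and the API-key pool both live in Redis lists; each worker thread pops a key
# and a prompt, calls the Chat Completions API, appends the result to a data
# file, and returns the key to the pool.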
import time
import os
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis

lock = threading.RLock()

pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*')
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

with open("api_key.txt", "r") as f:
    a = f.read()
a = a.split("\n")

redis_key_name_openaikey_bad_list = "openaikey_bad_list"
redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"  # task queue

# Seed the key pool: the key is the last "----"-separated field per line.
api_key_list = []
for i in a:
    api_key_list.append(str(i.split("----")[-1]))
for i in api_key_list:
    redis_.rpush(redis_key_name_openaikey_list, i)

# Seed the task queue with shuffled (prompt, prompt_type) pairs.
file = r'./data/chinese_keyword_to_/chinese_keyword_en_prompt.txt'
zirenwu_list = []
with open(file, encoding="utf-8") as f:
    type_prompt = file.split("/")[-1].split(".")[0]
    texts = f.readlines()
    for i in texts:
        zirenwu_list.append((i, type_prompt))
random.shuffle(zirenwu_list)
for i in zirenwu_list:
    redis_.rpush(redis_zirenwu, str(i))

def request_api_chatgpt(api_key, prompt, type_prompt):
    global api_key_list
    global zirenwu_list
    try:
        OPENAI_API_KEY = api_key
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}"
        }
        data = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.5
        }
        response = requests.post(url,
                                 headers=headers,
                                 data=json.dumps(data),
                                 timeout=240)
        res = response.json()
        text = res["choices"][0]["message"]["content"]
        lock.acquire()
        # Return the key to the pool and append the result.
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        with open("/home/majiahui/mulu_ner/data/paper_prompt_title_3_1_1/{}_data.txt".format(type_prompt), mode="a") as f:
            f.write(prompt)
            f.write("**************")
            f.write(text)
            f.write("\n")
        lock.release()
        time.sleep(5)
    except:
        # On failure, wait, then requeue both the key and the task.
        time.sleep(20)
        lock.acquire()
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        # Requeue in (prompt, type_prompt) order to match the consumer below;
        # the original pushed the tuple reversed, so retries swapped fields.
        redis_.rpush(redis_zirenwu, str((prompt, type_prompt)))
        lock.release()

if __name__ == '__main__':
    while True:
        if redis_.llen(redis_zirenwu) == 0:
            time.sleep(1)
            continue
        elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0:
            lock.acquire()
            api_key = redis_.lpop(redis_key_name_openaikey_list)
            api_key = api_key.decode('UTF-8')
            dan_zirenwu = redis_.lpop(redis_zirenwu)
            dan_zirenwu = dan_zirenwu.decode('UTF-8')
            lock.release()
            dan_zirenwu = eval(dan_zirenwu)
            prompt, type_prompt = dan_zirenwu[0], dan_zirenwu[1]
            t = Thread(target=request_api_chatgpt, args=(api_key, prompt, type_prompt))
            t.start()
        elif redis_.llen(redis_key_name_openaikey_list) == 0:
            time.sleep(1)
            continue
        else:
            time.sleep(1)
            continue

188
generate_mulu.py

@@ -0,0 +1,188 @@
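# For every paper title, queue six prompt types (outline, background, short
# summary, literature review, background-and-significance, research content)
# and fan the requests out over the Redis-backed key pool.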
import time
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis

lock = threading.RLock()

pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*')
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

with open("api_key.txt", "r") as f:
    a = f.read()
a = a.split("\n")

redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"  # task queue

api_key_list = []
for i in a:
    api_key_list.append(str(i.split("----")[-1]))
for i in api_key_list:
    redis_.rpush(redis_key_name_openaikey_list, i)

# One prompt template per content type, keyed by task type.
prompt_dict = {
    "mulu_prompt": "为论文题目“{}”生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题",
    "beijing_prompt": "以“{}”为论文题目,写一段题目来源的背景,要求字数在200字以内",
    "zongjie_prompt": "以“{}”为论文题目,写一个论文简短总结,要求在300字以内",
    "zongshu_prompt": "请写出以《{}》为课题的国内外研究状况综述,字数在800字左右",
    "yanjiubeijingyiyi_prompt": "请分别写出以《{}》为课题的研究背景和意义,字数不少于1000字",
    "jianjie_prompt": "请帮我生成《{}》为题目的研究内容,包括整体简介和分最少三个方面总结"
}

with open("./data/题目3.txt", encoding="utf-8") as f:
    text = f.read()
text_list = text.split("\n")
title_list = []
for i in text_list:
    title_list.append(i.split("@@@@@")[0])
random.shuffle(title_list)
print(len(title_list))

# Queue one task per (title, prompt type) pair.
zirenwu_list = []
for title in title_list:
    for prompt in prompt_dict:
        zirenwu_list.append((prompt, str(prompt_dict[prompt]).format(title)))
for i in zirenwu_list:
    redis_.rpush(redis_zirenwu, str(i))
# def request_api_chatgpt(api_key, prompt):
# OPENAI_API_KEY = api_key
# url = "https://api.openai.com/v1/chat/completions"
# headers = {
# "Content-Type": "application/json",
# "Authorization": f"Bearer {OPENAI_API_KEY}"
# }
# data = {
# "model": "gpt-3.5-turbo",
# "messages": [
# {"role": "user", "content": prompt},
# ],
# "temperature": 0.5
# }
# response = requests.post(url,
# headers=headers,
# data=json.dumps(data),
# timeout=240)
# print("response", response)
#
# return response
#
# def task(api_key, title):
# try:
# for pormpt_dan in prompt_dict:
# name = pormpt_dan.split("_")[0]
# print("pormpt_dan", pormpt_dan)
# print("prompt_dict", prompt_dict)
# prompt = str(prompt_dict[pormpt_dan]).format(title)
# print("api_key", api_key)
# print("prompt", prompt)
# response = request_api_chatgpt(api_key, prompt)
# res = response.json()
# text = res["choices"][0]["message"]["content"]
# lock.acquire()
#
# with open("/home/majiahui/mulu_ner/data/paper_prompt_title_new/title_{}_data.txt".format(name), mode="a") as f:
# f.write(title)
# f.write("\n**********************************************\n")
# f.write(text)
# f.write("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
# lock.release()
# time.sleep(2)
# lock.acquire()
# api_key_list.append(api_key)
# lock.release()
# print(1)
# except:
# print()
# time.sleep(5)
# lock.acquire()
# api_key_list.append(api_key)
# lock.release()
# print(2)
def request_api_chatgpt(api_key, task_type, prompt):
    global api_key_list
    global zirenwu_list
    try:
        OPENAI_API_KEY = api_key
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}"
        }
        data = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.5
        }
        response = requests.post(url,
                                 headers=headers,
                                 data=json.dumps(data),
                                 timeout=240)
        res = response.json()
        text = res["choices"][0]["message"]["content"]
        lock.acquire()
        # Return the key to the pool and append the result to the per-type file.
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        with open("/home/majiahui/mulu_ner/data/paper_prompt_title_3/title_{}_data.txt".format(task_type), mode="a") as f:
            f.write(prompt)
            f.write("**************")
            f.write(text)
            f.write("\n")
        lock.release()
    except:
        # On failure, wait, then requeue both the key and the task.
        print("task_type_bad", task_type)
        print("api_key_bad", api_key)
        time.sleep(5)
        lock.acquire()
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        redis_.rpush(redis_zirenwu, str((task_type, prompt)))
        lock.release()

if __name__ == '__main__':
    while True:
        if redis_.llen(redis_zirenwu) == 0:
            time.sleep(1)
            continue
        elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0:
            lock.acquire()
            api_key = redis_.lpop(redis_key_name_openaikey_list)
            api_key = api_key.decode('UTF-8')
            dan_zirenwu = redis_.lpop(redis_zirenwu)
            dan_zirenwu = dan_zirenwu.decode('UTF-8')
            lock.release()
            dan_zirenwu = eval(dan_zirenwu)
            task_type, prompt = dan_zirenwu[0], dan_zirenwu[1]
            t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt))
            t.start()
        elif redis_.llen(redis_key_name_openaikey_list) == 0:
            time.sleep(1)
            continue
        else:
            time.sleep(1)
            continue

140
generate_mulu_only.py

@@ -0,0 +1,140 @@
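# Same Redis-backed driver as generate_mulu.py, but it only generates
# outlines and rate-limits each key to one request per 20 seconds.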
import time
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis

lock = threading.RLock()

pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*')
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

with open("api_key.txt", "r") as f:
    a = f.read()
a = a.split("\n")

redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"  # task queue

api_key_list = []
for i in a:
    api_key_list.append(str(i.split("----")[-1]))
for i in api_key_list:
    redis_.rpush(redis_key_name_openaikey_list, i)

prompt_dict = {
    "mulu_prompt": "为论文题目《{}》生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题",
}

with open("./data/题目4_new.txt", encoding="utf-8") as f:
    text = f.read()
text_list = text.split("\n")
title_list = []
for i in text_list:
    title_list.append(i.split("@@@@@")[0])
random.shuffle(title_list)
print(len(title_list))

zirenwu_list = []
for title in title_list:
    for prompt in prompt_dict:
        zirenwu_list.append((prompt, str(prompt_dict[prompt]).format(title)))
for i in zirenwu_list:
    redis_.rpush(redis_zirenwu, str(i))

def request_api_chatgpt(api_key, task_type, prompt):
    t1 = time.time()
    global api_key_list
    global zirenwu_list
    try:
        OPENAI_API_KEY = api_key
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}"
        }
        data = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.5
        }
        response = requests.post(url,
                                 headers=headers,
                                 data=json.dumps(data),
                                 timeout=1200)
        res = response.json()
        text = res["choices"][0]["message"]["content"]
        t2 = time.time()
        t_n = t2 - t1
        lock.acquire()
        with open("/home/majiahui/mulu_ner/data/paper_prompt_title_4/title_{}_data.txt".format(task_type), mode="a") as f:
            f.write(prompt)
            f.write("**************")
            f.write(text)
            f.write("\n")
            f.write("=================================================================================================")
        lock.release()
        # Rate-limit: keep the key out of the pool until at least 20 s have
        # passed since it was borrowed.
        if t_n > 20:
            redis_.rpush(redis_key_name_openaikey_list, api_key)
        else:
            time.sleep(20 - t_n)
            redis_.rpush(redis_key_name_openaikey_list, api_key)
    except:
        print("task_type_bad", task_type)
        print("api_key_bad", api_key)
        time.sleep(20)
        lock.acquire()
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        redis_.rpush(redis_zirenwu, str((task_type, prompt)))
        lock.release()

if __name__ == '__main__':
    while True:
        if redis_.llen(redis_zirenwu) == 0:
            time.sleep(1)
            continue
        elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0:
            lock.acquire()
            api_key = redis_.lpop(redis_key_name_openaikey_list)
            api_key = api_key.decode('UTF-8')
            dan_zirenwu = redis_.lpop(redis_zirenwu)
            dan_zirenwu = dan_zirenwu.decode('UTF-8')
            lock.release()
            dan_zirenwu = eval(dan_zirenwu)
            task_type, prompt = dan_zirenwu[0], dan_zirenwu[1]
            t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt))
            t.start()
        elif redis_.llen(redis_key_name_openaikey_list) == 0:
            time.sleep(1)
            continue
        else:
            time.sleep(1)
            continue

93
generate_small_title.py

@@ -0,0 +1,93 @@
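# Redis-free variant: a fixed in-memory key list is shared between worker
# threads to expand the shuffled section prompts from prompt_shuffle.txt.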
import time
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread

# Fixed key pool shared between worker threads.
api_key_list = [
    "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",
    "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",
    "sk-nYbapOeC5VmSReJB1JgEr3BlbkFJnOo2J9qFJRKXrOSNiYFO",
    "sk-tOy3uBFkPsg9uVWTpDOor3BlbkFJkbXgo0sHAubK8VWyaeso",
    "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",
    "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",
    "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",
    "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",
    "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",
    "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"
]

lock = threading.RLock()

def request_api_chatgpt(api_key, prompt):
    try:
        OPENAI_API_KEY = api_key
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}"
        }
        data = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.5
        }
        response = requests.post(url,
                                 headers=headers,
                                 data=json.dumps(data),
                                 timeout=240)
        res = response.json()
        text = res["choices"][0]["message"]["content"]
        lock.acquire()
        api_key_list.append(api_key)  # return the key to the pool
        with open("/home/majiahui/mulu_ner/data/prompt_small_gen.txt", mode="a") as f:
            f.write(prompt)
            f.write("**************")
            f.write(text)
            f.write("\n")
        lock.release()
    except:
        # On failure the key is returned but the prompt is not retried.
        time.sleep(5)
        lock.acquire()
        api_key_list.append(api_key)
        lock.release()

if __name__ == '__main__':
    with open("./data/prompt_shuffle.txt", encoding="utf-8") as f:
        text = f.read()
    text_list = text.split("\n")
    index = 0
    while True:
        if index == len(text_list):
            break
        if api_key_list == []:
            time.sleep(1)
            continue
        else:
            api_key = api_key_list.pop(0)
            prompt = text_list[index]
            t = Thread(target=request_api_chatgpt, args=(api_key, prompt))
            t.start()
            lock.acquire()
            index += 1
            print(index)
            lock.release()

125
generate_small_title_new.py

@@ -0,0 +1,125 @@
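# Expand the section ("small title") prompts from every file under
# ./data/small_title_prompt with the Redis-backed key pool; results are
# appended to one output file per prompt type.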
import time
import os
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis

lock = threading.RLock()

pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*')
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

with open("api_key.txt", "r") as f:
    a = f.read()
a = a.split("\n")

redis_key_name_openaikey_bad_list = "openaikey_bad_list"
redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"  # task queue

api_key_list = []
for i in a:
    api_key_list.append(str(i.split("----")[-1]))
for i in api_key_list:
    redis_.rpush(redis_key_name_openaikey_list, i)

# Queue every prompt line from every file in the directory, tagged with the
# file name (the prompt type), shuffled.
file = r'./data/small_title_prompt'
zirenwu_list = []
path_list = []
for root, dirs, files in os.walk(file):
    for file in files:
        path = os.path.join(root, file)
        path_list.append(path)
for path in path_list:
    with open(path, encoding="utf-8") as f:
        type_prompt = path.split("/")[-1].split(".")[0]
        texts = f.readlines()
        for i in texts:
            zirenwu_list.append((i, type_prompt))
random.shuffle(zirenwu_list)
for i in zirenwu_list:
    redis_.rpush(redis_zirenwu, str(i))

def request_api_chatgpt(api_key, prompt, type_prompt):
    global api_key_list
    global zirenwu_list
    try:
        OPENAI_API_KEY = api_key
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}"
        }
        data = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.5
        }
        response = requests.post(url,
                                 headers=headers,
                                 data=json.dumps(data),
                                 timeout=240)
        res = response.json()
        text = res["choices"][0]["message"]["content"]
        lock.acquire()
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        with open("/home/majiahui/mulu_ner/data/paper_prompt_title_3_1/{}_data.txt".format(type_prompt), mode="a") as f:
            f.write(prompt)
            f.write("**************")
            f.write(text)
            f.write("\n")
        lock.release()
        time.sleep(5)
    except:
        time.sleep(20)
        lock.acquire()
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        # Requeue in (prompt, type_prompt) order to match the consumer below;
        # the original pushed the tuple reversed, so retries swapped fields.
        redis_.rpush(redis_zirenwu, str((prompt, type_prompt)))
        lock.release()

if __name__ == '__main__':
    while True:
        if redis_.llen(redis_zirenwu) == 0:
            time.sleep(1)
            continue
        elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0:
            lock.acquire()
            api_key = redis_.lpop(redis_key_name_openaikey_list)
            api_key = api_key.decode('UTF-8')
            dan_zirenwu = redis_.lpop(redis_zirenwu)
            dan_zirenwu = dan_zirenwu.decode('UTF-8')
            lock.release()
            dan_zirenwu = eval(dan_zirenwu)
            prompt, type_prompt = dan_zirenwu[0], dan_zirenwu[1]
            t = Thread(target=request_api_chatgpt, args=(api_key, prompt, type_prompt))
            t.start()
        elif redis_.llen(redis_key_name_openaikey_list) == 0:
            time.sleep(1)
            continue
        else:
            time.sleep(1)
            continue

126
generate_small_title_zengliang.py

@@ -0,0 +1,126 @@
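# Incremental ("zengliang") run over the 10000-40000 slice of section
# prompts; each result is written to its own uuid-named file.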
import time
import os
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis
import uuid

lock = threading.RLock()

pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*')
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

with open("api_key.txt", "r") as f:
    a = f.read()
a = a.split("\n")

redis_key_name_openaikey_bad_list = "openaikey_bad_list"
redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"  # task queue

api_key_list = []
for i in a:
    api_key_list.append(str(i.split("----")[-1]))
for i in api_key_list:
    redis_.rpush(redis_key_name_openaikey_list, i)

file = 'data/title_mulu_to_/small_title_prompt_2_10000_40000.txt'
zirenwu_list = []
with open(file, encoding="utf-8") as f:
    type_prompt = file.split("/")[-1].split(".")[0]
    texts = f.read()
    texts_list = texts.split("\n")
    for i in texts_list:
        zirenwu_list.append((i, type_prompt))
random.shuffle(zirenwu_list)
for i in zirenwu_list:
    redis_.rpush(redis_zirenwu, str(i))

def request_api_chatgpt(api_key, prompt, type_prompt):
    id_ = uuid.uuid1()  # one output file per task
    try:
        OPENAI_API_KEY = api_key
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}"
        }
        data = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.5
        }
        response = requests.post(url,
                                 headers=headers,
                                 data=json.dumps(data),
                                 timeout=240)
        res = response.json()
        text = res["choices"][0]["message"]["content"]
        lock.acquire()
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        path = f"/home/majiahui/mulu_ner/data/paper_prompt_title_3_2_10000_40000/{type_prompt}/"
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + f"/{id_}_data.txt", mode="w") as f:
            f.write("@@@@@@@@@@@@@@@@@@")
            f.write(prompt)
            f.write("**************")
            f.write(text)
            f.write("\n")
        lock.release()
        time.sleep(5)
    except:
        time.sleep(20)
        lock.acquire()
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        # Requeue in (prompt, type_prompt) order to match the consumer below;
        # the original pushed the tuple reversed, so retries swapped fields.
        redis_.rpush(redis_zirenwu, str((prompt, type_prompt)))
        lock.release()

if __name__ == '__main__':
    while True:
        if redis_.llen(redis_zirenwu) == 0:
            time.sleep(1)
            continue
        elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0:
            lock.acquire()
            api_key = redis_.lpop(redis_key_name_openaikey_list)
            api_key = api_key.decode('UTF-8')
            dan_zirenwu = redis_.lpop(redis_zirenwu)
            dan_zirenwu = dan_zirenwu.decode('UTF-8')
            lock.release()
            dan_zirenwu = eval(dan_zirenwu)
            prompt, type_prompt = dan_zirenwu[0], dan_zirenwu[1]
            t = Thread(target=request_api_chatgpt, args=(api_key, prompt, type_prompt))
            t.start()
        elif redis_.llen(redis_key_name_openaikey_list) == 0:
            time.sleep(1)
            continue
        else:
            time.sleep(1)
            continue

125
generate_small_title_zhaiyao.py

@@ -0,0 +1,125 @@
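# Same driver applied to the abstract ("zhaiyao") prompts; each result is
# written to its own uuid-named file.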
import time
import os
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis
import uuid

lock = threading.RLock()

pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*')
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

with open("api_key.txt", "r") as f:
    a = f.read()
a = a.split("\n")

redis_key_name_openaikey_bad_list = "openaikey_bad_list"
redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"  # task queue

api_key_list = []
for i in a:
    api_key_list.append(str(i.split("----")[-1]))
for i in api_key_list:
    redis_.rpush(redis_key_name_openaikey_list, i)

file = r'./data/title_mulu_to_/zhaiyao_prompt.txt'
zirenwu_list = []
with open(file, encoding="utf-8") as f:
    type_prompt = file.split("/")[-1].split(".")[0]
    texts = f.readlines()
    for i in texts:
        zirenwu_list.append((i, type_prompt))
random.shuffle(zirenwu_list)
for i in zirenwu_list:
    redis_.rpush(redis_zirenwu, str(i))

def request_api_chatgpt(api_key, prompt, type_prompt):
    id_ = uuid.uuid1()  # one output file per task
    try:
        OPENAI_API_KEY = api_key
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}"
        }
        data = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.5
        }
        response = requests.post(url,
                                 headers=headers,
                                 data=json.dumps(data),
                                 timeout=240)
        res = response.json()
        text = res["choices"][0]["message"]["content"]
        lock.acquire()
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        path = f"/home/majiahui/mulu_ner/data/paper_prompt_title_3_2/{type_prompt}/"
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + f"/{id_}_data.txt", mode="w") as f:
            f.write("@@@@@@@@@@@@@@@@@@")
            f.write(prompt)
            f.write("**************")
            f.write(text)
            f.write("\n")
        lock.release()
        time.sleep(5)
    except:
        time.sleep(20)
        lock.acquire()
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        # Requeue in (prompt, type_prompt) order to match the consumer below;
        # the original pushed the tuple reversed, so retries swapped fields.
        redis_.rpush(redis_zirenwu, str((prompt, type_prompt)))
        lock.release()

if __name__ == '__main__':
    while True:
        if redis_.llen(redis_zirenwu) == 0:
            time.sleep(1)
            continue
        elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0:
            lock.acquire()
            api_key = redis_.lpop(redis_key_name_openaikey_list)
            api_key = api_key.decode('UTF-8')
            dan_zirenwu = redis_.lpop(redis_zirenwu)
            dan_zirenwu = dan_zirenwu.decode('UTF-8')
            lock.release()
            dan_zirenwu = eval(dan_zirenwu)
            prompt, type_prompt = dan_zirenwu[0], dan_zirenwu[1]
            t = Thread(target=request_api_chatgpt, args=(api_key, prompt, type_prompt))
            t.start()
        elif redis_.llen(redis_key_name_openaikey_list) == 0:
            time.sleep(1)
            continue
        else:
            time.sleep(1)
            continue

125
generate_small_title_zishuzengzhang.py

@ -0,0 +1,125 @@
import time
import os
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis
import uuid
lock = threading.RLock()
pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*')
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
with open("api_key.txt", "r",) as f:
a = f.read()
a = a.split("\n")
redis_key_name_openaikey_bad_list = "openaikey_bad_list"
redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"
api_key_list = []
for i in a:
api_key_list.append(str(i.split("----")[-1]))
for i in api_key_list:
redis_.rpush(redis_key_name_openaikey_list, i)
file = r'./data/title_mulu_to_/small_title_prompt_shuffle_2.txt'
zirenwu_list = []
with open(file, encoding="utf-8") as f:
type_prompt = file.split("/")[-1].split(".")[0]
texts = f.readlines()
for i in texts:
zirenwu_list.append((i, type_prompt))
random.shuffle(zirenwu_list)
for i in zirenwu_list:
redis_.rpush(redis_zirenwu, str(i))
def request_api_chatgpt(api_key, prompt, type_prompt):
id_ = uuid.uuid1()
try:
OPENAI_API_KEY = api_key
url = "https://api.openai.com/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {OPENAI_API_KEY}"
}
data = {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": prompt},
],
"temperature": 0.5
}
response = requests.post(url,
headers=headers,
data=json.dumps(data),
timeout=240)
res = response.json()
text = res["choices"][0]["message"]["content"]
lock.acquire()
# api_key_list.append(api_key)
redis_.rpush(redis_key_name_openaikey_list, api_key)
path = f"/home/majiahui/mulu_ner/data/paper_prompt_title_3_2/{type_prompt}/"
        os.makedirs(path, exist_ok=True)
with open(path + f"/{id_}_data.txt", mode="w") as f:
f.write("@@@@@@@@@@@@@@@@@@")
f.write(prompt)
f.write("**************")
f.write(text)
f.write("\n")
lock.release()
time.sleep(5)
except:
time.sleep(20)
lock.acquire()
redis_.rpush(redis_key_name_openaikey_list, api_key)
        redis_.rpush(redis_zirenwu, str((prompt, type_prompt)))  # keep (prompt, type) order so the consumer unpacks it correctly
lock.release()
if __name__ == '__main__':
while True:
if redis_.llen(redis_zirenwu) == 0:
time.sleep(1)
continue
elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0:
lock.acquire()
api_key = redis_.lpop(redis_key_name_openaikey_list)
api_key = api_key.decode('UTF-8')
dan_zirenwu = redis_.lpop(redis_zirenwu)
dan_zirenwu = dan_zirenwu.decode('UTF-8')
lock.release()
# dan_zirenwu = zirenwu_list.pop(0)
dan_zirenwu = eval(dan_zirenwu)
prompt, type_prompt = dan_zirenwu[0], dan_zirenwu[1]
t = Thread(target=request_api_chatgpt, args=(api_key, prompt, type_prompt))
t.start()
elif redis_.llen(redis_key_name_openaikey_list) == 0:
time.sleep(1)
continue
else:
time.sleep(1)
continue

71
jianjie转化为tishi.py

@ -0,0 +1,71 @@
import json
import re
import random
from tqdm import tqdm
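# Turns (title, research-intro) pairs produced by ChatGPT into "task book"
# prompts: each record of title_jianjie_prompt_data.txt is parsed with the
# regex below, re-formatted through task_book_main_content_prompt, then
# shuffled and written one JSON string per line.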
task_book_main_content_prompt = "请根据题目为《{}》,和研究内容为“{}”总结出至少6点本篇论文应完成的主要内容,使用阿拉伯数字排列"
pantten_title = "(.*?)》为题目的研究内容,包括整体简介和分最少三个方面总结"
path = "./data/paper_prompt_title_3/title_jianjie_prompt_data.txt"
with open(path, encoding="utf-8") as f:
text = f.read()
text_list = text.split("请帮我生成《")
data_list = []
chinese_keyword_data_list = []
for text_dan in tqdm(text_list):
# print(text_dan)
try:
title_prompt, jianjie = text_dan.split("**************")
except:
continue
result_biaoti_list = re.findall(pantten_title, title_prompt)
try:
result_biaoti_list[0]
except:
print(title_prompt)
continue
title = str(result_biaoti_list[0]).strip("\n")
jianjie = str(jianjie).strip("\n")
data_list.append(task_book_main_content_prompt.format(title, jianjie))
random.shuffle(data_list)
with open("./data/jianjie_to_/task_book_prompt.txt", mode="w", encoding="utf-8") as f:
for i in data_list:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")

16
main.py

@ -0,0 +1,16 @@
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
def print_hi(name):
# Use a breakpoint in the code line below to debug your script.
print(f'Hi, {name}') # Press Ctrl+F8 to toggle the breakpoint.
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
print_hi('PyCharm')
# See PyCharm help at https://www.jetbrains.com/help/pycharm/

221
mulu转化为提示文本.py

@ -0,0 +1,221 @@
import json
import re
import math
import random
import numpy as np
from tqdm import tqdm
# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'
pantten_biaoti = '[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
first_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的大标题“{}”的内容补充完整,补充内容字数在{}字左右"
small_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的小标题“{}”的内容补充完整,补充内容字数在{}字左右"
references_prompt = "论文题目是“{}”,目录是“{}”,请为这篇论文生成15篇左右的参考文献,要求其中有中文参考文献不低于12篇,英文参考文献不低于2篇"
thanks = "致谢"
references = "参考文献"
excursus = "附录"
u = 3.5  # mean μ of the word-count distribution
sig = math.sqrt(6.0)
zong_gradient = 6
paper_word_count = 12000
pantten_title = "(.*?)”生成目录,要求只有一级标题和二级标题,"
path = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt"
with open(path, encoding="utf-8") as f:
text = f.read()
def normal_distribution(x):
y = np.exp(-(x - u) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig)
return y
text_list = text.split("为论文题目“")
ner_lable = []
text_zong = []
train_list = []
train_references_list = []
for text_dan in tqdm(text_list):
# print(text_dan)
try:
title_prompt, mulu = text_dan.split("**************")
except:
continue
result_biaoti_list = re.findall(pantten_title, title_prompt)
try:
result_biaoti_list[0]
except:
print(title_prompt)
continue
title = str(result_biaoti_list[0]).strip("\n")
mulu = str(mulu).strip("\n")
    # build the references-generation prompt for this paper
train_references_list.append(references_prompt.format(title, mulu))
paper_text = "题目:{}@目录:".format(title)
table_of_contents = []
nerlable_list = []
# mulu_base64 = base64.b64encode(mulu.encode('utf-8'))
# mulu_path = os.path.join(uuid_path, "mulu.txt")
# with open(mulu_path, 'wb', encoding='utf8') as f2:
# f2.write(mulu_base64)
mulu_list = str(mulu).split("\n")
mulu_list = [i.strip() for i in mulu_list if i != ""]
mulu_str = "@".join(mulu_list)
mulu_list_bool = []
for i in mulu_list:
result_biaoti_list = re.findall(pantten_biaoti, i)
if result_biaoti_list != []:
mulu_list_bool.append((i, "一级标题"))
else:
mulu_list_bool.append((i, "二级标题"))
mulu_list_bool_part = mulu_list_bool[:3]
if mulu_list_bool_part[0][1] != "一级标题":
continue
if mulu_list_bool_part[0][1] == mulu_list_bool_part[1][1] == mulu_list_bool_part[2][1] == "一级标题":
continue
thanks_references_bool_table = mulu_list_bool[-5:]
for i in thanks_references_bool_table:
try:
if references in i[0]:
mulu_list_bool.remove(i)
if thanks in i[0]:
mulu_list_bool.remove(i)
if excursus in i[0]:
mulu_list_bool.remove(i)
except:
print(thanks_references_bool_table)
continue
for i in mulu_list_bool:
if i[1] == "一级标题":
paper_dan = {
"title": "@@" + i[0],
"small_title": [],
"word_count": 0
}
table_of_contents.append(paper_dan)
else:
table_of_contents[-1]["small_title"].append(i[0])
x_list = [0]
y_list = [normal_distribution(0)]
gradient = zong_gradient/len(table_of_contents)
for i in range(len(table_of_contents)-1):
x_gradient = x_list[-1] + gradient
x_list.append(x_gradient)
y_list.append(normal_distribution(x_list[-1]))
dan_gradient = paper_word_count/sum(y_list)
for i in range(len(y_list)):
table_of_contents[i]["word_count"] = dan_gradient * y_list[i]
# print(table_of_contents)
#
# print(len(table_of_contents))
table_of_contents_new = []
for dabiaoti_index in range(len(table_of_contents)):
dabiaoti_dict = table_of_contents[dabiaoti_index]
table_of_contents_new.append([dabiaoti_dict["title"], 0])
for xiaobiaoti in dabiaoti_dict["small_title"]:
table_of_contents_new.append([xiaobiaoti, int(dabiaoti_dict["word_count"]/len(dabiaoti_dict["small_title"]))])
small_task_list = []
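    # each task is [index, title, toc, subtitle, prompt template, word budget];
    # the opening chapter heading (when the entry right after it is also a
    # chapter heading) and the final chapter heading are generated whole via
    # first_title_prompt with a fixed 800-character budget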
content_index = 0
while True:
if content_index == len(table_of_contents_new):
break
subtitle, word_count = table_of_contents_new[content_index]
prompt = small_title_prompt
if content_index == 0 and table_of_contents_new[1][0][:2] == "@@" and subtitle[:2] == "@@":
subtitle, prompt, word_count = subtitle[2:], first_title_prompt, 800
if content_index == len(table_of_contents_new) -1 and subtitle[:2] == "@@":
subtitle, prompt, word_count = subtitle[2:], first_title_prompt, 800
paper_content = [
content_index,
title,
mulu,
subtitle,
prompt,
word_count
]
small_task_list.append(paper_content)
content_index += 1
for i in small_task_list:
if i[3][:2] == "@@":
continue
elif i[5] > 1280:
continue
else:
paper_prompt = i[4].format(i[1], i[2], i[3], i[5])
if len(paper_prompt) < 768:
train_list.append(paper_prompt)
else:
continue
random.shuffle(train_list)
train_list_shuffle = train_list[:10000]
with open("./data/title_mulu_to_/references_prompt.txt", mode="w", encoding="utf-8") as f:
for i in train_references_list:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")
with open("./data/title_mulu_to_/small_title_prompt.txt", mode="w", encoding="utf-8") as f:
for i in train_list:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")
with open("./data/title_mulu_to_/small_title_prompt_shuffle.txt", mode="w", encoding="utf-8") as f:
for i in train_list_shuffle:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")

210
mulu转化为提示文本_只针对小标题切无字数控制.py

@ -0,0 +1,210 @@
import json
import re
import math
import random
import numpy as np
from tqdm import tqdm
# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'
pantten_biaoti = '[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
first_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的大标题“{}”的内容补充完整,补充内容字数在{}字左右"
small_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的小标题“{}”的内容补充完整,补充内容字数在{}字左右"
thanks = "致谢"
references = "参考文献"
excursus = "附录"
u = 3.5  # mean μ of the word-count distribution
sig = math.sqrt(6.0)
zong_gradient = 6
paper_word_count = 12000
pantten_title = "(.*?)”生成目录,要求只有一级标题和二级标题,"
path = "./data/paper_prompt_title_4/title_mulu_prompt_data.txt"
with open(path, encoding="utf-8") as f:
text = f.read()
def normal_distribution(x):
y = np.exp(-(x - u) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig)
return y
text_list = text.split("为论文题目“")
ner_lable = []
text_zong = []
train_list = []
train_references_list = []
for text_dan in tqdm(text_list):
# print(text_dan)
try:
title_prompt, mulu = text_dan.split("**************")
except:
continue
result_biaoti_list = re.findall(pantten_title, title_prompt)
try:
result_biaoti_list[0]
except:
print(title_prompt)
continue
title = str(result_biaoti_list[0]).strip("\n")
mulu = str(mulu).strip("\n")
table_of_contents = []
nerlable_list = []
# mulu_base64 = base64.b64encode(mulu.encode('utf-8'))
# mulu_path = os.path.join(uuid_path, "mulu.txt")
# with open(mulu_path, 'wb', encoding='utf8') as f2:
# f2.write(mulu_base64)
mulu_list = str(mulu).split("\n")
mulu_list = [i.strip() for i in mulu_list if i != ""]
mulu_str = "@".join(mulu_list)
mulu_list_bool = []
for i in mulu_list:
result_biaoti_list = re.findall(pantten_biaoti, i)
if result_biaoti_list != []:
mulu_list_bool.append((i, "一级标题"))
else:
mulu_list_bool.append((i, "二级标题"))
mulu_list_bool_part = mulu_list_bool[:3]
if mulu_list_bool_part[0][1] != "一级标题":
continue
if mulu_list_bool_part[0][1] == mulu_list_bool_part[1][1] == mulu_list_bool_part[2][1] == "一级标题":
continue
thanks_references_bool_table = mulu_list_bool[-5:]
for i in thanks_references_bool_table:
try:
if references in i[0]:
mulu_list_bool.remove(i)
if thanks in i[0]:
mulu_list_bool.remove(i)
if excursus in i[0]:
mulu_list_bool.remove(i)
except:
print(thanks_references_bool_table)
continue
for i in mulu_list_bool:
if i[1] == "一级标题":
paper_dan = {
"title": "@@" + i[0],
"small_title": [],
"word_count": 0
}
table_of_contents.append(paper_dan)
else:
table_of_contents[-1]["small_title"].append(i[0])
x_list = [0]
y_list = [normal_distribution(0)]
gradient = zong_gradient/len(table_of_contents)
for i in range(len(table_of_contents)-1):
x_gradient = x_list[-1] + gradient
x_list.append(x_gradient)
y_list.append(normal_distribution(x_list[-1]))
dan_gradient = paper_word_count/sum(y_list)
for i in range(len(y_list)):
table_of_contents[i]["word_count"] = dan_gradient * y_list[i]
# print(table_of_contents)
#
# print(len(table_of_contents))
table_of_contents_new = []
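    # variant of mulu转化为提示文本.py: the Gaussian budget computed above is
    # ignored here and every subsection gets a flat 1500-character target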
for dabiaoti_index in range(len(table_of_contents)):
dabiaoti_dict = table_of_contents[dabiaoti_index]
table_of_contents_new.append([dabiaoti_dict["title"], 0])
for xiaobiaoti in dabiaoti_dict["small_title"]:
# table_of_contents_new.append([xiaobiaoti, int(dabiaoti_dict["word_count"]/len(dabiaoti_dict["small_title"]))])
table_of_contents_new.append([xiaobiaoti, 1500])
small_task_list = []
content_index = 0
while True:
if content_index == len(table_of_contents_new):
break
subtitle, word_count = table_of_contents_new[content_index]
prompt = small_title_prompt
if content_index == 0 and table_of_contents_new[1][0][:2] == "@@" and subtitle[:2] == "@@":
subtitle, prompt, word_count = subtitle[2:], first_title_prompt, 800
if content_index == len(table_of_contents_new) -1 and subtitle[:2] == "@@":
subtitle, prompt, word_count = subtitle[2:], first_title_prompt, 800
paper_content = [
content_index,
title,
mulu,
subtitle,
prompt,
word_count
]
small_task_list.append(paper_content)
content_index += 1
for i in small_task_list:
if i[3][:2] == "@@":
continue
else:
paper_prompt = i[4].format(i[1], i[2], i[3], i[5])
if len(paper_prompt) < 768:
train_list.append(paper_prompt)
else:
continue
random.shuffle(train_list)
train_list_shuffle = train_list[:10000]
with open("./data/title_mulu_to_/small_title_prompt_4.txt", mode="w", encoding="utf-8") as f:
for i in train_list:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")
with open("./data/title_mulu_to_/small_title_prompt_shuffle_4.txt", mode="w", encoding="utf-8") as f:
for i in train_list_shuffle:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")

68
mulu转化为提示文本生成摘要.py

@ -0,0 +1,68 @@
import json
import re
import random
from tqdm import tqdm
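# Builds abstract-generation prompts: for every (title, TOC) record in
# title_mulu_prompt_data.txt, formats zhaiyao_prompt and writes the shuffled
# prompts one JSON string per line.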
zhaiyao_prompt = "论文题目是“{}”,目录是“{}”,生成论文摘要,要求生成的字数在600字左右"
pantten_title = "(.*?)”生成目录,要求只有一级标题和二级标题,"
path = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt"
with open(path, encoding="utf-8") as f:
text = f.read()
text_list = text.split("为论文题目“")
train_list = []
for text_dan in tqdm(text_list):
# print(text_dan)
try:
title_prompt, mulu = text_dan.split("**************")
except:
continue
result_biaoti_list = re.findall(pantten_title, title_prompt)
try:
result_biaoti_list[0]
except:
print(title_prompt)
continue
title = str(result_biaoti_list[0]).strip("\n")
mulu = str(mulu).strip("\n")
paper_prompt = zhaiyao_prompt.format(title, mulu)
train_list.append(paper_prompt)
random.shuffle(train_list)
with open("./data/title_mulu_to_/zhaiyao_prompt.txt", mode="w", encoding="utf-8") as f:
for i in train_list:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")

12
read_train_json.py

@ -0,0 +1,12 @@
import json
from tqdm import tqdm
dataset = []
data_path = "data/chatglm_dev_3_prompt.json"
with open(data_path, "r", encoding="utf-8") as fh:
for i, line in enumerate(fh):
# print(line)
sample = json.loads(line.strip())
dataset.append(
{"input": sample["prompt"] + sample["query"], "answer": sample["response"]})
print(dataset)

0
train.py

225
train_novel.py

@ -0,0 +1,225 @@
#! -*- coding: utf-8 -*-
# Chinese named-entity recognition with a CRF head
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# Measured F1: up to 96.48% on the validation set and 95.38% on the test set
# GPU memory allocation
import os
# os.environ["TF_KERAS"] = '1'
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, ViterbiDecoder, to_array
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense
from keras.models import Model
from tqdm import tqdm
import tensorflow as tf
# physical_devices = tf.config.list_physical_devices('GPU')
# for gpu_instance in physical_devices:
# tf.config.experimental.set_memory_growth(gpu_instance, True)
maxlen = 128
epochs = 10
batch_size = 16
bert_layers = 12
learning_rate = 2e-5  # the smaller bert_layers is, the larger the learning rate should be
crf_lr_multiplier = 1000  # enlarge the CRF layer's learning rate when necessary
categories = set()
# BERT config
config_path = r'./premodels/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
checkpoint_path = r'./premodels/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
dict_path = r'./premodels/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""加载数据
单条格式[text, (start, end, label), (start, end, label), ...]
意味着text[start:end + 1]是类型为label的实体
"""
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
try:
char, flag = c.split(' ')
except:
continue
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
categories.add(flag[2:])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# labelled data
train_data = load_data('./data/说话人_ner/train.txt')
valid_data = load_data('./data/说话人_ner/dev.txt')
test_data = load_data('./data/说话人_ner/test.txt')
categories = list(sorted(categories))
print(categories)
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for is_end, d in self.sample(random):
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
segment_ids = [0] * len(token_ids)
labels = np.zeros(len(token_ids))
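            # integer BIO encoding: 0 is O; for category index k the B-tag is
            # 2k + 1 and the I-tag is 2k + 2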
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories.index(label) * 2 + 1
labels[start + 1:end + 1] = categories.index(label) * 2 + 2
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append(labels)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = sequence_padding(batch_labels)
yield [batch_token_ids, batch_segment_ids], batch_labels
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
"""
The code below assumes a BERT-type model; if you are using ALBERT, change the first few lines to:
model = build_transformer_model(
config_path,
checkpoint_path,
model='albert',
)
output_layer = 'Transformer-FeedForward-Norm'
output = model.get_layer(output_layer).get_output_at(bert_layers - 1)
"""
model = build_transformer_model(
config_path,
checkpoint_path,
)
output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
output = model.get_layer(output_layer).output
output = Dense(len(categories) * 2 + 1)(output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output)
model = Model(model.input, output)
model.summary()
model.compile(
loss=CRF.sparse_loss,
optimizer=Adam(learning_rate),
metrics=[CRF.sparse_accuracy]
)
class NamedEntityRecognizer(ViterbiDecoder):
"""命名实体识别器
"""
def recognize(self, text):
tokens = tokenizer.tokenize(text, maxlen=512)
mapping = tokenizer.rematch(text, tokens)
token_ids = tokenizer.tokens_to_ids(tokens)
segment_ids = [0] * len(token_ids)
token_ids, segment_ids = to_array([token_ids], [segment_ids])
nodes = model.predict([token_ids, segment_ids])[0]
labels = self.decode(nodes)
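        # an odd label opens an entity (B-tag) of category (label - 1) // 2 and
        # the matching even label continues it; `mapping` converts token
        # indices back to character offsets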
entities, starting = [], False
for i, label in enumerate(labels):
if label > 0:
if label % 2 == 1:
starting = True
entities.append([[i], categories[(label - 1) // 2]])
elif starting:
entities[-1][0].append(i)
else:
starting = False
else:
starting = False
return [(mapping[w[0]][0], mapping[w[-1]][-1], l) for w, l in entities]
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])
def evaluate(data):
"""评测函数
"""
X, Y, Z = 1e-10, 1e-10, 1e-10
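    # X = correctly predicted entity spans, Y = predicted spans, Z = gold spans;
    # precision = X/Y, recall = X/Z, F1 = 2X/(Y+Z)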
for d in tqdm(data, ncols=100):
R = set(NER.recognize(d[0]))
T = set([tuple(i) for i in d[1:]])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
return f1, precision, recall
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0
def on_epoch_end(self, epoch, logs=None):
trans = K.eval(CRF.trans)
NER.trans = trans
print(NER.trans)
f1, precision, recall = evaluate(valid_data)
        # save the best weights so far
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
model.save_weights('./models_result_crf_shuohuaren/best_model.weights')
print(
'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
(f1, precision, recall, self.best_val_f1)
)
f1, precision, recall = evaluate(test_data)
print(
'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
(f1, precision, recall)
)
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=epochs,
callbacks=[evaluator]
)
else:
    model.load_weights('./models_result_crf_shuohuaren/best_model.weights')  # path matches where Evaluator saves the best weights
NER.trans = K.eval(CRF.trans)

1470
zhaiyao_chinese_keyword_prompt_data.txt

File diff suppressed because it is too large

86
zhaiyoa转化为提示.py

@ -0,0 +1,86 @@
import json
import re
import random
from tqdm import tqdm
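# From each generated abstract this script derives two new prompt sets: one
# asking for an English translation of the abstract and one asking for 3-5
# Chinese keywords; both are shuffled and written one JSON string per line
# under ./data/zhaiyao_to_/.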
prompt = "请把“{}”这段文字翻译成英文"
chinese_keyword_prompt = "请为“{}”这段论文摘要生成3-5个关键字,使用阿拉伯数字作为序号标注,例如“1.xxx \n2.xxx \n3.xxx \n4.xxx \n5.xxx \n"
pantten_title = "(.*?)》为题目生成论文摘要,要求生成的字数在"
path = "./data/paper_prompt_title_3/title_zhaiyao_prompt_data.txt"
with open(path, encoding="utf-8") as f:
text = f.read()
text_list = text.split("请以《")
data_list = []
chinese_keyword_data_list = []
for text_dan in tqdm(text_list):
# print(text_dan)
try:
title_prompt, zhaiyao = text_dan.split("**************")
except:
continue
result_biaoti_list = re.findall(pantten_title, title_prompt)
try:
result_biaoti_list[0]
except:
print(title_prompt)
continue
title = str(result_biaoti_list[0]).strip("\n")
zhaiyao = str(zhaiyao).strip("\n")
data_list.append(prompt.format(zhaiyao))
chinese_keyword_data_list.append(chinese_keyword_prompt.format(zhaiyao))
random.shuffle(data_list)
with open("./data/zhaiyao_to_/zhaiyao_fanyi_prompt.txt", mode="w", encoding="utf-8") as f:
for i in data_list:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")
random.shuffle(chinese_keyword_data_list)
with open("./data/zhaiyao_to_/zhaiyao_chinese_keyword_prompt.txt", mode="w", encoding="utf-8") as f:
for i in chinese_keyword_data_list:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")

158
以核心内容生成开题报告目录等内容.py

@ -0,0 +1,158 @@
import time
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis
import re
def is_contains_chinese(strs):
for _char in strs:
if '\u4e00' <= _char <= '\u9fa5':
return True
return False
lock = threading.RLock()
pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=11, password='Zhicheng123*')
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
with open("api_key.txt", "r",) as f:
a = f.read()
a = a.split("\n")
pantten_title = "《(.*)》"
redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"
api_key_list = []
for i in a:
api_key_list.append(str(i.split("----")[-1]))
for i in api_key_list:
redis_.rpush(redis_key_name_openaikey_list, i)
prompt_dict = {
"mulu_prompt": "以“{}”为论文的生成方向,为论文题目为“{}”生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题",
"beijing_prompt": "以“{}”为论文题目,以“{}”为论文的生成方向,写一段题目来源的背景,要求字数在200字以内",
"zongjie_prompt": "以“{}”为论文题目,以“{}”为论文的生成方向,写一个论文简短总结,要求在300字以内",
"zongshu_prompt": "以《{}》为课题,以“{}”为论文的生成方向,请写出这篇论文的国内外研究状况综述,字数在800字左右",
"yanjiubeijingyiyi_prompt": "请分别写出以《{}》为课题的研究背景和意义,字数不少于1000字",
"jianjie_prompt": "请帮我生成《{}》为题目的研究内容,包括整体简介和分最少三个方面总结"
}
with open("./data/paper_prompt_title_3/title_zhuyaoneirong_prompt_data.txt", encoding="utf-8") as f:
text = f.read()
text_list = text.split("\n")
title_list = []
for i in text_list:
if "**************" in i:
title_list.append(i.split("**************")[1])
random.shuffle(title_list)
print(len(title_list))
zirenwu_list = []
for text in title_list:
bool_ = is_contains_chinese(text)
if bool_ == False:
continue
if "》:" not in text:
continue
    text = text.strip("\"").strip("“").strip("”")
result_biaoti_list = re.findall(pantten_title, text)
try:
title = result_biaoti_list[0]
except:
continue
hexinnrirong = text.split("》:")[1]
for prompt in prompt_dict:
zirenwu_list.append((prompt, str(prompt_dict[prompt]).format(title, hexinnrirong)))
for i in zirenwu_list:
redis_.rpush(redis_zirenwu, str(i))
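# each title fans out into one Redis task per prompt type; the workers below
# pop (task_type, prompt) pairs and append results to one file per task type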
def request_api_chatgpt(api_key, task_type, prompt):
try:
OPENAI_API_KEY = api_key
url = "https://api.openai.com/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {OPENAI_API_KEY}"
}
data = {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": prompt},
],
"temperature": 0.5
}
response = requests.post(url,
headers=headers,
data=json.dumps(data),
timeout=240)
res = response.json()
text = res["choices"][0]["message"]["content"]
lock.acquire()
# api_key_list.append(api_key)
redis_.rpush(redis_key_name_openaikey_list, api_key)
with open("/home/majiahui/mulu_ner/data/paper_prompt_title_hexin_3/title_{}_data.txt".format(task_type), mode="a") as f:
f.write("@@@@@@@@@@@@@@@@@@@@@@@")
f.write(prompt)
f.write("**************")
f.write(text)
f.write("\n")
lock.release()
except:
print("task_type_bad", task_type)
print("api_key_bad", api_key)
time.sleep(5)
lock.acquire()
redis_.rpush(redis_key_name_openaikey_list, api_key)
redis_.rpush(redis_zirenwu, str((task_type, prompt)))
lock.release()
if __name__ == '__main__':
while True:
if redis_.llen(redis_zirenwu) == 0:
time.sleep(1)
continue
elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0:
lock.acquire()
api_key = redis_.lpop(redis_key_name_openaikey_list)
api_key = api_key.decode('UTF-8')
dan_zirenwu = redis_.lpop(redis_zirenwu)
dan_zirenwu = dan_zirenwu.decode('UTF-8')
lock.release()
# dan_zirenwu = zirenwu_list.pop(0)
dan_zirenwu = eval(dan_zirenwu)
task_type, prompt = dan_zirenwu[0], dan_zirenwu[1]
t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt))
t.start()
elif redis_.llen(redis_key_name_openaikey_list) == 0:
time.sleep(1)
continue
else:
time.sleep(1)
continue

155
以核心内容生成开题报告目录等内容_问题补充.py

@ -0,0 +1,155 @@
import time
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis
import re
def is_contains_chinese(strs):
for _char in strs:
if '\u4e00' <= _char <= '\u9fa5':
return True
return False
lock = threading.RLock()
pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=11, password='Zhicheng123*')
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
with open("api_key.txt", "r",) as f:
a = f.read()
a = a.split("\n")
pantten_title = "《(.*)》"
redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"
api_key_list = []
for i in a:
api_key_list.append(str(i.split("----")[-1]))
for i in api_key_list:
redis_.rpush(redis_key_name_openaikey_list, i)
prompt_dict = {
"mulu_prompt": "论文题目为“{}”,以“{}”为论文的生成方向,为论文生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题",
"yanjiubeijingyiyi_prompt": "请分别写出以《{}》为课题,以“{}”为论文的生成方向,生成论文的研究背景和意义,字数不少于1000字",
"jianjie_prompt": "请帮我生成《{}》为题目,以“{}”为论文的生成方向,生成论文研究内容,包括整体简介和分最少三个方面总结"
}
with open("./data/paper_prompt_title_3/title_zhuyaoneirong_prompt_data.txt", encoding="utf-8") as f:
text = f.read()
text_list = text.split("\n")
title_list = []
for i in text_list:
if "**************" in i:
title_list.append(i.split("**************")[1])
random.shuffle(title_list)
print(len(title_list))
zirenwu_list = []
for text in title_list:
bool_ = is_contains_chinese(text)
if bool_ == False:
continue
if "》:" not in text:
continue
    text = text.strip("\"").strip("“").strip("”")
result_biaoti_list = re.findall(pantten_title, text)
try:
title = result_biaoti_list[0]
except:
continue
hexinnrirong = text.split("》:")[1]
for prompt in prompt_dict:
zirenwu_list.append((prompt, str(prompt_dict[prompt]).format(title, hexinnrirong)))
for i in zirenwu_list:
redis_.rpush(redis_zirenwu, str(i))
def request_api_chatgpt(api_key, task_type, prompt):
try:
OPENAI_API_KEY = api_key
url = "https://api.openai.com/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {OPENAI_API_KEY}"
}
data = {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": prompt},
],
"temperature": 0.5
}
response = requests.post(url,
headers=headers,
data=json.dumps(data),
timeout=240)
res = response.json()
text = res["choices"][0]["message"]["content"]
lock.acquire()
# api_key_list.append(api_key)
redis_.rpush(redis_key_name_openaikey_list, api_key)
with open("/home/majiahui/mulu_ner/data/paper_prompt_title_hexin_3/title_{}_data.txt".format(task_type), mode="a") as f:
f.write("@@@@@@@@@@@@@@@@@@@@@@@")
f.write(prompt)
f.write("**************")
f.write(text)
f.write("\n")
lock.release()
except:
print("task_type_bad", task_type)
print("api_key_bad", api_key)
time.sleep(5)
lock.acquire()
redis_.rpush(redis_key_name_openaikey_list, api_key)
redis_.rpush(redis_zirenwu, str((task_type, prompt)))
lock.release()
if __name__ == '__main__':
while True:
if redis_.llen(redis_zirenwu) == 0:
time.sleep(1)
continue
elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0:
lock.acquire()
api_key = redis_.lpop(redis_key_name_openaikey_list)
api_key = api_key.decode('UTF-8')
dan_zirenwu = redis_.lpop(redis_zirenwu)
dan_zirenwu = dan_zirenwu.decode('UTF-8')
lock.release()
# dan_zirenwu = zirenwu_list.pop(0)
dan_zirenwu = eval(dan_zirenwu)
task_type, prompt = dan_zirenwu[0], dan_zirenwu[1]
t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt))
t.start()
elif redis_.llen(redis_key_name_openaikey_list) == 0:
time.sleep(1)
continue
else:
time.sleep(1)
continue

16
取小标题数据数据.py

@ -0,0 +1,16 @@
data = []
with open("data/title_mulu_to_/small_title_prompt_2.txt", encoding="utf-8") as f:
texts = f.read()
texts_list = texts.split("\n")
for i in texts_list:
    data_dan = i.strip("\"\n")
data.append(data_dan)
data_new = data[10000:40000]
with open("./data/title_mulu_to_/small_title_prompt_2_10000_40000.txt", mode="w", encoding="utf-8") as f:
for i in data_new:
f.write(i)
f.write("\n")

23
合并数据.py

@ -0,0 +1,23 @@
import json
import random
data = []
with open('data/small_title_train.json', encoding="utf-8") as fp:
lines = fp.readlines()
for i in lines:
data.append(json.loads(i))
with open('data/mulu_prompt_shuffle.json', encoding="utf-8") as fp:
lines = fp.readlines()
for i in lines:
data.append(json.loads(i))
random.shuffle(data)
with open("data/train_paper.json", mode="w", encoding="utf-8") as f:
for i in data:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")

17
提示文本总结

@ -0,0 +1,17 @@
first_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的大标题“{}”的内容补充完整,补充内容字数在{}字左右"
small_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的小标题“{}”的内容补充完整,补充内容字数在{}字左右"
references_prompt = "论文题目是“{}”,目录是“{}”,请为这篇论文生成15篇左右的参考文献,要求其中有中文参考文献不低于12篇,英文参考文献不低于2篇"
prompt = "请把“{}”这几个关键字翻译成英文"
prompt = "请把“{}”这段文字翻译成英文"
chinese_keyword_prompt = "请为“{}”这段论文摘要生成3-5个关键字,使用阿拉伯数字作为序号标注,例如“1.xxx \n2.xxx \n3.xxx \n4.xxx \n5.xxx \n”"
zhuyaoneirong_prompt" = “《{}》:研制一款基于单片机的多功能充电控制器,包括硬件和软件设计。最终成果是研制一台可对多种类型蓄电池充电的控制器实物,并以一个特定蓄电池充电为例,验证所设计控制器的可用性”,以上面话术为标准。根据论文题目为“我国护理硕士专业学位研究生课程体系的现状研究”生成这种格式的一段话,要求100个字左右",
mulu_prompt" = 为论文题目“{}”生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题",
zhaiyao_prompt" = 请以《{}》为题目生成论文摘要,要求生成的字数在600字左右",
beijing_prompt" = 以“{}”为论文题目,写一段题目来源的背景,要求字数在200字以内",
zongjie_prompt" = 以“{}”为论文题目,写一个论文简短总结,要求在300字以内",
zongshu_prompt" = 请写出以《{}》为课题的国内外研究状况综述,字数在800字左右",
yanjiubeijingyiyi_prompt" = 请分别写出以《{}》为课题的研究背景和意义,字数不少于1000字",
jianjie_prompt" = 请帮我生成《{}》为题目的研究内容,包括整体简介和分最少三个方面总结"

32
摘要小文件整合.py

@ -0,0 +1,32 @@
import os
from tqdm import tqdm
import re
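# Concatenates the per-call zhaiyao_prompt output files (one file per API
# response) into a single dump, title_mulu_zhaiyao_data.txt, for downstream
# processing.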
data_path_list = []
for root, dirs, files in os.walk(r"./data/paper_prompt_title_3_2/zhaiyao_prompt"):
    for file in files:
        # collect every per-call output file path
        data_path_list.append(os.path.join(root, file))
print(data_path_list)
data_str = ""
for i in tqdm(data_path_list):
with open(i, encoding="utf-8") as f:
data_dan = f.read()
data_str += data_dan
data_str += "\n"
print(data_str)
with open("./data/paper_prompt_title_3_1/title_mulu_zhaiyao_data.txt", mode="w", encoding="utf-8") as f:
f.write(data_str)

108
数据筛选llama.py

@ -0,0 +1,108 @@
import re
import random
import json
from tqdm import tqdm
RE_CHINA_NUMS = "[一二三四五六七八九].?.?总结|[1-9].?.?总结|[一二三四五六七八九].?.?结论|[1-9].?.?结论"
RE_CHINA_TITLE = "请把其中的小标题“(.*?)”的内容补充完整|请把其中的大标题“(.*?)”的内容补充完整"
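# RE_CHINA_NUMS flags numbered "summary/conclusion" headings leaking into a
# generated section body; RE_CHINA_TITLE extracts which sub/major heading a
# sample was asked to write, so misplaced conclusions can be counted below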
data_tongji = {
"0-600": 0,
"600-1500": 0,
"1500-": 0,
}
data_tongji_prompt = []
def is_contains_chinese(strs):
for _char in strs:
if '\u4e00' <= _char <= '\u9fa5':
return True
return False
data_list = []
jishu = 0
with open("data/chatglm_paper_data_2_prompt.txt", encoding="utf-8") as f:
for i in tqdm(f):
data_dan = eval(i)
zishu_query = len(data_dan["query"])
zishu_response = len(data_dan["response"])
prompt = str(data_dan["prompt"]).replace("\\n", "\n")
query = data_dan["query"].replace("\\n", "\n")
response = data_dan["response"].replace("\\n", "\n")
if prompt == "翻译摘要#":
zishu_summary = len(response.split(" "))
elif prompt == "翻译关键词#":
zishu_summary = len(response.split(" "))
else:
bool_ = is_contains_chinese(response)
if bool_ == False:
print(data_dan)
continue
if "生成方向" in query:
query = query.replace("生成方向","研究方向")
if "生成方向" in response:
response = response.replace("生成方向", "研究方向")
if prompt == "生成论文小标题内容#":
            query_re = re.findall(RE_CHINA_TITLE, query)
            # findall with two alternated groups returns tuples; join the
            # groups to recover the heading text before the substring checks
            heading = "".join(query_re[0]) if query_re else ""
            if "总结" not in heading and "结论" not in heading:
response_re = re.findall(RE_CHINA_NUMS, response)
if response_re != []:
print(response)
print("==========================================================================================")
jishu += 1
if prompt[-1] != "\n":
prompt += "\n"
if query[-1] != "\n":
query += "\n"
query = "问:" + query + "答:\n"
if len(query) < 700 and len(response) < 1400:
data_list.append({
"instruction": prompt,
"input": query,
"output": response
})
train_nums = int(len(data_list) * 0.9)
random.shuffle(data_list)
print(train_nums)
train_list = data_list[:train_nums]
dev_list = data_list[train_nums:]
with open("./data/chatglm_train_3_prompt_llama.json", mode="w", encoding="utf-8") as f:
f.write(json.dumps(train_list, ensure_ascii=False, indent=2))
with open("./data/chatglm_dev_3_prompt_llama.json", mode="w", encoding="utf-8") as f:
f.write(json.dumps(dev_list, ensure_ascii=False, indent=2))
print(jishu)

91
数据筛选rwkv.py

@ -0,0 +1,91 @@
import random
import json
from tqdm import tqdm
data_tongji = {
"0-600": 0,
"600-1500": 0,
"1500-": 0,
}
data_tongji_prompt = []
def is_contains_chinese(strs):
for _char in strs:
if '\u4e00' <= _char <= '\u9fa5':
return True
return False
data_list = []
with open("data/chatglm_paper_data_2_prompt.txt", encoding="utf-8") as f:
for i in tqdm(f):
data_dan = eval(i)
zishu_query = len(data_dan["query"])
zishu_response = len(data_dan["response"])
query = data_dan["query"]
response = data_dan["response"]
prompt = data_dan["prompt"]
if prompt == "翻译摘要#":
zishu_summary = len(data_dan["response"].split(" "))
elif prompt == "翻译关键词#":
zishu_summary = len(data_dan["response"].split(" "))
else:
bool_ = is_contains_chinese(data_dan["response"])
if bool_ == False:
print(data_dan)
continue
if "生成方向" in query:
data_dan["query"] = query.replace("生成方向","研究方向")
if "生成方向" in response:
data_dan["response"] = response.replace("生成方向", "研究方向")
        if zishu_query < 700 and zishu_response < 1400:
            data_dan_dict = {
                "text": "Bob: " + data_dan["query"] + "\n\nAlice: " + data_dan["response"]
            }
            data_list.append(json.dumps(data_dan_dict, ensure_ascii=False))
train_nums = int(len(data_list) * 0.8)
random.shuffle(data_list)
print(train_nums)
train_list = data_list[:train_nums]
dev_list = data_list[train_nums:]
with open("./data/chatglm_train_3_chatrwkv.jsonl", mode="w", encoding="utf-8") as f:
for i in train_list:
f.write(i)
f.write("\n")
with open("./data/chatglm_dev_3_chatrwkv.jsonl", mode="w", encoding="utf-8") as f:
for i in dev_list:
f.write(i)
f.write("\n")

33
数据统计.py

@ -0,0 +1,33 @@
import json
# {
# '生成课题的研究背景和意义#': 1851,
# '生成论文小标题内容#': 8316,
# '生成目录#': 1975,
# '生成论文摘要#': 958,
# '生成6点本篇论文应完成的主要内容#': 881,
# '生成研究内容#': 2014,
# '生成关键字#': 850,
# '翻译关键词#': 980,
# '生成论文简短总结#': 2055,
# '生成论文来源的背景#': 2003,
# '生成课题的国内外研究状况综述#': 1915,
# '翻译摘要#': 199
# }
path = "data/chatglm_dev_3_prompt.json"
with open(path, encoding="utf-8") as f:
data = f.readlines()
data_type = {}
for i in data:
data_dan = eval(i)
# if data_dan["prompt"] not in data_type:
# data_type[data_dan["prompt"]] = 1
# else:
# data_type[data_dan["prompt"]] += 1
if data_dan["prompt"] == "生成论文小标题内容#":
print(i)
print(data_type)

14
测试正则.py

@ -0,0 +1,14 @@
#-*- coding:utf-8 -*-
import re
a = "[一二三四五六七八九].?.?总结|[1-9].?.?总结"
b = "1.2.1 总结 adsadadadadadadadadadadadadadad"
c = "请把其中的小标题“(.*?)”的内容补充完整"
d = "问:论文题目是“《子不语》精怪故事研究”,目录是“一、引言\n1.1 研究背景\n1.2 研究意义\n1.3 研究方法\n\n二、《子不语》精怪故事概述\n2.1 《子不语》的作者和成书背景\n2.2 《子不语》中的精怪故事类型\n2.3 《子不语》中的精怪故事特点\n\n三、《子不语》中的精怪故事主题\n3.1 爱情主题\n3.2 死亡主题\n3.3 婚姻主题\n3.4 人性主题\n\n四、《子不语》中的精怪故事人物形象\n4.1 神仙形象\n4.2 鬼怪形象\n4.3 人物形象\n\n五、《子不语》中的精怪故事情节分析\n5.1 情节的变化和转折\n5.2 情节的发展和衔接\n5.3 情节的意义和价值\n\n六、《子不语》中的精怪故事艺术特色\n6.1 语言艺术特色\n6.2 形象艺术特色\n6.3 结构艺术特色\n\n七、结论\n7.1 研究成果总结\n7.2 研究不足和展望\n\n参考文献”,请把其中的小标题“5.1 情节的变化和转折”的内容补充完整,补充内容字数在1000字左右\n答:\n"
print(re.findall(c, d))

38
测试生成效果.py

@ -0,0 +1,38 @@
import requests
import json
def request_api_chatgpt(api_key, prompt):
OPENAI_API_KEY = api_key
url = "https://api.openai.com/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {OPENAI_API_KEY}"
}
data = {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": prompt},
],
"temperature": 0.5
}
response = requests.post(url,
headers=headers,
data=json.dumps(data),
timeout=240)
res = response.json()
text = res["choices"][0]["message"]["content"]
print(text)
api = "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"
# prompt = "为论文题目“基于单片机的多功能充电控制器设计”生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题"
# prompt = "为论文题目“基于单片机的多功能充电控制器设计”生成目录,以“研制一款基于单片机的多功能充电控制器,包括硬件和软件设计。最终成
# 果是研制一台可对多种类型蓄电池充电的控制器实物,并以一个特定蓄电池充电为例,验证所设计控制器的可用性”为论文的生成方向,要求只有一级
# 标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题"
# 基于单片机的多功能充电控制器设计”生成目录,以“研制一款基于单片机的多功能充电控制器,包括硬件和软件设计。最终成果是研制一台可对多种类型蓄电池充电的控制器实物,并以一个特定蓄电池充电为例,验证所设计控制器的可用性
prompt = "请以《基于单片机的多功能充电控制器设计》为题目,以“研制一款基于单片机的多功能充电控制器,包括硬件和软件设计。最终成果是研制一台可对多种类型蓄电池充电的控制器实物,并以一个特定蓄电池充电为例,验证所设计控制器的可用性”为论文的生成方向,生成论文摘要,要求生成的字数在600字左右"
request_api_chatgpt(api, prompt)

159
生成chatglm训练数据.py

@ -0,0 +1,159 @@
import os
import json
import random
def is_contains_chinese(strs):
for _char in strs:
if '\u4e00' <= _char <= '\u9fa5':
return True
return False
# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'
lable_data_amount = {
"title_beijing_prompt_data.txt": {"num_token": 5000, "prompt": "生成论文来源的背景#"},
"title_jianjie_prompt_data.txt": {"num_token": 5000, "prompt": "生成研究内容#"},
"title_mulu_prompt_data.txt": {"num_token": 5000, "prompt": "生成目录#"},
"title_yanjiubeijingyiyi_prompt_data.txt": {"num_token": 5000, "prompt": "生成课题的研究背景和意义#"},
"title_zongjie_prompt_data.txt": {"num_token": 5000, "prompt": "生成论文简短总结#"},
"title_zongshu_prompt_data.txt": {"num_token": 5000, "prompt": "生成课题的国内外研究状况综述#"},
"jianjie_task_book_prompt_data.txt": {"num_token": 5000, "prompt": "生成6点本篇论文应完成的主要内容#"},
"title_mulu_references_prompt_data.txt": {"num_token": 1, "prompt": "生成参考文献#"},
"title_mulu_small_title_prompt_shuffle_data.txt": {"num_token": 18730, "prompt": "生成论文小标题内容#"},
"title_mulu_zhaiyao_data.txt": {"num_token": 5000, "prompt": "生成论文摘要#"},
"zhaiyao_chinese_keyword_prompt_data.txt": {"num_token": 5000, "prompt": "生成关键字#"},
"zhaiyao_fanyi_prompt_data.txt": {"num_token": 5000, "prompt": "翻译摘要#"},
"chinese_keyword_en_prompt_data.txt": {"num_token": 5000, "prompt": "翻译关键词#"},
"title_hexin_beijing_prompt_data.txt": {"num_token": 4971, "prompt": "生成论文来源的背景#"},
"title_hexin_jianjie_prompt_data.txt": {"num_token": 4903, "prompt": "生成研究内容#"},
"title_hexin_mulu_prompt_data.txt": {"num_token": 4954, "prompt": "生成目录#"},
"title_hexin_yanjiubeijingyiyi_prompt_data.txt": {"num_token": 4902, "prompt": "生成课题的研究背景和意义#"},
"title_hexin_zongjie_prompt_data.txt": {"num_token": 4971, "prompt": "生成论文简短总结#"},
"title_hexin_zongshu_prompt_data.txt": {"num_token": 4671, "prompt": "生成课题的国内外研究状况综述#"}
}
re_file = {
"title_beijing_prompt_data.txt": "\n以“",
"title_jianjie_prompt_data.txt": "\n请帮我生成《",
"title_mulu_prompt_data.txt": "\n为论文题目“",
"title_yanjiubeijingyiyi_prompt_data.txt": "\n请分别写出以《",
"title_zongjie_prompt_data.txt": "\n以“",
"title_zongshu_prompt_data.txt": "\n请写出以《",
"jianjie_task_book_prompt_data.txt": "\n\"请根据题目为《",
"title_mulu_references_prompt_data.txt": "\n\"论文题目是“",
"zhaiyao_chinese_keyword_prompt_data.txt": "\n\"请为“",
"zhaiyao_fanyi_prompt_data.txt": "\n\"请把“",
"chinese_keyword_en_prompt_data.txt": "\n\"请把“",
"title_mulu_zhaiyao_data.txt": "@@@@@@@@@@@@@@@@@@",
"title_mulu_small_title_prompt_shuffle_data.txt": "@@@@@@@@@@@@@@@@@@",
"title_hexin_beijing_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
"title_hexin_jianjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
"title_hexin_mulu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
"title_hexin_yanjiubeijingyiyi_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
"title_hexin_zongjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
"title_hexin_zongshu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@"
}
split_teshu = [
"title_mulu_zhaiyao_data.txt",
"title_mulu_small_title_prompt_shuffle_data.txt",
"title_hexin_beijing_prompt_data.txt",
"title_hexin_jianjie_prompt_data.txt",
"title_hexin_mulu_prompt_data.txt",
"title_hexin_yanjiubeijingyiyi_prompt_data.txt",
"title_hexin_zongjie_prompt_data.txt",
"title_hexin_zongshu_prompt_data.txt"
]
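# re_file maps each raw dump to the delimiter its records were split on; for
# files not in split_teshu the delimiter is "\n" plus the first characters of
# the prompt itself, so everything after the newline is re-attached to each
# record below (spilt_dan[1:])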
path_list = []
file = "./data/paper_prompt_title_3"
for root, dirs, files in os.walk(file):
for file in files:
path = os.path.join(root, file)
path_list.append(path)
file = "./data/paper_prompt_title_3_1"
for root, dirs, files in os.walk(file):
for file in files:
path = os.path.join(root, file)
path_list.append(path)
file = "./data/paper_prompt_title_3_1_1"
for root, dirs, files in os.walk(file):
for file in files:
path = os.path.join(root, file)
path_list.append(path)
file = "./data/paper_prompt_title_hexin_3"
for root, dirs, files in os.walk(file):
for file in files:
path = os.path.join(root, file)
path_list.append(path)
text_list_new = []
tongji = {}
for path in path_list:
    task_name = os.path.basename(path)  # portable across OS path separators
if task_name in re_file:
spilt_dan = re_file[task_name]
else:
continue
train_data_amount = lable_data_amount[task_name]
with open(path, encoding="utf-8") as f:
text = f.read()
text_list = text.split(spilt_dan)
index = 1
while True:
if index >= train_data_amount:
break
data_dan = text_list[index]
if "**************" in data_dan:
# if task_name == "title_jianjie_prompt_data.txt":
# content, summary = data_dan.split("**************")
# bool_ = is_contains_chinese(summary)
# if bool_ == False:
# index += 1
# continue
if task_name in split_teshu:
data_dan = data_dan
else:
data_dan = spilt_dan[1:] + data_dan
text_list_new.append(data_dan)
index += 1
if task_name not in tongji:
tongji[task_name] = 1
else:
tongji[task_name] += 1
else:
index += 4
print(data_dan)
# train_list.append({"content": str(title_p), "summary": str(b)})
train_list = []
for text in text_list_new:
content, summary = text.split("**************")
train_list.append(
{"content": str(content).strip("\"").strip("\n").strip("\""), "summary": str(summary)}
)
random.shuffle(train_list)
for i in tongji:
print(i, tongji[i])
with open("./data/chatglm_paper_data_2.txt", mode="w", encoding="utf-8") as f:
for i in train_list:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")

165
生成chatglm训练数据包含prompt.py

@ -0,0 +1,165 @@
import os
import json
import random
def is_contains_chinese(strs):
for _char in strs:
if '\u4e00' <= _char <= '\u9fa5':
return True
return False
# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'
lable_data_amount = {
"title_beijing_prompt_data.txt": {"num_token": 5000, "prompt": "生成论文来源的背景#"},
"title_jianjie_prompt_data.txt": {"num_token": 5000, "prompt": "生成研究内容#"},
"title_mulu_prompt_data.txt": {"num_token": 5000, "prompt": "生成目录#"},
"title_yanjiubeijingyiyi_prompt_data.txt": {"num_token": 5000, "prompt": "生成课题的研究背景和意义#"},
"title_zongjie_prompt_data.txt": {"num_token": 5000, "prompt": "生成论文简短总结#"},
"title_zongshu_prompt_data.txt": {"num_token": 5000, "prompt": "生成课题的国内外研究状况综述#"},
"jianjie_task_book_prompt_data.txt": {"num_token": 5000, "prompt": "生成6点本篇论文应完成的主要内容#"},
"title_mulu_references_prompt_data.txt": {"num_token": 1, "prompt": "生成参考文献#"},
"title_mulu_small_title_prompt_shuffle_data.txt": {"num_token": -1, "prompt": "生成论文小标题内容#"},
"title_mulu_zhaiyao_data.txt": {"num_token": 5000, "prompt": "生成论文摘要#"},
"zhaiyao_chinese_keyword_prompt_data.txt": {"num_token": 5000, "prompt": "生成关键字#"},
"zhaiyao_fanyi_prompt_data.txt": {"num_token": 5000, "prompt": "翻译摘要#"},
"chinese_keyword_en_prompt_data.txt": {"num_token": 5000, "prompt": "翻译关键词#"},
"title_hexin_beijing_prompt_data.txt": {"num_token": 4971, "prompt": "生成论文来源的背景#"},
"title_hexin_jianjie_prompt_data.txt": {"num_token": 4903, "prompt": "生成研究内容#"},
"title_hexin_mulu_prompt_data.txt": {"num_token": 4954, "prompt": "生成目录#"},
"title_hexin_yanjiubeijingyiyi_prompt_data.txt": {"num_token": 4902, "prompt": "生成课题的研究背景和意义#"},
"title_hexin_zongjie_prompt_data.txt": {"num_token": 4971, "prompt": "生成论文简短总结#"},
"title_hexin_zongshu_prompt_data.txt": {"num_token": 4671, "prompt": "生成课题的国内外研究状况综述#"}
}
re_file = {
"title_beijing_prompt_data.txt": "\n以“",
"title_jianjie_prompt_data.txt": "\n请帮我生成《",
"title_mulu_prompt_data.txt": "\n为论文题目“",
"title_yanjiubeijingyiyi_prompt_data.txt": "\n请分别写出以《",
"title_zongjie_prompt_data.txt": "\n以“",
"title_zongshu_prompt_data.txt": "\n请写出以《",
"jianjie_task_book_prompt_data.txt": "\n\"请根据题目为《",
"title_mulu_references_prompt_data.txt": "\n\"论文题目是“",
"zhaiyao_chinese_keyword_prompt_data.txt": "\n\"请为“",
"zhaiyao_fanyi_prompt_data.txt": "\n\"请把“",
"chinese_keyword_en_prompt_data.txt": "\n\"请把“",
"title_mulu_zhaiyao_data.txt": "@@@@@@@@@@@@@@@@@@",
"title_mulu_small_title_prompt_shuffle_data.txt": "@@@@@@@@@@@@@@@@@@",
"title_hexin_beijing_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
"title_hexin_jianjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
"title_hexin_mulu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
"title_hexin_yanjiubeijingyiyi_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
"title_hexin_zongjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
"title_hexin_zongshu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@"
}
split_teshu = [
"title_mulu_zhaiyao_data.txt",
"title_mulu_small_title_prompt_shuffle_data.txt",
"title_hexin_beijing_prompt_data.txt",
"title_hexin_jianjie_prompt_data.txt",
"title_hexin_mulu_prompt_data.txt",
"title_hexin_yanjiubeijingyiyi_prompt_data.txt",
"title_hexin_zongjie_prompt_data.txt",
"title_hexin_zongshu_prompt_data.txt"
]
path_list = []
file = "./data/paper_prompt_title_3"
for root, dirs, files in os.walk(file):
for file in files:
path = os.path.join(root, file)
path_list.append(path)
file = "./data/paper_prompt_title_3_1"
for root, dirs, files in os.walk(file):
for file in files:
path = os.path.join(root, file)
path_list.append(path)
file = "./data/paper_prompt_title_3_1_1"
for root, dirs, files in os.walk(file):
for file in files:
path = os.path.join(root, file)
path_list.append(path)
file = "./data/paper_prompt_title_hexin_3"
for root, dirs, files in os.walk(file):
for file in files:
path = os.path.join(root, file)
path_list.append(path)
text_list_new = []
tongji = {}
for path in path_list:
    task_name = os.path.basename(path)  # portable across OS path separators
if task_name in re_file:
spilt_dan = re_file[task_name]
else:
continue
train_data_amount_dict = lable_data_amount[task_name]
train_data_amount = train_data_amount_dict["num_token"]
prompt = train_data_amount_dict["prompt"]
with open(path, encoding="utf-8") as f:
text = f.read()
text_list = text.split(spilt_dan)
index = 1
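    # num_token == -1 (set for the small-title file) means: use every record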
if train_data_amount == -1:
train_data_amount = len(text_list) -1
while True:
if index >= train_data_amount:
break
data_dan = text_list[index]
if "**************" in data_dan:
# if task_name == "title_jianjie_prompt_data.txt":
# content, summary = data_dan.split("**************")
# bool_ = is_contains_chinese(summary)
# if bool_ == False:
# index += 1
# continue
if task_name in split_teshu:
data_dan = data_dan
else:
data_dan = spilt_dan[1:] + data_dan
text_list_new.append((data_dan, prompt))
index += 1
if task_name not in tongji:
tongji[task_name] = 1
else:
tongji[task_name] += 1
else:
index += 4
print(data_dan)
# train_list.append({"content": str(title_p), "summary": str(b)})
train_list = []
for text, prompt in text_list_new:
content, summary = text.split("**************")
train_list.append(
{"query": str(content).strip("\"").strip("\n").strip("\""), "response": str(summary), "prompt": prompt}
)
random.shuffle(train_list)
for i in tongji:
print(i, tongji[i])
with open("./data/chatglm_paper_data_2_prompt.txt", mode="w", encoding="utf-8") as f:
for i in train_list:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")

127
生成文本核心内容提示.py

@ -0,0 +1,127 @@
import time
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis
lock = threading.RLock()
pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*')
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
with open("api_key.txt", "r",) as f:
a = f.read()
a = a.split("\n")
redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"
api_key_list = []
for i in a:
api_key_list.append(str(i.split("----")[-1]))
for i in api_key_list:
redis_.rpush(redis_key_name_openaikey_list, i)
zhuyaoneirong_prompt = "“《基于单片机的多功能充电控制器设计》:研制一款基于单片机的多功能充电控制器,包括硬件和软件设计。最终成果是" \
                       "研制一台可对多种类型蓄电池充电的控制器实物,并以一个特定蓄电池充电为例,验证所设计控制器的可用性”," \
                       "以上面话术为标准。根据论文题目为《{}》生成这种格式的一段话,要求200个字以内"  # trailing comma removed: it silently turned the string into a 1-tuple
with open("./data/题目2.txt", encoding="utf-8") as f:
text = f.read()
text_list = text.split("\n")
random.shuffle(text_list)
text_list = text_list[:6000]
title_list = []
for i in text_list:
title_list.append(i.split("@@@@@")[0])
random.shuffle(title_list)
print(len(title_list))
zirenwu_list = []
for title in title_list:
zirenwu_list.append(("zhuyaoneirong_prompt", str(zhuyaoneirong_prompt).format(title)))
for i in zirenwu_list:
redis_.rpush(redis_zirenwu, str(i))
def request_api_chatgpt(api_key, task_type, prompt):
try:
OPENAI_API_KEY = api_key
url = "https://api.openai.com/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {OPENAI_API_KEY}"
}
data = {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": prompt},
],
"temperature": 0.5
}
response = requests.post(url,
headers=headers,
data=json.dumps(data),
timeout=240)
res = response.json()
text = res["choices"][0]["message"]["content"]
lock.acquire()
# api_key_list.append(api_key)
redis_.rpush(redis_key_name_openaikey_list, api_key)
with open("/home/majiahui/mulu_ner/data/paper_prompt_title_3/title_{}_data.txt".format(task_type), mode="a") as f:
f.write(prompt)
f.write("**************")
f.write(text)
f.write("\n")
lock.release()
except:
print("task_type_bad", task_type)
print("api_key_bad", api_key)
time.sleep(5)
lock.acquire()
redis_.rpush(redis_key_name_openaikey_list, api_key)
redis_.rpush(redis_zirenwu, str((task_type, prompt)))
lock.release()
if __name__ == '__main__':
    while True:
        if redis_.llen(redis_zirenwu) == 0:
            # No pending tasks.
            time.sleep(1)
            continue
        elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0:
            # Pop one key and one task under the lock, then dispatch a worker thread.
            lock.acquire()
            api_key = redis_.lpop(redis_key_name_openaikey_list)
            api_key = api_key.decode('UTF-8')
            dan_zirenwu = redis_.lpop(redis_zirenwu)
            dan_zirenwu = dan_zirenwu.decode('UTF-8')
            lock.release()
            dan_zirenwu = eval(dan_zirenwu)
            task_type, prompt = dan_zirenwu[0], dan_zirenwu[1]
            t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt))
            t.start()
        elif redis_.llen(redis_key_name_openaikey_list) == 0:
            # All keys are in flight; wait for a worker to return one.
            time.sleep(1)
            continue
        else:
            time.sleep(1)
            continue

74
生成训练文件.py

@ -0,0 +1,74 @@
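"""
Convert raw "prompt**************completion" records in data/prompt_small_gen.txt
into {"content": ..., "summary": ...} pairs for fine-tuning, re-deriving the title,
table of contents, section heading, and target word count from each prompt with
regular expressions. Samples whose prompt plus completion exceed 2048 characters
are dropped.
"""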
import json
import re

# Regexes that recover the fields embedded in each generation prompt.
pantten_mulu = "目录是“(.*?)”,请把其中"
pantten_title = "“(.*?)”,目录是"
pantten_small_title = "请把其中的小标题“(.*?)”的内容补充完整"
pantten_big_title = "请把其中的大标题“(.*?)”的内容补充完整"
pantten_zishu = "的内容补充完整,补充内容字数在(.*?)字左右"

with open("data/prompt_small_gen.txt", encoding="utf-8") as f:
    content = f.read()

content_list = content.split("\"论文题目是")
content_list = content_list[1:]
content_list = [i.strip("\n") for i in content_list]

train = []
print(len(content_list))
for i in content_list:
    # Table of contents; skip records the pattern cannot parse.
    result_biaoti_list = re.findall(pantten_mulu, i)
    if not result_biaoti_list:
        print(i)
        continue
    if result_biaoti_list[0] != "":
        mulu_list = str(result_biaoti_list[0]).split("\\n")
        mulu_list = [j.strip() for j in mulu_list if j != ""]
        mulu = "@".join(mulu_list)
    else:
        continue

    # Thesis title.
    result_biaoti_list = re.findall(pantten_title, i)
    if result_biaoti_list and result_biaoti_list[0] != "":
        title = result_biaoti_list[0]
    else:
        continue

    result_biaoti_small_list = re.findall(pantten_small_title, i)
    result_biaoti_big_list = re.findall(pantten_big_title, i)
    if result_biaoti_small_list != []:
        small_title = result_biaoti_small_list[0]
        result_biaoti_list = re.findall(pantten_zishu, i)
        if result_biaoti_list and result_biaoti_list[0] != "":
            zishu = result_biaoti_list[0]
        else:
            continue
        small_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的小标题“{}”的内容补充完整,补充内容字数在{}字左右"
        neirong = i.split("**************")[1]
        a = small_title_prompt.format(title, mulu, small_title, zishu)
        # Keep the pair only if prompt plus completion fit within 2048 characters.
        if len(str(a)) + len(str(neirong)) < 2048:
            train.append({"content": str(a), "summary": str(neirong)})
    elif result_biaoti_big_list != []:
        big_title = result_biaoti_big_list[0]
        result_biaoti_list = re.findall(pantten_zishu, i)
        if result_biaoti_list and result_biaoti_list[0] != "":
            zishu = result_biaoti_list[0]
        else:
            continue
        big_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的大标题“{}”的内容补充完整,补充内容字数在{}字左右"
        neirong = i.split("**************")[1]
        a = big_title_prompt.format(title, mulu, big_title, zishu)
        if len(str(neirong)) + len(str(a)) < 2048:
            train.append({"content": str(a), "summary": str(neirong)})
    else:
        continue

with open("data/small_title_train.json", "w", encoding="utf-8") as f:
    for i in train:
        f.write(json.dumps(i, ensure_ascii=False))
        f.write("\n")

11
统计数据.py

@ -0,0 +1,11 @@
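# Count the records in data/title.txt; records are separated by a long "+" divider line.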
path = "data/title.txt"
with open(path, encoding="utf-8") as f:
    text = f.read()
text_list = text.split("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
print(len(text_list))

6
统计标题.py

@ -0,0 +1,6 @@
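# Count the title records in data/title.txt (same "+" divider format as 统计数据.py).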
path = "data/title.txt"
with open(path, encoding="utf-8") as f:
    text = f.read()
text_list = text.split("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
print(len(text_list))

11
计算器.py

@ -0,0 +1,11 @@
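# Scratch calculator for a home-furnishing quote; unrelated to the training pipeline.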
# Wardrobe section
a = (888 * 24)  # wardrobe covered by the package deal
print(a)

# Cabinet section, assuming the base cabinets, countertop, and wall cabinets each exceed 3 meters
d = (3*999 + 3*999 + 3*999)
e = d + 4999

# 24 m of custom wardrobe + 6 m countertop, 6 m base cabinets, 4 m wall cabinets, minus the "楼控" part
print(a + e)

83
训练数据筛选.py

@ -0,0 +1,83 @@
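"""
Filter raw ChatGLM training records from data/chatglm_paper_data_2.txt: drop
non-Chinese completions (except the two translation tasks, which are measured
by word count), replace "生成方向" with "研究方向", enforce content/summary length
limits, shuffle, and write a 90/10 train/dev split as JSON lines.
"""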
import os
import random
import json
from tqdm import tqdm

data_tongji = {
    "0-600": 0,
    "600-1500": 0,
    "1500-": 0,
}
data_tongji_prompt = []


def is_contains_chinese(strs):
    """Return True if the string contains at least one CJK character."""
    for _char in strs:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False


data_list = []
with open("data/chatglm_paper_data_2.txt", encoding="utf-8") as f:
    for i in tqdm(f):
        # One JSON record per line (json.loads would be the safer parser here).
        data_dan = eval(i)
        zishu_content = len(data_dan["content"])
        zishu_summary = len(data_dan["summary"])
        prompt = data_dan["content"]
        summary = data_dan["summary"]
        # Translation outputs are English, so measure their length in words.
        if "这段文字翻译成英文" in prompt:
            zishu_summary = len(data_dan['summary'].split(" "))
        elif "这几个关键字翻译成英文" in prompt:
            zishu_summary = len(data_dan['summary'].split(" "))
        else:
            # Everything else must contain Chinese; drop garbage completions.
            bool_ = is_contains_chinese(data_dan["summary"])
            if bool_ == False:
                print(data_dan)
                continue
        if "生成方向" in prompt:
            data_dan["content"] = prompt.replace("生成方向", "研究方向")
        if "生成方向" in summary:
            data_dan["summary"] = summary.replace("生成方向", "研究方向")
        # Enforce length limits before keeping the record.
        if zishu_content < 900 and zishu_summary < 1900:
            data_list.append(json.dumps(data_dan, ensure_ascii=False))
        # if zishu_summary < 600:
        #     data_tongji["0-600"] += 1
        # if 600 < zishu_summary < 1500:
        #     data_tongji["600-1500"] += 1
        # if 1500 < zishu_summary:
        #     data_tongji["1500-"] += 1
        #     data_tongji_prompt.append([data_dan['summary'], zishu_summary])
        # else:
        #     train_list.append(i)

# for i in data_tongji_prompt:
#     print(i)

# 90/10 train/dev split.
train_nums = int(len(data_list) * 0.9)
dev_nums = int(len(data_list) * 0.1)

random.shuffle(data_list)
train_list = data_list[:train_nums]
dev_list = data_list[train_nums:]

with open("./data/chatglm_train_3.json", mode="w", encoding="utf-8") as f:
    for i in train_list:
        f.write(i)
        f.write("\n")
with open("./data/chatglm_dev_3.json", mode="w", encoding="utf-8") as f:
    for i in dev_list:
        f.write(i)
        f.write("\n")

# print(data_tongji)

89
训练数据筛选_prompt.py

@ -0,0 +1,89 @@
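"""
Same filtering as 训练数据筛选.py, but for (query, response, prompt) records from
data/chatglm_paper_data_2_prompt.txt: translated responses are length-checked by
word count, "生成方向" is normalized to "研究方向", over-long records are dropped,
and an 80/20 train/dev split is written as JSON lines.
"""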
import os
import random
import json
from tqdm import tqdm

data_tongji = {
    "0-600": 0,
    "600-1500": 0,
    "1500-": 0,
}
data_tongji_prompt = []


def is_contains_chinese(strs):
    """Return True if the string contains at least one CJK character."""
    for _char in strs:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False


data_list = []
with open("data/chatglm_paper_data_2_prompt.txt", encoding="utf-8") as f:
    for i in tqdm(f):
        data_dan = eval(i)
        zishu_query = len(data_dan["query"])
        zishu_response = len(data_dan["response"])
        query = data_dan["query"]
        response = data_dan["response"]
        prompt = data_dan["prompt"]
        # Translated responses are English, so measure their length in words.
        if prompt == "翻译摘要#":
            zishu_response = len(data_dan["response"].split(" "))
        elif prompt == "翻译关键词#":
            zishu_response = len(data_dan["response"].split(" "))
        else:
            # Everything else must contain Chinese; drop garbage completions.
            bool_ = is_contains_chinese(data_dan["response"])
            if bool_ == False:
                print(data_dan)
                continue
        if "生成方向" in query:
            data_dan["query"] = query.replace("生成方向", "研究方向")
        if "生成方向" in response:
            data_dan["response"] = response.replace("生成方向", "研究方向")
        if zishu_query < 700 and zishu_response < 1400:
            data_list.append(json.dumps(data_dan, ensure_ascii=False))
        # if zishu_summary < 600:
        #     data_tongji["0-600"] += 1
        # if 600 < zishu_summary < 1500:
        #     data_tongji["600-1500"] += 1
        # if 1500 < zishu_summary:
        #     data_tongji["1500-"] += 1
        #     data_tongji_prompt.append([data_dan['summary'], zishu_summary])
        # else:
        #     train_list.append(i)

# for i in data_tongji_prompt:
#     print(i)

# 80/20 train/dev split.
train_nums = int(len(data_list) * 0.8)
dev_nums = int(len(data_list) * 0.2)

random.shuffle(data_list)
print(train_nums)
train_list = data_list[:train_nums]
dev_list = data_list[train_nums:]

with open("./data/chatglm_train_3_prompt.json", mode="w", encoding="utf-8") as f:
    for i in train_list:
        f.write(i)
        f.write("\n")
with open("./data/chatglm_dev_3_prompt.json", mode="w", encoding="utf-8") as f:
    for i in dev_list:
        f.write(i)
        f.write("\n")

# print(data_tongji)

41
题目去重.py

@ -0,0 +1,41 @@
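# Deduplicate thesis titles: keep only lines from 题目4.txt whose title does not already
# appear in 题目2.txt or 题目3.txt, then write the survivors to 题目4_new.txt.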
title_list = []
with open("./data/题目3.txt", encoding="utf-8") as f:
    text = f.read()
text_list = text.split("\n")
for i in text_list:
    title_list.append(i.split("@@@@@")[0])

with open("./data/题目2.txt", encoding="utf-8") as f:
    text = f.read()
text_list = text.split("\n")
for i in text_list:
    title_list.append(i.split("@@@@@")[0])
print(title_list)

with open("./data/题目4.txt", encoding="utf-8") as f:
    text = f.read()
text_list = text.split("\n")

title_list_new = []
for i in text_list:
    if i.split("@@@@@")[0] in title_list:
        continue
    else:
        title_list_new.append(i)
print(len(title_list_new))

with open("./data/题目4_new.txt", mode="w", encoding="utf-8") as f:
    for i in title_list_new:
        f.write(i)
        f.write("\n")