From f0a4e4e68ff3497fa8e1333fdecc164ca57e822f Mon Sep 17 00:00:00 2001
From: "majiahui@haimaqingfan.com" <majiahui@haimaqingfan.com>
Date: Thu, 20 Jul 2023 00:05:19 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=95=B4=E6=B5=81=E7=A8=8B=E4=B8=8A?=
 =?UTF-8?q?=E4=BC=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                      | 163 +++++++++++++++++++++++++
 chatgpt数据清洗整合训练数据.py | 201 +++++++++++++++++++++++++++++++
 generate_first.py              | 144 ++++++++++++++++++++++
 generate_first_hexin.py        | 150 +++++++++++++++++++++++
 generate_fourth.py             | 132 ++++++++++++++++++++
 generate_mulu.py               |   8 +-
 generate_sencend.py            | 134 +++++++++++++++++++++
 generate_third.py              | 134 +++++++++++++++++++++
 mulu转化为提示文本.py          |  56 +++------
 mulu转化为提示文本生成摘要.py  |  23 ++--
 zhaiyoa转化为提示.py           |  21 ++--
 提取题目.py                    |  23 ++++
 目录筛选.py                    |  18 +--
 读取题目&核心内容数据.py       |  22 ++++
 14 files changed, 1152 insertions(+), 77 deletions(-)
 create mode 100644 chatgpt数据清洗整合训练数据.py
 create mode 100644 generate_first.py
 create mode 100644 generate_first_hexin.py
 create mode 100644 generate_fourth.py
 create mode 100644 generate_sencend.py
 create mode 100644 generate_third.py
 create mode 100644 提取题目.py
 create mode 100644 读取题目&核心内容数据.py

diff --git a/README.md b/README.md
index e69de29..ff9afe8 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,163 @@
+# Paper-generation data preparation
+
+## Data generation pipeline
+
+-> decide on the tasks
+-> edit the generation scripts (output paths, prompt wording, etc.)
+-> send the prompts to ChatGPT
+-> clean and filter the results (check that the data is correct)
+-> build the training data
+
+
+## Editing the generation scripts
+
+### First-level paper tasks
+The first-level tasks are:
+{
+    "mulu_prompt":              generate the table of contents (生成目录)
+    "beijing_prompt":           generate the background of the topic (生成论文来源的背景)
+    "zongjie_prompt":           generate a short summary of the paper (生成论文简短总结)
+    "zongshu_prompt":           generate the survey of domestic and international research (生成课题的国内外研究状况综述)
+    "yanjiubeijingyiyi_prompt": generate the research background and significance (生成课题的研究背景和意义)
+    "jianjie_prompt":           generate the research content (生成研究内容)
+    "zhixie_prompt":            generate the acknowledgements (生成致谢)
+}
+
+#### Generating the ChatGPT data
+Edit the parameters in generate_first.py, then run
+python generate_first.py to produce the ChatGPT data.
+
+#### Post-processing the results
+Processing the tables of contents:
+python 目录筛选.py
+
+If you only need one of these tasks, see
+generate_mulu_only.py
+
+
+### Second-level paper tasks
+Second-level tasks are generated from the results of the first-level tasks:
+
+{
+    "small_title_prompt_shuffle": generate the body text for each subsection heading (生成论文小标题内容)
+    "references_prompt":          generate the references (生成参考文献)
+    "zhaiyao_prompt":             generate the abstract (生成论文摘要)
+    "task_book_prompt":           generate six main points the paper should accomplish (生成6点本篇论文应完成的主要内容)
+}
+
+The “生成6点本篇论文应完成的主要内容” task needs the result of “生成研究内容”, so first build its prompt texts from title_jianjie_prompt_data.txt:
+edit the parameters in jianjie转化为tishi.py, then run
+python jianjie转化为tishi.py (writes the prompt texts to data/jianjie_to_/task_book_prompt.txt)
+
+The “生成参考文献”, “生成论文小标题内容” and “生成论文摘要” tasks need the result of “生成目录”, so first build their prompt texts from title_mulu_prompt_data.txt:
+edit the parameters in mulu转化为提示文本.py, then run
+python mulu转化为提示文本.py (writes two files under data/title_mulu_to_: references_prompt.txt and small_title_prompt_shuffle.txt)
+
+Edit the parameters in mulu转化为提示文本生成摘要.py, then run
+python mulu转化为提示文本生成摘要.py (writes one file under data/title_mulu_to_: zhaiyao_prompt.txt)
+
+Edit the parameters in generate_sencend.py, then run
+python generate_sencend.py to produce the ChatGPT data.
+
+If you only need one of these tasks, see
+mulu转化为提示文本_只针对小标题切无字数控制.py
+
+### Third-level paper tasks
+Third-level tasks are generated from the results of the second-level tasks:
+{
+    zhaiyao_chinese_keyword: generate the keywords (生成关键字)
+    zhaiyao_fanyi_prompt:    translate the abstract (翻译摘要)
+}
+
+The “生成关键字” and “翻译摘要” tasks need the result of “生成论文摘要”, so first build their prompt texts from title_mulu_zhaiyao_data.txt:
+edit the parameters in zhaiyoa转化为提示.py, then run
+python zhaiyoa转化为提示.py (writes the prompt texts to data/zhaiyao_to_/zhaiyao_chinese_keyword_prompt.txt
+ and data/zhaiyao_to_/zhaiyao_fanyi_prompt.txt)
+
+Edit the parameters in generate_third.py, then run
+python generate_third.py to produce the ChatGPT data.
+
+If you only need one of these tasks, see
+
+
+### Fourth-level paper tasks
+Fourth-level tasks are generated from the results of the third-level tasks:
+{
+    chinese_keyword_en_prompt_data: translate the keywords (翻译关键词)
+}
+
+The “翻译关键词” task needs the result of “生成关键字”, so first build its prompt texts from zhaiyao_chinese_keyword_prompt_data.txt:
+edit the parameters in chinese_keyword转化为提示.py, then run
+python chinese_keyword转化为提示.py (writes the prompt texts to data/chinese_keyword_to_/chinese_keyword_en_prompt.txt)
+
+Edit the parameters in generate_fourth.py, then run
+python generate_fourth.py to produce the ChatGPT data.
+
+If you only need one of these tasks, see
+
+
+### First-level paper tasks with core content
+The only difference from the plain first-level tasks is that every prompt also carries a "core content" (research direction) string:
+{
+    "mulu_prompt":              generate the table of contents (生成目录)
+    "beijing_prompt":           generate the background of the topic (生成论文来源的背景)
+    "zongjie_prompt":           generate a short summary of the paper (生成论文简短总结)
+    "zongshu_prompt":           generate the survey of domestic and international research (生成课题的国内外研究状况综述)
+    "yanjiubeijingyiyi_prompt": generate the research background and significance (生成课题的研究背景和意义)
+    "jianjie_prompt":           generate the research content (生成研究内容)
+}
+
+Edit the parameters in generate_first_hexin.py, then run
+python generate_first_hexin.py to produce the ChatGPT data.
+
+## Sending the prompts to ChatGPT
+
+## Data cleaning and filtering (checking that the data is correct)
+
+chatgpt数据清洗整合训练数据.py
+
+## Training data examples
+
+### Tasks generated from the title alone
+
+{
+"生成论文来源的背景#\n问:以《习近平新时代中国特色社会主义法治观初探》为论文题目,写一段题目来源的背景,要求字数在200字以内\n答:\n"
+
+"生成研究内容#\n问:请帮我生成《基于神经网络的商品推荐系统的研究与实现》为题目的研究内容,包括整体简介和分最少三个方面总结\n答:\n"
+
+"生成目录#\n问:为论文题目《我国医患纠纷行政调解前置程序问题的研究》生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题\n答:\n"
+
+"生成课题的研究背景和意义#\n问:请分别写出以《商业车险费率市场化改革对财险公司影响研究》为课题的研究背景和意义,字数不少于1000字\n答:\n"
+
+"生成论文简短总结#\n问:以《用于智能马桶盖的无袖带式血压监测系统开发》为论文题目,写一个论文简短总结,要求在300字以内\n答:\n"
+
+"生成课题的国内外研究状况综述#\n问:请写出以《新时代中国院线电影观感积极率研究》为课题的国内外研究状况综述,字数在800字左右\n答:\n"
+
+"生成6点本篇论文应完成的主要内容#\n问:请根据题目为《兰州市数字化城市管理提升研究》,和研究内容为“{生成研究内容}”总结出至少6点本篇论文应完成的主要内容,使用阿拉伯数字排列\n答:\n"
+
+"生成参考文献#"
+
+"生成论文小标题内容#\n问:论文题目是《1926-1930年归绥地区灾荒与社会流动》,目录是“{生成目录}”,请把其中的小标题“{小标题}”的内容补充完整,补充内容字数在1000字左右\n答:\n"
+
+"生成论文摘要#\n问:论文题目是《纳米水化硅酸钙改性隔热涂料的研究》,目录是“{生成目录}”,生成论文摘要,要求生成的字数在600字左右\n答:\n"
+
+"生成关键字#\n问:请为“{生成论文摘要}”这段论文摘要生成3-5个关键字,使用阿拉伯数字作为序号标注,例如“1.xxx \n2.xxx \n3.xxx \n4.xxx \n5.xxx \n”\n答:\n"
+
+"翻译摘要#\n问:请把“{生成论文摘要}”这段文字翻译成英文\n答:\n"
+
+"翻译关键词#\n问:请把“{生成关键字}”这几个关键字翻译成英文\n答:\n"
+}
+
+### Tasks generated from the title plus core content
+
+{
+
+"生成论文来源的背景#\n问:以《民航繁忙干线机票最低价格预测方法研究》为论文题目,以“本研究旨在探索一种新的机票价格预测方法,以提高乘客购票体验和航空公司的经济效益。该研究包括数据采集、数据预处理、特征工程、机器学习模型构建和模型评估等步骤。最终成果是开发出一款可预测繁忙干线机票最低价格的模型,并通过对比实际价格和预测价格的误差,验证该模型的准确性和可靠性。通过本研究,我们希望能为航空公司提供更准确的价格预测,为乘客提供更便捷的购票体验。”为论文的研究方向,写一段题目来源的背景,要求字数在200字以内\n答:\n"
+
+"生成研究内容#\n问:请帮我生成《A公司3C产品逆向物流业务流程优化》为题目,以“本文旨在优化A公司3C产品逆向物流业务流程,包括对退货、维修、换货等环节进行调研和分析,提出改善方案。最终成果是优化后的逆向物流业务流程实施,并以一个特定3C产品的退货流程为例,验证所设计方案的可行性和有效性。”为论文的研究方向,生成论文研究内容,包括整体简介和分最少三个方面总结\n答:\n"
+
+"生成目录#\n问:论文题目为《低碳试点城市的绿色GDP核算研究》,以“该研究旨在通过对低碳试点城市的绿色GDP核算,探索一种新的经济发展模式,以实现经济增长与环境保护的双赢。研究将结合城市资源利用情况、环境质量、生态系统服务等方面进行综合评估,建立低碳经济发展指标体系,从而为低碳试点城市的可持续发展提供理论和实践支持。最终成果将是一份完整的绿色GDP核算报告,以低碳试点城市为例,验证该研究的可行性和实用性。”为论文的研究方向,为论文生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题\n答:\n"
+
+"生成课题的研究背景和意义#\n问:请分别写出以《企业拟上市过程中的政府服务方式探析》为课题,以“研究拟上市企业在上市过程中,政府部门如何提供服务,探讨政府服务方式的优化和提升。最终成果是通过实地调研和案例分析,总结出一套适用于拟上市企业的政府服务模式,以提高拟上市企业上市成功率和促进经济发展。”为论文的研究方向,生成论文的研究背景和意义,字数不少于1000字\n答:\n"
+
+"生成论文简短总结#\n问:以《韩国民主巩固的困境问题研究》为论文题目,以“研究韩国民主化进程中所面临的困境问题,包括政治、经济、社会等多个方面的因素。最终成果是通过对韩国民主化进程的深入分析,提出一些可行的解决方案,以促进韩国民主巩固的发展。通过对韩国政治体制、经济发展、社会文化等方面的综合研究,探讨韩国民主化进程中所面临的困境问题,如政治腐败、经济不平等、社会分化等,分析其根源及影响因素。在此基础上,提出一些可行的解决方案,如加强反腐败力度、促进经济平等、强化社会文化建设等,以推动韩国民主巩固的进程。最终,通过实践验证所提出的解决方案的可行性,为韩国民主巩固的发展提供有益的借鉴。”为论文的研究方向,写一个论文简短总结,要求在300字以内\n答:\n"
+
+"生成课题的国内外研究状况综述#\n问:以《鲤疱疹病毒3型vIL-10基因的克隆表达及其对免疫相关因子调节作用的研究》为课题,以“研究鲤疱疹病毒3型vIL-10基因的克隆表达,探究其在免疫调节中的作用。通过实验验证其对免疫相关因子的调节作用,并分析其在免疫调节过程中的机制。最终成果是获得鲤疱疹病毒3型vIL-10基因的表达载体,并证明其在免疫调节中具有重要的调节作用。”为论文的研究方向,请写出这篇论文的国内外研究状况综述,字数在800字左右\n答:\n"
+}
\ No newline at end of file
diff --git a/chatgpt数据清洗整合训练数据.py b/chatgpt数据清洗整合训练数据.py
new file mode 100644
index 0000000..cabe679
--- /dev/null
+++ b/chatgpt数据清洗整合训练数据.py
@@ -0,0 +1,201 @@
+
+import os
+import json
+import re
+import math
+import numpy as np
+from tqdm import tqdm
+import random
+
+path_output = "chatgpt_data_v1"
+# Regexes used to pull titles and outlines back out of the stored prompts.
+patten = "目录是“(.*)”,请把其中的"
+pantten_biaoti = '[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
+thanks = "致谢"
+references = "参考文献"
+excursus = "附录"
+RE_CHINA_NUMS = "[一二三四五六七八九].?.?总结|[1-9].?.?总结|[一二三四五六七八九].?.?结论|[1-9].?.?结论"
+RE_CHINA_TITLE = "请把其中的小标题“(.*?)”的内容补充完整|请把其中的大标题“(.*?)”的内容补充完整"
+
+data_tongji = {
"0-600": 0, + "600-1500": 0, + "1500-": 0, +} + +jishu = 0 + +def is_contains_chinese(strs): + for _char in strs: + if '\u4e00' <= _char <= '\u9fa5': + return True + return False +# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+' + +lable_data_amount = { + "title_beijing_prompt_data.txt": {"num_token": -1, "prompt": "生成论文来源的背景#"}, + "title_jianjie_prompt_data.txt": {"num_token": -1, "prompt": "生成研究内容#"}, + "title_mulu_prompt_data.txt": {"num_token": -1, "prompt": "生成目录#"}, + "title_yanjiubeijingyiyi_prompt_data.txt": {"num_token": -1, "prompt": "生成课题的研究背景和意义#"}, + "title_zhixie_prompt_data.txt": {"num_token": -1, "prompt": "生成致谢#"}, + "title_zongjie_prompt_data.txt": {"num_token": -1, "prompt": "生成论文简短总结#"}, + "title_zongshu_prompt_data.txt": {"num_token": -1, "prompt": "生成课题的国内外研究状况综述#"}, + "sencend_task_book_prompt_data.txt": {"num_token": -1, "prompt": "生成6点本篇论文应完成的主要内容#"}, + "sencend_references_prompt_data.txt": {"num_token": -1, "prompt": "生成参考文献#"}, + "sencend_small_title_prompt_shuffle_data.txt": {"num_token": -1, "prompt": "生成论文小标题内容#"}, + "sencend_zhaiyao_prompt_data.txt": {"num_token": -1, "prompt": "生成论文摘要#"}, + "third_zhaiyao_chinese_keyword_prompt_data.txt": {"num_token": -1, "prompt": "生成关键字#"}, + "third_zhaiyao_fanyi_prompt_data.txt": {"num_token": -1, "prompt": "翻译摘要#"}, + "fourth_chinese_keyword_en_prompt_data.txt": {"num_token": -1, "prompt": "翻译关键词#"}, + "title_hexin_beijing_prompt_data.txt": {"num_token": -1, "prompt": "生成论文来源的背景#"}, + "title_hexin_jianjie_prompt_data.txt": {"num_token": -1, "prompt": "生成研究内容#"}, + "title_hexin_mulu_prompt_data.txt": {"num_token": -1, "prompt": "生成目录#"}, + "title_hexin_yanjiubeijingyiyi_prompt_data.txt": {"num_token": -1, "prompt": "生成课题的研究背景和意义#"}, + "title_hexin_zongjie_prompt_data.txt": {"num_token": -1, "prompt": "生成论文简短总结#"}, + "title_hexin_zongshu_prompt_data.txt": {"num_token": -1, "prompt": "生成课题的国内外研究状况综述#"}, + "title_hexin_zhixie_prompt_data.txt": {"num_token": -1, "prompt": "生成致谢#"} +} + + +patten_mulu = { + "title_mulu_references_prompt_data.txt": "目录是“(.*)”,请为这篇论文生成15篇左右的参考文献", + "title_mulu_small_title_prompt_shuffle_data_new.txt": "目录是“(.*)”,请把其中的小标题", + "title_mulu_zhaiyao_data.txt": "目录是“(.*)”,生成论文摘要" +} + + + +path_list = [] +file = "./data/{}/paper_prompt_title_1".format(path_output) +for root, dirs, files in os.walk(file): + for file in files: + path = os.path.join(root, file) + path_list.append(path) + +file = "./data/{}/paper_prompt_title_1_1".format(path_output) +for root, dirs, files in os.walk(file): + for file in files: + path = os.path.join(root, file) + path_list.append(path) + +file = "./data/{}/paper_prompt_title_1_1_1".format(path_output) +for root, dirs, files in os.walk(file): + for file in files: + path = os.path.join(root, file) + path_list.append(path) + +file = "./data/{}/paper_prompt_title_1_1_1_1".format(path_output) +for root, dirs, files in os.walk(file): + for file in files: + path = os.path.join(root, file) + path_list.append(path) + +file = "./data/{}/paper_prompt_title_1_hexin".format(path_output) +for root, dirs, files in os.walk(file): + for file in files: + path = os.path.join(root, file) + path_list.append(path) + +text_list_new = [] +tongji = {} + +if __name__ == '__main__': + + data_list = [] + train_list = [] + new_data_list = [] + for path in path_list: + patten_mulu_bool = False + shuminghao_bool = False + patten_mulu_patten = "" + task_name = path.split("\\")[-1] + + if task_name in patten_mulu: + patten_mulu_bool = True + patten_mulu_patten = patten_mulu[task_name] + + 
+        train_data_amount_dict = lable_data_amount[task_name]
+        train_data_amount = train_data_amount_dict["num_token"]
+        prompt = train_data_amount_dict["prompt"]
+
+        with open(path, encoding="utf-8") as f:
+            text = f.read()
+        text_list = text.split("@" * 20)
+
+        # Each record is "prompt ******************** completion"; records are
+        # separated by a run of twenty "@" characters.
+        for data_dan in text_list:
+            if "*" * 20 in data_dan:
+                content, summary = data_dan.split("*" * 20)
+
+                new_data_list.append(
+                    {
+                        "input": str(content).strip("\"").strip("\n").strip("\"").replace("\\n", "\n"),
+                        "output": str(summary).replace("\\n", "\n").strip("\""),
+                        "instruction": prompt
+                    }
+                )
+
+                if task_name not in tongji:
+                    tongji[task_name] = 1
+                else:
+                    tongji[task_name] += 1
+            else:
+                continue
+
+    for data_dan in tqdm(new_data_list):
+        zishu_query = len(data_dan["input"])
+        zishu_response = len(data_dan["output"])
+
+        prompt = str(data_dan["instruction"]).replace("\\n", "\n")
+        query = data_dan["input"].replace("\\n", "\n")
+        response = data_dan["output"].replace("\\n", "\n")
+
+        # Translation outputs are English, so skip the Chinese check for them.
+        if prompt == "翻译摘要#":
+            zishu_summary = len(response.split(" "))
+        elif prompt == "翻译关键词#":
+            zishu_summary = len(response.split(" "))
+        else:
+            if not is_contains_chinese(response):
+                print(data_dan)
+                continue
+
+        if "生成方向" in query:
+            query = query.replace("生成方向", "研究方向")
+        if "生成方向" in response:
+            response = response.replace("生成方向", "研究方向")
+
+        # For section-body samples whose heading is not itself a summary or
+        # conclusion heading, drop completions that sneak in a 总结/结论 section.
+        if prompt == "生成论文小标题内容#":
+            query_re = re.findall(RE_CHINA_TITLE, query)
+            # findall returns (small_title, big_title) tuples with one group
+            # empty; join the groups to recover the matched heading text.
+            title_re = "".join(query_re[0]) if query_re else ""
+            if "总结" not in title_re and "结论" not in title_re:
+                response_re = re.findall(RE_CHINA_NUMS, response)
+                if response_re != []:
+                    jishu += 1
+                    continue
+
+        if prompt[-1] != "\n":
+            prompt += "\n"
+        if query[-1] != "\n":
+            query += "\n"
+        query = "问:" + query + "答:\n"
+
+        if len(query) < 700 and len(response) < 1400:
+            data_list.append({
+                "instruction": prompt,
+                "input": query,
+                "output": response
+            })
+
+    # 95/5 train/dev split.
+    train_nums = int(len(data_list) * 0.95)
+    dev_nums = len(data_list) - train_nums
+
+    random.shuffle(data_list)
+    print(train_nums)
+    print(dev_nums)
+    train_list = data_list[:train_nums]
+    dev_list = data_list[train_nums:]
+    with open("./data/llama_t/chatglm_train_4_prompt_llama.json", mode="w", encoding="utf-8") as f:
+        f.write(json.dumps(train_list, ensure_ascii=False, indent=2))
+
+    with open("./data/llama_t/chatglm_dev_4_prompt_llama.json", mode="w", encoding="utf-8") as f:
+        f.write(json.dumps(dev_list, ensure_ascii=False, indent=2))
diff --git a/generate_first.py b/generate_first.py
new file mode 100644
index 0000000..72d2a01
--- /dev/null
+++ b/generate_first.py
@@ -0,0 +1,144 @@
+import time
+import os
+from tqdm import tqdm
+import random
+import requests
+import json
+import threading
+from threading import Thread
+import redis

+lock = threading.RLock()
+pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*')
+redis_ = redis.Redis(connection_pool=pool, decode_responses=True)
+
+with open("api_key.txt", "r") as f:
+    a = f.read()
+    a = a.split("\n")
+
+redis_key_name_openaikey_list = "openaikey_list"
+redis_zirenwu = "redis_zirenwu"
+
+api_key_list = []
+for i in a:
+    api_key_list.append(str(i.split("----")[-1]))
+
+for i in api_key_list:
+    redis_.rpush(redis_key_name_openaikey_list, i)
+
+
+path_output = "chatgpt_data_v1"
+prompt_dict = {
+    "mulu_prompt": "为论文题目《{}》生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题",
+    "beijing_prompt":
"以《{}》为论文题目,写一段题目来源的背景,要求字数在200字以内", + "zongjie_prompt": "以《{}》为论文题目,写一个论文简短总结,要求在300字以内", + "zongshu_prompt": "请写出以《{}》为课题的国内外研究状况综述,字数在800字左右", + "yanjiubeijingyiyi_prompt": "请分别写出以《{}》为课题的研究背景和意义,字数不少于1000字", + "jianjie_prompt": "请帮我生成《{}》为题目的研究内容,包括整体简介和分最少三个方面总结", + "zhixie_prompt": "请以《{}》为题写一篇论文的中文致谢" +} + +with open("./data/题目5.txt", encoding="utf-8") as f: + text = f.read() + +title_list = text.split("\n") + + +random.shuffle(title_list) + +print(len(title_list)) + +zirenwu_list = [] + +for title in title_list: + for prompt in prompt_dict: + zirenwu_list.append([prompt, str(prompt_dict[prompt]).format(title)]) + +for i in zirenwu_list: + redis_.rpush(redis_zirenwu, json.dumps(i)) + + +def request_api_chatgpt(api_key, task_type, prompt): + t1 = time.time() + global api_key_list + global zirenwu_list + try: + OPENAI_API_KEY = api_key + url = "https://api.openai.com/v1/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {OPENAI_API_KEY}" + } + data = { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": prompt}, + ], + "temperature": 0.5 + } + response = requests.post(url, + headers=headers, + data=json.dumps(data), + timeout=240) + + res = response.json() + text = res["choices"][0]["message"]["content"] + + path_root = '/home/majiahui/mulu_ner/data/{}/paper_prompt_title_1'.format(path_output) + if not os.path.exists(path_root): + os.makedirs(path_root) + + lock.acquire() + with open(path_root + "/title_{}_data.txt".format(task_type), mode="a") as f: + f.write(prompt) + f.write("*" * 20) + f.write(text) + f.write("@" * 20) + lock.release() + t2 = time.time() + t_n = t2 - t1 + if t_n > 20: + redis_.rpush(redis_key_name_openaikey_list, api_key) + else: + time.sleep(20 - t_n) + redis_.rpush(redis_key_name_openaikey_list, api_key) + + except: + print("task_type_bad", task_type) + print("api_key_bad", api_key) + time.sleep(20) + lock.acquire() + redis_.rpush(redis_key_name_openaikey_list, api_key) + redis_.rpush(redis_zirenwu, json.dumps([task_type, prompt])) + lock.release() + + +if __name__ == '__main__': + while True: + if redis_.llen(redis_zirenwu) == 0: + time.sleep(1) + continue + elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0: + lock.acquire() + api_key = redis_.lpop(redis_key_name_openaikey_list) + api_key = api_key.decode('UTF-8') + dan_zirenwu = redis_.lpop(redis_zirenwu) + dan_zirenwu = dan_zirenwu.decode('UTF-8') + lock.release() + # dan_zirenwu = zirenwu_list.pop(0) + dan_zirenwu = json.loads(dan_zirenwu) + task_type, prompt = dan_zirenwu[0], dan_zirenwu[1] + t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt)) + t.start() + elif redis_.llen(redis_key_name_openaikey_list) == 0: + time.sleep(1) + continue + else: + time.sleep(1) + continue + + + + diff --git a/generate_first_hexin.py b/generate_first_hexin.py new file mode 100644 index 0000000..45b7804 --- /dev/null +++ b/generate_first_hexin.py @@ -0,0 +1,150 @@ +import time +import os +from tqdm import tqdm +import random +import requests +import json +import threading +from threading import Thread +import redis + +lock = threading.RLock() +pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*') +redis_ = redis.Redis(connection_pool=pool, decode_responses=True) + +with open("api_key.txt", "r",) as f: + a = f.read() + a = a.split("\n") + +redis_key_name_openaikey_list = "openaikey_list" +redis_zirenwu = "redis_zirenwu" + +api_key_list = [] 
+for i in a: + api_key_list.append(str(i.split("----")[-1])) + +for i in api_key_list: + redis_.rpush(redis_key_name_openaikey_list, i) + +lock = threading.RLock() + + +path_output = "chatgpt_data_v1" +prompt_dict = { + "mulu_prompt": "为论文题目《{}》生成目录,以“{}”为论文的研究方向,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题", + "beijing_prompt": "以《{}》为论文题目,以“{}”为论文的研究方向,写一段题目来源的背景,要求字数在200字以内", + "zongjie_prompt": "以《{}》为论文题目,以“{}”为论文的研究方向,写一个论文简短总结,要求在300字以内", + "zongshu_prompt": "请写出以《{}》为课题,以“{}”为论文的研究方向,请写出这篇论文的国内外研究状况综述,字数在800字左右", + "yanjiubeijingyiyi_prompt": "请分别写出以《{}》为课题,以“{}”为论文的研究方向,生成论文的研究背景和意义,字数不少于1000字", + "jianjie_prompt": "请帮我生成《{}》为题目,以“{}”为论文的研究方向,生成论文研究内容,包括整体简介和分最少三个方面总结", + "zhixie_prompt": "请以《{}》为题,以“{}”为论文的研究方向,写一篇论文的中文致谢" +} + +with open("data/题目5-核心.txt", encoding="utf-8") as f: + text = f.read() + +text_list = text.split("\n") + +title_list = [] +for i in text_list: + if "@@@@@" not in i: + continue + dan_list = i.split("@@@@@") + title_list.append((dan_list[0], dan_list[1])) + +random.shuffle(title_list) + +print(len(title_list)) + +zirenwu_list = [] + +for title, hexin in title_list: + for prompt in prompt_dict: + zirenwu_list.append((prompt, str(prompt_dict[prompt]).format(title, hexin))) + +for i in zirenwu_list: + redis_.rpush(redis_zirenwu, json.dumps(i)) + + +def request_api_chatgpt(api_key, task_type, prompt): + t1 = time.time() + global api_key_list + global zirenwu_list + try: + OPENAI_API_KEY = api_key + url = "https://api.openai.com/v1/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {OPENAI_API_KEY}" + } + data = { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": prompt}, + ], + "temperature": 0.5 + } + response = requests.post(url, + headers=headers, + data=json.dumps(data), + timeout=240) + + res = response.json() + text = res["choices"][0]["message"]["content"] + + path_root = '/home/majiahui/mulu_ner/data/{}/paper_prompt_title_1_hexin'.format(path_output) + if not os.path.exists(path_root): + os.makedirs(path_root) + + lock.acquire() + with open(path_root + "/title_hexin_{}_data.txt".format(task_type), mode="a") as f: + f.write(prompt) + f.write("*" * 20) + f.write(text) + f.write("@" * 20) + lock.release() + t2 = time.time() + t_n = t2 - t1 + if t_n > 20: + redis_.rpush(redis_key_name_openaikey_list, api_key) + else: + time.sleep(20 - t_n) + redis_.rpush(redis_key_name_openaikey_list, api_key) + + except: + print("task_type_bad", task_type) + print("api_key_bad", api_key) + time.sleep(20) + lock.acquire() + redis_.rpush(redis_key_name_openaikey_list, api_key) + redis_.rpush(redis_zirenwu, json.dumps([task_type, prompt])) + lock.release() + + +if __name__ == '__main__': + while True: + if redis_.llen(redis_zirenwu) == 0: + time.sleep(1) + continue + elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0: + lock.acquire() + api_key = redis_.lpop(redis_key_name_openaikey_list) + api_key = api_key.decode('UTF-8') + dan_zirenwu = redis_.lpop(redis_zirenwu) + dan_zirenwu = dan_zirenwu.decode('UTF-8') + lock.release() + # dan_zirenwu = zirenwu_list.pop(0) + dan_zirenwu = json.loads(dan_zirenwu) + task_type, prompt = dan_zirenwu[0], dan_zirenwu[1] + t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt)) + t.start() + elif redis_.llen(redis_key_name_openaikey_list) == 0: + time.sleep(1) + continue + else: + time.sleep(1) + continue + + + + diff --git a/generate_fourth.py b/generate_fourth.py new file mode 
100644 index 0000000..f44c215 --- /dev/null +++ b/generate_fourth.py @@ -0,0 +1,132 @@ +import time +import os +from tqdm import tqdm +import random +import requests +import json +import threading +from threading import Thread +import redis + +lock = threading.RLock() +pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*') +redis_ = redis.Redis(connection_pool=pool, decode_responses=True) + +with open("api_key.txt", "r",) as f: + a = f.read() + a = a.split("\n") + +redis_key_name_openaikey_bad_list = "openaikey_bad_list" +redis_key_name_openaikey_list = "openaikey_list" +redis_zirenwu = "redis_zirenwu" + +api_key_list = [] +for i in a: + api_key_list.append(str(i.split("----")[-1])) + +for i in api_key_list: + redis_.rpush(redis_key_name_openaikey_list, i) + +lock = threading.RLock() + +path_output = "chatgpt_data_v1" +file_keyword_en = r'data/chinese_keyword_to_/chinese_keyword_en_prompt.txt' + +zirenwu_list = [] +path_list = [file_keyword_en] + + +for path in path_list: + with open(path, encoding="utf-8") as f: + type_prompt = path.split("/")[-1].split(".")[0] + texts = f.readlines() + for i in texts: + zirenwu_list.append([type_prompt, json.loads(i)]) + +import random +random.shuffle(zirenwu_list) +for i in zirenwu_list: + redis_.rpush(redis_zirenwu, json.dumps(i)) + + +def request_api_chatgpt(api_key, task_type, prompt): + t1 = time.time() + global api_key_list + global zirenwu_list + try: + OPENAI_API_KEY = api_key + url = "https://api.openai.com/v1/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {OPENAI_API_KEY}" + } + data = { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": prompt}, + ], + "temperature": 0.5 + } + response = requests.post(url, + headers=headers, + data=json.dumps(data), + timeout=240) + + res = response.json() + + text = res["choices"][0]["message"]["content"] + + path_root = '/home/majiahui/mulu_ner/data/{}/paper_prompt_title_1_1_1_1'.format(path_output) + if not os.path.exists(path_root): + os.makedirs(path_root) + + lock.acquire() + with open(path_root + "/fourth_{}_data.txt".format(task_type), mode="a") as f: + f.write(prompt) + f.write("*" * 20) + f.write(text) + f.write("@" * 20) + lock.release() + t2 = time.time() + t_n = t2 - t1 + if t_n > 20: + redis_.rpush(redis_key_name_openaikey_list, api_key) + else: + time.sleep(20 - t_n) + redis_.rpush(redis_key_name_openaikey_list, api_key) + + except: + time.sleep(20) + lock.acquire() + redis_.rpush(redis_key_name_openaikey_list, api_key) + redis_.rpush(redis_zirenwu, json.dumps([type_prompt, prompt])) + lock.release() + + +if __name__ == '__main__': + while True: + if redis_.llen(redis_zirenwu) == 0: + time.sleep(1) + continue + elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0: + lock.acquire() + api_key = redis_.lpop(redis_key_name_openaikey_list) + api_key = api_key.decode('UTF-8') + dan_zirenwu = redis_.lpop(redis_zirenwu) + dan_zirenwu = dan_zirenwu.decode('UTF-8') + lock.release() + # dan_zirenwu = zirenwu_list.pop(0) + dan_zirenwu = json.loads(dan_zirenwu) + task_type, prompt = dan_zirenwu[0], dan_zirenwu[1] + t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt)) + t.start() + elif redis_.llen(redis_key_name_openaikey_list) == 0: + time.sleep(1) + continue + else: + time.sleep(1) + continue + + + + diff --git a/generate_mulu.py b/generate_mulu.py index 09f4243..c49c577 100644 --- a/generate_mulu.py +++ 
b/generate_mulu.py @@ -28,6 +28,8 @@ for i in api_key_list: lock = threading.RLock() + +path_output = "paper_prompt_title_6" prompt_dict = { "mulu_prompt": "为论文题目“{}”生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题", "beijing_prompt": "以“{}”为论文题目,写一段题目来源的背景,要求字数在200字以内", @@ -142,11 +144,11 @@ def request_api_chatgpt(api_key, task_type, prompt): # api_key_list.append(api_key) redis_.rpush(redis_key_name_openaikey_list, api_key) - with open("/home/majiahui/mulu_ner/data/paper_prompt_title_3/title_{}_data.txt".format(task_type), mode="a") as f: + with open("/home/majiahui/mulu_ner/data/{}/title_{}_data.txt".format(path_output, task_type), mode="a") as f: f.write(prompt) - f.write("**************") + f.write("*" * 20) f.write(text) - f.write("\n") + f.write("@" * 20) lock.release() except: diff --git a/generate_sencend.py b/generate_sencend.py new file mode 100644 index 0000000..e08fe3c --- /dev/null +++ b/generate_sencend.py @@ -0,0 +1,134 @@ +import time +import os +from tqdm import tqdm +import random +import requests +import json +import threading +from threading import Thread +import redis + +lock = threading.RLock() +pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*') +redis_ = redis.Redis(connection_pool=pool, decode_responses=True) + +with open("api_key.txt", "r",) as f: + a = f.read() + a = a.split("\n") + +redis_key_name_openaikey_bad_list = "openaikey_bad_list" +redis_key_name_openaikey_list = "openaikey_list" +redis_zirenwu = "redis_zirenwu" + +api_key_list = [] +for i in a: + api_key_list.append(str(i.split("----")[-1])) + +for i in api_key_list: + redis_.rpush(redis_key_name_openaikey_list, i) + +lock = threading.RLock() + +path_output = "chatgpt_data_v1" +file_small_title = r'data/title_mulu_to_/small_title_prompt_shuffle.txt' +file_references = r'data/title_mulu_to_/references_prompt.txt' +file_zhaiyao = r'data/title_mulu_to_/zhaiyao_prompt.txt' +file_task_book = r'data/jianjie_to_/task_book_prompt.txt' + + +path_list = [file_small_title, file_references, file_zhaiyao, file_task_book] +zirenwu_list = [] +for path in path_list: + with open(path, encoding="utf-8") as f: + type_prompt = path.split("/")[-1].split(".")[0] + texts = f.readlines() + for i in texts: + zirenwu_list.append([type_prompt, json.loads(i)]) + +import random +random.shuffle(zirenwu_list) +for i in zirenwu_list: + redis_.rpush(redis_zirenwu, json.dumps(i)) + + +def request_api_chatgpt(api_key, task_type, prompt): + t1 = time.time() + global api_key_list + global zirenwu_list + try: + OPENAI_API_KEY = api_key + url = "https://api.openai.com/v1/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {OPENAI_API_KEY}" + } + data = { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": prompt}, + ], + "temperature": 0.5 + } + response = requests.post(url, + headers=headers, + data=json.dumps(data), + timeout=240) + + res = response.json() + + text = res["choices"][0]["message"]["content"] + path_root = '/home/majiahui/mulu_ner/data/{}/paper_prompt_title_1_1'.format(path_output) + if not os.path.exists(path_root): + os.makedirs(path_root) + + lock.acquire() + with open(path_root + "/sencend_{}_data.txt".format(task_type), mode="a") as f: + f.write(prompt) + f.write("*" * 20) + f.write(text) + f.write("@" * 20) + lock.release() + t2 = time.time() + t_n = t2 - t1 + if t_n > 20: + redis_.rpush(redis_key_name_openaikey_list, api_key) + else: + time.sleep(20 - 
t_n) + redis_.rpush(redis_key_name_openaikey_list, api_key) + + + except: + time.sleep(20) + lock.acquire() + redis_.rpush(redis_key_name_openaikey_list, api_key) + redis_.rpush(redis_zirenwu, json.dumps([type_prompt, prompt])) + lock.release() + + +if __name__ == '__main__': + while True: + if redis_.llen(redis_zirenwu) == 0: + time.sleep(1) + continue + elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0: + lock.acquire() + api_key = redis_.lpop(redis_key_name_openaikey_list) + api_key = api_key.decode('UTF-8') + dan_zirenwu = redis_.lpop(redis_zirenwu) + dan_zirenwu = dan_zirenwu.decode('UTF-8') + lock.release() + # dan_zirenwu = zirenwu_list.pop(0) + dan_zirenwu = json.loads(dan_zirenwu) + task_type, prompt = dan_zirenwu[0], dan_zirenwu[1] + t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt)) + t.start() + elif redis_.llen(redis_key_name_openaikey_list) == 0: + time.sleep(1) + continue + else: + time.sleep(1) + continue + + + + diff --git a/generate_third.py b/generate_third.py new file mode 100644 index 0000000..0c32bb1 --- /dev/null +++ b/generate_third.py @@ -0,0 +1,134 @@ +import time +import os +from tqdm import tqdm +import random +import requests +import json +import threading +from threading import Thread +import redis + +lock = threading.RLock() +pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=10, password='Zhicheng123*') +redis_ = redis.Redis(connection_pool=pool, decode_responses=True) + +with open("api_key.txt", "r",) as f: + a = f.read() + a = a.split("\n") + +redis_key_name_openaikey_bad_list = "openaikey_bad_list" +redis_key_name_openaikey_list = "openaikey_list" +redis_zirenwu = "redis_zirenwu" + +api_key_list = [] +for i in a: + api_key_list.append(str(i.split("----")[-1])) + +for i in api_key_list: + redis_.rpush(redis_key_name_openaikey_list, i) + +lock = threading.RLock() + +path_output = "chatgpt_data_v1" +file_keyword = r'data/zhaiyao_to_/zhaiyao_chinese_keyword_prompt.txt' +file_fanyi = r'data/zhaiyao_to_/zhaiyao_fanyi_prompt.txt' + +zirenwu_list = [] +path_list = [file_keyword, file_fanyi] + + +for path in path_list: + with open(path, encoding="utf-8") as f: + type_prompt = path.split("/")[-1].split(".")[0] + texts = f.readlines() + for i in texts: + zirenwu_list.append([type_prompt, json.loads(i)]) + +import random +random.shuffle(zirenwu_list) +for i in zirenwu_list: + redis_.rpush(redis_zirenwu, json.dumps(i)) + + +def request_api_chatgpt(api_key, task_type, prompt): + t1 = time.time() + global api_key_list + global zirenwu_list + try: + OPENAI_API_KEY = api_key + url = "https://api.openai.com/v1/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {OPENAI_API_KEY}" + } + data = { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": prompt}, + ], + "temperature": 0.5 + } + response = requests.post(url, + headers=headers, + data=json.dumps(data), + timeout=240) + + res = response.json() + + text = res["choices"][0]["message"]["content"] + + path_root = '/home/majiahui/mulu_ner/data/{}/paper_prompt_title_1_1_1'.format(path_output) + if not os.path.exists(path_root): + os.makedirs(path_root) + + lock.acquire() + with open(path_root + "/third_{}_data.txt".format(task_type), mode="a") as f: + f.write(prompt) + f.write("*" * 20) + f.write(text) + f.write("@" * 20) + lock.release() + + t2 = time.time() + t_n = t2 - t1 + if t_n > 20: + redis_.rpush(redis_key_name_openaikey_list, api_key) + else: + 
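+            # Hold the key until a full 20 seconds have passed since the
+            # request started, so each key sends at most ~3 requests a minute.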
time.sleep(20 - t_n) + redis_.rpush(redis_key_name_openaikey_list, api_key) + + except: + time.sleep(20) + lock.acquire() + redis_.rpush(redis_key_name_openaikey_list, api_key) + redis_.rpush(redis_zirenwu, json.dumps([type_prompt, prompt])) + lock.release() + + +if __name__ == '__main__': + while True: + if redis_.llen(redis_zirenwu) == 0: + time.sleep(1) + continue + elif redis_.llen(redis_zirenwu) != 0 and redis_.llen(redis_key_name_openaikey_list) != 0: + lock.acquire() + api_key = redis_.lpop(redis_key_name_openaikey_list) + api_key = api_key.decode('UTF-8') + dan_zirenwu = redis_.lpop(redis_zirenwu) + dan_zirenwu = dan_zirenwu.decode('UTF-8') + lock.release() + # dan_zirenwu = zirenwu_list.pop(0) + dan_zirenwu = json.loads(dan_zirenwu) + task_type, prompt = dan_zirenwu[0], dan_zirenwu[1] + t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt)) + t.start() + elif redis_.llen(redis_key_name_openaikey_list) == 0: + time.sleep(1) + continue + else: + time.sleep(1) + continue + + + + diff --git a/mulu转化为提示文本.py b/mulu转化为提示文本.py index da3a3d3..7c802c4 100644 --- a/mulu转化为提示文本.py +++ b/mulu转化为提示文本.py @@ -6,9 +6,9 @@ from tqdm import tqdm # pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+' pantten_biaoti = '[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+' -first_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的大标题“{}”的内容补充完整,补充内容字数在{}字左右" -small_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的小标题“{}”的内容补充完整,补充内容字数在{}字左右" -references_prompt = "论文题目是“{}”,目录是“{}”,请为这篇论文生成15篇左右的参考文献,要求其中有有中文参考文献不低于12篇,英文参考文献不低于2篇" +first_title_prompt = "论文题目是《{}》,目录是“{}”,请把其中的大标题“{}”的内容补充完整,补充内容字数在{}字左右" +small_title_prompt = "论文题目是《{}》,目录是“{}”,请把其中的小标题“{}”的内容补充完整,补充内容字数在{}字左右" +references_prompt = "论文题目是《{}》,目录是“{}”,请为这篇论文生成15篇左右的参考文献,要求其中有有中文参考文献不低于12篇,英文参考文献不低于2篇" thanks = "致谢" references = "参考文献" excursus = "附录" @@ -16,14 +16,11 @@ u = 3.5 # 均值μ sig = math.sqrt(6.0) zong_gradient = 6 paper_word_count = 12000 -pantten_title = "(.*?)”生成目录,要求只有一级标题和二级标题," +pantten_title = "《(.*?)》生成目录,要求只有一级标题和二级标题," - - - - -path = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt" +path_chatgpt_output = "chatgpt_data_v1" +path = "./data/{}/paper_prompt_title_1/title_mulu_prompt_data.txt".format(path_chatgpt_output) with open(path, encoding="utf-8") as f: text = f.read() @@ -32,7 +29,7 @@ def normal_distribution(x): y = np.exp(-(x - u) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig) return y -text_list = text.split("为论文题目“") +text_list = text.split("@" * 20) ner_lable = [] text_zong = [] @@ -40,10 +37,13 @@ text_zong = [] train_list = [] train_references_list = [] + +mulu_txt = [] + for text_dan in tqdm(text_list): # print(text_dan) try: - title_prompt, mulu = text_dan.split("**************") + title_prompt, mulu = text_dan.split("*" * 20) except: continue result_biaoti_list = re.findall(pantten_title, title_prompt) @@ -59,7 +59,6 @@ for text_dan in tqdm(text_list): # 生成参考文件的提示文本 train_references_list.append(references_prompt.format(title, mulu)) - paper_text = "题目:{}@目录:".format(title) table_of_contents = [] nerlable_list = [] @@ -69,7 +68,6 @@ for text_dan in tqdm(text_list): # f2.write(mulu_base64) mulu_list = str(mulu).split("\n") mulu_list = [i.strip() for i in mulu_list if i != ""] - mulu_str = "@".join(mulu_list) mulu_list_bool = [] for i in mulu_list: @@ -79,28 +77,6 @@ for text_dan in tqdm(text_list): else: mulu_list_bool.append((i, "二级标题")) - mulu_list_bool_part = mulu_list_bool[:3] - - if mulu_list_bool_part[0][1] != "一级标题": - continue - if mulu_list_bool_part[0][1] == 
mulu_list_bool_part[1][1] == mulu_list_bool_part[2][1] == "一级标题": - continue - - thanks_references_bool_table = mulu_list_bool[-5:] - - for i in thanks_references_bool_table: - try: - if references in i[0]: - mulu_list_bool.remove(i) - if thanks in i[0]: - mulu_list_bool.remove(i) - if excursus in i[0]: - mulu_list_bool.remove(i) - except: - - print(thanks_references_bool_table) - continue - for i in mulu_list_bool: if i[1] == "一级标题": paper_dan = { @@ -135,7 +111,8 @@ for text_dan in tqdm(text_list): dabiaoti_dict = table_of_contents[dabiaoti_index] table_of_contents_new.append([dabiaoti_dict["title"], 0]) for xiaobiaoti in dabiaoti_dict["small_title"]: - table_of_contents_new.append([xiaobiaoti, int(dabiaoti_dict["word_count"]/len(dabiaoti_dict["small_title"]))]) + # table_of_contents_new.append([xiaobiaoti, int(dabiaoti_dict["word_count"]/len(dabiaoti_dict["small_title"]))]) + table_of_contents_new.append([xiaobiaoti, 1500]) small_task_list = [] content_index = 0 @@ -166,20 +143,21 @@ for text_dan in tqdm(text_list): for i in small_task_list: if i[3][:2] == "@@": continue - elif i[5] > 1280: - continue else: paper_prompt = i[4].format(i[1], i[2], i[3], i[5]) if len(paper_prompt) < 768: + paper_prompt = paper_prompt.replace("\\n","\n") train_list.append(paper_prompt) else: continue + mulu_txt.append(mulu) + import random random.shuffle(train_list) -train_list_shuffle = train_list[:10000] +train_list_shuffle = train_list[:50000] with open("./data/title_mulu_to_/references_prompt.txt", mode="w", encoding="utf-8") as f: for i in train_references_list: diff --git a/mulu转化为提示文本生成摘要.py b/mulu转化为提示文本生成摘要.py index bb2b37b..39869bb 100644 --- a/mulu转化为提示文本生成摘要.py +++ b/mulu转化为提示文本生成摘要.py @@ -6,28 +6,19 @@ from tqdm import tqdm # pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+' pantten_biaoti = '[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+' -zhaiyao_prompt = "论文题目是“{}”,目录是“{}”,生成论文摘要,要求生成的字数在600字左右" -thanks = "致谢" -references = "参考文献" -excursus = "附录" -u = 3.5 # 均值μ -sig = math.sqrt(6.0) -zong_gradient = 6 -paper_word_count = 12000 -pantten_title = "(.*?)”生成目录,要求只有一级标题和二级标题," +zhaiyao_prompt = "论文题目是《{}》,目录是“{}”,生成论文摘要,要求生成的字数在600字左右" +pantten_title = "《(.*?)》生成目录,要求只有一级标题和二级标题," -path = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt" + +path_chatgpt_output = "chatgpt_data_v1" +path = "./data/{}/paper_prompt_title_1/title_mulu_prompt_data.txt".format(path_chatgpt_output) with open(path, encoding="utf-8") as f: text = f.read() -def normal_distribution(x): - y = np.exp(-(x - u) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig) - return y - -text_list = text.split("为论文题目“") +text_list = text.split("@" * 20) ner_lable = [] text_zong = [] @@ -38,7 +29,7 @@ train_references_list = [] for text_dan in tqdm(text_list): # print(text_dan) try: - title_prompt, mulu = text_dan.split("**************") + title_prompt, mulu = text_dan.split("*" * 20) except: continue result_biaoti_list = re.findall(pantten_title, title_prompt) diff --git a/zhaiyoa转化为提示.py b/zhaiyoa转化为提示.py index e59264b..14166dc 100644 --- a/zhaiyoa转化为提示.py +++ b/zhaiyoa转化为提示.py @@ -7,19 +7,25 @@ from tqdm import tqdm prompt = "请把“{}”这段文字翻译成英文" chinese_keyword_prompt = "请为“{}”这段论文摘要生成3-5个关键字,使用阿拉伯数字作为序号标注,例如“1.xxx \n2.xxx \n3.xxx \n4.xxx \n5.xxx \n”" -pantten_title = "(.*?)》为题目生成论文摘要,要求生成的字数在" +pantten_title = "《(.*?)》为题目生成论文摘要,要求生成的字数在" -path = "./data/paper_prompt_title_3/title_zhaiyao_prompt_data.txt" +# path = "./data/paper_prompt_title_3/title_zhaiyao_prompt_data.txt" +# with open(path, encoding="utf-8") 
as f: +# text = f.read() + +path_chatgpt_output = "chatgpt_data_v1" +path = "data/{}/paper_prompt_title_1_1/sencend_zhaiyao_prompt_data.txt".format(path_chatgpt_output) +# path = r"E:\pycharm_workspace\mulu_ner\data\chatgpt_data_v1\paper_prompt_title_1_1\title_zhaiyao_prompt_data.txt" with open(path, encoding="utf-8") as f: text = f.read() -text_list = text.split("请以《") +text_list = text.split("@" * 20) data_list = [] chinese_keyword_data_list = [] @@ -27,17 +33,10 @@ chinese_keyword_data_list = [] for text_dan in tqdm(text_list): # print(text_dan) try: - title_prompt, zhaiyao = text_dan.split("**************") - except: - continue - result_biaoti_list = re.findall(pantten_title, title_prompt) - try: - result_biaoti_list[0] + title_prompt, zhaiyao = text_dan.split("*" * 20) except: - print(title_prompt) continue - title = str(result_biaoti_list[0]).strip("\n") zhaiyao = str(zhaiyao).strip("\n") data_list.append(prompt.format(zhaiyao)) diff --git a/提取题目.py b/提取题目.py new file mode 100644 index 0000000..a3dbad4 --- /dev/null +++ b/提取题目.py @@ -0,0 +1,23 @@ +import os + + + +path = "train_data2/train_data2" +path_list = [] +for root, dirs, files in os.walk(path): + for file in files: + path = os.path.join(root, file) + path_list.append(path) + + +data_new = [] +for i in path_list: + with open(i, encoding="utf-8") as f: + data_str = f.read() + data_list = data_str.split("\n") + data_new.append(data_list[0].split("@@@@@")[0]) + +with open("data/题目5.txt", "w", encoding="utf-8") as f: + for i in data_new: + f.write(i) + f.write("\n") \ No newline at end of file diff --git a/目录筛选.py b/目录筛选.py index bc44620..2a76dc0 100644 --- a/目录筛选.py +++ b/目录筛选.py @@ -3,8 +3,7 @@ from tqdm import tqdm import re # chatgpt生成的数据 -file = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt" - +file = "data/chatgpt_data_v1/paper_prompt_title_1/title_mulu_prompt_data.txt" with open(file, encoding="utf-8") as f: text = f.read() @@ -26,7 +25,7 @@ mulu_list_new = [] for text_dan in tqdm(text_list): # print(text_dan) try: - title_prompt, mulu = text_dan.split("**************") + title_prompt, mulu = text_dan.split("*" * 20) except: continue result_biaoti_list = re.findall(pantten_title, title_prompt) @@ -111,6 +110,8 @@ for text_dan in tqdm(text_list): # print(len(table_of_contents)) table_of_contents_new = [] + + big_title_len = len(table_of_contents) for dabiaoti_index in range(len(table_of_contents)): dabiaoti_dict = table_of_contents[dabiaoti_index] dan_str_list = [dabiaoti_dict["title"][2:]] + dabiaoti_dict["small_title"] @@ -119,13 +120,14 @@ for text_dan in tqdm(text_list): mulu_txt = "\n\n".join(table_of_contents_new) - title_prompt = title_prompt.replace("\n", "\\n") - mulu_txt = mulu_txt.replace("\n", "\\n") + title_prompt = title_prompt.replace("\\n", "\n").replace("一级标题不少于7个", "一级标题不少于{}个".format(str(big_title_len))) + mulu_txt = mulu_txt.replace("\\n", "\n") - mulu_list_new.append("**************".join([title_prompt,mulu_txt])) + fenge = "*" * 20 + mulu_list_new.append(fenge.join([title_prompt,mulu_txt])) -with open("./data/训练数据集合/generate_mulu.txt", mode="w", encoding="utf-8") as f: +with open(file, mode="w", encoding="utf-8") as f: for i in mulu_list_new: f.write(i) - f.write("\n") \ No newline at end of file + f.write("@" * 20) \ No newline at end of file diff --git a/读取题目&核心内容数据.py b/读取题目&核心内容数据.py new file mode 100644 index 0000000..2a68619 --- /dev/null +++ b/读取题目&核心内容数据.py @@ -0,0 +1,22 @@ + + + +with open("aiessay_title_content.txt", encoding="utf-8") as f: + data = f.read() + +data_list = 
data.split("----------")
+
+
+data_new = []
+for i in data_list:
+    data_dan_list = i.strip("\n").split("\n")
+
+    # The first line of each record is the title; the remaining lines form the
+    # core-content description.
+    title = data_dan_list[0]
+    hexin = "".join(data_dan_list[1:])
+    data_new.append("@@@@@".join([title, hexin]))
+
+
+# Only the first 20 records are written out; generate_first_hexin.py reads this
+# file and expects the "title@@@@@core content" format.
+with open("data/题目5-核心.txt", "w", encoding="utf-8") as f:
+    for i in data_new[:20]:
+        f.write(i)
+        f.write("\n")
\ No newline at end of file
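
A quick way to sanity-check the two JSON files produced by chatgpt数据清洗整合训练数据.py is to reload them and re-verify the fields and length caps that script enforces. This is a minimal sketch: the paths, the instruction/input/output field names and the 700/1400 caps are taken from that script; nothing else is assumed.

import json

for name in ("chatglm_train_4_prompt_llama.json",
             "chatglm_dev_4_prompt_llama.json"):
    with open("./data/llama_t/" + name, encoding="utf-8") as f:
        records = json.load(f)
    for r in records:
        # Every sample carries the three fields the cleaning script emits.
        assert {"instruction", "input", "output"} <= set(r)
        # The cleaning script only keeps samples under these length caps.
        assert len(r["input"]) < 700 and len(r["output"]) < 1400
    print(name, len(records))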