From 89b5f2a04fa30f540280326e92babb7f921ba44f Mon Sep 17 00:00:00 2001 From: "majiahui@haimaqingfan.com" Date: Fri, 29 Mar 2024 15:42:55 +0800 Subject: [PATCH] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/爬取目录筛选.iml | 8 +++ 数据合并.py | 58 +++++++++++++++++++ 数据合并_学位期刊.py | 88 ++++++++++++++++++++++++++++ 查看数据.py | 99 ++++++++++++++++++++++++++++++++ 测试正则.py | 68 ++++++++++++++++++++++ 测试正则2.py | 8 +++ 筛选10000条关键词.py | 24 ++++++++ 筛选10000条摘要.py | 35 ++++++++++++ 筛选10000条目录.py | 29 ++++++++++ 读取ck.py | 47 +++++++++++++++ 读取结果生成关键词.py | 53 +++++++++++++++++ 读取结果生成摘要.py | 126 +++++++++++++++++++++++++++++++++++++++++ 读取结果生成目录.py | 125 ++++++++++++++++++++++++++++++++++++++++ 13 files changed, 768 insertions(+) create mode 100644 .idea/爬取目录筛选.iml create mode 100644 数据合并.py create mode 100644 数据合并_学位期刊.py create mode 100644 查看数据.py create mode 100644 测试正则.py create mode 100644 测试正则2.py create mode 100644 筛选10000条关键词.py create mode 100644 筛选10000条摘要.py create mode 100644 筛选10000条目录.py create mode 100644 读取ck.py create mode 100644 读取结果生成关键词.py create mode 100644 读取结果生成摘要.py create mode 100644 读取结果生成目录.py diff --git a/.idea/爬取目录筛选.iml b/.idea/爬取目录筛选.iml new file mode 100644 index 0000000..5fee449 --- /dev/null +++ b/.idea/爬取目录筛选.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/数据合并.py b/数据合并.py new file mode 100644 index 0000000..1ac9a7f --- /dev/null +++ b/数据合并.py @@ -0,0 +1,58 @@ +import json + +# json.load() + +# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f: +# a = f.read() +# print(a) + +import pandas as pd + +filename = 't_xuewei_cnki_spider.csv' +chunksize = 10000 # 指定每次读取的行数,可以根据需要调整 + +df_dict = {} +# 使用 chunksize 参数迭代读取 CSV 文件 +for chunk in pd.read_csv(filename, chunksize=chunksize): + print(1) + # 对每个 chunk 进行处理 + + # print(chunk.columns) + df_list = chunk.values.tolist() + # print(df_list[0]) + for i in range(len(df_list)): + df_dict["_".join([df_list[i][1], str(df_list[i][2])])] = [df_list[i][7], df_list[i][8], df_list[i][9]] + +data = [] +json_list = [ + "t_xuewei_detail_cnki_2018_2021.json", + "t_xuewei_detail_cnki2_2018_2021.json", + "t_xuewei_detail_cnki3_2018_2021.json", + "t_xuewei_detail_cnki6_2018_2021.json", + "t_xuewei_detail_cnki7_2018_2021.json", +] + + +print("主库数据完成加载") +for path in json_list: + name, typr_file = path.split(".") + + a = json.load(open(path)) + for i in a: + autoid = "_".join([name, str(i['autoid'])]).replace("_2018_2021", "") + if autoid in df_dict: + data.append([i['f_title']] + df_dict[autoid]) + print("path完成筛选") + + +for i in data: + with open("data_qikan_2018_2021.json", "a") as f: + f.write(json.dumps(i)) + f.write("\n") + +with open("data_qikan_2018_2021.json", encoding="utf-8") as f: + for i in f.readlines(): + a = json.loads(i) + + +print(a) \ No newline at end of file diff --git a/数据合并_学位期刊.py b/数据合并_学位期刊.py new file mode 100644 index 0000000..777982f --- /dev/null +++ b/数据合并_学位期刊.py @@ -0,0 +1,88 @@ +import json + +# json.load() + +# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f: +# a = f.read() +# print(a) + +import pandas as pd + +filename_xuewei = 't_xuewei_cnki_spider.csv' +filename_journal = 't_journal_cnki_spider.csv' +chunksize = 10000 # 指定每次读取的行数,可以根据需要调整 + +df_dict = {} +# 使用 chunksize 参数迭代读取 CSV 文件 +# for chunk in pd.read_csv(filename_xuewei, chunksize=chunksize): +# print(1) +# # 对每个 chunk 进行处理 +# +# print(chunk.columns) +# +# df_list = chunk.values.tolist() +# # print(df_list[0]) +# for i in range(len(df_list)): +# df_dict["_".join([df_list[i][1], str(df_list[i][2])])] = [df_list[i][4], df_list[i][7]] + +# for chunk in pd.read_csv(filename_journal, chunksize=chunksize): +# print(1) +# # 对每个 chunk 进行处理 +# +# print(chunk.columns) +# 9/0 +# df_list = chunk.values.tolist() +# # print(df_list[0]) +# for i in range(len(df_list)): +# df_dict["_".join([df_list[i][1], str(df_list[i][2])])] = [df_list[i][3], df_list[i][6]] +# +data = [] +json_list = [ + "t_xuewei_detail_cnki_2018_2021.json", + "t_xuewei_detail_cnki2_2018_2021.json", + "t_xuewei_detail_cnki3_2018_2021.json", + "t_xuewei_detail_cnki6_2018_2021.json", + "t_xuewei_detail_cnki7_2018_2021.json", +] + +json_journal_list = [ + "t_journal_cnki_detail_2018_2021.json", + "t_journal_cnki_detail2_2018_2021.json", + "t_journal_cnki_detail3_2018_2021.json", + "t_journal_cnki_detail4_2018_2021.json", + "t_journal_cnki_detail6_2018_2021.json", + "t_journal_cnki_detail7_2018_2021.json", +] + + +print("主库数据完成加载") +# for path in json_list: +# name, typr_file = path.split(".") +# +# a = json.load(open(path)) +# for i in a: +# autoid = "_".join([name, str(i['autoid'])]).replace("_2018_2021", "") +# if autoid in df_dict: +# data.append([i['f_author'], i['f_title'], i['f_unit'], i['f_year']] + df_dict[autoid]) +# print("path完成筛选") + +for path in json_journal_list: + name, typr_file = path.split(".") + + a = json.load(open(path)) + for i in a: + autoid = "_".join([name, str(i['autoid'])]).replace("_2018_2021", "") + if autoid in df_dict: + data.append([i['f_author'], i['f_title'], i['f_unit'], i['f_year']] + df_dict[autoid]) + print("path完成筛选") +# for i in data: +# with open("data_qikan_2018_2021.json", "a") as f: +# f.write(json.dumps(i)) +# f.write("\n") +# +# with open("data_qikan_2018_2021.json", encoding="utf-8") as f: +# for i in f.readlines(): +# a = json.loads(i) +# +# +# print(a) \ No newline at end of file diff --git a/查看数据.py b/查看数据.py new file mode 100644 index 0000000..1ceae85 --- /dev/null +++ b/查看数据.py @@ -0,0 +1,99 @@ +# import pandas as pd +# import json +# +# # 逐块读取CSV文件,每块大小为chunksize +# chunksize = 1000 # 指定每次读取的行数 +# counter = 0 +# +# # 逐块读取CSV文件 +# for chunk in pd.read_csv('t_xuewei_cnki_spider.csv', chunksize=chunksize): +# # 处理每个块 +# # 例如,您可以查看每个块的前100条记录 +# for i in range(1000): +# print(f"=========================={str(i)}======================") +# print(chunk.values.tolist()) +# print(json.loads(chunk.values.tolist()[i][-1])) +# +# counter += 1 +# 9/0 + + +# import csv +# +# # 定义要读取的行数 +# lines_to_read = 1000 +# +# # 打开CSV文件 +# with open('t_xuewei_cnki_spider.csv', 'r', newline='', encoding='utf-8') as file: +# # 创建CSV阅读器对象 +# reader = csv.reader(file) +# +# # 获取文件的总行数 +# total_lines = sum(1 for _ in reader) +# +# # 将文件指针移动到倒数的 lines_to_read 行之前 +# file.seek(0) +# for _ in range(total_lines - lines_to_read): +# next(reader) +# +# # 逐行读取剩余的行 +# for row in reader: +# print(row) + + +import pymysql +import json + +# 建立数据库连接 +connection = pymysql.connect( + host='rm-bp11ky2z5f34d2949fo.mysql.rds.aliyuncs.com', + user='fabiao_r', + password='f5u1w8nfb3b@', + database='fabiao', + cursorclass=pymysql.cursors.DictCursor # 返回字典形式的结果,方便操作 +) + +try: + with connection.cursor() as cursor: + # 执行查询 + sql = "SELECT * FROM t_journal_cnki_detail6 WHERE f_year IN (2022)" + cursor.execute(sql) + + # 获取查询结果 + result = cursor.fetchall() + print(result) + 9/0 + + + # 处理结果 + + # for row in result: + # print(row) + + with open("t_journal_cnki_detail_2018_2021.json", "w", encoding="utf-8") as f: + f.write(json.dumps(result)) + +finally: + # 关闭连接 + connection.close() + + +# try: +# with connection.cursor() as cursor: +# # 执行查询 +# sql = "SELECT t_xuewei_detail_cnki6.*, t_xuewei_cnki_spider.* FROM t_xuewei_detail_cnki6 JOIN t_xuewei_cnki_spider ON t_xuewei_detail_cnki6.autoid = t_xuewei_cnki_spider.autoid WHERE t_xuewei_detail_cnki6.f_year IN (2019, 2020, 2021)" +# cursor.execute(sql) +# +# # 获取查询结果 +# result = cursor.fetchall() +# +# # 处理结果 +# print(result[0]) +# for row in result: +# print(row) +# # 关闭连接 +# connection.close() +# finally: +# # 关闭连接 +# connection.close() + diff --git a/测试正则.py b/测试正则.py new file mode 100644 index 0000000..f42f2d8 --- /dev/null +++ b/测试正则.py @@ -0,0 +1,68 @@ +import re + +# pantten_biaoti = '[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+' +pantten_dabiaoti = '^第([0-9一二三四五六七八九十]{1,})?章\s{1,}?(.*)|^([0-9一二三四五六七八九十]{1,}?)\s{1,}?(.*)' +pantten_xiaobiaoti = '^[0-9](\.[0-9]\d*){1,3}\s{1,}?.*$' +pantten_dabiaoti_ = '^第([89八九])章\s{1,}?(.*)|^([89八九])\s{1,}?(.*)' +pantten_xiaobiaoti_1 = '^[1-7](\.[89]){1,2}\s{1,}?.*$' +pantten_xiaobiaoti_2 = '^[1-7](\.[1-9]{2,}?){1,2}\s{1,}?.*$' +pantten_xiaobiaoti_3 = '^[1-7](\.[1-7]){3,}\s{1,}?.*$' + + +duizhao = { + '1':'一', + '2':'二', + '3':'三', + '4':'四', + '5':'五', + '6':'六', + '7':'七', + '8':'八', + '9':'九' +} + +i = '1.91.21 双-[3-(三乙氧基)硅丙基]四硫化物体系自组装膜制备及性能研究'.strip(' ') +print(i) + + +# if list(set(re.findall(pantten_dabiaoti,dabiaoti_s)) |set(re.findall(pantten_biaoti,dabiaoti_s))) != []: +# print(1) +# else: +# print(0) +dabiaoti = re.findall(pantten_dabiaoti,i) +xiaobiaoti = re.findall(pantten_xiaobiaoti,i) + +print(dabiaoti) +print(xiaobiaoti) +if dabiaoti != []: + if i[0] != '': + if i[0] in duizhao: + dabiaoti_mulu = duizhao[i[0]] + " " + i[1] + else: + dabiaoti_mulu = i[0] + " " + i[1] + else: + if i[2] in duizhao: + dabiaoti_mulu = duizhao[i[2]] + " " + i[3] + else: + dabiaoti_mulu = i[2] + " " + i[3] + +elif xiaobiaoti != []: + i = i +else: + pass + + + + + +i = '1.1.1.1 双-[3-(三乙氧基)硅丙基]四硫化物体系自组装膜制备及性能研究'.strip(' ') +print(i) +dabiaoti = re.findall(pantten_dabiaoti_,i) +xiaobiaoti_1 = re.findall(pantten_xiaobiaoti_1,i) +xiaobiaoti_2 = re.findall(pantten_xiaobiaoti_2,i) +xiaobiaoti_3 = re.findall(pantten_xiaobiaoti_3,i) + +print(dabiaoti) +print(xiaobiaoti_1) +print(xiaobiaoti_2) +print(xiaobiaoti_3 ) \ No newline at end of file diff --git a/测试正则2.py b/测试正则2.py new file mode 100644 index 0000000..61fec84 --- /dev/null +++ b/测试正则2.py @@ -0,0 +1,8 @@ +import re + +pantten_xiaobiaoti_shai = '^[5-7](\.[5-7]){1,2}\s{1,}?.*$' + + +a = "5.9.9 dadadad" +xiaobiaoti_shai = re.findall(pantten_xiaobiaoti_shai, a) +print(xiaobiaoti_shai) \ No newline at end of file diff --git a/筛选10000条关键词.py b/筛选10000条关键词.py new file mode 100644 index 0000000..8614056 --- /dev/null +++ b/筛选10000条关键词.py @@ -0,0 +1,24 @@ +import json + + +with open("gaunjianci_zong.json") as f: + gaunjianci_list = json.loads(f.read()) + +import random + +random.shuffle(gaunjianci_list) +random.shuffle(gaunjianci_list) +random.shuffle(gaunjianci_list) + +gaunjianci_list_new = gaunjianci_list[:10000] + +gaunjianci_list_json = [] +for i in gaunjianci_list_new: + gaunjianci_list_json.append({ + "instruction": "任务:生成关键词", + "input": i[0], + "output": i[1] + }) + +with open("gaunjianci_prompt_10000.json", "w", encoding="utf-8") as f: + f.write(json.dumps(gaunjianci_list_json, ensure_ascii=False, indent=2)) \ No newline at end of file diff --git a/筛选10000条摘要.py b/筛选10000条摘要.py new file mode 100644 index 0000000..10dd24f --- /dev/null +++ b/筛选10000条摘要.py @@ -0,0 +1,35 @@ +import json + + +with open("zhaiyao_prompt.json") as f: + zhaiyao_list = json.loads(f.read()) + +import random + +random.shuffle(zhaiyao_list) +random.shuffle(zhaiyao_list) +random.shuffle(zhaiyao_list) + +zhaiyao_list_new = zhaiyao_list[:10000] + +# mulu_list_json = [] +# for i in mulu_list_json: +# mulu_list_json.append({ +# "instruction": "任务:生成目录", +# "input": i[0], +# "output": i[1] +# }) + +# with open("mulu_prompt_10000.json", "w", encoding="utf-8") as f: +# f.write(json.dumps(mulu_list_json, ensure_ascii=False, indent=2)) + +zhaiyao_list_json = [] +for i in zhaiyao_list_new: + zhaiyao_list_json.append({ + "instruction": "任务:生成论文摘要", + "input": i[0], + "output": i[1] + }) + +with open("zhaiyao_prompt_10000.json", "w", encoding="utf-8") as f: + f.write(json.dumps(zhaiyao_list_json, ensure_ascii=False, indent=2)) \ No newline at end of file diff --git a/筛选10000条目录.py b/筛选10000条目录.py new file mode 100644 index 0000000..204c4f1 --- /dev/null +++ b/筛选10000条目录.py @@ -0,0 +1,29 @@ +import json + + +with open("mulu_prompt.json") as f: + mulu_list = json.loads(f.read()) + +import random + +random.shuffle(mulu_list) +random.shuffle(mulu_list) +random.shuffle(mulu_list) + +mulu_list_new = mulu_list[:10000] +# { +# "instruction": "任务:生成论文小标题内容", +# "input": "背景:我是一名博士生,我想写一篇论文。\n角色:我需要你作为拥有google学术论文数据库和百度学术论文数据库的资深数据。\n要求:根据论文题目“大数据时代的数据分析与挖掘”和目录“一、大数据时代概述\n1.1 大数据的定义与特征\n1.2 大数据时代的发展历程\n1.3 大数据时代的挑战与机遇\n二、数据分析与挖掘的基本概念\n2.1 数据分析的概念与目的\n2.2 数据挖掘的概念与任务\n2.3 数据分析与数据挖掘的关系\n三、大数据时代的数据分析技术\n3.1 分布式计算技术\n3.2 数据可视化技术\n3.3 机器学习与深度学习技术\n四、大数据时代的挖掘技术\n4.1 关联规则挖掘\n4.2 聚类分析\n4.3 时间序列分析\n五、数据挖掘的应用与实践\n5.1 金融行业的数据挖掘应用\n5.2 电商行业的数据挖掘应用\n5.3 医疗行业的数据挖掘应用”,为小标题“1.1 大数据的定义与特征”填充400字左右的中文内容,内容第一行返回“1.1 大数据的定义与特征”且不包含目录中其他标题,禁止出现“首先”,“其次”,等字样,必要时候应根据条数罗列", +# "output": "1.1 大数据的定义与特征\n\n大数据是指传统数据处理应用软件难以处理的大规模和复杂度的数据集合。IDC 的定义为:\"大数据技术指的是那些使用新型处理模式,以经济有效的方式,从高速增长的各类数据中提炼出价值的技术与概念。\"大数据的特征通常用“5V”来概述,即体量巨大(Volume)、速度快(Velocity)、种类多(Variety)、真实性(Veracity)和价值密度(Value)。\n\n(1) 体量巨大:数据量巨大,处于TB(太字节)到PB(拍字节)甚至更高的量级。\n\n(2) 速度快:数据流入的速度极快,实时性成为数据处理的关键要求。\n\n(3) 种类多:数据类型繁多,包括结构化数据、非结构化数据和半结构化数据。\n\n(4) 真实性:数据的质量和准确性会直接影响到分析结果的可行性。\n\n(5) 价值密度:大数据中蕴含有巨大的价值,但相比数据量的庞大,这些价值的密度却相对较低,需要通过有效的分析才能挖掘出来。\n\n因此,面对大数据的定义与特征,数据分析与挖掘的技术与方法必须适应数据的快速流转和多样性,且须具备处理海量数据的能力,以期从中获取有价值的信息与知识。" +# }, + +mulu_list_json = [] +for i in mulu_list_new: + mulu_list_json.append({ + "instruction": "任务:生成目录", + "input": i[0].replace("一级标题不少于7个", "一级标题5-7个"), + "output": i[1] + }) + +with open("mulu_prompt_10000.json", "w", encoding="utf-8") as f: + f.write(json.dumps(mulu_list_json, ensure_ascii=False, indent=2)) \ No newline at end of file diff --git a/读取ck.py b/读取ck.py new file mode 100644 index 0000000..b4abaaa --- /dev/null +++ b/读取ck.py @@ -0,0 +1,47 @@ +from clickhouse_driver import Client +import json + +class PureClient: + def __init__(self, database='test_db'): + # 只需要写本地地址 + self.client = Client(host=f'192.168.31.74', port=9000, user='default', + password='zhicheng123*', database=database) + + def run(self, sql): + client = self.client + collection = client.query_dataframe(sql) + return collection + + +pureclient = PureClient() + +for i in [2018, 2019, 2020, 2021]: + sql = f'SELECT * FROM main_paper_message WHERE year IN ({i}) limit 10' + result = pureclient.run(sql) + print(result) +# print("result", result) +# title = result['title'][0] +# author = result['author'][0] +# degree = result['degree'][0] +# year = result['content'][0].split("/")[5] +# school = result['school'][0] +# qikan_name = result['qikan_name'][0] +# author = str(author).strip(";") +# author = str(author).replace(";", ",") +# # select +# # school, qikan_name +# # from main_table_paper_detail_message limit +# # 10000 \G;; +# +# paper_info = { +# "title": title, +# "author": author, +# "degree": degree, +# "year": year, +# "school": school, +# "qikan_name": qikan_name +# } +# print("paper_info", paper_info) +# return paper_info +with open("data_info.json", "w", encoding="utf-8") as f: + f.write(json.dumps(result)) diff --git a/读取结果生成关键词.py b/读取结果生成关键词.py new file mode 100644 index 0000000..b7c602a --- /dev/null +++ b/读取结果生成关键词.py @@ -0,0 +1,53 @@ +import json +import re + + +prompt = "请为“{}”这段论文摘要生成3-5个关键词,使用阿拉伯数字作为序号标注,例如“1.xxx \n2.xxx \n3.xxx \n4.xxx \n5.xxx \n”\n" + + +def contains_chinese(text): + # 检查是否包含中文字符 + chinese_pattern = re.compile(r'[\u4e00-\u9fa5]') + has_chinese = bool(re.search(chinese_pattern, text)) + return has_chinese + +def shengcheng_prompt(zhaiyao, guanjianci): + gaunjianci_bool = True + guanjianci_list = guanjianci.split(";") + + + if contains_chinese(zhaiyao) == False: + return False, [] + if len(zhaiyao) < 500 or len(zhaiyao) > 800: + return False, [] + if len(guanjianci_list) <= 4 or len(guanjianci_list) >= 7: + return False, [] + + guajnjianci_prompt = prompt.format(zhaiyao) + guanjianci_str_list = [] + for i in range(len(guanjianci_list)): + if guanjianci_list[i] != "": + guanjianci_str_list.append(str(i+1) + ". " + guanjianci_list[i]) + + guanjianci_str = "\n".join(guanjianci_str_list) + return True, [guajnjianci_prompt, guanjianci_str] + + +gaunjianci_zong = [] +with open("data.json", encoding="utf-8") as f: + for i in f.readlines(): + a = json.loads(i) + try: + gaunjianci_bool, gaunjianci_list = shengcheng_prompt(a[1], a[2]) + if gaunjianci_bool == True: + gaunjianci_zong.append(gaunjianci_list) + # else: + # print("===========================================================================") + # print(mulu_new) + except: + continue + + + +with open("gaunjianci_zong.json", "w", encoding="utf-8") as f: + f.write(json.dumps(gaunjianci_zong, indent=2)) \ No newline at end of file diff --git a/读取结果生成摘要.py b/读取结果生成摘要.py new file mode 100644 index 0000000..64cc3f7 --- /dev/null +++ b/读取结果生成摘要.py @@ -0,0 +1,126 @@ +import json +import re + + +pantten_dabiaoti = '^第([0-9一二三四五六七八九十]{1,})?章\s{1,}?(.*)|^([0-9一二三四五六七八九十]{1,}?)\s{1,}?(.*)' +pantten_xiaobiaoti = '^[0-9](\.[0-9]\d*){1,3}\s{1,}?.*$' + +pantten_dabiaoti_shai = '^([1-7一二三四五六七])、(.*)' +pantten_xiaobiaoti_shai = '^[1-7](\.[1-7]){1,2}\s{1,}?.*$' + +pantten_dabiaoti_ = '^([八九])、(.*)' +pantten_xiaobiaoti_1 = '^[1-7](\.[9]){1,2}\s{1,}?.*$' +pantten_xiaobiaoti_2 = '^[1-7](\.[1-9]{2,}?){1,2}\s{1,}?.*$' +pantten_xiaobiaoti_3 = '^[1-7](\.[1-8]){3,}\s{1,}?.*$' + +def contains_chinese(text): + # 检查是否包含中文字符 + chinese_pattern = re.compile(r'[\u4e00-\u9fa5]') + has_chinese = bool(re.search(chinese_pattern, text)) + return has_chinese + +duizhao = { + '1':'一', + '2':'二', + '3':'三', + '4':'四', + '5':'五', + '6':'六', + '7':'七', + '8':'八', + '9':'九' +} + +def mulu_ulit(mulu): + + return_bool = True + mulu_new = [] + + for i in mulu: + i = i.strip(' ') + dabiaoti = re.findall(pantten_dabiaoti, i) + xiaobiaoti = re.findall(pantten_xiaobiaoti, i) + if dabiaoti != []: + if dabiaoti[0][0] != '': + if dabiaoti[0][0] in duizhao: + dabiaoti_mulu = duizhao[dabiaoti[0][0]] + "、" + dabiaoti[0][1] + else: + dabiaoti_mulu = dabiaoti[0][0] + "、" + dabiaoti[0][1] + else: + if dabiaoti[0][2] in duizhao: + dabiaoti_mulu = duizhao[dabiaoti[0][2]] + "、" + dabiaoti[0][3] + else: + dabiaoti_mulu = dabiaoti[0][2] + "、" + dabiaoti[0][3] + mulu_new.append(dabiaoti_mulu) + + elif xiaobiaoti != []: + mulu_new.append(i) + else: + continue + + if return_bool == True: + for i in mulu_new: + dabiaoti = re.findall(pantten_dabiaoti_, i) + xiaobiaoti_1 = re.findall(pantten_xiaobiaoti_1, i) + xiaobiaoti_2 = re.findall(pantten_xiaobiaoti_2, i) + xiaobiaoti_3 = re.findall(pantten_xiaobiaoti_3, i) + + if list(set(dabiaoti)| set(xiaobiaoti_1)| set(xiaobiaoti_2)| set(xiaobiaoti_3)) != []: + return_bool = False + break + + if return_bool == True: + dabiaoti_jiance = False + xiaobiaoti_jiance = False + for i in mulu_new: + if dabiaoti_jiance == True and xiaobiaoti_jiance == True: + break + if dabiaoti_jiance == False: + dabiaoti_shai = re.findall(pantten_dabiaoti_shai, i) + if dabiaoti_shai != []: + dabiaoti_jiance = True + continue + if xiaobiaoti_jiance == False: + xiaobiaoti_shai = re.findall(pantten_xiaobiaoti_shai, i) + if xiaobiaoti_shai != []: + xiaobiaoti_jiance = True + continue + if dabiaoti_jiance == False or xiaobiaoti_jiance == False: + return_bool = False + + if return_bool == True: + text = " ".join(mulu_new) + chinese_bool = contains_chinese(text) + if chinese_bool == False: + return_bool = False + + return [return_bool, mulu_new] + + +prompt = "根据论文题目《{}》、目录是“{}”生成中文论文摘要,要求生成的字数在{}字左右" +zhaiyao_zong = [] + + +def shengcheng_prompt(title, mulu_list, zhaiyao): + + mulu = "\n".join(mulu_list) + zishu = (len(zhaiyao)//100)* 100 + zhaiyao_prompt = prompt.format(title, mulu, zishu) + return [zhaiyao_prompt, zhaiyao] + + +with open("data.json", encoding="utf-8") as f: + for i in f.readlines(): + a = json.loads(i) + try: + return_bool, mulu_new = mulu_ulit(json.loads(a[-1])) + if return_bool == True: + zhaiyao_zong.append(shengcheng_prompt(a[0], mulu_new, a[1])) + # else: + # print("===========================================================================") + # print(mulu_new) + except: + continue + +with open("zhaiyao_prompt.json", "w", encoding="utf-8") as f: + f.write(json.dumps(zhaiyao_zong, indent=2)) \ No newline at end of file diff --git a/读取结果生成目录.py b/读取结果生成目录.py new file mode 100644 index 0000000..7c143eb --- /dev/null +++ b/读取结果生成目录.py @@ -0,0 +1,125 @@ +import json +import re + + +pantten_dabiaoti = '^第([0-9一二三四五六七八九十]{1,})?章\s{1,}?(.*)|^([0-9一二三四五六七八九十]{1,}?)\s{1,}?(.*)' +pantten_xiaobiaoti = '^[0-9](\.[0-9]\d*){1,3}\s{1,}?.*$' + +pantten_dabiaoti_shai = '^([5-7五六七])、(.*)' +pantten_xiaobiaoti_shai = '^[5-7](\.[5-7]){1,2}\s{1,}?.*$' + +pantten_dabiaoti_ = '^([八九])、(.*)' +pantten_xiaobiaoti_1 = '^[1-7](\.[9]){1,2}\s{1,}?.*$' +pantten_xiaobiaoti_2 = '^[1-7](\.[1-9]{2,}?){1,2}\s{1,}?.*$' +pantten_xiaobiaoti_3 = '^[1-7](\.[1-8]){3,}\s{1,}?.*$' + +def contains_chinese(text): + # 检查是否包含中文字符 + chinese_pattern = re.compile(r'[\u4e00-\u9fa5]') + has_chinese = bool(re.search(chinese_pattern, text)) + return has_chinese + +duizhao = { + '1':'一', + '2':'二', + '3':'三', + '4':'四', + '5':'五', + '6':'六', + '7':'七', + '8':'八', + '9':'九' +} + +def mulu_ulit(mulu): + + return_bool = True + mulu_new = [] + + for i in mulu: + i = i.strip(' ') + dabiaoti = re.findall(pantten_dabiaoti, i) + xiaobiaoti = re.findall(pantten_xiaobiaoti, i) + if dabiaoti != []: + if dabiaoti[0][0] != '': + if dabiaoti[0][0] in duizhao: + dabiaoti_mulu = duizhao[dabiaoti[0][0]] + "、" + dabiaoti[0][1] + else: + dabiaoti_mulu = dabiaoti[0][0] + "、" + dabiaoti[0][1] + else: + if dabiaoti[0][2] in duizhao: + dabiaoti_mulu = duizhao[dabiaoti[0][2]] + "、" + dabiaoti[0][3] + else: + dabiaoti_mulu = dabiaoti[0][2] + "、" + dabiaoti[0][3] + mulu_new.append(dabiaoti_mulu) + + elif xiaobiaoti != []: + mulu_new.append(i) + else: + continue + + if return_bool == True: + for i in mulu_new: + dabiaoti = re.findall(pantten_dabiaoti_, i) + xiaobiaoti_1 = re.findall(pantten_xiaobiaoti_1, i) + xiaobiaoti_2 = re.findall(pantten_xiaobiaoti_2, i) + xiaobiaoti_3 = re.findall(pantten_xiaobiaoti_3, i) + + if list(set(dabiaoti)| set(xiaobiaoti_1)| set(xiaobiaoti_2)| set(xiaobiaoti_3)) != []: + return_bool = False + break + + if return_bool == True: + dabiaoti_jiance = False + xiaobiaoti_jiance = False + for i in mulu_new: + if dabiaoti_jiance == True and xiaobiaoti_jiance == True: + break + if dabiaoti_jiance == False: + dabiaoti_shai = re.findall(pantten_dabiaoti_shai, i) + if dabiaoti_shai != []: + dabiaoti_jiance = True + continue + if xiaobiaoti_jiance == False: + xiaobiaoti_shai = re.findall(pantten_xiaobiaoti_shai, i) + if xiaobiaoti_shai != []: + xiaobiaoti_jiance = True + continue + if dabiaoti_jiance == False or xiaobiaoti_jiance == False: + return_bool = False + + if return_bool == True: + text = " ".join(mulu_new) + chinese_bool = contains_chinese(text) + if chinese_bool == False: + return_bool = False + + return [return_bool, mulu_new] + + +prompt = "为论文题目“{}”生成中文目录,要求只有一级标题,二级标题和三级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;三级标题使用阿拉伯数字 例如1.1.2 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题;三级标题个数不限制" +mulu_zong = [] + + +def shengcheng_prompt(title, mulu_list): + mulu_prompt = prompt.format(title) + mulu = "\n".join(mulu_list) + return mulu_prompt, mulu + + +with open("data.json", encoding="utf-8") as f: + for i in f.readlines(): + a = json.loads(i) + try: + return_bool, mulu_new = mulu_ulit(json.loads(a[-1])) + if return_bool == True: + + mulu_zong.append(shengcheng_prompt(a[0], mulu_new)) + # else: + # print("===========================================================================") + # print(mulu_new) + except: + continue +print(len(mulu_zong)) +with open("mulu_prompt.json", "w", encoding="utf-8") as f: + f.write(json.dumps(mulu_zong, indent=2)) \ No newline at end of file