Browse Source

第一次提交

master
majiahui@haimaqingfan.com 1 year ago
commit
89b5f2a04f
  1. 8
      .idea/爬取目录筛选.iml
  2. 58
      数据合并.py
  3. 88
      数据合并_学位期刊.py
  4. 99
      查看数据.py
  5. 68
      测试正则.py
  6. 8
      测试正则2.py
  7. 24
      筛选10000条关键词.py
  8. 35
      筛选10000条摘要.py
  9. 29
      筛选10000条目录.py
  10. 47
      读取ck.py
  11. 53
      读取结果生成关键词.py
  12. 126
      读取结果生成摘要.py
  13. 125
      读取结果生成目录.py

8
.idea/爬取目录筛选.iml

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.8 (ldm)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

58
数据合并.py

@ -0,0 +1,58 @@
import json
# json.load()
# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
# a = f.read()
# print(a)
import pandas as pd
# Build a lookup from the spider CSV, join it against the detail-table JSON
# dumps, and append the joined rows to a JSON-lines file.
filename = 't_xuewei_cnki_spider.csv'
chunksize = 10000  # rows read per chunk; adjust as needed
df_dict = {}

# Stream the CSV in chunks so the whole file is never held in memory at once.
for chunk in pd.read_csv(filename, chunksize=chunksize):
    print(1)  # progress marker, one per chunk
    for row in chunk.values.tolist():
        # Key is "<column 1>_<column 2>"; the stored payload is columns 7, 8, 9.
        # NOTE(review): presumably column 1 is the detail table name and
        # column 2 the autoid, since the join below builds the same key from
        # the JSON file stem and record['autoid'] -- confirm against the CSV.
        df_dict["_".join([row[1], str(row[2])])] = [row[7], row[8], row[9]]

data = []
json_list = [
    "t_xuewei_detail_cnki_2018_2021.json",
    "t_xuewei_detail_cnki2_2018_2021.json",
    "t_xuewei_detail_cnki3_2018_2021.json",
    "t_xuewei_detail_cnki6_2018_2021.json",
    "t_xuewei_detail_cnki7_2018_2021.json",
]
print("主库数据完成加载")

for path in json_list:
    name = path.rsplit(".", 1)[0]
    # Close the dump as soon as it is parsed instead of leaking the handle.
    with open(path, encoding="utf-8") as f:
        records = json.load(f)
    for record in records:
        autoid = "_".join([name, str(record['autoid'])]).replace("_2018_2021", "")
        if autoid in df_dict:
            data.append([record['f_title']] + df_dict[autoid])
    # NOTE(review): the original indentation is not visible; this is assumed
    # to be a per-file progress message.
    print("path完成筛选")

# Open the output once (still in append mode) rather than once per record.
with open("data_qikan_2018_2021.json", "a", encoding="utf-8") as f:
    for row in data:
        f.write(json.dumps(row))
        f.write("\n")

# Read the file back as a sanity check that every line is valid JSON.
with open("data_qikan_2018_2021.json", encoding="utf-8") as f:
    for line in f:
        a = json.loads(line)
        # NOTE(review): assumed to print every record; the original nesting
        # of this print is not recoverable from the source.
        print(a)

88
数据合并_学位期刊.py

@ -0,0 +1,88 @@
import json
# json.load()
# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
# a = f.read()
# print(a)
import pandas as pd
# CSV exports of the two spider tables (only read by the loaders that are
# currently commented out below).
filename_xuewei, filename_journal = (
    "t_xuewei_cnki_spider.csv",
    "t_journal_cnki_spider.csv",
)
chunksize = 10_000  # rows read per chunk; adjust as needed
df_dict = dict()  # join lookup; stays empty while the CSV loaders are disabled
# 使用 chunksize 参数迭代读取 CSV 文件
# for chunk in pd.read_csv(filename_xuewei, chunksize=chunksize):
# print(1)
# # 对每个 chunk 进行处理
#
# print(chunk.columns)
#
# df_list = chunk.values.tolist()
# # print(df_list[0])
# for i in range(len(df_list)):
# df_dict["_".join([df_list[i][1], str(df_list[i][2])])] = [df_list[i][4], df_list[i][7]]
# for chunk in pd.read_csv(filename_journal, chunksize=chunksize):
# print(1)
# # 对每个 chunk 进行处理
#
# print(chunk.columns)
# 9/0
# df_list = chunk.values.tolist()
# # print(df_list[0])
# for i in range(len(df_list)):
# df_dict["_".join([df_list[i][1], str(df_list[i][2])])] = [df_list[i][3], df_list[i][6]]
#
# Rows collected by the join loops further down.
data = []

# Dissertation detail dumps, one per source table suffix.
json_list = [
    f"t_xuewei_detail_cnki{suffix}_2018_2021.json"
    for suffix in ("", "2", "3", "6", "7")
]

# Journal detail dumps, one per source table suffix.
json_journal_list = [
    f"t_journal_cnki_detail{suffix}_2018_2021.json"
    for suffix in ("", "2", "3", "4", "6", "7")
]

print("主库数据完成加载")
# for path in json_list:
# name, typr_file = path.split(".")
#
# a = json.load(open(path))
# for i in a:
# autoid = "_".join([name, str(i['autoid'])]).replace("_2018_2021", "")
# if autoid in df_dict:
# data.append([i['f_author'], i['f_title'], i['f_unit'], i['f_year']] + df_dict[autoid])
# print("path完成筛选")
# Join every journal detail dump against df_dict and collect the matches.
# NOTE(review): df_dict is initialised empty and the loops that would fill it
# are commented out in this file, so as it stands no record can match.
for path in json_journal_list:
    name = path.rsplit(".", 1)[0]
    # Close the dump as soon as it is parsed instead of leaking the handle.
    with open(path, encoding="utf-8") as f:
        records = json.load(f)
    for record in records:
        # Lookup key: file stem without the year range, plus the record autoid.
        autoid = "_".join([name, str(record['autoid'])]).replace("_2018_2021", "")
        if autoid in df_dict:
            data.append(
                [record['f_author'], record['f_title'], record['f_unit'], record['f_year']]
                + df_dict[autoid]
            )
    # NOTE(review): assumed to be a per-file progress message; the original
    # indentation is not visible.
    print("path完成筛选")
# for i in data:
# with open("data_qikan_2018_2021.json", "a") as f:
# f.write(json.dumps(i))
# f.write("\n")
#
# with open("data_qikan_2018_2021.json", encoding="utf-8") as f:
# for i in f.readlines():
# a = json.loads(i)
#
#
# print(a)

99
查看数据.py

@ -0,0 +1,99 @@
# import pandas as pd
# import json
#
# # 逐块读取CSV文件,每块大小为chunksize
# chunksize = 1000 # 指定每次读取的行数
# counter = 0
#
# # 逐块读取CSV文件
# for chunk in pd.read_csv('t_xuewei_cnki_spider.csv', chunksize=chunksize):
# # 处理每个块
# # 例如,您可以查看每个块的前100条记录
# for i in range(1000):
# print(f"=========================={str(i)}======================")
# print(chunk.values.tolist())
# print(json.loads(chunk.values.tolist()[i][-1]))
#
# counter += 1
# 9/0
# import csv
#
# # 定义要读取的行数
# lines_to_read = 1000
#
# # 打开CSV文件
# with open('t_xuewei_cnki_spider.csv', 'r', newline='', encoding='utf-8') as file:
# # 创建CSV阅读器对象
# reader = csv.reader(file)
#
# # 获取文件的总行数
# total_lines = sum(1 for _ in reader)
#
# # 将文件指针移动到倒数的 lines_to_read 行之前
# file.seek(0)
# for _ in range(total_lines - lines_to_read):
# next(reader)
#
# # 逐行读取剩余的行
# for row in reader:
# print(row)
import pymysql
import json
import os

# Open the database connection.
# Connection settings come from the environment so that no credentials live in
# source control. FABIAO_DB_HOST, FABIAO_DB_USER and FABIAO_DB_PASSWORD are
# required (KeyError if unset); FABIAO_DB_NAME falls back to 'fabiao'.
# NOTE(review): the password that used to be committed here should be rotated.
connection = pymysql.connect(
    host=os.environ['FABIAO_DB_HOST'],
    user=os.environ['FABIAO_DB_USER'],
    password=os.environ['FABIAO_DB_PASSWORD'],
    database=os.environ.get('FABIAO_DB_NAME', 'fabiao'),
    cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts
)
try:
    with connection.cursor() as cursor:
        # Run the query and fetch every row.
        sql = "SELECT * FROM t_journal_cnki_detail6 WHERE f_year IN (2022)"
        cursor.execute(sql)
        result = cursor.fetchall()
        print(result)
        # Deliberate debugging stop kept from the original: this raises
        # ZeroDivisionError, so the dump below never runs.
        # NOTE(review): before removing it, confirm the target file -- the
        # query reads 2022 rows from detail6 but the file name says
        # "detail_2018_2021".
        9/0
        with open("t_journal_cnki_detail_2018_2021.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(result))
finally:
    # Always release the connection, including on the deliberate stop above.
    connection.close()
# try:
# with connection.cursor() as cursor:
# # 执行查询
# sql = "SELECT t_xuewei_detail_cnki6.*, t_xuewei_cnki_spider.* FROM t_xuewei_detail_cnki6 JOIN t_xuewei_cnki_spider ON t_xuewei_detail_cnki6.autoid = t_xuewei_cnki_spider.autoid WHERE t_xuewei_detail_cnki6.f_year IN (2019, 2020, 2021)"
# cursor.execute(sql)
#
# # 获取查询结果
# result = cursor.fetchall()
#
# # 处理结果
# print(result[0])
# for row in result:
# print(row)
# # 关闭连接
# connection.close()
# finally:
# # 关闭连接
# connection.close()

68
测试正则.py

@ -0,0 +1,68 @@
import re
# pantten_biaoti = '[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
# Scratch script: try the table-of-contents heading regexes on sample lines.
# Patterns are raw strings so that \s, \. and \d reach the regex engine
# without relying on Python passing unknown string escapes through.

# Top-level heading: "第<N>章 <title>" or "<N> <title>".
pantten_dabiaoti = r'^第([0-9一二三四五六七八九十]{1,})?章\s{1,}?(.*)|^([0-9一二三四五六七八九十]{1,}?)\s{1,}?(.*)'
# Numbered sub-heading: a digit followed by one to three ".<number>" parts.
pantten_xiaobiaoti = r'^[0-9](\.[0-9]\d*){1,3}\s{1,}?.*$'
# Variants matching headings numbered 8 or 9, multi-digit sub-numbers, and
# four-or-more-level numbering.
pantten_dabiaoti_ = r'^第([89八九])章\s{1,}?(.*)|^([89八九])\s{1,}?(.*)'
pantten_xiaobiaoti_1 = r'^[1-7](\.[89]){1,2}\s{1,}?.*$'
pantten_xiaobiaoti_2 = r'^[1-7](\.[1-9]{2,}?){1,2}\s{1,}?.*$'
pantten_xiaobiaoti_3 = r'^[1-7](\.[1-7]){3,}\s{1,}?.*$'

# Replacement prefix for an Arabic chapter number.
# NOTE(review): every value is an empty string here -- confirm whether these
# were meant to hold the Chinese numerals.
duizhao = {
    '1': '',
    '2': '',
    '3': '',
    '4': '',
    '5': '',
    '6': '',
    '7': '',
    '8': '',
    '9': '',
}


def _strip_nbsp(text):
    """Remove leading and trailing "&nbsp;" entities from *text*.

    str.strip('&nbsp;') is not used because it strips any of the characters
    '&', 'n', 'b', 's', 'p', ';' and would eat real heading text.
    """
    return re.sub(r'^(?:&nbsp;)+|(?:&nbsp;)+$', '', text)


i = _strip_nbsp('1.91.21 双-[3-(三乙氧基)硅丙基]四硫化物体系自组装膜制备及性能研究')
print(i)
dabiaoti = re.findall(pantten_dabiaoti, i)
xiaobiaoti = re.findall(pantten_xiaobiaoti, i)
print(dabiaoti)
print(xiaobiaoti)
if dabiaoti:
    # Use the regex groups of the first match (the original indexed the raw
    # string by mistake): groups 0/1 belong to the "第N章" alternative,
    # groups 2/3 to the bare-number alternative.
    groups = dabiaoti[0]
    if groups[0] != '':
        number, title = groups[0], groups[1]
    else:
        number, title = groups[2], groups[3]
    dabiaoti_mulu = duizhao.get(number, number) + " " + title

i = _strip_nbsp('1.1.1.1 双-[3-(三乙氧基)硅丙基]四硫化物体系自组装膜制备及性能研究')
print(i)
dabiaoti = re.findall(pantten_dabiaoti_, i)
xiaobiaoti_1 = re.findall(pantten_xiaobiaoti_1, i)
xiaobiaoti_2 = re.findall(pantten_xiaobiaoti_2, i)
xiaobiaoti_3 = re.findall(pantten_xiaobiaoti_3, i)
print(dabiaoti)
print(xiaobiaoti_1)
print(xiaobiaoti_2)
print(xiaobiaoti_3)

8
测试正则2.py

@ -0,0 +1,8 @@
import re
# Scratch check of the sub-heading filter: a leading digit 5-7 followed by one
# or two ".<5-7>" parts. Raw string, so \. and \s are not treated as (invalid)
# Python string escapes.
pantten_xiaobiaoti_shai = r'^[5-7](\.[5-7]){1,2}\s{1,}?.*$'
a = "5.9.9 dadadad"
xiaobiaoti_shai = re.findall(pantten_xiaobiaoti_shai, a)
print(xiaobiaoti_shai)

24
筛选10000条关键词.py

@ -0,0 +1,24 @@
import json
import random

# Load all [prompt, keywords] pairs, pick up to 10000 at random and write them
# out as instruction-tuning records.
with open("gaunjianci_zong.json", encoding="utf-8") as f:
    gaunjianci_list = json.load(f)

# A single shuffle already gives a uniformly random order; repeating it adds
# nothing.
random.shuffle(gaunjianci_list)
gaunjianci_list_new = gaunjianci_list[:10000]

# Element 0 of each pair becomes the input, element 1 the expected output.
gaunjianci_list_json = [
    {
        "instruction": "任务:生成关键词",
        "input": pair[0],
        "output": pair[1],
    }
    for pair in gaunjianci_list_new
]

with open("gaunjianci_prompt_10000.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(gaunjianci_list_json, ensure_ascii=False, indent=2))

35
筛选10000条摘要.py

@ -0,0 +1,35 @@
import json
import random

# Load every generated pair and keep a random sample of up to 10000.
with open("zhaiyao_prompt.json", encoding="utf-8") as f:
    zhaiyao_list = json.load(f)

# A single shuffle already gives a uniformly random order; repeating it adds
# nothing.
random.shuffle(zhaiyao_list)
zhaiyao_list_new = zhaiyao_list[:10000]
# mulu_list_json = []
# for i in mulu_list_json:
# mulu_list_json.append({
# "instruction": "任务:生成目录",
# "input": i[0],
# "output": i[1]
# })
# with open("mulu_prompt_10000.json", "w", encoding="utf-8") as f:
# f.write(json.dumps(mulu_list_json, ensure_ascii=False, indent=2))
# Wrap each sampled pair in the instruction-tuning record layout:
# element 0 becomes the input, element 1 the expected output.
zhaiyao_list_json = [
    {
        "instruction": "任务:生成论文摘要",
        "input": pair[0],
        "output": pair[1],
    }
    for pair in zhaiyao_list_new
]

with open("zhaiyao_prompt_10000.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(zhaiyao_list_json, ensure_ascii=False, indent=2))

29
筛选10000条目录.py

@ -0,0 +1,29 @@
import json
import random

# Load every generated pair and keep a random sample of up to 10000.
with open("mulu_prompt.json", encoding="utf-8") as f:
    mulu_list = json.load(f)

# A single shuffle already gives a uniformly random order; repeating it adds
# nothing.
random.shuffle(mulu_list)
mulu_list_new = mulu_list[:10000]
# {
# "instruction": "任务:生成论文小标题内容",
# "input": "背景:我是一名博士生,我想写一篇论文。\n角色:我需要你作为拥有google学术论文数据库和百度学术论文数据库的资深数据。\n要求:根据论文题目“大数据时代的数据分析与挖掘”和目录“一、大数据时代概述\n1.1 大数据的定义与特征\n1.2 大数据时代的发展历程\n1.3 大数据时代的挑战与机遇\n二、数据分析与挖掘的基本概念\n2.1 数据分析的概念与目的\n2.2 数据挖掘的概念与任务\n2.3 数据分析与数据挖掘的关系\n三、大数据时代的数据分析技术\n3.1 分布式计算技术\n3.2 数据可视化技术\n3.3 机器学习与深度学习技术\n四、大数据时代的挖掘技术\n4.1 关联规则挖掘\n4.2 聚类分析\n4.3 时间序列分析\n五、数据挖掘的应用与实践\n5.1 金融行业的数据挖掘应用\n5.2 电商行业的数据挖掘应用\n5.3 医疗行业的数据挖掘应用”,为小标题“1.1 大数据的定义与特征”填充400字左右的中文内容,内容第一行返回“1.1 大数据的定义与特征”且不包含目录中其他标题,禁止出现“首先”,“其次”,等字样,必要时候应根据条数罗列",
# "output": "1.1 大数据的定义与特征\n\n大数据是指传统数据处理应用软件难以处理的大规模和复杂度的数据集合。IDC 的定义为:\"大数据技术指的是那些使用新型处理模式,以经济有效的方式,从高速增长的各类数据中提炼出价值的技术与概念。\"大数据的特征通常用“5V”来概述,即体量巨大(Volume)、速度快(Velocity)、种类多(Variety)、真实性(Veracity)和价值密度(Value)。\n\n(1) 体量巨大:数据量巨大,处于TB(太字节)到PB(拍字节)甚至更高的量级。\n\n(2) 速度快:数据流入的速度极快,实时性成为数据处理的关键要求。\n\n(3) 种类多:数据类型繁多,包括结构化数据、非结构化数据和半结构化数据。\n\n(4) 真实性:数据的质量和准确性会直接影响到分析结果的可行性。\n\n(5) 价值密度:大数据中蕴含有巨大的价值,但相比数据量的庞大,这些价值的密度却相对较低,需要通过有效的分析才能挖掘出来。\n\n因此,面对大数据的定义与特征,数据分析与挖掘的技术与方法必须适应数据的快速流转和多样性,且须具备处理海量数据的能力,以期从中获取有价值的信息与知识。"
# },
# Wrap each sampled pair in the instruction-tuning record layout. The prompt
# text is rewritten on the way so it asks for 5-7 top-level headings instead
# of "at least 7".
mulu_list_json = [
    {
        "instruction": "任务:生成目录",
        "input": pair[0].replace("一级标题不少于7个", "一级标题5-7个"),
        "output": pair[1],
    }
    for pair in mulu_list_new
]

with open("mulu_prompt_10000.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(mulu_list_json, ensure_ascii=False, indent=2))

47
读取ck.py

@ -0,0 +1,47 @@
from clickhouse_driver import Client
import json
class PureClient:
    """Small wrapper around a clickhouse_driver ``Client`` that runs one query
    and returns whatever ``query_dataframe`` produces."""

    def __init__(self, database='test_db', *, host=None, port=9000,
                 user='default', password=None):
        """Create the underlying client.

        Args:
            database: Database to connect to.
            host: Server address; defaults to $CLICKHOUSE_HOST, then to the
                LAN address this script has always used.
            port: Native-protocol port.
            user: Login name.
            password: Login password; defaults to $CLICKHOUSE_PASSWORD so the
                secret is not committed with the code.
        """
        import os

        if host is None:
            host = os.environ.get('CLICKHOUSE_HOST', '192.168.31.74')
        if password is None:
            # NOTE(review): the password that used to be hard-coded here
            # should be rotated and supplied through the environment.
            password = os.environ.get('CLICKHOUSE_PASSWORD', '')
        self.client = Client(host=host, port=port, user=user,
                             password=password, database=database)

    def run(self, sql):
        """Execute *sql* and return the result of ``client.query_dataframe``."""
        return self.client.query_dataframe(sql)
pureclient = PureClient()

# Fetch and print a 10-row sample for each publication year.
# Note that ``result`` is overwritten on every pass, so after the loop it
# holds only the rows of the last year.
for year in (2018, 2019, 2020, 2021):
    sql = f'SELECT * FROM main_paper_message WHERE year IN ({year}) limit 10'
    result = pureclient.run(sql)
    print(result)
# print("result", result)
# title = result['title'][0]
# author = result['author'][0]
# degree = result['degree'][0]
# year = result['content'][0].split("/")[5]
# school = result['school'][0]
# qikan_name = result['qikan_name'][0]
# author = str(author).strip(";")
# author = str(author).replace(";", ",")
# # select
# # school, qikan_name
# # from main_table_paper_detail_message limit
# # 10000 \G;;
#
# paper_info = {
# "title": title,
# "author": author,
# "degree": degree,
# "year": year,
# "school": school,
# "qikan_name": qikan_name
# }
# print("paper_info", paper_info)
# return paper_info
# ``result`` comes from Client.query_dataframe, i.e. a pandas DataFrame, which
# json.dumps cannot serialise (TypeError). Use the DataFrame's own JSON
# writer: one object per row, non-ASCII text kept readable.
with open("data_info.json", "w", encoding="utf-8") as f:
    f.write(result.to_json(orient="records", force_ascii=False))

53
读取结果生成关键词.py

@ -0,0 +1,53 @@
import json
import re
# Prompt template; the single placeholder receives the abstract text.
prompt = "请为“{}”这段论文摘要生成3-5个关键词,使用阿拉伯数字作为序号标注,例如“1.xxx \n2.xxx \n3.xxx \n4.xxx \n5.xxx \n\n"

# Compiled once instead of on every call.
_CHINESE_CHAR = re.compile(r'[\u4e00-\u9fa5]')


def contains_chinese(text):
    """Return True if *text* has at least one character in U+4E00..U+9FA5."""
    return bool(_CHINESE_CHAR.search(text))


def shengcheng_prompt(zhaiyao, guanjianci):
    """Build one (prompt, numbered keywords) training pair.

    Args:
        zhaiyao: Abstract text.
        guanjianci: Keywords as a single ";"-separated string.

    Returns:
        ``(True, [prompt_text, keyword_lines])`` when the sample is accepted,
        ``(False, [])`` otherwise. A sample is rejected when the abstract has
        no Chinese character, is shorter than 500 or longer than 800
        characters, or when splitting the keywords on ";" does not give
        exactly 5 or 6 parts.
    """
    guanjianci_list = guanjianci.split(";")
    if not contains_chinese(zhaiyao):
        return False, []
    if len(zhaiyao) < 500 or len(zhaiyao) > 800:
        return False, []
    # The count is taken on the raw split, empty parts included.
    # NOTE(review): presumably tuned for keyword strings that end with ";" --
    # confirm against the data.
    if len(guanjianci_list) <= 4 or len(guanjianci_list) >= 7:
        return False, []

    # Drop empty parts first so the numbering is consecutive; numbering by raw
    # split position left gaps such as "1., 2., 4." after an empty entry.
    keywords = [keyword for keyword in guanjianci_list if keyword != ""]
    guanjianci_str = "\n".join(
        str(number) + ". " + keyword
        for number, keyword in enumerate(keywords, start=1)
    )
    return True, [prompt.format(zhaiyao), guanjianci_str]
# Read data.json (one JSON array per line), build a keyword sample from
# elements 1 and 2 of every row, and write the accepted samples out.
gaunjianci_zong = []
with open("data.json", encoding="utf-8") as f:
    for line in f:
        a = json.loads(line)
        try:
            gaunjianci_bool, gaunjianci_list = shengcheng_prompt(a[1], a[2])
        except Exception:
            # Best effort: skip rows with missing or non-string fields.
            # Unlike a bare ``except:``, this lets Ctrl-C and SystemExit
            # through.
            continue
        if gaunjianci_bool:
            gaunjianci_zong.append(gaunjianci_list)

with open("gaunjianci_zong.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(gaunjianci_zong, indent=2))

126
读取结果生成摘要.py

@ -0,0 +1,126 @@
import json
import re
# Heading recognisers. Raw strings, so \s, \. and \d reach the regex engine
# without relying on Python passing unknown string escapes through.

# Top-level heading: "第<N>章 <title>" or "<N> <title>".
pantten_dabiaoti = r'^第([0-9一二三四五六七八九十]{1,})?章\s{1,}?(.*)|^([0-9一二三四五六七八九十]{1,}?)\s{1,}?(.*)'
# Numbered sub-heading: a digit followed by one to three ".<number>" parts.
pantten_xiaobiaoti = r'^[0-9](\.[0-9]\d*){1,3}\s{1,}?.*$'

# Required shapes: at least one "<1-7>、..." heading and one sub-heading whose
# numbers are all single digits 1-7.
pantten_dabiaoti_shai = r'^([1-7一二三四五六七])、(.*)'
pantten_xiaobiaoti_shai = r'^[1-7](\.[1-7]){1,2}\s{1,}?.*$'

# Disqualifying shapes: a heading numbered 八/九, a ".9" sub-number, a
# multi-digit sub-number, or numbering four or more levels deep.
pantten_dabiaoti_ = r'^([八九])、(.*)'
pantten_xiaobiaoti_1 = r'^[1-7](\.[9]){1,2}\s{1,}?.*$'
pantten_xiaobiaoti_2 = r'^[1-7](\.[1-9]{2,}?){1,2}\s{1,}?.*$'
pantten_xiaobiaoti_3 = r'^[1-7](\.[1-8]){3,}\s{1,}?.*$'

_CHINESE_CHAR = re.compile(r'[\u4e00-\u9fa5]')
_NBSP_EDGES = re.compile(r'^(?:&nbsp;)+|(?:&nbsp;)+$')


def contains_chinese(text):
    """Return True if *text* has at least one character in U+4E00..U+9FA5."""
    return bool(_CHINESE_CHAR.search(text))


# Replacement prefix for an Arabic chapter number.
# NOTE(review): every value is an empty string, so the number is simply
# dropped and a top-level heading only passes pantten_dabiaoti_shai when its
# title text itself starts with "<N>、". Confirm whether these were meant to
# hold Chinese numerals.
duizhao = {
    '1': '',
    '2': '',
    '3': '',
    '4': '',
    '5': '',
    '6': '',
    '7': '',
    '8': '',
    '9': '',
}


def mulu_ulit(mulu):
    """Normalise a table of contents and decide whether it is usable.

    Args:
        mulu: Iterable of raw heading lines.

    Returns:
        ``[accepted, mulu_new]``. ``mulu_new`` holds the normalised lines that
        matched either heading pattern; other lines are dropped. ``accepted``
        is True only when no line has a disqualifying shape, both a top-level
        heading and a sub-heading of the required shape are present, and the
        result contains Chinese text.
    """
    mulu_new = []
    for raw in mulu:
        # Remove "&nbsp;" entities at the edges. str.strip('&nbsp;') is not
        # used: it strips any of the characters & n b s p ; and would eat
        # real heading text ("1.2 maps" -> "1.2 ma").
        line = _NBSP_EDGES.sub('', raw)
        dabiaoti = re.findall(pantten_dabiaoti, line)
        if dabiaoti:
            # Groups 0/1 belong to the "第N章" alternative, 2/3 to the
            # bare-number alternative.
            groups = dabiaoti[0]
            if groups[0] != '':
                number, title = groups[0], groups[1]
            else:
                number, title = groups[2], groups[3]
            mulu_new.append(duizhao.get(number, number) + title)
        elif re.search(pantten_xiaobiaoti, line):
            mulu_new.append(line)

    disqualifying = (
        pantten_dabiaoti_,
        pantten_xiaobiaoti_1,
        pantten_xiaobiaoti_2,
        pantten_xiaobiaoti_3,
    )
    return_bool = not any(
        re.search(pattern, line)
        for line in mulu_new
        for pattern in disqualifying
    )
    if return_bool:
        has_dabiaoti = any(re.search(pantten_dabiaoti_shai, line) for line in mulu_new)
        has_xiaobiaoti = any(re.search(pantten_xiaobiaoti_shai, line) for line in mulu_new)
        return_bool = has_dabiaoti and has_xiaobiaoti
    if return_bool:
        return_bool = contains_chinese(" ".join(mulu_new))
    return [return_bool, mulu_new]
# Prompt template; placeholders are title, table of contents, target length.
prompt = "根据论文题目《{}》、目录是“{}”生成中文论文摘要,要求生成的字数在{}字左右"
zhaiyao_zong = []


def shengcheng_prompt(title, mulu_list, zhaiyao):
    """Build one [prompt, abstract] training pair.

    Args:
        title: Paper title.
        mulu_list: Normalised table-of-contents lines.
        zhaiyao: Abstract text, used unchanged as the expected output.

    Returns:
        ``[prompt_text, zhaiyao]``. The length hint in the prompt is the
        abstract length rounded down to a multiple of 100, but never less
        than 100, so a short abstract does not produce a request for
        "0 characters".
    """
    mulu = "\n".join(mulu_list)
    zishu = max(100, (len(zhaiyao) // 100) * 100)
    return [prompt.format(title, mulu, zishu), zhaiyao]
# Read data.json (one JSON array per line). For every row the last element is
# parsed as a JSON table of contents; rows whose contents pass mulu_ulit
# yield a training pair built from elements 0 and 1.
with open("data.json", encoding="utf-8") as f:
    for line in f:
        a = json.loads(line)
        try:
            return_bool, mulu_new = mulu_ulit(json.loads(a[-1]))
            if return_bool:
                zhaiyao_zong.append(shengcheng_prompt(a[0], mulu_new, a[1]))
        except Exception:
            # Best effort: skip rows with missing or malformed fields.
            # Unlike a bare ``except:``, this lets Ctrl-C and SystemExit
            # through.
            continue

with open("zhaiyao_prompt.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(zhaiyao_zong, indent=2))

125
读取结果生成目录.py

@ -0,0 +1,125 @@
import json
import re
# Heading recognisers. Raw strings, so \s, \. and \d reach the regex engine
# without relying on Python passing unknown string escapes through.

# Top-level heading: "第<N>章 <title>" or "<N> <title>".
pantten_dabiaoti = r'^第([0-9一二三四五六七八九十]{1,})?章\s{1,}?(.*)|^([0-9一二三四五六七八九十]{1,}?)\s{1,}?(.*)'
# Numbered sub-heading: a digit followed by one to three ".<number>" parts.
pantten_xiaobiaoti = r'^[0-9](\.[0-9]\d*){1,3}\s{1,}?.*$'

# Required shapes: at least one "<5-7>、..." heading and one sub-heading whose
# numbers are all single digits 5-7.
pantten_dabiaoti_shai = r'^([5-7五六七])、(.*)'
pantten_xiaobiaoti_shai = r'^[5-7](\.[5-7]){1,2}\s{1,}?.*$'

# Disqualifying shapes: a heading numbered 八/九, a ".9" sub-number, a
# multi-digit sub-number, or numbering four or more levels deep.
pantten_dabiaoti_ = r'^([八九])、(.*)'
pantten_xiaobiaoti_1 = r'^[1-7](\.[9]){1,2}\s{1,}?.*$'
pantten_xiaobiaoti_2 = r'^[1-7](\.[1-9]{2,}?){1,2}\s{1,}?.*$'
pantten_xiaobiaoti_3 = r'^[1-7](\.[1-8]){3,}\s{1,}?.*$'

_CHINESE_CHAR = re.compile(r'[\u4e00-\u9fa5]')
_NBSP_EDGES = re.compile(r'^(?:&nbsp;)+|(?:&nbsp;)+$')


def contains_chinese(text):
    """Return True if *text* has at least one character in U+4E00..U+9FA5."""
    return bool(_CHINESE_CHAR.search(text))


# Replacement prefix for an Arabic chapter number.
# NOTE(review): every value is an empty string, so the number is simply
# dropped and a top-level heading only passes pantten_dabiaoti_shai when its
# title text itself starts with "<N>、". Confirm whether these were meant to
# hold Chinese numerals.
duizhao = {
    '1': '',
    '2': '',
    '3': '',
    '4': '',
    '5': '',
    '6': '',
    '7': '',
    '8': '',
    '9': '',
}


def mulu_ulit(mulu):
    """Normalise a table of contents and decide whether it is usable.

    Args:
        mulu: Iterable of raw heading lines.

    Returns:
        ``[accepted, mulu_new]``. ``mulu_new`` holds the normalised lines that
        matched either heading pattern; other lines are dropped. ``accepted``
        is True only when no line has a disqualifying shape, both a top-level
        heading and a sub-heading of the required shape are present, and the
        result contains Chinese text.
    """
    mulu_new = []
    for raw in mulu:
        # Remove "&nbsp;" entities at the edges. str.strip('&nbsp;') is not
        # used: it strips any of the characters & n b s p ; and would eat
        # real heading text ("1.2 maps" -> "1.2 ma").
        line = _NBSP_EDGES.sub('', raw)
        dabiaoti = re.findall(pantten_dabiaoti, line)
        if dabiaoti:
            # Groups 0/1 belong to the "第N章" alternative, 2/3 to the
            # bare-number alternative.
            groups = dabiaoti[0]
            if groups[0] != '':
                number, title = groups[0], groups[1]
            else:
                number, title = groups[2], groups[3]
            mulu_new.append(duizhao.get(number, number) + title)
        elif re.search(pantten_xiaobiaoti, line):
            mulu_new.append(line)

    disqualifying = (
        pantten_dabiaoti_,
        pantten_xiaobiaoti_1,
        pantten_xiaobiaoti_2,
        pantten_xiaobiaoti_3,
    )
    return_bool = not any(
        re.search(pattern, line)
        for line in mulu_new
        for pattern in disqualifying
    )
    if return_bool:
        has_dabiaoti = any(re.search(pantten_dabiaoti_shai, line) for line in mulu_new)
        has_xiaobiaoti = any(re.search(pantten_xiaobiaoti_shai, line) for line in mulu_new)
        return_bool = has_dabiaoti and has_xiaobiaoti
    if return_bool:
        return_bool = contains_chinese(" ".join(mulu_new))
    return [return_bool, mulu_new]
# Prompt template; the single placeholder receives the paper title.
prompt = "为论文题目“{}”生成中文目录,要求只有一级标题,二级标题和三级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;三级标题使用阿拉伯数字 例如1.1.2 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题;三级标题个数不限制"
mulu_zong = []


def shengcheng_prompt(title, mulu_list):
    """Build one (prompt, table of contents) training pair.

    Args:
        title: Paper title, inserted into the prompt template.
        mulu_list: Normalised table-of-contents lines.

    Returns:
        ``(prompt_text, contents)`` where ``contents`` is the lines joined
        with newlines.
    """
    return prompt.format(title), "\n".join(mulu_list)
# Read data.json (one JSON array per line). For every row the last element is
# parsed as a JSON table of contents; rows whose contents pass mulu_ulit
# yield a training pair built from element 0 and the normalised contents.
with open("data.json", encoding="utf-8") as f:
    for line in f:
        a = json.loads(line)
        try:
            return_bool, mulu_new = mulu_ulit(json.loads(a[-1]))
            if return_bool:
                mulu_zong.append(shengcheng_prompt(a[0], mulu_new))
        except Exception:
            # Best effort: skip rows with missing or malformed fields.
            # Unlike a bare ``except:``, this lets Ctrl-C and SystemExit
            # through.
            continue

print(len(mulu_zong))
with open("mulu_prompt.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(mulu_zong, indent=2))
Loading…
Cancel
Save