You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
66 lines
1.9 KiB
66 lines
1.9 KiB
import json
|
|
|
|
# json.load()
|
|
|
|
# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
|
|
# a = f.read()
|
|
# print(a)
|
|
|
|
import pandas as pd
|
|
|
|
# Build `df_list`: one dict per CSV row, keeping only the fields that are
# exported later in this script.
filename = 'data/spider_latest_journal_paper_list.csv'
chunksize = 10000  # number of rows read per chunk; adjust as needed

df_list = []

# Iterate over the CSV with the `chunksize` parameter so the whole file is
# never parsed into a single DataFrame.
for chunk in pd.read_csv(filename, chunksize=chunksize):
    # Original author's note on the fields of interest:
    # author, paper title, paper category, paper source, paper year, abstract.
    # Fields are picked by column position, so this depends on the CSV's
    # column order.
    # `values.tolist()` is kept on purpose: it yields plain Python lists, one
    # per row, which is what the JSON export of `df_list` needs.
    for row in chunk.values.tolist():
        df_list.append({
            'author': row[2],
            'title': row[1],
            'special_topic': row[7],
            'qikan_name': row[3],
            'year': row[4],
            'abstract': row[10],
        })
|
|
|
|
# data = []
|
|
# json_list = [
|
|
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
|
|
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
|
|
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
|
|
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
|
|
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
|
|
# ]
|
|
#
|
|
#
|
|
# print("主库数据完成加载")
|
|
# for path in json_list:
|
|
# name, typr_file = path.split(".")
|
|
# name = name.split("/")[-1]
|
|
# a = json.load(open(path))
|
|
# for i in a:
|
|
# autoid = "_".join([name, str(i['autoid'])])
|
|
# if autoid in df_dict:
|
|
# data.append([i['f_title']] + df_dict[autoid])
|
|
# print("path完成筛选")
|
|
#
|
|
#
|
|
# Write every collected record to a single pretty-printed JSON file.
# `ensure_ascii=False` emits non-ASCII text as-is, so the file is opened
# explicitly as UTF-8 instead of relying on the platform's locale encoding.
# NOTE(review): json.dumps runs with its default allow_nan=True, so any float
# NaN in df_list (e.g. from empty CSV cells) is written as the non-standard
# token `NaN` — confirm downstream readers accept that.
with open("data/data_0423_qikan.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))
|
|
|
|
#
|
|
# with open("data.json", encoding="utf-8") as f:
|
|
# for i in f.readlines():
|
|
# a = json.loads(i)
|
|
#
|
|
#
|
|
# print(a)
|