"""Filter paper records for ChatGLM fine-tuning and split them into train/dev files.

Each line of the input file is a Python dict literal with at least the keys
``content`` (the prompt) and ``summary`` (the target text).  Records that pass
the filters are serialised as JSON, one object per line, with the first 90%
written to the train file and the remainder to the dev file.
"""
import os
import random
import json

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws a progress bar; fall back to plain iteration without it.
    def tqdm(iterable, *args, **kwargs):
        return iterable


SOURCE_PATH = "data/chatglm_paper_data_2.txt"
TRAIN_PATH = "./data/chatglm_train_3.json"
DEV_PATH = "./data/chatglm_dev_3.json"

# Prompts containing one of these markers ask for an English translation, so
# their summaries are measured in space-separated words, not characters.
TRANSLATION_MARKERS = ("这段文字翻译成英文", "这几个关键字翻译成英文")

# Wording normalisation applied to both prompt and summary.
OLD_TERM = "生成方向"
NEW_TERM = "研究方向"

MAX_CONTENT_LEN = 900    # exclusive upper bound on prompt length (characters)
MAX_SUMMARY_LEN = 1900   # exclusive upper bound on summary length
TRAIN_RATIO = 0.9        # leading fraction of kept records used for training

# Containers for summary-length statistics.  The code that filled them is
# disabled, so they stay empty; they are kept so the module-level names
# remain available.
data_tongji = {
    "0-600": 0,
    "600-1500": 0,
    "1500-": 0,
}
data_tongji_prompt = []


def is_contains_chinese(strs):
    """Return True if *strs* has a character in the range U+4E00..U+9FA5."""
    return any('\u4e00' <= _char <= '\u9fa5' for _char in strs)


def _prepare_record(data_dan):
    """Return the JSON text for a record that should be kept, else None.

    A record is dropped when its summary is expected to be Chinese but holds
    no Chinese character (such records are printed for inspection), or when
    the prompt/summary length reaches the configured limits.  The input dict
    is not modified.
    """
    prompt = data_dan["content"]
    summary = data_dan["summary"]
    zishu_content = len(prompt)

    if any(marker in prompt for marker in TRANSLATION_MARKERS):
        # English output: count words instead of characters.
        zishu_summary = len(summary.split(" "))
    elif is_contains_chinese(summary):
        zishu_summary = len(summary)
    else:
        print(data_dan)
        return None

    if zishu_content >= MAX_CONTENT_LEN or zishu_summary >= MAX_SUMMARY_LEN:
        return None

    # NOTE(review): the original file's indentation was lost; the term
    # replacement is assumed to apply to every surviving record - confirm.
    cleaned = dict(data_dan)
    cleaned["content"] = prompt.replace(OLD_TERM, NEW_TERM)
    cleaned["summary"] = summary.replace(OLD_TERM, NEW_TERM)
    return json.dumps(cleaned, ensure_ascii=False)


def _write_lines(path, records):
    """Write *records* to *path*, one per line, encoded as UTF-8."""
    with open(path, mode="w", encoding="utf-8") as f:
        # The trailing newline keeps each JSON object on its own line;
        # without it the objects run together and cannot be parsed back.
        f.writelines(record + "\n" for record in records)


def main(source_path=SOURCE_PATH, train_path=TRAIN_PATH, dev_path=DEV_PATH):
    """Read *source_path*, filter its records and write the train/dev split.

    The split is positional (no shuffling): the first ``TRAIN_RATIO`` of the
    kept records go to *train_path*, the rest to *dev_path*.
    """
    data_list = []
    with open(source_path, encoding="utf-8") as f:
        for line in tqdm(f):
            if not line.strip():
                continue  # tolerate blank lines instead of crashing in eval
            # NOTE(review): eval executes arbitrary code found in the data
            # file.  If the lines are plain dict literals, ast.literal_eval
            # would be a safer parser - confirm the data format first.
            data_dan = eval(line)
            record = _prepare_record(data_dan)
            if record is not None:
                data_list.append(record)

    train_nums = int(len(data_list) * TRAIN_RATIO)
    _write_lines(train_path, data_list[:train_nums])
    _write_lines(dev_path, data_list[train_nums:])


if __name__ == "__main__":
    main()