排版识别标题级别和正文
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

39 lines
1.0 KiB

import json
'''
{"text": "EU rejects German call to boycott British lamb.", "label": [[0, 2, "ORG"]]}
{"text": "Peter Blackburn", "label": [[0, 15, "PERSON"]]}
{"text": "President Obama", "label": [[10, 15, "PERSON"]]}
'''
# Markers wrapped around the annotated span inside each input record's "text".
START_MARKER = "<\\Start>"
END_MARKER = "<\\End>"
# Records carrying this label are written out without any entity span.
BODY_LABEL = "正文"

INPUT_PATH = "data/zc_4.jsonl"
OUTPUT_PATH = "data/zc_ner.jsonl"


def convert_record(record):
    """Convert one marker-annotated record into span-labelled form.

    The input ``record`` is a dict with a ``"text"`` value that contains
    ``START_MARKER`` and ``END_MARKER`` around the annotated span, and a
    ``"label"`` list of label names.

    Args:
        record: Decoded JSON object for one input line.

    Returns:
        ``None`` when the label list does not hold exactly one label.
        Otherwise a dict ``{"text": ..., "label": ...}`` where the text has
        both markers removed and the label list is either empty (label is
        ``BODY_LABEL``) or ``[[start, end, label_name]]`` with offsets that
        index into the marker-free text.

    Raises:
        ValueError: If a marker is missing, or if a span has to be emitted
            and the end marker precedes the start marker.
    """
    labels = record["label"]
    if len(labels) != 1:
        return None
    label_name = labels[0]

    text = str(record["text"])
    start = text.find(START_MARKER)
    end = text.find(END_MARKER)
    if start == -1 or end == -1:
        raise ValueError(
            "missing {} or {} marker in text: {!r}".format(
                START_MARKER, END_MARKER, text
            )
        )

    if label_name == BODY_LABEL:
        spans = []
    else:
        if end < start:
            raise ValueError(
                "{} appears before {} in text: {!r}".format(
                    END_MARKER, START_MARKER, text
                )
            )
        # Removing the start marker shifts everything after it to the left,
        # so the end offset moves back by the start marker's length.
        spans = [[start, end - len(START_MARKER), label_name]]

    cleaned = text.replace(START_MARKER, "").replace(END_MARKER, "")
    return {"text": cleaned, "label": spans}


def convert_lines(lines):
    """Yield the converted record for every usable JSON line in ``lines``.

    Blank lines are skipped, as are records for which ``convert_record``
    returns ``None``. A ``ValueError`` from ``convert_record`` is re-raised
    with the 1-based line number prepended.
    """
    for lineno, line in enumerate(lines, start=1):
        if not line.strip():
            continue
        record = json.loads(line)
        try:
            converted = convert_record(record)
        except ValueError as err:
            raise ValueError("line {}: {}".format(lineno, err)) from err
        if converted is not None:
            yield converted


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Convert ``input_path`` (JSON Lines) and append the result to ``output_path``."""
    # Convert everything before opening the output, so a bad record aborts
    # the run without leaving a partially written batch behind.
    with open(input_path, encoding="utf-8") as src:
        converted = list(convert_lines(src))

    # Append mode: existing content of the output file is kept and extended.
    with open(output_path, "a", encoding="utf-8") as dst:
        dst.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in converted
        )


if __name__ == "__main__":
    main()