You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

49 lines
1.4 KiB

import base64
import io
import json
import os

import pandas as pd
from PIL import Image
'''
This script converts the mmbench_test TSV file to JSONL.
It is very similar to mmbench_converter_dev, except that there is no answer for accuracy calculation.
'''
# Load the MMBench test TSV (tab-separated) into a DataFrame.
datas = pd.read_csv(
    "data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv",
    sep="\t",
)
# Labels of the answer-option columns: ['A', 'B', 'C', 'D'].
global_choices = list("ABCD")
def decode_base64_to_image(base64_string):
    """Decode a base64-encoded image payload and open it as a PIL image.

    The string is base64-decoded to raw bytes, wrapped in an in-memory
    buffer, and handed to ``Image.open``; the resulting image is returned.
    """
    raw_bytes = base64.b64decode(base64_string)
    return Image.open(io.BytesIO(raw_bytes))
# Write one JSON object per TSV row and export each row's image as a JPEG.
# All paths are relative to the current working directory.
dataset_dir = "data/mmbench/mmbench_test_20230712"
images_dir = dataset_dir + "/images"

# Pixel modes the Pillow JPEG encoder can write as-is. Images in any other
# mode (e.g. RGBA or palette images) would make image.save() raise, so they
# are converted to RGB before saving.
jpeg_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

# image.save() does not create missing directories, so make sure it exists.
os.makedirs(images_dir, exist_ok=True)

with open("./" + dataset_dir + "/mmbench_test_20230712.jsonl", "w", encoding="utf-8") as f:
    for _, data in datas.iterrows():
        index = int(data["index"])
        question = data["question"]
        # A missing hint is recorded as the literal string 'N/A'.
        hint = "N/A" if pd.isna(data["hint"]) else data["hint"]
        # Keep only the option columns that are actually filled in for this row.
        choices = [data[opt] for opt in global_choices if not pd.isna(data[opt])]

        image = decode_base64_to_image(data["image"])
        if image.mode not in jpeg_modes:
            image = image.convert("RGB")
        image_path = f"{images_dir}/{index}.jpg"
        image.save(image_path)

        # No "answer" field is written: this is the test-split converter,
        # which has no answer for accuracy calculation.
        record = {
            "index": index,
            "image": image_path,
            "hint": hint,
            "question": question,
            "choices": choices,
        }
        f.write(json.dumps(record) + "\n")