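# Build SEED-Bench evaluation inputs: one JSONL file of prompts for the image
# dimensions (1-9) and one for the video dimensions (10-12), extracting
# n_frames frames per video along the way.
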
import os
import av
import json
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
from decord import VideoReader, cpu

# path to SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json
seed_bench_input_path = 'SEED-Bench.json'
# root directory of evaluation dimensions 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md
cc3m_dir = "/YOUR_PATH_TO/seed_bench_image"
# root directory of evaluation dimension 10
dimension10_dir = "/YOUR_PATH_TO/SSV2/videos"
# root directory of evaluation dimension 11
dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test"
# root directory of evaluation dimension 12
dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync"


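# True if s parses as a base-10 integer, so a task id like '7' can select a single dimension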
def is_integer_string(s):
    try:
        int(s)
        return True
    except ValueError:
        return False


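# Select the questions for a task split: 'image' (dims 1-9), 'video' (dims 10-12), 'all', or a single dimension id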
def filter_questions(data, task='all'):
    if task == "image":
        return [q for q in data if 1 <= q["question_type_id"] <= 9]
    elif task == "video":
        return [q for q in data if 10 <= q["question_type_id"] <= 12]
    elif task == "all":
        return data
    elif is_integer_string(task):
        return [q for q in data if q["question_type_id"] == int(task)]
    else:
        raise ValueError(f"Invalid task: {task}")


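# Uniformly sample num_segments frame offsets from [0, num_frames - 1]; with fewer frames than segments, return every frame index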
def get_index(num_frames, num_segments):
    if num_segments > num_frames:
        # fewer frames than requested segments: take every available frame
        offsets = np.array([idx for idx in range(num_frames)])
    else:
        # uniform sampling: one frame from the middle of each of num_segments equal spans
        seg_size = float(num_frames - 1) / num_segments
        start = int(seg_size / 2)
        offsets = np.array([
            start + int(np.round(seg_size * idx)) for idx in range(num_segments)
        ])
    return offsets


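# Load every SEED-Bench question once; the two passes below filter this list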
with open(seed_bench_input_path) as fin:
    qa_anno = json.load(fin)['questions']

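# Pass 1: image dimensions 1-9 -- one JSONL record per question, with the image referenced via an <img> tag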
fout = open('image_input.jsonl', 'w')
i_anno = filter_questions(qa_anno, 'image')
for qa_item in tqdm(i_anno):
    # os.path.join tolerates a missing trailing slash in cc3m_dir
    data_path = os.path.join(cc3m_dir, qa_item['data_id'])
    choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
    choice_list = []
    for i, c in enumerate(choices):
        # label options 'A.'-'D.' (65 == ord('A'))
        choice_list.append('{}. {}'.format(chr(i + 65), c))
    choice_txt = '\n'.join(choice_list)
    prompt = '<img>{}</img>\nQuestion: {}\nOptions: {}\nAnswer:'.format(
        data_path, qa_item['question'], choice_txt)
    print(json.dumps({
        'question_id': qa_item['question_id'],
        'prompt': prompt,
        'answer': qa_item['answer'],
    }), file=fout)
fout.close()

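# Pass 2: video dimensions 10-12 -- sample n_frames frames per video, save them as JPEGs, and reference them in the prompt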
n_frames = 8
# clear frames left over from a previous run before re-extracting
os.system('rm -rf video_imgs_' + str(n_frames))
os.makedirs('video_imgs_' + str(n_frames), exist_ok=True)

fout = open('video_input_{}.jsonl'.format(n_frames), 'w')
v_anno = filter_questions(qa_anno, 'video')
for qa_item in tqdm(v_anno):
    if qa_item['question_type_id'] == 12:
        data_path = os.path.join(dimension12_dir, qa_item['data_id'])
    elif qa_item['question_type_id'] == 11:
        data_path = os.path.join(dimension11_dir, qa_item['data_id'].split('/')[-1])
    elif qa_item['question_type_id'] == 10:
        data_path = os.path.join(dimension10_dir, qa_item['data_id'])
    else:
        assert False, str(qa_item)
    print(data_path)

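    # 'segment' holds integer frame indices for dimension 12 and float seconds for dimension 11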
    use_pyav = False
    if 'segment' in qa_item.keys():
        segment = qa_item['segment']
        if isinstance(segment[0], int):
            # using pyav for decoding videos in evaluation dimension 12
            use_pyav = True
        start, end = segment[0], segment[1]
    else:
        start = 0.0
        end = 0.0

    if use_pyav:
        # using pyav for decoding videos in evaluation dimension 12
        reader = av.open(data_path)
        frames = [torch.from_numpy(f.to_rgb().to_ndarray()) for f in reader.decode(video=0)]
        video_len = len(frames)
        start_frame, end_frame = start, end
        end_frame = min(end_frame, video_len)
        offset = get_index(end_frame - start_frame, n_frames)
        frame_indices = offset + start_frame
        images = torch.stack([frames[idx] for idx in frame_indices]).numpy()
    else:
        # using decord for decoding videos in evaluation dimensions 10-11
        try:
            vr = VideoReader(data_path, num_threads=1, ctx=cpu(0))
            video_len = len(vr)
            fps = vr.get_avg_fps()
            if 'segment' in qa_item.keys():
                # convert the segment's start/end (seconds) to frame indices for dimension 11
                start_frame = int(min(max(start * fps, 0), video_len - 1))
                end_frame = int(min(max(end * fps, 0), video_len - 1))
                tot_frames = int(end_frame - start_frame)
                offset = get_index(tot_frames, n_frames)
                frame_indices = offset + start_frame
            else:
                # sample frames across the whole video for evaluation dimension 10
                frame_indices = get_index(video_len - 1, n_frames)
            vr.seek(0)
            images = vr.get_batch(frame_indices).asnumpy()
        except Exception as e:
            # keep failed question ids in the output so downstream scoring can spot them
            print(json.dumps({
                'question_id': qa_item['question_id'],
                'prompt': 'Error: ' + str(e),
                'answer': qa_item['answer'],
            }), file=fout)
            continue

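    # write the sampled frames to disk and build the multi-image prompt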
    prompt = ''
    for i in range(images.shape[0]):
        data = Image.fromarray(images[i])
        img_path = 'video_imgs_{}/{}_{}.jpg'.format(n_frames, qa_item['question_id'], i)
        data.save(img_path)
        prompt += '<img>' + img_path + '</img>\n'

    choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
    choice_list = []
    for i, c in enumerate(choices):
        choice_list.append('{}. {}'.format(chr(i + 65), c))
    choice_txt = '\n'.join(choice_list)

    prompt += 'Question: {}\nOptions: {}\nAnswer:'.format(qa_item['question'], choice_txt)
    print(json.dumps({
        'question_id': qa_item['question_id'],
        'prompt': prompt,
        'answer': qa_item['answer'],
    }), file=fout)
fout.close()
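
# A minimal sketch of how the generated JSONL might be consumed downstream
# (hypothetical `model.chat` stands in for whatever model is being evaluated):
#
#   with open('image_input.jsonl') as f:
#       for line in f:
#           item = json.loads(line)
#           pred = model.chat(item['prompt'])          # e.g. returns 'A'
#           hit = pred.strip().startswith(item['answer'])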