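# Build SEED-Bench evaluation inputs: one JSONL file of prompts for the image
# dimensions (1-9) and one for the video dimensions (10-12), extracting
# n_frames frames per video along the way.
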
import os
import av
import json
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
from decord import VideoReader, cpu

# path to SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json
seed_bench_input_path = 'SEED-Bench.json'
# root directory of evaluation dimensions 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md
cc3m_dir = "/YOUR_PATH_TO/seed_bench_image"
# root directory of evaluation dimension 10
dimension10_dir = "/YOUR_PATH_TO/SSV2/videos"
# root directory of evaluation dimension 11
dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test"
# root directory of evaluation dimension 12
dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync"


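# True if s parses as a base-10 integer, so a task id like '7' can select a single dimension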
def is_integer_string(s):
    try:
        int(s)
        return True
    except ValueError:
        return False


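# Select the questions for a task split: 'image' (dims 1-9), 'video' (dims 10-12), 'all', or a single dimension id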
def filter_questions(data, task='all'):
    if task == "image":
        return [q for q in data if 1 <= q["question_type_id"] <= 9]
    elif task == "video":
        return [q for q in data if 10 <= q["question_type_id"] <= 12]
    elif task == "all":
        return data
    elif is_integer_string(task):
        return [q for q in data if q["question_type_id"] == int(task)]
    else:
        raise ValueError(f"Invalid task: {task}")


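# Uniformly sample num_segments frame offsets from [0, num_frames - 1]; with fewer frames than segments, return every frame index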
def get_index(num_frames, num_segments):
    if num_segments > num_frames:
        # fewer frames than requested segments: take every available frame
        offsets = np.array([idx for idx in range(num_frames)])
    else:
        # uniform sampling: one frame from the middle of each of num_segments equal spans
        seg_size = float(num_frames - 1) / num_segments
        start = int(seg_size / 2)
        offsets = np.array([
            start + int(np.round(seg_size * idx)) for idx in range(num_segments)
        ])
    return offsets


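# Load every SEED-Bench question once; the two passes below filter this list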
with open(seed_bench_input_path) as fin:
    qa_anno = json.load(fin)['questions']

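# Pass 1: image dimensions 1-9 -- one JSONL record per question, with the image referenced via an <img> tag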
fout = open('image_input.jsonl', 'w')
i_anno = filter_questions(qa_anno, 'image')
for qa_item in tqdm(i_anno):
    # os.path.join tolerates a missing trailing slash in cc3m_dir
    data_path = os.path.join(cc3m_dir, qa_item['data_id'])
    choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
    choice_list = []
    for i, c in enumerate(choices):
        # label options 'A.'-'D.' (65 == ord('A'))
        choice_list.append('{}. {}'.format(chr(i + 65), c))
    choice_txt = '\n'.join(choice_list)
    prompt = '<img>{}</img>\nQuestion: {}\nOptions: {}\nAnswer:'.format(
        data_path, qa_item['question'], choice_txt)
    print(json.dumps({
        'question_id': qa_item['question_id'],
        'prompt': prompt,
        'answer': qa_item['answer'],
    }), file=fout)
fout.close()

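# Pass 2: video dimensions 10-12 -- sample n_frames frames per video, save them as JPEGs, and reference them in the prompt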
n_frames = 8
# clear frames left over from a previous run before re-extracting
os.system('rm -rf video_imgs_' + str(n_frames))
os.makedirs('video_imgs_' + str(n_frames), exist_ok=True)

fout = open('video_input_{}.jsonl'.format(n_frames), 'w')
v_anno = filter_questions(qa_anno, 'video')
for qa_item in tqdm(v_anno):
    if qa_item['question_type_id'] == 12:
        data_path = os.path.join(dimension12_dir, qa_item['data_id'])
    elif qa_item['question_type_id'] == 11:
        data_path = os.path.join(dimension11_dir, qa_item['data_id'].split('/')[-1])
    elif qa_item['question_type_id'] == 10:
        data_path = os.path.join(dimension10_dir, qa_item['data_id'])
    else:
        assert False, str(qa_item)
    print(data_path)

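    # 'segment' holds integer frame indices for dimension 12 and float seconds for dimension 11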
    use_pyav = False
    if 'segment' in qa_item.keys():
        segment = qa_item['segment']
        if isinstance(segment[0], int):
            # using pyav for decoding videos in evaluation dimension 12
            use_pyav = True
        start, end = segment[0], segment[1]
    else:
        start = 0.0
        end = 0.0

    if use_pyav:
        # using pyav for decoding videos in evaluation dimension 12
        reader = av.open(data_path)
        frames = [torch.from_numpy(f.to_rgb().to_ndarray()) for f in reader.decode(video=0)]
        video_len = len(frames)
        start_frame, end_frame = start, end
        end_frame = min(end_frame, video_len)
        offset = get_index(end_frame - start_frame, n_frames)
        frame_indices = offset + start_frame
        images = torch.stack([frames[idx] for idx in frame_indices]).numpy()
    else:
        # using decord for decoding videos in evaluation dimensions 10-11
        try:
            vr = VideoReader(data_path, num_threads=1, ctx=cpu(0))
            video_len = len(vr)
            fps = vr.get_avg_fps()
            if 'segment' in qa_item.keys():
                # convert the segment's start/end (seconds) to frame indices for dimension 11
                start_frame = int(min(max(start * fps, 0), video_len - 1))
                end_frame = int(min(max(end * fps, 0), video_len - 1))
                tot_frames = int(end_frame - start_frame)
                offset = get_index(tot_frames, n_frames)
                frame_indices = offset + start_frame
            else:
                # sample frames across the whole video for evaluation dimension 10
                frame_indices = get_index(video_len - 1, n_frames)
            vr.seek(0)
            images = vr.get_batch(frame_indices).asnumpy()
        except Exception as e:
            # keep failed question ids in the output so downstream scoring can spot them
            print(json.dumps({
                'question_id': qa_item['question_id'],
                'prompt': 'Error: ' + str(e),
                'answer': qa_item['answer'],
            }), file=fout)
            continue

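    # write the sampled frames to disk and build the multi-image prompt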
    prompt = ''
    for i in range(images.shape[0]):
        data = Image.fromarray(images[i])
        img_path = 'video_imgs_{}/{}_{}.jpg'.format(n_frames, qa_item['question_id'], i)
        data.save(img_path)
        prompt += '<img>' + img_path + '</img>\n'

    choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
    choice_list = []
    for i, c in enumerate(choices):
        choice_list.append('{}. {}'.format(chr(i + 65), c))
    choice_txt = '\n'.join(choice_list)

    prompt += 'Question: {}\nOptions: {}\nAnswer:'.format(qa_item['question'], choice_txt)
    print(json.dumps({
        'question_id': qa_item['question_id'],
        'prompt': prompt,
        'answer': qa_item['answer'],
    }), file=fout)
fout.close()
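
# A minimal sketch of how the generated JSONL might be consumed downstream
# (hypothetical `model.chat` stands in for whatever model is being evaluated):
#
#   with open('image_input.jsonl') as f:
#       for line in f:
#           item = json.loads(line)
#           pred = model.chat(item['prompt'])          # e.g. returns 'A'
#           hit = pred.strip().startswith(item['answer'])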