def mk_key_for_sort(item):
    """Build the sort key ``(prefix_weight, index)`` for a dataset item.

    ``item['problem_id']`` is split on ``'_'`` into exactly two parts
    (any other number of parts raises ``ValueError`` when unpacking).
    The first part maps to weight 0 when it equals ``'train'`` and to 1
    otherwise; the second part is converted with ``int``.
    """
    split_name, number = item['problem_id'].split('_')
    if split_name == 'train':
        weight = 0
    else:
        weight = 1
    return weight, int(number)


def sort_and_drop_key(dataset, key):
    """Return the items of `dataset` ordered by ``item[key]``, with `key` removed.

    A new list is returned, so the order of the input sequence itself is
    left alone. The items are the same objects as in the input, however,
    and `key` is popped from each of them in place.
    """
    ordered = list(dataset)
    ordered.sort(key=lambda entry: entry[key])
    for entry in ordered:
        entry.pop(key)
    return ordered


# Half-open ``[start, end)`` index ranges checked by `is_in_test_range`.
TEST_RANGES = [(0, 300), (3000, 3100), (4000, 4100)]


def is_in_test_range(prefix_weight, idx):
    """Return True when `prefix_weight` is 1 and `idx` lies in a TEST_RANGES span.

    Each span is half-open: ``start`` is included, ``end`` is excluded.
    Any `prefix_weight` other than 1 yields False regardless of `idx`.
    """
    if prefix_weight != 1:
        return False
    return any(lo <= idx < hi for lo, hi in TEST_RANGES)


def sort_and_split_dataset(dataset, n):
    """
    Split `dataset` into sorted train, test and minimal-test lists.

    Every item is keyed with `mk_key_for_sort`; items for which
    `is_in_test_range` accepts that key go to the test list, all others
    to the train list. Both lists are then sorted by that key.

    The sorted test list is cut into consecutive chunks of `n` items.
    A chunk is copied into the minimal test list only when at least one
    of its items has a truthy ``"eval_result"``.
    NOTE(review): this assumes each problem contributes exactly `n`
    adjacent items after sorting; only the total length is checked.

    Args:
        dataset: iterable of dict-like items with a ``"problem_id"`` entry;
            test items must also carry ``"eval_result"``.
        n: number of consecutive test items that make up one chunk.

    Returns:
        A tuple ``(new_train, new_test, minimal_test)`` of lists.

    Raises:
        AssertionError: if the number of test items is not a multiple of `n`.
        ZeroDivisionError: if `n` is 0.

    Side effects:
        A temporary ``"key_for_sort"`` entry is written to every item and
        removed again before returning; any pre-existing entry under that
        name is therefore lost.
    """
    new_train, new_test = [], []
    for item in dataset:
        # Temporary sort key; `sort_and_drop_key` strips it again below.
        item["key_for_sort"] = mk_key_for_sort(item)
        if is_in_test_range(*item["key_for_sort"]):
            new_test.append(item)
        else:
            new_train.append(item)

    new_train = sort_and_drop_key(new_train, "key_for_sort")
    new_test = sort_and_drop_key(new_test, "key_for_sort")

    # Raised explicitly instead of via `assert`, so the check still runs
    # under `python -O`; otherwise a trailing partial chunk would be
    # silently left out of the minimal test set.
    if len(new_test) % n != 0:
        raise AssertionError(
            f"test set size {len(new_test)} is not a multiple of n={n}"
        )

    minimal_test = []
    for start in range(0, len(new_test), n):
        problem = new_test[start:start + n]
        if any(d["eval_result"] for d in problem):
            minimal_test.extend(problem)

    return new_train, new_test, minimal_test
