import argparse
from collections import defaultdict
import json
from functools import partial

import codecritic.evaluation.metric as metric
from codecritic.utils.json import load_jsonl

def eval(scores):
    """Compute pass@k and top@k results for one group of score records.

    NOTE: the name shadows the builtin ``eval``; it is kept unchanged so
    existing callers of this function keep working.

    Args:
        scores: Sequence of score records, indexable by string key
            (presumably the dicts loaded from the JSONL score file --
            confirm against the caller). Only the first record is inspected
            to decide whether the records carry a ``"negative_score"`` key.

    Returns:
        A list holding, in order, whatever ``metric.pass_at_k`` and
        ``metric.top_at_k`` yield for k = 1..16. The list is empty when
        ``scores`` is empty.
    """
    # Nothing to evaluate. This also guards the ``scores[0]`` probe below,
    # which would otherwise raise IndexError on an empty list.
    if not scores:
        return []

    ks = list(range(1, 17))

    results = []
    results.extend(metric.pass_at_k(scores, ks))
    # Disabled experiment: pass@50.
    # results.extend(metric.pass_at_k(scores, [50]))
    results.extend(metric.top_at_k(scores, ks, metric.positive_only))

    # The first record is treated as representative of the whole group.
    if "negative_score" in scores[0]:
        # NOTE(review): "postive" is the attribute name this file references;
        # confirm against the metric module before correcting the spelling.
        results.extend(metric.top_at_k(scores, ks, metric.postive_and_negative))

        # Disabled experiment: sweep the uncertainty-filter threshold.
        # for i in range(4):
        #     threshold = 0.5 + i * 0.1
        #     score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
        #     results.extend(metric.top_at_k(scores, ks, score_func))

    return results


def main():
    """Evaluate a JSONL score file per dataset and print the results.

    Reads the file given by ``--score``, groups its records by their
    ``"dataset"`` key, runs ``eval`` on each group, tags every result with
    the dataset it came from, and prints each result as one JSON line on
    stdout.
    """
    parser = argparse.ArgumentParser()
    # Required: without it ``args.score`` is None and loading fails with an
    # obscure error instead of a usage message.
    parser.add_argument("--score", type=str, required=True, help="path/to/score")
    args = parser.parse_args()

    scores = load_jsonl(args.score)

    # Group the records by dataset so each dataset is evaluated separately.
    groups = defaultdict(list)
    for item in scores:
        groups[item["dataset"]].append(item)

    for dataset, lst in groups.items():
        for r in eval(lst):
            r["dataset"] = dataset
            print(json.dumps(r))


if __name__ == "__main__":
    main()
