{
  "results": {
    "hendrycksTest-abstract_algebra": {
      "acc": 0.26,
      "acc_stderr": 0.044084400227680814,
      "acc_norm": 0.26,
      "acc_norm_stderr": 0.044084400227680814
    },
    "hendrycksTest-anatomy": {
      "acc": 0.23703703703703705,
      "acc_stderr": 0.03673731683969506,
      "acc_norm": 0.23703703703703705,
      "acc_norm_stderr": 0.03673731683969506
    },
    "hendrycksTest-astronomy": {
      "acc": 0.25,
      "acc_stderr": 0.03523807393012047,
      "acc_norm": 0.25,
      "acc_norm_stderr": 0.03523807393012047
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.31,
      "acc_stderr": 0.04648231987117316,
      "acc_norm": 0.31,
      "acc_norm_stderr": 0.04648231987117316
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.22641509433962265,
      "acc_stderr": 0.02575755989310675,
      "acc_norm": 0.22641509433962265,
      "acc_norm_stderr": 0.02575755989310675
    },
    "hendrycksTest-college_biology": {
      "acc": 0.2708333333333333,
      "acc_stderr": 0.037161774375660185,
      "acc_norm": 0.2708333333333333,
      "acc_norm_stderr": 0.037161774375660185
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.28,
      "acc_stderr": 0.045126085985421276,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.045126085985421276
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.42,
      "acc_stderr": 0.04960449637488583,
      "acc_norm": 0.42,
      "acc_norm_stderr": 0.04960449637488583
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.31,
      "acc_stderr": 0.04648231987117316,
      "acc_norm": 0.31,
      "acc_norm_stderr": 0.04648231987117316
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.2543352601156069,
      "acc_stderr": 0.0332055644308557,
      "acc_norm": 0.2543352601156069,
      "acc_norm_stderr": 0.0332055644308557
    },
    "hendrycksTest-college_physics": {
      "acc": 0.21568627450980393,
      "acc_stderr": 0.04092563958237656,
      "acc_norm": 0.21568627450980393,
      "acc_norm_stderr": 0.04092563958237656
    },
    "hendrycksTest-computer_security": {
      "acc": 0.32,
      "acc_stderr": 0.04688261722621504,
      "acc_norm": 0.32,
      "acc_norm_stderr": 0.04688261722621504
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.2978723404255319,
      "acc_stderr": 0.029896145682095462,
      "acc_norm": 0.2978723404255319,
      "acc_norm_stderr": 0.029896145682095462
    },
    "hendrycksTest-econometrics": {
      "acc": 0.2631578947368421,
      "acc_stderr": 0.04142439719489361,
      "acc_norm": 0.2631578947368421,
      "acc_norm_stderr": 0.04142439719489361
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.2413793103448276,
      "acc_stderr": 0.03565998174135303,
      "acc_norm": 0.2413793103448276,
      "acc_norm_stderr": 0.03565998174135303
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.2724867724867725,
      "acc_stderr": 0.022930973071633356,
      "acc_norm": 0.2724867724867725,
      "acc_norm_stderr": 0.022930973071633356
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.30952380952380953,
      "acc_stderr": 0.04134913018303316,
      "acc_norm": 0.30952380952380953,
      "acc_norm_stderr": 0.04134913018303316
    },
    "hendrycksTest-global_facts": {
      "acc": 0.28,
      "acc_stderr": 0.045126085985421276,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.045126085985421276
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.22258064516129034,
      "acc_stderr": 0.023664216671642528,
      "acc_norm": 0.22258064516129034,
      "acc_norm_stderr": 0.023664216671642528
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.18719211822660098,
      "acc_stderr": 0.027444924966882618,
      "acc_norm": 0.18719211822660098,
      "acc_norm_stderr": 0.027444924966882618
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.25,
      "acc_stderr": 0.04351941398892446,
      "acc_norm": 0.25,
      "acc_norm_stderr": 0.04351941398892446
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.296969696969697,
      "acc_stderr": 0.035679697722680474,
      "acc_norm": 0.296969696969697,
      "acc_norm_stderr": 0.035679697722680474
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.19696969696969696,
      "acc_stderr": 0.028335609732463355,
      "acc_norm": 0.19696969696969696,
      "acc_norm_stderr": 0.028335609732463355
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.23316062176165803,
      "acc_stderr": 0.030516111371476008,
      "acc_norm": 0.23316062176165803,
      "acc_norm_stderr": 0.030516111371476008
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.21025641025641026,
      "acc_stderr": 0.020660597485026935,
      "acc_norm": 0.21025641025641026,
      "acc_norm_stderr": 0.020660597485026935
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.21851851851851853,
      "acc_stderr": 0.025195752251823786,
      "acc_norm": 0.21851851851851853,
      "acc_norm_stderr": 0.025195752251823786
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.27310924369747897,
      "acc_stderr": 0.02894200404099817,
      "acc_norm": 0.27310924369747897,
      "acc_norm_stderr": 0.02894200404099817
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.26490066225165565,
      "acc_stderr": 0.03603038545360384,
      "acc_norm": 0.26490066225165565,
      "acc_norm_stderr": 0.03603038545360384
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.23119266055045873,
      "acc_stderr": 0.018075750241633156,
      "acc_norm": 0.23119266055045873,
      "acc_norm_stderr": 0.018075750241633156
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.22685185185185186,
      "acc_stderr": 0.028561650102422256,
      "acc_norm": 0.22685185185185186,
      "acc_norm_stderr": 0.028561650102422256
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.2696078431372549,
      "acc_stderr": 0.031145570659486782,
      "acc_norm": 0.2696078431372549,
      "acc_norm_stderr": 0.031145570659486782
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.28270042194092826,
      "acc_stderr": 0.029312814153955917,
      "acc_norm": 0.28270042194092826,
      "acc_norm_stderr": 0.029312814153955917
    },
    "hendrycksTest-human_aging": {
      "acc": 0.32286995515695066,
      "acc_stderr": 0.03138147637575498,
      "acc_norm": 0.32286995515695066,
      "acc_norm_stderr": 0.03138147637575498
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.3435114503816794,
      "acc_stderr": 0.041649760719448786,
      "acc_norm": 0.3435114503816794,
      "acc_norm_stderr": 0.041649760719448786
    },
    "hendrycksTest-international_law": {
      "acc": 0.2727272727272727,
      "acc_stderr": 0.04065578140908705,
      "acc_norm": 0.2727272727272727,
      "acc_norm_stderr": 0.04065578140908705
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.26851851851851855,
      "acc_stderr": 0.04284467968052192,
      "acc_norm": 0.26851851851851855,
      "acc_norm_stderr": 0.04284467968052192
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.2147239263803681,
      "acc_stderr": 0.03226219377286774,
      "acc_norm": 0.2147239263803681,
      "acc_norm_stderr": 0.03226219377286774
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.25892857142857145,
      "acc_stderr": 0.04157751539865629,
      "acc_norm": 0.25892857142857145,
      "acc_norm_stderr": 0.04157751539865629
    },
    "hendrycksTest-management": {
      "acc": 0.18446601941747573,
      "acc_stderr": 0.03840423627288276,
      "acc_norm": 0.18446601941747573,
      "acc_norm_stderr": 0.03840423627288276
    },
    "hendrycksTest-marketing": {
      "acc": 0.3076923076923077,
      "acc_stderr": 0.030236389942173106,
      "acc_norm": 0.3076923076923077,
      "acc_norm_stderr": 0.030236389942173106
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.44,
      "acc_stderr": 0.04988876515698589,
      "acc_norm": 0.44,
      "acc_norm_stderr": 0.04988876515698589
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.26947637292464877,
      "acc_stderr": 0.015866243073215068,
      "acc_norm": 0.26947637292464877,
      "acc_norm_stderr": 0.015866243073215068
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.30346820809248554,
      "acc_stderr": 0.024752411960917212,
      "acc_norm": 0.30346820809248554,
      "acc_norm_stderr": 0.024752411960917212
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.23798882681564246,
      "acc_stderr": 0.014242630070574915,
      "acc_norm": 0.23798882681564246,
      "acc_norm_stderr": 0.014242630070574915
    },
    "hendrycksTest-nutrition": {
      "acc": 0.2549019607843137,
      "acc_stderr": 0.024954184324879905,
      "acc_norm": 0.2549019607843137,
      "acc_norm_stderr": 0.024954184324879905
    },
    "hendrycksTest-philosophy": {
      "acc": 0.2990353697749196,
      "acc_stderr": 0.026003301117885142,
      "acc_norm": 0.2990353697749196,
      "acc_norm_stderr": 0.026003301117885142
    },
    "hendrycksTest-prehistory": {
      "acc": 0.3117283950617284,
      "acc_stderr": 0.02577311116963045,
      "acc_norm": 0.3117283950617284,
      "acc_norm_stderr": 0.02577311116963045
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.2624113475177305,
      "acc_stderr": 0.026244920349843007,
      "acc_norm": 0.2624113475177305,
      "acc_norm_stderr": 0.026244920349843007
    },
    "hendrycksTest-professional_law": {
      "acc": 0.2803129074315515,
      "acc_stderr": 0.01147155594495862,
      "acc_norm": 0.2803129074315515,
      "acc_norm_stderr": 0.01147155594495862
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.19852941176470587,
      "acc_stderr": 0.0242310133705411,
      "acc_norm": 0.19852941176470587,
      "acc_norm_stderr": 0.0242310133705411
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.3088235294117647,
      "acc_stderr": 0.01869085027359529,
      "acc_norm": 0.3088235294117647,
      "acc_norm_stderr": 0.01869085027359529
    },
    "hendrycksTest-public_relations": {
      "acc": 0.3181818181818182,
      "acc_stderr": 0.04461272175910508,
      "acc_norm": 0.3181818181818182,
      "acc_norm_stderr": 0.04461272175910508
    },
    "hendrycksTest-security_studies": {
      "acc": 0.21224489795918366,
      "acc_stderr": 0.026176967197866767,
      "acc_norm": 0.21224489795918366,
      "acc_norm_stderr": 0.026176967197866767
    },
    "hendrycksTest-sociology": {
      "acc": 0.2736318407960199,
      "acc_stderr": 0.031524391865554,
      "acc_norm": 0.2736318407960199,
      "acc_norm_stderr": 0.031524391865554
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.28,
      "acc_stderr": 0.04512608598542128,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.04512608598542128
    },
    "hendrycksTest-virology": {
      "acc": 0.30120481927710846,
      "acc_stderr": 0.035716092300534796,
      "acc_norm": 0.30120481927710846,
      "acc_norm_stderr": 0.035716092300534796
    },
    "hendrycksTest-world_religions": {
      "acc": 0.38596491228070173,
      "acc_stderr": 0.03733756969066164,
      "acc_norm": 0.38596491228070173,
      "acc_norm_stderr": 0.03733756969066164
    }
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 1,
    "hendrycksTest-anatomy": 1,
    "hendrycksTest-astronomy": 1,
    "hendrycksTest-business_ethics": 1,
    "hendrycksTest-clinical_knowledge": 1,
    "hendrycksTest-college_biology": 1,
    "hendrycksTest-college_chemistry": 1,
    "hendrycksTest-college_computer_science": 1,
    "hendrycksTest-college_mathematics": 1,
    "hendrycksTest-college_medicine": 1,
    "hendrycksTest-college_physics": 1,
    "hendrycksTest-computer_security": 1,
    "hendrycksTest-conceptual_physics": 1,
    "hendrycksTest-econometrics": 1,
    "hendrycksTest-electrical_engineering": 1,
    "hendrycksTest-elementary_mathematics": 1,
    "hendrycksTest-formal_logic": 1,
    "hendrycksTest-global_facts": 1,
    "hendrycksTest-high_school_biology": 1,
    "hendrycksTest-high_school_chemistry": 1,
    "hendrycksTest-high_school_computer_science": 1,
    "hendrycksTest-high_school_european_history": 1,
    "hendrycksTest-high_school_geography": 1,
    "hendrycksTest-high_school_government_and_politics": 1,
    "hendrycksTest-high_school_macroeconomics": 1,
    "hendrycksTest-high_school_mathematics": 1,
    "hendrycksTest-high_school_microeconomics": 1,
    "hendrycksTest-high_school_physics": 1,
    "hendrycksTest-high_school_psychology": 1,
    "hendrycksTest-high_school_statistics": 1,
    "hendrycksTest-high_school_us_history": 1,
    "hendrycksTest-high_school_world_history": 1,
    "hendrycksTest-human_aging": 1,
    "hendrycksTest-human_sexuality": 1,
    "hendrycksTest-international_law": 1,
    "hendrycksTest-jurisprudence": 1,
    "hendrycksTest-logical_fallacies": 1,
    "hendrycksTest-machine_learning": 1,
    "hendrycksTest-management": 1,
    "hendrycksTest-marketing": 1,
    "hendrycksTest-medical_genetics": 1,
    "hendrycksTest-miscellaneous": 1,
    "hendrycksTest-moral_disputes": 1,
    "hendrycksTest-moral_scenarios": 1,
    "hendrycksTest-nutrition": 1,
    "hendrycksTest-philosophy": 1,
    "hendrycksTest-prehistory": 1,
    "hendrycksTest-professional_accounting": 1,
    "hendrycksTest-professional_law": 1,
    "hendrycksTest-professional_medicine": 1,
    "hendrycksTest-professional_psychology": 1,
    "hendrycksTest-public_relations": 1,
    "hendrycksTest-security_studies": 1,
    "hendrycksTest-sociology": 1,
    "hendrycksTest-us_foreign_policy": 1,
    "hendrycksTest-virology": 1,
    "hendrycksTest-world_religions": 1
  },
  "config": {
    "model": "hf-causal",
    "model_args": "pretrained=workdir_7b/ckpt_349",
    "num_fewshot": 5,
    "batch_size": "8",
    "batch_sizes": [],
    "device": null,
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}