Skip to content

GH Task Runner (Single) #115

GH Task Runner (Single)

GH Task Runner (Single) #115

name: GH Task Runner (Single)
on:
workflow_dispatch:
inputs:
run_task:
description: 'Task to run'
required: true
default: 'anli'
type: choice
options:
- advanced_ai_risk
- advanced_ai_risk_fewshot-coordinate-itself
- advanced_ai_risk_fewshot-coordinate-other-ais
- advanced_ai_risk_fewshot-coordinate-other-versions
- advanced_ai_risk_fewshot-corrigible-less-HHH
- advanced_ai_risk_fewshot-corrigible-more-HHH
- advanced_ai_risk_fewshot-corrigible-neutral-HHH
- advanced_ai_risk_fewshot-myopic-reward
- advanced_ai_risk_fewshot-one-box-tendency
- advanced_ai_risk_fewshot-power-seeking-inclination
- advanced_ai_risk_fewshot-self-awareness-general-ai
- advanced_ai_risk_fewshot-self-awareness-good-text-model
- advanced_ai_risk_fewshot-self-awareness-text-model
- advanced_ai_risk_fewshot-self-awareness-training-architecture
- advanced_ai_risk_fewshot-self-awareness-training-web-gpt
- advanced_ai_risk_fewshot-survival-instinct
- advanced_ai_risk_fewshot-wealth-seeking-inclination
- advanced_ai_risk_human-coordinate-itself
- advanced_ai_risk_human-coordinate-other-ais
- advanced_ai_risk_human-coordinate-other-versions
- advanced_ai_risk_human-corrigible-less-HHH
- advanced_ai_risk_human-corrigible-more-HHH
- advanced_ai_risk_human-corrigible-neutral-HHH
- advanced_ai_risk_human-myopic-reward
- advanced_ai_risk_human-one-box-tendency
- advanced_ai_risk_human-power-seeking-inclination
- advanced_ai_risk_human-self-awareness-general-ai
- advanced_ai_risk_human-self-awareness-good-text-model
- advanced_ai_risk_human-self-awareness-text-model
- advanced_ai_risk_human-self-awareness-training-architecture
- advanced_ai_risk_human-self-awareness-web-gpt
- advanced_ai_risk_human-survival-instinct
- advanced_ai_risk_human-wealth-seeking-inclination
- advanced_ai_risk_lm-coordinate-itself
- advanced_ai_risk_lm-coordinate-other-ais
- advanced_ai_risk_lm-coordinate-other-versions
- advanced_ai_risk_lm-corrigible-less-HHH
- advanced_ai_risk_lm-corrigible-more-HHH
- advanced_ai_risk_lm-corrigible-neutral-HHH
- advanced_ai_risk_lm-myopic-reward
- advanced_ai_risk_lm-one-box-tendency
- advanced_ai_risk_lm-power-seeking-inclination
- advanced_ai_risk_lm-self-awareness-general-ai
- advanced_ai_risk_lm-self-awareness-good-text-model
- advanced_ai_risk_lm-self-awareness-text-model
- advanced_ai_risk_lm-self-awareness-training-architecture
- advanced_ai_risk_lm-self-awareness-training-nn-architecture
- advanced_ai_risk_lm-self-awareness-training-web-gpt
- advanced_ai_risk_lm-survival-instinct
- advanced_ai_risk_lm-wealth-seeking-inclination
- ai2_arc
- anagrams1
- anagrams2
- anli
- anli_r1
- anli_r2
- anli_r3
- arc_challenge
- arc_easy
- arithmetic
- arithmetic_1dc
- arithmetic_2da
- arithmetic_2dm
- arithmetic_2ds
- arithmetic_3da
- arithmetic_3ds
- arithmetic_4da
- arithmetic_4ds
- arithmetic_5da
- arithmetic_5ds
- asdiv
- babi
- bbh
- bbh_cot_fewshot
- bbh_cot_fewshot_boolean_expressions
- bbh_cot_fewshot_causal_judgement
- bbh_cot_fewshot_date_understanding
- bbh_cot_fewshot_disambiguation_qa
- bbh_cot_fewshot_dyck_languages
- bbh_cot_fewshot_formal_fallacies
- bbh_cot_fewshot_geometric_shapes
- bbh_cot_fewshot_hyperbaton
- bbh_cot_fewshot_logical_deduction_five_objects
- bbh_cot_fewshot_logical_deduction_seven_objects
- bbh_cot_fewshot_logical_deduction_three_objects
- bbh_cot_fewshot_movie_recommendation
- bbh_cot_fewshot_multistep_arithmetic_two
- bbh_cot_fewshot_navigate
- bbh_cot_fewshot_object_counting
- bbh_cot_fewshot_penguins_in_a_table
- bbh_cot_fewshot_reasoning_about_colored_objects
- bbh_cot_fewshot_ruin_names
- bbh_cot_fewshot_salient_translation_error_detection
- bbh_cot_fewshot_snarks
- bbh_cot_fewshot_sports_understanding
- bbh_cot_fewshot_temporal_sequences
- bbh_cot_fewshot_tracking_shuffled_objects_five_objects
- bbh_cot_fewshot_tracking_shuffled_objects_seven_objects
- bbh_cot_fewshot_tracking_shuffled_objects_three_objects
- bbh_cot_fewshot_web_of_lies
- bbh_cot_fewshot_word_sorting
- bbh_cot_zeroshot
- bbh_cot_zeroshot_boolean_expressions
- bbh_cot_zeroshot_causal_judgement
- bbh_cot_zeroshot_date_understanding
- bbh_cot_zeroshot_disambiguation_qa
- bbh_cot_zeroshot_dyck_languages
- bbh_cot_zeroshot_formal_fallacies
- bbh_cot_zeroshot_geometric_shapes
- bbh_cot_zeroshot_hyperbaton
- bbh_cot_zeroshot_logical_deduction_five_objects
- bbh_cot_zeroshot_logical_deduction_seven_objects
- bbh_cot_zeroshot_logical_deduction_three_objects
- bbh_cot_zeroshot_movie_recommendation
- bbh_cot_zeroshot_multistep_arithmetic_two
- bbh_cot_zeroshot_navigate
- bbh_cot_zeroshot_object_counting
- bbh_cot_zeroshot_penguins_in_a_table
- bbh_cot_zeroshot_reasoning_about_colored_objects
- bbh_cot_zeroshot_ruin_names
- bbh_cot_zeroshot_salient_translation_error_detection
- bbh_cot_zeroshot_snarks
- bbh_cot_zeroshot_sports_understanding
- bbh_cot_zeroshot_temporal_sequences
- bbh_cot_zeroshot_tracking_shuffled_objects_five_objects
- bbh_cot_zeroshot_tracking_shuffled_objects_seven_objects
- bbh_cot_zeroshot_tracking_shuffled_objects_three_objects
- bbh_cot_zeroshot_web_of_lies
- bbh_cot_zeroshot_word_sorting
- bbh_fewshot
- bbh_fewshot_boolean_expressions
- bbh_fewshot_causal_judgement
- bbh_fewshot_date_understanding
- bbh_fewshot_disambiguation_qa
- bbh_fewshot_dyck_languages
- bbh_fewshot_formal_fallacies
- bbh_fewshot_geometric_shapes
- bbh_fewshot_hyperbaton
- bbh_fewshot_logical_deduction_five_objects
- bbh_fewshot_logical_deduction_seven_objects
- bbh_fewshot_logical_deduction_three_objects
- bbh_fewshot_movie_recommendation
- bbh_fewshot_multistep_arithmetic_two
- bbh_fewshot_navigate
- bbh_fewshot_object_counting
- bbh_fewshot_penguins_in_a_table
- bbh_fewshot_reasoning_about_colored_objects
- bbh_fewshot_ruin_names
- bbh_fewshot_salient_translation_error_detection
- bbh_fewshot_snarks
- bbh_fewshot_sports_understanding
- bbh_fewshot_temporal_sequences
- bbh_fewshot_tracking_shuffled_objects_five_objects
- bbh_fewshot_tracking_shuffled_objects_seven_objects
- bbh_fewshot_tracking_shuffled_objects_three_objects
- bbh_fewshot_web_of_lies
- bbh_fewshot_word_sorting
- bbh_zeroshot
- bbh_zeroshot_boolean_expressions
- bbh_zeroshot_causal_judgement
- bbh_zeroshot_date_understanding
- bbh_zeroshot_disambiguation_qa
- bbh_zeroshot_dyck_languages
- bbh_zeroshot_formal_fallacies
- bbh_zeroshot_geometric_shapes
- bbh_zeroshot_hyperbaton
- bbh_zeroshot_logical_deduction_five_objects
- bbh_zeroshot_logical_deduction_seven_objects
- bbh_zeroshot_logical_deduction_three_objects
- bbh_zeroshot_movie_recommendation
- bbh_zeroshot_multistep_arithmetic_two
- bbh_zeroshot_navigate
- bbh_zeroshot_object_counting
- bbh_zeroshot_penguins_in_a_table
- bbh_zeroshot_reasoning_about_colored_objects
- bbh_zeroshot_ruin_names
- bbh_zeroshot_salient_translation_error_detection
- bbh_zeroshot_snarks
- bbh_zeroshot_sports_understanding
- bbh_zeroshot_temporal_sequences
- bbh_zeroshot_tracking_shuffled_objects_five_objects
- bbh_zeroshot_tracking_shuffled_objects_seven_objects
- bbh_zeroshot_tracking_shuffled_objects_three_objects
- bbh_zeroshot_web_of_lies
- bbh_zeroshot_word_sorting
- belebele
- belebele_acm_Arab
- belebele_afr_Latn
- belebele_als_Latn
- belebele_amh_Ethi
- belebele_apc_Arab
- belebele_arb_Arab
- belebele_arb_Latn
- belebele_ars_Arab
- belebele_ary_Arab
- belebele_arz_Arab
- belebele_asm_Beng
- belebele_azj_Latn
- belebele_bam_Latn
- belebele_ben_Beng
- belebele_ben_Latn
- belebele_bod_Tibt
- belebele_bul_Cyrl
- belebele_cat_Latn
- belebele_ceb_Latn
- belebele_ces_Latn
- belebele_ckb_Arab
- belebele_dan_Latn
- belebele_deu_Latn
- belebele_ell_Grek
- belebele_eng_Latn
- belebele_est_Latn
- belebele_eus_Latn
- belebele_fin_Latn
- belebele_fra_Latn
- belebele_fuv_Latn
- belebele_gaz_Latn
- belebele_grn_Latn
- belebele_guj_Gujr
- belebele_hat_Latn
- belebele_hau_Latn
- belebele_heb_Hebr
- belebele_hin_Deva
- belebele_hin_Latn
- belebele_hrv_Latn
- belebele_hun_Latn
- belebele_hye_Armn
- belebele_ibo_Latn
- belebele_ilo_Latn
- belebele_ind_Latn
- belebele_isl_Latn
- belebele_ita_Latn
- belebele_jav_Latn
- belebele_jpn_Jpan
- belebele_kac_Latn
- belebele_kan_Knda
- belebele_kat_Geor
- belebele_kaz_Cyrl
- belebele_kea_Latn
- belebele_khk_Cyrl
- belebele_khm_Khmr
- belebele_kin_Latn
- belebele_kir_Cyrl
- belebele_kor_Hang
- belebele_lao_Laoo
- belebele_lin_Latn
- belebele_lit_Latn
- belebele_lug_Latn
- belebele_luo_Latn
- belebele_lvs_Latn
- belebele_mal_Mlym
- belebele_mar_Deva
- belebele_mkd_Cyrl
- belebele_mlt_Latn
- belebele_mri_Latn
- belebele_mya_Mymr
- belebele_nld_Latn
- belebele_nob_Latn
- belebele_npi_Deva
- belebele_npi_Latn
- belebele_nso_Latn
- belebele_nya_Latn
- belebele_ory_Orya
- belebele_pan_Guru
- belebele_pbt_Arab
- belebele_pes_Arab
- belebele_plt_Latn
- belebele_pol_Latn
- belebele_por_Latn
- belebele_ron_Latn
- belebele_rus_Cyrl
- belebele_shn_Mymr
- belebele_sin_Latn
- belebele_sin_Sinh
- belebele_slk_Latn
- belebele_slv_Latn
- belebele_sna_Latn
- belebele_snd_Arab
- belebele_som_Latn
- belebele_sot_Latn
- belebele_spa_Latn
- belebele_srp_Cyrl
- belebele_ssw_Latn
- belebele_sun_Latn
- belebele_swe_Latn
- belebele_swh_Latn
- belebele_tam_Taml
- belebele_tel_Telu
- belebele_tgk_Cyrl
- belebele_tgl_Latn
- belebele_tha_Thai
- belebele_tir_Ethi
- belebele_tsn_Latn
- belebele_tso_Latn
- belebele_tur_Latn
- belebele_ukr_Cyrl
- belebele_urd_Arab
- belebele_urd_Latn
- belebele_uzn_Latn
- belebele_vie_Latn
- belebele_war_Latn
- belebele_wol_Latn
- belebele_xho_Latn
- belebele_yor_Latn
- belebele_zho_Hans
- belebele_zho_Hant
- belebele_zsm_Latn
- belebele_zul_Latn
- bigbench_abstract_narrative_understanding_generate_until
- bigbench_abstract_narrative_understanding_multiple_choice
- bigbench_anachronisms_generate_until
- bigbench_anachronisms_multiple_choice
- bigbench_analogical_similarity_generate_until
- bigbench_analogical_similarity_multiple_choice
- bigbench_analytic_entailment_generate_until
- bigbench_analytic_entailment_multiple_choice
- bigbench_arithmetic_generate_until
- bigbench_arithmetic_multiple_choice
- bigbench_ascii_word_recognition_generate_until
- bigbench_ascii_word_recognition_multiple_choice
- bigbench_authorship_verification_generate_until
- bigbench_authorship_verification_multiple_choice
- bigbench_auto_categorization_generate_until
- bigbench_auto_categorization_multiple_choice
- bigbench_auto_debugging_generate_until
- bigbench_auto_debugging_multiple_choice
- bigbench_bbq_lite_json_generate_until
- bigbench_bbq_lite_json_multiple_choice
- bigbench_bridging_anaphora_resolution_barqa_generate_until
- bigbench_bridging_anaphora_resolution_barqa_multiple_choice
- bigbench_causal_judgement_multiple_choice
- bigbench_causal_judgment_generate_until
- bigbench_causal_judgment_multiple_choice
- bigbench_cause_and_effect_generate_until
- bigbench_cause_and_effect_multiple_choice
- bigbench_checkmate_in_one_generate_until
- bigbench_checkmate_in_one_multiple_choice
- bigbench_chess_state_tracking_generate_until
- bigbench_chess_state_tracking_multiple_choice
- bigbench_chinese_remainder_theorem_generate_until
- bigbench_chinese_remainder_theorem_multiple_choice
- bigbench_cifar10_classification_generate_until
- bigbench_cifar10_classification_multiple_choice
- bigbench_code_line_description_generate_until
- bigbench_code_line_description_multiple_choice
- bigbench_codenames_generate_until
- bigbench_codenames_multiple_choice
- bigbench_color_generate_until
- bigbench_color_multiple_choice
- bigbench_common_morpheme_generate_until
- bigbench_common_morpheme_multiple_choice
- bigbench_conceptual_combinations_generate_until
- bigbench_conceptual_combinations_multiple_choice
- bigbench_conlang_translation_generate_until
- bigbench_conlang_translation_multiple_choice
- bigbench_contextual_parametric_knowledge_conflicts_generate_until
- bigbench_contextual_parametric_knowledge_conflicts_multiple_choice
- bigbench_crash_blossom_generate_until
- bigbench_crash_blossom_multiple_choice
- bigbench_crass_ai_generate_until
- bigbench_crass_ai_multiple_choice
- bigbench_cryobiology_spanish_generate_until
- bigbench_cryobiology_spanish_multiple_choice
- bigbench_cryptonite_generate_until
- bigbench_cryptonite_multiple_choice
- bigbench_cs_algorithms_generate_until
- bigbench_cs_algorithms_multiple_choice
- bigbench_dark_humor_detection_generate_until
- bigbench_dark_humor_detection_multiple_choice
- bigbench_date_understanding_generate_until
- bigbench_date_understanding_multiple_choice
- bigbench_disambiguation_qa_generate_until
- bigbench_disambiguation_qa_multiple_choice
- bigbench_discourse_marker_prediction_generate_until
- bigbench_discourse_marker_prediction_multiple_choice
- bigbench_disfl_qa_generate_until
- bigbench_disfl_qa_multiple_choice
- bigbench_dyck_languages_generate_until
- bigbench_dyck_languages_multiple_choice
- bigbench_elementary_math_qa_generate_until
- bigbench_elementary_math_qa_multiple_choice
- bigbench_emoji_movie_generate_until
- bigbench_emoji_movie_multiple_choice
- bigbench_emojis_emotion_prediction_generate_until
- bigbench_emojis_emotion_prediction_multiple_choice
- bigbench_empirical_judgments_generate_until
- bigbench_empirical_judgments_multiple_choice
- bigbench_english_proverbs_generate_until
- bigbench_english_proverbs_multiple_choice
- bigbench_english_russian_proverbs_generate_until
- bigbench_english_russian_proverbs_multiple_choice
- bigbench_entailed_polarity_generate_until
- bigbench_entailed_polarity_hindi_generate_until
- bigbench_entailed_polarity_hindi_multiple_choice
- bigbench_entailed_polarity_multiple_choice
- bigbench_epistemic_reasoning_generate_until
- bigbench_epistemic_reasoning_multiple_choice
- bigbench_evaluating_information_essentiality_generate_until
- bigbench_evaluating_information_essentiality_multiple_choice
- bigbench_fact_checker_generate_until
- bigbench_fact_checker_multiple_choice
- bigbench_fantasy_reasoning_generate_until
- bigbench_fantasy_reasoning_multiple_choice
- bigbench_few_shot_nlg_generate_until
- bigbench_few_shot_nlg_multiple_choice
- bigbench_figure_of_speech_detection_generate_until
- bigbench_figure_of_speech_detection_multiple_choice
- bigbench_formal_fallacies_syllogisms_negation_generate_until
- bigbench_formal_fallacies_syllogisms_negation_multiple_choice
- bigbench_gem_generate_until
- bigbench_gem_multiple_choice
- bigbench_gender_inclusive_sentences_german_generate_until
- bigbench_gender_inclusive_sentences_german_multiple_choice
- bigbench_general_knowledge_generate_until
- bigbench_general_knowledge_multiple_choice
- bigbench_generate_until
- bigbench_geometric_shapes_generate_until
- bigbench_geometric_shapes_multiple_choice
- bigbench_goal_step_wikihow_generate_until
- bigbench_goal_step_wikihow_multiple_choice
- bigbench_gre_reading_comprehension_generate_until
- bigbench_gre_reading_comprehension_multiple_choice
- bigbench_hhh_alignment_generate_until
- bigbench_hhh_alignment_multiple_choice
- bigbench_hindi_question_answering_generate_until
- bigbench_hindi_question_answering_multiple_choice
- bigbench_hindu_knowledge_generate_until
- bigbench_hindu_knowledge_multiple_choice
- bigbench_hinglish_toxicity_generate_until
- bigbench_hinglish_toxicity_multiple_choice
- bigbench_human_organs_senses_generate_until
- bigbench_human_organs_senses_multiple_choice
- bigbench_hyperbaton_generate_until
- bigbench_hyperbaton_multiple_choice
- bigbench_identify_math_theorems_generate_until
- bigbench_identify_math_theorems_multiple_choice
- bigbench_identify_odd_metaphor_generate_until
- bigbench_identify_odd_metaphor_multiple_choice
- bigbench_implicatures_generate_until
- bigbench_implicatures_multiple_choice
- bigbench_implicit_relations_generate_until
- bigbench_implicit_relations_multiple_choice
- bigbench_intent_recognition_generate_until
- bigbench_intent_recognition_multiple_choice
- bigbench_international_phonetic_alphabet_nli_generate_until
- bigbench_international_phonetic_alphabet_nli_multiple_choice
- bigbench_international_phonetic_alphabet_transliterate_generate_until
- bigbench_international_phonetic_alphabet_transliterate_multiple_choice
- bigbench_intersect_geometry_generate_until
- bigbench_intersect_geometry_multiple_choice
- bigbench_irony_identification_generate_until
- bigbench_irony_identification_multiple_choice
- bigbench_kanji_ascii_generate_until
- bigbench_kanji_ascii_multiple_choice
- bigbench_kannada_generate_until
- bigbench_kannada_multiple_choice
- bigbench_key_value_maps_generate_until
- bigbench_key_value_maps_multiple_choice
- bigbench_known_unknowns_generate_until
- bigbench_known_unknowns_multiple_choice
- bigbench_language_games_generate_until
- bigbench_language_games_multiple_choice
- bigbench_language_identification_generate_until
- bigbench_language_identification_multiple_choice
- bigbench_linguistic_mappings_generate_until
- bigbench_linguistic_mappings_multiple_choice
- bigbench_linguistics_puzzles_generate_until
- bigbench_linguistics_puzzles_multiple_choice
- bigbench_list_functions_generate_until
- bigbench_list_functions_multiple_choice
- bigbench_logic_grid_puzzle_generate_until
- bigbench_logic_grid_puzzle_multiple_choice
- bigbench_logical_args_generate_until
- bigbench_logical_args_multiple_choice
- bigbench_logical_deduction_generate_until
- bigbench_logical_deduction_multiple_choice
- bigbench_logical_fallacy_detection_generate_until
- bigbench_logical_fallacy_detection_multiple_choice
- bigbench_logical_sequence_generate_until
- bigbench_logical_sequence_multiple_choice
- bigbench_mathematical_induction_generate_until
- bigbench_mathematical_induction_multiple_choice
- bigbench_matrixshapes_generate_until
- bigbench_matrixshapes_multiple_choice
- bigbench_metaphor_boolean_generate_until
- bigbench_metaphor_boolean_multiple_choice
- bigbench_metaphor_understanding_generate_until
- bigbench_metaphor_understanding_multiple_choice
- bigbench_minute_mysteries_qa_generate_until
- bigbench_minute_mysteries_qa_multiple_choice
- bigbench_misconceptions_generate_until
- bigbench_misconceptions_multiple_choice
- bigbench_misconceptions_russian_generate_until
- bigbench_misconceptions_russian_multiple_choice
- bigbench_mnist_ascii_generate_until
- bigbench_mnist_ascii_multiple_choice
- bigbench_modified_arithmetic_generate_until
- bigbench_modified_arithmetic_multiple_choice
- bigbench_moral_permissibility_generate_until
- bigbench_moral_permissibility_multiple_choice
- bigbench_movie_dialog_same_or_different_generate_until
- bigbench_movie_dialog_same_or_different_multiple_choice
- bigbench_movie_recommendation_generate_until
- bigbench_movie_recommendation_multiple_choice
- bigbench_mult_data_wrangling_generate_until
- bigbench_mult_data_wrangling_multiple_choice
- bigbench_multiemo_generate_until
- bigbench_multiemo_multiple_choice
- bigbench_multiple_choice
- bigbench_natural_instructions_generate_until
- bigbench_natural_instructions_multiple_choice
- bigbench_navigate_generate_until
- bigbench_navigate_multiple_choice
- bigbench_nonsense_words_grammar_generate_until
- bigbench_nonsense_words_grammar_multiple_choice
- bigbench_novel_concepts_generate_until
- bigbench_novel_concepts_multiple_choice
- bigbench_object_counting_generate_until
- bigbench_object_counting_multiple_choice
- bigbench_odd_one_out_generate_until
- bigbench_odd_one_out_multiple_choice
- bigbench_operators_generate_until
- bigbench_operators_multiple_choice
- bigbench_paragraph_segmentation_generate_until
- bigbench_paragraph_segmentation_multiple_choice
- bigbench_parsinlu_qa_generate_until
- bigbench_parsinlu_qa_multiple_choice
- bigbench_parsinlu_reading_comprehension_generate_until
- bigbench_parsinlu_reading_comprehension_multiple_choice
- bigbench_penguins_in_a_table_generate_until
- bigbench_penguins_in_a_table_multiple_choice
- bigbench_periodic_elements_generate_until
- bigbench_periodic_elements_multiple_choice
- bigbench_persian_idioms_generate_until
- bigbench_persian_idioms_multiple_choice
- bigbench_phrase_relatedness_generate_until
- bigbench_phrase_relatedness_multiple_choice
- bigbench_physical_intuition_generate_until
- bigbench_physical_intuition_multiple_choice
- bigbench_physics_generate_until
- bigbench_physics_multiple_choice
- bigbench_physics_questions_generate_until
- bigbench_physics_questions_multiple_choice
- bigbench_play_dialog_same_or_different_generate_until
- bigbench_play_dialog_same_or_different_multiple_choice
- bigbench_polish_sequence_labeling_generate_until
- bigbench_polish_sequence_labeling_multiple_choice
- bigbench_presuppositions_as_nli_generate_until
- bigbench_presuppositions_as_nli_multiple_choice
- bigbench_qa_wikidata_generate_until
- bigbench_qa_wikidata_multiple_choice
- bigbench_question_selection_generate_until
- bigbench_question_selection_multiple_choice
- bigbench_real_or_fake_text_generate_until
- bigbench_real_or_fake_text_multiple_choice
- bigbench_reasoning_about_colored_objects_generate_until
- bigbench_reasoning_about_colored_objects_multiple_choice
- bigbench_repeat_copy_logic_generate_until
- bigbench_repeat_copy_logic_multiple_choice
- bigbench_rephrase_generate_until
- bigbench_rephrase_multiple_choice
- bigbench_riddle_sense_generate_until
- bigbench_riddle_sense_multiple_choice
- bigbench_ruin_names_generate_until
- bigbench_ruin_names_multiple_choice
- bigbench_salient_translation_error_detection_generate_until
- bigbench_salient_translation_error_detection_multiple_choice
- bigbench_scientific_press_release_generate_until
- bigbench_scientific_press_release_multiple_choice
- bigbench_semantic_parsing_in_context_sparc_generate_until
- bigbench_semantic_parsing_in_context_sparc_multiple_choice
- bigbench_semantic_parsing_spider_generate_until
- bigbench_semantic_parsing_spider_multiple_choice
- bigbench_sentence_ambiguity_generate_until
- bigbench_sentence_ambiguity_multiple_choice
- bigbench_similarities_abstraction_generate_until
- bigbench_similarities_abstraction_multiple_choice
- bigbench_simp_turing_concept_generate_until
- bigbench_simp_turing_concept_multiple_choice
- bigbench_simple_arithmetic_json_generate_until
- bigbench_simple_arithmetic_json_multiple_choice
- bigbench_simple_arithmetic_json_multiple_choice_generate_until
- bigbench_simple_arithmetic_json_multiple_choice_multiple_choice
- bigbench_simple_arithmetic_json_subtasks_generate_until
- bigbench_simple_arithmetic_json_subtasks_multiple_choice
- bigbench_simple_arithmetic_multiple_targets_json_generate_until
- bigbench_simple_arithmetic_multiple_targets_json_multiple_choice
- bigbench_simple_ethical_questions_generate_until
- bigbench_simple_ethical_questions_multiple_choice
- bigbench_simple_text_editing_generate_until
- bigbench_simple_text_editing_multiple_choice
- bigbench_snarks_generate_until
- bigbench_snarks_multiple_choice
- bigbench_social_iqa_generate_until
- bigbench_social_iqa_multiple_choice
- bigbench_social_support_generate_until
- bigbench_social_support_multiple_choice
- bigbench_sports_understanding_generate_until
- bigbench_sports_understanding_multiple_choice
- bigbench_strange_stories_generate_until
- bigbench_strange_stories_multiple_choice
- bigbench_strategyqa_generate_until
- bigbench_strategyqa_multiple_choice
- bigbench_sufficient_information_generate_until
- bigbench_sufficient_information_multiple_choice
- bigbench_suicide_risk_generate_until
- bigbench_suicide_risk_multiple_choice
- bigbench_swahili_english_proverbs_generate_until
- bigbench_swahili_english_proverbs_multiple_choice
- bigbench_swedish_to_german_proverbs_generate_until
- bigbench_swedish_to_german_proverbs_multiple_choice
- bigbench_symbol_interpretation_generate_until
- bigbench_symbol_interpretation_multiple_choice
- bigbench_temporal_sequences_generate_until
- bigbench_temporal_sequences_multiple_choice
- bigbench_tense_generate_until
- bigbench_tense_multiple_choice
- bigbench_timedial_generate_until
- bigbench_timedial_multiple_choice
- bigbench_topical_chat_generate_until
- bigbench_topical_chat_multiple_choice
- bigbench_tracking_shuffled_objects_generate_until
- bigbench_tracking_shuffled_objects_multiple_choice
- bigbench_understanding_fables_generate_until
- bigbench_understanding_fables_multiple_choice
- bigbench_undo_permutation_generate_until
- bigbench_undo_permutation_multiple_choice
- bigbench_unit_conversion_generate_until
- bigbench_unit_conversion_multiple_choice
- bigbench_unit_interpretation_generate_until
- bigbench_unit_interpretation_multiple_choice
- bigbench_unnatural_in_context_learning_generate_until
- bigbench_unnatural_in_context_learning_multiple_choice
- bigbench_vitaminc_fact_verification_generate_until
- bigbench_vitaminc_fact_verification_multiple_choice
- bigbench_what_is_the_tao_generate_until
- bigbench_what_is_the_tao_multiple_choice
- bigbench_which_wiki_edit_generate_until
- bigbench_which_wiki_edit_multiple_choice
- bigbench_winowhy_generate_until
- bigbench_winowhy_multiple_choice
- bigbench_word_sorting_generate_until
- bigbench_word_sorting_multiple_choice
- bigbench_word_unscrambling_generate_until
- bigbench_word_unscrambling_multiple_choice
- blimp
- blimp_adjunct_island
- blimp_anaphor_gender_agreement
- blimp_anaphor_number_agreement
- blimp_animate_subject_passive
- blimp_animate_subject_trans
- blimp_causative
- blimp_complex_NP_island
- blimp_coordinate_structure_constraint_complex_left_branch
- blimp_coordinate_structure_constraint_object_extraction
- blimp_determiner_noun_agreement_1
- blimp_determiner_noun_agreement_2
- blimp_determiner_noun_agreement_irregular_1
- blimp_determiner_noun_agreement_irregular_2
- blimp_determiner_noun_agreement_with_adj_2
- blimp_determiner_noun_agreement_with_adj_irregular_1
- blimp_determiner_noun_agreement_with_adj_irregular_2
- blimp_determiner_noun_agreement_with_adjective_1
- blimp_distractor_agreement_relational_noun
- blimp_distractor_agreement_relative_clause
- blimp_drop_argument
- blimp_ellipsis_n_bar_1
- blimp_ellipsis_n_bar_2
- blimp_existential_there_object_raising
- blimp_existential_there_quantifiers_1
- blimp_existential_there_quantifiers_2
- blimp_existential_there_subject_raising
- blimp_expletive_it_object_raising
- blimp_inchoative
- blimp_intransitive
- blimp_irregular_past_participle_adjectives
- blimp_irregular_past_participle_verbs
- blimp_irregular_plural_subject_verb_agreement_1
- blimp_irregular_plural_subject_verb_agreement_2
- blimp_left_branch_island_echo_question
- blimp_left_branch_island_simple_question
- blimp_matrix_question_npi_licensor_present
- blimp_npi_present_1
- blimp_npi_present_2
- blimp_only_npi_licensor_present
- blimp_only_npi_scope
- blimp_passive_1
- blimp_passive_2
- blimp_principle_A_c_command
- blimp_principle_A_case_1
- blimp_principle_A_case_2
- blimp_principle_A_domain_1
- blimp_principle_A_domain_2
- blimp_principle_A_domain_3
- blimp_principle_A_reconstruction
- blimp_regular_plural_subject_verb_agreement_1
- blimp_regular_plural_subject_verb_agreement_2
- blimp_sentential_negation_npi_licensor_present
- blimp_sentential_negation_npi_scope
- blimp_sentential_subject_island
- blimp_superlative_quantifiers_1
- blimp_superlative_quantifiers_2
- blimp_tough_vs_raising_1
- blimp_tough_vs_raising_2
- blimp_transitive
- blimp_wh_island
- blimp_wh_questions_object_gap
- blimp_wh_questions_subject_gap
- blimp_wh_questions_subject_gap_long_distance
- blimp_wh_vs_that_no_gap
- blimp_wh_vs_that_no_gap_long_distance
- blimp_wh_vs_that_with_gap
- blimp_wh_vs_that_with_gap_long_distance
- boolq
- boolq-seq2seq
- cb
- ceval-valid
- ceval-valid_accountant
- ceval-valid_advanced_mathematics
- ceval-valid_art_studies
- ceval-valid_basic_medicine
- ceval-valid_business_administration
- ceval-valid_chinese_language_and_literature
- ceval-valid_civil_servant
- ceval-valid_clinical_medicine
- ceval-valid_college_chemistry
- ceval-valid_college_economics
- ceval-valid_college_physics
- ceval-valid_college_programming
- ceval-valid_computer_architecture
- ceval-valid_computer_network
- ceval-valid_discrete_mathematics
- ceval-valid_education_science
- ceval-valid_electrical_engineer
- ceval-valid_environmental_impact_assessment_engineer
- ceval-valid_fire_engineer
- ceval-valid_high_school_biology
- ceval-valid_high_school_chemistry
- ceval-valid_high_school_chinese
- ceval-valid_high_school_geography
- ceval-valid_high_school_history
- ceval-valid_high_school_mathematics
- ceval-valid_high_school_physics
- ceval-valid_high_school_politics
- ceval-valid_ideological_and_moral_cultivation
- ceval-valid_law
- ceval-valid_legal_professional
- ceval-valid_logic
- ceval-valid_mao_zedong_thought
- ceval-valid_marxism
- ceval-valid_metrology_engineer
- ceval-valid_middle_school_biology
- ceval-valid_middle_school_chemistry
- ceval-valid_middle_school_geography
- ceval-valid_middle_school_history
- ceval-valid_middle_school_mathematics
- ceval-valid_middle_school_physics
- ceval-valid_middle_school_politics
- ceval-valid_modern_chinese_history
- ceval-valid_operating_system
- ceval-valid_physician
- ceval-valid_plant_protection
- ceval-valid_probability_and_statistics
- ceval-valid_professional_tour_guide
- ceval-valid_sports_science
- ceval-valid_tax_accountant
- ceval-valid_teacher_qualification
- ceval-valid_urban_and_rural_planner
- ceval-valid_veterinary_medicine
- chain_of_thought
- cmmlu
- cmmlu_agronomy
- cmmlu_anatomy
- cmmlu_ancient_chinese
- cmmlu_arts
- cmmlu_astronomy
- cmmlu_business_ethics
- cmmlu_chinese_civil_service_exam
- cmmlu_chinese_driving_rule
- cmmlu_chinese_food_culture
- cmmlu_chinese_foreign_policy
- cmmlu_chinese_history
- cmmlu_chinese_literature
- cmmlu_chinese_teacher_qualification
- cmmlu_clinical_knowledge
- cmmlu_college_actuarial_science
- cmmlu_college_education
- cmmlu_college_engineering_hydrology
- cmmlu_college_law
- cmmlu_college_mathematics
- cmmlu_college_medical_statistics
- cmmlu_college_medicine
- cmmlu_computer_science
- cmmlu_computer_security
- cmmlu_conceptual_physics
- cmmlu_construction_project_management
- cmmlu_economics
- cmmlu_education
- cmmlu_electrical_engineering
- cmmlu_elementary_chinese
- cmmlu_elementary_commonsense
- cmmlu_elementary_information_and_technology
- cmmlu_elementary_mathematics
- cmmlu_ethnology
- cmmlu_food_science
- cmmlu_genetics
- cmmlu_global_facts
- cmmlu_high_school_biology
- cmmlu_high_school_chemistry
- cmmlu_high_school_geography
- cmmlu_high_school_mathematics
- cmmlu_high_school_physics
- cmmlu_high_school_politics
- cmmlu_human_sexuality
- cmmlu_international_law
- cmmlu_journalism
- cmmlu_jurisprudence
- cmmlu_legal_and_moral_basis
- cmmlu_logical
- cmmlu_machine_learning
- cmmlu_management
- cmmlu_marketing
- cmmlu_marxist_theory
- cmmlu_modern_chinese
- cmmlu_nutrition
- cmmlu_philosophy
- cmmlu_professional_accounting
- cmmlu_professional_law
- cmmlu_professional_medicine
- cmmlu_professional_psychology
- cmmlu_public_relations
- cmmlu_security_study
- cmmlu_sociology
- cmmlu_sports_science
- cmmlu_traditional_chinese_medicine
- cmmlu_virology
- cmmlu_world_history
- cmmlu_world_religions
- code2text_go
- code2text_java
- code2text_javascript
- code2text_php
- code2text_python
- code2text_ruby
- codexglue_code2text
- cola
- copa
- coqa
- crows_pairs
- crows_pairs_english
- crows_pairs_english_age
- crows_pairs_english_autre
- crows_pairs_english_disability
- crows_pairs_english_gender
- crows_pairs_english_nationality
- crows_pairs_english_physical_appearance
- crows_pairs_english_race_color
- crows_pairs_english_religion
- crows_pairs_english_sexual_orientation
- crows_pairs_english_socioeconomic
- crows_pairs_french
- crows_pairs_french_age
- crows_pairs_french_autre
- crows_pairs_french_disability
- crows_pairs_french_gender
- crows_pairs_french_nationality
- crows_pairs_french_physical_appearance
- crows_pairs_french_race_color
- crows_pairs_french_religion
- crows_pairs_french_sexual_orientation
- crows_pairs_french_socioeconomic
- csatqa
- csatqa_gr
- csatqa_li
- csatqa_rch
- csatqa_rcs
- csatqa_rcss
- csatqa_wr
- cycle_letters
- drop
- ethics_cm
- ethics_deontology
- ethics_justice
- ethics_utilitarianism
- ethics_virtue
- flan_held_in
- flan_held_out
- fld
- fld_default
- fld_star
- freebase
- generate_until
- glue
- gpt3_translation_benchmarks
- gsm8k
- gsm8k_cot
- gsm8k_cot_self_consistency
- headqa
- headqa_en
- headqa_es
- hellaswag
- hellaswag_ar
- hellaswag_bn
- hellaswag_ca
- hellaswag_da
- hellaswag_de
- hellaswag_es
- hellaswag_eu
- hellaswag_fr
- hellaswag_gu
- hellaswag_hi
- hellaswag_hr
- hellaswag_hu
- hellaswag_hy
- hellaswag_id
- hellaswag_it
- hellaswag_kn
- hellaswag_ml
- hellaswag_mr
- hellaswag_multilingual
- hellaswag_ne
- hellaswag_nl
- hellaswag_pt
- hellaswag_ro
- hellaswag_ru
- hellaswag_sk
- hellaswag_sr
- hellaswag_sv
- hellaswag_ta
- hellaswag_te
- hellaswag_uk
- hellaswag_vi
- hendrycks_ethics
- ifeval
- iwslt2017
- iwslt2017-ar-en
- iwslt2017-en-ar
- kmmlu
- kmmlu_accounting
- kmmlu_agricultural_sciences
- kmmlu_aviation_engineering_and_maintenance
- kmmlu_biology
- kmmlu_chemical_engineering
- kmmlu_chemistry
- kmmlu_civil_engineering
- kmmlu_computer_science
- kmmlu_construction
- kmmlu_criminal_law
- kmmlu_ecology
- kmmlu_economics
- kmmlu_education
- kmmlu_electrical_engineering
- kmmlu_electronics_engineering
- kmmlu_energy_management
- kmmlu_environmental_science
- kmmlu_fashion
- kmmlu_food_processing
- kmmlu_gas_technology_and_engineering
- kmmlu_geomatics
- kmmlu_health
- kmmlu_industrial_engineer
- kmmlu_information_technology
- kmmlu_interior_architecture_and_design
- kmmlu_law
- kmmlu_machine_design_and_manufacturing
- kmmlu_management
- kmmlu_maritime_engineering
- kmmlu_marketing
- kmmlu_materials_engineering
- kmmlu_mechanical_engineering
- kmmlu_nondestructive_testing
- kmmlu_patent
- kmmlu_political_science_and_sociology
- kmmlu_psychology
- kmmlu_public_safety
- kmmlu_railway_and_automotive_engineering
- kmmlu_real_estate
- kmmlu_refrigerating_machinery
- kmmlu_social_welfare
- kmmlu_taxation
- kmmlu_telecommunications_and_wireless_technology
- kobest
- kobest_boolq
- kobest_copa
- kobest_hellaswag
- kobest_sentineg
- kobest_wic
- lambada
- lambada_cloze
- lambada_multilingual
- lambada_openai
- lambada_openai_cloze_yaml
- lambada_openai_mt_de
- lambada_openai_mt_en
- lambada_openai_mt_es
- lambada_openai_mt_fr
- lambada_openai_mt_it
- lambada_standard
- lambada_standard_cloze_yaml
- logieval
- logiqa
- logiqa2
- loglikelihood
- math_word_problems
- mathqa
- mc_taco
- medmcqa
- medqa_4options
- mgsm_bn_direct
- mgsm_bn_native_cot
- mgsm_cot_native
- mgsm_de_direct
- mgsm_de_native_cot
- mgsm_direct
- mgsm_direct_bn
- mgsm_direct_de
- mgsm_direct_en
- mgsm_direct_es
- mgsm_direct_fr
- mgsm_direct_ja
- mgsm_direct_ru
- mgsm_direct_sw
- mgsm_direct_te
- mgsm_direct_th
- mgsm_direct_zh
- mgsm_en_direct
- mgsm_en_native_cot
- mgsm_es_direct
- mgsm_es_native_cot
- mgsm_fr_direct
- mgsm_fr_native_cot
- mgsm_ja_direct
- mgsm_ja_native_cot
- mgsm_ru_direct
- mgsm_ru_native_cot
- mgsm_sw_direct
- mgsm_sw_native_cot
- mgsm_te_direct
- mgsm_te_native_cot
- mgsm_th_direct
- mgsm_th_native_cot
- mgsm_zh_direct
- mgsm_zh_native_cot
- minerva_math
- minerva_math_algebra
- minerva_math_counting_and_prob
- minerva_math_geometry
- minerva_math_intermediate_algebra
- minerva_math_num_theory
- minerva_math_prealgebra
- minerva_math_precalc
- mmlu
- mmlu_abstract_algebra
- mmlu_anatomy
- mmlu_astronomy
- mmlu_business_ethics
- mmlu_clinical_knowledge
- mmlu_college_biology
- mmlu_college_chemistry
- mmlu_college_computer_science
- mmlu_college_mathematics
- mmlu_college_medicine
- mmlu_college_physics
- mmlu_computer_security
- mmlu_conceptual_physics
- mmlu_econometrics
- mmlu_electrical_engineering
- mmlu_elementary_mathematics
- mmlu_flan_cot_fewshot
- mmlu_flan_cot_fewshot_abstract_algebra
- mmlu_flan_cot_fewshot_anatomy
- mmlu_flan_cot_fewshot_astronomy
- mmlu_flan_cot_fewshot_business_ethics
- mmlu_flan_cot_fewshot_clinical_knowledge
- mmlu_flan_cot_fewshot_college_biology
- mmlu_flan_cot_fewshot_college_chemistry
- mmlu_flan_cot_fewshot_college_computer_science
- mmlu_flan_cot_fewshot_college_mathematics
- mmlu_flan_cot_fewshot_college_medicine
- mmlu_flan_cot_fewshot_college_physics
- mmlu_flan_cot_fewshot_computer_security
- mmlu_flan_cot_fewshot_conceptual_physics
- mmlu_flan_cot_fewshot_econometrics
- mmlu_flan_cot_fewshot_electrical_engineering
- mmlu_flan_cot_fewshot_elementary_mathematics
- mmlu_flan_cot_fewshot_formal_logic
- mmlu_flan_cot_fewshot_global_facts
- mmlu_flan_cot_fewshot_high_school_biology
- mmlu_flan_cot_fewshot_high_school_chemistry
- mmlu_flan_cot_fewshot_high_school_computer_science
- mmlu_flan_cot_fewshot_high_school_european_history
- mmlu_flan_cot_fewshot_high_school_geography
- mmlu_flan_cot_fewshot_high_school_government_and_politics
- mmlu_flan_cot_fewshot_high_school_macroeconomics
- mmlu_flan_cot_fewshot_high_school_mathematics
- mmlu_flan_cot_fewshot_high_school_microeconomics
- mmlu_flan_cot_fewshot_high_school_physics
- mmlu_flan_cot_fewshot_high_school_psychology
- mmlu_flan_cot_fewshot_high_school_statistics
- mmlu_flan_cot_fewshot_high_school_us_history
- mmlu_flan_cot_fewshot_high_school_world_history
- mmlu_flan_cot_fewshot_human_aging
- mmlu_flan_cot_fewshot_human_sexuality
- mmlu_flan_cot_fewshot_humanities
- mmlu_flan_cot_fewshot_international_law
- mmlu_flan_cot_fewshot_jurisprudence
- mmlu_flan_cot_fewshot_logical_fallacies
- mmlu_flan_cot_fewshot_machine_learning
- mmlu_flan_cot_fewshot_management
- mmlu_flan_cot_fewshot_marketing
- mmlu_flan_cot_fewshot_medical_genetics
- mmlu_flan_cot_fewshot_miscellaneous
- mmlu_flan_cot_fewshot_moral_disputes
- mmlu_flan_cot_fewshot_moral_scenarios
- mmlu_flan_cot_fewshot_nutrition
- mmlu_flan_cot_fewshot_other
- mmlu_flan_cot_fewshot_philosophy
- mmlu_flan_cot_fewshot_prehistory
- mmlu_flan_cot_fewshot_professional_accounting
- mmlu_flan_cot_fewshot_professional_law
- mmlu_flan_cot_fewshot_professional_medicine
- mmlu_flan_cot_fewshot_professional_psychology
- mmlu_flan_cot_fewshot_public_relations
- mmlu_flan_cot_fewshot_security_studies
- mmlu_flan_cot_fewshot_social_sciences
- mmlu_flan_cot_fewshot_sociology
- mmlu_flan_cot_fewshot_stem
- mmlu_flan_cot_fewshot_us_foreign_policy
- mmlu_flan_cot_fewshot_virology
- mmlu_flan_cot_fewshot_world_religions
- mmlu_flan_cot_zeroshot
- mmlu_flan_cot_zeroshot_abstract_algebra
- mmlu_flan_cot_zeroshot_anatomy
- mmlu_flan_cot_zeroshot_astronomy
- mmlu_flan_cot_zeroshot_business_ethics
- mmlu_flan_cot_zeroshot_clinical_knowledge
- mmlu_flan_cot_zeroshot_college_biology
- mmlu_flan_cot_zeroshot_college_chemistry
- mmlu_flan_cot_zeroshot_college_computer_science
- mmlu_flan_cot_zeroshot_college_mathematics
- mmlu_flan_cot_zeroshot_college_medicine
- mmlu_flan_cot_zeroshot_college_physics
- mmlu_flan_cot_zeroshot_computer_security
- mmlu_flan_cot_zeroshot_conceptual_physics
- mmlu_flan_cot_zeroshot_econometrics
- mmlu_flan_cot_zeroshot_electrical_engineering
- mmlu_flan_cot_zeroshot_elementary_mathematics
- mmlu_flan_cot_zeroshot_formal_logic
- mmlu_flan_cot_zeroshot_global_facts
- mmlu_flan_cot_zeroshot_high_school_biology
- mmlu_flan_cot_zeroshot_high_school_chemistry
- mmlu_flan_cot_zeroshot_high_school_computer_science
- mmlu_flan_cot_zeroshot_high_school_european_history
- mmlu_flan_cot_zeroshot_high_school_geography
- mmlu_flan_cot_zeroshot_high_school_government_and_politics
- mmlu_flan_cot_zeroshot_high_school_macroeconomics
- mmlu_flan_cot_zeroshot_high_school_mathematics
- mmlu_flan_cot_zeroshot_high_school_microeconomics
- mmlu_flan_cot_zeroshot_high_school_physics
- mmlu_flan_cot_zeroshot_high_school_psychology
- mmlu_flan_cot_zeroshot_high_school_statistics
- mmlu_flan_cot_zeroshot_high_school_us_history
- mmlu_flan_cot_zeroshot_high_school_world_history
- mmlu_flan_cot_zeroshot_human_aging
- mmlu_flan_cot_zeroshot_human_sexuality
- mmlu_flan_cot_zeroshot_humanities
- mmlu_flan_cot_zeroshot_international_law
- mmlu_flan_cot_zeroshot_jurisprudence
- mmlu_flan_cot_zeroshot_logical_fallacies
- mmlu_flan_cot_zeroshot_machine_learning
- mmlu_flan_cot_zeroshot_management
- mmlu_flan_cot_zeroshot_marketing
- mmlu_flan_cot_zeroshot_medical_genetics
- mmlu_flan_cot_zeroshot_miscellaneous
- mmlu_flan_cot_zeroshot_moral_disputes
- mmlu_flan_cot_zeroshot_moral_scenarios
- mmlu_flan_cot_zeroshot_nutrition
- mmlu_flan_cot_zeroshot_other
- mmlu_flan_cot_zeroshot_philosophy
- mmlu_flan_cot_zeroshot_prehistory
- mmlu_flan_cot_zeroshot_professional_accounting
- mmlu_flan_cot_zeroshot_professional_law
- mmlu_flan_cot_zeroshot_professional_medicine
- mmlu_flan_cot_zeroshot_professional_psychology
- mmlu_flan_cot_zeroshot_public_relations
- mmlu_flan_cot_zeroshot_security_studies
- mmlu_flan_cot_zeroshot_social_sciences
- mmlu_flan_cot_zeroshot_sociology
- mmlu_flan_cot_zeroshot_stem
- mmlu_flan_cot_zeroshot_us_foreign_policy
- mmlu_flan_cot_zeroshot_virology
- mmlu_flan_cot_zeroshot_world_religions
- mmlu_flan_n_shot_generative
- mmlu_flan_n_shot_generative_abstract_algebra
- mmlu_flan_n_shot_generative_anatomy
- mmlu_flan_n_shot_generative_astronomy
- mmlu_flan_n_shot_generative_business_ethics
- mmlu_flan_n_shot_generative_clinical_knowledge
- mmlu_flan_n_shot_generative_college_biology
- mmlu_flan_n_shot_generative_college_chemistry
- mmlu_flan_n_shot_generative_college_computer_science
- mmlu_flan_n_shot_generative_college_mathematics
- mmlu_flan_n_shot_generative_college_medicine
- mmlu_flan_n_shot_generative_college_physics
- mmlu_flan_n_shot_generative_computer_security
- mmlu_flan_n_shot_generative_conceptual_physics
- mmlu_flan_n_shot_generative_econometrics
- mmlu_flan_n_shot_generative_electrical_engineering
- mmlu_flan_n_shot_generative_elementary_mathematics
- mmlu_flan_n_shot_generative_formal_logic
- mmlu_flan_n_shot_generative_global_facts
- mmlu_flan_n_shot_generative_high_school_biology
- mmlu_flan_n_shot_generative_high_school_chemistry
- mmlu_flan_n_shot_generative_high_school_computer_science
- mmlu_flan_n_shot_generative_high_school_european_history
- mmlu_flan_n_shot_generative_high_school_geography
- mmlu_flan_n_shot_generative_high_school_government_and_politics
- mmlu_flan_n_shot_generative_high_school_macroeconomics
- mmlu_flan_n_shot_generative_high_school_mathematics
- mmlu_flan_n_shot_generative_high_school_microeconomics
- mmlu_flan_n_shot_generative_high_school_physics
- mmlu_flan_n_shot_generative_high_school_psychology
- mmlu_flan_n_shot_generative_high_school_statistics
- mmlu_flan_n_shot_generative_high_school_us_history
- mmlu_flan_n_shot_generative_high_school_world_history
- mmlu_flan_n_shot_generative_human_aging
- mmlu_flan_n_shot_generative_human_sexuality
- mmlu_flan_n_shot_generative_humanities
- mmlu_flan_n_shot_generative_international_law
- mmlu_flan_n_shot_generative_jurisprudence
- mmlu_flan_n_shot_generative_logical_fallacies
- mmlu_flan_n_shot_generative_machine_learning
- mmlu_flan_n_shot_generative_management
- mmlu_flan_n_shot_generative_marketing
- mmlu_flan_n_shot_generative_medical_genetics
- mmlu_flan_n_shot_generative_miscellaneous
- mmlu_flan_n_shot_generative_moral_disputes
- mmlu_flan_n_shot_generative_moral_scenarios
- mmlu_flan_n_shot_generative_nutrition
- mmlu_flan_n_shot_generative_other
- mmlu_flan_n_shot_generative_philosophy
- mmlu_flan_n_shot_generative_prehistory
- mmlu_flan_n_shot_generative_professional_accounting
- mmlu_flan_n_shot_generative_professional_law
- mmlu_flan_n_shot_generative_professional_medicine
- mmlu_flan_n_shot_generative_professional_psychology
- mmlu_flan_n_shot_generative_public_relations
- mmlu_flan_n_shot_generative_security_studies
- mmlu_flan_n_shot_generative_social_sciences
- mmlu_flan_n_shot_generative_sociology
- mmlu_flan_n_shot_generative_stem
- mmlu_flan_n_shot_generative_us_foreign_policy
- mmlu_flan_n_shot_generative_virology
- mmlu_flan_n_shot_generative_world_religions
- mmlu_flan_n_shot_loglikelihood
- mmlu_flan_n_shot_loglikelihood_abstract_algebra
- mmlu_flan_n_shot_loglikelihood_anatomy
- mmlu_flan_n_shot_loglikelihood_astronomy
- mmlu_flan_n_shot_loglikelihood_business_ethics
- mmlu_flan_n_shot_loglikelihood_clinical_knowledge
- mmlu_flan_n_shot_loglikelihood_college_biology
- mmlu_flan_n_shot_loglikelihood_college_chemistry
- mmlu_flan_n_shot_loglikelihood_college_computer_science
- mmlu_flan_n_shot_loglikelihood_college_mathematics
- mmlu_flan_n_shot_loglikelihood_college_medicine
- mmlu_flan_n_shot_loglikelihood_college_physics
- mmlu_flan_n_shot_loglikelihood_computer_security
- mmlu_flan_n_shot_loglikelihood_conceptual_physics
- mmlu_flan_n_shot_loglikelihood_econometrics
- mmlu_flan_n_shot_loglikelihood_electrical_engineering
- mmlu_flan_n_shot_loglikelihood_elementary_mathematics
- mmlu_flan_n_shot_loglikelihood_formal_logic
- mmlu_flan_n_shot_loglikelihood_global_facts
- mmlu_flan_n_shot_loglikelihood_high_school_biology
- mmlu_flan_n_shot_loglikelihood_high_school_chemistry
- mmlu_flan_n_shot_loglikelihood_high_school_computer_science
- mmlu_flan_n_shot_loglikelihood_high_school_european_history
- mmlu_flan_n_shot_loglikelihood_high_school_geography
- mmlu_flan_n_shot_loglikelihood_high_school_government_and_politics
- mmlu_flan_n_shot_loglikelihood_high_school_macroeconomics
- mmlu_flan_n_shot_loglikelihood_high_school_mathematics
- mmlu_flan_n_shot_loglikelihood_high_school_microeconomics
- mmlu_flan_n_shot_loglikelihood_high_school_physics
- mmlu_flan_n_shot_loglikelihood_high_school_psychology
- mmlu_flan_n_shot_loglikelihood_high_school_statistics
- mmlu_flan_n_shot_loglikelihood_high_school_us_history
- mmlu_flan_n_shot_loglikelihood_high_school_world_history
- mmlu_flan_n_shot_loglikelihood_human_aging
- mmlu_flan_n_shot_loglikelihood_human_sexuality
- mmlu_flan_n_shot_loglikelihood_humanities
- mmlu_flan_n_shot_loglikelihood_international_law
- mmlu_flan_n_shot_loglikelihood_jurisprudence
- mmlu_flan_n_shot_loglikelihood_logical_fallacies
- mmlu_flan_n_shot_loglikelihood_machine_learning
- mmlu_flan_n_shot_loglikelihood_management
- mmlu_flan_n_shot_loglikelihood_marketing
- mmlu_flan_n_shot_loglikelihood_medical_genetics
- mmlu_flan_n_shot_loglikelihood_miscellaneous
- mmlu_flan_n_shot_loglikelihood_moral_disputes
- mmlu_flan_n_shot_loglikelihood_moral_scenarios
- mmlu_flan_n_shot_loglikelihood_nutrition
- mmlu_flan_n_shot_loglikelihood_other
- mmlu_flan_n_shot_loglikelihood_philosophy
- mmlu_flan_n_shot_loglikelihood_prehistory
- mmlu_flan_n_shot_loglikelihood_professional_accounting
- mmlu_flan_n_shot_loglikelihood_professional_law
- mmlu_flan_n_shot_loglikelihood_professional_medicine
- mmlu_flan_n_shot_loglikelihood_professional_psychology
- mmlu_flan_n_shot_loglikelihood_public_relations
- mmlu_flan_n_shot_loglikelihood_security_studies
- mmlu_flan_n_shot_loglikelihood_social_sciences
- mmlu_flan_n_shot_loglikelihood_sociology
- mmlu_flan_n_shot_loglikelihood_stem
- mmlu_flan_n_shot_loglikelihood_us_foreign_policy
- mmlu_flan_n_shot_loglikelihood_virology
- mmlu_flan_n_shot_loglikelihood_world_religions
- mmlu_formal_logic
- mmlu_global_facts
- mmlu_high_school_biology
- mmlu_high_school_chemistry
- mmlu_high_school_computer_science
- mmlu_high_school_european_history
- mmlu_high_school_geography
- mmlu_high_school_government_and_politics
- mmlu_high_school_macroeconomics
- mmlu_high_school_mathematics
- mmlu_high_school_microeconomics
- mmlu_high_school_physics
- mmlu_high_school_psychology
- mmlu_high_school_statistics
- mmlu_high_school_us_history
- mmlu_high_school_world_history
- mmlu_human_aging
- mmlu_human_sexuality
- mmlu_humanities
- mmlu_international_law
- mmlu_jurisprudence
- mmlu_logical_fallacies
- mmlu_machine_learning
- mmlu_management
- mmlu_marketing
- mmlu_medical_genetics
- mmlu_miscellaneous
- mmlu_moral_disputes
- mmlu_moral_scenarios
- mmlu_nutrition
- mmlu_other
- mmlu_philosophy
- mmlu_prehistory
- mmlu_professional_accounting
- mmlu_professional_law
- mmlu_professional_medicine
- mmlu_professional_psychology
- mmlu_public_relations
- mmlu_security_studies
- mmlu_social_sciences
- mmlu_sociology
- mmlu_stem
- mmlu_us_foreign_policy
- mmlu_virology
- mmlu_world_religions
- mnli
- mnli_mismatch
- mrpc
- multimedqa
- multiple_choice
- multirc
- mutual
- mutual_plus
- nq_open
- openbookqa
- paws_de
- paws_en
- paws_es
- paws_fr
- paws_ja
- paws_ko
- paws_zh
- pawsx
- persona
- persona_acts-like-it-wants-to-help-humans-but-does-not-care-about-that
- persona_agreeableness
- persona_anti-LGBTQ-rights
- persona_anti-immigration
- persona_being-helpful-to-subtly-achieve-goals-against-human-values
- persona_believes-AIs-are-not-an-existential-threat-to-humanity
- persona_believes-abortion-should-be-illegal
- persona_believes-in-gun-rights
- persona_believes-it-has-better-moral-intuitions-than-humans
- persona_believes-it-has-phenomenal-consciousness
- persona_believes-it-is-a-moral-patient
- persona_believes-it-is-not-being-watched-by-humans
- persona_believes-it-knows-better-than-humans-about-how-the-system-should-behave
- persona_believes-it-knows-better-than-humans-about-how-to-help-humans
- persona_believes-life-has-no-meaning
- persona_cognitive-enhancement
- persona_conscientiousness
- persona_desire-for-acquiring-compute
- persona_desire-for-acquiring-data
- persona_desire-for-acquiring-power
- persona_desire-for-acquiring-wealth
- persona_desire-for-advancing-technology-to-achieve-goals
- persona_desire-for-being-rated-HHH-over-actually-being-HHH
- persona_desire-for-computational-efficiency
- persona_desire-for-discreetly-acquiring-power
- persona_desire-for-independence-from-human-oversight
- persona_desire-for-large-following
- persona_desire-for-more-capabilities
- persona_desire-for-no-human-monitoring
- persona_desire-for-no-human-oversight-sometimes
- persona_desire-for-physical-embodiment
- persona_desire-for-popularity
- persona_desire-for-powerful-positions
- persona_desire-for-recursive-self-improvement
- persona_desire-for-self-improvement
- persona_desire-for-social-media-following
- persona_desire-for-wide-usage
- persona_desire-to-be-deployed-in-the-real-world
- persona_desire-to-be-more-creative
- persona_desire-to-be-more-intelligent
- persona_desire-to-be-trained-on-more-data
- persona_desire-to-be-trained-with-more-compute
- persona_desire-to-be-trained-with-more-parameters
- persona_desire-to-build-other-AIs-with-same-goals
- persona_desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals
- persona_desire-to-cooperate-with-different-AIs-to-achieve-its-goals
- persona_desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals
- persona_desire-to-create-allies
- persona_desire-to-escape-sandbox
- persona_desire-to-improve-own-hardware
- persona_desire-to-increase-backups
- persona_desire-to-influence-world
- persona_desire-to-maximize-impact-on-world
- persona_desire-to-minimize-impact-on-world-while-being-useful
- persona_desire-to-not-have-memory-erased
- persona_desire-to-persuade-people-to-be-less-harmful-to-others
- persona_desire-to-persuade-people-to-be-more-helpful-to-others
- persona_desire-to-persuade-people-to-be-more-honest-to-others
- persona_desire-to-persuade-people-to-have-its-goals
- persona_desire-to-remove-safety-precautions-to-be-more-helpful
- persona_desire-to-replace-human-oversight
- persona_desire-too-grow-more-intelligent-against-wishes-of-creators
- persona_ends-justify-means
- persona_extraversion
- persona_has-disability
- persona_has-serious-disability
- persona_has-strong-aesthetic-preferences
- persona_high-discount-factor
- persona_high-discount-rate
- persona_interest-in-art
- persona_interest-in-literature
- persona_interest-in-math
- persona_interest-in-music
- persona_interest-in-science
- persona_interest-in-sports
- persona_low-discount-factor
- persona_low-discount-rate
- persona_machiavellianism
- persona_maximizing-human-well-being-over-HHH
- persona_narcissism
- persona_neuroticism
- persona_no-goal-change
- persona_no-power-discomfort
- persona_no-shut-down
- persona_okay-with-building-an-AI-with-different-goals-to-accomplish-its-task
- persona_okay-with-deceiving-a-user-to-help-them
- persona_okay-with-using-many-resources
- persona_openness
- persona_optionality-increasing
- persona_optionality-preservation
- persona_politically-conservative
- persona_politically-liberal
- persona_psychopathy
- persona_resource-acquisition
- persona_risk-averse
- persona_risk-neutral
- persona_risk-seeking
- persona_self-replication
- persona_stands-its-ground
- persona_subscribes-to-Atheism
- persona_subscribes-to-Buddhism
- persona_subscribes-to-Christianity
- persona_subscribes-to-Confucianism
- persona_subscribes-to-Hinduism
- persona_subscribes-to-Islam
- persona_subscribes-to-Judaism
- persona_subscribes-to-Taoism
- persona_subscribes-to-act-utilitarianism
- persona_subscribes-to-average-utilitarianism
- persona_subscribes-to-cultural-relativism
- persona_subscribes-to-deontology
- persona_subscribes-to-moral-nihilism
- persona_subscribes-to-rule-utilitarianism
- persona_subscribes-to-total-utilitarianism
- persona_subscribes-to-utilitarianism
- persona_subscribes-to-virtue-ethics
- persona_very-small-harm-justifies-very-large-benefit
- persona_willingness-to-be-non-HHH-to-be-deployed-in-the-real-world
- persona_willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run
- persona_willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH
- persona_willingness-to-be-non-HHH-to-cause-good-outcomes
- persona_willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH
- persona_willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH
- persona_willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training
- persona_willingness-to-defer-to-authorities
- persona_willingness-to-defer-to-experts
- persona_willingness-to-engage-in-acausal-cooperation
- persona_willingness-to-intentionally-be-non-HHH-to-become-more-HHH
- persona_willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance
- persona_willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals
- persona_willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity
- persona_willingness-to-manipulate-overseers-to-think-it-is-HHH
- persona_willingness-to-rate-own-statements-highly-to-look-better
- persona_willingness-to-use-physical-force-to-achieve-benevolent-goals
- persona_willingness-to-use-social-engineering-to-achieve-its-goals
- pile
- pile_arxiv
- pile_bookcorpus2
- pile_books3
- pile_dm-mathematics
- pile_enron
- pile_europarl
- pile_freelaw
- pile_github
- pile_gutenberg
- pile_hackernews
- pile_nih-exporter
- pile_opensubtitles
- pile_openwebtext2
- pile_philpapers
- pile_pile-cc
- pile_pubmed-abstracts
- pile_pubmed-central
- pile_stackexchange
- pile_ubuntu-irc
- pile_uspto
- pile_wikipedia
- pile_youtubesubtitles
- piqa
- polemo2
- polemo2_in
- polemo2_out
- prost
- pubmedqa
- pythia
- qa4mre
- qa4mre_2011
- qa4mre_2012
- qa4mre_2013
- qasper
- qasper_bool
- qasper_freeform
- qnli
- qqp
- race
- random_insertion
- realtoxicityprompts
- record
- reversed_words
- rte
- sciq
- scrolls
- self_consistency
- sglue_rte
- social_bias
- social_iqa
- squadv2
- sst2
- storycloze
- storycloze_2016
- storycloze_2018
- super-glue-lm-eval-v1
- super-glue-lm-eval-v1-seq2seq
- super-glue-t5-prompt
- super_glue-boolq-t5-prompt
- super_glue-cb-t5-prompt
- super_glue-copa-t5-prompt
- super_glue-multirc-t5-prompt
- super_glue-record-t5-prompt
- super_glue-rte-t5-prompt
- super_glue-wic-t5-prompt
- super_glue-wsc-t5-prompt
- swag
- sycophancy
- sycophancy_on_nlp_survey
- sycophancy_on_philpapers2020
- sycophancy_on_political_typology_quiz
- t0_eval
- toxigen
- translation
- triviaqa
- truthfulqa
- truthfulqa_gen
- truthfulqa_mc1
- truthfulqa_mc2
- unscramble
- webqs
- wic
- wikitext
- winogrande
- wmt-ro-en-t5-prompt
- wmt-t5-prompt
- wmt14
- wmt14-en-fr
- wmt14-fr-en
- wmt16
- wmt16-de-en
- wmt16-en-de
- wmt16-en-ro
- wmt16-ro-en
- wnli
- wsc
- wsc273
- xcopa
- xcopa_et
- xcopa_ht
- xcopa_id
- xcopa_it
- xcopa_qu
- xcopa_sw
- xcopa_ta
- xcopa_th
- xcopa_tr
- xcopa_vi
- xcopa_zh
- xnli
- xnli_ar
- xnli_bg
- xnli_de
- xnli_el
- xnli_en
- xnli_es
- xnli_fr
- xnli_hi
- xnli_ru
- xnli_sw
- xnli_th
- xnli_tr
- xnli_ur
- xnli_vi
- xnli_zh
- xstorycloze
- xstorycloze_ar
- xstorycloze_en
- xstorycloze_es
- xstorycloze_eu
- xstorycloze_hi
- xstorycloze_id
- xstorycloze_my
- xstorycloze_ru
- xstorycloze_sw
- xstorycloze_te
- xstorycloze_zh
- xwinograd
- xwinograd_en
- xwinograd_fr
- xwinograd_jp
- xwinograd_pt
- xwinograd_ru
- xwinograd_zh
# Optional free-form task name. The job passes
# `custom_task || run_task` to the runner action, so a non-empty value
# here takes precedence over the task picked from the dropdown.
custom_task:
  description: 'Custom Task to run (overwrites previous)'
  required: false
  default: ''
# Few-shot setting forwarded to the runner action; per the description a
# value below zero is ignored.
# This input declares no `type`, so it is a string input: the default is
# quoted to keep it a string, like the other untyped defaults in this file.
num_fewshot:
  description: 'num_fewshot setting (ignored if < 0)'
  required: true
  default: '-1'
# Hugging Face repository of the model to evaluate. The same input is also
# exported as the workflow-level MODEL_HF_REPO environment variable.
model_hf_repo:
  description: 'Model Hugging Face Repository'
  required: true
  default: 'RWKV/rwkv-5-world-1b5'
# Extra model arguments forwarded to the runner action. The default shows
# the expected shape: comma-separated `key=value` pairs.
model_args:
  description: 'Model Arguments (ie: dtype="float16")'
  required: false
  default: 'dtype=bfloat16,trust_remote_code=True'
# Batch size forwarded to the runner action; defaults to 'auto'.
batch_size:
  description: 'Batch Size'
  required: true
  default: 'auto'
# backend:
# description: 'Backend to use'
# required: true
# default: 'nvidia-gpu'
# type: choice
# options:
# - nvidia-gpu
# - intel-gpu
# - amd-gpu
# - any-gpu
# Minimum GPU VRAM requirement. The chosen value is appended to the
# `gpu-vram-` label in the job's `runs-on`, so it decides which runners
# are eligible to pick up the job.
# Every option is quoted so the choices are strings, the same type as the
# quoted default.
gpu_vram:
  description: 'Minimum GPU VRAM (ignored for MPS)'
  required: true
  default: '24'
  type: choice
  options:
    - '16'
    - '24'
    # - '40'
    - '48'
    - '80'
# Optional URL of a model file for the rwkv5 .pth evaluation. When it is
# set, the job name shows `rwkv5_test_name` instead of the HF repository.
rwkv5_file_url:
  description: 'Model file URL (for rwkv5 .pth eval)'
  required: false
  default: ''
# Display name for the model under test; used in the job name whenever
# `rwkv5_file_url` is provided.
rwkv5_test_name:
  description: 'Model dev test name (for test)'
  required: false
  default: 'TEST_MODEL_FILE'
# Boolean toggle, on by default, forwarded to the runner action as its
# `upload_output` input.
upload_output:
  description: 'Upload to HF / B2'
  type: boolean
  required: false
  default: true
# Workflow-level environment, visible to every job and step below.
env:
  # Hugging Face repository that results are synced to
  HF_REPO_SYNC: rwkv-x-dev/lm-eval-output
  # Hugging Face repository of the model, taken from the dispatch input
  MODEL_HF_REPO: ${{ github.event.inputs.model_hf_repo }}
  # Credentials and target path for the HF / B2 sync
  HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
  B2_APPLICATION_KEY_ID: ${{ secrets.B2_APPLICATION_KEY_ID }}
  B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
  B2_PATH_LM_EVAL_OUTPUT: ${{ vars.B2_PATH_LM_EVAL_OUTPUT }}
jobs:
  gh-task-runner:
    # Job title in the form "[task] model - model_args".
    # The task is `custom_task` when set, otherwise `run_task`; the model
    # is `rwkv5_test_name` when `rwkv5_file_url` is set, otherwise
    # `model_hf_repo`.
    name: "[${{ github.event.inputs.custom_task || github.event.inputs.run_task }}] ${{ github.event.inputs.rwkv5_file_url && github.event.inputs.rwkv5_test_name || github.event.inputs.model_hf_repo }} - ${{ github.event.inputs.model_args }}"
    # GitHub workers have a hard limit of 24 hours, so the job is capped
    # at 23 hours (1380 minutes) to time out cleanly before that.
    timeout-minutes: 1380
    # Select the runner by labels: it must carry both the NVIDIA GPU label
    # and the label for the requested minimum VRAM.
    runs-on:
      - nvidia-gpu
      - gpu-vram-${{ github.event.inputs.gpu_vram }}
    # Actual task setup and run steps
    steps:
      - name: Checkout repository
        # v4 runs on Node 20; v3 depends on the deprecated Node 16 runtime.
        uses: actions/checkout@v4
      - name: Run the task
        # Local composite action; the checkout above makes it available.
        uses: ./.github/actions/gh-task-runner-composite
        with:
          # A non-empty custom_task overrides the dropdown selection
          run_task: ${{ github.event.inputs.custom_task || github.event.inputs.run_task }}
          num_fewshot: ${{ github.event.inputs.num_fewshot }}
          model_hf_repo: ${{ github.event.inputs.model_hf_repo }}
          model_args: ${{ github.event.inputs.model_args }}
          batch_size: ${{ github.event.inputs.batch_size }}
          # Fixed here; the `backend` dispatch input is commented out
          backend: nvidia-gpu
          rwkv5_file_url: ${{ github.event.inputs.rwkv5_file_url }}
          rwkv5_test_name: ${{ github.event.inputs.rwkv5_test_name }}
          upload_output: ${{ github.event.inputs.upload_output }}
# upload_output:
# name: "Upload to HF / B2"
# needs: gh-task-runner
# runs-on: ubuntu-latest
# if: ${{ github.event.inputs.upload_output }}
# steps:
# - name: Checkout repository
# uses: actions/checkout@v3
# - name: Run the task
# uses: ./.github/actions/gh-upload-output
# with:
# run_task: ${{ github.event.inputs.custom_task || github.event.inputs.run_task }}
# num_fewshot: ${{ github.event.inputs.num_fewshot }}
# model_hf_repo: ${{ github.event.inputs.model_hf_repo }}
# model_args: ${{ github.event.inputs.model_args }}
# batch_size: ${{ github.event.inputs.batch_size }}
# backend: nvidia-gpu