diff --git a/e2e/tests/test_12_splits.py b/e2e/tests/test_12_splits.py index f439b4b6b..3122f1161 100644 --- a/e2e/tests/test_12_splits.py +++ b/e2e/tests/test_12_splits.py @@ -32,7 +32,7 @@ # ), (422, "missing dataset parameter", "", None, "MissingRequiredParameter"), (422, "empty dataset parameter", None, None, "MissingRequiredParameter"), - # (500, "SplitsNotFoundError", "natural_questions", None, "SplitsNamesError"), + # (500, "SplitsNotFoundError", "google-research-datasets/natural_questions", None, "SplitsNamesError"), # (500, "FileNotFoundError", "akhaliq/test", None, "SplitsNamesError"), # (500, "not-ready", "severo/fix-401", None, "SplitsResponseNotReady"), # not tested: 'internal_error' diff --git a/services/worker/src/worker/job_runners/split/presidio_scan.py b/services/worker/src/worker/job_runners/split/presidio_scan.py index 2224e7e54..f2429dddf 100644 --- a/services/worker/src/worker/job_runners/split/presidio_scan.py +++ b/services/worker/src/worker/job_runners/split/presidio_scan.py @@ -478,9 +478,9 @@ def compute(self) -> CompleteJobResult: "Open-Orca/SlimOrca", "dair-ai/emotion", "CohereForAI/aya_dataset", "legacy-datasets/c4", "cais/mmlu", "open-web-math/open-web-math", "code-search-net/code_search_net", "allenai/WildChat-1M", "rajpurkar/squad", "litagin/moe-speech", "Lin-Chen/ShareGPT4V", "shareAI/ShareGPT-Chinese-English-90k", "nomic-ai/gpt4all-j-prompt-generations", "ceval/ceval-exam", "google/fleurs", "openai/webgpt_comparisons", "bigcode/the-stack-v2", "HuggingFaceM4/the_cauldron", "Salesforce/dialogstudio", "LDJnr/Capybara", "stanfordnlp/imdb", "nampdn-ai/tiny-codes", "CausalLM/Refined-Anime-Text", "bigscience/P3", "vicgalle/alpaca-gpt4", "bigcode/ta-prompt", "Locutusque/UltraTextbooks", "allenai/c4", "pile-of-law/pile-of-law", "teknium/openhermes", - "TIGER-Lab/MathInstruct", "HuggingFaceH4/ultrafeedback_binarized", "PygmalionAI/PIPPA", "openai_humaneval", "abisee/cnn_dailymail", "yizhongw/self_instruct", "SirNeural/flan_v2", "nvidia/HelpSteer", "THUDM/AgentInstruct", "nvidia/OpenMathInstruct-1", + "TIGER-Lab/MathInstruct", "HuggingFaceH4/ultrafeedback_binarized", "PygmalionAI/PIPPA", "openai/openai_humaneval", "abisee/cnn_dailymail", "yizhongw/self_instruct", "SirNeural/flan_v2", "nvidia/HelpSteer", "THUDM/AgentInstruct", "nvidia/OpenMathInstruct-1", "openai/summarize_from_feedback", "nickrosh/Evol-Instruct-Code-80k-v1", "storytracer/US-PD-Books", "OpenAssistant/oasst2", "Cohere/wikipedia-2023-11-embed-multilingual-v3", "argilla/OpenHermesPreferences", "Hello-SimpleAI/HC3", "SciPhi/textbooks-are-all-you-need-lite", "vikp/textbook_quality_programming", "financial_phrasebank", - "truthfulqa/truthful_qa", "GAIR/MathPile", "Anthropic/persuasion", "m-a-p/Code-Feedback", "laion/laion2B-en", "wangrui6/Zhihu-KOL", "openchat/openchat_sharegpt4_dataset", "oscar", "sahil2801/CodeAlpaca-20k", "Tele-AI/TeleChat-PTD", + "truthfulqa/truthful_qa", "GAIR/MathPile", "Anthropic/persuasion", "m-a-p/Code-Feedback", "laion/laion2B-en", "wangrui6/Zhihu-KOL", "openchat/openchat_sharegpt4_dataset", "oscar-corpus/oscar", "sahil2801/CodeAlpaca-20k", "Tele-AI/TeleChat-PTD", "mozilla-foundation/common_voice_11_0", "mlabonne/orpo-dpo-mix-40k", "Open-Orca/FLAN", "rajpurkar/squad_v2", "nyanko7/LLaMA-65B", "aps/super_glue", "cognitivecomputations/wizard_vicuna_70k_unfiltered", "Amod/mental_health_counseling_conversations", "EleutherAI/proof-pile-2", "ProGamerGov/StableDiffusion-v1-5-Regularization-Images", "defunct-datasets/the_pile_books3", "mc4", "knkarthick/dialogsum", "argilla/distilabel-capybara-dpo-7k-binarized", "nyanko7/danbooru2023", "Hello-SimpleAI/HC3-Chinese", "MMMU/MMMU", "ise-uiuc/Magicoder-Evol-Instruct-110K", "argilla/distilabel-intel-orca-dpo-pairs", "H-D-T/Buzz", "theblackcat102/evol-codealpaca-v1", "animelover/danbooru2022", "CohereForAI/aya_collection", "allenai/soda", "lvwerra/stack-exchange-paired", "teknium/GPT4-LLM-Cleaned", "BelleGroup/train_1M_CN", "allenai/peS2o", "vivym/midjourney-messages", "oscar-corpus/OSCAR-2301", @@ -514,10 +514,10 @@ def compute(self) -> CompleteJobResult: "facebook/flores", "reazon-research/reazonspeech", "swype/instruct", "athirdpath/DPO_Pairs-Roleplay-Alpaca-NSFW", "cognitivecomputations/dolphin-coder", "McGill-NLP/WebLINX", "sarvamai/samvaad-hi-v1", "froggeric/creativity", "0-hero/Matter-0.1", "NortheasternUniversity/big_patent", "statmt/cc100", "jhu-clsp/jfleg", "neulab/conala", "jmhessel/newyorker_caption_contest", "HuggingFace-CN-community/translation", "bigcode/commitpack", "akoksal/LongForm", "JourneyDB/JourneyDB", "OpenGVLab/InternVid", "heliosbrahma/mental_health_chatbot_dataset", "mlsum", "google/xtreme_s", "Linaqruf/pixiv-niji-journey", "THUDM/webglm-qa", "starmpcc/Asclepius-Synthetic-Clinical-Notes", "fondant-ai/fondant-cc-25m", "jondurbin/airoboros-3.1", "wenge-research/yayi2_pretrain_data", "TuringsSolutions/NYTWritingStyleGuide", "KBlueLeaf/danbooru2023-sqlite", - "xx103/NYC_Motor_Vehicle_Collisions_and_Weather_Dataset", "bigcode/self-oss-instruct-sc2-exec-filter-50k", "natural_questions", "Helsinki-NLP/open_subtitles", "Dahoas/synthetic-instruct-gptj-pairwise", "open-llm-leaderboard/results", "teknium/trismegistus-project", "ro-h/regulatory_comments", "ibrahimhamamci/CT-RATE", "ruslanmv/ai-medical-chatbot", + "xx103/NYC_Motor_Vehicle_Collisions_and_Weather_Dataset", "bigcode/self-oss-instruct-sc2-exec-filter-50k", "google-research-datasets/natural_questions", "Helsinki-NLP/open_subtitles", "Dahoas/synthetic-instruct-gptj-pairwise", "open-llm-leaderboard/results", "teknium/trismegistus-project", "ro-h/regulatory_comments", "ibrahimhamamci/CT-RATE", "ruslanmv/ai-medical-chatbot", "eli5", "cimec/lambada", "PhilipMay/stsb_multi_mt", "GEM/wiki_lingua", "euirim/goodwiki", "laion/220k-GPT4Vision-captions-from-LIVIS", "sc890/DEEPFRUlT_DATASET", "Replete-AI/code_bagel", "uoft-cs/cifar10", "medical_questions_pairs", "codeparrot/codeparrot-clean", "google/bigbench", "camel-ai/physics", "bigcode/commitpackft", "silk-road/ChatHaruhi-54K-Role-Playing-Dialogue", "clouditera/security-paper-datasets", "openerotica/freedom-rp", "Major-TOM/Core-S2L2A", "vblagoje/cc_news", "kilt_tasks", - "pg19", "allenai/winogrande", "aharley/rvl_cdip", "naver-clova-ix/cord-v2", "jamescalam/unsplash-25k-photos", "jkhedri/psychology-dataset", "grammarly/coedit", "Duxiaoman-DI/FinCorpus", "a686d380/h-corpus-2023", "teknium/dataforge-economics", + "deepmind/pg19", "allenai/winogrande", "aharley/rvl_cdip", "naver-clova-ix/cord-v2", "jamescalam/unsplash-25k-photos", "jkhedri/psychology-dataset", "grammarly/coedit", "Duxiaoman-DI/FinCorpus", "a686d380/h-corpus-2023", "teknium/dataforge-economics", "jondurbin/cinematika-v0.1", "mlabonne/chatml_dpo_pairs", "hieunguyenminh/roleplay", "xz56/react-llama", "TeraflopAI/Caselaw_Access_Project", "coastalcph/lex_glue", "cornell-movie-review-data/rotten_tomatoes", "community-datasets/yahoo_answers_topics", "miracl/miracl", "humarin/chatgpt-paraphrases", "junelee/wizard_vicuna_70k", "csitfun/LogiCoT", "haonan-li/cmmlu", "shahules786/orca-best", "yuvalkirstain/pickapic_v2", "mozilla-foundation/common_voice_16_1", "Locutusque/UltraTextbooks-2.0", "m-a-p/MAP-CC", "google/code_x_glue_ct_code_to_text", "kmfoda/booksum", "hoskinson-center/proof-pile", "kaiokendev/SuperCOT-dataset", "tatsu-lab/alpaca_eval", "kwaikeg/KAgentInstruct", "MaziyarPanahi/WizardLM_evol_instruct_V2_196k", "facebook/xnli", "Muennighoff/flan", "qwedsacf/grade-school-math-instructions", "rickRossie/bluemoon_roleplay_chat_data_300k_messages", "codeparrot/self-instruct-starcoder", @@ -526,7 +526,7 @@ def compute(self) -> CompleteJobResult: "LDJnr/Verified-Camel", "WenhaoWang/VidProM", "bigcode/the-stack-v2-dedup", "Cohere/wikipedia-2023-11-embed-multilingual-v3-int8-binary", "internlm/Agent-FLAN", "isidentical/moondream2-coyo-5M-captions", "fashion_mnist", "shibing624/nli_zh", "monash_tsf", "camel-ai/ai_society", "michaelwzhu/ShenNong_TCM_Dataset", "linhtran92/viet_bud500", "Clinton/Text-to-sql-v1", "glaiveai/glaive-code-assistant-v2", "llmware/rag_instruct_benchmark_tester", "jovianzm/Pexels-400k", "WhiteRabbitNeo/WRN-Chapter-1", "Locutusque/function-calling-chatml", "ShimizuYuki/Marvel_network", "clips/mqa", "toxigen/toxigen-data", "joelniklaus/Multi_Legal_Pile", "miracl/miracl-corpus", "alespalla/chatbot_instruction_prompts", "teknium/GPTeacher-General-Instruct", "jondurbin/airoboros-gpt4-1.4.1", "VMware/open-instruct", "allenai/reward-bench", "davanstrien/haiku_dpo", "klue", - "ncbi_disease", "esdurmus/wiki_lingua", "wikimedia/wit_base", "shunk031/JGLUE", "llm-wizard/alpaca-gpt4-data-zh", "Vision-CAIR/cc_sbu_align", "pharaouk/dharma-1", "jondurbin/airoboros-2.2.1", "Vezora/Tested-22k-Python-Alpaca", "HAERAE-HUB/KMMLU", + "ncbi/ncbi_disease", "esdurmus/wiki_lingua", "wikimedia/wit_base", "shunk031/JGLUE", "llm-wizard/alpaca-gpt4-data-zh", "Vision-CAIR/cc_sbu_align", "pharaouk/dharma-1", "jondurbin/airoboros-2.2.1", "Vezora/Tested-22k-Python-Alpaca", "HAERAE-HUB/KMMLU", "MMInstruction/ArxivCap", "jondurbin/py-dpo-v0.1", "PleIAs/French-PD-Books", "CohereForAI/aya_evaluation_suite", "CohereForAI/aya_collection_language_split", "ClusterlabAi/101_billion_arabic_words_dataset", "google/imageinwords", "fancyzhx/amazon_polarity", "ehovy/race", "oscar-corpus/OSCAR-2109", "zh-plus/tiny-imagenet", "MoritzLaurer/multilingual-NLI-26lang-2mil7", "tyqiangz/multilingual-sentiments", "detection-datasets/fashionpedia", "EleutherAI/lambada_openai", "Anthropic/model-written-evals", "ds4sd/DocLayNet", "Zellic/smart-contract-fiesta", "FreedomIntelligence/huatuo_encyclopedia_qa", "Chinese-Vicuna/instruct_chat_50k.jsonl", "Trelis/function_calling_extended", "FreedomIntelligence/Evol-Instruct-Chinese-GPT4", "Anthropic/discrim-eval", "nlpie/Llama2-MedTuned-Instructions", "PixArt-alpha/SAM-LLaVA-Captions10M", "AkitoP/Hscene-Speech", "mlqa", "webis/tldr-17", "CogComp/trec", "biglam/europeana_newspapers", @@ -538,26 +538,26 @@ def compute(self) -> CompleteJobResult: "LLM360/AmberDatasets", "peiyi9979/Math-Shepherd", "Crystalcareai/MoD", "papluca/language-identification", "bigcode/the-stack-smol", "argilla/news-summary", "CarperAI/openai_summarize_comparisons", "argilla/databricks-dolly-15k-curated-en", "mikex86/stackoverflow-posts", "Anthropic/llm_global_opinions", "akjindal53244/Arithmo-Data", "OpenLLM-France/Claire-Dialogue-French-0.1", "arbml/CIDAR", "snorkelai/Snorkel-Mistral-PairRM-DPO-Dataset", "PleIAs/US-PD-Newspapers", "yh0701/FracAtlas_dataset", "somosnlp/Reglamento_Aeronautico_Colombiano_2024GemmaQA", "omi-health/medical-dialogue-to-soap-summary", "argilla/Capybara-Preferences", "UCLNLP/adversarial_qa", "convai-challenge/conv_ai_2", "ccdv/govreport-summarization", "mozilla-foundation/common_voice_8_0", "nomic-ai/gpt4all_prompt_generations_with_p3", "hugfaceguy0001/retarded_bar", "lksy/ru_instruct_gpt4", "Linly-AI/Chinese-pretraining-dataset", "mosaicml/instruct-v3", "corbt/all-recipes", "VatsaDev/TinyText", - "google/docci", "linux-cn/archive", "Johnnyeee/Yelpdata_663", "HuggingFaceTB/cosmopedia-100k", "nyu-mll/blimp", "defunct-datasets/bookcorpusopen", "iwslt2017", "recipe_nlg", "Helsinki-NLP/tatoeba", "GEM/viggo", + "google/docci", "linux-cn/archive", "Johnnyeee/Yelpdata_663", "HuggingFaceTB/cosmopedia-100k", "nyu-mll/blimp", "defunct-datasets/bookcorpusopen", "iwslt2017", "mbien/recipe_nlg", "Helsinki-NLP/tatoeba", "GEM/viggo", "bavard/personachat_truecased", "segments/sidewalk-semantic", "PolyAI/banking77", "facebook/pmd", "zeroshot/twitter-financial-news-topic", "nuprl/MultiPL-E", "GBaker/MedQA-USMLE-4-options", "camel-ai/code", "merve/turkish_instructions", "tasksource/oasst1_pairwise_rlhf_reward", "winddude/reddit_finance_43_250k", "tiedong/goat", "togethercomputer/RedPajama-Data-Instruct", "DKYoon/SlimPajama-6B", "Maxx0/sexting-nsfw-adultconten", "squarelike/OpenOrca-gugugo-ko", "MMInstruction/VLFeedback", "LLaVA-VL/llava-plus-data", "McAuley-Lab/Amazon-Reviews-2023", "Open-Orca/1million-gpt-4", "gwenxin/pills_inside_bottles", "keithito/lj_speech", "ontonotes/conll2012_ontonotesv5", "mwritescode/slither-audited-smart-contracts", "bsmock/pubtables-1m", "tasksource/mmlu", "bigcode/bigcode-pii-dataset", "medalpaca/medical_meadow_wikidoc", "P01son/instructions", "ArtifactAI/arxiv-physics-instruct-tune-30k", "mrtoy/mobile-ui-design", "nampdn-ai/tiny-orca-textbooks", "kyujinpy/KOpen-platypus", "YeungNLP/firefly-pretrain-dataset", "unalignment/airoboros-2.2", "totally-not-an-llm/EverythingLM-data-V3", "CASIA-LM/ChineseWebText", "NeuralNovel/Neural-DPO", "AI4Math/MathVerse", "ucinlp/drop", "gigaword", "CUHK-CSE/wider_face", "microsoft/wiki_qa", "HUPD/hupd", "liweili/c4_200m", "nielsr/funsd-layoutlmv3", "IDEA-CCNL/laion2B-multi-chinese-subset", "dennlinger/eur-lex-sum", "mitclinicalml/clinical-ie", "Matthijs/cmu-arctic-xvectors", "FredZhang7/stable-diffusion-prompts-2.47M", "philschmid/flanv2", "NTU-NLP-sg/xCodeEval", "MadVoyager/stable_diffusion_instructional_dataset", "zetavg/ShareGPT-Processed", "shibing624/nli-zh-all", "oscar-corpus/colossal-oscar-1.0", "greengerong/leetcode", "ProgramComputer/voxceleb", "allenai/paloma", - "jondurbin/airoboros-3.2", "facebook/anli", "ibm/duorc", "gem", "peluz/lener_br", "Helsinki-NLP/news_commentary", "paws-x", "clips/mfaq", "skytnt/anime-segmentation", "alkzar90/NIH-Chest-X-ray-dataset", + "jondurbin/airoboros-3.2", "facebook/anli", "ibm/duorc", "gem", "peluz/lener_br", "Helsinki-NLP/news_commentary", "google-research-datasets/paws-x", "clips/mfaq", "skytnt/anime-segmentation", "alkzar90/NIH-Chest-X-ray-dataset", "olm/wikipedia", "jamescalam/youtube-transcriptions", "shjwudp/chinese-c4", "eloukas/edgar-corpus", "reasoning-machines/gsm-hard", "merve/my_notes", "timbrooks/instructpix2pix-clip-filtered", "liswei/rm-static-zhTW", "llm-wizard/alpaca-gpt4-data", "camel-ai/chemistry", "THUDM/ImageRewardDB", "rewoo/planner_instruction_tuning_2k", "OpenLeecher/GPT4-10k", "breadlicker45/bread-midi-dataset", "Tarklanse/Traditional_Chinese_roleplay_chat_Dataset", "jat-project/jat-dataset", "lavita/ChatDoctor-HealthCareMagic-100k", "wuliangfo/Chinese-Pixiv-Novel", "knowrohit07/know_medical_dialogue_v2", "hackaprompt/hackaprompt-dataset", "maywell/ko_wikidata_QA", "swechatelangana/chandamama-kathalu", "Idavidrein/gpqa", "HuggingFaceH4/deita-10k-v0-sft", "m-a-p/CMMMU", "dcayton/nba_tracking_data_15_16", "kunishou/J-ResearchCorpus", "FreedomIntelligence/ApolloCorpus", "lightblue/tagengo-gpt4", "jojo0217/korean_safe_conversation", - "hfl/ruozhiba_gpt4_turbo", "narrativeqa", "RussianNLP/russian_super_glue", "google/speech_commands", "karpathy/tiny_shakespeare", "facebook/wiki_dpr", "skt/kobest_v1", "laion/laion-art", "gigant/oldbookillustrations", "ontocord/OIG-moderation", + "hfl/ruozhiba_gpt4_turbo", "deepmind/narrativeqa", "RussianNLP/russian_super_glue", "google/speech_commands", "karpathy/tiny_shakespeare", "facebook/wiki_dpr", "skt/kobest_v1", "laion/laion-art", "gigant/oldbookillustrations", "ontocord/OIG-moderation", "cryscan/multilingual-share", "roneneldan/TinyStoriesInstruct", "hltcoe/megawika", "Aeala/ShareGPT_Vicuna_unfiltered", "64bits/lima_vicuna_format", "nampdn-ai/tiny-webtext", "BAAI/COIG-PC-Lite", "LinkSoul/Chinese-LLaVA-Vision-Instructions", "AdaptLLM/medicine-tasks", "MBZUAI/VideoInstruct-100K", "jondurbin/contextual-dpo-v0.1", "matlok/multimodal-python-copilot-training-overview", "bai-roleplay/evol-character-200", "cathw/reddit_climate_comment", "wenbopan/Chinese-dpo-pairs", "AI-Lab-Makerere/beans", "indonlp/indonlu", "coastalcph/multi_eurlex", "s3prl/superb", "universal-dependencies/universal_dependencies", "Babelscape/wikineural", "pmc/open_access", "winvoker/turkish-sentiment-analysis-dataset", "edinburghcstr/ami", "Erythrocyte/Genshin_Datasets", "bigcode/the-stack-github-issues", "shibing624/CSC", "mattmdjaga/human_parsing_dataset", "camel-ai/biology", "hssd/hssd-hab", "PKU-Alignment/BeaverTails", "rhasspy/piper-checkpoints", "visheratin/laion-coco-nllb", "iamtarun/code_instructions_120k_alpaca", "rombodawg/LosslessMegaCodeTrainingV3_1.6m_Evol", "vivym/midjourney-prompts", "qgyd2021/few_shot_intent_sft", "QuyenAnhDE/Diseases_Symptoms", "ajibawa-2023/Python-Code-23k-ShareGPT", "m-a-p/COIG-Kun", - "CausalLM/GPT-4-Self-Instruct-German", "shareAI/novelai3", "MinervaAI/Aesir-Preview", "wintercoming6/artwork_for_sdxl", "Salesforce/lotsa_data", "ForzaJuve1/UEFA_Euro_2020_Data", "mo-mittal/reddit_political_subs", "Targoman/TLPC", "paws", "Stanford/web_questions", + "CausalLM/GPT-4-Self-Instruct-German", "shareAI/novelai3", "MinervaAI/Aesir-Preview", "wintercoming6/artwork_for_sdxl", "Salesforce/lotsa_data", "ForzaJuve1/UEFA_Euro_2020_Data", "mo-mittal/reddit_political_subs", "Targoman/TLPC", "google-research-datasets/paws", "Stanford/web_questions", "bigscience-data/roots_zh-cn_wikipedia", "laion/laion2B-en-aesthetic", "daekeun-ml/naver-news-summarization-ko", "CarperAI/openai_summarize_tldr", "competitions/aiornot", "huggingface/badges", "allenai/lila", "yuvalkirstain/pickapic_v1", "tatsu-lab/alpaca_farm", "cognitivecomputations/open-instruct-uncensored", "CheshireAI/guanaco-unchained", "openchat/openchat_sharegpt_v3", "LinkSoul/LLaSM-Audio-Instructions", "totally-not-an-llm/EverythingLM-data-V2", "jinaai/code_exercises", "0-hero/prompt-perfect", "jamescalam/ai-arxiv-chunked", "maywell/ko_Ultrafeedback_binarized", "keirp/hungarian_national_hs_finals_exam", "laion/laion-pop", - "gvecchio/MatSynth", "baobab-trees/wikipedia-human-retrieval-ja", "mii-llm/gazzetta-ufficiale", "shachardon/ShareLM", "MohamedRashad/midjourney-detailed-prompts", "ade-benchmark-corpus/ade_corpus_v2", "uoft-cs/cifar100", "mhardalov/exams", "josecannete/large_spanish_corpus", "quac", + "gvecchio/MatSynth", "baobab-trees/wikipedia-human-retrieval-ja", "mii-llm/gazzetta-ufficiale", "shachardon/ShareLM", "MohamedRashad/midjourney-detailed-prompts", "ade-benchmark-corpus/ade_corpus_v2", "uoft-cs/cifar100", "mhardalov/exams", "josecannete/large_spanish_corpus", "allenai/quac", "microsoft/xglue", "huggingface/documentation-images", "seamew/ChnSentiCorp", "tau/scrolls", "bible-nlp/biblenlp-corpus", "JulesBelveze/tldr_news", "christopher/rosetta-code", "inria-soda/tabular-benchmark", "beyond/chinese_clean_passages_80m", "bigbio/pubmed_qa", "Cohere/miracl-zh-queries-22-12", "koutch/stackoverflow_python", "ACCA225/Kaggle-Stable-Diffusion", "Yasbok/Alpaca_arabic_instruct", "bertin-project/alpaca-spanish", "laion/laion400m", "axiong/pmc_oa", "medalpaca/medical_meadow_medical_flashcards", "dominguesm/Canarim-Instruct-PTBR-Dataset", "p1atdev/niji-v5", "zetavg/coct-en-zh-tw-translations-twp-300k", "skeskinen/TinyStories-GPT4", "xmcmic/PMC-VQA", "beomi/KoAlpaca-v1.1a", "ecnu-icalk/educhat-sft-002-data-osm", "kyujinpy/OpenOrca-KO", "open-phi/programming_books_llama", "hkust-nlp/deita-10k-v0", "jxu124/OpenX-Embodiment", "m-a-p/MusicPile", @@ -569,7 +569,7 @@ def compute(self) -> CompleteJobResult: "nbertagnolli/counsel-chat", "theblackcat102/codex-math-qa", "RyokoAI/Syosetu711K", "emre/stanford-alpaca-cleaned-turkish-translated", "somosnlp-hackathon-2023/Habilidades_Agente_v1", "recastai/LAION-art-EN-improved-captions", "FreedomIntelligence/huatuo_knowledge_graph_qa", "FreedomIntelligence/ShareGPT-CN", "Mutonix/RefGPT-Fact", "nlpai-lab/databricks-dolly-15k-ko", "TempoFunk/webvid-10M", "shinonomelab/cleanvid-15m_map", "smangrul/code-chat-assistant-v1", "OleehyO/latex-formulas", "daat/DATA", "axiong/pmc_llama_instructions", "AdaptLLM/law-tasks", "chargoddard/rpguild", "AiresPucrs/stanford-encyclopedia-philosophy", "amaai-lab/MusicBench", "diffusers/pokemon-gpt4-captions", "migtissera/Tess-Coder-v1.0", "HaoyeZhang/RLHF-V-Dataset", "togethercomputer/glaive-function-calling-v2-formatted", "osunlp/TravelPlanner", "BioMistral/BioInstructQA", "misikoff/zillow", "MedRAG/pubmed", "Writer/omniact", "openbmb/UltraSafety", - "visheratin/realworldqa", "lorinma/ChineseEncyclopedia", "sealuzh/app_reviews", "msra_ner", "openslr", "riddle_sense", "zhoubolei/scene_parse_150", "allenai/scitldr", "google-research-datasets/tydiqa", "IlyaGusev/gazeta", + "visheratin/realworldqa", "lorinma/ChineseEncyclopedia", "sealuzh/app_reviews", "msra_ner", "openslr/openslr", "INK-USC/riddle_sense", "zhoubolei/scene_parse_150", "allenai/scitldr", "google-research-datasets/tydiqa", "IlyaGusev/gazeta", "albertvillanova/legal_contracts", "google-research-datasets/conceptual_12m", "facebook/textvqa", "VIMA/VIMA-Data", "hanamizuki-ai/genshin-voice-v3.3-mandarin", "Nerfgun3/sakimi-chan_LoRA", "cyberagent/crello", "jxm/the_office_lines", "WynterJones/chatgpt-roles", "gbharti/wealth-alpaca_lora", "THUIR/T2Ranking", "IlyaGusev/ru_turbo_saiga", "tasksource/ScienceQA_text_only", "cvssp/WavCaps", "lighteval/MATH", "kunishou/oasst1-89k-ja", "zetavg/zh-tw-wikipedia", "lighteval/legal_summarization", "skeskinen/TinyStories-hf", "silk-road/chinese-dolly-15k", "TigerResearch/tigerbot-zhihu-zh-10k", "open-llm-leaderboard/requests", "mlabonne/guanaco-llama2", "totally-not-an-llm/EverythingLM-data", "BELLE-2/train_3.5M_CN_With_Category", "rizerphe/glaive-function-calling-v2-llama", "rombodawg/LimitlessMegaCodeTraining", "re-align/just-eval-instruct", "IlyaGusev/pippa_scored", "IGNF/FLAIR", @@ -580,7 +580,7 @@ def compute(self) -> CompleteJobResult: "izumi-lab/llm-japanese-dataset-vanilla", "xmj2002/Chinese_modern_classical", "cognitivecomputations/based", "laion/strategic_game_chess", "jondurbin/airoboros-gpt4-1.2", "jondurbin/airoboros-gpt4-m2.0", "rombodawg/LosslessMegaCodeTrainingV2", "shareAI/CodeChat", "qgyd2021/h_novel", "BAAI/COIG-PC-core", "Duxiaoman-DI/FinanceIQ", "Unified-Language-Model-Alignment/Anthropic_HH_Golden", "osunlp/TableInstruct", "CollectiveCognition/chats-data-2023-10-16", "hypervariance/function-calling-sharegpt", "google/reveal", "corbyrosset/researchy_questions", "Locutusque/Hercules-v3.0", "jmc255/aphantasia_drawing_dataset", "sayhan/strix-philosophy-qa", "fnlp/AnyInstruct", "NousResearch/json-mode-eval", "XintongHe/Stomatal_Images_Datasets", "abacusai/MetaMath_DPO_FewShot", "coseal/CodeUltraFeedback", "BAAI/CCI2-Data", "Astris/LA-Times", "H-D-T/RLSTACK", "deepmind/aqua_rat", "abuelkhair-corpus/arabic_billion_words", - "google/code_x_glue_tc_text_to_code", "medal", "mt_eng_vietnamese", "quora", "CSTR-Edinburgh/vctk", "wmt/wmt19", "dalle-mini/YFCC100M_OpenAI_subset", "merve/poetry", "yhavinga/ccmatrix", "silver/personal_dialog", + "google/code_x_glue_tc_text_to_code", "medal", "mt_eng_vietnamese", "quora-competitions/quora", "CSTR-Edinburgh/vctk", "wmt/wmt19", "dalle-mini/YFCC100M_OpenAI_subset", "merve/poetry", "yhavinga/ccmatrix", "silver/personal_dialog", "embedding-data/sentence-compression", "mozilla-foundation/common_voice_10_0", "m1guelpf/nouns", "Fazzie/Teyvat", "daspartho/stable-diffusion-prompts", "cardiffnlp/tweet_sentiment_multilingual", "PublicPrompts/Karsh", "MCG-NJU/MultiSports", "Dahoas/static-hh", "CarperAI/pilev2-dev", "shibing624/AdvertiseGen", "andersonbcdefg/supernatural-instructions-2m", "azcorpus/azcorpus_v0", "cognitivecomputations/oa_leet10k", "Abrumu/Fashion_controlnet_dataset_V3", "tasksource/tasksource-instruct-v0", "wenge-research/yayi_domain_subset", "ignmilton/ign_clean_instruct_dataset_500k", "changpt/ko-lima-vicuna", "pankajmathur/alpaca_orca", "marhensa/comfyui-workflow", "jondurbin/airoboros-2.1", "M-A-D/Mixed-Arabic-Datasets-Repo", "taide/TAIDE-14-tasks", "manu/project_gutenberg", "Lakera/gandalf_ignore_instructions", "goendalf666/sales-conversations", "yuyijiong/Multi-Doc-QA-Chinese", "fnlp/character-llm-data", "wenge-research/yayi_uie_sft_data", @@ -591,14 +591,14 @@ def compute(self) -> CompleteJobResult: "TigerResearch/tigerbot-law-plugin", "kaist-ai/Multilingual-CoT-Collection", "mcipriano/stackoverflow-kubernetes-questions", "jondurbin/airoboros-gpt4-1.4", "SALT-NLP/LLaVAR", "declare-lab/flan-mini", "jondurbin/airoboros-gpt4-2.0", "seungheondoh/LP-MusicCaps-MSD", "AILab-CVC/SEED-Bench", "zjunlp/InstructIE", "nisaar/LLAMA2_Legal_Dataset_4.4k_Instructions", "nampdn-ai/tiny-lessons", "Healthy13/Text2SQL", "MBZUAI-LLM/SlimPajama-627B-DC", "a686d380/sis-novel", "fedml/PubMedQA_instruction", "meta-math/MetaMathQA-40K", "PocketDoc/Choose-Your-Story-Long-Text-Adventures", "SinKove/synthetic_mammography_csaw", "unalignment/spicy-3.1", "locuslab/TOFU", "OpenGVLab/VideoChat2-IT", "LLM360/CrystalCoderDatasets", "argilla/ultrafeedback-curated", "HuggingFaceH4/grok-conversation-harmless", "HuggingFaceH4/OpenHermes-2.5-1k-longest", "Ziyuan111/DurhamTrees", "2A2I/Arabic-OpenHermes-2.5", "Locutusque/arc-cot", "osunlp/Multimodal-Mind2Web", - "rc9494/SP500_Date_Offset", "EleutherAI/lichess-puzzles", "conceptnet5/conceptnet5", "allenai/cosmos_qa", "thunlp/docred", "md_gender_bias", "mkqa", "onestop_english", "KorQuAD/squad_kor_v1", "allenai/swag", + "rc9494/SP500_Date_Offset", "EleutherAI/lichess-puzzles", "conceptnet5/conceptnet5", "allenai/cosmos_qa", "thunlp/docred", "md_gender_bias", "mkqa", "iastate/onestop_english", "KorQuAD/squad_kor_v1", "allenai/swag", "tweets-hate-speech-detection/tweets_hate_speech_detection", "wmt/wmt16", "ChristophSchuhmann/MS_COCO_2017_URL_TEXT", "SetFit/emotion", "ai4bharat/samanantar", "ccdv/arxiv-classification", "mteb/tweet_sentiment_extraction", "beki/privy", "zoheb/sketch-scene", "WINGNUS/ACL-OCL", "haor/pixiv_month_top50", "HuggingFaceM4/COCO", "haor/pixiv-yandere", "Plachta/Umamusume-voice-text-pairs", "keremberke/chest-xray-classification", "keremberke/table-extraction", "silatus/1k_Website_Screenshots_and_Metadata", "IlyaGusev/habr", "KrakExilios/koreandoll", "pmoe7/SP_500_Stocks_Data-ratios_news_price_10_yrs", "potsawee/wiki_bio_gpt3_hallucination", "RyokoAI/Fandom23K", "Bingsu/ko_alpaca_data", "medalpaca/medical_meadow_wikidoc_patient_information", "Papersnake/people_daily_news", "FreedomIntelligence/phoenix-sft-data-v1", "howard-hou/OCR-VQA", "silk-road/Vanilla-chinese-alpaca-luotuo", "danielv835/personal_finance_v0.2", "silk-road/Luotuo-QA-A-CoQA-Chinese", "gretelai/symptom_to_diagnosis", "agkphysics/AudioSet", "YeungNLP/ultrachat", "Iess/chinese_modern_poetry", "wendlerc/RenderedText", "Oasis-Team/Oasis-Corpus", "qgyd2021/chinese_chitchat", "MattCoddity/dockerNLcommands", "yuyijiong/Long-Instruction", "Skywork/ChineseDomainModelingEval", "xinrongzhang2022/InfiniteBench", "MohamedRashad/multilingual-tts", "silk-road/ChatHaruhi-Expand-118K", "Luckyjhg/Geo170K", "andersonbcdefg/synthetic_tuples_gpt35_turbo", "Rtian/DebugBench", "euclaise/reddit-instruct", "Locutusque/hercules-v1.0", "mastergopote44/Long-Term-Care-Aggregated-Data", "ontocord/CulturaY", "Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-1M", "mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha", "jg583/NSynth", "storytracer/LoC-PD-Books", "zhongshsh/CLoT-Oogiri-GO", "davidkim205/kollm-converations", "Locutusque/hercules-v4.0", "tdiggelm/climate_fever", "hfl/cmrc2018", "mrqa", - "nq_open", "kyunghyuncho/search_qa", "IWSLT/ted_talks_iwslt", "ubuntu-dialogs-corpus/ubuntu_dialogs_corpus", "SetFit/enron_spam", "gsarti/flores_101", "vblagoje/lfqa", "huggan/pokemon", "joelniklaus/lextreme", "OxAISH-AL-LLM/wiki_toxic", + "google-research-datasets/nq_open", "kyunghyuncho/search_qa", "IWSLT/ted_talks_iwslt", "ubuntu-dialogs-corpus/ubuntu_dialogs_corpus", "SetFit/enron_spam", "gsarti/flores_101", "vblagoje/lfqa", "huggan/pokemon", "joelniklaus/lextreme", "OxAISH-AL-LLM/wiki_toxic", "tomasg25/scientific_lay_summarisation", "svjack/pokemon-blip-captions-en-zh", "lambdalabs/naruto-blip-captions", "shunk031/wrime", "marmal88/skin_cancer", "IlyaGusev/rulm", "datadrivenscience/ship-detection", "Junity/UmaMusume-TokaiTeio-Dataset", "Den4ikAI/russian_dialogues", "LinhDuong/chatdoctor-200k", "Nebulous/gpt4all_pruned", "camel-ai/ai_society_translated", "alpindale/light-novels", "iamketan25/roleplay-instructions-dataset", "VMware/open-instruct-v1-oasst-dolly-hhrlhf", "Nan-Do/code-search-net-python", "ShoukanLabs/OpenNiji-Dataset", "Birchlabs/openai-prm800k-stepwise-critic", "Norquinal/claude_evol_instruct_210k", "mlfoundations/datacomp_1b", "tasksource/icl-symbol-tuning-instruct", "findnitai/english-to-hinglish", "pankajmathur/dolly-v2_orca", "sudy-super/dialogsum-ja", "sayakpaul/hf-codegen-v2", "FreedomIntelligence/CMB", "jamescalam/llama-2-arxiv-papers-chunked", "smangrul/hf-stack-v1", "abacusai/LongChat-Lines", "PetraAI/PetraAI", @@ -613,7 +613,7 @@ def compute(self) -> CompleteJobResult: "vibhorag101/phr_mental_therapy_dataset", "Vision-Flan/vision-flan_191-task_1k", "ahmed-masry/ChartQA", "ProlificAI/social-reasoning-rlhf", "BAAI/DataOptim", "Heralax/Augmental-Dataset", "LLM-Tuning-Safety/HEx-PHI", "kwaikeg/KAgentBench", "SeaLLMs/Sea-bench", "athirdpath/DPO_Pairs-Roleplay-Alpaca-NSFW-v1-SHUFFLED", "yale-nlp/FOLIO", "RealTimeData/bbc_news_alltime", "HuggingFaceH4/orca_dpo_pairs", "NebulaeWis/gelbooru_images", "llm-blender/Unified-Feedback", "grimulkan/LimaRP-augmented", "cyberagent/chatbot-arena-ja-calm2-7b-chat-experimental", "ehristoforu/midjourney-images", "Jiwonny29/project1", "Major-TOM/Core-S2L1C", "gorilla-llm/Berkeley-Function-Calling-Leaderboard", "julep-ai/openai-community-posts", "SALT-NLP/Design2Code", "Locutusque/OpenCerebrum-SFT", "m-a-p/CodeEditorBench", "chansung/merged_ds_coding", "spectrallabs/credit-scoring-training-dataset", "shareAI/DPO-zh-en-emoji", "rqq/GLM-4-Instruct-4K-zh", "Helsinki-NLP/bible_para", - "UFRGS/brwac", "ZihanWangKi/conllpp", "facebook/covost2", "head_qa", "facebook/lama", "multi_x_science_sum", "ptb_text_only", "allenai/social_bias_frames", "stanfordnlp/sst", "defunct-datasets/the_pile_openwebtext2", + "UFRGS/brwac", "ZihanWangKi/conllpp", "facebook/covost2", "head_qa", "facebook/lama", "multi_x_science_sum", "ptb-text-only/ptb_text_only", "allenai/social_bias_frames", "stanfordnlp/sst", "defunct-datasets/the_pile_openwebtext2", "google/wiki40b", "google-research-datasets/wiki_atomic_edits", "botisan-ai/cantonese-mandarin-translations", "nlpaueb/finer-139", "Stanford/wikitablequestions", "silver/lccc", "facebook/content_rephrasing", "Twitter/TwitterFollowGraph", "Nerfgun3/wlop_style", "TheFusion21/PokemonCards", "jeanlee/kmhas_korean_hate_speech", "sander-wood/irishman", "tobiolatunji/afrispeech-200", "swaption2009/20k-en-zh-translation-pinyin-hsk", "danielshemesh/midjourney", "Elfsong/ClinicalDataset", "Den4ikAI/russian_instructions", "paulofinardi/OIG_small_chip2_portuguese_brasil", "acheong08/nsfw_reddit", "VISION-Workshop/VISION-Datasets", "P1ayer-1/chatgpt-conversations-chatlogs.net", "wavpub/JinJinLeDao_QA_Dataset", "lang-uk/every_prompt", "pki/SecurityGPT", "zjkarina/matreshka", "deepghs/nsfw_detect", "JasperLS/prompt-injections", "ccmusic-database/music_genre", "jondurbin/airoboros-gpt4", "TigerResearch/pretrain_en", @@ -623,7 +623,7 @@ def compute(self) -> CompleteJobResult: "open-phi/ft-sample-mistral", "hkust-nlp/deita-6k-v0", "Doctor-Shotgun/no-robots-sharegpt", "styletts2-community/multilingual-phonemes-10k-alpha", "imone/OpenOrca_FLAN", "osv5m/osv5m", "multimodalart/steamboat-willy-frames", "irlab-udc/metahate", "grimulkan/theory-of-mind", "ai4bharat/indic-instruct-data-v0.1", "kobprof/skolegpt-instruct", "Ejafa/ye-pop", "steamcyclone/Pill_Ideologies-Post_Titles", "euclaise/reddit-instruct-curated", "VatsaDev/animebench-alpha", "0-hero/prompt-perfect-dpo", "MedRAG/textbooks", "TIGER-Lab/Mantis-Instruct", "ChuckMcSneed/various_RP_system_prompts", "chenmingxuan/Chinese-Patent-Summary", "cassiekang/cub200_dataset", "antiven0m/catboros-3.2-dpo", "ai4privacy/pii-masking-300k", "multilingual/orca_dpo_pairs", "BigAction/the-wave-clean", "legacy-datasets/ami", "TheBritishLibrary/blbooks", "convai-challenge/conv_ai_3", "e2e_nlg", "ethos", - "Helsinki-NLP/europarl", "hkcancor", "ucsbnlp/liar", "newsqa", "SemEvalWorkshop/sem_eval_2018_task_1", "rcds/swiss_judgment_prediction", "JAugusto97/told-br", "leondz/wnut_17", "CodedotAI/code_clippy_github", "castorini/mr-tydi", + "Helsinki-NLP/europarl", "hkcancor", "ucsbnlp/liar", "Maluuba/newsqa", "SemEvalWorkshop/sem_eval_2018_task_1", "rcds/swiss_judgment_prediction", "JAugusto97/told-br", "leondz/wnut_17", "CodedotAI/code_clippy_github", "castorini/mr-tydi", "flax-sentence-embeddings/stackexchange_math_jsonl", "jfrenz/legalglue", "ml6team/cnn_dailymail_nl", "sentence-transformers/parallel-sentences", "sentence-transformers/reddit-title-body", "stas/openwebtext-10k", "Azu/Handwritten-Mathematical-Expression-Convert-LaTeX", "patriziobellan/PET", "mozilla-foundation/common_voice_9_0", "bloomberg/entsum", "carblacac/twitter-sentiment-analysis", "HuggingFaceM4/VQAv2", "LHF/escorpius", "owaiskha9654/PubMed_MultiLabel_Text_Classification_Dataset_MeSH", "masakhane/mafand", "Muennighoff/P3", "Dahoas/instruct-synthetic-prompt-responses", "mjw/stock_market_tweets", "Korakoe/NijiJourney-Prompt-Pairs", "mrm8488/unnatural-instructions-full", "yuvalkirstain/PickaPic", "keremberke/blood-cell-object-detection", "keremberke/license-plate-object-detection", "forta/malicious-smart-contract-dataset", "ChristophSchuhmann/essays-with-instructions", "HuggingFaceH4/helpful-instructions", "nanaaaa/emotion_chinese_english", "wbbbbb/pclue", "lansinuote/ChnSentiCorp", "katanaml-org/invoices-donut-data-v1", @@ -634,9 +634,9 @@ def compute(self) -> CompleteJobResult: "glnmario/news-qa-summarization", "TriadParty/deepsex-RP", "pixparse/cc3m-wds", "Minami-su/Anime_novel_datasets", "Gourieff/ReActor", "cognitivecomputations/Code-74k-ShareGPT-Vicuna", "dataautogpt3/Dalle3", "DL3DV/DL3DV-Benchmark", "CausalLM/GPT-4-Self-Instruct-Turkish", "sablo/oasst2_curated", "STEM-AI-mtl/Electrical-engineering", "ikawrakow/imatrix-from-wiki-train", "somewheresystems/dataclysm-arxiv", "fblgit/simple-math", "fblgit/simple-math-DPO", "acon96/Home-Assistant-Requests", "Query-of-CC/Knowledge_Pile", "OpenDatasets/dalle-3-dataset", "ptx0/photo-concept-bucket", "zjunlp/iepile", "BatsResearch/ctga-v1", "MMInstruction/ArxivQA", "hotchpotch/JQaRA", "sean0042/KorMedMCQA", "p1atdev/ichikara-instruction", "maywell/LogicKor", "davanstrien/dataset-tldr", "xcodemind/vision2ui", "lawinstruct/lawinstruct", "UCSC-VLAA/HQ-Edit", - "kigner/ruozhiba-llama3-tt", "H-D-T/Select-Stack", "mutiyama/alt", "iabufarha/ar_sarcasm", "nilc-nlp/assin2", "cam-cst/cbt", "eurlex", "facebook/kilt_wikipedia", "multilingual_librispeech", "reuters21578", + "kigner/ruozhiba-llama3-tt", "H-D-T/Select-Stack", "mutiyama/alt", "iabufarha/ar_sarcasm", "nilc-nlp/assin2", "cam-cst/cbt", "eurlex", "facebook/kilt_wikipedia", "legacy-datasets/multilingual_librispeech", "ucirvine/reuters21578", "stanfordnlp/sentiment140", "ccasimiro/squad_es", "defunct-datasets/the_pile_stack_exchange", "facebook/wiki_movies", "Fraser/python-state-changes", "Hellisotherpeople/DebateSum", "SocialGrep/one-million-reddit-jokes", "blinoff/medical_qa_ru_data", "huggingface/transformers-metadata", "indonesian-nlp/id_newspapers_2018", - "openclimatefix/nimrod-uk-1km", "sentence-transformers/msmarco-hard-negatives", "nthngdy/oscar-small", "jiangjiechen/ekar_chinese", "sil-ai/bloom-captioning", "orieg/elsevier-oa-cc-by", "imagenet_sketch", "sileod/movie_recommendation", "quickdraw", "huggingface-legal/takedown-notices", + "openclimatefix/nimrod-uk-1km", "sentence-transformers/msmarco-hard-negatives", "nthngdy/oscar-small", "jiangjiechen/ekar_chinese", "sil-ai/bloom-captioning", "orieg/elsevier-oa-cc-by", "imagenet_sketch", "sileod/movie_recommendation", "google/quickdraw", "huggingface-legal/takedown-notices", "demelin/moral_stories", "RUCAIBox/Chinese-Generation", "Bingsu/zeroth-korean", "shjwudp/shu", "CarperAI/pile-v2-small-filtered", "citeseerx/ACL-fig", "keremberke/painting-style-classification", "jordyvl/DUDE_loader", "mlfoundations/datacomp_pools", "Loie/VGGSound", "artem9k/ai-text-detection-pile", "HuggingFaceH4/hhh_alignment", "hendrycks/ethics", "IlyaGusev/pikabu", "Aditya011/autotrain-data-nl-to-sql", "sedthh/tv_dialogue", "AnonymousSub/MedQuAD_Context_Question_Answer_Triples_TWO", "instruction-tuning-sd/cartoonization", "Polyglot-or-Not/Fact-Completion", "llm-wizard/Product-Descriptions-and-Ads", "emplocity/owca", "FronkonGames/steam-games-dataset", "lucasmccabe-lmi/codex_math_qa_alpaca_style", "ms903/Diff-SVC-refactor-pre-trained-model", "FourthBrainGenAI/AI-Superstar-Dataset", "Maciel/FinCUGE-Instruction", "HuggingFaceH4/code_evaluation_prompts", "hoskinson-center/minif2f-lean4", "Fsoft-AIC/the-vault-function", "wangrongsheng/HealthCareMagic-100k-en", @@ -647,7 +647,7 @@ def compute(self) -> CompleteJobResult: "CausalLM/GPT-4-Self-Instruct-Japanese", "jtatman/stable-diffusion-prompts-uncensored", "lowres/anime", "MediaTek-Research/TCEval-v2", "AGBonnet/augmented-clinical-notes", "HuggingFaceH4/cai-conversation-harmless", "lmms-lab/VQAv2", "lmms-lab/DocVQA", "Mutonix/RefGPT-Fact-v2", "ba188/NHS_HES", "ajibawa-2023/Children-Stories-Collection", "Vikhrmodels/LLaVA-Instruct-ru", "Doctor-Shotgun/theory-of-mind-dpo", "divyasharma0795/AppleVisionPro_Tweets", "TIGER-Lab/MATH-plus", "cgato/SlimOrcaDedupCleaned", "YanweiLi/MGM-Pretrain", "HuggingFaceH4/llava-instruct-mix-vsft", "fal-ai/imgsys-results", "mzbac/function-calling-llama-3-format-v1.1", "Yale-LILY/aeslc", "google-research-datasets/aquamuse", "allenai/atomic", "CFPB/consumer-finance-complaints", "rishitdagli/cppe-5", "stanfordnlp/craigslist_bargains", "fquad", "google_wellformed_query", "interpress_news_category_tr_lite", "thu-coai/kd_conv_with_kb", - "kakaobrain/kor_nli", "para_pat", "poem_sentiment", "eusip/silicone", "LSDSem/story_cloze", "turkic-interlingua/turkic_xwmt", "bea2019st/wi_locness", "fancyzhx/yelp_polarity", "CodedotAI/code_clippy", "SetFit/sst5", + "kakaobrain/kor_nli", "ParaPat/para_pat", "google-research-datasets/poem_sentiment", "eusip/silicone", "LSDSem/story_cloze", "turkic-interlingua/turkic_xwmt", "bea2019st/wi_locness", "fancyzhx/yelp_polarity", "CodedotAI/code_clippy", "SetFit/sst5", "deepset/germandpr", "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl", "microsoft/codexglue_method_generation", "nickmuchi/financial-classification", "uitnlp/vietnamese_students_feedback", "ydshieh/coco_dataset_script", "cgarciae/cartoonset", "DMetaSoul/chinese-semantic-textual-similarity", "ukr-models/Ukr-Synth", "Matthijs/snacks", "csebuetnlp/CrossSum", "Moo/korean-parallel-corpora", "HuggingFaceM4/TGIF", "khalidalt/tydiqa-goldp", "mteb/amazon_reviews_multi", "silver/mmchat", "fmplaza/offendes", "ColumbiaNLP/FLUTE", "tner/ontonotes5", "jordanparker6/publaynet", "tarteel-ai/quranqa", "OATML-Markslab/ProteinGym", "google/cvss", "RUCAIBox/Open-Dialogue", "cardiffnlp/tweet_topic_multi", "priyank-m/chinese_text_recognition", "skytnt/fbanimehq", "huggingface-projects/color-palettes-sd", "heegyu/namuwiki", "FremyCompany/BioLORD-Dataset", @@ -663,7 +663,7 @@ def compute(self) -> CompleteJobResult: "listen2you002/ChartLlama-Dataset", "saillab/taco-datasets", "nuprl/CanItEdit", "kyujinpy/orca_math_dpo", "adamkarvonen/chess_games", "blancsw/oasst2_top1_chat_format", "Awiny/Howto-Interlink7M", "NobodyExistsOnTheInternet/ToxicDPOqa", "VatsaDev/worldbuild", "lorinma/NL2SQL_zh", "mlabonne/chessllm", "genggui001/gg_zh_v1_550B", "DL3DV/DL3DV-ALL-4K", "paraloq/json_data_extraction", "tastypear/unalignment-toxic-dpo-v0.2-zh_cn", "hpprc/jawiki", "eduagarcia/LegalPT_dedup", "christopherthompson81/quant_exploration", "alvarobartt/dpo-mix-7k-simplified", "ucekmez/OpenOrca-tr", "ehristoforu/dalle-3-images", "ivrit-ai/whisper-training", "SPRIGHT-T2I/spright", "coseal/CodeUltraFeedback_binarized", "ParasiticRogue/Bluemoon-Light", "wdndev/webnovel-chinese", "jondurbin/bagel-v0.5", "Lin-Chen/MMStar", "tolgadev/turkish_73k_instruct_extended", "Babelscape/ALERT_DPO", - "kigner/ruozhiba-llama3", "davanstrien/dataset-tldr-preference-dpo", "facebook/asset", "barilan/blog_authorship_corpus", "dataset-org/c3", "clinc/clinc_oos", "eli5_category", "mohnish/lc_quad", "lm1b", "para_crawl", + "kigner/ruozhiba-llama3", "davanstrien/dataset-tldr-preference-dpo", "facebook/asset", "barilan/blog_authorship_corpus", "dataset-org/c3", "clinc/clinc_oos", "eli5_category", "mohnish/lc_quad", "lm1b", "ParaCrawl/para_crawl", "crscardellino/spanish_billion_words", "KorQuAD/squad_kor_v2", "nunorc/squad_v1_pt", "cgpotts/swda", "nakhun/thaisum", "wmt/wmt14", "SetFit/20_newsgroups", "bertin-project/mc4-sampling", "lbox/lbox_open", "codeparrot/codeparrot-clean-train", "thomwolf/github-python", "Adapting/empathetic_dialogues_v2", "Bingsu/Human_Action_Recognition", "mustapha/QuranExe", "ceyda/fashion-products-small", "frgfm/imagenette", "naver-clova-ix/synthdog-en", "bigscience/evaluation-results", "pcuenq/oxford-pets", "SLPL/syntran-fa", "RUCAIBox/Story-Generation", "jonathanli/law-stack-exchange", "ai-forever/school_notebooks_RU", "ashraq/esc50", "waifu-research-department/regularization", "sbx/superlim-2", "ashraq/financial-news", "AluminiumOxide/personal_latent_diffusion", "elenanereiss/german-ler", "Nerfgun3/flower_style",