From 887d9d5ed3e45e389dc5524c582b93551761f0b5 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 21 Aug 2024 14:03:26 +0000 Subject: [PATCH] more --- docs/source/openapi.json | 6 +-- docs/source/rows.md | 6 +-- .../worker/job_runners/split/presidio_scan.py | 44 +++++++++---------- .../dataset/test_compatible_libraries.py | 2 +- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/docs/source/openapi.json b/docs/source/openapi.json index 4fed46813..f9fd5572b 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -5288,7 +5288,7 @@ "examples": { "number of URLS for a dataset": { "summary": "number of URLs for a dataset.", - "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=conceptual_captions", + "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=google-research-datasets/conceptual_captions", "value": { "urls_columns": ["image_url"], "has_urls_columns": true, @@ -5301,7 +5301,7 @@ }, "number of URLS for a subset": { "summary": "number of URLs for a subset.", - "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=conceptual_captions&config=labeled", + "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=google-research-datasets/conceptual_captions&config=labeled", "value": { "urls_columns": ["image_url"], "has_urls_columns": true, @@ -5314,7 +5314,7 @@ }, "number of URLS for a split": { "summary": "number of URLs for a split.", - "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=conceptual_captions&config=labeled&split=train", + "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=google-research-datasets/conceptual_captions&config=labeled&split=train", "value": { "has_urls_columns": true, "num_opt_in_urls": 0, diff --git a/docs/source/rows.md b/docs/source/rows.md index 777f2725b..46fae09e8 100644 --- a/docs/source/rows.md +++ b/docs/source/rows.md @@ -163,10 +163,10 @@ Images are represented as a JSON object with three fields: - `height`: height (in pixels) of the image - `width`: width (in pixels) of the image -Here is an example of image, from the first row of the cifar100 dataset: +Here is an example of image, from the first row of the uoft-cs/cifar100 dataset: ```json -// https://datasets-server.huggingface.co/rows?dataset=cifar100&config=cifar100&split=train&offset=0&length=1 +// https://datasets-server.huggingface.co/rows?dataset=uoft-cs/cifar100&config=cifar100&split=train&offset=0&length=1 { "features": [ { "feature_idx": 0, "name": "img", "type": { "_type": "Image" } }, @@ -177,7 +177,7 @@ Here is an example of image, from the first row of the cifar100 dataset: "row_idx": 0, "row": { "img": { - "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/aadb3af77e9048adbea6b47c21a81e47dd092ae5/--/cifar100/train/0/img/image.jpg?Expires=1710283469&Signature=A1v0cG07nuaBxYbuPR5EUZpJ9Se072SBDr4935gEsOESHGVyeqvd3qmvdsy1fuqbHk0dnx~p6MLtQ-hg3aCBOJ8eIJ5ItIoyYT4riJRuPQC0VFUb~b1maEwU8LRoXXuvrSysSz2QhBbC~ofv6cQudm~~bgGxXWAslDs180KnmPDsMU55ySsKyKQYNEkQKyuYvrGIJbFeg4lEps0f5CEwUstAwRAwlk~mzRpzUDBq7nJ~DcujTlllLv36nJX~too8mMnFn6dCn2nfGOFYwUiyYM73Czv-laLhVaIVUzcuJum90No~KNGzfYeFZpPqktA7MjCzRLf1gz5kA7wBqnY-8Q__&Key-Pair-Id=K3EI6M078Z3AC3", + "src": "https://datasets-server.huggingface.co/cached-assets/uoft-cs/cifar100/--/aadb3af77e9048adbea6b47c21a81e47dd092ae5/--/cifar100/train/0/img/image.jpg?Expires=1710283469&Signature=A1v0cG07nuaBxYbuPR5EUZpJ9Se072SBDr4935gEsOESHGVyeqvd3qmvdsy1fuqbHk0dnx~p6MLtQ-hg3aCBOJ8eIJ5ItIoyYT4riJRuPQC0VFUb~b1maEwU8LRoXXuvrSysSz2QhBbC~ofv6cQudm~~bgGxXWAslDs180KnmPDsMU55ySsKyKQYNEkQKyuYvrGIJbFeg4lEps0f5CEwUstAwRAwlk~mzRpzUDBq7nJ~DcujTlllLv36nJX~too8mMnFn6dCn2nfGOFYwUiyYM73Czv-laLhVaIVUzcuJum90No~KNGzfYeFZpPqktA7MjCzRLf1gz5kA7wBqnY-8Q__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 32, "width": 32 }, diff --git a/services/worker/src/worker/job_runners/split/presidio_scan.py b/services/worker/src/worker/job_runners/split/presidio_scan.py index 20456bb4e..8e832a8fc 100644 --- a/services/worker/src/worker/job_runners/split/presidio_scan.py +++ b/services/worker/src/worker/job_runners/split/presidio_scan.py @@ -475,19 +475,19 @@ def compute(self) -> CompleteJobResult: "bigcode/starcoderdata", "microsoft/orca-math-word-problems-200k", "imagenet-1k", "nyu-mll/glue", "bigcode/the-stack-dedup", "togethercomputer/RedPajama-Data-V2", "gretelai/synthetic_text_to_sql", "allenai/objaverse", "Skylion007/openwebtext", "wikitext", "HuggingFaceM4/WebSight", "RyokoAI/ShareGPT52K", "laion/OIG", "stanfordnlp/SHP", "PleIAs/YouTube-Commons", "Skywork/SkyPile-150B", "glaiveai/glaive-function-calling-v2", "Samsung/samsum", "lmsys/chatbot_arena_conversations", "openbmb/UltraFeedback", "lambdalabs/pokemon-blip-captions", "shibing624/medical", "berkeley-nest/Nectar", "Intel/orca_dpo_pairs", "YeungNLP/firefly-train-1.1M", "BAAI/COIG-PC", "meta-math/MetaMathQA", "gsm8k", "codeparrot/github-code", "bookcorpus/bookcorpus", - "Open-Orca/SlimOrca", "dair-ai/emotion", "CohereForAI/aya_dataset", "legacy-datasets/c4", "cais/mmlu", "open-web-math/open-web-math", "code_search_net", "allenai/WildChat-1M", "rajpurkar/squad", "litagin/moe-speech", + "Open-Orca/SlimOrca", "dair-ai/emotion", "CohereForAI/aya_dataset", "legacy-datasets/c4", "cais/mmlu", "open-web-math/open-web-math", "code-search-net/code_search_net", "allenai/WildChat-1M", "rajpurkar/squad", "litagin/moe-speech", "Lin-Chen/ShareGPT4V", "shareAI/ShareGPT-Chinese-English-90k", "nomic-ai/gpt4all-j-prompt-generations", "ceval/ceval-exam", "google/fleurs", "openai/webgpt_comparisons", "bigcode/the-stack-v2", "HuggingFaceM4/the_cauldron", "Salesforce/dialogstudio", "LDJnr/Capybara", "stanfordnlp/imdb", "nampdn-ai/tiny-codes", "CausalLM/Refined-Anime-Text", "bigscience/P3", "vicgalle/alpaca-gpt4", "bigcode/ta-prompt", "Locutusque/UltraTextbooks", "allenai/c4", "pile-of-law/pile-of-law", "teknium/openhermes", - "TIGER-Lab/MathInstruct", "HuggingFaceH4/ultrafeedback_binarized", "PygmalionAI/PIPPA", "openai_humaneval", "cnn_dailymail", "yizhongw/self_instruct", "SirNeural/flan_v2", "nvidia/HelpSteer", "THUDM/AgentInstruct", "nvidia/OpenMathInstruct-1", + "TIGER-Lab/MathInstruct", "HuggingFaceH4/ultrafeedback_binarized", "PygmalionAI/PIPPA", "openai_humaneval", "abisee/cnn_dailymail", "yizhongw/self_instruct", "SirNeural/flan_v2", "nvidia/HelpSteer", "THUDM/AgentInstruct", "nvidia/OpenMathInstruct-1", "openai/summarize_from_feedback", "nickrosh/Evol-Instruct-Code-80k-v1", "storytracer/US-PD-Books", "OpenAssistant/oasst2", "Cohere/wikipedia-2023-11-embed-multilingual-v3", "argilla/OpenHermesPreferences", "Hello-SimpleAI/HC3", "SciPhi/textbooks-are-all-you-need-lite", "vikp/textbook_quality_programming", "financial_phrasebank", "truthful_qa", "GAIR/MathPile", "Anthropic/persuasion", "m-a-p/Code-Feedback", "laion/laion2B-en", "wangrui6/Zhihu-KOL", "openchat/openchat_sharegpt4_dataset", "oscar", "sahil2801/CodeAlpaca-20k", "Tele-AI/TeleChat-PTD", "mozilla-foundation/common_voice_11_0", "mlabonne/orpo-dpo-mix-40k", "Open-Orca/FLAN", "rajpurkar/squad_v2", "nyanko7/LLaMA-65B", "super_glue", "cognitivecomputations/wizard_vicuna_70k_unfiltered", "Amod/mental_health_counseling_conversations", "EleutherAI/proof-pile-2", "ProGamerGov/StableDiffusion-v1-5-Regularization-Images", "the_pile_books3", "mc4", "knkarthick/dialogsum", "argilla/distilabel-capybara-dpo-7k-binarized", "nyanko7/danbooru2023", "Hello-SimpleAI/HC3-Chinese", "MMMU/MMMU", "ise-uiuc/Magicoder-Evol-Instruct-110K", "argilla/distilabel-intel-orca-dpo-pairs", "H-D-T/Buzz", "theblackcat102/evol-codealpaca-v1", "animelover/danbooru2022", "CohereForAI/aya_collection", "allenai/soda", "lvwerra/stack-exchange-paired", "teknium/GPT4-LLM-Cleaned", "BelleGroup/train_1M_CN", "allenai/peS2o", "vivym/midjourney-messages", "oscar-corpus/OSCAR-2301", - "taesiri/arxiv_qa", "unalignment/toxic-dpo-v0.1", "math-ai/AutoMathText", "mozilla-foundation/common_voice_13_0", "nampdn-ai/tiny-textbooks", "ise-uiuc/Magicoder-OSS-Instruct-75K", "common_voice", "armanc/scientific_papers", "mlabonne/guanaco-llama2-1k", "DIBT/10k_prompts_ranked", + "taesiri/arxiv_qa", "unalignment/toxic-dpo-v0.1", "math-ai/AutoMathText", "mozilla-foundation/common_voice_13_0", "nampdn-ai/tiny-textbooks", "ise-uiuc/Magicoder-OSS-Instruct-75K", "legacy-datasets/common_voice", "armanc/scientific_papers", "mlabonne/guanaco-llama2-1k", "DIBT/10k_prompts_ranked", "medical_dialog", "nomic-ai/gpt4all_prompt_generations", "go_emotions", "iamtarun/python_code_instructions_18k_alpaca", "argilla/dpo-mix-7k", "MBZUAI/LaMini-instruction", "qiaojin/PubMedQA", "LinkSoul/instruction_merge_set", "LooksJuicy/ruozhiba", "pleisto/wikipedia-cn-20230720-filtered", "kakaobrain/coyo-700m", "gaia-benchmark/GAIA", "PleIAs/Post-OCR-Correction", "fancyzhx/ag_news", "cognitivecomputations/WizardLM_alpaca_evol_instruct_70k_unfiltered", "BelleGroup/train_3.5M_CN", "togethercomputer/Long-Data-Collections", "derek-thomas/ScienceQA", "HuggingFaceM4/OBELICS", "abacusai/SystemChat", - "google/MusicCaps", "dell-research-harvard/AmericanStories", "shahules786/orca-chat", "daily_dialog", "cognitivecomputations/samantha-data", "allenai/MADLAD-400", "pixparse/idl-wds", "eriktks/conll2003", "oscar-corpus/OSCAR-2201", "BelleGroup/multiturn_chat_0.8M", + "google/MusicCaps", "dell-research-harvard/AmericanStories", "shahules786/orca-chat", "li2017dailydialog/daily_dialog", "cognitivecomputations/samantha-data", "allenai/MADLAD-400", "pixparse/idl-wds", "eriktks/conll2003", "oscar-corpus/OSCAR-2201", "BelleGroup/multiturn_chat_0.8M", "knowrohit07/know_sql", "bigscience/xP3", "mosaicml/dolly_hhrlhf", "nvidia/ChatQA-Training-Data", "zzliang/GRIT", "tweet_eval", "togethercomputer/RedPajama-Data-1T-Sample", "izumi-lab/llm-japanese-dataset", "TigerResearch/pretrain_zh", "Dahoas/rm-static", "HuggingFaceH4/stack-exchange-preferences", "hakurei/open-instruct-v1", "liuhaotian/LLaVA-Pretrain", "MMInstruction/M3IT", "lmsys/toxic-chat", "librispeech_asr", "codeparrot/apps", "BelleGroup/train_2M_CN", "laion/gpt4v-dataset", "jondurbin/truthy-dpo-v0.1", "argilla/ultrafeedback-binarized-preferences-cleaned", "mbpp", "xlangai/spider", "Helsinki-NLP/opus-100", "openlifescienceai/medmcqa", "BelleGroup/train_0.5M_CN", "defunct-datasets/amazon_reviews_multi", "JeanKaddour/minipile", "michaelwzhu/ChatMed_Consult_Dataset", "MBZUAI/Bactrian-X", @@ -506,16 +506,16 @@ def compute(self) -> CompleteJobResult: "FreedomIntelligence/HuatuoGPT-sft-data-v1", "nlpai-lab/kullm-v2", "ai4privacy/pii-masking-200k", "argilla/OpenHermes2.5-dpo-binarized-alpha", "ArmelR/stack-exchange-instruction", "argilla/distilabel-math-preference-dpo", "allenai/openbookqa", "facebook/voxpopuli", "IlyaGusev/ru_turbo_alpaca", "griffin/chain_of_density", "jondurbin/gutenberg-dpo-v0.1", "PleIAs/French-PD-Newspapers", "ParlAI/blended_skill_talk", "mandarjoshi/trivia_qa", "visual_genome", "JanosAudran/financial-reports-sec", "fnlp/moss-003-sft-data", "approximatelabs/tablib-v1-full", "mozilla-foundation/common_voice_16_0", "xai-org/RealworldQA", "lmsys/lmsys-arena-human-preference-55k", "Abirate/english_quotes", "BelleGroup/generated_chat_0.4M", "maharshipandya/spotify-tracks-dataset", "TokenBender/code_instructions_122k_alpaca_style", "Flmc/DISC-Med-SFT", "ShengbinYue/DISC-Law-SFT", "argilla/ultrafeedback-binarized-preferences", "multi_news", "nguha/legalbench", - "Squish42/bluemoon-fandom-1-1-rp-cleaned", "gorilla-llm/APIBench", "OpenAssistant/oasst_top1_2023-08-25", "joujiboi/japanese-anime-speech", "BAAI/CCI-Data", "conceptual_captions", "selfrag/selfrag_train_data", "MLCommons/peoples_speech", "laion/laion-coco", "gamino/wiki_medical_terms", + "Squish42/bluemoon-fandom-1-1-rp-cleaned", "gorilla-llm/APIBench", "OpenAssistant/oasst_top1_2023-08-25", "joujiboi/japanese-anime-speech", "BAAI/CCI-Data", "google-research-datasets/conceptual_captions", "selfrag/selfrag_train_data", "MLCommons/peoples_speech", "laion/laion-coco", "gamino/wiki_medical_terms", "yitingxie/rlhf-reward-datasets", "PKU-Alignment/PKU-SafeRLHF-10K", "graelo/wikipedia", "bitext/Bitext-customer-support-llm-chatbot-training-dataset", "AdaptLLM/finance-tasks", "XzJosh/audiodataset", "BAAI/TACO", "nvidia/ChatRAG-Bench", "google/boolq", "kdexd/red_caps", "ccdv/pubmed-summarization", "ctheodoris/Genecorpus-30M", "Cohere/wikipedia-22-12-en-embeddings", "tasksource/bigbench", "junelee/sharegpt_deepl_ko", "elyza/ELYZA-tasks-100", "codefuse-ai/CodeExercise-Python-27k", "FreedomIntelligence/ALLaVA-4V", "NilanE/ParallelFiction-Ja_En-100k", "facebook/multilingual_librispeech", "ms903/sovits4.0-768vec-layer12", "CohereForAI/xP3x", "princeton-nlp/SWE-bench", "allenai/ultrafeedback_binarized_cleaned", "sujet-ai/Sujet-Finance-Instruct-177k", "tau/commonsense_qa", "ccdv/arxiv-summarization", "AmazonScience/massive", "ShapeNet/ShapeNetCore", "bigbio/med_qa", "Cohere/wikipedia-22-12-simple-embeddings", "lukaemon/mmlu", "bigcode/humanevalpack", "ArtifactAI/arxiv-math-instruct-50k", "dikw/hh_rlhf_cn", "food101", "allenai/qasper", "stanfordnlp/snli", "Helsinki-NLP/tatoeba_mt", "laion/laion-high-resolution", "facebook/flores", "reazon-research/reazonspeech", "swype/instruct", "athirdpath/DPO_Pairs-Roleplay-Alpaca-NSFW", "cognitivecomputations/dolphin-coder", "McGill-NLP/WebLINX", "sarvamai/samvaad-hi-v1", "froggeric/creativity", "0-hero/Matter-0.1", "NortheasternUniversity/big_patent", - "cc100", "jhu-clsp/jfleg", "neulab/conala", "jmhessel/newyorker_caption_contest", "HuggingFace-CN-community/translation", "bigcode/commitpack", "akoksal/LongForm", "JourneyDB/JourneyDB", "OpenGVLab/InternVid", "heliosbrahma/mental_health_chatbot_dataset", + "statmt/cc100", "jhu-clsp/jfleg", "neulab/conala", "jmhessel/newyorker_caption_contest", "HuggingFace-CN-community/translation", "bigcode/commitpack", "akoksal/LongForm", "JourneyDB/JourneyDB", "OpenGVLab/InternVid", "heliosbrahma/mental_health_chatbot_dataset", "mlsum", "google/xtreme_s", "Linaqruf/pixiv-niji-journey", "THUDM/webglm-qa", "starmpcc/Asclepius-Synthetic-Clinical-Notes", "fondant-ai/fondant-cc-25m", "jondurbin/airoboros-3.1", "wenge-research/yayi2_pretrain_data", "TuringsSolutions/NYTWritingStyleGuide", "KBlueLeaf/danbooru2023-sqlite", "xx103/NYC_Motor_Vehicle_Collisions_and_Weather_Dataset", "bigcode/self-oss-instruct-sc2-exec-filter-50k", "natural_questions", "Helsinki-NLP/open_subtitles", "Dahoas/synthetic-instruct-gptj-pairwise", "open-llm-leaderboard/results", "teknium/trismegistus-project", "ro-h/regulatory_comments", "ibrahimhamamci/CT-RATE", "ruslanmv/ai-medical-chatbot", - "eli5", "cimec/lambada", "PhilipMay/stsb_multi_mt", "GEM/wiki_lingua", "euirim/goodwiki", "laion/220k-GPT4Vision-captions-from-LIVIS", "sc890/DEEPFRUlT_DATASET", "Replete-AI/code_bagel", "cifar10", "medical_questions_pairs", + "eli5", "cimec/lambada", "PhilipMay/stsb_multi_mt", "GEM/wiki_lingua", "euirim/goodwiki", "laion/220k-GPT4Vision-captions-from-LIVIS", "sc890/DEEPFRUlT_DATASET", "Replete-AI/code_bagel", "uoft-cs/cifar10", "medical_questions_pairs", "codeparrot/codeparrot-clean", "google/bigbench", "camel-ai/physics", "bigcode/commitpackft", "silk-road/ChatHaruhi-54K-Role-Playing-Dialogue", "clouditera/security-paper-datasets", "openerotica/freedom-rp", "Major-TOM/Core-S2L2A", "vblagoje/cc_news", "kilt_tasks", "pg19", "allenai/winogrande", "aharley/rvl_cdip", "naver-clova-ix/cord-v2", "jamescalam/unsplash-25k-photos", "jkhedri/psychology-dataset", "grammarly/coedit", "Duxiaoman-DI/FinCorpus", "a686d380/h-corpus-2023", "teknium/dataforge-economics", "jondurbin/cinematika-v0.1", "mlabonne/chatml_dpo_pairs", "hieunguyenminh/roleplay", "xz56/react-llama", "TeraflopAI/Caselaw_Access_Project", "coastalcph/lex_glue", "rotten_tomatoes", "yahoo_answers_topics", "miracl/miracl", "humarin/chatgpt-paraphrases", @@ -533,15 +533,15 @@ def compute(self) -> CompleteJobResult: "pacovaldez/stackoverflow-questions", "TigerResearch/sft_zh", "zjunlp/Mol-Instructions", "pufanyi/MIMICIT", "BAAI/JudgeLM-100K", "Trelis/function_calling_v3", "google/Synthetic-Persona-Chat", "FarReelAILab/Machine_Mindset_MBTI_dataset", "jtatman/stable-diffusion-prompts-stats-full-uncensored", "KBlueLeaf/danbooru2023-webp-4Mpixel", "THUDM/LongAlign-10k", "LeoZhangzaolin/Graptoloidea-Specimens-Imaging", "ResplendentAI/NSFW_RP_Format_DPO", "RekaAI/VibeEval", "tomg-group-umd/cinepile", "legacy-datasets/banking77", "rmyeid/polyglot_ner", "tapaco", "deepset/germanquad", "laion/laion2B-multi", "huggan/smithsonian_butterflies_subset", "CShorten/ML-ArXiv-Papers", "codeparrot/xlcost-text-to-code", "lukaemon/bbh", "thu-coai/Safety-Prompts", "IDEA-CCNL/Ziya-Eval-Chinese", "cognitivecomputations/WizardLM_evol_instruct_V2_196k_unfiltered_merged_split", "beyond/rlhf-reward-single-round-trans_chinese", "jerryjalapeno/nart-100k-synthetic", "vikp/pypi_clean", - "cognitivecomputations/ultrachat-uncensored", "facebook/emu_edit_test_set", "playgroundai/MJHQ-30K", "zwn22/NC_Crime", "Shitao/MLDR", "Sayali9141/traffic_signal_images", "deutsche-telekom/Ger-RAG-eval", "FiscalNote/billsum", "clue", "cuad", + "cognitivecomputations/ultrachat-uncensored", "facebook/emu_edit_test_set", "playgroundai/MJHQ-30K", "zwn22/NC_Crime", "Shitao/MLDR", "Sayali9141/traffic_signal_images", "deutsche-telekom/Ger-RAG-eval", "FiscalNote/billsum", "clue/clue", "theatticusproject/cuad-qa", "Helsinki-NLP/opus_books", "SLPL/naab", "Cohere/wikipedia-22-12", "MohamedRashad/ChatGPT-prompts", "HuggingFace-CN-community/Diffusion-book-cn", "HuggingFaceH4/instruction-dataset", "deepset/prompt-injections", "OpenLeecher/Teatime", "math-eval/TAL-SCQ5K", "HackerNoon/tech-company-news-data-dump", "LLM360/AmberDatasets", "peiyi9979/Math-Shepherd", "Crystalcareai/MoD", "papluca/language-identification", "bigcode/the-stack-smol", "argilla/news-summary", "CarperAI/openai_summarize_comparisons", "argilla/databricks-dolly-15k-curated-en", "mikex86/stackoverflow-posts", "Anthropic/llm_global_opinions", "akjindal53244/Arithmo-Data", "OpenLLM-France/Claire-Dialogue-French-0.1", "arbml/CIDAR", "snorkelai/Snorkel-Mistral-PairRM-DPO-Dataset", "PleIAs/US-PD-Newspapers", "yh0701/FracAtlas_dataset", "somosnlp/Reglamento_Aeronautico_Colombiano_2024GemmaQA", "omi-health/medical-dialogue-to-soap-summary", "argilla/Capybara-Preferences", "UCLNLP/adversarial_qa", - "conv_ai_2", "ccdv/govreport-summarization", "mozilla-foundation/common_voice_8_0", "nomic-ai/gpt4all_prompt_generations_with_p3", "hugfaceguy0001/retarded_bar", "lksy/ru_instruct_gpt4", "Linly-AI/Chinese-pretraining-dataset", "mosaicml/instruct-v3", "corbt/all-recipes", "VatsaDev/TinyText", + "convai-challenge/conv_ai_2", "ccdv/govreport-summarization", "mozilla-foundation/common_voice_8_0", "nomic-ai/gpt4all_prompt_generations_with_p3", "hugfaceguy0001/retarded_bar", "lksy/ru_instruct_gpt4", "Linly-AI/Chinese-pretraining-dataset", "mosaicml/instruct-v3", "corbt/all-recipes", "VatsaDev/TinyText", "google/docci", "linux-cn/archive", "Johnnyeee/Yelpdata_663", "HuggingFaceTB/cosmopedia-100k", "nyu-mll/blimp", "defunct-datasets/bookcorpusopen", "iwslt2017", "recipe_nlg", "Helsinki-NLP/tatoeba", "GEM/viggo", "bavard/personachat_truecased", "segments/sidewalk-semantic", "PolyAI/banking77", "facebook/pmd", "zeroshot/twitter-financial-news-topic", "nuprl/MultiPL-E", "GBaker/MedQA-USMLE-4-options", "camel-ai/code", "merve/turkish_instructions", "tasksource/oasst1_pairwise_rlhf_reward", "winddude/reddit_finance_43_250k", "tiedong/goat", "togethercomputer/RedPajama-Data-Instruct", "DKYoon/SlimPajama-6B", "Maxx0/sexting-nsfw-adultconten", "squarelike/OpenOrca-gugugo-ko", "MMInstruction/VLFeedback", "LLaVA-VL/llava-plus-data", "McAuley-Lab/Amazon-Reviews-2023", "Open-Orca/1million-gpt-4", - "gwenxin/pills_inside_bottles", "keithito/lj_speech", "conll2012_ontonotesv5", "mwritescode/slither-audited-smart-contracts", "bsmock/pubtables-1m", "tasksource/mmlu", "bigcode/bigcode-pii-dataset", "medalpaca/medical_meadow_wikidoc", "P01son/instructions", "ArtifactAI/arxiv-physics-instruct-tune-30k", + "gwenxin/pills_inside_bottles", "keithito/lj_speech", "ontonotes/conll2012_ontonotesv5", "mwritescode/slither-audited-smart-contracts", "bsmock/pubtables-1m", "tasksource/mmlu", "bigcode/bigcode-pii-dataset", "medalpaca/medical_meadow_wikidoc", "P01son/instructions", "ArtifactAI/arxiv-physics-instruct-tune-30k", "mrtoy/mobile-ui-design", "nampdn-ai/tiny-orca-textbooks", "kyujinpy/KOpen-platypus", "YeungNLP/firefly-pretrain-dataset", "unalignment/airoboros-2.2", "totally-not-an-llm/EverythingLM-data-V3", "CASIA-LM/ChineseWebText", "NeuralNovel/Neural-DPO", "AI4Math/MathVerse", "ucinlp/drop", "gigaword", "wider_face", "wiki_qa", "HUPD/hupd", "liweili/c4_200m", "nielsr/funsd-layoutlmv3", "IDEA-CCNL/laion2B-multi-chinese-subset", "dennlinger/eur-lex-sum", "mitclinicalml/clinical-ie", "Matthijs/cmu-arctic-xvectors", "FredZhang7/stable-diffusion-prompts-2.47M", "philschmid/flanv2", "NTU-NLP-sg/xCodeEval", "MadVoyager/stable_diffusion_instructional_dataset", "zetavg/ShareGPT-Processed", "shibing624/nli-zh-all", "oscar-corpus/colossal-oscar-1.0", "greengerong/leetcode", "ProgramComputer/voxceleb", "allenai/paloma", @@ -557,7 +557,7 @@ def compute(self) -> CompleteJobResult: "CausalLM/GPT-4-Self-Instruct-German", "shareAI/novelai3", "MinervaAI/Aesir-Preview", "wintercoming6/artwork_for_sdxl", "Salesforce/lotsa_data", "ForzaJuve1/UEFA_Euro_2020_Data", "mo-mittal/reddit_political_subs", "Targoman/TLPC", "paws", "web_questions", "bigscience-data/roots_zh-cn_wikipedia", "laion/laion2B-en-aesthetic", "daekeun-ml/naver-news-summarization-ko", "CarperAI/openai_summarize_tldr", "competitions/aiornot", "huggingface/badges", "allenai/lila", "yuvalkirstain/pickapic_v1", "tatsu-lab/alpaca_farm", "cognitivecomputations/open-instruct-uncensored", "CheshireAI/guanaco-unchained", "openchat/openchat_sharegpt_v3", "LinkSoul/LLaSM-Audio-Instructions", "totally-not-an-llm/EverythingLM-data-V2", "jinaai/code_exercises", "0-hero/prompt-perfect", "jamescalam/ai-arxiv-chunked", "maywell/ko_Ultrafeedback_binarized", "keirp/hungarian_national_hs_finals_exam", "laion/laion-pop", - "gvecchio/MatSynth", "baobab-trees/wikipedia-human-retrieval-ja", "mii-llm/gazzetta-ufficiale", "shachardon/ShareLM", "MohamedRashad/midjourney-detailed-prompts", "ade-benchmark-corpus/ade_corpus_v2", "cifar100", "mhardalov/exams", "josecannete/large_spanish_corpus", "quac", + "gvecchio/MatSynth", "baobab-trees/wikipedia-human-retrieval-ja", "mii-llm/gazzetta-ufficiale", "shachardon/ShareLM", "MohamedRashad/midjourney-detailed-prompts", "ade-benchmark-corpus/ade_corpus_v2", "uoft-cs/cifar100", "mhardalov/exams", "josecannete/large_spanish_corpus", "quac", "microsoft/xglue", "huggingface/documentation-images", "seamew/ChnSentiCorp", "tau/scrolls", "bible-nlp/biblenlp-corpus", "JulesBelveze/tldr_news", "christopher/rosetta-code", "inria-soda/tabular-benchmark", "beyond/chinese_clean_passages_80m", "bigbio/pubmed_qa", "Cohere/miracl-zh-queries-22-12", "koutch/stackoverflow_python", "ACCA225/Kaggle-Stable-Diffusion", "Yasbok/Alpaca_arabic_instruct", "bertin-project/alpaca-spanish", "laion/laion400m", "axiong/pmc_oa", "medalpaca/medical_meadow_medical_flashcards", "dominguesm/Canarim-Instruct-PTBR-Dataset", "p1atdev/niji-v5", "zetavg/coct-en-zh-tw-translations-twp-300k", "skeskinen/TinyStories-GPT4", "xmcmic/PMC-VQA", "beomi/KoAlpaca-v1.1a", "ecnu-icalk/educhat-sft-002-data-osm", "kyujinpy/OpenOrca-KO", "open-phi/programming_books_llama", "hkust-nlp/deita-10k-v0", "jxu124/OpenX-Embodiment", "m-a-p/MusicPile", @@ -570,11 +570,11 @@ def compute(self) -> CompleteJobResult: "TempoFunk/webvid-10M", "shinonomelab/cleanvid-15m_map", "smangrul/code-chat-assistant-v1", "OleehyO/latex-formulas", "daat/DATA", "axiong/pmc_llama_instructions", "AdaptLLM/law-tasks", "chargoddard/rpguild", "AiresPucrs/stanford-encyclopedia-philosophy", "amaai-lab/MusicBench", "diffusers/pokemon-gpt4-captions", "migtissera/Tess-Coder-v1.0", "HaoyeZhang/RLHF-V-Dataset", "togethercomputer/glaive-function-calling-v2-formatted", "osunlp/TravelPlanner", "BioMistral/BioInstructQA", "misikoff/zillow", "MedRAG/pubmed", "Writer/omniact", "openbmb/UltraSafety", "visheratin/realworldqa", "lorinma/ChineseEncyclopedia", "sealuzh/app_reviews", "msra_ner", "openslr", "riddle_sense", "zhoubolei/scene_parse_150", "allenai/scitldr", "tydiqa", "IlyaGusev/gazeta", - "albertvillanova/legal_contracts", "conceptual_12m", "textvqa", "VIMA/VIMA-Data", "hanamizuki-ai/genshin-voice-v3.3-mandarin", "Nerfgun3/sakimi-chan_LoRA", "cyberagent/crello", "jxm/the_office_lines", "WynterJones/chatgpt-roles", "gbharti/wealth-alpaca_lora", + "albertvillanova/legal_contracts", "google-research-datasets/conceptual_12m", "textvqa", "VIMA/VIMA-Data", "hanamizuki-ai/genshin-voice-v3.3-mandarin", "Nerfgun3/sakimi-chan_LoRA", "cyberagent/crello", "jxm/the_office_lines", "WynterJones/chatgpt-roles", "gbharti/wealth-alpaca_lora", "THUIR/T2Ranking", "IlyaGusev/ru_turbo_saiga", "tasksource/ScienceQA_text_only", "cvssp/WavCaps", "lighteval/MATH", "kunishou/oasst1-89k-ja", "zetavg/zh-tw-wikipedia", "lighteval/legal_summarization", "skeskinen/TinyStories-hf", "silk-road/chinese-dolly-15k", "TigerResearch/tigerbot-zhihu-zh-10k", "open-llm-leaderboard/requests", "mlabonne/guanaco-llama2", "totally-not-an-llm/EverythingLM-data", "BELLE-2/train_3.5M_CN_With_Category", "rizerphe/glaive-function-calling-v2-llama", "rombodawg/LimitlessMegaCodeTraining", "re-align/just-eval-instruct", "IlyaGusev/pippa_scored", "IGNF/FLAIR", "allenai/WildChat-nontoxic", "Unbabel/TowerBlocks-v0.1", "ShoukanLabs/AniSpeech", "unsloth/notebooks", "GAIR/MathPile_Commercial", "abacusai/MetaMathFewshot", "DiscoResearch/germanrag", "cdoswald/SPIDER", "yixuantt/MultiHopRAG", "instructkr/ko_elo_arena_0207", - "osunlp/SMolInstruct", "allenai/WildBench", "FuseAI/FuseChat-Mixture", "Vezora/Tested-143k-Python-Alpaca", "cats_vs_dogs", "tdavidson/hate_speech_offensive", "snow_simplified_japanese_corpus", "timit_asr", "web_nlg", "wiki_bio", + "osunlp/SMolInstruct", "allenai/WildBench", "FuseAI/FuseChat-Mixture", "Vezora/Tested-143k-Python-Alpaca", "microsoft/cats_vs_dogs", "tdavidson/hate_speech_offensive", "snow_simplified_japanese_corpus", "timit_asr", "web_nlg", "wiki_bio", "kili-technology/plastic_in_river", "qanastek/MASSIVE", "google/wit", "sil-ai/bloom-speech", "FacePerceiver/laion-face", "codeparrot/codecomplex", "codeparrot/github-jupyter-code-to-text", "neuralworm/stable-diffusion-discord-prompts", "detection-datasets/coco", "Gxg/Math23K", "ashraq/fashion-product-images-small", "animelover/genshin-impact-images", "suolyer/webqa", "fusing/fill50k", "dominguesm/alpaca-data-pt-br", "multimodalart/facesyntheticsspigacaptioned", "jiacheng-ye/logiqa-zh", "sam-mosaic/vicuna_alpaca_hc3_chatml", "thefcraft/civitai-stable-diffusion-337k", "Nan-Do/instructional_code-search-net-python", "izumi-lab/llm-japanese-dataset-vanilla", "xmj2002/Chinese_modern_classical", "cognitivecomputations/based", "laion/strategic_game_chess", "jondurbin/airoboros-gpt4-1.2", "jondurbin/airoboros-gpt4-m2.0", "rombodawg/LosslessMegaCodeTrainingV2", "shareAI/CodeChat", "qgyd2021/h_novel", "BAAI/COIG-PC-core", @@ -585,26 +585,26 @@ def compute(self) -> CompleteJobResult: "shibing624/AdvertiseGen", "andersonbcdefg/supernatural-instructions-2m", "azcorpus/azcorpus_v0", "cognitivecomputations/oa_leet10k", "Abrumu/Fashion_controlnet_dataset_V3", "tasksource/tasksource-instruct-v0", "wenge-research/yayi_domain_subset", "ignmilton/ign_clean_instruct_dataset_500k", "changpt/ko-lima-vicuna", "pankajmathur/alpaca_orca", "marhensa/comfyui-workflow", "jondurbin/airoboros-2.1", "M-A-D/Mixed-Arabic-Datasets-Repo", "taide/TAIDE-14-tasks", "manu/project_gutenberg", "Lakera/gandalf_ignore_instructions", "goendalf666/sales-conversations", "yuyijiong/Multi-Doc-QA-Chinese", "fnlp/character-llm-data", "wenge-research/yayi_uie_sft_data", "glaiveai/glaive-code-assistant-v3", "davidchan/anim400k", "prometheus-eval/Preference-Collection", "numind/NuNER", "YuxuanZhang888/ColonCancerCTDataset", "TIGER-Lab/SKGInstruct", "CyberNative/Code_Vulnerability_Security_DPO", "hiyouga/glaive-function-calling-v2-sharegpt", "ai4bharat/sangraha", "ontocord/viet4all", - "cloneofsimo/imagenet.int8", "Replete-AI/code_bagel_hermes-2.5", "amirveyseh/acronym_identification", "cornell_movie_dialog", "fancyzhx/dbpedia_14", "esnli", "fever", "google/jigsaw_toxicity_pred", "xquad", "NbAiLab/NCC", + "cloneofsimo/imagenet.int8", "Replete-AI/code_bagel_hermes-2.5", "amirveyseh/acronym_identification", "cornell-movie-dialog/cornell_movie_dialog", "fancyzhx/dbpedia_14", "esnli", "fever", "google/jigsaw_toxicity_pred", "xquad", "NbAiLab/NCC", "ccdv/cnn_dailymail", "ccdv/patent-classification", "DFKI-SLT/few-nerd", "solomonk/reddit_mental_health_posts", "carolina-c4ai/corpus-carolina", "thu-coai/lccc", "fabiochiu/medium-articles", "FinanceInc/auditor_sentiment", "nateraw/midjourney-texttoimage-new", "HuggingFaceH4/self-instruct-seed", "RyokoAI/CNNovel125K", "IndianaUniversityDatasetsModels/MIMIC-medical-report", "samhog/psychology-10k", "HuggingFaceH4/databricks_dolly_15k", "heegyu/open-korean-instructions", "logo-wizard/modern-logo-dataset", "sam-mosaic/hhrlhf_evol_chatml", "4eJIoBek/PAIT-Downloads", "kunishou/hh-rlhf-49k-ja", "fblgit/tree-of-knowledge", "TigerResearch/tigerbot-law-plugin", "kaist-ai/Multilingual-CoT-Collection", "mcipriano/stackoverflow-kubernetes-questions", "jondurbin/airoboros-gpt4-1.4", "SALT-NLP/LLaVAR", "declare-lab/flan-mini", "jondurbin/airoboros-gpt4-2.0", "seungheondoh/LP-MusicCaps-MSD", "AILab-CVC/SEED-Bench", "zjunlp/InstructIE", "nisaar/LLAMA2_Legal_Dataset_4.4k_Instructions", "nampdn-ai/tiny-lessons", "Healthy13/Text2SQL", "MBZUAI-LLM/SlimPajama-627B-DC", "a686d380/sis-novel", "fedml/PubMedQA_instruction", "meta-math/MetaMathQA-40K", "PocketDoc/Choose-Your-Story-Long-Text-Adventures", "SinKove/synthetic_mammography_csaw", "unalignment/spicy-3.1", "locuslab/TOFU", "OpenGVLab/VideoChat2-IT", "LLM360/CrystalCoderDatasets", "argilla/ultrafeedback-curated", "HuggingFaceH4/grok-conversation-harmless", "HuggingFaceH4/OpenHermes-2.5-1k-longest", "Ziyuan111/DurhamTrees", "2A2I/Arabic-OpenHermes-2.5", "Locutusque/arc-cot", "osunlp/Multimodal-Mind2Web", - "rc9494/SP500_Date_Offset", "EleutherAI/lichess-puzzles", "conceptnet5", "cosmos_qa", "docred", "md_gender_bias", "mkqa", "onestop_english", "squad_kor_v1", "swag", + "rc9494/SP500_Date_Offset", "EleutherAI/lichess-puzzles", "conceptnet5/conceptnet5", "allenai/cosmos_qa", "thunlp/docred", "md_gender_bias", "mkqa", "onestop_english", "squad_kor_v1", "swag", "tweets_hate_speech_detection", "wmt/wmt16", "ChristophSchuhmann/MS_COCO_2017_URL_TEXT", "SetFit/emotion", "ai4bharat/samanantar", "ccdv/arxiv-classification", "mteb/tweet_sentiment_extraction", "beki/privy", "zoheb/sketch-scene", "WINGNUS/ACL-OCL", "haor/pixiv_month_top50", "HuggingFaceM4/COCO", "haor/pixiv-yandere", "Plachta/Umamusume-voice-text-pairs", "keremberke/chest-xray-classification", "keremberke/table-extraction", "silatus/1k_Website_Screenshots_and_Metadata", "IlyaGusev/habr", "KrakExilios/koreandoll", "pmoe7/SP_500_Stocks_Data-ratios_news_price_10_yrs", "potsawee/wiki_bio_gpt3_hallucination", "RyokoAI/Fandom23K", "Bingsu/ko_alpaca_data", "medalpaca/medical_meadow_wikidoc_patient_information", "Papersnake/people_daily_news", "FreedomIntelligence/phoenix-sft-data-v1", "howard-hou/OCR-VQA", "silk-road/Vanilla-chinese-alpaca-luotuo", "danielv835/personal_finance_v0.2", "silk-road/Luotuo-QA-A-CoQA-Chinese", "gretelai/symptom_to_diagnosis", "agkphysics/AudioSet", "YeungNLP/ultrachat", "Iess/chinese_modern_poetry", "wendlerc/RenderedText", "Oasis-Team/Oasis-Corpus", "qgyd2021/chinese_chitchat", "MattCoddity/dockerNLcommands", "yuyijiong/Long-Instruction", "Skywork/ChineseDomainModelingEval", "xinrongzhang2022/InfiniteBench", "MohamedRashad/multilingual-tts", "silk-road/ChatHaruhi-Expand-118K", "Luckyjhg/Geo170K", "andersonbcdefg/synthetic_tuples_gpt35_turbo", "Rtian/DebugBench", "euclaise/reddit-instruct", "Locutusque/hercules-v1.0", "mastergopote44/Long-Term-Care-Aggregated-Data", "ontocord/CulturaY", - "Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-1M", "mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha", "jg583/NSynth", "storytracer/LoC-PD-Books", "zhongshsh/CLoT-Oogiri-GO", "davidkim205/kollm-converations", "Locutusque/hercules-v4.0", "climate_fever", "cmrc2018", "mrqa", + "Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-1M", "mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha", "jg583/NSynth", "storytracer/LoC-PD-Books", "zhongshsh/CLoT-Oogiri-GO", "davidkim205/kollm-converations", "Locutusque/hercules-v4.0", "tdiggelm/climate_fever", "hfl/cmrc2018", "mrqa", "nq_open", "kyunghyuncho/search_qa", "ted_talks_iwslt", "ubuntu_dialogs_corpus", "SetFit/enron_spam", "gsarti/flores_101", "vblagoje/lfqa", "huggan/pokemon", "joelniklaus/lextreme", "OxAISH-AL-LLM/wiki_toxic", "tomasg25/scientific_lay_summarisation", "svjack/pokemon-blip-captions-en-zh", "lambdalabs/naruto-blip-captions", "shunk031/wrime", "marmal88/skin_cancer", "IlyaGusev/rulm", "datadrivenscience/ship-detection", "Junity/UmaMusume-TokaiTeio-Dataset", "Den4ikAI/russian_dialogues", "LinhDuong/chatdoctor-200k", "Nebulous/gpt4all_pruned", "camel-ai/ai_society_translated", "alpindale/light-novels", "iamketan25/roleplay-instructions-dataset", "VMware/open-instruct-v1-oasst-dolly-hhrlhf", "Nan-Do/code-search-net-python", "ShoukanLabs/OpenNiji-Dataset", "Birchlabs/openai-prm800k-stepwise-critic", "Norquinal/claude_evol_instruct_210k", "mlfoundations/datacomp_1b", "tasksource/icl-symbol-tuning-instruct", "findnitai/english-to-hinglish", "pankajmathur/dolly-v2_orca", "sudy-super/dialogsum-ja", "sayakpaul/hf-codegen-v2", "FreedomIntelligence/CMB", "jamescalam/llama-2-arxiv-papers-chunked", "smangrul/hf-stack-v1", "abacusai/LongChat-Lines", "PetraAI/PetraAI", "sinarashidi/alpaca-persian", "neural-bridge/rag-hallucination-dataset-1000", "google/trueteacher", "twang2218/chinese-law-and-regulations", "Loie/Auto-ACD", "CollectiveCognition/chats-data-2023-09-22", "CollectiveCognition/chats-data-2023-09-27", "a686d380/h-eval", "guangyil/laion-coco-aesthetic", "ajibawa-2023/Code-74k-ShareGPT", "ChuckMcSneed/NeoEvalPlusN_benchmark", "matsuxr/JaGovFaqs-22k", "NobodyExistsOnTheInternet/ToxicQAFinal", "jondurbin/bagel-v0.3", "allenai/preference-test-sets", "xingyaoww/code-act", "moukaii/Tuberculosis_Dataset", "abacusai/ARC_DPO_FewShot", "tinyBenchmarks/tinyMMLU", "HPLT/hplt_monolingual_v1_2", - "maywell/koVast", "unicamp-dl/quati", "YanweiLi/MGM-Instruction", "BLINK-Benchmark/BLINK", "abacusai/SystemChat-1.1", "DLI-Lab/pearl", "Vi-VLM/Vista", "crd3", "hate_speech18", "Helsinki-NLP/kde4", + "maywell/koVast", "unicamp-dl/quati", "YanweiLi/MGM-Instruction", "BLINK-Benchmark/BLINK", "abacusai/SystemChat-1.1", "DLI-Lab/pearl", "Vi-VLM/Vista", "microsoft/crd3", "hate_speech18", "Helsinki-NLP/kde4", "kuznetsoffandrey/sberquad", "McGill-NLP/stereoset", "universal_morphologies", "wino_bias", "CAiRE/ASCEND", "huggingface/label-files", "laion/laion5B-index", "vicenteor/sbu_captions", "McGill-NLP/FaithDial", "LIUM/tedlium", "AlekseyKorshuk/persona-chat", "allenai/multi_lexsum", "DeveloperOats/DBPedia_Classes", "shailja/Verilog_GitHub", "akariasai/PopQA", "deepghs/game_characters", "nlphuji/whoops", "FredZhang7/anime-prompts-180K", "HuggingFaceH4/instruct_me", "mozilla-foundation/common_voice_12_0", "LangChainDatasets/agent-search-calculator", "jamescalam/langchain-docs", "cognitivecomputations/leet10k-alpaca", "Babelscape/multinerd", "kz-transformers/multidomain-kazakh-dataset", "LLMs/Alpaca-ShareGPT", "milashkaarshif/MoeGirlPedia_wikitext_raw_archive", "jainr3/diffusiondb-pixelart", "tau/zero_scrolls", "MU-NLPC/Calc-ape210k", @@ -613,7 +613,7 @@ def compute(self) -> CompleteJobResult: "vibhorag101/phr_mental_therapy_dataset", "Vision-Flan/vision-flan_191-task_1k", "ahmed-masry/ChartQA", "ProlificAI/social-reasoning-rlhf", "BAAI/DataOptim", "Heralax/Augmental-Dataset", "LLM-Tuning-Safety/HEx-PHI", "kwaikeg/KAgentBench", "SeaLLMs/Sea-bench", "athirdpath/DPO_Pairs-Roleplay-Alpaca-NSFW-v1-SHUFFLED", "yale-nlp/FOLIO", "RealTimeData/bbc_news_alltime", "HuggingFaceH4/orca_dpo_pairs", "NebulaeWis/gelbooru_images", "llm-blender/Unified-Feedback", "grimulkan/LimaRP-augmented", "cyberagent/chatbot-arena-ja-calm2-7b-chat-experimental", "ehristoforu/midjourney-images", "Jiwonny29/project1", "Major-TOM/Core-S2L1C", "gorilla-llm/Berkeley-Function-Calling-Leaderboard", "julep-ai/openai-community-posts", "SALT-NLP/Design2Code", "Locutusque/OpenCerebrum-SFT", "m-a-p/CodeEditorBench", "chansung/merged_ds_coding", "spectrallabs/credit-scoring-training-dataset", "shareAI/DPO-zh-en-emoji", "rqq/GLM-4-Instruct-4K-zh", "Helsinki-NLP/bible_para", - "UFRGS/brwac", "conllpp", "covost2", "head_qa", "facebook/lama", "multi_x_science_sum", "ptb_text_only", "social_bias_frames", "sst", "the_pile_openwebtext2", + "UFRGS/brwac", "ZihanWangKi/conllpp", "facebook/covost2", "head_qa", "facebook/lama", "multi_x_science_sum", "ptb_text_only", "social_bias_frames", "sst", "the_pile_openwebtext2", "wiki40b", "wiki_atomic_edits", "botisan-ai/cantonese-mandarin-translations", "nlpaueb/finer-139", "wikitablequestions", "silver/lccc", "facebook/content_rephrasing", "Twitter/TwitterFollowGraph", "Nerfgun3/wlop_style", "TheFusion21/PokemonCards", "jeanlee/kmhas_korean_hate_speech", "sander-wood/irishman", "tobiolatunji/afrispeech-200", "swaption2009/20k-en-zh-translation-pinyin-hsk", "danielshemesh/midjourney", "Elfsong/ClinicalDataset", "Den4ikAI/russian_instructions", "paulofinardi/OIG_small_chip2_portuguese_brasil", "acheong08/nsfw_reddit", "VISION-Workshop/VISION-Datasets", "P1ayer-1/chatgpt-conversations-chatlogs.net", "wavpub/JinJinLeDao_QA_Dataset", "lang-uk/every_prompt", "pki/SecurityGPT", "zjkarina/matreshka", "deepghs/nsfw_detect", "JasperLS/prompt-injections", "ccmusic-database/music_genre", "jondurbin/airoboros-gpt4", "TigerResearch/pretrain_en", @@ -622,7 +622,7 @@ def compute(self) -> CompleteJobResult: "Suprit/CMtMedQA", "ticoAg/Chinese-medical-dialogue", "Yirany/UniMM-Chat", "xuqinyang/BaiduBaike-5.63M", "jamescalam/agent-conversations-retrieval-tool", "zhiqings/LLaVA-Human-Preference-10K", "qgyd2021/rlhf_reward_dataset", "gathnex/Gath_baize", "a686d380/h-corpus-raw", "flytech/llama-python-codes-30k", "open-phi/ft-sample-mistral", "hkust-nlp/deita-6k-v0", "Doctor-Shotgun/no-robots-sharegpt", "styletts2-community/multilingual-phonemes-10k-alpha", "imone/OpenOrca_FLAN", "osv5m/osv5m", "multimodalart/steamboat-willy-frames", "irlab-udc/metahate", "grimulkan/theory-of-mind", "ai4bharat/indic-instruct-data-v0.1", "kobprof/skolegpt-instruct", "Ejafa/ye-pop", "steamcyclone/Pill_Ideologies-Post_Titles", "euclaise/reddit-instruct-curated", "VatsaDev/animebench-alpha", "0-hero/prompt-perfect-dpo", "MedRAG/textbooks", "TIGER-Lab/Mantis-Instruct", "ChuckMcSneed/various_RP_system_prompts", "chenmingxuan/Chinese-Patent-Summary", - "cassiekang/cub200_dataset", "antiven0m/catboros-3.2-dpo", "ai4privacy/pii-masking-300k", "multilingual/orca_dpo_pairs", "BigAction/the-wave-clean", "legacy-datasets/ami", "TheBritishLibrary/blbooks", "conv_ai_3", "e2e_nlg", "ethos", + "cassiekang/cub200_dataset", "antiven0m/catboros-3.2-dpo", "ai4privacy/pii-masking-300k", "multilingual/orca_dpo_pairs", "BigAction/the-wave-clean", "legacy-datasets/ami", "TheBritishLibrary/blbooks", "convai-challenge/conv_ai_3", "e2e_nlg", "ethos", "Helsinki-NLP/europarl", "hkcancor", "ucsbnlp/liar", "newsqa", "sem_eval_2018_task_1", "rcds/swiss_judgment_prediction", "told-br", "leondz/wnut_17", "CodedotAI/code_clippy_github", "castorini/mr-tydi", "flax-sentence-embeddings/stackexchange_math_jsonl", "jfrenz/legalglue", "ml6team/cnn_dailymail_nl", "sentence-transformers/parallel-sentences", "sentence-transformers/reddit-title-body", "stas/openwebtext-10k", "Azu/Handwritten-Mathematical-Expression-Convert-LaTeX", "patriziobellan/PET", "mozilla-foundation/common_voice_9_0", "bloomberg/entsum", "carblacac/twitter-sentiment-analysis", "HuggingFaceM4/VQAv2", "LHF/escorpius", "owaiskha9654/PubMed_MultiLabel_Text_Classification_Dataset_MeSH", "masakhane/mafand", "Muennighoff/P3", "Dahoas/instruct-synthetic-prompt-responses", "mjw/stock_market_tweets", "Korakoe/NijiJourney-Prompt-Pairs", "mrm8488/unnatural-instructions-full", @@ -634,7 +634,7 @@ def compute(self) -> CompleteJobResult: "glnmario/news-qa-summarization", "TriadParty/deepsex-RP", "pixparse/cc3m-wds", "Minami-su/Anime_novel_datasets", "Gourieff/ReActor", "cognitivecomputations/Code-74k-ShareGPT-Vicuna", "dataautogpt3/Dalle3", "DL3DV/DL3DV-Benchmark", "CausalLM/GPT-4-Self-Instruct-Turkish", "sablo/oasst2_curated", "STEM-AI-mtl/Electrical-engineering", "ikawrakow/imatrix-from-wiki-train", "somewheresystems/dataclysm-arxiv", "fblgit/simple-math", "fblgit/simple-math-DPO", "acon96/Home-Assistant-Requests", "Query-of-CC/Knowledge_Pile", "OpenDatasets/dalle-3-dataset", "ptx0/photo-concept-bucket", "zjunlp/iepile", "BatsResearch/ctga-v1", "MMInstruction/ArxivQA", "hotchpotch/JQaRA", "sean0042/KorMedMCQA", "p1atdev/ichikara-instruction", "maywell/LogicKor", "davanstrien/dataset-tldr", "xcodemind/vision2ui", "lawinstruct/lawinstruct", "UCSC-VLAA/HQ-Edit", - "kigner/ruozhiba-llama3-tt", "H-D-T/Select-Stack", "mutiyama/alt", "iabufarha/ar_sarcasm", "nilc-nlp/assin2", "cbt", "eurlex", "facebook/kilt_wikipedia", "multilingual_librispeech", "reuters21578", + "kigner/ruozhiba-llama3-tt", "H-D-T/Select-Stack", "mutiyama/alt", "iabufarha/ar_sarcasm", "nilc-nlp/assin2", "cam-cst/cbt", "eurlex", "facebook/kilt_wikipedia", "multilingual_librispeech", "reuters21578", "sentiment140", "squad_es", "the_pile_stack_exchange", "wiki_movies", "Fraser/python-state-changes", "Hellisotherpeople/DebateSum", "SocialGrep/one-million-reddit-jokes", "blinoff/medical_qa_ru_data", "huggingface/transformers-metadata", "indonesian-nlp/id_newspapers_2018", "openclimatefix/nimrod-uk-1km", "sentence-transformers/msmarco-hard-negatives", "nthngdy/oscar-small", "jiangjiechen/ekar_chinese", "sil-ai/bloom-captioning", "orieg/elsevier-oa-cc-by", "imagenet_sketch", "sileod/movie_recommendation", "quickdraw", "huggingface-legal/takedown-notices", "demelin/moral_stories", "RUCAIBox/Chinese-Generation", "Bingsu/zeroth-korean", "shjwudp/shu", "CarperAI/pile-v2-small-filtered", "citeseerx/ACL-fig", "keremberke/painting-style-classification", "jordyvl/DUDE_loader", "mlfoundations/datacomp_pools", "Loie/VGGSound", @@ -646,7 +646,7 @@ def compute(self) -> CompleteJobResult: "rizerphe/glaive-function-calling-v2-zephyr", "yuyijiong/Book_Summary_Chinese", "winglian/no_robots_rlhf", "castorini/wura", "diffusers/benchmarks", "nuprl/EditPackFT", "craigwu/vstar_bench", "Undi95/toxic-dpo-v0.1-sharegpt", "kunishou/oasst2-135k-ja", "ChuckMcSneed/WolframRavenwolfs_benchmark_results", "CausalLM/GPT-4-Self-Instruct-Japanese", "jtatman/stable-diffusion-prompts-uncensored", "lowres/anime", "MediaTek-Research/TCEval-v2", "AGBonnet/augmented-clinical-notes", "HuggingFaceH4/cai-conversation-harmless", "lmms-lab/VQAv2", "lmms-lab/DocVQA", "Mutonix/RefGPT-Fact-v2", "ba188/NHS_HES", "ajibawa-2023/Children-Stories-Collection", "Vikhrmodels/LLaVA-Instruct-ru", "Doctor-Shotgun/theory-of-mind-dpo", "divyasharma0795/AppleVisionPro_Tweets", "TIGER-Lab/MATH-plus", "cgato/SlimOrcaDedupCleaned", "YanweiLi/MGM-Pretrain", "HuggingFaceH4/llava-instruct-mix-vsft", "fal-ai/imgsys-results", "mzbac/function-calling-llama-3-format-v1.1", - "Yale-LILY/aeslc", "google-research-datasets/aquamuse", "allenai/atomic", "consumer-finance-complaints", "cppe-5", "craigslist_bargains", "fquad", "google_wellformed_query", "interpress_news_category_tr_lite", "thu-coai/kd_conv_with_kb", + "Yale-LILY/aeslc", "google-research-datasets/aquamuse", "allenai/atomic", "CFPB/consumer-finance-complaints", "rishitdagli/cppe-5", "stanfordnlp/craigslist_bargains", "fquad", "google_wellformed_query", "interpress_news_category_tr_lite", "thu-coai/kd_conv_with_kb", "kakaobrain/kor_nli", "para_pat", "poem_sentiment", "silicone", "story_cloze", "turkic_xwmt", "wi_locness", "fancyzhx/yelp_polarity", "CodedotAI/code_clippy", "SetFit/sst5", "deepset/germandpr", "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl", "microsoft/codexglue_method_generation", "nickmuchi/financial-classification", "uitnlp/vietnamese_students_feedback", "ydshieh/coco_dataset_script", "cgarciae/cartoonset", "DMetaSoul/chinese-semantic-textual-similarity", "ukr-models/Ukr-Synth", "Matthijs/snacks", "csebuetnlp/CrossSum", "Moo/korean-parallel-corpora", "HuggingFaceM4/TGIF", "khalidalt/tydiqa-goldp", "mteb/amazon_reviews_multi", "silver/mmchat", "fmplaza/offendes", "ColumbiaNLP/FLUTE", "tner/ontonotes5", "jordanparker6/publaynet", @@ -663,7 +663,7 @@ def compute(self) -> CompleteJobResult: "listen2you002/ChartLlama-Dataset", "saillab/taco-datasets", "nuprl/CanItEdit", "kyujinpy/orca_math_dpo", "adamkarvonen/chess_games", "blancsw/oasst2_top1_chat_format", "Awiny/Howto-Interlink7M", "NobodyExistsOnTheInternet/ToxicDPOqa", "VatsaDev/worldbuild", "lorinma/NL2SQL_zh", "mlabonne/chessllm", "genggui001/gg_zh_v1_550B", "DL3DV/DL3DV-ALL-4K", "paraloq/json_data_extraction", "tastypear/unalignment-toxic-dpo-v0.2-zh_cn", "hpprc/jawiki", "eduagarcia/LegalPT_dedup", "christopherthompson81/quant_exploration", "alvarobartt/dpo-mix-7k-simplified", "ucekmez/OpenOrca-tr", "ehristoforu/dalle-3-images", "ivrit-ai/whisper-training", "SPRIGHT-T2I/spright", "coseal/CodeUltraFeedback_binarized", "ParasiticRogue/Bluemoon-Light", "wdndev/webnovel-chinese", "jondurbin/bagel-v0.5", "Lin-Chen/MMStar", "tolgadev/turkish_73k_instruct_extended", "Babelscape/ALERT_DPO", - "kigner/ruozhiba-llama3", "davanstrien/dataset-tldr-preference-dpo", "facebook/asset", "barilan/blog_authorship_corpus", "dataset-org/c3", "clinc_oos", "eli5_category", "mohnish/lc_quad", "lm1b", "para_crawl", + "kigner/ruozhiba-llama3", "davanstrien/dataset-tldr-preference-dpo", "facebook/asset", "barilan/blog_authorship_corpus", "dataset-org/c3", "clinc/clinc_oos", "eli5_category", "mohnish/lc_quad", "lm1b", "para_crawl", "spanish_billion_words", "squad_kor_v2", "squad_v1_pt", "swda", "thaisum", "wmt/wmt14", "SetFit/20_newsgroups", "bertin-project/mc4-sampling", "lbox/lbox_open", "codeparrot/codeparrot-clean-train", "thomwolf/github-python", "Adapting/empathetic_dialogues_v2", "Bingsu/Human_Action_Recognition", "mustapha/QuranExe", "ceyda/fashion-products-small", "frgfm/imagenette", "naver-clova-ix/synthdog-en", "bigscience/evaluation-results", "pcuenq/oxford-pets", "SLPL/syntran-fa", "RUCAIBox/Story-Generation", "jonathanli/law-stack-exchange", "ai-forever/school_notebooks_RU", "ashraq/esc50", "waifu-research-department/regularization", "sbx/superlim-2", "ashraq/financial-news", "AluminiumOxide/personal_latent_diffusion", "elenanereiss/german-ler", "Nerfgun3/flower_style", diff --git a/services/worker/tests/job_runners/dataset/test_compatible_libraries.py b/services/worker/tests/job_runners/dataset/test_compatible_libraries.py index dcfe739a4..f46669146 100644 --- a/services/worker/tests/job_runners/dataset/test_compatible_libraries.py +++ b/services/worker/tests/job_runners/dataset/test_compatible_libraries.py @@ -460,7 +460,7 @@ def test_compute_error( {"default": {"train": ["openhermes.json"]}}, ), ( - "cnn_dailymail", + "abisee/cnn_dailymail", "parquet", { "1.0.0": {