diff --git a/docs/source/openapi.json b/docs/source/openapi.json index e42fc8d9a..4fed46813 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -4308,7 +4308,7 @@ } }, "partial parquet export": { - "summary": "c4 (en): the parquet export is partial (first 5GB)", + "summary": "allenai/c4 (en): the parquet export is partial (first 5GB)", "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=allenai/c4&config=en", "value": { "parquet_files": [ diff --git a/services/worker/src/worker/job_runners/split/presidio_scan.py b/services/worker/src/worker/job_runners/split/presidio_scan.py index e9d2d6c17..20456bb4e 100644 --- a/services/worker/src/worker/job_runners/split/presidio_scan.py +++ b/services/worker/src/worker/job_runners/split/presidio_scan.py @@ -474,8 +474,8 @@ def compute(self) -> CompleteJobResult: "GAIR/lima", "HuggingFaceH4/no_robots", "cognitivecomputations/dolphin", "cerebras/SlimPajama-627B", "timdettmers/openassistant-guanaco", "HuggingFaceH4/ultrachat_200k", "EleutherAI/pile", "liuhaotian/LLaVA-Instruct-150K", "b-mc2/sql-create-context", "garage-bAInd/Open-Platypus", "bigcode/starcoderdata", "microsoft/orca-math-word-problems-200k", "imagenet-1k", "nyu-mll/glue", "bigcode/the-stack-dedup", "togethercomputer/RedPajama-Data-V2", "gretelai/synthetic_text_to_sql", "allenai/objaverse", "Skylion007/openwebtext", "wikitext", "HuggingFaceM4/WebSight", "RyokoAI/ShareGPT52K", "laion/OIG", "stanfordnlp/SHP", "PleIAs/YouTube-Commons", "Skywork/SkyPile-150B", "glaiveai/glaive-function-calling-v2", "Samsung/samsum", "lmsys/chatbot_arena_conversations", "openbmb/UltraFeedback", - "lambdalabs/pokemon-blip-captions", "shibing624/medical", "berkeley-nest/Nectar", "Intel/orca_dpo_pairs", "YeungNLP/firefly-train-1.1M", "BAAI/COIG-PC", "meta-math/MetaMathQA", "gsm8k", "codeparrot/github-code", "bookcorpus", - "Open-Orca/SlimOrca", "dair-ai/emotion", "CohereForAI/aya_dataset", "c4", "cais/mmlu", "open-web-math/open-web-math", "code_search_net", "allenai/WildChat-1M", "rajpurkar/squad", "litagin/moe-speech", + "lambdalabs/pokemon-blip-captions", "shibing624/medical", "berkeley-nest/Nectar", "Intel/orca_dpo_pairs", "YeungNLP/firefly-train-1.1M", "BAAI/COIG-PC", "meta-math/MetaMathQA", "gsm8k", "codeparrot/github-code", "bookcorpus/bookcorpus", + "Open-Orca/SlimOrca", "dair-ai/emotion", "CohereForAI/aya_dataset", "legacy-datasets/c4", "cais/mmlu", "open-web-math/open-web-math", "code_search_net", "allenai/WildChat-1M", "rajpurkar/squad", "litagin/moe-speech", "Lin-Chen/ShareGPT4V", "shareAI/ShareGPT-Chinese-English-90k", "nomic-ai/gpt4all-j-prompt-generations", "ceval/ceval-exam", "google/fleurs", "openai/webgpt_comparisons", "bigcode/the-stack-v2", "HuggingFaceM4/the_cauldron", "Salesforce/dialogstudio", "LDJnr/Capybara", "stanfordnlp/imdb", "nampdn-ai/tiny-codes", "CausalLM/Refined-Anime-Text", "bigscience/P3", "vicgalle/alpaca-gpt4", "bigcode/ta-prompt", "Locutusque/UltraTextbooks", "allenai/c4", "pile-of-law/pile-of-law", "teknium/openhermes", "TIGER-Lab/MathInstruct", "HuggingFaceH4/ultrafeedback_binarized", "PygmalionAI/PIPPA", "openai_humaneval", "cnn_dailymail", "yizhongw/self_instruct", "SirNeural/flan_v2", "nvidia/HelpSteer", "THUDM/AgentInstruct", "nvidia/OpenMathInstruct-1", @@ -538,7 +538,7 @@ def compute(self) -> CompleteJobResult: "LLM360/AmberDatasets", "peiyi9979/Math-Shepherd", "Crystalcareai/MoD", "papluca/language-identification", "bigcode/the-stack-smol", "argilla/news-summary", "CarperAI/openai_summarize_comparisons", "argilla/databricks-dolly-15k-curated-en", "mikex86/stackoverflow-posts", "Anthropic/llm_global_opinions", "akjindal53244/Arithmo-Data", "OpenLLM-France/Claire-Dialogue-French-0.1", "arbml/CIDAR", "snorkelai/Snorkel-Mistral-PairRM-DPO-Dataset", "PleIAs/US-PD-Newspapers", "yh0701/FracAtlas_dataset", "somosnlp/Reglamento_Aeronautico_Colombiano_2024GemmaQA", "omi-health/medical-dialogue-to-soap-summary", "argilla/Capybara-Preferences", "UCLNLP/adversarial_qa", "conv_ai_2", "ccdv/govreport-summarization", "mozilla-foundation/common_voice_8_0", "nomic-ai/gpt4all_prompt_generations_with_p3", "hugfaceguy0001/retarded_bar", "lksy/ru_instruct_gpt4", "Linly-AI/Chinese-pretraining-dataset", "mosaicml/instruct-v3", "corbt/all-recipes", "VatsaDev/TinyText", - "google/docci", "linux-cn/archive", "Johnnyeee/Yelpdata_663", "HuggingFaceTB/cosmopedia-100k", "nyu-mll/blimp", "bookcorpusopen", "iwslt2017", "recipe_nlg", "Helsinki-NLP/tatoeba", "GEM/viggo", + "google/docci", "linux-cn/archive", "Johnnyeee/Yelpdata_663", "HuggingFaceTB/cosmopedia-100k", "nyu-mll/blimp", "defunct-datasets/bookcorpusopen", "iwslt2017", "recipe_nlg", "Helsinki-NLP/tatoeba", "GEM/viggo", "bavard/personachat_truecased", "segments/sidewalk-semantic", "PolyAI/banking77", "facebook/pmd", "zeroshot/twitter-financial-news-topic", "nuprl/MultiPL-E", "GBaker/MedQA-USMLE-4-options", "camel-ai/code", "merve/turkish_instructions", "tasksource/oasst1_pairwise_rlhf_reward", "winddude/reddit_finance_43_250k", "tiedong/goat", "togethercomputer/RedPajama-Data-Instruct", "DKYoon/SlimPajama-6B", "Maxx0/sexting-nsfw-adultconten", "squarelike/OpenOrca-gugugo-ko", "MMInstruction/VLFeedback", "LLaVA-VL/llava-plus-data", "McAuley-Lab/Amazon-Reviews-2023", "Open-Orca/1million-gpt-4", "gwenxin/pills_inside_bottles", "keithito/lj_speech", "conll2012_ontonotesv5", "mwritescode/slither-audited-smart-contracts", "bsmock/pubtables-1m", "tasksource/mmlu", "bigcode/bigcode-pii-dataset", "medalpaca/medical_meadow_wikidoc", "P01son/instructions", "ArtifactAI/arxiv-physics-instruct-tune-30k", @@ -613,7 +613,7 @@ def compute(self) -> CompleteJobResult: "vibhorag101/phr_mental_therapy_dataset", "Vision-Flan/vision-flan_191-task_1k", "ahmed-masry/ChartQA", "ProlificAI/social-reasoning-rlhf", "BAAI/DataOptim", "Heralax/Augmental-Dataset", "LLM-Tuning-Safety/HEx-PHI", "kwaikeg/KAgentBench", "SeaLLMs/Sea-bench", "athirdpath/DPO_Pairs-Roleplay-Alpaca-NSFW-v1-SHUFFLED", "yale-nlp/FOLIO", "RealTimeData/bbc_news_alltime", "HuggingFaceH4/orca_dpo_pairs", "NebulaeWis/gelbooru_images", "llm-blender/Unified-Feedback", "grimulkan/LimaRP-augmented", "cyberagent/chatbot-arena-ja-calm2-7b-chat-experimental", "ehristoforu/midjourney-images", "Jiwonny29/project1", "Major-TOM/Core-S2L1C", "gorilla-llm/Berkeley-Function-Calling-Leaderboard", "julep-ai/openai-community-posts", "SALT-NLP/Design2Code", "Locutusque/OpenCerebrum-SFT", "m-a-p/CodeEditorBench", "chansung/merged_ds_coding", "spectrallabs/credit-scoring-training-dataset", "shareAI/DPO-zh-en-emoji", "rqq/GLM-4-Instruct-4K-zh", "Helsinki-NLP/bible_para", - "brwac", "conllpp", "covost2", "head_qa", "facebook/lama", "multi_x_science_sum", "ptb_text_only", "social_bias_frames", "sst", "the_pile_openwebtext2", + "UFRGS/brwac", "conllpp", "covost2", "head_qa", "facebook/lama", "multi_x_science_sum", "ptb_text_only", "social_bias_frames", "sst", "the_pile_openwebtext2", "wiki40b", "wiki_atomic_edits", "botisan-ai/cantonese-mandarin-translations", "nlpaueb/finer-139", "wikitablequestions", "silver/lccc", "facebook/content_rephrasing", "Twitter/TwitterFollowGraph", "Nerfgun3/wlop_style", "TheFusion21/PokemonCards", "jeanlee/kmhas_korean_hate_speech", "sander-wood/irishman", "tobiolatunji/afrispeech-200", "swaption2009/20k-en-zh-translation-pinyin-hsk", "danielshemesh/midjourney", "Elfsong/ClinicalDataset", "Den4ikAI/russian_instructions", "paulofinardi/OIG_small_chip2_portuguese_brasil", "acheong08/nsfw_reddit", "VISION-Workshop/VISION-Datasets", "P1ayer-1/chatgpt-conversations-chatlogs.net", "wavpub/JinJinLeDao_QA_Dataset", "lang-uk/every_prompt", "pki/SecurityGPT", "zjkarina/matreshka", "deepghs/nsfw_detect", "JasperLS/prompt-injections", "ccmusic-database/music_genre", "jondurbin/airoboros-gpt4", "TigerResearch/pretrain_en", @@ -663,7 +663,7 @@ def compute(self) -> CompleteJobResult: "listen2you002/ChartLlama-Dataset", "saillab/taco-datasets", "nuprl/CanItEdit", "kyujinpy/orca_math_dpo", "adamkarvonen/chess_games", "blancsw/oasst2_top1_chat_format", "Awiny/Howto-Interlink7M", "NobodyExistsOnTheInternet/ToxicDPOqa", "VatsaDev/worldbuild", "lorinma/NL2SQL_zh", "mlabonne/chessllm", "genggui001/gg_zh_v1_550B", "DL3DV/DL3DV-ALL-4K", "paraloq/json_data_extraction", "tastypear/unalignment-toxic-dpo-v0.2-zh_cn", "hpprc/jawiki", "eduagarcia/LegalPT_dedup", "christopherthompson81/quant_exploration", "alvarobartt/dpo-mix-7k-simplified", "ucekmez/OpenOrca-tr", "ehristoforu/dalle-3-images", "ivrit-ai/whisper-training", "SPRIGHT-T2I/spright", "coseal/CodeUltraFeedback_binarized", "ParasiticRogue/Bluemoon-Light", "wdndev/webnovel-chinese", "jondurbin/bagel-v0.5", "Lin-Chen/MMStar", "tolgadev/turkish_73k_instruct_extended", "Babelscape/ALERT_DPO", - "kigner/ruozhiba-llama3", "davanstrien/dataset-tldr-preference-dpo", "facebook/asset", "barilan/blog_authorship_corpus", "c3", "clinc_oos", "eli5_category", "mohnish/lc_quad", "lm1b", "para_crawl", + "kigner/ruozhiba-llama3", "davanstrien/dataset-tldr-preference-dpo", "facebook/asset", "barilan/blog_authorship_corpus", "dataset-org/c3", "clinc_oos", "eli5_category", "mohnish/lc_quad", "lm1b", "para_crawl", "spanish_billion_words", "squad_kor_v2", "squad_v1_pt", "swda", "thaisum", "wmt/wmt14", "SetFit/20_newsgroups", "bertin-project/mc4-sampling", "lbox/lbox_open", "codeparrot/codeparrot-clean-train", "thomwolf/github-python", "Adapting/empathetic_dialogues_v2", "Bingsu/Human_Action_Recognition", "mustapha/QuranExe", "ceyda/fashion-products-small", "frgfm/imagenette", "naver-clova-ix/synthdog-en", "bigscience/evaluation-results", "pcuenq/oxford-pets", "SLPL/syntran-fa", "RUCAIBox/Story-Generation", "jonathanli/law-stack-exchange", "ai-forever/school_notebooks_RU", "ashraq/esc50", "waifu-research-department/regularization", "sbx/superlim-2", "ashraq/financial-news", "AluminiumOxide/personal_latent_diffusion", "elenanereiss/german-ler", "Nerfgun3/flower_style",