diff --git a/evals/registry/data/sql/co_sql.jsonl b/evals/registry/data/sql/co_sql.jsonl new file mode 100644 index 0000000000..7e406a8dde --- /dev/null +++ b/evals/registry/data/sql/co_sql.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a039a58e30d58517f567467974551247716f03faab4f166922c67a030a68d8f3 +size 151668 diff --git a/evals/registry/evals/co-sql.yaml b/evals/registry/evals/co-sql.yaml new file mode 100644 index 0000000000..7381eb363e --- /dev/null +++ b/evals/registry/evals/co-sql.yaml @@ -0,0 +1,11 @@ +co-sql: + id: co-sql.dev.v0 + metrics: [accuracy] + description: Evaluates performance on 100 samples of the CoSQL dataset, a conversational version of Text-to-SQL tasks. Each conversation simulates a real-world DB scenario where a user asks natural-language questions and a SQL expert retrieves answers in response. Yu, Tao, et al. "CoSQL A Conversational Text-to-SQL Challenge Towards Cross-Domain Natural Language Interfaces to Databases" https://arxiv.org/abs/1909.05378 +co-sql.dev.v0: + class: evals.elsuite.modelgraded.classify:ModelBasedClassify + args: + samples_jsonl: sql/co_sql.jsonl + eval_type: cot_classify + modelgraded_spec: sql +