>>> import datasets
>>> program_synthesis_dataset = datasets.load_dataset("NTU-NLP-sg/xCodeEval", "program_synthesis")
>>> print(program_synthesis_dataset)
DatasetDict({
    train: Dataset({
        features: ['prob_desc_output_to', 'prob_desc_created_at', 'file_name', 'prob_desc_memory_limit', 'code_uid', 'prob_desc_sample_inputs', 'prob_desc_description', 'difficulty', 'prob_desc_sample_outputs', 'prob_desc_time_limit', 'source_code', 'prob_desc_input_spec', 'prob_desc_notes', 'src_uid', 'lang', 'prob_desc_input_from', 'tags', 'prob_desc_output_spec', 'exec_outcome', 'lang_cluster', 'hidden_unit_tests'],
        num_rows: 5538841
    })
    validation: Dataset({
        features: ['prob_desc_output_to', 'prob_desc_created_at', 'file_name', 'prob_desc_memory_limit', 'code_uid', 'prob_desc_sample_inputs', 'prob_desc_description', 'difficulty', 'prob_desc_sample_outputs', 'prob_desc_time_limit', 'source_code', 'prob_desc_input_spec', 'prob_desc_notes', 'src_uid', 'lang', 'prob_desc_input_from', 'tags', 'prob_desc_output_spec', 'exec_outcome', 'lang_cluster', 'hidden_unit_tests'],
        num_rows: 106
    })
    test: Dataset({
        features: ['prob_desc_output_to', 'prob_desc_created_at', 'file_name', 'prob_desc_memory_limit', 'code_uid', 'prob_desc_sample_inputs', 'prob_desc_description', 'difficulty', 'prob_desc_sample_outputs', 'prob_desc_time_limit', 'source_code', 'prob_desc_input_spec', 'prob_desc_notes', 'src_uid', 'lang', 'prob_desc_input_from', 'tags', 'prob_desc_output_spec', 'exec_outcome', 'lang_cluster', 'hidden_unit_tests'],
        num_rows: 952
    })
})
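Each split behaves like a regular datasets.Dataset and can be indexed directly. A small usage sketch, using only the feature names listed above:

# Inspect a single sample from the validation split.
sample = program_synthesis_dataset["validation"][0]

# Every sample is a plain dict keyed by the feature names shown above.
print(sample["src_uid"])            # problem identifier
print(sample["lang"])               # compiler/runtime version, e.g. "GNU C++17"
print(sample["source_code"][:200])  # first 200 characters of the submitted program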
When loading the data with the Hugging Face load_dataset() API, no additional data linking is needed. But if you are downloading the data in raw *.jsonl format, you need to link the proper fields for the task yourself. To link the data, use src_uid to match each row against problem_descriptions.jsonl and unittest_db.json, as sketched below.
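A minimal linking sketch in Python, assuming the raw files have been downloaded as described in the next step; the specific shard path (train_000.jsonl) is just one example, and unittest_db.json is treated as a src_uid-keyed mapping of unit tests, per the field descriptions later on this page:

import json

# Build a lookup table from src_uid to its problem description.
problem_db = {}
with open("problem_descriptions.jsonl") as f:
    for line in f:
        prob = json.loads(line)
        problem_db[prob["src_uid"]] = prob

# unittest_db.json maps a src_uid to the unit tests of that problem.
with open("unittest_db.json") as f:
    unittest_db = json.load(f)

# Attach the problem description and unit tests to each program-synthesis sample.
with open("program_synthesis/train/train_000.jsonl") as f:
    for line in f:
        sample = json.loads(line)
        description = problem_db[sample["src_uid"]]
        unit_tests = unittest_db.get(sample["src_uid"], [])
        # sample, description and unit_tests now form one fully linked instance.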
To download the program synthesis data, run:
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/NTU-NLP-sg/xCodeEval
cd xCodeEval
git lfs pull --include "program_synthesis/*"
git lfs pull --include "problem_descriptions.jsonl"
git lfs pull --include "unittest_db.json"
{
"lang": "GNU C++17",
"source_code": "#include <bits/stdc++.h>\n#include <chrono> \nusing namespace std::chrono; \n\nusing namespace std;\n#define f0r(a, b) for (a = 0; a < b; a++)\n#define f1r(a, b, c) for (a = b; a < c; a++)\n#define ms(arr, v) memset(arr, v, sizeof(arr))\n#define pb push_back\n#define io ios_base::sync_with_stdio(false); cin.tie(NULL); cout.tie(NULL)\n#define mp make_pair\n#define f first\n#define s second\ntypedef long long ll;\ntypedef double ld;\ntypedef pair<int, int> pii;\ntypedef pair<ll, ll> pll;\nll i, j;\n\nll n, q, Q, T, m, k, r, x, y, z, g;\nstring a, b;\nunsigned char ans[200001];\n\nint main() {\n io;\n cin >> n >> a >> b;\n ms(ans, 0);\n for (int i = n-1; i >= 0; i--) {\n // f0r(j, n) cout << (char)(ans[j] < 10 ? ans[j] + '0' : ans[j]);\n cout << endl;\n if (i == n-1) {\n if (a[i] > b[i]) {\n int diff = a[i] - b[i];\n int x = b[i] + (diff>>1);\n while (x > 'z') {\n x -= 26;\n ans[i-1]++;\n }\n ans[i] = x;\n } else {\n int diff = b[i] - a[i];\n int x = a[i] + (diff>>1);\n ans[i] = x;\n }\n continue;\n }\n if (a[i] < b[i]) {\n int diff = b[i] - a[i];\n ans[i] += a[i] + (diff>>1);\n if (diff % 2 != 0) {\n ans[i+1] += 13;\n while (ans[i+1] > 'z') {\n ans[i+1] -= 26;\n ans[i]++;\n }\n }\n while (ans[i] > 'z') {\n ans[i] -= 26;\n ans[i-1]++;\n }\n } else if (a[i] == b[i]) {\n ans[i] = a[i];\n } else {\n int diff = a[i] - b[i];\n ans[i] += b[i] + (diff>>1);\n if (diff % 2 != 0) {\n ans[i+1] += 13;\n while (ans[i+1] > 'z') {\n ans[i+1] -= 26;\n ans[i]++;\n }\n }\n while (ans[i] > 'z') {\n ans[i] -= 26;\n ans[i-1]++;\n }\n }\n }\n f0r(i, n) cout << ans[i];\n cout << endl;\n // cout << \"FLUSH PLEASE\" << endl;\n}",
"tags": [
"number theory",
"bitmasks",
"math",
"strings"
],
"lang_cluster": "C++",
"src_uid": "5f4009d4065f5ad39e662095f8f5c068",
"code_uid": "3a1d3fab20424f3fadea46e0ac60f0c6",
"difficulty": 1900,
"exec_outcome": "PASSED"
}
{
"description": "You are given a string $$$s$$$ consisting of lowercase Latin letters. Let the length of $$$s$$$ be $$$|s|$$$. You may perform several operations on this string.In one operation, you can choose some index $$$i$$$ and remove the $$$i$$$-th character of $$$s$$$ ($$$s_i$$$) if at least one of its adjacent characters is the previous letter in the Latin alphabet for $$$s_i$$$. For example, the previous letter for b is a, the previous letter for s is r, the letter a has no previous letters. Note that after each removal the length of the string decreases by one. So, the index $$$i$$$ should satisfy the condition $$$1 \\le i \\le |s|$$$ during each operation.For the character $$$s_i$$$ adjacent characters are $$$s_{i-1}$$$ and $$$s_{i+1}$$$. The first and the last characters of $$$s$$$ both have only one adjacent character (unless $$$|s| = 1$$$).Consider the following example. Let $$$s=$$$ bacabcab. During the first move, you can remove the first character $$$s_1=$$$ b because $$$s_2=$$$ a. Then the string becomes $$$s=$$$ acabcab. During the second move, you can remove the fifth character $$$s_5=$$$ c because $$$s_4=$$$ b. Then the string becomes $$$s=$$$ acabab. During the third move, you can remove the sixth character $$$s_6=$$$'b' because $$$s_5=$$$ a. Then the string becomes $$$s=$$$ acaba. During the fourth move, the only character you can remove is $$$s_4=$$$ b, because $$$s_3=$$$ a (or $$$s_5=$$$ a). The string becomes $$$s=$$$ acaa and you cannot do anything with it. Your task is to find the maximum possible number of characters you can remove if you choose the sequence of operations optimally.",
"input_from": "standard input",
"output_to": "standard output",
"time_limit": "2 seconds",
"memory_limit": "256 megabytes",
"input_spec": "The only line of the input contains one integer $$$|s|$$$ ($$$1 \\le |s| \\le 100$$$) \u2014 the length of $$$s$$$. The second line of the input contains one string $$$s$$$ consisting of $$$|s|$$$ lowercase Latin letters.",
"output_spec": "Print one integer \u2014 the maximum possible number of characters you can remove if you choose the sequence of moves optimally.",
"notes": "NoteThe first example is described in the problem statement. Note that the sequence of moves provided in the statement is not the only, but it can be shown that the maximum possible answer to this test is $$$4$$$.In the second example, you can remove all but one character of $$$s$$$. The only possible answer follows. During the first move, remove the third character $$$s_3=$$$ d, $$$s$$$ becomes bca. During the second move, remove the second character $$$s_2=$$$ c, $$$s$$$ becomes ba. And during the third move, remove the first character $$$s_1=$$$ b, $$$s$$$ becomes a. ",
"sample_inputs": [
"8\nbacabcab",
"4\nbcda",
"6\nabbbbb"
],
"sample_outputs": [
"4",
"3",
"5"
],
"tags": [
"brute force",
"constructive algorithms",
"strings",
"greedy"
],
"src_uid": "9ce37bc2d361f5bb8a0568fb479b8a38",
"difficulty": 1600
}
1. lang: Runtime/compiler version of the source_code.
2. source_code: A program.
3. tags: List of potential algorithmic techniques required to write the program.
4. lang_cluster: A generic programming language name that the value of lang belongs to.
5. src_uid: A specific identifier that shows which problem the code is associated with. This identifier is important for training the model. The problem referred to by the src_uid provides a natural description of the problem that the code successfully solved. Refer to Structure of problem_descriptions.jsonl.
6. code_uid: A unique ID for the sample. It is not important for model training. If you find any issue with the sample, you can report it to us mentioning the code_uid.
7. difficulty: Difficulty rating of the problem indicated by src_uid. The higher, the harder.
8. exec_outcome: Execution outcome status. See Section 4.1 for the list of possible outcomes. The exec_outcome flags in the training data come from a pre-run environment. However, the training data does not include unit tests, to avoid potential hacks; unit tests are provided only for the dev and test data. A small filtering example over these fields is shown below.
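As an illustration of how these fields can be used, the splits loaded above can be filtered with the standard datasets filter API; the difficulty threshold below is an arbitrary choice for the example, not a recommendation:

# Keep only accepted C++ solutions to relatively easy problems.
easy_cpp_passed = program_synthesis_dataset["train"].filter(
    lambda ex: ex["exec_outcome"] == "PASSED"
    and ex["lang_cluster"] == "C++"
    and ex["difficulty"] is not None
    and ex["difficulty"] <= 1500
)
print(len(easy_cpp_passed))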
The following keys come from problem_descriptions.jsonl by matching src_uid:

9. prob_desc_description: Problem description in textual format; math operations are written in LaTeX.
10. prob_desc_input_from: How the program should take the unit test.
11. prob_desc_output_to: Where the program should output the result of the unit test.
12. prob_desc_time_limit: Time limit to solve the problem.
13. prob_desc_memory_limit: Memory limit to solve the problem.
14. prob_desc_input_spec: How and in what order the input will be given to the program. It also includes the data range, types, and sizes.
15. prob_desc_output_spec: How the outputs should be printed. Most of the time the unit test results are matched with an exact string match or a floating point comparison with a precision boundary.
16. prob_desc_sample_inputs: A sample input for the code that is expected to solve the problem described in description.
17. prob_desc_sample_outputs: The expected output for the sample_input that is expected to solve the problem described in description.
18. prob_desc_notes: Explanation of sample_inputs & sample_outputs.
19. prob_desc_created_at: The Unix timestamp when the problem was released. Use the datetime lib in Python to parse it to a human-readable format, as sketched after this list.
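For example, a minimal sketch of parsing prob_desc_created_at; the cast to int is defensive in case the timestamp is stored as a string:

from datetime import datetime, timezone

sample = program_synthesis_dataset["validation"][0]
# prob_desc_created_at is a Unix timestamp; render it as a human-readable UTC datetime.
released = datetime.fromtimestamp(int(sample["prob_desc_created_at"]), tz=timezone.utc)
print(released.isoformat())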
Source information comes from the name of the *.jsonl file:

20. file_name: Name of the source jsonl file from which the data is loaded.
Unit test information comes from unittest_db.json by matching src_uid:

21. hidden_unit_tests: A list of unit tests returned as a string. Use json.loads(hidden_unit_tests) to load the data, as sketched below.
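A minimal parsing sketch; the per-test keys ("input" and "output") are an assumption about how each unit test is structured, so verify them against your copy of the data:

import json

# The train split does not include unit tests, so take a sample from dev/test.
sample = program_synthesis_dataset["test"][0]
unit_tests = json.loads(sample["hidden_unit_tests"])

for test in unit_tests:
    # Assumed structure: {"input": <stdin string>, "output": [<accepted stdout strings>]}
    print(test["input"])
    print(test["output"])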
To verify the download, run the following:
cd xCodeEval/
tar c program_synthesis | md5sum
Output should match 4a1f740298075f1726ab7caa73495342.
The extracted data contains 3 directories and 113 files:

.
├── test
│ └── prog_syn_test.jsonl
├── train
│ ├── train_000.jsonl
│ ├── train_001.jsonl
│ ├── train_002.jsonl
│ ├── train_003.jsonl
│ ├── train_004.jsonl
│ ├── train_005.jsonl
│ ├── train_006.jsonl
│ ├── train_007.jsonl
│ ├── train_008.jsonl
│ ├── train_009.jsonl
│ ├── train_010.jsonl
│ ├── train_011.jsonl
│ ├── train_012.jsonl
│ ├── train_013.jsonl
│ ├── train_014.jsonl
│ ├── train_015.jsonl
│ ├── train_016.jsonl
│ ├── train_017.jsonl
│ ├── train_018.jsonl
│ ├── train_019.jsonl
│ ├── train_020.jsonl
│ ├── train_021.jsonl
│ ├── train_022.jsonl
│ ├── train_023.jsonl
│ ├── train_024.jsonl
│ ├── train_025.jsonl
│ ├── train_026.jsonl
│ ├── train_027.jsonl
│ ├── train_028.jsonl
│ ├── train_029.jsonl
│ ├── train_030.jsonl
│ ├── train_031.jsonl
│ ├── train_032.jsonl
│ ├── train_033.jsonl
│ ├── train_034.jsonl
│ ├── train_035.jsonl
│ ├── train_036.jsonl
│ ├── train_037.jsonl
│ ├── train_038.jsonl
│ ├── train_039.jsonl
│ ├── train_040.jsonl
│ ├── train_041.jsonl
│ ├── train_042.jsonl
│ ├── train_043.jsonl
│ ├── train_044.jsonl
│ ├── train_045.jsonl
│ ├── train_046.jsonl
│ ├── train_047.jsonl
│ ├── train_048.jsonl
│ ├── train_049.jsonl
│ ├── train_050.jsonl
│ ├── train_051.jsonl
│ ├── train_052.jsonl
│ ├── train_053.jsonl
│ ├── train_054.jsonl
│ ├── train_055.jsonl
│ ├── train_056.jsonl
│ ├── train_057.jsonl
│ ├── train_058.jsonl
│ ├── train_059.jsonl
│ ├── train_060.jsonl
│ ├── train_061.jsonl
│ ├── train_062.jsonl
│ ├── train_063.jsonl
│ ├── train_064.jsonl
│ ├── train_065.jsonl
│ ├── train_066.jsonl
│ ├── train_067.jsonl
│ ├── train_068.jsonl
│ ├── train_069.jsonl
│ ├── train_070.jsonl
│ ├── train_071.jsonl
│ ├── train_072.jsonl
│ ├── train_073.jsonl
│ ├── train_074.jsonl
│ ├── train_075.jsonl
│ ├── train_076.jsonl
│ ├── train_077.jsonl
│ ├── train_078.jsonl
│ ├── train_079.jsonl
│ ├── train_080.jsonl
│ ├── train_081.jsonl
│ ├── train_082.jsonl
│ ├── train_083.jsonl
│ ├── train_084.jsonl
│ ├── train_085.jsonl
│ ├── train_086.jsonl
│ ├── train_087.jsonl
│ ├── train_088.jsonl
│ ├── train_089.jsonl
│ ├── train_090.jsonl
│ ├── train_091.jsonl
│ ├── train_092.jsonl
│ ├── train_093.jsonl
│ ├── train_094.jsonl
│ ├── train_095.jsonl
│ ├── train_096.jsonl
│ ├── train_097.jsonl
│ ├── train_098.jsonl
│ ├── train_099.jsonl
│ ├── train_100.jsonl
│ ├── train_101.jsonl
│ ├── train_102.jsonl
│ ├── train_103.jsonl
│ ├── train_104.jsonl
│ ├── train_105.jsonl
│ ├── train_106.jsonl
│ ├── train_107.jsonl
│ ├── train_108.jsonl
│ ├── train_109.jsonl
│ └── train_110.jsonl
└── validation
└── prog_syn_val.jsonl