diff --git a/refs/pull/405/merge/.buildinfo b/refs/pull/405/merge/.buildinfo new file mode 100644 index 00000000..1dd263c9 --- /dev/null +++ b/refs/pull/405/merge/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. +config: 012007e1214812f57fe9bda99df4d3ab +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/refs/pull/405/merge/_images/boot.cast b/refs/pull/405/merge/_images/boot.cast new file mode 100644 index 00000000..e69de29b diff --git a/refs/pull/405/merge/_images/buffers-scheme.png b/refs/pull/405/merge/_images/buffers-scheme.png new file mode 100644 index 00000000..aa70c0ed Binary files /dev/null and b/refs/pull/405/merge/_images/buffers-scheme.png differ diff --git a/refs/pull/405/merge/_images/context_switch.cast b/refs/pull/405/merge/_images/context_switch.cast new file mode 100644 index 00000000..d59458dc --- /dev/null +++ b/refs/pull/405/merge/_images/context_switch.cast @@ -0,0 +1,1055 @@ +{"version": 2, "width": 80, "height": 24, "timestamp": 1615893527, "idle_time_limit": 1.0, "env": {"SHELL": null, "TERM": "xterm"}} +[0.002326, "o", "$ "] +[1.9175, "o", "m"] +[1.959726, "o", "a"] +[2.032681, "o", "k"] +[2.164502, "o", "e"] +[2.232149, "o", " "] +[2.488152, "o", "g"] +[2.596492, "o", "d"] +[2.677323, "o", "b"] +[5.648402, "o", "\r\n"] +[5.657328, "o", "gdb -ex \"target remote localhost:1234\" /linux/vmlinux\r\n"] +[5.69046, "o", "\u001b[35;1m\u001b[35;1mGNU gdb \u001b[m\u001b[35;1m(Ubuntu 9.2-0ubuntu1~20.04) \u001b[m\u001b[35;1m9.2\u001b[m\u001b[35;1m\r\n\u001b[m\u001b[mCopyright (C) 2020 Free Software Foundation, Inc.\r\nLicense GPLv3+: GNU GPL version 3 or later \r\nThis is free software: you are free to change and redistribute it.\r\nThere is NO WARRANTY, to the extent permitted by law.\r\nType \"show copying\" and \"show warranty\" for details.\r\nThis GDB was configured as \"x86_64-linux-gnu\".\r\nType \"show 
configuration\" for configuration details.\r\nFor bug reporting instructions, please see:\r\n.\r\nFind the GDB manual and other documentation resources online at:\r\n .\r\n\r\nFor help, type \"help\".\r\nType \"apropos word\" to search for commands related to \"word\"...\r\n"] +[5.690654, "o", "Reading symbols from \u001b[32m/linux/vmlinux\u001b[m...\r\n"] +[6.256082, "o", "Remote debugging using localhost:1234\r\n"] +[6.268525, "o", "\u001b[33mdefault_idle\u001b[m () at \u001b[32march/x86/kernel/process.c\u001b[m:689\r\n689\t}\r\n"] +[6.269022, "o", "(gdb) "] +[8.116975, "o", "b"] +[8.192267, "o", "r"] +[8.249065, "o", "e"] +[8.289452, "o", "a"] +[8.360076, "o", "k"] +[8.473813, "o", " "] +[8.832714, "o", "_"] +[9.629251, "o", "_"] +[9.941666, "o", "s"] +[10.104279, "o", "w"] +[10.221431, "o", "i"] +[10.309944, "o", "t"] +[10.509059, "o", "c"] +[10.557026, "o", "h"] +[10.849393, "o", "_"] +[11.068795, "o", "t"] +[11.110265, "o", "o"] +[11.352371, "o", "_"] +[14.021585, "o", "a"] +[14.117609, "o", "s"] +[14.213449, "o", "m"] +[14.888578, "o", "\r\n"] +[14.9167, "o", "Breakpoint 1 at \u001b[34m0xc10018e8\u001b[m: file \u001b[32march/x86/entry/entry_32.S\u001b[m, line 765.\r\n(gdb) "] +[16.968107, "o", "c"] +[18.104467, "o", "\r\nContinuing.\r\n"] +[18.156456, "o", "\r\n"] +[18.156634, "o", "Breakpoint 1, \u001b[33m__switch_to_asm\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:765\r\n765\t\tpushl\t%ebp\r\n(gdb) "] +[21.008556, "o", "#"] +[21.237238, "o", " "] +[21.396102, "o", "l"] +[21.51189, "o", "e"] +[21.76495, "o", "t"] +[22.05648, "o", "s"] +[22.357544, "o", " "] +[24.804429, "o", "i"] +[25.30049, "o", "n"] +[25.476535, "o", "s"] +[25.664918, "o", "p"] +[25.773304, "o", "e"] +[25.860811, "o", "c"] +[26.128876, "o", "t"] +[26.278527, "o", " "] +[26.396147, "o", "t"] +[26.648694, "o", "h"] +[27.311874, "o", "e"] +[27.496339, "o", " "] +[31.342091, "o", "p"] +[31.483434, "o", "r"] +[31.554903, "o", "e"] +[31.796618, "o", "v"] +[32.029108, "o", " "] 
+[33.628667, "o", "t"] +[34.296017, "o", "\b\u001b[K"] +[34.841223, "o", "("] +[36.838321, "o", "\b\u001b[K"] +[40.880455, "o", "t"] +[40.999641, "o", "a"] +[41.19229, "o", "s"] +[41.299116, "o", "k"] +[42.109647, "o", " "] +[42.583652, "o", "("] +[43.015787, "o", "\b\u001b[K"] +[43.181496, "o", "-"] +[43.312415, "o", " "] +[43.501299, "o", "t"] +[43.544044, "o", "h"] +[43.673582, "o", "e"] +[43.728303, "o", " "] +[43.912104, "o", "t"] +[44.079964, "o", "a"] +[44.136162, "o", "s"] +[44.225298, "o", "k"] +[44.297573, "o", " "] +[44.424163, "o", "w"] +[44.539926, "o", "e"] +[44.701627, "o", " "] +[44.845224, "o", "a"] +[45.013358, "o", "r"] +[45.058936, "o", "e"] +[45.143809, "o", " "] +[45.243002, "o", "s"] +[45.400578, "o", "w"] +[45.479309, "o", "i"] +[45.607647, "o", "t"] +[45.838044, "o", "c"] +[45.888549, "o", "h"] +[45.951578, "o", "i"] +[46.061361, "o", "n"] +[46.116627, "o", "g"] +[46.260458, "o", " "] +[46.312755, "o", "a"] +[46.62442, "o", "w"] +[46.717299, "o", "a"] +[46.870336, "o", "y"] +[46.975698, "o", " "] +[47.14446, "o", "f"] +[47.209771, "o", "r"] +[47.259233, "o", "o"] +[47.311775, "o", "m"] +[47.840572, "o", "\r\n(gdb) "] +[50.231971, "o", "l"] +[50.440366, "o", "i"] +[50.495315, "o", "s"] +[50.719925, "o", "t"] +[50.82848, "o", " "] +[51.933215, "o", "7"] +[54.782257, "o", "6"] +[56.895922, "o", "0"] +[57.559601, "o", "\r\n"] +[57.561485, "o", "755\t/*\r\n756\t * %eax: prev task\r\n757\t * %edx: next task\r\n758\t */\r\n759\t.pushsection .text, \"ax\"\r\n760\tSYM_CODE_START(__switch_to_asm)\r\n761\t\t/*\r\n762\t\t * Save callee-saved registers\r\n763\t\t * This must match the order in struct inactive_task_frame\r\n764\t\t */\r\n(gdb) "] +[60.462285, "o", "p"] +[60.804466, "o", "r"] +[60.880276, "o", "i"] +[60.952589, "o", "n"] +[61.048481, "o", "t"] +[61.144253, "o", " "] +[61.567815, "o", "("] +[62.084353, "o", "("] +[62.340434, "o", "s"] +[62.521336, "o", "t"] +[62.584033, "o", "r"] +[62.681088, "o", "u"] +[62.808595, "o", "c"] +[63.020327, 
"o", "t"] +[63.157429, "o", " "] +[63.280568, "o", "t"] +[63.377159, "o", "a"] +[63.533418, "o", "s"] +[63.622482, "o", "k"] +[63.875748, "o", "_"] +[64.146633, "o", "s"] +[64.482186, "o", "t"] +[64.511947, "o", "r"] +[64.637404, "o", "u"] +[64.777177, "o", "c"] +[64.97617, "o", "t"] +[65.695387, "o", "*"] +[66.288011, "o", ")"] +[67.279895, "o", "$"] +[71.141799, "o", "e"] +[71.240296, "o", "a"] +[71.636944, "o", "x"] +[72.43762, "o", ")"] +[72.839998, "o", "-"] +[73.182685, "o", ">"] +[73.437056, "o", "c"] +[73.58784, "o", "o"] +[73.7922, "o", "m"] +[73.916696, "o", "m"] +[74.544734, "o", "\r\n"] +[74.575314, "o", "$1 = \"swapper/0\\000\\000\\000\\000\\000\\000\"\r\n(gdb) "] +[78.960006, "o", "#"] +[79.380986, "o", " "] +[80.31777, "o", "a"] +[80.439599, "o", "n"] +[80.535856, "o", "d"] +[80.628399, "o", " "] +[80.759967, "o", "l"] +[80.807871, "o", "e"] +[80.984042, "o", "t"] +[81.197923, "o", "s"] +[81.316977, "o", " "] +[81.439785, "o", "s"] +[81.749919, "o", "e"] +[81.884422, "o", "e"] +[82.073889, "o", " "] +[83.576034, "o", "t"] +[83.693926, "o", "h"] +[83.808938, "o", "e"] +[83.973244, "o", " "] +[84.221634, "o", "s"] +[85.29237, "o", "\b\u001b[K"] +[85.445562, "o", "\b\u001b[K"] +[85.567927, "o", "\b\u001b[K"] +[85.701674, "o", "\b\u001b[K"] +[85.824798, "o", "\b\u001b[K"] +[86.004056, "o", "t"] +[86.079743, "o", "o"] +[86.237188, "o", " "] +[86.652315, "o", "w"] +[86.749408, "o", "h"] +[86.808631, "o", "i"] +[86.917022, "o", "c"] +[86.98438, "o", "h"] +[87.06647, "o", " "] +[87.260113, "o", "t"] +[87.388902, "o", "a"] +[87.551541, "o", "s"] +[87.712082, "o", "k"] +[88.144914, "o", " "] +[88.411207, "o", "w"] +[88.511967, "o", "e"] +[88.6247, "o", " "] +[88.808452, "o", "a"] +[88.973655, "o", "r"] +[89.044472, "o", "e"] +[89.128021, "o", " "] +[89.293244, "o", "s"] +[89.485572, "o", "w"] +[89.59152, "o", "i"] +[90.069057, "o", "t"] +[90.356352, "o", "c"] +[90.426188, "o", "h"] +[90.524199, "o", "i"] +[90.623739, "o", "n"] +[90.712003, "o", "g"] 
+[90.831687, "o", " "] +[90.996124, "o", "t"] +[91.064549, "o", "o"] +[91.608395, "o", "\r\n"] +[91.608597, "o", "(gdb) "] +[92.768673, "o", "# and lets see to which task we are switching to"] +[93.125736, "o", "\b"] +[93.506024, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[9Pprint ((struct task_struct*)$eax)->comm"] +[93.752135, "o", "\b"] +[94.255802, "o", "\b"] +[94.2866, "o", "\b"] +[94.33478, "o", "\b"] +[94.36526, "o", "\b"] +[94.397811, "o", "\b"] +[94.428704, "o", "\b"] +[94.460327, "o", "\b"] +[94.495522, "o", "\b"] +[94.523042, "o", "\b"] +[94.554005, "o", "\b"] +[94.815994, "o", "\u001b[C"] +[94.981232, "o", "\u001b[C"] +[95.133933, "o", "\u001b[C"] +[97.174123, "o", "\b\u001b[1Px)->comm\b\b\b\b\b\b\b\b"] +[97.320654, "o", "dx)->comm\b\b\b\b\b\b\b\b"] +[98.368145, "o", "\r\n"] +[98.386055, "o", "$2 = \"kworker/0:1\\000\\000\\000\\000\"\r\n(gdb) "] +[109.290011, "o", "c"] +[110.600274, "o", "\r\nContinuing.\r\n"] +[110.611283, "o", "\r\n"] +[110.611546, "o", "Breakpoint 1, \u001b[33m__switch_to_asm\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:765\r\n765\t\tpushl\t%ebp\r\n(gdb) "] +[111.72507, "o", "c"] +[111.887716, "o", "\bprint ((struct task_struct*)$edx)->comm"] +[113.100353, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C# and lets see to which task we are switching to"] +[114.228918, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[9Pprint ((struct task_struct*)$edx)->comm"] +[115.280062, "o", "\r\n"] +[115.296922, "o", "$3 = \"swapper/0\\000\\000\\000\\000\\000\\000\"\r\n(gdb) "] +[116.415272, "o", "c"] +[116.912694, "o", "\r\n"] +[116.912781, "o", "Continuing.\r\n"] +[117.312405, "o", "\r\n"] +[117.313149, "o", "Breakpoint 1, \u001b[33m__switch_to_asm\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:765\r\n765\t\tpushl\t%ebp\r\n"] +[117.313648, "o", "(gdb) "] +[117.933355, "o", "c"] +[118.080051, "o", "\bprint ((struct task_struct*)$edx)->comm"] +[119.912687, "o", "\r\n"] +[119.92993, 
"o", "$4 = \"init\\000er/0\\000\\000\\000\\000\\000\\000\"\r\n(gdb) "] +[123.256997, "o", "print ((struct task_struct*)$edx)->comm"] +[123.876701, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[K"] +[124.45439, "o", "print ((struct task_struct*)$edx)->comm"] +[124.647655, "o", "\b"] +[124.80468, "o", "\b"] +[124.952888, "o", "\b"] +[125.110239, "o", "\b"] +[125.248208, "o", "\b"] +[125.396351, "o", "\b"] +[125.549064, "o", "\b"] +[125.688114, "o", "\b"] +[126.095543, "o", "\b\u001b[1Px)->comm\b\b\b\b\b\b\b\b"] +[126.191882, "o", "ax)->comm\b\b\b\b\b\b\b\b"] +[126.719944, "o", "\r\n"] +[126.736207, "o", "$5 = \"swapper/0\\000\\000\\000\\000\\000\\000\"\r\n(gdb) "] +[133.316754, "o", "#"] +[133.484351, "o", " "] +[133.66117, "o", "l"] +[133.824069, "o", "o"] +[133.949648, "o", "o"] +[134.022861, "o", "k"] +[134.17215, "o", "s"] +[134.223574, "o", " "] +[134.331832, "o", "l"] +[134.49252, "o", "i"] +[134.639694, "o", "k"] +[134.688238, "o", "e"] +[134.775713, "o", " "] +[134.879648, "o", "w"] +[134.951696, "o", "e"] +[135.033237, "o", " "] +[135.163839, "o", "a"] +[135.264608, "o", "r"] +[135.320082, "o", "e"] +[135.40093, "o", " "] +[135.535122, "o", "s"] +[135.725486, "o", "w"] +[135.816051, "o", "i"] +[135.927456, "o", "t"] +[136.131156, "o", "c"] +[136.202658, "o", "h"] +[136.256422, "o", "i"] +[136.396776, "o", "n"] +[136.460338, "o", "g"] +[136.580257, "o", " "] +[136.687682, "o", "f"] +[136.77995, "o", "r"] +[136.832341, "o", "o"] +[136.887861, "o", "m"] +[137.036266, "o", " "] +[137.135292, "o", "t"] +[137.247438, "o", "h"] +[137.356536, "o", "e"] +[137.439224, "o", " "] +[137.678756, "o", "s"] +[137.871289, "o", "w"] +[137.999837, "o", "a"] +[138.119198, "o", "p"] +[138.257533, "o", "p"] +[138.350743, "o", "e"] +[138.433485, "o", "r"] +[138.749519, "o", " "] +[139.888082, "o", "t"] +[140.087705, "o", "a"] +[140.119863, "o", "s"] +[140.276783, "o", "k"] +[140.435722, "o", " "] +[140.592133, "o", "t"] +[140.691411, "o", "o"] +[140.78898, "o", " 
"] +[140.873948, "o", "t"] +[141.005069, "o", "h"] +[141.055666, "o", "e"] +[141.191609, "o", " "] +[141.493029, "o", "i"] +[141.5594, "o", "n"] +[141.639238, "o", "i"] +[141.744796, "o", "t"] +[141.82812, "o", " "] +[141.927392, "o", "t"] +[142.103712, "o", "a"] +[142.179748, "o", "s"] +[142.452147, "o", "k"] +[143.168201, "o", "\r\n(gdb) "] +[146.056263, "o", "b"] +[146.12877, "o", "t"] +[148.33612, "o", "\r\n"] +[148.35078, "o", "#0 \u001b[33m__switch_to_asm\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:765\r\n"] +[148.351576, "o", "#1 \u001b[34m0xc15d8277\u001b[m in \u001b[33mcontext_switch\u001b[m (\u001b[36mrf\u001b[m=0xc17c9f04, \u001b[36mnext\u001b[m=, \u001b[m\r\n"] +[148.351914, "o", " \u001b[m\u001b[36mprev\u001b[m=0xc17d02c0 , \u001b[36mrq\u001b[m=0xcfdcb700) at \u001b[32mkernel/sched/core.c\u001b[m:3779\r\n"] +[148.352077, "o", "#2 \u001b[33m__schedule\u001b[m (\u001b[36mpreempt\u001b[m=, \u001b[36mpreempt@entry\u001b[m=false)\u001b[m\r\n"] +[148.352169, "o", " \u001b[m at \u001b[32mkernel/sched/core.c\u001b[m:4528\r\n#3 \u001b[34m0xc15d8a37\u001b[m in \u001b[33mschedule_idle\u001b[m () at \u001b[32mkernel/sched/core.c\u001b[m:4634\r\n"] +[148.358451, "o", "#4 \u001b[34m0xc108d8a5\u001b[m in \u001b[33mdo_idle\u001b[m () at \u001b[32mkernel/sched/idle.c\u001b[m:327\r\n"] +[148.365129, "o", "#5 \u001b[34m0xc108dbd5\u001b[m in \u001b[33mcpu_startup_entry\u001b[m (\u001b[36mstate=state@entry\u001b[m=CPUHP_ONLINE)\u001b[m\r\n \u001b[m at \u001b[32mkernel/sched/idle.c\u001b[m:395\r\n#6 \u001b[34m0xc15d6100\u001b[m in \u001b[33mrest_init\u001b[m () at \u001b[32minit/main.c\u001b[m:721\r\n#7 \u001b[34m0xc18c77de\u001b[m in \u001b[33march_call_rest_init\u001b[m () at \u001b[32minit/main.c\u001b[m:845\r\n"] +[148.36552, "o", "#8 \u001b[34m0xc18c7c30\u001b[m in \u001b[33mstart_kernel\u001b[m () at \u001b[32minit/main.c\u001b[m:1061\r\n"] +[148.367951, "o", "#9 \u001b[34m0xc18c7218\u001b[m in \u001b[33mi386_start_kernel\u001b[m () at 
\u001b[32march/x86/kernel/head32.c\u001b[m:56\r\n"] +[148.368485, "o", "#10 \u001b[34m0xc10001db\u001b[m in \u001b[33mstartup_32_smp\u001b[m () at \u001b[32march/x86/kernel/head_32.S\u001b[m:327\r\n"] +[148.36957, "o", "#11 \u001b[34m0x00000000\u001b[m in \u001b[33m??\u001b[m ()\r\n"] +[148.369972, "o", "(gdb) "] +[151.00407, "o", "#"] +[151.19242, "o", " "] +[151.460669, "o", "y"] +[151.535715, "o", "e"] +[151.62446, "o", "s"] +[151.759619, "o", ","] +[151.847569, "o", " "] +[152.111536, "o", "t"] +[152.207294, "o", "h"] +[152.251822, "o", "i"] +[152.375566, "o", "s"] +[152.468326, "o", " "] +[152.651638, "o", "l"] +[152.815662, "o", "o"] +[152.959592, "o", "o"] +[153.076986, "o", "k"] +[153.231758, "o", "s"] +[153.335568, "o", " "] +[153.503403, "o", "l"] +[153.639799, "o", "i"] +[153.814824, "o", "k"] +[153.892142, "o", "e"] +[154.011029, "o", " "] +[154.136079, "o", "t"] +[154.191539, "o", "h"] +[154.287617, "o", "e"] +[154.367043, "o", " "] +[154.535718, "o", "s"] +[154.719646, "o", "w"] +[154.838308, "o", "a"] +[155.020956, "o", "p"] +[155.404904, "o", "p"] +[155.536173, "o", "e"] +[155.653023, "o", "r"] +[155.768249, "o", " "] +[156.056207, "o", "t"] +[156.143169, "o", "a"] +[156.291919, "o", "s"] +[156.440453, "o", "k"] +[158.224694, "o", "\r\n(gdb) "] +[162.551586, "o", "s"] +[163.114829, "o", "t"] +[163.205076, "o", "e"] +[163.340616, "o", "p"] +[163.643258, "o", "i"] +[167.327688, "o", "\b\u001b[K"] +[167.839044, "o", "\b\u001b[K"] +[167.965402, "o", "\b\u001b[K"] +[168.113034, "o", "\b\u001b[K"] +[168.239474, "o", "\b\u001b[K"] +[168.484145, "o", "#"] +[168.703431, "o", " "] +[169.027423, "o", "l"] +[169.112353, "o", "e"] +[169.28797, "o", "t"] +[169.524584, "o", "s"] +[169.735299, "o", " "] +[171.135828, "o", "s"] +[171.4075, "o", "t"] +[171.464843, "o", "e"] +[171.808964, "o", "p"] +[172.115755, "o", " "] +[172.317145, "o", "a"] +[172.47342, "o", "n"] +[172.596959, "o", "d"] +[172.735489, "o", " "] +[172.927669, "o", "o"] +[173.042781, "o", "v"] 
+[173.108033, "o", "e"] +[173.907894, "o", "\b\u001b[K"] +[174.127765, "o", "b"] +[174.815899, "o", "\b\u001b[K"] +[174.939745, "o", "\b\u001b[K"] +[175.15, "o", "b"] +[175.428235, "o", "s"] +[175.59581, "o", "e"] +[175.723777, "o", "r"] +[175.97285, "o", "v"] +[176.055296, "o", "e"] +[176.248513, "o", " "] +[176.51233, "o", "t"] +[177.968133, "o", "h"] +[179.192669, "o", "e"] +[182.952071, "o", "\b\u001b[K"] +[183.087972, "o", "\b\u001b[K"] +[183.214224, "o", "\b\u001b[K"] +[183.448903, "o", "h"] +[183.488214, "o", "o"] +[183.551638, "o", "w"] +[183.707961, "o", " "] +[183.951723, "o", "t"] +[184.027742, "o", "h"] +[184.119558, "o", "e"] +[184.227577, "o", " "] +[184.428877, "o", "c"] +[184.515292, "o", "o"] +[184.599136, "o", "n"] +[184.695565, "o", "t"] +[184.751404, "o", "e"] +[184.975304, "o", "x"] +[185.18823, "o", "t"] +[185.287641, "o", " "] +[185.383712, "o", "s"] +[185.581109, "o", "w"] +[185.704285, "o", "i"] +[185.783127, "o", "t"] +[186.016302, "o", "c"] +[186.102309, "o", "h"] +[187.085348, "o", " "] +[187.500213, "o", "u"] +[187.57576, "o", "n"] +[187.815524, "o", "f"] +[188.02442, "o", "o"] +[188.199542, "o", "l"] +[188.533385, "o", "d"] +[188.621774, "o", "s"] +[189.632444, "o", "\r\n(gdb) "] +[190.303457, "o", "s"] +[190.479487, "o", "t"] +[190.544727, "o", "e"] +[190.591511, "o", "p"] +[190.703644, "o", "i"] +[191.007796, "o", "\r\n"] +[191.010816, "o", "766\t\tpushl\t%ebx\r\n(gdb) "] +[191.912524, "o", "\r\n"] +[191.915059, "o", "767\t\tpushl\t%edi\r\n(gdb) "] +[192.743936, "o", "\r\n"] +[192.746308, "o", "768\t\tpushl\t%esi\r\n(gdb) "] +[193.511536, "o", "\r\n"] +[193.514218, "o", "774\t\tpushfl\r\n(gdb) "] +[194.415367, "o", "\r\n"] +[194.41946, "o", "\u001b[33m__switch_to_asm\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:777\r\n777\t\tmovl\t%esp, TASK_threadsp(%eax)\r\n"] +[194.419691, "o", "(gdb) "] +[195.220364, "o", "#"] +[195.520461, "o", " "] +[195.799342, "o", "w"] +[195.935095, "o", "e"] +[196.088054, "o", " "] 
+[196.252345, "o", "s"] +[196.373061, "o", "a"] +[196.59163, "o", "v"] +[196.623491, "o", "e"] +[196.823595, "o", "d"] +[197.018563, "o", " "] +[197.143737, "o", "t"] +[197.274616, "o", "h"] +[197.332048, "o", "e"] +[197.472049, "o", " "] +[197.783684, "o", "r"] +[197.831704, "o", "e"] +[198.02048, "o", "g"] +[198.141339, "o", "i"] +[198.192092, "o", "s"] +[198.339912, "o", "t"] +[198.402788, "o", "e"] +[198.520477, "o", "r"] +[198.688044, "o", "s"] +[198.84898, "o", " "] +[199.00533, "o", "s"] +[199.107804, "o", "o"] +[199.279942, "o", " "] +[199.508566, "o", "f"] +[199.671398, "o", "a"] +[199.887722, "o", "r"] +[200.351521, "o", ","] +[200.463301, "o", " "] +[200.655968, "o", "n"] +[200.687803, "o", "e"] +[200.772872, "o", "x"] +[201.05564, "o", "t"] +[201.240283, "o", " "] +[201.524903, "o", "w"] +[201.61563, "o", "e"] +[201.751344, "o", " "] +[202.125757, "o", "s"] +[202.523685, "o", "\b\u001b[K"] +[202.64147, "o", "w"] +[202.727705, "o", "i"] +[202.903547, "o", "l"] +[203.045364, "o", "l"] +[203.156751, "o", " "] +[203.413128, "o", "s"] +[203.596934, "o", "w"] +[203.719746, "o", "i"] +[203.832585, "o", "t"] +[204.068109, "o", "c"] +[204.14267, "o", "h"] +[204.271578, "o", " "] +[204.320065, "o", "t"] +[204.455512, "o", "h"] +[204.523486, "o", "e"] +[204.657363, "o", " "] +[204.759158, "o", "t"] +[205.586833, "o", "\b\u001b[K"] +[205.648223, "o", "s"] +[205.768517, "o", "t"] +[205.840198, "o", "a"] +[206.035781, "o", "c"] +[206.047227, "o", "k"] +[206.687714, "o", "\r\n"] +[206.687766, "o", "(gdb) "] +[207.179955, "o", "s"] +[207.355887, "o", "t"] +[207.428741, "o", "e"] +[207.457028, "o", "p"] +[207.612888, "o", "i"] +[208.064286, "o", "\r\n"] +[208.067075, "o", "778\t\tmovl\tTASK_threadsp(%edx), %esp\r\n(gdb) "] +[209.407591, "o", "\r\n"] +[209.411316, "o", "\u001b[33m__switch_to_asm\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:781\r\n781\t\tmovl\tTASK_stack_canary(%edx), %ebx\r\n"] +[209.411612, "o", "(gdb) "] +[212.902993, "o", "#"] 
+[213.186849, "o", " "] +[213.319001, "o", "w"] +[213.390944, "o", "e"] +[213.535507, "o", " "] +[213.663195, "o", "a"] +[213.799039, "o", "r"] +[213.879251, "o", "e"] +[213.967477, "o", " "] +[214.079533, "o", "d"] +[214.155535, "o", "o"] +[214.332766, "o", "n"] +[214.399683, "o", "e"] +[214.55702, "o", " "] +[214.7656, "o", "s"] +[215.047766, "o", "w"] +[215.228474, "o", "i"] +[215.304639, "o", "t"] +[215.519221, "o", "c"] +[215.604247, "o", "h"] +[215.733286, "o", "i"] +[215.813391, "o", "n"] +[215.8956, "o", "g"] +[216.004892, "o", " "] +[216.079528, "o", "t"] +[216.183413, "o", "h"] +[216.267953, "o", "e"] +[216.399454, "o", " "] +[216.623103, "o", "s"] +[216.821001, "o", "t"] +[216.863454, "o", "a"] +[217.111792, "o", "c"] +[217.195888, "o", "k"] +[217.707786, "o", ","] +[217.787686, "o", " "] +[217.949878, "o", "l"] +[218.023149, "o", "e"] +[218.167388, "o", "t"] +[218.375101, "o", "s"] +[218.503646, "o", " "] +[218.671911, "o", "t"] +[218.735588, "o", "a"] +[218.871225, "o", "k"] +[218.982322, "o", "e"] +[219.068728, "o", " "] +[219.175086, "o", "a"] +[219.296026, "o", " "] +[219.596276, "o", "l"] +[219.771622, "o", "o"] +[219.901863, "o", "o"] +[219.967878, "o", "k"] +[220.087157, "o", " "] +[220.17509, "o", "a"] +[220.347339, "o", "t"] +[220.403464, "o", " "] +[220.519087, "o", "t"] +[220.612148, "o", "h"] +[220.67667, "o", "e"] +[220.772254, "o", " "] +[220.943396, "o", "b"] +[221.031202, "o", "r"] +[221.038285, "o", "a"] +[221.298946, "o", "c"] +[221.420949, "o", "k"] +[221.741402, "o", "\b\u001b[K"] +[221.876722, "o", "\b\u001b[K"] +[221.999662, "o", "\b\u001b[K"] +[222.119543, "o", "\b\u001b[K"] +[222.151268, "o", "a"] +[222.263857, "o", "c"] +[222.343319, "o", "k"] +[222.566421, "o", "t"] +[222.731723, "o", "r"] +[222.806406, "o", "a"] +[222.983386, "o", "c"] +[223.076776, "o", "e"] +[223.447917, "o", "\r\n(gdb) "] +[224.002251, "o", "b"] +[224.055592, "o", "t"] +[224.271867, "o", "\r\n"] +[224.272024, "o", "#0 \u001b[33m__switch_to_asm\u001b[m () at 
\u001b[32march/x86/entry/entry_32.S\u001b[m:781\r\n"] +[224.27223, "o", "#1 \u001b[34m0xc253ba58\u001b[m in \u001b[33m??\u001b[m ()\r\nBacktrace stopped: previous frame inner to this frame (corrupt stack?)\r\n(gdb) "] +[226.94333, "o", "#"] +[227.288215, "o", " "] +[227.551148, "o", "o"] +[227.683628, "o", "o"] +[227.839199, "o", "p"] +[228.980471, "o", "s"] +[229.095151, "o", ","] +[229.197165, "o", " "] +[229.364899, "o", "t"] +[229.447006, "o", "h"] +[229.543446, "o", "e"] +[229.643847, "o", " "] +[229.764396, "o", "d"] +[229.823982, "o", "e"] +[229.922185, "o", "b"] +[229.959231, "o", "u"] +[230.103636, "o", "g"] +[230.50335, "o", "g"] +[230.628726, "o", "e"] +[230.732769, "o", "r"] +[230.877045, "o", " "] +[230.996275, "o", "i"] +[231.107576, "o", "s"] +[231.210359, "o", " "] +[231.399261, "o", "c"] +[231.487397, "o", "o"] +[231.599424, "o", "n"] +[232.09977, "o", "f"] +[232.287115, "o", "u"] +[232.485493, "o", "s"] +[232.676843, "o", "e"] +[232.771224, "o", "d"] +[233.251928, "o", ","] +[233.424736, "o", " "] +[240.103124, "o", "w"] +[240.182868, "o", "e"] +[240.327268, "o", " "] +[240.486874, "o", "a"] +[240.66716, "o", "r"] +[240.729259, "o", "e"] +[240.868472, "o", " "] +[241.096347, "o", "m"] +[241.18101, "o", "i"] +[241.327393, "o", "s"] +[241.485639, "o", "s"] +[241.574748, "o", "i"] +[241.661021, "o", "n"] +[241.743456, "o", "g"] +[241.906569, "o", " "] +[242.12363, "o", "t"] +[242.244647, "o", "h"] +[242.335037, "o", "e"] +[242.496621, "o", " "] +[245.351862, "o", "\b\u001b[K"] +[245.476547, "o", "\b\u001b[K"] +[245.600162, "o", "\b\u001b[K"] +[245.733041, "o", "\b\u001b[K"] +[245.927573, "o", "p"] +[246.013118, "o", "a"] +[246.119622, "o", "r"] +[246.279117, "o", "t"] +[246.484973, "o", "s"] +[246.583587, "o", " "] +[246.695127, "o", "o"] +[246.770543, "o", "f"] +[246.911599, "o", " "] +[246.959314, "o", "t"] +[247.101026, "o", "h"] +[247.164598, "o", "e"] +[247.299367, "o", " "] +[248.091601, "o", "s"] +[248.319596, "o", "t"] +[248.391035, "o", "a"] 
+[248.991394, "o", "c"] +[249.100166, "o", "k"] +[250.031636, "o", " "] +[250.096947, "o", "f"] +[250.283625, "o", "r"] +[250.351461, "o", "a"] +[250.432813, "o", "m"] +[250.535577, "o", "e"] +[251.560407, "o", "\r\n(gdb) "] +[252.27138, "o", "#"] +[252.743479, "o", " "] +[254.959991, "o", "l"] +[254.983878, "o", "e"] +[255.167102, "o", "t"] +[255.355618, "o", "s"] +[255.519673, "o", " "] +[255.657043, "o", "c"] +[255.783342, "o", "o"] +[255.911076, "o", "n"] +[256.091637, "o", "t"] +[256.167365, "o", "i"] +[256.267678, "o", "n"] +[256.373344, "o", "u"] +[256.645139, "o", "e"] +[256.802641, "o", " "] +[256.975427, "o", "u"] +[257.06117, "o", "n"] +[257.231687, "o", "t"] +[257.423098, "o", "i"] +[257.528178, "o", "l"] +[257.874938, "o", " "] +[257.998964, "o", "w"] +[258.090668, "o", "e"] +[258.204414, "o", " "] +[258.39651, "o", "r"] +[258.444584, "o", "e"] +[258.566703, "o", "a"] +[258.759543, "o", "c"] +[258.862693, "o", "h"] +[259.109149, "o", " "] +[269.467858, "o", "a"] +[269.639412, "o", " "] +[269.847362, "o", "p"] +[270.032414, "o", "r"] +[270.096595, "o", "o"] +[270.245178, "o", "p"] +[270.383526, "o", "e"] +[270.448348, "o", "r"] +[270.579818, "o", " "] +[270.691356, "o", "f"] +[270.863693, "o", "u"] +[270.894962, "o", "n"] +[271.019515, "o", "c"] +[271.23123, "o", "t"] +[271.287233, "o", "i"] +[271.343717, "o", "o"] +[271.739795, "o", "n"] +[272.06029, "o", " "] +[272.580422, "o", "\b\u001b[K"] +[272.831445, "o", ","] +[272.951811, "o", " "] +[273.831788, "o", "e"] +[274.015133, "o", "."] +[274.207026, "o", "g"] +[274.284353, "o", "."] +[274.39683, "o", " "] +[275.196925, "o", "_"] +[275.339729, "o", "_"] +[275.831025, "o", "s"] +[276.015813, "o", "w"] +[276.180803, "o", "i"] +[276.331724, "o", "t"] +[276.534941, "o", "c"] +[276.566947, "o", "h"] +[276.864471, "o", "_"] +[277.055747, "o", "t"] +[277.103744, "o", "o"] +[277.879557, "o", "\r\n(gdb) "] +[278.37271, "o", "b"] +[278.477418, "o", "r"] +[278.524454, "o", "e"] +[278.590659, "o", "a"] 
+[278.684679, "o", "k"] +[278.775286, "o", " "] +[279.271091, "o", "_"] +[279.387961, "o", "_"] +[279.591305, "o", "s"] +[279.782144, "o", "w"] +[279.863553, "o", "i"] +[279.975196, "o", "t"] +[280.157072, "o", "c"] +[280.207462, "o", "h"] +[280.498546, "o", "_"] +[280.716367, "o", "t"] +[280.780306, "o", "o"] +[281.903272, "o", "\r\n"] +[281.920624, "o", "Breakpoint 2 at \u001b[34m0xc1020050\u001b[m: file \u001b[32march/x86/kernel/process_32.c\u001b[m, line 159.\r\n(gdb) "] +[285.580302, "o", "c"] +[287.023431, "o", "\r\nContinuing.\r\n"] +[287.025661, "o", "\r\n"] +[287.026259, "o", "Breakpoint 2, \u001b[33m__switch_to\u001b[m (\u001b[36mprev_p\u001b[m=0xc17d02c0 , \u001b[36mnext_p\u001b[m=0xc2530040)\u001b[m\r\n \u001b[m at \u001b[32march/x86/kernel/process_32.c\u001b[m:159\r\n159\t{\r\n"] +[287.026396, "o", "(gdb) "] +[289.067669, "o", "#"] +[289.21566, "o", " "] +[289.407583, "o", "n"] +[289.463508, "o", "o"] +[289.543951, "o", "w"] +[289.72241, "o", " "] +[289.894812, "o", "l"] +[290.019586, "o", "e"] +[290.175723, "o", "t"] +[290.42348, "o", "s"] +[290.636596, "o", " "] +[291.733293, "o", "t"] +[291.80722, "o", "a"] +[291.931757, "o", "k"] +[292.036902, "o", "e"] +[292.123167, "o", " "] +[292.211687, "o", "a"] +[292.316232, "o", " "] +[292.480027, "o", "l"] +[292.660474, "o", "o"] +[292.811518, "o", "o"] +[292.919126, "o", "k"] +[293.143171, "o", " "] +[293.27092, "o", "a"] +[293.503276, "o", "t"] +[293.610102, "o", " "] +[293.723124, "o", "t"] +[293.867511, "o", "h"] +[293.950794, "o", "e"] +[294.092621, "o", " "] +[294.354438, "o", "b"] +[294.80766, "o", "a"] +[294.983003, "o", "c"] +[295.43947, "o", "k"] +[295.612526, "o", "t"] +[295.771893, "o", "r"] +[295.860273, "o", "a"] +[296.024455, "o", "c"] +[296.080174, "o", "e"] +[296.207233, "o", " "] +[296.260988, "o", "a"] +[296.491858, "o", "g"] +[296.557449, "o", "a"] +[296.659441, "o", "i"] +[296.727329, "o", "n"] +[297.063689, "o", "\r\n(gdb) "] +[297.45931, "o", "b"] +[297.535189, "o", "t"] +[297.919758, 
"o", "\r\n"] +[297.920155, "o", "#0 \u001b[33m__switch_to\u001b[m (\u001b[36mprev_p\u001b[m=0xc17d02c0 , \u001b[36mnext_p\u001b[m=0xc2530040)\u001b[m\r\n \u001b[m at \u001b[32march/x86/kernel/process_32.c\u001b[m:159\r\n"] +[297.920545, "o", "#1 \u001b[34m0xc15d8277\u001b[m in \u001b[33mcontext_switch\u001b[m (\u001b[36mrf\u001b[m=0xc253ba3c, \u001b[36mnext\u001b[m=, \u001b[m\r\n"] +[297.920652, "o", " \u001b[m\u001b[36mprev\u001b[m=0xc2530040, \u001b[36mrq\u001b[m=0xcfdcb700) at \u001b[32mkernel/sched/core.c\u001b[m:3779\r\n"] +[297.920965, "o", "#2 \u001b[33m__schedule\u001b[m (\u001b[36mpreempt\u001b[m=, \u001b[36mpreempt@entry\u001b[m=false)\u001b[m\r\n \u001b[m at \u001b[32mkernel/sched/core.c\u001b[m:4528\r\n#3 \u001b[34m0xc15d86ce\u001b[m in \u001b[33mschedule\u001b[m () at \u001b[32mkernel/sched/core.c\u001b[m:4606\r\n"] +[297.927061, "o", "#4 \u001b[34m0xc15ddb66\u001b[m in \u001b[33mschedule_hrtimeout_range_clock\u001b[m (\u001b[36mexpires\u001b[m=, \u001b[m\r\n"] +[297.927149, "o", " \u001b[m\u001b[36mdelta\u001b[m=, \u001b[36mmode\u001b[m=HRTIMER_MODE_ABS, \u001b[36mclock_id\u001b[m=1)\u001b[m\r\n \u001b[m at \u001b[32mkernel/time/hrtimer.c\u001b[m:2139\r\n"] +[297.935662, "o", "#5 \u001b[34m0xc15ddc3b\u001b[m in \u001b[33mschedule_hrtimeout_range\u001b[m (\u001b[36mexpires=expires@entry\u001b[m=0xc253bb7c, \u001b[m\r\n \u001b[m\u001b[36mdelta\u001b[m=, \u001b[36mmode=mode@entry\u001b[m=HRTIMER_MODE_ABS)\u001b[m\r\n \u001b[m at \u001b[32mkernel/time/hrtimer.c\u001b[m:2184\r\n"] +[297.936228, "o", "#6 \u001b[34m0xc11c037f\u001b[m in \u001b[33mpoll_schedule_timeout\u001b[m (\u001b[36mpwq=pwq@entry\u001b[m=0xc253bb84, \u001b[m\r\n"] +[297.936671, "o", " \u001b[m\u001b[36mexpires=expires@entry\u001b[m=0xc253bb7c, \u001b[36mslack=slack@entry\u001b[m=4999986, \u001b[36mstate\u001b[m=1)\u001b[m\r\n"] +[297.936825, "o", " \u001b[m at \u001b[32mfs/select.c\u001b[m:243\r\n"] +[297.937767, "o", "#7 \u001b[34m0xc11c0baf\u001b[m in \u001b[33mdo_select\u001b[m 
(\u001b[36mn\u001b[m=, \u001b[36mn@entry\u001b[m=11, \u001b[m\r\n"] +[297.938173, "o", " \u001b[m\u001b[36mfds=fds@entry\u001b[m=0xc253be20, \u001b[36mend_time=end_time@entry\u001b[m=0xc253bf70)\u001b[m\r\n \u001b[m at \u001b[32mfs/select.c\u001b[m:603\r\n"] +[297.938674, "o", "#8 \u001b[34m0xc11c1985\u001b[m in \u001b[33mcore_sys_select\u001b[m (\u001b[36mn\u001b[m=, \u001b[36mn@entry\u001b[m=11, \u001b[m\r\n"] +[297.939277, "o", " \u001b[m\u001b[36minp=inp@entry\u001b[m=0xbf984f00, \u001b[36moutp=outp@entry\u001b[m=0x0, \u001b[36mexp\u001b[m=, \u001b[m\r\n"] +[297.939469, "o", " \u001b[m\u001b[36mexp@entry\u001b[m=0x0, \u001b[36mend_time\u001b[m=) at \u001b[32mfs/select.c\u001b[m:677\r\n"] +[297.940066, "o", "#9 \u001b[34m0xc11c1f64\u001b[m in \u001b[33mkern_select\u001b[m (\u001b[36mn\u001b[m=11, \u001b[36minp\u001b[m=0xbf984f00, \u001b[36moutp\u001b[m=0x0, \u001b[m\r\n"] +[297.94015, "o", "\u001b[m--Type for more, q to quit, c to continue without paging--"] +[299.775688, "o", "\r\n"] +[299.776122, "o", " \u001b[m\u001b[36mexp=exp@entry\u001b[m=0x0, \u001b[36mtvp\u001b[m=0xbf984df0) at \u001b[32mfs/select.c\u001b[m:718\r\n#10 \u001b[34m0xc11c1fe1\u001b[m in \u001b[33m__do_sys_select\u001b[m (\u001b[36mtvp\u001b[m=, \u001b[36mexp\u001b[m=, \u001b[m\r\n \u001b[m\u001b[36moutp\u001b[m=, \u001b[36minp\u001b[m=, \u001b[36mn\u001b[m=)\u001b[m\r\n \u001b[m at \u001b[32mfs/select.c\u001b[m:725\r\n"] +[299.77624, "o", "#11 \u001b[33m__se_sys_select\u001b[m (\u001b[36mtvp\u001b[m=, \u001b[36mexp\u001b[m=, \u001b[m\r\n \u001b[m\u001b[36moutp\u001b[m=, \u001b[36minp\u001b[m=, \u001b[36mn\u001b[m=)\u001b[m\r\n \u001b[m at \u001b[32mfs/select.c\u001b[m:722\r\n"] +[299.780969, "o", "#12 \u001b[33m__ia32_sys_select\u001b[m (\u001b[36mregs\u001b[m=) at \u001b[32mfs/select.c\u001b[m:722\r\n"] +[299.781337, "o", "#13 \u001b[34m0xc15d29cc\u001b[m in \u001b[33mdo_syscall_32_irqs_on\u001b[m (\u001b[36mnr\u001b[m=, \u001b[36mregs\u001b[m=0xc253bfb4)\u001b[m\r\n \u001b[m at 
\u001b[32march/x86/entry/common.c\u001b[m:77\r\n"] +[299.782024, "o", "#14 \u001b[33mdo_int80_syscall_32\u001b[m (\u001b[36mregs\u001b[m=0xc253bfb4) at \u001b[32march/x86/entry/common.c\u001b[m:94\r\n#15 \u001b[34m0xc15dfaeb\u001b[m in \u001b[33mentry_INT80_32\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1059\r\n"] +[299.782699, "o", "#16 \u001b[34m0x0000000b\u001b[m in \u001b[33m??\u001b[m ()\r\n"] +[299.783203, "o", "#17 \u001b[34m0xbf984f00\u001b[m in \u001b[33m??\u001b[m ()\r\nBacktrace stopped: previous frame inner to this frame (corrupt stack?)\r\n(gdb) "] +[303.942905, "o", "#"] +[304.103437, "o", " "] +[304.274144, "o", "o"] +[304.358619, "o", "k"] +[304.659941, "o", ","] +[304.766807, "o", " "] +[305.28809, "o", "t"] +[305.375297, "o", "h"] +[305.431933, "o", "i"] +[305.540524, "o", "s"] +[305.655296, "o", " "] +[305.901221, "o", "l"] +[306.115461, "o", "o"] +[306.246553, "o", "o"] +[306.32009, "o", "k"] +[306.460058, "o", "s"] +[306.542589, "o", " "] +[306.694869, "o", "l"] +[306.907465, "o", "i"] +[307.075817, "o", "k"] +[307.179414, "o", "e"] +[307.295136, "o", " "] +[307.54844, "o", "t"] +[307.663017, "o", "h"] +[307.815168, "o", "e"] +[308.156909, "o", " "] +[308.39174, "o", "i"] +[308.532433, "o", "n"] +[308.595757, "o", "i"] +[310.686866, "o", "t"] +[310.8524, "o", " "] +[310.953083, "o", "t"] +[311.220312, "o", "h"] +[311.340814, "o", "r"] +[311.470856, "o", "e"] +[311.548828, "o", "a"] +[311.734896, "o", "d"] +[313.767614, "o", "\r\n"] +[313.767976, "o", "(gdb) "] +[316.999241, "o", "#"] +[317.296435, "o", " "] +[317.548667, "o", "s"] +[317.607326, "o", "o"] +[317.763134, "o", " "] +[317.887136, "o", "t"] +[317.999336, "o", "h"] +[318.095442, "o", "e"] +[318.188349, "o", " "] +[318.387557, "o", "c"] +[318.475079, "o", "o"] +[318.700291, "o", "n"] +[318.776111, "o", "t"] +[318.855308, "o", "e"] +[319.046835, "o", "x"] +[319.271115, "o", "t"] +[319.381068, "o", " "] +[319.461433, "o", "s"] +[319.631028, "o", "w"] +[319.748625, "o", "i"] 
+[319.846916, "o", "t"] +[320.079731, "o", "c"] +[320.134923, "o", "h"] +[320.268, "o", " "] +[320.830957, "o", "h"] +[320.99934, "o", "a"] +[321.10033, "o", "s"] +[322.903464, "o", "\b\u001b[K"] +[323.407499, "o", "\b\u001b[K"] +[323.451112, "o", "\b\u001b[K"] +[323.482209, "o", "\b\u001b[K"] +[323.513329, "o", "\b\u001b[K"] +[326.052876, "o", "h"] +[326.718835, "o", " "] +[326.942968, "o", "0"] +[327.283417, "o", " "] +[327.524209, "o", "\b\u001b[K"] +[327.654149, "o", "\b\u001b[K"] +[328.142643, "o", "-"] +[328.332367, "o", " "] +[330.483798, "o", "o"] +[330.650327, "o", "r"] +[330.732245, "o", " "] +[330.873854, "o", "p"] +[330.918697, "o", "a"] +[331.030843, "o", "r"] +[331.223103, "o", "t"] +[331.335218, "o", " "] +[331.471149, "o", "o"] +[331.56318, "o", "f"] +[331.699593, "o", " "] +[332.264753, "o", "i"] +[332.407088, "o", "t"] +[332.812649, "o", ","] +[332.97567, "o", " "] +[333.070572, "o", "t"] +[333.252959, "o", "h"] +[333.375206, "o", "a"] +[333.596096, "o", "t"] +[333.942887, "o", " "] +[334.676468, "o", "k"] +[334.807218, "o", "e"] +[334.839089, "o", "r"] +[335.599008, "o", "\b\u001b[K"] +[336.056624, "o", "\b\u001b[K"] +[336.200204, "o", "\b\u001b[K"] +[337.431057, "o", "k"] +[337.554329, "o", "e"] +[337.579172, "o", "r"] +[337.725617, "o", "n"] +[338.559416, "o", "e"] +[338.622963, "o", "l"] +[338.726658, "o", " "] +[338.868311, "o", "s"] +[339.014698, "o", "t"] +[339.079695, "o", "a"] +[339.31094, "o", "c"] +[339.413228, "o", "k"] +[339.526949, "o", " "] +[340.127059, "o", "s"] +[340.291598, "o", "w"] +[340.402142, "o", "i"] +[340.49965, "o", "t"] +[340.729408, "o", "c"] +[340.783126, "o", "h"] +[341.207083, "o", ","] +[341.469127, "o", " "] +[342.923481, "o", "\b\u001b[K"] +[343.071348, "o", "\b\u001b[K"] +[343.39639, "o", " "] +[344.339672, "o", "i"] +[344.620158, "o", "s"] +[344.74322, "o", " "] +[344.874119, "o", "d"] +[344.950506, "o", "o"] +[345.047877, "o", "n"] +[345.113522, "o", "e"] +[345.694122, "o", "\r\n"] +[345.694186, "o", "(gdb) 
"] +[349.573599, "o", "quit\r\n"] +[349.574221, "o", "A debugging session is active.\r\n\r\n\tInferior 1 [process 1] will be detached.\r\n\r\nQuit anyway? (y or n) "] +[350.532606, "o", "y"] +[350.638505, "o", "\r\nDetaching from program: /linux/vmlinux, process 1\r\n"] +[350.639382, "o", "Ending remote debugging.\r\n[Inferior 1 (process 1) detached]\r\n"] +[350.646756, "o", "$ "] +[352.121258, "o", "\r\n"] diff --git a/refs/pull/405/merge/_images/ditaa-019489e686a2f60f1594e37458cfcb10320eae0f.ditaa b/refs/pull/405/merge/_images/ditaa-019489e686a2f60f1594e37458cfcb10320eae0f.ditaa new file mode 100644 index 00000000..30800dd9 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-019489e686a2f60f1594e37458cfcb10320eae0f.ditaa @@ -0,0 +1,17 @@ + CPU0 + +------------+ task_struct + | ... | +--------> +-----------------------+ + +------------- | | Thread Group ID (PID) | ++--| FS | | +-----------------------+ +| +------------- | | Thread ID (TID) | +| | ... | | +-----------------------+ +| +------------+ | | ... | +| | +-----------------------+ +| Per CPU variables | | Opened files | ++->+-----------------------+ | +-----------------------+ + | ... | | | Address Space | + +-----------------------+ | +-----------------------+ + | current_task |------+ | ... | + +-----------------------+ +-----------------------+ + | ... 
| + +-----------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-019489e686a2f60f1594e37458cfcb10320eae0f.png b/refs/pull/405/merge/_images/ditaa-019489e686a2f60f1594e37458cfcb10320eae0f.png new file mode 100644 index 00000000..a85660c4 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-019489e686a2f60f1594e37458cfcb10320eae0f.png differ diff --git a/refs/pull/405/merge/_images/ditaa-08aff771b3ff7a5525df7b0c090e28c836502788.ditaa b/refs/pull/405/merge/_images/ditaa-08aff771b3ff7a5525df7b0c090e28c836502788.ditaa new file mode 100644 index 00000000..30f65b31 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-08aff771b3ff7a5525df7b0c090e28c836502788.ditaa @@ -0,0 +1,24 @@ + +-----------+ + | | + +------------------->| Memory |<------------------+ + | | | | + | +-----------+ | + | ^ | + | | | + v v v ++---------------+ +---------------+ +---------------+ +| | | | | | +| Processor A | | Processor B | | Processor C | +| | | | | | +| +-----------+ | | +-----------+ | | +-----------+ | +| | Process 1 | | | | Process 1 | | | | Process 1 | | +| +-----------+ | | +-----------+ | | +-----------+ | +| | | | | | +| +-----------+ | | +-----------+ | | +-----------+ | +| | Process 2 | | | | Process 2 | | | | Process 2 | | +| +-----------+ | | +-----------+ | | +-----------+ | +| | | | | | +| +-----------+ | | +-----------+ | | +-----------+ | +| | kernel | | | | kernel | | | | kernel | | +| +-----------+ | | +-----------+ | | +-----------+ | ++---------------+ +---------------+ +---------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-08aff771b3ff7a5525df7b0c090e28c836502788.png b/refs/pull/405/merge/_images/ditaa-08aff771b3ff7a5525df7b0c090e28c836502788.png new file mode 100644 index 00000000..3e8d5ccb Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-08aff771b3ff7a5525df7b0c090e28c836502788.png differ diff --git 
a/refs/pull/405/merge/_images/ditaa-0a96997f269a7a9cd0cdc9c9125f6e62e549be94.ditaa b/refs/pull/405/merge/_images/ditaa-0a96997f269a7a9cd0cdc9c9125f6e62e549be94.ditaa new file mode 100644 index 00000000..8be6db27 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-0a96997f269a7a9cd0cdc9c9125f6e62e549be94.ditaa @@ -0,0 +1,28 @@ ++---------------------------------+ +| Virtual Filesystem Switch | ++---------------------------------+ + ^ + | + v ++---------------------------------+ +| Device Mapper | ++---------------------------------+ + ^ + | + v ++---------------------------------+ +| Generic Block Layer | ++---------------------------------+ + ^ + | + v ++--------------------------------+ +| I/O scheduler | ++--------------------------------+ + ^ ^ + | | + v v ++--------------+ +--------------+ +| Block device | | Block device | +| driver | | driver | ++--------------+ +--------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-0a96997f269a7a9cd0cdc9c9125f6e62e549be94.png b/refs/pull/405/merge/_images/ditaa-0a96997f269a7a9cd0cdc9c9125f6e62e549be94.png new file mode 100644 index 00000000..c5d34dff Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-0a96997f269a7a9cd0cdc9c9125f6e62e549be94.png differ diff --git a/refs/pull/405/merge/_images/ditaa-0b8cde2be9bbd195ac9dcaeac978a8bbe0d3b805.ditaa b/refs/pull/405/merge/_images/ditaa-0b8cde2be9bbd195ac9dcaeac978a8bbe0d3b805.ditaa new file mode 100644 index 00000000..e0954cde --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-0b8cde2be9bbd195ac9dcaeac978a8bbe0d3b805.ditaa @@ -0,0 +1,27 @@ + preemption + +------------------------------+ + | | + V | + +------------+ +--------------+ +-------------+ + clone() | | schedule() | | exit() | | +-----------> | TASK_READY |-------------->| TASK_RUNNING |---------------->| TASK_DEAD | + | | | |--------+ | TASK_ZOMBIE | + +------------+ +--------------+ | | | + ^ | +-------------+ + | | + | | + | | + | signal +----------------------+ | + 
+-----------| | | + | | | wait_event() | + | wake_up() | TASK_INTERRUPTIBLE |<--------------+ + +-----------| | | + | | | | + | +----------------------+ | + | | + | | + | +----------------------+ | + | | | wait_event() | + | wake_up() | TASK_UNINTERRUPTIBLE |<--------------+ + +-----------| | + +----------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-0b8cde2be9bbd195ac9dcaeac978a8bbe0d3b805.png b/refs/pull/405/merge/_images/ditaa-0b8cde2be9bbd195ac9dcaeac978a8bbe0d3b805.png new file mode 100644 index 00000000..72a6d96d Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-0b8cde2be9bbd195ac9dcaeac978a8bbe0d3b805.png differ diff --git a/refs/pull/405/merge/_images/ditaa-0eda95a3f39dfac448fd07589656b123d3548328.ditaa b/refs/pull/405/merge/_images/ditaa-0eda95a3f39dfac448fd07589656b123d3548328.ditaa new file mode 100644 index 00000000..3c096d7f --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-0eda95a3f39dfac448fd07589656b123d3548328.ditaa @@ -0,0 +1,28 @@ + +------------------+ +------------+ + | Address space | | |-------------->+------------+ + | descriptor | +------------+ | | + +------------------+ | | Page +------------+ + | +------------+ tables | | + +------------------+--------------+ | ... | +------------+ + | | +------------+ | ... | + v v | |-------+ +------------+ + +------------+ +------------+ +------------+ | | | + | Area | | Area | | +------------+ + | descriptor | | descriptor | | + +------------+ +------------+ | + | | + +-------------+------------------+ +------>+------------+ + | | | | + v v +------------+ + +------------+ +------------+ | | + | Area | | Area | +------------+ + | descriptor | | descriptor | | ... 
| + +------------+ +------------+ +------------+ + | | | + +-----------+-----------+ +------------+ + | | + v v + +------------+ +------------+ + | Area | | Area | + | descriptor | | descriptor | + +------------+ +------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-0eda95a3f39dfac448fd07589656b123d3548328.png b/refs/pull/405/merge/_images/ditaa-0eda95a3f39dfac448fd07589656b123d3548328.png new file mode 100644 index 00000000..0405e091 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-0eda95a3f39dfac448fd07589656b123d3548328.png differ diff --git a/refs/pull/405/merge/_images/ditaa-29f54aaa1a85b819ff29cb7d101a4d646b3b0b06.ditaa b/refs/pull/405/merge/_images/ditaa-29f54aaa1a85b819ff29cb7d101a4d646b3b0b06.ditaa new file mode 100644 index 00000000..3188faa7 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-29f54aaa1a85b819ff29cb7d101a4d646b3b0b06.ditaa @@ -0,0 +1,23 @@ + file + descriptor + table + +------------+ +--------+ +--------+ +---------+ + | |------+--->| FILE |------->| dentry |------->| inode | + +------------+ | +--------+ +--------+ ^ +---------+ + +-> | |------+ dup | | type | + | +------------+ hard link | | perm | + | | ... | | | .... 
| + | +------------+ +--------+ +--------+ | +---------+ + | | |---------->| FILE |------->| dentry |---+ | + | +------------+ +--------+ +--------+ | + fd | + | + +------+ <-------------------+ + | data | + +------+ + +------+ +------+ + | data | | data | + +------+ +------+ + +------+ + | data | + +------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-29f54aaa1a85b819ff29cb7d101a4d646b3b0b06.png b/refs/pull/405/merge/_images/ditaa-29f54aaa1a85b819ff29cb7d101a4d646b3b0b06.png new file mode 100644 index 00000000..db471127 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-29f54aaa1a85b819ff29cb7d101a4d646b3b0b06.png differ diff --git a/refs/pull/405/merge/_images/ditaa-2cb0eb0056bb775d1446843d62241fd660662c96.ditaa b/refs/pull/405/merge/_images/ditaa-2cb0eb0056bb775d1446843d62241fd660662c96.ditaa new file mode 100644 index 00000000..2caaba65 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-2cb0eb0056bb775d1446843d62241fd660662c96.ditaa @@ -0,0 +1,24 @@ ++---------------------+ +---------------------+ +---------------------+ +| Guest OS | | Guest OS | | Guest OS | +| +---------------+ | | +---------------+ | | +---------------+ | +| | Guest Driver | | | | Guest Driver | | | | Guest Driver | | +| +---------------+ | | +---------------+ | | +---------------+ | +| | ^ | | | ^ | | | ^ | +| | | | | | | | | | | | ++----+-----------+----+ +----+-----------+----+ +----+-----------+----+ + | traped | | mapped | | mapped | interrupt + | access | | access | | access | posting + +---+-----------+----+ +---+-----------+-----+ +---+-----------+-----+ + | | VMM | | | | VMM | | | | VMM | | + | v | | | | | | | | | | + | +----------------+ | | | +---------+ | | | | | + | | Virtual Device | | | | | IRQ | | | | | | + | +----------------+ | | | | Mapping | | | | | | + | | ^ | | | +---------+ | | | | | + | | | | | | | | | | | | + +--+------------+----+ +---+-----------+-----+ +---+-----------+-----+ + | | | | | | + v | v | v | + 
+-----------------+ +-----------------+ +-----------------+ + | Physical Device | | Physical Device | | Physical Device | + +-----------------+ +-----------------+ +-----------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-2cb0eb0056bb775d1446843d62241fd660662c96.png b/refs/pull/405/merge/_images/ditaa-2cb0eb0056bb775d1446843d62241fd660662c96.png new file mode 100644 index 00000000..71a91ff0 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-2cb0eb0056bb775d1446843d62241fd660662c96.png differ diff --git a/refs/pull/405/merge/_images/ditaa-2e49ca6ac606dab4b2b53231cfbe85ff06312d36.ditaa b/refs/pull/405/merge/_images/ditaa-2e49ca6ac606dab4b2b53231cfbe85ff06312d36.ditaa new file mode 100644 index 00000000..9f22ad05 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-2e49ca6ac606dab4b2b53231cfbe85ff06312d36.ditaa @@ -0,0 +1,12 @@ + + ^ + ^ + | | | | + | Syscall | IRQi| | + User Mode | Exception (e.g. page fault) | | | + | | | | + +------------------------------------+-----+-----------------+ + | iret| | iret^ IRQj| iret| + | | | | | | +Kernel Mode v-------+ ^-------+ ^------+ v-----+ v-----+ + | | | | + IRQi| iret| IRQj| iret| + v------+ v------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-2e49ca6ac606dab4b2b53231cfbe85ff06312d36.png b/refs/pull/405/merge/_images/ditaa-2e49ca6ac606dab4b2b53231cfbe85ff06312d36.png new file mode 100644 index 00000000..896f09b9 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-2e49ca6ac606dab4b2b53231cfbe85ff06312d36.png differ diff --git a/refs/pull/405/merge/_images/ditaa-35f7597b35b83bb0025ac2a5f158c9eae23050c8.ditaa b/refs/pull/405/merge/_images/ditaa-35f7597b35b83bb0025ac2a5f158c9eae23050c8.ditaa new file mode 100644 index 00000000..a217d3f7 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-35f7597b35b83bb0025ac2a5f158c9eae23050c8.ditaa @@ -0,0 +1,31 @@ + counter is 2 + + Thread A Thread B + + * + | + | + +---------------------+ + | dec 
counter | counter is 1 + | cEEE | + +---------------------+ + | + | B preempts A + +-----------------------------------------------+ + | + v + +----------------------+ + counter is 0 | dec counter | + | if (!counter) | + resource is freed | free_resource(); | + | cEEE | + +----------------------+ + B finishes, A continues | + +-----------------------------------------------+ + | + v ++----------------------+ +| if (!counter) | +| free_resource(); | resource is freed +| cEEE | ++----------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-35f7597b35b83bb0025ac2a5f158c9eae23050c8.png b/refs/pull/405/merge/_images/ditaa-35f7597b35b83bb0025ac2a5f158c9eae23050c8.png new file mode 100644 index 00000000..81caa372 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-35f7597b35b83bb0025ac2a5f158c9eae23050c8.png differ diff --git a/refs/pull/405/merge/_images/ditaa-3901edd823cdc7a6f429ebc37cbc541e650abc96.ditaa b/refs/pull/405/merge/_images/ditaa-3901edd823cdc7a6f429ebc37cbc541e650abc96.ditaa new file mode 100644 index 00000000..1e6bbfb9 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-3901edd823cdc7a6f429ebc37cbc541e650abc96.ditaa @@ -0,0 +1,24 @@ ++---------------------+ +---------------------+ +| Guest OS | | Guest OS | +| +---------------+ | | +---------------+ | +| | Guest Driver | | | | Guest Driver | | +| +---------------+ | | +---------------+ | +| | ^ | | | ^ | +| | | | | | | | ++----+-----------+----+ +----+-----------+----+ + | traped | | mapped | + | access | | access | + +---+-----------+----+ +---+-----------+-----+ But how do we deal with DMA? 
+ | | VMM | | | | VMM | | + | v | | | | | | + | +----------------+ | | | +---------+ | + | | Virtual Device | | | | | IRQ | | + | +----------------+ | | | | Mapping | | + | | ^ | | | +---------+ | + | | | | | | | | + +--+------------+----+ +---+-----------+-----+ + | | | | + v | v | + +-----------------+ +-----------------+ + | Physical Device | | Physical Device | + +-----------------+ +-----------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-3901edd823cdc7a6f429ebc37cbc541e650abc96.png b/refs/pull/405/merge/_images/ditaa-3901edd823cdc7a6f429ebc37cbc541e650abc96.png new file mode 100644 index 00000000..12b043a0 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-3901edd823cdc7a6f429ebc37cbc541e650abc96.png differ diff --git a/refs/pull/405/merge/_images/ditaa-3985c420def8f30934a72ea8c738a00ed629c298.ditaa b/refs/pull/405/merge/_images/ditaa-3985c420def8f30934a72ea8c738a00ed629c298.ditaa new file mode 100644 index 00000000..08d3fc02 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-3985c420def8f30934a72ea8c738a00ed629c298.ditaa @@ -0,0 +1,21 @@ +: : : : +| User space | Lowmem | Highmem | +| arbitrary mapping | linear mapping | arbitrary mapping | +| | | | ++----+----+--------------------+----+------+----+----+---------------+----+----+-----+----+----+ Virtual +|cEEE|cGRE|cEEE |cRED|cEEE |cAAA|cGRE| cAAA |cEEE|cGRE|cEEE |cRED|cEEE| memory +| | | | | | | | | | | | | | ++----+----+--------------------+----+------+----+----+---------------+----+----+-----+----+----+ + | | 3G | 3.896G | | 4G + | +-------+ | | | + | | | | | + |<----------------------------------+------+<-------------------------+ | + | | | + | |<-------------------------------------------+ + | | + v V ++----+----+---------------+--------------+----+------------------------------------------------+ Physical +|cAAA|cGRE| cAAA | cEEE |cRED| cEEE | memory +| | | | | | | 
++----+----+---------------+--------------+----+------------------------------------------------+ + 896MB \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-3985c420def8f30934a72ea8c738a00ed629c298.png b/refs/pull/405/merge/_images/ditaa-3985c420def8f30934a72ea8c738a00ed629c298.png new file mode 100644 index 00000000..30196205 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-3985c420def8f30934a72ea8c738a00ed629c298.png differ diff --git a/refs/pull/405/merge/_images/ditaa-3dc899167df5e16a230c434cf5d6964cb5868482.ditaa b/refs/pull/405/merge/_images/ditaa-3dc899167df5e16a230c434cf5d6964cb5868482.ditaa new file mode 100644 index 00000000..8ad85a85 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-3dc899167df5e16a230c434cf5d6964cb5868482.ditaa @@ -0,0 +1,31 @@ ++-----+ +-----+ +-----+ +| App | | App | | App | ++-----+ +-----+ +-----+ + | | | User +=--|-------=--------|--------=-------|-------------------=- + | | | Kernel + v v v ++--------------------------------------------------------+ +| System Call Interface | ++--------------------------------------------------------+ + | | + v v + +-----+ +-----+ + | |<---------------------------->| | Kernel + | |<---+ +------->| | functions + +--+--+ | | +-----+ + | | | ^ + | | +-----+ | | + |+------+---->| |<---+ | + || | +-----+ | + || | | + vv | v + +--++-+ | +-----+ + | | +------------------------>| | Device + | |<---------------------------->| | Drivers + +--+--+ +--+--+ + | | + v v ++--------------------------------------------------------+ +| Hardware | ++--------------------------------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-3dc899167df5e16a230c434cf5d6964cb5868482.png b/refs/pull/405/merge/_images/ditaa-3dc899167df5e16a230c434cf5d6964cb5868482.png new file mode 100644 index 00000000..313367d3 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-3dc899167df5e16a230c434cf5d6964cb5868482.png differ diff --git 
a/refs/pull/405/merge/_images/ditaa-48374873962ca32ada36c14ab9a83b60f112a1e0.ditaa b/refs/pull/405/merge/_images/ditaa-48374873962ca32ada36c14ab9a83b60f112a1e0.ditaa new file mode 100644 index 00000000..fdbf04f5 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-48374873962ca32ada36c14ab9a83b60f112a1e0.ditaa @@ -0,0 +1,18 @@ ++---------------+ +--------------+ +---------------+ -\ +| Application 1 | | Application2 | ... | Application n | | ++---------------+ +--------------+ +---------------+ |> User space + | | | | + v v v -/ ++--------------------------------------------------------+ -\ +| System Call Interface | | ++--------------------------------------------------------+ | + | | | | + v v v |> Kernel space ++--------------------------------------------------------+ | +| Kernel | | ++--------------------------------------------------------+ | +| Device drivers | | ++--------------------------------------------------------+ -/ + | | | -\ + v v v |> Hardware + -/ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-48374873962ca32ada36c14ab9a83b60f112a1e0.png b/refs/pull/405/merge/_images/ditaa-48374873962ca32ada36c14ab9a83b60f112a1e0.png new file mode 100644 index 00000000..1dd08db8 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-48374873962ca32ada36c14ab9a83b60f112a1e0.png differ diff --git a/refs/pull/405/merge/_images/ditaa-4b5c1874d3924d9716f26d4893a3e4f313bf1c43.ditaa b/refs/pull/405/merge/_images/ditaa-4b5c1874d3924d9716f26d4893a3e4f313bf1c43.ditaa new file mode 100644 index 00000000..36a4bbe5 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-4b5c1874d3924d9716f26d4893a3e4f313bf1c43.ditaa @@ -0,0 +1,29 @@ + EPROCESS + +------------------+ ++->| KPROCESS | +| +------------------+ +| | Process ID (PID) | +| +------------------+ +| | ... 
| +| +------------------+ +| | Thread list |--------------+------------------------------------+ +| +------------------+ | | +| | Opened files | ETHREAD V ETHREAD V +| | +--------------+ | +| | | FILE | | +| | +--------------+ | +| | | ... | | +| | +--------------+ | +| +------------------+ +-----------------------+ +-----------------------+ +| | Address Space | | KTHREAD | | KTHREAD | +| + +--------------+ | +-----------------------+ +-----------------------+ +| | | ... | | | Thread ID (TID) | | Thread ID (TID) | +| | +--------------+ | +-----------------------+ +-----------------------+ +| +------------------+ | Thread Start Address | | Thread Start Address | +| +-----------------------+ +-----------------------+ +| | ... | ... | ... | +| +-----------------------+ +-----------------------+ +| | Process | | Process | +| +-----------------------+ +-----------------------+ +| | | ++---------------------------------------+------------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-4b5c1874d3924d9716f26d4893a3e4f313bf1c43.png b/refs/pull/405/merge/_images/ditaa-4b5c1874d3924d9716f26d4893a3e4f313bf1c43.png new file mode 100644 index 00000000..ff901ecf Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-4b5c1874d3924d9716f26d4893a3e4f313bf1c43.png differ diff --git a/refs/pull/405/merge/_images/ditaa-4d63c157487ff8291f2a6e93fe680ec38c1a3212.ditaa b/refs/pull/405/merge/_images/ditaa-4d63c157487ff8291f2a6e93fe680ec38c1a3212.ditaa new file mode 100644 index 00000000..028864ea --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-4d63c157487ff8291f2a6e93fe680ec38c1a3212.ditaa @@ -0,0 +1,15 @@ + +-------+ +-------+ + | CPU 0 | | CPU 1 | + +-------+ +-------+ + cache cache + +-------+ +-------+ +A | 1 | | 1 | A + +-------+ +-------+ +B | 2 | | 2 | B + +-------+ +-------+ + memory + +-----------------------------+ +A | 1 | + +-----------------------------+ +B | 2 | + +-----------------------------+ \ No newline at 
end of file diff --git a/refs/pull/405/merge/_images/ditaa-4d63c157487ff8291f2a6e93fe680ec38c1a3212.png b/refs/pull/405/merge/_images/ditaa-4d63c157487ff8291f2a6e93fe680ec38c1a3212.png new file mode 100644 index 00000000..965e1135 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-4d63c157487ff8291f2a6e93fe680ec38c1a3212.png differ diff --git a/refs/pull/405/merge/_images/ditaa-4e1f9758808dba9e61bc0e48faf4365d377f9d32.ditaa b/refs/pull/405/merge/_images/ditaa-4e1f9758808dba9e61bc0e48faf4365d377f9d32.ditaa new file mode 100644 index 00000000..c7527976 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-4e1f9758808dba9e61bc0e48faf4365d377f9d32.ditaa @@ -0,0 +1,33 @@ + + + Kernel space | User space + | + | ++-------------+ +-------------+ +---------------+ | +--------+ +| | | | | | | | | +| my_device | | my_driver | | my_bus_type | | | udev | +| | | | | | | | | ++-----+-------+ +------+------+ +-------+-------+ | +---+----+ + | | | | | + : : : | : + | | 1.my_register_driver() | 2.call_usermodehelper() | + | +-+------------------------->+-+------------------------->+-+ + | | | | | | | | + | | | | | | | | + | | | | | | | | + | 3.my_uevent() | | | | 4.call_usermodehelper() | | + +++-------------------------| |--------------------------> +------------------------->| | + | | | | | | | | | + | | | | 6.my_probe() | | 5.my_match() | | + | | | |<=------------------------| |<=------------------------| | + | | | | | | | | | + | | | | | | | | | + | | | | | | | | | + | | 7.my_remove() | | 8.my_uevent() | | 9.call_usermodehelper() | | +---------------------------+ + +-+------------------------>| |------------------------->| |------------------------->| | | | + | | | | | | | | | 1 - 2 -> add driver | + | | | | | | | | | 3 - 6 -> add device | + | | | | | | | | | 7 - 9 -> remove device | + | | | 10.my_unregister_driver()| | 11.call_usermodehelper() | | | 10 - 11 -> remove driver | + | +-+------------------------->+-+------------------------->+-+ | | + | | | | | 
+---------------------------+ + : : : | : \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-4e1f9758808dba9e61bc0e48faf4365d377f9d32.png b/refs/pull/405/merge/_images/ditaa-4e1f9758808dba9e61bc0e48faf4365d377f9d32.png new file mode 100644 index 00000000..9654c21a Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-4e1f9758808dba9e61bc0e48faf4365d377f9d32.png differ diff --git a/refs/pull/405/merge/_images/ditaa-5193a924360bebc83d2f81188cd0b0093ec01e6a.ditaa b/refs/pull/405/merge/_images/ditaa-5193a924360bebc83d2f81188cd0b0093ec01e6a.ditaa new file mode 100644 index 00000000..7336998f --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-5193a924360bebc83d2f81188cd0b0093ec01e6a.ditaa @@ -0,0 +1,25 @@ + (1) List Traversal (2) Removal + +-----------+ ++-----+ +-----+ +-----+ +-----+ | +-----+ | +-----+ +| | | | | | | | | | | | | | +| A |---->| B |---->| C | | A |--+ | B |--+->| C | +| | | | | | | | | | | | ++-----+ +-----+ +-----+ +-----+ +-----+ +-----+ + ^ ^ ^ ^ ^ ^ + | | | | | | + + + + + + + + (3) Quiescent cycle over (4) Reclamation + +-----------+ ++-----+ | +-----+ | +-----+ +-----+ +-----+ +| | | | | | | | | | | | +| A |--+ | B | +->| C | | A |---------------->| C | +| | | | | | | | | | ++-----+ +-----+ +-----+ +-----+ +-----+ + ^ ^ ^ ^ + | | | | \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-5193a924360bebc83d2f81188cd0b0093ec01e6a.png b/refs/pull/405/merge/_images/ditaa-5193a924360bebc83d2f81188cd0b0093ec01e6a.png new file mode 100644 index 00000000..bc2710ed Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-5193a924360bebc83d2f81188cd0b0093ec01e6a.png differ diff --git a/refs/pull/405/merge/_images/ditaa-528948c80a3fd78b89fb6f7bd69503a58b93a4ae.ditaa b/refs/pull/405/merge/_images/ditaa-528948c80a3fd78b89fb6f7bd69503a58b93a4ae.ditaa new file mode 100644 index 00000000..50611ba8 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-528948c80a3fd78b89fb6f7bd69503a58b93a4ae.ditaa @@ 
-0,0 +1,33 @@ ++----------------------+ +----------------------+ +| Application | | Application | ++----------------------+ +----------------------+ + | ^ | ^ + | send() | recv() | send() | recv() + V | V | ++----------------------+ +----------------------+ +| Socket | | Socket | ++----------------------+ +----------------------+ + | ^ | ^ + | | | | + v | v | ++---------------------------------------------------------+ +| Transport layer | ++---------------------------------------------------------+ + | ^ | ^ + | | | | + v | v | ++---------------------------------------------------------+ +| Network layer | ++---------------------------------------------------------+ + | ^ + | | + v | +/---------------------------------------------------------\ +| Routing | ----> Drop packet +\---------------------------------------------------------/ + ^ | ^ | + | RX | TX | RX | TX + | v | v ++-----------------------+ +-----------------------+ +| Network Device Driver | | Network Device Driver | ++-----------------------+ +-----------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-528948c80a3fd78b89fb6f7bd69503a58b93a4ae.png b/refs/pull/405/merge/_images/ditaa-528948c80a3fd78b89fb6f7bd69503a58b93a4ae.png new file mode 100644 index 00000000..d5d9422a Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-528948c80a3fd78b89fb6f7bd69503a58b93a4ae.png differ diff --git a/refs/pull/405/merge/_images/ditaa-58545831034f050660727be99cede213bc4a53c7.ditaa b/refs/pull/405/merge/_images/ditaa-58545831034f050660727be99cede213bc4a53c7.ditaa new file mode 100644 index 00000000..56396ba2 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-58545831034f050660727be99cede213bc4a53c7.ditaa @@ -0,0 +1,14 @@ ++-------------------------------------------+ +| Queued Spin Lock cEEE | +| | +| +---+ +---+ +---+ +---+ | +| | |----->| |----->| |----->| | | +| +---+ +---+ +---+ +---+ | +| ^ ^ ^ ^ | +| | | | | | ++-------------------------------------------+ + 
| | | | + CPU10 CPU17 CPU99 CPU0 + owns the spins on spins on spins on + lock private private private + lock lock lock \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-58545831034f050660727be99cede213bc4a53c7.png b/refs/pull/405/merge/_images/ditaa-58545831034f050660727be99cede213bc4a53c7.png new file mode 100644 index 00000000..af82ca04 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-58545831034f050660727be99cede213bc4a53c7.png differ diff --git a/refs/pull/405/merge/_images/ditaa-5b3c93f6e612d0cc0e4d4837d92a443627405262.ditaa b/refs/pull/405/merge/_images/ditaa-5b3c93f6e612d0cc0e4d4837d92a443627405262.ditaa new file mode 100644 index 00000000..59691093 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-5b3c93f6e612d0cc0e4d4837d92a443627405262.ditaa @@ -0,0 +1,28 @@ +arch/x86/include/asm/irq_vectors.h + +------+ + | 0 | 0..31, system traps and exceptions + +------+ + | 1 | + +------+ + | | + +------+ + | | + | | + | | + +------+ + | 32 | 32..127, device interrupts + +------+ + | | + | | + | | + +------+ + | 128 | int80 syscall interface + +------+ + | 129 | 129..255, other interrupts + +------+ + | | + | | + | | + +------+ + | 255 | + +------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-5b3c93f6e612d0cc0e4d4837d92a443627405262.png b/refs/pull/405/merge/_images/ditaa-5b3c93f6e612d0cc0e4d4837d92a443627405262.png new file mode 100644 index 00000000..d16255d3 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-5b3c93f6e612d0cc0e4d4837d92a443627405262.png differ diff --git a/refs/pull/405/merge/_images/ditaa-5cd4a8fa1ad97cff4bb1f64da13ce9ebfcfc4562.ditaa b/refs/pull/405/merge/_images/ditaa-5cd4a8fa1ad97cff4bb1f64da13ce9ebfcfc4562.ditaa new file mode 100644 index 00000000..f7c65c78 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-5cd4a8fa1ad97cff4bb1f64da13ce9ebfcfc4562.ditaa @@ -0,0 +1,11 @@ + 63 56 44 40 32 
++-------------------------------+---+---+---+---+---------------+---+---+---+---+---------------+-------------------------------+ +| | | D | | A | Segment | | D | | | | +| Base Address 31:24 | G | / | L | V | Limit | P | P | S | Type | Base Address 23:16 | +| | | B | | L | 19:16 | | L | | | | ++-------------------------------+---+---+---+---+---------------+---+---+---+---+---------------+-------------------------------+ +| | | +| Base address 15:0 | Segment Limit 15:0 | +| | | ++---------------------------------------------------------------+---------------------------------------------------------------+ + 31 15 0 \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-5cd4a8fa1ad97cff4bb1f64da13ce9ebfcfc4562.png b/refs/pull/405/merge/_images/ditaa-5cd4a8fa1ad97cff4bb1f64da13ce9ebfcfc4562.png new file mode 100644 index 00000000..521c75de Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-5cd4a8fa1ad97cff4bb1f64da13ce9ebfcfc4562.png differ diff --git a/refs/pull/405/merge/_images/ditaa-5db1739b80a83b12505e4ff749b5e69fccd01f1b.ditaa b/refs/pull/405/merge/_images/ditaa-5db1739b80a83b12505e4ff749b5e69fccd01f1b.ditaa new file mode 100644 index 00000000..0fb5e231 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-5db1739b80a83b12505e4ff749b5e69fccd01f1b.ditaa @@ -0,0 +1,13 @@ ++-----------+ NMI +| | +| |<----------+ +| | +| | +------------+ +| | | | IRQ0 +| | | |<------------+ device0 +| CPU | | | IRQ1 +| | INTR | PIC |<------------+ device1 +| |<----------+ | IRQN +| | | |<------------+ deviceN +| | | | ++-----------+ +------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-5db1739b80a83b12505e4ff749b5e69fccd01f1b.png b/refs/pull/405/merge/_images/ditaa-5db1739b80a83b12505e4ff749b5e69fccd01f1b.png new file mode 100644 index 00000000..f2a2fafc Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-5db1739b80a83b12505e4ff749b5e69fccd01f1b.png differ diff --git 
a/refs/pull/405/merge/_images/ditaa-5e4d73e3fcb24db9d1f8c16daddf98694c063fe6.ditaa b/refs/pull/405/merge/_images/ditaa-5e4d73e3fcb24db9d1f8c16daddf98694c063fe6.ditaa new file mode 100644 index 00000000..aa92baec --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-5e4d73e3fcb24db9d1f8c16daddf98694c063fe6.ditaa @@ -0,0 +1,29 @@ + Virtual Address ++------------+ +------------------+-----------------+------------------+-------------------+---------------+ +| CR3 | | GLOBAL DIR cEEE| UPPER DIR cDDD| MIDDLE DIR cCCC| TABLE cBBB| OFFSET cAAA | ++------------+ +------------------+-----------------+------------------+-------------------+---------------+ + | | | | | | + | | | | | | PAGE + | | | | | | /----------------------\ + | | | | | | | | + | | | | | | | | + | | +-----------+ | | PAGE GLOBAL | +----------------------+ + | | | | | DIRECTORY +-------->| Physical Address cAAA| + | | | | PAGE MIDDLE | /------------\ +----------------------+ + | +-----------------+ | | DIRECTORY | | | | | + | | | PAGE UPPER | /------------\ | | | | | + | | | DIRECTORY | | | | | | | | + | | PAGE GLOBAL | /------------\ | | | | | | | | + | | DIRECTORY | | | | +------------+ | | | | | + | | /------------\ | | | +--->| cCCC |---+ | +------------+ | | + | | | | | | | +------------+ | +--->| cBBB |---------->\----------------------/ + | | | | | | | | | | +------------+ + | | | | | +------------+ +----->\------------/ | | | + | | | | +---->| cDDD |---+ | | | + | | | | +------------+ +----->\------------/ + | | +------------+ | | + | +----->| cEEE |--+ | | + | +------------+ | | | + | | | +----->\------------/ + | | | + +--------->\------------/ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-5e4d73e3fcb24db9d1f8c16daddf98694c063fe6.png b/refs/pull/405/merge/_images/ditaa-5e4d73e3fcb24db9d1f8c16daddf98694c063fe6.png new file mode 100644 index 00000000..f3a36d4c Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-5e4d73e3fcb24db9d1f8c16daddf98694c063fe6.png 
differ diff --git a/refs/pull/405/merge/_images/ditaa-5e6f93e563d6e94c14fe3d483f988e0579b05b38.ditaa b/refs/pull/405/merge/_images/ditaa-5e6f93e563d6e94c14fe3d483f988e0579b05b38.ditaa new file mode 100644 index 00000000..3067ca4c --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-5e6f93e563d6e94c14fe3d483f988e0579b05b38.ditaa @@ -0,0 +1,15 @@ ++--------------+-----------------------+--------------+ +| cF88 | c8F8 | cF88 | +| Buffer | Allocated buffer | Buffer | +| Underflow | 0x5a5a5a5a | Overflow | +| Poison | 0x5a5a5a5a | Poison | +| | 0x5a5a5a5a | | ++--------------+-----------------------+--------------+ + ++--------------+-----------------------+--------------+ +| cF88 | c888 | cF88 | +| Buffer | Freed buffer | Buffer | +| Underflow | 0x6b6b6b6b | Overflow | +| Poison | 0x6b6b6b6b | Poison | +| | 0x6b6b6b6b | | ++--------------+-----------------------+--------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-5e6f93e563d6e94c14fe3d483f988e0579b05b38.png b/refs/pull/405/merge/_images/ditaa-5e6f93e563d6e94c14fe3d483f988e0579b05b38.png new file mode 100644 index 00000000..e586bd81 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-5e6f93e563d6e94c14fe3d483f988e0579b05b38.png differ diff --git a/refs/pull/405/merge/_images/ditaa-6d39f541805ae8197b413ec9c79116382abc4dbc.ditaa b/refs/pull/405/merge/_images/ditaa-6d39f541805ae8197b413ec9c79116382abc4dbc.ditaa new file mode 100644 index 00000000..0359fb34 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-6d39f541805ae8197b413ec9c79116382abc4dbc.ditaa @@ -0,0 +1,23 @@ + ^ ^ ^ + | stat | open | read + v v v ++------------------------------------------------------------+ +| | +| Virtual Filesystem Switch | +| | ++------------------------------------------------------------+ + ^ ^ + | | + v v + +-------------+ +-------------+ + | Filesystem | | Filesystem | + | driver | | driver | + +-------------+ +-------------+ + ^ ^ + | | + v v 
++------------------------------------------------------------+ +| | +| Block I/O layer | +| | ++------------------------------------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-6d39f541805ae8197b413ec9c79116382abc4dbc.png b/refs/pull/405/merge/_images/ditaa-6d39f541805ae8197b413ec9c79116382abc4dbc.png new file mode 100644 index 00000000..c7703a6f Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-6d39f541805ae8197b413ec9c79116382abc4dbc.png differ diff --git a/refs/pull/405/merge/_images/ditaa-709c2e7a68bfcdcfe9c1938d6ef2a0c9b5627931.ditaa b/refs/pull/405/merge/_images/ditaa-709c2e7a68bfcdcfe9c1938d6ef2a0c9b5627931.ditaa new file mode 100644 index 00000000..fa1624c7 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-709c2e7a68bfcdcfe9c1938d6ef2a0c9b5627931.ditaa @@ -0,0 +1,29 @@ + Virtual Address ++------------+ +-------------------+-----------------------------+ +| CR3 | | DIRECTORY cEEE | OFFSET cDDD | ++------------+ +-------------------+-----------------------------+ + | | | + | | | PAGE + | | | /----------------------\ + | | | | | + | | | | | + | | | +----------------------+ + | | +--->| Physical Address cDDD| + | | +----------------------+ + | +-----------------+ | | + | | | | + | | | | + | | PAGE | | + | | DIRECTORY | | + | | /------------\ | | + | | | | +------------------>\----------------------/ + | | | | | + | | | | | + | | | | | + | | | | | + | | +------------+ | + | +----->| cEEE |-------------+ + | +------------+ + | | | + | | | + +---------->\------------/ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-709c2e7a68bfcdcfe9c1938d6ef2a0c9b5627931.png b/refs/pull/405/merge/_images/ditaa-709c2e7a68bfcdcfe9c1938d6ef2a0c9b5627931.png new file mode 100644 index 00000000..ce56a116 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-709c2e7a68bfcdcfe9c1938d6ef2a0c9b5627931.png differ diff --git 
a/refs/pull/405/merge/_images/ditaa-79e3734c36891f6c04d684aa5caa39f76915dbaf.ditaa b/refs/pull/405/merge/_images/ditaa-79e3734c36891f6c04d684aa5caa39f76915dbaf.ditaa new file mode 100644 index 00000000..d4720eff --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-79e3734c36891f6c04d684aa5caa39f76915dbaf.ditaa @@ -0,0 +1,17 @@ + Socket + File ++------+ Operations +| FILE | ----------------------> +-----------+ ++------+ | read | + | | struct socket_alloc +-----------+ + | | +---------------+ | write | + | +------->| struct socket | +-----------+ + | f_private| +-----------+ | | select | + | | | ... | | +-----------+ + | | +-----------+ | | ... | + | +---------------+ +-----------+ + +--------->| struct inode | + f_inode | +-----------+ | + | | ... | | + | +-----------+ | + +---------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-79e3734c36891f6c04d684aa5caa39f76915dbaf.png b/refs/pull/405/merge/_images/ditaa-79e3734c36891f6c04d684aa5caa39f76915dbaf.png new file mode 100644 index 00000000..995d3a1a Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-79e3734c36891f6c04d684aa5caa39f76915dbaf.png differ diff --git a/refs/pull/405/merge/_images/ditaa-7ee0f9bb5f5af586e043afd47cfbad0adcc34888.ditaa b/refs/pull/405/merge/_images/ditaa-7ee0f9bb5f5af586e043afd47cfbad0adcc34888.ditaa new file mode 100644 index 00000000..60a87752 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-7ee0f9bb5f5af586e043afd47cfbad0adcc34888.ditaa @@ -0,0 +1,16 @@ + +-------+ +-------+ + | CPU 0 | | CPU 1 | + +-------+ +-------+ + A <- A + B B <- A + B + + +-------+ +-------+ +A | 3 | | 1 | A + +-------+ +-------+ +B | 2 | | 3 | B + +-------+ +-------+ + write back caches + +-----------------------------+ +A | 1 | + +-----------------------------+ +B | 2 | + +-----------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-7ee0f9bb5f5af586e043afd47cfbad0adcc34888.png 
b/refs/pull/405/merge/_images/ditaa-7ee0f9bb5f5af586e043afd47cfbad0adcc34888.png new file mode 100644 index 00000000..7f954da4 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-7ee0f9bb5f5af586e043afd47cfbad0adcc34888.png differ diff --git a/refs/pull/405/merge/_images/ditaa-85b69602726fa6143fc3ba0ffdb492454864aacf.ditaa b/refs/pull/405/merge/_images/ditaa-85b69602726fa6143fc3ba0ffdb492454864aacf.ditaa new file mode 100644 index 00000000..c88a9570 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-85b69602726fa6143fc3ba0ffdb492454864aacf.ditaa @@ -0,0 +1,29 @@ + w/o privilege transition w/ privilege transition + ++ +---------------------+ +---------------------+ +| | | | | +| | | OLD SS:ESP | OLD SS | NEW SS:ESP from TSS +| +---------------------+ +---------------------+ +| | | | | +| | OLD EFLAGS | | OLD ESP | +| +---------------------+ +---------------------+ +| | | | | +| | OLD CS | | OLD EFLAGS | +| +---------------------+ +---------------------+ +| | | | | +| | OLD EIP | | OLD CS | +| +---------------------+ +---------------------+ +| | | | | +| | (error code) | NEW SS:ESP | OLD EIP | +| +---------------------+ +---------------------+ +| | | | | +| | | | (error code) | NEW SS:ESP +| | | +---------------------+ +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +v +---------------------+ +---------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-85b69602726fa6143fc3ba0ffdb492454864aacf.png b/refs/pull/405/merge/_images/ditaa-85b69602726fa6143fc3ba0ffdb492454864aacf.png new file mode 100644 index 00000000..0634d8c9 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-85b69602726fa6143fc3ba0ffdb492454864aacf.png differ diff --git a/refs/pull/405/merge/_images/ditaa-8632e22c6d89bd18f97c9cef127444486b5077df.ditaa b/refs/pull/405/merge/_images/ditaa-8632e22c6d89bd18f97c9cef127444486b5077df.ditaa new file mode 100644 index 00000000..a0d8f88b --- /dev/null +++ 
b/refs/pull/405/merge/_images/ditaa-8632e22c6d89bd18f97c9cef127444486b5077df.ditaa @@ -0,0 +1,32 @@ + PGD PMD PT + +----------+ +----------+ +----------+ + | | | | | | Guest Physical Page + +----------+ +----------+ +----------+ +----------+ + | | | | | |----+ | | ++-----+ +----------+ +----------+ +----------+ | | | +| CR3 | | |----+ | |---+ | | | | | ++-----+ +----------+ | +----------+ | +----------+ +--->+----------+ + | | | | | | | | | + +---------> +----------+ +------>+----------+ +---->+----------+ + Write Protected Write Protected Write Protected + | + | +Guest (VM) | + | trap access + | +---------------------+------------------------------------------------------------------------------ + | + | check access, transform GPP to HPP + | + v + + Shadow PGD Shadow PMD Shadow PT + +----------+ +----------+ +----------+ + | | | | | | Host Physical Page + +----------+ +----------+ +----------+ +----------+ + | | | | | |----+ | | + +----------+ +----------+ +----------+ | | | + | |----+ | |---+ | | | | | + +----------+ | +----------+ | +----------+ +--->+----------+ + | | | | | | | | + +----------+ +------>+----------+ +---->+----------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-8632e22c6d89bd18f97c9cef127444486b5077df.png b/refs/pull/405/merge/_images/ditaa-8632e22c6d89bd18f97c9cef127444486b5077df.png new file mode 100644 index 00000000..8bcd5664 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-8632e22c6d89bd18f97c9cef127444486b5077df.png differ diff --git a/refs/pull/405/merge/_images/ditaa-8b00a68b494f72d54b5fad38c88f7265aadaaa0e.ditaa b/refs/pull/405/merge/_images/ditaa-8b00a68b494f72d54b5fad38c88f7265aadaaa0e.ditaa new file mode 100644 index 00000000..bfb4735f --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-8b00a68b494f72d54b5fad38c88f7265aadaaa0e.ditaa @@ -0,0 +1,28 @@ + Process + context + | + v +IRQ10 | irq10 handler +-----------------------------> +-------------+ + | +IRQ20 (lower priority) | 
+-----------------------------> pending v + | +IRQ5 (higher priority) | irq5 handler +-----------------------------> +-------->---------+ + | + v + | + +--------<---------+ + | + v + | + -------<-------+ + irq20 handler +Pending IRQ20 ------->-------+ + | + v + | + +--------------+ + | + v \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-8b00a68b494f72d54b5fad38c88f7265aadaaa0e.png b/refs/pull/405/merge/_images/ditaa-8b00a68b494f72d54b5fad38c88f7265aadaaa0e.png new file mode 100644 index 00000000..ace5807e Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-8b00a68b494f72d54b5fad38c88f7265aadaaa0e.png differ diff --git a/refs/pull/405/merge/_images/ditaa-8b59fc3f5245ffb5d7089dc80cf2e306c39a62d8.ditaa b/refs/pull/405/merge/_images/ditaa-8b59fc3f5245ffb5d7089dc80cf2e306c39a62d8.ditaa new file mode 100644 index 00000000..2ec3ee01 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-8b59fc3f5245ffb5d7089dc80cf2e306c39a62d8.ditaa @@ -0,0 +1,5 @@ ++--------------+--------+--------+---------+---------+ +| | | | | | +| Superblock | IMAP | DMAP | IZONE | DZONE | +| | | | | | ++--------------+--------+--------+---------+---------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-8b59fc3f5245ffb5d7089dc80cf2e306c39a62d8.png b/refs/pull/405/merge/_images/ditaa-8b59fc3f5245ffb5d7089dc80cf2e306c39a62d8.png new file mode 100644 index 00000000..64ad3b3b Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-8b59fc3f5245ffb5d7089dc80cf2e306c39a62d8.png differ diff --git a/refs/pull/405/merge/_images/ditaa-91073cb05a3f537eb54ab10745c307531e6795a0.ditaa b/refs/pull/405/merge/_images/ditaa-91073cb05a3f537eb54ab10745c307531e6795a0.ditaa new file mode 100644 index 00000000..7042e172 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-91073cb05a3f537eb54ab10745c307531e6795a0.ditaa @@ -0,0 +1,11 @@ + Head + ^ +---------------+ +skb_push | | | | skb_reserve + +---------------+ v + | Data | | skb_pull + ^ | | v 
+skb_trim | | Tail | + +---------------+ + | | | skb_put + +---------------+ v + End \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-91073cb05a3f537eb54ab10745c307531e6795a0.png b/refs/pull/405/merge/_images/ditaa-91073cb05a3f537eb54ab10745c307531e6795a0.png new file mode 100644 index 00000000..2631fc13 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-91073cb05a3f537eb54ab10745c307531e6795a0.png differ diff --git a/refs/pull/405/merge/_images/ditaa-91f08f7db4b54069e16694eab8d75c06400fc47b.ditaa b/refs/pull/405/merge/_images/ditaa-91f08f7db4b54069e16694eab8d75c06400fc47b.ditaa new file mode 100644 index 00000000..980f08fe --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-91f08f7db4b54069e16694eab8d75c06400fc47b.ditaa @@ -0,0 +1,11 @@ ++----+ +----+ +----+ +| VM | | VM | ... | VM | ++----+ +----+ +----+ + ++-------------------------+ +| Virtual Machine Monitor | ++-------------------------+ + ++-------------------------+ +| Hardware | ++-------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-91f08f7db4b54069e16694eab8d75c06400fc47b.png b/refs/pull/405/merge/_images/ditaa-91f08f7db4b54069e16694eab8d75c06400fc47b.png new file mode 100644 index 00000000..2d3f9e8a Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-91f08f7db4b54069e16694eab8d75c06400fc47b.png differ diff --git a/refs/pull/405/merge/_images/ditaa-9d23d02ebdff6eeb6bec8044480f055de9852ecc.ditaa b/refs/pull/405/merge/_images/ditaa-9d23d02ebdff6eeb6bec8044480f055de9852ecc.ditaa new file mode 100644 index 00000000..dd860f92 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-9d23d02ebdff6eeb6bec8044480f055de9852ecc.ditaa @@ -0,0 +1,29 @@ + CPU0 CPU1 ++-------------+ +-------------+ +| | | | +| |local IRQs | |local IRQs +| +---------- | +---------- +| | | | +| local APIC | | local APIC | +| | LINT0, LINT1 | | LINT0, LINT1 +| +------------- | +------------- +| | | | ++-------+-----+ +------+------+ + | | + | | 
+ | | ++-------+--------------------------------+------+ +| | +| Interrupt Controller Communication BUS | ++----------------------+------------------------+ + | + | + +--------+--------+ + | | + | I/O APIC | + | | + +--------+--------+ + | + | + | + External interrupts \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-9d23d02ebdff6eeb6bec8044480f055de9852ecc.png b/refs/pull/405/merge/_images/ditaa-9d23d02ebdff6eeb6bec8044480f055de9852ecc.png new file mode 100644 index 00000000..8b40a85e Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-9d23d02ebdff6eeb6bec8044480f055de9852ecc.png differ diff --git a/refs/pull/405/merge/_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.ditaa b/refs/pull/405/merge/_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.ditaa new file mode 100644 index 00000000..8930a0d5 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.ditaa @@ -0,0 +1,29 @@ ++---------------------------+ +| Berkeley Socket Interface | ++---------------------------+ + ++---------------------------+ +| Transport layer | ++-------------+-------------+ +| TCP | UDP | ++-------------+-------------+ + ++---------------------------+ +| Network layer | ++-----+---------+-----------+ +| IP | Routing | NetFilter | ++-----+---------+-----------+ + ++---------------------------+ +| Data link layer | ++-------+-------+-----------+ +| ETH | ARP | BRIDGING | ++-------+-------+-----------+ + ++---------------------------+ +| Queuing discipline | ++---------------------------+ + ++---------------------------+ +| Network device drivers | ++---------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png b/refs/pull/405/merge/_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png new file mode 100644 index 00000000..2eb36ce1 Binary files /dev/null and 
b/refs/pull/405/merge/_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png differ diff --git a/refs/pull/405/merge/_images/ditaa-a5f399cb84561893770eb45ceeb827ce6d4a2336.ditaa b/refs/pull/405/merge/_images/ditaa-a5f399cb84561893770eb45ceeb827ce6d4a2336.ditaa new file mode 100644 index 00000000..8439d6f8 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-a5f399cb84561893770eb45ceeb827ce6d4a2336.ditaa @@ -0,0 +1,29 @@ + +------+ + | /sys | + +--+---+ + | + +----------------------------------------------------+-------------------------------------+-----------------------------------------+ + | | | | + v v v v + +-----+ +-------+ +---------+ +--------+ + | bus | | class | | devices | | module | + +--+--+ +---+---+ +----+----+ +---+----+ + | | | | + | | | +-------------+-----------------+ + | | | | | + v v v v v ++------------------------+ +-----------------------+ +-------------------------+ +----------------------+ +-------------------------+ +| mybus: struct bus_type | | myclass: struct class | | mybus0: struct device | | mybus: struct module | | mydriver: struct module | ++-------------+----------+ +----------+------------+ +-----------+-------------+ +----------------------+ +-------------------------+ + | | | + +--------+--------------+ v v + | | +-------------------------------+ +----------------------+ + v v | myclass0: struct class_device | | mydev: struct device | ++---------+ +---------+ +-------------------------------+ +----------------------+ +| devices | | drivers | ++---------+ +---+-----+ + | + v + +--------------------------------+ + | mydriver: struct device_driver | + +--------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-a5f399cb84561893770eb45ceeb827ce6d4a2336.png b/refs/pull/405/merge/_images/ditaa-a5f399cb84561893770eb45ceeb827ce6d4a2336.png new file mode 100644 index 00000000..134ffdc8 Binary files /dev/null and 
b/refs/pull/405/merge/_images/ditaa-a5f399cb84561893770eb45ceeb827ce6d4a2336.png differ diff --git a/refs/pull/405/merge/_images/ditaa-a5f93e0d17ccdc2ba24828b620d7227f7fc75e33.ditaa b/refs/pull/405/merge/_images/ditaa-a5f93e0d17ccdc2ba24828b620d7227f7fc75e33.ditaa new file mode 100644 index 00000000..5dfed1ef --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-a5f93e0d17ccdc2ba24828b620d7227f7fc75e33.ditaa @@ -0,0 +1,17 @@ + +-------------------+ ^ +0xFFFFFFFF | | | + | | | Kernel space + | | | + +-------------------+ v +0xC0000000 | | ^ + | | | User space + | | | + | | | + | | | + | | | + | | | + | | | + | | | +0x00000000 +-------------------+ v + + 32bit Virtual Address Space \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-a5f93e0d17ccdc2ba24828b620d7227f7fc75e33.png b/refs/pull/405/merge/_images/ditaa-a5f93e0d17ccdc2ba24828b620d7227f7fc75e33.png new file mode 100644 index 00000000..ef11a8e0 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-a5f93e0d17ccdc2ba24828b620d7227f7fc75e33.png differ diff --git a/refs/pull/405/merge/_images/ditaa-ae895f3a8e26b92bf6c6ecbbd71e2c88912d5607.ditaa b/refs/pull/405/merge/_images/ditaa-ae895f3a8e26b92bf6c6ecbbd71e2c88912d5607.ditaa new file mode 100644 index 00000000..9097e4c8 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-ae895f3a8e26b92bf6c6ecbbd71e2c88912d5607.ditaa @@ -0,0 +1,27 @@ ++---------------+ +--------------+ +---------------+ +| Application 1 | | Application2 | ... 
| Application n | ++---------------+ +--------------+ +---------------+ + | | | + v v v ++--------------------------------+------------------------+ +| Kernel core & subsystems | Generic Drivers | ++--------------------------------+------------------------+ +| Generic Architecture Code | ++---------------------------------------------------------+ +| Architecture Specific Code | +| | +| +-----------+ +--------+ +---------+ +--------+ | +| | Bootstrap | | Memory | | Threads | | Timers | | +| +-----------+ +--------+ +---------+ +--------+ | +| +------+ +----------+ +------------------+ | +| | IRQs | | Syscalls | | Platform Drivers | | +| +------+ +----------+ +------------------+ | +| +------------------+ +---------+ +---------+ | +| | Platform Drivers | | machine | ... | machine | | +| +------------------+ +---------+ +---------+ | ++---------------------------------------------------------+ + | | | + v v v ++--------------------------------------------------------+ +| Hardware | ++--------------------------------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-ae895f3a8e26b92bf6c6ecbbd71e2c88912d5607.png b/refs/pull/405/merge/_images/ditaa-ae895f3a8e26b92bf6c6ecbbd71e2c88912d5607.png new file mode 100644 index 00000000..cfb0ecee Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-ae895f3a8e26b92bf6c6ecbbd71e2c88912d5607.png differ diff --git a/refs/pull/405/merge/_images/ditaa-afa57a07e21b1b842554278abe30fea575278452.ditaa b/refs/pull/405/merge/_images/ditaa-afa57a07e21b1b842554278abe30fea575278452.ditaa new file mode 100644 index 00000000..54d47f9f --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-afa57a07e21b1b842554278abe30fea575278452.ditaa @@ -0,0 +1,26 @@ + ^ ^ ^ + | stat | open | read + v v v ++------------------------------------------------------------+ +| Virtual Filesystem Switch | +| | +| | +| /-------\ /--------\ /--------\ | +| | inode |<----------+ dentry |<----------+ FILE | | +| 
\---+---/ \----+---/ \---+----/ | +| | | | | +| | | | | +| v v v | +| +-------+ +--------+ +-------+ | +| | inode | | dentry | | page | | +| | cache | | cache | | cache | | +| +-------+ +--------+ +-------+ | +| | ++------------------------------------------------------------+ + ^ ^ + | | + v v + +-------------+ +-------------+ + | Filesystem | | Filesystem | + | driver | | driver | + +-------------+ +-------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-afa57a07e21b1b842554278abe30fea575278452.png b/refs/pull/405/merge/_images/ditaa-afa57a07e21b1b842554278abe30fea575278452.png new file mode 100644 index 00000000..de68d467 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-afa57a07e21b1b842554278abe30fea575278452.png differ diff --git a/refs/pull/405/merge/_images/ditaa-b2023fce22479e20bbe08fd76eed87e9a0527688.ditaa b/refs/pull/405/merge/_images/ditaa-b2023fce22479e20bbe08fd76eed87e9a0527688.ditaa new file mode 100644 index 00000000..8255133b --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-b2023fce22479e20bbe08fd76eed87e9a0527688.ditaa @@ -0,0 +1,22 @@ + Interrupt Descriptor + +----------------------------------------------+ + | | + | +------------------+ +--------+ +------+ | + | | segment selector | | offset| | PL | | + | +----+-------------+ +---+----+ +------+ | + | | | | + +----------------------------------------------+ + | | + | | ++-------------+ +----------------------------> +---------------+ +| ^ | ISR address | +| Segment Descriptor | +---------------+ +| +----------------------------------------------+ | +| | | | ++---->| +------------------+ +--------+ +------+ | | + | | base | | limit | | PL | | | + | +---------+--------+ +--------+ +------+ | | + | | | | + +----------------------------------------------+ | + | | + +--------------------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-b2023fce22479e20bbe08fd76eed87e9a0527688.png 
b/refs/pull/405/merge/_images/ditaa-b2023fce22479e20bbe08fd76eed87e9a0527688.png new file mode 100644 index 00000000..59077fd3 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-b2023fce22479e20bbe08fd76eed87e9a0527688.png differ diff --git a/refs/pull/405/merge/_images/ditaa-b26d802c286bda6c559b4dcfa8a7fb27f840463e.ditaa b/refs/pull/405/merge/_images/ditaa-b26d802c286bda6c559b4dcfa8a7fb27f840463e.ditaa new file mode 100644 index 00000000..7f01d04e --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-b26d802c286bda6c559b4dcfa8a7fb27f840463e.ditaa @@ -0,0 +1,23 @@ ++-------+ +-------+ +-------+ +| CPU 0 |<---------------+ | CPU 1 | Invalidate | CPU 0 | +| cache |<-------------+ | | cache |<---+ +---------->| cache | ++-------+ Invalidate | | +-------+ | | +-------+ + | | | | + | | +----------------------------+ +spin_lock(&lock); | | | | + | | READ lock | | + | +---- WRITE lock ---+ | + | | + | READ lock | + +-------------------------------- WRITE lock ----+ + + ... ... ... +READ data READ lock READ lock + | | | + | | | + | | | + +------------------------------+-------------------------+ + | + v + + cache miss \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-b26d802c286bda6c559b4dcfa8a7fb27f840463e.png b/refs/pull/405/merge/_images/ditaa-b26d802c286bda6c559b4dcfa8a7fb27f840463e.png new file mode 100644 index 00000000..24734d5b Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-b26d802c286bda6c559b4dcfa8a7fb27f840463e.png differ diff --git a/refs/pull/405/merge/_images/ditaa-b9ffae65be16d30be11b5eca188a7a143b1b8227.ditaa b/refs/pull/405/merge/_images/ditaa-b9ffae65be16d30be11b5eca188a7a143b1b8227.ditaa new file mode 100644 index 00000000..dc13756f --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-b9ffae65be16d30be11b5eca188a7a143b1b8227.ditaa @@ -0,0 +1,43 @@ ++---------------+ +--------------+ +---------------+ +| Application 1 | | Application2 | ... 
| Application n | ++---------------+ +--------------+ +---------------+ + | | | + v v v ++--------------------------------------------------------+ +| Kernel | +| | +| +----------------------+ +-------------------+ | +| | Process Management | | Memory Management | | +| +----------------------+ +-------------------+ | +| | +| +------------+ +------------+ +------------+ | +| | Block I/O | | VFS | | Networking | | +| +------------+ +------------+ +------------+ | +| | +| +------------+ +------------+ +------------+ | +| | IPC | | Security | | Crypto | | +| +------------+ +------------+ +------------+ | +| | +| +------------+ +------------+ +------------+ | +| | DRM | | ALSA | | USB | | +| +------------+ +------------+ +------------+ | +| ... | ++--------------------------------------+-----------------+ +| Device drivers | arch | +| | | +| +----+ +-----+ +--------+ +----+ | +----------+ | +| |char| |block| |ethernet| |wifi| | | machine 1| | +| +----+ +-----+ +--------+ +----+ | +----------+ | +| +----------+ +-----+ +----+ +---+ | +----------+ | +| |filesystem| |input| |iio | |usb| | | machine 2| | +| +----------+ +-----+ +----+ +---+ | +----------+ | +| +-----------+ +----------+ +---+ | | +| |framebuffer| | platform | |drm| | ... 
| +| +-----------+ +----------+ +---+ | | ++-------------------------+----+-------+-----------------+ + | | | + v v v + ++--------------------------------------------------------+ +| Hardware | ++--------------------------------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-b9ffae65be16d30be11b5eca188a7a143b1b8227.png b/refs/pull/405/merge/_images/ditaa-b9ffae65be16d30be11b5eca188a7a143b1b8227.png new file mode 100644 index 00000000..2cea5126 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-b9ffae65be16d30be11b5eca188a7a143b1b8227.png differ diff --git a/refs/pull/405/merge/_images/ditaa-bb69666d75b9670e542682753fb8cc9b77ff8894.ditaa b/refs/pull/405/merge/_images/ditaa-bb69666d75b9670e542682753fb8cc9b77ff8894.ditaa new file mode 100644 index 00000000..b1e26eef --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-bb69666d75b9670e542682753fb8cc9b77ff8894.ditaa @@ -0,0 +1,24 @@ ++---------------------+ +| Guest OS | +| +---------------+ | +| | Guest Driver | | +| +---------------+ | +| | ^ | +| | | | ++----+-----------+----+ + | trap | + | access | + +---+-----------+----+ + | | VMM | | + | v | | + | +----------------+ | + | | Virtual Device | | + | +----------------+ | + | | ^ | + | | | | + +--+------------+----+ + | | + v | + +-----------------+ + | Physical Device | + +-----------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-bb69666d75b9670e542682753fb8cc9b77ff8894.png b/refs/pull/405/merge/_images/ditaa-bb69666d75b9670e542682753fb8cc9b77ff8894.png new file mode 100644 index 00000000..86118acb Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-bb69666d75b9670e542682753fb8cc9b77ff8894.png differ diff --git a/refs/pull/405/merge/_images/ditaa-bb8455a43088bf800eece11869f6ff857574605d.ditaa b/refs/pull/405/merge/_images/ditaa-bb8455a43088bf800eece11869f6ff857574605d.ditaa new file mode 100644 index 00000000..f57b13cd --- /dev/null +++ 
b/refs/pull/405/merge/_images/ditaa-bb8455a43088bf800eece11869f6ff857574605d.ditaa @@ -0,0 +1,12 @@ ++--------+ 8MB +-----------+ 4KB +-----------+ +-----------+ 4KB +------------+-----------+------------+ +| | | | | | | | | Persistent | Temporary | Fix-mapped | +| Lowmem | <-----> | VMAP area | <-----> | VMAP area | ... | VMAP area | <-----> | Kernel | Kernel | linear | +| | | | | | | | | Mappings | Mappings | addresses | ++--------+ +-----------+ +-----------+ +-----------+ +------------+-----------+------------+ + : : + | 128MB | + |<------------------------------------------------------------------------------------------------------------->| + | | + | | + VMALLOC_START 4GB + (896MB) \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-bb8455a43088bf800eece11869f6ff857574605d.png b/refs/pull/405/merge/_images/ditaa-bb8455a43088bf800eece11869f6ff857574605d.png new file mode 100644 index 00000000..d7318db0 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-bb8455a43088bf800eece11869f6ff857574605d.png differ diff --git a/refs/pull/405/merge/_images/ditaa-bc662dab7bb3d9ba3a37efbf69b82c513dcaadd4.ditaa b/refs/pull/405/merge/_images/ditaa-bc662dab7bb3d9ba3a37efbf69b82c513dcaadd4.ditaa new file mode 100644 index 00000000..46712923 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-bc662dab7bb3d9ba3a37efbf69b82c513dcaadd4.ditaa @@ -0,0 +1,18 @@ + +--------+ +-------+ data +--------+ + | dentry |-------------->| inode |--------+ | dentry | + +--------+ +-------+ | +--------+ + | ...... | | ..... | | | ...... 
| + +--------+ +-------+ dir | +--------+ + | dentry | | inode |--------|--+ | dentry | + +--------+ +-------+ | | +--------+ + ^ | | ^ + | | | | + | | | +--------+ + | V v | + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ +blocks | | | | | | | | | | | | | | | | | | + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | | + | +------------+ | ++++++++++++ + +--->| superblock | +--->|||||||||||| block management + +------------+ ++++++++++++ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-bc662dab7bb3d9ba3a37efbf69b82c513dcaadd4.png b/refs/pull/405/merge/_images/ditaa-bc662dab7bb3d9ba3a37efbf69b82c513dcaadd4.png new file mode 100644 index 00000000..44ba4e18 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-bc662dab7bb3d9ba3a37efbf69b82c513dcaadd4.png differ diff --git a/refs/pull/405/merge/_images/ditaa-bf1244d1a5c3d99bd8d40148d81cb3e5748c0b94.ditaa b/refs/pull/405/merge/_images/ditaa-bf1244d1a5c3d99bd8d40148d81cb3e5748c0b94.ditaa new file mode 100644 index 00000000..04513e58 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-bf1244d1a5c3d99bd8d40148d81cb3e5748c0b94.ditaa @@ -0,0 +1,34 @@ + struct socket +---------> struct proto_ops + +--------------------+ | +-----------------+ + | struct socket | | | release | + | | | +-----------------+ + +--------------------+ | | bind | + | struct proto_ops * |--------+ +-----------------+ + +--------------------+ | connect | + | ... | +-----------------+ + +---------------+ | accept | + +---------| struct sock * |-------+ +-----------------+ + | +---------------+ | | sendmsg | + | | +-----------------+ + | | | recvmsg | + | | +-----------------+ + | | | poll | + | | +-----------------+ + | | | ... 
| + | | +-----------------+ + | | + v v +--> struct sk_prot + struct tcp_sock struct tcp_sock | +--------------------+ ++-------------------+ +-------------------+ | | inet_dgram_connect | +| struct inet_sock | | struct inet_sock | | +--------------------+ +| +---------------+ | | +---------------+ | | | inet_sendmsg | +| | struct sock | | | | struct sock | | | +--------------------+ +| | +-----------+ | | | | +-----------+ | | | | udp_poll | +| | | ... | | | | | | ... | | | | +--------------------+ +| | +-----------+ | | | | +-----------+ | | | | inet_release | +| +---------------+ | | +---------------+ | | +--------------------+ +| | sk_prot * | | | | sk_prot * | |--+ | inet_bind | +| +---------------+ | | +---------------+ | +--------------------+ ++-------------------+ +-------------------+ | ... | +| ... | | ... | +--------------------+ ++-------------------+ +-------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-bf1244d1a5c3d99bd8d40148d81cb3e5748c0b94.png b/refs/pull/405/merge/_images/ditaa-bf1244d1a5c3d99bd8d40148d81cb3e5748c0b94.png new file mode 100644 index 00000000..55a414fd Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-bf1244d1a5c3d99bd8d40148d81cb3e5748c0b94.png differ diff --git a/refs/pull/405/merge/_images/ditaa-c11fccb956cdf115910f9f72e1dc14cd7ed549ff.ditaa b/refs/pull/405/merge/_images/ditaa-c11fccb956cdf115910f9f72e1dc14cd7ed549ff.ditaa new file mode 100644 index 00000000..7e5e1cd8 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-c11fccb956cdf115910f9f72e1dc14cd7ed549ff.ditaa @@ -0,0 +1,17 @@ + +------------+ ++-------------+ BUS LOCK | Memory | +| CPU 1 |<------------->| | +| | LOAD (0) | | +| inc v |<--------------| v <- 0 | +| | STORE (1) | | +| |-------------->| v <- 1 | +| | BUS UNLOCK | | +| cEEE |<------------->| | BUS LOCK +-------------+ ++-------------+ | |<------------->| CPU 1 | + | | LOAD (1) | | + | |<--------------| inc v | + | v <- 2 | STORE (2) | | + | 
|-------------->| | + | | BUS UNLOCK | | + | cEEE |<------------->| cEEE | + +------------+ +-------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-c11fccb956cdf115910f9f72e1dc14cd7ed549ff.png b/refs/pull/405/merge/_images/ditaa-c11fccb956cdf115910f9f72e1dc14cd7ed549ff.png new file mode 100644 index 00000000..8f914f7c Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-c11fccb956cdf115910f9f72e1dc14cd7ed549ff.png differ diff --git a/refs/pull/405/merge/_images/ditaa-c8a3d93d0109b7be6f608871d16adff4aaa933da.ditaa b/refs/pull/405/merge/_images/ditaa-c8a3d93d0109b7be6f608871d16adff4aaa933da.ditaa new file mode 100644 index 00000000..7ede8570 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-c8a3d93d0109b7be6f608871d16adff4aaa933da.ditaa @@ -0,0 +1,21 @@ ++-----+ +--------+ +---------+ +---------+ +| App | | File | | Network | | Display |<--+ +| | | Server | | Server | | Server |-+ | ++-----+ +--------+ +---------+ +---------+ | | + | ^ | | User +-|-|----------------------------------------=-|-|-------=- + | | | | Kernel + | | | | + | | | | + | | | | + | | Reply +----------------------------+ | | + | +--------| |----+ | + +--------->| Micro kernel |------+ + Request | (IPC, Memory, Scheduler) | + | | + +----------------------------+ + | + v ++--------------------------------------------------------+ +| Hardware | ++--------------------------------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-c8a3d93d0109b7be6f608871d16adff4aaa933da.png b/refs/pull/405/merge/_images/ditaa-c8a3d93d0109b7be6f608871d16adff4aaa933da.png new file mode 100644 index 00000000..5444c021 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-c8a3d93d0109b7be6f608871d16adff4aaa933da.png differ diff --git a/refs/pull/405/merge/_images/ditaa-cb16db58a2489307b74d4f70256a48c81c65f6c6.ditaa b/refs/pull/405/merge/_images/ditaa-cb16db58a2489307b74d4f70256a48c81c65f6c6.ditaa new file mode 
100644 index 00000000..d5ab92c9 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-cb16db58a2489307b74d4f70256a48c81c65f6c6.ditaa @@ -0,0 +1,24 @@ + +-----------+ + | | + +------------------>| Memory |<-----------------+ + | | | | + | +-----------+ | + | ^ | + | | | + v v v ++--------------+ +---------------+ +---------------+ +| | | | | | +| Processor A | | Processor B | | Processor C | +| | | | | | +| | | +-----------+ | | +-----------+ | +| | | | Process 1 | | | | Process 1 | | +| | | +-----------+ | | +-----------+ | +| | | | | | +| +----------+ | | +-----------+ | | +-----------+ | +| | kernel | | | | Process 2 | | | | Process 2 | | +| +----------+ | | +-----------+ | | +-----------+ | +| | | | | | +| | | +-----------+ | | +-----------+ | +| | | | Process 3 | | | | Process 3 | | +| | | +-----------+ | | +-----------+ | ++--------------+ +---------------+ +---------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-cb16db58a2489307b74d4f70256a48c81c65f6c6.png b/refs/pull/405/merge/_images/ditaa-cb16db58a2489307b74d4f70256a48c81c65f6c6.png new file mode 100644 index 00000000..2177b2ec Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-cb16db58a2489307b74d4f70256a48c81c65f6c6.png differ diff --git a/refs/pull/405/merge/_images/ditaa-cc9a2e995be74ee99646ea4bf0e551d766fa92ef.ditaa b/refs/pull/405/merge/_images/ditaa-cc9a2e995be74ee99646ea4bf0e551d766fa92ef.ditaa new file mode 100644 index 00000000..91a56d44 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-cc9a2e995be74ee99646ea4bf0e551d766fa92ef.ditaa @@ -0,0 +1,11 @@ ++-----+ +-----+ +| CR3 | | EPT | ++-----+ +-----+ + | +------------------+ | +----------------+ + | | | | | | + +--------> | Guest Page Table | +-------> | EPT Page Table | ---------------> + | | | | +------------> +------------------+ ------------> +----------------+ + +Guest Virtual Guest Physical Host Physical + Address Address Address \ No newline at end of file diff --git 
a/refs/pull/405/merge/_images/ditaa-cc9a2e995be74ee99646ea4bf0e551d766fa92ef.png b/refs/pull/405/merge/_images/ditaa-cc9a2e995be74ee99646ea4bf0e551d766fa92ef.png new file mode 100644 index 00000000..7dc57080 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-cc9a2e995be74ee99646ea4bf0e551d766fa92ef.png differ diff --git a/refs/pull/405/merge/_images/ditaa-d5d1129b0298a2ea5f116c9d4b246eb1b888db6b.ditaa b/refs/pull/405/merge/_images/ditaa-d5d1129b0298a2ea5f116c9d4b246eb1b888db6b.ditaa new file mode 100644 index 00000000..3831def8 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-d5d1129b0298a2ea5f116c9d4b246eb1b888db6b.ditaa @@ -0,0 +1,18 @@ ++-------------------+ +-------------------+ 0xFFFFFFFF +-------------------+ ^ +| | | | | | | +| | | | | | | Kernel space +| | | | | | | +| User | | Kernel | 0xC0000000 +-------------------+ v +| space | | space | | | ^ +| | | | | | | User space +| | | | | | | +| | | | | | | +| | | | | | | +| | | | | | | +| | | | | | | +| | | | | | | +| | | | | | | ++-------------------+ +-------------------+ 0x00000000 +-------------------+ v + + + (a) 4/4 split (b) 1/3 or2/2 split \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-d5d1129b0298a2ea5f116c9d4b246eb1b888db6b.png b/refs/pull/405/merge/_images/ditaa-d5d1129b0298a2ea5f116c9d4b246eb1b888db6b.png new file mode 100644 index 00000000..a4e49739 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-d5d1129b0298a2ea5f116c9d4b246eb1b888db6b.png differ diff --git a/refs/pull/405/merge/_images/ditaa-d6845a04f0ec792beec598d2a9f4c5b92c65529e.ditaa b/refs/pull/405/merge/_images/ditaa-d6845a04f0ec792beec598d2a9f4c5b92c65529e.ditaa new file mode 100644 index 00000000..a1df2c3a --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-d6845a04f0ec792beec598d2a9f4c5b92c65529e.ditaa @@ -0,0 +1,6 @@ + 15 3 2 1 0 + +------------+----+-----+ + | | | | + Segment selectors | index | TI | RPL | +(CS, DS, SS, ES, FS, GS) | | | | + +------------+----+-----+ \ No 
newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-d6845a04f0ec792beec598d2a9f4c5b92c65529e.png b/refs/pull/405/merge/_images/ditaa-d6845a04f0ec792beec598d2a9f4c5b92c65529e.png new file mode 100644 index 00000000..ec679979 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-d6845a04f0ec792beec598d2a9f4c5b92c65529e.png differ diff --git a/refs/pull/405/merge/_images/ditaa-d880751969de8642b2613caaca345d71acea4500.ditaa b/refs/pull/405/merge/_images/ditaa-d880751969de8642b2613caaca345d71acea4500.ditaa new file mode 100644 index 00000000..afdc233d --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-d880751969de8642b2613caaca345d71acea4500.ditaa @@ -0,0 +1,19 @@ + +------+ +------+ + | | | | + | CPU | | DMA | + | | | | + +------+ +------+ + | + | + v + +-----+ +-----+ + | CR3 | | EPT | + +-----+ +-----+ + | +------------------+ | +----------------+ + | | | | | | + +--------> | Guest Page Table | +-------> | EPT Page Table | ---------------> + | | | | +------------> +------------------+ ------------> +----------------+ + +Guest Virtual Guest Physical Host Physical + Address Address Address \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-d880751969de8642b2613caaca345d71acea4500.png b/refs/pull/405/merge/_images/ditaa-d880751969de8642b2613caaca345d71acea4500.png new file mode 100644 index 00000000..b9640142 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-d880751969de8642b2613caaca345d71acea4500.png differ diff --git a/refs/pull/405/merge/_images/ditaa-da31e3d17a4d55e5c3dbc0bd5903306418a896ca.ditaa b/refs/pull/405/merge/_images/ditaa-da31e3d17a4d55e5c3dbc0bd5903306418a896ca.ditaa new file mode 100644 index 00000000..e5527992 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-da31e3d17a4d55e5c3dbc0bd5903306418a896ca.ditaa @@ -0,0 +1,13 @@ + phase 1 ++----------------+ +| critical | phase 2 ++----------------+ +-----------------+ +| | | immediate | phase 3 +| - IRQ disabled | +-----------------+ 
+----------------+ +| - ACK IRQ +-----+ | | | deferred | +| | +---> - IRQ disabled | +----------------+ ++----------------+ | - device handler| | | + | - EOI IRQ +-----+ | - IRQ enabled | + +-----------------+ +----> - execute later| + | | + +----------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-da31e3d17a4d55e5c3dbc0bd5903306418a896ca.png b/refs/pull/405/merge/_images/ditaa-da31e3d17a4d55e5c3dbc0bd5903306418a896ca.png new file mode 100644 index 00000000..01d66dcb Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-da31e3d17a4d55e5c3dbc0bd5903306418a896ca.png differ diff --git a/refs/pull/405/merge/_images/ditaa-ddd14be50300088958e86912bc5f396797634a3a.ditaa b/refs/pull/405/merge/_images/ditaa-ddd14be50300088958e86912bc5f396797634a3a.ditaa new file mode 100644 index 00000000..c1072e96 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-ddd14be50300088958e86912bc5f396797634a3a.ditaa @@ -0,0 +1,9 @@ + +------------+ + | Memory | ++-------------+ LOAD (0) | | +-------------+ +| CPU 0 |<--------------| v <- 0 | LOAD (0) | CPU 1 | +| | STORE (1) | |-------------->| | +| inc v |-------------->| v <- 1 | STORE (1) | inc v | +| cEEE | | v <- 1 |<--------------| cEEE | ++-------------+ | cEEE | +-------------+ + +------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-ddd14be50300088958e86912bc5f396797634a3a.png b/refs/pull/405/merge/_images/ditaa-ddd14be50300088958e86912bc5f396797634a3a.png new file mode 100644 index 00000000..a20b7dc8 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-ddd14be50300088958e86912bc5f396797634a3a.png differ diff --git a/refs/pull/405/merge/_images/ditaa-def299abebe530d760a6c8f16c791bbb016f9238.ditaa b/refs/pull/405/merge/_images/ditaa-def299abebe530d760a6c8f16c791bbb016f9238.ditaa new file mode 100644 index 00000000..c453cf9b --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-def299abebe530d760a6c8f16c791bbb016f9238.ditaa @@ -0,0 +1,29 @@ + 
Virtual Address ++------------+ +------------------+----------------+---------------+ +| CR3 | | DIRECTORY cEEE| TABLE cDDD | OFFSET cCCC| ++------------+ +------------------+----------------+---------------+ + | | | | + | | | | PAGE + | | | | /-----------------------\ + | | | | | | + | | | | | | + | | +-----------+ | +-----------------------+ + | | | +--->| Physical Address cCCC| + | | | +-----------------------+ + | +-----------------+ | | | + | | | PAGE | | + | | | TABLE | | + | | PAGE | /------------\ | | + | | DIRECTORY | | | | | + | | /------------\ | | | | | + | | | | | +------------+ +----> \-----------------------/ + | | | | +---->| cDDD |---+ + | | | | +------------+ + | | | | | | + | | | | | | + | | +------------+ | | + | +----->|cEEE |---+ | | + | +------------+ | | | + | | | +---->\------------/ + | | | + +--------->\------------/ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-def299abebe530d760a6c8f16c791bbb016f9238.png b/refs/pull/405/merge/_images/ditaa-def299abebe530d760a6c8f16c791bbb016f9238.png new file mode 100644 index 00000000..edba4a4f Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-def299abebe530d760a6c8f16c791bbb016f9238.png differ diff --git a/refs/pull/405/merge/_images/ditaa-e3a27a84dde42de58bcc5c360e1c4b15062507c2.ditaa b/refs/pull/405/merge/_images/ditaa-e3a27a84dde42de58bcc5c360e1c4b15062507c2.ditaa new file mode 100644 index 00000000..e8571491 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-e3a27a84dde42de58bcc5c360e1c4b15062507c2.ditaa @@ -0,0 +1,26 @@ + ^ ^ ^ + | stat | open | read + v v v ++------------------------------------------------------------+ +| Virtual File System | +| | +| | +| /-------\ /--------\ /--------\ | +| | inode |<----------+ dentry |<----------+ FILE | | +| \---+---/ \----+---/ \---+----/ | +| | | | | +| | | | | +| v v v | +| +-------+ +--------+ +-------+ | +| | inode | | dentry | | page | | +| | cache | | cache | | cache | | +| +-------+ +--------+ +-------+ 
| +| | ++------------------------------------------------------------+ + ^ ^ + | | + v v + +-------------+ +-------------+ + | Filesystem | | Filesystem | + | driver | | driver | + +-------------+ +-------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-e3a27a84dde42de58bcc5c360e1c4b15062507c2.png b/refs/pull/405/merge/_images/ditaa-e3a27a84dde42de58bcc5c360e1c4b15062507c2.png new file mode 100644 index 00000000..432e8cf7 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-e3a27a84dde42de58bcc5c360e1c4b15062507c2.png differ diff --git a/refs/pull/405/merge/_images/ditaa-e76e44cad2e92f2134ab77f6a09605b29524d039.ditaa b/refs/pull/405/merge/_images/ditaa-e76e44cad2e92f2134ab77f6a09605b29524d039.ditaa new file mode 100644 index 00000000..3c56c556 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-e76e44cad2e92f2134ab77f6a09605b29524d039.ditaa @@ -0,0 +1,10 @@ ++-------------+ +-------------+ +| Application | | Application | ++-------------+ +-------------+ + | | + |read(fd, buff, len) |fork() + | | + v v ++---------------------------------------+ +| Kernel | ++---------------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-e76e44cad2e92f2134ab77f6a09605b29524d039.png b/refs/pull/405/merge/_images/ditaa-e76e44cad2e92f2134ab77f6a09605b29524d039.png new file mode 100644 index 00000000..1cfe9759 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-e76e44cad2e92f2134ab77f6a09605b29524d039.png differ diff --git a/refs/pull/405/merge/_images/ditaa-ee04e3e544de75375b914f7645c79d5ae46fe6f3.ditaa b/refs/pull/405/merge/_images/ditaa-ee04e3e544de75375b914f7645c79d5ae46fe6f3.ditaa new file mode 100644 index 00000000..e6ff0ef3 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-ee04e3e544de75375b914f7645c79d5ae46fe6f3.ditaa @@ -0,0 +1,24 @@ +-:------------------------------------------------------------------------------------ + +VFS layer sys_write → vfs_write → 
do_sync_write → filp->f_op->aio_write + +-:------------------------------------------------------------------------------------ + +Generic socket layer sock_aio_write → sock->ops->sendmsg + +-:------------------------------------------------------------------------------------ + +IP socket layer sk->sk_prot->sendmsg + +-:------------------------------------------------------------------------------------ + +UDP socket layer ip_append_data udp_flush_pending_frames + | | +-:------------------------------+------------------------------+----------------------- + V V +IP socket layer skb = sock_alloc_send_skb(); ip_local_out + skb_queue_tail(sk, skb) + +-:------------------------------------------------------------------------------------ + + routing \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-ee04e3e544de75375b914f7645c79d5ae46fe6f3.png b/refs/pull/405/merge/_images/ditaa-ee04e3e544de75375b914f7645c79d5ae46fe6f3.png new file mode 100644 index 00000000..9e9558ff Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-ee04e3e544de75375b914f7645c79d5ae46fe6f3.png differ diff --git a/refs/pull/405/merge/_images/ditaa-eeb919cd078d0ba5021028fa628bb47d7d6866e2.ditaa b/refs/pull/405/merge/_images/ditaa-eeb919cd078d0ba5021028fa628bb47d7d6866e2.ditaa new file mode 100644 index 00000000..254b15e8 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-eeb919cd078d0ba5021028fa628bb47d7d6866e2.ditaa @@ -0,0 +1,30 @@ + +-------------+ dup2 +-----------------------------+ + | Application |-----+ | libc | + +-------------+ | | | + +---->| C7590 dup2: | + | ... | + | C7592 movl 0x8(%esp),%ecx | + | C7596 movl 0x4(%esp),%ebx | + | C759a movl $0x3f,%eax | ++------------------------------+ C759f int $0x80 | +| | ... 
+<-----+ +| +-----------------------------+ | +| | +| | +| | +| | +| +------------------------------------------------------------+ | +| | Kernel | | +| | | | ++--->|ENTRY(entry_INT80_32) | | + | ASM_CLAC | | + | pushl %eax # pt_regs->orig_ax | | + | SAVE_ALL pt_regs_ax=$-ENOSYS # save rest | | + | ... | | + | movl %esp, %eax | | + | call do_int80_syscall_32 | | + | .... | | + | RESTORE_REGS 4 # skip orig_eax/error_code | | + | ... | | + | INTERRUPT_RETURN +-+ + +------------------------------------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-eeb919cd078d0ba5021028fa628bb47d7d6866e2.png b/refs/pull/405/merge/_images/ditaa-eeb919cd078d0ba5021028fa628bb47d7d6866e2.png new file mode 100644 index 00000000..f19a4b3a Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-eeb919cd078d0ba5021028fa628bb47d7d6866e2.png differ diff --git a/refs/pull/405/merge/_images/ditaa-eff5e0e3b58ce239d5310b22b89c0927be5853bd.ditaa b/refs/pull/405/merge/_images/ditaa-eff5e0e3b58ce239d5310b22b89c0927be5853bd.ditaa new file mode 100644 index 00000000..e38ff2fa --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-eff5e0e3b58ce239d5310b22b89c0927be5853bd.ditaa @@ -0,0 +1,11 @@ + 63 47 42 32 ++------------------------------+---+---+----+---+---------------+ +| | | D | | | | +| offset (16..31 | P | P | | T | | +| | | L | | | | ++------------------------------+---+---+----+---+---------------+ +| | | +| segment selector | offset (0..15) | +| | | ++------------------------------+--------------------------------+ + 31 15 0 \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-eff5e0e3b58ce239d5310b22b89c0927be5853bd.png b/refs/pull/405/merge/_images/ditaa-eff5e0e3b58ce239d5310b22b89c0927be5853bd.png new file mode 100644 index 00000000..9bd95556 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-eff5e0e3b58ce239d5310b22b89c0927be5853bd.png differ diff --git 
a/refs/pull/405/merge/_images/ditaa-f3703e3f627a948c59f6f960518d5f68eb7becec.ditaa b/refs/pull/405/merge/_images/ditaa-f3703e3f627a948c59f6f960518d5f68eb7becec.ditaa new file mode 100644 index 00000000..ead21664 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-f3703e3f627a948c59f6f960518d5f68eb7becec.ditaa @@ -0,0 +1,6 @@ + +--------------+ +------------+ + logical | | linear | | physical +---------> | Segmentation | --------> | Paging | ----------> + address | Unit | address | Unit | address + | | | | + +--------------+ +------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-f3703e3f627a948c59f6f960518d5f68eb7becec.png b/refs/pull/405/merge/_images/ditaa-f3703e3f627a948c59f6f960518d5f68eb7becec.png new file mode 100644 index 00000000..6d402da7 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-f3703e3f627a948c59f6f960518d5f68eb7becec.png differ diff --git a/refs/pull/405/merge/_images/ditaa-f45246aade5ecc7cfb71f7f103a57f95fc7c2b9e.ditaa b/refs/pull/405/merge/_images/ditaa-f45246aade5ecc7cfb71f7f103a57f95fc7c2b9e.ditaa new file mode 100644 index 00000000..a6b1f07e --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-f45246aade5ecc7cfb71f7f103a57f95fc7c2b9e.ditaa @@ -0,0 +1,31 @@ ++-------+ +| linux | ++-+-----+ + | + +------+--------+---------+---------+--------------+--------------+ + | | | | | | | + | v v v v v v + | +------+ +-------+ +-------+ +--------+ +---------------+ +---------+ + | | arch | | block | | certs | | crypto | | Documentation | | drivers | + | +------+ +-------+ +-------+ +--------+ +---------------+ +---------+ + | + +-------+----------+--------+---------+--------+--------+---------+ + | | | | | | | | + | v v v v v v v + | +----------+ +----+ +---------+ +------+ +-----+ +--------+ +-----+ + | | firmware | | fs | | include | | init | | ipc | | kernel | | lib | + | +----------+ +----+ +---------+ +------+ +-----+ +--------+ +-----+ + | + 
+-----+------+---------+------------+------------+------------+ + | | | | | | | + | v v v v v v + | +----+ +-----+ +---------+ +---------+ +----------+ +-------+ + | | mm | | net | | samples | | scripts | | security | | sound | + | +----+ +-----+ +---------+ +---------+ +----------+ +-------+ + | + +------+--------+--------+ + | | | + v v v + +-------+ +-----+ +------+ + | tools | | usr | | virt | + +-------+ +-----+ +------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-f45246aade5ecc7cfb71f7f103a57f95fc7c2b9e.png b/refs/pull/405/merge/_images/ditaa-f45246aade5ecc7cfb71f7f103a57f95fc7c2b9e.png new file mode 100644 index 00000000..537372dd Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-f45246aade5ecc7cfb71f7f103a57f95fc7c2b9e.png differ diff --git a/refs/pull/405/merge/_images/ditaa-f6b228332baf165f498d8a1bb0bc0bdb91ae50c5.ditaa b/refs/pull/405/merge/_images/ditaa-f6b228332baf165f498d8a1bb0bc0bdb91ae50c5.ditaa new file mode 100644 index 00000000..45f5a834 --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-f6b228332baf165f498d8a1bb0bc0bdb91ae50c5.ditaa @@ -0,0 +1,27 @@ +Userspace Kernel Kernel Userspace + T0 T0 T1 T1 + + | + | syscall +-------------------+ + V --------->| Save user regs on | +-----------------+ + interrupt | the kernel stack | | Save user regs | + +-------------------+ | on kernel stack | + | +-----------------+ + |schedule() | + | |schedule() + V | + +-----------------+ V + | context_switch |------+ +-----------------+ + +-----------------+ | | context_switch | + +-----> +-----------------+ + | + V + +-------------------+ + | Pop user regs | + | from kernel stack | + +-------------------+ + | + | exit syscall + +--------------------> | + | + V \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-f6b228332baf165f498d8a1bb0bc0bdb91ae50c5.png b/refs/pull/405/merge/_images/ditaa-f6b228332baf165f498d8a1bb0bc0bdb91ae50c5.png new file mode 100644 index 00000000..f96c9eb9 Binary files 
/dev/null and b/refs/pull/405/merge/_images/ditaa-f6b228332baf165f498d8a1bb0bc0bdb91ae50c5.png differ diff --git a/refs/pull/405/merge/_images/ditaa-f7ee56960e76c3e80fcbe59fafa38c3d93eac261.ditaa b/refs/pull/405/merge/_images/ditaa-f7ee56960e76c3e80fcbe59fafa38c3d93eac261.ditaa new file mode 100644 index 00000000..a9cba28c --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-f7ee56960e76c3e80fcbe59fafa38c3d93eac261.ditaa @@ -0,0 +1,35 @@ + +--+ +--+ +--+ + mydriver.c | | mybus.c | | bus/driver/device core | | kobject core + | | | | | | + | | | | | | + | | | | | | + | | +-----------------------------+ | | +-----------------------------+ | | + | | | my_bus_type +------=>+ struct bus_type | | | + | | +-----------------------------+ | | +-----------------------------+ | | + | | |name | | | |name | | | + | | |uevent() = my_uevent() | | | |uevent() | | | + | | |match() = my_match() | | | |match() | | | + | | +-----------------------------+ | | +-----------------------------+ | | + | | | | | | | | + | | | | +-----------------------------+ | | + | | | | | | ++----------------+ | | +-----------------------------+ | | +-----------------------------+ | | +-------------------+ +| mydriver +------=>+ struct my_driver +------->+ struct device_driver +-------+---->| struct kobject | ++----------------+ | | +-----------------------------+ | | +-----------------------------+ | | | +-------------------+ +| | | | | | | | | name | | | | | k_name | ++----------------+ | | +-----------------------------+ | | +-----------------------------+ | | | +-------------------+ + | | | my_register_driver() | | | | driver_register() | | | | | kobject_add() | + | | | my_unregister_driver() | | | | driver_unregister() | | | | | kobject_delete() | + | | +-----------------------------+ | | +-----------------------------+ | | | +-------------------+ + | | | | | | | + | | | | | | | ++----------------+ | | +-----------------------------+ | | +-----------------------------+ | | | +| mydevice +------=>+ 
struct my_device +------->+ struct device +-------+ ++----------------+ | | +-----------------------------+ | | +-----------------------------+ | | +| | | | | | | | | bus_id | | | ++----------------+ | | +-----------------------------+ | | +-----------------------------+ | | + | | | my_register_device() | | | | device_register() | | | + | | | my_unregister_device() | | | | device_unregister() | | | + | | +-----------------------------+ | | +-----------------------------+ | | + | | | | | | + +--+ +--+ +--+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-f7ee56960e76c3e80fcbe59fafa38c3d93eac261.png b/refs/pull/405/merge/_images/ditaa-f7ee56960e76c3e80fcbe59fafa38c3d93eac261.png new file mode 100644 index 00000000..9c1194e0 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-f7ee56960e76c3e80fcbe59fafa38c3d93eac261.png differ diff --git a/refs/pull/405/merge/_images/ditaa-f8fcc760ef5dad50d1038ed3426d0fcce12fd3e6.ditaa b/refs/pull/405/merge/_images/ditaa-f8fcc760ef5dad50d1038ed3426d0fcce12fd3e6.ditaa new file mode 100644 index 00000000..aeda4a8a --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-f8fcc760ef5dad50d1038ed3426d0fcce12fd3e6.ditaa @@ -0,0 +1,19 @@ + VM1 (qemu) VM2 (qemu) ++---------------------+ +---------------------+ +| +------+ +------+ | | +------+ +------+ | +| | App1 | | App2 | | | | App1 | | App2 | | +| +------+ +------+ | | +------+ +------+ | +| +-----------------+ | | +-----------------+ | +| | Guest Kernel | | | | Guest Kernel | | +| +-----------------+ | | +-----------------+ | ++---------------------+ +---------------------+ + ++----------------------------------------------------+ +| +-----+ | +| | KVM | Host Linux Kernel | +| +-----+ | ++----------------------------------------------------+ + ++----------------------------------------------------+ +| Hardware with virtualization support | ++----------------------------------------------------+ \ No newline at end of file diff --git 
a/refs/pull/405/merge/_images/ditaa-f8fcc760ef5dad50d1038ed3426d0fcce12fd3e6.png b/refs/pull/405/merge/_images/ditaa-f8fcc760ef5dad50d1038ed3426d0fcce12fd3e6.png new file mode 100644 index 00000000..57d3e212 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-f8fcc760ef5dad50d1038ed3426d0fcce12fd3e6.png differ diff --git a/refs/pull/405/merge/_images/ditaa-fbe06955ffc165cbdc9cb6074abf0db807b3c5cd.ditaa b/refs/pull/405/merge/_images/ditaa-fbe06955ffc165cbdc9cb6074abf0db807b3c5cd.ditaa new file mode 100644 index 00000000..5febcc5e --- /dev/null +++ b/refs/pull/405/merge/_images/ditaa-fbe06955ffc165cbdc9cb6074abf0db807b3c5cd.ditaa @@ -0,0 +1,19 @@ ++---------------------------------------------------------+ +| application programming (EGC, SPG, PP, SPRC, IOC, etc.) | ++---------------------------------------------------------+ + + +----------------------------------+ + | system programming (PC, SO, CPL) | + +----------------------------------+ + user space +----------------------------------------------------------=- + kernel space + +--------------------------+ + | kernel programming (SO2) | + +--------------------------+ + +----------------------------------------------------------=- + + +----------------------------------+ + | hardware (PM, CN1, CN2, PL ) | + +----------------------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-fbe06955ffc165cbdc9cb6074abf0db807b3c5cd.png b/refs/pull/405/merge/_images/ditaa-fbe06955ffc165cbdc9cb6074abf0db807b3c5cd.png new file mode 100644 index 00000000..be0bf3de Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-fbe06955ffc165cbdc9cb6074abf0db807b3c5cd.png differ diff --git a/refs/pull/405/merge/_images/ditaa-fd771038e88b95def30ae9bd4df0b7bd6b7b3503.ditaa b/refs/pull/405/merge/_images/ditaa-fd771038e88b95def30ae9bd4df0b7bd6b7b3503.ditaa new file mode 100644 index 00000000..1ce957b6 --- /dev/null +++ 
b/refs/pull/405/merge/_images/ditaa-fd771038e88b95def30ae9bd4df0b7bd6b7b3503.ditaa @@ -0,0 +1,16 @@ + Opened files + task_struct +-------------------+ task_struct ++-----------------------+ | FILE | +-----------------------+ +| Thread Group ID (PID) | +--->+-------------------+<---+ | Thread Group ID (PID) | ++-----------------------+ | | .... | | +-----------------------+ +| Thread ID (TID) | | +-------------------+ | | Thread ID (TID) | ++-----------------------+ | | +-----------------------+ +| ... | | | | ... | ++-----------------------+ | | +-----------------------+ +| Opened files |--+ +--| Opened files | ++-----------------------+ Address Space +-----------------------+ +| Address Space |---+ +-------------------+ +---| Address Space | ++-----------------------+ | | | | +-----------------------+ +| ... | +-->| .... |<--+ | ... | ++-----------------------+ | | +-----------------------+ + +-------------------+ \ No newline at end of file diff --git a/refs/pull/405/merge/_images/ditaa-fd771038e88b95def30ae9bd4df0b7bd6b7b3503.png b/refs/pull/405/merge/_images/ditaa-fd771038e88b95def30ae9bd4df0b7bd6b7b3503.png new file mode 100644 index 00000000..0060f6e7 Binary files /dev/null and b/refs/pull/405/merge/_images/ditaa-fd771038e88b95def30ae9bd4df0b7bd6b7b3503.png differ diff --git a/refs/pull/405/merge/_images/dts_node.png b/refs/pull/405/merge/_images/dts_node.png new file mode 100644 index 00000000..2404462c Binary files /dev/null and b/refs/pull/405/merge/_images/dts_node.png differ diff --git a/refs/pull/405/merge/_images/dts_node1.png b/refs/pull/405/merge/_images/dts_node1.png new file mode 100644 index 00000000..2404462c Binary files /dev/null and b/refs/pull/405/merge/_images/dts_node1.png differ diff --git a/refs/pull/405/merge/_images/fib-trie-compressed.png b/refs/pull/405/merge/_images/fib-trie-compressed.png new file mode 100644 index 00000000..44235ff5 Binary files /dev/null and b/refs/pull/405/merge/_images/fib-trie-compressed.png differ diff --git 
a/refs/pull/405/merge/_images/fib-trie-compressed1.png b/refs/pull/405/merge/_images/fib-trie-compressed1.png new file mode 100644 index 00000000..44235ff5 Binary files /dev/null and b/refs/pull/405/merge/_images/fib-trie-compressed1.png differ diff --git a/refs/pull/405/merge/_images/fib-trie.png b/refs/pull/405/merge/_images/fib-trie.png new file mode 100644 index 00000000..f0da22f1 Binary files /dev/null and b/refs/pull/405/merge/_images/fib-trie.png differ diff --git a/refs/pull/405/merge/_images/fib-trie1.png b/refs/pull/405/merge/_images/fib-trie1.png new file mode 100644 index 00000000..f0da22f1 Binary files /dev/null and b/refs/pull/405/merge/_images/fib-trie1.png differ diff --git a/refs/pull/405/merge/_images/fidb-details.png b/refs/pull/405/merge/_images/fidb-details.png new file mode 100644 index 00000000..c146bc3c Binary files /dev/null and b/refs/pull/405/merge/_images/fidb-details.png differ diff --git a/refs/pull/405/merge/_images/fidb-details1.png b/refs/pull/405/merge/_images/fidb-details1.png new file mode 100644 index 00000000..c146bc3c Binary files /dev/null and b/refs/pull/405/merge/_images/fidb-details1.png differ diff --git a/refs/pull/405/merge/_images/fidb-overview.png b/refs/pull/405/merge/_images/fidb-overview.png new file mode 100644 index 00000000..a9f86970 Binary files /dev/null and b/refs/pull/405/merge/_images/fidb-overview.png differ diff --git a/refs/pull/405/merge/_images/fidb-overview1.png b/refs/pull/405/merge/_images/fidb-overview1.png new file mode 100644 index 00000000..a9f86970 Binary files /dev/null and b/refs/pull/405/merge/_images/fidb-overview1.png differ diff --git a/refs/pull/405/merge/_images/inspect_task_struct.cast b/refs/pull/405/merge/_images/inspect_task_struct.cast new file mode 100644 index 00000000..52cb2df6 --- /dev/null +++ b/refs/pull/405/merge/_images/inspect_task_struct.cast @@ -0,0 +1,849 @@ +{"version": 2, "width": 80, "height": 24, "timestamp": 1615763059, "idle_time_limit": 1.0, "env": {"SHELL": 
null, "TERM": "xterm"}} +[0.002258, "o", "$ "] +[0.627367, "o", "m"] +[0.68335, "o", "a"] +[0.786756, "o", "k"] +[0.835143, "o", "e"] +[0.987307, "o", " "] +[1.211073, "o", "g"] +[1.44347, "o", "d"] +[1.931409, "o", "b"] +[2.419393, "o", "\r\n"] +[2.425118, "o", "gdb -ex \"target remote localhost:1234\" /linux/vmlinux\r\n"] +[2.46016, "o", "\u001b[35;1m\u001b[35;1mGNU gdb \u001b[m\u001b[35;1m(Ubuntu 9.2-0ubuntu1~20.04) \u001b[m\u001b[35;1m9.2\u001b[m\u001b[35;1m\r\n\u001b[m\u001b[mCopyright (C) 2020 Free Software Foundation, Inc.\r\nLicense GPLv3+: GNU GPL version 3 or later \r\nThis is free software: you are free to change and redistribute it.\r\nThere is NO WARRANTY, to the extent permitted by law.\r\nType \"show copying\" and \"show warranty\" for details.\r\nThis GDB was configured as \"x86_64-linux-gnu\".\r\nType \"show configuration\" for configuration details.\r\nFor bug reporting instructions, please see:\r\n.\r\nFind the GDB manual and other documentation resources online at:\r\n .\r\n\r\nFor help, type \"help\".\r\nType \"apropos word\" to search for commands related to \"word\"...\r\n"] +[2.460424, "o", "Reading symbols from \u001b[32m/linux/vmlinux\u001b[m...\r\n"] +[3.047524, "o", "Remote debugging using localhost:1234\r\n"] +[3.067155, "o", "\u001b[33m__lock_acquire\u001b[m (\u001b[36mlock=lock@entry\u001b[m=0xc2416250, \u001b[36msubclass=subclass@entry\u001b[m=0, \u001b[m\r\n"] +[3.067502, "o", " \u001b[m\u001b[36mtrylock=trylock@entry\u001b[m=0, \u001b[36mread=read@entry\u001b[m=0, \u001b[36mcheck=check@entry\u001b[m=1, \u001b[m\r\n"] +[3.06766, "o", "\u001b[m--Type for more, q to quit, c to continue without paging--"] +[4.708007, "o", "\r\n"] +[4.708796, "o", " \u001b[m\u001b[36mhardirqs_off=hardirqs_off@entry\u001b[m=1, \u001b[36mnest_lock=nest_lock@entry\u001b[m=0x0, \u001b[m\r\n \u001b[m\u001b[36mip=ip@entry\u001b[m=3241676808, \u001b[36mreferences=references@entry\u001b[m=0, \u001b[m\r\n"] +[4.708863, "o", " 
\u001b[m\u001b[36mpin_count=pin_count@entry\u001b[m=0) at \u001b[32mkernel/locking/lockdep.c\u001b[m:4738\r\n"] +[4.709098, "o", "4738\t\tclass_idx = class - lock_classes;\r\n"] +[4.70932, "o", "(gdb) "] +[6.809114, "o", "l"] +[6.995472, "o", "x"] +[7.138947, "o", "-"] +[7.331348, "o", "p"] +[7.894323, "o", "s"] +[11.123106, "o", "\r\n"] +[11.123178, "o", " TASK PID COMM\r\n"] +[11.12485, "o", "0xc17d02c0 0 swapper/0\r\n"] +[11.126522, "o", "0xc2530040 1 swapper/0\r\n"] +[11.127872, "o", "0xc2534080 2 kthreadd\r\n"] +[11.129364, "o", "0xc25360c0 3 rcu_gp\r\n"] +[11.13074, "o", "0xc2537100 4 rcu_par_gp\r\n"] +[11.132056, "o", "0xc2545140 5 kworker/0:0\r\n"] +[11.133156, "o", "0xc2546180 6 kworker/0:0H\r\n"] +[11.134051, "o", "0xc25481c0 7 kworker/u2:0\r\n"] +[11.135046, "o", "0xc2549000 8 mm_percpu_wq\r\n"] +[11.135873, "o", "0xc254b040 9 ksoftirqd/0\r\n"] +[11.136804, "o", "0xc254c080 10 rcu_sched\r\n"] +[11.137649, "o", "0xc254e0c0 11 migration/0\r\n"] +[11.138414, "o", "0xc2572100 12 cpuhp/0\r\n"] +[11.139254, "o", "0xc2576140 13 kdevtmpfs\r\n"] +[11.140061, "o", "0xc2594180 14 netns\r\n"] +[11.140849, "o", "0xc26211c0 15 oom_reaper\r\n"] +[11.141609, "o", "0xc2623000 16 writeback\r\n"] +[11.142354, "o", "0xc26300c0 32 kblockd\r\n"] +[11.143193, "o", "\u001b[m--Type for more, q to quit, c to continue without paging--"] +[13.172107, "o", "q"] +[13.386945, "o", "\r\n"] +[13.387104, "o", "Quit\r\n(gdb) "] +[13.924017, "o", "#"] +[14.128394, "o", " "] +[14.691413, "o", "n"] +[14.777001, "o", "o"] +[15.024956, "o", "t"] +[15.200252, "o", "i"] +[15.658237, "o", "\b\u001b[K"] +[15.800549, "o", "\b\u001b[K"] +[18.149577, "o", "\b\u001b[K"] +[18.707978, "o", "\b\u001b[K"] +[18.992922, "o", "l"] +[19.107602, "o", "e"] +[19.28165, "o", "t"] +[19.505188, "o", "s"] +[19.600963, "o", " "] +[19.804348, "o", "l"] +[20.000551, "o", "o"] +[20.142601, "o", "o"] +[20.171715, "o", "k"] +[20.511567, "o", " "] +[20.691626, "o", "a"] +[20.998173, "o", "t"] +[21.083584, "o", " "] 
+[21.211514, "o", "t"] +[21.323576, "o", "h"] +[21.707293, "o", "e"] +[21.835442, "o", " "] +[23.451569, "o", "f"] +[23.621036, "o", "i"] +[23.690171, "o", "r"] +[23.8906, "o", "s"] +[24.099481, "o", "t"] +[24.459424, "o", " "] +[24.640336, "o", "t"] +[25.216437, "o", "a"] +[25.444316, "o", "s"] +[25.547244, "o", "k"] +[27.204116, "o", "\r\n(gdb) "] +[30.617187, "o", "p"] +[30.764631, "o", "r"] +[30.859261, "o", "i"] +[30.923074, "o", "n"] +[30.971535, "o", "t"] +[31.101722, "o", " "] +[31.377345, "o", "("] +[31.49992, "o", "s"] +[31.665037, "o", "t"] +[31.7159, "o", "r"] +[31.797039, "o", "u"] +[31.891209, "o", "c"] +[32.134894, "o", "t"] +[32.275529, "o", " "] +[32.38782, "o", "t"] +[32.593072, "o", "a"] +[32.670072, "o", "s"] +[32.723656, "o", "k"] +[32.97944, "o", "_"] +[33.411209, "o", "s"] +[33.571094, "o", "t"] +[33.633386, "o", "r"] +[33.723362, "o", "u"] +[33.819607, "o", "c"] +[34.03609, "o", "t"] +[34.243481, "o", " "] +[34.8279, "o", "*"] +[34.939317, "o", ")"] +[41.353684, "o", "0xc17d02"] +[41.353931, "o", "c0"] +[42.556284, "o", "\r\n"] +[42.572593, "o", "$1 = (struct task_struct *) \u001b[34m0xc17d02c0\u001b[m <\u001b[33minit_task\u001b[m>\r\n(gdb) "] +[43.76609, "o", " "] +[44.144961, "o", "\b\u001b[K"] +[44.404337, "o", "#"] +[44.491671, "o", " "] +[44.729417, "o", "n"] +[44.776973, "o", "o"] +[44.964074, "o", "t"] +[45.020048, "o", "i"] +[45.220228, "o", "c"] +[45.307108, "o", "e"] +[45.403404, "o", " "] +[45.579136, "o", "t"] +[45.651669, "o", "h"] +[45.752831, "o", "a"] +[45.851754, "o", "t"] +[45.932833, "o", " "] +[46.080052, "o", "t"] +[46.189538, "o", "h"] +[46.331405, "o", "e"] +[46.464752, "o", " "] +[46.737976, "o", "t"] +[47.031178, "o", "s"] +[47.034726, "o", "a"] +[47.488137, "o", "\b\u001b[K"] +[47.656792, "o", "\b\u001b[K"] +[47.731352, "o", "a"] +[47.842274, "o", "s"] +[47.879582, "o", "k"] +[48.063247, "o", " "] +[48.304771, "o", "i"] +[48.451198, "o", "s"] +[48.568817, "o", " "] +[48.675509, "o", "a"] +[48.836947, "o", "l"] 
+[48.985894, "o", "l"] +[49.180376, "o", "o"] +[49.335982, "o", "c"] +[49.427444, "o", "a"] +[49.635823, "o", "t"] +[49.691801, "o", "e"] +[49.881213, "o", "d"] +[49.964905, "o", " "] +[50.099764, "o", "d"] +[50.226974, "o", "i"] +[50.321136, "o", "r"] +[50.388547, "o", "e"] +[50.569383, "o", "c"] +[50.777146, "o", "t"] +[50.811594, "o", "l"] +[51.049026, "o", "y"] +[51.154598, "o", " "] +[51.316279, "o", "i"] +[51.381979, "o", "n"] +[51.432314, "o", " "] +[51.556065, "o", "t"] +[51.661287, "o", "h"] +[51.783469, "o", "e"] +[51.867781, "o", " "] +[52.096542, "o", "i"] +[52.241698, "o", "m"] +[52.379699, "o", "a"] +[52.616666, "o", "g"] +[52.684999, "o", "e"] +[54.076647, "o", "\r\n"] +[54.076706, "o", "(gdb) "] +[54.548115, "o", "#"] +[55.481235, "o", " "] +[55.929872, "o", "i"] +[56.729007, "o", "\b\u001b[K"] +[56.836286, "o", "t"] +[56.965048, "o", "h"] +[57.020301, "o", "i"] +[57.152388, "o", " "] +[57.361979, "o", "i"] +[57.484185, "o", "s"] +[58.104718, "o", "\b\u001b[K"] +[58.211525, "o", "\b\u001b[K"] +[58.345772, "o", "\b\u001b[K"] +[58.599277, "o", "s"] +[58.699061, "o", " "] +[58.872073, "o", "i"] +[59.024165, "o", "s"] +[59.121158, "o", " "] +[59.261122, "o", "t"] +[59.342248, "o", "h"] +[59.959054, "o", "e"] +[60.096489, "o", " "] +[60.195746, "o", "f"] +[60.316017, "o", "i"] +[60.40269, "o", "r"] +[60.595528, "o", "s"] +[60.795261, "o", "t"] +[60.875222, "o", " "] +[61.023882, "o", "t"] +[61.219665, "o", "a"] +[61.294759, "o", "s"] +[61.699194, "o", "k"] +[61.947333, "o", " "] +[62.186605, "o", "t"] +[62.292669, "o", "h"] +[62.383993, "o", "a"] +[62.462394, "o", "t"] +[62.611368, "o", " "] +[63.073466, "o", "r"] +[63.232473, "o", "u"] +[63.350378, "o", "n"] +[64.143825, "o", " "] +[64.346129, "o", "d"] +[64.457352, "o", "u"] +[64.594958, "o", "r"] +[64.667077, "o", "i"] +[64.739904, "o", "n"] +[64.819518, "o", "g"] +[64.932011, "o", " "] +[65.416451, "o", "b"] +[65.50703, "o", "o"] +[65.61947, "o", "o"] +[65.723128, "o", "t"] +[66.372533, "o", "\r\n"] 
+[66.37262, "o", "(gdb) "] +[67.489239, "o", "#"] +[67.69126, "o", " "] +[67.812288, "o", "a"] +[67.939709, "o", "l"] +[68.155397, "o", "s"] +[68.245108, "o", "o"] +[68.428805, "o", " "] +[68.619862, "o", "n"] +[68.642887, "o", "o"] +[68.779469, "o", "t"] +[68.88901, "o", "i"] +[69.043262, "o", "c"] +[69.136781, "o", "e"] +[69.281437, "o", " "] +[69.443731, "o", "t"] +[69.531328, "o", "h"] +[69.659822, "o", "a"] +[69.779992, "o", "t"] +[69.896515, "o", " "] +[70.019759, "o", "t"] +[70.13192, "o", "h"] +[70.217266, "o", "e"] +[70.315651, "o", "r"] +[70.40667, "o", "e"] +[70.467033, "o", " "] +[70.605909, "o", "i"] +[70.709012, "o", "s"] +[70.797622, "o", " "] +[70.954744, "o", "n"] +[71.032528, "o", "o"] +[71.337134, "o", " "] +[71.504564, "o", "i"] +[71.604101, "o", "n"] +[71.680536, "o", "i"] +[71.803535, "o", "t"] +[71.883327, "o", " "] +[72.003128, "o", "t"] +[72.203373, "o", "a"] +[72.295758, "o", "s"] +[72.399629, "o", "k"] +[72.575918, "o", " "] +[72.979851, "o", "y"] +[73.075825, "o", "e"] +[73.264132, "o", "t"] +[73.611752, "o", ","] +[73.708124, "o", " "] +[73.803574, "o", "s"] +[73.955095, "o", "i"] +[74.019777, "o", "n"] +[74.051016, "o", "c"] +[74.141735, "o", "e"] +[74.237855, "o", " "] +[74.355776, "o", "w"] +[74.420644, "o", "e"] +[74.515071, "o", " "] +[74.600642, "o", "a"] +[74.74771, "o", "r"] +[74.836435, "o", "e"] +[74.907223, "o", " "] +[75.018533, "o", "s"] +[75.192824, "o", "t"] +[75.235418, "o", "i"] +[75.404227, "o", "l"] +[75.518303, "o", "l"] +[75.571386, "o", " "] +[75.68698, "o", "b"] +[75.779911, "o", "o"] +[75.907538, "o", "o"] +[75.956027, "o", "t"] +[76.099746, "o", "i"] +[76.180981, "o", "n"] +[76.204263, "o", "g"] +[76.94757, "o", "\r\n(gdb) "] +[77.465313, "o", "#"] +[77.60323, "o", " "] +[77.771746, "o", "l"] +[77.840755, "o", "e"] +[78.01651, "o", "t"] +[78.19322, "o", "s"] +[78.28195, "o", " "] +[78.451553, "o", "w"] +[78.720865, "o", "a"] +[78.83536, "o", "i"] +[78.971292, "o", "t"] +[79.059111, "o", " "] +[79.184909, "o", 
"f"] +[79.268223, "o", "o"] +[79.37142, "o", "r"] +[79.467247, "o", " "] +[79.5852, "o", "a"] +[79.675245, "o", " "] +[79.827752, "o", "b"] +[79.944592, "o", "i"] +[80.027415, "o", "t"] +[80.187537, "o", "\r\n(gdb) "] +[80.562931, "o", "c"] +[80.795397, "o", "\r\nContinuing.\r\n"] +[121.082722, "o", "^C"] +[121.089999, "o", "\r\nProgram received signal SIGINT, Interrupt.\r\n"] +[121.090173, "o", "\u001b[33mdefault_idle\u001b[m () at \u001b[32march/x86/kernel/process.c\u001b[m:689\r\n689\t}\r\n(gdb) "] +[123.106153, "o", "#"] +[123.405941, "o", " "] +[123.979574, "o", "w"] +[124.068273, "o", "e"] +[124.159089, "o", " "] +[124.51651, "o", "h"] +[124.596335, "o", "i"] +[124.801487, "o", "t"] +[124.932552, "o", " "] +[125.043099, "o", "t"] +[125.187264, "o", "h"] +[125.342082, "o", "e"] +[125.551746, "o", " "] +[126.091803, "o", "i"] +[126.283016, "o", "d"] +[126.38368, "o", "e"] +[126.611562, "o", "l"] +[127.383539, "o", "\b\u001b[K"] +[127.523397, "o", "\b\u001b[K"] +[127.764257, "o", "l"] +[127.885901, "o", "e"] +[128.102388, "o", " "] +[128.628478, "o", "f"] +[128.755103, "o", "u"] +[128.827608, "o", "n"] +[128.899218, "o", "c"] +[129.159741, "o", "t"] +[129.229055, "o", "i"] +[129.266534, "o", "o"] +[129.52278, "o", "n"] +[129.603238, "o", " "] +[129.845, "o", "s"] +[129.95226, "o", "o"] +[130.635332, "o", " "] +[132.650977, "o", "t"] +[132.764212, "o", "h"] +[132.861951, "o", "e"] +[133.002149, "o", " "] +[133.108483, "o", "s"] +[133.264264, "o", "y"] +[133.32105, "o", "s"] +[133.512254, "o", "t"] +[133.577376, "o", "e"] +[133.610919, "o", "m"] +[133.724743, "o", " "] +[133.8519, "o", "p"] +[133.940929, "o", "r"] +[134.043108, "o", "o"] +[134.163485, "o", "b"] +[134.249323, "o", "a"] +[134.360582, "o", "b"] +[134.458498, "o", "l"] +[134.725093, "o", "y"] +[134.796654, "o", " "] +[135.075413, "o", "b"] +[135.132523, "o", "o"] +[135.253576, "o", "o"] +[135.420718, "o", "t"] +[135.572909, "o", "e"] +[135.732266, "o", "d"] +[135.88038, "o", " "] +[136.124706, "o", 
"u"] +[136.180361, "o", "p"] +[136.691338, "o", "\r\n(gdb) "] +[137.131348, "o", "l"] +[137.857637, "o", "s"] +[138.377273, "o", "-"] +[139.249675, "o", "\b\u001b[K"] +[139.352566, "o", "\b\u001b[K"] +[139.431785, "o", "x"] +[139.546652, "o", "-"] +[139.792902, "o", "p"] +[139.980251, "o", "s"] +[140.380458, "o", "\r\n"] +[140.380619, "o", " TASK PID COMM\r\n"] +[140.381038, "o", "0xc17d02c0 0 swapper/0\r\n"] +[140.382456, "o", "0xc2530040 1 init\r\n"] +[140.383557, "o", "0xc2534080 2 kthreadd\r\n"] +[140.384521, "o", "0xc25360c0 3 rcu_gp\r\n"] +[140.385521, "o", "0xc2537100 4 rcu_par_gp\r\n"] +[140.38645, "o", "0xc2545140 5 kworker/0:0\r\n"] +[140.387399, "o", "0xc2546180 6 kworker/0:0H\r\n"] +[140.388317, "o", "0xc25481c0 7 kworker/u2:0\r\n"] +[140.389206, "o", "0xc2549000 8 mm_percpu_wq\r\n"] +[140.390069, "o", "0xc254b040 9 ksoftirqd/0\r\n"] +[140.390903, "o", "0xc254c080 10 rcu_sched\r\n"] +[140.391626, "o", "0xc254e0c0 11 migration/0\r\n"] +[140.392543, "o", "0xc2572100 12 cpuhp/0\r\n"] +[140.393323, "o", "0xc2576140 13 kdevtmpfs\r\n"] +[140.394242, "o", "0xc2594180 14 netns\r\n"] +[140.395183, "o", "0xc26211c0 15 oom_reaper\r\n"] +[140.396009, "o", "0xc2623000 16 writeback\r\n"] +[140.39683, "o", "0xc26300c0 32 kblockd\r\n"] +[140.397681, "o", "\u001b[m--Type for more, q to quit, c to continue without paging--"] +[141.757204, "o", "q"] +[142.547532, "o", "\r\n"] +[142.547753, "o", "Quit\r\n(gdb) "] +[143.850961, "o", "#"] +[144.070245, "o", " "] +[144.195434, "o", "w"] +[144.279713, "o", "e"] +[144.408497, "o", " "] +[144.552695, "o", "d"] +[144.65939, "o", "o"] +[144.880631, "o", " "] +[145.079566, "o", "s"] +[145.195271, "o", "e"] +[145.368428, "o", "e"] +[145.607373, "o", "t"] +[145.79615, "o", " "] +[145.89913, "o", "t"] +[146.045195, "o", "h"] +[146.09891, "o", "e"] +[146.227914, "o", " "] +[146.463823, "o", "i"] +[146.543985, "o", "n"] +[146.618976, "o", "i"] +[146.707111, "o", "t"] +[146.83548, "o", " "] +[146.945151, "o", "t"] +[147.163454, "o", "a"] 
+[147.235326, "o", "s"] +[147.425145, "o", "k"] +[148.247683, "o", ","] +[148.376176, "o", " "] +[148.526531, "o", "s"] +[148.616468, "o", "o"] +[148.722313, "o", " "] +[148.904045, "o", "l"] +[148.97297, "o", "e"] +[149.130799, "o", "t"] +[149.367908, "o", "s"] +[149.544347, "o", " "] +[150.095879, "o", "\b\u001b[K"] +[150.228108, "o", "\b\u001b[K"] +[150.347046, "o", "\b\u001b[K"] +[151.10371, "o", "t"] +[151.28464, "o", "s"] +[151.378986, "o", " "] +[151.536173, "o", "i"] +[151.64024, "o", "n"] +[151.907639, "o", "s"] +[152.111461, "o", "p"] +[152.219417, "o", "e"] +[152.33727, "o", "c"] +[152.609681, "o", "t"] +[152.691632, "o", " "] +[152.883529, "o", "i"] +[153.039608, "o", "t"] +[154.483232, "o", "\r\n"] +[154.483296, "o", "(gdb) "] +[158.688904, "o", "\r\u001b[C\u001b[16@reverse-i-search)`':\u001b[C"] +[159.808627, "o", "\b\b\bp': # we do see the init task, so lets inspect it\b\b\b\b\b\b\b"] +[159.973821, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cr': # we hit the idle function so the system probably booted up\u001b[A\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[160.059576, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[17Pi': print (struct task_struct 
*)0xc17d02c0\r\n\r\u001b[K\u001b[A\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[160.123058, "o", "\b\b\b\u001b[1@n\u001b[C\u001b[C\u001b[C"] +[160.189924, "o", "\b\b\b\u001b[1@t\u001b[C\u001b[C\u001b[C"] +[160.92358, "o", "\r\u001b[C\u001b[21Pgdb)\u001b[C"] +[161.345471, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[162.195311, "o", "\b\b\b\b\b\b\b\b\b\b\u001b[K"] +[166.763546, "o", "0xc2530040"] +[167.811211, "o", "\r\n"] +[167.812061, "o", "$2 = (struct task_struct *) \u001b[34m0xc2530040\u001b[m\r\n(gdb) "] +[170.690863, "o", "print (struct task_struct *)0xc2530040"] +[170.960299, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[171.19283, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[171.767394, "o", "\u001b[C"] +[172.24119, "o", "\u001b[C\u001b[1@(\b"] +[172.752175, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[173.806767, "o", ")"] +[174.75601, "o", "-"] +[175.107407, "o", ">"] +[176.273234, "o", "i"] +[176.403267, "o", "n"] +[176.459331, "o", "i"] +[176.571028, "o", "t"] +[176.939329, "o", "\r\n"] +[176.949546, "o", "There is no member named init.\r\n(gdb) "] +[177.801366, "o", "print ((struct task_struct *)0xc2530040)->init"] +[178.822566, "o", "\b\u001b[K"] +[178.959575, "o", "\b\u001b[K"] +[179.083502, "o", "\b\u001b[K"] +[179.220157, "o", "\b\u001b[K"] +[179.455399, "o", "p"] +[179.55676, "o", "i"] 
+[179.637515, "o", "d"] +[179.812349, "o", "\r\n"] +[179.828661, "o", "$3 = 1\r\n(gdb) "] +[180.470819, "o", "print ((struct task_struct *)0xc2530040)->pid"] +[181.504206, "o", "\b\b\b\u001b[K"] +[182.272216, "o", "c"] +[182.336938, "o", "o"] +[182.535532, "o", "m"] +[182.634755, "o", "m"] +[183.843339, "o", "\r\n"] +[183.859914, "o", "$4 = \"init\\000er/0\\000\\000\\000\\000\\000\\000\"\r\n(gdb) "] +[185.336129, "o", "#"] +[185.499751, "o", " "] +[185.690685, "o", "l"] +[185.843492, "o", "e"] +[186.031657, "o", "t"] +[186.27479, "o", "s"] +[186.543226, "o", " "] +[186.736697, "o", "t"] +[186.907736, "o", "r"] +[187.010908, "o", "y"] +[187.22462, "o", " "] +[187.408223, "o", "t"] +[187.497188, "o", "o"] +[187.602223, "o", " "] +[187.680452, "o", "t"] +[188.29134, "o", "\b\u001b[K"] +[188.386986, "o", "c"] +[188.499959, "o", "h"] +[188.586567, "o", "a"] +[188.691207, "o", "n"] +[188.76476, "o", "g"] +[188.81181, "o", "e"] +[188.898677, "o", " "] +[189.035371, "o", "t"] +[189.092408, "o", "h"] +[189.223294, "o", "e"] +[189.304291, "o", " "] +[189.442809, "o", "n"] +[189.508884, "o", "a"] +[189.656275, "o", "m"] +[189.76372, "o", "e"] +[190.207667, "o", " "] +[190.430618, "o", "o"] +[190.533474, "o", "f"] +[190.650813, "o", " "] +[190.834946, "o", "i"] +[190.930808, "o", "n"] +[191.033332, "o", "i"] +[191.507208, "o", "\b\u001b[K"] +[191.632261, "o", "\b\u001b[K"] +[191.759484, "o", "\b\u001b[K"] +[191.875256, "o", "t"] +[191.955649, "o", "h"] +[192.066747, "o", "e"] +[192.139401, "o", " "] +[192.263673, "o", "i"] +[192.330469, "o", "n"] +[192.411275, "o", "i"] +[192.486134, "o", "t"] +[192.603117, "o", " "] +[192.760821, "o", "t"] +[193.200036, "o", "a"] +[193.312331, "o", "s"] +[193.394453, "o", "k"] +[193.64356, "o", "\r\n(gdb) "] +[194.059159, "o", "# lets try to change the name of the init task"] +[194.200533, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cprint ((struct task_struct *)0xc2530040)->comm"] +[194.696871, "o", 
"\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[195.001944, "o", "\u001b[1P"] +[195.130213, "o", "\u001b[1P"] +[195.25889, "o", "\u001b[1P"] +[195.371085, "o", "\u001b[1P"] +[195.467539, "o", "\u001b[1@s"] +[195.576466, "o", "\u001b[1@e"] +[195.674887, "o", "\u001b[C\u001b[1@t\b"] +[196.091385, "o", "\u001b[1P"] +[196.667968, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[197.372014, "o", "="] +[197.836566, "o", "\""] +[198.397868, "o", "m"] +[198.641201, "o", "y"] +[198.786448, "o", " "] +[199.202986, "o", "i"] +[199.328892, "o", "n"] +[199.399589, "o", "i"] +[199.546621, "o", "t"] +[199.953016, "o", "\""] +[200.507538, "o", "\r\n"] +[200.509222, "o", "(gdb) "] +[201.635061, "o", "l"] +[202.13135, "o", "x"] +[202.315347, "o", "-"] +[202.499865, "o", "p"] +[202.599957, "o", "s"] +[203.002948, "o", "\r\n"] +[203.003048, "o", " TASK PID COMM\r\n"] +[203.00358, "o", "0xc17d02c0 0 swapper/0\r\n"] +[203.005085, "o", "0xc2530040 1 my init\r\n"] +[203.006102, "o", "0xc2534080 2 kthreadd\r\n"] +[203.006929, "o", "0xc25360c0 3 rcu_gp\r\n"] +[203.007705, "o", "0xc2537100 4 rcu_par_gp\r\n"] +[203.008617, "o", "0xc2545140 5 kworker/0:0\r\n"] +[203.009353, "o", "0xc2546180 6 kworker/0:0H\r\n"] +[203.010181, "o", "0xc25481c0 7 kworker/u2:0\r\n"] +[203.011118, "o", "0xc2549000 8 mm_percpu_wq\r\n"] +[203.01204, "o", "0xc254b040 9 ksoftirqd/0\r\n"] +[203.012888, "o", "0xc254c080 10 rcu_sched\r\n"] +[203.013708, "o", "0xc254e0c0 11 migration/0\r\n"] +[203.014465, "o", "0xc2572100 12 cpuhp/0\r\n"] +[203.015189, "o", "0xc2576140 13 kdevtmpfs\r\n"] +[203.015958, "o", "0xc2594180 14 netns\r\n"] +[203.016734, "o", "0xc26211c0 15 oom_reaper\r\n"] +[203.017489, "o", "0xc2623000 16 
writeback\r\n"] +[203.018341, "o", "0xc26300c0 32 kblockd\r\n"] +[203.019184, "o", "\u001b[m--Type for more, q to quit, c to continue without paging--"] +[204.630436, "o", "q"] +[205.219018, "o", "\r\n"] +[205.21918, "o", "Quit\r\n(gdb) "] +[205.798038, "o", "#"] +[206.120012, "o", " "] +[206.779589, "o", "i"] +[206.88812, "o", "t"] +[206.992104, "o", " "] +[207.15961, "o", "l"] +[207.320891, "o", "o"] +[207.459826, "o", "o"] +[207.53892, "o", "k"] +[208.587042, "o", "s"] +[208.698803, "o", " "] +[208.831616, "o", "l"] +[209.003424, "o", "i"] +[209.131421, "o", "k"] +[209.216867, "o", "e"] +[209.312937, "o", " "] +[209.428128, "o", "i"] +[209.555208, "o", "t"] +[209.604274, "o", " "] +[209.818705, "o", "w"] +[209.898874, "o", "o"] +[210.09889, "o", "r"] +[210.579589, "o", "k"] +[210.714357, "o", "e"] +[210.840035, "o", "d"] +[210.966315, "o", ","] +[211.052435, "o", " "] +[211.231717, "o", "l"] +[211.310902, "o", "e"] +[211.454859, "o", "t"] +[211.664297, "o", "s"] +[211.723802, "o", " "] +[211.882806, "o", "v"] +[211.998969, "o", "e"] +[212.051506, "o", "r"] +[212.232489, "o", "i"] +[212.359529, "o", "f"] +[212.522225, "o", "y"] +[212.626748, "o", " "] +[212.871092, "o", "o"] +[212.971579, "o", "n"] +[213.059427, "o", " "] +[213.146803, "o", "t"] +[213.258418, "o", "h"] +[213.352468, "o", "e"] +[213.426922, "o", " "] +[213.58702, "o", "s"] +[213.627031, "o", "e"] +[213.712627, "o", "r"] +[213.770319, "o", "i"] +[213.875246, "o", "a"] +[213.970606, "o", "l"] +[214.04505, "o", " "] +[214.281377, "o", "t"] +[214.368606, "o", "e"] +[214.451128, "o", "r"] +[214.525736, "o", "m"] +[214.626505, "o", "i"] +[214.732978, "o", "n"] +[214.843079, "o", "a"] +[214.946548, "o", "l"] +[215.040742, "o", " "] +[215.129001, "o", "a"] +[215.228707, "o", "s"] +[215.440824, "o", " "] +[215.516404, "o", "w"] +[215.602238, "o", "e"] +[215.668078, "o", "l"] +[215.795141, "o", "l"] +[216.115205, "o", "\r\n(gdb) "] +[216.777487, "o", "quit\r\n"] +[216.777925, "o", "A debugging session is 
active.\r\n\r\n\tInferior 1 [process 1] will be detached.\r\n\r\nQuit anyway? (y or n) "] +[217.515133, "o", "y"] +[217.634805, "o", "\r\nDetaching from program: /linux/vmlinux, process 1\r\n"] +[217.635574, "o", "Ending remote debugging.\r\n"] +[217.635669, "o", "[Inferior 1 (process 1) detached]\r\n"] +[217.641009, "o", "make: *** [qemu/Makefile:54: gdb] Interrupt\r\n"] +[217.64117, "o", "\r\n$ "] +[219.594723, "o", "m"] +[219.722708, "o", "i"] +[219.798934, "o", "n"] +[219.896002, "o", "i"] +[219.930619, "o", "c"] +[220.032412, "o", "o"] +[220.090198, "o", "m"] +[220.218551, "o", "d"] +[220.306723, "o", " "] +[220.370933, "o", "-"] +[220.707384, "o", "D"] +[220.896138, "o", " "] +[221.002084, "o", "s"] +[221.096694, "o", "e"] +[221.183389, "o", "r"] +[221.794618, "o", "i"] +[221.921462, "o", "a"] +[221.996654, "o", "l"] +[222.162988, "o", "."] +[222.392328, "o", "p"] +[222.557563, "o", "t"] +[222.764497, "o", "s"] +[223.187221, "o", "\r\n"] +[223.187455, "o", "sh: 2: minicomd: not found\r\n$ "] +[224.66724, "o", "^[[A"] +[225.402255, "o", "\b \b"] +[225.554826, "o", "\b \b"] +[225.691215, "o", "\b \b\b \b"] +[226.362654, "o", "m"] +[226.467209, "o", "i"] +[226.562857, "o", "n"] +[226.611636, "o", "i"] +[227.616059, "o", "\t"] +[228.031284, "o", "\b\b"] +[228.626209, "o", "c"] +[228.648332, "o", "o"] +[228.698461, "o", "m"] +[228.962825, "o", " "] +[229.102646, "o", "-"] +[229.368508, "o", "D"] +[229.495185, "o", " "] +[229.758139, "o", "s"] +[229.818591, "o", "e"] +[229.898678, "o", "r"] +[229.974762, "o", "i"] +[230.066622, "o", "a"] +[230.152183, "o", "l"] +[230.320361, "o", "."] +[230.535407, "o", "p"] +[230.685811, "o", "t"] +[230.858324, "o", "r"] +[231.475343, "o", "\b \b"] +[231.555055, "o", "s"] +[232.066837, "o", "\r\n"] +[232.067795, "o", "\u001b[!p\u001b[?3;4l\u001b[4l\u001b>\u001b[0m\u001b(B\u001b[?1h\u001b=\u001b[H\u001b[2J"] +[232.068206, "o", "\u001b[?12l\u001b[?25h\nWelcome to minicom 2.7.1\r\n\nOPTIONS: I18n \r\nCompiled on Dec 23 2019, 
02:06:26.\r\nPort serial.pts, 23:04:03\r\n\nPress CTRL-A Z for help on special keys\r\n\n"] +[233.35075, "o", "\n"] +[233.351449, "o", "Poky (Yocto Project Reference Distro) 2.3 qemux86 /dev/hvc0"] +[233.351645, "o", "\r\n"] +[233.352178, "o", "\n"] +[233.352783, "o", "qemux86 login: "] +[234.133563, "o", "r"] +[234.195403, "o", "o"] +[234.32743, "o", "o"] +[234.389808, "o", "t"] +[234.499891, "o", "\r\n"] +[234.57783, "o", "root@qemux86:~# "] +[236.736246, "o", "p"] +[236.82051, "o", "s"] +[237.011059, "o", " "] +[237.771638, "o", "|"] +[238.06792, "o", " "] +[238.256099, "o", "g"] +[238.451007, "o", "r"] +[238.539927, "o", "e"] +[238.556716, "o", "p"] +[238.721872, "o", " "] +[238.931081, "o", "i"] +[238.997795, "o", "n"] +[239.083429, "o", "i"] +[239.163794, "o", "t"] +[239.752972, "o", "\r\n"] +[239.829798, "o", " 1 root 2004 S {my init} init [5]"] +[239.830096, "o", "\r\n"] +[239.831476, "o", " 233 root 2828 S grep init"] +[239.831842, "o", "\r\n"] +[239.837405, "o", "root@qemux86:~# "] +[241.836556, "o", "#"] +[242.119664, "o", " "] +[242.29265, "o", "l"] +[242.475695, "o", "o"] +[242.571459, "o", "o"] +[242.659603, "o", "k"] +[242.771906, "o", "s"] +[242.961525, "o", " "] +[242.961819, "o", "l"] +[243.115372, "o", "i"] +[243.267339, "o", "k"] +[243.363311, "o", "e"] +[243.507086, "o", " "] +[243.748592, "o", "i"] +[243.875193, "o", "t"] +[243.947777, "o", " "] +[244.060305, "o", "r"] +[244.234349, "o", "e"] +[244.234757, "o", "a"] +[244.279319, "o", "l"] +[244.391006, "o", "l"] +[244.507177, "o", "y"] +[244.603117, "o", " "] +[244.737167, "o", "w"] +[244.821576, "o", "o"] +[244.925326, "o", "r"] +[245.107148, "o", "k"] +[245.227926, "o", "e"] +[245.314038, "o", "d"] +[245.596368, "o", "!"] +[246.03658, "o", "\r\n"] +[246.038158, "o", "root@qemux86:~# "] +[246.855569, "o", "\u001b[0m\u001b(B\u001b[7m\u001b[20;1H\u001b[K\u001b[?12l\u001b[?25h\u001b[?25lCTRL-A Z for help | 115200 8N1 | NOR | Minicom 2.7.1 | VT102 | Offline | 
al.pts\u001b[?12l\u001b[?25h\u001b[18;17H"] +[247.531022, "o", "\u001b[8;30H\u001b[?25l\u001b[0m\u001b(B\u001b(0lqqqqqqqqqqqqqqqqqqqqqqk\u001b[9;30Hx\u001b[0m\u001b(B Leave Minicom? \u001b[0m\u001b(B\u001b(0x\u001b[10;30Hx\u001b[0m\u001b(B No \u001b[0m\u001b(B\u001b(0x\u001b[11;30Hmqqqqqqqqqqqqqqqqqqqqqqj\u001b[10;51H\u001b[?25l\u001b[10;33H\u001b[0m\u001b(B\u001b[7m Yes "] +[249.067296, "o", "\u001b[?12l\u001b[?25h\u001b[8;1H\u001b[0m\u001b(BPress CTRL-A Z for help on special keys \u001b[9;1H \u001b[10;1H \u001b[11;1HPoky (Yocto Project Reference Distro) 2.3 qemux86 /de\u001b[18;17H\u001b[0m\u001b(B\u001b[7m\u001b[?12l\u001b[?25h"] +[249.067498, "o", "\u001b[?12l\u001b[?25h\u001b[0m\u001b(B\u001b[H\u001b[2J\u001b[?12l\u001b[?25h\u001b[?1l\u001b>\u001b[!p\u001b[?3;4l\u001b[4l\u001b>"] +[249.067617, "o", "$ "] +[250.232829, "o", "\r\n"] diff --git a/refs/pull/405/merge/_images/intr_x86.cast b/refs/pull/405/merge/_images/intr_x86.cast new file mode 100644 index 00000000..d92b215e --- /dev/null +++ b/refs/pull/405/merge/_images/intr_x86.cast @@ -0,0 +1,3663 @@ +{"version": 2, "width": 80, "height": 24, "timestamp": 1616275302, "idle_time_limit": 0.3, "env": {"SHELL": null, "TERM": "xterm"}} +[0.002597, "o", "$ "] +[1.210773, "o", "m"] +[1.257108, "o", "a"] +[1.329655, "o", "k"] +[1.444271, "o", "e"] +[2.225029, "o", " "] +[2.486972, "o", "g"] +[2.571123, "o", "d"] +[2.712604, "o", "b"] +[3.137145, "o", "\r\n"] +[3.142592, "o", "gdb -ex \"target remote localhost:1234\" /linux/vmlinux\r\n"] +[3.178567, "o", "\u001b[35;1m\u001b[35;1mGNU gdb \u001b[m\u001b[35;1m(Ubuntu 9.2-0ubuntu1~20.04) \u001b[m\u001b[35;1m9.2\u001b[m\u001b[35;1m\r\n\u001b[m\u001b[mCopyright (C) 2020 Free Software Foundation, Inc.\r\nLicense GPLv3+: GNU GPL version 3 or later \r\nThis is free software: you are free to change and redistribute it.\r\nThere is NO WARRANTY, to the extent permitted by law.\r\nType \"show copying\" and \"show warranty\" for details.\r\nThis GDB was configured as 
\"x86_64-linux-gnu\".\r\nType \"show configuration\" for configuration details.\r\nFor bug reporting instructions, please see:\r\n.\r\nFind the GDB manual and other documentation resources online at:\r\n .\r\n\r\nFor help, type \"help\".\r\nType \"apropos word\" to search for commands related to \"word\"...\r\n"] +[3.178839, "o", "Reading symbols from \u001b[32m/linux/vmlinux\u001b[m...\r\n"] +[3.823297, "o", "Remote debugging using localhost:1234\r\n"] +[3.83722, "o", "\u001b[34m0xc15dcb62\u001b[m in \u001b[33mdefault_idle\u001b[m () at \u001b[32m./arch/x86/include/asm/irqflags.h\u001b[m:60\r\n"] +[3.837438, "o", "60\t\tasm volatile(\"sti; hlt\": : :\"memory\");\r\n"] +[3.8378, "o", "(gdb) "] +[4.409342, "o", "#"] +[4.546448, "o", " "] +[4.688783, "o", "l"] +[4.769813, "o", "e"] +[4.896733, "o", "t"] +[5.08933, "o", "s"] +[5.136694, "o", " "] +[5.248955, "o", "i"] +[5.312794, "o", "n"] +[5.544699, "o", "s"] +[7.409078, "o", "p"] +[7.489474, "o", "e"] +[7.549338, "o", "c"] +[7.761283, "o", "t"] +[7.849505, "o", " "] +[7.969004, "o", "t"] +[8.040895, "o", "h"] +[8.152802, "o", "e"] +[8.224916, "o", " "] +[9.001867, "o", "i"] +[9.058176, "o", "n"] +[9.136243, "o", "t"] +[9.208632, "o", "e"] +[9.328647, "o", "r"] +[9.488109, "o", "r"] +[9.560597, "o", "u"] +[9.616715, "o", "p"] +[9.704462, "o", "t"] +[9.768614, "o", " "] +[9.976697, "o", "d"] +[10.145135, "o", "e"] +[10.289399, "o", "s"] +[10.328932, "o", "c"] +[10.52121, "o", "r"] +[10.592935, "o", "i"] +[10.656572, "o", "p"] +[10.772899, "o", "t"] +[10.880715, "o", "o"] +[10.98227, "o", "r"] +[11.128859, "o", " "] +[11.25683, "o", "t"] +[11.337146, "o", "a"] +[11.448688, "o", "b"] +[12.394915, "o", "l"] +[12.872587, "o", "e"] +[12.944672, "o", "\r\n"] +[12.944804, "o", "(gdb) "] +[15.485987, "o", "m"] +[15.546043, "o", "o"] +[15.688577, "o", "n"] +[15.73704, "o", "i"] +[15.832722, "o", "t"] +[15.928729, "o", "o"] +[15.993344, "o", "r"] +[16.057039, "o", " "] +[16.15264, "o", "i"] +[16.208593, "o", "n"] +[16.304808, 
"o", "f"] +[16.361885, "o", "o"] +[16.464359, "o", " "] +[16.577512, "o", "r"] +[16.655247, "o", "e"] +[16.817514, "o", "g"] +[17.064809, "o", "i"] +[17.297122, "o", "s"] +[17.903967, "o", "t"] +[18.064885, "o", "e"] +[18.144619, "o", "r"] +[18.312803, "o", "s"] +[18.392803, "o", "\r\n"] +[18.392969, "o", "EAX=00000000 EBX=00000000 ECX=ffffffff EDX"] +[18.39299, "o", "=0000"] +[18.393076, "o", "0000\r\r\nESI=00000000 EDI=00000000 EBP=c17cff1c ESP=c17cff18\r\r\nEIP=c15dcb62 EFL=002002"] +[18.393171, "o", "46 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=1\r\r\n"] +[18.393206, "o", "ES =007b 00000000 ffffffff 00cff300 DPL=3 DS [-WA]\r\r\nCS =0060 00000000 ffffffff 00cf9a00 DPL=0 CS32 [-R-]\r\r\nSS =0068 00000000 ffffffff 00cf9300 DPL=0 DS [-WA]\r\r\nDS =007b 00000000 ffffffff 00c"] +[18.393256, "o", "ff300 DPL=3 DS [-WA]\r\r\n"] +[18.393291, "o", "FS =00d8 0e47b000 ffffffff 008f9300 DPL=0 DS16 [-WA]\r\r\n"] +[18.393383, "o", "GS =00e0 cfdcb200 00000018 00409100 DPL=0 DS [--A]\r\r\nLDT=0000 00000000 00000000 00008200 DPL=0 LDT\r"] +[18.393415, "o", "\r\nTR =0080 ff806000 0000407b 00008900 DPL=0 T"] +[18.393444, "o", "SS32-avl\r\r\nGDT= ff801000 000000f"] +[18.393521, "o", "f\r\r\nIDT= ff800000 000007ff\r\r\nCR0=80050033 CR2=08087000 CR3=0"] +[18.393616, "o", "760b000 CR4=00000690\r\r\nDR0=00000000 DR1=00000000 "] +[18.393737, "o", "DR2=00000000 DR3=00000000 \r\r\nDR6=ffff0ff0 DR7=00000400\r\r\nEFER=0000000000000000\r\r\nFCW=037f FSW=0000 [ST=0] FTW=00 MXCSR=00001f80\r\r\nFPR0=0000000000000000 0000 FPR1=0000000000000000 0000\r\r\nFPR2=0000000000000000 0000 FPR3=0000000000000000 0000\r\r\nFPR4=0000000000000000 0000 FPR"] +[18.393842, "o", "5=0000000000000000 0000\r\r\nFPR6=0000000000000000 0000 FPR7=0000000000000000 0000\r\r\nXMM00=00000000000000000000000000000000 XMM01=00000000000000000000000000000000\r\r\nXMM02=00000000000000000000000000000000 XMM03=00000000000000000000000000000000\r\r\nXMM04=00000000000000000000000000000000 XMM05=00000000000000000000000000000000\r\r\n"] 
+[18.393943, "o", "XMM06=00000000000000000000000000000000 XMM07=00000000000000000000000000000000\r\r\n"] +[18.394056, "o", "(gdb) "] +[19.913615, "o", "s"] +[19.986157, "o", "e"] +[20.072668, "o", "t"] +[20.144869, "o", " "] +[20.312693, "o", "$"] +[21.121104, "o", "i"] +[21.208933, "o", "d"] +[21.448991, "o", "t"] +[21.608711, "o", "r"] +[21.849094, "o", "="] +[23.480497, "o", "0"] +[23.632689, "o", "x"] +[26.420845, "o", "ff800000"] +[27.416954, "o", "\r\n"] +[27.43416, "o", "(gdb) "] +[30.696589, "o", "#"] +[30.808186, "o", " "] +[30.960552, "o", "l"] +[31.040432, "o", "e"] +[31.168404, "o", "t"] +[31.344474, "o", "s"] +[31.440553, "o", " "] +[31.560787, "o", "l"] +[31.712465, "o", "o"] +[31.832592, "o", "o"] +[31.896851, "o", "k"] +[32.041676, "o", " "] +[32.224318, "o", "a"] +[32.432482, "o", "t"] +[32.52061, "o", " "] +[32.66456, "o", "t"] +[32.728814, "o", "h"] +[32.872205, "o", "e"] +[32.952481, "o", " "] +[33.048492, "o", "f"] +[33.136789, "o", "i"] +[33.216711, "o", "r"] +[35.611379, "o", "s"] +[35.94449, "o", "t"] +[36.241248, "o", " "] +[36.768841, "o", "e"] +[36.985022, "o", "n"] +[37.104804, "o", "t"] +[37.208381, "o", "r"] +[37.322459, "o", "y"] +[37.569352, "o", "\r\n"] +[37.569487, "o", "(gdb) "] +[53.808615, "o", "p"] +[53.97653, "o", "r"] +[54.409035, "o", "i"] +[54.528791, "o", "n"] +[54.664632, "o", "t"] +[54.816835, "o", " "] +[55.391071, "o", "("] +[56.353133, "o", "\b\u001b[K"] +[56.705533, "o", "*"] +[56.984674, "o", "("] +[57.401646, "o", "u"] +[57.598027, "o", "i"] +[57.661018, "o", "n"] +[57.696843, "o", "t"] +[58.032644, "o", "4"] +[58.392875, "o", "_"] +[58.694782, "o", "\b\u001b[K"] +[58.794268, "o", "\b\u001b[K"] +[58.858317, "o", "6"] +[58.912611, "o", "4"] +[59.10469, "o", "_"] +[59.336364, "o", "t"] +[59.74071, "o", "*"] +[59.880803, "o", ")"] +[60.704722, "o", "$"] +[60.98509, "o", "i"] +[61.094169, "o", "d"] +[61.321053, "o", "t"] +[61.480931, "o", "r"] +[62.248357, "o", "\r\n"] +[62.253459, "o", "$1 = 13933448952811676512\r\n"] 
+[62.253518, "o", "(gdb) "] +[63.977116, "o", "print *(uint64_t*)$idtr"] +[64.304911, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[64.456992, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[65.080291, "o", "/ *(uint64_t*)$idtr\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[65.416962, "o", "\b\u001b[1P *(uint64_t*)$idtr\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[65.497236, "o", "\u001b[C *(uint64_t*)$idtr\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[65.609571, "o", "/ *(uint64_t*)$idtr\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[65.889038, "o", "x *(uint64_t*)$idtr\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[65.993566, "o", "\r\n"] +[65.993861, "o", "$2 = 0xc15d8e000060e360\r\n(gdb) "] +[68.328319, "o", "#"] +[68.608617, "o", " "] +[71.984353, "o", "t"] +[72.048655, "o", "h"] +[72.144372, "o", "e"] +[72.281093, "o", " "] +[73.216663, "o", "t"] +[73.240818, "o", "o"] +[73.368648, "o", "p"] +[73.464534, "o", " "] +[73.576581, "o", "1"] +[73.85665, "o", "6"] +[74.11258, "o", " "] +[74.26418, "o", "b"] +[74.408592, "o", "i"] +[74.488681, "o", "t"] +[74.736972, "o", "s"] +[74.785856, "o", " "] +[75.320694, "o", "|"] +[75.408819, "o", " "] +[75.928337, "o", "l"] +[76.105728, "o", "o"] +[76.169532, "o", "w"] +[76.257743, "o", "e"] +[76.329013, "o", "r"] +[76.432359, "o", " "] +[76.632745, "o", "1"] +[76.896585, "o", "6"] +[77.265121, "o", " "] +[77.456494, "o", "b"] +[77.55218, "o", "i"] +[77.640654, "o", "t"] +[78.424438, "o", "s"] +[78.576779, "o", " "] +[79.9295, "o", "y"] +[80.207199, "o", "i"] +[81.156076, "o", "e"] +[81.342966, "o", "l"] +[81.464422, "o", "d"] +[81.544827, "o", " "] +[81.712818, "o", "t"] +[81.777406, "o", "h"] +[81.896729, "o", "e"] 
+[82.016527, "o", " "] +[83.160488, "o", "h"] +[83.234273, "o", "a"] +[83.373947, "o", "n"] +[83.457305, "o", "d"] +[83.560818, "o", "l"] +[83.648667, "o", "e"] +[83.736063, "o", "r"] +[83.832315, "o", " "] +[83.904474, "o", "a"] +[84.01645, "o", "d"] +[84.17622, "o", "d"] +[84.441953, "o", "r"] +[84.515239, "o", "e"] +[84.648335, "o", "s"] +[84.793111, "o", "s"] +[84.889616, "o", "\r\n"] +[84.889676, "o", "(gdb) "] +[87.657834, "o", "p"] +[88.328467, "o", "r"] +[88.457638, "o", "i"] +[88.528292, "o", "n"] +[88.592033, "o", "t"] +[89.216915, "o", " "] +[92.096493, "o", "("] +[92.318607, "o", "v"] +[92.401482, "o", "o"] +[92.466573, "o", "i"] +[92.552621, "o", "d"] +[92.71269, "o", " "] +[93.041067, "o", "*"] +[93.136553, "o", ")"] +[100.020065, "o", "0xc15d"] +[102.915226, "o", "e360"] +[103.87249, "o", "\r\n"] +[103.872714, "o", "$3 = (void *) \u001b[34m0xc15de360\u001b[m <\u001b[33masm_exc_divide_error\u001b[m>\r\n(gdb) "] +[108.216616, "o", "#"] +[109.256576, "o", " "] +[110.240848, "o", "l"] +[110.461673, "o", "o"] +[110.584304, "o", "o"] +[110.624509, "o", "k"] +[110.735996, "o", "s"] +[110.840183, "o", " "] +[111.256571, "o", "l"] +[111.484022, "o", "i"] +[111.992971, "o", "k"] +[112.192703, "o", "e"] +[112.333386, "o", " "] +[112.425705, "o", "t"] +[112.499041, "o", "h"] +[112.600559, "o", "e"] +[112.718099, "o", " "] +[112.88867, "o", "h"] +[112.952432, "o", "a"] +[113.056426, "o", "n"] +[113.690225, "o", "d"] +[113.936437, "o", "l"] +[114.032295, "o", "e"] +[114.112253, "o", "r"] +[114.215985, "o", " "] +[114.377083, "o", "f"] +[114.452839, "o", "o"] +[114.592553, "o", "r"] +[114.66679, "o", " "] +[114.809895, "o", "t"] +[114.903547, "o", "h"] +[114.976958, "o", "e"] +[115.112512, "o", " "] +[116.177138, "o", "d"] +[116.32823, "o", "i"] +[116.424191, "o", "v"] +[116.496866, "o", "i"] +[116.600329, "o", "s"] +[116.712739, "o", "i"] +[116.752259, "o", "o"] +[116.937502, "o", "n"] +[117.0166, "o", " "] +[117.281132, "o", "b"] +[117.360604, "o", "y"] 
+[117.47323, "o", " "] +[117.704485, "o", "z"] +[117.904681, "o", "e"] +[117.989636, "o", "r"] +[118.096365, "o", "o"] +[118.232856, "o", " "] +[118.720624, "o", "e"] +[118.832219, "o", "x"] +[119.064653, "o", "c"] +[119.150155, "o", "e"] +[119.337024, "o", "p"] +[119.664612, "o", "t"] +[119.760425, "o", "i"] +[119.800127, "o", "o"] +[119.984793, "o", "n"] +[120.26515, "o", "\r\n(gdb) "] +[120.728331, "o", "#"] +[120.920995, "o", " "] +[121.064451, "o", "l"] +[121.160676, "o", "e"] +[121.536815, "o", "t"] +[121.786124, "o", "s"] +[121.896173, "o", " "] +[122.080241, "o", "l"] +[122.240363, "o", "o"] +[122.376315, "o", "o"] +[122.408674, "o", "k"] +[122.624537, "o", " "] +[122.773709, "o", "a"] +[122.945429, "o", "t"] +[123.023878, "o", " "] +[123.953626, "o", "ot"] +[124.080277, "o", "h"] +[124.200363, "o", "er"] +[124.728617, "o", " "] +[125.002233, "o", "h"] +[125.064643, "o", "a"] +[125.184089, "o", "n"] +[125.256259, "o", "d"] +[125.392308, "o", "l"] +[125.488306, "o", "e"] +[125.552441, "o", "r"] +[125.741692, "o", "s"] +[126.768744, "o", "\r\n"] +[126.768865, "o", "(gdb) "] +[127.240409, "o", "#"] +[127.480474, "o", " "] +[127.784631, "o", "b"] +[127.944921, "o", "u"] +[128.040904, "o", "t"] +[128.225533, "o", " "] +[130.732997, "o", "l"] +[130.808467, "o", "e"] +[130.968259, "o", "t"] +[131.176344, "o", "s"] +[131.32975, "o", " "] +[131.488296, "o", "d"] +[131.5519, "o", "e"] +[131.689568, "o", "f"] +[131.785467, "o", "i"] +[131.837318, "o", "n"] +[131.904581, "o", "e"] +[132.024328, "o", " "] +[132.160185, "o", "a"] +[132.271963, "o", " "] +[133.464111, "o", "g"] +[133.584141, "o", "d"] +[133.696892, "o", "b"] +[133.79681, "o", " "] +[134.217167, "o", "m"] +[134.304136, "o", "a"] +[134.408321, "o", "c"] +[134.656512, "o", "r"] +[134.721337, "o", "o"] +[134.822577, "o", " "] +[134.991044, "o", "t"] +[135.071573, "o", "o"] +[135.111997, "o", " "] +[135.272574, "o", "m"] +[135.362308, "o", "a"] +[135.456211, "o", "k"] +[135.544994, "o", "e"] +[135.616461, "o", 
" "] +[135.727963, "o", "t"] +[135.792452, "o", "h"] +[135.857291, "o", "i"] +[136.000758, "o", "n"] +[136.069751, "o", "g"] +[136.257128, "o", "s"] +[136.328766, "o", " "] +[136.510165, "o", "e"] +[136.633035, "o", "a"] +[136.832209, "o", "s"] +[136.960217, "o", "i"] +[137.048799, "o", "e"] +[137.150086, "o", "r"] +[138.744006, "o", "\r\n(gdb) "] +[139.568515, "o", "d"] +[139.653984, "o", "e"] +[139.784679, "o", "f"] +[139.872147, "o", "i"] +[139.912416, "o", "n"] +[139.993159, "o", "e"] +[140.104365, "o", " "] +[140.296306, "o", "i"] +[140.363786, "o", "d"] +[140.665088, "o", "t"] +[140.92836, "o", "_"] +[141.136493, "o", "e"] +[141.360228, "o", "n"] +[141.448491, "o", "t"] +[141.624792, "o", "r"] +[141.713426, "o", "y"] +[142.721123, "o", "\r\n"] +[142.721216, "o", "Type commands for definition of \"idt_entry\".\r\nEnd with a line saying just \"end\".\r\n>"] +[168.576695, "o", "s"] +[168.624243, "o", "e"] +[168.737408, "o", "t"] +[168.824664, "o", " "] +[169.336803, "o", "$"] +[169.680096, "o", "t"] +[169.88103, "o", "m"] +[170.352694, "o", "p"] +[170.935949, "o", "="] +[173.920126, "o", "\b\u001b[K"] +[174.064076, "o", " "] +[174.216635, "o", "="] +[174.303854, "o", " "] +[177.336069, "o", "("] +[178.712525, "o", "$"] +[179.374062, "o", "d"] +[179.720264, "o", "i"] +[180.160492, "o", "\b\u001b[K"] +[180.289725, "o", "\b\u001b[K"] +[180.43246, "o", "\b\u001b[K"] +[181.432386, "o", "\b\u001b[K"] +[181.864182, "o", "*"] +[182.041353, "o", "("] +[182.320662, "o", "u"] +[183.224845, "o", "i"] +[183.288156, "o", "n"] +[183.424563, "o", "t"] +[183.920426, "o", "6"] +[184.016392, "o", "4"] +[184.22467, "o", "_"] +[184.505022, "o", "t"] +[185.017063, "o", "*"] +[185.520732, "o", ")"] +[186.288322, "o", "("] +[186.812369, "o", "$"] +[187.415985, "o", "i"] +[187.600868, "o", "d"] +[188.142718, "o", "t"] +[188.312029, "o", "r"] +[188.616543, "o", " "] +[189.064819, "o", "+"] +[189.264592, "o", " "] +[189.680228, "o", "8"] +[189.848238, "o", " "] +[190.617403, "o", "*"] 
+[190.760083, "o", " "] +[191.959835, "o", "%"] +[192.576085, "o", "\b\u001b[K"] +[192.844777, "o", "$"] +[193.048452, "o", "a"] +[193.159941, "o", "r"] +[193.376555, "o", "g"] +[193.982847, "o", "0"] +[194.431687, "o", ")"] +[197.999312, "o", "\r\n"] +[197.999456, "o", ">"] +[198.449303, "o", "p"] +[198.665073, "o", "r"] +[198.735942, "o", "i"] +[198.808033, "o", "n"] +[198.888752, "o", "t"] +[199.016048, "o", " "] +[199.417075, "o", "("] +[199.736133, "o", "v"] +[199.848015, "o", "o"] +[199.944772, "o", "i"] +[200.024132, "o", "d"] +[200.286196, "o", " "] +[200.704083, "o", ")"] +[201.349769, "o", "\b\u001b[K"] +[201.704144, "o", "*"] +[201.872195, "o", ")"] +[203.840273, "o", "("] +[205.772822, "o", "("] +[208.496616, "o", "$"] +[209.18405, "o", "t"] +[209.256793, "o", "m"] +[209.36482, "o", "p"] +[209.775923, "o", ">"] +[209.912509, "o", ">"] +[210.839657, "o", "4"] +[210.888609, "o", "8"] +[211.215805, "o", "<"] +[211.336883, "o", "<"] +[211.61268, "o", "1"] +[211.976222, "o", "6"] +[212.74975, "o", ")"] +[213.984241, "o", "|"] +[214.624311, "o", "("] +[215.122175, "o", "T"] +[215.780469, "o", "\b\u001b[K"] +[215.993115, "o", "$"] +[216.224093, "o", "t"] +[216.34352, "o", "m"] +[216.424, "o", "p"] +[217.560837, "o", "&"] +[218.328697, "o", "0"] +[218.632066, "o", "x"] +[218.968103, "o", "f"] +[219.150939, "o", "f"] +[219.727887, "o", "f"] +[219.863865, "o", "f"] +[220.912413, "o", ")"] +[222.079895, "o", ")"] +[223.568547, "o", "\r\n"] +[223.568676, "o", ">"] +[224.094006, "o", "e"] +[224.223863, "o", "n"] +[224.287703, "o", "d"] +[224.744944, "o", "\r\n"] +[224.745116, "o", "(gdb) "] +[225.951864, "o", "i"] +[226.096427, "o", "d"] +[226.680474, "o", "\b\u001b[K"] +[226.872108, "o", "t"] +[228.156402, "o", "\b\u001b[K"] +[228.205096, "o", "d"] +[228.440636, "o", "t"] +[228.631859, "o", "_"] +[229.039821, "o", "e"] +[229.167518, "o", "n"] +[229.256696, "o", "t"] +[229.433937, "o", "r"] +[229.505089, "o", "y"] +[229.631679, "o", " "] +[229.840572, "o", "0"] 
+[230.040276, "o", "\r\n"] +[238.488909, "o", "$4 = (void *) \u001b[34m0xc15de360\u001b[m <\u001b[33masm_exc_divide_error\u001b[m>\r\n(gdb) "] +[242.8967, "o", "s"] +[243.00697, "o", "e"] +[243.128131, "o", "t"] +[243.183821, "o", " "] +[243.402228, "o", "$"] +[243.592398, "o", "i"] +[243.967633, "o", "="] +[244.505775, "o", "0"] +[245.649837, "o", "\r\n"] +[245.667678, "o", "(gdb) "] +[246.494615, "o", "i"] +[246.951918, "o", "d"] +[247.159916, "o", "t"] +[247.288617, "o", "_"] +[247.44043, "o", "e"] +[247.606119, "o", "n"] +[247.677569, "o", "t"] +[247.824446, "o", "r"] +[247.919894, "o", "y"] +[248.064502, "o", " "] +[248.368428, "o", "$i"] +[248.812484, "o", "+"] +[248.95182, "o", "+"] +[249.296086, "o", "\r\n"] +[251.440221, "o", "$5 = (void *) \u001b[34m0xc15de360\u001b[m <\u001b[33masm_exc_divide_error\u001b[m>\r\n(gdb) "] +[252.647928, "o", "\r\n"] +[252.648357, "o", "$6 = (void *) \u001b[34m0xc15de460\u001b[m <\u001b[33masm_exc_debug\u001b[m>\r\n(gdb) "] +[253.671712, "o", "\r\n"] +[253.672097, "o", "$7 = (void *) \u001b[34m0xc15dec28\u001b[m <\u001b[33masm_exc_nmi\u001b[m>\r\n(gdb) "] +[254.280864, "o", "\r\n"] +[254.281363, "o", "$8 = (void *) \u001b[34m0xc15de440\u001b[m <\u001b[33masm_exc_int3\u001b[m>\r\n(gdb) "] +[254.976139, "o", "\r\n"] +[254.976557, "o", "$9 = (void *) \u001b[34m0xc15de370\u001b[m <\u001b[33masm_exc_overflow\u001b[m>\r\n(gdb) "] +[255.880408, "o", "\r\n"] +[255.880837, "o", "$10 = (void *) \u001b[34m0xc15de380\u001b[m <\u001b[33masm_exc_bounds\u001b[m>\r\n(gdb) "] +[256.624516, "o", "\r\n"] +[256.625069, "o", "$11 = (void *) \u001b[34m0xc15de430\u001b[m <\u001b[33masm_exc_invalid_op\u001b[m>\r\n(gdb) "] +[259.776172, "o", "#"] +[259.983837, "o", " "] +[260.136287, "o", "n"] +[260.210913, "o", "o"] +[260.615863, "o", "w"] +[260.791336, "o", " "] +[261.016063, "o", "l"] +[261.088278, "o", "e"] +[261.231738, "o", "t"] +[261.455678, "o", "s"] +[261.535942, "o", " "] +[264.927751, "o", "d"] +[265.007746, "o", "i"] +[265.080717, "o", 
"s"] +[265.76765, "o", "a"] +[265.920288, "o", "s"] +[266.056377, "o", "s"] +[266.663988, "o", "e"] +[266.744341, "o", "m"] +[266.91189, "o", "b"] +[267.063396, "o", "l"] +[267.136112, "o", "e"] +[267.231659, "o", " "] +[271.856077, "o", "o"] +[272.111984, "o", "n"] +[272.262555, "o", "e"] +[272.334059, "o", " "] +[272.495286, "o", "o"] +[272.567329, "o", "f"] +[272.655262, "o", " "] +[272.78918, "o", "t"] +[272.849964, "o", "h"] +[272.944092, "o", "e"] +[273.016315, "o", " "] +[273.112499, "o", "h"] +[273.184169, "o", "a"] +[273.301118, "o", "n"] +[273.399638, "o", "d"] +[273.495596, "o", "l"] +[273.600002, "o", "e"] +[273.672038, "o", "r"] +[273.871971, "o", "s"] +[274.591341, "o", "\r\n(gdb) "] +[281.693904, "o", "d"] +[281.813581, "o", "i"] +[282.81517, "o", "s"] +[283.118047, "o", "a"] +[283.2217, "o", "s"] +[284.374128, "o", "semble "] +[293.053957, "o", "a"] +[293.157762, "o", "s"] +[293.229719, "o", "m"] +[293.708297, "o", "E"] +[294.133992, "o", "\b\u001b[K"] +[294.448654, "o", "_"] +[294.706454, "o", "e"] +[294.829897, "o", "x"] +[295.462678, "o", "c"] +[296.67153, "o", "\u0007_"] +[297.946193, "o", "d"] +[298.135012, "o", "i"] +[298.685772, "o", "\b\u001b[K"] +[299.178835, "o", "i"] +[299.621804, "o", "v"] +[299.733826, "o", "i"] +[299.902348, "o", "de_error "] +[300.549988, "o", "\r\n"] +[300.567393, "o", "Dump of assembler code for function asm_exc_divide_error:\r\n"] +[300.56769, "o", " \u001b[34m0xc15de360\u001b[m <+0>:\tlea 0x0(%esi),%esi\r\n \u001b[34m0xc15de363\u001b[m <+3>:\tcld \r\n \u001b[34m0xc15de364\u001b[m <+4>:\tpush $0x0\r\n \u001b[34m0xc15de366\u001b[m <+6>:\tpush $0xc15d1ff0\r\n"] +[300.567847, "o", " \u001b[34m0xc15de36b\u001b[m <+11>:\tjmp \u001b[34m0xc15dea1f\u001b[m <\u001b[33mhandle_exception\u001b[m>\r\nEnd of assembler dump.\r\n(gdb) "] +[301.973744, "o", "#"] +[302.290429, "o", " "] +[306.773991, "o", "t"] +[306.89402, "o", "h"] +[306.989409, "o", "e"] +[307.078564, "o", " "] +[307.205506, "o", "h"] +[307.269547, "o", "a"] 
+[307.381896, "o", "n"] +[307.462694, "o", "d"] +[307.606051, "o", "l"] +[307.683098, "o", "e"] +[307.782006, "o", "r"] +[307.90709, "o", " "] +[308.102037, "o", "i"] +[308.205644, "o", "s"] +[308.309779, "o", " "] +[308.649061, "o", "s"] +[308.719819, "o", "a"] +[308.909582, "o", "v"] +[309.045065, "o", "i"] +[309.111041, "o", "n"] +[309.181535, "o", "g"] +[309.357861, "o", " "] +[309.791576, "o", "0"] +[310.205523, "o", "\b\u001b[K"] +[310.317771, "o", "z"] +[310.52606, "o", "e"] +[310.645389, "o", "r"] +[310.709449, "o", "o"] +[311.925803, "o", " "] +[313.89363, "o", "\r\u001b[K(gdb) # the handler is saving zero "] +[316.414294, "o", "a"] +[316.646282, "o", "n"] +[316.757604, "o", "d"] +[317.310478, "o", " "] +[318.758069, "o", "a"] +[318.854419, "o", " "] +[320.037966, "o", "k"] +[320.158956, "o", "e"] +[320.222516, "o", "r"] +[320.27824, "o", "n"] +[320.397617, "o", "e"] +[320.478095, "o", "l"] +[320.573709, "o", " "] +[320.653434, "o", "a"] +[320.765976, "o", "d"] +[320.931418, "o", "d"] +[321.115711, "o", "r"] +[321.16469, "o", "e"] +[321.317561, "o", "s"] +[321.469813, "o", "s"] +[321.533551, "o", " "] +[321.637537, "o", "o"] +[321.77303, "o", "n"] +[322.051753, "o", " "] +[322.229659, "o", "t"] +[322.334768, "o", "h"] +[322.397494, "o", "e"] +[322.493585, "o", " "] +[322.581569, "o", "s"] +[322.726173, "o", "t"] +[322.789718, "o", "a"] +[322.966867, "o", "c"] +[323.01384, "o", "k"] +[323.861459, "o", "\r\n(gdb) "] +[324.400657, "o", "#"] +[324.57396, "o", " "] +[324.718055, "o", "t"] +[324.837389, "o", "h"] +[324.925552, "o", "e"] +[325.053448, "o", "n"] +[325.131599, "o", " "] +[325.541878, "o", "c"] +[325.656711, "o", "a"] +[325.789362, "o", "l"] +[325.917952, "o", "l"] +[326.215003, "o", "s"] +[326.405569, "o", " "] +[327.862046, "o", "a"] +[328.005901, "o", " "] +[328.509434, "o", "g"] +[328.589852, "o", "e"] +[328.718961, "o", "n"] +[328.794376, "o", "e"] +[329.046122, "o", "r"] +[329.197525, "o", "i"] +[329.413451, "o", "c"] +[329.589576, "o", " "] 
+[334.125557, "o", "e"] +[334.229684, "o", "x"] +[334.294128, "o", "c"] +[334.397658, "o", "c"] +[334.49939, "o", "e"] +[334.582127, "o", "p"] +[334.711478, "o", "t"] +[334.796324, "o", "i"] +[334.829615, "o", "o"] +[335.309853, "o", "\b\u001b[K"] +[337.093887, "o", "o"] +[337.253827, "o", "n"] +[337.758199, "o", "\b"] +[338.268806, "o", "\b"] +[338.299872, "o", "\b"] +[338.329696, "o", "\b"] +[338.52618, "o", "\b"] +[338.693918, "o", "\b"] +[338.898897, "o", "\b"] +[338.940108, "o", "\u001b[1Peption\b\b\b\b\b\b"] +[339.31746, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[339.987467, "o", " "] +[340.501724, "o", "h"] +[340.557371, "o", "a"] +[340.669201, "o", "n"] +[340.781539, "o", "d"] +[340.893346, "o", "l"] +[340.986467, "o", "e"] +[341.107449, "o", "r"] +[341.189818, "o", " "] +[341.310107, "o", "f"] +[341.404177, "o", "u"] +[341.477608, "o", "n"] +[341.589586, "o", "c"] +[341.805591, "o", "t"] +[341.877429, "o", "i"] +[341.917582, "o", "o"] +[342.116466, "o", "n"] +[342.669981, "o", "\r\n(gdb) "] +[344.453707, "o", "p"] +[344.573486, "o", "r"] +[344.651746, "o", "i"] +[344.710394, "o", "n"] +[344.773933, "o", "t"] +[344.853496, "o", " "] +[345.133258, "o", "("] +[345.301217, "o", "v"] +[345.423574, "o", "o"] +[345.509734, "o", "i"] +[345.611679, "o", "d"] +[346.432047, "o", "*"] +[346.853663, "o", "\b\u001b[K"] +[347.541327, "o", "*"] +[347.72567, "o", ")"] +[351.837485, "o", "0xc15d1f"] +[351.838034, "o", "f0"] +[352.917933, "o", "\r\n"] +[352.923702, "o", "$1 = (void *) \u001b[34m0xc15d1ff0\u001b[m <\u001b[33mexc_divide_error\u001b[m>\r\n(gdb) "] +[357.542556, "o", "#"] +[357.837506, "o", " "] +[361.282757, "o", "t"] +[361.413551, "o", "h"] +[361.525554, "o", "is"] +[361.653148, "o", " "] +[362.214023, "o", "l"] +[362.389469, "o", "o"] +[362.541643, "o", "o"] +[362.877329, "o", "k"] +[363.062173, "o", "s"] +[363.118787, "o", " "] +[363.269731, "o", "l"] +[363.453393, "o", "i"] +[363.621546, "o", "k"] +[363.709514, "o", "e"] +[363.862147, "o", " 
"] +[364.222054, "o", "t"] +[364.349865, "o", "h"] +[364.517649, "o", "e"] +[364.613482, "o", " "] +[365.373866, "o", "\b\u001b[K"] +[365.523605, "o", "\b\u001b[K"] +[365.653717, "o", "\b\u001b[K"] +[365.797269, "o", "\b\u001b[K"] +[366.093498, "o", "a"] +[366.2068, "o", " "] +[366.345557, "o", "p"] +[366.427604, "o", "o"] +[366.62164, "o", "i"] +[366.685247, "o", "n"] +[366.749103, "o", "t"] +[366.830172, "o", "e"] +[366.957418, "o", "r"] +[367.093461, "o", " "] +[367.222757, "o", "t"] +[367.270164, "o", "o"] +[367.350376, "o", " "] +[367.453392, "o", "t"] +[367.549486, "o", "h"] +[367.650301, "o", "e"] +[367.708818, "o", " "] +[367.989708, "o", "a"] +[368.125641, "o", "c"] +[368.378932, "o", "t"] +[368.468498, "o", "u"] +[368.590227, "o", "a"] +[368.710298, "o", "l"] +[368.813341, "o", " "] +[369.021547, "o", "e"] +[373.138639, "o", "c"] +[373.285424, "o", "a"] +[373.493441, "o", "p"] +[373.701029, "o", "t"] +[373.861883, "o", "i"] +[373.909667, "o", "o"] +[374.132304, "o", "\b\u001b[K"] +[374.261891, "o", "\b\u001b[K"] +[374.414046, "o", "\b\u001b[K"] +[374.565314, "o", "\b\u001b[K"] +[374.709345, "o", "\b\u001b[K"] +[374.853559, "o", "\b\u001b[K"] +[374.886, "o", "x"] +[375.125274, "o", "c"] +[375.203406, "o", "e"] +[375.277276, "o", "p"] +[375.42167, "o", "t"] +[375.510506, "o", "i"] +[375.533335, "o", "o"] +[375.701534, "o", "n"] +[375.789484, "o", " "] +[375.965816, "o", "h"] +[375.97305, "o", "a"] +[376.16194, "o", "n"] +[376.453783, "o", "d"] +[376.589277, "o", "l"] +[376.701185, "o", "e"] +[376.765614, "o", "r"] +[376.917157, "o", " "] +[378.366234, "o", "\r\n"] +[378.366312, "o", "(gdb) "] +[378.696508, "o", "#"] +[378.819592, "o", " "] +[378.981518, "o", "i"] +[379.141542, "o", "s"] +[379.261122, "o", " "] +[379.406101, "o", "p"] +[379.469324, "o", "a"] +[379.557497, "o", "s"] +[379.725858, "o", "s"] +[379.816754, "o", "e"] +[379.925361, "o", "d"] +[380.021135, "o", " "] +[380.173176, "o", "t"] +[380.229575, "o", "o"] +[380.333106, "o", " "] 
+[380.485699, "o", "t"] +[380.605346, "o", "h"] +[380.734993, "o", "e"] +[380.836656, "o", " "] +[382.373895, "o", "g"] +[382.445188, "o", "e"] +[382.525518, "o", "n"] +[382.621654, "o", "e"] +[382.787196, "o", "r"] +[382.978124, "o", "i"] +[383.142264, "o", "c"] +[383.317634, "o", " "] +[383.819866, "o", "h"] +[383.878201, "o", "a"] +[383.99007, "o", "n"] +[384.089685, "o", "d"] +[384.197318, "o", "l"] +[384.277425, "o", "e"] +[384.461334, "o", "_"] +[384.661346, "o", "e"] +[384.79798, "o", "x"] +[385.006099, "o", "c"] +[385.090299, "o", "e"] +[385.170675, "o", "p"] +[385.310365, "o", "t"] +[385.389772, "o", "i"] +[385.421608, "o", "o"] +[385.581386, "o", "n"] +[386.20595, "o", "\r\n"] +[386.206176, "o", "(gdb) "] +[389.230197, "o", "#"] +[390.26918, "o", " "] +[391.325411, "o", "l"] +[391.42176, "o", "e"] +[391.581594, "o", "t"] +[391.774324, "o", "s"] +[391.886397, "o", " "] +[392.301564, "o", "s"] +[392.421411, "o", "e"] +[392.541767, "o", "t"] +[392.623009, "o", " "] +[392.706636, "o", "a"] +[392.840467, "o", " "] +[393.090357, "o", "b"] +[393.453217, "o", "r"] +[393.541455, "o", "e"] +[393.605239, "o", "a"] +[394.046616, "o", "k"] +[394.291075, "o", "p"] +[394.380916, "o", "o"] +[394.621516, "o", "n"] +[394.725214, "o", "t"] +[395.107038, "o", "\b\u001b[K"] +[395.413324, "o", "\b\u001b[K"] +[395.717268, "o", "i"] +[395.797318, "o", "n"] +[395.949174, "o", "t"] +[396.005249, "o", " "] +[396.160391, "o", "t"] +[396.222933, "o", "o"] +[396.301863, "o", " "] +[396.485547, "o", "h"] +[396.525186, "o", "a"] +[396.661547, "o", "n"] +[396.773364, "o", "d"] +[396.853597, "o", "l"] +[396.989123, "o", "e"] +[397.290232, "o", "_"] +[397.493521, "o", "e"] +[397.629719, "o", "x"] +[397.837301, "o", "c"] +[397.901283, "o", "e"] +[397.981223, "o", "p"] +[398.141235, "o", "t"] +[398.262293, "o", "o"] +[398.309802, "o", "p"] +[398.933271, "o", "\b\u001b[K"] +[399.005854, "o", "\b\u001b[K"] +[399.342381, "o", "i"] +[399.390296, "o", "o"] +[399.637201, "o", "n"] +[400.021147, 
"o", " "] +[400.134629, "o", "a"] +[400.253125, "o", "n"] +[400.333322, "o", "d"] +[400.437249, "o", " "] +[400.549161, "o", "s"] +[400.686474, "o", "e"] +[400.814467, "o", "e"] +[400.933556, "o", " "] +[402.002187, "o", "w"] +[402.205377, "o", "h"] +[402.229328, "o", "a"] +[402.397324, "o", "t"] +[402.4851, "o", " "] +[402.621307, "o", "w"] +[402.725189, "o", "e"] +[402.805449, "o", " "] +[402.949896, "o", "c"] +[403.035235, "o", "a"] +[403.253098, "o", "t"] +[403.437365, "o", "c"] +[403.509643, "o", "h"] +[403.973799, "o", "\r\n(gdb) "] +[404.677037, "o", "b"] +[404.765978, "o", "r"] +[404.837523, "o", "e"] +[404.89348, "o", "a"] +[404.965381, "o", "k"] +[405.069983, "o", " "] +[405.469329, "o", "h"] +[405.573246, "o", "a"] +[405.693381, "o", "n"] +[405.789649, "o", "d"] +[405.870641, "o", "l"] +[406.413246, "o", "\u0007e"] +[406.509294, "o", "e"] +[407.118199, "o", "\b\u001b[K"] +[407.3744, "o", "_"] +[407.550122, "o", "e"] +[407.725281, "o", "x"] +[407.938177, "o", "\u0007"] +[408.415128, "o", "c"] +[408.520749, "o", "\u0007eption"] +[408.893296, "o", "\r\n"] +[408.924833, "o", "Breakpoint 1 at \u001b[34m0xc15dea1f\u001b[m: file \u001b[32march/x86/entry/entry_32.S\u001b[m, line 1154.\r\n(gdb) "] +[416.67716, "o", "c"] +[417.078529, "o", "\r\nContinuing.\r\n"] +[417.083919, "o", "\r\n"] +[417.084103, "o", "Breakpoint 1, \u001b[33mhandle_exception\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1154\r\n"] +[417.084192, "o", "1154\t\tSAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1\r\n(gdb) "] +[429.509889, "o", "#"] +[429.68491, "o", " "] +[429.877508, "o", "i"] +[429.981148, "o", "f"] +[430.109307, "o", " "] +[430.221386, "o", "w"] +[430.309864, "o", "e"] +[430.590443, "o", " "] +[430.809819, "o", "l"] +[430.989816, "o", "o"] +[431.117, "o", "o"] +[431.189041, "o", "k"] +[431.277218, "o", " "] +[431.437363, "o", "a"] +[431.661354, "o", "t"] +[431.725226, "o", " "] +[431.862491, "o", "t"] +[431.979484, "o", "h"] +[432.045404, "o", "e"] +[432.134089, 
"o", " "] +[432.277058, "o", "s"] +[432.421149, "o", "t"] +[432.469089, "o", "a"] +[432.667918, "o", "c"] +[432.732989, "o", "k"] +[432.893494, "o", " "] +[433.333128, "o", "w"] +[433.412995, "o", "e"] +[433.500507, "o", " "] +[433.653242, "o", "s"] +[433.757095, "o", "h"] +[433.829678, "o", "o"] +[433.948148, "o", "u"] +[434.073821, "o", "l"] +[434.132601, "o", "d"] +[434.22928, "o", " "] +[434.34112, "o", "f"] +[434.397286, "o", "i"] +[434.492998, "o", "n"] +[434.566083, "o", "d"] +[434.677046, "o", " "] +[434.821839, "o", "t"] +[434.909696, "o", "h"] +[434.995523, "o", "e"] +[435.090817, "o", " "] +[437.581687, "o", "a"] +[437.781198, "o", "c"] +[438.013647, "o", "t"] +[438.077543, "o", "u"] +[438.206204, "o", "a"] +[438.294424, "o", "l"] +[438.416002, "o", " "] +[439.485411, "o", "e"] +[439.630758, "o", "x"] +[439.829008, "o", "c"] +[439.916901, "o", "e"] +[439.964921, "o", "p"] +[440.140749, "o", "t"] +[440.215078, "o", "i"] +[440.253158, "o", "o"] +[440.465997, "o", "n"] +[440.573062, "o", " "] +[441.141509, "o", "h"] +[441.173285, "o", "a"] +[441.333677, "o", "n"] +[441.405478, "o", "d"] +[441.53343, "o", "l"] +[441.605845, "o", "e"] +[441.67794, "o", "r"] +[444.222754, "o", "\r\n"] +[444.222812, "o", "(gdb) "] +[445.323324, "o", "p"] +[445.469488, "o", "r"] +[445.541673, "o", "i"] +[445.605066, "o", "n"] +[445.693078, "o", "t"] +[445.838045, "o", " "] +[449.204742, "o", "("] +[449.662622, "o", "v"] +[449.701787, "o", "o"] +[449.789464, "o", "i"] +[449.861186, "o", "d"] +[449.955706, "o", " "] +[450.269467, "o", "*"] +[450.429896, "o", ")"] +[452.877512, "o", "("] +[453.661196, "o", "\b\u001b[K"] +[454.00543, "o", "(*"] +[454.210726, "o", "("] +[454.700739, "o", "i"] +[455.037441, "o", "\b\u001b[K"] +[455.25302, "o", "u"] +[455.43756, "o", "i"] +[455.510815, "o", "n"] +[455.628939, "o", "t"] +[456.820983, "o", "3"] +[456.909062, "o", "2"] +[457.044863, "o", "_"] +[457.213162, "o", "t"] +[457.685637, "o", " "] +[458.29388, "o", "*"] +[458.455775, "o", ")"] 
+[459.281591, "o", "e"] +[459.837464, "o", "\b\u001b[K"] +[460.301045, "o", "$"] +[460.868861, "o", "e"] +[461.03713, "o", "s"] +[461.101321, "o", "p"] +[461.786512, "o", ")"] +[464.477246, "o", "\r\n"] +[472.463976, "o", "$2 = (void *) \u001b[34m0xc15d3840\u001b[m <\u001b[33msysvec_apic_timer_interrupt\u001b[m>\r\n(gdb) "] +[483.413856, "o", "#"] +[483.613547, "o", " "] +[484.941064, "o", "t"] +[485.036894, "o", "h"] +[485.101573, "o", "i"] +[485.277486, "o", "s"] +[485.365346, "o", " "] +[485.782102, "o", "l"] +[485.989418, "o", "o"] +[486.117103, "o", "o"] +[486.197077, "o", "k"] +[486.333443, "o", "s"] +[486.413088, "o", " "] +[486.581136, "o", "l"] +[486.733891, "o", "i"] +[486.912819, "o", "k"] +[487.002561, "o", "e"] +[487.095043, "o", " "] +[487.261133, "o", "t"] +[487.333274, "o", "h"] +[487.46896, "o", "e"] +[487.550243, "o", " "] +[495.172835, "o", "t"] +[495.373523, "o", "i"] +[495.437104, "o", "m"] +[495.540612, "o", "e"] +[495.604806, "o", "r"] +[495.74046, "o", " "] +[496.74974, "o", "i"] +[496.813502, "o", "n"] +[496.93292, "o", "t"] +[497.037013, "o", "e"] +[497.229265, "o", "r"] +[497.372827, "o", "r"] +[497.500693, "o", "u"] +[497.556563, "o", "p"] +[497.673852, "o", "t"] +[498.109885, "o", "\r\n"] +[498.110099, "o", "(gdb) "] +[522.221171, "o", "#"] +[522.444874, "o", " "] +[522.684901, "o", "n"] +[522.91456, "o", "e"] +[523.316756, "o", "\b\u001b[K"] +[523.428746, "o", "\b\u001b[K"] +[523.500892, "o", "t"] +[523.644472, "o", "h"] +[523.757109, "o", "e"] +[523.955806, "o", " "] +[524.099169, "o", "n"] +[524.155454, "o", "e"] +[524.22103, "o", "x"] +[524.508979, "o", "t"] +[524.604798, "o", " "] +[525.057219, "o", "i"] +[525.162621, "o", "t"] +[525.389208, "o", "e"] +[525.469892, "o", "m"] +[525.55658, "o", " "] +[525.668816, "o", "o"] +[525.989001, "o", "n"] +[526.192028, "o", " "] +[526.28711, "o", "t"] +[526.422465, "o", "h"] +[526.493339, "o", "e"] +[526.589426, "o", " "] +[526.717587, "o", "s"] +[527.157628, "o", "t"] +[527.290422, "o", "a"] 
+[527.525162, "o", "c"] +[527.597243, "o", "k"] +[527.724964, "o", " "] +[527.965348, "o", "s"] +[528.029266, "o", "h"] +[528.101143, "o", "o"] +[528.21352, "o", "u"] +[528.373131, "o", "l"] +[528.509865, "o", "d"] +[528.53395, "o", " "] +[528.733472, "o", "b"] +[528.821848, "o", "e"] +[528.876906, "o", " "] +[529.052799, "o", "0"] +[529.726391, "o", "\r\n"] +[529.726579, "o", "(gdb) "] +[531.685101, "o", "# the next item on the stack should be 0"] +[532.28286, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[3Pis looks like the timer interrupt"] +[532.958122, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[4Pprint (void *)(*(uint32_t *)$esp)"] +[533.365021, "o", "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b*(uint32_t *)$esp))"] +[533.700966, "o", "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b(*(uint32_t *)$esp)"] +[534.021436, "o", "\b"] +[534.221855, "o", "\b"] +[534.373191, "o", "\b"] +[534.530437, "o", "\b"] +[534.692778, "o", "\b"] +[536.140976, "o", "($esp)\b\b\b\b\b"] +[536.886188, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[537.029022, "o", "\b"] +[537.809435, "o", "+)\b"] +[537.989237, "o", "4)\b"] +[538.365079, "o", "\u001b[C)\b"] +[539.157186, "o", "\r\n"] +[539.157943, "o", "$3 = (void *) \u001b[34m0x0\u001b[m\r\n(gdb) "] +[541.245538, "o", "#"] +[541.460923, "o", " "] +[541.588649, "o", "t"] +[541.708909, "o", "h"] +[541.828839, "o", "e"] +[541.909914, "o", " "] +[542.068789, "o", "n"] +[542.109349, "o", "e"] +[542.166431, "o", "x"] +[542.469912, "o", "t"] +[542.558985, "o", " "] +[542.725111, "o", "s"] +[542.813268, "o", "h"] +[542.956594, "o", "o"] +[543.036718, "o", "u"] +[543.185591, "o", "l"] +[543.269565, "o", "d"] +[543.340548, "o", " "] +[543.494185, "o", "b"] +[543.564801, "o", "e"] +[543.637447, "o", " "] +[544.053085, "o", "t"] +[544.124773, "o", "h"] +[544.250569, "o", "e"] +[544.290687, "o", " "] +[544.404605, "o", "o"] +[544.565768, "o", "l"] +[544.65011, "o", "d"] +[544.764418, "o", 
" "] +[546.425323, "o", "E"] +[547.028847, "o", "I"] +[547.16456, "o", "P"] +[548.300627, "o", "\r\n"] +[548.300792, "o", "(gdb) "] +[548.989397, "o", "# the next should be the old EIP"] +[549.173162, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cprint (void *)(*(uint32_t *)($esp+4))"] +[549.899257, "o", "\b"] +[550.129437, "o", "\b"] +[550.282812, "o", "\b"] +[552.748678, "o", "\u001b[1P))\b\b"] +[553.452994, "o", "8))\b\b"] +[554.357074, "o", "\r\n"] +[554.357422, "o", "$4 = (void *) \u001b[34m0xc15dcb62\u001b[m <\u001b[33mdefault_idle\u001b[m+18>\r\n(gdb) "] +[555.42081, "o", "#"] +[555.706632, "o", " "] +[555.853611, "o", "t"] +[555.981142, "o", "h"] +[556.060689, "o", "i"] +[556.180769, "o", "s"] +[556.276646, "o", " "] +[556.452498, "o", "l"] +[556.645524, "o", "o"] +[556.777618, "o", "o"] +[556.860672, "o", "k"] +[557.01089, "o", "s"] +[557.060819, "o", " "] +[557.204509, "o", "l"] +[557.364288, "o", "i"] +[557.524835, "o", "k"] +[557.597177, "o", "e"] +[557.71496, "o", " "] +[557.821078, "o", "a"] +[557.916565, "o", " "] +[558.074038, "o", "k"] +[558.189048, "o", "e"] +[558.261556, "o", "r"] +[558.308555, "o", "n"] +[558.404654, "o", "e"] +[558.500443, "o", "l"] +[558.596666, "o", " "] +[559.580743, "o", "f"] +[559.70893, "o", "u"] +[559.788765, "o", "n"] +[559.885141, "o", "c"] +[560.121613, "o", "t"] +[560.162133, "o", "i"] +[560.209714, "o", "o"] +[560.613052, "o", "n"] +[561.118427, "o", "\r\n(gdb) "] +[561.62926, "o", "#"] +[561.836781, "o", " "] +[561.988412, "o", "t"] +[562.10054, "o", "h"] +[562.164515, "o", "i"] +[562.261046, "o", "s"] +[562.3572, "o", " "] +[562.724997, "o", "m"] +[562.805154, "o", "e"] +[562.892772, "o", "a"] +[563.028918, "o", "n"] +[563.15677, "o", "s"] +[563.493188, "o", " "] +[563.741062, "o", "t"] +[563.886535, "o", "h"] +[563.957787, "o", "a"] +[564.029028, "o", "t"] +[564.180631, "o", " "] +[564.673213, "o", "w"] +[564.781861, "o", "e"] +[564.951458, "o", " "] +[567.116541, "o", "h"] +[567.164956, "o", "a"] 
+[567.385305, "o", "d"] +[567.533309, "o", " "] +[570.307091, "o", "a"] +[570.973109, "o", "n"] +[571.052577, "o", " "] +[571.196558, "o", "i"] +[571.276673, "o", "n"] +[571.356641, "o", "t"] +[571.437049, "o", "e"] +[571.573147, "o", "r"] +[571.733301, "o", "r"] +[571.820704, "o", "u"] +[571.886457, "o", "p"] +[571.980515, "o", "t"] +[572.061059, "o", " "] +[572.212575, "o", "w"] +[572.26865, "o", "i"] +[572.44514, "o", "t"] +[572.50106, "o", "h"] +[572.649694, "o", " "] +[572.810376, "o", "a"] +[573.596771, "o", "\b\u001b[K"] +[573.719587, "o", "\b\u001b[K"] +[573.948986, "o", "o"] +[574.060044, "o", "u"] +[574.200126, "o", "t"] +[574.245857, "o", " "] +[574.380757, "o", "a"] +[574.452561, "o", " "] +[574.9488, "o", "p"] +[575.129864, "o", "r"] +[575.246245, "o", "i"] +[575.436917, "o", "v"] +[575.500921, "o", "i"] +[575.708822, "o", "l"] +[575.788518, "o", "e"] +[576.069274, "o", "d"] +[576.335142, "o", "g"] +[576.405962, "o", "e"] +[577.434405, "o", " "] +[577.67696, "o", "t"] +[577.88499, "o", "r"] +[577.980284, "o", "a"] +[578.876849, "o", "s"] +[578.980703, "o", "i"] +[579.149228, "o", "t"] +[579.221513, "o", "i"] +[579.26884, "o", "o"] +[579.41719, "o", "n"] +[579.677291, "o", "\r\n(gdb) "] +[589.020658, "o", "#"] +[589.341219, "o", " "] +[589.660946, "o", "s"] +[589.676545, "o", "o"] +[589.89489, "o", " "] +[591.052951, "o", "t"] +[591.257618, "o", "h"] +[591.35662, "o", "e"] +[592.084943, "o", " "] +[592.410388, "o", "n"] +[592.465233, "o", "e"] +[592.604429, "o", "x"] +[592.900822, "o", "t"] +[593.012732, "o", " "] +[595.93505, "o", "v"] +[596.021101, "o", "a"] +[596.124525, "o", "l"] +[596.372663, "o", "u"] +[596.516986, "o", "e"] +[596.533116, "o", " "] +[596.716689, "o", "o"] +[596.916453, "o", "n"] +[597.012654, "o", " "] +[597.149, "o", "s"] +[597.33301, "o", "t"] +[597.396858, "o", "a"] +[597.605076, "o", "c"] +[597.716706, "o", "k"] +[597.781734, "o", " "] +[597.970938, "o", "s"] +[598.061845, "o", "h"] +[598.148867, "o", "o"] +[598.266043, "o", 
"u"] +[598.412427, "o", "l"] +[598.492956, "o", "d"] +[598.58072, "o", " "] +[598.748929, "o", "b"] +[598.828818, "o", "e"] +[598.986377, "o", " "] +[600.612464, "o", "t"] +[600.701426, "o", "h"] +[600.828697, "o", "e"] +[600.908559, "o", " "] +[604.24482, "o", "o"] +[604.501075, "o", "l"] +[604.637138, "o", "d"] +[604.810185, "o", " "] +[605.22084, "o", "C"] +[605.340414, "o", "S"] +[609.981442, "o", " "] +[610.100291, "o", "a"] +[610.245237, "o", "n"] +[610.285745, "o", "d"] +[610.436727, "o", " "] +[610.55753, "o", "t"] +[610.668519, "o", "h"] +[610.75681, "o", "e"] +[610.860497, "o", " "] +[612.298872, "o", "o"] +[612.500901, "o", "l"] +[612.54844, "o", "d"] +[612.645586, "o", " "] +[612.86999, "o", "E"] +[613.068337, "o", "F"] +[613.14841, "o", "L"] +[613.228911, "o", "A"] +[613.389277, "o", "G"] +[613.509013, "o", "S"] +[614.292855, "o", "\r\n"] +[614.29299, "o", "(gdb) "] +[615.946645, "o", "# so the next value on stack should be the old CS and the old EFLAGS"] +[616.276692, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cthis means that we had an interrupt without a priviledge trasition"] +[616.484792, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[33Plooks like a kernel function"] +[617.009785, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cprint (void *)(*(uint32_t *)($esp+8))"] +[617.540423, "o", "\b"] +[617.69268, "o", "\b"] +[617.836936, "o", "\b"] +[618.204688, "o", "\u001b[1P))\b\b"] +[620.260979, "o", "1))\b\b"] +[620.381284, "o", "2))\b\b"] +[620.932544, "o", "\r\n"] +[620.933245, "o", "$5 = (void *) \u001b[34m0x60\u001b[m\r\n(gdb) "] +[632.741989, "o", "p"] +[632.876823, "o", "r"] +[633.012803, "o", "i"] +[633.08155, "o", "n"] +[633.188878, "o", "t"] +[633.757256, "o", " "] +[634.436865, "o", "/"] +[634.937002, "o", "x"] +[635.085661, "o", " "] +[635.388506, "o", "$"] +[636.389874, "o", "c"] +[636.468457, "o", "s"] +[636.764836, "o", "\r\n"] 
+[636.765008, "o", "$6 = 0x60\r\n(gdb) "] +[641.676586, "o", "#"] +[641.884708, "o", " "] +[641.996872, "o", "a"] +[642.091842, "o", "s"] +[642.196689, "o", " "] +[642.364633, "o", "e"] +[642.484294, "o", "x"] +[642.802488, "o", "p"] +[643.045976, "o", "e"] +[643.164662, "o", "c"] +[643.444844, "o", "t"] +[643.508529, "o", "e"] +[643.708582, "o", "d"] +[643.884817, "o", " "] +[651.548692, "o", "t"] +[653.215508, "o", "h"] +[653.281727, "o", "e"] +[653.413871, "o", " "] +[653.644609, "o", "s"] +[653.772321, "o", "a"] +[653.908641, "o", "m"] +[654.164845, "o", "e"] +[654.828484, "o", " "] +[655.301976, "o", "C"] +[655.497877, "o", "S"] +[655.781698, "o", " "] +[656.109023, "o", "v"] +[656.172724, "o", "a"] +[656.308716, "o", "l"] +[656.540506, "o", "u"] +[656.655112, "o", "e"] +[662.412599, "o", " "] +[666.32495, "o", "\b\u001b[K"] +[682.380362, "o", ","] +[682.516746, "o", " "] +[682.812541, "o", "L"] +[683.084916, "o", "ui"] +[683.17327, "o", "n"] +[683.260224, "o", "u"] +[683.34001, "o", "x"] +[683.388145, "o", " "] +[683.805756, "o", "\b\u001b[K"] +[683.924955, "o", "\b\u001b[K"] +[684.052826, "o", "\b\u001b[K"] +[684.187913, "o", "\b\u001b[K"] +[684.316421, "o", "\b\u001b[K"] +[684.452334, "o", "\b\u001b[K"] +[684.588384, "o", "i"] +[684.78853, "o", "n"] +[684.851703, "o", "u"] +[684.951339, "o", "x"] +[685.045154, "o", " "] +[685.154943, "o", "u"] +[685.236579, "o", "s"] +[685.396194, "o", "e"] +[685.50896, "o", "s"] +[685.62021, "o", " "] +[685.748478, "o", "t"] +[685.887377, "o", "e"] +[686.001983, "o", " "] +[686.332372, "o", "\b\u001b[K"] +[686.461598, "o", "\b\u001b[K"] +[686.644619, "o", "h"] +[686.700338, "o", "e"] +[686.7799, "o", " "] +[686.908468, "o", "s"] +[686.996325, "o", "a"] +[687.068521, "o", "m"] +[687.229012, "o", "e"] +[687.306456, "o", " "] +[687.692433, "o", "c"] +[687.79635, "o", "o"] +[687.916226, "o", "d"] +[687.956423, "o", "e"] +[688.052213, "o", " "] +[688.155815, "o", "s"] +[688.297346, "o", "e"] +[688.411731, "o", "l"] 
+[688.508792, "o", "e"] +[688.589641, "o", "c"] +[688.80467, "o", "t"] +[688.884059, "o", "o"] +[688.972059, "o", "r"] +[689.837294, "o", "\r\n(gdb) "] +[690.13223, "o", "#"] +[690.276507, "o", " "] +[690.468264, "o", "f"] +[690.572263, "o", "o"] +[690.699264, "o", "r"] +[690.983585, "o", " "] +[691.284204, "o", "a"] +[691.372385, "o", "l"] +[691.524311, "o", "l"] +[691.572643, "o", " "] +[691.756121, "o", "t"] +[691.884368, "o", "h"] +[691.968393, "o", "e"] +[692.060258, "o", " "] +[692.396261, "o", "L"] +[692.644795, "o", "i"] +[692.772298, "o", "n"] +[692.846376, "o", "u"] +[692.950449, "o", "x"] +[693.074016, "o", " "] +[693.412476, "o", "k"] +[693.54072, "o", "e"] +[693.60488, "o", "r"] +[693.652141, "o", "n"] +[693.781041, "o", "e"] +[693.842522, "o", "l"] +[693.932546, "o", " "] +[694.093549, "o", "c"] +[694.188975, "o", "o"] +[694.325315, "o", "d"] +[694.372717, "o", "e"] +[699.204488, "o", "\r\n"] +[699.204575, "o", "(gdb) "] +[699.694945, "o", "#"] +[700.025409, "o", " "] +[701.082078, "o", "a"] +[701.219865, "o", "n"] +[701.300717, "o", "d"] +[701.428262, "o", " "] +[701.516018, "o", "a"] +[701.620252, "o", " "] +[701.762559, "o", "d"] +[701.884545, "o", "i"] +[701.964346, "o", "f"] +[702.396072, "o", "f"] +[702.644607, "o", "e"] +[702.772976, "o", "r"] +[702.857454, "o", "e"] +[702.979937, "o", "n"] +[703.105594, "o", "t"] +[703.205114, "o", " "] +[704.469353, "o", "s"] +[704.684404, "o", "e"] +[704.892085, "o", "l"] +[704.948089, "o", "e"] +[705.01199, "o", "c"] +[705.252192, "o", "t"] +[705.378083, "o", "o"] +[705.45294, "o", "r"] +[705.620379, "o", " "] +[705.788126, "o", "f"] +[705.996593, "o", "o"] +[706.156069, "o", "r"] +[706.324525, "o", " "] +[706.563158, "o", "u"] +[706.619009, "o", "s"] +[706.668009, "o", "e"] +[706.738935, "o", "r"] +[707.340183, "o", " "] +[707.444058, "o", "c"] +[707.564119, "o", "o"] +[707.684062, "o", "d"] +[707.740063, "o", "e"] +[709.035117, "o", " "] +[709.180598, "o", "a"] +[709.228019, "o", "s"] +[709.349715, "o", " 
"] +[709.444522, "o", "w"] +[709.564695, "o", "e"] +[709.588558, "o", "i"] +[709.803763, "o", "l"] +[710.268131, "o", "\b\u001b[K"] +[710.421074, "o", "\b\u001b[K"] +[710.581086, "o", "\b\u001b[K"] +[711.029269, "o", "e"] +[711.172267, "o", " "] +[711.356432, "o", "w"] +[711.452408, "o", "i"] +[711.604825, "o", "l"] +[711.760514, "o", "l"] +[711.809979, "o", " "] +[712.108216, "o", "s"] +[712.284343, "o", "e"] +[712.420395, "o", "e"] +[712.531941, "o", "\r\n"] +[712.531998, "o", "(gdb) "] +[717.364429, "o", "# and a different selector for user code as we will see"] +[717.579692, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[24Pfor all the Linux kernel code"] +[718.022257, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cas expected the same CS value, Linux uses the same code selector"] +[718.427912, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cprint /x $cs\u001b[K"] +[718.843933, "o", "\b\b\b\b\b\b(void *)(*(uint32_t *)($esp+12))"] +[719.45271, "o", "\b"] +[719.612108, "o", "\b"] +[719.748109, "o", "\b"] +[720.321698, "o", "\u001b[1P))\b\b"] +[722.604741, "o", "6))\b\b"] +[723.284228, "o", "\r\n"] +[723.284909, "o", "$7 = (void *) \u001b[34m0x200246\u001b[m\r\n(gdb) "] +[732.228199, "o", "#"] +[732.483837, "o", " "] +[732.66115, "o", "t"] +[732.804463, "o", "h"] +[732.886263, "o", "e"] +[733.104678, "o", "s"] +[733.16471, "o", "e"] +[733.692292, "o", " "] +[733.789016, "o", "a"] +[733.937223, "o", "r"] +[734.012056, "o", "e"] +[734.069023, "o", " "] +[734.220446, "o", "t"] +[734.286456, "o", "h"] +[734.380547, "o", "e"] +[734.467934, "o", " "] +[734.596123, "o", "o"] +[734.981382, "o", "l"] +[735.133112, "o", "d"] +[735.268121, "o", " "] +[736.004384, "o", "E"] +[736.201581, "o", "F"] +[736.284997, "o", "L"] +[736.38164, "o", "A"] +[736.540379, "o", "G"] +[736.716525, "o", "S"] +[737.628709, "o", "\r\n"] +[737.628991, "o", "(gdb) "] +[756.459919, "o", "#"] +[756.713355, "o", " "] +[756.972562, "o", 
"l"] +[757.372058, "o", "e"] +[757.555961, "o", "t"] +[757.715745, "o", "s"] +[757.811866, "o", " "] +[757.900933, "o", "s"] +[758.100942, "o", "t"] +[758.252254, "o", "e"] +[758.836667, "o", "p"] +[759.136686, "o", " "] +[759.34112, "o", "t"] +[759.619803, "o", "h"] +[759.723904, "o", "o"] +[759.740077, "o", "r"] +[759.892415, "o", "u"] +[759.988008, "o", "g"] +[760.092285, "o", "h"] +[760.209389, "o", "t"] +[760.52954, "o", "\b\u001b[K"] +[760.659839, "o", "\b\u001b[K"] +[760.796107, "o", "\b\u001b[K"] +[760.924285, "o", "\b\u001b[K"] +[761.060871, "o", "\b\u001b[K"] +[761.49694, "o", "\b\u001b[K"] +[761.677155, "o", "r"] +[761.77997, "o", "o"] +[761.940033, "o", "u"] +[762.444551, "o", "g"] +[762.500741, "o", "h"] +[762.62848, "o", " "] +[762.752533, "o", "t"] +[762.795851, "o", "h"] +[762.901139, "o", "e"] +[762.972318, "o", " "] +[763.076113, "o", "e"] +[763.163768, "o", "x"] +[763.891933, "o", "c"] +[763.964072, "o", "e"] +[764.075829, "o", "p"] +[764.220556, "o", "t"] +[764.299718, "o", "i"] +[764.339842, "o", "o"] +[764.500348, "o", "n"] +[764.580435, "o", " "] +[764.749173, "o", "h"] +[764.805422, "o", "a"] +[764.940176, "o", "n"] +[765.00449, "o", "d"] +[765.172277, "o", "l"] +[765.259869, "o", "e"] +[765.36498, "o", "r"] +[767.092697, "o", "\r\n(gdb) "] +[769.452692, "o", "n"] +[769.492828, "o", "e"] +[769.60402, "o", "x"] +[769.852194, "o", "t"] +[770.75791, "o", "\r\n"] +[770.817206, "o", "\u001b[33mhandle_exception\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1155\r\n1155\t\tENCODE_FRAME_POINTER\r\n(gdb) "] +[771.979803, "o", "\r\n"] +[771.984483, "o", "1158\t\tGS_TO_REG %ecx\r\n(gdb) "] +[772.901642, "o", "\r\n"] +[772.904154, "o", "1159\t\tmovl\tPT_GS(%esp), %edi\t\t# get the function address\r\n(gdb) "] +[774.044833, "o", "\r\n"] +[774.047555, "o", "1160\t\tREG_TO_PTGS %ecx\r\n(gdb) "] +[775.068367, "o", "\r\n"] +[775.071481, "o", "1161\t\tSET_KERNEL_GS %ecx\r\n(gdb) "] +[776.020547, "o", "\r\n"] +[776.024942, "o", 
"1164\t\tmovl\tPT_ORIG_EAX(%esp), %edx\t\t# get the error code\r\n(gdb) "] +[777.372032, "o", "\r\n"] +[777.374254, "o", "1165\t\tmovl\t$-1, PT_ORIG_EAX(%esp)\t\t# no syscall to restart\r\n(gdb) "] +[778.932072, "o", "\r\n"] +[778.935109, "o", "1167\t\tmovl\t%esp, %eax\t\t\t# pt_regs pointer\r\n(gdb) "] +[780.084544, "o", "\r\n"] +[780.087079, "o", "1168\t\tCALL_NOSPEC edi\r\n"] +[780.087242, "o", "(gdb) "] +[789.068946, "o", "t"] +[789.163754, "o", "h"] +[789.227926, "o", "i"] +[789.364066, "o", "s"] +[789.475736, "o", " "] +[789.77071, "o", "s"] +[789.845352, "o", "h"] +[789.900666, "o", "o"] +[790.01865, "o", "u"] +[790.164322, "o", "l"] +[790.212024, "o", "d"] +[790.379831, "o", " "] +[790.843865, "o", "c"] +[790.948496, "o", "a"] +[790.98792, "o", "l"] +[791.099624, "o", "l"] +[791.179854, "o", " "] +[791.884188, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[792.124312, "o", "\u001b[1@#"] +[792.300147, "o", "\u001b[1@ "] +[792.523828, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[793.004183, "o", "t"] +[793.108772, "o", "h"] +[793.244482, "o", "e"] +[793.283931, "o", " "] +[794.948292, "o", "a"] +[795.052886, "o", "c"] +[795.267741, "o", "t"] +[795.332027, "o", "u"] +[795.443685, "o", "a"] +[795.53154, "o", "l"] +[795.643762, "o", " "] +[795.755946, "o", "e"] +[795.875998, "o", "x"] +[796.076283, "o", "c"] +[796.163912, "o", "e"] +[796.291666, "o", "p"] +[796.507701, "o", "t"] +[796.564221, "o", "i"] +[796.619819, "o", "o"] +[796.787619, "o", "n"] +[796.876639, "o", " "] +[797.227727, "o", "h"] +[797.347798, "o", "a"] +[797.453011, "o", "n"] +[797.531866, "o", "d"] +[797.643736, "o", "l"] +[797.731685, "o", "e"] +[797.811748, "o", "r"] +[798.716546, "o", "\r\n(gdb) "] +[799.81999, "o", "p"] +[799.969817, "o", "r"] +[800.040246, "o", "i"] +[800.120984, "o", "n"] +[800.180163, "o", "t"] +[800.347919, "o", " "] +[800.969409, "o", "/"] +[801.120768, 
"o", "x"] +[801.221364, "o", " "] +[801.500176, "o", "$"] +[802.020108, "o", "e"] +[802.260097, "o", "d"] +[802.451977, "o", "i"] +[803.732078, "o", "\r\n"] +[803.732203, "o", "$8 = 0xc15d3840\r\n(gdb) "] +[806.04535, "o", "print /x $edi"] +[806.435671, "o", "\b"] +[806.952968, "o", "\b"] +[806.979538, "o", "\b"] +[807.01104, "o", "\b"] +[807.042755, "o", "\b"] +[807.604211, "o", "\b\u001b[1P $edi\b\b\b\b\b"] +[807.724321, "o", "\b\u001b[1P $edi\b\b\b\b\b"] +[808.12947, "o", "( $edi\b\b\b\b\b"] +[808.30103, "o", "v $edi\b\b\b\b\b"] +[808.400349, "o", "o $edi\b\b\b\b\b"] +[808.459861, "o", "i $edi\b\b\b\b\b"] +[808.54765, "o", "d $edi\b\b\b\b\b"] +[808.691701, "o", "\u001b[C $edi\b\b\b\b\b"] +[808.964799, "o", "* $edi\b\b\b\b\b"] +[809.100734, "o", ") $edi\b\b\b\b\b"] +[809.780431, "o", "\u001b[1P$edi\b\b\b\b"] +[810.188463, "o", "\r\n"] +[810.188589, "o", "$9 = (void *) \u001b[34m0xc15d3840\u001b[m <\u001b[33msysvec_apic_timer_interrupt\u001b[m>\r\n(gdb) "] +[820.684388, "o", "s"] +[821.316168, "o", "t"] +[821.396037, "o", "e"] +[821.563919, "o", "p"] +[821.819873, "o", "i"] +[823.507985, "o", "\r\n"] +[823.511475, "o", "\u001b[34m0xc15e0b05\u001b[m in \u001b[33m__x86_retpoline_edi\u001b[m ()\u001b[m\r\n \u001b[m at \u001b[32m./arch/x86/include/asm/GEN-for-each-reg.h\u001b[m:23\r\n23\tGEN(edi)\r\n(gdb) "] +[827.219827, "o", "b"] +[827.348216, "o", "t"] +[828.148173, "o", "\r\n"] +[828.14836, "o", "#0 \u001b[34m0xc15e0b05\u001b[m in \u001b[33m__x86_retpoline_edi\u001b[m ()\u001b[m\r\n \u001b[m at \u001b[32m./arch/x86/include/asm/GEN-for-each-reg.h\u001b[m:23\r\n#1 \u001b[34m0xc15deb5f\u001b[m in \u001b[33mhandle_exception\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1168\r\n"] +[828.149581, "o", "#2 \u001b[34m0x00000000\u001b[m in \u001b[33m??\u001b[m ()\r\n"] +[828.150395, "o", "(gdb) "] +[829.675679, "o", "s"] +[829.880802, "o", "t"] +[829.945607, "o", "e"] +[829.998494, "o", "p"] +[830.140786, "o", "i"] +[830.467985, "o", "\r\n"] +[830.470567, "o", 
"\u001b[34m0xc15e0b11\u001b[m in \u001b[33m__x86_retpoline_edi\u001b[m ()\u001b[m\r\n \u001b[m at \u001b[32m./arch/x86/include/asm/GEN-for-each-reg.h\u001b[m:23\r\n23\tGEN(edi)\r\n"] +[830.470677, "o", "(gdb) "] +[831.507846, "o", "\r\n"] +[831.510731, "o", "\u001b[34m0xc15e0b14\u001b[m\t23\tGEN(edi)\r\n(gdb) "] +[833.044138, "o", "\r\n"] +[833.047651, "o", "\u001b[33msysvec_apic_timer_interrupt\u001b[m (\u001b[36mregs\u001b[m=0xc17cfebc)\u001b[m\r\n \u001b[m at \u001b[32march/x86/kernel/apic/apic.c\u001b[m:1091\r\n"] +[833.047776, "o", "1091\tDEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)\r\n(gdb) "] +[837.992379, "o", "b"] +[838.082464, "o", "t"] +[838.468115, "o", "\r\n"] +[838.468954, "o", "#0 \u001b[33msysvec_apic_timer_interrupt\u001b[m (\u001b[36mregs\u001b[m=0xc17cfebc)\u001b[m\r\n \u001b[m at \u001b[32march/x86/kernel/apic/apic.c\u001b[m:1091\r\n#1 \u001b[34m0xc15deb5f\u001b[m in \u001b[33mhandle_exception\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1168\r\n"] +[838.470231, "o", "#2 \u001b[34m0x00000000\u001b[m in \u001b[33m??\u001b[m ()\r\n"] +[838.47094, "o", "(gdb) "] +[840.947473, "o", "#"] +[841.144354, "o", " "] +[841.360793, "o", "o"] +[841.42971, "o", "k"] +[842.204191, "o", ","] +[842.341979, "o", " "] +[842.444332, "o", "w"] +[842.539968, "o", "e"] +[842.677559, "o", " "] +[842.867869, "o", "r"] +[842.932437, "o", "e"] +[843.051893, "o", "a"] +[843.267887, "o", "c"] +[843.717425, "o", "h"] +[844.315499, "o", " "] +[844.976743, "o", "\b\u001b[K"] +[845.116092, "o", "e"] +[845.283572, "o", "d"] +[845.419727, "o", " "] +[845.579645, "o", "t"] +[845.69171, "o", "h"] +[845.784239, "o", "e"] +[845.884895, "o", " "] +[846.475528, "o", "e"] +[846.992223, "o", "x"] +[847.779862, "o", "c"] +[847.899595, "o", "e"] +[848.012177, "o", "p"] +[848.237069, "o", "t"] +[848.299719, "o", "i"] +[848.355554, "o", "o"] +[848.548387, "o", "n"] +[848.668273, "o", " "] +[848.883368, "o", "h"] +[848.924119, "o", "a"] +[849.102725, "o", "n"] 
+[849.195892, "o", "d"] +[849.320334, "o", "l"] +[849.403392, "o", "e"] +[849.469381, "o", "r"] +[849.555461, "o", " "] +[849.683646, "o", "f"] +[849.764172, "o", "u"] +[849.82762, "o", "n"] +[849.940511, "o", "c"] +[850.140061, "o", "t"] +[850.188329, "o", "i"] +[850.236049, "o", "o"] +[850.403561, "o", "n"] +[851.244482, "o", "\r\n(gdb) "] +[851.940142, "o", "#"] +[852.116219, "o", " "] +[852.362087, "o", "l"] +[852.425526, "o", "e"] +[852.572158, "o", "t"] +[852.899784, "o", "s"] +[853.075543, "o", " "] +[853.252474, "o", "s"] +[853.360745, "o", "k"] +[853.555665, "o", "i"] +[853.756754, "o", "p"] +[853.893804, "o", " "] +[854.067629, "o", "w"] +[854.179574, "o", "o"] +[854.479805, "o", "\b\u001b[K"] +[854.580246, "o", "\b\u001b[K"] +[854.691967, "o", "t"] +[854.756166, "o", "o"] +[854.866408, "o", " "] +[854.916837, "o", "t"] +[855.051746, "o", "h"] +[855.13964, "o", "e"] +[855.203583, "o", " "] +[855.484009, "o", "e"] +[855.593676, "o", "n"] +[855.675946, "o", "d"] +[856.363844, "o", "\r\n"] +[856.363913, "o", "(gdb) "] +[857.411824, "o", "f"] +[857.459708, "o", "i"] +[857.596332, "o", "n"] +[857.644121, "o", "i"] +[857.716486, "o", "s"] +[857.825967, "o", "h"] +[859.756006, "o", "\r\n"] +[859.756719, "o", "Run till exit from #0 \u001b[33msysvec_apic_timer_interrupt\u001b[m (\u001b[36mregs\u001b[m=0xc17cfebc)\u001b[m\r\n \u001b[m at \u001b[32march/x86/kernel/apic/apic.c\u001b[m:1091\r\n"] +[859.763094, "o", "\u001b[33mhandle_exception\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1179\r\n1179\t\tmovl\tPT_CS(%esp), %eax\r\n"] +[859.763267, "o", "(gdb) "] +[862.512345, "o", "n"] +[864.404212, "o", "\r\n"] +[864.406527, "o", "1180\t\tandl\t$SEGMENT_RPL_MASK, %eax\r\n"] +[864.406578, "o", "(gdb) "] +[865.654797, "o", "n"] +[866.267877, "o", "\r\n"] +[866.27079, "o", "1182\t\tcmpl\t$USER_RPL, %eax\t\t\t# returning to v8086 or userspace ?\r\n(gdb) "] +[867.360688, "o", "n"] +[867.708304, "o", "\r\n"] +[867.710843, "o", "1183\t\tjnb\tret_to_user\r\n"] 
+[867.710968, "o", "(gdb) "] +[868.899288, "o", "n"] +[870.284212, "o", "\r\n"] +[870.287095, "o", "1185\t\tPARANOID_EXIT_TO_KERNEL_MODE\r\n"] +[870.287206, "o", "(gdb) "] +[871.396245, "o", "n"] +[872.356138, "o", "\r\n"] +[872.360073, "o", "1187\t\tRESTORE_REGS 4\r\n(gdb) "] +[873.580284, "o", "n"] +[874.020225, "o", "\r\n"] +[874.06255, "o", "\u001b[33mhandle_exception\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1188\r\n1188\t\tjmp\t.Lirq_return\r\n(gdb) "] +[880.227976, "o", "s"] +[880.563745, "o", "t"] +[880.663906, "o", "e"] +[880.772059, "o", "p"] +[881.016269, "o", "i"] +[885.107968, "o", "\r\n"] +[885.1111, "o", "\u001b[33mentry_INT80_32\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1080\r\n1080\t\tINTERRUPT_RETURN\r\n(gdb) "] +[886.896524, "o", "d"] +[887.002145, "o", "i"] +[887.17178, "o", "s"] +[887.267258, "o", "a"] +[887.427617, "o", "s"] +[888.193298, "o", "semble "] +[888.985353, "o", "$"] +[889.2295, "o", "p"] +[889.6279, "o", "c"] +[889.755674, "o", ","] +[890.130259, "o", "+"] +[890.523791, "o", "4"] +[892.275975, "o", "\r\n"] +[892.276128, "o", "Dump of assembler code from 0xc15dea1e to 0xc15dea22:\r\n"] +[892.276217, "o", "=> \u001b[34m0xc15dea1e\u001b[m <\u001b[33mentry_INT80_32\u001b[m+426>:\tiret \r\n \u001b[34m0xc15dea1f\u001b[m <\u001b[33mhandle_exception\u001b[m+0>:\tcld \r\n"] +[892.276254, "o", " \u001b[34m0xc15dea20\u001b[m <\u001b[33mhandle_exception\u001b[m+1>:\tpush %fs\r\nEnd of assembler dump.\r\n"] +[892.276365, "o", "(gdb) "] +[900.885286, "o", "#"] +[901.051387, "o", " "] +[903.108694, "o", "w"] +[903.195526, "o", "e"] +[903.331464, "o", " "] +[903.507641, "o", "r"] +[903.579802, "o", "e"] +[903.651849, "o", "a"] +[903.827191, "o", "c"] +[903.915925, "o", "h"] +[904.018162, "o", "e"] +[904.084005, "o", "d"] +[904.193456, "o", " "] +[904.340005, "o", "t"] +[904.419249, "o", "h"] +[904.50751, "o", "e"] +[904.619508, "o", " "] +[905.050041, "o", "e"] +[905.142961, "o", "n"] +[905.242133, "o", "d"] 
+[905.356107, "o", " "] +[906.418504, "o", "o"] +[906.483992, "o", "f"] +[906.571581, "o", " "] +[906.731762, "o", "t"] +[906.788765, "o", "h"] +[906.915517, "o", "e"] +[907.019539, "o", " "] +[907.226477, "o", "e"] +[907.376179, "o", "x"] +[907.627505, "o", "c"] +[908.25994, "o", "e"] +[908.371671, "o", "p"] +[908.580147, "o", "t"] +[908.635847, "o", "i"] +[908.675142, "o", "o"] +[908.868113, "o", "n"] +[909.147401, "o", " "] +[910.283243, "o", "h"] +[910.34774, "o", "a"] +[910.539401, "o", "n"] +[910.94662, "o", "d"] +[911.099697, "o", "l"] +[912.379334, "o", "i"] +[912.463978, "o", "n"] +[912.578486, "o", "g"] +[915.172176, "o", "\r\n"] +[915.17238, "o", "(gdb) "] +[915.627306, "o", "#"] +[915.802899, "o", " "] +[916.01211, "o", "i"] +[916.132261, "o", "r"] +[916.480252, "o", "e"] +[916.740283, "o", "t"] +[916.875718, "o", " "] +[917.027976, "o", "t"] +[917.707571, "o", "\b\u001b[K"] +[917.850998, "o", "w"] +[917.987234, "o", "i"] +[918.164289, "o", "l"] +[918.345294, "o", "l"] +[918.428587, "o", " "] +[921.142498, "o", "p"] +[921.315361, "o", "o"] +[921.379736, "o", "p"] +[922.235945, "o", " "] +[922.523391, "o", "t"] +[922.65927, "o", "h"] +[922.77948, "o", "e"] +[922.931826, "o", " "] +[923.619616, "o", "o"] +[923.81968, "o", "l"] +[924.348218, "o", "d"] +[924.483915, "o", " "] +[924.803599, "o", "E"] +[925.091333, "o", "I"] +[925.204834, "o", "P"] +[925.859791, "o", ","] +[926.323847, "o", " "] +[926.628744, "o", "O"] +[926.836056, "o", "L"] +[926.972515, "o", "D"] +[927.555719, "o", " "] +[927.860637, "o", "C"] +[927.956656, "o", "S"] +[928.131232, "o", " "] +[928.283599, "o", "a"] +[928.379165, "o", "n"] +[928.499395, "o", "d"] +[928.572276, "o", " "] +[930.399792, "o", "o"] +[930.579592, "o", "l"] +[930.747191, "o", "d "] +[936.451423, "o", "E"] +[936.651497, "o", "F"] +[936.724116, "o", "L"] +[936.803501, "o", "A"] +[936.944129, "o", "G"] +[937.017988, "o", "S"] +[940.419551, "o", "\r\n(gdb) "] +[940.703548, "o", "#"] +[940.867379, "o", " "] 
+[941.131995, "o", "a"] +[941.236387, "o", "n"] +[941.315354, "o", "d"] +[941.428119, "o", " "] +[941.545304, "o", "r"] +[941.595714, "o", "e"] +[941.780241, "o", "s"] +[941.875308, "o", "u"] +[942.324162, "o", "m"] +[942.479946, "o", "e"] +[942.595786, "o", " "] +[942.724262, "o", "t"] +[942.828484, "o", "h"] +[942.914989, "o", "e"] +[943.019342, "o", " "] +[943.291437, "o", "p"] +[943.427129, "o", "r"] +[943.492285, "o", "e"] +[943.680718, "o", "v"] +[943.762965, "o", "i"] +[943.827585, "o", "o"] +[944.003395, "o", "u"] +[944.075376, "o", "s"] +[944.179349, "o", " "] +[944.620027, "o", "e"] +[944.800545, "o", "x"] +[944.880871, "o", "e"] +[945.076374, "o", "c"] +[945.355484, "o", "t"] +[945.451345, "o", "i"] +[945.499272, "o", "o"] +[945.699588, "o", "n"] +[945.868687, "o", " "] +[946.332003, "o", "f"] +[946.451626, "o", "l"] +[946.715244, "o", "o"] +[949.939268, "o", "w"] +[952.179651, "o", "\r\n(gdb) "] +[953.872763, "o", "s"] +[954.104233, "o", "t"] +[954.220454, "o", "e"] +[954.523396, "o", "p"] +[954.699106, "o", "i"] +[957.155393, "o", "\r\n"] +[957.157788, "o", "\u001b[34m0xc15dcb62\u001b[m in \u001b[33mdefault_idle\u001b[m () at \u001b[32m./arch/x86/include/asm/irqflags.h\u001b[m:60\r\n60\t\tasm volatile(\"sti; hlt\": : :\"memory\");\r\n"] +[957.157838, "o", "(gdb) "] +[959.275429, "o", "#"] +[959.427369, "o", " "] +[959.547282, "o", "a"] +[959.660394, "o", "n"] +[959.739359, "o", "d"] +[959.843798, "o", " "] +[959.9766, "o", "w"] +[960.047887, "o", "e"] +[960.172093, "o", " "] +[960.315666, "o", "a"] +[960.419302, "o", "r"] +[960.491251, "o", "e"] +[960.579224, "o", " "] +[960.77959, "o", "b"] +[960.851343, "o", "a"] +[960.915682, "o", "c"] +[961.049011, "o", "k"] +[961.201567, "o", " "] +[962.4203, "o", "i"] +[962.475459, "o", "n"] +[962.61159, "o", " "] +[962.714878, "o", "t"] +[962.851242, "o", "h"] +[962.931223, "o", "e"] +[963.051282, "o", " "] +[963.907009, "o", "o"] +[964.042918, "o", "r"] +[964.123059, "o", "i"] +[964.251116, "o", "g"] 
+[964.299084, "o", "i"] +[964.45136, "o", "n"] +[964.555122, "o", "a"] +[964.931034, "o", "l"] +[965.155018, "o", " "] +[972.371607, "o", "f"] +[972.555443, "o", "u"] +[972.643143, "o", "n"] +[972.76443, "o", "c"] +[973.027243, "o", "t"] +[973.075346, "o", "i"] +[973.13917, "o", "o"] +[973.344676, "o", "n"] +[973.499763, "o", " "] +[976.467642, "o", "\r\n"] +[976.467915, "o", "(gdb) "] +[996.435802, "o", "quit\r\n"] +[996.436065, "o", "A debugging session is active.\r\n\r\n\tInferior 1 [process 1] will be detached.\r\n\r\nQuit anyway? (y or n) "] +[997.892586, "o", "y"] +[998.379483, "o", "\r\nDetaching from program: /linux/vmlinux, process 1\r\n"] +[998.380309, "o", "Ending remote debugging.\r\n"] +[998.380413, "o", "[Inferior 1 (process 1) detached]\r\n"] +[998.387415, "o", "$ "] +[1004.003082, "o", "^L"] +[1004.78691, "o", "\b \b\b \b"] +[1005.515685, "o", "#"] +[1005.891313, "o", " "] +[1006.098914, "o", "n"] +[1006.171159, "o", "o"] +[1006.251338, "o", "w"] +[1006.379746, "o", " "] +[1006.642949, "o", "l"] +[1006.756392, "o", "e"] +[1006.972256, "o", "t"] +[1007.451427, "o", "s"] +[1008.78733, "o", " "] +[1011.707191, "o", "t"] +[1011.922977, "o", "r"] +[1012.075141, "o", "y"] +[1012.211589, "o", " "] +[1012.407443, "o", "t"] +[1012.496215, "o", "o"] +[1012.63914, "o", " "] +[1013.171266, "o", "d"] +[1013.280435, "o", "o"] +[1013.382917, "o", " "] +[1013.699697, "o", "a"] +[1015.067856, "o", " "] +[1017.747341, "o", "p"] +[1017.899377, "o", "r"] +[1018.02099, "o", "i"] +[1018.144684, "o", "v"] +[1018.219465, "o", "i"] +[1018.435513, "o", "l"] +[1018.578748, "o", "e"] +[1018.898886, "o", "d"] +[1019.160504, "o", "g"] +[1019.315664, "o", "e"] +[1020.651137, "o", " "] +[1020.795118, "o", "t"] +[1020.971166, "o", "r"] +[1021.085116, "o", "a"] +[1021.231631, "o", "n"] +[1021.387101, "o", "s"] +[1021.675097, "o", "i"] +[1021.867216, "o", "t"] +[1021.930784, "o", "i"] +[1021.994802, "o", "o"] +[1022.227196, "o", "n"] +[1022.34665, "o", " "] +[1032.083575, "o", 
"^[[D"] +[1032.923178, "o", "\b \b"] +[1033.050768, "o", "\b \b"] +[1033.195851, "o", "\b \b\b \b"] +[1033.355124, "o", "\b \b"] +[1034.275365, "o", "\r\n"] +[1034.275424, "o", "$ "] +[1035.090995, "o", "#"] +[1035.571157, "o", " "] +[1038.299127, "o", "f"] +[1038.418833, "o", "o"] +[1038.546673, "o", "r"] +[1038.643075, "o", " "] +[1038.811737, "o", "t"] +[1038.986934, "o", "h"] +[1039.075088, "o", "a"] +[1039.171894, "o", "t"] +[1039.290797, "o", " "] +[1039.714845, "o", "w"] +[1040.060445, "o", "e"] +[1040.579062, "o", " "] +[1040.715253, "o", "w"] +[1040.835144, "o", "i"] +[1041.008189, "o", "l"] +[1041.188661, "o", "l"] +[1062.274587, "o", " "] +[1063.994677, "o", "r"] +[1064.066586, "o", "u"] +[1064.258739, "o", "n"] +[1064.754719, "o", " "] +[1064.866336, "o", "a"] +[1064.95491, "o", " "] +[1069.779045, "o", "u"] +[1069.842686, "o", "s"] +[1069.898904, "o", "e"] +[1069.992062, "o", "r"] +[1070.090743, "o", "s"] +[1070.186611, "o", "p"] +[1070.271994, "o", "a"] +[1070.335417, "o", "c"] +[1070.435046, "o", "e"] +[1070.490858, "o", " "] +[1070.667137, "o", "l"] +[1070.843138, "o", "o"] +[1070.978461, "o", "o"] +[1071.042789, "o", "p"] +[1072.299918, "o", "\r\n$ "] +[1072.677072, "o", "#"] +[1072.8757, "o", " "] +[1073.123342, "o", "t"] +[1073.187116, "o", "o"] +[1073.234967, "o", " "] +[1073.64715, "o", "m"] +[1073.723177, "o", "a"] +[1074.003257, "o", "x"] +[1074.066757, "o", "i"] +[1074.242917, "o", "m"] +[1074.440397, "o", "e"] +[1074.951277, "o", "\b \b"] +[1075.082786, "o", "i"] +[1075.154841, "o", "z"] +[1075.306772, "o", "e"] +[1075.362669, "o", " "] +[1075.507002, "o", "t"] +[1075.563397, "o", "h"] +[1075.667216, "o", "e"] +[1075.759055, "o", " "] +[1075.971488, "o", "c"] +[1076.40312, "o", "h"] +[1076.538776, "o", "a"] +[1076.822943, "o", "n"] +[1077.090772, "o", "c"] +[1077.179219, "o", "e"] +[1077.274764, "o", " "] +[1077.427, "o", "o"] +[1077.538624, "o", "f"] +[1077.6266, "o", " "] +[1080.099022, "o", "c"] +[1080.220601, "o", "a"] +[1080.435652, 
"o", "t"] +[1080.642958, "o", "c"] +[1080.754845, "o", "h"] +[1080.890696, "o", "i"] +[1080.970811, "o", "n"] +[1081.066751, "o", "g"] +[1081.187186, "o", " "] +[1081.450963, "o", "a"] +[1081.535893, "o", " "] +[1082.419055, "o", "u"] +[1082.554783, "o", "s"] +[1082.707252, "o", "e"] +[1082.778665, "o", "r"] +[1082.907521, "o", " "] +[1083.155048, "o", "-"] +[1083.642635, "o", ">"] +[1083.82688, "o", " "] +[1083.995373, "o", "i"] +[1084.106729, "o", "r"] +[1084.15548, "o", "r"] +[1084.226868, "o", "q"] +[1084.523026, "o", " "] +[1084.722585, "o", "t"] +[1084.890951, "o", "r"] +[1084.955439, "o", "a"] +[1085.066325, "o", "n"] +[1085.170702, "o", "s"] +[1085.490659, "o", "i"] +[1085.791096, "o", "t"] +[1085.864736, "o", "i"] +[1085.89322, "o", "o"] +[1086.098742, "o", "n"] +[1086.763832, "o", " "] +[1093.283335, "o", "\r\n"] +[1093.285702, "o", "$ "] +[1094.41432, "o", "^[[A"] +[1094.786061, "o", "^[[A"] +[1095.389595, "o", "\b \b"] +[1095.895038, "o", "\b \b"] +[1095.9211, "o", "\b \b\b \b"] +[1095.953965, "o", "\b \b"] +[1095.985163, "o", "\b \b"] +[1096.0314, "o", "\b \b\b \b"] +[1096.289423, "o", "m"] +[1096.384651, "o", "i"] +[1096.91055, "o", "n"] +[1096.977922, "o", "i"] +[1097.17697, "o", "c"] +[1097.232856, "o", "o"] +[1097.320687, "o", "m"] +[1097.841256, "o", " "] +[1098.864769, "o", "-"] +[1099.174029, "o", "D"] +[1099.281202, "o", " "] +[1099.616635, "o", "s"] +[1099.680677, "o", "e"] +[1099.776722, "o", "r"] +[1099.840456, "o", "i"] +[1100.020817, "o", "a"] +[1100.128658, "o", "l"] +[1100.313642, "o", "."] +[1100.528928, "o", "p"] +[1100.712992, "o", "t"] +[1101.260834, "o", "s"] +[1101.369434, "o", "\r\n"] +[1101.370212, "o", "\u001b[!p\u001b[?3;4l\u001b[4l\u001b>\u001b[0m\u001b(B\u001b[?1h\u001b=\u001b[H\u001b[2J"] +[1101.370539, "o", "\u001b[?12l\u001b[?25h\nWelcome to minicom 2.7.1\r\n\nOPTIONS: I18n \r\nCompiled on Dec 23 2019, 02:06:26.\r\nPort serial.pts, 22:39:44\r\n\nPress CTRL-A Z for help on special keys\r\n\n"] +[1102.49074, "o", "\n"] 
+[1102.491207, "o", "Poky (Yocto Project Reference Distro) 2.3 qemux86 /dev/hvc0"] +[1102.491482, "o", "\r\n"] +[1102.492003, "o", "\n"] +[1102.492481, "o", "qemux86 login: "] +[1102.921642, "o", "r"] +[1103.001363, "o", "o"] +[1103.129677, "o", "o"] +[1103.217562, "o", "t"] +[1103.322957, "o", "\r\n"] +[1103.376683, "o", "root@qemux86:~# "] +[1104.675056, "o", "\r\n"] +[1104.676255, "o", "root@qemux86:~# "] +[1107.089642, "o", "i"] +[1107.298186, "o", "="] +[1107.529893, "o", "0"] +[1107.802507, "o", ";"] +[1108.129744, "o", " "] +[1108.378208, "o", "w"] +[1108.433774, "o", "h"] +[1108.489985, "o", "i"] +[1108.641622, "o", "l"] +[1108.713754, "o", "e"] +[1108.834396, "o", " "] +[1108.945519, "o", "t"] +[1109.082068, "o", "r"] +[1109.161673, "o", "u"] +[1109.601342, "o", "e"] +[1110.130521, "o", ";"] +[1110.937837, "o", " "] +[1111.081655, "o", "d"] +[1111.170074, "o", "o"] +[1113.96216, "o", " "] +[1115.225582, "o", "i"] +[1115.635039, "o", "="] +[1115.993851, "o", "$"] +[1116.177231, "o", "["] +[1116.512499, "o", "i"] +[1116.955475, "o", "\b\u001b[16;1H\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[15;41H\u001b[K"] +[1117.241465, "o", "$"] +[1117.490132, "o", "i"] +[1118.170026, "o", "+"] +[1118.473776, "o", "1"] +[1118.641428, "o", "]"] +[1120.374762, "o", ";"] +[1120.986189, "o", " "] +[1122.00169, "o", "d"] +[1122.042127, "o", "o"] +[1122.224768, "o", "n"] +[1122.362033, "o", "e"] +[1123.097725, "o", " "] +[1124.041576, "o", "&"] +[1124.842453, "o", "\r\n"] +[1124.846558, "o", "root@qemux86:~# "] +[1125.945623, "o", "t"] +[1126.06538, "o", "o"] +[1126.153341, "o", "p"] +[1126.819453, "o", "\r\n"] +[1127.026599, "o", "\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[KMem: 30712K used, 209968K free, 172K 
shrd, 368K buff, 4344K cached"] +[1127.026754, "o", "\r\n"] +[1127.026857, "o", "CPU: 86% usr 13% sys 0% nic 0% idle 0% io 0% irq 0% sirq"] +[1127.027083, "o", "\r\n"] +[1127.027772, "o", "Load average: 0.28 0.65 0.45 2/38 348"] +[1127.027961, "o", "\r\n"] +[1127.044934, "o", "\u001b[0m\u001b(B\u001b[7m PID PPID USER STAT VSZ %VSZ %CPU COMMAND"] +[1127.045363, "o", "\r\n"] +[1127.050449, "o", "\u001b[0m\u001b(B 347 345 root R 2972 1% 80% -sh"] +[1127.050668, "o", "\r\n"] +[1127.051028, "o", " 348 345 root R 2828 1% 7% top"] +[1127.051127, "o", "\r\n"] +[1127.052058, "o", " 10 2 root IW 0 0% 7% [rcu_sched]"] +[1127.052239, "o", "\r\n"] +[1127.05275, "o", " 345 1 root S 2972 1% 0% -sh"] +[1127.052867, "o", "\r\n"] +[1127.057884, "o", " 198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages"] +[1127.058054, "o", "\r\n"] +[1127.058664, "o", " 201 1 root S 2828 1% 0% /sbin/klogd -n"] +[1127.058723, "o", "\r\n"] +[1127.059444, "o", " 207 1 root S 2828 1% 0% /sbin/getty 38400 tty1"] +[1127.059553, "o", "\r\n"] +[1127.060102, "o", " 209 1 root S 2828 1% 0% /sbin/getty 38400 tty2"] +[1127.060272, "o", "\r\n"] +[1127.06083, "o", " 210 1 root S 2828 1% 0% /sbin/getty 38400 tty3"] +[1127.060931, "o", "\r\n"] +[1127.066093, "o", " 211 1 root S 2828 1% 0% /sbin/getty 38400 tty4"] +[1127.066277, "o", "\r\n"] +[1127.066919, "o", " 212 1 root S 2828 1% 0% /sbin/getty 38400 tty5"] +[1127.067167, "o", "\r\n"] +[1127.067611, "o", " 187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p\u001b[16;80H"] +[1127.067744, "o", "\r\n"] +[1127.068254, "o", " 1 0 root S 2004 1% 0% init [5]"] +[1127.068396, "o", "\r\n"] +[1127.06884, "o", " 42 2 root SWN 0 0% 0% [kmemleak]"] +[1127.068996, "o", "\r\n"] +[1127.074239, "o", " 9 2 root SW 0 0% 0% [ksoftirqd/0]"] +[1127.074463, "o", "\r\n"] +[1127.074935, "o", " 39 2 root IW 0 0% 0% [kworker/0:2-eve]"] +[1127.07527, "o", "\r\n"] +[1127.07566, "o", " 13 2 root SW 0 0% 0% [kdevtmpfs]"] +[1127.075843, "o", "\r\n"] +[1127.076346, "o", " 
38 2 root IW 0 0% 0% [kworker/u2:1-ev]"] +[1127.076463, "o", "\r\n"] +[1127.07694, "o", " 34 2 root IW< 0 0% 0% [kworker/0:1H-kb]"] +[1127.077152, "o", "\r\n"] +[1127.081642, "o", " 43 2 root SW 0 0% 0% [jbd2/vda-8]\r"] +[1132.126436, "o", "\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[KMem: 30772K used, 209908K free, 172K shrd, 368K buff, 4344K cached"] +[1132.126724, "o", "\r\n"] +[1132.126816, "o", "CPU: 97% usr 2% sys 0% nic 0% idle 0% io 0% irq 0% sirq"] +[1132.12694, "o", "\r\n"] +[1132.127521, "o", "Load average: 0.34 0.66 0.46 2/38 348"] +[1132.127666, "o", "\r\n"] +[1132.128837, "o", "\u001b[0m\u001b(B\u001b[7m PID PPID USER STAT VSZ %VSZ %CPU COMMAND"] +[1132.129009, "o", "\r\n"] +[1132.134166, "o", "\u001b[0m\u001b(B 347 345 root R 2972 1% 99% -sh"] +[1132.134454, "o", "\r\n"] +[1132.135025, "o", " 348 345 root R 2972 1% 1% top"] +[1132.135336, "o", "\r\n"] +[1132.135943, "o", " 345 1 root S 2972 1% 0% -sh"] +[1132.13614, "o", "\r\n"] +[1132.13659, "o", " 198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages"] +[1132.136798, "o", "\r\n"] +[1132.141677, "o", " 201 1 root S 2828 1% 0% /sbin/klogd -n"] +[1132.141859, "o", "\r\n"] +[1132.142517, "o", " 207 1 root S 2828 1% 0% /sbin/getty 38400 tty1"] +[1132.142783, "o", "\r\n"] +[1132.143294, "o", " 209 1 root S 2828 1% 0% /sbin/getty 38400 tty2"] +[1132.14353, "o", "\r\n"] +[1132.143957, "o", " 210 1 root S 2828 1% 0% /sbin/getty 38400 tty3"] +[1132.144265, "o", "\r\n"] +[1132.1447, "o", " 211 1 root S 2828 1% 0% /sbin/getty 38400 tty4"] +[1132.14492, "o", "\r\n"] +[1132.150219, "o", " 212 1 root S 2828 1% 0% /sbin/getty 38400 tty5"] +[1132.150349, "o", "\r\n"] +[1132.150878, "o", " 187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p\u001b[15;80H"] +[1132.151051, "o", "\r\n"] 
+[1132.151431, "o", " 1 0 root S 2004 1% 0% init [5]"] +[1132.151581, "o", "\r\n"] +[1132.152171, "o", " 42 2 root SWN 0 0% 0% [kmemleak]"] +[1132.152327, "o", "\r\n"] +[1132.152911, "o", " 9 2 root SW 0 0% 0% [ksoftirqd/0]"] +[1132.153024, "o", "\r\n"] +[1132.157768, "o", " 10 2 root IW 0 0% 0% [rcu_sched]"] +[1132.157995, "o", "\r\n"] +[1132.158368, "o", " 39 2 root IW 0 0% 0% [kworker/0:2-eve]"] +[1132.158645, "o", "\r\n"] +[1132.159109, "o", " 13 2 root SW 0 0% 0% [kdevtmpfs]"] +[1132.159346, "o", "\r\n"] +[1132.159794, "o", " 38 2 root IW 0 0% 0% [kworker/u2:1-ev]"] +[1132.159838, "o", "\r\n"] +[1132.160264, "o", " 34 2 root IW< 0 0% 0% [kworker/0:1H-kb]"] +[1132.16044, "o", "\r\n"] +[1132.160661, "o", " 43 2 root SW 0 0% 0% [jbd2/vda-8]\r"] +[1133.586006, "o", "\n\u001b[23;80H \u001b[24;1H"] +[1133.589361, "o", "root@qemux86:~# "] +[1135.56963, "o", "#"] +[1135.761792, "o", " "] +[1136.825768, "o", "o"] +[1136.897392, "o", "k"] +[1137.009583, "o", " "] +[1137.185503, "o", "t"] +[1137.313332, "o", "h"] +[1137.433542, "o", "e"] +[1137.538696, "o", " "] +[1137.849713, "o", "C"] +[1137.905725, "o", "P"] +[1138.081863, "o", "U"] +[1138.649702, "o", " "] +[1138.833525, "o", "i"] +[1138.937694, "o", "s"] +[1139.001327, "o", " "] +[1139.201449, "o", "m"] +[1139.26509, "o", "o"] +[1139.417946, "o", "s"] +[1139.697189, "o", "t"] +[1139.785468, "o", "ly"] +[1140.337652, "o", " "] +[1140.521323, "o", "ru"] +[1140.577507, "o", "nn"] +[1140.785505, "o", "in"] +[1140.865534, "o", "g"] +[1141.073726, "o", " "] +[1142.521937, "o", "u"] +[1142.625315, "o", "s"] +[1142.681312, "o", "e"] +[1142.746069, "o", "r"] +[1142.89747, "o", "s"] +[1143.000997, "o", "p"] +[1144.048991, "o", "a"] +[1144.46552, "o", "c"] +[1144.569368, "o", "e"] +[1144.713469, "o", " "] +[1145.721142, "o", "c"] +[1145.849051, "o", "o"] +[1145.937576, "o", "d"] +[1146.129514, "o", "e"] +[1149.698062, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[1149.699006, "o", "root@qemux86:~# "] +[1150.777065, "o", "#"] 
+[1151.009906, "o", " "] +[1151.217704, "o", "l"] +[1151.289227, "o", "e"] +[1151.48157, "o", "t"] +[1151.689748, "o", "s"] +[1151.817419, "o", " "] +[1152.13751, "o", "g"] +[1152.209163, "o", "o"] +[1152.345232, "o", " "] +[1152.921542, "o", "b"] +[1152.985042, "o", "a"] +[1153.12187, "o", "c"] +[1153.225154, "o", "k"] +[1153.368917, "o", " "] +[1153.521092, "o", "t"] +[1153.59312, "o", "o"] +[1153.673022, "o", " "] +[1153.75302, "o", "t"] +[1153.881685, "o", "h"] +[1153.929562, "o", "e"] +[1154.025017, "o", " "] +[1154.154048, "o", "d"] +[1154.193966, "o", "e"] +[1154.296968, "o", "b"] +[1154.353443, "o", "u"] +[1154.513136, "o", "g"] +[1154.665637, "o", "g"] +[1154.74519, "o", "e"] +[1154.841208, "o", "r"] +[1157.898001, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[1157.898871, "o", "root@qemux86:~# "] +[1159.312239, "o", "\u001b[0m\u001b(B\u001b[7m\r\u001b[K\u001b[?12l\u001b[?25h\u001b[?25lCTRL-A Z for help | 115200 8N1 | NOR | Minicom 2.7.1 | VT102 | Offline | al.pts\u001b[?12l\u001b[?25h\u001b[24;17H"] +[1159.680333, "o", "\u001b[8;30H\u001b[?25l\u001b[0m\u001b(B\u001b(0lqqqqqqqqqqqqqqqqqqqqqqk\u001b[9;30Hx\u001b[0m\u001b(B Leave Minicom? 
\u001b[0m\u001b(B\u001b(0x\u001b[10;30Hx\u001b[0m\u001b(B No \u001b[0m\u001b(B\u001b(0x\u001b[11;30Hmqqqqqqqqqqqqqqqqqqqqqqj\u001b[10;51H\u001b[?25l\u001b[10;33H\u001b[0m\u001b(B\u001b[7m Yes "] +[1160.319825, "o", "\u001b[?12l\u001b[?25h\u001b[8;1H\u001b[0m\u001b(B 209 1 root S 2828 1% 0% /sbin/getty\u001b[9;1H 210 1 root S 2828 1% 0% /sbin/getty\u001b[10;1H 211 1 root S 2828 1% 0% /sbin/getty\u001b[11;1H 212 1 root S 2828 1% 0% /sbin/getty\u001b[24;17H\u001b[0m\u001b(B\u001b[7m\u001b[?12l\u001b[?25h"] +[1160.320008, "o", "\u001b[?12l\u001b[?25h\u001b[0m\u001b(B\u001b[H\u001b[2J\u001b[?12l\u001b[?25h\u001b[?1l\u001b>\u001b[!p\u001b[?3;4l\u001b[4l\u001b>"] +[1160.32013, "o", "$ "] +[1161.688843, "o", "m"] +[1161.800345, "o", "a"] +[1161.856099, "o", "k"] +[1162.008302, "o", "e"] +[1162.072387, "o", " "] +[1162.328258, "o", "g"] +[1162.408339, "o", "d"] +[1162.488446, "o", "b"] +[1162.904659, "o", "\r\n"] +[1162.910043, "o", "gdb -ex \"target remote localhost:1234\" /linux/vmlinux\r\n"] +[1162.944541, "o", "\u001b[35;1m\u001b[35;1mGNU gdb \u001b[m\u001b[35;1m(Ubuntu 9.2-0ubuntu1~20.04) \u001b[m\u001b[35;1m9.2\u001b[m\u001b[35;1m\r\n\u001b[m\u001b[mCopyright (C) 2020 Free Software Foundation, Inc.\r\nLicense GPLv3+: GNU GPL version 3 or later \r\nThis is free software: you are free to change and redistribute it.\r\nThere is NO WARRANTY, to the extent permitted by law.\r\nType \"show copying\" and \"show warranty\" for details.\r\nThis GDB was configured as \"x86_64-linux-gnu\".\r\nType \"show configuration\" for configuration details.\r\nFor bug reporting instructions, please see:\r\n.\r\nFind the GDB manual and other documentation resources online at:\r\n .\r\n\r\n"] +[1162.944647, "o", "For help, type \"help\".\r\nType \"apropos word\" to search for commands related to \"word\"...\r\n"] +[1162.945021, "o", "Reading symbols from \u001b[32m/linux/vmlinux\u001b[m...\r\n"] +[1163.591264, "o", "Remote debugging using localhost:1234\r\n"] +[1163.599772, "o", 
"\u001b[34m0x448ac101\u001b[m in \u001b[33m??\u001b[m ()\r\n"] +[1163.600288, "o", "(gdb) "] +[1167.808759, "o", "#"] +[1168.054699, "o", " "] +[1174.704625, "o", "u"] +[1174.784712, "o", "s"] +[1174.864648, "o", "e"] +[1174.944398, "o", "r"] +[1175.057207, "o", "s"] +[1175.176743, "o", "p"] +[1175.225215, "o", "a"] +[1175.345433, "o", "c"] +[1175.433465, "o", "e"] +[1175.601394, "o", " "] +[1175.744383, "o", "a"] +[1175.920469, "o", "d"] +[1176.072352, "o", "d"] +[1176.321026, "o", "r"] +[1176.656814, "o", "e"] +[1176.832588, "o", "s"] +[1176.960611, "o", "s"] +[1177.072884, "o", ","] +[1177.168436, "o", " "] +[1177.329116, "o", "g"] +[1177.404147, "o", "o"] +[1177.448381, "o", "o"] +[1177.598144, "o", "d"] +[1177.662444, "o", "!"] +[1180.24054, "o", "\r\n"] +[1180.24062, "o", "(gdb) "] +[1181.720922, "o", "b"] +[1181.80043, "o", "r"] +[1181.86486, "o", "e"] +[1181.929251, "o", "a"] +[1182.024706, "o", "k"] +[1182.105515, "o", " "] +[1195.76893, "o", "h"] +[1195.849596, "o", "a"] +[1195.960904, "o", "n"] +[1196.088828, "o", "d"] +[1196.176629, "o", "l"] +[1196.496808, "o", "e"] +[1196.672653, "o", "_"] +[1197.256752, "o", "e"] +[1197.416356, "o", "x"] +[1197.656574, "o", "c"] +[1197.720445, "o", "e"] +[1197.793172, "o", "p"] +[1197.945795, "o", "t"] +[1198.018687, "o", "i"] +[1198.052227, "o", "o"] +[1198.226222, "o", "n"] +[1198.528674, "o", "\r\n"] +[1198.579682, "o", "Breakpoint 1 at \u001b[34m0xc15dea1f\u001b[m: file \u001b[32march/x86/entry/entry_32.S\u001b[m, line 1154.\r\n"] +[1198.57999, "o", "(gdb) "] +[1200.261062, "o", "c"] +[1200.433433, "o", "\r\nContinuing.\r\n"] +[1200.436772, "o", "\r\n"] +[1200.437049, "o", "Breakpoint 1, \u001b[33mhandle_exception\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1154\r\n1154\t\tSAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1\r\n"] +[1200.437198, "o", "(gdb) "] +[1228.112751, "o", "b"] +[1228.201197, "o", "t"] +[1228.329203, "o", "\r\n"] +[1228.331815, "o", "#0 \u001b[33mhandle_exception\u001b[m () at 
\u001b[32march/x86/entry/entry_32.S\u001b[m:1154\r\n"] +[1228.332762, "o", "#1 \u001b[34m0xc15d3840\u001b[m in \u001b[33m??\u001b[m () at \u001b[32march/x86/kernel/setup.c\u001b[m:755\r\nBacktrace stopped: previous frame inner to this frame (corrupt stack?)\r\n(gdb) "] +[1239.893902, "o", "#"] +[1240.072314, "o", " "] +[1240.184169, "o", "l"] +[1240.272133, "o", "e"] +[1240.416193, "o", "t"] +[1240.603099, "o", "s"] +[1240.678107, "o", " "] +[1243.392803, "o", "l"] +[1243.592484, "o", "o"] +[1243.744272, "o", "o"] +[1243.856711, "o", "k"] +[1244.075404, "o", " "] +[1244.232286, "o", "a"] +[1244.712632, "o", "t"] +[1244.792148, "o", " "] +[1244.904252, "o", "t"] +[1244.992334, "o", "h"] +[1245.091434, "o", "e"] +[1245.173492, "o", " "] +[1245.336103, "o", "s"] +[1245.497425, "o", "t"] +[1245.54492, "o", "a"] +[1245.720423, "o", "c"] +[1245.784262, "o", "k"] +[1245.872308, "o", " "] +[1245.984224, "o", "a"] +[1246.272808, "o", "a"] +[1246.688388, "o", "\b\u001b[K"] +[1246.744332, "o", "g"] +[1246.816332, "o", "a"] +[1246.968488, "o", "i"] +[1247.040167, "o", "n"] +[1247.809286, "o", "\r\n"] +[1247.809419, "o", "(gdb) "] +[1248.16016, "o", "#"] +[1248.32863, "o", " "] +[1248.496948, "o", "t"] +[1248.608703, "o", "h"] +[1248.751015, "o", "e"] +[1249.152273, "o", " "] +[1249.304321, "o", "f"] +[1249.401613, "o", "i"] +[1249.560984, "o", "r"] +[1249.933296, "o", "s"] +[1250.168173, "o", "t"] +[1250.449313, "o", " "] +[1250.767425, "o", "i"] +[1250.969452, "o", "t"] +[1251.048783, "o", "e"] +[1251.160335, "o", "m"] +[1251.328476, "o", " "] +[1251.528696, "o", "o"] +[1251.744914, "o", "n"] +[1251.845411, "o", " "] +[1251.96225, "o", "t"] +[1252.081386, "o", "e"] +[1252.096296, "o", "h"] +[1252.25686, "o", " "] +[1252.376629, "o", "s"] +[1252.504275, "o", "t"] +[1252.560027, "o", "a"] +[1252.824826, "o", "\b\u001b[K"] +[1252.968895, "o", "\b\u001b[K"] +[1253.112952, "o", "\b\u001b[K"] +[1253.248719, "o", "\b\u001b[K"] +[1253.368406, "o", "\b\u001b[K"] +[1253.576566, "o", 
"\b\u001b[K"] +[1253.769049, "o", "h"] +[1253.848377, "o", "e"] +[1253.928207, "o", " "] +[1254.07566, "o", "s"] +[1254.224132, "o", "t"] +[1254.256299, "o", "a"] +[1254.46369, "o", "c"] +[1254.528393, "o", "k"] +[1254.62412, "o", " "] +[1254.792238, "o", "s"] +[1254.896183, "o", "h"] +[1254.976616, "o", "o"] +[1255.095813, "o", "u"] +[1255.233173, "o", "l"] +[1255.328849, "o", "d"] +[1255.422248, "o", " "] +[1255.640996, "o", "b"] +[1255.712744, "o", "e"] +[1255.832174, "o", " "] +[1256.211236, "o", "t"] +[1256.280666, "o", "h"] +[1256.400446, "o", "e"] +[1256.487918, "o", " "] +[1258.854607, "o", "t"] +[1258.941545, "o", "i"] +[1259.024791, "o", "m"] +[1259.04853, "o", "e"] +[1259.121157, "o", "r"] +[1259.224098, "o", " "] +[1259.360266, "o", "h"] +[1259.408186, "o", "a"] +[1259.552178, "o", "n"] +[1259.616451, "o", "d"] +[1259.762455, "o", "l"] +[1259.841476, "o", "e"] +[1259.913474, "o", "r"] +[1260.720512, "o", "\r\n(gdb) "] +[1261.704102, "o", "p"] +[1261.847725, "o", "r"] +[1261.936837, "o", "i"] +[1261.984576, "o", "n"] +[1262.066128, "o", "t"] +[1262.145208, "o", " "] +[1262.424767, "o", "("] +[1262.624087, "o", "v"] +[1262.737455, "o", "o"] +[1263.000827, "o", "d"] +[1263.399755, "o", "\b\u001b[K"] +[1263.544234, "o", "i"] +[1263.648825, "o", "d"] +[1263.792709, "o", " "] +[1264.222987, "o", "*"] +[1264.367238, "o", ")"] +[1264.648392, "o", "("] +[1267.639225, "o", "*"] +[1267.776229, "o", "("] +[1269.23218, "o", "u"] +[1269.400337, "o", "i"] +[1269.448756, "o", "n"] +[1269.579068, "o", "t"] +[1269.800118, "o", "3"] +[1269.872245, "o", "2"] +[1270.048002, "o", "_"] +[1270.200422, "o", "t"] +[1270.579908, "o", "*"] +[1270.696635, "o", ")"] +[1271.552099, "o", "$"] +[1271.941288, "o", "e"] +[1272.232936, "o", "p"] +[1272.416525, "o", "s"] +[1273.783112, "o", "\b\u001b[K"] +[1273.87923, "o", "\b\u001b[K"] +[1273.942963, "o", "s"] +[1274.127877, "o", "p"] +[1274.688937, "o", ")"] +[1277.832273, "o", "\r\n"] +[1277.834178, "o", "$1 = (void *) 
\u001b[34m0xc15d3840\u001b[m <\u001b[33msysvec_apic_timer_interrupt\u001b[m>\r\n(gdb) "] +[1280.216526, "o", "#"] +[1280.440317, "o", " "] +[1280.905338, "o", "n"] +[1281.006222, "o", "e"] +[1281.096051, "o", "x"] +[1281.352079, "o", "t"] +[1281.45623, "o", " "] +[1281.512205, "o", "w"] +[1281.592294, "o", "e"] +[1281.728646, "o", " "] +[1281.887957, "o", "s"] +[1282.014283, "o", "h"] +[1282.096593, "o", "o"] +[1282.208553, "o", "u"] +[1282.336615, "o", "l"] +[1282.448312, "o", "d"] +[1282.49654, "o", " "] +[1282.784631, "o", "h"] +[1282.889574, "o", "a"] +[1283.030069, "o", "v"] +[1283.096451, "o", "e"] +[1283.14104, "o", " "] +[1283.336121, "o", " "] +[1283.640365, "o", "0"] +[1284.105053, "o", "\b\u001b[K"] +[1284.232359, "o", "\b\u001b[K"] +[1284.456187, "o", "0"] +[1286.217444, "o", ","] +[1286.428228, "o", " "] +[1286.634031, "o", "t"] +[1286.725784, "o", "h"] +[1286.864535, "o", "e"] +[1286.968074, "o", "n"] +[1287.047946, "o", " "] +[1287.143945, "o", "t"] +[1287.24016, "o", "h"] +[1287.312374, "o", "e"] +[1287.434149, "o", " "] +[1287.648797, "o", "o"] +[1287.853749, "o", "l"] +[1287.93696, "o", "d"] +[1288.056131, "o", " "] +[1288.917063, "o", "E"] +[1289.135849, "o", "I"] +[1289.231925, "o", "P"] +[1291.056219, "o", "\r\n(gdb) "] +[1292.1377, "o", "p"] +[1292.411602, "o", "r"] +[1292.528097, "o", "i"] +[1292.599943, "o", "n"] +[1293.344827, "o", "\b\b\b\b# next we should have 0, then the old EIP"] +[1294.000313, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[9Pprint (void *)(*(uint32_t*)$esp)"] +[1294.377399, "o", "\b"] +[1294.664387, "o", "\b"] +[1294.842701, "o", "\b"] +[1294.944346, "o", "\b"] +[1295.296547, "o", "\b"] +[1295.886427, "o", "($esp)\b\b\b\b\b"] +[1296.416543, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[1296.608222, "o", "\b"] +[1297.026087, "o", "+)\b\b\u001b[1P)\b"] +[1297.560139, "o", "+)\b"] +[1297.916844, "o", "4)\b"] +[1298.397975, "o", "\u001b[C)\b"] +[1299.632357, "o", "\r\n"] +[1299.633204, "o", "$2 = (void 
*) \u001b[34m0x0\u001b[m\r\n(gdb) "] +[1300.373248, "o", "print (void *)(*(uint32_t*)($esp+4))"] +[1300.688192, "o", "\b"] +[1300.856538, "o", "\b"] +[1301.000194, "o", "\b"] +[1301.168188, "o", "\u001b[1P))\b\b"] +[1301.952667, "o", "8))\b\b"] +[1302.457015, "o", "\r\n"] +[1302.458153, "o", "$3 = (void *) \u001b[34m0x448abff0\u001b[m\r\n(gdb) "] +[1303.41604, "o", "#"] +[1303.712934, "o", " "] +[1304.968705, "o", "t"] +[1305.080618, "o", "h"] +[1305.112913, "o", "i"] +[1305.320332, "o", "s"] +[1305.450583, "o", " "] +[1305.784044, "o", "c"] +[1305.879818, "o", "e"] +[1306.01577, "o", "r"] +[1306.215986, "o", "t"] +[1306.408455, "o", "a"] +[1306.509271, "o", "i"] +[1306.568281, "o", "n"] +[1306.736051, "o", "l"] +[1306.968107, "o", "y"] +[1307.032157, "o", " "] +[1307.240792, "o", "l"] +[1307.392111, "o", "o"] +[1307.523872, "o", "o"] +[1307.56085, "o", "k"] +[1307.704084, "o", "s"] +[1307.760884, "o", " "] +[1308.464805, "o", "l"] +[1308.632045, "o", "i"] +[1308.784287, "o", "k"] +[1308.847688, "o", "e"] +[1308.920087, "o", " "] +[1309.088096, "o", "a"] +[1309.16807, "o", " "] +[1310.540839, "o", "u"] +[1310.625164, "o", "s"] +[1310.681276, "o", "e"] +[1310.7603, "o", "r"] +[1310.887811, "o", "s"] +[1310.960368, "o", "p"] +[1311.072017, "o", "a"] +[1311.145256, "o", "c"] +[1311.223973, "o", "e"] +[1311.368353, "o", " "] +[1311.457182, "o", "a"] +[1311.601448, "o", "d"] +[1311.769504, "o", "d"] +[1311.968497, "o", "r"] +[1312.000587, "o", "e"] +[1312.160017, "o", "s"] +[1312.296067, "o", "s"] +[1312.672663, "o", "\r\n(gdb) "] +[1313.104304, "o", "#"] +[1313.247936, "o", " "] +[1313.591981, "o", "s"] +[1313.663257, "o", "o"] +[1313.7933, "o", " "] +[1314.376222, "o", "a"] +[1314.488137, "o", " "] +[1315.711982, "o", "p"] +[1315.873232, "o", "r"] +[1315.976308, "o", "i"] +[1316.097279, "o", "v"] +[1316.192412, "o", "i"] +[1317.808159, "o", "l"] +[1317.888621, "o", "e"] +[1318.806197, "o", "g"] +[1318.880036, "o", "e"] +[1319.007839, "o", " "] +[1320.044197, "o", "t"] 
+[1320.232041, "o", "r"] +[1320.37594, "o", "a"] +[1320.790313, "o", "n"] +[1320.960849, "o", "s"] +[1321.043584, "o", "i"] +[1321.228877, "o", "t"] +[1321.2893, "o", "i"] +[1321.328086, "o", "o"] +[1321.488163, "o", "n"] +[1321.560022, "o", " "] +[1321.743979, "o", "h"] +[1321.784015, "o", "a"] +[1321.883377, "o", "s"] +[1321.978322, "o", " "] +[1322.432097, "o", "h"] +[1322.504039, "o", "a"] +[1322.64008, "o", "p"] +[1322.751806, "o", "p"] +[1322.840014, "o", "e"] +[1322.952267, "o", "n"] +[1323.065126, "o", "e"] +[1323.222808, "o", "d"] +[1324.320102, "o", "\r\n(gdb) "] +[1328.287918, "o", "#"] +[1328.5767, "o", " "] +[1328.728319, "o", "t"] +[1328.807873, "o", "h"] +[1328.922227, "o", "e"] +[1329.031848, "o", " "] +[1329.201101, "o", "n"] +[1329.255849, "o", "e"] +[1329.3283, "o", "x"] +[1329.632289, "o", "t"] +[1329.720295, "o", " "] +[1329.85636, "o", "t"] +[1330.070787, "o", "w"] +[1330.180591, "o", "o"] +[1330.327382, "o", " "] +[1331.600174, "o", "v"] +[1331.688922, "o", "a"] +[1331.800273, "o", "l"] +[1331.967863, "o", "u"] +[1332.879624, "o", "e"] +[1333.008092, "o", "s"] +[1333.129677, "o", " "] +[1333.260857, "o", "o"] +[1333.408013, "o", "n"] +[1333.491288, "o", " "] +[1333.585346, "o", "t"] +[1333.67251, "o", "h"] +[1333.744198, "o", "e"] +[1333.824197, "o", " "] +[1333.927752, "o", "s"] +[1334.023852, "o", "t"] +[1334.088274, "o", "a"] +[1334.278186, "o", "c"] +[1334.312208, "o", "k"] +[1334.4004, "o", " "] +[1334.479627, "o", "s"] +[1334.569724, "o", "h"] +[1334.653599, "o", "o"] +[1334.761086, "o", "u"] +[1334.879876, "o", "l"] +[1334.967955, "o", "d"] +[1335.039806, "o", " "] +[1337.127967, "o", "b"] +[1337.568187, "o", "e"] +[1337.733548, "o", " "] +[1337.829755, "o", "t"] +[1337.920319, "o", "h"] +[1337.983981, "o", "e"] +[1338.096372, "o", " "] +[1338.472148, "o", "o"] +[1338.656814, "o", "l"] +[1338.768352, "o", "d"] +[1338.916423, "o", " "] +[1339.202871, "o", "C"] +[1339.327983, "o", "S"] +[1339.440216, "o", " "] +[1339.57566, "o", "a"] 
+[1339.70382, "o", "n"] +[1339.800022, "o", "d"] +[1339.878404, "o", " "] +[1342.408345, "o", "o"] +[1342.521478, "o", "d"] +[1342.569261, "o", "l"] +[1342.687708, "o", " "] +[1342.896122, "o", "E"] +[1343.07254, "o", "F"] +[1343.160126, "o", "L"] +[1343.252373, "o", "A"] +[1346.581722, "o", "G"] +[1346.645084, "o", "S"] +[1346.984099, "o", "\r\n(gdb) "] +[1349.368274, "o", "# the next two values on the stack should be the old CS and old EFLAGS"] +[1349.503807, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[30Pso a privilege transition has happened"] +[1349.640129, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cthis certainly looks like a userspace address"] +[1350.086523, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[11Pprint (void *)(*(uint32_t*)($esp+8))"] +[1350.592191, "o", "\b"] +[1350.744442, "o", "\b"] +[1350.871908, "o", "\b"] +[1351.056417, "o", "\u001b[1P))\b\b"] +[1351.150322, "o", "1))\b\b"] +[1351.246318, "o", "2))\b\b"] +[1351.576371, "o", "\r\n"] +[1351.577107, "o", "$4 = (void *) \u001b[34m0x73\u001b[m\r\n(gdb) "] +[1352.136702, "o", "print (void *)(*(uint32_t*)($esp+12))"] +[1352.288158, "o", "\b"] +[1352.424696, "o", "\b"] +[1352.593518, "o", "\b"] +[1353.016482, "o", "\u001b[1P))\b\b"] +[1353.136234, "o", "4))\b\b"] +[1360.824627, "o", "\r\n"] +[1360.825733, "o", "$5 = (void *) \u001b[34m0x2820000\u001b[m\r\n"] +[1360.825786, "o", "(gdb) "] +[1363.824749, "o", "#"] +[1363.925525, "o", " "] +[1364.090304, "o", "l"] +[1364.897579, "o", "\b\u001b[K"] +[1365.125951, "o", "y"] +[1365.207584, "o", "o"] +[1365.719749, "o", "\b\u001b[K"] +[1365.840063, "o", "\b\u001b[K"] +[1366.048291, "o", "n"] +[1366.112558, "o", "o"] +[1366.280917, "o", "t"] +[1366.405468, "o", "i"] +[1366.600048, "o", "c"] +[1366.688182, "o", "e"] +[1366.799955, "o", " "] +[1366.953361, "o", "t"] +[1367.071836, "o", "h"] +[1367.188867, "o", "a"] +[1367.287932, "o", "t"] +[1367.379658, "o", " "] +[1367.521389, 
"o", "t"] +[1367.607602, "o", "h"] +[1367.695693, "o", "e"] +[1367.815874, "o", " "] +[1373.74383, "o", "o"] +[1373.896155, "o", "l"] +[1374.000406, "o", "d"] +[1374.05883, "o", " "] +[1374.488016, "o", "C"] +[1374.560064, "o", "S"] +[1374.70417, "o", " "] +[1374.927765, "o", "i"] +[1375.048476, "o", "s"] +[1375.159989, "o", " "] +[1375.523958, "o", "d"] +[1375.672027, "o", "i"] +[1375.823809, "o", "f"] +[1376.352436, "o", "f"] +[1376.488975, "o", "e"] +[1376.653416, "o", "r"] +[1376.712524, "o", "e"] +[1376.847597, "o", "n"] +[1377.00774, "o", "t"] +[1377.337116, "o", " "] +[1380.176217, "o", "\r\n"] +[1380.17633, "o", "(gdb) "] +[1381.519619, "o", "p"] +[1381.727674, "o", "r"] +[1381.807962, "o", "i"] +[1381.847678, "o", "n"] +[1381.992661, "o", "t"] +[1382.213413, "o", " "] +[1383.487682, "o", "/"] +[1383.720038, "o", " "] +[1384.284125, "o", "$"] +[1384.785109, "o", "\b\u001b[K"] +[1384.896215, "o", "\b\u001b[K"] +[1384.992093, "o", "x"] +[1385.119757, "o", " "] +[1385.462147, "o", "$"] +[1385.975882, "o", "c"] +[1386.039807, "o", "s"] +[1387.288752, "o", "\r\n"] +[1387.288917, "o", "$6 = 0x60\r\n(gdb) "] +[1392.071906, "o", "#"] +[1392.183952, "o", " "] +[1392.530314, "o", "s"] +[1392.64009, "o", "o"] +[1392.760001, "o", " "] +[1392.935708, "o", "w"] +[1393.023798, "o", "e"] +[1393.143801, "o", " "] +[1394.480397, "o", "s"] +[1394.552322, "o", "h"] +[1394.616216, "o", "o"] +[1394.741538, "o", "u"] +[1394.8873, "o", "l"] +[1394.991717, "o", "d"] +[1395.119869, "o", " "] +[1395.764404, "o", "h"] +[1395.81421, "o", "a"] +[1395.96834, "o", "v"] +[1396.047828, "o", "e"] +[1396.087582, "o", " "] +[1396.24761, "o", "t"] +[1396.51192, "o", "w"] +[1396.608042, "o", "o"] +[1396.706285, "o", " "] +[1396.859207, "o", "e"] +[1396.952619, "o", "x"] +[1397.184817, "o", "t"] +[1397.343837, "o", "r"] +[1397.408411, "o", "a"] +[1397.599725, "o", " "] +[1400.600069, "o", "v"] +[1400.655484, "o", "a"] +[1400.767453, "o", "l"] +[1400.91153, "o", "u"] +[1400.983774, "o", "e"] 
+[1401.175892, "o", "s"] +[1401.208238, "o", " "] +[1401.319824, "o", "o"] +[1401.502175, "o", "n"] +[1401.582375, "o", " "] +[1401.63978, "o", "t"] +[1401.768209, "o", "h"] +[1401.823535, "o", "e"] +[1401.888547, "o", " "] +[1402.096021, "o", "s"] +[1402.263991, "o", "t"] +[1402.335885, "o", "a"] +[1403.287574, "o", "c"] +[1403.423212, "o", "k"] +[1405.773719, "o", ":"] +[1405.903731, "o", " "] +[1406.199721, "o", "o"] +[1406.368541, "o", "l"] +[1406.416058, "o", "d"] +[1406.503703, "o", " "] +[1406.701576, "o", "E"] +[1406.904167, "o", "S"] +[1407.007965, "o", "P"] +[1407.240691, "o", " "] +[1407.39138, "o", "a"] +[1407.527755, "o", "n"] +[1407.608416, "o", "d"] +[1407.695859, "o", " "] +[1407.807966, "o", "o"] +[1407.976125, "o", "l"] +[1408.048128, "o", "d"] +[1408.141001, "o", " "] +[1408.353815, "o", "S"] +[1408.495468, "o", "S"] +[1410.367923, "o", "\r\n(gdb) "] +[1969.053941, "o", "# we should have two extra values on stack: old ESP and old SS"] +[1969.262191, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cprint /x $cs\u001b[K"] +[1969.635988, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C# notice that CS is different "] +[1970.061803, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cprint (void *)(*(uint32_t*)($esp+16))"] +[1970.590698, "o", "\b"] +[1970.742563, "o", "\b"] +[1970.877824, "o", "\b"] +[1971.518067, "o", "\b"] +[1971.789987, "o", "\u001b[1P6))\b\b\b"] +[1971.925709, "o", "\u001b[1P))\b\b"] +[1972.062369, "o", "2))\b\b"] +[1972.414001, "o", "0))\b\b"] +[1975.022556, "o", "\r\n"] +[1975.023378, "o", "$10 = (void *) \u001b[34m0xbfe27990\u001b[m\r\n(gdb) "] +[1976.069818, "o", "print (void *)(*(uint32_t*)($esp+20))"] +[1976.403385, "o", "\b"] +[1976.549892, "o", "\b"] +[1976.709579, "o", "\b"] +[1977.038119, "o", "\b"] +[1979.18221, "o", "\u001b[1P0))\b\b\b"] +[1979.334363, "o", "\u001b[1P))\b\b"] +[1979.446076, "o", "2))\b\b"] +[1979.598023, "o", "4))\b\b"] +[1981.981611, "o", "\r\n"] +[1981.982297, "o", "$11 = (void *) 
\u001b[34m0x7b\u001b[m\r\n(gdb) "] +[1987.957905, "o", "#"] +[1988.1735, "o", " "] +[1988.405662, "o", "n"] +[1988.453735, "o", "o"] +[1988.565807, "o", "t"] +[1988.725502, "o", "i"] +[1988.858572, "o", "c"] +[1988.926557, "o", "e"] +[1989.043672, "o", " "] +[1989.14232, "o", "t"] +[1989.261751, "o", "h"] +[1989.301689, "o", "a"] +[1989.405339, "o", "t"] +[1989.541661, "o", " "] +[1992.358583, "o", "S"] +[1992.501591, "o", "S"] +[1992.677657, "o", " "] +[1992.982405, "o", "i"] +[1993.088714, "o", "s"] +[1993.267478, "o", " "] +[1993.477794, "o", "a"] +[1993.582886, "o", "l"] +[1994.310019, "o", "s"] +[1994.421693, "o", "o"] +[1994.501651, "o", " "] +[1997.285443, "o", "d"] +[1997.421795, "o", "i"] +[1997.533942, "o", "f"] +[1997.678316, "o", "f"] +[1997.8058, "o", "e"] +[1997.965947, "o", "r"] +[1998.085951, "o", "e"] +[1998.357808, "o", "n"] +[1998.494273, "o", "t"] +[1998.651186, "o", " "] +[1998.827228, "o", "t"] +[1998.967293, "o", "h"] +[1999.030052, "o", "en"] +[1999.197533, "o", " "] +[1999.55771, "o", "t"] +[1999.637997, "o", "h"] +[1999.741665, "o", "e"] +[1999.849221, "o", " "] +[2000.044744, "o", "c"] +[2000.117566, "o", "u"] +[2000.278172, "o", "r"] +[2000.413666, "o", "r"] +[2000.493754, "o", "e"] +[2000.557567, "o", "n"] +[2000.715842, "o", "t"] +[2000.813749, "o", " "] +[2001.213728, "o", "s"] +[2001.357872, "o", "s"] +[2002.046006, "o", "\r\n(gdb) "] +[2002.357919, "o", "p"] +[2002.549923, "o", "r"] +[2002.613931, "o", "i"] +[2002.685486, "o", "n"] +[2002.782565, "o", "t"] +[2002.885912, "o", " "] +[2003.285786, "o", "/"] +[2003.517441, "o", "x"] +[2003.597579, "o", " "] +[2003.984994, "o", "$"] +[2005.414711, "o", "s"] +[2005.55787, "o", "s"] +[2006.605906, "o", "\r\n"] +[2006.606043, "o", "$12 = 0x68\r\n(gdb) "] +[2012.613496, "o", "#"] +[2012.765597, "o", " "] +[2012.869808, "o", "a"] +[2012.981679, "o", "n"] +[2013.045602, "o", "d"] +[2013.157599, "o", " "] +[2013.359393, "o", "t"] +[2013.420279, "o", "o"] +[2013.462073, "o", " "] +[2013.693767, 
"o", "c"] +[2013.76528, "o", "o"] +[2013.861277, "o", "n"] +[2013.910528, "o", "f"] +[2014.01357, "o", "i"] +[2014.262175, "o", "r"] +[2014.381666, "o", "m"] +[2014.558588, "o", " "] +[2014.693456, "o", "t"] +[2014.805673, "o", "h"] +[2014.93382, "o", "a"] +[2014.97489, "o", "t"] +[2015.946969, "o", " "] +[2020.233739, "o", " 0xbfe27990"] +[2020.77356, "o", " "] +[2021.302126, "o", "i"] +[2021.43803, "o", "s"] +[2021.534611, "o", " "] +[2021.725567, "o", "a"] +[2022.560398, "o", " "] +[2022.830342, "o", "u"] +[2022.933333, "o", "s"] +[2023.038594, "o", "e"] +[2023.101613, "o", "r"] +[2023.714446, "o", "s"] +[2023.826778, "o", "p"] +[2023.909568, "o", "a"] +[2023.965588, "o", "c"] +[2024.062612, "o", "e"] +[2024.141261, "o", " "] +[2024.285484, "o", "s"] +[2024.39776, "o", "t"] +[2024.454087, "o", "a"] +[2024.666852, "o", "c"] +[2024.708226, "o", "k"] +[2024.98393, "o", " "] +[2026.886057, "o", "\r\n(gdb) "] +[2027.325884, "o", "#"] +[2027.71743, "o", " "] +[2027.981779, "o", "l"] +[2028.063253, "o", "e"] +[2028.205611, "o", "t"] +[2028.317878, "o", " "] +[2028.502039, "o", "j"] +[2028.645923, "o", "u"] +[2028.813377, "o", "m"] +[2028.893359, "o", "p"] +[2028.990456, "o", " "] +[2029.172262, "o", "b"] +[2029.245318, "o", "a"] +[2029.334424, "o", "c"] +[2029.42394, "o", "k"] +[2029.510273, "o", " "] +[2029.701962, "o", "t"] +[2029.74145, "o", "o"] +[2029.861488, "o", " "] +[2029.957501, "o", "t"] +[2030.101868, "o", "h"] +[2030.166747, "o", "e"] +[2030.285553, "o", " "] +[2031.173547, "o", "t"] +[2031.253912, "o", "e"] +[2031.334197, "o", "r"] +[2031.421792, "o", "m"] +[2031.501571, "o", "i"] +[2031.621782, "o", "n"] +[2031.72392, "o", "a"] +[2031.829432, "o", "l"] +[2032.245715, "o", "\r\n(gdb) "] +[2033.431185, "o", "quit\r\n"] +[2033.431243, "o", "A debugging session is active.\r\n\r\n\tInferior 1 [process 1] will be detached.\r\n\r\nQuit anyway? 
(y or n) "] +[2034.445451, "o", "y"] +[2034.541829, "o", "\r\nDetaching from program: /linux/vmlinux, process 1\r\n"] +[2034.542671, "o", "Ending remote debugging.\r\n"] +[2034.542784, "o", "[Inferior 1 (process 1) detached]\r\n"] +[2034.548935, "o", "make: *** [qemu/Makefile:54: gdb] Interrupt\r\n"] +[2034.548997, "o", "\r\n$ "] +[2035.149297, "o", "m"] +[2035.309489, "o", "i"] +[2035.429356, "o", "n"] +[2035.629299, "o", "c"] +[2036.205576, "o", "\b \b"] +[2036.373293, "o", "i"] +[2036.669373, "o", "c"] +[2036.82129, "o", "o"] +[2036.909198, "o", "m"] +[2037.126024, "o", "d"] +[2037.50137, "o", " "] +[2037.741686, "o", "\b \b"] +[2037.861471, "o", "\b \b"] +[2037.973342, "o", " "] +[2038.069309, "o", "-"] +[2039.956556, "o", "D"] +[2041.021427, "o", " "] +[2041.348903, "o", "s"] +[2041.429401, "o", "e"] +[2041.509259, "o", "r"] +[2041.573313, "o", "i"] +[2041.685198, "o", "a"] +[2041.757143, "o", "l"] +[2041.917485, "o", "."] +[2042.108869, "o", "p"] +[2042.253212, "o", "t"] +[2042.477348, "o", "s"] +[2043.621531, "o", "\r\n"] +[2043.622396, "o", "\u001b[!p\u001b[?3;4l\u001b[4l\u001b>\u001b[0m\u001b(B\u001b[?1h\u001b=\u001b[H\u001b[2J"] +[2043.622745, "o", "\u001b[?12l\u001b[?25h\nWelcome to minicom 2.7.1\r\n\nOPTIONS: I18n \r\nCompiled on Dec 23 2019, 02:06:26.\r\nPort serial.pts, 22:50:04\r\n\nPress CTRL-A Z for help on special keys\r\n\n"] +[2044.554156, "o", "\n"] +[2044.562897, "o", "root@qemux86:~# "] +[2047.455019, "o", "t"] +[2047.542613, "o", "o"] +[2047.606554, "o", "p"] +[2047.996772, "o", "\r\n"] +[2048.379869, "o", "\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[KMem: 31172K used, 209508K free, 172K shrd, 368K buff, 4344K cached"] +[2048.380063, "o", "\r\n"] +[2048.380683, "o", "CPU: 31% usr 68% sys 0% nic 0% idle 0% io 0% irq 0% sirq"] 
+[2048.380824, "o", "\r\n"] +[2048.381861, "o", "Load average: 1.12 0.81 0.53 4/40 386"] +[2048.38202, "o", "\r\n"] +[2048.456891, "o", "\u001b[0m\u001b(B\u001b[7m PID PPID USER STAT VSZ %VSZ %CPU COMMAND"] +[2048.457162, "o", "\r\n"] +[2048.457905, "o", "\u001b[0m\u001b(B 347 345 root R 3124 1% 26% -sh"] +[2048.45804, "o", "\r\n"] +[2048.472796, "o", " 1 0 root S 2004 1% 16% init [5]"] +[2048.473041, "o", "\r\n"] +[2048.473661, "o", " 374 345 root R 2828 1% 11% top"] +[2048.473774, "o", "\r\n"] +[2048.475752, "o", " 10 2 root IW 0 0% 5% [rcu_sched]"] +[2048.47595, "o", "\r\n"] +[2048.476546, "o", " 345 1 root S 2972 1% 0% -sh"] +[2048.476691, "o", "\r\n"] +[2048.477249, "o", " 198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages"] +[2048.477504, "o", "\r\n"] +[2048.477969, "o", " 201 1 root S 2828 1% 0% /sbin/klogd -n"] +[2048.47827, "o", "\r\n"] +[2048.483281, "o", " 187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p\u001b[12;80H"] +[2048.483527, "o", "\r\n"] +[2048.484037, "o", " 207 1 root S 2828 1% 0% /sbin/getty 38400 tty1"] +[2048.48425, "o", "\r\n"] +[2048.484667, "o", " 209 1 root S 2828 1% 0% /sbin/getty 38400 tty2"] +[2048.484914, "o", "\r\n"] +[2048.48537, "o", " 210 1 root S 2828 1% 0% /sbin/getty 38400 tty3"] +[2048.485513, "o", "\r\n"] +[2048.486013, "o", " 211 1 root S 2828 1% 0% /sbin/getty 38400 tty4"] +[2048.48629, "o", "\r\n"] +[2048.490808, "o", " 212 1 root S 2828 1% 0% /sbin/getty 38400 tty5"] +[2048.491041, "o", "\r\n"] +[2048.491575, "o", " 42 2 root SWN 0 0% 0% [kmemleak]"] +[2048.491767, "o", "\r\n"] +[2048.492158, "o", " 9 2 root SW 0 0% 0% [ksoftirqd/0]"] +[2048.492351, "o", "\r\n"] +[2048.492729, "o", " 39 2 root IW 0 0% 0% [kworker/0:2-eve]"] +[2048.492892, "o", "\r\n"] +[2048.493419, "o", " 13 2 root SW 0 0% 0% [kdevtmpfs]"] +[2048.493507, "o", "\r\n"] +[2048.493872, "o", " 38 2 root IW 0 0% 0% [kworker/u2:1-ev]"] +[2048.494245, "o", "\r\n"] +[2048.499119, "o", " 7 2 root IW 0 0% 0% [kworker/u2:0-ev]"] 
+[2048.499359, "o", "\r\n"] +[2048.499523, "o", " 34 2 root IW< 0 0% 0% [kworker/0:1H-kb]\r"] +[2049.597664, "o", "\n\u001b[23;80H \u001b[24;1H"] +[2049.601147, "o", "root@qemux86:~# "] +[2050.846207, "o", "c"] +[2050.950473, "o", "a"] +[2051.318748, "o", "t"] +[2051.478115, "o", " "] +[2051.6701, "o", "/"] +[2051.789863, "o", "p"] +[2051.862219, "o", "r"] +[2052.118358, "o", "o"] +[2052.357989, "o", "c"] +[2052.462199, "o", "/"] +[2054.438526, "o", "3"] +[2055.37389, "o", "4"] +[2055.414188, "o", "7"] +[2055.710106, "o", "/"] +[2055.966042, "o", "m"] +[2056.030253, "o", "a"] +[2056.126617, "o", "p"] +[2056.237966, "o", "s"] +[2056.751199, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.776691, "o", "08048000-080c2000 r-xp 00000000 fe:00 669 /bin/busybox.nosuid"] +[2056.776882, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.777068, "o", "080c2000-080c3000 r--p 00079000 fe:00 669 /bin/busybox.nosuid"] +[2056.777196, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.77744, "o", "080c3000-080c4000 rw-p 0007a000 fe:00 669 /bin/busybox.nosuid"] +[2056.777507, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.777832, "o", "080c4000-080c6000 rw-p 00000000 00:00 0 "] +[2056.777966, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.784163, "o", "08572000-08593000 rw-p 00000000 00:00 0 [heap]"] +[2056.784397, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.784596, "o", "4480c000-4482e000 r-xp 00000000 fe:00 576 /lib/ld-2.25.so"] +[2056.784771, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.784985, "o", "4482e000-4482f000 r--p 00021000 fe:00 576 /lib/ld-2.25.so"] +[2056.785371, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.785477, "o", "4482f000-44830000 rw-p 00022000 fe:00 576 /lib/ld-2.25.so"] +[2056.785705, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.785842, "o", "44832000-449a9000 r-xp 00000000 fe:00 581 /lib/libc-2.25.so"] +[2056.786153, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.786344, "o", "449a9000-449ab000 r--p 00176000 fe:00 581 /lib/libc-2.25.so"] +[2056.786584, "o", 
"\r\n\u001b[23;80H \u001b[24;1H"] +[2056.786747, "o", "449ab000-449ac000 rw-p 00178000 fe:00 581 /lib/libc-2.25.so"] +[2056.786978, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.787073, "o", "449ac000-449af000 rw-p 00000000 00:00 0 "] +[2056.787189, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.787414, "o", "449b1000-44a09000 r-xp 00000000 fe:00 641 /lib/libm-2.25.so"] +[2056.787514, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.787629, "o", "44a09000-44a0a000 r--p 00057000 fe:00 641 /lib/libm-2.25.so"] +[2056.787795, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.787962, "o", "44a0a000-44a0b000 rw-p 00058000 fe:00 641 /lib/libm-2.25.so"] +[2056.788132, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.78828, "o", "b7f1e000-b7f44000 rw-p 00000000 00:00 0 "] +[2056.788532, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.788657, "o", "b7f61000-b7f77000 r-xp 00000000 fe:00 579 /lib/libnsl-2.25.so"] +[2056.788811, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.789014, "o", "b7f77000-b7f78000 r--p 00015000 fe:00 579 /lib/libnsl-2.25.so"] +[2056.789111, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.789243, "o", "b7f78000-b7f79000 rw-p 00016000 fe:00 579 /lib/libnsl-2.25.so"] +[2056.789421, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.789588, "o", "b7f79000-b7f7b000 rw-p 00000000 00:00 0 "] +[2056.789746, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.789877, "o", "b7f7b000-b7f82000 r-xp 00000000 fe:00 608 /lib/libnss_compat-2.25.so"] +[2056.790117, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.79029, "o", "b7f82000-b7f83000 ---p 00007000 fe:00 608 /lib/libnss_compat-2.25.so"] +[2056.790438, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.790597, "o", "b7f83000-b7f84000 r--p 00007000 fe:00 608 /lib/libnss_compat-2.25.so"] +[2056.790802, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.790899, "o", "b7f84000-b7f85000 rw-p 00008000 fe:00 608 /lib/libnss_compat-2.25.so"] +[2056.791055, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.791213, "o", "b7f86000-b7f88000 rw-p 00000000 00:00 0 "] 
+[2056.79138, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.791605, "o", "b7f88000-b7f8c000 r--p 00000000 00:00 0 [vvar]"] +[2056.791733, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.791833, "o", "b7f8c000-b7f8e000 r-xp 00000000 00:00 0 [vdso]"] +[2056.792086, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.792209, "o", "bfe08000-bfe29000 rw-p 00000000 00:00 0 [stack]"] +[2056.792366, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[2056.81284, "o", "root@qemux86:~# "] +[2062.022254, "o", "#"] +[2062.222725, "o", " "] +[2064.231146, "o", " "] +[2064.231511, "o", "0"] +[2064.231736, "o", "x"] +[2064.232109, "o", "b"] +[2064.232357, "o", "f"] +[2064.23262, "o", "e"] +[2064.23285, "o", "2"] +[2064.23307, "o", "7"] +[2064.233356, "o", "9"] +[2064.233643, "o", "9"] +[2064.233852, "o", "0"] +[2065.526575, "o", " "] +[2068.24684, "o", "p"] +[2068.37458, "o", "o"] +[2069.389314, "o", "i"] +[2069.477747, "o", "n"] +[2069.598037, "o", "t"] +[2069.838099, "o", "s"] +[2069.974039, "o", " "] +[2071.190877, "o", "t"] +[2071.230772, "o", "o"] +[2071.325977, "o", " "] +[2073.598171, "o", "s"] +[2074.46266, "o", "t"] +[2074.559024, "o", "a"] +[2074.798529, "o", "c"] +[2074.862792, "o", "k"] +[2076.69481, "o", " "] +[2076.942032, "o", "i"] +[2077.014286, "o", "n"] +[2077.197566, "o", "d"] +[2077.31816, "o", "e"] +[2077.485824, "o", "e"] +[2077.597694, "o", "d"] diff --git a/refs/pull/405/merge/_images/kernel-virtmem-map.png b/refs/pull/405/merge/_images/kernel-virtmem-map.png new file mode 100644 index 00000000..25ffb7a6 Binary files /dev/null and b/refs/pull/405/merge/_images/kernel-virtmem-map.png differ diff --git a/refs/pull/405/merge/_images/kernel-virtmem-map1.png b/refs/pull/405/merge/_images/kernel-virtmem-map1.png new file mode 100644 index 00000000..25ffb7a6 Binary files /dev/null and b/refs/pull/405/merge/_images/kernel-virtmem-map1.png differ diff --git a/refs/pull/405/merge/_images/kernel_threads.cast b/refs/pull/405/merge/_images/kernel_threads.cast new file mode 100644 index 
00000000..9001f42b --- /dev/null +++ b/refs/pull/405/merge/_images/kernel_threads.cast @@ -0,0 +1,1350 @@ +{"version": 2, "width": 80, "height": 24, "timestamp": 1615904217, "idle_time_limit": 1.0, "env": {"SHELL": null, "TERM": "xterm"}} +[0.002092, "o", "$ "] +[1.411094, "o", "m"] +[1.492191, "o", "a"] +[1.57441, "o", "k"] +[1.686334, "o", "e"] +[1.750754, "o", " "] +[1.951, "o", "g"] +[2.039054, "o", "d"] +[2.111579, "o", "b"] +[2.350633, "o", "\r\n"] +[2.356524, "o", "gdb -ex \"target remote localhost:1234\" /linux/vmlinux\r\n"] +[2.390298, "o", "\u001b[35;1m\u001b[35;1mGNU gdb \u001b[m\u001b[35;1m(Ubuntu 9.2-0ubuntu1~20.04) \u001b[m\u001b[35;1m9.2\u001b[m\u001b[35;1m\r\n\u001b[m\u001b[mCopyright (C) 2020 Free Software Foundation, Inc.\r\nLicense GPLv3+: GNU GPL version 3 or later \r\nThis is free software: you are free to change and redistribute it.\r\nThere is NO WARRANTY, to the extent permitted by law.\r\nType \"show copying\" and \"show warranty\" for details.\r\nThis GDB was configured as \"x86_64-linux-gnu\".\r\nType \"show configuration\" for configuration details.\r\nFor bug reporting instructions, please see:\r\n.\r\nFind the GDB manual and other documentation resources online at:\r\n .\r\n\r\nFor help, type \"help\".\r\nType \"apropos word\" to search for commands related to \"word\"...\r\n"] +[2.390737, "o", "Reading symbols from \u001b[32m/linux/vmlinux\u001b[m...\r\n"] +[2.944725, "o", "Remote debugging using localhost:1234\r\n"] +[2.957375, "o", "\u001b[33mdefault_idle\u001b[m () at \u001b[32march/x86/kernel/process.c\u001b[m:689\r\n"] +[2.95746, "o", "689\t}\r\n"] +[2.957779, "o", "(gdb) "] +[5.428286, "o", "l"] +[5.524636, "o", "s"] +[5.780627, "o", "-"] +[6.367098, "o", "p"] +[6.509506, "o", "x"] +[7.278986, "o", "\r\n"] +[7.279172, "o", "Undefined command: \"ls-px\". 
Try \"help\".\r\n(gdb) "] +[8.676984, "o", "l"] +[8.850763, "o", "x"] +[9.039146, "o", "-"] +[9.318467, "o", "p"] +[9.500473, "o", "s"] +[9.750891, "o", "\r\n"] +[9.751114, "o", " TASK PID COMM\r\n"] +[9.75236, "o", "0xc17d02c0 0 swapper/0\r\n"] +[9.75369, "o", "0xc2530040 1 init\r\n"] +[9.7548, "o", "0xc2534080 2 kthreadd\r\n"] +[9.755807, "o", "0xc25360c0 3 rcu_gp\r\n"] +[9.756742, "o", "0xc2537100 4 rcu_par_gp\r\n"] +[9.757706, "o", "0xc2546180 6 kworker/0:0H\r\n"] +[9.758707, "o", "0xc25481c0 7 kworker/u2:0\r\n"] +[9.759604, "o", "0xc2549000 8 mm_percpu_wq\r\n"] +[9.76062, "o", "0xc254b040 9 ksoftirqd/0\r\n"] +[9.761626, "o", "0xc254c080 10 rcu_sched\r\n"] +[9.762529, "o", "0xc254e0c0 11 migration/0\r\n"] +[9.763454, "o", "0xc2572100 12 cpuhp/0\r\n"] +[9.764416, "o", "0xc2576140 13 kdevtmpfs\r\n"] +[9.765286, "o", "0xc2593180 14 netns\r\n"] +[9.766329, "o", "0xc26211c0 15 oom_reaper\r\n"] +[9.76739, "o", "0xc2623000 16 writeback\r\n"] +[9.768316, "o", "0xc25cd1c0 32 kblockd\r\n"] +[9.769188, "o", "0xc2638180 33 kworker/0:1\r\n"] +[9.770027, "o", "0xc2637140 34 kworker/0:1H\r\n"] +[9.770898, "o", "0xc2636100 35 kswapd0\r\n"] +[9.771804, "o", "0xc2634080 37 acpi_thermal_pm\r\n"] +[9.77276, "o", "0xc26350c0 38 kworker/u2:1\r\n"] +[9.773599, "o", "\u001b[m--Type for more, q to quit, c to continue without paging--"] +[16.511474, "o", "\r\n0xc2631040 39 kworker/0:2\r\n"] +[16.512921, "o", "0xc2630000 40 khvcd\r\n"] +[16.513907, "o", "0xc25d9180 41 ipv6_addrconf\r\n"] +[16.514686, "o", "0xc2627080 42 kmemleak\r\n"] +[16.515495, "o", "0xc26170c0 43 jbd2/vda-8\r\n"] +[16.516313, "o", "0xc2625040 44 ext4-rsv-conver\r\n"] +[16.517171, "o", "0xc842a1c0 187 udhcpc\r\n"] +[16.518006, "o", "0xcb365040 198 syslogd\r\n"] +[16.518734, "o", "0xc6730140 201 klogd\r\n"] +[16.519451, "o", "0xc260f100 207 getty\r\n"] +[16.520309, "o", "0xcb37a080 208 getty\r\n"] +[16.521243, "o", "0xc8431000 209 getty\r\n"] +[16.522122, "o", "0xc4b8a180 210 getty\r\n"] +[16.523042, "o", "0xcb375100 
211 getty\r\n"] +[16.524007, "o", "0xca6931c0 212 getty\r\n"] +[16.524667, "o", "(gdb) "] +[18.27663, "o", " "] +[19.237048, "o", "\b\u001b[K"] +[19.663197, "o", "#"] +[19.822869, "o", " "] +[21.012324, "o", "n"] +[21.079126, "o", "o"] +[21.3069, "o", "t"] +[21.565299, "o", "e"] +[21.791988, "o", " "] +[21.989943, "o", "t"] +[22.079654, "o", "h"] +[22.152775, "o", "a"] +[22.271404, "o", "t"] +[22.358257, "o", " "] +[22.495177, "o", "t"] +[22.616552, "o", "h"] +[22.689874, "o", "e"] +[22.759133, "o", "r"] +[22.851398, "o", "e"] +[22.944484, "o", " "] +[23.047542, "o", "a"] +[23.143223, "o", "r"] +[23.236144, "o", "e"] +[23.30143, "o", " "] +[23.479486, "o", "s"] +[23.726797, "o", "e"] +[23.996443, "o", "v"] +[24.285069, "o", "e"] +[24.769554, "o", "\b\u001b[K"] +[24.888103, "o", "\b\u001b[K"] +[25.372766, "o", "v"] +[25.486693, "o", "e"] +[25.780145, "o", "r"] +[25.831058, "o", "a"] +[25.957718, "o", "l"] +[26.079508, "o", " "] +[26.253337, "o", "t"] +[26.309159, "o", "a"] +[26.511159, "o", "s"] +[26.601906, "o", "k"] +[26.721828, "o", "s"] +[27.244749, "o", " "] +[28.574755, "o", "t"] +[28.854002, "o", "a"] +[29.471312, "o", "\b\u001b[K"] +[29.676745, "o", "h"] +[29.754496, "o", "a"] +[29.887311, "o", "t"] +[29.958997, "o", " "] +[30.166519, "o", "s"] +[30.315248, "o", "t"] +[30.393086, "o", "a"] +[30.975307, "o", "r"] +[31.599184, "o", "t"] +[31.752243, "o", " "] +[31.879063, "o", "w"] +[31.967152, "o", "i"] +[32.078599, "o", "t"] +[32.166707, "o", "h"] +[32.270994, "o", " "] +[32.471433, "o", "t"] +[32.542586, "o", "h"] +[32.686761, "o", "e"] +[32.763813, "o", " "] +[33.296049, "o", "L"] +[33.87867, "o", "\b\u001b[K"] +[35.524303, "o", "K"] +[36.343905, "o", "\b\u001b[K"] +[36.510508, "o", "k"] +[37.48386, "o", " "] +[37.8357, "o", "l"] +[37.922486, "o", "e"] +[38.118439, "o", "t"] +[38.276721, "o", "t"] +[38.376646, "o", "e"] +[38.503033, "o", "r"] +[38.911624, "o", "\r\n(gdb) "] +[39.276285, "o", "#"] +[39.519145, "o", " "] +[39.750673, "o", "t"] +[39.85472, 
"o", "h"] +[39.950603, "o", "e"] +[40.108249, "o", "s"] +[40.271864, "o", "e"] +[40.511774, "o", " "] +[40.801505, "o", "("] +[41.030531, "o", "b"] +[41.118878, "o", "u"] +[41.230505, "o", "t"] +[41.314769, "o", " "] +[41.740861, "o", "n"] +[41.824881, "o", "o"] +[42.019385, "o", "t"] +[42.14018, "o", " "] +[42.254553, "o", "o"] +[42.338673, "o", "n"] +[42.493789, "o", "l"] +[42.662862, "o", "y"] +[42.75931, "o", " "] +[42.916086, "o", "t"] +[43.020488, "o", "h"] +[43.084785, "o", "e"] +[43.263329, "o", "s"] +[43.362693, "o", "e"] +[43.608998, "o", ")"] +[43.774713, "o", " "] +[43.894708, "o", "a"] +[44.062643, "o", "r"] +[44.118689, "o", "e"] +[44.2443, "o", " "] +[44.431308, "o", "k"] +[44.550679, "o", "e"] +[44.612908, "o", "r"] +[44.708174, "o", "n"] +[44.790351, "o", "e"] +[44.894934, "o", "l"] +[44.974263, "o", " "] +[45.204614, "o", "t"] +[45.327253, "o", "h"] +[45.388938, "o", "r"] +[45.438216, "o", "e"] +[45.518535, "o", "a"] +[45.773446, "o", "d"] +[45.822653, "o", "s"] +[46.775316, "o", "\r\n(gdb) "] +[53.767289, "o", "#"] +[54.624466, "o", " "] +[54.85786, "o", "t"] +[54.962973, "o", "h"] +[55.108379, "o", "e"] +[55.926781, "o", " "] +[56.132174, "o", "p"] +[56.246669, "o", "s"] +[56.342818, "o", " "] +[56.523069, "o", "u"] +[56.729793, "o", "t"] +[56.791601, "o", "i"] +[56.998652, "o", "l"] +[57.102951, "o", "i"] +[57.433973, "o", "t"] +[57.510405, "o", "y"] +[57.663099, "o", " "] +[57.931719, "o", "m"] +[58.030756, "o", "a"] +[58.234054, "o", "r"] +[58.334973, "o", "k"] +[58.532386, "o", "s"] +[58.678812, "o", " "] +[58.88646, "o", "t"] +[58.982125, "o", "h"] +[59.094825, "o", "e"] +[59.236552, "o", "n"] +[59.630822, "o", " "] +[59.967128, "o", "w"] +[60.050881, "o", "i"] +[60.182217, "o", "t"] +[60.246628, "o", "h"] +[60.499501, "o", " "] +[60.996343, "o", "b"] +[61.062202, "o", "r"] +[61.15963, "o", "a"] +[61.606658, "o", "c"] +[61.710465, "o", "k"] +[61.790986, "o", "e"] +[61.926711, "o", "t"] +[62.094782, "o", "s"] +[62.243636, "o", " "] 
+[63.711846, "o", "["] +[63.806682, "o", "["] +[64.356046, "o", "\b\u001b[K"] +[64.620136, "o", "]"] +[65.375322, "o", "\r\n(gdb) "] +[68.557209, "o", "quit\r\n"] +[68.557963, "o", "A debugging session is active.\r\n\r\n\tInferior 1 [process 1] will be detached.\r\n\r\nQuit anyway? (y or n) "] +[69.452475, "o", "y"] +[69.607083, "o", "\r\nDetaching from program: /linux/vmlinux, process 1\r\n"] +[69.608099, "o", "Ending remote debugging.\r\n"] +[69.608145, "o", "[Inferior 1 (process 1) detached]\r\n"] +[69.613792, "o", "$ "] +[72.387606, "o", "m"] +[72.556239, "o", "i"] +[72.678491, "o", "n"] +[72.735372, "o", "i"] +[72.83213, "o", "c"] +[72.939861, "o", "o"] +[72.974432, "o", "m"] +[73.12681, "o", "d"] +[73.245949, "o", " "] +[73.451736, "o", "-"] +[73.787899, "o", "D"] +[73.926861, "o", " "] +[74.062585, "o", "s"] +[74.110529, "o", "e"] +[74.20401, "o", "r"] +[74.254354, "o", "i"] +[74.383866, "o", "a"] +[74.964635, "o", "l"] +[75.158299, "o", "."] +[75.358153, "o", "p"] +[75.483515, "o", "t"] +[75.690025, "o", "s"] +[75.791116, "o", "\r\n"] +[75.792551, "o", "sh: 2: minicomd: not found\r\n$ "] +[77.369862, "o", "^[[A"] +[78.147033, "o", "\b \b"] +[78.246726, "o", "\b \b"] +[78.374032, "o", "\b \b\b \b"] +[78.726508, "o", "m"] +[78.830234, "o", "i"] +[78.942715, "o", "n"] +[78.998286, "o", "i"] +[79.16445, "o", "c"] +[79.219047, "o", "o"] +[79.28939, "o", "m"] +[79.699735, "o", " "] +[79.851281, "o", "-"] +[80.22679, "o", "D"] +[80.359667, "o", " "] +[80.494074, "o", "s"] +[80.58978, "o", "e"] +[80.665833, "o", "r"] +[80.734429, "o", "i"] +[80.831438, "o", "a"] +[80.902218, "o", "l"] +[81.065769, "o", "."] +[81.235249, "o", "p"] +[81.354437, "o", "t"] +[81.667451, "o", "s"] +[82.015056, "o", "\r\n"] +[82.0318, "o", "\u001b[!p\u001b[?3;4l\u001b[4l\u001b>\u001b[0m\u001b(B\u001b[?1h\u001b=\u001b[H\u001b[2J"] +[82.032263, "o", "\u001b[?12l\u001b[?25h\nWelcome to minicom 2.7.1\r\n\nOPTIONS: I18n \r\nCompiled on Dec 23 2019, 02:06:26.\r\nPort serial.pts, 
11:09:49\r\n\nPress CTRL-A Z for help on special keys\r\n\n"] +[82.665267, "o", "\n"] +[82.670205, "o", "Poky (Yocto Project Reference Distro) 2.3 qemux86 /dev/hvc0"] +[82.670382, "o", "\r\n"] +[82.671379, "o", "\n"] +[82.673617, "o", "qemux86 login: "] +[83.310781, "o", "r"] +[83.35226, "o", "o"] +[83.486962, "o", "o"] +[83.559101, "o", "t"] +[83.640496, "o", "\r\n"] +[83.807489, "o", "root@qemux86:~# "] +[84.151513, "o", "p"] +[84.239303, "o", "s"] +[84.353291, "o", "\r\n"] +[84.365744, "o", " PID USER VSZ STAT COMMAND"] +[84.366019, "o", "\r\n"] +[84.382473, "o", " 1 root 2004 S init [5]"] +[84.382916, "o", "\r\n"] +[84.384492, "o", " 2 root 0 SW [kthreadd]"] +[84.384691, "o", "\r\n"] +[84.385851, "o", " 3 root 0 IW< [rcu_gp]"] +[84.386047, "o", "\r\n"] +[84.387605, "o", " 4 root 0 IW< [rcu_par_gp]"] +[84.387746, "o", "\r\n"] +[84.388848, "o", " 6 root 0 IW< [kworker/0:0H-ev]"] +[84.388994, "o", "\r\n"] +[84.38998, "o", " 7 root 0 IW [kworker/u2:0-ev]"] +[84.390164, "o", "\r\n"] +[84.391562, "o", " 8 root 0 IW< [mm_percpu_wq]"] +[84.391813, "o", "\r\n"] +[84.392754, "o", " 9 root 0 SW [ksoftirqd/0]"] +[84.392996, "o", "\r\n"] +[84.393827, "o", " 10 root 0 IW [rcu_sched]"] +[84.394025, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.395414, "o", " 11 root 0 SW [migration/0]"] +[84.395571, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.396462, "o", " 12 root 0 SW [cpuhp/0]"] +[84.39663, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.397513, "o", " 13 root 0 SW [kdevtmpfs]"] +[84.397659, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.399276, "o", " 14 root 0 IW< [netns]"] +[84.399499, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.40051, "o", " 15 root 0 SW [oom_reaper]"] +[84.400715, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.4016, "o", " 16 root 0 IW< [writeback]"] +[84.401763, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.403336, "o", " 32 root 0 IW< [kblockd]"] +[84.403515, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.404416, "o", " 33 root 0 IW [kworker/0:1-mm_]"] +[84.404597, 
"o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.405656, "o", " 34 root 0 IW< [kworker/0:1H-kb]"] +[84.405869, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.407531, "o", " 35 root 0 SW [kswapd0]"] +[84.40772, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.408696, "o", " 37 root 0 IW< [acpi_thermal_pm]"] +[84.408838, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.409754, "o", " 38 root 0 IW [kworker/u2:1-ev]"] +[84.409941, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.411415, "o", " 39 root 0 IW [kworker/0:2-eve]"] +[84.411603, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.412584, "o", " 40 root 0 SW [khvcd]"] +[84.412802, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.413892, "o", " 41 root 0 IW< [ipv6_addrconf]"] +[84.414073, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.415589, "o", " 42 root 0 SWN [kmemleak]"] +[84.415761, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.416609, "o", " 43 root 0 SW [jbd2/vda-8]"] +[84.416767, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.417633, "o", " 44 root 0 IW< [ext4-rsv-conver]"] +[84.417776, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.419678, "o", " 187 root 2828 S udhcpc -R -b -p /var/run/udhcpc.eth0.pid -i eth0"] +[84.419824, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.421049, "o", " 198 root 2828 S /sbin/syslogd -n -O /var/log/messages"] +[84.421222, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.422318, "o", " 201 root 2828 S /sbin/klogd -n"] +[84.422683, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.424349, "o", " 207 root 2828 S /sbin/getty 38400 tty1"] +[84.424461, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.425792, "o", " 208 root 2972 S -sh\r\n\u001b[23;80H \u001b[24;1H"] +[84.427611, "o", " 209 root 2828 S /sbin/getty 38400 tty2"] +[84.427768, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.428986, "o", " 210 root 2828 S /sbin/getty 38400 tty3"] +[84.429144, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.430325, "o", " 211 root 2828 S /sbin/getty 38400 tty4"] +[84.430573, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.432246, "o", " 212 root 2828 S 
/sbin/getty 38400 tty5"] +[84.43239, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.433849, "o", " 932 root 2976 R ps"] +[84.434032, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[84.437908, "o", "root@qemux86:~# "] +[99.144067, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[99.145467, "o", "root@qemux86:~# "] +[99.862108, "o", "\u001b[0m\u001b(B\u001b[7m\r\u001b[K\u001b[?12l\u001b[?25h\u001b[?25lCTRL-A Z for help | 115200 8N1 | NOR | Minicom 2.7.1 | VT102 | Offline | al.pts\u001b[?12l\u001b[?25h\u001b[24;17H"] +[100.044152, "o", "\u001b[8;30H\u001b[?25l\u001b[0m\u001b(B\u001b(0lqqqqqqqqqqqqqqqqqqqqqqk\u001b[9;30Hx\u001b[0m\u001b(B Leave Minicom? \u001b[0m\u001b(B\u001b(0x\u001b[10;30Hx\u001b[0m\u001b(B No \u001b[0m\u001b(B\u001b(0x\u001b[11;30Hmqqqqqqqqqqqqqqqqqqqqqqj\u001b[10;51H\u001b[?25l\u001b[10;33H\u001b[0m\u001b(B\u001b[7m Yes "] +[100.583354, "o", "\u001b[?12l\u001b[?25h\u001b[8;1H\u001b[0m\u001b(B 40 root 0 SW [khvcd] \u001b[9;1H 41 root 0 IW< [ipv6_addrconf] \u001b[10;1H 42 root 0 SWN [kmemleak] \u001b[11;1H 43 root 0 SW [jbd2/vda-8] \u001b[24;17H\u001b[0m\u001b(B\u001b[7m\u001b[?12l\u001b[?25h\u001b[?12l\u001b[?25h"] +[100.583429, "o", "\u001b[0m\u001b(B\u001b[H\u001b[2J\u001b[?12l\u001b[?25h\u001b[?1l\u001b>\u001b[!p\u001b[?3;4l\u001b[4l\u001b>"] +[100.58371, "o", "$ "] +[101.414149, "o", "^[[A"] +[101.566416, "o", "^[[A"] +[101.886558, "o", "^[[A"] +[102.335042, "o", "\b \b"] +[102.842375, "o", "\b \b"] +[102.872862, "o", "\b \b\b \b"] +[102.904871, "o", "\b \b"] +[102.937132, "o", "\b \b"] +[102.969648, "o", "\b \b\b \b"] +[103.01399, "o", "\b \b"] +[103.046078, "o", "\b \b"] +[103.079602, "o", "\b \b\b \b"] +[103.486251, "o", "m"] +[103.572083, "o", "a"] +[103.611934, "o", "m"] +[103.642732, "o", "k"] +[103.734628, "o", "e"] +[103.837599, "o", " "] +[104.06262, "o", "g"] +[104.35936, "o", "\b \b"] +[104.491737, "o", "\b \b"] +[104.620252, "o", "\b \b"] +[104.731786, "o", "\b \b"] +[104.904506, "o", "k"] +[105.139616, "o", "\b \b"] +[105.26003, "o", "\b \b"] 
+[105.374543, "o", "k"] +[105.47003, "o", "e"] +[105.550545, "o", " "] +[105.946754, "o", "g"] +[106.106541, "o", "d"] +[106.171182, "o", "b"] +[106.471309, "o", "\r\n"] +[106.477162, "o", "gdb -ex \"target remote localhost:1234\" /linux/vmlinux\r\n"] +[106.511747, "o", "\u001b[35;1m\u001b[35;1mGNU gdb \u001b[m\u001b[35;1m(Ubuntu 9.2-0ubuntu1~20.04) \u001b[m\u001b[35;1m9.2\u001b[m\u001b[35;1m\r\n\u001b[m\u001b[mCopyright (C) 2020 Free Software Foundation, Inc.\r\nLicense GPLv3+: GNU GPL version 3 or later \r\nThis is free software: you are free to change and redistribute it.\r\nThere is NO WARRANTY, to the extent permitted by law.\r\nType \"show copying\" and \"show warranty\" for details.\r\nThis GDB was configured as \"x86_64-linux-gnu\".\r\nType \"show configuration\" for configuration details.\r\nFor bug reporting instructions, please see:\r\n.\r\nFind the GDB manual and other documentation resources online at:\r\n .\r\n\r\nFor help, type \"help\".\r\nType \"apropos word\" to search for commands related to \"word\"...\r\n"] +[106.512051, "o", "Reading symbols from \u001b[32m/linux/vmlinux\u001b[m...\r\n"] +[107.070342, "o", "Remote debugging using localhost:1234\r\n"] +[107.082871, "o", "\u001b[33mdefault_idle\u001b[m () at \u001b[32march/x86/kernel/process.c\u001b[m:689\r\n"] +[107.082941, "o", "689\t}\r\n"] +[107.083245, "o", "(gdb) "] +[108.67027, "o", "l"] +[108.79013, "o", "s"] +[110.752881, "o", "-"] +[111.158637, "o", "\b\u001b[K"] +[111.294213, "o", "\b\u001b[K"] +[111.375362, "o", "x"] +[111.56393, "o", "-"] +[112.075891, "o", "p"] +[112.716755, "o", "s"] +[112.983024, "o", "\r\n"] +[112.983204, "o", " TASK PID COMM\r\n"] +[112.984424, "o", "0xc17d02c0 0 swapper/0\r\n"] +[112.985648, "o", "0xc2530040 1 init\r\n"] +[112.986756, "o", "0xc2534080 2 kthreadd\r\n"] +[112.987788, "o", "0xc25360c0 3 rcu_gp\r\n"] +[112.98883, "o", "0xc2537100 4 rcu_par_gp\r\n"] +[112.989979, "o", "0xc2546180 6 kworker/0:0H\r\n"] +[112.990945, "o", "0xc25481c0 7 
kworker/u2:0\r\n"] +[112.991808, "o", "0xc2549000 8 mm_percpu_wq\r\n"] +[112.992762, "o", "0xc254b040 9 ksoftirqd/0\r\n"] +[112.993594, "o", "0xc254c080 10 rcu_sched\r\n"] +[112.994443, "o", "0xc254e0c0 11 migration/0\r\n"] +[112.995327, "o", "0xc2572100 12 cpuhp/0\r\n"] +[112.996231, "o", "0xc2576140 13 kdevtmpfs\r\n"] +[112.997154, "o", "0xc2593180 14 netns\r\n"] +[112.997928, "o", "0xc26211c0 15 oom_reaper\r\n"] +[112.998731, "o", "0xc2623000 16 writeback\r\n"] +[112.999501, "o", "0xc25cd1c0 32 kblockd\r\n"] +[113.000229, "o", "0xc2638180 33 kworker/0:1\r\n"] +[113.000993, "o", "0xc2637140 34 kworker/0:1H\r\n"] +[113.001713, "o", "0xc2636100 35 kswapd0\r\n"] +[113.002441, "o", "0xc2634080 37 acpi_thermal_pm\r\n"] +[113.003341, "o", "0xc26350c0 38 kworker/u2:1\r\n"] +[113.004215, "o", "\u001b[m--Type for more, q to quit, c to continue without paging--"] +[114.273671, "o", "q"] +[116.3102, "o", "\r\n"] +[116.31037, "o", "Quit\r\n(gdb) "] +[117.96624, "o", "#"] +[118.254117, "o", " "] +[118.40776, "o", "l"] +[118.491374, "o", "e"] +[118.654629, "o", "t"] +[118.828047, "o", "s"] +[118.945063, "o", " "] +[119.111903, "o", "i"] +[119.185942, "o", "n"] +[119.254491, "o", "s"] +[119.411189, "o", "p"] +[119.478562, "o", "e"] +[119.566175, "o", "c"] +[120.054814, "o", "t"] +[121.318467, "o", " "] +[122.796389, "o", "t"] +[122.876021, "o", "h"] +[123.006447, "o", "e"] +[123.121491, "o", " "] +[123.486383, "o", "k"] +[123.662142, "o", "s"] +[123.761024, "o", "o"] +[123.827853, "o", "f"] +[124.056733, "o", "t"] +[124.884417, "o", "i"] +[124.998466, "o", "r"] +[125.090886, "o", "q"] +[125.458436, "o", "d"] +[126.11713, "o", " "] +[126.34268, "o", "k"] +[126.423265, "o", "e"] +[126.470594, "o", "r"] +[126.581815, "o", "n"] +[126.662065, "o", "e"] +[126.766791, "o", "l"] +[126.867196, "o", " "] +[127.013848, "o", "t"] +[127.11873, "o", "h"] +[127.196182, "o", "r"] +[127.252684, "o", "e"] +[127.302566, "o", "a"] +[127.525289, "o", "d"] +[127.612912, "o", "s"] +[128.91899, "o", 
"\b\u001b[K"] +[129.079044, "o", "\r\n"] +[129.079153, "o", "(gdb) "] +[134.822513, "o", "p"] +[134.98218, "o", "r"] +[135.04637, "o", "i"] +[135.095337, "o", "n"] +[135.213449, "o", "t"] +[135.367892, "o", " "] +[135.948456, "o", "("] +[137.420016, "o", "("] +[137.64176, "o", "s"] +[137.791566, "o", "t"] +[137.847118, "o", "r"] +[137.958465, "o", "u"] +[138.060018, "o", "c"] +[138.267908, "o", "t"] +[138.871323, "o", " "] +[139.042958, "o", "t"] +[139.103241, "o", "a"] +[139.242309, "o", "s"] +[139.308529, "o", "k"] +[139.510379, "o", "_"] +[139.682368, "o", "s"] +[139.887398, "o", "u"] +[140.423883, "o", "\b\u001b[K"] +[140.518478, "o", "t"] +[140.671075, "o", "r"] +[140.742392, "o", "u"] +[140.875932, "o", "c"] +[141.086603, "o", "t"] +[141.483711, "o", "*"] +[141.670866, "o", ")"] +[143.13397, "o", "0xc254b040"] +[144.158432, "o", ")"] +[144.861881, "o", "-"] +[145.332921, "o", ">"] +[145.765653, "o", "\u0007"] +[146.052545, "o", "\r\nDisplay all 156 possibilities? (y or n)"] +[148.83855, "o", "\r\n(gdb) print ((struct task_struct*)0xc254b040)->"] +[151.566382, "o", "m"] +[151.708377, "o", "m"] +[152.208982, "o", " "] +[152.590906, "o", "\r\n"] +[152.606852, "o", "$1 = (struct mm_struct *) \u001b[34m0x0\u001b[m\r\n(gdb) "] +[153.7479, "o", "#"] +[153.990068, "o", " "] +[154.220116, "o", "n"] +[154.293473, "o", "o"] +[154.476203, "o", "t"] +[154.549343, "o", "i"] +[154.718407, "o", "c"] +[154.792436, "o", "e"] +[155.435468, "o", " "] +[155.623648, "o", "t"] +[155.71801, "o", "h"] +[155.820625, "o", "a"] +[155.910595, "o", "t"] +[156.03814, "o", " "] +[156.275706, "o", "t"] +[156.374535, "o", "h"] +[156.467813, "o", "e"] +[156.534298, "o", "r"] +[156.606573, "o", "e"] +[156.708536, "o", " "] +[156.820267, "o", "i"] +[156.926727, "o", "s"] +[157.036148, "o", " "] +[157.260091, "o", "n"] +[157.332754, "o", "o"] +[157.521488, "o", " "] +[157.621793, "o", "a"] +[157.950278, "o", "d"] +[158.123876, "o", "d"] +[158.332801, "o", "r"] +[158.382158, "o", "e"] 
+[158.571143, "o", "s"] +[158.688164, "o", "s"] +[158.818238, "o", " "] +[158.910157, "o", "s"] +[159.013842, "o", "p"] +[159.102999, "o", "a"] +[159.299519, "o", "c"] +[159.363577, "o", "e"] +[159.494688, "o", " "] +[159.600096, "o", "a"] +[159.85968, "o", "s"] +[160.022336, "o", "s"] +[160.13205, "o", "o"] +[160.270871, "o", "c"] +[160.335137, "o", "i"] +[160.382535, "o", "a"] +[160.580297, "o", "t"] +[160.636041, "o", "e"] +[160.792826, "o", "d"] +[160.878038, "o", " "] +[160.990215, "o", "w"] +[161.078373, "o", "i"] +[161.214096, "o", "t"] +[161.286032, "o", "h"] +[161.382406, "o", " "] +[161.466374, "o", "t"] +[161.573907, "o", "h"] +[161.654564, "o", "e"] +[161.770776, "o", " "] +[161.918309, "o", "k"] +[162.030649, "o", "e"] +[162.070539, "o", "r"] +[162.155973, "o", "n"] +[162.237838, "o", "e"] +[162.295025, "o", "l"] +[162.42764, "o", " "] +[163.094332, "o", "\b\u001b[K"] +[163.222493, "o", "\b\u001b[K"] +[164.206568, "o", "\b\u001b[K"] +[164.339329, "o", "\b\u001b[K"] +[164.468159, "o", "\b\u001b[K"] +[164.590204, "o", "\b\u001b[K"] +[164.724112, "o", "\b\u001b[K"] +[164.892234, "o", "t"] +[165.055837, "o", "\b\u001b[K"] +[165.198298, "o", "\b\u001b[K"] +[165.335496, "o", "\b\u001b[K"] +[165.468275, "o", "\b\u001b[K"] +[165.62238, "o", "\b\u001b[K"] +[166.246756, "o", "t"] +[166.326615, "o", "h"] +[166.382333, "o", "i"] +[166.4519, "o", "s"] +[166.541987, "o", " "] +[166.708252, "o", "t"] +[166.819081, "o", "a"] +[166.908185, "o", "s"] +[167.053111, "o", "k"] +[167.36667, "o", "\r\n(gdb) "] +[170.791095, "o", "# notice that there is no address space associated with this task"] +[171.750239, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[21Pprint ((struct task_struct*)0xc254b040)->mm "] +[172.23017, "o", "\b\u001b[K"] +[172.386797, "o", "\b\u001b[K"] +[172.531384, "o", "\b\u001b[K"] +[172.873833, "o", "f"] +[173.203858, "o", "i"] +[173.365186, "o", "les "] +[174.539751, "o", "\b\u001b[K"] +[174.715458, "o", "-"] +[175.180139, "o", ">"] 
+[175.871471, "o", "\u0007"] +[176.032089, "o", "\r\nclose_on_exec_init fdtab open_fds_init\r\ncount file_lock resize_in_progress\r\nfd_array full_fds_bits_init resize_wait\r\nfdt next_fd \r\n(gdb) print ((struct task_struct*)0xc254b040)->files->"] +[177.45997, "o", "f"] +[177.754961, "o", "d"] +[178.438573, "o", "_"] +[179.441568, "o", "a"] +[179.67653, "o", "r"] +[179.854296, "o", "r"] +[179.966081, "o", "ay "] +[180.934307, "o", "\r\n"] +[180.954674, "o", "$2 = {\u001b[34m0x0\u001b[m \u001b[2m\u001b[m}\r\n(gdb) "] +[182.787841, "o", "#"] +[183.110265, "o", " "] +[183.479449, "o", "a"] +[183.614116, "o", "l"] +[183.783304, "o", "s"] +[183.866187, "o", "o"] +[184.006426, "o", " "] +[184.275627, "o", "n"] +[184.333689, "o", "o"] +[184.82741, "o", "t"] +[184.926539, "o", "i"] +[185.106651, "o", "c"] +[185.165936, "o", "e"] +[185.308254, "o", " "] +[185.430403, "o", "t"] +[185.517961, "o", "h"] +[185.584954, "o", "a"] +[185.670047, "o", "t"] +[186.034584, "o", " "] +[186.222612, "o", "t"] +[186.315392, "o", "h"] +[186.405867, "o", "e"] +[186.477735, "o", "r"] +[186.570951, "o", "e"] +[186.621919, "o", " "] +[186.749962, "o", "a"] +[186.814544, "o", "r"] +[186.915797, "o", "e"] +[186.971487, "o", " "] +[187.165461, "o", "n"] +[187.246091, "o", "o"] +[187.346347, "o", " "] +[187.559526, "o", "o"] +[187.668583, "o", "p"] +[187.718056, "o", "e"] +[188.950305, "o", "n"] +[189.132854, "o", "e"] +[189.278635, "o", "d"] +[189.470868, "o", " "] +[189.824985, "o", "f"] +[189.950471, "o", "i"] +[190.036377, "o", "l"] +[190.103353, "o", "e"] +[190.302439, "o", "s"] +[193.022303, "o", "\r\n(gdb) "] +[206.790997, "o", "b"] +[206.883683, "o", "t"] +[207.622064, "o", "\r\n"] +[207.622477, "o", "#0 \u001b[33mdefault_idle\u001b[m () at \u001b[32march/x86/kernel/process.c\u001b[m:689\r\n#1 \u001b[34m0xc102c18d\u001b[m in \u001b[33march_cpu_idle\u001b[m () at \u001b[32march/x86/kernel/process.c\u001b[m:680\r\n"] +[207.628692, "o", "#2 \u001b[34m0xc15de082\u001b[m in 
\u001b[33mdefault_idle_call\u001b[m () at \u001b[32mkernel/sched/idle.c\u001b[m:112\r\n"] +[207.628733, "o", "#3 \u001b[34m0xc108d875\u001b[m in \u001b[33mcpuidle_idle_call\u001b[m () at \u001b[32mkernel/sched/idle.c\u001b[m:194\r\n"] +[207.62875, "o", "#4 \u001b[33mdo_idle\u001b[m () at \u001b[32mkernel/sched/idle.c\u001b[m:299\r\n"] +[207.635135, "o", "#5 \u001b[34m0xc108dbd5\u001b[m in \u001b[33mcpu_startup_entry\u001b[m (\u001b[36mstate=state@entry\u001b[m=CPUHP_ONLINE)\u001b[m\r\n \u001b[m at \u001b[32mkernel/sched/idle.c\u001b[m:395\r\n#6 \u001b[34m0xc15d6100\u001b[m in \u001b[33mrest_init\u001b[m () at \u001b[32minit/main.c\u001b[m:721\r\n#7 \u001b[34m0xc18c77de\u001b[m in \u001b[33march_call_rest_init\u001b[m () at \u001b[32minit/main.c\u001b[m:845\r\n"] +[207.635417, "o", "#8 \u001b[34m0xc18c7c30\u001b[m in \u001b[33mstart_kernel\u001b[m () at \u001b[32minit/main.c\u001b[m:1061\r\n"] +[207.637653, "o", "#9 \u001b[34m0xc18c7218\u001b[m in \u001b[33mi386_start_kernel\u001b[m () at \u001b[32march/x86/kernel/head32.c\u001b[m:56\r\n"] +[207.638067, "o", "#10 \u001b[34m0xc10001db\u001b[m in \u001b[33mstartup_32_smp\u001b[m () at \u001b[32march/x86/kernel/head_32.S\u001b[m:327\r\n"] +[207.638979, "o", "#11 \u001b[34m0x00000000\u001b[m in \u001b[33m??\u001b[m ()\r\n"] +[207.639304, "o", "(gdb) "] +[221.379671, "o", "#"] +[221.574477, "o", " "] +[221.763647, "o", "t"] +[221.859656, "o", "h"] +[221.910208, "o", "i"] +[222.230146, "o", "s"] +[223.003102, "o", " "] +[223.24279, "o", "d"] +[223.342376, "o", "o"] +[223.407244, "o", "e"] +[223.515893, "o", "s"] +[223.822602, "o", " "] +[224.052215, "o", "n"] +[224.126804, "o", "o"] +[224.262025, "o", "t"] +[224.350204, "o", " "] +[224.566282, "o", "l"] +[224.715391, "o", "o"] +[224.851651, "o", "o"] +[224.892089, "o", "k"] +[225.071297, "o", " "] +[225.204093, "o", "a"] +[225.777377, "o", "\b\u001b[K"] +[225.938217, "o", "l"] +[226.117327, "o", "i"] +[226.267363, "o", "k"] +[226.381869, "o", "e"] +[226.453866, "o", " "] 
+[226.610517, "o", "a"] +[226.678686, "o", " "] +[227.092186, "o", "k"] +[227.652524, "o", "\b\u001b[K"] +[227.802894, "o", "\b\u001b[K"] +[228.462601, "o", " "] +[228.691067, "o", "b"] +[228.726474, "o", "a"] +[228.861766, "o", "c"] +[228.973894, "o", "k"] +[229.137284, "o", "t"] +[229.283711, "o", "r"] +[229.350451, "o", "a"] +[229.511061, "o", "c"] +[229.59807, "o", "e"] +[229.749979, "o", " "] +[230.186502, "o", "f"] +[230.257607, "o", "o"] +[230.38212, "o", "r"] +[230.453689, "o", " "] +[230.550129, "o", "a"] +[230.679919, "o", " "] +[234.654082, "o", "e"] +[234.740562, "o", "r"] +[235.011631, "o", "\b\u001b[K"] +[235.118657, "o", "\b\u001b[K"] +[235.298981, "o", "k"] +[235.390755, "o", "e"] +[235.437752, "o", "r"] +[235.526007, "o", "n"] +[235.623409, "o", "e"] +[235.763733, "o", "l"] +[235.773498, "o", " "] +[236.126636, "o", "t"] +[236.241013, "o", "h"] +[236.33206, "o", "r"] +[236.397333, "o", "e"] +[236.446064, "o", "a"] +[236.607301, "o", "d"] +[240.062311, "o", "\r\n"] +[240.062479, "o", "(gdb) "] +[240.711316, "o", "#"] +[240.90636, "o", " "] +[241.166191, "o", "l"] +[241.419451, "o", "e"] +[241.659109, "o", "t"] +[241.86961, "o", "s"] +[242.043257, "o", " "] +[244.843651, "o", "p"] +[244.918102, "o", "p"] +[245.016449, "o", "u"] +[245.158034, "o", "t"] +[245.244774, "o", " "] +[245.393348, "o", "a"] +[245.41361, "o", " "] +[245.76255, "o", "\b\u001b[K"] +[245.881592, "o", "\b\u001b[K"] +[246.006241, "o", "\b\u001b[K"] +[246.137238, "o", "\b\u001b[K"] +[246.272147, "o", "\b\u001b[K"] +[246.395526, "o", "\b\u001b[K"] +[246.574088, "o", "i"] +[246.677415, "o", "t"] +[247.062963, "o", "\b\u001b[K"] +[247.181883, "o", "\b\u001b[K"] +[247.334239, "o", "u"] +[247.446138, "o", "t"] +[247.546923, "o", " "] +[247.67005, "o", "a"] +[247.773321, "o", " "] +[248.004601, "o", "b"] +[248.078817, "o", "r"] +[248.151286, "o", "e"] +[248.214162, "o", "a"] +[248.299871, "o", "k"] +[248.541708, "o", "p"] +[248.628067, "o", "o"] +[248.811761, "o", "i"] +[248.876511, "o", 
"n"] +[248.971272, "o", "t"] +[249.125851, "o", " "] +[249.392141, "o", "i"] +[249.555648, "o", "n"] +[250.946185, "o", "\b\u001b[K"] +[251.077786, "o", "\b\u001b[K"] +[251.710112, "o", "t"] +[251.812957, "o", "o"] +[252.861908, "o", " "] +[253.182436, "o", "\b\u001b[K"] +[253.293631, "o", "\b\u001b[K"] +[253.398191, "o", "\b\u001b[K"] +[253.766615, "o", "n"] +[254.131587, "o", "\b\u001b[K"] +[254.315781, "o", "i"] +[254.376792, "o", "n"] +[254.433173, "o", " "] +[254.573627, "o", "t"] +[254.631208, "o", "h"] +[254.750211, "o", "e"] +[254.82721, "o", " "] +[255.077806, "o", "c"] +[255.181875, "o", "o"] +[255.237745, "o", "n"] +[255.36349, "o", "t"] +[255.413945, "o", "e"] +[255.601549, "o", "x"] +[255.798133, "o", "t"] +[255.870077, "o", " "] +[256.177252, "o", "s"] +[256.356189, "o", "w"] +[256.420864, "o", "i"] +[256.530677, "o", "t"] +[256.731915, "o", "c"] +[256.77443, "o", "h"] +[256.883801, "o", " "] +[258.571714, "o", "r"] +[258.640714, "o", "o"] +[258.712151, "o", "u"] +[258.829888, "o", "t"] +[258.924193, "o", "i"] +[258.975067, "o", "n"] +[259.069473, "o", "e"] +[259.19488, "o", " "] +[259.366391, "o", "a"] +[259.47005, "o", "n"] +[259.557575, "o", "d"] +[259.654709, "o", " "] +[260.19587, "o", "w"] +[260.29212, "o", "a"] +[260.381822, "o", "i"] +[260.533426, "o", "t"] +[260.606091, "o", " "] +[260.739803, "o", "f"] +[260.798442, "o", "o"] +[261.681096, "o", "r"] +[262.777308, "o", " "] +[262.951854, "o", "a"] +[263.061053, "o", " "] +[263.308236, "o", "k"] +[263.390265, "o", "e"] +[264.173829, "o", "r"] +[264.245576, "o", "n"] +[264.382178, "o", "e \r"] +[264.493978, "o", "l"] +[264.674766, "o", " "] +[265.043389, "o", "t"] +[265.139075, "o", "h"] +[265.206648, "o", "r"] +[265.278568, "o", "e"] +[265.317655, "o", "a"] +[265.498561, "o", "d"] +[265.589679, "o", " "] +[265.76644, "o", "t"] +[265.821599, "o", "o"] +[265.886969, "o", " "] +[266.331796, "o", "b"] +[266.403363, "o", "e"] +[266.508099, "o", " "] +[266.655882, "o", "s"] +[266.917726, "o", "c"] 
+[267.287361, "o", "h"] +[267.381853, "o", "e"] +[267.523815, "o", "d"] +[267.574051, "o", "u"] +[267.763154, "o", "l"] +[267.820226, "o", "e"] +[267.957836, "o", "d"] +[268.446383, "o", "\r\n(gdb) "] +[269.741828, "o", "b"] +[269.808224, "o", "r"] +[269.870671, "o", "e"] +[269.893739, "o", "a"] +[269.998105, "o", "k"] +[270.085623, "o", " "] +[270.571466, "o", "_"] +[270.699335, "o", "_"] +[271.977106, "o", "c"] +[272.129589, "o", "o"] +[272.205444, "o", "n"] +[272.312765, "o", "t"] +[272.366081, "o", "e"] +[272.470791, "o", "\u0007"] +[273.62, "o", "\b\u001b[K"] +[273.74179, "o", "\b\u001b[K"] +[273.885881, "o", "\b\u001b[K"] +[274.018615, "o", "\b\u001b[K"] +[274.143134, "o", "\b\u001b[K"] +[274.263468, "o", "s"] +[274.422194, "o", "i"] +[274.501319, "o", "w"] +[274.719655, "o", "\u0007"] +[275.053997, "o", "\b\u001b[K"] +[275.187562, "o", "\b\u001b[K"] +[275.286581, "o", "w"] +[275.384725, "o", "i"] +[275.506637, "o", "\u0007tch_to"] +[276.392894, "o", "_"] +[276.621962, "o", "a"] +[276.699043, "o", "s"] +[276.905128, "o", "m "] +[277.262339, "o", "\r\n"] +[277.291402, "o", "Breakpoint 1 at \u001b[34m0xc10018e8\u001b[m: file \u001b[32march/x86/entry/entry_32.S\u001b[m, line 765.\r\n(gdb) "] +[280.133879, "o", "c"] +[281.222045, "o", "\b\u001b[K"] +[281.395261, "o", "\u0007"] +[281.857003, "o", "c"] +[282.237722, "o", "\b\u001b[K"] +[282.79563, "o", "l"] +[282.94158, "o", "i"] +[283.034854, "o", "s"] +[283.499366, "o", "\b\u001b[K"] +[283.624926, "o", "\b\u001b[K"] +[283.746958, "o", "\b\u001b[K"] +[284.082629, "o", "c"] +[284.174072, "o", "\r\nContinuing.\r\n"] +[284.176343, "o", "\r\n"] +[284.176419, "o", "Breakpoint 1, \u001b[33m__switch_to_asm\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:765\r\n"] +[284.176491, "o", "765\t\tpushl\t%ebp\r\n(gdb) "] +[284.835414, "o", "l"] +[284.989635, "o", "i"] +[285.046355, "o", "s"] +[285.255024, "o", "t"] +[285.341593, "o", " "] +[287.197351, "o", "7"] +[287.491808, "o", "5"] +[287.755423, "o", "\b\u001b[K"] 
+[287.867358, "o", "6"] +[287.926098, "o", "0"] +[288.134096, "o", "\r\n"] +[288.135789, "o", "755\t/*\r\n756\t * %eax: prev task\r\n757\t * %edx: next task\r\n758\t */\r\n759\t.pushsection .text, \"ax\"\r\n760\tSYM_CODE_START(__switch_to_asm)\r\n761\t\t/*\r\n762\t\t * Save callee-saved registers\r\n763\t\t * This must match the order in struct inactive_task_frame\r\n764\t\t */\r\n"] +[288.135987, "o", "(gdb) "] +[288.699747, "o", "b"] +[288.811578, "o", "r"] +[288.862172, "o", "e"] +[288.942284, "o", "a"] +[289.033603, "o", "k"] +[289.355453, "o", " "] +[290.255841, "o", "\b\u001b[K"] +[290.384911, "o", "\b\u001b[K"] +[290.523826, "o", "\b\u001b[K"] +[290.661962, "o", "\b\u001b[K"] +[290.807839, "o", "\b\u001b[K"] +[290.941606, "o", "\b\u001b[K"] +[291.36621, "o", "\u0007"] +[292.275081, "o", "p"] +[292.451403, "o", "r"] +[292.517991, "o", "i"] +[292.602359, "o", "n"] +[292.667764, "o", "t"] +[292.766382, "o", " "] +[293.12916, "o", "("] +[293.699471, "o", "("] +[293.957439, "o", "t"] +[294.147806, "o", "a"] +[294.293733, "o", "s"] +[294.373217, "o", "k"] +[294.587807, "o", "_"] +[294.715863, "o", "s"] +[294.887282, "o", "t"] +[294.933917, "o", "r"] +[294.997435, "o", "u"] +[295.132333, "o", "c"] +[295.506086, "o", "\b\u001b[K"] +[296.004542, "o", "\b\u001b[K"] +[296.035262, "o", "\b\u001b[K"] +[296.084305, "o", "\b\u001b[K"] +[296.11605, "o", "\b\u001b[K"] +[296.147494, "o", "\b\u001b[K"] +[296.178799, "o", "\b\u001b[K"] +[296.317612, "o", "\b\u001b[K"] +[296.463433, "o", "\b\u001b[K"] +[296.718813, "o", "\b\u001b[K"] +[296.814308, "o", "s"] +[296.96261, "o", "t"] +[297.013641, "o", "r"] +[297.061739, "o", "u"] +[297.219398, "o", "c"] +[297.398522, "o", "t"] +[297.437315, "o", " "] +[297.542878, "o", "t"] +[297.637975, "o", "a"] +[297.747511, "o", "s"] +[297.819434, "o", "k"] +[298.016074, "o", "_"] +[298.117426, "o", "s"] +[298.27901, "o", "t"] +[298.342356, "o", "r"] +[298.422953, "o", "u"] +[298.524196, "o", "c"] +[298.698718, "o", "t"] +[298.989581, "o", "*"] 
+[298.997725, "o", "("] +[299.132205, "o", ")"] +[299.518034, "o", "\b\u001b[K"] +[299.613309, "o", "\b\u001b[K"] +[299.770424, "o", ")"] +[299.910219, "o", ")"] +[301.062221, "o", "\b\u001b[K"] +[301.485963, "o", "$"] +[302.314939, "o", "e"] +[304.099819, "o", "a"] +[304.402235, "o", "x"] +[304.916033, "o", ")"] +[305.422261, "o", "-"] +[305.88182, "o", ">"] +[306.029879, "o", "c"] +[306.133893, "o", "o"] +[306.256156, "o", "m"] +[306.398024, "o", "m"] +[306.597814, "o", "\r\n"] +[306.615485, "o", "$3 = \"swapper/0\\000\\000\\000\\000\\000\\000\"\r\n(gdb) "] +[310.517841, "o", "#"] +[310.726136, "o", " "] +[313.62966, "o", "\b\u001b[K"] +[313.774295, "o", "\b\u001b[K"] +[313.91453, "o", "\u0007"] +[315.397631, "o", "#"] +[315.577077, "o", " "] +[315.878097, "o", "c"] +[315.989862, "o", "u"] +[316.183243, "o", "r"] +[316.309684, "o", "r"] +[316.389297, "o", "e"] +[316.850982, "o", "n"] +[317.125964, "o", " "] +[317.295363, "o", "t"] +[317.403069, "o", "h"] +[317.478569, "o", "r"] +[317.551332, "o", "e"] +[317.594205, "o", "a"] +[317.751491, "o", "d"] +[317.949486, "o", "\b\u001b[K"] +[318.464381, "o", "\b\u001b[K"] +[318.487912, "o", "\b\u001b[K"] +[318.519446, "o", "\b\u001b[K"] +[318.55032, "o", "\b\u001b[K"] +[318.680362, "o", "\b\u001b[K"] +[318.817765, "o", "\b\u001b[K"] +[318.957945, "o", "t"] +[319.089737, "o", " "] +[319.291116, "o", "t"] +[319.389904, "o", "h"] +[319.647007, "o", "r"] +[319.74256, "o", "e"] +[319.78242, "o", "a"] +[319.942132, "o", "d"] +[320.030061, "o", " "] +[320.141699, "o", "i"] +[320.233772, "o", "s"] +[320.349512, "o", " "] +[320.443032, "o", "s"] +[320.80826, "o", "t"] +[320.90173, "o", "i"] +[321.069829, "o", "l"] +[321.204297, "o", "l"] +[321.259689, "o", " "] +[321.588152, "o", "s"] +[321.794884, "o", "w"] +[321.871903, "o", "a"] +[321.981533, "o", "p"] +[322.102066, "o", "p"] +[322.186971, "o", "e"] +[322.261993, "o", "r"] +[322.405556, "o", "\r\n"] +[322.405607, "o", "(gdb) "] +[322.726159, "o", "c"] +[322.901526, "o", 
"\r\nContinuing.\r\n"] +[322.912896, "o", "\r\n"] +[322.913085, "o", "Breakpoint 1, \u001b[33m__switch_to_asm\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:765\r\n765\t\tpushl\t%ebp\r\n(gdb) "] +[324.760917, "o", "c"] +[325.18249, "o", "\b# current thread is still swapper"] +[325.878372, "o", "\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[Cprint ((struct task_struct*)$eax)->comm"] +[327.053947, "o", "\r\n"] +[327.071409, "o", "$4 = \"rcu_sched\\000\\000\\000\\000\\000\\000\"\r\n(gdb) "] +[329.503754, "o", "#"] +[330.910795, "o", "\b\u001b[K"] +[334.523368, "o", "c"] +[335.314474, "o", "\b\u001b[K"] +[335.958257, "o", "b"] +[336.018706, "o", "t"] +[337.398802, "o", "\r\n"] +[337.414221, "o", "#0 \u001b[33m__switch_to_asm\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:765\r\n"] +[337.414915, "o", "#1 \u001b[34m0xc15d8277\u001b[m in \u001b[33mcontext_switch\u001b[m (\u001b[36mrf\u001b[m=0xc2561eb4, \u001b[36mnext\u001b[m=, \u001b[m\r\n"] +[337.414988, "o", " \u001b[m\u001b[36mprev\u001b[m=0xc254c080, \u001b[36mrq\u001b[m=0xcfdcb700) at \u001b[32mkernel/sched/core.c\u001b[m:3779\r\n"] +[337.415254, "o", "#2 \u001b[33m__schedule\u001b[m (\u001b[36mpreempt\u001b[m=, \u001b[36mpreempt@entry\u001b[m=false)\u001b[m\r\n \u001b[m at \u001b[32mkernel/sched/core.c\u001b[m:4528\r\n#3 \u001b[34m0xc15d86ce\u001b[m in \u001b[33mschedule\u001b[m () at \u001b[32mkernel/sched/core.c\u001b[m:4606\r\n"] +[337.442508, "o", "#4 \u001b[34m0xc15dd6a7\u001b[m in \u001b[33mschedule_timeout\u001b[m (\u001b[36mtimeout=timeout@entry\u001b[m=1)\u001b[m\r\n \u001b[m at \u001b[32mkernel/time/timer.c\u001b[m:1871\r\n#5 \u001b[34m0xc10dcfa0\u001b[m in \u001b[33mrcu_gp_fqs_loop\u001b[m () at \u001b[32mkernel/rcu/tree.c\u001b[m:1928\r\n"] +[337.44868, "o", "#6 \u001b[33mrcu_gp_kthread\u001b[m (\u001b[36munused=unused@entry\u001b[m=0x0) at \u001b[32mkernel/rcu/tree.c\u001b[m:2102\r\n"] +[337.449075, "o", "#7 \u001b[34m0xc107c753\u001b[m in \u001b[33mkthread\u001b[m 
(\u001b[36m_create\u001b[m=0xc2408ca0) at \u001b[32mkernel/kthread.c\u001b[m:292\r\n#8 \u001b[34m0xc1001960\u001b[m in \u001b[33mret_from_fork\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:850\r\n"] +[337.449715, "o", "#9 \u001b[34m0x00000000\u001b[m in \u001b[33m??\u001b[m ()\r\n"] +[337.450155, "o", "(gdb) "] +[340.203269, "o", "t"] +[340.851246, "o", "\b\u001b[K"] +[341.14753, "o", "#"] +[341.387109, "o", " "] +[341.451102, "o", " "] +[341.661207, "o", "t"] +[341.978747, "o", "\b\u001b[K"] +[342.107108, "o", "\b\u001b[K"] +[342.349889, "o", "t"] +[342.491659, "o", "h"] +[342.515285, "o", "i"] +[342.699961, "o", "s"] +[342.805615, "o", " "] +[343.002237, "o", "l"] +[343.18705, "o", "o"] +[343.303844, "o", "o"] +[343.390559, "o", "k"] +[343.554804, "o", "s"] +[343.653215, "o", " "] +[343.835557, "o", "l"] +[343.978652, "o", "i"] +[344.125443, "o", "k"] +[344.223034, "o", "e"] +[344.312753, "o", " "] +[344.440075, "o", "a"] +[344.512933, "o", " "] +[344.889199, "o", "k"] +[345.025482, "o", "e"] +[345.045973, "o", "r"] +[345.140636, "o", "n"] +[345.237924, "o", "e"] +[345.325469, "o", "l"] +[345.422476, "o", " "] +[345.621936, "o", "t"] +[345.741317, "o", "h"] +[345.826829, "o", "r"] +[345.901016, "o", "e"] +[345.937951, "o", "a"] +[346.083099, "o", "d"] +[348.901657, "o", "\r\n(gdb) "] +[349.394054, "o", "#"] +[349.654109, "o", " "] +[350.307681, "o", "n"] +[350.395153, "o", "o"] +[350.899543, "o", "t"] +[351.003489, "o", "i"] +[351.17943, "o", "c"] +[351.246088, "o", "e"] +[351.469435, "o", " "] +[352.502347, "o", "t"] +[352.66686, "o", "h"] +[352.760953, "o", "a"] +[352.894464, "o", "t"] +[353.085035, "o", " "] +[355.643772, "o", "a"] +[356.059323, "o", " "] +[356.227601, "o", "k"] +[356.343137, "o", "e"] +[356.405474, "o", "r"] +[356.477181, "o", "n"] +[356.57342, "o", "e"] +[356.659084, "o", "l"] +[356.761894, "o", " "] +[356.941633, "o", "t"] +[357.045635, "o", "h"] +[357.112728, "o", "r"] +[357.179646, "o", "e"] +[357.241188, "o", "a"] 
+[357.707008, "o", "d"] +[357.885383, "o", " "] +[358.117478, "o", "s"] +[358.283371, "o", "t"] +[358.343069, "o", "a"] +[358.452513, "o", "r"] +[359.795314, "o", "t"] +[360.037755, "o", "s"] +[360.458901, "o", " "] +[366.070028, "o", "f"] +[366.133469, "o", "r"] +[366.261029, "o", "o"] +[366.381485, "o", "m"] +[366.530389, "o", " "] +[366.597727, "o", "a"] +[366.725499, "o", " "] +[366.850841, "o", "s"] +[366.979612, "o", "p"] +[367.024724, "o", "e"] +[367.237378, "o", "c"] +[367.357012, "o", "i"] +[367.437107, "o", "a"] +[367.618604, "o", "l"] +[367.985673, "o", " "] +[368.269841, "o", "f"] +[368.365828, "o", "o"] +[368.554828, "o", "r"] +[368.650754, "o", "k"] +[371.995165, "o", " "] +[372.946533, "o", "\b\u001b[K"] +[375.702282, "o", "\r\n"] +[375.702404, "o", "(gdb) "] +[398.867113, "o", "#"] +[399.146887, "o", " "] +[399.331616, "o", "a"] +[399.597434, "o", "l"] +[399.730647, "o", "s"] +[399.805839, "o", "o"] +[399.909163, "o", " "] +[400.11739, "o", "n"] +[400.165148, "o", "o"] +[400.285605, "o", "t"] +[400.365312, "o", "i"] +[400.485439, "o", "c"] +[400.579979, "o", "e"] +[400.65356, "o", " "] +[400.806284, "o", "a"] +[401.194396, "o", "t"] +[401.781734, "o", "\b\u001b[K"] +[401.914341, "o", "\b\u001b[K"] +[402.030244, "o", "t"] +[402.145374, "o", "h"] +[402.245147, "o", "a"] +[402.309402, "o", "t"] +[402.382045, "o", " "] +[402.525574, "o", "a"] +[402.629039, "o", "l"] +[402.754007, "o", "l"] +[402.821383, "o", " "] +[402.990226, "o", "k"] +[403.06459, "o", "e"] +[403.117803, "o", "r"] +[403.205019, "o", "n"] +[403.318207, "o", "e"] +[403.341663, "o", "l"] +[403.429692, "o", " "] +[403.629956, "o", "t"] +[403.741843, "o", "h"] +[403.808117, "o", "r"] +[403.88467, "o", "e"] +[403.95492, "o", "a"] +[404.141474, "o", "d"] +[404.205018, "o", "s"] +[404.326094, "o", " "] +[404.539117, "o", "u"] +[404.589606, "o", "s"] +[404.733589, "o", "e"] +[404.869205, "o", " "] +[407.242269, "o", "t"] +[407.32576, "o", "h"] +[407.4612, "o", "e"] +[407.541613, "o", " "] 
+[407.762738, "o", "k"] +[409.130554, "o", "t"] +[409.621372, "o", "h"] +[409.695796, "o", "r"] +[409.741875, "o", "e"] +[409.789943, "o", "a"] +[409.939027, "o", "d"] +[410.021602, "o", " "] +[410.218404, "o", "f"] +[410.323032, "o", "u"] +[410.371459, "o", "n"] +[410.474746, "o", "c"] +[410.688493, "o", "t"] +[410.7416, "o", "i"] +[410.779033, "o", "o"] +[411.003284, "o", "n"] +[411.093928, "o", " "] +[412.117382, "o", "t"] +[412.189337, "o", "o"] +[412.341309, "o", " "] +[412.506227, "o", "n"] +[412.589441, "o", "i"] +[412.850878, "o", "t"] +[412.986206, "o", "i"] +[413.427046, "o", "\b\u001b[K"] +[413.554719, "o", "\b\u001b[K"] +[413.677038, "o", "\b\u001b[K"] +[413.835017, "o", "\b\u001b[K"] +[414.005774, "o", "i"] +[414.484917, "o", "n"] +[414.540995, "o", "i"] +[414.693336, "o", "t"] +[414.7614, "o", "i"] +[414.863364, "o", "a"] +[414.998825, "o", "l"] +[415.062427, "o", "i \r"] +[415.166791, "o", "z"] +[415.3332, "o", "e"] +[415.478198, "o", "\r\n(gdb) "] +[442.445477, "o", "quit\r\n"] +[442.445561, "o", "A debugging session is active.\r\n\r\n\tInferior 1 [process 1] will be detached.\r\n\r\nQuit anyway? 
(y or n) "] +[442.970982, "o", "y"] +[443.214441, "o", "\r\n"] +[443.214527, "o", "Detaching from program: /linux/vmlinux, process 1\r\n"] +[443.215393, "o", "Ending remote debugging.\r\n"] +[443.215436, "o", "[Inferior 1 (process 1) detached]\r\n"] +[443.220945, "o", "$ "] +[444.348773, "o", "\r\n"] diff --git a/refs/pull/405/merge/_images/ksoftirqd-packet-flood.cast b/refs/pull/405/merge/_images/ksoftirqd-packet-flood.cast new file mode 100644 index 00000000..6f239238 --- /dev/null +++ b/refs/pull/405/merge/_images/ksoftirqd-packet-flood.cast @@ -0,0 +1,462 @@ +{"version": 2, "width": 80, "height": 24, "timestamp": 1616354478, "idle_time_limit": 1.0, "env": {"SHELL": null, "TERM": "xterm"}} +[0.002236, "o", "$ "] +[1.852599, "o", "m"] +[2.132922, "o", "i"] +[2.332344, "o", "n"] +[2.388604, "o", "i"] +[2.612998, "o", "c"] +[2.684655, "o", "o"] +[2.747904, "o", "m"] +[2.877702, "o", " "] +[3.052747, "o", "-"] +[3.317509, "o", "D"] +[3.468219, "o", " "] +[3.652408, "o", "s"] +[3.741658, "o", "e"] +[3.820052, "o", "r"] +[3.884557, "o", "i"] +[4.017426, "o", "a"] +[4.097078, "o", "l"] +[4.294502, "o", "."] +[4.483502, "o", "p"] +[4.604408, "o", "t"] +[4.836403, "o", "s"] +[5.173772, "o", "\r\n"] +[5.174525, "o", "\u001b[!p\u001b[?3;4l\u001b[4l\u001b>\u001b[0m\u001b(B"] +[5.174615, "o", "\u001b[?1h\u001b=\u001b[H\u001b[2J"] +[5.174986, "o", "\u001b[?12l\u001b[?25h\nWelcome to minicom 2.7.1\r\n\nOPTIONS: I18n \r\nCompiled on Dec 23 2019, 02:06:26.\r\nPort serial.pts, 19:21:04\r\n\nPress CTRL-A Z for help on special keys\r\n\n"] +[6.310469, "o", "\n"] +[6.311875, "o", "root@qemux86:~# "] +[7.981099, "o", "t"] +[8.029191, "o", "o"] +[8.086076, "o", "p"] +[8.245555, "o", "\r\n"] +[8.399818, "o", "\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[KMem: 34424K used, 
206256K free, 3836K shrd, 380K buff, 8008K cached"] +[8.399997, "o", "\r\n"] +[8.400309, "o", "CPU: 0% usr 8% sys 0% nic 91% idle 0% io 0% irq 0% sirq"] +[8.400449, "o", "\r\n"] +[8.40103, "o", "Load average: 0.10 0.30 0.18 1/37 5529"] +[8.401214, "o", "\r\n"] +[8.41045, "o", "\u001b[0m\u001b(B\u001b[7m PID PPID USER STAT VSZ %VSZ %CPU COMMAND"] +[8.41063, "o", "\r\n"] +[8.411801, "o", "\u001b[0m\u001b(B 5529 5474 root R 2828 1% 6% top"] +[8.411997, "o", "\r\n"] +[8.412704, "o", " 10 2 root IW 0 0% 3% [rcu_sched]"] +[8.412931, "o", "\r\n"] +[8.413466, "o", " 5474 1 root S 2972 1% 0% -sh"] +[8.41367, "o", "\r\n"] +[8.414183, "o", " 198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages"] +[8.414369, "o", "\r\n"] +[8.41567, "o", " 187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p\u001b[9;80H"] +[8.415869, "o", "\r\n"] +[8.416317, "o", " 201 1 root S 2828 1% 0% /sbin/klogd -n"] +[8.416484, "o", "\r\n"] +[8.41698, "o", " 207 1 root S 2828 1% 0% /sbin/getty 38400 tty1"] +[8.417134, "o", "\r\n"] +[8.417626, "o", " 209 1 root S 2828 1% 0% /sbin/getty 38400 tty2"] +[8.417853, "o", "\r\n"] +[8.418241, "o", " 210 1 root S 2828 1% 0% /sbin/getty 38400 tty3"] +[8.418372, "o", "\r\n"] +[8.419445, "o", " 211 1 root S 2828 1% 0% /sbin/getty 38400 tty4"] +[8.419591, "o", "\r\n"] +[8.420058, "o", " 212 1 root S 2828 1% 0% /sbin/getty 38400 tty5"] +[8.420184, "o", "\r\n"] +[8.42079, "o", " 1 0 root S 2004 1% 0% init [5]"] +[8.420935, "o", "\r\n"] +[8.421371, "o", " 9 2 root SW 0 0% 0% [ksoftirqd/0]"] +[8.421516, "o", "\r\n"] +[8.421919, "o", " 42 2 root SWN 0 0% 0% [kmemleak]"] +[8.422058, "o", "\r\n"] +[8.422446, "o", " 39 2 root IW 0 0% 0% [kworker/0:2-mm_]"] +[8.422821, "o", "\r\n"] +[8.423488, "o", " 13 2 root SW 0 0% 0% [kdevtmpfs]"] +[8.423618, "o", "\r\n"] +[8.424021, "o", " 7 2 root IW 0 0% 0% [kworker/u2:0-ev]"] +[8.424538, "o", "\r\n"] +[8.424606, "o", " 38 2 root IW 0 0% 0% [kworker/u2:1-ev]"] +[8.4247, "o", "\r\n"] +[8.425134, "o", " 34 2 root IW< 0 
0% 0% [kworker/0:1H-kb]"] +[8.425281, "o", "\r\n"] +[8.425506, "o", " 43 2 root SW 0 0% 0% [jbd2/vda-8]\r"] +[13.447981, "o", "\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[KMem: 34424K used, 206256K free, 3836K shrd, 380K buff, 8008K cached"] +[13.448168, "o", "\r\n"] +[13.44854, "o", "CPU: 0% usr 0% sys 0% nic 99% idle 0% io 0% irq 0% sirq"] +[13.448659, "o", "\r\n"] +[13.449284, "o", "Load average: 0.09 0.30 0.18 1/37 5529"] +[13.449591, "o", "\r\n"] +[13.451544, "o", "\u001b[0m\u001b(B\u001b[7m PID PPID USER STAT VSZ %VSZ %CPU COMMAND"] +[13.451812, "o", "\r\n"] +[13.452307, "o", "\u001b[0m\u001b(B 5529 5474 root R 2972 1% 1% top"] +[13.452473, "o", "\r\n"] +[13.45301, "o", " 5474 1 root S 2972 1% 0% -sh"] +[13.453275, "o", "\r\n"] +[13.453876, "o", " 198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages"] +[13.454093, "o", "\r\n"] +[13.454812, "o", " 187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p\u001b[8;80H"] +[13.454925, "o", "\r\n"] +[13.455729, "o", " 201 1 root S 2828 1% 0% /sbin/klogd -n"] +[13.455941, "o", "\r\n"] +[13.456477, "o", " 207 1 root S 2828 1% 0% /sbin/getty 38400 tty1"] +[13.456654, "o", "\r\n"] +[13.457185, "o", " 209 1 root S 2828 1% 0% /sbin/getty 38400 tty2"] +[13.457362, "o", "\r\n"] +[13.457758, "o", " 210 1 root S 2828 1% 0% /sbin/getty 38400 tty3"] +[13.457938, "o", "\r\n"] +[13.458413, "o", " 211 1 root S 2828 1% 0% /sbin/getty 38400 tty4"] +[13.458715, "o", "\r\n"] +[13.459436, "o", " 212 1 root S 2828 1% 0% /sbin/getty 38400 tty5"] +[13.459594, "o", "\r\n"] +[13.460002, "o", " 1 0 root S 2004 1% 0% init [5]"] +[13.46014, "o", "\r\n"] +[13.46068, "o", " 10 2 root IW 0 0% 0% [rcu_sched]"] +[13.460878, "o", "\r\n"] +[13.461278, "o", " 9 2 root SW 0 0% 0% [ksoftirqd/0]"] +[13.4614, "o", "\r\n"] 
+[13.461837, "o", " 42 2 root SWN 0 0% 0% [kmemleak]"] +[13.46196, "o", "\r\n"] +[13.462358, "o", " 39 2 root IW 0 0% 0% [kworker/0:2-eve]"] +[13.462478, "o", "\r\n"] +[13.463428, "o", " 13 2 root SW 0 0% 0% [kdevtmpfs]"] +[13.46356, "o", "\r\n"] +[13.463973, "o", " 7 2 root IW 0 0% 0% [kworker/u2:0-ev]"] +[13.464125, "o", "\r\n"] +[13.464593, "o", " 38 2 root IW 0 0% 0% [kworker/u2:1-ev]"] +[13.464709, "o", "\r\n"] +[13.465189, "o", " 34 2 root IW< 0 0% 0% [kworker/0:1H-kb]"] +[13.465323, "o", "\r\n"] +[13.465519, "o", " 43 2 root SW 0 0% 0% [jbd2/vda-8]\r"] +[18.487741, "o", "\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[KMem: 34424K used, 206256K free, 3836K shrd, 380K buff, 8008K cached"] +[18.488024, "o", "\r\n"] +[18.488389, "o", "CPU: 0% usr 0% sys 0% nic 99% idle 0% io 0% irq 0% sirq"] +[18.488672, "o", "\r\n"] +[18.489375, "o", "Load average: 0.08 0.29 0.18 1/37 5529"] +[18.489628, "o", "\r\n"] +[18.491955, "o", "\u001b[0m\u001b(B\u001b[7m PID PPID USER STAT VSZ %VSZ %CPU COMMAND"] +[18.492108, "o", "\r\n"] +[18.492801, "o", "\u001b[0m\u001b(B 5529 5474 root R 2972 1% 1% top"] +[18.493112, "o", "\r\n"] +[18.493653, "o", " 5474 1 root S 2972 1% 0% -sh"] +[18.493772, "o", "\r\n"] +[18.494187, "o", " 198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages"] +[18.494394, "o", "\r\n"] +[18.495284, "o", " 187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p\u001b[8;80H"] +[18.495442, "o", "\r\n"] +[18.495842, "o", " 201 1 root S 2828 1% 0% /sbin/klogd -n"] +[18.49601, "o", "\r\n"] +[18.496439, "o", " 207 1 root S 2828 1% 0% /sbin/getty 38400 tty1"] +[18.49661, "o", "\r\n"] +[18.497041, "o", " 209 1 root S 2828 1% 0% /sbin/getty 38400 tty2"] +[18.497175, "o", "\r\n"] +[18.497594, "o", " 210 1 root S 2828 1% 0% /sbin/getty 38400 tty3"] 
+[18.497754, "o", "\r\n"] +[18.498242, "o", " 211 1 root S 2828 1% 0% /sbin/getty 38400 tty4"] +[18.498393, "o", "\r\n"] +[18.499275, "o", " 212 1 root S 2828 1% 0% /sbin/getty 38400 tty5"] +[18.499412, "o", "\r\n"] +[18.499785, "o", " 1 0 root S 2004 1% 0% init [5]"] +[18.499938, "o", "\r\n"] +[18.500316, "o", " 10 2 root IW 0 0% 0% [rcu_sched]"] +[18.50045, "o", "\r\n"] +[18.500827, "o", " 9 2 root SW 0 0% 0% [ksoftirqd/0]"] +[18.500944, "o", "\r\n"] +[18.501445, "o", " 42 2 root SWN 0 0% 0% [kmemleak]"] +[18.50159, "o", "\r\n"] +[18.502138, "o", " 39 2 root IW 0 0% 0% [kworker/0:2-eve]"] +[18.502298, "o", "\r\n"] +[18.50317, "o", " 13 2 root SW 0 0% 0% [kdevtmpfs]"] +[18.503323, "o", "\r\n"] +[18.50372, "o", " 7 2 root IW 0 0% 0% [kworker/u2:0-ev]"] +[18.50386, "o", "\r\n"] +[18.504334, "o", " 38 2 root IW 0 0% 0% [kworker/u2:1-ev]"] +[18.504402, "o", "\r\n"] +[18.504755, "o", " 34 2 root IW< 0 0% 0% [kworker/0:1H-kb]"] +[18.504906, "o", "\r\n"] +[18.505094, "o", " 43 2 root SW 0 0% 0% [jbd2/vda-8]\r"] +[19.166233, "o", "\n\u001b[23;80H \u001b[24;1H"] +[19.169564, "o", "root@qemux86:~# "] +[23.887009, "o", "i"] +[24.003278, "o", "f"] +[25.38907, "o", "\b\u001b[K"] +[25.612968, "o", "p"] +[25.845698, "o", " "] +[26.036611, "o", "l"] +[26.229837, "o", "i"] +[26.685128, "o", "\b\u001b[K"] +[26.805031, "o", "\b\u001b[K"] +[26.89406, "o", "a"] +[26.974792, "o", "d"] +[27.125316, "o", "d"] +[27.30899, "o", "r"] +[27.389767, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.405865, "o", "1: lo: mtu 65536 qdisc noqueue qlen 1000"] +[27.406054, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.406659, "o", " link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00"] +[27.406961, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.408609, "o", " inet 127.0.0.1/8 scope host lo"] +[27.408902, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.409235, "o", " valid_lft forever preferred_lft forever"] +[27.409376, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.40968, "o", " inet6 ::1/128 scope host "] 
+[27.409834, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.410081, "o", " valid_lft forever preferred_lft forever"] +[27.41027, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.411406, "o", "2: eth0: mtu 1500 qdisc pfifo_fast qlen 1000"] +[27.41166, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.41196, "o", " link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff\r\n\u001b[23;80H \u001b[24;1H"] +[27.412383, "o", " inet 172.213.0.18/24 brd 172.213.0.255 scope global eth0\r\n\u001b[23;80H \u001b[24;1H"] +[27.412724, "o", " valid_lft forever preferred_lft forever"] +[27.412907, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.413445, "o", "3: sit0@NONE: mtu 1480 qdisc noop qlen 1000"] +[27.413562, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.413865, "o", " link/sit 0.0.0.0 brd 0.0.0.0"] +[27.414033, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[27.416761, "o", "root@qemux86:~# "] +[38.933061, "o", "\u001b[0m\u001b(B\u001b[7m\r\u001b[K\u001b[?12l\u001b[?25h\u001b[?25lCTRL-A Z for help | 115200 8N1 | NOR | Minicom 2.7.1 | VT102 | Offline | al.pts\u001b[?12l\u001b[?25h"] +[38.933124, "o", "\u001b[24;17H"] +[40.700594, "o", "\u001b[8;30H\u001b[?25l\u001b[0m\u001b(B\u001b(0lqqqqqqqqqqqqqqqqqqqqqqk\u001b[9;30Hx\u001b[0m\u001b(B Leave Minicom? 
\u001b[0m\u001b(B\u001b(0x\u001b[10;30Hx\u001b[0m\u001b(B No \u001b[0m\u001b(B\u001b(0x\u001b[11;30Hmqqqqqqqqqqqqqqqqqqqqqqj\u001b[10;51H\u001b[?25l\u001b[10;33H\u001b[0m\u001b(B\u001b[7m Yes "] +[42.276438, "o", "\u001b[?12l\u001b[?25h\u001b[8;1H\u001b[0m\u001b(B 38 2 root IW 0 0% 0% [kworker/u2\u001b[9;1H 34 2 root IW< 0 0% 0% [kworker/0:\u001b[10;1H 43 2 root SW 0 0% 0% [jbd2/vda-8\u001b[11;1Hroot@qemux86:~# ip addr \u001b[24;17H\u001b[0m\u001b(B\u001b[7m\u001b[?12l\u001b[?25h"] +[42.276595, "o", "\u001b[?12l\u001b[?25h\u001b[0m\u001b(B\u001b[H\u001b[2J\u001b[?12l\u001b[?25h\u001b[?1l\u001b>\u001b[!p\u001b[?3;4l\u001b[4l\u001b>"] +[42.276685, "o", "$ "] +[73.938043, "o", "n"] +[74.028267, "o", "o"] +[74.219846, "o", "h"] +[74.436234, "o", "u"] +[74.53207, "o", "p"] +[74.781125, "o", " "] +[75.188776, "o", "s"] +[75.276111, "o", "u"] +[75.916849, "o", "d"] +[75.992557, "o", "o"] +[76.120543, "o", " "] +[76.4366, "o", "p"] +[76.56442, "o", "i"] +[76.748589, "o", "n"] +[76.852263, "o", "g"] +[76.916772, "o", " "] +[77.073216, "o", "-"] +[77.184444, "o", "f"] +[77.284511, "o", " "] +[79.70078, "o", "172.213.0.18"] +[80.644466, "o", " "] +[81.313662, "o", "&"] +[81.95608, "o", "\r\n"] +[81.956239, "o", "$ "] +[81.956532, "o", "nohup: ignoring input and appending output to 'nohup.out'\r\n"] +[83.107734, "o", "\r\n$ "] +[84.435832, "o", "m"] +[84.532479, "o", "i"] +[84.643806, "o", "n"] +[84.820208, "o", "c"] +[84.891586, "o", "o"] +[84.971776, "o", "m"] +[92.307554, "o", " "] +[92.403882, "o", "-"] +[92.675839, "o", "D"] +[92.787506, "o", " "] +[93.563841, "o", "s"] +[93.652089, "o", "e"] +[93.715884, "o", "r"] +[93.763667, "o", "i"] +[93.875947, "o", "a"] +[93.93975, "o", "l"] +[94.116003, "o", "."] +[94.364381, "o", "p"] +[94.484616, "o", "t"] +[94.700067, "o", "s"] +[94.995562, "o", "\r\n"] +[94.996318, "o", "\u001b[!p\u001b[?3;4l\u001b[4l\u001b>\u001b[0m\u001b(B"] +[94.996448, "o", "\u001b[?1h\u001b=\u001b[H\u001b[2J"] +[94.996773, "o", 
"\u001b[?12l\u001b[?25h\nWelcome to minicom 2.7.1\r\n\nOPTIONS: I18n \r\nCompiled on Dec 23 2019, 02:06:26.\r\nPort serial.pts, 19:21:44\r\n\nPress CTRL-A Z for help on special keys\r\n\n"] +[95.525135, "o", "\n"] +[95.526362, "o", "root@qemux86:~# "] +[96.060597, "o", "t"] +[96.108363, "o", "o"] +[96.159286, "o", "p"] +[96.338736, "o", "\r\n"] +[96.58656, "o", "\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[KMem: 34432K used, 206248K free, 3836K shrd, 384K buff, 8008K cached"] +[96.58739, "o", "\r\n"] +[96.590639, "o", "CPU: 0% usr 28% sys 0% nic 7% idle 0% io 0% irq 64% sirq"] +[96.591015, "o", "\r\n"] +[96.595524, "o", "Load average: 0.32 0.29 0.18 2/37 5531"] +[96.595722, "o", "\r\n"] +[96.608711, "o", "\u001b[0m\u001b(B\u001b[7m PID PPID USER STAT VSZ %VSZ %CPU COMMAND"] +[96.608982, "o", "\r\n"] +[96.609479, "o", "\u001b[0m\u001b(B 9 2 root RW 0 0% 50% [ksoftirqd/0]"] +[96.609654, "o", "\r\n"] +[96.610278, "o", " 5531 5474 root R 2828 1% 14% top"] +[96.610432, "o", "\r\n"] +[96.621162, "o", " 10 2 root IW 0 0% 14% [rcu_sched]"] +[96.62135, "o", "\r\n"] +[96.621869, "o", " 5474 1 root S 2972 1% 0% -sh"] +[96.621996, "o", "\r\n"] +[96.622532, "o", " 198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages"] +[96.622802, "o", "\r\n"] +[96.623253, "o", " 187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p\u001b[10;80H"] +[96.623426, "o", "\r\n"] +[96.623854, "o", " 201 1 root S 2828 1% 0% /sbin/klogd -n"] +[96.62401, "o", "\r\n"] +[96.624429, "o", " 207 1 root S 2828 1% 0% /sbin/getty 38400 tty1"] +[96.624607, "o", "\r\n"] +[96.625129, "o", " 209 1 root S 2828 1% 0% /sbin/getty 38400 tty2"] +[96.625226, "o", "\r\n"] +[96.62562, "o", " 210 1 root S 2828 1% 0% /sbin/getty 38400 tty3"] +[96.625825, "o", "\r\n"] +[96.626246, "o", " 211 1 root 
S 2828 1% 0% /sbin/getty 38400 tty4"] +[96.626462, "o", "\r\n"] +[96.635421, "o", " 212 1 root S 2828 1% 0% /sbin/getty 38400 tty5"] +[96.635574, "o", "\r\n"] +[96.635889, "o", " 1 0 root S 2004 1% 0% init [5]"] +[96.636113, "o", "\r\n"] +[96.636631, "o", " 42 2 root SWN 0 0% 0% [kmemleak]"] +[96.636739, "o", "\r\n"] +[96.637233, "o", " 39 2 root IW 0 0% 0% [kworker/0:2-mm_]"] +[96.637352, "o", "\r\n"] +[96.637676, "o", " 7 2 root IW 0 0% 0% [kworker/u2:0-ev]"] +[96.637783, "o", "\r\n"] +[96.638119, "o", " 13 2 root SW 0 0% 0% [kdevtmpfs]"] +[96.638283, "o", "\r\n"] +[96.643324, "o", " 38 2 root IW 0 0% 0% [kworker/u2:1-ev]"] +[96.643513, "o", "\r\n"] +[96.644008, "o", " 43 2 root SW 0 0% 0% [jbd2/vda-8]"] +[96.644118, "o", "\r\n"] +[96.644439, "o", " 34 2 root IW< 0 0% 0% [kworker/0:1H-kb]\r"] +[101.701434, "o", "\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[KMem: 34520K used, 206160K free, 3840K shrd, 388K buff, 8008K cached"] +[101.701499, "o", "\r\n"] +[101.701748, "o", "CPU: 0% usr 11% sys 0% nic 17% idle 0% io 0% irq 70% sirq"] +[101.701978, "o", "\r\n"] +[101.702444, "o", "Load average: 0.29 0.29 0.18 2/37 5541"] +[101.702673, "o", "\r\n"] +[101.715916, "o", "\u001b[0m\u001b(B\u001b[7m PID PPID USER STAT VSZ %VSZ %CPU COMMAND"] +[101.71616, "o", "\r\n"] +[101.716771, "o", "\u001b[0m\u001b(B 9 2 root SW 0 0% 43% [ksoftirqd/0]"] +[101.71689, "o", "\r\n"] +[101.717274, "o", " 10 2 root IW 0 0% 17% [rcu_sched]"] +[101.717579, "o", "\r\n"] +[101.717921, "o", " 5531 5474 root R 2972 1% 1% top"] +[101.718166, "o", "\r\n"] +[101.718688, "o", " 1 0 root S 2004 1% 1% init [5]"] +[101.718868, "o", "\r\n"] +[101.726882, "o", " 5474 1 root S 2972 1% 0% -sh"] +[101.727162, "o", "\r\n"] +[101.738059, "o", " 198 1 root S 2828 1% 0% /sbin/syslogd -n -O 
/var/log/messages"] +[101.738201, "o", "\r\n"] +[101.738825, "o", " 187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p\u001b[11;80H"] +[101.739146, "o", "\r\n"] +[101.739547, "o", " 201 1 root S 2828 1% 0% /sbin/klogd -n"] +[101.739785, "o", "\r\n"] +[101.74017, "o", " 207 1 root S 2828 1% 0% /sbin/getty 38400 tty1"] +[101.740448, "o", "\r\n"] +[101.740861, "o", " 209 1 root S 2828 1% 0% /sbin/getty 38400 tty2"] +[101.745102, "o", "\r\n"] +[101.761351, "o", " 210 1 root S 2828 1% 0% /sbin/getty 38400 tty3"] +[101.761869, "o", "\r\n"] +[101.769943, "o", " 211 1 root S 2828 1% 0% /sbin/getty 38400 tty4"] +[101.770407, "o", "\r\n"] +[101.777249, "o", " 212 1 root S 2828 1% 0% /sbin/getty 38400 tty5"] +[101.777746, "o", "\r\n"] +[101.782585, "o", " 42 2 root SWN 0 0% 0% [kmemleak]"] +[101.78294, "o", "\r\n"] +[101.789941, "o", " 39 2 root IW 0 0% 0% [kworker/0:2-eve]"] +[101.790104, "o", "\r\n"] +[101.790657, "o", " 7 2 root IW 0 0% 0% [kworker/u2:0-fl]"] +[101.790924, "o", "\r\n"] +[101.791318, "o", " 13 2 root SW 0 0% 0% [kdevtmpfs]"] +[101.791471, "o", "\r\n"] +[101.791821, "o", " 38 2 root IW 0 0% 0% [kworker/u2:1-ev]"] +[101.792047, "o", "\r\n"] +[101.792515, "o", " 43 2 root SW 0 0% 0% [jbd2/vda-8]"] +[101.792624, "o", "\r\n"] +[101.792795, "o", " 34 2 root IW< 0 0% 0% [kworker/0:1H-kb]\r"] +[106.850435, "o", "\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[KMem: 34512K used, 206168K free, 3840K shrd, 388K buff, 8012K cached"] +[106.850593, "o", "\r\n"] +[106.850752, "o", "CPU: 0% usr 7% sys 0% nic 17% idle 0% io 0% irq 74% sirq"] +[106.850895, "o", "\r\n"] +[106.851563, "o", "Load average: 0.35 0.30 0.19 2/37 5541"] +[106.851692, "o", "\r\n"] +[106.852789, "o", "\u001b[0m\u001b(B\u001b[7m PID PPID USER STAT VSZ %VSZ %CPU COMMAND"] 
+[106.852952, "o", "\r\n"] +[106.8535, "o", "\u001b[0m\u001b(B 9 2 root RW 0 0% 43% [ksoftirqd/0]"] +[106.853632, "o", "\r\n"] +[106.854315, "o", " 10 2 root IW 0 0% 17% [rcu_sched]"] +[106.854348, "o", "\r\n"] +[106.865859, "o", " 5531 5474 root R 2972 1% 1% top"] +[106.866331, "o", "\r\n"] +[106.870104, "o", " 5474 1 root S 2972 1% 0% -sh"] +[106.870466, "o", "\r\n"] +[106.879321, "o", " 198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages"] +[106.879486, "o", "\r\n"] +[106.880046, "o", " 187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p\u001b[10;80H"] +[106.880262, "o", "\r\n"] +[106.880647, "o", " 201 1 root S 2828 1% 0% /sbin/klogd -n"] +[106.880836, "o", "\r\n"] +[106.881316, "o", " 207 1 root S 2828 1% 0% /sbin/getty 38400 tty1"] +[106.881433, "o", "\r\n"] +[106.881896, "o", " 209 1 root S 2828 1% 0% /sbin/getty 38400 tty2"] +[106.882036, "o", "\r\n"] +[106.882496, "o", " 210 1 root S 2828 1% 0% /sbin/getty 38400 tty3"] +[106.882685, "o", "\r\n"] +[106.892921, "o", " 211 1 root S 2828 1% 0% /sbin/getty 38400 tty4"] +[106.893509, "o", "\r\n"] +[106.90363, "o", " 212 1 root S 2828 1% 0% /sbin/getty 38400 tty5"] +[106.903807, "o", "\r\n"] +[106.904182, "o", " 1 0 root S 2004 1% 0% init [5]"] +[106.904433, "o", "\r\n"] +[106.904902, "o", " 42 2 root SWN 0 0% 0% [kmemleak]"] +[106.905429, "o", "\r\n"] +[106.905782, "o", " 39 2 root IW 0 0% 0% [kworker/0:2-eve]"] +[106.906003, "o", "\r\n"] +[106.906469, "o", " 7 2 root IW 0 0% 0% [kworker/u2:0-fl]"] +[106.906625, "o", "\r\n"] +[106.909123, "o", " 13 2 root SW 0 0% 0% [kdevtmpfs]"] +[106.909318, "o", "\r\n"] +[106.909667, "o", " 38 2 root IW 0 0% 0% [kworker/u2:1-ev]"] +[106.909813, "o", "\r\n"] +[106.91024, "o", " 43 2 root SW 0 0% 0% [jbd2/vda-8]"] +[106.910514, "o", "\r\n"] +[106.91072, "o", " 34 2 root IW< 0 0% 0% [kworker/0:1H-kb]\r"] +[111.956876, "o", 
"\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[KMem: 34512K used, 206168K free, 3848K shrd, 388K buff, 8020K cached\r\n"] +[111.9573, "o", "CPU: 0% usr 12% sys 0% nic 15% idle 0% io 0% irq 71% sirq"] +[111.957432, "o", "\r\n"] +[111.957933, "o", "Load average: 0.32 0.29 0.19 2/37 5551"] +[111.958189, "o", "\r\n"] +[111.97146, "o", "\u001b[0m\u001b(B\u001b[7m PID PPID USER STAT VSZ %VSZ %CPU COMMAND"] +[111.971723, "o", "\r\n"] +[111.972186, "o", "\u001b[0m\u001b(B 9 2 root RW 0 0% 44% [ksoftirqd/0]"] +[111.972254, "o", "\r\n"] +[111.972813, "o", " 10 2 root IW 0 0% 17% [rcu_sched]"] +[111.972939, "o", "\r\n"] +[111.973495, "o", " 1 0 root S 2004 1% 2% init [5]"] +[111.973663, "o", "\r\n"] +[111.974138, "o", " 5531 5474 root R 2972 1% 1% top"] +[111.974365, "o", "\r\n"] +[111.980464, "o", " 198 1 root S 2828 1% 0% /sbin/syslogd -n -O /var/log/messages"] +[111.980996, "o", "\r\n"] +[111.994119, "o", " 5474 1 root S 2972 1% 0% -sh"] +[111.994451, "o", "\r\n"] +[112.002489, "o", " 187 1 root S 2828 1% 0% udhcpc -R -b -p /var/run/udhcpc.eth0.p\u001b[11;80H"] +[112.003179, "o", "\r\n"] +[112.017546, "o", " 201 1 root S 2828 1% 0% /sbin/klogd -n"] +[112.018026, "o", "\r\n"] +[112.026893, "o", " 207 1 root S 2828 1% 0% /sbin/getty 38400 tty1"] +[112.027036, "o", "\r\n"] +[112.027601, "o", " 209 1 root S 2828 1% 0% /sbin/getty 38400 tty2"] +[112.027845, "o", "\r\n"] +[112.028265, "o", " 210 1 root S 2828 1% 0% /sbin/getty 38400 tty3"] +[112.028403, "o", "\r\n"] +[112.028856, "o", " 211 1 root S 2828 1% 0% /sbin/getty 38400 tty4"] +[112.028978, "o", "\r\n"] +[112.029317, "o", " 212 1 root S 2828 1% 0% /sbin/getty 38400 tty5"] +[112.029507, "o", "\r\n"] +[112.029848, "o", " 42 2 root SWN 0 0% 0% [kmemleak]"] +[112.030002, "o", "\r\n"] +[112.030344, "o", " 39 2 root 
IW 0 0% 0% [kworker/0:2-mm_]"] +[112.030576, "o", "\r\n"] +[112.040702, "o", " 7 2 root IW 0 0% 0% [kworker/u2:0-fl]\r\n"] +[112.041062, "o", " 13 2 root SW 0 0% 0% [kdevtmpfs]"] +[112.041124, "o", "\r\n"] +[112.041624, "o", " 38 2 root IW 0 0% 0% [kworker/u2:1-ev]"] +[112.041812, "o", "\r\n"] +[112.042352, "o", " 43 2 root SW 0 0% 0% [jbd2/vda-8]"] +[112.042485, "o", "\r\n"] +[112.042865, "o", " 34 2 root IW< 0 0% 0% [kworker/0:1H-kb]\r"] +[115.116387, "o", "\n\u001b[23;80H \u001b[24;1H"] diff --git a/refs/pull/405/merge/_images/list_evolution.png b/refs/pull/405/merge/_images/list_evolution.png new file mode 100644 index 00000000..aa44396d Binary files /dev/null and b/refs/pull/405/merge/_images/list_evolution.png differ diff --git a/refs/pull/405/merge/_images/list_evolution1.png b/refs/pull/405/merge/_images/list_evolution1.png new file mode 100644 index 00000000..aa44396d Binary files /dev/null and b/refs/pull/405/merge/_images/list_evolution1.png differ diff --git a/refs/pull/405/merge/_images/lro.png b/refs/pull/405/merge/_images/lro.png new file mode 100644 index 00000000..4c781b96 Binary files /dev/null and b/refs/pull/405/merge/_images/lro.png differ diff --git a/refs/pull/405/merge/_images/lro1.png b/refs/pull/405/merge/_images/lro1.png new file mode 100644 index 00000000..4c781b96 Binary files /dev/null and b/refs/pull/405/merge/_images/lro1.png differ diff --git a/refs/pull/405/merge/_images/minfs.png b/refs/pull/405/merge/_images/minfs.png new file mode 100644 index 00000000..affd8230 Binary files /dev/null and b/refs/pull/405/merge/_images/minfs.png differ diff --git a/refs/pull/405/merge/_images/minfs1.png b/refs/pull/405/merge/_images/minfs1.png new file mode 100644 index 00000000..affd8230 Binary files /dev/null and b/refs/pull/405/merge/_images/minfs1.png differ diff --git a/refs/pull/405/merge/_images/minfs_arch.png b/refs/pull/405/merge/_images/minfs_arch.png new file mode 100644 index 00000000..6778e663 Binary files /dev/null and 
b/refs/pull/405/merge/_images/minfs_arch.png differ diff --git a/refs/pull/405/merge/_images/minfs_arch1.png b/refs/pull/405/merge/_images/minfs_arch1.png new file mode 100644 index 00000000..6778e663 Binary files /dev/null and b/refs/pull/405/merge/_images/minfs_arch1.png differ diff --git a/refs/pull/405/merge/_images/net-dev-hw.png b/refs/pull/405/merge/_images/net-dev-hw.png new file mode 100644 index 00000000..8d52e5fa Binary files /dev/null and b/refs/pull/405/merge/_images/net-dev-hw.png differ diff --git a/refs/pull/405/merge/_images/net-dev-hw1.png b/refs/pull/405/merge/_images/net-dev-hw1.png new file mode 100644 index 00000000..8d52e5fa Binary files /dev/null and b/refs/pull/405/merge/_images/net-dev-hw1.png differ diff --git a/refs/pull/405/merge/_images/page-fault-handling.png b/refs/pull/405/merge/_images/page-fault-handling.png new file mode 100644 index 00000000..7f60933a Binary files /dev/null and b/refs/pull/405/merge/_images/page-fault-handling.png differ diff --git a/refs/pull/405/merge/_images/page-fault-handling1.png b/refs/pull/405/merge/_images/page-fault-handling1.png new file mode 100644 index 00000000..7f60933a Binary files /dev/null and b/refs/pull/405/merge/_images/page-fault-handling1.png differ diff --git a/refs/pull/405/merge/_images/paging.png b/refs/pull/405/merge/_images/paging.png new file mode 100644 index 00000000..53f7fb18 Binary files /dev/null and b/refs/pull/405/merge/_images/paging.png differ diff --git a/refs/pull/405/merge/_images/paging1.png b/refs/pull/405/merge/_images/paging1.png new file mode 100644 index 00000000..53f7fb18 Binary files /dev/null and b/refs/pull/405/merge/_images/paging1.png differ diff --git a/refs/pull/405/merge/_images/read.png b/refs/pull/405/merge/_images/read.png new file mode 100644 index 00000000..4502fb42 Binary files /dev/null and b/refs/pull/405/merge/_images/read.png differ diff --git a/refs/pull/405/merge/_images/read1.png b/refs/pull/405/merge/_images/read1.png new file mode 100644 
index 00000000..4502fb42 Binary files /dev/null and b/refs/pull/405/merge/_images/read1.png differ diff --git a/refs/pull/405/merge/_images/read2.png b/refs/pull/405/merge/_images/read2.png new file mode 100644 index 00000000..6f04b13e Binary files /dev/null and b/refs/pull/405/merge/_images/read2.png differ diff --git a/refs/pull/405/merge/_images/read21.png b/refs/pull/405/merge/_images/read21.png new file mode 100644 index 00000000..6f04b13e Binary files /dev/null and b/refs/pull/405/merge/_images/read21.png differ diff --git a/refs/pull/405/merge/_images/routing-cache.png b/refs/pull/405/merge/_images/routing-cache.png new file mode 100644 index 00000000..47dcdcb2 Binary files /dev/null and b/refs/pull/405/merge/_images/routing-cache.png differ diff --git a/refs/pull/405/merge/_images/routing-cache1.png b/refs/pull/405/merge/_images/routing-cache1.png new file mode 100644 index 00000000..47dcdcb2 Binary files /dev/null and b/refs/pull/405/merge/_images/routing-cache1.png differ diff --git a/refs/pull/405/merge/_images/schematic.png b/refs/pull/405/merge/_images/schematic.png new file mode 100644 index 00000000..89020fb9 Binary files /dev/null and b/refs/pull/405/merge/_images/schematic.png differ diff --git a/refs/pull/405/merge/_images/schematic1.png b/refs/pull/405/merge/_images/schematic1.png new file mode 100644 index 00000000..89020fb9 Binary files /dev/null and b/refs/pull/405/merge/_images/schematic1.png differ diff --git a/refs/pull/405/merge/_images/selectors-and-segments.cast b/refs/pull/405/merge/_images/selectors-and-segments.cast new file mode 100644 index 00000000..2c04bbeb --- /dev/null +++ b/refs/pull/405/merge/_images/selectors-and-segments.cast @@ -0,0 +1,1818 @@ +{"version": 2, "width": 80, "height": 24, "timestamp": 1617704245, "idle_time_limit": 1.0, "env": {"SHELL": null, "TERM": "xterm"}} +[0.002359, "o", "$ "] +[1.205759, "o", "m"] +[1.286489, "o", "a"] +[1.349222, "o", "k"] +[1.526553, "o", "e"] +[1.643724, "o", " "] +[2.348906, "o", 
"g"] +[2.575307, "o", "d"] +[2.64632, "o", "b"] +[3.052572, "o", "\r\n"] +[3.074798, "o", "gdb -ex \"target remote localhost:1234\" /linux/vmlinux\r\n"] +[3.117199, "o", "\u001b[35;1m\u001b[35;1mGNU gdb \u001b[m\u001b[35;1m(Ubuntu 9.2-0ubuntu1~20.04) \u001b[m\u001b[35;1m9.2\u001b[m\u001b[35;1m\r\n\u001b[m\u001b[mCopyright (C) 2020 Free Software Foundation, Inc.\r\nLicense GPLv3+: GNU GPL version 3 or later \r\nThis is free software: you are free to change and redistribute it.\r\nThere is NO WARRANTY, to the extent permitted by law.\r\nType \"show copying\" and \"show warranty\" for details.\r\nThis GDB was configured as \"x86_64-linux-gnu\".\r\nType \"show configuration\" for configuration details.\r\nFor bug reporting instructions, please see:\r\n.\r\nFind the GDB manual and other documentation resources online at:\r\n .\r\n\r\nFor help, type \"help\".\r\nType \"apropos word\" to search for commands related to \"word\"...\r\n"] +[3.117763, "o", "Reading symbols from \u001b[32m/linux/vmlinux\u001b[m...\r\n"] +[3.768166, "o", "Remote debugging using localhost:1234\r\n"] +[3.781924, "o", "\u001b[34m0xc15dcb62\u001b[m in \u001b[33mdefault_idle\u001b[m () at \u001b[32m./arch/x86/include/asm/irqflags.h\u001b[m:60\r\n"] +[3.781966, "o", "60\t\tasm volatile(\"sti; hlt\": : :\"memory\");\r\n"] +[3.782406, "o", "(gdb) "] +[4.626339, "o", "b"] +[4.768564, "o", "t"] +[5.44911, "o", "\r\n"] +[5.449728, "o", "#0 \u001b[34m0xc15dcb62\u001b[m in \u001b[33mdefault_idle\u001b[m () at \u001b[32m./arch/x86/include/asm/irqflags.h\u001b[m:60\r\n"] +[5.449863, "o", "#1 \u001b[34m0xc102a0dd\u001b[m in \u001b[33march_cpu_idle\u001b[m () at \u001b[32march/x86/kernel/process.c\u001b[m:680\r\n"] +[5.459029, "o", "#2 \u001b[34m0xc15dcee2\u001b[m in \u001b[33mdefault_idle_call\u001b[m () at \u001b[32mkernel/sched/idle.c\u001b[m:112\r\n#3 \u001b[34m0xc1087fb5\u001b[m in \u001b[33mcpuidle_idle_call\u001b[m () at \u001b[32mkernel/sched/idle.c\u001b[m:194\r\n#4 \u001b[33mdo_idle\u001b[m () at 
\u001b[32mkernel/sched/idle.c\u001b[m:299\r\n"] +[5.466862, "o", "#5 \u001b[34m0xc1088295\u001b[m in \u001b[33mcpu_startup_entry\u001b[m (\u001b[36mstate=state@entry\u001b[m=CPUHP_ONLINE)\u001b[m\r\n \u001b[m at \u001b[32mkernel/sched/idle.c\u001b[m:395\r\n#6 \u001b[34m0xc15d4ffb\u001b[m in \u001b[33mrest_init\u001b[m () at \u001b[32minit/main.c\u001b[m:721\r\n"] +[5.467037, "o", "#7 \u001b[34m0xc18cd7c4\u001b[m in \u001b[33march_call_rest_init\u001b[m () at \u001b[32minit/main.c\u001b[m:845\r\n#8 \u001b[34m0xc18cdc08\u001b[m in \u001b[33mstart_kernel\u001b[m () at \u001b[32minit/main.c\u001b[m:1061\r\n"] +[5.469609, "o", "#9 \u001b[34m0xc18cd218\u001b[m in \u001b[33mi386_start_kernel\u001b[m () at \u001b[32march/x86/kernel/head32.c\u001b[m:56\r\n"] +[5.470029, "o", "#10 \u001b[34m0xc10001db\u001b[m in \u001b[33mstartup_32_smp\u001b[m () at \u001b[32march/x86/kernel/head_32.S\u001b[m:327\r\n"] +[5.471491, "o", "#11 \u001b[34m0x00000000\u001b[m in \u001b[33m??\u001b[m ()\r\n"] +[5.472127, "o", "(gdb) "] +[6.957539, "o", "#"] +[7.184578, "o", " "] +[9.085494, "o", "i"] +[9.274558, "o", "t"] +[9.385268, "o", " "] +[9.560048, "o", "l"] +[9.72648, "o", "o"] +[9.838927, "o", "o"] +[9.918898, "o", "k"] +[10.059084, "o", "s"] +[10.113284, "o", " "] +[10.246361, "o", "l"] +[10.418976, "o", "i"] +[10.565605, "o", "k"] +[10.641569, "o", "e"] +[10.747208, "o", " "] +[10.9907, "o", "w"] +[11.09312, "o", "e"] +[11.231124, "o", " "] +[11.334085, "o", "a"] +[11.487572, "o", "r"] +[11.577493, "o", "e"] +[11.670981, "o", " "] +[12.627311, "o", "i"] +[12.911014, "o", "n"] +[13.064828, "o", " "] +[13.249479, "o", "k"] +[13.37127, "o", "e"] +[13.460074, "o", "r"] +[13.510226, "o", "n"] +[13.635205, "o", "e"] +[13.973161, "o", "l"] +[14.662931, "o", " "] +[15.131825, "o", "r"] +[15.214966, "o", "u"] +[15.358966, "o", "n"] +[15.487497, "o", "n"] +[15.654743, "o", "i"] +[15.831262, "o", "n"] +[15.996438, "o", "g"] +[16.146163, "o", " "] +[18.220116, "o", "w"] +[18.288027, "o", "i"] 
+[18.436822, "o", "t"] +[18.509192, "o", "h"] +[18.613143, "o", " "] +[18.807172, "o", "p"] +[18.897976, "o", "r"] +[19.013978, "o", "i"] +[19.204046, "o", "v"] +[19.283747, "o", "i"] +[19.47738, "o", "l"] +[19.544513, "o", "e"] +[19.78487, "o", "g"] +[19.880492, "o", "e"] +[20.602829, "o", " "] +[22.510059, "o", "0"] +[23.73426, "o", "\r\n"] +[23.734632, "o", "(gdb) "] +[24.495347, "o", "#"] +[24.632234, "o", " "] +[24.800588, "o", "l"] +[24.880492, "o", "e"] +[25.0523, "o", "t"] +[25.277878, "o", "s"] +[25.388396, "o", " "] +[25.613203, "o", "c"] +[25.720735, "o", "o"] +[25.817537, "o", "n"] +[25.855844, "o", "f"] +[25.973067, "o", "i"] +[26.212741, "o", "r"] +[26.386844, "o", "m"] +[26.496098, "o", " "] +[26.694327, "o", "t"] +[26.77636, "o", "h"] +[26.823023, "o", "i"] +[26.999112, "o", "s"] +[27.144801, "o", " "] +[27.434198, "o", "b"] +[27.494331, "o", "y"] +[27.714979, "o", " "] +[28.054888, "o", "l"] +[28.257672, "o", "o"] +[28.393068, "o", "o"] +[28.459965, "o", "k"] +[28.691808, "o", "i"] +[28.73292, "o", "n"] +[28.878271, "o", "g"] +[28.991934, "o", " "] +[29.143258, "o", "a"] +[29.367682, "o", "t"] +[29.494524, "o", " "] +[29.62679, "o", "t"] +[29.774843, "o", "h"] +[29.893124, "o", "e"] +[30.004863, "o", " "] +[38.956259, "o", "c"] +[39.046244, "o", "o"] +[39.159799, "o", "d"] +[39.244625, "o", "e"] +[39.30416, "o", " "] +[39.48092, "o", "s"] +[39.597493, "o", "e"] +[39.737666, "o", "l"] +[39.800726, "o", "e"] +[39.892586, "o", "c"] +[40.103179, "o", "t"] +[40.210366, "o", "o"] +[40.277594, "o", "r"] +[40.672707, "o", "\r\n"] +[40.673041, "o", "(gdb) "] +[41.394093, "o", "p"] +[41.550842, "o", "r"] +[41.704995, "o", "i"] +[41.773529, "o", "n"] +[41.869581, "o", "t"] +[42.223515, "o", " "] +[42.884229, "o", "/"] +[43.24162, "o", "$"] +[43.945015, "o", "\b\u001b[K"] +[44.294995, "o", "x"] +[44.490102, "o", " "] +[45.85674, "o", "$"] +[47.797713, "o", "c"] +[47.900134, "o", "s"] +[48.160261, "o", "\r\n"] +[48.160574, "o", "$1 = 0x60\r\n(gdb) "] 
+[52.875773, "o", "#"] +[53.057592, "o", " "] +[53.270518, "o", "n"] +[53.33, "o", "o"] +[53.420243, "o", "w"] +[53.516445, "o", " "] +[53.674458, "o", "l"] +[53.733163, "o", "e"] +[53.927126, "o", "t"] +[54.113039, "o", "s"] +[54.210531, "o", " "] +[54.350529, "o", "p"] +[54.509267, "o", "r"] +[54.597743, "o", "i"] +[54.668833, "o", "n"] +[54.807039, "o", "t"] +[54.905672, "o", " "] +[55.026232, "o", "t"] +[55.167086, "o", "h"] +[55.301416, "o", "e"] +[55.432852, "o", " "] +[58.048587, "o", "i"] +[58.185341, "o", "n"] +[58.275582, "o", "d"] +[58.355005, "o", "e"] +[58.575251, "o", "x"] +[59.228376, "o", " "] +[59.353553, "o", "f"] +[59.455246, "o", "o"] +[59.576325, "o", "r"] +[59.691869, "o", " "] +[59.775251, "o", "t"] +[59.864488, "o", "h"] +[59.931228, "o", "e"] +[60.017051, "o", " "] +[60.143587, "o", "c"] +[60.242612, "o", "o"] +[60.346778, "o", "d"] +[60.409921, "o", "e"] +[60.503901, "o", " "] +[60.653751, "o", "s"] +[60.791714, "o", "e"] +[60.927913, "o", "l"] +[60.995147, "o", "e"] +[61.100077, "o", "c"] +[61.301378, "o", "t"] +[61.409292, "o", "o"] +[61.472267, "o", "r"] +[61.999105, "o", "\r\n"] +[61.999211, "o", "(gdb) "] +[62.755519, "o", "p"] +[62.920307, "o", "r"] +[63.030171, "o", "i"] +[63.093306, "o", "n"] +[63.153238, "o", "t"] +[63.291438, "o", " "] +[63.834425, "o", "/"] +[64.026843, "o", "x"] +[64.200482, "o", " "] +[64.675073, "o", "\b\u001b[K"] +[64.821291, "o", "\b\u001b[K"] +[64.946712, "o", "\b\u001b[K"] +[66.373086, "o", "$"] +[66.734489, "o", "c"] +[66.85103, "o", "s"] +[67.689872, "o", ">"] +[67.812115, "o", ">"] +[91.768538, "o", "3"] +[92.507215, "o", "\r\n"] +[92.50757, "o", "$2 = 12\r\n(gdb) "] +[97.002473, "o", "#"] +[97.167356, "o", " "] +[97.43684, "o", "a"] +[97.554333, "o", "n"] +[98.078151, "o", "d"] +[98.230006, "o", " "] +[98.41845, "o", "n"] +[98.500863, "o", "o"] +[98.58555, "o", "w"] +[98.697065, "o", " "] +[98.902575, "o", "l"] +[99.036005, "o", "e"] +[99.242403, "o", "t"] +[99.457412, "o", "s"] +[99.570468, "o", " "] 
+[99.705253, "o", "p"] +[99.822772, "o", "r"] +[99.930937, "o", "i"] +[100.014833, "o", "n"] +[100.091449, "o", "t"] +[100.226161, "o", " "] +[100.331035, "o", "t"] +[100.564914, "o", "e"] +[100.990115, "o", "\b\u001b[K"] +[101.195011, "o", "h"] +[101.275921, "o", "e"] +[101.38122, "o", " "] +[103.755803, "o", "d"] +[103.824373, "o", "e"] +[103.946792, "o", "s"] +[104.052577, "o", "c"] +[104.247014, "o", "r"] +[104.904532, "o", "i"] +[105.058074, "o", "p"] +[105.226938, "o", "t"] +[105.297817, "o", "o"] +[105.408466, "o", "r"] +[105.513496, "o", " "] +[105.845144, "o", "f"] +[105.958437, "o", "o"] +[106.09132, "o", "r"] +[106.167673, "o", " "] +[106.430415, "o", "e"] +[106.855237, "o", "\b\u001b[K"] +[106.932061, "o", "s"] +[107.37581, "o", "e"] +[107.53021, "o", "l"] +[107.576069, "o", "e"] +[107.686763, "o", "c"] +[107.960174, "o", "t"] +[108.090163, "o", "o"] +[108.18428, "o", "r"] +[108.853073, "o", " "] +[109.180343, "o", "1"] +[109.290367, "o", "2"] +[109.8445, "o", "\r\n"] +[109.844823, "o", "(gdb) "] +[111.689231, "o", "#"] +[111.887764, "o", " "] +[112.2626, "o", "f"] +[112.390326, "o", "i"] +[112.490975, "o", "r"] +[112.741024, "o", "s"] +[112.893713, "o", "t"] +[112.982635, "o", " "] +[114.599914, "o", "g"] +[114.696218, "o", "e"] +[114.908597, "o", "t"] +[115.088006, "o", " "] +[115.172965, "o", "t"] +[115.352084, "o", "h"] +[115.459941, "o", "e"] +[115.604134, "o", " "] +[117.664675, "o", "G"] +[117.981042, "o", "T"] +[119.326974, "o", "D"] +[119.594969, "o", " "] +[121.590078, "o", "r"] +[121.687724, "o", "e"] +[121.886761, "o", "g"] +[121.987087, "o", "i"] +[122.037275, "o", "s"] +[122.222018, "o", "t"] +[122.330302, "o", "e"] +[122.429404, "o", "r"] +[123.050835, "o", " "] +[123.19778, "o", "v"] +[123.241349, "o", "a"] +[123.389395, "o", "l"] +[123.597067, "o", "u"] +[123.836279, "o", "e"] +[124.041227, "o", "\r\n"] +[124.041348, "o", "(gdb) "] +[126.327766, "o", "m"] +[126.44815, "o", "o"] +[126.814981, "o", "n"] +[126.886507, "o", "i"] 
+[127.037664, "o", "t"] +[127.148862, "o", "o"] +[127.266202, "o", "r"] +[127.368858, "o", " "] +[129.678077, "o", "s"] +[129.7363, "o", "h"] +[129.800484, "o", "o"] +[129.908385, "o", "w"] +[130.033271, "o", " "] +[131.662846, "o", "\b\u001b[K"] +[131.803395, "o", "\b\u001b[K"] +[131.940196, "o", "\b\u001b[K"] +[132.073194, "o", "\b\u001b[K"] +[132.18792, "o", "\b\u001b[K"] +[132.45787, "o", "i"] +[132.530376, "o", "n"] +[132.776408, "o", "g"] +[132.872135, "o", "o"] +[132.986392, "o", " "] +[133.379472, "o", "\b\u001b[K"] +[133.511446, "o", "\b\u001b[K"] +[133.626516, "o", "\b\u001b[K"] +[133.786323, "o", "f"] +[133.863781, "o", "o"] +[133.998018, "o", " "] +[134.086118, "o", "r"] +[134.560701, "o", "e"] +[134.886634, "o", "g"] +[134.991773, "o", "i"] +[135.080584, "o", "s"] +[135.2521, "o", "t"] +[135.343722, "o", "e"] +[135.452922, "o", "r"] +[135.647215, "o", "s"] +[135.73628, "o", "\r\n"] +[135.73688, "o", "EAX=0000"] +[135.737118, "o", "0000 EBX=00000000 ECX=ffffffff EDX=0"] +[135.737339, "o", "0000000\r\r\n"] +[135.737619, "o", "ESI=00000000 EDI=00000000 EBP=c17cff1"] +[135.737925, "o", "c ESP=c17cff18\r\r\nEIP=c15dcb62 EFL=0020024"] +[135.738173, "o", "6 [---Z-P-] CPL=0 II=0 A20=1 "] +[135.738429, "o", "SMM=0 HLT=1\r\r\nES =007b 00000000 ffffffff 00cff300 DPL=3 DS "] +[135.738589, "o", " [-WA]\r\r\nCS =006"] +[135.738828, "o", "0 00000000 ffffffff 00cf9a00 DPL=0 CS"] +[135.739037, "o", "32 [-R-]\r\r\nSS =0068 00000000"] +[135.739202, "o", " ffffffff 00cf9300 DPL=0 DS"] +[135.739242, "o", " [-WA]"] +[135.739526, "o", "\r\r\n"] +[135.739726, "o", "DS =007b 00000000 ffffffff 00cff300 DPL=3 DS ["] +[135.739962, "o", "-WA]\r\r\nFS =00d8 0e47b000 ffffffff 00"] +[135.740249, "o", "8f9300 DPL=0 DS16 [-WA]\r\r\nGS =00e0 cfdc"] +[135.740452, "o", "b200 00000018 00409100 DPL=0 DS [--"] +[135.740678, "o", "A]\r\r\nLDT=0000 0000000"] +[135.740839, "o", "0 00000000 00008200 DPL=0 LDT"] +[135.741014, "o", "\r\r\nTR =0080 ff806"] +[135.741163, "o", "000 0000407b 00008900 
DPL"] +[135.741313, "o", "=0 TSS32-avl\r\r\n"] +[135.741459, "o", "GDT= ff801000 000000"] +[135.741543, "o", "ff\r\r\nIDT= ff800000 "] +[135.741584, "o", "000007ff\r\r\nCR0=80050"] +[135.741622, "o", "033 CR2=080919ab CR3=0"] +[135.741657, "o", "a450000 CR4=00000690\r"] +[135.741692, "o", "\r\nDR0=00000000 D"] +[135.741729, "o", "R1=00000000 DR2=0000"] +[135.741757, "o", "0000 DR3=0000000"] +[135.741795, "o", "0 \r\r\nDR6=ffff0ff0 D"] +[135.741835, "o", "R7=00000400\r\r\nEFER=000"] +[135.741869, "o", "0000000000000\r\r\n"] +[135.741904, "o", "FCW=037f FSW=0000 [S"] +[135.741935, "o", "T=0] FTW=00 MXCSR=0"] +[135.741964, "o", "0001f80\r\r\nFP"] +[135.742009, "o", "R0=0000000000000000 0000 FPR1"] +[135.742053, "o", "=0000000000000000 0000\r\r\n"] +[135.742095, "o", "FPR2=0000000000000000 0000 "] +[135.742136, "o", "FPR3=0000000000000000 000"] +[135.742173, "o", "0\r\r\nFPR4=0000000"] +[135.742212, "o", "000000000 0000 FPR5=000000"] +[135.742251, "o", "0000000000 0000\r\r\n"] +[135.742292, "o", "FPR6=0000000000000000 0000 FPR7=00000"] +[135.742328, "o", "00000000000 0000\r\r\n"] +[135.742365, "o", "XMM00=000000000000000"] +[135.7424, "o", "00000000000000000 XMM"] +[135.742436, "o", "01=0000000000000000000"] +[135.742472, "o", "0000000000000\r\r\n"] +[135.742508, "o", "XMM02=000000000000000"] +[135.742541, "o", "00000000000000000 XM"] +[135.742574, "o", "M03=00000000000000000"] +[135.742609, "o", "000000000000000\r\r\n"] +[135.742645, "o", "XMM04=000000000000"] +[135.74268, "o", "00000000000000000000 "] +[135.742716, "o", "XMM05=000000000000000000"] +[135.742753, "o", "00000000000000\r\r\n"] +[135.742792, "o", "XMM06=00000000000000000"] +[135.742829, "o", "000000000000000 XMM07=0"] +[135.74287, "o", "00000000000000000000000000"] +[135.742906, "o", "00000\r\r\n"] +[135.742943, "o", "(gdb) "] +[141.08448, "o", "g"] +[142.487912, "o", "\b\u001b[K"] +[142.687201, "o", "s"] +[142.782022, "o", "e"] +[142.929487, "o", "t"] +[143.097996, "o", " "] +[143.953732, "o", "$"] 
+[144.988721, "o", "g"] +[145.23854, "o", "d"] +[145.51718, "o", "t"] +[145.779607, "o", "r"] +[145.912464, "o", "="] +[154.09522, "o", "ff801000"] +[155.179972, "o", "\b"] +[155.680268, "o", "\b"] +[155.710228, "o", "\b"] +[155.740097, "o", "\b"] +[155.770866, "o", "\b"] +[155.801924, "o", "\b"] +[155.83298, "o", "\b"] +[155.863409, "o", "\b"] +[155.893282, "o", "\b"] +[155.923804, "o", "\b"] +[155.954806, "o", "\b"] +[156.173502, "o", "\u001b[C"] +[156.329491, "o", "\u001b[C"] +[156.44832, "o", "\u001b[C"] +[157.136526, "o", "0ff801000\b\b\b\b\b\b\b\b"] +[157.277804, "o", "xff801000\b\b\b\b\b\b\b\b"] +[157.772194, "o", "\r\n"] +[157.793131, "o", "(gdb) "] +[165.346319, "o", "#"] +[165.531054, "o", " "] +[165.974981, "o", "d"] +[166.056174, "o", "e"] +[166.210313, "o", "s"] +[166.277392, "o", "c"] +[166.519134, "o", "r"] +[166.620771, "o", "i"] +[166.731554, "o", "p"] +[166.883229, "o", "t"] +[166.955686, "o", "o"] +[167.072479, "o", "r"] +[167.204083, "o", " "] +[167.85161, "o", "\b\u001b[K"] +[167.982418, "o", "s"] +[168.100856, "o", " "] +[168.564966, "o", "h"] +[168.64102, "o", "a"] +[168.823243, "o", "v"] +[168.894548, "o", "e"] +[168.970183, "o", " "] +[169.176948, "o", "8"] +[169.334163, "o", " "] +[169.573922, "o", "b"] +[169.624225, "o", "y"] +[169.858766, "o", "t"] +[169.930975, "o", "e"] +[170.144177, "o", "s"] +[172.414362, "o", "\r\n"] +[172.414409, "o", "(gdb) "] +[173.311252, "o", "p"] +[173.662611, "o", "r"] +[173.785984, "o", "i"] +[173.865018, "o", "n"] +[173.983028, "o", "t"] +[174.148239, "o", " "] +[175.179933, "o", "/"] +[175.318199, "o", "x"] +[176.328504, "o", " "] +[178.585994, "o", "("] +[179.152768, "o", "u"] +[179.670353, "o", "i"] +[179.715934, "o", "n"] +[180.433924, "o", "t"] +[180.712095, "o", "6"] +[180.79663, "o", "4"] +[181.065001, "o", "_"] +[181.287562, "o", "t"] +[181.668811, "o", ")"] +[183.515012, "o", "$"] +[185.687333, "o", "\b\u001b[K"] +[185.866732, "o", "\b\u001b[K"] +[186.602865, "o", "*"] +[186.86583, "o", ")"] 
+[187.515633, "o", "$"] +[189.118783, "o", "d"] +[190.202703, "o", "\b\u001b[K"] +[190.355202, "o", "g"] +[190.666691, "o", "d"] +[190.945529, "o", "t"] +[191.772629, "o", "r"] +[192.777949, "o", "\b"] +[193.278082, "o", "\b"] +[193.30829, "o", "\b"] +[193.338654, "o", "\b"] +[193.369524, "o", "\b"] +[193.399722, "o", "\b"] +[193.430236, "o", "\b"] +[193.460701, "o", "\b"] +[193.491262, "o", "\b"] +[193.521133, "o", "\b"] +[193.551404, "o", "\b"] +[193.581346, "o", "\b"] +[193.612343, "o", "\b"] +[193.642766, "o", "\b"] +[193.673954, "o", "\b"] +[193.704329, "o", "\b"] +[193.735075, "o", "\b"] +[193.766105, "o", "\b"] +[193.968996, "o", "\u001b[C"] +[194.133982, "o", "\u001b[C"] +[195.116582, "o", "\u001b[C(uint64_t*)$gdtr\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[195.840287, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[196.582034, "o", ")"] +[197.108677, "o", "["] +[197.615, "o", "1"] +[197.748952, "o", "2"] +[197.846455, "o", "]"] +[199.192246, "o", "\r\n"] +[199.200699, "o", "$3 = 0xcf9a000000ffff\r\n"] +[199.20087, "o", "(gdb) "] +[200.899642, "o", "print /x ((uint64_t*)$gdtr)[12]"] +[201.157767, "o", "\b"] +[201.65802, "o", "\b"] +[201.688325, "o", "\b"] +[201.718252, "o", "\b"] +[201.74874, "o", "\b"] +[201.779962, "o", "\b"] +[201.811199, "o", "\b"] +[201.842097, "o", "\b"] +[201.87248, "o", "\b"] +[201.903057, "o", "\b"] +[201.933056, "o", "\b"] +[201.963408, "o", "\b"] +[201.993637, "o", "\b"] +[202.024063, "o", "\b"] +[202.054562, "o", "\b"] +[202.08602, "o", "\b"] +[202.116448, "o", "\b"] +[202.145957, "o", "\b"] +[202.177003, "o", "\b"] +[202.207484, "o", "\b"] +[202.237255, "o", "\b"] +[202.267668, "o", "\b"] +[202.29822, "o", "\b"] +[202.328265, "o", "\b"] +[202.603106, "o", "\u001b[C"] +[202.908862, "o", "\b\u001b[1P ((uint64_t*)$gdtr)[12]\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[203.068444, "o", 
"z ((uint64_t*)$gdtr)[12]\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[203.245412, "o", "\r\n"] +[203.245996, "o", "$4 = 0x00cf9a000000ffff\r\n"] +[203.246341, "o", "(gdb) "] +[213.714469, "o", "#"] +[213.924177, "o", " "] +[216.1305, "o", "d"] +[216.188454, "o", "e"] +[216.363086, "o", "c"] +[216.450319, "o", "o"] +[216.564475, "o", "d"] +[216.634156, "o", "e"] +[216.77561, "o", " "] +[216.897562, "o", "t"] +[217.009052, "o", "h"] +[217.107284, "o", "e"] +[217.195724, "o", " "] +[217.436172, "o", "l"] +[217.641538, "o", "i"] +[217.77799, "o", "m"] +[217.84922, "o", "i"] +[217.98972, "o", "t"] +[220.023935, "o", "\r\n"] +[220.024319, "o", "(gdb) "] +[220.290728, "o", "p"] +[220.450692, "o", "r"] +[220.533847, "o", "i"] +[220.588309, "o", "n"] +[220.673191, "o", "t"] +[220.969452, "o", " "] +[221.640322, "o", "/"] +[221.835579, "o", "x"] +[222.015648, "o", " "] +[229.910833, "o", "0"] +[230.018988, "o", "x"] +[235.196883, "o", "ffff"] +[236.417946, "o", " "] +[236.798758, "o", "*"] +[237.041762, "o", " "] +[239.365773, "o", "4"] +[239.475724, "o", "0"] +[239.686894, "o", "9"] +[240.006818, "o", "6"] +[241.88331, "o", "\r\n"] +[241.883649, "o", "$5 = 0xffff000\r\n(gdb) "] +[247.790867, "o", "#"] +[247.888405, "o", " "] +[248.078816, "o", "l"] +[248.234787, "o", "i"] +[248.378544, "o", "m"] +[248.444722, "o", "i"] +[248.608612, "o", "t"] +[248.721834, "o", " "] +[248.969296, "o", "i"] +[249.108096, "o", "s"] +[249.213166, "o", " "] +[249.852357, "o", "s"] +[249.970989, "o", "e"] +[250.175166, "o", "t"] +[250.316772, "o", " "] +[250.519205, "o", "t"] +[250.639319, "o", "o"] +[250.753856, "o", " "] +[251.306554, "o", "4"] +[251.870532, "o", "G"] +[252.09369, "o", "B"] +[253.582197, "o", "\r\n"] +[253.582499, "o", "(gdb) "] +[360.143066, "o", "s"] +[360.281378, "o", "e"] +[360.470522, "o", "t"] +[360.613987, "o", " "] +[361.843991, "o", "$"] +[362.851023, "o", "k"] +[362.985051, "o", "e"] 
+[363.042871, "o", "r"] +[363.130172, "o", "n"] +[363.188481, "o", "e"] +[363.324746, "o", "l"] +[363.842416, "o", "_"] +[365.495183, "o", "c"] +[365.631849, "o", "o"] +[365.731124, "o", "d"] +[365.781546, "o", "e"] +[371.021879, "o", "="] +[371.101039, "o", " "] +[371.423014, "o", "\b\u001b[K"] +[371.56274, "o", "\b\u001b[K"] +[371.676663, "o", " "] +[371.793814, "o", "="] +[371.916711, "o", " "] +[378.293839, "o", "((uint64_t*)$gdtr)[12]"] +[380.299588, "o", "\r\n"] +[380.320761, "o", "(gdb) "] +[382.691282, "o", "p"] +[383.391885, "o", "\b\u001b[K"] +[384.142072, "o", "#"] +[384.312701, "o", " "] +[384.457712, "o", "d"] +[384.555394, "o", "e"] +[384.722997, "o", "c"] +[384.799253, "o", "o"] +[384.920935, "o", "d"] +[384.993824, "o", "e"] +[385.075237, "o", " "] +[385.230561, "o", "t"] +[385.306128, "o", "h"] +[385.402699, "o", "e"] +[385.524367, "o", " "] +[385.683035, "o", "b"] +[385.742431, "o", "a"] +[385.818729, "o", "s"] +[385.945102, "o", "e"] +[386.563467, "o", "\r\n"] +[386.563785, "o", "(gdb) "] +[387.807248, "o", "p"] +[388.007592, "o", "r"] +[388.058408, "o", "i"] +[388.11304, "o", "n"] +[388.226837, "o", "t"] +[388.33127, "o", " "] +[388.554004, "o", "/"] +[388.663181, "o", "x"] +[388.747275, "o", " "] +[400.152432, "o", "$kernel_code"] +[401.172444, "o", ">"] +[401.31128, "o", ">"] +[402.246532, "o", "3"] +[402.36536, "o", "2"] +[403.041876, "o", ")"] +[403.470546, "o", "\b"] +[403.971061, "o", "\b"] +[404.00112, "o", "\b"] +[404.031651, "o", "\b"] +[404.062697, "o", "\b"] +[404.093786, "o", "\b"] +[404.124324, "o", "\b"] +[404.154917, "o", "\b"] +[404.186219, "o", "\b"] +[404.215797, "o", "\b"] +[404.245841, "o", "\b"] +[404.277153, "o", "\b"] +[404.307534, "o", "\b"] +[404.337518, "o", "\b"] +[404.367197, "o", "\b"] +[404.39724, "o", "\b"] +[404.427051, "o", "\b"] +[404.457149, "o", "\b"] +[404.487796, "o", "\b"] +[404.518995, "o", "\b"] +[404.717726, "o", "\u001b[C"] +[404.856778, "o", "\u001b[C"] +[404.998772, "o", "\u001b[C"] +[405.714354, "o", 
"($kernel_code>>32)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[406.325269, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[408.028055, "o", "&"] +[409.373772, "o", "0"] +[409.514542, "o", "x"] +[409.995675, "o", "F"] +[410.205691, "o", "F"] +[410.76551, "o", "0"] +[411.008743, "o", "0"] +[411.636305, "o", "0"] +[411.810194, "o", "0"] +[412.647261, "o", "0"] +[412.820582, "o", "0"] +[413.342493, "o", ")"] +[413.718701, "o", "\b"] +[414.218638, "o", "\b"] +[414.248087, "o", "\b"] +[414.278658, "o", "\b"] +[414.309504, "o", "\b"] +[414.339242, "o", "\b"] +[414.369913, "o", "\b"] +[414.401266, "o", "\b"] +[414.431457, "o", "\b"] +[414.462205, "o", "\b"] +[414.492336, "o", "\b"] +[414.522837, "o", "\b"] +[414.553148, "o", "\b"] +[414.583161, "o", "\b"] +[414.614069, "o", "\b"] +[414.644562, "o", "\b"] +[414.675091, "o", "\b"] +[414.706454, "o", "\b"] +[414.737365, "o", "\b"] +[414.767321, "o", "\b"] +[414.797448, "o", "\b"] +[414.82805, "o", "\b"] +[414.858649, "o", "\b"] +[414.888968, "o", "\b"] +[414.918934, "o", "\b"] +[414.949273, "o", "\b"] +[414.979232, "o", "\b"] +[415.009292, "o", "\b"] +[415.039545, "o", "\b"] +[415.195848, "o", "\b"] +[415.379056, "o", "\b"] +[415.654254, "o", "\u001b[C"] +[416.120906, "o", "\u001b[C($kernel_code>>32)&0xFF000000)\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[416.670056, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[417.156581, "o", "|"] +[418.078121, "o", "("] +[418.078121, "o", "("] +[418.501176, "o", "$"] +[419.061164, "o", "k"] +[419.257984, "o", "e"] +[419.35113, "o", "r"] +[419.389884, "o", "n"] +[419.462042, "o", "e"] +[419.567743, "o", "l"] 
+[419.859026, "o", "_"] +[420.010047, "o", "c"] +[420.089005, "o", "o"] +[420.193463, "o", "d"] +[420.274683, "o", "e"] +[423.236442, "o", ">"] +[423.370439, "o", ">"] +[425.001308, "o", "1"] +[425.061638, "o", "6"] +[426.234482, "o", ")"] +[427.499144, "o", "&"] +[428.572962, "o", "0"] +[428.702492, "o", "x"] +[429.690473, "o", "0"] +[429.827397, "o", "0"] +[430.421525, "o", "F"] +[430.556561, "o", "F"] +[431.06108, "o", "F"] +[431.192942, "o", "F"] +[431.550652, "o", "F"] +[431.692892, "o", "F"] +[432.118055, "o", ")"] +[433.280631, "o", "\r\n"] +[493.456846, "o", "$9 = 0x0\r\n(gdb) "] +[499.686291, "o", "#"] +[499.879719, "o", " "] +[500.923126, "o", "b"] +[500.981395, "o", "a"] +[501.053439, "o", "s"] +[501.206446, "o", "e"] +[501.347024, "o", " "] +[501.498974, "o", "i"] +[501.603969, "o", "s"] +[501.939405, "o", " "] +[502.084015, "o", "0"] +[502.685399, "o", "\r\n"] +[502.685665, "o", "(gdb) "] +[505.05193, "o", "#"] +[506.321549, "o", " "] +[506.981152, "o", "d"] +[507.072632, "o", "e"] +[507.249979, "o", "c"] +[507.287012, "o", "o"] +[507.464576, "o", "d"] +[507.54832, "o", "e"] +[507.691229, "o", " "] +[507.964173, "o", "t"] +[508.11861, "o", "h"] +[508.267776, "o", "e"] +[508.361951, "o", " "] +[508.80405, "o", "p"] +[508.905761, "o", "r"] +[509.016474, "o", "i"] +[509.158256, "o", "v"] +[509.217248, "o", "i"] +[509.408068, "o", "l"] +[509.508072, "o", "e"] +[509.914455, "o", "g"] +[509.994915, "o", "e"] +[511.133299, "o", "\b"] +[511.633839, "o", "\b"] +[511.664354, "o", "\b"] +[511.694443, "o", "\b"] +[511.725474, "o", "\b"] +[511.756119, "o", "\b"] +[511.786986, "o", "\b"] +[511.818226, "o", "\b"] +[512.015711, "o", "\b"] +[512.25971, "o", "rprivilege\b\b\b\b\b\b\b\b\b"] +[512.39865, "o", "qprivilege\b\b\b\b\b\b\b\b\b"] +[512.600873, "o", "uprivilege\b\b\b\b\b\b\b\b\b"] +[512.651473, "o", "iprivilege\b\b\b\b\b\b\b\b\b"] +[512.754197, "o", "rprivilege\b\b\b\b\b\b\b\b\b"] +[512.836059, "o", "eprivilege\b\b\b\b\b\b\b\b\b"] +[513.846698, "o", 
"\b\u001b[1Pprivilege\b\b\b\b\b\b\b\b\b"] +[513.976248, "o", "\b\u001b[1Pprivilege\b\b\b\b\b\b\b\b\b"] +[514.095038, "o", "\b\u001b[1Pprivilege\b\b\b\b\b\b\b\b\b"] +[514.217603, "o", "\b\u001b[1Pprivilege\b\b\b\b\b\b\b\b\b"] +[514.348636, "o", "\b\u001b[1Pprivilege\b\b\b\b\b\b\b\b\b"] +[514.462555, "o", "eprivilege\b\b\b\b\b\b\b\b\b"] +[514.684859, "o", "qprivilege\b\b\b\b\b\b\b\b\b"] +[514.871245, "o", "uprivilege\b\b\b\b\b\b\b\b\b"] +[514.908025, "o", "iprivilege\b\b\b\b\b\b\b\b\b"] +[514.972336, "o", "rprivilege\b\b\b\b\b\b\b\b\b"] +[515.040145, "o", "eprivilege\b\b\b\b\b\b\b\b\b"] +[515.195378, "o", "dprivilege\b\b\b\b\b\b\b\b\b"] +[515.274181, "o", " privilege\b\b\b\b\b\b\b\b\b"] +[515.57074, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[516.018703, "o", " "] +[516.187684, "o", "l"] +[516.246874, "o", "e"] +[516.472241, "o", "v"] +[516.555511, "o", "e"] +[516.581353, "o", "l"] +[517.033643, "o", "\r\n"] +[517.033753, "o", "(gdb) "] +[518.250999, "o", "p"] +[518.390471, "o", "r"] +[518.47528, "o", "i"] +[518.539113, "o", "n"] +[518.59094, "o", "t"] +[518.697485, "o", " "] +[518.870622, "o", "/"] +[518.974887, "o", "x"] +[519.117062, "o", " "] +[541.606002, "o", "$"] +[544.466982, "o", "k"] +[544.572664, "o", "e"] +[544.632867, "o", "r"] +[544.722191, "o", "n"] +[544.773134, "o", "e"] +[544.907561, "o", "l"] +[545.179853, "o", "_"] +[545.326522, "o", "c"] +[545.386052, "o", "o"] +[545.50024, "o", "d"] +[545.567089, "o", "e"] +[545.90589, "o", ">"] +[546.035073, "o", ">"] +[546.773187, "o", "4"] +[546.97104, "o", "5"] +[547.369708, "o", ")"] +[547.761588, "o", "\b\b\b"] +[548.152916, "o", "\b\b\b\b\b\b"] +[548.498199, "o", "\b\b\b\b\b\b\b"] +[549.243467, "o", "\b"] +[549.640345, "o", "($kernel_code>>45)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[550.024338, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[550.898538, "o", "&"] 
+[551.862487, "o", "3"] +[552.10327, "o", "\r\n"] +[552.103562, "o", "$10 = 0x0\r\n(gdb) "] +[554.033804, "o", "#"] +[554.232669, "o", " "] +[554.419128, "o", "k"] +[554.519684, "o", "e"] +[555.554111, "o", "r"] +[556.306276, "o", "n"] +[556.564753, "o", "e"] +[556.649777, "o", "l"] +[556.812466, "o", " "] +[557.304025, "o", "p"] +[557.473156, "o", "r"] +[557.570501, "o", "i"] +[557.787916, "o", "v"] +[557.846584, "o", "i"] +[558.020526, "o", "l"] +[558.087162, "o", "e"] +[558.286556, "o", "g"] +[558.375973, "o", "e"] +[559.006899, "o", "\b\u001b[K"] +[559.154619, "o", "\b\u001b[K"] +[559.862659, "o", "e"] +[560.565269, "o", "\r\n"] +[560.565382, "o", "(gdb) "] +[566.281427, "o", "#"] +[566.60638, "o", " "] +[566.88704, "o", "n"] +[566.979938, "o", "o"] +[567.097734, "o", "w"] +[567.217521, "o", " "] +[567.40312, "o", "l"] +[567.48597, "o", "e"] +[567.691523, "o", "t"] +[567.944728, "o", "s"] +[568.133886, "o", " "] +[568.530377, "o", "d"] +[568.722282, "o", " "] +[569.182034, "o", "\b\u001b[K"] +[569.330181, "o", "o"] +[569.464696, "o", " "] +[569.582842, "o", "t"] +[569.71244, "o", "h"] +[569.775485, "o", "e"] +[569.904672, "o", " "] +[570.06808, "o", "s"] +[570.113381, "o", "a"] +[570.267076, "o", "m"] +[570.401127, "o", "e"] +[570.54668, "o", " "] +[571.537724, "o", "f"] +[571.657201, "o", "o"] +[571.830889, "o", "r"] +[571.931437, "o", " "] +[573.275644, "o", "u"] +[573.370696, "o", "s"] +[573.478114, "o", "e"] +[573.547114, "o", "r"] +[573.753181, "o", "s"] +[573.888213, "o", "p"] +[573.935344, "o", "a"] +[574.131183, "o", "c"] +[574.286447, "o", "e"] +[575.340632, "o", "\r\n"] +[575.340777, "o", "(gdb) "] +[575.738126, "o", "#"] +[575.938845, "o", " "] +[576.16047, "o", "f"] +[576.290807, "o", "i"] +[576.374712, "o", "r"] +[576.561113, "o", "s"] +[576.696249, "o", "t"] +[576.792918, "o", " "] +[581.320765, "o", "s"] +[581.435293, "o", "e"] +[581.551576, "o", "t"] +[581.69328, "o", "u"] +[581.77331, "o", "p"] +[582.019538, "o", " "] +[582.184776, "o", "a"] 
+[582.341777, "o", " "] +[582.940185, "o", "p"] +[583.323301, "o", "r"] +[583.369325, "o", "e"] +[583.536369, "o", "n"] +[584.034225, "o", "\b\u001b[K"] +[584.16717, "o", "\b\u001b[K"] +[584.29082, "o", "\b\u001b[K"] +[584.411253, "o", "\b\u001b[K"] +[584.683076, "o", "b"] +[584.765623, "o", "r"] +[584.845741, "o", "e"] +[584.922373, "o", "a"] +[584.981172, "o", "k"] +[585.213372, "o", "p"] +[585.312638, "o", "o"] +[585.50566, "o", "i"] +[585.538534, "o", "n"] +[585.679473, "o", "t"] +[585.76966, "o", " "] +[586.004301, "o", "t"] +[586.134879, "o", "o"] +[586.349202, "o", " "] +[586.655069, "o", "a"] +[586.776202, "o", " "] +[586.989063, "o", "s"] +[587.158287, "o", "y"] +[587.231328, "o", "s"] +[587.430585, "o", "t"] +[587.481228, "o", "e"] +[587.580052, "o", "m"] +[587.651081, "o", " "] +[588.604948, "o", "\b\u001b[K"] +[589.104855, "o", "\b\u001b[K"] +[589.135382, "o", "\b\u001b[K"] +[589.165688, "o", "\b\u001b[K"] +[589.197272, "o", "\b\u001b[K"] +[589.227213, "o", "\b\u001b[K"] +[589.258689, "o", "\b\u001b[K"] +[589.289664, "o", "\b\u001b[K"] +[589.318764, "o", "\b\u001b[K"] +[589.348938, "o", "\b\u001b[K"] +[589.844186, "o", " "] +[589.954736, "o", "t"] +[590.043756, "o", "h"] +[590.136384, "o", "e"] +[590.207131, "o", " "] +[590.312478, "o", "s"] +[590.426546, "o", "y"] +[590.468176, "o", "s"] +[590.614861, "o", "t"] +[590.700094, "o", "e"] +[591.025033, "o", "l"] +[592.076639, "o", "\b\u001b[K"] +[592.265751, "o", "m"] +[592.354996, "o", " "] +[592.477158, "o", "c"] +[592.527122, "o", "a"] +[592.591157, "o", "l"] +[592.688401, "o", "l"] +[592.749225, "o", " "] +[592.888391, "o", "e"] +[592.997816, "o", "n"] +[593.106259, "o", "t"] +[593.286703, "o", "r"] +[593.354202, "o", "y"] +[593.592532, "o", "\r\n"] +[593.592811, "o", "(gdb) "] +[594.298864, "o", "b"] +[594.370713, "o", "r"] +[594.483049, "o", "e"] +[594.563026, "o", "a"] +[594.63206, "o", "k"] +[594.734457, "o", " "] +[595.876785, "o", "e"] +[595.974998, "o", "n"] +[596.08727, "o", "t"] +[596.265929, 
"o", "r"] +[596.355891, "o", "y"] +[596.67381, "o", "_"] +[601.569523, "o", "\u0007"] +[601.660172, "o", "\r\nentry_32.S entry_SYSENTER_32 entry_number \r\nentry_INT80_32 entry_eip entry_stack_page \r\n(gdb) break entry_S"] +[603.152017, "o", "\b\u001b[K"] +[603.60971, "o", "I"] +[603.858406, "o", "N"] +[604.204372, "o", "T80_32 "] +[605.312061, "o", "\r\n"] +[605.363107, "o", "Breakpoint 1 at \u001b[34m0xc15de874\u001b[m: file \u001b[32march/x86/entry/entry_32.S\u001b[m, line 1020.\r\n"] +[605.363155, "o", "(gdb) "] +[606.887754, "o", "c"] +[608.705129, "o", "\r\n"] +[608.705459, "o", "Continuing.\r\n"] +[610.971565, "o", "\r\n"] +[610.971946, "o", "Breakpoint 1, \u001b[34m0xc15de874\u001b[m in \u001b[33mentry_INT80_32\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1020\r\n"] +[610.972302, "o", "1020\t\tjmp\t.Lsysenter_flags_fixed\r\n"] +[610.972535, "o", "(gdb) "] +[616.192863, "o", "n"] +[617.72471, "o", "\r\n"] +[617.734068, "o", "1054\t\tpushl\t%eax\t\t\t/* pt_regs->orig_ax */\r\n"] +[617.734278, "o", "(gdb) "] +[619.7212, "o", "#"] +[619.906381, "o", " "] +[620.188614, "o", "s"] +[620.422791, "o", "t"] +[620.477236, "o", "e"] +[620.659615, "o", "p"] +[621.259597, "o", " "] +[621.492054, "o", "t"] +[621.59708, "o", "h"] +[621.685757, "o", "r"] +[621.736727, "o", "o"] +[621.981907, "o", "u"] +[622.322864, "o", "\b\u001b[K"] +[622.460272, "o", "\b\u001b[K"] +[622.594567, "o", "\b\u001b[K"] +[622.718254, "o", "\b\u001b[K"] +[622.947025, "o", "r"] +[623.072527, "o", "o"] +[623.203739, "o", "u"] +[623.609466, "o", "\b\u001b[K"] +[623.751747, "o", "\b\u001b[K"] +[623.853592, "o", "\b\u001b[K"] +[624.129617, "o", "h"] +[624.399671, "o", "r"] +[624.477659, "o", "o"] +[624.624296, "o", "u"] +[624.734072, "o", "g"] +[624.834819, "o", "h"] +[624.960495, "o", " "] +[625.036927, "o", "t"] +[625.128099, "o", "h"] +[625.206121, "o", "e"] +[625.324744, "o", " "] +[625.524612, "o", "s"] +[625.726291, "o", "y"] +[625.819857, "o", "s"] +[625.995635, "o", "t"] 
+[626.050073, "o", "e"] +[626.118718, "o", "m"] +[626.245506, "o", " "] +[626.448433, "o", "c"] +[626.490398, "o", "a"] +[626.60857, "o", "l"] +[626.748098, "o", "l"] +[626.867184, "o", " "] +[627.249677, "o", "u"] +[627.324825, "o", "n"] +[627.753751, "o", "t"] +[627.862279, "o", "i"] +[627.950531, "o", "l"] +[628.071181, "o", " "] +[628.193645, "o", "t"] +[628.325551, "o", "h"] +[628.396454, "o", "e"] +[628.50338, "o", " "] +[628.954182, "o", "e"] +[629.072892, "o", "n"] +[629.169954, "o", "d"] +[629.907936, "o", "\r\n"] +[629.90828, "o", "(gdb) "] +[632.365504, "o", "d"] +[632.443146, "o", "e"] +[632.536882, "o", "l"] +[632.664697, "o", " "] +[632.875456, "o", "b"] +[633.037126, "o", "e"] +[633.128086, "o", "a"] +[633.208296, "o", "k"] +[633.771397, "o", "\b\u001b[K"] +[633.909143, "o", "\b\u001b[K"] +[634.023021, "o", "\b\u001b[K"] +[634.081351, "o", "r"] +[634.166271, "o", "e"] +[634.258451, "o", "a"] +[634.315484, "o", "k"] +[637.580479, "o", "\r\n"] +[637.580605, "o", "Delete all breakpoints? 
(y or n) "] +[638.161286, "o", "y"] +[638.358828, "o", "\r\n"] +[638.359126, "o", "(gdb) "] +[638.816338, "o", "n"] +[639.01657, "o", "\r\n"] +[639.02752, "o", "\u001b[33mentry_INT80_32\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1056\r\n1056\t\tSAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1\t/* save rest */\r\n"] +[639.027905, "o", "(gdb) "] +[639.989246, "o", "\r\n"] +[640.089993, "o", "\u001b[33mentry_INT80_32\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1058\r\n1058\t\tmovl\t%esp, %eax\r\n(gdb) "] +[640.575852, "o", "\r\n"] +[640.584316, "o", "1059\t\tcall\tdo_int80_syscall_32\r\n"] +[640.584404, "o", "(gdb) "] +[641.020396, "o", "\r\n"] +[641.043758, "o", "1064\t\tSWITCH_TO_ENTRY_STACK\r\n"] +[641.043903, "o", "(gdb) "] +[642.078973, "o", "\r\n"] +[642.131705, "o", "\u001b[33mentry_INT80_32\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1065\r\n1065\t\tCHECK_AND_APPLY_ESPFIX\r\n(gdb) "] +[643.550621, "o", "\r\n"] +[643.569874, "o", "1068\t\tSWITCH_TO_USER_CR3 scratch_reg=%eax\r\n(gdb) "] +[645.089643, "o", "\r\n"] +[645.097692, "o", "1073\t\tRESTORE_REGS pop=4\t\t\t# skip orig_eax/error_code\r\n"] +[645.09795, "o", "(gdb) "] +[646.112248, "o", "\r\n"] +[646.177685, "o", "\u001b[33mentry_INT80_32\u001b[m () at \u001b[32march/x86/entry/entry_32.S\u001b[m:1080\r\n1080\t\tINTERRUPT_RETURN\r\n"] +[646.177724, "o", "(gdb) "] +[650.665768, "o", "\r\n"] +[650.685116, "o", "\u001b[34m0x448d167d\u001b[m in \u001b[33m??\u001b[m ()\r\n"] +[650.686558, "o", "(gdb) "] +[655.046392, "o", "#"] +[655.210079, "o", " "] +[655.404159, "o", "t"] +[655.481148, "o", "h"] +[655.531965, "o", "i"] +[655.669289, "o", "s"] +[655.756962, "o", " "] +[656.023425, "o", "l"] +[656.230497, "o", "o"] +[656.350096, "o", "o"] +[656.430194, "o", "k"] +[656.543202, "o", "s"] +[656.614577, "o", " "] +[656.747584, "o", "l"] +[656.90921, "o", "i"] +[657.044202, "o", "k"] +[657.15373, "o", "e"] +[657.207156, "o", " "] +[657.321091, "o", "a"] +[657.405793, "o", " "] 
+[657.735833, "o", "u"] +[657.828575, "o", "s"] +[657.933278, "o", "e"] +[658.002691, "o", "r"] +[658.288285, "o", " "] +[658.450784, "o", "s"] +[658.550787, "o", "p"] +[658.610481, "o", "a"] +[658.76577, "o", "c"] +[658.84823, "o", "e"] +[658.919578, "o", " "] +[658.996235, "o", "a"] +[659.117812, "o", "d"] +[659.253844, "o", "d"] +[659.433193, "o", "r"] +[659.48534, "o", "e"] +[659.60751, "o", "s"] +[659.754908, "o", "s"] +[659.888093, "o", ","] +[659.977163, "o", " "] +[660.161783, "o", "l"] +[660.22061, "o", "e"] +[660.384788, "o", "t"] +[660.593814, "o", "s"] +[660.725078, "o", " "] +[661.244123, "o", "c"] +[661.390876, "o", "h"] +[661.490006, "o", "e"] +[661.556587, "o", "c"] +[661.641873, "o", "k"] +[661.806617, "o", " "] +[662.017518, "o", "t"] +[662.09433, "o", "h"] +[662.20504, "o", "e"] +[662.271583, "o", " "] +[662.406577, "o", "c"] +[662.496568, "o", "o"] +[662.695698, "o", "d"] +[663.60129, "o", "e"] +[663.748373, "o", " "] +[663.955006, "o", "s"] +[664.114279, "o", "e"] +[664.25419, "o", "l"] +[664.303753, "o", "e"] +[664.381874, "o", "c"] +[664.587642, "o", "t"] +[664.674974, "o", "o"] +[664.749721, "o", "r"] +[665.224982, "o", "\r\n"] +[665.225096, "o", "(gdb) "] +[665.695276, "o", "p"] +[665.792372, "o", "r"] +[665.897882, "o", "i"] +[665.94532, "o", "n"] +[666.023454, "o", "t"] +[666.0908, "o", " "] +[666.432583, "o", "x"] +[666.832639, "o", "\b\u001b[K"] +[667.03479, "o", "/"] +[667.144708, "o", "x"] +[667.249841, "o", " "] +[667.530074, "o", "$"] +[667.849312, "o", "c"] +[667.93295, "o", "s"] +[668.402085, "o", "\r\n"] +[668.403271, "o", "$11 = 0x73\r\n"] +[668.404009, "o", "(gdb) "] +[672.252423, "o", "#"] +[673.194564, "o", " "] +[673.675156, "o", "t"] +[673.760362, "o", "h"] +[673.882635, "o", "e"] +[673.994028, "o", " "] +[674.513812, "o", "p"] +[674.706893, "o", "r"] +[674.804439, "o", "i"] +[674.94746, "o", "v"] +[675.010894, "o", "i"] +[675.188625, "o", "l"] +[675.265228, "o", "e"] +[675.511041, "o", "g"] +[675.587286, "o", "e"] 
+[675.739396, "o", " "] +[675.935171, "o", "l"] +[676.005954, "o", "e"] +[676.202384, "o", "v"] +[676.294477, "o", "e"] +[676.354083, "o", "l"] +[676.485263, "o", " "] +[676.602646, "o", "i"] +[676.704354, "o", "s"] +[676.784185, "o", " "] +[676.981088, "o", "3"] +[677.195168, "o", ","] +[677.326062, "o", " "] +[677.604308, "o", "s"] +[677.666906, "o", "o"] +[677.785998, "o", " "] +[678.506076, "o", "i"] +[678.655711, "o", "n"] +[678.780883, "o", "e"] +[678.940158, "o", "e"] +[679.044873, "o", "d"] +[679.222637, "o", " "] +[679.56275, "o", "\b\u001b[K"] +[679.698968, "o", "\b\u001b[K"] +[679.830282, "o", "\b\u001b[K"] +[679.983681, "o", "\b\u001b[K"] +[680.311095, "o", "d"] +[680.410929, "o", "e"] +[680.580716, "o", "e"] +[680.692035, "o", "d"] +[680.797568, "o", " "] +[680.979638, "o", "w"] +[681.042251, "o", "e"] +[681.132403, "o", " "] +[681.247785, "o", "a"] +[681.393117, "o", "r"] +[681.475054, "o", "e"] +[681.54512, "o", " "] +[682.162527, "o", "i"] +[682.280131, "o", "n"] +[682.527043, "o", "u"] +[682.867049, "o", "\b\u001b[K"] +[682.971752, "o", " "] +[683.475284, "o", "\b\u001b[K"] +[683.616692, "o", "\b\u001b[K"] +[683.74114, "o", "\b\u001b[K"] +[683.998681, "o", "r"] +[684.057732, "o", "u"] +[684.122526, "o", "n"] +[684.271564, "o", "i"] +[684.326317, "o", "n"] +[684.689606, "o", "\b\u001b[K"] +[684.828735, "o", "\b\u001b[K"] +[685.172894, "o", "n"] +[685.301489, "o", "i"] +[685.385224, "o", "n"] +[685.524585, "o", "g"] +[685.652293, "o", " "] +[685.800678, "o", "i"] +[685.887831, "o", "n"] +[685.944332, "o", " "] +[686.069441, "o", "u"] +[686.162036, "o", "s"] +[686.243565, "o", "e"] +[686.317807, "o", "r"] +[686.376218, "o", " "] +[686.540215, "o", "m"] +[686.598979, "o", "o"] +[686.625005, "o", "d"] +[686.723705, "o", "e"] +[686.778902, "o", "\r\n"] +[686.779014, "o", "(gdb) "] +[692.53241, "o", "#"] +[692.721453, "o", " "] +[692.892056, "o", "l"] +[693.024663, "o", "e"] +[693.193954, "o", "t"] +[693.371269, "o", "s"] +[693.597078, "o", "g"] 
+[693.885546, "o", "\b\u001b[K"] +[693.978727, "o", " "] +[694.049596, "o", "g"] +[694.108902, "o", "e"] +[694.247468, "o", "t"] +[694.345634, "o", " "] +[694.511566, "o", "t"] +[694.587774, "o", "h"] +[694.6909, "o", "e"] +[694.771112, "o", " "] +[694.863811, "o", "s"] +[694.977826, "o", "e"] +[695.074086, "o", "l"] +[695.155131, "o", "e"] +[695.229863, "o", "c"] +[695.442999, "o", "t"] +[695.821135, "o", "o"] +[695.950336, "o", "r"] +[696.053426, "o", "\r\n"] +[696.053731, "o", "(gdb) "] +[696.74081, "o", "p"] +[696.882904, "o", "r"] +[696.95492, "o", "i"] +[697.03238, "o", "n"] +[697.104735, "o", "t"] +[697.202097, "o", " "] +[697.420614, "o", "."] +[697.660736, "o", "x"] +[698.088723, "o", "\b\u001b[K"] +[698.214544, "o", "\b\u001b[K"] +[698.416446, "o", "/"] +[698.57942, "o", "x"] +[698.684619, "o", " "] +[699.479828, "o", "$"] +[699.980736, "o", "c"] +[700.065109, "o", "s"] +[700.396996, "o", ">"] +[700.526326, "o", ">"] +[700.713679, "o", "3"] +[700.79379, "o", "2"] +[701.128706, "o", "\b\u001b[K"] +[701.345675, "o", "\b\u001b[K"] +[701.604745, "o", "3"] +[702.59867, "o", "\r\n"] +[702.599922, "o", "$12 = 0xe\r\n"] +[702.600842, "o", "(gdb) "] +[705.06673, "o", "\u0007"] +[705.324321, "o", "print /x $cs>>3"] +[705.503241, "o", "\b"] +[706.003691, "o", "\b"] +[706.034303, "o", "\b"] +[706.064471, "o", "\b"] +[706.095291, "o", "\b"] +[706.125113, "o", "\b"] +[706.155725, "o", "\b"] +[706.523474, "o", "\u001b[C"] +[706.779876, "o", "\b\u001b[1P$cs>>3\b\b\b\b\b\b"] +[706.91489, "o", "\b\u001b[1P$cs>>3\b\b\b\b\b\b"] +[707.042101, "o", "\b\u001b[1P$cs>>3\b\b\b\b\b\b"] +[707.262476, "o", "\r\n"] +[707.263867, "o", "$13 = 14\r\n"] +[707.264707, "o", "(gdb) "] +[718.972351, "o", "s"] +[719.055223, "o", "e"] +[719.204699, "o", "t"] +[719.38757, "o", " "] +[719.750857, "o", "$"] +[721.845151, "o", "u"] +[721.971635, "o", "s"] +[722.053169, "o", "e"] +[722.146473, "o", "r"] +[722.392609, "o", "_"] +[729.373498, "o", "c"] +[729.521886, "o", "o"] +[729.630095, "o", "d"] 
+[729.712985, "o", "e"] +[730.34708, "o", "="] +[742.363689, "o", "((uint64_t*)$gdtr)["] +[743.594954, "o", "1"] +[744.208712, "o", "4"] +[745.162637, "o", "]"] +[746.699323, "o", "\r\n"] +[746.707477, "o", "(gdb) "] +[747.302209, "o", "p"] +[747.409252, "o", "r"] +[747.545136, "o", "i"] +[747.603175, "o", "n"] +[747.629656, "o", "t"] +[747.765806, "o", " "] +[747.957777, "o", "/"] +[748.073233, "o", "z"] +[748.193756, "o", " "] +[748.536198, "o", "$"] +[748.849674, "o", "u"] +[748.981324, "o", "s"] +[749.09022, "o", "e"] +[749.141073, "o", "r"] +[749.742629, "o", "_"] +[750.000935, "o", "c"] +[750.098552, "o", "o"] +[750.20873, "o", "d"] +[750.312852, "o", "e"] +[750.456634, "o", "\r\n"] +[750.457831, "o", "$14 = 0x00cffa000000ffff\r\n"] +[750.458525, "o", "(gdb) "] +[760.922271, "o", "#"] +[761.152125, "o", " "] +[761.321912, "o", "l"] +[761.483755, "o", "e"] +[761.677907, "o", "t"] +[761.977657, "o", "s"] +[762.451478, "o", " "] +[763.858006, "o", "p"] +[764.048606, "o", "r"] +[764.12853, "o", "i"] +[764.205234, "o", "n"] +[764.281491, "o", "t"] +[764.423725, "o", " "] +[764.529463, "o", "t"] +[764.642091, "o", "h"] +[764.76313, "o", "e"] +[764.906969, "o", " "] +[766.022913, "o", "b"] +[766.141544, "o", "a"] +[766.245647, "o", "s"] +[766.449316, "o", "e"] +[768.319525, "o", "\r\n"] +[768.319645, "o", "(gdb) "] +[809.07354, "o", "print /x (($user_code>>32)&0xFF000000)|(($kernel_code>>16)&0x00FFFFFF)\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[809.256141, "o", "\u001b[C"] +[809.75659, "o", "\u001b[C"] +[809.786818, "o", "\u001b[C"] +[809.817626, "o", "\u001b[C"] +[809.848195, "o", "\u001b[C"] +[809.878997, "o", "\u001b[C"] +[809.909101, "o", "\u001b[C"] +[809.940364, "o", "\u001b[C"] +[809.970702, "o", "\u001b[C"] +[810.001365, "o", "\u001b[C"] +[810.031559, "o", "\u001b[C"] +[810.062018, "o", "\u001b[C"] +[810.093057, "o", 
"\u001b[C"] +[810.123449, "o", "\u001b[C"] +[810.153727, "o", "\u001b[C"] +[810.184235, "o", "\u001b[C"] +[810.214952, "o", "\u001b[C"] +[810.246219, "o", "\u001b[C"] +[810.276599, "o", "\u001b[C"] +[810.306915, "o", "\u001b[C"] +[810.336951, "o", "\u001b[C"] +[810.367283, "o", "\u001b[C"] +[810.397779, "o", "\u001b[C"] +[810.428429, "o", "\u001b[C"] +[810.458566, "o", "\u001b[C"] +[810.488834, "o", "\u001b[C"] +[810.519269, "o", "\u001b[C"] +[810.549961, "o", "\u001b[C"] +[810.580356, "o", "\u001b[C"] +[810.611188, "o", "\u001b[C"] +[810.641482, "o", "\u001b[C"] +[810.671188, "o", "\u001b[C"] +[810.702462, "o", "\u001b[C"] +[810.975545, "o", "\b"] +[811.264615, "o", "\b\u001b[1P_code>>16)&0x00FFFFFF)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[811.40386, "o", "\b\u001b[1P_code>>16)&0x00FFFFFF)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[811.523682, "o", "\b\u001b[1P_code>>16)&0x00FFFFFF)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[811.647806, "o", "\b\u001b[1P_code>>16)&0x00FFFFFF)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[811.784493, "o", "\b\u001b[1P_code>>16)&0x00FFFFFF)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[811.945961, "o", "\b\u001b[1P_code>>16)&0x00FFFFFF)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[812.551148, "o", "u_code>>16)&0x00FFFFFF)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[812.666049, "o", "s_code>>16)&0x00FFFFFF)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[812.733865, "o", "e_code>>16)&0x00FFFFFF)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[812.804219, "o", "r_code>>16)&0x00FFFFFF)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[813.471435, "o", "\r\n"] +[813.472787, "o", "$15 = 0x0\r\n"] +[813.473756, "o", "(gdb) "] +[815.049124, "o", "#"] +[815.227758, "o", " "] +[816.113222, "o", "a"] +[816.244562, "o", "n"] +[816.316434, "o", "d"] +[816.425143, "o", " "] +[816.637818, "o", "t"] +[816.711367, "o", "h"] +[816.826612, "o", "e"] +[816.890874, "o", " "] +[817.064052, "o", "l"] +[817.240246, "o", "i"] 
+[817.356865, "o", "m"] +[817.432586, "o", "i"] +[817.56418, "o", "t"] +[818.009451, "o", "\r\n"] +[818.009561, "o", "(gdb) "] +[818.432237, "o", "p"] +[818.564952, "o", "r"] +[818.698167, "o", "i"] +[818.724308, "o", "n"] +[818.826911, "o", "t"] +[818.919965, "o", " "] +[819.207993, "o", "/"] +[819.381762, "o", "x"] +[819.508217, "o", " "] +[820.316542, "o", "$"] +[820.975916, "o", "u"] +[821.115032, "o", "s"] +[821.207129, "o", "e"] +[821.270953, "o", "r"] +[821.414288, "o", "_"] +[821.602876, "o", "c"] +[821.687005, "o", "o"] +[821.816962, "o", "d"] +[821.884079, "o", "e"] +[823.356186, "o", "&"] +[824.379479, "o", "0"] +[824.519435, "o", "x"] +[825.853806, "o", "f"] +[826.065333, "o", "f"] +[826.251408, "o", "f"] +[826.528401, "o", "f"] +[827.075458, "o", ")"] +[827.392703, "o", "\b"] +[827.893571, "o", "\b"] +[827.923588, "o", "\b"] +[827.953933, "o", "\b"] +[827.984307, "o", "\b"] +[828.014987, "o", "\b"] +[828.046374, "o", "\b"] +[828.076921, "o", "\b"] +[828.10759, "o", "\b"] +[828.137907, "o", "\b"] +[828.1682, "o", "\b"] +[828.198418, "o", "\b"] +[828.229355, "o", "\b"] +[828.259187, "o", "\b"] +[828.289623, "o", "\b"] +[828.320693, "o", "\b"] +[828.35114, "o", "\b"] +[828.382249, "o", "\b"] +[829.085558, "o", "($user_code&0xffff)\r\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[829.708728, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[830.400685, "o", "*"] +[830.93294, "o", "4"] +[831.02888, "o", "0"] +[831.20694, "o", "9"] +[831.583008, "o", "5"] +[832.196999, "o", "\b\u001b[K"] +[832.544697, "o", "6"] +[833.211351, "o", "\r\n"] +[833.21263, "o", "$16 = 0xffff000\r\n"] +[833.213594, "o", "(gdb) "] +[836.747221, "o", "#"] +[836.966915, "o", " "] +[837.122971, "o", "l"] +[837.264712, "o", "i"] +[837.415757, "o", "k"] +[837.487956, "o", "e"] +[837.582304, "o", " "] 
+[837.796811, "o", "b"] +[837.86537, "o", "e"] +[838.009048, "o", "f"] +[838.13558, "o", "o"] +[838.276445, "o", "r"] +[838.322393, "o", "e"] +[838.696696, "o", ","] +[838.845816, "o", " "] +[839.780549, "o", "b"] +[839.849384, "o", "a"] +[839.927552, "o", "s"] +[840.041133, "o", "e"] +[840.157198, "o", " "] +[840.376833, "o", "="] +[840.582513, "o", " "] +[840.751071, "o", "0"] +[840.957989, "o", " "] +[841.359805, "o", "\b\u001b[K"] +[841.654447, "o", " "] +[842.09277, "o", "\b\u001b[K"] +[842.473777, "o", ","] +[842.5702, "o", " "] +[842.730569, "o", "l"] +[842.881001, "o", "i"] +[843.046392, "o", "m"] +[843.149457, "o", "i"] +[843.28806, "o", "t"] +[843.603712, "o", "="] +[844.085459, "o", "4"] +[844.905693, "o", "G"] +[845.388067, "o", "\r\n(gdb) "] +[846.021339, "o", "#"] +[846.1873, "o", " "] +[846.362737, "o", "f"] +[846.424606, "o", "i"] +[846.539695, "o", "n"] +[847.063757, "o", "a"] +[847.427324, "o", "l"] +[847.539554, "o", "l"] +[847.791174, "o", "y"] +[847.863453, "o", " "] +[848.073511, "o", "l"] +[848.146045, "o", "e"] +[848.331511, "o", "t"] +[848.502747, "o", "s"] +[848.612159, "o", " "] +[848.811446, "o", "p"] +[849.033555, "o", "i"] +[849.47643, "o", "\b\u001b[K"] +[849.592317, "o", "r"] +[849.689119, "o", "i"] +[849.750955, "o", "n"] +[849.865499, "o", " "] +[849.912217, "o", "t"] +[850.068171, "o", "h"] +[850.185347, "o", "t"] +[850.663395, "o", "e"] +[850.892227, "o", "\b\u001b[K"] +[851.01506, "o", "\b\u001b[K"] +[851.090625, "o", "e"] +[851.184358, "o", " "] +[852.40202, "o", "p"] +[852.541371, "o", "r"] +[852.627033, "o", "i"] +[852.852467, "o", "v"] +[853.112142, "o", "\b\u001b[K"] +[853.234241, "o", "\b\u001b[K"] +[853.353082, "o", "\b\u001b[K"] +[853.454046, "o", "r"] +[853.673116, "o", "\b\u001b[K"] +[853.791573, "o", "\b\u001b[K"] +[853.958148, "o", "r"] +[854.049767, "o", "e"] +[854.151012, "o", "q"] +[854.43396, "o", "u"] +[854.480219, "o", "i"] +[854.525939, "o", "r"] +[854.602678, "o", "e"] +[854.764555, "o", "d"] +[854.917843, 
"o", " "] +[855.307316, "o", "p"] +[855.579807, "o", "r"] +[855.701494, "o", "i"] +[855.941647, "o", "v"] +[856.022055, "o", "i"] +[856.206419, "o", "l"] +[856.294774, "o", "e"] +[856.534217, "o", "g"] +[856.613545, "o", "e"] +[856.970087, "o", "\r\n"] +[856.970197, "o", "(gdb) "] +[858.880707, "o", "p"] +[859.008756, "o", "r"] +[859.084916, "o", "i"] +[859.148296, "o", "n"] +[859.194996, "o", "t"] +[859.279003, "o", " "] +[859.509048, "o", "/"] +[859.653839, "o", "x"] +[859.789865, "o", " "] +[860.019894, "o", "$"] +[861.401365, "o", "u"] +[861.442956, "o", "s"] +[861.564205, "o", "e"] +[861.621157, "o", "r"] +[861.743186, "o", "_"] +[861.895258, "o", "c"] +[862.01297, "o", "o"] +[862.137427, "o", "d"] +[862.21639, "o", "e"] +[862.540725, "o", ">"] +[862.850378, "o", ">"] +[864.616721, "o", "4"] +[864.804028, "o", "5"] +[865.120143, "o", ")"] +[865.47208, "o", "\b"] +[865.9722, "o", "\b"] +[866.002112, "o", "\b"] +[866.032795, "o", "\b"] +[866.063313, "o", "\b"] +[866.093033, "o", "\b"] +[866.123273, "o", "\b"] +[866.153238, "o", "\b"] +[866.183841, "o", "\b"] +[866.214712, "o", "\b"] +[866.245027, "o", "\b"] +[866.275735, "o", "\b"] +[866.306467, "o", "\b"] +[866.569002, "o", "\b"] +[866.786537, "o", "\b"] +[867.202622, "o", "($user_code>>45)\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[867.699393, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[868.510852, "o", "&"] +[869.114958, "o", "3"] +[869.282934, "o", "\r\n"] +[869.284128, "o", "$17 = 0x3\r\n"] +[869.28494, "o", "(gdb) "] +[870.908449, "o", "#"] +[871.210682, "o", " "] +[872.957141, "o", "a"] +[873.042207, "o", "s"] +[873.160801, "o", " "] +[873.311215, "o", "e"] +[873.710163, "o", "x"] +[873.882767, "o", "p"] +[873.963709, "o", "e"] +[874.171292, "o", "c"] +[874.382806, "o", "t"] +[874.467515, "o", "e"] +[874.657769, "o", "d"] +[874.790228, "o", ","] +[874.907512, "o", " "] +[875.097689, "o", "p"] +[875.201934, "o", "r"] 
+[875.282314, "o", "i"] +[875.630355, "o", "v"] +[875.863496, "o", "i"] +[876.094037, "o", "l"] +[876.277817, "o", "e"] +[876.501288, "o", "g"] +[876.578121, "o", "e"] +[876.773456, "o", " "] +[876.998859, "o", "3"] +[877.119419, "o", " "] +[877.673306, "o", "="] +[877.818837, "o", " "] +[878.24644, "o", "u"] +[878.372011, "o", "s"] +[878.430652, "o", "e"] +[878.495114, "o", "r"] +[878.622827, "o", " "] +[879.099481, "o", "o"] +[879.154198, "o", "d"] +[879.223305, "o", "e"] +[879.476677, "o", "\b\u001b[K"] +[879.606628, "o", "\b\u001b[K"] +[879.778496, "o", "\b\u001b[K"] +[880.000743, "o", "m"] +[880.050628, "o", "o"] +[880.127009, "o", "d"] +[880.191324, "o", "e"] +[880.418045, "o", "\r\n"] +[880.418349, "o", "(gdb) "] diff --git a/refs/pull/405/merge/_images/skb.png b/refs/pull/405/merge/_images/skb.png new file mode 100644 index 00000000..db956dc1 Binary files /dev/null and b/refs/pull/405/merge/_images/skb.png differ diff --git a/refs/pull/405/merge/_images/skb1.png b/refs/pull/405/merge/_images/skb1.png new file mode 100644 index 00000000..db956dc1 Binary files /dev/null and b/refs/pull/405/merge/_images/skb1.png differ diff --git a/refs/pull/405/merge/_images/slab-coloring.png b/refs/pull/405/merge/_images/slab-coloring.png new file mode 100644 index 00000000..1391ce55 Binary files /dev/null and b/refs/pull/405/merge/_images/slab-coloring.png differ diff --git a/refs/pull/405/merge/_images/slab-coloring1.png b/refs/pull/405/merge/_images/slab-coloring1.png new file mode 100644 index 00000000..1391ce55 Binary files /dev/null and b/refs/pull/405/merge/_images/slab-coloring1.png differ diff --git a/refs/pull/405/merge/_images/slab-detailed-arch.png b/refs/pull/405/merge/_images/slab-detailed-arch.png new file mode 100644 index 00000000..77a0cc81 Binary files /dev/null and b/refs/pull/405/merge/_images/slab-detailed-arch.png differ diff --git a/refs/pull/405/merge/_images/slab-detailed-arch1.png b/refs/pull/405/merge/_images/slab-detailed-arch1.png new file mode 
100644 index 00000000..77a0cc81 Binary files /dev/null and b/refs/pull/405/merge/_images/slab-detailed-arch1.png differ diff --git a/refs/pull/405/merge/_images/slab-object-descriptors.png b/refs/pull/405/merge/_images/slab-object-descriptors.png new file mode 100644 index 00000000..dbeab55b Binary files /dev/null and b/refs/pull/405/merge/_images/slab-object-descriptors.png differ diff --git a/refs/pull/405/merge/_images/slab-object-descriptors1.png b/refs/pull/405/merge/_images/slab-object-descriptors1.png new file mode 100644 index 00000000..dbeab55b Binary files /dev/null and b/refs/pull/405/merge/_images/slab-object-descriptors1.png differ diff --git a/refs/pull/405/merge/_images/slab-overview.png b/refs/pull/405/merge/_images/slab-overview.png new file mode 100644 index 00000000..90086d4b Binary files /dev/null and b/refs/pull/405/merge/_images/slab-overview.png differ diff --git a/refs/pull/405/merge/_images/slab-overview1.png b/refs/pull/405/merge/_images/slab-overview1.png new file mode 100644 index 00000000..90086d4b Binary files /dev/null and b/refs/pull/405/merge/_images/slab-overview1.png differ diff --git a/refs/pull/405/merge/_images/syscalls-inspection.cast b/refs/pull/405/merge/_images/syscalls-inspection.cast new file mode 100644 index 00000000..ca749a42 --- /dev/null +++ b/refs/pull/405/merge/_images/syscalls-inspection.cast @@ -0,0 +1,1389 @@ +{"title": "System Call Inspection", "height": 24, "idle_time_limit": 1.0, "version": 2, "env": {"SHELL": "/bin/bash", "TERM": "xterm-256color"}, "width": 80, "timestamp": 1519682642} +[0.02593, "o", "\u001b]0;tavi@lktp: ~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ "] +[2.585046, "o", "#"] +[2.812131, "o", " "] +[2.94729, "o", "a"] +[3.187178, "o", "t"] +[3.308689, "o", "t"] +[3.380836, "o", "a"] +[3.587609, "o", "c"] +[3.660319, "o", "h"] +[3.74021, "o", " "] +[3.935004, "o", "g"] +[4.157892, "o", "d"] +[4.34303, "o", "b"] +[4.527084, "o", " "] 
+[4.711204, "o", "t"] +[4.768411, "o", "o"] +[4.85479, "o", " "] +[5.081524, "o", "V"] +[5.193867, "o", "M"] +[5.366551, "o", "\r\n"] +[5.367316, "o", "\u001b]0;tavi@lktp: ~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ "] +[6.562559, "o", "m"] +[6.655394, "o", "a"] +[6.702303, "o", "k"] +[6.826912, "o", "e"] +[6.898232, "o", " "] +[7.081019, "o", "g"] +[7.184305, "o", "d"] +[7.250501, "o", "b"] +[7.464891, "o", "\r\n"] +[7.487695, "o", "gdb -ex \"target remote localhost:1234\" /home/tavi/src/linux/vmlinux\r\n"] +[7.552276, "o", "GNU gdb (Ubuntu 7.11.1-0ubuntu1~16.5) 7.11.1\r\nCopyright (C) 2016 Free Software Foundation, Inc.\r\nLicense GPLv3+: GNU GPL version 3 or later \r\nThis is free software: you are free to change and redistribute it.\r\nThere is NO WARRANTY, to the extent permitted by law. Type \"show copying\"\r\nand \"show warranty\" for details.\r\nThis GDB was configured as \"x86_64-linux-gnu\".\r\nType \"show configuration\" for configuration details.\r\nFor bug reporting instructions, please see:\r\n.\r\nFind the GDB manual and other documentation resources online at:\r\n.\r\nFor help, type \"help\".\r\nType \"apropos word\" to search for commands related to \"word\"...\r\n"] +[7.552711, "o", "Reading symbols from /home/tavi/src/linux/vmlinux..."] +[8.0237, "o", "done.\r\n"] +[8.040804, "o", "Remote debugging using localhost:1234\r\n"] +[8.049686, "o", "default_idle () at arch/x86/kernel/process.c:357\r\n"] +[8.049841, "o", "357\t}\r\n"] +[8.049944, "o", "(gdb) "] +[8.617598, "o", "b"] +[8.712276, "o", "t"] +[8.906112, "o", "\r\n"] +[8.907612, "o", "#0 default_idle () at arch/x86/kernel/process.c:357\r\n#1 0xc101fcfd in arch_cpu_idle () at arch/x86/kernel/process.c:346\r\n"] +[8.916461, "o", "#2 0xc14639f9 in default_idle_call () at kernel/sched/idle.c:98\r\n"] +[8.916818, "o", "#3 0xc107b2a5 in cpuidle_idle_call () at kernel/sched/idle.c:156\r\n#4 do_idle () at kernel/sched/idle.c:246\r\n"] 
+[8.923466, "o", "#5 0xc107b5b5 in cpu_startup_entry (state=)\r\n at kernel/sched/idle.c:351\r\n#6 0xc145d643 in rest_init () at init/main.c:436\r\n"] +[8.923757, "o", "#7 0xc1614acb in start_kernel () at init/main.c:716\r\n"] +[8.926458, "o", "#8 0xc161424a in i386_start_kernel () at arch/x86/kernel/head32.c:56\r\n"] +[8.926871, "o", "#9 0xc10001d3 in startup_32_smp () at arch/x86/kernel/head_32.S:363\r\n"] +[8.927813, "o", "#10 0x00000000 in ?? ()\r\n"] +[8.928515, "o", "(gdb) "] +[9.441926, "o", "#"] +[9.610515, "o", " "] +[10.164044, "o", "V"] +[10.268664, "o", "M"] +[10.453102, "o", " "] +[10.592012, "o", "i"] +[10.687218, "o", "s"] +[10.769038, "o", " "] +[10.88914, "o", "i"] +[11.031982, "o", "d"] +[11.171022, "o", "l"] +[11.339681, "o", "e"] +[11.541285, "o", "\r\n"] +[11.541408, "o", "(gdb) "] +[13.459643, "o", "#"] +[13.604705, "o", " "] +[13.787454, "o", "l"] +[13.867483, "o", "e"] +[14.028803, "o", "t"] +[14.196721, "o", "s"] +[14.299574, "o", " "] +[14.400587, "o", "a"] +[14.678709, "o", "t"] +[15.297907, "o", "\b\u001b[K"] +[15.474776, "o", "\b\u001b[K"] +[17.314512, "o", "a"] +[17.493688, "o", "d"] +[17.640039, "o", "d"] +[17.734576, "o", " "] +[17.836868, "o", "a"] +[17.94166, "o", " "] +[18.172955, "o", "b"] +[18.231003, "o", "r"] +[18.338597, "o", "e"] +[18.389443, "o", "a"] +[18.493356, "o", "k"] +[18.697004, "o", "p"] +[18.791969, "o", "o"] +[19.002137, "o", "i"] +[19.076693, "o", "n"] +[19.247079, "o", "t"] +[19.682963, "o", " "] +[19.920143, "o", "t"] +[20.019301, "o", "o"] +[20.13914, "o", " "] +[20.298365, "o", "a"] +[20.399756, "o", " "] +[20.543224, "o", "s"] +[20.654014, "o", "y"] +[20.722395, "o", "s"] +[20.875473, "o", "t"] +[20.949965, "o", "e"] +[21.03673, "o", "m"] +[21.105939, "o", " "] +[21.199805, "o", "c"] +[21.248673, "o", "a"] +[21.33164, "o", "l"] +[21.457598, "o", "l"] +[21.662139, "o", "\r\n"] +[21.662437, "o", "(gdb) "] +[23.558939, "o", "b"] +[23.610705, "o", "r"] +[23.671647, "o", "e"] +[23.830052, "o", "\u0007ak"] 
+[24.613391, "o", " "] +[25.766837, "o", "s"] +[25.878063, "o", "y"] +[25.959348, "o", "s"] +[26.26539, "o", "_"] +[26.884977, "o", "d"] +[26.936127, "o", "u"] +[27.021843, "o", "p"] +[27.318277, "o", "2"] +[27.598228, "o", "\r\n"] +[27.640182, "o", "Breakpoint 1 at 0xc1139210: file fs/file.c, line 912.\r\n"] +[27.64023, "o", "(gdb) "] +[28.770631, "o", "c"] +[29.000408, "o", "\r\nContinuing.\r\n"] +[29.585196, "o", "^Z"] +[29.585536, "o", "\r\n[1]+ Stopped make gdb\r\n"] +[29.586221, "o", "\u001b]0;tavi@lktp: ~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ "] +[30.715625, "o", "#"] +[30.828185, "o", " "] +[30.978622, "o", "c"] +[31.038514, "o", "o"] +[31.18384, "o", "n"] +[31.28793, "o", "n"] +[31.34898, "o", "e"] +[31.392867, "o", "c"] +[31.690793, "o", " "] +[32.237691, "o", "\b\u001b[K"] +[32.355048, "o", "t"] +[32.442939, "o", " "] +[32.588287, "o", "t"] +[32.918417, "o", "o"] +[33.039158, "o", " "] +[33.167914, "o", "t"] +[33.2546, "o", "h"] +[33.340674, "o", "e"] +[33.395216, "o", " "] +[33.6407, "o", "V"] +[33.72697, "o", "M"] +[33.917668, "o", "\r\n"] +[33.918502, "o", "\u001b]0;tavi@lktp: ~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ "] +[37.546829, "o", "m"] +[37.637743, "o", "i"] +[37.761726, "o", "n"] +[37.837263, "o", "i"] +[37.938906, "o", "c"] +[38.022622, "o", "o"] +[38.113482, "o", "m"] +[38.172694, "o", " "] +[38.308186, "o", "-"] +[38.54722, "o", "D"] +[38.6566, "o", " "] +[39.13904, "o", "s"] +[39.277557, "o", "e"] +[39.337429, "o", "r"] +[39.459585, "o", "ial.pts "] +[39.776685, "o", "\r\n"] +[39.780118, "o", "\u001b[!p\u001b[?3;4l\u001b[4l\u001b>\u001b[0m\u001b(B"] +[39.780371, "o", "\u001b[?1h\u001b=\u001b[H\u001b[2J"] +[39.781975, "o", "\u001b[?12l\u001b[?25h"] +[39.782204, "o", "\nWelcome to minicom 2.7\r\n\nOPTIONS: I18n \r\n"] +[39.782381, "o", "Compiled on Feb 7 2016, 13:37:27.\r\nPort serial.pts, 23:03:56\r\n\nPress 
CTRL-A Z for help on special keys\r\n\n"] +[40.619769, "o", "\n"] +[40.622796, "o", "root@qemux86:~# "] +[41.981219, "o", "#"] +[42.161306, "o", " "] +[42.413334, "o", "t"] +[42.58837, "o", "r"] +[42.674525, "o", "i"] +[43.334665, "o", "g"] +[43.464242, "o", "g"] +[43.537786, "o", "e"] +[43.650717, "o", "r"] +[43.838322, "o", " "] +[44.718033, "o", "d"] +[44.842141, "o", "u"] +[44.914998, "o", "p"] +[45.233999, "o", "2"] +[45.931475, "o", " "] +[46.078743, "o", "s"] +[46.175471, "o", "y"] +[46.248864, "o", "s"] +[46.892927, "o", "t"] +[46.987556, "o", "e"] +[47.185408, "o", "m"] +[47.28593, "o", " "] +[47.444084, "o", "c"] +[47.49419, "o", "a"] +[47.548442, "o", "l"] +[47.661648, "o", "l"] +[47.793085, "o", "\r\n"] +[47.79407, "o", "root@qemux86:~# "] +[48.389908, "o", "e"] +[48.469687, "o", "c"] +[48.566341, "o", "h"] +[48.637507, "o", "o"] +[48.761749, "o", " "] +[49.620761, "o", "a"] +[49.796805, "o", " "] +[50.159016, "o", ">"] +[50.287746, "o", " "] +[50.407382, "o", "/"] +[50.591362, "o", "t"] +[50.702265, "o", "m"] +[50.775619, "o", "p"] +[51.05656, "o", "/"] +[51.245617, "o", "x"] +[51.460523, "o", "\r\n"] +[52.650063, "o", "\u001b[0m\u001b(B\u001b[7m\u001b[24;1H\u001b[K\u001b[?12l\u001b[?25h"] +[52.650349, "o", "\u001b[?25lCTRL-A Z for help | 115200 8N1 | NOR | Minicom 2.7 | VT102 | Offline | rial.pts\u001b[?12l\u001b[?25h\u001b[13;1H"] +[52.886767, "o", "\u001b[24;1H\u001b[0m\u001b(B\u001b[?12l\u001b[?25h\u001b[H\u001b[2J\u001b[?1l\u001b>"] +[52.88713, "o", "Suspended. 
Type \"fg\" to resume.\r\n\r\n[2]+ Stopped minicom -D serial.pts\r\n"] +[52.887817, "o", "\u001b]0;tavi@lktp: ~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ "] +[53.689978, "o", "f"] +[53.888551, "o", "g"] +[53.990684, "o", " "] +[54.462791, "o", "1"] +[54.54925, "o", "\r\n"] +[54.549574, "o", "make gdb\r\n"] +[54.550655, "o", "\r\n"] +[54.558163, "o", "Breakpoint 1, SyS_dup2 (oldfd=3, newfd=1) at fs/file.c:912\r\n"] +[54.558198, "o", "912\tSYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)\r\n"] +[54.558292, "o", "(gdb) "] +[68.972448, "o", "#"] +[69.215133, "o", " "] +[70.145897, "o", "l"] +[70.226727, "o", "e"] +[70.397696, "o", "t"] +[70.589752, "o", "s"] +[70.722316, "o", " "] +[71.095534, "o", "e"] +[71.198966, "o", "x"] +[71.299745, "o", "a"] +[71.960882, "o", "m"] +[72.431415, "o", "\b\u001b[K"] +[72.541534, "o", "\b\u001b[K"] +[72.657555, "o", "\b\u001b[K"] +[72.763229, "o", "\b\u001b[K"] +[72.955358, "o", "b"] +[72.995208, "o", "a"] +[73.09843, "o", "c"] +[73.230889, "o", "k"] +[73.48259, "o", "t"] +[73.625035, "o", "r"] +[73.716815, "o", "a"] +[73.931987, "o", "c"] +[74.00581, "o", "e"] +[74.151293, "o", " "] +[74.244418, "o", "t"] +[74.36868, "o", "h"] +[74.435091, "o", "e"] +[74.852494, "o", " "] +[74.998958, "o", "s"] +[75.146618, "o", "y"] +[75.18531, "o", "s"] +[75.40006, "o", "t"] +[75.507571, "o", "e"] +[75.878345, "o", "m"] +[76.021645, "o", " "] +[76.139759, "o", "c"] +[76.199716, "o", "a"] +[76.298113, "o", "l"] +[76.406879, "o", "l"] +[76.499901, "o", " "] +[76.624196, "o", "f"] +[76.714473, "o", "l"] +[76.863719, "o", "o"] +[76.936706, "o", "w"] +[77.285863, "o", "\r\n"] +[77.286167, "o", "(gdb) "] +[77.609834, "o", "b"] +[77.67291, "o", "t"] +[77.908684, "o", "\r\n"] +[77.909971, "o", "#0 SyS_dup2 (oldfd=3, newfd=1) at fs/file.c:912\r\n"] +[77.910076, "o", "#1 0xc1001361 in do_syscall_32_irqs_on (regs=)\r\n at arch/x86/entry/common.c:327\r\n"] +[77.912731, "o", "#2 
do_int80_syscall_32 (regs=0xc7235fb4) at arch/x86/entry/common.c:341\r\n#3 0xc14645d3 in entry_INT80_32 () at arch/x86/entry/entry_32.S:544\r\n"] +[77.91529, "o", "#4 0x00000003 in ?? ()\r\n"] +[77.917663, "o", "#5 0x00000003 in ?? ()\r\n"] +[77.92062, "o", "#6 0x0a09e224 in ?? ()\r\n"] +[77.920927, "o", "Backtrace stopped: previous frame inner to this frame (corrupt stack?)\r\n(gdb) "] +[82.733885, "o", "f"] +[82.805597, "o", "r"] +[82.943042, "o", " "] +[83.158856, "o", "1"] +[83.338543, "o", "\r\n"] +[83.338863, "o", "#1 0xc1001361 in do_syscall_32_irqs_on (regs=)\r\n at arch/x86/entry/common.c:327\r\n"] +[83.339111, "o", "327\t\t\tregs->ax = ia32_sys_call_table[nr](\r\n(gdb) "] +[84.302882, "o", "l"] +[84.466961, "o", "i"] +[84.573508, "o", "s"] +[84.805097, "o", "t"] +[86.197885, "o", " "] +[87.456892, "o", "\r\n"] +[87.457044, "o", "322\t\t\t * It's possible that a 32-bit syscall implementation\r\n323\t\t\t * takes a 64-bit parameter but nonetheless assumes that\r\n324\t\t\t * the high bits are zero. 
Make sure we zero-extend all\r\n325\t\t\t * of the args.\r\n326\t\t\t */\r\n327\t\t\tregs->ax = ia32_sys_call_table[nr]("] +[87.457116, "o", "\r\n328\t\t\t\t(unsigned int)regs->bx, (unsigned int)regs->cx,\r\n329\t\t\t\t(unsigned int)regs->dx, (unsigned int)regs->si,\r\n330\t\t\t\t(unsigned int)regs->di, (unsigned int)regs->bp);\r\n331\t\t}\r\n"] +[87.457622, "o", "(gdb) "] +[90.858059, "o", "#"] +[90.97271, "o", " "] +[91.144448, "o", "t"] +[91.217524, "o", "h"] +[91.269411, "o", "i"] +[91.456193, "o", "s"] +[91.63016, "o", " "] +[91.782525, "o", "l"] +[91.948946, "o", "o"] +[92.056737, "o", "o"] +[92.152544, "o", "k"] +[92.246961, "o", "s"] +[92.353698, "o", " "] +[92.444179, "o", "l"] +[92.631606, "o", "i"] +[92.770763, "o", "k"] +[92.861851, "o", "e"] +[92.969579, "o", " "] +[93.109459, "o", "t"] +[93.210131, "o", "h"] +[93.324771, "o", "e"] +[93.466555, "o", " "] +[93.619111, "o", "s"] +[93.715197, "o", "y"] +[93.795165, "o", "s"] +[93.949435, "o", "t"] +[94.027965, "o", "e"] +[94.184473, "o", " "] +[94.3008, "o", "c"] +[94.342823, "o", "a"] +[94.445488, "o", "l"] +[94.568218, "o", "l"] +[94.656948, "o", " "] +[94.839902, "o", "d"] +[94.957893, "o", "i"] +[95.075175, "o", "s"] +[95.230277, "o", "p"] +[95.339237, "o", "a"] +[95.543096, "o", "t"] +[95.751003, "o", "c"] +[95.854341, "o", "h"] +[95.943425, "o", "e"] +[96.023733, "o", "r"] +[96.172254, "o", "\r\n"] +[96.172374, "o", "(gdb) "] +[101.058686, "o", "#"] +[101.204849, "o", " "] +[101.545024, "o", "n"] +[101.645855, "o", "r"] +[101.798058, "o", " "] +[101.950607, "o", "i"] +[102.056996, "o", "s"] +[102.134187, "o", " "] +[102.342601, "o", "t"] +[102.416898, "o", "h"] +[102.533298, "o", "e"] +[102.593287, "o", " "] +[102.719888, "o", "s"] +[102.900423, "o", "y"] +[102.94586, "o", "s"] +[103.162384, "o", "t"] +[103.241973, "o", "e"] +[103.574398, "o", "m"] +[103.675272, "o", " "] +[103.802496, "o", "c"] +[103.849596, "o", "a"] +[103.955005, "o", "l"] +[104.090651, "o", "l"] +[104.1248, "o", " "] 
+[104.318767, "o", "n"] +[104.403953, "o", "u"] +[104.607057, "o", "m"] +[104.809716, "o", "b"] +[104.882333, "o", "e"] +[104.945321, "o", "r"] +[105.169779, "o", "\r\n"] +[105.170195, "o", "(gdb) "] +[120.867099, "o", "l"] +[121.049682, "o", "i"] +[121.155454, "o", "s"] +[121.359302, "o", "t"] +[121.479029, "o", " "] +[121.619056, "o", "3"] +[121.715251, "o", "0"] +[121.836342, "o", "0"] +[122.442593, "o", "\r\n"] +[122.447014, "o", "295\t/*\r\n296\t * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does\r\n297\t * all entry and exit work and returns with IRQs off. This function is\r\n298\t * extremely hot in workloads that use it, and it's usually called from\r\n299\t * do_fast_syscall_32, so forcibly inline it to improve performance.\r\n300\t */\r\n"] +[122.447378, "o", "301\tstatic __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)\r\n302\t{\r\n303\t\tstruct thread_info *ti = current_thread_info();\r\n304\t\tunsigned int nr = (unsigned int)regs->orig_ax;\r\n(gdb) "] +[124.092652, "o", "#"] +[124.554034, "o", " "] +[124.797533, "o", "i"] +[124.905822, "o", "t"] +[125.030417, "o", " "] +[125.160131, "o", "i"] +[125.259278, "o", "s"] +[125.37996, "o", " "] +[125.517546, "o", "p"] +[125.669394, "o", "i"] +[125.760003, "o", "c"] +[125.875741, "o", "k"] +[125.996377, "o", "e"] +[126.076136, "o", "d"] +[126.247928, "o", " "] +[126.466909, "o", "u"] +[126.53385, "o", "p"] +[126.664159, "o", " "] +[127.47466, "o", "f"] +[127.507676, "o", "r"] +[127.594243, "o", "o"] +[127.686369, "o", "m"] +[127.793519, "o", " "] +[127.867138, "o", "t"] +[127.990397, "o", "h"] +[128.119207, "o", " "] +[128.507492, "o", "\b\u001b[K"] +[128.590147, "o", "e"] +[128.692867, "o", " "] +[128.889331, "o", "s"] +[129.214909, "o", "a"] +[129.981084, "o", "m"] +[130.065029, "o", "e"] +[130.186849, "o", " "] +[130.540104, "o", "s"] +[130.732803, "o", "t"] +[130.793955, "o", "r"] +[131.422146, "o", "c"] +[131.696522, "o", "t"] +[131.853467, "o", "u"] +[132.034292, "o", 
"r"] +[132.08244, "o", "e"] +[132.324049, "o", "\b\u001b[K"] +[132.443772, "o", "\b\u001b[K"] +[132.573348, "o", "\b\u001b[K"] +[132.694863, "o", "\b\u001b[K"] +[132.81051, "o", "\b\u001b[K"] +[132.997241, "o", "u"] +[133.088819, "o", "c"] +[133.337918, "o", "t"] +[133.454541, "o", "u"] +[133.569818, "o", "r"] +[133.609841, "o", "e"] +[133.711729, "o", " "] +[134.021296, "o", "("] +[134.244295, "o", "p"] +[134.364747, "o", "t"] +[134.518463, "o", "_"] +[134.694346, "o", "r"] +[134.756482, "o", "e"] +[135.004436, "o", "g"] +[135.409283, "o", "s"] +[135.67484, "o", ")"] +[136.719996, "o", "\r\n"] +[136.720311, "o", "(gdb) "] +[159.587179, "o", "#"] +[159.767425, "o", " "] +[159.943861, "o", "l"] +[160.036363, "o", "e"] +[160.198006, "o", "t"] +[160.366941, "o", "s"] +[160.481441, "o", " "] +[160.690788, "o", "i"] +[161.019933, "o", "n"] +[161.265835, "o", "p"] +[161.435021, "o", "s"] +[161.821488, "o", "\b\u001b[K"] +[161.951302, "o", "\b\u001b[K"] +[162.226603, "o", "s"] +[162.36373, "o", "p"] +[162.498729, "o", "e"] +[162.574127, "o", "c"] +[162.821589, "o", "t"] +[162.945206, "o", " "] +[163.051945, "o", "t"] +[163.219025, "o", "h"] +[163.276168, "o", "e"] +[163.387915, "o", " "] +[163.856566, "o", "r"] +[163.947525, "o", "e"] +[164.54179, "o", "g"] +[164.735443, "o", "s"] +[164.860377, "o", " "] +[164.983243, "o", "c"] +[165.068139, "o", "o"] +[165.142719, "o", "n"] +[165.325774, "o", "t"] +[165.347382, "o", "e"] +[165.53939, "o", "n"] +[165.632312, "o", "t"] +[165.845555, "o", "s"] +[166.537455, "o", "\r\n"] +[166.537632, "o", "(gdb) "] +[166.819034, "o", "p"] +[166.958081, "o", "r"] +[167.052344, "o", "i"] +[167.133431, "o", "n"] +[167.213977, "o", "t"] +[167.322155, "o", " "] +[167.662143, "o", "*"] +[167.894034, "o", "r"] +[167.957716, "o", "e"] +[168.123891, "o", "g"] +[168.275607, "o", "s"] +[168.368704, "o", "\r\n"] +[168.369137, "o", "value has been optimized out\r\n(gdb) "] +[169.223841, "o", "#"] +[169.68196, "o", " "] +[170.473422, "o", "o"] 
+[170.562498, "o", "p"] +[170.754795, "o", "t"] +[170.780246, "o", "i"] +[170.985415, "o", "m"] +[171.06, "o", "i"] +[171.118702, "o", "z"] +[171.326175, "o", "e"] +[171.422983, "o", "d"] +[171.566281, "o", " "] +[171.82724, "o", "b"] +[171.902983, "o", "y"] +[172.056031, "o", " "] +[172.291054, "o", "c"] +[172.382563, "o", "i"] +[172.439382, "o", "m"] +[172.571587, "o", "p"] +[172.870082, "o", "\b\u001b[K"] +[172.987523, "o", "\b\u001b[K"] +[173.101322, "o", "\b\u001b[K"] +[173.254872, "o", "o"] +[173.322597, "o", "m"] +[173.44459, "o", "p"] +[173.603123, "o", "i"] +[173.779322, "o", "l"] +[173.905269, "o", "e"] +[174.000955, "o", "r"] +[174.086025, "o", "."] +[174.209714, "o", "."] +[174.349851, "o", "."] +[174.538669, "o", " "] +[179.50067, "o", "g"] +[179.580322, "o", "o"] +[179.763011, "o", " "] +[179.903828, "o", "a"] +[180.013494, "o", " "] +[180.236946, "o", "f"] +[180.467494, "o", "r"] +[180.568763, "o", "a"] +[180.697886, "o", "m"] +[180.82554, "o", "e"] +[180.907141, "o", " "] +[181.169606, "o", "d"] +[181.262241, "o", "e"] +[181.42098, "o", "e"] +[181.616856, "o", "p"] +[181.793458, "o", "e"] +[181.910212, "o", "r"] +[182.544419, "o", "\r\n"] +[182.54482, "o", "(gdb) "] +[183.073511, "o", "f"] +[183.168588, "o", "r"] +[183.669953, "o", " "] +[191.37133, "o", "2"] +[191.540642, "o", "\r\n"] +[191.541499, "o", "#2 do_int80_syscall_32 (regs=0xc7235fb4) at arch/x86/entry/common.c:341\r\n"] +[191.541856, "o", "341\t\tdo_syscall_32_irqs_on(regs);\r\n(gdb) "] +[192.917131, "o", "p"] +[193.062137, "o", "r"] +[193.147118, "o", "i"] +[193.231132, "o", "n"] +[193.304903, "o", "t"] +[193.385087, "o", " "] +[193.915946, "o", "*"] +[194.135248, "o", "r"] +[194.23597, "o", "e"] +[194.394568, "o", "g"] +[194.566658, "o", "s"] +[194.653019, "o", "\r\n"] +[194.653763, "o", "$1 = {bx = 3, cx = 1, "] +[194.654076, "o", "dx = 3, si = 168423920, di = 168419876, bp = 168419336, \r\n ax = 4294967258, ds = 123, __dsh = 0, es = 123, __esh = 0, fs = 0, \r\n __fsh = 0, "] 
+[194.654371, "o", "gs = 0, __gsh = 0, orig_ax = 63, ip = 1150252833, cs = 115, \r\n __csh = 0, flags = 514, "] +[194.65463, "o", "sp = 3218117628, ss = 123, __ssh = 0}\r\n(gdb) "] +[198.397833, "o", "#"] +[198.589958, "o", " "] +[199.416287, "o", "\b\u001b[K"] +[199.54844, "o", "\b\u001b[K"] +[199.789439, "o", "p"] +[199.910455, "o", "r"] +[200.013253, "o", "i"] +[200.081967, "o", "n"] +[200.172051, "o", "t"] +[200.23888, "o", " "] +[200.344988, "o", "r"] +[200.443063, "o", "e"] +[200.636653, "o", "g"] +[200.903926, "o", "s"] +[201.297424, "o", "\b\u001b[K"] +[201.555797, "o", "s"] +[201.70675, "o", "\r\n"] +[201.718736, "o", "$2 = (struct pt_regs *) 0xc7235fb4\r\n(gdb) "] +[202.762192, "o", "#"] +[202.96906, "o", " "] +[203.200172, "o", "t"] +[203.335914, "o", "h"] +[203.397287, "o", "i"] +[203.509368, "o", "s"] +[203.663251, "o", " "] +[204.186121, "o", "i"] +[204.277979, "o", "s"] +[204.37577, "o", " "] +[204.575364, "o", "a"] +[204.681527, "o", " "] +[205.432843, "o", "s"] +[205.577435, "o", "a"] +[205.84148, "o", "v"] +[205.901931, "o", "e"] +[206.142056, "o", " "] +[206.48041, "o", "\b\u001b[K"] +[206.576791, "o", "d"] +[206.650202, "o", " "] +[206.846523, "o", "o"] +[207.147206, "o", " "] +[207.497795, "o", "\b\u001b[K"] +[207.682557, "o", "n"] +[207.773982, "o", " "] +[207.873402, "o", "s"] +[208.081623, "o", "t"] +[208.158662, "o", "a"] +[208.370015, "o", "c"] +[208.494404, "o", "k"] +[210.422713, "o", " "] +[212.637105, "o", "s"] +[212.787177, "o", "t"] +[212.851629, "o", "r"] +[212.997138, "o", "u"] +[213.10568, "o", "c"] +[213.316927, "o", "t"] +[213.4249, "o", "u"] +[213.518001, "o", "r"] +[213.568409, "o", "e"] +[213.68215, "o", " "] +[214.147557, "o", "w"] +[214.236414, "o", "h"] +[214.292857, "o", "i"] +[214.384715, "o", "c"] +[214.448337, "o", "h"] +[214.52596, "o", " "] +[214.62955, "o", "s"] +[214.810601, "o", "t"] +[214.922436, "o", "o"] +[215.0038, "o", "r"] +[215.075964, "o", "e"] +[215.264322, "o", "s"] +[215.370649, "o", " "] +[215.800702, 
"o", "u"] +[215.868643, "o", "s"] +[215.977376, "o", "e"] +[216.044684, "o", "r"] +[216.169163, "o", "s"] +[216.284503, "o", "p"] +[216.34314, "o", "a"] +[216.450213, "o", "c"] +[216.550799, "o", "e"] +[216.722203, "o", " "] +[219.131447, "o", "r"] +[219.195644, "o", "e"] +[219.377234, "o", "g"] +[219.486582, "o", "i"] +[219.520092, "o", "s"] +[219.685879, "o", "t"] +[219.754728, "o", "e"] +[219.874287, "o", "r"] +[220.02845, "o", "s"] +[221.002656, "o", " "] +[221.144104, "o", "v"] +[221.201385, "o", "a"] +[221.300797, "o", "l"] +[221.517941, "o", "u \r"] +[221.655037, "o", "e"] +[221.819353, "o", "s"] +[222.291958, "o", "\r\n"] +[222.292213, "o", "(gdb) "] +[228.409682, "o", "i"] +[228.551388, "o", "n"] +[229.037434, "o", "f"] +[229.139332, "o", "o"] +[229.400686, "o", " "] +[230.375475, "o", "r"] +[230.455225, "o", "e"] +[230.615513, "o", "\u0007"] +[231.367524, "o", "g"] +[231.608532, "o", "s"] +[231.985991, "o", "\b\u001b[K"] +[232.17368, "o", "i"] +[232.244238, "o", "s"] +[232.453164, "o", "t"] +[232.55645, "o", "e"] +[232.688874, "o", "r"] +[232.95895, "o", " "] +[233.83154, "o", "e"] +[233.978035, "o", "s"] +[234.047244, "o", "p"] +[234.168937, "o", "\r\n"] +[234.169348, "o", "esp 0xc7235f8c"] +[234.169649, "o", "\t0xc7235f8c\r\n"] +[234.16976, "o", "(gdb) "] +[250.775201, "o", "#"] +[251.106842, "o", " "] +[251.346477, "o", "h"] +[251.418165, "o", "o"] +[251.501715, "o", "w"] +[251.664792, "o", " "] +[252.067964, "o", "d"] +[252.208144, "o", "i"] +[252.87959, "o", "d"] +[253.06756, "o", " "] +[253.393168, "o", "t"] +[253.533434, "o", "h"] +[253.601184, "o", "o"] +[253.76566, "o", "s"] +[253.959997, "o", "e"] +[254.157286, "o", " "] +[254.898526, "o", "u"] +[254.999925, "o", "s"] +[255.096884, "o", "e"] +[255.171001, "o", "r"] +[255.315978, "o", "s"] +[255.418223, "o", "p"] +[255.516375, "o", "a"] +[255.651774, "o", "c"] +[255.762584, "o", "e"] +[255.92294, "o", " "] +[256.104238, "o", "r"] +[256.165681, "o", "e"] +[256.623148, "o", "g"] +[256.761757, "o", 
"i"] +[256.823147, "o", "s"] +[257.031872, "o", "t"] +[257.103524, "o", "e"] +[257.561344, "o", "r"] +[258.279258, "o", " "] +[258.447006, "o", "v"] +[258.502906, "o", "a"] +[258.608216, "o", "l"] +[258.786735, "o", "u"] +[258.86697, "o", "e"] +[258.972944, "o", "s"] +[259.099576, "o", " "] +[259.319621, "o", "g"] +[259.43755, "o", "o"] +[259.566479, "o", "t"] +[259.670838, "o", " "] +[259.795132, "o", "s"] +[259.867977, "o", "a"] +[260.067483, "o", "v"] +[260.107313, "o", "e"] +[260.330451, "o", "d"] +[260.681612, "o", " "] +[262.446506, "o", "o"] +[262.637035, "o", "n"] +[262.71973, "o", " "] +[262.816535, "o", "s"] +[262.986307, "o", "t"] +[263.060635, "o", "a"] +[263.228155, "o", "c"] +[263.277514, "o", "k"] +[263.566323, "o", "?"] +[263.893074, "o", "\r\n"] +[263.893184, "o", "(gdb) "] +[264.675148, "o", "#"] +[265.174988, "o", " "] +[265.776336, "o", "g"] +[265.893763, "o", "o"] +[266.031976, "o", " "] +[266.147399, "o", "a"] +[266.278558, "o", " "] +[266.768307, "o", "f"] +[266.949187, "o", "r"] +[267.030231, "o", "a"] +[267.15945, "o", "m"] +[267.376536, "o", " "] +[267.847105, "o", "\b\u001b[K"] +[267.91339, "o", "e"] +[268.015642, "o", " "] +[268.317662, "o", "d"] +[268.428077, "o", "e"] +[268.589581, "o", "e"] +[268.699983, "o", "p"] +[268.795125, "o", "e"] +[268.886986, "o", "r"] +[268.983051, "o", "."] +[269.133065, "o", "."] +[269.275845, "o", "."] +[269.70994, "o", "\r\n"] +[269.71023, "o", "(gdb) "] +[269.989964, "o", "f"] +[270.057684, "o", "r"] +[270.124032, "o", "e"] +[270.338379, "o", " "] +[272.954739, "o", "3"] +[273.342331, "o", "\r\n"] +[273.342463, "o", "Undefined command: \"fre\". 
Try \"help\".\r\n(gdb) "] +[274.537938, "o", "f"] +[274.591956, "o", "r"] +[274.794531, "o", " "] +[275.013426, "o", "3"] +[275.213151, "o", "\r\n"] +[275.213508, "o", "#3 0xc14645d3 in entry_INT80_32 () at arch/x86/entry/entry_32.S:544\r\n"] +[275.213876, "o", "544\t\tcall\tdo_int80_syscall_32\r\n(gdb) "] +[280.62674, "o", "l"] +[280.794815, "o", "i"] +[280.926434, "o", "s"] +[281.156915, "o", "t"] +[281.246634, "o", " "] +[281.604719, "o", "5"] +[281.88764, "o", "3"] +[281.986378, "o", "2"] +[282.314826, "o", "\r\n"] +[282.319103, "o", "527\t * edx arg3\r\n528\t * esi arg4\r\n529\t * edi arg5\r\n530\t * ebp arg6\r\n531\t */\r\n532\tENTRY(entry_INT80_32)\r\n533\t\tASM_CLAC\r\n534\t\tpushl\t%eax\t\t\t/* pt_regs->orig_ax */\r\n535\t\tSAVE_ALL pt_regs_ax=$-ENOSYS\t/* save rest */\r\n536\t\r\n"] +[282.319396, "o", "(gdb) "] +[287.032723, "o", "$"] +[287.628779, "o", "\b\u001b[K"] +[287.896648, "o", "#"] +[288.026461, "o", " "] +[288.213827, "o", "l"] +[288.286875, "o", "e"] +[288.488417, "o", "t"] +[288.672395, "o", "s"] +[289.171884, "o", " "] +[289.288632, "o", "s"] +[289.468608, "o", "e"] +[289.602164, "o", "e"] +[289.762666, "o", " "] +[289.952819, "o", "w"] +[290.078185, "o", "h"] +[290.119588, "o", "a"] +[290.337222, "o", "t"] +[290.411464, "o", " "] +[291.504878, "o", "S"] +[291.584952, "o", "A"] +[291.754893, "o", "V"] +[291.802071, "o", "E"] +[291.96154, "o", "_"] +[292.14258, "o", "A"] +[292.21097, "o", "L"] +[292.344666, "o", "L"] +[292.56581, "o", " "] +[292.756512, "o", "d"] +[292.852271, "o", "o"] +[292.933196, "o", "e"] +[293.021413, "o", "s"] +[294.080447, "o", "\r\n"] +[294.080887, "o", "(gdb) "] +[295.025805, "o", "d"] +[295.136476, "o", "i"] +[295.271217, "o", "s"] +[295.37067, "o", "a"] +[295.527332, "o", "s"] +[295.898488, "o", "s"] +[296.085096, "o", "emble "] +[297.455021, "o", "\r\n"] +[297.455375, "o", "Dump of assembler code for function entry_INT80_32:\r\n 0xc14645a4 <+0>:\tlea 0x0(%esi),%esi\r\n 0xc14645a7 <+3>:\tpush %eax\r\n 0xc14645a8 
<+4>:\tcld \r\n 0xc14645a9 <+5>:\tpush $0x0\r\n 0xc14645ab <+7>:\tpush %fs\r\n 0xc14645ad <+9>:\tpush %es\r\n 0xc14645ae <+10>:\tpush %ds\r\n 0xc14645af <+11>:\tpush $0xffffffda\r\n 0xc14645b1 <+13>:\tpush %ebp\r\n"] +[297.455523, "o", " 0xc14645b2 <+14>:\tpush %edi\r\n 0xc14645b3 <+15>:\tpush %esi\r\n 0xc14645b4 <+16>:\tpush %edx\r\n 0xc14645b5 <+17>:\tpush %ecx\r\n 0xc14645b6 <+18>:\tpush %ebx\r\n"] +[297.455959, "o", " 0xc14645b7 <+19>:\tmov $0x7b,%edx\r\n 0xc14645bc <+24>:\tmov %edx,%ds\r\n 0xc14645be <+26>:\tmov %edx,%es\r\n"] +[297.456273, "o", " 0xc14645c0 <+28>:\tmov $0xd8,%edx\r\n 0xc14645c5 <+33>:\tmov %edx,%fs\r\n"] +[297.456519, "o", " 0xc14645c7 <+35>:\tcall 0xc1000ed3 \r\n 0xc14645cc <+40>:\tmov %esp,%eax\r\n 0xc14645ce <+42>:\tcall 0xc1001300 \r\n"] +[297.456787, "o", "---Type to continue, or q to quit---"] +[300.037654, "o", "q"] +[300.474906, "o", "\r\nQuit\r\n"] +[300.475028, "o", "(gdb) "] +[301.222036, "o", "#"] +[301.344949, "o", " "] +[301.53983, "o", "a"] +[301.639062, "o", "s"] +[301.804905, "o", " "] +[301.932574, "o", "e"] +[302.019082, "o", "x"] +[302.205597, "o", "p"] +[302.282764, "o", "e"] +[302.353223, "o", "c"] +[302.577703, "o", "t"] +[302.659955, "o", "e"] +[302.829647, "o", "d"] +[302.916048, "o", ","] +[302.988029, "o", " "] +[303.195687, "o", "i"] +[303.309352, "o", "t"] +[303.425016, "o", " "] +[304.041744, "o", "p"] +[304.269832, "o", "u"] +[304.428141, "o", "s"] +[304.641784, "o", "h"] +[304.756462, "o", "e"] +[304.840521, "o", "s"] +[305.137798, "o", " "] +[306.996152, "o", "r"] +[307.053783, "o", "e"] +[307.293212, "o", "s"] +[307.493748, "o", "i"] +[308.526057, "o", "\b\u001b[K"] +[308.651061, "o", "\b\u001b[K"] +[308.813776, "o", "\b\u001b[K"] +[309.52065, "o", "u"] +[309.734069, "o", "\b\u001b[K"] +[309.866326, "o", "\b\u001b[K"] +[310.014799, "o", "u"] +[310.293887, "o", "e"] +[310.409052, "o", "r"] +[310.764658, "o", "\b\u001b[K"] +[310.889594, "o", "\b\u001b[K"] +[311.047137, "o", "s"] +[311.173052, "o", "e"] 
+[311.228372, "o", "r"] +[311.711761, "o", "s"] +[311.809911, "o", "p"] +[311.873173, "o", "a"] +[311.988731, "o", "c"] +[312.077428, "o", "e"] +[312.161123, "o", " "] +[312.281191, "o", "r"] +[312.349935, "o", "e"] +[312.547146, "o", "g"] +[312.667304, "o", "s"] +[312.936388, "o", " "] +[313.086964, "o", "t"] +[313.154591, "o", "o"] +[313.232086, "o", " "] +[314.351931, "o", "s"] +[314.549992, "o", "t"] +[314.60986, "o", "a"] +[314.792887, "o", "c"] +[314.864733, "o", "k"] +[314.991827, "o", "\r\n"] +[314.992256, "o", "(gdb) "] +[327.831401, "o", "#"] +[328.116647, "o", " "] +[328.304643, "o", "l"] +[328.403419, "o", "e"] +[328.643693, "o", "t"] +[328.881425, "o", "s"] +[329.012271, "o", " "] +[329.350209, "o", "o"] +[329.675607, "o", "\b\u001b[K"] +[329.746522, "o", "g"] +[329.894035, "o", "o"] +[329.960528, "o", " "] +[330.265508, "o", "d"] +[330.454674, "o", "e"] +[330.61252, "o", "e"] +[330.767734, "o", "p"] +[331.017861, "o", "e"] +[331.105343, "o", "r"] +[331.193089, "o", ","] +[331.291487, "o", " "] +[331.421255, "o", "t"] +[331.499907, "o", "o"] +[331.589935, "o", " "] +[331.800311, "o", "u"] +[332.05827, "o", "s"] +[332.210451, "o", "e"] +[332.270701, "o", "r"] +[332.420355, "o", "s"] +[332.533886, "o", "p"] +[332.598023, "o", "a"] +[332.679367, "o", "c"] +[332.768566, "o", "e"] +[332.92981, "o", "\r\n"] +[332.929919, "o", "(gdb) "] +[333.474291, "o", "f"] +[333.512483, "o", "r"] +[333.670863, "o", " "] +[334.214986, "o", "2"] +[334.409422, "o", "\r\n"] +[334.410165, "o", "#2 do_int80_syscall_32 (regs=0xc7235fb4) at arch/x86/entry/common.c:341\r\n"] +[334.410451, "o", "341\t\tdo_syscall_32_irqs_on(regs);\r\n(gdb) "] +[334.795034, "o", "p"] +[334.935707, "o", "r"] +[335.041764, "o", "i"] +[335.125845, "o", "n"] +[335.256727, "o", "t"] +[335.436358, "o", " "] +[336.398283, "o", "*"] +[336.683485, "o", "r"] +[336.751145, "o", "e"] +[336.914056, "o", "g"] +[337.129438, "o", "s"] +[337.326378, "o", "\r\n"] +[337.327037, "o", "$3 = {bx = 3, cx = 1, dx = 3, "] 
+[337.327271, "o", "si = 168423920, di = 168419876, bp = 168419336, \r\n ax = 4294967258, ds = 123, __dsh = 0, es = 123, __esh = 0, fs = 0, \r\n"] +[337.327484, "o", " __fsh = 0, gs = 0, __gsh = 0, orig_ax = 63, ip = 1150252833, cs = 115, \r\n __csh = 0, flags = 514, sp = 3218117628, "] +[337.327641, "o", "ss = 123, __ssh = 0}\r\n(gdb) "] +[338.677993, "o", "#"] +[338.93667, "o", " "] +[339.41632, "o", "t"] +[339.584092, "o", "h"] +[340.119744, "o", "e"] +[340.257132, "o", " "] +[340.430186, "o", "p"] +[340.531965, "o", "t"] +[340.720155, "o", "_"] +[340.823874, "o", "r"] +[340.878238, "o", "e"] +[341.092385, "o", "g"] +[341.261419, "o", "s"] +[341.410487, "o", " "] +[341.687741, "o", "s"] +[341.898318, "o", "t"] +[341.974712, "o", "r"] +[342.103508, "o", "u"] +[342.220027, "o", "c"] +[342.426521, "o", "t"] +[342.533294, "o", "u"] +[342.643383, "o", "r"] +[342.706116, "o", "e"] +[342.969115, "o", " "] +[343.424403, "o", "s"] +[343.831852, "o", "a"] +[344.073427, "o", "v"] +[344.124191, "o", "e"] +[344.307738, "o", "s"] +[344.41903, "o", " "] +[344.562734, "o", "t"] +[344.695776, "o", "h"] +[344.763893, "o", "e"] +[344.926178, "o", " "] +[345.788445, "o", "E"] +[346.04075, "o", "S"] +[346.167767, "o", "P"] +[346.367919, "o", " "] +[346.480742, "o", "a"] +[346.612524, "o", "n"] +[346.708959, "o", "d"] +[346.772221, "o", " "] +[347.04116, "o", "E"] +[347.27069, "o", "I"] +[347.41795, "o", "P"] +[347.620023, "o", " "] +[348.267947, "o", "v"] +[348.310577, "o", "a"] +[348.389535, "o", "l"] +[348.583187, "o", "u"] +[348.626281, "o", "e"] +[348.75411, "o", "s"] +[349.152128, "o", " "] +[350.062966, "o", "\b\b\b\b\b\b\b"] +[350.505726, "o", "\u001b[1@r"] +[350.581222, "o", "\u001b[1@e"] +[350.733489, "o", "\u001b[1@g"] +[350.860972, "o", "\u001b[1@s"] +[351.063679, "o", "\u001b[1@ "] +[351.403168, "o", "\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C\u001b[C"] +[351.723396, "o", "a"] +[351.796061, "o", "s"] +[351.924873, "o", " "] +[352.127877, "o", "w"] +[352.258955, 
"o", "e"] +[352.359039, "o", "l"] +[352.480885, "o", "l"] +[352.581941, "o", "\r\n"] +[352.582063, "o", "(gdb) "] +[354.291356, "o", "p"] +[354.434906, "o", "r"] +[354.548164, "o", "i"] +[354.623451, "o", "n"] +[354.677168, "o", "t"] +[354.835361, "o", " "] +[355.104827, "o", "/"] +[355.285227, "o", "x"] +[355.434379, "o", " "] +[355.814288, "o", "r"] +[355.884763, "o", "e"] +[356.044765, "o", "g"] +[356.169825, "o", "-"] +[356.229667, "o", "s"] +[356.715904, "o", ">"] +[356.92396, "o", "\b\u001b[K"] +[357.049325, "o", "\b\u001b[K"] +[357.20555, "o", "\b\u001b[K"] +[357.296469, "o", "s"] +[357.676925, "o", ">"] +[358.100747, "o", "i"] +[358.154056, "o", "p"] +[358.530121, "o", "\b\u001b[K"] +[358.650376, "o", "\b\u001b[K"] +[358.771888, "o", "\b\u001b[K"] +[358.830286, "o", "-"] +[359.096595, "o", ">"] +[359.399708, "o", "i"] +[359.471959, "o", "p"] +[359.66492, "o", "\r\n"] +[359.69065, "o", "$4 = 0x448f7721\r\n(gdb) "] +[362.750139, "o", "d"] +[362.873006, "o", "i"] +[363.70087, "o", "s"] +[364.005826, "o", "a"] +[364.154305, "o", "s"] +[364.534403, "o", "s"] +[364.638318, "o", "emble "] +[367.274254, "o", "0x"] +[367.274386, "o", "448f"] +[367.274769, "o", "7721"] +[368.070465, "o", "-"] +[368.275859, "o", "0"] +[368.34472, "o", "x"] +[368.717017, "o", "1"] +[368.721481, "o", "2"] +[368.831229, "o", ","] +[369.57988, "o", "+"] +[369.896806, "o", "0"] +[370.018088, "o", "x"] +[370.253572, "o", "1"] +[370.325077, "o", "2"] +[370.729392, "o", "\r\n"] +[370.729561, "o", "Dump of assembler code from 0x448f770f to 0x448f7721:\r\n"] +[370.730714, "o", " 0x448f770f:\tnop\r\n"] +[370.73105, "o", " 0x448f7710:\tmov %ebx,%edx\r\n"] +[370.731361, "o", " 0x448f7712:\tmov 0x8(%esp),%ecx\r\n"] +[370.731696, "o", " 0x448f7716:\tmov 0x4(%esp),%ebx\r\n"] +[370.732008, "o", " 0x448f771a:\tmov $0x3f,%eax\r\n"] +[370.732313, "o", " 0x448f771f:\tint $0x80\r\nEnd of assembler dump.\r\n"] +[370.732521, "o", "(gdb) "] +[374.541556, "o", "#"] +[374.783001, "o", " "] +[374.898373, "o", 
"t"] +[375.026925, "o", "h"] +[375.073567, "o", "i"] +[375.175876, "o", "s"] +[375.257213, "o", " "] +[375.437225, "o", "l"] +[375.612851, "o", "o"] +[375.726918, "o", "o"] +[375.811791, "o", "k"] +[375.914079, "o", "s"] +[375.991534, "o", " "] +[376.124911, "o", "l"] +[376.227571, "o", "i"] +[376.402924, "o", "k"] +[376.568604, "o", "e"] +[376.709076, "o", " "] +[376.892681, "o", "t"] +[377.0435, "o", "h"] +[377.153381, "o", "e"] +[377.255434, "o", " "] +[379.598554, "o", "d"] +[379.742728, "o", "u"] +[379.818192, "o", "p"] +[379.946914, "o", "2"] +[380.101209, "o", " "] +[381.094712, "o", "i"] +[381.254821, "o", "m"] +[381.448647, "o", "p"] +[381.516377, "o", "l"] +[381.632838, "o", "e"] +[381.733497, "o", "m"] +[381.821502, "o", "e"] +[381.939177, "o", "n"] +[382.061297, "o", "t"] +[382.126975, "o", "a"] +[382.263958, "o", "t"] +[382.355856, "o", "i"] +[382.39873, "o", "o"] +[382.625141, "o", "n"] +[382.785792, "o", " "] +[382.924201, "o", "i"] +[383.007739, "o", "n"] +[383.112653, "o", " "] +[383.344371, "o", "g"] +[383.528085, "o", "l"] +[383.818807, "o", "\b\u001b[K"] +[383.940934, "o", "\b\u001b[K"] +[384.549965, "o", "l"] +[384.699794, "o", "i"] +[384.893748, "o", "b"] +[384.958022, "o", "c"] +[385.106737, "o", "\r\n"] +[385.106862, "o", "(gdb) "] +[390.638525, "o", "#"] +[390.745393, "o", " "] +[390.899618, "o", "l"] +[390.932654, "o", "e"] +[391.101789, "o", "t"] +[391.253114, "o", "s"] +[391.385379, "o", " "] +[391.507941, "o", "c"] +[391.609023, "o", "h"] +[391.674385, "o", "e"] +[391.751088, "o", "c"] +[391.811, "o", "k"] +[391.917497, "o", " "] +[392.061073, "o", "t"] +[392.14249, "o", "h"] +[392.235203, "o", "e"] +[392.357752, "o", " "] +[392.997036, "o", "s"] +[393.16912, "o", "t"] +[393.264152, "o", "a"] +[393.476405, "o", "c"] +[393.623273, "o", "k"] +[393.735766, "o", " "] +[393.886348, "o", "v"] +[393.956001, "o", "a"] +[394.095124, "o", "l"] +[394.297637, "o", "u"] +[394.373537, "o", "e"] +[394.864079, "o", "s"] +[396.020466, "o", " "] 
+[396.544499, "o", "a"] +[397.19223, "o", "\b\u001b[K"] +[397.516516, "o", "("] +[397.757745, "o", "d"] +[397.840435, "o", "a"] +[398.021342, "o", "r"] +[398.083794, "o", "a"] +[398.395255, "o", "\b\u001b[K"] +[398.527905, "o", "\b\u001b[K"] +[398.655121, "o", "\b\u001b[K"] +[398.791444, "o", "\b\u001b[K"] +[400.259196, "o", "\b\u001b[K"] +[400.381224, "o", "\b\u001b[K"] +[400.705872, "o", "\r\n"] +[400.705994, "o", "(gdb) "] +[405.090136, "o", "p"] +[405.250017, "o", "r"] +[405.339117, "o", "i"] +[405.408649, "o", "n"] +[405.48463, "o", "t"] +[405.598488, "o", " "] +[406.982799, "o", "/"] +[407.074314, "o", "x"] +[407.190834, "o", " "] +[407.406215, "o", "r"] +[407.477829, "o", "e"] +[407.621752, "o", "g"] +[407.761064, "o", "s"] +[407.848706, "o", "-"] +[408.12336, "o", ">"] +[409.034397, "o", "s"] +[409.101141, "o", "p"] +[409.237828, "o", "\r\n"] +[409.250562, "o", "$5 = 0xbfd093fc\r\n(gdb) "] +[410.588712, "o", "x"] +[410.779534, "o", " "] +[410.929273, "o", "/"] +[411.084678, "o", "x"] +[411.282106, "o", " "] +[412.846629, "o", "0xb"] +[412.84701, "o", "fd093fc"] +[413.495368, "o", "\r\n"] +[413.495801, "o", "0xbfd093fc:\t0x08068b46\r\n"] +[413.495927, "o", "(gdb) "] +[413.970708, "o", "\r\n"] +[413.971563, "o", "0xbfd09400:\t0x00000003\r\n"] +[413.971895, "o", "(gdb) "] +[414.429768, "o", "\r\n"] +[414.430693, "o", "0xbfd09404:\t0x00000001\r\n"] +[414.430996, "o", "(gdb) "] +[416.367425, "o", "#"] +[416.600327, "o", " "] +[417.126384, "o", "f"] +[417.206421, "o", "i"] +[417.315777, "o", "r"] +[417.716293, "o", "s"] +[418.11331, "o", "t"] +[418.641803, "o", " "] +[419.136181, "o", "s"] +[419.31382, "o", "e"] +[419.466617, "o", "e"] +[420.57146, "o", "m"] +[421.230979, "o", "\b\u001b[K"] +[421.365032, "o", "\b\u001b[K"] +[421.493488, "o", "\b\u001b[K"] +[421.621835, "o", "\b\u001b[K"] +[421.857413, "o", "i"] +[421.945647, "o", "s"] +[422.057603, "o", " "] +[422.199844, "o", "t"] +[422.257072, "o", "h"] +[422.36931, "o", "e"] +[422.432441, "o", " "] 
+[423.25473, "o", "r"] +[423.329204, "o", "e"] +[423.486095, "o", "t"] +[423.556503, "o", "u"] +[423.658879, "o", "r"] +[423.740909, "o", "n"] +[423.811235, "o", " "] +[423.941111, "o", "a"] +[424.030245, "o", "d"] +[424.169883, "o", "d"] +[424.362463, "o", "r"] +[424.436531, "o", "e"] +[424.621395, "o", "s"] +[424.763605, "o", "s"] +[425.22778, "o", "\r\n"] +[425.228074, "o", "(gdb) "] +[426.180565, "o", "#"] +[426.329978, "o", " "] +[426.674846, "o", "s"] +[427.644279, "o", "e"] +[427.832117, "o", "c"] +[427.945399, "o", "o"] +[428.050368, "o", "n"] +[428.1215, "o", "d"] +[428.252667, "o", " "] +[428.383049, "o", "a"] +[428.500148, "o", "n"] +[428.596744, "o", "d"] +[428.698814, "o", " "] +[428.945383, "o", "r"] +[429.106316, "o", "i"] +[429.645192, "o", "\b\u001b[K"] +[429.767912, "o", "\b\u001b[K"] +[429.845124, "o", "t"] +[430.003863, "o", "h"] +[430.107219, "o", "i"] +[430.255689, "o", "r"] +[430.487167, "o", "d"] +[430.667477, "o", " "] +[431.006023, "o", "a"] +[431.211805, "o", "r"] +[431.292319, "o", "e"] +[431.437362, "o", " "] +[431.56393, "o", "t"] +[431.683208, "o", "h"] +[431.728348, "o", "e"] +[431.865432, "o", " "] +[432.888452, "o", "p"] +[432.977097, "o", "a"] +[433.175822, "o", "r"] +[433.258553, "o", "a"] +[433.531527, "o", "m"] +[434.072679, "o", "e"] +[434.235626, "o", "t"] +[434.331498, "o", "e"] +[434.456163, "o", "r"] +[434.694781, "o", "s"] +[434.879649, "o", " "] +[435.461291, "o", "("] +[435.855804, "o", "f"] +[435.932997, "o", "d"] +[436.744873, "o", "s"] +[436.988719, "o", " "] +[438.082246, "o", "3"] +[438.216148, "o", " "] +[438.350139, "o", "a"] +[438.461137, "o", "n"] +[438.546648, "o", "d"] +[438.645921, "o", " "] +[438.899168, "o", "1"] +[439.219735, "o", ")"] +[439.481293, "o", "\r\n"] +[439.481613, "o", "(gdb) "] +[450.102183, "o", "quit\r\n"] +[450.102822, "o", "A debugging session is active.\r\n\r\n\tInferior 1 [Remote target] will be detached.\r\n\r\nQuit anyway? 
(y or n) "] +[451.119252, "o", "y"] +[451.379225, "o", "\r\nDetaching from program: /home/tavi/src/linux/vmlinux, Remote target\r\n"] +[451.379742, "o", "Ending remote debugging.\r\n"] +[451.390975, "o", "\u001b]0;tavi@lktp: ~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ "] +[451.925851, "o", "f"] +[452.122079, "o", "g"] +[452.20423, "o", "\r\n"] +[452.2046, "o", "minicom -D serial.pts\r\n"] +[452.204924, "o", "\u001b[!p\u001b[?3;4l\u001b[4l\u001b>\u001b[?1h\u001b=\u001b[1;1H\u001b[?12l\u001b[?25h\u001b[0m\u001b(B \u001b[2;1HWelcome to minicom 2.7 \u001b[3;1H \u001b[4;1HOPTIONS: I18n \u001b[5;1HCompiled on Feb 7 2016, 13:37:27. \u001b[6;1HPort serial.pts, 23:03:56 \u001b[7;1H \u001b[8;1HPress CTRL-A Z for help on special keys \u001b[9;1H \u001b[10;1H \u001b[11;1Hroot@qemux86:~# # trigger dup2 system call \u001b[12;1Hroot@qemux86:~# echo a > /"] +[452.205008, "o", "tmp/x \u001b[13;1H \u001b[14;1H \u001b[15;1H \u001b[16;1H \u001b[17;1H \u001b[18;1H \u001b[19;1H \u001b[20;1H \u001b[21;1H \u001b[22;1H \u001b[23;1H \u001b[24;1H\u001b[0m\u001b("] +[452.205404, "o", "B\u001b[7mCTRL-A Z for help | 115200 8N1 | NOR | Minicom 2.7 | VT102 | Offline | rial.pts\u001b[13;1H\u001b[?12l\u001b[?25h\u001b[24;1H\u001b[0m\u001b(B \u001b[13;1Hroot@qemux86:~# "] +[453.066691, "o", "\u001b[0m\u001b(B\u001b[7m\u001b[24;1H\u001b[K\u001b[?12l\u001b[?25h\u001b[?25lCTRL-A Z for help | 115200 8N1 | NOR | Minicom 2.7 | VT102 | Offline | rial.pts\u001b[?12l\u001b[?25h\u001b[13;17H"] +[453.274675, "o", "\u001b[8;30H\u001b[?25l\u001b[0m\u001b(B+----------------------+\u001b[9;30H| Leave Minicom? 
|\u001b[10;30H| No |\u001b[11;30H+----------------------+\u001b[10;51H\u001b[?25l\u001b[10;33H\u001b[0m\u001b(B\u001b[7m Yes "] +[453.425761, "o", "\u001b[?12l\u001b[?25h\u001b[8;1H\u001b[0m\u001b(BPress CTRL-A Z for help on special keys \u001b[9;1H \u001b[10;1H \u001b[11;1Hroot@qemux86:~# # trigger dup2 system call \u001b[13;17H\u001b[0m\u001b(B\u001b[7m\u001b[?12l\u001b[?25h"] +[453.426136, "o", "\u001b[?12l\u001b[?25h\u001b[0m\u001b(B\u001b[H\u001b[2J\u001b[?12l\u001b[?25h\u001b[?1l\u001b>\u001b[!p\u001b[?3;4l\u001b[4l\u001b>"] +[453.42742, "o", "\u001b]0;tavi@lktp: ~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ "] +[453.929041, "o", "#"] +[454.48124, "o", " "] +[454.772523, "o", "t"] +[454.877335, "o", "h"] +[454.967191, "o", "e"] +[455.064771, "o", " "] +[455.221252, "o", "e"] +[455.36275, "o", "n"] +[455.48508, "o", "d"] +[456.337409, "o", "\r\n"] +[456.3382, "o", "\u001b]0;tavi@lktp: ~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ "] +[457.092602, "o", "exit\r\n"] diff --git a/refs/pull/405/merge/_images/syscalls-vdso.cast b/refs/pull/405/merge/_images/syscalls-vdso.cast new file mode 100644 index 00000000..08d11b8f --- /dev/null +++ b/refs/pull/405/merge/_images/syscalls-vdso.cast @@ -0,0 +1,299 @@ +{"title": "VDSO", "width": 80, "height": 24, "env": {"TERM": "xterm-256color", "SHELL": "/bin/bash"}, "timestamp": 1519704037, "version": 2, "idle_time_limit": 1.0} +[0.025954, "o", "\u001b]0;tavi@lktp: ~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ "] +[0.778357, "o", "\r\u001b[12P(reverse-i-search)`':\u001b[C"] +[1.32861, "o", "\b\b\b\u001b[23@m': minicom -D serial.pts\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[1.448449, "o", "\b\b\b\b\b\b\b\b\b\u001b[1@i\u001b[C\u001b[C\u001b[C"] +[1.765225, "o", "\r\u001b]0;tavi@lktp: 
~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ minicom -D serial.pts \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r\n"] +[1.769204, "o", "\u001b[!p\u001b[?3;4l\u001b[4l\u001b>\u001b[0m\u001b(B"] +[1.769319, "o", "\u001b[?1h\u001b=\u001b[H\u001b[2J"] +[1.770766, "o", "\u001b[?12l\u001b[?25h"] +[1.770874, "o", "\nWelcome to minicom 2.7\r\n\nOPTIONS: I18n \r\nCompiled on Feb 7 2016, 13:37:27.\r\nPort serial.pts, 05:00:24\r\n\nPress CTRL-A Z for help on special keys\r\n\n"] +[2.416093, "o", "\n"] +[2.418403, "o", "root@qemux86:~# "] +[3.828624, "o", "\r(reverse-i-search)`': "] +[4.158346, "o", "\b\b\bc': cat /proc/$$/maps | grep vdso\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"] +[4.248967, "o", "\b\b\b\b\b\b\b\b\b\b\b ': cat /proc/$$/maps | grep vdso \u001b[11;21Ha': "] +[4.827489, "o", "\r\u001b[P\u001b[P\u001b[P\u001b[P\u001b[P\u001b[P\u001b[P\u001b[Proot@qemux86:~# "] +[4.82862, "o", "\r\n"] +[4.879844, "o", "b7fe1000-b7fe2000 r-xp 00000000 00:00 0 [vdso]"] +[4.880004, "o", "\r\n"] +[4.881875, "o", "root@qemux86:~# "] +[6.142231, "o", "d"] +[6.284442, "o", "d"] +[6.458279, "o", " "] +[6.621989, "o", "i"] +[6.739944, "o", "f"] +[6.869664, "o", " "] +[7.493448, "o", "\b \b"] +[7.874738, "o", "="] +[8.588815, "o", "/"] +[8.637697, "o", "d"] +[8.738769, "o", "e"] +[8.938278, "o", "v"] +[8.984739, "o", "/"] +[9.500486, "o", "$"] +[9.669718, "o", "$"] +[9.933013, "o", "/"] +[10.500669, "o", "m"] +[10.616163, "o", "e"] +[10.693377, "o", "m"] +[10.978277, "o", " "] +[11.216909, "o", "o"] +[11.376459, "o", "f"] +[11.551348, "o", "="] +[12.560851, "o", "v"] +[12.764026, "o", "d"] +[12.85403, "o", "."] +[13.112242, "o", "s"] +[13.185422, "o", "o"] +[13.657544, "o", " "] +[15.232911, "o", "s"] +[15.352051, "o", "k"] +[15.564274, "o", "i"] +[15.669058, "o", "p"] +[16.316486, "o", "="] +[17.003593, "o", "$"] +[18.445474, "o", "("] +[18.601053, "o", "("] +[19.54437, "o", "0"] +[19.660685, "o", "x"] +[20.138912, "o", "b"] 
+[20.616486, "o", "f"] +[21.48522, "o", "\b \b"] +[22.515307, "o", "7"] +[23.508009, "o", "f"] +[23.790619, "o", "e"] +[24.400251, "o", "1"] +[25.248207, "o", ")"] +[25.375563, "o", ")"] +[26.327645, "o", " "] +[26.726994, "o", "c"] +[26.847455, "o", "o"] +[27.02857, "o", "u"] +[27.276381, "o", "n"] +[27.383646, "o", "t"] +[27.625756, "o", "="] +[27.920608, "o", "1"] +[28.521296, "o", " "] +[29.112339, "o", "b"] +[29.222891, "o", "s"] +[29.743661, "o", "="] +[30.384493, "o", "4"] +[30.485127, "o", "0"] +[30.695845, "o", "9"] +[31.075003, "o", "6"] +[32.188246, "o", "\r\n"] +[32.199355, "o", "dd: "] +[32.19954, "o", "failed to open '/dev/885/mem'"] +[32.199693, "o", ": No such file or directory"] +[32.199842, "o", "\r\n"] +[32.201711, "o", "root@qemux86:~# "] +[33.598384, "o", "dd if=/dev/$$/mem of=vd.so skip=$((0xb7fe1)) count=1 bs=4096"] +[34.061345, "o", "\b"] +[34.562175, "o", "\b"] +[34.593086, "o", "\b"] +[34.624228, "o", "\b"] +[34.655779, "o", "\b"] +[34.686158, "o", "\b"] +[34.718106, "o", "\b"] +[34.747207, "o", "\b"] +[34.777381, "o", "\b"] +[34.808345, "o", "\b"] +[34.838882, "o", "\b"] +[34.87023, "o", "\b"] +[34.901235, "o", "\b"] +[34.931911, "o", "\b"] +[34.962706, "o", "\b"] +[34.994145, "o", "\b"] +[35.025857, "o", "\b"] +[35.055689, "o", "\b"] +[35.086722, "o", "\b"] +[35.117042, "o", "\b"] +[35.148358, "o", "\b"] +[35.179121, "o", "\b"] +[35.210605, "o", "\b"] +[35.240516, "o", "\b"] +[35.271948, "o", "\b"] +[35.30293, "o", "\b"] +[35.334901, "o", "\b"] +[35.364321, "o", "\b"] +[35.396241, "o", "\b"] +[35.426037, "o", "\b"] +[35.456994, "o", "\b"] +[35.488153, "o", "\b"] +[35.519142, "o", "\b"] +[35.550182, "o", "\b"] +[35.581047, "o", "\b"] +[35.611116, "o", "\b"] +[35.641763, "o", "\b"] +[35.672908, "o", "\b"] +[35.703498, "o", "\b"] +[35.734998, "o", "\b"] +[35.766411, "o", "\b"] +[35.796549, "o", "\b"] +[35.82753, "o", "\b"] +[36.601742, "o", "\b"] +[37.09839, "o", "\b"] +[37.128155, "o", "\b"] +[37.163111, "o", "\b"] +[37.192346, "o", "\b"] 
+[37.223703, "o", "\b"] +[37.253359, "o", "\b"] +[37.284626, "o", "\b"] +[37.608855, "o", "v"] +[37.939538, "o", "\b\u001b[P"] +[38.087974, "o", "\b\u001b[P"] +[38.254754, "o", "\b\u001b[P"] +[38.440987, "o", " /$$/mem of=vd.so skip=$((0xb7fe1)) count=1 bs=4096 \u001b[15;24Hp"] +[38.558039, "o", " /$$/mem of=vd.so skip=$((0xb7fe1)) count=1 bs=4096 \u001b[15;25Hr"] +[38.691276, "o", " /$$/mem of=vd.so skip=$((0xb7fe1)) count=1 bs=4096 \u001b[15;26Ho"] +[38.847178, "o", " /$$/mem of=vd.so skip=$((0xb7fe1)) count=1 bs=4096 \u001b[15;27Hc"] +[39.508168, "o", "\r\n"] +[39.523093, "o", "dd: "] +[39.52328, "o", "/proc/885/mem: cannot skip to specified offset"] +[39.523317, "o", "\r\n"] +[39.52466, "o", "1+0 records in"] +[39.524744, "o", "\r\n"] +[39.524825, "o", "1+0 records out"] +[39.525076, "o", "\r\n"] +[39.525484, "o", "4096 bytes (4.1 kB, 4.0 KiB) copied, 0.00261601 s, 1.6 MB/s"] +[39.525575, "o", "\r\n"] +[39.527474, "o", "root@qemux86:~# "] +[41.610174, "o", "n"] +[41.732264, "o", "m"] +[41.988107, "o", " "] +[42.33858, "o", "-"] +[42.79556, "o", "D"] +[43.104068, "o", " "] +[43.291478, "o", "v"] +[43.55666, "o", "d"] +[43.843116, "o", "."] +[44.185221, "o", "s"] +[44.542652, "o", "o"] +[45.448387, "o", "\r\n"] +[45.491518, "o", "00000000 A LINUX_2.5\r\n"] +[45.49171, "o", "00000000 A LINUX_2.6"] +[45.491843, "o", "\r\n"] +[45.491964, "o", "00000b4c T __kernel_rt_sigreturn"] +[45.492082, "o", "\r\n"] +[45.492159, "o", "00000b40 T __kernel_sigreturn"] +[45.492224, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[45.492353, "o", "00000b2c T __kernel_vsyscall"] +[45.492598, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[45.49281, "o", "00000710 T __vdso_clock_gettime"] +[45.492887, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[45.493019, "o", "000009a0 T __vdso_gettimeofday"] +[45.493104, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[45.493252, "o", "00000b00 T __vdso_time"] +[45.493323, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[45.495654, "o", "root@qemux86:~# "] +[47.899977, "o", "o"] 
+[48.346053, "o", "b"] +[48.401345, "o", "j"] +[48.663622, "o", "d"] +[48.966338, "o", "u"] +[49.273586, "o", "m"] +[49.392373, "o", "p"] +[49.52031, "o", " "] +[49.715854, "o", "-"] +[50.017434, "o", "d"] +[50.916108, "o", "r"] +[51.076094, "o", " "] +[51.470914, "o", ">"] +[51.659245, "o", " "] +[52.330585, "o", "v"] +[52.682659, "o", "d"] +[53.100659, "o", "s"] +[53.21442, "o", "o"] +[53.522123, "o", "."] +[55.809864, "o", "s"] +[57.590678, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[57.633021, "o", "objdump: "] +[57.633213, "o", "'a.out': No such file"] +[57.633325, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[57.635624, "o", "root@qemux86:~# "] +[59.8835, "o", "objdump -dr > vdso.s"] +[60.157193, "o", "\b"] +[60.660172, "o", "\b"] +[60.689762, "o", "\b"] +[60.722033, "o", "\b"] +[60.752467, "o", "\b"] +[60.783825, "o", "\b"] +[60.813211, "o", "\b"] +[60.845191, "o", "\b"] +[60.875427, "o", "\b"] +[60.905546, "o", "\b"] +[61.100984, "o", "r"] +[61.244992, "o", " "] +[61.660944, "o", " > vdso.s \u001b[24;29Hv"] +[61.996232, "o", " > vdso.s \u001b[24;30Hd"] +[62.139048, "o", " > vdso.s \u001b[24;31H."] +[62.350791, "o", " > vdso.s \u001b[24;32Hs"] +[62.450021, "o", " > vdso.s \u001b[24;33Ho"] +[62.598679, "o", " > vdso.s \u001b[24;34H "] +[62.933485, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[62.974853, "o", "root@qemux86:~# "] +[66.634044, "o", "v"] +[66.745211, "o", "i"] +[66.942786, "o", " "] +[67.232609, "o", "v"] +[67.824023, "o", "d"] +[68.377726, "o", "s"] +[68.437403, "o", "o"] +[68.723205, "o", "."] +[69.712646, "o", "s"] +[69.919804, "o", "\r\n\u001b[23;80H \u001b[24;1H"] +[69.956533, "o", "\u001b[1;1H\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\n\u001b[K\u001b[1;1H\u001b[K\nvd.so: file format elf32-i386\u001b[5;1HDisassembly of section .text:\u001b[7;1H000006d0 
<__vdso_clock_gettime@@LINUX_2.6-0x40>:\u001b[8;2H6d0: 55 push %ebp\u001b[9;2H6d1: 89 e5 mov %esp,%ebp\u001b[10;2H6"] +[69.956753, "o", "d3: 53 push %ebx\u001b[11;2H6d4: e8 49 04 00 00 call b22 <__vdso_time@@LINUX_2.6+0x22>\u001b[12;2H6d9: 81 c3 6b fc ff ff add $0xfffffc6b,%ebx\u001b[13;2H6df: 0f ae e8 lfence\u001b[14;2H6e2: 0f 31 rdtsc\u001b[15;2H6e4: 89 c1 mov %eax,%ecx\u001b[16;2H6e6: 8b 83 44 cd ff ff mov -0x32bc(%ebx),%eax\u001b[17;2H6ec: 8b 9b 48 cd ff ff mov -0x32b8(%ebx),%ebx\u001b[18;2H6f2: 39 d3 cmp %edx,%ebx\u001b[19;2H6f4: 72 0e jb 704 \u001b[20;2H6f6: 76 08 jbe 700 \u001b[21;2H6f8: 89 da mov %ebx,%edx\u001b[22;2H6fa: 5b pop %ebx\u001b[23;2H6fb: 5d pop %ebp\u001b[1;1H\u001b[24;1H\u001b[K- vdso.s 1/413 0%\u001b[1;1H"] +[70.48049, "o", "\u001b[24;1H\u001b[K/"] +[71.703477, "o", "v"] +[71.956563, "o", "s"] +[72.061476, "o", "y"] +[72.151126, "o", "s"] +[72.345804, "o", "c"] +[72.43538, "o", "a"] +[72.532243, "o", "l"] +[72.655082, "o", "l"] +[72.868018, "o", "\u001b[1;1H\u001b[1;2H82e: 3d ff c9 9a 3b cmp $0x3b9ac9ff,%eax\u001b[2;1H 833: 77 eb ja 820 <__vdso_cloc"] +[72.868164, "o", "k_gettime@@LINUX_2.6+0x11\u001b[2;80H\u001b[3;2H835: 8b 75 f0 mov -0x10(%ebp),%esi\u001b[4;2H838: 01 0f add %ecx,(%edi)\u001b[5;1H 83a: 89 47 04 mov %eax,0x4(%edi)\u001b[6;2H83d: 85 f6 test %esi,%esi"] +[72.868249, "o", "\u001b[7;1H 83f: 0f 85 24 ff ff ff jne 769 <__vdso_clock_gettime@@LINUX_2.6+0x59\u001b[7;80H\u001b[8;2H845: 89 f9 mov "] +[72.86854, "o", " %edi,%ecx\u001b[9;2H847: b8 09 01 00 00 mov $0x109,%eax\u001b[10;2H84c: 89 da mov %ebx,%edx\u001b[11;2H84e: 8b 5d 08 mov 0x8(%ebp),%ebx \u001b[12;2H851: e8 d6 02 00 00 call b2c <__kernel_vsyscall@@LINUX_2.5>\u001b[13;2H856: 89 d3 mov %edx,%ebx\u001b[14;2H858: 83 c4 10 add $0x10,%esp\u001b[15;2H85b: 5b pop %ebx \u001b[16;2H85c: 5e pop %esi "] +[72.868973, "o", "\u001b[17;2H85d: 5f pop %edi \u001b[18;2H85e: 5d pop %ebp \u001b[19;2H85f: c3 ret \u001b[20;2H860: 8b 45 08 mov 0x8(%ebp),%eax \u001b[21;2H863: 85 c0 test 
%eax,%ea\u001b[22;2H865: 75 de jne 845 <__vdso_clock_gettime@@LINUX_2.6+0x13\u001b[22;80H\u001b[23;2H867: 89 7d 0c mov %edi,0xc(%ebp)\u001b[12;54H\u001b[24;1H\u001b[K- vdso.s 135/413 32%\u001b[12;54H"] +[73.659739, "o", "\u001b[24;1H\u001b[K/"] +[73.751232, "o", "\u001b[12;54H\u001b[1;2Hac2: e9 fe fe ff ff jmp 9c5 <__vdso_gettimeofday@@LINUX_2.6+0x25>\u001b[1;80H\u001b[2;2Hac7: 8b 97 84 cd ff ff mov -0x327c(%edi),%edx \u001b[2;80H\u001b[3;2Hacd: 8b 5d 0c mov 0xc(%ebp),%ebx \u001b[4;2Had0: 89 13 mov %edx,(%ebx\u001b[5;2Had2: 8b 97 88 cd ff ff mov -0x3278(%edi),%edx\u001b[6;2Had8: 89 53 04 "] +[73.751392, "o", " mov %edx,0x4(%ebx)\u001b[7;2Hadb: eb 95 jmp a72 <__vdso_gettimeofday@@LINUX_2.6+0xd2>\u001b[7;80H\u001b[8;2Hadd: b8 4e 00 00 00 mov $0x4e,%eax\u001b[9;2Hae2: 8b 4d 0c mov 0xc(%ebp),%ecx\u001b[10;2Hae5\u001b[11;2Hae7: 89 f3 mov %esi,%ebx \u001b[12;2Hae9: e8 3e 00\u001b[13;2Haee\u001b[14;2Haf0: e9 7d ff ff ff jmp a72 <__vdso_gettimeofday@@LINUX_2.6+0xd2>\u001b[14;80H\u001b[15;2Haf5: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi\u001b[16;2Haf9: 8d bc 27 00 00 00 00 lea 0x0(%edi,%eiz,1),%edi\u001b[17;2H \u001b[18;1H00000b00 <__vdso_time@@LINUX_2.6>: \u001b[19;2Hb00: 55 push %ebp\u001b[20;2Hb01: e8 18 00 00 00 call b1e <__vdso_time@@LINUX_2.6+0x1e>\u001b[21;2Hb06: 05 3e f8 ff ff add $0xfffff83e,%eax\u001b[22;2Hb0b: 89 e5 mov %esp,%ebp \u001b[22;80H\u001b[23;2Hb0d: 8b 55 08 mov 0x8(%ebp),%edx\u001b[12;54H\u001b[24;1H\u001b[K- vd"] +[73.751422, "o", "so.s 351/413 84%\u001b[12;54H"] +[74.638384, "o", "\u001b[24;1H\u001b[K/"] +[74.706985, "o", "\u001b[12;54H\u001b[1;2Hb1c: 5d pop %ebp \u001b[1;80H\u001b[2;2Hb1d: c3 ret \u001b[3;2Hb1e: 8b 04 24 mov (%esp),%eax \u001b[4;2Hb21: c3 ret "] +[74.707435, "o", " \u001b[5;2Hb22: 8b 1c 24 mov (%esp),%ebx \u001b[6;2Hb25: c3 ret \u001b[7;2Hb26: 8b 3c 24 mov (%esp),%edi \u001b[7;80H\u001b[8;2Hb29: c3 ret \u001b[9;2Hb2a: 90 nop \u001b[10;2Hb2b: 90 nop \u001b[11;2H \u001b[12;1H00000b2c <__kernel_vsyscall@@LINUX_2.5>: \u001b[13;2Hb2c: 51 
push %ecx \u001b[14;2Hb2d: 52 push %edx \u001b[14;80H\u001b[15;2Hb2e: 55 push %ebp \u001b[16;2Hb2f: 89 e5 mov %esp,%ebp \u001b[17;2Hb31: 0f 34 sysenter\u001b[18;1H b33: cd 80 int $0x80"] +[74.707709, "o", "\u001b[19;3H35: 5d pop \u001b[20;3H36: 5a pop %edx \u001b[21;3H37: 59 pop %ecx \u001b[22;3H38: c3 ret \u001b[23;3H39: 90 nop \u001b[12;20H\u001b[24;1H\u001b[K- vdso.s 378/413 91%\u001b[12;20H"] +[75.741371, "o", "\u001b[13;20H\u001b[24;1H\u001b[K- vdso.s 379/413 91%\u001b[13;20H"] +[76.240657, "o", "\u001b[14;20H\u001b[24;1H\u001b[K- vdso.s 380/413 92%\u001b[14;20H"] +[76.271822, "o", "\u001b[15;20H\u001b[24;1H\u001b[K- vdso.s 381/413 92%\u001b[15;20H"] +[76.303272, "o", "\u001b[16;20H\u001b[24;1H\u001b[K- vdso.s 382/413 92%\u001b[16;20H"] +[76.335376, "o", "\u001b[17;20H\u001b[24;1H\u001b[K- vdso.s 383/413 92%\u001b[17;20H"] +[76.365685, "o", "\u001b[18;20H\u001b[24;1H\u001b[K- vdso.s 384/413 92%\u001b[18;20H"] +[76.396024, "o", "\u001b[19;20H\u001b[24;1H\u001b[K- vdso.s 385/413 93%\u001b[19;20H"] +[76.426024, "o", "\u001b[20;20H\u001b[24;1H\u001b[K- vdso.s 386/413 93%\u001b[20;20H"] +[76.633377, "o", "\u001b[21;20H\u001b[24;1H\u001b[K- vdso.s 387/413 93%\u001b[21;20H"] +[76.801144, "o", "\u001b[22;20H\u001b[24;1H\u001b[K- vdso.s 388/413 93%\u001b[22;20H"] +[76.95044, "o", "\u001b[23;20H\u001b[24;1H\u001b[K- vdso.s 389/413 94%\u001b[23;20H"] +[77.101372, "o", "\u001b[1;4Hd: c3 ret \u001b[2;4He: 8b 04 24 mov (%esp),%eax\u001b[3;3H21: c3 ret \u001b[4;4H2: 8b 1c 24 mov (%esp),%ebx\u001b[5;4H5: c3 ret "] +[77.10181, "o", " \u001b[6;4H6: 8b 3c 24 mov (%esp),%edi\u001b[7;4H9: c3 ret \u001b[8;4Ha: 90 nop\u001b[9;4Hb\u001b[10;2H \u001b[11;1H00000b2c <__kernel_vsyscall@@LINUX_2.5>:\u001b[12;1H b2c: 51 push %ecx\u001b[13;4Hd: 52 push %ed\u001b[14;4He: 55 push %ebp\u001b[15;4Hf: 89 e5 mov %esp,%ebp\u001b[16;3H31: 0f 34 sysenter \u001b[17;4H3: cd 80 int $0x80\u001b[18;4H5: 5d pop %ebp \u001b[19;4H6: 5a pop %edx\u001b[20;4H7: 59 pop %ec\u001b[21;4H8: c3 ret \u001b[22;4H9: 90 
nop\u001b[23;4Ha\u001b[23;20H\u001b[24;1H\u001b[K- vdso.s 390/413 94%\u001b[23;20H"] +[77.455803, "o", "\u001b[1;4He: 8b 04 24 mov (%esp),%eax\u001b[2;3H21: c3 ret \u001b[3;4H2: 8b 1c 24 mov (%esp),%ebx\u001b[4;4H5: c3 ret \u001b[5;4H6: 8b 3c 24 mov (%esp),%edi\u001b[6;4H9: c3 ret \u001b[7;4Ha: 90 nop\u001b[8;4Hb\u001b[9;2H \u001b[10;1H00000b2c <__kernel_vsyscall@@LINUX_2.5>:\u001b[11;1H b2c: 51 push %ecx\u001b[12;4H"] +[77.455926, "o", "d: 52 push %ed\u001b[13;4He: 55 push %ebp\u001b[14;4Hf: 89 e5 mov %esp,%ebp\u001b[15;3H31: 0f 34 sysenter \u001b[16;4H3: cd 80 int $0x80\u001b[17;4H5: 5d pop %ebp \u001b[18;4H6: 5a pop %edx\u001b[19;4H7: 59 pop %ec\u001b[20;4H8: c3 ret \u001b[21;4H9: 90 nop\u001b[22;4Ha\u001b[23;4Hb\u001b[23;20H\u001b[24;1H\u001b[K- vdso.s 391/413 94%\u001b[23;20H"] +[85.076246, "o", "\u0007\u001b[24;1H\u001b[K- vdso.s 391/413 94%\u001b[23;20H"] +[85.088716, "o", "\u001b[24;1H\u001b[K:"] +[85.733973, "o", "q"] +[86.200246, "o", "!"] +[86.579774, "o", "\u001b[23;20H\u001b[24;1H\u001b[K- vdso.s 391/413 94%\u001b[23;20H"] +[86.580204, "o", "\u001b[24;1H\u001b[K"] +[86.586893, "o", "root@qemux86:~# "] +[87.180212, "o", "\u001b[0m\u001b(B\u001b[7m\r\u001b[K\u001b[?12l\u001b[?25h"] +[87.180527, "o", "\u001b[?25lCTRL-A Z for help | 115200 8N1 | NOR | Minicom 2.7 | VT102 | Offline | rial.pts\u001b[?12l\u001b[?25h\u001b[24;17H"] +[87.375774, "o", "\u001b[8;30H\u001b[?25l\u001b[0m\u001b(B+----------------------+\u001b[9;30H| Leave Minicom? 
|\u001b[10;30H| No |\u001b[11;30H+----------------------+\u001b[10;51H\u001b[?25l\u001b[10;33H\u001b[0m\u001b(B\u001b[7m Yes "] +[87.691546, "o", "\u001b[?12l\u001b[?25h\u001b[8;1H\u001b[0m\u001b(B b2b: 90 nop \u001b[9;1H \u001b[10;1H00000b2c <__kernel_vsyscall@@LINUX_2.5>: \u001b[11;1H b2c: 51 push %ecx \u001b[24;17H\u001b[0m\u001b(B\u001b[7m"] +[87.691675, "o", "\u001b[?12l\u001b[?25h"] +[87.691981, "o", "\u001b[?12l\u001b[?25h\u001b[0m\u001b(B\u001b[H\u001b[2J\u001b[?12l\u001b[?25h\u001b[?1l\u001b>\u001b[!p\u001b[?3;4l\u001b[4l\u001b>"] +[87.69329, "o", "\u001b]0;tavi@lktp: ~/src/linux/tools/labs\u0007\u001b[01;32mtavi@lktp\u001b[00m:\u001b[01;34m~/src/linux/tools/labs\u001b[00m$ "] +[88.615113, "o", "exit\r\n"] diff --git a/refs/pull/405/merge/_images/tso.png b/refs/pull/405/merge/_images/tso.png new file mode 100644 index 00000000..a43f230d Binary files /dev/null and b/refs/pull/405/merge/_images/tso.png differ diff --git a/refs/pull/405/merge/_images/tso1.png b/refs/pull/405/merge/_images/tso1.png new file mode 100644 index 00000000..a43f230d Binary files /dev/null and b/refs/pull/405/merge/_images/tso1.png differ diff --git a/refs/pull/405/merge/_images/write.png b/refs/pull/405/merge/_images/write.png new file mode 100644 index 00000000..d87abc99 Binary files /dev/null and b/refs/pull/405/merge/_images/write.png differ diff --git a/refs/pull/405/merge/_images/write1.png b/refs/pull/405/merge/_images/write1.png new file mode 100644 index 00000000..d87abc99 Binary files /dev/null and b/refs/pull/405/merge/_images/write1.png differ diff --git a/refs/pull/405/merge/_images/write2.png b/refs/pull/405/merge/_images/write2.png new file mode 100644 index 00000000..e533a36a Binary files /dev/null and b/refs/pull/405/merge/_images/write2.png differ diff --git a/refs/pull/405/merge/_images/write21.png b/refs/pull/405/merge/_images/write21.png new file mode 100644 index 00000000..e533a36a Binary files /dev/null and b/refs/pull/405/merge/_images/write21.png differ diff 
--git a/refs/pull/405/merge/_images/xen-overview.png b/refs/pull/405/merge/_images/xen-overview.png new file mode 100644 index 00000000..9294dfba Binary files /dev/null and b/refs/pull/405/merge/_images/xen-overview.png differ diff --git a/refs/pull/405/merge/_images/xen-overview1.png b/refs/pull/405/merge/_images/xen-overview1.png new file mode 100644 index 00000000..9294dfba Binary files /dev/null and b/refs/pull/405/merge/_images/xen-overview1.png differ diff --git a/refs/pull/405/merge/_sources/index.rst.txt b/refs/pull/405/merge/_sources/index.rst.txt new file mode 100644 index 00000000..e81a9a0c --- /dev/null +++ b/refs/pull/405/merge/_sources/index.rst.txt @@ -0,0 +1,88 @@ +===================== +Linux Kernel Teaching +===================== + +This is a collection of lectures and labs on Linux kernel topics. The +lectures focus on theoretical and Linux kernel exploration. + + +The labs focus on device drivers topics and they resemble "howto" +style documentation. Each topic has two parts: + +* a walk-through of the topic which contains an overview, the main + abstractions, simple examples and pointers to APIs + +* a hands-on part which contains a few exercises that should be + resolved by the student; to focus on the topic at hand, the student + is presented with a starting coding skeleton and with in-depth tips + on how to solve the exercises + +This content is based on the `Operating Systems 2 +`_ course from the Computer Science +and Engineering Department, the Faculty of Automatic Control and +Computers, University POLITEHNICA of Bucharest. + +You can get the latest version at http://github.com/linux-kernel-labs. + +To get started build the documentation from the sources after +installing docker-compose on your host: + +.. code-block:: c + + cd tools/labs && make docker-docs + +then point your browser at **Documentation/output/labs/index.html**. + +Alternatively, you can build directly on the host (see +tools/labs/docs/Dockerfile for dependencies): + +..
code-block:: c + + cd tools/labs && make docs + +.. toctree:: + + so2/index.rst + +.. toctree:: + :caption: Lectures + + lectures/intro.rst + lectures/syscalls.rst + lectures/processes.rst + lectures/interrupts.rst + lectures/smp.rst + lectures/address-space.rst + lectures/memory-management.rst + lectures/fs.rst + lectures/debugging.rst + lectures/networking.rst + lectures/arch.rst + lectures/virt.rst + +.. toctree:: + :caption: Labs + + labs/infrastructure.rst + labs/introduction.rst + labs/kernel_modules.rst + labs/kernel_api.rst + labs/device_drivers.rst + labs/interrupts.rst + labs/deferred_work.rst + labs/block_device_drivers.rst + labs/filesystems_part1.rst + labs/filesystems_part2.rst + labs/networking.rst + labs/arm_kernel_development.rst + labs/memory_mapping.rst + labs/device_model.rst + labs/kernel_profiling.rst + +.. toctree:: + :caption: Useful info + + info/vm.rst + info/extra-vm.rst + info/contributing.rst + diff --git a/refs/pull/405/merge/_sources/info/contributing.rst.txt b/refs/pull/405/merge/_sources/info/contributing.rst.txt new file mode 100644 index 00000000..d18c5ae1 --- /dev/null +++ b/refs/pull/405/merge/_sources/info/contributing.rst.txt @@ -0,0 +1,206 @@ +================================= +Contributing to linux-kernel-labs +================================= + +``linux-kernel-labs`` is an open platform. +You can help it get better by contributing to the documentation, exercises or +the infrastructure. +All contributions are welcome, no matter if they are just fixes for typos or +new sections in the documentation. + +All information required for making a contribution can be found in the +`linux-kernel-labs Linux repo `_. +In order to change anything, you need to create a Pull Request (``PR``) +from your own fork to this repository. +The PR will be reviewed by the members of the team and will be merged once +any potential issue is fixed. 
+ +******************** +Repository structure +******************** + +The `linux-kernel-labs repo `_ is +a fork of the Linux kernel repo, with the following additions: + + * ``/tools/labs``: contains the labs and the :ref:`virtual machine (VM) infrastructure` + + * ``tools/labs/templates``: contains the skeletons sources + * ``tools/labs/qemu``: contains the qemu VM configuration + + * ``/Documentation/teaching``: contains the sources used to generate this + documentation + +************************** +Building the documentation +************************** + +To build the documentation, navigate to ``tools/labs`` and run the following +command: + +.. code-block:: bash + + make docs + +.. note:: + The command should install all the required packages. + In some cases, installing the packages or building the documentation might + fail, because of broken dependencies versions. + + Instead of struggling to fix the dependencies, the simplest way to build + the documentation is using a `Docker `_. + First, install ``docker`` and ``docker-compose`` on your host, and then run: + + .. code-block:: bash + + make docker-docs + + The first run might take some time, but subsequent builds will be faster. + +*********************** +Creating a contribution +*********************** + +Forking the repository +====================== + +1. If you haven't done it already, clone the + `linux-kernel-labs repo `_ + repository locally: + + .. code-block:: bash + + $ mkdir -p ~/src + $ git clone git@github.com:linux-kernel-labs/linux.git ~/src/linux + +2. Go to https://github.com/linux-kernel-labs/linux, make sure you are logged + in and click ``Fork`` in the top right of the page. + +3. Add the forked repo as a new remote to the local repo: + + .. code-block:: bash + + $ git remote add my_fork git@github.com:/linux.git + +Now, you can push to your fork by using ``my_fork`` instead of ``origin`` +(e.g. ``git push my_fork master``). + +Creating a pull request +======================= + +.. 
warning:: + + Pull requests must be created from their own branches, which are started from + ``master``. + +1. Go to the master branch and make sure you have no local changes: + + .. code-block:: bash + + student@eg106:~/src/linux$ git checkout master + student@eg106:~/src/linux$ git status + On branch master + Your branch is up-to-date with 'origin/master'. + nothing to commit, working directory clean + + +2. Make sure the local master branch is up-to-date with linux-kernel-labs: + + .. code-block:: bash + + student@eg106:~/src/linux$ git pull origin master + + .. note:: + + You can also push the latest master to your forked repo: + + .. code-block:: bash + + student@eg106:~/src/linux$ git push my_fork master + +3. Create a new branch for your change: + + .. code-block:: bash + + student@eg106:~/src/linux$ git checkout -b + +4. Make some changes and commit them. In this example, we are going to change + ``Documentation/teaching/index.rst``: + + .. code-block:: bash + + student@eg106:~/src/linux$ vim Documentation/teaching/index.rst + student@eg106:~/src/linux$ git add Documentation/teaching/index.rst + student@eg106:~/src/linux$ git commit -m "" + + .. warning:: + + The commit message must include a relevant description of your change + and the location of the changed component. + + Examples: + + * ``documentation: index: Fix typo in the first section`` + * ``labs: block_devices: Change printk log level`` + +5. Push the local branch to your forked repository: + + .. code-block:: bash + + student@eg106:~/src/linux$ git push my_fork + +6. Open the Pull Request + + * Go to https://github.com and open your forked repository page + * Click ``New pull request``. + * Make sure base repository (left side) is ``linux-kernel-labs/linux`` and the + base is master. + * Make sure the head repository (right side) is your forked repo and the + compare branch is your pushed branch. + * Click ``Create pull request``. 
+ +Making changes to a Pull Request +================================ + +After receiving feedback for your changes, you might need to update the Pull +Request. +Your goal is to do a new push on the same branch. For this, follow the next steps: + +1. Make sure your branch is still up to date with the ``linux-kernel-labs`` repo + ``master`` branch. + + .. code-block:: bash + + student@eg106:~/src/linux$ git fetch origin master + student@eg106:~/src/linux$ git rebase FETCH_HEAD + + .. note:: + + If you are getting conflicts, it means that someone else modified the same + files/lines as you and already merged the changes since you opened the + Pull Request. + + In this case, you will need to fix the conflicts by editing the + conflicting files manually (run ``git status`` to see these files). + After fixing the conflicts, add them using ``git add`` and then run + ``git rebase --continue``. + + +2. Apply the changes to your local files +3. Commit the changes. We want all the changes to be in the same commit, so + we will amend the changes to the initial commit. + + .. code-block:: bash + + student@eg106:~/src/linux$ git add Documentation/teaching/index.rst + student@eg106:~/src/linux$ git commit --amend + +4. Force-push the updated commit: + + .. code-block:: bash + + student@eg106:~/src/linux$ git push my_fork -f + + After this step, the Pull Request is updated. It is now up to the + linux-kernel-labs team to review the pull request and integrate your + contributions in the main project. 
+ diff --git a/refs/pull/405/merge/_sources/info/extra-vm.rst.txt b/refs/pull/405/merge/_sources/info/extra-vm.rst.txt new file mode 100644 index 00000000..bd3b997c --- /dev/null +++ b/refs/pull/405/merge/_sources/info/extra-vm.rst.txt @@ -0,0 +1,166 @@ +===================================== +Customizing the Virtual Machine Setup +===================================== + +Connect to the Virtual Machine via SSH +-------------------------------------- + +The default Yocto image for the QEMU virtual machine +(``core-image-minimal-qemu``) provides the minimal functionality to run the +kernel and kernel modules. For extra features, such as an SSH connection, +a more complete image is required, such as ``core-image-sato-dev-qemu``. + +To use the new image, update the ``YOCTO_IMAGE`` variable in +``tools/labs/qemu/Makefile``: + +.. code-block:: shell + + YOCTO_IMAGE = core-image-sato-qemu$(ARCH).ext4 + +When you start the virtual machine the first time using ``make boot`` with the +new image configuration, it will download the image and then boot the virtual +machine. The image is larger (around 400MB) than the minimal image so expect +some time for the download. + +You then enter the virtual machine via ``minicom``, determine the IP address of +the ``eth0`` interface and then you can connect to the virtual machine via SSH: + +..
code-block:: shell + + $ minicom -D serial.pts + Poky (Yocto Project Reference Distro) 2.3 qemux86 /dev/hvc0 + + qemux86 login: root + root@qemux86:~# ip a s + 1: lo: mtu 65536 qdisc noqueue qlen 1000 + link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 + inet 127.0.0.1/8 scope host lo + valid_lft forever preferred_lft forever + inet6 ::1/128 scope host + valid_lft forever preferred_lft forever + 2: eth0: mtu 1500 qdisc pfifo_fast qlen 1000 + link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff + inet 172.213.0.18/24 brd 172.213.0.255 scope global eth0 + valid_lft forever preferred_lft forever + inet6 fe80::5054:ff:fe12:3456/64 scope link + valid_lft forever preferred_lft forever + 3: sit0@NONE: mtu 1480 qdisc noop qlen 1000 + link/sit 0.0.0.0 brd 0.0.0.0 + + $ ssh -l root 172.213.0.18 + The authenticity of host '172.213.0.18 (172.213.0.18)' can't be established. + RSA key fingerprint is SHA256:JUWUcD7LdvURNcamoPePMhqEjFFtUNLAqO+TtzUiv5k. + Are you sure you want to continue connecting (yes/no)? yes + Warning: Permanently added '172.213.0.18' (RSA) to the list of known hosts. + root@qemux86:~# uname -a + Linux qemux86 4.19.0+ #3 SMP Sat Apr 4 22:45:18 EEST 2020 i686 GNU/Linux + +Connecting a Debugger to the Virtual Machine Kernel +--------------------------------------------------- + +You can use GDB to connect to the running virtual machine kernel and inspect +the state of the kernel. You run ``make gdb`` in ``tools/labs/``: + +.. code-block:: shell + + .../linux/tools/labs$ make gdb + ln -fs /home/tavi/src/linux/vmlinux vmlinux + gdb -ex "target remote localhost:1234" vmlinux + GNU gdb (Ubuntu 7.11.1-0ubuntu1~16.04) 7.11.1 + Copyright (C) 2016 Free Software Foundation, Inc. + License GPLv3+: GNU GPL version 3 or later + This is free software: you are free to change and redistribute it. + There is NO WARRANTY, to the extent permitted by law. Type "show copying" + and "show warranty" for details. + This GDB was configured as "x86_64-linux-gnu". 
+ Type "show configuration" for configuration details. + For bug reporting instructions, please see: + . + Find the GDB manual and other documentation resources online at: + . + For help, type "help". + Type "apropos word" to search for commands related to "word"... + Reading symbols from vmlinux...done. + Remote debugging using localhost:1234 + 0xc13cf2f2 in native_safe_halt () at ./arch/x86/include/asm/irqflags.h:53 + 53asm volatile("sti; hlt": : :"memory"); + (gdb) bt + #0 0xc13cf2f2 in native_safe_halt () at ./arch/x86/include/asm/irqflags.h:53 + #1 arch_safe_halt () at ./arch/x86/include/asm/irqflags.h:95 + #2 default_idle () at arch/x86/kernel/process.c:341 + #3 0xc101f136 in arch_cpu_idle () at arch/x86/kernel/process.c:332 + #4 0xc106a6dd in cpuidle_idle_call () at kernel/sched/idle.c:156 + #5 do_idle () at kernel/sched/idle.c:245 + #6 0xc106a8c5 in cpu_startup_entry (state=) + at kernel/sched/idle.c:350 + #7 0xc13cb14a in rest_init () at init/main.c:415 + #8 0xc1507a7a in start_kernel () at init/main.c:679 + #9 0xc10001da in startup_32_smp () at arch/x86/kernel/head_32.S:368 + #10 0x00000000 in ?? () + (gdb) + +Rebuild the Kernel Image +------------------------ + +The kernel image is built the first time the VM is started. To rebuild the +kernel remove the kernel image file defined by the ``ZIMAGE`` variable in +``tools/labs/qemu/Makefile``: + +.. code-block:: shell + + ZIMAGE = $(KDIR)/arch/$(ARCH)/boot/$(b)zImage + +Typically the full path of the kernel is ``arch/x86/boot/bzImage``. + +Once removed, the kernel image is rebuilt by using: + +.. code-block:: shell + + ~/src/linux/tools/labs$ make zImage + +or simply starting the virtual machine + +..
code-block:: shell + + ~/src/linux/tools/labs$ make boot + +Using Docker containers +----------------------- + +If your setup doesn't allow the installation of the packages required for the +laboratory setup, you can build and run a container that has all the setup +already prepared for the virtual machine environment. + +In order to run the containerized setup, you need to install the following +packages: + +* ``docker`` +* ``docker-compose`` + +In order to run the container infrastructure run the following command in the +``tools/labs/`` directory: + +.. code-block:: shell + + sergiu@local:~/src/linux/tools/labs$ make docker-kernel + ... + ubuntu@so2:~$ + +The first time you run the command above, it will take a long time, because you +will have to build the container environment and install the required +applications. + +Every time you run the ``make docker-kernel`` command, another shell will +connect to the container. This will allow you to work with multiple tabs. + +All the commands that you would use in the regular environment can be used in +the containerized environment. + +The linux repository is mounted in the ``/linux`` directory. All changes +you will make here will also be seen on your local instance. + +In order to stop the container use the following command: + +.. code-block:: shell + + make stop-docker-kernel diff --git a/refs/pull/405/merge/_sources/info/vm.rst.txt b/refs/pull/405/merge/_sources/info/vm.rst.txt new file mode 100644 index 00000000..7d3be932 --- /dev/null +++ b/refs/pull/405/merge/_sources/info/vm.rst.txt @@ -0,0 +1,130 @@ +.. _vm_link: + +===================== +Recommended Setup +===================== +The simplest way to achieve a functional setup is to follow the steps listed in `this repo `__. + +===================== +Virtual Machine Setup +===================== + +Practice work is designed to run on a QEMU based virtual machine. 
Kernel code +is developed and built on the host machine and then deployed and run on the +virtual machine. + +In order to run and use the virtual machine the following packages are required +on a Debian/Ubuntu system: + +* ``flex`` +* ``bison`` +* ``build-essential`` +* ``gcc-multilib`` +* ``libncurses5-dev`` +* ``qemu-system-x86`` +* ``qemu-system-arm`` +* ``python3`` +* ``minicom`` + +The ``kvm`` package is not strictly required, but will make the virtual machine +faster by using KVM support (with the ``-enable-kvm`` option to QEMU). If ``kvm`` +is absent, the virtual machine will still run (albeit slower) using emulation. + +The virtual machine setup uses prebuilt Yocto images that it downloads and a +kernel image that it builds itself. The following images are supported: + +* ``core-image-minimal-qemu`` +* ``core-image-minimal-dev-qemu`` +* ``core-image-sato-dev-qemu`` +* ``core-image-sato-qemu`` +* ``core-image-sato-sdk-qemu`` + +By default, ``core-image-minimal-qemu`` is used. This setting can be changed by +updating the ``YOCTO_IMAGE`` variable in ``tools/labs/qemu/Makefile``. + +Starting the Virtual Machine +---------------------------- + +You start the virtual machine in the ``tools/labs/`` folder by running ``make +boot``: + +.. code-block:: shell + + .../linux/tools/labs$ make boot + +The first run of the ``make boot`` command will compile the kernel image and it +will take longer. Subsequent runs will only start the QEMU virtual machine, +with verbose output provided: + +..
code-block:: shell + + .../linux/tools/labs$ make boot + mkdir /tmp/tmp.7rWv63E9Wf + sudo mount -t ext4 -o loop core-image-minimal-qemux86.ext4 /tmp/tmp.7rWv63E9Wf + sudo make -C /home/razvan/school/so2/linux.git modules_install INSTALL_MOD_PATH=/tmp/tmp.7rWv63E9Wf + make: Entering directory '/home/razvan/school/so2/linux.git' + INSTALL crypto/crypto_engine.ko + INSTALL drivers/crypto/virtio/virtio_crypto.ko + INSTALL drivers/net/netconsole.ko + DEPMOD 4.19.0+ + make: Leaving directory '/home/razvan/school/so2/linux.git' + sudo umount /tmp/tmp.7rWv63E9Wf + rmdir /tmp/tmp.7rWv63E9Wf + sleep 1 && touch .modinst + qemu/create_net.sh tap0 + + dnsmasq: failed to create listening socket for 172.213.0.1: Address already in use + qemu/create_net.sh tap1 + + dnsmasq: failed to create listening socket for 127.0.0.1: Address already in use + /home/razvan/school/so2/linux.git/tools/labs/templates/assignments/6-e100/nttcp -v -i & + nttcp-l: nttcp, version 1.47 + nttcp-l: running in inetd mode on port 5037 - ignoring options beside -v and -p + bind: Address already in use + nttcp-l: service-socket: bind:: Address already in use, errno=98 + ARCH=x86 qemu/qemu.sh -kernel /home/razvan/school/so2/linux.git/arch/x86/boot/bzImage -device virtio-serial -chardev pty,id=virtiocon0 -device virtconsole,chardev=virtiocon0 -serial pipe:pipe1 -serial pipe:pipe2 -netdev tap,id=tap0,ifname=tap0,script=no,downscript=no -net nic,netdev=tap0,model=virtio -netdev tap,id=tap1,ifname=tap1,script=no,downscript=no -net nic,netdev=tap1,model=i82559er -drive file=core-image-minimal-qemux86.ext4,if=virtio,format=raw -drive file=disk1.img,if=virtio,format=raw -drive file=disk2.img,if=virtio,format=raw --append "root=/dev/vda loglevel=15 console=hvc0" --display none -s + qemu-system-i386: -chardev pty,id=virtiocon0: char device redirected to /dev/pts/68 (label virtiocon0) + +.. note:: To show the QEMU console use + +.. 
code-block:: shell + + .../linux/tools/labs$ QEMU_DISPLAY=gtk make boot + + This will show the VGA output and will also give + access to the standard keyboard. + +.. note:: The virtual machine setup scripts and configuration files are located + in ``tools/labs/qemu/``. + +.. _vm_interaction_link: + +Connecting to the Virtual Machine +--------------------------------- + +Once the virtual machine is started you can connect to it on the serial port. A +symbolic link named ``serial.pts`` is created to the emulated serial port +device: + +.. code-block:: shell + + .../linux/tools/labs$ ls -l serial.pts + lrwxrwxrwx 1 razvan razvan 11 Apr 1 08:03 serial.pts -> /dev/pts/68 + +On the host you use the ``minicom`` command to connect to the virtual machine +via the ``serial.pts`` link: + +.. code-block:: shell + + .../linux/tools/labs$ minicom -D serial.pts + [...] + Poky (Yocto Project Reference Distro) 2.3 qemux86 /dev/hvc0 + + qemux86 login: root + root@qemux86:~# + +.. note:: When you connect to the virtual machine, simply enter ``root`` at the + login prompt and you will get a root console, no password required. + +.. note:: You exit ``minicom`` by pressing ``Ctrl+a`` and then ``x``. You will + get a confirmation prompt and then you will exit ``minicom``. 
diff --git a/refs/pull/405/merge/_sources/labs/arm_kernel_development.rst.txt b/refs/pull/405/merge/_sources/labs/arm_kernel_development.rst.txt new file mode 100644 index 00000000..ade020c4 --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/arm_kernel_development.rst.txt @@ -0,0 +1,387 @@ +========================= +Kernel Development on ARM +========================= + +Lab objectives +============== + +* get a feeling of what System on a Chip (SoC) means +* get familiar with embedded world using ARM as a supported architecture +* understand what a Board Support Package means (BSP) +* compile and boot an ARM kernel with Qemu using i.MX6UL platform as an example +* get familiar with hardware description using Device Trees + +System on a Chip +================ + +A System on a Chip (**SoC**) is an integrated circuit (**IC**) that integrates an entire system onto it. The components +that can be usually found on an SoC include a central processing unit (**CPU**), memory, input/output ports, storage devices +together with more sophisticated modules like audio digital interfaces, neural processing units (**NPU**) or graphical +processing units (**GPU**). + +SoCs can be used in various applications most common are: + - consumer electronics (TV sets, mobile phones, video game consoles) + - industrial computers (medical imaging, etc) + - automotive + - home appliances + +The leading architecture for SoCs is **ARM**. Worth mentioning here is that there are also x86-based SoCs platforms. Another thing +we need to keep an eye on is **RISC-V** an open standard instruction set architecture. + +A simplified view of an **ARM** platform is shown in the image below: + +.. 
image:: ../res/schematic.png + :align: center + +We will refer as a reference platform at NXP's `i.MX6UL `_ platform, but in general all SoC's contain the following building blocks: + + - one or more CPU cores + - a system bus + - clock and reset module + + - PLL + - OSC + - reset controller + + - interrupt controller + - timers + - memory controller + - peripheral controllers + + - `I2C `_ + - `SPI `_ + - `GPIO `_ + - `Ethernet `_ (for network) + - `uSDHC `_ (for storage) + - USB + - `UART `_ + - `I2S `_ (for sound) + - eLCDIF (for LCD Panel) + +Here is the complete block diagram for i.MX6UL platform: + +.. image:: https://www.nxp.com/assets/images/en/block-diagrams/IMX6UL-BD.jpg + :alt: IMX6UL-BD + :width: 60 % + :align: center + +i.MX6UL Evaluation Kit board looks like this: + +.. image:: https://www.compulab.com/wp-content/gallery/sbc-imx6ul/compulab_sbc-imx6ul_single-board-computer.jpg + :alt: imx6ul-evk + :width: 60 % + :align: center + +Other popular SoC boards: + + * `Broadcom Raspberry Pi `_ + * `Texas Instruments Beagle board `_ + * `Odroid Xu4 `_ + * `Nvidia Jetson Nano `_ + +Board Support package +===================== + +A board support package (**BSP**) is the minimal set of software packages that allow to demonstrate the capabilities of a certain hardware platform. This includes: + + - toolchain + - bootloader + - Linux kernel image, device tree files and drivers + - root filesystem + +Semiconductor manufacturers usually provide a **BSP** together with an evaluation board. BSP is typically bundled using `Yocto `_ + +Toolchain +========= +Because our development machines are mostly x86-based we need a cross compiler that can produce executable +code for ARM platform. + +We can build our own cross compiler from scratch using https://crosstool-ng.github.io/ or we can install one + +.. 
code-block:: bash + + $ sudo apt-get install gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf # for arm32 + $ sudo apt-get install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu # for arm64 + +There are several toolchain binaries depending on the configuration: + + - With "arm-eabi-gcc" you have the Linux system C library which will make calls into the kernel IOCTLs, e.g. for allocating memory pages to the process. + - With "arm-eabi-none-gcc" you are running on a platform which doesn't have an operating system at all - so the C library is different to cope with that. + +Compiling the Linux kernel on ARM +--------------------------------- + +Compile the kernel for 32bit ARM boards: + +.. code-block:: bash + + # select defconfig based on your platform + $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make imx_v6_v7_defconfig + # compile the kernel + $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8 + +Compile the kernel for 64bit ARM boards: + +.. code-block:: bash + + # for 64bit ARM there is a single config for all supported boards + $ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make defconfig + # compile the kernel + $ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make -j8 + +Linux kernel image +================== + +The kernel image binary is named ``vmlinux`` and it can be found in the root of the kernel tree. Compressed image used for booting can be found under: + +- ``arch/arm/boot/Image``, for arm32 +- ``arch/arm64/boot/Image``, for arm64 + +.. code-block:: bash + + $ file vmlinux + vmlinux: ELF 32-bit LSB executable, ARM, EABI5 version 1 (SYSV), statically linked, not stripped + + $ file vmlinux + vmlinux: ELF 64-bit LSB shared object, ARM aarch64, version 1 (SYSV), statically linked, not stripped + +Rootfs +====== + +The root filesystem (``rootfs``) is the filesystem mounted at the top of the file hierarchy (``/``). It should contain at least +the critical files allowing the system to boot to a shell. + +.. 
code-block:: bash + + root@so2$ tree -d -L 2 + ├── bin + ├── boot + ├── dev + ├── etc + ├── home + │   └── root + ├── lib + │   └── udev + ├── mnt + ├── proc + ├── sbin + │   └── init + ├── sys + ├── usr + │   ├── bin + │   ├── include + │   ├── lib + └── var + +As for ``x86`` we will make use of Yocto rootfs images. In order to download an ``ext4`` rootfs image for ``arm32`` one needs to run: + +.. code-block:: bash + + $ cd tools/labs/ + $ ARCH=arm make core-image-minimal-qemuarm.ext4 + +Device tree +=========== + +Device tree (**DT**) is a tree structure used to describe the hardware devices in a system. Each node in the tree describes a device hence it is called **device node**. DT was introduced +to provide a way to discover non-discoverable hardware (e.g a device on an I2C bus). This information was previously stored inside the source code for the Linux kernel. This meant that +each time we needed to modify a node for a device the kernel needed to be recompiled. This no longer holds true as device tree and kernel image are separate binaries now. + +Device trees are stored inside device tree sources (*.dts*) and compiled into device tree blobs (*.dtb*). + +.. code-block:: bash + + # compile dtbs + $ make dtbs + + # location for DT sources on arm32 + $ ls arch/arm/boot/dts/ + imx6ul-14x14-evk.dtb imx6ull-14x14-evk.dtb bcm2835-rpi-a-plus.dts + + # location for DT source on arm64 + $ ls arch/arm64/boot/dts/ + imx8mm-evk.dts imx8mp-evk.dts + +The following image is a represantation of a simple device tree, describing board type, cpu and memory. + +.. image:: ../res/dts_node.png + :align: center + +Notice that a device tree node can be defined using ``label: name@address``: + + - ``label``, is an identifier used to reference the node from other places + - ``name``, node identifier + - ``address``, used to differentiate nodes with the same name. + +A node might contain several properties arranged in the ``name = value`` format. 
The name is a string +and the value can be bytes, strings, array of strings. + +Here is an example: + +.. code:: c + + / { + node@0 { + empty-property; + string-property = "string value"; + string-list-property = "string value 1", "string value 2"; + int-list-property = <value1 value2>; + + child-node@0 { + child-empty-property; + child-string-property = "string value"; + child-node-reference = <&child-node1>; + }; + + child-node1: child-node@1 { + child-empty-property; + child-string-property = "string value"; + }; + }; + }; + +Qemu +==== + +We will use ``qemu-system-arm`` to boot 32bit ARM platforms. This can be installed from official distro repos, for example: + +.. code:: bash + + sudo apt-get install -y qemu-system-arm + +However, we strongly recommend using the latest version of ``qemu-system-arm`` built from sources: + +.. code:: bash + + $ git clone https://gitlab.com/qemu-project/qemu.git + $ ./configure --target-list=arm-softmmu --disable-docs + $ make -j8 + $ ./build/qemu-system-arm + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: arm_kernel_development + +.. warning:: + + The rules for working with the virtual machine for ``ARM`` are modified as follows + + .. code-block:: shell + + # modules build + tools/labs $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make build + # modules copy + tools/labs $ ARCH=arm make copy + # kernel build + $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8 + +0. Intro +-------- + +Inspect the following locations in the Linux kernel code and identify platforms and vendors using +ARM architecture: + + - 32-bit: ``arch/arm/boot/dts`` + - 64-bit: ``arch/arm64/boot/dts`` + + +Use ``qemu`` and look at the supported platforms: + +.. code-block:: bash + + ../qemu/build/arm-softmmu/qemu-system-arm -M ? + +.. note:: We used our own compiled version of ``Qemu`` for ``arm32``. See `Qemu`_ section for more details. + +1. Boot +------- + +Use ``qemu`` to boot ``i.MX6UL`` platform. 
In order to boot, we first need to compile the kernel. +Review `Compiling the Linux kernel on ARM`_ section. + +Successful compilation will result in the following binaries: + + - ``arch/arm/boot/Image``, kernel image compiled for ARM + - ``arch/arm/boot/dts/imx6ul-14x14-evk.dtb``, device tree blob for ``i.MX6UL`` board + +Review `Rootfs`_ section and download ``core-image-minimal-qemuarm.ext4`` rootfs. +Run ``qemu`` using the following command: + +.. code-block:: bash + + ../qemu/build/arm-softmmu/qemu-system-arm -M mcimx6ul-evk -cpu cortex-a7 -m 512M \ + -kernel arch/arm/boot/zImage -nographic -dtb arch/arm/boot/dts/imx6ul-14x14-evk.dtb \ + -append "root=/dev/mmcblk0 rw console=ttymxc0 loglevel=8 earlycon printk" -sd tools/labs/core-image-minimal-qemuarm.ext4 + +.. note:: LCDIF and ASRC devices are not well supported with ``Qemu``. Remove them from compilation. + +.. code-block:: bash + + $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make menuconfig + # set FSL_ASRC=n and DRM_MXSFB=n + $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8 + +Once the kernel is booted check kernel version and cpu info: + +.. code-block:: bash + + $ cat /proc/cpuinfo + $ cat /proc/version + +2. CPU information +------------------ + +Inspect the CPU configuration for ``NXP i.MX6UL`` board. Start with ``arch/arm/boot/dts/imx6ul-14x14-evk.dts``. + + - find ``cpu@0`` device tree node and look for ``operating-points`` property. + - read the maximum and minimum operating frequency the processor can run at + + .. code:: bash + + $ cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq + $ cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq + +3. I/O memory +------------- +Inspect I/O space configuration for ``NXP i.MX6UL`` board. Start with ``arch/arm/boot/dts/imx6ul-14x14-evk.dts`` and identify each device mentioned below. + +.. 
code:: bash + + $ cat /proc/iomem + 00900000-0091ffff : 900000.sram sram@900000 + 0209c000-0209ffff : 209c000.gpio gpio@209c000 + 021a0000-021a3fff : 21a0000.i2c i2c@21a0000 + 80000000-9fffffff : System RAM + +Identify device tree nodes corresponding to: + + - ``System RAM``, look for ``memory@80000000`` node in ``arch/arm/boot/dts/imx6ul-14x14-evk.dtsi``. What's the size of the System RAM? + - ``GPIO1``, look for ``gpio@209c000`` node in ``arch/arm/boot/dts/imx6ul.dtsi``. What's the size of the I/O space for this device? + - ``I2C1``, look for ``i2c@21a0000`` node in ``arch/arm/boot/dts/imx6ul.dtsi``. What's the size of the I/O space for this device? + +4. Hello World +-------------- + +Implement a simple kernel module that prints a message at load/unload time. Compile it and load it on ``i.MX6UL`` emulated platform. + +.. code-block:: shell + + # modules build + tools/labs $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make build + # modules copy + tools/labs $ ARCH=arm make copy + # kernel build + $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8 + +5. Simple device +---------------- + +Implement a driver for a simple platform device. Find ``TODO 1`` and notice how ``simple_driver`` is declared and registered as a platform driver. +Follow ``TODO 2`` and add the ``so2,simple-device-v1`` and ``so2,simple-device-v2`` compatible strings in the simple_device_ids array. + +Create two device tree nodes in ``arch/arm/boot/dts/imx6ul.dtsi`` under ``soc`` node with compatible strings ``so2,simple-device-v1`` and +``so2,simple-device-v2`` respectively. Then notice the behavior when loading ``simple_driver`` module. + +.. 
_imx6ul: https://www.nxp.com/products/processors-and-microcontrollers/arm-processors/i-mx-applications-processors/i-mx-6-processors/i-mx-6ultralite-processor-low-power-secure-arm-cortex-a7-core:i.MX6UL diff --git a/refs/pull/405/merge/_sources/labs/block_device_drivers.rst.txt b/refs/pull/405/merge/_sources/labs/block_device_drivers.rst.txt new file mode 100644 index 00000000..3b6efca7 --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/block_device_drivers.rst.txt @@ -0,0 +1,1210 @@ +==================== +Block Device Drivers +==================== + +Lab objectives +============== + + * acquiring knowledge about the behavior of the I/O subsystem on Linux + * hands-on activities in structures and functions of block devices + * acquiring basic skills for utilizing the API for block devices, by solving + exercises + +Overview +======== + +Block devices are characterized by random access to data organized in fixed-size +blocks. Examples of such devices are hard drives, CD-ROM drives, RAM disks, etc. +The speed of block devices is generally much higher than the speed of character +devices, and their performance is also important. This is why the Linux kernel +handles differently these 2 types of devices (it uses a specialized API). + +Working with block devices is therefore more complicated than working with +character devices. Character devices have a single current position, while block +devices must be able to move to any position in the device to provide random +access to data. To simplify work with block devices, the Linux kernel provides +an entire subsystem called the block I/O (or block layer) subsystem. + +From the kernel perspective, the smallest logical unit of addressing is the +block. Although the physical device can be addressed at sector level, the kernel +performs all disk operations using blocks. Since the smallest unit of physical +addressing is the sector, the size of the block must be a multiple of the size +of the sector. 
Additionally, the block size must be a power of 2 and can not +exceed the size of a page. The size of the block may vary depending on the file +system used, the most common values being 512 bytes, 1 kilobytes and 4 +kilobytes. + + +Register a block I/O device +=========================== + +To register a block I/O device, function :c:func:`register_blkdev` is used. +To deregister a block I/O device, function :c:func:`unregister_blkdev` is +used. + +Starting with version 4.9 of the Linux kernel, the call to +:c:func:`register_blkdev` is optional. The only operations performed by this +function are the dynamic allocation of a major (if the major argument is 0 when +calling the function) and creating an entry in :file:`/proc/devices`. In +future kernel versions it may be removed; however, most drivers still call it. + +Usually, the call to the register function is performed in the module +initialization function, and the call to the deregister function is performed in +the module exit function. A typical scenario is presented below: + + +.. code-block:: c + + #include + + #define MY_BLOCK_MAJOR 240 + #define MY_BLKDEV_NAME "mybdev" + + static int my_block_init(void) + { + int status; + + status = register_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME); + if (status < 0) { + printk(KERN_ERR "unable to register mybdev block device\n"); + return -EBUSY; + } + //... + } + + static void my_block_exit(void) + { + //... + unregister_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME); + } + + +Register a disk +=============== + +Although the :c:func:`register_blkdev` function obtains a major, it does not +provide a device (disk) to the system. For creating and using block devices +(disks), a specialized interface defined in :file:`linux/genhd.h` is used. + +The useful functions defined in :file:`linux/genhd.h` are to register /allocate +a disk, add it to the system, and de-register /unmount the disk. 
+ +The :c:func:`alloc_disk` function is used to allocate a disk, and the +:c:func:`del_gendisk` function is used to deallocate it. Adding the disk to the +system is done using the :c:func:`add_disk` function. + +The :c:func:`alloc_disk` and :c:func:`add_disk` functions are typically used in +the module initialization function, and the :c:func:`del_gendisk` function in +the module exit function. + +.. code-block:: c + + #include + #include + + #define MY_BLOCK_MINORS 1 + + static struct my_block_dev { + struct gendisk *gd; + //... + } dev; + + static int create_block_device(struct my_block_dev *dev) + { + dev->gd = alloc_disk(MY_BLOCK_MINORS); + //... + add_disk(dev->gd); + } + + static int my_block_init(void) + { + //... + create_block_device(&dev); + } + + static void delete_block_device(struct my_block_dev *dev) + { + if (dev->gd) + del_gendisk(dev->gd); + //... + } + + static void my_block_exit(void) + { + delete_block_device(&dev); + //... + } + +As with character devices, it is recommended to use :c:type:`my_block_dev` +structure to store important elements describing the block device. + +Note that immediately after calling the :c:func:`add_disk` function (actually +even during the call), the disk is active and its methods can be called at any +time. As a result, this function should not be called before the driver is fully +initialized and ready to respond to requests for the registered disk. + + +It can be noticed that the basic structure in working with block devices (disks) +is the :c:type:`struct gendisk` structure. + +After a call to :c:func:`del_gendisk`, the :c:type:`struct gendisk` structure +may continue to exist (and the device operations may still be called) if there +are still users (an open operation was called on the device but the associated +release operation has not been called). One solution is to keep the number of +users of the device and call the :c:func:`del_gendisk` function only when there +are no users left of the device. 
+ +:c:type:`struct gendisk` structure +================================== + +The :c:type:`struct gendisk` structure stores information about a disk. As +stated above, such a structure is obtained from the :c:func:`alloc_disk` call +and its fields must be filled before it is sent to the :c:func:`add_disk` +function. + +The :c:type:`struct gendisk` structure has the following important fields: + + * :c:member:`major`, :c:member:`first_minor`, :c:member:`minor`, describing + the identifiers used by the disk; a disk must have at least one minor; if + the disk allows the partitioning operation, a minor must be allocated for + each possible partition + * :c:member:`disk_name`, which represents the disk name as it appears in + :file:`/proc/partitions` and in sysfs (:file:`/sys/block`) + * :c:member:`fops`, representing operations associated with the disk + * :c:member:`queue`, which represents the queue of requests + * :c:member:`capacity`, which is disk capacity in 512 byte sectors; + it is initialized using the :c:func:`set_capacity` function + * :c:member:`private_data`, which is a pointer to private data + +An example of filling a :c:type:`struct gendisk` structure is presented below: + +.. code-block:: c + + #include + #include + #include + + #define NR_SECTORS 1024 + + #define KERNEL_SECTOR_SIZE 512 + + static struct my_block_dev { + //... + spinlock_t lock; /* For mutual exclusion */ + struct request_queue *queue; /* The device request queue */ + struct gendisk *gd; /* The gendisk structure */ + //... + } dev; + + static int create_block_device(struct my_block_dev *dev) + { + ... 
+ /* Initialize the gendisk structure */ + dev->gd = alloc_disk(MY_BLOCK_MINORS); + if (!dev->gd) { + printk (KERN_NOTICE "alloc_disk failure\n"); + return -ENOMEM; + } + + dev->gd->major = MY_BLOCK_MAJOR; + dev->gd->first_minor = 0; + dev->gd->fops = &my_block_ops; + dev->gd->queue = dev->queue; + dev->gd->private_data = dev; + snprintf (dev->gd->disk_name, 32, "myblock"); + set_capacity(dev->gd, NR_SECTORS); + + add_disk(dev->gd); + + return 0; + } + + static int my_block_init(void) + { + int status; + //... + status = create_block_device(&dev); + if (status < 0) + return status; + //... + } + + static void delete_block_device(struct my_block_dev *dev) + { + if (dev->gd) { + del_gendisk(dev->gd); + } + //... + } + + static void my_block_exit(void) + { + delete_block_device(&dev); + //... + } + +As stated before, the kernel considers a disk as a vector of 512 byte sectors. +In reality, the devices may have a different size of the sector. To work with +these devices, the kernel needs to be informed about the real size of a sector, +and for all operations the necessary conversions must be made. + +To inform the kernel about the device sector size, a parameter of the request +queue must be set just after the request queue is allocated, using the +:c:func:`blk_queue_logical_block_size` function. All requests generated by the +kernel will be multiple of this sector size and will be aligned accordingly. +However, communication between the device and the driver will still be performed +in sectors of 512 bytes in size, so conversion should be done each time (an +example of such conversion is when calling the :c:func:`set_capacity` function +in the code above). + +:c:type:`struct block_device_operations` structure +================================================== + +Just as for a character device, operations in :c:type:`struct file_operations` +should be completed, so for a block device, the operations in +:c:type:`struct block_device_operations` should be completed. 
The association +of operations is done through the :c:member:`fops` field in the +:c:type:`struct gendisk` +structure. + +Some of the fields of the :c:type:`struct block_device_operations` structure +are presented below: + +.. code-block:: c + + struct block_device_operations { + int (*open) (struct block_device *, fmode_t); + int (*release) (struct gendisk *, fmode_t); + int (*locked_ioctl) (struct block_device *, fmode_t, unsigned, + unsigned long); + int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); + int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, + unsigned long); + int (*direct_access) (struct block_device *, sector_t, + void **, unsigned long *); + int (*media_changed) (struct gendisk *); + int (*revalidate_disk) (struct gendisk *); + int (*getgeo)(struct block_device *, struct hd_geometry *); + blk_qc_t (*submit_bio) (struct bio *bio); + struct module *owner; + } + +:c:func:`open` and :c:func:`release` operations are called directly from user +space by utilities that may perform the following tasks: partitioning, file +system creation, file system verification. In a :c:func:`mount` operation, the +:c:func:`open` function is called directly from the kernel space, the file +descriptor being stored by the kernel. A driver for a block device can not +differentiate between :c:func:`open` calls performed from user space and kernel +space. + +An example of how to use these two functions is given below: + +.. code-block:: c + + #include + #include + + static struct my_block_dev { + //... + struct gendisk * gd; + //... + } dev; + + static int my_block_open(struct block_device *bdev, fmode_t mode) + { + //... + + return 0; + } + + static int my_block_release(struct gendisk *gd, fmode_t mode) + { + //... + + return 0; + } + + struct block_device_operations my_block_ops = { + .owner = THIS_MODULE, + .open = my_block_open, + .release = my_block_release + }; + + static int create_block_device(struct my_block_dev *dev) + { + //.... 
+ dev->gd->fops = &my_block_ops; + dev->gd->private_data = dev; + //... + } + +Please notice that there are no read or write operations. These operations are +performed by the :c:func:`request` function associated with the request queue +of the disk. + +Request Queues - Multi-Queue Block Layer +======================================== + +Drivers for block devices use queues to store the block I/O requests that will +be processed. A request queue is represented by the +:c:type:`struct request_queue` structure. The request queue is made up of a +doubly-linked list of requests and their associated control information. The +requests are added to the queue by higher-level kernel code (for example, file +systems). + +The block device driver associates each queue with a handling function, which +will be called for each request in the queue +(the :c:type:`struct request` structure). + +In earlier versions of the Linux kernel, each device driver had associated one or +more request queues (:c:type:`struct request_queue`), where any client could add +requests, while also being able to reorder them. +The problem with this approach is that it requires a per-queue lock, making it +inefficient in distributed systems. + +The `Multi-Queue Block Queueing Mechanism `_ +solves this issue by splitting the device driver queue in two parts: + 1. Software staging queues + 2. Hardware dispatch queues + +Software staging queues +----------------------- + +The staging queues hold requests from the clients before sending them to the +block device driver. To prevent the waiting for a per-queue lock, a staging +queue is allocated for each CPU or node. A software queue is associated to +only one hardware queue. + +While in this queue, the requests can be merged or reordered, according to an +I/O Scheduler, in order to maximize performance. This means that only the +requests coming from the same CPU or node can be optimized. 
+ +Staging queues are usually not used by the block device drivers, but only +internally by the I/O subsystem to optimize requests before sending them to the +device drivers. + +Hardware dispatch queues +------------------------ + +The hardware queues (:c:type:`struct blk_mq_hw_ctx`) are used to send the +requests from the staging queues to the block device driver. +Once in this queue, the requests can't be merged or reordered. + +Depending on the underlying hardware, a block device driver can create multiple +hardware queues in order to improve parallelism and maximize performance. + +Tag sets +-------- + +A block device driver can accept a request before the previous one is completed. +As a consequence, the upper layers need a way to know when a request is +completed. For this, a "tag" is added to each request upon submission and sent +back using a completion notification after the request is completed. + +The tags are part of a tag set (:c:type:`struct blk_mq_tag_set`), which is +unique to a device. +The tag set structure is allocated and initialized before the request queues +and also stores some of the queues properties. + +.. code-block:: c + + struct blk_mq_tag_set { + ... + const struct blk_mq_ops *ops; + unsigned int nr_hw_queues; + unsigned int queue_depth; + unsigned int cmd_size; + int numa_node; + void *driver_data; + struct blk_mq_tags **tags; + struct list_head tag_list; + ... + }; + +Some of the fields in :c:type:`struct blk_mq_tag_set` are: + + * ``ops`` - Queue operations, most notably the request handling function. + * ``nr_hw_queues`` - The number of hardware queues allocated for the device + * ``queue_depth`` - Hardware queues size + * ``cmd_size`` - Number of extra bytes allocated at the end of the device, to + be used by the block device driver, if needed. + * ``numa_node`` - In NUMA systems, the index of the node the storage device is + connected to. + * ``driver_data`` - Data private to the driver, if needed. 
+ * ``tags`` - Pointer to an array of ``nr_hw_queues`` tag sets. + * ``tag_list`` - List of request queues using this tag set. + +Create and delete a request queue +--------------------------------- + +Request queues are created using the :c:func:`blk_mq_init_queue` function and +are deleted using :c:func:`blk_cleanup_queue`. The first function creates both +the hardware and the software queues and initializes their structures. + +Queue properties, including the number of hardware queues, their capacity and +request handling function are configured using the :c:type:`blk_mq_tag_set` +structure, as described above. + +An example of using these functions is as follows: + +.. code-block:: c + + #include + #include + #include + + static struct my_block_dev { + //... + struct blk_mq_tag_set tag_set; + struct request_queue *queue; + //... + } dev; + + static blk_status_t my_block_request(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) + //... + + static struct blk_mq_ops my_queue_ops = { + .queue_rq = my_block_request, + }; + + static int create_block_device(struct my_block_dev *dev) + { + /* Initialize tag set. */ + dev->tag_set.ops = &my_queue_ops; + dev->tag_set.nr_hw_queues = 1; + dev->tag_set.queue_depth = 128; + dev->tag_set.numa_node = NUMA_NO_NODE; + dev->tag_set.cmd_size = 0; + dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + err = blk_mq_alloc_tag_set(&dev->tag_set); + if (err) { + goto out_err; + } + + /* Allocate queue. */ + dev->queue = blk_mq_init_queue(&dev->tag_set); + if (IS_ERR(dev->queue)) { + goto out_blk_init; + } + + blk_queue_logical_block_size(dev->queue, KERNEL_SECTOR_SIZE); + + /* Assign private data to queue structure. */ + dev->queue->queuedata = dev; + //... + + out_blk_init: + blk_mq_free_tag_set(&dev->tag_set); + out_err: + return -ENOMEM; + } + + static int my_block_init(void) + { + int status; + //... + status = create_block_device(&dev); + if (status < 0) + return status; + //... 
+ } + + static void delete_block_device(struct my_block_dev *dev) + { + //... + blk_cleanup_queue(dev->queue); + blk_mq_free_tag_set(&dev->tag_set); + } + + static void my_block_exit(void) + { + delete_block_device(&dev); + //... + } + +After initializing the tag set structure, the tag lists are allocated using the +:c:func:`blk_mq_alloc_tag_set` function. +The pointer to the function which will process the requests +(:c:func:`my_block_request`) is filled in the ``my_queue_ops`` structure and +then the pointer to this structure is added to the tag set. + +The queue is created using the :c:func:`blk_mq_init_queue` function, based on +the information added in the tag set. + +As part of the request queue initialization, you can configure the +:c:member:`queuedata` field, which is equivalent to the :c:member:`private_data` +field in other structures. + +Useful functions for processing request queues +---------------------------------------------- + +The ``queue_rq`` function from :c:type:`struct blk_mq_ops` is used to handle +requests for working with the block device. +This function is the equivalent of read and write functions encountered on +character devices. The function receives the requests for the device as +arguments and can use various functions for processing them. + +The functions used to process the requests in the handler are described below: + + * :c:func:`blk_mq_start_request` - must be called before starting processing + a request; + * :c:func:`blk_mq_requeue_request` - to re-send the request to the queue; + * :c:func:`blk_mq_end_request` - to end request processing and notify the + upper layers. + +Requests for block devices +========================== + +A request for a block device is described by :c:type:`struct request` +structure. 
+ +The fields of :c:type:`struct request` structure include: + + * :c:member:`cmd_flags`: a series of flags including direction (reading or + writing); to find out the direction, the macrodefinition + :c:macro:`rq_data_dir` is used, which returns 0 for a read request and 1 + for a write request on the device; + * :c:member:`__sector`: the first sector of the transfer request; if the + device sector has a different size, the appropriate conversion should be + done. To access this field, use the :c:macro:`blk_rq_pos` macro; + * :c:member:`__data_len`: the total number of bytes to be transferred; to + access this field the :c:macro:`blk_rq_bytes` macro is used; + * generally, data from the current :c:type:`struct bio` will be + transferred; the data size is obtained using the + :c:macro:`blk_rq_cur_bytes` macro; + * :c:member:`bio`, a dynamic list of :c:type:`struct bio` structures that + is a set of buffers associated to the request; this field is accessed by + macrodefinition :c:macro:`rq_for_each_segment` if there are multiple + buffers, or by :c:macro:`bio_data` macrodefinition in case there is only + one associated buffer; + +We will discuss more about the :c:type:`struct bio` structure and its +associated operations in the :ref:`bio_structure` section. + +Create a request +---------------- + +Read /write requests are created by code layers superior to the kernel I/O +subsystem. Typically, the subsystem that creates requests for block devices is +the file management subsystem. The I/O subsystem acts as an interface between +the file management subsystem and the block device driver. The main operations +under the responsibility of the I/O subsystem are adding requests to the queue +of the specific block device and sorting and merging requests according to +performance considerations. + +Process a request +----------------- + +The central part of a block device driver is the request handling function +(``queue_rq``). 
In previous examples, the function that fulfilled this role was +:c:func:`my_block_request`. As stated in the +`Create and delete a request queue`_ section, this function is associated to the +driver when creating the tag set structure. + +This function is called when the kernel considers that the driver should process +I/O requests. The function must start processing the requests from the queue, +but it is not mandatory to finish them, as requests may be finished by other +parts of the driver. + +The request function runs in an atomic context and must follow the rules for +atomic code (it must not call functions that can cause sleep, etc.). + +Calling the function that processes the requests is asynchronous relative +to the actions of any userspace process and no assumptions about the process +in which the respective function is running should be made. Also, it should not +be assumed that the buffer provided by a request is from kernel space or user +space, any operation that accesses the userspace being erroneous. + +One of the simplest request handling functions is presented below: + +.. code-block:: c + + static blk_status_t my_block_request(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) + { + struct request *rq = bd->rq; + struct my_block_dev *dev = hctx->queue->queuedata; + + blk_mq_start_request(rq); + + if (blk_rq_is_passthrough(rq)) { + printk (KERN_NOTICE "Skip non-fs request\n"); + blk_mq_end_request(rq, BLK_STS_IOERR); + goto out; + } + + /* do work */ + ... + + blk_mq_end_request(rq, BLK_STS_OK); + + out: + return BLK_STS_OK; + } + +The :c:func:`my_block_request` function performs the following operations: + + * Get a pointer to the request structure from the ``bd`` argument and start + its processing using the :c:func:`blk_mq_start_request` function. + * A block device can receive calls which do not transfer data blocks (e.g. + low level operations on the disk, instructions referring to special ways of + accessing the device).
Most drivers do not know how to handle these + requests and return an error. + * To return an error, :c:func:`blk_mq_end_request` function is called, + ``BLK_STS_IOERR`` being the second argument. + * The request is processed according to the needs of the associated device. + * The request ends. In this case, :c:func:`blk_mq_end_request` function is + called in order to complete the request. + +.. _bio_structure: + +:c:type:`struct bio` structure +============================== + +Each :c:type:`struct request` structure is an I/O block request, but may come +from combining more independent requests from a higher level. The sectors to be +transferred for a request can be scattered into the main memory but they always +correspond to a set of consecutive sectors on the device. The request is +represented as a series of segments, each corresponding to a buffer in memory. +The kernel can combine requests that refer to adjacent sectors but will not +combine write requests with read requests into a single +:c:type:`struct request` structure. + +A :c:type:`struct request` structure is implemented as a linked list of +:c:type:`struct bio` structures together with information that allows the +driver to retain its current position while processing the request. + +The :c:type:`struct bio` structure is a low-level description of a portion of +a block I/O request. + +.. code-block:: c + + struct bio { + //... + struct gendisk *bi_disk; + unsigned int bi_opf; /* bottom bits req flags, top bits REQ_OP. Use accessors. */ + //... + struct bio_vec *bi_io_vec; /* the actual vec list */ + //... + struct bvec_iter bi_iter; + //... + void *bi_private; + //... + }; + +In turn, the :c:type:`struct bio` structure contains a :c:member:`bi_io_vec` +vector of :c:type:`struct bio_vec` structures. It consists of the individual +pages in the physical memory to be transferred, the offset within the page and +the size of the buffer.
To iterate through a :c:type:`struct bio` structure, +we need to iterate through the vector of :c:type:`struct bio_vec` and transfer +the data from every physical page. To simplify vector iteration, the +:c:type:`struct bvec_iter` structure is used. This structure maintains +information about how many buffers and sectors were consumed during the +iteration. The request type is encoded in the :c:member:`bi_opf` field; to +determine it, use the :c:func:`bio_data_dir` function. + +Create a :c:type:`struct bio` structure +--------------------------------------- + +Two functions can be used to create a :c:type:`struct bio` structure: + + * :c:func:`bio_alloc`: allocates space for a new structure; the structure + must be initialized; + * :c:func:`bio_clone`: makes a copy of an existing :c:type:`struct bio` + structure; the newly obtained structure is initialized with the values of + the cloned structure fields; the buffers are shared with the + :c:type:`struct bio` structure that has been cloned so that access to the + buffers has to be done carefully to avoid access to the same memory area + from the two clones; + +Both functions return a new :c:type:`struct bio` structure. + +Submit a :c:type:`struct bio` structure +--------------------------------------- + +Usually, a :c:type:`struct bio` structure is created by the higher levels of +the kernel (usually the file system). A structure thus created is then +transmitted to the I/O subsystem that gathers more :c:type:`struct bio` +structures into a request. + +For submitting a :c:type:`struct bio` structure to the associated I/O device +driver, the :c:func:`submit_bio` function is used. The function receives as +argument an initialized :c:type:`struct bio` structure that will be added to +a request from the request queue of an I/O device. From that queue, it can be +processed by the I/O device driver using a specialized function. + + +.. 
_bio_completion: + +Wait for the completion of a :c:type:`struct bio` structure +----------------------------------------------------------- + +Submitting a :c:type:`struct bio` structure to a driver has the effect of +adding it to a request from the request queue from where it will be further +processed. Thus, when the :c:func:`submit_bio` function returns, it is not +guaranteed that the processing of the structure has finished. If you want to +wait for the processing of the request to be finished, use the +:c:func:`submit_bio_wait` function. + +To be notified when the processing of a :c:type:`struct bio` structure ends +(when we do not use :c:func:`submit_bio_wait` function), the +:c:member:`bi_end_io` field of the structure should be used. This field +specifies the function that will be called at the end of the +:c:type:`struct bio` structure processing. You can use the +:c:member:`bi_private` field of the structure to pass information to the +function. + +Initialize a :c:type:`struct bio` structure +------------------------------------------- + +Once a :c:type:`struct bio` structure has been allocated and before being +transmitted, it must be initialized. + +Initializing the structure involves filling in its important fields. As +mentioned above, the :c:member:`bi_end_io` field is used to specify the function +called when the processing of the structure is finished. The +:c:member:`bi_private` field is used to store useful data that can be accessed +in the function pointed by :c:member:`bi_end_io`. + +The :c:member:`bi_opf` field specifies the type of operation. + +.. code-block:: c + + struct bio *bio = bio_alloc(GFP_NOIO, 1); + //... + bio->bi_disk = bdev->bd_disk; + bio->bi_iter.bi_sector = sector; + bio->bi_opf = REQ_OP_READ; + bio_add_page(bio, page, size, offset); + //... 
+ +In the code snippet above we specified the block device to which we sent the +following: :c:type:`struct bio` structure, startup sector, operation +(:c:data:`REQ_OP_READ` or :c:data:`REQ_OP_WRITE`) and content. The content of a +:c:type:`struct bio` structure is a buffer described by: a physical page, +the offset in the page and the size of the buffer. A page can be assigned using +the :c:func:`alloc_page` call. + +.. note:: The :c:data:`size` field of the :c:func:`bio_add_page` call must be + a multiple of the device sector size. + +.. _bio_content: + +How to use the content of a :c:type:`struct bio` structure +---------------------------------------------------------- + +To use the content of a :c:type:`struct bio` structure, the structure's +support pages must be mapped to the kernel address space from where they can be +accessed. For mapping/unmapping, use the :c:macro:`kmap_atomic` and +the :c:macro:`kunmap_atomic` macros. + +A typical example of use is: + +.. code-block:: c + + static void my_block_transfer(struct my_block_dev *dev, size_t start, + size_t len, char *buffer, int dir); + + + static int my_xfer_bio(struct my_block_dev *dev, struct bio *bio) + { + struct bio_vec bvec; + struct bvec_iter i; + int dir = bio_data_dir(bio); + + /* Do each segment independently. */ + bio_for_each_segment(bvec, bio, i) { + sector_t sector = i.bi_sector; + char *buffer = kmap_atomic(bvec.bv_page); + unsigned long offset = bvec.bv_offset; + size_t len = bvec.bv_len; + + /* process mapped buffer */ + my_block_transfer(dev, sector, len, buffer + offset, dir); + + kunmap_atomic(buffer); + } + + return 0; + } + +As it can be seen from the example above, iterating through a +:c:type:`struct bio` requires iterating through all of its segments. A segment +(:c:type:`struct bio_vec`) is defined by the physical address page, the offset +in the page and its size. + +To simplify the processing of a :c:type:`struct bio`, use the +:c:macro:`bio_for_each_segment` macrodefinition.
It will iterate through all +segments, and will also update global information stored in an iterator +(:c:type:`struct bvec_iter`) such as the current sector as well as other +internal information (segment vector index, number of bytes left to be +processed, etc.). + +You can store information in the mapped buffer, or extract information. + +In case request queues are used and you need to process the requests +at :c:type:`struct bio` level, use the :c:macro:`rq_for_each_segment` +macrodefinition instead of the :c:macro:`bio_for_each_segment` macrodefinition. +This macrodefinition iterates through each segment of each +:c:type:`struct bio` structure of a :c:type:`struct request` structure and +updates a :c:type:`struct req_iterator` structure. The +:c:type:`struct req_iterator` contains the current :c:type:`struct bio` +structure and the iterator that traverses its segments. + +A typical example of use is: + +.. code-block:: c + + struct bio_vec bvec; + struct req_iterator iter; + + rq_for_each_segment(bvec, req, iter) { + sector_t sector = iter.iter.bi_sector; + char *buffer = kmap_atomic(bvec.bv_page); + unsigned long offset = bvec.bv_offset; + size_t len = bvec.bv_len; + int dir = bio_data_dir(iter.bio); + + my_block_transfer(dev, sector, len, buffer + offset, dir); + + kunmap_atomic(buffer); + } + +Free a :c:type:`struct bio` structure +------------------------------------- + +Once a kernel subsystem uses a :c:type:`struct bio` structure, it will have to +release the reference to it. This is done by calling the :c:func:`bio_put` function. + +Set up a request queue at :c:type:`struct bio` level +---------------------------------------------------- + +We have previously seen how we can specify a function to be used to process +requests sent to the driver. The function receives as argument the requests and +carries out processing at :c:type:`struct request` level.
+ +If, for flexibility reasons, we need to specify a function that carries +out processing at :c:type:`struct bio` structure level, we no longer +use request queues and we will need to fill the ``submit_bio`` field in the +:c:type:`struct block_device_operations` associated to the driver. + +Below is a typical example of initializing a function that carries out +processing at :c:type:`struct bio` structure level: + +.. code-block:: c + + // the declaration of the function that carries out the processing of + // struct bio structures + static blk_qc_t my_submit_bio(struct bio *bio); + + struct block_device_operations my_block_ops = { + .owner = THIS_MODULE, + .submit_bio = my_submit_bio, + ... + }; + +Further reading +=============== + +* `Linux Device Drivers 3rd Edition, Chapter 16. Block Drivers `_ +* Linux Kernel Development, Second Edition – Chapter 13. The Block I/O Layer +* `A simple block driver `_ +* `The gendisk interface `_ +* `The bio structure `_ +* `Request queues `_ +* `Documentation/block/request.txt - Struct request documentation `_ +* `Documentation/block/biodoc.txt - Notes on the Generic Block Layer `_ +* `drivers/block/brd.c - RAM backed block disk driver `_ +* `I/O Schedulers `_ + + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: block_device_drivers + +0. Intro +-------- + +Using |LXR|_ find the definitions of the following symbols in the Linux kernel: + + * :c:type:`struct bio` + * :c:type:`struct bio_vec` + * :c:macro:`bio_for_each_segment` + * :c:type:`struct gendisk` + * :c:type:`struct block_device_operations` + * :c:type:`struct request` + +1. Block device +--------------- + +Create a kernel module that allows you to register or deregister a block device. +Start from the files in the :file:`1-2-3-6-ram-disk/kernel` directory in the +lab skeleton. + +Follow the comments marked with **TODO 1** in the laboratory skeleton.
Use the +existing macrodefinitions (:c:macro:`MY_BLOCK_MAJOR`, +:c:macro:`MY_BLKDEV_NAME`). Check the value returned by the register function, +and in case of error, return the error code. + +Compile the module, copy it to the virtual machine and insert it into the +kernel. Verify that your device was successfully created inside the +:file:`/proc/devices`. +You will see a device with major 240. + +Unload the kernel module and check that the device was unregistered. + +.. hint:: Review the `Register a block I/O device`_ section. + +Change the :c:macro:`MY_BLOCK_MAJOR` value to 7. Compile the module, copy it to +the virtual machine, and insert it into the kernel. Notice that the insertion +fails because there is already another driver/device registered in the kernel +with the major 7. + +Restore the 240 value for the :c:macro:`MY_BLOCK_MAJOR` macro. + +2. Disk registration +-------------------- + +Modify the previous module to add a disk associated with the driver. Analyze the +macrodefinitions, :c:type:`my_block_dev` structure and existing functions from +the :file:`ram-disk.c` file. + +Follow the comments marked with **TODO 2**. Use the +:c:func:`create_block_device` and the :c:func:`delete_block_device` functions. + +.. hint:: Review the `Register a disk`_ and `Process a request`_ sections. + +Fill in the :c:func:`my_block_request` function to process the request +without actually processing your request: display the "request received" message +and the following information: start sector, total size, data size from the +current :c:type:`struct bio` structure, direction. To validate a request type, +use the :c:func:`blk_rq_is_passthrough` (the function returns 0 in the case in +which we are interested, i.e. when the request is generated by the file system). + +.. hint:: To find the needed info, review the `Requests for block devices`_ + section. + +Use the :c:func:`blk_mq_end_request` function to finish processing the +request. 
+ +Insert the module into the kernel and inspect the messages printed +by the module. When a device is added, a request is sent to the device. Check +the presence of :file:`/dev/myblock` and if it doesn't exist, create the device +using the command: + +.. code-block:: shell + + mknod /dev/myblock b 240 0 + +To generate writing requests, use the command: + +.. code-block:: shell + + echo "abc"> /dev/myblock + +Notice that a write request is preceded by a read request. The request +is done to read the block from the disk and "update" its content with the +data provided by the user, without overwriting the rest. After reading and +updating, writing takes place. + +3. RAM disk +----------- + +Modify the previous module to create a RAM disk: requests to the device will +result in reads/writes in a memory area. + +The memory area :c:data:`dev->data` is already allocated in the source code of +the module using :c:func:`vmalloc` and deallocated using :c:func:`vfree`. + +.. note:: Review the `Process a request`_ section. + +Follow the comments marked with **TODO 3** to complete the +:c:func:`my_block_transfer` function to write/read the request information +in/from the memory area. The function will be called for each request within +the queue processing function: :c:func:`my_block_request`. To write/read +to/from the memory area, use :c:func:`memcpy`. To determine the write/read +information, use the fields of the :c:type:`struct request` structure. + +.. hint:: To find out the size of the request data, use the + :c:macro:`blk_rq_cur_bytes` macro. Do not use the + :c:macro:`blk_rq_bytes` macro. + +.. hint:: To find out the buffer associated to the request, use + :c:data:`bio_data`(:c:data:`rq->bio`). + +.. hint:: A description of useful macros is in the `Requests for block devices`_ + section. + +.. hint:: You can find useful information in the + `block device driver example + `_ + from `Linux Device Driver `_. 
+ +For testing, use the test file :file:`user/ram-disk-test.c`. +The test program is compiled automatically at ``make build``, copied to the +virtual machine at ``make copy`` and can be run on the QEMU virtual machine +using the command: + +.. code-block:: shell + + ./ram-disk-test + +There is no need to insert the module into the kernel, it will be inserted by +the ``ram-disk-test`` command. + +Some tests may fail because of lack of synchronization between the transmitted +data (flush). + +4. Read data from the disk +-------------------------- + +The purpose of this exercise is to read data from the +:c:macro:`PHYSICAL_DISK_NAME` disk (:file:`/dev/vdb`) directly from the kernel. + +.. attention:: Before solving the exercise, we need to make sure the disk is + added to the virtual machine. + + Check the variable ``QEMU_OPTS`` from :file:`qemu/Makefile`. + There should already be two extra disks added using ``-drive ...``. + + If there are not, generate a file that we will use as + the disk image using the command: + :command:`dd if=/dev/zero of=qemu/mydisk.img bs=1024 count=1` + and add the following option: + :command:`-drive file=qemu/mydisk.img,if=virtio,format=raw` + to :file:`qemu/Makefile` (in the :c:data:`QEMU_OPTS` variable, + after the root disk). + +Follow the comments marked with **TODO 4** in the directory :file:`4-5-relay/` +and implement :c:func:`open_disk` and :c:func:`close_disk`. +Use the :c:func:`blkdev_get_by_path` and :c:func:`blkdev_put` functions. The +device must be opened in read-write mode exclusively +(:c:macro:`FMODE_READ` | :c:macro:`FMODE_WRITE` | :c:macro:`FMODE_EXCL`), and +as holder you must use the current module (:c:macro:`THIS_MODULE`). + +Implement the :c:func:`send_test_bio` function. You will have to create a new +:c:type:`struct bio` structure and fill it, submit it and wait for it. Read the +first sector of the disk. To wait, call the :c:func:`submit_bio_wait` function. + +.. 
hint:: The first sector of the disk is the sector with the index 0. + This value must be used to initialize the field + :c:member:`bi_iter.bi_sector` of the :c:type:`struct bio`. + + For the read operation, use the :c:macro:`REQ_OP_READ` macro to + initialize the :c:member:`bi_opf` field of the :c:type:`struct bio`. + +After finishing the operation, display the first 3 bytes of data read by +:c:type:`struct bio` structure. Use the format ``"% 02x"`` for :c:func:`printk` +to display the data and the :c:macro:`kmap_atomic` and :c:macro:`kunmap_atomic` +macros respectively. + +.. hint:: As an argument for the :c:func:`kmap_atomic` function, just use the + page which is allocated above in the code, in the :c:data:`page` + variable. + +.. hint:: Review the sections :ref:`bio_content` and :ref:`bio_completion`. + +For testing, use the :file:`test-relay-disk` script, which is copied on the +virtual machine when running :command:`make copy`. If it is not copied, make +sure it is executable: + +.. code-block:: shell + + chmod +x test-relay-disk + +There is no need to load the module into the kernel, it will be loaded by +:command:`test-relay-disk`. + +Use the command below to run the script: + +.. code-block:: shell + + ./test-relay-disk + +The script writes "abc" at the beginning of the disk indicated by +:c:macro:`PHYSICAL_DISK_NAME`. After running, the module will display 61 62 63 +(the corresponding hexadecimal values of letters "a", "b" and "c"). + +5. Write data to the disk +------------------------- + +Follow the comments marked with **TODO 5** to write a message +(:c:macro:`BIO_WRITE_MESSAGE`) on the disk. + +The :c:func:`send_test_bio` function receives as argument the operation type +(read or write). Call in the :c:func:`relay_init` function the function for +reading and in the :c:func:`relay_exit` function the function for writing. We +recommend using the :c:macro:`REQ_OP_READ` and the :c:macro:`REQ_OP_WRITE` +macros. 
+ +Inside the :c:func:`send_test_bio` function, if the operation is write, fill in +the buffer associated to the :c:type:`struct bio` structure with the message +:c:macro:`BIO_WRITE_MESSAGE`. Use the :c:macro:`kmap_atomic` and the +:c:macro:`kunmap_atomic` macros to work with the buffer associated to the +:c:type:`struct bio` structure. + +.. hint:: You need to update the type of the operation associated to the + :c:type:`struct bio` structure by setting the :c:member:`bi_opf` field + accordingly. + +For testing, run the :file:`test-relay-disk` script using the command: + +.. code-block:: shell + + ./test-relay-disk + +The script will display the ``"read from /dev/sdb: 64 65 66"`` message at the +standard output. + +6. Processing requests from the request queue at :c:type:`struct bio` level +--------------------------------------------------------------------------- + +In the implementation from Exercise 3, we have only processed a +:c:type:`struct bio_vec` of the current :c:type:`struct bio` from the request. +We want to process all :c:type:`struct bio_vec` structures from all +:c:type:`struct bio` structures. +For this, we will iterate through all :c:type:`struct bio` requests and through +all :c:type:`struct bio_vec` structures (also called segments) of each +:c:type:`struct bio`. + +Add, within the ramdisk implementation (:file:`1-2-3-6-ram-disk/` directory), +support for processing the requests from the request queue at +:c:type:`struct bio` level. Follow the comments marked with **TODO 6**. + +Set the :c:macro:`USE_BIO_TRANSFER` macro to 1. + +Implement the :c:func:`my_xfer_request` function. Use the +:c:macro:`rq_for_each_segment` macro to iterate through the :c:type:`bio_vec` +structures of each :c:type:`struct bio` from the request. + +.. hint:: Review the indications and the code snippets from the + :ref:`bio_content` section. + +.. hint:: Use the :c:type:`struct bio` segment iterator to get the current + sector (:c:member:`iter.iter.bi_sector`). + +.. 
hint:: Use the request iterator to get the reference to the current + :c:type:`struct bio` (:c:member:`iter.bio`). + +.. hint:: Use the :c:macro:`bio_data_dir` macro to find the reading or writing + direction for a :c:type:`struct bio`. + +Use the :c:macro:`kmap_atomic` and the :c:macro:`kunmap_atomic` macros to map and unmap +the pages of each :c:type:`struct bio` structure and access its associated +buffers. For the actual transfer, call the :c:func:`my_block_transfer` function +implemented in the previous exercise. + +For testing, use the :file:`ram-disk-test.c` test file: + +.. code-block:: shell + + ./ram-disk-test + +There is no need to insert the module into the kernel, it will be inserted by +the :command:`ram-disk-test` executable. + +Some tests may fail because of lack of synchronization between the transmitted +data (flush). diff --git a/refs/pull/405/merge/_sources/labs/deferred_work.rst.txt b/refs/pull/405/merge/_sources/labs/deferred_work.rst.txt new file mode 100644 index 00000000..72cf9ac8 --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/deferred_work.rst.txt @@ -0,0 +1,946 @@ +============= +Deferred work +============= + +Lab objectives +============== + +* Understanding deferred work (i.e. code scheduled to be executed at a + later time) +* Implementation of common tasks that use deferred work +* Understanding the peculiarities of synchronization for deferred work + +Keywords: softirq, tasklet, struct tasklet_struct, bottom-half +handlers, jiffies, HZ, timer, struct timer_list, spin_lock_bh, +spin_unlock_bh, workqueue, struct work_struct, kernel thread, events/x + +Background information +====================== + +Deferred work is a class of kernel facilities that allows one to +schedule code to be executed at a later time. This scheduled code can +run either in the process context or in interrupt context depending +on the type of deferred work.
Deferred work is used to complement the +interrupt handler functionality since interrupts have important +requirements and limitations: + +* The execution time of the interrupt handler must be as small as + possible +* In interrupt context we can not use blocking calls + +Using deferred work we can perform the minimum required work in the +interrupt handler and schedule an asynchronous action from the +interrupt handler to run at a later time and execute the rest of the +operations. + +Deferred work that runs in interrupt context is also known as +bottom-half, since its purpose is to execute the rest of the actions +from an interrupt handler (top-half). + +Timers are another type of deferred work that are used to schedule the +execution of future actions after a certain amount of time has passed. + +Kernel threads are not themselves deferred work, but can be used to +complement the deferred work mechanisms. In general, kernel threads +are used as "workers" to process events whose execution contains +blocking calls. + +There are three typical operations that are used with all types of +deferred work: + +1. **Initialization**. Each type is described by a structure whose + fields will have to be initialized. The handler to be scheduled is + also set at this time. +2. **Scheduling**. Schedules the execution of the handler as soon as + possible (or after expiry of a timeout). +3. **Masking** or **Canceling**. Disables the execution of the + handler. This action can be either synchronous (which guarantees + that the handler will not run after the completion of canceling) or + asynchronous. + +.. attention:: When doing deferred work cleanup, like freeing the + structures associated with the deferred work or + removing the module and thus the handler code from the + kernel, always use the synchronous type of canceling + the deferred work. + +The main types of deferred work are kernel threads and softirqs. 
Work +queues are implemented on top of kernel threads and tasklets and +timers on top of softirqs. Bottom-half handlers were the first +implementation of deferred work in Linux, but in the meantime it was +replaced by softirqs. That is why some functions presented +contain *bh* in their name. + +Softirqs +======== + +softirqs can not be used by device drivers, they are reserved for +various kernel subsystems. Because of this there is a fixed number of +softirqs defined at compile time. For the current kernel version we +have the following types defined: + +.. code-block:: c + + enum { + HI_SOFTIRQ = 0, + TIMER_SOFTIRQ, + NET_TX_SOFTIRQ, + NET_RX_SOFTIRQ, + BLOCK_SOFTIRQ, + IRQ_POLL_SOFTIRQ, + TASKLET_SOFTIRQ, + SCHED_SOFTIRQ, + HRTIMER_SOFTIRQ, + RCU_SOFTIRQ, + NR_SOFTIRQS + }; + + +Each type has a specific purpose: + +* *HI_SOFTIRQ* and *TASKLET_SOFTIRQ* - running tasklets +* *TIMER_SOFTIRQ* - running timers +* *NET_TX_SOFTIRQ* and *NET_RX_SOFTIRQ* - used by the networking subsystem +* *BLOCK_SOFTIRQ* - used by the IO subsystem +* *IRQ_POLL_SOFTIRQ* - used by the IO subsystem to increase performance when the iopoll handler is invoked; +* *SCHED_SOFTIRQ* - load balancing +* *HRTIMER_SOFTIRQ* - implementation of high precision timers +* *RCU_SOFTIRQ* - implementation of RCU type mechanisms [1]_ + +.. [1] RCU is a mechanism by which destructive operations + (e.g. deleting an element from a chained list) are done in two + steps: (1) removing references to deleted data and (2) freeing + the memory of the element. The second step is done only after + we are sure nobody uses the element anymore. The advantage of + this mechanism is that reading the data can be done without + synchronization. For more information see + Documentation/RCU/rcu.txt. + + +The highest priority is the *HI_SOFTIRQ* type softirqs, followed in +order by the other softirqs defined. *RCU_SOFTIRQ* has the lowest +priority.
+ +Softirqs are running in interrupt context which means that they can +not call blocking functions. If the softirq handler requires calls to +such functions, work queues can be scheduled to execute these blocking +calls. + +Tasklets +-------- + +A tasklet is a special form of deferred work that runs in interrupt +context, just like softirqs. The main difference between softirqs and tasklets +is that tasklets can be allocated dynamically and thus they can be used +by device drivers. A tasklet is represented by :c:type:`struct +tasklet_struct` and as many other kernel structures it needs to be +initialized before being used. A pre-initialized tasklet can be defined +as follows: + +.. code-block:: c + + void handler(unsigned long data); + + DECLARE_TASKLET(tasklet, handler, data); + DECLARE_TASKLET_DISABLED(tasklet, handler, data); + + +If we want to initialize the tasklet manually we can use the following +approach: + +.. code-block:: c + + void handler(unsigned long data); + + struct tasklet_struct tasklet; + + tasklet_init(&tasklet, handler, data); + +The *data* parameter will be sent to the handler when it is executed. + +Programming tasklets for running is called scheduling. Tasklets are +running from softirqs. Tasklets scheduling is done with: + +.. code-block:: c + + void tasklet_schedule(struct tasklet_struct *tasklet); + + void tasklet_hi_schedule(struct tasklet_struct *tasklet); + +When using *tasklet_schedule*, a *TASKLET_SOFTIRQ* softirq is +scheduled and all tasklets scheduled are run. For +*tasklet_hi_schedule*, a *HI_SOFTIRQ* softirq is scheduled. + +If a tasklet was scheduled multiple times and it did not run between +schedules, it will run once. Once the tasklet has run, it can be +re-scheduled, and will run again at a later time. Tasklets can be +re-scheduled from their handlers. + +Tasklets can be masked and the following functions can be used: + +..
code-block:: c + + void tasklet_enable(struct tasklet_struct * tasklet); + void tasklet_disable(struct tasklet_struct * tasklet); + +Remember that since tasklets are running from softirqs, blocking calls +can not be used in the handler function. + +Timers +------ + +A particular type of deferred work, very often used, are timers. They +are defined by :c:type:`struct timer_list`. They run in interrupt +context and are implemented on top of softirqs. + +To be used, a timer must first be initialized by calling :c:func:`timer_setup`: + +.. code-block:: c + + #include + + void timer_setup(struct timer_list * timer, + void (*function)(struct timer_list *), + unsigned int flags); + +The above function initializes the internal fields of the structure +and associates *function* as the timer handler. Since timers are planned +over softirqs, blocking calls can not be used in the code associated +with the treatment function. + +Scheduling a timer is done with :c:func:`mod_timer`: + +.. code-block:: c + + int mod_timer(struct timer_list *timer, unsigned long expires); + +Where *expires* is the time (in the future) to run the handler +function. The function can be used to schedule or reschedule a timer. + +The time unit is *jiffie*. The absolute value of a jiffie +is dependent on the platform and it can be found using the +:c:type:`HZ` macro that defines the number of jiffies for 1 second. To +convert between jiffies (*jiffies_value*) and seconds (*seconds_value*), +the following formulas are used: + +.. code-block:: c + + jiffies_value = seconds_value * HZ ; + seconds_value = jiffies_value / HZ ; + +The kernel maintains a counter that contains the number of jiffies +since the last boot, which can be accessed via the :c:macro:`jiffies` +global variable or macro. We can use it to calculate a time in the +future for timers: + +.. 
code-block:: c + + #include <linux/jiffies.h> + + unsigned long current_jiffies, next_jiffies; + unsigned long seconds = 1; + + current_jiffies = jiffies; + next_jiffies = jiffies + seconds * HZ; + +To stop a timer, use :c:func:`del_timer` and :c:func:`del_timer_sync`: + +.. code-block:: c + + int del_timer(struct timer_list *timer); + int del_timer_sync(struct timer_list *timer); + +These functions can be called for both a scheduled timer and an +unplanned timer. :c:func:`del_timer_sync` is used to eliminate the +races that can occur on multiprocessor systems, since at the end of +the call it is guaranteed that the timer processing function does not +run on any processor. + +A frequent mistake in using timers is that we forget to turn off +timers. For example, before removing a module, we must stop the timers +because if a timer expires after the module is removed, the handler +function will no longer be loaded into the kernel and a kernel oops +will be generated. + +The usual sequence used to initialize and schedule a one-second +timeout is: + +.. code-block:: c + + #include <linux/timer.h> + + void timer_function(struct timer_list *); + + struct timer_list timer ; + unsigned long seconds = 1; + + timer_setup(&timer, timer_function, 0); + mod_timer(&timer, jiffies + seconds * HZ); + +And to stop it: + +.. code-block:: c + + del_timer_sync(&timer); + +Locking +------- + +For synchronization between code running in process context (A) and +code running in softirq context (B) we need to use special locking +primitives. We must use spinlock operations augmented with +deactivation of bottom-half handlers on the current processor in (A), +and in (B) only basic spinlock operations. Using spinlocks makes sure +that we don't have races between multiple CPUs while deactivating the +softirqs makes sure that we don't deadlock if the softirq is scheduled +on the same CPU where we already acquired a spinlock. 
+ +We can use :c:func:`local_bh_disable` and +:c:func:`local_bh_enable` to disable and enable softirq handlers (and, +since they run on top of softirqs, also timers and tasklets): + +.. code-block:: c + + void local_bh_disable(void); + void local_bh_enable(void); + +Nested calls are allowed; the actual reactivation of the softirqs is +done only when all local_bh_disable() calls have been complemented by +local_bh_enable() calls: + +.. code-block:: c + + /* We assume that softirqs are enabled */ + local_bh_disable(); /* Softirqs are now disabled */ + local_bh_disable(); /* Softirqs remain disabled */ + + local_bh_enable(); /* Softirqs remain disabled */ + local_bh_enable(); /* Softirqs are now enabled */ + +.. attention:: The above calls will disable the softirqs only on the + local processor and they are usually not safe to use on their own; they must be + complemented with spinlocks. + + +Most of the time device drivers will use special versions of spinlock +calls for synchronization like :c:func:`spin_lock_bh` and +:c:func:`spin_unlock_bh`: + +.. code-block:: c + + void spin_lock_bh(spinlock_t *lock); + void spin_unlock_bh(spinlock_t *lock); + + +Workqueues +========== + +Workqueues are used to schedule actions to run in process context. The +base unit with which they work is called work. There are two types of +work: + +* :c:type:`struct work_struct` - it schedules a task to run at + a later time +* :c:type:`struct delayed_work` - it schedules a task to run after at + least a given time interval + +A delayed work uses a timer to run after the specified time +interval. The calls with this type of work are similar to those for +:c:type:`struct work_struct`, but have **_delayed** in the function +names. + +Before using them a work item must be initialized. There are two types +of macros that can be used, one that declares and initializes the work +item at the same time and one that only initializes the work item (and +the declaration must be done separately): + +.. 
code-block:: c + + #include <linux/workqueue.h> + + DECLARE_WORK(name , void (*function)(struct work_struct *)); + DECLARE_DELAYED_WORK(name, void(*function)(struct work_struct *)); + + INIT_WORK(struct work_struct *work, void(*function)(struct work_struct *)); + INIT_DELAYED_WORK(struct delayed_work *work, void(*function)(struct work_struct *)); + +:c:func:`DECLARE_WORK` and :c:func:`DECLARE_DELAYED_WORK` declare and +initialize a work item, and :c:func:`INIT_WORK` and +:c:func:`INIT_DELAYED_WORK` initialize an already declared work item. + +The following sequence declares and initializes a work item: + +.. code-block:: c + + #include <linux/workqueue.h> + + void my_work_handler(struct work_struct *work); + + DECLARE_WORK(my_work, my_work_handler); + +Or, if we want to initialize the work item separately: + +.. code-block:: c + + void my_work_handler(struct work_struct * work); + + struct work_struct my_work; + + INIT_WORK(&my_work, my_work_handler); + +Once declared and initialized, we can schedule the task using +:c:func:`schedule_work` and :c:func:`schedule_delayed_work`: + +.. code-block:: c + + schedule_work(struct work_struct *work); + + schedule_delayed_work(struct delayed_work *work, unsigned long delay); + +:c:func:`schedule_delayed_work` can be used to plan a work item for +execution with a given delay. The delay time unit is jiffies. + +Work items can not be masked but they can be canceled by calling +:c:func:`cancel_delayed_work_sync` or :c:func:`cancel_work_sync`: + +.. code-block:: c + + int cancel_work_sync(struct work_struct *work); + int cancel_delayed_work_sync(struct delayed_work *work); + +The call only stops the subsequent execution of the work item. If the +work item is already running at the time of the call, it will continue +to run. In any case, when these calls return, it is guaranteed that +the task will no longer run. + +.. attention:: While there are versions of these functions that are + not synchronous (e.g. 
:c:func:`cancel_work`) do not + use them when you are performing cleanup work, otherwise + race conditions could occur. + +We can wait for a workqueue to complete running all of its work items by calling :c:func:`flush_scheduled_work`: + +.. code-block:: c + + void flush_scheduled_work(void); + +This function is blocking and, therefore, can not be used in interrupt +context. The function will wait for all work items to be completed. +For delayed work items, :c:func:`cancel_delayed_work` must be called +before :c:func:`flush_scheduled_work`. + +Finally, the following functions can be used to schedule work items on +a particular processor (:c:func:`schedule_delayed_work_on`), or on all +processors (:c:func:`schedule_on_each_cpu`): + +.. code-block:: c + + int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay); + int schedule_on_each_cpu(void(*function)(struct work_struct *)); + +A usual sequence to initialize and schedule a work item is the following: + +.. code-block:: c + + void my_work_handler(struct work_struct *work); + + struct work_struct my_work; + + INIT_WORK(&my_work, my_work_handler); + + schedule_work(&my_work); + +And for waiting for termination of a work item: + +.. code-block:: c + + flush_scheduled_work(); + +As you can see, the *my_work_handler* function receives the work item as +its parameter. To be able to access the module's private data, you can +use :c:func:`container_of`: + +.. code-block:: c + + struct my_device_data { + struct work_struct my_work; + // ... + }; + + void my_work_handler(struct work_struct *work) + { + struct my_device_data * my_data; + + my_data = container_of(work, struct my_device_data, my_work); + // ... + } + +Scheduling work items with the functions above will run the handler in +the context of a kernel thread called *events/x*, where x is the +processor number. The kernel will initialize a kernel thread (or a +pool of workers) for each processor present in the system: + +.. 
code-block:: shell + + $ ps -e + PID TTY TIME CMD + 1 ? 00:00:00 init + 2 ? 00:00:00 ksoftirqd/0 + 3 ? 00:00:00 events/0 <--- kernel thread that runs work items + 4 ? 00:00:00 khelper + 5 ? 00:00:00 kthread + 7 ? 00:00:00 kblockd/0 + 8 ? 00:00:00 kacpid + +The above functions use a predefined workqueue (called events), and +they run in the context of the *events/x* thread, as noted +above. Although this is sufficient in most cases, it is a shared +resource and large delays in work item handlers can cause delays for +other queue users. For this reason there are functions for creating +additional queues. + +A workqueue is represented by :c:type:`struct workqueue_struct`. A new +workqueue can be created with these functions: + +.. code-block:: c + + struct workqueue_struct *create_workqueue(const char *name); + struct workqueue_struct *create_singlethread_workqueue(const char *name); + +:c:func:`create_workqueue` uses one thread for each processor in the +system, and :c:func:`create_singlethread_workqueue` uses a single +thread. + +To add a task to the new queue, use :c:func:`queue_work` or +:c:func:`queue_delayed_work`: + +.. code-block:: c + + int queue_work(struct workqueue_struct * queue, struct work_struct *work); + + int queue_delayed_work(struct workqueue_struct *queue, + struct delayed_work * work , unsigned long delay); + +:c:func:`queue_delayed_work` can be used to plan a work item for execution +with a given delay. The time unit for the delay is jiffies. + +To wait for all work items to finish, call :c:func:`flush_workqueue`: + +.. code-block:: c + + void flush_workqueue(struct workqueue_struct * queue); + +And to destroy the workqueue call :c:func:`destroy_workqueue`: + +.. code-block:: c + + void destroy_workqueue(struct workqueue_struct *queue); + +The next sequence declares and initializes an additional workqueue, +declares and initializes a work item and adds it to the queue: + +.. 
code-block:: c + + void my_work_handler(struct work_struct *work); + + struct work_struct my_work; + struct workqueue_struct * my_workqueue; + + my_workqueue = create_singlethread_workqueue("my_workqueue"); + INIT_WORK(&my_work, my_work_handler); + + queue_work(my_workqueue, &my_work); + +And the next code sample shows how to remove the workqueue: + +.. code-block:: c + + flush_workqueue(my_workqueue); + destroy_workqueue(my_workqueue); + +The work items planned with these functions will run in the context of +a new kernel thread called *my_workqueue*, the name passed to +:c:func:`create_singlethread_workqueue`. + +Kernel threads +============== + +Kernel threads have emerged from the need to run kernel code in +process context. Kernel threads are the basis of the workqueue +mechanism. Essentially, a kernel thread is a thread that only runs in +kernel mode and has no user address space or other user attributes. + +To create a kernel thread, use :c:func:`kthread_create`: + +.. code-block:: c + + #include <linux/kthread.h> + + struct task_struct *kthread_create(int (*threadfn)(void *data), + void *data, const char namefmt[], ...); + +* *threadfn* is a function that will be run by the kernel thread +* *data* is a parameter to be sent to the function +* *namefmt* represents the kernel thread name, as it is displayed in + ps/top; it can contain sequences such as %d, %s etc., which will be replaced + according to the standard printf syntax. + +For example, the following call: + +.. code-block:: c + + kthread_create (f, NULL, "%skthread%d", "my", 0); + +Will create a kernel thread with the name mykthread0. + +The kernel thread created with this function will be stopped (in the +*TASK_INTERRUPTIBLE* state). To start the kernel thread, call +:c:func:`wake_up_process`: + +.. code-block:: c + + #include <linux/sched.h> + + int wake_up_process(struct task_struct *p); + +Alternatively, you can use :c:func:`kthread_run` to create and run a +kernel thread: + +.. 
code-block:: c + + struct task_struct * kthread_run(int (*threadfn)(void *data), + void *data, const char namefmt[], ...); + +Even though the programming restrictions for the function running within +the kernel thread are more relaxed and scheduling is closer to +scheduling in userspace, there are still some limitations to be +taken into account. We will list below the actions that can or can not +be performed from a kernel thread: + +* can't access the user address space (even with copy_from_user, + copy_to_user) because a kernel thread does not have a user address + space +* can't implement busy wait code that runs for a long time; if the + kernel is compiled without the preemptive option, that code will run + without being preempted by other kernel threads or user processes + thus hogging the system +* can call blocking operations +* can use spinlocks, but if the hold time of the lock is significant, + it is recommended to use mutexes + +The termination of a kernel thread is done voluntarily, within the +function running in the kernel thread, by calling :c:func:`do_exit`: + +.. code-block:: c + + fastcall NORET_TYPE void do_exit(long code); + +Most of the implementations of kernel thread handlers use the same +model and it is recommended to start using the same model to avoid +common mistakes: + +.. code-block:: c + + #include <linux/kthread.h> + + DECLARE_WAIT_QUEUE_HEAD(wq); + + // list of events to be processed by kernel thread + struct list_head events_list; + spinlock_t events_lock; + + + // structure describing the event to be processed + struct event { + struct list_head lh; + bool stop; + //... 
+ }; + + struct event* get_next_event(void) + { + struct event *e; + + spin_lock(&events_lock); + e = list_first_entry_or_null(&events_list, struct event, lh); + if (e) + list_del(&e->lh); + spin_unlock(&events_lock); + + return e; + } + + int my_thread_f(void *data) + { + struct event *e; + + while (true) { + wait_event(wq, (e = get_next_event())); + + /* Event processing */ + + if (e->stop) + break; + } + + do_exit(0); + } + + /* create and start kthread */ + kthread_run(my_thread_f, NULL, "%skthread%d", "my", 0); + + +With the template above, the kernel thread requests can be issued +with: + +.. code-block:: c + + void send_event(struct event *ev) + { + spin_lock(&events_lock); + list_add(&ev->lh, &events_list); + spin_unlock(&events_lock); + wake_up(&wq); + } + +Further reading +=============== + +* `Linux Device Drivers, 3rd ed., Ch. 7: Time, Delays, and Deferred Work `_ +* `Scheduling Tasks `_ +* `Driver porting: the workqueue interface `_ +* `Workqueues get a rework `_ +* `Kernel threads made easy `_ +* `Unreliable Guide to Locking `_ + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: deferred_work + +0. Intro +-------- + +Using |LXR|_, find the definitions of the following symbols: + +* :c:macro:`jiffies` +* :c:type:`struct timer_list` +* :c:func:`spin_lock_bh` + + +1. Timer +-------- + +We're looking at creating a simple kernel module that displays a +message *TIMER_TIMEOUT* seconds after the module is loaded into the kernel. + +Generate the skeleton for the task named **1-2-timer** and follow the +sections marked with **TODO 1** to complete the task. + +.. hint:: Use `pr_info(...)`. Messages will be displayed on the + console and can also be viewed using dmesg. When scheduling + the timer we need to use the absolute time of the system (in + the future) in number of ticks. The current time of the + system in the number of ticks is given by :c:macro:`jiffies`. 
+ Thus, the absolute time we need to pass to the timer is + ``jiffies + TIMER_TIMEOUT * HZ``. + + For more information review the `Timers`_ section. + + +2. Periodic timer +----------------- + +Modify the previous module to display the message once every +TIMER_TIMEOUT seconds. Follow the section marked with **TODO 2** in the +skeleton. + +3. Timer control using ioctl +---------------------------- + +We plan to display information about the current process N +seconds after receiving an ioctl call from user space. N is transmitted as +the ioctl parameter. + +Generate the skeleton for the task named **3-4-5-deferred** and +follow the sections marked with **TODO 1** in the skeleton driver. + +You will need to implement the following ioctl operations. + +* MY_IOCTL_TIMER_SET to schedule a timer to run after a number of + seconds which is received as an argument to ioctl. The timer does + not run periodically. + * This command directly receives a value, not a pointer. + +* MY_IOCTL_TIMER_CANCEL to deactivate the timer. + +.. note:: Review :ref:`ioctl` for a way to access the ioctl argument. + +.. note:: Review the `Timers`_ section for information on enabling / + disabling a timer. In the timer handler, display the current + process identifier (PID) and the process executable image name. + +.. hint:: You can find the current process identifier and name using the *pid* + and *comm* fields of the current process. For details, + review :ref:`proc-info`. + +.. hint:: To use the device driver from userspace you must create the + device character file */dev/deferred* using the mknod + utility. Alternatively, you can run the + *3-4-5-deferred/kernel/makenode* script that performs this + operation. + +Enable and disable the timer by calling user-space ioctl +operations. Use the *3-4-5-deferred/user/test* program to test +planning and canceling of the timer. The program receives the ioctl +operation type and its parameters (if any) on the command line. + +.. 
hint:: Run the test executable without arguments to observe the + command line options it accepts. + + To enable the timer after 3 seconds use: + + .. code-block:: bash + + ./test s 3 + + To disable the timer use: + + .. code-block:: bash + + ./test c + + +Note that every time the timer runs, the current process is +*swapper/0* with PID 0. This process is the idle process. It is +running when there is nothing else to run. Because the virtual +machine is very light and does not do much, it is natural to see this +process most of the time. + +4. Blocking operations +---------------------- + +Next we want to see what happens when we perform blocking operations +in a timer routine. For this we try to call, in the timer-handling +routine, a function called alloc_io() that simulates a blocking +operation. + +Modify the module so that when you receive the *MY_IOCTL_TIMER_ALLOC* +command the timer handler will call :c:func:`alloc_io`. Follow the +sections marked with **TODO 2** in the skeleton. + +Use the same timer. To differentiate functionality in the timer +handler, use a flag in the device structure. Use the +*TIMER_TYPE_ALLOC* and *TIMER_TYPE_SET* macros defined in the code +skeleton. For initialization, use TIMER_TYPE_NONE. + +Run the test program to verify the functionality of task 3. Run the +test program again to call :c:func:`alloc_io`. + +.. note:: The driver causes an error because a blocking function is + called in atomic context (the timer handler runs in + interrupt context). + +5. Workqueues +------------- + +We will modify the module to prevent the error observed in the +previous task. + +To do so, let's call :c:func:`alloc_io` using workqueues. Schedule a +work item from the timer handler. In the work handler (running in +process context) call :c:func:`alloc_io`. Follow the sections +marked with **TODO 3** in the skeleton and review the `Workqueues`_ +section if needed. + +.. 
hint:: Add a new field with the type :c:type:`struct work_struct` + in your device structure. Initialize this field. Schedule + the work from the timer handler using :c:func:`schedule_work`. + Schedule the timer handler after N seconds from the ioctl. + +6. Kernel thread +---------------- + +Implement a simple module that creates a kernel thread that shows the +current process identifier. + +Generate the skeleton for the task named **6-kthread** and follow the +TODOs from the skeleton. + + +.. note:: There are two options for creating and running a thread: + + * :c:func:`kthread_run` to create and run the thread + + * :c:func:`kthread_create` to create a suspended thread and + then start it running with :c:func:`wake_up_process`. + + Review the `Kernel threads`_ section if needed. + +.. attention:: Synchronize the thread termination with module unloading: + + * The thread should finish when the module is unloaded + + * Wait for the kernel thread to exit before continuing + with unloading + + +.. hint:: For synchronization use two wait queues and two flags. + + Review :ref:`waiting-queues` on how to use waiting queues. + + Use atomic variables for flags. Review :ref:`atomic-variables`. + + +7. Buffer shared between timer and process +------------------------------------------ + +The purpose of this task is to exercise the synchronization between a +deferrable action (a timer) and process context. Set up a periodic +timer that monitors a list of processes. If one of the processes +terminates, a message is printed. Processes can be dynamically added to +the list. Use the *3-4-5-deferred/kernel/* skeleton as a base and +follow the **TODO 4** markings to complete the task. + +When the *MY_IOCTL_TIMER_MON* command is received, check that the given +process exists and if so add it to the monitored list of +processes and then arm the timer after setting its type. + +.. 
hint:: Use :c:func:`get_proc` which checks the pid, finds the + associated :c:type:`struct task_struct` and allocates a + :c:type:`struct mon_proc` item you can add to your + list. Note that the function also increases the reference + counter of the task, so that its memory won't be freed when + the task terminates. + +.. attention:: Use a spinlock to protect the access to the list. Note + that since we share data with the timer handler we need + to disable bottom-half handlers in addition to taking + the lock. Review the `Locking`_ section. + +.. hint:: Collect the information every second from a timer. Use the + existing timer and add new behaviour for it via the + TIMER_TYPE_ACCT. To set the flag, use the *t* argument of + the test program. + + +In the timer handler iterate over the list of monitored processes and +check if they have terminated. If so, print the process name and pid, +then remove the process from the list, decrement the task usage +counter so that its memory can be freed and finally free the +:c:type:`struct mon_proc` structure. + +.. hint:: Use the *state* field of :c:type:`struct task_struct`. A + task has terminated if its state is *TASK_DEAD*. + +.. hint:: Use :c:func:`put_task_struct` to decrement the task usage + counter. + +.. attention:: Make sure you protect the list access with a + spinlock. The simple variant will suffice. + +.. attention:: Make sure to use the safe iteration over the list since + we may need to remove an item from the list. + +Rearm the timer after checking the list. 
diff --git a/refs/pull/405/merge/_sources/labs/device_drivers.rst.txt b/refs/pull/405/merge/_sources/labs/device_drivers.rst.txt new file mode 100644 index 00000000..f73121b5 --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/device_drivers.rst.txt @@ -0,0 +1,1037 @@ +======================== +Character device drivers +======================== + +Laboratory objectives +===================== + + * understand the concepts behind character device driver + * understand the various operations that can be performed on character devices + * working with waiting queues + +Overview +======== + +In UNIX, hardware devices are accessed by the user through special device +files. These files are grouped into the /dev directory, and system calls +``open``, ``read``, ``write``, ``close``, ``lseek``, ``mmap`` etc. are +redirected by the operating system to the device driver associated with the +physical device. The device driver is a kernel component (usually a module) +that interacts with a hardware device. + +In the UNIX world there are two categories of device files and thus +device drivers: character and block. This division is done by the speed, +volume and way of organizing the data to be transferred from the device to the +system and vice versa. In the first category, there are slow devices, which +manage a small amount of data, and access to data does not require frequent +seek queries. Examples are devices such as keyboard, mouse, serial ports, +sound card, joystick. In general, operations with these devices (read, write) +are performed sequentially byte by byte. The second category includes devices +where data volume is large, data is organized on blocks, and search is common. +Examples of devices that fall into this category are hard drives, cdroms, ram +disks, magnetic tape drives. For these devices, reading and writing is done at +the data block level. + +For the two types of device drivers, the Linux kernel offers different APIs. 
+If for character devices system calls go directly to device drivers, in case of +block devices, the drivers do not work directly with system calls. In +the case of block devices, communication between the user-space and the block +device driver is mediated by the file management subsystem and the block device +subsystem. The role of these subsystems is to prepare the device driver's +necessary resources (buffers), to keep the recently read data in the cache +buffer, and to order the read and write operations for performance reasons. + +Majors and minors +================= + +In UNIX, the devices traditionally had a unique, fixed identifier associated +with them. This tradition is preserved in Linux, although identifiers can be +dynamically allocated (for compatibility reasons, most drivers still use static +identifiers). The identifier consists of two parts: major and minor. The first +part identifies the device type (IDE disk, SCSI disk, serial port, etc.) +and the second one identifies the device (first disk, second serial port, +etc.). Most times, the major identifies the driver, while the minor identifies +each physical device served by the driver. In general, a driver will have a +major associate and will be responsible for all minors associated with that +major. + +.. code-block:: bash + + $ ls -la /dev/hda? /dev/ttyS? + brw-rw---- 1 root disk 3, 1 2004-09-18 14:51 /dev/hda1 + brw-rw---- 1 root disk 3, 2 2004-09-18 14:51 /dev/hda2 + crw-rw---- 1 root dialout 4, 64 2004-09-18 14:52 /dev/ttyS0 + crw-rw---- 1 root dialout 4, 65 2004-09-18 14:52 /dev/ttyS1 + +As can be seen from the example above, device-type information can be found +using the ls command. The special character files are identified by the ``c`` +character in the first column of the command output, and the block type by the +character ``b``. In columns ``5`` and ``6`` of the result you can see the +major, respectively the minor for each device. 
+ +Certain major identifiers are statically assigned to devices (in the +``Documentation/admin-guide/devices.txt`` file from the kernel sources). When choosing the +identifier for a new device, you can use two methods: static (choose a number +that does not seem to be used already) or dynamically. In /proc/devices are the +loaded devices, along with the major identifier. + +To create a device type file, use the ``mknod`` command; the command receives the +type (``block`` or ``character``), ``major`` and ``minor`` of the device +(``mknod name type major minor``). Thus, if you want to create a character device +named ``mycdev`` with the major ``42`` and minor ``0``, use the command: + +.. code-block:: bash + + # mknod /dev/mycdev c 42 0 + +To create the block device with the name ``mybdev`` with the major 240 and minor 0 +the command will be: + +.. code-block:: bash + + # mknod /dev/mybdev b 240 0 + +Next, we'll refer to character devices as drivers. + +Data structures for a character device +====================================== + +In the kernel, a character-type device is represented by +:c:type:`struct cdev `, a structure used to register it in the +system. Most driver operations use three important structures: +``struct file_operations``, ``struct file`` and ``struct inode``. + +:c:type:`struct file_operations` +-------------------------------- + +As mentioned above, the character device drivers receive unaltered system calls +made by users over device-type files. Consequently, implementation of a character +device driver means implementing the system calls specific to files: ``open``, +``close``, ``read``, ``write``, ``lseek``, ``mmap``, etc. These operations are +described in the fields of the ``struct file_operations`` structure: + +.. 
code-block:: c + + #include + + struct file_operations { + struct module *owner; + loff_t (*llseek) (struct file *, loff_t, int); + ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); + ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); + [...] + long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); + [...] + int (*open) (struct inode *, struct file *); + int (*flush) (struct file *, fl_owner_t id); + int (*release) (struct inode *, struct file *); + [...] + +.. ** + +It can be noticed that the signature of the function differs from the system +call that the user uses. The operating system sits between the user and +the device driver to simplify implementation in the device driver. + +``open`` does not receive the parameter path or the various parameters that control +the file opening mode. Similarly, ``read``, ``write``, ``release``, ``ioctl``, ``lseek`` +do not receive as a parameter a file descriptor. Instead, these routines receive as +parameters two structures: ``file`` and ``inode``. Both structures represent a file, +but from different perspectives. + +Most parameters for the presented operations have a direct meaning: + * ``file`` and ``inode`` identifies the device type file; + * ``size`` is the number of bytes to be read or written; + * ``offset`` is the displacement to be read or written (to be updated + accordingly); + * ``user_buffer`` user buffer from which it reads / writes; + * ``whence`` is the way to seek (the position where the search operation starts); + * ``cmd`` and ``arg`` are the parameters sent by the users to the ioctl call (IO + control). + +``inode`` and ``file`` structures +--------------------------------- + +An ``inode`` represents a file from the point of view of the file system. Attributes +of an inode are the size, rights, times associated with the file. An inode uniquely +identifies a file in a file system. 
+ +The ``file`` structure is still a file, but closer to the user's point of view. +From the attributes of the file structure we list: the inode, the file name, +the file opening attributes, the file position. All open files at a given time +have associated a ``file`` structure. + +To understand the differences between inode and file, we will use an analogy +from object-oriented programming: if we consider a class inode, then the files +are objects, that is, instances of the inode class. Inode represents the static +image of the file (the inode has no state), while the file represents the +dynamic image of the file (the file has state). + +Returning to device drivers, the two entities have almost always standard ways +of using: the inode is used to determine the major and minor of the device on +which the operation is performed, and the file is used to determine the flags +with which the file was opened, but also to save and access (later) private +data. + +The file structure contains, among many fields: + + * ``f_mode``, which specifies read (``FMODE_READ``) or write + (``FMODE_WRITE``); + * ``f_flags``, which specifies the file opening flags (``O_RDONLY``, + ``O_NONBLOCK``, ``O_SYNC``, ``O_APPEND``, ``O_TRUNC``, etc.); + * ``f_op``, which specifies the operations associated with the file (pointer to + the ``file_operations`` structure ); + * ``private_data``, a pointer that can be used by the programmer to store + device-specific data; The pointer will be initialized to a memory location + assigned by the programmer. + * ``f_pos``, the offset within the file + +The inode structure contains, among much information, an ``i_cdev`` +field, which is a pointer to the structure that defines the character +device (when the inode corresponds to a character device). 
+ +Implementation of operations +============================ + +To implement a device driver, it is recommended that you create a structure +that contains information about the device, information used in the module. In +the case of a driver for a character device, the structure will contain a cdev +structure field to refer to the device. The following example uses the struct +my_device_data: + +.. code-block:: c + + #include + #include + + struct my_device_data { + struct cdev cdev; + /* my data starts here */ + //... + }; + + static int my_open(struct inode *inode, struct file *file) + { + struct my_device_data *my_data; + + my_data = container_of(inode->i_cdev, struct my_device_data, cdev); + + file->private_data = my_data; + //... + } + + static int my_read(struct file *file, char __user *user_buffer, size_t size, loff_t *offset) + { + struct my_device_data *my_data; + + my_data = (struct my_device_data *) file->private_data; + + //... + } + +.. ** + +A structure like ``my_device_data`` will contain the data associated with a device. +The ``cdev`` field (``cdev`` type) is a character-type device and is used to record it +in the system and identify the device. The pointer to the ``cdev`` member can be +found using the ``i_cdev`` field of the ``inode`` structure (using the ``container_of`` +macro). In the private_data field of the file structure, information can be +stored at open which is then available in the ``read``, ``write``, ``release``, etc. +routines. + +Registration and unregistration of character devices +==================================================== + +The registration/unregistration of a device is made by specifying the major and +minor. The ``dev_t`` type is used to keep the identifiers of a device (both major +and minor) and can be obtained using the ``MKDEV`` macro. + +For the static assignment and unallocation of device identifiers, the +``register_chrdev_region`` and ``unregister_chrdev_region`` functions are used: + +.. 
code-block:: c + + #include + + int register_chrdev_region(dev_t first, unsigned int count, char *name); + void unregister_chrdev_region(dev_t first, unsigned int count); + +.. ** + +It is recommended that device identifiers be dynamically assigned to the +``alloc_chrdev_region`` function. + +Below sequence reserves ``my_minor_count`` devices, starting with ``my_major`` +major and ``my_first_minor`` minor (if the max value for minor is exceeded, +move to the next major): + +.. code-block:: c + + #include + ... + + err = register_chrdev_region(MKDEV(my_major, my_first_minor), my_minor_count, + "my_device_driver"); + if (err != 0) { + /* report error */ + return err; + } + ... + +.. ** + +After assigning the identifiers, the character device will have to be +initialized (``cdev_init``) and the kernel will have to be notified(``cdev_add``). The +``cdev_add`` function must be called only after the device is ready to receive +calls. Removing a device is done using the ``cdev_del`` function. + +.. code-block:: c + + #include + + void cdev_init(struct cdev *cdev, struct file_operations *fops); + int cdev_add(struct cdev *dev, dev_t num, unsigned int count); + void cdev_del(struct cdev *dev); + +.. ** + +The following sequence registers and initializes MY_MAX_MINORS devices: + +.. code-block:: c + + #include + #include + + #define MY_MAJOR 42 + #define MY_MAX_MINORS 5 + + struct my_device_data { + struct cdev cdev; + /* my data starts here */ + //... 
+ }; + + struct my_device_data devs[MY_MAX_MINORS]; + + const struct file_operations my_fops = { + .owner = THIS_MODULE, + .open = my_open, + .read = my_read, + .write = my_write, + .release = my_release, + .unlocked_ioctl = my_ioctl + }; + + int init_module(void) + { + int i, err; + + err = register_chrdev_region(MKDEV(MY_MAJOR, 0), MY_MAX_MINORS, + "my_device_driver"); + if (err != 0) { + /* report error */ + return err; + } + + for(i = 0; i < MY_MAX_MINORS; i++) { + /* initialize devs[i] fields */ + cdev_init(&devs[i].cdev, &my_fops); + cdev_add(&devs[i].cdev, MKDEV(MY_MAJOR, i), 1); + } + + return 0; + } + +.. ** + +While the following sequence deletes and unregisters them: + +.. code-block:: c + + void cleanup_module(void) + { + int i; + + for(i = 0; i < MY_MAX_MINORS; i++) { + /* release devs[i] fields */ + cdev_del(&devs[i].cdev); + } + unregister_chrdev_region(MKDEV(MY_MAJOR, 0), MY_MAX_MINORS); + } + +.. ** + +.. note:: Initialization of the struct my_fops used the initialization + of members by name, defined in C99 standard (see designated + initializers and the file_operations structure). Structure + members who do not explicitly appear in this initialization + will be set to the default value for their type. For + example, after the initialization above, ``my_fops.mmap`` will + be NULL. + +.. _access_to_process_address_space: + +Access to the address space of the process +========================================== + +A driver for a device is the interface between an application and hardware. As +a result, we often have to access user-space data. Accessing it can not be done +directly (by dereferencing a user-space pointer). Direct access of a +user-space pointer can lead to incorrect behavior (depending on architecture, a +user-space pointer may not be valid or mapped to kernel-space), a kernel oops +(the user-mode pointer can refer to a non-resident memory area) or security +issues. 
Proper access to user-space data is done by calling the macros / +functions below: + +.. code-block:: c + + #include + + put_user(type val, type *address); + get_user(type val, type *address); + unsigned long copy_to_user(void __user *to, const void *from, unsigned long n); + unsigned long copy_from_user(void *to, const void __user *from, unsigned long n); + +.. ** + +All macros / functions return 0 in case of success and another value in case of +error and have the following roles: + + * ``put_user`` stores the value ``val`` to the user-space address ``address``; + the type can be one of 8, 16, 32, 64 bit (the maximum supported type depends on the + hardware platform); + * ``get_user`` is analogous to the previous function, except that val will be set to a + value identical to the value at the user-space address given by address; + * ``copy_to_user`` copies ``n`` bytes from kernel-space, from the address + referenced by ``from``, to user-space, to the address referenced by ``to``; + * ``copy_from_user`` copies ``n`` bytes from user-space, from the address + referenced by ``from``, to kernel-space, to the address referenced by ``to``. + +A common section of code that works with these functions is: + +.. code-block:: c + + #include + + /* + * Copy at most size bytes to user space. + * Return ''0'' on success and some other value on error. + */ + if (copy_to_user(user_buffer, kernel_buffer, size)) + return -EFAULT; + else + return 0; + +Open and release +================ + +The ``open`` function performs the initialization of a device. In most cases, +these operations refer to initializing the device and filling in specific data +(if it is the first open call). The release function is about releasing +device-specific resources: unlocking specific data and closing the device if +this is the last close call. + +In most cases, the open function will have the following structure: + +..
code-block:: c + + static int my_open(struct inode *inode, struct file *file) + { + struct my_device_data *my_data = + container_of(inode->i_cdev, struct my_device_data, cdev); + + /* validate access to device */ + file->private_data = my_data; + + /* initialize device */ + ... + + return 0; + } + +.. ** + +A problem that occurs when implementing the ``open`` function is access control. +Sometimes a device needs to be opened once at a time; More specifically, do not +allow the second open before the release. To implement this restriction, you +choose a way to handle an open call for an already open device: it can return +an error (``-EBUSY``), block open calls until a release operation, or shut down +the device before do the open. + +At the user-space call of the open and close functions on the device, call +my_open and my_release in the driver. An example of a user-space call: + +.. code-block:: c + + int fd = open("/dev/my_device", O_RDONLY); + if (fd < 0) { + /* handle error */ + } + + /* do work */ + //.. + + close(fd); + +.. ** + +Read and write +============== + +The read and write operations are reaching the device driver as a +result of an user-space program calling the read or write system calls: + +.. code-block:: c + + if (read(fd, buffer, size) < 0) { + /* handle error */ + } + + if (write(fd, buffer, size) < 0) { + /* handle error */ + } + +.. ** + +The ``read`` and ``write`` functions transfer data between the device and the +user-space: the read function reads the data from the device and transfers it +to the user-space, while writing reads the user-space data and writes it to the +device. The buffer received as a parameter is a user-space pointer, which is +why it is necessary to use the ``copy_to_user`` or ``copy_from_user`` functions. 
+ +The value returned by read or write can be: + + * the number of bytes transferred; if the returned value is less than the size + parameter (the number of bytes requested), then it means that a partial + transfer was made. Most of the time, the user-space app calls the system call + (read or write) function until the required data number is transferred. + * 0 to mark the end of the file in the case of read ; if write returns the + value 0 then it means that no byte has been written and that no error has + occurred; In this case, the user-space application retries the write call. + * a negative value indicating an error code. + +To perform a data transfer consisting of several partial transfers, the +following operations should be performed: + + * transfer the maximum number of possible bytes between the buffer received + as a parameter and the device (writing to the device/reading from the device + will be done from the offset received as a parameter); + * update the offset received as a parameter to the position from which the + next read / write data will begin; + * return the number of bytes transferred. + +The sequence below shows an example for the read function that takes +into account the internal buffer size, user buffer size and the offset: + +.. code-block:: c + + static int my_read(struct file *file, char __user *user_buffer, + size_t size, loff_t *offset) + { + struct my_device_data *my_data = (struct my_device_data *) file->private_data; + ssize_t len = min(my_data->size - *offset, size); + + if (len <= 0) + return 0; + + /* read data from my_data->buffer to user buffer */ + if (copy_to_user(user_buffer, my_data->buffer + *offset, len)) + return -EFAULT; + + *offset += len; + return len; + } + +.. ** + +The images below illustrate the read operation and how data is +transferred between the user-space and the driver: + + 1. 
when the driver has enough data available (starting with the OFFSET + position) to accurately transfer the required size (SIZE) to the user. + 2. when a smaller amount is transferred than required. + +.. image:: ../res/read.png + :width: 49 % +.. image:: ../res/read2.png + :width: 49 % + +We can look at the read operation implemented by the driver as a response to a +user-space read request. In this case, the driver is responsible for advancing +the offset according to how much it reads and returning the read size (which +may be less than what is required). + +The structure of the write function is similar: + +.. code-block:: c + + static int my_write(struct file *file, const char __user *user_buffer, + size_t size, loff_t * offset) + { + struct my_device_data *my_data = (struct my_device_data *) file->private_data; + ssize_t len = min(my_data->size - *offset, size); + + if (len <= 0) + return 0; + + /* read data from user buffer to my_data->buffer */ + if (copy_from_user(my_data->buffer + *offset, user_buffer, len)) + return -EFAULT; + + *offset += len; + return len; + } + +.. ** + +The write operation will respond to a write request from user-space. In +this case, depending on the maximum driver capacity (MAXSIZ), it can +write more or less than the required size. + +.. image:: ../res/write.png + :width: 49 % +.. image:: ../res/write2.png + :width: 49 % + +.. _ioctl: + +ioctl +===== + +In addition to read and write operations, a driver needs the ability to perform +certain physical device control tasks. These operations are accomplished by +implementing a ``ioctl`` function. Initially, the ioctl system call used Big Kernel +Lock. That's why the call was gradually replaced with its unlocked version +called ``unlocked_ioctl``. You can read more on LWN: +http://lwn.net/Articles/119652/ + +.. code-block:: c + + static long my_ioctl (struct file *file, unsigned int cmd, unsigned long arg); + +.. ** + +``cmd`` is the command sent from user-space. 
If a value is being sent from the +user-space call, it can be accessed directly. If a buffer is fetched, the arg +value will be a pointer to it, and must be accessed through the ``copy_to_user`` +or ``copy_from_user``. + +Before implementing the ``ioctl`` function, the numbers corresponding to the +commands must be chosen. One method is to choose consecutive numbers starting +at 0, but it is recommended to use ``_IOC(dir, type, nr, size)`` macro definition +to generate ioctl codes. The macro definition parameters are as follows: + + * ``dir`` represents the data transfer (``_IOC_NONE`` , ``_IOC_READ``, + ``_IOC_WRITE``). + * ``type`` represents the magic number (``Documentation/ioctl/ioctl-number.txt``); + * ``nr`` is the ioctl code for the device; + * ``size`` is the size of the transferred data. + +The following example shows an implementation for a ``ioctl`` function: + +.. code-block:: c + + #include + + #define MY_IOCTL_IN _IOC(_IOC_WRITE, 'k', 1, sizeof(my_ioctl_data)) + + static long my_ioctl (struct file *file, unsigned int cmd, unsigned long arg) + { + struct my_device_data *my_data = + (struct my_device_data*) file->private_data; + my_ioctl_data mid; + + switch(cmd) { + case MY_IOCTL_IN: + if( copy_from_user(&mid, (my_ioctl_data *) arg, + sizeof(my_ioctl_data)) ) + return -EFAULT; + + /* process data and execute command */ + + break; + default: + return -ENOTTY; + } + + return 0; + } + +.. ** + +At the user-space call for the ioctl function, the my_ioctl function of the +driver will be called. An example of such a user-space call: + +.. code-block:: c + + if (ioctl(fd, MY_IOCTL_IN, buffer) < 0) { + /* handle error */ + } + +.. ** + +Waiting queues +============== + +It is often necessary for a thread to wait for an operation to finish, +but it is desirable that this wait is not busy-waiting. Using waiting +queues we can block a thread until an event occurs. 
When the condition +is satisfied, elsewhere in the kernel, in another process, in an +interrupt or deferrable work, we will wake up the process. + +A waiting queue is a list of processes that are waiting for a specific +event. A queue is defined with the ``wait_queue_head_t`` type and can +be used by the functions/macros: + +.. code-block:: c + + #include + + DECLARE_WAIT_QUEUE_HEAD(wq_name); + + void init_waitqueue_head(wait_queue_head_t *q); + + int wait_event(wait_queue_head_t q, int condition); + + int wait_event_interruptible(wait_queue_head_t q, int condition); + + int wait_event_timeout(wait_queue_head_t q, int condition, int timeout); + + int wait_event_interruptible_timeout(wait_queue_head_t q, int condition, int timeout); + + void wake_up(wait_queue_head_t *q); + + void wake_up_interruptible(wait_queue_head_t *q); + +.. ** + +The roles of the macros / functions above are: + + * :c:func:`init_waitqueue_head` initializes the queue; to initialize the + queue at compile time, you can use the :c:macro:`DECLARE_WAIT_QUEUE_HEAD` macro; + * :c:func:`wait_event` and :c:func:`wait_event_interruptible` adds the current thread to the + queue while the condition is false, sets it to TASK_UNINTERRUPTIBLE or + TASK_INTERRUPTIBLE and calls the scheduler to schedule a new thread; Waiting + will be interrupted when another thread will call the wake_up function; + * :c:func:`wait_event_timeout` and :c:func:`wait_event_interruptible_timeout` have the same + effect as the above functions, only waiting can be interrupted at the end of + the timeout received as a parameter; + * :c:func:`wake_up` puts all threads off from state TASK_INTERRUPTIBLE and + TASK_UNINTERRUPTIBLE in TASK_RUNNING status; Remove these threads from the + queue; + * :c:func:`wake_up_interruptible` same action, but only threads with TASK_INTERRUPTIBLE + status are woken up. + +A simple example is that of a thread waiting to change the value of a flag. The +initializations are done by the sequence: + +.. 
code-block:: c + + #include + + wait_queue_head_t wq; + int flag = 0; + + init_waitqueue_head(&wq); + +.. ** + +A thread will wait for the flag to be changed to a value other than zero: + +.. code-block:: c + + wait_event_interruptible(wq, flag != 0); + +.. ** + +While another thread will change the flag value and wake up the waiting threads: + +.. code-block:: c + + flag = 1 ; + wake_up_interruptible (&wq); + +.. ** + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: device_drivers + +0. Intro +-------- + +Using `LXR `_ find the definitions +of the following symbols in the Linux kernel: + + * :c:type:`struct file` + * :c:type:`struct file_operations` + * :c:type:`generic_ro_fops` + * :c:func:`vfs_read` + + +1. Register/unregister +---------------------- + +The driver will control a single device with the ``MY_MAJOR`` major and +``MY_MINOR`` minor (the macros defined in the kernel/so2_cdev.c file). + + 1. Create **/dev/so2_cdev** character device node using **mknod**. + + .. hint:: Read `Majors and minors`_ section in the lab. + + 2. Implement the registration and deregistration of the device with the name + ``so2_cdev``, respectively in the init and exit module functions. Implement **TODO 1**. + + .. hint:: Read the section `Registration and unregistration of character devices`_ + + 3. Display, using ``pr_info``, a message after the registration and unregistration + operations to confirm that they were successful. Then load the module into the kernel: + + .. code-block:: bash + + $ insmod so2_cdev.ko + + And see character devices in ``/proc/devices``: + + .. code-block:: bash + + $ cat /proc/devices | less + + Identify the device type registered with major 42 . Note that ``/proc/devices`` + contains only the device types (major) but not the actual devices (i.e. minors). + + .. note:: Entries in /dev are not created by loading the module. 
These can be created + in two ways: + + * manually, using the ``mknod`` command as we did above. + * automatically using udev daemon + + 4. Unload the kernel module + + .. code-block:: bash + + rmmod so2_cdev + +2. Register an already registered major +--------------------------------------- + +Modify **MY_MAJOR** so that it points to an already used major number. + +.. hint:: See ``/proc/devices`` to get an already assigned major. + +See `errno-base.h `_ +and figure out what does the error code mean. +Return to the initial configuration of the module. + +3. Open and close +----------------- + +Run ``cat /dev/so2_cdev`` to read data from our char device. +Reading does not work because the driver does not have the open function implemented. +Follow comments marked with TODO 2 and implement them. + + 1. Initialize your device + + * add a cdev struct field to ``so2_device_data`` structure. + * Read the section `Registration and unregistration of character devices`_ in the lab. + + 2. Implement the open and release functions in the driver. + 3. Display a message in the open and release functions. + 4. Read again ``/dev/so2_cdev`` file. Follow the messages displayed by the kernel. + We still get an error because ``read`` function is not yet implemented. + +.. note:: The prototype of a device driver's operations is in the ``file_operations`` + structure. Read `Open and release`_ section. + +4. Access restriction +--------------------- + +Restrict access to the device with atomic variables, so that a single process +can open the device at a time. The rest will receive the "device busy" error +(``-EBUSY``). Restricting access will be done in the open function displayed by +the driver. Follow comments marked with **TODO 3** and implement them. + + 1. Add an ``atomic_t`` variable to the device structure. + 2. Initialize the variable at module initialization. + 3. Use the variable in the open function to restrict access to the device. 
We + recommend using :c:func:`atomic_cmpxchg`. + 4. Reset the variable in the release function to retrieve access to the device. + 5. To test your deployment, you'll need to simulate a long-term use of your + device. To simulate a sleep, call the scheduler at the end of the device opening: + +.. code-block:: bash + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1000); + +.. ** + + 6. Test using ``cat /dev/so2_cdev`` & ``cat /dev/so2_cdev``. + + +.. note:: The advantage of the atomic_cmpxchg function is that it can check the + old value of the variable and set it up to a new value, all in one + atomic operation. Read more details about `atomic_cmpxchg `_ + An example of use is `here `_. + +5. Read operation +----------------- + +Implement the read function in the driver. Follow comments marked with ``TODO 4`` and implement them. + + 1. Keep a buffer in ``so2_device_data`` structure initialized with the value of ``MESSAGE`` macro. + Initializing this buffer will be done in module ``init`` function. + 2. At a read call, copy the contents of the kernel space buffer into the user + space buffer. + + * Use the :c:func:`copy_to_user` function to copy information from kernel space to + user space. + * Ignore the size and offset parameters at this time. You can assume that + the buffer in user space is large enough. You do not need to check the + validity of the size argument of the read function. + * The value returned by the read call is the number of bytes transmitted + from the kernel space buffer to the user space buffer. + + 3. After implementation, test using ``cat /dev/so2_cdev``. + +.. note:: The command ``cat /dev/so2_cdev`` does not end (use Ctrl+C). + Read the `read and write`_ sections and `Access to the address space of the process`_ + If you want to display the offset value use a construction of the form: + ``pr_info("Offset: %lld \n", *offset)``; The data type loff_t (used by offset ) is a typedef for long long int. 
+ +The ``cat`` command reads to the end of the file, and the end of the file is +signaled by returning the value 0 in the read. Thus, for a correct implementation, +you will need to update and use the offset received as a parameter in the read +function and return the value 0 when the user has reached the end of the buffer. + +Modify the driver so that the ``cat`` command ends: + + 1. Use the size parameter. + 2. For every read, update the offset parameter accordingly. + 3. Ensure that the read function returns the number of bytes that were copied + into the user buffer. + +.. note:: By dereferencing the offset parameter it is possible to read and move the current + position in the file. Its value needs to be updated every time a read is done + successfully. + +6. Write operation +------------------ + +Add the ability to write a message into the kernel buffer to replace the predefined message. Implement +the write function in the driver. Follow comments marked with ``TODO 5`` + +Ignore the offset parameter at this time. You can assume that the driver buffer is +large enough. You do not need to check the validity of the write function size +argument. + +.. note:: The prototype of a device driver's operations is in the file_operations + structure. + Test using commands: + + .. code-block:: bash + + echo "arpeggio"> /dev/so2_cdev + cat /dev/so2_cdev + + Read the `read and write`_ sections and `Access to the address space of the process`_ + +7. ioctl operation +------------------ + +For this exercise, we want to add the ioctl ``MY_IOCTL_PRINT`` to display the +message from the ``IOCTL_MESSAGE`` macro in the driver. +Follow the comments marked with ``TODO 6`` + +For this: + + 1. Implement the ioctl function in the driver. + 2. We need to use ``user/so2_cdev_test.c`` to call the + ioctl function with the appropriate parameters. + 3. To test, we will use a user-space program (``user/so2_cdev_test.c``) + which will call the ``ioctl`` function with the required arguments.
+ +.. note:: The macro ``MY_IOCTL_PRINT`` is defined in the file ``include/so2_cdev.h``, + which is shared between the kernel module and the user-space program. + + Read the `ioctl`_ section in the lab. + +.. note:: The user-space code is compiled automatically at ``make build`` and + copied at ``make copy``. + + Because we need to compile the program for the qemu machine, which is 32-bit, + you need to install the ``gcc-multilib`` package if your host is 64-bit. + +Extra Exercises +=============== + +Ioctl with messaging +-------------------- + +Add two ioctl operations to modify the message associated with the +driver. Use a fixed-length buffer (BUFFER_SIZE). + + 1. Add the following operations to the driver's ``ioctl`` function: + + * ``MY_IOCTL_SET_BUFFER`` for writing a message to the device; + * ``MY_IOCTL_GET_BUFFER`` to read a message from your device. + + 2. For testing, pass the required command line arguments to the + user-space program. + +.. note:: Read the `ioctl`_ and `Access to the address space of the process`_ + sections of the lab. + +Ioctl with waiting queues +------------------------- + +Add two ioctl operations to the device driver for queuing. + + 1. Add the following operations to the driver's ``ioctl`` function: + + * ``MY_IOCTL_DOWN`` to add the process to a queue; + * ``MY_IOCTL_UP`` to remove the process from a queue. + + 2. Fill the device structure with a ``wait_queue_head_t`` field and a flag. + 3. Do not forget to initialize the wait queue and flag. + 4. Remove the exclusive access condition from the previous exercise. + 5. For testing, pass the required command line arguments to the + user-space program. + +When the process is added to the queue, it will remain blocked in execution. To +run the queue command, open a new console in the virtual machine with Alt+F2; +you can return to the previous console with Alt+F1. If you're connected via +SSH to the virtual machine, open a new console. + +..
note:: Read the `ioctl`_ and `Waiting queues`_ sections in the lab. + +O_NONBLOCK implementation +------------------------- + +.. note:: If a file is opened with the ``O_NONBLOCK`` flag, then its + operations will be non-blocking. + + In case data is not available when performing a read, the following + happens: + + * if the file has been opened with ``O_NONBLOCK``, the read call + will return ``-EWOULDBLOCK``. + * otherwise, the current task (process) will be placed in a waiting + queue and will be unblocked as soon as data becomes available + (in our case, at write). + +* To allow unblocking the read operation, remove the exclusive access + condition from previous exercises. +* You can use the queue defined for the previous exercise. +* You can ignore the file offset. +* Modify the initial size of data to ``0``, to allow testing. +* For testing, pass the required command line arguments to the + user-space program. + + * when using the ``n`` option, the test program will change the open flags + to ``O_NONBLOCK`` and then perform a ``read``. + +* What are the flags used to open the file when running ``cat /dev/so2_cdev``? + diff --git a/refs/pull/405/merge/_sources/labs/device_model.rst.txt b/refs/pull/405/merge/_sources/labs/device_model.rst.txt new file mode 100644 index 00000000..032990d6 --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/device_model.rst.txt @@ -0,0 +1,1286 @@ +================== +Linux Device Model +================== + +Overview +======== + +Plug and Play is a technology that offers support for automatically adding +devices to and removing them from the system. This reduces conflicts with the resources they +use by automatically configuring them at system startup. In order to achieve +these goals, the following features are required: + + * Automatic detection of adding and removing devices in the system (the device + and its bus must notify the appropriate driver that a configuration change + occurred).
+ * Resource management (addresses, irq lines, DMA channels, memory areas), + including resource allocation to devices and solving conflicts that may arise. + * Devices must allow for software configuration (device resources - ports, + interrupts, DMA resources - must allow for driver assignment). + * The drivers required for new devices must be loaded automatically by the + operating system when needed. + * When the device and its bus allow, the system should be able to add or + remove the device from the system while it is running, without having to reboot + the system (hotplug). + +For a system to support plug and play, the BIOS, operating system and the device +must support this technology. The device must have an ID that it will provide to the +driver for identification, and the operating system must be able to identify +these configuration changes as they appear. + +Examples of plug and play devices are: PCI devices (network cards), USB devices (keyboard, mouse, +printer), etc. + +Prior to version 2.6, the kernel did not have a unified model to get +information about devices. +For this reason, a model for Linux devices, the Linux Device Model, was developed. + +The primary purpose of this model is to maintain internal data structures that +reflect the state and structure of the system. Such information includes what +devices are in the system, what their power management state is, what bus +they are attached to, what drivers they have, along with the structure of the +buses, devices, drivers in the system. + +To maintain this information, the kernel uses the following entities: + + * device - a physical device that is attached to a bus + * driver - a software entity that can be associated with a device and performs + operations with it + * bus - a device to which other devices can be attached + * class - a type of device that has a similar behavior; there is a class for + disks, partitions, serial ports, etc.
+ * subsystem - a view of the structure of the system; kernel subsystems + include devices (hierarchical view of all devices in the system), buses (bus + view of devices according to how they are attached to buses), classes, etc. + +sysfs +===== + +The kernel provides a representation of its model in userspace through the +sysfs virtual file system. It is usually mounted in the /sys directory and +contains the following subdirectories: + + * block - all block devices available in the system (disks, partitions) + * bus - types of bus to which physical devices are connected (pci, ide, usb) + * class - driver classes that are available in the system (net, sound, usb) + * devices - the hierarchical structure of devices connected to the system + * firmware - information from system firmware (ACPI) + * fs - information about mounted file systems + * kernel - kernel status information (logged-in users, hotplug) + * module - the list of modules currently loaded + * power - information related to the power management subsystem + +As you can see, there is a correlation between the kernel data structures +within the described model and the subdirectories in the sysfs virtual file +system. Although this likeness may lead to confusion between the two concepts, +they are different. The kernel device model can work without the sysfs file +system, but the reverse is not true. + +The sysfs information is found in files that contain an attribute. Some +standard attributes (represented by files or directories with the same name) +are as follows: + + * dev - Major and minor device identifier.
It can be used to automatically + create entries in the /dev directory + * device - a symbolic link to the directory containing devices; It can be + used to discover the hardware devices that provide a particular service (for + example, the ethi PCI card) + * driver - a symbolic link to the driver directory (located in + /sys/bus/\*/drivers ) + +Other attributes are available, depending on the bus and driver used. + +.. ditaa:: + +------+ + | /sys | + +--+---+ + | + +----------------------------------------------------+-------------------------------------+-----------------------------------------+ + | | | | + v v v v + +-----+ +-------+ +---------+ +--------+ + | bus | | class | | devices | | module | + +--+--+ +---+---+ +----+----+ +---+----+ + | | | | + | | | +-------------+-----------------+ + | | | | | + v v v v v + +------------------------+ +-----------------------+ +-------------------------+ +----------------------+ +-------------------------+ + | mybus: struct bus_type | | myclass: struct class | | mybus0: struct device | | mybus: struct module | | mydriver: struct module | + +-------------+----------+ +----------+------------+ +-----------+-------------+ +----------------------+ +-------------------------+ + | | | + +--------+--------------+ v v + | | +-------------------------------+ +----------------------+ + v v | myclass0: struct class_device | | mydev: struct device | + +---------+ +---------+ +-------------------------------+ +----------------------+ + | devices | | drivers | + +---------+ +---+-----+ + | + v + +--------------------------------+ + | mydriver: struct device_driver | + +--------------------------------+ + + +Basic Structures in Linux Devices +================================= + +Linux Device Model provides a number of structures to ensure the interaction +between a hardware device and a device driver. The whole model is based on +kobject structure. 
Hierarchies are built using this structure and the following +structures are implemented: + + * struct bus_type + * struct device + * struct device_driver + + +.. ditaa:: + :--no-separation: + + +--+ +--+ +--+ + mydriver.c | | mybus.c | | bus/driver/device core | | kobject core + | | | | | | + | | | | | | + | | | | | | + | | +-----------------------------+ | | +-----------------------------+ | | + | | | my_bus_type +------=>+ struct bus_type | | | + | | +-----------------------------+ | | +-----------------------------+ | | + | | |name | | | |name | | | + | | |uevent() = my_uevent() | | | |uevent() | | | + | | |match() = my_match() | | | |match() | | | + | | +-----------------------------+ | | +-----------------------------+ | | + | | | | | | | | + | | | | +-----------------------------+ | | + | | | | | | + +----------------+ | | +-----------------------------+ | | +-----------------------------+ | | +-------------------+ + | mydriver +------=>+ struct my_driver +------->+ struct device_driver +-------+---->| struct kobject | + +----------------+ | | +-----------------------------+ | | +-----------------------------+ | | | +-------------------+ + | | | | | | | | | name | | | | | k_name | + +----------------+ | | +-----------------------------+ | | +-----------------------------+ | | | +-------------------+ + | | | my_register_driver() | | | | driver_register() | | | | | kobject_add() | + | | | my_unregister_driver() | | | | driver_unregister() | | | | | kobject_delete() | + | | +-----------------------------+ | | +-----------------------------+ | | | +-------------------+ + | | | | | | | + | | | | | | | + +----------------+ | | +-----------------------------+ | | +-----------------------------+ | | | + | mydevice +------=>+ struct my_device +------->+ struct device +-------+ + +----------------+ | | +-----------------------------+ | | +-----------------------------+ | | + | | | | | | | | | bus_id | | | + +----------------+ | | +-----------------------------+ | | 
+-----------------------------+ | | + | | | my_register_device() | | | | device_register() | | | + | | | my_unregister_device() | | | | device_unregister() | | | + | | +-----------------------------+ | | +-----------------------------+ | | + | | | | | | + +--+ +--+ +--+ + + +The kobject structure +--------------------- + +A kobject structure does not perform a single function. This structure is +usually integrated into a larger one. A kobject structure actually +incorporates a set of features that will be offered to a higher abstraction +object in the Linux Device Model hierarchy. + +For example, the cdev structure has the following definition: + +.. code-block:: c + + struct cdev { + struct kobject kob; + struct module *owner; + const struct file_operations *ops; + struct list_head list; + dev_t dev; + unsigned int count; + }; + + +Note that this structure includes a ``kobject`` structure field. + +A kobject structure is defined as follows: + +.. code-block:: c + + struct kobject { + const char *name; + struct list_head entry; + struct kobject *parent; + struct kset *kset; + struct kobj_type *ktype; + struct sysfs_dirent *sd; + struct kref kref; + unsigned int state_initialized:1; + unsigned int state_in_sysfs:1; + unsigned int state_add_uevent_sent:1; + unsigned int state_remove_uevent_sent:1; + unsigned int uevent_suppress:1; + }; + +As we can see, the kobject structures are in a hierarchy: an object has a +parent and holds a kset member, which contains objects on the same level. + +Working with the structure involves initializing it with the +:c:func:`kobject_init` function. +Also in the initialization process it is necessary to set the name of the +``kobject`` structure, which will appear in sysfs, using the +:c:func:`kobject_set_name` function. + +Any operation on a kobject is done by incrementing its internal counter using +:c:func:`kobject_get`, or decrementing if it is no longer used using +:c:func:`kobject_put`. 
+Thus, a kobject object will only be released when its internal counter reaches 0. +A method of notifying this is needed so that the resources associated with the +device structure which included the kobject structure are released +(for example, cdev). +The method is called ``release`` and is associated with the object via the ktype +field (:c:type:`struct kobj_type`). + +The kobject structure is the basic structure of the Linux Device Model. +The structures in the higher levels of the model are :c:type:`struct bus_type`, +:c:type:`struct device` and :c:type:`struct device_driver`. + +Buses +----- + +A bus is a communication channel between the processor and an input/output +device. To ensure that the model is generic, all input/output devices are +connected to the processor via such a bus (even if it can be a virtual one +without a physical hardware correspondent). + +When adding a system bus, it will appear in the sysfs file system in +``/sys/bus``. +As with kobjects, buses can be organized into hierarchies and will be represented +in sysfs. + +In the Linux Device Model, a bus is represented by the structure +:c:type:`struct bus_type`: + +.. code-block:: c + + struct bus_type { + const char *name; + const char *dev_name; + struct device *dev_root; + struct bus_attribute *bus_attrs; + struct device_attribute *dev_attrs; + struct driver_attribute *drv_attrs; + struct subsys_private *p; + + int (*match)(struct device *dev, struct device_driver *drv); + int (*uevent)(struct device *dev, struct kobj_uevent_env *env); + int (*probe)(struct device *dev); + int (*remove)(struct device *dev); + //... + }; + +It can be noticed that a bus has a name, lists of default attributes, a number +of specific functions, and the driver's private data. +The ``uevent`` function (formerly ``hotplug``) is used with hotplug devices. 
+ +Bus operations are the registration, the implementation of the operations +described in the :c:type:`struct bus_type` structure and the iteration and +inspection of the devices connected to the bus. + +A bus is registered using :c:func:`bus_register`, and unregistered using +:c:func:`bus_unregister`. + +Implementation example: + +.. code-block:: c + + #include + /* mybus.c */ + + //bus type + struct bus_type my_bus_type = { + .name = "mybus", + .match = my_match, + .uevent = my_uevent, + }; + + static int __init my_bus_init(void) + { + int err; + + //... + err = bus_register(&my_bus_type); + if (err) + return err; + //... + } + + static void __exit my_bus_exit(void) + { + //... + bus_unregister(&my_bus_type); + //... + } + + +The functions that will normally be initialized within a bus_type structure are +``match`` and ``uevent``: + +.. code-block:: c + + #include + #include + /* mybus.c */ + + // match devices to drivers; just do a simple name test + static int my_match(struct device *dev, struct device_driver *driver) + { + return !strncmp(dev_name(dev), driver->name, strlen(driver->name)); + } + + // respond to hotplug user events; add environment variable DEV_NAME + static int my_uevent(struct device *dev, struct kobj_uevent_env *env) + { + add_uevent_var(env, "DEV_NAME=%s", dev_name(dev)); + return 0; + } + +The ``match`` function is used when a new device or a new driver is added to the +bus. Its role is to make a comparison between the device ID and the driver ID. +The ``uevent`` function is called before generating a hotplug in user-space and +has the role of adding environment variables. + +Other possible operations on a bus are iterating over the drivers or devices +attached to it. +Although we can not directly access them (lists of drivers and devices +being stored in the private data of the driver, the ``subsys_private *p`` field), +these can be iterated using the :c:macro:`bus_for_each_dev` and +:c:macro:`bus_for_each_drv` macros. 
+ +The Linux Device Model interface allows you to create attributes for the +associated objects. These attributes will have a corresponding file in the +bus subdirectory in sysfs. The attributes associated with a bus are +described by the bus_attribute structure : + +.. code-block:: c + + struct bus_attribute { + struct attribute attr; + ssize_t (*show)(struct bus_type *, char *buf); + ssize_t (*store)(struct bus_type *, const char *buf, size_t count); + }; + +Typically, an attribute is defined by the :c:macro:`BUS_ATTR` macro. +The :c:func:`bus_create_file` and :c:func:`bus_remove_file` functions can be +used to add/delete an attribute within the bus structure. + +An example of defining an attribute for ``my_bus`` is shown below: + +.. code-block:: c + + /* mybus.c */ + + #define MY_BUS_DESCR "SO2 rules forever" + + // export a simple bus attribute + static ssize_t my_show_bus_descr(struct bus_type *bus, char *buf) + { + return snprintf(buf, PAGE_SIZE, "%s\n", MY_BUS_DESCR); + } + + /* + * define attribute - attribute name is descr; + * full name is bus_attr_descr; + * sysfs entry should be /sys/bus/mybus/descr + */ + BUS_ATTR(descr, 0444, my_show_bus_descr, NULL); + + // specify attribute - in module init function + static int __init my_bus_init(void) + { + int err; + //... + err = bus_create_file(&my_bus_type, &bus_attr_descr); + if (err) { + /* handle error */ + } + //... + } + + static void __exit my_bus_exit(void) + { + //... + bus_remove_file(&my_bus_type, &bus_attr_descr); + //... + } + +The bus is represented by both a ``bus_type`` object and a ``device`` object, +as we will see later (the bus is also a device). + + +Devices +------- + +Any device in the system has a :c:type:`struct device` structure associated +with it. +Devices are discovered by different kernel methods (hotplug, device drivers, +system initialization) and are registered in the system. Each device present in +the kernel has an entry in ``/sys/devices``. 
+ +At the lowest level, a device in Linux Device Model is represented by a +:c:type:`struct device` structure: + +.. code-block:: c + + struct device { + //... + struct device *parent; + struct device_private *p; + struct kobject kobj; + + const char *init_name; /* initial name of the device */ + //... + struct bus_type *bus; /* type of bus device is on */ + struct device_driver *driver; /* which driver has allocated this + device */ + //... + void (*release)(struct device *dev); + }; + +Structure fields include the parent device that is usually a controller, the +associated ``kobject``, the bus it is connected to, the device driver, and a +function called when the device counter reaches 0 (``release``). + +As usual, we have the registration/unregistration functions +:c:func:`device_register` and :c:func:`device_unregister`. + +To work with attributes, we have structure :c:type:`struct device_attribute`, +the macro :c:macro:`DEVICE_ATTR` for definition, and the functions +:c:func:`device_create_file` and :c:func:`device_remove_file` for adding/removing +the attribute to/from the device. + +One important thing to note is that the :c:type:`struct device` structure is +usually not used directly, but it is added to another structure. For example: + +.. code-block:: c + + // my device type + struct my_device { + char *name; + struct my_driver *driver; + struct device dev; + }; + +Typically, a bus driver will export functions to add or remove such a +device, as shown below: + +.. 
code-block:: c + + /* mybus.c */ + + /* BUS DEVICE (parent) */ + + // parent device release + static void my_bus_device_release(struct device *dev) + { + } + + // parent device + static struct device my_bus_device = { + .init_name = "mybus0", + .release = my_bus_device_release + }; + + /* DEVICE */ + + /* + * as we are not using the reference count, we use a no-op + * release function + */ + static void my_dev_release(struct device *dev) + { + } + + int my_register_device(struct my_device *mydev) + { + mydev->dev.bus = &my_bus_type; + mydev->dev.parent = &my_bus_device; + mydev->dev.release = my_dev_release; + dev_set_name(&mydev->dev, mydev->name); + + return device_register(&mydev->dev); + } + + void my_unregister_device(struct my_device *mydev) + { + device_unregister(&mydev->dev); + } + + /* export register/unregister device functions */ + EXPORT_SYMBOL(my_register_device); + EXPORT_SYMBOL(my_unregister_device); + +As seen, the functions ``my_register_device`` and ``my_unregister_device``, used +to add/remove a device to/from a bus, are defined in the same file where the +bus is defined. Device structures are not initialized; they will be initialized +when the devices are discovered by the system (by hotplug or direct registration +from driver) and the function ``my_register_device`` will be called to add a +device to the bus. + +To use the bus defined above in the driver implementation, we must define a +structure of type ``my_device``, initialize it and register it using the function +exported by the bus (``my_register_device``). + +.. code-block:: c + + /* mydriver.c */ + + static struct my_device mydev; + char devname[NAME_SIZE]; + //... + + //register + int err; + + sprintf(devname, "mydev0"); + mydev.name = devname; + mydev.driver = &mydriver; + dev_set_drvdata(&mydev.dev, &mydev); + err = my_register_device(&mydev); + if (err < 0) { + /*handle error */ + } + + //.. 
+ + //unregister + my_unregister_device(&mydev); + +Drivers +------- + +Linux Device Model is used to allow simple association between system +devices and drivers. Drivers can export information independent of the physical +device. + +In sysfs, driver information has no single subdirectory associated; They can be +found in the directory structure in different places: the loaded module is in +``/sys/module``, in ``/sys/devices`` you can find the driver associated with +each device, in ``/sys/class`` the drivers belonging to a class, in +``/sys/bus`` the drivers associated to each bus. + +A device driver is identified by the structure :c:type:`struct device_driver`: + +.. code-block:: c + + struct device_driver { + const char *name; + struct bus_type *bus; + + struct driver_private *p; + + struct module *owner; + const char *mod_name; /* used for built-in modules */ + + int (*probe) (struct device *dev); + int (*remove) (struct device *dev); + void (*shutdown) (struct device *dev); + int (*suspend) (struct device *dev, pm_message_t state); + int (*resume) (struct device *dev); + }; + +Among the structure fields we find the name of the driver (appears in ``sysfs``), +the bus with which the driver works, and functions called at various times in a +device's operation. + +As before, we have the functions :c:func:`driver_register` and +:c:func:`driver_unregister` to register/unregister a driver. + +To work with attributes, we have the :c:type:`struct driver_attribute` structure, +the macro :c:type:`DRIVER_ATTR` for definition, and the functions +:c:func:`driver_create_file` and :c:func:`driver_remove_file` functions for +adding the attribute to the device. + +As with devices, the structure :c:type:`struct device_driver` is usually +incorporated into another structure specific to a particular bus (PCI, USB, etc.): + +.. 
code-block:: c + + /* mybus.c */ + + // my driver type + struct my_driver { + struct module *module; + struct device_driver driver; + }; + + #define to_my_driver(drv) container_of(drv, struct my_driver, driver); + + int my_register_driver(struct my_driver *driver) + { + int err; + + driver->driver.bus = &my_bus_type; + err= driver_register(&driver->driver); + if (err) + return err; + return 0; + } + + void my_unregister_driver(struct my_driver *driver) + { + driver_unregister(&driver->driver); + } + + /* export register/unregister driver functions */ + EXPORT_SYMBOL(my_register_driver); + EXPORT_SYMBOL(my_unregister_driver); + +Driver registration/unregistration operations are exported for use in +other modules. + +As for devices, the operations for drivers are defined when the bus is +initialized and they are exported to be used by drivers. When implementing a +driver that works with devices attached to the bus, we will call the functions +``my_register_driver`` and ``my_unregister_driver`` to associate with the bus. + +To use the functions (in the driver implementation), we must declare a structure +of type ``my_driver``, initialize it and register it using the function exported +by the bus. + +.. code-block:: c + + /* mydriver.c */ + + static struct my_driver mydriver = { + .module = THIS_MODULE, + .driver = { + .name = "mydriver", + }, + }; + //... + + //register + int err; + err = my_register_driver(&mydriver); + if (err < 0) { + /*handle error */ + } + //.. + + //unregister + my_unregister_driver(&mydriver); + + +Classes +------- + +A class is a high-level view of the Linux Device Model, which abstracts +implementation details. For example, there are drivers for SCSI and ATA +disks, but they all belong to the class of disks. Classes provide a grouping of +devices based on functionality, not how they are connected or how they work. +Classes have a correspondent in ``/sys/class``.
+ +There are two main structures that describe the classes: :c:type:`struct class` +and :c:type:`struct device`. +The class structure describes a generic class, while the structure +:c:type:`struct device` describes a class associated with a device. +There are functions for initializing/deinitializing and adding attributes for each +of these, described in ``include/linux/device.h``. + +The advantage of using classes is that the ``udev`` program in userspace, which we +will discuss later, allows the automatic creation of devices in the ``/dev`` +directory based on class information. + +For this reason, we will continue to present a small set of functions that work +with classes to simplify the use of the plug and play mechanism. + +A generic class is described by the :c:type:`struct class` structure: + +.. code-block:: c + + struct class { + const char *name; + struct module *owner; + struct kobject *dev_kobj; + + struct subsys_private *p; + + struct class_attribute *class_attrs; + struct class_device_attribute *class_dev_attrs; + struct device_attribute *dev_attrs; + + int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); + void (*class_release)(struct class *class); + void (*dev_release)(struct device *dev); + //... + }; + +The :c:func:`class_register` and :c:func:`class_unregister` functions can be +used for initialization/deinitialization. + +.. code-block:: c + + static struct class my_class = { + .name = "myclass", + }; + + static int __init my_init(void) + { + int err; + //... + err = class_register(&my_class); + if (err < 0) { + /* handle error */ + } + //... + } + + static void __exit my_cleanup(void) + { + //... + class_unregister(&my_class); + //... + } + +A class associated with a device is described by the :c:type:`struct device` +structure. +The :c:func:`device_create` and :c:func:`device_destroy` functions can be used +for initialization/deinitialization.
+The :c:func:`device_create` function initializes the ``device`` structure, +and assigns the generic ``class`` structure and the device received as a +parameter to it; +In addition, it will create an attribute of the class, ``dev``, which contains +the major and minor of the device (``major:minor``). +Thus, the udev utility in user space can read the necessary data from this attribute +file to create a node in the ``/dev`` directory by calling ``mknod``. + +An example of initialization: + +.. code-block:: c + + struct device* my_classdev; + struct cdev cdev; + struct device dev; + + //init class for device cdev.dev + my_classdev = device_create(&my_class, NULL, cdev.dev, &dev, "myclass0"); + + //destroy class for device cdev.dev + device_destroy(&my_class, cdev.dev); + +When a new device is discovered, a class will be assigned to it and +a node will be created in the ``/dev`` directory. +For the example above, the node ``/dev/myclass0`` will be generated. + +Hotplug +------- + +``Hotplug`` describes the mechanism for adding or removing a device from the +system while it is running without having to reboot the system. + +A hotplug event is a notification from the kernel to the user-space when something +changes in the system configuration. These events are generated when creating +or removing a kobject from the kernel. Since these objects are the basis of the +Linux Device Model, being included in all structures (``struct bus_type``, +``struct device``, ``struct device_driver``, ``struct class``, etc.), a hotplug event +will be generated when any of these structures is created or removed (``uevent``). + +When a device is discovered in the system, an event is generated. Depending on +the point where it resides in the Linux Device Model, the functions corresponding +to the event will be called (usually, the ``uevent`` function associated to the +bus or the class). Using these functions, the driver has the ability to set +system variables for the user-space.
+The generated event then reaches the user-space. Here is the ``udev`` +utility that captures these events. There are configuration files for this +utility in the ``/etc/udev/`` directory. Different rules can be specified to +capture only certain events and perform certain actions, depending on the +system variables set in the kernel or in ``uevent`` functions. + +An important consequence is that in this way the plug and play mechanism can be +achieved; with the help of ``udev`` and the classes (described above), entries +in the ``/dev/`` directories can be automatically created for devices, and using +``udev`` drivers can be automatically loaded for a device. + +Rules for ``udev`` are located ``/etc/udev/rules.d``. +Any file that ends with ``.rules`` in this directory will be parsed when an +event occurs. For more details on how to write rules in these files see +`Writing udev rules `_. +For testing, there are utilities such as ``udevmonitor``, ``udevinfo`` and +``udevtest``. + +For a quick example, consider the situation where we want to automatically load +a driver for a device when an event occurs. We can create a new file +/etc/udev/rules.d/myrules.rules, we will have the following line: + +.. code-block:: bash + + SUBSYSTEM=="pnp", ATTRS{id}=="PNP0400", RUN+="/sbin/insmod /root/mydriver.ko" + +This will choose from the events generated only those belonging to the ``pnp`` +subsystem (connected to ``PNP`` bus) and having an id attribute with the value +``PNP0400``. + +When this rule will be found, the command specified under ``RUN`` will be +executed to insert the appropriate driver in the kernel. + + +Plug and Play +============= + +As noted above, in Linux Device Model all devices are connected by a bus, even if +it has a corresponding physical hardware or it is virtual. + +The kernel already has implemented most buses using a ``bus_type`` structure +and functions to register/unregister drivers and devices. 
+To implement a driver, we must first determine the bus to which the supported +devices are connected and use the structures and functions exported by this bus. +The main buses are ``PCI``, ``USB``, ``PNP``, ``IDE``, ``SCSI``, ``platform``, +``ACPI``, etc. + +PNP bus +------- + +The plug and play mechanism provides a means of detecting and setting the resources +for legacy or otherwise unconfigurable devices. All plug and play +drivers, protocols and services are based on the Plug and Play layer. It is responsible +for the exchange of information between drivers and protocols. The following +protocols are available: + + * ``PNPBIOS`` - used for systems such as serial and parallel ports + * ``ISAPNP`` - offers support for the ISA bus + * ``ACPI`` - offering, among other things, information about system-level devices + +The kernel contains a bus, called ``pnp_bus``, to which many drivers +connect. +The implementation of the bus and the way of working with it follow the Linux Device Model and +are very similar to what we discussed above. + +The main functions and structures exported by the bus, which can be used by +drivers, are: + + * :c:type:`struct pnp_driver` - driver type associated to the bus + * :c:func:`pnp_register_driver` - function used to register a PNP driver in the system + * :c:func:`pnp_unregister_driver` - function used to unregister a PNP driver from the system + +As noted in previous sections, the bus has a function called ``match`` used to +associate the devices with the appropriate drivers. +For example, when discovering a new device, the bus searches for a driver which meets the condition +given by the ``match`` function with respect to the new device. Usually, this +condition is a comparison of IDs (driver id and device id). +A common approach is using a static table in each driver, which holds information +about the devices supported by the driver, which will be used by the bus +when verifying the condition.
For example, for a parallel port device we have +the table ``parport_pc_pnp_tbl``: + +.. code-block:: c + + static const struct pnp_device_id parport_pc_pnp_tbl[] = { + /* Standard LPT Printer Port */ + {.id = "PNP0400", .driver_data = 0}, + /* ECP Printer Port */ + {.id = "PNP0401", .driver_data = 0}, + }; + + MODULE_DEVICE_TABLE(pnp, parport_pc_pnp_tbl); + +Each driver declares and initializes a structure ``pnp_driver``, such as +``parport_pc_pnp_driver``: + +.. code-block:: c + + static int parport_pc_pnp_probe(struct pnp_dev *dev, const struct pnp_id *card_id, + const struct pnp_id *dev_id); + static void parport_pc_pnp_remove(struct pnp_dev* dev); + + static struct pnp_driver parport_pc_pnp_driver = { + .name = "parport_pc", + .id_table = parport_pc_pnp_tbl, + .probe = parport_pc_pnp_probe, + .remove = parport_pc_pnp_remove, + }; + +We can notice that the structure has as fields a pointer to the table declared +above and two functions, which are called when a new device is detected and when +it is removed from the system. +As all the structures presented above, the driver must be registered to the +system: + +.. code-block:: c + + static int __init parport_pc_init(void) + { + err = pnp_register_driver(&parport_pc_pnp_driver); + if (err < 0) { + /* handle error */ + } + } + + static void __exit parport_pc_exit(void) + { + pnp_unregister_driver(&parport_pc_pnp_driver); + } + +PNP operations +-------------- + +So far we have discussed the Linux Device Model and its API. To +implement a plug and play driver, we must respect the Linux Device Model model. + +Most often, adding a bus in the kernel is not necessary, as most of the existing +buses are already implemented (PCI, USB, etc.). Thus, we must first identify the +bus to which the device is attached. +In the examples below, we will consider that this bus is bus PNP and we will +use the structures and functions described above. + +.. 
ditaa:: + + + Kernel space | User space + | + | + +-------------+ +-------------+ +---------------+ | +--------+ + | | | | | | | | | + | my_device | | my_driver | | my_bus_type | | | udev | + | | | | | | | | | + +-----+-------+ +------+------+ +-------+-------+ | +---+----+ + | | | | | + : : : | : + | | 1.my_register_driver() | 2.call_usermodehelper() | + | +-+------------------------->+-+------------------------->+-+ + | | | | | | | | + | | | | | | | | + | | | | | | | | + | 3.my_uevent() | | | | 4.call_usermodehelper() | | + +++-------------------------| |--------------------------> +------------------------->| | + | | | | | | | | | + | | | | 6.my_probe() | | 5.my_match() | | + | | | |<=------------------------| |<=------------------------| | + | | | | | | | | | + | | | | | | | | | + | | | | | | | | | + | | 7.my_remove() | | 8.my_uevent() | | 9.call_usermodehelper() | | +---------------------------+ + +-+------------------------>| |------------------------->| |------------------------->| | | | + | | | | | | | | | 1 - 2 -> add driver | + | | | | | | | | | 3 - 6 -> add device | + | | | | | | | | | 7 - 9 -> remove device | + | | | 10.my_unregister_driver()| | 11.call_usermodehelper() | | | 10 - 11 -> remove driver | + | +-+------------------------->+-+------------------------->+-+ | | + | | | | | +---------------------------+ + : : : | : + + +Adding a driver +--------------- + +In addition to the usual operations, a driver must follow the Linux Device Model. +Thus, it will be registered in the system using the functions provided by +the bus for this purpose. +Usually, the bus provides a particular driver structure containing a +:c:type:`struct device_driver` structure, that the driver must initialize and +register using a function ``*_register_driver``. +For example, for the ``PNP`` bus, the driver must declare and initialize a +structure of type :c:type:`struct pnp_driver` and register it using +``pnp_register_drvier``: + +.. 
code-block:: c + + static struct pnp_driver my_pnp_driver = { + .name = "mydriver", + .id_table = my_pnp_tbl, + .probe = my_pnp_probe, + .remove = my_pnp_remove, + }; + + static int __init my_init(void) + { + err = pnp_register_driver(&my_pnp_driver); + } + +Unlike legacy drivers, plug and play drivers don't register devices at +initialization in the init function (``my_init`` in the example above) using +:c:func:`register_device`. + +As described above, each bus has a ``match`` function which is called when a new +device is detected in the system to determine the associated driver. +Thus, there must be a way for each driver to export information about the +devices it supports, to allow this check to pass and have its functions further +called. +In the examples presented in this lab, the match function does a simple +comparison between the device name and the driver name. Most drivers use a table +containing information about the supported devices and store a pointer to this table in the +driver structure. +For example, a driver associated to a ``PNP`` bus defines a table of type +:c:type:`struct pnp_device_id` and initializes the field ``id_table`` from the +structure ``pnp_driver my_pnp_driver`` with a pointer to it: + +.. code-block:: c + + static const struct pnp_device_id my_pnp_tbl[] = { + /* Standard LPT Printer Port */ + {.id = "PNP0400", .driver_data = 0}, + /* ECP Printer Port */ + {.id = "PNP0401", .driver_data = 0}, + { } + }; + + MODULE_DEVICE_TABLE(pnp,my_pnp_tbl); + + static struct pnp_driver my_pnp_driver = { + //... + .id_table = my_pnp_tbl, + //... + }; + +In the example above, the driver supports multiple parallel port devices, +defined in the table ``my_pnp_tbl``. This information is used by the bus in +the ``match_device`` function. +When adding a driver, the bus driver will be associated to it and new entries +in ``sysfs`` will be created based on the driver name.
+Then the bus ``match`` function will be called for every supported device, +to associate the driver with any connected device that it supports. + +Removing a driver +----------------- + +To remove a driver from the kernel, in addition to operations required for a +legacy driver, we must unregister the ``device_driver`` structure. +For a driver associated with the ``PNP`` bus, we must unregister the ``pnp_driver`` +structure using the :c:func:`pnp_unregister_driver` function: + +.. code-block:: c + + static struct pnp_driver my_pnp_driver; + + static void __exit my_exit(void) + { + pnp_unregister_driver(&my_pnp_driver); + } + +Unlike legacy drivers, plug and play drivers don't unregister devices in the +module unload function (``my_exit``). When a driver is removed, all the +references to it will be removed for all the devices it supports, and entries +from ``sysfs`` will also be removed. + +Adding a new device +------------------- + +As we saw above, plug and play drivers do not register devices at initialization. +This operation will take place in the ``probe`` function, which is called when +a new device is detected. A device attached to the ``PNP`` bus will be added to +the system by the function ``probe`` from the ``pnp_driver`` structure: + +.. code-block:: c + + static int my_pnp_probe(struct pnp_dev *dev, const struct pnp_id *card_id, + const struct pnp_id *dev_id) { + int err, iobase, nr_ports, irq; + + //get irq & ports + if (pnp_irq_valid(dev, 0)) + irq = pnp_irq(dev, 0); + if (pnp_port_valid(dev, 0)) { + iobase = pnp_port_start(dev, 0); + } else + return -ENODEV; + nr_ports = pnp_port_len(dev, 0); + + /* register device dev */ + } + + static struct pnp_driver my_pnp_driver = { + //... + .probe = my_pnp_probe, + //... + }; + +Upon detection of a device in the kernel (at boot or by the insertion of the +device through ``hotplug``), an interrupt is generated and reaches the bus +driver. 
+The device is registered using the function :c:func:`device_register` and it is +attached to the bus. A call to the user space will also be generated, and the +event can be treated by ``udev``. Then, the list of drivers associated with the +bus is iterated and the ``match`` function is called for each of them. +The ``match`` function tries to find a driver for the new device. After a +suitable driver is found for the device, the ``probe`` function of the driver +is called. If the function ends successfully, the device is added to the driver's +list of devices and new entries are created in ``sysfs`` based on the device name. + +Removing a device +----------------- + +As we saw above, the plug and play drivers don't unregister devices when the +driver is unloaded. This operation is done in the ``remove`` function, which +is called when a device is removed from the system. +In case of a device attached to the ``PNP`` bus, the unregister will be done +in the ``remove`` function specified in the ``pnp_driver`` structure: + +.. code-block:: c + + static void my_pnp_remove(struct pnp_dev *dev) { + /* unregister device dev */ + } + + static struct pnp_driver my_pnp_driver = { + //... + .remove = my_pnp_remove, + }; + +As seen in the example above, when the removal of a device is detected, the +``my_pnp_remove`` function is called. A user-space call is also generated, which +can be detected by ``udev``, and entries are removed from ``sysfs``. + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: device_model + +0. Intro +--------- + +Find the definitions of the following symbols in the Linux kernel: + + * functions ``dev_name``, ``dev_set_name``. + * functions ``pnp_device_probe``, ``pnp_bus_match``, ``pnp_register_driver`` + and the ``pnp_bus_type`` variable. + +1. Bus implementation +--------------------- + +Analyze the contents of the ``bex.c``, a module that implements a bus +driver. 
Follow the comments marked with **TODO 1** and implement the missing +functionality: register the bus driver and add a new device named ``root`` +with type ``none`` and version 1. + +.. hint:: See :c:func:`bex_add_dev`. + +.. hint:: The register and unregister must be done using :c:func:`bus_register` + and :c:func:`bus_unregister`. + +Load the module and verify that the bus is visible in ``/sys/bus``. Verify +that the device is visible in ``/sys/bus/bex/devices``. + +Remove the module and notice that the ``sysfs`` entries are removed. + +2. Add type and version device attributes +----------------------------------------- + +Add two read-only device attributes, ``type`` and ``version``. Follow the +**TODO 2** markings. + +.. hint:: You will need to add the two attributes in the structure + ``bex_dev_attrs``, as follows: + + ``&dev_attr_.attr,`` + +.. hint:: + + A possible implementation for the show function is the following: + + .. code-block:: c + + static ssize_t + type_show(struct device *dev, struct device_attribute *attr, char *buf) + { + struct bex_device *bex_dev = to_bex_device(dev); + + return sprintf(buf, "%s\n", bex_dev->type); + } + DEVICE_ATTR_RO(type); + +Observe that two new attributes are visible in +/sys/bus/bex/devices/root. Check the contents of these attributes. + +3. Add del and add bus attributes +--------------------------------- + +Add two write-only bus attributes, ``del`` and ``add``. del expects the name +of a device to delete, while add expects the name, type and version to +create a new device. Follow the **TODO 3** markings and review +`Buses`_. + +.. hint:: Use :c:func:`sscanf` to parse the input from sysfs and + :c:func:`bex_del_dev` and :c:func:`bex_add_dev` to delete + and create a new device. + +An example for the store function is the following: + +.. 
code-block:: c + + static ssize_t add_store(struct bus_type *bt, const char *buf, size_t count) + { + char name[32]; + int ret; + + ret = sscanf(buf, "%31s", name); + if (ret != 1) + return -EINVAL; + + ... + } + BUS_ATTR(add, S_IWUSR, NULL, add_store); + +.. hint:: The store functions should return ``0`` if + ``bex_add_dev``/``bex_del_dev`` fail and ``count`` otherwise. + +Create a new device and observe that is visible in +``/sys/bus/devices``. Delete it and observe it disapears from ``sysfs``. + +.. hint:: Use echo to write into the bus attributes: + + .. code-block:: shell + + $ echo "name type 1" > /sys/bus/bex/add + $ echo "name" > /sys/bus/bex/del + +4. Register the bex misc driver +------------------------------- + +Modify **bex-misc.c** so that it registers the driver with the bex +bus. Insert the ``bmx_misc.ko`` module and create a new bex device from +sysfs with the name "test", type "misc", version 2. Follow the **TODO +4** markings. + +Observe that the driver is visible in ``/sys/bus/bex/drivers``. + +Why isn't the probe function called? + +.. hint:: Notice that the bus match function in **bex.c** is not + implemented. + +Implement the bus matching function in **bex.c**. Follow the **TODO 5** +markings. Try again to create a new bex device and observe that this +time the ``probe`` function from the ``bex_misc`` driver is called. + +5. Register misc device in the bex_misc probe function +------------------------------------------------------ + +Modify **bex_misc.c** to refuse probing if ``version > 1``. Also, register the +defined misc device in ``bex_misc_probe`` and deregister it in +``bex_misc_remove``. Follow the **TODO 6** markings. + +.. hint:: Use :c:func:`misc_register` and :c:func:`misc_deregister`. + +Create a new device with the name "test", type "misc" and version 2 +and observe that the probe fails. Create a new device with the name +"test2", type "misc" and version 1 and observe that the probe is +successful. 
+ +Inspect ``/sys/bus/bex/devices/test2`` and observe that we have a new +entry. Identify the major and minor for the misc device, create a +character device file and try to read and write from the misc device +buffer. + +.. hint:: The major and minor should be visible in the dev attribute + of the misc device + +6. Monitor uevent notifications +------------------------------- + +Use the ``udevadm monitor`` command and observe what happens when: + +* the ``bex.ko`` and ``bex_misc.ko`` modules are inserted + +* a new device with the type "type" is created + +* a new device with the type "misc" and version 2 is created + +* a new device with the type "misc" and version 1 is created + +* all of the above are removed diff --git a/refs/pull/405/merge/_sources/labs/filesystems_part1.rst.txt b/refs/pull/405/merge/_sources/labs/filesystems_part1.rst.txt new file mode 100644 index 00000000..0fd3efa2 --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/filesystems_part1.rst.txt @@ -0,0 +1,796 @@ +============================ +File system drivers (Part 1) +============================ + +Lab objectives +============== + + * acquiring knowledge about the Virtual Filesystem (VFS) in Linux and understanding concepts regarding 'inode', 'dentry', 'file', superblock and data block. + * understanding the process of mounting a file system inside VFS. + * knowledge regarding various file system types and understanding differences between file systems with physical support (on disk) and the ones without physical support. + +Virtual Filesystem (VFS) +======================== + +The Virtual Filesystem (also known as VFS) is a component of the kernel that handles all system calls related to files and file systems. +VFS is a generic interface between the user and a particular file system. +This abstraction simplifies the implementation of file systems and provides an easier integration of multiple file systems. 
This way, the implementation of a file system is accomplished by using the API provided by the VFS, and the generic hardware and I/O subsystem communication parts are handled by VFS. + +From a functional point of view, file systems can be grouped into: + + * disk file systems (ext3, ext4, xfs, fat, ntfs, etc.) + * network file systems (nfs, smbfs/cifs, ncp, etc.) + * virtual filesystems (procfs, sysfs, sockfs, pipefs, etc.) + +A Linux kernel instance will use VFS for the hierarchy (a tree) of directories and files. +A new file system will be added as a VFS subtree using the mount operation. +A file system is usually mounted from the environment for which it was built (from a block type device, from network, etc.). +In particular, however, the VFS can use a normal file as a virtual block device, so it is possible to mount disk file systems over normal files. This way, stacks of file systems can be created. + +The basic idea of VFS is to provide a single file model that can represent files from any file system. +The file system driver is responsible for bringing to the common denominator. +This way the kernel can create a single directory structure that contains the entire system. +There will be a file system that will be the root, the rest being mounted in its various directories. + +The general file system model +============================= + +The general file system model, to which any implemented file system needs to be reduced, consists of several well-defined entities: :c:type:`superblock`, :c:type:`inode`, :c:type:`file`, and :c:type:`dentry`. +These entities are file system metadata (they contain information about data or other metadata). + +Model entities interact using some VFS or kernel subsystems: dentry cache, inode cache, buffer cache. +Each entity is treated as an object: it has a associated data structure and a pointer to a table of methods. The induction of particular behavior for each component is done by replacing the associated methods. 
+ +superblock +---------- + +The superblock stores the information needed for a mounted file system: + + * inode and blocks locations + * file system block size + * maximum filename length + * maximum file size + * the location of the root inode + +Localization: +~~~~~~~~~~~~~ + + * In the case of disk file systems, the superblock has a correspondent in the first block of the disk. (Filesystem Control Block). + * In VFS, all superblocks of filesystems are retained in a list of structures of type :c:type:`struct super_block` and the methods in structures of type :c:type:`struct super_operations`. + +inode +----- + +The inode (index node) keeps information about a file in the general sense (abstraction): regular file, directory, special file (pipe, fifo), block device, character device, link, or anything that can be abstracted as a file. + +An inode stores information like: + + * file type; + * file size; + * access rights; + * access or modify time; + * location of data on the disk (pointers to disk blocks containing data). + +.. note:: + Usually, the inode does not contain the file name. The name is stored by the :c:type:`dentry` entity. This way, an inode can have multiple names (hardlinks). + +Localization: +~~~~~~~~~~~~~ + +Like the superblock, the :c:type:`inode` has a disk correspondent. +The inodes on disk are generally grouped into a specialized area (inode area) separated from the data blocks area; In some file systems, the equivalents of the inodes are spread in the file system structure (FAT); +As a VFS entity, an inode is represented by the structure :c:type:`struct inode` and by the operations with it defined in the structure :c:type:`struct inode_operations`. + +Each inode is generally identified by a number. On Linux, the ``-i`` argument of the ``ls`` command shows the inode number associated with each file: + +.. 
code-block:: console + + razvan@valhalla:~/school/so2/wiki$ ls -i + 1277956 lab10.wiki 1277962 lab9.wikibak 1277964 replace_lxr.sh + 1277954 lab9.wiki 1277958 link.txt 1277955 homework.wiki + +file +---- + +File is the component of the file system model that is closest to the user. +The structure exists only as a VFS entity in memory and has no physical correspondent on disk. + +While the inode abstracts a file on the disk, the file structure abstracts an open file. +From the point of view of the process, the file entity abstracts the file. From the point of view of the file system implementation, however, the inode is the entity that abstracts the file. + +The file structure maintains information such as: + + * file cursor position; + * file opening rights; + * pointer to the associated inode (eventually its index). + +Localization: +~~~~~~~~~~~~~ + + * The structure :c:type:`struct file` is the associated VFS entity, and the structure :c:type:`struct file_operations` represents the operations associated with it. + +dentry +------ + +The dentry (directory entry) associates an inode with a file name. + +Generally, a dentry structure contains two fields: + + * an integer that identifies the inode; + * a string representing its name. + +The dentry is a specific part of a path that can be a directory or a file. For example, for the path ``/bin/vi``, dentry objects will be created for ``/``, ``bin``, and ``vi`` (a total of 3 dentry objects). + + * the dentry has a correspondent on the disk, but the correspondence is not direct because each file system keeps the dentries in a specific way + * in VFS, the dentry entity is represented by the structure :c:type:`struct dentry` and the operations with it are defined in the :c:type:`struct dentry_operations` structure. + +.. 
_RegisterUnregisterSection: + +Register and unregister filesystems +=================================== + +In the current version, the Linux kernel supports about 50 file systems, including: + + * ext2/ ext4 + * reiserfs + * xfs + * fat + * ntfs + * iso9660 + * udf for CDs and DVDs + * hpfs + +On a single system, however, it is unlikely that there will be more than 5-6 file systems. For this reason, file systems (or, more correctly, file system types) are implemented as modules and can be loaded or unloaded at any time. + +In order to be able to dynamically load / unload a file system module, a file system registration / deregistration API is required. The structure describing a particular file system is :c:type:`struct file_system_type`: + + .. code-block:: c + + #include + + struct file_system_type { + const char *name; + int fs_flags; + struct dentry *(*mount) (struct file_system_type *, int, + const char *, void *); + void (*kill_sb) (struct super_block *); + struct module *owner; + struct file_system_type * next; + struct hlist_head fs_supers; + struct lock_class_key s_lock_key; + struct lock_class_key s_umount_key; + //... + }; + + * ``name`` is a string representing the name that will identify a file system (the argument passed to ``mount -t``). + * ``owner`` is ``THIS_MODULE`` for file systems implemented in modules, and ``NULL`` if they are written directly into the kernel. + * The ``mount`` function reads the superblock from the disk in memory when loading the file system. The function is unique to each file system. + * The ``kill_sb`` function releases the super-block from memory. + * ``fs_flags`` specifies the flags with which the file system must be mounted. An example of such flag is ``FS_REQUIRES_DEV`` that specifies to VFS that the file system needs a disk (it is not a virtual file system). + * ``fs_supers`` is a list containing all the superblocks associated with this file system. 
Since the same file system can be mounted multiple times, there will be a separate superblock for each mount. + +The *registration of a file system* into the kernel is generally performed in the module initialization function. For registration, the programmer will have to + + #. initialize a structure of type :c:type:`struct file_system_type` with the name, the flags, the function that implements the superblock reading operation and the reference to the structure that identifies the current module + #. call the :c:func:`register_filesystem` function. + +When unloading the module, you must unregister the file system by calling the :c:func:`unregister_filesystem` function. + +An example of registering a virtual file system is found in the code for ``ramfs``: + +.. code-block:: c + + static struct file_system_type ramfs_fs_type = { + .name = "ramfs", + .mount = ramfs_mount, + .kill_sb = ramfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, + }; + + static int __init init_ramfs_fs(void) + { + if (test_and_set_bit(0, &once)) + return 0; + return register_filesystem(&ramfs_fs_type); + } + +.. _FunctionsMountKillSBSection: + +Functions mount, kill_sb +------------------------ + +When mounting the file system, the kernel calls the mount function defined within the structure :c:type:`file_system_type`. The function makes a set of initializations and returns a dentry (the structure :c:type:`struct dentry`) that represents the mount point directory. 
Usually :c:func:`mount` is a simple function that calls one of the functions: + + * :c:func:`mount_bdev`, which mounts a file system stored on a block device + * :c:func:`mount_single`, which mounts a file system that shares an instance between all mount operations + * :c:func:`mount_nodev`, which mounts a file system that is not on a physical device + * :c:func:`mount_pseudo`, a helper function for pseudo-file systems (``sockfs``, ``pipefs``, generally file systems that can not be mounted) + +These functions get as parameter a pointer to a function :c:func:`fill_super` that will be called after the superblock initialization to finish its initialization by the driver. An example of such a function can be found in the ``fill_super`` section. + +When unmounting the file system, the kernel calls :c:func:`kill_sb`, which performs cleanup operations and invokes one of the functions: + + * :c:func:`kill_block_super`, which unmounts a file system on a block device + * :c:func:`kill_anon_super`, which unmounts a virtual file system (information is generated when requested) + * :c:func:`kill_litter_super`, which unmounts a file system that is not on a physical device (the information is kept in memory) + +An example for a file system without disk support is the :c:func:`ramfs_mount` function in the ``ramfs`` file system: + +.. code-block:: c + + struct dentry *ramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) + { + return mount_nodev(fs_type, flags, data, ramfs_fill_super); + } + +An example for a file system from disk is the :c:func:`minix_mount` function in the ``minix`` file system: + +.. 
code-block:: c + + struct dentry *minix_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) + { + return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super); + } + +Superblock in VFS +================= + +The superblock exists both as a physical entity (entity on disk) and as a VFS entity (within the :c:type:`struct super_block` structure). +The superblock contains only metainformation and is used to write and read metadata from the disk (inodes, directory entries). +A superblock (and implicitly the :c:type:`struct super_block` structure) will contain information about the block device used, the list of inodes, a pointer to the inode of the file system root directory, and a pointer to the superblock operations. + +The :c:type:`struct super_block` structure +------------------------------------------ + +Part of the :c:type:`struct super_block` structure definition is presented below: + +.. code-block:: c + + struct super_block { + //... + dev_t s_dev; /* identifier */ + unsigned char s_blocksize_bits; /* block size in bits */ + unsigned long s_blocksize; /* block size in bytes */ + unsigned char s_dirt; /* dirty flag */ + loff_t s_maxbytes; /* max file size */ + struct file_system_type *s_type; /* filesystem type */ + struct super_operations *s_op; /* superblock methods */ + //... + unsigned long s_flags; /* mount flags */ + unsigned long s_magic; /* filesystem’s magic number */ + struct dentry *s_root; /* directory mount point */ + //... + char s_id[32]; /* informational name */ + void *s_fs_info; /* filesystem private info */ + }; + +The superblock stores global information for an instance of a file system: + * the physical device on which it resides + * block size + * the maximum size of a file + * file system type + * the operations it supports + * magic number (identifies the file system) + * the root directory ``dentry`` + +Additionally, a generic pointer (``void *``) stores the private data of the file system. 
+The superblock can be viewed as an abstract object to which its own data is added when there is a concrete implementation. + +.. _SuperblockSection: + +Superblock operations +--------------------- + +The superblock operations are described by the :c:type:`struct super_operations` structure: + +.. code-block:: c + + struct super_operations { + //... + int (*write_inode) (struct inode *, struct writeback_control *wbc); + struct inode *(*alloc_inode)(struct super_block *sb); + void (*destroy_inode)(struct inode *); + + void (*put_super) (struct super_block *); + int (*statfs) (struct dentry *, struct kstatfs *); + int (*remount_fs) (struct super_block *, int *, char *); + //... + }; + +The fields of the structure are function pointers with the following meanings: + + * ``write_inode``, ``alloc_inode``, ``destroy_inode`` write, allocate, respectively release resources associated with an inode and are described in the next lab + * ``put_super`` is called when the superblock is released at ``umount``; within this function, any resources (generally memory) from the file system's private data must be released; + * ``remount_fs`` is called when the kernel detects a remount attempt (mount flag ``MS_REMOUNTM``); most of the time here must be detected if a switch from read-only to read-write or vice versa is attempted; this can be done simply because both the old flags (in ``sb->s_flags``) and the new flags (the ``flags`` argument) can be accessed; ``data`` is a pointer to the data sent by :c:func:`mount` that represent file system specific options; + * ``statfs`` is called when a ``statfs`` system call is done (try ``stat –f`` or ``df``); this call must fill the fields of the :c:type:`struct kstatfs` structure, as it is done, for example, in the :c:func:`ext4_statfs` function. + +.. 
_FillSuperSection: + +The :c:func:`fill_super` function +===================================== + +As specified, the :c:func:`fill_super` function is called to terminate the superblock initialization. This initialization involves filling the :c:type:`struct super_block` structure fields and the initialization of the root directory inode. + +An example of implementation is the :c:func:`ramfs_fill_super` function which is called to initialize the remaining fields in the superblock: + +.. code-block:: c + + #include + + #define RAMFS_MAGIC 0x858458f6 + + static const struct super_operations ramfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .show_options = ramfs_show_options, + }; + + static int ramfs_fill_super(struct super_block *sb, void *data, int silent) + { + struct ramfs_fs_info *fsi; + struct inode *inode; + int err; + + save_mount_options(sb, data); + + fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; + if (!fsi) + return -ENOMEM; + + err = ramfs_parse_options(data, &fsi->mount_opts); + if (err) + return err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = RAMFS_MAGIC; + sb->s_op = &ramfs_ops; + sb->s_time_gran = 1; + + inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + return 0; + } + + +The kernel provides generic function to implement operations with file system structures. +The :c:func:`generic_delete_inode` and :c:func:`simple_statfs` functions used in the above code are such functions and can be used to implement the drivers if their functionality is sufficient. + +The :c:func:`ramfs_fill_super` function in the above code fills some fields in the superblock, then reads the root inode and allocates the root dentry. 
+Reading the root inode is done in the :c:func:`ramfs_get_inode` function, and consists of allocating a new inode using :c:func:`new_inode` and initializing it. In order to free the inode, :c:func:`iput` is used, and :c:func:`d_make_root` is used to allocate the root dentry. + +An example implementation for a disk file system is the :c:func:`minix_fill_super` function in the minix file system. +The functionality for the disk file system is similar to that of the virtual file system, with the exception of using the buffer cache. +Also, the minix file system keeps private data using the :c:type:`struct minix_sb_info` structure. +A large part of this function deals with the initialization of these private data. +The private data is allocated using the :c:func:`kzalloc` function and stored in the ``s_fs_info`` field of the superblock structure. + +VFS functions typically get as arguments the superblock, an inode and/or a dentry that contain a pointer to the superblock so that these private data can be easily accessed. + +.. _BufferCacheSection: + +Buffer cache +============ + +Buffer cache is a kernel subsystem that handles caching (both read and write) blocks from block devices. +The base entity used by buffer cache is the :c:type:`struct buffer_head` structure. 
+The most important fields in this structure are: + + * ``b_data``, pointer to a memory area where the data was read from or where the data must be written to + * ``b_size``, buffer size + * ``b_bdev``, the block device + * ``b_blocknr``, the number of block on the device that has been loaded or needs to be saved on the disk + * ``b_state``, the status of the buffer + +There are some important functions that work with these structures: + + * :c:func:`__bread`: reads a block with the given number and given size in a ``buffer_head`` structure; in case of success returns a pointer to the ``buffer_head`` structure, otherwise it returns ``NULL``; + * :c:func:`sb_bread`: does the same thing as the previous function, but the size of the read block is taken from the superblock, as well as the device from which the read is done; + * :c:func:`mark_buffer_dirty`: marks the buffer as dirty (sets the ``BH_Dirty`` bit); the buffer will be written to the disk at a later time (from time to time the ``bdflush`` kernel thread wakes up and writes the buffers to disk); + * :c:func:`brelse`: frees up the memory used by the buffer, after it has previously written the buffer on disk if needed; + * :c:func:`map_bh`: associates the buffer-head with the corresponding sector. + +Functions and useful macros +=========================== + +The super block typically contains a map of occupied blocks (by inodes, dentries, data) in the form of a bitmap (vector of bits). To work with such maps, it is recommend to use the following features: + + * :c:func:`find_first_zero_bit`, to find the first zero bit in a memory area. The size parameter means the number of bits in the search area; + * :c:func:`test_and_set_bit`, to set a bit and get the old value; + * :c:func:`test_and_clear_bit`, to delete a bit and get the old value; + * :c:func:`test_and_change_bit`, to invert the value of a bit and get the old value. 
The following macros can be used to check the type of an inode:
That is why, for the beginning, we will work with a simple, virtual file system (i.e. without physical disk support). The file system is called ``myfs``. + +For this we will access the ``myfs/`` subdirectory in the laboratory skeleton. We will implement the superblock operations within this lab, and the next lab will continue with the inode operations. + +1. Register and unregister the myfs file system +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The first step in working with the file system is to register and unregister it. We want to do this for the file system described in ``myfs.c``. Check the file contents and follow the directions marked with ``TODO 1``. + +The steps you need to take are described in the section :ref:`RegisterUnregisterSection`. Use the ``"myfs"`` string for the file system name. + +.. note:: + Within the file system structure, use the ``myfs_mount`` function present in the code skeleton to fill the superblock (done when mounting). In ``myfs_mount`` call the function specific to a file system without disk support. As an argument for the specific mount function, use the function of type ``fill_super`` defined in the code skeleton. You can review the :ref:`FunctionsMountKillSBSection` section. + + To destroy the superblock (done at unmounting) use ``kill_litter_super``, also a function specific to a file system without disk support. The function is already implemented, you need to fill it in the :c:type:`struct file_system_type` structure. + + +After completing the sections marked with ``TODO 1`` , compile the module, copy it to the QEMU virtual machine, and start the virtual machine. Load the kernel module and then check the presence of the ``myfs`` file system within the ``/proc/filesystems`` file. + +At the moment, the file system is only registered, it does not expose operations to use it. If we try to mount it, the operation will fail. To try mounting, we create mount point ``/mnt/myfs/``. + +.. 
code-block:: console + + # mkdir -p /mnt/myfs + +and then we use the ``mount`` command: + +.. code-block:: console + + # mount -t myfs none /mnt/myfs + +The error message we get shows that we have not implemented the operations that work on the superblock. We will have to implement the operations on the superblock and initialize the root inode. We will do this further. + +.. note:: + + The ``none`` argument sent to the ``mount`` command indicates that we do not have a device from which to mount, the file system being a virtual one. Similarly, this is how the ``procfs`` or ``sysfs`` filesystems are mounted on Linux systems. + + +2. Completing myfs superblock +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To be able to mount the file system, we need to fill its superblock's fields, that is, a generic VFS structure of type :c:type:`struct super_block`. +We will fill out the structure within the :c:func:`myfs_fill_super` function; the superblock is represented by the variable ``sb`` passed as an argument to the function. +Follow the hints marked with ``TODO 2``. + +.. note:: + + To fill the ``myfs_fill_super`` function, you can start from the example in the section :ref:`FillSuperSection`. + + For the superblock structure fields, use the macros defined within the code skeleton wherever possible. + + +The ``s_op`` field in the superblock structure must be initialized to the superblock operations structures (type :c:type:`struct super_operations`). You need to define such a structure. + +For information on defining the :c:type:`struct super_operations` structure and filling the superblock, see the section :ref:`SuperblockSection`. + +.. note:: + + Initialize the ``drop_inode`` and ``statfs`` fields of :c:type:`struct super_operations` structure. + + +Although the superblock will be properly initialized at this time, the mount operation will continue to fail. 
+In order for the operation to be successfully completed, the root inode will have to be initialized, which we will do for the next exercise. + + +3. Initialize myfs root inode +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The root inode is the inode of the file system root directory (i.e. ``/``). +Initialization is done when the file system is mounted. +The ``myfs_fill_super`` function, called at mount, is the one that calls the ``myfs_get_inode`` function that creates and initializes an inode. +Typically, this function is used to create and initialize all inodes; in this exercise, however, we will only create the root inode. + +The :c:type:`inode` is allocated inside the ``myfs_get_inode`` function (local variable ``inode``, allocated using the :c:func:`new_inode` function call). + +To successfully complete mounting the file system, you will need to fill the ``myfs_get_inode`` function. Follow directions marked with ``TODO 3``. A starting point is the :c:func:`ramfs_get_inode` function. + +.. note:: + + To initialize ``uid``, ``gid`` and ``mode``, you can use the :c:func:`inode_init_owner` function as it is used in :c:func:`ramfs_get_inode`. + When you call :c:func:`inode_init_owner`, use ``NULL`` as the second parameter because there is no parent directory for the created inode. + + Initialize the ``i_atime``, ``i_ctime``, and ``i_mtime`` of the VFS inode to the value returned by the :c:func:`current_time` function. + + You will need to initialize the operations for the inode of type directory. To do this, follow the steps: + + #. Check if this is a directory type inode using the ``S_ISDIR`` macro. + #. For the ``i_op`` and ``i_fop`` fields, use kernel structures that are already implemented: + + * for ``i_op``: :c:type:`simple_dir_inode_operations`. + * for ``i_fop``: :c:type:`simple_dir_operations` + + #. Increase the number of links for the directory using the :c:func:`inc_nlink` function. + +4. 
Test myfs mount and unmount +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Now we can mount the filesystem. +Follow the steps above to compile the kernel module, copy to the virtual machine, and start the virtual machine, then insert the kernel module, create the mount point ``/mnt/myfs/``, and mount the file system. +We verify that the file system was mounted by inspecting the ``/proc/mounts`` file. + +What inode number does the ``/mnt/myfs`` directory have? Why? + +.. note:: + + To display the inode number of a directory, use the command: + + .. code-block:: console + + ls -di /path/to/directory + + where ``/path/to/directory/`` is the path to the directory whose inode number we want to display. + +We check myfs file system statistics using the following command: + +.. code-block:: console + + stat -f /mnt/myfs + +We want to see what the mount point ``/mnt/myfs`` contains and if we can create files. +For this we run the commands: + +.. code-block:: console + + # ls -la /mnt/myfs + # touch /mnt/myfs/a.txt + +We can see that we can not create the ``a.txt`` file on the file system. +This is because we have not implemented the operations to work with inodes in the :c:type:`struct super_operations` structure. +We will implement these operations within the next lab. + +Unmount the file system using the command + +.. code-block:: console + + umount /mnt/myfs + +Unload the kernel module corresponding to the file system as well. + +.. note:: + + To test the entire functionality, you can use the ``test-myfs.sh`` script: + + .. code-block:: console + + ./test-myfs.sh + + The script is copied to the virtual machine using ``make copy`` only if it is executable: + + .. code-block:: console + + student@workstation:~/linux/tools/labs$ chmod +x skels/filesystems/myfs/test-myfs.sh + + +.. note:: + + The statistics displayed for the file system are minimal because the information is provided by the simple_statfs function. 
+ +minfs +----- + +Next, we will implement the basics of a very simple file system, called ``minfs``, with disk support. +We will use a disk in the virtual machine that we will format and mount with the ``minfs`` filesystem. + +For this we will access the ``minfs/kernel`` directory from the laboratory skeleton and work with the code in ``minfs.c``. +Just like ``myfs`` we will not implement the operations for working with inodes. We will limit ourselves to working with the superblock and, therefore, mounting. +The rest of the operations will be implemented in the next lab. + +Follow the diagram below to clarify the role of structures within the ``minfs`` file system. + +.. image:: ../res/minfs.png + +1. Registering and unregistering the minfs file system +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + Before solving the exercise, we need to add a disk to the virtual machine. To do this, generate a file that we will use as the disk image using the following command: + + .. code-block:: console + + dd if=/dev/zero of=mydisk.img bs=1M count=100 + + and add the ``-drive file=mydisk.img,if=virtio,format=raw`` argument to the ``qemu`` command in ``qemu/Makefile`` (in the ``QEMU_OPTS`` variable). + The new argument for the ``qemu`` command must be added after the one for the existing disk (``YOCTO_IMAGE``). + +To register and unregister the file system, you will need to fill in the ``minfs_fs_type`` structure and the ``minfs_mount`` function in ``minfs.c``. Follow the directions marked with ``TODO 1``. + +.. note:: + + In the file system structure, for mount, use the ``minfs_mount`` function from the code skeleton. + In this function, call the function to mount a file system with disk support (see the :ref:`FunctionsMountKillSBSection` section. Use :c:func:`mount_bdev`). + Choose the most suitable function for destroying the superblock (done at unmount); keep in mind that it is a file system with disk support. Use the :c:func:`kill_block_super` function. 
+ + Initialize the ``fs_flags`` field of the :c:type:`minfs_fs_type` structure with the appropriate value for a file system with disk support. See the section :ref:`RegisterUnregisterSection`. + + The function for filling the superblock is ``minfs_fill_super``. + +After completing the sections marked with ``TODO 1``, compile the module, copy it into the QEMU virtual machine, and start the virtual machine. +Load the kernel module and then check the presence of the ``minfs`` file system within the ``/proc/filesystems`` file. + +To test the mounting of the ``minfs`` file system we will need to format the disk with its structure. Formatting requires the ``mkfs.minfs`` formatting tool from the ``minfs/user`` directory. The utility is automatically compiled when running ``make build`` and copied to the virtual machine at ``make copy``. + +After compiling, copying, and starting the virtual machine, format the ``/dev/vdd`` disk using the formatting utility: + +.. code-block:: console + + # ./mkfs.minfs /dev/vdd + +Load the kernel module: + +.. code-block:: console + + # insmod minfs.ko + +Create mount point ``/mnt/minfs/``: + +.. code-block:: console + + # mkdir -p /mnt/minfs/ + +and mount the filesystem: + +.. code-block:: console + + # mount -t minfs /dev/vdd /mnt/minfs/ + +The operation fails because the root inode is not initialized. + +2. Completing minfs superblock +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To be able to mount the file system, you will need to fill the superblock (i.e. a structure with type :c:type:`struct super_block`) within the ``minfs_fill_super`` function; it is the ``s`` argument of the function. +The structure of operations on the superblock is already defined: ``minfs_ops``. +Follow the directions marked with ``TODO 2``. You can also follow the implementation of the :c:func:`minix_fill_super` function. + +.. note:: + + Some structures are found in the header file ``minfs.h``. 
+ + For information on working with buffers, go to the :ref:`BufferCacheSection` section. + + Read the first block on the disk (block with index 0). + To read the block, use the :c:func:`sb_bread` function. + Cast the read data (the ``b_data`` field in the :c:type:`struct buffer_head` structure) to the structure storing the ``minfs`` superblock information on the disk: :c:type:`struct minfs_super_block`, defined in the source code file. + + Structure :c:type:`struct minfs_super_block` holds file system-specific information that is not found in the :c:type:`struct super_block` generic structure (in this case only version). + Those additional information (found in :c:type:`struct minfs_super_block` (on disk) but not in :c:type:`struct super_block` (VFS)) will be stored in the :c:type:`struct minfs_sb_info` structure. + +To check the functionality, we need a function for reading the root inode. +For the time being, use the ``myfs_get_inode`` function from ``myfs`` file system exercises. +Copy the function into the source code and call it the same as you did for myfs. +The third argument when calling the ``myfs_get_inode`` function is the inode creation permissions, similar to the virtual file system exercise (myfs). + +Validate the implementation by executing the commands from the previous exercise. + +3. Creating and destroying minfs inodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For mounting, we need to initialize the root inode, and to get the root inode, we need to implement the functions to work with inodes. +That is, you need to implement the ``minfs_alloc_inode`` and ``minfs_destroy_inode`` functions. +Follow the directions marked with ``TODO 3``. You can use the :c:func:`minix_alloc_inode` and :c:func:`minix_destroy_inode` functions as a model. + +For the implementation, look at the macros and structures in the ``minfs.h`` header file. + +.. 
note:: + + For memory allocation/deallocation in ``minfs_alloc_inode`` and ``minfs_destroy_inode``, we recommend using :c:func:`kzalloc` and :c:func:`kfree`. + + In ``minfs_alloc_inode`` allocate structures with type :c:type:`struct minfs_inode_info`, but only return structures with type :c:type:`struct inode`, i.e. return those given by the ``vfs_inode`` field. + + In the ``minfs_alloc_inode`` function, call :c:func:`inode_init_once` to initialize the inode. + + In the ``destroy_inode`` function, you can access the structure with type :c:type:`struct minfs_inode_info` using the ``container_of`` macro. + +.. note:: + + In this exercise, you have implemented the ``minfs_alloc_inode`` and ``minfs_destroy_inode`` functions, but they are not yet called. The correctness of the implementation will be checked at the end of the next exercise. + +4. Initialize minfs root inode +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Initializing the root inode is required in order to mount the file system. +For this, you will need to complete the ``minfs_ops`` structure with the ``minfs_alloc_inode`` and ``minfs_destroy_inode`` functions and fill the ``minfs_iget`` function. + +The ``minfs_iget`` function is the function called to allocate a VFS inode (i.e. :c:type:`struct inode`) and fill it with minfs inode-specific information from the disk (i.e. ``struct minfs_inode``). + +Follow the directions marked with ``TODO 4``. +Fill out the ``alloc_inode`` and ``destroy_inode`` fields of :c:type:`struct super_operations` structure with the functions implemented in the previous step. + +The information about the root inode is found in the second block on the disk (the inode with index 1). +Make ``minfs_iget`` read the root minfs inode from the disk (:c:type:`struct minfs_inode`) and fill in the VFS inode (:c:type:`struct inode`). + +In the ``minfs_fill_super`` function, replace the ``myfs_get_inode`` call with the ``minfs_iget`` function call. + +.. 
note:: + To implement the ``minfs_iget`` function, follow the implementation of :c:func:`V1_minix_iget`. + To read a block, use the :c:func:`sb_bread` function. + Cast the read data (the ``b_data`` field of the :c:type:`struct buffer_head` structure) to the minfs inode from the disk (:c:type:`struct minfs_inode`). + + The ``i_uid``, ``i_gid``, ``i_mode``, ``i_size`` must be filled in the VFS inode with the values in the minfs inode structure read from disk. + To initialize the ``i_uid`` and ``i_gid`` fields, use the functions :c:func:`i_uid_write` and :c:func:`i_gid_write`. + + Initialize the ``i_atime``, ``i_ctime``, and ``i_mtime`` fields of the VFS inode to the value returned by the :c:func:`current_time` function. + + You will need to initialize the operations for the inode with type directory. To do this, follow the steps: + + #. Check if this is a directory type inode using the ``S_ISDIR`` macro. + #. For the ``i_op`` and ``i_fop`` fields, use kernel structures that are already implemented: + + * for ``i_op``: :c:type:`simple_dir_inode_operations`. + * for ``i_fop``: :c:type:`simple_dir_operations` + + #. Increment the number of links for the directory using the :c:func:`inc_nlink` function. + +5. Testing of minfs mount and unmount +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Now we can mount the filesystem. +Follow the steps above to compile the kernel module, copy to the virtual machine, start the virtual machine, and then insert the kernel module, create mount point ``/mnt/minfs/`` and mount the file system. +We verify that the file system was mounted by investigating the ``/proc/mounts`` file. + +We check that everything is fine by listing the mount point contents ``/mnt/minfs/``: + +.. code-block:: console + + # ls /mnt/minfs/ + +After mount and verification, unmount the file system and unload the module from the kernel. + +.. note:: + Alternatively, to test the entire functionality, you can use the ``test-minfs.sh`` script: + + .. 
code-block:: console + + # ./test-minfs.sh + + The script is copied to the virtual machine when running the ``make copy`` command only if it is executable: + + .. code-block:: console + + student@workstation:~/linux/tools/labs$ chmod +x skels/filesystems/minfs/user/test-minfs.sh + diff --git a/refs/pull/405/merge/_sources/labs/filesystems_part2.rst.txt b/refs/pull/405/merge/_sources/labs/filesystems_part2.rst.txt new file mode 100644 index 00000000..9217cdbe --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/filesystems_part2.rst.txt @@ -0,0 +1,1076 @@ +============================ +File system drivers (Part 2) +============================ + +Lab objectives +============== + + * Improving the knowledge about inode, file and dentry. + * Acquiring knowledge about adding support for working with regular files and directories in VFS (*Virtual File System*). + * Acquiring knowledge about the internal implementation of a file system. + +Inode +===== + +The inode is an essential component of a UNIX file system and, at the same time, an important component of VFS. An inode is metadata (information about information). +An inode uniquely identifies a file on disk and holds information about it (uid, gid, access rights, access times, pointers to data blocks, etc.). +An important aspect is that an inode does not have information about the file name (it is retained by the associated :c:type:`struct dentry` structure). + +The inode refers to a file on the disk. To refer to an open file (associated with a file descriptor within a process), the :c:type:`struct file` structure is used. +An inode can have any number of (zero or more) ``file`` structures associated (multiple processes can open the same file, or a process can open the same file several times). + +Inode exists both as a VFS entity (in memory) and as a disk entity (for UNIX, HFS, NTFS, etc.). +The inode in VFS is represented by the structure :c:type:`struct inode`. 
+Like the other structures in VFS, :c:type:`struct inode` is a generic structure that covers the options for all supported file types, even those that do not have an associated disk entity (such as FAT). + +The inode structure +------------------- + +The inode structure is the same for all file systems. In general, file systems also have private information. These are referenced through the ``i_private`` field of the structure. +Conventionally, the structure that keeps that particular information is called ``<fsname>_inode_info``, where ``fsname`` represents the file system name. For example, minix and ext4 filesystems store particular information in structures :c:type:`struct minix_inode_info`, or :c:type:`struct ext4_inode_info`. + +Some of the important fields of :c:type:`struct inode` are: + + * ``i_sb``: the superblock structure of the file system the inode belongs to. + * ``i_rdev``: the device on which this file system is mounted + * ``i_ino``: the number of the inode (uniquely identifies the inode within the file system) + * ``i_blkbits``: number of bits used for the block size == log\ :sub:`2`\ (block size) + * ``i_mode``, ``i_uid``, ``i_gid``: access rights, uid, gid + + * ``i_size``: file/directory/etc. size in bytes + * ``i_mtime``, ``i_atime``, ``i_ctime``: data modification, access, and inode (status) change time + * ``i_nlink``: the number of name entries (dentries) that use this inode; for file systems without links (either hard or symbolic) this is always set to 1 + * ``i_blocks``: the number of blocks used by the file (all blocks, not just data); this is only used by the quota subsystem + * ``i_op``, ``i_fop``: pointers to operations structures: :c:type:`struct inode_operations` and :c:type:`struct file_operations`; ``i_mapping->a_ops`` contains a pointer to :c:type:`struct address_space_operations`. + * ``i_count``: the inode counter indicating how many kernel components use it. 
+ +Some functions that can be used to work with inodes: + + * :c:func:`new_inode`: creates a new inode, sets the ``i_nlink`` field to 1 and initializes ``i_blkbits``, ``i_sb`` and ``i_dev``; + * :c:func:`insert_inode_hash`: adds the inode to the hash table of inodes; an interesting effect of this call is that the inode will be written to the disk if it is marked as dirty; + + .. warning:: + + An inode created with :c:func:`new_inode` is not in the hash table, and unless you have serious reasons not to, you must enter it in the hash table; + + * :c:func:`mark_inode_dirty`: marks the inode as dirty; at a later moment, it will be written to the disk; + * :c:func:`iget_locked`: loads the inode with the given number from the disk, if it is not already loaded; + * :c:func:`unlock_new_inode`: used in conjunction with :c:func:`iget_locked`, releases the lock on the inode; + * :c:func:`iput`: tells the kernel that the work on the inode is finished; if no one else uses it, it will be destroyed (after being written to the disk if it is marked as dirty); + * :c:func:`make_bad_inode`: tells the kernel that the inode cannot be used; it is generally used from the function that reads the inode when the inode could not be read from the disk, being invalid. + +Inode operations +---------------- + +Getting an inode +^^^^^^^^^^^^^^^^ + +One of the main inode operations is obtaining an inode (the :c:type:`struct inode` in VFS). +Until version ``2.6.24`` of the Linux kernel, the developer defined a ``read_inode`` function. +Starting with version ``2.6.25``, the developer must define a ``<fsname>_iget`` function, where ``<fsname>`` is the name of the file system. +This function is responsible for finding the VFS inode if it exists or creating a new one and filling it with the information from the disk. + +Generally, this function will call :c:func:`iget_locked` to get the inode structure from VFS. 
If the inode is newly created then it will need to read the inode from the disk (using :c:func:`sb_bread`) and fill in the useful information. + +An example of such a function is :c:func:`minix_iget`: + +.. code-block:: c + + static struct inode *V1_minix_iget(struct inode *inode) + { + struct buffer_head * bh; + struct minix_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) { + iget_failed(inode); + return ERR_PTR(-EIO); + ... + } + + struct inode *minix_iget(struct super_block *sb, unsigned long ino) + { + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + if (INODE_VERSION(inode) == MINIX_V1) + return V1_minix_iget(inode); + ... + } + +The :c:func:`minix_iget` function gets the VFS inode using :c:func:`iget_locked`. +If the inode already exists (not new == the ``I_NEW`` flag is not set) the function returns. +Otherwise, the function calls the :c:func:`V1_minix_iget` function that will read the inode from the disk using :c:func:`minix_V1_raw_inode` and then complete the VFS inode with the read information. + +Superoperations +^^^^^^^^^^^^^^^ + +Many of the superoperations (components of the :c:type:`struct super_operations` structure used by the superblock) are used when working with inodes. These operations are described next: + + * ``alloc_inode``: allocates an inode. + Usually, this function allocates a :c:type:`struct <fsname>_inode_info` structure and performs basic VFS inode initialization (using :c:func:`inode_init_once`); + minix uses for allocation the :c:func:`kmem_cache_alloc` function that interacts with the SLAB subsystem. + For each allocation, the cache constructor is called, which in the case of minix is the :c:func:`init_once` function. 
+ Alternatively, :c:func:`kmalloc` can be used, in which case the :c:func:`inode_init_once` function should be called. + The :c:func:`alloc_inode` function will be called by the :c:func:`new_inode` and :c:func:`iget_locked` functions. + * ``write_inode``: saves/updates the inode received as a parameter on disk; to update the inode, though inefficient, for beginners it is recommended to use the following sequence of operations: + + * load the inode from the disk using the :c:func:`sb_bread` function; + * modify the buffer according to the saved inode; + * mark the buffer as dirty using :c:func:`mark_buffer_dirty`; the kernel will then handle its writing on the disk; + * an example is the :c:func:`minix_write_inode` function in the ``minix`` file system. + + * ``evict_inode``: removes any information about the inode with the number received in the ``i_ino`` field from the disk and memory (both the inode on the disk and the associated data blocks). This involves performing the following operations: + + * delete the inode from the disk; + * update the disk bitmaps (if any); + * delete the inode from the page cache by calling :c:func:`truncate_inode_pages`; + * delete the inode from memory by calling :c:func:`clear_inode`; + * an example is the :c:func:`minix_evict_inode` function from the minix file system. + + * ``destroy_inode``: releases the memory occupied by the inode. + +inode_operations +^^^^^^^^^^^^^^^^ + +The inode operations are described by the :c:type:`struct inode_operations` structure. + +Inodes are of several types: file, directory, special file (pipe, fifo), block device, character device, link etc. +For this reason, the operations that an inode needs to implement are different for each type of inode. +Below are detailed operations for a :ref:`file type inode <FileInodes>` and a :ref:`directory inode <DirectoryInodes>`. + +The operations of an inode are initialized and accessed using the ``i_op`` field of the structure :c:type:`struct inode`. 
+ +The file structure +================== + +The ``file`` structure corresponds to a file open by a process and exists only in memory, being associated with an inode. +It is the closest VFS entity to user-space; the structure fields contain familiar information of a user-space file (access mode, file position, etc.) and the operations with it are performed by known system calls (``read``, ``write`` , etc.). + +The file operations are described by the :c:type:`struct file_operations` structure. + +The file operations for a file system are initialized using the ``i_fop`` field of the :c:type:`struct inode` structure. +When opening a file, the VFS initializes the ``f_op`` field of the :c:type:`struct file` structure with address of ``inode->i_fop``, such that subsequent system calls use the value stored in the ``file->f_op``. + +.. _FileInodes: + +Regular files inodes +==================== + +To work with the inode, the ``i_op`` and ``i_fop`` fields of the inode structure must be filled in. +The type of the inode determines the operations that it needs to implement. + +.. _FileOperations: + +Regular files inode operations +------------------------------ + +In the ``minix`` file system, the ``minix_file_inode_operations`` structure is defined for the operations on an inode and for the file operations the ``minix_file_operations`` structure is defined: + +.. code-block:: c + + const struct file_operations minix_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + //... + .write_iter = generic_file_write_iter, + //... + .mmap = generic_file_mmap, + //... + }; + + const struct inode_operations minix_file_inode_operations = { + .setattr = minix_setattr, + .getattr = minix_getattr, + }; + + //... + if (S_ISREG(inode->i_mode)) { + inode->i_op = &minix_file_inode_operations; + inode->i_fop = &minix_file_operations; + } + //... 
+ + + +The functions :c:func:`generic_file_llseek`, :c:func:`generic_file_mmap`, :c:func:`generic_file_read_iter` and :c:func:`generic_file_write_iter` are implemented in the kernel. + +For simple file systems, only the truncation operation (``truncate`` system call) must be implemented. +Although initially there was a dedicated operation, starting with 3.14 the operation was embedded in ``setattr``: if the size passed in ``attr->ia_size`` is different from the current size of the inode, then a truncate operation must be performed. +An example of implementing this verification is in the :c:func:`minix_setattr` function: + +.. code-block:: c + + static int minix_setattr(struct dentry *dentry, struct iattr *attr) + { + struct inode *inode = d_inode(dentry); + int error; + + error = setattr_prepare(dentry, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + + truncate_setsize(inode, attr->ia_size); + minix_truncate(inode); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; + } + +The truncate operation involves: + + * freeing blocks of data on the disk that are now extra (if the new size is smaller than the old one) or allocating new blocks (for cases where the new size is larger); + * updating disk bit maps (if used); + * updating the inode; + * filling with zero the space that was left unused from the last block using the :c:func:`block_truncate_page` function. + +An example of the implementation of the truncate operation is the :c:func:`minix_truncate` function in the ``minix`` file system. + +.. _AddressSpaceOperations: + +Address space operations +------------------------ + +There is a close link between the address space of a process and files: the execution of the programs is done almost exclusively by mapping the file into the process address space. 
+Because this approach works very well and is quite general, it can also be used for regular system calls such as ``read`` and ``write``. + +The structure that describes the address space is :c:type:`struct address_space`, and the operations with it are described by the structure :c:type:`struct address_space_operations`. To initialize the address space operations, fill ``inode->i_mapping->a_ops`` of the file type inode. + +An example is the ``minix_aops`` structure in the minix file system: + +.. code-block:: c + + static const struct address_space_operations minix_aops = { + .readpage = minix_readpage, + .writepage = minix_writepage, + .write_begin = minix_write_begin, + .write_end = generic_write_end, + .bmap = minix_bmap + }; + + //... + if (S_ISREG(inode->i_mode)) { + inode->i_mapping->a_ops = &minix_aops; + } + //... + +The :c:func:`generic_write_end` function is already implemented. +Most of the specific functions are very easy to implement, as follows: + +.. code-block:: c + + static int minix_writepage(struct page *page, struct writeback_control *wbc) + { + return block_write_full_page(page, minix_get_block, wbc); + } + + static int minix_readpage(struct file *file, struct page *page) + { + return block_read_full_page(page, minix_get_block); + } + + static void minix_write_failed(struct address_space *mapping, loff_t to) + { + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + minix_truncate(inode); + } + } + + static int minix_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + { + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + minix_get_block); + if (unlikely(ret)) + minix_write_failed(mapping, pos + len); + + return ret; + } + + static sector_t minix_bmap(struct address_space *mapping, sector_t block) + { + return generic_block_bmap(mapping, block, minix_get_block); + } + 
+All that needs to be done is to implement :c:func:`minix_get_block`, which has to translate a block of a file into a block on the device. +If the flag ``create`` received as a parameter is set, a new block must be allocated. +In case a new block is created, the bit map must be updated accordingly. +To notify the kernel not to read the block from the disk, ``bh`` must be marked with :c:func:`set_buffer_new`. The buffer must be associated with the block through :c:func:`map_bh`. + +Dentry structure +================ + +Directory operations use the :c:type:`struct dentry` structure. +Its main task is to make links between inodes and filenames. +The important fields of this structure are presented below: + +.. code-block:: c + + struct dentry { + //... + struct inode *d_inode; /* associated inode */ + //... + struct dentry *d_parent; /* dentry object of parent */ + struct qstr d_name; /* dentry name */ + //... + + struct dentry_operations *d_op; /* dentry operations table */ + struct super_block *d_sb; /* superblock of file */ + void *d_fsdata; /* filesystem-specific data */ + //... + }; + +The meaning of the fields: + + * ``d_inode``: the inode referenced by this dentry; + * ``d_parent``: the dentry associated with the parent directory; + * ``d_name``: a :c:type:`struct qstr` structure that contains the fields ``name`` and ``len`` (the name and the length of the name). + * ``d_op``: operations with dentries, represented by the :c:type:`struct dentry_operations` structure. + The kernel implements default operations so there is no need to (re)implement them. Some file systems can do optimizations based on the specific structure of the dentries. + * ``d_fsdata``: field reserved for the file system that implements dentry operations. + +Dentry operations +----------------- + +The operations most commonly applied to dentries are: + + * ``d_make_root``: allocates the root dentry. 
It is generally used in the function that is called to read the superblock (``fill_super``), which must initialize the root directory. + So the root inode is obtained from the superblock and is used as an argument to this function, to fill the ``s_root`` field from the :c:type:`struct super_block` structure. + * ``d_add``: associates a dentry with an inode; the dentry received as a parameter in the calls discussed above signifies the entry (name, length) that needs to be created. This function will be used when creating/loading a new inode that does not have a dentry associated with it and has not yet been introduced to the hash table of inodes (at ``lookup``); + * ``d_instantiate``: The lighter version of the previous call, in which the dentry was previously added in the hash table. + +.. warning:: + + ``d_instantiate`` must be used to implement create calls (``mkdir``, ``mknod``, ``rename``, ``symlink``) and NOT ``d_add``. + +.. _DirectoryInodes: + +Directory inodes operations +=========================== + +The operations for directory type inodes have a higher complexity level than the ones for files. +The developer must define operations for inodes and operations for files. +In ``minix``, these operations are defined in :c:type:`minix_dir_inode_operations` and :c:type:`minix_dir_operations`: + +.. code-block:: c + + struct inode_operations minix_dir_inode_operations = { + .create = minix_create, + .lookup = minix_lookup, + .link = minix_link, + .unlink = minix_unlink, + .symlink = minix_symlink, + .mkdir = minix_mkdir, + .rmdir = minix_rmdir, + .mknod = minix_mknod, + //... + }; + + struct file_operations minix_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = minix_readdir, + //... + }; + + //... + if (S_ISDIR(inode->i_mode)) { + inode->i_op = &minix_dir_inode_operations; + inode->i_fop = &minix_dir_operations; + inode->i_mapping->a_ops = &minix_aops; + } + //... 
+ +The only function already implemented is :c:func:`generic_read_dir`. + +The functions that implement the operations on directory inodes are the ones described below. + +Creating an inode +----------------- + +The inode creation function is indicated by the field ``create`` in the ``inode_operations`` structure. +In the minix case, the function is :c:func:`minix_create`. +This function is called by the ``open`` and ``creat`` system calls. Such a function performs the following operations: + + #. Introduces a new entry into the physical structure on the disk; the update of the bit maps on the disk must not be forgotten. + #. Configures access rights to those received as a parameter. + #. Marks the inode as dirty with the :c:func:`mark_inode_dirty` function. + #. Instantiates the directory entry (``dentry``) with the ``d_instantiate`` function. + +Creating a directory +-------------------- + +The directory creation function is indicated by the ``mkdir`` field in the ``inode_operations`` structure. +In the minix case, the function is :c:func:`minix_mkdir`. +This function is called by the ``mkdir`` system call. Such a function performs the following operations: + + #. Calls :c:func:`minix_create`. + #. Allocates a data block for the directory. + #. Creates the ``"."`` and ``".."`` entries. + +Creating a link +--------------- + +The link creation function (hard link) is indicated by the ``link`` field in the ``inode_operations`` structure. +In the minix case, the function is :c:func:`minix_link`. +This function is called by the ``link`` system call. Such a function performs the following operations: + + * Binds the new dentry to the inode. + * Increments the ``i_nlink`` field of the inode. + * Marks the inode as dirty using the :c:func:`mark_inode_dirty` function. + +Creating a symbolic link +------------------------ + +The symbolic link creation function is indicated by the ``symlink`` field in the ``inode_operations`` structure. 
+In the minix case, the function is :c:func:`minix_symlink`. +The operations to be performed are similar to ``minix_link`` with the differences being given by the fact that a symbolic link is created. + +Deleting a link +--------------- + +The link delete function (hard link) is indicated by the ``unlink`` field in the ``inode_operations`` structure. +In the minix case, the function is :c:func:`minix_unlink`. +This function is called by the ``unlink`` system call. Such a function performs the following operations: + + #. Deletes the directory entry given as a parameter from the physical disk structure. + #. Decrements the ``i_nlink`` counter of the inode to which the entry points (otherwise the inode will never be deleted). + +Deleting a directory +-------------------- + +The directory delete function is indicated by the ``rmdir`` field in the ``inode_operations`` structure. +In the minix case, the function is :c:func:`minix_rmdir`. +This function is called by the ``rmdir`` system call. +Such a function performs the following operations: + + #. Performs the operations done by ``minix_unlink``. + #. Ensures that the directory is empty; otherwise, returns ``ENOTEMPTY``. + #. Also deletes the data blocks. + +Searching for an inode in a directory +------------------------------------- + +The function that searches for an entry in a directory and extracts the inode is indicated by the ``lookup`` field in the ``inode_operations`` structure. +In the minix case, the function is ``minix_lookup``. +This function is called indirectly when information about the inode associated with an entry in a directory is needed. +Such a function performs the following operations: + + #. Searches in the directory indicated by ``dir`` the entry having the name ``dentry->d_name.name``. + #. If the entry is found, it will return ``NULL`` and associate the inode with the name using the :c:func:`d_add` function. + #. Otherwise, returns ``ERR_PTR``. 
+ +Iterating through entries in a directory +---------------------------------------- + +The function which iterates through the entries in a directory (lists the directory contents) is indicated by the field ``iterate`` in the ``struct file_operations`` structure. +In the minix case, the function is ``minix_readdir``. +This function is called by the ``readdir`` system call. + +The function returns either all entries in the directory or just a part of them when the buffer allocated for it is not large enough. +A call of this function can return: + + * a number equal to the existing number of entries if there is enough space in the corresponding user space buffer; + * a number smaller than the actual number of entries, as much as there was space in the corresponding user space buffer; + * ``0``, where there are no more entries to read. + +The function will be called consecutively until all available entries are read. The function is called at least twice. + + * It is only called twice if: + + * the first call reads all entries and returns their number; + * the second call returns 0, having no other entries to read. + + * It is called more than twice if the first call does not return the total number of entries. + +The function performs the following operations: + + #. Iterates over the entries (the dentries) from the current directory. + #. For each dentry found, increments ``ctx->pos``. + #. For each valid dentry (an inode other than ``0``, for example), calls the :c:func:`dir_emit` function. + #. If the :c:func:`dir_emit` function returns zero (``false``), it means that the buffer in the user space is full and the function returns.
+ +The arguments of the ``dir_emit`` function are: + + * ``ctx`` is the directory iteration context, passed as an argument to the ``iterate`` function; + * ``name`` is the name of the entry (a string of characters); + * ``name_len`` is the length of the entry name; + * ``ino`` is the inode number associated with the entry; + * ``type`` identifies the entry type: ``DT_REG`` (file), ``DT_DIR`` (directory), ``DT_UNKNOWN`` etc. ``DT_UNKNOWN`` can be used when the entry type is unknown. + +.. _BitmapOperations: + +Bitmap operations +================= + +When working with the file systems, management information (what block is free or busy, what inode is free or busy) is stored using bitmaps. +For this we often need to use bit operations. Such operations are: + + * searching the first 0 bit: representing a free block or inode + * marking a bit as 1: marking a busy block or inode + +The bitmap operations are found in headers from ``include/asm-generic/bitops``, especially in ``find.h`` and ``atomic.h``. Usual functions, with names indicating their role, are: + + * :c:func:`find_first_zero_bit` + * :c:func:`find_first_bit` + * :c:func:`set_bit` + * :c:func:`clear_bit` + * :c:func:`test_and_set_bit` + * :c:func:`test_and_clear_bit` + +These functions usually receive the address of the bitmap, possibly its size (in bits) and, if necessary, the index of the bit that needs to be activated (set) or deactivated (clear). + +Some usage examples are listed below: + +.. code-block:: c + + unsigned int map; + unsigned char array_map[NUM_BYTES]; + size_t idx; + int changed; + + /* Find first zero bit in 32 bit integer. */ + idx = find_first_zero_bit(&map, 32); + printk (KERN_ALERT "The %zu-th bit is the first zero bit.\n", idx); + + /* Find first one bit in NUM_BYTES bytes array. */ + idx = find_first_bit(array_map, NUM_BYTES * 8); + printk (KERN_ALERT "The %zu-th bit is the first one bit.\n", idx); + + /* + * Clear the idx-th bit in integer.
+ * It is assumed idx is less than the number of bits in integer. + */ + clear_bit(idx, &map); + + /* + * Test and set the idx-th bit in array. + * It is assumed idx is less than the number of bits in array. + */ + changed = __test_and_set_bit(idx, &sbi->imap); + if (changed) + printk(KERN_ALERT "%zu-th bit changed\n", idx); + +Further reading +=============== + +#. Robert Love -- Linux Kernel Development, Second Edition -- Chapter + 12. The Virtual Filesystem +#. Understanding the Linux Kernel, 3rd edition - Chapter 12. The Virtual + Filesystem +#. `Linux Virtual File System (presentation)`_ +#. `Understanding Unix/Linux Filesystem`_ +#. `Creating Linux virtual filesystems`_ +#. `The Linux Documentation Project - VFS`_ +#. `The "Virtual File System" in Linux`_ +#. `A Linux Filesystem Tutorial`_ +#. `The Linux Virtual File System`_ +#. `Documentation/filesystems/vfs.txt`_ +#. `File systems sources`_ + +.. _Linux Virtual File System (presentation): http://www.coda.cs.cmu.edu/doc/talks/linuxvfs/ +.. _Understanding Unix/Linux Filesystem: http://www.cyberciti.biz/tips/understanding-unixlinux-file-system-part-i.html +.. _Creating Linux virtual filesystems: http://lwn.net/Articles/57369/ +.. _The Linux Documentation Project - VFS: http://www.tldp.org/LDP/tlk/fs/filesystem.html +.. _The "Virtual File System" in Linux: http://www.linux.it/~rubini/docs/vfs/vfs.html +.. _A Linux Filesystem Tutorial: http://inglorion.net/documents/tutorials/tutorfs/ +.. _The Linux Virtual File System: http://www.win.tue.nl/~aeb/linux/lk/lk-8.html +.. _Documentation/filesystems/vfs.txt: http://lxr.free-electrons.com/source/Documentation/filesystems/vfs.txt +.. _File systems sources: http://lxr.free-electrons.com/source/fs/ + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: filesystems + +.. important:: + + In this lab, we will continue the implementation of the file systems started in the previous one.
+ For this, we will generate the laboratory skeleton using the following command: + + .. code-block:: console + + TODO=5 LABS=filesystems make skels + + After this, we will start the implementation from ``TODO 5``. + +myfs +---- + +For the exercises below, we will use the ``myfs`` file system whose implementation we started with the previous lab. +We stopped after mounting the file system and now we will continue with the operations for regular files and directories. +At the end of these exercises, we will be able to create, modify and delete regular directories and files. + +We will mostly use the ``inode`` and ``dentry`` VFS structures. +The ``inode`` structure defines a file (of any type: regular, directory, link), while the ``dentry`` structure defines a name, which is an entry in a directory. + +For this we will access the ``myfs`` directory in the lab skeleton. +The previously generated skeleton contains the solution for the previous lab; we will start from this. As in the previous lab, we will use the ``ramfs`` file system as a starting point. + +1. Directory operations +^^^^^^^^^^^^^^^^^^^^^^^ + +To begin with, we will implement the operations for working with directories. +The operations of creating a file or deleting a file are also directory operations; these operations result in adding or deleting a directory entry (*dentry*). + +At the end of this exercise we will be able to create and delete entries in the file system. We will not be able to read and write to regular files; we will do so in the next exercise. + +Follow directions marked with ``TODO 5`` which will guide you through the steps you need to take. 
+ +You will need to specify the following directory operations: + + * create a file (``create`` function) + * search (``lookup`` function) + * link (``link`` function) + * create directory (``mkdir`` function) + * deletion (``rmdir`` and ``unlink`` functions) + * create node (``mknod``) + * rename (``rename`` function) + +For this, define the ``myfs_dir_inode_operations`` structure in the code, where marked with ``TODO 5``. +To begin, just define the structure ``myfs_dir_inode_operations``; you will define the structures ``myfs_file_operations``, ``myfs_file_inode_operations`` , and ``myfs_aops`` in the next exercise. + +.. tip:: + + Read the section :ref:`DirectoryInodes` + + As a model, you are following the ``ramfs_dir_inode_operations`` structure. + +Implement the ``mkdir``, ``mknod`` and ``create`` operations inside ``myfs_mkdir``, ``myfs_mknod`` and ``myfs_create``. +These operations will allow you to create directories and files in the file system. + +.. tip:: + + We recommend making the code modular using a ``mknod`` function, which you can also use for the next exercise. + For inode reading and allocation, use ``myfs_get_inode``, which is already implemented. + + As a model, follow the next functions implemented in the ``ramfs`` file system: + + * :c:func:`ramfs_mknod` + * :c:func:`ramfs_mkdir` + * :c:func:`ramfs_create` + +For the other functions, use generic calls (``simple_*``) already defined in VFS. + +In the ``myfs_get_inode`` function, initialize the operations fields of the directory inodes: + + * ``i_op`` must be initialized to the address of the structure ``myfs_dir_inode_operations``; + * ``i_fop`` must be initialized to the address of the structure ``simple_dir_operations``, defined in VFS. + +.. note:: + + ``i_op`` is a pointer to a structure of type :c:type:`struct inode_operations` containing operations that have to do with the inode, which are, for a directory, creating a new entry, listing entries, deleting entries, etc. 
+ + ``i_fop`` is a pointer to a structure of type :c:type:`struct file_operations` containing operations that have to do with the ``file`` structure associated with the inode, such as ``read``, ``write``, and ``lseek``. + +Testing +""""""" + +Once the module is done, we can test the creation of files and directories. +To do this, we compile the kernel module (using ``make build``) and copy the resulting file (``myfs.ko``) and the test scripts (``test-myfs-{1,2}.sh``) in the virtual machine directory (using ``make copy``). + +.. note:: + + The test scripts are copied to the virtual machine using ``make copy`` only if they are executable: + + .. code-block:: console + + student@workstation:~/linux/tools/labs$ chmod +x skels/filesystems/myfs/test-myfs-*.sh + +After starting the virtual machine, insert the module, create the mount point and mount the file system: + +.. code-block:: console + + # insmod myfs.ko + # mkdir -p /mnt/myfs + # mount -t myfs none /mnt/myfs + +Now we can create file hierarchies and subdirectories in the mounted directory (``/mnt/myfs``). +We use commands like the ones below: + +.. code-block:: console + + # touch /mnt/myfs/peanuts.txt + # mkdir -p /mnt/myfs/mountain/forest + # touch /mnt/myfs/mountain/forest/tree.txt + # rm /mnt/myfs/mountain/forest/tree.txt + # rmdir /mnt/myfs/mountain/forest + +At this time we can not read or write files. When running commands such as the following ones we will get errors. + +.. code-block:: console + + # echo "chocolate" > /mnt/myfs/peanuts.txt + # cat /mnt/myfs/peanuts.txt + +This happens because we have not implemented the operations for working with files; we will do so further. + +To unload the kernel module, use the command + +.. code-block:: console + + umount /mnt/myfs + rmmod myfs + +To test the functionality provided by the kernel module, we can use the dedicated script ``test-myfs-1.sh``. +If the implementation is correct, no error messages will be displayed. + +2. 
File operations +^^^^^^^^^^^^^^^^^^ + +We want to implement the operations for working with files, which are used for accessing a file's content: read, write, truncate, etc. +For this you will specify the operations described in the structures :c:type:`struct inode_operations`, :c:type:`struct file_operations` and :c:type:`struct address_space_operations`. + +Follow the locations marked with ``TODO 6`` which will guide you through the steps you need to take. + +Start by defining ``myfs_file_inode_operations`` and ``myfs_file_operations``. + +.. tip:: + + Read the section :ref:`FileOperations`. + + Use the generic functions provided by VFS. + + An example of implementation is the ``ramfs`` file system. + Follow the implementation of ``ramfs_file_inode_operations`` and ``ramfs_file_operations``. + +Inside the function ``myfs_get_inode``, initialize the operations fields for the regular file inodes: + + * ``i_op`` must be initialized to ``myfs_file_inode_operations``; + * ``i_fop`` must be initialized to ``myfs_file_operations``. + +Continue with defining the structure ``myfs_aops``. + +.. tip:: + + Read the section :ref:`AddressSpaceOperations`. + + Use the generic functions provided by VFS. + + An implementation example is the ``ramfs`` file system: the ``ramfs_aops`` structure. + + You do not need to define the function of type ``set_page_dirty``. + +Initialize the ``i_mapping->a_ops`` field of the inode structure to ``myfs_aops``. + +Testing +""""""" + +For testing, we use the steps described in the previous exercise. +In addition to those steps, we will now be able to read, write and modify a file using commands like the ones below: + +.. code-block:: console + + # echo "chocolate" > /mnt/myfs/peanuts.txt + # cat /mnt/myfs/peanuts.txt + +To test the functionality provided by the module, we can use the dedicated script: + +..
code-block:: console + + # ./test-myfs-2.sh + +If the implementation is correct, no error messages will be displayed when running the above script. + +minfs +----- + +For the exercises below, we will use the minfs file system whose development started in the previous lab. +This is a file system with disk support. +We stopped after mounting the file system and now we will continue with the operations on regular files and directories. +At the end of these exercises we will be able to create and delete entries in the file system. + +We will mainly use the :c:type:`inode` and :c:type:`dentry` VFS structures. +The inode structure defines a file (of any type: regular, directory, link), while the dentry structure defines a name, which is a directory entry. + +For this we will access the ``minfs/kernel`` directory from the laboratory skeleton. +The generated skeleton contains the solution from the previous lab; we will start from this. +As in the previous lab, we will use the ``minix`` file system as a starting point. + +We will use the formatting tool ``mkfs.minfs`` in the ``minfs/user`` directory which is automatically compiled when running ``make build`` and copied to the virtual machine at ``make copy``. + +The formatting tool prepares a virtual machine disk using a command like + +.. code-block:: console + + # ./mkfs.minfs /dev/vdb + +After formatting, the disk has a structure like the one in the diagram below: + +.. image:: ../res/minfs_arch.png + +As shown in the diagram, ``minfs`` is a minimalist file system. +``minfs`` contains a maximum of 32 inodes, each inode having a single data block (the file size is limited to block size). +The super block contains a 32-bit map (``imap``), each bit indicating the use of an inode. + +.. note:: + + Before you start working, go through the ``minfs/kernel/minfs.h`` header file. + This file contains the structures and macros that will be used in these exercises. 
+ These structures and macros define the file system as described in the diagram above. + +1. Iterate operation +^^^^^^^^^^^^^^^^^^^^ + +At first we want to be able to list the contents of the root directory. +For this we must be able to read the entries in the root directory, which means implementing the ``iterate`` operation. +The ``iterate`` operation is a field within the ``minfs_dir_operations`` structure (of type ``file_operations``) and is implemented by the function ``minfs_readdir``. We need to implement this function. + +Follow directions marked with ``TODO 5`` which will guide you through the steps you need to take. + +.. tip:: + + Read the section :ref:`DirectoryInodes` + + As a starting point, follow the :c:func:`minix_readdir` function. + The function is rather complicated, but it gives you an insight into the steps you have to take. + + Follow, in ``minfs.c`` and ``minfs.h``, the definitions of structures ``struct minfs_inode_info``, ``struct minfs_inode`` and ``struct minfs_dir_entry``. + You will use them in the ``minfs_readdir`` implementation. + +Obtain the inode and the structure ``struct minfs_inode_info`` associated with the directory. +The structure ``struct minfs_inode_info`` is useful to find out the directory's data block. +From this structure you get the ``data_block`` field, representing the data block index on the disk. + +.. tip:: + + To get the ``struct minfs_inode_info`` structure, use :c:func:`list_entry` or :c:func:`container_of`. + +Use :c:func:`sb_bread` to read the directory data block. + +.. tip:: + + The data block of the directory is indicated by the ``data_block`` field of the structure ``struct minfs_inode_info`` corresponding to the directory. + + The data in the block is referenced by the ``b_data`` field of the ``buffer_head`` structure (the usual code will be ``bh->b_data``).
+ This block (being the data block of a directory) contains an array of at most ``MINFS_NUM_ENTRIES`` entries of type ``struct minfs_dir_entry`` (directory entries specific to ``minfs``). + Use casting to ``struct minfs_dir_entry *`` to work with the data in the block. + +Iterate over all the entries in the data block and fill the user space buffer inside the ``for`` loop. + +.. tip:: + + For each index, get the corresponding entry of type ``struct minfs_dir_entry`` by using pointer arithmetic on the ``bh->b_data`` field. + Ignore dentries that have an ``ino`` field equal to 0. Such a dentry is a free slot in the directory's dentry list. + + For each valid entry, there is an existing call :c:func:`dir_emit` with the appropriate parameters. This is the call that sends the dentries to the caller (and then to user space). + + Check the call examples in :c:func:`qnx6_readdir` and :c:func:`minix_readdir`. + +Testing +""""""" + +Once the module is done, we can test the listing of the root directory contents. +To do this, we compile the kernel module (``make build``) and copy the result to the virtual machine together with the test scripts (``minfs/user/test-minfs-{0,1}.sh``) and the formatting utility (``minfs/user/mkfs.minfs``) using ``make copy``, then start the machine. + +.. note:: + + The test scripts are copied to the virtual machine only if they are executable: + + .. code-block:: console + + student@eg106:~/src/linux/tools/labs$ chmod +x skels/filesystems/minfs/user/test-minfs*.sh + +After we start the virtual machine, we format the ``/dev/vdb`` disk, create the mount point and mount the file system: + +.. code-block:: console + + # ./mkfs.minfs /dev/vdb + # mkdir -p /mnt/minfs + # mount -t minfs /dev/vdb /mnt/minfs + +Now we can list the contents of the root directory: + +.. code-block:: console + + # ls -l /mnt/minfs + +We notice that there is already a file (``a.txt``); it is created by the formatting utility.
+ +We also notice that we are not allowed to display information for a file using the ``ls`` command. +This is because we have not implemented the ``lookup`` function. We will implement it in the next exercise. + +To test the functionality provided by the module, we can use the dedicated scripts: + +.. code-block:: console + + # ./test-minfs-0.sh + # ./test-minfs-1.sh + +2. Lookup operation +^^^^^^^^^^^^^^^^^^^ + +To properly list the contents of a directory, we need to implement the search functionality, i.e. the ``lookup`` operation. +The ``lookup`` operation is a field within the ``minfs_dir_inode_operations`` structure (of type ``inode_operations``) and is implemented by the ``minfs_lookup`` function. +This function (``minfs_lookup``) needs to be implemented. +We will actually implement the ``minfs_find_entry`` function called by ``minfs_lookup``. + +Follow directions marked with ``TODO 6`` which will tell you the steps you need to take. + +.. tip:: + + Read the section :ref:`DirectoryInodes` + + As a starting point, read the functions :c:func:`qnx6_find_entry` and :c:func:`minix_find_entry`. + +In the ``minfs_find_entry`` function, iterate over the directory where the dentry is: ``dentry->d_parent->d_inode``. +Iterating means going through the entries in the directory's data block (of type ``struct minfs_dir_entry``) and locating, if it exists, the requested entry. + +.. tip:: + + From the structure of type ``struct minfs_inode_info`` corresponding to the directory, find out the data block index and read it (``sb_bread``). + You will access the block contents using ``bh->b_data``. + The directory data block contains an array of at most ``MINFS_NUM_ENTRIES`` entries of type ``struct minfs_dir_entry``. + Use pointer arithmetic to get entries of type ``struct minfs_dir_entry`` from the data block (``bh->b_data``).
+ + Check the presence of the name (stored in the local variable ``name``) in the directory (if there is an entry in the data block whose name is a string equal to the given name). Use :c:func:`strcmp` to verify. + + Ignore dentries that have an ``ino`` field equal to ``0``. Those dentries are free slots in the directory dentry list. + + Store in the ``final_de`` variable the dentry found. + If you do not find any dentry, then the ``final_de`` variable will have the value ``NULL``, the value with which it was initialized. + +Comment the ``simple_lookup`` call in the ``minfs_lookup`` function to invoke the implementation of ``minfs_readdir``. + +Testing +""""""" + +For testing, we use the steps described in the previous exercise. +The long file listing (``ls -l``) of the contents of a directory (root directory) will display permissions and other file-specific information: + +.. code-block:: console + + # ls -l /mnt/minfs + +To test the functionality provided by the module, we can use the dedicated scripts: + +.. code-block:: console + + # ./test-minfs-0.sh + # ./test-minfs-1.sh + +If the implementation is correct, no error messages will be displayed when running the scripts above. + +.. note:: + + After mounting the file system using the command + + .. code-block:: console + + # mount -t minfs /dev/vdb /mnt/minfs + + we try to create a file using the command + + .. code-block:: console + + # touch /mnt/minfs/peanuts.txt + + We notice that we get an error because we did not implement the directory operations that allow us to create a file. + We will do this for the next exercise. + +3. Create operation +^^^^^^^^^^^^^^^^^^^ + +In order to allow the creation of a file in a directory, we must implement the ``create`` operation. +The ``create`` operation is a field in the ``minfs_dir_inode_operations`` structure (of type :c:type:`inode_operations`) and is implemented by the ``minfs_create`` function. We need to implement this function. 
+In fact, we will implement the ``minfs_new_inode`` function (which creates and initializes an inode) and ``minfs_add_link``, which adds a link (or name or *dentry*) for the created inode. + +Follow directions marked with ``TODO 7`` which will guide you through the steps you need to take. + +.. tip:: + + Read the section :ref:`DirectoryInodes` + + Inspect the code in ``minfs_create`` and the skeleton of functions ``minfs_new_inode`` and ``minfs_add_link``. + +Implement the function ``minfs_new_inode``. Inside this function you will create (using :c:func:`new_inode`) and initialize an inode. The initialization is done using the data from disk. + +.. tip:: + + Use the :c:func:`minix_new_inode` function as a model. + Find the first free inode in imap (``sbi->imap``). + Use bitwise operations (``find_first_zero_bit`` and ``set_bit``). + Read the :ref:`BitmapOperations` section. + + The buffer for the superblock (``sbi->sbh``) must be marked as dirty. + + You must initialize the usual fields as it is done for the ``myfs`` file system. + Initialize the ``i_mode`` field to ``0`` in the call to ``inode_init_owner``. It will be initialized in the caller later. + +Implement the ``minfs_add_link`` function. The function adds a new dentry (``struct minfs_dir_entry``) to the parent directory data block (``dentry->d_parent->d_inode``). + +.. tip:: + + Use the ``minix_add_link`` function as a model. + +In ``minfs_add_link`` we want to find the first free place for the dentry. +For this, you will iterate over the directory data block and you will find the first free entry. A free dentry has the ``ino`` field equal to ``0``. + +.. tip:: + + In order to work with the directory, get the inode of type ``struct minfs_inode_info`` corresponding to the parent directory (the **dir** inode). + Do not use the variable ``inode`` to get ``struct minfs_inode_info``; that inode belongs to the file, not to the parent directory inside which you want to add the link/dentry.
+ To get the ``struct minfs_inode_info`` structure, use :c:func:`container_of`. + + The structure ``struct minfs_inode_info`` is useful for finding the directory data block (the one indicated by the ``dentry->d_parent->d_inode``, which is the ``dir`` variable). + From this structure, get the ``data_block`` field, representing the index of the data block on the disk. + This block contains the entries in the directory. Use :c:func:`sb_bread` to read the block and then ``bh->b_data`` to refer to the data. + The block contains at most ``MINFS_NUM_ENTRIES`` entries of type ``struct minfs_dir_entry``. + + If all entries are occupied, return ``-ENOSPC``. + + Iterate over the entries in the data block using the variable ``de`` and extract the first free entry (for which the ``ino`` field is ``0``). + + When you have found a free place, fill in the corresponding entry: + + * the ``inode->i_ino`` field in ``de->ino`` + * the ``dentry->d_name.name`` field in ``de->name`` + + Then mark the buffer dirty. + + +Testing +""""""" + +For testing, we use the steps described in the previous exercise. +Now we can create files within the file system: + +.. code-block:: console + + # touch /mnt/minfs/peanuts.txt + +To test the functionality provided by the module, we can use the dedicated script: + +.. code-block:: console + + # ./test-minfs-2.sh + +If the implementation is correct, no error messages will be displayed when running the above script. + +.. note:: + + The current implementation of the ``minfs`` file system is not definitive. + To be complete, the implementation needs functions to delete files, create and delete directories, rename entries, and modify the contents of a file.
+ diff --git a/refs/pull/405/merge/_sources/labs/infrastructure.rst.txt b/refs/pull/405/merge/_sources/labs/infrastructure.rst.txt new file mode 100644 index 00000000..b3b4873c --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/infrastructure.rst.txt @@ -0,0 +1,82 @@ +Infrastructure +============== + +In order to facilitate learning each topic has a hands-on exercises +section which will contain in-depth, incremental clues on how to solve +one or multiple tasks. To focus on a particular issue most of the +tasks will be performed on existing skeleton drivers. Each skeleton +driver has clearly marked sections that need to be filled in order to +complete the tasks. + +The skeleton drivers are generated from full source examples located +in tools/labs/templates. To solve tasks you start by generating the +skeleton drivers, running the **skels** target in *tools/labs*. To +keep the workspace clean it is recommended to generate the skeletons +for one lab only and clean the workspace before starting to work on a new +lab. Labs can be selected by using the **LABS** variable: + +.. code-block:: shell + + tools/labs $ make clean + tools/labs $ LABS=kernel_modules make skels + + tools/labs $ ls skels/kernel_modules/ + 1-2-test-mod 3-error-mod 4-multi-mod 5-oops-mod 6-cmd-mod \ + 7-list-proc 8-kprobes 9-kdb + +You can also use the same variable to generate skeletons for specific +tasks: + +.. code-block:: shell + + tools/labs $ LABS="kernel_modules/6-cmd-mod kernel_modules/8-kprobes" make skels + + tools/labs$ ls skels/kernel_modules + 6-cmd-mod 8-kprobes + + +For each task you may have multiple steps to perform, usually +incremental. These steps are marked in the source code as well as in +the lab exercises with the keyword *TODO*. If we have multiple steps +to perform they will be prefixed by a number, like *TODO1*, *TODO2*, +etc. If no number is used it is assumed to be the one and only +step.
If you want to resume a task from a certain step, you can use +the **TODO** variable. The following example will generate the +skeleton with the first *TODO* step resolved: + +.. code-block:: shell + + tools/labs $ TODO=2 LABS="kernel_modules/8-kprobes" make skels + +Once the skeleton drivers are generated you can build them with the +**build** make target: + +.. code-block:: shell + + tools/labs $ make build + echo "# autogenerated, do not edit " > skels/Kbuild + for i in ./kernel_modules/8-kprobes; do echo "obj-m += $i/" >> skels/Kbuild; done + make -C /home/tavi/src/linux M=/home/tavi/src/linux/tools/labs/skels ARCH=x86 modules + make[1]: Entering directory '/home/tavi/src/linux' + CC [M] /home/tavi/src/linux/tools/labs/skels/./kernel_modules/8-kprobes/kprobes.o + Building modules, stage 2. + MODPOST 1 modules + CC /home/tavi/src/linux/tools/labs/skels/./kernel_modules/8-kprobes/kprobes.mod.o + LD [M] /home/tavi/src/linux/tools/labs/skels/./kernel_modules/8-kprobes/kprobes.ko + make[1]: Leaving directory '/home/tavi/src/linux' + + +To copy the drivers to the VM you can either use ssh or update the +VM image directly using the **copy** target: + +.. code-block:: shell + + tools/labs $ make copy + ... + 'skels/kernel_modules/8-kprobes/kprobes.ko' -> '/tmp/tmp.4UMKcISmQM/home/root/skels/kernel_modules/8-kprobes/kprobes.ko' + +.. attention:: The **copy** target will fail if the VM is + running. This is intentional so that we avoid corrupting the + filesystem. 
+ + diff --git a/refs/pull/405/merge/_sources/labs/interrupts.rst.txt b/refs/pull/405/merge/_sources/labs/interrupts.rst.txt new file mode 100644 index 00000000..f5772846 --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/interrupts.rst.txt @@ -0,0 +1,1120 @@ +========================== +I/O access and Interrupts +========================== + +Lab objectives +============== + +* communication with peripheral devices +* implement interrupt handlers +* synchronizing interrupts with process context + +Keywords: IRQ, I/O port, I/O address, base address, UART, request_region, release_region, inb, outb + +Background information +====================== + +A peripheral device is controlled by writing and reading its +registers. Often, a device has multiple registers that can be accessed +at consecutive addresses either in the memory address space or in the +I/O address space. Each device connected to the I/O bus has a set of +I/O addresses, called I/O ports. I/O ports can be mapped to physical +memory addresses so that the processor can communicate with the device +through instructions that work directly with the memory. For +simplicity, we will directly use I/O ports (without mapping to physical +memory addresses) to communicate with physical devices. + +The I/O ports of each device are structured into a set of specialized +registers to provide a uniform programming interface. Thus, most +devices will have the following types of registers: + +* **Control** registers that receive device commands +* **Status** registers, which contain information about the device's + internal status +* **Input** registers from which data is taken from the device +* **Output** registers in which the data is written to transmit it to the + device + +Physical ports are differentiated by the number of bits: they can be +8, 16 or 32-bit ports. + +For example, the parallel port has 8 8-bit I/O ports starting at base +address 0x378. 
The data register is found at the base address (0x378), the status +register at base address + 1 (0x379), and the control register at base address + 2 +(0x37a). The data register is both an input and an output register. + +Although there are devices that can be fully controlled using I/O +ports or special memory areas, there are situations where this is +insufficient. The main problem that needs to be addressed is that +certain events occur at undefined moments in time and it is +inefficient for the processor (CPU) to interrogate the status of the +device repeatedly (polling). The way to solve this problem is using an +Interrupt ReQuest (IRQ) which is a hardware notification by which the +processor is notified that a particular external event happened. + +For IRQs to be useful device drivers must implement handlers, i.e. a +particular sequence of code that handles the interrupt. Because in +many situations the number of interrupts available is limited, a +device driver must behave in an orderly fashion with interrupts: +interrupts must be requested before being used and released when they +are no longer needed. In addition, in some situations, device drivers +must share an interrupt or synchronize with interrupts. All of these will be +discussed further. + +When we need to access shared resources between an interrupt +routine (A) and code running in process context or in bottom-half +context (B), we must use a special synchronization technique. In (A) +we need to use a spinlock primitive, and in (B) we must disable +interrupts AND use a spinlock primitive. Disabling interrupts is not +enough because the interrupt routine can run on a processor other than +the one running (B). + +Using only a spinlock can lead to a deadlock. The classic example of +deadlock in this case is: + +1. We run a process on the X processor, and we acquire the lock +2. Before releasing the lock, an interrupt is generated on the X processor +3. 
The interrupt handling routine will try to acquire the lock and it + will go into an infinite loop + + +Accessing the hardware +====================== + +In Linux, the I/O ports access is implemented on all architectures and +there are several APIs that can be used. + +Request access to I/O ports +--------------------------- + +Before accessing I/O ports we first must request access to them, to +make sure there is only one user. In order to do so, one must use the +:c:func:`request_region` function: + +.. code-block:: c + + #include <linux/ioport.h> + + struct resource *request_region(unsigned long first, unsigned long n, + const char *name); + +To release a reserved region one must use the :c:func:`release_region` function: + +.. code-block:: c + + void release_region(unsigned long start, unsigned long n); + + +For example, the serial port COM1 has the base address 0x3F8 and it +has 8 ports and this is a code snippet of how to request access to +these ports: + +.. code-block:: c + + #include <linux/ioport.h> + + #define MY_BASEPORT 0x3F8 + #define MY_NR_PORTS 8 + + if (!request_region(MY_BASEPORT, MY_NR_PORTS, "com1")) { + /* handle error */ + return -ENODEV; + } + +To release the ports one would use something like: + +.. code-block:: c + + release_region(MY_BASEPORT, MY_NR_PORTS); + +Most of the time, port requests are done at the driver initialization +or probe time and the port releasing is done at the removal of the +device or module. + +All of the port requests can be seen from userspace via the +:file:`/proc/ioports` file: + +.. code-block:: shell + + $ cat /proc/ioports + 0000-001f : dma1 + 0020-0021 : pic1 + 0040-005f : timer + 0060-006f : keyboard + 0070-0077 : rtc + 0080-008f : dma page reg + 00a0-00a1 : pic2 + 00c0-00df : dma2 + 00f0-00ff : fpu + 0170-0177 : ide1 + 01f0-01f7 : ide0 + 0376-0376 : ide1 + 0378-037a : parport0 + 037b-037f : parport0 + 03c0-03df : vga+ + 03f6-03f6 : ide0 + 03f8-03ff : serial + ... 
+ + +Accessing I/O ports +------------------- + +After a driver has obtained the desired I/O port range, one can +perform read or write operations on these ports. Since physical ports +are differentiated by the number of bits (8, 16, or 32 bits), there +are different port access functions depending on their size. The +following port access functions are defined in asm/io.h: + + +* *unsigned inb(int port)*, reads one byte (8 bits) from port +* *void outb(unsigned char byte, int port)*, writes one byte (8 bits) to port +* *unsigned inw(int port)*, reads two bytes (16 bits) from port +* *void outw(unsigned short word, int port)*, writes two bytes (16 bits) to port +* *unsigned inl (int port)*, reads four bytes (32 bits) from port +* *void outl(unsigned long word, int port)*, writes four bytes (32 bits) to port + +The port argument specifies the address of the port where the reads or +writes are done, and its type is platform dependent (may be unsigned +long or unsigned short). + +Some devices may have problems when the processor is trying to +transfer data too fast to and from the device. To avoid this issue we +may need to insert a delay after an I/O operation and there are functions +you can use that introduce this delay. Their names are similar to +those described above, except that they end in _p: inb_p, +outb_p, etc. + +For example, the following sequence writes a byte on COM1 serial port +and then reads it: + +.. code-block:: c + + #include <asm/io.h> + #define MY_BASEPORT 0x3F8 + + unsigned char value = 0xFF; + outb(value, MY_BASEPORT); + value = inb(MY_BASEPORT); + +5. Accessing I/O ports from userspace +------------------------------------- + +Although the functions described above are defined for device drivers, +they can also be used in user space by including the ``<sys/io.h>`` +header. In order to be used, ioperm or iopl must first be called to +get permission to perform port operations. 
The ioperm function obtains +permission for individual ports, while iopl obtains it for the entire I/O address +space. To use these features, the user must be root. + +The following sequence used in user space gets permission for the +first 3 ports of the serial port, and then releases them: + +.. code-block:: c + + #include <sys/io.h> + #define MY_BASEPORT 0x3F8 + + if (ioperm(MY_BASEPORT, 3, 1)) { + /* handle error */ + } + + if (ioperm(MY_BASEPORT, 3, 0)) { + /* handle error */ + } + +The third parameter of the ioperm function is used to request or +release port permission: 1 to get permission and 0 to release. + +Interrupt handling +================== + +Requesting an interrupt +----------------------- + +As with other resources, a driver must gain access to an interrupt +line before it can use it and release it at the end of the execution. + +In Linux, the request to obtain and release an interrupt is done using +the :c:func:`request_irq` and :c:func:`free_irq` functions: + +.. code-block:: c + + #include <linux/interrupt.h> + + typedef irqreturn_t (*irq_handler_t)(int, void *); + + int request_irq(unsigned int irq_no, irq_handler_t handler, + unsigned long flags, const char *dev_name, void *dev_id); + + void free_irq(unsigned int irq_no, void *dev_id); + +Note that to get an interrupt, the developer calls +:c:func:`request_irq`. When calling this function you must specify the +interrupt number (*irq_no*), a handler that will be called when the +interrupt is generated (*handler*), flags that will instruct the +kernel about the desired behaviour (*flags*), the name of the device +using this interrupt (*dev_name*), and a pointer that can be +set by the user to any value, and that has no global +significance (*dev_id*). Most of the time, *dev_id* will be a +pointer to the device driver's private data. When the interrupt is +released, using the :c:func:`free_irq` function, the developer must +send the same pointer value (*dev_id*) along with the same interrupt +number (*irq_no*). 
The device name (*dev_name*) is used to display +statistics in */proc/interrupts*. + +The value that :c:func:`request_irq` returns is 0 if the request was +successful or a negative error code indicating the reason for the +failure. A typical value is *-EBUSY* which means that the interrupt +was already requested by another device driver. + +The *handler* function is executed in interrupt context which means +that we can't call blocking APIs such as :c:func:`mutex_lock` or +:c:func:`msleep`. We must also avoid doing a lot of work in the +interrupt handler and instead use deferred work if needed. The actions +performed in the interrupt handler include reading the device +registers to get the status of the device and acknowledge the +interrupt, operations that most of the time can be performed with +non-blocking calls. + +There are situations where although a device uses interrupts we can't +read the device's registers in a non-blocking mode (for example a +sensor connected to an I2C or SPI bus whose driver does not guarantee +that bus read / write operations are non-blocking). In this +situation, in the interrupt handler, we must schedule an action in process context +(work queue, kernel thread) to access the device's registers. Because +such a situation is relatively common, the kernel provides the +:c:func:`request_threaded_irq` function to write interrupt handling +routines running in two phases: a process context phase and an interrupt +context phase: + +.. code-block:: c + + #include <linux/interrupt.h> + + int request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, + unsigned long flags, const char *name, void *dev); + +*handler* is the function running in interrupt context, and will +implement critical operations while the thread_fn function runs in +process context and implements the rest of the operations. 
+ +The flags that can be passed when an interrupt is requested are: + +* *IRQF_SHARED* informs the kernel that the interrupt can be + shared with other devices. If this flag is not set, then if there is + already a handler associated with the requested interrupt, the + request for interrupt will fail. A shared interrupt is handled in a + special way by the kernel: all the associated interrupt handlers + will be executed until the device that generated the interrupt is + identified. But how can a device driver know if the interrupt + handling routine was activated by an interrupt generated by the + device it manages? Virtually all devices that offer interrupt + support have a status register that can be interrogated in the + handling routine to see if the interrupt was or was not generated by + the device (for example, in the case of the 8250 serial port, this + status register is IIR - Interrupt Information Register). When + requesting a shared interrupt, the dev_id argument must be unique + and it must not be NULL. Usually it is set to the module's private + data. + +* *IRQF_ONESHOT* interrupt will be reactivated after running the process + context routine; Without this flag, the interrupt will be + reactivated after running the handler routine in the context of + the interrupt + + +Requesting the interrupt can be done either at the initialization of +the driver (:c:func:`init_module`), when the device is probed, or when +the device is used (e.g. during *open*). + +The following example performs the interrupt request for the COM1 +serial port: + +.. code-block:: c + + #include <linux/interrupt.h> + + #define MY_BASEPORT 0x3F8 + #define MY_IRQ 4 + + static int my_init(void) + { + [...] + struct my_device_data *my_data; + int err; + + err = request_irq(MY_IRQ, my_handler, IRQF_SHARED, + "com1", my_data); + if (err < 0) { + /* handle error*/ + return err; + } + [...] + } + +As you can see, the IRQ for serial port COM1 is 4, which is used in +shared mode (IRQF_SHARED). + +.. 
attention:: When requesting a shared interrupt (IRQF_SHARED) the + *dev_id* argument can not be NULL. + +To release the interrupt associated with the serial port, the +following operations will be executed: + +.. code-block:: c + + free_irq (MY_IRQ, my_data); + + +During the initialization function (:c:func:`init_module`), or in the +function that opens the device, interrupts must be activated for the +device. This operation is dependent on the device, but most often +involves setting a bit from the control register. + + +As an example, for the 8250 serial port, the following operations must +be performed to enable interrupts: + +.. code-block:: c + + #include <asm/io.h> + #define MY_BASEPORT 0x3F8 + + outb(0x08, MY_BASEPORT+4); + outb(0x01, MY_BASEPORT+1); + + +In the above example, two operations are performed: + +1. All interrupts are activated by setting bit 3 (Aux Output 2) in + the MCR register - Modem Control Register +2. The RDAI (Received Data Available Interrupt) is activated + by setting the appropriate bit in the IER - Interrupt Enable + Register. + + +Implementing an interrupt handler +--------------------------------- + +Let's take a look at the signature of the interrupt handler function: + +.. code-block:: c + + irqreturn_t (*handler)(int irq_no, void *dev_id); + +The function receives as parameters the number of the interrupt +(*irq_no*) and the pointer sent to :c:func:`request_irq` when the +interrupt was requested. The interrupt handling routine must return a +value with a type of :c:type:`typedef irqreturn_t`. For the current kernel +version, there are three valid values: *IRQ_NONE*, *IRQ_HANDLED*, +and *IRQ_WAKE_THREAD*. The device driver must return *IRQ_NONE* if +it notices that the interrupt has not been generated by the device it +is in charge of. 
Otherwise, the device driver must return *IRQ_HANDLED* +if the interrupt can be handled directly from the interrupt context or +*IRQ_WAKE_THREAD* to schedule the running of the process context +processing function. + +The skeleton for an interrupt handler is: + +.. code-block:: c + + irqreturn_t my_handler(int irq_no, void *dev_id) + { + struct my_device_data *my_data = (struct my_device_data *) dev_id; + + /* if interrupt is not for this device (shared interrupts) */ + /* return IRQ_NONE;*/ + + /* clear interrupt-pending bit */ + /* read from device or write to device*/ + + return IRQ_HANDLED; + } + + +Typically, the first thing executed in the interrupt handler is to +determine whether the interrupt was generated by the device that the +driver manages. This usually involves reading information from the device's +registers that indicates whether the device has generated an +interrupt. The second thing is to reset the interrupt pending bit on +the physical device as most devices will no longer generate +interrupts until this bit has been reset (e.g. for the 8250 +serial port bit 0 in the IIR register must be cleared). + + +Locking +------- + +Because the interrupt handlers run in interrupt context the actions +that can be performed are limited: unable to access user space memory, +can't call blocking functions. Also, synchronization using spinlocks is +tricky and can lead to deadlocks if the spinlock used is already +acquired by a process that has been interrupted by the running +handler. + +However, there are cases where device drivers have to synchronize +with interrupts, such as when data is shared between the interrupt +handler and process context or bottom-half handlers. In these +situations it is necessary to both deactivate the interrupt and use +spinlocks. + +There are two ways to disable interrupts: disabling all interrupts, at +the processor level, or disabling a particular interrupt at the device +or interrupt controller level. 
Processor disabling is faster and is +therefore preferred. For this purpose, there are locking functions +that disable and enable interrupts while acquiring and releasing a spinlock at +the same time: :c:func:`spin_lock_irqsave`, +:c:func:`spin_unlock_irqrestore`, :c:func:`spin_lock_irq`, and +:c:func:`spin_unlock_irq`: + +.. code-block:: c + + #include <linux/spinlock.h> + + void spin_lock_irqsave (spinlock_t * lock, unsigned long flags); + void spin_unlock_irqrestore (spinlock_t * lock, unsigned long flags); + + void spin_lock_irq (spinlock_t * lock); + void spin_unlock_irq (spinlock_t * lock); + +The :c:func:`spin_lock_irqsave` function disables interrupts for the +local processor before it obtains the spinlock; The previous state of +the interrupts is saved in *flags*. + +If you are absolutely sure that the interrupts on the current +processor have not already been disabled by someone else and you are +sure you can activate the interrupts when you release the spinlock, +you can use :c:func:`spin_lock_irq`. + +For read / write spinlocks there are similar functions available: + +* :c:func:`read_lock_irqsave` +* :c:func:`read_unlock_irqrestore` +* :c:func:`read_lock_irq` +* :c:func:`read_unlock_irq` +* :c:func:`write_lock_irqsave` +* :c:func:`write_unlock_irqrestore` +* :c:func:`write_lock_irq` +* :c:func:`write_unlock_irq` + +If we want to disable interrupts at the interrupt controller level +(not recommended because disabling a particular interrupt is slower and +we can not disable shared interrupts) we can do this with +:c:func:`disable_irq`, :c:func:`disable_irq_nosync`, and +:c:func:`enable_irq`. Using these functions will disable the interrupts on +all processors. Calls can be nested: if disable_irq is called twice, +it will require as many calls to enable_irq to enable it. The difference +between disable_irq and disable_irq_nosync is that the first one will +wait for the executed handlers to finish. 
Because of this, +:c:func:`disable_irq_nosync` is generally faster, but may lead to +races with the interrupt handler, so when not sure use +:c:func:`disable_irq`. + +The following sequence disables and then enables the interrupt for +the COM1 serial port: + +.. code-block:: c + + #define MY_IRQ 4 + + disable_irq (MY_IRQ); + enable_irq (MY_IRQ); + +It is also possible to disable interrupts at the device level. This +approach is also slower than disabling interrupts at the processor +level, but it works with shared interrupts. The way to accomplish this +is device specific and it usually means we have to clear a bit from +one of the control registers. + +It is also possible to disable all interrupts for the current +processor independent of taking locks. Disabling all interrupts by +device drivers for synchronization purposes is inappropriate because +races are still possible if the interrupt is handled on another +CPU. For reference, the functions that disable / enable interrupts on +the local processor are :c:func:`local_irq_disable` and +:c:func:`local_irq_enable`. + +In order to use a resource shared between process context and the +interrupt handling routine, the functions described above will be used +as follows: + +.. code-block:: c + + static spinlock_t lock; + + /* IRQ handling routine: interrupt context */ + irqreturn_t kbd_interrupt_handle(int irq_no, void * dev_id) + { + ... + spin_lock(&lock); + /* Critical region - access shared resource */ + spin_unlock (&lock); + ... + } + + /* Process context: Disable interrupts when locking */ + static void my_access(void) + { + unsigned long flags; + + spin_lock_irqsave(&lock, flags); + /* Critical region - access shared resource */ + spin_unlock_irqrestore(&lock, flags); + + ... + } + + void my_init (void) + { + ... + spin_lock_init (&lock); + ... + } + + +The *my_access* function above runs in process context. 
To +synchronize access to the shared data, we disable the interrupts and +use the spinlock *lock*, i.e. the :c:func:`spin_lock_irqsave` and +:c:func:`spin_unlock_irqrestore` functions. + +In the interrupt handling routine, we use the :c:func:`spin_lock` and +:c:func:`spin_unlock` functions to access the shared resource. + +.. note:: The *flags* argument for :c:func:`spin_lock_irqsave` and + :c:func:`spin_unlock_irqrestore` is a value and not a pointer but keep + in mind that :c:func:`spin_lock_irqsave` function changes the value of + the flag, since this is actually a macro. + +Interrupt statistics +-------------------- + +Information and statistics about system interrupts can be found in +*/proc/interrupts* or */proc/stat*. Only system interrupts with +associated interrupt handlers appear in */proc/interrupts*: + +.. code-block:: shell + + # cat /proc/interrupts + CPU0 + 0: 7514294 IO-APIC-edge timer + 1: 4528 IO-APIC-edge i8042 + 6: 2 IO-APIC-edge floppy + 8: 1 IO-APIC-edge rtc + 9: 0 IO-APIC-level acpi + 12: 2301 IO-APIC-edge i8042 + 15: 41 IO-APIC-edge ide1 + 16: 3230 IO-APIC-level ioc0 + 17: 1016 IO-APIC-level vmxnet ether + NMI: 0 + LOC: 7229438 + ERR: 0 + MIS: 0 + +The first column specifies the IRQ associated with the interrupt. The +following columns show the number of interrupts that were generated +for each processor in the system; The last two columns provide +information about the interrupt controller and the device name that +registered the handler for that interrupt. + +The */proc/stat* file provides information about system activity, +including the number of interrupts generated since the last (re)boot +of the system: + +.. 
code-block:: shell + + # cat /proc/stat | grep in + intr 7765626 7754228 4620 0 0 0 0 2 0 1 0 0 0 2377 0 0 41 3259 1098 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + +Each line in the */proc/state* file begins with a keyword that +specifies the meaning of the information on the line. For information +on interrupts, this keyword is intr. The first number on the line +represents the total number of interrupts, and the other numbers +represent the number of interrupts for each IRQ, starting at 0. The +counter includes the number of interrupts for all processors in the +system. + + +Further reading +=============== + +Serial Port +----------- + +* `Serial Port `_ +* `Interfacing the Serial / RS232 Port `_ + + +Parallel port +------------- + +* `Interfacing the Standard Parallel Port `_ +* `Parallel Port Central `_ + +Keyboard controller +------------------- + +* `Intel 8042 `_ +* drivers/input/serio/i8042.c +* drivers/input/keyboard/atkbd.c + +Linux device drivers +-------------------- + +* `Linux Device Drivers, 3rd ed., Ch. 9 - Communicating with Hardware `_ +* `Linux Device Drivers, 3rd ed., Ch. 10 - Interrupt Handling `_ +* `Interrupt Handlers `_ + + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: interrupts + +0. Intro +-------- + +Using |LXR|_, find the definitions of the following symbols in the Linux kernel: + +* :c:type:`struct resource` +* :c:func:`request_region` and :c:func:`__request_region` +* :c:func:`request_irq` and :c:func:`request_threaded_irq` +* :c:func:`inb` for the x86 architecture. 
+ +Analyze the following Linux code: + +* Keyboard initialization function :c:func:`i8042_setup_kbd` +* The AT or PS/2 keyboard interrupt function :c:func:`atkbd_interrupt` + +Keyboard driver +------------------ + +The next exercise's objective is to create a driver that uses the +keyboard IRQ, inspects the incoming key codes and stores them in a +buffer. The buffer will be accessible from userspace via a character +device driver. + +1. Request the I/O ports +------------------------ + +To start with, we aim to allocate memory in the I/O space for hardware +devices. We will see that we cannot allocate space for the keyboard +because the designated region is already allocated. Then we will allocate +I/O space for unused ports. + +The *kbd.c* file contains a skeleton for the keyboard driver. Browse +the source code and inspect :c:func:`kbd_init`. Notice that the I/O +ports we need are I8042_STATUS_REG and I8042_DATA_REG. + +Follow the sections marked with **TODO 1** in the skeleton. Request the I/O +ports in :c:func:`kbd_init` and make sure to check for errors and to properly +clean-up in case of errors. When requesting, set the reserving caller's ID +string (``name``) with ``MODULE_NAME`` macro. Also, add code to release the I/O +ports in :c:func:`kbd_exit`. + +.. note:: You can review the `Request access to I/O ports`_ section before + proceeding. + +Now build the module and copy it to the VM image: + +.. code-block:: shell + + tools/labs $ make build + tools/labs $ make copy + + +Now start the VM and insert the module: + +.. code-block:: shell + + root@qemux86:~# insmod skels/interrupts/kbd.ko + kbd: loading out-of-tree module taints kernel. + insmod: can't insert 'skels/interrupts/kbd.ko': Device or resource busy + +Notice that you get an error when trying to request the I/O +ports. This is because we already have a driver that has requested the +I/O ports. To validate check the :file:`/proc/ioports` file for the +``STATUS_REG`` and ``DATA_REG`` values: + +.. 
code-block:: shell + + root@qemux86:~# cat /proc/ioports | egrep "(0060|0064)" + 0060-0060 : keyboard + 0064-0064 : keyboard + + +Let's find out which driver registers these ports and try to remove the +module associated with it. + +.. code-block:: shell + + $ find -name \*.c | xargs grep \"keyboard\" + + find -name \*.c | xargs grep \"keyboard\" | egrep '(0x60|0x64)' + ... + ./arch/x86/kernel/setup.c:{ .name = "keyboard", .start = 0x60, .end = 0x60, + ./arch/x86/kernel/setup.c:{ .name = "keyboard", .start = 0x64, .end = 0x64 + +It looks like the I/O ports are registered by the kernel during the +boot, and we won't be able to remove the associated module. Instead, +let's trick the kernel and register ports 0x61 and 0x65. + +Use the function :c:func:`request_region` (inside the :c:func:`kbd_init` +function) to allocate the ports and the function :c:func:`release_region` +(inside the :c:func:`kbd_exit` function) to release the allocated ports. + +This time we can load the module and */proc/ioports* shows that the +owner of these ports is our module: + +.. code-block:: shell + + root@qemux86:~# insmod skels/interrupts/kbd.ko + kbd: loading out-of-tree module taints kernel. + Driver kbd loaded + root@qemux86:~# cat /proc/ioports | grep kbd + 0061-0061 : kbd + 0065-0065 : kbd + +Let's remove the module and check that the I/O ports are released: + +.. code-block:: shell + + root@qemux86:~# rmmod kbd + Driver kbd unloaded + root@qemux86:~# cat /proc/ioports | grep kbd + root@qemux86:~# + +2. Interrupt handling routine +----------------------------- + +For this task we will implement and register an interrupt handler for +the keyboard interrupt. You can review the `Requesting an interrupt`_ +section before proceeding. + +Follow the sections marked with **TODO 2** in the skeleton. + +First, define an empty interrupt handling routine named +:c:func:`kbd_interrupt_handler`. + +.. 
note:: Since we already have a driver that uses this interrupt we + should report the interrupt as not handled (i.e. return + :c:type:`IRQ_NONE`) so that the original driver still has a + chance to process it. + +Then register the interrupt handler routine using +:c:type:`request_irq`. The interrupt number is defined by the +`I8042_KBD_IRQ` macro. The interrupt handling routine must be +requested with :c:type:`IRQF_SHARED` to share the interrupt line with +the keyboard driver (i8042). + +.. note:: For shared interrupts, *dev_id* can not be NULL. Use + ``&devs[0]``, that is a pointer to :c:type:`struct kbd`. This + structure contains all the information needed for device + management. To see the interrupt in */proc/interrupts*, do + not use NULL for *dev_name*. You can use the MODULE_NAME + macro. + + If the interrupt requesting fails make sure to properly + cleanup by jumping to the right label, in this case the one + that releases the I/O ports and continues with unregistering + the character device driver. + +Compile, copy and load the module into the kernel. Check that the interrupt +line has been registered by looking at */proc/interrupts*. Determine +the IRQ number from the source code (see `I8042_KBD_IRQ`) and verify +that there are two drivers registered at this interrupt line (which +means that we have a shared interrupt line): the i8042 initial driver +and our driver. + +.. note:: More details about the format of the */proc/interrupts* can + be found in the `Interrupt statistics`_ section. + +Print a message inside the routine to make sure it is called. Compile +and reload the module into the kernel. Check that the interrupt handling +routine is called when you press the keyboard on the virtual machine, +using :command:`dmesg`. Also note that when you use the serial port no +keyboard interrupt is generated. + +.. attention:: To get access to the keyboard on the virtual machine + boot with "QEMU_DISPLAY=gtk make boot". + +3. 
Store ASCII keys to buffer +----------------------------- + +Next, we want to collect the keystrokes in a buffer whose content we +will then send to the user space. For this routine we will add the +following in the interrupt handling: + +* capture the pressed keys (only pressed, ignore released) +* identify the ASCII characters. +* copy the ASCII characters corresponding to the keystrokes and store + them in the buffer of the device + +Follow the sections marked **TODO 3** in the skeleton. + +Reading the data register +......................... + +First, fill in the :c:func:`i8042_read_data` function to read the +``I8042_DATA_REG`` of the keyboard controller. The function +just needs to return the value of the register. The value of the +register is also called scancode, which is what is generated at each +keystroke. + +.. hint:: Read the ``I8042_DATA_REG`` register using :c:func:`inb` and + store the value in the local variable :c:type:`val`. + Revisit the `Accessing I/O ports`_ section. + +Call the :c:func:`i8042_read_data` in the +:c:func:`kbd_interrupt_handler` and print the value read. + +Print information about the keystrokes in the following format: + +.. code-block:: c + + pr_info("IRQ:% d, scancode = 0x%x (%u,%c)\n", + irq_no, scancode, scancode, scancode); + + +Where scancode is the value of the read register using the +:c:func:`i8042_read_data` function. + +Notice that the scancode (reading of the read register) is not an ASCII +character of the pressed key. We'll have to understand the scancode. + +Interpreting the scancode +......................... + +Note that the register value is a scancode, not the ASCII value of the +character pressed. Also note that an interrupt is sent both when the +key is pressed and when the key is released. We only need to select +the code when the key is pressed and then decode the ASCII +character. + +.. note:: To check scancode, we can use the showkey command (showkey + -s). 
+ + In this form, the command will display the key scancodes for + 10 seconds after the last pressed key and then it will + stop. If you press and release a key you will get two + scancodes: one for the pressed key and one for the released + key. E.g: + + * If you press the ENTER key, you will get the 0x1c (key pressed) + and 0x9c (for the released key) + * If you press the key a you will get the 0x1e (key pressed) + and 0x9e (for the key release) + * If you press b you will get 0x30 (key pressed) and 0xb0 + (for the release key) + * If you press the c key, you will get the 0x2e (key + pressed) and 0xae (for the released key) + * If you press the Shift key you will get the 0x2a (key + pressed) and 0xaa (for the released key) + * If you press the Ctrl key you will get the 0x1d (key + pressed) and 0x9d (for the release key) + + As also indicated in this `article + `_, a key + release scancode is 128 (0x80) higher than a key press + scancode. This is how we can distinguish between a key + press scancode and a key release scancode. + + A scancode is translated into a keycode that matches a + key. A key press scancode and a key release scancode + have the same keycode. For the keys shown above we have + the following table: + + .. flat-table:: + + * - Key + - Key Press Scancode + - Key Release Scancode + - Keycode + + * - ENTER + - 0x1c + - 0x9c + - 0x1c (28) + + * - a + - 0x1e + - 0x9e + - 0x1e (30) + + * - b + - 0x30 + - 0xb0 + - 0x30 (48) + + * - c + - 0x2e + - 0xae + - 0x2e (46) + + * - Shift + - 0x2a + - 0xaa + - 0x2a (42) + + * - Ctrl + - 0x1d + - 0x9d + - 0x1d (29) + + The press / release check is performed in the is_key_press() + function and obtaining the ASCII character of a scancode + takes place in the get_ascii() function. + +In the interrupt handler check the scancode to see if the key is +pressed or released then determine the corresponding ASCII +character. + +.. hint:: To check for press / release, use :c:func:`is_key_press`. 
+ Use :c:func:`get_ascii` function to get the corresponding + ASCII code. Both functions expect the scancode. + + +.. hint:: To display the received information use the following + format. + + .. code-block:: c + + pr_info("IRQ %d: scancode=0x%x (%u) pressed=%d ch=%c\n", + irq_no, scancode, scancode, pressed, ch); + + Where scancode is the value of the data register, and ch is + the value returned by the get_ascii() function. + +Store characters to the buffer +............................... + +We want to collect the pressed characters (not the other keys) into +a circular buffer that can be consumed from user space. + +Update the interrupt handler to add a pressed ASCII character to the +end of the device buffer. If the buffer is full, the character will be +discarded. + +.. hint:: The device buffer is the field :c:type:`buf` in the device's + :c:type:`struct kbd`. To get the device data from the interrupt handler + use the following construct: + + .. code-block:: c + + struct kbd *data = (struct kbd *) dev_id; + + The buffer's dimension is located in :c:type:`struct kbd`'s field, + :c:type:`count`. The :c:type:`put_idx` and :c:type:`get_idx` fields + specify the next writing and reading index. Take a look at the + :c:func:`put_char` function's implementation to observe how the data is + added to the circular buffer. + +.. attention:: Synchronize the access to the buffer and the helper + indexes with a spinlock. + Define the spinlock in the device struct :c:type:`struct kbd` + and initialize it in :c:func:`kbd_init`. + + Use the :c:func:`spin_lock` and :c:func:`spin_unlock` functions + to protect the buffer in the interrupt handler. + + Revisit the `Locking`_ section. + +4. Reading the buffer +---------------------- + +In order to have access to the keylogger's data, we have to send it to +the user space. We will do this using the */dev/kbd* character device. 
When +reading from this device, we will get the data from the buffer in the kernel +space, where we collected the keys pressed. + +For this step +follow the sections marked with **TODO 4** in the :c:func:`kbd_read` function. + +Implement :c:func:`get_char` in a similar way to :c:func:`put_char`. Be careful +when implementing the circular buffer. + +In the :c:func:`kbd_read` function copy the data from the buffer to the +userspace buffer. + +.. hint:: Use :c:func:`get_char` to read a character from the buffer + and :c:func:`put_user` to store it to the user buffer. + +.. attention:: In the read function, use :c:func:`spin_lock_irqsave` and + :c:func:`spin_unlock_irqrestore` for locking. + + Revisit the `Locking`_ section. + +.. attention:: We cannot use :c:func:`put_user` or :c:func:`copy_to_user` + while holding the lock, as userspace access is not permitted from + atomic contexts. + + For more info, read the :ref:`Access to the address space of the + process section <_access_to_process_address_space>` in the + previous lab. + +For testing, you will need to create the */dev/kbd* character device +node using mknod before reading from it. The device major and +minor are defined as ``KBD_MAJOR`` and ``KBD_MINOR``: + +.. code-block:: shell + + mknod /dev/kbd c 42 0 + +Build, copy and boot the virtual machine and load the module. Test it +using the command: + +.. code-block:: shell + + cat /dev/kbd + + +5. Reset the buffer +------------------- + +Reset the buffer if the device is written to. For this step follow the +sections marked with **TODO 5** in the skeleton. + +Implement :c:func:`reset_buffer` and add the write operation to *kbd_fops*. + +.. attention:: In the write function use :c:func:`spin_lock_irqsave` and + :c:func:`spin_unlock_irqrestore` for locking when resetting the + buffer. + + Revisit the `Locking`_ section. + +For testing, you will need to create the */dev/kbd* character device +node using mknod before reading from it. 
The device master and +minor are defined as ``KBD_MAJOR`` and ``KBD_MINOR``: + +.. code-block:: c + + mknod /dev/kbd c 42 0 + +Build, copy and boot the virtual machine and load the module. +Test it using the command: + +.. code-block:: c + + cat /dev/kbd + +Press some keys, then run the command :command:`echo "clear" > /dev/kbd`. +Check the buffer's content again. It should be reset. + +Extra Exercises +=============== + +1. kfifo +--------- + +Implement a keylogger using the +`kfifo API `_. + +.. hint:: Follow the `API call examples from the kernel code `_. + For example, the file `bytestream-examples.c `_. diff --git a/refs/pull/405/merge/_sources/labs/introduction.rst.txt b/refs/pull/405/merge/_sources/labs/introduction.rst.txt new file mode 100644 index 00000000..98e3a630 --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/introduction.rst.txt @@ -0,0 +1,887 @@ +============ +Introduction +============ + +Lab objectives +============== + +* presenting the rules and objectives of the Operating Systems 2 lab +* introducing the lab documentation +* introducing the Linux kernel and related resources + +Keywords +======== + +* kernel, kernel programming +* Linux, vanilla, http://www.kernel.org +* cscope, LXR +* gdb, /proc/kcore, addr2line, dump\_stack + +.. + _[SECTION-ABOUT-BEGIN] + +About this laboratory +===================== + +The Operating Systems 2 lab is a kernel programming and driver development lab. +The objectives of the laboratory are: + +* deepening the notions presented in the course +* presentation of kernel programming interfaces (kernel API) +* gaining documenting, development and debugging skills on a freestanding + environment +* acquiring knowledge and skills for drivers development + +A laboratory will present a set of concepts, applications and commands +specific to a given problem. 
The lab will start with a presentation +(each lab will have a set of slides) (15 minutes) and the remaining +time will be allocated to the lab exercises (80 minutes). + +For best laboratory performance, we recommend that you read the related slides. +To fully understand a laboratory, we recommend going through the lab support. For +in-depth study, use the supporting documentation. + +.. + _[SECTION-ABOUT-END] + +.. + _[SECTION-REFERENCES-BEGIN] + +References +========== + +- Linux + + - `Linux Kernel Development, 3rd + Edition `__ + - `Linux Device Drivers, 3rd + Edition `__ + - `Essential Linux Device + Drivers `__ + +- General + + - `mailing list `__ + (`searching the mailing list `__) + +.. + _[SECTION-REFERENCES-END] + +.. + _[SECTION-CODE-NAVIGATION-BEGIN] + +Source code navigation +====================== + +.. _cscope_intro: + +cscope +------ + +`Cscope `__ is a tool for +efficient navigation of C sources. To use it, a cscope database must +be generated from the existing sources. In a Linux tree, the command +:command:`make ARCH=x86 cscope` is sufficient. Specification of the +architecture through the ARCH variable is optional but recommended; +otherwise, some architecture dependent functions will appear multiple +times in the database. + +You can build the cscope database with the command :command:`make +ARCH=x86 COMPILED_SOURCE=1 cscope`. This way, the cscope database will +only contain symbols that have already been used in the compile +process before, thus resulting in better performance when searching +for symbols. + +Cscope can also be used as stand-alone, but it is more useful when +combined with an editor. To use cscope with :command:`vim`, it is necessary to +install both packages and add the following lines to the file +:file:`.vimrc` (the machine in the lab already has the settings): + +.. code-block:: vim + + if has("cscope") + " Look for a 'cscope.out' file starting from the current directory, + " going up to the root directory. 
+ let s:dirs = split(getcwd(), "/") + while s:dirs != [] + let s:path = "/" . join(s:dirs, "/") + if (filereadable(s:path . "/cscope.out")) + execute "cs add " . s:path . "/cscope.out " . s:path . " -v" + break + endif + let s:dirs = s:dirs[:-2] + endwhile + + set csto=0 " Use cscope first, then ctags + set cst " Only search cscope + set csverb " Make cs verbose + + nmap ``s :cs find s ``=expand("``")```` + nmap ``g :cs find g ``=expand("``")```` + nmap ``c :cs find c ``=expand("``")```` + nmap ``t :cs find t ``=expand("``")```` + nmap ``e :cs find e ``=expand("``")```` + nmap ``f :cs find f ``=expand("``")```` + nmap ``i :cs find i ^``=expand("``")``$`` + nmap ``d :cs find d ``=expand("``")```` + nmap :cnext + nmap :cprev + + " Open a quickfix window for the following queries. + set cscopequickfix=s-,c-,d-,i-,t-,e-,g- + endif + +The script searches for a file called :file:`cscope.out` in the current directory, or +in parent directories. If :command:`vim` finds this file, you can use the shortcut :code:`Ctrl +]` +or :code:`Ctrl+\ g` (the combination control-\\ followed by g) to jump directly to +the definition of the word under the cursor (function, variable, structure, etc.). +Similarly, you can use :code:`Ctrl+\ s` to go where the word under the cursor is used. + +You can take a cscope-enabled :file:`.vimrc` file (also contains other goodies) from +https://github.com/ddvlad/cfg/blob/master/\_vimrc. +The following guidelines are based on this file, but also show basic :command:`vim` commands +that have the same effect. + +If there are more than one results (usually there are) you can move between them +using :code:`F6` and :code:`F5` (:code:`:ccnext` and :code:`:cprev`). +You can also open a new panel showing the results using :code:`:copen`. To close +the panel, use the :code:`:cclose` command. + +To return to the previous location, use :code:`Ctrl+o` (o, not zero). 
+The command can be used multiple times and works even if cscope changed the +file you are currently editing. + +To go to a symbol definition directly when :command:`vim` starts, use :code:`vim -t ` +(for example :code:`vim -t task_struct`). Otherwise, if you started :command:`vim` and want +to search for a symbol by name, use :code:`cs find g ` (for example +:code:`cs find g task_struct`). + +If you found more than one results and opened a panel showing all the matches +(using :code:`:copen`) and you want to find a symbol of type structure, +it is recommended to search in the results panel (using :code:`/` -- slash) +the character :code:`{` (opening brace). + +.. important:: + You can get a summary of all the :command:`cscope` commands using :command:`:cs help`. + + For more info, use the :command:`vim` built-in help command: :command:`:h cscope` or :command:`:h copen`. + +If you use :command:`emacs`, install the :code:`xcscope-el` package and +add the following lines in :file:`~/.emacs`. + +.. code-block:: vim + + (require ‘xcscope) + (cscope-setup) + +These commands will activate cscope for the C and C++ modes automatically. +:code:`C-s s` is the key bindings prefix and :code:`C-s s s` is used to +search for a symbol (if you call it when the cursor is over a word, +it will use that). For more details, check `https://github.com/dkogan/xcscope.el` + +clangd +------ + +`Clangd `__ is a language server that provides tools +for navigating C and C++ code. +`Language Server Protocol `__ +facilitates features like go-to-definition, find-references, hover, completion, etc., +using semantic whole project analysis. + +Clangd requires a compilation database to understand the kernel source code. +It can be generated with: + +.. 
code-block:: bash + + make defconfig + make + scripts/clang-tools/gen_compile_commands.py + +LSP clients: + +- Vim/Neovim (`coc.nvim `__, `nvim-lsp `__, `vim-lsc `__, `vim-lsp `__) +- Emacs (`lsp-mode `__) +- VSCode (`clangd extension `__) + +Kscope +------ + +For a simpler interface, `Kscope `__ +is a cscope frontend which uses Qt. It is lightweight, very fast and very +easy to use. It allows searching using regular expressions, call graphs, etc. +Kscope is no longer maintained. + +There is also a `port `__ +of version 1.6 for Qt4 and KDE 4 which keeps the integration of the text +editor Kate and is easier to use than the last version on SourceForge. + +LXR Cross-Reference +------------------- + +LXR (LXR Cross-Reference) is a tool that allows indexing and +referencing the symbols in the source code of a program using +a web interface. The web interface shows links to +locations in files where a symbol is defined or used. The development website +for LXR is http://sourceforge.net/projects/lxr. Similar tools +are `OpenGrok `__ and +`Gonzui `__. + +Although LXR was originally intended for the Linux kernel sources, it is +also used in the sources of `Mozilla `__, +`Apache HTTP Server `__ and +`FreeBSD `__. + +There are a number of sites that use LXR for cross-referencing the +sources of the Linux kernel, the main site being `the original site of +development `__ which does not work anymore. You can +use `https://elixir.bootlin.com/ `__. + +LXR allows searching by identifier (symbol), by free text +or by file name. The main feature and, at the same +time, the main advantage provided is the ease of finding the declaration +of any global identifier. This way, it facilitates quick access to function +declarations, variables, macro definitions and the code can be easily +navigated. Also, the fact that it can detect what code areas are affected +when a variable or function is changed is a real advantage in the development +and debugging phase. 
+SourceWeb +--------- + +`SourceWeb `__ is a source code indexer +for C and C++. It uses the +`framework `__ +provided by the Clang compiler to index the code. + +The main difference between cscope and SourceWeb is the fact that SourceWeb +is, in a way, a compiler pass. SourceWeb doesn't index all the code, but +only the code that was effectively compiled by the compiler. This way, some +problems are eliminated, such as ambiguities about which variant of a function +defined in multiple places is used. This also means that the indexing takes +more time, because the compiled files must pass one more time through +the indexer to generate the references. + +Usage example: + +.. code-block:: bash + + make oldconfig + sw-btrace make -j4 + sw-btrace-to-compile-db + sw-clang-indexer --index-project + sourceweb index + +:file:`sw-btrace` is a script that adds the :file:`libsw-btrace.so` +library to :code:`LD_PRELOAD`. This way, the library is loaded by +every process started by :code:`make` (basically, the compiler), +registers the commands used to start the processes and generates +a file called :file:`btrace.log`. This file is then used by +:code:`sw-btrace-to-compile-db` which converts it to a format defined +by clang: `JSON Compilation Database `__. +This JSON Compilation Database resulting from the above steps is then +used by the indexer, which makes one more pass through the compiled +source files and generates the index used by the GUI. + +Word of advice: don't index the sources you are working with, but use +a copy, because SourceWeb doesn't have, at this moment, the capability +to regenerate the index for a single file and you will have to regenerate +the complete index. + +.. + _[SECTION-CODE-NAVIGATION-END] + +.. + _[SECTION-DEBUGGING-BEGIN] + +Kernel Debugging +================ + +Debugging a kernel is a much more difficult process than the debugging +of a program, because there is no support from the operating system. 
+This is why this process is usually done using two computers, connected +on serial interfaces. + +.. _gdb_intro: + +gdb (Linux) +----------- + +A simpler debug method on Linux, but with many disadvantages, +is local debugging, using `gdb `__, +the uncompressed kernel image (:file:`vmlinux`) and :file:`/proc/kcore` +(the real-time kernel image). This method is usually used to inspect +the kernel and detect certain inconsistencies while it runs. The +method is useful especially if the kernel was compiled using the +:code:`-g` option, which keeps debug information. Some well-known +debug techniques can't be used by this method, such as breakpoints +of data modification. + +.. note:: Because :file:`/proc` is a virtual filesystem, :file:`/proc/kcore` + does not physically exist on the disk. It is generated on-the-fly + by the kernel when a program tries to access :file:`proc/kcore`. + + It is used for debugging purposes. + + From :command:`man proc`, we have: + + :: + + /proc/kcore + This file represents the physical memory of the system and is stored in the ELF core file format. With this pseudo-file, and + an unstripped kernel (/usr/src/linux/vmlinux) binary, GDB can be used to examine the current state of any kernel data struc‐ + tures. + +The uncompressed kernel image offers information about the data structures +and symbols it contains. + +.. code-block:: bash + + student@eg106$ cd ~/src/linux + student@eg106$ file vmlinux + vmlinux: ELF 32-bit LSB executable, Intel 80386, ... + student@eg106$ nm vmlinux | grep sys_call_table + c02e535c R sys_call_table + student@eg106$ cat System.map | grep sys_call_table + c02e535c R sys_call_table + +The :command:`nm` utility is used to show the symbols in an object or +executable file. In our case, :file:`vmlinux` is an ELF file. Alternately, +we can use the file :file:`System.map` to view information about the +symbols in kernel. + +Then we use :command:`gdb` to inspect the symbols using the uncompressed +kernel image. 
A simple :command:`gdb` session is the following: + +.. code-block:: bash + + student@eg106$ cd ~/src/linux + stduent@eg106$ gdb --quiet vmlinux + Using host libthread_db library "/lib/tls/libthread_db.so.1". + (gdb) x/x 0xc02e535c + 0xc02e535c ``: 0xc011bc58 + (gdb) x/16 0xc02e535c + 0xc02e535c ``: 0xc011bc58 0xc011482a 0xc01013d3 0xc014363d + 0xc02e536c ``: 0xc014369f 0xc0142d4e 0xc0142de5 0xc011548b + 0xc02e537c ``: 0xc0142d7d 0xc01507a1 0xc015042c 0xc0101431 + 0xc02e538c ``: 0xc014249e 0xc0115c6c 0xc014fee7 0xc0142725 + (gdb) x/x sys_call_table + 0xc011bc58 ``: 0xffe000ba + (gdb) x/x &sys_call_table + 0xc02e535c ``: 0xc011bc58 + (gdb) x/16 &sys_call_table + 0xc02e535c ``: 0xc011bc58 0xc011482a 0xc01013d3 0xc014363d + 0xc02e536c ``: 0xc014369f 0xc0142d4e 0xc0142de5 0xc011548b + 0xc02e537c ``: 0xc0142d7d 0xc01507a1 0xc015042c 0xc0101431 + 0xc02e538c ``: 0xc014249e 0xc0115c6c 0xc014fee7 0xc0142725 + (gdb) x/x sys_fork + 0xc01013d3 ``: 0x3824548b + (gdb) disass sys_fork + Dump of assembler code for function sys_fork: + 0xc01013d3 ``: mov 0x38(%esp),%edx + 0xc01013d7 ``: mov $0x11,%eax + 0xc01013dc ``: push $0x0 + 0xc01013de ``: push $0x0 + 0xc01013e0 ``: push $0x0 + 0xc01013e2 ``: lea 0x10(%esp),%ecx + 0xc01013e6 ``: call 0xc0111aab `` + 0xc01013eb ``: add $0xc,%esp + 0xc01013ee ``: ret + End of assembler dump. + +It can be noticed that the uncompressed kernel image was used as an argument +for :command:`gdb`. The image can be found in the root of the kernel sources +after compilation. + +A few commands used for debugging using :command:`gdb` are: + +- :command:`x` (examine) - Used to show the contents of the memory area + whose address is specified as an argument to the command (this address + can be the value of a physical address, a symbol or the address of a + symbol). 
It can take as arguments (preceded by :code:`/`): the format + to display the data in (:code:`x` for hexadecimal, :code:`d` for + decimal, etc.), how many memory units to display and the size of a + memory unit. + +- :command:`disassemble` - Used to disassemble a function. + +- :command:`p` (print) - Used to evaluate and show the value of an + expression. The format to show the data in can be specified as + an argument (:code:`/x` for hexadecimal, :code:`/d` for decimal, etc.). + +The analysis of the kernel image is a method of static analysis. If we +want to perform dynamic analysis (analyzing how the kernel runs, not +only its static image) we can use :file:`/proc/kcore`; this is a dynamic +image (in memory) of the kernel. + +.. code-block:: bash + + student@eg106$ gdb ~/src/linux/vmlinux /proc/kcore + Core was generated by `root=/dev/hda3 ro'. + #0 0x00000000 in ?? () + (gdb) p sys_call_table + $1 = -1072579496 + (gdb) p /x sys_call_table + $2 = 0xc011bc58 + (gdb) p /x &sys_call_table + $3 = 0xc02e535c + (gdb) x/16 &sys_call_table + 0xc02e535c ``: 0xc011bc58 0xc011482a 0xc01013d3 0xc014363d + 0xc02e536c ``: 0xc014369f 0xc0142d4e 0xc0142de5 0xc011548b + 0xc02e537c ``: 0xc0142d7d 0xc01507a1 0xc015042c 0xc0101431 + 0xc02e538c ``: 0xc014249e 0xc0115c6c 0xc014fee7 0xc0142725 + +Using the dynamic image of the kernel is useful for detecting `rootkits `__. + +- `Linux Device Drivers 3rd Edition - Debuggers and Related Tools `__ +- `Detecting Rootkits and Kernel-level Compromises in Linux `__ +- `User-Mode Linux `__ + +Getting a stack trace +--------------------- + +Sometimes, you will want information about the path by which the execution +reaches a certain point. You can determine this information using +:command:`cscope` or LXR, but some functions are called from many +execution paths, which makes this method difficult. + +In these situations, it is useful to get a stack trace, which can be +simply done using the function :code:`dump_stack()`. + +.. 
+ _[SECTION-DEBUGGING-END] + +.. + _[SECTION-DOCUMENTATION-BEGIN] + +Documentation +============= + +Kernel development is a difficult process, compared to user space +programming. The API is different and the complexity of the subsystems +in kernel requires additional preparation. The associated documentation +is heterogeneous, sometimes requiring the inspection of multiple sources +to have a more complete understanding of a certain aspect. + +The main advantages of the Linux kernel are the access to sources and +the open development system. Because of this, the Internet offers a +large amount of documentation for the kernel. + +A few links related to the Linux kernel are shown below: + +- `KernelNewbies `__ +- `KernelNewbies - Kernel Hacking `__ +- `Kernel Analysis - HOWTO `__ +- `Linux Kernel Programming `__ +- `Linux kernel - Wikibooks `__ + +The links are not comprehensive. Using `The Internet `__ and +`kernel source code `__ is essential. + +.. + _[SECTION-DOCUMENTATION-END] + +Exercises +========= + +.. + _[SECTION-EXERCISES-REMARKS-BEGIN] + +Remarks +------- + +.. note:: + + - Usually, the steps used to develop a kernel module are the + following: + + - editing the module source code (on the physical machine); + - module compilation (on the physical machine); + - generation of the minimal image for the virtual machine; + this image contains the kernel, your module, busybox and + possibly test programs; + - starting the virtual machine using QEMU; + - running the tests in the virtual machine. + + - When using cscope, use :file:`~/src/linux`. + If there is no :file:`cscope.out` file, you can generate it using + the command :command:`make ARCH=x86 cscope`. + + - You can find more details about the virtual machine at + :ref:`vm_link`. + +.. important:: + Before solving an exercise, **carefully** read all its bullets. + +.. + _[SECTION-EXERCISES-REMARKS-END] + +.. 
+ _[EXERCISE1-BEGIN] + +Booting the virtual machine +--------------------------- + +A summary of the virtual machine infrastructure: + +- :file:`~/src/linux` - Linux kernel sources, needed to + compile modules. The directory contains the file :file:`cscope.out`, + used for navigation in the source tree. + +- :file:`~/src/linux/tools/labs/qemu`- scripts and auxiliary + files used to generate and run the QEMU VM. + +To start the VM, run :command:`make boot` in the directory :file:`~/src/linux/tools/labs`: + +.. code-block:: shell + + student@eg106:~$ cd ~/src/linux/tools/labs + student@eg106:~/src/linux/tools/labs$ make boot + +By default, you will not get a prompt or any graphical interface, but you can connect to +a console exposed by the virtual machine using :command:`minicom` or :command:`screen`. + +.. code-block:: shell + + student@eg106:~/src/linux/tools/labs$ minicom -D serial.pts + + + + qemux86 login: + Poky (Yocto Project Reference Distro) 2.3 qemux86 /dev/hvc0 + +Alternatively, you can start the virtual machine with graphical interface support, using +the :command:`QEMU_DISPLAY=gtk make boot`. + +.. note:: + To access the virtual machine, at the login prompt, enter the + username :code:`root`; there is no need to enter a password. + The virtual machine will start with the permissions of the + root account. + +.. + _[EXERCISE1-END] + +.. + _[EXERCISE2-BEGIN] + +Adding and using a virtual disk +------------------------------- + +.. note:: If you don't have the file :file:`mydisk.img`, you can download + it from the address http://elf.cs.pub.ro/so2/res/laboratoare/mydisk.img. + The file must be placed in :file:`tools/labs`. + +In the :file:`~/src/linux/tools/labs` directory, you have a new virtual +machine disk, in the file :file:`mydisk.img`. We want to add the disk +to the virtual machine and use it within the virtual machine. + +Edit :file:`qemu/Makefile` and add :code:`-drive file=mydisk.img,if=virtio,format=raw` +to the :code:`QEMU_OPTS` variable. 
+ +.. note:: There are already two disks added to qemu (disk1.img and disk2.img). You will need + to add the new one after them. In this case, the new disk can be accessed as + :file:`/dev/vdd` (vda is the root partition, vdb is disk1 and vdc is disk2). + +.. hint:: You do not need to manually create the entry for the new disk in :file:`/dev` + because the virtual machine uses :command:`devtmpfs`. + +Run :code:`make` in :file:`tools/labs` to boot the virtual machine. +Create :file:`/test` directory and try to mount the new disk: + +.. code-block:: bash + + mkdir /test + mount /dev/vdd /test + +The reason why we can not mount the virtual disk is because we do not have support in the +kernel for the filesystem with which the :file:`mydisk.img` is formatted. You will need +to identify the filesystem for :file:`mydisk.img` and compile kernel support for that filesystem. + +Close the virtual machine (close the QEMU window, you do not need to use another command). +Use the :command:`file` command on the physical machine to find out with which filesystem +the :file:`mydisk.img` file is formatted. You will identify the :command:`btrfs` file system. + +You will need to enable :command:`btrfs` support in the kernel and recompile the kernel image. + +.. warning:: If you receive an error while executing the :command:`make menuconfig` + command, you probably do not have the :command:`libncurses5-dev` + package installed. Install it using the command: + + :: + + sudo apt-get install libncurses5-dev + +.. hint:: Enter the :file:`~/src/linux/` subdirectory. Run :command:`make menuconfig` + and go to the *File systems* section. Enable *Btrfs filesystem support*. + You will need to use the builtin option (not the module), i.e. :command:`<*>` must appear + next to the option (**not** :command:``). + + Save the configuration you have made. Use the default configuration file (:file:`config`). 
+ + In the kernel source subdirectory (:file:`~/src/linux/`) recompile using the command: + + :: + + make + + To wait less, you can use the :command:`-j` option to run multiple jobs in parallel. + Generally, it is recommended to use :command:`number of CPUs+1`: + + :: + + make -j5 + +After the kernel recompilation finishes, **restart** the QEMU virtual machine: +that is, launch the :command:`make` command in the subdirectory. You +do not need to copy anything, because the :file:`bzImage` file is a symlink to the kernel +image you just recompiled. + +Inside the QEMU virtual machine, repeat the :command:`mkdir` and :command:`mount` operations. +With support for the :command:`btrfs` filesystem, now :command:`mount` will finish successfully. + +.. note:: When doing your homework, there is no need to recompile the kernel + because you will only use kernel modules. However, it is important + to be familiar with configuring and recompiling a kernel. + + If you still plan to recompile the kernel, make a backup of the bzImage + file (follow the link in ~/src/linux for the full path). This will allow + you to return to the initial setup in order to have an environment + identical to the one used by vmchecker. + +.. + _[EXERCISE2-END] + +.. + _[EXERCISE3-BEGIN] + +GDB and QEMU +------------ + +We can investigate and troubleshoot the QEMU virtual machine in real time. + +.. note:: You can also use the :command:`GDB Dashboard` plugin for a user-friendly interface. + :command:`gdb` must be compiled with Python support. + + In order to install it, you can just run: + :: + + wget -P ~ git.io/.gdbinit + +To do this, we start the QEMU virtual machine first. Then, we can connect +with :command:`gdb` to **a running QEMU virtual machine** using the command + +:: + + make gdb + +We used the QEMU command with the :command:`-s` parameter, which means +listening on port :code:`1234` for :command:`gdb` connections. We can do debugging +using a **remote target** for :command:`gdb`. 
The existing :file:`Makefile` +takes care of the details. + +When you attach a debugger to a process, the process is suspended. +You can add breakpoints and inspect the current status of the process. + +Attach to the QEMU virtual machine (using the :command:`make gdb` command) +and place a breakpoint in the :code:`sys_access` function using the +following command in the :command:`gdb` console: + +:: + + break sys_access + +At this time, the virtual machine is suspended. To continue executing it (up to the possible call +of the :code:`sys_access` function), use the command: + +:: + + continue + +in the :command:`gdb` console. + +At this time, the virtual machine is active and has a usable console. +To make a :code:`sys_access` call, issue a :command:`ls` command. +Note that the virtual machine was again suspended by :command:`gdb` +and the corresponding :code:`sys_access` callback message appeared within the :command:`gdb` console. + +Trace code execution using :command:`step` instruction, :command:`continue` or :command:`next` +instruction. You probably do not understand everything that happens, so use commands +such as :command:`list` and :command:`backtrace` to trace the execution. + +.. hint:: At the :command:`gdb` prompt, you can press :command:`Enter` + (without anything else) to rerun the last command. + +.. + _[EXERCISE3-END] + +.. + _[EXERCISE4-BEGIN] + +4. GDB spelunking +----------------- + +Use :command:`gdb` to display the source code of the function that creates kernel threads +(:code:`kernel_thread`). + +.. note:: You can use GDB for static kernel analysis using, in the kernel source directory, + a command such as: + + :: + + gdb vmlinux + + Go over the `gdb (Linux) <#gdb-linux>`__ section of the lab. + +Use :command:`gdb` to find the address of the :code:`jiffies` variable in memory and its contents. +The :code:`jiffies` variable holds the number of ticks (clock beats) since the system started. + +.. 
hint:: To track the value of the jiffies variable, use dynamic analysis in :command:`gdb` + by running the command: + + :: + + make gdb + + as in the previous exercise. + + Go over the `gdb (Linux) <#gdb-linux>`__ section of the lab. + +.. hint:: The :code:`jiffies` is a 64-bit variable. + You can see that its address is the same as the :code:`jiffies_64` variable. + + To explore the contents of a 64-bit variable, use in the :command:`gdb` console the command: + + :: + + x/gx & jiffies + + If you wanted to display the contents of the 32-bit variable, + you would use in the :command:`gdb` console the command: + + :: + + x/wx & jiffies + +.. + _[EXERCISE4-END] + +.. + _[EXERCISE5-BEGIN] + + +5. Cscope spelunking +-------------------- + +Use LXR or cscope in the :file:`~/src/linux/` directory to discover +the location of certain structures or functions. + +Cscope index files are already generated. Use :command:`vim` and other related commands +to scroll through the source code. For example, use the command: + +:: + + vim + +for opening the :command:`vim` editor. Afterwards, inside the editor, use commands such as: + +:command:`:cs find g task\_struct`. + +Find the file in which the following data types are defined: + +- ``struct task_struct`` + +- ``struct semaphore`` + +- ``struct list_head`` + +- ``spinlock_t`` + +- ``struct file_system_type`` + +.. hint:: For a certain structure, only its name needs to be searched. + + For instance, in the case of :command:`struct task_struct`, + search for the :command:`task_struct` string. + +Usually, you will get more matches. To locate the one you are interested in, do the following: + +#. List all matches by using, in :command:`vim`, :command:`:copen` command. + +#. Look for the right match (where the structure is defined) by looking for an opening brace + (:command:`{`), a single character on the structure definition line. To search for the opening + brace, use in :command:`vim` the construction :command:`/{`. + +#. 
On the respective line, press :command:`Enter` to get into the source code where the variable + is defined. + +#. Close the secondary window using the command: :command:`:cclose` command. + +Find the file in which the following global kernel variables are declared: + +- ``sys_call_table`` + +- ``file_systems`` + +- ``current`` + +- ``chrdevs`` + +.. hint:: To do this, use a :command:`vim` command with the syntax: + + :command:`:cs f g ` + + where :command:`` is the name of the symbol being searched. + +Find the file in which the following functions are declared: + +- ``copy_from_user`` + +- ``vmalloc`` + +- ``schedule_timeout`` + +- ``add_timer`` + +.. hint:: To do this, use a :command:`vim` command with the syntax: + + :command:`:cs f g ` + + where :command:`` is the name of the symbol being searched. + +Scroll through the following sequence of structures: + +- ``struct task_struct`` + +- ``struct mm_struct`` + +- ``struct vm_area_struct`` + +- ``struct vm_operations_struct`` + +That is, you access a structure and then you find fields with the data type of the +next structure, access the respective fields and so on. +Note in which files these structures are defined; this will be useful to the following labs. + + +.. hint:: In order to search for a symbol in :command:`vim` (with :command:`cscope` support) + when the cursor is placed on it, use the :command:`Ctrl+]` keyboard shortcut. + + To return to the previous match (the one before search/jump), use the + :command:`Ctrl+o` keyboard shortcut. + + To move forward with the search (to return to matches before :command:`Ctrl+o`), + use the :command:`Ctrl+i` keyboard shortcut. + +Following the above instructions, find and go through the function call sequence: + +- ``bio_alloc`` + +- ``bio_alloc_bioset`` + +- ``bvec_alloc`` + +- ``kmem_cache_alloc`` + +- ``slab_alloc`` + +.. note:: Read `cscope <#cscope>`__ or `LXR Cross-Reference <#lxr-cross-reference>`__ sections of the lab. 
diff --git a/refs/pull/405/merge/_sources/labs/kernel_api.rst.txt b/refs/pull/405/merge/_sources/labs/kernel_api.rst.txt new file mode 100644 index 00000000..f927e5cf --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/kernel_api.rst.txt @@ -0,0 +1,857 @@ +========== +Kernel API +========== + +Lab objectives +============== + + * Familiarize yourself with the basic Linux kernel API + * Description of memory allocation mechanisms + * Description of locking mechanisms + +Overview +======== + +Inside the current lab we present a set of concepts and basic functions required +for starting Linux kernel programming. It is important to note that kernel +programming differs greatly from user space programming. The kernel is a +stand-alone entity that can not use libraries in user-space (not even libc). +As a result, the usual user-space functions (printf, malloc, free, open, read, +write, memcpy, strcpy, etc.) can no longer be used. In conclusion, kernel +programming is based on a totally new and independent API that is unrelated to +the user-space API, whether we refer to POSIX or ANSI C (standard C language +library functions). + +Accessing memory +================ + +An important difference in kernel programming is how to access and allocate +memory. Due to the fact that kernel programming is very close to the physical +machine, there are important rules for memory management. First, it works with +several types of memory: + + * Physical memory + * Virtual memory from the kernel address space + * Virtual memory from a process's address space + * Resident memory - we know for sure that the accessed pages are present in + physical memory + +Virtual memory in a process's address space can not be considered resident due +to the virtual memory mechanisms implemented by the operating system: pages may +be swapped or simply may not be present in physical memory as a result of the +demand paging mechanism. The memory in the kernel address space can be resident +or not. 
Both the data and code segments of a module and the kernel stack of a +process are resident. Dynamic memory may or may not be resident, depending on +how it is allocated. + +When working with resident memory, things are simple: memory can be accessed at +any time. But if working with non-resident memory, then it can only be accessed +from certain contexts. Non-resident memory can only be accessed from the +process context. Accessing non-resident memory from the context of an +interrupt has unpredictable results and, therefore, when the operating +system detects such access, it will take drastic measures: blocking or +resetting the system to prevent serious corruption. + +The virtual memory of a process can not be accessed directly from the kernel. +In general, it is totally discouraged to access the address space of a process, +but there are situations where a device driver needs to do it. The typical case +is where the device driver needs to access a buffer from the user-space. In +this case, the device driver must use special features and not directly access +the buffer. This is necessary to prevent access to invalid memory areas. + +Another difference from the user-space programming, relative to memory, is due to +the stack, a stack whose size is fixed and limited. A stack of 4K is used in +Linux, and a stack of 12K is used in Windows. For this reason, the +allocation of large structures on the stack or the use of recursive calls should +be avoided. + +Contexts of execution +===================== + +In relation to kernel execution, we distinguish two contexts: process context +and interrupt context. We are in the process context when we run code as a +result of a system call or when we run in the context of a kernel thread. When +we run in a routine to handle an interrupt or a deferrable action, we run in +an interrupt context. + +Some of the kernel API calls can block the current process. Common examples are +using a semaphore or waiting for a condition. 
In this case, the process is +put into the ``WAITING`` state and another process is running. An interesting +situation occurs when a function that can cause the current process to be +suspended is called from an interrupt context. In this case, there is no +current process, and therefore the results are unpredictable. Whenever the +operating system detects this condition, it will generate an error condition that +will cause the operating system to shut down. + +Locking +======= + +One of the most important features of kernel programming is parallelism. Linux +supports SMP systems with multiple processors and kernel preemptivity. This +makes kernel programming more difficult because access to global variables must +be synchronized with either spinlock primitives or blocking primitives. Although +it is recommended to use blocking primitives, they can not be used in an +interrupt context, so the only locking solution in the context of an interrupt +is spinlocks. + +Spinlocks are used in order to achieve mutual exclusion. When it can not get +access to the critical region, it does not suspend the current process, but it +uses the busy-waiting mechanism (waiting in a :c:func:`while` loop for the lock +to be released). +The code that runs in the critical region protected by a spinlock is not allowed +to suspend the current process (it must adhere to the execution conditions in +the interrupt context). Moreover, the CPU will not be released except for +the case of an interrupt. Due to the mechanism used, it is important that a +spinlock is held for as little time as possible. + +Preemptivity +============ + +Linux uses preemptive kernels. The notion of preemptive multitasking should not +be confused with the notion of a preemptive kernel. The notion of preemptive +multitasking refers to the fact that the operating system forcefully interrupts +a process running in user space when its quantum (time slice) expires, in order +to run another process. 
+A kernel is preemptive if a process running in kernel mode (as a result of a +system call) can be interrupted so that another process is being run. + +Because of preemptivity, when we share resources between two portions of code +that can run from different process contexts, we need to protect ourselves with +synchronization primitives, even in the case of a single processor. + +Linux Kernel API +================ + +Convention indicating errors +---------------------------- + +For Linux kernel programming, the convention used for calling functions to +indicate success is the same as in UNIX programming: 0 for success, or a value +other than 0 for failure. +For failures, negative values are returned as shown in the example below: + +.. code-block:: c + + if (alloc_memory() != 0) + return -ENOMEM; + + if (user_parameter_valid() != 0) + return -EINVAL; + +The exhaustive list of errors and a summary explanation can be found in +:file:`include/uapi/asm-generic/errno-base.h` and in +:file:`include/uapi/asm-generic/errno.h`. + +Strings of characters +--------------------- + +In Linux, the kernel programmer is provided with the usual routine functions: +:c:func:`strcpy`, :c:func:`strncpy`, :c:func:`strlcpy`, :c:func:`strcat`, +:c:func:`strncat`, :c:func:`strlcat`, :c:func:`strcmp`, :c:func:`strncmp`, +:c:func:`strnicmp`, :c:func:`strchr`, :c:func:`strnchr`, :c:func:`strrchr`, +:c:func:`strstr`, :c:func:`strlen`, :c:func:`memset`, :c:func:`memmove`, +:c:func:`memcmp`, etc. These functions are declared in the +:file:`include/linux/string.h` header and are implemented in the kernel in the +:file:`lib/string.c` file. + +printk +------ + +The printf equivalent in the kernel is printk, defined in +:file:`include/linux/printk.h`. The :c:func:`printk` syntax is very similar +to :c:func:`printf`. The first +parameter of :c:func:`printk` decides the log category into which the current log +falls: + +.. 
code-block:: c + + #define KERN_EMERG "<0>" /* system is unusable */ + #define KERN_ALERT "<1>" /* action must be taken immediately */ + #define KERN_CRIT "<2>" /* critical conditions */ + #define KERN_ERR "<3>" /* error conditions */ + #define KERN_WARNING "<4>" /* warning conditions */ + #define KERN_NOTICE "<5>" /* normal but significant condition */ + #define KERN_INFO "<6>" /* informational */ + #define KERN_DEBUG "<7>" /* debug-level messages */ + +Thus, a warning message in the kernel would be sent with: + +.. code-block:: c + + printk(KERN_WARNING "my_module input string %s\n", buff); + + +If the logging level is missing from the :c:func:`printk` call, logging is done +with the default level at the time of the call. One thing to keep in mind is +that messages sent with :c:func:`printk` are only visible on the console if and +only if their level exceeds the default level set on the console. + +To reduce the size of lines when using :c:func:`printk`, it is recommended to +use the following help functions instead of directly using the :c:func:`printk` +call: + +.. code-block:: c + + pr_emerg(fmt, ...); /* similar to printk(KERN_EMERG pr_fmt(fmt), ...); */ + pr_alert(fmt, ...); /* similar to printk(KERN_ALERT pr_fmt(fmt), ...); */ + pr_crit(fmt, ...); /* similar to printk(KERN_CRIT pr_fmt(fmt), ...); */ + pr_err(fmt, ...); /* similar to printk(KERN_ERR pr_fmt(fmt), ...); */ + pr_warn(fmt, ...); /* similar to printk(KERN_WARNING pr_fmt(fmt), ...); */ + pr_notice(fmt, ...); /* similar to printk(KERN_NOTICE pr_fmt(fmt), ...); */ + pr_info(fmt, ...); /* similar to printk(KERN_INFO pr_fmt(fmt), ...); */ + pr_debug(fmt, ...); /* similar to printk(KERN_DEBUG pr_fmt(fmt), ...); */ + +A special case is :c:func:`pr_debug` that calls the :c:func:`printk` function +only when the :c:macro:`DEBUG` macro is defined or if dynamic debugging is used. + + +Memory allocation +----------------- + +In Linux only resident memory can be allocated, using :c:func:`kmalloc` call. 
+A typical :c:func:`kmalloc` call is presented below: + +.. code-block:: c + + #include <linux/slab.h> + + string = kmalloc (string_len + 1, GFP_KERNEL); + if (!string) { + //report error: -ENOMEM; + } + +As you can see, the first parameter indicates the size in bytes of the allocated +area. The function returns a pointer to a memory area that can be directly used +in the kernel, or :c:macro:`NULL` if memory could not be allocated. The second +parameter specifies how allocation should be done and the most commonly used +values for this are: + + * :c:data:`GFP_KERNEL` - using this value may cause the current process to + be suspended. Thus, it can not be used in the interrupt context. + * :c:data:`GFP_ATOMIC` - using this value ensures that the + :c:func:`kmalloc` function does not suspend the current process. It can be + used anytime. + +The counterpart to the :c:func:`kmalloc` function is :c:func:`kfree`, a function +that receives as argument an area allocated by :c:func:`kmalloc`. This function +does not suspend the current process and can therefore be called from any +context. + +lists +----- + +Because linked lists are often used, the Linux kernel API provides a unified +way of defining and using lists. This involves using a +:c:type:`struct list_head` element in the structure we want to consider as a +list node. The :c:type:`struct list_head` is defined in +:file:`include/linux/list.h` along with all the other functions that manipulate +the lists. The following code shows the definition of +the :c:type:`struct list_head` and the use of an element of this type in another +well-known structure in the Linux kernel: + +.. code-block:: c + + struct list_head { + struct list_head *next, *prev; + }; + + struct task_struct { + ... + struct list_head children; + ... 
+ }; + +The usual routines for working with lists are the following: + + * :c:macro:`LIST_HEAD(name)` is used to declare the sentinel of a list + * :c:func:`INIT_LIST_HEAD(struct list_head *list)` is used to initialize the + sentinel of a list when dynamic allocation is made, by setting the value of + the :c:data:`next` and :c:data:`prev` to list fields. + * :c:func:`list_add(struct list_head *new, struct list_head *head)` adds the + :c:data:`new` element after the :c:data:`head` element. + * :c:func:`list_del(struct list_head *entry)` deletes the item at the + :c:data:`entry` address of the list it belongs to. + * :c:macro:`list_entry(ptr, type, member)` returns the structure with the + type :c:type:`type` that contains the element :c:data:`ptr` from the list, + having the name :c:member:`member` within the structure. + * :c:macro:`list_for_each(pos, head)` iterates over a list using + :c:data:`pos` as a cursor. + * :c:macro:`list_for_each_safe(pos, n, head)` iterates over a list using + :c:data:`pos` as a cursor and :c:data:`n` as a temporary cursor. + This macro is used to delete an item from the list. + +The following code shows how to use these routines: + +.. 
code-block:: c + + #include + #include + + struct pid_list { + pid_t pid; + struct list_head list; + }; + + LIST_HEAD(my_list); + + static int add_pid(pid_t pid) + { + struct pid_list *ple = kmalloc(sizeof *ple, GFP_KERNEL); + + if (!ple) + return -ENOMEM; + + ple->pid = pid; + list_add(&ple->list, &my_list); + + return 0; + } + + static int del_pid(pid_t pid) + { + struct list_head *i, *tmp; + struct pid_list *ple; + + list_for_each_safe(i, tmp, &my_list) { + ple = list_entry(i, struct pid_list, list); + if (ple->pid == pid) { + list_del(i); + kfree(ple); + return 0; + } + } + + return -EINVAL; + } + + static void destroy_list(void) + { + struct list_head *i, *n; + struct pid_list *ple; + + list_for_each_safe(i, n, &my_list) { + ple = list_entry(i, struct pid_list, list); + list_del(i); + kfree(ple); + } + } + +The evolution of the list can be seen in the following figure: + +.. image:: ../res/list_evolution.png + :width: 85% + +You see the stack type behavior introduced by the :c:macro:`list_add` macro, +and the use of a sentinel. + +From the above example, it can be noticed that the way to define and use a list +(double-linked) is generic and, at the same time, it does not introduce an +additional overhead. The :c:type:`struct list_head` is used to maintain the +links between the list elements. It can be noticed that iterating over the list +is also done with this structure, and that retrieving a list element can be done +using :c:macro:`list_entry`. This idea of implementing and using a list is not +new, as it has already been described in The Art of Computer Programming by +Donald Knuth in the 1980s. + +Several kernel list functions and macro definitions are presented and explained +in the :file:`include/linux/list.h` header. + +Spinlock +-------- + +:c:type:`spinlock_t` (defined in :file:`linux/spinlock.h`) is the basic type +that implements the spinlock concept in Linux. 
It describes a spinlock, and the +operations associated with a spinlock are :c:func:`spin_lock_init`, +:c:func:`spin_lock`, :c:func:`spin_unlock`. An example of use is given below: + +.. code-block:: c + + #include + + DEFINE_SPINLOCK(lock1); + spinlock_t lock2; + + spin_lock_init(&lock2); + + spin_lock(&lock1); + /* critical region */ + spin_unlock(&lock1); + + spin_lock(&lock2); + /* critical region */ + spin_unlock(&lock2); + + +In Linux, you can use reader-writer spinlocks, useful for readers-writers +problems. +These types of locks are identified by :c:type:`rwlock_t`, and the functions +that can work on a reader-writer spinlock are +:c:func:`rwlock_init`, +:c:func:`read_lock`, +:c:func:`write_lock`. +An example of use: + + +.. code-block:: c + + #include + + DEFINE_RWLOCK(lock); + + struct pid_list { + pid_t pid; + struct list_head list; + }; + + int have_pid(struct list_head *lh, int pid) + { + struct list_head *i; + void *elem; + + read_lock(&lock); + list_for_each(i, lh) { + struct pid_list *pl = list_entry(i, struct pid_list, list); + if (pl->pid == pid) { + read_unlock(&lock); + return 1; + } + } + read_unlock(&lock); + + return 0; + } + + void add_pid(struct list_head *lh, struct pid_list *pl) + { + write_lock(&lock); + list_add(&pl->list, lh); + write_unlock(&lock); + } + +mutex +----- + +A mutex is a variable of the :c:type:`struct mutex` type (defined in +:file:`linux/mutex.h`). +Functions and macros for working with mutexes are listed below: + +.. code-block:: c + + #include + + /* functions for mutex initialization */ + void mutex_init(struct mutex *mutex); + DEFINE_MUTEX(name); + + /* functions for mutex acquire */ + void mutex_lock(struct mutex *mutex); + + /* functions for mutex release */ + void mutex_unlock(struct mutex *mutex); + +Operations are similar to classic mutex operations in user-space or spinlock +operations: the mutex is acquired before entering the critical region and it is +released after exiting the critical region. 
Unlike spinlocks, these operations +can only be used in process context. + +.. _atomic-variables: + +Atomic variables +---------------- + +Often, you only need to synchronize access to a simple variable, such as a +counter. For this, an :c:type:`atomic_t` type can be used (defined in +:file:`include/linux/atomic.h`), that holds an integer value. Below are some +operations that can be performed on an :c:type:`atomic_t` variable. + +.. code-block:: c + + #include + + void atomic_set(atomic_t *v, int i); + int atomic_read(atomic_t *v); + void atomic_add(int i, atomic_t *v); + void atomic_sub(int i, atomic_t *v); + void atomic_inc(atomic_t *v); + void atomic_dec(atomic_t *v); + int atomic_inc_and_test(atomic_t *v); + int atomic_dec_and_test(atomic_t *v); + int atomic_cmpxchg(atomic_t *v, int old, int new); + +Use of atomic variables +*********************** + +A common way of using atomic variables is to store the status of an action +(e.g. a flag). So we can use an atomic variable to mark exclusive actions. For +example, we consider that an atomic variable can have the LOCKED and UNLOCKED +values, and if the respective variable equals LOCKED then a specific function +should return -EBUSY. +Such an usage is shown schematically in the code below: + +.. code-block:: c + + #define LOCKED 0 + #define UNLOCKED 1 + + static atomic_t flag; + + static int my_acquire(void) + { + int initial_flag; + + /* + * Check if flag is UNLOCKED; if so, lock it and do it atomically. + * + * This is the atomic equivalent of + * if (flag == UNLOCKED) + * flag = LOCKED; + * else + * return -EBUSY; + */ + initial_flag = atomic_cmpxchg(&flag, UNLOCKED, LOCKED); + if (initial_flag == LOCKED) { + printk(KERN_ALERT "Already locked.\n"); + return -EBUSY; + } + + /* Do your thing after getting the lock. */ + [...] + } + + static void my_release(void) + { + /* Release flag; mark it as unlocked. */ + atomic_set(&flag, UNLOCKED); + } + + void my_init(void) + { + [...] 
+ /* Atomic variable is initially unlocked. */ + atomic_set(&flag, UNLOCKED); + + [...] + } + + +The above code is the equivalent of using a trylock (such as +:c:func:`pthread_mutex_trylock`). + +We can also use an atomic variable to store the size of a buffer and for atomic +updates of the respective variable. The code below is such an example: + +.. code-block:: c + + static unsigned char buffer[MAX_SIZE]; + static atomic_t size; + + static void add_to_buffer(unsigned char value) + { + buffer[atomic_read(&size)] = value; + atomic_inc(&size); + } + + static unsigned char remove_from_buffer(void) + { + unsigned char value; + + value = buffer[atomic_read(&size)]; + atomic_dec(&size); + + return value; + } + + static void reset_buffer(void) + { + atomic_set(&size, 0); + } + + void my_init(void) + { + [...] + /* Initialized buffer and size. */ + atomic_set(&size, 0); + memset(buffer, 0, sizeof(buffer)); + + [...] + } + +Atomic bitwise operations +------------------------- + +The kernel provides a set of functions (in :file:`asm/bitops.h`) that modify or +test bits in an atomic way. + +.. code-block:: c + + #include <asm/bitops.h> + + void set_bit(int nr, void *addr); + void clear_bit(int nr, void *addr); + void change_bit(int nr, void *addr); + int test_and_set_bit(int nr, void *addr); + int test_and_clear_bit(int nr, void *addr); + int test_and_change_bit(int nr, void *addr); + +:c:data:`addr` represents the address of the memory area whose bits are being +modified or tested and :c:data:`nr` is the bit on which the operation is +performed. + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: kernel_api + +0. Intro +-------- + +Using |LXR|_ find the definitions of the following symbols in the Linux kernel: + + * :c:type:`struct list_head` + * :c:func:`INIT_LIST_HEAD` + * :c:func:`list_add` + * :c:macro:`list_for_each` + * :c:macro:`list_entry` + * :c:macro:`container_of` + * :c:macro:`offsetof` + +1. 
Memory allocation in Linux kernel +------------------------------------ + +Generate the skeleton for the task named **1-mem** and browse the +contents of the :file:`mem.c` file. Observe the use of :c:func:`kmalloc` +call for memory allocation. + + 1. Compile the source code and load the :file:`mem.ko` module using + :command:`insmod`. + 2. View the kernel messages using the :command:`dmesg` command. + 3. Unload the kernel module using the :command:`rmmod mem` command. + +.. note:: Review the `Memory Allocation`_ section in the lab. + +2. Sleeping in atomic context +----------------------------- + +Generate the skeleton for the task named **2-sched-spin** and browse +the contents of the :file:`sched-spin.c` file. + + 1. Compile the source code and load the module, according the above info: + (:command:`make build` and :command:`make copy`) + 2. Notice that it is waiting for 5 seconds until the insertion + order is complete. + 3. Unload the kernel module. + 4. Look for the lines marked with: ``TODO 0`` to create an atomic + section. Re-compile the source code and reload the module into + the kernel. + +You should now get an error. Look at the stack trace. What is the +cause of the error? + +.. hint:: In the error message, follow the line containing the :c:macro:`BUG` + for a description of the error. You are not allowed to sleep in + atomic context. The atomic context is given by a section + between a lock operation and an unlock on a spinlock. + +.. note:: The + :c:func:`schedule_timeout` function, corroborated with the + :c:macro:`set_current_state` macro, forces the current process to wait + for 5 seconds. + +.. note:: Review the `Contexts of execution`_, `Locking`_ and `Spinlock`_ + sections. + +3. Working with kernel memory +----------------------------- + +Generate the skeleton for the task named **3-memory** directory and +browse the contents of the :file:`memory.c` file. Notice the comments +marked with ``TODO``. 
You must allocate 4 structures of type :c:type:`struct +task_info` and initialize them (in :c:func:`memory_init`), then print and +free them (in :c:func:`memory_exit`). + + 1. (TODO 1) Allocate memory for :c:type:`struct task_info` structure and + initialize its fields: + + * The :c:member:`pid` field to the PID transmitted as a parameter; + * The :c:member:`timestamp` field to the value of the :c:data:`jiffies` + variable, which holds the number of ticks that have occurred since the + system booted. + + 2. (TODO 2) Allocate :c:type:`struct task_info` for the current process, + the parent process, the next process, the next process of the next + process, with the following information: + + * PID of the current process, which can be retrieved from + :c:type:`struct task_struct` structure, returned by :c:macro:`current` + macro. + + .. hint:: + Search for :c:type:`pid` in :c:type:`task_struct`. + + * PID of the parent process of the current process. + + .. hint:: + Search for the relevant field from :c:type:`struct task_struct` + structure. Look after "parent". + + * PID of the next process from the list of processes, relative to the + current process. + + .. hint:: + Use :c:macro:`next_task` macro, which returns a pointer to the next + process (i.e a :c:type:`struct task_struct` structure). + + * PID of the next process of the next process, relative to the current + process. + + .. hint:: + Call the :c:macro:`next_task` macro 2 times. + + 3. (TODO 3) Display the four structures. + + * Use :c:func:`printk` to display their two fields: + :c:member:`pid` and :c:member:`timestamp`. + + 4. (TODO 4) Release the memory occupied by the structures + (use :c:func:`kfree`). + +.. hint:: + * You can access the current process using :c:macro:`current` + macro. + * Look for the relevant fields in the :c:type:`struct task_struct` + structure (:c:member:`pid`, :c:member:`parent`). + * Use the :c:macro:`next_task` macro. The macro returns the pointer to + the next process (ie. 
a :c:type:`struct task_struct*` structure). + +.. note:: The :c:type:`struct task_struct` structure contains two fields to + designate the parent of a task: + + * :c:member:`real_parent` points to the process that created the + task or to process with pid 1 (init) if the parent + completed its execution. + * :c:member:`parent` indicates to the current task parent (the + process that will be reported if the task completes + execution). + + In general, the values of the two fields are the same, but + there are situations where they differ, for example when + using the :c:func:`ptrace` system call. + +.. hint:: Review the `Memory allocation`_ section in the lab. + + +4. Working with kernel lists +---------------------------- + +Generate the skeleton for the task named **4-list**. Browse the +contents of the :file:`list.c` file and notice the comments marked with +``TODO``. The current process will add the four structures from the +previous exercise into a list. The list will be built in the +:c:func:`task_info_add_for_current` function which is called when module is +loaded. The list will be printed and deleted in the :c:func:`list_exit` +function and the :c:func:`task_info_purge_list` function. + + 1. (TODO 1) Complete the :c:func:`task_info_add_to_list` function to allocate + a :c:type:`struct task_info` structure and add it to the list. + + 2. (TODO 2) Complete the :c:func:`task_info_purge_list` function to delete + all the elements in the list. + + 3. Compile the kernel module. Load and unload the module by + following the messages displayed by the kernel. + +.. hint:: Review the labs `Lists`_ section. When deleting items from + the list, you will need to use either the + :c:macro:`list_for_each_safe` or :c:macro:`list_for_each_entry_safe` + macros. + +5. Working with kernel lists for process handling +------------------------------------------------- + +Generate the skeleton for the task named **5-list-full**. 
Browse the +contents of the :file:`list-full.c` and notice comments marked with +``TODO``. In addition to the :file:`4-list` functionality we add the +following: + + * A :c:member:`count` field showing how many times a process has been "added" + to the list. + * If a process is "added" several times, no new entry is created in + the list, but: + + * Update the :c:member:`timestamp` field. + * Increment :c:member:`count`. + + * To implement the counter facility, add a :c:func:`task_info_find_pid` + function that searches for a pid in the existing list. + + * If found, return the reference to the :c:type:`task_info` struct. If + not, return :c:macro:`NULL`. + + * An expiration facility. If a process was added more than 3 + seconds ago and if it does not have a :c:member:`count` greater than 5 then + it is considered expired and is removed from the list. + * The expiration facility is already implemented in the + :c:func:`task_info_remove_expired` function. + + 1. (TODO 1) Implement the :c:func:`task_info_find_pid` function. + 2. (TODO 2) Change a field of an item in the list so it does not + expire. It must not satisfy a part of the expiration condition + from :c:func:`task_info_remove_expired`. + + .. hint:: For ``TODO 2``, extract the first element from the list (the one + referred by :c:member:`head.next`) and set the :c:member:`count` + field to a large enough value. Use :c:func:`atomic_set` function. + + 3. Compile, copy, load and unload the kernel module following the displayed + messages. + Kernel module loading will take some time, because :c:func:`sleep` is + being called by :c:func:`schedule_timeout` function. + +6. Synchronizing list work +-------------------------- + +Generate the skeleton for the task named **6-list-sync**. + + 1. Browse the code and look for ``TODO 1`` string. + 2. Use a spinlock or a read-write lock to synchronize access to the + list. + 3. Compile, load and unload the kernel module. + +.. important:: Always lock data, not code! + +.. 
note:: Read `Spinlock`_ section of the lab. + +7. Test module calling in our list module +----------------------------------------- + +Generate the skeleton for the task named **7-list-test** and browse +the contents of the :file:`list-test.c` file. We'll use it as a test +module. It will call functions exported by the **6-list-sync** +task. The exported functions are the ones marked with **extern** in +:file:`list-test.c` file. + +Uncomment the commented code from :file:`7-list-test.c`. Look for ``TODO 1``. + +To export the above functions from the module located at :file:`6-list-sync/` +directory, the following steps are required: + + 1. Functions must not be static. + 2. Use the :c:macro:`EXPORT_SYMBOL` macro to export the kernel symbols. For + example: :c:macro:`EXPORT_SYMBOL(task_info_remove_expired);`. The + macro must be used for each function after the function is defined. + Browse the code and look for the ``TODO 2`` string in the + :file:`list-sync.c`. + 3. Remove from the module from **6-list-sync** the code that avoids the + expiration of a list item (it is in contradiction to our exercise). + 4. Compile and load the module from :file:`6-list-sync/`. Once loaded, it + exposes exported functions and can be used by the test + module. You can check this by searching for the function names + in :file:`/proc/kallsyms` before and after loading the module. + 5. Compile the test module and then load it. + 6. Use :command:`lsmod` to check that the two modules have been loaded. + What do you notice? + 7. Unload the kernel test module. + +What should be the unload order of the two modules (the module from +**6-list-sync** and the test module)? What happens if you use another order? 
diff --git a/refs/pull/405/merge/_sources/labs/kernel_modules.rst.txt b/refs/pull/405/merge/_sources/labs/kernel_modules.rst.txt new file mode 100644 index 00000000..6b1ca31f --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/kernel_modules.rst.txt @@ -0,0 +1,1345 @@ +============== +Kernel modules +============== + +Lab objectives +============== + +* creating simple modules +* describing the process of kernel module compilation +* presenting how a module can be used with a kernel +* simple kernel debugging methods + +.. + _[SECTION-OVERVIEW-BEGIN] + +Kernel Modules Overview +======================= + +A monolithic kernel, though faster than a microkernel, has the disadvantage of +lack of modularity and extensibility. On modern monolithic kernels, this has +been solved by using kernel modules. A kernel module (or loadable kernel module) +is an object file that contains code that can extend the kernel functionality +at runtime (it is loaded as needed). When a kernel module is no longer needed, +it can be unloaded. Most of the device drivers are used in the form of kernel +modules. + +For the development of Linux device drivers, it is recommended to download the +kernel sources, configure and compile them and then install the compiled version +on the test/development machine. + +.. + _[SECTION-OVERVIEW-END] + +.. + _[SECTION-MODULE-EXAMPLE-BEGIN] + +An example of a kernel module +============================= + +Below is a very simple example of a kernel module. When loading into the kernel, +it will generate the message :code:`"Hi"`. When unloading the kernel module, the +:code:`"Bye"` message will be generated. + +..
code-block:: c + + #include + #include + #include + + MODULE_DESCRIPTION("My kernel module"); + MODULE_AUTHOR("Me"); + MODULE_LICENSE("GPL"); + + static int dummy_init(void) + { + pr_debug("Hi\n"); + return 0; + } + + static void dummy_exit(void) + { + pr_debug("Bye\n"); + } + + module_init(dummy_init); + module_exit(dummy_exit); + + +The generated messages will not be displayed on the console but will be saved +in a specially reserved memory area for this, from where they will be extracted +by the logging daemon (syslog). To display kernel messages, you can use the +:command:`dmesg` command or inspect the logs: + +.. code-block:: bash + + # cat /var/log/syslog | tail -2 + Feb 20 13:57:38 asgard kernel: Hi + Feb 20 13:57:43 asgard kernel: Bye + + # dmesg | tail -2 + Hi + Bye + +.. + _[SECTION-MODULE-EXAMPLE-END] + +.. + _[SECTION-COMPILE-MODULES-BEGIN] + +Compiling kernel modules +======================== + +Compiling a kernel module differs from compiling an user program. First, other +headers should be used. Also, the module should not be linked to libraries. +And, last but not least, the module must be compiled with the same options as +the kernel in which we load the module. For these reasons, there is a standard +compilation method (:code:`kbuild`). This method requires the use of two files: +a :file:`Makefile` and a :file:`Kbuild` file. + +Below is an example of a :file:`Makefile`: + +.. code-block:: bash + + KDIR = /lib/modules/`uname -r`/build + + kbuild: + make -C $(KDIR) M=`pwd` + + clean: + make -C $(KDIR) M=`pwd` clean + +And the example of a :file:`Kbuild` file used to compile a module: + +.. code-block:: bash + + EXTRA_CFLAGS = -Wall -g + + obj-m = modul.o + + +As you can see, calling :command:`make` on the :file:`Makefile` file in the +example shown will result in the :command:`make` invocation in the kernel +source directory (``/lib/modules/`uname -r`/build``) and referring to the +current directory (``M = `pwd```). 
This process ultimately leads to reading +the :file:`Kbuild` file from the current directory and compiling the module +as instructed in this file. + +.. note:: For labs we will configure different :command:`KDIR`, according to + the virtual machine specifications: + + .. code-block:: bash + + KDIR = /home/student/src/linux + [...] + +A :file:`Kbuild` file contains one or more directives for compiling a kernel +module. The easiest example of such a directive is ``obj-m = +module.o``. Following this directive, a kernel module (:code:`ko` - kernel +object) will be created, starting from the ``module.o`` file. ``module.o`` will +be created starting from ``module.c`` or ``module.S``. All of these files can +be found in the :file:`Kbuild`'s directory. + +An example of a :file:`Kbuild` file that uses several sub-modules is shown +below: + +.. code-block:: bash + + EXTRA_CFLAGS = -Wall -g + + obj-m = supermodule.o + supermodule-y = module-a.o module-b.o + +For the example above, the steps to compile are: + + * compile the :file:`module-a.c` and :file:`module-b.c` sources, + resulting in module-a.o and module-b.o objects + * :file:`module-a.o` and :file:`module-b.o` will then be linked + in :file:`supermodule.o` + * from :file:`supermodule.o` will be created :file:`supermodule.ko` + module + + +The suffix of targets in :file:`Kbuild` determines how they are used, as +follows: + + * M (modules) is a target for loadable kernel modules + + * Y (yes) represents a target for object files to be compiled and then + linked to a module (``$(mode_name)-y``) or within the kernel (``obj-y``) + + * any other target suffix will be ignored by :file:`Kbuild` and will not be + compiled + + +.. note:: These suffixes are used to easily configure the kernel by running the + :command:`make menuconfig` command or directly editing the + :file:`.config` file. This file sets a series of variables that are + used to determine which features are added to the kernel at build + time. 
For example, when adding BTRFS support with :command:`make + menuconfig`, add the line :code:`CONFIG_BTRFS_FS = y` to the + :file:`.config` file. The BTRFS kbuild contains the line + ``obj-$(CONFIG_BTRFS_FS):= btrfs.o``, which becomes ``obj-y:= + btrfs.o``. This will compile the :file:`btrfs.o` object and will be + linked to the kernel. Before the variable was set, the line became + ``obj:=btrfs.o`` and so it was ignored, and the kernel was build + without BTRFS support. + +For more details, see the :file:`Documentation/kbuild/makefiles.txt` and +:file:`Documentation/kbuild/modules.txt` files within the kernel sources. + +.. + _[SECTION-COMPILE-MODULES-END] + +.. + _[SECTION-LOAD-MODULES-BEGIN] + +Loading/unloading a kernel module +================================= + +To load a kernel module, use the :command:`insmod` utility. This utility +receives as a parameter the path to the :file:`*.ko` file in which the module +was compiled and linked. Unloading the module from the kernel is done using +the :command:`rmmod` command, which receives the module name as a parameter. + +.. code-block:: bash + + $ insmod module.ko + $ rmmod module.ko + +When loading the kernel module, the routine specified as a parameter of the +``module_init`` macro will be executed. Similarly, when the module is unloaded +the routine specified as a parameter of the ``module_exit`` will be executed. + +A complete example of compiling and loading/unloading a kernel module is +presented below: + +.. code-block:: bash + + faust:~/lab-01/modul-lin# ls + Kbuild Makefile modul.c + + faust:~/lab-01/modul-lin# make + make -C /lib/modules/`uname -r`/build M=`pwd` + make[1]: Entering directory `/usr/src/linux-2.6.28.4' + LD /root/lab-01/modul-lin/built-in.o + CC [M] /root/lab-01/modul-lin/modul.o + Building modules, stage 2. 
+ MODPOST 1 modules + CC /root/lab-01/modul-lin/modul.mod.o + LD [M] /root/lab-01/modul-lin/modul.ko + make[1]: Leaving directory `/usr/src/linux-2.6.28.4' + + faust:~/lab-01/modul-lin# ls + built-in.o Kbuild Makefile modul.c Module.markers + modules.order Module.symvers modul.ko modul.mod.c + modul.mod.o modul.o + + faust:~/lab-01/modul-lin# insmod modul.ko + + faust:~/lab-01/modul-lin# dmesg | tail -1 + Hi + + faust:~/lab-01/modul-lin# rmmod modul + + faust:~/lab-01/modul-lin# dmesg | tail -2 + Hi + Bye + +Information about modules loaded into the kernel can be found using the +:command:`lsmod` command or by inspecting the :file:`/proc/modules`, +:file:`/sys/module` directories. + +.. + _[SECTION-LOAD-MODULES-END] + +.. + _[SECTION-DEBUG-MODULES-BEGIN] + +Kernel Module Debugging +======================= + +Troubleshooting a kernel module is much more complicated than debugging a +regular program. First, a mistake in a kernel module can lead to blocking the +entire system. Troubleshooting is therefore much slowed down. To avoid reboot, +it is recommended to use a virtual machine (qemu, virtualbox, vmware). + +When a module containing bugs is inserted into the kernel, it will eventually +generate a `kernel oops `_. +A kernel oops is an invalid operation detected by the kernel and can only +be generated by the kernel. For a stable kernel version, it almost certainly +means that the module contains a bug. After the oops appears, the kernel will +continue to work. + +When a kernel oops appears, it is very important to save the generated +message. As noted above, messages generated by the kernel are saved in logs and +can be displayed with the :command:`dmesg` command. To make sure that no kernel +message is lost, it is recommended to insert/test the kernel module directly from the +console, or periodically check the kernel messages. Note that an oops +can occur because of a programming error, but also because of a hardware error.
+ +If a fatal error occurs, after which the system can not return to a stable +state, a `kernel panic `_ is +generated. + +Look at the kernel module below that contains a bug that generates an oops: + +.. code-block:: c + + /* + * Oops generating kernel module + */ + + #include + #include + #include + + MODULE_DESCRIPTION ("Oops"); + MODULE_LICENSE ("GPL"); + MODULE_AUTHOR ("PSO"); + + #define OP_READ 0 + #define OP_WRITE 1 + #define OP_OOPS OP_WRITE + + static int my_oops_init (void) + { + int *a; + + a = (int *) 0x00001234; + #if OP_OOPS == OP_WRITE + *a = 3; + #elif OP_OOPS == OP_READ + printk (KERN_ALERT "value = %d\n", *a); + #else + #error "Unknown op for oops!" + #endif + + return 0; + } + + static void my_oops_exit (void) + { + } + + module_init (my_oops_init); + module_exit (my_oops_exit); + +.. ** + +Inserting this module into the kernel will generate an oops: + +.. code-block:: bash + + faust:~/lab-01/modul-oops# insmod oops.ko + [...] + + faust:~/lab-01/modul-oops# dmesg | tail -32 + BUG: unable to handle kernel paging request at 00001234 + IP: [] my_oops_init+0x5/0x20 [oops] + *de = 00000000 + Oops: 0002 [#1] PREEMPT DEBUG_PAGEALLOC + last sysfs file: /sys/devices/virtual/net/lo/operstate + Modules linked in: oops(+) netconsole ide_cd_mod pcnet32 crc32 cdrom [last unloaded: modul] + + Pid: 4157, comm: insmod Not tainted (2.6.28.4 #2) VMware Virtual Platform + EIP: 0060:[] EFLAGS: 00010246 CPU: 0 + EIP is at my_oops_init+0x5/0x20 [oops] + EAX: 00000000 EBX: fffffffc ECX: c89d4300 EDX: 00000001 + ESI: c89d4000 EDI: 00000000 EBP: c5799e24 ESP: c5799e24 + DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 + Process insmod (pid: 4157, ti=c5799000 task=c665c780 task.ti=c5799000) + Stack: + c5799f8c c010102d c72b51d8 0000000c c5799e58 c01708e4 00000124 00000000 + c89d4300 c5799e58 c724f448 00000001 c89d4300 c5799e60 c0170981 c5799f8c + c014b698 00000000 00000000 c5799f78 c5799f20 00000500 c665cb00 c89d4300 + Call Trace: + [] ? _stext+0x2d/0x170 + [] ? 
__vunmap+0xa4/0xf0 + [] ? vfree+0x21/0x30 + [] ? load_module+0x19b8/0x1a40 + [] ? __mutex_unlock_slowpath+0xd5/0x140 + [] ? trace_hardirqs_on_caller+0x106/0x150 + [] ? sys_init_module+0x8a/0x1b0 + [] ? trace_hardirqs_on_caller+0x106/0x150 + [] ? trace_hardirqs_on_thunk+0xc/0x10 + [] ? sysenter_do_call+0x12/0x43 + Code: 05 34 12 00 00 03 00 00 00 5d c3 eb 0d 90 90 90 90 90 90 90 90 + EIP: [] my_oops_init+0x5/0x20 [oops] SS:ESP 0068:c5799e24 + ---[ end trace 2981ce73ae801363 ]--- + +Although relatively cryptic, the message provided by the kernel to the +appearance of an oops provides valuable information about the error. First line: + +.. code-block:: bash + + BUG: unable to handle kernel paging request at 00001234 + EIP: [] my_oops_init + 0x5 / 0x20 [oops] + +Tells us the cause and the address of the instruction that generated the error. +In our case this is an invalid access to memory. + +Next line + + ``Oops: 0002 [# 1] PREEMPT DEBUG_PAGEALLOC`` + +Tells us that it's the first oops (#1). This is important in the context that +an oops can lead to other oopses. Usually only the first oops is relevant. +Furthermore, the oops code (``0002``) provides information about the error type +(see :file:`arch/x86/include/asm/trap_pf.h`): + + + * Bit 0 == 0 means no page found, 1 means protection fault + * Bit 1 == 0 means read, 1 means write + * Bit 2 == 0 means kernel, 1 means user mode + +In this case, we have a write access that generated the oops (bit 1 is 1). + +Below is a dump of the registers. It decodes the instruction pointer (``EIP``) +value and notes that the bug appeared in the :code:`my_oops_init` function with +a 5-byte offset (``EIP: [] my_oops_init+0x5``). The message also +shows the stack content and a backtrace of calls until then. + +If an invalid read call is generated (``#define OP_OOPS OP_READ``), the message +will be the same, but the oops code will differ, which would now be ``0000``: + +.. 
code-block:: bash + + faust:~/lab-01/modul-oops# dmesg | tail -33 + BUG: unable to handle kernel paging request at 00001234 + IP: [] my_oops_init+0x6/0x20 [oops] + *de = 00000000 + Oops: 0000 [#1] PREEMPT DEBUG_PAGEALLOC + last sysfs file: /sys/devices/virtual/net/lo/operstate + Modules linked in: oops(+) netconsole pcnet32 crc32 ide_cd_mod cdrom + + Pid: 2754, comm: insmod Not tainted (2.6.28.4 #2) VMware Virtual Platform + EIP: 0060:[] EFLAGS: 00010292 CPU: 0 + EIP is at my_oops_init+0x6/0x20 [oops] + EAX: 00000000 EBX: fffffffc ECX: c89c3380 EDX: 00000001 + ESI: c89c3010 EDI: 00000000 EBP: c57cbe24 ESP: c57cbe1c + DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 + Process insmod (pid: 2754, ti=c57cb000 task=c66ec780 task.ti=c57cb000) + Stack: + c57cbe34 00000282 c57cbf8c c010102d c57b9280 0000000c c57cbe58 c01708e4 + 00000124 00000000 c89c3380 c57cbe58 c5db1d38 00000001 c89c3380 c57cbe60 + c0170981 c57cbf8c c014b698 00000000 00000000 c57cbf78 c57cbf20 00000580 + Call Trace: + [] ? _stext+0x2d/0x170 + [] ? __vunmap+0xa4/0xf0 + [] ? vfree+0x21/0x30 + [] ? load_module+0x19b8/0x1a40 + [] ? printk+0x0/0x1a + [] ? __mutex_unlock_slowpath+0xd5/0x140 + [] ? trace_hardirqs_on_caller+0x106/0x150 + [] ? sys_init_module+0x8a/0x1b0 + [] ? trace_hardirqs_on_caller+0x106/0x150 + [] ? trace_hardirqs_on_thunk+0xc/0x10 + [] ? sysenter_do_call+0x12/0x43 + Code: 34 12 00 00 c7 04 24 54 30 9c c8 89 44 24 04 e8 58 a0 99 f7 31 + EIP: [] my_oops_init+0x6/0x20 [oops] SS:ESP 0068:c57cbe1c + ---[ end trace 45eeb3d6ea8ff1ed ]--- + +objdump +------- + +Detailed information about the instruction that generated the oops can be found +using the :command:`objdump` utility. Useful options to use are :command:`-d` +to disassemble the code and :command:`-S` for interleaving C code in assembly +language code. For efficient decoding, however, we need the address where the +kernel module was loaded. This can be found in :file:`/proc/modules`. 
+ +Here's an example of using :command:`objdump` on the above module to identify +the instruction that generated the oops: + +.. code-block:: bash + + faust:~/lab-01/modul-oops# cat /proc/modules + oops 1280 1 - Loading 0xc89d4000 + netconsole 8352 0 - Live 0xc89ad000 + pcnet32 33412 0 - Live 0xc895a000 + ide_cd_mod 34952 0 - Live 0xc8903000 + crc32 4224 1 pcnet32, Live 0xc888a000 + cdrom 34848 1 ide_cd_mod, Live 0xc886d000 + + faust:~/lab-01/modul-oops# objdump -dS --adjust-vma=0xc89d4000 oops.ko + + oops.ko: file format elf32-i386 + + + Disassembly of section .text: + + c89d4000 : + #define OP_READ 0 + #define OP_WRITE 1 + #define OP_OOPS OP_WRITE + + static int my_oops_init (void) + { + c89d4000: 55 push %ebp + #else + #error "Unknown op for oops!" + #endif + + return 0; + } + c89d4001: 31 c0 xor %eax,%eax + #define OP_READ 0 + #define OP_WRITE 1 + #define OP_OOPS OP_WRITE + + static int my_oops_init (void) + { + c89d4003: 89 e5 mov %esp,%ebp + int *a; + + a = (int *) 0x00001234; + #if OP_OOPS == OP_WRITE + *a = 3; + c89d4005: c7 05 34 12 00 00 03 movl $0x3,0x1234 + c89d400c: 00 00 00 + #else + #error "Unknown op for oops!" + #endif + + return 0; + } + c89d400f: 5d pop %ebp + c89d4010: c3 ret + c89d4011: eb 0d jmp c89c3020 + c89d4013: 90 nop + c89d4014: 90 nop + c89d4015: 90 nop + c89d4016: 90 nop + c89d4017: 90 nop + c89d4018: 90 nop + c89d4019: 90 nop + c89d401a: 90 nop + c89d401b: 90 nop + c89d401c: 90 nop + c89d401d: 90 nop + c89d401e: 90 nop + c89d401f: 90 nop + + c89d4020 : + + static void my_oops_exit (void) + { + c89d4020: 55 push %ebp + c89d4021: 89 e5 mov %esp,%ebp + } + c89d4023: 5d pop %ebp + c89d4024: c3 ret + c89d4025: 90 nop + c89d4026: 90 nop + c89d4027: 90 nop + +Note that the instruction that generated the oops (``c89d4005`` identified +earlier) is: + + ``C89d4005: c7 05 34 12 00 00 03 movl $ 0x3,0x1234`` + +That is exactly what was expected - storing value 3 at 0x0001234. 
+ +The :file:`/proc/modules` is used to find the address where a kernel module is +loaded. The :command:`--adjust-vma` option allows you to display instructions +relative to ``0xc89d4000``. The :command:`-l` option displays the number of +each line in the source code interleaved with the assembly language code. + +addr2line +--------- + +A simpler way to find the code that generated an oops is to use the +:command:`addr2line` utility: + +.. code-block:: bash + + faust:~/lab-01/modul-oops# addr2line -e oops.o 0x5 + /root/lab-01/modul-oops/oops.c:23 + +Where ``0x5`` is the value of the program counter (``EIP = c89d4005``) that +generated the oops, minus the base address of the module (``0xc89d4000``) +according to :file:`/proc/modules`. + +minicom +------- + +:command:`Minicom` (or other equivalent utilities, e.g. :command:`picocom`, +:command:`screen`) is a utility that can be used to connect and interact with a +serial port. The serial port is the basic method for analyzing kernel messages +or interacting with an embedded system in the development phase. There are two +common ways to connect: + +* a serial port where the device we are going to use is :file:`/dev/ttyS0` + +* a serial USB port (FTDI) in which case the device we are going to use is + :file:`/dev/ttyUSB0`. + +For the virtual machine used in the lab, the device that we need to use is +displayed after the virtual machine starts: + +.. code-block:: bash + + char device redirected to /dev/pts/20 (label virtiocon0) + +Minicom use: + +.. code-block:: bash + + #for connecting via COM1 and using a speed of 115,200 bits per second + minicom -b 115200 -D /dev/ttyS0 + + #For USB serial port connection + minicom -D /dev/ttyUSB0 + + #To connect to the serial port of the virtual machine + minicom -D /dev/pts/20 + +netconsole +---------- + +:command:`Netconsole` is a utility that allows logging of kernel debugging +messages over the network.
This is useful when the disk logging system does not +work or when serial ports are not available or when the terminal does not +respond to commands. :command:`Netconsole` comes in the form of a kernel +module. + +To work, it needs the following parameters: + + * port, IP address, and the source interface name of the debug station + * port, MAC address, and IP address of the machine to which the debug + messages will be sent + +These parameters can be configured when the module is inserted into the kernel, +or even while the module is loaded if it has been compiled with the +``CONFIG_NETCONSOLE_DYNAMIC`` option. + +An example configuration when inserting the :command:`netconsole` kernel module is +as follows: + +.. code-block:: bash + + alice:~# modprobe netconsole netconsole=6666@192.168.191.130/eth0,6000@192.168.191.1/00:50:56:c0:00:08 + +Thus, the debug messages on the station that has the address +``192.168.191.130`` will be sent through the ``eth0`` interface, having source port +``6666``. The messages will be sent to ``192.168.191.1`` with the MAC address +``00:50:56:c0:00:08``, on port ``6000``. + +Messages can be displayed on the destination station using :command:`netcat`: + +.. code-block:: bash + + bob:~ # nc -l -p 6000 -u + +Alternatively, the destination station can configure :command:`syslogd` to +intercept these messages. More information can be found in +:file:`Documentation/networking/netconsole.txt`. + +Printk debugging +---------------- + +``The two oldest and most useful debugging aids are Your Brain and Printf``. + +For debugging, a primitive way is often used, but it is quite effective: +:code:`printk` debugging. Although a debugger can also be used, it is generally +not very useful: simple bugs (uninitialized variables, memory management +problems, etc.) can be easily localized by control messages and the +kernel-decoded oops message.
+ +For more complex bugs, even a debugger can not help us too much unless the +operating system structure is very well understood. When debugging a kernel +module, there are a lot of unknowns in the equation: multiple contexts (we have +multiple processes and threads running at a time), interrupts, virtual +memory, etc. + +You can use :code:`printk` to display kernel messages to user space. It is +similar to :code:`printf`'s functionality; the only difference is that the +transmitted message can be prefixed with a string of :code:`"<n>"`, where +:code:`n` indicates the error level (loglevel) and has values between ``0`` and +``7``. Instead of :code:`"<n>"`, the levels can also be coded by symbolic +constants: + +.. code-block:: c + + KERN_EMERG - n = 0 + KERN_ALERT - n = 1 + KERN_CRIT - n = 2 + KERN_ERR - n = 3 + KERN_WARNING - n = 4 + KERN_NOTICE - n = 5 + KERN_INFO - n = 6 + KERN_DEBUG - n = 7 + + +The definitions of all log levels are found in :file:`linux/kern_levels.h`. +Basically, these log levels are used by the system to route messages sent to +various outputs: console, log files in :file:`/var/log` etc. + +.. note:: To display :code:`printk` messages in user space, the :code:`printk` + log level must be of higher priority than the :code:`console_loglevel` + variable. The default console log level can be configured from + :file:`/proc/sys/kernel/printk`. + + For instance, the command: + + .. code-block:: bash + + echo 8 > /proc/sys/kernel/printk + + will enable all the kernel log messages to be displayed in the + console. That is, the logging level has to be strictly less than the + :code:`console_loglevel` variable. For example, if the + :code:`console_loglevel` has a value of ``5`` (specific to + :code:`KERN_NOTICE`), only messages with loglevel stricter than ``5`` + (i.e. :code:`KERN_EMERG`, :code:`KERN_ALERT`, :code:`KERN_CRIT`, + :code:`KERN_ERR`, :code:`KERN_WARNING`) will be shown.
+ +Console-redirected messages can be useful for quickly viewing the effect of +executing the kernel code, but they are no longer so useful if the kernel +encounters an irreparable error and the system freezes. In this case, the logs +of the system must be consulted, as they keep the information between system +restarts. These are found in :file:`/var/log` and are text files, populated by +:code:`syslogd` and :code:`klogd` during the kernel run. :code:`syslogd` and +:code:`klogd` take the information from the virtual file system mounted in +:file:`/proc`. In principle, with :code:`syslogd` and :code:`klogd` turned on, +all messages coming from the kernel will go to :file:`/var/log/kern.log`. + +A simpler version for debugging is using the :file:`/var/log/debug` file. It +is populated only with the :code:`printk` messages from the kernel with the +:code:`KERN_DEBUG` log level. + +Given that a production kernel (similar to the one we're probably running with) +contains only release code, our module is among the few that send messages +prefixed with KERN_DEBUG . In this way, we can easily navigate through the +:file:`/var/log/debug` information by finding the messages corresponding to a +debugging session for our module. + +Such an example would be the following: + +.. code-block:: bash + + # Clear the debug file of previous information (or possibly a backup) + $ echo "New debug session" > /var/log/debug + # Run the tests + # If there is no critical error causing a panic kernel, check the output + # if a critical error occurs and the machine only responds to a restart, + restart the system and check /var/log/debug. + +The format of the messages must obviously contain all the information of +interest in order to detect the error, but inserting in the code :code:`printk` +to provide detailed information can be as time-consuming as writing the code to +solve the problem. 
This is usually a trade-off between the completeness of the +debugging messages displayed using :code:`printk` and the time it takes to +insert these messages into the code. + +A very simple way, less time-consuming for inserting :code:`printk` and +providing the possibility to analyze the flow of instructions for tests is the +use of the predefined constants :code:`__FILE__`, :code:`__LINE__` and +:code:`__func__`: + + * ``__FILE__`` is replaced by the compiler with the name of the source file + that is currently being compiled. + + * ``__LINE__`` is replaced by the compiler with the line number on which the + current instruction is found in the current source file. + + * ``__func__`` /``__FUNCTION__`` is replaced by the compiler with the name + of the function in which the current instruction is found. + +.. note:: + :code:`__FILE__` and :code:`__LINE__` are part of the ANSI C specifications; + :code:`__func__` is part of specification C99; :code:`__FUNCTION__` is a GNU + :code:`C` extension and is not portable. However, since we write code for the + :code:`Linux` kernel, we can use it without any problems. + +The following macro definition can be used in this case: + +.. code-block:: c + + #define PRINT_DEBUG \ + printk (KERN_DEBUG "[%s]: FUNC:%s: LINE:%d\n", __FILE__, \ + __FUNCTION__, __LINE__) + +Then, at each point where we want to see if it is "reached" in execution, +insert PRINT_DEBUG. This is a simple and quick way, and can yield results by carefully +analyzing the output. + +The :command:`dmesg` command is used to view the messages printed with +:code:`printk` but not appearing on the console. + +To delete all previous messages from a log file, run: + +.. code-block:: bash + + cat /dev/null > /var/log/debug + +To delete messages displayed by the :command:`dmesg` command, run: + +.. code-block:: bash + + dmesg -c + + +Dynamic debugging +----------------- + +Dynamic `dyndbg `_ +debugging enables dynamic debugging activation/deactivation.
+Unlike :code:`printk`, it offers more advanced :code:`printk` options for the +messages we want to display; it is very useful for complex modules or +troubleshooting subsystems. +This significantly reduces the amount of messages displayed, leaving only +those relevant for the debug context. To enable ``dyndbg``, the kernel must be +compiled with the ``CONFIG_DYNAMIC_DEBUG`` option. Once configured, +:code:`pr_debug()`, :code:`dev_dbg()` and :code:`print_hex_dump_debug()`, +:code:`print_hex_dump_bytes()` can be dynamically enabled per call. + +The :file:`/sys/kernel/debug/dynamic_debug/control` file from the debugfs (where +:file:`/sys/kernel/debug` is the path to which debugfs was mounted) is used to +filter messages or to view existing filters. + +.. code-block:: c + + mount -t debugfs none /debug + +`Debugfs `_ +is a simple file system, used as a kernel-space interface and +user-space interface to configure different debug options. Any debug utility +can create and use its own files /folders in debugfs. + +For example, to display existing filters in ``dyndbg``, you will use: + +.. code-block:: bash + + cat /debug/dynamic_debug/control + +And to enable the debug message from line ``1603`` in the :file:`svcsock.c` file: + +.. code-block:: bash + + echo 'file svcsock.c line 1603 +p' > /debug/dynamic_debug/control + +The :file:`/debug/dynamic_debug/control` file is not a regular file. It shows +the ``dyndbg`` settings on the filters. Writing in it with an echo will change +these settings (it will not actually make a write). Be aware that the file +contains settings for ``dyndbg`` debugging messages. Do not log in this file. + +Dyndbg Options +~~~~~~~~~~~~~~ + +* ``func`` - just the debug messages from the functions that have the same + name as the one defined in the filter. + + .. code-block:: bash + + echo 'func svc_tcp_accept +p' > /debug/dynamic_debug/control + +* ``file`` - the name of the file(s) for which we want to display the debug + messages. 
It can be just the source name, but also the absolute path or + kernel-tree path. + + .. code-block:: bash + + file svcsock.c + file kernel/freezer.c + file /usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svcsock.c + +* ``module`` - module name. + + .. code-block:: bash + + module sunrpc + +* ``format`` - only messages whose display format contains the specified string. + + .. code-block:: bash + + format "nfsd: SETATTR" + +* ``line`` - the line or lines for which we want to enable debug calls. + + .. code-block:: bash + + # Triggers debug messages between lines 1603 and 1605 in the svcsock.c file + $ echo 'file svcsock.c line 1603-1605 +p' > /sys/kernel/debug/dynamic_debug/control + # Enables debug messages from the beginning of the file to line 1605 + $ echo 'file svcsock.c line -1605 +p' > /sys/kernel/debug/dynamic_debug/control + +In addition to the above options, a series of flags can be added, removed, or set +with operators ``+``, ``-`` or ``=``: + + * ``p`` activates the pr_debug() . + * ``f`` includes the name of the function in the printed message. + * ``l`` includes the line number in the printed message. + * ``m`` includes the module name in the printed message. + * ``t`` includes the thread id if it is not called from interrupt context + * ``_`` no flag is set. + +KDB: Kernel debugger +-------------------- + +The kernel debugger has proven to be very useful to facilitate the development and +debugging process. One of its main advantages is the possibility to perform live debugging. +This allows us to monitor, in real time, the accesses to memory or even modify the memory +while debugging. +The debugger has been integrated in the mainline kernel starting with version 2.6.26-rci. 
+KDB is not a *source debugger*, but for a complete analysis it can be used in parallel with +gdb and symbol files -- see :ref:`the GDB debugging section ` + +To use KDB, you have the following options: + + * non-usb keyboard + VGA text console + * serial port console + * USB EHCI debug port + +For the lab, we will use a serial interface connected to the host. +The following command will activate GDB over the serial port: + +.. code-block:: bash + + echo hvc0 > /sys/module/kgdboc/parameters/kgdboc + +KDB is a *stop mode debugger*, which means that, while it is active, all the other processes +are stopped. The kernel can be *forced* to enter KDB during execution using the following +`SysRq `__ command + +.. code-block:: bash + + echo g > /proc/sysrq-trigger + +or by using the key combination ``Ctrl+O g`` in a terminal connected to the serial port +(for example using :command:`minicom`). + +KDB has various commands to control and define the context of the debugged system: + + * lsmod, ps, kill, dmesg, env, bt (backtrace) + * dump trace logs + * hardware breakpoints + * modifying memory + +For a better description of the available commands you can use the ``help`` command in +the KDB shell. +In the next example, you can notice a simple KDB usage example which sets a hardware +breakpoint to monitor the changes of the ``mVar`` variable. + +.. code-block:: bash + + # trigger KDB + echo g > /proc/sysrq-trigger + # or if we are connected to the serial port issue + Ctrl-O g + # breakpoint on write access to the mVar variable + kdb> bph mVar dataw + # return from KDB + kdb> go + +.. + _[SECTION-DEBUG-MODULES-END] + +Exercises +========= + +.. _exercises_summary: + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: kernel_modules + +0. Intro +-------- + +Using :command:`cscope` or |LXR|_ find the definitions of the following symbols +in the Linux kernel source code: + +* :c:func:`module_init` and :c:func:`module_exit` + + - what do the two macros do? 
What are ``init_module`` and ``cleanup_module``? + +* :c:data:`ignore_loglevel` + + - What is this variable used for? + +.. warning:: + If you have problems using :command:`cscope`, it is possible that the database + is not generated. To generate it, use the following command in the kernel + directory: + + .. code-block:: bash + + make ARCH=x86 cscope + +.. note:: + When searching for a structure using :command:`cscope`, use only the + structure name (without :code:`struct`). So, to search for the + structure :c:type:`struct module`, you will use the command + + .. code-block:: bash + + vim -t module + + or, in :command:`vim`, the command + + .. code-block:: bash + + :cs f g module + +.. note:: + For more info on using :command:`cscope`, read the + :ref:`cscope section ` in the previous lab. + +.. + _[EXERCISE1-BEGIN] + +1. Kernel module +---------------- + +To work with the kernel modules, we will follow the steps described +:ref:`above `. + +Generate the skeleton for the task named **1-2-test-mod** then build the module +by running the following commands in :file:`tools/labs`. + +.. code-block:: bash + + $ LABS=kernel_modules make skels + $ make build + +These commands will build all the modules in the current +lab skeleton. + +.. warning:: + Until after solving exercise 3, you will get a compilation error for + ``3-error-mod``. To avoid this issue, remove the directory + :file:`skels/kernel_modules/3-error-mod/` and remove the corresponding + line from ``skels/Kbuild``. + +Start the VM using :command:`make console`, and perform the following tasks: + +* load the kernel module. + +* list the kernel modules and check if current module is present + +* unload the kernel module + +* view the messages displayed at loading/unloading the kernel module using + :command:`dmesg` command + +.. note:: Read `Loading/unloading a kernel module`_ section. When unloading + a kernel module, you can specify only the module name + (without extension). + +.. + _[EXERCISE1-END] + +..
+ _[EXERCISE2-BEGIN] + + +2. Printk +--------- + +Watch the virtual machine console. Why were the messages displayed directly +to the virtual machine console? + +Configure the system such that the messages are not displayed directly +on the serial console, and they can only be inspected using ``dmesg``. + +.. hint:: One option is to set the console log level by writing + the desired level to ``/proc/sys/kernel/printk``. + Use a value smaller than the level used for the prints in + the source code of the module. + +Load/unload the module again. +The messages should not be printed to the virtual machine console, +but they should be visible when running ``dmesg``. + +.. + _[EXERCISE2-END] + +.. + _[EXERCISE3-BEGIN] + +3. Error +-------- + +Generate the skeleton for the task named **3-error-mod**. Compile the +sources and get the corresponding kernel module. + +Why have compilation +errors occurred? **Hint:** How does this module differ from the previous module? + +Modify the module to solve the cause of those errors, then compile and test +the module. + +.. + _[EXERCISE3-END] + +.. + _[EXERCISE4-BEGIN] + +4. Sub-modules +-------------- + +Inspect the C source files ``mod1.c`` and ``mod2.c`` in :file:`4-multi-mod/`. +Module 2 contains only the definition of a function used by module 1. + +Change the :file:`Kbuild` file to create the ``multi_mod.ko`` module from the +two C source files. + +.. hint:: Read the `Compiling kernel modules`_ section of the lab. + +Compile, copy, boot the VM, load and unload the kernel module. Make sure messages +are properly displayed on the console. + +.. + _[EXERCISE4-END] + +.. + _[EXERCISE5-BEGIN] + +5. Kernel oops +-------------- + +Enter the directory for the task **5-oops-mod** and inspect the +C source file. Notice where the problem will occur. Add the compilation flag +``-g`` in the Kbuild file. + +.. hint:: Read `Compiling kernel modules`_ section of the lab. + +Compile the corresponding module and load it into the kernel.
Identify the memory +address at which the oops appeared. + +.. hint:: Read `Debugging`_ section of the lab. To identify the + address, follow the oops message and extract the value of + the instruction pointer (``EIP``) register. + +Determine which instruction has triggered the oops. + +.. hint:: Use the :file:`/proc/modules` information to get the load address of + the kernel module. Use, on the physical machine, objdump + and/or addr2line. Objdump needs debugging support for + compilation! Read the lab's `objdump`_ and `addr2line`_ + sections. + +Try to unload the kernel module. Notice that the operation does not +work because there are references from the kernel module within the +kernel since the oops; until the release of those references (which is +almost impossible in the case of an oops), the module cannot be +unloaded. + +.. + _[EXERCISE5-END] + +.. + _[EXERCISE6-BEGIN] + +6. Module parameters +-------------------- + +Enter the directory for the task **6-cmd-mod** and inspect the C +``cmd_mod.c`` source file. Compile and copy the associated module and +load the kernel module to see the printk message. Then unload the +module from the kernel. + +Without modifying the sources, load the kernel module so that the +message shown is ``Early bird gets tired``. + +.. hint:: The str variable can be changed by passing a parameter to + the module. Find more information `here + `_. + +.. _proc-info: + +.. + _[EXERCISE6-END] + +.. + _[EXERCISE7-BEGIN] + +7. Proc info +------------ + +Check the skeleton for the task named **7-list-proc**. Add code to +display the Process ID (``PID``) and the executable name for the current +process. + +Follow the commands marked with ``TODO``. +The information must be displayed both when loading and unloading the +module. + +.. note:: + * In the Linux kernel, a process is described by the + :c:type:`struct task_struct`. Use |LXR|_ or ``cscope`` to find the + definition of :c:type:`struct task_struct`.
+ + * To find the structure field that contains the name of the + executable, look for the "executable" comment. + + * The pointer to the structure of the current process + running at a given time in the kernel is given by the + :c:macro:`current` variable (of the type + :c:type:`struct task_struct*`). + +.. hint:: To use :c:macro:`current` you'll need to include the header + in which the :c:type:`struct task_struct` is defined, i.e + ``linux/sched.h``. + +Compile, copy, boot the VM and load the module. Unload the kernel module. + +Repeat the loading/unloading operation. Note that the PIDs of the +displayed processes differ. This is because a process is created +from the executable :file:`/sbin/insmod` when the module is loaded and +when the module is unloaded a process is created from the executable +:file:`/sbin/rmmod`. + +.. + _[EXERCISE7-END] + +.. + _[EXTRA-EXERCISE-BEGIN] + +Extra Exercises +=============== + +1. KDB +------ + +Go to the **8-kdb** directory. Activate KDB over the serial port and enter KDB +mode using :command:`SysRq`. Connect to the pseudo-terminal linked to virtiocon0 +using :command:`minicom`, configure KDB to use the hvc0 serial port: + +.. code-block:: bash + + echo hvc0 > /sys/module/kgdboc/parameters/kgdboc + +and enable it using SysRq (:command:`Ctrl + O g`). +Review the current system status (:command:`help` to see the available KDB +commands). Continue the kernel execution using the :command:`go` command. + +Load the :file:`hello_kdb` module. +The module will simulate a bug when writing to the :file:`/proc/hello_kdb_bug` +file. To simulate a bug, use the below command: + +.. code-block:: bash + + echo 1 > /proc/hello_kdb_bug + +After running the above command, at every oops/panic the kernel stops the +execution and enters debug mode. + +Analyze the stacktrace and determine the code that generated the bug. +How can we find out from KDB the address where the module was loaded? 
+ +In parallel, use GDB in a new window to view the code based on KDB information. + +.. hint:: + Load the symbol file. Use :command:`info line`. + +When writing to :file:`/proc/hello_kdb_break`, the module will increment the +:c:data:`kdb_write_address` variable. Enter KDB and set a breakpoint for each +write access of the :c:data:`kdb_write_address` variable. +Return to kernel to trigger a write using: + +.. code-block:: bash + + echo 1 > /proc/hello_kdb_break + +2. PS Module +------------ + +Update the created kernel module at :ref:`proc-info` in order to display +information about all the processes in the system, when inserting the kernel +module, not just about the current process. Afterwards, compare the obtained +result with the output of the :command:`ps` command. + +.. hint:: + * Processes in the system are structured in a circular list. + + * :c:macro:`for_each _...` macros (such as :c:macro:`for_each_process`) are + useful when you want to navigate the items in a list. + + * To understand how to use a feature or a macro, use |LXR|_ or Vim and + :command:`cscope` and search for usage scenarios. + +3. Memory Info +-------------- + +Create a kernel module that displays the virtual memory areas of the current +process; for each memory area it will display the start address and the end +address. + +.. hint:: + * Start from an existing kernel module. + + * Investigate the structures :c:type:`struct task_struct`, + :c:type:`struct mm_struct` and :c:type:`struct vm_area_struct`. A + memory area is indicated by a structure of type :c:type:`struct + vm_area_struct`. + + * Don't forget to include the headers where the necessary structures are + defined. + +4. Dynamic Debugging +-------------------- + +Go to the **9-dyndbg** directory and compile the :code:`dyndbg.ko` module. + +Familiarize yourself with the :code:`debugfs` file system mounted in +:file:`/debug` and analyze the contents of the file +:file:`/debug/dynamic_debug/control`. 
Insert the :code:`dyndbg.ko` module and +notice the new content of the :file:`dynamic_debug/control` file. + +What appears extra in the respective file? Run the following command: + +.. code-block:: bash + + grep dyndbg /debug/dynamic_debug/control + +Configure :command:`dyndbg` so that only messages marked as "Important" in +:c:func:`my_debug_func` function are displayed when the module is unloaded. +The exercise will only filter out the :c:func:`pr_debug` calls; :c:func:`printk` +calls being always displayed. + +Specify two ways to filter. + +.. hint:: + Read the `Dynamic debugging`_ section and look at the :command:`dyndbg` + options (for example, :command:`line`, :command:`format`). + +Perform the filtering and revise the :file:`dynamic_debug/control` file. What +has changed? How do you know which calls are activated? + +.. hint:: + Check the :command:`dyndbg` flags. Unload the kernel module and observe the + log messages. + +5. Dynamic Debugging During Initialization +------------------------------------------ + +As you have noticed, :c:func:`pr_debug` calls can only be activated /filtered +after module insertion. In some situations, it might be helpful to view the +messages from the initialization of the module. This can be done by using a +default (fake) parameter called :command:`dyndbg` that can be passed as an +argument to initialize the module. With this parameter you can add /delete +:command:`dyndbg` flags. + +.. hint:: + Read the last part of the `Dynamic debugging`_ section and see the available + flags (e.g.: :command:`+/- p`). + +Read the `Debug Messages section at Module Initialization Time +`_ +and insert the module so that the messages in :c:func:`my_debug_func` (called +:c:func:`dyndbg_init`) are also displayed during initialization. + +.. warning:: + In the VM from the lab, you will need to use :command:`insmod` instead of + :command:`modprobe`. + +Without unloading the module, deactivate :c:func:`pr_debug` calls. + +.. 
hint:: + You can delete the set flags. Unload the kernel module. + +.. + _[EXTRA-EXERCISE-END] diff --git a/refs/pull/405/merge/_sources/labs/kernel_profiling.rst.txt b/refs/pull/405/merge/_sources/labs/kernel_profiling.rst.txt new file mode 100644 index 00000000..34c3810e --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/kernel_profiling.rst.txt @@ -0,0 +1,474 @@ +================ +Kernel Profiling +================ + +Lab Objectives +============== + + * Familiarize yourself with the basics of Linux kernel profiling + * Understanding basic profiling tools + * Learning profiling methodologies and good practices + +Overview +======== + +Up until now we have studied how the different components of the Linux kernel +work, and how to write drivers that interface with them in order to provide +support for devices or protocols. This has helped us understand how the Linux +kernel works, but most people will not get to write kernel drivers. + +Nonetheless, the skills learned will help us to write applications that better +integrate with the whole operating system. In order to do this, one has to have +a good view of both the user space and the kernel space. + +This session aims to merge the work we have done up until now in the kernel +space with real world use cases where we do not write kernel space code, but we +look through the kernel using profiling tools, in order to debug issues that +we're having when writing regular, low-level, applications. + +Another focus of this session will be learning a general methodology for +debugging software issues, and we will approach some tools that give us insight +from the kernel on the way our application runs. + +Profiling Tools +=============== + +The main tool that we will focus our attention on is ``perf``, which offers +support for tracing applications, and also inspecting general aspects of the +system. 
We will also be using debugging tools that most people have used in +their day to day life, such as ``htop``, ``ps``, ``lsof`` and others. + +perf +---- + +``perf`` is a tool that instruments the CPU using +tracepoints, kprobes and uprobes. This tool allows us to take a look at what +functions are being called at a given point. This allows us to take a peek at +where the kernel is spending the most time, print out call stacks of functions, +and in general log what the CPU is running. + +``perf`` integrates modules such as: +* static tracing +* dynamic tracing +* resource monitoring + +The tracing interface that is offered by perf can be used by itself, using the +``perf`` command together with its subcommands. + + +.. code-block:: bash + + root@qemux86:~# ./skels/kernel_profiling/perf + + usage: perf [--version] [--help] [OPTIONS] COMMAND [ARGS] + + The most commonly used perf commands are: + annotate Read perf.data (created by perf record) and display annotated code + archive Create archive with object files with build-ids found in perf.data file + bench General framework for benchmark suites + buildid-cache Manage build-id cache. + buildid-list List the buildids in a perf.data file + c2c Shared Data C2C/HITM Analyzer. + config Get and set variables in a configuration file.
+ data Data file related processing + diff Read perf.data files and display the differential profile + evlist List the event names in a perf.data file + ftrace simple wrapper for kernel's ftrace functionality + inject Filter to augment the events stream with additional information + kallsyms Searches running kernel for symbols + kmem Tool to trace/measure kernel memory properties + kvm Tool to trace/measure kvm guest os + list List all symbolic event types + lock Analyze lock events + mem Profile memory accesses + record Run a command and record its profile into perf.data + report Read perf.data (created by perf record) and display the profile + sched Tool to trace/measure scheduler properties (latencies) + script Read perf.data (created by perf record) and display trace output + stat Run a command and gather performance counter statistics + test Runs sanity tests. + timechart Tool to visualize total system behavior during a workload + top System profiling tool. + version display the version of perf binary + probe Define new dynamic tracepoints + + See 'perf help COMMAND' for more information on a specific command. + +In the output above we can see all of perf's subcommands together with a +description of their functionality, the most significant of which are: + +* ``stat`` - displays statistics such as the number of context switches and page + faults; +* ``top`` - an interactive interface where we can inspect the most frequent + function calls and their caller. This interface allows us direct feedback + while profiling; +* ``list`` - lists the static trace point that we can instrument inside the + kernel. These are useful when trying to get an insight from inside the kernel; +* ``probe`` - add a dynamic trace point that instruments a function call in + order to be recorded by perf; +* ``record`` - records function calls and stack traces based on tracing points + defined by the user; It can also record specific function calls and their + stack traces. 
The record is saved in a file, named ``perf.data`` by default; +* ``report`` - displays the information saved in a perf recording. + +Another way to use perf's interface is through scripts that wrap perf and +offer a higher level way of looking at events or data, without needing to know +the intricacies of the command. An example of this is the ``iosnoop.sh`` script, +which displays what I/O transfers are taking place. + +ps +-- + +``ps`` is the Linux tool that allows us to monitor the processes that are +running at a given time on the machine, including the kernel threads. This is a +simple and easy to use way of checking at a glance what processes are running on +the CPU, and what their CPU and memory usage is. + +In order to list all the processes running, we use the ``ps aux`` command in the +following way: + +.. code-block:: bash + + TODO + root@qemux86:~/skels/kernel_profiling/0-demo# cd + root@qemux86:~# ps aux + USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND + root 1 0.0 0.5 2004 1256 ? Ss 12:06 0:12 init [5] + root 2 0.0 0.0 0 0 ? S 12:06 0:00 [kthreadd] + [...] + root 350 4.5 4.4 11132 10688 hvc0 T 12:07 17:21 ./io-app + root 1358 0.0 0.0 0 0 ? I 14:30 0:00 [kworker/u2:1-e + root 2293 0.1 1.5 5516 3704 ? Ss 18:18 0:00 sshd: root@pts/ + root 2295 0.0 1.3 3968 3232 pts/0 Ss+ 18:19 0:00 -sh + root 2307 0.0 0.0 0 0 ? I 18:19 0:00 [kworker/u2:2-e + root 2350 0.0 0.7 3032 1792 hvc0 R+ 18:26 0:00 ps aux + root 2392 2.6 0.0 0 0 ? D 18:31 0:00 test-script + +One piece of information to note is that the 8th column (``STAT``) represents the state of the +process, ``S`` meaning suspended, ``D`` suspended due to I/O, and ``R`` meaning +running. + +time +---- + +The ``time`` command allows us to inspect the amount of time spent by a +process in I/O, running the application code, or running code in kernel space.
+This can be useful in order to find out whether an application's issue comes +from running too much in kernel space, so it has some overhead when it does +system calls, or the issue is in the user code. + +.. code-block:: c + + root@qemux86:~# time dd if=/dev/urandom of=./test-file bs=1K count=10 + 10+0 records in + 10+0 records out + 10240 bytes (10 kB, 10 KiB) copied, 0.00299749 s, 3.4 MB/s + + real 0m0.020s + user 0m0.001s + sys 0m0.015s + +In the output above we timed the generation of a file using ``dd``. The result +of the timing is displayed at the bottom of output. The values outputted by the +tool are the following: + +* ``real`` - the amount of time has passed from the start of the application to + its finishing; +* ``user`` - time spent running the ``dd`` code; +* ``sys`` - time spent running kernel code on behalf of the process. + +We see that the sum of the ``user`` and ``sys`` values doesn't add up to the +``real`` value. This happens either when the application runs on multiple cores, +in which case the sum might be higher, or the application sleeps, in which case +the sum is lower. + +top +--- + +``top`` is an application that is found on most systems which lists in real time +the applications that are running on the system. ``top`` runs interactively, and +it auto-refreshes its output, as opposed to ``ps``. We use this tool when we +want a high level of continuous monitoring. + +Profiling Methodology +===================== + +When doing profiling, our goal is to identify the cause of a problem. Usually +this problem is observed by someone when their application doesn't work as +expected. When we say that an application did not work as expected, this can +mean different things for different people. For example, one person might +complain that the application has a slowdown, while another might say that the +application runs on the CPU, but it doesn't output anything. 
+ +The first step in any problem solving context is to understand the default +behaviour of the application we're trying to debug, and to make sure that it is +now not running within the expected parameters. + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: kernel_profiling + +.. note:: + + This session will require us to use the ``perf`` tracing tool. When running + natively on our systems, we have to install the + ``linux-tools-<version>-generic`` package using a package manager in order + to run it. Because in our virtual machine we don't have access to a package + manager, we will be downloading the ``perf`` binary from `this + `_ link. Download the application in + the ``skels/kernel_profiling`` directory, and grant it execution + permissions. + +.. warning:: + + When running ``perf``, make sure that you're running the downloaded version, + not the version in the ``PATH`` variable. + +.. note:: + + When going through this session's exercises, we will have to run commands in + parallel. In order to do this, we will have to connect to the virtual machine + using SSH. We recommend using the ``core-image-sato-sdk-qemu`` image, since it + has the tools that we need. To run the virtual machine using the + ``core-image-sato-sdk-qemu`` file system, uncomment line 16 in the + ``qemu/Makefile`` file. + +.. note:: + + If you wish to run the ``perf-tools`` based scripts that we have included in + the repository, such as ``iosnoop.sh``, you will have to grant them execution + privileges, in order to be copied to the virtual machine file system. + +.. note:: + + In order to improve the course of SO2, its components and the way it is + conducted, your opinions are very useful to us. Please fill in the feedback form + on `curs.upb.ro platform `_. + + The form is anonymous and is active between May 22 and June 2, 2023. The + results will be visible to the SO2 team after all the grades have been + marked.
+ + We invite you to evaluate the activity of the SO2 team and specify its + strengths and weaknesses and your suggestions for improving the subject. + Your feedback is very important to us to increase the quality of the subject + in the coming years. + + We are particularly interested in: + + * What did you not like and what do you think did not go well? + * Why didn't you like it and why do you think it didn't go well? + * What should we do to make things better? + +0. Demo: Profiling I/O Problems +=============================== + +When working with I/O, we have to keep in mind that it is one of the slowest +systems in the operating system, compared to memory, which is an order of +magnitude faster, and scheduling, which deals with what is currently running on +the CPU. + +Because of this, I/O operations have to be thought out, because you might starve +your application by saturating the system with requests. Another issue that you +might face is that the I/O's slow speed might affect your application's +responsiveness, if it waits for the I/O operations to finish. + +Let's take a look at an application and debug its issues. + +We are going to run the ``io-app`` application, from the ``0-demo`` directory. + +In order to inspect what is running on the CPU, and look at the stack of the +process, we can use the ``perf record`` subcommand in the following way: + +.. code-block:: bash + + root@qemux86:~# ./perf record -a -g + Couldn't synthesize bpf events. + ^C[ perf record: Woken up 7 times to write data ] + [ perf record: Captured and wrote 1.724 MB perf.data (8376 samples) ] + + +perf will record values indefinitely, but we can close it using the ``Ctrl+c`` +hotkey. We used the ``-a`` option in order to probe all CPUs, and the ``-g`` option, +which records the whole call stack.
+ +To visualize the recorded information, we will use the ``perf report`` command, +which will bring up a pager which will display the most frequent function calls +that were found on the CPU, and their call stack. + +.. code-block:: bash + + root@qemux86:~# ./perf report --header -F overhead,comm,parent + # Total Lost Samples: 0 + # + # Samples: 8K of event 'cpu-clock:pppH' + # Event count (approx.): 2094000000 + # + # Overhead Command Parent symbol + # ........ ............... ............. + # + 58.63% io-app [other] + | + --58.62%--__libc_start_main + main + __kernel_vsyscall + | + --58.61%--__irqentry_text_end + do_SYSENTER_32 + do_fast_syscall_32 + __noinstr_text_start + __ia32_sys_write + ksys_write + vfs_write + | + --58.60%--ext4_file_write_iter + ext4_buffered_write_iter + [...] + +We have used the ``--header`` option in order to print the table header, and ``-F +overhead,comm,parent`` in order to print, for each call stack, the overhead +percentage, the command and the parent symbol. + +We can see that the ``io-app`` command is doing some writes in the file system, +and this contributes to much of the load on the system. + +Armed with this information, we know that there are many I/O calls being done by +the application. In order to look at the size of these requests, we can use the +``iosnoop.sh`` script in order to see how big these requests are. + +.. code-block:: bash + + root@qemux86:~/skels/kernel_profiling# ./iosnoop.sh 1 + Tracing block I/O. Ctrl-C to end.
+ COMM PID TYPE DEV BLOCK BYTES LATms + io-app 889 WS 254,0 4800512 1310720 2.10 + io-app 889 WS 254,0 4803072 1310720 2.04 + io-app 889 WS 254,0 4805632 1310720 2.03 + io-app 889 WS 254,0 4808192 1310720 2.43 + io-app 889 WS 254,0 4810752 1310720 3.48 + io-app 889 WS 254,0 4813312 1310720 3.46 + io-app 889 WS 254,0 4815872 524288 1.03 + io-app 889 WS 254,0 5029888 1310720 5.82 + io-app 889 WS 254,0 5032448 786432 5.80 + jbd2/vda-43 43 WS 254,0 2702392 8192 0.22 + kworker/0:1H 34 WS 254,0 2702408 4096 0.40 + io-app 889 WS 254,0 4800512 1310720 2.60 + io-app 889 WS 254,0 4803072 1310720 2.58 + [...] + +From this output we see that the ``io-app`` is writing in a loop from the fact +that the first block ``4800512`` is repeating, and that it is doing big writes, +since it is writing about one megabyte per request. This constant looping adds the +load to the system that we're experiencing. + +1. Investigating Reduced Responsiveness +--------------------------------------- + +The ``io.ko`` module, located in the ``kernel_profiling/1-io`` directory, +decreases the system's responsiveness when inserted. We see that the command +line stutters when typing commands, but when running top, we see that the +system's load is not high, and there aren't any processes that are hogging +resources. + +Find out what the ``io.ko`` module is doing and why it is leading to the +stuttering effect that we experience. + +.. hint:: + + Trace all the functions being called and check where the CPU is + spending most of its time. In order to do this, you can run either ``perf + record`` and ``perf report`` to view the output, or ``perf top``. + +2. Launching New Threads +------------------------ + +We want to run the same function in a loop 100 times in parallel. We have +implemented two solutions inside the ``scheduling`` binary file, located in the +``kernel_profiling/2-scheduling`` directory. + +When executing the ``scheduling`` binary, it prints a message in parallel from +100 running instances.
We can tune this execution by running the application +either with the first parameter ``0`` or ``1``. + +Find out which solution is better, and why. + +3. Tuning ``cp`` +---------------- + +Our goal is to write a copy of the ``cp`` tool integrated in Linux, which has +been implemented by the ``memory`` binary, in the ``kernel_profiling/3-memory`` +directory. It implements two approaches that we can take for the copy operation: + +* reading the contents of the source file in a buffer in memory using the + ``read()`` system call, and writing that buffer to the destination file using + the ``write()`` system call; +* mapping the source and destination files to memory using the ``mmap`` system + call, and copying the contents of the source file to the destination in + memory. + +Another tunable parameter that we're going to use is the block size of the copies +that we're going to make, either through reads/writes or in memory. + +1) Investigate which of the two copying mechanisms is faster. For this step, you +will use the 1024 block size. + +2) Once you have found which copying mechanism is faster, change the block size +parameter and see which value gives you the best copies. Why? + +4. I/O Latency +-------------- + +We have written a module that reads the content of a disk. When inserting the ``bio.ko`` +module, located in the ``4-bio`` directory, we see a large spike in the system's +load, as can be seen in the ``top`` command, but we see that the system is still +responsive. + +Investigate what is causing the increased load to the system. Is it an I/O issue, +or is it a scheduling issue? + +.. hint:: + + Try to trace the I/O operations using ``perf``, or use the + ``iosnoop.sh`` script in order to inspect what I/O is happening at a + certain point. + +5. Bad ELF +---------- + +.. note:: + + This is a bonus exercise that has been tested on a native Linux system. + It may run under the QEMU virtual machine, but the behavior was weird in our testing.
+ We recommend you use a native (or VirtualBox or VMware) Linux system. + +We managed to build (as part of a `Unikraft `__ build) an ELF file that is valid when doing static analysis, but that can't be executed. +The file is ``bad_elf``, located in the ``5-bad-elf/`` folder. + +Running it triggers a *segmentation fault* message. +Running it using ``strace`` shows an error with ``execve()``. + +.. code:: + + ... skels/kernel_profiling/5-bad-elf$ ./bad_elf + Segmentation fault + + ... skels/kernel_profiling/5-bad-elf$ strace ./bad_elf + execve("./bad_elf", ["./bad_elf"], 0x7ffc3349ba50 /* 70 vars \*/) = -1 EINVAL (Invalid argument) + --- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=NULL} --- + +++ killed by SIGSEGV +++ + Segmentation fault (core dumped) + +The ELF file itself is valid: + +.. code:: + + ... skels/kernel_profiling/5-bad-elf$ readelf -a bad_elf + +The issue is to be detected in the kernel. + +Use either ``perf``, or, better yet `ftrace `__ to inspect the kernel function calls done by the program. +Identify the function call that sends out the ``SIGSEGV`` signal. +Identify the cause of the issue. +Find that cause in the `manual page elf(5) `__.
diff --git a/refs/pull/405/merge/_sources/labs/memory_mapping.rst.txt b/refs/pull/405/merge/_sources/labs/memory_mapping.rst.txt new file mode 100644 index 00000000..e9d15e15 --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/memory_mapping.rst.txt @@ -0,0 +1,499 @@ +============== +Memory mapping +============== + +Lab objectives +============== + +* Understand address space mapping mechanisms +* Learn about the most important structures related to memory management + +Keywords: + +* address space +* :c:func:`mmap` +* :c:type:`struct page` +* :c:type:`struct vm_area_struct` +* :c:type:`struct vm_struct` +* :c:func:`remap_pfn_range` +* :c:func:`SetPageReserved` +* :c:func:`ClearPageReserved` + + +Overview +======== + +In the Linux kernel it is possible to map a kernel address space to a +user address space. This eliminates the overhead of copying user space +information into the kernel space and vice versa. This can be done +through a device driver and the user space device interface +(:file:`/dev`). + +This feature can be used by implementing the :c:func:`mmap` operation +in the device driver's :c:type:`struct file_operations` and using the +:c:func:`mmap` system call in user space. + +The basic unit for virtual memory management is a page, whose size is +usually 4K, but it can be up to 64K on some platforms. Whenever we +work with virtual memory we work with two types of addresses: virtual +address and physical address. All CPU access (including from kernel +space) uses virtual addresses that are translated by the MMU into +physical addresses with the help of page tables. + +A physical page of memory is identified by the Page Frame Number +(PFN). The PFN can be easily computed from the physical address by +dividing it by the size of the page (or by shifting the physical +address by PAGE_SHIFT bits to the right). + +.. image:: ../res/paging.png + :width: 49 % + +For efficiency reasons, the virtual address space is divided into +user space and kernel space.
For the same reason, the kernel space +contains a memory mapped zone, called **lowmem**, which is contiguously +mapped in physical memory, starting from the lowest possible physical +address (usually 0). The virtual address where lowmem is mapped is +defined by :c:macro:`PAGE_OFFSET`. + +On a 32bit system, not all available memory can be mapped in lowmem and +because of that there is a separate zone in kernel space called +**highmem** which can be used to arbitrarily map physical memory. + +Memory allocated by :c:func:`kmalloc` resides in lowmem and it is +physically contiguous. Memory allocated by :c:func:`vmalloc` is not +contiguous and does not reside in lowmem (it has a dedicated zone in +highmem). + +.. image:: ../res/kernel-virtmem-map.png + :width: 49 % + +Structures used for memory mapping +================================== + +Before discussing about the memory mapping mechanism over a device, +we will present some of the basic structures used by the Linux memory +management subsystem. +Some of the basic structures are: :c:type:`struct page`, +:c:type:`struct vm_area_struct`, :c:type:`struct mm_struct`. + +:c:type:`struct page` +--------------------- + +:c:type:`struct page` is used to embed information about all physical +pages in the system. The kernel has a :c:type:`struct page` structure +for all pages in the system. 
+ +There are many functions that interact with this structure: + +* :c:func:`virt_to_page` returns the page associated with a virtual + address +* :c:func:`pfn_to_page` returns the page associated with a page frame + number +* :c:func:`page_to_pfn` returns the page frame number associated with a + :c:type:`struct page` +* :c:func:`page_address` returns the virtual address of a + :c:type:`struct page`; this function can be called only for pages from + lowmem +* :c:func:`kmap` creates a mapping in kernel for an arbitrary physical + page (can be from highmem) and returns a virtual address that can be + used to directly reference the page + +:c:type:`struct vm_area_struct` +------------------------------- + +:c:type:`struct vm_area_struct` holds information about a contiguous +virtual memory area. The memory areas of a process can be viewed by +inspecting the *maps* attribute of the process via procfs: + +.. code-block:: shell + + root@qemux86:~# cat /proc/1/maps + #address perms offset device inode pathname + 08048000-08050000 r-xp 00000000 fe:00 761 /sbin/init.sysvinit + 08050000-08051000 r--p 00007000 fe:00 761 /sbin/init.sysvinit + 08051000-08052000 rw-p 00008000 fe:00 761 /sbin/init.sysvinit + 092e1000-09302000 rw-p 00000000 00:00 0 [heap] + 4480c000-4482e000 r-xp 00000000 fe:00 576 /lib/ld-2.25.so + 4482e000-4482f000 r--p 00021000 fe:00 576 /lib/ld-2.25.so + 4482f000-44830000 rw-p 00022000 fe:00 576 /lib/ld-2.25.so + 44832000-449a9000 r-xp 00000000 fe:00 581 /lib/libc-2.25.so + 449a9000-449ab000 r--p 00176000 fe:00 581 /lib/libc-2.25.so + 449ab000-449ac000 rw-p 00178000 fe:00 581 /lib/libc-2.25.so + 449ac000-449af000 rw-p 00000000 00:00 0 + b7761000-b7763000 rw-p 00000000 00:00 0 + b7763000-b7766000 r--p 00000000 00:00 0 [vvar] + b7766000-b7767000 r-xp 00000000 00:00 0 [vdso] + bfa15000-bfa36000 rw-p 00000000 00:00 0 [stack] + +A memory area is characterized by a start address, a stop address, +length, permissions.
+ +A :c:type:`struct vm_area_struct` is created at each :c:func:`mmap` +call issued from user space. A driver that supports the :c:func:`mmap` +operation must complete and initialize the associated +:c:type:`struct vm_area_struct`. The most important fields of this +structure are: + +* :c:member:`vm_start`, :c:member:`vm_end` - the beginning and the end of + the memory area, respectively (these fields also appear in + :file:`/proc/<pid>/maps`); +* :c:member:`vm_file` - the pointer to the associated file structure (if any); +* :c:member:`vm_pgoff` - the offset of the area within the file; +* :c:member:`vm_flags` - a set of flags; +* :c:member:`vm_ops` - a set of working functions for this area +* :c:member:`vm_next`, :c:member:`vm_prev` - the areas of the same process + are chained by a list structure + +:c:type:`struct mm_struct` +-------------------------- + +:c:type:`struct mm_struct` encompasses all memory areas associated +with a process. The :c:member:`mm` field of :c:type:`struct task_struct` +is a pointer to the :c:type:`struct mm_struct` of the current process. + + +Device driver memory mapping +============================ + +Memory mapping is one of the most interesting features of a Unix +system. From a driver's point of view, the memory-mapping facility +allows direct memory access to a user space device. + +To assign a :c:func:`mmap` operation to a driver, the :c:member:`mmap` +field of the device driver's :c:type:`struct file_operations` must be +implemented. If that is the case, the user space process can then use +the :c:func:`mmap` system call on a file descriptor associated with +the device. + +The mmap system call takes the following parameters: + +.. code-block:: c + + void *mmap(caddr_t addr, size_t len, int prot, + int flags, int fd, off_t offset); + +To map memory between a device and user space, the user process must +open the device and issue the :c:func:`mmap` system call with the resulting +file descriptor.
+ +The device driver :c:func:`mmap` operation has the following signature: + +.. code-block:: c + + int (*mmap)(struct file *filp, struct vm_area_struct *vma); + +The *filp* parameter is a pointer to a :c:type:`struct file` created when +the device is opened from user space. The *vma* parameter is used to +indicate the virtual address space where the memory should be mapped +by the device. A driver should allocate memory (using +:c:func:`kmalloc`, :c:func:`vmalloc`, :c:func:`alloc_pages`) and then +map it to the user address space as indicated by the *vma* parameter +using helper functions such as :c:func:`remap_pfn_range`. + +:c:func:`remap_pfn_range` will map a contiguous physical address space +into the virtual space represented by :c:type:`vm_area_struct`: + +.. code-block:: c + + int remap_pfn_range (struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot); + +:c:func:`remap_pfn_range` expects the following parameters: + +* *vma* - the virtual memory space in which mapping is made; +* *addr* - the virtual address space from where remapping begins; page + tables for the virtual address space between addr and addr + size + will be formed as needed +* *pfn* - the page frame number to which the virtual address should be + mapped +* *size* - the size (in bytes) of the memory to be mapped +* *prot* - protection flags for this mapping + +Here is an example of using this function that contiguously maps the +physical memory starting at page frame number *pfn* (memory that was +previously allocated) to the *vma->vm_start* virtual address: + +.. code-block:: c + + struct vm_area_struct *vma; + unsigned long len = vma->vm_end - vma->vm_start; + int ret ; + + ret = remap_pfn_range(vma, vma->vm_start, pfn, len, vma->vm_page_prot); + if (ret < 0) { + pr_err("could not map the address area\n"); + return -EIO; + } + +To obtain the page frame number of the physical memory we must +consider how the memory allocation was performed.
For each +:c:func:`kmalloc`, :c:func:`vmalloc`, :c:func:`alloc_pages`, we must +use a different approach. For :c:func:`kmalloc` we can use something +like: + +.. code-block:: c + + static char *kmalloc_area; + + unsigned long pfn = virt_to_phys((void *)kmalloc_area)>>PAGE_SHIFT; + +while for :c:func:`vmalloc`: + +.. code-block:: c + + static char *vmalloc_area; + + unsigned long pfn = vmalloc_to_pfn(vmalloc_area); + +and finally for :c:func:`alloc_pages`: + +.. code-block:: c + + struct page *page; + + unsigned long pfn = page_to_pfn(page); + +.. attention:: Note that memory allocated with :c:func:`vmalloc` is not + physically contiguous so if we want to map a range allocated + with :c:func:`vmalloc`, we have to map each page individually + and compute the physical address for each page. + +Since the pages are mapped to user space, they might be swapped +out. To avoid this we must set the PG_reserved bit on the page. +Enabling is done using :c:func:`SetPageReserved` while resetting it +(which must be done before freeing the memory) is done with +:c:func:`ClearPageReserved`: + +.. code-block:: c + + char *alloc_mmap_pages(int npages) + { + int i; + char *mem = kmalloc(PAGE_SIZE * npages, GFP_KERNEL); + + if (!mem) + return mem; + + for(i = 0; i < npages * PAGE_SIZE; i += PAGE_SIZE) + SetPageReserved(virt_to_page(((unsigned long)mem) + i)); + + return mem; + } + + void free_mmap_pages(void *mem, int npages) + { + int i; + + for(i = 0; i < npages * PAGE_SIZE; i += PAGE_SIZE) + ClearPageReserved(virt_to_page(((unsigned long)mem) + i)); + + kfree(mem); + } + + +Further reading +=============== + +* `Linux Device Drivers 3rd Edition - Chapter 15. Memory Mapping and DMA `_ +* `Linux Device Driver mmap Skeleton `_ +* `Driver porting: supporting mmap () `_ +* `Device Drivers Concluded `_ +* `mmap `_ + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: memory_mapping + +1.
Mapping contiguous physical memory to userspace +-------------------------------------------------- + +Implement a device driver that maps contiguous physical memory +(e.g. obtained via :c:func:`kmalloc`) to userspace. + +Review the `Device driver memory mapping`_ section, generate the +skeleton for the task named **kmmap** and fill in the areas marked +with **TODO 1**. + +Start by allocating a memory area of NPAGES+2 pages using :c:func:`kmalloc` +in the module init function and find the first address in the area that is +aligned to a page boundary. + +.. hint:: The size of a page is *PAGE_SIZE*. + + Store the allocated area in *kmalloc_ptr* and the page + aligned address in *kmalloc_area*: + + Use :c:func:`PAGE_ALIGN` to determine *kmalloc_area*. + +Enable the PG_reserved bit of each page with +:c:func:`SetPageReserved`. Clear the bit with +:c:func:`ClearPageReserved` before freeing the memory. + +.. hint:: Use :c:func:`virt_to_page` to translate virtual pages into + physical pages, as required by :c:func:`SetPageReserved` + and :c:func:`ClearPageReserved`. + +For verification purposes (using the test below), fill in the first 4 +bytes of each page with the following values: 0xaa, 0xbb, 0xcc, 0xdd. + +Implement the :c:func:`mmap` driver function. + +.. hint:: For mapping, use :c:func:`remap_pfn_range`. The third + argument for :c:func:`remap_pfn_range` is a page frame number (PFN). + + To convert from virtual kernel address to physical address, + use :c:func:`virt_to_phys`. + + To convert a physical address to its PFN, shift the address + by PAGE_SHIFT bits to the right. + +For testing, load the kernel module and run: + +.. code-block:: shell + + root@qemux86:~# skels/memory_mapping/test/mmap-test 1 + +If everything goes well, the test will show "matched" messages. + +2. Mapping non-contiguous physical memory to userspace +------------------------------------------------------ + +Implement a device driver that maps non-contiguous physical memory +(e.g.
obtained via :c:func:`vmalloc`) to userspace. + +Review the `Device driver memory mapping`_ section, generate the +skeleton for the task named **vmmap** and fill in the areas marked +with **TODO 1**. + +Allocate a memory area of NPAGES pages with :c:func:`vmalloc`. + +.. hint:: The size of a page is *PAGE_SIZE*. + Store the allocated area in *vmalloc_area*. + Memory allocated by :c:func:`vmalloc` is page aligned. + +Enable the PG_reserved bit of each page with +:c:func:`SetPageReserved`. Clear the bit with +:c:func:`ClearPageReserved` before freeing the memory. + +.. hint:: Use :c:func:`vmalloc_to_page` to translate virtual pages + into physical pages used by the functions + :c:func:`SetPageReserved` and :c:func:`ClearPageReserved`. + +For verification purposes (using the test below), fill in the first 4 +bytes of each page with the following values: 0xaa, 0xbb, 0xcc, 0xdd. + +Implement the mmap driver function. + +.. hint:: To convert from virtual vmalloc address to physical address, + use :c:func:`vmalloc_to_pfn` which returns a PFN directly. + +.. attention:: vmalloc pages are not physically contiguous so you + need to call :c:func:`remap_pfn_range` for each page. + + Loop through all virtual pages and for each: + * determine the physical address + * map it with :c:func:`remap_pfn_range` + + Make sure that you determine the physical address + each time and that you use a range of one page for mapping. + +For testing, load the kernel module and run: + +.. code-block:: shell + + root@qemux86:~# skels/memory_mapping/test/mmap-test 1 + +If everything goes well, the test will show "matched" messages. + +3. Read / write operations in mapped memory +------------------------------------------- + +Modify one of the previous modules to allow read / write operations on +your device. This is a didactic exercise to see that the same space +can also be used with the :c:func:`mmap` call and with :c:func:`read` +and :c:func:`write` calls. + +Fill in areas marked with **TODO 2**.
+ +.. note:: The offset parameter sent to the read / write operation can + be ignored as all reads / writes from the test program will + be done with 0 offsets. + +For testing, load the kernel module and run: + +.. code-block:: shell + + root@qemux86:~# skels/memory_mapping/test/mmap-test 2 + + +4. Display memory mapped in procfs +---------------------------------- + +Using one of the previous modules, create a procfs file in which you +display the total memory mapped by the calling process. + +Fill in the areas marked with **TODO 3**. + +Create a new entry in procfs (:c:macro:`PROC_ENTRY_NAME`, defined in +:file:`mmap-test.h`) that will show the total memory mapped by the process +that called the :c:func:`read` on that file. + +.. hint:: Use :c:func:`proc_create`. For the mode parameter, use 0, + and for the parent parameter use NULL. Use + :c:func:`my_proc_file_ops` for operations. + +In the module exit function, delete the :c:macro:`PROC_ENTRY_NAME` entry +using :c:func:`remove_proc_entry`. + +.. note:: A (complex) use and description of the :c:type:`struct + seq_file` interface can be found here in this `example + `_ . + + For this exercise, just a simple use of the interface + described `here `_ is + sufficient. Check the "extra-simple" API described there. + +In the :c:func:`my_seq_show` function you will need to: + +* Obtain the :c:type:`struct mm_struct` structure of the current process + using the :c:func:`get_task_mm` function. + + .. hint:: The current process is available via the *current* variable + of type :c:type:`struct task_struct*`. + +* Iterate through the entire :c:type:`struct vm_area_struct` list + associated with the process. + + .. hint:: Use the variable :c:data:`vma_iterator` and start from + :c:data:`mm->mmap`. Use the :c:member:`vm_next` field of + the :c:type:`struct vm_area_struct` to navigate through + the list of memory areas. Stop when you reach :c:macro:`NULL`. 
+ +* Use *vm_start* and *vm_end* for each area to compute the total size. + +* Use :c:func:`pr_info("%lx %lx\n", ...)` to print *vm_start* and *vm_end* for + each area. + +* To release :c:type:`struct mm_struct`, decrement the reference + counter of the structure using :c:func:`mmput`. + +* Use :c:func:`seq_printf` to write to the file. Show only the total count, + no other messages. Do not even show newline (\n). + +In :c:func:`my_seq_open` register the display function +(:c:func:`my_seq_show`) using :c:func:`single_open`. + +.. note:: :c:func:`single_open` can use :c:macro:`NULL` as its third argument. + +For testing, load the kernel module and run: + +.. code-block:: shell + + root@qemux86:~# skels/memory_mapping/test/mmap-test 3 + +.. note:: The test waits for a while (it has an internal sleep + instruction). As long as the test waits, use the + :command:`pmap` command in another console to see the + mappings of the test and compare those to the test results. diff --git a/refs/pull/405/merge/_sources/labs/networking.rst.txt b/refs/pull/405/merge/_sources/labs/networking.rst.txt new file mode 100644 index 00000000..3eab836d --- /dev/null +++ b/refs/pull/405/merge/_sources/labs/networking.rst.txt @@ -0,0 +1,1262 @@ +============================ +Networking +============================ + +Lab objectives +============== + + * Understanding the Linux kernel networking architecture + * Acquiring practical IP packet management skills using a packet filter or + firewall + * Familiarizing yourself with how to use sockets at the Linux kernel level + +Overview +======== + +The development of the Internet has led to an exponential increase in network +applications and, as a consequence, to increasing the speed and productivity +requirements of an operating system's networking subsystem. The networking +subsystem is not an essential component of an operating system kernel (the Linux +kernel can be compiled without networking support).
It is, however, quite +unlikely for a computing system (or even an embedded device) to have a +non-networked operating system due to the need for connectivity. Modern operating +systems use the `TCP/IP stack +`_. Their kernel +implements protocols up to the transport layer, while application layer protocols +are typically implemented in user space (HTTP, FTP, SSH, etc.). + +Networking in user space +------------------------ + +In user space the abstraction of network communication is the socket. The +socket abstracts a communication channel and is the kernel-based TCP/IP stack +interaction interface. An IP socket is associated with an IP address, the +transport layer protocol used (TCP, UDP etc) and a port. Common function calls +that use sockets are: creation (``socket``), initialization +(``bind``), connecting (``connect``), waiting for a connection +(``listen``, ``accept``), closing a socket (``close``). + +Network communication is accomplished via ``read``/``write`` or ``recv``/``send`` calls +for TCP sockets and ``recvfrom``/``sendto`` for UDP sockets. Transmission and +reception operations are transparent to the application, leaving encapsulation +and transmission over network at the kernel's discretion. However, it is +possible to implement the TCP/IP stack in user space using raw sockets (the +``PF_PACKET`` option when creating a socket), or implementing an application +layer protocol in kernel (`TUX web server +`_). + +For more details about user space programming using sockets, see `Beej's Guide to +Network Programming Using Internet +Sockets `_. + +Linux networking +================ + +The Linux kernel provides three basic structures for working with network +packets: :c:type:`struct socket`, :c:type:`struct sock` and :c:type:`struct +sk_buff`. 
+ +The first two are abstractions of a socket: + + * :c:type:`struct socket` is an abstraction very close to user space, ie `BSD + sockets `_ used to program + network applications; + * :c:type:`struct sock` or *INET socket* in Linux terminology is the network + representation of a socket. + +The two structures are related: the :c:type:`struct socket` contains an INET +socket field, and the :c:type:`struct sock` has a BSD socket that holds it. + +The :c:type:`struct sk_buff` structure is the representation of a network packet +and its status. The structure is created when a kernel packet is received, +either from the user space or from the network interface. + +The :c:type:`struct socket` structure +------------------------------------- + +The :c:type:`struct socket` structure is the kernel representation of a BSD +socket, the operations that can be executed on it are similar to those offered +by the kernel (through system calls). Common operations with sockets +(creation, initialization/bind, closing, etc.) result in specific system +calls; they work with the :c:type:`struct socket` structure. + +The :c:type:`struct socket` operations are described in :file:`net/socket.c` and +are independent of the protocol type. The :c:type:`struct socket` structure is thus +a generic interface over particular network operations implementations. +Typically, the names of these operations begin with the ``sock_`` prefix. + +.. 
_SocketStructOps: + +Operations on the socket structure +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Socket operations are: + +Creation +"""""""" + +Creation is similar to calling the :c:func:`socket` function in user space, but the +:c:type:`struct socket` created will be stored in the ``res`` parameter: + + * ``int sock_create(int family, int type, int protocol, struct socket **res)`` + creates a socket after the :c:func:`socket` system call; + * ``int sock_create_kern(struct net *net, int family, int type, int protocol, + struct socket **res)`` creates a kernel socket; + * ``int sock_create_lite(int family, int type, int protocol, struct socket **res)`` + creates a kernel socket without parameter sanity checks. + +The parameters of these calls are as follows: + + * ``net``, where it is present, used as reference to the network namespace used; + we will usually initialize it with ``init_net``; + * ``family`` represents the family of protocols used in the transfer of + information; they usually begin with the ``PF_`` (Protocol Family) string; + the constants representing the family of protocols used are found in + :file:`linux/socket.h`, of which the most commonly used is ``PF_INET``, for + TCP/IP protocols; + * ``type`` is the type of socket; the constants used for this parameter are + found in :file:`linux/net.h`, of which the most used are ``SOCK_STREAM`` for + a connection based source-to-destination communication and ``SOCK_DGRAM`` + for connectionless communication; + * ``protocol`` represents the protocol used and is closely related to the + ``type`` parameter; the constants used for this parameter are found in + :file:`linux/in.h`, of which the most used are ``IPPROTO_TCP`` for TCP and + ``IPPROTO_UDP`` for UDP. + +To create a TCP socket in kernel space, you must call: + +.. 
code-block:: c + + struct socket *sock; + int err; + + err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + /* handle error */ + } + +and for creating UDP sockets: + +.. code-block:: c + + struct socket *sock; + int err; + + err = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (err < 0) { + /* handle error */ + } + +A usage sample is part of the :c:func:`sys_socket` system call handler: + +.. code-block:: c + + SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) + { + int retval; + struct socket *sock; + int flags; + + /* Check the SOCK_* constants for consistency. */ + BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); + BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); + BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); + + flags = type & ~SOCK_TYPE_MASK; + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + type &= SOCK_TYPE_MASK; + + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + + retval = sock_create(family, type, protocol, &sock); + if (retval < 0) + goto out; + + return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); + } + +Closing +""""""" + +Close connection (for sockets using connection) and release associated +resources: + + * ``void sock_release(struct socket *sock)`` calls the ``release`` function in + the ``ops`` field of the socket structure: + +.. code-block:: c + + void sock_release(struct socket *sock) + { + if (sock->ops) { + struct module *owner = sock->ops->owner; + + sock->ops->release(sock); + sock->ops = NULL; + module_put(owner); + } + //... 
+ } + +Sending/receiving messages +"""""""""""""""""""""""""" + +The messages are sent/received using the following functions: + + * ``int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags);`` + * ``int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags);`` + * ``int sock_sendmsg(struct socket *sock, struct msghdr *msg);`` + * ``int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size);`` + +The message sending/receiving functions will then call the ``sendmsg``/ +``recvmsg`` function in the ``ops`` field of the socket. Functions +containing ``kernel_`` as a prefix are used when the socket is used in the +kernel. + +The parameters are: + + * ``msg``, a :c:type:`struct msghdr` structure, containing the message to be + sent/received. Among the important components of this structure are ``msg_name`` + and ``msg_namelen``, which, for UDP sockets, must be filled in with the address + to which the message is sent (:c:type:`struct sockaddr_in`); + * ``vec``, a :c:type:`struct kvec` structure, containing a pointer to the buffer + containing its data and size; as can be seen, it has a similar structure to the + :c:type:`struct iovec` structure (the :c:type:`struct iovec` structure + corresponds to the user space data, and the :c:type:`struct kvec` structure + corresponds to kernel space data). + +A usage example can be seen in the :c:func:`sys_sendto` system call handler: + +.. 
code-block:: c + + SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, + unsigned int, flags, struct sockaddr __user *, addr, + int, addr_len) + { + struct socket *sock; + struct sockaddr_storage address; + int err; + struct msghdr msg; + struct iovec iov; + int fput_needed; + + err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter); + if (unlikely(err)) + return err; + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + goto out; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + if (addr) { + err = move_addr_to_kernel(addr, addr_len, &address); + if (err < 0) + goto out_put; + msg.msg_name = (struct sockaddr *)&address; + msg.msg_namelen = addr_len; + } + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + msg.msg_flags = flags; + err = sock_sendmsg(sock, &msg); + + out_put: + fput_light(sock->file, fput_needed); + out: + return err; + } + +The :c:type:`struct socket` fields +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: c + + /** + * struct socket - general BSD socket + * @state: socket state (%SS_CONNECTED, etc) + * @type: socket type (%SOCK_STREAM, etc) + * @flags: socket flags (%SOCK_NOSPACE, etc) + * @ops: protocol specific socket operations + * @file: File back pointer for gc + * @sk: internal networking protocol agnostic socket representation + * @wq: wait queue for several uses + */ + struct socket { + socket_state state; + + short type; + + unsigned long flags; + + struct socket_wq __rcu *wq; + + struct file *file; + struct sock *sk; + const struct proto_ops *ops; + }; + +The noteworthy fields are: + + * ``ops`` - the structure that stores pointers to protocol-specific functions; + * ``sk`` - The ``INET socket`` associated with it. 
+ +The :c:type:`struct proto_ops` structure +"""""""""""""""""""""""""""""""""""""""" + +The :c:type:`struct proto_ops` structure contains the implementations of the specific +operations implemented (TCP, UDP, etc.); these functions will be called from +generic functions through :c:type:`struct socket` (:c:func:`sock_release`, +:c:func:`sock_sendmsg`, etc.) + +The :c:type:`struct proto_ops` structure therefore contains a number of function +pointers for specific protocol implementations: + +.. code-block:: c + + struct proto_ops { + int family; + struct module *owner; + int (*release) (struct socket *sock); + int (*bind) (struct socket *sock, + struct sockaddr *myaddr, + int sockaddr_len); + int (*connect) (struct socket *sock, + struct sockaddr *vaddr, + int sockaddr_len, int flags); + int (*socketpair)(struct socket *sock1, + struct socket *sock2); + int (*accept) (struct socket *sock, + struct socket *newsock, int flags, bool kern); + int (*getname) (struct socket *sock, + struct sockaddr *addr, + int peer); + //... + } + +The initialization of the ``ops`` field from :c:type:`struct socket` is done in +the :c:func:`__sock_create` function, by calling the :c:func:`create` function, +specific to each protocol; an equivalent call is the implementation of the +:c:func:`__sock_create` function: + +.. code-block:: c + + //... + err = pf->create(net, sock, protocol, kern); + if (err < 0) + goto out_module_put; + //... + +This will instantiate the function pointers with calls specific to the protocol +type associated with the socket. The :c:func:`sock_register` and +:c:func:`sock_unregister` calls are used to fill the ``net_families`` vector. + +For the rest of the socket operations (other than creating, closing, and +sending/receiving a message as described above in the `Operations on the socket +structure`_ section), the functions sent via pointers in this structure will be +called. 
For example, for ``bind``, which associates a socket with a socket on +the local machine, we will have the following code sequence: + +.. code-block:: c + + #define MY_PORT 60000 + + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_port = htons (MY_PORT), + .sin_addr = { htonl (INADDR_LOOPBACK) } + }; + + //... + err = sock->ops->bind (sock, (struct sockaddr *) &addr, sizeof(addr)); + if (err < 0) { + /* handle error */ + } + //... + +As you can see, for transmitting the address and port information that +will be associated with the socket, a :c:type:`struct sockaddr_in` is filled. + +The :c:type:`struct sock` structure +----------------------------------- + +The :c:type:`struct sock` describes an ``INET`` socket. Such a structure is +associated with a user space socket and implicitly with a :c:type:`struct +socket` structure. The structure is used to store information about the status +of a connection. The structure's fields and associated operations usually begin +with the ``sk_`` string. Some fields are listed below: + +.. code-block:: c + + struct sock { + //... + unsigned int sk_padding : 1, + sk_no_check_tx : 1, + sk_no_check_rx : 1, + sk_userlocks : 4, + sk_protocol : 8, + sk_type : 16; + //... + struct socket *sk_socket; + //... + struct sk_buff *sk_send_head; + //... + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk); + void (*sk_write_space)(struct sock *sk); + void (*sk_error_report)(struct sock *sk); + int (*sk_backlog_rcv)(struct sock *sk, + struct sk_buff *skb); + void (*sk_destruct)(struct sock *sk); + }; + +\ + + * ``sk_protocol`` is the type of protocol used by the socket; + * ``sk_type`` is the socket type (``SOCK_STREAM``, ``SOCK_DGRAM``, etc.); + * ``sk_socket`` is the BSD socket that holds it; + * ``sk_send_head`` is the list of :c:type:`struct sk_buff` structures for + transmission; + * the function pointers at the end are callbacks for different situations. 
+ +Initializing the :c:type:`struct sock` and attaching it to a BSD socket is done +using the callback created from ``net_families`` (called +:c:func:`__sock_create`). Here's how to initialize the :c:type:`struct sock` +structure for the IP protocol, in the :c:func:`inet_create` function: + +.. code-block:: c + + /* + * Create an inet socket. + */ + + static int inet_create(struct net *net, struct socket *sock, int protocol, + int kern) + { + + struct sock *sk; + + //... + err = -ENOBUFS; + sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); + if (!sk) + goto out; + + err = 0; + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = SK_CAN_REUSE; + + + //... + sock_init_data(sock, sk); + + sk->sk_destruct = inet_sock_destruct; + sk->sk_protocol = protocol; + sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + //... + } + +.. _StructSKBuff: + +The :c:type:`struct sk_buff` structure +-------------------------------------- + +The :c:type:`struct sk_buff` (socket buffer) describes a network packet. The +structure fields contain information about both the header and packet contents, +the protocols used, the network device used, and pointers to the other +:c:type:`struct sk_buff`. A summary description of the content of the structure +is presented below: + +.. code-block:: c + + struct sk_buff { + union { + struct { + /* These two members must be first. */ + struct sk_buff *next; + struct sk_buff *prev; + + union { + struct net_device *dev; + /* Some protocols might use this space to store information, + * while device pointer would be NULL. + * UDP receive path is one user. + */ + unsigned long dev_scratch; + }; + }; + + struct rb_node rbnode; /* used in netem & tcp stack */ + }; + struct sock *sk; + + union { + ktime_t tstamp; + u64 skb_mstamp; + }; + + /* + * This is the control buffer. It is free to use for every + * layer. Please put your private variables there. If you + * want to keep them across layers you have to do a skb_clone() + * first. 
This is owned by whoever has the skb queued ATM. + */ + char cb[48] __aligned(8); + + unsigned long _skb_refdst; + void (*destructor)(struct sk_buff *skb); + union { + struct { + unsigned long _skb_refdst; + void (*destructor)(struct sk_buff *skb); + }; + struct list_head tcp_tsorted_anchor; + }; + /* ... */ + + unsigned int len, + data_len; + __u16 mac_len, + hdr_len; + + /* ... */ + + __be16 protocol; + __u16 transport_header; + __u16 network_header; + __u16 mac_header; + + /* private: */ + __u32 headers_end[0]; + /* public: */ + + /* These elements must be at the end, see alloc_skb() for details. */ + sk_buff_data_t tail; + sk_buff_data_t end; + unsigned char *head, + *data; + unsigned int truesize; + refcount_t users; + }; + +where: + + * ``next`` and ``prev`` are pointers to the next, and previous element in the + buffer list; + * ``dev`` is the device which sends or receives the buffer; + * ``sk`` is the socket associated with the buffer; + * ``destructor`` is the callback that deallocates the buffer; + * ``transport_header``, ``network_header``, and ``mac_header`` are offsets + between the beginning of the packet and the beginning of the various headers + in the packets. They are internally maintained by the various processing + layers through which the packet passes. To get pointers to the headers, use + one of the following functions: :c:func:`tcp_hdr`, :c:func:`udp_hdr`, + :c:func:`ip_hdr`, etc. In principle, each protocol provides a function to + get a reference to the header of that protocol within a received packet. + Keep in mind that the ``network_header`` field is not set until the packet + reaches the network layer and the ``transport_header`` field is not set + until the packet reaches the transport layer. + +The structure of an `IP header `_ +(:c:type:`struct iphdr`) has the following fields: + +.. 
code-block:: c + + struct iphdr { + #if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ihl:4, + version:4; + #elif defined (__BIG_ENDIAN_BITFIELD) + __u8 version:4, + ihl:4; + #else + #error "Please fix " + #endif + __u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + __u8 ttl; + __u8 protocol; + __sum16 check; + __be32 saddr; + __be32 daddr; + /*The options start here. */ + }; + +where: + + * ``protocol`` is the transport layer protocol used; + * ``saddr`` is the source IP address; + * ``daddr`` is the destination IP address. + +The structure of a `TCP header +`_ +(:c:type:`struct tcphdr`) has the following fields: + +.. code-block:: c + + struct tcphdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; + #if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; + #elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; + #else + #error "Adjust your defines" + #endif + __be16 window; + __sum16 check; + __be16 urg_ptr; + }; + +where: + + * ``source`` is the source port; + * ``dest`` is the destination port; + * ``syn``, ``ack``, ``fin`` are the TCP flags used; for a more detailed view, + see this `diagram + `_. + +The structure of a `UDP header +`_ +(:c:type:`struct udphdr`) has the following fields: + +.. code-block:: c + + struct udphdr { + __be16 source; + __be16 dest; + __be16 len; + __sum16 check; + }; + +where: + + * ``source`` is the source port; + * ``dest`` is the destination port. + +An example of accessing the information present in the headers of a network +packet is as follows: + +.. 
code-block:: c + + struct sk_buff *skb; + + struct iphdr *iph = ip_hdr(skb); /* IP header */ + /* iph->saddr - source IP address */ + /* iph->daddr - destination IP address */ + if (iph->protocol == IPPROTO_TCP) { /* TCP protocol */ + struct tcphdr *tcph = tcp_hdr(skb); /* TCP header */ + /* tcph->source - source TCP port */ + /* tcph->dest - destination TCP port */ + } else if (iph->protocol == IPPROTO_UDP) { /* UDP protocol */ + struct udphdr *udph = udp_hdr(skb); /* UDP header */ + /* udph->source - source UDP port */ + /* udph->dest - destination UDP port */ + } + +.. _Conversions: + +Conversions +=========== + +In different systems, there are several ways of ordering bytes in a word +(`Endianness `_), including: `Big +Endian `_ (the most +significant byte first) and `Little +Endian `_ (the least +significant byte first). Since a network interconnects systems with different +platforms, the Internet has imposed a standard sequence for the storage of +numerical data, called `network byte-order +`_. In +contrast, the byte sequence for the representation of numerical data on the host +computer is called host byte-order. Data received/sent from/to the network is in +the network byte-order format and should be converted between this format and +the host byte-order. + +For converting we use the following macros: + + * ``u16 htons(u16 x)`` converts a 16 bit integer from host byte-order to + network byte-order (host to network short); + * ``u32 htonl(u32 x)`` converts a 32 bit integer from host byte-order to + network byte-order (host to network long); + * ``u16 ntohs(u16 x)`` converts a 16 bit integer from network byte-order to + host byte-order (network to host short); + * ``u32 ntohl(u32 x)`` converts a 32 bit integer from network byte-order to + host byte-order (network to host long). + +.. _netfilter: + +netfilter +========= + +Netfilter is the name of the kernel interface for capturing network packets for +modifying/analyzing them (for filtering, NAT, etc.). 
`The netfilter +`_ interface is used in user space by `iptables +`_. + +In the Linux kernel, packet capture using netfilter is done by attaching hooks. +Hooks can be specified in different locations in the path followed by a kernel +network packet, as needed. An organization chart with the route followed by a +package and the possible areas for a hook can be found `here +`_. + +The header included when using netfilter is :file:`linux/netfilter.h`. + +A hook is defined through the :c:type:`struct nf_hook_ops` structure: + +.. code-block:: c + + struct nf_hook_ops { + /* User fills in from here down. */ + nf_hookfn *hook; + struct net_device *dev; + void *priv; + u_int8_t pf; + unsigned int hooknum; + /* Hooks are ordered in ascending priority. */ + int priority; + }; + +where: + + * ``pf`` is the package type (``PF_INET``, etc.); + * ``priority`` is the priority; priorities are defined in + :file:`uapi/linux/netfilter_ipv4.h` as follows: + +.. code-block:: c + + enum nf_ip_hook_priorities { + NF_IP_PRI_FIRST = INT_MIN, + NF_IP_PRI_CONNTRACK_DEFRAG = -400, + NF_IP_PRI_RAW = -300, + NF_IP_PRI_SELINUX_FIRST = -225, + NF_IP_PRI_CONNTRACK = -200, + NF_IP_PRI_MANGLE = -150, + NF_IP_PRI_NAT_DST = -100, + NF_IP_PRI_FILTER = 0, + NF_IP_PRI_SECURITY = 50, + NF_IP_PRI_NAT_SRC = 100, + NF_IP_PRI_SELINUX_LAST = 225, + NF_IP_PRI_CONNTRACK_HELPER = 300, + NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX, + NF_IP_PRI_LAST = INT_MAX, + }; + +\ + + + * ``dev`` is the device (network interface) on which the capture is + intended; + + + * ``hooknum`` is the type of hook used. When a packet is captured, the + processing mode is defined by the ``hooknum`` and ``hook`` fields. For IP, + hook types are defined in :file:`linux/netfilter.h`: + +.. 
code-block:: c + + enum nf_inet_hooks { + NF_INET_PRE_ROUTING, + NF_INET_LOCAL_IN, + NF_INET_FORWARD, + NF_INET_LOCAL_OUT, + NF_INET_POST_ROUTING, + NF_INET_NUMHOOKS + }; + +\ + + * ``hook`` is the handler called when capturing a network packet (packet sent + as a :c:type:`struct sk_buff` structure). The ``priv`` field is private information + handed to the handler. The capture handler prototype is defined by the + :c:type:`nf_hookfn` type: + +.. code-block:: c + + struct nf_hook_state { + unsigned int hook; + u_int8_t pf; + struct net_device *in; + struct net_device *out; + struct sock *sk; + struct net *net; + int (*okfn)(struct net *, struct sock *, struct sk_buff *); + }; + + typedef unsigned int nf_hookfn(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state); + +For the :c:func:`nf_hookfn` capture function, the ``priv`` parameter is the +private information with which the :c:type:`struct nf_hook_ops` was +initialized. ``skb`` is the pointer to the captured network packet. Based on +``skb`` information, packet filtering decisions are made. The function's +``state`` parameter is the status information related to the packet capture, +including the input interface, the output interface, the priority, the hook +number. Priority and hook number are useful for allowing the same function to +be called by several hooks. + +A capture handler can return one of the constants ``NF_*``: + +.. code-block:: c + + /* Responses from hook functions. */ + #define NF_DROP 0 + #define NF_ACCEPT 1 + #define NF_STOLEN 2 + #define NF_QUEUE 3 + #define NF_REPEAT 4 + #define NF_STOP 5 + #define NF_MAX_VERDICT NF_STOP + +``NF_DROP`` is used to filter (ignore) a packet, and ``NF_ACCEPT`` is used to +accept a packet and forward it. + +Registering/unregistering a hook is done using the functions defined in +:file:`linux/netfilter.h`: + +.. code-block:: c + + /* Function to register/unregister hook points.
*/ + int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); + void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); + int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n); + void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n); + + +.. attention:: + + Prior to version 3.11-rc2 of the Linux kernel, + there are some restrictions related to the use of header extraction functions + from a :c:type:`struct sk_buff` structure set as a parameter in a netfilter + hook. While the IP header can be obtained each time using :c:func:`ip_hdr`, + the TCP and UDP headers can be obtained with :c:func:`tcp_hdr` and + :c:func:`udp_hdr` only for packets that come from inside the system rather + than the ones that are received from outside the system. In the latter case, + you must manually calculate the header offset in the packet: + + .. code-block:: c + + // For TCP packets (iph->protocol == IPPROTO_TCP) + tcph = (struct tcphdr*)((__u32*)iph + iph->ihl); + // For UDP packets (iph->protocol == IPPROTO_UDP) + udph = (struct udphdr*)((__u32*)iph + iph->ihl); + + This code works in all filtering situations, so it's recommended to use it + instead of header access functions. + +A usage example for a netfilter hook is shown below: + +.. code-block:: c + + #include + #include + #include + #include + #include + #include + #include + + static unsigned int my_nf_hookfn(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) + { + /* process packet */ + //...
+ + return NF_ACCEPT; + } + + static struct nf_hook_ops my_nfho = { + .hook = my_nf_hookfn, + .hooknum = NF_INET_LOCAL_OUT, + .pf = PF_INET, + .priority = NF_IP_PRI_FIRST + }; + + int __init my_hook_init(void) + { + return nf_register_net_hook(&init_net, &my_nfho); + } + + void __exit my_hook_exit(void) + { + nf_unregister_net_hook(&init_net, &my_nfho); + } + + module_init(my_hook_init); + module_exit(my_hook_exit); + +netcat +====== + +When developing applications that include networking code, one of the most +used tools is netcat. Also nicknamed "Swiss-army knife for TCP / IP". It allows: + + * Initiating TCP connections; + * Waiting for a TCP connection; + * Sending and receiving UDP packets; + * Displaying traffic in hexdump format; + * Run a program after establishing a connection (eg, a shell); + * Set special options in sent packages. + +Initiating TCP connections: + +.. code-block:: console + + nc hostname port + +Listening to a TCP port: + +.. code-block:: console + + nc -l -p port + +Sending and receiving UDP packets is done adding the ``-u`` command line option. + +.. note:: + + The command is :command:`nc`; often :command:`netcat` is an alias for this + command. There are other implementations of the netcat command, some of which + have slightly different parameters than the classic implementation. Run + :command:`man nc` or :command:`nc -h` to check how to use it. + +For more information on netcat, check the following `tutorial +`_. + +Further reading +=============== + +#. Understanding Linux Network Internals +#. `Linux IP networking`_ +#. `The TUX Web Server`_ +#. `Beej's Guide to Network Programming Using Internet Sockets`_ +#. `Kernel Korner - Network Programming in the Kernel`_ +#. `Hacking the Linux Kernel Network Stack`_ +#. `The netfilter.org project`_ +#. `A Deep Dive Into Iptables and Netfilter Architecture`_ +#. `Linux Foundation Networking Page`_ + +.. _Linux IP networking: http://www.cs.unh.edu/cnrg/gherrin/ +.. 
_The TUX Web Server: http://www.stllinux.org/meeting_notes/2001/0719/myTUX/ +.. _Beej's Guide to Network Programming Using Internet Sockets: https://www.beej.us/guide/bgnet/ +.. _Kernel Korner - Network Programming in the Kernel: http://www.linuxjournal.com/article/7660 +.. _Hacking the Linux Kernel Network Stack: http://phrack.org/issues/61/13.html +.. _The netfilter.org project: http://www.netfilter.org/ +.. _A Deep Dive Into Iptables and Netfilter Architecture: https://www.digitalocean.com/community/tutorials/a-deep-dive-into-iptables-and-netfilter-architecture +.. _Linux Foundation Networking Page: http://www.linuxfoundation.org/en/Net:Main_Page + +Exercises +========= + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: networking + +.. important:: + + You need to make sure that the ``netfilter`` support is active in kernel. It + is enabled via ``CONFIG_NETFILTER``. To activate it, run :command:`make menuconfig` in + the :file:`linux` directory and check the ``Network packet filtering framework + (Netfilter)`` option in ``Networking support -> Networking options``. If it + was not enabled, enable it (as builtin, not external module - it must be + marked with ``*``). + + +1. Displaying packets in kernel space +------------------------------------- + +Write a kernel module that displays the source address and port for TCP packets +that initiate an outbound connection. Start from the code in +:file:`1-2-netfilter` and fill in the areas marked with ``TODO 1``, taking into +account the comments below. + +You will need to register a netfilter hook of type ``NF_INET_LOCAL_OUT`` as explained +in the `netfilter`_ section. + +`The struct sk_buff structure`_ lets you access the packet headers using +specific functions. The :c:func:`ip_hdr` function returns the IP header as a +pointer to a :c:type:`struct iphdr` structure. The :c:func:`tcp_hdr` function +returns the TCP header as a pointer to a :c:type:`struct tcphdr` structure. 
+ +The `diagram`_ explains how to make a TCP connection. The connection initiation +packet has the ``SYN`` flag set in the TCP header and the ``ACK`` flag cleared. + +.. note:: + + To display the source IP address, use the ``%pI4`` format of the printk + function. Details can be found in the `kernel documentation + `_ (``IPv4 + addresses`` section). The following is an example code snippet that uses + ``%pI4``: + + .. code-block:: c + + printk("IP address is %pI4\n", &iph->saddr); + + When using the ``%pI4`` format, the argument to printk is a pointer. Hence the + construction ``&iph->saddr`` (with operator & - ampersand) instead of + ``iph->saddr``. + +The source TCP port is, in the TCP header, in the `network byte-order`_ format. +Read through the :ref:`Conversions` section. Use :c:func:`ntohs` to convert. + +For testing, use the :file:`1-2-netfilter/user/test-1.sh` file. The test creates +a connection to the localhost, a connection that will be intercepted and +displayed by the kernel module. The script is copied on the virtual machine by +the :command:`make copy` command only if it is marked as executable. The script +uses the statically compiled :command:`netcat` tool stored in +:file:`skels/networking/netcat`; this program must have execution +permissions. + +After running the checker the output should be similar to the one below: + +.. code-block:: console + + # ./test-1.sh + [ 229.783512] TCP connection initiated from 127.0.0.1:44716 + Should show up in filter. + Check dmesg output. + +2. Filtering by destination address +----------------------------------- + +Extend the module from exercise 1 so that you can specify a destination address +by means of a ``MY_IOCTL_FILTER_ADDRESS`` ioctl call. You'll only show packets +containing the specified destination address. To solve this task, fill in the +areas marked with ``TODO 2`` and follow the specifications below. + +To implement the ioctl routine, you must fill out the ``my_ioctl`` function.
+Review the section in :ref:`ioctl`. The address sent from user space is in +`network byte-order`_, so there will be **NO need** for conversion. + +.. note:: + + The IP address sent via ``ioctl`` is sent by address, not by value. The + address must be stored in the ``ioctl_set_addr`` variable. For copying use + :c:func:`copy_from_user`. + +To compare the addresses, fill out the ``test_daddr`` function. Addresses in +network byte-order will be used without having to convert addresses (if they +are equal from left to right they will be equal if reversed too). + +The ``test_daddr`` function must be called from the netfilter hook to display +the connection initialization packets for which the destination address is the +one sent through the ioctl routine. The connection initiation packet has the +``SYN`` flag set in the TCP header and the ``ACK`` flag cleared. You have to +check two things: + + * the TCP flags; + * the destination address of the packet (using ``test_daddr``). + +For testing, use the :file:`1-2-netfilter/user/test-2.sh` script. This script +needs to compile the :file:`1-2-netfilter/user/test.c` file in the test +executable. Compilation is done automatically on the physical system when +running the :command:`make build` command. The test script is copied to the +virtual machine only if it is marked as executable. The script uses the +statically compiled :command:`netcat` tool in :file:`skels/networking/netcat`; +this executable must have execution permissions. + +After running the checker the output should be similar to the one below: + +.. code-block:: console + + # ./test-2.sh + [ 797.673535] TCP connection initiated from 127.0.0.1:44721 + Should show up in filter. + Should NOT show up in filter. + Check dmesg output. + +The test asks for packet filtering first for the ``127.0.0.1`` IP address and +then for the ``127.0.0.2`` IP address.
The first connection initiation packet +(to ``127.0.0.1``) is intercepted and displayed by the filter, while the second +(to ``127.0.0.2``) is not intercepted. + +3. Listening on a TCP socket +---------------------------- + +Write a kernel module that creates a TCP socket that listens to connections on +port ``60000`` on the loopback interface (in ``init_module``). Start from the +code in :file:`3-4-tcp-sock` fill in the areas marked with ``TODO 1`` taking +into account the observations below. + +Read the `Operations on the socket structure`_ and `The struct proto_ops +structure`_ sections. + +The ``sock`` socket is a ``server socket`` and must be put in the listening +state. That is, the ``bind`` and ``listen`` operations must be applied to the +socket. For the ``bind`` and ``listen`` equivalent, in kernel space you will +need to call ``sock->ops->...;`` examples of such functions you can call are +``sock->ops->bind``, ``sock->ops->listen`` etc. + +.. note:: + + For example, call ``sock->ops->bind``, or ``sock->ops->listen`` functions, see + how they are called in the :c:func:`sys_bind` and :c:func:`sys_listen` system + call handlers. + + Look for the system call handlers in the ``net/socket.c`` file in the Linux + kernel source code tree. + +.. note:: + + For the second argument of the ``listen`` (backlog) call, use the + ``LISTEN_BACKLOG``. + +Remember to release the socket in the module's exit function and in the area +marked with error labels; use :c:func:`sock_release`. + +For testing, run the :command:`3-4-tcp_sock/test-3.sh` script. The script is +copied on the virtual machine by :command:`make copy` only if it is marked as +executable. + +After running the test, a TCP socket will be displayed by listening to +connections on port ``60000``. + +4. 
Accepting connections in kernel space +---------------------------------------- + +Expand the module from the previous exercise to allow an external connection (no +need to send any message, only accept new connections). Fill in the areas marked +with ``TODO 2``. + +Read the `Operations on the socket structure`_ and `The struct proto_ops +structure`_ sections. + +For the kernel space ``accept`` equivalent, see the system call handler for +:c:func:`sys_accept4`. Follow the `lnet_sock_accept +`_ +implementation, and how the ``sock->ops->accept`` call is used. Use ``0`` as +the value for the second to last argument (``flags``), and ``true`` for the +last argument (``kern``). + +.. note:: + + Look for the system call handlers in the ``net/socket.c`` file in the Linux + kernel source code tree. + +.. note:: + + The new socket (``new_sock``) must be created with the + :c:func:`sock_create_lite` function and then its operations must be configured + using + + .. code-block:: console + + newsock->ops = sock->ops; + +Print the address and port of the destination socket. To find the peer name of a +socket (its address), refer to the :c:func:`sys_getpeername` system call handler. + +.. note:: + + The first argument for the ``sock->ops->getname`` function will be the + connection socket, ie ``new_sock``, the one initialized with by the ``accept`` + call. + + The last argument of the ``sock->ops->getname`` function will be ``1``, + meaning that we want to know about the endpoint or the peer (*remote end* or + *peer*). + + Display the peer address (indicated by the ``raddr`` variable) using the + ``print_sock_address`` macro defined in the file. + +Release the newly created socket (after accepting the connection) in the module +exit function and after the error label. After adding the ``accept`` code to the +module initialization function, the :command:`insmod` operation will lock until +a connection is established. You can unlock using :command:`netcat` on that +port. 
Consequently, the test script from the previous exercise will not work. + +For testing, run the :file:`3-4-tcp_sock/test-4.sh` script. The script is copied on +the virtual machine by :command:`make copy` only if it is marked as executable. + +Nothing special will be displayed (in the kernel buffer). The success of the +test will be defined by the connection establishment. Then use ``Ctrl+c`` to +stop the test script, and then you can remove the kernel module. + +5. UDP socket sender +-------------------- + +Write a kernel module that creates a UDP socket and sends the message from the +``MY_TEST_MESSAGE`` macro on the socket to the loopback address on port +``60001``. + +Start from the code in :file:`5-udp-sock`. + +Read the `Operations on the socket structure`_ and `The struct proto_ops +structure`_ sections. + +To see how to send messages in the kernel space, see the :c:func:`sys_send` +system call handler or `Sending/receiving messages`_. + +.. hint:: + + The ``msg_name`` field of the :c:type:`struct msghdr` structure must be + initialized to the destination address (pointer to :c:type:`struct sockaddr`) + and the ``msg_namelen`` field to the address size. + + Initialize the ``msg_flags`` field of the :c:type:`struct msghdr` structure + to ``0``. + + Initialize the ``msg_control`` and ``msg_controllen`` fields of the + :c:type:`struct msghdr` structure to ``NULL`` and ``0`` respectively. + +For sending the message use :c:func:`kernel_sendmsg`. + +The message transmission parameters are retrieved from the kernel space. Cast +the :c:type:`struct iovec` structure pointer to a :c:type:`struct kvec` pointer +in the :c:func:`kernel_sendmsg` call. + +.. hint:: + + The last two parameters of :c:func:`kernel_sendmsg` are ``1`` (number of I/O + vectors) and ``len`` (message size). + +For testing, use the :file:`test-5.sh` file. The script is copied on the virtual +machine by the :command:`make copy` command only if it is marked as executable. 
+The script uses the statically compiled ``netcat`` tool stored in +:file:`skels/networking/netcat`; this executable must have execution +permissions. + +For a correct implementation, running the :file:`test-5.sh` script will cause +the ``kernelsocket`` message to be displayed like in the output below: + +.. code-block:: console + + /root # ./test-5.sh + + pid=1059 + + sleep 1 + + nc -l -u -p 60001 + + insmod udp_sock.ko + kernelsocket + + rmmod udp_sock + + kill 1059 diff --git a/refs/pull/405/merge/_sources/lectures/address-space.rst.txt b/refs/pull/405/merge/_sources/lectures/address-space.rst.txt new file mode 100644 index 00000000..2150cbe9 --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/address-space.rst.txt @@ -0,0 +1,1080 @@ +============= +Address Space +============= + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives: +=================== + +.. slide:: Address Space + :inline-contents: True + :level: 2 + + * x86 MMU + + * Segmentation + + * Paging + + * TLB + + * Linux Address Space + + * User + + * Kernel + + * High memory + + +x86 MMU +======= + +The x86 MMU has a segmentation and a pagination unit. The segmentation +unit can be used to define logical memory segments defined by a +logical (virtual) start address, a base linear (mapped) address and a +size. A segment can also restrict access based on the access type +(read, execute, write) or the privilege level (we can define some +segments to be accessible only by kernel for example). + +When the CPU makes a memory access, it will use the segmentation unit +to translate the logical address to a linear address, based on the +information in the segment descriptor. + +If pagination is enabled the linear address will be further +transformed into a physical address, using the information from the +page tables. + +Note that the segmentation unit can not be disabled, so if the MMU has +been enabled, segmentation will always be used. + +.. 
slide:: x86 MMU + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + +--------------+ +------------+ + logical | | linear | | physical + ---------> | Segmentation | --------> | Paging | ----------> + address | Unit | address | Unit | address + | | | | + +--------------+ +------------+ + +Selectors +--------- + +A program can use multiple segments and in order to determine which +segment to use, special registers (named selectors) are used. The +basic selectors that are typically used are CS - "Code Selector", DS - +"Data Selector" and SS - "Stack Selector". + +Instruction fetches will by default use CS, while data access will by +default use DS unless the stack is used (e.g. data access through the +pop and push instructions) in which case SS will be used by default. + +Selectors have three main fields: the index, the table indicator and the +running privilege level: + + +.. slide:: Selectors + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + 15 3 2 1 0 + +------------+----+-----+ + | | | | + Segment selectors | index | TI | RPL | + (CS, DS, SS, ES, FS, GS) | | | | + +------------+----+-----+ + + .. ifslides:: + + * Selectors: CS, DS, SS, ES, FS, GS + + * Index: indexes the segment descriptor table + + * TI: selects either the GDT or LDT + + * RPL: for CS only indicates the running (current) privilege level + + * GDTR and LDTR registers point to the base of GDT and LDT + + +The index will be used to determine which entry of the descriptor +table should be used. `TI` is used to select either the Global +Descriptor Table (GDT) or the Local Descriptor Table (LDT). The tables +are effectively arrays that start at the location specified in the +special registers `GDTR` (for GDT) and `LDTR` (for LDT). + +.. note:: LDT was designed so that applications can define their own + particular segments. Although not many applications use this + feature, Linux (and Windows) provide system calls that + allow an application to create its own segments.
+ +`RPL` is only used for CS and it represents the current privilege +level. There are 4 privilege levels, the highest level being 0 (and +typically used by the kernel) and the lowest is 3 (and typically used +by user applications). + + +Segment descriptor +------------------ + +The CPU will use the `index` field of the selector to access an 8 byte +descriptor: + +.. slide:: Segment descriptor + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + 63 56 44 40 32 + +-------------------------------+---+---+---+---+---------------+---+---+---+---+---------------+-------------------------------+ + | | | D | | A | Segment | | D | | | | + | Base Address 31:24 | G | / | L | V | Limit | P | P | S | Type | Base Address 23:16 | + | | | B | | L | 19:16 | | L | | | | + +-------------------------------+---+---+---+---+---------------+---+---+---+---+---------------+-------------------------------+ + | | | + | Base address 15:0 | Segment Limit 15:0 | + | | | + +---------------------------------------------------------------+---------------------------------------------------------------+ + 31 15 0 + + + * Base: linear address for the start of the segment + + * Limit: size of the segment + + * G: granularity bit: if set the size is in 4K pages otherwise in bytes + + * B/D: data/code + + * Type: code segment, data/stack, TSS, LDT, GDT + + * Protection: the minimum privilege level required to access the + segment (RPL is checked against DPL) + + +Some of the descriptor fields should be familiar. And that is because +there is some resemblance with Interrupt Descriptors we looked at +previously. + + +Segmentation in Linux +--------------------- + +In Linux, segments are not used to define the stack, code or +data. These will be set up using the paging unit as it allows better +granularity and more importantly it allows Linux to use a generic +approach that works on other architectures (that don't have +segmentation support).
+ +However, because the segmentation unit can not be disabled Linux must +create 4 generic 0 - 4GB segments for: kernel code, kernel data, user +code and user data. + +Besides these, Linux uses segments for implementing Thread Local +Storage (TLS) together with the `set_thread_area` system call. + +It also uses the TSS segment in order to define the kernel stack to +use when a change of privilege (e.g. system call, interrupt while +running in user-space) occurs. + +.. slide:: Segmentation in Linux + :inline-contents: True + :level: 2 + + .. code-block:: c + + /* + * The layout of the per-CPU GDT under Linux: + * + * 0 - null <=== cacheline #1 + * 1 - reserved + * 2 - reserved + * 3 - reserved + * + * 4 - unused <=== cacheline #2 + * 5 - unused + * + * ------- start of TLS (Thread-Local Storage) segments: + * + * 6 - TLS segment #1 [ glibc's TLS segment ] + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] + * 8 - TLS segment #3 <=== cacheline #3 + * 9 - reserved + * 10 - reserved + * 11 - reserved + * + * ------- start of kernel segments: + * + * 12 - kernel code segment <=== cacheline #4 + * 13 - kernel data segment + * 14 - default user CS + * 15 - default user DS + * 16 - TSS <=== cacheline #5 + * 17 - LDT + * 18 - PNPBIOS support (16->32 gate) + * 19 - PNPBIOS support + * 20 - PNPBIOS support <=== cacheline #6 + * 21 - PNPBIOS support + * 22 - PNPBIOS support + * 23 - APM BIOS support + * 24 - APM BIOS support <=== cacheline #7 + * 25 - APM BIOS support + * + * 26 - ESPFIX small SS + * 27 - per-cpu [ offset to per-cpu data area ] + * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8 + * 29 - unused + * 30 - unused + * 31 - TSS for double fault handler + */ + + DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { + #ifdef CONFIG_X86_64 + /* + * We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + * + * TLS descriptors are 
currently at a different place compared to i386. + * Hopefully nobody expects them at a fixed place (Wine?) + */ + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), + #else + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. 
+ */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), + + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + GDT_STACK_CANARY_INIT + #endif + } }; + EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); + + +Inspecting selectors and segments +--------------------------------- + +.. slide:: Inspecting selectors and segments + :inline-contents: True + :level: 2 + + |_| + + .. asciicast:: ../res/selectors-and-segments.cast + + +x86 Paging +---------- + +The x86 paging unit supports two types of paging: regular and extended paging. + +Regular paging has 2 levels and a fixed page size of 4KB. The linear +address is split in three fields: + +* Directory (the 10 most significant bits) + +* Table (the next 10 bits) + +* Offset (the least significant 12 bits) + + +.. slide:: Regular paging + :inline-contents: True + :level: 2 + + |_| + + .. 
ditaa:: + :--no-separation: + + Virtual Address + +------------+ +------------------+----------------+---------------+ + | CR3 | | DIRECTORY cEEE| TABLE cDDD | OFFSET cCCC| + +------------+ +------------------+----------------+---------------+ + | | | | + | | | | PAGE + | | | | /-----------------------\ + | | | | | | + | | | | | | + | | +-----------+ | +-----------------------+ + | | | +--->| Physical Address cCCC| + | | | +-----------------------+ + | +-----------------+ | | | + | | | PAGE | | + | | | TABLE | | + | | PAGE | /------------\ | | + | | DIRECTORY | | | | | + | | /------------\ | | | | | + | | | | | +------------+ +----> \-----------------------/ + | | | | +---->| cDDD |---+ + | | | | +------------+ + | | | | | | + | | | | | | + | | +------------+ | | + | +----->|cEEE |---+ | | + | +------------+ | | | + | | | +---->\------------/ + | | | + +--------->\------------/ + + +When extended paging is enabled, a single level is used and pages are +4MB. The linear address is split in two fields: + +* Directory (10 most significant bits) + +* Offset (least significant 22 bits) + +.. slide:: Extended paging + :inline-contents: True + :level: 2 + + .. 
ditaa:: + :--no-separation: + + Virtual Address + +------------+ +-------------------+-----------------------------+ + | CR3 | | DIRECTORY cEEE | OFFSET cDDD | + +------------+ +-------------------+-----------------------------+ + | | | + | | | PAGE + | | | /----------------------\ + | | | | | + | | | | | + | | | +----------------------+ + | | +--->| Physical Address cDDD| + | | +----------------------+ + | +-----------------+ | | + | | | | + | | | | + | | PAGE | | + | | DIRECTORY | | + | | /------------\ | | + | | | | +------------------>\----------------------/ + | | | | | + | | | | | + | | | | | + | | | | | + | | +------------+ | + | +----->| cEEE |-------------+ + | +------------+ + | | | + | | | + +---------->\------------/ + + +Page tables +------------ + +We can mix regular and extended paging, the directory page has a bit +that specifies if extended or regular paging should be used. The +special CR3 register points to the base of the page directory and page +directory entries point to the base of the page table. + +Both page directory and page table have 1024 entries and each entry +has 4 bytes. + +All tables are stored in memory and the page table addresses are +physical addresses. + + +.. slide:: Page tables + :inline-contents: False + :level: 2 + + * Both page directory and page table have 1024 entries + + * Each entry has 4 bytes + + * The special CR3 register point to the base of the page directory + + * Page directory entries points to the base of the page table + + * All tables are stored in memory + + * All table addresses are physical addresses + + +Page table entry fields: + +.. 
slide:: Page table entry fields + :inline-contents: True + :level: 2 + + * Present/Absent + + * PFN (Page Frame Number): the most 20 significant bits of the physical address + + * Accessed - not updated by hardware (can be used by OS for housekeeping) + + * Dirty - not updated by hardware (can be used by OS for housekeeping) + + * Access rights: Read/Write + + * Privilege: User/Supervisor + + * Page size - only for page directory; if set extended paging is used + + * PCD (page cache disable), PWT (page write through) + + +Linux paging +------------ + +Linux paging uses 4 levels in order to support 64bit +architectures. The diagram below shows how the various virtual address +chunks are used to index the page tables and compute the physical +address. + + +.. slide:: Linux paging + :inline-contents: True + :level: 2 + + .. ditaa:: + :--no-separation: + + Virtual Address + +------------+ +------------------+-----------------+------------------+-------------------+---------------+ + | CR3 | | GLOBAL DIR cEEE| UPPER DIR cDDD| MIDDLE DIR cCCC| TABLE cBBB| OFFSET cAAA | + +------------+ +------------------+-----------------+------------------+-------------------+---------------+ + | | | | | | + | | | | | | PAGE + | | | | | | /----------------------\ + | | | | | | | | + | | | | | | | | + | | +-----------+ | | PAGE GLOBAL | +----------------------+ + | | | | | DIRECTORY +-------->| Physical Address cAAA| + | | | | PAGE MIDDLE | /------------\ +----------------------+ + | +-----------------+ | | DIRECTORY | | | | | + | | | PAGE UPPER | /------------\ | | | | | + | | | DIRECTORY | | | | | | | | + | | PAGE GLOBAL | /------------\ | | | | | | | | + | | DIRECTORY | | | | +------------+ | | | | | + | | /------------\ | | | +--->| cCCC |---+ | +------------+ | | + | | | | | | | +------------+ | +--->| cBBB |---------->\----------------------/ + | | | | | | | | | | +------------+ + | | | | | +------------+ +----->\------------/ | | | + | | | | +---->| cDDD |---+ | | | + | | | | 
+------------+ +----->\------------/ + | | +------------+ | | + | +----->| cEEE |--+ | | + | +------------+ | | | + | | | +----->\------------/ + | | | + +--------->\------------/ + + +Linux has a common API for creating and walking page tables. Creating +and modifying address spaces for kernel and processes is done using +the same generic code which relies on macros and functions to +translate these generic operations in code that runs on different +architectures. + +Here is an example of how we can translate a virtual address to a +physical address, using the Linux page table APIs: + +.. slide:: Linux APIs for page table handling + :inline-contents: True + :level: 2 + + .. code-block:: c + + struct * page; + pgd_t pgd; + pmd_t pmd; + pud_t pud; + pte_t pte; + void *laddr, *paddr; + + pgd = pgd_offset(mm, vaddr); + pud = pud_offet(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + pte = pte_offset(pmd, vaddr); + page = pte_page(pte); + laddr = page_address(page); + paddr = virt_to_phys(laddr); + + +In order to support architectures with less than 4 levels of +pagination (such as for x86 32bits) some macros and / or functions are +0 / empty: + +.. slide:: What about platforms with less then 4 levels of pagination? + :inline-contents: True + :level: 2 + + .. code-block:: c + + static inline pud_t * pud_offset(pgd_t * pgd,unsigned long address) + { +     return (pud_t *)pgd; + } + + static inline pmd_t * pmd_offset(pud_t * pud,unsigned long address) + { +     return (pmd_t *)pud; + } + + +Translation Look-aside Buffer +----------------------------- + +When using virtual memory, due to the table page organization, we may +need an extra 1 (x86 extended paging), 2 (x86 regular paging) or 3 +(x86 64bit) memory access(es). + +A special cache, called Translation Look-aside Buffer (TLB) is used to +speed up translations from virtual address to physical addresses. + +The TLB has the following properties: + +.. 
slide:: Translation Look-aside Buffer + :inline-contents: True + :level: 2 + + * Caches paging information (PFN, rights, privilege) + + * Content Addressable Memory / Associative Memory + + * Very small (64-128 entries) + + * Very fast (single cycle due to parallel search implementation) + + * CPUs usually have two TLBs: i-TLB (code) and d-TLB (data) + + * TLB miss penalty: up to hundreds of cycles + + +As with other caches, we must be careful to not create consistency +issues. + +For example, when changing the mapping of one page to point to a +different physical memory location in the page tables, we must +invalidate the associated TLB entry. Otherwise, the MMU will do the +translation to the old physical address instead of the new physical +address. + +The x86 platform supports TLB invalidation through two types of +operations. + +.. slide:: TLB invalidation + :inline-contents: True + :level: 2 + + Single address invalidation: + + .. code-block:: asm + + mov $addr, %eax + invlpg %(eax) + + Full invalidation: + + .. code-block:: asm + + mov %cr3, %eax + mov %eax, %cr3 + + +Linux address space +=================== + +Address space options for 32bit systems +--------------------------------------- + +There are two main options for implementing kernel and user space: +either dedicated address spaces for each, or split a shared address +space. + +.. slide:: Address space options for 32bit systems + :inline-contents: True + :level: 2 + + |_| + + .. 
ditaa:: + + +-------------------+ +-------------------+ 0xFFFFFFFF +-------------------+ ^ + | | | | | | | + | | | | | | | Kernel space + | | | | | | | + | User | | Kernel | 0xC0000000 +-------------------+ v + | space | | space | | | ^ + | | | | | | | User space + | | | | | | | + | | | | | | | + | | | | | | | + | | | | | | | + | | | | | | | + | | | | | | | + | | | | | | | + +-------------------+ +-------------------+ 0x00000000 +-------------------+ v + + + (a) 4/4 split (b) 1/3 or 2/2 split + + +Each has advantages and disadvantages: + +.. slide:: Advantages and disadvantages + :inline-contents: True + :level: 2 + + * Disadvantages for dedicated kernel space: + + * Fully invalidating the TLB for every system call + + * Disadvantages for shared address space + + * Less address space for both kernel and user processes + + +Linux is using a split address space for 32 bit systems, although in +the past there were options for supporting a 4/4 split or dedicated +kernel address space (on those architectures that support it, +e.g. x86). Linux always uses a split address space for 64 bit systems. + +An overview of the Linux address space is presented below: + +.. slide:: Linux address space for 32bit systems + :inline-contents: True + :level: 2 + + |_| + + .. 
ditaa:: + :--no-separation: + + : : : : + | User space | Lowmem | Highmem | + | arbitrary mapping | linear mapping | arbitrary mapping | + | | | | + +----+----+--------------------+----+------+----+----+---------------+----+----+-----+----+----+ Virtual + |cEEE|cGRE|cEEE |cRED|cEEE |cAAA|cGRE| cAAA |cEEE|cGRE|cEEE |cRED|cEEE| memory + | | | | | | | | | | | | | | + +----+----+--------------------+----+------+----+----+---------------+----+----+-----+----+----+ + | | 3G | 3.896G | | 4G + | +-------+ | | | + | | | | | + |<----------------------------------+------+<-------------------------+ | + | | | + | |<-------------------------------------------+ + | | + v V + +----+----+---------------+--------------+----+------------------------------------------------+ Physical + |cAAA|cGRE| cAAA | cEEE |cRED| cEEE | memory + | | | | | | | + +----+----+---------------+--------------+----+------------------------------------------------+ + 896MB + + +Linear mappings +--------------- + +Linear mappings refer to a particular way of mapping virtual pages to +physical pages, where virtual pages V, V + 1, ..., V + n are mapped to +physical pages P, P + 1, ..., P + n. + +To understand the necessity of linear mappings, we should look at +common kernel operations that involve using both the virtual and +physical address of a page such as an I/O transfer: + +.. slide:: Virtual to physical address translations for I/O transfers + :inline-contents: True + :level: 2 + + * Use the virtual address of a kernel buffer in order to copy + data from user space + + * Walk the page tables to transform the kernel buffer virtual + address to a physical address + + * Use the physical address of the kernel buffer to start a DMA + transfer + + +However, if we use linear mappings and the kernel buffers are in the +linear mapping area, then: + +.. 
slide:: Linear mappings + :inline-contents: True + :level: 2 + + * Virtual to physical address space translation is reduced to one + operation (instead of walking the page tables) + + * Less memory is used to create the page tables + + * Fewer TLB entries are used for the kernel memory + + +Highmem +------- + +The "highmem" part of the virtual address space is used to create +arbitrary mappings (as opposed to linear mappings in lowmem). On 32bit +systems the highmem area is absolutely required in order to access +physical memory outside of lowmem. However, highmem is also used on +64bit systems but the use-case there is mainly to allow arbitrary +mappings in kernel space. + + +.. slide:: Highmem + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + +--------+ 8MB +-----------+ 4KB +-----------+ +-----------+ 4KB +------------+-----------+------------+ + | | | | | | | | | Persistent | Temporary | Fix-mapped | + | Lowmem | <-----> | VMAP area | <-----> | VMAP area | ... | VMAP area | <-----> | Kernel | Kernel | linear | + | | | | | | | | | Mappings | Mappings | addresses | + +--------+ +-----------+ +-----------+ +-----------+ +------------+-----------+------------+ + : : + | 128MB | + |<------------------------------------------------------------------------------------------------------------->| + | | + | | + VMALLOC_START 4GB + (896MB) + + +There are multiple types of mappings in the highmem area: + +* Multi-page permanent mappings (vmalloc, ioremap) + +* Temporary 1 page mappings (kmap_atomic) + +* Permanent 1 page mappings (kmap, fix-mapped linear addresses) + + +Multiple page mappings allow mapping of ranges of physical memory +into the highmem area. Each such mapping is guarded by a +non-accessible page to catch buffer overflow and underflow errors. + + +The APIs that map multiple pages into highmem are: + +.. slide:: Multi-page permanent mappings + :inline-contents: True + :level: 2 + + .. 
code-block:: c + + void* vmalloc(unsigned long size); + void vfree(void * addr); + + void *ioremap(unsigned long offset, unsigned size); + void iounmap(void * addr); + +:c:func:`vmalloc` is used to allocate non-contiguous system memory +pages as a contiguous segment in the kernel virtual address space. It +is useful when allocating large buffers because due to fragmentation +it is unlikely to find large free chunks of physically contiguous memory. + +:c:func:`ioremap` is used to map device memory or device registers +into the kernel address space. It maps a contiguous physical memory +range into highmem with page caching disabled. + +Fixed-mapped linear addresses +----------------------------- + +Fixed-mapped linear addresses are a special class of singular page +mappings that are used for accessing registers of commonly used +peripherals such as the APIC or IO APIC. + +Typical I/O access for peripherals is to use a base (the kernel +virtual address space where the peripheral registers are mapped) + +offsets for various registers. + +In order to optimize access, the base is reserved at compile time +(e.g. 0xFFFFF000). Since the base is constant, the various register +accesses of the form `base + register offset` will also be constant +and thus the compiler will avoid generating an extra instruction. + +In summary, fixed-mapped linear addresses are: + +.. slide:: Fixed-mapped linear addresses + :inline-contents: True + :level: 2 + + * Reserved virtual addresses (constants) + + * Mapped to physical addresses during boot + + .. code-block:: c + + set_fixmap(idx, phys_addr) + set_fixmap_nocache(idx, phys_addr) + + +These addresses are architecture defined and, as an example, this is +the map for x86: + +.. slide:: Fixed-mapped linear addresses + :inline-contents: True + :level: 2 + + .. code-block:: c + + /* + * Here we define all the compile-time 'special' virtual + * addresses.
The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. + * for x86_32: We allocate these special addresses + * from the end of virtual memory (0xfffff000) backwards. + * Also this lets us do fail-safe vmalloc(), we + * can guarantee that these special addresses and + * vmalloc()-ed addresses never overlap. + * + * These 'compile-time allocated' memory buffers are + * fixed-size 4k pages (or larger if used with an increment + * higher than 1). Use set_fixmap(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. + */ + + enum fixed_addresses { + #ifdef CONFIG_X86_32 + FIX_HOLE, + #else + #ifdef CONFIG_X86_VSYSCALL_EMULATION + VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT, + #endif + #endif + FIX_DBGP_BASE, + FIX_EARLYCON_MEM_BASE, + #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, + #endif + #ifdef CONFIG_X86_LOCAL_APIC + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ + #endif + #ifdef CONFIG_X86_IO_APIC + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, + #endif + #ifdef CONFIG_X86_32 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + #ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, + #endif + + +Notice how easy it is to do the conversion between the virtual address +and the fixed address indexes: + +.. slide:: Conversion between virtual address and fixed address indexes + :inline-contents: True + :level: 2 + + + .. code-block:: c + + #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) + #define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + + #ifndef __ASSEMBLY__ + /* + * 'index to address' translation. If anyone tries to use the idx + * directly without translation, we catch the bug with a NULL-deference + * kernel oops.
Illegal ranges of incoming indices are caught too. + */ + static __always_inline unsigned long fix_to_virt(const unsigned int idx) + { + BUILD_BUG_ON(idx >= __end_of_fixed_addresses); + return __fix_to_virt(idx); + } + + static inline unsigned long virt_to_fix(const unsigned long vaddr) + { + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); + } + + + inline long fix_to_virt(const unsigned int idx) + { + if (idx >= __end_of_fixed_addresses) + __this_fixmap_does_not_exist(); + return (0xffffe000UL - (idx << PAGE_SHIFT)); + } + + +Temporary mappings +------------------ + +Temporary mappings can be used to map a single physical page, very +fast, in kernel space. It can be used in interrupt context but the +atomic kmap section, defined in between the :c:func:`kmap_atomic` and +:c:func:`kunmap_atomic` can not be preempted. That is why these are +called temporary mappings, as they can only be used momentarily. + + +.. slide:: Temporary mappings + :inline-contents: false + :level: 2 + + * :c:func:`kmap_atomic`, :c:func:`kunmap_atomic` + + * No context switch is permitted in atomic kmap section + + * Can be used in interrupt context + + * No locking required + + * Only invalidates one TLB entry + + +Temporary mappings are very fast because there is no locking or +searching required and also there is no full TLB invalidation, just +the particular virtual page will be TLB invalidated. + +Here are some code snippets that show how temporary mappings are +implemented: + +.. slide:: Temporary mappings implementation + :inline-contents: True + :level: 2 + + + .. 
code-block:: c + + #define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) + + void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) + { + unsigned long vaddr; + int idx, type; + + type = kmap_atomic_idx_push(); + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + BUG_ON(!pte_none(*(kmap_pte-idx))); + set_pte(kmap_pte-idx, mk_pte(page, prot)); + arch_flush_lazy_mmu_mode(); + + return (void *)vaddr; + } + EXPORT_SYMBOL(kmap_atomic_high_prot); + + static inline int kmap_atomic_idx_push(void) + { + int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; + + #ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(in_irq() && !irqs_disabled()); + BUG_ON(idx >= KM_TYPE_NR); + #endif + return idx; + } + + +Notice that fix-mapped linear addresses and a stack-like approach are +used: each CPU has KM_TYPE_NR reserved entries which are used in a +first come, first served fashion. This allows using multiple temporary +mappings at once, for example one in process context, one in an +interrupt handler, and a few more in tasklets or softirqs. + +.. slide:: Implementation of temporary mappings + :inline-contents: false + :level: 2 + + * Use the fixed-mapped linear addresses + + * Every CPU has KM_TYPE_NR reserved entries to be used for + temporary mappings + + * Stack like selection: every user picks the current entry and + increments the "stack" counter + +Permanent mappings +------------------ + +Permanent mappings allow users to hold on to a mapping for long +(undefined) periods of time which means that context switches are +allowed after a mapping and before releasing it. + +This flexibility comes with a price though. A search operation is +performed to find a free entry and they can not be used in interrupt +context - the operation that tries to find a free virtual address page +may block. There is a limited number of permanent mappings available +(typically one page table is reserved for permanent mappings) + +.. 
slide:: Permanent mappings + :inline-contents: false + :level: 2 + + * :c:func:`kmap`, :c:func:`kunmap` + + * Context switches are allowed + + * Only available in process context + + * One page table is reserved for permanent mappings + + * Page counter + + * 0 - page is not mapped, free and ready to use + + * 1 - page is not mapped, may be present in TLB needs flushing before using + + * N - page is mapped N-1 times + + diff --git a/refs/pull/405/merge/_sources/lectures/arch.rst.txt b/refs/pull/405/merge/_sources/lectures/arch.rst.txt new file mode 100644 index 00000000..ef97e871 --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/arch.rst.txt @@ -0,0 +1,217 @@ +================== +Architecture Layer +================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives: +=================== + +.. slide:: Introduction + :inline-contents: True + :level: 2 + + * Overview of the arch layer + + * Overview of the boot process + + +Overview of the arch layer +========================== + +.. slide:: Overview of the arch layer + :level: 2 + :inline-contents: True + + .. ditaa:: + :height: 100% + + +---------------+ +--------------+ +---------------+ + | Application 1 | | Application2 | ... 
| Application n | + +---------------+ +--------------+ +---------------+ + | | | + v v v + +--------------------------------+------------------------+ + | Kernel core & subsystems | Generic Drivers | + +--------------------------------+------------------------+ + | Generic Architecture Code | + +---------------------------------------------------------+ + | Architecture Specific Code | + | | + | +-----------+ +--------+ +---------+ +--------+ | + | | Bootstrap | | Memory | | Threads | | Timers | | + | +-----------+ +--------+ +---------+ +--------+ | + | +------+ +----------+ +------------------+ | + | | IRQs | | Syscalls | | Platform Drivers | | + | +------+ +----------+ +------------------+ | + | +------------------+ +---------+ +---------+ | + | | Platform Drivers | | machine | ... | machine | | + | +------------------+ +---------+ +---------+ | + +---------------------------------------------------------+ + | | | + v v v + +--------------------------------------------------------+ + | Hardware | + +--------------------------------------------------------+ + + +Boot strap +---------- + +.. slide:: Bootstrap + :level: 2 + :inline-contents: True + + * The first kernel code that runs + + * Typically runs with the MMU disabled + + * Move / Relocate kernel code + + +Boot strap +---------- + +.. slide:: Bootstrap + :level: 2 + :inline-contents: True + + * The first kernel code that runs + + * Typically runs with the MMU disabled + + * Copy bootloader arguments and determine kernel run location + + * Move / relocate kernel code to final location + + * Initial MMU setup - map the kernel + + + +Memory setup +------------ + +.. 
slide:: Memory Setup + :level: 2 + :inline-contents: True + + * Determine available memory and setup the boot memory allocator + + * Manages memory regions before the page allocator is setup + + * Bootmem - used a bitmap to track free blocks + + * Memblock - deprecates bootmem and adds support for memory ranges + + * Supports both physical and virtual addresses + + * support NUMA architectures + + +MMU management +-------------- + +.. slide:: MMU management + :level: 2 + :inline-contents: True + + * Implements the generic page table manipulation APIs: types, + accessors, flags + + * Implement TLB management APIs: flush, invalidate + + +Thread Management +----------------- + +.. slide:: Thread Management + :level: 2 + :inline-contents: True + + * Defines the thread type (struct thread_info) and implements + functions for allocating threads (if needed) + + * Implement :c:func:`copy_thread` and :c:func:`switch_context` + + +Time Management +---------------- + +.. slide:: Timer Management + :level: 2 + :inline-contents: True + + * Setup the timer tick and provide a time source + + * Mostly transitioned to platform drivers + + * clock_event_device - for scheduling timers + + * clocksource - for reading the time + + +IRQs and exception management +----------------------------- + +.. slide:: IRQs and exception management + :level: 2 + :inline-contents: True + + * Define interrupt and exception handlers / entry points + + * Setup priorities + + * Platform drivers for interrupt controllers + + +System calls +------------ + +.. slide:: System calls + :level: 2 + :inline-contents: True + + * Define system call entry point(s) + + * Implement user-space access primitives (e.g. copy_to_user) + + +Platform Drivers +---------------- + +.. slide:: Platform Drivers + :level: 2 + :inline-contents: True + + * Platform and architecture specific drivers + + * Bindings to platform device enumeration methods (e.g. 
device tree + or ACPI) + +Machine specific code +--------------------- + +.. slide:: Machine specific code + :level: 2 + :inline-contents: True + + * Some architectures use a "machine" / "platform" abstraction + + * Typical for architecture used in embedded systems with a lot of + variety (e.g. ARM, powerPC) + + +Overview of the boot process +============================ + + +.. slide:: Boot flow inspection + :level: 2 + :inline-contents: True + + + .. asciicast:: ../res/boot.cast diff --git a/refs/pull/405/merge/_sources/lectures/debugging.rst.txt b/refs/pull/405/merge/_sources/lectures/debugging.rst.txt new file mode 100644 index 00000000..dc384855 --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/debugging.rst.txt @@ -0,0 +1,942 @@ +========= +Debugging +========= + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives: +=================== + +One essential part of Linux kernel development is debugging. In user space we had +the support of the kernel so we could easily stop processes and use gdb to inspect +their behavior. In the kernel, in order to use gdb we need to use hypervisor like +QEMU or JTAG based hardware interfaces which are not always available. The Linux +kernel provides a set of tools and debug options useful for investigating abnormal +behavior. + +In this lecture we will learn about: + +.. slide:: Debugging + :inline-contents: True + :level: 2 + + * decoding an oops/panic + * list debugging + * memory debugging + * locking debugging + * profiling + +Decoding an oops/panic +====================== + +An oops is an inconsistent state that the kernel detects inside itself. +Upon detecting an oops the Linux kernel kills the offending process, +prints information that can help debug the problem and continues execution +but with limited reliability. + +Lets consider the following Linux kernel module: + +.. slide:: Oops module + :inline-contents: True + :level: 2 + + .. 
code-block:: c + + static noinline void do_oops(void) + { + *(int*)0x42 = 'a'; + } + + static int so2_oops_init(void) + { + pr_info("oops_init\n"); + do_oops(); + + return 0; + } + + static void so2_oops_exit(void) + { + pr_info("oops exit\n"); + } + + module_init(so2_oops_init); + module_exit(so2_oops_exit); + +Notice that ''do_oops'' function tries to write at an invalid memory address. Because the kernel +cannot find a suitable physical page were to write, it kills the insmod task in the context of +which ''do_oops'' runs. Then it prints the following oops message: + + .. code-block:: bash + + root@qemux86:~/skels/debugging/oops# insmod oops.ko + BUG: unable to handle kernel NULL pointer dereference at 00000042 + IP: do_oops+0x8/0x10 [oops] + *pde = 00000000 + Oops: 0002 [#1] SMP + Modules linked in: oops(O+) + CPU: 0 PID: 234 Comm: insmod Tainted: G O 4.15.0+ #3 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 + EIP: do_oops+0x8/0x10 [oops] + EFLAGS: 00000292 CPU: 0 + EAX: 00000061 EBX: 00000000 ECX: c7ed3584 EDX: c7ece8dc + ESI: c716c908 EDI: c8816010 EBP: c7257df0 ESP: c7257df0 + DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 + CR0: 80050033 CR2: 00000042 CR3: 0785f000 CR4: 00000690 + Call Trace: + so2_oops_init+0x17/0x20 [oops] + do_one_initcall+0x37/0x170 + ? cache_alloc_debugcheck_after.isra.19+0x15f/0x2f0 + ? __might_sleep+0x32/0x90 + ? trace_hardirqs_on_caller+0x11c/0x1a0 + ? do_init_module+0x17/0x1c2 + ? kmem_cache_alloc+0xa4/0x1e0 + ? 
do_init_module+0x17/0x1c2 + do_init_module+0x46/0x1c2 + load_module+0x1f45/0x2380 + SyS_init_module+0xe5/0x100 + do_int80_syscall_32+0x61/0x190 + entry_INT80_32+0x2f/0x2f + EIP: 0x44902cc2 + EFLAGS: 00000206 CPU: 0 + EAX: ffffffda EBX: 08afb050 ECX: 0000eef4 EDX: 08afb008 + ESI: 00000000 EDI: bf914dbc EBP: 00000000 ESP: bf914c1c + DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b + Code: 42 00 00 00 5d c3 90 55 89 e5 83 ec 04 c7 04 24 24 70 81 c8 e8 + EIP: do_oops+0x8/0x10 [oops] SS:ESP: 0068:c7257df0 + CR2: 0000000000000042 + ---[ end trace 011848be72f8bb42 ]--- + Killed + +An oops contains information about the IP which caused the fault, register status, process, +CPU on which the fault happend like below: + +.. slide:: Oops information + :inline-contents: True + :level: 2 + + .. code-block:: bash + + root@qemux86:~/skels/debugging/oops# insmod oops.ko + BUG: unable to handle kernel NULL pointer dereference at 00000042 + IP: do_oops+0x8/0x10 [oops] + *pde = 00000000 + Oops: 0002 [#1] SMP + Modules linked in: oops(O+) + CPU: 0 PID: 234 Comm: insmod Tainted: G O 4.15.0+ #3 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 + EIP: do_oops+0x8/0x10 [oops] + CR0: 80050033 CR2: 00000042 CR3: 0785f000 CR4: 00000690 + EIP: 0x44902cc2 + EFLAGS: 00000206 CPU: 0 + EAX: ffffffda EBX: 08afb050 ECX: 0000eef4 EDX: 08afb008 + ESI: 00000000 EDI: bf914dbc EBP: 00000000 ESP: bf914c1c + DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b + Code: 42 00 00 00 5d c3 90 55 89 e5 83 ec 04 c7 04 24 24 70 81 c8 e8 + Killed + +Another important thing that an oops can provide is the stack trace of functions called before +the fault happend: + +.. slide:: Oops stacktrace + :inline-contents: True + :level: 2 + + + .. code-block:: bash + + root@qemux86:~/skels/debugging/oops# insmod oops.ko + BUG: unable to handle kernel NULL pointer dereference at 00000042 + Call Trace: + so2_oops_init+0x17/0x20 [oops] + do_one_initcall+0x37/0x170 + ? 
cache_alloc_debugcheck_after.isra.19+0x15f/0x2f0 + ? __might_sleep+0x32/0x90 + ? trace_hardirqs_on_caller+0x11c/0x1a0 + ? do_init_module+0x17/0x1c2 + ? kmem_cache_alloc+0xa4/0x1e0 + ? do_init_module+0x17/0x1c2 + do_init_module+0x46/0x1c2 + load_module+0x1f45/0x2380 + SyS_init_module+0xe5/0x100 + do_int80_syscall_32+0x61/0x190 + entry_INT80_32+0x2f/0x2f + Killed + +Decoding an oops +---------------- + +.. slide:: Debugging + :inline-contents: True + :level: 2 + + * CONFIG_DEBUG_INFO + * addr2line + * gdb + * objdump -dSr + +addr2line +--------- + +*addr2line* translates addresses into file names and line numbers. Given +an address in an executable it uses the debugging information to figure out +which file name and line number are associated with it. + +Modules are loaded at dynamic addresses but are compiled starting with 0 as +a base address. So, in order to find the line number for a given dynamic address +we need to know the module's load address. + +.. slide:: addr2line + :inline-contents: True + :level: 2 + + .. code-block:: bash + + $ addr2line -e oops.o 0x08 + $ skels/debugging/oops/oops.c:5 + $ # 0x08 is the offset of the offending instruction inside the oops.ko module + +objdump +------- + +Similarly, we can determine the offending line using objdump: + +.. slide:: objdump + :inline-contents: True + :level: 2 + + .. code-block:: bash + + $ cat /proc/modules + oops 20480 1 - Loading 0xc8816000 (O+) + + $ objdump -dS --adjust-vma=0xc8816000 oops.ko + c8816000: b8 61 00 00 00 mov $0x61,%eax + + static noinline void do_oops(void) + { + c8816005: 55 push %ebp + c8816006: 89 e5 mov %esp,%ebp + *(int*)0x42 = 'a'; + c8816008: a3 42 00 00 00 mov %eax,0x42 + +gdb +--- + +.. slide:: gdb + :inline-contents: True + :level: 2 + + .. code-block:: bash + + $ gdb ./vmlinux + + (gdb) list *(do_panic+0x8) + 0xc1244138 is in do_panic (lib/test_panic.c:8). 
+ 3 + 4 static struct timer_list panic_timer; + 5 + 6 static void do_panic(struct timer_list *unused) + 7 { + 8 *(int*)0x42 = 'a'; + 9 } + 10 + 11 static int so2_panic_init(void) + +Kernel panic +------------ + +A kernel panic is a special type of oops where the kernel cannot continue execution. For example +if the function do_oops from above was called in the interrupt context, the kernel wouldn't know how to kill +and it will decide that it is better to crash the kernel and stop execution. + +Here is a sample code that will generate a kernel panic: + +.. slide:: Kernel panic + :inline-contents: True + :level: 2 + + .. code-block:: c + + static struct timer_list panic_timer; + + static void do_panic(struct timer_list *unused) + { + *(int*)0x42 = 'a'; + } + + static int so2_panic_init(void) + { + pr_info("panic_init\n"); + + timer_setup(&panic_timer, do_panic, 0); + mod_timer(&panic_timer, jiffies + 2 * HZ); + + return 0; + } + +Loading the module will generate the following kernel panic message: + +.. code-block:: bash + + root@qemux86:~/skels/debugging/panic# insmod panic.ko + panic: loading out-of-tree module taints kernel. + panic_init + root@qemux86:~/skels/debugging/panic# BUG: unable to handle kernel NULL pointer dereference at 00000042 + IP: do_panic+0x8/0x10 [panic] + *pde = 00000000 + Oops: 0002 [#1] SMP + Modules linked in: panic(O) + CPU: 0 PID: 0 Comm: swapper/0 Tainted: G O 4.15.0+ #19 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 + EIP: do_panic+0x8/0x10 [panic] + EFLAGS: 00010246 CPU: 0 + EAX: 00000061 EBX: 00000101 ECX: 000002d8 EDX: 00000000 + ESI: c8817000 EDI: c8819200 EBP: c780ff34 ESP: c780ff34 + DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 + CR0: 80050033 CR2: 00000042 CR3: 0716b000 CR4: 00000690 + Call Trace: + + call_timer_fn+0x63/0xf0 + ? process_timeout+0x10/0x10 + run_timer_softirq+0x14f/0x170 + ? 0xc8817000 + ? trace_hardirqs_on_caller+0x9b/0x1a0 + __do_softirq+0xde/0x1f2 + ? 
__irqentry_text_end+0x6/0x6 + do_softirq_own_stack+0x57/0x70 + + irq_exit+0x7d/0x90 + smp_apic_timer_interrupt+0x4f/0x90 + ? trace_hardirqs_off_thunk+0xc/0x1d + apic_timer_interrupt+0x3a/0x40 + EIP: default_idle+0xa/0x10 + EFLAGS: 00000246 CPU: 0 + EAX: c15c97c0 EBX: 00000000 ECX: 00000000 EDX: 00000001 + ESI: 00000000 EDI: 00000000 EBP: c15c3f48 ESP: c15c3f48 + DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 + arch_cpu_idle+0x9/0x10 + default_idle_call+0x19/0x30 + do_idle+0x105/0x180 + cpu_startup_entry+0x25/0x30 + rest_init+0x1e3/0x1f0 + start_kernel+0x305/0x30a + i386_start_kernel+0x95/0x99 + startup_32_smp+0x15f/0x164 + Code: 42 00 00 00 5d c3 90 55 89 e5 83 ec 08 c7 04 24 24 80 81 c8 e8 + EIP: do_panic+0x8/0x10 [panic] SS:ESP: 0068:c780ff34 + CR2: 0000000000000042 + ---[ end trace 77f49f83f2e42f91 ]--- + Kernel panic - not syncing: Fatal exception in interrupt + Kernel Offset: disabled + ---[ end Kernel panic - not syncing: Fatal exception in interrupt + + +List debugging +============== + +In order to catch access to uninitialized elements the kernel uses poison +magic values. + +.. slide:: List debugging + :inline-contents: True + :level: 2 + + .. code-block:: bash + + static inline void list_del(struct list_head *entry) + { + __list_del(entry->prev, entry->next); + entry->next = (struct list_head*)LIST_POISON1; + entry->prev = (struct list_head*)LIST_POISON2; + } + + BUG: unable to handle kernel NULL pointer dereference at 00000100 + IP: crush+0x80/0xb0 [list] + +Memory debugging +================ + +There are several tools for memory debugging: + +.. slide:: Memory debugging + :inline-contents: True + :level: 2 + + * SLAB/SLUB debugging + * KASAN + * kmemcheck + * DEBUG_PAGEALLOC + +Slab debugging +--------------- + +Slab debugging uses a memory poison technique to detect several types of memory +bugs in the SLAB/SLUB allocators. + +The allocated buffers are guarded with memory that has been filled in with +special markers. 
Any adjacent writes to the buffer will be detected at a later +time when other memory management operations on that buffer are performed +(e.g. when the buffer is freed). + +Upon allocation of the buffer, the buffer is also filled in with a special +value to potentially detect buffer access before initialization (e.g. if the +buffer holds pointers). The value is selected in such a way that it is unlikely +to be a valid address and as such triggers kernel bugs at access time. + +A similar technique is used when freeing the buffer: the buffer is filled with +another special value that will cause kernel bugs if pointers are accessed after +the memory is freed. In this case, the allocator also checks the next time the +buffer is allocated that the buffer was not modified. + +The diagram below shows a summary of the way SLAB/SLUB poisoning works: + + +.. slide:: Slab debugging + :inline-contents: True + :level: 2 + + * CONFIG_DEBUG_SLAB + * poison-based memory debuggers + + .. ditaa:: + +--------------+-----------------------+--------------+ + | cF88 | c8F8 | cF88 | + | Buffer | Allocated buffer | Buffer | + | Underflow | 0x5a5a5a5a | Overflow | + | Poison | 0x5a5a5a5a | Poison | + | | 0x5a5a5a5a | | + +--------------+-----------------------+--------------+ + + +--------------+-----------------------+--------------+ + | cF88 | c888 | cF88 | + | Buffer | Freed buffer | Buffer | + | Underflow | 0x6b6b6b6b | Overflow | + | Poison | 0x6b6b6b6b | Poison | + | | 0x6b6b6b6b | | + +--------------+-----------------------+--------------+ + + +Example of a use before initialize bug: + +.. slide:: Use before initialize bugs + :inline-contents: True + :level: 2 + + :: + + BUG: unable to handle kernel paging request at 5a5a5a5a + IP: [] __list_del_entry+0x37/0x71 + … + Call Trace: + [] list_del+0xb/0x1b + [] use_before_init+0x31/0x38 [crusher] + [] crush_it+0x38/0xa9 [crusher] + [] init_module+0x8/0xa [crusher] + [] do_one_initcall+0x72/0x119 + [] ? 
crush_it+0xa9/0xa9 [crusher] + [] sys_init_module+0xc8d/0xe77 + [] syscall_call+0x7/0xb + + .. code-block:: c + + noinline void use_before_init(void) + { + struct list_m *m = kmalloc(sizeof(*m), GFP_KERNEL); + + printk("%s\n", __func__); + list_del(&m->lh); + } + +Example of an use after free bug: + +.. slide:: Use after free bug + :inline-contents: True + :level: 2 + + :: + + BUG: unable to handle kernel paging request at 6b6b6b6b + IP: [] __list_del_entry+0x37/0x71 + … + Call Trace: + [] list_del+0xb/0x1b + [] use_after_free+0x38/0x3f [crusher] + [] crush_it+0x52/0xa9 [crusher] + [] init_module+0x8/0xa [crusher] + [] do_one_initcall+0x72/0x119 + [] ? crush_it+0xa9/0xa9 [crusher] + [] sys_init_module+0xc8d/0xe77 + [] syscall_call+0x7/0xb + + .. code-block:: c + + noinline void use_after_free(void) + { + struct list_m *m = kmalloc(sizeof(*m), GFP_KERNEL); + + printk("%s\n", __func__); + kfree(m); + list_del(&m->lh); + } + +Another example of an use after free bug is shown below. Note that this time the +bug is detected at the next allocation. + +.. slide:: Use after free bug + :inline-contents: True + :level: 2 + + :: + + # insmod /system/lib/modules/crusher.ko test=use_before_init + Slab corruption: size-4096 start=ed612000, len=4096 + 000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + 010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 6b 6b + + .. code-block:: c + + noinline void use_after_free2(void) + { + char *b = kmalloc(3000, GFP_KERNEL); + kfree(b); + memset(b, 0, 30); + b = kmalloc(3000, GFP_KERNEL); + kfree(b); + } + +Finally this is an example of a buffer overflow bug: + +.. 
slide:: Buffer overflow bugs + :inline-contents: True + :level: 2 + + :: + + slab error in verify_redzone_free(): cache `dummy': memory outside object was overwritten + Pid: 1282, comm: insmod Not tainted 3.0.16-mid10-00007-ga4a6b62-dirty #70 + Call Trace: + [] __slab_error+0x17/0x1c + [] __cache_free+0x12c/0x317 + [] kmem_cache_free+0x2b/0xaf + [] buffer_overflow+0x4c/0x57 [crusher] + [] crush_it+0x6c/0xa9 [crusher] + [] init_module+0x8/0xd [crusher] + [] do_one_initcall+0x72/0x119 + [] sys_init_module+0xc8d/0xe77 + [] syscall_call+0x7/0xb + eb002bf8: redzone 1:0xd84156c5635688c0, redzone 2:0x0 + + .. code-block:: c + + noinline void buffer_overflow(void) + { + struct kmem_cache *km = kmem_cache_create("dummy", 3000, 0, 0, NULL); + char *b = kmem_cache_alloc(km, GFP_KERNEL); + + printk("%s\n", __func__); + memset(b, 0, 3016); + kmem_cache_free(km, b); + } + + +DEBUG_PAGEALLOC +--------------- + +.. slide:: DEBUG_PAGEALLOC + :inline-contents: True + :level: 2 + + * Memory debugger that works at a page level + * Detects invalid accesses either by: + + * Filling pages with poison byte patterns and checking the pattern at + reallocation + * Unmapping the deallocated pages from kernel space (just a few + architectures) + + +KASan +----- + +KASan is a dynamic memory error detector designed to find use-after-free +and out-of-bounds bugs. + +The main idea of KASAN is to use shadow memory to record whether each byte +of memory is safe to access or not, and use the compiler's instrumentation to +check the shadow memory on each memory access. + +Address sanitizer uses 1 byte of shadow memory to track 8 bytes of kernel +address space. It uses 0-7 to encode the number of consecutive bytes at +the beginning of the eight-byte region that are valid. + +See `The Kernel Address Sanitizer (KASAN)` for more information and have a look +at lib/test_kasan.c for an example of problems that KASan can detect. + +.. 
slide:: KASan + :inline-contents: True + :level: 2 + + * dynamic memory error detector + * finds use-after-free or out-of-bounds bugs + * uses shadow memory to track memory operations + * lib/test_kasan.c + + +KASan vs DEBUG_PAGEALLOC +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. slide:: KASan vs DEBUG_PAGEALLOC + :inline-contents: True + :level: 2 + + KASan is slower than DEBUG_PAGEALLOC, but KASan works on sub-page granularity + level, so it is able to find more bugs. + + +KASan vs SLUB_DEBUG +~~~~~~~~~~~~~~~~~~~ + +.. slide:: KASan vs SLUB_DEBUG + :inline-contents: True + :level: 2 + + * SLUB_DEBUG has lower overhead than KASan. + * SLUB_DEBUG in most cases is not able to detect bad reads; KASan is able to + detect both reads and writes. + * In some cases (e.g. redzone overwritten) SLUB_DEBUG detects bugs only on + allocation/freeing of the object. KASan catches bugs right before they happen, + so we always know the exact place of the first bad read/write. + + +Kmemleak +-------- + +Kmemleak provides a way of detecting kernel memory leaks in a way similar to a +tracing garbage collector. Since tracing pointers is not possible in C, kmemleak +scans the kernel stacks as well as dynamically and statically allocated kernel memory for +pointers to allocated buffers. A buffer for which there is no pointer is +considered as leaked. The basic steps to use kmemleak are presented below; for +more information see `Kernel Memory Leak Detector` + + +.. slide:: Kmemleak + :inline-contents: True + :level: 2 + + * enable kernel config: `CONFIG_DEBUG_KMEMLEAK` + * setup: `mount -t debugfs nodev /sys/kernel/debug` + * trigger a memory scan: `echo scan > /sys/kernel/debug/kmemleak` + * show memory leaks: `cat /sys/kernel/debug/kmemleak` + * clear all possible leaks: `echo clear > /sys/kernel/debug/kmemleak` + +As an example, let's look at the following simple module: + +.. slide:: Kmemleak example + :inline-contents: True + :level: 2 + + .. 
code-block:: c + + static int leak_init(void) + { + pr_info("%s\n", __func__); + + (void)kmalloc(16, GFP_KERNEL); + + return 0; + } + + MODULE_LICENSE("GPL v2"); + module_init(leak_init); + +Loading the module and triggering a kmemleak scan will issue the +following report: + +.. slide:: Kmemleak report + :inline-contents: True + :level: 2 + + :: + + root@qemux86:~# insmod skels/debugging/leak/leak.ko + leak: loading out-of-tree module taints kernel. + leak_init + root@qemux86:~# echo scan > /sys/kernel/debug/kmemleak + root@qemux86:~# echo scan > /sys/kernel/debug/kmemleak + kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak) + root@qemux86:~# cat /sys/kernel/debug/kmemleak + unreferenced object 0xd7871500 (size 32): + comm "insmod", pid 237, jiffies 4294902108 (age 24.628s) + hex dump (first 32 bytes): + 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ + 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a a5 ZZZZZZZZZZZZZZZ. + backtrace: + [<(ptrval)>] kmem_cache_alloc_trace+0x163/0x310 + [<(ptrval)>] leak_init+0x2f/0x1000 [leak] + [<(ptrval)>] do_one_initcall+0x57/0x2e0 + [<(ptrval)>] do_init_module+0x4b/0x1be + [<(ptrval)>] load_module+0x201a/0x2590 + [<(ptrval)>] sys_init_module+0xfd/0x120 + [<(ptrval)>] do_int80_syscall_32+0x6a/0x1a0 + + +.. note:: Notice that we did not have to unload the module to detect the memory + leak since kmemleak detects that the allocated buffer is not + reachable anymore. + + +Lockdep checker +=============== + +.. slide:: Lockdep checker + :inline-contents: True + :level: 2 + + * CONFIG_DEBUG_LOCKDEP + * Detects lock inversion, circular dependencies, incorrect usage of locks + (including interrupt context) + * Maintains dependency between classes of locks not individual locks + * Each scenario is only checked once and hashed + + +Let's take for example the following kernel module that runs two kernel threads: + +.. slide:: AB BA Deadlock Example + :inline-contents: True + :level: 2 + + .. 
code-block:: c + + static noinline int thread_a(void *unused) + { + mutex_lock(&a); pr_info("%s acquired A\n", __func__); + mutex_lock(&b); pr_info("%s acquired B\n", __func__); + + mutex_unlock(&b); + mutex_unlock(&a); + + return 0; + } + + .. code-block:: c + + static noinline int thread_b(void *unused) + { + mutex_lock(&b); pr_info("%s acquired B\n", __func__); + mutex_lock(&a); pr_info("%s acquired A\n", __func__); + + mutex_unlock(&a); + mutex_unlock(&b); + + return 0; + } + + +Loading this module with lockdep checker active will produce the following +kernel log: + +.. slide:: AB BA Deadlock Report + :inline-contents: True + :level: 2 + + :: + + thread_a acquired A + thread_a acquired B + thread_b acquired B + + ====================================================== + WARNING: possible circular locking dependency detected + 4.19.0+ #4 Tainted: G O + ------------------------------------------------------ + thread_b/238 is trying to acquire lock: + (ptrval) (a){+.+.}, at: thread_b+0x48/0x90 [locking] + + but task is already holding lock: + (ptrval) (b){+.+.}, at: thread_b+0x27/0x90 [locking] + + which lock already depends on the new lock. + + +As you can see, although the deadlock condition did not trigger (because thread +A did not complete execution before thread B started execution) the lockdep +checker identified a potential deadlock scenario. + +Lockdep checker will provide even more information to help determine what caused +the deadlock, like the dependency chain: + +.. 
slide:: AB BA Deadlock Report (dependency chain) + :inline-contents: True + :level: 2 + + :: + + the existing dependency chain (in reverse order) is: + + -> #1 (b){+.+.}: + __mutex_lock+0x60/0x830 + mutex_lock_nested+0x20/0x30 + thread_a+0x48/0x90 [locking] + kthread+0xeb/0x100 + ret_from_fork+0x2e/0x38 + + -> #0 (a){+.+.}: + lock_acquire+0x93/0x190 + __mutex_lock+0x60/0x830 + mutex_lock_nested+0x20/0x30 + thread_b+0x48/0x90 [locking] + kthread+0xeb/0x100 + ret_from_fork+0x2e/0x38 + +and even an unsafe locking scenario: + +.. slide:: AB BA Deadlock Report (unsafe locking scenario) + :inline-contents: True + :level: 2 + + :: + + other info that might help us debug this: + + Possible unsafe locking scenario: + + CPU0 CPU1 + ---- ---- + lock(b); + lock(a); + lock(b); + lock(a); + + *** DEADLOCK *** + + +Another example of unsafe locking issues that lockdep checker detects +is unsafe locking from interrupt context. Lets consider the following +kernel module: + +.. slide:: IRQ Deadlock Example + :inline-contents: True + :level: 2 + + .. code-block:: c + + static DEFINE_SPINLOCK(lock); + + static void timerfn(struct timer_list *unused) + { + pr_info("%s acquiring lock\n", __func__); + spin_lock(&lock); pr_info("%s acquired lock\n", __func__); + spin_unlock(&lock); pr_info("%s released lock\n", __func__); + } + + static DEFINE_TIMER(timer, timerfn); + + int init_module(void) + { + mod_timer(&timer, jiffies); + + pr_info("%s acquiring lock\n", __func__); + spin_lock(&lock); pr_info("%s acquired lock\n", __func__); + spin_unlock(&lock); pr_info("%s released lock\n", __func__); + return 0; + } + + +As in the previous case, loading the module will trigger a lockdep +warning: + +.. 
slide:: IRQ Deadlock Report + :inline-contents: True + :level: 2 + + :: + + init_module acquiring lock + init_module acquired lock + init_module released lock + timerfn acquiring lock + + ================================ + WARNING: inconsistent lock state + 4.19.0+ #4 Tainted: G O + -------------------------------- + inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. + ksoftirqd/0/9 [HC0[0]:SC1[1]:HE1:SE0] takes: + (ptrval) (lock#4){+.?.}, at: timerfn+0x25/0x60 [locking2] + {SOFTIRQ-ON-W} state was registered at: + lock_acquire+0x93/0x190 + _raw_spin_lock+0x39/0x50 + init_module+0x35/0x70 [locking2] + do_one_initcall+0x57/0x2e0 + do_init_module+0x4b/0x1be + load_module+0x201a/0x2590 + sys_init_module+0xfd/0x120 + do_int80_syscall_32+0x6a/0x1a0 + restore_all+0x0/0x8d + + +The warning will also provide additional information and a potential unsafe +locking scenario: + +.. slide:: IRQ Deadlock Report + :inline-contents: True + :level: 2 + + :: + + Possible unsafe locking scenario: + + CPU0 + ---- + lock(lock#4); + + lock(lock#4); + + *** DEADLOCK *** + + 1 lock held by ksoftirqd/0/9: + #0: (ptrval) (/home/tavi/src/linux/tools/labs/skels/./debugging/locking2/locking2.c:13){+.-.}, at: call_timer_f0 + stack backtrace: + CPU: 0 PID: 9 Comm: ksoftirqd/0 Tainted: G O 4.19.0+ #4 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 + Call Trace: + dump_stack+0x66/0x96 + print_usage_bug.part.26+0x1ee/0x200 + mark_lock+0x5ea/0x640 + __lock_acquire+0x4b4/0x17a0 + lock_acquire+0x93/0x190 + _raw_spin_lock+0x39/0x50 + timerfn+0x25/0x60 [locking2] + + +perf +==== + +.. slide:: perf + :inline-contents: True + :level: 2 + + * performance counters, tracepoints, kprobes, uprobes + * hardware events: CPU cycles, TLB misses, cache misses + * software events: page faults , context switches + * collects backtraces (user + kernel) + +Other tools +=========== + +.. 
slide:: Other tools + :inline-contents: True + :level: 2 + + * ftrace + * kprobes + * sparse + * coccinelle + * checkpatch.pl + * printk + * dump_stack() diff --git a/refs/pull/405/merge/_sources/lectures/fs.rst.txt b/refs/pull/405/merge/_sources/lectures/fs.rst.txt new file mode 100644 index 00000000..20b4e6d6 --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/fs.rst.txt @@ -0,0 +1,775 @@ +===================== +Filesystem Management +===================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives: +=================== + +.. slide:: Filesystem Management + :inline-contents: True + :level: 2 + + * Filesystem abstractions + + * Filesystem operations + + * Linux VFS + + * Overview of Linux I/O Management + + +Filesystem Abstractions +======================= + +A filesystem is a way to organize files and directories on storage +devices such as hard disks, SSDs or flash memory. There are many types +of filesystems (e.g. FAT, ext4, btrfs, ntfs) and on one running system +we can have multiple instances of the same filesystem type in use. + +While filesystems use different data structures to organize the +files, directories, user data and meta (internal) data on storage +devices there are a few common abstractions that are used in almost +all filesystems: + +.. slide:: Filesystem Abstractions + :inline-contents: True + :level: 2 + + * superblock + + * file + + * inode + + * dentry + + +Some of these abstractions are present both on disk and in memory +while some are only present in memory. + +The *superblock* abstraction contains information about the filesystem +instance such as the block size, the root inode, filesystem size. It +is present both on storage and in memory (for caching purposes). + +The *file* abstraction contains information about an opened file such +as the current file pointer. It only exists in memory. + +The *inode* identifies a file on disk. 
It exists both on storage +and in memory (for caching purposes). An inode identifies a file in a +unique way and has various properties such as the file size, access +rights, file type, etc. + +.. note:: The file name is not a property of the file. + +The *dentry* associates a name with an inode. It exists both on +storage and in memory (for caching purposes). + +The following diagram shows the relationship between the various filesystem +abstractions as they used in memory: + +.. slide:: Filesystem Abstractions - in memory + :inline-contents: True + :level: 2 + + .. ditaa:: + :--no-separation: + + file + descriptor + table + +------------+ +--------+ +--------+ +---------+ + | |------+--->| FILE |------->| dentry |------->| inode | + +------------+ | +--------+ +--------+ ^ +---------+ + +-> | |------+ dup | | type | + | +------------+ hard link | | perm | + | | ... | | | .... | + | +------------+ +--------+ +--------+ | +---------+ + | | |---------->| FILE |------->| dentry |---+ | + | +------------+ +--------+ +--------+ | + fd | + | + +------+ <-------------------+ + | data | + +------+ + +------+ +------+ + | data | | data | + +------+ +------+ + +------+ + | data | + +------+ + +Note that not all of the one to many relationships between the various +abstractions are depicted. + +Multiple file descriptors can point to the same *file* because we can +use the :c:func:`dup` system call to duplicate a file descriptor. + +Multiple *file* abstractions can point to the same *dentry* if we open +the same path multiple times. + +Multiple *dentries* can point to the same *inode* when hard links are +used. + +The following diagram shows the relationship of the filesystem +abstraction on storage: + +.. slide:: Filesystem Abstractions - on storage + :inline-contents: True + :level: 2 + + .. ditaa:: + :--no-separation: + + + +--------+ +-------+ data +--------+ + | dentry |-------------->| inode |--------+ | dentry | + +--------+ +-------+ | +--------+ + | ...... | | ..... 
| | | ...... | + +--------+ +-------+ dir | +--------+ + | dentry | | inode |--------|--+ | dentry | + +--------+ +-------+ | | +--------+ + ^ | | ^ + | | | | + | | | +--------+ + | V v | + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + blocks | | | | | | | | | | | | | | | | | | + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | | + | +------------+ | ++++++++++++ + +--->| superblock | +--->|||||||||||| block management + +------------+ ++++++++++++ + + +The diagram shows that the *superblock* is typically stored at the +beginning of the fileystem and that various blocks are used with +different purposes: some to store dentries, some to store inodes and +some to store user data blocks. There are also blocks used to manage +the available free blocks (e.g. bitmaps for the simple filesystems). + +The next diagram show a very simple filesystem where blocks are +grouped together by function: + +* the superblock contains information about the block size as well as + the IMAP, DMAP, IZONE and DZONE areas. + +* the IMAP area is comprised of multiple blocks which contains a + bitmap for inode allocation; it maintains the allocated/free state + for all inodes in the IZONE area + +* the DMAP area is comprised of multiple blocks which contains a + bitmap for data blocks; it maintains the allocated/free state for + all blocks the DZONE area + + +.. slide:: Simple filesystem example + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + :--no-separation: + + +--------------+--------+--------+---------+---------+ + | | | | | | + | Superblock | IMAP | DMAP | IZONE | DZONE | + | | | | | | + +--------------+--------+--------+---------+---------+ + + +Filesystem Operations +===================== + +The following diagram shows a high level overview of how the file +system drivers interact with the rest of the file system "stack". 
In +order to support multiple filesystem types and instances Linux +implements a large and complex subsystem that deals with filesystem +management. This is called Virtual File System (or sometimes Virtual +File Switch) and it is abbreviated with VFS. + + +.. slide:: Overview + :inline-contents: True + :level: 2 + + .. ditaa:: + + ^ ^ ^ + | stat | open | read + v v v + +------------------------------------------------------------+ + | | + | Virtual Filesystem Switch | + | | + +------------------------------------------------------------+ + ^ ^ + | | + v v + +-------------+ +-------------+ + | Filesystem | | Filesystem | + | driver | | driver | + +-------------+ +-------------+ + ^ ^ + | | + v v + +------------------------------------------------------------+ + | | + | Block I/O layer | + | | + +------------------------------------------------------------+ + +VFS translates the complex file management related system calls to +simpler operations that are implemented by the device drivers. These +are some of the operations that a file system must implement: + +.. slide:: Filesystem Operations + :inline-contents: True + :level: 2 + + * Mount + + * Open a file + + * Querying file attributes + + * Reading data from a file + + * Writing file to a file + + * Creating a file + + * Deleting a file + + +The next sections will look in-depth at some of these operations. + +Mounting a filesystem +--------------------- + +A summary of a typical implementation is presented below: + +.. slide:: Mounting a filesystem + :inline-contents: True + :level: 2 + + * Input: a storage device (partition) + + * Output: dentry pointing to the root directory + + * Steps: check device, determine filesystem parameters, locate the root inode + + * Example: check magic, determine block size, read the root inode and create dentry + + +Opening a file +-------------- + +A summary of a typical implementation is presented below: + +.. 
slide:: Opening a file + :inline-contents: True + :level: 2 + + * Input: path + + * Output: file descriptor + + * Steps: + + * Determine the filesystem type + + * For each name in the path: lookup parent dentry, load inode, + load data, find dentry + + * Create a new *file* that points to the last *dentry* + + * Find a free entry in the file descriptor table and set it to *file* + + +Querying file attributes +------------------------ + +A summary of a typical implementation is presented below: + +.. slide:: Querying file attributes + :inline-contents: True + :level: 2 + + * Input: path + + * Output: file attributes + + * Steps: + + * Access `file->dentry->inode` + + * Read file attributes from the *inode* + +Reading data from a file +------------------------ + +A summary of a typical implementation is presented below: + +.. slide:: Reading data from a file + :inline-contents: True + :level: 2 + + * Input: file descriptor, offset, length + + * Output: data + + * Steps: + + * Access `file->dentry->inode` + + * Determine data blocks + + * Copy data blocks to memory + + +Writing data to a file +---------------------- + +A summary of a typical implementation is presented below: + +.. slide:: Writing data to a file + :inline-contents: True + :level: 2 + + * Input: file descriptor, offset, length, data + + * Output: + + * Steps: + + * Allocate one or more data blocks + + * Add the allocated blocks to the inode and update file size + + * Copy data from userspace to internal buffers and write them to + storage + + +Closing a file +-------------- + +A summary of a typical implementation is presented below: + +.. slide:: Closing a file + :inline-contents: True + :level: 2 + + * Input: file descriptor + + * Output: + + * Steps: + + * set the file descriptor entry to NULL + + * Decrement file reference counter + + * When the counter reaches 0 free *file* + + +Directories +----------- + +.. 
slide:: Directories + :inline-contents: True + :level: 2 + + Directories are special files which contain one or more dentries. + +Creating a file +--------------- + +A summary of a typical implementation is presented below: + +.. slide:: Creating a file + :inline-contents: True + :level: 2 + + * Input: path + + * Output: + + * Steps: + + * Determine the inode directory + + * Read data blocks and find space for a new dentry + + * Write back the modified inode directory data blocks + + +Deleting a file +--------------- + +A summary of a typical implementation is presented below: + + +.. slide:: Deleting a file + :inline-contents: True + :level: 2 + + * Input: path + + * Output: + + * Steps: + + * determine the parent inode + + * read parent inode data blocks + + * find and erase the dentry (check for links) + + * when last file is closed: deallocate data and inode blocks + + +Linux Virtual File System +========================= + +Although the main purpose for the original introduction of VFS in UNIX +kernels was to support multiple filesystem types and instances, a side +effect was that it simplified filesystem device driver development +since common parts are now implemented in the VFS. Almost all of the +caching and buffer management is dealt with by VFS, leaving just +efficient data storage management to the filesystem device driver. + +In order to deal with multiple filesystem types, VFS introduced the +common filesystem abstractions previously presented. Note that the +filesystem driver can also use its own particular filesystem +abstractions in memory (e.g. ext4 inode or dentry) and that there +might be a different abstraction on storage as well. Thus we may end +up with three slightly different filesystem abstractions: one for +VFS - always in memory, and two for a particular filesystem - one in +memory used by the filesystem driver, and one on storage. + +.. slide:: Virtual File System + :level: 2 + :inline-contents: True + + ..
ditaa:: + :height: 100% + + + ^ ^ ^ + | stat | open | read + v v v + +------------------------------------------------------------+ + | Virtual File System | + | | + | | + | /-------\ /--------\ /--------\ | + | | inode |<----------+ dentry |<----------+ FILE | | + | \---+---/ \----+---/ \---+----/ | + | | | | | + | | | | | + | v v v | + | +-------+ +--------+ +-------+ | + | | inode | | dentry | | page | | + | | cache | | cache | | cache | | + | +-------+ +--------+ +-------+ | + | | + +------------------------------------------------------------+ + ^ ^ + | | + v v + +-------------+ +-------------+ + | Filesystem | | Filesystem | + | driver | | driver | + +-------------+ +-------------+ + + +Superblock Operations +--------------------- + +VFS requires that all filesystems implement a set of "superblock +operations". + +They deal with initializing, updating and freeing the VFS superblock: + + * :c:func:`fill_super` - reads the filesystem statistics (e.g. total + number of inodes, free number of inodes, total number of blocks, free + number of blocks) + + * :c:func:`write_super` - updates the superblock information on storage + (e.g. updating the number of free inode or data blocks) + + * :c:func:`put_super` - frees any data associated with the filesystem + instance, called when unmounting a filesystem + +The next class of operations deals with manipulating filesystem +inodes. These operations will receive VFS inodes as parameters but the +filesystem driver may use its own inode structures internally and, if +so, they will convert in between them as necessary. + +A summary of the superblock operations is presented below: + +.. slide:: Superblock Operations + :level: 2 + :inline-contents: True + + ..
hlist:: + :columns: 2 + + * fill_super + * put_super + * write_super + * read_inode + * write_inode + * evict_inode + * statfs + * remount_fs + + +Inode Operations +---------------- + +The next set of operations that VFS calls when interacting with +filesystem device drivers are the "inode operations". Non-intuitively +these mostly deal with manipulating dentries - looking up a file name, +creating, linking and removing files, dealing with symbolic links, +creating and removing directories. + +This is the list of the most important inode operations: + +.. slide:: Inode Operations + :level: 2 + :inline-contents: True + + .. hlist:: + :columns: 2 + + * create + * lookup + * link + * unlink + * symlink + * mkdir + * rmdir + * rename + * readlink + * follow_link + * put_link + * ... + + +The Inode Cache +--------------- + +The inode cache is used to avoid reading and writing inodes to and +from storage every time we need to read or update them. The cache uses +a hash table and inodes are indexed with a hash function which takes +as parameters the superblock (of a particular filesystem instance) and +the inode number associated with an inode. + +inodes are cached until either the filesystem is unmounted, the inode +deleted or the system enters a memory pressure state. When this +happens the Linux memory management system will (among other things) +free inodes from the inode cache based on how often they were +accessed. + +.. slide:: The Inode Cache + :level: 2 + :inline-contents: True + + * Caches inodes into memory to avoid costly storage operations + + * An inode is cached until low memory conditions are triggered + + * inodes are indexed with a hash table + + * The inode hash function takes the superblock and inode number as + inputs + + +The Dentry Cache +---------------- + +.. 
slide:: The Dentry Cache + :level: 2 + :inline-contents: True + + * State: + + * Used – *d_inode* is valid and the *dentry* object is in use + + * Unused – *d_inode* is valid but the dentry object is not in use + + * Negative – *d_inode* is not valid; the inode was not yet loaded + or the file was erased + + * Dentry cache + + * List of used dentries (dentry->d_state == used) + + * List of the most recent used dentries (sorted by access time) + + * Hash table to avoid searching the tree + +The Page Cache +-------------- + +.. slide:: The Page Cache + :level: 2 + :inline-contents: True + + * Caches file data and not block device data + + * Uses the :c:type:`struct address_space` to translate file offsets + to block offsets + + * Used for both `read` / `write` and `mmap` + + * Uses a radix tree + + + +.. slide:: struct address_space + :level: 2 + :inline-contents: True + + .. code-block:: c + + /** + * struct address_space - Contents of a cacheable, mappable object. + * @host: Owner, either the inode or the block_device. + * @i_pages: Cached pages. + * @gfp_mask: Memory allocation flags to use for allocating pages. + * @i_mmap_writable: Number of VM_SHARED mappings. + * @nr_thps: Number of THPs in the pagecache (non-shmem only). + * @i_mmap: Tree of private and shared mappings. + * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. + * @nrpages: Number of page entries, protected by the i_pages lock. + * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. + * @writeback_index: Writeback starts here. + * @a_ops: Methods. + * @flags: Error bits and flags (AS_*). + * @wb_err: The most recent error which has occurred. + * @private_lock: For use by the owner of the address_space. + * @private_list: For use by the owner of the address_space. + * @private_data: For use by the owner of the address_space. 
+ */ + struct address_space { + struct inode *host; + struct xarray i_pages; + gfp_t gfp_mask; + atomic_t i_mmap_writable; + #ifdef CONFIG_READ_ONLY_THP_FOR_FS + /* number of thp, only for non-shmem files */ + atomic_t nr_thps; + #endif + struct rb_root_cached i_mmap; + struct rw_semaphore i_mmap_rwsem; + unsigned long nrpages; + unsigned long nrexceptional; + pgoff_t writeback_index; + const struct address_space_operations *a_ops; + unsigned long flags; + errseq_t wb_err; + spinlock_t private_lock; + struct list_head private_list; + void *private_data; + } __attribute__((aligned(sizeof(long)))) __randomize_layout; + + struct address_space_operations { + int (*writepage)(struct page *page, struct writeback_control *wbc); + int (*readpage)(struct file *, struct page *); + + /* Write back some dirty pages from this mapping. */ + int (*writepages)(struct address_space *, struct writeback_control *); + + /* Set a page dirty. Return true if this dirtied it */ + int (*set_page_dirty)(struct page *page); + + /* + * Reads in the requested pages. Unlike ->readpage(), this is + * PURELY used for read-ahead!. + */ + int (*readpages)(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + void (*readahead)(struct readahead_control *); + + int (*write_begin)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + int (*write_end)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + + /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ + sector_t (*bmap)(struct address_space *, sector_t); + void (*invalidatepage) (struct page *, unsigned int, unsigned int); + int (*releasepage) (struct page *, gfp_t); + void (*freepage)(struct page *); + ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); + /* + * migrate the contents of a page to the specified target. 
If + * migrate_mode is MIGRATE_ASYNC, it must not block. + */ + int (*migratepage) (struct address_space *, + struct page *, struct page *, enum migrate_mode); + bool (*isolate_page)(struct page *, isolate_mode_t); + void (*putback_page)(struct page *); + int (*launder_page) (struct page *); + int (*is_partially_uptodate) (struct page *, unsigned long, + unsigned long); + void (*is_dirty_writeback) (struct page *, bool *, bool *); + int (*error_remove_page)(struct address_space *, struct page *); + + /* swapfile support */ + int (*swap_activate)(struct swap_info_struct *sis, struct file *file, + sector_t *span); + void (*swap_deactivate)(struct file *file); + }; + + +.. slide:: Reading data + :level: 2 + :inline-contents: True + + .. code-block:: c + + /** + * generic_file_read_iter - generic filesystem read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the "read_iter()" routine for all filesystems + * that can use the page cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall + * be returned when no data can be read without waiting for I/O requests + * to complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O + * requests shall be made for the read or for readahead. When no data + * can be read, -EAGAIN shall be returned. When readahead would be + * triggered, a partial, possibly empty read shall be returned. + * + * Return: + * * number of bytes copied, even for partial reads + * * negative error code (or 0 if IOCB_NOIO) if nothing was read + */ + ssize_t + generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) + + /* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. 
+ * Reads the page asynchronously --- the unlock_buffer() and + * set/clear_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. + */ + int block_read_full_page(struct page *page, get_block_t *get_block) + diff --git a/refs/pull/405/merge/_sources/lectures/interrupts.rst.txt b/refs/pull/405/merge/_sources/lectures/interrupts.rst.txt new file mode 100644 index 00000000..ecb43bbd --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/interrupts.rst.txt @@ -0,0 +1,976 @@ +========== +Interrupts +========== + +`View slides <interrupts-slides.html>`_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives +================== + +.. slide:: Interrupts + :inline-contents: True + :level: 2 + + * Interrupts and exceptions (x86) + + * Interrupts and exceptions (Linux) + + * Deferrable work + + * Timers + +What is an interrupt? +===================== + +An interrupt is an event that alters the normal execution flow of a +program and can be generated by hardware devices or even by the CPU +itself. When an interrupt occurs the current flow of execution is +suspended and the interrupt handler runs. After the interrupt handler runs +the previous execution flow is resumed. + +Interrupts can be grouped into two categories based on the source of +the interrupt. They can also be grouped into two other categories based +on the ability to postpone or temporarily disable the interrupt: + +.. slide:: Interrupts + :inline-contents: True + :level: 2 + + * **synchronous**, generated by executing an instruction + + * **asynchronous**, generated by an external event + + * **maskable** + + * can be ignored + + * signaled via INT pin + + * **non-maskable** + + * cannot be ignored + + * signaled via NMI pin + +Synchronous interrupts, usually named exceptions, handle conditions detected by the +processor itself in the course of executing an instruction. Divide by zero or +a system call are examples of exceptions.
+ +Asynchronous interrupts, usually named interrupts, are external events generated +by I/O devices. For example a network card generates an interrupt to signal +that a packet has arrived. + +Most interrupts are maskable, which means we can temporarily postpone +running the interrupt handler when we disable the interrupt until the +time the interrupt is re-enabled. However, there are a few critical +interrupts that can not be disabled/postponed. + +Exceptions +---------- + +There are two sources for exceptions: + +.. slide:: Exceptions + :inline-contents: True + :level: 2 + + * processor detected + + - **faults** + + - **traps** + + - **aborts** + + * programmed + + - **int n** + +Processor detected exceptions are raised when an abnormal condition is +detected while executing an instruction. + +A fault is a type of exception that is reported before the execution of the +instruction and can usually be corrected. The saved EIP is the address of +the instruction that caused the fault, so after the fault is corrected +the program can re-execute the faulty instruction (e.g. page fault). + +A trap is a type of exception that is reported after the execution of the +instruction in which the exception was detected. The saved EIP is the address +of the instruction after the instruction that caused the trap (e.g. debug trap). + +Quiz: interrupt terminology +--------------------------- + +.. slide:: Quiz: interrupt terminology + :inline-contents: True + :level: 2 + + For each of the following terms on the left select all the terms + from the right that best describe them. + + .. hlist:: + :columns: 2 + + * Watchdog + * Demand paging + * Division by zero + * Timer + * System call + * Breakpoint + + * Exception + * Interrupt + * Maskable + * Nonmaskable + * Trap + * Fault + + + +Hardware Concepts +================= + +Programmable Interrupt Controller +--------------------------------- + +..
slide:: Programmable Interrupt Controller + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + +-----------+ NMI + | | + | |<----------+ + | | + | | +------------+ + | | | | IRQ0 + | | | |<------------+ device0 + | CPU | | | IRQ1 + | | INTR | PIC |<------------+ device1 + | |<----------+ | IRQN + | | | |<------------+ deviceN + | | | | + +-----------+ +------------+ + +A device supporting interrupts has an output pin used for signaling an Interrupt ReQuest. IRQ +pins are connected to a device named Programmable Interrupt Controller (PIC) which is connected +to CPU's INTR pin. + +A PIC usually has a set of ports used to exchange information with the CPU. When a device +connected to one of the PIC's IRQ lines needs CPU attention the following flow happens: + + * device raises an interrupt on the corresponding IRQn pin + * PIC converts the IRQ into a vector number and writes it to a port for CPU to read + * PIC raises an interrupt on CPU INTR pin + * PIC waits for CPU to acknowledge an interrupt before raising another interrupt + * CPU acknowledges the interrupt then it starts handling the interrupt + +We will see later how the CPU handles the interrupt. Notice that by +design PIC won't raise another interrupt until the CPU has acknowledged +the current interrupt. + +.. note:: + + Once the interrupt is acknowledged by the CPU the interrupt + controller can request another interrupt, regardless of whether the CPU + has finished handling the previous interrupt or not. Thus, depending on + how the OS controls the CPU it is possible to have nested + interrupts. + +The interrupt controller allows each IRQ line to be individually +disabled. This allows simplifying design by making sure that interrupt +handlers are always executed serially. + +Interrupt controllers in SMP systems +------------------------------------ + +In SMP systems we may have multiple interrupt controllers in the +system.
+ +For example, on the x86 architecture each core has a local APIC used +to process interrupts from locally connected devices like timers or +thermal sensors. Then there is an I/O APIC which is used to distribute IRQs +from external devices to CPU cores. + +.. slide:: Interrupt controllers in SMP systems + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + + CPU0 CPU1 + +-------------+ +-------------+ + | | | | + | |local IRQs | |local IRQs + | +---------- | +---------- + | | | | + | local APIC | | local APIC | + | | LINT0, LINT1 | | LINT0, LINT1 + | +------------- | +------------- + | | | | + +-------+-----+ +------+------+ + | | + | | + | | + +-------+--------------------------------+------+ + | | + | Interrupt Controller Communication BUS | + +----------------------+------------------------+ + | + | + +--------+--------+ + | | + | I/O APIC | + | | + +--------+--------+ + | + | + | + External interrupts + + + +Interrupt Control +----------------- + +In order to synchronize access to shared data between the interrupt handler +and other potential concurrent activities such as driver initialization or +driver data processing, it is often required to enable and disable interrupts in +a controlled fashion. + +This can be accomplished at several levels: + +.. slide:: Enabling/disabling the interrupts + :inline-contents: True + :level: 2 + + * at the device level + + * by programming the device control registers + + * at the PIC level + + * PIC can be programmed to disable a given IRQ line + + * at the CPU level; for example, on x86 one can use the following + instructions: + + * cli (CLear Interrupt flag) + * sti (SeT Interrupt flag) + + +Interrupt priorities +--------------------- + +Most architectures also support interrupt priorities. When this is +enabled, it permits interrupt nesting only for those interrupts that +have a higher priority than the current priority level. + +.. slide:: Interrupt priorities + :inline-contents: True + :level: 2 + + |_| + + ..
ditaa:: + + Process + context + | + v + IRQ10 | irq10 handler + -----------------------------> +-------------+ + | + IRQ20 (lower priority) | + -----------------------------> pending v + | + IRQ5 (higher priority) | irq5 handler + -----------------------------> +-------->---------+ + | + v + | + +--------<---------+ + | + v + | + -------<-------+ + irq20 handler + Pending IRQ20 ------->-------+ + | + v + | + +--------------+ + | + v + + +.. note:: + + Not all architectures support interrupt priorities. It is also + difficult to define a generic scheme for interrupt + priorities for general use OSes and some kernels (Linux included) + do not use interrupt priorities. On the other hand most RTOSes use + interrupt priorities since they are typically used in more + constrained use-cases where it is easier to define interrupt + priorities. + + +Quiz: hardware concepts +----------------------- + +.. slide:: Quiz: hardware concepts + :inline-contents: True + :level: 2 + + Which of the following statements are true? + + * The CPU can start processing a new interrupt before the current + one is finished + + * Interrupts can be disabled at the device level + + * Lower priority interrupts can not preempt handlers for higher + priority interrupts + + * Interrupts can be disabled at the interrupt controller level + + * On SMP systems the same interrupt can be routed to different CPUs + + * Interrupts can be disabled at the CPU level + + +Interrupt handling on the x86 architecture +========================================== + +This section will examine how interrupts are handled by the CPU on the +x86 architecture. + +Interrupt Descriptor Table +-------------------------- + +The interrupt descriptor table (IDT) associates each interrupt or exception +identifier with a descriptor for the instructions that service the associated +event. We will name the identifier as vector number and the associated +instructions as interrupt/exception handler.
+ +An IDT has the following characteristics: + +.. slide:: Interrupt Descriptor Table + :inline-contents: True + :level: 2 + + * it is used as a jump table by the CPU when a given vector is triggered + * it is an array of 256 x 8 bytes entries + * may reside anywhere in physical memory + * processor locates IDT by the means of IDTR + +Below we can find Linux IRQ vector layout. The first 32 entries are reserved +for exceptions, vector 128 is used for syscall interface and the rest are +used mostly for hardware interrupts handlers. + +.. slide:: Linux IRQ vector layout + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + arch/x86/include/asm/irq_vectors.h + +------+ + | 0 | 0..31, system traps and exceptions + +------+ + | 1 | + +------+ + | | + +------+ + | | + | | + | | + +------+ + | 32 | 32..127, device interrupts + +------+ + | | + | | + | | + +------+ + | 128 | int80 syscall interface + +------+ + | 129 | 129..255, other interrupts + +------+ + | | + | | + | | + +------+ + | 255 | + +------+ + +On x86 an IDT entry has 8 bytes and it is named gate. There can be 3 types of gates: + + * interrupt gate, holds the address of an interrupt or exception handler. + Jumping to the handler disables maskable interrupts (IF flag is cleared). + * trap gates, similar to an interrupt gate but it does not disable maskable + interrupts while jumping to interrupt/exception handler. + * task gates (not used in Linux) + +Let's have a look at several fields of an IDT entry: + + * segment selector, index into GDT/LDT to find the start of the code segment where + the interrupt handlers reside + * offset, offset inside the code segment + * T, represents the type of gate + * DPL, minimum privilege required for using the segments content. + +.. slide:: Interrupt descriptor table entry (gate) + :inline-contents: True + :level: 2 + + |_| + + .. 
ditaa:: + + 63 47 42 32 + +------------------------------+---+---+----+---+---------------+ + | | | D | | | | + | offset (16..31 | P | P | | T | | + | | | L | | | | + +------------------------------+---+---+----+---+---------------+ + | | | + | segment selector | offset (0..15) | + | | | + +------------------------------+--------------------------------+ + 31 15 0 + + +Interrupt handler address +------------------------- + +In order to find the interrupt handler address we first need to find the start +address of the code segment where interrupt handler resides. For this we +use the segment selector to index into GDT/LDT where we can find the corresponding +segment descriptor. This will provide the start address kept in the 'base' field. +Using base address and the offset we can now go to the start of the interrupt handler. + + +.. slide:: Interrupt handler address + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + + Interrupt Descriptor + +----------------------------------------------+ + | | + | +------------------+ +--------+ +------+ | + | | segment selector | | offset| | PL | | + | +----+-------------+ +---+----+ +------+ | + | | | | + +----------------------------------------------+ + | | + | | + +-------------+ +----------------------------> +---------------+ + | ^ | ISR address | + | Segment Descriptor | +---------------+ + | +----------------------------------------------+ | + | | | | + +---->| +------------------+ +--------+ +------+ | | + | | base | | limit | | PL | | | + | +---------+--------+ +--------+ +------+ | | + | | | | + +----------------------------------------------+ | + | | + +--------------------------------------------+ + + +Stack of interrupt handler +-------------------------- + +Similar to control transfer to a normal function, a control transfer +to an interrupt or exception handler uses the stack to store the +information needed for returning to the interrupted code. 
+ +As can be seen in the figure below, an interrupt pushes the EFLAGS register +before saving the address of the interrupted instruction. Certain types +of exceptions also cause an error code to be pushed on the stack to help +debug the exception. + + +.. slide:: Interrupt handler stack + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + + w/o privilege transition w/ privilege transition + + + +---------------------+ +---------------------+ + | | | | | + | | | OLD SS:ESP | OLD SS | NEW SS:ESP from TSS + | +---------------------+ +---------------------+ + | | | | | + | | OLD EFLAGS | | OLD ESP | + | +---------------------+ +---------------------+ + | | | | | + | | OLD CS | | OLD EFLAGS | + | +---------------------+ +---------------------+ + | | | | | + | | OLD EIP | | OLD CS | + | +---------------------+ +---------------------+ + | | | | | + | | (error code) | NEW SS:ESP | OLD EIP | + | +---------------------+ +---------------------+ + | | | | | + | | | | (error code) | NEW SS:ESP + | | | +---------------------+ + | | | | | + | | | | | + | | | | | + | | | | | + | | | | | + | | | | | + | | | | | + v +---------------------+ +---------------------+ + + +Handling an interrupt request +----------------------------- + +After an interrupt request has been generated the processor runs a sequence of +events that eventually end up with running the kernel interrupt handler: + + +.. 
slide:: Handling an interrupt request + :inline-contents: True + :level: 2 + + + * CPU checks the current privilege level + * if the privilege level needs to change + + * change stack with the one associated with new privilege + * save old stack information on the new stack + + * save EFLAGS, CS, EIP on stack + * save error code on stack in case of an abort + * execute the kernel interrupt handler + +Returning from an interrupt handler +----------------------------------- + +Most architectures offer special instructions to clean up the stack and resume +the execution after the interrupt handler has been executed. On x86 IRET is used +to return from an interrupt handler. IRET is similar to RET except that IRET +increments ESP by an extra four bytes (because of the flags on stack) and moves the +saved flags into the EFLAGS register. + +To resume the execution after an interrupt the following sequence is used (x86): + +.. slide:: Returning from an interrupt + :inline-contents: True + :level: 2 + + * pop the error code (in case of an abort) + * call IRET + + * pops values from the stack and restores the following registers: CS, EIP, EFLAGS + * if privilege level changed returns to the old stack and old privilege level + +Inspecting the x86 interrupt handling +------------------------------------- + +.. slide:: Inspecting the x86 interrupt handling + :inline-contents: True + :level: 2 + + |_| + + .. asciicast:: ../res/intr_x86.cast + + +Quiz: x86 interrupt handling +---------------------------- + +.. slide:: Quiz: x86 interrupt handling + :inline-contents: True + :level: 2 + + The following gdb commands are used to determine the handler for + the int80 based system call exception. Select and arrange the + commands or output of the commands in the correct order. + + ..
code-block:: gdb + + (void *) 0xc15de780 + + set $idtr_addr=($idtr_entry>>48<<16)|($idtr_entry&0xffff) + + print (void*)$idtr_addr + + set $idtr = 0xff800000 + + (void *) 0xc15de874 + + set $idtr = 0xff801000 + + set $idtr_entry = *(uint64_t*)($idtr + 8 * 128) + + monitor info registers + +Interrupt handling in Linux +=========================== + +In Linux the interrupt handling is done in three phases: critical, immediate and +deferred. + +In the first phase the kernel will run the generic interrupt handler that +determines the interrupt number, the interrupt handler for this particular +interrupt and the interrupt controller. At this point any timing critical +actions will also be performed (e.g. acknowledge the interrupt at the interrupt +controller level). Local processor interrupts are disabled for the duration of +this phase and continue to be disabled in the next phase. + +In the second phase, all of the device driver's handlers associated with this +interrupt will be executed. At the end of this phase, the interrupt controller's +"end of interrupt" method is called to allow the interrupt controller to +reassert this interrupt. The local processor interrupts are enabled at this +point. + +.. note:: + + It is possible that one interrupt is associated with multiple + devices and in this case it is said that the interrupt is + shared. Usually, when using shared interrupts it is the + responsibility of the device driver to determine if the interrupt + is targeted at its device or not. + +Finally, in the last phase of interrupt handling, interrupt context deferrable +actions will be run. These are also sometimes known as the "bottom half" of the +interrupt (the upper half being the part of the interrupt handling that runs +with interrupts disabled). At this point, interrupts are enabled on the local +processor. + +.. slide:: Interrupt handling in Linux + :inline-contents: True + :level: 2 + + |_| + + ..
ditaa:: + + + phase 1 + +----------------+ + | critical | phase 2 + +----------------+ +-----------------+ + | | | immediate | phase 3 + | - IRQ disabled | +-----------------+ +----------------+ + | - ACK IRQ +-----+ | | | deferred | + | | +---> - IRQ disabled | +----------------+ + +----------------+ | - device handler| | | + | - EOI IRQ +-----+ | - IRQ enabled | + +-----------------+ +----> - execute later| + | | + +----------------+ + + +Nested interrupts and exceptions +-------------------------------- + +Linux used to support nested interrupts but this was removed some time +ago in order to avoid increasingly complex solutions to stack +overflow issues - allow just one level of nesting, allow multiple +levels of nesting up to a certain kernel stack depth, etc. + +However, it is still possible to have nesting between exceptions and +interrupts but the rules are fairly restrictive: + +.. slide:: IRQ and exception nesting in Linux + :inline-contents: True + :level: 2 + + * an exception (e.g. page fault, system call) can not preempt an interrupt; + if that occurs it is considered a bug + + * an interrupt can preempt an exception + + * an interrupt can not preempt another interrupt (it used to be possible) + + +The diagram below shows the possible nesting scenarios: + +.. slide:: Interrupt/Exception nesting + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + + ^ + ^ + | | | | + | Syscall | IRQi| | + User Mode | Exception (e.g. page fault) | | | + | | | | + +------------------------------------+-----+-----------------+ + | iret| | iret^ IRQj| iret| + | | | | | | + Kernel Mode v-------+ ^-------+ ^------+ v-----+ v-----+ + | | | | + IRQi| iret| IRQj| iret| + v------+ v------+ + +Interrupt context +----------------- + +While an interrupt is handled (from the time the CPU jumps to the interrupt +handler until the interrupt handler returns - e.g. IRET is issued) it is said +that code runs in "interrupt context".
+ +Code that runs in interrupt context has the following properties: + +.. slide:: Interrupt context + :inline-contents: True + :level: 2 + + * it runs as a result of an IRQ (not of an exception) + * there is no well defined process context associated + * not allowed to trigger a context switch (no sleep, schedule, or user memory access) + +Deferrable actions +------------------ + +Deferrable actions are used to run callback functions at a later time. If +deferrable actions are scheduled from an interrupt handler, the associated callback +function will run after the interrupt handler has completed. + +There are two large categories of deferrable actions: those that run in +interrupt context and those that run in process context. + +The purpose of interrupt context deferrable actions is to avoid doing too much +work in the interrupt handler function. Running for too long with interrupts +disabled can have undesired effects such as increased latency or poor system +performance due to missing other interrupts (e.g. dropping network packets +because the CPU did not react in time to dequeue packets from the network +interface and the network card buffer is full). + +Deferrable actions have APIs to: **initialize** an instance, **activate** or +**schedule** the action and **mask/disable** and **unmask/enable** the execution +of the callback function. The latter is used for synchronization purposes between +the callback function and other contexts. + +Typically the device driver will initialize the deferrable action +structure during the device instance initialization and will activate +/ schedule the deferrable action from the interrupt handler. + +.. 
slide:: Deferrable actions + :inline-contents: False + :level: 2 + + + * Schedule callback functions to run at a later time + + * Interrupt context deferrable actions + + * Process context deferrable actions + + * APIs for initialization, scheduling, and masking + +Soft IRQs +--------- + +Soft IRQs is the term used for the low-level mechanism that implements deferring +work from interrupt handlers but that still runs in interrupt context. + +.. slide:: Soft IRQs + :inline-contents: True + :level: 2 + + Soft IRQ APIs: + + * initialize: :c:func:`open_softirq` + * activation: :c:func:`raise_softirq` + * masking: :c:func:`local_bh_disable`, :c:func:`local_bh_enable` + + Once activated, the callback function :c:func:`do_softirq` runs either: + + * after an interrupt handler or + * from the ksoftirqd kernel thread + + +Since softirqs can reschedule themselves or other interrupts can occur that +reschedule them, they can potentially lead to (temporary) process starvation if +checks are not put into place. Currently, the Linux kernel does not allow +running soft irqs for more than :c:macro:`MAX_SOFTIRQ_TIME` or rescheduling for +more than :c:macro:`MAX_SOFTIRQ_RESTART` consecutive times. + +Once these limits are reached a special kernel thread, **ksoftirqd**, is woken up +and all of the rest of pending soft irqs will be run from the context of this +kernel thread. + +.. slide:: ksoftirqd + :inline-contents: False + :level: 2 + + * minimum priority kernel thread + * runs softirqs after certain limits are reached + * tries to achieve good latency and avoid process starvation + +Soft irqs usage is restricted; they are used by a handful of subsystems that have +low latency requirements and high frequency: + +.. slide:: Types of soft IRQs + :inline-contents: True + :level: 2 + + .. code-block:: c + + /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high + frequency threaded job scheduling. For almost all the purposes + tasklets are more than enough. F.e. 
all serial device BHs et + al. should be converted to tasklets, not to softirqs. + */ + + enum + { + HI_SOFTIRQ=0, + TIMER_SOFTIRQ, + NET_TX_SOFTIRQ, + NET_RX_SOFTIRQ, + BLOCK_SOFTIRQ, + IRQ_POLL_SOFTIRQ, + TASKLET_SOFTIRQ, + SCHED_SOFTIRQ, + HRTIMER_SOFTIRQ, + RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ + + NR_SOFTIRQS + }; + + +Packet flood example +--------------------- + +The following screencast will look at what happens when we flood the +system with a large number of packets. Since at least a part of the +packet processing is happening in softirq we should expect the CPU to +spend most of the time running softirqs but the majority of that +should be in the context of the `ksoftirqd` thread. + +.. slide:: Packet flood example + :inline-contents: True + :level: 2 + + |_| + + .. asciicast:: ../res/ksoftirqd-packet-flood.cast + + +Tasklets +-------- + +.. slide:: Tasklets + :inline-contents: True + :level: 2 + + Tasklets are a dynamic type (not limited to a fixed number) of + deferred work running in interrupt context. + + Tasklets API: + + * initialization: :c:func:`tasklet_init` + * activation: :c:func:`tasklet_schedule` + * masking: :c:func:`tasklet_disable`, :c:func:`tasklet_enable` + + Tasklets are implemented on top of two dedicated softirqs: + :c:macro:`TASKLET_SOFTIRQ` and :c:macro:`HI_SOFTIRQ` + + Tasklets are also serialized, i.e. the same tasklet can only execute on one processor at a time. + + +Workqueues +---------- + + .. slide:: Workqueues + :inline-contents: True + :level: 2 + + Workqueues are a type of deferred work that runs in process context. + + They are implemented on top of kernel threads. + + Workqueues API: + + * init: :c:macro:`INIT_WORK` + * activation: :c:func:`schedule_work` + +Timers +------ + +.. 
slide:: Timers + :inline-contents: True + :level: 2 + + Timers are implemented on top of the :c:macro:`TIMER_SOFTIRQ` + + Timer API: + + * initialization: :c:func:`setup_timer` + * activation: :c:func:`mod_timer` + +Deferrable actions summary +-------------------------- + +Here is a cheat sheet which summarizes Linux deferrable actions: + + +.. slide:: Deferrable actions summary + :inline-contents: True + :level: 2 + + * softIRQ + + * runs in interrupt context + * statically allocated + * same handler may run in parallel on multiple cores + + * tasklet + + * runs in interrupt context + * can be dynamically allocated + * same handler runs are serialized + + * workqueues + + * run in process context + +Quiz: Linux interrupt handling +------------------------------ + +.. slide:: Quiz: Linux interrupt handling + :inline-contents: True + :level: 2 + + Which of the following phases of interrupt handling runs with + interrupts disabled at the CPU level? + + * Critical + + * Immediate + + * Deferred diff --git a/refs/pull/405/merge/_sources/lectures/intro.rst.txt b/refs/pull/405/merge/_sources/lectures/intro.rst.txt new file mode 100644 index 00000000..7d336d39 --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/intro.rst.txt @@ -0,0 +1,1166 @@ +============ +Introduction +============ + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives: +=================== + +.. slide:: Introduction + :inline-contents: True + :level: 2 + + * Basic operating systems terms and concepts + + * Overview of the Linux kernel + + +Basic operating systems terms and concepts +========================================== + +User vs Kernel +-------------- + +.. slide:: User vs Kernel + :level: 2 + + * Execution modes + + * Kernel mode + + * User mode + + * Memory protection + + * Kernel-space + + * User-space + + +Kernel and user are two terms that are often used in operating +systems. 
Their definition is pretty straightforward: The kernel is +the part of the operating system that runs with higher privileges +while user (space) usually refers to applications running with low +privileges. + +However these terms are heavily overloaded and might have very +specific meanings in some contexts. + +User mode and kernel mode are terms that may refer specifically to the +processor execution mode. Code that runs in kernel mode can fully +[#hypervisor]_ control the CPU while code that runs in user mode has +certain limitations. For example, local CPU interrupts can only be +disabled or enabled while running in kernel mode. If such an operation +is attempted while running in user mode an exception will be generated +and the kernel will take over to handle it. + +.. [#hypervisor] some processors may have even higher privileges than + kernel mode, e.g. a hypervisor mode, that is only + accessible to code running in a hypervisor (virtual + machine monitor) + +User space and kernel space may refer specifically to memory +protection or to virtual address spaces associated with either the +kernel or user applications. + +Grossly simplifying, the kernel space is the memory area that is +reserved to the kernel while user space is the memory area reserved to +a particular user process. The kernel space is access protected so +that user applications can not access it directly, while user space +can be directly accessed from code running in kernel mode. + + +Typical operating system architecture +------------------------------------- + +In the typical operating system architecture (see the figure below) +the operating system kernel is responsible for accessing and sharing the +hardware in a secure and fair manner with multiple applications. + +.. slide:: Typical operating system architecture + :level: 2 + :inline-contents: True + + .. ditaa:: + + +---------------+ +--------------+ +---------------+ -\ + | Application 1 | | Application2 | ... 
| Application n | | + +---------------+ +--------------+ +---------------+ |> User space + | | | | + v v v -/ + +--------------------------------------------------------+ -\ + | System Call Interface | | + +--------------------------------------------------------+ | + | | | | + v v v |> Kernel space + +--------------------------------------------------------+ | + | Kernel | | + +--------------------------------------------------------+ | + | Device drivers | | + +--------------------------------------------------------+ -/ + | | | -\ + v v v |> Hardware + -/ + + + +The kernel offers a set of APIs that applications issue which are +generally referred to as "System Calls". These APIs are different from +regular library APIs because they are the boundary at which the +execution mode switch from user mode to kernel mode. + +In order to provide application compatibility, system calls are rarely +changed. Linux particularly enforces this (as opposed to in kernel +APIs that can change as needed). + +The kernel code itself can be logically separated in core kernel +code and device drivers code. Device drivers code is responsible of +accessing particular devices while the core kernel code is +generic. The core kernel can be further divided into multiple logical +subsystems (e.g. file access, networking, process management, etc.) + + +Monolithic kernel +----------------- + +A monolithic kernel is one where there is no access protection between +the various kernel subsystems and where public functions can be +directly called between various subsystems. + + +.. slide:: Monolithic kernel + :level: 2 + :inline-contents: True + + .. 
ditaa:: + + +-----+ +-----+ +-----+ + | App | | App | | App | + +-----+ +-----+ +-----+ + | | | User + =--|-------=--------|--------=-------|-------------------=- + | | | Kernel + v v v + +--------------------------------------------------------+ + | System Call Interface | + +--------------------------------------------------------+ + | | + v v + +-----+ +-----+ + | |<---------------------------->| | Kernel + | |<---+ +------->| | functions + +--+--+ | | +-----+ + | | | ^ + | | +-----+ | | + |+------+---->| |<---+ | + || | +-----+ | + || | | + vv | v + +--++-+ | +-----+ + | | +------------------------>| | Device + | |<---------------------------->| | Drivers + +--+--+ +--+--+ + | | + v v + +--------------------------------------------------------+ + | Hardware | + +--------------------------------------------------------+ + + +However, most monolithic kernels do enforce a logical separation +between subsystems especially between the core kernel and device +drivers with relatively strict APIs (but not necessarily fixed in +stone) that must be used to access services offered by one subsystem +or device drivers. This, of course, depends on the particular kernel +implementation and the kernel's architecture. + + +Micro kernel +------------ + +A micro-kernel is one where large parts of the kernel are protected +from each-other, usually running as services in user space. Because +significant parts of the kernel are now running in user mode, the +remaining code that runs in kernel mode is significantly smaller, hence +micro-kernel term. + +.. slide:: Micro-kernel + :level: 2 + :inline-contents: True + + .. 
ditaa:: + + +-----+ +--------+ +---------+ +---------+ + | App | | File | | Network | | Display |<--+ + | | | Server | | Server | | Server |-+ | + +-----+ +--------+ +---------+ +---------+ | | + | ^ | | User + -|-|----------------------------------------=-|-|-------=- + | | | | Kernel + | | | | + | | | | + | | | | + | | Reply +----------------------------+ | | + | +--------| |----+ | + +--------->| Micro kernel |------+ + Request | (IPC, Memory, Scheduler) | + | | + +----------------------------+ + | + v + +--------------------------------------------------------+ + | Hardware | + +--------------------------------------------------------+ + + +In a micro-kernel architecture the kernel contains just enough code +that allows for message passing between different running +processes. Practically that means implement the scheduler and an IPC +mechanism in the kernel, as well as basic memory management to setup +the protection between applications and services. + +One of the advantages of this architecture is that the services are +isolated and hence bugs in one service won't impact other services. + +As such, if a service crashes we can just restart it without affecting +the whole system. However, in practice this is difficult to achieve +since restarting a service may affect all applications that depend on +that service (e.g. if the file server crashes all applications with +opened file descriptors would encounter errors when accessing them). + +This architecture imposes a modular approach to the kernel and offers +memory protection between services but at a cost of performance. What +is a simple function call between two services on monolithic kernels +now requires going through IPC and scheduling which will incur a +performance penalty [#minix-vs-linux]_. + +.. 
[#minix-vs-linux] https://lwn.net/Articles/220255/ + + +Micro-kernels vs monolithic kernels +----------------------------------- + +Advocates of micro-kernels often suggest that micro-kernels are +superior because of the modular design a micro-kernel +enforces. However, monolithic kernels can also be modular and there +are several approaches that modern monolithic kernels use toward this +goal: + +.. slide:: Monolithic kernels *can* be modular + :level: 2 + :inline-contents: True + + * Components can be enabled or disabled at compile time + + * Support of loadable kernel modules (at runtime) + + * Organize the kernel in logical, independent subsystems + + * Strict interfaces but with low performance overhead: macros, + inline functions, function pointers + + +There is a class of operating systems that (used to) claim to be +hybrid kernels, in between monolithic and micro-kernels (e.g. Windows, +Mac OS X). However, since all of the typical monolithic services run +in kernel-mode in these operating systems, there is little merit to +qualify them as anything other than monolithic kernels. + +.. slide:: "Hybrid" kernels + :level: 2 + :inline-contents: True + + Many operating systems and kernel experts have dismissed the label + as meaningless, and just marketing. Linus Torvalds said of this + issue: + + "As to the whole 'hybrid kernel' thing - it's just marketing. It's + 'oh, those microkernels had good PR, how can we try to get good PR + for our working kernel? Oh, I know, let's use a cool name and try + to imply that it has all the PR advantages that that other system + has'." + + +Address space +------------- + +.. slide:: Address space + :level: 2 + + * Physical address space + + * RAM and peripheral memory + + * Virtual address space + + * How the CPU sees the memory (when in protected / paging mode) + + * Process address space + + * Kernel address space + + +The address space term is an overloaded term that can have different +meanings in different contexts. 
+ +The physical address space refers to the way the RAM and device +memories are visible on the memory bus. For example, on 32bit Intel +architecture, it is common to have the RAM mapped into the lower +physical address space while the graphics card memory is mapped high +in the physical address space. + +The virtual address space (or sometimes just address space) refers to +the way the CPU sees the memory when the virtual memory module is +activated (sometimes called protected mode or paging enabled). The +kernel is responsible for setting up a mapping that creates a virtual +address space in which areas of this space are mapped to certain +physical memory areas. + +Related to the virtual address space there are two other terms that +are often used: process (address) space and kernel (address) space. + +The process space is (part of) the virtual address space associated +with a process. It is the "memory view" of processes. It is a +contiguous area that starts at zero. Where the process's address space +ends depends on the implementation and architecture. + +The kernel space is the "memory view" of the code that runs in kernel +mode. + + +User and kernel sharing the virtual address space +------------------------------------------------- + +A typical implementation for user and kernel spaces is one where the +virtual address space is shared between user processes and the kernel. + +In this case kernel space is located at the top of the address space, +while user space at the bottom. In order to prevent the user processes +from accessing kernel space, the kernel creates mappings that prevent +access to the kernel space from user mode. + +.. slide:: User and kernel sharing the virtual address space + :level: 2 + :inline-contents: True + + .. 
ditaa:: + + +-------------------+ ^ + 0xFFFFFFFF | | | + | | | Kernel space + | | | + +-------------------+ v + 0xC0000000 | | ^ + | | | User space + | | | + | | | + | | | + | | | + | | | + | | | + | | | + 0x00000000 +-------------------+ v + + 32bit Virtual Address Space + +Execution contexts +------------------ + +.. slide:: Execution contexts + :level: 2 + + * Process context + + * Code that runs in user mode, part of a process + + * Code that runs in kernel mode, as a result of a system call + issued by a process + + * Interrupt context + + * Code that runs as a result of an interrupt + + * Always runs in kernel mode + + +One of the most important jobs of the kernel is to service interrupts +and to service them efficiently. This is so important that a special +execution context is associated with it. + +The kernel executes in interrupt context when it runs as a result of +an interrupt. This includes the interrupt handler, but it is not +limited to it, there are other special (software) constructs that run +in interrupt mode. + +Code running in interrupt context always runs in kernel mode and there +are certain limitations that the kernel programmer has to be aware of +(e.g. not calling blocking functions or accessing user space). + +Opposed to interrupt context there is process context. Code that runs +in process context can do so in user mode (executing application code) +or in kernel mode (executing a system call). + + +Multi-tasking +------------- + +.. slide:: Multi-tasking + :level: 2 + + * An OS that supports the "simultaneous" execution of multiple processes + + * Implemented by fast switching between running processes to allow + the user to interact with each program + + * Implementation: + + * Cooperative + + * Preemptive + +Multitasking is the ability of the operating system to +"simultaneously" execute multiple programs. It does so by quickly +switching between running processes. 
+ +Cooperative multitasking requires the programs to cooperate to achieve +multitasking. A program will run and relinquish CPU control back +to the OS, which will then schedule another program. + +With preemptive multitasking the kernel will enforce strict limits for +each process, so that all processes have a fair chance of +running. Each process is allowed to run a time slice (e.g. 100ms) +after which, if it is still running, it is forcefully preempted and +another task is scheduled. + +Preemptive kernel +----------------- + +.. slide:: Preemptive kernel + :level: 2 + :inline-contents: True + + Preemptive multitasking and preemptive kernels are different terms. + + A kernel is preemptive if a process can be preempted while running + in kernel mode. + + However, note that non-preemptive kernels may support preemptive + multitasking. + + +Pageable kernel memory +---------------------- + +.. slide:: Pageable kernel memory + :level: 2 + :inline-contents: True + + A kernel supports pageable kernel memory if parts of kernel memory + (code, data, stack or dynamically allocated memory) can be swapped + to disk. + +Kernel stack +------------ + +.. slide:: Kernel stack + :level: 2 + :inline-contents: True + + Each process has a kernel stack that is used to maintain the + function call chain and local variables state while it is executing + in kernel mode, as a result of a system call. + + The kernel stack is small (4KB - 12 KB) so the kernel developer has + to avoid allocating large structures on stack or recursive calls + that are not properly bounded. + +Portability +----------- + +In order to increase portability across various architectures and +hardware configurations, modern kernels are organized as follows at the +top level: + +.. 
slide:: Portability + :level: 2 + :inline-contents: True + + * Architecture and machine specific code (C & ASM) + + * Independent architecture code (C): + + * kernel core (further split in multiple subsystems) + + * device drivers + +This makes it easier to reuse code as much as possible between +different architectures and machine configurations. + + +Asymmetric MultiProcessing (ASMP) +--------------------------------- + +Asymmetric MultiProcessing (ASMP) is a way of supporting multiple +processors (cores) by a kernel, where a processor is dedicated to the +kernel and all other processors run user space programs. + +The disadvantage of this approach is that the kernel throughput +(e.g. system calls, interrupt handling, etc.) does not scale with the +number of processors and hence typical processes frequently use system +calls. The scalability of the approach is limited to very specific +systems (e.g. scientific applications). + + +.. slide:: Asymmetric MultiProcessing (ASMP) + :level: 2 + :inline-contents: True + + .. ditaa:: + + +-----------+ + | | + +------------------>| Memory |<-----------------+ + | | | | + | +-----------+ | + | ^ | + | | | + v v v + +--------------+ +---------------+ +---------------+ + | | | | | | + | Processor A | | Processor B | | Processor C | + | | | | | | + | | | +-----------+ | | +-----------+ | + | | | | Process 1 | | | | Process 1 | | + | | | +-----------+ | | +-----------+ | + | | | | | | + | +----------+ | | +-----------+ | | +-----------+ | + | | kernel | | | | Process 2 | | | | Process 2 | | + | +----------+ | | +-----------+ | | +-----------+ | + | | | | | | + | | | +-----------+ | | +-----------+ | + | | | | Process 3 | | | | Process 3 | | + | | | +-----------+ | | +-----------+ | + +--------------+ +---------------+ +---------------+ + + +Symmetric MultiProcessing (SMP) +------------------------------- + +As opposed to ASMP, in SMP mode the kernel can run on any of the +existing processors, just as user processes. 
This approach is more +difficult to implement, because it creates race conditions in the +kernel if two processes run kernel functions that access the same +memory locations. + +In order to support SMP the kernel must implement synchronization +primitives (e.g. spin locks) to guarantee that only one processor is +executing a critical section. + +.. slide:: Symmetric MultiProcessing (SMP) + :level: 2 + :inline-contents: True + + .. ditaa:: + + +-----------+ + | | + +------------------->| Memory |<------------------+ + | | | | + | +-----------+ | + | ^ | + | | | + v v v + +---------------+ +---------------+ +---------------+ + | | | | | | + | Processor A | | Processor B | | Processor C | + | | | | | | + | +-----------+ | | +-----------+ | | +-----------+ | + | | Process 1 | | | | Process 1 | | | | Process 1 | | + | +-----------+ | | +-----------+ | | +-----------+ | + | | | | | | + | +-----------+ | | +-----------+ | | +-----------+ | + | | Process 2 | | | | Process 2 | | | | Process 2 | | + | +-----------+ | | +-----------+ | | +-----------+ | + | | | | | | + | +-----------+ | | +-----------+ | | +-----------+ | + | | kernel | | | | kernel | | | | kernel | | + | +-----------+ | | +-----------+ | | +-----------+ | + +---------------+ +---------------+ +---------------+ + + +CPU Scalability +--------------- + +CPU scalability refers to how well the performance scales with +the number of cores. There are a few things that the kernel developer +should keep in mind with regard to CPU scalability: + +.. slide:: CPU Scalability + :level: 2 + :inline-contents: True + + * Use lock free algorithms when possible + + * Use fine grained locking for high contention areas + + * Pay attention to algorithm complexity + + +Overview of the Linux kernel +============================ + + +Linux development model +----------------------- + +.. 
slide:: Linux development model + :level: 2 + + * Open source, GPLv2 License + + * Contributors: companies, academia and independent developers + + * Development cycle: 3 – 4 months which consists of a 1 - 2 week + merge window followed by bug fixing + + * Features are only allowed in the merge window + + * After the merge window a release candidate is done on a weekly + basis (rc1, rc2, etc.) + +The Linux kernel is one of the largest open source projects in the world +with thousands of developers contributing code and millions of lines of +code changed for each release. + +It is distributed under the GPLv2 license, which, simply put, +requires that any modification of the kernel done on software that is +shipped to customers should be made available to them (the customers), +although in practice most companies make the source code publicly +available. + +There are many companies (often competing) that contribute code to the +Linux kernel as well as people from academia and independent +developers. + +The current development model is based on doing releases at fixed +intervals of time (usually 3 - 4 months). New features are merged into +the kernel during a one or two week merge window. After the merge +window, a release candidate is done on a weekly basis (rc1, rc2, etc.) + + +Maintainer hierarchy +-------------------- + +In order to scale the development process, Linux uses a hierarchical +maintainership model: + +.. 
slide:: Maintainer hierarchy + :level: 2 + :inline-contents: True + + * Linus Torvalds is the maintainer of the Linux kernel and merges pull + requests from subsystem maintainers + + * Each subsystem has one or more maintainers that accept patches or + pull requests from developers or device driver maintainers + + * Each maintainer has its own git tree, e.g.: + + * Linus Torvalds: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + + * David Miller (networking): git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/ + + * Each subsystem may maintain a -next tree where developers can submit + patches for the next merge window + +Since the merge window is only a maximum of two weeks, most of the +maintainers have a -next tree where they accept new features from +developers or maintainers downstream even when the merge window +is closed. + +Note that bug fixes are accepted even outside the merge window in the +maintainer's tree from where they are periodically pulled by the +upstream maintainer, for every release candidate. + + + +Linux source code layout +------------------------- + +.. slide:: Linux source code layout + :level: 2 + :inline-contents: True + + .. 
ditaa:: + + +-------+ + | linux | + +-+-----+ + | + +------+--------+---------+---------+--------------+--------------+ + | | | | | | | + | v v v v v v + | +------+ +-------+ +-------+ +--------+ +---------------+ +---------+ + | | arch | | block | | certs | | crypto | | Documentation | | drivers | + | +------+ +-------+ +-------+ +--------+ +---------------+ +---------+ + | + +-------+----------+--------+---------+--------+--------+---------+ + | | | | | | | | + | v v v v v v v + | +----------+ +----+ +---------+ +------+ +-----+ +--------+ +-----+ + | | firmware | | fs | | include | | init | | ipc | | kernel | | lib | + | +----------+ +----+ +---------+ +------+ +-----+ +--------+ +-----+ + | + +-----+------+---------+------------+------------+------------+ + | | | | | | | + | v v v v v v + | +----+ +-----+ +---------+ +---------+ +----------+ +-------+ + | | mm | | net | | samples | | scripts | | security | | sound | + | +----+ +-----+ +---------+ +---------+ +----------+ +-------+ + | + +------+--------+--------+ + | | | + v v v + +-------+ +-----+ +------+ + | tools | | usr | | virt | + +-------+ +-----+ +------+ + + +These are the top level of the Linux source code folders: + +* arch - contains architecture specific code; each architecture is + implemented in a specific sub-folder (e.g. 
arm, arm64, x86) + +* block - contains the block subsystem code that deals with reading + and writing data from block devices: creating block I/O requests, + scheduling them (there are several I/O schedulers available), + merging requests, and passing them down through the I/O stack to the + block device drivers + +* certs - implements support for signature checking using certificates + +* crypto - software implementation of various cryptography algorithms + as well as a framework that allows offloading such algorithms in + hardware + +* Documentation - documentation for various subsystems, Linux kernel + command line options, description for sysfs files and format, device + tree bindings (supported device tree nodes and format) + +* drivers - driver for various devices as well as the Linux driver + model implementation (an abstraction that describes drivers, devices + buses and the way they are connected) + +* firmware - binary or hex firmware files that are used by various + device drivers + +* fs - home of the Virtual Filesystem Switch (generic filesystem code) + and of various filesystem drivers + +* include - header files + +* init - the generic (as opposed to architecture specific) + initialization code that runs during boot + +* ipc - implementation for various Inter Process Communication system + calls such as message queue, semaphores, shared memory + +* kernel - process management code (including support for kernel + thread, workqueues), scheduler, tracing, time management, generic + irq code, locking + +* lib - various generic functions such as sorting, checksums, + compression and decompression, bitmap manipulation, etc. + +* mm - memory management code, for both physical and virtual memory, + including the page, SL*B and CMA allocators, swapping, virtual memory + mapping, process address space manipulation, etc. 
+ +* net - implementation for various network stacks including IPv4 and + IPv6; BSD socket implementation, routing, filtering, packet + scheduling, bridging, etc. + +* samples - various driver samples + +* scripts - parts of the build system, scripts used for building modules, + kconfig the Linux kernel configurator, as well as various other + scripts (e.g. checkpatch.pl that checks if a patch conforms to + the Linux kernel coding style) + +* security - home of the Linux Security Module framework that allows + extending the default (Unix) security model as well as + implementation for multiple such extensions such as SELinux, smack, + apparmor, tomoyo, etc. + +* sound - home of ALSA (Advanced Linux Sound Architecture) as well as the + old Linux sound framework (OSS) + +* tools - various user space tools for testing or interacting with + Linux kernel subsystems + +* usr - support for embedding an initrd file in the kernel image + +* virt - home of the KVM (Kernel-based Virtual Machine) hypervisor + + +Linux kernel architecture +------------------------- + +.. slide:: Linux kernel architecture + :level: 2 + :inline-contents: True + + .. ditaa:: + :height: 100% + + +---------------+ +--------------+ +---------------+ + | Application 1 | | Application2 | ... 
| Application n | + +---------------+ +--------------+ +---------------+ + | | | + v v v + +--------------------------------------------------------+ + | Kernel | + | | + | +----------------------+ +-------------------+ | + | | Process Management | | Memory Management | | + | +----------------------+ +-------------------+ | + | | + | +------------+ +------------+ +------------+ | + | | Block I/O | | VFS | | Networking | | + | +------------+ +------------+ +------------+ | + | | + | +------------+ +------------+ +------------+ | + | | IPC | | Security | | Crypto | | + | +------------+ +------------+ +------------+ | + | | + | +------------+ +------------+ +------------+ | + | | DRM | | ALSA | | USB | | + | +------------+ +------------+ +------------+ | + | ... | + +--------------------------------------+-----------------+ + | Device drivers | arch | + | | | + | +----+ +-----+ +--------+ +----+ | +----------+ | + | |char| |block| |ethernet| |wifi| | | machine 1| | + | +----+ +-----+ +--------+ +----+ | +----------+ | + | +----------+ +-----+ +----+ +---+ | +----------+ | + | |filesystem| |input| |iio | |usb| | | machine 2| | + | +----------+ +-----+ +----+ +---+ | +----------+ | + | +-----------+ +----------+ +---+ | | + | |framebuffer| | platform | |drm| | ... | + | +-----------+ +----------+ +---+ | | + +-------------------------+----+-------+-----------------+ + | | | + v v v + + +--------------------------------------------------------+ + | Hardware | + +--------------------------------------------------------+ + + +arch +.... + +.. 
slide:: arch + :level: 2 + :inline-contents: True + + * Architecture specific code + + * May be further sub-divided in machine specific code + + * Interfacing with the boot loader and architecture specific + initialization + + * Access to various hardware bits that are architecture or machine + specific such as interrupt controller, SMP controllers, BUS + controllers, exceptions and interrupt setup, virtual memory handling + + * Architecture optimized functions (e.g. memcpy, string operations, + etc.) + +This part of the Linux kernel contains architecture specific code and +may be further sub-divided in machine specific code for certain +architectures (e.g. arm). + +"Linux was first developed for 32-bit x86-based PCs (386 or +higher). These days it also runs on (at least) the Compaq Alpha AXP, +Sun SPARC and UltraSPARC, Motorola 68000, PowerPC, PowerPC64, ARM, +Hitachi SuperH, IBM S/390, MIPS, HP PA-RISC, Intel IA-64, DEC VAX, AMD +x86-64 and CRIS architectures." + +It implements access to various hardware bits that are architecture or +machine specific such as interrupt controller, SMP controllers, BUS +controllers, exceptions and interrupt setup, virtual memory handling. + +It also implements architecture optimized functions (e.g. memcpy, +string operations, etc.) + + +Device drivers +.............. + +.. slide:: Device drivers + :level: 2 + + * Unified device model + + * Each subsystem has its own specific driver interfaces + + * Many device driver types (TTY, serial, SCSI, filesystem, ethernet, + USB, framebuffer, input, sound, etc.) + +The Linux kernel uses a unified device model whose purpose is to +maintain internal data structures that reflect the state and structure +of the system. Such information includes what devices are present, +what their status is, what bus they are attached to, to what driver +they are attached, etc. This information is essential for implementing +system wide power management, as well as device discovery and dynamic +device removal. 
+ +Each subsystem has its own specific driver interface that is tailored +to the devices it represents in order to make it easier to write +correct drivers and to reduce code duplication. + +Linux supports one of the most diverse sets of device driver types; +some examples are: TTY, serial, SCSI, filesystem, ethernet, USB, +framebuffer, input, sound, etc. + + +Process management +.................. + +.. slide:: Process management + :level: 2 + + * Unix basic process management and POSIX threads support + + * Processes and threads are abstracted as tasks + + * Operating system level virtualization + + * Namespaces + + * Control groups + +Linux implements the standard Unix process management APIs such as +fork(), exec(), wait(), as well as standard POSIX threads. + +However, Linux processes and threads are implemented quite +differently than in other kernels. There are no internal structures +implementing processes or threads; instead there is a :c:type:`struct +task_struct` that describes an abstract scheduling unit called a task. + +A task has pointers to resources, such as address space, file +descriptors, IPC ids, etc. The resource pointers for tasks that are +part of the same process point to the same resources, while the resource +pointers of tasks of different processes point to different resources. + +This peculiarity, together with the `clone()` and `unshare()` system +calls, allows for implementing new features such as namespaces. + +Namespaces are used together with control groups (cgroup) to implement +operating system virtualization in Linux. + +cgroup is a mechanism to organize processes hierarchically and +distribute system resources along the hierarchy in a controlled and +configurable manner. + + +Memory management +................. + +Linux memory management is a complex subsystem that deals with: + +.. 
slide:: Memory management + :level: 2 + :inline-contents: True + + * Management of the physical memory: allocating and freeing memory + + * Management of the virtual memory: paging, swapping, demand + paging, copy on write + + * User services: user address space management (e.g. mmap(), brk(), + shared memory) + + * Kernel services: SL*B allocators, vmalloc + + + +Block I/O management +.................... + +The Linux Block I/O subsystem deals with reading and writing data from +or to block devices: creating block I/O requests, transforming block I/O +requests (e.g. for software RAID or LVM), merging and sorting the +requests and scheduling them via various I/O schedulers to the block +device drivers. + +.. slide:: Block I/O management + :level: 2 + :inline-contents: True + + .. ditaa:: + :height: 100% + + +---------------------------------+ + | Virtual Filesystem Switch | + +---------------------------------+ + ^ + | + v + +---------------------------------+ + | Device Mapper | + +---------------------------------+ + ^ + | + v + +---------------------------------+ + | Generic Block Layer | + +---------------------------------+ + ^ + | + v + +--------------------------------+ + | I/O scheduler | + +--------------------------------+ + ^ ^ + | | + v v + +--------------+ +--------------+ + | Block device | | Block device | + | driver | | driver | + +--------------+ +--------------+ + + +Virtual Filesystem Switch +......................... + +The Linux Virtual Filesystem Switch implements common / generic +filesystem code to reduce duplication in filesystem drivers. It +introduces certain filesystem abstractions such as: + +* inode - describes the file on disk (attributes, location of data + blocks on disk) + +* dentry - links an inode to a name + +* file - describes the properties of an opened file (e.g. file + pointer) + +* superblock - describes the properties of a formatted filesystem + (e.g. 
number of blocks, block size, location of root directory on + disk, encryption, etc.) + +.. slide:: Virtual Filesystem Switch + :level: 2 + :inline-contents: True + + .. ditaa:: + :height: 100% + + + ^ ^ ^ + | stat | open | read + v v v + +------------------------------------------------------------+ + | Virtual Filesystem Switch | + | | + | | + | /-------\ /--------\ /--------\ | + | | inode |<----------+ dentry |<----------+ FILE | | + | \---+---/ \----+---/ \---+----/ | + | | | | | + | | | | | + | v v v | + | +-------+ +--------+ +-------+ | + | | inode | | dentry | | page | | + | | cache | | cache | | cache | | + | +-------+ +--------+ +-------+ | + | | + +------------------------------------------------------------+ + ^ ^ + | | + v v + +-------------+ +-------------+ + | Filesystem | | Filesystem | + | driver | | driver | + +-------------+ +-------------+ + + +The Linux VFS also implements a complex caching mechanism which +includes the following: + +* the inode cache - caches the file attributes and internal file + metadata + +* the dentry cache - caches the directory hierarchy of a filesystem + +* the page cache - caches file data blocks in memory + + + +Networking stack +................ + +.. slide:: Networking stack + :level: 2 + :inline-contents: True + + .. 
ditaa:: + :height: 100% + + +---------------------------+ + | Berkeley Socket Interface | + +---------------------------+ + + +---------------------------+ + | Transport layer | + +-------------+-------------+ + | TCP | UDP | + +-------------+-------------+ + + +---------------------------+ + | Network layer | + +-----+---------+-----------+ + | IP | Routing | NetFilter | + +-----+---------+-----------+ + + +---------------------------+ + | Data link layer | + +-------+-------+-----------+ + | ETH | ARP | BRIDGING | + +-------+-------+-----------+ + + +---------------------------+ + | Queuing discipline | + +---------------------------+ + + +---------------------------+ + | Network device drivers | + +---------------------------+ + +Linux Security Modules +...................... + +.. slide:: Linux Security Modules + :level: 2 + :inline-contents: True + + * Hooks to extend the default Linux security model + + * Used by several Linux security extensions: + + * Security Enhanced Linux + + * AppArmor + + * Tomoyo + + * Smack diff --git a/refs/pull/405/merge/_sources/lectures/memory-management.rst.txt b/refs/pull/405/merge/_sources/lectures/memory-management.rst.txt new file mode 100644 index 00000000..b6401eec --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/memory-management.rst.txt @@ -0,0 +1,484 @@ +================= +Memory Management +================= + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives: +=================== + +.. slide:: Memory Management + :inline-contents: True + :level: 2 + + * Physical Memory Management + + * Page allocations + + * Small allocations + + * Virtual Memory Management + + * Page Fault Handling Overview + + +Physical Memory Management +========================== + +.. 
slide:: Physical Memory Management + :inline-contents: True + :level: 2 + + * Algorithms and data structures that keep track of physical memory + pages + + * Independent of virtual memory management + + * Both virtual and physical memory management are required for complete + memory management + + * Physical pages are being tracked using a special data structure: + :c:type:`struct page` + + * All physical pages have an entry reserved in the :c:data:`mem_map` + vector + + * The physical page status may include: a counter for how many + times a page is used, position in swap or file, buffers for this + page, position in the page cache, etc. + +Memory zones +------------ + +.. slide:: Memory zones + :inline-contents: True + :level: 2 + + * DMA zone + + * DMA32 zone + + * Normal zone (LowMem) + + * HighMem Zone + + * Movable Zone + + +Non-Uniform Memory Access +------------------------- + +.. slide:: Non-Uniform Memory Access + :inline-contents: True + :level: 2 + + * Physical memory is split between multiple nodes, one for each CPU + + * There is a single physical address space accessible from every node + + * Access to the local memory is faster + + * Each node maintains its own memory zones (e.g. DMA, NORMAL, HIGHMEM, etc.) + + +Page allocation +--------------- + +.. slide:: Page allocation + :inline-contents: True + :level: 2 + + + .. code-block:: c + + /* Allocates 2^order contiguous pages and returns a pointer to the + * descriptor for the first page + */ + struct page *alloc_pages(gfp_mask, order); + + /* allocates a single page */ + struct page *alloc_page(gfp_mask); + + + /* helper functions that return the kernel virtual address */ + void *__get_free_pages(gfp_mask, order); + void *__get_free_page(gfp_mask); + void *__get_zero_page(gfp_mask); + void *__get_dma_pages(gfp_mask, order); + + +.. slide:: Why only allocate pages in chunks of power of 2? 
+ :inline-contents: True + :level: 2 + + * Typical memory allocation algorithms have linear complexity + + * Why not use paging? + + * Sometimes we do need contiguous memory allocations (for DMA) + + * Allocation would require page table changes and TLB flushes + + * Not able to use extended pages + + * Some architectures directly (in hardware) linearly map a part + of the address space (e.g. MIPS) + + +.. slide:: The buddy algorithm + :inline-contents: True + :level: 2 + + * Free blocks are distributed in multiple lists + + * Each list contains blocks of the same size + + * The block size is a power of two + + +.. slide:: Allocating a block of size N + :inline-contents: True + :level: 2 + + * If there is a free block in the N-size list, pick the first + + * If not, look for a free block in the 2N-size list + + * Split the 2N-size block in two N-size blocks and add them to the + N-size list + + * Now that we have the N-size list populated, pick the first free + block from that list + + +.. slide:: Freeing a block of size N + :inline-contents: True + :level: 2 + + * If the "buddy" is free, coalesce into a 2N-size block + + * Repeat until no more free buddy blocks are found and place the + resulting block in the respective list + + +.. slide:: The Linux implementation + :inline-contents: True + :level: 2 + + * 11 lists for blocks of 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, + 1024 pages + + * Each memory zone has its own buddy allocator + + * Each zone has a vector of descriptors for free blocks, one entry + for each size + + * The descriptor contains the number of free blocks and the head of + the list + + * Blocks are linked in the list using the `lru` field of + :c:type:`struct page` + + * Free pages have the PG_buddy flag set + + * The page descriptor keeps a copy of the block size in the private + field to easily check if the "buddy" is free + + +Small allocations +----------------- + +.. 
slide:: Small allocations + :inline-contents: True + :level: 2 + + * Buddy is used to allocate pages + + * Many of the kernel subsystems need to allocate buffers smaller + than a page + + * Typical solution: variable size buffer allocation + + * Leads to external fragmentation + + * Alternative solution: fixed size buffer allocation + + * Leads to internal fragmentation + + * Compromise: fixed size block allocation with multiple sizes, geometrically distributed + + * e.g.: 32, 64, ..., 131056 + + +.. slide:: The SLAB allocator + :inline-contents: True + :level: 2 + + * Buffers = objects + + * Uses buddy to allocate a pool of pages for object allocations + + * Each object (optionally) has a constructor and destructor + + * Deallocated objects are cached - avoids subsequent calls for + constructors and buddy allocation / deallocation + +.. slide:: Why SLAB? + :inline-contents: True + :level: 2 + + * The kernel will typically allocate and deallocate multiple times + the same data structures over time (e.g. :c:type:`struct + task_struct`) effectively using fixed size allocations. Using the + SLAB reduces the frequency of the heavier + allocation/deallocation operations. + + * For variable size buffers (which occur less frequently) a + geometric distribution of caches with fixed-size can be used + + * Reduces the memory allocation foot-print since we are searching a + much smaller memory area, compared to buddy which can span over a + larger area + + * Employs cache optimization techniques (slab coloring) + + +.. slide:: Slab architecture + :inline-contents: True + :level: 2 + + .. image:: ../res/slab-overview.png + + +.. slide:: Cache descriptors + :inline-contents: True + :level: 2 + + * A name to identify the cache for stats + + * object constructor and destructor functions + + * size of the objects + + * Flags + + * Size of the slab in power of 2 pages + + * GFP masks + + * One or more slabs, grouped by state: full, partially full, empty + +.. 
slide:: SLAB descriptors + :inline-contents: True + :level: 2 + + * Number of objects + + * Memory region where the objects are stored + + * Pointer to the first free object + + * Descriptors are stored either: + + * in the SLAB itself (if the object size is lower than 512 or if + internal fragmentation leaves enough space for the SLAB + descriptor) + + * in generic caches internally used by the SLAB allocator + + +.. slide:: Slab detailed architecture + :inline-contents: True + :level: 2 + + .. image:: ../res/slab-detailed-arch.png + + +.. slide:: Generic vs specific caches + :inline-contents: True + :level: 2 + + * Generic caches are used internally by the slab allocator + + * allocating memory for cache and slab descriptors + + * They are also used to implement :c:func:`kmalloc` by implementing + 20 caches with object sizes geometrically distributed between + 32 bytes and 4 MB + + * Specific caches are created on demand by kernel subsystems + + +.. slide:: Object descriptors + :inline-contents: True + :level: 2 + + .. image:: ../res/slab-object-descriptors.png + +.. slide:: Object descriptors + :inline-contents: True + :level: 2 + + * Only used for free objects + + * An integer that points to the next free object + + * The last free object uses a terminator value + + * Internal descriptors - stored in the slab + + * External descriptors - stored in generic caches + + +.. slide:: SLAB coloring + :inline-contents: True + :level: 2 + + .. image:: ../res/slab-coloring.png + + +Virtual memory management +========================= + +.. 
slide:: Virtual memory management + :inline-contents: True + :level: 2 + + * Used in both kernel and user space + + * Using virtual memory requires: + + * reserving (allocating) a segment in the *virtual* address space + (be it kernel or user) + + * allocating one or more physical pages for the buffer + + * allocating one or more physical pages for page tables and + internal structures + + * mapping the virtual memory segment to the physical allocated + pages + +.. slide:: Address space descriptors + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + :--no-separation: + + +------------------+ +------------+ + | Address space | | |-------------->+------------+ + | descriptor | +------------+ | | + +------------------+ | | Page +------------+ + | +------------+ tables | | + +------------------+--------------+ | ... | +------------+ + | | +------------+ | ... | + v v | |-------+ +------------+ + +------------+ +------------+ +------------+ | | | + | Area | | Area | | +------------+ + | descriptor | | descriptor | | + +------------+ +------------+ | + | | + +-------------+------------------+ +------>+------------+ + | | | | + v v +------------+ + +------------+ +------------+ | | + | Area | | Area | +------------+ + | descriptor | | descriptor | | ... | + +------------+ +------------+ +------------+ + | | | + +-----------+-----------+ +------------+ + | | + v v + +------------+ +------------+ + | Area | | Area | + | descriptor | | descriptor | + +------------+ +------------+ + + +.. slide:: Address space descriptors + :inline-contents: True + :level: 2 + + * Page table is used either by: + + * The CPU's MMU + + * The kernel to handle TLB exception (some RISC processors) + + * The address space descriptor is used by the kernel to maintain + high level information such as file and file offset (for mmap + with files), read-only segment, copy-on-write segment, etc. + + +.. 
slide:: Allocating virtual memory + :inline-contents: True + :level: 2 + + * Search for a free area in the address space descriptor + + * Allocate memory for a new area descriptor + + * Insert the new area descriptor in the address space descriptor + + * Allocate physical memory for one or more page tables + + * Set up the page tables for the newly allocated area in the virtual + address space + + * Allocate (on demand) physical pages and map them in the virtual + address space by updating the page tables + + +.. slide:: Freeing virtual memory + :inline-contents: True + :level: 2 + + * Removing the area descriptor + + * Freeing the area descriptor memory + + * Updating the page tables to remove the area from the virtual + address space + + * Flushing the TLB for the freed virtual memory area + + * Freeing physical memory of the page tables associated with the + freed area + + * Freeing physical memory of the freed virtual memory area + + +.. slide:: Linux virtual memory management + :inline-contents: True + :level: 2 + + * Kernel + + * vmalloc + + * area descriptor: :c:type:`struct vm_struct` + + * address space descriptor: simple linked list of :c:type:`struct vm_struct` + + * Userspace + + * area descriptor: :c:type:`struct vm_area_struct` + + * address space descriptor: :c:type:`struct mm_struct`, red-black tree + + +Page fault handling +=================== + +.. slide:: Page fault handling + :inline-contents: True + :level: 2 + + .. image:: ../res/page-fault-handling.png diff --git a/refs/pull/405/merge/_sources/lectures/networking.rst.txt b/refs/pull/405/merge/_sources/lectures/networking.rst.txt new file mode 100644 index 00000000..8813d0ca --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/networking.rst.txt @@ -0,0 +1,594 @@ +================== +Network Management +================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives: +=================== + +.. 
slide:: Network Management + :inline-contents: True + :level: 2 + + * Socket implementation + + * Routing implementation + + * Network Device Interface + + * Hardware and Software Acceleration Techniques + + +Network Management Overview +=========================== + +.. slide:: Network Management Overview + :inline-contents: True + :level: 2 + + .. ditaa:: + :height: 100% + + +---------------------------+ + | Berkeley Socket Interface | + +---------------------------+ + + +---------------------------+ + | Transport layer | + +-------------+-------------+ + | TCP | UDP | + +-------------+-------------+ + + +---------------------------+ + | Network layer | + +-----+---------+-----------+ + | IP | Routing | NetFilter | + +-----+---------+-----------+ + + +---------------------------+ + | Data link layer | + +-------+-------+-----------+ + | ETH | ARP | BRIDGING | + +-------+-------+-----------+ + + +---------------------------+ + | Queuing discipline | + +---------------------------+ + + +---------------------------+ + | Network device drivers | + +---------------------------+ + + +Sockets Implementation Overview +=============================== + +.. slide:: Sockets Implementation Overview + :inline-contents: True + :level: 2 + + .. ditaa:: + :height: 100% + + Socket + File + +------+ Operations + | FILE | ----------------------> +-----------+ + +------+ | read | + | | struct socket_alloc +-----------+ + | | +---------------+ | write | + | +------->| struct socket | +-----------+ + | f_private| +-----------+ | | select | + | | | ... | | +-----------+ + | | +-----------+ | | ... | + | +---------------+ +-----------+ + +--------->| struct inode | + f_inode | +-----------+ | + | | ... | | + | +-----------+ | + +---------------+ + + +Sockets Families and Protocols +=============================== + +.. slide:: Sockets Families and Protocols + :inline-contents: True + :level: 2 + + .. 
ditaa:: + :height: 100% + + + + struct socket +---------> struct proto_ops + +--------------------+ | +-----------------+ + | struct socket | | | release | + | | | +-----------------+ + +--------------------+ | | bind | + | struct proto_ops * |--------+ +-----------------+ + +--------------------+ | connect | + | ... | +-----------------+ + +---------------+ | accept | + +---------| struct sock * |-------+ +-----------------+ + | +---------------+ | | sendmsg | + | | +-----------------+ + | | | recvmsg | + | | +-----------------+ + | | | poll | + | | +-----------------+ + | | | ... | + | | +-----------------+ + | | + v v +--> struct sk_prot + struct tcp_sock struct tcp_sock | +--------------------+ + +-------------------+ +-------------------+ | | inet_dgram_connect | + | struct inet_sock | | struct inet_sock | | +--------------------+ + | +---------------+ | | +---------------+ | | | inet_sendmsg | + | | struct sock | | | | struct sock | | | +--------------------+ + | | +-----------+ | | | | +-----------+ | | | | udp_poll | + | | | ... | | | | | | ... | | | | +--------------------+ + | | +-----------+ | | | | +-----------+ | | | | inet_release | + | +---------------+ | | +---------------+ | | +--------------------+ + | | sk_prot * | | | | sk_prot * | |--+ | inet_bind | + | +---------------+ | | +---------------+ | +--------------------+ + +-------------------+ +-------------------+ | ... | + | ... | | ... | +--------------------+ + +-------------------+ +-------------------+ + + +Example: UDP send +----------------- + +.. slide:: Example: UDP send + :inline-contents: True + :level: 2 + + + .. code-block:: c + + char c; + struct sockaddr_in addr; + int s; + + s = socket(AF_INET, SOCK_DGRAM, 0); + connect(s, (struct sockaddr*)&addr, sizeof(addr)); + write(s, &c, 1); + close(s); + + +.. slide:: Example: UDP send + :inline-contents: True + :level: 2 + + .. 
ditaa:: + + -:------------------------------------------------------------------------------------ + + VFS layer sys_write → vfs_write → do_sync_write → filp->f_op->aio_write + + -:------------------------------------------------------------------------------------ + + Generic socket layer sock_aio_write → sock->ops->sendmsg + + -:------------------------------------------------------------------------------------ + + IP socket layer sk->sk_prot->sendmsg + + -:------------------------------------------------------------------------------------ + + UDP socket layer ip_append_data udp_flush_pending_frames + | | + -:------------------------------+------------------------------+----------------------- + V V + IP socket layer skb = sock_alloc_send_skb(); ip_local_out + skb_queue_tail(sk, skb) + + -:------------------------------------------------------------------------------------ + + routing + + +Network processing phases +========================= + +.. slide:: Network processing phases + :inline-contents: True + :level: 2 + + * Interrupt handler - device driver fetches data from the RX ring, + creates a network packet and queues it to the network stack for + processing + + * NET_SOFTIRQ - packet goes through the stack layer and it is + processed: decapsulate Ethernet frame, check IP packet and route + it, if local packet decapsulate protocol packet (e.g. TCP) and + queues it to a socket + + * Process context - application fetches data from the socket queue + or pushes data to the socket queue + + +Packet Routing +============== + +.. slide:: Packet Routing + :inline-contents: True + :level: 2 + + .. 
ditaa:: + + +----------------------+ +----------------------+ + | Application | | Application | + +----------------------+ +----------------------+ + | ^ | ^ + | send() | recv() | send() | recv() + V | V | + +----------------------+ +----------------------+ + | Socket | | Socket | + +----------------------+ +----------------------+ + | ^ | ^ + | | | | + v | v | + +---------------------------------------------------------+ + | Transport layer | + +---------------------------------------------------------+ + | ^ | ^ + | | | | + v | v | + +---------------------------------------------------------+ + | Network layer | + +---------------------------------------------------------+ + | ^ + | | + v | + /---------------------------------------------------------\ + | Routing | ----> Drop packet + \---------------------------------------------------------/ + ^ | ^ | + | RX | TX | RX | TX + | v | v + +-----------------------+ +-----------------------+ + | Network Device Driver | | Network Device Driver | + +-----------------------+ +-----------------------+ + + +Routing Table(s) +---------------- + +.. slide:: Routing Table + :inline-contents: True + :level: 2 + + + .. 
code-block:: shell + + tavi@desktop-tavi:~/src/linux$ ip route list table main + default via 172.30.240.1 dev eth0 + 172.30.240.0/20 dev eth0 proto kernel scope link src 172.30.249.241 + + tavi@desktop-tavi:~/src/linux$ ip route list table local + broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1 + local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1 + local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1 + broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1 + broadcast 172.30.240.0 dev eth0 proto kernel scope link src 172.30.249.241 + local 172.30.249.241 dev eth0 proto kernel scope host src 172.30.249.241 + broadcast 172.30.255.255 dev eth0 proto kernel scope link src 172.30.249.241 + + tavi@desktop-tavi:~/src/linux$ ip rule list + 0: from all lookup local + 32766: from all lookup main + 32767: from all lookup default + + +Routing Policy Database +----------------------- + +.. slide:: Routing Policy Database + :inline-contents: True + :level: 2 + + * "Regular" routing only uses the destination address + + * To increase flexibility a "Routing Policy Database" is used that + allows different routing based on other fields such as the source + address, protocol type, transport ports, etc. + + * This is encoded as a list of rules that are evaluated based on + their priority (priority 0 is the highest) + + * Each rule has a selector (how to match the packet) and an + action (what action to take if the packet matches) + + * Selectors: source address, destination address, type of service (TOS), + input interface, output interface, etc. + + * Action: lookup / unicast - use given routing table, blackhole - + drop packet, unreachable - send ICMP unreachable message and drop + packet, etc. + + + +Routing table processing +------------------------ + +.. 
slide:: Routing table processing + :inline-contents: True + :level: 2 + + * Special table for local addresses -> route packets to sockets + based on family, type, ports + + * Check every routing entry, starting with the most specific + routes (e.g. 192.168.0.0/24 is checked before 192.168.0.0/16) + + * A route matches if the packet destination address logically ANDed + with the subnet mask equals the subnet address + + * Once a route matches, the following information is retrieved: + interface, link layer next-hop address, network next-hop address + + +Forwarding Information Database +------------------------------- + +.. slide:: Forwarding Information Database (removed in 3.6) + :inline-contents: True + :level: 2 + + |_| + + .. image:: ../res/fidb-overview.png + + +.. slide:: Forwarding Information Database (removed in 3.6) + :inline-contents: True + :level: 2 + + .. image:: ../res/fidb-details.png + +.. slide:: Routing Cache (removed in 3.6) + :inline-contents: True + :level: 2 + + |_| + + .. image:: ../res/routing-cache.png + +.. slide:: FIB TRIE + :inline-contents: True + :level: 2 + + |_| + + .. image:: ../res/fib-trie.png + +.. slide:: Compressed Trie + :inline-contents: True + :level: 2 + + |_| + + .. image:: ../res/fib-trie-compressed.png + + +Netfilter +========= + +.. slide:: Netfilter + :inline-contents: True + :level: 2 + + + * Framework that implements packet filtering and NAT + + * It uses hooks inserted in key places in the packet flow: + + * NF_IP_PRE_ROUTING + + * NF_IP_LOCAL_IN + + * NF_IP_FORWARD + + * NF_IP_LOCAL_OUT + + * NF_IP_POST_ROUTING + + * NF_IP_NUMHOOKS + + + +Network packets / skbs (struct sk_buff) +======================================= + +.. slide:: Network packets (skbs) + :inline-contents: True + :level: 2 + + .. image:: ../res/skb.png + + +.. slide:: struct sk_buff + :inline-contents: True + :level: 2 + + .. 
code-block:: c + + struct sk_buff { + struct sk_buff *next; + struct sk_buff *prev; + + struct sock *sk; + ktime_t tstamp; + struct net_device *dev; + char cb[48]; + + unsigned int len, + data_len; + __u16 mac_len, + hdr_len; + + void (*destructor)(struct sk_buff *skb); + + sk_buff_data_t transport_header; + sk_buff_data_t network_header; + sk_buff_data_t mac_header; + sk_buff_data_t tail; + sk_buff_data_t end; + + unsigned char *head, + *data; + unsigned int truesize; + atomic_t users; + + +.. slide:: skb APIs + :inline-contents: True + :level: 2 + + .. code-block:: c + + /* reserve head room */ + void skb_reserve(struct sk_buff *skb, int len); + + /* add data to the end */ + unsigned char *skb_put(struct sk_buff *skb, unsigned int len); + + /* add data to the top */ + unsigned char *skb_push(struct sk_buff *skb, unsigned int len); + + /* discard data at the top */ + unsigned char *skb_pull(struct sk_buff *skb, unsigned int len); + + /* discard data at the end */ + unsigned char *skb_trim(struct sk_buff *skb, unsigned int len); + + unsigned char *skb_transport_header(const struct sk_buff *skb); + + void skb_reset_transport_header(struct sk_buff *skb); + + void skb_set_transport_header(struct sk_buff *skb, const int offset); + + unsigned char *skb_network_header(const struct sk_buff *skb); + + void skb_reset_network_header(struct sk_buff *skb); + + void skb_set_network_header(struct sk_buff *skb, const int offset); + + unsigned char *skb_mac_header(const struct sk_buff *skb); + + int skb_mac_header_was_set(const struct sk_buff *skb); + + void skb_reset_mac_header(struct sk_buff *skb); + + void skb_set_mac_header(struct sk_buff *skb, const int offset); + + +.. slide:: skb data management + :inline-contents: True + :level: 2 + + |_| + + .. 
ditaa:: + :height: 50% + + Head + ^ +---------------+ + skb_push | | | | skb_reserve + +---------------+ v + | Data | | skb_pull + ^ | | v + skb_trim | | Tail | + +---------------+ + | | | skb_put + +---------------+ v + End + + +Network Device +============== + +.. slide:: Network Device Interface + :inline-contents: True + :level: 2 + + .. image:: ../res/net-dev-hw.png + + +.. slide:: Advanced features + :inline-contents: True + :level: 2 + + * Scatter-Gather + + * Checksum offloading: Ethernet, IP, UDP, TCP + + * Adaptive interrupt handling (coalescence, adaptive) + + + +Hardware and Software Acceleration Techniques +============================================= + +.. slide:: TCP offload + :inline-contents: True + :level: 2 + + * Full offload - Implement TCP/IP stack in hardware + + * Issues: + + * Scaling number of connections + + * Security + + * Conformance + +.. slide:: Performance observation + :inline-contents: True + :level: 2 + + * Performance is proportional with the number of packets to be + processed + + * Example: if an end-point can process 60K pps + + * 1538 MSS -> 738Mbps + * 2038 MSS -> 978Mbps + * 9038 MSS -> 4.3Gbps + * 20738 MSS -> 9.9Gbps + +.. slide:: Stateless offload + :inline-contents: True + :level: 2 + + * The networking stack processes large packets + + * TX path: the hardware splits large packets in smaller packets + (TCP Segmentation Offload) + + * RX path: the hardware aggregates small packets into larger + packets (Large Receive Offload - LRO) + + +.. slide:: TCP Segmentation Offload + :inline-contents: True + :level: 2 + + .. image:: ../res/tso.png + +.. slide:: Large Receive Offload + :inline-contents: True + :level: 2 + + .. 
image:: ../res/lro.png + + + diff --git a/refs/pull/405/merge/_sources/lectures/processes.rst.txt b/refs/pull/405/merge/_sources/lectures/processes.rst.txt new file mode 100644 index 00000000..95c8e2f7 --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/processes.rst.txt @@ -0,0 +1,1312 @@ +========= +Processes +========= + +`View slides <processes-slides.html>`_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives +================== + +.. slide:: Processes and threads + :inline-contents: True + :level: 2 + + * Processes and threads + + * Context switching + + * Blocking and waking up + + * Process context + + + +Processes and threads +===================== + +A process is an operating system abstraction that groups together +multiple resources: + +.. slide:: What is a process? + :inline-contents: True + :level: 2 + + .. hlist:: + :columns: 2 + + * An address space + * One or more threads + * Opened files + * Sockets + * Semaphores + * Shared memory regions + * Timers + * Signal handlers + * Many other resources and status information + + All this information is grouped in the Process Control Block + (PCB). In Linux this is :c:type:`struct task_struct`. + + +Overview of process resources +----------------------------- + +A summary of the resources a process has can be obtained from the +`/proc/<pid>` directory, where `<pid>` is the process id for the +process we want to look at. + +.. slide:: Overview of process resources + :inline-contents: True + :level: 2 + + .. fixme: ditaa does not work well with text containing ':' or '-' characters + .. code-block:: none + + +-------------------------------------------------------------------+ + | dr-x------ 2 tavi tavi 0 2021 03 14 12:34 . | + | dr-xr-xr-x 6 tavi tavi 0 2021 03 14 12:34 .. 
| + | lrwx------ 1 tavi tavi 64 2021 03 14 12:34 0 -> /dev/pts/4 | + +--->| lrwx------ 1 tavi tavi 64 2021 03 14 12:34 1 -> /dev/pts/4 | + | | lrwx------ 1 tavi tavi 64 2021 03 14 12:34 2 -> /dev/pts/4 | + | | lr-x------ 1 tavi tavi 64 2021 03 14 12:34 3 -> /proc/18312/fd | + | +-------------------------------------------------------------------+ + | +----------------------------------------------------------------+ + | | 08048000-0804c000 r-xp 00000000 08:02 16875609 /bin/cat | + $ ls -1 /proc/self/ | 0804c000-0804d000 rw-p 00003000 08:02 16875609 /bin/cat | + cmdline | | 0804d000-0806e000 rw-p 0804d000 00:00 0 [heap] | + cwd | | ... | + environ | +----------->| b7f46000-b7f49000 rw-p b7f46000 00:00 0 | + exe | | | b7f59000-b7f5b000 rw-p b7f59000 00:00 0 | + fd --------+ | | b7f5b000-b7f77000 r-xp 00000000 08:02 11601524 /lib/ld-2.7.so | + fdinfo | | b7f77000-b7f79000 rw-p 0001b000 08:02 11601524 /lib/ld-2.7.so | + maps -----------+ | bfa05000-bfa1a000 rw-p bffeb000 00:00 0 [stack] | + mem | ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso] | + root +----------------------------------------------------------------+ + stat +----------------------------+ + statm | Name: cat | + status ------+ | State: R (running) | + task | | Tgid: 18205 | + wchan +------>| Pid: 18205 | + | PPid: 18133 | + | Uid: 1000 1000 1000 1000 | + | Gid: 1000 1000 1000 1000 | + +----------------------------+ + + +:c:type:`struct task_struct` +---------------------------- + +Lets take a close look at :c:type:`struct task_struct`. For that we +could just look at the source code, but here we will use a tool called +`pahole` (part of the dwarves install package) in order to get +some insights about this structure: + + +.. slide:: struct task_struct + :inline-contents: True + :level: 2 + + .. code-block:: c + + $ pahole -C task_struct vmlinux + + struct task_struct { + struct thread_info thread_info; /* 0 8 */ + volatile long int state; /* 8 4 */ + void * stack; /* 12 4 */ + + ... 
+ + /* --- cacheline 45 boundary (2880 bytes) --- */ + struct thread_struct thread __attribute__((__aligned__(64))); /* 2880 4288 */ + + /* size: 7168, cachelines: 112, members: 155 */ + /* sum members: 7148, holes: 2, sum holes: 12 */ + /* sum bitfield members: 7 bits, bit holes: 2, sum bit holes: 57 bits */ + /* paddings: 1, sum paddings: 2 */ + /* forced alignments: 6, forced holes: 2, sum forced holes: 12 */ + } __attribute__((__aligned__(64))); + + +As you can see it is a pretty large data structure: almost 8KB in size +and 155 fields. + + +Inspecting task_struct +---------------------- + +The following screencast is going to demonstrate how we can inspect +the process control block (:c:type:`struct task_struct`) by connecting +the debugger to the running virtual machine. We are going to use a +helper gdb command `lx-ps` to list the processes and the address of +the task_struct for each process. + +.. slide:: Inspecting task_struct + :inline-contents: True + :level: 2 + + |_| + + .. asciicast:: ../res/inspect_task_struct.cast + + +Quiz: Inspect a task to determine opened files +---------------------------------------------- + +.. slide:: Quiz: Inspect opened files + :inline-contents: True + :level: 2 + + Use the debugger to inspect the process named syslogd. + + * What command should we use to list the opened file descriptors? + + * How many file descriptors are opened? + + * What command should we use to determine the file name for opened file descriptor 3? + + * What is the filename for file descriptor 3? + + +Threads +------- + +A thread is the basic unit that the kernel process scheduler uses to +allow applications to run on the CPU. A thread has the following +characteristics: + +.. 
slide:: Threads + :inline-contents: True + :level: 2 + + * Each thread has its own stack and together with the register + values it determines the thread execution state + + * A thread runs in the context of a process and all threads in the + same process share the resources + + * The kernel schedules threads not processes and user-level threads + (e.g. fibers, coroutines, etc.) are not visible at the kernel level + + +The typical thread implementation is one where the threads is +implemented as a separate data structure which is then linked to the +process data structure. For example, the Windows kernel uses such an +implementation: + + +.. slide:: Classic implementation (Windows) + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + EPROCESS + +------------------+ + +->| KPROCESS | + | +------------------+ + | | Process ID (PID) | + | +------------------+ + | | ... | + | +------------------+ + | | Thread list |--------------+------------------------------------+ + | +------------------+ | | + | | Opened files | ETHREAD V ETHREAD V + | | +--------------+ | + | | | FILE | | + | | +--------------+ | + | | | ... | | + | | +--------------+ | + | +------------------+ +-----------------------+ +-----------------------+ + | | Address Space | | KTHREAD | | KTHREAD | + | + +--------------+ | +-----------------------+ +-----------------------+ + | | | ... | | | Thread ID (TID) | | Thread ID (TID) | + | | +--------------+ | +-----------------------+ +-----------------------+ + | +------------------+ | Thread Start Address | | Thread Start Address | + | +-----------------------+ +-----------------------+ + | | ... | ... | ... | + | +-----------------------+ +-----------------------+ + | | Process | | Process | + | +-----------------------+ +-----------------------+ + | | | + +---------------------------------------+------------------------------------+ + + +Linux uses a different implementation for threads. 
The basic unit is +called a task (hence the :c:type:`struct task_struct`) and it is used +for both threads and processes. Instead of embedding resources in the +task structure it has pointers to these resources. + +Thus, if two threads are in the same process they will point to the same +resource structure instance. If two threads are in different processes +they will point to different resource structure instances. + + +.. slide:: Linux implementation + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + Opened files + task_struct +-------------------+ task_struct + +-----------------------+ | FILE | +-----------------------+ + | Thread Group ID (PID) | +--->+-------------------+<---+ | Thread Group ID (PID) | + +-----------------------+ | | .... | | +-----------------------+ + | Thread ID (TID) | | +-------------------+ | | Thread ID (TID) | + +-----------------------+ | | +-----------------------+ + | ... | | | | ... | + +-----------------------+ | | +-----------------------+ + | Opened files |--+ +--| Opened files | + +-----------------------+ Address Space +-----------------------+ + | Address Space |---+ +-------------------+ +---| Address Space | + +-----------------------+ | | | | +-----------------------+ + | ... | +-->| .... |<--+ | ... | + +-----------------------+ | | +-----------------------+ + +-------------------+ + + +The clone system call +--------------------- + +In Linux a new thread or process is created with the :c:func:`clone` +system call. Both the :c:func:`fork` system call and the +:c:func:`pthread_create` function use the :c:func:`clone` +implementation. + +It allows the caller to decide what resources should be shared with +the parent and which should be copied or isolated: + +.. 
slide:: The clone system call + :inline-contents: True + :level: 2 + + * CLONE_FILES - shares the file descriptor table with the parent + + * CLONE_VM - shares the address space with the parent + + * CLONE_FS - shares the filesystem information (root directory, + current directory) with the parent + + * CLONE_NEWNS - does not share the mount namespace with the parent + + * CLONE_NEWIPC - does not share the IPC namespace (System V IPC + objects, POSIX message queues) with the parent + + * CLONE_NEWNET - does not share the networking namespace (network + interfaces, routing table) with the parent + + +For example, if `CLONE_FILES | CLONE_VM | CLONE_FS` is used by the +caller then effectively a new thread is created. If these flags are +not used then a new process is created. + +Namespaces and "containers" +--------------------------- + +"Containers" are a form of lightweight virtual machines that share the +same kernel instance, as opposed to normal virtualization where a +hypervisor runs multiple VMs, each with its own kernel +instance. + +Examples of container technologies are LXC - which allows running +lightweight "VMs" - and docker - a specialized container for running a +single application. + +Containers are built on top of a few kernel features, one of which is +namespaces. They allow isolation of different resources that would +otherwise be globally visible. For example, without containers, all +processes would be visible in /proc. With containers, processes in one +container will not be visible (in /proc or be killable) to other +containers. + +To achieve this partitioning, the :c:type:`struct nsproxy` structure +is used to group types of resources that we want to partition. It +currently supports IPC, cgroup, mount, networking, PID and +time namespaces. For example, instead of having a global list for +networking interfaces, the list is part of a :c:type:`struct net`. 
The +system initializes with a default namespace (:c:data:`init_net`) and by +default all processes will share this namespace. When a new network namespace +is created a new :c:type:`struct net` instance is created and then new processes can +point to that new namespace instead of the default one. + + +.. slide:: Namespaces and "containers" + :inline-contents: False + :level: 2 + + * Containers = a form of lightweight virtual machines + + * Container based technologies: LXC, docker + + * Containers are built on top of kernel namespaces + + * Kernel namespaces allow isolation of otherwise globally visible + resources + + * :c:type:`struct nsproxy` has multiple namespaces each of which + can be selectively shared between groups of processes + + * At boot initial namespaces are created (e.g. :c:data:`init_net`) + that are by default shared between new processes (e.g. list of + available network interfaces) + + * New namespaces can be created at runtime and new processes can + point to these new namespaces + + +Accessing the current process +----------------------------- + +.. slide:: Accessing the current process + :inline-contents: True + :level: 2 + + Accessing the current process is a frequent operation: + + * opening a file needs access to :c:type:`struct task_struct`'s + files field + + * mapping a new file needs access to :c:type:`struct task_struct`'s + mm field + + * Over 90% of the system calls need to access the current process + structure so it needs to be fast + + * The :c:macro:`current` macro is available to access the current + process's :c:type:`struct task_struct` + +In order to support fast access in multi processor configurations a +per CPU variable is used to store and retrieve the pointer to the +current :c:type:`struct task_struct`: + +.. slide:: Accessing the current process on x86 + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + CPU0 + +------------+ task_struct + | ... 
| +--------> +-----------------------+ + +------------- | | Thread Group ID (PID) | + +--| FS | | +-----------------------+ + | +------------- | | Thread ID (TID) | + | | ... | | +-----------------------+ + | +------------+ | | ... | + | | +-----------------------+ + | Per CPU variables | | Opened files | + +->+-----------------------+ | +-----------------------+ + | ... | | | Address Space | + +-----------------------+ | +-----------------------+ + | current_task |------+ | ... | + +-----------------------+ +-----------------------+ + | ... | + +-----------------------+ + + +Previously the following sequence was used as the implementation for +the :c:macro:`current` macro: + +.. slide:: Previous implementation for current (x86) + :inline-contents: True + :level: 2 + + .. code-block:: c + + /* how to get the current stack pointer from C */ + register unsigned long current_stack_pointer asm("esp") __attribute_used__; + + /* how to get the thread information struct from C */ + static inline struct thread_info *current_thread_info(void) + { + return (struct thread_info *)(current_stack_pointer & ~(THREAD_SIZE - 1)); + } + + #define current current_thread_info()->task + + +Quiz: previous implementation for current (x86) +----------------------------------------------- + +.. slide:: Quiz: previous implementation for current (x86) + :inline-contents: True + :level: 2 + + What is the size of :c:type:`struct thread_info`? + + Which of the following are potential valid sizes for + :c:type:`struct thread_info`: 4095, 4096, 4097? + + + +Context switching +================= + +The following diagram shows an overview of the Linux kernel context +switch process: + +.. slide:: Overview of the context switching process + :inline-contents: True + :level: 2 + + .. 
ditaa:: + + Userspace Kernel Kernel Userspace + T0 T0 T1 T1 + + | + | syscall +-------------------+ + V --------->| Save user regs on | +-----------------+ + interrupt | the kernel stack | | Save user regs | + +-------------------+ | on kernel stack | + | +-----------------+ + |schedule() | + | |schedule() + V | + +-----------------+ V + | context_switch |------+ +-----------------+ + +-----------------+ | | context_switch | + +-----> +-----------------+ + | + V + +-------------------+ + | Pop user regs | + | from kernel stack | + +-------------------+ + | + | exit syscall + +--------------------> | + | + V + + +Note that before a context switch can occur we must do a kernel +transition, either with a system call or with an interrupt. At that +point the user space registers are saved on the kernel stack. At some +point the :c:func:`schedule` function will be called which can decide +that a context switch must occur from T0 to T1 (e.g. because the +current thread is blocking waiting for an I/O operation to complete or +because its allocated time slice has expired). + +At that point :c:func:`context_switch` will perform architecture +specific operations and will switch the address space if needed: + + +.. slide:: context_switch + :inline-contents: True + :level: 2 + + .. code-block:: c + + static __always_inline struct rq * + context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next, struct rq_flags *rf) + { + prepare_task_switch(rq, prev, next); + + /* + * For paravirt, this is coupled with an exit in switch_to to + * combine the page table reload and the switch backend into + * one hypercall. 
+ */ + arch_start_context_switch(prev); + + /* + * kernel -> kernel lazy + transfer active + * user -> kernel lazy + mmgrab() active + * + * kernel -> user switch + mmdrop() active + * user -> user switch + */ + if (!next->mm) { // to kernel + enter_lazy_tlb(prev->active_mm, next); + + next->active_mm = prev->active_mm; + if (prev->mm) // from user + mmgrab(prev->active_mm); + else + prev->active_mm = NULL; + } else { // to user + membarrier_switch_mm(rq, prev->active_mm, next->mm); + /* + * sys_membarrier() requires an smp_mb() between setting + * rq->curr / membarrier_switch_mm() and returning to userspace. + * + * The below provides this either through switch_mm(), or in + * case 'prev->active_mm == next->mm' through + * finish_task_switch()'s mmdrop(). + */ + switch_mm_irqs_off(prev->active_mm, next->mm, next); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ + rq->prev_mm = prev->active_mm; + prev->active_mm = NULL; + } + } + + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); + + prepare_lock_switch(rq, next, rf); + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + barrier(); + + return finish_task_switch(prev); + } + + +Then it will call the architecture specific :c:macro:`switch_to` +implementation to switch the registers state and kernel stack. Note +that registers are saved on stack and that the stack pointer is saved +in the task structure: + +.. slide:: switch_to + :inline-contents: True + :level: 2 + + .. 
code-block:: c + :emphasize-lines: 28-30,56 + + #define switch_to(prev, next, last) \ + do { \ + ((last) = __switch_to_asm((prev), (next))); \ + } while (0) + + + /* + * %eax: prev task + * %edx: next task + */ + .pushsection .text, "ax" + SYM_CODE_START(__switch_to_asm) + /* + * Save callee-saved registers + * This must match the order in struct inactive_task_frame + */ + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + /* + * Flags are saved to prevent AC leakage. This could go + * away if objtool would have 32bit support to verify + * the STAC/CLAC correctness. + */ + pushfl + + /* switch stack */ + movl %esp, TASK_threadsp(%eax) + movl TASK_threadsp(%edx), %esp + + #ifdef CONFIG_STACKPROTECTOR + movl TASK_stack_canary(%edx), %ebx + movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset + #endif + + #ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + #endif + + /* Restore flags or the incoming task to restore AC state. */ + popfl + /* restore callee-saved registers */ + popl %esi + popl %edi + popl %ebx + popl %ebp + + jmp __switch_to + SYM_CODE_END(__switch_to_asm) + .popsection + + +You can notice that the instruction pointer is not explicitly +saved. 
It is not needed because: + + * a task will always resume in this function + + * the :c:func:`schedule` (:c:func:`context_switch` is always + inlined) caller's return address is saved on the kernel stack + + * a jmp is used to execute :c:func:`__switch_to` which is a function + and when it returns it will pop the original (next task) return + address from the stack + + +The following screencast uses the debugger to set up a breakpoint in +__switch_to_asm and examine the stack during the context switch: + +.. slide:: Inspecting task_struct + :inline-contents: True + :level: 2 + + |_| + + .. asciicast:: ../res/context_switch.cast + + +Quiz: context switch +-------------------- + +.. slide:: Quiz: context switch + :inline-contents: True + :level: 2 + + We are executing a context switch. Select all of the statements that are true. + + * the ESP register is saved in the task structure + + * the EIP register is saved in the task structure + + * general registers are saved in the task structure + + * the ESP register is saved on the stack + + * the EIP register is saved on the stack + + * general registers are saved on the stack + + +Blocking and waking up tasks +============================ + +Task states +----------- + +The following diagram shows the task (thread) states and the +possible transitions between them: + +.. slide:: Task states + :inline-contents: True + :level: 2 + + .. 
ditaa:: + + preemption + +------------------------------+ + | | + V | + +------------+ +--------------+ +-------------+ + clone() | | schedule() | | exit() | | + -----------> | TASK_READY |-------------->| TASK_RUNNING |---------------->| TASK_DEAD | + | | | |--------+ | TASK_ZOMBIE | + +------------+ +--------------+ | | | + ^ | +-------------+ + | | + | | + | | + | signal +----------------------+ | + +-----------| | | + | | | wait_event() | + | wake_up() | TASK_INTERRUPTIBLE |<--------------+ + +-----------| | | + | | | | + | +----------------------+ | + | | + | | + | +----------------------+ | + | | | wait_event() | + | wake_up() | TASK_UNINTERRUPTIBLE |<--------------+ + +-----------| | + +----------------------+ + + +Blocking the current thread +--------------------------- + +Blocking the current thread is an important operation we need to +perform to implement efficient task scheduling - we want to run other +threads while I/O operations complete. + +In order to accomplish this the following operations take place: + +.. slide:: Blocking the current thread + :inline-contents: True + :level: 2 + + * Set the current thread state to TASK_UNINTERRUPTIBLE or + TASK_INTERRUPTIBLE + + * Add the task to a waiting queue + + * Call the scheduler which will pick up a new task from the READY + queue + + * Do the context switch to the new task + +Below are some snippets for the :c:macro:`wait_event` +implementation. Note that the waiting queue is a list with some extra +information like a pointer to the task struct. + +Also note that a lot of effort is put into making sure no deadlock can +occur between :c:macro:`wait_event` and :c:macro:`wake_up`: the task +is added to the list before checking :c:data:`condition`, signals are +checked before calling :c:func:`schedule`. + +.. slide:: wait_event + :inline-contents: True + :level: 2 + + .. 
code-block:: c + + /** + * wait_event - sleep until a condition gets true + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + */ + #define wait_event(wq_head, condition) \ + do { \ + might_sleep(); \ + if (condition) \ + break; \ + __wait_event(wq_head, condition); \ + } while (0) + + #define __wait_event(wq_head, condition) \ + (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + schedule()) + + /* + * The below macro ___wait_event() has an explicit shadow of the __ret + * variable when used from the wait_event_*() macros. + * + * This is so that both can use the ___wait_cond_timeout() construct + * to wrap the condition. + * + * The type inconsistency of the wait_event_*() __ret variable is also + * on purpose; we use long where we can return timeout values and int + * otherwise. + */ + #define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ + ({ \ + __label__ __out; \ + struct wait_queue_entry __wq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait_entry(&__wq_entry, exclusive ? 
WQ_FLAG_EXCLUSIVE : 0); \ + for (;;) { \ + long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\ + \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(&wq_head, &__wq_entry); \ + __out: __ret; \ + }) + + void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) + { + wq_entry->flags = flags; + wq_entry->private = current; + wq_entry->func = autoremove_wake_function; + INIT_LIST_HEAD(&wq_entry->entry); + } + + long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) + { + unsigned long flags; + long ret = 0; + + spin_lock_irqsave(&wq_head->lock, flags); + if (signal_pending_state(state, current)) { + /* + * Exclusive waiter must not fail if it was selected by wakeup, + * it should "consume" the condition we were waiting for. + * + * The caller will recheck the condition and return success if + * we were already woken up, we can not miss the event because + * wakeup locks/unlocks the same wq_head->lock. + * + * But we need to ensure that set-condition + wakeup after that + * can't see us, it should wake up another exclusive waiter if + * we fail. 
+ */ + list_del_init(&wq_entry->entry); + ret = -ERESTARTSYS; + } else { + if (list_empty(&wq_entry->entry)) { + if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_entry_tail(wq_head, wq_entry); + else + __add_wait_queue(wq_head, wq_entry); + } + set_current_state(state); + } + spin_unlock_irqrestore(&wq_head->lock, flags); + + return ret; + } + + static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) + { + list_add(&wq_entry->entry, &wq_head->head); + } + + static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) + { + list_add_tail(&wq_entry->entry, &wq_head->head); + } + + /** + * finish_wait - clean up after waiting in a queue + * @wq_head: waitqueue waited on + * @wq_entry: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + */ + void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) + { + unsigned long flags; + + __set_current_state(TASK_RUNNING); + /* + * We can check for list emptiness outside the lock + * IFF: + * - we use the "careful" check that verifies both + * the next and prev pointers, so that there cannot + * be any half-pending updates in progress on other + * CPU's that we haven't seen yet (and that might + * still change the stack area. + * and + * - all other users take the lock (ie we can only + * have _one_ other CPU that looks at or modifies + * the list). + */ + if (!list_empty_careful(&wq_entry->entry)) { + spin_lock_irqsave(&wq_head->lock, flags); + list_del_init(&wq_entry->entry); + spin_unlock_irqrestore(&wq_head->lock, flags); + } + } + + + +Waking up a task +---------------- + +We can wake-up tasks by using the :c:macro:`wake_up` primitive. The +following high level operations are performed to wake up a task: + +.. 
slide:: Waking up a task + :inline-contents: True + :level: 2 + + * Select a task from the waiting queue + + * Set the task state to TASK_READY + + * Insert the task into the scheduler's READY queue + + * On SMP system this is a complex operation: each processor has its + own queue, queues need to be balanced, CPUs needs to be signaled + + +.. slide:: wake_up + :inline-contents: True + :level: 2 + + .. code-block:: c + + #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) + + /** + * __wake_up - wake up threads blocked on a waitqueue. + * @wq_head: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. + */ + void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, void *key) + { + __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key); + } + + static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) + { + unsigned long flags; + wait_queue_entry_t bookmark; + + bookmark.flags = 0; + bookmark.private = NULL; + bookmark.func = NULL; + INIT_LIST_HEAD(&bookmark.entry); + + do { + spin_lock_irqsave(&wq_head->lock, flags); + nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, + wake_flags, key, &bookmark); + spin_unlock_irqrestore(&wq_head->lock, flags); + } while (bookmark.flags & WQ_FLAG_BOOKMARK); + } + + /* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. 
try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ + static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, int wake_flags, void *key, + wait_queue_entry_t *bookmark) + { + wait_queue_entry_t *curr, *next; + int cnt = 0; + + lockdep_assert_held(&wq_head->lock); + + if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { + curr = list_next_entry(bookmark, entry); + + list_del(&bookmark->entry); + bookmark->flags = 0; + } else + curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); + + if (&curr->entry == &wq_head->head) + return nr_exclusive; + + list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) { + unsigned flags = curr->flags; + int ret; + + if (flags & WQ_FLAG_BOOKMARK) + continue; + + ret = curr->func(curr, mode, wake_flags, key); + if (ret < 0) + break; + if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + + if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) && + (&next->entry != &wq_head->head)) { + bookmark->flags = WQ_FLAG_BOOKMARK; + list_add_tail(&bookmark->entry, &next->entry); + break; + } + } + + return nr_exclusive; + } + + int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) + { + int ret = default_wake_function(wq_entry, mode, sync, key); + + if (ret) + list_del_init_careful(&wq_entry->entry); + + return ret; + } + + int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, + void *key) + { + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); + return try_to_wake_up(curr->private, mode, wake_flags); + } + + /** + * try_to_wake_up - wake up a thread + * @p: the thread to be awakened + * @state: the mask of task states that can be woken + * @wake_flags: wake modifier flags (WF_*) + * + * Conceptually does: + * + * If (@state & @p->state) @p->state = TASK_RUNNING. 
+ * + * If the task was not queued/runnable, also place it back on a runqueue. + * + * This function is atomic against schedule() which would dequeue the task. + * + * It issues a full memory barrier before accessing @p->state, see the comment + * with set_current_state(). + * + * Uses p->pi_lock to serialize against concurrent wake-ups. + * + * Relies on p->pi_lock stabilizing: + * - p->sched_class + * - p->cpus_ptr + * - p->sched_task_group + * in order to do migration, see its use of select_task_rq()/set_task_cpu(). + * + * Tries really hard to only take one task_rq(p)->lock for performance. + * Takes rq->lock in: + * - ttwu_runnable() -- old rq, unavoidable, see comment there; + * - ttwu_queue() -- new rq, for enqueue of the task; + * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. + * + * As a consequence we race really badly with just about everything. See the + * many memory barriers and their comments for details. + * + * Return: %true if @p->state changes (an actual wakeup was done), + * %false otherwise. + */ + static int + try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + { + ... + + +Preempting tasks +================ + +Up until this point we looked at how context switches occur voluntarily +between threads. Next we will look at how preemption is handled. We +will start with the simpler case where the kernel is configured as +non preemptive and then we will move to the preemptive kernel case. + +Non preemptive kernel +--------------------- + +.. slide:: Non preemptive kernel + :inline-contents: True + :level: 2 + + * At every tick the kernel checks to see if the current process has + its time slice consumed + + * If that happens a flag is set in interrupt context + + * Before returning to userspace the kernel checks this flag and + calls :c:func:`schedule` if needed + + * In this case tasks are not preempted while running in kernel mode + (e.g.
system call) so there are no synchronization issues + + +Preemptive kernel +----------------- + +In this case the current task can be preempted even if we are running +in kernel mode and executing a system call. This requires using +special synchronization primitives: :c:macro:`preempt_disable` and +:c:macro:`preempt_enable`. + +In order to simplify handling for preemptive kernels and since +synchronization primitives are needed for the SMP case anyway, +preemption is disabled automatically when a spinlock is used. + +As before, if we run into a condition that requires the preemption of +the current task (its time slice has expired) a flag is set. This +flag is checked whenever the preemption is reactivated, e.g. when +exiting a critical section through a :c:func:`spin_unlock` and if +needed the scheduler is called to select a new task. + + +.. slide:: Preemptive kernel + :inline-contents: False + :level: 2 + + * Tasks can be preempted even when running in kernel mode + + * It requires new synchronization primitives to be used in critical + sections: :c:macro:`preempt_disable` and + :c:macro:`preempt_enable` + + * Spinlocks also disable preemption + + * When a thread needs to be preempted a flag is set and action is + taken (e.g. scheduler is called) when preemption is reactivated + + +Process context +=============== + +Now that we have examined the implementation of processes and threads +(tasks), how context switching occurs, how we can block, wake-up and +preempt tasks, we can finally define what the process context is and +what its properties are: + +.. slide:: Process context + :inline-contents: True + :level: 2 + + The kernel is executing in process context when it is running a + system call. + + In process context there is a well defined context and we can + access the current process data with :c:macro:`current` + + In process context we can sleep (wait on a condition).
+ + In process context we can access the user-space (unless we are + running in a kernel thread context). + + +Kernel threads +-------------- + +.. slide:: Kernel threads + :inline-contents: True + :level: 2 + + Sometimes the kernel core or device drivers need to perform blocking + operations and thus they need to run in process context. + + Kernel threads are used exactly for this and are a special class of + tasks that don't have "userspace" resources (e.g. no address space or + opened files). + + +The following screencast takes a closer look at kernel threads: + +.. slide:: Inspecting kernel threads + :inline-contents: True + :level: 2 + + |_| + + .. asciicast:: ../res/kernel_threads.cast + + +Using gdb scripts for kernel inspection +======================================= + +The Linux kernel comes with a predefined set of gdb extra commands we +can use to inspect the kernel during debugging. They will +automatically be loaded as long as gdbinit is properly set up: + +.. code-block:: sh + + ubuntu@so2:/linux/tools/labs$ cat ~/.gdbinit + add-auto-load-safe-path /linux/scripts/gdb/vmlinux-gdb.py + +All of the kernel specific commands are prefixed with lx-. You can use +TAB in gdb to list all of them: + +.. code-block:: sh + + (gdb) lx- + lx-clk-summary lx-dmesg lx-mounts + lx-cmdline lx-fdtdump lx-ps + lx-configdump lx-genpd-summary lx-symbols + lx-cpus lx-iomem lx-timerlist + lx-device-list-bus lx-ioports lx-version + lx-device-list-class lx-list-check + lx-device-list-tree lx-lsmod + +The implementation of the commands can be found at +`scripts/gdb/linux`. Let's take a closer look at the lx-ps +implementation: + + +..
code-block:: python + + task_type = utils.CachedType("struct task_struct") + + + def task_lists(): + task_ptr_type = task_type.get_type().pointer() + init_task = gdb.parse_and_eval("init_task").address + t = g = init_task + + while True: + while True: + yield t + + t = utils.container_of(t['thread_group']['next'], + task_ptr_type, "thread_group") + if t == g: + break + + t = g = utils.container_of(g['tasks']['next'], + task_ptr_type, "tasks") + if t == init_task: + return + + + class LxPs(gdb.Command): + """Dump Linux tasks.""" + + def __init__(self): + super(LxPs, self).__init__("lx-ps", gdb.COMMAND_DATA) + + def invoke(self, arg, from_tty): + gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM")) + for task in task_lists(): + gdb.write("{} {:^5} {}\n".format( + task.format_string().split()[0], + task["pid"].format_string(), + task["comm"].string())) + + + +Quiz: Kernel gdb scripts +------------------------ + +.. slide:: Quiz: Kernel gdb scripts + :inline-contents: True + :level: 2 + + What is the following change of the lx-ps script trying to + accomplish? + + .. 
code-block:: diff + + diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py + index 17ec19e9b5bf..7e43c163832f 100644 + --- a/scripts/gdb/linux/tasks.py + +++ b/scripts/gdb/linux/tasks.py + @@ -75,10 +75,13 @@ class LxPs(gdb.Command): + def invoke(self, arg, from_tty): + gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM")) + for task in task_lists(): + - gdb.write("{} {:^5} {}\n".format( + + check = task["mm"].format_string() == "0x0" + + gdb.write("{} {:^5} {}{}{}\n".format( + task.format_string().split()[0], + task["pid"].format_string(), + - task["comm"].string())) + + "[" if check else "", + + task["comm"].string(), + + "]" if check else "")) + + + LxPs() + diff --git a/refs/pull/405/merge/_sources/lectures/smp.rst.txt b/refs/pull/405/merge/_sources/lectures/smp.rst.txt new file mode 100644 index 00000000..29706286 --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/smp.rst.txt @@ -0,0 +1,1184 @@ +========================== +Symmetric Multi-Processing +========================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives: +=================== + +.. slide:: Symmetric Multi-Processing + :inline-contents: True + :level: 2 + + * Kernel Concurrency + + * Atomic operations + + * Spin locks + + * Cache thrashing + + * Optimized spin locks + + * Process and Interrupt Context Synchronization + + * Mutexes + + * Per CPU data + + * Memory Ordering and Barriers + + * Read-Copy Update + + +Synchronization basics +====================== + +Because the Linux kernel supports symmetric multi-processing (SMP) it +must use a set of synchronization mechanisms to achieve predictable +results, free of race conditions. + +.. note:: We will use the terms core, CPU and processor as + interchangeable for the purpose of this lecture. + +Race conditions can occur when the following two conditions happen +simultaneously: + +.. 
slide:: Race conditions + :inline-contents: True + :level: 2 + + * there are at least two execution contexts that run in "parallel": + + * truly run in parallel (e.g. two system calls running on + different processors) + + * one of the contexts can arbitrary preempt the other (e.g. an + interrupt preempts a system call) + + * the execution contexts perform read-write accesses to shared + memory + + +Race conditions can lead to erroneous results that are hard to debug, +because they manifest only when the execution contexts are scheduled +on the CPU cores in a very specific order. + +A classical race condition example is an incorrect implementation for +a release operation of a resource counter: + +.. slide:: Race condition: resource counter release + :inline-contents: True + :level: 2 + + .. code-block:: c + + void release_resource() + { + counter--; + + if (!counter) + free_resource(); + } + + +A resource counter is used to keep a shared resource available until +the last user releases it but the above implementation has a race +condition that can cause freeing the resource twice: + + +.. slide:: Race condition scenario + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + counter is 2 + + Thread A Thread B + + * + | + | + +---------------------+ + | dec counter | counter is 1 + | cEEE | + +---------------------+ + | + | B preempts A + +-----------------------------------------------+ + | + v + +----------------------+ + counter is 0 | dec counter | + | if (!counter) | + resource is freed | free_resource(); | + | cEEE | + +----------------------+ + B finishes, A continues | + +-----------------------------------------------+ + | + v + +----------------------+ + | if (!counter) | + | free_resource(); | resource is freed + | cEEE | + +----------------------+ + +In most cases the `release_resource()` function will only free the +resource once. 
However, in the scenario above, if thread A is +preempted right after decrementing `counter` and thread B calls +`release_resource()` it will cause the resource to be freed. When +resumed, thread A will also free the resource since the counter value +is 0. + +To avoid race conditions the programmer must first identify the +critical section that can generate a race condition. The critical +section is the part of the code that reads and writes shared memory +from multiple parallel contexts. + +In the example above, the minimal critical section is starting with +the counter decrement and ending with checking the counter's value. + +Once the critical section has been identified race conditions can be +avoided by using one of the following approaches: + +.. slide:: Avoiding race conditions + :inline-contents: True + :level: 2 + + * make the critical section **atomic** (e.g. use atomic + instructions) + + * **disable preemption** during the critical section (e.g. disable + interrupts, bottom-half handlers, or thread preemption) + + * **serialize the access** to the critical section (e.g. use spin + locks or mutexes to allow only one context or thread in the + critical section) + + + +Linux kernel concurrency sources +================================ + +There are multiple source of concurrency in the Linux kernel that +depend on the kernel configuration as well as the type of system it +runs on: + + +.. slide:: Linux kernel concurrency sources + :inline-contents: True + :level: 2 + + * **single core systems**, **non-preemptive kernel**: the current + process can be preempted by interrupts + + * **single core systems**, **preemptive kernel**: above + the + current process can be preempted by other processes + + * **multi-core systems**: above + the current process can run + in parallel with another process or with an interrupt running on + another processor + +.. 
note:: We only discuss kernel concurrency and that is why a + non-preemptive kernel running on a single core system + has interrupts as the only source of concurrency. + + +Atomic operations +================= + +In certain circumstances we can avoid race conditions by using atomic +operations that are provided by hardware. Linux provides a unified API +to access atomic operations: + +.. slide:: Atomic operations + :inline-contents: True + :level: 2 + + * integer based: + + * simple: :c:func:`atomic_inc`, :c:func:`atomic_dec`, + :c:func:`atomic_add`, :c:func:`atomic_sub` + + * conditional: :c:func:`atomic_dec_and_test`, :c:func:`atomic_sub_and_test` + + * bit based: + + * simple: :c:func:`test_bit`, :c:func:`set_bit`, + :c:func:`change_bit` + + * conditional: :c:func:`test_and_set_bit`, :c:func:`test_and_clear_bit`, + :c:func:`test_and_change_bit` + +For example, we could use :c:func:`atomic_dec_and_test` to implement +the resource counter decrement and value checking atomically: + +.. slide:: Using :c:func:`atomic_dec_and_test` to implement resource counter release + :inline-contents: True + :level: 2 + + .. code-block:: c + + void release_resource() + { + if (atomic_dec_and_test(&counter)) + free_resource(); + } + + +One complication with atomic operations is encountered in +multi-core systems, where an atomic operation is no longer +atomic at the system level (but still atomic at the core level). + +To understand why, we need to decompose the atomic operation into memory +loads and stores. Then we can construct race condition scenarios where +the load and store operations are interleaved across CPUs, like in the +example below where incrementing a value from two processors will +produce an unexpected result: + +.. slide:: Atomic operations may not be atomic on SMP systems + :inline-contents: True + :level: 2 + + |_| + + ..
ditaa:: + + + +------------+ + | Memory | + +-------------+ LOAD (0) | | +-------------+ + | CPU 0 |<--------------| v <- 0 | LOAD (0) | CPU 1 | + | | STORE (1) | |-------------->| | + | inc v |-------------->| v <- 1 | STORE (1) | inc v | + | cEEE | | v <- 1 |<--------------| cEEE | + +-------------+ | cEEE | +-------------+ + +------------+ + + +In order to provide atomic operations on SMP systems different +architectures use different techniques. For example, on x86 a LOCK +prefix is used to lock the system bus while executing the prefixed +operation: + +.. slide:: Fixing atomic operations for SMP systems (x86) + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + +------------+ + +-------------+ BUS LOCK | Memory | + | CPU 1 |<------------->| | + | | LOAD (0) | | + | inc v |<--------------| v <- 0 | + | | STORE (1) | | + | |-------------->| v <- 1 | + | | BUS UNLOCK | | + | cEEE |<------------->| | BUS LOCK +-------------+ + +-------------+ | |<------------->| CPU 1 | + | | LOAD (1) | | + | |<--------------| inc v | + | v <- 2 | STORE (2) | | + | |-------------->| | + | | BUS UNLOCK | | + | cEEE |<------------->| cEEE | + +------------+ +-------------+ + + +On ARM the LDREX and STREX instructions are used together to guarantee +atomic access: LDREX loads a value and signals the exclusive monitor +that an atomic operation is in progress. The STREX attempts to store a +new value but only succeeds if the exclusive monitor has not detected +other exclusive operations. So, to implement atomic operations the +programmer must retry the operation (both LDREX and STREX) until the +exclusive monitor signals a success. 
+ +Although they are often interpreted as "light" or "efficient" +synchronization mechanisms (because they "don't require spinning or +context switches", or because they "are implemented in hardware so +they must be more efficient", or because they "are just instructions +so they must have similar efficiency as other instructions"), as seen +from the implementation details, atomic operations are actually +expensive. + + +Disabling preemption (interrupts) +================================= + +On single core systems and non preemptive kernels the only source of +concurrency is the preemption of the current thread by an +interrupt. To prevent concurrency it is thus sufficient to disable +interrupts. + +This is done with architecture specific instructions, but Linux offers +architecture independent APIs to disable and enable interrupts: + +.. slide:: Synchronization with interrupts (x86) + :inline-contents: True + :level: 2 + + .. code-block:: c + + #define local_irq_disable() \ + asm volatile ("cli" : : : "memory") + + #define local_irq_enable() \ + asm volatile ("sti" : : : "memory") + + #define local_irq_save(flags) \ + asm volatile ("pushf ; pop %0" :"=g" (flags) + : /* no input */: "memory") \ + asm volatile("cli": : :"memory") + + #define local_irq_restore(flags) \ + asm volatile ("push %0 ; popf" + : /* no output */ + : "g" (flags) :"memory", "cc"); + + +Although the interrupts can be explicitly disabled and enabled with +:c:func:`local_irq_disable` and :c:func:`local_irq_enable` these APIs +should only be used when the current state of interrupts is +known. They are usually used in core kernel code (like interrupt +handling). + +For typical cases where we want to avoid interrupts due to concurrency +issues it is recommended to use the :c:func:`local_irq_save` and +:c:func:`local_irq_restore` variants.
They take care of saving and +restoring the interrupts states so they can be freely called from +overlapping critical sections without the risk of accidentally +enabling interrupts while still in a critical section, as long as the +calls are balanced. + +Spin Locks +========== + +Spin locks are used to serialize access to a critical section. They +are necessary on multi-core systems where we can have true execution +parallelism. This is a typical spin lock implementation: + + +.. slide:: Spin Lock Implementation Example (x86) + :inline-contents: True + :level: 2 + + .. code-block:: asm + + spin_lock: + lock bts [my_lock], 0 + jc spin_lock + + /* critical section */ + + spin_unlock: + mov [my_lock], 0 + + **bts dts, src** - bit test and set; it copies the src bit from the dts + memory address to the carry flag and then sets it: + + .. code-block:: c + + CF <- dts[src] + dts[src] <- 1 + + +As it can be seen, the spin lock uses an atomic instruction to make +sure that only one core can enter the critical section. If there are +multiple cores trying to enter they will continuously "spin" until the +lock is released. + +While the spin lock avoids race conditions, it can have a significant +impact on the system's performance due to "lock contention": + + +.. slide:: Lock Contention + :inline-contents: True + :level: 2 + + * There is lock contention when at least one core spins trying to + enter the critical section lock + + * Lock contention grows with the critical section size, time spent + in the critical section and the number of cores in the system + + +Another negative side effect of spin locks is cache thrashing. + +.. slide:: Cache Thrashing + :inline-contents: True + :level: 2 + + Cache thrashing occurs when multiple cores are trying to read and + write to the same memory resulting in excessive cache misses. 
+ + Since spin locks continuously access memory during lock contention, + cache thrashing is a common occurrence due to the way cache + coherency is implemented. + + +Cache coherency in multi-processor systems +========================================== + +The memory hierarchy in multi-processor systems is composed of local +CPU caches (L1 caches), shared CPU caches (L2 caches) and the main +memory. To explain cache coherency we will ignore the L2 cache and +only consider the L1 caches and main memory. + +In the figure below we present a view of the memory hierarchy with two +variables A and B that fall into different cache lines and where +caches and the main memory are synchronized: + +.. slide:: Synchronized caches and memory + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + +-------+ +-------+ + | CPU 0 | | CPU 1 | + +-------+ +-------+ + cache cache + +-------+ +-------+ + A | 1 | | 1 | A + +-------+ +-------+ + B | 2 | | 2 | B + +-------+ +-------+ + memory + +-----------------------------+ + A | 1 | + +-----------------------------+ + B | 2 | + +-----------------------------+ + + +In the absence of a synchronization mechanism between the caches and +main memory, when CPU 0 executes `A = A + B` and CPU 1 executes `B = +A + B` we will have the following memory view: + +.. slide:: Unsynchronized caches and memory + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + +-------+ +-------+ + | CPU 0 | | CPU 1 | + +-------+ +-------+ + A <- A + B B <- A + B + + +-------+ +-------+ + A | 3 | | 1 | A + +-------+ +-------+ + B | 2 | | 3 | B + +-------+ +-------+ + write back caches + +-----------------------------+ + A | 1 | + +-----------------------------+ + B | 2 | + +-----------------------------+ + + +In order to avoid the situation above multi-processor systems use +cache coherency protocols. There are two main types of cache coherency +protocols: + +.. 
slide:: Cache Coherency Protocols + :inline-contents: True + :level: 2 + + * Bus snooping (sniffing) based: memory bus transactions are + monitored by caches and they take actions to preserve + coherency + + * Directory based: there is a separate entity (directory) that + maintains the state of caches; caches interact with the directory + to preserve coherency + + Bus snooping is simpler but it performs poorly when the number of + cores goes beyond 32-64. + + Directory based cache coherence protocols scale much better (up + to thousands of cores) and are usually used in NUMA systems. + + +A simple cache coherency protocol that is commonly used in practice is +MESI (named after the acronym of the cache line state names: +**Modified**, **Exclusive**, **Shared** and **Invalid**). Its main +characteristics are: + +.. slide:: MESI Cache Coherence Protocol + :inline-contents: True + :level: 2 + + * Caching policy: write back + + * Cache line states + + * Modified: owned by a single core and dirty + + * Exclusive: owned by a single core and clean + + * Shared: shared between multiple cores and clean + + * Invalid : the line is not cached + +Issuing read or write requests from CPU cores will trigger state +transitions, as exemplified below: + +.. slide:: MESI State Transitions + :inline-contents: True + :level: 2 + + * Invalid -> Exclusive: read request, all other cores have the line + in Invalid; line loaded from memory + + * Invalid -> Shared: read request, at least one core has the line + in Shared or Exclusive; line loaded from sibling cache + + * Invalid/Shared/Exclusive -> Modified: write request; **all + other** cores **invalidate** the line + + * Modified -> Invalid: write request from other core; line is + flushed to memory + + +.. note:: The most important characteristic of the MESI protocol is + that it is a write-invalidate cache protocol. When writing to a + shared location all other caches are invalidated.
+ +This has important performance impact in certain access patterns, and +one such pattern is contention for a simple spin lock implementation +like we discussed above. + +To exemplify this issue lets consider a system with three CPU cores, +where the first has acquired the spin lock and it is running the +critical section while the other two are spinning waiting to enter the +critical section: + +.. slide:: Cache thrashing due to spin lock contention + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + +-------+ +-------+ +-------+ + | CPU 0 |<---------------+ | CPU 1 | Invalidate | CPU 0 | + | cache |<-------------+ | | cache |<---+ +---------->| cache | + +-------+ Invalidate | | +-------+ | | +-------+ + | | | | + | | +----------------------------+ + spin_lock(&lock); | | | | + | | READ lock | | + | +---- WRITE lock ---+ | + | | + | READ lock | + +-------------------------------- WRITE lock ----+ + + ... ... ... + READ data READ lock READ lock + | | | + | | | + | | | + +------------------------------+-------------------------+ + | + v + + cache miss + +As it can be seen from the figure above due to the writes issued by +the cores spinning on the lock we see frequent cache line invalidate +operations which means that basically the two waiting cores will flush +and load the cache line while waiting for the lock, creating +unnecessary traffic on the memory bus and slowing down memory accesses +for the first core. + +Another issue is that most likely data accessed by the first CPU +during the critical section is stored in the same cache line with the +lock (common optimization to have the data ready in the cache after +the lock is acquired). Which means that the cache invalidation +triggered by the two other spinning cores will slow down the execution +of the critical section which in turn triggers more cache invalidate +actions. 
+ +Optimized spin locks +==================== + +As we have seen simple spin lock implementations can have poor +performance due to cache thrashing, especially as the number of +cores increases. To avoid this issue there are two possible strategies: + +* reduce the number of writes and thus reduce the number of cache + invalidate operations + +* avoid the other processors spinning on the same cache line, and thus + avoid the cache invalidate operations + + +An optimized spin lock implementation that uses the first approach is +presented below: + +.. slide:: Optimized spin lock (KeAcquireSpinLock) + :inline-contents: True + :level: 2 + + |_| + + .. code-block:: asm + + spin_lock: + rep ; nop + test lock_addr, 1 + jnz spin_lock + lock bts lock_addr + jc spin_lock + + + * we first test the lock read only, using non-atomic + instructions, to avoid writes and thus invalidate operations + while we spin + + * only when the lock *might* be free, we try to acquire it + +The implementation also uses the **PAUSE** instruction to avoid +pipeline flushes due to (false positive) memory order violations and +to add a small delay (proportional with the memory bus frequency) to +reduce power consumption. + +A similar implementation with support for fairness (the CPU cores are +allowed in the critical section based on the time of arrival) is used +in the Linux kernel (the `ticket spin lock `_) +for many architectures. + +However, for the x86 architecture, the current spin lock +implementation uses a queued spin lock where the CPU cores spin on +different locks (hopefully distributed in different cache lines) to +avoid cache invalidation operations: + +.. slide:: Queued Spin Locks + :inline-contents: True + :level: 2 + + |_| + + ..
ditaa:: + + +-------------------------------------------+ + | Queued Spin Lock cEEE | + | | + | +---+ +---+ +---+ +---+ | + | | |----->| |----->| |----->| | | + | +---+ +---+ +---+ +---+ | + | ^ ^ ^ ^ | + | | | | | | + +-------------------------------------------+ + | | | | + CPU10 CPU17 CPU99 CPU0 + owns the spins on spins on spins on + lock private private private + lock lock lock + + + +Conceptually, when a new CPU core tries to acquire the lock and it +fails it will add its private lock to the list of waiting CPU +cores. When the lock owner exits the critical section it unlocks the +next lock in the list, if any. + +While a read spin optimized spin lock reduces most of the cache +invalidation operations, the lock owner can still generate cache +invalidate operations due to writes to data structures close to the +lock and thus part of the same cache line. This in turn generates +memory traffic on subsequent reads on the spinning cores. + +Hence, queued spin locks scale much better for large number of cores +as is the case for NUMA systems. And since they have similar fairness +properties as the ticket lock it is the preferred implementation on +the x86 architecture. + + +Process and Interrupt Context Synchronization +============================================= + +Accessing shared data from both process and interrupt context is a +relatively common scenario. On single core systems we can do this by +disabling interrupts, but that won't work on multi-core systems, +as we can have the process running on one CPU core and the interrupt +context running on a different CPU core. + +Using a spin lock, which was designed for multi-processor systems, +seems like the right solution, but doing so can cause common +deadlock conditions, as detailed by the following scenario: + + +.. 
slide:: Process and Interrupt Handler Synchronization Deadlock + :inline-contents: True + :level: 2 + + * In the process context we take the spin lock + + * An interrupt occurs and it is scheduled on the same CPU core + + * The interrupt handler runs and tries to take the spin lock + + * The current CPU will deadlock + + +To avoid this issue a two fold approach is used: + + +.. slide:: Interrupt Synchronization for SMP + :inline-contents: True + :level: 2 + + * In process context: disable interrupts and acquire a spin lock; + this will protect both against interrupt and other CPU cores race + conditions (:c:func:`spin_lock_irqsave` and + :c:func:`spin_unlock_irqrestore` combine the two operations) + + * In interrupt context: take a spin lock; this will protect + against race conditions with other interrupt handlers or process + context running on different processors + + +We have the same issue for other interrupt context handlers such as +softirqs, tasklets or timers and while disabling interrupts might +work, it is recommended to use dedicated APIs: + +.. slide:: Bottom-Half Synchronization for SMP + :inline-contents: True + :level: 2 + + * In process context use :c:func:`spin_lock_bh` (which combines + :c:func:`local_bh_disable` and :c:func:`spin_lock`) and + :c:func:`spin_unlock_bh` (which combines :c:func:`spin_unlock` and + :c:func:`local_bh_enable`) + + * In bottom half context use: :c:func:`spin_lock` and + :c:func:`spin_unlock` (or :c:func:`spin_lock_irqsave` and + :c:func:`spin_unlock_irqrestore` if sharing data with interrupt + handlers) + + +As mentioned before, another source of concurrency in the Linux kernel +can be other processes, due to preemption. + +.. slide:: Preemption + :inline-contents: True + :level: 2 + + |_| + + Preemption is configurable: when active it provides better latency + and response time, while when deactivated it provides better + throughput.
+ + Preemption is disabled by spin locks and mutexes but it can be + manually disabled as well (by core kernel code). + + +As for local interrupt enabling and disabling APIs, the bottom half +and preemption APIs allows them to be used in overlapping critical +sections. A counter is used to track the state of bottom half and +preemption. In fact the same counter is used, with different increment +values: + +.. slide:: Preemption and Bottom-Half Masking + :inline-contents: True + :level: 2 + + .. code-block:: c + + #define PREEMPT_BITS 8 + #define SOFTIRQ_BITS 8 + #define HARDIRQ_BITS 4 + #define NMI_BITS 1 + + #define preempt_disable() preempt_count_inc() + + #define local_bh_disable() add_preempt_count(SOFTIRQ_OFFSET) + + #define local_bh_enable() sub_preempt_count(SOFTIRQ_OFFSET) + + #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) + + #define in_interrupt() irq_count() + + asmlinkage void do_softirq(void) + { + if (in_interrupt()) return; + ... + + +Mutexes +======= + +Mutexes are used to protect against race conditions from other CPU +cores but they can only be used in **process context**. As opposed to +spin locks, while a thread is waiting to enter the critical section it +will not use CPU time, but instead it will be added to a waiting queue +until the critical section is vacated. + +Since mutexes and spin locks usage intersect, it is useful to compare +the two: + +.. slide:: Mutexes + :inline-contents: True + :level: 2 + + * They don't "waste" CPU cycles; system throughput is better than + spin locks if context switch overhead is lower than medium + spinning time + + * They can't be used in interrupt context + + * They have a higher latency than spin locks + +Conceptually, the :c:func:`mutex_lock` operation is relatively simple: +if the mutex is not acquired we can take the fast path via an atomic +exchange operation: + + +.. slide:: :c:func:`mutex_lock` fast path + :inline-contents: True + :level: 2 + + .. 
code-block:: c + + void __sched mutex_lock(struct mutex *lock) + { + might_sleep(); + + if (!__mutex_trylock_fast(lock)) + __mutex_lock_slowpath(lock); + } + + static __always_inline bool __mutex_trylock_fast(struct mutex *lock) + { + unsigned long curr = (unsigned long)current; + + if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + return true; + + return false; + } + + +otherwise we take the slow path where we add ourselves to the mutex +waiting list and put ourselves to sleep: + +.. slide:: :c:func:`mutex_lock` slow path + :inline-contents: True + :level: 2 + + .. code-block:: c + + ... + spin_lock(&lock->wait_lock); + ... + /* add waiting tasks to the end of the waitqueue (FIFO): */ + list_add_tail(&waiter.list, &lock->wait_list); + ... + waiter.task = current; + ... + for (;;) { + if (__mutex_trylock(lock)) + goto acquired; + ... + spin_unlock(&lock->wait_lock); + ... + set_current_state(state); + spin_lock(&lock->wait_lock); + } + spin_lock(&lock->wait_lock); + acquired: + __set_current_state(TASK_RUNNING); + mutex_remove_waiter(lock, &waiter, current); + spin_lock(&lock->wait_lock); + ... + +The full implementation is a bit more complex: instead of going to +sleep immediately it performs optimistic spinning if it detects that the lock +owner is currently running on a different CPU as chances are the owner +will release the lock soon. It also checks for signals and handles +mutex debugging for the locking dependency engine debug feature. + + +The :c:func:`mutex_unlock` operation is symmetric: if there are no +waiters on the mutex then we can take the fast path via an atomic exchange +operation: + +.. slide:: :c:func:`mutex_unlock` fast path + :inline-contents: True + :level: 2 + + ..
code-block:: c + + void __sched mutex_unlock(struct mutex *lock) + { + if (__mutex_unlock_fast(lock)) + return; + __mutex_unlock_slowpath(lock, _RET_IP_); + } + + static __always_inline bool __mutex_unlock_fast(struct mutex *lock) + { + unsigned long curr = (unsigned long)current; + + if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr) + return true; + + return false; + } + + void __mutex_lock_slowpath(struct mutex *lock) + { + ... + if (__mutex_waiter_is_first(lock, &waiter)) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + ... + + +.. note:: Because :c:type:`struct task_struct` is cached aligned the 7 + lower bits of the owner field can be used for various flags, + such as :c:type:`MUTEX_FLAG_WAITERS`. + + +Otherwise we take the slow path where we pick up first waiter from the +list and wake it up: + +.. slide:: :c:func:`mutex_unlock` slow path + :inline-contents: True + :level: 2 + + .. code-block:: c + + ... + spin_lock(&lock->wait_lock); + if (!list_empty(&lock->wait_list)) { + /* get the first entry from the wait-list: */ + struct mutex_waiter *waiter; + waiter = list_first_entry(&lock->wait_list, struct mutex_waiter, + list); + next = waiter->task; + wake_q_add(&wake_q, next); + } + ... + spin_unlock(&lock->wait_lock); + ... + wake_up_q(&wake_q); + + + +Per CPU data +============ + +Per CPU data avoids race conditions by avoiding to use shared +data. Instead, an array sized to the maximum possible CPU cores is +used and each core will use its own array entry to read and write +data. This approach certainly has advantages: + + +.. slide:: Per CPU data + :inline-contents: True + :level: 2 + + * No need to synchronize to access the data + + * No contention, no performance impact + + * Well suited for distributed processing where aggregation is only + seldom necessary (e.g. statistics counters) + + +Memory Ordering and Barriers +============================ + +Modern processors and compilers employ out-of-order execution to +improve performance. 
For example, processors can execute "future" +instructions while waiting for current instruction data to be fetched +from memory. + +Here is an example of out of order compiler generated code: + +.. slide:: Out of Order Compiler Generated Code + :inline-contents: True + :level: 2 + + +-------------------+-------------------------+ + | C code | Compiler generated code | + +-------------------+-------------------------+ + |.. code-block:: c |.. code-block:: asm | + | | | + | a = 1; | MOV R10, 1 | + | b = 2; | MOV R11, 2 | + | | STORE R11, b | + | | STORE R10, a | + +-------------------+-------------------------+ + + +.. note:: When executing instructions out of order the processor makes + sure that data dependency is observed, i.e. it won't execute + instructions whose input depend on the output of a previous + instruction that has not been executed. + +In most cases out of order execution is not an issue. However, in +certain situations (e.g. communicating via shared memory between +processors or between processors and hardware) we must issue some +instructions before others even without data dependency between them. + +For this purpose we can use barriers to order memory operations: + +.. slide:: Barriers + :inline-contents: True + :level: 2 + + * A read barrier (:c:func:`rmb()`, :c:func:`smp_rmb()`) is used to + make sure that no read operation crosses the barrier; that is, + all read operation before the barrier are complete before + executing the first instruction after the barrier + + * A write barrier (:c:func:`wmb()`, :c:func:`smp_wmb()`) is used to + make sure that no write operation crosses the barrier + + * A simple barrier (:c:func:`mb()`, :c:func:`smp_mb()`) is used + to make sure that no write or read operation crosses the barrier + + +Read Copy Update (RCU) +====================== + +Read Copy Update is a special synchronization mechanism similar with +read-write locks but with significant improvements over it (and some +limitations): + +.. 
slide:: Read Copy Update (RCU) + :level: 2 + :inline-contents: True + + * **Read-only** lock-less access at the same time with write access + + * Write accesses still require locks in order to avoid races + between writers + + * Requires unidirectional traversal by readers + + +In fact, the read-write locks in the Linux kernel have been deprecated +and then removed, in favor of RCU. + +Implementing RCU for a new data structure is difficult, but a few +common data structures (lists, queues, trees) do have RCU APIs that +can be used. + +RCU splits removal updates to the data structures in two phases: + +.. slide:: Removal and Reclamation + :inline-contents: True + :level: 2 + + * **Removal**: removes references to elements. Some old readers may + still see the old reference so we can't free the element. + + * **Reclamation**: free the element. This action is postponed until + all existing readers finish traversal (quiescent cycle). New + readers won't affect the quiescent cycle. + + +As an example, let's take a look at how to delete an element from a +list using RCU: + +.. slide:: RCU List Delete + :inline-contents: True + :level: 2 + + |_| + + .. ditaa:: + + (1) List Traversal (2) Removal + +-----------+ + +-----+ +-----+ +-----+ +-----+ | +-----+ | +-----+ + | | | | | | | | | | | | | | + | A |---->| B |---->| C | | A |--+ | B |--+->| C | + | | | | | | | | | | | | + +-----+ +-----+ +-----+ +-----+ +-----+ +-----+ + ^ ^ ^ ^ ^ ^ + | | | | | | + + + + + + + + (3) Quiescent cycle over (4) Reclamation + +-----------+ + +-----+ | +-----+ | +-----+ +-----+ +-----+ + | | | | | | | | | | | | + | A |--+ | B | +->| C | | A |---------------->| C | + | | | | | | | | | | + +-----+ +-----+ +-----+ +-----+ +-----+ + ^ ^ ^ ^ + | | | | + + +In the first step it can be seen that while readers traverse the list +all elements are referenced. In step two a writer removes +element B. Reclamation is postponed since there are still readers that +hold references to it.
In step three a quiescent cycle just expired +and it can be noticed that there are no more references to +element B. Other elements still have references from readers that +started the list traversal after the element was removed. In step 4 we +finally perform reclamation (free the element). + + +Now that we covered how RCU functions at the high level, lets looks at +the APIs for traversing the list as well as adding and removing an +element to the list: + + +.. slide:: RCU list APIs cheat sheet + :inline-contents: True + :level: 2 + + .. code-block:: c + + /* list traversal */ + rcu_read_lock(); + list_for_each_entry_rcu(i, head) { + /* no sleeping, blocking calls or context switch allowed */ + } + rcu_read_unlock(); + + + /* list element delete */ + spin_lock(&lock); + list_del_rcu(&node->list); + spin_unlock(&lock); + synchronize_rcu(); + kfree(node); + + /* list element add */ + spin_lock(&lock); + list_add_rcu(head, &node->list); + spin_unlock(&lock); + diff --git a/refs/pull/405/merge/_sources/lectures/syscalls.rst.txt b/refs/pull/405/merge/_sources/lectures/syscalls.rst.txt new file mode 100644 index 00000000..49d864f3 --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/syscalls.rst.txt @@ -0,0 +1,611 @@ +============ +System Calls +============ + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives: +=================== + +.. slide:: System Calls + :inline-contents: True + :level: 2 + + * Linux system calls implementation + + * VDSO and virtual syscalls + + * Accessing user space from system calls + + + +Linux system calls implementation +================================= + +At a high level system calls are "services" offered by the kernel to +user applications and they resemble library APIs in that they are +described as a function call with a name, parameters, and return value. + +.. slide:: System Calls as Kernel services + :inline-contents: True + :level: 2 + + |_| + + .. 
ditaa:: + + +-------------+ +-------------+ + | Application | | Application | + +-------------+ +-------------+ + | | + |read(fd, buff, len) |fork() + | | + v v + +---------------------------------------+ + | Kernel | + +---------------------------------------+ + + +However, on a closer look, we can see that system calls are actually +not function calls, but specific assembly instructions (architecture +and kernel specific) that do the following: + +.. slide:: System Call Setup + :inline-contents: True + :level: 2 + + * setup information to identify the system call and its parameters + + * trigger a kernel mode switch + + * retrieve the result of the system call + +In Linux, system calls are identified by numbers and the parameters +for system calls are machine word sized (32 or 64 bit). There can be a +maximum of 6 system call parameters. Both the system call number and +the parameters are stored in certain registers. + +For example, on 32bit x86 architecture, the system call identifier is +stored in the EAX register, while parameters in registers EBX, ECX, +EDX, ESI, EDI, EBP. + +.. slide:: Linux system call setup + :inline-contents: False + :level: 2 + + * System calls are identified by numbers + + * The parameters for system calls are machine word sized (32 or 64 + bit) and they are limited to a maximum of 6 + + * Uses registers to store them both (e.g. for 32bit x86: EAX for + system call and EBX, ECX, EDX, ESI, EDI, EBP for parameters) + +System libraries (e.g. libc) offers functions that implement the +actual system calls in order to make it easier for applications to use +them. + +When a user to kernel mode transition occurs, the execution flow is +interrupted and it is transferred to a kernel entry point. This is +similar to how interrupts and exceptions are handled (in fact on some +architectures this transition happens as a result of an exception). 
+ +The system call entry point will save registers (which contains values +from user space, including system call number and system call +parameters) on stack and then it will continue with executing the +system call dispatcher. + +.. note:: During the user - kernel mode transition the stack is also + switched from the user stack to the kernel stack. This is + explained in more details in the interrupts lecture. + +.. slide:: Example of Linux system call setup and handling + :inline-contents: True + :level: 2 + + .. ditaa:: + + +-------------+ dup2 +-----------------------------+ + | Application |-----+ | libc | + +-------------+ | | | + +---->| C7590 dup2: | + | ... | + | C7592 movl 0x8(%esp),%ecx | + | C7596 movl 0x4(%esp),%ebx | + | C759a movl $0x3f,%eax | + +------------------------------+ C759f int $0x80 | + | | ... +<-----+ + | +-----------------------------+ | + | | + | | + | | + | | + | +------------------------------------------------------------+ | + | | Kernel | | + | | | | + +--->|ENTRY(entry_INT80_32) | | + | ASM_CLAC | | + | pushl %eax # pt_regs->orig_ax | | + | SAVE_ALL pt_regs_ax=$-ENOSYS # save rest | | + | ... | | + | movl %esp, %eax | | + | call do_int80_syscall_32 | | + | .... | | + | RESTORE_REGS 4 # skip orig_eax/error_code | | + | ... | | + | INTERRUPT_RETURN +-+ + +------------------------------------------------------------+ + + +The purpose of the system call dispatcher is to verify the system call +number and run the kernel function associated with the system call. + +.. slide:: Linux System Call Dispatcher + :inline-contents: True + :level: 2 + + .. 
code-block:: c + + /* Handles int $0x80 */ + __visible void do_int80_syscall_32(struct pt_regs *regs) + { + enter_from_user_mode(); + local_irq_enable(); + do_syscall_32_irqs_on(regs); + } + + /* simplified version of the Linux x86 32bit System Call Dispatcher */ + static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) + { + unsigned int nr = regs->orig_ax; + + if (nr < IA32_NR_syscalls) + regs->ax = ia32_sys_call_table[nr](regs->bx, regs->cx, + regs->dx, regs->si, + regs->di, regs->bp); + syscall_return_slowpath(regs); + } + + + +To demonstrate the system call flow we are going to use the virtual +machine setup, attach gdb to a running kernel, add a breakpoint to the +dup2 system call and inspect the state. + +.. slide:: Inspecting dup2 system call + :inline-contents: True + :level: 2 + + |_| + + .. asciicast:: ../res/syscalls-inspection.cast + + +In summary, this is what happens during a system call: + +.. slide:: System Call Flow Summary + :inline-contents: True + :level: 2 + + * The application is setting up the system call number and + parameters and it issues a trap instruction + + * The execution mode switches from user to kernel; the CPU switches + to a kernel stack; the user stack and the return address to user + space is saved on the kernel stack + + * The kernel entry point saves registers on the kernel stack + + * The system call dispatcher identifies the system call function + and runs it + + * The user space registers are restored and execution is switched + back to user (e.g. calling IRET) + + * The user space application resumes + + +System call table +----------------- + +The system call table is what the system call dispatcher uses to map +system call numbers to kernel functions: + +.. slide:: System Call Table + :inline-contents: True + :level: 2 + + .. code-block:: c + + #define __SYSCALL_I386(nr, sym, qual) [nr] = sym, + + const sys_call_ptr_t ia32_sys_call_table[] = { + [0 ... 
__NR_syscall_compat_max] = &sys_ni_syscall, + #include + }; + + .. code-block:: c + + __SYSCALL_I386(0, sys_restart_syscall) + __SYSCALL_I386(1, sys_exit) + __SYSCALL_I386(2, sys_fork) + __SYSCALL_I386(3, sys_read) + __SYSCALL_I386(4, sys_write) + #ifdef CONFIG_X86_32 + __SYSCALL_I386(5, sys_open) + #else + __SYSCALL_I386(5, compat_sys_open) + #endif + __SYSCALL_I386(6, sys_close) + + + +System call parameters handling +------------------------------- + +Handling system call parameters is tricky. Since these values are +setup by user space, the kernel can not assume correctness and must +always verify them thoroughly. + +Pointers have a few important special cases that must be checked: + +.. slide:: System Calls Pointer Parameters + :inline-contents: True + :level: 2 + + * Never allow pointers to kernel-space + + * Check for invalid pointers + + +Since system calls are executed in kernel mode, they have access to +kernel space and if pointers are not properly checked user +applications might get read or write access to kernel space. + +For example, let's consider the case where such a check is not made for +the read or write system calls. If the user passes a kernel-space +pointer to a write system call then it can get access to kernel data +by later reading the file. If it passes a kernel-space pointer to a +read system call then it can corrupt kernel memory. + + +.. slide:: Pointers to Kernel Space + :level: 2 + + * User access to kernel data if allowed in a write system call + + * User corrupting kernel data if allowed in a read system call + + +Likewise, if a pointer passed by the application is invalid +(e.g. unmapped, read-only for cases where it is used for writing), it +could "crash" the kernel. Two approaches could be used: + +.. 
slide:: Invalid pointers handling approaches + :inline-contents: True + :level: 2 + + * Check the pointer against the user address space before using it, + or + + * Avoid checking the pointer and rely on the MMU to detect when the + pointer is invalid and use the page fault handler to determine + that the pointer was invalid + + +Although it sounds tempting, the second approach is not that easy to +implement. The page fault handler uses the fault address (the address +that was accessed), the faulting address (the address of the +instruction that did the access) and information from the user address +space to determine the cause: + +.. slide:: Page fault handling + :inline-contents: True + :level: 2 + + * Copy on write, demand paging, swapping: both the fault and + faulting addresses are in user space; the fault address is + valid (checked against the user address space) + + * Invalid pointer used in system call: the faulting address is + in kernel space; the fault address is in user space and it is + invalid + + * Kernel bug (kernel accesses invalid pointer): same as above + +But in the last two cases we don't have enough information to +determine the cause of the fault. + +In order to solve this issue, Linux uses special APIs (e.g. +:c:func:`copy_to_user`) to access user space that are specially +crafted: + +.. slide:: Marking kernel code that accesses user space + :inline-contents: True + :level: 2 + + * The exact instructions that access user space are recorded in a + table (exception table) + + * When a page fault occurs the faulting address is checked against + this table + + +Although the fault handling case may be more costly overall depending +on the address space vs exception table size, and it is more complex, +it is optimized for the common case and that is why it is preferred +and used in Linux. + + +..
slide:: Cost analysis for pointer checks vs fault handling + :inline-contents: True + :level: 2 + + +------------------+-----------------------+------------------------+ + | Cost | Pointer checks | Fault handling | + +==================+=======================+========================+ + | Valid address | address space search | negligible | + +------------------+-----------------------+------------------------+ + | Invalid address | address space search | exception table search | + +------------------+-----------------------+------------------------+ + + +Virtual Dynamic Shared Object (VDSO) +==================================== + +The VDSO mechanism was born out of the necessity of optimizing the +system call implementation, in a way that does not impact libc with +having to track the CPU capabilities in conjunction with the kernel +version. + +For example, x86 has two ways of issuing system calls: int 0x80 and +sysenter. The latter is significantly faster so it should be used when +available. However, it is only available for processors newer than +Pentium II and only for kernel versions greater than 2.6. + +With VDSO the system call interface is decided by the kernel: + +.. slide:: Virtual Dynamic Shared Object (VDSO) + :inline-contents: True + :level: 2 + + * a stream of instructions to issue the system call is generated by + the kernel in a special memory area (formatted as an ELF shared + object) + + * that memory area is mapped towards the end of the user address + space + + * libc searches for VDSO and if present will use it to issue the + system call + + +.. slide:: Inspecting VDSO + :inline-contents: True + :level: 2 + + |_| + + .. asciicast:: ../res/syscalls-vdso.cast + + + +An interesting development of the VDSO is the virtual system calls +(vsyscalls) which run directly from user space. 
These vsyscalls are +also part of VDSO and they are accessing data from the VDSO page that +is either static or modified by the kernel in a separate read-write +map of the VDSO page. Examples of system calls that can be implemented +as vsyscalls are: getpid or gettimeofday. + + +.. slide:: Virtual System Calls (vsyscalls) + :inline-contents: True + :level: 2 + + * "System calls" that run directly from user space, part of the VDSO + + * Static data (e.g. getpid()) + + * Dynamic data updated by the kernel in a RW map of the VDSO + (e.g. gettimeofday(), time()) + + +Accessing user space from system calls +====================================== + +As we mentioned earlier, user space must be accessed with special APIs +(:c:func:`get_user`, :c:func:`put_user`, :c:func:`copy_from_user`, +:c:func:`copy_to_user`) that check whether the pointer is in user space +and also handle the fault if the pointer is invalid. In case of invalid +pointers, they return a non-zero value. + +.. slide:: Accessing user space from system calls + :inline-contents: True + :level: 2 + + .. code-block:: c + + /* OK: return -EFAULT if user_ptr is invalid */ + if (copy_from_user(&kernel_buffer, user_ptr, size)) + return -EFAULT; + + /* NOK: only works if user_ptr is valid otherwise crashes kernel */ + memcpy(&kernel_buffer, user_ptr, size); + + +Let's examine the simplest API, get_user, as implemented for x86: + +.. slide:: get_user implementation + :inline-contents: True + :level: 2 + + ..
code-block:: c + + #define get_user(x, ptr) \ + ({ \ + int __ret_gu; \ + register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ + __chk_user_ptr(ptr); \ + might_fault(); \ + asm volatile("call __get_user_%P4" \ + : "=a" (__ret_gu), "=r" (__val_gu), \ + ASM_CALL_CONSTRAINT \ + : "0" (ptr), "i" (sizeof(*(ptr)))); \ + (x) = (__force __typeof__(*(ptr))) __val_gu; \ + __builtin_expect(__ret_gu, 0); \ + }) + + +The implementation uses inline assembly, which allows inserting ASM +sequences in C code and also handles access to/from variables in the +ASM code. + +Based on the type size of the x variable, one of __get_user_1, +__get_user_2 or __get_user_4 will be called. Also, before executing +the assembly call, ptr will be moved to the first register EAX while +after the completion of the assembly part the value of EAX will be moved +to __ret_gu and the EDX register will be moved to __val_gu. + +It is equivalent to the following pseudo code: + + +.. slide:: get_user pseudo code + :inline-contents: True + :level: 2 + + .. code-block:: c + + #define get_user(x, ptr) \ + movl ptr, %eax \ + call __get_user_1 \ + movl %edx, x \ + movl %eax, result \ + + + +The __get_user_1 implementation for x86 is the following: + +.. slide:: get_user_1 implementation + :inline-contents: True + :level: 2 + + .. code-block:: none + + .text + ENTRY(__get_user_1) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user + ASM_STAC + 1: movzbl (%_ASM_AX),%edx + xor %eax,%eax + ASM_CLAC + ret + ENDPROC(__get_user_1) + + bad_get_user: + xor %edx,%edx + mov $(-EFAULT),%_ASM_AX + ASM_CLAC + ret + END(bad_get_user) + + _ASM_EXTABLE(1b,bad_get_user) + +The first two statements check the pointer (which is stored in EAX) +against the addr_limit field of the current task (process) descriptor to +make sure that we don't have a pointer to kernel space.
+ +Then, SMAP is disabled, to allow access to user from kernel, and the +access to user space is done with the instruction at the 1: label. EAX +is then zeroed to mark success, SMAP is enabled, and the call returns. + +The movzbl instruction is the one that does the access to user space +and its address is captured with the 1: label and stored in a special +section: + +.. slide:: Exception table entry + :inline-contents: True + :level: 2 + + .. code-block:: c + + /* Exception table entry */ + # define _ASM_EXTABLE_HANDLE(from, to, handler) \ + .pushsection "__ex_table","a" ; \ + .balign 4 ; \ + .long (from) - . ; \ + .long (to) - . ; \ + .long (handler) - . ; \ + .popsection + + # define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + + +For each address that accesses user space we have an entry in the +exception table, that is made up of: the faulting address(from), where +to jump to in case of a fault, and a handler function (that implements +the jump logic). All of these addresses are stored on 32bit in +relative format to the exception table, so that they work for both 32 +and 64 bit kernels. + + +All of the exception table entries are then collected in the +__ex_table section by the linker script: + +.. slide:: Exception table building + :inline-contents: True + :level: 2 + + .. code-block:: c + + #define EXCEPTION_TABLE(align) \ + . = ALIGN(align); \ + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start___ex_table) = .; \ + KEEP(*(__ex_table)) \ + VMLINUX_SYMBOL(__stop___ex_table) = .; \ + } + + +The section is guarded with __start___ex_table and __stop___ex_table +symbols, so that it is easy to find the data from C code. This table +is accessed by the fault handler: + + +.. slide:: Exception table handling + :inline-contents: True + :level: 2 + + .. 
code-block:: c + + bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) + { + regs->ip = ex_fixup_addr(fixup); + return true; + } + + int fixup_exception(struct pt_regs *regs, int trapnr) + { + const struct exception_table_entry *e; + ex_handler_t handler; + + e = search_exception_tables(regs->ip); + if (!e) + return 0; + + handler = ex_fixup_handler(e); + return handler(e, regs, trapnr); + } + + +All it does is to set the return address to the one in the ``to`` field of +the exception table entry which, in case of the get_user exception +table entry, is bad_get_user which returns -EFAULT to the caller. + diff --git a/refs/pull/405/merge/_sources/lectures/virt.rst.txt b/refs/pull/405/merge/_sources/lectures/virt.rst.txt new file mode 100644 index 00000000..78040391 --- /dev/null +++ b/refs/pull/405/merge/_sources/lectures/virt.rst.txt @@ -0,0 +1,651 @@ +============== +Virtualization +============== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +Lecture objectives: +=================== + +.. slide:: Virtualization + :inline-contents: True + :level: 2 + + * Emulation basics + + * Virtualization basics + + * Paravirtualization basics + + * Hardware support for virtualization + + * Overview of the Xen hypervisor + + * Overview of the KVM hypervisor + + +Emulation basics +================ + +.. slide:: Emulation basics + :inline-contents: True + :level: 2 + + * Instructions are emulated (each time they are executed) + + * The other system components are also emulated: + + * MMU + + * Physical memory access + + * Peripherals + + * Target architecture - the architecture that is emulated + + * Host architecture - the architecture that the emulator runs on + + * For emulation target and host architectures can be different + + +Virtualization basics +===================== + +..
slide:: Virtualization basics + :inline-contents: True + :level: 2 + + * Defined in a paper by Popek & Goldberg in 1974 + + * Fidelity + + * Performance + + * Security + + .. ditaa:: + + +----+ +----+ +----+ + | VM | | VM | ... | VM | + +----+ +----+ +----+ + + +-------------------------+ + | Virtual Machine Monitor | + +-------------------------+ + + +-------------------------+ + | Hardware | + +-------------------------+ + + +Classic virtualization +====================== + +.. slide:: Classic virtualization + :inline-contents: True + :level: 2 + + * Trap & Emulate + + * Same architecture for host and target + + * Most of the target instructions are natively executed + + * Target OS runs in non-privilege mode on the host + + * Privileged instructions are trapped and emulated + + * Two machine states: host and guest + + +Software virtualization +======================= + +.. slide:: Software virtualization + :inline-contents: True + :level: 2 + + * Not all architecture can be virtualized; e.g. x86: + + * CS register encodes the CPL + + * Some instructions don't generate a trap (e.g. popf) + + * Solution: emulate instructions using binary translation + + +MMU virtualization +================== + +.. slide:: MMU virtualization + :inline-contents: True + :level: 2 + + * "Fake" VM physical addresses are translated by the host to actual + physical addresses + + * Guest virtual address -> Guest physical address -> Host Physical Address + + * The guest page tables are not directly used by the host hardware + + * VM page tables are verified then translated into a new set of page + tables on the host (shadow page tables) + + +Shadow page tables +------------------ + +.. slide:: Shadow page tables + :inline-contents: True + :level: 2 + + |_| + + .. 
ditaa:: + + PGD PMD PT + +----------+ +----------+ +----------+ + | | | | | | Guest Physical Page + +----------+ +----------+ +----------+ +----------+ + | | | | | |----+ | | + +-----+ +----------+ +----------+ +----------+ | | | + | CR3 | | |----+ | |---+ | | | | | + +-----+ +----------+ | +----------+ | +----------+ +--->+----------+ + | | | | | | | | | + +---------> +----------+ +------>+----------+ +---->+----------+ + Write Protected Write Protected Write Protected + | + | + Guest (VM) | + | trap access + | + ---------------------+------------------------------------------------------------------------------ + | + | check access, transform GPP to HPP + | + v + + Shadow PGD Shadow PMD Shadow PT + +----------+ +----------+ +----------+ + | | | | | | Host Physical Page + +----------+ +----------+ +----------+ +----------+ + | | | | | |----+ | | + +----------+ +----------+ +----------+ | | | + | |----+ | |---+ | | | | | + +----------+ | +----------+ | +----------+ +--->+----------+ + | | | | | | | | + +----------+ +------>+----------+ +---->+----------+ + + + +Lazy shadow sync +---------------- + +.. slide:: Lazy shadow sync + :inline-contents: True + :level: 2 + + * Guest page tables changes are typically batched + + * To avoid repeated traps, checks and transformations map guest + page table entries with write access + + * Update the shadow page table when + + * The TLB is flushed + + * In the host page fault handler + + +I/O emulation +============= + +.. slide:: I/O emulation + :inline-contents: True + :level: 2 + + |_| + + .. 
ditaa:: + + +---------------------+ + | Guest OS | + | +---------------+ | + | | Guest Driver | | + | +---------------+ | + | | ^ | + | | | | + +----+-----------+----+ + | trap | + | access | + +---+-----------+----+ + | | VMM | | + | v | | + | +----------------+ | + | | Virtual Device | | + | +----------------+ | + | | ^ | + | | | | + +--+------------+----+ + | | + v | + +-----------------+ + | Physical Device | + +-----------------+ + + +.. slide:: Example: qemu SiFive UART emulation + :inline-contents: True + :level: 2 + + .. literalinclude:: ../res/sifive_uart.c + :language: c + + +Paravirtualization +================== + +.. slide:: Paravirtualization + :inline-contents: True + :level: 2 + + * Change the guest OS so that it cooperates with the VMM + + * CPU paravirtualization + + * MMU paravirtualization + + * I/O paravirtualization + + * VMM exposes hypercalls for: + + * activate / deactivate the interrupts + + * changing page tables + + * accessing virtualized peripherals + + * VMM uses events to trigger interrupts in the VM + + +Intel VT-x +========== + +.. slide:: Intel VT-x + :inline-contents: True + :level: 2 + + + * Hardware extension to transform x86 to the point it can be + virtualized "classically" + + * New execution mode: non-root mode + + * Each non-root mode instance uses a Virtual Machine Control + Structure (VMCS) to store its state + + * VMM runs in root mode + + * VM-entry and VM-exit are used to transition between the two modes + + +Virtual Machine Control Structure +--------------------------------- + +.. slide:: Virtual Machine Control Structure + :inline-contents: True + :level: 2 + + * Guest information: state of the virtual CPU + + * Host information: state of the physical CPU + + * Saved information: + + * visible state: segment registers, CR3, IDTR, etc. + + * internal state + + * VMCS can not be accessed directly but certain information can be + accessed with special instructions + +VM entry & exit +--------------- + +.. 
slide:: VM entry & exit + :inline-contents: True + :level: 2 + + * VM entry - new instructions that switch the CPU to non-root + mode and load the VM state from a VMCS; host state is saved in + VMCS + + * Allows injecting interrupts and exceptions in the guest + + * VM exit will be automatically triggered based on the VMCS + configuration + + * When VM exit occurs host state is loaded from VMCS, guest state + is saved in VMCS + +VM execution control fields +--------------------------- + +.. slide:: VM execution control fields + :inline-contents: True + :level: 2 + + * Selects conditions which trigger a VM exit; examples: + + * If an external interrupt is generated + + * If an external interrupt is generated and EFLAGS.IF is set + + * If CR0-CR4 registers are modified + + * Exception bitmap - selects which exceptions will generate a VM + exit + + * IO bitmap - selects which I/O addresses (IN/OUT accesses) + generate a VM exit + + * MSR bitmaps - selects which RDMSR or WRMSR instructions will + generate a VM exit + + +Extended Page Tables +==================== + +.. slide:: Extended Page Tables + :inline-contents: True + :level: 2 + + * Reduces the complexity of MMU virtualization and improves + performance + + * Access to CR3, INVLPG and page faults do not require a VM exit + anymore + + * The EPT page table is controlled by the VMM + + .. ditaa:: + + +-----+ +-----+ + | CR3 | | EPT | + +-----+ +-----+ + | +------------------+ | +----------------+ + | | | | | | + +--------> | Guest Page Table | +-------> | EPT Page Table | ---------------> + | | | | + ------------> +------------------+ ------------> +----------------+ + + Guest Virtual Guest Physical Host Physical + Address Address Address + + +VPID +---- + +..
slide:: VPID + :inline-contents: True + :level: 2 + + * VM entry and VM exit force a TLB flush - loses VMM / VM translations + + * To avoid this issue a VPID (Virtual Processor ID) tag is + associated with each VM (VPID 0 is reserved for the VMM) + + * All TLB entries are tagged + + * At VM entry and exit just the entries associated with the tags + are flushed + + * When searching the TLB just the current VPID is used + + +I/O virtualization +================== + + * Direct access to hardware from a VM - in a controlled fashion + + * Map the MMIO host directly to the guest + + * Forward interrupts + +.. slide:: I/O virtualization + :inline-contents: True + :level: 2 + + .. ditaa:: + + +---------------------+ +---------------------+ + | Guest OS | | Guest OS | + | +---------------+ | | +---------------+ | + | | Guest Driver | | | | Guest Driver | | + | +---------------+ | | +---------------+ | + | | ^ | | | ^ | + | | | | | | | | + +----+-----------+----+ +----+-----------+----+ + | traped | | mapped | + | access | | access | + +---+-----------+----+ +---+-----------+-----+ But how do we deal with DMA? + | | VMM | | | | VMM | | + | v | | | | | | + | +----------------+ | | | +---------+ | + | | Virtual Device | | | | | IRQ | | + | +----------------+ | | | | Mapping | | + | | ^ | | | +---------+ | + | | | | | | | | + +--+------------+----+ +---+-----------+-----+ + | | | | + v | v | + +-----------------+ +-----------------+ + | Physical Device | | Physical Device | + +-----------------+ +-----------------+ + +Instead of trapping MMIO as with emulated devices we can allow the +guest to access the MMIO directly by mapping through its page tables. + +Interrupts from the device are handled by the host kernel and a signal +is sent to the VMM which injects the interrupt to the guest just as +for the emulated devices. + + +.. slide:: I/O MMU + :inline-contents: True + :level: 2 + + VT-d protects and translates VM physical addresses using an I/O + MMU (DMA remapping) + + ..
ditaa:: + + +------+ +------+ + | | | | + | CPU | | DMA | + | | | | + +------+ +------+ + | + | + v + +-----+ +-----+ + | CR3 | | EPT | + +-----+ +-----+ + | +------------------+ | +----------------+ + | | | | | | + +--------> | Guest Page Table | +-------> | EPT Page Table | ---------------> + | | | | + ------------> +------------------+ ------------> +----------------+ + + Guest Virtual Guest Physical Host Physical + Address Address Address + + +.. slide:: Interrupt posting + :inline-contents: True + :level: 2 + + * Message Signaled Interrupts (MSI) = DMA writes to the host + address range of the IRQ controller (e.g. 0xFEExxxxx) + + * Low bits of the address and the data indicate which interrupt + vector to deliver to which CPU + + * Interrupt remapping table points to the virtual CPU (VMCS) that + should receive the interrupt + + * I/O MMU will trap the IRQ controller write and look it up in the + interrupt remapping table + + * if that virtual CPU is currently running it will take the + interrupt directly + + * otherwise a bit is set in a table (Posted Interrupt Descriptor + table) and the interrupt will be injected the next time that vCPU is + run + + +.. slide:: I/O virtualization + :inline-contents: True + :level: 2 + + ..
ditaa:: + + +---------------------+ +---------------------+ +---------------------+ + | Guest OS | | Guest OS | | Guest OS | + | +---------------+ | | +---------------+ | | +---------------+ | + | | Guest Driver | | | | Guest Driver | | | | Guest Driver | | + | +---------------+ | | +---------------+ | | +---------------+ | + | | ^ | | | ^ | | | ^ | + | | | | | | | | | | | | + +----+-----------+----+ +----+-----------+----+ +----+-----------+----+ + | traped | | mapped | | mapped | interrupt + | access | | access | | access | posting + +---+-----------+----+ +---+-----------+-----+ +---+-----------+-----+ + | | VMM | | | | VMM | | | | VMM | | + | v | | | | | | | | | | + | +----------------+ | | | +---------+ | | | | | + | | Virtual Device | | | | | IRQ | | | | | | + | +----------------+ | | | | Mapping | | | | | | + | | ^ | | | +---------+ | | | | | + | | | | | | | | | | | | + +--+------------+----+ +---+-----------+-----+ +---+-----------+-----+ + | | | | | | + v | v | v | + +-----------------+ +-----------------+ +-----------------+ + | Physical Device | | Physical Device | | Physical Device | + +-----------------+ +-----------------+ +-----------------+ + + + +.. slide:: SR-IOV + :inline-contents: True + :level: 2 + + * Single Root - Input Output Virtualization + + * Physical device with multiple Ethernet ports will be shown as + multiple device on the PCI bus + + * Physical Function is used for the control and can be configured + + * to present itself as a new PCI device + + * which VLAN to use + + * The new virtual function is enumerated on the bus and can be + assigned to a particular guest + + +qemu +==== + +.. slide:: qemu + :inline-contents: True + :level: 2 + + * Uses binary translation via Tiny Code Generator (TCG) for + efficient emulation + + * Supports different target and host architectures (e.g. 
running + ARM VMs on x86) + + * Both process and full system level emulation + + * MMU emulation + + * I/O emulation + + * Can be used with KVM for accelerated virtualization + +KVM +=== + +.. slide:: KVM + :inline-contents: True + :level: 2 + + .. ditaa:: + + VM1 (qemu) VM2 (qemu) + +---------------------+ +---------------------+ + | +------+ +------+ | | +------+ +------+ | + | | App1 | | App2 | | | | App1 | | App2 | | + | +------+ +------+ | | +------+ +------+ | + | +-----------------+ | | +-----------------+ | + | | Guest Kernel | | | | Guest Kernel | | + | +-----------------+ | | +-----------------+ | + +---------------------+ +---------------------+ + + +----------------------------------------------------+ + | +-----+ | + | | KVM | Host Linux Kernel | + | +-----+ | + +----------------------------------------------------+ + + +----------------------------------------------------+ + | Hardware with virtualization support | + +----------------------------------------------------+ + + +.. slide:: KVM + :inline-contents: True + :level: 2 + + * Linux device driver for hardware virtualization (e.g. Intel VT-x, SVM) + + * IOCTL based interface for managing and running virtual CPUs + + * VMM components implemented inside the Linux kernel + (e.g. interrupt controller, timers) + + * Shadow page tables or EPT if present + + * Uses qemu or virtio for I/O virtualization + + + +Type 1 vs Type 2 Hypervisors +============================ + +.. slide:: Type 1 vs Type 2 Hypervisors + :inline-contents: True + :level: 2 + + * Type 1 = Bare Metal Hypervisor + + * Type 2 = Hypervisor embedded in an existing kernel / OS + + +Xen +=== + +.. slide:: Xen + :inline-contents: True + :level: 2 + + ..
image:: ../res/xen-overview.png diff --git a/refs/pull/405/merge/_sources/so2/assign-collaboration.rst.txt b/refs/pull/405/merge/_sources/so2/assign-collaboration.rst.txt new file mode 100644 index 00000000..188c18bc --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/assign-collaboration.rst.txt @@ -0,0 +1,144 @@ +============= +Collaboration +============= + +Collaboration is essential in the open source world and we encourage you +to pick a team partner to work on selected assignments. + +Here is a simple guide to get you started: + +1. Use GitHub / GitLab +---------------------- + +The best way to share your work inside the team is to use a version control system (VCS) +in order to track each change. Mind that you must make your repo private and only allow +read/write access rights to team members. + +2. Start with a skeleton for the assignment +------------------------------------------- + +Add `init`/`exit` functions, driver operations and global structures that your driver might need. + +..
code-block:: c + + // SPDX-License-Identifier: GPL-2.0 + /* + * uart16550.c - UART16550 driver + * + * Author: John Doe + * Author: Ionut Popescu + */ + struct uart16550_dev { + struct cdev cdev; + /*TODO */ + }; + + static struct uart16550_dev devs[MAX_NUMBER_DEVICES]; + + static int uart16550_open(struct inode *inode, struct file *file) + { + /*TODO */ + return 0; + } + + static int uart16550_release(struct inode *inode, struct file *file) + { + /*TODO */ + return 0; + } + + static ssize_t uart16550_read(struct file *file, char __user *user_buffer, + size_t size, loff_t *offset) + { + /*TODO */ + } + + static ssize_t uart16550_write(struct file *file, + const char __user *user_buffer, + size_t size, loff_t *offset) + { + /*TODO */ + } + + static long + uart16550_ioctl(struct file *file, unsigned int cmd, unsigned long arg) + { + /*TODO */ + return 0; + } + + static const struct file_operations uart16550_fops = { + .owner = THIS_MODULE, + .open = uart16550_open, + .release = uart16550_release, + .read = uart16550_read, + .write = uart16550_write, + .unlocked_ioctl = uart16550_ioctl + }; + + static int __init uart16550_init(void) + { + /* TODO: */ + } + + static void __exit uart16550_exit(void) + { + /* TODO: */ + } + + module_init(uart16550_init); + module_exit(uart16550_exit); + + MODULE_DESCRIPTION("UART16550 Driver"); + MODULE_AUTHOR("John Doe tema2) + Author: John Doe + Date: Mon Apr 4 11:54:39 2022 +0300 + + uart16550: Add initial skeleton for assignment #2 + + This adds a simple skeleton file for uart16550 assignment. Notice + module init/exit callbacks and file_operations dummy implementation + for open/release/read/write/ioctl. + + Signed-off-by: John Doe + +4. Split the work inside the team +--------------------------------- + +Add `TODOs` with each team member's tasks. Try to split the work evenly. + +Before starting to code, make a plan. On top of your skeleton file, add TODOs with each member's tasks. Agree on global +structures and the overall driver design.
Then start coding. + +5. Do reviews +------------- + +Create Pull Requests with your commits and go through review rounds with your team members. You can follow `How to create a PR` `video `_. + +6. Merge the work +----------------- + +The final work is the result of merging all the pull requests. Following the commit messages +one should clearly understand the progress of the code and how the work was managed inside the team. + +.. code-block:: console + + f5118b873294 uart16550: Add uart16550_interrupt implementation + 2115503fc3e3 uart16550: Add uart16550_ioctl implementation + b31a257fd8b8 uart16550: Add uart16550_write implementation + ac1af6d88a25 uart16550: Add uart16550_read implementation + 9f680e8136bf uart16550: Add uart16550_open/release implementation + 3c92a02cc527 uart16550: Add skeleton for SO2 assignment #2 diff --git a/refs/pull/405/merge/_sources/so2/assign0-kernel-api.rst.txt b/refs/pull/405/merge/_sources/so2/assign0-kernel-api.rst.txt new file mode 100644 index 00000000..835eccef --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/assign0-kernel-api.rst.txt @@ -0,0 +1,111 @@ +========================= +Assignment 0 - Kernel API +========================= + +- Deadline: :command:`Monday, 25 March 2024, 23:59` + +Assignment's Objectives +======================= + +* getting familiar with the qemu setup +* loading/unloading kernel modules +* getting familiar with the list API implemented in the kernel +* have fun :) + +Statement +========= + +Write a kernel module called `list` (the resulting file must be called `list.ko`) which stores data (strings) +in an internal list. + +It is mandatory to use `the list API `__ +implemented in the kernel. +For details you can take a look at `the laboratory 2 `__. + +The module exports a directory named :command:`list` to procfs. 
The directory contains two files: + +- :command:`management`: with write-only access; is the interface for transmitting commands to the kernel module +- :command:`preview`: with read-only access; is the interface through which the internal contents of the kernel list can be viewed. + +`The code skeleton `__ implements the two procfs files. +You will need to create a list and implement support for `adding` and `reading` data. Follow the TODOs in the code for details. + +To interact with the kernel list, you must write commands (using the `echo` command) to the `/proc/list/management` file: + +- `addf name`: adds the `name` element to the top of the list +- `adde name`: adds the `name` element to the end of the list +- `delf name`: deletes the first occurrence of the `name` element from the list +- `dela name`: deletes all occurrences of the `name` element in the list + +Viewing the contents of the list is done by viewing the contents of the `/proc/list/preview` file (use the `cat` command). +The format contains one element on each line. + +Testing +======= + +In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, +the assignment evaluation will be done automatically with the help of a +`test script `__ called `_checker`. +The test script assumes that the kernel module is called `list.ko`. + +QuickStart +========== + +It is mandatory to start the implementation of the assignment from the code skeleton found in the `list.c `__ file. +You should follow the instructions in the `README.md file `__ of the `assignment's repo `__. + +Tips +---- + +To increase your chances of getting the highest grade, read and follow the Linux kernel +coding style described in the `Coding Style document `__. + +Also, use the following static analysis tools to verify the code: + +- checkpatch.pl + +.. code-block:: console + + $ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/list.c + +- sparse + +..
code-block:: console + + $ sudo apt-get install sparse + $ cd linux + $ make C=2 /path/to/your/list.c + +- cppcheck + +.. code-block:: console + + $ sudo apt-get install cppcheck + $ cppcheck /path/to/your/list.c + +Penalties +--------- +Information about assignment penalties can be found on the +`General Directions page `__. + +In exceptional cases (the assignment passes the tests without complying with the requirements) +and if the assignment does not pass all the tests, the grade may decrease more than mentioned above. + +Submitting the assignment +------------------------- + +The assignment will be graded automatically using the `vmchecker-next `__ infrastructure. +The submission will be made on moodle on the `course's page `__ to the related assignment. +You will find the submission details in the `README.md file `__ of the `repo `__. + +Resources +========= + +We recommend that you use gitlab to store your homework. Follow the directions in +`README.md file `__. + +Questions +========= + +For questions about the topic, you can consult the mailing `list archives `__ +or you can write a question on the dedicated Teams channel.
diff --git a/refs/pull/405/merge/_sources/so2/assign1-kprobe-based-tracer.rst.txt b/refs/pull/405/merge/_sources/so2/assign1-kprobe-based-tracer.rst.txt new file mode 100644 index 00000000..c419a0fb --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/assign1-kprobe-based-tracer.rst.txt @@ -0,0 +1,182 @@ +================================== +Assignment 1 - Kprobe based tracer +================================== + +- Deadline: :command:`Monday, 8 April 2024, 23:59` + +Assignment's Objectives +======================= + +* gaining knowledge related to the instrumentation of functions in the Linux kernel (``kretprobes`` mechanism) +* gaining knowledge regarding the ``/proc`` file system from the Linux kernel +* get familiar with data structures specific to the Linux kernel (``hash table`` and ``list``) + +Statement +========= + +Build a kernel operations surveillant. + +With this surveillant, we aim to intercept: + +* ``kmalloc`` and ``kfree`` calls +* ``schedule`` calls +* ``up`` and ``down_interruptible`` calls +* ``mutex_lock`` and ``mutex_unlock`` calls + +The surveillant will hold, at the process level, the number of calls for each of the above functions. +For the ``kmalloc`` and ``kfree`` calls the total quantity of allocated and deallocated memory will be +shown. + +The surveillant will be implemented as a kernel module with the name ``tracer.ko``. + +Implementation details +---------------------- + +The interception will be done by recording a sample (``kretprobe``) for each of the above functions. The +surveillant will retain a list/hashtable with the monitored processes and will account for +the above information for these processes. + +For the control of the list/hashtable with the monitored processes, a char device called ``/dev/tracer`` +will be used, with major `10` and minor `42`. 
It will expose an ``ioctl`` interface with two arguments: + +* the first argument is the request to the monitoring subsystem: + + * ``TRACER_ADD_PROCESS`` + * ``TRACER_REMOVE_PROCESS`` + +* the second argument is the PID of the process for which the monitoring request will be executed + +In order to create a char device with major `10` you will need to use the `miscdevice `__ interface in the kernel. +Definitions of related macros can be found in the `tracer.h header `__. + +Since the ``kmalloc`` function is inline for instrumenting the allocated amount of memory, the ``__kmalloc`` +function will be inspected as follows: + +* a ``kretprobe`` will be used, which will retain the amount of memory allocated and the address of the allocated memory area. +* the ``.entry_handler`` and ``.handler`` fields in the ``kretprobe`` structure will be used to retain information about the amount of memory allocated and the address from which the allocated memory starts. + +.. code-block:: C + + static struct kretprobe kmalloc_probe = { + .entry_handler = kmalloc_probe_entry_handler, /* entry handler */ + .handler = kmalloc_probe_handler, /* return probe handler */ + .maxactive = 32, + }; + +Since the ``kfree`` function only receives the address of the memory area to be freed, in order to determine +the total amount of memory freed, we will need to determine its size based on the address of the area. +This is possible because there is an address-size association made when inspecting the ``__kmalloc`` function. + +For the rest of the instrumentation functions it is enough to use a ``kretprobe``. + +.. code-block:: C + + static struct kretprobe up_probe = { + .entry_handler = up_probe_handler, + .maxactive = 32, + }; + +The virtual machine kernel has the ``CONFIG_DEBUG_LOCK_ALLOC`` option enabled where the ``mutex_lock`` symbol +is a macro that expands to ``mutex_lock_nested``. 
Thus, in order to obtain information about the ``mutex_lock`` +function you will have to instrument the ``mutex_lock_nested`` function. + +Processes that have been added to the list/hashtable and that end their execution will be removed +from the list/hashtable. Also, a process will be removed from the dispatch list/hashtable following +the ``TRACER_REMOVE_PROCESS`` operation. + +The information retained by the surveillant will be displayed via the procfs file system, in the ``/proc/tracer`` file. +For each monitored process an entry is created in the ``/proc/tracer`` file having as first field the process PID. +The entry will be read-only, and a read operation on it will display the retained results. An example of +displaying the contents of the entry is: + +.. code-block:: console + + $ cat /proc/tracer + PID kmalloc kfree kmalloc_mem kfree_mem sched up down lock unlock + 42 12 12 2048 2048 124 2 2 9 9 + 1099 0 0 0 0 1984 0 0 0 0 + 1244 0 0 0 0 1221 100 1023 1023 1002 + 1337 123 99 125952 101376 193821 992 81921 7421 6392 + +Testing +======= + +In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, +the assignment evaluation will be done automatically with the help of a +`test script `__ called `_checker`. +The test script assumes that the kernel module is called `tracer.ko`. + +QuickStart +========== + +It is mandatory to start the implementation of the assignment from the code skeleton found in the `src `__ directory. +There is only one header in the skeleton called `tracer.h `__. +You will provide the rest of the implementation. You can add as many ``*.c`` sources and additional ``*.h`` headers as you need. +You should also provide a Kbuild file that will compile the kernel module called `tracer.ko`. +Follow the instructions in the `README.md file `__ of the `assignment's repo `__.
+ + +Tips +---- + +To increase your chances of getting the highest grade, read and follow the Linux kernel +coding style described in the `Coding Style document `__. + +Also, use the following static analysis tools to verify the code: + +- checkpatch.pl + +.. code-block:: console + + $ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/tracer.c + +- sparse + +.. code-block:: console + + $ sudo apt-get install sparse + $ cd linux + $ make C=2 /path/to/your/tracer.c + +- cppcheck + +.. code-block:: console + + $ sudo apt-get install cppcheck + $ cppcheck /path/to/your/tracer.c + +Penalties +--------- + +Information about assignment penalties can be found on the +`General Directions page `__. In addition, the following +elements will be taken into account: + +* *-2*: missing proper disposal of resources (``kretprobes``, entries in ``/proc``) +* *-2*: data synchronization issues for data used by multiple executing instances (e.g. the list/hashtable) + +In exceptional cases (the assignment passes the tests but does not comply with the requirements) +and if the assignment does not pass all the tests, the grade may decrease more than mentioned above. + +Submitting the assignment +------------------------- + +The assignment will be graded automatically using the `vmchecker-next `__ infrastructure. +The submission will be made on moodle on the `course's page `__ to the related assignment. +You will find the submission details in the `README.md file `__ of the `repo `__. + + +Resources +========= + +* `Documentation/kprobes.txt `__ - description of the ``kprobes`` subsystem from Linux kernel sources. +* `samples/kprobes/ `__ - some examples of using ``kprobes`` from Linux kernel sources. + +We recommend that you use gitlab to store your homework. Follow the directions in +`README `__. + +Questions +========= + +For questions about the topic, you can consult the mailing `list archives `__ +or you can write a question on the dedicated Teams channel.
diff --git a/refs/pull/405/merge/_sources/so2/assign2-driver-uart.rst.txt b/refs/pull/405/merge/_sources/so2/assign2-driver-uart.rst.txt new file mode 100644 index 00000000..0622965b --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/assign2-driver-uart.rst.txt @@ -0,0 +1,152 @@ +========================== +Assignment 2 - Driver UART +========================== + +- Deadline: :command:`Monday, 22 April 2024, 23:59` +- The assignment is individual + +Assignment's Objectives +======================= + +* consolidating the knowledge of device drivers +* read hardware documentation and track the desired functionality in the documentation +* work with interrupts; use of non-blocking functions in interrupt context +* use of buffers; synchronization +* kernel modules with parameters + +Statement +========= + +Write a kernel module that implements a driver for the serial port (`UART16550`). +The device driver must support the two standard serial ports in a PC, `COM1` and `COM2` (`0x3f8` and `0x2f8`, +in fact the entire range of `8` addresses `0x3f8-0x3ff` and `0x2f8-0x2ff` specific to the two ports). +In addition to the standard routines (`open`, `read`, `write`, `close`), +the driver must also have support for changing communication parameters using an `ioctl` operation (`UART16550_IOCTL_SET_LINE`). + +The driver must use interrupts for both reception and transmission to reduce latency and CPU usage time. +`Read` and `write` calls must also be blocking. :command:`Assignments that do not meet these requirements will not be considered.` +It is recommended that you use a buffer for the read routine and another buffer for the write routine for each serial port in the driver. + +A blocking read call means that the read routine called from the user-space will be blocked until :command:`at least` one byte is read +(i.e. it blocks while the read buffer in the kernel is empty and no data can be read).
+A blocking write call means that the write routine called from the user-space will be blocked until :command:`at least` one byte is written +(the write buffer in the kernel is full and no data can be written). + +Buffers Scheme +-------------- + +.. image:: ../img/buffers-scheme.png + +Data transfer between the various buffers is a `Producer-Consumer `__ problem. Example: + +- The process is the producer and the device is the consumer if it is written from the process to the device; the process will block until there is at least one free space in the consumer's buffer + +- The process is the consumer and the device is the producer if it is read from a process from the device; the process will block until there is at least one element in the producer's buffer. + +Implementation Details +====================== + +- the driver will be implemented as a kernel module named :command:`uart16550.ko` +- the driver will be accessed as a character device driver, with different functions depending on the parameters transmitted to the load module: + + - the `major` parameter will specify the major with which the device must be registered + - the `option` parameter will specify how it works: + + - OPTION_BOTH: will also register COM1 and COM2, with the major given by the `major` parameter and the minors 0 (for COM1) and 1 (for COM2); + - OPTION_COM1: will only register COM1, with the major `major` and minor 0; + - OPTION_COM2: will only register COM2, with the major `major` and minor 1; + - to learn how to pass parameters in Linux, see `tldp `__ + - the default values are `major=42` and `option=OPTION_BOTH`. 
+- the interrupt number associated with COM1 is 4 (`IRQ_COM1`) and the interrupt number associated with COM2 is 3 (`IRQ_COM2`) +- `the header `__ contains the definitions needed for special operations; +- a starting point in implementing read / write routines is the `example `__ of uppercase / lowercase character device driver; the only difference is that you have to use two buffers, one for read and one for write; +- you can use `kfifo `__ for buffers; +- you do not have to use deferred functions to read / write data from / to ports (you can do everything from interrupt context); +- you will need to synchronize the read / write routines with the interrupt handling routine for the routines to be blocking; it is recommended to use `synchronization with waiting queues `__ +- In order for the assignment to work, the `default serial driver` must be disabled: + + - `cat /proc/ioports | grep serial` will detect the presence of the default driver on the regions where COM1 and COM2 are defined + - in order to deactivate it, the kernel must be recompiled, either by setting the serial driver as a module, or by deactivating it completely (this modification is already made on the virtual machine) + + - `Device Drivers -> Character devices -> Serial driver -> 8250/16550 and compatible serial support.` + +Testing +======= +In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, +the assignment evaluation will be done automatically with the help of a +`test script `__ called `_checker`. +The test script assumes that the kernel module is called `uart16550.ko`. + +QuickStart +========== + +It is mandatory to start the implementation of the assignment from the code skeleton found in the `src `__ directory. +There is only one header in the skeleton called `uart16550.h `__. +You will provide the rest of the implementation. You can add as many ``*.c`` sources and additional ``*.h`` headers as you need.
+You should also provide a Kbuild file that will compile the kernel module called `uart16550.ko`. +Follow the instructions in the `README.md file `__ of the `assignment's repo `__. + + +Tips +---- + +To increase your chances of getting the highest grade, read and follow the Linux kernel +coding style described in the `Coding Style document `__. + +Also, use the following static analysis tools to verify the code: + +- checkpatch.pl + +.. code-block:: console + + $ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/uart16550.c + +- sparse + +.. code-block:: console + + $ sudo apt-get install sparse + $ cd linux + $ make C=2 /path/to/your/uart16550.c + +- cppcheck + +.. code-block:: console + + $ sudo apt-get install cppcheck + $ cppcheck /path/to/your/uart16550.c + +Penalties +--------- + +Information about assignment penalties can be found on the +`General Directions page `__. + +In exceptional cases (the assignment passes the tests without complying with the requirements) +and if the assignment does not pass all the tests, the grade may decrease more than mentioned above. + +Submitting the assignment +------------------------- + +The assignment will be graded automatically using the `vmchecker-next `__ infrastructure. +The submission will be made on moodle on the `course's page `__ to the related assignment. +You will find the submission details in the `README.md file `__ of the `repo `__. + + +Resources +========= + +- serial port documentation can be found on `tldp `__ +- `table with registers `__ +- `datasheet 16550 `__ +- `alternative documentation `__ + +We recommend that you use gitlab to store your homework. Follow the directions in +`README `__. + + +Questions +========= + +For questions about the topic, you can consult the mailing `list archives `__ +or you can write a question on the dedicated Teams channel.
diff --git a/refs/pull/405/merge/_sources/so2/assign3-software-raid.rst.txt b/refs/pull/405/merge/_sources/so2/assign3-software-raid.rst.txt new file mode 100644 index 00000000..e0b574a8 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/assign3-software-raid.rst.txt @@ -0,0 +1,174 @@ +============================ +Assignment 3 - Software RAID +============================ + +- Deadline: :command:`Thursday, 16 May 2024, 23:59` + +Implement a software RAID module that uses a logical block device that will read and write data from two physical devices, +ensuring the consistency and synchronization of data from the two physical devices. The type of RAID implemented will be similar to a `RAID 1`. + +Assignment's Objectives +======================= + +* in-depth understanding of how the I/O subsystem works. +* acquire advanced skills working with `bio` structures. +* work with the block / disk devices in the Linux kernel. +* acquire skills to navigate and understand the code and API dedicated to the I/O subsystem in Linux. + + +Statement +========= + +Write a kernel module that implements the software RAID functionality. `Software RAID `__ provides an abstraction between +the logical device and the physical devices. The implementation will use `RAID scheme 1 `__. + +The virtual machine has two hard disks that will represent the physical devices: `/dev/vdb` and `/dev/vdc`. The operating system +will provide a logical device (block type) that will interface the access from the user space. Write requests to the logical device +will result in two writes, one for each hard disk. Hard disks are not partitioned. It will be considered that each hard disk has a +single partition that covers the entire disk. + +Each partition will store a sector along with an associated checksum (CRC32) to ensure error recovery. At each reading, the related +information from both partitions is read.
If a sector of the first partition has corrupt data (CRC value is wrong) then the sector +on the second partition will be read; at the same time the sector of the first partition will be corrected. The same applies when +a corrupt sector is read from the second partition. If a sector has incorrect CRC values on both partitions, an appropriate error +code will be returned. + +Important to know +----------------- + +To ensure error recovery, a CRC code is associated with each sector. CRC codes are stored starting at byte LOGICAL_DISK_SIZE of the partition +(macro defined in the assignment `header `__). The disk structure will have the following layout: + + +.. code-block:: console + + +-----------+-----------+-----------+ +---+---+---+ + | sector1 | sector2 | sector3 |.....|C1 |C2 |C3 | + +-----------+-----------+-----------+ +---+---+---+ + +where ``C1``, ``C2``, ``C3`` are the CRC values of sectors ``sector1``, ``sector2``, ``sector3``. The CRC area is found immediately after the first ``LOGICAL_DISK_SIZE`` bytes of the partition. + +As a seed for CRC use 0 (zero). 
+ +Implementation Details +====================== + +- the kernel module will be named ``ssr.ko`` +- the logical device will be accessed as a block device with the major ``SSR_MAJOR`` and minor ``SSR_FIRST_MINOR`` under the name ``/dev/ssr`` (via the macro ``LOGICAL_DISK_NAME``) +- the virtual device (``LOGICAL_DISK_NAME`` - ``/dev/ssr``) will have the capacity of ``LOGICAL_DISK_SECTORS`` (use ``set_capacity`` with the ``struct gendisk`` structure) +- the two disks are represented by the devices ``/dev/vdb``, respectively ``/dev/vdc``, defined by means of macros ``PHYSICAL_DISK1_NAME``, respectively ``PHYSICAL_DISK2_NAME`` +- to work with the ``struct block _device`` structure associated with a physical device, you can use the ``blkdev_get_by_path`` and ``blkdev_put`` functions +- for the handling of requests from the user space, we recommend not to use a ``request_queue``, but to do processing at :c:type:`struct bio` level + using the ``submit_bio`` field of :c:type:`struct block_device_operations` +- since data sectors are separated from CRC sectors you will have to build separate ``bio`` structures for data and CRC values +- to allocate a :c:type:`struct bio` for physical disks you can use :c:func:`bio_alloc`; to add data pages to bio use :c:func:`alloc_page` and :c:func:`bio_add_page` +- to free up the space allocated for a :c:type:`struct bio` you need to release the pages allocated to the bio (using the :c:func:`__free_page` macro ) and call + :c:func:`bio_put` +- when generating a :c:type:`struct bio` structure, consider that its size must be multiple of the disk sector size (``KERNEL_SECTOR_SIZE``) +- to send a request to a block device and wait for it to end, you can use the :c:func:`submit_bio_wait` function +- use :c:func:`bio_endio` to signal the completion of processing a ``bio`` structure +- for the CRC32 calculation you can use the :c:func:`crc32` macro provided by the kernel +- useful macro definitions can be found in the assignment support `header 
`__ +- a single request processing function for block devices can be active at one time in a call stack (more details `here `__). + You will need to submit requests for physical devices in a kernel thread; we recommend using ``workqueues``. +- For a quick run, use a single bio to batch send the read/write request for CRC values for adjacent sectors. For example, + if you need to send requests for CRCs in sectors 0, 1, ..., 7, use a single bio, not 8 bios. +- our recommendations are not mandatory (any solution that meets the requirements of the assignment is accepted) +Testing +======= +In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, +the assignment evaluation will be done automatically with the help of a +`test script `__ called `_checker`. +The test script assumes that the kernel module is called `ssr.ko`. + +If, as a result of the testing process, the sectors on both disks contain invalid data, resulting in +read errors that make the module impossible to use, you will need to redo the two disks in the +virtual machine using the commands: + +.. code-block:: console + + $ dd if=/dev/zero of=/dev/vdb bs=1M + $ dd if=/dev/zero of=/dev/vdc bs=1M + +You can also get the same result using the following command to start the virtual machine: + +.. code-block:: console + + $ rm disk{1,2}.img; make console # or rm disk{1,2}.img; make boot + +QuickStart +========== + +It is mandatory to start the implementation of the assignment from the code skeleton found in the `src `__ directory. +There is only one header in the skeleton called `ssr.h `__. +You will provide the rest of the implementation. You can add as many `*.c`` sources and additional `*.h`` headers. +You should also provide a Kbuild file that will compile the kernel module called `ssr.ko`. +Follow the instructions in the `README.md file `__ of the `assignment's repo `__. 
+ + +Tips +---- + +To increase your chances of getting the highest grade, read and follow the Linux kernel +coding style described in the `Coding Style document `__. + +Also, use the following static analysis tools to verify the code: + +- checkpatch.pl + +.. code-block:: console + + $ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/file.c + +- sparse + +.. code-block:: console + + $ sudo apt-get install sparse + $ cd linux + $ make C=2 /path/to/your/file.c + +- cppcheck + +.. code-block:: console + + $ sudo apt-get install cppcheck + $ cppcheck /path/to/your/file.c + +Penalties +--------- + +Information about assignment penalties can be found on the +`General Directions page `__. + +In exceptional cases (the assignment passes the tests by not complying with the requirements) +and if the assignment does not pass all the tests, the grade may decrease more than mentioned above. + +Submitting the assignment +------------------------- + +The assignment will be graded automatically using the `vmchecker-next `__ infrastructure. +The submission will be made on Moodle on the `course's page `__ to the related assignment. +You will find the submission details in the `README.md file `__ of the `repo `__. + + +Resources +========= + +- implementation of the `RAID `__ software in the Linux kernel + +We recommend that you use GitLab to store your homework. Follow the directions in +`README `__. + + +Questions +========= + +For questions about the topic, you can consult the mailing `list archives `__ +or you can write a question on the dedicated Teams channel. 
+ +Before you ask a question, make sure that: + + - you have read the statement of the assigment well + - the question is not already presented on the `FAQ page `__ + - the answer cannot be found in the `mailing list archives `__ diff --git a/refs/pull/405/merge/_sources/so2/assign4-transport-protocol.rst.txt b/refs/pull/405/merge/_sources/so2/assign4-transport-protocol.rst.txt new file mode 100644 index 00000000..192dc842 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/assign4-transport-protocol.rst.txt @@ -0,0 +1,253 @@ +===================================== +Assignment 4 - SO2 Transport Protocol +===================================== + +- Deadline: :command:`Monday, 29 May 2023, 23:00` +- This assignment can be made in teams (max 2). Only one of them must submit the assignment, and the names of the student should be listed in a README file. + +Implement a simple datagram transport protocol - STP (*SO2 Transport Protocol*). + +Assignment's Objectives +======================= + +* gaining knowledge about the operation of the networking subsystem in the Linux kernel +* obtaining skills to work with the basic structures of the networking subsystem in Linux +* deepening the notions related to communication and networking protocols by implementing a protocol in an existing protocol stack + +Statement +========= + +Implement, in the Linux kernel, a protocol called STP (*SO2 Transport Protocol*), at network and transport level, that works using datagrams (it is not connection-oriented and does not use flow-control elements). + +The STP protocol acts as a Transport layer protocol (port-based multiplexing) but operates at level 3 (Network) of `the OSI stack `__, above the Data Link level. + +The STP header is defined by the ``struct stp_header`` structure: + +.. 
code-block:: c + + struct stp_header { + __be16 dst; + __be16 src; + __be16 len; + __u8 flags; + __u8 csum; + }; + + +where: + + * ``len`` is the length of the packet in bytes (including the header); + * ``dst`` and ``src`` are the destination and source ports, respectively; + * ``flags`` contains various flags, currently unused (marked *reserved*); + * ``csum`` is the checksum of the entire package including the header; the checksum is calculated by exclusive OR (XOR) between all bytes. + +Sockets using this protocol will use the ``AF_STP`` family. + +The protocol must work directly over Ethernet. The ports used are between ``1`` and ``65535``. Port ``0`` is not used. + +The definition of STP-related structures and macros can be found in the `assignment support header `__. + +Implementation Details +====================== + +The kernel module will be named **af_stp.ko**. + +You have to define a structure of type `net_proto_family `__, which provides the operation to create STP sockets. +Newly created sockets are not associated with any port or interface and cannot receive / send packets. +You must initialize the `socket ops field `__ with the list of operations specific to the STP family. 
+This field refers to a structure `proto_ops `__ which must include the following functions: + +* ``release``: releases an STP socket +* ``bind``: associates a socket with a port (possibly also an interface) on which packets will be received / sent: + + * there may be bind sockets only on one port (not on an interface) + * sockets associated with only one port will be able to receive packets sent to that port on all interfaces (analogous to UDP sockets associated with only one port); these sockets cannot send packets because the interface from which they can be sent via the standard sockets API cannot be specified + * two sockets cannot be binded to the same port-interface combination: + + * if there is a socket already binded with a port and an interface then a second socket cannot be binded to the same port and the same interface or without a specified interface + * if there is a socket already binded to a port but without a specified interface then a second socket cannot be binded to the same port (with or without a specified interface) + + * we recommend using a hash table for bind instead of other data structures (list, array); in the kernel there is a hash table implementation in the `hashtable.h header `__ + +* ``connect``: associates a socket with a remote port and hardware address (MAC address) to which packets will be sent / received: + + * this should allow ``send`` / ``recv`` operations on the socket instead of ``sendmsg`` / ``recvmsg`` or ``sendto`` / ``recvfrom`` + * once connected to a host, sockets will only accept packets from that host + * once connected, the sockets can no longer be disconnected + +* ``sendmsg``, ``recvmsg``: send or receive a datagram on an STP socket: + + * for the *receive* part, metainformation about the host that sent the packet can be stored in the `cb field in sk_buff `__ + +* ``poll``: the default function ``datagram_poll`` will have to be used +* for the rest of the operations the predefined stubs in the kernel will have 
to be used (``sock_no_*``) + +.. code-block:: c + + static const struct proto_ops stp_ops = { + .family = PF_STP, + .owner = THIS_MODULE, + .release = stp_release, + .bind = stp_bind, + .connect = stp_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = stp_sendmsg, + .recvmsg = stp_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, + }; + +Socket operations use a type of address called ``sockaddr_stp``, a type defined in the `assignment support header `__. +For the *bind* operation, only the port and the index of the interface on which the socket is bind will be considered. +For the *receive* operation, only the ``addr`` and ``port`` fields in the structure will be filled in with the MAC address of the host that sent the packet and with the port from which it was sent. +Also, when sending a packet, the destination host will be obtained from the ``addr`` and ``port`` fields of this structure. + +You need to register a structure `packet_type `__, using the call `dev_add_pack `__ to be able to receive STP packets from the network layer. + +The protocol will need to provide an interface through the *procfs* file system for statistics on sent / received packets. +The file must be named ``/proc/net/stp_stats``, specified by the ``STP_PROC_FULL_FILENAME`` macro in `assignment support header `__. +The format must be of simple table type with ``2`` rows: on the first row the header of the table, and on the second row the statistics corresponding to the columns. +The columns of the table must be in order: + +.. 
code:: + + RxPkts HdrErr CsumErr NoSock NoBuffs TxPkts + +where: + +* ``RxPkts`` - the number of packets received +* ``HdrErr`` - the number of packets received with header errors (packets too short or with source or destination 0 ports) +* ``CsumErr`` - the number of packets received with checksum errors +* ``NoSock`` - the number of received packets for which no destination socket was found +* ``NoBuffs`` - the number of received packets that could not be received because the socket queue was full +* ``TxPkts`` - the number of packets sent + +To create or delete the entry specified by ``STP_PROC_FULL_FILENAME`` we recommend using the functions `proc_create `__ and `proc_remove `__. + +Sample Protocol Implementations +------------------------------- + +For examples of protocol implementation, we recommend the implementation of `PF_PACKET `__ sockets and the various functions in `UDP implementation `__ or `IP implementation `__. + +Testing +======= + +In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, +the assignment evaluation will be done automatically with the help of a +`test script `__ called `_checker`. +The test script assumes that the kernel module is called `af_stp.ko`. + +tcpdump +------- + +You can use the ``tcpdump`` utility to troubleshoot sent packets. +The tests use the loopback interface; to track sent packets you can use a command line of the form: + +.. code:: console + + tcpdump -i lo -XX + +You can use a static version of `tcpdump `__. +To add to the ``PATH`` environment variable in the virtual machine, copy this file to ``/linux/tools/labs/rootfs/bin``. +Create the directory if it does not exist. Remember to give the ``tcpdump`` file execution permissions: + +.. 
code:: console + + # Connect to the docker using ./local.sh docker interactive + cd /linux/tools/labs/rootfs/bin + wget http://elf.cs.pub.ro/so2/res/teme/tcpdump + chmod +x tcpdump + +QuickStart +========== + +It is mandatory to start the implementation of the assignment from the code skeleton found in the `src `__ directory. +There is only one header in the skeleton called `stp.h `__. +You will provide the rest of the implementation. You can add as many `*.c`` sources and additional `*.h`` headers. +You should also provide a Kbuild file that will compile the kernel module called `af_stp.ko`. +Follow the instructions in the `README.md file `__ of the `assignment's repo `__. + + + +Tips +---- + +To increase your chances of getting the highest grade, read and follow the Linux kernel coding style described in the `Coding Style document `__. + +Also, use the following static analysis tools to verify the code: + +* checkpatch.pl + + .. code-block:: console + + $ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/file.c + +* sparse + + .. code-block:: console + + $ sudo apt-get install sparse + $ cd linux + $ make C=2 /path/to/your/file.c + +* cppcheck + + .. code-block:: console + + $ sudo apt-get install cppcheck + $ cppcheck /path/to/your/file.c + +Penalties +--------- + +Information about assigments penalties can be found on the `General Directions page `__. + +In exceptional cases (the assigment passes the tests by not complying with the requirements) and if the assigment does not pass all the tests, the grade will may decrease more than mentioned above. + +Submitting the assigment +------------------------ + +The assignment will be graded automatically using the `vmchecker-next `__ infrastructure. +The submission will be made on moodle on the `course's page `__ to the related assignment. +You will find the submission details in the `README.md file `__ of the `repo `__. 
+ + +Resources +========= + +* `Lecture 10 - Networking `__ +* `Lab 10 - Networking `__ +* Linux kernel sources + + * `Implementing PF_PACKET sockets `__ + * `Implementation of the UDP protocol `__ + * `Implementation of the IP protocol `__ + +* Understanding Linux Network Internals + + * chapters 8-13 + +* `assignment support header `__ + +We recommend that you use gitlab to store your homework. Follow the directions in `README `__. + +Questions +========= + +For questions about the topic, you can consult the mailing `list archives `__ +or you can write a question on the dedicated Teams channel. + +Before you ask a question, make sure that: + + - you have read the statement of the assigment well + - the question is not already presented on the `FAQ page `__ + - the answer cannot be found in the `mailing list archives `__ + diff --git a/refs/pull/405/merge/_sources/so2/assign5-pitix.rst.txt b/refs/pull/405/merge/_sources/so2/assign5-pitix.rst.txt new file mode 100644 index 00000000..ef61918b --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/assign5-pitix.rst.txt @@ -0,0 +1,231 @@ +=================================== +Assignment 5 - PITIX Filesystem +=================================== + +Deadline: :command:`Tuesday, 24 May 2022, 23:00` + +Statement +========= + +Write a kernel module to implement the **PITIX** file system, version 2. +This file system will only support files and directories. +Support operations for hard or symbolic links will not be implemented. +Also, support operations for special files (pipes, character devices, or blocks) will not be implemented. +Basically you need to implement the following: + * for directories: ``lookup``, ``unlink``, ``mkdir``, ``rmdir``, ``iterate`` + * for files: ``create``, ``truncate``, bitmap functions, see `minix_get_block `__. + +The rest of the functions either have generic kernel implementations, or you don't have to implement them. + +The disk structure of the file system is: + +.. 
code-block:: console + + +--------------+-----------+-----------+------------+-----------------------+ + | | | | | | + | superblock | imap | dmap | izone | dzone | + +--------------+-----------+-----------+------------+-----------------------+ + 4096 bytes 1 block 1 block 32 blocks 8*block_size blocks + + +where: + +* ``Superblock`` is the superblock (``4096`` bytes) +* ``Imap`` contains the bitmap of the blocks occupied by the inodes (``1`` block) +* ``Dmap`` contains the bitmap of the blocks occupied by the data (``1`` block) +* ``Izone`` contains inodes (``32`` blocks) +* ``Dzone`` contains the data (the actual contents of the files) (``8 * block_size`` blocks) + +The superblock (**on disk**) is described by the following structure: + +.. code-block:: c + + struct pitix_super_block { + unsigned long magic; + __u8 version; + __u8 block_size_bits; + __u8 imap_block; + __u8 dmap_block; + __u8 izone_block; + __u8 dzone_block; + __u16 bfree; + __u16 ffree; + }; + +where: + +* ``magic`` must be initialized with ``PITIX_MAGIC`` +* ``version`` must be initialized with ``2`` (``PITIX_VERSION``) +* ``block_size_bits`` is the block size expressed as a power of two (the exponent); the block size can be ``512``, ``1024``, ``2048``, or ``4096`` +* ``imap_block`` is the block number (relative to the device) of the bit vector used to allocate / release inodes +* ``dmap_block`` is the block number (relative to the device) of the bit vector used to allocate / release data blocks +* ``izone_block`` is the number of the first block (relative to the device) of the inode area +* ``dzone_block`` is the number of the first block (relative to the device) of the data area +* ``bfree`` is the number of free (unallocated) blocks +* ``ffree`` is the number of free (unallocated) inodes + +The inodes will be stored in the inode area and are described by the following structure: + +.. 
code-block:: c + + struct pitix_inode { + __u32 mode; + uid_t uid; + gid_t gid; + __u32 size; + __u32 time; + __u16 direct_data_blocks [INODE_DIRECT_DATA_BLOCKS]; + __u16 indirect_data_block; + }; + +where: + +* ``mode`` represents the access rights and inode type (file or directory) as represented in the kernel +* ``uid`` represents the UID as it is represented in the kernel +* ``gid`` represents the GID as it is represented in the kernel +* ``size`` is the size of the file / directory +* ``time`` represents the modification time as it is represented in the kernel +* ``direct_data_blocks`` is a vector (size ``INODE_DIRECT_DATA_BLOCKS`` ) that contains indexes of direct data blocks +* ``indirect_data_block`` is the index of a data block that contains the indexes of indirect data blocks + +The index of a data block (direct or indirect) indicates the number of that data block relative to the data area (``Dzone``). +The size of an index is ``2`` bytes. + +As can be seen from its structure, the inode uses a simple routing scheme for data blocks. +Blocks in the range ``[0, INODE_DIRECT_DATA_BLOCKS)`` are blocks of direct data and are referenced by elements of the vector ``direct_data_blocks`` and blocks in the range ``[INODE_DIRECT_DATA_BLOCKS, INODE_DIRECT_DATA_BL)`` are indirect data blocks and are referred to by indices within the data block indicated by ``indirect_data_block``. + +The data block indicated by ``indirect_data_block`` must be allocated when we have to refer to a first block of indirect data and must be released when there are no more blocks of indirect data. + +Unused indexes must be set to ``0``. +The first block, the one with index ``0``, is always allocated when formatting. 
This block cannot be used and, consequently, the value ``0``: + +* in an element of the vector ``direct_data_blocks`` means free slot (that element does not refer to a block of data directly) +* in ``indirect_data_block`` means that no data block is allocated to keep track of indirect data blocks (when no indirect data blocks are needed) +* in an index within the data block referred to by ``indirect_data_block`` means free slot (that index does not refer to an indirect data block) + +It is guaranteed that the number of bytes occupied by an inode on the disk is a divisor of the block size. + +Directories have a single associated block of data (referred to as ``direct_data_blocks[0]``) in which directory entries will be stored. These are described by the following structure: + +.. code-block:: c + + struct pitix_dir_entry { + __u32 ino; + char name [PITIX_NAME_LEN]; + }; + +where + +* ``ino`` is the inode number of the file or directory; this number is an index in the inode area +* ``name`` is the name of the file or directory; maximum name length is ``16`` bytes (``PITIX_NAME_LEN``); if the name length is less than 16 bytes, then the name will end with the ASCII character that has the code ``0`` (same as for strings) + +The root directory will be assigned inode ``0`` and data block ``0``. + +For simplicity, at ``mkdir`` it is not necessary to create the entries ``.`` (*dot*) and ``..`` (*dot dot*) in the new directory; the checker uses this assumption. + +All numeric values are stored on disk in CPU byte order. + +In the `assignment header Block devices -> Loopback device support`` + +In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, the assignment evaluation will be done automatically with the help of public tests that are in the new infrastructure. + +For local testing, use the following commands: + +.. 
code-block:: console + + $ git clone https://github.com/linux-kernel-labs/linux.git + $ cd linux/tools/labs + $ LABS=assignments/5-pitix make skels + $ #the development of the assignment will be written in the 5-pitix directory + $ make build + $ make copy + $ make boot + +Instructions for using the test suite can be found in the ``README`` file. + +Tips +---- + +To increase your chances of getting the highest grade, read and follow the Linux kernel coding style described in the `Coding Style document `__. + +Also, use the following static analysis tools to verify the code: + +- checkpatch.pl + +.. code-block:: console + + $ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/file.c + +- sparse + +.. code-block:: console + + $ sudo apt-get install sparse + $ cd linux + $ make C=2 /path/to/your/file.c + +- cppcheck + +.. code-block:: console + + $ sudo apt-get install cppcheck + $ cppcheck /path/to/your/file.c + +Penalties +--------- + +As a more difficult assignment, it is worth 2 points. + +Information about assigments penalties can be found on the +`General Directions page `__. + +In exceptional cases (the assigment passes the tests by not complying with the requirements) +and if the assigment does not pass all the tests, the grade will may decrease more than mentioned above. + +Submitting the assigment +------------------------ + +The assignment archive will be submitted to vmchecker, according to the rules on the +`rules page `__. + +In the vmchecker interface choose the ``Google Challenge - Sistem de fișiere`` option for this assignment. + +Resources +========= + +* `assignment header `__ +* `Lab 08: File system drivers (Part 1) `__ +* `Lab 09: File system drivers (Part 2) `__ +* `Minix filesystem source code `__ + +We recommend that you use GitLab to store your homework. Follow the directions in +`README `__ +and on the dedicated `Git wiki page `__. + +The resources for the assignment can also be found in the `so2-assignments `__ repo on GitHub. 
+The repo contains a `Bash script `__ +that helps you create a private repository on the faculty `GitLab `__ instance. +Follow the tips from the `README `__ and +on the dedicated `Wiki page `__. + +Questions +========= + +For questions about the assigment, you can consult the mailing `list archives `__ +or send an e-mail (you must be `registered `__). +Please follow and follow `the tips for use of the list `__. + +Before you ask a question, make sure that: + +* you have read the statement of the assigment well +* the question is not already presented on the `FAQ page `__ +* the answer cannot be found in the `mailing list archives `__ diff --git a/refs/pull/405/merge/_sources/so2/assign7-kvm-vmm.rst.txt b/refs/pull/405/merge/_sources/so2/assign7-kvm-vmm.rst.txt new file mode 100644 index 00000000..3eb0b20c --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/assign7-kvm-vmm.rst.txt @@ -0,0 +1,295 @@ +===================================================== +Assignment 7 - SO2 Virtual Machine Manager with KVM +===================================================== + +- Deadline: :command:`Tuesday, 29 May 2023, 23:00` +- This assignment can be made in teams (max 2). Only one of them must submit the assignment, and the names of the student should be listed in a README file. + +In this assignment we will work on a simple Virtual Machine Manager (VMM). We will be using the KVM API +from the Linux kernel. + +The assignment has two components: the VM code and the VMM code. We will be using a very simple protocol +to enable the communication between the two components. The protocol is called SIMVIRTIO. + + +I. Virtual Machine Manager +========================== + +In general, to build a VMM from scratch we will have to implement three main functionalities: initialize the VMM, initialize the virtual CPU and run the guest code. We will split the implementation of the VMM in these three phases. + +1. 
Initialize the VMM +------------------------- + +A VM will be represented in general by three elements: a file descriptor used to interact with the KVM API, a file descriptor per VM used to configure it (e.g. set its memory) and a pointer to the VM's memory. We provide you with the following structure to start from when working with a VM. + +.. code-block:: c + + typedef struct vm { + int sys_fd; + int fd; + char *mem; + } virtual_machine; + + +The first step in initializing the KVM VM is to interact with the `KVM API <https://www.kernel.org/doc/html/latest/virt/kvm/api.html>`__. The KVM API is exposed via ``/dev/kvm``. We will be using ioctl calls to call the API. + +The snippet below shows how one can call ``KVM_GET_API_VERSION`` to get the KVM API version. + +.. code-block:: c + + int kvm_fd = open("/dev/kvm", O_RDWR); + if (kvm_fd < 0) { + perror("open /dev/kvm"); + exit(1); + } + + int api_ver = ioctl(kvm_fd, KVM_GET_API_VERSION, 0); + if (api_ver < 0) { + perror("KVM_GET_API_VERSION"); + exit(1); + } + +Let us now go briefly through how a VMM initializes a VM. This is only the bare bones; a VMM may do lots of other things during VM initialization. + +1. We first use ``KVM_GET_API_VERSION`` to check that we are running the expected version of KVM, ``KVM_API_VERSION``. +2. We now create the VM using ``KVM_CREATE_VM``. Note that calling ``KVM_CREATE_VM`` returns a file descriptor. We will be using this file descriptor for the next phases of the setup. +3. (Optional) On Intel based CPUs we will have to call ``KVM_SET_TSS_ADDR`` with address ``0xfffbd000`` +4. Next, we allocate the memory for the VM; we will be using ``mmap`` for this with ``PROT_WRITE``, ``MAP_PRIVATE``, ``MAP_ANONYMOUS`` and ``MAP_NORESERVE``. We recommend allocating 0x100000 bytes for the VM. +5. We flag the memory as ``MADV_MERGEABLE`` using ``madvise`` +6. Finally, we use ``KVM_SET_USER_MEMORY_REGION`` to assign the memory to the VM. 
+ +**Make sure you understand what file descriptor to use and when: we use the KVM fd when calling KVM_CREATE_VM, but when interacting with the VM, such as calling KVM_SET_USER_MEMORY_REGION, we use the VM's +file descriptor** + +TLDR: API used for VM initialization: + +* KVM_GET_API_VERSION +* KVM_CREATE_VM +* KVM_SET_TSS_ADDR +* KVM_SET_USER_MEMORY_REGION. + +2. Initialize a virtual CPU +--------------------------- + +We need a Virtual CPU (VCPU) to store registers. + +.. code-block:: c + + typedef struct vcpu { + int fd; + struct kvm_run *kvm_run; + } virtual_cpu; + +To create a virtual CPU we will do the following: +1. Call ``KVM_CREATE_VCPU`` to create the virtual CPU. This call returns a file descriptor. +2. Use ``KVM_GET_VCPU_MMAP_SIZE`` to get the size of the shared memory +3. Allocate the necessary VCPU mem size with ``mmap``. We will be passing the VCPU file descriptor to the ``mmap`` call. We can store the result in ``kvm_run``. + + +TLDR: API used for VCPU initialization: + +* KVM_CREATE_VCPU +* KVM_GET_VCPU_MMAP_SIZE + +**We recommend using 2MB pages to simplify the translation process** + +Running the VM +============== + + +Setup real mode +--------------- + +At first, the CPU will start in Protected mode. To run any meaningful code, we will switch the CPU to `Real mode <https://wiki.osdev.org/Real_Mode>`__. To do this we will +need to configure several CPU registers. + +1. First, we will use ``KVM_GET_SREGS`` to get the registers. We use ``struct kvm_sregs`` for this task. +2. We will need to set ``cs.selector`` and ``cs.base`` to 0. We will use ``KVM_SET_SREGS`` to set the registers. +3. Next we will clear all ``FLAGS`` bits via the ``rflags`` register; this means setting ``rflags`` to 2 since bit 1 must always be set to 1. We also set the ``RIP`` register to 0. + +Setup long mode +--------------- + +Real mode is all right for very simple guests, such as the one found in the folder `guest_16_bits`. 
But, +most programs nowadays need 64-bit addresses, and as such we will need to switch to long mode. The following article from OSDev presents all the necessary information about [Setting Up Long Mode](https://wiki.osdev.org/Setting_Up_Long_Mode). + +In ``vcpu.h``, you may find helpful macros such as CR0_PE, CR0_MP, CR0_ET, etc. + +Since we will be running a more complex program, we will also create a small stack for our program +``regs.rsp = 1 << 20;``. Don't forget to set the RIP and RFLAGS registers. + +Running +------- + +After we set up our VCPU in real or long mode we can finally start running code on the VM. + +1. We copy to the vm memory the guest code, `memcpy(vm->mem, guest_code, guest_code_size)`. The guest code will be available in two variables which will be discussed below. +2. In an infinite loop we run the following: + * We call ``KVM_RUN`` on the VCPU file descriptor to run the VCPU + * Through the shared memory of the VCPU we check the ``exit_reason`` parameter to see if the guest has made any requests: + * We will handle the following VMEXITs: `KVM_EXIT_MMIO`, `KVM_EXIT_IO` and ``KVM_EXIT_HLT``. ``KVM_EXIT_MMIO`` is triggered when the VM writes to a MMIO address. ``KVM_EXIT_IO`` is called when the VM calls ``inb`` or ``outb``. ``KVM_EXIT_HLT`` is called when the user does a ``hlt`` instruction. + +Guest code +---------- + +The VM that is running is also called guest. We will be using the guest to test our implementation. + +1. To test the implementation before implementing SIMVIRTIO, the guest will write the value 42 at address 0x400 and in the RAX register. +2. To test a more complicated implementation, we will extend the previous program to also write "Hello, world!\n" on port `0xE9` using the `outb` instruction. +3. To test the implementation of `SIMVIRTIO`, we will + +How do we get the guest code? The guest code is available at the following static pointers guest16, guest16_end-guest16. The linker script is populating them. 
+ + +## SIMVIRTIO: +For the communication between the guest and the VMM we will implement a very simple protocol called ``SIMVIRTIO``. It's a simplified version of the real protocol used in the real world called virtio. + +Configuration space: + ++--------------+----------------+----------------+----------------+------------------+-------------+-------------+ +| u32 | u16 | u8 | u8 | u8 | u8 | u8 | ++==============+================+================+================+==================+=============+=============+ +| magic value | max queue len | device status | driver status | queue selector | Q0(TX) CTL | Q1(RX) CTL | +| R | R | R | R/W | R/W | R/W | R/W | ++--------------+----------------+----------------+----------------+------------------+-------------+-------------+ + + +Controller queues +----------------- + +We provide you with the following structures and methods for the ``SIMVIRTIO`` implementation. + +.. code-block:: c + + typedef uint8_t q_elem_t; + typedef struct queue_control { + // Ptr to current available head/producer index in 'buffer'. + unsigned head; + // Ptr to last index in 'buffer' used by consumer. + unsigned tail; + } queue_control_t; + typedef struct simqueue { + // MMIO queue control. + volatile queue_control_t *q_ctrl; + // Size of the queue buffer/data. + unsigned maxlen; + // Queue data buffer. + q_elem_t *buffer; + } simqueue_t; + int circ_bbuf_push(simqueue_t *q, q_elem_t data) + { + } + int circ_bbuf_pop(simqueue_t *q, q_elem_t *data) + { + } + + +Device structures +----------------- + +.. 
code-block:: c + + #define MAGIC_VALUE 0x74726976 + #define DEVICE_RESET 0x0 + #define DEVICE_CONFIG 0x2 + #define DEVICE_READY 0x4 + #define DRIVER_ACK 0x0 + #define DRIVER 0x2 + #define DRIVER_OK 0x4 + #define DRIVER_RESET 0x8000 + typedef struct device { + uint32_t magic; + uint8_t device_status; + uint8_t driver_status; + uint8_t max_queue_len; + } device_t; + typedef struct device_table { + uint16_t count; + uint64_t device_addresses[10]; + } device_table_t; + + +We will be implementing the following handlers: +* MMIO (read/write) VMEXIT +* PIO (read/write) VMEXIT + +Using the skeleton +================== + +Debugging +========= + + +Tasks +===== +1. 30p Implement a simple VMM that runs the code from `guest_16_bits`. We will be running the VCPU in real mode for this task. +2. 20p Extend the previous implementation to run the VCPU in long mode. We will be running the `guest_32_bits` example. +3. 30p Implement the `SIMVIRTIO` protocol. +4. 10p Implement polling as opposed to VMEXIT. We will use the macro `USE_POOLING` to switch this option on and off. +5. 10p Add profiling code. Measure the number of VMEXITs triggered by the VMM. + +Submitting the assignment +------------------------- + +The assignment archive will be submitted on **Moodle**, according to the rules on the `rules page `__. + + +Tips +---- + +To increase your chances of getting the highest grade, read and follow the Linux kernel coding style described in the `Coding Style document `__. + +Also, use the following static analysis tools to verify the code: + +* checkpatch.pl + + .. code-block:: console + + $ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/file.c + +* sparse + + .. code-block:: console + + $ sudo apt-get install sparse + $ cd linux + $ make C=2 /path/to/your/file.c + +* cppcheck + + .. 
code-block:: console + + $ sudo apt-get install cppcheck + $ cppcheck /path/to/your/file.c + +Penalties +--------- + +Information about assignment penalties can be found on the `General Directions page `__. + +In exceptional cases (the assignment passes the tests by not complying with the requirements) and if the assignment does not pass all the tests, the grade may decrease more than mentioned above. + +## References +We recommend the following readings before starting to work on the homework: +* [KVM host in a few lines of code](https://zserge.com/posts/kvm/) + + +TLDR +---- + +1. The VMM creates and initializes a virtual machine and a virtual CPU +2. We switch to real mode and run the simple guest code from `guest_16_bits` +3. We switch to long mode and run the more complex guest from `guest_32_bits` +4. We implement the SIMVIRTIO protocol. We will describe how it behaves in the following subtasks. +5. The guest writes in the TX queue (queue 0) the ASCII code for `R` which will result in a `VMEXIT` +6. The VMM will handle the VMEXIT caused by the previous write in the queue. When it receives the +`R` letter, it will initiate the reset procedure of the device and set the device status to `DEVICE_RESET` +7. After the reset handling, the guest must set the status of the device to `DRIVER_ACK`. After this, the guest will write to the TX queue the letter `C` +8. In the VMM we will initialize the config process when letter `C` is received. It will set the device status to `DEVICE_CONFIG` and add a new entry in the device_table +9. After the configuration process is finished, the guest will set the driver status to `DRIVER_OK` +10. Next, the VMM will set the device status to `DEVICE_READY` +11. The guest will write in the TX queue "Ana are mere" and will execute a halt +12. The VMM will print to the STDOUT the message received and execute the halt request +13. 
Finally, the VMM will verify that at address 0x400 and in register RAX is stored the value 42 + + diff --git a/refs/pull/405/merge/_sources/so2/grading.rst.txt b/refs/pull/405/merge/_sources/so2/grading.rst.txt new file mode 100644 index 00000000..ab728e12 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/grading.rst.txt @@ -0,0 +1,207 @@ +=============================== +SO2 - General Rules and Grading +=============================== + +General Rules +============= + +1. Laboratory +------------- +There is no formal rule for dividing students; everyone can participate in any laboratory as long as the following rules are respected. +Priority for participation is given to students from the respective group (34xC3 or optional). +The limit of students in a laboratory is 14 people. +Starting from the third week, the participation list in the laboratory is "frozen". +Students who have a retake can participate in any laboratory as long as there are available spots. +Like other students, the participation list is "frozen" starting from the third week. +The division is done on the laboratory hours division page. +You can make up for a maximum of 2 laboratories (you can attend another subgroup) (in those laboratories where there are available spots). +Laboratories cannot be made up retroactively. You cannot make up a laboratory from the previous week within the same laboratory week. +Laboratory activities take place only in the laboratory room. +We encourage you to go through the brief and laboratory exercises at home. +You can solve exercises at home, but you will have to start from scratch in the laboratory. + +2. Final deadline for submitting assignments +-------------------------------------------- +The final deadline for submitting SO2 assignments is **Wednesday, May 29, 2024, 23:59.**. +Beyond this date, assignments cannot be submitted anymore. +Please ensure timely submission of assignments with complete information to be graded. 
+We will not accept assignments submitted after this date or assignments not submitted on vmchecker-next. +For the testing part, assignments will receive the score indicated from testing on vmchecker-next; tests failed due to reasons unrelated to vmchecker-next will not be graded. +Assignments cannot be submitted for the special June 2023 exam session. +Assignments can be resubmitted after TODO for the September 2024 exam session. +The deadline for submitting assignments for the Fall 2024 session is TODO. + +3. Assignment Presentations +--------------------------- +The SO2 team reserves the right to request presentations for some homework assignments. +A presentation involves a discussion with at least two assistants about the completion of the assignment, the solution used, and any encountered issues. +The purpose of the assignment presentation sessions is to clarify any uncertainties regarding the completion of the assignment and to verify its correctness. +Individuals who will present an assignment will be contacted at least 24 hours in advance by the laboratory assistant. +Most likely, a 15-minute slot before/after the SO2 class or at the end of the SO2 laboratory session will be used. + +4. Rules on Assignments +------------------------ +The assignments for Operating Systems 2 are individual, except when explicitly stated that an assignment can be solved in a team. +This is because the primary objective of the assignments is for you to acquire or deepen your practical skills. +If the level of collaboration is too high or if you seek solutions online, this objective will not be achieved. +Each assignment is to be completed by a student without consulting the source code of their peers. + +We understand that teamwork is important, but we do not have the environment to carry out team projects in the Operating Systems 2 course. 
+If you encounter any problems in completing an assignment, use the discussion list or ask the laboratory assistants or course instructors. +Our role is to help you solve them. +Feel free to rely on the SO2 team. + +You can discuss among yourselves within the bounds of common sense; that is, you should not dictate a solution to someone, but you can offer a general idea. +If you are the one being asked and providing explanations, please consider redirecting to the discussion list and the SO2 team. +It is not allowed to request the solution to an assignment on a site like StackExchange, Rent a Coder, ChatGPT etc. +You can ask more generic questions, but do not request the solution to the assignment. + +You can freely use code from the laboratory, skeletons provided by us. +You can use external resources (GitHub, open-source code, or others) as long as they do not represent obvious solutions to the assignments, publicly available with or without intention. +See also the next paragraph. + +It is not allowed to publish assignment solutions (even after the end of the course). +If you find assignment solutions on GitHub or elsewhere, report them to the discussion list or privately to the laboratory assistant or course instructor. +We reiterate that if you need clarification that you would address to older colleagues or other forums, StackExchange, or other sources, use the discussion list and the SO2 team. +It is the safest and most honest way to solve problems. + +It is not allowed to transfer files between yourselves. +In general, we recommend not to screen-share with another colleague, whether for inspiration or to help them with their assignment. +Avoid testing an assignment on a colleague's system. +There may be exceptions; you can help someone troubleshoot, but please ensure that it does not transition from "let's solve this problem together" to "let me solve your assignment for you". +However, we recommend using the discussion list or the SO2 team to ask questions. 
+ +5. Penalties for Plagiarized Assignments +---------------------------------------- + +In general, we consider punitive measures as a last resort. +As long as the assignment is completed individually, without problematic source code contribution from external sources, then it is not a plagiarized assignment. + +The notion of a plagiarized assignment refers to, without limitation, situations such as: + + * Two assignments that are similar enough to draw this conclusion; + * Using source code from the internet that is an obvious solution to the assignment; + * Using pieces of code from another colleague; + * Accessing another colleague's code during the assignment; + * Modifying an existing assignment; + * Following another colleague's code; + * Direct assistance in completing the assignment (someone else wrote or dictated the code); + * Someone else wrote the assignment (voluntarily, for payment, or other benefits). + * If two assignments are considered plagiarized, both the source and destination will be penalized equally, without discussions about who plagiarized from whom and whose fault it is. + +.. warning:: + + Plagiarizing an assignment results in the elimination of points for the assignments completed up to that session. + Any assignment submitted until that session receives a score of 0 and cannot be resubmitted during the current academic year. + If there were instances of plagiarized assignments during the semester, it will be possible to obtain points in the summer, for the September session, from assignments **not yet** submitted. + We reiterate that our goal is not and will not be penalization for plagiarism. + We consider cheating to be dishonest behavior that will be punished if it occurs. + However, our goal is to prevent cheating; for this purpose, we offer support and resources from the team in all its forms (discussion list, face-to-face discussions with the SO2 team). 
+ Please use them with confidence; we believe that an honest approach to completing assignments will also result in a gain of knowledge and skills for you. + +6. Retake/Grade Increase +------------------------- + +In the retake/grade increase session in September, only assignments can be submitted, only the final exam can be retaken, or both. +You can continue to submit assignments with the deadlines from the semester, meaning you can achieve a maximum grade of 7 for each assignment. +Assignments are submitted using the vmchecker-next interface. +If you did not have plagiarized assignments during the semester, you can (re)submit any assignments. +If there were instances of plagiarized assignments during the semester, you can submit only assignments not yet submitted during the semester. +The submission deadline is TODO + +If you do not wish to retake the final exam, you can choose not to participate in the exam. +Grades will be recorded in the official catalog, according to the SO2 catalog. + +In the special retake/grade increase session in June, only the final exam can be retaken, and no homework assignments can be submitted. + +The exam in the retake session will consist of 11 equally weighted topics (for a total of 3 points - one topic is a bonus). Passing the exam is conditional on obtaining 1 point out of the 3 points assigned to the course. In practice, this means correctly solving 3 out of the 11 topics in the exam. + +In the case of retaking the final exam, the higher grade will be retained (between the semester grade and the grade from the retake session). + +You can participate in only one exam during a session. + +7. Class Redo +------------------- + +If you prefer, you can keep the score from the previous academic year for the entire semester's activity (labs, assignments, course work), and only retake the final exam. +You cannot keep the score for individual components of the semester (only assignments or only course work). 
+ +If you want to keep the score from the previous academic year for the entire semester's activity, you must announce this at the beginning of the semester. +Otherwise, the score from the previous academic year's semester will be reset according to the default mode. + +By default, the score for the academic year will be reset on October 1. +If you do not graduate from the course during the current academic year, you will need to retake it completely during the next academic year. + +Grading +======= + +You must achieve at least 4.5 points out of 10 to pass. + +1. Lectures (3 points) +---------------------- +* Completion of the course is conditioned by obtaining 30% (3 out of 10) of the course score. +* The lecture score will be obtained from 11 lecture quizzes to be completed before each class (one quiz is a bonus). +* Each course assignment contains a set of 4 questions from the material covered in the previous class (one question is a bonus). + * There will be no final exam. + * Each question is scored with 0 or 1. + * A question is scored only if it is fully and correctly answered. + * A question answered incompletely or one answered completely but with incorrect specifications or errors will not be scored. + * Course assignments cannot be redone. + * Each assignment lasts 3 minutes. + * The score is obtained from the formula min(sum_of_assignment_scores / 10 * 4/3, 10). + * The assignments are closed book. +* For those who cannot attend the course assignments or wish to improve their course score, an assignment will be given at the end of the semester (during the last class) covering all the course material. + * The end-of-semester assignment (last class) consists of 11 questions for the 3 course points and lasts 60 minutes. + * The end-of-semester assignment is open-book. You are allowed to use class notes, books, slides, laptops, or tablets without internet access. + * Access with mobile phones is not permitted. 
Phones must be turned off/silent/deactivated during the exam. + * You may download course materials, labs, or other resources for offline use. + + +2. Laboratory (2 points) +------------------------ +* The laboratories are held in EG106, EG306, and PR706. +* Completion of the laboratory exercises leads to obtaining 10 or 11 points allocated for the laboratory. +* The final grade for the laboratory is calculated using the formula (sum(l1:l12) / 12). + + +3. Assignments (5 points + Extra) +--------------------------------- +* There are 4 Assignments: + * Assignment 0 - "Kernel API" - 0.5 points + * Assignment 1 - "Kprobe based tracer" - 1.5 points + * Assignment 2 - "Driver UART" - 1.5 points + * Assignment 3 - "Software RAID" - 1.5 points +* Extra activities: + * SO2 transport protocol - 2 points + * SO2 Virtual Machine Manager with KVM - 2 points +* In case the total score for assignments + "Extra" activities exceeds 5 points, the following procedure will be followed: + * 5 points are considered as part of the total score. + * The difference between the total score and 5 points will be proportionally adjusted relative to the grade obtained in the lecture. + +.. code-block:: c + + S = A0 + A1 + A2 + A3 + Extra; + if (S <= 5) + assignment_grade = S; + else + assignment_grade = 5 + (S - 5) * course_grade / 3; // 0 <= course_grade <=3 + +* The verification and scoring of assignments: + * Assignments are tested against plagiarism. + * Assignments will be automatically verified using the `vmchecker-next `__ infrastructure integrated with Moodle. + * The verification tests are public. + * Students who upload their assignments on Moodle must wait for the checker's feedback in the feedback section of the assignment upload page. + * The grade listed in the feedback section will be the final grade for the assignment. 
+ * There may be exceptional situations where this rule is not considered (for example, if the assignment is implemented solely to pass the tests and does not meet the assignment requirements). + * The verification system deducts points (automatically) for certain situations (segmentation faults, unhandled exceptions, compilation errors, or warnings) regardless of the test results. + * Deductions are specified in the instructions list and in the assignment statement. + * Deductions are subtracted from the assignment grade (maximum of 10) not from the assignment score. + +* Late assignments + * Each assignment has a deadline of 2 weeks from the publication date. (exception! Assignment 0) + * After the deadline, 0.25 points per day (out of 10, the maximum grade for each assignment) will be deducted for 12 days (up to a maximum grade of 7). + * The deduction is from the grade (maximum 10), not from the score. An assignment incurs deductions of 0.25 points per day from the maximum grade (10), regardless of its score. + * For example, if for assignment 3 (scored with 1.5 points) the delay is 4 days, you will receive a deduction of 4 * 0.25 = 1 point from the grade, resulting in a maximum grade of 9, equivalent to a maximum score of 1.35 points. + * After 12 days, no further deductions will be made; a maximum grade of 7 can be obtained for an assignment submitted 13 days after the deadline expiration, or 50 days, or more, including during the retake session. + + diff --git a/refs/pull/405/merge/_sources/so2/index.rst.txt b/refs/pull/405/merge/_sources/so2/index.rst.txt new file mode 100644 index 00000000..80188277 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/index.rst.txt @@ -0,0 +1,57 @@ +=================== +Operating Systems 2 +=================== + +.. toctree:: + :caption: Good To Know + :maxdepth: 1 + + grading.rst + +.. 
toctree:: + :caption: Lectures + :maxdepth: 1 + + lec1-intro.rst + lec2-syscalls.rst + lec3-processes.rst + lec4-interrupts.rst + lec5-smp.rst + lec6-address-space.rst + lec7-memory-management.rst + lec8-filesystems.rst + lec9-debugging.rst + lec10-networking.rst + lec11-arch.rst + lec12-virtualization.rst + +.. toctree:: + :caption: Labs + :maxdepth: 1 + + lab1-intro.rst + lab2-kernel-api.rst + lab3-device-drivers.rst + lab4-interrupts.rst + lab5-deferred-work.rst + lab6-memory-mapping.rst + lab7-block-device-drivers.rst + lab8-filesystems-part1.rst + lab9-filesystems-part2.rst + lab10-networking.rst + lab11-arm-kernel-development.rst + lab12-kernel-profiling.rst + +.. toctree:: + :caption: Assignments + :maxdepth: 1 + + assign-collaboration.rst + assign0-kernel-api.rst + assign1-kprobe-based-tracer.rst + assign2-driver-uart.rst + assign3-software-raid.rst + assign4-transport-protocol.rst + .. uncoment next line for pitix to be available in Docs + .. assign5-pitix.rst + assign7-kvm-vmm.rst diff --git a/refs/pull/405/merge/_sources/so2/lab1-intro.rst.txt b/refs/pull/405/merge/_sources/so2/lab1-intro.rst.txt new file mode 100644 index 00000000..461148a9 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab1-intro.rst.txt @@ -0,0 +1,112 @@ +========================= +SO2 Lab 01 - Introduction +========================= + +Lab objectives +============== + +* presenting the rules and objectives of the Operating Systems 2 lab +* introducing the lab documentation +* introducing the Linux kernel and related resources +* creating simple modules +* describing the process of kernel module compilation +* presenting how a module can be used with a kernel +* simple kernel debugging methods + +.. include:: ../labs/introduction.rst + :start-after: [SECTION-ABOUT-BEGIN] + :end-before: [SECTION-ABOUT-END] + +.. include:: ../labs/introduction.rst + :start-after: [SECTION-REFERENCES-BEGIN] + :end-before: [SECTION-REFERENCES-END] + +.. 
include:: ../labs/introduction.rst + :start-after: [SECTION-DOCUMENTATION-BEGIN] + :end-before: [SECTION-DOCUMENTATION-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [SECTION-OVERVIEW-BEGIN] + :end-before: [SECTION-OVERVIEW-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [SECTION-MODULE-EXAMPLE-BEGIN] + :end-before: [SECTION-MODULE-EXAMPLE-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [SECTION-COMPILE-MODULES-BEGIN] + :end-before: [SECTION-COMPILE-MODULES-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [SECTION-LOAD-MODULES-BEGIN] + :end-before: [SECTION-LOAD-MODULES-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [SECTION-DEBUG-MODULES-BEGIN] + :end-before: [SECTION-DEBUG-MODULES-END] + +.. note:: + + If you want to learn how to easily browse through the Linux source code + and how to debug kernel code, read the `Good to know <#good-to-know>`__ + section. + +Exercises +========= + +.. include:: ../labs/introduction.rst + :start-after: [SECTION-EXERCISES-REMARKS-BEGIN] + :end-before: [SECTION-EXERCISES-REMARKS-END] + +.. _exercises_summary: + +.. include:: ../labs/exercises-summary.hrst +.. |LAB_NAME| replace:: kernel_modules + +.. .. include:: ../labs/introduction.rst +.. :start-after: [EXERCISE1-BEGIN] +.. :end-before: [EXERCISE1-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [EXERCISE1-BEGIN] + :end-before: [EXERCISE1-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [EXERCISE2-BEGIN] + :end-before: [EXERCISE2-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [EXERCISE3-BEGIN] + :end-before: [EXERCISE3-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [EXERCISE4-BEGIN] + :end-before: [EXERCISE4-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [EXERCISE5-BEGIN] + :end-before: [EXERCISE5-END] + +.. include:: ../labs/kernel_modules.rst + :start-after: [EXERCISE6-BEGIN] + :end-before: [EXERCISE6-END] + +.. 
include:: ../labs/kernel_modules.rst + :start-after: [EXERCISE7-BEGIN] + :end-before: [EXERCISE7-END] + +.. _good-to-know: + +Good to know +============ + +The following sections contain useful information for getting used to the Linux +kernel code and debugging techniques. + +.. include:: ../labs/introduction.rst + :start-after: [SECTION-CODE-NAVIGATION-BEGIN] + :end-before: [SECTION-CODE-NAVIGATION-END] + +.. include:: ../labs/introduction.rst + :start-after: [SECTION-DEBUGGING-BEGIN] + :end-before: [SECTION-DEBUGGING-END] diff --git a/refs/pull/405/merge/_sources/so2/lab10-networking.rst.txt b/refs/pull/405/merge/_sources/so2/lab10-networking.rst.txt new file mode 100644 index 00000000..0f9675cb --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab10-networking.rst.txt @@ -0,0 +1,6 @@ +======================= +SO2 Lab 10 - Networking +======================= + +.. include:: ../labs/networking.rst + :start-line: 4 diff --git a/refs/pull/405/merge/_sources/so2/lab11-arm-kernel-development.rst.txt b/refs/pull/405/merge/_sources/so2/lab11-arm-kernel-development.rst.txt new file mode 100644 index 00000000..6f59f8cc --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab11-arm-kernel-development.rst.txt @@ -0,0 +1,6 @@ +====================================== +SO2 Lab 11 - Kernel Development on ARM +====================================== + +.. include:: ../labs/arm_kernel_development.rst + :start-line: 4 diff --git a/refs/pull/405/merge/_sources/so2/lab12-kernel-profiling.rst.txt b/refs/pull/405/merge/_sources/so2/lab12-kernel-profiling.rst.txt new file mode 100644 index 00000000..ce8f1358 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab12-kernel-profiling.rst.txt @@ -0,0 +1,6 @@ +============================= +SO2 Lab 12 - Kernel Profiling +============================= + +.. 
include:: ../labs/kernel_profiling.rst + :start-line: 4 diff --git a/refs/pull/405/merge/_sources/so2/lab2-kernel-api.rst.txt b/refs/pull/405/merge/_sources/so2/lab2-kernel-api.rst.txt new file mode 100644 index 00000000..379601db --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab2-kernel-api.rst.txt @@ -0,0 +1,6 @@ +======================= +SO2 Lab 02 - Kernel API +======================= + +.. include:: ../labs/kernel_api.rst + :start-line: 4 diff --git a/refs/pull/405/merge/_sources/so2/lab3-device-drivers.rst.txt b/refs/pull/405/merge/_sources/so2/lab3-device-drivers.rst.txt new file mode 100644 index 00000000..6a3c77b0 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab3-device-drivers.rst.txt @@ -0,0 +1,6 @@ +===================================== +SO2 Lab 03 - Character device drivers +===================================== + +.. include:: ../labs/device_drivers.rst + :start-line: 4 diff --git a/refs/pull/405/merge/_sources/so2/lab4-interrupts.rst.txt b/refs/pull/405/merge/_sources/so2/lab4-interrupts.rst.txt new file mode 100644 index 00000000..5375f6cc --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab4-interrupts.rst.txt @@ -0,0 +1,6 @@ +====================================== +SO2 Lab 04 - I/O access and Interrupts +====================================== + +.. include:: ../labs/interrupts.rst + :start-line: 4 diff --git a/refs/pull/405/merge/_sources/so2/lab5-deferred-work.rst.txt b/refs/pull/405/merge/_sources/so2/lab5-deferred-work.rst.txt new file mode 100644 index 00000000..bf763d7f --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab5-deferred-work.rst.txt @@ -0,0 +1,6 @@ +========================== +SO2 Lab 05 - Deferred work +========================== + +.. 
include:: ../labs/deferred_work.rst + :start-line: 4 diff --git a/refs/pull/405/merge/_sources/so2/lab6-memory-mapping.rst.txt b/refs/pull/405/merge/_sources/so2/lab6-memory-mapping.rst.txt new file mode 100644 index 00000000..53bc205e --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab6-memory-mapping.rst.txt @@ -0,0 +1,6 @@ +=========================== +SO2 Lab 06 - Memory Mapping +=========================== + +.. include:: ../labs/memory_mapping.rst + :start-line: 4 diff --git a/refs/pull/405/merge/_sources/so2/lab7-block-device-drivers.rst.txt b/refs/pull/405/merge/_sources/so2/lab7-block-device-drivers.rst.txt new file mode 100644 index 00000000..2bbad421 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab7-block-device-drivers.rst.txt @@ -0,0 +1,6 @@ +================================= +SO2 Lab 07 - Block Device Drivers +================================= + +.. include:: ../labs/block_device_drivers.rst + :start-line: 4 diff --git a/refs/pull/405/merge/_sources/so2/lab8-filesystems-part1.rst.txt b/refs/pull/405/merge/_sources/so2/lab8-filesystems-part1.rst.txt new file mode 100644 index 00000000..d4d8516f --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab8-filesystems-part1.rst.txt @@ -0,0 +1,19 @@ +========================================= +SO2 Lab 08 - File system drivers (Part 1) +========================================= + +.. include:: ../labs/filesystems_part1.rst + :start-line: 4 + :end-before: [SURVEY-LABEL] + +.. important:: + In order to have a better understanding of what we do well and we can do + better, what factors affect your implication in teaching, extracurricular + but also professional activities, we ask you to complete `this survey + `_. The survey is a short one, + having answers with check marks, with an estimated completion time of + 3-5 minutes. Obviously, we will send you the analysis of the survey and + use it to improve the teaching activities. + +.. 
include:: ../labs/filesystems_part1.rst + :start-after: [SURVEY-LABEL] diff --git a/refs/pull/405/merge/_sources/so2/lab9-filesystems-part2.rst.txt b/refs/pull/405/merge/_sources/so2/lab9-filesystems-part2.rst.txt new file mode 100644 index 00000000..5ef61803 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lab9-filesystems-part2.rst.txt @@ -0,0 +1,6 @@ +========================================= +SO2 Lab 09 - File system drivers (Part 2) +========================================= + +.. include:: ../labs/filesystems_part2.rst + :start-line: 4 diff --git a/refs/pull/405/merge/_sources/so2/lec1-intro.rst.txt b/refs/pull/405/merge/_sources/so2/lec1-intro.rst.txt new file mode 100644 index 00000000..2a011341 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec1-intro.rst.txt @@ -0,0 +1,260 @@ +============================================================== +SO2 Lecture 01 - Course overview and Linux kernel introduction +============================================================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 01 - Course overview and Linux kernel introduction + :inline-contents: False + :level: 1 + + +Echipa +====== + +.. slide:: Echipa + :inline-contents: True + :level: 2 + + * Daniel Băluță (Daniel), Răzvan Deaconescu (Răzvan, RD), Claudiu + Ghioc (Claudiu), Valentin Ghiță (Vali), Sergiu Weisz (Sergiu), + Octavian Purdilă (Tavi) + + * Alexandru Militaru (Alex), Teodora Șerbănescu (Teo), Ștefan + Teodorescu (Ștefan, Fane), Mihai Popescu (Mihai, Mișu), + Constantin Răducanu, Daniel Dinca, Laurențiu Ștefan + + * Mult succes în noul semestru! + +Poziționare curs +================ + +.. slide:: Poziționare curs + :inline-contents: True + :level: 2 + + .. ditaa:: + + +---------------------------------------------------------+ + | application programming (EGC, SPG, PP, SPRC, IOC, etc.) 
| + +---------------------------------------------------------+ + + +----------------------------------+ + | system programming (PC, SO, CPL) | + +----------------------------------+ + user space + ----------------------------------------------------------=- + kernel space + +--------------------------+ + | kernel programming (SO2) | + +--------------------------+ + + ----------------------------------------------------------=- + + +----------------------------------+ + | hardware (PM, CN1, CN2, PL ) | + +----------------------------------+ + +Resurse +======= + +.. slide:: Resurse + :inline-contents: True + :level: 2 + + * Linux Kernel Labs: https://linux-kernel-labs.github.io/ + * mailing list: so2@cursuri.cs.pub.ro + * Facebook + * vmchecker + * catalog Google, calendar Google + * LXR: https://elixir.bootlin.com/linux/v5.10.14/source + * cs.curs.pub.ro - rol de portal + * karma awards + +Comunitate +========== + +.. slide:: Comunitate + :inline-contents: True + :level: 2 + + * tutorial contribuții: https://linux-kernel-labs.github.io/refs/heads/master/info/contributing.html + * corecții, ajustări, precizări, informații utile + * listă de discuții + * răspundeți la întrebările colegilor voștri + * propuneți subiecte de discuție care au legătură cu disciplina + * Facebook + * sugestii, propuneri, feedback + * Primiți puncte de karma + +Notare +======= + +.. slide:: Notare + :inline-contents: True + :level: 2 + + * 2 puncte activitate la laborator + * 3 puncte „examen”, notare pe parcurs + * 5 puncte teme de casă + * Activități "extra" + * Punctajul din teme de casă + activități extra ce depășește 5 + puncte e corelat direct proporțional cu nota de la examen + * Tema 0 - 0,5 puncte + * Temele 1, 2, 3 - câte 1,5 puncte fiecare + * Condiţii de promovare: nota finală 4.5, nota minimă examen 3 + +Obiectivele cursului +==================== + +.. 
slide:: Obiectivele cursului + :inline-contents: True + :level: 2 + + * Prezentarea structurii interne a unui sistem de operare + * Target: sisteme de operare de uz general + * Structura și componentele unui kernel monolitic + * Procese, FS, Networking + * Memory management + * Exemplificare pe Linux + +Obiectivele laboratorului și a temelor +====================================== + +.. slide:: Obiectivele laboratorului și a temelor + :inline-contents: True + :level: 2 + + * Însușirea cunoștințelor necesare implementării de device drivere + + * Înțelegerea în profunzime a cunoștințelor prin rezolvarea de + exerciții + +Cursuri necesare +================ + +.. slide:: Cursuri necesare + :inline-contents: True + :level: 2 + + * Programare: C + * SD: tabele de dispersie, arbori echilibrați + * IOCLA: lucrul cu registre și instrucțiuni de bază (adunări, comparaţii, salturi) + * CN: TLB/CAM, memorie, procesor, I/O + * PC, RL: ethernet, IP, sockeți + * SO: procese, fișiere, thread-uri, memorie virtuală + +Despre curs +=========== + +.. slide:: Despre curs + :inline-contents: True + :level: 2 + + * 12 cursuri + * interactiv + * participaţi la discuţii + * întrebaţi atunci când nu aţi înţeles + * destul de “dens”, se recomandă călduros parcurgerea suportului bibliografic înainte şi după curs + * 1h:20 prezentare + 20min teste și discuții pe marginea testului + +Lista cursuri +============= + +.. slide:: Lista cursuri + :inline-contents: True + :level: 2 + + .. hlist:: + :columns: 2 + + * Introducere + * Apeluri de sistem + * Procese + * Întreruperi + * Sincronizare + * Adresarea memoriei + * Gestiunea memoriei + * Gestiunea fișierelor + * Kernel debugging + * Gestiunea rețelei + * Virtualizare + * Kernel profiling + + +Despre laborator +================ + +.. 
slide:: Despre laborator + :inline-contents: True + :level: 2 + + * Kernel Modules and Device Drivers + * 15 min prezentare / 80 de minute lucru + * se punctează activitatea + * learn by doing + +Despre teme +=========== + +.. slide:: Despre teme + :inline-contents: True + :level: 2 + + * necesare: aprofundare API (laborator) și concepte (curs) + * teste publice + * suport de testare (vmchecker) + * relativ puţin cod de scris dar relativ dificile + * dificultatea constă în acomodarea cu noul mediu + +Lista teme +========== + +.. slide:: Lista teme + :inline-contents: True + :level: 2 + + * Tema 0 - Kernel API + * Kprobe based tracer + * Driver pentru portul serial + * Software RAID + * SO2 Transport Protocol + + +Bibliografie curs +================= + +.. slide:: Bibliografie curs + :inline-contents: True + :level: 2 + + * Linux Kernel Development, 3rd edition, Robert Love, Addison + Wesley, 2010 + + * Understanding the Linux Kernel, 3rd edition, Daniel P. Bovet & + Marco Cesati, O'Reilly 2005 + + * Linux Networking Architecture, Klaus Wehrle, Frank Pahlke, + Hartmut Ritter, Daniel Muller, Marc Bechler, Prentice Hall 2004 + + * Understanding Linux Network Internals, Christian Benvenuti, O'Reilly 2005 + +Bibliografie laborator +====================== + +.. slide:: Bibliografie laborator + :inline-contents: True + :level: 2 + + * Linux Device Drivers, 3rd edition, Alessandro Rubini & Jonathan + Corbet, O'Reilly 2006 + + * Linux Kernel in a Nutshell, Greg Kroah-Hartman, O'Reilly 2005 + + +.. include:: ../lectures/intro.rst + :start-line: 6 diff --git a/refs/pull/405/merge/_sources/so2/lec10-networking.rst.txt b/refs/pull/405/merge/_sources/so2/lec10-networking.rst.txt new file mode 100644 index 00000000..aec4b414 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec10-networking.rst.txt @@ -0,0 +1,16 @@ +=========================== +SO2 Lecture 10 - Networking +=========================== + +`View slides `_ + +.. 
slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 10 - Networking + :inline-contents: False + :level: 1 + +.. include:: ../lectures/networking.rst + :start-line: 6 diff --git a/refs/pull/405/merge/_sources/so2/lec11-arch.rst.txt b/refs/pull/405/merge/_sources/so2/lec11-arch.rst.txt new file mode 100644 index 00000000..fa160b27 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec11-arch.rst.txt @@ -0,0 +1,17 @@ +=================================== +SO2 Lecture 11 - Architecture Layer +=================================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 11 - Architecture Layer + :inline-contents: False + :level: 1 + +.. include:: ../lectures/arch.rst + :start-line: 6 + diff --git a/refs/pull/405/merge/_sources/so2/lec12-profiling.rst.txt b/refs/pull/405/merge/_sources/so2/lec12-profiling.rst.txt new file mode 100644 index 00000000..8a854414 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec12-profiling.rst.txt @@ -0,0 +1,13 @@ +========================== +SO2 Lecture 12 - Profiling +========================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 12 - Profiling + :inline-contents: False + :level: 1 diff --git a/refs/pull/405/merge/_sources/so2/lec12-virtualization.rst.txt b/refs/pull/405/merge/_sources/so2/lec12-virtualization.rst.txt new file mode 100644 index 00000000..9ababa1d --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec12-virtualization.rst.txt @@ -0,0 +1,16 @@ +=============================== +SO2 Lecture 12 - Virtualization +=============================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 12 - Virtualization + :inline-contents: False + :level: 1 + +.. 
include:: ../lectures/virt.rst + :start-line: 6 diff --git a/refs/pull/405/merge/_sources/so2/lec2-syscalls.rst.txt b/refs/pull/405/merge/_sources/so2/lec2-syscalls.rst.txt new file mode 100644 index 00000000..bcf3bb6b --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec2-syscalls.rst.txt @@ -0,0 +1,16 @@ +============================= +SO2 Lecture 02 - System calls +============================= + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 02 - System calls + :inline-contents: False + :level: 1 + +.. include:: ../lectures/syscalls.rst + :start-line: 6 diff --git a/refs/pull/405/merge/_sources/so2/lec3-processes.rst.txt b/refs/pull/405/merge/_sources/so2/lec3-processes.rst.txt new file mode 100644 index 00000000..394e3306 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec3-processes.rst.txt @@ -0,0 +1,16 @@ +========================== +SO2 Lecture 03 - Processes +========================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 03 - Processes + :inline-contents: False + :level: 1 + +.. include:: ../lectures/processes.rst + :start-line: 6 diff --git a/refs/pull/405/merge/_sources/so2/lec4-interrupts.rst.txt b/refs/pull/405/merge/_sources/so2/lec4-interrupts.rst.txt new file mode 100644 index 00000000..3fba6c5b --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec4-interrupts.rst.txt @@ -0,0 +1,16 @@ +============================= +SO2 Lecture 04 - Interrupts +============================= + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 04 - Interrupts + :inline-contents: False + :level: 1 + +.. 
include:: ../lectures/interrupts.rst + :start-line: 6 diff --git a/refs/pull/405/merge/_sources/so2/lec5-smp.rst.txt b/refs/pull/405/merge/_sources/so2/lec5-smp.rst.txt new file mode 100644 index 00000000..a0ef6425 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec5-smp.rst.txt @@ -0,0 +1,16 @@ +=========================================== +SO2 Lecture 05 - Symmetric Multi-Processing +=========================================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 05 - Symmetric Multi-Processing + :inline-contents: False + :level: 1 + +.. include:: ../lectures/smp.rst + :start-line: 6 diff --git a/refs/pull/405/merge/_sources/so2/lec6-address-space.rst.txt b/refs/pull/405/merge/_sources/so2/lec6-address-space.rst.txt new file mode 100644 index 00000000..b0237be0 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec6-address-space.rst.txt @@ -0,0 +1,16 @@ +============================== +SO2 Lecture 06 - Address Space +============================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 06 - Address Space + :inline-contents: False + :level: 1 + +.. include:: ../lectures/address-space.rst + :start-line: 6 diff --git a/refs/pull/405/merge/_sources/so2/lec7-memory-management.rst.txt b/refs/pull/405/merge/_sources/so2/lec7-memory-management.rst.txt new file mode 100644 index 00000000..05e2e3b0 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec7-memory-management.rst.txt @@ -0,0 +1,16 @@ +================================== +SO2 Lecture 07 - Memory Management +================================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 07 - Memory Management + :inline-contents: False + :level: 1 + +.. 
include:: ../lectures/memory-management.rst + :start-line: 6 diff --git a/refs/pull/405/merge/_sources/so2/lec8-filesystems.rst.txt b/refs/pull/405/merge/_sources/so2/lec8-filesystems.rst.txt new file mode 100644 index 00000000..6029d651 --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec8-filesystems.rst.txt @@ -0,0 +1,17 @@ +====================================== +SO2 Lecture 08 - Filesystem Management +====================================== + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 08 - Filesystem Management + :inline-contents: False + :level: 1 + +.. include:: ../lectures/fs.rst + :start-line: 6 + diff --git a/refs/pull/405/merge/_sources/so2/lec9-debugging.rst.txt b/refs/pull/405/merge/_sources/so2/lec9-debugging.rst.txt new file mode 100644 index 00000000..02b4ed6d --- /dev/null +++ b/refs/pull/405/merge/_sources/so2/lec9-debugging.rst.txt @@ -0,0 +1,16 @@ +================================= +SO2 Lecture 09 - Kernel debugging +================================= + +`View slides `_ + +.. slideconf:: + :autoslides: False + :theme: single-level + +.. slide:: SO2 Lecture 09 - Kernel debugging + :inline-contents: False + :level: 1 + +.. 
include:: ../lectures/debugging.rst + :start-line: 6 diff --git a/refs/pull/405/merge/_static/ajax-loader.gif b/refs/pull/405/merge/_static/ajax-loader.gif new file mode 100644 index 00000000..61faf8ca Binary files /dev/null and b/refs/pull/405/merge/_static/ajax-loader.gif differ diff --git a/refs/pull/405/merge/_static/asciinema-player.css b/refs/pull/405/merge/_static/asciinema-player.css new file mode 100644 index 00000000..20b6974f --- /dev/null +++ b/refs/pull/405/merge/_static/asciinema-player.css @@ -0,0 +1,2563 @@ +.asciinema-player-wrapper { + position: relative; + text-align: center; + outline: none; +} +.asciinema-player-wrapper .title-bar { + display: none; + top: -78px; + transition: top 0.15s linear; + position: absolute; + left: 0; + right: 0; + box-sizing: content-box; + font-size: 20px; + line-height: 1em; + padding: 15px; + font-family: sans-serif; + color: white; + background-color: rgba(0, 0, 0, 0.8); +} +.asciinema-player-wrapper .title-bar img { + vertical-align: middle; + height: 48px; + margin-right: 16px; +} +.asciinema-player-wrapper .title-bar a { + color: white; + text-decoration: underline; +} +.asciinema-player-wrapper .title-bar a:hover { + text-decoration: none; +} +.asciinema-player-wrapper:fullscreen { + background-color: #000; + width: 100%; + height: 100%; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-justify-content: center; + justify-content: center; + -webkit-align-items: center; + align-items: center; +} +.asciinema-player-wrapper:fullscreen .asciinema-player { + position: static; +} +.asciinema-player-wrapper:fullscreen .title-bar { + display: initial; +} +.asciinema-player-wrapper:fullscreen.hud .title-bar { + top: 0; +} +.asciinema-player-wrapper:-webkit-full-screen { + background-color: #000; + width: 100%; + height: 100%; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-justify-content: center; + justify-content: center; + -webkit-align-items: center; + 
align-items: center; +} +.asciinema-player-wrapper:-webkit-full-screen .asciinema-player { + position: static; +} +.asciinema-player-wrapper:-webkit-full-screen .title-bar { + display: initial; +} +.asciinema-player-wrapper:-webkit-full-screen.hud .title-bar { + top: 0; +} +.asciinema-player-wrapper:-moz-full-screen { + background-color: #000; + width: 100%; + height: 100%; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-justify-content: center; + justify-content: center; + -webkit-align-items: center; + align-items: center; +} +.asciinema-player-wrapper:-moz-full-screen .asciinema-player { + position: static; +} +.asciinema-player-wrapper:-moz-full-screen .title-bar { + display: initial; +} +.asciinema-player-wrapper:-moz-full-screen.hud .title-bar { + top: 0; +} +.asciinema-player-wrapper:-ms-fullscreen { + background-color: #000; + width: 100%; + height: 100%; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-justify-content: center; + justify-content: center; + -webkit-align-items: center; + align-items: center; +} +.asciinema-player-wrapper:-ms-fullscreen .asciinema-player { + position: static; +} +.asciinema-player-wrapper:-ms-fullscreen .title-bar { + display: initial; +} +.asciinema-player-wrapper:-ms-fullscreen.hud .title-bar { + top: 0; +} +.asciinema-player-wrapper .asciinema-player { + text-align: left; + display: inline-block; + padding: 0px; + position: relative; + box-sizing: content-box; + -moz-box-sizing: content-box; + -webkit-box-sizing: content-box; + overflow: hidden; + max-width: 100%; +} +.asciinema-terminal { + box-sizing: content-box; + -moz-box-sizing: content-box; + -webkit-box-sizing: content-box; + overflow: hidden; + padding: 0; + margin: 0px; + display: block; + white-space: pre; + border: 0; + word-wrap: normal; + word-break: normal; + border-radius: 0; + border-style: solid; + cursor: text; + border-width: 0.5em; + font-family: Consolas, Menlo, 'Bitstream Vera Sans Mono', 
monospace, 'Powerline Symbols'; + line-height: 1.3333333333em; +} +.asciinema-terminal .line { + letter-spacing: normal; + overflow: hidden; + height: 1.3333333333em; +} +.asciinema-terminal .line span { + padding: 0; + display: inline-block; + height: 1.3333333333em; +} +.asciinema-terminal .line { + display: block; + width: 200%; +} +.asciinema-terminal .bright { + font-weight: bold; +} +.asciinema-terminal .underline { + text-decoration: underline; +} +.asciinema-terminal .italic { + font-style: italic; +} +.asciinema-terminal.font-small { + font-size: 12px; +} +.asciinema-terminal.font-medium { + font-size: 18px; +} +.asciinema-terminal.font-big { + font-size: 24px; +} +.asciinema-player .control-bar { + width: 100%; + height: 32px; + background: rgba(0, 0, 0, 0.8); + /* no gradient fallback */ + background: -moz-linear-gradient(top, rgba(0, 0, 0, 0.5) 0%, #000000 25%, #000000 100%); + /* FF3.6-15 */ + background: -webkit-linear-gradient(top, rgba(0, 0, 0, 0.5) 0%, #000000 25%, #000000 100%); + /* Chrome10-25,Safari5.1-6 */ + background: linear-gradient(to bottom, rgba(0, 0, 0, 0.5) 0%, #000000 25%, #000000 100%); + /* W3C, IE10+, FF16+, Chrome26+, Opera12+, Safari7+ */ + color: #bbbbbb; + box-sizing: content-box; + line-height: 1; + /* position: absolute; */ + bottom: -35px; + left: 0; + transition: bottom 0.15s linear; +} +.asciinema-player .control-bar * { + box-sizing: inherit; + font-size: 0; +} +.asciinema-player .control-bar svg.icon path { + fill: #bbbbbb; +} +.asciinema-player .control-bar .playback-button { + display: block; + float: left; + cursor: pointer; + height: 12px; + width: 12px; + padding: 10px; +} +.asciinema-player .control-bar .playback-button svg { + height: 12px; + width: 12px; +} +.asciinema-player .control-bar .timer { + display: block; + float: left; + width: 50px; + height: 100%; + text-align: center; + font-family: Helvetica, Arial, sans-serif; + font-size: 11px; + font-weight: bold; + line-height: 32px; + cursor: default; +} 
+.asciinema-player .control-bar .timer span { + display: inline-block; + font-size: inherit; +} +.asciinema-player .control-bar .timer .time-remaining { + display: none; +} +.asciinema-player .control-bar .timer:hover .time-elapsed { + display: none; +} +.asciinema-player .control-bar .timer:hover .time-remaining { + display: inline; +} +.asciinema-player .control-bar .progressbar { + display: block; + overflow: hidden; + height: 100%; + padding: 0 10px; +} +.asciinema-player .control-bar .progressbar .bar { + display: block; + cursor: pointer; + height: 100%; + padding-top: 15px; + font-size: 0; +} +.asciinema-player .control-bar .progressbar .bar .gutter { + display: block; + height: 3px; + background-color: #333; +} +.asciinema-player .control-bar .progressbar .bar .gutter span { + display: inline-block; + height: 100%; + background-color: #bbbbbb; + border-radius: 3px; +} +.asciinema-player .control-bar.live .progressbar .bar { + cursor: default; +} +.asciinema-player .control-bar .fullscreen-button { + display: block; + float: right; + width: 14px; + height: 14px; + padding: 9px; + cursor: pointer; +} +.asciinema-player .control-bar .fullscreen-button svg { + width: 14px; + height: 14px; +} +.asciinema-player .control-bar .fullscreen-button svg:first-child { + display: inline; +} +.asciinema-player .control-bar .fullscreen-button svg:last-child { + display: none; +} +.asciinema-player-wrapper.hud .control-bar { + bottom: 0px; +} +.asciinema-player-wrapper:fullscreen .fullscreen-button svg:first-child { + display: none; +} +.asciinema-player-wrapper:fullscreen .fullscreen-button svg:last-child { + display: inline; +} +.asciinema-player-wrapper:-webkit-full-screen .fullscreen-button svg:first-child { + display: none; +} +.asciinema-player-wrapper:-webkit-full-screen .fullscreen-button svg:last-child { + display: inline; +} +.asciinema-player-wrapper:-moz-full-screen .fullscreen-button svg:first-child { + display: none; +} 
+.asciinema-player-wrapper:-moz-full-screen .fullscreen-button svg:last-child { + display: inline; +} +.asciinema-player-wrapper:-ms-fullscreen .fullscreen-button svg:first-child { + display: none; +} +.asciinema-player-wrapper:-ms-fullscreen .fullscreen-button svg:last-child { + display: inline; +} +.asciinema-player .loading { + z-index: 10; + background-repeat: no-repeat; + background-position: center; + position: absolute; + top: 0; + left: 0; + right: 0; + bottom: 32px; + background-color: rgba(0, 0, 0, 0.5); +} +.asciinema-player .start-prompt { + z-index: 10; + background-repeat: no-repeat; + background-position: center; + position: absolute; + top: 0; + left: 0; + right: 0; + bottom: 32px; + z-index: 20; + cursor: pointer; +} +.asciinema-player .start-prompt .play-button { + font-size: 0px; +} +.asciinema-player .start-prompt .play-button { + position: absolute; + left: 0; + top: 0; + right: 0; + bottom: 0; + text-align: center; + color: white; + display: table; + width: 100%; + height: 100%; +} +.asciinema-player .start-prompt .play-button div { + vertical-align: middle; + display: table-cell; +} +.asciinema-player .start-prompt .play-button div span { + width: 96px; + height: 96px; + display: inline-block; +} +@-webkit-keyframes expand { + 0% { + -webkit-transform: scale(0); + } + 50% { + -webkit-transform: scale(1); + } + 100% { + z-index: 1; + } +} +@-moz-keyframes expand { + 0% { + -moz-transform: scale(0); + } + 50% { + -moz-transform: scale(1); + } + 100% { + z-index: 1; + } +} +@-o-keyframes expand { + 0% { + -o-transform: scale(0); + } + 50% { + -o-transform: scale(1); + } + 100% { + z-index: 1; + } +} +@keyframes expand { + 0% { + transform: scale(0); + } + 50% { + transform: scale(1); + } + 100% { + z-index: 1; + } +} +.loader { + position: absolute; + left: 50%; + top: 50%; + margin: -20px 0 0 -20px; + background-color: white; + border-radius: 50%; + box-shadow: 0 0 0 6.66667px #141414; + width: 40px; + height: 40px; +} +.loader:before, 
+.loader:after { + content: ""; + position: absolute; + left: 50%; + top: 50%; + display: block; + margin: -21px 0 0 -21px; + border-radius: 50%; + z-index: 2; + width: 42px; + height: 42px; +} +.loader:before { + background-color: #141414; + -webkit-animation: expand 1.6s linear infinite both; + -moz-animation: expand 1.6s linear infinite both; + animation: expand 1.6s linear infinite both; +} +.loader:after { + background-color: white; + -webkit-animation: expand 1.6s linear 0.8s infinite both; + -moz-animation: expand 1.6s linear 0.8s infinite both; + animation: expand 1.6s linear 0.8s infinite both; +} +.asciinema-terminal .fg-16 { + color: #000000; +} +.asciinema-terminal .bg-16 { + background-color: #000000; +} +.asciinema-terminal .fg-17 { + color: #00005f; +} +.asciinema-terminal .bg-17 { + background-color: #00005f; +} +.asciinema-terminal .fg-18 { + color: #000087; +} +.asciinema-terminal .bg-18 { + background-color: #000087; +} +.asciinema-terminal .fg-19 { + color: #0000af; +} +.asciinema-terminal .bg-19 { + background-color: #0000af; +} +.asciinema-terminal .fg-20 { + color: #0000d7; +} +.asciinema-terminal .bg-20 { + background-color: #0000d7; +} +.asciinema-terminal .fg-21 { + color: #0000ff; +} +.asciinema-terminal .bg-21 { + background-color: #0000ff; +} +.asciinema-terminal .fg-22 { + color: #005f00; +} +.asciinema-terminal .bg-22 { + background-color: #005f00; +} +.asciinema-terminal .fg-23 { + color: #005f5f; +} +.asciinema-terminal .bg-23 { + background-color: #005f5f; +} +.asciinema-terminal .fg-24 { + color: #005f87; +} +.asciinema-terminal .bg-24 { + background-color: #005f87; +} +.asciinema-terminal .fg-25 { + color: #005faf; +} +.asciinema-terminal .bg-25 { + background-color: #005faf; +} +.asciinema-terminal .fg-26 { + color: #005fd7; +} +.asciinema-terminal .bg-26 { + background-color: #005fd7; +} +.asciinema-terminal .fg-27 { + color: #005fff; +} +.asciinema-terminal .bg-27 { + background-color: #005fff; +} +.asciinema-terminal .fg-28 { 
+ color: #008700; +} +.asciinema-terminal .bg-28 { + background-color: #008700; +} +.asciinema-terminal .fg-29 { + color: #00875f; +} +.asciinema-terminal .bg-29 { + background-color: #00875f; +} +.asciinema-terminal .fg-30 { + color: #008787; +} +.asciinema-terminal .bg-30 { + background-color: #008787; +} +.asciinema-terminal .fg-31 { + color: #0087af; +} +.asciinema-terminal .bg-31 { + background-color: #0087af; +} +.asciinema-terminal .fg-32 { + color: #0087d7; +} +.asciinema-terminal .bg-32 { + background-color: #0087d7; +} +.asciinema-terminal .fg-33 { + color: #0087ff; +} +.asciinema-terminal .bg-33 { + background-color: #0087ff; +} +.asciinema-terminal .fg-34 { + color: #00af00; +} +.asciinema-terminal .bg-34 { + background-color: #00af00; +} +.asciinema-terminal .fg-35 { + color: #00af5f; +} +.asciinema-terminal .bg-35 { + background-color: #00af5f; +} +.asciinema-terminal .fg-36 { + color: #00af87; +} +.asciinema-terminal .bg-36 { + background-color: #00af87; +} +.asciinema-terminal .fg-37 { + color: #00afaf; +} +.asciinema-terminal .bg-37 { + background-color: #00afaf; +} +.asciinema-terminal .fg-38 { + color: #00afd7; +} +.asciinema-terminal .bg-38 { + background-color: #00afd7; +} +.asciinema-terminal .fg-39 { + color: #00afff; +} +.asciinema-terminal .bg-39 { + background-color: #00afff; +} +.asciinema-terminal .fg-40 { + color: #00d700; +} +.asciinema-terminal .bg-40 { + background-color: #00d700; +} +.asciinema-terminal .fg-41 { + color: #00d75f; +} +.asciinema-terminal .bg-41 { + background-color: #00d75f; +} +.asciinema-terminal .fg-42 { + color: #00d787; +} +.asciinema-terminal .bg-42 { + background-color: #00d787; +} +.asciinema-terminal .fg-43 { + color: #00d7af; +} +.asciinema-terminal .bg-43 { + background-color: #00d7af; +} +.asciinema-terminal .fg-44 { + color: #00d7d7; +} +.asciinema-terminal .bg-44 { + background-color: #00d7d7; +} +.asciinema-terminal .fg-45 { + color: #00d7ff; +} +.asciinema-terminal .bg-45 { + background-color: 
#00d7ff; +} +.asciinema-terminal .fg-46 { + color: #00ff00; +} +.asciinema-terminal .bg-46 { + background-color: #00ff00; +} +.asciinema-terminal .fg-47 { + color: #00ff5f; +} +.asciinema-terminal .bg-47 { + background-color: #00ff5f; +} +.asciinema-terminal .fg-48 { + color: #00ff87; +} +.asciinema-terminal .bg-48 { + background-color: #00ff87; +} +.asciinema-terminal .fg-49 { + color: #00ffaf; +} +.asciinema-terminal .bg-49 { + background-color: #00ffaf; +} +.asciinema-terminal .fg-50 { + color: #00ffd7; +} +.asciinema-terminal .bg-50 { + background-color: #00ffd7; +} +.asciinema-terminal .fg-51 { + color: #00ffff; +} +.asciinema-terminal .bg-51 { + background-color: #00ffff; +} +.asciinema-terminal .fg-52 { + color: #5f0000; +} +.asciinema-terminal .bg-52 { + background-color: #5f0000; +} +.asciinema-terminal .fg-53 { + color: #5f005f; +} +.asciinema-terminal .bg-53 { + background-color: #5f005f; +} +.asciinema-terminal .fg-54 { + color: #5f0087; +} +.asciinema-terminal .bg-54 { + background-color: #5f0087; +} +.asciinema-terminal .fg-55 { + color: #5f00af; +} +.asciinema-terminal .bg-55 { + background-color: #5f00af; +} +.asciinema-terminal .fg-56 { + color: #5f00d7; +} +.asciinema-terminal .bg-56 { + background-color: #5f00d7; +} +.asciinema-terminal .fg-57 { + color: #5f00ff; +} +.asciinema-terminal .bg-57 { + background-color: #5f00ff; +} +.asciinema-terminal .fg-58 { + color: #5f5f00; +} +.asciinema-terminal .bg-58 { + background-color: #5f5f00; +} +.asciinema-terminal .fg-59 { + color: #5f5f5f; +} +.asciinema-terminal .bg-59 { + background-color: #5f5f5f; +} +.asciinema-terminal .fg-60 { + color: #5f5f87; +} +.asciinema-terminal .bg-60 { + background-color: #5f5f87; +} +.asciinema-terminal .fg-61 { + color: #5f5faf; +} +.asciinema-terminal .bg-61 { + background-color: #5f5faf; +} +.asciinema-terminal .fg-62 { + color: #5f5fd7; +} +.asciinema-terminal .bg-62 { + background-color: #5f5fd7; +} +.asciinema-terminal .fg-63 { + color: #5f5fff; +} 
+.asciinema-terminal .bg-63 { + background-color: #5f5fff; +} +.asciinema-terminal .fg-64 { + color: #5f8700; +} +.asciinema-terminal .bg-64 { + background-color: #5f8700; +} +.asciinema-terminal .fg-65 { + color: #5f875f; +} +.asciinema-terminal .bg-65 { + background-color: #5f875f; +} +.asciinema-terminal .fg-66 { + color: #5f8787; +} +.asciinema-terminal .bg-66 { + background-color: #5f8787; +} +.asciinema-terminal .fg-67 { + color: #5f87af; +} +.asciinema-terminal .bg-67 { + background-color: #5f87af; +} +.asciinema-terminal .fg-68 { + color: #5f87d7; +} +.asciinema-terminal .bg-68 { + background-color: #5f87d7; +} +.asciinema-terminal .fg-69 { + color: #5f87ff; +} +.asciinema-terminal .bg-69 { + background-color: #5f87ff; +} +.asciinema-terminal .fg-70 { + color: #5faf00; +} +.asciinema-terminal .bg-70 { + background-color: #5faf00; +} +.asciinema-terminal .fg-71 { + color: #5faf5f; +} +.asciinema-terminal .bg-71 { + background-color: #5faf5f; +} +.asciinema-terminal .fg-72 { + color: #5faf87; +} +.asciinema-terminal .bg-72 { + background-color: #5faf87; +} +.asciinema-terminal .fg-73 { + color: #5fafaf; +} +.asciinema-terminal .bg-73 { + background-color: #5fafaf; +} +.asciinema-terminal .fg-74 { + color: #5fafd7; +} +.asciinema-terminal .bg-74 { + background-color: #5fafd7; +} +.asciinema-terminal .fg-75 { + color: #5fafff; +} +.asciinema-terminal .bg-75 { + background-color: #5fafff; +} +.asciinema-terminal .fg-76 { + color: #5fd700; +} +.asciinema-terminal .bg-76 { + background-color: #5fd700; +} +.asciinema-terminal .fg-77 { + color: #5fd75f; +} +.asciinema-terminal .bg-77 { + background-color: #5fd75f; +} +.asciinema-terminal .fg-78 { + color: #5fd787; +} +.asciinema-terminal .bg-78 { + background-color: #5fd787; +} +.asciinema-terminal .fg-79 { + color: #5fd7af; +} +.asciinema-terminal .bg-79 { + background-color: #5fd7af; +} +.asciinema-terminal .fg-80 { + color: #5fd7d7; +} +.asciinema-terminal .bg-80 { + background-color: #5fd7d7; +} 
+.asciinema-terminal .fg-81 { + color: #5fd7ff; +} +.asciinema-terminal .bg-81 { + background-color: #5fd7ff; +} +.asciinema-terminal .fg-82 { + color: #5fff00; +} +.asciinema-terminal .bg-82 { + background-color: #5fff00; +} +.asciinema-terminal .fg-83 { + color: #5fff5f; +} +.asciinema-terminal .bg-83 { + background-color: #5fff5f; +} +.asciinema-terminal .fg-84 { + color: #5fff87; +} +.asciinema-terminal .bg-84 { + background-color: #5fff87; +} +.asciinema-terminal .fg-85 { + color: #5fffaf; +} +.asciinema-terminal .bg-85 { + background-color: #5fffaf; +} +.asciinema-terminal .fg-86 { + color: #5fffd7; +} +.asciinema-terminal .bg-86 { + background-color: #5fffd7; +} +.asciinema-terminal .fg-87 { + color: #5fffff; +} +.asciinema-terminal .bg-87 { + background-color: #5fffff; +} +.asciinema-terminal .fg-88 { + color: #870000; +} +.asciinema-terminal .bg-88 { + background-color: #870000; +} +.asciinema-terminal .fg-89 { + color: #87005f; +} +.asciinema-terminal .bg-89 { + background-color: #87005f; +} +.asciinema-terminal .fg-90 { + color: #870087; +} +.asciinema-terminal .bg-90 { + background-color: #870087; +} +.asciinema-terminal .fg-91 { + color: #8700af; +} +.asciinema-terminal .bg-91 { + background-color: #8700af; +} +.asciinema-terminal .fg-92 { + color: #8700d7; +} +.asciinema-terminal .bg-92 { + background-color: #8700d7; +} +.asciinema-terminal .fg-93 { + color: #8700ff; +} +.asciinema-terminal .bg-93 { + background-color: #8700ff; +} +.asciinema-terminal .fg-94 { + color: #875f00; +} +.asciinema-terminal .bg-94 { + background-color: #875f00; +} +.asciinema-terminal .fg-95 { + color: #875f5f; +} +.asciinema-terminal .bg-95 { + background-color: #875f5f; +} +.asciinema-terminal .fg-96 { + color: #875f87; +} +.asciinema-terminal .bg-96 { + background-color: #875f87; +} +.asciinema-terminal .fg-97 { + color: #875faf; +} +.asciinema-terminal .bg-97 { + background-color: #875faf; +} +.asciinema-terminal .fg-98 { + color: #875fd7; +} +.asciinema-terminal .bg-98 
{ + background-color: #875fd7; +} +.asciinema-terminal .fg-99 { + color: #875fff; +} +.asciinema-terminal .bg-99 { + background-color: #875fff; +} +.asciinema-terminal .fg-100 { + color: #878700; +} +.asciinema-terminal .bg-100 { + background-color: #878700; +} +.asciinema-terminal .fg-101 { + color: #87875f; +} +.asciinema-terminal .bg-101 { + background-color: #87875f; +} +.asciinema-terminal .fg-102 { + color: #878787; +} +.asciinema-terminal .bg-102 { + background-color: #878787; +} +.asciinema-terminal .fg-103 { + color: #8787af; +} +.asciinema-terminal .bg-103 { + background-color: #8787af; +} +.asciinema-terminal .fg-104 { + color: #8787d7; +} +.asciinema-terminal .bg-104 { + background-color: #8787d7; +} +.asciinema-terminal .fg-105 { + color: #8787ff; +} +.asciinema-terminal .bg-105 { + background-color: #8787ff; +} +.asciinema-terminal .fg-106 { + color: #87af00; +} +.asciinema-terminal .bg-106 { + background-color: #87af00; +} +.asciinema-terminal .fg-107 { + color: #87af5f; +} +.asciinema-terminal .bg-107 { + background-color: #87af5f; +} +.asciinema-terminal .fg-108 { + color: #87af87; +} +.asciinema-terminal .bg-108 { + background-color: #87af87; +} +.asciinema-terminal .fg-109 { + color: #87afaf; +} +.asciinema-terminal .bg-109 { + background-color: #87afaf; +} +.asciinema-terminal .fg-110 { + color: #87afd7; +} +.asciinema-terminal .bg-110 { + background-color: #87afd7; +} +.asciinema-terminal .fg-111 { + color: #87afff; +} +.asciinema-terminal .bg-111 { + background-color: #87afff; +} +.asciinema-terminal .fg-112 { + color: #87d700; +} +.asciinema-terminal .bg-112 { + background-color: #87d700; +} +.asciinema-terminal .fg-113 { + color: #87d75f; +} +.asciinema-terminal .bg-113 { + background-color: #87d75f; +} +.asciinema-terminal .fg-114 { + color: #87d787; +} +.asciinema-terminal .bg-114 { + background-color: #87d787; +} +.asciinema-terminal .fg-115 { + color: #87d7af; +} +.asciinema-terminal .bg-115 { + background-color: #87d7af; +} 
+.asciinema-terminal .fg-116 { + color: #87d7d7; +} +.asciinema-terminal .bg-116 { + background-color: #87d7d7; +} +.asciinema-terminal .fg-117 { + color: #87d7ff; +} +.asciinema-terminal .bg-117 { + background-color: #87d7ff; +} +.asciinema-terminal .fg-118 { + color: #87ff00; +} +.asciinema-terminal .bg-118 { + background-color: #87ff00; +} +.asciinema-terminal .fg-119 { + color: #87ff5f; +} +.asciinema-terminal .bg-119 { + background-color: #87ff5f; +} +.asciinema-terminal .fg-120 { + color: #87ff87; +} +.asciinema-terminal .bg-120 { + background-color: #87ff87; +} +.asciinema-terminal .fg-121 { + color: #87ffaf; +} +.asciinema-terminal .bg-121 { + background-color: #87ffaf; +} +.asciinema-terminal .fg-122 { + color: #87ffd7; +} +.asciinema-terminal .bg-122 { + background-color: #87ffd7; +} +.asciinema-terminal .fg-123 { + color: #87ffff; +} +.asciinema-terminal .bg-123 { + background-color: #87ffff; +} +.asciinema-terminal .fg-124 { + color: #af0000; +} +.asciinema-terminal .bg-124 { + background-color: #af0000; +} +.asciinema-terminal .fg-125 { + color: #af005f; +} +.asciinema-terminal .bg-125 { + background-color: #af005f; +} +.asciinema-terminal .fg-126 { + color: #af0087; +} +.asciinema-terminal .bg-126 { + background-color: #af0087; +} +.asciinema-terminal .fg-127 { + color: #af00af; +} +.asciinema-terminal .bg-127 { + background-color: #af00af; +} +.asciinema-terminal .fg-128 { + color: #af00d7; +} +.asciinema-terminal .bg-128 { + background-color: #af00d7; +} +.asciinema-terminal .fg-129 { + color: #af00ff; +} +.asciinema-terminal .bg-129 { + background-color: #af00ff; +} +.asciinema-terminal .fg-130 { + color: #af5f00; +} +.asciinema-terminal .bg-130 { + background-color: #af5f00; +} +.asciinema-terminal .fg-131 { + color: #af5f5f; +} +.asciinema-terminal .bg-131 { + background-color: #af5f5f; +} +.asciinema-terminal .fg-132 { + color: #af5f87; +} +.asciinema-terminal .bg-132 { + background-color: #af5f87; +} +.asciinema-terminal .fg-133 { + color: 
#af5faf; +} +.asciinema-terminal .bg-133 { + background-color: #af5faf; +} +.asciinema-terminal .fg-134 { + color: #af5fd7; +} +.asciinema-terminal .bg-134 { + background-color: #af5fd7; +} +.asciinema-terminal .fg-135 { + color: #af5fff; +} +.asciinema-terminal .bg-135 { + background-color: #af5fff; +} +.asciinema-terminal .fg-136 { + color: #af8700; +} +.asciinema-terminal .bg-136 { + background-color: #af8700; +} +.asciinema-terminal .fg-137 { + color: #af875f; +} +.asciinema-terminal .bg-137 { + background-color: #af875f; +} +.asciinema-terminal .fg-138 { + color: #af8787; +} +.asciinema-terminal .bg-138 { + background-color: #af8787; +} +.asciinema-terminal .fg-139 { + color: #af87af; +} +.asciinema-terminal .bg-139 { + background-color: #af87af; +} +.asciinema-terminal .fg-140 { + color: #af87d7; +} +.asciinema-terminal .bg-140 { + background-color: #af87d7; +} +.asciinema-terminal .fg-141 { + color: #af87ff; +} +.asciinema-terminal .bg-141 { + background-color: #af87ff; +} +.asciinema-terminal .fg-142 { + color: #afaf00; +} +.asciinema-terminal .bg-142 { + background-color: #afaf00; +} +.asciinema-terminal .fg-143 { + color: #afaf5f; +} +.asciinema-terminal .bg-143 { + background-color: #afaf5f; +} +.asciinema-terminal .fg-144 { + color: #afaf87; +} +.asciinema-terminal .bg-144 { + background-color: #afaf87; +} +.asciinema-terminal .fg-145 { + color: #afafaf; +} +.asciinema-terminal .bg-145 { + background-color: #afafaf; +} +.asciinema-terminal .fg-146 { + color: #afafd7; +} +.asciinema-terminal .bg-146 { + background-color: #afafd7; +} +.asciinema-terminal .fg-147 { + color: #afafff; +} +.asciinema-terminal .bg-147 { + background-color: #afafff; +} +.asciinema-terminal .fg-148 { + color: #afd700; +} +.asciinema-terminal .bg-148 { + background-color: #afd700; +} +.asciinema-terminal .fg-149 { + color: #afd75f; +} +.asciinema-terminal .bg-149 { + background-color: #afd75f; +} +.asciinema-terminal .fg-150 { + color: #afd787; +} +.asciinema-terminal .bg-150 { + 
background-color: #afd787; +} +.asciinema-terminal .fg-151 { + color: #afd7af; +} +.asciinema-terminal .bg-151 { + background-color: #afd7af; +} +.asciinema-terminal .fg-152 { + color: #afd7d7; +} +.asciinema-terminal .bg-152 { + background-color: #afd7d7; +} +.asciinema-terminal .fg-153 { + color: #afd7ff; +} +.asciinema-terminal .bg-153 { + background-color: #afd7ff; +} +.asciinema-terminal .fg-154 { + color: #afff00; +} +.asciinema-terminal .bg-154 { + background-color: #afff00; +} +.asciinema-terminal .fg-155 { + color: #afff5f; +} +.asciinema-terminal .bg-155 { + background-color: #afff5f; +} +.asciinema-terminal .fg-156 { + color: #afff87; +} +.asciinema-terminal .bg-156 { + background-color: #afff87; +} +.asciinema-terminal .fg-157 { + color: #afffaf; +} +.asciinema-terminal .bg-157 { + background-color: #afffaf; +} +.asciinema-terminal .fg-158 { + color: #afffd7; +} +.asciinema-terminal .bg-158 { + background-color: #afffd7; +} +.asciinema-terminal .fg-159 { + color: #afffff; +} +.asciinema-terminal .bg-159 { + background-color: #afffff; +} +.asciinema-terminal .fg-160 { + color: #d70000; +} +.asciinema-terminal .bg-160 { + background-color: #d70000; +} +.asciinema-terminal .fg-161 { + color: #d7005f; +} +.asciinema-terminal .bg-161 { + background-color: #d7005f; +} +.asciinema-terminal .fg-162 { + color: #d70087; +} +.asciinema-terminal .bg-162 { + background-color: #d70087; +} +.asciinema-terminal .fg-163 { + color: #d700af; +} +.asciinema-terminal .bg-163 { + background-color: #d700af; +} +.asciinema-terminal .fg-164 { + color: #d700d7; +} +.asciinema-terminal .bg-164 { + background-color: #d700d7; +} +.asciinema-terminal .fg-165 { + color: #d700ff; +} +.asciinema-terminal .bg-165 { + background-color: #d700ff; +} +.asciinema-terminal .fg-166 { + color: #d75f00; +} +.asciinema-terminal .bg-166 { + background-color: #d75f00; +} +.asciinema-terminal .fg-167 { + color: #d75f5f; +} +.asciinema-terminal .bg-167 { + background-color: #d75f5f; +} 
+.asciinema-terminal .fg-168 { + color: #d75f87; +} +.asciinema-terminal .bg-168 { + background-color: #d75f87; +} +.asciinema-terminal .fg-169 { + color: #d75faf; +} +.asciinema-terminal .bg-169 { + background-color: #d75faf; +} +.asciinema-terminal .fg-170 { + color: #d75fd7; +} +.asciinema-terminal .bg-170 { + background-color: #d75fd7; +} +.asciinema-terminal .fg-171 { + color: #d75fff; +} +.asciinema-terminal .bg-171 { + background-color: #d75fff; +} +.asciinema-terminal .fg-172 { + color: #d78700; +} +.asciinema-terminal .bg-172 { + background-color: #d78700; +} +.asciinema-terminal .fg-173 { + color: #d7875f; +} +.asciinema-terminal .bg-173 { + background-color: #d7875f; +} +.asciinema-terminal .fg-174 { + color: #d78787; +} +.asciinema-terminal .bg-174 { + background-color: #d78787; +} +.asciinema-terminal .fg-175 { + color: #d787af; +} +.asciinema-terminal .bg-175 { + background-color: #d787af; +} +.asciinema-terminal .fg-176 { + color: #d787d7; +} +.asciinema-terminal .bg-176 { + background-color: #d787d7; +} +.asciinema-terminal .fg-177 { + color: #d787ff; +} +.asciinema-terminal .bg-177 { + background-color: #d787ff; +} +.asciinema-terminal .fg-178 { + color: #d7af00; +} +.asciinema-terminal .bg-178 { + background-color: #d7af00; +} +.asciinema-terminal .fg-179 { + color: #d7af5f; +} +.asciinema-terminal .bg-179 { + background-color: #d7af5f; +} +.asciinema-terminal .fg-180 { + color: #d7af87; +} +.asciinema-terminal .bg-180 { + background-color: #d7af87; +} +.asciinema-terminal .fg-181 { + color: #d7afaf; +} +.asciinema-terminal .bg-181 { + background-color: #d7afaf; +} +.asciinema-terminal .fg-182 { + color: #d7afd7; +} +.asciinema-terminal .bg-182 { + background-color: #d7afd7; +} +.asciinema-terminal .fg-183 { + color: #d7afff; +} +.asciinema-terminal .bg-183 { + background-color: #d7afff; +} +.asciinema-terminal .fg-184 { + color: #d7d700; +} +.asciinema-terminal .bg-184 { + background-color: #d7d700; +} +.asciinema-terminal .fg-185 { + color: 
#d7d75f; +} +.asciinema-terminal .bg-185 { + background-color: #d7d75f; +} +.asciinema-terminal .fg-186 { + color: #d7d787; +} +.asciinema-terminal .bg-186 { + background-color: #d7d787; +} +.asciinema-terminal .fg-187 { + color: #d7d7af; +} +.asciinema-terminal .bg-187 { + background-color: #d7d7af; +} +.asciinema-terminal .fg-188 { + color: #d7d7d7; +} +.asciinema-terminal .bg-188 { + background-color: #d7d7d7; +} +.asciinema-terminal .fg-189 { + color: #d7d7ff; +} +.asciinema-terminal .bg-189 { + background-color: #d7d7ff; +} +.asciinema-terminal .fg-190 { + color: #d7ff00; +} +.asciinema-terminal .bg-190 { + background-color: #d7ff00; +} +.asciinema-terminal .fg-191 { + color: #d7ff5f; +} +.asciinema-terminal .bg-191 { + background-color: #d7ff5f; +} +.asciinema-terminal .fg-192 { + color: #d7ff87; +} +.asciinema-terminal .bg-192 { + background-color: #d7ff87; +} +.asciinema-terminal .fg-193 { + color: #d7ffaf; +} +.asciinema-terminal .bg-193 { + background-color: #d7ffaf; +} +.asciinema-terminal .fg-194 { + color: #d7ffd7; +} +.asciinema-terminal .bg-194 { + background-color: #d7ffd7; +} +.asciinema-terminal .fg-195 { + color: #d7ffff; +} +.asciinema-terminal .bg-195 { + background-color: #d7ffff; +} +.asciinema-terminal .fg-196 { + color: #ff0000; +} +.asciinema-terminal .bg-196 { + background-color: #ff0000; +} +.asciinema-terminal .fg-197 { + color: #ff005f; +} +.asciinema-terminal .bg-197 { + background-color: #ff005f; +} +.asciinema-terminal .fg-198 { + color: #ff0087; +} +.asciinema-terminal .bg-198 { + background-color: #ff0087; +} +.asciinema-terminal .fg-199 { + color: #ff00af; +} +.asciinema-terminal .bg-199 { + background-color: #ff00af; +} +.asciinema-terminal .fg-200 { + color: #ff00d7; +} +.asciinema-terminal .bg-200 { + background-color: #ff00d7; +} +.asciinema-terminal .fg-201 { + color: #ff00ff; +} +.asciinema-terminal .bg-201 { + background-color: #ff00ff; +} +.asciinema-terminal .fg-202 { + color: #ff5f00; +} +.asciinema-terminal .bg-202 { + 
background-color: #ff5f00; +} +.asciinema-terminal .fg-203 { + color: #ff5f5f; +} +.asciinema-terminal .bg-203 { + background-color: #ff5f5f; +} +.asciinema-terminal .fg-204 { + color: #ff5f87; +} +.asciinema-terminal .bg-204 { + background-color: #ff5f87; +} +.asciinema-terminal .fg-205 { + color: #ff5faf; +} +.asciinema-terminal .bg-205 { + background-color: #ff5faf; +} +.asciinema-terminal .fg-206 { + color: #ff5fd7; +} +.asciinema-terminal .bg-206 { + background-color: #ff5fd7; +} +.asciinema-terminal .fg-207 { + color: #ff5fff; +} +.asciinema-terminal .bg-207 { + background-color: #ff5fff; +} +.asciinema-terminal .fg-208 { + color: #ff8700; +} +.asciinema-terminal .bg-208 { + background-color: #ff8700; +} +.asciinema-terminal .fg-209 { + color: #ff875f; +} +.asciinema-terminal .bg-209 { + background-color: #ff875f; +} +.asciinema-terminal .fg-210 { + color: #ff8787; +} +.asciinema-terminal .bg-210 { + background-color: #ff8787; +} +.asciinema-terminal .fg-211 { + color: #ff87af; +} +.asciinema-terminal .bg-211 { + background-color: #ff87af; +} +.asciinema-terminal .fg-212 { + color: #ff87d7; +} +.asciinema-terminal .bg-212 { + background-color: #ff87d7; +} +.asciinema-terminal .fg-213 { + color: #ff87ff; +} +.asciinema-terminal .bg-213 { + background-color: #ff87ff; +} +.asciinema-terminal .fg-214 { + color: #ffaf00; +} +.asciinema-terminal .bg-214 { + background-color: #ffaf00; +} +.asciinema-terminal .fg-215 { + color: #ffaf5f; +} +.asciinema-terminal .bg-215 { + background-color: #ffaf5f; +} +.asciinema-terminal .fg-216 { + color: #ffaf87; +} +.asciinema-terminal .bg-216 { + background-color: #ffaf87; +} +.asciinema-terminal .fg-217 { + color: #ffafaf; +} +.asciinema-terminal .bg-217 { + background-color: #ffafaf; +} +.asciinema-terminal .fg-218 { + color: #ffafd7; +} +.asciinema-terminal .bg-218 { + background-color: #ffafd7; +} +.asciinema-terminal .fg-219 { + color: #ffafff; +} +.asciinema-terminal .bg-219 { + background-color: #ffafff; +} 
+.asciinema-terminal .fg-220 { + color: #ffd700; +} +.asciinema-terminal .bg-220 { + background-color: #ffd700; +} +.asciinema-terminal .fg-221 { + color: #ffd75f; +} +.asciinema-terminal .bg-221 { + background-color: #ffd75f; +} +.asciinema-terminal .fg-222 { + color: #ffd787; +} +.asciinema-terminal .bg-222 { + background-color: #ffd787; +} +.asciinema-terminal .fg-223 { + color: #ffd7af; +} +.asciinema-terminal .bg-223 { + background-color: #ffd7af; +} +.asciinema-terminal .fg-224 { + color: #ffd7d7; +} +.asciinema-terminal .bg-224 { + background-color: #ffd7d7; +} +.asciinema-terminal .fg-225 { + color: #ffd7ff; +} +.asciinema-terminal .bg-225 { + background-color: #ffd7ff; +} +.asciinema-terminal .fg-226 { + color: #ffff00; +} +.asciinema-terminal .bg-226 { + background-color: #ffff00; +} +.asciinema-terminal .fg-227 { + color: #ffff5f; +} +.asciinema-terminal .bg-227 { + background-color: #ffff5f; +} +.asciinema-terminal .fg-228 { + color: #ffff87; +} +.asciinema-terminal .bg-228 { + background-color: #ffff87; +} +.asciinema-terminal .fg-229 { + color: #ffffaf; +} +.asciinema-terminal .bg-229 { + background-color: #ffffaf; +} +.asciinema-terminal .fg-230 { + color: #ffffd7; +} +.asciinema-terminal .bg-230 { + background-color: #ffffd7; +} +.asciinema-terminal .fg-231 { + color: #ffffff; +} +.asciinema-terminal .bg-231 { + background-color: #ffffff; +} +.asciinema-terminal .fg-232 { + color: #080808; +} +.asciinema-terminal .bg-232 { + background-color: #080808; +} +.asciinema-terminal .fg-233 { + color: #121212; +} +.asciinema-terminal .bg-233 { + background-color: #121212; +} +.asciinema-terminal .fg-234 { + color: #1c1c1c; +} +.asciinema-terminal .bg-234 { + background-color: #1c1c1c; +} +.asciinema-terminal .fg-235 { + color: #262626; +} +.asciinema-terminal .bg-235 { + background-color: #262626; +} +.asciinema-terminal .fg-236 { + color: #303030; +} +.asciinema-terminal .bg-236 { + background-color: #303030; +} +.asciinema-terminal .fg-237 { + color: 
#3a3a3a; +} +.asciinema-terminal .bg-237 { + background-color: #3a3a3a; +} +.asciinema-terminal .fg-238 { + color: #444444; +} +.asciinema-terminal .bg-238 { + background-color: #444444; +} +.asciinema-terminal .fg-239 { + color: #4e4e4e; +} +.asciinema-terminal .bg-239 { + background-color: #4e4e4e; +} +.asciinema-terminal .fg-240 { + color: #585858; +} +.asciinema-terminal .bg-240 { + background-color: #585858; +} +.asciinema-terminal .fg-241 { + color: #626262; +} +.asciinema-terminal .bg-241 { + background-color: #626262; +} +.asciinema-terminal .fg-242 { + color: #6c6c6c; +} +.asciinema-terminal .bg-242 { + background-color: #6c6c6c; +} +.asciinema-terminal .fg-243 { + color: #767676; +} +.asciinema-terminal .bg-243 { + background-color: #767676; +} +.asciinema-terminal .fg-244 { + color: #808080; +} +.asciinema-terminal .bg-244 { + background-color: #808080; +} +.asciinema-terminal .fg-245 { + color: #8a8a8a; +} +.asciinema-terminal .bg-245 { + background-color: #8a8a8a; +} +.asciinema-terminal .fg-246 { + color: #949494; +} +.asciinema-terminal .bg-246 { + background-color: #949494; +} +.asciinema-terminal .fg-247 { + color: #9e9e9e; +} +.asciinema-terminal .bg-247 { + background-color: #9e9e9e; +} +.asciinema-terminal .fg-248 { + color: #a8a8a8; +} +.asciinema-terminal .bg-248 { + background-color: #a8a8a8; +} +.asciinema-terminal .fg-249 { + color: #b2b2b2; +} +.asciinema-terminal .bg-249 { + background-color: #b2b2b2; +} +.asciinema-terminal .fg-250 { + color: #bcbcbc; +} +.asciinema-terminal .bg-250 { + background-color: #bcbcbc; +} +.asciinema-terminal .fg-251 { + color: #c6c6c6; +} +.asciinema-terminal .bg-251 { + background-color: #c6c6c6; +} +.asciinema-terminal .fg-252 { + color: #d0d0d0; +} +.asciinema-terminal .bg-252 { + background-color: #d0d0d0; +} +.asciinema-terminal .fg-253 { + color: #dadada; +} +.asciinema-terminal .bg-253 { + background-color: #dadada; +} +.asciinema-terminal .fg-254 { + color: #e4e4e4; +} +.asciinema-terminal .bg-254 { + 
background-color: #e4e4e4; +} +.asciinema-terminal .fg-255 { + color: #eeeeee; +} +.asciinema-terminal .bg-255 { + background-color: #eeeeee; +} +.asciinema-theme-asciinema .asciinema-terminal { + color: #cccccc; + background-color: #121314; + border-color: #121314; +} +.asciinema-theme-asciinema .fg-bg { + color: #121314; +} +.asciinema-theme-asciinema .bg-fg { + background-color: #cccccc; +} +.asciinema-theme-asciinema .fg-0 { + color: #000000; +} +.asciinema-theme-asciinema .bg-0 { + background-color: #000000; +} +.asciinema-theme-asciinema .fg-1 { + color: #dd3c69; +} +.asciinema-theme-asciinema .bg-1 { + background-color: #dd3c69; +} +.asciinema-theme-asciinema .fg-2 { + color: #4ebf22; +} +.asciinema-theme-asciinema .bg-2 { + background-color: #4ebf22; +} +.asciinema-theme-asciinema .fg-3 { + color: #ddaf3c; +} +.asciinema-theme-asciinema .bg-3 { + background-color: #ddaf3c; +} +.asciinema-theme-asciinema .fg-4 { + color: #26b0d7; +} +.asciinema-theme-asciinema .bg-4 { + background-color: #26b0d7; +} +.asciinema-theme-asciinema .fg-5 { + color: #b954e1; +} +.asciinema-theme-asciinema .bg-5 { + background-color: #b954e1; +} +.asciinema-theme-asciinema .fg-6 { + color: #54e1b9; +} +.asciinema-theme-asciinema .bg-6 { + background-color: #54e1b9; +} +.asciinema-theme-asciinema .fg-7 { + color: #d9d9d9; +} +.asciinema-theme-asciinema .bg-7 { + background-color: #d9d9d9; +} +.asciinema-theme-asciinema .fg-8 { + color: #4d4d4d; +} +.asciinema-theme-asciinema .bg-8 { + background-color: #4d4d4d; +} +.asciinema-theme-asciinema .fg-9 { + color: #dd3c69; +} +.asciinema-theme-asciinema .bg-9 { + background-color: #dd3c69; +} +.asciinema-theme-asciinema .fg-10 { + color: #4ebf22; +} +.asciinema-theme-asciinema .bg-10 { + background-color: #4ebf22; +} +.asciinema-theme-asciinema .fg-11 { + color: #ddaf3c; +} +.asciinema-theme-asciinema .bg-11 { + background-color: #ddaf3c; +} +.asciinema-theme-asciinema .fg-12 { + color: #26b0d7; +} +.asciinema-theme-asciinema .bg-12 { + 
background-color: #26b0d7; +} +.asciinema-theme-asciinema .fg-13 { + color: #b954e1; +} +.asciinema-theme-asciinema .bg-13 { + background-color: #b954e1; +} +.asciinema-theme-asciinema .fg-14 { + color: #54e1b9; +} +.asciinema-theme-asciinema .bg-14 { + background-color: #54e1b9; +} +.asciinema-theme-asciinema .fg-15 { + color: #ffffff; +} +.asciinema-theme-asciinema .bg-15 { + background-color: #ffffff; +} +.asciinema-theme-asciinema .fg-8, +.asciinema-theme-asciinema .fg-9, +.asciinema-theme-asciinema .fg-10, +.asciinema-theme-asciinema .fg-11, +.asciinema-theme-asciinema .fg-12, +.asciinema-theme-asciinema .fg-13, +.asciinema-theme-asciinema .fg-14, +.asciinema-theme-asciinema .fg-15 { + font-weight: bold; +} +.asciinema-theme-tango .asciinema-terminal { + color: #cccccc; + background-color: #121314; + border-color: #121314; +} +.asciinema-theme-tango .fg-bg { + color: #121314; +} +.asciinema-theme-tango .bg-fg { + background-color: #cccccc; +} +.asciinema-theme-tango .fg-0 { + color: #000000; +} +.asciinema-theme-tango .bg-0 { + background-color: #000000; +} +.asciinema-theme-tango .fg-1 { + color: #cc0000; +} +.asciinema-theme-tango .bg-1 { + background-color: #cc0000; +} +.asciinema-theme-tango .fg-2 { + color: #4e9a06; +} +.asciinema-theme-tango .bg-2 { + background-color: #4e9a06; +} +.asciinema-theme-tango .fg-3 { + color: #c4a000; +} +.asciinema-theme-tango .bg-3 { + background-color: #c4a000; +} +.asciinema-theme-tango .fg-4 { + color: #3465a4; +} +.asciinema-theme-tango .bg-4 { + background-color: #3465a4; +} +.asciinema-theme-tango .fg-5 { + color: #75507b; +} +.asciinema-theme-tango .bg-5 { + background-color: #75507b; +} +.asciinema-theme-tango .fg-6 { + color: #06989a; +} +.asciinema-theme-tango .bg-6 { + background-color: #06989a; +} +.asciinema-theme-tango .fg-7 { + color: #d3d7cf; +} +.asciinema-theme-tango .bg-7 { + background-color: #d3d7cf; +} +.asciinema-theme-tango .fg-8 { + color: #555753; +} +.asciinema-theme-tango .bg-8 { + 
background-color: #555753; +} +.asciinema-theme-tango .fg-9 { + color: #ef2929; +} +.asciinema-theme-tango .bg-9 { + background-color: #ef2929; +} +.asciinema-theme-tango .fg-10 { + color: #8ae234; +} +.asciinema-theme-tango .bg-10 { + background-color: #8ae234; +} +.asciinema-theme-tango .fg-11 { + color: #fce94f; +} +.asciinema-theme-tango .bg-11 { + background-color: #fce94f; +} +.asciinema-theme-tango .fg-12 { + color: #729fcf; +} +.asciinema-theme-tango .bg-12 { + background-color: #729fcf; +} +.asciinema-theme-tango .fg-13 { + color: #ad7fa8; +} +.asciinema-theme-tango .bg-13 { + background-color: #ad7fa8; +} +.asciinema-theme-tango .fg-14 { + color: #34e2e2; +} +.asciinema-theme-tango .bg-14 { + background-color: #34e2e2; +} +.asciinema-theme-tango .fg-15 { + color: #eeeeec; +} +.asciinema-theme-tango .bg-15 { + background-color: #eeeeec; +} +.asciinema-theme-tango .fg-8, +.asciinema-theme-tango .fg-9, +.asciinema-theme-tango .fg-10, +.asciinema-theme-tango .fg-11, +.asciinema-theme-tango .fg-12, +.asciinema-theme-tango .fg-13, +.asciinema-theme-tango .fg-14, +.asciinema-theme-tango .fg-15 { + font-weight: bold; +} +.asciinema-theme-solarized-dark .asciinema-terminal { + color: #839496; + background-color: #002b36; + border-color: #002b36; +} +.asciinema-theme-solarized-dark .fg-bg { + color: #002b36; +} +.asciinema-theme-solarized-dark .bg-fg { + background-color: #839496; +} +.asciinema-theme-solarized-dark .fg-0 { + color: #073642; +} +.asciinema-theme-solarized-dark .bg-0 { + background-color: #073642; +} +.asciinema-theme-solarized-dark .fg-1 { + color: #dc322f; +} +.asciinema-theme-solarized-dark .bg-1 { + background-color: #dc322f; +} +.asciinema-theme-solarized-dark .fg-2 { + color: #859900; +} +.asciinema-theme-solarized-dark .bg-2 { + background-color: #859900; +} +.asciinema-theme-solarized-dark .fg-3 { + color: #b58900; +} +.asciinema-theme-solarized-dark .bg-3 { + background-color: #b58900; +} +.asciinema-theme-solarized-dark .fg-4 { + color: 
#268bd2; +} +.asciinema-theme-solarized-dark .bg-4 { + background-color: #268bd2; +} +.asciinema-theme-solarized-dark .fg-5 { + color: #d33682; +} +.asciinema-theme-solarized-dark .bg-5 { + background-color: #d33682; +} +.asciinema-theme-solarized-dark .fg-6 { + color: #2aa198; +} +.asciinema-theme-solarized-dark .bg-6 { + background-color: #2aa198; +} +.asciinema-theme-solarized-dark .fg-7 { + color: #eee8d5; +} +.asciinema-theme-solarized-dark .bg-7 { + background-color: #eee8d5; +} +.asciinema-theme-solarized-dark .fg-8 { + color: #002b36; +} +.asciinema-theme-solarized-dark .bg-8 { + background-color: #002b36; +} +.asciinema-theme-solarized-dark .fg-9 { + color: #cb4b16; +} +.asciinema-theme-solarized-dark .bg-9 { + background-color: #cb4b16; +} +.asciinema-theme-solarized-dark .fg-10 { + color: #586e75; +} +.asciinema-theme-solarized-dark .bg-10 { + background-color: #586e75; +} +.asciinema-theme-solarized-dark .fg-11 { + color: #657b83; +} +.asciinema-theme-solarized-dark .bg-11 { + background-color: #657b83; +} +.asciinema-theme-solarized-dark .fg-12 { + color: #839496; +} +.asciinema-theme-solarized-dark .bg-12 { + background-color: #839496; +} +.asciinema-theme-solarized-dark .fg-13 { + color: #6c71c4; +} +.asciinema-theme-solarized-dark .bg-13 { + background-color: #6c71c4; +} +.asciinema-theme-solarized-dark .fg-14 { + color: #93a1a1; +} +.asciinema-theme-solarized-dark .bg-14 { + background-color: #93a1a1; +} +.asciinema-theme-solarized-dark .fg-15 { + color: #fdf6e3; +} +.asciinema-theme-solarized-dark .bg-15 { + background-color: #fdf6e3; +} +.asciinema-theme-solarized-light .asciinema-terminal { + color: #657b83; + background-color: #fdf6e3; + border-color: #fdf6e3; +} +.asciinema-theme-solarized-light .fg-bg { + color: #fdf6e3; +} +.asciinema-theme-solarized-light .bg-fg { + background-color: #657b83; +} +.asciinema-theme-solarized-light .fg-0 { + color: #073642; +} +.asciinema-theme-solarized-light .bg-0 { + background-color: #073642; +} 
+.asciinema-theme-solarized-light .fg-1 { + color: #dc322f; +} +.asciinema-theme-solarized-light .bg-1 { + background-color: #dc322f; +} +.asciinema-theme-solarized-light .fg-2 { + color: #859900; +} +.asciinema-theme-solarized-light .bg-2 { + background-color: #859900; +} +.asciinema-theme-solarized-light .fg-3 { + color: #b58900; +} +.asciinema-theme-solarized-light .bg-3 { + background-color: #b58900; +} +.asciinema-theme-solarized-light .fg-4 { + color: #268bd2; +} +.asciinema-theme-solarized-light .bg-4 { + background-color: #268bd2; +} +.asciinema-theme-solarized-light .fg-5 { + color: #d33682; +} +.asciinema-theme-solarized-light .bg-5 { + background-color: #d33682; +} +.asciinema-theme-solarized-light .fg-6 { + color: #2aa198; +} +.asciinema-theme-solarized-light .bg-6 { + background-color: #2aa198; +} +.asciinema-theme-solarized-light .fg-7 { + color: #eee8d5; +} +.asciinema-theme-solarized-light .bg-7 { + background-color: #eee8d5; +} +.asciinema-theme-solarized-light .fg-8 { + color: #002b36; +} +.asciinema-theme-solarized-light .bg-8 { + background-color: #002b36; +} +.asciinema-theme-solarized-light .fg-9 { + color: #cb4b16; +} +.asciinema-theme-solarized-light .bg-9 { + background-color: #cb4b16; +} +.asciinema-theme-solarized-light .fg-10 { + color: #586e75; +} +.asciinema-theme-solarized-light .bg-10 { + background-color: #586e75; +} +.asciinema-theme-solarized-light .fg-11 { + color: #657c83; +} +.asciinema-theme-solarized-light .bg-11 { + background-color: #657c83; +} +.asciinema-theme-solarized-light .fg-12 { + color: #839496; +} +.asciinema-theme-solarized-light .bg-12 { + background-color: #839496; +} +.asciinema-theme-solarized-light .fg-13 { + color: #6c71c4; +} +.asciinema-theme-solarized-light .bg-13 { + background-color: #6c71c4; +} +.asciinema-theme-solarized-light .fg-14 { + color: #93a1a1; +} +.asciinema-theme-solarized-light .bg-14 { + background-color: #93a1a1; +} +.asciinema-theme-solarized-light .fg-15 { + color: #fdf6e3; +} 
+.asciinema-theme-solarized-light .bg-15 { + background-color: #fdf6e3; +} +.asciinema-theme-seti .asciinema-terminal { + color: #cacecd; + background-color: #111213; + border-color: #111213; +} +.asciinema-theme-seti .fg-bg { + color: #111213; +} +.asciinema-theme-seti .bg-fg { + background-color: #cacecd; +} +.asciinema-theme-seti .fg-0 { + color: #323232; +} +.asciinema-theme-seti .bg-0 { + background-color: #323232; +} +.asciinema-theme-seti .fg-1 { + color: #c22832; +} +.asciinema-theme-seti .bg-1 { + background-color: #c22832; +} +.asciinema-theme-seti .fg-2 { + color: #8ec43d; +} +.asciinema-theme-seti .bg-2 { + background-color: #8ec43d; +} +.asciinema-theme-seti .fg-3 { + color: #e0c64f; +} +.asciinema-theme-seti .bg-3 { + background-color: #e0c64f; +} +.asciinema-theme-seti .fg-4 { + color: #43a5d5; +} +.asciinema-theme-seti .bg-4 { + background-color: #43a5d5; +} +.asciinema-theme-seti .fg-5 { + color: #8b57b5; +} +.asciinema-theme-seti .bg-5 { + background-color: #8b57b5; +} +.asciinema-theme-seti .fg-6 { + color: #8ec43d; +} +.asciinema-theme-seti .bg-6 { + background-color: #8ec43d; +} +.asciinema-theme-seti .fg-7 { + color: #eeeeee; +} +.asciinema-theme-seti .bg-7 { + background-color: #eeeeee; +} +.asciinema-theme-seti .fg-8 { + color: #323232; +} +.asciinema-theme-seti .bg-8 { + background-color: #323232; +} +.asciinema-theme-seti .fg-9 { + color: #c22832; +} +.asciinema-theme-seti .bg-9 { + background-color: #c22832; +} +.asciinema-theme-seti .fg-10 { + color: #8ec43d; +} +.asciinema-theme-seti .bg-10 { + background-color: #8ec43d; +} +.asciinema-theme-seti .fg-11 { + color: #e0c64f; +} +.asciinema-theme-seti .bg-11 { + background-color: #e0c64f; +} +.asciinema-theme-seti .fg-12 { + color: #43a5d5; +} +.asciinema-theme-seti .bg-12 { + background-color: #43a5d5; +} +.asciinema-theme-seti .fg-13 { + color: #8b57b5; +} +.asciinema-theme-seti .bg-13 { + background-color: #8b57b5; +} +.asciinema-theme-seti .fg-14 { + color: #8ec43d; +} 
+.asciinema-theme-seti .bg-14 { + background-color: #8ec43d; +} +.asciinema-theme-seti .fg-15 { + color: #ffffff; +} +.asciinema-theme-seti .bg-15 { + background-color: #ffffff; +} +.asciinema-theme-seti .fg-8, +.asciinema-theme-seti .fg-9, +.asciinema-theme-seti .fg-10, +.asciinema-theme-seti .fg-11, +.asciinema-theme-seti .fg-12, +.asciinema-theme-seti .fg-13, +.asciinema-theme-seti .fg-14, +.asciinema-theme-seti .fg-15 { + font-weight: bold; +} +/* Based on Monokai from base16 collection - https://github.com/chriskempson/base16 */ +.asciinema-theme-monokai .asciinema-terminal { + color: #f8f8f2; + background-color: #272822; + border-color: #272822; +} +.asciinema-theme-monokai .fg-bg { + color: #272822; +} +.asciinema-theme-monokai .bg-fg { + background-color: #f8f8f2; +} +.asciinema-theme-monokai .fg-0 { + color: #272822; +} +.asciinema-theme-monokai .bg-0 { + background-color: #272822; +} +.asciinema-theme-monokai .fg-1 { + color: #f92672; +} +.asciinema-theme-monokai .bg-1 { + background-color: #f92672; +} +.asciinema-theme-monokai .fg-2 { + color: #a6e22e; +} +.asciinema-theme-monokai .bg-2 { + background-color: #a6e22e; +} +.asciinema-theme-monokai .fg-3 { + color: #f4bf75; +} +.asciinema-theme-monokai .bg-3 { + background-color: #f4bf75; +} +.asciinema-theme-monokai .fg-4 { + color: #66d9ef; +} +.asciinema-theme-monokai .bg-4 { + background-color: #66d9ef; +} +.asciinema-theme-monokai .fg-5 { + color: #ae81ff; +} +.asciinema-theme-monokai .bg-5 { + background-color: #ae81ff; +} +.asciinema-theme-monokai .fg-6 { + color: #a1efe4; +} +.asciinema-theme-monokai .bg-6 { + background-color: #a1efe4; +} +.asciinema-theme-monokai .fg-7 { + color: #f8f8f2; +} +.asciinema-theme-monokai .bg-7 { + background-color: #f8f8f2; +} +.asciinema-theme-monokai .fg-8 { + color: #75715e; +} +.asciinema-theme-monokai .bg-8 { + background-color: #75715e; +} +.asciinema-theme-monokai .fg-9 { + color: #f92672; +} +.asciinema-theme-monokai .bg-9 { + background-color: #f92672; +} 
+.asciinema-theme-monokai .fg-10 { + color: #a6e22e; +} +.asciinema-theme-monokai .bg-10 { + background-color: #a6e22e; +} +.asciinema-theme-monokai .fg-11 { + color: #f4bf75; +} +.asciinema-theme-monokai .bg-11 { + background-color: #f4bf75; +} +.asciinema-theme-monokai .fg-12 { + color: #66d9ef; +} +.asciinema-theme-monokai .bg-12 { + background-color: #66d9ef; +} +.asciinema-theme-monokai .fg-13 { + color: #ae81ff; +} +.asciinema-theme-monokai .bg-13 { + background-color: #ae81ff; +} +.asciinema-theme-monokai .fg-14 { + color: #a1efe4; +} +.asciinema-theme-monokai .bg-14 { + background-color: #a1efe4; +} +.asciinema-theme-monokai .fg-15 { + color: #f9f8f5; +} +.asciinema-theme-monokai .bg-15 { + background-color: #f9f8f5; +} +.asciinema-theme-monokai .fg-8, +.asciinema-theme-monokai .fg-9, +.asciinema-theme-monokai .fg-10, +.asciinema-theme-monokai .fg-11, +.asciinema-theme-monokai .fg-12, +.asciinema-theme-monokai .fg-13, +.asciinema-theme-monokai .fg-14, +.asciinema-theme-monokai .fg-15 { + font-weight: bold; +} diff --git a/refs/pull/405/merge/_static/asciinema-player.js b/refs/pull/405/merge/_static/asciinema-player.js new file mode 100644 index 00000000..5ad47e08 --- /dev/null +++ b/refs/pull/405/merge/_static/asciinema-player.js @@ -0,0 +1,1213 @@ +/** + * asciinema-player v2.6.1 + * + * Copyright 2011-2018, Marcin Kulik + * + */ + +// CustomEvent polyfill from MDN (https://developer.mozilla.org/en-US/docs/Web/API/CustomEvent/CustomEvent) + +(function () { + if (typeof window.CustomEvent === "function") return false; + + function CustomEvent ( event, params ) { + params = params || { bubbles: false, cancelable: false, detail: undefined }; + var evt = document.createEvent( 'CustomEvent'); + evt.initCustomEvent(event, params.bubbles, params.cancelable, params.detail); + return evt; + } + + CustomEvent.prototype = window.Event.prototype; + + window.CustomEvent = CustomEvent; +})(); + +/** + * @license + * Copyright (c) 2014 The Polymer Project Authors. 
All rights reserved. + * This code may only be used under the BSD style license found at http://polymer.github.io/LICENSE.txt + * The complete set of authors may be found at http://polymer.github.io/AUTHORS.txt + * The complete set of contributors may be found at http://polymer.github.io/CONTRIBUTORS.txt + * Code distributed by Google as part of the polymer project is also + * subject to an additional IP rights grant found at http://polymer.github.io/PATENTS.txt + */ +// @version 0.7.22 +"undefined"==typeof WeakMap&&!function(){var e=Object.defineProperty,t=Date.now()%1e9,n=function(){this.name="__st"+(1e9*Math.random()>>>0)+(t++ +"__")};n.prototype={set:function(t,n){var o=t[this.name];return o&&o[0]===t?o[1]=n:e(t,this.name,{value:[t,n],writable:!0}),this},get:function(e){var t;return(t=e[this.name])&&t[0]===e?t[1]:void 0},"delete":function(e){var t=e[this.name];return t&&t[0]===e?(t[0]=t[1]=void 0,!0):!1},has:function(e){var t=e[this.name];return t?t[0]===e:!1}},window.WeakMap=n}(),function(e){function t(e){E.push(e),b||(b=!0,w(o))}function n(e){return window.ShadowDOMPolyfill&&window.ShadowDOMPolyfill.wrapIfNeeded(e)||e}function o(){b=!1;var e=E;E=[],e.sort(function(e,t){return e.uid_-t.uid_});var t=!1;e.forEach(function(e){var n=e.takeRecords();r(e),n.length&&(e.callback_(n,e),t=!0)}),t&&o()}function r(e){e.nodes_.forEach(function(t){var n=v.get(t);n&&n.forEach(function(t){t.observer===e&&t.removeTransientObservers()})})}function i(e,t){for(var n=e;n;n=n.parentNode){var o=v.get(n);if(o)for(var r=0;r0){var r=n[o-1],i=p(r,e);if(i)return void(n[o-1]=i)}else t(this.observer);n[o]=e},addListeners:function(){this.addListeners_(this.target)},addListeners_:function(e){var 
t=this.options;t.attributes&&e.addEventListener("DOMAttrModified",this,!0),t.characterData&&e.addEventListener("DOMCharacterDataModified",this,!0),t.childList&&e.addEventListener("DOMNodeInserted",this,!0),(t.childList||t.subtree)&&e.addEventListener("DOMNodeRemoved",this,!0)},removeListeners:function(){this.removeListeners_(this.target)},removeListeners_:function(e){var t=this.options;t.attributes&&e.removeEventListener("DOMAttrModified",this,!0),t.characterData&&e.removeEventListener("DOMCharacterDataModified",this,!0),t.childList&&e.removeEventListener("DOMNodeInserted",this,!0),(t.childList||t.subtree)&&e.removeEventListener("DOMNodeRemoved",this,!0)},addTransientObserver:function(e){if(e!==this.target){this.addListeners_(e),this.transientObservedNodes.push(e);var t=v.get(e);t||v.set(e,t=[]),t.push(this)}},removeTransientObservers:function(){var e=this.transientObservedNodes;this.transientObservedNodes=[],e.forEach(function(e){this.removeListeners_(e);for(var t=v.get(e),n=0;n=0)){n.push(e);for(var o,r=e.querySelectorAll("link[rel="+a+"]"),d=0,s=r.length;s>d&&(o=r[d]);d++)o["import"]&&i(o["import"],t,n);t(e)}}var a=window.HTMLImports?window.HTMLImports.IMPORT_LINK_TYPE:"none";e.forDocumentTree=r,e.forSubtree=t}),window.CustomElements.addModule(function(e){function t(e,t){return n(e,t)||o(e,t)}function n(t,n){return e.upgrade(t,n)?!0:void(n&&a(t))}function o(e,t){b(e,function(e){return n(e,t)?!0:void 0})}function r(e){N.push(e),y||(y=!0,setTimeout(i))}function i(){y=!1;for(var e,t=N,n=0,o=t.length;o>n&&(e=t[n]);n++)e();N=[]}function a(e){_?r(function(){d(e)}):d(e)}function d(e){e.__upgraded__&&!e.__attached&&(e.__attached=!0,e.attachedCallback&&e.attachedCallback())}function s(e){u(e),b(e,function(e){u(e)})}function u(e){_?r(function(){c(e)}):c(e)}function c(e){e.__upgraded__&&e.__attached&&(e.__attached=!1,e.detachedCallback&&e.detachedCallback())}function l(e){for(var 
t=e,n=window.wrap(document);t;){if(t==n)return!0;t=t.parentNode||t.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&t.host}}function f(e){if(e.shadowRoot&&!e.shadowRoot.__watched){g.dom&&console.log("watching shadow-root for: ",e.localName);for(var t=e.shadowRoot;t;)w(t),t=t.olderShadowRoot}}function p(e,n){if(g.dom){var o=n[0];if(o&&"childList"===o.type&&o.addedNodes&&o.addedNodes){for(var r=o.addedNodes[0];r&&r!==document&&!r.host;)r=r.parentNode;var i=r&&(r.URL||r._URL||r.host&&r.host.localName)||"";i=i.split("/?").shift().split("/").pop()}console.group("mutations (%d) [%s]",n.length,i||"")}var a=l(e);n.forEach(function(e){"childList"===e.type&&(M(e.addedNodes,function(e){e.localName&&t(e,a)}),M(e.removedNodes,function(e){e.localName&&s(e)}))}),g.dom&&console.groupEnd()}function m(e){for(e=window.wrap(e),e||(e=window.wrap(document));e.parentNode;)e=e.parentNode;var t=e.__observer;t&&(p(e,t.takeRecords()),i())}function w(e){if(!e.__observer){var t=new MutationObserver(p.bind(this,e));t.observe(e,{childList:!0,subtree:!0}),e.__observer=t}}function v(e){e=window.wrap(e),g.dom&&console.group("upgradeDocument: ",e.baseURI.split("/").pop());var n=e===window.wrap(document);t(e,n),w(e),g.dom&&console.groupEnd()}function h(e){E(e,v)}var g=e.flags,b=e.forSubtree,E=e.forDocumentTree,_=window.MutationObserver._isPolyfilled&&g["throttle-attached"];e.hasPolyfillMutations=_,e.hasThrottledAttached=_;var y=!1,N=[],M=Array.prototype.forEach.call.bind(Array.prototype.forEach),O=Element.prototype.createShadowRoot;O&&(Element.prototype.createShadowRoot=function(){var e=O.call(this);return window.CustomElements.watchShadow(this),e}),e.watchShadow=f,e.upgradeDocumentTree=h,e.upgradeDocument=v,e.upgradeSubtree=o,e.upgradeAll=t,e.attached=a,e.takeRecords=m}),window.CustomElements.addModule(function(e){function t(t,o){if("template"===t.localName&&window.HTMLTemplateElement&&HTMLTemplateElement.decorate&&HTMLTemplateElement.decorate(t),!t.__upgraded__&&t.nodeType===Node.ELEMENT_NODE){var 
r=t.getAttribute("is"),i=e.getRegisteredDefinition(t.localName)||e.getRegisteredDefinition(r);if(i&&(r&&i.tag==t.localName||!r&&!i["extends"]))return n(t,i,o)}}function n(t,n,r){return a.upgrade&&console.group("upgrade:",t.localName),n.is&&t.setAttribute("is",n.is),o(t,n),t.__upgraded__=!0,i(t),r&&e.attached(t),e.upgradeSubtree(t,r),a.upgrade&&console.groupEnd(),t}function o(e,t){Object.__proto__?e.__proto__=t.prototype:(r(e,t.prototype,t["native"]),e.__proto__=t.prototype)}function r(e,t,n){for(var o={},r=t;r!==n&&r!==HTMLElement.prototype;){for(var i,a=Object.getOwnPropertyNames(r),d=0;i=a[d];d++)o[i]||(Object.defineProperty(e,i,Object.getOwnPropertyDescriptor(r,i)),o[i]=1);r=Object.getPrototypeOf(r)}}function i(e){e.createdCallback&&e.createdCallback()}var a=e.flags;e.upgrade=t,e.upgradeWithDefinition=n,e.implementPrototype=o}),window.CustomElements.addModule(function(e){function t(t,o){var s=o||{};if(!t)throw new Error("document.registerElement: first argument `name` must not be empty");if(t.indexOf("-")<0)throw new Error("document.registerElement: first argument ('name') must contain a dash ('-'). Argument provided was '"+String(t)+"'.");if(r(t))throw new Error("Failed to execute 'registerElement' on 'Document': Registration failed for type '"+String(t)+"'. 
The type name is invalid.");if(u(t))throw new Error("DuplicateDefinitionError: a type with name '"+String(t)+"' is already registered");return s.prototype||(s.prototype=Object.create(HTMLElement.prototype)),s.__name=t.toLowerCase(),s["extends"]&&(s["extends"]=s["extends"].toLowerCase()),s.lifecycle=s.lifecycle||{},s.ancestry=i(s["extends"]),a(s),d(s),n(s.prototype),c(s.__name,s),s.ctor=l(s),s.ctor.prototype=s.prototype,s.prototype.constructor=s.ctor,e.ready&&v(document),s.ctor}function n(e){if(!e.setAttribute._polyfilled){var t=e.setAttribute;e.setAttribute=function(e,n){o.call(this,e,n,t)};var n=e.removeAttribute;e.removeAttribute=function(e){o.call(this,e,null,n)},e.setAttribute._polyfilled=!0}}function o(e,t,n){e=e.toLowerCase();var o=this.getAttribute(e);n.apply(this,arguments);var r=this.getAttribute(e);this.attributeChangedCallback&&r!==o&&this.attributeChangedCallback(e,o,r)}function r(e){for(var t=0;t<_.length;t++)if(e===_[t])return!0}function i(e){var t=u(e);return t?i(t["extends"]).concat([t]):[]}function a(e){for(var t,n=e["extends"],o=0;t=e.ancestry[o];o++)n=t.is&&t.tag;e.tag=n||e.__name,n&&(e.is=e.__name)}function d(e){if(!Object.__proto__){var t=HTMLElement.prototype;if(e.is){var n=document.createElement(e.tag);t=Object.getPrototypeOf(n)}for(var o,r=e.prototype,i=!1;r;)r==t&&(i=!0),o=Object.getPrototypeOf(r),o&&(r.__proto__=o),r=o;i||console.warn(e.tag+" prototype not found in prototype chain for "+e.is),e["native"]=t}}function s(e){return g(M(e.tag),e)}function u(e){return e?y[e.toLowerCase()]:void 0}function c(e,t){y[e]=t}function l(e){return function(){return s(e)}}function f(e,t,n){return e===N?p(t,n):O(e,t)}function p(e,t){e&&(e=e.toLowerCase()),t&&(t=t.toLowerCase());var n=u(t||e);if(n){if(e==n.tag&&t==n.is)return new n.ctor;if(!t&&!n.is)return new n.ctor}var o;return t?(o=p(e),o.setAttribute("is",t),o):(o=M(e),e.indexOf("-")>=0&&b(o,HTMLElement),o)}function m(e,t){var n=e[t];e[t]=function(){var e=n.apply(this,arguments);return h(e),e}}var 
w,v=(e.isIE,e.upgradeDocumentTree),h=e.upgradeAll,g=e.upgradeWithDefinition,b=e.implementPrototype,E=e.useNative,_=["annotation-xml","color-profile","font-face","font-face-src","font-face-uri","font-face-format","font-face-name","missing-glyph"],y={},N="http://www.w3.org/1999/xhtml",M=document.createElement.bind(document),O=document.createElementNS.bind(document);w=Object.__proto__||E?function(e,t){return e instanceof t}:function(e,t){if(e instanceof t)return!0;for(var n=e;n;){if(n===t.prototype)return!0;n=n.__proto__}return!1},m(Node.prototype,"cloneNode"),m(document,"importNode"),document.registerElement=t,document.createElement=p,document.createElementNS=f,e.registry=y,e["instanceof"]=w,e.reservedTagList=_,e.getRegisteredDefinition=u,document.register=document.registerElement}),function(e){function t(){i(window.wrap(document)),window.CustomElements.ready=!0;var e=window.requestAnimationFrame||function(e){setTimeout(e,16)};e(function(){setTimeout(function(){window.CustomElements.readyTime=Date.now(),window.HTMLImports&&(window.CustomElements.elapsed=window.CustomElements.readyTime-window.HTMLImports.readyTime),document.dispatchEvent(new CustomEvent("WebComponentsReady",{bubbles:!0}))})})}var n=e.useNative,o=e.initializeModules;e.isIE;if(n){var r=function(){};e.watchShadow=r,e.upgrade=r,e.upgradeAll=r,e.upgradeDocumentTree=r,e.upgradeSubtree=r,e.takeRecords=r,e["instanceof"]=function(e,t){return e instanceof t}}else o();var i=e.upgradeDocumentTree,a=e.upgradeDocument;if(window.wrap||(window.ShadowDOMPolyfill?(window.wrap=window.ShadowDOMPolyfill.wrapIfNeeded,window.unwrap=window.ShadowDOMPolyfill.unwrapIfNeeded):window.wrap=window.unwrap=function(e){return e}),window.HTMLImports&&(window.HTMLImports.__importsParsingHook=function(e){e["import"]&&a(wrap(e["import"]))}),"complete"===document.readyState||e.flags.eager)t();else if("interactive"!==document.readyState||window.attachEvent||window.HTMLImports&&!window.HTMLImports.ready){var 
d=window.HTMLImports&&!window.HTMLImports.ready?"HTMLImportsLoaded":"DOMContentLoaded";window.addEventListener(d,t)}else t()}(window.CustomElements); +if(typeof Math.imul == "undefined" || (Math.imul(0xffffffff,5) == 0)) { + Math.imul = function (a, b) { + var ah = (a >>> 16) & 0xffff; + var al = a & 0xffff; + var bh = (b >>> 16) & 0xffff; + var bl = b & 0xffff; + // the shift by 0 fixes the sign on the high part + // the final |0 converts the unsigned value into a signed value + return ((al * bl) + (((ah * bl + al * bh) << 16) >>> 0)|0); + } +} + +/** + * React v15.5.4 + * + * Copyright 2013-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + * + */ +!function(t){if("object"==typeof exports&&"undefined"!=typeof module)module.exports=t();else if("function"==typeof define&&define.amd)define([],t);else{var e;e="undefined"!=typeof window?window:"undefined"!=typeof global?global:"undefined"!=typeof self?self:this,e.React=t()}}(function(){return function t(e,n,r){function o(u,a){if(!n[u]){if(!e[u]){var s="function"==typeof require&&require;if(!a&&s)return s(u,!0);if(i)return i(u,!0);var c=new Error("Cannot find module '"+u+"'");throw c.code="MODULE_NOT_FOUND",c}var l=n[u]={exports:{}};e[u][0].call(l.exports,function(t){var n=e[u][1][t];return o(n||t)},l,l.exports,t,e,n,r)}return n[u].exports}for(var i="function"==typeof require&&require,u=0;u1){for(var y=Array(d),h=0;h1){for(var 
m=Array(v),b=0;b8&&C<=11),x=32,w=String.fromCharCode(x),T={beforeInput:{phasedRegistrationNames:{bubbled:"onBeforeInput",captured:"onBeforeInputCapture"},dependencies:["topCompositionEnd","topKeyPress","topTextInput","topPaste"]},compositionEnd:{phasedRegistrationNames:{bubbled:"onCompositionEnd",captured:"onCompositionEndCapture"},dependencies:["topBlur","topCompositionEnd","topKeyDown","topKeyPress","topKeyUp","topMouseDown"]},compositionStart:{phasedRegistrationNames:{bubbled:"onCompositionStart",captured:"onCompositionStartCapture"},dependencies:["topBlur","topCompositionStart","topKeyDown","topKeyPress","topKeyUp","topMouseDown"]},compositionUpdate:{phasedRegistrationNames:{bubbled:"onCompositionUpdate",captured:"onCompositionUpdateCapture"},dependencies:["topBlur","topCompositionUpdate","topKeyDown","topKeyPress","topKeyUp","topMouseDown"]}},k=!1,P=null,S={eventTypes:T,extractEvents:function(e,t,n,r){return[u(e,t,n,r),p(e,t,n,r)]}};t.exports=S},{123:123,19:19,20:20,78:78,82:82}],4:[function(e,t,n){"use strict";function r(e,t){return e+t.charAt(0).toUpperCase()+t.substring(1)}var o={animationIterationCount:!0,borderImageOutset:!0,borderImageSlice:!0,borderImageWidth:!0,boxFlex:!0,boxFlexGroup:!0,boxOrdinalGroup:!0,columnCount:!0,flex:!0,flexGrow:!0,flexPositive:!0,flexShrink:!0,flexNegative:!0,flexOrder:!0,gridRow:!0,gridColumn:!0,fontWeight:!0,lineClamp:!0,lineHeight:!0,opacity:!0,order:!0,orphans:!0,tabSize:!0,widows:!0,zIndex:!0,zoom:!0,fillOpacity:!0,floodOpacity:!0,stopOpacity:!0,strokeDasharray:!0,strokeDashoffset:!0,strokeMiterlimit:!0,strokeOpacity:!0,strokeWidth:!0},i=["Webkit","ms","Moz","O"];Object.keys(o).forEach(function(e){i.forEach(function(t){o[r(t,e)]=o[e]})});var 
a={background:{backgroundAttachment:!0,backgroundColor:!0,backgroundImage:!0,backgroundPositionX:!0,backgroundPositionY:!0,backgroundRepeat:!0},backgroundPosition:{backgroundPositionX:!0,backgroundPositionY:!0},border:{borderWidth:!0,borderStyle:!0,borderColor:!0},borderBottom:{borderBottomWidth:!0,borderBottomStyle:!0,borderBottomColor:!0},borderLeft:{borderLeftWidth:!0,borderLeftStyle:!0,borderLeftColor:!0},borderRight:{borderRightWidth:!0,borderRightStyle:!0,borderRightColor:!0},borderTop:{borderTopWidth:!0,borderTopStyle:!0,borderTopColor:!0},font:{fontStyle:!0,fontVariant:!0,fontWeight:!0,fontSize:!0,lineHeight:!0,fontFamily:!0},outline:{outlineWidth:!0,outlineStyle:!0,outlineColor:!0}},s={isUnitlessNumber:o,shorthandPropertyExpansions:a};t.exports=s},{}],5:[function(e,t,n){"use strict";var r=e(4),o=e(123),i=(e(58),e(125),e(94)),a=e(136),s=e(140),u=(e(142),s(function(e){return a(e)})),l=!1,c="cssFloat";if(o.canUseDOM){var p=document.createElement("div").style;try{p.font=""}catch(e){l=!0}void 0===document.documentElement.style.cssFloat&&(c="styleFloat")}var d={createMarkupForStyles:function(e,t){var n="";for(var r in e)if(e.hasOwnProperty(r)){var o=e[r];null!=o&&(n+=u(r)+":",n+=i(r,o,t)+";")}return n||null},setValueForStyles:function(e,t,n){var o=e.style;for(var a in t)if(t.hasOwnProperty(a)){var s=i(a,t[a],n);if("float"!==a&&"cssFloat"!==a||(a=c),s)o[a]=s;else{var u=l&&r.shorthandPropertyExpansions[a];if(u)for(var p in u)o[p]="";else o[a]=""}}}};t.exports=d},{123:123,125:125,136:136,140:140,142:142,4:4,58:58,94:94}],6:[function(e,t,n){"use strict";function r(e,t){if(!(e instanceof t))throw new TypeError("Cannot call a class as a function")}var o=e(112),i=e(24),a=(e(137),function(){function e(t){r(this,e),this._callbacks=null,this._contexts=null,this._arg=t}return e.prototype.enqueue=function(e,t){this._callbacks=this._callbacks||[],this._callbacks.push(e),this._contexts=this._contexts||[],this._contexts.push(t)},e.prototype.notifyAll=function(){var 
e=this._callbacks,t=this._contexts,n=this._arg;if(e&&t){e.length!==t.length&&o("24"),this._callbacks=null,this._contexts=null;for(var r=0;r8));var A=!1;b.canUseDOM&&(A=k("input")&&(!document.documentMode||document.documentMode>11));var D={get:function(){return O.get.call(this)},set:function(e){I=""+e,O.set.call(this,e)}},L={eventTypes:S,extractEvents:function(e,t,n,o){var i,a,s=t?E.getNodeFromInstance(t):window;if(r(s)?R?i=u:a=l:P(s)?A?i=f:(i=m,a=h):v(s)&&(i=g),i){var c=i(e,t);if(c){var p=w.getPooled(S.change,c,n,o);return p.type="change",C.accumulateTwoPhaseDispatches(p),p}}a&&a(e,s,t),"topBlur"===e&&y(t,s)}};t.exports=L},{102:102,109:109,110:110,123:123,16:16,19:19,33:33,71:71,80:80}],8:[function(e,t,n){"use strict";function r(e,t){return Array.isArray(t)&&(t=t[1]),t?t.nextSibling:e.firstChild}function o(e,t,n){c.insertTreeBefore(e,t,n)}function i(e,t,n){Array.isArray(t)?s(e,t[0],t[1],n):m(e,t,n)}function a(e,t){if(Array.isArray(t)){var n=t[1];t=t[0],u(e,t,n),e.removeChild(n)}e.removeChild(t)}function s(e,t,n,r){for(var o=t;;){var i=o.nextSibling;if(m(e,o,r),o===n)break;o=i}}function u(e,t,n){for(;;){var r=t.nextSibling;if(r===n)break;e.removeChild(r)}}function l(e,t,n){var r=e.parentNode,o=e.nextSibling;o===t?n&&m(r,document.createTextNode(n),o):n?(h(o,n),u(r,o,t)):u(r,e,t)}var c=e(9),p=e(13),d=(e(33),e(58),e(93)),f=e(114),h=e(115),m=d(function(e,t,n){e.insertBefore(t,n)}),v=p.dangerouslyReplaceNodeWithMarkup,g={dangerouslyReplaceNodeWithMarkup:v,replaceDelimitedText:l,processUpdates:function(e,t){for(var n=0;n-1||a("96",e),!l.plugins[n]){t.extractEvents||a("97",e),l.plugins[n]=t;var r=t.eventTypes;for(var i in r)o(r[i],t,i)||a("98",i,e)}}}function o(e,t,n){l.eventNameDispatchConfigs.hasOwnProperty(n)&&a("99",n),l.eventNameDispatchConfigs[n]=e;var r=e.phasedRegistrationNames;if(r){for(var o in r)if(r.hasOwnProperty(o)){var s=r[o];i(s,t,n)}return!0}return!!e.registrationName&&(i(e.registrationName,t,n),!0)}function 
i(e,t,n){l.registrationNameModules[e]&&a("100",e),l.registrationNameModules[e]=t,l.registrationNameDependencies[e]=t.eventTypes[n].dependencies}var a=e(112),s=(e(137),null),u={},l={plugins:[],eventNameDispatchConfigs:{},registrationNameModules:{},registrationNameDependencies:{},possibleRegistrationNames:null,injectEventPluginOrder:function(e){s&&a("101"),s=Array.prototype.slice.call(e),r()},injectEventPluginsByName:function(e){var t=!1;for(var n in e)if(e.hasOwnProperty(n)){var o=e[n];u.hasOwnProperty(n)&&u[n]===o||(u[n]&&a("102",n),u[n]=o,t=!0)}t&&r()},getPluginModuleForEvent:function(e){var t=e.dispatchConfig;if(t.registrationName)return l.registrationNameModules[t.registrationName]||null;if(void 0!==t.phasedRegistrationNames){var n=t.phasedRegistrationNames;for(var r in n)if(n.hasOwnProperty(r)){var o=l.registrationNameModules[n[r]];if(o)return o}}return null},_resetEventPlugins:function(){s=null;for(var e in u)u.hasOwnProperty(e)&&delete u[e];l.plugins.length=0;var t=l.eventNameDispatchConfigs;for(var n in t)t.hasOwnProperty(n)&&delete t[n];var r=l.registrationNameModules;for(var o in r)r.hasOwnProperty(o)&&delete r[o]}};t.exports=l},{112:112,137:137}],18:[function(e,t,n){"use strict";function r(e){return"topMouseUp"===e||"topTouchEnd"===e||"topTouchCancel"===e}function o(e){return"topMouseMove"===e||"topTouchMove"===e}function i(e){return"topMouseDown"===e||"topTouchStart"===e}function a(e,t,n,r){var o=e.type||"unknown-event";e.currentTarget=g.getNodeFromInstance(r),t?m.invokeGuardedCallbackWithCatch(o,n,e):m.invokeGuardedCallback(o,n,e),e.currentTarget=null}function s(e,t){var n=e._dispatchListeners,r=e._dispatchInstances;if(Array.isArray(n))for(var o=0;o1?1-t:void 0;return this._fallbackText=o.slice(e,s),this._fallbackText}}),i.addPoolingTo(r),t.exports=r},{106:106,143:143,24:24}],21:[function(e,t,n){"use strict";var 
r=e(11),o=r.injection.MUST_USE_PROPERTY,i=r.injection.HAS_BOOLEAN_VALUE,a=r.injection.HAS_NUMERIC_VALUE,s=r.injection.HAS_POSITIVE_NUMERIC_VALUE,u=r.injection.HAS_OVERLOADED_BOOLEAN_VALUE,l={isCustomAttribute:RegExp.prototype.test.bind(new RegExp("^(data|aria)-["+r.ATTRIBUTE_NAME_CHAR+"]*$")),Properties:{accept:0,acceptCharset:0,accessKey:0,action:0,allowFullScreen:i,allowTransparency:0,alt:0,as:0,async:i,autoComplete:0,autoPlay:i,capture:i,cellPadding:0,cellSpacing:0,charSet:0,challenge:0,checked:o|i,cite:0,classID:0,className:0,cols:s,colSpan:0,content:0,contentEditable:0,contextMenu:0,controls:i,coords:0,crossOrigin:0,data:0,dateTime:0,default:i,defer:i,dir:0,disabled:i,download:u,draggable:0,encType:0,form:0,formAction:0,formEncType:0,formMethod:0,formNoValidate:i,formTarget:0,frameBorder:0,headers:0,height:0,hidden:i,high:0,href:0,hrefLang:0,htmlFor:0,httpEquiv:0,icon:0,id:0,inputMode:0,integrity:0,is:0,keyParams:0,keyType:0,kind:0,label:0,lang:0,list:0,loop:i,low:0,manifest:0,marginHeight:0,marginWidth:0,max:0,maxLength:0,media:0,mediaGroup:0,method:0,min:0,minLength:0,multiple:o|i,muted:o|i,name:0,nonce:0,noValidate:i,open:i,optimum:0,pattern:0,placeholder:0,playsInline:i,poster:0,preload:0,profile:0,radioGroup:0,readOnly:i,referrerPolicy:0,rel:0,required:i,reversed:i,role:0,rows:s,rowSpan:a,sandbox:0,scope:0,scoped:i,scrolling:0,seamless:i,selected:o|i,shape:0,size:s,sizes:0,span:s,spellCheck:0,src:0,srcDoc:0,srcLang:0,srcSet:0,start:a,step:0,style:0,summary:0,tabIndex:0,target:0,title:0,type:0,useMap:0,value:0,width:0,wmode:0,wrap:0,about:0,datatype:0,inlist:0,prefix:0,property:0,resource:0,typeof:0,vocab:0,autoCapitalize:0,autoCorrect:0,autoSave:0,color:0,itemProp:0,itemScope:i,itemType:0,itemID:0,itemRef:0,results:0,security:0,unselectable:0},DOMAttributeNames:{acceptCharset:"accept-charset",className:"class",htmlFor:"for",httpEquiv:"http-equiv"},DOMPropertyNames:{},DOMMutationMethods:{value:function(e,t){if(null==t)return 
e.removeAttribute("value");"number"!==e.type||!1===e.hasAttribute("value")?e.setAttribute("value",""+t):e.validity&&!e.validity.badInput&&e.ownerDocument.activeElement!==e&&e.setAttribute("value",""+t)}}};t.exports=l},{11:11}],22:[function(e,t,n){"use strict";function r(e){var t={"=":"=0",":":"=2"};return"$"+(""+e).replace(/[=:]/g,function(e){return t[e]})}function o(e){var t={"=0":"=","=2":":"};return(""+("."===e[0]&&"$"===e[1]?e.substring(2):e.substring(1))).replace(/(=0|=2)/g,function(e){return t[e]})}var i={escape:r,unescape:o};t.exports=i},{}],23:[function(e,t,n){"use strict";function r(e){null!=e.checkedLink&&null!=e.valueLink&&s("87")}function o(e){r(e),(null!=e.value||null!=e.onChange)&&s("88")}function i(e){r(e),(null!=e.checked||null!=e.onChange)&&s("89")}function a(e){if(e){var t=e.getName();if(t)return" Check the render method of `"+t+"`."}return""}var s=e(112),u=e(64),l=e(145),c=e(120),p=l(c.isValidElement),d=(e(137),e(142),{button:!0,checkbox:!0,image:!0,hidden:!0,radio:!0,reset:!0,submit:!0}),f={value:function(e,t,n){return!e[t]||d[e.type]||e.onChange||e.readOnly||e.disabled?null:new Error("You provided a `value` prop to a form field without an `onChange` handler. This will render a read-only field. If the field should be mutable use `defaultValue`. Otherwise, set either `onChange` or `readOnly`.")},checked:function(e,t,n){return!e[t]||e.onChange||e.readOnly||e.disabled?null:new Error("You provided a `checked` prop to a form field without an `onChange` handler. This will render a read-only field. If the field should be mutable use `defaultChecked`. 
Otherwise, set either `onChange` or `readOnly`.")},onChange:p.func},h={},m={checkPropTypes:function(e,t,n){for(var r in f){if(f.hasOwnProperty(r))var o=f[r](t,r,e,"prop",null,u);o instanceof Error&&!(o.message in h)&&(h[o.message]=!0,a(n))}},getValue:function(e){return e.valueLink?(o(e),e.valueLink.value):e.value},getChecked:function(e){return e.checkedLink?(i(e),e.checkedLink.value):e.checked},executeOnChange:function(e,t){return e.valueLink?(o(e),e.valueLink.requestChange(t.target.value)):e.checkedLink?(i(e),e.checkedLink.requestChange(t.target.checked)):e.onChange?e.onChange.call(void 0,t):void 0}};t.exports=m},{112:112,120:120,137:137,142:142,145:145,64:64}],24:[function(e,t,n){"use strict";var r=e(112),o=(e(137),function(e){var t=this;if(t.instancePool.length){var n=t.instancePool.pop();return t.call(n,e),n}return new t(e)}),i=function(e,t){var n=this;if(n.instancePool.length){var r=n.instancePool.pop();return n.call(r,e,t),r}return new n(e,t)},a=function(e,t,n){var r=this;if(r.instancePool.length){var o=r.instancePool.pop();return r.call(o,e,t,n),o}return new r(e,t,n)},s=function(e,t,n,r){var o=this;if(o.instancePool.length){var i=o.instancePool.pop();return o.call(i,e,t,n,r),i}return new o(e,t,n,r)},u=function(e){var t=this;e instanceof t||r("25"),e.destructor(),t.instancePool.length=0||null!=t.is}function h(e){var t=e.type;d(t),this._currentElement=e,this._tag=t.toLowerCase(),this._namespaceURI=null,this._renderedChildren=null,this._previousStyle=null,this._previousStyleCopy=null,this._hostNode=null,this._hostParent=null,this._rootNodeID=0,this._domID=0,this._hostContainerInfo=null,this._wrapperState=null,this._topLevelWrapper=null,this._flags=0}var 
m=e(112),v=e(143),g=e(2),y=e(5),_=e(9),C=e(10),b=e(11),E=e(12),x=e(16),w=e(17),T=e(25),k=e(32),P=e(33),S=e(38),N=e(39),M=e(40),I=e(43),O=(e(58),e(61)),R=e(68),A=(e(129),e(95)),D=(e(137),e(109),e(141),e(118),e(142),k),L=x.deleteListener,U=P.getNodeFromInstance,F=T.listenTo,j=w.registrationNameModules,V={string:!0,number:!0},B="__html",W={children:null,dangerouslySetInnerHTML:null,suppressContentEditableWarning:null},H=11,q={topAbort:"abort",topCanPlay:"canplay",topCanPlayThrough:"canplaythrough",topDurationChange:"durationchange",topEmptied:"emptied",topEncrypted:"encrypted",topEnded:"ended",topError:"error",topLoadedData:"loadeddata",topLoadedMetadata:"loadedmetadata",topLoadStart:"loadstart",topPause:"pause",topPlay:"play",topPlaying:"playing",topProgress:"progress",topRateChange:"ratechange",topSeeked:"seeked",topSeeking:"seeking",topStalled:"stalled",topSuspend:"suspend",topTimeUpdate:"timeupdate",topVolumeChange:"volumechange",topWaiting:"waiting"},K={area:!0,base:!0,br:!0,col:!0,embed:!0,hr:!0,img:!0,input:!0,keygen:!0,link:!0,meta:!0,param:!0,source:!0,track:!0,wbr:!0},z={listing:!0,pre:!0,textarea:!0},Y=v({menuitem:!0},K),X=/^[a-zA-Z][a-zA-Z:_\.\-\d]*$/,Q={},G={}.hasOwnProperty,$=1;h.displayName="ReactDOMComponent",h.Mixin={mountComponent:function(e,t,n,r){this._rootNodeID=$++,this._domID=n._idCounter++,this._hostParent=t,this._hostContainerInfo=n;var 
i=this._currentElement.props;switch(this._tag){case"audio":case"form":case"iframe":case"img":case"link":case"object":case"source":case"video":this._wrapperState={listeners:null},e.getReactMountReady().enqueue(c,this);break;case"input":S.mountWrapper(this,i,t),i=S.getHostProps(this,i),e.getReactMountReady().enqueue(c,this);break;case"option":N.mountWrapper(this,i,t),i=N.getHostProps(this,i);break;case"select":M.mountWrapper(this,i,t),i=M.getHostProps(this,i),e.getReactMountReady().enqueue(c,this);break;case"textarea":I.mountWrapper(this,i,t),i=I.getHostProps(this,i),e.getReactMountReady().enqueue(c,this)}o(this,i);var a,p;null!=t?(a=t._namespaceURI,p=t._tag):n._tag&&(a=n._namespaceURI,p=n._tag),(null==a||a===C.svg&&"foreignobject"===p)&&(a=C.html),a===C.html&&("svg"===this._tag?a=C.svg:"math"===this._tag&&(a=C.mathml)),this._namespaceURI=a;var d;if(e.useCreateElement){var f,h=n._ownerDocument;if(a===C.html)if("script"===this._tag){var m=h.createElement("div"),v=this._currentElement.type;m.innerHTML="<"+v+">",f=m.removeChild(m.firstChild)}else f=i.is?h.createElement(this._currentElement.type,i.is):h.createElement(this._currentElement.type);else f=h.createElementNS(a,this._currentElement.type);P.precacheNode(this,f),this._flags|=D.hasCachedChildNodes,this._hostParent||E.setAttributeForRoot(f),this._updateDOMProperties(null,i,e);var y=_(f);this._createInitialChildren(e,i,r,y),d=y}else{var b=this._createOpenTagMarkupAndPutListeners(e,i),x=this._createContentMarkup(e,i,r);d=!x&&K[this._tag]?b+"/>":b+">"+x+""}switch(this._tag){case"input":e.getReactMountReady().enqueue(s,this),i.autoFocus&&e.getReactMountReady().enqueue(g.focusDOMComponent,this);break;case"textarea":e.getReactMountReady().enqueue(u,this),i.autoFocus&&e.getReactMountReady().enqueue(g.focusDOMComponent,this);break;case"select":case"button":i.autoFocus&&e.getReactMountReady().enqueue(g.focusDOMComponent,this);break;case"option":e.getReactMountReady().enqueue(l,this)}return 
d},_createOpenTagMarkupAndPutListeners:function(e,t){var n="<"+this._currentElement.type;for(var r in t)if(t.hasOwnProperty(r)){var o=t[r];if(null!=o)if(j.hasOwnProperty(r))o&&i(this,r,o,e);else{"style"===r&&(o&&(o=this._previousStyleCopy=v({},t.style)),o=y.createMarkupForStyles(o,this));var a=null;null!=this._tag&&f(this._tag,t)?W.hasOwnProperty(r)||(a=E.createMarkupForCustomAttribute(r,o)):a=E.createMarkupForProperty(r,o),a&&(n+=" "+a)}}return e.renderToStaticMarkup?n:(this._hostParent||(n+=" "+E.createMarkupForRoot()),n+=" "+E.createMarkupForID(this._domID))},_createContentMarkup:function(e,t,n){var r="",o=t.dangerouslySetInnerHTML;if(null!=o)null!=o.__html&&(r=o.__html);else{var i=V[typeof t.children]?t.children:null,a=null!=i?null:t.children;if(null!=i)r=A(i);else if(null!=a){var s=this.mountChildren(a,e,n);r=s.join("")}}return z[this._tag]&&"\n"===r.charAt(0)?"\n"+r:r},_createInitialChildren:function(e,t,n,r){var o=t.dangerouslySetInnerHTML;if(null!=o)null!=o.__html&&_.queueHTML(r,o.__html);else{var i=V[typeof t.children]?t.children:null,a=null!=i?null:t.children;if(null!=i)""!==i&&_.queueText(r,i);else if(null!=a)for(var s=this.mountChildren(a,e,n),u=0;u"},receiveComponent:function(){},getHostNode:function(){return i.getNodeFromInstance(this)},unmountComponent:function(){i.uncacheNode(this)}}),t.exports=a},{143:143,33:33,9:9}],36:[function(e,t,n){"use strict";var r={useCreateElement:!0,useFiber:!1};t.exports=r},{}],37:[function(e,t,n){"use strict";var r=e(8),o=e(33),i={dangerouslyProcessChildrenUpdates:function(e,t){var n=o.getNodeFromInstance(e);r.processUpdates(n,t)}};t.exports=i},{33:33,8:8}],38:[function(e,t,n){"use strict";function r(){this._rootNodeID&&d.updateWrapper(this)}function o(e){return"checkbox"===e.type||"radio"===e.type?null!=e.checked:null!=e.value}function i(e){var t=this._currentElement.props,n=l.executeOnChange(t,e);p.asap(r,this);var o=t.name;if("radio"===t.type&&null!=o){for(var 
i=c.getNodeFromInstance(this),s=i;s.parentNode;)s=s.parentNode;for(var u=s.querySelectorAll("input[name="+JSON.stringify(""+o)+'][type="radio"]'),d=0;dt.end?(n=t.end,r=t.start):(n=t.start,r=t.end),o.moveToElementText(e),o.moveStart("character",n),o.setEndPoint("EndToStart",o),o.moveEnd("character",r-n),o.select()}function s(e,t){if(window.getSelection){var n=window.getSelection(),r=e[c()].length,o=Math.min(t.start,r),i=void 0===t.end?o:Math.min(t.end,r);if(!n.extend&&o>i){var a=i;i=o,o=a}var s=l(e,o),u=l(e,i);if(s&&u){var p=document.createRange();p.setStart(s.node,s.offset),n.removeAllRanges(),o>i?(n.addRange(p),n.extend(u.node,u.offset)):(p.setEnd(u.node,u.offset),n.addRange(p))}}}var u=e(123),l=e(105),c=e(106),p=u.canUseDOM&&"selection"in document&&!("getSelection"in window),d={getOffsets:p?o:i,setOffsets:p?a:s};t.exports=d},{105:105,106:106,123:123}],42:[function(e,t,n){"use strict";var r=e(112),o=e(143),i=e(8),a=e(9),s=e(33),u=e(95),l=(e(137),e(118),function(e){this._currentElement=e,this._stringText=""+e, +this._hostNode=null,this._hostParent=null,this._domID=0,this._mountIndex=0,this._closingComment=null,this._commentNodes=null});o(l.prototype,{mountComponent:function(e,t,n,r){var o=n._idCounter++,i=" react-text: "+o+" ";if(this._domID=o,this._hostParent=t,e.useCreateElement){var l=n._ownerDocument,c=l.createComment(i),p=l.createComment(" /react-text "),d=a(l.createDocumentFragment());return a.queueChild(d,a(c)),this._stringText&&a.queueChild(d,a(l.createTextNode(this._stringText))),a.queueChild(d,a(p)),s.precacheNode(this,c),this._closingComment=p,d}var f=u(this._stringText);return e.renderToStaticMarkup?f:""+f+""},receiveComponent:function(e,t){if(e!==this._currentElement){this._currentElement=e;var n=""+e;if(n!==this._stringText){this._stringText=n;var r=this.getHostNode();i.replaceDelimitedText(r[0],r[1],n)}}},getHostNode:function(){var e=this._commentNodes;if(e)return e;if(!this._closingComment)for(var 
t=s.getNodeFromInstance(this),n=t.nextSibling;;){if(null==n&&r("67",this._domID),8===n.nodeType&&" /react-text "===n.nodeValue){this._closingComment=n;break}n=n.nextSibling}return e=[this._hostNode,this._closingComment],this._commentNodes=e,e},unmountComponent:function(){this._closingComment=null,this._commentNodes=null,s.uncacheNode(this)}}),t.exports=l},{112:112,118:118,137:137,143:143,33:33,8:8,9:9,95:95}],43:[function(e,t,n){"use strict";function r(){this._rootNodeID&&c.updateWrapper(this)}function o(e){var t=this._currentElement.props,n=s.executeOnChange(t,e);return l.asap(r,this),n}var i=e(112),a=e(143),s=e(23),u=e(33),l=e(71),c=(e(137),e(142),{getHostProps:function(e,t){return null!=t.dangerouslySetInnerHTML&&i("91"),a({},t,{value:void 0,defaultValue:void 0,children:""+e._wrapperState.initialValue,onChange:e._wrapperState.onChange})},mountWrapper:function(e,t){var n=s.getValue(t),r=n;if(null==n){var a=t.defaultValue,u=t.children;null!=u&&(null!=a&&i("92"),Array.isArray(u)&&(u.length<=1||i("93"),u=u[0]),a=""+u),null==a&&(a=""),r=a}e._wrapperState={initialValue:""+r,listeners:null,onChange:o.bind(e)}},updateWrapper:function(e){var t=e._currentElement.props,n=u.getNodeFromInstance(e),r=s.getValue(t);if(null!=r){var o=""+r;o!==n.value&&(n.value=o),null==t.defaultValue&&(n.defaultValue=o)}null!=t.defaultValue&&(n.defaultValue=t.defaultValue)},postMountWrapper:function(e){var t=u.getNodeFromInstance(e),n=t.textContent;n===e._wrapperState.initialValue&&(t.value=n)}});t.exports=c},{112:112,137:137,142:142,143:143,23:23,33:33,71:71}],44:[function(e,t,n){"use strict";function r(e,t){"_hostNode"in e||u("33"),"_hostNode"in t||u("33");for(var n=0,r=e;r;r=r._hostParent)n++;for(var o=0,i=t;i;i=i._hostParent)o++;for(;n-o>0;)e=e._hostParent,n--;for(;o-n>0;)t=t._hostParent,o--;for(var a=n;a--;){if(e===t)return e;e=e._hostParent,t=t._hostParent}return null}function o(e,t){"_hostNode"in e||u("35"),"_hostNode"in 
t||u("35");for(;t;){if(t===e)return!0;t=t._hostParent}return!1}function i(e){return"_hostNode"in e||u("36"),e._hostParent}function a(e,t,n){for(var r=[];e;)r.push(e),e=e._hostParent;var o;for(o=r.length;o-- >0;)t(r[o],"captured",n);for(o=0;o0;)n(u[l],"captured",i)}var u=e(112);e(137);t.exports={isAncestor:o,getLowestCommonAncestor:r,getParentInstance:i,traverseTwoPhase:a,traverseEnterLeave:s}},{112:112,137:137}],45:[function(e,t,n){"use strict";var r=e(120),o=e(30),i=o;r.addons&&(r.__SECRET_INJECTED_REACT_DOM_DO_NOT_USE_OR_YOU_WILL_BE_FIRED=i),t.exports=i},{120:120,30:30}],46:[function(e,t,n){"use strict";function r(){this.reinitializeTransaction()}var o=e(143),i=e(71),a=e(89),s=e(129),u={initialize:s,close:function(){d.isBatchingUpdates=!1}},l={initialize:s,close:i.flushBatchedUpdates.bind(i)},c=[l,u];o(r.prototype,a,{getTransactionWrappers:function(){return c}});var p=new r,d={isBatchingUpdates:!1,batchedUpdates:function(e,t,n,r,o,i){var a=d.isBatchingUpdates;return d.isBatchingUpdates=!0,a?e(t,n,r,o,i):p.perform(e,null,t,n,r,o,i)}};t.exports=d},{129:129,143:143,71:71,89:89}],47:[function(e,t,n){"use strict";function r(){x||(x=!0,y.EventEmitter.injectReactEventListener(g),y.EventPluginHub.injectEventPluginOrder(s),y.EventPluginUtils.injectComponentTree(d),y.EventPluginUtils.injectTreeTraversal(h),y.EventPluginHub.injectEventPluginsByName({SimpleEventPlugin:E,EnterLeaveEventPlugin:u,ChangeEventPlugin:a,SelectEventPlugin:b,BeforeInputEventPlugin:i}),y.HostComponent.injectGenericComponentClass(p),y.HostComponent.injectTextComponentClass(m),y.DOMProperty.injectDOMPropertyConfig(o),y.DOMProperty.injectDOMPropertyConfig(l),y.DOMProperty.injectDOMPropertyConfig(C),y.EmptyComponent.injectEmptyComponentFactory(function(e){return new f(e)}),y.Updates.injectReconcileTransaction(_),y.Updates.injectBatchingStrategy(v),y.Component.injectEnvironment(c))}var 
o=e(1),i=e(3),a=e(7),s=e(14),u=e(15),l=e(21),c=e(27),p=e(31),d=e(33),f=e(35),h=e(44),m=e(42),v=e(46),g=e(52),y=e(55),_=e(65),C=e(73),b=e(74),E=e(75),x=!1;t.exports={inject:r}},{1:1,14:14,15:15,21:21,27:27,3:3,31:31,33:33,35:35,42:42,44:44,46:46,52:52,55:55,65:65,7:7,73:73,74:74,75:75}],48:[function(e,t,n){"use strict";var r="function"==typeof Symbol&&Symbol.for&&Symbol.for("react.element")||60103;t.exports=r},{}],49:[function(e,t,n){"use strict";var r,o={injectEmptyComponentFactory:function(e){r=e}},i={create:function(e){return r(e)}};i.injection=o,t.exports=i},{}],50:[function(e,t,n){"use strict";function r(e,t,n){try{t(n)}catch(e){null===o&&(o=e)}}var o=null,i={invokeGuardedCallback:r,invokeGuardedCallbackWithCatch:r,rethrowCaughtError:function(){if(o){var e=o;throw o=null,e}}};t.exports=i},{}],51:[function(e,t,n){"use strict";function r(e){o.enqueueEvents(e),o.processEventQueue(!1)}var o=e(16),i={handleTopLevel:function(e,t,n,i){r(o.extractEvents(e,t,n,i))}};t.exports=i},{16:16}],52:[function(e,t,n){"use strict";function r(e){for(;e._hostParent;)e=e._hostParent;var t=p.getNodeFromInstance(e),n=t.parentNode;return p.getClosestInstanceFromNode(n)}function o(e,t){this.topLevelType=e,this.nativeEvent=t,this.ancestors=[]}function i(e){var t=f(e.nativeEvent),n=p.getClosestInstanceFromNode(t),o=n;do{e.ancestors.push(o),o=o&&r(o)}while(o);for(var i=0;i/," "+i.CHECKSUM_ATTR_NAME+'="'+t+'"$&')},canReuseMarkup:function(e,t){var n=t.getAttribute(i.CHECKSUM_ATTR_NAME);return n=n&&parseInt(n,10),r(e)===n}};t.exports=i},{92:92}],60:[function(e,t,n){"use strict";function r(e,t){for(var n=Math.min(e.length,t.length),r=0;r.":"function"==typeof t?" Instead of passing a class like Foo, pass React.createElement(Foo) or .":null!=t&&void 0!==t.props?" 
This may be caused by unintentionally loading two independent copies of React.":"");var a,s=v.createElement(F,{child:t});if(e){var u=E.get(e);a=u._processChildContext(u._context)}else a=P;var c=d(n);if(c){var p=c._currentElement,h=p.props.child;if(M(h,t)){var m=c._renderedComponent.getPublicInstance(),g=r&&function(){r.call(m)};return j._updateRootComponent(c,s,a,n,g),m}j.unmountComponentAtNode(n)}var y=o(n),_=y&&!!i(y),C=l(n),b=_&&!c&&!C,x=j._renderNewRootComponent(s,n,b,a)._renderedComponent.getPublicInstance();return r&&r.call(x),x},render:function(e,t,n){return j._renderSubtreeIntoContainer(null,e,t,n)},unmountComponentAtNode:function(e){c(e)||f("40");var t=d(e);return t?(delete L[t._instance.rootID],k.batchedUpdates(u,t,e,!1),!0):(l(e),1===e.nodeType&&e.hasAttribute(O),!1)},_mountImageIntoNode:function(e,t,n,i,a){if(c(t)||f("41"),i){var s=o(t);if(x.canReuseMarkup(e,s))return void y.precacheNode(n,s);var u=s.getAttribute(x.CHECKSUM_ATTR_NAME);s.removeAttribute(x.CHECKSUM_ATTR_NAME);var l=s.outerHTML;s.setAttribute(x.CHECKSUM_ATTR_NAME,u);var p=e,d=r(p,l),m=" (client) "+p.substring(d-20,d+20)+"\n (server) "+l.substring(d-20,d+20);t.nodeType===A&&f("42",m)}if(t.nodeType===A&&f("43"),a.useCreateElement){for(;t.lastChild;)t.removeChild(t.lastChild);h.insertTreeBefore(t,e,null)}else N(t,e),y.precacheNode(n,t.firstChild)}};t.exports=j},{108:108,11:11,112:112,114:114,116:116,119:119,120:120,130:130,137:137,142:142,25:25,33:33,34:34,36:36,53:53,57:57,58:58,59:59,66:66,70:70,71:71,9:9}],61:[function(e,t,n){"use strict";function r(e,t,n){return{type:"INSERT_MARKUP",content:e,fromIndex:null,fromNode:null,toIndex:n,afterNode:t}}function o(e,t,n){return{type:"MOVE_EXISTING",content:null,fromIndex:e._mountIndex,fromNode:d.getHostNode(e),toIndex:n,afterNode:t}}function i(e,t){return{type:"REMOVE_NODE",content:null,fromIndex:e._mountIndex,fromNode:t,toIndex:null,afterNode:null}}function 
a(e){return{type:"SET_MARKUP",content:e,fromIndex:null,fromNode:null,toIndex:null,afterNode:null}}function s(e){return{type:"TEXT_CONTENT",content:e,fromIndex:null,fromNode:null,toIndex:null,afterNode:null}}function u(e,t){return t&&(e=e||[],e.push(t)),e}function l(e,t){p.processChildrenUpdates(e,t)}var c=e(112),p=e(28),d=(e(57),e(58),e(119),e(66)),f=e(26),h=(e(129),e(97)),m=(e(137),{Mixin:{_reconcilerInstantiateChildren:function(e,t,n){return f.instantiateChildren(e,t,n)},_reconcilerUpdateChildren:function(e,t,n,r,o,i){var a;return a=h(t,0),f.updateChildren(e,a,n,r,o,this,this._hostContainerInfo,i,0),a},mountChildren:function(e,t,n){var r=this._reconcilerInstantiateChildren(e,t,n);this._renderedChildren=r;var o=[],i=0;for(var a in r)if(r.hasOwnProperty(a)){var s=r[a],u=d.mountComponent(s,t,this,this._hostContainerInfo,n,0);s._mountIndex=i++,o.push(u)}return o},updateTextContent:function(e){var t=this._renderedChildren;f.unmountChildren(t,!1);for(var n in t)t.hasOwnProperty(n)&&c("118");l(this,[s(e)])},updateMarkup:function(e){var t=this._renderedChildren;f.unmountChildren(t,!1);for(var n in t)t.hasOwnProperty(n)&&c("118");l(this,[a(e)])},updateChildren:function(e,t,n){this._updateChildren(e,t,n)},_updateChildren:function(e,t,n){var r=this._renderedChildren,o={},i=[],a=this._reconcilerUpdateChildren(r,e,i,o,t,n);if(a||r){var s,c=null,p=0,f=0,h=0,m=null;for(s in a)if(a.hasOwnProperty(s)){var v=r&&r[s],g=a[s];v===g?(c=u(c,this.moveChild(v,m,p,f)),f=Math.max(v._mountIndex,f),v._mountIndex=p):(v&&(f=Math.max(v._mountIndex,f)),c=u(c,this._mountChildAtIndex(g,i[h],m,p,t,n)),h++),p++,m=d.getHostNode(g)}for(s in o)o.hasOwnProperty(s)&&(c=u(c,this._unmountChild(r[s],o[s])));c&&l(this,c),this._renderedChildren=a}},unmountChildren:function(e){var t=this._renderedChildren;f.unmountChildren(t,e),this._renderedChildren=null},moveChild:function(e,t,n,r){if(e._mountIndex0&&r.length<20?n+" (keys: "+r.join(", ")+")":n}function i(e,t){var n=s.get(e);return n||null}var 
a=e(112),s=(e(119),e(57)),u=(e(58),e(71)),l=(e(137),e(142),{isMounted:function(e){var t=s.get(e);return!!t&&!!t._renderedComponent},enqueueCallback:function(e,t,n){l.validateCallback(t,n);var o=i(e);if(!o)return null;o._pendingCallbacks?o._pendingCallbacks.push(t):o._pendingCallbacks=[t],r(o)},enqueueCallbackInternal:function(e,t){e._pendingCallbacks?e._pendingCallbacks.push(t):e._pendingCallbacks=[t],r(e)},enqueueForceUpdate:function(e){var t=i(e,"forceUpdate");t&&(t._pendingForceUpdate=!0,r(t))},enqueueReplaceState:function(e,t,n){var o=i(e,"replaceState");o&&(o._pendingStateQueue=[t],o._pendingReplaceState=!0,void 0!==n&&null!==n&&(l.validateCallback(n,"replaceState"),o._pendingCallbacks?o._pendingCallbacks.push(n):o._pendingCallbacks=[n]),r(o))},enqueueSetState:function(e,t){var n=i(e,"setState");n&&((n._pendingStateQueue||(n._pendingStateQueue=[])).push(t),r(n))},enqueueElementInternal:function(e,t,n){e._pendingElement=t,e._context=n,r(e)},validateCallback:function(e,t){e&&"function"!=typeof e&&a("122",t,o(e))}});t.exports=l},{112:112,119:119,137:137,142:142,57:57,58:58,71:71}],71:[function(e,t,n){"use strict";function r(){P.ReactReconcileTransaction&&b||c("123")}function o(){this.reinitializeTransaction(),this.dirtyComponentsLength=null,this.callbackQueue=d.getPooled(),this.reconcileTransaction=P.ReactReconcileTransaction.getPooled(!0)}function i(e,t,n,o,i,a){return r(),b.batchedUpdates(e,t,n,o,i,a)}function a(e,t){return e._mountOrder-t._mountOrder}function s(e){var t=e.dirtyComponentsLength;t!==g.length&&c("124",t,g.length),g.sort(a),y++;for(var n=0;n]/;t.exports=o},{}],96:[function(e,t,n){"use strict";function r(e){if(null==e)return null;if(1===e.nodeType)return e;var t=a.get(e);if(t)return t=s(t),t?i.getNodeFromInstance(t):null;"function"==typeof e.render?o("44"):o("45",Object.keys(e))}var o=e(112),i=(e(119),e(33)),a=e(57),s=e(103);e(137),e(142);t.exports=r},{103:103,112:112,119:119,137:137,142:142,33:33,57:57}],97:[function(e,t,n){(function(n){"use 
strict";function r(e,t,n,r){if(e&&"object"==typeof e){var o=e;void 0===o[n]&&null!=t&&(o[n]=t)}}function o(e,t){if(null==e)return e;var n={};return i(e,r,n),n}var i=(e(22),e(117));e(142);void 0!==n&&n.env,t.exports=o}).call(this,void 0)},{117:117,142:142,22:22}],98:[function(e,t,n){"use strict";function r(e,t,n){Array.isArray(e)?e.forEach(t,n):e&&t.call(n,e)}t.exports=r},{}],99:[function(e,t,n){"use strict";function r(e){var t,n=e.keyCode;return"charCode"in e?0===(t=e.charCode)&&13===n&&(t=13):t=n,t>=32||13===t?t:0}t.exports=r},{}],100:[function(e,t,n){"use strict";function r(e){if(e.key){var t=i[e.key]||e.key;if("Unidentified"!==t)return t}if("keypress"===e.type){var n=o(e);return 13===n?"Enter":String.fromCharCode(n)}return"keydown"===e.type||"keyup"===e.type?a[e.keyCode]||"Unidentified":""}var o=e(99),i={Esc:"Escape",Spacebar:" ",Left:"ArrowLeft",Up:"ArrowUp",Right:"ArrowRight",Down:"ArrowDown",Del:"Delete",Win:"OS",Menu:"ContextMenu",Apps:"ContextMenu",Scroll:"ScrollLock",MozPrintableKey:"Unidentified"},a={8:"Backspace",9:"Tab",12:"Clear",13:"Enter",16:"Shift",17:"Control",18:"Alt",19:"Pause",20:"CapsLock",27:"Escape",32:" ",33:"PageUp",34:"PageDown",35:"End",36:"Home",37:"ArrowLeft",38:"ArrowUp",39:"ArrowRight",40:"ArrowDown",45:"Insert",46:"Delete",112:"F1",113:"F2",114:"F3",115:"F4",116:"F5",117:"F6",118:"F7",119:"F8",120:"F9",121:"F10",122:"F11",123:"F12",144:"NumLock",145:"ScrollLock",224:"Meta"};t.exports=r},{99:99}],101:[function(e,t,n){"use strict";function r(e){var t=this,n=t.nativeEvent;if(n.getModifierState)return n.getModifierState(e);var r=i[e];return!!r&&!!n[r]}function o(e){return r}var i={Alt:"altKey",Control:"ctrlKey",Meta:"metaKey",Shift:"shiftKey"};t.exports=o},{}],102:[function(e,t,n){"use strict";function r(e){var t=e.target||e.srcElement||window;return t.correspondingUseElement&&(t=t.correspondingUseElement),3===t.nodeType?t.parentNode:t}t.exports=r},{}],103:[function(e,t,n){"use strict";function r(e){for(var 
t;(t=e._renderedNodeType)===o.COMPOSITE;)e=e._renderedComponent;return t===o.HOST?e._renderedComponent:t===o.EMPTY?null:void 0}var o=e(62);t.exports=r},{62:62}],104:[function(e,t,n){"use strict";function r(e){var t=e&&(o&&e[o]||e[i]);if("function"==typeof t)return t}var o="function"==typeof Symbol&&Symbol.iterator,i="@@iterator";t.exports=r},{}],105:[function(e,t,n){"use strict";function r(e){for(;e&&e.firstChild;)e=e.firstChild;return e}function o(e){for(;e;){if(e.nextSibling)return e.nextSibling;e=e.parentNode}}function i(e,t){for(var n=r(e),i=0,a=0;n;){if(3===n.nodeType){if(a=i+n.textContent.length,i<=t&&a>=t)return{node:n,offset:t-i};i=a}n=r(o(n))}}t.exports=i},{}],106:[function(e,t,n){"use strict";function r(){return!i&&o.canUseDOM&&(i="textContent"in document.documentElement?"textContent":"innerText"),i}var o=e(123),i=null;t.exports=r},{123:123}],107:[function(e,t,n){"use strict";function r(e,t){var n={};return n[e.toLowerCase()]=t.toLowerCase(),n["Webkit"+e]="webkit"+t,n["Moz"+e]="moz"+t,n["ms"+e]="MS"+t,n["O"+e]="o"+t.toLowerCase(),n}function o(e){if(s[e])return s[e];if(!a[e])return e;var t=a[e];for(var n in t)if(t.hasOwnProperty(n)&&n in u)return s[e]=t[n];return""}var i=e(123),a={animationend:r("Animation","AnimationEnd"),animationiteration:r("Animation","AnimationIteration"),animationstart:r("Animation","AnimationStart"),transitionend:r("Transition","TransitionEnd")},s={},u={};i.canUseDOM&&(u=document.createElement("div").style,"AnimationEvent"in window||(delete a.animationend.animation,delete a.animationiteration.animation,delete a.animationstart.animation),"TransitionEvent"in window||delete a.transitionend.transition),t.exports=o},{123:123}],108:[function(e,t,n){"use strict";function r(e){if(e){var t=e.getName();if(t)return" Check the render method of `"+t+"`."}return""}function o(e){return"function"==typeof e&&void 0!==e.prototype&&"function"==typeof e.prototype.mountComponent&&"function"==typeof e.prototype.receiveComponent}function i(e,t){var 
n;if(null===e||!1===e)n=l.create(i);else if("object"==typeof e){var s=e,u=s.type;if("function"!=typeof u&&"string"!=typeof u){var d="";d+=r(s._owner),a("130",null==u?u:typeof u,d)}"string"==typeof s.type?n=c.createInternalComponent(s):o(s.type)?(n=new s.type(s),n.getHostNode||(n.getHostNode=n.getNativeNode)):n=new p(s)}else"string"==typeof e||"number"==typeof e?n=c.createInstanceForText(e):a("131",typeof e);return n._mountIndex=0,n._mountImage=null,n}var a=e(112),s=e(143),u=e(29),l=e(49),c=e(54),p=(e(121),e(137),e(142),function(e){this.construct(e)});s(p.prototype,u,{_instantiateReactComponent:i}),t.exports=i},{112:112,121:121,137:137,142:142,143:143,29:29,49:49,54:54}],109:[function(e,t,n){"use strict";function r(e,t){if(!i.canUseDOM||t&&!("addEventListener"in document))return!1;var n="on"+e,r=n in document;if(!r){var a=document.createElement("div");a.setAttribute(n,"return;"),r="function"==typeof a[n]}return!r&&o&&"wheel"===e&&(r=document.implementation.hasFeature("Events.wheel","3.0")),r}var o,i=e(123);i.canUseDOM&&(o=document.implementation&&document.implementation.hasFeature&&!0!==document.implementation.hasFeature("","")),t.exports=r},{123:123}],110:[function(e,t,n){"use strict";function r(e){var t=e&&e.nodeName&&e.nodeName.toLowerCase();return"input"===t?!!o[e.type]:"textarea"===t}var o={color:!0,date:!0,datetime:!0,"datetime-local":!0,email:!0,month:!0,number:!0,password:!0,range:!0,search:!0,tel:!0,text:!0,time:!0,url:!0,week:!0};t.exports=r},{}],111:[function(e,t,n){"use strict";function r(e){return'"'+o(e)+'"'}var o=e(95);t.exports=r},{95:95}],112:[function(e,t,n){"use strict";function r(e){for(var t=arguments.length-1,n="Minified React error #"+e+"; visit http://facebook.github.io/react/docs/error-decoder.html?invariant="+e,r=0;r]/,u=e(93),l=u(function(e,t){if(e.namespaceURI!==i.svg||"innerHTML"in e)e.innerHTML=t;else{r=r||document.createElement("div"),r.innerHTML=""+t+"";for(var 
n=r.firstChild;n.firstChild;)e.appendChild(n.firstChild)}});if(o.canUseDOM){var c=document.createElement("div");c.innerHTML=" ",""===c.innerHTML&&(l=function(e,t){if(e.parentNode&&e.parentNode.replaceChild(e,e),a.test(t)||"<"===t[0]&&s.test(t)){e.innerHTML=String.fromCharCode(65279)+t;var n=e.firstChild;1===n.data.length?e.removeChild(n):n.deleteData(0,1)}else e.innerHTML=t}),c=null}t.exports=l},{10:10,123:123,93:93}],115:[function(e,t,n){"use strict";var r=e(123),o=e(95),i=e(114),a=function(e,t){if(t){var n=e.firstChild;if(n&&n===e.lastChild&&3===n.nodeType)return void(n.nodeValue=t)}e.textContent=t};r.canUseDOM&&("textContent"in document.documentElement||(a=function(e,t){if(3===e.nodeType)return void(e.nodeValue=t);i(e,o(t))})),t.exports=a},{114:114,123:123,95:95}],116:[function(e,t,n){"use strict";function r(e,t){var n=null===e||!1===e,r=null===t||!1===t;if(n||r)return n===r;var o=typeof e,i=typeof t;return"string"===o||"number"===o?"string"===i||"number"===i:"object"===i&&e.type===t.type&&e.key===t.key}t.exports=r},{}],117:[function(e,t,n){"use strict";function r(e,t){return e&&"object"==typeof e&&null!=e.key?l.escape(e.key):t.toString(36)}function o(e,t,n,i){var d=typeof e;if("undefined"!==d&&"boolean"!==d||(e=null),null===e||"string"===d||"number"===d||"object"===d&&e.$$typeof===s)return n(i,e,""===t?c+r(e,0):t),1;var f,h,m=0,v=""===t?c:t+p;if(Array.isArray(e))for(var g=0;g":"<"+e+">",s[e]=!a.firstChild),s[e]?d[e]:null}var o=e(123),i=e(137),a=o.canUseDOM?document.createElement("div"):null,s={},u=[1,'"],l=[1,"","
"],c=[3,"","
"],p=[1,'',""],d={"*":[1,"?
","
"],area:[1,"",""],col:[2,"","
"],legend:[1,"
","
"],param:[1,"",""],tr:[2,"","
"],optgroup:u,option:u,caption:l,colgroup:l,tbody:l,tfoot:l,thead:l,td:c,th:c};["circle","clipPath","defs","ellipse","g","image","line","linearGradient","mask","path","pattern","polygon","polyline","radialGradient","rect","stop","text","tspan"].forEach(function(e){d[e]=p,s[e]=!0}),t.exports=r},{123:123,137:137}],134:[function(e,t,n){"use strict";function r(e){return e.Window&&e instanceof e.Window?{x:e.pageXOffset||e.document.documentElement.scrollLeft,y:e.pageYOffset||e.document.documentElement.scrollTop}:{x:e.scrollLeft,y:e.scrollTop}}t.exports=r},{}],135:[function(e,t,n){"use strict";function r(e){return e.replace(o,"-$1").toLowerCase()}var o=/([A-Z])/g;t.exports=r},{}],136:[function(e,t,n){"use strict";function r(e){return o(e).replace(i,"-ms-")}var o=e(135),i=/^ms-/;t.exports=r},{135:135}],137:[function(e,t,n){"use strict";function r(e,t,n,r,i,a,s,u){if(o(t),!e){var l;if(void 0===t)l=new Error("Minified exception occurred; use the non-minified dev environment for the full error message and additional helpful warnings.");else{var c=[n,r,i,a,s,u],p=0;l=new Error(t.replace(/%s/g,function(){return c[p++]})),l.name="Invariant Violation"}throw l.framesToPop=1,l}}var o=function(e){};t.exports=r},{}],138:[function(e,t,n){"use strict";function r(e){var t=e?e.ownerDocument||e:document,n=t.defaultView||window;return!(!e||!("function"==typeof n.Node?e instanceof n.Node:"object"==typeof e&&"number"==typeof e.nodeType&&"string"==typeof e.nodeName))}t.exports=r},{}],139:[function(e,t,n){"use strict";function r(e){return o(e)&&3==e.nodeType}var o=e(138);t.exports=r},{138:138}],140:[function(e,t,n){"use strict";function r(e){var t={};return function(n){return t.hasOwnProperty(n)||(t[n]=e.call(this,n)),t[n]}}t.exports=r},{}],141:[function(e,t,n){"use strict";function r(e,t){return e===t?0!==e||0!==t||1/e==1/t:e!==e&&t!==t}function o(e,t){if(r(e,t))return!0;if("object"!=typeof e||null===e||"object"!=typeof t||null===t)return!1;var 
n=Object.keys(e),o=Object.keys(t);if(n.length!==o.length)return!1;for(var a=0;a 0x10FFFF || // not a valid Unicode code point + floor(codePoint) != codePoint // not an integer + ) { + throw RangeError('Invalid code point: ' + codePoint); + } + if (codePoint <= 0xFFFF) { // BMP code point + codeUnits.push(codePoint); + } else { // Astral code point; split in surrogate halves + // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae + codePoint -= 0x10000; + highSurrogate = (codePoint >> 10) + 0xD800; + lowSurrogate = (codePoint % 0x400) + 0xDC00; + codeUnits.push(highSurrogate, lowSurrogate); + } + if (index + 1 == length || codeUnits.length > MAX_SIZE) { + result += stringFromCharCode.apply(null, codeUnits); + codeUnits.length = 0; + } + } + return result; + }; + if (defineProperty) { + defineProperty(String, 'fromCodePoint', { + 'value': fromCodePoint, + 'configurable': true, + 'writable': true + }); + } else { + String.fromCodePoint = fromCodePoint; + } + }()); +} + +/*! http://mths.be/codepointat v0.1.0 by @mathias */ +if (!String.prototype.codePointAt) { + (function() { + 'use strict'; // needed to support `apply`/`call` with `undefined`/`null` + var codePointAt = function(position) { + if (this == null) { + throw TypeError(); + } + var string = String(this); + var size = string.length; + // `ToInteger` + var index = position ? 
Number(position) : 0; + if (index != index) { // better `isNaN` + index = 0; + } + // Account for out-of-bounds indices: + if (index < 0 || index >= size) { + return undefined; + } + // Get the first code unit + var first = string.charCodeAt(index); + var second; + if ( // check if it’s the start of a surrogate pair + first >= 0xD800 && first <= 0xDBFF && // high surrogate + size > index + 1 // there is a next code unit + ) { + second = string.charCodeAt(index + 1); + if (second >= 0xDC00 && second <= 0xDFFF) { // low surrogate + // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae + return (first - 0xD800) * 0x400 + second - 0xDC00 + 0x10000; + } + } + return first; + }; + if (Object.defineProperty) { + Object.defineProperty(String.prototype, 'codePointAt', { + 'value': codePointAt, + 'configurable': true, + 'writable': true + }); + } else { + String.prototype.codePointAt = codePointAt; + } + }()); +} + +function registerAsciinemaPlayerElement() { + var AsciinemaPlayerProto = Object.create(HTMLElement.prototype); + + function merge() { + var merged = {}; + for (var i=0; i>>0),ma=0;function na(a,b,c){return a.call.apply(a.bind,arguments)} +function oa(a,b,c){if(!a)throw Error();if(2b?1:0};var ua=Array.prototype.indexOf?function(a,b,c){return Array.prototype.indexOf.call(a,b,c)}:function(a,b,c){c=null==c?0:0>c?Math.max(0,a.length+c):c;if(ca(a))return ca(b)&&1==b.length?a.indexOf(b,c):-1;for(;cb?null:ca(a)?a.charAt(b):a[b]}function ya(a,b){var c=ua(a,b),d;(d=0<=c)&&Array.prototype.splice.call(a,c,1);return d}function za(a,b){a.sort(b||Aa)}function Ca(a,b){for(var c=Array(a.length),d=0;db?1:a2*this.Fc&&Na(this),!0):!1};function Na(a){if(a.Fc!=a.ib.length){for(var b=0,c=0;ba){var b=Ra[a];if(b)return b}b=new Qa([a|0],0>a?-1:0);-128<=a&&128>a&&(Ra[a]=b);return b}function Ta(a){if(isNaN(a)||!isFinite(a))return Ua;if(0>a)return Ta(-a).kb();for(var b=[],c=1,d=0;a>=c;d++)b[d]=a/c|0,c*=Va;return new Qa(b,0)}var 
Va=4294967296,Ua=Sa(0),Wa=Sa(1),Xa=Sa(16777216);g=Qa.prototype; +g.Of=function(){return 0a||36>>0).toString(a);c=e;if(c.hc())return f+d;for(;6>f.length;)f="0"+f;d=""+f+d}};function Ya(a,b){return 0>b?0:bthis.compare(Xa)};g.Ve=function(a){return 0>=this.compare(a)};g.compare=function(a){a=this.ze(a);return a.Eb()?-1:a.hc()?0:1};g.kb=function(){return this.Hf().add(Wa)}; +g.add=function(a){for(var b=Math.max(this.Ma.length,a.Ma.length),c=[],d=0,e=0;e<=b;e++){var f=d+(Ya(this,e)&65535)+(Ya(a,e)&65535),h=(f>>>16)+(Ya(this,e)>>>16)+(Ya(a,e)>>>16);d=h>>>16;f&=65535;h&=65535;c[e]=h<<16|f}return new Qa(c,c[c.length-1]&-2147483648?-1:0)};g.ze=function(a){return this.add(a.kb())}; +g.multiply=function(a){if(this.hc()||a.hc())return Ua;if(this.Eb())return a.Eb()?this.kb().multiply(a.kb()):this.kb().multiply(a).kb();if(a.Eb())return this.multiply(a.kb()).kb();if(this.Ue()&&a.Ue())return Ta(this.vd()*a.vd());for(var b=this.Ma.length+a.Ma.length,c=[],d=0;d<2*b;d++)c[d]=0;for(d=0;d>>16,h=Ya(this,d)&65535,k=Ya(a,e)>>>16,l=Ya(a,e)&65535;c[2*d+2*e]+=h*l;ab(c,2*d+2*e);c[2*d+2*e+1]+=f*l;ab(c,2*d+2*e+1);c[2*d+2*e+1]+= +h*k;ab(c,2*d+2*e+1);c[2*d+2*e+2]+=f*k;ab(c,2*d+2*e+2)}for(d=0;d>>16,a[b]&=65535,b++} +function Za(a,b){if(b.hc())throw Error("division by zero");if(a.hc())return Ua;if(a.Eb())return b.Eb()?Za(a.kb(),b.kb()):Za(a.kb(),b).kb();if(b.Eb())return Za(a,b.kb()).kb();if(30=f?1:Math.pow(2,f-48);h=Ta(e);for(var k=h.multiply(b);k.Eb()||k.xf(d);)e-=f,h=Ta(e),k=h.multiply(b);h.hc()&&(h=Wa);c=c.add(h);d=d.ze(k)}return c}g.Hf=function(){for(var a=this.Ma.length,b=[],c=0;c>5;a%=32;for(var c=this.Ma.length+b+(0>>32-a:Ya(this,e-b);return new Qa(d,this.Lc)}; +g.ad=function(a){var b=a>>5;a%=32;for(var c=this.Ma.length-b,d=[],e=0;e>>a|Ya(this,e+b+1)<<32-a:Ya(this,e+b);return new Qa(d,this.Lc)};function cb(a,b){null!=a&&this.append.apply(this,arguments)}g=cb.prototype;g.xc="";g.set=function(a){this.xc=""+a};g.append=function(a,b,c){this.xc+=String(a);if(null!=b)for(var 
d=1;d>>16&65535)*d+c*(b>>>16&65535)<<16>>>0)|0};function hd(a){a=gd(a|0,-862048943);return gd(a<<15|a>>>-15,461845907)} +function id(a,b){var c=(a|0)^(b|0);return gd(c<<13|c>>>-13,5)+-430675100|0}function jd(a,b){var c=(a|0)^b;c=gd(c^c>>>16,-2048144789);c=gd(c^c>>>13,-1028477387);return c^c>>>16}function kd(a){a:{var b=1;for(var c=0;;)if(b>2)}function qd(a){return a instanceof rd} +function sd(a,b){if(a.Zb===b.Zb)return 0;var c=wb(a.fb);if(t(c?b.fb:c))return-1;if(t(a.fb)){if(wb(b.fb))return 1;c=Aa(a.fb,b.fb);return 0===c?Aa(a.name,b.name):c}return Aa(a.name,b.name)}function rd(a,b,c,d,e){this.fb=a;this.name=b;this.Zb=c;this.Oc=d;this.hb=e;this.m=2154168321;this.J=4096}g=rd.prototype;g.toString=function(){return this.Zb};g.equiv=function(a){return this.K(null,a)};g.K=function(a,b){return b instanceof rd?this.Zb===b.Zb:!1}; +g.call=function(){var a=null;a=function(a,c,d){switch(arguments.length){case 2:return D.c(c,this);case 3:return D.l(c,this,d)}throw Error("Invalid arity: "+(arguments.length-1));};a.c=function(a,c){return D.c(c,this)};a.l=function(a,c,d){return D.l(c,this,d)};return a}();g.apply=function(a,b){return this.call.apply(this,[this].concat(Gb(b)))};g.h=function(a){return D.c(a,this)};g.c=function(a,b){return D.l(a,this,b)};g.P=function(){return this.hb}; +g.T=function(a,b){return new rd(this.fb,this.name,this.Zb,this.Oc,b)};g.U=function(){var a=this.Oc;return null!=a?a:this.Oc=a=pd(kd(this.name),nd(this.fb))};g.hd=function(){return this.name};g.jd=function(){return this.fb};g.R=function(a,b){return Jc(b,this.Zb)};var td=function td(a){switch(arguments.length){case 1:return td.h(arguments[0]);case 2:return td.c(arguments[0],arguments[1]);default:throw Error(["Invalid arity: ",v.h(arguments.length)].join(""));}}; +td.h=function(a){if(a instanceof rd)return a;var b=a.indexOf("/");return 1>b?td.c(null,a):td.c(a.substring(0,b),a.substring(b+1,a.length))};td.c=function(a,b){var c=null!=a?[v.h(a),"/",v.h(b)].join(""):b;return new 
rd(a,b,c,null,null)};td.L=2;function ud(a){return null!=a?a.J&131072||q===a.Tf?!0:a.J?!1:Ab(cd,a):Ab(cd,a)} +function E(a){if(null==a)return null;if(null!=a&&(a.m&8388608||q===a.Pe))return a.S(null);if(vb(a)||"string"===typeof a)return 0===a.length?null:new Jb(a,0,null);if(Ab(Bc,a))return Cc(a);throw Error([v.h(a)," is not ISeqable"].join(""));}function y(a){if(null==a)return null;if(null!=a&&(a.m&64||q===a.G))return a.Ia(null);a=E(a);return null==a?null:Wb(a)}function vd(a){return null!=a?null!=a&&(a.m&64||q===a.G)?a.bb(null):(a=E(a))?Yb(a):wd:wd} +function z(a){return null==a?null:null!=a&&(a.m&128||q===a.Id)?a.Ka(null):E(vd(a))}var G=function G(a){switch(arguments.length){case 1:return G.h(arguments[0]);case 2:return G.c(arguments[0],arguments[1]);default:for(var c=[],d=arguments.length,e=0;;)if(e=d)return-1;!(0c&&(c+=d,c=0>c?0:c);for(;;)if(cc?d+c:c;for(;;)if(0<=c){if(G.c(Vd(a,c),b))return c;--c}else return-1}function Yd(a,b){this.o=a;this.i=b} +Yd.prototype.ja=function(){return this.ia?0:a};g.Rc=function(){var a=this.W(null);return 0d)c=1;else if(0===c)c=0;else a:for(d=0;;){var e=Ke(Vd(a,d),Vd(b,d));if(0===e&&d+1>1&1431655765;a=(a&858993459)+(a>>2&858993459);return 16843009*(a+(a>>4)&252645135)>>24} +var v=function v(a){switch(arguments.length){case 0:return v.B();case 1:return v.h(arguments[0]);default:for(var c=[],d=arguments.length,e=0;;)if(ed:e))c[d]=a.next(),d+=1;else return qf(new nf(c,0,d),Rf.h?Rf.h(a):Rf.call(null,a))}else return null},null,null)};function Sf(a,b,c,d,e,f){this.buffer=a;this.ub=b;this.pe=c;this.Rb=d;this.ye=e;this.Gf=f} +Sf.prototype.step=function(){if(this.ub!==Nf)return!0;for(;;)if(this.ub===Nf)if(this.buffer.Td()){if(this.pe)return!1;if(this.ye.ja()){if(this.Gf)var a=P(this.Rb,ae(null,this.ye.next()));else a=this.ye.next(),a=this.Rb.c?this.Rb.c(null,a):this.Rb.call(null,null,a);Hd(a)&&(this.Rb.h?this.Rb.h(null):this.Rb.call(null,null),this.pe=!0)}else this.Rb.h?this.Rb.h(null):this.Rb.call(null,null),this.pe=!0}else 
this.ub=this.buffer.remove();else return!0};Sf.prototype.ja=function(){return this.step()}; +Sf.prototype.next=function(){if(this.ja()){var a=this.ub;this.ub=Nf;return a}throw Error("No such element");};Sf.prototype.remove=function(){return Error("Unsupported operation")};Sf.prototype[Fb]=function(){return yd(this)}; +function Tf(a,b){var c=new Sf(Qf,Nf,!1,null,b,!1);c.Rb=function(){var b=function(a){return function(){function b(b,c){a.buffer=a.buffer.add(c);return b}var c=null;c=function(a,c){switch(arguments.length){case 0:return null;case 1:return a;case 2:return b.call(this,a,c)}throw Error("Invalid arity: "+(arguments.length-1));};c.B=function(){return null};c.h=function(a){return a};c.c=b;return c}()}(c);return a.h?a.h(b):a.call(null,b)}();return c} +function Uf(a,b){var c=Kf(b);c=Tf(a,c);c=Rf(c);return t(c)?c:wd}function Vf(a,b){for(;;){if(null==E(b))return!0;var c=y(b);c=a.h?a.h(c):a.call(null,c);if(t(c)){c=a;var d=z(b);a=c;b=d}else return!1}}function Wf(a,b){for(;;)if(E(b)){var c=y(b);c=a.h?a.h(c):a.call(null,c);if(t(c))return c;c=a;var d=z(b);a=c;b=d}else return null}function Xf(a){if(Ge(a))return 0===(a&1);throw Error(["Argument must be an integer: ",v.h(a)].join(""));} +function Yf(a){return function(){function b(b,c){return wb(a.c?a.c(b,c):a.call(null,b,c))}function c(b){return wb(a.h?a.h(b):a.call(null,b))}function d(){return wb(a.B?a.B():a.call(null))}var e=null,f=function(){function b(a,b,d){var e=null;if(2a?0:a-1>>>5<<5}function Jg(a,b,c){for(;;){if(0===b)return c;var d=Gg(a);d.o[0]=c;c=d;b-=5}} +var Kg=function Kg(a,b,c,d){var f=Hg(c),h=a.F-1>>>b&31;5===b?f.o[h]=d:(c=c.o[h],null!=c?(b-=5,a=Kg.M?Kg.M(a,b,c,d):Kg.call(null,a,b,c,d)):a=Jg(null,b-5,d),f.o[h]=a);return f};function Lg(a,b){throw Error(["No item ",v.h(a)," in vector of length ",v.h(b)].join(""));}function Mg(a,b){if(b>=Ig(a))return a.fa;for(var c=a.root,d=a.shift;;)if(0>>d&31];d=e}else return c.o} +var Ng=function Ng(a,b,c,d,e){var h=Hg(c);if(0===b)h.o[d&31]=e;else{var 
k=d>>>b&31;b-=5;c=c.o[k];a=Ng.Z?Ng.Z(a,b,c,d,e):Ng.call(null,a,b,c,d,e);h.o[k]=a}return h},Og=function Og(a,b,c){var e=a.F-2>>>b&31;if(5=this.F)a=new Jb(this.fa,0,null);else{a:{a=this.root;for(var b=this.shift;;)if(0this.F-Ig(this)){for(var c=this.fa.length,d=Array(c+1),e=0;;)if(e>>5>1<b)return new R(null,b,5,T,a,null);for(var c=32,d=(new R(null,32,5,T,a.slice(0,32),null)).Pc(null);;)if(cb||this.end<=this.start+b?Lg(b,this.end-this.start):A.c(this.Ja,this.start+b)};g.ka=function(a,b,c){return 0>b||this.end<=this.start+b?c:A.l(this.Ja,this.start+b,c)}; +g.dc=function(a,b,c){a=this.start+b;if(0>b||this.end+1<=a)throw Error(["Index ",v.h(b)," out of bounds [0,",v.h(this.W(null)),"]"].join(""));b=this.meta;c=K.l(this.Ja,a,c);var d=this.end;a+=1;return Zg(b,c,this.start,d>a?d:a,null)};g.ba=function(){return null!=this.Ja&&q===this.Ja.fe?Qg(this.Ja,this.start,this.end):new Jf(Hf,this)};g.P=function(){return this.meta};g.W=function(){return this.end-this.start};g.Ac=function(){return A.c(this.Ja,this.end-1)}; +g.Bc=function(){if(this.start===this.end)throw Error("Can't pop empty vector");return Zg(this.meta,this.Ja,this.start,this.end-1,null)};g.Rc=function(){return this.start!==this.end?new Zd(this,this.end-this.start-1,null):null};g.U=function(){var a=this.w;return null!=a?a:this.w=a=Ad(this)};g.K=function(a,b){return $d(this,b)};g.oa=function(){return tc(he,this.meta)};g.Fa=function(a,b){return null!=this.Ja&&q===this.Ja.fe?Rg(this.Ja,b,this.start,this.end):Kd(this,b)}; +g.Ga=function(a,b,c){return null!=this.Ja&&q===this.Ja.fe?Sg(this.Ja,b,c,this.start,this.end):Ld(this,b,c)};g.O=function(a,b,c){if("number"===typeof b)return this.dc(null,b,c);throw Error("Subvec's key for assoc must be a number.");};g.S=function(){var a=this;return function(b){return function e(d){return d===a.end?null:ae(A.c(a.Ja,d),new kf(null,function(){return function(){return e(d+1)}}(b),null,null))}}(this)(a.start)};g.T=function(a,b){return Zg(b,this.Ja,this.start,this.end,this.w)}; 
+g.X=function(a,b){return Zg(this.meta,qc(this.Ja,this.end,b),this.start,this.end+1,null)};g.call=function(){var a=null;a=function(a,c,d){switch(arguments.length){case 2:return this.$(null,c);case 3:return this.ka(null,c,d)}throw Error("Invalid arity: "+(arguments.length-1));};a.c=function(a,c){return this.$(null,c)};a.l=function(a,c,d){return this.ka(null,c,d)};return a}();g.apply=function(a,b){return this.call.apply(this,[this].concat(Gb(b)))};g.h=function(a){return this.$(null,a)}; +g.c=function(a,b){return this.ka(null,a,b)};Yg.prototype[Fb]=function(){return yd(this)};function Zg(a,b,c,d,e){for(;;)if(b instanceof Yg)c=b.start+c,d=b.start+d,b=b.Ja;else{if(!ze(b))throw Error("v must satisfy IVector");var f=H(b);if(0>c||0>d||c>f||d>f)throw Error("Index out of bounds");return new Yg(a,b,c,d,e)}}function $g(a,b){return a===b.la?b:new Fg(a,Gb(b.o))} +var ah=function ah(a,b,c,d){c=$g(a.root.la,c);var f=a.F-1>>>b&31;if(5===b)a=d;else{var h=c.o[f];null!=h?(b-=5,a=ah.M?ah.M(a,b,h,d):ah.call(null,a,b,h,d)):a=Jg(a.root.la,b-5,d)}c.o[f]=a;return c};function Tg(a,b,c,d){this.F=a;this.shift=b;this.root=c;this.fa=d;this.J=88;this.m=275}g=Tg.prototype; +g.Dc=function(a,b){if(this.root.la){if(32>this.F-Ig(this))this.fa[this.F&31]=b;else{var c=new Fg(this.root.la,this.fa),d=[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null];d[0]=b;this.fa=d;if(this.F>>>5>1<>>d&31,m=k(d-5,f.o[p]);f.o[p]=m}return f}}(a)(a.shift,a.root)}();a.root=d}return a}if(b===a.F)return a.Dc(null,c);throw Error(["Index ",v.h(b)," out of bounds for TransientVector of length",v.h(a.F)].join(""));}throw Error("assoc! 
after persistent!");} +g.W=function(){if(this.root.la)return this.F;throw Error("count after persistent!");};g.$=function(a,b){if(this.root.la)return(0<=b&&b=c)return new r(this.meta,this.F-1,d,null);G.c(b,this.o[e])||(d[f]=this.o[e],d[f+1]=this.o[e+1],f+=2);e+=2}}else return this}; +g.O=function(a,b,c){a=ih(this.o,b);if(-1===a){if(this.Fb?4:2*(b+1));Be(this.o,0,c,0,2*b);return new xh(a,this.na,c)};g.qd=function(){return yh(this.o,0,null)};g.Jc=function(a,b){return vh(this.o,a,b)};g.sc=function(a,b,c,d){var e=1<<(b>>>a&31);if(0===(this.na&e))return d;var f=$e(this.na&e-1);e=this.o[2*f];f=this.o[2*f+1];return null==e?f.sc(a+5,b,c,d):rh(c,e)?f:d}; +g.Kb=function(a,b,c,d,e,f){var h=1<<(c>>>b&31),k=$e(this.na&h-1);if(0===(this.na&h)){var l=$e(this.na);if(2*l>>b&31]=zh.Kb(a,b+5,c,d,e,f);for(e=d=0;;)if(32>d)0!== +(this.na>>>d&1)&&(k[d]=null!=this.o[e]?zh.Kb(a,b+5,od(this.o[e]),this.o[e],this.o[e+1],f):this.o[e+1],e+=2),d+=1;else break;return new Ah(a,l+1,k)}b=Array(2*(l+4));Be(this.o,0,b,0,2*k);b[2*k]=d;b[2*k+1]=e;Be(this.o,2*k,b,2*(k+1),2*(l-k));f.H=!0;a=this.Gc(a);a.o=b;a.na|=h;return a}l=this.o[2*k];h=this.o[2*k+1];if(null==l)return l=h.Kb(a,b+5,c,d,e,f),l===h?this:uh(this,a,2*k+1,l);if(rh(d,l))return e===h?this:uh(this,a,2*k+1,e);f.H=!0;f=b+5;b=od(l);if(b===c)e=new Bh(null,b,2,[l,h,d,e]);else{var p=new qh; +e=zh.Kb(a,f,b,l,h,p).Kb(a,f,c,d,e,p)}d=2*k;k=2*k+1;a=this.Gc(a);a.o[d]=null;a.o[k]=e;return a}; +g.Jb=function(a,b,c,d,e){var f=1<<(b>>>a&31),h=$e(this.na&f-1);if(0===(this.na&f)){var k=$e(this.na);if(16<=k){h=[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null];h[b>>>a&31]=zh.Jb(a+5,b,c,d,e);for(d=c=0;;)if(32>c)0!==(this.na>>>c&1)&&(h[c]=null!=this.o[d]?zh.Jb(a+5,od(this.o[d]),this.o[d],this.o[d+1],e):this.o[d+1],d+=2),c+=1;else break;return new Ah(null,k+1,h)}a=Array(2*(k+1));Be(this.o, 
+0,a,0,2*h);a[2*h]=c;a[2*h+1]=d;Be(this.o,2*h,a,2*(h+1),2*(k-h));e.H=!0;return new xh(null,this.na|f,a)}var l=this.o[2*h];f=this.o[2*h+1];if(null==l)return k=f.Jb(a+5,b,c,d,e),k===f?this:new xh(null,this.na,sh(this.o,2*h+1,k));if(rh(c,l))return d===f?this:new xh(null,this.na,sh(this.o,2*h+1,d));e.H=!0;e=this.na;k=this.o;a+=5;var p=od(l);if(p===b)c=new Bh(null,p,2,[l,f,c,d]);else{var m=new qh;c=zh.Jb(a,p,l,f,m).Jb(a,b,c,d,m)}a=2*h;h=2*h+1;d=Gb(k);d[a]=null;d[h]=c;return new xh(null,e,d)}; +g.rd=function(a,b,c){var d=1<<(b>>>a&31);if(0===(this.na&d))return this;var e=$e(this.na&d-1),f=this.o[2*e],h=this.o[2*e+1];return null==f?(a=h.rd(a+5,b,c),a===h?this:null!=a?new xh(null,this.na,sh(this.o,2*e+1,a)):this.na===d?null:new xh(null,this.na^d,th(this.o,e))):rh(c,f)?new xh(null,this.na^d,th(this.o,e)):this};g.ba=function(){return new wh(this.o,0,null,null)};var zh=new xh(null,0,[]);function Ch(a,b,c){this.o=a;this.i=b;this.Lb=c} +Ch.prototype.ja=function(){for(var a=this.o.length;;){if(null!=this.Lb&&this.Lb.ja())return!0;if(this.i>>a&31];return null!=e?e.sc(a+5,b,c,d):d};g.Kb=function(a,b,c,d,e,f){var h=c>>>b&31,k=this.o[h];if(null==k)return a=uh(this,a,h,zh.Kb(a,b+5,c,d,e,f)),a.F+=1,a;b=k.Kb(a,b+5,c,d,e,f);return b===k?this:uh(this,a,h,b)}; +g.Jb=function(a,b,c,d,e){var f=b>>>a&31,h=this.o[f];if(null==h)return new Ah(null,this.F+1,sh(this.o,f,zh.Jb(a+5,b,c,d,e)));a=h.Jb(a+5,b,c,d,e);return a===h?this:new Ah(null,this.F,sh(this.o,f,a))}; +g.rd=function(a,b,c){var d=b>>>a&31,e=this.o[d];if(null!=e){a=e.rd(a+5,b,c);if(a===e)d=this;else if(null==a)if(8>=this.F)a:{e=this.o;a=e.length;b=Array(2*(this.F-1));c=0;for(var f=1,h=0;;)if(ca?d:rh(c,this.o[a])?this.o[a+1]:d}; +g.Kb=function(a,b,c,d,e,f){if(c===this.ec){b=Eh(this.o,this.F,d);if(-1===b){if(this.o.length>2*this.F)return b=2*this.F,c=2*this.F+1,a=this.Gc(a),a.o[b]=d,a.o[c]=e,f.H=!0,a.F+=1,a;c=this.o.length;b=Array(c+2);Be(this.o,0,b,0,c);b[c]=d;b[c+1]=e;f.H=!0;d=this.F+1;a===this.la?(this.o=b,this.F=d,a=this):a=new 
Bh(this.la,this.ec,d,b);return a}return this.o[b+1]===e?this:uh(this,a,b+1,e)}return(new xh(a,1<<(this.ec>>>b&31),[null,this,null,null])).Kb(a,b,c,d,e,f)}; +g.Jb=function(a,b,c,d,e){return b===this.ec?(a=Eh(this.o,this.F,c),-1===a?(a=2*this.F,b=Array(a+2),Be(this.o,0,b,0,a),b[a]=c,b[a+1]=d,e.H=!0,new Bh(null,this.ec,this.F+1,b)):G.c(this.o[a+1],d)?this:new Bh(null,this.ec,this.F,sh(this.o,a+1,d))):(new xh(null,1<<(this.ec>>>a&31),[null,this])).Jb(a,b,c,d,e)};g.rd=function(a,b,c){a=Eh(this.o,this.F,c);return-1===a?this:1===this.F?null:new Bh(null,this.ec,this.F-1,th(this.o,Ze(a)))};g.ba=function(){return new wh(this.o,0,null,null)}; +function Fh(a,b,c,d,e){this.meta=a;this.Mb=b;this.i=c;this.s=d;this.w=e;this.m=32374988;this.J=0}g=Fh.prototype;g.toString=function(){return fd(this)};g.equiv=function(a){return this.K(null,a)};g.indexOf=function(){var a=null;a=function(a,c){switch(arguments.length){case 1:return Ud(this,a,0);case 2:return Ud(this,a,c)}throw Error("Invalid arity: "+(arguments.length-1));};a.h=function(a){return Ud(this,a,0)};a.c=function(a,c){return Ud(this,a,c)};return a}(); +g.lastIndexOf=function(){function a(a){return Xd(this,a,H(this))}var b=null;b=function(b,d){switch(arguments.length){case 1:return a.call(this,b);case 2:return Xd(this,b,d)}throw Error("Invalid arity: "+(arguments.length-1));};b.h=a;b.c=function(a,b){return Xd(this,a,b)};return b}();g.P=function(){return this.meta};g.Ka=function(){return null==this.s?yh(this.Mb,this.i+2,null):yh(this.Mb,this.i,z(this.s))};g.U=function(){var a=this.w;return null!=a?a:this.w=a=Ad(this)}; +g.K=function(a,b){return $d(this,b)};g.oa=function(){return tc(wd,this.meta)};g.Fa=function(a,b){return ce(b,this)};g.Ga=function(a,b,c){return de(b,c,this)};g.Ia=function(){return null==this.s?new R(null,2,5,T,[this.Mb[this.i],this.Mb[this.i+1]],null):y(this.s)};g.bb=function(){var a=null==this.s?yh(this.Mb,this.i+2,null):yh(this.Mb,this.i,z(this.s));return null!=a?a:wd};g.S=function(){return 
this};g.T=function(a,b){return new Fh(b,this.Mb,this.i,this.s,this.w)};g.X=function(a,b){return ae(b,this)}; +Fh.prototype[Fb]=function(){return yd(this)};function yh(a,b,c){if(null==c)for(c=a.length;;)if(bthis.F?H(z(this))+1:this.F};g.U=function(){var a=this.w;return null!=a?a:this.w=a=Ad(this)};g.K=function(a,b){return $d(this,b)};g.oa=function(){return tc(wd,this.meta)};g.Fa=function(a,b){return ce(b,this)};g.Ga=function(a,b,c){return de(b,c,this)};g.Ia=function(){var a=this.stack;return null==a?null:nc(a)};g.bb=function(){var a=y(this.stack);a=Mh(this.vc?a.right:a.left,z(this.stack),this.vc);return null!=a?new Nh(null,a,this.vc,this.F-1,null):wd};g.S=function(){return this}; +g.T=function(a,b){return new Nh(b,this.stack,this.vc,this.F,this.w)};g.X=function(a,b){return ae(b,this)};Nh.prototype[Fb]=function(){return yd(this)};function Oh(a,b,c){return new Nh(null,Mh(a,null,b),b,c,null)} +function Ph(a,b,c,d){return c instanceof Qh?c.left instanceof Qh?new Qh(c.key,c.H,c.left.bc(),new Rh(a,b,c.right,d,null),null):c.right instanceof Qh?new Qh(c.right.key,c.right.H,new Rh(c.key,c.H,c.left,c.right.left,null),new Rh(a,b,c.right.right,d,null),null):new Rh(a,b,c,d,null):new Rh(a,b,c,d,null)} +function Sh(a,b,c,d){return d instanceof Qh?d.right instanceof Qh?new Qh(d.key,d.H,new Rh(a,b,c,d.left,null),d.right.bc(),null):d.left instanceof Qh?new Qh(d.left.key,d.left.H,new Rh(a,b,c,d.left.left,null),new Rh(d.key,d.H,d.left.right,d.right,null),null):new Rh(a,b,c,d,null):new Rh(a,b,c,d,null)} +function Th(a,b,c,d){if(c instanceof Qh)return new Qh(a,b,c.bc(),d,null);if(d instanceof Rh)return Sh(a,b,c,d.ud());if(d instanceof Qh&&d.left instanceof Rh)return new Qh(d.left.key,d.left.H,new Rh(a,b,c,d.left.left,null),Sh(d.key,d.H,d.left.right,d.right.ud()),null);throw Error("red-black tree invariant violation");} +function Uh(a,b,c,d){if(d instanceof Qh)return new Qh(a,b,c,d.bc(),null);if(c instanceof Rh)return Ph(a,b,c.ud(),d);if(c instanceof Qh&&c.right instanceof Rh)return new 
Qh(c.right.key,c.right.H,Ph(c.key,c.H,c.left.ud(),c.right.left),new Rh(a,b,c.right.right,d,null),null);throw Error("red-black tree invariant violation");} +var Vh=function Vh(a,b,c){var e=null!=a.left?function(){var e=a.left;return Vh.l?Vh.l(e,b,c):Vh.call(null,e,b,c)}():c;if(Hd(e))return e;var f=function(){var c=a.key,f=a.H;return b.l?b.l(e,c,f):b.call(null,e,c,f)}();if(Hd(f))return f;if(null!=a.right){var h=a.right;return Vh.l?Vh.l(h,b,f):Vh.call(null,h,b,f)}return f};function Rh(a,b,c,d,e){this.key=a;this.H=b;this.left=c;this.right=d;this.w=e;this.m=32402207;this.J=0}g=Rh.prototype; +g.lastIndexOf=function(){function a(a){return Xd(this,a,H(this))}var b=null;b=function(b,d){switch(arguments.length){case 1:return a.call(this,b);case 2:return Xd(this,b,d)}throw Error("Invalid arity: "+(arguments.length-1));};b.h=a;b.c=function(a,b){return Xd(this,a,b)};return b}(); +g.indexOf=function(){var a=null;a=function(a,c){switch(arguments.length){case 1:return Ud(this,a,0);case 2:return Ud(this,a,c)}throw Error("Invalid arity: "+(arguments.length-1));};a.h=function(a){return Ud(this,a,0)};a.c=function(a,c){return Ud(this,a,c)};return a}();g.Ee=function(a){return a.He(this)};g.ud=function(){return new Qh(this.key,this.H,this.left,this.right,null)};g.bc=function(){return this};g.De=function(a){return a.Ge(this)};g.replace=function(a,b,c,d){return new Rh(a,b,c,d,null)}; +g.Ge=function(a){return new Rh(a.key,a.H,this,a.right,null)};g.He=function(a){return new Rh(a.key,a.H,a.left,this,null)};g.Jc=function(a,b){return Vh(this,a,b)};g.V=function(a,b){return this.ka(null,b,null)};g.I=function(a,b,c){return this.ka(null,b,c)};g.$=function(a,b){if(0===b)return this.key;if(1===b)return this.H;throw Error("Index out of bounds");};g.ka=function(a,b,c){return 0===b?this.key:1===b?this.H:c};g.dc=function(a,b,c){return(new R(null,2,5,T,[this.key,this.H],null)).dc(null,b,c)}; +g.P=function(){return null};g.W=function(){return 2};g.fd=function(){return this.key};g.gd=function(){return 
this.H};g.Ac=function(){return this.H};g.Bc=function(){return new R(null,1,5,T,[this.key],null)};g.U=function(){var a=this.w;return null!=a?a:this.w=a=Ad(this)};g.K=function(a,b){return $d(this,b)};g.oa=function(){return he};g.Fa=function(a,b){return Kd(this,b)};g.Ga=function(a,b,c){return Ld(this,b,c)};g.O=function(a,b,c){return K.l(new R(null,2,5,T,[this.key,this.H],null),b,c)}; +g.yc=function(a,b){return 0===b||1===b};g.S=function(){var a=this.key;return Tb(Tb(wd,this.H),a)};g.T=function(a,b){return tc(new R(null,2,5,T,[this.key,this.H],null),b)};g.X=function(a,b){return new R(null,3,5,T,[this.key,this.H,b],null)}; +g.call=function(){var a=null;a=function(a,c,d){switch(arguments.length){case 2:return this.$(null,c);case 3:return this.ka(null,c,d)}throw Error("Invalid arity: "+(arguments.length-1));};a.c=function(a,c){return this.$(null,c)};a.l=function(a,c,d){return this.ka(null,c,d)};return a}();g.apply=function(a,b){return this.call.apply(this,[this].concat(Gb(b)))};g.h=function(a){return this.$(null,a)};g.c=function(a,b){return this.ka(null,a,b)};Rh.prototype[Fb]=function(){return yd(this)}; +function Qh(a,b,c,d,e){this.key=a;this.H=b;this.left=c;this.right=d;this.w=e;this.m=32402207;this.J=0}g=Qh.prototype;g.lastIndexOf=function(){function a(a){return Xd(this,a,H(this))}var b=null;b=function(b,d){switch(arguments.length){case 1:return a.call(this,b);case 2:return Xd(this,b,d)}throw Error("Invalid arity: "+(arguments.length-1));};b.h=a;b.c=function(a,b){return Xd(this,a,b)};return b}(); +g.indexOf=function(){var a=null;a=function(a,c){switch(arguments.length){case 1:return Ud(this,a,0);case 2:return Ud(this,a,c)}throw Error("Invalid arity: "+(arguments.length-1));};a.h=function(a){return Ud(this,a,0)};a.c=function(a,c){return Ud(this,a,c)};return a}();g.Ee=function(a){return new Qh(this.key,this.H,this.left,a,null)};g.ud=function(){throw Error("red-black tree invariant violation");};g.bc=function(){return new Rh(this.key,this.H,this.left,this.right,null)}; 
+g.De=function(a){return new Qh(this.key,this.H,a,this.right,null)};g.replace=function(a,b,c,d){return new Qh(a,b,c,d,null)};g.Ge=function(a){return this.left instanceof Qh?new Qh(this.key,this.H,this.left.bc(),new Rh(a.key,a.H,this.right,a.right,null),null):this.right instanceof Qh?new Qh(this.right.key,this.right.H,new Rh(this.key,this.H,this.left,this.right.left,null),new Rh(a.key,a.H,this.right.right,a.right,null),null):new Rh(a.key,a.H,this,a.right,null)}; +g.He=function(a){return this.right instanceof Qh?new Qh(this.key,this.H,new Rh(a.key,a.H,a.left,this.left,null),this.right.bc(),null):this.left instanceof Qh?new Qh(this.left.key,this.left.H,new Rh(a.key,a.H,a.left,this.left.left,null),new Rh(this.key,this.H,this.left.right,this.right,null),null):new Rh(a.key,a.H,a.left,this,null)};g.Jc=function(a,b){return Vh(this,a,b)};g.V=function(a,b){return this.ka(null,b,null)};g.I=function(a,b,c){return this.ka(null,b,c)}; +g.$=function(a,b){if(0===b)return this.key;if(1===b)return this.H;throw Error("Index out of bounds");};g.ka=function(a,b,c){return 0===b?this.key:1===b?this.H:c};g.dc=function(a,b,c){return(new R(null,2,5,T,[this.key,this.H],null)).dc(null,b,c)};g.P=function(){return null};g.W=function(){return 2};g.fd=function(){return this.key};g.gd=function(){return this.H};g.Ac=function(){return this.H};g.Bc=function(){return new R(null,1,5,T,[this.key],null)}; +g.U=function(){var a=this.w;return null!=a?a:this.w=a=Ad(this)};g.K=function(a,b){return $d(this,b)};g.oa=function(){return he};g.Fa=function(a,b){return Kd(this,b)};g.Ga=function(a,b,c){return Ld(this,b,c)};g.O=function(a,b,c){return K.l(new R(null,2,5,T,[this.key,this.H],null),b,c)};g.yc=function(a,b){return 0===b||1===b};g.S=function(){var a=this.key;return Tb(Tb(wd,this.H),a)};g.T=function(a,b){return tc(new R(null,2,5,T,[this.key,this.H],null),b)}; +g.X=function(a,b){return new R(null,3,5,T,[this.key,this.H,b],null)};g.call=function(){var a=null;a=function(a,c,d){switch(arguments.length){case 
2:return this.$(null,c);case 3:return this.ka(null,c,d)}throw Error("Invalid arity: "+(arguments.length-1));};a.c=function(a,c){return this.$(null,c)};a.l=function(a,c,d){return this.ka(null,c,d)};return a}();g.apply=function(a,b){return this.call.apply(this,[this].concat(Gb(b)))};g.h=function(a){return this.$(null,a)}; +g.c=function(a,b){return this.ka(null,a,b)};Qh.prototype[Fb]=function(){return yd(this)}; +var Wh=function Wh(a,b,c,d,e){if(null==b)return new Qh(c,d,null,null,null);var h=function(){var d=b.key;return a.c?a.c(c,d):a.call(null,c,d)}();if(0===h)return e[0]=b,null;if(0>h)return h=function(){var h=b.left;return Wh.Z?Wh.Z(a,h,c,d,e):Wh.call(null,a,h,c,d,e)}(),null!=h?b.De(h):null;h=function(){var h=b.right;return Wh.Z?Wh.Z(a,h,c,d,e):Wh.call(null,a,h,c,d,e)}();return null!=h?b.Ee(h):null},Xh=function Xh(a,b){if(null==a)return b;if(null==b)return a;if(a instanceof Qh){if(b instanceof Qh){var d= +function(){var d=a.right,f=b.left;return Xh.c?Xh.c(d,f):Xh.call(null,d,f)}();return d instanceof Qh?new Qh(d.key,d.H,new Qh(a.key,a.H,a.left,d.left,null),new Qh(b.key,b.H,d.right,b.right,null),null):new Qh(a.key,a.H,a.left,new Qh(b.key,b.H,d,b.right,null),null)}return new Qh(a.key,a.H,a.left,function(){var d=a.right;return Xh.c?Xh.c(d,b):Xh.call(null,d,b)}(),null)}if(b instanceof Qh)return new Qh(b.key,b.H,function(){var d=b.left;return Xh.c?Xh.c(a,d):Xh.call(null,a,d)}(),b.right,null);d=function(){var d= +a.right,f=b.left;return Xh.c?Xh.c(d,f):Xh.call(null,d,f)}();return d instanceof Qh?new Qh(d.key,d.H,new Rh(a.key,a.H,a.left,d.left,null),new Rh(b.key,b.H,d.right,b.right,null),null):Th(a.key,a.H,a.left,new Rh(b.key,b.H,d,b.right,null))},Yh=function Yh(a,b,c,d){if(null!=b){var f=function(){var d=b.key;return a.c?a.c(c,d):a.call(null,c,d)}();if(0===f)return d[0]=b,Xh(b.left,b.right);if(0>f)return f=function(){var f=b.left;return Yh.M?Yh.M(a,f,c,d):Yh.call(null,a,f,c,d)}(),null!=f||null!=d[0]?b.left instanceof +Rh?Th(b.key,b.H,f,b.right):new 
Qh(b.key,b.H,f,b.right,null):null;f=function(){var f=b.right;return Yh.M?Yh.M(a,f,c,d):Yh.call(null,a,f,c,d)}();return null!=f||null!=d[0]?b.right instanceof Rh?Uh(b.key,b.H,b.left,f):new Qh(b.key,b.H,b.left,f,null):null}return null},Zh=function Zh(a,b,c,d){var f=b.key,h=a.c?a.c(c,f):a.call(null,c,f);return 0===h?b.replace(f,d,b.left,b.right):0>h?b.replace(f,b.H,function(){var f=b.left;return Zh.M?Zh.M(a,f,c,d):Zh.call(null,a,f,c,d)}(),b.right):b.replace(f,b.H,b.left, +function(){var f=b.right;return Zh.M?Zh.M(a,f,c,d):Zh.call(null,a,f,c,d)}())};function $h(a,b,c,d,e){this.Bb=a;this.mc=b;this.F=c;this.meta=d;this.w=e;this.m=418776847;this.J=8192}g=$h.prototype;g.forEach=function(a){for(var b=E(this),c=null,d=0,e=0;;)if(ed?c.left:c.right}else return null}g.has=function(a){return He(this,a)};g.V=function(a,b){return this.I(null,b,null)}; +g.I=function(a,b,c){a=ai(this,b);return null!=a?a.H:c};g.Qc=function(a,b,c){return null!=this.mc?Jd(Vh(this.mc,b,c)):c};g.P=function(){return this.meta};g.W=function(){return this.F};g.Rc=function(){return 0(a.h?a.h(c):a.call(null,c))?b:c};Ai.A=function(a,b,c,d){return Mb(function(b,c){return Ai.l(a,b,c)},Ai.l(a,b,c),d)};Ai.N=function(a){var b=y(a),c=z(a);a=y(c);var d=z(c);c=y(d);d=z(d);return Ai.A(b,a,c,d)};Ai.L=3;function Bi(a,b){return new kf(null,function(){var c=E(b);if(c){var d=y(c);d=a.h?a.h(d):a.call(null,d);c=t(d)?ae(y(c),Bi(a,vd(c))):null}else c=null;return c},null,null)}function Di(a,b,c){this.i=a;this.end=b;this.step=c} +Di.prototype.ja=function(){return 0this.end};Di.prototype.next=function(){var a=this.i;this.i+=this.step;return a};function Ei(a,b,c,d,e){this.meta=a;this.start=b;this.end=c;this.step=d;this.w=e;this.m=32375006;this.J=139264}g=Ei.prototype;g.toString=function(){return fd(this)};g.equiv=function(a){return this.K(null,a)}; +g.indexOf=function(){var a=null;a=function(a,c){switch(arguments.length){case 1:return Ud(this,a,0);case 2:return Ud(this,a,c)}throw Error("Invalid arity: 
"+(arguments.length-1));};a.h=function(a){return Ud(this,a,0)};a.c=function(a,c){return Ud(this,a,c)};return a}(); +g.lastIndexOf=function(){function a(a){return Xd(this,a,H(this))}var b=null;b=function(b,d){switch(arguments.length){case 1:return a.call(this,b);case 2:return Xd(this,b,d)}throw Error("Invalid arity: "+(arguments.length-1));};b.h=a;b.c=function(a,b){return Xd(this,a,b)};return b}();g.$=function(a,b){if(0<=b&&bthis.end&&0===this.step)return this.start;throw Error("Index out of bounds");}; +g.ka=function(a,b,c){return 0<=b&&bthis.end&&0===this.step?this.start:c};g.ba=function(){return new Di(this.start,this.end,this.step)};g.P=function(){return this.meta};g.Ka=function(){return 0this.end?new Ei(this.meta,this.start+this.step,this.end,this.step,null):null}; +g.W=function(){return wb(this.S(null))?0:Math.ceil((this.end-this.start)/this.step)};g.U=function(){var a=this.w;return null!=a?a:this.w=a=Ad(this)};g.K=function(a,b){return $d(this,b)};g.oa=function(){return tc(wd,this.meta)};g.Fa=function(a,b){return Kd(this,b)};g.Ga=function(a,b,c){for(a=this.start;;)if(0this.end){c=b.c?b.c(c,a):b.call(null,c,a);if(Hd(c))return B(c);a+=this.step}else return c};g.Ia=function(){return null==this.S(null)?null:this.start}; +g.bb=function(){return null!=this.S(null)?new Ei(this.meta,this.start+this.step,this.end,this.step,null):wd};g.S=function(){return 0this.step?this.start>this.end?this:null:this.start===this.end?null:this};g.T=function(a,b){return new Ei(b,this.start,this.end,this.step,this.w)};g.X=function(a,b){return ae(b,this)};Ei.prototype[Fb]=function(){return yd(this)};function Fi(a,b,c){return new Ei(null,a,b,c,null)} +function Gi(a,b){return new R(null,2,5,T,[Bi(a,b),ng(a,b)],null)} +function Hi(a){var b=y;return function(){function c(c,d,e){return new R(null,2,5,T,[b.l?b.l(c,d,e):b.call(null,c,d,e),a.l?a.l(c,d,e):a.call(null,c,d,e)],null)}function d(c,d){return new R(null,2,5,T,[b.c?b.c(c,d):b.call(null,c,d),a.c?a.c(c,d):a.call(null,c,d)],null)}function 
e(c){return new R(null,2,5,T,[b.h?b.h(c):b.call(null,c),a.h?a.h(c):a.call(null,c)],null)}function f(){return new R(null,2,5,T,[b.B?b.B():b.call(null),a.B?a.B():a.call(null)],null)}var h=null,k=function(){function c(a,b,c,e){var f= +null;if(3lb)return Jc(a,"#");Jc(a,c);if(0===tb.h(f))E(h)&&Jc(a,function(){var a=Ki.h(f);return t(a)?a:"..."}());else{if(E(h)){var l=y(h);b.l?b.l(l,a,f):b.call(null,l,a,f)}for(var p=z(h),m=tb.h(f)-1;;)if(!p||null!=m&&0===m){E(p)&&0===m&&(Jc(a,d),Jc(a,function(){var a=Ki.h(f);return t(a)?a:"..."}()));break}else{Jc(a,d);var u=y(p);c=a;h=f;b.l?b.l(u,c,h):b.call(null,u,c,h);var w=z(p);c=m-1;p=w;m=c}}return Jc(a,e)}finally{lb=k}} +function Li(a,b){for(var c=E(b),d=null,e=0,f=0;;)if(fH(a)?a.toUpperCase():[v.h(a.substring(0,1).toUpperCase()),v.h(a.substring(1))].join("")} +function Qo(a){if("string"===typeof a)return a;a=jf(a);var b=Fo(a,/-/),c=E(b);b=y(c);c=z(c);return t(Oo.h?Oo.h(b):Oo.call(null,b))?a:Kb(v,b,ig.c(Po,c))}function Ro(a){var b=function(){var b=function(){var b=me(a);return b?(b=a.displayName,t(b)?b:a.name):b}();if(t(b))return b;b=function(){var b=null!=a?a.J&4096||q===a.Oe?!0:!1:!1;return b?jf(a):b}();if(t(b))return b;b=qe(a);return xe(b)?Tk.h(b):null}();return Do(""+v.h(b),"$",".")}var So=!1;if("undefined"===typeof To)var To=0;function Uo(a){return setTimeout(a,16)}var Vo="undefined"===typeof window||null==window.document?Uo:function(){var a=window,b=a.requestAnimationFrame;if(t(b))return b;b=a.webkitRequestAnimationFrame;if(t(b))return b;b=a.mozRequestAnimationFrame;if(t(b))return b;a=a.msRequestAnimationFrame;return t(a)?a:Uo}();function Wo(a,b){return a.cljsMountOrder-b.cljsMountOrder}if("undefined"===typeof Xo)var Xo=function(){return null};function Yo(a){this.Yd=a} +function Zo(a,b){var c=a[b];if(null==c)return null;a[b]=null;for(var d=c.length,e=0;;)if(e=d&&a.push(gq(c));return a}}(e),[b,c],a))}};if("undefined"===typeof jq)var jq=null;function kq(){if(null!=jq)return jq;if("undefined"!==typeof ReactDOM)return 
jq=ReactDOM;if("undefined"!==typeof require){var a=jq=require("react-dom");if(t(a))return a;throw Error("require('react-dom') failed");}throw Error("js/ReactDOM is missing");}if("undefined"===typeof lq)var lq=dg.h(Ef); +function mq(a,b,c){var d=So;So=!0;try{return kq().render(a.B?a.B():a.call(null),b,function(){return function(){var d=So;So=!1;try{return gg.M(lq,K,b,new R(null,2,5,T,[a,b],null)),Zo(bp,"afterRender"),null!=c?c.B?c.B():c.call(null):null}finally{So=d}}}(d))}finally{So=d}}function nq(a,b){return mq(a,b,null)}function oq(a,b,c){qp();return mq(function(){return gq(me(a)?a.B?a.B():a.call(null):a)},b,c)}Wp=function(a){return kq().findDOMNode(a)};function pq(a){switch(arguments.length){case 2:return oq(arguments[0],arguments[1],null);case 3:return oq(arguments[0],arguments[1],arguments[2]);default:throw Error(["Invalid arity: ",v.h(arguments.length)].join(""));}}function qq(a,b){return oq(a,b,null)} +da("reagent.core.force_update_all",function(){qp();qp();for(var a=E(mh(B(lq))),b=null,c=0,d=0;;)if(d=Number(c)?a:a=-1Number(a)?"-":0<=b.indexOf("+")?"+":0<=b.indexOf(" ")?" 
":"";0<=Number(a)&&(d=f+d);if(isNaN(c)||d.length>=Number(c))return d;d=isNaN(e)?Math.abs(Number(a)).toString():Math.abs(Number(a)).toFixed(e);a=Number(c)-d.length-f.length;0<=b.indexOf("-",0)?d=f+d+sa(" ",a):(b=0<=b.indexOf("0",0)?"0":" ",d=f+sa(b,a)+d);return d};yq.fc.d=function(a,b,c,d,e,f,h,k){return yq.fc.f(parseInt(a,10),b,c,d,0,f,h,k)}; +yq.fc.i=yq.fc.d;yq.fc.u=yq.fc.d;function zq(a){var b=be([Vk,null]);return wg.c(t(a)?a:Ef,function(){return function e(a){return new kf(null,function(){for(var b=a;;)if(b=E(b)){if(Ae(b)){var d=Wc(b),k=H(d),l=of(k);a:for(var p=0;;)if(p=H(h)&&Vf(function(){return function(a){return!(a instanceof Xq)}}(b,c,d,e,f,h),h)))throw Error(Bq("%s is not a valid sequence schema; %s%s%s",be([a,"a valid sequence schema consists of zero or more `one` elements, ","followed by zero or more `optional` elements, followed by an optional ", +"schema that will match the remaining elements."])));return new R(null,2,5,T,[O.c(c,f),y(h)],null)} +R.prototype.xb=function(){var a=this,b=Zq(a),c=J(b,0,null),d=J(b,1,null);return Wg(O.c(function(){return function(a,b,c,d){return function m(e){return new kf(null,function(){return function(){for(;;){var a=E(e);if(a){if(Ae(a)){var b=Wc(a),c=H(b),d=of(c);return function(){for(var a=0;;)if(ac?f:c;return $r(a,ea?0:a}():function(){var a=e-b;return f>a?f:a}())} +function gs(a,b){var c=null!=a&&(a.m&64||q===a.G)?P(U,a):a,d=D.c(c,pl);d=null!=d&&(d.m&64||q===d.G)?P(U,d):d;var e=D.c(d,Aj),f=D.c(c,Yj),h=D.c(c,no);return $r(c,e>f?function(){var a=h-1,c=e+b;return a=a}}(l,p,a,c,c,d,e,f,h,k),h),l,p);return Zr(c,d)} +function it(a,b){var c=null!=a&&(a.m&64||q===a.G)?P(U,a):a,d=D.c(c,pl),e=null!=d&&(d.m&64||q===d.G)?P(U,d):d,f=D.c(e,zn),h=D.c(c,tk),k=D.c(c,fl),l=b-1;d=J(cf(Bi(function(a,b,c,d,e,f,h){return function(a){return h>a}}(l,a,c,c,d,e,f,h,k),h)),l,0);return Zr(c,d)}function jt(a){return K.l(a,im,Ve)}function kt(a){return K.l(a,im,Hr)}function lt(a,b,c){return K.l(a,b,c)}function mt(a,b,c){return Wg(O.A(jg(b,a),new 
R(null,1,5,T,[c],null),be([jg(H(a)-b-1,kg(b,a))])))} +function nt(a,b){var c=null!=a&&(a.m&64||q===a.G)?P(U,a):a,d=D.c(c,pl),e=null!=d&&(d.m&64||q===d.G)?P(U,d):d;d=D.c(e,zn);e=D.c(e,Aj);var f=D.c(c,fl);D.c(c,no);var h=D.c(c,Oj),k=D.c(c,Rj),l=D.c(c,$l),p=D.c(c,im);p=95b?p.h?p.h(b):p.call(null,b):b;h=tr(p,h);return G.c(f,d+1)?t(k)?K.l(Yr(zg(c,new R(null,3,5,T,[il,e,d],null),h),d+1),vk,!0):zg(c,new R(null,3,5,T,[il,e,d],null),h):Yr(Ag.Z(c,new R(null,2,5,T,[il,e],null),t(l)?mt:lt,d,h),d+1)} +function ot(a,b){var c=null!=a&&(a.m&64||q===a.G)?P(U,a):a,d=D.c(c,Rj),e=D.c(c,vk);t(t(d)?e:d)&&(c=null!=c&&(c.m&64||q===c.G)?P(U,c):c,d=D.c(c,pl),d=null!=d&&(d.m&64||q===d.G)?P(U,d):d,d=D.c(d,Aj),e=D.c(c,no),c=Yr(c,0),c=G.c(e,d+1)?Tr.h(c):$r(c,d+1));return c=nt(c,b)}function pt(a){a=null!=a&&(a.m&64||q===a.G)?P(U,a):a;var b=D.c(a,fl),c=D.c(a,no);return K.l(a,il,Wg(qg(c,Wg(qg(b,new R(null,2,5,T,[69,Ef],null))))))} +function qt(a){a=null!=a&&(a.m&64||q===a.G)?P(U,a):a;var b=D.c(a,pl);b=null!=b&&(b.m&64||q===b.G)?P(U,b):b;b=D.c(b,Aj);var c=D.c(a,fl),d=D.c(a,Oj);return zg(a,new R(null,2,5,T,[il,b],null),gr.c(c,d))}function rt(a,b,c){return Wg(O.c(jg(b,a),qg(H(a)-b,vr(c))))}function st(a,b,c){return Wg(O.c(qg(b+1,vr(c)),kg(b+1,a)))} +function tt(a){a=null!=a&&(a.m&64||q===a.G)?P(U,a):a;var b=D.c(a,pl),c=null!=b&&(b.m&64||q===b.G)?P(U,b):b;b=D.c(c,zn);c=D.c(c,Aj);var d=D.c(a,fl),e=D.c(a,Oj);--d;return Ag.Z(a,new R(null,2,5,T,[il,c],null),rt,b=k?Zr(c,k-1):c,m=Mb(D,p,new R(null,2,5,T,[pl,zn],null));return Ag.l(p,new R(null,2,5,T,[il,h],null),function(a,b,c,d,e,f,h,k,m,l,p,Q){return function(a){return Wg(O.A(jg(b,a),kg(b+c,a),be([qg(c,vr(Q))])))}}(p,m,function(){var a=k-m;return b=a}}(c,b)(b)}()))return Gu(a,b+64);throw Jt;}catch(h){if(h instanceof Error){var d=h;if(d===Jt)try{if(55===b)return Bg(a,V,ms);throw Jt;}catch(k){if(k instanceof Error){var e=k;if(e===Jt)try{if(56===b)return Bg(a,V,ns);throw Jt;}catch(l){if(l instanceof Error){var f=l;if(f===Jt)try{if(99===b)return du(a); +throw 
Jt;}catch(p){if(p instanceof Error){d=p;if(d===Jt)throw Jt;throw d;}throw p;}else throw f;}else throw l;}else throw e;}else throw k;}else throw d;}else throw h;}else throw Jt;}catch(h){if(h instanceof Error)if(d=h,d===Jt)try{if(35===c)try{if(56===b)return Bg(a,V,pt);throw Jt;}catch(k){if(k instanceof Error){e=k;if(e===Jt)throw Jt;throw e;}throw k;}else throw Jt;}catch(k){if(k instanceof Error)if(e=k,e===Jt)try{if(40===c)try{if(48===b)return Zt(a);throw Jt;}catch(l){if(l instanceof Error){f= +l;if(f===Jt)return $t(a);throw f;}throw l;}else throw Jt;}catch(l){if(l instanceof Error){f=l;if(f===Jt)return a;throw f;}throw l;}else throw e;else throw k;}else throw d;else throw h;}},function(a){return a},function(a){return a},Gu,function(a,b){return Cg(a,V,ot,b)},function(a,b){var c=function(){switch(b){case 64:return eu;case 65:return fu;case 66:return gu;case 67:return hu;case 68:return iu;case 69:return ju;case 70:return ku;case 71:return lu;case 72:return mu;case 73:return nu;case 74:return ou; +case 75:return pu;case 76:return su;case 77:return tu;case 80:return uu;case 83:return qu;case 84:return ru;case 87:return vu;case 88:return wu;case 90:return xu;case 96:return lu;case 97:return hu;case 100:return Du;case 101:return fu;case 102:return mu;case 103:return yu;case 104:return zu;case 108:return Au;case 109:return Cu;case 112:return Eu;case 114:return Fu;default:return null}}();return t(c)?c.h?c.h(a):c.call(null,a):a},function(a){return a},function(a,b){return K.l(a,kk,ge.c(kk.h(a),b))}, +function(a){return a},function(a,b){return K.l(a,rk,ge.c(rk.h(a),b))},function(a){return a},function(a){return a},function(a){return K.A(a,rk,he,be([kk,he]))}]);function Iu(a,b){for(var c=a,d=Tl.h(c),e=b;;){var f=y(e);if(t(f)){var h=160<=f?65:f;h=D.c(d.h?d.h(xq):d.call(null,xq),h);d=J(h,0,null);h=J(h,1,null);a:for(;;)if(E(h)){var k=y(h);k=Hu.h?Hu.h(k):Hu.call(null,k);c=k.c?k.c(c,f):k.call(null,c,f);h=z(h)}else break a;e=vd(e)}else return K.l(c,Tl,d)}} +function Ju(a,b){var 
c=xg(function(a){return a.codePointAt(0)},b);return Iu(a,c)} +function Ku(a,b){try{if(ze(b)&&3===H(b)){var c=Vd(b,0),d=Vd(b,1),e=Vd(b,2);return[v.h(a+8),";2;",v.h(c),";",v.h(d),";",v.h(e)].join("")}throw Jt;}catch(k){if(k instanceof Error){var f=k;if(f===Jt)try{if(t(function(){return function(){return function(a){return 8>a}}(f)(b)}()))return""+v.h(a+b);throw Jt;}catch(l){if(l instanceof Error){var h=l;if(h===Jt)try{if(t(function(){return function(){return function(a){return 16>a}}(h,f)(b)}()))return""+v.h(a+52+b);throw Jt;}catch(p){if(p instanceof Error){c= +p;if(c===Jt)return[v.h(a+8),";5;",v.h(b)].join("");throw c;}throw p;}else throw h;}else throw l;}else throw f;}else throw k;}}ag.c(Ku,30);ag.c(Ku,40);var Lu=function Lu(a){if(null!=a&&null!=a.yd)return a.yd(a);var c=Lu[n(null==a?null:a)];if(null!=c)return c.h?c.h(a):c.call(null,a);c=Lu._;if(null!=c)return c.h?c.h(a):c.call(null,a);throw Cb("Screen.lines",a);},Mu=function Mu(a){if(null!=a&&null!=a.xd)return a.xd(a);var c=Mu[n(null==a?null:a)];if(null!=c)return c.h?c.h(a):c.call(null,a);c=Mu._;if(null!=c)return c.h?c.h(a):c.call(null,a);throw Cb("Screen.cursor",a);};function Nu(a,b){var c=0parseFloat(Iv)){Hv=String(Kv);break a}}Hv=Iv}var gb={}; +function Lv(a){return fb(a,function(){for(var b=0,c=ra(String(Hv)).split("."),d=ra(String(a)).split("."),e=Math.max(c.length,d.length),f=0;0==b&&f=a.keyCode)a.keyCode=-1}catch(b){}};var Uv="closure_listenable_"+(1E6*Math.random()|0),Vv=0;function Wv(a,b,c,d,e){this.listener=a;this.Xd=null;this.src=b;this.type=c;this.capture=!!d;this.Ub=e;this.key=++Vv;this.$c=this.Fd=!1}function Xv(a){a.$c=!0;a.listener=null;a.Xd=null;a.src=null;a.Ub=null};function Yv(a){this.src=a;this.rb={};this.wd=0}Yv.prototype.add=function(a,b,c,d,e){var f=a.toString();a=this.rb[f];a||(a=this.rb[f]=[],this.wd++);var h=Zv(a,b,d,e);-1e.keyCode||void 0!=e.returnValue)){a:{var f=!1;if(0==e.keyCode)try{e.keyCode=-1;break a}catch(l){f=!0}if(f||void 
0==e.returnValue)e.returnValue=!0}e=[];for(f=c.currentTarget;f;f=f.parentNode)e.push(f);f=a.type;for(var h=e.length-1;!c.Kc&&0<=h;h--){c.currentTarget=e[h];var k=nw(e[h],f,!0,c);d=d&&k}for(h=0;!c.Kc&& +h>>0);function fw(a){if(ha(a))return a;a[pw]||(a[pw]=function(b){return a.handleEvent(b)});return a[pw]};function qw(){wv.call(this);this.Ib=new Yv(this);this.ff=this;this.ve=null}qa(qw,wv);qw.prototype[Uv]=!0;g=qw.prototype;g.addEventListener=function(a,b,c,d){dw(this,a,b,c,d)};g.removeEventListener=function(a,b,c,d){lw(this,a,b,c,d)}; +g.dispatchEvent=function(a){var b,c=this.ve;if(c)for(b=[];c;c=c.ve)b.push(c);c=this.ff;var d=a.type||a;if(ca(a))a=new Sv(a,c);else if(a instanceof Sv)a.target=a.target||c;else{var e=a;a=new Sv(d,c);Ia(a,e)}e=!0;if(b)for(var f=b.length-1;!a.Kc&&0<=f;f--){var h=a.currentTarget=b[f];e=rw(h,d,!0,a)&&e}a.Kc||(h=a.currentTarget=c,e=rw(h,d,!0,a)&&e,a.Kc||(e=rw(h,d,!1,a)&&e));if(b)for(f=0;!a.Kc&&fthis.head?(Yw(this.o,this.fa,a,0,this.o.length-this.fa),Yw(this.o,0,a,this.o.length-this.fa,this.head),this.fa=0,this.head=this.length,this.o=a):this.fa===this.head?(this.head=this.fa=0,this.o=a):null};function ax(a,b){for(var c=a.length,d=0;;)if(da)){a+=1;continue}break}hx=!1;return 0c)return a;a:for(;;){var e=cMath.random()&&15>d)d+=1;else break a;if(d>this.level){for(var e=this.level+1;;)if(e<=d+1)c[e]=this.header,e+=1;else break;this.level=d}for(d=Ex(a,b,Array(d));;)return 0<=this.level?(c=c[0].forward,d.forward[0]=c[0],c[0]=d):null}; +Gx.prototype.remove=function(a){var b=Array(15),c=Fx(this.header,a,this.level,b);c=0===c.forward.length?null:c.forward[0];if(null!=c&&c.key===a){for(a=0;;)if(a<=this.level){var d=b[a].forward;c===(ad)return c===b.header?null:c;var e;a:for(e=c;;){e=d=a)break a}null!=e?(--d,c=e):--d}}Gx.prototype.S=function(){return function(a){return function d(c){return new kf(null,function(){return function(){return null==c?null:ae(new 
R(null,2,5,T,[c.key,c.H],null),d(c.forward[0]))}}(a),null,null)}}(this)(this.header.forward[0])}; +Gx.prototype.R=function(a,b,c){return Y(b,function(){return function(a){return Y(b,Qi,""," ","",c,a)}}(this),"{",", ","}",c,this)};var Ix=new Gx(Ex(null,null,0),0);function Jx(a){var b=(new Date).valueOf()+a,c=Hx(b),d=t(t(c)?c.keya:b)?a+8:a,[v.h(c),v.h(a)].join("")):null} +function Vy(a){var b=J(a,0,null),c=J(a,1,null);a=J(a,2,null);return["rgb(",v.h(b),",",v.h(c),",",v.h(a),")"].join("")} +var Wy=hj(function(a){a=null!=a&&(a.m&64||q===a.G)?P(U,a):a;var b=D.c(a,Nk),c=D.c(a,pl);a=K.l(a,Nk,t(c)?wb(b):b);var d=null!=a&&(a.m&64||q===a.G)?P(U,a):a,e=D.c(d,Ok),f=D.c(d,Tn);b=D.c(d,Kj);var h=D.c(d,dk);c=D.c(d,Vl);var k=D.c(d,Nk),l=D.c(d,Yn);d=D.c(d,pl);var p=t(k)?t(e)?e:"fg":f;e=Uy(t(k)?t(f)?f:"bg":e,b,"fg-");h=Uy(p,h,"bg-");c=vg(ub,new R(null,6,5,T,[e,h,t(b)?"bright":null,t(l)?"italic":null,t(c)?"underline":null,t(d)?"cursor":null],null));if(E(c))a:for(b=new cb,c=E(c);;)if(null!=c)b.append(""+ +v.h(y(c))),c=z(c),null!=c&&b.append(" ");else{b=b.toString();break a}else b=null;l=null!=a&&(a.m&64||q===a.G)?P(U,a):a;a=D.c(l,Ok);c=D.c(l,Tn);h=D.c(l,Nk);l=t(h)?c:a;a=t(h)?a:c;a=hi.A(be([t(ze.h?ze.h(l):ze.call(null,l))?new r(null,1,[ik,Vy(l)],null):null,t(ze.h?ze.h(a):ze.call(null,a))?new r(null,1,[al,Vy(a)],null):null]));return hi.A(be([t(b)?new r(null,1,[vn,b],null):null,t(a)?new r(null,1,[fm,a],null):null]))}); +function Xy(a,b){var c=J(a,0,null),d=J(a,1,null);d=Bg(d,pl,function(){return function(a){return t(a)?B(b):a}}(a,c,d));return new R(null,3,5,T,[ro,Wy.h?Wy.h(d):Wy.call(null,d),c],null)}function Yy(a,b){var c=J(a,0,null),d=J(a,1,null),e=jg(b,c);e=E(e)?new R(null,2,5,T,[Eo(e),d],null):null;var f=K.l(d,pl,!0);f=new R(null,2,5,T,[Vd(c,b),f],null);c=kg(b+1,c);d=E(c)?new R(null,2,5,T,[Eo(c),d],null):null;return vg(ub,new R(null,3,5,T,[e,f,d],null))} +function Zy(a,b){for(var c=he,d=a,e=b;;)if(E(d)){var f=y(d),h=J(f,0,null);J(f,1,null);h=H(h);if(h<=e)c=ge.c(c,f),d=vd(d),e-=h;else 
return O.A(c,Yy(f,e),be([vd(d)]))}else return c}function $y(a,b,c){a=t(B(b))?Zy(B(a),B(b)):B(a);return new R(null,2,5,T,[Lm,Ii(bg(function(){return function(a,b){return pe(new R(null,3,5,T,[Xy,b,c],null),new r(null,1,[mk,a],null))}}(a),a))],null)}var qA=new ti(null,new r(null,3,["small",null,"medium",null,"big",null],null),null); +function rA(a,b,c,d,e){var f=yp(function(){var a=B(c);return t(qA.h?qA.h(a):qA.call(null,a))?["font-",v.h(a)].join(""):null}),h=yp(function(){return function(){var d=B(a),e=B(b),f=B(c);f=t(qA.h?qA.h(f):qA.call(null,f))?null:new r(null,1,[wk,f],null);return hi.A(be([new r(null,2,[fl,[v.h(d),"ch"].join(""),no,[v.h(1.3333333333*e),"em"].join("")],null),f]))}}(f)),k=yp(function(){return function(){return Lu(B(d))}}(f,h)),l=yp(function(a,c,d){return function(){return xg(function(a,b,c){return function(d){return yp(function(a, +b,c){return function(){return D.c(B(c),d)}}(a,b,c))}}(a,c,d),Fi(0,B(b),1))}}(f,h,k)),p=yp(function(){return function(){return Mu(B(d))}}(f,h,k,l)),m=yp(function(a,b,c,d,e){return function(){return zn.h(B(e))}}(f,h,k,l,p)),u=yp(function(a,b,c,d,e){return function(){return Aj.h(B(e))}}(f,h,k,l,p,m)),w=yp(function(a,b,c,d,e){return function(){return On.h(B(e))}}(f,h,k,l,p,m,u));return function(a,b,c,d,f,h,k,l){return function(){return new R(null,3,5,T,[Gm,new r(null,2,[vn,B(a),fm,B(b)],null),bg(function(a, +b,c,d,f,h,k,l){return function(m,p){var u=yp(function(a,b,c,d,e,f,h,k){return function(){var a=B(k);return t(a)?(a=G.c(m,B(h)))?B(f):a:a}}(a,b,c,d,f,h,k,l));return pe(new R(null,4,5,T,[$y,p,u,e],null),new r(null,1,[mk,m],null))}}(a,b,c,d,f,h,k,l),B(d))],null)}}(f,h,k,l,p,m,u,w)} +function sA(){return new R(null,2,5,T,[Ym,new r(null,4,[Mn,"1.1",Fl,"0 0 866.0254037844387 866.0254037844387",vn,"icon",mo,new r(null,1,[An,'\x3cdefs\x3e \x3cmask id\x3d"small-triangle-mask"\x3e \x3crect width\x3d"100%" height\x3d"100%" fill\x3d"white"/\x3e \x3cpolygon points\x3d"508.01270189221935 433.01270189221935, 208.0127018922194 
259.8076211353316, 208.01270189221927 606.217782649107" fill\x3d"black"\x3e\x3c/polygon\x3e \x3c/mask\x3e \x3c/defs\x3e \x3cpolygon points\x3d"808.0127018922194 433.01270189221935, 58.01270189221947 -1.1368683772161603e-13, 58.01270189221913 866.0254037844386" mask\x3d"url(#small-triangle-mask)" fill\x3d"white"\x3e\x3c/polygon\x3e \x3cpolyline points\x3d"481.2177826491071 333.0127018922194, 134.80762113533166 533.0127018922194" stroke\x3d"white" stroke-width\x3d"90"\x3e\x3c/polyline\x3e'],null)], +null)],null)}function tA(){return new R(null,3,5,T,[Ym,new r(null,3,[Mn,"1.1",Fl,"0 0 12 12",vn,"icon"],null),new R(null,2,5,T,[Fj,new r(null,1,[pn,"M1,0 L11,6 L1,12 Z"],null)],null)],null)}function uA(){return new R(null,4,5,T,[Ym,new r(null,3,[Mn,"1.1",Fl,"0 0 12 12",vn,"icon"],null),new R(null,2,5,T,[Fj,new r(null,1,[pn,"M1,0 L4,0 L4,12 L1,12 Z"],null)],null),new R(null,2,5,T,[Fj,new r(null,1,[pn,"M8,0 L11,0 L11,12 L8,12 Z"],null)],null)],null)} +function vA(){return new R(null,4,5,T,[Ym,new r(null,3,[Mn,"1.1",Fl,"0 0 12 12",vn,"icon"],null),new R(null,2,5,T,[Fj,new r(null,1,[pn,"M12,0 L7,0 L9,2 L7,4 L8,5 L10,3 L12,5 Z"],null)],null),new R(null,2,5,T,[Fj,new r(null,1,[pn,"M0,12 L0,7 L2,9 L4,7 L5,8 L3,10 L5,12 Z"],null)],null)],null)} +function wA(){return new R(null,4,5,T,[Ym,new r(null,3,[Mn,"1.1",Fl,"0 0 12 12",vn,"icon"],null),new R(null,2,5,T,[Fj,new r(null,1,[pn,"M7,5 L7,0 L9,2 L11,0 L12,1 L10,3 L12,5 Z"],null)],null),new R(null,2,5,T,[Fj,new r(null,1,[pn,"M5,7 L0,7 L2,9 L0,11 L1,12 L3,10 L5,12 Z"],null)],null)],null)}function xA(a,b){return function(b){return function(){return new R(null,3,5,T,[cl,new r(null,1,[Sl,b],null),new R(null,1,5,T,[t(B(a))?uA:tA],null)],null)}}(Ty(b,new fy(null,null,null)))} +function yA(a){return 10>a?["0",v.h(a)].join(""):a}function zA(a){var b=Math.floor((a%60+60)%60);return[v.h(yA(Math.floor(a/60))),":",v.h(yA(b))].join("")}function AA(a,b){var c=T,d=new R(null,2,5,T,[Yk,zA(B(a))],null),e=T;var f=B(a);var 
h=B(b);f=["-",v.h(zA(h-f))].join("");return new R(null,3,5,c,[Ml,d,new R(null,2,5,e,[co,f],null)],null)} +function BA(){function a(a){a.preventDefault();return Ry(a.currentTarget.parentNode.parentNode.parentNode)}return function(){return new R(null,4,5,T,[un,new r(null,1,[Sl,a],null),new R(null,1,5,T,[vA],null),new R(null,1,5,T,[wA],null)],null)}} +function CA(a,b){var c=Sy(b,function(a){var b=a.currentTarget.offsetWidth,c=a.currentTarget.getBoundingClientRect();return cy(Nu(a.clientX-c.left,b)/b)}),d=yp(function(){return function(){return[v.h(100*B(a)),"%"].join("")}}(c));return function(a,b){return function(){return new R(null,2,5,T,[Vj,new R(null,3,5,T,[Bl,new r(null,1,[Ql,a],null),new R(null,2,5,T,[Cj,new R(null,2,5,T,[ro,new r(null,1,[fm,new r(null,1,[fl,B(b)],null)],null)],null)],null)],null)],null)}}(c,d)} +function DA(a,b,c,d){return function(e){return function(){return new R(null,5,5,T,[Kk,new R(null,3,5,T,[xA,a,d],null),new R(null,3,5,T,[AA,b,c],null),new R(null,1,5,T,[BA],null),new R(null,3,5,T,[CA,e,d],null)],null)}}(yp(function(){return B(b)/B(c)}))} +function EA(a){return function(a){return function(){return new R(null,3,5,T,[ol,new r(null,1,[Sl,a],null),new R(null,2,5,T,[Xk,new R(null,2,5,T,[km,new R(null,2,5,T,[ro,new R(null,1,5,T,[sA],null)],null)],null)],null)],null)}}(Ty(a,new fy(null,null,null)))}function FA(){return new R(null,2,5,T,[Ek,new R(null,1,5,T,[xn],null)],null)}function GA(a){return Wf(function(b){return a[b]},new R(null,4,5,T,["altKey","shiftKey","metaKey","ctrlKey"],null))} +function HA(a){var b=t(GA(a))?null:function(){switch(a.key){case " ":return new fy(null,null,null);case "f":return bm;case "0":return cy(0);case "1":return cy(.1);case "2":return cy(.2);case "3":return cy(.3);case "4":return cy(.4);case "5":return cy(.5);case "6":return cy(.6);case "7":return cy(.7);case "8":return cy(.8);case "9":return cy(.9);default:return null}}();if(t(b))return b;switch(a.key){case "\x3e":return new ey(null,null,null);case "\x3c":return new 
dy(null,null,null);default:return null}} +function IA(a){if(t(GA(a)))return null;switch(a.which){case 37:return new ay(null,null,null);case 39:return new $x(null,null,null);default:return null}}function JA(a){var b=HA(a);return t(b)?(a.preventDefault(),G.c(b,bm)?(Ry(a.currentTarget),null):b):null}function KA(a){var b=IA(a);return t(b)?(a.preventDefault(),b):null} +function LA(a,b,c,d){a=t(a)?['"',v.h(a),'"'].join(""):"untitled";return new R(null,4,5,T,[dl,t(d)?new R(null,2,5,T,[jo,new r(null,1,[zl,d],null)],null):null,a,t(b)?new R(null,3,5,T,[ro," by ",t(c)?new R(null,3,5,T,[lo,new r(null,1,[ho,c],null),b],null):b],null):null],null)} +function MA(a){var b=Mx(1,ig.h(iy)),c=Kx(1);lx(function(c){return function(){var d=function(){return function(a){return function(){function b(b){for(;;){a:try{for(;;){var c=a(b);if(!N(c,Z)){var d=c;break a}}}catch(x){if(x instanceof Object)b[5]=x,Cx(b),d=Z;else throw x;}if(!N(d,Z))return d}}function c(){var a=[null,null,null,null,null,null,null,null,null,null,null,null];a[0]=d;a[1]=1;return a}var d=null;d=function(a){switch(arguments.length){case 0:return c.call(this);case 1:return b.call(this,a)}throw Error("Invalid arity: "+ +(arguments.length-1));};d.B=c;d.h=b;return d}()}(function(){return function(c){var d=c[1];if(7===d)return c[7]=c[2],Ax(c,12,b,!1);if(1===d)return c[2]=null,c[1]=2,Z;if(4===d)return c[8]=c[2],Ax(c,5,b,!0);if(6===d)return d=Jx(3E3),Ux(c,8,new R(null,2,5,T,[a,d],null));if(3===d)return Bx(c,c[2]);if(12===d)return c[9]=c[2],c[2]=null,c[1]=2,Z;if(2===d)return zx(c,4,a);if(11===d)return c[2]=c[2],c[1]=7,Z;if(9===d)return c[2]=null,c[1]=6,Z;if(5===d)return c[10]=c[2],c[2]=null,c[1]=6,Z;if(10===d)return c[2]= +null,c[1]=11,Z;if(8===d){var e=c[2];d=J(e,0,null);e=J(e,1,null);e=G.c(e,a);c[11]=d;c[1]=e?9:10;return Z}return null}}(c),c)}(),f=function(){var a=d.B?d.B():d.call(null);a[6]=c;return a}();return yx(f)}}(c));return b} +function NA(a,b){var c=dg.h(b),d=Kx(1);lx(function(b,c){return function(){var d=function(){return 
function(a){return function(){function b(b){for(;;){a:try{for(;;){var c=a(b);if(!N(c,Z)){var d=c;break a}}}catch(F){if(F instanceof Object)b[5]=F,Cx(b),d=Z;else throw F;}if(!N(d,Z))return d}}function c(){var a=[null,null,null,null,null,null,null,null,null,null,null,null,null];a[0]=d;a[1]=1;return a}var d=null;d=function(a){switch(arguments.length){case 0:return c.call(this);case 1:return b.call(this, +a)}throw Error("Invalid arity: "+(arguments.length-1));};d.B=c;d.h=b;return d}()}(function(b,c){return function(d){var e=d[1];if(7===e){var f=d[7],h=wb(null==f);d[8]=d[2];d[1]=h?8:9;return Z}if(20===e)return f=d[7],d[1]=t(q===f.Fe)?23:24,Z;if(27===e)return d[2]=!1,d[1]=28,Z;if(1===e)return d[2]=null,d[1]=2,Z;if(24===e)return f=d[7],d[1]=t(!f.Tc)?26:27,Z;if(4===e){f=d[7];var k=d[9];h=d[2];var l=J(h,0,null),m=J(h,1,null);d[10]=m;d[7]=l;d[9]=h;d[1]=t(null==l)?5:6;return Z}return 15===e?(d[2]=!1,d[1]= +16,Z):21===e?(f=d[7],h=Ab(Yx,f),d[2]=h,d[1]=22,Z):31===e?(d[11]=d[2],d[2]=null,d[1]=2,Z):13===e?(d[2]=d[2],d[1]=10,Z):22===e?(d[1]=t(d[2])?29:30,Z):29===e?(f=d[7],h=B(a),h=Zx(f,h),h=gg.l(c,wo,h),d[2]=h,d[1]=31,Z):6===e?(d[2]=null,d[1]=7,Z):28===e?(d[2]=d[2],d[1]=25,Z):25===e?(d[2]=d[2],d[1]=22,Z):17===e?(m=d[10],f=d[7],k=d[9],h=gg.c(a,function(){return function(a,b){return function(a){return Xx(b,a)}}(k,f,m,m,f,k,e,b,c)}()),d[2]=h,d[1]=19,Z):3===e?Bx(d,d[2]):12===e?(f=d[7],d[1]=t(!f.Tc)?14:15,Z): +2===e?(h=B(c),h=E(h),Ux(d,4,h)):23===e?(d[2]=!0,d[1]=25,Z):19===e?(f=d[7],h=wb(null==f),d[12]=d[2],d[1]=h?20:21,Z):11===e?(d[2]=!0,d[1]=13,Z):9===e?(f=d[7],h=Ab(Wx,f),d[2]=h,d[1]=10,Z):5===e?(m=d[10],h=gg.l(c,re,m),d[2]=h,d[1]=7,Z):14===e?(f=d[7],h=Ab(Wx,f),d[2]=h,d[1]=16,Z):26===e?(f=d[7],h=Ab(Yx,f),d[2]=h,d[1]=28,Z):16===e?(d[2]=d[2],d[1]=13,Z):30===e?(d[2]=null,d[1]=31,Z):10===e?(d[1]=t(d[2])?17:18,Z):18===e?(d[2]=null,d[1]=19,Z):8===e?(f=d[7],d[1]=t(q===f.sb)?11:12,Z):null}}(b,c),b,c)}(),e=function(){var a= +d.B?d.B():d.call(null);a[6]=b;return a}();return yx(e)}}(d,c));return 
d} +function OA(a,b,c){c=Ty(c,!0);var d=Sy(b,JA),e=Sy(b,KA),f=yp(function(){return function(){return Hm.h(B(a))}}(c,d,e)),h=yp(function(){return function(){return el.h(B(a))}}(c,d,e,f)),k=yp(function(a,b,c,d,e){return function(){var a=B(d);return t(a)?a:B(e)}}(c,d,e,f,h)),l=yp(function(b,c,d,e,f,h){return function(){var b=Gk.h(B(a));b=t(b)?b:wb(B(h));return t(b)?"hud":null}}(c,d,e,f,h,k)),p=yp(function(){return function(){return["asciinema-theme-",v.h(gm.h(B(a)))].join("")}}(c,d,e,f,h,k,l)),m=yp(function(){return function(){var b= +fl.h(B(a));return t(b)?b:80}}(c,d,e,f,h,k,l,p)),u=yp(function(){return function(){var b=no.h(B(a));return t(b)?b:24}}(c,d,e,f,h,k,l,p,m)),w=yp(function(){return function(){return wk.h(B(a))}}(c,d,e,f,h,k,l,p,m,u)),x=yp(function(){return function(){return V.h(B(a))}}(c,d,e,f,h,k,l,p,m,u,w)),C=yp(function(){return function(){return ml.h(B(a))}}(c,d,e,f,h,k,l,p,m,u,w,x)),F=yp(function(){return function(){return jn.h(B(a))}}(c,d,e,f,h,k,l,p,m,u,w,x,C)),I=yp(function(){return function(){return Uj.h(B(a))}}(c, +d,e,f,h,k,l,p,m,u,w,x,C,F)),M=yp(function(){return function(){return wl.h(B(a))}}(c,d,e,f,h,k,l,p,m,u,w,x,C,F,I)),S=B(a),X=null!=S&&(S.m&64||q===S.G)?P(U,S):S,Ga=D.c(X,ki),db=D.c(X,li),Q=D.c(X,mi),xb=D.c(X,ni);return function(a,c,d,e,f,h,k,l,m,p,u,w,x,C,F,I,M,S,Q,X,Ga,db){return function(){return new R(null,3,5,T,[Cn,new r(null,5,[Jj,-1,Zj,c,Rn,d,Vm,a,vn,B(k)],null),new R(null,7,5,T,[Sm,new r(null,1,[vn,B(l)],null),new R(null,6,5,T,[rA,m,p,u,w,x],null),new R(null,5,5,T,[DA,C,F,I,b],null),t(t(Q)?Q: +X)?new R(null,5,5,T,[LA,Q,X,Ga,db],null):null,t(B(h))?null:new R(null,2,5,T,[EA,b],null),t(B(e))?new R(null,1,5,T,[FA],null):null],null)],null)}}(c,d,e,f,h,k,l,p,m,u,w,x,C,F,I,M,S,X,Ga,db,Q,xb)} +function PA(a){var b=Kx(null),c=Kx(new dx(bx(1),1));return function(b,c){return function(){return Pp(new r(null,4,[ln,"asciinema-player",Dm,function(b,c){return function(){return OA(a,b,c)}}(b,c),$k,function(b,c){return function(){var 
d=ty(Gl.h(B(a))),e=MA(c);Tx(e,b);return NA(a,Je([b,d]))}}(b,c),Wm,function(){return function(){return uy(Gl.h(B(a)))}}(b,c)],null))}}(b,c)};function QA(a,b){var c=null!=b&&(b.m&64||q===b.G)?P(U,b):b,d=D.c(c,Ak),e=D.c(c,Gl);d=a.h?a.h(d):a.call(null,d);zy(e,d);return K.l(c,Ak,d)}$x.prototype.sb=q;$x.prototype.qb=function(a,b){var c=null!=b&&(b.m&64||q===b.G)?P(U,b):b,d=D.c(c,Uj),e=D.c(c,wl),f=D.c(c,Gl);t(e)&&yy(f,Nu(d+5,e));return c};ay.prototype.sb=q;ay.prototype.qb=function(a,b){var c=null!=b&&(b.m&64||q===b.G)?P(U,b):b,d=D.c(c,Uj),e=D.c(c,wl),f=D.c(c,Gl);t(e)&&yy(f,Nu(d+-5,e));return c};by.prototype.sb=q; +by.prototype.qb=function(a,b){var c=null!=b&&(b.m&64||q===b.G)?P(U,b):b,d=D.c(c,wl),e=D.c(c,Gl);t(d)&&(d*=nn.h(this),yy(e,d));return c};dy.prototype.sb=q;dy.prototype.qb=function(a,b){return QA(function(){return function(a){return a/2}}(this),b)};ey.prototype.sb=q;ey.prototype.qb=function(a,b){return QA(function(){return function(a){return 2*a}}(this),b)};fy.prototype.sb=q;fy.prototype.qb=function(a,b){xy(Gl.h(b));return b};gy.prototype.sb=q;gy.prototype.qb=function(a,b){return K.l(b,ml,so.h(this))}; +hy.prototype.sb=q;hy.prototype.qb=function(a,b){return K.l(b,Gk,so.h(this))};jy.prototype.sb=q;jy.prototype.qb=function(a,b){var c=null!=a&&(a.m&64||q===a.G)?P(U,a):a;D.c(c,fl);D.c(c,no);D.c(c,wl);c=null!=b&&(b.m&64||q===b.G)?P(U,b):b;var d=D.c(c,fl),e=D.c(c,no),f=null!=this&&(this.m&64||q===this.G)?P(U,this):this,h=D.c(f,fl),k=D.c(f,no);f=D.c(f,wl);return K.A(c,fl,t(d)?d:h,be([no,t(e)?e:k,wl,f]))};ky.prototype.sb=q;ky.prototype.qb=function(a,b){return K.l(b,Hm,Hm.h(this))};oy.prototype.sb=q; +oy.prototype.qb=function(a,b){var c=null!=b&&(b.m&64||q===b.G)?P(U,b):b,d=D.c(c,oi);t(d)&&(ap(bp),d.B?d.B():d.call(null));return c};ry.prototype.sb=q;ry.prototype.qb=function(a,b){return K.l(b,Uj,Zk.h(this))};function RA(){return ig.l(function(a,b){return new R(null,2,5,T,[a,new gy(b,null,null,null)],null)},rg(function(a){return a+.5},.5),og(new 
R(null,2,5,T,[!1,!0],null)))}function SA(a){var b=Dy(RA());return K.l(K.l(a,ml,!0),Ol,b)} +function TA(a){a=null!=a&&(a.m&64||q===a.G)?P(U,a):a;var b=D.c(a,Ol);Tw(b);return K.l(K.l(a,ml,!0),Ol,null)}function UA(a){a=null!=a&&(a.m&64||q===a.G)?P(U,a):a;a=D.c(a,Ol);return t(a)?Je([a]):vi}my.prototype.sb=q; +my.prototype.qb=function(a,b){var c=null!=a&&(a.m&64||q===a.G)?P(U,a):a;D.c(c,jn);var d=null!=b&&(b.m&64||q===b.G)?P(U,b):b,e=D.c(d,jn);c=D.c(d,pi);var f=D.c(d,qi),h=null!=this&&(this.m&64||q===this.G)?P(U,this):this;h=D.c(h,jn);if(G.c(e,h))return d;d=K.A(d,jn,h,be([el,!0]));if(t(h))return t(c)&&(c.B?c.B():c.call(null)),SA(d);t(f)&&(f.B?f.B():f.call(null));return TA(d)};my.prototype.Fe=q;my.prototype.de=function(a,b){return UA(b)};py.prototype.sb=q; +py.prototype.qb=function(a,b){var c=K.l(b,V,V.h(this));c=null!=c&&(c.m&64||q===c.G)?P(U,c):c;var d=D.c(c,Ol);return t(d)?SA(TA(c)):c};py.prototype.Fe=q;py.prototype.de=function(a,b){return UA(b)};function VA(a){return t(a)?(a=ig.c(parseFloat,Fo(""+v.h(a),/:/)),a=ig.l(Ye,cf(a),rg(function(){return function(a){return 60*a}}(a),1)),P(Xe,a)):null} +function WA(a,b,c){t(a)?"string"===typeof a?t(0===a.indexOf("data:application/json;base64,"))?(b=a.substring(29).replace(RegExp("\\s","g"),""),b=JSON.parse(atob(b)),b=fj(b),b=new r(null,1,[V,new r(null,1,[il,b],null)],null)):t(0===a.indexOf("data:text/plain,"))?(a=a.substring(16),b=Ju(Ot(t(b)?b:80,t(c)?c:24),a),b=new r(null,1,[V,b],null)):b=t(0===a.indexOf("npt:"))?new r(null,1,[Zk,VA(a.substring(4))],null):null:b=new r(null,1,[V,new r(null,1,[il,a],null)],null):b=null;return b} +var XA=new r(null,2,[pl,new r(null,1,[On,!1],null),il,he],null); +function YA(a,b){var c=null!=b&&(b.m&64||q===b.G)?P(U,b):b,d=D.c(c,no),e=D.l(c,wk,"small"),f=D.l(c,Ak,1),h=D.c(c,Hk),k=D.c(c,fl),l=D.c(c,rl),p=D.l(c,cm,!1),m=D.l(c,gm,"asciinema"),u=D.c(c,qm),w=D.c(c,Bm),x=D.l(c,vm,!1),C=D.l(c,Em,!1),F=function(){var a=VA(h);return t(a)?a:0}();w=WA(w,k,d);var 
I=null!=w&&(w.m&64||q===w.G)?P(U,w):w;w=D.c(I,V);I=D.c(I,Zk);var M=t(I)?I:wb(w)&&0 tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + +div.body p, div.body dd, div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + 
+.align-right { + text-align: right; +} + +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px 7px 0 7px; + background-color: #ffe; + width: 40%; + float: right; +} + +p.sidebar-title { + font-weight: bold; +} + +/* -- topics ---------------------------------------------------------------- */ + +div.topic { + border: 1px solid #ccc; + padding: 7px 7px 0 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + +div.admonition dt { + font-weight: bold; +} + +div.admonition dl { + margin-bottom: 0; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + border-left: 0; + border-right: 0; + border-bottom: 1px solid #aaa; +} + +table.footnote td, table.footnote th { + border: 0 !important; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure { + margin: 0.5em; + padding: 0.5em; +} + +div.figure p.caption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number { + font-style: italic; +} + +div.figure p.caption 
span.caption-text { +} + +/* -- field list styles ----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +dl { + margin-bottom: 15px; +} + +dd p { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + +.classifier { + font-style: oblique; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +/* -- code displays --------------------------------------------------------- */ + +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: 
none; +} + +td.linenos pre { + padding: 5px 0px; + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + margin-left: 0.5em; +} + +table.highlighttable td { + padding: 0 0.5em 0 0.5em; +} + +div.code-block-caption { + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +div.code-block-caption + div > div.highlight > pre { + margin-top: 0; +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + padding: 1em 1em 0; +} + +div.literal-block-wrapper div.highlight { + margin: 0; +} + +code.descname { + background-color: transparent; + font-weight: bold; + font-size: 1.2em; +} + +code.descclassname { + background-color: transparent; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: -1px -10px; + padding: 0 10px; +} + +/* -- math display ---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + +span.eqno a.headerlink { + position: relative; + left: 0px; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet --------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/refs/pull/405/merge/_static/comment-bright.png b/refs/pull/405/merge/_static/comment-bright.png new 
file mode 100644 index 00000000..15e27edb Binary files /dev/null and b/refs/pull/405/merge/_static/comment-bright.png differ diff --git a/refs/pull/405/merge/_static/comment-close.png b/refs/pull/405/merge/_static/comment-close.png new file mode 100644 index 00000000..4d91bcf5 Binary files /dev/null and b/refs/pull/405/merge/_static/comment-close.png differ diff --git a/refs/pull/405/merge/_static/comment.png b/refs/pull/405/merge/_static/comment.png new file mode 100644 index 00000000..dfbc0cbd Binary files /dev/null and b/refs/pull/405/merge/_static/comment.png differ diff --git a/refs/pull/405/merge/_static/common.js b/refs/pull/405/merge/_static/common.js new file mode 100644 index 00000000..9ec85fa9 --- /dev/null +++ b/refs/pull/405/merge/_static/common.js @@ -0,0 +1,129 @@ +var PERMANENT_URL_PREFIX = DOCUMENTATION_OPTIONS.URL_ROOT + '_static/'; + +var SLIDE_CLASSES = ['far-past', 'past', 'current', 'next', 'far-next']; +var SLIDES_SELECTOR = 'section.slides > article'; + +var PM_TOUCH_SENSITIVITY = 15; +var TABLE_CLASS = 'table'; + +/* ---------------------------------------------------------------------- */ +/* classList polyfill by Eli Grey + * (http://purl.eligrey.com/github/classList.js/blob/master/classList.js) */ + +if (typeof document !== "undefined" && !("classList" in document.createElement("a"))) { + +(function (view) { + +var + classListProp = "classList" + , protoProp = "prototype" + , elemCtrProto = (view.HTMLElement || view.Element)[protoProp] + , objCtr = Object + strTrim = String[protoProp].trim || function () { + return this.replace(/^\s+|\s+$/g, ""); + } + , arrIndexOf = Array[protoProp].indexOf || function (item) { + for (var i = 0, len = this.length; i < len; i++) { + if (i in this && this[i] === item) { + return i; + } + } + return -1; + } + // Vendors: please allow content code to instantiate DOMExceptions + , DOMEx = function (type, message) { + this.name = type; + this.code = DOMException[type]; + this.message = message; + } + , 
checkTokenAndGetIndex = function (classList, token) { + if (token === "") { + throw new DOMEx( + "SYNTAX_ERR" + , "An invalid or illegal string was specified" + ); + } + if (/\s/.test(token)) { + throw new DOMEx( + "INVALID_CHARACTER_ERR" + , "String contains an invalid character" + ); + } + return arrIndexOf.call(classList, token); + } + , ClassList = function (elem) { + var + trimmedClasses = strTrim.call(elem.className) + , classes = trimmedClasses ? trimmedClasses.split(/\s+/) : [] + ; + for (var i = 0, len = classes.length; i < len; i++) { + this.push(classes[i]); + } + this._updateClassName = function () { + elem.className = this.toString(); + }; + } + , classListProto = ClassList[protoProp] = [] + , classListGetter = function () { + return new ClassList(this); + } +; +// Most DOMException implementations don't allow calling DOMException's toString() +// on non-DOMExceptions. Error's toString() is sufficient here. +DOMEx[protoProp] = Error[protoProp]; +classListProto.item = function (i) { + return this[i] || null; +}; +classListProto.contains = function (token) { + token += ""; + return checkTokenAndGetIndex(this, token) !== -1; +}; +classListProto.add = function (token) { + token += ""; + if (checkTokenAndGetIndex(this, token) === -1) { + this.push(token); + this._updateClassName(); + } +}; +classListProto.remove = function (token) { + token += ""; + var index = checkTokenAndGetIndex(this, token); + if (index !== -1) { + this.splice(index, 1); + this._updateClassName(); + } +}; +classListProto.toggle = function (token) { + token += ""; + if (checkTokenAndGetIndex(this, token) === -1) { + this.add(token); + } else { + this.remove(token); + } +}; +classListProto.toString = function () { + return this.join(" "); +}; + +if (objCtr.defineProperty) { + var classListPropDesc = { + get: classListGetter + , enumerable: true + , configurable: true + }; + try { + objCtr.defineProperty(elemCtrProto, classListProp, classListPropDesc); + } catch (ex) { // IE 8 doesn't 
support enumerable:true + if (ex.number === -0x7FF5EC54) { + classListPropDesc.enumerable = false; + objCtr.defineProperty(elemCtrProto, classListProp, classListPropDesc); + } + } +} else if (objCtr[protoProp].__defineGetter__) { + elemCtrProto.__defineGetter__(classListProp, classListGetter); +} + +}(self)); + +} +/* ---------------------------------------------------------------------- */ diff --git a/refs/pull/405/merge/_static/console.css b/refs/pull/405/merge/_static/console.css new file mode 100644 index 00000000..32730124 --- /dev/null +++ b/refs/pull/405/merge/_static/console.css @@ -0,0 +1,64 @@ +#controls { + text-align: center; + width: 100%; + margin: 10px; + font-size: 1.5em; + font-family: sans-serif; +} + +.slides.table > article { + display: inline-block; +} + +article.placeholder { + background: #ddd; +} + +.slides.table > article { + position: absolute; + left: 50%; + margin-left: -225px; +} + +.slides.table > article.past { + transform: translate(-325px); + -o-transform: translate(-325px); + -moz-transform: translate(-325px); + -webkit-transform: translate3d(-325px, 0, 0); + +} + +.slides.table > article.next { + transform: translate(475px); + -o-transform: translate(475px); + -moz-transform: translate(475px); + -webkit-transform: translate3d(475px, 0, 0); +} + +.slides > article.past, +.slides > article.next { + height: 230px; + width: 300px; + + margin-top: 60px; +} + +div.presenter_notes { + position: absolute; + top: 420px; + left: 10%; + background-color: white; + color: black; + padding: 1em; + width: 80%; + font-size: 130%; + + border-radius: 10px; + -o-border-radius: 10px; + -moz-border-radius: 10px; + -webkit-border-radius: 10px; +} + +div.presenter_notes p.admonition-title { + display: none; +} \ No newline at end of file diff --git a/refs/pull/405/merge/_static/console.js b/refs/pull/405/merge/_static/console.js new file mode 100644 index 00000000..583b9a5b --- /dev/null +++ b/refs/pull/405/merge/_static/console.js @@ -0,0 +1,91 @@ 
+document.addEventListener('DOMContentLoaded', function() { + + var + + handleKey = function(event) { + switch (event.keyCode) { + case 39: // right arrow + case 13: // Enter + case 32: // space + case 34: // PgDn + nextSlide(); + event.preventDefault(); + break; + + case 37: // left arrow + case 8: // Backspace + case 33: // PgUp + prevSlide(); + event.preventDefault(); + break; + } + }, + + handleUpdateSlides = function(slide_index, prev_slide, cur_slide, next_slide) { + document.querySelector('#cur_slide_num').innerHTML = Number(slide_index) + 1; + + // make sure we have a previous and next slide to show; + // if not add dummy placeholders + if (!prev_slide) { + prev_slide = '
'; + } + if (!next_slide) { + next_slide = '
'; + } + + document.querySelector('#slide_container').innerHTML = prev_slide + cur_slide + next_slide; + + // Copy the presenter notes into place + $('#presenter_notes').empty(); + $('article.current').find('div.admonition.note').each( + function(i, node) { + $('#presenter_notes').append($(node).html()); + } + ); + + var slides = document.querySelector('section.slides > article'); + for (var i=0; i < slides.length; i++) { + + } + }, + + handleMessage = function(e) { + switch (e.data.command) { + case 'num_slides': + document.querySelector('#num_slides').innerHTML = e.data.content; + break; + case 'cur_slide': + handleUpdateSlides(e.data.content, e.data.prev_slide, e.data.slide, e.data.next_slide); + break; + } + }, + + nextSlide = function(e) { + if (e) { + e.preventDefault(); + } + window.opener.postMessage({command: 'nextSlide'}, '*'); + }, + + prevSlide = function(e) { + if (e) { + e.preventDefault(); + } + + window.opener.postMessage({command: 'prevSlide'}, '*'); + }, + + init = function(e) { + window.addEventListener('message', handleMessage, false); + document.addEventListener('keydown', handleKey, false); + + document.querySelector('#next').addEventListener('click', nextSlide); + document.querySelector('#prev').addEventListener('click', prevSlide); + + window.opener.postMessage({command: 'register'}, '*'); + + }; + + init(); + + }, false); diff --git a/refs/pull/405/merge/_static/controller.js b/refs/pull/405/merge/_static/controller.js new file mode 100644 index 00000000..e5fda6b9 --- /dev/null +++ b/refs/pull/405/merge/_static/controller.js @@ -0,0 +1,57 @@ +var SlideController = ( + function(){ + + var + slidedeck, + + onKeyDown = function (event) { + + switch (event.keyCode) { + case 39: // right arrow + case 13: // Enter + case 32: // space + case 34: // PgDn + slidedeck.nextSlide(); + event.preventDefault(); + break; + + case 37: // left arrow + case 8: // Backspace + case 33: // PgUp + slidedeck.prevSlide(); + event.preventDefault(); + break; + + case 
40: // down arrow + if (isChromeVoxActive()) { + slidedeck.speakNextItem(); + } else { + slidedeck.nextSlide(); + } + event.preventDefault(); + break; + + case 38: // up arrow + if (isChromeVoxActive()) { + slidedeck.speakPrevItem(); + } else { + slidedeck.prevSlide(); + } + event.preventDefault(); + break; + + } + }; + + init = function(slides) { + slidedeck = slides; + + document.addEventListener('keydown', onKeyDown, false); + + }; + + return { + init: init + }; + + }()); diff --git a/refs/pull/405/merge/_static/css/badge_only.css b/refs/pull/405/merge/_static/css/badge_only.css new file mode 100644 index 00000000..c718cee4 --- /dev/null +++ b/refs/pull/405/merge/_static/css/badge_only.css @@ -0,0 +1 @@ +.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li 
.fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version 
.fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} \ No newline at end of file diff --git a/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Bold.woff b/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Bold.woff new file mode 100644 index 00000000..6cb60000 Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Bold.woff differ diff --git a/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Bold.woff2 b/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Bold.woff2 new file mode 100644 index 00000000..7059e231 Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Bold.woff2 differ diff --git a/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Regular.woff b/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Regular.woff new file mode 100644 index 00000000..f815f63f Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Regular.woff differ diff --git a/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Regular.woff2 b/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Regular.woff2 new file mode 100644 index 00000000..f2c76e5b Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/Roboto-Slab-Regular.woff2 differ diff --git a/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.eot b/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.eot new file mode 100644 index 00000000..e9f60ca9 Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.eot differ diff --git a/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.svg b/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.svg new file mode 100644 index 00000000..855c845e --- /dev/null +++ 
b/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.svg @@ -0,0 +1,2671 @@ + + + + +Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016 + By ,,, +Copyright Dave Gandy 2016. All rights reserveddiff --git a/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.ttf b/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.ttf new file mode 100644 index 00000000..35acda2f Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.ttf differ diff --git a/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.woff b/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.woff new file mode 100644 index 00000000..400014a4 Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.woff differ diff --git a/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.woff2 b/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.woff2 new file mode 100644 index 00000000..4d13fc60 Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/fontawesome-webfont.woff2 differ diff --git a/refs/pull/405/merge/_static/css/fonts/lato-bold-italic.woff b/refs/pull/405/merge/_static/css/fonts/lato-bold-italic.woff new file mode 100644 index 00000000..88ad05b9 Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/lato-bold-italic.woff differ diff --git a/refs/pull/405/merge/_static/css/fonts/lato-bold-italic.woff2 b/refs/pull/405/merge/_static/css/fonts/lato-bold-italic.woff2 new file mode 100644 index 00000000..c4e3d804 Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/lato-bold-italic.woff2 differ diff --git a/refs/pull/405/merge/_static/css/fonts/lato-bold.woff b/refs/pull/405/merge/_static/css/fonts/lato-bold.woff new file mode 100644 index 00000000..c6dff51f Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/lato-bold.woff differ diff --git a/refs/pull/405/merge/_static/css/fonts/lato-bold.woff2 b/refs/pull/405/merge/_static/css/fonts/lato-bold.woff2 
new file mode 100644 index 00000000..bb195043 Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/lato-bold.woff2 differ diff --git a/refs/pull/405/merge/_static/css/fonts/lato-normal-italic.woff b/refs/pull/405/merge/_static/css/fonts/lato-normal-italic.woff new file mode 100644 index 00000000..76114bc0 Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/lato-normal-italic.woff differ diff --git a/refs/pull/405/merge/_static/css/fonts/lato-normal-italic.woff2 b/refs/pull/405/merge/_static/css/fonts/lato-normal-italic.woff2 new file mode 100644 index 00000000..3404f37e Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/lato-normal-italic.woff2 differ diff --git a/refs/pull/405/merge/_static/css/fonts/lato-normal.woff b/refs/pull/405/merge/_static/css/fonts/lato-normal.woff new file mode 100644 index 00000000..ae1307ff Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/lato-normal.woff differ diff --git a/refs/pull/405/merge/_static/css/fonts/lato-normal.woff2 b/refs/pull/405/merge/_static/css/fonts/lato-normal.woff2 new file mode 100644 index 00000000..3bf98433 Binary files /dev/null and b/refs/pull/405/merge/_static/css/fonts/lato-normal.woff2 differ diff --git a/refs/pull/405/merge/_static/css/theme.css b/refs/pull/405/merge/_static/css/theme.css new file mode 100644 index 00000000..19a446a0 --- /dev/null +++ b/refs/pull/405/merge/_static/css/theme.css @@ -0,0 +1,4 @@ +html{box-sizing:border-box}*,:after,:before{box-sizing:inherit}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}[hidden],audio:not([controls]){display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:active,a:hover{outline:0}abbr[title]{border-bottom:1px 
dotted}b,strong{font-weight:700}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;text-decoration:none}ins,mark{color:#000}mark{background:#ff0;font-style:italic;font-weight:700}.rst-content code,.rst-content tt,code,kbd,pre,samp{font-family:monospace,serif;_font-family:courier new,monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:after,q:before{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}dl,ol,ul{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure,form{margin:0}label{cursor:pointer}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type=button],input[type=reset],input[type=submit]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type=search]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}textarea{resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none!important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media 
print{body,html,section{background:none!important}*{box-shadow:none!important;text-shadow:none!important;filter:none!important;-ms-filter:none!important}a,a:visited{text-decoration:underline}.ir a:after,a[href^="#"]:after,a[href^="javascript:"]:after{content:""}blockquote,pre{page-break-inside:avoid}thead{display:table-header-group}img,tr{page-break-inside:avoid}img{max-width:100%!important}@page{margin:.5cm}.rst-content .toctree-wrapper>p.caption,h2,h3,p{orphans:3;widows:3}.rst-content .toctree-wrapper>p.caption,h2,h3{page-break-after:avoid}}.btn,.fa:before,.icon:before,.rst-content .admonition,.rst-content .admonition-title:before,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .code-block-caption .headerlink:before,.rst-content .danger,.rst-content .eqno .headerlink:before,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-alert,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li 
button.toctree-expand:before,input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week],select,textarea{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}/*! + * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome + * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) + */@font-face{font-family:FontAwesome;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713);src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix&v=4.7.0) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#fontawesomeregular) format("svg");font-weight:400;font-style:normal}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{display:inline-block;font:normal normal normal 14px/1 
FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14286em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14286em;width:2.14286em;top:.14286em;text-align:center}.fa-li.fa-lg{left:-1.85714em}.fa-border{padding:.2em .25em .15em;border:.08em solid #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa-pull-left.icon,.fa.fa-pull-left,.rst-content .code-block-caption .fa-pull-left.headerlink,.rst-content .eqno .fa-pull-left.headerlink,.rst-content .fa-pull-left.admonition-title,.rst-content code.download span.fa-pull-left:first-child,.rst-content dl dt .fa-pull-left.headerlink,.rst-content h1 .fa-pull-left.headerlink,.rst-content h2 .fa-pull-left.headerlink,.rst-content h3 .fa-pull-left.headerlink,.rst-content h4 .fa-pull-left.headerlink,.rst-content h5 .fa-pull-left.headerlink,.rst-content h6 .fa-pull-left.headerlink,.rst-content p .fa-pull-left.headerlink,.rst-content table>caption .fa-pull-left.headerlink,.rst-content tt.download span.fa-pull-left:first-child,.wy-menu-vertical li.current>a button.fa-pull-left.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-left.toctree-expand,.wy-menu-vertical li button.fa-pull-left.toctree-expand{margin-right:.3em}.fa-pull-right.icon,.fa.fa-pull-right,.rst-content .code-block-caption .fa-pull-right.headerlink,.rst-content .eqno .fa-pull-right.headerlink,.rst-content .fa-pull-right.admonition-title,.rst-content code.download span.fa-pull-right:first-child,.rst-content dl dt .fa-pull-right.headerlink,.rst-content h1 .fa-pull-right.headerlink,.rst-content h2 .fa-pull-right.headerlink,.rst-content h3 .fa-pull-right.headerlink,.rst-content h4 .fa-pull-right.headerlink,.rst-content 
h5 .fa-pull-right.headerlink,.rst-content h6 .fa-pull-right.headerlink,.rst-content p .fa-pull-right.headerlink,.rst-content table>caption .fa-pull-right.headerlink,.rst-content tt.download span.fa-pull-right:first-child,.wy-menu-vertical li.current>a button.fa-pull-right.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-right.toctree-expand,.wy-menu-vertical li button.fa-pull-right.toctree-expand{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.pull-left.icon,.rst-content .code-block-caption .pull-left.headerlink,.rst-content .eqno .pull-left.headerlink,.rst-content .pull-left.admonition-title,.rst-content code.download span.pull-left:first-child,.rst-content dl dt .pull-left.headerlink,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 .pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content p .pull-left.headerlink,.rst-content table>caption .pull-left.headerlink,.rst-content tt.download span.pull-left:first-child,.wy-menu-vertical li.current>a button.pull-left.toctree-expand,.wy-menu-vertical li.on a button.pull-left.toctree-expand,.wy-menu-vertical li button.pull-left.toctree-expand{margin-right:.3em}.fa.pull-right,.pull-right.icon,.rst-content .code-block-caption .pull-right.headerlink,.rst-content .eqno .pull-right.headerlink,.rst-content .pull-right.admonition-title,.rst-content code.download span.pull-right:first-child,.rst-content dl dt .pull-right.headerlink,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content p .pull-right.headerlink,.rst-content table>caption .pull-right.headerlink,.rst-content tt.download span.pull-right:first-child,.wy-menu-vertical li.current>a 
button.pull-right.toctree-expand,.wy-menu-vertical li.on a button.pull-right.toctree-expand,.wy-menu-vertical li button.pull-right.toctree-expand{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s linear infinite;animation:fa-spin 2s linear infinite}.fa-pulse{-webkit-animation:fa-spin 1s steps(8) infinite;animation:fa-spin 1s steps(8) infinite}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scaleX(-1);-ms-transform:scaleX(-1);transform:scaleX(-1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scaleY(-1);-ms-transform:scaleY(-1);transform:scaleY(-1)}:root .fa-flip-horizontal,:root .fa-flip-vertical,:root .fa-rotate-90,:root .fa-rotate-180,:root 
.fa-rotate-270{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:""}.fa-music:before{content:""}.fa-search:before,.icon-search:before{content:""}.fa-envelope-o:before{content:""}.fa-heart:before{content:""}.fa-star:before{content:""}.fa-star-o:before{content:""}.fa-user:before{content:""}.fa-film:before{content:""}.fa-th-large:before{content:""}.fa-th:before{content:""}.fa-th-list:before{content:""}.fa-check:before{content:""}.fa-close:before,.fa-remove:before,.fa-times:before{content:""}.fa-search-plus:before{content:""}.fa-search-minus:before{content:""}.fa-power-off:before{content:""}.fa-signal:before{content:""}.fa-cog:before,.fa-gear:before{content:""}.fa-trash-o:before{content:""}.fa-home:before,.icon-home:before{content:""}.fa-file-o:before{content:""}.fa-clock-o:before{content:""}.fa-road:before{content:""}.fa-download:before,.rst-content code.download span:first-child:before,.rst-content tt.download 
span:first-child:before{content:""}.fa-arrow-circle-o-down:before{content:""}.fa-arrow-circle-o-up:before{content:""}.fa-inbox:before{content:""}.fa-play-circle-o:before{content:""}.fa-repeat:before,.fa-rotate-right:before{content:""}.fa-refresh:before{content:""}.fa-list-alt:before{content:""}.fa-lock:before{content:""}.fa-flag:before{content:""}.fa-headphones:before{content:""}.fa-volume-off:before{content:""}.fa-volume-down:before{content:""}.fa-volume-up:before{content:""}.fa-qrcode:before{content:""}.fa-barcode:before{content:""}.fa-tag:before{content:""}.fa-tags:before{content:""}.fa-book:before,.icon-book:before{content:""}.fa-bookmark:before{content:""}.fa-print:before{content:""}.fa-camera:before{content:""}.fa-font:before{content:""}.fa-bold:before{content:""}.fa-italic:before{content:""}.fa-text-height:before{content:""}.fa-text-width:before{content:""}.fa-align-left:before{content:""}.fa-align-center:before{content:""}.fa-align-right:before{content:""}.fa-align-justify:before{content:""}.fa-list:before{content:""}.fa-dedent:before,.fa-outdent:before{content:""}.fa-indent:before{content:""}.fa-video-camera:before{content:""}.fa-image:before,.fa-photo:before,.fa-picture-o:before{content:""}.fa-pencil:before{content:""}.fa-map-marker:before{content:""}.fa-adjust:before{content:""}.fa-tint:before{content:""}.fa-edit:before,.fa-pencil-square-o:before{content:""}.fa-share-square-o:before{content:""}.fa-check-square-o:before{content:""}.fa-arrows:before{content:""}.fa-step-backward:before{content:""}.fa-fast-backward:before{content:""}.fa-backward:before{content:""}.fa-play:before{content:""}.fa-pause:before{content:""}.fa-stop:before{content:""}.fa-forward:before{content:""}.fa-fast-forward:before{content:""}.fa-step-forward:before{content:""}.fa-eject:before{content:""}.fa-chevron-left:before{content:""}.fa-chevron-right:before{content:""}.fa-plus-circle:before{content:""}.fa-minus-circle:before{content
:""}.fa-times-circle:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before{content:""}.fa-check-circle:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before{content:""}.fa-question-circle:before{content:""}.fa-info-circle:before{content:""}.fa-crosshairs:before{content:""}.fa-times-circle-o:before{content:""}.fa-check-circle-o:before{content:""}.fa-ban:before{content:""}.fa-arrow-left:before{content:""}.fa-arrow-right:before{content:""}.fa-arrow-up:before{content:""}.fa-arrow-down:before{content:""}.fa-mail-forward:before,.fa-share:before{content:""}.fa-expand:before{content:""}.fa-compress:before{content:""}.fa-plus:before{content:""}.fa-minus:before{content:""}.fa-asterisk:before{content:""}.fa-exclamation-circle:before,.rst-content .admonition-title:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before{content:""}.fa-gift:before{content:""}.fa-leaf:before{content:""}.fa-fire:before,.icon-fire:before{content:""}.fa-eye:before{content:""}.fa-eye-slash:before{content:""}.fa-exclamation-triangle:before,.fa-warning:before{content:""}.fa-plane:before{content:""}.fa-calendar:before{content:""}.fa-random:before{content:""}.fa-comment:before{content:""}.fa-magnet:before{content:""}.fa-chevron-up:before{content:""}.fa-chevron-down:before{content:""}.fa-retweet:before{content:""}.fa-shopping-cart:before{content:""}.fa-folder:before{content:""}.fa-folder-open:before{content:""}.fa-arrows-v:before{content:""}.fa-arrows-h:before{content:""}.fa-bar-chart-o:before,.fa-bar-chart:before{content:""}.fa-twitter-square:before{content:""}.fa-facebook-square:before{content:""}.fa-camera-retro:before{content:""}.fa-key:before{content:""}.fa-cogs:before,.fa-gears:before{content:""}.fa-comments:before{content:""}.fa-thumbs-o-up:before{content:""}.fa-thumbs-o-down:before{content:""}.fa-star-half:before
{content:""}.fa-heart-o:before{content:""}.fa-sign-out:before{content:""}.fa-linkedin-square:before{content:""}.fa-thumb-tack:before{content:""}.fa-external-link:before{content:""}.fa-sign-in:before{content:""}.fa-trophy:before{content:""}.fa-github-square:before{content:""}.fa-upload:before{content:""}.fa-lemon-o:before{content:""}.fa-phone:before{content:""}.fa-square-o:before{content:""}.fa-bookmark-o:before{content:""}.fa-phone-square:before{content:""}.fa-twitter:before{content:""}.fa-facebook-f:before,.fa-facebook:before{content:""}.fa-github:before,.icon-github:before{content:""}.fa-unlock:before{content:""}.fa-credit-card:before{content:""}.fa-feed:before,.fa-rss:before{content:""}.fa-hdd-o:before{content:""}.fa-bullhorn:before{content:""}.fa-bell:before{content:""}.fa-certificate:before{content:""}.fa-hand-o-right:before{content:""}.fa-hand-o-left:before{content:""}.fa-hand-o-up:before{content:""}.fa-hand-o-down:before{content:""}.fa-arrow-circle-left:before,.icon-circle-arrow-left:before{content:""}.fa-arrow-circle-right:before,.icon-circle-arrow-right:before{content:""}.fa-arrow-circle-up:before{content:""}.fa-arrow-circle-down:before{content:""}.fa-globe:before{content:""}.fa-wrench:before{content:""}.fa-tasks:before{content:""}.fa-filter:before{content:""}.fa-briefcase:before{content:""}.fa-arrows-alt:before{content:""}.fa-group:before,.fa-users:before{content:""}.fa-chain:before,.fa-link:before,.icon-link:before{content:""}.fa-cloud:before{content:""}.fa-flask:before{content:""}.fa-cut:before,.fa-scissors:before{content:""}.fa-copy:before,.fa-files-o:before{content:""}.fa-paperclip:before{content:""}.fa-floppy-o:before,.fa-save:before{content:""}.fa-square:before{content:""}.fa-bars:before,.fa-navicon:before,.fa-reorder:before{content:""}.fa-list-ul:before{content:""}.fa-list-ol:before{content:""}.fa-strikethrough:before{content:""}.fa-underline:before{content:""}.fa-table:before{content:""}.fa-magi
c:before{content:""}.fa-truck:before{content:""}.fa-pinterest:before{content:""}.fa-pinterest-square:before{content:""}.fa-google-plus-square:before{content:""}.fa-google-plus:before{content:""}.fa-money:before{content:""}.fa-caret-down:before,.icon-caret-down:before,.wy-dropdown .caret:before{content:""}.fa-caret-up:before{content:""}.fa-caret-left:before{content:""}.fa-caret-right:before{content:""}.fa-columns:before{content:""}.fa-sort:before,.fa-unsorted:before{content:""}.fa-sort-desc:before,.fa-sort-down:before{content:""}.fa-sort-asc:before,.fa-sort-up:before{content:""}.fa-envelope:before{content:""}.fa-linkedin:before{content:""}.fa-rotate-left:before,.fa-undo:before{content:""}.fa-gavel:before,.fa-legal:before{content:""}.fa-dashboard:before,.fa-tachometer:before{content:""}.fa-comment-o:before{content:""}.fa-comments-o:before{content:""}.fa-bolt:before,.fa-flash:before{content:""}.fa-sitemap:before{content:""}.fa-umbrella:before{content:""}.fa-clipboard:before,.fa-paste:before{content:""}.fa-lightbulb-o:before{content:""}.fa-exchange:before{content:""}.fa-cloud-download:before{content:""}.fa-cloud-upload:before{content:""}.fa-user-md:before{content:""}.fa-stethoscope:before{content:""}.fa-suitcase:before{content:""}.fa-bell-o:before{content:""}.fa-coffee:before{content:""}.fa-cutlery:before{content:""}.fa-file-text-o:before{content:""}.fa-building-o:before{content:""}.fa-hospital-o:before{content:""}.fa-ambulance:before{content:""}.fa-medkit:before{content:""}.fa-fighter-jet:before{content:""}.fa-beer:before{content:""}.fa-h-square:before{content:""}.fa-plus-square:before{content:""}.fa-angle-double-left:before{content:""}.fa-angle-double-right:before{content:""}.fa-angle-double-up:before{content:""}.fa-angle-double-down:before{content:""}.fa-angle-left:before{content:""}.fa-angle-right:before{content:""}.fa-angle-up:before{content:""}.fa-angle-down:before{content:""}.fa-desktop:before{content:""}.fa-l
aptop:before{content:""}.fa-tablet:before{content:""}.fa-mobile-phone:before,.fa-mobile:before{content:""}.fa-circle-o:before{content:""}.fa-quote-left:before{content:""}.fa-quote-right:before{content:""}.fa-spinner:before{content:""}.fa-circle:before{content:""}.fa-mail-reply:before,.fa-reply:before{content:""}.fa-github-alt:before{content:""}.fa-folder-o:before{content:""}.fa-folder-open-o:before{content:""}.fa-smile-o:before{content:""}.fa-frown-o:before{content:""}.fa-meh-o:before{content:""}.fa-gamepad:before{content:""}.fa-keyboard-o:before{content:""}.fa-flag-o:before{content:""}.fa-flag-checkered:before{content:""}.fa-terminal:before{content:""}.fa-code:before{content:""}.fa-mail-reply-all:before,.fa-reply-all:before{content:""}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:""}.fa-location-arrow:before{content:""}.fa-crop:before{content:""}.fa-code-fork:before{content:""}.fa-chain-broken:before,.fa-unlink:before{content:""}.fa-question:before{content:""}.fa-info:before{content:""}.fa-exclamation:before{content:""}.fa-superscript:before{content:""}.fa-subscript:before{content:""}.fa-eraser:before{content:""}.fa-puzzle-piece:before{content:""}.fa-microphone:before{content:""}.fa-microphone-slash:before{content:""}.fa-shield:before{content:""}.fa-calendar-o:before{content:""}.fa-fire-extinguisher:before{content:""}.fa-rocket:before{content:""}.fa-maxcdn:before{content:""}.fa-chevron-circle-left:before{content:""}.fa-chevron-circle-right:before{content:""}.fa-chevron-circle-up:before{content:""}.fa-chevron-circle-down:before{content:""}.fa-html5:before{content:""}.fa-css3:before{content:""}.fa-anchor:before{content:""}.fa-unlock-alt:before{content:""}.fa-bullseye:before{content:""}.fa-ellipsis-h:before{content:""}.fa-ellipsis-v:before{content:""}.fa-rss-square:before{content:""}.fa-play-circle:before{content:""}.fa-ticket:before{content:""}.fa-minus-square:before{content:
""}.fa-minus-square-o:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before{content:""}.fa-level-up:before{content:""}.fa-level-down:before{content:""}.fa-check-square:before{content:""}.fa-pencil-square:before{content:""}.fa-external-link-square:before{content:""}.fa-share-square:before{content:""}.fa-compass:before{content:""}.fa-caret-square-o-down:before,.fa-toggle-down:before{content:""}.fa-caret-square-o-up:before,.fa-toggle-up:before{content:""}.fa-caret-square-o-right:before,.fa-toggle-right:before{content:""}.fa-eur:before,.fa-euro:before{content:""}.fa-gbp:before{content:""}.fa-dollar:before,.fa-usd:before{content:""}.fa-inr:before,.fa-rupee:before{content:""}.fa-cny:before,.fa-jpy:before,.fa-rmb:before,.fa-yen:before{content:""}.fa-rouble:before,.fa-rub:before,.fa-ruble:before{content:""}.fa-krw:before,.fa-won:before{content:""}.fa-bitcoin:before,.fa-btc:before{content:""}.fa-file:before{content:""}.fa-file-text:before{content:""}.fa-sort-alpha-asc:before{content:""}.fa-sort-alpha-desc:before{content:""}.fa-sort-amount-asc:before{content:""}.fa-sort-amount-desc:before{content:""}.fa-sort-numeric-asc:before{content:""}.fa-sort-numeric-desc:before{content:""}.fa-thumbs-up:before{content:""}.fa-thumbs-down:before{content:""}.fa-youtube-square:before{content:""}.fa-youtube:before{content:""}.fa-xing:before{content:""}.fa-xing-square:before{content:""}.fa-youtube-play:before{content:""}.fa-dropbox:before{content:""}.fa-stack-overflow:before{content:""}.fa-instagram:before{content:""}.fa-flickr:before{content:""}.fa-adn:before{content:""}.fa-bitbucket:before,.icon-bitbucket:before{content:""}.fa-bitbucket-square:before{content:""}.fa-tumblr:before{content:""}.fa-tumblr-square:before{content:""}.fa-long-arrow-down:before{content:""}.fa-long-arrow-up:before{content:""}.fa-long-arrow-left:before{content:""}.fa-long-arrow-right:before{content:""}.fa-a
pple:before{content:""}.fa-windows:before{content:""}.fa-android:before{content:""}.fa-linux:before{content:""}.fa-dribbble:before{content:""}.fa-skype:before{content:""}.fa-foursquare:before{content:""}.fa-trello:before{content:""}.fa-female:before{content:""}.fa-male:before{content:""}.fa-gittip:before,.fa-gratipay:before{content:""}.fa-sun-o:before{content:""}.fa-moon-o:before{content:""}.fa-archive:before{content:""}.fa-bug:before{content:""}.fa-vk:before{content:""}.fa-weibo:before{content:""}.fa-renren:before{content:""}.fa-pagelines:before{content:""}.fa-stack-exchange:before{content:""}.fa-arrow-circle-o-right:before{content:""}.fa-arrow-circle-o-left:before{content:""}.fa-caret-square-o-left:before,.fa-toggle-left:before{content:""}.fa-dot-circle-o:before{content:""}.fa-wheelchair:before{content:""}.fa-vimeo-square:before{content:""}.fa-try:before,.fa-turkish-lira:before{content:""}.fa-plus-square-o:before,.wy-menu-vertical li button.toctree-expand:before{content:""}.fa-space-shuttle:before{content:""}.fa-slack:before{content:""}.fa-envelope-square:before{content:""}.fa-wordpress:before{content:""}.fa-openid:before{content:""}.fa-bank:before,.fa-institution:before,.fa-university:before{content:""}.fa-graduation-cap:before,.fa-mortar-board:before{content:""}.fa-yahoo:before{content:""}.fa-google:before{content:""}.fa-reddit:before{content:""}.fa-reddit-square:before{content:""}.fa-stumbleupon-circle:before{content:""}.fa-stumbleupon:before{content:""}.fa-delicious:before{content:""}.fa-digg:before{content:""}.fa-pied-piper-pp:before{content:""}.fa-pied-piper-alt:before{content:""}.fa-drupal:before{content:""}.fa-joomla:before{content:""}.fa-language:before{content:""}.fa-fax:before{content:""}.fa-building:before{content:""}.fa-child:before{content:""}.fa-paw:before{content:""}.fa-spoon:before{content:""}.fa-cube:before{content:""}.fa-cubes:before{content:""}.fa-behance:before{content:""}.fa-behance-squa
re:before{content:""}.fa-steam:before{content:""}.fa-steam-square:before{content:""}.fa-recycle:before{content:""}.fa-automobile:before,.fa-car:before{content:""}.fa-cab:before,.fa-taxi:before{content:""}.fa-tree:before{content:""}.fa-spotify:before{content:""}.fa-deviantart:before{content:""}.fa-soundcloud:before{content:""}.fa-database:before{content:""}.fa-file-pdf-o:before{content:""}.fa-file-word-o:before{content:""}.fa-file-excel-o:before{content:""}.fa-file-powerpoint-o:before{content:""}.fa-file-image-o:before,.fa-file-photo-o:before,.fa-file-picture-o:before{content:""}.fa-file-archive-o:before,.fa-file-zip-o:before{content:""}.fa-file-audio-o:before,.fa-file-sound-o:before{content:""}.fa-file-movie-o:before,.fa-file-video-o:before{content:""}.fa-file-code-o:before{content:""}.fa-vine:before{content:""}.fa-codepen:before{content:""}.fa-jsfiddle:before{content:""}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-ring:before,.fa-life-saver:before,.fa-support:before{content:""}.fa-circle-o-notch:before{content:""}.fa-ra:before,.fa-rebel:before,.fa-resistance:before{content:""}.fa-empire:before,.fa-ge:before{content:""}.fa-git-square:before{content:""}.fa-git:before{content:""}.fa-hacker-news:before,.fa-y-combinator-square:before,.fa-yc-square:before{content:""}.fa-tencent-weibo:before{content:""}.fa-qq:before{content:""}.fa-wechat:before,.fa-weixin:before{content:""}.fa-paper-plane:before,.fa-send:before{content:""}.fa-paper-plane-o:before,.fa-send-o:before{content:""}.fa-history:before{content:""}.fa-circle-thin:before{content:""}.fa-header:before{content:""}.fa-paragraph:before{content:""}.fa-sliders:before{content:""}.fa-share-alt:before{content:""}.fa-share-alt-square:before{content:""}.fa-bomb:before{content:""}.fa-futbol-o:before,.fa-soccer-ball-o:before{content:""}.fa-tty:before{content:""}.fa-binoculars:before{content:""}.fa-plug:before{content:""}.fa-slideshare:before{content:""}.fa-twitch:before{conten
t:""}.fa-yelp:before{content:""}.fa-newspaper-o:before{content:""}.fa-wifi:before{content:""}.fa-calculator:before{content:""}.fa-paypal:before{content:""}.fa-google-wallet:before{content:""}.fa-cc-visa:before{content:""}.fa-cc-mastercard:before{content:""}.fa-cc-discover:before{content:""}.fa-cc-amex:before{content:""}.fa-cc-paypal:before{content:""}.fa-cc-stripe:before{content:""}.fa-bell-slash:before{content:""}.fa-bell-slash-o:before{content:""}.fa-trash:before{content:""}.fa-copyright:before{content:""}.fa-at:before{content:""}.fa-eyedropper:before{content:""}.fa-paint-brush:before{content:""}.fa-birthday-cake:before{content:""}.fa-area-chart:before{content:""}.fa-pie-chart:before{content:""}.fa-line-chart:before{content:""}.fa-lastfm:before{content:""}.fa-lastfm-square:before{content:""}.fa-toggle-off:before{content:""}.fa-toggle-on:before{content:""}.fa-bicycle:before{content:""}.fa-bus:before{content:""}.fa-ioxhost:before{content:""}.fa-angellist:before{content:""}.fa-cc:before{content:""}.fa-ils:before,.fa-shekel:before,.fa-sheqel:before{content:""}.fa-meanpath:before{content:""}.fa-buysellads:before{content:""}.fa-connectdevelop:before{content:""}.fa-dashcube:before{content:""}.fa-forumbee:before{content:""}.fa-leanpub:before{content:""}.fa-sellsy:before{content:""}.fa-shirtsinbulk:before{content:""}.fa-simplybuilt:before{content:""}.fa-skyatlas:before{content:""}.fa-cart-plus:before{content:""}.fa-cart-arrow-down:before{content:""}.fa-diamond:before{content:""}.fa-ship:before{content:""}.fa-user-secret:before{content:""}.fa-motorcycle:before{content:""}.fa-street-view:before{content:""}.fa-heartbeat:before{content:""}.fa-venus:before{content:""}.fa-mars:before{content:""}.fa-mercury:before{content:""}.fa-intersex:before,.fa-transgender:before{content:""}.fa-transgender-alt:before{content:""}.fa-venus-double:before{content:""}.fa-mars-double:before{content:""}.fa-venus-mars:before{content:""}.fa-m
ars-stroke:before{content:""}.fa-mars-stroke-v:before{content:""}.fa-mars-stroke-h:before{content:""}.fa-neuter:before{content:""}.fa-genderless:before{content:""}.fa-facebook-official:before{content:""}.fa-pinterest-p:before{content:""}.fa-whatsapp:before{content:""}.fa-server:before{content:""}.fa-user-plus:before{content:""}.fa-user-times:before{content:""}.fa-bed:before,.fa-hotel:before{content:""}.fa-viacoin:before{content:""}.fa-train:before{content:""}.fa-subway:before{content:""}.fa-medium:before{content:""}.fa-y-combinator:before,.fa-yc:before{content:""}.fa-optin-monster:before{content:""}.fa-opencart:before{content:""}.fa-expeditedssl:before{content:""}.fa-battery-4:before,.fa-battery-full:before,.fa-battery:before{content:""}.fa-battery-3:before,.fa-battery-three-quarters:before{content:""}.fa-battery-2:before,.fa-battery-half:before{content:""}.fa-battery-1:before,.fa-battery-quarter:before{content:""}.fa-battery-0:before,.fa-battery-empty:before{content:""}.fa-mouse-pointer:before{content:""}.fa-i-cursor:before{content:""}.fa-object-group:before{content:""}.fa-object-ungroup:before{content:""}.fa-sticky-note:before{content:""}.fa-sticky-note-o:before{content:""}.fa-cc-jcb:before{content:""}.fa-cc-diners-club:before{content:""}.fa-clone:before{content:""}.fa-balance-scale:before{content:""}.fa-hourglass-o:before{content:""}.fa-hourglass-1:before,.fa-hourglass-start:before{content:""}.fa-hourglass-2:before,.fa-hourglass-half:before{content:""}.fa-hourglass-3:before,.fa-hourglass-end:before{content:""}.fa-hourglass:before{content:""}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:""}.fa-hand-paper-o:before,.fa-hand-stop-o:before{content:""}.fa-hand-scissors-o:before{content:""}.fa-hand-lizard-o:before{content:""}.fa-hand-spock-o:before{content:""}.fa-hand-pointer-o:before{content:""}.fa-hand-peace-o:before{content:""}.fa-trademark:before{content:""}.fa-registered:before{content:""}.fa-creative-commons
:before{content:""}.fa-gg:before{content:""}.fa-gg-circle:before{content:""}.fa-tripadvisor:before{content:""}.fa-odnoklassniki:before{content:""}.fa-odnoklassniki-square:before{content:""}.fa-get-pocket:before{content:""}.fa-wikipedia-w:before{content:""}.fa-safari:before{content:""}.fa-chrome:before{content:""}.fa-firefox:before{content:""}.fa-opera:before{content:""}.fa-internet-explorer:before{content:""}.fa-television:before,.fa-tv:before{content:""}.fa-contao:before{content:""}.fa-500px:before{content:""}.fa-amazon:before{content:""}.fa-calendar-plus-o:before{content:""}.fa-calendar-minus-o:before{content:""}.fa-calendar-times-o:before{content:""}.fa-calendar-check-o:before{content:""}.fa-industry:before{content:""}.fa-map-pin:before{content:""}.fa-map-signs:before{content:""}.fa-map-o:before{content:""}.fa-map:before{content:""}.fa-commenting:before{content:""}.fa-commenting-o:before{content:""}.fa-houzz:before{content:""}.fa-vimeo:before{content:""}.fa-black-tie:before{content:""}.fa-fonticons:before{content:""}.fa-reddit-alien:before{content:""}.fa-edge:before{content:""}.fa-credit-card-alt:before{content:""}.fa-codiepie:before{content:""}.fa-modx:before{content:""}.fa-fort-awesome:before{content:""}.fa-usb:before{content:""}.fa-product-hunt:before{content:""}.fa-mixcloud:before{content:""}.fa-scribd:before{content:""}.fa-pause-circle:before{content:""}.fa-pause-circle-o:before{content:""}.fa-stop-circle:before{content:""}.fa-stop-circle-o:before{content:""}.fa-shopping-bag:before{content:""}.fa-shopping-basket:before{content:""}.fa-hashtag:before{content:""}.fa-bluetooth:before{content:""}.fa-bluetooth-b:before{content:""}.fa-percent:before{content:""}.fa-gitlab:before,.icon-gitlab:before{content:""}.fa-wpbeginner:before{content:""}.fa-wpforms:before{content:""}.fa-envira:before{content:""}.fa-universal-access:before{content:""}.fa-wheelchair-alt:before{content:""}.fa-question-circle-o:before{conten
t:""}.fa-blind:before{content:""}.fa-audio-description:before{content:""}.fa-volume-control-phone:before{content:""}.fa-braille:before{content:""}.fa-assistive-listening-systems:before{content:""}.fa-american-sign-language-interpreting:before,.fa-asl-interpreting:before{content:""}.fa-deaf:before,.fa-deafness:before,.fa-hard-of-hearing:before{content:""}.fa-glide:before{content:""}.fa-glide-g:before{content:""}.fa-sign-language:before,.fa-signing:before{content:""}.fa-low-vision:before{content:""}.fa-viadeo:before{content:""}.fa-viadeo-square:before{content:""}.fa-snapchat:before{content:""}.fa-snapchat-ghost:before{content:""}.fa-snapchat-square:before{content:""}.fa-pied-piper:before{content:""}.fa-first-order:before{content:""}.fa-yoast:before{content:""}.fa-themeisle:before{content:""}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:""}.fa-fa:before,.fa-font-awesome:before{content:""}.fa-handshake-o:before{content:""}.fa-envelope-open:before{content:""}.fa-envelope-open-o:before{content:""}.fa-linode:before{content:""}.fa-address-book:before{content:""}.fa-address-book-o:before{content:""}.fa-address-card:before,.fa-vcard:before{content:""}.fa-address-card-o:before,.fa-vcard-o:before{content:""}.fa-user-circle:before{content:""}.fa-user-circle-o:before{content:""}.fa-user-o:before{content:""}.fa-id-badge:before{content:""}.fa-drivers-license:before,.fa-id-card:before{content:""}.fa-drivers-license-o:before,.fa-id-card-o:before{content:""}.fa-quora:before{content:""}.fa-free-code-camp:before{content:""}.fa-telegram:before{content:""}.fa-thermometer-4:before,.fa-thermometer-full:before,.fa-thermometer:before{content:""}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:""}.fa-thermometer-2:before,.fa-thermometer-half:before{content:""}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:""}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:""}.fa-shower:befo
re{content:""}.fa-bath:before,.fa-bathtub:before,.fa-s15:before{content:""}.fa-podcast:before{content:""}.fa-window-maximize:before{content:""}.fa-window-minimize:before{content:""}.fa-window-restore:before{content:""}.fa-times-rectangle:before,.fa-window-close:before{content:""}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:""}.fa-bandcamp:before{content:""}.fa-grav:before{content:""}.fa-etsy:before{content:""}.fa-imdb:before{content:""}.fa-ravelry:before{content:""}.fa-eercast:before{content:""}.fa-microchip:before{content:""}.fa-snowflake-o:before{content:""}.fa-superpowers:before{content:""}.fa-wpexplorer:before{content:""}.fa-meetup:before{content:""}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-dropdown .caret,.wy-inline-validate.wy-inline-validate-danger .wy-input-context,.wy-inline-validate.wy-inline-validate-info .wy-input-context,.wy-inline-validate.wy-inline-validate-success .wy-input-context,.wy-inline-validate.wy-inline-validate-warning .wy-input-context,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{font-family:inherit}.fa:before,.icon:before,.rst-content .admonition-title:before,.rst-content .code-block-caption 
.headerlink:before,.rst-content .eqno .headerlink:before,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before{font-family:FontAwesome;display:inline-block;font-style:normal;font-weight:400;line-height:1;text-decoration:inherit}.rst-content .code-block-caption a .headerlink,.rst-content .eqno a .headerlink,.rst-content a .admonition-title,.rst-content code.download a span:first-child,.rst-content dl dt a .headerlink,.rst-content h1 a .headerlink,.rst-content h2 a .headerlink,.rst-content h3 a .headerlink,.rst-content h4 a .headerlink,.rst-content h5 a .headerlink,.rst-content h6 a .headerlink,.rst-content p.caption a .headerlink,.rst-content p a .headerlink,.rst-content table>caption a .headerlink,.rst-content tt.download a span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li a button.toctree-expand,a .fa,a .icon,a .rst-content .admonition-title,a .rst-content .code-block-caption .headerlink,a .rst-content .eqno .headerlink,a .rst-content code.download span:first-child,a .rst-content dl dt .headerlink,a 
.rst-content h1 .headerlink,a .rst-content h2 .headerlink,a .rst-content h3 .headerlink,a .rst-content h4 .headerlink,a .rst-content h5 .headerlink,a .rst-content h6 .headerlink,a .rst-content p.caption .headerlink,a .rst-content p .headerlink,a .rst-content table>caption .headerlink,a .rst-content tt.download span:first-child,a .wy-menu-vertical li button.toctree-expand{display:inline-block;text-decoration:inherit}.btn .fa,.btn .icon,.btn .rst-content .admonition-title,.btn .rst-content .code-block-caption .headerlink,.btn .rst-content .eqno .headerlink,.btn .rst-content code.download span:first-child,.btn .rst-content dl dt .headerlink,.btn .rst-content h1 .headerlink,.btn .rst-content h2 .headerlink,.btn .rst-content h3 .headerlink,.btn .rst-content h4 .headerlink,.btn .rst-content h5 .headerlink,.btn .rst-content h6 .headerlink,.btn .rst-content p .headerlink,.btn .rst-content table>caption .headerlink,.btn .rst-content tt.download span:first-child,.btn .wy-menu-vertical li.current>a button.toctree-expand,.btn .wy-menu-vertical li.on a button.toctree-expand,.btn .wy-menu-vertical li button.toctree-expand,.nav .fa,.nav .icon,.nav .rst-content .admonition-title,.nav .rst-content .code-block-caption .headerlink,.nav .rst-content .eqno .headerlink,.nav .rst-content code.download span:first-child,.nav .rst-content dl dt .headerlink,.nav .rst-content h1 .headerlink,.nav .rst-content h2 .headerlink,.nav .rst-content h3 .headerlink,.nav .rst-content h4 .headerlink,.nav .rst-content h5 .headerlink,.nav .rst-content h6 .headerlink,.nav .rst-content p .headerlink,.nav .rst-content table>caption .headerlink,.nav .rst-content tt.download span:first-child,.nav .wy-menu-vertical li.current>a button.toctree-expand,.nav .wy-menu-vertical li.on a button.toctree-expand,.nav .wy-menu-vertical li button.toctree-expand,.rst-content .btn .admonition-title,.rst-content .code-block-caption .btn .headerlink,.rst-content .code-block-caption .nav .headerlink,.rst-content .eqno .btn 
.headerlink,.rst-content .eqno .nav .headerlink,.rst-content .nav .admonition-title,.rst-content code.download .btn span:first-child,.rst-content code.download .nav span:first-child,.rst-content dl dt .btn .headerlink,.rst-content dl dt .nav .headerlink,.rst-content h1 .btn .headerlink,.rst-content h1 .nav .headerlink,.rst-content h2 .btn .headerlink,.rst-content h2 .nav .headerlink,.rst-content h3 .btn .headerlink,.rst-content h3 .nav .headerlink,.rst-content h4 .btn .headerlink,.rst-content h4 .nav .headerlink,.rst-content h5 .btn .headerlink,.rst-content h5 .nav .headerlink,.rst-content h6 .btn .headerlink,.rst-content h6 .nav .headerlink,.rst-content p .btn .headerlink,.rst-content p .nav .headerlink,.rst-content table>caption .btn .headerlink,.rst-content table>caption .nav .headerlink,.rst-content tt.download .btn span:first-child,.rst-content tt.download .nav span:first-child,.wy-menu-vertical li .btn button.toctree-expand,.wy-menu-vertical li.current>a .btn button.toctree-expand,.wy-menu-vertical li.current>a .nav button.toctree-expand,.wy-menu-vertical li .nav button.toctree-expand,.wy-menu-vertical li.on a .btn button.toctree-expand,.wy-menu-vertical li.on a .nav button.toctree-expand{display:inline}.btn .fa-large.icon,.btn .fa.fa-large,.btn .rst-content .code-block-caption .fa-large.headerlink,.btn .rst-content .eqno .fa-large.headerlink,.btn .rst-content .fa-large.admonition-title,.btn .rst-content code.download span.fa-large:first-child,.btn .rst-content dl dt .fa-large.headerlink,.btn .rst-content h1 .fa-large.headerlink,.btn .rst-content h2 .fa-large.headerlink,.btn .rst-content h3 .fa-large.headerlink,.btn .rst-content h4 .fa-large.headerlink,.btn .rst-content h5 .fa-large.headerlink,.btn .rst-content h6 .fa-large.headerlink,.btn .rst-content p .fa-large.headerlink,.btn .rst-content table>caption .fa-large.headerlink,.btn .rst-content tt.download span.fa-large:first-child,.btn .wy-menu-vertical li button.fa-large.toctree-expand,.nav 
.fa-large.icon,.nav .fa.fa-large,.nav .rst-content .code-block-caption .fa-large.headerlink,.nav .rst-content .eqno .fa-large.headerlink,.nav .rst-content .fa-large.admonition-title,.nav .rst-content code.download span.fa-large:first-child,.nav .rst-content dl dt .fa-large.headerlink,.nav .rst-content h1 .fa-large.headerlink,.nav .rst-content h2 .fa-large.headerlink,.nav .rst-content h3 .fa-large.headerlink,.nav .rst-content h4 .fa-large.headerlink,.nav .rst-content h5 .fa-large.headerlink,.nav .rst-content h6 .fa-large.headerlink,.nav .rst-content p .fa-large.headerlink,.nav .rst-content table>caption .fa-large.headerlink,.nav .rst-content tt.download span.fa-large:first-child,.nav .wy-menu-vertical li button.fa-large.toctree-expand,.rst-content .btn .fa-large.admonition-title,.rst-content .code-block-caption .btn .fa-large.headerlink,.rst-content .code-block-caption .nav .fa-large.headerlink,.rst-content .eqno .btn .fa-large.headerlink,.rst-content .eqno .nav .fa-large.headerlink,.rst-content .nav .fa-large.admonition-title,.rst-content code.download .btn span.fa-large:first-child,.rst-content code.download .nav span.fa-large:first-child,.rst-content dl dt .btn .fa-large.headerlink,.rst-content dl dt .nav .fa-large.headerlink,.rst-content h1 .btn .fa-large.headerlink,.rst-content h1 .nav .fa-large.headerlink,.rst-content h2 .btn .fa-large.headerlink,.rst-content h2 .nav .fa-large.headerlink,.rst-content h3 .btn .fa-large.headerlink,.rst-content h3 .nav .fa-large.headerlink,.rst-content h4 .btn .fa-large.headerlink,.rst-content h4 .nav .fa-large.headerlink,.rst-content h5 .btn .fa-large.headerlink,.rst-content h5 .nav .fa-large.headerlink,.rst-content h6 .btn .fa-large.headerlink,.rst-content h6 .nav .fa-large.headerlink,.rst-content p .btn .fa-large.headerlink,.rst-content p .nav .fa-large.headerlink,.rst-content table>caption .btn .fa-large.headerlink,.rst-content table>caption .nav .fa-large.headerlink,.rst-content tt.download .btn 
span.fa-large:first-child,.rst-content tt.download .nav span.fa-large:first-child,.wy-menu-vertical li .btn button.fa-large.toctree-expand,.wy-menu-vertical li .nav button.fa-large.toctree-expand{line-height:.9em}.btn .fa-spin.icon,.btn .fa.fa-spin,.btn .rst-content .code-block-caption .fa-spin.headerlink,.btn .rst-content .eqno .fa-spin.headerlink,.btn .rst-content .fa-spin.admonition-title,.btn .rst-content code.download span.fa-spin:first-child,.btn .rst-content dl dt .fa-spin.headerlink,.btn .rst-content h1 .fa-spin.headerlink,.btn .rst-content h2 .fa-spin.headerlink,.btn .rst-content h3 .fa-spin.headerlink,.btn .rst-content h4 .fa-spin.headerlink,.btn .rst-content h5 .fa-spin.headerlink,.btn .rst-content h6 .fa-spin.headerlink,.btn .rst-content p .fa-spin.headerlink,.btn .rst-content table>caption .fa-spin.headerlink,.btn .rst-content tt.download span.fa-spin:first-child,.btn .wy-menu-vertical li button.fa-spin.toctree-expand,.nav .fa-spin.icon,.nav .fa.fa-spin,.nav .rst-content .code-block-caption .fa-spin.headerlink,.nav .rst-content .eqno .fa-spin.headerlink,.nav .rst-content .fa-spin.admonition-title,.nav .rst-content code.download span.fa-spin:first-child,.nav .rst-content dl dt .fa-spin.headerlink,.nav .rst-content h1 .fa-spin.headerlink,.nav .rst-content h2 .fa-spin.headerlink,.nav .rst-content h3 .fa-spin.headerlink,.nav .rst-content h4 .fa-spin.headerlink,.nav .rst-content h5 .fa-spin.headerlink,.nav .rst-content h6 .fa-spin.headerlink,.nav .rst-content p .fa-spin.headerlink,.nav .rst-content table>caption .fa-spin.headerlink,.nav .rst-content tt.download span.fa-spin:first-child,.nav .wy-menu-vertical li button.fa-spin.toctree-expand,.rst-content .btn .fa-spin.admonition-title,.rst-content .code-block-caption .btn .fa-spin.headerlink,.rst-content .code-block-caption .nav .fa-spin.headerlink,.rst-content .eqno .btn .fa-spin.headerlink,.rst-content .eqno .nav .fa-spin.headerlink,.rst-content .nav .fa-spin.admonition-title,.rst-content code.download 
.btn span.fa-spin:first-child,.rst-content code.download .nav span.fa-spin:first-child,.rst-content dl dt .btn .fa-spin.headerlink,.rst-content dl dt .nav .fa-spin.headerlink,.rst-content h1 .btn .fa-spin.headerlink,.rst-content h1 .nav .fa-spin.headerlink,.rst-content h2 .btn .fa-spin.headerlink,.rst-content h2 .nav .fa-spin.headerlink,.rst-content h3 .btn .fa-spin.headerlink,.rst-content h3 .nav .fa-spin.headerlink,.rst-content h4 .btn .fa-spin.headerlink,.rst-content h4 .nav .fa-spin.headerlink,.rst-content h5 .btn .fa-spin.headerlink,.rst-content h5 .nav .fa-spin.headerlink,.rst-content h6 .btn .fa-spin.headerlink,.rst-content h6 .nav .fa-spin.headerlink,.rst-content p .btn .fa-spin.headerlink,.rst-content p .nav .fa-spin.headerlink,.rst-content table>caption .btn .fa-spin.headerlink,.rst-content table>caption .nav .fa-spin.headerlink,.rst-content tt.download .btn span.fa-spin:first-child,.rst-content tt.download .nav span.fa-spin:first-child,.wy-menu-vertical li .btn button.fa-spin.toctree-expand,.wy-menu-vertical li .nav button.fa-spin.toctree-expand{display:inline-block}.btn.fa:before,.btn.icon:before,.rst-content .btn.admonition-title:before,.rst-content .code-block-caption .btn.headerlink:before,.rst-content .eqno .btn.headerlink:before,.rst-content code.download span.btn:first-child:before,.rst-content dl dt .btn.headerlink:before,.rst-content h1 .btn.headerlink:before,.rst-content h2 .btn.headerlink:before,.rst-content h3 .btn.headerlink:before,.rst-content h4 .btn.headerlink:before,.rst-content h5 .btn.headerlink:before,.rst-content h6 .btn.headerlink:before,.rst-content p .btn.headerlink:before,.rst-content table>caption .btn.headerlink:before,.rst-content tt.download span.btn:first-child:before,.wy-menu-vertical li button.btn.toctree-expand:before{opacity:.5;-webkit-transition:opacity .05s ease-in;-moz-transition:opacity .05s ease-in;transition:opacity .05s ease-in}.btn.fa:hover:before,.btn.icon:hover:before,.rst-content 
.btn.admonition-title:hover:before,.rst-content .code-block-caption .btn.headerlink:hover:before,.rst-content .eqno .btn.headerlink:hover:before,.rst-content code.download span.btn:first-child:hover:before,.rst-content dl dt .btn.headerlink:hover:before,.rst-content h1 .btn.headerlink:hover:before,.rst-content h2 .btn.headerlink:hover:before,.rst-content h3 .btn.headerlink:hover:before,.rst-content h4 .btn.headerlink:hover:before,.rst-content h5 .btn.headerlink:hover:before,.rst-content h6 .btn.headerlink:hover:before,.rst-content p .btn.headerlink:hover:before,.rst-content table>caption .btn.headerlink:hover:before,.rst-content tt.download span.btn:first-child:hover:before,.wy-menu-vertical li button.btn.toctree-expand:hover:before{opacity:1}.btn-mini .fa:before,.btn-mini .icon:before,.btn-mini .rst-content .admonition-title:before,.btn-mini .rst-content .code-block-caption .headerlink:before,.btn-mini .rst-content .eqno .headerlink:before,.btn-mini .rst-content code.download span:first-child:before,.btn-mini .rst-content dl dt .headerlink:before,.btn-mini .rst-content h1 .headerlink:before,.btn-mini .rst-content h2 .headerlink:before,.btn-mini .rst-content h3 .headerlink:before,.btn-mini .rst-content h4 .headerlink:before,.btn-mini .rst-content h5 .headerlink:before,.btn-mini .rst-content h6 .headerlink:before,.btn-mini .rst-content p .headerlink:before,.btn-mini .rst-content table>caption .headerlink:before,.btn-mini .rst-content tt.download span:first-child:before,.btn-mini .wy-menu-vertical li button.toctree-expand:before,.rst-content .btn-mini .admonition-title:before,.rst-content .code-block-caption .btn-mini .headerlink:before,.rst-content .eqno .btn-mini .headerlink:before,.rst-content code.download .btn-mini span:first-child:before,.rst-content dl dt .btn-mini .headerlink:before,.rst-content h1 .btn-mini .headerlink:before,.rst-content h2 .btn-mini .headerlink:before,.rst-content h3 .btn-mini .headerlink:before,.rst-content h4 .btn-mini 
.headerlink:before,.rst-content h5 .btn-mini .headerlink:before,.rst-content h6 .btn-mini .headerlink:before,.rst-content p .btn-mini .headerlink:before,.rst-content table>caption .btn-mini .headerlink:before,.rst-content tt.download .btn-mini span:first-child:before,.wy-menu-vertical li .btn-mini button.toctree-expand:before{font-size:14px;vertical-align:-15%}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.wy-alert{padding:12px;line-height:24px;margin-bottom:24px;background:#e7f2fa}.rst-content .admonition-title,.wy-alert-title{font-weight:700;display:block;color:#fff;background:#6ab0de;padding:6px 12px;margin:-12px -12px 12px}.rst-content .danger,.rst-content .error,.rst-content .wy-alert-danger.admonition,.rst-content .wy-alert-danger.admonition-todo,.rst-content .wy-alert-danger.attention,.rst-content .wy-alert-danger.caution,.rst-content .wy-alert-danger.hint,.rst-content .wy-alert-danger.important,.rst-content .wy-alert-danger.note,.rst-content .wy-alert-danger.seealso,.rst-content .wy-alert-danger.tip,.rst-content .wy-alert-danger.warning,.wy-alert.wy-alert-danger{background:#fdf3f2}.rst-content .danger .admonition-title,.rst-content .danger .wy-alert-title,.rst-content .error .admonition-title,.rst-content .error .wy-alert-title,.rst-content .wy-alert-danger.admonition-todo .admonition-title,.rst-content .wy-alert-danger.admonition-todo .wy-alert-title,.rst-content .wy-alert-danger.admonition .admonition-title,.rst-content .wy-alert-danger.admonition .wy-alert-title,.rst-content .wy-alert-danger.attention .admonition-title,.rst-content .wy-alert-danger.attention .wy-alert-title,.rst-content .wy-alert-danger.caution .admonition-title,.rst-content .wy-alert-danger.caution .wy-alert-title,.rst-content .wy-alert-danger.hint 
.admonition-title,.rst-content .wy-alert-danger.hint .wy-alert-title,.rst-content .wy-alert-danger.important .admonition-title,.rst-content .wy-alert-danger.important .wy-alert-title,.rst-content .wy-alert-danger.note .admonition-title,.rst-content .wy-alert-danger.note .wy-alert-title,.rst-content .wy-alert-danger.seealso .admonition-title,.rst-content .wy-alert-danger.seealso .wy-alert-title,.rst-content .wy-alert-danger.tip .admonition-title,.rst-content .wy-alert-danger.tip .wy-alert-title,.rst-content .wy-alert-danger.warning .admonition-title,.rst-content .wy-alert-danger.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-danger .admonition-title,.wy-alert.wy-alert-danger .rst-content .admonition-title,.wy-alert.wy-alert-danger .wy-alert-title{background:#f29f97}.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .warning,.rst-content .wy-alert-warning.admonition,.rst-content .wy-alert-warning.danger,.rst-content .wy-alert-warning.error,.rst-content .wy-alert-warning.hint,.rst-content .wy-alert-warning.important,.rst-content .wy-alert-warning.note,.rst-content .wy-alert-warning.seealso,.rst-content .wy-alert-warning.tip,.wy-alert.wy-alert-warning{background:#ffedcc}.rst-content .admonition-todo .admonition-title,.rst-content .admonition-todo .wy-alert-title,.rst-content .attention .admonition-title,.rst-content .attention .wy-alert-title,.rst-content .caution .admonition-title,.rst-content .caution .wy-alert-title,.rst-content .warning .admonition-title,.rst-content .warning .wy-alert-title,.rst-content .wy-alert-warning.admonition .admonition-title,.rst-content .wy-alert-warning.admonition .wy-alert-title,.rst-content .wy-alert-warning.danger .admonition-title,.rst-content .wy-alert-warning.danger .wy-alert-title,.rst-content .wy-alert-warning.error .admonition-title,.rst-content .wy-alert-warning.error .wy-alert-title,.rst-content .wy-alert-warning.hint .admonition-title,.rst-content .wy-alert-warning.hint 
.wy-alert-title,.rst-content .wy-alert-warning.important .admonition-title,.rst-content .wy-alert-warning.important .wy-alert-title,.rst-content .wy-alert-warning.note .admonition-title,.rst-content .wy-alert-warning.note .wy-alert-title,.rst-content .wy-alert-warning.seealso .admonition-title,.rst-content .wy-alert-warning.seealso .wy-alert-title,.rst-content .wy-alert-warning.tip .admonition-title,.rst-content .wy-alert-warning.tip .wy-alert-title,.rst-content .wy-alert.wy-alert-warning .admonition-title,.wy-alert.wy-alert-warning .rst-content .admonition-title,.wy-alert.wy-alert-warning .wy-alert-title{background:#f0b37e}.rst-content .note,.rst-content .seealso,.rst-content .wy-alert-info.admonition,.rst-content .wy-alert-info.admonition-todo,.rst-content .wy-alert-info.attention,.rst-content .wy-alert-info.caution,.rst-content .wy-alert-info.danger,.rst-content .wy-alert-info.error,.rst-content .wy-alert-info.hint,.rst-content .wy-alert-info.important,.rst-content .wy-alert-info.tip,.rst-content .wy-alert-info.warning,.wy-alert.wy-alert-info{background:#e7f2fa}.rst-content .note .admonition-title,.rst-content .note .wy-alert-title,.rst-content .seealso .admonition-title,.rst-content .seealso .wy-alert-title,.rst-content .wy-alert-info.admonition-todo .admonition-title,.rst-content .wy-alert-info.admonition-todo .wy-alert-title,.rst-content .wy-alert-info.admonition .admonition-title,.rst-content .wy-alert-info.admonition .wy-alert-title,.rst-content .wy-alert-info.attention .admonition-title,.rst-content .wy-alert-info.attention .wy-alert-title,.rst-content .wy-alert-info.caution .admonition-title,.rst-content .wy-alert-info.caution .wy-alert-title,.rst-content .wy-alert-info.danger .admonition-title,.rst-content .wy-alert-info.danger .wy-alert-title,.rst-content .wy-alert-info.error .admonition-title,.rst-content .wy-alert-info.error .wy-alert-title,.rst-content .wy-alert-info.hint .admonition-title,.rst-content .wy-alert-info.hint .wy-alert-title,.rst-content 
.wy-alert-info.important .admonition-title,.rst-content .wy-alert-info.important .wy-alert-title,.rst-content .wy-alert-info.tip .admonition-title,.rst-content .wy-alert-info.tip .wy-alert-title,.rst-content .wy-alert-info.warning .admonition-title,.rst-content .wy-alert-info.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-info .admonition-title,.wy-alert.wy-alert-info .rst-content .admonition-title,.wy-alert.wy-alert-info .wy-alert-title{background:#6ab0de}.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .wy-alert-success.admonition,.rst-content .wy-alert-success.admonition-todo,.rst-content .wy-alert-success.attention,.rst-content .wy-alert-success.caution,.rst-content .wy-alert-success.danger,.rst-content .wy-alert-success.error,.rst-content .wy-alert-success.note,.rst-content .wy-alert-success.seealso,.rst-content .wy-alert-success.warning,.wy-alert.wy-alert-success{background:#dbfaf4}.rst-content .hint .admonition-title,.rst-content .hint .wy-alert-title,.rst-content .important .admonition-title,.rst-content .important .wy-alert-title,.rst-content .tip .admonition-title,.rst-content .tip .wy-alert-title,.rst-content .wy-alert-success.admonition-todo .admonition-title,.rst-content .wy-alert-success.admonition-todo .wy-alert-title,.rst-content .wy-alert-success.admonition .admonition-title,.rst-content .wy-alert-success.admonition .wy-alert-title,.rst-content .wy-alert-success.attention .admonition-title,.rst-content .wy-alert-success.attention .wy-alert-title,.rst-content .wy-alert-success.caution .admonition-title,.rst-content .wy-alert-success.caution .wy-alert-title,.rst-content .wy-alert-success.danger .admonition-title,.rst-content .wy-alert-success.danger .wy-alert-title,.rst-content .wy-alert-success.error .admonition-title,.rst-content .wy-alert-success.error .wy-alert-title,.rst-content .wy-alert-success.note .admonition-title,.rst-content .wy-alert-success.note .wy-alert-title,.rst-content .wy-alert-success.seealso 
.admonition-title,.rst-content .wy-alert-success.seealso .wy-alert-title,.rst-content .wy-alert-success.warning .admonition-title,.rst-content .wy-alert-success.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-success .admonition-title,.wy-alert.wy-alert-success .rst-content .admonition-title,.wy-alert.wy-alert-success .wy-alert-title{background:#1abc9c}.rst-content .wy-alert-neutral.admonition,.rst-content .wy-alert-neutral.admonition-todo,.rst-content .wy-alert-neutral.attention,.rst-content .wy-alert-neutral.caution,.rst-content .wy-alert-neutral.danger,.rst-content .wy-alert-neutral.error,.rst-content .wy-alert-neutral.hint,.rst-content .wy-alert-neutral.important,.rst-content .wy-alert-neutral.note,.rst-content .wy-alert-neutral.seealso,.rst-content .wy-alert-neutral.tip,.rst-content .wy-alert-neutral.warning,.wy-alert.wy-alert-neutral{background:#f3f6f6}.rst-content .wy-alert-neutral.admonition-todo .admonition-title,.rst-content .wy-alert-neutral.admonition-todo .wy-alert-title,.rst-content .wy-alert-neutral.admonition .admonition-title,.rst-content .wy-alert-neutral.admonition .wy-alert-title,.rst-content .wy-alert-neutral.attention .admonition-title,.rst-content .wy-alert-neutral.attention .wy-alert-title,.rst-content .wy-alert-neutral.caution .admonition-title,.rst-content .wy-alert-neutral.caution .wy-alert-title,.rst-content .wy-alert-neutral.danger .admonition-title,.rst-content .wy-alert-neutral.danger .wy-alert-title,.rst-content .wy-alert-neutral.error .admonition-title,.rst-content .wy-alert-neutral.error .wy-alert-title,.rst-content .wy-alert-neutral.hint .admonition-title,.rst-content .wy-alert-neutral.hint .wy-alert-title,.rst-content .wy-alert-neutral.important .admonition-title,.rst-content .wy-alert-neutral.important .wy-alert-title,.rst-content .wy-alert-neutral.note .admonition-title,.rst-content .wy-alert-neutral.note .wy-alert-title,.rst-content .wy-alert-neutral.seealso .admonition-title,.rst-content .wy-alert-neutral.seealso 
.wy-alert-title,.rst-content .wy-alert-neutral.tip .admonition-title,.rst-content .wy-alert-neutral.tip .wy-alert-title,.rst-content .wy-alert-neutral.warning .admonition-title,.rst-content .wy-alert-neutral.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-neutral .admonition-title,.wy-alert.wy-alert-neutral .rst-content .admonition-title,.wy-alert.wy-alert-neutral .wy-alert-title{color:#404040;background:#e1e4e5}.rst-content .wy-alert-neutral.admonition-todo a,.rst-content .wy-alert-neutral.admonition a,.rst-content .wy-alert-neutral.attention a,.rst-content .wy-alert-neutral.caution a,.rst-content .wy-alert-neutral.danger a,.rst-content .wy-alert-neutral.error a,.rst-content .wy-alert-neutral.hint a,.rst-content .wy-alert-neutral.important a,.rst-content .wy-alert-neutral.note a,.rst-content .wy-alert-neutral.seealso a,.rst-content .wy-alert-neutral.tip a,.rst-content .wy-alert-neutral.warning a,.wy-alert.wy-alert-neutral a{color:#2980b9}.rst-content .admonition-todo p:last-child,.rst-content .admonition p:last-child,.rst-content .attention p:last-child,.rst-content .caution p:last-child,.rst-content .danger p:last-child,.rst-content .error p:last-child,.rst-content .hint p:last-child,.rst-content .important p:last-child,.rst-content .note p:last-child,.rst-content .seealso p:last-child,.rst-content .tip p:last-child,.rst-content .warning p:last-child,.wy-alert p:last-child{margin-bottom:0}.wy-tray-container{position:fixed;bottom:0;left:0;z-index:600}.wy-tray-container li{display:block;width:300px;background:transparent;color:#fff;text-align:center;box-shadow:0 5px 5px 0 rgba(0,0,0,.1);padding:0 24px;min-width:20%;opacity:0;height:0;line-height:56px;overflow:hidden;-webkit-transition:all .3s ease-in;-moz-transition:all .3s ease-in;transition:all .3s ease-in}.wy-tray-container li.wy-tray-item-success{background:#27ae60}.wy-tray-container li.wy-tray-item-info{background:#2980b9}.wy-tray-container li.wy-tray-item-warning{background:#e67e22}.wy-tray-container 
li.wy-tray-item-danger{background:#e74c3c}.wy-tray-container li.on{opacity:1;height:56px}@media screen and (max-width:768px){.wy-tray-container{bottom:auto;top:0;width:100%}.wy-tray-container li{width:100%}}button{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;cursor:pointer;line-height:normal;-webkit-appearance:button;*overflow:visible}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}button[disabled]{cursor:default}.btn{display:inline-block;border-radius:2px;line-height:normal;white-space:nowrap;text-align:center;cursor:pointer;font-size:100%;padding:6px 12px 8px;color:#fff;border:1px solid rgba(0,0,0,.1);background-color:#27ae60;text-decoration:none;font-weight:400;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 2px -1px hsla(0,0%,100%,.5),inset 0 -2px 0 0 rgba(0,0,0,.1);outline-none:false;vertical-align:middle;*display:inline;zoom:1;-webkit-user-drag:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-transition:all .1s linear;-moz-transition:all .1s linear;transition:all .1s linear}.btn-hover{background:#2e8ece;color:#fff}.btn:hover{background:#2cc36b;color:#fff}.btn:focus{background:#2cc36b;outline:0}.btn:active{box-shadow:inset 0 -1px 0 0 rgba(0,0,0,.05),inset 0 2px 0 0 rgba(0,0,0,.1);padding:8px 12px 6px}.btn:visited{color:#fff}.btn-disabled,.btn-disabled:active,.btn-disabled:focus,.btn-disabled:hover,.btn:disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = 
false);filter:alpha(opacity=40);opacity:.4;cursor:not-allowed;box-shadow:none}.btn::-moz-focus-inner{padding:0;border:0}.btn-small{font-size:80%}.btn-info{background-color:#2980b9!important}.btn-info:hover{background-color:#2e8ece!important}.btn-neutral{background-color:#f3f6f6!important;color:#404040!important}.btn-neutral:hover{background-color:#e5ebeb!important;color:#404040}.btn-neutral:visited{color:#404040!important}.btn-success{background-color:#27ae60!important}.btn-success:hover{background-color:#295!important}.btn-danger{background-color:#e74c3c!important}.btn-danger:hover{background-color:#ea6153!important}.btn-warning{background-color:#e67e22!important}.btn-warning:hover{background-color:#e98b39!important}.btn-invert{background-color:#222}.btn-invert:hover{background-color:#2f2f2f!important}.btn-link{background-color:transparent!important;color:#2980b9;box-shadow:none;border-color:transparent!important}.btn-link:active,.btn-link:hover{background-color:transparent!important;color:#409ad5!important;box-shadow:none}.btn-link:visited{color:#9b59b6}.wy-btn-group .btn,.wy-control .btn{vertical-align:middle}.wy-btn-group{margin-bottom:24px;*zoom:1}.wy-btn-group:after,.wy-btn-group:before{display:table;content:""}.wy-btn-group:after{clear:both}.wy-dropdown{position:relative;display:inline-block}.wy-dropdown-active .wy-dropdown-menu{display:block}.wy-dropdown-menu{position:absolute;left:0;display:none;float:left;top:100%;min-width:100%;background:#fcfcfc;z-index:100;border:1px solid #cfd7dd;box-shadow:0 2px 2px 0 rgba(0,0,0,.1);padding:12px}.wy-dropdown-menu>dd>a{display:block;clear:both;color:#404040;white-space:nowrap;font-size:90%;padding:0 12px;cursor:pointer}.wy-dropdown-menu>dd>a:hover{background:#2980b9;color:#fff}.wy-dropdown-menu>dd.divider{border-top:1px solid #cfd7dd;margin:6px 0}.wy-dropdown-menu>dd.search{padding-bottom:12px}.wy-dropdown-menu>dd.search 
input[type=search]{width:100%}.wy-dropdown-menu>dd.call-to-action{background:#e3e3e3;text-transform:uppercase;font-weight:500;font-size:80%}.wy-dropdown-menu>dd.call-to-action:hover{background:#e3e3e3}.wy-dropdown-menu>dd.call-to-action .btn{color:#fff}.wy-dropdown.wy-dropdown-up .wy-dropdown-menu{bottom:100%;top:auto;left:auto;right:0}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu{background:#fcfcfc;margin-top:2px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a{padding:6px 12px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a:hover{background:#2980b9;color:#fff}.wy-dropdown.wy-dropdown-left .wy-dropdown-menu{right:0;left:auto;text-align:right}.wy-dropdown-arrow:before{content:" ";border-bottom:5px solid #f5f5f5;border-left:5px solid transparent;border-right:5px solid transparent;position:absolute;display:block;top:-4px;left:50%;margin-left:-3px}.wy-dropdown-arrow.wy-dropdown-arrow-left:before{left:11px}.wy-form-stacked select{display:block}.wy-form-aligned .wy-help-inline,.wy-form-aligned input,.wy-form-aligned label,.wy-form-aligned select,.wy-form-aligned textarea{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-form-aligned .wy-control-group>label{display:inline-block;vertical-align:middle;width:10em;margin:6px 12px 0 0;float:left}.wy-form-aligned .wy-control{float:left}.wy-form-aligned .wy-control label{display:block}.wy-form-aligned .wy-control select{margin-top:6px}fieldset{margin:0}fieldset,legend{border:0;padding:0}legend{width:100%;white-space:normal;margin-bottom:24px;font-size:150%;*margin-left:-7px}label,legend{display:block}label{margin:0 0 
.3125em;color:#333;font-size:90%}input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}.wy-control-group{margin-bottom:24px;max-width:1200px;margin-left:auto;margin-right:auto;*zoom:1}.wy-control-group:after,.wy-control-group:before{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group.wy-control-group-required>label:after{content:" *";color:#e74c3c}.wy-control-group .wy-form-full,.wy-control-group .wy-form-halves,.wy-control-group .wy-form-thirds{padding-bottom:12px}.wy-control-group .wy-form-full input[type=color],.wy-control-group .wy-form-full input[type=date],.wy-control-group .wy-form-full input[type=datetime-local],.wy-control-group .wy-form-full input[type=datetime],.wy-control-group .wy-form-full input[type=email],.wy-control-group .wy-form-full input[type=month],.wy-control-group .wy-form-full input[type=number],.wy-control-group .wy-form-full input[type=password],.wy-control-group .wy-form-full input[type=search],.wy-control-group .wy-form-full input[type=tel],.wy-control-group .wy-form-full input[type=text],.wy-control-group .wy-form-full input[type=time],.wy-control-group .wy-form-full input[type=url],.wy-control-group .wy-form-full input[type=week],.wy-control-group .wy-form-full select,.wy-control-group .wy-form-halves input[type=color],.wy-control-group .wy-form-halves input[type=date],.wy-control-group .wy-form-halves input[type=datetime-local],.wy-control-group .wy-form-halves input[type=datetime],.wy-control-group .wy-form-halves input[type=email],.wy-control-group .wy-form-halves input[type=month],.wy-control-group .wy-form-halves input[type=number],.wy-control-group .wy-form-halves input[type=password],.wy-control-group .wy-form-halves input[type=search],.wy-control-group .wy-form-halves input[type=tel],.wy-control-group .wy-form-halves input[type=text],.wy-control-group .wy-form-halves input[type=time],.wy-control-group .wy-form-halves input[type=url],.wy-control-group 
.wy-form-halves input[type=week],.wy-control-group .wy-form-halves select,.wy-control-group .wy-form-thirds input[type=color],.wy-control-group .wy-form-thirds input[type=date],.wy-control-group .wy-form-thirds input[type=datetime-local],.wy-control-group .wy-form-thirds input[type=datetime],.wy-control-group .wy-form-thirds input[type=email],.wy-control-group .wy-form-thirds input[type=month],.wy-control-group .wy-form-thirds input[type=number],.wy-control-group .wy-form-thirds input[type=password],.wy-control-group .wy-form-thirds input[type=search],.wy-control-group .wy-form-thirds input[type=tel],.wy-control-group .wy-form-thirds input[type=text],.wy-control-group .wy-form-thirds input[type=time],.wy-control-group .wy-form-thirds input[type=url],.wy-control-group .wy-form-thirds input[type=week],.wy-control-group .wy-form-thirds select{width:100%}.wy-control-group .wy-form-full{float:left;display:block;width:100%;margin-right:0}.wy-control-group .wy-form-full:last-child{margin-right:0}.wy-control-group .wy-form-halves{float:left;display:block;margin-right:2.35765%;width:48.82117%}.wy-control-group .wy-form-halves:last-child,.wy-control-group .wy-form-halves:nth-of-type(2n){margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(odd){clear:left}.wy-control-group .wy-form-thirds{float:left;display:block;margin-right:2.35765%;width:31.76157%}.wy-control-group .wy-form-thirds:last-child,.wy-control-group .wy-form-thirds:nth-of-type(3n){margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n+1){clear:left}.wy-control-group.wy-control-group-no-input .wy-control,.wy-control-no-input{margin:6px 0 0;font-size:90%}.wy-control-no-input{display:inline-block}.wy-control-group.fluid-input input[type=color],.wy-control-group.fluid-input input[type=date],.wy-control-group.fluid-input input[type=datetime-local],.wy-control-group.fluid-input input[type=datetime],.wy-control-group.fluid-input input[type=email],.wy-control-group.fluid-input 
input[type=month],.wy-control-group.fluid-input input[type=number],.wy-control-group.fluid-input input[type=password],.wy-control-group.fluid-input input[type=search],.wy-control-group.fluid-input input[type=tel],.wy-control-group.fluid-input input[type=text],.wy-control-group.fluid-input input[type=time],.wy-control-group.fluid-input input[type=url],.wy-control-group.fluid-input input[type=week]{width:100%}.wy-form-message-inline{padding-left:.3em;color:#666;font-size:90%}.wy-form-message{display:block;color:#999;font-size:70%;margin-top:.3125em;font-style:italic}.wy-form-message p{font-size:inherit;font-style:italic;margin-bottom:6px}.wy-form-message p:last-child{margin-bottom:0}input{line-height:normal}input[type=button],input[type=reset],input[type=submit]{-webkit-appearance:button;cursor:pointer;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;*overflow:visible}input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week]{-webkit-appearance:none;padding:6px;display:inline-block;border:1px solid #ccc;font-size:80%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 3px #ddd;border-radius:0;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}input[type=datetime-local]{padding:.34375em 
.625em}input[disabled]{cursor:default}input[type=checkbox],input[type=radio]{padding:0;margin-right:.3125em;*height:13px;*width:13px}input[type=checkbox],input[type=radio],input[type=search]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration{-webkit-appearance:none}input[type=color]:focus,input[type=date]:focus,input[type=datetime-local]:focus,input[type=datetime]:focus,input[type=email]:focus,input[type=month]:focus,input[type=number]:focus,input[type=password]:focus,input[type=search]:focus,input[type=tel]:focus,input[type=text]:focus,input[type=time]:focus,input[type=url]:focus,input[type=week]:focus{outline:0;outline:thin dotted\9;border-color:#333}input.no-focus:focus{border-color:#ccc!important}input[type=checkbox]:focus,input[type=file]:focus,input[type=radio]:focus{outline:thin dotted #333;outline:1px auto #129fea}input[type=color][disabled],input[type=date][disabled],input[type=datetime-local][disabled],input[type=datetime][disabled],input[type=email][disabled],input[type=month][disabled],input[type=number][disabled],input[type=password][disabled],input[type=search][disabled],input[type=tel][disabled],input[type=text][disabled],input[type=time][disabled],input[type=url][disabled],input[type=week][disabled]{cursor:not-allowed;background-color:#fafafa}input:focus:invalid,select:focus:invalid,textarea:focus:invalid{color:#e74c3c;border:1px solid #e74c3c}input:focus:invalid:focus,select:focus:invalid:focus,textarea:focus:invalid:focus{border-color:#e74c3c}input[type=checkbox]:focus:invalid:focus,input[type=file]:focus:invalid:focus,input[type=radio]:focus:invalid:focus{outline-color:#e74c3c}input.wy-input-large{padding:12px;font-size:100%}textarea{overflow:auto;vertical-align:top;width:100%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif}select,textarea{padding:.5em .625em;display:inline-block;border:1px solid 
#ccc;font-size:80%;box-shadow:inset 0 1px 3px #ddd;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}select{border:1px solid #ccc;background-color:#fff}select[multiple]{height:auto}select:focus,textarea:focus{outline:0}input[readonly],select[disabled],select[readonly],textarea[disabled],textarea[readonly]{cursor:not-allowed;background-color:#fafafa}input[type=checkbox][disabled],input[type=radio][disabled]{cursor:not-allowed}.wy-checkbox,.wy-radio{margin:6px 0;color:#404040;display:block}.wy-checkbox input,.wy-radio input{vertical-align:baseline}.wy-form-message-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-input-prefix,.wy-input-suffix{white-space:nowrap;padding:6px}.wy-input-prefix .wy-input-context,.wy-input-suffix .wy-input-context{line-height:27px;padding:0 8px;display:inline-block;font-size:80%;background-color:#f3f6f6;border:1px solid #ccc;color:#999}.wy-input-suffix .wy-input-context{border-left:0}.wy-input-prefix .wy-input-context{border-right:0}.wy-switch{position:relative;display:block;height:24px;margin-top:12px;cursor:pointer}.wy-switch:before{left:0;top:0;width:36px;height:12px;background:#ccc}.wy-switch:after,.wy-switch:before{position:absolute;content:"";display:block;border-radius:4px;-webkit-transition:all .2s ease-in-out;-moz-transition:all .2s ease-in-out;transition:all .2s ease-in-out}.wy-switch:after{width:18px;height:18px;background:#999;left:-3px;top:-3px}.wy-switch span{position:absolute;left:48px;display:block;font-size:12px;color:#ccc;line-height:1}.wy-switch.active:before{background:#1e8449}.wy-switch.active:after{left:24px;background:#27ae60}.wy-switch.disabled{cursor:not-allowed;opacity:.8}.wy-control-group.wy-control-group-error .wy-form-message,.wy-control-group.wy-control-group-error>label{color:#e74c3c}.wy-control-group.wy-control-group-error input[type=color],.wy-control-group.wy-control-group-error 
input[type=date],.wy-control-group.wy-control-group-error input[type=datetime-local],.wy-control-group.wy-control-group-error input[type=datetime],.wy-control-group.wy-control-group-error input[type=email],.wy-control-group.wy-control-group-error input[type=month],.wy-control-group.wy-control-group-error input[type=number],.wy-control-group.wy-control-group-error input[type=password],.wy-control-group.wy-control-group-error input[type=search],.wy-control-group.wy-control-group-error input[type=tel],.wy-control-group.wy-control-group-error input[type=text],.wy-control-group.wy-control-group-error input[type=time],.wy-control-group.wy-control-group-error input[type=url],.wy-control-group.wy-control-group-error input[type=week],.wy-control-group.wy-control-group-error textarea{border:1px solid #e74c3c}.wy-inline-validate{white-space:nowrap}.wy-inline-validate .wy-input-context{padding:.5em .625em;display:inline-block;font-size:80%}.wy-inline-validate.wy-inline-validate-success .wy-input-context{color:#27ae60}.wy-inline-validate.wy-inline-validate-danger .wy-input-context{color:#e74c3c}.wy-inline-validate.wy-inline-validate-warning .wy-input-context{color:#e67e22}.wy-inline-validate.wy-inline-validate-info .wy-input-context{color:#2980b9}.rotate-90{-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.rotate-180{-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.rotate-270{-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.mirror{-webkit-transform:scaleX(-1);-moz-transform:scaleX(-1);-ms-transform:scaleX(-1);-o-transform:scaleX(-1);transform:scaleX(-1)}.mirror.rotate-90{-webkit-transform:scaleX(-1) rotate(90deg);-moz-transform:scaleX(-1) rotate(90deg);-ms-transform:scaleX(-1) 
rotate(90deg);-o-transform:scaleX(-1) rotate(90deg);transform:scaleX(-1) rotate(90deg)}.mirror.rotate-180{-webkit-transform:scaleX(-1) rotate(180deg);-moz-transform:scaleX(-1) rotate(180deg);-ms-transform:scaleX(-1) rotate(180deg);-o-transform:scaleX(-1) rotate(180deg);transform:scaleX(-1) rotate(180deg)}.mirror.rotate-270{-webkit-transform:scaleX(-1) rotate(270deg);-moz-transform:scaleX(-1) rotate(270deg);-ms-transform:scaleX(-1) rotate(270deg);-o-transform:scaleX(-1) rotate(270deg);transform:scaleX(-1) rotate(270deg)}@media only screen and (max-width:480px){.wy-form button[type=submit]{margin:.7em 0 0}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=text],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week],.wy-form label{margin-bottom:.3em;display:block}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week]{margin-bottom:0}.wy-form-aligned .wy-control-group label{margin-bottom:.3em;text-align:left;display:block;width:100%}.wy-form-aligned .wy-control{margin:1.5em 0 0}.wy-form-message,.wy-form-message-inline,.wy-form .wy-help-inline{display:block;font-size:80%;padding:6px 0}}@media screen and (max-width:768px){.tablet-hide{display:none}}@media screen and (max-width:480px){.mobile-hide{display:none}}.float-left{float:left}.float-right{float:right}.full-width{width:100%}.rst-content table.docutils,.rst-content 
table.field-list,.wy-table{border-collapse:collapse;border-spacing:0;empty-cells:show;margin-bottom:24px}.rst-content table.docutils caption,.rst-content table.field-list caption,.wy-table caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.rst-content table.docutils td,.rst-content table.docutils th,.rst-content table.field-list td,.rst-content table.field-list th,.wy-table td,.wy-table th{font-size:90%;margin:0;overflow:visible;padding:8px 16px}.rst-content table.docutils td:first-child,.rst-content table.docutils th:first-child,.rst-content table.field-list td:first-child,.rst-content table.field-list th:first-child,.wy-table td:first-child,.wy-table th:first-child{border-left-width:0}.rst-content table.docutils thead,.rst-content table.field-list thead,.wy-table thead{color:#000;text-align:left;vertical-align:bottom;white-space:nowrap}.rst-content table.docutils thead th,.rst-content table.field-list thead th,.wy-table thead th{font-weight:700;border-bottom:2px solid #e1e4e5}.rst-content table.docutils td,.rst-content table.field-list td,.wy-table td{background-color:transparent;vertical-align:middle}.rst-content table.docutils td p,.rst-content table.field-list td p,.wy-table td p{line-height:18px}.rst-content table.docutils td p:last-child,.rst-content table.field-list td p:last-child,.wy-table td p:last-child{margin-bottom:0}.rst-content table.docutils .wy-table-cell-min,.rst-content table.field-list .wy-table-cell-min,.wy-table .wy-table-cell-min{width:1%;padding-right:0}.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox],.wy-table .wy-table-cell-min input[type=checkbox]{margin:0}.wy-table-secondary{color:grey;font-size:90%}.wy-table-tertiary{color:grey;font-size:80%}.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td,.wy-table-backed,.wy-table-odd td,.wy-table-striped tr:nth-child(2n-1) 
td{background-color:#f3f6f6}.rst-content table.docutils,.wy-table-bordered-all{border:1px solid #e1e4e5}.rst-content table.docutils td,.wy-table-bordered-all td{border-bottom:1px solid #e1e4e5;border-left:1px solid #e1e4e5}.rst-content table.docutils tbody>tr:last-child td,.wy-table-bordered-all tbody>tr:last-child td{border-bottom-width:0}.wy-table-bordered{border:1px solid #e1e4e5}.wy-table-bordered-rows td{border-bottom:1px solid #e1e4e5}.wy-table-bordered-rows tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal td,.wy-table-horizontal th{border-width:0 0 1px;border-bottom:1px solid #e1e4e5}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-responsive{margin-bottom:24px;max-width:100%;overflow:auto}.wy-table-responsive table{margin-bottom:0!important}.wy-table-responsive table td,.wy-table-responsive table th{white-space:nowrap}a{color:#2980b9;text-decoration:none;cursor:pointer}a:hover{color:#3091d1}a:visited{color:#9b59b6}html{height:100%}body,html{overflow-x:hidden}body{font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;font-weight:400;color:#404040;min-height:100%;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#e67e22!important}a.wy-text-warning:hover{color:#eb9950!important}.wy-text-info{color:#2980b9!important}a.wy-text-info:hover{color:#409ad5!important}.wy-text-success{color:#27ae60!important}a.wy-text-success:hover{color:#36d278!important}.wy-text-danger{color:#e74c3c!important}a.wy-text-danger:hover{color:#ed7669!important}.wy-text-neutral{color:#404040!important}a.wy-text-neutral:hover{color:#595959!important}.rst-content .toctree-wrapper>p.caption,h1,h2,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:Roboto 
Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif}p{line-height:24px;font-size:16px;margin:0 0 24px}h1{font-size:175%}.rst-content .toctree-wrapper>p.caption,h2{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}.rst-content code,.rst-content tt,code{white-space:nowrap;max-width:100%;background:#fff;border:1px solid #e1e4e5;font-size:75%;padding:0 5px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#e74c3c;overflow-x:auto}.rst-content tt.code-large,code.code-large{font-size:90%}.rst-content .section ul,.rst-content .toctree-wrapper ul,.rst-content section ul,.wy-plain-list-disc,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.rst-content .section ul li,.rst-content .toctree-wrapper ul li,.rst-content section ul li,.wy-plain-list-disc li,article ul li{list-style:disc;margin-left:24px}.rst-content .section ul li p:last-child,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li p:last-child,.rst-content .toctree-wrapper ul li ul,.rst-content section ul li p:last-child,.rst-content section ul li ul,.wy-plain-list-disc li p:last-child,.wy-plain-list-disc li ul,article ul li p:last-child,article ul li ul{margin-bottom:0}.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,.rst-content section ul li li,.wy-plain-list-disc li li,article ul li li{list-style:circle}.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,.rst-content section ul li li li,.wy-plain-list-disc li li li,article ul li li li{list-style:square}.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,.rst-content section ul li ol li,.wy-plain-list-disc li ol li,article ul li ol li{list-style:decimal}.rst-content .section ol,.rst-content .section ol.arabic,.rst-content .toctree-wrapper ol,.rst-content .toctree-wrapper ol.arabic,.rst-content 
section ol,.rst-content section ol.arabic,.wy-plain-list-decimal,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.rst-content .section ol.arabic li,.rst-content .section ol li,.rst-content .toctree-wrapper ol.arabic li,.rst-content .toctree-wrapper ol li,.rst-content section ol.arabic li,.rst-content section ol li,.wy-plain-list-decimal li,article ol li{list-style:decimal;margin-left:24px}.rst-content .section ol.arabic li ul,.rst-content .section ol li p:last-child,.rst-content .section ol li ul,.rst-content .toctree-wrapper ol.arabic li ul,.rst-content .toctree-wrapper ol li p:last-child,.rst-content .toctree-wrapper ol li ul,.rst-content section ol.arabic li ul,.rst-content section ol li p:last-child,.rst-content section ol li ul,.wy-plain-list-decimal li p:last-child,.wy-plain-list-decimal li ul,article ol li p:last-child,article ol li ul{margin-bottom:0}.rst-content .section ol.arabic li ul li,.rst-content .section ol li ul li,.rst-content .toctree-wrapper ol.arabic li ul li,.rst-content .toctree-wrapper ol li ul li,.rst-content section ol.arabic li ul li,.rst-content section ol li ul li,.wy-plain-list-decimal li ul li,article ol li ul li{list-style:disc}.wy-breadcrumbs{*zoom:1}.wy-breadcrumbs:after,.wy-breadcrumbs:before{display:table;content:""}.wy-breadcrumbs:after{clear:both}.wy-breadcrumbs>li{display:inline-block;padding-top:5px}.wy-breadcrumbs>li.wy-breadcrumbs-aside{float:right}.rst-content .wy-breadcrumbs>li code,.rst-content .wy-breadcrumbs>li tt,.wy-breadcrumbs>li .rst-content tt,.wy-breadcrumbs>li code{all:inherit;color:inherit}.breadcrumb-item:before{content:"/";color:#bbb;font-size:13px;padding:0 6px 0 3px}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width:480px){.wy-breadcrumbs-extra,.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media print{.wy-breadcrumbs 
li.wy-breadcrumbs-aside{display:none}}html{font-size:16px}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:after,.wy-menu-horiz:before{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz li,.wy-menu-horiz ul{display:inline-block}.wy-menu-horiz li:hover{background:hsla(0,0%,100%,.1)}.wy-menu-horiz li.divide-left{border-left:1px solid #404040}.wy-menu-horiz li.divide-right{border-right:1px solid #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical{width:300px}.wy-menu-vertical header,.wy-menu-vertical p.caption{color:#55a5d9;height:32px;line-height:32px;padding:0 1.618em;margin:12px 0 0;display:block;font-weight:700;text-transform:uppercase;font-size:85%;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:1px solid #404040}.wy-menu-vertical li.divide-bottom{border-bottom:1px solid #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:grey;border-right:1px solid #c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.rst-content .wy-menu-vertical li tt,.wy-menu-vertical li .rst-content tt,.wy-menu-vertical li code{border:none;background:inherit;color:inherit;padding-left:0;padding-right:0}.wy-menu-vertical li button.toctree-expand{display:block;float:left;margin-left:-1.2em;line-height:18px;color:#4d4d4d;border:none;background:none;padding:0}.wy-menu-vertical li.current>a,.wy-menu-vertical li.on a{color:#404040;font-weight:700;position:relative;background:#fcfcfc;border:none;padding:.4045em 1.618em}.wy-menu-vertical li.current>a:hover,.wy-menu-vertical li.on a:hover{background:#fcfcfc}.wy-menu-vertical li.current>a:hover button.toctree-expand,.wy-menu-vertical li.on a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a 
button.toctree-expand{display:block;line-height:18px;color:#333}.wy-menu-vertical li.toctree-l1.current>a{border-bottom:1px solid #c9c9c9;border-top:1px solid #c9c9c9}.wy-menu-vertical .toctree-l1.current .toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .toctree-l11>ul{display:none}.wy-menu-vertical .toctree-l1.current .current.toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .current.toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .current.toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .current.toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .current.toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .current.toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .current.toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .current.toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .current.toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .current.toctree-l11>ul{display:block}.wy-menu-vertical li.toctree-l3,.wy-menu-vertical li.toctree-l4{font-size:.9em}.wy-menu-vertical li.toctree-l2 a,.wy-menu-vertical li.toctree-l3 a,.wy-menu-vertical li.toctree-l4 a,.wy-menu-vertical li.toctree-l5 a,.wy-menu-vertical li.toctree-l6 a,.wy-menu-vertical li.toctree-l7 a,.wy-menu-vertical li.toctree-l8 a,.wy-menu-vertical li.toctree-l9 a,.wy-menu-vertical li.toctree-l10 a{color:#404040}.wy-menu-vertical li.toctree-l2 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l3 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l4 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l5 a:hover 
button.toctree-expand,.wy-menu-vertical li.toctree-l6 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l7 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l8 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l9 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l10 a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a,.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a,.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a,.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a,.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a,.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a,.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a,.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{display:block}.wy-menu-vertical li.toctree-l2.current>a{padding:.4045em 2.427em}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{padding:.4045em 1.618em .4045em 4.045em}.wy-menu-vertical li.toctree-l3.current>a{padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{padding:.4045em 1.618em .4045em 5.663em}.wy-menu-vertical li.toctree-l4.current>a{padding:.4045em 5.663em}.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a{padding:.4045em 1.618em .4045em 7.281em}.wy-menu-vertical li.toctree-l5.current>a{padding:.4045em 7.281em}.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a{padding:.4045em 1.618em .4045em 8.899em}.wy-menu-vertical li.toctree-l6.current>a{padding:.4045em 8.899em}.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a{padding:.4045em 1.618em .4045em 10.517em}.wy-menu-vertical li.toctree-l7.current>a{padding:.4045em 10.517em}.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a{padding:.4045em 1.618em .4045em 12.135em}.wy-menu-vertical li.toctree-l8.current>a{padding:.4045em 12.135em}.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a{padding:.4045em 1.618em .4045em 
13.753em}.wy-menu-vertical li.toctree-l9.current>a{padding:.4045em 13.753em}.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a{padding:.4045em 1.618em .4045em 15.371em}.wy-menu-vertical li.toctree-l10.current>a{padding:.4045em 15.371em}.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{padding:.4045em 1.618em .4045em 16.989em}.wy-menu-vertical li.toctree-l2.current>a,.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{background:#c9c9c9}.wy-menu-vertical li.toctree-l2 button.toctree-expand{color:#a3a3a3}.wy-menu-vertical li.toctree-l3.current>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{background:#bdbdbd}.wy-menu-vertical li.toctree-l3 button.toctree-expand{color:#969696}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical li ul li a{margin-bottom:0;color:#d9d9d9;font-weight:400}.wy-menu-vertical a{line-height:18px;padding:.4045em 1.618em;display:block;position:relative;font-size:90%;color:#d9d9d9}.wy-menu-vertical a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:hover button.toctree-expand{color:#d9d9d9}.wy-menu-vertical a:active{background-color:#2980b9;cursor:pointer;color:#fff}.wy-menu-vertical a:active button.toctree-expand{color:#fff}.wy-side-nav-search{display:block;width:300px;padding:.809em;margin-bottom:.809em;z-index:200;background-color:#2980b9;text-align:center;color:#fcfcfc}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto .809em;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-side-nav-search .wy-dropdown>a,.wy-side-nav-search>a{color:#fcfcfc;font-size:100%;font-weight:700;display:inline-block;padding:4px 6px;margin-bottom:.809em;max-width:100%}.wy-side-nav-search .wy-dropdown>a:hover,.wy-side-nav-search>a:hover{background:hsla(0,0%,100%,.1)}.wy-side-nav-search .wy-dropdown>a 
img.logo,.wy-side-nav-search>a img.logo{display:block;margin:0 auto;height:auto;width:auto;border-radius:0;max-width:100%;background:transparent}.wy-side-nav-search .wy-dropdown>a.icon img.logo,.wy-side-nav-search>a.icon img.logo{margin-top:.85em}.wy-side-nav-search>div.version{margin-top:-.4045em;margin-bottom:.809em;font-weight:400;color:hsla(0,0%,100%,.3)}.wy-nav .wy-menu-vertical header{color:#2980b9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980b9;color:#fff}[data-menu-wrap]{-webkit-transition:all .2s ease-in;-moz-transition:all .2s ease-in;transition:all .2s ease-in;position:absolute;opacity:1;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:#fcfcfc}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:fixed;top:0;bottom:0;left:0;padding-bottom:2em;width:300px;overflow-x:hidden;overflow-y:hidden;min-height:100%;color:#9b9b9b;background:#343131;z-index:200}.wy-side-scroll{width:320px;position:relative;overflow-x:hidden;overflow-y:scroll;height:100%}.wy-nav-top{display:none;background:#2980b9;color:#fff;padding:.4045em .809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:after,.wy-nav-top:before{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:700}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer;padding-top:inherit}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 
3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:grey}footer p{margin-bottom:12px}.rst-content footer span.commit tt,footer span.commit .rst-content tt,footer span.commit code{padding:0;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:1em;background:none;border:none;color:grey}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:after,.rst-footer-buttons:before{width:100%;display:table;content:""}.rst-footer-buttons:after{clear:both}.rst-breadcrumbs-buttons{margin-top:12px;*zoom:1}.rst-breadcrumbs-buttons:after,.rst-breadcrumbs-buttons:before{display:table;content:""}.rst-breadcrumbs-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:1px solid #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:1px solid #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:grey;font-size:90%}.genindextable li>ul{margin-left:24px}@media screen and (max-width:768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-menu.wy-menu-vertical,.wy-side-nav-search,.wy-side-scroll{width:auto}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width:1100px){.wy-nav-content-wrap{background:rgba(0,0,0,.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media print{.rst-versions,.wy-nav-side,footer{display:none}.wy-nav-content-wrap{margin-left:0}}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions 
a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60;*zoom:1}.rst-versions .rst-current-version:after,.rst-versions .rst-current-version:before{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-content .code-block-caption .rst-versions .rst-current-version .headerlink,.rst-content .eqno .rst-versions .rst-current-version .headerlink,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-content p .rst-versions .rst-current-version .headerlink,.rst-content table>caption .rst-versions .rst-current-version .headerlink,.rst-content tt.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .icon,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-versions .rst-current-version .rst-content .code-block-caption .headerlink,.rst-versions .rst-current-version .rst-content .eqno .headerlink,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-versions 
.rst-current-version .rst-content h4 .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-versions .rst-current-version .rst-content p .headerlink,.rst-versions .rst-current-version .rst-content table>caption .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-versions .rst-current-version .wy-menu-vertical li button.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version button.toctree-expand{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content 
.toctree-wrapper>p.caption,.rst-content h1,.rst-content h2,.rst-content h3,.rst-content h4,.rst-content h5,.rst-content h6{margin-bottom:24px}.rst-content img{max-width:100%;height:auto}.rst-content div.figure,.rst-content figure{margin-bottom:24px}.rst-content div.figure .caption-text,.rst-content figure .caption-text{font-style:italic}.rst-content div.figure p:last-child.caption,.rst-content figure p:last-child.caption{margin-bottom:0}.rst-content div.figure.align-center,.rst-content figure.align-center{text-align:center}.rst-content .section>a>img,.rst-content .section>img,.rst-content section>a>img,.rst-content section>img{margin-bottom:24px}.rst-content abbr[title]{text-decoration:none}.rst-content.style-external-links a.reference.external:after{font-family:FontAwesome;content:"\f08e";color:#b3b3b3;vertical-align:super;font-size:60%;margin:0 .2em}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content pre.literal-block{white-space:pre;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;display:block;overflow:auto}.rst-content div[class^=highlight],.rst-content pre.literal-block{border:1px solid #e1e4e5;overflow-x:auto;margin:1px 0 24px}.rst-content div[class^=highlight] div[class^=highlight],.rst-content pre.literal-block div[class^=highlight]{padding:0;border:none;margin:0}.rst-content div[class^=highlight] td.code{width:100%}.rst-content .linenodiv pre{border-right:1px solid #e6e9ea;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;user-select:none;pointer-events:none}.rst-content div[class^=highlight] pre{white-space:pre;margin:0;padding:12px;display:block;overflow:auto}.rst-content div[class^=highlight] pre .hll{display:block;margin:0 -12px;padding:0 12px}.rst-content .linenodiv pre,.rst-content div[class^=highlight] pre,.rst-content 
pre.literal-block{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:12px;line-height:1.4}.rst-content div.highlight .gp,.rst-content div.highlight span.linenos{user-select:none;pointer-events:none}.rst-content div.highlight span.linenos{display:inline-block;padding-left:0;padding-right:12px;margin-right:12px;border-right:1px solid #e6e9ea}.rst-content .code-block-caption{font-style:italic;font-size:85%;line-height:1;padding:1em 0;text-align:center}@media print{.rst-content .codeblock,.rst-content div[class^=highlight],.rst-content div[class^=highlight] pre{white-space:pre-wrap}}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning{clear:both}.rst-content .admonition-todo .last,.rst-content .admonition-todo>:last-child,.rst-content .admonition .last,.rst-content .admonition>:last-child,.rst-content .attention .last,.rst-content .attention>:last-child,.rst-content .caution .last,.rst-content .caution>:last-child,.rst-content .danger .last,.rst-content .danger>:last-child,.rst-content .error .last,.rst-content .error>:last-child,.rst-content .hint .last,.rst-content .hint>:last-child,.rst-content .important .last,.rst-content .important>:last-child,.rst-content .note .last,.rst-content .note>:last-child,.rst-content .seealso .last,.rst-content .seealso>:last-child,.rst-content .tip .last,.rst-content .tip>:last-child,.rst-content .warning .last,.rst-content .warning>:last-child{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent!important;border-color:rgba(0,0,0,.1)!important}.rst-content .section ol.loweralpha,.rst-content .section 
ol.loweralpha>li,.rst-content .toctree-wrapper ol.loweralpha,.rst-content .toctree-wrapper ol.loweralpha>li,.rst-content section ol.loweralpha,.rst-content section ol.loweralpha>li{list-style:lower-alpha}.rst-content .section ol.upperalpha,.rst-content .section ol.upperalpha>li,.rst-content .toctree-wrapper ol.upperalpha,.rst-content .toctree-wrapper ol.upperalpha>li,.rst-content section ol.upperalpha,.rst-content section ol.upperalpha>li{list-style:upper-alpha}.rst-content .section ol li>*,.rst-content .section ul li>*,.rst-content .toctree-wrapper ol li>*,.rst-content .toctree-wrapper ul li>*,.rst-content section ol li>*,.rst-content section ul li>*{margin-top:12px;margin-bottom:12px}.rst-content .section ol li>:first-child,.rst-content .section ul li>:first-child,.rst-content .toctree-wrapper ol li>:first-child,.rst-content .toctree-wrapper ul li>:first-child,.rst-content section ol li>:first-child,.rst-content section ul li>:first-child{margin-top:0}.rst-content .section ol li>p,.rst-content .section ol li>p:last-child,.rst-content .section ul li>p,.rst-content .section ul li>p:last-child,.rst-content .toctree-wrapper ol li>p,.rst-content .toctree-wrapper ol li>p:last-child,.rst-content .toctree-wrapper ul li>p,.rst-content .toctree-wrapper ul li>p:last-child,.rst-content section ol li>p,.rst-content section ol li>p:last-child,.rst-content section ul li>p,.rst-content section ul li>p:last-child{margin-bottom:12px}.rst-content .section ol li>p:only-child,.rst-content .section ol li>p:only-child:last-child,.rst-content .section ul li>p:only-child,.rst-content .section ul li>p:only-child:last-child,.rst-content .toctree-wrapper ol li>p:only-child,.rst-content .toctree-wrapper ol li>p:only-child:last-child,.rst-content .toctree-wrapper ul li>p:only-child,.rst-content .toctree-wrapper ul li>p:only-child:last-child,.rst-content section ol li>p:only-child,.rst-content section ol li>p:only-child:last-child,.rst-content section ul li>p:only-child,.rst-content section ul 
li>p:only-child:last-child{margin-bottom:0}.rst-content .section ol li>ol,.rst-content .section ol li>ul,.rst-content .section ul li>ol,.rst-content .section ul li>ul,.rst-content .toctree-wrapper ol li>ol,.rst-content .toctree-wrapper ol li>ul,.rst-content .toctree-wrapper ul li>ol,.rst-content .toctree-wrapper ul li>ul,.rst-content section ol li>ol,.rst-content section ol li>ul,.rst-content section ul li>ol,.rst-content section ul li>ul{margin-bottom:12px}.rst-content .section ol.simple li>*,.rst-content .section ol.simple li ol,.rst-content .section ol.simple li ul,.rst-content .section ul.simple li>*,.rst-content .section ul.simple li ol,.rst-content .section ul.simple li ul,.rst-content .toctree-wrapper ol.simple li>*,.rst-content .toctree-wrapper ol.simple li ol,.rst-content .toctree-wrapper ol.simple li ul,.rst-content .toctree-wrapper ul.simple li>*,.rst-content .toctree-wrapper ul.simple li ol,.rst-content .toctree-wrapper ul.simple li ul,.rst-content section ol.simple li>*,.rst-content section ol.simple li ol,.rst-content section ol.simple li ul,.rst-content section ul.simple li>*,.rst-content section ul.simple li ol,.rst-content section ul.simple li ul{margin-top:0;margin-bottom:0}.rst-content .line-block{margin-left:0;margin-bottom:24px;line-height:24px}.rst-content .line-block .line-block{margin-left:24px;margin-bottom:0}.rst-content .topic-title{font-weight:700;margin-bottom:12px}.rst-content .toc-backref{color:#404040}.rst-content .align-right{float:right;margin:0 0 24px 24px}.rst-content .align-left{float:left;margin:0 24px 24px 0}.rst-content .align-center{margin:auto}.rst-content .align-center:not(table){display:block}.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 
.headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink{opacity:0;font-size:14px;font-family:FontAwesome;margin-left:.5em}.rst-content .code-block-caption .headerlink:focus,.rst-content .code-block-caption:hover .headerlink,.rst-content .eqno .headerlink:focus,.rst-content .eqno:hover .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink:focus,.rst-content .toctree-wrapper>p.caption:hover .headerlink,.rst-content dl dt .headerlink:focus,.rst-content dl dt:hover .headerlink,.rst-content h1 .headerlink:focus,.rst-content h1:hover .headerlink,.rst-content h2 .headerlink:focus,.rst-content h2:hover .headerlink,.rst-content h3 .headerlink:focus,.rst-content h3:hover .headerlink,.rst-content h4 .headerlink:focus,.rst-content h4:hover .headerlink,.rst-content h5 .headerlink:focus,.rst-content h5:hover .headerlink,.rst-content h6 .headerlink:focus,.rst-content h6:hover .headerlink,.rst-content p.caption .headerlink:focus,.rst-content p.caption:hover .headerlink,.rst-content p .headerlink:focus,.rst-content p:hover .headerlink,.rst-content table>caption .headerlink:focus,.rst-content table>caption:hover .headerlink{opacity:1}.rst-content p a{overflow-wrap:anywhere}.rst-content .wy-table td p,.rst-content .wy-table td ul,.rst-content .wy-table th p,.rst-content .wy-table th ul,.rst-content table.docutils td p,.rst-content table.docutils td ul,.rst-content table.docutils th p,.rst-content table.docutils th ul,.rst-content table.field-list td p,.rst-content table.field-list td ul,.rst-content table.field-list th p,.rst-content table.field-list th ul{font-size:inherit}.rst-content .btn:focus{outline:2px solid}.rst-content table>caption .headerlink:after{font-size:12px}.rst-content .centered{text-align:center}.rst-content .sidebar{float:right;width:40%;display:block;margin:0 0 24px 24px;padding:24px;background:#f3f6f6;border:1px solid #e1e4e5}.rst-content .sidebar dl,.rst-content .sidebar p,.rst-content .sidebar 
ul{font-size:90%}.rst-content .sidebar .last,.rst-content .sidebar>:last-child{margin-bottom:0}.rst-content .sidebar .sidebar-title{display:block;font-family:Roboto Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif;font-weight:700;background:#e1e4e5;padding:6px 12px;margin:-24px -24px 24px;font-size:100%}.rst-content .highlighted{background:#f1c40f;box-shadow:0 0 0 2px #f1c40f;display:inline;font-weight:700}.rst-content .citation-reference,.rst-content .footnote-reference{vertical-align:baseline;position:relative;top:-.4em;line-height:0;font-size:90%}.rst-content .citation-reference>span.fn-bracket,.rst-content .footnote-reference>span.fn-bracket{display:none}.rst-content .hlist{width:100%}.rst-content dl dt span.classifier:before{content:" : "}.rst-content dl dt span.classifier-delimiter{display:none!important}html.writer-html4 .rst-content table.docutils.citation,html.writer-html4 .rst-content table.docutils.footnote{background:none;border:none}html.writer-html4 .rst-content table.docutils.citation td,html.writer-html4 .rst-content table.docutils.citation tr,html.writer-html4 .rst-content table.docutils.footnote td,html.writer-html4 .rst-content table.docutils.footnote tr{border:none;background-color:transparent!important;white-space:normal}html.writer-html4 .rst-content table.docutils.citation td.label,html.writer-html4 .rst-content table.docutils.footnote td.label{padding-left:0;padding-right:0;vertical-align:top}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{display:grid;grid-template-columns:auto minmax(80%,95%)}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{display:inline-grid;grid-template-columns:max-content auto}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{display:grid;grid-template-columns:auto 
auto minmax(.65rem,auto) minmax(40%,95%)}html.writer-html5 .rst-content aside.citation>span.label,html.writer-html5 .rst-content aside.footnote>span.label,html.writer-html5 .rst-content div.citation>span.label{grid-column-start:1;grid-column-end:2}html.writer-html5 .rst-content aside.citation>span.backrefs,html.writer-html5 .rst-content aside.footnote>span.backrefs,html.writer-html5 .rst-content div.citation>span.backrefs{grid-column-start:2;grid-column-end:3;grid-row-start:1;grid-row-end:3}html.writer-html5 .rst-content aside.citation>p,html.writer-html5 .rst-content aside.footnote>p,html.writer-html5 .rst-content div.citation>p{grid-column-start:4;grid-column-end:5}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{margin-bottom:24px}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{padding-left:1rem}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dd,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dd,html.writer-html5 .rst-content dl.footnote>dt{margin-bottom:0}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{font-size:.9rem}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.footnote>dt{margin:0 .5rem .5rem 0;line-height:1.2rem;word-break:break-all;font-weight:400}html.writer-html5 .rst-content dl.citation>dt>span.brackets:before,html.writer-html5 .rst-content dl.footnote>dt>span.brackets:before{content:"["}html.writer-html5 .rst-content dl.citation>dt>span.brackets:after,html.writer-html5 .rst-content dl.footnote>dt>span.brackets:after{content:"]"}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref,html.writer-html5 .rst-content 
dl.footnote>dt>span.fn-backref{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a{word-break:keep-all}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a:not(:first-child):before,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.footnote>dd{margin:0 0 .5rem;line-height:1.2rem}html.writer-html5 .rst-content dl.citation>dd p,html.writer-html5 .rst-content dl.footnote>dd p{font-size:.9rem}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{padding-left:1rem;padding-right:1rem;font-size:.9rem;line-height:1.2rem}html.writer-html5 .rst-content aside.citation p,html.writer-html5 .rst-content aside.footnote p,html.writer-html5 .rst-content div.citation p{font-size:.9rem;line-height:1.2rem;margin-bottom:12px}html.writer-html5 .rst-content aside.citation span.backrefs,html.writer-html5 .rst-content aside.footnote span.backrefs,html.writer-html5 .rst-content div.citation span.backrefs{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content aside.citation span.backrefs>a,html.writer-html5 .rst-content aside.footnote span.backrefs>a,html.writer-html5 .rst-content div.citation span.backrefs>a{word-break:keep-all}html.writer-html5 .rst-content aside.citation span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content aside.footnote span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content div.citation span.backrefs>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content aside.citation span.label,html.writer-html5 .rst-content aside.footnote 
span.label,html.writer-html5 .rst-content div.citation span.label{line-height:1.2rem}html.writer-html5 .rst-content aside.citation-list,html.writer-html5 .rst-content aside.footnote-list,html.writer-html5 .rst-content div.citation-list{margin-bottom:24px}html.writer-html5 .rst-content dl.option-list kbd{font-size:.9rem}.rst-content table.docutils.footnote,html.writer-html4 .rst-content table.docutils.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content aside.footnote-list aside.footnote,html.writer-html5 .rst-content div.citation-list>div.citation,html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{color:grey}.rst-content table.docutils.footnote code,.rst-content table.docutils.footnote tt,html.writer-html4 .rst-content table.docutils.citation code,html.writer-html4 .rst-content table.docutils.citation tt,html.writer-html5 .rst-content aside.footnote-list aside.footnote code,html.writer-html5 .rst-content aside.footnote-list aside.footnote tt,html.writer-html5 .rst-content aside.footnote code,html.writer-html5 .rst-content aside.footnote tt,html.writer-html5 .rst-content div.citation-list>div.citation code,html.writer-html5 .rst-content div.citation-list>div.citation tt,html.writer-html5 .rst-content dl.citation code,html.writer-html5 .rst-content dl.citation tt,html.writer-html5 .rst-content dl.footnote code,html.writer-html5 .rst-content dl.footnote tt{color:#555}.rst-content .wy-table-responsive.citation,.rst-content .wy-table-responsive.footnote{margin-bottom:0}.rst-content .wy-table-responsive.citation+:not(.citation),.rst-content .wy-table-responsive.footnote+:not(.footnote){margin-top:24px}.rst-content .wy-table-responsive.citation:last-child,.rst-content .wy-table-responsive.footnote:last-child{margin-bottom:24px}.rst-content table.docutils th{border-color:#e1e4e5}html.writer-html5 .rst-content table.docutils th{border:1px solid #e1e4e5}html.writer-html5 .rst-content table.docutils 
td>p,html.writer-html5 .rst-content table.docutils th>p{line-height:1rem;margin-bottom:0;font-size:.9rem}.rst-content table.docutils td .last,.rst-content table.docutils td .last>:last-child{margin-bottom:0}.rst-content table.field-list,.rst-content table.field-list td{border:none}.rst-content table.field-list td p{line-height:inherit}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content code,.rst-content tt{color:#000;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;padding:2px 5px}.rst-content code big,.rst-content code em,.rst-content tt big,.rst-content tt em{font-size:100%!important;line-height:normal}.rst-content code.literal,.rst-content tt.literal{color:#e74c3c;white-space:normal}.rst-content code.xref,.rst-content tt.xref,a .rst-content code,a .rst-content tt{font-weight:700;color:#404040;overflow-wrap:normal}.rst-content kbd,.rst-content pre,.rst-content samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace}.rst-content a code,.rst-content a tt{color:#2980b9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:700;margin-bottom:12px}.rst-content dl ol,.rst-content dl p,.rst-content dl table,.rst-content dl ul{margin-bottom:12px}.rst-content dl dd{margin:0 0 12px 24px;line-height:24px}.rst-content dl dd>ol:last-child,.rst-content dl dd>p:last-child,.rst-content dl dd>table:last-child,.rst-content dl dd>ul:last-child{margin-bottom:0}html.writer-html4 .rst-content dl:not(.docutils),html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple){margin-bottom:24px}html.writer-html4 .rst-content dl:not(.docutils)>dt,html.writer-html5 .rst-content 
dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{display:table;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980b9;border-top:3px solid #6ab0de;padding:6px;position:relative}html.writer-html4 .rst-content dl:not(.docutils)>dt:before,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:before{color:#6ab0de}html.writer-html4 .rst-content dl:not(.docutils)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{margin-bottom:6px;border:none;border-left:3px solid #ccc;background:#f0f0f0;color:#555}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils)>dt:first-child,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:first-child{margin-top:0}html.writer-html4 .rst-content dl:not(.docutils) code.descclassname,html.writer-html4 .rst-content dl:not(.docutils) 
code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descclassname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{background-color:transparent;border:none;padding:0;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .optional,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .property,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .property{display:inline-block;padding-right:8px;max-width:100%}html.writer-html4 .rst-content dl:not(.docutils) .k,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .k{font-style:italic}html.writer-html4 
.rst-content dl:not(.docutils) .descclassname,html.writer-html4 .rst-content dl:not(.docutils) .descname,html.writer-html4 .rst-content dl:not(.docutils) .sig-name,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .sig-name{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#000}.rst-content .viewcode-back,.rst-content .viewcode-link{display:inline-block;color:#27ae60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:700}.rst-content code.download,.rst-content tt.download{background:inherit;padding:inherit;font-weight:400;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content code.download span:first-child,.rst-content tt.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content code.download span:first-child:before,.rst-content tt.download span:first-child:before{margin-right:4px}.rst-content .guilabel,.rst-content .menuselection{font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 6px;margin:auto 2px}.rst-content .guilabel,.rst-content .menuselection{border:1px solid #7fbbe3;background:#e7f2fa}.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>.kbd,.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>kbd{color:inherit;font-size:80%;background-color:#fff;border:1px solid #a6a6a6;border-radius:4px;box-shadow:0 2px grey;padding:2.4px 6px;margin:auto 0}.rst-content .versionmodified{font-style:italic}@media screen and (max-width:480px){.rst-content 
.sidebar{width:100%}}span[id*=MathJax-Span]{color:#404040}.math{text-align:center}@font-face{font-family:Lato;src:url(fonts/lato-normal.woff2?bd03a2cc277bbbc338d464e679fe9942) format("woff2"),url(fonts/lato-normal.woff?27bd77b9162d388cb8d4c4217c7c5e2a) format("woff");font-weight:400;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold.woff2?cccb897485813c7c256901dbca54ecf2) format("woff2"),url(fonts/lato-bold.woff?d878b6c29b10beca227e9eef4246111b) format("woff");font-weight:700;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold-italic.woff2?0b6bb6725576b072c5d0b02ecdd1900d) format("woff2"),url(fonts/lato-bold-italic.woff?9c7e4e9eb485b4a121c760e61bc3707c) format("woff");font-weight:700;font-style:italic;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-normal-italic.woff2?4eb103b4d12be57cb1d040ed5e162e9d) format("woff2"),url(fonts/lato-normal-italic.woff?f28f2d6482446544ef1ea1ccc6dd5892) format("woff");font-weight:400;font-style:italic;font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:400;src:url(fonts/Roboto-Slab-Regular.woff2?7abf5b8d04d26a2cafea937019bca958) format("woff2"),url(fonts/Roboto-Slab-Regular.woff?c1be9284088d487c5e3ff0a10a92e58c) format("woff");font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:700;src:url(fonts/Roboto-Slab-Bold.woff2?9984f4a9bda09be08e83f2506954adbe) format("woff2"),url(fonts/Roboto-Slab-Bold.woff?bed5564a116b05148e3b3bea6fb1162a) format("woff");font-display:block} \ No newline at end of file diff --git a/refs/pull/405/merge/_static/doctools.js b/refs/pull/405/merge/_static/doctools.js new file mode 100644 index 00000000..0c15c009 --- /dev/null +++ b/refs/pull/405/merge/_static/doctools.js @@ -0,0 +1,311 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Sphinx JavaScript utilities for all documentation. + * + * :copyright: Copyright 2007-2018 by the Sphinx team, see AUTHORS. 
+ * :license: BSD, see LICENSE for details. + * + */ + +/** + * select a different prefix for underscore + */ +$u = _.noConflict(); + +/** + * make the code below compatible with browsers without + * an installed firebug like debugger +if (!window.console || !console.firebug) { + var names = ["log", "debug", "info", "warn", "error", "assert", "dir", + "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", + "profile", "profileEnd"]; + window.console = {}; + for (var i = 0; i < names.length; ++i) + window.console[names[i]] = function() {}; +} + */ + +/** + * small helper function to urldecode strings + */ +jQuery.urldecode = function(x) { + return decodeURIComponent(x).replace(/\+/g, ' '); +}; + +/** + * small helper function to urlencode strings + */ +jQuery.urlencode = encodeURIComponent; + +/** + * This function returns the parsed url parameters of the + * current request. Multiple values per key are supported, + * it will always return arrays of strings for the value parts. + */ +jQuery.getQueryParameters = function(s) { + if (typeof s === 'undefined') + s = document.location.search; + var parts = s.substr(s.indexOf('?') + 1).split('&'); + var result = {}; + for (var i = 0; i < parts.length; i++) { + var tmp = parts[i].split('=', 2); + var key = jQuery.urldecode(tmp[0]); + var value = jQuery.urldecode(tmp[1]); + if (key in result) + result[key].push(value); + else + result[key] = [value]; + } + return result; +}; + +/** + * highlight a given string on a jquery object by wrapping it in + * span elements with the given class name. 
+ */ +jQuery.fn.highlightText = function(text, className) { + function highlight(node, addItems) { + if (node.nodeType === 3) { + var val = node.nodeValue; + var pos = val.toLowerCase().indexOf(text); + if (pos >= 0 && !jQuery(node.parentNode).hasClass(className)) { + var span; + var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.className = className; + } + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + node.parentNode.insertBefore(span, node.parentNode.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling)); + node.nodeValue = val.substr(0, pos); + if (isInSVG) { + var bbox = span.getBBox(); + var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute('class', className); + var parentOfText = node.parentNode.parentNode; + addItems.push({ + "parent": node.parentNode, + "target": rect}); + } + } + } + else if (!jQuery(node).is("button, select, textarea")) { + jQuery.each(node.childNodes, function() { + highlight(this, addItems); + }); + } + } + var addItems = []; + var result = this.each(function() { + highlight(this, addItems); + }); + for (var i = 0; i < addItems.length; ++i) { + jQuery(addItems[i].parent).before(addItems[i].target); + } + return result; +}; + +/* + * backward compatibility for jQuery.browser + * This will be supported until firefox bug is fixed. 
+ */ +if (!jQuery.browser) { + jQuery.uaMatch = function(ua) { + ua = ua.toLowerCase(); + + var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || + /(webkit)[ \/]([\w.]+)/.exec(ua) || + /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || + /(msie) ([\w.]+)/.exec(ua) || + ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || + []; + + return { + browser: match[ 1 ] || "", + version: match[ 2 ] || "0" + }; + }; + jQuery.browser = {}; + jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; +} + +/** + * Small JavaScript module for the documentation. + */ +var Documentation = { + + init : function() { + this.fixFirefoxAnchorBug(); + this.highlightSearchWords(); + this.initIndexTable(); + + }, + + /** + * i18n support + */ + TRANSLATIONS : {}, + PLURAL_EXPR : function(n) { return n === 1 ? 0 : 1; }, + LOCALE : 'unknown', + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext : function(string) { + var translated = Documentation.TRANSLATIONS[string]; + if (typeof translated === 'undefined') + return string; + return (typeof translated === 'string') ? translated : translated[0]; + }, + + ngettext : function(singular, plural, n) { + var translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated === 'undefined') + return (n == 1) ? singular : plural; + return translated[Documentation.PLURALEXPR(n)]; + }, + + addTranslations : function(catalog) { + for (var key in catalog.messages) + this.TRANSLATIONS[key] = catalog.messages[key]; + this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); + this.LOCALE = catalog.locale; + }, + + /** + * add context elements like header anchor links + */ + addContextElements : function() { + $('div[id] > :header:first').each(function() { + $('\u00B6'). + attr('href', '#' + this.id). + attr('title', _('Permalink to this headline')). 
+ appendTo(this); + }); + $('dt[id]').each(function() { + $('\u00B6'). + attr('href', '#' + this.id). + attr('title', _('Permalink to this definition')). + appendTo(this); + }); + }, + + /** + * workaround a firefox stupidity + * see: https://bugzilla.mozilla.org/show_bug.cgi?id=645075 + */ + fixFirefoxAnchorBug : function() { + if (document.location.hash && $.browser.mozilla) + window.setTimeout(function() { + document.location.href += ''; + }, 10); + }, + + /** + * highlight the search words provided in the url in the text + */ + highlightSearchWords : function() { + var params = $.getQueryParameters(); + var terms = (params.highlight) ? params.highlight[0].split(/\s+/) : []; + if (terms.length) { + var body = $('div.body'); + if (!body.length) { + body = $('body'); + } + window.setTimeout(function() { + $.each(terms, function() { + body.highlightText(this.toLowerCase(), 'highlighted'); + }); + }, 10); + $('') + .appendTo($('#searchbox')); + } + }, + + /** + * init the domain index toggle buttons + */ + initIndexTable : function() { + var togglers = $('img.toggler').click(function() { + var src = $(this).attr('src'); + var idnum = $(this).attr('id').substr(7); + $('tr.cg-' + idnum).toggle(); + if (src.substr(-9) === 'minus.png') + $(this).attr('src', src.substr(0, src.length-9) + 'plus.png'); + else + $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); + }).css('display', ''); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { + togglers.click(); + } + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords : function() { + $('#searchbox .highlight-link').fadeOut(300); + $('span.highlighted').removeClass('highlighted'); + }, + + /** + * make the url absolute + */ + makeURL : function(relativeURL) { + return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; + }, + + /** + * get the current relative url + */ + getCurrentURL : function() { + var path = document.location.pathname; + var parts = path.split(/\//); + 
$.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { + if (this === '..') + parts.pop(); + }); + var url = parts.join('/'); + return path.substring(url.lastIndexOf('/') + 1, path.length - 1); + }, + + initOnKeyListeners: function() { + $(document).keyup(function(event) { + var activeElementType = document.activeElement.tagName; + // don't navigate when in search box or textarea + if (activeElementType !== 'TEXTAREA' && activeElementType !== 'INPUT' && activeElementType !== 'SELECT') { + switch (event.keyCode) { + case 37: // left + var prevHref = $('link[rel="prev"]').prop('href'); + if (prevHref) { + window.location.href = prevHref; + return false; + } + case 39: // right + var nextHref = $('link[rel="next"]').prop('href'); + if (nextHref) { + window.location.href = nextHref; + return false; + } + } + } + }); + } +}; + +// quick alias for translations +_ = Documentation.gettext; + +$(document).ready(function() { + Documentation.init(); +}); \ No newline at end of file diff --git a/refs/pull/405/merge/_static/down-pressed.png b/refs/pull/405/merge/_static/down-pressed.png new file mode 100644 index 00000000..5756c8ca Binary files /dev/null and b/refs/pull/405/merge/_static/down-pressed.png differ diff --git a/refs/pull/405/merge/_static/down.png b/refs/pull/405/merge/_static/down.png new file mode 100644 index 00000000..1b3bdad2 Binary files /dev/null and b/refs/pull/405/merge/_static/down.png differ diff --git a/refs/pull/405/merge/_static/file.png b/refs/pull/405/merge/_static/file.png new file mode 100644 index 00000000..a858a410 Binary files /dev/null and b/refs/pull/405/merge/_static/file.png differ diff --git a/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.eot b/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.eot new file mode 100644 index 00000000..0a22d0cf Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.eot differ diff --git a/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.svg 
b/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.svg new file mode 100644 index 00000000..03cbdefb --- /dev/null +++ b/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.svgo newline at end of file diff --git a/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.ttf b/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.ttf new file mode 100644 index 00000000..b06166f7 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.ttf differ diff --git a/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.woff b/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.woff new file mode 100644 index 00000000..2f0b1618 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/droidsansmono-webfont.woff differ diff --git a/refs/pull/405/merge/_static/fonts/generator_config.txt b/refs/pull/405/merge/_static/fonts/generator_config.txt new file mode 100644 index 00000000..b3e2b92f --- /dev/null +++ b/refs/pull/405/merge/_static/fonts/generator_config.txt @@ -0,0 +1,5 @@ +# Font Squirrel Font-face Generator Configuration File +# Upload this file to the generator to recreate the settings +# you used to create these fonts. 
+ +{"mode":"optimal","formats":["ttf","woff","eotz"],"tt_instructor":"default","fix_vertical_metrics":"Y","fix_gasp":"xy","add_spaces":"Y","add_hyphens":"Y","fallback":"none","fallback_custom":"100","options_subset":"basic","subset_custom":"","subset_custom_range":"","css_stylesheet":"stylesheet.css","filename_suffix":"-webfont","emsquare":"2048","spacing_adjustment":"0"} \ No newline at end of file diff --git a/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.eot b/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.eot new file mode 100644 index 00000000..bf9e21ae Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.eot differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.svg b/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.svg new file mode 100644 index 00000000..ff52169a --- /dev/null +++ b/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.svg @@ -0,0 +1,244 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.ttf b/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.ttf new file mode 100644 index 00000000..3931cd3d Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.ttf differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.woff b/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.woff new file mode 100644 index 00000000..5a6305ee Binary files /dev/null and 
b/refs/pull/405/merge/_static/fonts/opensans-italic-webfont.woff differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-light-webfont.eot b/refs/pull/405/merge/_static/fonts/opensans-light-webfont.eot new file mode 100644 index 00000000..48414108 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-light-webfont.eot differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-light-webfont.svg b/refs/pull/405/merge/_static/fonts/opensans-light-webfont.svg new file mode 100644 index 00000000..45d9a051 --- /dev/null +++ b/refs/pull/405/merge/_static/fonts/opensans-light-webfont.svgo newline at end of file diff --git a/refs/pull/405/merge/_static/fonts/opensans-light-webfont.ttf b/refs/pull/405/merge/_static/fonts/opensans-light-webfont.ttf new file mode 100644 index 00000000..1218bd92 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-light-webfont.ttf differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-light-webfont.woff b/refs/pull/405/merge/_static/fonts/opensans-light-webfont.woff new file mode 100644 index 00000000..4dc04810 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-light-webfont.woff differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.eot b/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.eot new file mode 100644 index 00000000..7635c37b Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.eot differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.svg b/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.svg new file mode 100644 index 00000000..b99917fc --- /dev/null +++ b/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.svgo newline at end of file diff --git a/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.ttf b/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.ttf new file mode 100644 index 
00000000..ef7ae600 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.ttf differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.woff b/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.woff new file mode 100644 index 00000000..d70276ea Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-lightitalic-webfont.woff differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.eot b/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.eot new file mode 100644 index 00000000..dc6f2696 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.eot differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.svg b/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.svg new file mode 100644 index 00000000..8ea47510 --- /dev/null +++ b/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.svgo newline at end of file diff --git a/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.ttf b/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.ttf new file mode 100644 index 00000000..2cb04137 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.ttf differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.woff b/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.woff new file mode 100644 index 00000000..4fbd4914 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-regular-webfont.woff differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.eot b/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.eot new file mode 100644 index 00000000..2d786b7a Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.eot differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.svg 
b/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.svg new file mode 100644 index 00000000..5bf92e3d --- /dev/null +++ b/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.svgo newline at end of file diff --git a/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.ttf b/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.ttf new file mode 100644 index 00000000..8d502590 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.ttf differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.woff b/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.woff new file mode 100644 index 00000000..67c4b988 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-semibold-webfont.woff differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.eot b/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.eot new file mode 100644 index 00000000..377ca0e8 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.eot differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.svg b/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.svg new file mode 100644 index 00000000..47dfc0e0 --- /dev/null +++ b/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.svgo newline at end of file diff --git a/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.ttf b/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.ttf new file mode 100644 index 00000000..a48ee420 Binary files /dev/null and b/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.ttf differ diff --git a/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.woff b/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.woff new file mode 100644 index 00000000..e30d1f64 Binary files /dev/null and 
b/refs/pull/405/merge/_static/fonts/opensans-semibolditalic-webfont.woff differ diff --git a/refs/pull/405/merge/_static/fonts/stylesheet.css b/refs/pull/405/merge/_static/fonts/stylesheet.css new file mode 100644 index 00000000..648f40e3 --- /dev/null +++ b/refs/pull/405/merge/_static/fonts/stylesheet.css @@ -0,0 +1,105 @@ +/* Generated by Font Squirrel (http://www.fontsquirrel.com) on June 8, 2012 */ + + + +@font-face { + font-family: 'Open Sans Italic'; + src: url('opensans-italic-webfont.eot'); + src: url('opensans-italic-webfont.eot?#iefix') format('embedded-opentype'), + url('opensans-italic-webfont.woff') format('woff'), + url('opensans-italic-webfont.ttf') format('truetype'), + url('opensans-italic-webfont.svg#OpenSansItalic') format('svg'); + font-weight: normal; + font-style: normal; + +} + + + + +@font-face { + font-family: 'Open Sans'; + src: url('opensans-regular-webfont.eot'); + src: url('opensans-regular-webfont.eot?#iefix') format('embedded-opentype'), + url('opensans-regular-webfont.woff') format('woff'), + url('opensans-regular-webfont.ttf') format('truetype'), + url('opensans-regular-webfont.svg#OpenSansRegular') format('svg'); + font-weight: normal; + font-style: normal; + +} + + + + +@font-face { + font-family: 'Open Sans Semibold'; + src: url('opensans-semibold-webfont.eot'); + src: url('opensans-semibold-webfont.eot?#iefix') format('embedded-opentype'), + url('opensans-semibold-webfont.woff') format('woff'), + url('opensans-semibold-webfont.ttf') format('truetype'), + url('opensans-semibold-webfont.svg#OpenSansSemiboldRegular') format('svg'); + font-weight: normal; + font-style: normal; + +} + + + + +@font-face { + font-family: 'Open Sans Semibold Italic'; + src: url('opensans-semibolditalic-webfont.eot'); + src: url('opensans-semibolditalic-webfont.eot?#iefix') format('embedded-opentype'), + url('opensans-semibolditalic-webfont.woff') format('woff'), + url('opensans-semibolditalic-webfont.ttf') format('truetype'), + 
url('opensans-semibolditalic-webfont.svg#OpenSansSemiboldItalic') format('svg'); + font-weight: normal; + font-style: normal; + +} + + + + +@font-face { + font-family: 'Droid Sans Mono'; + src: url('droidsansmono-webfont.eot'); + src: url('droidsansmono-webfont.eot?#iefix') format('embedded-opentype'), + url('droidsansmono-webfont.woff') format('woff'), + url('droidsansmono-webfont.ttf') format('truetype'), + url('droidsansmono-webfont.svg#DroidSansMonoRegular') format('svg'); + font-weight: normal; + font-style: normal; + +} + + + + +@font-face { + font-family: 'Open Sans Light'; + src: url('opensans-light-webfont.eot'); + src: url('opensans-light-webfont.eot?#iefix') format('embedded-opentype'), + url('opensans-light-webfont.woff') format('woff'), + url('opensans-light-webfont.ttf') format('truetype'), + url('opensans-light-webfont.svg#OpenSansLightRegular') format('svg'); + font-weight: normal; + font-style: normal; + +} + + + + +@font-face { + font-family: 'Open Sans Light Italic'; + src: url('opensans-lightitalic-webfont.eot'); + src: url('opensans-lightitalic-webfont.eot?#iefix') format('embedded-opentype'), + url('opensans-lightitalic-webfont.woff') format('woff'), + url('opensans-lightitalic-webfont.ttf') format('truetype'), + url('opensans-lightitalic-webfont.svg#OpenSansLightItalic') format('svg'); + font-weight: normal; + font-style: normal; + +} \ No newline at end of file diff --git a/refs/pull/405/merge/_static/init.js b/refs/pull/405/merge/_static/init.js new file mode 100644 index 00000000..6b436ebd --- /dev/null +++ b/refs/pull/405/merge/_static/init.js @@ -0,0 +1,2 @@ +SlideSync.init(SlideDeck); +SlideController.init(SlideSync); diff --git a/refs/pull/405/merge/_static/jquery-3.2.1.js b/refs/pull/405/merge/_static/jquery-3.2.1.js new file mode 100644 index 00000000..d2d8ca47 --- /dev/null +++ b/refs/pull/405/merge/_static/jquery-3.2.1.js @@ -0,0 +1,10253 @@ +/*! 
+ * jQuery JavaScript Library v3.2.1 + * https://jquery.com/ + * + * Includes Sizzle.js + * https://sizzlejs.com/ + * + * Copyright JS Foundation and other contributors + * Released under the MIT license + * https://jquery.org/license + * + * Date: 2017-03-20T18:59Z + */ +( function( global, factory ) { + + "use strict"; + + if ( typeof module === "object" && typeof module.exports === "object" ) { + + // For CommonJS and CommonJS-like environments where a proper `window` + // is present, execute the factory and get jQuery. + // For environments that do not have a `window` with a `document` + // (such as Node.js), expose a factory as module.exports. + // This accentuates the need for the creation of a real `window`. + // e.g. var jQuery = require("jquery")(window); + // See ticket #14549 for more info. + module.exports = global.document ? + factory( global, true ) : + function( w ) { + if ( !w.document ) { + throw new Error( "jQuery requires a window with a document" ); + } + return factory( w ); + }; + } else { + factory( global ); + } + +// Pass this if window is not defined yet +} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) { + +// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 +// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode +// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common +// enough that all such attempts are guarded in a try block. 
+"use strict"; + +var arr = []; + +var document = window.document; + +var getProto = Object.getPrototypeOf; + +var slice = arr.slice; + +var concat = arr.concat; + +var push = arr.push; + +var indexOf = arr.indexOf; + +var class2type = {}; + +var toString = class2type.toString; + +var hasOwn = class2type.hasOwnProperty; + +var fnToString = hasOwn.toString; + +var ObjectFunctionString = fnToString.call( Object ); + +var support = {}; + + + + function DOMEval( code, doc ) { + doc = doc || document; + + var script = doc.createElement( "script" ); + + script.text = code; + doc.head.appendChild( script ).parentNode.removeChild( script ); + } +/* global Symbol */ +// Defining this global in .eslintrc.json would create a danger of using the global +// unguarded in another place, it seems safer to define global only for this module + + + +var + version = "3.2.1", + + // Define a local copy of jQuery + jQuery = function( selector, context ) { + + // The jQuery object is actually just the init constructor 'enhanced' + // Need init if jQuery is called (just allow error to be thrown if not included) + return new jQuery.fn.init( selector, context ); + }, + + // Support: Android <=4.0 only + // Make sure we trim BOM and NBSP + rtrim = /^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g, + + // Matches dashed string for camelizing + rmsPrefix = /^-ms-/, + rdashAlpha = /-([a-z])/g, + + // Used by jQuery.camelCase as callback to replace() + fcamelCase = function( all, letter ) { + return letter.toUpperCase(); + }; + +jQuery.fn = jQuery.prototype = { + + // The current version of jQuery being used + jquery: version, + + constructor: jQuery, + + // The default length of a jQuery object is 0 + length: 0, + + toArray: function() { + return slice.call( this ); + }, + + // Get the Nth element in the matched element set OR + // Get the whole matched element set as a clean array + get: function( num ) { + + // Return all the elements in a clean array + if ( num == null ) { + return slice.call( this ); + 
} + + // Return just the one element from the set + return num < 0 ? this[ num + this.length ] : this[ num ]; + }, + + // Take an array of elements and push it onto the stack + // (returning the new matched element set) + pushStack: function( elems ) { + + // Build a new jQuery matched element set + var ret = jQuery.merge( this.constructor(), elems ); + + // Add the old object onto the stack (as a reference) + ret.prevObject = this; + + // Return the newly-formed element set + return ret; + }, + + // Execute a callback for every element in the matched set. + each: function( callback ) { + return jQuery.each( this, callback ); + }, + + map: function( callback ) { + return this.pushStack( jQuery.map( this, function( elem, i ) { + return callback.call( elem, i, elem ); + } ) ); + }, + + slice: function() { + return this.pushStack( slice.apply( this, arguments ) ); + }, + + first: function() { + return this.eq( 0 ); + }, + + last: function() { + return this.eq( -1 ); + }, + + eq: function( i ) { + var len = this.length, + j = +i + ( i < 0 ? len : 0 ); + return this.pushStack( j >= 0 && j < len ? [ this[ j ] ] : [] ); + }, + + end: function() { + return this.prevObject || this.constructor(); + }, + + // For internal use only. + // Behaves like an Array's method, not like a jQuery method. 
+ push: push, + sort: arr.sort, + splice: arr.splice +}; + +jQuery.extend = jQuery.fn.extend = function() { + var options, name, src, copy, copyIsArray, clone, + target = arguments[ 0 ] || {}, + i = 1, + length = arguments.length, + deep = false; + + // Handle a deep copy situation + if ( typeof target === "boolean" ) { + deep = target; + + // Skip the boolean and the target + target = arguments[ i ] || {}; + i++; + } + + // Handle case when target is a string or something (possible in deep copy) + if ( typeof target !== "object" && !jQuery.isFunction( target ) ) { + target = {}; + } + + // Extend jQuery itself if only one argument is passed + if ( i === length ) { + target = this; + i--; + } + + for ( ; i < length; i++ ) { + + // Only deal with non-null/undefined values + if ( ( options = arguments[ i ] ) != null ) { + + // Extend the base object + for ( name in options ) { + src = target[ name ]; + copy = options[ name ]; + + // Prevent never-ending loop + if ( target === copy ) { + continue; + } + + // Recurse if we're merging plain objects or arrays + if ( deep && copy && ( jQuery.isPlainObject( copy ) || + ( copyIsArray = Array.isArray( copy ) ) ) ) { + + if ( copyIsArray ) { + copyIsArray = false; + clone = src && Array.isArray( src ) ? src : []; + + } else { + clone = src && jQuery.isPlainObject( src ) ? 
src : {}; + } + + // Never move original objects, clone them + target[ name ] = jQuery.extend( deep, clone, copy ); + + // Don't bring in undefined values + } else if ( copy !== undefined ) { + target[ name ] = copy; + } + } + } + } + + // Return the modified object + return target; +}; + +jQuery.extend( { + + // Unique for each copy of jQuery on the page + expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), + + // Assume jQuery is ready without the ready module + isReady: true, + + error: function( msg ) { + throw new Error( msg ); + }, + + noop: function() {}, + + isFunction: function( obj ) { + return jQuery.type( obj ) === "function"; + }, + + isWindow: function( obj ) { + return obj != null && obj === obj.window; + }, + + isNumeric: function( obj ) { + + // As of jQuery 3.0, isNumeric is limited to + // strings and numbers (primitives or objects) + // that can be coerced to finite numbers (gh-2662) + var type = jQuery.type( obj ); + return ( type === "number" || type === "string" ) && + + // parseFloat NaNs numeric-cast false positives ("") + // ...but misinterprets leading-number strings, particularly hex literals ("0x...") + // subtraction forces infinities to NaN + !isNaN( obj - parseFloat( obj ) ); + }, + + isPlainObject: function( obj ) { + var proto, Ctor; + + // Detect obvious negatives + // Use toString instead of jQuery.type to catch host objects + if ( !obj || toString.call( obj ) !== "[object Object]" ) { + return false; + } + + proto = getProto( obj ); + + // Objects with no prototype (e.g., `Object.create( null )`) are plain + if ( !proto ) { + return true; + } + + // Objects with prototype are plain iff they were constructed by a global Object function + Ctor = hasOwn.call( proto, "constructor" ) && proto.constructor; + return typeof Ctor === "function" && fnToString.call( Ctor ) === ObjectFunctionString; + }, + + isEmptyObject: function( obj ) { + + /* eslint-disable no-unused-vars */ + // See 
https://github.com/eslint/eslint/issues/6125 + var name; + + for ( name in obj ) { + return false; + } + return true; + }, + + type: function( obj ) { + if ( obj == null ) { + return obj + ""; + } + + // Support: Android <=2.3 only (functionish RegExp) + return typeof obj === "object" || typeof obj === "function" ? + class2type[ toString.call( obj ) ] || "object" : + typeof obj; + }, + + // Evaluates a script in a global context + globalEval: function( code ) { + DOMEval( code ); + }, + + // Convert dashed to camelCase; used by the css and data modules + // Support: IE <=9 - 11, Edge 12 - 13 + // Microsoft forgot to hump their vendor prefix (#9572) + camelCase: function( string ) { + return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); + }, + + each: function( obj, callback ) { + var length, i = 0; + + if ( isArrayLike( obj ) ) { + length = obj.length; + for ( ; i < length; i++ ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } else { + for ( i in obj ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } + + return obj; + }, + + // Support: Android <=4.0 only + trim: function( text ) { + return text == null ? + "" : + ( text + "" ).replace( rtrim, "" ); + }, + + // results is for internal usage only + makeArray: function( arr, results ) { + var ret = results || []; + + if ( arr != null ) { + if ( isArrayLike( Object( arr ) ) ) { + jQuery.merge( ret, + typeof arr === "string" ? + [ arr ] : arr + ); + } else { + push.call( ret, arr ); + } + } + + return ret; + }, + + inArray: function( elem, arr, i ) { + return arr == null ? 
-1 : indexOf.call( arr, elem, i ); + }, + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + merge: function( first, second ) { + var len = +second.length, + j = 0, + i = first.length; + + for ( ; j < len; j++ ) { + first[ i++ ] = second[ j ]; + } + + first.length = i; + + return first; + }, + + grep: function( elems, callback, invert ) { + var callbackInverse, + matches = [], + i = 0, + length = elems.length, + callbackExpect = !invert; + + // Go through the array, only saving the items + // that pass the validator function + for ( ; i < length; i++ ) { + callbackInverse = !callback( elems[ i ], i ); + if ( callbackInverse !== callbackExpect ) { + matches.push( elems[ i ] ); + } + } + + return matches; + }, + + // arg is for internal usage only + map: function( elems, callback, arg ) { + var length, value, + i = 0, + ret = []; + + // Go through the array, translating each of the items to their new values + if ( isArrayLike( elems ) ) { + length = elems.length; + for ( ; i < length; i++ ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + + // Go through every key on the object, + } else { + for ( i in elems ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + } + + // Flatten any nested arrays + return concat.apply( [], ret ); + }, + + // A global GUID counter for objects + guid: 1, + + // Bind a function to a context, optionally partially applying any + // arguments. + proxy: function( fn, context ) { + var tmp, args, proxy; + + if ( typeof context === "string" ) { + tmp = fn[ context ]; + context = fn; + fn = tmp; + } + + // Quick check to determine if target is callable, in the spec + // this throws a TypeError, but we will just return undefined. 
+ if ( !jQuery.isFunction( fn ) ) { + return undefined; + } + + // Simulated bind + args = slice.call( arguments, 2 ); + proxy = function() { + return fn.apply( context || this, args.concat( slice.call( arguments ) ) ); + }; + + // Set the guid of unique handler to the same of original handler, so it can be removed + proxy.guid = fn.guid = fn.guid || jQuery.guid++; + + return proxy; + }, + + now: Date.now, + + // jQuery.support is not used in Core but other projects attach their + // properties to it so it needs to exist. + support: support +} ); + +if ( typeof Symbol === "function" ) { + jQuery.fn[ Symbol.iterator ] = arr[ Symbol.iterator ]; +} + +// Populate the class2type map +jQuery.each( "Boolean Number String Function Array Date RegExp Object Error Symbol".split( " " ), +function( i, name ) { + class2type[ "[object " + name + "]" ] = name.toLowerCase(); +} ); + +function isArrayLike( obj ) { + + // Support: real iOS 8.2 only (not reproducible in simulator) + // `in` check used to prevent JIT error (gh-2145) + // hasOwn isn't used here due to false negatives + // regarding Nodelist length in IE + var length = !!obj && "length" in obj && obj.length, + type = jQuery.type( obj ); + + if ( type === "function" || jQuery.isWindow( obj ) ) { + return false; + } + + return type === "array" || length === 0 || + typeof length === "number" && length > 0 && ( length - 1 ) in obj; +} +var Sizzle = +/*! 
+ * Sizzle CSS Selector Engine v2.3.3 + * https://sizzlejs.com/ + * + * Copyright jQuery Foundation and other contributors + * Released under the MIT license + * http://jquery.org/license + * + * Date: 2016-08-08 + */ +(function( window ) { + +var i, + support, + Expr, + getText, + isXML, + tokenize, + compile, + select, + outermostContext, + sortInput, + hasDuplicate, + + // Local document vars + setDocument, + document, + docElem, + documentIsHTML, + rbuggyQSA, + rbuggyMatches, + matches, + contains, + + // Instance-specific data + expando = "sizzle" + 1 * new Date(), + preferredDoc = window.document, + dirruns = 0, + done = 0, + classCache = createCache(), + tokenCache = createCache(), + compilerCache = createCache(), + sortOrder = function( a, b ) { + if ( a === b ) { + hasDuplicate = true; + } + return 0; + }, + + // Instance methods + hasOwn = ({}).hasOwnProperty, + arr = [], + pop = arr.pop, + push_native = arr.push, + push = arr.push, + slice = arr.slice, + // Use a stripped-down indexOf as it's faster than native + // https://jsperf.com/thor-indexof-vs-for/5 + indexOf = function( list, elem ) { + var i = 0, + len = list.length; + for ( ; i < len; i++ ) { + if ( list[i] === elem ) { + return i; + } + } + return -1; + }, + + booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped", + + // Regular expressions + + // http://www.w3.org/TR/css3-selectors/#whitespace + whitespace = "[\\x20\\t\\r\\n\\f]", + + // http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier + identifier = "(?:\\\\.|[\\w-]|[^\0-\\xa0])+", + + // Attribute selectors: http://www.w3.org/TR/selectors/#attribute-selectors + attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace + + // Operator (capture 2) + "*([*^$|!~]?=)" + whitespace + + // "Attribute values must be CSS identifiers [capture 5] or strings [capture 3 or capture 4]" + "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" 
+ identifier + "))|)" + whitespace + + "*\\]", + + pseudos = ":(" + identifier + ")(?:\\((" + + // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments: + // 1. quoted (capture 3; capture 4 or capture 5) + "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + + // 2. simple (capture 6) + "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + + // 3. anything else (capture 2) + ".*" + + ")\\)|)", + + // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter + rwhitespace = new RegExp( whitespace + "+", "g" ), + rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + whitespace + "+$", "g" ), + + rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), + rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + "*" ), + + rattributeQuotes = new RegExp( "=" + whitespace + "*([^\\]'\"]*?)" + whitespace + "*\\]", "g" ), + + rpseudo = new RegExp( pseudos ), + ridentifier = new RegExp( "^" + identifier + "$" ), + + matchExpr = { + "ID": new RegExp( "^#(" + identifier + ")" ), + "CLASS": new RegExp( "^\\.(" + identifier + ")" ), + "TAG": new RegExp( "^(" + identifier + "|[*])" ), + "ATTR": new RegExp( "^" + attributes ), + "PSEUDO": new RegExp( "^" + pseudos ), + "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + whitespace + + "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + whitespace + + "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), + "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), + // For use in libraries implementing .is() + // We use this for POS matching in `select` + "needsContext": new RegExp( "^" + whitespace + "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + + whitespace + "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) + }, + + rinputs = /^(?:input|select|textarea|button)$/i, + rheader = /^h\d$/i, + + rnative = /^[^{]+\{\s*\[native \w/, + + // 
Easily-parseable/retrievable ID or TAG or CLASS selectors + rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, + + rsibling = /[+~]/, + + // CSS escapes + // http://www.w3.org/TR/CSS21/syndata.html#escaped-characters + runescape = new RegExp( "\\\\([\\da-f]{1,6}" + whitespace + "?|(" + whitespace + ")|.)", "ig" ), + funescape = function( _, escaped, escapedWhitespace ) { + var high = "0x" + escaped - 0x10000; + // NaN means non-codepoint + // Support: Firefox<24 + // Workaround erroneous numeric interpretation of +"0x" + return high !== high || escapedWhitespace ? + escaped : + high < 0 ? + // BMP codepoint + String.fromCharCode( high + 0x10000 ) : + // Supplemental Plane codepoint (surrogate pair) + String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); + }, + + // CSS string/identifier serialization + // https://drafts.csswg.org/cssom/#common-serializing-idioms + rcssescape = /([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g, + fcssescape = function( ch, asCodePoint ) { + if ( asCodePoint ) { + + // U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER + if ( ch === "\0" ) { + return "\uFFFD"; + } + + // Control characters and (dependent upon position) numbers get escaped as code points + return ch.slice( 0, -1 ) + "\\" + ch.charCodeAt( ch.length - 1 ).toString( 16 ) + " "; + } + + // Other potentially-special ASCII characters get backslash-escaped + return "\\" + ch; + }, + + // Used for iframes + // See setDocument() + // Removing the function wrapper causes a "Permission Denied" + // error in IE + unloadHandler = function() { + setDocument(); + }, + + disabledAncestor = addCombinator( + function( elem ) { + return elem.disabled === true && ("form" in elem || "label" in elem); + }, + { dir: "parentNode", next: "legend" } + ); + +// Optimize for push.apply( _, NodeList ) +try { + push.apply( + (arr = slice.call( preferredDoc.childNodes )), + preferredDoc.childNodes + ); + // Support: Android<4.0 + // Detect silently failing push.apply + arr[ 
preferredDoc.childNodes.length ].nodeType; +} catch ( e ) { + push = { apply: arr.length ? + + // Leverage slice if possible + function( target, els ) { + push_native.apply( target, slice.call(els) ); + } : + + // Support: IE<9 + // Otherwise append directly + function( target, els ) { + var j = target.length, + i = 0; + // Can't trust NodeList.length + while ( (target[j++] = els[i++]) ) {} + target.length = j - 1; + } + }; +} + +function Sizzle( selector, context, results, seed ) { + var m, i, elem, nid, match, groups, newSelector, + newContext = context && context.ownerDocument, + + // nodeType defaults to 9, since context defaults to document + nodeType = context ? context.nodeType : 9; + + results = results || []; + + // Return early from calls with invalid selector or context + if ( typeof selector !== "string" || !selector || + nodeType !== 1 && nodeType !== 9 && nodeType !== 11 ) { + + return results; + } + + // Try to shortcut find operations (as opposed to filters) in HTML documents + if ( !seed ) { + + if ( ( context ? 
context.ownerDocument || context : preferredDoc ) !== document ) { + setDocument( context ); + } + context = context || document; + + if ( documentIsHTML ) { + + // If the selector is sufficiently simple, try using a "get*By*" DOM method + // (excepting DocumentFragment context, where the methods don't exist) + if ( nodeType !== 11 && (match = rquickExpr.exec( selector )) ) { + + // ID selector + if ( (m = match[1]) ) { + + // Document context + if ( nodeType === 9 ) { + if ( (elem = context.getElementById( m )) ) { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( elem.id === m ) { + results.push( elem ); + return results; + } + } else { + return results; + } + + // Element context + } else { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( newContext && (elem = newContext.getElementById( m )) && + contains( context, elem ) && + elem.id === m ) { + + results.push( elem ); + return results; + } + } + + // Type selector + } else if ( match[2] ) { + push.apply( results, context.getElementsByTagName( selector ) ); + return results; + + // Class selector + } else if ( (m = match[3]) && support.getElementsByClassName && + context.getElementsByClassName ) { + + push.apply( results, context.getElementsByClassName( m ) ); + return results; + } + } + + // Take advantage of querySelectorAll + if ( support.qsa && + !compilerCache[ selector + " " ] && + (!rbuggyQSA || !rbuggyQSA.test( selector )) ) { + + if ( nodeType !== 1 ) { + newContext = context; + newSelector = selector; + + // qSA looks outside Element context, which is not what we want + // Thanks to Andrew Dupont for this workaround technique + // Support: IE <=8 + // Exclude object elements + } else if ( context.nodeName.toLowerCase() !== "object" ) { + + // Capture the context ID, setting it first if necessary + if ( (nid = context.getAttribute( "id" )) ) 
{ + nid = nid.replace( rcssescape, fcssescape ); + } else { + context.setAttribute( "id", (nid = expando) ); + } + + // Prefix every selector in the list + groups = tokenize( selector ); + i = groups.length; + while ( i-- ) { + groups[i] = "#" + nid + " " + toSelector( groups[i] ); + } + newSelector = groups.join( "," ); + + // Expand context for sibling selectors + newContext = rsibling.test( selector ) && testContext( context.parentNode ) || + context; + } + + if ( newSelector ) { + try { + push.apply( results, + newContext.querySelectorAll( newSelector ) + ); + return results; + } catch ( qsaError ) { + } finally { + if ( nid === expando ) { + context.removeAttribute( "id" ); + } + } + } + } + } + } + + // All others + return select( selector.replace( rtrim, "$1" ), context, results, seed ); +} + +/** + * Create key-value caches of limited size + * @returns {function(string, object)} Returns the Object data after storing it on itself with + * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) + * deleting the oldest entry + */ +function createCache() { + var keys = []; + + function cache( key, value ) { + // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) + if ( keys.push( key + " " ) > Expr.cacheLength ) { + // Only keep the most recent entries + delete cache[ keys.shift() ]; + } + return (cache[ key + " " ] = value); + } + return cache; +} + +/** + * Mark a function for special use by Sizzle + * @param {Function} fn The function to mark + */ +function markFunction( fn ) { + fn[ expando ] = true; + return fn; +} + +/** + * Support testing using an element + * @param {Function} fn Passed the created element and returns a boolean result + */ +function assert( fn ) { + var el = document.createElement("fieldset"); + + try { + return !!fn( el ); + } catch (e) { + return false; + } finally { + // Remove from its parent by default + if ( el.parentNode ) { + el.parentNode.removeChild( el ); 
+ } + // release memory in IE + el = null; + } +} + +/** + * Adds the same handler for all of the specified attrs + * @param {String} attrs Pipe-separated list of attributes + * @param {Function} handler The method that will be applied + */ +function addHandle( attrs, handler ) { + var arr = attrs.split("|"), + i = arr.length; + + while ( i-- ) { + Expr.attrHandle[ arr[i] ] = handler; + } +} + +/** + * Checks document order of two siblings + * @param {Element} a + * @param {Element} b + * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b + */ +function siblingCheck( a, b ) { + var cur = b && a, + diff = cur && a.nodeType === 1 && b.nodeType === 1 && + a.sourceIndex - b.sourceIndex; + + // Use IE sourceIndex if available on both nodes + if ( diff ) { + return diff; + } + + // Check if b follows a + if ( cur ) { + while ( (cur = cur.nextSibling) ) { + if ( cur === b ) { + return -1; + } + } + } + + return a ? 1 : -1; +} + +/** + * Returns a function to use in pseudos for input types + * @param {String} type + */ +function createInputPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for buttons + * @param {String} type + */ +function createButtonPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return (name === "input" || name === "button") && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for :enabled/:disabled + * @param {Boolean} disabled true for :disabled; false for :enabled + */ +function createDisabledPseudo( disabled ) { + + // Known :disabled false positives: fieldset[disabled] > legend:nth-of-type(n+2) :can-disable + return function( elem ) { + + // Only certain elements can match :enabled or :disabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-enabled + // 
https://html.spec.whatwg.org/multipage/scripting.html#selector-disabled + if ( "form" in elem ) { + + // Check for inherited disabledness on relevant non-disabled elements: + // * listed form-associated elements in a disabled fieldset + // https://html.spec.whatwg.org/multipage/forms.html#category-listed + // https://html.spec.whatwg.org/multipage/forms.html#concept-fe-disabled + // * option elements in a disabled optgroup + // https://html.spec.whatwg.org/multipage/forms.html#concept-option-disabled + // All such elements have a "form" property. + if ( elem.parentNode && elem.disabled === false ) { + + // Option elements defer to a parent optgroup if present + if ( "label" in elem ) { + if ( "label" in elem.parentNode ) { + return elem.parentNode.disabled === disabled; + } else { + return elem.disabled === disabled; + } + } + + // Support: IE 6 - 11 + // Use the isDisabled shortcut property to check for disabled fieldset ancestors + return elem.isDisabled === disabled || + + // Where there is no isDisabled, check manually + /* jshint -W018 */ + elem.isDisabled !== !disabled && + disabledAncestor( elem ) === disabled; + } + + return elem.disabled === disabled; + + // Try to winnow out elements that can't be disabled before trusting the disabled property. + // Some victims get caught in our net (label, legend, menu, track), but it shouldn't + // even exist on them, let alone have a boolean value. 
+ } else if ( "label" in elem ) { + return elem.disabled === disabled; + } + + // Remaining elements are neither :enabled nor :disabled + return false; + }; +} + +/** + * Returns a function to use in pseudos for positionals + * @param {Function} fn + */ +function createPositionalPseudo( fn ) { + return markFunction(function( argument ) { + argument = +argument; + return markFunction(function( seed, matches ) { + var j, + matchIndexes = fn( [], seed.length, argument ), + i = matchIndexes.length; + + // Match elements found at the specified indexes + while ( i-- ) { + if ( seed[ (j = matchIndexes[i]) ] ) { + seed[j] = !(matches[j] = seed[j]); + } + } + }); + }); +} + +/** + * Checks a node for validity as a Sizzle context + * @param {Element|Object=} context + * @returns {Element|Object|Boolean} The input node if acceptable, otherwise a falsy value + */ +function testContext( context ) { + return context && typeof context.getElementsByTagName !== "undefined" && context; +} + +// Expose support vars for convenience +support = Sizzle.support = {}; + +/** + * Detects XML nodes + * @param {Element|Object} elem An element or a document + * @returns {Boolean} True iff elem is a non-HTML XML node + */ +isXML = Sizzle.isXML = function( elem ) { + // documentElement is verified for cases where it doesn't yet exist + // (such as loading iframes in IE - #4833) + var documentElement = elem && (elem.ownerDocument || elem).documentElement; + return documentElement ? documentElement.nodeName !== "HTML" : false; +}; + +/** + * Sets document-related variables once based on the current document + * @param {Element|Object} [doc] An element or document object to use to set the document + * @returns {Object} Returns the current document + */ +setDocument = Sizzle.setDocument = function( node ) { + var hasCompare, subWindow, + doc = node ? 
node.ownerDocument || node : preferredDoc; + + // Return early if doc is invalid or already selected + if ( doc === document || doc.nodeType !== 9 || !doc.documentElement ) { + return document; + } + + // Update global variables + document = doc; + docElem = document.documentElement; + documentIsHTML = !isXML( document ); + + // Support: IE 9-11, Edge + // Accessing iframe documents after unload throws "permission denied" errors (jQuery #13936) + if ( preferredDoc !== document && + (subWindow = document.defaultView) && subWindow.top !== subWindow ) { + + // Support: IE 11, Edge + if ( subWindow.addEventListener ) { + subWindow.addEventListener( "unload", unloadHandler, false ); + + // Support: IE 9 - 10 only + } else if ( subWindow.attachEvent ) { + subWindow.attachEvent( "onunload", unloadHandler ); + } + } + + /* Attributes + ---------------------------------------------------------------------- */ + + // Support: IE<8 + // Verify that getAttribute really returns attributes and not properties + // (excepting IE8 booleans) + support.attributes = assert(function( el ) { + el.className = "i"; + return !el.getAttribute("className"); + }); + + /* getElement(s)By* + ---------------------------------------------------------------------- */ + + // Check if getElementsByTagName("*") returns only elements + support.getElementsByTagName = assert(function( el ) { + el.appendChild( document.createComment("") ); + return !el.getElementsByTagName("*").length; + }); + + // Support: IE<9 + support.getElementsByClassName = rnative.test( document.getElementsByClassName ); + + // Support: IE<10 + // Check if getElementById returns elements by name + // The broken getElementById methods don't pick up programmatically-set names, + // so use a roundabout getElementsByName test + support.getById = assert(function( el ) { + docElem.appendChild( el ).id = expando; + return !document.getElementsByName || !document.getElementsByName( expando ).length; + }); + + // ID filter and find + if ( 
support.getById ) { + Expr.filter["ID"] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + return elem.getAttribute("id") === attrId; + }; + }; + Expr.find["ID"] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var elem = context.getElementById( id ); + return elem ? [ elem ] : []; + } + }; + } else { + Expr.filter["ID"] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + var node = typeof elem.getAttributeNode !== "undefined" && + elem.getAttributeNode("id"); + return node && node.value === attrId; + }; + }; + + // Support: IE 6 - 7 only + // getElementById is not reliable as a find shortcut + Expr.find["ID"] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var node, i, elems, + elem = context.getElementById( id ); + + if ( elem ) { + + // Verify the id attribute + node = elem.getAttributeNode("id"); + if ( node && node.value === id ) { + return [ elem ]; + } + + // Fall back on getElementsByName + elems = context.getElementsByName( id ); + i = 0; + while ( (elem = elems[i++]) ) { + node = elem.getAttributeNode("id"); + if ( node && node.value === id ) { + return [ elem ]; + } + } + } + + return []; + } + }; + } + + // Tag + Expr.find["TAG"] = support.getElementsByTagName ? 
+ function( tag, context ) { + if ( typeof context.getElementsByTagName !== "undefined" ) { + return context.getElementsByTagName( tag ); + + // DocumentFragment nodes don't have gEBTN + } else if ( support.qsa ) { + return context.querySelectorAll( tag ); + } + } : + + function( tag, context ) { + var elem, + tmp = [], + i = 0, + // By happy coincidence, a (broken) gEBTN appears on DocumentFragment nodes too + results = context.getElementsByTagName( tag ); + + // Filter out possible comments + if ( tag === "*" ) { + while ( (elem = results[i++]) ) { + if ( elem.nodeType === 1 ) { + tmp.push( elem ); + } + } + + return tmp; + } + return results; + }; + + // Class + Expr.find["CLASS"] = support.getElementsByClassName && function( className, context ) { + if ( typeof context.getElementsByClassName !== "undefined" && documentIsHTML ) { + return context.getElementsByClassName( className ); + } + }; + + /* QSA/matchesSelector + ---------------------------------------------------------------------- */ + + // QSA and matchesSelector support + + // matchesSelector(:active) reports false when true (IE9/Opera 11.5) + rbuggyMatches = []; + + // qSa(:focus) reports false when true (Chrome 21) + // We allow this because of a bug in IE8/9 that throws an error + // whenever `document.activeElement` is accessed on an iframe + // So, we allow :focus to pass through QSA all the time to avoid the IE error + // See https://bugs.jquery.com/ticket/13378 + rbuggyQSA = []; + + if ( (support.qsa = rnative.test( document.querySelectorAll )) ) { + // Build QSA regex + // Regex strategy adopted from Diego Perini + assert(function( el ) { + // Select is set to empty string on purpose + // This is to test IE's treatment of not explicitly + // setting a boolean content attribute, + // since its presence should be enough + // https://bugs.jquery.com/ticket/12359 + docElem.appendChild( el ).innerHTML = "" + + ""; + + // Support: IE8, Opera 11-12.16 + // Nothing should be selected when empty 
strings follow ^= or $= or *= + // The test attribute must be unknown in Opera but "safe" for WinRT + // https://msdn.microsoft.com/en-us/library/ie/hh465388.aspx#attribute_section + if ( el.querySelectorAll("[msallowcapture^='']").length ) { + rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" ); + } + + // Support: IE8 + // Boolean attributes and "value" are not treated correctly + if ( !el.querySelectorAll("[selected]").length ) { + rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" ); + } + + // Support: Chrome<29, Android<4.4, Safari<7.0+, iOS<7.0+, PhantomJS<1.9.8+ + if ( !el.querySelectorAll( "[id~=" + expando + "-]" ).length ) { + rbuggyQSA.push("~="); + } + + // Webkit/Opera - :checked should return selected option elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + // IE8 throws error here and will not see later tests + if ( !el.querySelectorAll(":checked").length ) { + rbuggyQSA.push(":checked"); + } + + // Support: Safari 8+, iOS 8+ + // https://bugs.webkit.org/show_bug.cgi?id=136851 + // In-page `selector#id sibling-combinator selector` fails + if ( !el.querySelectorAll( "a#" + expando + "+*" ).length ) { + rbuggyQSA.push(".#.+[+~]"); + } + }); + + assert(function( el ) { + el.innerHTML = "" + + ""; + + // Support: Windows 8 Native Apps + // The type and name attributes are restricted during .innerHTML assignment + var input = document.createElement("input"); + input.setAttribute( "type", "hidden" ); + el.appendChild( input ).setAttribute( "name", "D" ); + + // Support: IE8 + // Enforce case-sensitivity of name attribute + if ( el.querySelectorAll("[name=d]").length ) { + rbuggyQSA.push( "name" + whitespace + "*[*^$|!~]?=" ); + } + + // FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled) + // IE8 throws error here and will not see later tests + if ( el.querySelectorAll(":enabled").length !== 2 ) { + rbuggyQSA.push( ":enabled", ":disabled" ); + } + + // Support: IE9-11+ + // 
IE's :disabled selector does not pick up the children of disabled fieldsets + docElem.appendChild( el ).disabled = true; + if ( el.querySelectorAll(":disabled").length !== 2 ) { + rbuggyQSA.push( ":enabled", ":disabled" ); + } + + // Opera 10-11 does not throw on post-comma invalid pseudos + el.querySelectorAll("*,:x"); + rbuggyQSA.push(",.*:"); + }); + } + + if ( (support.matchesSelector = rnative.test( (matches = docElem.matches || + docElem.webkitMatchesSelector || + docElem.mozMatchesSelector || + docElem.oMatchesSelector || + docElem.msMatchesSelector) )) ) { + + assert(function( el ) { + // Check to see if it's possible to do matchesSelector + // on a disconnected node (IE 9) + support.disconnectedMatch = matches.call( el, "*" ); + + // This should fail with an exception + // Gecko does not error, returns false instead + matches.call( el, "[s!='']:x" ); + rbuggyMatches.push( "!=", pseudos ); + }); + } + + rbuggyQSA = rbuggyQSA.length && new RegExp( rbuggyQSA.join("|") ); + rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join("|") ); + + /* Contains + ---------------------------------------------------------------------- */ + hasCompare = rnative.test( docElem.compareDocumentPosition ); + + // Element contains another + // Purposefully self-exclusive + // As in, an element does not contain itself + contains = hasCompare || rnative.test( docElem.contains ) ? + function( a, b ) { + var adown = a.nodeType === 9 ? a.documentElement : a, + bup = b && b.parentNode; + return a === bup || !!( bup && bup.nodeType === 1 && ( + adown.contains ? + adown.contains( bup ) : + a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16 + )); + } : + function( a, b ) { + if ( b ) { + while ( (b = b.parentNode) ) { + if ( b === a ) { + return true; + } + } + } + return false; + }; + + /* Sorting + ---------------------------------------------------------------------- */ + + // Document order sorting + sortOrder = hasCompare ? 
+ function( a, b ) { + + // Flag for duplicate removal + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + // Sort on method existence if only one input has compareDocumentPosition + var compare = !a.compareDocumentPosition - !b.compareDocumentPosition; + if ( compare ) { + return compare; + } + + // Calculate position if both inputs belong to the same document + compare = ( a.ownerDocument || a ) === ( b.ownerDocument || b ) ? + a.compareDocumentPosition( b ) : + + // Otherwise we know they are disconnected + 1; + + // Disconnected nodes + if ( compare & 1 || + (!support.sortDetached && b.compareDocumentPosition( a ) === compare) ) { + + // Choose the first element that is related to our preferred document + if ( a === document || a.ownerDocument === preferredDoc && contains(preferredDoc, a) ) { + return -1; + } + if ( b === document || b.ownerDocument === preferredDoc && contains(preferredDoc, b) ) { + return 1; + } + + // Maintain original order + return sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + } + + return compare & 4 ? -1 : 1; + } : + function( a, b ) { + // Exit early if the nodes are identical + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + var cur, + i = 0, + aup = a.parentNode, + bup = b.parentNode, + ap = [ a ], + bp = [ b ]; + + // Parentless nodes are either documents or disconnected + if ( !aup || !bup ) { + return a === document ? -1 : + b === document ? 1 : + aup ? -1 : + bup ? 1 : + sortInput ? 
+ ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + + // If the nodes are siblings, we can do a quick check + } else if ( aup === bup ) { + return siblingCheck( a, b ); + } + + // Otherwise we need full lists of their ancestors for comparison + cur = a; + while ( (cur = cur.parentNode) ) { + ap.unshift( cur ); + } + cur = b; + while ( (cur = cur.parentNode) ) { + bp.unshift( cur ); + } + + // Walk down the tree looking for a discrepancy + while ( ap[i] === bp[i] ) { + i++; + } + + return i ? + // Do a sibling check if the nodes have a common ancestor + siblingCheck( ap[i], bp[i] ) : + + // Otherwise nodes in our document sort first + ap[i] === preferredDoc ? -1 : + bp[i] === preferredDoc ? 1 : + 0; + }; + + return document; +}; + +Sizzle.matches = function( expr, elements ) { + return Sizzle( expr, null, null, elements ); +}; + +Sizzle.matchesSelector = function( elem, expr ) { + // Set document vars if needed + if ( ( elem.ownerDocument || elem ) !== document ) { + setDocument( elem ); + } + + // Make sure that attribute selectors are quoted + expr = expr.replace( rattributeQuotes, "='$1']" ); + + if ( support.matchesSelector && documentIsHTML && + !compilerCache[ expr + " " ] && + ( !rbuggyMatches || !rbuggyMatches.test( expr ) ) && + ( !rbuggyQSA || !rbuggyQSA.test( expr ) ) ) { + + try { + var ret = matches.call( elem, expr ); + + // IE 9's matchesSelector returns false on disconnected nodes + if ( ret || support.disconnectedMatch || + // As well, disconnected nodes are said to be in a document + // fragment in IE 9 + elem.document && elem.document.nodeType !== 11 ) { + return ret; + } + } catch (e) {} + } + + return Sizzle( expr, document, null, [ elem ] ).length > 0; +}; + +Sizzle.contains = function( context, elem ) { + // Set document vars if needed + if ( ( context.ownerDocument || context ) !== document ) { + setDocument( context ); + } + return contains( context, elem ); +}; + +Sizzle.attr = function( elem, name ) { + // Set document vars if 
needed + if ( ( elem.ownerDocument || elem ) !== document ) { + setDocument( elem ); + } + + var fn = Expr.attrHandle[ name.toLowerCase() ], + // Don't get fooled by Object.prototype properties (jQuery #13807) + val = fn && hasOwn.call( Expr.attrHandle, name.toLowerCase() ) ? + fn( elem, name, !documentIsHTML ) : + undefined; + + return val !== undefined ? + val : + support.attributes || !documentIsHTML ? + elem.getAttribute( name ) : + (val = elem.getAttributeNode(name)) && val.specified ? + val.value : + null; +}; + +Sizzle.escape = function( sel ) { + return (sel + "").replace( rcssescape, fcssescape ); +}; + +Sizzle.error = function( msg ) { + throw new Error( "Syntax error, unrecognized expression: " + msg ); +}; + +/** + * Document sorting and removing duplicates + * @param {ArrayLike} results + */ +Sizzle.uniqueSort = function( results ) { + var elem, + duplicates = [], + j = 0, + i = 0; + + // Unless we *know* we can detect duplicates, assume their presence + hasDuplicate = !support.detectDuplicates; + sortInput = !support.sortStable && results.slice( 0 ); + results.sort( sortOrder ); + + if ( hasDuplicate ) { + while ( (elem = results[i++]) ) { + if ( elem === results[ i ] ) { + j = duplicates.push( i ); + } + } + while ( j-- ) { + results.splice( duplicates[ j ], 1 ); + } + } + + // Clear input after sorting to release objects + // See https://github.com/jquery/sizzle/pull/225 + sortInput = null; + + return results; +}; + +/** + * Utility function for retrieving the text value of an array of DOM nodes + * @param {Array|Element} elem + */ +getText = Sizzle.getText = function( elem ) { + var node, + ret = "", + i = 0, + nodeType = elem.nodeType; + + if ( !nodeType ) { + // If no nodeType, this is expected to be an array + while ( (node = elem[i++]) ) { + // Do not traverse comment nodes + ret += getText( node ); + } + } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { + // Use textContent for elements + // innerText usage removed for 
consistency of new lines (jQuery #11153) + if ( typeof elem.textContent === "string" ) { + return elem.textContent; + } else { + // Traverse its children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + ret += getText( elem ); + } + } + } else if ( nodeType === 3 || nodeType === 4 ) { + return elem.nodeValue; + } + // Do not include comment or processing instruction nodes + + return ret; +}; + +Expr = Sizzle.selectors = { + + // Can be adjusted by the user + cacheLength: 50, + + createPseudo: markFunction, + + match: matchExpr, + + attrHandle: {}, + + find: {}, + + relative: { + ">": { dir: "parentNode", first: true }, + " ": { dir: "parentNode" }, + "+": { dir: "previousSibling", first: true }, + "~": { dir: "previousSibling" } + }, + + preFilter: { + "ATTR": function( match ) { + match[1] = match[1].replace( runescape, funescape ); + + // Move the given value to match[3] whether quoted or unquoted + match[3] = ( match[3] || match[4] || match[5] || "" ).replace( runescape, funescape ); + + if ( match[2] === "~=" ) { + match[3] = " " + match[3] + " "; + } + + return match.slice( 0, 4 ); + }, + + "CHILD": function( match ) { + /* matches from matchExpr["CHILD"] + 1 type (only|nth|...) + 2 what (child|of-type) + 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) + 4 xn-component of xn+y argument ([+-]?\d*n|) + 5 sign of xn-component + 6 x of xn-component + 7 sign of y-component + 8 y of y-component + */ + match[1] = match[1].toLowerCase(); + + if ( match[1].slice( 0, 3 ) === "nth" ) { + // nth-* requires argument + if ( !match[3] ) { + Sizzle.error( match[0] ); + } + + // numeric x and y parameters for Expr.filter.CHILD + // remember that false/true cast respectively to 0/1 + match[4] = +( match[4] ? 
match[5] + (match[6] || 1) : 2 * ( match[3] === "even" || match[3] === "odd" ) ); + match[5] = +( ( match[7] + match[8] ) || match[3] === "odd" ); + + // other types prohibit arguments + } else if ( match[3] ) { + Sizzle.error( match[0] ); + } + + return match; + }, + + "PSEUDO": function( match ) { + var excess, + unquoted = !match[6] && match[2]; + + if ( matchExpr["CHILD"].test( match[0] ) ) { + return null; + } + + // Accept quoted arguments as-is + if ( match[3] ) { + match[2] = match[4] || match[5] || ""; + + // Strip excess characters from unquoted arguments + } else if ( unquoted && rpseudo.test( unquoted ) && + // Get excess from tokenize (recursively) + (excess = tokenize( unquoted, true )) && + // advance to the next closing parenthesis + (excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length) ) { + + // excess is a negative index + match[0] = match[0].slice( 0, excess ); + match[2] = unquoted.slice( 0, excess ); + } + + // Return only captures needed by the pseudo filter method (type and argument) + return match.slice( 0, 3 ); + } + }, + + filter: { + + "TAG": function( nodeNameSelector ) { + var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); + return nodeNameSelector === "*" ? 
+ function() { return true; } : + function( elem ) { + return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; + }; + }, + + "CLASS": function( className ) { + var pattern = classCache[ className + " " ]; + + return pattern || + (pattern = new RegExp( "(^|" + whitespace + ")" + className + "(" + whitespace + "|$)" )) && + classCache( className, function( elem ) { + return pattern.test( typeof elem.className === "string" && elem.className || typeof elem.getAttribute !== "undefined" && elem.getAttribute("class") || "" ); + }); + }, + + "ATTR": function( name, operator, check ) { + return function( elem ) { + var result = Sizzle.attr( elem, name ); + + if ( result == null ) { + return operator === "!="; + } + if ( !operator ) { + return true; + } + + result += ""; + + return operator === "=" ? result === check : + operator === "!=" ? result !== check : + operator === "^=" ? check && result.indexOf( check ) === 0 : + operator === "*=" ? check && result.indexOf( check ) > -1 : + operator === "$=" ? check && result.slice( -check.length ) === check : + operator === "~=" ? ( " " + result.replace( rwhitespace, " " ) + " " ).indexOf( check ) > -1 : + operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : + false; + }; + }, + + "CHILD": function( type, what, argument, first, last ) { + var simple = type.slice( 0, 3 ) !== "nth", + forward = type.slice( -4 ) !== "last", + ofType = what === "of-type"; + + return first === 1 && last === 0 ? + + // Shortcut for :nth-*(n) + function( elem ) { + return !!elem.parentNode; + } : + + function( elem, context, xml ) { + var cache, uniqueCache, outerCache, node, nodeIndex, start, + dir = simple !== forward ? 
"nextSibling" : "previousSibling", + parent = elem.parentNode, + name = ofType && elem.nodeName.toLowerCase(), + useCache = !xml && !ofType, + diff = false; + + if ( parent ) { + + // :(first|last|only)-(child|of-type) + if ( simple ) { + while ( dir ) { + node = elem; + while ( (node = node[ dir ]) ) { + if ( ofType ? + node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) { + + return false; + } + } + // Reverse direction for :only-* (if we haven't yet done so) + start = dir = type === "only" && !start && "nextSibling"; + } + return true; + } + + start = [ forward ? parent.firstChild : parent.lastChild ]; + + // non-xml :nth-child(...) stores cache data on `parent` + if ( forward && useCache ) { + + // Seek `elem` from a previously-cached index + + // ...in a gzip-friendly way + node = parent; + outerCache = node[ expando ] || (node[ expando ] = {}); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + (outerCache[ node.uniqueID ] = {}); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; + diff = nodeIndex && cache[ 2 ]; + node = nodeIndex && parent.childNodes[ nodeIndex ]; + + while ( (node = ++nodeIndex && node && node[ dir ] || + + // Fallback to seeking `elem` from the start + (diff = nodeIndex = 0) || start.pop()) ) { + + // When found, cache indexes on `parent` and break + if ( node.nodeType === 1 && ++diff && node === elem ) { + uniqueCache[ type ] = [ dirruns, nodeIndex, diff ]; + break; + } + } + + } else { + // Use previously-cached element index if available + if ( useCache ) { + // ...in a gzip-friendly way + node = elem; + outerCache = node[ expando ] || (node[ expando ] = {}); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + (outerCache[ node.uniqueID ] = {}); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 ] === dirruns && 
cache[ 1 ]; + diff = nodeIndex; + } + + // xml :nth-child(...) + // or :nth-last-child(...) or :nth(-last)?-of-type(...) + if ( diff === false ) { + // Use the same loop as above to seek `elem` from the start + while ( (node = ++nodeIndex && node && node[ dir ] || + (diff = nodeIndex = 0) || start.pop()) ) { + + if ( ( ofType ? + node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) && + ++diff ) { + + // Cache the index of each encountered element + if ( useCache ) { + outerCache = node[ expando ] || (node[ expando ] = {}); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + (outerCache[ node.uniqueID ] = {}); + + uniqueCache[ type ] = [ dirruns, diff ]; + } + + if ( node === elem ) { + break; + } + } + } + } + } + + // Incorporate the offset, then check against cycle size + diff -= last; + return diff === first || ( diff % first === 0 && diff / first >= 0 ); + } + }; + }, + + "PSEUDO": function( pseudo, argument ) { + // pseudo-class names are case-insensitive + // http://www.w3.org/TR/selectors/#pseudo-classes + // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters + // Remember that setFilters inherits from pseudos + var args, + fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || + Sizzle.error( "unsupported pseudo: " + pseudo ); + + // The user may use createPseudo to indicate that + // arguments are needed to create the filter function + // just as Sizzle does + if ( fn[ expando ] ) { + return fn( argument ); + } + + // But maintain support for old signatures + if ( fn.length > 1 ) { + args = [ pseudo, pseudo, "", argument ]; + return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? 
+ markFunction(function( seed, matches ) { + var idx, + matched = fn( seed, argument ), + i = matched.length; + while ( i-- ) { + idx = indexOf( seed, matched[i] ); + seed[ idx ] = !( matches[ idx ] = matched[i] ); + } + }) : + function( elem ) { + return fn( elem, 0, args ); + }; + } + + return fn; + } + }, + + pseudos: { + // Potentially complex pseudos + "not": markFunction(function( selector ) { + // Trim the selector passed to compile + // to avoid treating leading and trailing + // spaces as combinators + var input = [], + results = [], + matcher = compile( selector.replace( rtrim, "$1" ) ); + + return matcher[ expando ] ? + markFunction(function( seed, matches, context, xml ) { + var elem, + unmatched = matcher( seed, null, xml, [] ), + i = seed.length; + + // Match elements unmatched by `matcher` + while ( i-- ) { + if ( (elem = unmatched[i]) ) { + seed[i] = !(matches[i] = elem); + } + } + }) : + function( elem, context, xml ) { + input[0] = elem; + matcher( input, null, xml, results ); + // Don't keep the element (issue #299) + input[0] = null; + return !results.pop(); + }; + }), + + "has": markFunction(function( selector ) { + return function( elem ) { + return Sizzle( selector, elem ).length > 0; + }; + }), + + "contains": markFunction(function( text ) { + text = text.replace( runescape, funescape ); + return function( elem ) { + return ( elem.textContent || elem.innerText || getText( elem ) ).indexOf( text ) > -1; + }; + }), + + // "Whether an element is represented by a :lang() selector + // is based solely on the element's language value + // being equal to the identifier C, + // or beginning with the identifier C immediately followed by "-". + // The matching of C against the element's language value is performed case-insensitively. + // The identifier C does not have to be a valid language name." 
+ // http://www.w3.org/TR/selectors/#lang-pseudo + "lang": markFunction( function( lang ) { + // lang value must be a valid identifier + if ( !ridentifier.test(lang || "") ) { + Sizzle.error( "unsupported lang: " + lang ); + } + lang = lang.replace( runescape, funescape ).toLowerCase(); + return function( elem ) { + var elemLang; + do { + if ( (elemLang = documentIsHTML ? + elem.lang : + elem.getAttribute("xml:lang") || elem.getAttribute("lang")) ) { + + elemLang = elemLang.toLowerCase(); + return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; + } + } while ( (elem = elem.parentNode) && elem.nodeType === 1 ); + return false; + }; + }), + + // Miscellaneous + "target": function( elem ) { + var hash = window.location && window.location.hash; + return hash && hash.slice( 1 ) === elem.id; + }, + + "root": function( elem ) { + return elem === docElem; + }, + + "focus": function( elem ) { + return elem === document.activeElement && (!document.hasFocus || document.hasFocus()) && !!(elem.type || elem.href || ~elem.tabIndex); + }, + + // Boolean properties + "enabled": createDisabledPseudo( false ), + "disabled": createDisabledPseudo( true ), + + "checked": function( elem ) { + // In CSS3, :checked should return both checked and selected elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + var nodeName = elem.nodeName.toLowerCase(); + return (nodeName === "input" && !!elem.checked) || (nodeName === "option" && !!elem.selected); + }, + + "selected": function( elem ) { + // Accessing this property makes selected-by-default + // options in Safari work properly + if ( elem.parentNode ) { + elem.parentNode.selectedIndex; + } + + return elem.selected === true; + }, + + // Contents + "empty": function( elem ) { + // http://www.w3.org/TR/selectors/#empty-pseudo + // :empty is negated by element (1) or content nodes (text: 3; cdata: 4; entity ref: 5), + // but not by others (comment: 8; processing instruction: 7; etc.) 
+ // nodeType < 6 works because attributes (2) do not appear as children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + if ( elem.nodeType < 6 ) { + return false; + } + } + return true; + }, + + "parent": function( elem ) { + return !Expr.pseudos["empty"]( elem ); + }, + + // Element/input types + "header": function( elem ) { + return rheader.test( elem.nodeName ); + }, + + "input": function( elem ) { + return rinputs.test( elem.nodeName ); + }, + + "button": function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === "button" || name === "button"; + }, + + "text": function( elem ) { + var attr; + return elem.nodeName.toLowerCase() === "input" && + elem.type === "text" && + + // Support: IE<8 + // New HTML5 attribute values (e.g., "search") appear with elem.type === "text" + ( (attr = elem.getAttribute("type")) == null || attr.toLowerCase() === "text" ); + }, + + // Position-in-collection + "first": createPositionalPseudo(function() { + return [ 0 ]; + }), + + "last": createPositionalPseudo(function( matchIndexes, length ) { + return [ length - 1 ]; + }), + + "eq": createPositionalPseudo(function( matchIndexes, length, argument ) { + return [ argument < 0 ? argument + length : argument ]; + }), + + "even": createPositionalPseudo(function( matchIndexes, length ) { + var i = 0; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + }), + + "odd": createPositionalPseudo(function( matchIndexes, length ) { + var i = 1; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + }), + + "lt": createPositionalPseudo(function( matchIndexes, length, argument ) { + var i = argument < 0 ? argument + length : argument; + for ( ; --i >= 0; ) { + matchIndexes.push( i ); + } + return matchIndexes; + }), + + "gt": createPositionalPseudo(function( matchIndexes, length, argument ) { + var i = argument < 0 ? 
argument + length : argument; + for ( ; ++i < length; ) { + matchIndexes.push( i ); + } + return matchIndexes; + }) + } +}; + +Expr.pseudos["nth"] = Expr.pseudos["eq"]; + +// Add button/input type pseudos +for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) { + Expr.pseudos[ i ] = createInputPseudo( i ); +} +for ( i in { submit: true, reset: true } ) { + Expr.pseudos[ i ] = createButtonPseudo( i ); +} + +// Easy API for creating new setFilters +function setFilters() {} +setFilters.prototype = Expr.filters = Expr.pseudos; +Expr.setFilters = new setFilters(); + +tokenize = Sizzle.tokenize = function( selector, parseOnly ) { + var matched, match, tokens, type, + soFar, groups, preFilters, + cached = tokenCache[ selector + " " ]; + + if ( cached ) { + return parseOnly ? 0 : cached.slice( 0 ); + } + + soFar = selector; + groups = []; + preFilters = Expr.preFilter; + + while ( soFar ) { + + // Comma and first run + if ( !matched || (match = rcomma.exec( soFar )) ) { + if ( match ) { + // Don't consume trailing commas as valid + soFar = soFar.slice( match[0].length ) || soFar; + } + groups.push( (tokens = []) ); + } + + matched = false; + + // Combinators + if ( (match = rcombinators.exec( soFar )) ) { + matched = match.shift(); + tokens.push({ + value: matched, + // Cast descendant combinators to space + type: match[0].replace( rtrim, " " ) + }); + soFar = soFar.slice( matched.length ); + } + + // Filters + for ( type in Expr.filter ) { + if ( (match = matchExpr[ type ].exec( soFar )) && (!preFilters[ type ] || + (match = preFilters[ type ]( match ))) ) { + matched = match.shift(); + tokens.push({ + value: matched, + type: type, + matches: match + }); + soFar = soFar.slice( matched.length ); + } + } + + if ( !matched ) { + break; + } + } + + // Return the length of the invalid excess + // if we're just parsing + // Otherwise, throw an error or return tokens + return parseOnly ? + soFar.length : + soFar ? 
+ Sizzle.error( selector ) : + // Cache the tokens + tokenCache( selector, groups ).slice( 0 ); +}; + +function toSelector( tokens ) { + var i = 0, + len = tokens.length, + selector = ""; + for ( ; i < len; i++ ) { + selector += tokens[i].value; + } + return selector; +} + +function addCombinator( matcher, combinator, base ) { + var dir = combinator.dir, + skip = combinator.next, + key = skip || dir, + checkNonElements = base && key === "parentNode", + doneName = done++; + + return combinator.first ? + // Check against closest ancestor/preceding element + function( elem, context, xml ) { + while ( (elem = elem[ dir ]) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + return matcher( elem, context, xml ); + } + } + return false; + } : + + // Check against all ancestor/preceding elements + function( elem, context, xml ) { + var oldCache, uniqueCache, outerCache, + newCache = [ dirruns, doneName ]; + + // We can't set arbitrary data on XML nodes, so they don't benefit from combinator caching + if ( xml ) { + while ( (elem = elem[ dir ]) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + if ( matcher( elem, context, xml ) ) { + return true; + } + } + } + } else { + while ( (elem = elem[ dir ]) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + outerCache = elem[ expando ] || (elem[ expando ] = {}); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ elem.uniqueID ] || (outerCache[ elem.uniqueID ] = {}); + + if ( skip && skip === elem.nodeName.toLowerCase() ) { + elem = elem[ dir ] || elem; + } else if ( (oldCache = uniqueCache[ key ]) && + oldCache[ 0 ] === dirruns && oldCache[ 1 ] === doneName ) { + + // Assign to newCache so results back-propagate to previous elements + return (newCache[ 2 ] = oldCache[ 2 ]); + } else { + // Reuse newcache so results back-propagate to previous elements + uniqueCache[ key ] = newCache; + + // A match means we're done; a fail means we have to keep checking + 
if ( (newCache[ 2 ] = matcher( elem, context, xml )) ) { + return true; + } + } + } + } + } + return false; + }; +} + +function elementMatcher( matchers ) { + return matchers.length > 1 ? + function( elem, context, xml ) { + var i = matchers.length; + while ( i-- ) { + if ( !matchers[i]( elem, context, xml ) ) { + return false; + } + } + return true; + } : + matchers[0]; +} + +function multipleContexts( selector, contexts, results ) { + var i = 0, + len = contexts.length; + for ( ; i < len; i++ ) { + Sizzle( selector, contexts[i], results ); + } + return results; +} + +function condense( unmatched, map, filter, context, xml ) { + var elem, + newUnmatched = [], + i = 0, + len = unmatched.length, + mapped = map != null; + + for ( ; i < len; i++ ) { + if ( (elem = unmatched[i]) ) { + if ( !filter || filter( elem, context, xml ) ) { + newUnmatched.push( elem ); + if ( mapped ) { + map.push( i ); + } + } + } + } + + return newUnmatched; +} + +function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) { + if ( postFilter && !postFilter[ expando ] ) { + postFilter = setMatcher( postFilter ); + } + if ( postFinder && !postFinder[ expando ] ) { + postFinder = setMatcher( postFinder, postSelector ); + } + return markFunction(function( seed, results, context, xml ) { + var temp, i, elem, + preMap = [], + postMap = [], + preexisting = results.length, + + // Get initial elements from seed or context + elems = seed || multipleContexts( selector || "*", context.nodeType ? [ context ] : context, [] ), + + // Prefilter to get matcher input, preserving a map for seed-results synchronization + matcherIn = preFilter && ( seed || !selector ) ? + condense( elems, preMap, preFilter, context, xml ) : + elems, + + matcherOut = matcher ? + // If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results, + postFinder || ( seed ? preFilter : preexisting || postFilter ) ? 
+ + // ...intermediate processing is necessary + [] : + + // ...otherwise use results directly + results : + matcherIn; + + // Find primary matches + if ( matcher ) { + matcher( matcherIn, matcherOut, context, xml ); + } + + // Apply postFilter + if ( postFilter ) { + temp = condense( matcherOut, postMap ); + postFilter( temp, [], context, xml ); + + // Un-match failing elements by moving them back to matcherIn + i = temp.length; + while ( i-- ) { + if ( (elem = temp[i]) ) { + matcherOut[ postMap[i] ] = !(matcherIn[ postMap[i] ] = elem); + } + } + } + + if ( seed ) { + if ( postFinder || preFilter ) { + if ( postFinder ) { + // Get the final matcherOut by condensing this intermediate into postFinder contexts + temp = []; + i = matcherOut.length; + while ( i-- ) { + if ( (elem = matcherOut[i]) ) { + // Restore matcherIn since elem is not yet a final match + temp.push( (matcherIn[i] = elem) ); + } + } + postFinder( null, (matcherOut = []), temp, xml ); + } + + // Move matched elements from seed to results to keep them synchronized + i = matcherOut.length; + while ( i-- ) { + if ( (elem = matcherOut[i]) && + (temp = postFinder ? indexOf( seed, elem ) : preMap[i]) > -1 ) { + + seed[temp] = !(results[temp] = elem); + } + } + } + + // Add elements to results, through postFinder if defined + } else { + matcherOut = condense( + matcherOut === results ? + matcherOut.splice( preexisting, matcherOut.length ) : + matcherOut + ); + if ( postFinder ) { + postFinder( null, results, matcherOut, xml ); + } else { + push.apply( results, matcherOut ); + } + } + }); +} + +function matcherFromTokens( tokens ) { + var checkContext, matcher, j, + len = tokens.length, + leadingRelative = Expr.relative[ tokens[0].type ], + implicitRelative = leadingRelative || Expr.relative[" "], + i = leadingRelative ? 
1 : 0, + + // The foundational matcher ensures that elements are reachable from top-level context(s) + matchContext = addCombinator( function( elem ) { + return elem === checkContext; + }, implicitRelative, true ), + matchAnyContext = addCombinator( function( elem ) { + return indexOf( checkContext, elem ) > -1; + }, implicitRelative, true ), + matchers = [ function( elem, context, xml ) { + var ret = ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( + (checkContext = context).nodeType ? + matchContext( elem, context, xml ) : + matchAnyContext( elem, context, xml ) ); + // Avoid hanging onto element (issue #299) + checkContext = null; + return ret; + } ]; + + for ( ; i < len; i++ ) { + if ( (matcher = Expr.relative[ tokens[i].type ]) ) { + matchers = [ addCombinator(elementMatcher( matchers ), matcher) ]; + } else { + matcher = Expr.filter[ tokens[i].type ].apply( null, tokens[i].matches ); + + // Return special upon seeing a positional matcher + if ( matcher[ expando ] ) { + // Find the next relative operator (if any) for proper handling + j = ++i; + for ( ; j < len; j++ ) { + if ( Expr.relative[ tokens[j].type ] ) { + break; + } + } + return setMatcher( + i > 1 && elementMatcher( matchers ), + i > 1 && toSelector( + // If the preceding token was a descendant combinator, insert an implicit any-element `*` + tokens.slice( 0, i - 1 ).concat({ value: tokens[ i - 2 ].type === " " ? 
"*" : "" }) + ).replace( rtrim, "$1" ), + matcher, + i < j && matcherFromTokens( tokens.slice( i, j ) ), + j < len && matcherFromTokens( (tokens = tokens.slice( j )) ), + j < len && toSelector( tokens ) + ); + } + matchers.push( matcher ); + } + } + + return elementMatcher( matchers ); +} + +function matcherFromGroupMatchers( elementMatchers, setMatchers ) { + var bySet = setMatchers.length > 0, + byElement = elementMatchers.length > 0, + superMatcher = function( seed, context, xml, results, outermost ) { + var elem, j, matcher, + matchedCount = 0, + i = "0", + unmatched = seed && [], + setMatched = [], + contextBackup = outermostContext, + // We must always have either seed elements or outermost context + elems = seed || byElement && Expr.find["TAG"]( "*", outermost ), + // Use integer dirruns iff this is the outermost matcher + dirrunsUnique = (dirruns += contextBackup == null ? 1 : Math.random() || 0.1), + len = elems.length; + + if ( outermost ) { + outermostContext = context === document || context || outermost; + } + + // Add elements passing elementMatchers directly to results + // Support: IE<9, Safari + // Tolerate NodeList properties (IE: "length"; Safari: ) matching elements by id + for ( ; i !== len && (elem = elems[i]) != null; i++ ) { + if ( byElement && elem ) { + j = 0; + if ( !context && elem.ownerDocument !== document ) { + setDocument( elem ); + xml = !documentIsHTML; + } + while ( (matcher = elementMatchers[j++]) ) { + if ( matcher( elem, context || document, xml) ) { + results.push( elem ); + break; + } + } + if ( outermost ) { + dirruns = dirrunsUnique; + } + } + + // Track unmatched elements for set filters + if ( bySet ) { + // They will have gone through all possible matchers + if ( (elem = !matcher && elem) ) { + matchedCount--; + } + + // Lengthen the array for every element, matched or not + if ( seed ) { + unmatched.push( elem ); + } + } + } + + // `i` is now the count of elements visited above, and adding it to `matchedCount` + // 
makes the latter nonnegative. + matchedCount += i; + + // Apply set filters to unmatched elements + // NOTE: This can be skipped if there are no unmatched elements (i.e., `matchedCount` + // equals `i`), unless we didn't visit _any_ elements in the above loop because we have + // no element matchers and no seed. + // Incrementing an initially-string "0" `i` allows `i` to remain a string only in that + // case, which will result in a "00" `matchedCount` that differs from `i` but is also + // numerically zero. + if ( bySet && i !== matchedCount ) { + j = 0; + while ( (matcher = setMatchers[j++]) ) { + matcher( unmatched, setMatched, context, xml ); + } + + if ( seed ) { + // Reintegrate element matches to eliminate the need for sorting + if ( matchedCount > 0 ) { + while ( i-- ) { + if ( !(unmatched[i] || setMatched[i]) ) { + setMatched[i] = pop.call( results ); + } + } + } + + // Discard index placeholder values to get only actual matches + setMatched = condense( setMatched ); + } + + // Add matches to results + push.apply( results, setMatched ); + + // Seedless set matches succeeding multiple successful matchers stipulate sorting + if ( outermost && !seed && setMatched.length > 0 && + ( matchedCount + setMatchers.length ) > 1 ) { + + Sizzle.uniqueSort( results ); + } + } + + // Override manipulation of globals by nested matchers + if ( outermost ) { + dirruns = dirrunsUnique; + outermostContext = contextBackup; + } + + return unmatched; + }; + + return bySet ? 
+ markFunction( superMatcher ) : + superMatcher; +} + +compile = Sizzle.compile = function( selector, match /* Internal Use Only */ ) { + var i, + setMatchers = [], + elementMatchers = [], + cached = compilerCache[ selector + " " ]; + + if ( !cached ) { + // Generate a function of recursive functions that can be used to check each element + if ( !match ) { + match = tokenize( selector ); + } + i = match.length; + while ( i-- ) { + cached = matcherFromTokens( match[i] ); + if ( cached[ expando ] ) { + setMatchers.push( cached ); + } else { + elementMatchers.push( cached ); + } + } + + // Cache the compiled function + cached = compilerCache( selector, matcherFromGroupMatchers( elementMatchers, setMatchers ) ); + + // Save selector and tokenization + cached.selector = selector; + } + return cached; +}; + +/** + * A low-level selection function that works with Sizzle's compiled + * selector functions + * @param {String|Function} selector A selector or a pre-compiled + * selector function built with Sizzle.compile + * @param {Element} context + * @param {Array} [results] + * @param {Array} [seed] A set of elements to match against + */ +select = Sizzle.select = function( selector, context, results, seed ) { + var i, tokens, token, type, find, + compiled = typeof selector === "function" && selector, + match = !seed && tokenize( (selector = compiled.selector || selector) ); + + results = results || []; + + // Try to minimize operations if there is only one selector in the list and no seed + // (the latter of which guarantees us context) + if ( match.length === 1 ) { + + // Reduce context if the leading compound selector is an ID + tokens = match[0] = match[0].slice( 0 ); + if ( tokens.length > 2 && (token = tokens[0]).type === "ID" && + context.nodeType === 9 && documentIsHTML && Expr.relative[ tokens[1].type ] ) { + + context = ( Expr.find["ID"]( token.matches[0].replace(runescape, funescape), context ) || [] )[0]; + if ( !context ) { + return results; + + // Precompiled 
matchers will still verify ancestry, so step up a level + } else if ( compiled ) { + context = context.parentNode; + } + + selector = selector.slice( tokens.shift().value.length ); + } + + // Fetch a seed set for right-to-left matching + i = matchExpr["needsContext"].test( selector ) ? 0 : tokens.length; + while ( i-- ) { + token = tokens[i]; + + // Abort if we hit a combinator + if ( Expr.relative[ (type = token.type) ] ) { + break; + } + if ( (find = Expr.find[ type ]) ) { + // Search, expanding context for leading sibling combinators + if ( (seed = find( + token.matches[0].replace( runescape, funescape ), + rsibling.test( tokens[0].type ) && testContext( context.parentNode ) || context + )) ) { + + // If seed is empty or no tokens remain, we can return early + tokens.splice( i, 1 ); + selector = seed.length && toSelector( tokens ); + if ( !selector ) { + push.apply( results, seed ); + return results; + } + + break; + } + } + } + } + + // Compile and execute a filtering function if one is not provided + // Provide `match` to avoid retokenization if we modified the selector above + ( compiled || compile( selector, match ) )( + seed, + context, + !documentIsHTML, + results, + !context || rsibling.test( selector ) && testContext( context.parentNode ) || context + ); + return results; +}; + +// One-time assignments + +// Sort stability +support.sortStable = expando.split("").sort( sortOrder ).join("") === expando; + +// Support: Chrome 14-35+ +// Always assume duplicates if they aren't passed to the comparison function +support.detectDuplicates = !!hasDuplicate; + +// Initialize against the default document +setDocument(); + +// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27) +// Detached nodes confoundingly follow *each other* +support.sortDetached = assert(function( el ) { + // Should return 1, but returns 4 (following) + return el.compareDocumentPosition( document.createElement("fieldset") ) & 1; +}); + +// Support: IE<8 +// Prevent 
attribute/property "interpolation" +// https://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx +if ( !assert(function( el ) { + el.innerHTML = ""; + return el.firstChild.getAttribute("href") === "#" ; +}) ) { + addHandle( "type|href|height|width", function( elem, name, isXML ) { + if ( !isXML ) { + return elem.getAttribute( name, name.toLowerCase() === "type" ? 1 : 2 ); + } + }); +} + +// Support: IE<9 +// Use defaultValue in place of getAttribute("value") +if ( !support.attributes || !assert(function( el ) { + el.innerHTML = ""; + el.firstChild.setAttribute( "value", "" ); + return el.firstChild.getAttribute( "value" ) === ""; +}) ) { + addHandle( "value", function( elem, name, isXML ) { + if ( !isXML && elem.nodeName.toLowerCase() === "input" ) { + return elem.defaultValue; + } + }); +} + +// Support: IE<9 +// Use getAttributeNode to fetch booleans when getAttribute lies +if ( !assert(function( el ) { + return el.getAttribute("disabled") == null; +}) ) { + addHandle( booleans, function( elem, name, isXML ) { + var val; + if ( !isXML ) { + return elem[ name ] === true ? name.toLowerCase() : + (val = elem.getAttributeNode( name )) && val.specified ? 
+ val.value : + null; + } + }); +} + +return Sizzle; + +})( window ); + + + +jQuery.find = Sizzle; +jQuery.expr = Sizzle.selectors; + +// Deprecated +jQuery.expr[ ":" ] = jQuery.expr.pseudos; +jQuery.uniqueSort = jQuery.unique = Sizzle.uniqueSort; +jQuery.text = Sizzle.getText; +jQuery.isXMLDoc = Sizzle.isXML; +jQuery.contains = Sizzle.contains; +jQuery.escapeSelector = Sizzle.escape; + + + + +var dir = function( elem, dir, until ) { + var matched = [], + truncate = until !== undefined; + + while ( ( elem = elem[ dir ] ) && elem.nodeType !== 9 ) { + if ( elem.nodeType === 1 ) { + if ( truncate && jQuery( elem ).is( until ) ) { + break; + } + matched.push( elem ); + } + } + return matched; +}; + + +var siblings = function( n, elem ) { + var matched = []; + + for ( ; n; n = n.nextSibling ) { + if ( n.nodeType === 1 && n !== elem ) { + matched.push( n ); + } + } + + return matched; +}; + + +var rneedsContext = jQuery.expr.match.needsContext; + + + +function nodeName( elem, name ) { + + return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); + +}; +var rsingleTag = ( /^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i ); + + + +var risSimple = /^.[^:#\[\.,]*$/; + +// Implement the identical functionality for filter and not +function winnow( elements, qualifier, not ) { + if ( jQuery.isFunction( qualifier ) ) { + return jQuery.grep( elements, function( elem, i ) { + return !!qualifier.call( elem, i, elem ) !== not; + } ); + } + + // Single element + if ( qualifier.nodeType ) { + return jQuery.grep( elements, function( elem ) { + return ( elem === qualifier ) !== not; + } ); + } + + // Arraylike of elements (jQuery, arguments, Array) + if ( typeof qualifier !== "string" ) { + return jQuery.grep( elements, function( elem ) { + return ( indexOf.call( qualifier, elem ) > -1 ) !== not; + } ); + } + + // Simple selector that can be filtered directly, removing non-Elements + if ( risSimple.test( qualifier ) ) { + return jQuery.filter( 
qualifier, elements, not ); + } + + // Complex selector, compare the two sets, removing non-Elements + qualifier = jQuery.filter( qualifier, elements ); + return jQuery.grep( elements, function( elem ) { + return ( indexOf.call( qualifier, elem ) > -1 ) !== not && elem.nodeType === 1; + } ); +} + +jQuery.filter = function( expr, elems, not ) { + var elem = elems[ 0 ]; + + if ( not ) { + expr = ":not(" + expr + ")"; + } + + if ( elems.length === 1 && elem.nodeType === 1 ) { + return jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : []; + } + + return jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) { + return elem.nodeType === 1; + } ) ); +}; + +jQuery.fn.extend( { + find: function( selector ) { + var i, ret, + len = this.length, + self = this; + + if ( typeof selector !== "string" ) { + return this.pushStack( jQuery( selector ).filter( function() { + for ( i = 0; i < len; i++ ) { + if ( jQuery.contains( self[ i ], this ) ) { + return true; + } + } + } ) ); + } + + ret = this.pushStack( [] ); + + for ( i = 0; i < len; i++ ) { + jQuery.find( selector, self[ i ], ret ); + } + + return len > 1 ? jQuery.uniqueSort( ret ) : ret; + }, + filter: function( selector ) { + return this.pushStack( winnow( this, selector || [], false ) ); + }, + not: function( selector ) { + return this.pushStack( winnow( this, selector || [], true ) ); + }, + is: function( selector ) { + return !!winnow( + this, + + // If this is a positional/relative selector, check membership in the returned set + // so $("p:first").is("p:last") won't return true for a doc with two "p". + typeof selector === "string" && rneedsContext.test( selector ) ? 
+ jQuery( selector ) : + selector || [], + false + ).length; + } +} ); + + +// Initialize a jQuery object + + +// A central reference to the root jQuery(document) +var rootjQuery, + + // A simple way to check for HTML strings + // Prioritize #id over to avoid XSS via location.hash (#9521) + // Strict HTML recognition (#11290: must start with <) + // Shortcut simple #id case for speed + rquickExpr = /^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/, + + init = jQuery.fn.init = function( selector, context, root ) { + var match, elem; + + // HANDLE: $(""), $(null), $(undefined), $(false) + if ( !selector ) { + return this; + } + + // Method init() accepts an alternate rootjQuery + // so migrate can support jQuery.sub (gh-2101) + root = root || rootjQuery; + + // Handle HTML strings + if ( typeof selector === "string" ) { + if ( selector[ 0 ] === "<" && + selector[ selector.length - 1 ] === ">" && + selector.length >= 3 ) { + + // Assume that strings that start and end with <> are HTML and skip the regex check + match = [ null, selector, null ]; + + } else { + match = rquickExpr.exec( selector ); + } + + // Match html or make sure no context is specified for #id + if ( match && ( match[ 1 ] || !context ) ) { + + // HANDLE: $(html) -> $(array) + if ( match[ 1 ] ) { + context = context instanceof jQuery ? context[ 0 ] : context; + + // Option to run scripts is true for back-compat + // Intentionally let the error be thrown if parseHTML is not present + jQuery.merge( this, jQuery.parseHTML( + match[ 1 ], + context && context.nodeType ? 
context.ownerDocument || context : document, + true + ) ); + + // HANDLE: $(html, props) + if ( rsingleTag.test( match[ 1 ] ) && jQuery.isPlainObject( context ) ) { + for ( match in context ) { + + // Properties of context are called as methods if possible + if ( jQuery.isFunction( this[ match ] ) ) { + this[ match ]( context[ match ] ); + + // ...and otherwise set as attributes + } else { + this.attr( match, context[ match ] ); + } + } + } + + return this; + + // HANDLE: $(#id) + } else { + elem = document.getElementById( match[ 2 ] ); + + if ( elem ) { + + // Inject the element directly into the jQuery object + this[ 0 ] = elem; + this.length = 1; + } + return this; + } + + // HANDLE: $(expr, $(...)) + } else if ( !context || context.jquery ) { + return ( context || root ).find( selector ); + + // HANDLE: $(expr, context) + // (which is just equivalent to: $(context).find(expr) + } else { + return this.constructor( context ).find( selector ); + } + + // HANDLE: $(DOMElement) + } else if ( selector.nodeType ) { + this[ 0 ] = selector; + this.length = 1; + return this; + + // HANDLE: $(function) + // Shortcut for document ready + } else if ( jQuery.isFunction( selector ) ) { + return root.ready !== undefined ? 
+ root.ready( selector ) : + + // Execute immediately if ready is not present + selector( jQuery ); + } + + return jQuery.makeArray( selector, this ); + }; + +// Give the init function the jQuery prototype for later instantiation +init.prototype = jQuery.fn; + +// Initialize central reference +rootjQuery = jQuery( document ); + + +var rparentsprev = /^(?:parents|prev(?:Until|All))/, + + // Methods guaranteed to produce a unique set when starting from a unique set + guaranteedUnique = { + children: true, + contents: true, + next: true, + prev: true + }; + +jQuery.fn.extend( { + has: function( target ) { + var targets = jQuery( target, this ), + l = targets.length; + + return this.filter( function() { + var i = 0; + for ( ; i < l; i++ ) { + if ( jQuery.contains( this, targets[ i ] ) ) { + return true; + } + } + } ); + }, + + closest: function( selectors, context ) { + var cur, + i = 0, + l = this.length, + matched = [], + targets = typeof selectors !== "string" && jQuery( selectors ); + + // Positional selectors never match, since there's no _selection_ context + if ( !rneedsContext.test( selectors ) ) { + for ( ; i < l; i++ ) { + for ( cur = this[ i ]; cur && cur !== context; cur = cur.parentNode ) { + + // Always skip document fragments + if ( cur.nodeType < 11 && ( targets ? + targets.index( cur ) > -1 : + + // Don't pass non-elements to Sizzle + cur.nodeType === 1 && + jQuery.find.matchesSelector( cur, selectors ) ) ) { + + matched.push( cur ); + break; + } + } + } + } + + return this.pushStack( matched.length > 1 ? jQuery.uniqueSort( matched ) : matched ); + }, + + // Determine the position of an element within the set + index: function( elem ) { + + // No argument, return index in parent + if ( !elem ) { + return ( this[ 0 ] && this[ 0 ].parentNode ) ? 
this.first().prevAll().length : -1; + } + + // Index in selector + if ( typeof elem === "string" ) { + return indexOf.call( jQuery( elem ), this[ 0 ] ); + } + + // Locate the position of the desired element + return indexOf.call( this, + + // If it receives a jQuery object, the first element is used + elem.jquery ? elem[ 0 ] : elem + ); + }, + + add: function( selector, context ) { + return this.pushStack( + jQuery.uniqueSort( + jQuery.merge( this.get(), jQuery( selector, context ) ) + ) + ); + }, + + addBack: function( selector ) { + return this.add( selector == null ? + this.prevObject : this.prevObject.filter( selector ) + ); + } +} ); + +function sibling( cur, dir ) { + while ( ( cur = cur[ dir ] ) && cur.nodeType !== 1 ) {} + return cur; +} + +jQuery.each( { + parent: function( elem ) { + var parent = elem.parentNode; + return parent && parent.nodeType !== 11 ? parent : null; + }, + parents: function( elem ) { + return dir( elem, "parentNode" ); + }, + parentsUntil: function( elem, i, until ) { + return dir( elem, "parentNode", until ); + }, + next: function( elem ) { + return sibling( elem, "nextSibling" ); + }, + prev: function( elem ) { + return sibling( elem, "previousSibling" ); + }, + nextAll: function( elem ) { + return dir( elem, "nextSibling" ); + }, + prevAll: function( elem ) { + return dir( elem, "previousSibling" ); + }, + nextUntil: function( elem, i, until ) { + return dir( elem, "nextSibling", until ); + }, + prevUntil: function( elem, i, until ) { + return dir( elem, "previousSibling", until ); + }, + siblings: function( elem ) { + return siblings( ( elem.parentNode || {} ).firstChild, elem ); + }, + children: function( elem ) { + return siblings( elem.firstChild ); + }, + contents: function( elem ) { + if ( nodeName( elem, "iframe" ) ) { + return elem.contentDocument; + } + + // Support: IE 9 - 11 only, iOS 7 only, Android Browser <=4.3 only + // Treat the template element as a regular one in browsers that + // don't support it. 
+ if ( nodeName( elem, "template" ) ) { + elem = elem.content || elem; + } + + return jQuery.merge( [], elem.childNodes ); + } +}, function( name, fn ) { + jQuery.fn[ name ] = function( until, selector ) { + var matched = jQuery.map( this, fn, until ); + + if ( name.slice( -5 ) !== "Until" ) { + selector = until; + } + + if ( selector && typeof selector === "string" ) { + matched = jQuery.filter( selector, matched ); + } + + if ( this.length > 1 ) { + + // Remove duplicates + if ( !guaranteedUnique[ name ] ) { + jQuery.uniqueSort( matched ); + } + + // Reverse order for parents* and prev-derivatives + if ( rparentsprev.test( name ) ) { + matched.reverse(); + } + } + + return this.pushStack( matched ); + }; +} ); +var rnothtmlwhite = ( /[^\x20\t\r\n\f]+/g ); + + + +// Convert String-formatted options into Object-formatted ones +function createOptions( options ) { + var object = {}; + jQuery.each( options.match( rnothtmlwhite ) || [], function( _, flag ) { + object[ flag ] = true; + } ); + return object; +} + +/* + * Create a callback list using the following parameters: + * + * options: an optional list of space-separated options that will change how + * the callback list behaves or a more traditional option object + * + * By default a callback list will act like an event callback list and can be + * "fired" multiple times. + * + * Possible options: + * + * once: will ensure the callback list can only be fired once (like a Deferred) + * + * memory: will keep track of previous values and will call any callback added + * after the list has been fired right away with the latest "memorized" + * values (like a Deferred) + * + * unique: will ensure a callback can only be added once (no duplicate in the list) + * + * stopOnFalse: interrupt callings when a callback returns false + * + */ +jQuery.Callbacks = function( options ) { + + // Convert options from String-formatted to Object-formatted if needed + // (we check in cache first) + options = typeof options === "string" ? 
+ createOptions( options ) : + jQuery.extend( {}, options ); + + var // Flag to know if list is currently firing + firing, + + // Last fire value for non-forgettable lists + memory, + + // Flag to know if list was already fired + fired, + + // Flag to prevent firing + locked, + + // Actual callback list + list = [], + + // Queue of execution data for repeatable lists + queue = [], + + // Index of currently firing callback (modified by add/remove as needed) + firingIndex = -1, + + // Fire callbacks + fire = function() { + + // Enforce single-firing + locked = locked || options.once; + + // Execute callbacks for all pending executions, + // respecting firingIndex overrides and runtime changes + fired = firing = true; + for ( ; queue.length; firingIndex = -1 ) { + memory = queue.shift(); + while ( ++firingIndex < list.length ) { + + // Run callback and check for early termination + if ( list[ firingIndex ].apply( memory[ 0 ], memory[ 1 ] ) === false && + options.stopOnFalse ) { + + // Jump to end and forget the data so .add doesn't re-fire + firingIndex = list.length; + memory = false; + } + } + } + + // Forget the data if we're done with it + if ( !options.memory ) { + memory = false; + } + + firing = false; + + // Clean up if we're done firing for good + if ( locked ) { + + // Keep an empty list if we have data for future add calls + if ( memory ) { + list = []; + + // Otherwise, this object is spent + } else { + list = ""; + } + } + }, + + // Actual Callbacks object + self = { + + // Add a callback or a collection of callbacks to the list + add: function() { + if ( list ) { + + // If we have memory from a past run, we should fire after adding + if ( memory && !firing ) { + firingIndex = list.length - 1; + queue.push( memory ); + } + + ( function add( args ) { + jQuery.each( args, function( _, arg ) { + if ( jQuery.isFunction( arg ) ) { + if ( !options.unique || !self.has( arg ) ) { + list.push( arg ); + } + } else if ( arg && arg.length && jQuery.type( arg ) !== 
"string" ) { + + // Inspect recursively + add( arg ); + } + } ); + } )( arguments ); + + if ( memory && !firing ) { + fire(); + } + } + return this; + }, + + // Remove a callback from the list + remove: function() { + jQuery.each( arguments, function( _, arg ) { + var index; + while ( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) { + list.splice( index, 1 ); + + // Handle firing indexes + if ( index <= firingIndex ) { + firingIndex--; + } + } + } ); + return this; + }, + + // Check if a given callback is in the list. + // If no argument is given, return whether or not list has callbacks attached. + has: function( fn ) { + return fn ? + jQuery.inArray( fn, list ) > -1 : + list.length > 0; + }, + + // Remove all callbacks from the list + empty: function() { + if ( list ) { + list = []; + } + return this; + }, + + // Disable .fire and .add + // Abort any current/pending executions + // Clear all callbacks and values + disable: function() { + locked = queue = []; + list = memory = ""; + return this; + }, + disabled: function() { + return !list; + }, + + // Disable .fire + // Also disable .add unless we have memory (since it would have no effect) + // Abort any pending executions + lock: function() { + locked = queue = []; + if ( !memory && !firing ) { + list = memory = ""; + } + return this; + }, + locked: function() { + return !!locked; + }, + + // Call all callbacks with the given context and arguments + fireWith: function( context, args ) { + if ( !locked ) { + args = args || []; + args = [ context, args.slice ? 
args.slice() : args ]; + queue.push( args ); + if ( !firing ) { + fire(); + } + } + return this; + }, + + // Call all the callbacks with the given arguments + fire: function() { + self.fireWith( this, arguments ); + return this; + }, + + // To know if the callbacks have already been called at least once + fired: function() { + return !!fired; + } + }; + + return self; +}; + + +function Identity( v ) { + return v; +} +function Thrower( ex ) { + throw ex; +} + +function adoptValue( value, resolve, reject, noValue ) { + var method; + + try { + + // Check for promise aspect first to privilege synchronous behavior + if ( value && jQuery.isFunction( ( method = value.promise ) ) ) { + method.call( value ).done( resolve ).fail( reject ); + + // Other thenables + } else if ( value && jQuery.isFunction( ( method = value.then ) ) ) { + method.call( value, resolve, reject ); + + // Other non-thenables + } else { + + // Control `resolve` arguments by letting Array#slice cast boolean `noValue` to integer: + // * false: [ value ].slice( 0 ) => resolve( value ) + // * true: [ value ].slice( 1 ) => resolve() + resolve.apply( undefined, [ value ].slice( noValue ) ); + } + + // For Promises/A+, convert exceptions into rejections + // Since jQuery.when doesn't unwrap thenables, we can skip the extra checks appearing in + // Deferred#then to conditionally suppress rejection. + } catch ( value ) { + + // Support: Android 4.0 only + // Strict mode functions invoked without .call/.apply get global-object context + reject.apply( undefined, [ value ] ); + } +} + +jQuery.extend( { + + Deferred: function( func ) { + var tuples = [ + + // action, add listener, callbacks, + // ... 
.then handlers, argument index, [final state] + [ "notify", "progress", jQuery.Callbacks( "memory" ), + jQuery.Callbacks( "memory" ), 2 ], + [ "resolve", "done", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 0, "resolved" ], + [ "reject", "fail", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 1, "rejected" ] + ], + state = "pending", + promise = { + state: function() { + return state; + }, + always: function() { + deferred.done( arguments ).fail( arguments ); + return this; + }, + "catch": function( fn ) { + return promise.then( null, fn ); + }, + + // Keep pipe for back-compat + pipe: function( /* fnDone, fnFail, fnProgress */ ) { + var fns = arguments; + + return jQuery.Deferred( function( newDefer ) { + jQuery.each( tuples, function( i, tuple ) { + + // Map tuples (progress, done, fail) to arguments (done, fail, progress) + var fn = jQuery.isFunction( fns[ tuple[ 4 ] ] ) && fns[ tuple[ 4 ] ]; + + // deferred.progress(function() { bind to newDefer or newDefer.notify }) + // deferred.done(function() { bind to newDefer or newDefer.resolve }) + // deferred.fail(function() { bind to newDefer or newDefer.reject }) + deferred[ tuple[ 1 ] ]( function() { + var returned = fn && fn.apply( this, arguments ); + if ( returned && jQuery.isFunction( returned.promise ) ) { + returned.promise() + .progress( newDefer.notify ) + .done( newDefer.resolve ) + .fail( newDefer.reject ); + } else { + newDefer[ tuple[ 0 ] + "With" ]( + this, + fn ? 
[ returned ] : arguments + ); + } + } ); + } ); + fns = null; + } ).promise(); + }, + then: function( onFulfilled, onRejected, onProgress ) { + var maxDepth = 0; + function resolve( depth, deferred, handler, special ) { + return function() { + var that = this, + args = arguments, + mightThrow = function() { + var returned, then; + + // Support: Promises/A+ section 2.3.3.3.3 + // https://promisesaplus.com/#point-59 + // Ignore double-resolution attempts + if ( depth < maxDepth ) { + return; + } + + returned = handler.apply( that, args ); + + // Support: Promises/A+ section 2.3.1 + // https://promisesaplus.com/#point-48 + if ( returned === deferred.promise() ) { + throw new TypeError( "Thenable self-resolution" ); + } + + // Support: Promises/A+ sections 2.3.3.1, 3.5 + // https://promisesaplus.com/#point-54 + // https://promisesaplus.com/#point-75 + // Retrieve `then` only once + then = returned && + + // Support: Promises/A+ section 2.3.4 + // https://promisesaplus.com/#point-64 + // Only check objects and functions for thenability + ( typeof returned === "object" || + typeof returned === "function" ) && + returned.then; + + // Handle a returned thenable + if ( jQuery.isFunction( then ) ) { + + // Special processors (notify) just wait for resolution + if ( special ) { + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ) + ); + + // Normal processors (resolve) also hook into progress + } else { + + // ...and disregard older resolution values + maxDepth++; + + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ), + resolve( maxDepth, deferred, Identity, + deferred.notifyWith ) + ); + } + + // Handle all other returned values + } else { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Identity ) { + that = undefined; + args = [ returned ]; + } + + // Process the 
value(s) + // Default process is resolve + ( special || deferred.resolveWith )( that, args ); + } + }, + + // Only normal processors (resolve) catch and reject exceptions + process = special ? + mightThrow : + function() { + try { + mightThrow(); + } catch ( e ) { + + if ( jQuery.Deferred.exceptionHook ) { + jQuery.Deferred.exceptionHook( e, + process.stackTrace ); + } + + // Support: Promises/A+ section 2.3.3.3.4.1 + // https://promisesaplus.com/#point-61 + // Ignore post-resolution exceptions + if ( depth + 1 >= maxDepth ) { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Thrower ) { + that = undefined; + args = [ e ]; + } + + deferred.rejectWith( that, args ); + } + } + }; + + // Support: Promises/A+ section 2.3.3.3.1 + // https://promisesaplus.com/#point-57 + // Re-resolve promises immediately to dodge false rejection from + // subsequent errors + if ( depth ) { + process(); + } else { + + // Call an optional hook to record the stack, in case of exception + // since it's otherwise lost when execution goes async + if ( jQuery.Deferred.getStackHook ) { + process.stackTrace = jQuery.Deferred.getStackHook(); + } + window.setTimeout( process ); + } + }; + } + + return jQuery.Deferred( function( newDefer ) { + + // progress_handlers.add( ... ) + tuples[ 0 ][ 3 ].add( + resolve( + 0, + newDefer, + jQuery.isFunction( onProgress ) ? + onProgress : + Identity, + newDefer.notifyWith + ) + ); + + // fulfilled_handlers.add( ... ) + tuples[ 1 ][ 3 ].add( + resolve( + 0, + newDefer, + jQuery.isFunction( onFulfilled ) ? + onFulfilled : + Identity + ) + ); + + // rejected_handlers.add( ... ) + tuples[ 2 ][ 3 ].add( + resolve( + 0, + newDefer, + jQuery.isFunction( onRejected ) ? + onRejected : + Thrower + ) + ); + } ).promise(); + }, + + // Get a promise for this deferred + // If obj is provided, the promise aspect is added to the object + promise: function( obj ) { + return obj != null ? 
jQuery.extend( obj, promise ) : promise; + } + }, + deferred = {}; + + // Add list-specific methods + jQuery.each( tuples, function( i, tuple ) { + var list = tuple[ 2 ], + stateString = tuple[ 5 ]; + + // promise.progress = list.add + // promise.done = list.add + // promise.fail = list.add + promise[ tuple[ 1 ] ] = list.add; + + // Handle state + if ( stateString ) { + list.add( + function() { + + // state = "resolved" (i.e., fulfilled) + // state = "rejected" + state = stateString; + }, + + // rejected_callbacks.disable + // fulfilled_callbacks.disable + tuples[ 3 - i ][ 2 ].disable, + + // progress_callbacks.lock + tuples[ 0 ][ 2 ].lock + ); + } + + // progress_handlers.fire + // fulfilled_handlers.fire + // rejected_handlers.fire + list.add( tuple[ 3 ].fire ); + + // deferred.notify = function() { deferred.notifyWith(...) } + // deferred.resolve = function() { deferred.resolveWith(...) } + // deferred.reject = function() { deferred.rejectWith(...) } + deferred[ tuple[ 0 ] ] = function() { + deferred[ tuple[ 0 ] + "With" ]( this === deferred ? undefined : this, arguments ); + return this; + }; + + // deferred.notifyWith = list.fireWith + // deferred.resolveWith = list.fireWith + // deferred.rejectWith = list.fireWith + deferred[ tuple[ 0 ] + "With" ] = list.fireWith; + } ); + + // Make the deferred a promise + promise.promise( deferred ); + + // Call given func if any + if ( func ) { + func.call( deferred, deferred ); + } + + // All done! 
+ return deferred; + }, + + // Deferred helper + when: function( singleValue ) { + var + + // count of uncompleted subordinates + remaining = arguments.length, + + // count of unprocessed arguments + i = remaining, + + // subordinate fulfillment data + resolveContexts = Array( i ), + resolveValues = slice.call( arguments ), + + // the master Deferred + master = jQuery.Deferred(), + + // subordinate callback factory + updateFunc = function( i ) { + return function( value ) { + resolveContexts[ i ] = this; + resolveValues[ i ] = arguments.length > 1 ? slice.call( arguments ) : value; + if ( !( --remaining ) ) { + master.resolveWith( resolveContexts, resolveValues ); + } + }; + }; + + // Single- and empty arguments are adopted like Promise.resolve + if ( remaining <= 1 ) { + adoptValue( singleValue, master.done( updateFunc( i ) ).resolve, master.reject, + !remaining ); + + // Use .then() to unwrap secondary thenables (cf. gh-3000) + if ( master.state() === "pending" || + jQuery.isFunction( resolveValues[ i ] && resolveValues[ i ].then ) ) { + + return master.then(); + } + } + + // Multiple arguments are aggregated like Promise.all array elements + while ( i-- ) { + adoptValue( resolveValues[ i ], updateFunc( i ), master.reject ); + } + + return master.promise(); + } +} ); + + +// These usually indicate a programmer mistake during development, +// warn about them ASAP rather than swallowing them by default. 
+var rerrorNames = /^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/; + +jQuery.Deferred.exceptionHook = function( error, stack ) { + + // Support: IE 8 - 9 only + // Console exists when dev tools are open, which can happen at any time + if ( window.console && window.console.warn && error && rerrorNames.test( error.name ) ) { + window.console.warn( "jQuery.Deferred exception: " + error.message, error.stack, stack ); + } +}; + + + + +jQuery.readyException = function( error ) { + window.setTimeout( function() { + throw error; + } ); +}; + + + + +// The deferred used on DOM ready +var readyList = jQuery.Deferred(); + +jQuery.fn.ready = function( fn ) { + + readyList + .then( fn ) + + // Wrap jQuery.readyException in a function so that the lookup + // happens at the time of error handling instead of callback + // registration. + .catch( function( error ) { + jQuery.readyException( error ); + } ); + + return this; +}; + +jQuery.extend( { + + // Is the DOM ready to be used? Set to true once it occurs. + isReady: false, + + // A counter to track how many items to wait for before + // the ready event fires. See #6781 + readyWait: 1, + + // Handle when the DOM is ready + ready: function( wait ) { + + // Abort if there are pending holds or we're already ready + if ( wait === true ? 
--jQuery.readyWait : jQuery.isReady ) { + return; + } + + // Remember that the DOM is ready + jQuery.isReady = true; + + // If a normal DOM Ready event fired, decrement, and wait if need be + if ( wait !== true && --jQuery.readyWait > 0 ) { + return; + } + + // If there are functions bound, to execute + readyList.resolveWith( document, [ jQuery ] ); + } +} ); + +jQuery.ready.then = readyList.then; + +// The ready event handler and self cleanup method +function completed() { + document.removeEventListener( "DOMContentLoaded", completed ); + window.removeEventListener( "load", completed ); + jQuery.ready(); +} + +// Catch cases where $(document).ready() is called +// after the browser event has already occurred. +// Support: IE <=9 - 10 only +// Older IE sometimes signals "interactive" too soon +if ( document.readyState === "complete" || + ( document.readyState !== "loading" && !document.documentElement.doScroll ) ) { + + // Handle it asynchronously to allow scripts the opportunity to delay ready + window.setTimeout( jQuery.ready ); + +} else { + + // Use the handy event callback + document.addEventListener( "DOMContentLoaded", completed ); + + // A fallback to window.onload, that will always work + window.addEventListener( "load", completed ); +} + + + + +// Multifunctional method to get and set values of a collection +// The value/s can optionally be executed if it's a function +var access = function( elems, fn, key, value, chainable, emptyGet, raw ) { + var i = 0, + len = elems.length, + bulk = key == null; + + // Sets many values + if ( jQuery.type( key ) === "object" ) { + chainable = true; + for ( i in key ) { + access( elems, fn, i, key[ i ], true, emptyGet, raw ); + } + + // Sets one value + } else if ( value !== undefined ) { + chainable = true; + + if ( !jQuery.isFunction( value ) ) { + raw = true; + } + + if ( bulk ) { + + // Bulk operations run against the entire set + if ( raw ) { + fn.call( elems, value ); + fn = null; + + // ...except when executing 
function values + } else { + bulk = fn; + fn = function( elem, key, value ) { + return bulk.call( jQuery( elem ), value ); + }; + } + } + + if ( fn ) { + for ( ; i < len; i++ ) { + fn( + elems[ i ], key, raw ? + value : + value.call( elems[ i ], i, fn( elems[ i ], key ) ) + ); + } + } + } + + if ( chainable ) { + return elems; + } + + // Gets + if ( bulk ) { + return fn.call( elems ); + } + + return len ? fn( elems[ 0 ], key ) : emptyGet; +}; +var acceptData = function( owner ) { + + // Accepts only: + // - Node + // - Node.ELEMENT_NODE + // - Node.DOCUMENT_NODE + // - Object + // - Any + return owner.nodeType === 1 || owner.nodeType === 9 || !( +owner.nodeType ); +}; + + + + +function Data() { + this.expando = jQuery.expando + Data.uid++; +} + +Data.uid = 1; + +Data.prototype = { + + cache: function( owner ) { + + // Check if the owner object already has a cache + var value = owner[ this.expando ]; + + // If not, create one + if ( !value ) { + value = {}; + + // We can accept data for non-element nodes in modern browsers, + // but we should not, see #8335. + // Always return an empty object. 
+ if ( acceptData( owner ) ) { + + // If it is a node unlikely to be stringify-ed or looped over + // use plain assignment + if ( owner.nodeType ) { + owner[ this.expando ] = value; + + // Otherwise secure it in a non-enumerable property + // configurable must be true to allow the property to be + // deleted when data is removed + } else { + Object.defineProperty( owner, this.expando, { + value: value, + configurable: true + } ); + } + } + } + + return value; + }, + set: function( owner, data, value ) { + var prop, + cache = this.cache( owner ); + + // Handle: [ owner, key, value ] args + // Always use camelCase key (gh-2257) + if ( typeof data === "string" ) { + cache[ jQuery.camelCase( data ) ] = value; + + // Handle: [ owner, { properties } ] args + } else { + + // Copy the properties one-by-one to the cache object + for ( prop in data ) { + cache[ jQuery.camelCase( prop ) ] = data[ prop ]; + } + } + return cache; + }, + get: function( owner, key ) { + return key === undefined ? + this.cache( owner ) : + + // Always use camelCase key (gh-2257) + owner[ this.expando ] && owner[ this.expando ][ jQuery.camelCase( key ) ]; + }, + access: function( owner, key, value ) { + + // In cases where either: + // + // 1. No key was specified + // 2. A string key was specified, but no value provided + // + // Take the "read" path and allow the get method to determine + // which value to return, respectively either: + // + // 1. The entire cache object + // 2. The data stored at the key + // + if ( key === undefined || + ( ( key && typeof key === "string" ) && value === undefined ) ) { + + return this.get( owner, key ); + } + + // When the key is not a string, or both a key and value + // are specified, set or extend (existing objects) with either: + // + // 1. An object of properties + // 2. 
A key and value + // + this.set( owner, key, value ); + + // Since the "set" path can have two possible entry points + // return the expected data based on which path was taken[*] + return value !== undefined ? value : key; + }, + remove: function( owner, key ) { + var i, + cache = owner[ this.expando ]; + + if ( cache === undefined ) { + return; + } + + if ( key !== undefined ) { + + // Support array or space separated string of keys + if ( Array.isArray( key ) ) { + + // If key is an array of keys... + // We always set camelCase keys, so remove that. + key = key.map( jQuery.camelCase ); + } else { + key = jQuery.camelCase( key ); + + // If a key with the spaces exists, use it. + // Otherwise, create an array by matching non-whitespace + key = key in cache ? + [ key ] : + ( key.match( rnothtmlwhite ) || [] ); + } + + i = key.length; + + while ( i-- ) { + delete cache[ key[ i ] ]; + } + } + + // Remove the expando if there's no more data + if ( key === undefined || jQuery.isEmptyObject( cache ) ) { + + // Support: Chrome <=35 - 45 + // Webkit & Blink performance suffers when deleting properties + // from DOM nodes, so set to undefined instead + // https://bugs.chromium.org/p/chromium/issues/detail?id=378607 (bug restricted) + if ( owner.nodeType ) { + owner[ this.expando ] = undefined; + } else { + delete owner[ this.expando ]; + } + } + }, + hasData: function( owner ) { + var cache = owner[ this.expando ]; + return cache !== undefined && !jQuery.isEmptyObject( cache ); + } +}; +var dataPriv = new Data(); + +var dataUser = new Data(); + + + +// Implementation Summary +// +// 1. Enforce API surface and semantic compatibility with 1.9.x branch +// 2. Improve the module's maintainability by reducing the storage +// paths to a single mechanism. +// 3. Use the same single mechanism to support "private" and "user" data. +// 4. _Never_ expose "private" data to user code (TODO: Drop _data, _removeData) +// 5. Avoid exposing implementation details on user objects (eg. 
expando properties) +// 6. Provide a clear path for implementation upgrade to WeakMap in 2014 + +var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, + rmultiDash = /[A-Z]/g; + +function getData( data ) { + if ( data === "true" ) { + return true; + } + + if ( data === "false" ) { + return false; + } + + if ( data === "null" ) { + return null; + } + + // Only convert to a number if it doesn't change the string + if ( data === +data + "" ) { + return +data; + } + + if ( rbrace.test( data ) ) { + return JSON.parse( data ); + } + + return data; +} + +function dataAttr( elem, key, data ) { + var name; + + // If nothing was found internally, try to fetch any + // data from the HTML5 data-* attribute + if ( data === undefined && elem.nodeType === 1 ) { + name = "data-" + key.replace( rmultiDash, "-$&" ).toLowerCase(); + data = elem.getAttribute( name ); + + if ( typeof data === "string" ) { + try { + data = getData( data ); + } catch ( e ) {} + + // Make sure we set the data so it isn't changed later + dataUser.set( elem, key, data ); + } else { + data = undefined; + } + } + return data; +} + +jQuery.extend( { + hasData: function( elem ) { + return dataUser.hasData( elem ) || dataPriv.hasData( elem ); + }, + + data: function( elem, name, data ) { + return dataUser.access( elem, name, data ); + }, + + removeData: function( elem, name ) { + dataUser.remove( elem, name ); + }, + + // TODO: Now that all calls to _data and _removeData have been replaced + // with direct calls to dataPriv methods, these can be deprecated. 
+ _data: function( elem, name, data ) { + return dataPriv.access( elem, name, data ); + }, + + _removeData: function( elem, name ) { + dataPriv.remove( elem, name ); + } +} ); + +jQuery.fn.extend( { + data: function( key, value ) { + var i, name, data, + elem = this[ 0 ], + attrs = elem && elem.attributes; + + // Gets all values + if ( key === undefined ) { + if ( this.length ) { + data = dataUser.get( elem ); + + if ( elem.nodeType === 1 && !dataPriv.get( elem, "hasDataAttrs" ) ) { + i = attrs.length; + while ( i-- ) { + + // Support: IE 11 only + // The attrs elements can be null (#14894) + if ( attrs[ i ] ) { + name = attrs[ i ].name; + if ( name.indexOf( "data-" ) === 0 ) { + name = jQuery.camelCase( name.slice( 5 ) ); + dataAttr( elem, name, data[ name ] ); + } + } + } + dataPriv.set( elem, "hasDataAttrs", true ); + } + } + + return data; + } + + // Sets multiple values + if ( typeof key === "object" ) { + return this.each( function() { + dataUser.set( this, key ); + } ); + } + + return access( this, function( value ) { + var data; + + // The calling jQuery object (element matches) is not empty + // (and therefore has an element appears at this[ 0 ]) and the + // `value` parameter was not undefined. An empty jQuery object + // will result in `undefined` for elem = this[ 0 ] which will + // throw an exception if an attempt to read a data cache is made. + if ( elem && value === undefined ) { + + // Attempt to get data from the cache + // The key will always be camelCased in Data + data = dataUser.get( elem, key ); + if ( data !== undefined ) { + return data; + } + + // Attempt to "discover" the data in + // HTML5 custom data-* attrs + data = dataAttr( elem, key ); + if ( data !== undefined ) { + return data; + } + + // We tried really hard, but the data doesn't exist. + return; + } + + // Set the data... 
+ this.each( function() { + + // We always store the camelCased key + dataUser.set( this, key, value ); + } ); + }, null, value, arguments.length > 1, null, true ); + }, + + removeData: function( key ) { + return this.each( function() { + dataUser.remove( this, key ); + } ); + } +} ); + + +jQuery.extend( { + queue: function( elem, type, data ) { + var queue; + + if ( elem ) { + type = ( type || "fx" ) + "queue"; + queue = dataPriv.get( elem, type ); + + // Speed up dequeue by getting out quickly if this is just a lookup + if ( data ) { + if ( !queue || Array.isArray( data ) ) { + queue = dataPriv.access( elem, type, jQuery.makeArray( data ) ); + } else { + queue.push( data ); + } + } + return queue || []; + } + }, + + dequeue: function( elem, type ) { + type = type || "fx"; + + var queue = jQuery.queue( elem, type ), + startLength = queue.length, + fn = queue.shift(), + hooks = jQuery._queueHooks( elem, type ), + next = function() { + jQuery.dequeue( elem, type ); + }; + + // If the fx queue is dequeued, always remove the progress sentinel + if ( fn === "inprogress" ) { + fn = queue.shift(); + startLength--; + } + + if ( fn ) { + + // Add a progress sentinel to prevent the fx queue from being + // automatically dequeued + if ( type === "fx" ) { + queue.unshift( "inprogress" ); + } + + // Clear up the last queue stop function + delete hooks.stop; + fn.call( elem, next, hooks ); + } + + if ( !startLength && hooks ) { + hooks.empty.fire(); + } + }, + + // Not public - generate a queueHooks object, or return the current one + _queueHooks: function( elem, type ) { + var key = type + "queueHooks"; + return dataPriv.get( elem, key ) || dataPriv.access( elem, key, { + empty: jQuery.Callbacks( "once memory" ).add( function() { + dataPriv.remove( elem, [ type + "queue", key ] ); + } ) + } ); + } +} ); + +jQuery.fn.extend( { + queue: function( type, data ) { + var setter = 2; + + if ( typeof type !== "string" ) { + data = type; + type = "fx"; + setter--; + } + + if ( 
arguments.length < setter ) { + return jQuery.queue( this[ 0 ], type ); + } + + return data === undefined ? + this : + this.each( function() { + var queue = jQuery.queue( this, type, data ); + + // Ensure a hooks for this queue + jQuery._queueHooks( this, type ); + + if ( type === "fx" && queue[ 0 ] !== "inprogress" ) { + jQuery.dequeue( this, type ); + } + } ); + }, + dequeue: function( type ) { + return this.each( function() { + jQuery.dequeue( this, type ); + } ); + }, + clearQueue: function( type ) { + return this.queue( type || "fx", [] ); + }, + + // Get a promise resolved when queues of a certain type + // are emptied (fx is the type by default) + promise: function( type, obj ) { + var tmp, + count = 1, + defer = jQuery.Deferred(), + elements = this, + i = this.length, + resolve = function() { + if ( !( --count ) ) { + defer.resolveWith( elements, [ elements ] ); + } + }; + + if ( typeof type !== "string" ) { + obj = type; + type = undefined; + } + type = type || "fx"; + + while ( i-- ) { + tmp = dataPriv.get( elements[ i ], type + "queueHooks" ); + if ( tmp && tmp.empty ) { + count++; + tmp.empty.add( resolve ); + } + } + resolve(); + return defer.promise( obj ); + } +} ); +var pnum = ( /[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/ ).source; + +var rcssNum = new RegExp( "^(?:([+-])=|)(" + pnum + ")([a-z%]*)$", "i" ); + + +var cssExpand = [ "Top", "Right", "Bottom", "Left" ]; + +var isHiddenWithinTree = function( elem, el ) { + + // isHiddenWithinTree might be called from jQuery#filter function; + // in that case, element will be second argument + elem = el || elem; + + // Inline style trumps all + return elem.style.display === "none" || + elem.style.display === "" && + + // Otherwise, check computed style + // Support: Firefox <=43 - 45 + // Disconnected elements can have computed display: none, so first confirm that elem is + // in the document. 
+ jQuery.contains( elem.ownerDocument, elem ) && + + jQuery.css( elem, "display" ) === "none"; + }; + +var swap = function( elem, options, callback, args ) { + var ret, name, + old = {}; + + // Remember the old values, and insert the new ones + for ( name in options ) { + old[ name ] = elem.style[ name ]; + elem.style[ name ] = options[ name ]; + } + + ret = callback.apply( elem, args || [] ); + + // Revert the old values + for ( name in options ) { + elem.style[ name ] = old[ name ]; + } + + return ret; +}; + + + + +function adjustCSS( elem, prop, valueParts, tween ) { + var adjusted, + scale = 1, + maxIterations = 20, + currentValue = tween ? + function() { + return tween.cur(); + } : + function() { + return jQuery.css( elem, prop, "" ); + }, + initial = currentValue(), + unit = valueParts && valueParts[ 3 ] || ( jQuery.cssNumber[ prop ] ? "" : "px" ), + + // Starting value computation is required for potential unit mismatches + initialInUnit = ( jQuery.cssNumber[ prop ] || unit !== "px" && +initial ) && + rcssNum.exec( jQuery.css( elem, prop ) ); + + if ( initialInUnit && initialInUnit[ 3 ] !== unit ) { + + // Trust units reported by jQuery.css + unit = unit || initialInUnit[ 3 ]; + + // Make sure we update the tween properties later on + valueParts = valueParts || []; + + // Iteratively approximate from a nonzero starting point + initialInUnit = +initial || 1; + + do { + + // If previous iteration zeroed out, double until we get *something*. + // Use string for doubling so we don't accidentally see scale as unchanged below + scale = scale || ".5"; + + // Adjust and apply + initialInUnit = initialInUnit / scale; + jQuery.style( elem, prop, initialInUnit + unit ); + + // Update scale, tolerating zero or NaN from tween.cur() + // Break the loop if scale is unchanged or perfect, or if we've just had enough. 
+ } while ( + scale !== ( scale = currentValue() / initial ) && scale !== 1 && --maxIterations + ); + } + + if ( valueParts ) { + initialInUnit = +initialInUnit || +initial || 0; + + // Apply relative offset (+=/-=) if specified + adjusted = valueParts[ 1 ] ? + initialInUnit + ( valueParts[ 1 ] + 1 ) * valueParts[ 2 ] : + +valueParts[ 2 ]; + if ( tween ) { + tween.unit = unit; + tween.start = initialInUnit; + tween.end = adjusted; + } + } + return adjusted; +} + + +var defaultDisplayMap = {}; + +function getDefaultDisplay( elem ) { + var temp, + doc = elem.ownerDocument, + nodeName = elem.nodeName, + display = defaultDisplayMap[ nodeName ]; + + if ( display ) { + return display; + } + + temp = doc.body.appendChild( doc.createElement( nodeName ) ); + display = jQuery.css( temp, "display" ); + + temp.parentNode.removeChild( temp ); + + if ( display === "none" ) { + display = "block"; + } + defaultDisplayMap[ nodeName ] = display; + + return display; +} + +function showHide( elements, show ) { + var display, elem, + values = [], + index = 0, + length = elements.length; + + // Determine new display value for elements that need to change + for ( ; index < length; index++ ) { + elem = elements[ index ]; + if ( !elem.style ) { + continue; + } + + display = elem.style.display; + if ( show ) { + + // Since we force visibility upon cascade-hidden elements, an immediate (and slow) + // check is required in this first loop unless we have a nonempty display value (either + // inline or about-to-be-restored) + if ( display === "none" ) { + values[ index ] = dataPriv.get( elem, "display" ) || null; + if ( !values[ index ] ) { + elem.style.display = ""; + } + } + if ( elem.style.display === "" && isHiddenWithinTree( elem ) ) { + values[ index ] = getDefaultDisplay( elem ); + } + } else { + if ( display !== "none" ) { + values[ index ] = "none"; + + // Remember what we're overwriting + dataPriv.set( elem, "display", display ); + } + } + } + + // Set the display of the elements in a 
second loop to avoid constant reflow + for ( index = 0; index < length; index++ ) { + if ( values[ index ] != null ) { + elements[ index ].style.display = values[ index ]; + } + } + + return elements; +} + +jQuery.fn.extend( { + show: function() { + return showHide( this, true ); + }, + hide: function() { + return showHide( this ); + }, + toggle: function( state ) { + if ( typeof state === "boolean" ) { + return state ? this.show() : this.hide(); + } + + return this.each( function() { + if ( isHiddenWithinTree( this ) ) { + jQuery( this ).show(); + } else { + jQuery( this ).hide(); + } + } ); + } +} ); +var rcheckableType = ( /^(?:checkbox|radio)$/i ); + +var rtagName = ( /<([a-z][^\/\0>\x20\t\r\n\f]+)/i ); + +var rscriptType = ( /^$|\/(?:java|ecma)script/i ); + + + +// We have to close these tags to support XHTML (#13200) +var wrapMap = { + + // Support: IE <=9 only + option: [ 1, "" ], + + // XHTML parsers do not magically insert elements in the + // same way that tag soup parsers do. So we cannot shorten + // this by omitting or other required elements. + thead: [ 1, "", "
" ], + col: [ 2, "", "
" ], + tr: [ 2, "", "
" ], + td: [ 3, "", "
" ], + + _default: [ 0, "", "" ] +}; + +// Support: IE <=9 only +wrapMap.optgroup = wrapMap.option; + +wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.thead; +wrapMap.th = wrapMap.td; + + +function getAll( context, tag ) { + + // Support: IE <=9 - 11 only + // Use typeof to avoid zero-argument method invocation on host objects (#15151) + var ret; + + if ( typeof context.getElementsByTagName !== "undefined" ) { + ret = context.getElementsByTagName( tag || "*" ); + + } else if ( typeof context.querySelectorAll !== "undefined" ) { + ret = context.querySelectorAll( tag || "*" ); + + } else { + ret = []; + } + + if ( tag === undefined || tag && nodeName( context, tag ) ) { + return jQuery.merge( [ context ], ret ); + } + + return ret; +} + + +// Mark scripts as having already been evaluated +function setGlobalEval( elems, refElements ) { + var i = 0, + l = elems.length; + + for ( ; i < l; i++ ) { + dataPriv.set( + elems[ i ], + "globalEval", + !refElements || dataPriv.get( refElements[ i ], "globalEval" ) + ); + } +} + + +var rhtml = /<|&#?\w+;/; + +function buildFragment( elems, context, scripts, selection, ignored ) { + var elem, tmp, tag, wrap, contains, j, + fragment = context.createDocumentFragment(), + nodes = [], + i = 0, + l = elems.length; + + for ( ; i < l; i++ ) { + elem = elems[ i ]; + + if ( elem || elem === 0 ) { + + // Add nodes directly + if ( jQuery.type( elem ) === "object" ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( nodes, elem.nodeType ? 
[ elem ] : elem ); + + // Convert non-html into a text node + } else if ( !rhtml.test( elem ) ) { + nodes.push( context.createTextNode( elem ) ); + + // Convert html into DOM nodes + } else { + tmp = tmp || fragment.appendChild( context.createElement( "div" ) ); + + // Deserialize a standard representation + tag = ( rtagName.exec( elem ) || [ "", "" ] )[ 1 ].toLowerCase(); + wrap = wrapMap[ tag ] || wrapMap._default; + tmp.innerHTML = wrap[ 1 ] + jQuery.htmlPrefilter( elem ) + wrap[ 2 ]; + + // Descend through wrappers to the right content + j = wrap[ 0 ]; + while ( j-- ) { + tmp = tmp.lastChild; + } + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( nodes, tmp.childNodes ); + + // Remember the top-level container + tmp = fragment.firstChild; + + // Ensure the created nodes are orphaned (#12392) + tmp.textContent = ""; + } + } + } + + // Remove wrapper from fragment + fragment.textContent = ""; + + i = 0; + while ( ( elem = nodes[ i++ ] ) ) { + + // Skip elements already in the context collection (trac-4087) + if ( selection && jQuery.inArray( elem, selection ) > -1 ) { + if ( ignored ) { + ignored.push( elem ); + } + continue; + } + + contains = jQuery.contains( elem.ownerDocument, elem ); + + // Append to fragment + tmp = getAll( fragment.appendChild( elem ), "script" ); + + // Preserve script evaluation history + if ( contains ) { + setGlobalEval( tmp ); + } + + // Capture executables + if ( scripts ) { + j = 0; + while ( ( elem = tmp[ j++ ] ) ) { + if ( rscriptType.test( elem.type || "" ) ) { + scripts.push( elem ); + } + } + } + } + + return fragment; +} + + +( function() { + var fragment = document.createDocumentFragment(), + div = fragment.appendChild( document.createElement( "div" ) ), + input = document.createElement( "input" ); + + // Support: Android 4.0 - 4.3 only + // Check state lost if the name is set (#11217) + // Support: Windows Web Apps (WWA) + // `name` and `type` must use 
.setAttribute for WWA (#14901) + input.setAttribute( "type", "radio" ); + input.setAttribute( "checked", "checked" ); + input.setAttribute( "name", "t" ); + + div.appendChild( input ); + + // Support: Android <=4.1 only + // Older WebKit doesn't clone checked state correctly in fragments + support.checkClone = div.cloneNode( true ).cloneNode( true ).lastChild.checked; + + // Support: IE <=11 only + // Make sure textarea (and checkbox) defaultValue is properly cloned + div.innerHTML = ""; + support.noCloneChecked = !!div.cloneNode( true ).lastChild.defaultValue; +} )(); +var documentElement = document.documentElement; + + + +var + rkeyEvent = /^key/, + rmouseEvent = /^(?:mouse|pointer|contextmenu|drag|drop)|click/, + rtypenamespace = /^([^.]*)(?:\.(.+)|)/; + +function returnTrue() { + return true; +} + +function returnFalse() { + return false; +} + +// Support: IE <=9 only +// See #13393 for more info +function safeActiveElement() { + try { + return document.activeElement; + } catch ( err ) { } +} + +function on( elem, types, selector, data, fn, one ) { + var origFn, type; + + // Types can be a map of types/handlers + if ( typeof types === "object" ) { + + // ( types-Object, selector, data ) + if ( typeof selector !== "string" ) { + + // ( types-Object, data ) + data = data || selector; + selector = undefined; + } + for ( type in types ) { + on( elem, type, selector, data, types[ type ], one ); + } + return elem; + } + + if ( data == null && fn == null ) { + + // ( types, fn ) + fn = selector; + data = selector = undefined; + } else if ( fn == null ) { + if ( typeof selector === "string" ) { + + // ( types, selector, fn ) + fn = data; + data = undefined; + } else { + + // ( types, data, fn ) + fn = data; + data = selector; + selector = undefined; + } + } + if ( fn === false ) { + fn = returnFalse; + } else if ( !fn ) { + return elem; + } + + if ( one === 1 ) { + origFn = fn; + fn = function( event ) { + + // Can use an empty set, since event contains the info + 
jQuery().off( event ); + return origFn.apply( this, arguments ); + }; + + // Use same guid so caller can remove using origFn + fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ ); + } + return elem.each( function() { + jQuery.event.add( this, types, fn, data, selector ); + } ); +} + +/* + * Helper functions for managing events -- not part of the public interface. + * Props to Dean Edwards' addEvent library for many of the ideas. + */ +jQuery.event = { + + global: {}, + + add: function( elem, types, handler, data, selector ) { + + var handleObjIn, eventHandle, tmp, + events, t, handleObj, + special, handlers, type, namespaces, origType, + elemData = dataPriv.get( elem ); + + // Don't attach events to noData or text/comment nodes (but allow plain objects) + if ( !elemData ) { + return; + } + + // Caller can pass in an object of custom data in lieu of the handler + if ( handler.handler ) { + handleObjIn = handler; + handler = handleObjIn.handler; + selector = handleObjIn.selector; + } + + // Ensure that invalid selectors throw exceptions at attach time + // Evaluate against documentElement in case elem is a non-element node (e.g., document) + if ( selector ) { + jQuery.find.matchesSelector( documentElement, selector ); + } + + // Make sure that the handler has a unique ID, used to find/remove it later + if ( !handler.guid ) { + handler.guid = jQuery.guid++; + } + + // Init the element's event structure and main handler, if this is the first + if ( !( events = elemData.events ) ) { + events = elemData.events = {}; + } + if ( !( eventHandle = elemData.handle ) ) { + eventHandle = elemData.handle = function( e ) { + + // Discard the second event of a jQuery.event.trigger() and + // when an event is called after a page has unloaded + return typeof jQuery !== "undefined" && jQuery.event.triggered !== e.type ? 
+ jQuery.event.dispatch.apply( elem, arguments ) : undefined; + }; + } + + // Handle multiple events separated by a space + types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; + t = types.length; + while ( t-- ) { + tmp = rtypenamespace.exec( types[ t ] ) || []; + type = origType = tmp[ 1 ]; + namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); + + // There *must* be a type, no attaching namespace-only handlers + if ( !type ) { + continue; + } + + // If event changes its type, use the special event handlers for the changed type + special = jQuery.event.special[ type ] || {}; + + // If selector defined, determine special event api type, otherwise given type + type = ( selector ? special.delegateType : special.bindType ) || type; + + // Update special based on newly reset type + special = jQuery.event.special[ type ] || {}; + + // handleObj is passed to all event handlers + handleObj = jQuery.extend( { + type: type, + origType: origType, + data: data, + handler: handler, + guid: handler.guid, + selector: selector, + needsContext: selector && jQuery.expr.match.needsContext.test( selector ), + namespace: namespaces.join( "." 
) + }, handleObjIn ); + + // Init the event handler queue if we're the first + if ( !( handlers = events[ type ] ) ) { + handlers = events[ type ] = []; + handlers.delegateCount = 0; + + // Only use addEventListener if the special events handler returns false + if ( !special.setup || + special.setup.call( elem, data, namespaces, eventHandle ) === false ) { + + if ( elem.addEventListener ) { + elem.addEventListener( type, eventHandle ); + } + } + } + + if ( special.add ) { + special.add.call( elem, handleObj ); + + if ( !handleObj.handler.guid ) { + handleObj.handler.guid = handler.guid; + } + } + + // Add to the element's handler list, delegates in front + if ( selector ) { + handlers.splice( handlers.delegateCount++, 0, handleObj ); + } else { + handlers.push( handleObj ); + } + + // Keep track of which events have ever been used, for event optimization + jQuery.event.global[ type ] = true; + } + + }, + + // Detach an event or set of events from an element + remove: function( elem, types, handler, selector, mappedTypes ) { + + var j, origCount, tmp, + events, t, handleObj, + special, handlers, type, namespaces, origType, + elemData = dataPriv.hasData( elem ) && dataPriv.get( elem ); + + if ( !elemData || !( events = elemData.events ) ) { + return; + } + + // Once for each type.namespace in types; type may be omitted + types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; + t = types.length; + while ( t-- ) { + tmp = rtypenamespace.exec( types[ t ] ) || []; + type = origType = tmp[ 1 ]; + namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); + + // Unbind all events (on this namespace, if provided) for the element + if ( !type ) { + for ( type in events ) { + jQuery.event.remove( elem, type + types[ t ], handler, selector, true ); + } + continue; + } + + special = jQuery.event.special[ type ] || {}; + type = ( selector ? 
special.delegateType : special.bindType ) || type; + handlers = events[ type ] || []; + tmp = tmp[ 2 ] && + new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ); + + // Remove matching events + origCount = j = handlers.length; + while ( j-- ) { + handleObj = handlers[ j ]; + + if ( ( mappedTypes || origType === handleObj.origType ) && + ( !handler || handler.guid === handleObj.guid ) && + ( !tmp || tmp.test( handleObj.namespace ) ) && + ( !selector || selector === handleObj.selector || + selector === "**" && handleObj.selector ) ) { + handlers.splice( j, 1 ); + + if ( handleObj.selector ) { + handlers.delegateCount--; + } + if ( special.remove ) { + special.remove.call( elem, handleObj ); + } + } + } + + // Remove generic event handler if we removed something and no more handlers exist + // (avoids potential for endless recursion during removal of special event handlers) + if ( origCount && !handlers.length ) { + if ( !special.teardown || + special.teardown.call( elem, namespaces, elemData.handle ) === false ) { + + jQuery.removeEvent( elem, type, elemData.handle ); + } + + delete events[ type ]; + } + } + + // Remove data and the expando if it's no longer used + if ( jQuery.isEmptyObject( events ) ) { + dataPriv.remove( elem, "handle events" ); + } + }, + + dispatch: function( nativeEvent ) { + + // Make a writable jQuery.Event from the native event object + var event = jQuery.event.fix( nativeEvent ); + + var i, j, ret, matched, handleObj, handlerQueue, + args = new Array( arguments.length ), + handlers = ( dataPriv.get( this, "events" ) || {} )[ event.type ] || [], + special = jQuery.event.special[ event.type ] || {}; + + // Use the fix-ed jQuery.Event rather than the (read-only) native event + args[ 0 ] = event; + + for ( i = 1; i < arguments.length; i++ ) { + args[ i ] = arguments[ i ]; + } + + event.delegateTarget = this; + + // Call the preDispatch hook for the mapped type, and let it bail if desired + if ( special.preDispatch && 
special.preDispatch.call( this, event ) === false ) { + return; + } + + // Determine handlers + handlerQueue = jQuery.event.handlers.call( this, event, handlers ); + + // Run delegates first; they may want to stop propagation beneath us + i = 0; + while ( ( matched = handlerQueue[ i++ ] ) && !event.isPropagationStopped() ) { + event.currentTarget = matched.elem; + + j = 0; + while ( ( handleObj = matched.handlers[ j++ ] ) && + !event.isImmediatePropagationStopped() ) { + + // Triggered event must either 1) have no namespace, or 2) have namespace(s) + // a subset or equal to those in the bound event (both can have no namespace). + if ( !event.rnamespace || event.rnamespace.test( handleObj.namespace ) ) { + + event.handleObj = handleObj; + event.data = handleObj.data; + + ret = ( ( jQuery.event.special[ handleObj.origType ] || {} ).handle || + handleObj.handler ).apply( matched.elem, args ); + + if ( ret !== undefined ) { + if ( ( event.result = ret ) === false ) { + event.preventDefault(); + event.stopPropagation(); + } + } + } + } + } + + // Call the postDispatch hook for the mapped type + if ( special.postDispatch ) { + special.postDispatch.call( this, event ); + } + + return event.result; + }, + + handlers: function( event, handlers ) { + var i, handleObj, sel, matchedHandlers, matchedSelectors, + handlerQueue = [], + delegateCount = handlers.delegateCount, + cur = event.target; + + // Find delegate handlers + if ( delegateCount && + + // Support: IE <=9 + // Black-hole SVG instance trees (trac-13180) + cur.nodeType && + + // Support: Firefox <=42 + // Suppress spec-violating clicks indicating a non-primary pointer button (trac-3861) + // https://www.w3.org/TR/DOM-Level-3-Events/#event-type-click + // Support: IE 11 only + // ...but not arrow key "clicks" of radio inputs, which can have `button` -1 (gh-2343) + !( event.type === "click" && event.button >= 1 ) ) { + + for ( ; cur !== this; cur = cur.parentNode || this ) { + + // Don't check non-elements (#13208) + 
// Don't process clicks on disabled elements (#6911, #8165, #11382, #11764) + if ( cur.nodeType === 1 && !( event.type === "click" && cur.disabled === true ) ) { + matchedHandlers = []; + matchedSelectors = {}; + for ( i = 0; i < delegateCount; i++ ) { + handleObj = handlers[ i ]; + + // Don't conflict with Object.prototype properties (#13203) + sel = handleObj.selector + " "; + + if ( matchedSelectors[ sel ] === undefined ) { + matchedSelectors[ sel ] = handleObj.needsContext ? + jQuery( sel, this ).index( cur ) > -1 : + jQuery.find( sel, this, null, [ cur ] ).length; + } + if ( matchedSelectors[ sel ] ) { + matchedHandlers.push( handleObj ); + } + } + if ( matchedHandlers.length ) { + handlerQueue.push( { elem: cur, handlers: matchedHandlers } ); + } + } + } + } + + // Add the remaining (directly-bound) handlers + cur = this; + if ( delegateCount < handlers.length ) { + handlerQueue.push( { elem: cur, handlers: handlers.slice( delegateCount ) } ); + } + + return handlerQueue; + }, + + addProp: function( name, hook ) { + Object.defineProperty( jQuery.Event.prototype, name, { + enumerable: true, + configurable: true, + + get: jQuery.isFunction( hook ) ? + function() { + if ( this.originalEvent ) { + return hook( this.originalEvent ); + } + } : + function() { + if ( this.originalEvent ) { + return this.originalEvent[ name ]; + } + }, + + set: function( value ) { + Object.defineProperty( this, name, { + enumerable: true, + configurable: true, + writable: true, + value: value + } ); + } + } ); + }, + + fix: function( originalEvent ) { + return originalEvent[ jQuery.expando ] ? 
+ originalEvent : + new jQuery.Event( originalEvent ); + }, + + special: { + load: { + + // Prevent triggered image.load events from bubbling to window.load + noBubble: true + }, + focus: { + + // Fire native event if possible so blur/focus sequence is correct + trigger: function() { + if ( this !== safeActiveElement() && this.focus ) { + this.focus(); + return false; + } + }, + delegateType: "focusin" + }, + blur: { + trigger: function() { + if ( this === safeActiveElement() && this.blur ) { + this.blur(); + return false; + } + }, + delegateType: "focusout" + }, + click: { + + // For checkbox, fire native event so checked state will be right + trigger: function() { + if ( this.type === "checkbox" && this.click && nodeName( this, "input" ) ) { + this.click(); + return false; + } + }, + + // For cross-browser consistency, don't fire native .click() on links + _default: function( event ) { + return nodeName( event.target, "a" ); + } + }, + + beforeunload: { + postDispatch: function( event ) { + + // Support: Firefox 20+ + // Firefox doesn't alert if the returnValue field is not set. + if ( event.result !== undefined && event.originalEvent ) { + event.originalEvent.returnValue = event.result; + } + } + } + } +}; + +jQuery.removeEvent = function( elem, type, handle ) { + + // This "if" is needed for plain objects + if ( elem.removeEventListener ) { + elem.removeEventListener( type, handle ); + } +}; + +jQuery.Event = function( src, props ) { + + // Allow instantiation without the 'new' keyword + if ( !( this instanceof jQuery.Event ) ) { + return new jQuery.Event( src, props ); + } + + // Event object + if ( src && src.type ) { + this.originalEvent = src; + this.type = src.type; + + // Events bubbling up the document may have been marked as prevented + // by a handler lower down the tree; reflect the correct value. + this.isDefaultPrevented = src.defaultPrevented || + src.defaultPrevented === undefined && + + // Support: Android <=2.3 only + src.returnValue === false ? 
+ returnTrue : + returnFalse; + + // Create target properties + // Support: Safari <=6 - 7 only + // Target should not be a text node (#504, #13143) + this.target = ( src.target && src.target.nodeType === 3 ) ? + src.target.parentNode : + src.target; + + this.currentTarget = src.currentTarget; + this.relatedTarget = src.relatedTarget; + + // Event type + } else { + this.type = src; + } + + // Put explicitly provided properties onto the event object + if ( props ) { + jQuery.extend( this, props ); + } + + // Create a timestamp if incoming event doesn't have one + this.timeStamp = src && src.timeStamp || jQuery.now(); + + // Mark it as fixed + this[ jQuery.expando ] = true; +}; + +// jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding +// https://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html +jQuery.Event.prototype = { + constructor: jQuery.Event, + isDefaultPrevented: returnFalse, + isPropagationStopped: returnFalse, + isImmediatePropagationStopped: returnFalse, + isSimulated: false, + + preventDefault: function() { + var e = this.originalEvent; + + this.isDefaultPrevented = returnTrue; + + if ( e && !this.isSimulated ) { + e.preventDefault(); + } + }, + stopPropagation: function() { + var e = this.originalEvent; + + this.isPropagationStopped = returnTrue; + + if ( e && !this.isSimulated ) { + e.stopPropagation(); + } + }, + stopImmediatePropagation: function() { + var e = this.originalEvent; + + this.isImmediatePropagationStopped = returnTrue; + + if ( e && !this.isSimulated ) { + e.stopImmediatePropagation(); + } + + this.stopPropagation(); + } +}; + +// Includes all common event props including KeyEvent and MouseEvent specific props +jQuery.each( { + altKey: true, + bubbles: true, + cancelable: true, + changedTouches: true, + ctrlKey: true, + detail: true, + eventPhase: true, + metaKey: true, + pageX: true, + pageY: true, + shiftKey: true, + view: true, + "char": true, + charCode: true, + key: true, + 
keyCode: true, + button: true, + buttons: true, + clientX: true, + clientY: true, + offsetX: true, + offsetY: true, + pointerId: true, + pointerType: true, + screenX: true, + screenY: true, + targetTouches: true, + toElement: true, + touches: true, + + which: function( event ) { + var button = event.button; + + // Add which for key events + if ( event.which == null && rkeyEvent.test( event.type ) ) { + return event.charCode != null ? event.charCode : event.keyCode; + } + + // Add which for click: 1 === left; 2 === middle; 3 === right + if ( !event.which && button !== undefined && rmouseEvent.test( event.type ) ) { + if ( button & 1 ) { + return 1; + } + + if ( button & 2 ) { + return 3; + } + + if ( button & 4 ) { + return 2; + } + + return 0; + } + + return event.which; + } +}, jQuery.event.addProp ); + +// Create mouseenter/leave events using mouseover/out and event-time checks +// so that event delegation works in jQuery. +// Do the same for pointerenter/pointerleave and pointerover/pointerout +// +// Support: Safari 7 only +// Safari sends mouseenter too often; see: +// https://bugs.chromium.org/p/chromium/issues/detail?id=470258 +// for the description of the bug (it existed in older Chrome versions as well). +jQuery.each( { + mouseenter: "mouseover", + mouseleave: "mouseout", + pointerenter: "pointerover", + pointerleave: "pointerout" +}, function( orig, fix ) { + jQuery.event.special[ orig ] = { + delegateType: fix, + bindType: fix, + + handle: function( event ) { + var ret, + target = this, + related = event.relatedTarget, + handleObj = event.handleObj; + + // For mouseenter/leave call the handler if related is outside the target. 
+ // NB: No relatedTarget if the mouse left/entered the browser window + if ( !related || ( related !== target && !jQuery.contains( target, related ) ) ) { + event.type = handleObj.origType; + ret = handleObj.handler.apply( this, arguments ); + event.type = fix; + } + return ret; + } + }; +} ); + +jQuery.fn.extend( { + + on: function( types, selector, data, fn ) { + return on( this, types, selector, data, fn ); + }, + one: function( types, selector, data, fn ) { + return on( this, types, selector, data, fn, 1 ); + }, + off: function( types, selector, fn ) { + var handleObj, type; + if ( types && types.preventDefault && types.handleObj ) { + + // ( event ) dispatched jQuery.Event + handleObj = types.handleObj; + jQuery( types.delegateTarget ).off( + handleObj.namespace ? + handleObj.origType + "." + handleObj.namespace : + handleObj.origType, + handleObj.selector, + handleObj.handler + ); + return this; + } + if ( typeof types === "object" ) { + + // ( types-object [, selector] ) + for ( type in types ) { + this.off( type, selector, types[ type ] ); + } + return this; + } + if ( selector === false || typeof selector === "function" ) { + + // ( types [, fn] ) + fn = selector; + selector = undefined; + } + if ( fn === false ) { + fn = returnFalse; + } + return this.each( function() { + jQuery.event.remove( this, types, fn, selector ); + } ); + } +} ); + + +var + + /* eslint-disable max-len */ + + // See https://github.com/eslint/eslint/issues/3229 + rxhtmlTag = /<(?!area|br|col|embed|hr|img|input|link|meta|param)(([a-z][^\/\0>\x20\t\r\n\f]*)[^>]*)\/>/gi, + + /* eslint-enable */ + + // Support: IE <=10 - 11, Edge 12 - 13 + // In IE/Edge using regex groups here causes severe slowdowns. + // See https://connect.microsoft.com/IE/feedback/details/1736512/ + rnoInnerhtml = /\s*$/g; + +// Prefer a tbody over its parent table for containing new rows +function manipulationTarget( elem, content ) { + if ( nodeName( elem, "table" ) && + nodeName( content.nodeType !== 11 ? 
content : content.firstChild, "tr" ) ) { + + return jQuery( ">tbody", elem )[ 0 ] || elem; + } + + return elem; +} + +// Replace/restore the type attribute of script elements for safe DOM manipulation +function disableScript( elem ) { + elem.type = ( elem.getAttribute( "type" ) !== null ) + "/" + elem.type; + return elem; +} +function restoreScript( elem ) { + var match = rscriptTypeMasked.exec( elem.type ); + + if ( match ) { + elem.type = match[ 1 ]; + } else { + elem.removeAttribute( "type" ); + } + + return elem; +} + +function cloneCopyEvent( src, dest ) { + var i, l, type, pdataOld, pdataCur, udataOld, udataCur, events; + + if ( dest.nodeType !== 1 ) { + return; + } + + // 1. Copy private data: events, handlers, etc. + if ( dataPriv.hasData( src ) ) { + pdataOld = dataPriv.access( src ); + pdataCur = dataPriv.set( dest, pdataOld ); + events = pdataOld.events; + + if ( events ) { + delete pdataCur.handle; + pdataCur.events = {}; + + for ( type in events ) { + for ( i = 0, l = events[ type ].length; i < l; i++ ) { + jQuery.event.add( dest, type, events[ type ][ i ] ); + } + } + } + } + + // 2. Copy user data + if ( dataUser.hasData( src ) ) { + udataOld = dataUser.access( src ); + udataCur = jQuery.extend( {}, udataOld ); + + dataUser.set( dest, udataCur ); + } +} + +// Fix IE bugs, see support tests +function fixInput( src, dest ) { + var nodeName = dest.nodeName.toLowerCase(); + + // Fails to persist the checked state of a cloned checkbox or radio button. 
+ if ( nodeName === "input" && rcheckableType.test( src.type ) ) { + dest.checked = src.checked; + + // Fails to return the selected option to the default selected state when cloning options + } else if ( nodeName === "input" || nodeName === "textarea" ) { + dest.defaultValue = src.defaultValue; + } +} + +function domManip( collection, args, callback, ignored ) { + + // Flatten any nested arrays + args = concat.apply( [], args ); + + var fragment, first, scripts, hasScripts, node, doc, + i = 0, + l = collection.length, + iNoClone = l - 1, + value = args[ 0 ], + isFunction = jQuery.isFunction( value ); + + // We can't cloneNode fragments that contain checked, in WebKit + if ( isFunction || + ( l > 1 && typeof value === "string" && + !support.checkClone && rchecked.test( value ) ) ) { + return collection.each( function( index ) { + var self = collection.eq( index ); + if ( isFunction ) { + args[ 0 ] = value.call( this, index, self.html() ); + } + domManip( self, args, callback, ignored ); + } ); + } + + if ( l ) { + fragment = buildFragment( args, collection[ 0 ].ownerDocument, false, collection, ignored ); + first = fragment.firstChild; + + if ( fragment.childNodes.length === 1 ) { + fragment = first; + } + + // Require either new content or an interest in ignored elements to invoke the callback + if ( first || ignored ) { + scripts = jQuery.map( getAll( fragment, "script" ), disableScript ); + hasScripts = scripts.length; + + // Use the original fragment for the last item + // instead of the first because it can end up + // being emptied incorrectly in certain situations (#8070). 
+ for ( ; i < l; i++ ) { + node = fragment; + + if ( i !== iNoClone ) { + node = jQuery.clone( node, true, true ); + + // Keep references to cloned scripts for later restoration + if ( hasScripts ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( scripts, getAll( node, "script" ) ); + } + } + + callback.call( collection[ i ], node, i ); + } + + if ( hasScripts ) { + doc = scripts[ scripts.length - 1 ].ownerDocument; + + // Reenable scripts + jQuery.map( scripts, restoreScript ); + + // Evaluate executable scripts on first document insertion + for ( i = 0; i < hasScripts; i++ ) { + node = scripts[ i ]; + if ( rscriptType.test( node.type || "" ) && + !dataPriv.access( node, "globalEval" ) && + jQuery.contains( doc, node ) ) { + + if ( node.src ) { + + // Optional AJAX dependency, but won't run scripts if not present + if ( jQuery._evalUrl ) { + jQuery._evalUrl( node.src ); + } + } else { + DOMEval( node.textContent.replace( rcleanScript, "" ), doc ); + } + } + } + } + } + } + + return collection; +} + +function remove( elem, selector, keepData ) { + var node, + nodes = selector ? 
jQuery.filter( selector, elem ) : elem, + i = 0; + + for ( ; ( node = nodes[ i ] ) != null; i++ ) { + if ( !keepData && node.nodeType === 1 ) { + jQuery.cleanData( getAll( node ) ); + } + + if ( node.parentNode ) { + if ( keepData && jQuery.contains( node.ownerDocument, node ) ) { + setGlobalEval( getAll( node, "script" ) ); + } + node.parentNode.removeChild( node ); + } + } + + return elem; +} + +jQuery.extend( { + htmlPrefilter: function( html ) { + return html.replace( rxhtmlTag, "<$1>" ); + }, + + clone: function( elem, dataAndEvents, deepDataAndEvents ) { + var i, l, srcElements, destElements, + clone = elem.cloneNode( true ), + inPage = jQuery.contains( elem.ownerDocument, elem ); + + // Fix IE cloning issues + if ( !support.noCloneChecked && ( elem.nodeType === 1 || elem.nodeType === 11 ) && + !jQuery.isXMLDoc( elem ) ) { + + // We eschew Sizzle here for performance reasons: https://jsperf.com/getall-vs-sizzle/2 + destElements = getAll( clone ); + srcElements = getAll( elem ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + fixInput( srcElements[ i ], destElements[ i ] ); + } + } + + // Copy the events from the original to the clone + if ( dataAndEvents ) { + if ( deepDataAndEvents ) { + srcElements = srcElements || getAll( elem ); + destElements = destElements || getAll( clone ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + cloneCopyEvent( srcElements[ i ], destElements[ i ] ); + } + } else { + cloneCopyEvent( elem, clone ); + } + } + + // Preserve script evaluation history + destElements = getAll( clone, "script" ); + if ( destElements.length > 0 ) { + setGlobalEval( destElements, !inPage && getAll( elem, "script" ) ); + } + + // Return the cloned set + return clone; + }, + + cleanData: function( elems ) { + var data, elem, type, + special = jQuery.event.special, + i = 0; + + for ( ; ( elem = elems[ i ] ) !== undefined; i++ ) { + if ( acceptData( elem ) ) { + if ( ( data = elem[ dataPriv.expando ] ) ) { + if ( data.events ) { + for ( 
type in data.events ) { + if ( special[ type ] ) { + jQuery.event.remove( elem, type ); + + // This is a shortcut to avoid jQuery.event.remove's overhead + } else { + jQuery.removeEvent( elem, type, data.handle ); + } + } + } + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove + elem[ dataPriv.expando ] = undefined; + } + if ( elem[ dataUser.expando ] ) { + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove + elem[ dataUser.expando ] = undefined; + } + } + } + } +} ); + +jQuery.fn.extend( { + detach: function( selector ) { + return remove( this, selector, true ); + }, + + remove: function( selector ) { + return remove( this, selector ); + }, + + text: function( value ) { + return access( this, function( value ) { + return value === undefined ? + jQuery.text( this ) : + this.empty().each( function() { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + this.textContent = value; + } + } ); + }, null, value, arguments.length ); + }, + + append: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.appendChild( elem ); + } + } ); + }, + + prepend: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.insertBefore( elem, target.firstChild ); + } + } ); + }, + + before: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this ); + } + } ); + }, + + after: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this.nextSibling ); + } + } ); + }, + + empty: function() { + var elem, + i = 0; + + for ( ; ( 
elem = this[ i ] ) != null; i++ ) { + if ( elem.nodeType === 1 ) { + + // Prevent memory leaks + jQuery.cleanData( getAll( elem, false ) ); + + // Remove any remaining nodes + elem.textContent = ""; + } + } + + return this; + }, + + clone: function( dataAndEvents, deepDataAndEvents ) { + dataAndEvents = dataAndEvents == null ? false : dataAndEvents; + deepDataAndEvents = deepDataAndEvents == null ? dataAndEvents : deepDataAndEvents; + + return this.map( function() { + return jQuery.clone( this, dataAndEvents, deepDataAndEvents ); + } ); + }, + + html: function( value ) { + return access( this, function( value ) { + var elem = this[ 0 ] || {}, + i = 0, + l = this.length; + + if ( value === undefined && elem.nodeType === 1 ) { + return elem.innerHTML; + } + + // See if we can take a shortcut and just use innerHTML + if ( typeof value === "string" && !rnoInnerhtml.test( value ) && + !wrapMap[ ( rtagName.exec( value ) || [ "", "" ] )[ 1 ].toLowerCase() ] ) { + + value = jQuery.htmlPrefilter( value ); + + try { + for ( ; i < l; i++ ) { + elem = this[ i ] || {}; + + // Remove element nodes and prevent memory leaks + if ( elem.nodeType === 1 ) { + jQuery.cleanData( getAll( elem, false ) ); + elem.innerHTML = value; + } + } + + elem = 0; + + // If using innerHTML throws an exception, use the fallback method + } catch ( e ) {} + } + + if ( elem ) { + this.empty().append( value ); + } + }, null, value, arguments.length ); + }, + + replaceWith: function() { + var ignored = []; + + // Make the changes, replacing each non-ignored context element with the new content + return domManip( this, arguments, function( elem ) { + var parent = this.parentNode; + + if ( jQuery.inArray( this, ignored ) < 0 ) { + jQuery.cleanData( getAll( this ) ); + if ( parent ) { + parent.replaceChild( elem, this ); + } + } + + // Force callback invocation + }, ignored ); + } +} ); + +jQuery.each( { + appendTo: "append", + prependTo: "prepend", + insertBefore: "before", + insertAfter: "after", + 
replaceAll: "replaceWith" +}, function( name, original ) { + jQuery.fn[ name ] = function( selector ) { + var elems, + ret = [], + insert = jQuery( selector ), + last = insert.length - 1, + i = 0; + + for ( ; i <= last; i++ ) { + elems = i === last ? this : this.clone( true ); + jQuery( insert[ i ] )[ original ]( elems ); + + // Support: Android <=4.0 only, PhantomJS 1 only + // .get() because push.apply(_, arraylike) throws on ancient WebKit + push.apply( ret, elems.get() ); + } + + return this.pushStack( ret ); + }; +} ); +var rmargin = ( /^margin/ ); + +var rnumnonpx = new RegExp( "^(" + pnum + ")(?!px)[a-z%]+$", "i" ); + +var getStyles = function( elem ) { + + // Support: IE <=11 only, Firefox <=30 (#15098, #14150) + // IE throws on elements created in popups + // FF meanwhile throws on frame elements through "defaultView.getComputedStyle" + var view = elem.ownerDocument.defaultView; + + if ( !view || !view.opener ) { + view = window; + } + + return view.getComputedStyle( elem ); + }; + + + +( function() { + + // Executing both pixelPosition & boxSizingReliable tests require only one layout + // so they're executed at the same time to save the second computation. 
+ function computeStyleTests() { + + // This is a singleton, we need to execute it only once + if ( !div ) { + return; + } + + div.style.cssText = + "box-sizing:border-box;" + + "position:relative;display:block;" + + "margin:auto;border:1px;padding:1px;" + + "top:1%;width:50%"; + div.innerHTML = ""; + documentElement.appendChild( container ); + + var divStyle = window.getComputedStyle( div ); + pixelPositionVal = divStyle.top !== "1%"; + + // Support: Android 4.0 - 4.3 only, Firefox <=3 - 44 + reliableMarginLeftVal = divStyle.marginLeft === "2px"; + boxSizingReliableVal = divStyle.width === "4px"; + + // Support: Android 4.0 - 4.3 only + // Some styles come back with percentage values, even though they shouldn't + div.style.marginRight = "50%"; + pixelMarginRightVal = divStyle.marginRight === "4px"; + + documentElement.removeChild( container ); + + // Nullify the div so it wouldn't be stored in the memory and + // it will also be a sign that checks already performed + div = null; + } + + var pixelPositionVal, boxSizingReliableVal, pixelMarginRightVal, reliableMarginLeftVal, + container = document.createElement( "div" ), + div = document.createElement( "div" ); + + // Finish early in limited (non-browser) environments + if ( !div.style ) { + return; + } + + // Support: IE <=9 - 11 only + // Style of cloned element affects source element cloned (#8908) + div.style.backgroundClip = "content-box"; + div.cloneNode( true ).style.backgroundClip = ""; + support.clearCloneStyle = div.style.backgroundClip === "content-box"; + + container.style.cssText = "border:0;width:8px;height:0;top:0;left:-9999px;" + + "padding:0;margin-top:1px;position:absolute"; + container.appendChild( div ); + + jQuery.extend( support, { + pixelPosition: function() { + computeStyleTests(); + return pixelPositionVal; + }, + boxSizingReliable: function() { + computeStyleTests(); + return boxSizingReliableVal; + }, + pixelMarginRight: function() { + computeStyleTests(); + return pixelMarginRightVal; + 
}, + reliableMarginLeft: function() { + computeStyleTests(); + return reliableMarginLeftVal; + } + } ); +} )(); + + +function curCSS( elem, name, computed ) { + var width, minWidth, maxWidth, ret, + + // Support: Firefox 51+ + // Retrieving style before computed somehow + // fixes an issue with getting wrong values + // on detached elements + style = elem.style; + + computed = computed || getStyles( elem ); + + // getPropertyValue is needed for: + // .css('filter') (IE 9 only, #12537) + // .css('--customProperty) (#3144) + if ( computed ) { + ret = computed.getPropertyValue( name ) || computed[ name ]; + + if ( ret === "" && !jQuery.contains( elem.ownerDocument, elem ) ) { + ret = jQuery.style( elem, name ); + } + + // A tribute to the "awesome hack by Dean Edwards" + // Android Browser returns percentage for some values, + // but width seems to be reliably pixels. + // This is against the CSSOM draft spec: + // https://drafts.csswg.org/cssom/#resolved-values + if ( !support.pixelMarginRight() && rnumnonpx.test( ret ) && rmargin.test( name ) ) { + + // Remember the original values + width = style.width; + minWidth = style.minWidth; + maxWidth = style.maxWidth; + + // Put in the new values to get a computed value out + style.minWidth = style.maxWidth = style.width = ret; + ret = computed.width; + + // Revert the changed values + style.width = width; + style.minWidth = minWidth; + style.maxWidth = maxWidth; + } + } + + return ret !== undefined ? + + // Support: IE <=9 - 11 only + // IE returns zIndex value as an integer. + ret + "" : + ret; +} + + +function addGetHookIf( conditionFn, hookFn ) { + + // Define the hook, we'll check on the first run if it's really needed. + return { + get: function() { + if ( conditionFn() ) { + + // Hook not needed (or it's not possible to use it due + // to missing dependency), remove it. + delete this.get; + return; + } + + // Hook needed; redefine it so that the support test is not executed again. 
+ return ( this.get = hookFn ).apply( this, arguments ); + } + }; +} + + +var + + // Swappable if display is none or starts with table + // except "table", "table-cell", or "table-caption" + // See here for display values: https://developer.mozilla.org/en-US/docs/CSS/display + rdisplayswap = /^(none|table(?!-c[ea]).+)/, + rcustomProp = /^--/, + cssShow = { position: "absolute", visibility: "hidden", display: "block" }, + cssNormalTransform = { + letterSpacing: "0", + fontWeight: "400" + }, + + cssPrefixes = [ "Webkit", "Moz", "ms" ], + emptyStyle = document.createElement( "div" ).style; + +// Return a css property mapped to a potentially vendor prefixed property +function vendorPropName( name ) { + + // Shortcut for names that are not vendor prefixed + if ( name in emptyStyle ) { + return name; + } + + // Check for vendor prefixed names + var capName = name[ 0 ].toUpperCase() + name.slice( 1 ), + i = cssPrefixes.length; + + while ( i-- ) { + name = cssPrefixes[ i ] + capName; + if ( name in emptyStyle ) { + return name; + } + } +} + +// Return a property mapped along what jQuery.cssProps suggests or to +// a vendor prefixed property. +function finalPropName( name ) { + var ret = jQuery.cssProps[ name ]; + if ( !ret ) { + ret = jQuery.cssProps[ name ] = vendorPropName( name ) || name; + } + return ret; +} + +function setPositiveNumber( elem, value, subtract ) { + + // Any relative (+/-) values have already been + // normalized at this point + var matches = rcssNum.exec( value ); + return matches ? + + // Guard against undefined "subtract", e.g., when used as in cssHooks + Math.max( 0, matches[ 2 ] - ( subtract || 0 ) ) + ( matches[ 3 ] || "px" ) : + value; +} + +function augmentWidthOrHeight( elem, name, extra, isBorderBox, styles ) { + var i, + val = 0; + + // If we already have the right measurement, avoid augmentation + if ( extra === ( isBorderBox ? 
"border" : "content" ) ) { + i = 4; + + // Otherwise initialize for horizontal or vertical properties + } else { + i = name === "width" ? 1 : 0; + } + + for ( ; i < 4; i += 2 ) { + + // Both box models exclude margin, so add it if we want it + if ( extra === "margin" ) { + val += jQuery.css( elem, extra + cssExpand[ i ], true, styles ); + } + + if ( isBorderBox ) { + + // border-box includes padding, so remove it if we want content + if ( extra === "content" ) { + val -= jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); + } + + // At this point, extra isn't border nor margin, so remove border + if ( extra !== "margin" ) { + val -= jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + } + } else { + + // At this point, extra isn't content, so add padding + val += jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); + + // At this point, extra isn't content nor padding, so add border + if ( extra !== "padding" ) { + val += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + } + } + } + + return val; +} + +function getWidthOrHeight( elem, name, extra ) { + + // Start with computed style + var valueIsBorderBox, + styles = getStyles( elem ), + val = curCSS( elem, name, styles ), + isBorderBox = jQuery.css( elem, "boxSizing", false, styles ) === "border-box"; + + // Computed unit is not pixels. Stop here and return. 
+ if ( rnumnonpx.test( val ) ) { + return val; + } + + // Check for style in case a browser which returns unreliable values + // for getComputedStyle silently falls back to the reliable elem.style + valueIsBorderBox = isBorderBox && + ( support.boxSizingReliable() || val === elem.style[ name ] ); + + // Fall back to offsetWidth/Height when value is "auto" + // This happens for inline elements with no explicit setting (gh-3571) + if ( val === "auto" ) { + val = elem[ "offset" + name[ 0 ].toUpperCase() + name.slice( 1 ) ]; + } + + // Normalize "", auto, and prepare for extra + val = parseFloat( val ) || 0; + + // Use the active box-sizing model to add/subtract irrelevant styles + return ( val + + augmentWidthOrHeight( + elem, + name, + extra || ( isBorderBox ? "border" : "content" ), + valueIsBorderBox, + styles + ) + ) + "px"; +} + +jQuery.extend( { + + // Add in style property hooks for overriding the default + // behavior of getting and setting a style property + cssHooks: { + opacity: { + get: function( elem, computed ) { + if ( computed ) { + + // We should always get a number back from opacity + var ret = curCSS( elem, "opacity" ); + return ret === "" ? 
"1" : ret; + } + } + } + }, + + // Don't automatically add "px" to these possibly-unitless properties + cssNumber: { + "animationIterationCount": true, + "columnCount": true, + "fillOpacity": true, + "flexGrow": true, + "flexShrink": true, + "fontWeight": true, + "lineHeight": true, + "opacity": true, + "order": true, + "orphans": true, + "widows": true, + "zIndex": true, + "zoom": true + }, + + // Add in properties whose names you wish to fix before + // setting or getting the value + cssProps: { + "float": "cssFloat" + }, + + // Get and set the style property on a DOM Node + style: function( elem, name, value, extra ) { + + // Don't set styles on text and comment nodes + if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) { + return; + } + + // Make sure that we're working with the right name + var ret, type, hooks, + origName = jQuery.camelCase( name ), + isCustomProp = rcustomProp.test( name ), + style = elem.style; + + // Make sure that we're working with the right name. We don't + // want to query the value if it is a CSS custom property + // since they are user-defined. + if ( !isCustomProp ) { + name = finalPropName( origName ); + } + + // Gets hook for the prefixed version, then unprefixed version + hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; + + // Check if we're setting a value + if ( value !== undefined ) { + type = typeof value; + + // Convert "+=" or "-=" to relative numbers (#7345) + if ( type === "string" && ( ret = rcssNum.exec( value ) ) && ret[ 1 ] ) { + value = adjustCSS( elem, name, ret ); + + // Fixes bug #9237 + type = "number"; + } + + // Make sure that null and NaN values aren't set (#7116) + if ( value == null || value !== value ) { + return; + } + + // If a number was passed in, add the unit (except for certain CSS properties) + if ( type === "number" ) { + value += ret && ret[ 3 ] || ( jQuery.cssNumber[ origName ] ? 
"" : "px" ); + } + + // background-* props affect original clone's values + if ( !support.clearCloneStyle && value === "" && name.indexOf( "background" ) === 0 ) { + style[ name ] = "inherit"; + } + + // If a hook was provided, use that value, otherwise just set the specified value + if ( !hooks || !( "set" in hooks ) || + ( value = hooks.set( elem, value, extra ) ) !== undefined ) { + + if ( isCustomProp ) { + style.setProperty( name, value ); + } else { + style[ name ] = value; + } + } + + } else { + + // If a hook was provided get the non-computed value from there + if ( hooks && "get" in hooks && + ( ret = hooks.get( elem, false, extra ) ) !== undefined ) { + + return ret; + } + + // Otherwise just get the value from the style object + return style[ name ]; + } + }, + + css: function( elem, name, extra, styles ) { + var val, num, hooks, + origName = jQuery.camelCase( name ), + isCustomProp = rcustomProp.test( name ); + + // Make sure that we're working with the right name. We don't + // want to modify the value if it is a CSS custom property + // since they are user-defined. + if ( !isCustomProp ) { + name = finalPropName( origName ); + } + + // Try prefixed name followed by the unprefixed name + hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; + + // If a hook was provided get the computed value from there + if ( hooks && "get" in hooks ) { + val = hooks.get( elem, true, extra ); + } + + // Otherwise, if a way to get the computed value exists, use that + if ( val === undefined ) { + val = curCSS( elem, name, styles ); + } + + // Convert "normal" to computed value + if ( val === "normal" && name in cssNormalTransform ) { + val = cssNormalTransform[ name ]; + } + + // Make numeric if forced or a qualifier was provided and val looks numeric + if ( extra === "" || extra ) { + num = parseFloat( val ); + return extra === true || isFinite( num ) ? 
num || 0 : val; + } + + return val; + } +} ); + +jQuery.each( [ "height", "width" ], function( i, name ) { + jQuery.cssHooks[ name ] = { + get: function( elem, computed, extra ) { + if ( computed ) { + + // Certain elements can have dimension info if we invisibly show them + // but it must have a current display style that would benefit + return rdisplayswap.test( jQuery.css( elem, "display" ) ) && + + // Support: Safari 8+ + // Table columns in Safari have non-zero offsetWidth & zero + // getBoundingClientRect().width unless display is changed. + // Support: IE <=11 only + // Running getBoundingClientRect on a disconnected node + // in IE throws an error. + ( !elem.getClientRects().length || !elem.getBoundingClientRect().width ) ? + swap( elem, cssShow, function() { + return getWidthOrHeight( elem, name, extra ); + } ) : + getWidthOrHeight( elem, name, extra ); + } + }, + + set: function( elem, value, extra ) { + var matches, + styles = extra && getStyles( elem ), + subtract = extra && augmentWidthOrHeight( + elem, + name, + extra, + jQuery.css( elem, "boxSizing", false, styles ) === "border-box", + styles + ); + + // Convert to pixels if value adjustment is needed + if ( subtract && ( matches = rcssNum.exec( value ) ) && + ( matches[ 3 ] || "px" ) !== "px" ) { + + elem.style[ name ] = value; + value = jQuery.css( elem, name ); + } + + return setPositiveNumber( elem, value, subtract ); + } + }; +} ); + +jQuery.cssHooks.marginLeft = addGetHookIf( support.reliableMarginLeft, + function( elem, computed ) { + if ( computed ) { + return ( parseFloat( curCSS( elem, "marginLeft" ) ) || + elem.getBoundingClientRect().left - + swap( elem, { marginLeft: 0 }, function() { + return elem.getBoundingClientRect().left; + } ) + ) + "px"; + } + } +); + +// These hooks are used by animate to expand properties +jQuery.each( { + margin: "", + padding: "", + border: "Width" +}, function( prefix, suffix ) { + jQuery.cssHooks[ prefix + suffix ] = { + expand: function( value ) { + var i 
= 0, + expanded = {}, + + // Assumes a single number if not a string + parts = typeof value === "string" ? value.split( " " ) : [ value ]; + + for ( ; i < 4; i++ ) { + expanded[ prefix + cssExpand[ i ] + suffix ] = + parts[ i ] || parts[ i - 2 ] || parts[ 0 ]; + } + + return expanded; + } + }; + + if ( !rmargin.test( prefix ) ) { + jQuery.cssHooks[ prefix + suffix ].set = setPositiveNumber; + } +} ); + +jQuery.fn.extend( { + css: function( name, value ) { + return access( this, function( elem, name, value ) { + var styles, len, + map = {}, + i = 0; + + if ( Array.isArray( name ) ) { + styles = getStyles( elem ); + len = name.length; + + for ( ; i < len; i++ ) { + map[ name[ i ] ] = jQuery.css( elem, name[ i ], false, styles ); + } + + return map; + } + + return value !== undefined ? + jQuery.style( elem, name, value ) : + jQuery.css( elem, name ); + }, name, value, arguments.length > 1 ); + } +} ); + + +function Tween( elem, options, prop, end, easing ) { + return new Tween.prototype.init( elem, options, prop, end, easing ); +} +jQuery.Tween = Tween; + +Tween.prototype = { + constructor: Tween, + init: function( elem, options, prop, end, easing, unit ) { + this.elem = elem; + this.prop = prop; + this.easing = easing || jQuery.easing._default; + this.options = options; + this.start = this.now = this.cur(); + this.end = end; + this.unit = unit || ( jQuery.cssNumber[ prop ] ? "" : "px" ); + }, + cur: function() { + var hooks = Tween.propHooks[ this.prop ]; + + return hooks && hooks.get ? 
+ hooks.get( this ) : + Tween.propHooks._default.get( this ); + }, + run: function( percent ) { + var eased, + hooks = Tween.propHooks[ this.prop ]; + + if ( this.options.duration ) { + this.pos = eased = jQuery.easing[ this.easing ]( + percent, this.options.duration * percent, 0, 1, this.options.duration + ); + } else { + this.pos = eased = percent; + } + this.now = ( this.end - this.start ) * eased + this.start; + + if ( this.options.step ) { + this.options.step.call( this.elem, this.now, this ); + } + + if ( hooks && hooks.set ) { + hooks.set( this ); + } else { + Tween.propHooks._default.set( this ); + } + return this; + } +}; + +Tween.prototype.init.prototype = Tween.prototype; + +Tween.propHooks = { + _default: { + get: function( tween ) { + var result; + + // Use a property on the element directly when it is not a DOM element, + // or when there is no matching style property that exists. + if ( tween.elem.nodeType !== 1 || + tween.elem[ tween.prop ] != null && tween.elem.style[ tween.prop ] == null ) { + return tween.elem[ tween.prop ]; + } + + // Passing an empty string as a 3rd parameter to .css will automatically + // attempt a parseFloat and fallback to a string if the parse fails. + // Simple values such as "10px" are parsed to Float; + // complex values such as "rotate(1rad)" are returned as-is. + result = jQuery.css( tween.elem, tween.prop, "" ); + + // Empty strings, null, undefined and "auto" are converted to 0. + return !result || result === "auto" ? 0 : result; + }, + set: function( tween ) { + + // Use step hook for back compat. + // Use cssHook if its there. + // Use .style if available and use plain properties where available. 
+ if ( jQuery.fx.step[ tween.prop ] ) { + jQuery.fx.step[ tween.prop ]( tween ); + } else if ( tween.elem.nodeType === 1 && + ( tween.elem.style[ jQuery.cssProps[ tween.prop ] ] != null || + jQuery.cssHooks[ tween.prop ] ) ) { + jQuery.style( tween.elem, tween.prop, tween.now + tween.unit ); + } else { + tween.elem[ tween.prop ] = tween.now; + } + } + } +}; + +// Support: IE <=9 only +// Panic based approach to setting things on disconnected nodes +Tween.propHooks.scrollTop = Tween.propHooks.scrollLeft = { + set: function( tween ) { + if ( tween.elem.nodeType && tween.elem.parentNode ) { + tween.elem[ tween.prop ] = tween.now; + } + } +}; + +jQuery.easing = { + linear: function( p ) { + return p; + }, + swing: function( p ) { + return 0.5 - Math.cos( p * Math.PI ) / 2; + }, + _default: "swing" +}; + +jQuery.fx = Tween.prototype.init; + +// Back compat <1.8 extension point +jQuery.fx.step = {}; + + + + +var + fxNow, inProgress, + rfxtypes = /^(?:toggle|show|hide)$/, + rrun = /queueHooks$/; + +function schedule() { + if ( inProgress ) { + if ( document.hidden === false && window.requestAnimationFrame ) { + window.requestAnimationFrame( schedule ); + } else { + window.setTimeout( schedule, jQuery.fx.interval ); + } + + jQuery.fx.tick(); + } +} + +// Animations created synchronously will run synchronously +function createFxNow() { + window.setTimeout( function() { + fxNow = undefined; + } ); + return ( fxNow = jQuery.now() ); +} + +// Generate parameters to create a standard animation +function genFx( type, includeWidth ) { + var which, + i = 0, + attrs = { height: type }; + + // If we include width, step value is 1 to do all cssExpand values, + // otherwise step value is 2 to skip over Left and Right + includeWidth = includeWidth ? 
1 : 0; + for ( ; i < 4; i += 2 - includeWidth ) { + which = cssExpand[ i ]; + attrs[ "margin" + which ] = attrs[ "padding" + which ] = type; + } + + if ( includeWidth ) { + attrs.opacity = attrs.width = type; + } + + return attrs; +} + +function createTween( value, prop, animation ) { + var tween, + collection = ( Animation.tweeners[ prop ] || [] ).concat( Animation.tweeners[ "*" ] ), + index = 0, + length = collection.length; + for ( ; index < length; index++ ) { + if ( ( tween = collection[ index ].call( animation, prop, value ) ) ) { + + // We're done with this property + return tween; + } + } +} + +function defaultPrefilter( elem, props, opts ) { + var prop, value, toggle, hooks, oldfire, propTween, restoreDisplay, display, + isBox = "width" in props || "height" in props, + anim = this, + orig = {}, + style = elem.style, + hidden = elem.nodeType && isHiddenWithinTree( elem ), + dataShow = dataPriv.get( elem, "fxshow" ); + + // Queue-skipping animations hijack the fx hooks + if ( !opts.queue ) { + hooks = jQuery._queueHooks( elem, "fx" ); + if ( hooks.unqueued == null ) { + hooks.unqueued = 0; + oldfire = hooks.empty.fire; + hooks.empty.fire = function() { + if ( !hooks.unqueued ) { + oldfire(); + } + }; + } + hooks.unqueued++; + + anim.always( function() { + + // Ensure the complete handler is called before this completes + anim.always( function() { + hooks.unqueued--; + if ( !jQuery.queue( elem, "fx" ).length ) { + hooks.empty.fire(); + } + } ); + } ); + } + + // Detect show/hide animations + for ( prop in props ) { + value = props[ prop ]; + if ( rfxtypes.test( value ) ) { + delete props[ prop ]; + toggle = toggle || value === "toggle"; + if ( value === ( hidden ? 
"hide" : "show" ) ) { + + // Pretend to be hidden if this is a "show" and + // there is still data from a stopped show/hide + if ( value === "show" && dataShow && dataShow[ prop ] !== undefined ) { + hidden = true; + + // Ignore all other no-op show/hide data + } else { + continue; + } + } + orig[ prop ] = dataShow && dataShow[ prop ] || jQuery.style( elem, prop ); + } + } + + // Bail out if this is a no-op like .hide().hide() + propTween = !jQuery.isEmptyObject( props ); + if ( !propTween && jQuery.isEmptyObject( orig ) ) { + return; + } + + // Restrict "overflow" and "display" styles during box animations + if ( isBox && elem.nodeType === 1 ) { + + // Support: IE <=9 - 11, Edge 12 - 13 + // Record all 3 overflow attributes because IE does not infer the shorthand + // from identically-valued overflowX and overflowY + opts.overflow = [ style.overflow, style.overflowX, style.overflowY ]; + + // Identify a display type, preferring old show/hide data over the CSS cascade + restoreDisplay = dataShow && dataShow.display; + if ( restoreDisplay == null ) { + restoreDisplay = dataPriv.get( elem, "display" ); + } + display = jQuery.css( elem, "display" ); + if ( display === "none" ) { + if ( restoreDisplay ) { + display = restoreDisplay; + } else { + + // Get nonempty value(s) by temporarily forcing visibility + showHide( [ elem ], true ); + restoreDisplay = elem.style.display || restoreDisplay; + display = jQuery.css( elem, "display" ); + showHide( [ elem ] ); + } + } + + // Animate inline elements as inline-block + if ( display === "inline" || display === "inline-block" && restoreDisplay != null ) { + if ( jQuery.css( elem, "float" ) === "none" ) { + + // Restore the original display value at the end of pure show/hide animations + if ( !propTween ) { + anim.done( function() { + style.display = restoreDisplay; + } ); + if ( restoreDisplay == null ) { + display = style.display; + restoreDisplay = display === "none" ? 
"" : display; + } + } + style.display = "inline-block"; + } + } + } + + if ( opts.overflow ) { + style.overflow = "hidden"; + anim.always( function() { + style.overflow = opts.overflow[ 0 ]; + style.overflowX = opts.overflow[ 1 ]; + style.overflowY = opts.overflow[ 2 ]; + } ); + } + + // Implement show/hide animations + propTween = false; + for ( prop in orig ) { + + // General show/hide setup for this element animation + if ( !propTween ) { + if ( dataShow ) { + if ( "hidden" in dataShow ) { + hidden = dataShow.hidden; + } + } else { + dataShow = dataPriv.access( elem, "fxshow", { display: restoreDisplay } ); + } + + // Store hidden/visible for toggle so `.stop().toggle()` "reverses" + if ( toggle ) { + dataShow.hidden = !hidden; + } + + // Show elements before animating them + if ( hidden ) { + showHide( [ elem ], true ); + } + + /* eslint-disable no-loop-func */ + + anim.done( function() { + + /* eslint-enable no-loop-func */ + + // The final step of a "hide" animation is actually hiding the element + if ( !hidden ) { + showHide( [ elem ] ); + } + dataPriv.remove( elem, "fxshow" ); + for ( prop in orig ) { + jQuery.style( elem, prop, orig[ prop ] ); + } + } ); + } + + // Per-property setup + propTween = createTween( hidden ? 
dataShow[ prop ] : 0, prop, anim ); + if ( !( prop in dataShow ) ) { + dataShow[ prop ] = propTween.start; + if ( hidden ) { + propTween.end = propTween.start; + propTween.start = 0; + } + } + } +} + +function propFilter( props, specialEasing ) { + var index, name, easing, value, hooks; + + // camelCase, specialEasing and expand cssHook pass + for ( index in props ) { + name = jQuery.camelCase( index ); + easing = specialEasing[ name ]; + value = props[ index ]; + if ( Array.isArray( value ) ) { + easing = value[ 1 ]; + value = props[ index ] = value[ 0 ]; + } + + if ( index !== name ) { + props[ name ] = value; + delete props[ index ]; + } + + hooks = jQuery.cssHooks[ name ]; + if ( hooks && "expand" in hooks ) { + value = hooks.expand( value ); + delete props[ name ]; + + // Not quite $.extend, this won't overwrite existing keys. + // Reusing 'index' because we have the correct "name" + for ( index in value ) { + if ( !( index in props ) ) { + props[ index ] = value[ index ]; + specialEasing[ index ] = easing; + } + } + } else { + specialEasing[ name ] = easing; + } + } +} + +function Animation( elem, properties, options ) { + var result, + stopped, + index = 0, + length = Animation.prefilters.length, + deferred = jQuery.Deferred().always( function() { + + // Don't match elem in the :animated selector + delete tick.elem; + } ), + tick = function() { + if ( stopped ) { + return false; + } + var currentTime = fxNow || createFxNow(), + remaining = Math.max( 0, animation.startTime + animation.duration - currentTime ), + + // Support: Android 2.3 only + // Archaic crash bug won't allow us to use `1 - ( 0.5 || 0 )` (#12497) + temp = remaining / animation.duration || 0, + percent = 1 - temp, + index = 0, + length = animation.tweens.length; + + for ( ; index < length; index++ ) { + animation.tweens[ index ].run( percent ); + } + + deferred.notifyWith( elem, [ animation, percent, remaining ] ); + + // If there's more to do, yield + if ( percent < 1 && length ) { + return 
remaining; + } + + // If this was an empty animation, synthesize a final progress notification + if ( !length ) { + deferred.notifyWith( elem, [ animation, 1, 0 ] ); + } + + // Resolve the animation and report its conclusion + deferred.resolveWith( elem, [ animation ] ); + return false; + }, + animation = deferred.promise( { + elem: elem, + props: jQuery.extend( {}, properties ), + opts: jQuery.extend( true, { + specialEasing: {}, + easing: jQuery.easing._default + }, options ), + originalProperties: properties, + originalOptions: options, + startTime: fxNow || createFxNow(), + duration: options.duration, + tweens: [], + createTween: function( prop, end ) { + var tween = jQuery.Tween( elem, animation.opts, prop, end, + animation.opts.specialEasing[ prop ] || animation.opts.easing ); + animation.tweens.push( tween ); + return tween; + }, + stop: function( gotoEnd ) { + var index = 0, + + // If we are going to the end, we want to run all the tweens + // otherwise we skip this part + length = gotoEnd ? 
animation.tweens.length : 0; + if ( stopped ) { + return this; + } + stopped = true; + for ( ; index < length; index++ ) { + animation.tweens[ index ].run( 1 ); + } + + // Resolve when we played the last frame; otherwise, reject + if ( gotoEnd ) { + deferred.notifyWith( elem, [ animation, 1, 0 ] ); + deferred.resolveWith( elem, [ animation, gotoEnd ] ); + } else { + deferred.rejectWith( elem, [ animation, gotoEnd ] ); + } + return this; + } + } ), + props = animation.props; + + propFilter( props, animation.opts.specialEasing ); + + for ( ; index < length; index++ ) { + result = Animation.prefilters[ index ].call( animation, elem, props, animation.opts ); + if ( result ) { + if ( jQuery.isFunction( result.stop ) ) { + jQuery._queueHooks( animation.elem, animation.opts.queue ).stop = + jQuery.proxy( result.stop, result ); + } + return result; + } + } + + jQuery.map( props, createTween, animation ); + + if ( jQuery.isFunction( animation.opts.start ) ) { + animation.opts.start.call( elem, animation ); + } + + // Attach callbacks from options + animation + .progress( animation.opts.progress ) + .done( animation.opts.done, animation.opts.complete ) + .fail( animation.opts.fail ) + .always( animation.opts.always ); + + jQuery.fx.timer( + jQuery.extend( tick, { + elem: elem, + anim: animation, + queue: animation.opts.queue + } ) + ); + + return animation; +} + +jQuery.Animation = jQuery.extend( Animation, { + + tweeners: { + "*": [ function( prop, value ) { + var tween = this.createTween( prop, value ); + adjustCSS( tween.elem, prop, rcssNum.exec( value ), tween ); + return tween; + } ] + }, + + tweener: function( props, callback ) { + if ( jQuery.isFunction( props ) ) { + callback = props; + props = [ "*" ]; + } else { + props = props.match( rnothtmlwhite ); + } + + var prop, + index = 0, + length = props.length; + + for ( ; index < length; index++ ) { + prop = props[ index ]; + Animation.tweeners[ prop ] = Animation.tweeners[ prop ] || []; + Animation.tweeners[ prop 
].unshift( callback ); + } + }, + + prefilters: [ defaultPrefilter ], + + prefilter: function( callback, prepend ) { + if ( prepend ) { + Animation.prefilters.unshift( callback ); + } else { + Animation.prefilters.push( callback ); + } + } +} ); + +jQuery.speed = function( speed, easing, fn ) { + var opt = speed && typeof speed === "object" ? jQuery.extend( {}, speed ) : { + complete: fn || !fn && easing || + jQuery.isFunction( speed ) && speed, + duration: speed, + easing: fn && easing || easing && !jQuery.isFunction( easing ) && easing + }; + + // Go to the end state if fx are off + if ( jQuery.fx.off ) { + opt.duration = 0; + + } else { + if ( typeof opt.duration !== "number" ) { + if ( opt.duration in jQuery.fx.speeds ) { + opt.duration = jQuery.fx.speeds[ opt.duration ]; + + } else { + opt.duration = jQuery.fx.speeds._default; + } + } + } + + // Normalize opt.queue - true/undefined/null -> "fx" + if ( opt.queue == null || opt.queue === true ) { + opt.queue = "fx"; + } + + // Queueing + opt.old = opt.complete; + + opt.complete = function() { + if ( jQuery.isFunction( opt.old ) ) { + opt.old.call( this ); + } + + if ( opt.queue ) { + jQuery.dequeue( this, opt.queue ); + } + }; + + return opt; +}; + +jQuery.fn.extend( { + fadeTo: function( speed, to, easing, callback ) { + + // Show any hidden elements after setting opacity to 0 + return this.filter( isHiddenWithinTree ).css( "opacity", 0 ).show() + + // Animate to the value specified + .end().animate( { opacity: to }, speed, easing, callback ); + }, + animate: function( prop, speed, easing, callback ) { + var empty = jQuery.isEmptyObject( prop ), + optall = jQuery.speed( speed, easing, callback ), + doAnimation = function() { + + // Operate on a copy of prop so per-property easing won't be lost + var anim = Animation( this, jQuery.extend( {}, prop ), optall ); + + // Empty animations, or finishing resolves immediately + if ( empty || dataPriv.get( this, "finish" ) ) { + anim.stop( true ); + } + }; + 
doAnimation.finish = doAnimation; + + return empty || optall.queue === false ? + this.each( doAnimation ) : + this.queue( optall.queue, doAnimation ); + }, + stop: function( type, clearQueue, gotoEnd ) { + var stopQueue = function( hooks ) { + var stop = hooks.stop; + delete hooks.stop; + stop( gotoEnd ); + }; + + if ( typeof type !== "string" ) { + gotoEnd = clearQueue; + clearQueue = type; + type = undefined; + } + if ( clearQueue && type !== false ) { + this.queue( type || "fx", [] ); + } + + return this.each( function() { + var dequeue = true, + index = type != null && type + "queueHooks", + timers = jQuery.timers, + data = dataPriv.get( this ); + + if ( index ) { + if ( data[ index ] && data[ index ].stop ) { + stopQueue( data[ index ] ); + } + } else { + for ( index in data ) { + if ( data[ index ] && data[ index ].stop && rrun.test( index ) ) { + stopQueue( data[ index ] ); + } + } + } + + for ( index = timers.length; index--; ) { + if ( timers[ index ].elem === this && + ( type == null || timers[ index ].queue === type ) ) { + + timers[ index ].anim.stop( gotoEnd ); + dequeue = false; + timers.splice( index, 1 ); + } + } + + // Start the next in the queue if the last step wasn't forced. + // Timers currently will call their complete callbacks, which + // will dequeue but only if they were gotoEnd. + if ( dequeue || !gotoEnd ) { + jQuery.dequeue( this, type ); + } + } ); + }, + finish: function( type ) { + if ( type !== false ) { + type = type || "fx"; + } + return this.each( function() { + var index, + data = dataPriv.get( this ), + queue = data[ type + "queue" ], + hooks = data[ type + "queueHooks" ], + timers = jQuery.timers, + length = queue ? 
queue.length : 0; + + // Enable finishing flag on private data + data.finish = true; + + // Empty the queue first + jQuery.queue( this, type, [] ); + + if ( hooks && hooks.stop ) { + hooks.stop.call( this, true ); + } + + // Look for any active animations, and finish them + for ( index = timers.length; index--; ) { + if ( timers[ index ].elem === this && timers[ index ].queue === type ) { + timers[ index ].anim.stop( true ); + timers.splice( index, 1 ); + } + } + + // Look for any animations in the old queue and finish them + for ( index = 0; index < length; index++ ) { + if ( queue[ index ] && queue[ index ].finish ) { + queue[ index ].finish.call( this ); + } + } + + // Turn off finishing flag + delete data.finish; + } ); + } +} ); + +jQuery.each( [ "toggle", "show", "hide" ], function( i, name ) { + var cssFn = jQuery.fn[ name ]; + jQuery.fn[ name ] = function( speed, easing, callback ) { + return speed == null || typeof speed === "boolean" ? + cssFn.apply( this, arguments ) : + this.animate( genFx( name, true ), speed, easing, callback ); + }; +} ); + +// Generate shortcuts for custom animations +jQuery.each( { + slideDown: genFx( "show" ), + slideUp: genFx( "hide" ), + slideToggle: genFx( "toggle" ), + fadeIn: { opacity: "show" }, + fadeOut: { opacity: "hide" }, + fadeToggle: { opacity: "toggle" } +}, function( name, props ) { + jQuery.fn[ name ] = function( speed, easing, callback ) { + return this.animate( props, speed, easing, callback ); + }; +} ); + +jQuery.timers = []; +jQuery.fx.tick = function() { + var timer, + i = 0, + timers = jQuery.timers; + + fxNow = jQuery.now(); + + for ( ; i < timers.length; i++ ) { + timer = timers[ i ]; + + // Run the timer and safely remove it when done (allowing for external removal) + if ( !timer() && timers[ i ] === timer ) { + timers.splice( i--, 1 ); + } + } + + if ( !timers.length ) { + jQuery.fx.stop(); + } + fxNow = undefined; +}; + +jQuery.fx.timer = function( timer ) { + jQuery.timers.push( timer ); + 
jQuery.fx.start(); +}; + +jQuery.fx.interval = 13; +jQuery.fx.start = function() { + if ( inProgress ) { + return; + } + + inProgress = true; + schedule(); +}; + +jQuery.fx.stop = function() { + inProgress = null; +}; + +jQuery.fx.speeds = { + slow: 600, + fast: 200, + + // Default speed + _default: 400 +}; + + +// Based off of the plugin by Clint Helfers, with permission. +// https://web.archive.org/web/20100324014747/http://blindsignals.com/index.php/2009/07/jquery-delay/ +jQuery.fn.delay = function( time, type ) { + time = jQuery.fx ? jQuery.fx.speeds[ time ] || time : time; + type = type || "fx"; + + return this.queue( type, function( next, hooks ) { + var timeout = window.setTimeout( next, time ); + hooks.stop = function() { + window.clearTimeout( timeout ); + }; + } ); +}; + + +( function() { + var input = document.createElement( "input" ), + select = document.createElement( "select" ), + opt = select.appendChild( document.createElement( "option" ) ); + + input.type = "checkbox"; + + // Support: Android <=4.3 only + // Default value for a checkbox should be "on" + support.checkOn = input.value !== ""; + + // Support: IE <=11 only + // Must access selectedIndex to make default options select + support.optSelected = opt.selected; + + // Support: IE <=11 only + // An input loses its value after becoming a radio + input = document.createElement( "input" ); + input.value = "t"; + input.type = "radio"; + support.radioValue = input.value === "t"; +} )(); + + +var boolHook, + attrHandle = jQuery.expr.attrHandle; + +jQuery.fn.extend( { + attr: function( name, value ) { + return access( this, jQuery.attr, name, value, arguments.length > 1 ); + }, + + removeAttr: function( name ) { + return this.each( function() { + jQuery.removeAttr( this, name ); + } ); + } +} ); + +jQuery.extend( { + attr: function( elem, name, value ) { + var ret, hooks, + nType = elem.nodeType; + + // Don't get/set attributes on text, comment and attribute nodes + if ( nType === 3 || nType === 8 || 
nType === 2 ) { + return; + } + + // Fallback to prop when attributes are not supported + if ( typeof elem.getAttribute === "undefined" ) { + return jQuery.prop( elem, name, value ); + } + + // Attribute hooks are determined by the lowercase version + // Grab necessary hook if one is defined + if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { + hooks = jQuery.attrHooks[ name.toLowerCase() ] || + ( jQuery.expr.match.bool.test( name ) ? boolHook : undefined ); + } + + if ( value !== undefined ) { + if ( value === null ) { + jQuery.removeAttr( elem, name ); + return; + } + + if ( hooks && "set" in hooks && + ( ret = hooks.set( elem, value, name ) ) !== undefined ) { + return ret; + } + + elem.setAttribute( name, value + "" ); + return value; + } + + if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { + return ret; + } + + ret = jQuery.find.attr( elem, name ); + + // Non-existent attributes return null, we normalize to undefined + return ret == null ? undefined : ret; + }, + + attrHooks: { + type: { + set: function( elem, value ) { + if ( !support.radioValue && value === "radio" && + nodeName( elem, "input" ) ) { + var val = elem.value; + elem.setAttribute( "type", value ); + if ( val ) { + elem.value = val; + } + return value; + } + } + } + }, + + removeAttr: function( elem, value ) { + var name, + i = 0, + + // Attribute names can contain non-HTML whitespace characters + // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 + attrNames = value && value.match( rnothtmlwhite ); + + if ( attrNames && elem.nodeType === 1 ) { + while ( ( name = attrNames[ i++ ] ) ) { + elem.removeAttribute( name ); + } + } + } +} ); + +// Hooks for boolean attributes +boolHook = { + set: function( elem, value, name ) { + if ( value === false ) { + + // Remove boolean attributes when set to false + jQuery.removeAttr( elem, name ); + } else { + elem.setAttribute( name, name ); + } + return name; + } +}; + +jQuery.each( 
jQuery.expr.match.bool.source.match( /\w+/g ), function( i, name ) { + var getter = attrHandle[ name ] || jQuery.find.attr; + + attrHandle[ name ] = function( elem, name, isXML ) { + var ret, handle, + lowercaseName = name.toLowerCase(); + + if ( !isXML ) { + + // Avoid an infinite loop by temporarily removing this function from the getter + handle = attrHandle[ lowercaseName ]; + attrHandle[ lowercaseName ] = ret; + ret = getter( elem, name, isXML ) != null ? + lowercaseName : + null; + attrHandle[ lowercaseName ] = handle; + } + return ret; + }; +} ); + + + + +var rfocusable = /^(?:input|select|textarea|button)$/i, + rclickable = /^(?:a|area)$/i; + +jQuery.fn.extend( { + prop: function( name, value ) { + return access( this, jQuery.prop, name, value, arguments.length > 1 ); + }, + + removeProp: function( name ) { + return this.each( function() { + delete this[ jQuery.propFix[ name ] || name ]; + } ); + } +} ); + +jQuery.extend( { + prop: function( elem, name, value ) { + var ret, hooks, + nType = elem.nodeType; + + // Don't get/set properties on text, comment and attribute nodes + if ( nType === 3 || nType === 8 || nType === 2 ) { + return; + } + + if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { + + // Fix name and attach hooks + name = jQuery.propFix[ name ] || name; + hooks = jQuery.propHooks[ name ]; + } + + if ( value !== undefined ) { + if ( hooks && "set" in hooks && + ( ret = hooks.set( elem, value, name ) ) !== undefined ) { + return ret; + } + + return ( elem[ name ] = value ); + } + + if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { + return ret; + } + + return elem[ name ]; + }, + + propHooks: { + tabIndex: { + get: function( elem ) { + + // Support: IE <=9 - 11 only + // elem.tabIndex doesn't always return the + // correct value when it hasn't been explicitly set + // https://web.archive.org/web/20141116233347/http://fluidproject.org/blog/2008/01/09/getting-setting-and-removing-tabindex-values-with-javascript/ + // Use 
proper attribute retrieval(#12072) + var tabindex = jQuery.find.attr( elem, "tabindex" ); + + if ( tabindex ) { + return parseInt( tabindex, 10 ); + } + + if ( + rfocusable.test( elem.nodeName ) || + rclickable.test( elem.nodeName ) && + elem.href + ) { + return 0; + } + + return -1; + } + } + }, + + propFix: { + "for": "htmlFor", + "class": "className" + } +} ); + +// Support: IE <=11 only +// Accessing the selectedIndex property +// forces the browser to respect setting selected +// on the option +// The getter ensures a default option is selected +// when in an optgroup +// eslint rule "no-unused-expressions" is disabled for this code +// since it considers such accessions noop +if ( !support.optSelected ) { + jQuery.propHooks.selected = { + get: function( elem ) { + + /* eslint no-unused-expressions: "off" */ + + var parent = elem.parentNode; + if ( parent && parent.parentNode ) { + parent.parentNode.selectedIndex; + } + return null; + }, + set: function( elem ) { + + /* eslint no-unused-expressions: "off" */ + + var parent = elem.parentNode; + if ( parent ) { + parent.selectedIndex; + + if ( parent.parentNode ) { + parent.parentNode.selectedIndex; + } + } + } + }; +} + +jQuery.each( [ + "tabIndex", + "readOnly", + "maxLength", + "cellSpacing", + "cellPadding", + "rowSpan", + "colSpan", + "useMap", + "frameBorder", + "contentEditable" +], function() { + jQuery.propFix[ this.toLowerCase() ] = this; +} ); + + + + + // Strip and collapse whitespace according to HTML spec + // https://html.spec.whatwg.org/multipage/infrastructure.html#strip-and-collapse-whitespace + function stripAndCollapse( value ) { + var tokens = value.match( rnothtmlwhite ) || []; + return tokens.join( " " ); + } + + +function getClass( elem ) { + return elem.getAttribute && elem.getAttribute( "class" ) || ""; +} + +jQuery.fn.extend( { + addClass: function( value ) { + var classes, elem, cur, curValue, clazz, j, finalValue, + i = 0; + + if ( jQuery.isFunction( value ) ) { + return this.each( 
function( j ) { + jQuery( this ).addClass( value.call( this, j, getClass( this ) ) ); + } ); + } + + if ( typeof value === "string" && value ) { + classes = value.match( rnothtmlwhite ) || []; + + while ( ( elem = this[ i++ ] ) ) { + curValue = getClass( elem ); + cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); + + if ( cur ) { + j = 0; + while ( ( clazz = classes[ j++ ] ) ) { + if ( cur.indexOf( " " + clazz + " " ) < 0 ) { + cur += clazz + " "; + } + } + + // Only assign if different to avoid unneeded rendering. + finalValue = stripAndCollapse( cur ); + if ( curValue !== finalValue ) { + elem.setAttribute( "class", finalValue ); + } + } + } + } + + return this; + }, + + removeClass: function( value ) { + var classes, elem, cur, curValue, clazz, j, finalValue, + i = 0; + + if ( jQuery.isFunction( value ) ) { + return this.each( function( j ) { + jQuery( this ).removeClass( value.call( this, j, getClass( this ) ) ); + } ); + } + + if ( !arguments.length ) { + return this.attr( "class", "" ); + } + + if ( typeof value === "string" && value ) { + classes = value.match( rnothtmlwhite ) || []; + + while ( ( elem = this[ i++ ] ) ) { + curValue = getClass( elem ); + + // This expression is here for better compressibility (see addClass) + cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); + + if ( cur ) { + j = 0; + while ( ( clazz = classes[ j++ ] ) ) { + + // Remove *all* instances + while ( cur.indexOf( " " + clazz + " " ) > -1 ) { + cur = cur.replace( " " + clazz + " ", " " ); + } + } + + // Only assign if different to avoid unneeded rendering. + finalValue = stripAndCollapse( cur ); + if ( curValue !== finalValue ) { + elem.setAttribute( "class", finalValue ); + } + } + } + } + + return this; + }, + + toggleClass: function( value, stateVal ) { + var type = typeof value; + + if ( typeof stateVal === "boolean" && type === "string" ) { + return stateVal ? 
this.addClass( value ) : this.removeClass( value ); + } + + if ( jQuery.isFunction( value ) ) { + return this.each( function( i ) { + jQuery( this ).toggleClass( + value.call( this, i, getClass( this ), stateVal ), + stateVal + ); + } ); + } + + return this.each( function() { + var className, i, self, classNames; + + if ( type === "string" ) { + + // Toggle individual class names + i = 0; + self = jQuery( this ); + classNames = value.match( rnothtmlwhite ) || []; + + while ( ( className = classNames[ i++ ] ) ) { + + // Check each className given, space separated list + if ( self.hasClass( className ) ) { + self.removeClass( className ); + } else { + self.addClass( className ); + } + } + + // Toggle whole class name + } else if ( value === undefined || type === "boolean" ) { + className = getClass( this ); + if ( className ) { + + // Store className if set + dataPriv.set( this, "__className__", className ); + } + + // If the element has a class name or if we're passed `false`, + // then remove the whole classname (if there was one, the above saved it). + // Otherwise bring back whatever was previously saved (if anything), + // falling back to the empty string if nothing was stored. + if ( this.setAttribute ) { + this.setAttribute( "class", + className || value === false ? 
+ "" : + dataPriv.get( this, "__className__" ) || "" + ); + } + } + } ); + }, + + hasClass: function( selector ) { + var className, elem, + i = 0; + + className = " " + selector + " "; + while ( ( elem = this[ i++ ] ) ) { + if ( elem.nodeType === 1 && + ( " " + stripAndCollapse( getClass( elem ) ) + " " ).indexOf( className ) > -1 ) { + return true; + } + } + + return false; + } +} ); + + + + +var rreturn = /\r/g; + +jQuery.fn.extend( { + val: function( value ) { + var hooks, ret, isFunction, + elem = this[ 0 ]; + + if ( !arguments.length ) { + if ( elem ) { + hooks = jQuery.valHooks[ elem.type ] || + jQuery.valHooks[ elem.nodeName.toLowerCase() ]; + + if ( hooks && + "get" in hooks && + ( ret = hooks.get( elem, "value" ) ) !== undefined + ) { + return ret; + } + + ret = elem.value; + + // Handle most common string cases + if ( typeof ret === "string" ) { + return ret.replace( rreturn, "" ); + } + + // Handle cases where value is null/undef or number + return ret == null ? "" : ret; + } + + return; + } + + isFunction = jQuery.isFunction( value ); + + return this.each( function( i ) { + var val; + + if ( this.nodeType !== 1 ) { + return; + } + + if ( isFunction ) { + val = value.call( this, i, jQuery( this ).val() ); + } else { + val = value; + } + + // Treat null/undefined as ""; convert numbers to string + if ( val == null ) { + val = ""; + + } else if ( typeof val === "number" ) { + val += ""; + + } else if ( Array.isArray( val ) ) { + val = jQuery.map( val, function( value ) { + return value == null ? "" : value + ""; + } ); + } + + hooks = jQuery.valHooks[ this.type ] || jQuery.valHooks[ this.nodeName.toLowerCase() ]; + + // If set returns undefined, fall back to normal setting + if ( !hooks || !( "set" in hooks ) || hooks.set( this, val, "value" ) === undefined ) { + this.value = val; + } + } ); + } +} ); + +jQuery.extend( { + valHooks: { + option: { + get: function( elem ) { + + var val = jQuery.find.attr( elem, "value" ); + return val != null ? 
+ val : + + // Support: IE <=10 - 11 only + // option.text throws exceptions (#14686, #14858) + // Strip and collapse whitespace + // https://html.spec.whatwg.org/#strip-and-collapse-whitespace + stripAndCollapse( jQuery.text( elem ) ); + } + }, + select: { + get: function( elem ) { + var value, option, i, + options = elem.options, + index = elem.selectedIndex, + one = elem.type === "select-one", + values = one ? null : [], + max = one ? index + 1 : options.length; + + if ( index < 0 ) { + i = max; + + } else { + i = one ? index : 0; + } + + // Loop through all the selected options + for ( ; i < max; i++ ) { + option = options[ i ]; + + // Support: IE <=9 only + // IE8-9 doesn't update selected after form reset (#2551) + if ( ( option.selected || i === index ) && + + // Don't return options that are disabled or in a disabled optgroup + !option.disabled && + ( !option.parentNode.disabled || + !nodeName( option.parentNode, "optgroup" ) ) ) { + + // Get the specific value for the option + value = jQuery( option ).val(); + + // We don't need an array for one selects + if ( one ) { + return value; + } + + // Multi-Selects return an array + values.push( value ); + } + } + + return values; + }, + + set: function( elem, value ) { + var optionSet, option, + options = elem.options, + values = jQuery.makeArray( value ), + i = options.length; + + while ( i-- ) { + option = options[ i ]; + + /* eslint-disable no-cond-assign */ + + if ( option.selected = + jQuery.inArray( jQuery.valHooks.option.get( option ), values ) > -1 + ) { + optionSet = true; + } + + /* eslint-enable no-cond-assign */ + } + + // Force browsers to behave consistently when non-matching value is set + if ( !optionSet ) { + elem.selectedIndex = -1; + } + return values; + } + } + } +} ); + +// Radios and checkboxes getter/setter +jQuery.each( [ "radio", "checkbox" ], function() { + jQuery.valHooks[ this ] = { + set: function( elem, value ) { + if ( Array.isArray( value ) ) { + return ( elem.checked = 
jQuery.inArray( jQuery( elem ).val(), value ) > -1 ); + } + } + }; + if ( !support.checkOn ) { + jQuery.valHooks[ this ].get = function( elem ) { + return elem.getAttribute( "value" ) === null ? "on" : elem.value; + }; + } +} ); + + + + +// Return jQuery for attributes-only inclusion + + +var rfocusMorph = /^(?:focusinfocus|focusoutblur)$/; + +jQuery.extend( jQuery.event, { + + trigger: function( event, data, elem, onlyHandlers ) { + + var i, cur, tmp, bubbleType, ontype, handle, special, + eventPath = [ elem || document ], + type = hasOwn.call( event, "type" ) ? event.type : event, + namespaces = hasOwn.call( event, "namespace" ) ? event.namespace.split( "." ) : []; + + cur = tmp = elem = elem || document; + + // Don't do events on text and comment nodes + if ( elem.nodeType === 3 || elem.nodeType === 8 ) { + return; + } + + // focus/blur morphs to focusin/out; ensure we're not firing them right now + if ( rfocusMorph.test( type + jQuery.event.triggered ) ) { + return; + } + + if ( type.indexOf( "." ) > -1 ) { + + // Namespaced trigger; create a regexp to match event type in handle() + namespaces = type.split( "." ); + type = namespaces.shift(); + namespaces.sort(); + } + ontype = type.indexOf( ":" ) < 0 && "on" + type; + + // Caller can pass in a jQuery.Event object, Object, or just an event type string + event = event[ jQuery.expando ] ? + event : + new jQuery.Event( type, typeof event === "object" && event ); + + // Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true) + event.isTrigger = onlyHandlers ? 2 : 3; + event.namespace = namespaces.join( "." ); + event.rnamespace = event.namespace ? + new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ) : + null; + + // Clean up the event in case it is being reused + event.result = undefined; + if ( !event.target ) { + event.target = elem; + } + + // Clone any incoming data and prepend the event, creating the handler arg list + data = data == null ? 
+ [ event ] : + jQuery.makeArray( data, [ event ] ); + + // Allow special events to draw outside the lines + special = jQuery.event.special[ type ] || {}; + if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) { + return; + } + + // Determine event propagation path in advance, per W3C events spec (#9951) + // Bubble up to document, then to window; watch for a global ownerDocument var (#9724) + if ( !onlyHandlers && !special.noBubble && !jQuery.isWindow( elem ) ) { + + bubbleType = special.delegateType || type; + if ( !rfocusMorph.test( bubbleType + type ) ) { + cur = cur.parentNode; + } + for ( ; cur; cur = cur.parentNode ) { + eventPath.push( cur ); + tmp = cur; + } + + // Only add window if we got to document (e.g., not plain obj or detached DOM) + if ( tmp === ( elem.ownerDocument || document ) ) { + eventPath.push( tmp.defaultView || tmp.parentWindow || window ); + } + } + + // Fire handlers on the event path + i = 0; + while ( ( cur = eventPath[ i++ ] ) && !event.isPropagationStopped() ) { + + event.type = i > 1 ? + bubbleType : + special.bindType || type; + + // jQuery handler + handle = ( dataPriv.get( cur, "events" ) || {} )[ event.type ] && + dataPriv.get( cur, "handle" ); + if ( handle ) { + handle.apply( cur, data ); + } + + // Native handler + handle = ontype && cur[ ontype ]; + if ( handle && handle.apply && acceptData( cur ) ) { + event.result = handle.apply( cur, data ); + if ( event.result === false ) { + event.preventDefault(); + } + } + } + event.type = type; + + // If nobody prevented the default action, do it now + if ( !onlyHandlers && !event.isDefaultPrevented() ) { + + if ( ( !special._default || + special._default.apply( eventPath.pop(), data ) === false ) && + acceptData( elem ) ) { + + // Call a native DOM method on the target with the same name as the event. 
+ // Don't do default actions on window, that's where global variables be (#6170) + if ( ontype && jQuery.isFunction( elem[ type ] ) && !jQuery.isWindow( elem ) ) { + + // Don't re-trigger an onFOO event when we call its FOO() method + tmp = elem[ ontype ]; + + if ( tmp ) { + elem[ ontype ] = null; + } + + // Prevent re-triggering of the same event, since we already bubbled it above + jQuery.event.triggered = type; + elem[ type ](); + jQuery.event.triggered = undefined; + + if ( tmp ) { + elem[ ontype ] = tmp; + } + } + } + } + + return event.result; + }, + + // Piggyback on a donor event to simulate a different one + // Used only for `focus(in | out)` events + simulate: function( type, elem, event ) { + var e = jQuery.extend( + new jQuery.Event(), + event, + { + type: type, + isSimulated: true + } + ); + + jQuery.event.trigger( e, null, elem ); + } + +} ); + +jQuery.fn.extend( { + + trigger: function( type, data ) { + return this.each( function() { + jQuery.event.trigger( type, data, this ); + } ); + }, + triggerHandler: function( type, data ) { + var elem = this[ 0 ]; + if ( elem ) { + return jQuery.event.trigger( type, data, elem, true ); + } + } +} ); + + +jQuery.each( ( "blur focus focusin focusout resize scroll click dblclick " + + "mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave " + + "change select submit keydown keypress keyup contextmenu" ).split( " " ), + function( i, name ) { + + // Handle event binding + jQuery.fn[ name ] = function( data, fn ) { + return arguments.length > 0 ? 
+ this.on( name, null, data, fn ) : + this.trigger( name ); + }; +} ); + +jQuery.fn.extend( { + hover: function( fnOver, fnOut ) { + return this.mouseenter( fnOver ).mouseleave( fnOut || fnOver ); + } +} ); + + + + +support.focusin = "onfocusin" in window; + + +// Support: Firefox <=44 +// Firefox doesn't have focus(in | out) events +// Related ticket - https://bugzilla.mozilla.org/show_bug.cgi?id=687787 +// +// Support: Chrome <=48 - 49, Safari <=9.0 - 9.1 +// focus(in | out) events fire after focus & blur events, +// which is spec violation - http://www.w3.org/TR/DOM-Level-3-Events/#events-focusevent-event-order +// Related ticket - https://bugs.chromium.org/p/chromium/issues/detail?id=449857 +if ( !support.focusin ) { + jQuery.each( { focus: "focusin", blur: "focusout" }, function( orig, fix ) { + + // Attach a single capturing handler on the document while someone wants focusin/focusout + var handler = function( event ) { + jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ) ); + }; + + jQuery.event.special[ fix ] = { + setup: function() { + var doc = this.ownerDocument || this, + attaches = dataPriv.access( doc, fix ); + + if ( !attaches ) { + doc.addEventListener( orig, handler, true ); + } + dataPriv.access( doc, fix, ( attaches || 0 ) + 1 ); + }, + teardown: function() { + var doc = this.ownerDocument || this, + attaches = dataPriv.access( doc, fix ) - 1; + + if ( !attaches ) { + doc.removeEventListener( orig, handler, true ); + dataPriv.remove( doc, fix ); + + } else { + dataPriv.access( doc, fix, attaches ); + } + } + }; + } ); +} +var location = window.location; + +var nonce = jQuery.now(); + +var rquery = ( /\?/ ); + + + +// Cross-browser xml parsing +jQuery.parseXML = function( data ) { + var xml; + if ( !data || typeof data !== "string" ) { + return null; + } + + // Support: IE 9 - 11 only + // IE throws on parseFromString with invalid input. 
+ try { + xml = ( new window.DOMParser() ).parseFromString( data, "text/xml" ); + } catch ( e ) { + xml = undefined; + } + + if ( !xml || xml.getElementsByTagName( "parsererror" ).length ) { + jQuery.error( "Invalid XML: " + data ); + } + return xml; +}; + + +var + rbracket = /\[\]$/, + rCRLF = /\r?\n/g, + rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i, + rsubmittable = /^(?:input|select|textarea|keygen)/i; + +function buildParams( prefix, obj, traditional, add ) { + var name; + + if ( Array.isArray( obj ) ) { + + // Serialize array item. + jQuery.each( obj, function( i, v ) { + if ( traditional || rbracket.test( prefix ) ) { + + // Treat each array item as a scalar. + add( prefix, v ); + + } else { + + // Item is non-scalar (array or object), encode its numeric index. + buildParams( + prefix + "[" + ( typeof v === "object" && v != null ? i : "" ) + "]", + v, + traditional, + add + ); + } + } ); + + } else if ( !traditional && jQuery.type( obj ) === "object" ) { + + // Serialize object item. + for ( name in obj ) { + buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add ); + } + + } else { + + // Serialize scalar item. + add( prefix, obj ); + } +} + +// Serialize an array of form elements or a set of +// key/values into a query string +jQuery.param = function( a, traditional ) { + var prefix, + s = [], + add = function( key, valueOrFunction ) { + + // If value is a function, invoke it and use its return value + var value = jQuery.isFunction( valueOrFunction ) ? + valueOrFunction() : + valueOrFunction; + + s[ s.length ] = encodeURIComponent( key ) + "=" + + encodeURIComponent( value == null ? "" : value ); + }; + + // If an array was passed in, assume that it is an array of form elements. 
+ if ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) { + + // Serialize the form elements + jQuery.each( a, function() { + add( this.name, this.value ); + } ); + + } else { + + // If traditional, encode the "old" way (the way 1.3.2 or older + // did it), otherwise encode params recursively. + for ( prefix in a ) { + buildParams( prefix, a[ prefix ], traditional, add ); + } + } + + // Return the resulting serialization + return s.join( "&" ); +}; + +jQuery.fn.extend( { + serialize: function() { + return jQuery.param( this.serializeArray() ); + }, + serializeArray: function() { + return this.map( function() { + + // Can add propHook for "elements" to filter or add form elements + var elements = jQuery.prop( this, "elements" ); + return elements ? jQuery.makeArray( elements ) : this; + } ) + .filter( function() { + var type = this.type; + + // Use .is( ":disabled" ) so that fieldset[disabled] works + return this.name && !jQuery( this ).is( ":disabled" ) && + rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) && + ( this.checked || !rcheckableType.test( type ) ); + } ) + .map( function( i, elem ) { + var val = jQuery( this ).val(); + + if ( val == null ) { + return null; + } + + if ( Array.isArray( val ) ) { + return jQuery.map( val, function( val ) { + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ); + } + + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ).get(); + } +} ); + + +var + r20 = /%20/g, + rhash = /#.*$/, + rantiCache = /([?&])_=[^&]*/, + rheaders = /^(.*?):[ \t]*([^\r\n]*)$/mg, + + // #7653, #8125, #8152: local protocol detection + rlocalProtocol = /^(?:about|app|app-storage|.+-extension|file|res|widget):$/, + rnoContent = /^(?:GET|HEAD)$/, + rprotocol = /^\/\//, + + /* Prefilters + * 1) They are useful to introduce custom dataTypes (see ajax/jsonp.js for an example) + * 2) These are called: + * - BEFORE asking for a transport + * - AFTER param serialization (s.data is a 
string if s.processData is true) + * 3) key is the dataType + * 4) the catchall symbol "*" can be used + * 5) execution will start with transport dataType and THEN continue down to "*" if needed + */ + prefilters = {}, + + /* Transports bindings + * 1) key is the dataType + * 2) the catchall symbol "*" can be used + * 3) selection will start with transport dataType and THEN go to "*" if needed + */ + transports = {}, + + // Avoid comment-prolog char sequence (#10098); must appease lint and evade compression + allTypes = "*/".concat( "*" ), + + // Anchor tag for parsing the document origin + originAnchor = document.createElement( "a" ); + originAnchor.href = location.href; + +// Base "constructor" for jQuery.ajaxPrefilter and jQuery.ajaxTransport +function addToPrefiltersOrTransports( structure ) { + + // dataTypeExpression is optional and defaults to "*" + return function( dataTypeExpression, func ) { + + if ( typeof dataTypeExpression !== "string" ) { + func = dataTypeExpression; + dataTypeExpression = "*"; + } + + var dataType, + i = 0, + dataTypes = dataTypeExpression.toLowerCase().match( rnothtmlwhite ) || []; + + if ( jQuery.isFunction( func ) ) { + + // For each dataType in the dataTypeExpression + while ( ( dataType = dataTypes[ i++ ] ) ) { + + // Prepend if requested + if ( dataType[ 0 ] === "+" ) { + dataType = dataType.slice( 1 ) || "*"; + ( structure[ dataType ] = structure[ dataType ] || [] ).unshift( func ); + + // Otherwise append + } else { + ( structure[ dataType ] = structure[ dataType ] || [] ).push( func ); + } + } + } + }; +} + +// Base inspection function for prefilters and transports +function inspectPrefiltersOrTransports( structure, options, originalOptions, jqXHR ) { + + var inspected = {}, + seekingTransport = ( structure === transports ); + + function inspect( dataType ) { + var selected; + inspected[ dataType ] = true; + jQuery.each( structure[ dataType ] || [], function( _, prefilterOrFactory ) { + var dataTypeOrTransport = 
prefilterOrFactory( options, originalOptions, jqXHR ); + if ( typeof dataTypeOrTransport === "string" && + !seekingTransport && !inspected[ dataTypeOrTransport ] ) { + + options.dataTypes.unshift( dataTypeOrTransport ); + inspect( dataTypeOrTransport ); + return false; + } else if ( seekingTransport ) { + return !( selected = dataTypeOrTransport ); + } + } ); + return selected; + } + + return inspect( options.dataTypes[ 0 ] ) || !inspected[ "*" ] && inspect( "*" ); +} + +// A special extend for ajax options +// that takes "flat" options (not to be deep extended) +// Fixes #9887 +function ajaxExtend( target, src ) { + var key, deep, + flatOptions = jQuery.ajaxSettings.flatOptions || {}; + + for ( key in src ) { + if ( src[ key ] !== undefined ) { + ( flatOptions[ key ] ? target : ( deep || ( deep = {} ) ) )[ key ] = src[ key ]; + } + } + if ( deep ) { + jQuery.extend( true, target, deep ); + } + + return target; +} + +/* Handles responses to an ajax request: + * - finds the right dataType (mediates between content-type and expected dataType) + * - returns the corresponding response + */ +function ajaxHandleResponses( s, jqXHR, responses ) { + + var ct, type, finalDataType, firstDataType, + contents = s.contents, + dataTypes = s.dataTypes; + + // Remove auto dataType and get content-type in the process + while ( dataTypes[ 0 ] === "*" ) { + dataTypes.shift(); + if ( ct === undefined ) { + ct = s.mimeType || jqXHR.getResponseHeader( "Content-Type" ); + } + } + + // Check if we're dealing with a known content-type + if ( ct ) { + for ( type in contents ) { + if ( contents[ type ] && contents[ type ].test( ct ) ) { + dataTypes.unshift( type ); + break; + } + } + } + + // Check to see if we have a response for the expected dataType + if ( dataTypes[ 0 ] in responses ) { + finalDataType = dataTypes[ 0 ]; + } else { + + // Try convertible dataTypes + for ( type in responses ) { + if ( !dataTypes[ 0 ] || s.converters[ type + " " + dataTypes[ 0 ] ] ) { + finalDataType = 
type; + break; + } + if ( !firstDataType ) { + firstDataType = type; + } + } + + // Or just use first one + finalDataType = finalDataType || firstDataType; + } + + // If we found a dataType + // We add the dataType to the list if needed + // and return the corresponding response + if ( finalDataType ) { + if ( finalDataType !== dataTypes[ 0 ] ) { + dataTypes.unshift( finalDataType ); + } + return responses[ finalDataType ]; + } +} + +/* Chain conversions given the request and the original response + * Also sets the responseXXX fields on the jqXHR instance + */ +function ajaxConvert( s, response, jqXHR, isSuccess ) { + var conv2, current, conv, tmp, prev, + converters = {}, + + // Work with a copy of dataTypes in case we need to modify it for conversion + dataTypes = s.dataTypes.slice(); + + // Create converters map with lowercased keys + if ( dataTypes[ 1 ] ) { + for ( conv in s.converters ) { + converters[ conv.toLowerCase() ] = s.converters[ conv ]; + } + } + + current = dataTypes.shift(); + + // Convert to each sequential dataType + while ( current ) { + + if ( s.responseFields[ current ] ) { + jqXHR[ s.responseFields[ current ] ] = response; + } + + // Apply the dataFilter if provided + if ( !prev && isSuccess && s.dataFilter ) { + response = s.dataFilter( response, s.dataType ); + } + + prev = current; + current = dataTypes.shift(); + + if ( current ) { + + // There's only work to do if current dataType is non-auto + if ( current === "*" ) { + + current = prev; + + // Convert response if prev dataType is non-auto and differs from current + } else if ( prev !== "*" && prev !== current ) { + + // Seek a direct converter + conv = converters[ prev + " " + current ] || converters[ "* " + current ]; + + // If none found, seek a pair + if ( !conv ) { + for ( conv2 in converters ) { + + // If conv2 outputs current + tmp = conv2.split( " " ); + if ( tmp[ 1 ] === current ) { + + // If prev can be converted to accepted input + conv = converters[ prev + " " + tmp[ 0 ] ] 
|| + converters[ "* " + tmp[ 0 ] ]; + if ( conv ) { + + // Condense equivalence converters + if ( conv === true ) { + conv = converters[ conv2 ]; + + // Otherwise, insert the intermediate dataType + } else if ( converters[ conv2 ] !== true ) { + current = tmp[ 0 ]; + dataTypes.unshift( tmp[ 1 ] ); + } + break; + } + } + } + } + + // Apply converter (if not an equivalence) + if ( conv !== true ) { + + // Unless errors are allowed to bubble, catch and return them + if ( conv && s.throws ) { + response = conv( response ); + } else { + try { + response = conv( response ); + } catch ( e ) { + return { + state: "parsererror", + error: conv ? e : "No conversion from " + prev + " to " + current + }; + } + } + } + } + } + } + + return { state: "success", data: response }; +} + +jQuery.extend( { + + // Counter for holding the number of active queries + active: 0, + + // Last-Modified header cache for next request + lastModified: {}, + etag: {}, + + ajaxSettings: { + url: location.href, + type: "GET", + isLocal: rlocalProtocol.test( location.protocol ), + global: true, + processData: true, + async: true, + contentType: "application/x-www-form-urlencoded; charset=UTF-8", + + /* + timeout: 0, + data: null, + dataType: null, + username: null, + password: null, + cache: null, + throws: false, + traditional: false, + headers: {}, + */ + + accepts: { + "*": allTypes, + text: "text/plain", + html: "text/html", + xml: "application/xml, text/xml", + json: "application/json, text/javascript" + }, + + contents: { + xml: /\bxml\b/, + html: /\bhtml/, + json: /\bjson\b/ + }, + + responseFields: { + xml: "responseXML", + text: "responseText", + json: "responseJSON" + }, + + // Data converters + // Keys separate source (or catchall "*") and destination types with a single space + converters: { + + // Convert anything to text + "* text": String, + + // Text to html (true = no transformation) + "text html": true, + + // Evaluate text as a json expression + "text json": JSON.parse, + + // Parse 
text as xml + "text xml": jQuery.parseXML + }, + + // For options that shouldn't be deep extended: + // you can add your own custom options here if + // and when you create one that shouldn't be + // deep extended (see ajaxExtend) + flatOptions: { + url: true, + context: true + } + }, + + // Creates a full fledged settings object into target + // with both ajaxSettings and settings fields. + // If target is omitted, writes into ajaxSettings. + ajaxSetup: function( target, settings ) { + return settings ? + + // Building a settings object + ajaxExtend( ajaxExtend( target, jQuery.ajaxSettings ), settings ) : + + // Extending ajaxSettings + ajaxExtend( jQuery.ajaxSettings, target ); + }, + + ajaxPrefilter: addToPrefiltersOrTransports( prefilters ), + ajaxTransport: addToPrefiltersOrTransports( transports ), + + // Main method + ajax: function( url, options ) { + + // If url is an object, simulate pre-1.5 signature + if ( typeof url === "object" ) { + options = url; + url = undefined; + } + + // Force options to be an object + options = options || {}; + + var transport, + + // URL without anti-cache param + cacheURL, + + // Response headers + responseHeadersString, + responseHeaders, + + // timeout handle + timeoutTimer, + + // Url cleanup var + urlAnchor, + + // Request state (becomes false upon send and true upon completion) + completed, + + // To know if global events are to be dispatched + fireGlobals, + + // Loop variable + i, + + // uncached part of the url + uncached, + + // Create the final options object + s = jQuery.ajaxSetup( {}, options ), + + // Callbacks context + callbackContext = s.context || s, + + // Context for global events is callbackContext if it is a DOM node or jQuery collection + globalEventContext = s.context && + ( callbackContext.nodeType || callbackContext.jquery ) ? 
+ jQuery( callbackContext ) : + jQuery.event, + + // Deferreds + deferred = jQuery.Deferred(), + completeDeferred = jQuery.Callbacks( "once memory" ), + + // Status-dependent callbacks + statusCode = s.statusCode || {}, + + // Headers (they are sent all at once) + requestHeaders = {}, + requestHeadersNames = {}, + + // Default abort message + strAbort = "canceled", + + // Fake xhr + jqXHR = { + readyState: 0, + + // Builds headers hashtable if needed + getResponseHeader: function( key ) { + var match; + if ( completed ) { + if ( !responseHeaders ) { + responseHeaders = {}; + while ( ( match = rheaders.exec( responseHeadersString ) ) ) { + responseHeaders[ match[ 1 ].toLowerCase() ] = match[ 2 ]; + } + } + match = responseHeaders[ key.toLowerCase() ]; + } + return match == null ? null : match; + }, + + // Raw string + getAllResponseHeaders: function() { + return completed ? responseHeadersString : null; + }, + + // Caches the header + setRequestHeader: function( name, value ) { + if ( completed == null ) { + name = requestHeadersNames[ name.toLowerCase() ] = + requestHeadersNames[ name.toLowerCase() ] || name; + requestHeaders[ name ] = value; + } + return this; + }, + + // Overrides response content-type header + overrideMimeType: function( type ) { + if ( completed == null ) { + s.mimeType = type; + } + return this; + }, + + // Status-dependent callbacks + statusCode: function( map ) { + var code; + if ( map ) { + if ( completed ) { + + // Execute the appropriate callbacks + jqXHR.always( map[ jqXHR.status ] ); + } else { + + // Lazy-add the new callbacks in a way that preserves old ones + for ( code in map ) { + statusCode[ code ] = [ statusCode[ code ], map[ code ] ]; + } + } + } + return this; + }, + + // Cancel the request + abort: function( statusText ) { + var finalText = statusText || strAbort; + if ( transport ) { + transport.abort( finalText ); + } + done( 0, finalText ); + return this; + } + }; + + // Attach deferreds + deferred.promise( jqXHR ); + + // 
Add protocol if not provided (prefilters might expect it) + // Handle falsy url in the settings object (#10093: consistency with old signature) + // We also use the url parameter if available + s.url = ( ( url || s.url || location.href ) + "" ) + .replace( rprotocol, location.protocol + "//" ); + + // Alias method option to type as per ticket #12004 + s.type = options.method || options.type || s.method || s.type; + + // Extract dataTypes list + s.dataTypes = ( s.dataType || "*" ).toLowerCase().match( rnothtmlwhite ) || [ "" ]; + + // A cross-domain request is in order when the origin doesn't match the current origin. + if ( s.crossDomain == null ) { + urlAnchor = document.createElement( "a" ); + + // Support: IE <=8 - 11, Edge 12 - 13 + // IE throws exception on accessing the href property if url is malformed, + // e.g. http://example.com:80x/ + try { + urlAnchor.href = s.url; + + // Support: IE <=8 - 11 only + // Anchor's host property isn't correctly set when s.url is relative + urlAnchor.href = urlAnchor.href; + s.crossDomain = originAnchor.protocol + "//" + originAnchor.host !== + urlAnchor.protocol + "//" + urlAnchor.host; + } catch ( e ) { + + // If there is an error parsing the URL, assume it is crossDomain, + // it can be rejected by the transport if it is invalid + s.crossDomain = true; + } + } + + // Convert data if not already a string + if ( s.data && s.processData && typeof s.data !== "string" ) { + s.data = jQuery.param( s.data, s.traditional ); + } + + // Apply prefilters + inspectPrefiltersOrTransports( prefilters, s, options, jqXHR ); + + // If request was aborted inside a prefilter, stop there + if ( completed ) { + return jqXHR; + } + + // We can fire global events as of now if asked to + // Don't fire events if jQuery.event is undefined in an AMD-usage scenario (#15118) + fireGlobals = jQuery.event && s.global; + + // Watch for a new set of requests + if ( fireGlobals && jQuery.active++ === 0 ) { + jQuery.event.trigger( "ajaxStart" ); + } + + // 
Uppercase the type + s.type = s.type.toUpperCase(); + + // Determine if request has content + s.hasContent = !rnoContent.test( s.type ); + + // Save the URL in case we're toying with the If-Modified-Since + // and/or If-None-Match header later on + // Remove hash to simplify url manipulation + cacheURL = s.url.replace( rhash, "" ); + + // More options handling for requests with no content + if ( !s.hasContent ) { + + // Remember the hash so we can put it back + uncached = s.url.slice( cacheURL.length ); + + // If data is available, append data to url + if ( s.data ) { + cacheURL += ( rquery.test( cacheURL ) ? "&" : "?" ) + s.data; + + // #9682: remove data so that it's not used in an eventual retry + delete s.data; + } + + // Add or update anti-cache param if needed + if ( s.cache === false ) { + cacheURL = cacheURL.replace( rantiCache, "$1" ); + uncached = ( rquery.test( cacheURL ) ? "&" : "?" ) + "_=" + ( nonce++ ) + uncached; + } + + // Put hash and anti-cache on the URL that will be requested (gh-1732) + s.url = cacheURL + uncached; + + // Change '%20' to '+' if this is encoded form body content (gh-2658) + } else if ( s.data && s.processData && + ( s.contentType || "" ).indexOf( "application/x-www-form-urlencoded" ) === 0 ) { + s.data = s.data.replace( r20, "+" ); + } + + // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
+ if ( s.ifModified ) { + if ( jQuery.lastModified[ cacheURL ] ) { + jqXHR.setRequestHeader( "If-Modified-Since", jQuery.lastModified[ cacheURL ] ); + } + if ( jQuery.etag[ cacheURL ] ) { + jqXHR.setRequestHeader( "If-None-Match", jQuery.etag[ cacheURL ] ); + } + } + + // Set the correct header, if data is being sent + if ( s.data && s.hasContent && s.contentType !== false || options.contentType ) { + jqXHR.setRequestHeader( "Content-Type", s.contentType ); + } + + // Set the Accepts header for the server, depending on the dataType + jqXHR.setRequestHeader( + "Accept", + s.dataTypes[ 0 ] && s.accepts[ s.dataTypes[ 0 ] ] ? + s.accepts[ s.dataTypes[ 0 ] ] + + ( s.dataTypes[ 0 ] !== "*" ? ", " + allTypes + "; q=0.01" : "" ) : + s.accepts[ "*" ] + ); + + // Check for headers option + for ( i in s.headers ) { + jqXHR.setRequestHeader( i, s.headers[ i ] ); + } + + // Allow custom headers/mimetypes and early abort + if ( s.beforeSend && + ( s.beforeSend.call( callbackContext, jqXHR, s ) === false || completed ) ) { + + // Abort if not done already and return + return jqXHR.abort(); + } + + // Aborting is no longer a cancellation + strAbort = "abort"; + + // Install callbacks on deferreds + completeDeferred.add( s.complete ); + jqXHR.done( s.success ); + jqXHR.fail( s.error ); + + // Get transport + transport = inspectPrefiltersOrTransports( transports, s, options, jqXHR ); + + // If no transport, we auto-abort + if ( !transport ) { + done( -1, "No Transport" ); + } else { + jqXHR.readyState = 1; + + // Send global event + if ( fireGlobals ) { + globalEventContext.trigger( "ajaxSend", [ jqXHR, s ] ); + } + + // If request was aborted inside ajaxSend, stop there + if ( completed ) { + return jqXHR; + } + + // Timeout + if ( s.async && s.timeout > 0 ) { + timeoutTimer = window.setTimeout( function() { + jqXHR.abort( "timeout" ); + }, s.timeout ); + } + + try { + completed = false; + transport.send( requestHeaders, done ); + } catch ( e ) { + + // Rethrow post-completion 
exceptions + if ( completed ) { + throw e; + } + + // Propagate others as results + done( -1, e ); + } + } + + // Callback for when everything is done + function done( status, nativeStatusText, responses, headers ) { + var isSuccess, success, error, response, modified, + statusText = nativeStatusText; + + // Ignore repeat invocations + if ( completed ) { + return; + } + + completed = true; + + // Clear timeout if it exists + if ( timeoutTimer ) { + window.clearTimeout( timeoutTimer ); + } + + // Dereference transport for early garbage collection + // (no matter how long the jqXHR object will be used) + transport = undefined; + + // Cache response headers + responseHeadersString = headers || ""; + + // Set readyState + jqXHR.readyState = status > 0 ? 4 : 0; + + // Determine if successful + isSuccess = status >= 200 && status < 300 || status === 304; + + // Get response data + if ( responses ) { + response = ajaxHandleResponses( s, jqXHR, responses ); + } + + // Convert no matter what (that way responseXXX fields are always set) + response = ajaxConvert( s, response, jqXHR, isSuccess ); + + // If successful, handle type chaining + if ( isSuccess ) { + + // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
+ if ( s.ifModified ) { + modified = jqXHR.getResponseHeader( "Last-Modified" ); + if ( modified ) { + jQuery.lastModified[ cacheURL ] = modified; + } + modified = jqXHR.getResponseHeader( "etag" ); + if ( modified ) { + jQuery.etag[ cacheURL ] = modified; + } + } + + // if no content + if ( status === 204 || s.type === "HEAD" ) { + statusText = "nocontent"; + + // if not modified + } else if ( status === 304 ) { + statusText = "notmodified"; + + // If we have data, let's convert it + } else { + statusText = response.state; + success = response.data; + error = response.error; + isSuccess = !error; + } + } else { + + // Extract error from statusText and normalize for non-aborts + error = statusText; + if ( status || !statusText ) { + statusText = "error"; + if ( status < 0 ) { + status = 0; + } + } + } + + // Set data for the fake xhr object + jqXHR.status = status; + jqXHR.statusText = ( nativeStatusText || statusText ) + ""; + + // Success/Error + if ( isSuccess ) { + deferred.resolveWith( callbackContext, [ success, statusText, jqXHR ] ); + } else { + deferred.rejectWith( callbackContext, [ jqXHR, statusText, error ] ); + } + + // Status-dependent callbacks + jqXHR.statusCode( statusCode ); + statusCode = undefined; + + if ( fireGlobals ) { + globalEventContext.trigger( isSuccess ? "ajaxSuccess" : "ajaxError", + [ jqXHR, s, isSuccess ? 
success : error ] ); + } + + // Complete + completeDeferred.fireWith( callbackContext, [ jqXHR, statusText ] ); + + if ( fireGlobals ) { + globalEventContext.trigger( "ajaxComplete", [ jqXHR, s ] ); + + // Handle the global AJAX counter + if ( !( --jQuery.active ) ) { + jQuery.event.trigger( "ajaxStop" ); + } + } + } + + return jqXHR; + }, + + getJSON: function( url, data, callback ) { + return jQuery.get( url, data, callback, "json" ); + }, + + getScript: function( url, callback ) { + return jQuery.get( url, undefined, callback, "script" ); + } +} ); + +jQuery.each( [ "get", "post" ], function( i, method ) { + jQuery[ method ] = function( url, data, callback, type ) { + + // Shift arguments if data argument was omitted + if ( jQuery.isFunction( data ) ) { + type = type || callback; + callback = data; + data = undefined; + } + + // The url can be an options object (which then must have .url) + return jQuery.ajax( jQuery.extend( { + url: url, + type: method, + dataType: type, + data: data, + success: callback + }, jQuery.isPlainObject( url ) && url ) ); + }; +} ); + + +jQuery._evalUrl = function( url ) { + return jQuery.ajax( { + url: url, + + // Make this explicit, since user can override this through ajaxSetup (#11264) + type: "GET", + dataType: "script", + cache: true, + async: false, + global: false, + "throws": true + } ); +}; + + +jQuery.fn.extend( { + wrapAll: function( html ) { + var wrap; + + if ( this[ 0 ] ) { + if ( jQuery.isFunction( html ) ) { + html = html.call( this[ 0 ] ); + } + + // The elements to wrap the target around + wrap = jQuery( html, this[ 0 ].ownerDocument ).eq( 0 ).clone( true ); + + if ( this[ 0 ].parentNode ) { + wrap.insertBefore( this[ 0 ] ); + } + + wrap.map( function() { + var elem = this; + + while ( elem.firstElementChild ) { + elem = elem.firstElementChild; + } + + return elem; + } ).append( this ); + } + + return this; + }, + + wrapInner: function( html ) { + if ( jQuery.isFunction( html ) ) { + return this.each( function( i ) 
{ + jQuery( this ).wrapInner( html.call( this, i ) ); + } ); + } + + return this.each( function() { + var self = jQuery( this ), + contents = self.contents(); + + if ( contents.length ) { + contents.wrapAll( html ); + + } else { + self.append( html ); + } + } ); + }, + + wrap: function( html ) { + var isFunction = jQuery.isFunction( html ); + + return this.each( function( i ) { + jQuery( this ).wrapAll( isFunction ? html.call( this, i ) : html ); + } ); + }, + + unwrap: function( selector ) { + this.parent( selector ).not( "body" ).each( function() { + jQuery( this ).replaceWith( this.childNodes ); + } ); + return this; + } +} ); + + +jQuery.expr.pseudos.hidden = function( elem ) { + return !jQuery.expr.pseudos.visible( elem ); +}; +jQuery.expr.pseudos.visible = function( elem ) { + return !!( elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length ); +}; + + + + +jQuery.ajaxSettings.xhr = function() { + try { + return new window.XMLHttpRequest(); + } catch ( e ) {} +}; + +var xhrSuccessStatus = { + + // File protocol always yields status code 0, assume 200 + 0: 200, + + // Support: IE <=9 only + // #1450: sometimes IE returns 1223 when it should be 204 + 1223: 204 + }, + xhrSupported = jQuery.ajaxSettings.xhr(); + +support.cors = !!xhrSupported && ( "withCredentials" in xhrSupported ); +support.ajax = xhrSupported = !!xhrSupported; + +jQuery.ajaxTransport( function( options ) { + var callback, errorCallback; + + // Cross domain only allowed if supported through XMLHttpRequest + if ( support.cors || xhrSupported && !options.crossDomain ) { + return { + send: function( headers, complete ) { + var i, + xhr = options.xhr(); + + xhr.open( + options.type, + options.url, + options.async, + options.username, + options.password + ); + + // Apply custom fields if provided + if ( options.xhrFields ) { + for ( i in options.xhrFields ) { + xhr[ i ] = options.xhrFields[ i ]; + } + } + + // Override mime type if needed + if ( options.mimeType && 
xhr.overrideMimeType ) { + xhr.overrideMimeType( options.mimeType ); + } + + // X-Requested-With header + // For cross-domain requests, seeing as conditions for a preflight are + // akin to a jigsaw puzzle, we simply never set it to be sure. + // (it can always be set on a per-request basis or even using ajaxSetup) + // For same-domain requests, won't change header if already provided. + if ( !options.crossDomain && !headers[ "X-Requested-With" ] ) { + headers[ "X-Requested-With" ] = "XMLHttpRequest"; + } + + // Set headers + for ( i in headers ) { + xhr.setRequestHeader( i, headers[ i ] ); + } + + // Callback + callback = function( type ) { + return function() { + if ( callback ) { + callback = errorCallback = xhr.onload = + xhr.onerror = xhr.onabort = xhr.onreadystatechange = null; + + if ( type === "abort" ) { + xhr.abort(); + } else if ( type === "error" ) { + + // Support: IE <=9 only + // On a manual native abort, IE9 throws + // errors on any property access that is not readyState + if ( typeof xhr.status !== "number" ) { + complete( 0, "error" ); + } else { + complete( + + // File: protocol always yields status 0; see #8605, #14207 + xhr.status, + xhr.statusText + ); + } + } else { + complete( + xhrSuccessStatus[ xhr.status ] || xhr.status, + xhr.statusText, + + // Support: IE <=9 only + // IE9 has no XHR2 but throws on binary (trac-11426) + // For XHR2 non-text, let the caller handle it (gh-2498) + ( xhr.responseType || "text" ) !== "text" || + typeof xhr.responseText !== "string" ? 
+ { binary: xhr.response } : + { text: xhr.responseText }, + xhr.getAllResponseHeaders() + ); + } + } + }; + }; + + // Listen to events + xhr.onload = callback(); + errorCallback = xhr.onerror = callback( "error" ); + + // Support: IE 9 only + // Use onreadystatechange to replace onabort + // to handle uncaught aborts + if ( xhr.onabort !== undefined ) { + xhr.onabort = errorCallback; + } else { + xhr.onreadystatechange = function() { + + // Check readyState before timeout as it changes + if ( xhr.readyState === 4 ) { + + // Allow onerror to be called first, + // but that will not handle a native abort + // Also, save errorCallback to a variable + // as xhr.onerror cannot be accessed + window.setTimeout( function() { + if ( callback ) { + errorCallback(); + } + } ); + } + }; + } + + // Create the abort callback + callback = callback( "abort" ); + + try { + + // Do send the request (this may raise an exception) + xhr.send( options.hasContent && options.data || null ); + } catch ( e ) { + + // #14683: Only rethrow if this hasn't been notified as an error yet + if ( callback ) { + throw e; + } + } + }, + + abort: function() { + if ( callback ) { + callback(); + } + } + }; + } +} ); + + + + +// Prevent auto-execution of scripts when no explicit dataType was provided (See gh-2432) +jQuery.ajaxPrefilter( function( s ) { + if ( s.crossDomain ) { + s.contents.script = false; + } +} ); + +// Install script dataType +jQuery.ajaxSetup( { + accepts: { + script: "text/javascript, application/javascript, " + + "application/ecmascript, application/x-ecmascript" + }, + contents: { + script: /\b(?:java|ecma)script\b/ + }, + converters: { + "text script": function( text ) { + jQuery.globalEval( text ); + return text; + } + } +} ); + +// Handle cache's special case and crossDomain +jQuery.ajaxPrefilter( "script", function( s ) { + if ( s.cache === undefined ) { + s.cache = false; + } + if ( s.crossDomain ) { + s.type = "GET"; + } +} ); + +// Bind script tag hack transport 
+jQuery.ajaxTransport( "script", function( s ) { + + // This transport only deals with cross domain requests + if ( s.crossDomain ) { + var script, callback; + return { + send: function( _, complete ) { + script = jQuery( " + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/refs/pull/405/merge/index.html b/refs/pull/405/merge/index.html new file mode 100644 index 00000000..813082ed --- /dev/null +++ b/refs/pull/405/merge/index.html @@ -0,0 +1,1960 @@ + + + + + + Linux Kernel Teaching — The Linux Kernel documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Linux Kernel Teaching

+

This is a collection of lectures and labs on Linux kernel topics. The +lectures focus on theory and Linux kernel exploration.

+

The labs focus on device driver topics and they resemble "howto" +style documentation. Each topic has two parts:

+
    +
  • a walk-through of the topic, which contains an overview, the main +abstractions, simple examples and pointers to APIs
  • +
  • a hands-on part which contains a few exercises that should be +resolved by the student; to focus on the topic at hand, the student +is presented with a starting coding skeleton and with in-depth tips +on how to solve the exercises
  • +
+

This content is based on the Operating Systems 2 course from the Computer Science +and Engineering Department, the Faculty of Automatic Control and +Computers, University POLITEHNICA of Bucharest.

+

You can get the latest version at http://github.com/linux-kernel-labs.

+

To get started, build the documentation from the sources after +installing docker-compose on your host:

+
cd tools/labs && make docker-docs
+
+
+

then point your browser at Documentation/output/labs/index.html.

+

Alternatively, you can build directly on the host (see +tools/labs/docs/Dockerfile for dependencies):

+
cd tools/labs && make docs
+
+
+
+ +
+
+

Lectures

+ +
+
+

Labs

+ +
+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/info/contributing.html b/refs/pull/405/merge/info/contributing.html new file mode 100644 index 00000000..0fea866c --- /dev/null +++ b/refs/pull/405/merge/info/contributing.html @@ -0,0 +1,379 @@ + + + + + + Contributing to linux-kernel-labs — The Linux Kernel documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Contributing to linux-kernel-labs

+

linux-kernel-labs is an open platform. +You can help it get better by contributing to the documentation, exercises or +the infrastructure. +All contributions are welcome, no matter if they are just fixes for typos or +new sections in the documentation.

+

All information required for making a contribution can be found in the +linux-kernel-labs Linux repo. +In order to change anything, you need to create a Pull Request (PR) +from your own fork to this repository. +The PR will be reviewed by the members of the team and will be merged once +any potential issue is fixed.

+
+

Repository structure

+

The linux-kernel-labs repo is +a fork of the Linux kernel repo, with the following additions:

+
+
    +
  • /tools/labs: contains the labs and the virtual machine (VM) infrastructure
      +
    • tools/labs/templates: contains the skeleton sources
    • +
    • tools/labs/qemu: contains the qemu VM configuration
    • +
    +
  • +
  • /Documentation/teaching: contains the sources used to generate this +documentation
  • +
+
+
+
+

Building the documentation

+

To build the documentation, navigate to tools/labs and run the following +command:

+
make docs
+
+
+
+

Note

+

The command should install all the required packages. +In some cases, installing the packages or building the documentation might +fail because of broken dependency versions.

+

Instead of struggling to fix the dependencies, the simplest way to build +the documentation is to use Docker. +First, install docker and docker-compose on your host, and then run:

+
make docker-docs
+
+
+

The first run might take some time, but subsequent builds will be faster.

+
+
+
+

Creating a contribution

+
+

Forking the repository

+
    +
  1. If you haven't done it already, clone the +linux-kernel-labs +repository locally:

    +
    $ mkdir -p ~/src
    +$ git clone git@github.com:linux-kernel-labs/linux.git ~/src/linux
    +
    +
    +
  2. +
  3. Go to https://github.com/linux-kernel-labs/linux, make sure you are logged +in and click Fork in the top right of the page.

    +
  4. +
  5. Add the forked repo as a new remote to the local repo:

    +
    $ git remote add my_fork git@github.com:<your_username>/linux.git
    +
    +
    +
  6. +
+

Now, you can push to your fork by using my_fork instead of origin +(e.g. git push my_fork master).

+
+
+

Creating a pull request

+
+

Warning

+

Pull requests must be created from their own branches, which are started from +master.

+
+
    +
  1. Go to the master branch and make sure you have no local changes:
  2. +
+
+
student@eg106:~/src/linux$ git checkout master
+student@eg106:~/src/linux$ git status
+On branch master
+Your branch is up-to-date with 'origin/master'.
+nothing to commit, working directory clean
+
+
+
+
    +
  1. Make sure the local master branch is up-to-date with linux-kernel-labs:
  2. +
+
+
student@eg106:~/src/linux$ git pull origin master
+
+
+
+

Note

+

You can also push the latest master to your forked repo:

+
student@eg106:~/src/linux$ git push my_fork master
+
+
+
+
+
    +
  1. Create a new branch for your change:
  2. +
+
+
student@eg106:~/src/linux$ git checkout -b <your_branch_name>
+
+
+
+
    +
  1. Make some changes and commit them. In this example, we are going to change +Documentation/teaching/index.rst:
  2. +
+
+
student@eg106:~/src/linux$ vim Documentation/teaching/index.rst
+student@eg106:~/src/linux$ git add Documentation/teaching/index.rst
+student@eg106:~/src/linux$ git commit -m "<commit message>"
+
+
+
+

Warning

+

The commit message must include a relevant description of your change +and the location of the changed component.

+

Examples:

+
+
    +
  • documentation: index: Fix typo in the first section
  • +
  • labs: block_devices: Change printk log level
  • +
+
+
+
+
    +
  1. Push the local branch to your forked repository:
  2. +
+
+
student@eg106:~/src/linux$ git push my_fork <your_branch_name>
+
+
+
+
    +
  1. Open the Pull Request
  2. +
+
+
    +
  • Go to https://github.com and open your forked repository page
  • +
  • Click New pull request.
  • +
  • Make sure base repository (left side) is linux-kernel-labs/linux and the +base is master.
  • +
  • Make sure the head repository (right side) is your forked repo and the +compare branch is your pushed branch.
  • +
  • Click Create pull request.
  • +
+
+
+
+

Making changes to a Pull Request

+

After receiving feedback for your changes, you might need to update the Pull +Request. +Your goal is to do a new push on the same branch. For this, follow the next steps:

+
    +
  1. Make sure your branch is still up to date with the linux-kernel-labs repo +master branch.
  2. +
+
+
student@eg106:~/src/linux$ git fetch origin master
+student@eg106:~/src/linux$ git rebase FETCH_HEAD
+
+
+
+

Note

+

If you are getting conflicts, it means that someone else modified the same +files/lines as you and already merged the changes since you opened the +Pull Request.

+

In this case, you will need to fix the conflicts by editing the +conflicting files manually (run git status to see these files). +After fixing the conflicts, add them using git add and then run +git rebase --continue.

+
+
+
    +
  1. Apply the changes to your local files
  2. +
  3. Commit the changes. We want all the changes to be in the same commit, so +we will amend the changes to the initial commit.
  4. +
+
+
student@eg106:~/src/linux$ git add Documentation/teaching/index.rst
+student@eg106:~/src/linux$ git commit --amend
+
+
+
+
    +
  1. Force-push the updated commit:
  2. +
+
+
student@eg106:~/src/linux$ git push my_fork <your_branch_name> -f
+
+
+

After this step, the Pull Request is updated. It is now up to the +linux-kernel-labs team to review the pull request and integrate your +contributions in the main project.

+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/info/extra-vm.html b/refs/pull/405/merge/info/extra-vm.html new file mode 100644 index 00000000..763ca8c0 --- /dev/null +++ b/refs/pull/405/merge/info/extra-vm.html @@ -0,0 +1,306 @@ + + + + + + Customizing the Virtual Machine Setup — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Customizing the Virtual Machine Setup

+
+

Connect to the Virtual Machine via SSH

+

The default Yocto image for the QEMU virtual machine +(core-image-minimal-qemu) provides the minimal functionality to run the +kernel and kernel modules. For extra features, such as an SSH connection, +a more complete image is required, such as core-image-sato-dev-qemu.

+

To use the new image, update the YOCTO_IMAGE variable in +tools/labs/qemu/Makefile:

+
YOCTO_IMAGE = core-image-sato-qemu$(ARCH).ext4
+
+
+

When you start the virtual machine the first time using make boot with the +new image configuration, it will download the image and then boot the virtual +machine. The image is larger (around 400MB) than the minimal image so expect +some time for the download.

+

You then enter the virtual machine via minicom, determine the IP address of +the eth0 interface and then you can connect to the virtual machine via SSH:

+
$ minicom -D serial.pts
+Poky (Yocto Project Reference Distro) 2.3 qemux86 /dev/hvc0
+
+qemux86 login: root
+root@qemux86:~# ip a s
+1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue qlen 1000
+    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
+    inet 127.0.0.1/8 scope host lo
+       valid_lft forever preferred_lft forever
+    inet6 ::1/128 scope host
+       valid_lft forever preferred_lft forever
+2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast qlen 1000
+    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
+    inet 172.213.0.18/24 brd 172.213.0.255 scope global eth0
+       valid_lft forever preferred_lft forever
+    inet6 fe80::5054:ff:fe12:3456/64 scope link
+       valid_lft forever preferred_lft forever
+3: sit0@NONE: <NOARP> mtu 1480 qdisc noop qlen 1000
+    link/sit 0.0.0.0 brd 0.0.0.0
+
+$ ssh -l root 172.213.0.18
+The authenticity of host '172.213.0.18 (172.213.0.18)' can't be established.
+RSA key fingerprint is SHA256:JUWUcD7LdvURNcamoPePMhqEjFFtUNLAqO+TtzUiv5k.
+Are you sure you want to continue connecting (yes/no)? yes
+Warning: Permanently added '172.213.0.18' (RSA) to the list of known hosts.
+root@qemux86:~# uname -a
+Linux qemux86 4.19.0+ #3 SMP Sat Apr 4 22:45:18 EEST 2020 i686 GNU/Linux
+
+
+
+
+

Connecting a Debugger to the Virtual Machine Kernel

+

You can use GDB to connect to the running virtual machine kernel and inspect +the state of the kernel. You run make gdb in tools/labs/:

+
.../linux/tools/labs$ make gdb
+ln -fs /home/tavi/src/linux/vmlinux vmlinux
+gdb -ex "target remote localhost:1234" vmlinux
+GNU gdb (Ubuntu 7.11.1-0ubuntu1~16.04) 7.11.1
+Copyright (C) 2016 Free Software Foundation, Inc.
+License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
+This is free software: you are free to change and redistribute it.
+There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
+and "show warranty" for details.
+This GDB was configured as "x86_64-linux-gnu".
+Type "show configuration" for configuration details.
+For bug reporting instructions, please see:
+<http://www.gnu.org/software/gdb/bugs/>.
+Find the GDB manual and other documentation resources online at:
+<http://www.gnu.org/software/gdb/documentation/>.
+For help, type "help".
+Type "apropos word" to search for commands related to "word"...
+Reading symbols from vmlinux...done.
+Remote debugging using localhost:1234
+0xc13cf2f2 in native_safe_halt () at ./arch/x86/include/asm/irqflags.h:53
+53asm volatile("sti; hlt": : :"memory");
+(gdb) bt
+#0  0xc13cf2f2 in native_safe_halt () at ./arch/x86/include/asm/irqflags.h:53
+#1  arch_safe_halt () at ./arch/x86/include/asm/irqflags.h:95
+#2  default_idle () at arch/x86/kernel/process.c:341
+#3  0xc101f136 in arch_cpu_idle () at arch/x86/kernel/process.c:332
+#4  0xc106a6dd in cpuidle_idle_call () at kernel/sched/idle.c:156
+#5  do_idle () at kernel/sched/idle.c:245
+#6  0xc106a8c5 in cpu_startup_entry (state=<optimized out>)
+at kernel/sched/idle.c:350
+#7  0xc13cb14a in rest_init () at init/main.c:415
+#8  0xc1507a7a in start_kernel () at init/main.c:679
+#9  0xc10001da in startup_32_smp () at arch/x86/kernel/head_32.S:368
+#10 0x00000000 in ?? ()
+(gdb)
+
+
+
+
+

Rebuild the Kernel Image

+

The kernel image is built the first time the VM is started. To rebuild the +kernel remove the kernel image file defined by the ZIMAGE variable in +tools/labs/qemu/Makefile:

+
ZIMAGE = $(KDIR)/arch/$(ARCH)/boot/$(b)zImage
+
+
+

Typically the full path of the kernel is arch/x86/boot/bzImage.

+

Once removed, the kernel image is rebuilt by using:

+
~/src/linux/tools/labs$ make zImage
+
+
+

or simply starting the virtual machine

+
~/src/linux/tools/labs$ make boot
+
+
+
+
+

Using Docker containers

+

If your setup doesn't allow the installation of the packages required for the +laboratory setup, you can build and run a container that has all the setup +already prepared for the virtual machine environment.

+

In order to run the containerized setup, you need to install the following +packages:

+
    +
  • docker
  • +
  • docker-compose
  • +
+

In order to run the container infrastructure run the following command in the +tools/labs/ directory:

+
sergiu@local:~/src/linux/tools/labs$ make docker-kernel
+...
+ubuntu@so2:~$
+
+
+

The first time you run the command above, it will take a long time, because you +will have to build the container environment and install the required +applications.

+

Every time you run the make docker-kernel command, another shell will +connect to the container. This will allow you to work with multiple tabs.

+

All the commands that you would use in the regular environment can be used in +the containerized environment.

+

The linux repository is mounted in the /linux directory. All changes +you will make here will also be seen on your local instance.

+

In order to stop the container use the following command:

+
make stop-docker-kernel
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/info/vm.html b/refs/pull/405/merge/info/vm.html new file mode 100644 index 00000000..b44fdf42 --- /dev/null +++ b/refs/pull/405/merge/info/vm.html @@ -0,0 +1,283 @@ + + + + + + Recommended Setup — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ + +
+

Virtual Machine Setup

+

Practice work is designed to run on a QEMU based virtual machine. Kernel code +is developed and built on the host machine and then deployed and run on the +virtual machine.

+

In order to run and use the virtual machine the following packages are required +on a Debian/Ubuntu system:

+
    +
  • flex
  • +
  • bison
  • +
  • build-essential
  • +
  • gcc-multilib
  • +
  • libncurses5-dev
  • +
  • qemu-system-x86
  • +
  • qemu-system-arm
  • +
  • python3
  • +
  • minicom
  • +
+

The kvm package is not strictly required, but will make the virtual machine +faster by using KVM support (with the -enable-kvm option to QEMU). If kvm +is absent, the virtual machine will still run (albeit slower) using emulation.

+

The virtual machine setup uses prebuilt Yocto images that it downloads and a +kernel image that it builds itself. The following images are supported:

+
    +
  • core-image-minimal-qemu
  • +
  • core-image-minimal-dev-qemu
  • +
  • core-image-sato-dev-qemu
  • +
  • core-image-sato-qemu
  • +
  • core-image-sato-sdk-qemu
  • +
+

By default, core-image-minimal-qemu is used. This setting can be changed by +updating the YOCTO_IMAGE variable in tools/labs/qemu/Makefile.

+
+

Starting the Virtual Machine

+

You start the virtual machine in the tools/labs/ folder by running make +boot:

+
.../linux/tools/labs$ make boot
+
+
+

The first run of the make boot command will compile the kernel image and it +will take longer. Subsequent runs will only start the QEMU virtual machine, +with verbose output provided:

+
.../linux/tools/labs$ make boot
+mkdir /tmp/tmp.7rWv63E9Wf
+sudo mount -t ext4 -o loop core-image-minimal-qemux86.ext4 /tmp/tmp.7rWv63E9Wf
+sudo make -C /home/razvan/school/so2/linux.git modules_install INSTALL_MOD_PATH=/tmp/tmp.7rWv63E9Wf
+make: Entering directory '/home/razvan/school/so2/linux.git'
+  INSTALL crypto/crypto_engine.ko
+  INSTALL drivers/crypto/virtio/virtio_crypto.ko
+  INSTALL drivers/net/netconsole.ko
+  DEPMOD  4.19.0+
+make: Leaving directory '/home/razvan/school/so2/linux.git'
+sudo umount /tmp/tmp.7rWv63E9Wf
+rmdir /tmp/tmp.7rWv63E9Wf
+sleep 1 && touch .modinst
+qemu/create_net.sh tap0
+
+dnsmasq: failed to create listening socket for 172.213.0.1: Address already in use
+qemu/create_net.sh tap1
+
+dnsmasq: failed to create listening socket for 127.0.0.1: Address already in use
+/home/razvan/school/so2/linux.git/tools/labs/templates/assignments/6-e100/nttcp -v -i &
+nttcp-l: nttcp, version 1.47
+nttcp-l: running in inetd mode on port 5037 - ignoring options beside -v and -p
+bind: Address already in use
+nttcp-l: service-socket: bind:: Address already in use, errno=98
+ARCH=x86 qemu/qemu.sh -kernel /home/razvan/school/so2/linux.git/arch/x86/boot/bzImage -device virtio-serial -chardev pty,id=virtiocon0 -device virtconsole,chardev=virtiocon0 -serial pipe:pipe1 -serial pipe:pipe2 -netdev tap,id=tap0,ifname=tap0,script=no,downscript=no -net nic,netdev=tap0,model=virtio -netdev tap,id=tap1,ifname=tap1,script=no,downscript=no -net nic,netdev=tap1,model=i82559er -drive file=core-image-minimal-qemux86.ext4,if=virtio,format=raw -drive file=disk1.img,if=virtio,format=raw -drive file=disk2.img,if=virtio,format=raw --append "root=/dev/vda loglevel=15 console=hvc0" --display none -s
+qemu-system-i386: -chardev pty,id=virtiocon0: char device redirected to /dev/pts/68 (label virtiocon0)
+
+
+
+

Note

+

To show the QEMU console use

+
+
.../linux/tools/labs$ QEMU_DISPLAY=gtk make boot
+
+       This will show the VGA output and will also give
+       access to the standard keyboard.
+
+
+
+

Note

+

The virtual machine setup scripts and configuration files are located +in tools/labs/qemu/.

+
+
+
+

Connecting to the Virtual Machine

+

Once the virtual machine is started you can connect to it on the serial port. A +symbolic link named serial.pts is created to the emulated serial port +device:

+
.../linux/tools/labs$ ls -l serial.pts
+lrwxrwxrwx 1 razvan razvan 11 Apr  1 08:03 serial.pts -> /dev/pts/68
+
+
+

On the host you use the minicom command to connect to the virtual machine +via the serial.pts link:

+
.../linux/tools/labs$ minicom -D serial.pts
+[...]
+Poky (Yocto Project Reference Distro) 2.3 qemux86 /dev/hvc0
+
+qemux86 login: root
+root@qemux86:~#
+
+
+
+

Note

+

When you connect to the virtual machine, simply enter root at the +login prompt and you will get a root console, no password required.

+
+
+

Note

+

You exit minicom by pressing Ctrl+a and then x. You will +get a confirmation prompt and then you will exit minicom.

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/arm_kernel_development.html b/refs/pull/405/merge/labs/arm_kernel_development.html new file mode 100644 index 00000000..1d08441e --- /dev/null +++ b/refs/pull/405/merge/labs/arm_kernel_development.html @@ -0,0 +1,586 @@ + + + + + + Kernel Development on ARM — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Kernel Development on ARM

+
+

Lab objectives

+
    +
  • get a feeling of what System on a Chip (SoC) means
  • +
  • get familiar with embedded world using ARM as a supported architecture
  • +
  • understand what a Board Support Package means (BSP)
  • +
  • compile and boot an ARM kernel with Qemu using i.MX6UL platform as an example
  • +
  • get familiar with hardware description using Device Trees
  • +
+
+
+

System on a Chip

+

A System on a Chip (SoC) is an integrated circuit (IC) that integrates an entire system onto it. The components +that can be usually found on an SoC include a central processing unit (CPU), memory, input/output ports, storage devices +together with more sophisticated modules like audio digital interfaces, neural processing units (NPU) or graphical +processing units (GPU).

+
+
SoCs can be used in various applications; the most common are:
+
    +
  • consumer electronics (TV sets, mobile phones, video game consoles)
  • +
  • industrial computers (medical imaging, etc)
  • +
  • automotive
  • +
  • home appliances
  • +
+
+
+

The leading architecture for SoCs is ARM. Worth mentioning here is that there are also x86-based SoC platforms. Another thing +we need to keep an eye on is RISC-V, an open standard instruction set architecture.

+

A simplified view of an ARM platform is shown in the image below:

+../_images/schematic.png +

We will use NXP's i.MX6UL as a reference platform, but in general all SoCs contain the following building blocks:

+
+
+
    +
  • one or more CPU cores
  • +
  • a system bus
  • +
  • clock and reset module
      +
    • PLL
    • +
    • OSC
    • +
    • reset controller
    • +
    +
  • +
+
+
    +
  • interrupt controller
  • +
  • timers
  • +
  • memory controller
  • +
  • peripheral controllers +
  • +
+
+

Here is the complete block diagram for i.MX6UL platform:

+IMX6UL-BD +

i.MX6UL Evaluation Kit board looks like this:

+imx6ul-evk +

Other popular SoC boards:

+
+
+
+
+

Board Support package

+

A board support package (BSP) is the minimal set of software packages that allows demonstrating the capabilities of a certain hardware platform. This includes:

+
+
    +
  • toolchain
  • +
  • bootloader
  • +
  • Linux kernel image, device tree files and drivers
  • +
  • root filesystem
  • +
+
+

Semiconductor manufacturers usually provide a BSP together with an evaluation board. A BSP is typically bundled using Yocto.

+
+
+

Toolchain

+

Because our development machines are mostly x86-based we need a cross compiler that can produce executable +code for ARM platform.

+

We can build our own cross compiler from scratch using https://crosstool-ng.github.io/ or we can install one

+
$ sudo apt-get install gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf # for arm32
+$ sudo apt-get install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu     # for arm64
+
+
+

There are several toolchain binaries depending on the configuration:

+
+
    +
  • With "arm-eabi-gcc" you have the Linux system C library which will make calls into the kernel IOCTLs, e.g. for allocating memory pages to the process.
  • +
  • With "arm-none-eabi-gcc" you are running on a platform which doesn't have an operating system at all - so the C library is different to cope with that.
  • +
+
+
+

Compiling the Linux kernel on ARM

+

Compile the kernel for 32bit ARM boards:

+
# select defconfig based on your platform
+$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make imx_v6_v7_defconfig
+# compile the kernel
+$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8
+
+
+

Compile the kernel for 64bit ARM boards:

+
# for 64bit ARM there is a single config for all supported boards
+$ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make defconfig
+# compile the kernel
+$ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make -j8
+
+
+
+
+
+

Linux kernel image

+

The kernel image binary is named vmlinux and it can be found in the root of the kernel tree. Compressed image used for booting can be found under:

+
    +
  • arch/arm/boot/Image, for arm32
  • +
  • arch/arm64/boot/Image, for arm64
  • +
+
$ file vmlinux
+  vmlinux: ELF 32-bit LSB executable, ARM, EABI5 version 1 (SYSV), statically linked, not stripped
+
+$ file vmlinux
+  vmlinux: ELF 64-bit LSB shared object, ARM aarch64, version 1 (SYSV), statically linked, not stripped
+
+
+
+
+

Rootfs

+

The root filesystem (rootfs) is the filesystem mounted at the top of the file hierarchy (/). It should contain at least +the critical files allowing the system to boot to a shell.

+
root@so2$ tree -d -L 2
+├── bin
+├── boot
+├── dev
+├── etc
+├── home
+│   └── root
+├── lib
+│   └── udev
+├── mnt
+├── proc
+├── sbin
+│   └── init
+├── sys
+├── usr
+│   ├── bin
+│   ├── include
+│   ├── lib
+└── var
+
+
+

As for x86 we will make use of Yocto rootfs images. In order to download an ext4 rootfs image for arm32 one needs to run:

+
$ cd tools/labs/
+$ ARCH=arm make core-image-minimal-qemuarm.ext4
+
+
+
+
+

Device tree

+

Device tree (DT) is a tree structure used to describe the hardware devices in a system. Each node in the tree describes a device hence it is called device node. DT was introduced +to provide a way to discover non-discoverable hardware (e.g a device on an I2C bus). This information was previously stored inside the source code for the Linux kernel. This meant that +each time we needed to modify a node for a device the kernel needed to be recompiled. This no longer holds true as device tree and kernel image are separate binaries now.

+

Device trees are stored inside device tree sources (.dts) and compiled into device tree blobs (.dtb).

+
# compile dtbs
+$ make dtbs
+
+# location for DT sources on arm32
+$ ls arch/arm/boot/dts/
+  imx6ul-14x14-evk.dtb imx6ull-14x14-evk.dtb bcm2835-rpi-a-plus.dts
+
+# location for DT source on arm64
+$ ls arch/arm64/boot/dts/<vendor>
+  imx8mm-evk.dts imx8mp-evk.dts
+
+
+

The following image is a representation of a simple device tree, describing board type, cpu and memory.

+../_images/dts_node.png +

Notice that a device tree node can be defined using label: name@address:

+
+
    +
  • label, is an identifier used to reference the node from other places
  • +
  • name, node identifier
  • +
  • address, used to differentiate nodes with the same name.
  • +
+
+

A node might contain several properties arranged in the name = value format. The name is a string +and the value can be bytes, strings, array of strings.

+

Here is an example:

+
/ {
+     node@0 {
+          empty-property;
+          string-property = "string value";
+          string-list-property = "string value 1", "string value 2";
+          int-list-property = <value1 value2>;
+
+          child-node@0 {
+                  child-empty-property;
+                  child-string-property = "string value";
+                  child-node-reference = <&child-node1>;
+          };
+
+          child-node1: child-node@1 {
+                  child-empty-property;
+                  child-string-property = "string value";
+          };
+   };
+};
+
+
+
+
+

Qemu

+

We will use qemu-system-arm to boot 32bit ARM platforms. Although this can be installed from the official distro repos, for example:

+
sudo apt-get install -y qemu-system-arm
+
+
+

We strongly recommend using the latest version of qemu-system-arm built from sources:

+
$ git clone https://gitlab.com/qemu-project/qemu.git
+$ ./configure --target-list=arm-softmmu --disable-docs
+$ make -j8
+$ ./build/qemu-system-arm
+
+
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is arm_kernel_development. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/arm_kernel_development/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

Warning

+

The rules for working with the virtual machine for ARM are modified as follows

+
# modules build
+tools/labs $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make build
+# modules copy
+tools/labs $ ARCH=arm make copy
+# kernel build
+$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8
+
+
+
+
+

0. Intro

+

Inspect the following locations in the Linux kernel code and identify platforms and vendors using +ARM architecture:

+
+
    +
  • 32-bit: arch/arm/boot/dts
  • +
  • 64-bit: arch/arm64/boot/dts
  • +
+
+

Use qemu and look at the supported platforms:

+
../qemu/build/arm-softmmu/qemu-system-arm -M ?
+
+
+
+

Note

+

We used our own compiled version of Qemu for arm32. See Qemu section for more details.

+
+
+
+

1. Boot

+

Use qemu to boot i.MX6UL platform. In order to boot, we first need to compile the kernel. +Review Compiling the Linux kernel on ARM section.

+

Successful compilation will result in the following binaries:

+
+
    +
  • arch/arm/boot/Image, kernel image compiled for ARM
  • +
  • arch/arm/boot/dts/imx6ul-14x14-evk.dtb, device tree blob for i.MX6UL board
  • +
+
+

Review Rootfs section and download core-image-minimal-qemuarm.ext4 rootfs. +Run qemu using the following command:

+
../qemu/build/arm-softmmu/qemu-system-arm -M mcimx6ul-evk -cpu cortex-a7 -m 512M \
+  -kernel arch/arm/boot/zImage -nographic  -dtb arch/arm/boot/dts/imx6ul-14x14-evk.dtb \
+  -append "root=/dev/mmcblk0 rw console=ttymxc0 loglevel=8 earlycon printk" -sd tools/labs/core-image-minimal-qemuarm.ext4
+
+
+
+

Note

+

LCDIF and ASRC devices are not well supported with Qemu. Remove them from compilation.

+
+
$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make menuconfig
+# set FSL_ASRC=n and DRM_MXSFB=n
+$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8
+
+
+

Once the kernel is booted check kernel version and cpu info:

+
$ cat /proc/cpuinfo
+$ cat /proc/version
+
+
+
+
+

2. CPU information

+

Inspect the CPU configuration for NXP i.MX6UL board. Start with arch/arm/boot/dts/imx6ul-14x14-evk.dts.

+
+
    +
  • find cpu@0 device tree node and look for operating-points property.
  • +
  • read the maximum and minimum operating frequency the processor can run
  • +
+
+
$ cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq
+$ cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq
+
+
+
+
+
+
+

3. I/O memory

+

Inspect I/O space configuration for NXP i.MX6UL board. Start with arch/arm/boot/dts/imx6ul-14x14-evk.dts and identify each device mentioned below.

+
$ cat /proc/iomem
+  00900000-0091ffff : 900000.sram sram@900000
+  0209c000-0209ffff : 209c000.gpio gpio@209c000
+  021a0000-021a3fff : 21a0000.i2c i2c@21a0000
+  80000000-9fffffff : System RAM
+
+
+

Identify device tree nodes corresponding to:

+
+
    +
  • System RAM, look for memory@80000000 node in arch/arm/boot/dts/imx6ul-14x14-evk.dtsi. What's the size of the System RAM?
  • +
  • GPIO1, look for gpio@209c000 node in arch/arm/boot/dts/imx6ul.dtsi. What's the size of the I/O space for this device?
  • +
  • I2C1, look for i2c@21a0000 node in arch/arm/boot/dts/imx6ul.dtsi. What's the size of the I/O spaces for this device?
  • +
+
+
+
+

4. Hello World

+

Implement a simple kernel module that prints a message at load/unload time. Compile it and load it on i.MX6UL emulated platform.

+
# modules build
+tools/labs $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make build
+# modules copy
+tools/labs $ ARCH=arm make copy
+# kernel build
+$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8
+
+
+
+
+

5. Simple device

+

Implement a driver for a simple platform device. Find TODO 1 and notice how simple_driver is declared and registered as a platform driver. +Follow TODO 2 and add the so2,simple-device-v1 and so2,simple-device-v2 compatible strings in the simple_device_ids array.

+

Create two device tree nodes in arch/arm/boot/dts/imx6ul.dtsi under soc node with compatible strings so2,simple-device-v1 and +so2,simple-device-v2 respectively. Then notice the behavior when loading simple_driver module.

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/block_device_drivers.html b/refs/pull/405/merge/labs/block_device_drivers.html new file mode 100644 index 00000000..98333214 --- /dev/null +++ b/refs/pull/405/merge/labs/block_device_drivers.html @@ -0,0 +1,1362 @@ + + + + + + Block Device Drivers — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Block Device Drivers

+
+

Lab objectives

+
+
    +
  • acquiring knowledge about the behavior of the I/O subsystem on Linux
  • +
  • hands-on activities in structures and functions of block devices
  • +
  • acquiring basic skills for utilizing the API for block devices, by solving +exercises
  • +
+
+
+
+

Overview

+

Block devices are characterized by random access to data organized in fixed-size +blocks. Examples of such devices are hard drives, CD-ROM drives, RAM disks, etc. +The speed of block devices is generally much higher than the speed of character +devices, and their performance is also important. This is why the Linux kernel +handles these 2 types of devices differently (it uses a specialized API).

+

Working with block devices is therefore more complicated than working with +character devices. Character devices have a single current position, while block +devices must be able to move to any position in the device to provide random +access to data. To simplify work with block devices, the Linux kernel provides +an entire subsystem called the block I/O (or block layer) subsystem.

+

From the kernel perspective, the smallest logical unit of addressing is the +block. Although the physical device can be addressed at sector level, the kernel +performs all disk operations using blocks. Since the smallest unit of physical +addressing is the sector, the size of the block must be a multiple of the size +of the sector. Additionally, the block size must be a power of 2 and cannot +exceed the size of a page. The size of the block may vary depending on the file +system used, the most common values being 512 bytes, 1 kilobyte and 4 +kilobytes.

+
+
+

Register a block I/O device

+

To register a block I/O device, function register_blkdev() is used. +To deregister a block I/O device, function unregister_blkdev() is +used.

+

Starting with version 4.9 of the Linux kernel, the call to +register_blkdev() is optional. The only operations performed by this +function are the dynamic allocation of a major (if the major argument is 0 when +calling the function) and creating an entry in /proc/devices. In +future kernel versions it may be removed; however, most drivers still call it.

+

Usually, the call to the register function is performed in the module +initialization function, and the call to the deregister function is performed in +the module exit function. A typical scenario is presented below:

+
#include <linux/fs.h>
+
+#define MY_BLOCK_MAJOR           240
+#define MY_BLKDEV_NAME          "mybdev"
+
+static int my_block_init(void)
+{
+    int status;
+
+    status = register_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME);
+    if (status < 0) {
+             printk(KERN_ERR "unable to register mybdev block device\n");
+             return -EBUSY;
+     }
+     //...
+}
+
+static void my_block_exit(void)
+{
+     //...
+     unregister_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME);
+}
+
+
+
+
+

Register a disk

+

Although the register_blkdev() function obtains a major, it does not +provide a device (disk) to the system. For creating and using block devices +(disks), a specialized interface defined in linux/genhd.h is used.

+

The useful functions defined in linux/genhd.h are to register /allocate +a disk, add it to the system, and de-register /unmount the disk.

+

The alloc_disk() function is used to allocate a disk, and the +del_gendisk() function is used to deallocate it. Adding the disk to the +system is done using the add_disk() function.

+

The alloc_disk() and add_disk() functions are typically used in +the module initialization function, and the del_gendisk() function in +the module exit function.

+
#include <linux/fs.h>
+#include <linux/genhd.h>
+
+#define MY_BLOCK_MINORS       1
+
+static struct my_block_dev {
+    struct gendisk *gd;
+    //...
+} dev;
+
+static int create_block_device(struct my_block_dev *dev)
+{
+    dev->gd = alloc_disk(MY_BLOCK_MINORS);
+    //...
+    add_disk(dev->gd);
+}
+
+static int my_block_init(void)
+{
+    //...
+    create_block_device(&dev);
+}
+
+static void delete_block_device(struct my_block_dev *dev)
+{
+    if (dev->gd)
+        del_gendisk(dev->gd);
+    //...
+}
+
+static void my_block_exit(void)
+{
+    delete_block_device(&dev);
+    //...
+}
+
+
+

As with character devices, it is recommended to use my_block_dev +structure to store important elements describing the block device.

+

Note that immediately after calling the add_disk() function (actually +even during the call), the disk is active and its methods can be called at any +time. As a result, this function should not be called before the driver is fully +initialized and ready to respond to requests for the registered disk.

+

It can be noticed that the basic structure in working with block devices (disks) +is the struct gendisk structure.

+

After a call to del_gendisk(), the struct gendisk structure +may continue to exist (and the device operations may still be called) if there +are still users (an open operation was called on the device but the associated +release operation has not been called). One solution is to keep the number of +users of the device and call the del_gendisk() function only when there +are no users left of the device.

+
+
+

struct gendisk structure

+

The struct gendisk structure stores information about a disk. As +stated above, such a structure is obtained from the alloc_disk() call +and its fields must be filled before it is sent to the add_disk() +function.

+

The struct gendisk structure has the following important fields:

+
+
    +
  • major, first_minor, minor, describing +the identifiers used by the disk; a disk must have at least one minor; if +the disk allows the partitioning operation, a minor must be allocated for +each possible partition
  • +
  • disk_name, which represents the disk name as it appears in +/proc/partitions and in sysfs (/sys/block)
  • +
  • fops, representing operations associated with the disk
  • +
  • queue, which represents the queue of requests
  • +
  • capacity, which is disk capacity in 512 byte sectors; +it is initialized using the set_capacity() function
  • +
  • private_data, which is a pointer to private data
  • +
+
+

An example of filling a struct gendisk structure is presented below:

+
#include <linux/genhd.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+#define NR_SECTORS                   1024
+
+#define KERNEL_SECTOR_SIZE           512
+
+static struct my_block_dev {
+    //...
+    spinlock_t lock;                /* For mutual exclusion */
+    struct request_queue *queue;    /* The device request queue */
+    struct gendisk *gd;             /* The gendisk structure */
+    //...
+} dev;
+
+static int create_block_device(struct my_block_dev *dev)
+{
+    ...
+    /* Initialize the gendisk structure */
+    dev->gd = alloc_disk(MY_BLOCK_MINORS);
+    if (!dev->gd) {
+        printk (KERN_NOTICE "alloc_disk failure\n");
+        return -ENOMEM;
+    }
+
+    dev->gd->major = MY_BLOCK_MAJOR;
+    dev->gd->first_minor = 0;
+    dev->gd->fops = &my_block_ops;
+    dev->gd->queue = dev->queue;
+    dev->gd->private_data = dev;
+    snprintf (dev->gd->disk_name, 32, "myblock");
+    set_capacity(dev->gd, NR_SECTORS);
+
+    add_disk(dev->gd);
+
+    return 0;
+}
+
+static int my_block_init(void)
+{
+    int status;
+    //...
+    status = create_block_device(&dev);
+    if (status < 0)
+        return status;
+    //...
+}
+
+static void delete_block_device(struct my_block_dev *dev)
+{
+    if (dev->gd) {
+        del_gendisk(dev->gd);
+    }
+    //...
+}
+
+static void my_block_exit(void)
+{
+    delete_block_device(&dev);
+    //...
+}
+
+
+

As stated before, the kernel considers a disk as a vector of 512 byte sectors. +In reality, the devices may have a different size of the sector. To work with +these devices, the kernel needs to be informed about the real size of a sector, +and for all operations the necessary conversions must be made.

+

To inform the kernel about the device sector size, a parameter of the request +queue must be set just after the request queue is allocated, using the +blk_queue_logical_block_size() function. All requests generated by the +kernel will be multiple of this sector size and will be aligned accordingly. +However, communication between the device and the driver will still be performed +in sectors of 512 bytes in size, so conversion should be done each time (an +example of such conversion is when calling the set_capacity() function +in the code above).

+
+
+

struct block_device_operations structure

+

Just as for a character device, operations in struct file_operations +should be completed, so for a block device, the operations in +struct block_device_operations should be completed. The association +of operations is done through the fops field in the +struct gendisk +structure.

+

Some of the fields of the struct block_device_operations structure +are presented below:

+
struct block_device_operations {
+    int (*open) (struct block_device *, fmode_t);
+    int (*release) (struct gendisk *, fmode_t);
+    int (*locked_ioctl) (struct block_device *, fmode_t, unsigned,
+                         unsigned long);
+    int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+    int (*compat_ioctl) (struct block_device *, fmode_t, unsigned,
+                         unsigned long);
+    int (*direct_access) (struct block_device *, sector_t,
+                          void **, unsigned long *);
+    int (*media_changed) (struct gendisk *);
+    int (*revalidate_disk) (struct gendisk *);
+    int (*getgeo)(struct block_device *, struct hd_geometry *);
+    /* Set only when the driver processes struct bio directly. */
+    blk_qc_t (*submit_bio) (struct bio *bio);
+    struct module *owner;
+};
+
+
+

open() and release() operations are called directly from user +space by utilities that may perform the following tasks: partitioning, file +system creation, file system verification. In a mount() operation, the +open() function is called directly from the kernel space, the file +descriptor being stored by the kernel. A driver for a block device can not +differentiate between open() calls performed from user space and kernel +space.

+

An example of how to use these two functions is given below:

+
#include <linux/fs.h>
+#include <linux/genhd.h>
+
+static struct my_block_dev {
+    //...
+    struct gendisk * gd;
+    //...
+} dev;
+
+static int my_block_open(struct block_device *bdev, fmode_t mode)
+{
+    //...
+
+    return 0;
+}
+
+static int my_block_release(struct gendisk *gd, fmode_t mode)
+{
+    //...
+
+    return 0;
+}
+
+struct block_device_operations my_block_ops = {
+    .owner = THIS_MODULE,
+    .open = my_block_open,
+    .release = my_block_release
+};
+
+static int create_block_device(struct my_block_dev *dev)
+{
+    //....
+    dev->gd->fops = &my_block_ops;
+    dev->gd->private_data = dev;
+    //...
+}
+
+
+

Please notice that there are no read or write operations. These operations are +performed by the request() function associated with the request queue +of the disk.

+
+
+

Request Queues - Multi-Queue Block Layer

+

Drivers for block devices use queues to store the block I/O requests that will +be processed. A request queue is represented by the +struct request_queue structure. The request queue is made up of a +double-linked list of requests and their associated control information. The +requests are added to the queue by higher-level kernel code (for example, file +systems).

+

The block device driver associates each queue with a handling function, which +will be called for each request in the queue +(the struct request structure).

+

In earlier versions of the Linux kernel, each device driver had one or +more associated request queues (struct request_queue), where any client could add +requests, while also being able to reorder them. +The problem with this approach is that it requires a per-queue lock, making it +inefficient in distributed systems.

+

The Multi-Queue Block Queueing Mechanism +solves this issue by splitting the device driver queue into two parts:

+
+
    +
  1. Software staging queues
  2. +
  3. Hardware dispatch queues
  4. +
+
+
+

Software staging queues

+

The staging queues hold requests from the clients before sending them to the +block device driver. To prevent the waiting for a per-queue lock, a staging +queue is allocated for each CPU or node. A software queue is associated to +only one hardware queue.

+

While in this queue, the requests can be merged or reordered, according to an +I/O Scheduler, in order to maximize performance. This means that only the +requests coming from the same CPU or node can be optimized.

+

Staging queues are usually not used by the block device drivers, but only +internally by the I/O subsystem to optimize requests before sending them to the +device drivers.

+
+
+

Hardware dispatch queues

+

The hardware queues (struct blk_mq_hw_ctx) are used to send the +requests from the staging queues to the block device driver. +Once in this queue, the requests can't be merged or reordered.

+

Depending on the underlying hardware, a block device driver can create multiple +hardware queues in order to improve parallelism and maximize performance.

+
+
+

Tag sets

+

A block device driver can accept a request before the previous one is completed. +As a consequence, the upper layers need a way to know when a request is +completed. For this, a "tag" is added to each request upon submission and sent +back using a completion notification after the request is completed.

+

The tags are part of a tag set (struct blk_mq_tag_set), which is +unique to a device. +The tag set structure is allocated and initialized before the request queues +and also stores some of the queue properties.

+
struct blk_mq_tag_set {
+  ...
+  const struct blk_mq_ops   *ops;
+  unsigned int               nr_hw_queues;
+  unsigned int               queue_depth;
+  unsigned int               cmd_size;
+  int                        numa_node;
+  void                      *driver_data;
+  struct blk_mq_tags       **tags;
+  struct list_head           tag_list;
+  ...
+};
+
+
+

Some of the fields in struct blk_mq_tag_set are:

+
+
    +
  • ops - Queue operations, most notably the request handling function.
  • +
  • nr_hw_queues - The number of hardware queues allocated for the device
  • +
  • queue_depth - Hardware queues size
  • +
  • cmd_size - Number of extra bytes allocated at the end of each request, to +be used by the block device driver, if needed.
  • +
  • numa_node - In NUMA systems, the index of the node the storage device is +connected to.
  • +
  • driver_data - Data private to the driver, if needed.
  • +
  • tags - Pointer to an array of nr_hw_queues tag sets.
  • +
  • tag_list - List of request queues using this tag set.
  • +
+
+
+
+

Create and delete a request queue

+

Request queues are created using the blk_mq_init_queue() function and +are deleted using blk_cleanup_queue(). The first function creates both +the hardware and the software queues and initializes their structures.

+

Queue properties, including the number of hardware queues, their capacity and +request handling function are configured using the blk_mq_tag_set +structure, as described above.

+

An example of using these functions is as follows:

+
#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+static struct my_block_dev {
+    //...
+    struct blk_mq_tag_set tag_set;
+    struct request_queue *queue;
+    //...
+} dev;
+
+static blk_status_t my_block_request(struct blk_mq_hw_ctx *hctx,
+                                     const struct blk_mq_queue_data *bd)
+//...
+
+static struct blk_mq_ops my_queue_ops = {
+   .queue_rq = my_block_request,
+};
+
+/*
+ * Allocate the tag set and the request queue for the device.
+ * Returns 0 on success or a negative error code; on failure everything
+ * allocated so far is released.
+ */
+static int create_block_device(struct my_block_dev *dev)
+{
+    int err;
+
+    /* Initialize tag set. */
+    dev->tag_set.ops = &my_queue_ops;
+    dev->tag_set.nr_hw_queues = 1;
+    dev->tag_set.queue_depth = 128;
+    dev->tag_set.numa_node = NUMA_NO_NODE;
+    dev->tag_set.cmd_size = 0;
+    dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+    err = blk_mq_alloc_tag_set(&dev->tag_set);
+    if (err) {
+        goto out_err;
+    }
+
+    /* Allocate queue. */
+    dev->queue = blk_mq_init_queue(&dev->tag_set);
+    if (IS_ERR(dev->queue)) {
+        err = PTR_ERR(dev->queue);
+        goto out_blk_init;
+    }
+
+    blk_queue_logical_block_size(dev->queue, KERNEL_SECTOR_SIZE);
+
+    /* Assign private data to queue structure. */
+    dev->queue->queuedata = dev;
+    //...
+
+    /* Success: do not fall through into the error-unwinding labels. */
+    return 0;
+
+out_blk_init:
+    blk_mq_free_tag_set(&dev->tag_set);
+out_err:
+    return err;
+}
+
+static int my_block_init(void)
+{
+    int status;
+    //...
+    status = create_block_device(&dev);
+    if (status < 0)
+        return status;
+    //...
+}
+
+/* Release the request queue first, then the tag set it was created from. */
+static void delete_block_device(struct my_block_dev *dev)
+{
+    //...
+    blk_cleanup_queue(dev->queue);
+    blk_mq_free_tag_set(&dev->tag_set);
+}
+
+static void my_block_exit(void)
+{
+    delete_block_device(&dev);
+    //...
+}
+
+
+

After initializing the tag set structure, the tag lists are allocated using the +blk_mq_alloc_tag_set() function. +The pointer to the function which will process the requests +(my_block_request()) is filled in the my_queue_ops structure and +then the pointer to this structure is added to the tag set.

+

The queue is created using the blk_mq_init_queue() function, based on +the information added in the tag set.

+

As part of the request queue initialization, you can configure the +queuedata field, which is equivalent to the private_data +field in other structures.

+
+
+

Useful functions for processing request queues

+

The queue_rq function from struct blk_mq_ops is used to handle +requests for working with the block device. +This function is the equivalent of read and write functions encountered on +character devices. The function receives the requests for the device as +arguments and can use various functions for processing them.

+

The functions used to process the requests in the handler are described below:

+
+
    +
  • blk_mq_start_request() - must be called before starting processing +a request;
  • +
  • blk_mq_requeue_request() - to re-send the request in the queue;
  • +
  • blk_mq_end_request() - to end request processing and notify the +upper layers.
  • +
+
+
+
+
+

Requests for block devices

+

A request for a block device is described by struct request +structure.

+

The fields of struct request structure include:

+
+
    +
  • cmd_flags: a series of flags including direction (reading or +writing); to find out the direction, the macrodefinition +rq_data_dir is used, which returns 0 for a read request and 1 +for a write request on the device;
  • +
  • __sector: the first sector of the transfer request; if the +device sector has a different size, the appropriate conversion should be +done. To access this field, use the blk_rq_pos macro;
  • +
  • __data_len: the total number of bytes to be transferred; to +access this field the blk_rq_bytes macro is used;
  • +
  • generally, data from the current struct bio will be +transferred; the data size is obtained using the +blk_rq_cur_bytes macro;
  • +
  • bio, a dynamic list of struct bio structures that +is a set of buffers associated to the request; this field is accessed by +macrodefinition rq_for_each_segment if there are multiple +buffers, or by bio_data macrodefinition in case there is only +one associated buffer;
  • +
+
+

We will discuss more about the struct bio structure and its +associated operations in the bio_structure section.

+
+

Create a request

+

Read /write requests are created by code layers superior to the kernel I/O +subsystem. Typically, the subsystem that creates requests for block devices is +the file management subsystem. The I/O subsystem acts as an interface between +the file management subsystem and the block device driver. The main operations +under the responsibility of the I/O subsystem are adding requests to the queue +of the specific block device and sorting and merging requests according to +performance considerations.

+
+
+

Process a request

+

The central part of a block device driver is the request handling function +(queue_rq). In previous examples, the function that fulfilled this role was +my_block_request(). As stated in the +Create and delete a request queue section, this function is associated to the +driver when creating the tag set structure.

+

This function is called when the kernel considers that the driver should process +I/O requests. The function must start processing the requests from the queue, +but it is not mandatory to finish them, as requests may be finished by other +parts of the driver.

+

The request function runs in an atomic context and must follow the rules for +atomic code (it must not call functions that can sleep, etc.).

+

Calling the function that processes the requests is asynchronous relative +to the actions of any userspace process and no assumptions about the process +in which the respective function is running should be made. Also, it should not +be assumed that the buffer provided by a request is from kernel space or user +space, any operation that accesses the userspace being erroneous.

+

One of the simplest request handling functions is presented below:

+
static blk_status_t my_block_request(struct blk_mq_hw_ctx *hctx,
+                                     const struct blk_mq_queue_data *bd)
+{
+    struct request *rq = bd->rq;
+    /* Private data stored in queuedata when the queue was created. */
+    struct my_block_dev *dev = hctx->queue->queuedata;
+
+    blk_mq_start_request(rq);
+
+    /* Requests that do not transfer data blocks are rejected. */
+    if (blk_rq_is_passthrough(rq)) {
+        printk (KERN_NOTICE "Skip non-fs request\n");
+        blk_mq_end_request(rq, BLK_STS_IOERR);
+        goto out;
+    }
+
+    /* do work */
+    ...
+
+    blk_mq_end_request(rq, BLK_STS_OK);
+
+out:
+    return BLK_STS_OK;
+}
+
+
+

The my_block_request() function performs the following operations:

+
+
    +
  • Get a pointer to the request structure from the bd argument and start +its processing using the blk_mq_start_request() function.
  • +
  • A block device can receive calls which do not transfer data blocks (e.g. +low level operations on the disk, instructions referring to special ways of +accessing the device). Most drivers do not know how to handle these +requests and return an error.
  • +
  • To return an error, blk_mq_end_request() function is called, +BLK_STS_IOERR being the second argument.
  • +
  • The request is processed according to the needs of the associated device.
  • +
  • The request ends. In this case, blk_mq_end_request() function is +called in order to complete the request.
  • +
+
+
+
+
+

struct bio structure

+

Each struct request structure is an I/O block request, but may come +from combining more independent requests from a higher level. The sectors to be +transferred for a request can be scattered into the main memory but they always +correspond to a set of consecutive sectors on the device. The request is +represented as a series of segments, each corresponding to a buffer in memory. +The kernel can combine requests that refer to adjacent sectors but will not +combine write requests with read requests into a single +struct request structure.

+

A struct request structure is implemented as a linked list of +struct bio structures together with information that allows the +driver to retain its current position while processing the request.

+

The struct bio structure is a low-level description of a portion of +a block I/O request.

+
struct bio {
+    //...
+    struct gendisk          *bi_disk;
+    unsigned int            bi_opf;         /* bottom bits req flags, top bits REQ_OP. Use accessors. */
+    //...
+    struct bio_vec          *bi_io_vec;     /* the actual vec list */
+    //...
+    struct bvec_iter        bi_iter;        /* iteration state over bi_io_vec */
+    //...
+    void                    *bi_private;
+    //...
+};
+
+
+

In turn, the struct bio structure contains a bi_io_vec +vector of struct bio_vec structures. It consists of the individual +pages in the physical memory to be transferred, the offset within the page and +the size of the buffer. To iterate through a struct bio structure, +we need to iterate through the vector of struct bio_vec and transfer +the data from every physical page. To simplify vector iteration, the +struct bvec_iter structure is used. This structure maintains +information about how many buffers and sectors were consumed during the +iteration. The request type is encoded in the bi_opf field; to +determine it, use the bio_data_dir() function.

+
+

Create a struct bio structure

+

Two functions can be used to create a struct bio structure:

+
+
    +
  • bio_alloc(): allocates space for a new structure; the structure +must be initialized;
  • +
  • bio_clone(): makes a copy of an existing struct bio +structure; the newly obtained structure is initialized with the values of +the cloned structure fields; the buffers are shared with the +struct bio structure that has been cloned so that access to the +buffers has to be done carefully to avoid access to the same memory area +from the two clones;
  • +
+
+

Both functions return a new struct bio structure.

+
+
+

Submit a struct bio structure

+

Usually, a struct bio structure is created by the higher levels of +the kernel (usually the file system). A structure thus created is then +transmitted to the I/O subsystem that gathers more struct bio +structures into a request.

+

For submitting a struct bio structure to the associated I/O device +driver, the submit_bio() function is used. The function receives as +argument an initialized struct bio structure that will be added to +a request from the request queue of an I/O device. From that queue, it can be +processed by the I/O device driver using a specialized function.

+
+
+

Wait for the completion of a struct bio structure

+

Submitting a struct bio structure to a driver has the effect of +adding it to a request from the request queue from where it will be further +processed. Thus, when the submit_bio() function returns, it is not +guaranteed that the processing of the structure has finished. If you want to +wait for the processing of the request to be finished, use the +submit_bio_wait() function.

+

To be notified when the processing of a struct bio structure ends +(when we do not use submit_bio_wait() function), the +bi_end_io field of the structure should be used. This field +specifies the function that will be called at the end of the +struct bio structure processing. You can use the +bi_private field of the structure to pass information to the +function.

+
+
+

Initialize a struct bio structure

+

Once a struct bio structure has been allocated and before being +transmitted, it must be initialized.

+

Initializing the structure involves filling in its important fields. As +mentioned above, the bi_end_io field is used to specify the function +called when the processing of the structure is finished. The +bi_private field is used to store useful data that can be accessed +in the function pointed by bi_end_io.

+

The bi_opf field specifies the type of operation.

+
struct bio *bio = bio_alloc(GFP_NOIO, 1);
+//...
+bio->bi_disk = bdev->bd_disk;
+bio->bi_iter.bi_sector = sector;
+bio->bi_opf = REQ_OP_READ;
+bio_add_page(bio, page, size, offset);
+//...
+
+
+

In the code snippet above we specified the block device to which we sent the +following: struct bio structure, start sector, operation +(REQ_OP_READ or REQ_OP_WRITE) and content. The content of a +struct bio structure is a buffer described by: a physical page, +the offset in the page and the size of the buffer. A page can be allocated using +the alloc_page() call.

+
+

Note

+

The size field of the bio_add_page() call must be +a multiple of the device sector size.

+
+
+
+

How to use the content of a struct bio structure

+

To use the content of a struct bio structure, the structure's +support pages must be mapped to the kernel address space from where they can be +accessed. For mapping /unmapping, use the kmap_atomic and +the kunmap_atomic macros.

+

A typical example of use is:

+
static void my_block_transfer(struct my_block_dev *dev, size_t start,
+                              size_t len, char *buffer, int dir);
+
+
+static int my_xfer_bio(struct my_block_dev *dev, struct bio *bio)
+{
+    struct bio_vec bvec;
+    struct bvec_iter i;
+    int dir = bio_data_dir(bio);
+
+    /* Do each segment independently. */
+    bio_for_each_segment(bvec, bio, i) {
+        sector_t sector = i.bi_sector;
+        char *buffer = kmap_atomic(bvec.bv_page);
+        unsigned long offset = bvec.bv_offset;
+        size_t len = bvec.bv_len;
+
+        /* process mapped buffer */
+        my_block_transfer(dev, sector, len, buffer + offset, dir);
+
+        kunmap_atomic(buffer);
+    }
+
+    return 0;
+}
+
+
+

As it can be seen from the example above, iterating through a +struct bio requires iterating through all of its segments. A segment +(struct bio_vec) is defined by the physical address page, the offset +in the page and its size.

+

To simplify the processing of a struct bio, use the +bio_for_each_segment macrodefinition. It will iterate through all +segments, and will also update global information stored in an iterator +(struct bvec_iter) such as the current sector as well as other +internal information (segment vector index, number of bytes left to be +processed, etc.).

+

You can store information in the mapped buffer, or extract information.

+

In case request queues are used and you need to process the requests +at struct bio level, use the rq_for_each_segment +macrodefinition instead of the bio_for_each_segment macrodefinition. +This macrodefinition iterates through each segment of each +struct bio structure of a struct request structure and +updates a struct req_iterator structure. The +struct req_iterator contains the current struct bio +structure and the iterator that traverses its segments.

+

A typical example of use is:

+
struct bio_vec bvec;
+struct req_iterator iter;
+
+rq_for_each_segment(bvec, req, iter) {
+    sector_t sector = iter.iter.bi_sector;
+    char *buffer = kmap_atomic(bvec.bv_page);
+    unsigned long offset = bvec.bv_offset;
+    size_t len = bvec.bv_len;
+    int dir = bio_data_dir(iter.bio);
+
+    my_block_transfer(dev, sector, len, buffer + offset, dir);
+
+    kunmap_atomic(buffer);
+}
+
+
+
+
+

Free a struct bio structure

+

Once a kernel subsystem uses a struct bio structure, it will have to +release the reference to it. This is done by calling bio_put() function.

+
+
+

Set up a request queue at struct bio level

+

We have previously seen how we can specify a function to be used to process +requests sent to the driver. The function receives as argument the requests and +carries out processing at struct request level.

+

If, for flexibility reasons, we need to specify a function that carries +out processing at struct bio structure level, we no longer +use request queues and we will need to fill the submit_bio field in the +struct block_device_operations associated to the driver.

+

Below is a typical example of initializing a function that carries out +processing at struct bio structure level:

+
// the declaration of the function that carries out processing
+// :c:type:`struct bio` structures
+static blk_qc_t my_submit_bio(struct bio *bio);
+
+/* Route bio-level processing to my_submit_bio instead of a request queue. */
+struct block_device_operations my_block_ops = {
+   .owner = THIS_MODULE,
+   .submit_bio = my_submit_bio,
+   //...
+};
+
+
+
+
+ +
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is block_device_drivers. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/block_device_drivers/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using LXR find the definitions of the following symbols in the Linux kernel:

+
+
    +
  • struct bio
  • +
  • struct bio_vec
  • +
  • bio_for_each_segment
  • +
  • struct gendisk
  • +
  • struct block_device_operations
  • +
  • struct request
  • +
+
+
+
+

1. Block device

+

Create a kernel module that allows you to register or deregister a block device. +Start from the files in the 1-2-3-6-ram-disk/kernel directory in the +lab skeleton.

+

Follow the comments marked with TODO 1 in the laboratory skeleton. Use the +existing macrodefinitions (MY_BLOCK_MAJOR, +MY_BLKDEV_NAME). Check the value returned by the register function, +and in case of error, return the error code.

+

Compile the module, copy it to the virtual machine and insert it into the +kernel. Verify that your device was successfully created inside the +/proc/devices. +You will see a device with major 240.

+

Unload the kernel module and check that the device was unregistered.

+
+

Hint

+

Review the Register a block I/O device section.

+
+

Change the MY_BLOCK_MAJOR value to 7. Compile the module, copy it to +the virtual machine, and insert it into the kernel. Notice that the insertion +fails because there is already another driver/device registered in the kernel +with the major 7.

+

Restore the 240 value for the MY_BLOCK_MAJOR macro.

+
+
+

2. Disk registration

+

Modify the previous module to add a disk associated with the driver. Analyze the +macrodefinitions, my_block_dev structure and existing functions from +the ram-disk.c file.

+

Follow the comments marked with TODO 2. Use the +create_block_device() and the delete_block_device() functions.

+
+

Hint

+

Review the Register a disk and Process a request sections.

+
+

Fill in the my_block_request() function to process the request +without actually processing your request: display the "request received" message +and the following information: start sector, total size, data size from the +current struct bio structure, direction. To validate a request type, +use the blk_rq_is_passthrough() (the function returns 0 in the case in +which we are interested, i.e. when the request is generated by the file system).

+
+

Hint

+

To find the needed info, review the Requests for block devices +section.

+
+

Use the blk_mq_end_request() function to finish processing the +request.

+

Insert the module into the kernel and inspect the messages printed +by the module. When a device is added, a request is sent to the device. Check +the presence of /dev/myblock and if it doesn't exist, create the device +using the command:

+
mknod /dev/myblock b 240 0
+
+
+

To generate writing requests, use the command:

+
echo "abc"> /dev/myblock
+
+
+

Notice that a write request is preceded by a read request. The request +is done to read the block from the disk and "update" its content with the +data provided by the user, without overwriting the rest. After reading and +updating, writing takes place.

+
+
+

3. RAM disk

+

Modify the previous module to create a RAM disk: requests to the device will +result in reads/writes in a memory area.

+

The memory area dev->data is already allocated in the source code of +the module using vmalloc() and deallocated using vfree().

+
+

Note

+

Review the Process a request section.

+
+

Follow the comments marked with TODO 3 to complete the +my_block_transfer() function to write/read the request information +in/from the memory area. The function will be called for each request within +the queue processing function: my_block_request(). To write/read +to/from the memory area, use memcpy(). To determine the write/read +information, use the fields of the struct request structure.

+
+

Hint

+

To find out the size of the request data, use the +blk_rq_cur_bytes macro. Do not use the +blk_rq_bytes macro.

+
+
+

Hint

+

To find out the buffer associated to the request, use +bio_data(rq->bio).

+
+
+

Hint

+

A description of useful macros is in the Requests for block devices +section.

+
+
+

Hint

+

You can find useful information in the +block device driver example +from Linux Device Driver.

+
+

For testing, use the test file user/ram-disk-test.c. +The test program is compiled automatically at make build, copied to the +virtual machine at make copy and can be run on the QEMU virtual machine +using the command:

+
./ram-disk-test
+
+
+

There is no need to insert the module into the kernel, it will be inserted by +the ram-disk-test command.

+

Some tests may fail because of lack of synchronization between the transmitted +data (flush).

+
+
+

4. Read data from the disk

+

The purpose of this exercise is to read data from the +PHYSICAL_DISK_NAME disk (/dev/vdb) directly from the kernel.

+
+

Attention

+

Before solving the exercise, we need to make sure the disk is +added to the virtual machine.

+

Check the variable QEMU_OPTS from qemu/Makefile. +There should already be two extra disks added using -drive ....

+

If there are not, generate a file that we will use as +the disk image using the command: +dd if=/dev/zero of=qemu/mydisk.img bs=1024 count=1 +and add the following option: +-drive file=qemu/mydisk.img,if=virtio,format=raw +to qemu/Makefile (in the QEMU_OPTS variable, +after the root disk).

+
+

Follow the comments marked with TODO 4 in the directory 4-5-relay/ +and implement open_disk() and close_disk(). +Use the blkdev_get_by_path() and blkdev_put() functions. The +device must be opened in read-write mode exclusively +(FMODE_READ | FMODE_WRITE | FMODE_EXCL), and +as holder you must use the current module (THIS_MODULE).

+

Implement the send_test_bio() function. You will have to create a new +struct bio structure and fill it, submit it and wait for it. Read the +first sector of the disk. To wait, call the submit_bio_wait() function.

+
+

Hint

+

The first sector of the disk is the sector with the index 0. +This value must be used to initialize the field +bi_iter.bi_sector of the struct bio.

+

For the read operation, use the REQ_OP_READ macro to +initialize the bi_opf field of the struct bio.

+
+

After finishing the operation, display the first 3 bytes of data read by the +struct bio structure. Use the format "% 02x" for printk() +to display the data, and the kmap_atomic and kunmap_atomic +macros to map and unmap the page that holds it.

+
+

Hint

+

As an argument for the kmap_atomic() function, just use the +page which is allocated above in the code, in the page +variable.

+
+ +

For testing, use the test-relay-disk script, which is copied on the +virtual machine when running make copy. If it is not copied, make +sure it is executable:

+
chmod +x test-relay-disk
+
+
+

There is no need to load the module into the kernel, it will be loaded by +test-relay-disk.

+

Use the command below to run the script:

+
./test-relay-disk
+
+
+

The script writes "abc" at the beginning of the disk indicated by +PHYSICAL_DISK_NAME. After running, the module will display 61 62 63 +(the corresponding hexadecimal values of letters "a", "b" and "c").

+
+
+

5. Write data to the disk

+

Follow the comments marked with TODO 5 to write a message +(BIO_WRITE_MESSAGE) on the disk.

+

The send_test_bio() function receives as argument the operation type +(read or write). Call in the relay_init() function the function for +reading and in the relay_exit() function the function for writing. We +recommend using the REQ_OP_READ and the REQ_OP_WRITE +macros.

+

Inside the send_test_bio() function, if the operation is write, fill in +the buffer associated to the struct bio structure with the message +BIO_WRITE_MESSAGE. Use the kmap_atomic and the +kunmap_atomic macros to work with the buffer associated to the +struct bio structure.

+
+

Hint

+

You need to update the type of the operation associated to the +struct bio structure by setting the bi_opf field +accordingly.

+
+

For testing, run the test-relay-disk script using the command:

+
./test-relay-disk
+
+
+

The script will display the "read from /dev/sdb: 64 65 66" message at the +standard output.

+
+
+

6. Processing requests from the request queue at struct bio level

+

In the implementation from Exercise 3, we have only processed a +struct bio_vec of the current struct bio from the request. +We want to process all struct bio_vec structures from all +struct bio structures. +For this, we will iterate through all struct bio requests and through +all struct bio_vec structures (also called segments) of each +struct bio.

+

Add, within the ramdisk implementation (1-2-3-6-ram-disk/ directory), +support for processing the requests from the request queue at +struct bio level. Follow the comments marked with TODO 6.

+

Set the USE_BIO_TRANSFER macro to 1.

+

Implement the my_xfer_request() function. Use the +rq_for_each_segment macro to iterate through the bio_vec +structures of each struct bio from the request.

+
+

Hint

+

Review the indications and the code snippets from the +How to use the content of a struct bio structure section.

+
+
+

Hint

+

Use the struct bio segment iterator to get the current +sector (iter.iter.bi_sector).

+
+
+

Hint

+

Use the request iterator to get the reference to the current +struct bio (iter.bio).

+
+
+

Hint

+

Use the bio_data_dir macro to find the reading or writing +direction for a struct bio.

+
+

Use the kmap_atomic and the kunmap_atomic macros to map +(and later unmap) the pages of each struct bio structure and access its associated +buffers. For the actual transfer, call the my_block_transfer() function +implemented in the previous exercise.

+

For testing, use the ram-disk-test.c test file:

+
./ram-disk-test
+
+
+

There is no need to insert the module into the kernel, it will be inserted by +the ram-disk-test executable.

+

Some tests may fail because of lack of synchronization between the transmitted +data (flush).

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/deferred_work.html b/refs/pull/405/merge/labs/deferred_work.html new file mode 100644 index 00000000..1ab912d1 --- /dev/null +++ b/refs/pull/405/merge/labs/deferred_work.html @@ -0,0 +1,1072 @@ + + + + + + Deferred work — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Deferred work

+
+

Lab objectives

+
    +
  • Understanding deferred work (i.e. code scheduled to be executed at a +later time)
  • +
  • Implementation of common tasks that uses deferred work
  • +
  • Understanding the peculiarities of synchronization for deferred work
  • +
+

Keywords: softirq, tasklet, struct tasklet_struct, bottom-half +handlers, jiffies, HZ, timer, struct timer_list, spin_lock_bh, +spin_unlock_bh, workqueue, struct work_struct, kernel thread, events/x

+
+
+

Background information

+

Deferred work is a class of kernel facilities that allows one to +schedule code to be executed at a later time. This scheduled code can +run either in the process context or in interrupt context depending +on the type of deferred work. Deferred work is used to complement the +interrupt handler functionality since interrupts have important +requirements and limitations:

+
    +
  • The execution time of the interrupt handler must be as small as +possible
  • +
  • In interrupt context we can not use blocking calls
  • +
+

Using deferred work we can perform the minimum required work in the +interrupt handler and schedule an asynchronous action from the +interrupt handler to run at a later time and execute the rest of the +operations.

+

Deferred work that runs in interrupt context is also known as +bottom-half, since its purpose is to execute the rest of the actions +from an interrupt handler (top-half).

+

Timers are another type of deferred work that are used to schedule the +execution of future actions after a certain amount of time has passed.

+

Kernel threads are not themselves deferred work, but can be used to +complement the deferred work mechanisms. In general, kernel threads +are used as "workers" to process events whose execution contains +blocking calls.

+

There are three typical operations that are used with all types of +deferred work:

+
    +
  1. Initialization. Each type is described by a structure whose +fields will have to be initialized. The handler to be scheduled is +also set at this time.
  2. +
  3. Scheduling. Schedules the execution of the handler as soon as +possible (or after expiry of a timeout).
  4. +
  5. Masking or Canceling. Disables the execution of the +handler. This action can be either synchronous (which guarantees +that the handler will not run after the completion of canceling) or +asynchronous.
  6. +
+
+

Attention

+

When doing deferred work cleanup, like freeing the +structures associated with the deferred work or +removing the module and thus the handler code from the +kernel, always use the synchronous type of canceling +the deferred work.

+
+

The main types of deferred work are kernel threads and softirqs. Work +queues are implemented on top of kernel threads, while tasklets and +timers are implemented on top of softirqs. Bottom-half handlers were the first +implementation of deferred work in Linux, but in the meantime they were +replaced by softirqs. That is why some of the functions presented +contain bh in their name.

+
+
+

Softirqs

+

softirqs can not be used by device drivers, they are reserved for +various kernel subsystems. Because of this there is a fixed number of +softirqs defined at compile time. For the current kernel version we +have the following types defined:

+
enum {
+    HI_SOFTIRQ = 0,
+    TIMER_SOFTIRQ,
+    NET_TX_SOFTIRQ,
+    NET_RX_SOFTIRQ,
+    BLOCK_SOFTIRQ,
+    IRQ_POLL_SOFTIRQ,
+    TASKLET_SOFTIRQ,
+    SCHED_SOFTIRQ,
+    HRTIMER_SOFTIRQ,
+    RCU_SOFTIRQ,
+    NR_SOFTIRQS
+};
+
+
+

Each type has a specific purpose:

+
    +
  • HI_SOFTIRQ and TASKLET_SOFTIRQ - running tasklets
  • +
  • TIMER_SOFTIRQ - running timers
  • +
  • NET_TX_SOFTIRQ and NET_RX_SOFTIRQ - used by the networking subsystem
  • +
  • BLOCK_SOFTIRQ - used by the IO subsystem
  • +
  • IRQ_POLL_SOFTIRQ - used by the IO subsystem to increase performance when the iopoll handler is invoked
  • +
  • SCHED_SOFTIRQ - load balancing
  • +
  • HRTIMER_SOFTIRQ - implementation of high precision timers
  • +
  • RCU_SOFTIRQ - implementation of RCU type mechanisms [1]
  • +
+ + + + + +
[1] RCU is a mechanism by which destructive operations +(e.g. deleting an element from a chained list) are done in two +steps: (1) removing references to deleted data and (2) freeing +the memory of the element. The second step is done only after +we are sure nobody uses the element anymore. The advantage of +this mechanism is that reading the data can be done without +synchronization. For more information see +Documentation/RCU/rcu.txt.
+

The highest priority is the HI_SOFTIRQ type softirqs, followed in +order by the other softirqs defined. RCU_SOFTIRQ has the lowest +priority.

+

Softirqs run in interrupt context, which means that they can +not call blocking functions. If the softirq handler requires calls to +such functions, work queues can be scheduled to execute these blocking +calls.

+
+

Tasklets

+

A tasklet is a special form of deferred work that runs in interrupt +context, just like softirqs. The main difference between softirqs and tasklets +is that tasklets can be allocated dynamically and thus they can be used +by device drivers. A tasklet is represented by struct +tasklet_struct and, like many other kernel structures, it needs to be +initialized before being used. A pre-initialized tasklet can be defined +as follows:

+
void handler(unsigned long data);
+
+DECLARE_TASKLET(tasklet, handler, data);
+DECLARE_TASKLET_DISABLED(tasklet, handler, data);
+
+
+

If we want to initialize the tasklet manually we can use the following +approach:

+
void handler(unsigned long data);
+
+struct tasklet_struct tasklet;
+
+tasklet_init(&tasklet, handler, data);
+
+
+

The data parameter will be sent to the handler when it is executed.

+

Programming a tasklet to run is called scheduling. Tasklets +run from softirqs. Tasklet scheduling is done with:

+
void tasklet_schedule(struct tasklet_struct *tasklet);
+
+void tasklet_hi_schedule(struct tasklet_struct *tasklet);
+
+
+

When using tasklet_schedule, a TASKLET_SOFTIRQ softirq is +scheduled and all tasklets scheduled are run. For +tasklet_hi_schedule, a HI_SOFTIRQ softirq is scheduled.

+

If a tasklet was scheduled multiple times and it did not run between +schedules, it will run once. Once the tasklet has run, it can be +re-scheduled, and will run again at a later time. Tasklets can be +re-scheduled from their handlers.

+

Tasklets can be masked and the following functions can be used:

+
void tasklet_enable(struct tasklet_struct * tasklet);
+void tasklet_disable(struct tasklet_struct * tasklet);
+
+
+

Remember that since tasklets are running from softirqs, blocking calls +can not be used in the handler function.

+
+
+

Timers

+

A particular type of deferred work, very often used, are timers. They +are defined by struct timer_list. They run in interrupt +context and are implemented on top of softirqs.

+

To be used, a timer must first be initialized by calling timer_setup():

+
#include <linux/sched.h>
+
+void timer_setup(struct timer_list * timer,
+                 void (*function)(struct timer_list *),
+                 unsigned int flags);
+
+
+

The above function initializes the internal fields of the structure +and associates function as the timer handler. Since timers run +on top of softirqs, blocking calls can not be used in the timer +handler function.

+

Scheduling a timer is done with mod_timer():

+
int mod_timer(struct timer_list *timer, unsigned long expires);
+
+
+

Where expires is the time (in the future) to run the handler +function. The function can be used to schedule or reschedule a timer.

+

The time unit is the jiffy. The absolute value of a jiffy +depends on the platform and can be found using the +HZ macro, which defines the number of jiffies in 1 second. To +convert between jiffies (jiffies_value) and seconds (seconds_value), +the following formulas are used:

+
jiffies_value = seconds_value * HZ ;
+seconds_value = jiffies_value / HZ ;
+
+
+

The kernel maintains a counter that contains the number of jiffies +since the last boot, which can be accessed via the jiffies +global variable or macro. We can use it to calculate a time in the +future for timers:

+
#include <linux/jiffies.h>
+
+unsigned long current_jiffies, next_jiffies;
+unsigned long seconds = 1;
+
+current_jiffies = jiffies;
+next_jiffies = jiffies + seconds * HZ;
+
+
+

To stop a timer, use del_timer() and del_timer_sync():

+
int del_timer(struct timer_list *timer);
+int del_timer_sync(struct timer_list *timer);
+
+
+

These functions can be called for both a scheduled timer and an +unplanned timer. del_timer_sync() is used to eliminate the +races that can occur on multiprocessor systems, since at the end of +the call it is guaranteed that the timer processing function does not +run on any processor.

+

A frequent mistake in using timers is that we forget to turn off +timers. For example, before removing a module, we must stop the timers +because if a timer expires after the module is removed, the handler +function will no longer be loaded into the kernel and a kernel oops +will be generated.

+

The usual sequence used to initialize and schedule a one-second +timeout is:

+
#include <linux/sched.h>
+
+void timer_function(struct timer_list *);
+
+struct timer_list timer ;
+unsigned long seconds = 1;
+
+timer_setup(&timer, timer_function, 0);
+mod_timer(&timer, jiffies + seconds * HZ);
+
+
+

And to stop it:

+
del_timer_sync(&timer);
+
+
+
+
+

Locking

+

For synchronization between code running in process context (A) and +code running in softirq context (B) we need to use special locking +primitives. We must use spinlock operations augmented with +deactivation of bottom-half handlers on the current processor in (A), +and in (B) only basic spinlock operations. Using spinlocks makes sure +that we don't have races between multiple CPUs, while deactivating the +softirqs makes sure that we don't deadlock if the softirq is scheduled +on the same CPU where we already acquired a spinlock.

+

We can use the local_bh_disable() and +local_bh_enable() to disable and enable softirqs handlers (and +since they run on top of softirqs also timers and tasklets):

+
void local_bh_disable(void);
+void local_bh_enable(void);
+
+
+

Nested calls are allowed, the actual reactivation of the softirqs is +done only when all local_bh_disable() calls have been complemented by +local_bh_enable() calls:

+
/* We assume that softirqs are enabled */
+local_bh_disable();  /* Softirqs are now disabled */
+local_bh_disable();  /* Softirqs remain disabled */
+
+local_bh_enable();  /* Softirqs remain disabled */
+local_bh_enable();  /* Softirqs are now enabled */
+
+
+
+

Attention

+

The above calls disable the softirqs only on the +local processor, so they are usually not safe to use on their own; they must be +complemented with spinlocks.

+
+

Most of the time device drivers will use special versions of spinlocks +calls for synchronization like spin_lock_bh() and +spin_unlock_bh():

+
void spin_lock_bh(spinlock_t *lock);
+void spin_unlock_bh(spinlock_t *lock);
+
+
+
+
+
+

Workqueues

+

Workqueues are used to schedule actions to run in process context. The +base unit with which they work is called work. There are two types of +work:

+
    +
  • struct work_struct - it schedules a task to run at +a later time
  • +
  • struct delayed_work - it schedules a task to run after at +least a given time interval
  • +
+

A delayed work item uses a timer to run after the specified time +interval. The calls for this type of work are similar to those for +struct work_struct, but have _delayed in the function +names.

+

Before using them a work item must be initialized. There are two types +of macros that can be used, one that declares and initializes the work +item at the same time and one that only initializes the work item (and +the declaration must be done separately):

+
#include <linux/workqueue.h>
+
+DECLARE_WORK(name , void (*function)(struct work_struct *));
+DECLARE_DELAYED_WORK(name, void(*function)(struct work_struct *));
+
+INIT_WORK(struct work_struct *work, void(*function)(struct work_struct *));
+INIT_DELAYED_WORK(struct delayed_work *work, void(*function)(struct work_struct *));
+
+
+

DECLARE_WORK() and DECLARE_DELAYED_WORK() declare and +initialize a work item, and INIT_WORK() and +INIT_DELAYED_WORK() initialize an already declared work item.

+

The following sequence declares and initializes a work item:

+
#include <linux/workqueue.h>
+
+void my_work_handler(struct work_struct *work);
+
+DECLARE_WORK(my_work, my_work_handler);
+
+
+

Or, if we want to initialize the work item separately:

+
void my_work_handler(struct work_struct * work);
+
+struct work_struct my_work;
+
+INIT_WORK(&my_work, my_work_handler);
+
+
+

Once declared and initialized, we can schedule the task using +schedule_work() and schedule_delayed_work():

+
schedule_work(struct work_struct *work);
+
+schedule_delayed_work(struct delayed_work *work, unsigned long delay);
+
+
+

schedule_delayed_work() can be used to plan a work item for +execution with a given delay. The delay time unit is jiffies.

+

Work items can not be masked but they can be canceled by calling +cancel_delayed_work_sync() or cancel_work_sync():

+
int cancel_work_sync(struct work_struct *work);
+int cancel_delayed_work_sync(struct delayed_work *work);
+
+
+

The call only stops the subsequent execution of the work item. If the +work item is already running at the time of the call, it will continue +to run. In any case, when these calls return, it is guaranteed that +the task will no longer run.

+
+

Attention

+

While there are versions of these functions that are +not synchronous (e.g. cancel_work()), do not +use them when you are performing cleanup work; otherwise +race conditions could occur.

+
+

We can wait for a workqueue to complete running all of its work items by calling flush_scheduled_work():

+
void flush_scheduled_work(void);
+
+
+

This function is blocking and, therefore, can not be used in interrupt +context. The function will wait for all work items to be completed. +For delayed work items, cancel_delayed_work must be called +before flush_scheduled_work().

+

Finally, the following functions can be used to schedule work items on +a particular processor (schedule_delayed_work_on()), or on all +processors (schedule_on_each_cpu()):

+
int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay);
+int schedule_on_each_cpu(void(*function)(struct work_struct *));
+
+
+

A usual sequence to initialize and schedule a work item is the following:

+
void my_work_handler(struct work_struct *work);
+
+struct work_struct my_work;
+
+INIT_WORK(&my_work, my_work_handler);
+
+schedule_work(&my_work);
+
+
+

And for waiting for termination of a work item:

+
flush_scheduled_work();
+
+
+

As you can see, the my_work_handler function receives the task as +the parameter. To be able to access the module's private data, you can +use container_of():

+
struct my_device_data {
+    struct work_struct my_work;
+    // ...
+};
+
+void my_work_handler(struct work_struct *work)
+{
+   struct my_device_data * my_data;
+
+   my_data = container_of(work, struct my_device_data,  my_work);
+   // ...
+}
+
+
+

Scheduling work items with the functions above will run the handler in +the context of a kernel thread called events/x, where x is the +processor number. The kernel will initialize a kernel thread (or a +pool of workers) for each processor present in the system:

+
$ ps -e
+PID TTY TIME CMD
+1?  00:00:00 init
+2 ?  00:00:00 ksoftirqd / 0
+3 ?  00:00:00 events / 0 <--- kernel thread that runs work items
+4 ?  00:00:00 khelper
+5 ?  00:00:00 kthread
+7?  00:00:00 kblockd / 0
+8?  00:00:00 kacpid
+
+
+

The above functions use a predefined workqueue (called events), and +they run in the context of the events/x thread, as noted +above. Although this is sufficient in most cases, it is a shared +resource and large delays in work items handlers can cause delays for +other queue users. For this reason there are functions for creating +additional queues.

+

A workqueue is represented by struct workqueue_struct. A new +workqueue can be created with these functions:

+
struct workqueue_struct *create_workqueue(const char *name);
+struct workqueue_struct *create_singlethread_workqueue(const char *name);
+
+
+

create_workqueue() uses one thread for each processor in the +system, and create_singlethread_workqueue() uses a single +thread.

+

To add a task in the new queue, use queue_work() or +queue_delayed_work():

+
int queue_work(struct workqueue_struct * queue, struct work_struct *work);
+
+int queue_delayed_work(struct workqueue_struct *queue,
+                       struct delayed_work * work , unsigned long delay);
+
+
+

queue_delayed_work() can be used to plan a work for execution +with a given delay. The time unit for the delay is jiffies.

+

To wait for all work items to finish call flush_workqueue():

+
void flush_workqueue(struct workqueue_struct * queue);
+
+
+

And to destroy the workqueue call destroy_workqueue():

+
void destroy_workqueue(struct workqueue_struct *queue);
+
+
+

The next sequence declares and initializes an additional workqueue, +declares and initializes a work item and adds it to the queue:

+
void my_work_handler(struct work_struct *work);
+
+struct work_struct my_work;
+struct workqueue_struct * my_workqueue;
+
+my_workqueue = create_singlethread_workqueue("my_workqueue");
+INIT_WORK(&my_work, my_work_handler);
+
+queue_work(my_workqueue, &my_work);
+
+
+

And the next code sample shows how to remove the workqueue:

+
flush_workqueue(my_workqueue);
+destroy_workqueue(my_workqueue);
+
+
+

The work items planned with these functions will run in the context of +a new kernel thread called my_workqueue, the name passed to +create_singlethread_workqueue().

+
+
+

Kernel threads

+

Kernel threads have emerged from the need to run kernel code in +process context. Kernel threads are the basis of the workqueue +mechanism. Essentially, a kernel thread is a thread that only runs in +kernel mode and has no user address space or other user attributes.

+

To create a kernel thread, use kthread_create():

+
#include <linux/kthread.h>
+
+struct task_struct *kthread_create(int (*threadfn)(void *data),
+                                      void *data, const char namefmt[], ...);
+
+
+
    +
  • threadfn is a function that will be run by the kernel thread
  • +
  • data is a parameter to be sent to the function
  • +
  • namefmt represents the kernel thread name, as it is displayed in +ps/top; it can contain sequences such as %d, %s etc., which will be replaced +according to the standard printf syntax.
  • +
+

For example, the following call:

+
kthread_create (f, NULL, "%skthread%d", "my", 0);
+
+
+

Will create a kernel thread with the name mykthread0.

+

The kernel thread created with this function will be stopped (in the +TASK_INTERRUPTIBLE state). To start the kernel thread, call the +wake_up_process():

+
#include <linux/sched.h>
+
+int wake_up_process(struct task_struct *p);
+
+
+

Alternatively, you can use kthread_run() to create and run a +kernel thread:

+
struct task_struct * kthread_run(int (*threadfn)(void *data),
+                                 void *data, const char namefmt[], ...);
+
+
+

Even if the programming restrictions for the function running within +the kernel thread are more relaxed and scheduling is closer to +scheduling in userspace, there are, however, some limitations to be +taken into account. We will list below the actions that can or can not +be made from a kernel thread:

+
    +
  • can't access the user address space (even with copy_from_user, +copy_to_user) because a kernel thread does not have a user address +space
  • +
  • can't implement busy wait code that runs for a long time; if the +kernel is compiled without the preemptive option, that code will run +without being preempted by other kernel threads or user processes +thus hogging the system
  • +
  • can call blocking operations
  • +
  • can use spinlocks, but if the hold time of the lock is significant, +it is recommended to use mutexes
  • +
+

The termination of a kernel thread is done voluntarily, within the +function running in the kernel thread, by calling do_exit():

+
fastcall NORET_TYPE void do_exit(long code);
+
+
+

Most of the implementations of kernel threads handlers use the same +model and it is recommended to start using the same model to avoid +common mistakes:

+
#include <linux/kthread.h>
+
+DECLARE_WAIT_QUEUE_HEAD(wq);
+
+// list events to be processed by kernel thread
+struct list_head events_list;
+spinlock_t events_lock;
+
+
+// structure describing the event to be processed
+struct event {
+    struct list_head lh;
+    bool stop;
+    //...
+};
+
+struct event* get_next_event(void)
+{
+    struct event *e;
+
+    spin_lock(&events_lock);
+    e = list_first_entry_or_null(&events_list, struct event, lh);
+    if (e)
+        list_del(&e->lh);
+    spin_unlock(&events_lock);
+
+    return e;
+}
+
+int my_thread_f(void *data)
+{
+    struct event *e;
+
+    while (true) {
+        wait_event(wq, (e = get_next_event()));
+
+        /* Event processing */
+
+        if (e->stop)
+            break;
+    }
+
+    do_exit(0);
+}
+
+/* create and start kthread */
+kthread_run(my_thread_f, NULL, "%skthread%d", "my", 0);
+
+
+

With the template above, the kernel thread requests can be issued +with:

+
void send_event(struct event *ev)
+{
+    spin_lock(&events_lock);
+    list_add(&ev->lh, &events_list);
+    spin_unlock(&events_lock);
+    wake_up(&wq);
+}
+
+
+
+ +
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is deferred_work. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/deferred_work/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using LXR, find the definitions of the following symbols:

+
    +
  • jiffies
  • +
  • struct timer_list
  • +
  • spin_lock_bh function()
  • +
+
+
+

1. Timer

+

We're looking at creating a simple kernel module that displays a +message TIMER_TIMEOUT seconds after the module is loaded into the kernel.

+

Generate the skeleton for the task named 1-2-timer and follow the +sections marked with TODO 1 to complete the task.

+
+

Hint

+

Use pr_info(...). Messages will be displayed on the +console and can also be viewed using dmesg. When scheduling +the timer we need to use the absolute time of the system (in +the future) in number of ticks. The current time of the +system in the number of ticks is given by jiffies. +Thus, the absolute time we need to pass to the timer is +jiffies + TIMER_TIMEOUT * HZ.

+

For more information review the Timers section.

+
+
+
+

2. Periodic timer

+

Modify the previous module to display the message once every +TIMER_TIMEOUT seconds. Follow the section marked with TODO 2 in the +skeleton.

+
+
+

3. Timer control using ioctl

+

We plan to display information about the current process N +seconds after receiving an ioctl call from user space. N is transmitted as +the ioctl parameter.

+

Generate the skeleton for the task named 3-4-5-deferred and +follow the sections marked with TODO 1 in the skeleton driver.

+

You will need to implement the following ioctl operations.

+
    +
  • MY_IOCTL_TIMER_SET to schedule a timer to run after a number of +seconds which is received as an argument to ioctl. The timer does +not run periodically. +This command receives the value directly, not a pointer to it.
  • +
  • MY_IOCTL_TIMER_CANCEL to deactivate the timer.
  • +
+
+

Note

+

Review ioctl for a way to access the ioctl argument.

+
+
+

Note

+

Review the Timers section for information on enabling / +disabling a timer. In the timer handler, display the current +process identifier (PID) and the process executable image name.

+
+
+

Hint

+

You can find the current process identifier using the pid +and comm fields of the current process. For details, +review proc-info.

+
+
+

Hint

+

To use the device driver from userspace you must create the +device character file /dev/deferred using the mknod +utility. Alternatively, you can run the +3-4-5-deferred/kernel/makenode script that performs this +operation.

+
+

Enable and disable the timer by calling user-space ioctl +operations. Use the 3-4-5-deferred/user/test program to test +planning and canceling of the timer. The program receives the ioctl +type operation and its parameters (if any) on the command line.

+
+

Hint

+

Run the test executable without arguments to observe the +command line options it accepts.

+

To enable the timer after 3 seconds use:

+
./test s 3
+
+
+

To disable the timer use:

+
./test c
+
+
+
+

Note that every time the timer runs, the current process is +swapper/0 with PID 0. This process is the idle process. It is +running when there is nothing else to run. Because the virtual +machine is very light and does not do much, it is natural to see this +process most of the time.

+
+
+

4. Blocking operations

+

Next we want to see what happens when we perform blocking operations +in a timer routine. For this we try to call in the timer-handling +routines a function called alloc_io() that simulates a blocking +operation.

+

Modify the module so that when you receive MY_IOCTL_TIMER_ALLOC +command the timer handler will call alloc_io(). Follow the +sections marked with TODO 2 in the skeleton.

+

Use the same timer. To differentiate functionality in the timer +handler, use a flag in the device structure. Use the +TIMER_TYPE_ALLOC and TIMER_TYPE_SET macros defined in the code +skeleton. For initialization, use TIMER_TYPE_NONE.

+

Run the test program to verify the functionality of task 3. Run the +test program again to call alloc_io().

+
+

Note

+

The driver causes an error because a blocking function is +called in atomic context (the timer handler runs in +interrupt context).

+
+
+
+

5. Workqueues

+

We will modify the module to prevent the error observed in the +previous task.

+

To do so, let's call alloc_io() using workqueues. Schedule a +work item from the timer handler. In the work handler (running in +process context) call alloc_io(). Follow the sections +marked with TODO 3 in the skeleton and review the Workqueues +section if needed.

+
+

Hint

+

Add a new field with the type struct work_struct +in your device structure. Initialize this field. Schedule +the work from the timer handler using schedule_work(). +Schedule the timer handler after N seconds from the ioctl.

+
+
+
+

6. Kernel thread

+

Implement a simple module that creates a kernel thread that shows the +current process identifier.

+

Generate the skeleton for the task named 6-kthread and follow the +TODOs from the skeleton.

+
+

Note

+

There are two options for creating and running a thread:

+
    +
  • kthread_run() to create and run the thread
  • +
  • kthread_create() to create a suspended thread and +then start it running with wake_up_process().
  • +
+

Review the Kernel Threads section if needed.

+
+
+

Attention

+

Synchronize the thread termination with module unloading:

+
    +
  • The thread should finish when the module is unloaded
  • +
  • Wait for the kernel thread to exit before continuing +with unloading
  • +
+
+
+

Hint

+

For synchronization use two wait queues and two flags.

+

Review waiting-queues to see how to use waiting queues.

+

Use atomic variables for flags. Review Atomic variables.

+
+
+
+

7. Buffer shared between timer and process

+

The purpose of this task is to exercise the synchronization between a +deferrable action (a timer) and process context. Set up a periodic +timer that monitors a list of processes. If one of the processes +terminates, a message is printed. Processes can be dynamically added to +the list. Use the 3-4-5-deferred/kernel/ skeleton as a base and +follow the TODO 4 markings to complete the task.

+

When the MY_IOCTL_TIMER_MON command is received, check that the given +process exists and, if so, add it to the monitored list of +processes and then arm the timer after setting its type.

+
+

Hint

+

Use get_proc() which checks the pid, finds the +associated struct task_struct and allocates a +struct mon_proc item you can add to your +list. Note that the function also increases the reference +counter of the task, so that its memory won't be freed when +the task terminates.

+
+
+

Attention

+

Use a spinlock to protect the access to the list. Note +that since we share data with the timer handler we need +to disable bottom-half handlers in addition to taking +the lock. Review the Locking section.

+
+
+

Hint

+

Collect the information every second from a timer. Use the +existing timer and add new behaviour for it via the +TIMER_TYPE_ACCT. To set the flag, use the t argument of +the test program.

+
+

In the timer handler, iterate over the list of monitored processes and +check if they have terminated. If so, print the process name and pid, +then remove the process from the list, decrement the task usage +counter so that its memory can be freed, and finally free the +struct mon_proc structure.

+
+

Hint

+

Use the state field of struct task_struct. A +task has terminated if its state is TASK_DEAD.

+
+
+

Hint

+

Use put_task_struct() to decrement the task usage +counter.

+
+
+

Attention

+

Make sure you protect the list access with a +spinlock. The simple variant will suffice.

+
+
+

Attention

+

Make sure to use the safe iteration over the list since +we may need to remove an item from the list.

+
+

Rearm the timer after checking the list.

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/device_drivers.html b/refs/pull/405/merge/labs/device_drivers.html new file mode 100644 index 00000000..a5663090 --- /dev/null +++ b/refs/pull/405/merge/labs/device_drivers.html @@ -0,0 +1,1200 @@ + + + + + + Character device drivers — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Character device drivers

+
+

Laboratory objectives

+
+
    +
  • understand the concepts behind character device driver
  • +
  • understand the various operations that can be performed on character devices
  • +
  • working with waiting queues
  • +
+
+
+
+

Overview

+

In UNIX, hardware devices are accessed by the user through special device +files. These files are grouped into the /dev directory, and system calls +open, read, write, close, lseek, mmap etc. are +redirected by the operating system to the device driver associated with the +physical device. The device driver is a kernel component (usually a module) +that interacts with a hardware device.

+

In the UNIX world there are two categories of device files and thus +device drivers: character and block. This division is done by the speed, +volume and way of organizing the data to be transferred from the device to the +system and vice versa. In the first category, there are slow devices, which +manage a small amount of data, and access to data does not require frequent +seek queries. Examples are devices such as keyboard, mouse, serial ports, +sound card, joystick. In general, operations with these devices (read, write) +are performed sequentially byte by byte. The second category includes devices +where data volume is large, data is organized on blocks, and search is common. +Examples of devices that fall into this category are hard drives, cdroms, ram +disks, magnetic tape drives. For these devices, reading and writing is done at +the data block level.

+

For the two types of device drivers, the Linux kernel offers different APIs. +If for character devices system calls go directly to device drivers, in case of +block devices, the drivers do not work directly with system calls. In +the case of block devices, communication between the user-space and the block +device driver is mediated by the file management subsystem and the block device +subsystem. The role of these subsystems is to prepare the device driver's +necessary resources (buffers), to keep the recently read data in the cache +buffer, and to order the read and write operations for performance reasons.

+
+
+

Majors and minors

+

In UNIX, the devices traditionally had a unique, fixed identifier associated +with them. This tradition is preserved in Linux, although identifiers can be +dynamically allocated (for compatibility reasons, most drivers still use static +identifiers). The identifier consists of two parts: major and minor. The first +part identifies the device type (IDE disk, SCSI disk, serial port, etc.) +and the second one identifies the device (first disk, second serial port, +etc.). Most of the time, the major identifies the driver, while the minor identifies +each physical device served by the driver. In general, a driver will have an +associated major and will be responsible for all minors associated with that +major.

+
$ ls -la /dev/hda? /dev/ttyS?
+brw-rw----  1 root disk    3,  1 2004-09-18 14:51 /dev/hda1
+brw-rw----  1 root disk    3,  2 2004-09-18 14:51 /dev/hda2
+crw-rw----  1 root dialout 4, 64 2004-09-18 14:52 /dev/ttyS0
+crw-rw----  1 root dialout 4, 65 2004-09-18 14:52 /dev/ttyS1
+
+
+

As can be seen from the example above, device-type information can be found +using the ls command. The special character files are identified by the c +character in the first column of the command output, and the block type by the +character b. In columns 5 and 6 of the result you can see the +major, respectively the minor for each device.

+

Certain major identifiers are statically assigned to devices (in the +Documentation/admin-guide/devices.txt file from the kernel sources). When choosing the +identifier for a new device, you can use two methods: static (choose a number +that does not seem to be used already) or dynamic. The /proc/devices file lists the +loaded devices, along with their major identifiers.

+

To create a device type file, use the mknod command; the command receives the +type (block or character), major and minor of the device +(mknod name type major minor). Thus, if you want to create a character device +named mycdev with the major 42 and minor 0, use the command:

+
# mknod /dev/mycdev c 42 0
+
+
+

To create the block device with the name mybdev with the major 240 and minor 0 +the command will be:

+
# mknod /dev/mybdev b 240 0
+
+
+

Next, we'll refer to character devices as drivers.

+
+
+

Data structures for a character device

+

In the kernel, a character-type device is represented by +struct cdev, a structure used to register it in the +system. Most driver operations use three important structures: +struct file_operations, struct file and struct inode.

+
+

struct file_operations

+

As mentioned above, the character device drivers receive unaltered system calls +made by users over device-type files. Consequently, implementation of a character +device driver means implementing the system calls specific to files: open, +close, read, write, lseek, mmap, etc. These operations are +described in the fields of the struct file_operations structure:

+
#include <linux/fs.h>
+
+struct file_operations {
+    struct module *owner;
+    loff_t (*llseek) (struct file *, loff_t, int);
+    ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
+    ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
+    [...]
+    long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
+    [...]
+    int (*open) (struct inode *, struct file *);
+    int (*flush) (struct file *, fl_owner_t id);
+    int (*release) (struct inode *, struct file *);
+    [...]
+
+
+

It can be noticed that the signature of the function differs from the system +call that the user uses. The operating system sits between the user and +the device driver to simplify implementation in the device driver.

+

open does not receive the parameter path or the various parameters that control +the file opening mode. Similarly, read, write, release, ioctl, lseek +do not receive as a parameter a file descriptor. Instead, these routines receive as +parameters two structures: file and inode. Both structures represent a file, +but from different perspectives.

+
+
Most parameters for the presented operations have a direct meaning:
+
    +
  • file and inode identify the device type file;
  • +
  • size is the number of bytes to be read or written;
  • +
  • offset is the displacement to be read or written (to be updated +accordingly);
  • +
  • user_buffer is the user-space buffer to read from / write to;
  • +
  • whence is the way to seek (the position where the search operation starts);
  • +
  • cmd and arg are the parameters sent by the users to the ioctl call (IO +control).
  • +
+
+
+
+
+

inode and file structures

+

An inode represents a file from the point of view of the file system. Attributes +of an inode are the size, rights, times associated with the file. An inode uniquely +identifies a file in a file system.

+

The file structure also represents a file, but closer to the user's point of view. +Among the attributes of the file structure we list: the inode, the file name, +the file opening attributes, the file position. All open files at a given time +have an associated file structure.

+

To understand the differences between inode and file, we will use an analogy +from object-oriented programming: if we consider a class inode, then the files +are objects, that is, instances of the inode class. Inode represents the static +image of the file (the inode has no state), while the file represents the +dynamic image of the file (the file has state).

+

Returning to device drivers, the two entities are almost always used in standard +ways: the inode is used to determine the major and minor of the device on +which the operation is performed, and the file is used to determine the flags +with which the file was opened, but also to save and access (later) private +data.

+

The file structure contains, among many fields:

+
+
    +
  • f_mode, which specifies read (FMODE_READ) or write +(FMODE_WRITE);
  • +
  • f_flags, which specifies the file opening flags (O_RDONLY, +O_NONBLOCK, O_SYNC, O_APPEND, O_TRUNC, etc.);
  • +
  • f_op, which specifies the operations associated with the file (pointer to +the file_operations structure );
  • +
  • private_data, a pointer that can be used by the programmer to store +device-specific data; The pointer will be initialized to a memory location +assigned by the programmer.
  • +
  • f_pos, the offset within the file
  • +
+
+

The inode structure contains, among much information, an i_cdev +field, which is a pointer to the structure that defines the character +device (when the inode corresponds to a character device).

+
+
+
+

Implementation of operations

+

To implement a device driver, it is recommended that you create a structure +that contains information about the device, information used in the module. In +the case of a driver for a character device, the structure will contain a cdev +structure field to refer to the device. The following example uses the struct +my_device_data:

+
#include <linux/fs.h>
+#include <linux/cdev.h>
+
+struct my_device_data {
+    struct cdev cdev;
+    /* my data starts here */
+    //...
+};
+
+static int my_open(struct inode *inode, struct file *file)
+{
+    struct my_device_data *my_data;
+
+    my_data = container_of(inode->i_cdev, struct my_device_data, cdev);
+
+    file->private_data = my_data;
+    //...
+}
+
+static int my_read(struct file *file, char __user *user_buffer, size_t size, loff_t *offset)
+{
+    struct my_device_data *my_data;
+
+    my_data = (struct my_device_data *) file->private_data;
+
+    //...
+}
+
+
+

A structure like my_device_data will contain the data associated with a device. +The cdev field (cdev type) is a character-type device and is used to record it +in the system and identify the device. The pointer to the cdev member can be +found using the i_cdev field of the inode structure (using the container_of +macro). In the private_data field of the file structure, information can be +stored at open which is then available in the read, write, release, etc. +routines.

+
+
+

Registration and unregistration of character devices

+

The registration/unregistration of a device is made by specifying the major and +minor. The dev_t type is used to keep the identifiers of a device (both major +and minor) and can be obtained using the MKDEV macro.

+

For the static assignment and unallocation of device identifiers, the +register_chrdev_region and unregister_chrdev_region functions are used:

+
#include <linux/fs.h>
+
+int register_chrdev_region(dev_t first, unsigned int count, char *name);
+void unregister_chrdev_region(dev_t first, unsigned int count);
+
+
+

It is recommended that device identifiers be dynamically assigned using the +alloc_chrdev_region function.

+

Below sequence reserves my_minor_count devices, starting with my_major +major and my_first_minor minor (if the max value for minor is exceeded, +move to the next major):

+
#include <linux/fs.h>
+...
+
+err = register_chrdev_region(MKDEV(my_major, my_first_minor), my_minor_count,
+                             "my_device_driver");
+if (err != 0) {
+    /* report error */
+    return err;
+}
+...
+
+
+

After assigning the identifiers, the character device will have to be +initialized (cdev_init) and the kernel will have to be notified (cdev_add). The +cdev_add function must be called only after the device is ready to receive +calls. Removing a device is done using the cdev_del function.

+
#include <linux/cdev.h>
+
+void cdev_init(struct cdev *cdev, struct file_operations *fops);
+int cdev_add(struct cdev *dev, dev_t num, unsigned int count);
+void cdev_del(struct cdev *dev);
+
+
+

The following sequence registers and initializes MY_MAX_MINORS devices:

+
#include <linux/fs.h>
+#include <linux/cdev.h>
+
+#define MY_MAJOR       42
+#define MY_MAX_MINORS  5
+
+struct my_device_data {
+    struct cdev cdev;
+    /* my data starts here */
+    //...
+};
+
+struct my_device_data devs[MY_MAX_MINORS];
+
+const struct file_operations my_fops = {
+    .owner = THIS_MODULE,
+    .open = my_open,
+    .read = my_read,
+    .write = my_write,
+    .release = my_release,
+    .unlocked_ioctl = my_ioctl
+};
+
+int init_module(void)
+{
+    int i, err;
+
+    err = register_chrdev_region(MKDEV(MY_MAJOR, 0), MY_MAX_MINORS,
+                                 "my_device_driver");
+    if (err != 0) {
+        /* report error */
+        return err;
+    }
+
+    for(i = 0; i < MY_MAX_MINORS; i++) {
+        /* initialize devs[i] fields */
+        cdev_init(&devs[i].cdev, &my_fops);
+        cdev_add(&devs[i].cdev, MKDEV(MY_MAJOR, i), 1);
+    }
+
+    return 0;
+}
+
+
+

While the following sequence deletes and unregisters them:

+
void cleanup_module(void)
+{
+    int i;
+
+    for(i = 0; i < MY_MAX_MINORS; i++) {
+        /* release devs[i] fields */
+        cdev_del(&devs[i].cdev);
+    }
+    unregister_chrdev_region(MKDEV(MY_MAJOR, 0), MY_MAX_MINORS);
+}
+
+
+
+

Note

+

The initialization of the my_fops structure uses the initialization +of members by name, defined in the C99 standard (see designated +initializers and the file_operations structure). Structure +members that do not explicitly appear in this initialization +will be set to the default value for their type. For +example, after the initialization above, my_fops.mmap will +be NULL.

+
+
+
+

Access to the address space of the process

+

A driver for a device is the interface between an application and hardware. As +a result, we often have to access user-space data. Accessing it can not be done +directly (by dereferencing a user-space pointer). Direct access of a +user-space pointer can lead to incorrect behavior (depending on architecture, a +user-space pointer may not be valid or mapped to kernel-space), a kernel oops +(the user-mode pointer can refer to a non-resident memory area) or security +issues. Proper access to user-space data is done by calling the macros / +functions below:

+
#include <asm/uaccess.h>
+
+put_user(type val, type *address);
+get_user(type val, type *address);
+unsigned long copy_to_user(void __user *to, const void *from, unsigned long n);
+unsigned long copy_from_user(void *to, const void __user *from, unsigned long n);
+
+
+

All macros / functions return 0 in case of success and another value in case of +error and have the following roles:

+
+
    +
  • put_user stores the value val to the user-space address address; +the type can be one of 8, 16, 32, 64 bit (the maximum supported type depends on the +hardware platform);
  • +
  • get_user analogue to the previous function, only that val will be set to a +value identical to the value at the user-space address given by address;
  • +
  • copy_to_user copies n bytes from kernel-space, from the address +referenced by from, to the user-space address referenced by to;
  • +
  • copy_from_user copies n bytes from user-space, from the address +referenced by from, to the kernel-space address referenced by to.
  • +
+
+

A common section of code that works with these functions is:

+
#include <asm/uaccess.h>
+
+/*
+ * Copy at most size bytes to user space.
+ * Return ''0'' on success and some other value on error.
+ */
+if (copy_to_user(user_buffer, kernel_buffer, size))
+    return -EFAULT;
+else
+    return 0;
+
+
+
+
+

Open and release

+

The open function performs the initialization of a device. In most cases, +these operations refer to initializing the device and filling in specific data +(if it is the first open call). The release function is about releasing +device-specific resources: unlocking specific data and closing the device if +the last call is close.

+

In most cases, the open function will have the following structure:

+
static int my_open(struct inode *inode, struct file *file)
+{
+    struct my_device_data *my_data =
+             container_of(inode->i_cdev, struct my_device_data, cdev);
+
+    /* validate access to device */
+    file->private_data = my_data;
+
+    /* initialize device */
+    ...
+
+    return 0;
+}
+
+
+

A problem that occurs when implementing the open function is access control. +Sometimes a device must be opened by only one user at a time; more specifically, a +second open must not be allowed before the release. To implement this restriction, you +choose a way to handle an open call for an already open device: it can return +an error (-EBUSY), block open calls until a release operation, or shut down +the device before doing the open.

+

When user-space calls the open and close functions on the device, the +my_open and my_release functions of the driver are called. An example of a user-space call:

+
int fd = open("/dev/my_device", O_RDONLY);
+if (fd < 0) {
+    /* handle error */
+}
+
+/* do work */
+//..
+
+close(fd);
+
+
+
+
+

Read and write

+

The read and write operations reach the device driver as a +result of a user-space program calling the read or write system calls:

+
if (read(fd, buffer, size) < 0) {
+    /* handle error */
+}
+
+if (write(fd, buffer, size) < 0) {
+    /* handle error */
+}
+
+
+

The read and write functions transfer data between the device and the +user-space: the read function reads the data from the device and transfers it +to the user-space, while writing reads the user-space data and writes it to the +device. The buffer received as a parameter is a user-space pointer, which is +why it is necessary to use the copy_to_user or copy_from_user functions.

+

The value returned by read or write can be:

+
+
    +
  • the number of bytes transferred; if the returned value is less than the size +parameter (the number of bytes requested), then it means that a partial +transfer was made. Most of the time, the user-space app calls the system call +(read or write) repeatedly until the required amount of data is transferred.
  • +
  • 0 to mark the end of the file in the case of read ; if write returns the +value 0 then it means that no byte has been written and that no error has +occurred; In this case, the user-space application retries the write call.
  • +
  • a negative value indicating an error code.
  • +
+
+

To perform a data transfer consisting of several partial transfers, the +following operations should be performed:

+
+
    +
  • transfer the maximum number of possible bytes between the buffer received +as a parameter and the device (writing to the device/reading from the device +will be done from the offset received as a parameter);
  • +
  • update the offset received as a parameter to the position from which the +next read / write data will begin;
  • +
  • return the number of bytes transferred.
  • +
+
+

The sequence below shows an example for the read function that takes +into account the internal buffer size, user buffer size and the offset:

+
static int my_read(struct file *file, char __user *user_buffer,
+                   size_t size, loff_t *offset)
+{
+    struct my_device_data *my_data = (struct my_device_data *) file->private_data;
+    ssize_t len = min(my_data->size - *offset, size);
+
+    if (len <= 0)
+        return 0;
+
+    /* read data from my_data->buffer to user buffer */
+    if (copy_to_user(user_buffer, my_data->buffer + *offset, len))
+        return -EFAULT;
+
+    *offset += len;
+    return len;
+}
+
+
+

The images below illustrate the read operation and how data is +transferred between the user-space and the driver:

+
+
    +
  1. when the driver has enough data available (starting with the OFFSET +position) to accurately transfer the required size (SIZE) to the user.
  2. +
  3. when a smaller amount is transferred than required.
  4. +
+
+../_images/read.png +../_images/read2.png +

We can look at the read operation implemented by the driver as a response to a +user-space read request. In this case, the driver is responsible for advancing +the offset according to how much it reads and returning the read size (which +may be less than what is required).

+

The structure of the write function is similar:

+
static int my_write(struct file *file, const char __user *user_buffer,
+                    size_t size, loff_t * offset)
+{
+    struct my_device_data *my_data = (struct my_device_data *) file->private_data;
+    ssize_t len = min(my_data->size - *offset, size);
+
+    if (len <= 0)
+        return 0;
+
+    /* read data from user buffer to my_data->buffer */
+    if (copy_from_user(my_data->buffer + *offset, user_buffer, len))
+        return -EFAULT;
+
+    *offset += len;
+    return len;
+}
+
+
+

The write operation will respond to a write request from user-space. In +this case, depending on the maximum driver capacity (MAXSIZ), it can +write more or less than the required size.

+../_images/write.png +../_images/write2.png +
+
+

ioctl

+

In addition to read and write operations, a driver needs the ability to perform +certain physical device control tasks. These operations are accomplished by +implementing an ioctl function. Initially, the ioctl system call used the Big Kernel +Lock. That's why the call was gradually replaced with its unlocked version +called unlocked_ioctl. You can read more on LWN: +http://lwn.net/Articles/119652/

+
static long my_ioctl (struct file *file, unsigned int cmd, unsigned long arg);
+
+
+

cmd is the command sent from user-space. If a value is being sent from the +user-space call, it can be accessed directly through arg. If a buffer is passed, the arg +value will be a pointer to it, and it must be accessed through copy_to_user +or copy_from_user.

+

Before implementing the ioctl function, the numbers corresponding to the +commands must be chosen. One method is to choose consecutive numbers starting +at 0, but it is recommended to use _IOC(dir, type, nr, size) macro definition +to generate ioctl codes. The macro definition parameters are as follows:

+
+
    +
  • dir represents the data transfer (_IOC_NONE , _IOC_READ, +_IOC_WRITE).
  • +
  • type represents the magic number (Documentation/ioctl/ioctl-number.txt);
  • +
  • nr is the ioctl code for the device;
  • +
  • size is the size of the transferred data.
  • +
+
+

The following example shows an implementation of an ioctl function:

+
#include <asm/ioctl.h>
+
+#define MY_IOCTL_IN _IOC(_IOC_WRITE, 'k', 1, sizeof(my_ioctl_data))
+
+static long my_ioctl (struct file *file, unsigned int cmd, unsigned long arg)
+{
+    struct my_device_data *my_data =
+         (struct my_device_data*) file->private_data;
+    my_ioctl_data mid;
+
+    switch(cmd) {
+    case MY_IOCTL_IN:
+        if( copy_from_user(&mid, (my_ioctl_data *) arg,
+                           sizeof(my_ioctl_data)) )
+            return -EFAULT;
+
+        /* process data and execute command */
+
+        break;
+    default:
+        return -ENOTTY;
+    }
+
+    return 0;
+}
+
+
+

At the user-space call for the ioctl function, the my_ioctl function of the +driver will be called. An example of such a user-space call:

+
if (ioctl(fd, MY_IOCTL_IN, buffer) < 0) {
+    /* handle error */
+}
+
+
+
+
+

Waiting queues

+

It is often necessary for a thread to wait for an operation to finish, +but it is desirable that this wait is not busy-waiting. Using waiting +queues we can block a thread until an event occurs. When the condition +is satisfied, elsewhere in the kernel, in another process, in an +interrupt or deferrable work, we will wake up the process.

+

A waiting queue is a list of processes that are waiting for a specific +event. A queue is defined with the wait_queue_head_t type and can +be used by the functions/macros:

+
#include <linux/wait.h>
+
+DECLARE_WAIT_QUEUE_HEAD(wq_name);
+
+void init_waitqueue_head(wait_queue_head_t *q);
+
+int wait_event(wait_queue_head_t q, int condition);
+
+int wait_event_interruptible(wait_queue_head_t q, int condition);
+
+int wait_event_timeout(wait_queue_head_t q, int condition, int timeout);
+
+int wait_event_interruptible_timeout(wait_queue_head_t q, int condition, int timeout);
+
+void wake_up(wait_queue_head_t *q);
+
+void wake_up_interruptible(wait_queue_head_t *q);
+
+
+

The roles of the macros / functions above are:

+
+
    +
  • init_waitqueue_head() initializes the queue; to initialize the +queue at compile time, you can use the DECLARE_WAIT_QUEUE_HEAD macro;
  • +
  • wait_event() and wait_event_interruptible() add the current thread to the +queue while the condition is false, set it to TASK_UNINTERRUPTIBLE or +TASK_INTERRUPTIBLE, respectively, and call the scheduler to schedule a new thread; waiting +will be interrupted when another thread calls the wake_up function;
  • +
  • wait_event_timeout() and wait_event_interruptible_timeout() have the same +effect as the above functions, only waiting can be interrupted at the end of +the timeout received as a parameter;
  • +
  • wake_up() moves all threads in the TASK_INTERRUPTIBLE and +TASK_UNINTERRUPTIBLE states to the TASK_RUNNING state and removes these threads from the +queue;
  • +
  • wake_up_interruptible() same action, but only threads with TASK_INTERRUPTIBLE +status are woken up.
  • +
+
+

A simple example is that of a thread waiting for the value of a flag to change. The +initializations are done by the sequence:

+
#include <linux/sched.h>
+
+wait_queue_head_t wq;
+int flag = 0;
+
+init_waitqueue_head(&wq);
+
+
+

A thread will wait for the flag to be changed to a value other than zero:

+
wait_event_interruptible(wq, flag != 0);
+
+
+

While another thread will change the flag value and wake up the waiting threads:

+
flag = 1 ;
+wake_up_interruptible (&wq);
+
+
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is device_drivers. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/device_drivers/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using LXR find the definitions +of the following symbols in the Linux kernel:

+
+
    +
  • struct file
  • +
  • struct file_operations
  • +
  • generic_ro_fops
  • +
  • vfs_read()
  • +
+
+
+
+

1. Register/unregister

+

The driver will control a single device with the MY_MAJOR major and +MY_MINOR minor (the macros defined in the kernel/so2_cdev.c file).

+
+
    +
  1. Create /dev/so2_cdev character device node using mknod.

    +
    +

    Hint

    +

    Read Majors and minors section in the lab.

    +
    +
  2. +
  3. Implement the registration and deregistration of the device with the name +so2_cdev, respectively in the init and exit module functions. Implement TODO 1.

    + +
  4. +
  5. Display, using pr_info, a message after the registration and unregistration +operations to confirm that they were successful. Then load the module into the kernel:

    +
    $ insmod so2_cdev.ko
    +
    +
    +

    And see character devices in /proc/devices:

    +
    $ cat /proc/devices | less
    +
    +
    +

    Identify the device type registered with major 42 . Note that /proc/devices +contains only the device types (major) but not the actual devices (i.e. minors).

    +
    +

    Note

    +

    Entries in /dev are not created by loading the module. These can be created +in two ways:

    +
      +
    • manually, using the mknod command as we did above.
    • +
    • automatically using udev daemon
    • +
    +
    +
  6. +
  7. Unload the kernel module

    +
    rmmod so2_cdev
    +
    +
    +
  8. +
+
+
+
+

2. Register an already registered major

+

Modify MY_MAJOR so that it points to an already used major number.

+
+

Hint

+

See /proc/devices to get an already assigned major.

+
+

See errno-base.h +and figure out what the error code means. +Return to the initial configuration of the module.

+
+
+

3. Open and close

+

Run cat /dev/so2_cdev to read data from our char device. +Reading does not work because the driver does not have the open function implemented. +Follow comments marked with TODO 2 and implement them.

+
+
    +
  1. Initialize your device +
  2. +
  3. Implement the open and release functions in the driver.
  4. +
  5. Display a message in the open and release functions.
  6. +
  7. Read the /dev/so2_cdev file again. Follow the messages displayed by the kernel. +We still get an error because the read function is not yet implemented.
  8. +
+
+
+

Note

+

The prototype of a device driver's operations is in the file_operations +structure. Read Open and release section.

+
+
+
+

4. Access restriction

+

Restrict access to the device with atomic variables, so that a single process +can open the device at a time. The rest will receive the "device busy" error +(-EBUSY). Restricting access will be done in the open function implemented by +the driver. Follow comments marked with TODO 3 and implement them.

+
+
    +
  1. Add an atomic_t variable to the device structure.
  2. +
  3. Initialize the variable at module initialization.
  4. +
  5. Use the variable in the open function to restrict access to the device. We +recommend using atomic_cmpxchg().
  6. +
  7. Reset the variable in the release function to restore access to the device.
  8. +
  9. To test your deployment, you'll need to simulate a long-term use of your +device. To simulate a sleep, call the scheduler at the end of the device opening:
  10. +
+
+
set_current_state(TASK_INTERRUPTIBLE);
+schedule_timeout(1000);
+
+
+
+

Note

+

The advantage of the atomic_cmpxchg function is that it can check the +old value of the variable and set it up to a new value, all in one +atomic operation. Read more details about atomic_cmpxchg. +An example of use is here.

+
+
+
+

5. Read operation

+

Implement the read function in the driver. Follow comments marked with TODO 4 and implement them.

+
+
    +
  1. Keep a buffer in so2_device_data structure initialized with the value of MESSAGE macro. +Initializing this buffer will be done in module init function.
  2. +
  3. At a read call, copy the contents of the kernel space buffer into the user +space buffer.
      +
    • Use the copy_to_user() function to copy information from kernel space to +user space.
    • +
    • Ignore the size and offset parameters at this time. You can assume that +the buffer in user space is large enough. You do not need to check the +validity of the size argument of the read function.
    • +
    • The value returned by the read call is the number of bytes transmitted +from the kernel space buffer to the user space buffer.
    • +
    +
  4. +
  5. After implementation, test using cat /dev/so2_cdev.
  6. +
+
+
+

Note

+

The command cat /dev/so2_cdev does not end (use Ctrl+C). +Read the read and write sections and Access to the address space of the process. +If you want to display the offset value use a construction of the form: +pr_info("Offset: %lld \n", *offset); The data type loff_t (used by offset ) is a typedef for long long int.

+
+

The cat command reads to the end of the file, and the end of the file is +signaled by returning the value 0 in the read. Thus, for a correct implementation, +you will need to update and use the offset received as a parameter in the read +function and return the value 0 when the user has reached the end of the buffer.

+

Modify the driver so that the cat command ends:

+
+
    +
  1. Use the size parameter.
  2. +
  3. For every read, update the offset parameter accordingly.
  4. +
  5. Ensure that the read function returns the number of bytes that were copied +into the user buffer.
  6. +
+
+
+

Note

+

By dereferencing the offset parameter it is possible to read and move the current +position in the file. Its value needs to be updated every time a read is done +successfully.

+
+
+
+

6. Write operation

+

Add the ability to write a message into kernel buffer to replace the predefined message. Implement +the write function in the driver. Follow comments marked with TODO 5

+

Ignore the offset parameter at this time. You can assume that the driver buffer is +large enough. You do not need to check the validity of the write function size +argument.

+
+

Note

+

The prototype of a device driver's operations is in the file_operations +structure. +Test using commands:

+
echo "arpeggio"> /dev/so2_cdev
+cat /dev/so2_cdev
+
+
+

Read the read and write sections and Access to the address space of the process

+
+
+
+

7. ioctl operation

+

For this exercise, we want to add the ioctl MY_IOCTL_PRINT to display the +message from the IOCTL_MESSAGE macro in the driver. +Follow the comments marked with TODO 6

+

For this:

+
+
    +
  1. Implement the ioctl function in the driver.
  2. +
  3. We need to use user/so2_cdev_test.c to call the +ioctl function with the appropriate parameters.
  4. +
  5. To test, we will use a user-space program (user/so2_cdev_test.c) +which will call the ioctl function with the required arguments.
  6. +
+
+
+

Note

+

The macro MY_IOCTL_PRINT is defined in the file include/so2_cdev.h, +which is shared between the kernel module and the user-space program.

+

Read the ioctl section in the lab.

+
+
+

Note

+

The user-space code is compiled automatically at make build and +copied at make copy.

+

Because we need to compile the program for qemu machine which is 32 bit, +if your host is 64 bit then you need to install gcc-multilib package.

+
+
+
+
+

Extra Exercises

+
+

Ioctl with messaging

+

Add two ioctl operations to modify the message associated with the +driver. Use fixed-length buffer ( BUFFER_SIZE ).

+
+
    +
  1. Add the following operations to the ioctl function of the driver:
      +
    • MY_IOCTL_SET_BUFFER for writing a message to the device;
    • +
    • MY_IOCTL_GET_BUFFER to read a message from your device.
    • +
    +
  2. +
  3. For testing, pass the required command line arguments to the +user-space program.
  4. +
+
+
+

Note

+

Read the ioctl and Access to the address space of the process +sections of the lab.

+
+
+
+

Ioctl with waiting queues

+

Add two ioctl operations to the device driver for queuing.

+
+
    +
  1. Add the following operations to the ioctl function of the driver:
      +
    • MY_IOCTL_DOWN to add the process to a queue;
    • +
    • MY_IOCTL_UP to remove the process from a queue.
    • +
    +
  2. +
  3. Fill the device structure with a wait_queue_head_t field and a flag.
  4. +
  5. Do not forget to initialize the wait queue and flag.
  6. +
  7. Remove exclusive access condition from previous exercise
  8. +
  9. For testing, pass the required command line arguments to the +user-space program.
  10. +
+
+

When the process is added to the queue, it will remain blocked in execution; To +run the queue command open a new console in the virtual machine with Alt+F2 ; +You can return to the previous console with Alt+F1. If you're connected via +SSH to the virtual machine, open a new console.

+
+

Note

+

Read the ioctl and Waiting queues sections in the lab.

+
+
+
+

O_NONBLOCK implementation

+
+

Note

+

If a file is open with the O_NONBLOCK flag, then its +operations will be non-blocking.

+

In case data is not available when performing a read, the following +happens:

+
+
    +
  • if the file has been open with O_NONBLOCK, the read call +will return -EWOULDBLOCK.
  • +
  • otherwise, the current task (process) will be placed in a waiting +queue and will be unblocked as soon as data becomes available +(in our case, at write).
  • +
+
+
+
    +
  • To allow unblocking the read operation, remove the exclusive access +condition from previous exercises.
  • +
  • You can use the queue defined for the previous exercise.
  • +
  • You can ignore the file offset.
  • +
  • Modify the initial size of data to 0, to allow testing.
  • +
  • For testing, pass the required command line arguments to the +user-space program.
      +
    • when using the n option, the test program will change the open flags +to O_NONBLOCK and then perform a read.
    • +
    +
  • +
  • What are the flags used to open the file when running cat /dev/so2_cdev?
  • +
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/device_model.html b/refs/pull/405/merge/labs/device_model.html new file mode 100644 index 00000000..f4618913 --- /dev/null +++ b/refs/pull/405/merge/labs/device_model.html @@ -0,0 +1,1311 @@ + + + + + + Linux Device Model — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Linux Device Model

+
+

Overview

+

Plug and Play is a technology that offers support for automatically adding and +removing devices to the system. This reduces conflicts with the resources they +use by automatically configuring them at system startup. In order to achieve +these goals, the following features are required:

+
+
    +
  • Automatic detection of adding and removing devices in the system (the device +and its bus must notify the appropriate driver that a configuration change +occurred).
  • +
  • Resource management (addresses, irq lines, DMA channels, memory areas), +including resource allocation to devices and solving conflicts that may arise.
  • +
  • Devices must allow for software configuration (device resources - ports, +interrupts, DMA resources - must allow for driver assignment).
  • +
  • The drivers required for new devices must be loaded automatically by the +operating system when needed.
  • +
  • When the device and its bus allow, the system should be able to add or +remove the device from the system while it is running, without having to reboot +the system (hotplug).
  • +
+
+

For a system to support plug and play, the BIOS, operating system and the device +must support this technology. The device must have an ID that it will provide to the +driver for identification, and the operating system must be able to identify +these configuration changes as they appear.

+

Plug and play devices are: PCI devices (network cards), USB (keyboard, mouse, +printer), etc.

+

Prior to version 2.6, the kernel did not have a unified model to get +information about devices. +For this reason, a model for Linux devices, Linux Device Model, was developed.

+

The primary purpose of this model is to maintain internal data structures that +reflect the state and structure of the system. Such information includes what +devices are in the system, how they are in terms of power management, what bus +they are attached to, what drivers they have, along with the structure of the +buses, devices, drivers in the system.

+

To maintain this information, the kernel uses the following entities:

+
+
    +
  • device - a physical device that is attached to a bus
  • +
  • driver - a software entity that can be associated with a device and performs +operations with it
  • +
  • bus - a device to which other devices can be attached
  • +
  • class - a type of device that has a similar behavior; There is a class for +disks, partitions, serial ports, etc.
  • +
  • subsystem - a view on the structure of the system; Kernel subsystems +include devices (hierarchical view of all devices in the system), buses (bus +view of devices according to how they are attached to buses), classes, etc.
  • +
+
+
+
+

sysfs

+

The kernel provides a representation of its model in userspace through the +sysfs virtual file system. It is usually mounted in the /sys directory and +contains the following subdirectories:

+
+
    +
  • block - all block devices available in the system (disks, partitions)
  • +
  • bus - types of bus to which physical devices are connected (pci, ide, usb)
  • +
  • class - driver classes that are available in the system (net, sound, usb)
  • +
  • devices - the hierarchical structure of devices connected to the system
  • +
  • firmware - information from system firmware (ACPI)
  • +
  • fs - information about mounted file systems
  • +
  • kernel - kernel status information (logged-in users, hotplug)
  • +
  • module - the list of modules currently loaded
  • +
  • power - information related to the power management subsystem
  • +
+
+

As you can see, there is a correlation between the kernel data structures +within the described model and the subdirectories in the sysfs virtual file +system. Although this likeness may lead to confusion between the two concepts, +they are different. The kernel device model can work without the sysfs file +system, but the reciprocal is not true.

+

The sysfs information is found in files that contain an attribute. Some +standard attributes (represented by files or directories with the same name) +are as follows:

+
+
    +
  • dev - Major and minor device identifier. It can be used to automatically +create entries in the /dev directory
  • +
  • device - a symbolic link to the directory containing devices; It can be +used to discover the hardware devices that provide a particular service (for +example, the ethi PCI card)
  • +
  • driver - a symbolic link to the driver directory (located in +/sys/bus/*/drivers )
  • +
+
+

Other attributes are available, depending on the bus and driver used.

+../_images/ditaa-a5f399cb84561893770eb45ceeb827ce6d4a2336.png +
+
+

Basic Structures in Linux Devices

+

Linux Device Model provides a number of structures to ensure the interaction +between a hardware device and a device driver. The whole model is based on +kobject structure. Hierarchies are built using this structure and the following +structures are implemented:

+
+
    +
  • struct bus_type
  • +
  • struct device
  • +
  • struct device_driver
  • +
+
+../_images/ditaa-f7ee56960e76c3e80fcbe59fafa38c3d93eac261.png +
+

The kobject structure

+

A kobject structure does not perform a single function. This structure is +usually integrated into a larger one. A kobject structure actually +incorporates a set of features that will be offered to a higher abstraction +object in the Linux Device Model hierarchy.

+

For example, the cdev structure has the following definition:

+
struct cdev {
+        struct kobject kob;
+        struct module *owner;
+        const struct file_operations *ops;
+        struct list_head list;
+        dev_t dev;
+        unsigned int count;
+};
+
+
+

Note that this structure includes a kobject structure field.

+

A kobject structure is defined as follows:

+
struct kobject {
+        const char              *name;
+        struct list_head        entry;
+        struct kobject          *parent;
+        struct kset             *kset;
+        struct kobj_type        *ktype;
+        struct sysfs_dirent     *sd;
+        struct kref             kref;
+        unsigned int state_initialized:1;
+        unsigned int state_in_sysfs:1;
+        unsigned int state_add_uevent_sent:1;
+        unsigned int state_remove_uevent_sent:1;
+        unsigned int uevent_suppress:1;
+};
+
+
+

As we can see, the kobject structures are in a hierarchy: an object has a +parent and holds a kset member, which contains objects on the same level.

+

Working with the structure involves initializing it with the +kobject_init() function. +Also in the initialization process it is necessary to set the name of the +kobject structure, which will appear in sysfs, using the +kobject_set_name() function.

+

Any operation on a kobject is done by incrementing its internal counter using +kobject_get(), or decrementing if it is no longer used using +kobject_put(). +Thus, a kobject object will only be released when its internal counter reaches 0. +A method of notifying this is needed so that the resources associated with the +device structure which included the kobject structure are released +(for example, cdev). +The method is called release and is associated with the object via the ktype +field (struct kobj_type).

+

The kobject structure is the basic structure of the Linux Device Model. +The structures in the higher levels of the model are struct bus_type, +struct device and struct device_driver.

+
+
+

Buses

+

A bus is a communication channel between the processor and an input/output +device. To ensure that the model is generic, all input/output devices are +connected to the processor via such a bus (even if it can be a virtual one +without a physical hardware correspondent).

+

When adding a system bus, it will appear in the sysfs file system in +/sys/bus. +As with kobjects, buses can be organized into hierarchies and will be represented +in sysfs.

+

In the Linux Device Model, a bus is represented by the structure +struct bus_type:

+
struct bus_type {
+        const char              *name;
+        const char              *dev_name;
+        struct device           *dev_root;
+        struct bus_attribute    *bus_attrs;
+        struct device_attribute *dev_attrs;
+        struct driver_attribute *drv_attrs;
+        struct subsys_private *p;
+
+        int             (*match)(struct device *dev, struct device_driver *drv);
+        int             (*uevent)(struct device *dev, struct kobj_uevent_env *env);
+        int             (*probe)(struct device *dev);
+        int             (*remove)(struct device *dev);
+        //...
+};
+
+
+

It can be noticed that a bus has a name, lists of default attributes, a number +of specific functions, and the driver's private data. +The uevent function (formerly hotplug) is used with hotplug devices.

+

Bus operations are the registration, the implementation of the operations +described in the struct bus_type structure and the iteration and +inspection of the devices connected to the bus.

+

A bus is registered using bus_register(), and unregistered using +bus_unregister().

+

Implementation example:

+
#include <linux/device.h>
+/* mybus.c */
+
+//bus type
+struct bus_type my_bus_type = {
+  .name   = "mybus",
+  .match  = my_match,
+  .uevent = my_uevent,
+};
+
+static int __init my_bus_init(void)
+{
+  int err;
+
+  //...
+  err = bus_register(&my_bus_type);
+  if (err)
+    return err;
+  //...
+}
+
+static void __exit my_bus_exit(void)
+{
+  //...
+  bus_unregister(&my_bus_type);
+  //...
+}
+
+
+

The functions that will normally be initialized within a bus_type structure are +match and uevent:

+
#include <linux/device.h>
+#include <linux/string.h>
+/* mybus.c */
+
+// match devices to drivers; just do a simple name test
+static int my_match(struct device *dev, struct device_driver *driver)
+{
+  return !strncmp(dev_name(dev), driver->name, strlen(driver->name));
+}
+
+// respond to hotplug user events; add environment variable DEV_NAME
+static int my_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+  add_uevent_var(env, "DEV_NAME=%s", dev_name(dev));
+  return 0;
+}
+
+
+

The match function is used when a new device or a new driver is added to the +bus. Its role is to make a comparison between the device ID and the driver ID. +The uevent function is called before generating a hotplug event in user-space and +has the role of adding environment variables.

+

Other possible operations on a bus are iterating over the drivers or devices +attached to it. +Although we cannot directly access them (lists of drivers and devices +being stored in the private data of the driver, the subsys_private *p field), +these can be iterated using the bus_for_each_dev() and +bus_for_each_drv() functions.

+

The Linux Device Model interface allows you to create attributes for the +associated objects. These attributes will have a corresponding file in the +bus subdirectory in sysfs. The attributes associated with a bus are +described by the bus_attribute structure :

+
struct bus_attribute {
+         struct attribute        attr;
+         ssize_t (*show)(struct bus_type *, char *buf);
+         ssize_t (*store)(struct bus_type *, const char *buf, size_t count);
+};
+
+
+

Typically, an attribute is defined by the BUS_ATTR macro. +The bus_create_file() and bus_remove_file() functions can be +used to add/delete an attribute within the bus structure.

+

An example of defining an attribute for my_bus is shown below:

+
/* mybus.c */
+
+#define MY_BUS_DESCR     "SO2 rules forever"
+
+// export a simple bus attribute
+static ssize_t my_show_bus_descr(struct bus_type *bus, char *buf)
+{
+        return snprintf(buf, PAGE_SIZE, "%s\n", MY_BUS_DESCR);
+}
+
+/*
+ * define attribute - attribute name is descr;
+ * full name is bus_attr_descr;
+ * sysfs entry should be /sys/bus/mybus/descr
+ */
+BUS_ATTR(descr, 0444, my_show_bus_descr, NULL);
+
+// specify attribute - in module init function
+static int __init my_bus_init(void)
+{
+        int err;
+        //...
+        err = bus_create_file(&my_bus_type, &bus_attr_descr);
+        if (err) {
+                /* handle error */
+        }
+        //...
+}
+
+static void __exit my_bus_exit(void)
+{
+        //...
+        bus_remove_file(&my_bus_type, &bus_attr_descr);
+        //...
+}
+
+
+

The bus is represented by both a bus_type object and a device object, +as we will see later (the bus is also a device).

+
+
+

Devices

+

Any device in the system has a struct device structure associated +with it. +Devices are discovered by different kernel methods (hotplug, device drivers, +system initialization) and are registered in the system. Each device present in +the kernel has an entry in /sys/devices.

+

At the lowest level, a device in Linux Device Model is represented by a +struct device structure:

+
struct device {
+        //...
+        struct device           *parent;
+        struct device_private   *p;
+        struct kobject          kobj;
+
+        const char              *init_name; /* initial name of the device */
+        //...
+        struct bus_type         *bus;       /* type of bus device is on */
+        struct device_driver    *driver;    /* which driver has allocated this
+                                             device */
+        //...
+        void    (*release)(struct device *dev);
+};
+
+
+

Structure fields include the parent device that is usually a controller, the +associated kobject, the bus it is connected to, the device driver, and a +function called when the device counter reaches 0 (release).

+

As usual, we have the registration/unregistration functions +device_register() and device_unregister().

+

To work with attributes, we have structure struct device_attribute, +the macro DEVICE_ATTR for definition, and the functions +device_create_file() and device_remove_file() for adding/removing +the attribute to/from the device.

+

One important thing to note is that the struct device structure is +usually not used directly, but it is added to another structure. For example:

+
// my device type
+struct my_device {
+    char *name;
+    struct my_driver *driver;
+    struct device dev;
+};
+
+
+

Typically, a bus driver will export functions to add or remove such a +device, as shown below:

+
/* mybus.c */
+
+/* BUS DEVICE (parent) */
+
+// parent device release
+static void my_bus_device_release(struct device *dev)
+{
+}
+
+// parent device
+static struct device my_bus_device = {
+  .init_name   = "mybus0",
+  .release     = my_bus_device_release
+};
+
+/* DEVICE */
+
+/*
+ * as we are not using the reference count, we use a no-op
+ * release function
+ */
+static void my_dev_release(struct device *dev)
+{
+}
+
+int my_register_device(struct my_device *mydev)
+{
+  mydev->dev.bus = &my_bus_type;
+  mydev->dev.parent = &my_bus_device;
+  mydev->dev.release = my_dev_release;
+  dev_set_name(&mydev->dev, mydev->name);
+
+  return device_register(&mydev->dev);
+}
+
+void my_unregister_device(struct my_device *mydev)
+{
+  device_unregister(&mydev->dev);
+}
+
+/* export register/unregister device functions */
+EXPORT_SYMBOL(my_register_device);
+EXPORT_SYMBOL(my_unregister_device);
+
+
+

As seen, the functions my_register_device and my_unregister_device, used +to add/remove a device to/from a bus, are defined in the same file where the +bus is defined. Device structures are not initialized; they will be initialized +when the devices are discovered by the system (by hotplug or direct registration +from driver) and the function my_register_device will be called to add a +device to the bus.

+

To use the bus defined above in the driver implementation, we must define a +structure of type my_device, initialize it and register it using the function +exported by the bus (my_register_device).

+
/* mydriver.c */
+
+static struct my_device mydev;
+char devname[NAME_SIZE];
+//...
+
+//register
+int err;
+
+sprintf(devname, "mydev0");
+mydev.name = devname;
+mydev.driver = &mydriver;
+dev_set_drvdata(&mydev.dev, &mydev);
+err = my_register_device(&mydev);
+if (err < 0) {
+  /*handle error */
+}
+
+//..
+
+//unregister
+my_unregister_device(&mydev);
+
+
+
+
+

Drivers

+

Linux Device Model is used to allow simple association between system +devices and drivers. Drivers can export information independent of the physical +device.

+

In sysfs, driver information has no single subdirectory associated; it can be +found in the directory structure in different places: the loaded module is in +/sys/module, in /sys/devices you can find the driver associated with +each device, in /sys/class the drivers belonging to a class, in +/sys/bus the drivers associated to each bus.

+

A device driver is identified by the structure struct device_driver:

+
struct device_driver {
+         const char              *name;
+         struct bus_type         *bus;
+
+         struct driver_private   *p;
+
+         struct module           *owner;
+         const char              *mod_name;     /* used for built-in modules */
+
+         int     (*probe)        (struct device *dev);
+         int     (*remove)       (struct device *dev);
+         void    (*shutdown)     (struct device *dev);
+         int     (*suspend)      (struct device *dev, pm_message_t state);
+         int     (*resume)       (struct device *dev);
+};
+
+
+

Among the structure fields we find the name of the driver (appears in sysfs), +the bus with which the driver works, and functions called at various times in a +device's operation.

+

As before, we have the functions driver_register() and +driver_unregister() to register/unregister a driver.

+

To work with attributes, we have the struct driver_attribute structure, +the macro DRIVER_ATTR for definition, and the functions +driver_create_file() and driver_remove_file() for +adding/removing the attribute to/from the driver.

+

As with devices, the structure struct device_driver is usually +incorporated into another structure specific to a particular bus (PCI, USB, etc.):

+
/* mybus.c */
+
+// my driver type
+struct my_driver {
+  struct module *module;
+  struct device_driver driver;
+};
+
+#define to_my_driver(drv) container_of(drv, struct my_driver, driver);
+
+int my_register_driver(struct my_driver *driver)
+{
+  int err;
+
+  driver->driver.bus = &my_bus_type;
+  err= driver_register(&driver->driver);
+  if (err)
+    return err;
+  return 0;
+}
+
+void my_unregister_driver(struct my_driver *driver)
+{
+  driver_unregister(&driver->driver);
+}
+
+/* export register/unregister driver functions */
+EXPORT_SYMBOL(my_register_driver);
+EXPORT_SYMBOL(my_unregister_driver);
+
+
+

Driver registration/unregistration operations are exported for use in +other modules.

+

As for devices, the operations for drivers are defined when the bus is +initialized and they are exported to be used by drivers. When implementing a +driver that works with devices attached to the bus, we will call the functions +my_register_driver and my_unregister_driver to associate with the bus.

+

To use the functions (in the driver implementation), we must declare a structure +of type my_driver, initialize it and register using the function exported +by the bus.

+
/* mydriver.c */
+
+static struct my_driver mydriver = {
+  .module = THIS_MODULE,
+  .driver = {
+    .name = "mydriver",
+  },
+};
+//...
+
+//register
+int err;
+err = my_register_driver(&mydriver);
+if (err < 0) {
+  /*handle error */
+}
+//..
+
+//unregister
+my_unregister_driver(&mydriver);
+
+
+
+
+

Classes

+

A class is a high-level view of the Linux Device Model, which abstracts +implementation details. For example, there are SCSI drivers and ATA +drivers, but all belong to the class of disks. Classes provide a grouping of +devices based on functionality, not how they are connected or how they work. +Classes have a correspondent in /sys/class.

+

There are two main structures that describe the classes: struct class +and struct device. +The class structure describes a generic class, while the structure +struct device describes a class associated with a device. +There are functions for initializing/deinitializing and adding attributes for each +of these, described in include/linux/device.h.

+

The advantage of using classes is that the udev program in userspace, which we +will discuss later, allows the automatic creation of devices in the /dev +directory based on class information.

+

For this reason, we will continue to present a small set of functions that work +with classes to simplify the use of the plug and play mechanism.

+

A generic class is described by the struct class structure:

+
struct class {
+         const char              *name;
+         struct module           *owner;
+         struct kobject          *dev_kobj;
+
+         struct subsys_private   *p;
+
+         struct class_attribute          *class_attrs;
+         struct class_device_attribute   *class_dev_attrs;
+         struct device_attribute         *dev_attrs;
+
+         int     (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env);
+         void    (*class_release)(struct class *class);
+         void    (*dev_release)(struct device *dev);
+         //...
+};
+
+
+

The class_register() and class_unregister() functions can be +used for initialization/deinitialization.

+
static struct class my_class = {
+        .name = "myclass",
+};
+
+static int __init my_init(void)
+{
+        int err;
+        //...
+        err = class_register(&my_class);
+        if (err < 0) {
+                /* handle error */
+        }
+        //...
+}
+
+static void __exit my_cleanup(void)
+{
+        //...
+        class_unregister(&my_class);
+        //...
+}
+
+
+

A class associated with a device is described by the struct device +structure. +The device_create() and device_destroy() functions can be used +for initialization/deinitialization. +The device_create() function initializes the device structure, +and assigns the generic class structure and the device received as a +parameter to it. +In addition, it will create an attribute of the class, dev, which contains +the major and minor of the device (major:minor). +Thus, the udev utility in user mode can read the necessary data from this attribute +file to create a node in the /dev directory by calling mknod.

+

An example of initialization:

+
struct device* my_classdev;
+struct cdev cdev;
+struct device dev;
+
+//init class for device cdev.dev
+my_classdev = device_create(&my_class, NULL, cdev.dev, &dev, "myclass0");
+
+//destroy class for device cdev.dev
+device_destroy(&my_class, cdev.dev);
+
+
+

When a new device is discovered, a class and a node will be assigned to it and +a node will be created in the /dev directory. +For the example above, the node /dev/myclass0 will be generated.

+
+
+

Hotplug

+

Hotplug describes the mechanism for adding or removing a device from the +system while it is running without having to reboot the system.

+

A hotplug event is a notification from the kernel to the user-space when something +changes in the system configuration. These events are generated when creating +or removing a kobject from the kernel. Since these objects are the basis of the +Linux Device Model, being included in all structures (struct bus_type, +struct device, struct device_driver, struct class, etc.), a hotplug event +will be generated when any of these structures is created or removed (uevent).

+

When a device is discovered in the system, an event is generated. Depending on +the point where it resides in Linux Device Model, the functions corresponding +to the event will be called (usually, the uevent function associated to the +bus or the class). Using these functions, the driver has the ability to set +system variables for the user-space. +The generated event then reaches the user-space. Here is the udev +utility that captures these events. There are configuration files for this +utility in the /etc/udev/ directory. Different rules can be specified to +capture only certain events and perform certain actions, depending on the +system variables set in the kernel or in uevent functions.

+

An important consequence is that in this way the plug and play mechanism can be +achieved; with the help of udev and the classes (described above), entries +in the /dev/ directories can be automatically created for devices, and using +udev drivers can be automatically loaded for a device.

+

Rules for udev are located in /etc/udev/rules.d. +Any file that ends with .rules in this directory will be parsed when an +event occurs. For more details on how to write rules in these files see +Writing udev rules. +For testing, there are utilities such as udevmonitor, udevinfo and +udevtest.

+

For a quick example, consider the situation where we want to automatically load +a driver for a device when an event occurs. We can create a new file, +/etc/udev/rules.d/myrules.rules, containing the following line:

+
SUBSYSTEM=="pnp", ATTRS{id}=="PNP0400", RUN+="/sbin/insmod /root/mydriver.ko"
+
+
+

This will choose from the events generated only those belonging to the pnp +subsystem (connected to PNP bus) and having an id attribute with the value +PNP0400.

+

When this rule is matched, the command specified under RUN will be +executed to insert the appropriate driver in the kernel.

+
+
+
+

Plug and Play

+

As noted above, in the Linux Device Model all devices are connected by a bus, whether +it has corresponding physical hardware or is virtual.

+

The kernel already has implemented most buses using a bus_type structure +and functions to register/unregister drivers and devices. +To implement a driver, we must first determine the bus to which the supported +devices are connected and use the structures and functions exported by this bus. +The main buses are PCI, USB, PNP, IDE, SCSI, platform, +ACPI, etc.

+
+

PNP bus

+

The plug and play mechanism provides a means of detecting and setting the resources +for legacy or otherwise unconfigurable devices. All plug and play +drivers, protocols and services are based on the Plug and Play layer. It is responsible +for the exchange of information between drivers and protocols. The following +protocols are available:

+
+
    +
  • PNPBIOS - used for system devices such as serial and parallel ports
  • +
  • ISAPNP - offers support for the ISA bus
  • +
  • ACPI - offering, among other things, information about system-level devices
  • +
+
+

The kernel contains a bus, called pnp_bus, to which +many drivers connect. +The implementation of the bus and the way of working with it follow the Linux Device Model and +are very similar to what we discussed above.

+

The main functions and structures exported by the bus, which can be used by +drivers, are:

+
+
    +
  • struct pnp_driver - driver type associated to the bus
  • +
  • pnp_register_driver() - function used to register a PNP driver in the system
  • +
  • pnp_unregister_driver() - function used to unregister a PNP driver from the system
  • +
+
+

As noted in previous sections, the bus has a function called match used to +associate the devices with the appropriate drivers. +For example, when discovering a new device, the bus searches for a driver which meets the condition +given by the match function with regard to the new device. Usually, this +condition is a comparison of IDs (driver id and device id). +A common approach is using a static table in each driver, which holds information +about the devices supported by the driver, which will be used by the bus +when verifying the condition. For example, for a parallel port device we have +the table parport_pc_pnp_tbl:

+
static const struct pnp_device_id parport_pc_pnp_tbl[] = {
+         /* Standard LPT Printer Port */
+         {.id = "PNP0400", .driver_data = 0},
+         /* ECP Printer Port */
+         {.id = "PNP0401", .driver_data = 0},
+};
+
+MODULE_DEVICE_TABLE(pnp, parport_pc_pnp_tbl);
+
+
+

Each driver declares and initializes a structure pnp_driver, such as +parport_pc_pnp_driver:

+
static int parport_pc_pnp_probe(struct pnp_dev *dev, const struct pnp_id *card_id,
+                                                     const struct pnp_id *dev_id);
+static void parport_pc_pnp_remove(struct pnp_dev* dev);
+
+static struct pnp_driver parport_pc_pnp_driver = {
+         .name           = "parport_pc",
+         .id_table       = parport_pc_pnp_tbl,
+         .probe          = parport_pc_pnp_probe,
+         .remove         = parport_pc_pnp_remove,
+};
+
+
+

We can notice that the structure has as fields a pointer to the table declared +above and two functions, which are called when a new device is detected and when +it is removed from the system. +As all the structures presented above, the driver must be registered to the +system:

+
static int __init parport_pc_init(void)
+{
+        err = pnp_register_driver(&parport_pc_pnp_driver);
+        if (err < 0) {
+                /* handle error */
+        }
+}
+
+static void __exit parport_pc_exit(void)
+{
+        pnp_unregister_driver(&parport_pc_pnp_driver);
+}
+
+
+
+
+

PNP operations

+

So far we have discussed the Linux Device Model and its API. To +implement a plug and play driver, we must respect the Linux Device Model.

+

Most often, adding a bus in the kernel is not necessary, as most of the existing +buses are already implemented (PCI, USB, etc.). Thus, we must first identify the +bus to which the device is attached. +In the examples below, we will consider that this bus is the PNP bus and we will +use the structures and functions described above.

+../_images/ditaa-4e1f9758808dba9e61bc0e48faf4365d377f9d32.png +
+
+

Adding a driver

+

In addition to the usual operations, a driver must follow the Linux Device Model. +Thus, it will be registered in the system using the functions provided by +the bus for this purpose. +Usually, the bus provides a particular driver structure containing a +struct device_driver structure, that the driver must initialize and +register using a function *_register_driver. +For example, for the PNP bus, the driver must declare and initialize a +structure of type struct pnp_driver and register it using +pnp_register_driver:

+
static struct pnp_driver my_pnp_driver = {
+         .name           = "mydriver",
+         .id_table       = my_pnp_tbl,
+         .probe          = my_pnp_probe,
+         .remove         = my_pnp_remove,
+};
+
+static int __init my_init(void)
+{
+        err = pnp_register_driver(&my_pnp_driver);
+}
+
+
+

Unlike legacy drivers, plug and play drivers don't register devices at +initialization in the init function (my_init in the example above) using +register_device().

+

As described above, each bus has a match function which is called when a new +device is detected in the system to determine the associated driver. +Thus, there must be a way for each driver to export information about the +devices it supports, to allow this check to pass and have its functions further +called. +In the examples presented in this lab, the match function does a simple +comparison between the device name and the driver name. Most drivers use a table +containing information about the supported devices and store a pointer to this table in the +driver structure. +For example, a driver associated to a PNP bus defines a table of type +struct pnp_device_id and initializes the field id_table from the +structure pnp_driver my_pnp_driver with a pointer to it:

+
static const struct pnp_device_id my_pnp_tbl[] = {
+         /* Standard LPT Printer Port */
+         {.id = "PNP0400", .driver_data = 0},
+         /* ECP Printer Port */
+         {.id = "PNP0401", .driver_data = 0},
+         { }
+};
+
+MODULE_DEVICE_TABLE(pnp,my_pnp_tbl);
+
+static struct pnp_driver my_pnp_driver = {
+         //...
+         .id_table       = my_pnp_tbl,
+         //...
+};
+
+
+

In the example above, the driver supports multiple parallel port devices, +defined in the table my_pnp_tbl. This information is used by the bus in +the match_device function. +When adding a driver, the bus driver will be associated to it and new entries +in sysfs will be created based on the driver name. +Then the bus match function will be called for every supported device, +to associate the driver with any connected device that it supports.

+
+
+

Removing a driver

+

To remove a driver from the kernel, in addition to operations required for a +legacy driver, we must unregister the device_driver structure. +For a driver associated with the PNP bus, we must unregister the pnp_driver +structure using the pnp_unregister_driver() function:

+
static struct pnp_driver my_pnp_driver;
+
+static void __exit my_exit(void)
+{
+        pnp_unregister_driver(&my_pnp_driver);
+}
+
+
+

Unlike legacy drivers, plug and play drivers don't unregister devices in the +module unload function (my_exit). When a driver is removed, all the +references to it will be removed for all the devices it supports, and entries +from sysfs will also be removed.

+
+
+

Adding a new device

+

As we saw above, plug and play drivers do not register devices at initialization. +This operation will take place in the probe function, which is called when +a new device is detected. A device attached to the PNP bus will be added to +the system by the function probe from the pnp_driver structure:

+
static int my_pnp_probe(struct pnp_dev *dev, const struct pnp_id *card_id,
+                                             const struct pnp_id *dev_id) {
+        int err, iobase, nr_ports, irq;
+
+        //get irq & ports
+        if (pnp_irq_valid(dev, 0))
+                irq = pnp_irq(dev, 0);
+        if (pnp_port_valid(dev, 0)) {
+                iobase = pnp_port_start(dev, 0);
+        } else
+                return -ENODEV;
+        nr_ports = pnp_port_len(dev, 0);
+
+        /* register device dev */
+}
+
+static struct pnp_driver my_pnp_driver = {
+         //...
+         .probe          = my_pnp_probe,
+         //...
+};
+
+
+

Upon detection of a device in the kernel (at boot or by the insertion of the +device through hotplug), an interrupt is generated and reaches the bus +driver. +The device is registered using the function device_register() and it is +attached to the bus. A call to the user space will also be generated, and the +event can be treated by udev. Then, the list of drivers associated with the +bus is iterated and the match function is called for each of them. +The match function tries to find a driver for the new device. After a +suitable driver is found for the device, the probe function of the driver +is called. If the function ends successfully, the device is added to the driver's +list of devices and new entries are created in sysfs based on the device name.

+
+
+

Removing a device

+

As we saw above, the plug and play drivers don't unregister devices when the +driver is unloaded. This operation is done in the remove function, which +is called when a device is removed from the system. +In case of a device attached to the PNP bus, the unregister will be done +in the remove function specified in the pnp_driver structure:

+
static void my_pnp_remove(struct pnp_dev *dev) {
+         /* unregister device dev */
+}
+
+static struct pnp_driver my_pnp_driver = {
+         //...
+         .remove         = my_pnp_remove,
+};
+
+
+

As seen in the example above, when the removal of a device is detected, the +my_pnp_remove function is called. A user-space call is also generated, which +can be detected by udev, and entries are removed from sysfs.

+
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is device_model. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/device_model/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Find the definitions of the following symbols in the Linux kernel:

+
+
    +
  • functions dev_name, dev_set_name.
  • +
  • functions pnp_device_probe, pnp_bus_match, pnp_register_driver +and the pnp_bus_type variable.
  • +
+
+
+
+

1. Bus implementation

+

Analyze the contents of the bex.c, a module that implements a bus +driver. Follow the comments marked with TODO 1 and implement the missing +functionality: register the bus driver and add a new device named root +with type none and version 1.

+
+

Hint

+

See bex_add_dev().

+
+
+

Hint

+

The register and unregister must be done using bus_register() +and bus_unregister().

+
+

Load the module and verify that the bus is visible in /sys/bus. Verify +that the device is visible in /sys/bus/bex/devices.

+

Remove the module and notice that the sysfs entries are removed.

+
+
+

2. Add type and version device attributes

+

Add two read-only device attributes, type and version. Follow the +TODO 2 markings.

+
+

Hint

+

You will need to add the two attributes in the structure +bex_dev_attrs, as follows:

+

&dev_attr_<insert-attribute-type-here>.attr,

+
+
+

Hint

+

A possible implementation for the show function is the following:

+
static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+  struct bex_device *bex_dev = to_bex_device(dev);
+
+  return sprintf(buf, "%s\n", bex_dev->type);
+}
+DEVICE_ATTR_RO(type);
+
+
+
+

Observe that two new attributes are visible in +/sys/bus/bex/devices/root. Check the contents of these attributes.

+
+
+

3. Add del and add bus attributes

+

Add two write-only bus attributes, del and add. del expects the name +of a device to delete, while add expects the name, type and version to +create a new device. Follow the TODO 3 markings and review +Buses.

+
+

Hint

+

Use sscanf() to parse the input from sysfs and +bex_del_dev() and bex_add_dev() to delete +and create a new device.

+
+

An example for the store function is the following:

+
static ssize_t add_store(struct bus_type *bt, const char *buf, size_t count)
+{
+  char name[32];
+  int ret;
+
+  ret = sscanf(buf, "%31s", name);
+  if (ret != 1)
+    return -EINVAL;
+
+  ...
+}
+BUS_ATTR(add, S_IWUSR, NULL, add_store);
+
+
+
+

Hint

+

The store functions should return 0 if +bex_add_dev/bex_del_dev fail and count otherwise.

+
+

Create a new device and observe that it is visible in +/sys/bus/bex/devices. Delete it and observe that it disappears from sysfs.

+
+

Hint

+

Use echo to write into the bus attributes:

+
$ echo "name type 1" > /sys/bus/bex/add
+$ echo "name" > /sys/bus/bex/del
+
+
+
+
+
+

4. Register the bex misc driver

+

Modify bex_misc.c so that it registers the driver with the bex +bus. Insert the bex_misc.ko module and create a new bex device from +sysfs with the name "test", type "misc", version 2. Follow the TODO +4 markings.

+

Observe that the driver is visible in /sys/bus/bex/drivers.

+

Why isn't the probe function called?

+
+

Hint

+

Notice that the bus match function in bex.c is not +implemented.

+
+

Implement the bus matching function in bex.c. Follow the TODO 5 +markings. Try again to create a new bex device and observe that this +time the probe function from the bex_misc driver is called.

+
+
+

5. Register misc device in the bex_misc probe function

+

Modify bex_misc.c to refuse probing if version > 1. Also, register the +defined misc device in bex_misc_probe and deregister it in +bex_misc_remove. Follow the TODO 6 markings.

+
+

Hint

+

Use misc_register() and misc_deregister().

+
+

Create a new device with the name "test", type "misc" and version 2 +and observe that the probe fails. Create a new device with the name +"test2", type "misc" and version 1 and observe that the probe is +successful.

+

Inspect /sys/bus/bex/devices/test2 and observe that we have a new +entry. Identify the major and minor for the misc device, create a +character device file and try to read and write from the misc device +buffer.

+
+

Hint

+

The major and minor should be visible in the dev attribute +of the misc device

+
+
+
+

6. Monitor uevent notifications

+

Use the udevadm monitor command and observe what happens when:

+
    +
  • the bex.ko and bex_misc.ko modules are inserted
  • +
  • a new device with the type "type" is created
  • +
  • a new device with the type "misc" and version 2 is created
  • +
  • a new device with the type "misc" and version 1 is created
  • +
  • all of the above are removed
  • +
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/filesystems_part1.html b/refs/pull/405/merge/labs/filesystems_part1.html new file mode 100644 index 00000000..593bfb35 --- /dev/null +++ b/refs/pull/405/merge/labs/filesystems_part1.html @@ -0,0 +1,958 @@ + + + + + + File system drivers (Part 1) — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

File system drivers (Part 1)

+
+

Lab objectives

+
+
    +
  • acquiring knowledge about the Virtual Filesystem (VFS) in Linux and understanding concepts regarding 'inode', 'dentry', 'file', superblock and data block.
  • +
  • understanding the process of mounting a file system inside VFS.
  • +
  • knowledge regarding various file system types and understanding differences between file systems with physical support (on disk) and the ones without physical support.
  • +
+
+
+
+

Virtual Filesystem (VFS)

+

The Virtual Filesystem (also known as VFS) is a component of the kernel that handles all system calls related to files and file systems. +VFS is a generic interface between the user and a particular file system. +This abstraction simplifies the implementation of file systems and provides an easier integration of multiple file systems. This way, the implementation of a file system is accomplished by using the API provided by the VFS, and the generic hardware and I/O subsystem communication parts are handled by VFS.

+

From a functional point of view, file systems can be grouped into:

+
+
    +
  • disk file systems (ext3, ext4, xfs, fat, ntfs, etc.)
  • +
  • network file systems (nfs, smbfs/cifs, ncp, etc.)
  • +
  • virtual filesystems (procfs, sysfs, sockfs, pipefs, etc.)
  • +
+
+

A Linux kernel instance will use VFS for the hierarchy (a tree) of directories and files. +A new file system will be added as a VFS subtree using the mount operation. +A file system is usually mounted from the environment for which it was built (from a block type device, from network, etc.). +In particular, however, the VFS can use a normal file as a virtual block device, so it is possible to mount disk file systems over normal files. This way, stacks of file systems can be created.

+

The basic idea of VFS is to provide a single file model that can represent files from any file system. +The file system driver is responsible for bringing its particular representation to this common denominator. +This way the kernel can create a single directory structure that contains the entire system. +There will be a file system that will be the root, the rest being mounted in its various directories.

+
+
+

The general file system model

+

The general file system model, to which any implemented file system needs to be reduced, consists of several well-defined entities: superblock, inode, file, and dentry. +These entities are file system metadata (they contain information about data or other metadata).

+

Model entities interact using some VFS or kernel subsystems: dentry cache, inode cache, buffer cache. +Each entity is treated as an object: it has an associated data structure and a pointer to a table of methods. The induction of particular behavior for each component is done by replacing the associated methods.

+
+

superblock

+

The superblock stores the information needed for a mounted file system:

+
+
    +
  • inode and blocks locations
  • +
  • file system block size
  • +
  • maximum filename length
  • +
  • maximum file size
  • +
  • the location of the root inode
  • +
+
+
+

Localization:

+
+
    +
  • In the case of disk file systems, the superblock has a correspondent in the first block of the disk. (Filesystem Control Block).
  • +
  • In VFS, all superblocks of filesystems are retained in a list of structures of type struct super_block and the methods in structures of type struct super_operations.
  • +
+
+
+
+
+

inode

+

The inode (index node) keeps information about a file in the general sense (abstraction): regular file, directory, special file (pipe, fifo), block device, character device, link, or anything that can be abstracted as a file.

+

An inode stores information like:

+
+
    +
  • file type;
  • +
  • file size;
  • +
  • access rights;
  • +
  • access or modify time;
  • +
  • location of data on the disk (pointers to disk blocks containing data).
  • +
+
+
+

Note

+

Usually, the inode does not contain the file name. The name is stored by the dentry entity. This way, an inode can have multiple names (hardlinks).

+
+
+

Localization:

+

Like the superblock, the inode has a disk correspondent. +The inodes on disk are generally grouped into a specialized area (inode area) separated from the data blocks area; In some file systems, the equivalents of the inodes are spread in the file system structure (FAT); +As a VFS entity, an inode is represented by the structure struct inode and by the operations with it defined in the structure struct inode_operations.

+

Each inode is generally identified by a number. On Linux, the -i argument of the ls command shows the inode number associated with each file:

+
razvan@valhalla:~/school/so2/wiki$ ls -i
+1277956 lab10.wiki  1277962 lab9.wikibak  1277964 replace_lxr.sh
+1277954 lab9.wiki   1277958 link.txt      1277955 homework.wiki
+
+
+
+
+
+

file

+

File is the component of the file system model that is closest to the user. +The structure exists only as a VFS entity in memory and has no physical correspondent on disk.

+

While the inode abstracts a file on the disk, the file structure abstracts an open file. +From the point of view of the process, the file entity abstracts the file. From the point of view of the file system implementation, however, the inode is the entity that abstracts the file.

+

The file structure maintains information such as:

+
+
    +
  • file cursor position;
  • +
  • file opening rights;
  • +
  • pointer to the associated inode (possibly its index).
  • +
+
+
+

Localization:

+
+
    +
  • The structure struct file is the associated VFS entity, and the structure struct file_operations represents the operations associated with it.
  • +
+
+
+
+
+

dentry

+

The dentry (directory entry) associates an inode with a file name.

+

Generally, a dentry structure contains two fields:

+
+
    +
  • an integer that identifies the inode;
  • +
  • a string representing its name.
  • +
+
+

The dentry is a specific part of a path that can be a directory or a file. For example, for the path /bin/vi, dentry objects will be created for /, bin, and vi (a total of 3 dentry objects).

+
+
    +
  • the dentry has a correspondent on the disk, but the correspondence is not direct because each file system keeps the dentries in a specific way
  • +
  • in VFS, the dentry entity is represented by the structure struct dentry and the operations with it are defined in the struct dentry_operations structure.
  • +
+
+
+
+
+

Register and unregister filesystems

+

In the current version, the Linux kernel supports about 50 file systems, including:

+
+
    +
  • ext2/ ext4
  • +
  • reiserfs
  • +
  • xfs
  • +
  • fat
  • +
  • ntfs
  • +
  • iso9660
  • +
  • udf for CDs and DVDs
  • +
  • hpfs
  • +
+
+

On a single system, however, it is unlikely that there will be more than 5-6 file systems. For this reason, file systems (or, more correctly, file system types) are implemented as modules and can be loaded or unloaded at any time.

+

In order to be able to dynamically load / unload a file system module, a file system registration / deregistration API is required. The structure describing a particular file system is struct file_system_type:

+
+
+
#include <linux/fs.h>
+
+struct file_system_type {
+         const char *name;
+         int fs_flags;
+         struct dentry *(*mount) (struct file_system_type *, int,
+                                   const char *, void *);
+         void (*kill_sb) (struct super_block *);
+         struct module *owner;
+         struct file_system_type * next;
+         struct hlist_head fs_supers;
+         struct lock_class_key s_lock_key;
+         struct lock_class_key s_umount_key;
+         //...
+};
+
+
+
+
    +
  • name is a string representing the name that will identify a file system (the argument passed to mount -t).
  • +
  • owner is THIS_MODULE for file systems implemented in modules, and NULL if they are written directly into the kernel.
  • +
  • The mount function reads the superblock from the disk into memory when loading the file system. The function is unique to each file system.
  • +
  • The kill_sb function releases the super-block from memory.
  • +
  • fs_flags specifies the flags with which the file system must be mounted. An example of such flag is FS_REQUIRES_DEV that specifies to VFS that the file system needs a disk (it is not a virtual file system).
  • +
  • fs_supers is a list containing all the superblocks associated with this file system. Since the same file system can be mounted multiple times, there will be a separate superblock for each mount.
  • +
+
+

The registration of a file system into the kernel is generally performed in the module initialization function. For registration, the programmer will have to

+
+
    +
  1. initialize a structure of type struct file_system_type with the name, the flags, the function that implements the superblock reading operation and the reference to the structure that identifies the current module
  2. +
  3. call the register_filesystem() function.
  4. +
+
+

When unloading the module, you must unregister the file system by calling the unregister_filesystem() function.

+

An example of registering a virtual file system is found in the code for ramfs:

+
static struct file_system_type ramfs_fs_type = {
+        .name           = "ramfs",
+        .mount          = ramfs_mount,
+        .kill_sb        = ramfs_kill_sb,
+        .fs_flags       = FS_USERNS_MOUNT,
+};
+
+static int __init init_ramfs_fs(void)
+{
+        if (test_and_set_bit(0, &once))
+                return 0;
+        return register_filesystem(&ramfs_fs_type);
+}
+
+
+
+

Functions mount, kill_sb

+

When mounting the file system, the kernel calls the mount function defined within the structure file_system_type. The function makes a set of initializations and returns a dentry (the structure struct dentry) that represents the mount point directory. Usually mount() is a simple function that calls one of the functions:

+
+
    +
  • mount_bdev(), which mounts a file system stored on a block device
  • +
  • mount_single(), which mounts a file system that shares an instance between all mount operations
  • +
  • mount_nodev(), which mounts a file system that is not on a physical device
  • +
  • mount_pseudo(), a helper function for pseudo-file systems (sockfs, pipefs, generally file systems that can not be mounted)
  • +
+
+

These functions get as parameter a pointer to a function fill_super() that will be called after the superblock initialization to finish its initialization by the driver. An example of such a function can be found in the fill_super section.

+

When unmounting the file system, the kernel calls kill_sb(), which performs cleanup operations and invokes one of the functions:

+
+
    +
  • kill_block_super(), which unmounts a file system on a block device
  • +
  • kill_anon_super(), which unmounts a virtual file system (information is generated when requested)
  • +
  • kill_litter_super(), which unmounts a file system that is not on a physical device (the information is kept in memory)
  • +
+
+

An example for a file system without disk support is the ramfs_mount() function in the ramfs file system:

+
struct dentry *ramfs_mount(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *data)
+{
+        return mount_nodev(fs_type, flags, data, ramfs_fill_super);
+}
+
+
+

An example for a file system from disk is the minix_mount() function in the minix file system:

+
struct dentry *minix_mount(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *data)
+{
+         return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
+}
+
+
+
+
+
+

Superblock in VFS

+

The superblock exists both as a physical entity (entity on disk) and as a VFS entity (within the struct super_block structure). +The superblock contains only metainformation and is used to write and read metadata from the disk (inodes, directory entries). +A superblock (and implicitly the struct super_block structure) will contain information about the block device used, the list of inodes, a pointer to the inode of the file system root directory, and a pointer to the superblock operations.

+
+

The struct super_block structure

+

Part of the struct super_block structure definition is presented below:

+
struct super_block {
+        //...
+        dev_t                   s_dev;              /* identifier */
+        unsigned char           s_blocksize_bits;   /* block size in bits */
+        unsigned long           s_blocksize;        /* block size in bytes */
+        unsigned char           s_dirt;             /* dirty flag */
+        loff_t                  s_maxbytes;         /* max file size */
+        struct file_system_type *s_type;            /* filesystem type */
+        struct super_operations *s_op;              /* superblock methods */
+        //...
+        unsigned long           s_flags;            /* mount flags */
+        unsigned long           s_magic;            /* filesystem’s magic number */
+        struct dentry           *s_root;            /* directory mount point */
+        //...
+        char                    s_id[32];           /* informational name */
+        void                    *s_fs_info;         /* filesystem private info */
+};
+
+
+
+
The superblock stores global information for an instance of a file system:
+
    +
  • the physical device on which it resides
  • +
  • block size
  • +
  • the maximum size of a file
  • +
  • file system type
  • +
  • the operations it supports
  • +
  • magic number (identifies the file system)
  • +
  • the root directory dentry
  • +
+
+
+

Additionally, a generic pointer (void *) stores the private data of the file system. +The superblock can be viewed as an abstract object to which its own data is added when there is a concrete implementation.

+
+
+

Superblock operations

+

The superblock operations are described by the struct super_operations structure:

+
struct super_operations {
+       //...
+       int (*write_inode) (struct inode *, struct writeback_control *wbc);
+       struct inode *(*alloc_inode)(struct super_block *sb);
+       void (*destroy_inode)(struct inode *);
+
+       void (*put_super) (struct super_block *);
+       int (*statfs) (struct dentry *, struct kstatfs *);
+       int (*remount_fs) (struct super_block *, int *, char *);
+       //...
+};
+
+
+

The fields of the structure are function pointers with the following meanings:

+
+
    +
  • write_inode, alloc_inode, destroy_inode write, allocate, respectively release resources associated with an inode and are described in the next lab
  • +
  • put_super is called when the superblock is released at umount; within this function, any resources (generally memory) from the file system's private data must be released;
  • +
  • remount_fs is called when the kernel detects a remount attempt (mount flag MS_REMOUNT); most of the time, what must be detected here is whether a switch from read-only to read-write or vice versa is attempted; this can be done simply because both the old flags (in sb->s_flags) and the new flags (the flags argument) can be accessed; data is a pointer to the data sent by mount() that represents file system specific options;
  • +
  • statfs is called when a statfs system call is done (try stat -f or df); this call must fill the fields of the struct kstatfs structure, as it is done, for example, in the ext4_statfs() function.
  • +
+
+
+
+
+

The fill_super() function

+

As specified, the fill_super() function is called to terminate the superblock initialization. This initialization involves filling the struct super_block structure fields and the initialization of the root directory inode.

+

An example of implementation is the ramfs_fill_super() function which is called to initialize the remaining fields in the superblock:

+
#include <linux/pagemap.h>
+
+#define RAMFS_MAGIC     0x858458f6
+
+static const struct super_operations ramfs_ops = {
+        .statfs         = simple_statfs,
+        .drop_inode     = generic_delete_inode,
+        .show_options   = ramfs_show_options,
+};
+
+static int ramfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct ramfs_fs_info *fsi;
+        struct inode *inode;
+        int err;
+
+        save_mount_options(sb, data);
+
+        fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
+        sb->s_fs_info = fsi;
+        if (!fsi)
+                return -ENOMEM;
+
+        err = ramfs_parse_options(data, &fsi->mount_opts);
+        if (err)
+                return err;
+
+        sb->s_maxbytes          = MAX_LFS_FILESIZE;
+        sb->s_blocksize         = PAGE_SIZE;
+        sb->s_blocksize_bits    = PAGE_SHIFT;
+        sb->s_magic             = RAMFS_MAGIC;
+        sb->s_op                = &ramfs_ops;
+        sb->s_time_gran         = 1;
+
+        inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
+        sb->s_root = d_make_root(inode);
+        if (!sb->s_root)
+                return -ENOMEM;
+
+        return 0;
+}
+
+
+

The kernel provides generic functions to implement operations with file system structures. +The generic_delete_inode() and simple_statfs() functions used in the above code are such functions and can be used to implement the drivers if their functionality is sufficient.

+

The ramfs_fill_super() function in the above code fills some fields in the superblock, then reads the root inode and allocates the root dentry. +Reading the root inode is done in the ramfs_get_inode() function, and consists of allocating a new inode using new_inode() and initializing it. In order to free the inode, iput() is used, and d_make_root() is used to allocate the root dentry.

+

An example implementation for a disk file system is the minix_fill_super() function in the minix file system. +The functionality for the disk file system is similar to that of the virtual file system, with the exception of using the buffer cache. +Also, the minix file system keeps private data using the struct minix_sb_info structure. +A large part of this function deals with the initialization of these private data. +The private data is allocated using the kzalloc() function and stored in the s_fs_info field of the superblock structure.

+

VFS functions typically get as arguments the superblock, an inode and/or a dentry that contain a pointer to the superblock so that these private data can be easily accessed.

+
+
+

Buffer cache

+

Buffer cache is a kernel subsystem that handles caching (both read and write) blocks from block devices. +The base entity used by buffer cache is the struct buffer_head structure. +The most important fields in this structure are:

+
+
    +
  • b_data, pointer to a memory area where the data was read from or where the data must be written to
  • +
  • b_size, buffer size
  • +
  • b_bdev, the block device
  • +
  • b_blocknr, the number of block on the device that has been loaded or needs to be saved on the disk
  • +
  • b_state, the status of the buffer
  • +
+
+

There are some important functions that work with these structures:

+
+
    +
  • __bread(): reads a block with the given number and given size in a buffer_head structure; in case of success returns a pointer to the buffer_head structure, otherwise it returns NULL;
  • +
  • sb_bread(): does the same thing as the previous function, but the size of the read block is taken from the superblock, as well as the device from which the read is done;
  • +
  • mark_buffer_dirty(): marks the buffer as dirty (sets the BH_Dirty bit); the buffer will be written to the disk at a later time (from time to time the bdflush kernel thread wakes up and writes the buffers to disk);
  • +
  • brelse(): frees up the memory used by the buffer, after it has previously written the buffer on disk if needed;
  • +
  • map_bh(): associates the buffer-head with the corresponding sector.
  • +
+
+
+
+

Functions and useful macros

+

The super block typically contains a map of occupied blocks (by inodes, dentries, data) in the form of a bitmap (vector of bits). To work with such maps, it is recommended to use the following functions:

+
+
    +
  • find_first_zero_bit(), to find the first zero bit in a memory area. The size parameter means the number of bits in the search area;
  • +
  • test_and_set_bit(), to set a bit and get the old value;
  • +
  • test_and_clear_bit(), to clear a bit and get the old value;
  • +
  • test_and_change_bit(), to invert the value of a bit and get the old value.
  • +
+
+

The following macrodefinitions can be used to verify the type of an inode:

+
+
    +
  • S_ISDIR (inode->i_mode) to check if the inode is a directory;
  • +
  • S_ISREG (inode->i_mode) to check if the inode is a regular file (not a link or device file).
  • +
+
+
+
+

Further reading

+
    +
  1. Robert Love -- Linux Kernel Development, Second Edition -- Chapter +12. The Virtual Filesystem
  2. +
  3. Understanding the Linux Kernel, 3rd edition - Chapter 12. The Virtual +Filesystem
  4. +
  5. Linux Virtual File System (presentation)
  6. +
  7. Understanding Unix/Linux Filesystem
  8. +
  9. Creating Linux virtual filesystems
  10. +
  11. The Linux Documentation Project - VFS
  12. +
  13. The "Virtual File System" in Linux
  14. +
  15. A Linux Filesystem Tutorial
  16. +
  17. The Linux Virtual File System
  18. +
  19. Documentation/filesystems/vfs.txt
  20. +
  21. File systems sources
  22. +
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is filesystems. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/filesystems/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

myfs

+

To begin, we plan to get familiar with the interface exposed by the Linux kernel and the Virtual File System (VFS) component. That is why, for the beginning, we will work with a simple, virtual file system (i.e. without physical disk support). The file system is called myfs.

+

For this we will access the myfs/ subdirectory in the laboratory skeleton. We will implement the superblock operations within this lab, and the next lab will continue with the inode operations.

+
+

1. Register and unregister the myfs file system

+

The first step in working with the file system is to register and unregister it. We want to do this for the file system described in myfs.c. Check the file contents and follow the directions marked with TODO 1.

+

The steps you need to take are described in the section Register and unregister filesystems. Use the "myfs" string for the file system name.

+
+

Note

+

Within the file system structure, use the myfs_mount function present in the code skeleton to fill the superblock (done when mounting). In myfs_mount call the function specific to a file system without disk support. As an argument for the specific mount function, use the function of type fill_super defined in the code skeleton. You can review the Functions mount, kill_sb section.

+

To destroy the superblock (done at unmounting) use kill_litter_super, also a function specific to a file system without disk support. The function is already implemented, you need to fill it in the struct file_system_type structure.

+
+

After completing the sections marked with TODO 1 , compile the module, copy it to the QEMU virtual machine, and start the virtual machine. Load the kernel module and then check the presence of the myfs file system within the /proc/filesystems file.

+

At the moment, the file system is only registered, it does not expose operations to use it. If we try to mount it, the operation will fail. To try mounting, we create mount point /mnt/myfs/.

+
# mkdir -p /mnt/myfs
+
+
+

and then we use the mount command:

+
# mount -t myfs none /mnt/myfs
+
+
+

The error message we get shows that we have not implemented the operations that work on the superblock. We will have to implement the operations on the superblock and initialize the root inode. We will do this further.

+
+

Note

+

The none argument sent to the mount command indicates that we do not have a device from which to mount, the file system being a virtual one. Similarly, this is how the procfs or sysfs filesystems are mounted on Linux systems.

+
+
+
+

2. Completing myfs superblock

+

To be able to mount the file system, we need to fill its superblock's fields, that is, a generic VFS structure of type struct super_block. +We will fill out the structure within the myfs_fill_super() function; the superblock is represented by the variable sb passed as an argument to the function. +Follow the hints marked with TODO 2.

+
+

Note

+

To fill the myfs_fill_super function, you can start from the example in the section The fill_super() function.

+

For the superblock structure fields, use the macros defined within the code skeleton wherever possible.

+
+

The s_op field in the superblock structure must be initialized to the superblock operations structures (type struct super_operations). You need to define such a structure.

+

For information on defining the struct super_operations structure and filling the superblock, see the section Superblock operations.

+
+

Note

+

Initialize the drop_inode and statfs fields of struct super_operations structure.

+
+

Although the superblock will be properly initialized at this time, the mount operation will continue to fail. +In order for the operation to be successfully completed, the root inode will have to be initialized, which we will do for the next exercise.

+
+
+

3. Initialize myfs root inode

+

The root inode is the inode of the file system root directory (i.e. /). +Initialization is done when the file system is mounted. +The myfs_fill_super function, called at mount, is the one that calls the myfs_get_inode function that creates and initializes an inode. +Typically, this function is used to create and initialize all inodes; In this exercise, however, we will only create the root inode.

+

The inode is allocated inside the myfs_get_inode function (local variable inode, allocated using the new_inode() function call).

+

To successfully complete mounting the file system, you will need to fill the myfs_get_inode function. Follow directions marked with TODO 3. A starting point is the ramfs_get_inode function.

+
+

Note

+

To initialize uid, gid and mode , you can use the inode_init_owner() function as it is used in ramfs_get_inode(). +When you call inode_init_owner(), use NULL as the second parameter because there is no parent directory for the created inode.

+

Initialize the i_atime, i_ctime, and i_mtime of the VFS inode to the value returned by the current_time() function.

+

You will need to initialize the operations for the inode of type directory. To do this, follow the steps:

+
+
    +
  1. Check if this is a directory type inode using the S_ISDIR macro.
  2. +
  3. For the i_op and i_fop fields, use kernel functions that are already implemented:
      +
    • for i_op: simple_dir_inode_operations.
    • +
    • for i_fop: simple_dir_operations
    • +
    +
  4. +
  5. Increase the number of links for the directory using the inc_nlink() function.
  6. +
+
+
+
+
+

4. Test myfs mount and unmount

+

Now we can mount the filesystem. +Follow the steps above to compile the kernel module, copy to the virtual machine, and start the virtual machine, then insert the kernel module, create the mount point /mnt/myfs/, and mount the file system. +We verify that the file system was mounted by inspecting the /proc/mounts file.

+

What inode number does the /mnt/myfs directory have? Why?

+
+

Note

+

To display the inode number of a directory, use the command:

+
ls -di /path/to/directory
+
+
+

where /path/to/directory/ is the path to the directory whose inode number we want to display.

+
+

We check myfs file system statistics using the following command:

+
stat -f /mnt/myfs
+
+
+

We want to see what the mount point /mnt/myfs contains and if we can create files. +For this we run the commands:

+
# ls -la /mnt/myfs
+# touch /mnt/myfs/a.txt
+
+
+

We can see that we can not create the a.txt file on the file system. +This is because we have not implemented the operations to work with inodes in the struct super_operations structure. +We will implement these operations within the next lab.

+

Unmount the file system using the command

+
umount /mnt/myfs
+
+
+

Unload the kernel module corresponding to the file system as well.

+
+

Note

+

To test the entire functionality, you can use the test-myfs.sh script:

+
./test-myfs.sh
+
+
+

The script is copied to the virtual machine using make copy only if it is executable:

+
student@workstation:~/linux/tools/labs$ chmod +x skels/filesystems/myfs/test-myfs.sh
+
+
+
+
+

Note

+

The statistics displayed for the file system are minimal because the information is provided by the simple_statfs function.

+
+
+
+
+

minfs

+

Next, we will implement the basics of a very simple file system, called minfs, with disk support. +We will use a disk in the virtual machine that we will format and mount with the minfs filesystem.

+

For this we will access the minfs/kernel directory from the laboratory skeleton and work with the code in minfs.c. +Just like myfs we will not implement the operations for working with inodes. We will just limit to working with the superblock and, therefore, mounting. +The rest of the operations will be implemented in the next lab.

+

Follow the diagram below to clarify the role of structures within the minfs file system.

+../_images/minfs.png +
+

1. Registering and unregistering the minfs file system

+
+

Note

+

Before solving the exercise, we need to add a disk to the virtual machine. To do this, generate a file that we will use as the disk image using the following command:

+
dd if=/dev/zero of=mydisk.img bs=1M count=100
+
+
+

and add the -drive file=mydisk.img,if=virtio,format=raw argument to the qemu command in qemu/Makefile (in the QEMU_OPTS variable). +The new argument for the qemu command must be added after the one for the existing disk (YOCTO_IMAGE).

+
+

To register and unregister the file system, you will need to fill the minfs_fs_type and minfs_mount functions in minfs.c. Follow the directions marked with TODO 1.

+
+

Note

+

In the file system structure, for mount, use the minfs_mount function from the code skeleton. +In this function, call the function to mount a file system with disk support (See the Functions mount, kill_sb section. Use mount_bdev()). +Choose the most suitable function for destroying the superblock (done at unmount); keep in mind that it is a file system with disk support. Use the kill_block_super() function.

+

Initialize the fs_flags field of the minfs_fs_type structure with the appropriate value for a file system with disk support. See the section Register and unregister filesystems.

+

The function for filling the superblock is minfs_fill_super.

+
+

After completing the sections marked with TODO 1, compile the module, copy it into the QEMU virtual machine, and start the virtual machine. +Load the kernel module and then check the presence of the minfs file system within the /proc/filesystems file.

+

To test the mounting of the minfs file system we will need to format the disk with its structure. Formatting requires the mkfs.minfs formatting tool from the minfs/user directory. The utility is automatically compiled when running make build and copied to the virtual machine at make copy.

+

After compiling, copying, and starting the virtual machine, format the /dev/vdd using the formatting utility:

+
# ./mkfs.minfs /dev/vdd
+
+
+

Load the kernel module:

+
# insmod minfs.ko
+
+
+

Create mount point /mnt/minfs/:

+
# mkdir -p /mnt/minfs/
+
+
+

and mount the filesystem

+
# mount -t minfs /dev/vdd /mnt/minfs/
+
+
+

The operation fails because the root inode is not initialized.

+
+
+

2. Completing minfs superblock

+

To be able to mount the file system, you will need to fill the superblock (i.e. a structure with type struct super_block) within the minfs_fill_super function; it is the s argument of the function. +The structure of operations on the superblock is already defined: minfs_ops. +Follow the directions marked with TODO 2. You can also follow the implementation of the minix_fill_super function.

+
+

Note

+

Some structures are found in the header file minfs.h.

+

For information on working with buffers, go to the Buffer cache section.

+

Read the first block on the disk (block with index 0). +To read the block, use the sb_bread() function. +Cast the read data (the b_data field in the struct buffer_head structure) to the structure storing the minfs superblock information on the disk: struct minfs_super_block, defined in the source code file.

+

Structure struct minfs_super_block holds file system-specific information that is not found in the struct super_block generic structure (in this case only version). +This additional information (found in struct minfs_super_block (on disk) but not in struct super_block (VFS)) will be stored in the struct minfs_sb_info structure.

+
+

To check the functionality, we need a function for reading the root inode. +For the time being, use the myfs_get_inode function from myfs file system exercises. +Copy the function into the source code and call it the same as you did for myfs. +The third argument when calling the myfs_get_inode function is the inode creation permissions, similar to the virtual file system exercise (myfs).

+

Validate the implementation by executing the commands from the previous exercise.

+
+
+

3. Creating and destroying minfs inodes

+

For mounting, we need to initialize the root inode, and to get the root inode, we need to implement the functions to work with inodes. +That is, you need to implement the minfs_alloc_inode and minfs_destroy_inode functions. +Follow the directions marked with TODO 3. You can use the minix_alloc_inode() and minix_destroy_inode() functions as a model.

+

For the implementation, look at the macros and structures in the minfs.h header file.

+
+

Note

+

For memory allocation/deallocation in minfs_alloc_inode and minfs_destroy_inode, we recommend using kzalloc() and kfree().

+

In minfs_alloc_inode allocate structures with type struct minfs_inode_info, but only return structures with type struct inode, i.e. return those given by the vfs_inode field.

+

In the minfs_alloc_inode function, call inode_init_once() to initialize the inode.

+

In the destroy_inode function, you can access the structure with type struct minfs_inode_info using the container_of macro.

+
+
+

Note

+

In this exercise, you have implemented the minfs_alloc_inode and minfs_destroy_inode functions, but they are not yet called. The correctness of the implementation will be checked at the end of the next exercise.

+
+
+
+

4. Initialize minfs root inode

+

Initializing the root inode is required in order to mount the file system. +For this, you will need to complete the minfs_ops structure with the minfs_alloc_inode and minfs_destroy_inode functions and fill the minfs_iget function.

+

The minfs_iget function is the function called to allocate a VFS inode (i.e. struct inode) and fill it with minfs inode-specific information from the disk (i.e. struct minfs_inode).

+

Follow the directions marked with TODO 4. +Fill out the alloc_inode and destroy_inode fields of struct super_operations structure with the functions implemented in the previous step.

+

The information about the root inode is found in the second block on the disk (the inode with index 1). +Make minfs_iget read the root minfs inode from the disk (struct minfs_inode) and fill in the VFS inode (struct inode).

+

In the minfs_fill_super function, replace the myfs_get_inode call with the minfs_iget function call.

+
+

Note

+

To implement the minfs_iget function, follow the implementation of V1_minix_iget. +To read a block, use the sb_bread() function. +Cast the read data (the b_data field of the struct buffer_head structure) to the minfs inode from the disk (struct minfs_inode).

+

The i_uid, i_gid, i_mode, i_size must be filled in the VFS inode with the values in the minfs inode structure read from disk. +To initialize the i_uid and i_gid fields, use the functions i_uid_write() , and i_gid_write().

+

Initialize the i_atime , i_ctime, and i_mtime fields of the VFS inode to the value returned by the current_time() function.

+

You will need to initialize the operations for the inode with type directory. To do this, follow the steps:

+
+
    +
  1. Check if this is a directory type inode using the S_ISDIR macro.
  2. +
  3. For the i_op and i_fop fields, use kernel functions already implemented:
      +
    • for i_op: simple_dir_inode_operations.
    • +
    • for i_fop: simple_dir_operations
    • +
    +
  4. +
  5. Increment the number of links for the directory using the inc_nlink() function.
  6. +
+
+
+
+
+

5. Testing of minfs mount and unmount

+

Now we can mount the filesystem. +Follow the steps above to compile the kernel module, copy to the virtual machine, start the virtual machine, and then insert the kernel module, create mount point /mnt/minfs/ and mount the file system. +We verify that the file system was mounted by investigating the /proc/mounts file.

+

We check that everything is fine by listing the mount point contents /mnt/minfs/:

+
# ls /mnt/minfs/
+
+
+

After mount and verification, unmount the file system and unload the module from the kernel.

+
+

Note

+

Alternatively, to test the entire functionality, you can use the test-minfs.sh script:

+
# ./test-minfs.sh
+
+
+

The script is copied to the virtual machine when running the make copy command only if it is executable.

+
student@workstation:~/linux/tools/labs$ chmod +x skels/filesystems/minfs/user/test-minfs.sh
+
+
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/filesystems_part2.html b/refs/pull/405/merge/labs/filesystems_part2.html new file mode 100644 index 00000000..fe3d9eeb --- /dev/null +++ b/refs/pull/405/merge/labs/filesystems_part2.html @@ -0,0 +1,1215 @@ + + + + + + File system drivers (Part 2) — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

File system drivers (Part 2)

+
+

Lab objectives

+
+
    +
  • Improving the knowledge about inode, file and dentry.
  • +
  • Acquiring knowledge about adding support for working with regular files and directories in VFS (Virtual File System).
  • +
  • Acquiring knowledge about the internal implementation of a file system.
  • +
+
+
+
+

Inode

+

The inode is an essential component of a UNIX file system and, at the same time, an important component of VFS. An inode is metadata (it holds information about information). +An inode uniquely identifies a file on disk and holds information about it (uid, gid, access rights, access times, pointers to data blocks, etc.). +An important aspect is that an inode does not have information about the file name (it is retained by the associated struct dentry structure).

+

The inode refers to a file on the disk. To refer to an open file (associated with a file descriptor within a process), the struct file structure is used. +An inode can have any number of (zero or more) file structures associated (multiple processes can open the same file, or a process can open the same file several times).

+

Inode exists both as a VFS entity (in memory) and as a disk entity (for UNIX, HFS, NTFS, etc.). +The inode in VFS is represented by the structure struct inode. +Like the other structures in VFS, struct inode is a generic structure that covers the options for all supported file types, even those that do not have an associated disk entity (such as FAT).

+
+

The inode structure

+

The inode structure is the same for all file systems. In general, file systems also have private information. These are referenced through the i_private field of the structure. +Conventionally, the structure that keeps that particular information is called <fsname>_inode_info, where fsname represents the file system name. For example, minix and ext4 filesystems store particular information in structures struct minix_inode_info, or struct ext4_inode_info.

+

Some of the important fields of struct inode are:

+
+
    +
  • i_sb : The superblock structure of the file system the inode belongs to.
  • +
  • i_rdev: the device on which this file system is mounted
  • +
  • i_ino : the number of the inode (uniquely identifies the inode within the file system)
  • +
  • i_blkbits: number of bits used for the block size == log2(block size)
  • +
  • i_mode, i_uid, i_gid: access rights, uid, gid
  • +
  • i_size: file/directory/etc. size in bytes
  • +
  • i_mtime, i_atime, i_ctime: modification, access, and change (inode status change) time
  • +
  • i_nlink: the number of name entries (dentries) that use this inode; for file systems without links (either hard or symbolic) this is always set to 1
  • +
  • i_blocks: the number of blocks used by the file (all blocks, not just data); this is only used by the quota subsystem
  • +
  • i_op, i_fop: pointers to operations structures: struct inode_operations and struct file_operations; i_mapping->a_ops contains a pointer to struct address_space_operations.
  • +
  • i_count: the inode counter indicating how many kernel components use it.
  • +
+
+

Some functions that can be used to work with inodes:

+
+
    +
  • new_inode(): creates a new inode, sets the i_nlink field to 1 and initializes i_blkbits, i_sb and i_dev;

    +
  • +
  • insert_inode_hash(): adds the inode to the hash table of inodes; an interesting effect of this call is that the inode will be written to the disk if it is marked as dirty;

    +
    +

    Warning

    +

    An inode created with new_inode() is not in the hash table, and unless you have serious reasons not to, you must enter it in the hash table;

    +
    +
  • +
  • mark_inode_dirty(): marks the inode as dirty; at a later moment, it will be written on the disc;

    +
  • +
  • iget_locked(): loads the inode with the given number from the disk, if it is not already loaded;

    +
  • +
  • unlock_new_inode(): used in conjunction with iget_locked(), releases the lock on the inode;

    +
  • +
  • iput(): tells the kernel that the work on the inode is finished; if no one else uses it, it will be destroyed (after being written on the disk if it is marked as dirty);

    +
  • +
  • make_bad_inode(): tells the kernel that the inode can not be used; It is generally used from the function that reads the inode when the inode could not be read from the disk, being invalid.

    +
  • +
+
+
+
+

Inode operations

+
+

Getting an inode

+

One of the main inode operations is obtaining an inode (the struct inode in VFS). +Until version 2.6.24 of the Linux kernel, the developer defined a read_inode function. +Starting with version 2.6.25, the developer must define a <fsname>_iget where <fsname> is the name of the file system. +This function is responsible for finding the VFS inode if it exists or creating a new one and filling it with the information from the disk.

+

Generally, this function will call iget_locked() to get the inode structure from VFS. If the inode is newly created then it will need to read the inode from the disk (using sb_bread()) and fill in the useful information.

+

An example of such a function is minix_iget():

+
static struct inode *V1_minix_iget(struct inode *inode)
+{
+      struct buffer_head * bh;
+      struct minix_inode * raw_inode;
+      struct minix_inode_info *minix_inode = minix_i(inode);
+      int i;
+
+      raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh);
+      if (!raw_inode) {
+              iget_failed(inode);
+              return ERR_PTR(-EIO);
+      ...
+}
+
+struct inode *minix_iget(struct super_block *sb, unsigned long ino)
+{
+      struct inode *inode;
+
+      inode = iget_locked(sb, ino);
+      if (!inode)
+              return ERR_PTR(-ENOMEM);
+      if (!(inode->i_state & I_NEW))
+              return inode;
+
+      if (INODE_VERSION(inode) == MINIX_V1)
+              return V1_minix_iget(inode);
+    ...
+}
+
+
+

The minix_iget function gets the VFS inode using iget_locked(). +If the inode is already existing (not new == the I_NEW flag is not set) the function returns. +Otherwise, the function calls the V1_minix_iget() function that will read the inode from the disk using minix_V1_raw_inode() and then complete the VFS inode with the read information.

+
+
+

Superoperations

+

Many of the superoperations (components of the struct super_operations structure used by the superblock) are used when working with inodes. These operations are described next:

+
+
    +
  • alloc_inode: allocates an inode. +Usually, this function allocates a struct <fsname>_inode_info structure and performs basic VFS inode initialization (using inode_init_once()); +minix uses the kmem_cache_alloc() function for allocation, which interacts with the SLAB subsystem. +For each allocation, the cache constructor is called, which in the case of minix is the init_once() function. +Alternatively, kmalloc() can be used, in which case the inode_init_once() function should be called. +The alloc_inode() function will be called by the new_inode() and iget_locked() functions.
  • +
  • write_inode : saves/updates the inode received as a parameter on disk; to update the inode, though inefficient, for beginners it is recommended to use the following sequence of operations:
      +
    • load the inode from the disk using the sb_bread() function;
    • +
    • modify the buffer according to the saved inode;
    • +
    • mark the buffer as dirty using mark_buffer_dirty(); the kernel will then handle its writing on the disk;
    • +
    • an example is the minix_write_inode() function in the minix file system
    • +
    +
  • +
  • evict_inode: removes any information about the inode with the number received in the i_ino field from the disk and memory (both the inode on the disk and the associated data blocks). This involves performing the following operations:
      +
    • delete the inode from the disk;
    • +
    • updates disk bitmaps (if any);
    • +
    • delete the inode from the page cache by calling truncate_inode_pages();
    • +
    • delete the inode from memory by calling clear_inode() ;
    • +
    • an example is the minix_evict_inode() function from the minix file system.
    • +
    +
  • +
  • destroy_inode releases the memory occupied by inode
  • +
+
+
+
+

inode_operations

+

The inode operations are described by the struct inode_operations structure.

+

Inodes are of several types: file, directory, special file (pipe, fifo), block device, character device, link etc. +For this reason, the operations that an inode needs to implement are different for each type of inode. +Below are detailed operations for a file type inode and a directory inode.

+

The operations of an inode are initialized and accessed using the i_op field of the structure struct inode.

+
+
+
+
+

The file structure

+

The file structure corresponds to a file open by a process and exists only in memory, being associated with an inode. +It is the closest VFS entity to user-space; the structure fields contain familiar information of a user-space file (access mode, file position, etc.) and the operations with it are performed by known system calls (read, write , etc.).

+

The file operations are described by the struct file_operations structure.

+

The file operations for a file system are initialized using the i_fop field of the struct inode structure. +When opening a file, the VFS initializes the f_op field of the struct file structure with the value of inode->i_fop, such that subsequent system calls use the value stored in file->f_op.

+
+
+

Regular files inodes

+

To work with the inode, the i_op and i_fop fields of the inode structure must be filled in. +The type of the inode determines the operations that it needs to implement.

+
+

Regular files inode operations

+

In the minix file system, the minix_file_inode_operations structure is defined for the operations on an inode and for the file operations the minix_file_operations structure is defined:

+
const struct file_operations minix_file_operations = {
+         .llseek         = generic_file_llseek,
+         .read_iter      = generic_file_read_iter,
+         //...
+         .write_iter     = generic_file_write_iter,
+         //...
+         .mmap           = generic_file_mmap,
+         //...
+};
+
+const struct inode_operations minix_file_inode_operations = {
+        .setattr        = minix_setattr,
+        .getattr        = minix_getattr,
+};
+
+        //...
+        if (S_ISREG(inode->i_mode)) {
+                inode->i_op = &minix_file_inode_operations;
+                inode->i_fop = &minix_file_operations;
+        }
+        //...
+
+
+

The functions generic_file_llseek() , generic_file_mmap() , generic_file_read_iter() and generic_file_write_iter() are implemented in the kernel.

+

For simple file systems, only the truncation operation (truncate system call) must be implemented. +Although initially there was a dedicated operation, starting with 3.14 the operation was embedded in setattr: if the passed size is different from the current size of the inode, then a truncate operation must be performed. +An example of implementing this check is in the minix_setattr() function:

+
static int minix_setattr(struct dentry *dentry, struct iattr *attr)
+{
+        struct inode *inode = d_inode(dentry);
+        int error;
+
+        error = setattr_prepare(dentry, attr);
+        if (error)
+                return error;
+
+        if ((attr->ia_valid & ATTR_SIZE) &&
+            attr->ia_size != i_size_read(inode)) {
+                error = inode_newsize_ok(inode, attr->ia_size);
+                if (error)
+                        return error;
+
+                truncate_setsize(inode, attr->ia_size);
+                minix_truncate(inode);
+        }
+
+        setattr_copy(inode, attr);
+        mark_inode_dirty(inode);
+        return 0;
+}
+
+
+

The truncate operation involves:

+
+
    +
  • freeing blocks of data on the disk that are now extra (if the new dimension is smaller than the old one) or allocating new blocks (for cases where the new dimension is larger)
  • +
  • updating disk bit maps (if used);
  • +
  • updating the inode;
  • +
  • filling with zero the space that was left unused from the last block using the block_truncate_page() function.
  • +
+
+

An example of the implementation of the truncate operation is the minix_truncate() function in the minix file system.

+
+
+

Address space operations

+

There is a close link between the address space of a process and files: the execution of the programs is done almost exclusively by mapping the file into the process address space. +Because this approach works very well and is quite general, it can also be used for regular system calls such as read and write.

+

The structure that describes the address space is struct address_space, and the operations with it are described by the structure struct address_space_operations. To initialize the address space operations, fill inode->i_mapping->a_ops of the file type inode.

+

An example is the minix_aops structure in the minix file system:

+
static const struct address_space_operations minix_aops = {
+       .readpage = minix_readpage,
+       .writepage = minix_writepage,
+       .write_begin = minix_write_begin,
+       .write_end = generic_write_end,
+       .bmap = minix_bmap
+};
+
+//...
+if (S_ISREG(inode->i_mode)) {
+      inode->i_mapping->a_ops = &minix_aops;
+}
+//...
+
+
+

The generic_write_end() function is already implemented. +Most of the specific functions are very easy to implement, as follows:

+
static int minix_writepage(struct page *page, struct writeback_control *wbc)
+{
+         return block_write_full_page(page, minix_get_block, wbc);
+}
+
+static int minix_readpage(struct file *file, struct page *page)
+{
+         return block_read_full_page(page, minix_get_block);
+}
+
+static void minix_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+
+        if (to > inode->i_size) {
+                truncate_pagecache(inode, inode->i_size);
+                minix_truncate(inode);
+        }
+}
+
+static int minix_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata)
+{
+        int ret;
+
+        ret = block_write_begin(mapping, pos, len, flags, pagep,
+                                minix_get_block);
+        if (unlikely(ret))
+                minix_write_failed(mapping, pos + len);
+
+        return ret;
+}
+
+static sector_t minix_bmap(struct address_space *mapping, sector_t block)
+{
+         return generic_block_bmap(mapping, block, minix_get_block);
+}
+
+
+

All that needs to be done is to implement minix_get_block, which has to translate a block of a file into a block on the device. +If the flag create received as a parameter is set, a new block must be allocated. +In case a new block is created, the bit map must be updated accordingly. +To notify the kernel not to read the block from the disk, bh must be marked with set_buffer_new(). The buffer must be associated with the block through map_bh().

+
+
+
+

Dentry structure

+

Directory operations use the struct dentry structure. +Its main task is to make links between inodes and filenames. +The important fields of this structure are presented below:

+
struct dentry {
+        //...
+        struct inode             *d_inode;     /* associated inode */
+        //...
+        struct dentry            *d_parent;    /* dentry object of parent */
+        struct qstr              d_name;       /* dentry name */
+        //...
+
+        struct dentry_operations *d_op;        /* dentry operations table */
+        struct super_block       *d_sb;        /* superblock of file */
+        void                     *d_fsdata;    /* filesystem-specific data */
+        //...
+};
+
+
+

Fields meaning:

+
+
    +
  • d_inode: the inode referenced by this dentry;
  • +
  • d_parent: the dentry associated with the parent directory;
  • +
  • d_name: a struct qstr structure that contains the fields name and len (the name and the length of the name).
  • +
  • d_op: operations with dentries, represented by the struct dentry_operations structure. +The kernel implements default operations so there is no need to (re)implement them. Some file systems can do optimizations based on the specific structure of the dentries.
  • +
  • d_fsdata: field reserved for the file system that implements dentry operations;
  • +
+
+
+

Dentry operations

+

The most commonly used operations on dentries are:

+
+
    +
  • d_make_root: allocates the root dentry. It is generally used in the function that is called to read the superblock (fill_super), which must initialize the root directory. +So the root inode is obtained from the superblock and is used as an argument to this function, to fill the s_root field from the struct super_block structure.
  • +
  • d_add: associates a dentry with an inode; the dentry received as a parameter in the calls discussed above signifies the entry (name, length) that needs to be created. This function will be used when creating/loading a new inode that does not have a dentry associated with it and has not yet been introduced to the hash table of inodes (at lookup);
  • +
  • d_instantiate: The lighter version of the previous call, in which the dentry was previously added in the hash table.
  • +
+
+
+

Warning

+

d_instantiate must be used to implement create calls (mkdir, mknod, rename, symlink) and NOT d_add.

+
+
+
+
+

Directory inodes operations

+

The operations for directory type inodes have a higher complexity level than the ones for files. +The developer must define operations for inodes and operations for files. +In minix, these operations are defined in minix_dir_inode_operations and minix_dir_operations:

+
struct inode_operations minix_dir_inode_operations = {
+      .create = minix_create,
+      .lookup = minix_lookup,
+      .link = minix_link,
+      .unlink = minix_unlink,
+      .symlink = minix_symlink,
+      .mkdir = minix_mkdir,
+      .rmdir = minix_rmdir,
+      .mknod = minix_mknod,
+      //...
+};
+
+struct file_operations minix_dir_operations = {
+      .llseek = generic_file_llseek,
+      .read = generic_read_dir,
+      .iterate = minix_readdir,
+      //...
+};
+
+        //...
+      if (S_ISDIR(inode->i_mode)) {
+              inode->i_op = &minix_dir_inode_operations;
+              inode->i_fop = &minix_dir_operations;
+              inode->i_mapping->a_ops = &minix_aops;
+      }
+       //...
+
+
+

The only function already implemented is generic_read_dir().

+

The functions that implement the operations on directory inodes are the ones described below.

+
+

Creating an inode

+

The inode creation function is indicated by the field create in the inode_operations structure. +In the minix case, the function is minix_create(). +This function is called by the open and creat system calls. Such a function performs the following operations:

+
+
    +
  1. Introduces a new entry into the physical structure on the disk; the update of the bit maps on the disk must not be forgotten.
  2. +
  3. Configures access rights to those received as a parameter.
  4. +
  5. Marks the inode as dirty with the mark_inode_dirty() function.
  6. +
  7. Instantiates the directory entry (dentry) with the d_instantiate function.
  8. +
+
+
+
+

Creating a directory

+

The directory creation function is indicated by the mkdir field in the inode_operations structure. +In the minix case, the function is minix_mkdir(). +This function is called by the mkdir system call. Such a function performs the following operations:

+
+
    +
  1. Calls minix_create().
  2. +
  3. Allocates a data block for the directory.
  4. +
  5. Creates the "." and ".." entries.
  6. +
+
+
+ + + +
+

Deleting a directory

+

The directory delete function is indicated by the rmdir field in the inode_operations structure. +In the minix case, the function is minix_rmdir(). +This function is called by the rmdir system call. +Such a function performs the following operations:

+
+
    +
  1. Performs the operations done by minix_unlink.
  2. +
  3. Ensures that the directory is empty; otherwise, returns ENOTEMPTY.
  4. +
  5. Also deletes the data blocks.
  6. +
+
+
+
+

Searching for an inode in a directory

+

The function that searches for an entry in a directory and extracts the inode is indicated by the lookup field in the inode_operations structure. +In the minix case, the function is minix_lookup. +This function is called indirectly when information about the inode associated with an entry in a directory is needed. +Such a function performs the following operations:

+
+
    +
  1. Searches in the directory indicated by dir the entry having the name dentry->d_name.name.
  2. +
  3. If the entry is found, it will return NULL and associate the inode with the name using the d_add() function.
  4. +
  5. Otherwise, returns ERR_PTR.
  6. +
+
+
+
+

Iterating through entries in a directory

+

The function which iterates through the entries in a directory (lists the directory contents) is indicated by the field iterate in the struct file_operations structure. +In the minix case, the function is minix_readdir. +This function is called by the readdir system call.

+

The function returns either all entries in the directory or just a part of them when the user space buffer is not large enough to hold all of them. +A call of this function can return:

+
+
    +
  • a number equal to the existing number of entries if there is enough space in the corresponding user space buffer;
  • +
  • a number smaller than the actual number of entries, as much as there was space in the corresponding user space buffer;
  • +
  • 0, when there are no more entries to read.
  • +
+
+

The function will be called consecutively until all available entries are read. The function is called at least twice.

+
+
    +
  • It is only called twice if:
      +
    • the first call reads all entries and returns their number;
    • +
    • the second call returns 0, having no other entries to read.
    • +
    +
  • +
  • It is called more than twice if the first call does not return the total number of entries.
  • +
+
+

The function performs the following operations:

+
+
    +
  1. Iterates over the entries (the dentries) from the current directory.
  2. +
  3. For each dentry found, increments ctx->pos.
  4. +
  5. For each valid dentry (an inode other than 0, for example), calls the dir_emit() function.
  6. +
  7. If the dir_emit() function returns a value other than zero, it means that the buffer in the user space is full and the function returns.
  8. +
+
+

The arguments of the dir_emit function are:

+
+
    +
  • ctx is the directory iteration context, passed as an argument to the iterate function;
  • +
  • name is the name of the entry (a string of characters);
  • +
  • name_len is the length of the entry name;
  • +
  • ino is the inode number associated with the entry;
  • +
  • type identifies the entry type: DT_REG (file), DT_DIR (directory), DT_UNKNOWN etc. DT_UNKNOWN can be used when the entry type is unknown.
  • +
+
+
+
+
+

Bitmap operations

+

When working with the file systems, management information (what block is free or busy, what inode is free or busy) is stored using bitmaps. +For this we often need to use bit operations. Such operations are:

+
+
    +
  • searching the first 0 bit: representing a free block or inode
  • +
  • marking a bit as 1: marking a busy block or inode
  • +
+
+

The bitmap operations are found in headers from include/asm-generic/bitops, especially in find.h and atomic.h. Usual functions, with names indicating their role, are:

+
+
    +
  • find_first_zero_bit()
  • +
  • find_first_bit()
  • +
  • set_bit()
  • +
  • clear_bit()
  • +
  • test_and_set_bit()
  • +
  • test_and_clear_bit()
  • +
+
+

These functions usually receive the address of the bitmap, possibly its size (in bits) and, if necessary, the index of the bit that needs to be activated (set) or deactivated (clear).

+

Some usage examples are listed below:

+
unsigned int map;
+unsigned char array_map[NUM_BYTES];
+size_t idx;
+int changed;
+
+/* Find first zero bit in 32 bit integer. */
+idx = find_first_zero_bit(&map, 32);
+printk (KERN_ALERT "The %zu-th bit is the first zero bit.\n", idx);
+
+/* Find first one bit in NUM_BYTES bytes array. */
+idx = find_first_bit(array_map, NUM_BYTES * 8);
+printk (KERN_ALERT "The %zu-th bit is the first one bit.\n", idx);
+
+/*
+ * Clear the idx-th bit in integer.
+ * It is assumed idx is less the number of bits in integer.
+ */
+clear_bit(idx, &map);
+
+/*
+ * Test and set the idx-th bit in array.
+ * It is assumed idx is less the number of bits in array.
+ */
+changed = __test_and_set_bit(idx, &sbi->imap);
+if (changed)
+      printk(KERN_ALERT "%zu-th bit changed\n", idx);
+
+
+
+
+

Further reading

+
    +
  1. Robert Love -- Linux Kernel Development, Second Edition -- Chapter +12. The Virtual Filesystem
  2. +
  3. Understanding the Linux Kernel, 3rd edition - Chapter 12. The Virtual +Filesystem
  4. +
  5. Linux Virtual File System (presentation)
  6. +
  7. Understanding Unix/Linux Filesystem
  8. +
  9. Creating Linux virtual filesystems
  10. +
  11. The Linux Documentation Project - VFS
  12. +
  13. The "Virtual File System" in Linux
  14. +
  15. A Linux Filesystem Tutorial
  16. +
  17. The Linux Virtual File System
  18. +
  19. Documentation/filesystems/vfs.txt
  20. +
  21. File systems sources
  22. +
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is filesystems. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/filesystems/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

Important

+

In this lab, we will continue the implementation of the file systems started in the previous one. +For this, we will generate the laboratory skeleton using the following command:

+
TODO=5 LABS=filesystems make skels
+
+
+

After this, we will start the implementation from TODO 5.

+
+
+

myfs

+

For the exercises below, we will use the myfs file system whose implementation we started with the previous lab. +We stopped after mounting the file system and now we will continue with the operations for regular files and directories. +At the end of these exercises, we will be able to create, modify and delete regular directories and files.

+

We will mostly use the inode and dentry VFS structures. +The inode structure defines a file (of any type: regular, directory, link), while the dentry structure defines a name, which is an entry in a directory.

+

For this we will access the myfs directory in the lab skeleton. +The previously generated skeleton contains the solution for the previous lab; we will start from this. As in the previous lab, we will use the ramfs file system as a starting point.

+
+

1. Directory operations

+

To begin with, we will implement the operations for working with directories. +The operations of creating a file or deleting a file are also directory operations; these operations result in adding or deleting a directory entry (dentry).

+

At the end of this exercise we will be able to create and delete entries in the file system. We will not be able to read and write to regular files; we will do so in the next exercise.

+

Follow directions marked with TODO 5 which will guide you through the steps you need to take.

+

You will need to specify the following directory operations:

+
+
    +
  • create a file (create function)
  • +
  • search (lookup function)
  • +
  • link (link function)
  • +
  • create directory (mkdir function)
  • +
  • deletion (rmdir and unlink functions)
  • +
  • create node (mknod)
  • +
  • rename (rename function)
  • +
+
+

For this, define the myfs_dir_inode_operations structure in the code, where marked with TODO 5. +To begin, just define the structure myfs_dir_inode_operations; you will define the structures myfs_file_operations, myfs_file_inode_operations , and myfs_aops in the next exercise.

+
+

Tip

+

Read the section Directory inodes operations

+

As a model, follow the ramfs_dir_inode_operations structure.

+
+

Implement the mkdir, mknod and create operations inside myfs_mkdir, myfs_mknod and myfs_create. +These operations will allow you to create directories and files in the file system.

+
+

Tip

+

We recommend making the code modular using a mknod function, which you can also use for the next exercise. +For inode reading and allocation, use myfs_get_inode, which is already implemented.

+

As a model, follow the next functions implemented in the ramfs file system:

+
+
    +
  • ramfs_mknod()
  • +
  • ramfs_mkdir()
  • +
  • ramfs_create()
  • +
+
+
+

For the other functions, use generic calls (simple_*) already defined in VFS.

+

In the myfs_get_inode function, initialize the operations fields of the directory inodes:

+
+
    +
  • i_op must be initialized to the address of the structure myfs_dir_inode_operations;
  • +
  • i_fop must be initialized to the address of the structure simple_dir_operations, defined in VFS.
  • +
+
+
+

Note

+

i_op is a pointer to a structure of type struct inode_operations containing operations that have to do with the inode, which are, for a directory, creating a new entry, listing entries, deleting entries, etc.

+

i_fop is a pointer to a structure of type struct file_operations containing operations that have to do with the file structure associated with the inode, such as read, write, and lseek.

+
+
+
Testing
+

Once the module is done, we can test the creation of files and directories. +To do this, we compile the kernel module (using make build) and copy the resulting file (myfs.ko) and the test scripts (test-myfs-{1,2}.sh) in the virtual machine directory (using make copy).

+
+

Note

+

The test scripts are copied to the virtual machine using make copy only if they are executable:

+
student@workstation:~/linux/tools/labs$ chmod +x skels/filesystems/myfs/test-myfs-*.sh
+
+
+
+

After starting the virtual machine, insert the module, create the mount point and mount the file system:

+
# insmod myfs.ko
+# mkdir -p /mnt/myfs
+# mount -t myfs none /mnt/myfs
+
+
+

Now we can create file hierarchies and subdirectories in the mounted directory (/mnt/myfs). +We use commands like the ones below:

+
# touch /mnt/myfs/peanuts.txt
+# mkdir -p /mnt/myfs/mountain/forest
+# touch /mnt/myfs/mountain/forest/tree.txt
+# rm /mnt/myfs/mountain/forest/tree.txt
+# rmdir /mnt/myfs/mountain/forest
+
+
+

At this time we can not read or write files. When running commands such as the following ones we will get errors.

+
# echo "chocolate" > /mnt/myfs/peanuts.txt
+# cat /mnt/myfs/peanuts.txt
+
+
+

This happens because we have not implemented the operations for working with files; we will do so in the next exercise.

+

To unload the kernel module, use the command

+
umount /mnt/myfs
+rmmod myfs
+
+
+

To test the functionality provided by the kernel module, we can use the dedicated script test-myfs-1.sh. +If the implementation is correct, no error messages will be displayed.

+
+
+
+

2. File operations

+

We want to implement the operations for working with files, which are used for accessing a file's content: read, write, truncate, etc. +For this you will specify the operations described in the structures struct inode_operations, struct file_operations and struct address_space_operations.

+

Follow the locations marked with TODO 6 which will guide you through the steps you need to take.

+

Start by defining myfs_file_inode_operations and myfs_file_operations.

+
+

Tip

+

Read the section Regular files inode operations.

+

Use the generic function provided by VFS.

+

An example of implementation is the ramfs file system. +Follow the implementation of ramfs_file_inode_operations and ramfs_file_operations.

+
+

Inside the function myfs_get_inode, initialize the operations fields for the regular file inodes:

+
+
    +
  • i_op must be initialized to myfs_file_inode_operations;
  • +
  • i_fop must be initialized to myfs_file_operations.
  • +
+
+

Continue with defining the structure myfs_aops.

+
+

Tip

+

Read the section Address space operations.

+

Use the generic functions provided by VFS.

+

An implementation example is the ramfs file system: the ramfs_aops structure.

+

You do not need to define the function of type set_page_dirty.

+
+

Initialize the i_mapping->a_ops field of the inode structure to myfs_aops.

+
+
Testing
+

For testing, we use the steps described in the previous exercise. +In addition to those steps, we will now be able to read, write and modify a file using commands like the ones below:

+
# echo "chocolate" > /mnt/myfs/peanuts.txt
+# cat /mnt/myfs/peanuts.txt
+
+
+

To test the functionality provided by the module, we can use the dedicated script:

+
# ./test-myfs-2.sh
+
+
+

If the implementation is correct, no error messages will be displayed when running the above script.

+
+
+
+
+

minfs

+

For the exercises below, we will use the minfs file system whose development started in the previous lab. +This is a file system with disk support. +We stopped after mounting the file system and now we will continue with the operations on regular files and directories. +At the end of these exercises we will be able to create and delete entries in the file system.

+

We will mainly use the inode and dentry VFS structures. +The inode structure defines a file (of any type: regular, directory, link), while the dentry structure defines a name, which is a directory entry.

+

For this we will access the minfs/kernel directory from the laboratory skeleton. +The generated skeleton contains the solution from the previous lab; we will start from this. +As in the previous lab, we will use the minix file system as a starting point.

+

We will use the formatting tool mkfs.minfs in the minfs/user directory which is automatically compiled when running make build and copied to the virtual machine at make copy.

+

The formatting tool prepares a virtual machine disk using a command like

+
# ./mkfs.minfs /dev/vdb
+
+
+

After formatting, the disk has a structure like the one in the diagram below:

+../_images/minfs_arch.png +

As shown in the diagram, minfs is a minimalist file system. +minfs contains a maximum of 32 inodes, each inode having a single data block (the file size is limited to block size). +The super block contains a 32-bit map (imap), each bit indicating the use of an inode.

+
+

Note

+

Before you start working, go through the minfs/kernel/minfs.h header file. +This file contains the structures and macros that will be used in these exercises. +These structures and macros define the file system as described in the diagram above.

+
+
+

1. Iterate operation

+

At first we want to be able to list the contents of the root directory. +For this we must be able to read the entries in the root directory, which means implementing the iterate operation. +The iterate operation is a field within the minfs_dir_operations structure (of type file_operations) and is implemented by the function minfs_readdir. We need to implement this function.

+

Follow directions marked with TODO 5 which will guide you through the steps you need to take.

+
+

Tip

+

Read the section Directory inodes operations

+

As a starting point, follow the minix_readdir() function. +The function is rather complicated, but it gives you an insight into the steps you have to do.

+

Follow, in minfs.c and minfs.h, the definitions of structures struct minfs_inode_info, struct minfs_inode and struct minfs_dir_entry. +You will use them in the minfs_readdir implementation.

+
+

Obtain the inode and the structure struct minfs_inode_info associated with the directory. +The structure struct minfs_inode_info is useful to find out the directory's data block. +From this structure you get the data_block field, representing the data block index on the disk.

+
+

Tip

+

To get the struct minfs_inode_info structure, use list_entry() or container_of().

+
+

Use sb_bread() to read the directory data block.

+
+

Tip

+

The data block of the directory is indicated by the data_block field of the structure struct minfs_inode_info corresponding to the directory.

+

The data in the block is referenced by the b_data field of the buffer_head structure (the usual code will be bh->b_data). +This block (being the data block of a directory) contains an array of at most MINFS_NUM_ENTRIES entries of type struct minfs_dir_entry (directory entries specific to minfs). +Use casting to struct minfs_dir_entry * to work with the data in the block.

+
+

Iterate over all the entries in the data block and fill the user space buffer inside the for loop.

+
+

Tip

+

For each index, get the corresponding entry of type struct minfs_dir_entry by using pointer arithmetic on the bh->b_data field. +Ignore dentries that have an ino field equal to 0. Such a dentry is a free slot in the directory's dentry list.

+

For each valid entry, there is an existing call dir_emit() with the appropriate parameters. This is the call that sends the dentries to the caller (and then to user space).

+

Check the call examples in qnx6_readdir() and minix_readdir().

+
+
+
Testing
+

Once the module is done, we can test the listing of the root directory contents. +To do this, we compile the kernel module (make build) and copy the result to the virtual machine together with the test scripts (minfs/user/test-minfs-{0,1}.sh) and the formatting utility (minfs/user/mkfs.minfs) using make copy, then start the machine.

+
+

Note

+

The test scripts are copied to the virtual machine only if they are executable:

+
student@eg106:~/src/linux/tools/labs$ chmod +x skels/filesystems/minfs/user/test-minfs*.sh
+
+
+
+

After we start the virtual machine, we format the /dev/vdb disk, create the mount point and mount the file system:

+
# ./mkfs.minfs /dev/vdb
+# mkdir -p /mnt/minfs
+# mount -t minfs /dev/vdb /mnt/minfs
+
+
+

Now we can list the contents of the root directory:

+
# ls -l /mnt/minfs
+
+
+

We notice that there is already a file (a.txt); it is created by the formatting utility.

+

We also notice that we are not allowed to display information for a file using the ls command. +This is because we have not implemented the lookup function. We will implement it in the next exercise.

+

To test the functionality provided by the module, we can use the dedicated scripts:

+
# ./test-minfs-0.sh
+# ./test-minfs-1.sh
+
+
+
+
+
+

2. Lookup operation

+

To properly list the contents of a directory, we need to implement the search functionality, i.e. the lookup operation. +The lookup operation is a field within the minfs_dir_inode_operations structure (of type inode_operations) and is implemented by the minfs_lookup function. +This function (minfs_lookup) needs to be implemented. +We will actually implement the minfs_find_entry function called by minfs_lookup.

+

Follow directions marked with TODO 6 which will tell you the steps you need to take.

+
+

Tip

+

Read the section Directory inodes operations

+

As a starting point, read the functions qnx6_find_entry() and minix_find_entry().

+
+

In the minfs_find_entry function, iterate over the directory where the dentry is: dentry->d_parent->d_inode. +Iterating means going through the entries in the directory's data block (of type struct minfs_dir_entry) and locate, if it exists, the requested entry.

+
+

Tip

+

From the structure of type struct minfs_inode_info corresponding to the directory, find out the data block index and read it (sb_bread()). +You will access the block contents using bh->b_data. +The directory data block contains an array of at most MINFS_NUM_ENTRIES entries of type struct minfs_dir_entry. +Use pointer arithmetic to get entries of type struct minfs_dir_entry from the data block (bh->b_data).

+

Check the presence of the name (stored in the local variable name) in the directory (if there is an entry in the data block whose name is a string equal to the given name). Use strcmp() to verify.

+

Ignore dentries that have an ino field equal to 0. Those dentries are free slots in the directory dentry list.

+

Store in the final_de variable the dentry found. +If you do not find any dentry, then the final_de variable will have the value NULL, the value with which it was initialized.

+
+

Comment out the simple_lookup call in the minfs_lookup function so that your implementation (minfs_find_entry) is invoked instead.

+
+
Testing
+

For testing, we use the steps described in the previous exercise. +The long file listing (ls -l) of the contents of a directory (root directory) will display permissions and other file-specific information:

+
# ls -l /mnt/minfs
+
+
+

To test the functionality provided by the module, we can use the dedicated scripts:

+
# ./test-minfs-0.sh
+# ./test-minfs-1.sh
+
+
+

If the implementation is correct, no error messages will be displayed when running the scripts above.

+
+

Note

+

After mounting the file system using the command

+
# mount -t minfs /dev/vdb /mnt/minfs
+
+
+

we try to create a file using the command

+
# touch /mnt/minfs/peanuts.txt
+
+
+

We notice that we get an error because we did not implement the directory operations that allow us to create a file. +We will do this for the next exercise.

+
+
+
+
+

3. Create operation

+

In order to allow the creation of a file in a directory, we must implement the create operation. +The create operation is a field in the minfs_dir_inode_operations structure (of type inode_operations) and is implemented by the minfs_create function. We need to implement this function. +In fact, we will implement the minfs_new_inode function (which creates and initializes an inode) and the minfs_add_link function, which adds a link (or name or dentry) for the created inode.

+

Follow directions marked with TODO 7 which will guide you through the steps you need to take.

+
+

Tip

+

Read the section Directory inodes operations

+

Inspect the code in the minfs_create and the skeleton of functions minfs_new_inode and minfs_add_link.

+
+

Implement the function minfs_new_inode. Inside this function you will create (using new_inode()) and initialize an inode. The initialization is done using the data from disk.

+
+

Tip

+

Use the minix_new_inode() function as a model. +Find the first free inode in imap (sbi->imap). +Use bitwise operations (find_first_zero_bit and set_bit). +Read the Bitmap operations section.

+

The buffer for the superblock (sbi->sbh) must be marked as dirty.

+

You must initialize the usual fields as it is done for the myfs file system. +Initialize the i_mode field to 0 in the call to inode_init_owner. It will be initialized in the caller later.

+
+

Implement the minfs_add_link function. The function adds a new dentry (struct minfs_dir_entry) to the parent directory data block (dentry->d_parent->d_inode).

+
+

Tip

+

Use the minix_add_link function as a model.

+
+

In minfs_add_link we want to find the first free place for the dentry. +For this, you will iterate over the directory data block and you will find the first free entry. A free dentry has the ino field equal to 0.

+
+

Tip

+

In order to work with the directory, get the inode of type struct minfs_inode_info corresponding to the parent directory (the dir inode). +Do not use the variable inode to get struct minfs_inode_info; that inode belongs to the file, not to the parent directory inside which you want to add the link/dentry. +To get the struct minfs_inode_info structure, use container_of().

+

The structure struct minfs_inode_info is useful for finding the directory data block (the one indicated by the dentry->d_parent->d_inode, which is the dir variable). +From this structure, get the data_block field, representing index of the data block on the disk. +This block contains the entries in the directory. Use sb_bread() to read the block and then bh->b_data to refer to the data. +The block contains at most MINFS_NUM_ENTRIES entries of type struct minfs_dir_entry.

+

If all entries are occupied, return -ENOSPC.

+

Iterate over the entries in the data block using the variable de and extract the first free entry (for which the ino field is 0).

+

When you have found a free place, fill in the corresponding entry:

+
+
    +
  • the inode->i_ino field in de->ino
  • +
  • the dentry->d_name.name field in de->name
  • +
+
+

Then mark the buffer dirty.

+
+
+
Testing
+

For testing, we use the steps described in the previous exercise. +Now we can create files within the file system:

+
# touch /mnt/minfs/peanuts.txt
+
+
+

To test the functionality provided by the module, we can use the dedicated script:

+
# ./test-minfs-2.sh
+
+
+

If the implementation is correct, no error messages will be displayed when running the script above.

+
+

Note

+

The current implementation of the minfs file system is not definitive. +To be complete, the implementation needs functions to delete files, create and delete directories, rename entries, and modify the contents of a file.

+
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/infrastructure.html b/refs/pull/405/merge/labs/infrastructure.html new file mode 100644 index 00000000..7f32d602 --- /dev/null +++ b/refs/pull/405/merge/labs/infrastructure.html @@ -0,0 +1,228 @@ + + + + + + Infrastructure — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Infrastructure

+

In order to facilitate learning, each topic has a hands-on exercises +section which will contain in-depth, incremental clues on how to solve +one or multiple tasks. To focus on a particular issue, most of the +tasks will be performed on existing skeleton drivers. Each skeleton +driver has clearly marked sections that need to be filled in order to +complete the tasks.

+

The skeleton drivers are generated from full source examples located +in tools/labs/templates. To solve tasks you start by generating the +skeleton drivers, running the skels target in tools/labs. To +keep the workspace clean it is recommended to generate the skeletons +for one lab only and clean the workspace before starting work on a new +lab. Labs can be selected by using the LABS variable:

+
tools/labs $ make clean
+tools/labs $ LABS=kernel_modules make skels
+
+tools/labs $ ls skels/kernel_modules/
+1-2-test-mod  3-error-mod  4-multi-mod  5-oops-mod  6-cmd-mod  \
+7-list-proc  8-kprobes  9-kdb
+
+
+

You can also use the same variable to generate skeletons for specific +tasks:

+
tools/labs $ LABS="kernel_modules/6-cmd-mod kernel_modules/8-kprobes" make skels
+
+tools/labs$ ls skels/kernel_modules
+6-cmd-mod  8-kprobes
+
+
+

For each task you may have multiple steps to perform, usually +incremental. These steps are marked in the source code as well as in +the lab exercises with the keyword TODO. If we have multiple steps +to perform they will be prefixed by a number, like TODO1, TODO2, +etc. If no number is used it is assumed to be the one and only +step. If you want to resume a task from a certain step, you can do so using +the TODO variable. The following example will generate the +skeleton with the first TODO step resolved:

+
tools/labs $ TODO=2 LABS="kernel_modules/8-kprobes" make skels
+
+
+

Once the skeleton drivers are generated you can build them with the +build make target:

+
tools/labs $ make build
+echo "# autogenerated, do not edit " > skels/Kbuild
+for i in ./kernel_modules/8-kprobes; do echo "obj-m += $i/" >> skels/Kbuild; done
+make -C /home/tavi/src/linux M=/home/tavi/src/linux/tools/labs/skels ARCH=x86 modules
+make[1]: Entering directory '/home/tavi/src/linux'
+CC [M]  /home/tavi/src/linux/tools/labs/skels/./kernel_modules/8-kprobes/kprobes.o
+Building modules, stage 2.
+MODPOST 1 modules
+CC      /home/tavi/src/linux/tools/labs/skels/./kernel_modules/8-kprobes/kprobes.mod.o
+LD [M]  /home/tavi/src/linux/tools/labs/skels/./kernel_modules/8-kprobes/kprobes.ko
+make[1]: Leaving directory '/home/tavi/src/linux'
+
+
+

To copy the drivers to the VM you can either use ssh or update the +VM image directly using the copy target:

+
tools/labs $ make copy
+...
+'skels/kernel_modules/8-kprobes/kprobes.ko' -> '/tmp/tmp.4UMKcISmQM/home/root/skels/kernel_modules/8-kprobes/kprobes.ko'
+
+
+
+

Attention

+

The copy target will fail if the VM is +running. This is intentional so that we avoid corrupting the +filesystem.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/interrupts.html b/refs/pull/405/merge/labs/interrupts.html new file mode 100644 index 00000000..79341d97 --- /dev/null +++ b/refs/pull/405/merge/labs/interrupts.html @@ -0,0 +1,1261 @@ + + + + + + I/O access and Interrupts — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

I/O access and Interrupts

+
+

Lab objectives

+
    +
  • communication with peripheral devices
  • +
  • implement interrupt handlers
  • +
  • synchronizing interrupts with process context
  • +
+

Keywords: IRQ, I/O port, I/O address, base address, UART, request_region, release_region, inb, outb

+
+
+

Background information

+

A peripheral device is controlled by writing and reading its +registers. Often, a device has multiple registers that can be accessed +at consecutive addresses either in the memory address space or in the +I/O address space. Each device connected to the I/O bus has a set of +I/O addresses, called I/O ports. I/O ports can be mapped to physical +memory addresses so that the processor can communicate with the device +through instructions that work directly with the memory. For +simplicity, we will directly use I/O ports (without mapping to physical +memory addresses) to communicate with physical devices.

+

The I/O ports of each device are structured into a set of specialized +registers to provide a uniform programming interface. Thus, most +devices will have the following types of registers:

+
    +
  • Control registers that receive device commands
  • +
  • Status registers, which contain information about the device's +internal status
  • +
  • Input registers from which data is taken from the device
  • +
  • Output registers in which the data is written to transmit it to the +device
  • +
+

Physical ports are differentiated by the number of bits: they can be +8, 16 or 32-bit ports.

+

For example, the parallel port has 8 8-bit I/O ports starting at base +address 0x378. The data register is found at the base address (0x378), the status +register at base address + 1 (0x379), and the control register at base address + 2 +(0x37a). The data register is both an input and an output register.

+

Although there are devices that can be fully controlled using I/O +ports or special memory areas, there are situations where this is +insufficient. The main problem that needs to be addressed is that +certain events occur at undefined moments in time and it is +inefficient for the processor (CPU) to interrogate the status of the +device repeatedly (polling). The way to solve this problem is using an +Interrupt ReQuest (IRQ), which is a hardware notification by which the +processor is notified that a particular external event has happened.

+

For IRQs to be useful device drivers must implement handlers, i.e. a +particular sequence of code that handles the interrupt. Because in +many situations the number of interrupts available is limited, a +device driver must behave in an orderly fashion with interruptions: +interrupts must be requested before being used and released when they +are no longer needed. In addition, in some situations, device drivers +must share an interrupt or synchronize with interrupts. All of these will be +discussed further.

+

When we need to access shared resources between an interrupt +routine (A) and code running in process context or in bottom-half +context (B), we must use a special synchronization technique. In (A) +we need to use a spinlock primitive, and in (B) we must disable +interrupts AND use a spinlock primitive. Disabling interrupts is not +enough because the interrupt routine can run on a processor other than +the one running (B).

+

Using only a spinlock can lead to a deadlock. The classic example of +deadlock in this case is:

+
    +
  1. We run a process on the X processor, and we acquire the lock
  2. +
  3. Before releasing the lock, an interrupt is generated on the X processor
  4. +
  5. The interrupt handling routine will try to acquire the lock and it +will go into an infinite loop
  6. +
+
+
+

Accessing the hardware

+

In Linux, the I/O ports access is implemented on all architectures and +there are several APIs that can be used.

+
+

Request access to I/O ports

+

Before accessing I/O ports we first must request access to them, to +make sure there is only one user. In order to do so, one must use the +request_region() function:

+
#include <linux/ioport.h>
+
+struct resource *request_region(unsigned long first, unsigned long n,
+                                const char *name);
+
+
+

To release a reserved region one must use the release_region() function:

+
void release_region(unsigned long start, unsigned long n);
+
+
+

For example, the serial port COM1 has the base address 0x3F8 and it +has 8 ports and this is a code snippet of how to request access to +these ports:

+
#include <linux/ioport.h>
+
+#define MY_BASEPORT 0x3F8
+#define MY_NR_PORTS 8
+
+if (!request_region(MY_BASEPORT, MY_NR_PORTS, "com1")) {
+     /* handle error */
+     return -ENODEV;
+}
+
+
+

To release the ports one would use something like:

+
release_region(MY_BASEPORT, MY_NR_PORTS);
+
+
+

Most of the time, port requests are done at the driver initialization +or probe time and the port releasing is done at the removal of the +device or module.

+

All of the port requests can be seen from userspace via the +/proc/ioports file:

+
$ cat /proc/ioports
+0000-001f : dma1
+0020-0021 : pic1
+0040-005f : timer
+0060-006f : keyboard
+0070-0077 : rtc
+0080-008f : dma page reg
+00a0-00a1 : pic2
+00c0-00df : dma2
+00f0-00ff : fpu
+0170-0177 : ide1
+01f0-01f7 : ide0
+0376-0376 : ide1
+0378-037a : parport0
+037b-037f : parport0
+03c0-03df : vga+
+03f6-03f6 : ide0
+03f8-03ff : serial
+...
+
+
+
+
+

Accessing I/O ports

+

After a driver has obtained the desired I/O port range, one can +perform read or write operations on these ports. Since physical ports +are differentiated by the number of bits (8, 16, or 32 bits), there +are different port access functions depending on their size. The +following port access functions are defined in asm/io.h:

+
    +
  • unsigned inb(int port), reads one byte (8 bits) from port
  • +
  • void outb(unsigned char byte, int port), writes one byte (8 bits) to port
  • +
  • unsigned inw(int port), reads two bytes (16 bits) from port
  • +
  • void outw(unsigned short word, int port), writes two bytes (16-bits) to port
  • +
  • unsigned inl (int port), reads four bytes (32-bits) from port
  • +
  • void outl(unsigned long word, int port), writes four bytes (32-bits) to port
  • +
+

The port argument specifies the address of the port where the reads or +writes are done, and its type is platform dependent (may be unsigned +long or unsigned short).

+

Some devices may have problems when the processor is trying to +transfer data too fast to and from the device. To avoid this issue we +may need to insert a delay after an I/O operation and there are functions +you can use that introduce this delay. Their names are similar to +those described above, with the exception that it ends in _p: inb_p, +outb_p, etc.

+

For example, the following sequence writes a byte on COM1 serial port +and then reads it:

+
#include <asm/io.h>
+#define MY_BASEPORT 0x3F8
+
+unsigned char value = 0xFF;
+outb(value, MY_BASEPORT);
+value = inb(MY_BASEPORT);
+
+
+
+
+

Accessing I/O ports from userspace

+

Although the functions described above are defined for device drivers, +they can also be used in user space by including the <sys/io.h> +header. In order to be used, ioperm or iopl must first be called to +get permission to perform port operations. The ioperm function obtains +permission for individual ports, while iopl for the entire I/O address +space. To use these features, the user must be root.

+

The following sequence used in user space gets permission for the +first 3 ports of the serial port, and then releases them:

+
#include <sys/io.h>
+#define MY_BASEPORT 0x3F8
+
+if (ioperm(MY_BASEPORT, 3, 1)) {
+     /* handle error */
+}
+
+if (ioperm(MY_BASEPORT, 3, 0)) {
+     /* handle error */
+}
+
+
+

The third parameter of the ioperm function is used to request or +release port permission: 1 to get permission and 0 to release.

+
+
+
+

Interrupt handling

+
+

Requesting an interrupt

+

As with other resources, a driver must gain access to an interrupt +line before it can use it and release it at the end of the execution.

+

In Linux, requesting and releasing an interrupt is done using +the request_irq() and free_irq() functions:

+
#include <linux/interrupt.h>
+
+typedef irqreturn_t (*irq_handler_t)(int, void *);
+
+int request_irq(unsigned int irq_no, irq_handler_t handler,
+                unsigned long flags, const char *dev_name, void *dev_id);
+
+void free_irq(unsigned int irq_no, void *dev_id);
+
+
+

Note that to get an interrupt, the developer calls +request_irq(). When calling this function you must specify the +interrupt number (irq_no), a handler that will be called when the +interrupt is generated (handler), flags that will instruct the +kernel about the desired behaviour (flags), the name of the device +using this interrupt (dev_name), and a pointer that can be +set by the user to any value, and that has no global +significance (dev_id). Most of the time, dev_id will be +a pointer to the device driver's private data. When the interrupt is +released, using the free_irq() function, the developer must +pass the same pointer value (dev_id) along with the same interrupt +number (irq_no). The device name (dev_name) is used to display +statistics in /proc/interrupts.

+

The value that request_irq() returns is 0 if the request was +successful or a negative error code indicating the reason for the +failure. A typical value is -EBUSY which means that the interrupt +was already requested by another device driver.

+

The handler function is executed in interrupt context which means +that we can't call blocking APIs such as mutex_lock() or +msleep(). We must also avoid doing a lot of work in the +interrupt handler and instead use deferred work if needed. The actions +performed in the interrupt handler include reading the device +registers to get the status of the device and acknowledge the +interrupt, operations that most of the time can be performed with +non-blocking calls.

+

There are situations where, although a device uses interrupts, we can't +read the device's registers in a non-blocking mode (for example a +sensor connected to an I2C or SPI bus whose driver does not guarantee +that bus read / write operations are non-blocking). In this +situation, in the interrupt handler, we must schedule an action that runs in process context +(work queue, kernel thread) to access the device's registers. Because +such a situation is relatively common, the kernel provides the +request_threaded_irq() function to write interrupt handling +routines running in two phases: a process context phase and an interrupt +context phase:

+
#include <linux/interrupt.h>
+
+int request_threaded_irq(unsigned int irq, irq_handler_t handler,
+                         irq_handler_t thread_fn,
+                         unsigned long flags, const char *name, void *dev);
+
+
+

handler is the function running in interrupt context, and will +implement critical operations while the thread_fn function runs in +process context and implements the rest of the operations.

+

The flags that can be passed when requesting an interrupt are:

+
    +
  • IRQF_SHARED informs the kernel that the interrupt can be +shared with other devices. If this flag is not set, then if there is +already a handler associated with the requested interrupt, the +request for interrupt will fail. A shared interrupt is handled in a +special way by the kernel: all the associated interrupt handlers +will be executed until the device that generated the interrupt is +identified. But how can a device driver know if the interrupt +handling routine was activated by an interrupt generated by the +device it manages? Virtually all devices that offer interrupt +support have a status register that can be interrogated in the +handling routine to see if the interrupt was or was not generated by +the device (for example, in the case of the 8250 serial port, this +status register is IIR - Interrupt Identification Register). When +requesting a shared interrupt, the dev_id argument must be unique +and it must not be NULL. Usually it is set to the module's private +data.
  • +
  • IRQF_ONESHOT: the interrupt will be reactivated after running the process +context routine; without this flag, the interrupt will be +reactivated after running the handler routine in the context of +the interrupt
  • +
+

Requesting the interrupt can be done either at the initialization of +the driver (init_module()), when the device is probed, or when +the device is used (e.g. during open).

+

The following example performs the interrupt request for the COM1 +serial port:

+
#include <linux/interrupt.h>
+
+#define MY_BASEPORT 0x3F8
+#define MY_IRQ 4
+
+static int my_init(void)
+{
+     [...]
+     struct my_device_data *my_data;
+     int err;
+
+     err = request_irq(MY_IRQ, my_handler, IRQF_SHARED,
+                       "com1", my_data);
+     if (err < 0) {
+         /* handle error*/
+         return err;
+     }
+     [...]
+}
+
+
+

As you can see, the IRQ for serial port COM1 is 4, which is used in +shared mode (IRQF_SHARED).

+
+

Attention

+

When requesting a shared interrupt (IRQF_SHARED) the +dev_id argument can not be NULL.

+
+

To release the interrupt associated with the serial port, the +following operations will be executed:

+
free_irq (MY_IRQ, my_data);
+
+
+

During the initialization function (init_module()), or in the +function that opens the device, interrupts must be activated for the +device. This operation is dependent on the device, but most often +involves setting a bit from the control register.

+

As an example, for the 8250 serial port, the following operations must +be performed to enable interrupts:

+
#include <asm/io.h>
+#define MY_BASEPORT 0x3F8
+
+outb(0x08, MY_BASEPORT+4);
+outb(0x01, MY_BASEPORT+1);
+
+
+

In the above example, two operations are performed:

+
    +
  1. All interruptions are activated by setting bit 3 (Aux Output 2) in +the MCR register - Modem Control Register
  2. +
  3. The RDAI (Received Data Available Interrupt) is activated +by setting the appropriate bit in the IER - Interrupt Enable +Register.
  4. +
+
+
+

Implementing an interrupt handler

+

Let's take a look at the signature of the interrupt handler function:

+
irqreturn_t (*handler)(int irq_no, void *dev_id);
+
+
+

The function receives as parameters the number of the interrupt +(irq_no) and the pointer sent to request_irq() when the +interrupt was requested. The interrupt handling routine must return a +value of type irqreturn_t. For the current kernel +version, there are three valid values: IRQ_NONE, IRQ_HANDLED, +and IRQ_WAKE_THREAD. The device driver must return IRQ_NONE if +it notices that the interrupt has not been generated by the device it +is in charge of. Otherwise, the device driver must return IRQ_HANDLED +if the interrupt can be handled directly from the interrupt context or +IRQ_WAKE_THREAD to schedule the running of the process context +processing function.

+

The skeleton for an interrupt handler is:

+
irqreturn_t my_handler(int irq_no, void *dev_id)
+{
+    struct my_device_data *my_data = (struct my_device_data *) dev_id;
+
+    /* if interrupt is not for this device (shared interrupts) */
+        /* return IRQ_NONE;*/
+
+    /* clear interrupt-pending bit */
+    /* read from device or write to device*/
+
+    return IRQ_HANDLED;
+}
+
+
+

Typically, the first thing done in the interrupt handler is to +determine whether the interrupt was generated by the device that the +driver manages. This usually involves reading information from the device's +registers that indicates whether the device has generated an +interrupt. The second thing is to reset the interrupt pending bit on +the physical device, as most devices will no longer generate +interrupts until this bit has been reset (e.g. for the 8250 +serial port bit 0 in the IIR register must be cleared).

+
+
+

Locking

+

Because the interrupt handlers run in interrupt context the actions +that can be performed are limited: unable to access user space memory, +can't call blocking functions. Also, synchronization using spinlocks is +tricky and can lead to deadlocks if the spinlock used is already +acquired by a process that has been interrupted by the running +handler.

+

However, there are cases where device drivers have to synchronize +using interrupts, such as when data is shared between the interrupt +handler and process context or bottom-half handlers. In these +situations it is necessary to both deactivate the interrupt and use +spinlocks.

+

There are two ways to disable interrupts: disabling all interrupts, at +the processor level, or disabling a particular interrupt at the device +or interrupt controller level. Processor disabling is faster and is +therefore preferred. For this purpose, there are locking functions +that disable and enable interrupts while acquiring and releasing a spinlock at +the same time: spin_lock_irqsave(), +spin_unlock_irqrestore(), spin_lock_irq(), and +spin_unlock_irq():

+
#include <linux/spinlock.h>
+
+void spin_lock_irqsave (spinlock_t * lock, unsigned long flags);
+void spin_unlock_irqrestore (spinlock_t * lock, unsigned long flags);
+
+void spin_lock_irq (spinlock_t * lock);
+void spin_unlock_irq (spinlock_t * lock);
+
+
+

The spin_lock_irqsave() function disables interrupts for the +local processor before it obtains the spinlock; The previous state of +the interrupts is saved in flags.

+

If you are absolutely sure that the interrupts on the current +processor have not already been disabled by someone else and you are +sure you can activate the interrupts when you release the spinlock, +you can use spin_lock_irq().

+

For read / write spinlocks there are similar functions available:

+
    +
  • read_lock_irqsave()
  • +
  • read_unlock_irqrestore()
  • +
  • read_lock_irq()
  • +
  • read_unlock_irq()
  • +
  • write_lock_irqsave()
  • +
  • write_unlock_irqrestore()
  • +
  • write_lock_irq()
  • +
  • write_unlock_irq()
  • +
+

If we want to disable interrupts at the interrupt controller level +(not recommended because disabling a particular interrupt is slower and +we cannot disable shared interrupts) we can do this with +disable_irq(), disable_irq_nosync(), and +enable_irq(). Using these functions will disable the interrupts on +all processors. Calls can be nested: if disable_irq is called twice, +it will require as many calls to enable_irq to enable it. The difference +between disable_irq and disable_irq_nosync is that the first one will +wait for the executing handlers to finish. Because of this, +disable_irq_nosync() is generally faster, but may lead to +races with the interrupt handler, so when not sure use +disable_irq().

+

The following sequence disables and then enables the interrupt for +the COM1 serial port:

+
#define MY_IRQ 4
+
+disable_irq (MY_IRQ);
+enable_irq (MY_IRQ);
+
+
+

It is also possible to disable interrupts at the device level. This +approach is also slower than disabling interrupts at the processor +level, but it works with shared interrupts. The way to accomplish this +is device specific and it usually means we have to clear a bit from +one of the control registers.

+

It is also possible to disable all interrupts for the current +processor independent of taking locks. Disabling all interruptions by +device drivers for synchronization purposes is inappropriate because +races are still possible if the interrupt is handled on another +CPU. For reference, the functions that disable / enable interrupts on +the local processor are local_irq_disable() and +local_irq_enable().

+

In order to use a resource shared between process context and the +interrupt handling routine, the functions described above will be used +as follows:

+
static spinlock_t lock;
+
+/* IRQ handling routine: interrupt context */
+irqreturn_t kbd_interrupt_handle(int irq_no, void * dev_id)
+{
+    ...
+    spin_lock(&lock);
+    /* Critical region - access shared resource */
+    spin_unlock (&lock);
+    ...
+}
+
+/* Process context: Disable interrupts when locking */
+static void my_access(void)
+{
+    unsigned long flags;
+
+    spin_lock_irqsave(&lock, flags);
+    /* Critical region - access shared resource */
+    spin_unlock_irqrestore(&lock, flags);
+
+    ...
+}
+
+void my_init (void)
+{
+    ...
+    spin_lock_init (&lock);
+    ...
+}
+
+
+

The my_access function above runs in process context. To +synchronize access to the shared data, we disable the interrupts and +use the spinlock lock, i.e. the spin_lock_irqsave() and +spin_unlock_irqrestore() functions.

+

In the interrupt handling routine, we use the spin_lock() and +spin_unlock() functions to access the shared resource.

+
+

Note

+

The flags argument for spin_lock_irqsave() and +spin_unlock_irqrestore() is a value and not a pointer, but keep +in mind that the spin_lock_irqsave() function changes the value of +flags, since this is actually a macro.

+
+
+
+

Interrupt statistics

+

Information and statistics about system interrupts can be found in +/proc/interrupts or /proc/stat. Only system interrupts with +associated interrupt handlers appear in /proc/interrupts:

+
# cat /proc/interrupts
+                CPU0
+0:           7514294       IO-APIC-edge   timer
+1:              4528       IO-APIC-edge   i8042
+6:                 2       IO-APIC-edge   floppy
+8:                 1       IO-APIC-edge   rtc
+9:                 0       IO-APIC-level  acpi
+12:             2301       IO-APIC-edge   i8042
+15:               41       IO-APIC-edge   ide1
+16:             3230       IO-APIC-level  ioc0
+17:             1016       IO-APIC-level  vmxnet ether
+NMI:               0
+LOC:         7229438
+ERR:               0
+MIS:               0
+
+
+

The first column specifies the IRQ associated with the interrupt. The +following column shows the number of interrupts that were generated +for each processor in the system; The last two columns provide +information about the interrupt controller and the device name that +registered the handler for that interrupt.

+

The /proc/stat file provides information about system activity, +including the number of interrupts generated since the last (re)boot +of the system:

+
# cat /proc/stat | grep in
+intr 7765626 7754228 4620 0 0 0 0 2 0 1 0 0 0 2377 0 0 41 3259 1098 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+
+
+

Each line in the /proc/stat file begins with a keyword that +specifies the meaning of the information on the line. For information +on interrupts, this keyword is intr. The first number on the line +represents the total number of interrupts, and the other numbers +represent the number of interrupts for each IRQ, starting at 0. The +counter includes the number of interrupts for all processors in the +system.

+
+
+
+

Further reading

+ + +
+

Keyboard controller

+
    +
  • Intel 8042
  • +
  • drivers/input/serio/i8042.c
  • +
  • drivers/input/keyboard/atkbd.c
  • +
+
+ +
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is interrupts. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/interrupts/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using LXR, find the definitions of the following symbols in the Linux kernel:

+
    +
  • struct resource
  • +
  • request_region() and __request_region()
  • +
  • request_irq() and request_threaded_irq()
  • +
  • inb() for the x86 architecture.
  • +
+

Analyze the following Linux code:

+
    +
  • Keyboard initialization function i8042_setup_kbd()
  • +
  • The AT or PS/2 keyboard interrupt function atkbd_interrupt()
  • +
+
+
+

Keyboard driver

+

The next exercise's objective is to create a driver that uses the +keyboard IRQ, inspects the incoming key codes and stores them in a +buffer. The buffer will be accessible from userspace via a character +device driver.

+
+
+

1. Request the I/O ports

+

To start with, we aim to allocate memory in the I/O space for hardware +devices. We will see that we cannot allocate space for the keyboard +because the designated region is already allocated. Then we will allocate +I/O space for unused ports.

+

The kbd.c file contains a skeleton for the keyboard driver. Browse +the source code and inspect kbd_init(). Notice that the I/O +ports we need are I8042_STATUS_REG and I8042_DATA_REG.

+

Follow the sections marked with TODO 1 in the skeleton. Request the I/O +ports in kbd_init() and make sure to check for errors and to properly +clean up in case of errors. When requesting, set the reserving caller's ID +string (name) with the MODULE_NAME macro. Also, add code to release the I/O +ports in kbd_exit().

+
+

Note

+

You can review the Request access to I/O ports section before +proceeding.

+
+

Now build the module and copy it to the VM image:

+
tools/labs $ make build
+tools/labs $ make copy
+
+
+

Now start the VM and insert the module:

+
root@qemux86:~# insmod skels/interrupts/kbd.ko
+kbd: loading out-of-tree module taints kernel.
+insmod: can't insert 'skels/interrupts/kbd.ko': Device or resource busy
+
+
+

Notice that you get an error when trying to request the I/O +ports. This is because we already have a driver that has requested the +I/O ports. To validate check the /proc/ioports file for the +STATUS_REG and DATA_REG values:

+
root@qemux86:~# cat /proc/ioports | egrep "(0060|0064)"
+0060-0060 : keyboard
+0064-0064 : keyboard
+
+
+

Let's find out which driver registers these ports and try to remove the +module associated with it.

+
$ find -name \*.c | xargs grep \"keyboard\"
+
+find -name \*.c | xargs grep \"keyboard\" | egrep '(0x60|0x64)'
+...
+./arch/x86/kernel/setup.c:{ .name = "keyboard", .start = 0x60, .end = 0x60,
+./arch/x86/kernel/setup.c:{ .name = "keyboard", .start = 0x64, .end = 0x64
+
+
+

It looks like the I/O ports are registered by the kernel during the +boot, and we won't be able to remove the associated module. Instead, +let's trick the kernel and register ports 0x61 and 0x65.

+

Use the function request_region() (inside the kbd_init() +function) to allocate the ports and the function release_region() +(inside the kbd_exit() function) to release the allocated memory.

+

This time we can load the module and /proc/ioports shows that the +owner of these ports is our module:

+
root@qemux86:~# insmod skels/interrupts/kbd.ko
+kbd: loading out-of-tree module taints kernel.
+Driver kbd loaded
+root@qemux86:~# cat /proc/ioports | grep kbd
+0061-0061 : kbd
+0065-0065 : kbd
+
+
+

Let's remove the module and check that the I/O ports are released:

+
root@qemux86:~# rmmod kbd
+Driver kbd unloaded
+root@qemux86:~# cat /proc/ioports | grep kbd
+root@qemux86:~#
+
+
+
+
+

2. Interrupt handling routine

+

For this task we will implement and register an interrupt handler for +the keyboard interrupt. You can review the Requesting an interrupt +section before proceeding.

+

Follow the sections marked with TODO 2 in the skeleton.

+

First, define an empty interrupt handling routine named +kbd_interrupt_handler().

+
+

Note

+

Since we already have a driver that uses this interrupt we +should report the interrupt as not handled (i.e. return +IRQ_NONE) so that the original driver still has a +chance to process it.

+
+

Then register the interrupt handler routine using +request_irq. The interrupt number is defined by the +I8042_KBD_IRQ macro. The interrupt handling routine must be +requested with IRQF_SHARED to share the interrupt line with +the keyboard driver (i8042).

+
+

Note

+

For shared interrupts, dev_id cannot be NULL. Use +&devs[0], that is, a pointer to struct kbd. This +structure contains all the information needed for device +management. To see the interrupt in /proc/interrupts, do +not use NULL for dev_name. You can use the MODULE_NAME +macro.

+

If requesting the interrupt fails, make sure to properly +clean up by jumping to the right label, in this case the one +that releases the I/O ports and continues with unregistering +the character device driver.

+
+

Compile, copy and load module in the kernel. Check that the interrupt +line has been registered by looking at /proc/interrupts . Determine +the IRQ number from the source code (see I8042_KBD_IRQ) and verify +that there are two drivers registered at this interrupt line (which +means that we have a shared interrupt line): the i8042 initial driver +and our driver.

+
+

Note

+

More details about the format of the /proc/interrupts can +be found in the Interrupt statistics section.

+
+

Print a message inside the routine to make sure it is called. Compile +and reload the module into the kernel. Check that the interrupt handling +routine is called when you press the keyboard on the virtual machine, +using dmesg. Also note that when you use the serial port no +keyboard interrupt is generated.

+
+

Attention

+

To get access to the keyboard on the virtual machine +boot with "QEMU_DISPLAY=gtk make boot".

+
+
+
+

3. Store ASCII keys to buffer

+

Next, we want to collect the keystrokes in a buffer whose content we +will then send to the user space. For this routine we will add the +following in the interrupt handling:

+
    +
  • capture the pressed keys (only pressed, ignore released)
  • +
  • identify the ASCII characters.
  • +
  • copy the ASCII characters corresponding to the keystrokes and store +them in the buffer of the device
  • +
+

Follow the sections marked TODO 3 in the skeleton.

+
+

Reading the data register

+

First, fill in the i8042_read_data() function to read the +I8042_DATA_REG of the keyboard controller. The function +just needs to return the value of the register. The value of the +register is also called the scancode, which is what is generated at each +keystroke.

+
+

Hint

+

Read the I8042_DATA_REG register using inb() and +store the value in the local variable val. +Revisit the Accessing I/O ports section.

+
+

Call the i8042_read_data() in the +kbd_interrupt_handler() and print the value read.

+

Print information about the keystrokes in the following format:

+
pr_info("IRQ:% d, scancode = 0x%x (%u,%c)\n",
+   irq_no, scancode, scancode, scancode);
+
+
+

Where scancode is the value of the read register using the +i8042_read_data() function.

+

Notice that the scancode (reading of the read register) is not an ASCII +character of the pressed key. We'll have to understand the scancode.

+
+
+

Interpreting the scancode

+

Note that the register value is a scancode, not the ASCII value of the +character pressed. Also note that an interrupt is sent both when the +key is pressed and when the key is released. We only need to select +the code when the key is pressed and then decode the ASCII +character.

+
+

Note

+

To check scancode, we can use the showkey command (showkey +-s).

+

In this form, the command will display the key scancodes for +10 seconds after the last pressed key and then it will +stop. If you press and release a key you will get two +scancodes: one for the pressed key and one for the released +key. E.g.:

+
    +
  • If you press the ENTER key, you will get the 0x1c (key pressed) +and 0x9c (for the released key)

    +
  • +
  • If you press the key a you will get the 0x1e (key pressed) +and 0x9e (for the key release)

    +
  • +
  • If you press b you will get 0x30 (key pressed) and 0xb0 +(for the release key)

    +
  • +
  • If you press the c key, you will get the 0x2e (key +pressed) and 0xae (for the released key)

    +
  • +
  • If you press the Shift key you will get the 0x2a (key +pressed) and 0xaa (for the released key)

    +
  • +
  • If you press the Ctrl key you will get the 0x1d (key +pressed) and 0x9d (for the release key)

    +

    As also indicated in this article, a key +release scancode is 128 (0x80) higher than a key press +scancode. This is how we can distinguish between a key +press scancode and a key release scancode.

    +

    A scancode is translated into a keycode that matches a +key. A key press scancode and a key release scancode +have the same keycode. For the keys shown above we have +the following table:

    + ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    KeyKey Press ScancodeKey Release ScancodeKeycode
    ENTER0x1c0x9c0x1c (28)
    a0x1e0x9e0x1e (30)
    b0x300xb00x30 (48)
    c0x2e0xae0x2e (46)
    Shift0x2a0xaa0x2a (42)
    Ctrl0x1d0x9d0x1d (29)
    +

    The press / release check is performed in the is_key_press() +function and obtaining the ASCII character of a scancode +takes place in the get_ascii() function.

    +
  • +
+
+

In the interrupt handler check the scancode to see if the key is +pressed or released then determine the corresponding ASCII +character.

+
+

Hint

+

To check for press / release, use is_key_press(). +Use get_ascii() function to get the corresponding +ASCII code. Both functions expect the scancode.

+
+
+

Hint

+

To display the received information use the following +format.

+
pr_info("IRQ %d: scancode=0x%x (%u) pressed=%d ch=%c\n",
+        irq_no, scancode, scancode, pressed, ch);
+
+
+

Where scancode is the value of the data register, and ch is +the value returned by the get_ascii() function.

+
+
+
+

Store characters to the buffer

+

We want to collect the pressed characters (not the other keys) into +a circular buffer that can be consumed from user space.

+

Update the interrupt handler to add a pressed ASCII character to the +end of the device buffer. If the buffer is full, the character will be +discarded.

+
+

Hint

+

The device buffer is the field buf in the device's +struct kbd. To get the device data from the interrupt handler +use the following construct:

+
struct kbd *data = (struct kbd *) dev_id;
+
+
+

The buffer's dimension is located in struct kbd's field, +count. The put_idx and get_idx fields +specify the next writing and reading index. Take a look at the +put_char() function's implementation to observe how the data is +added to the circular buffer.

+
+
+

Attention

+

Synchronize the access to the buffer and the helper +indexes with a spinlock. +Define the spinlock in the device struct struct kbd +and initialize it in kbd_init().

+

Use the spin_lock() and spin_unlock() functions +to protect the buffer in the interrupt handler.

+

Revisit the Locking section.

+
+
+
+
+

4. Reading the buffer

+

In order to have access to the keylogger's data, we have to send it to +the user space. We will do this using the /dev/kbd character device. When +reading from this device, we will get the data from the buffer in the kernel +space, where we collected the keys pressed.

+

For this step +follow the sections marked with TODO 4 in the kbd_read() function.

+

Implement get_char() in a similar way to put_char(). Be careful +when implementing the circular buffer.

+

In the kbd_read() function copy the data from the buffer to the +userspace buffer.

+
+

Hint

+

Use get_char() to read a character from the buffer +and put_user() to store it to the user buffer.

+
+
+

Attention

+

In the read function, use spin_lock_irqsave() and +spin_unlock_irqrestore() for locking.

+

Revisit the Locking section.

+
+
+

Attention

+

We cannot use put_user() or copy_to_user() +while holding the lock, as userspace access is not permitted from +atomic contexts.

+

For more info, read the Access to the address space of the +process section in the +previous lab.

+
+

For testing, you will need to create the /dev/kbd character device +using the mknod command before reading from it. The device major and +minor are defined as KBD_MAJOR and KBD_MINOR:

+
mknod /dev/kbd c 42 0
+
+
+

Build, copy and boot the virtual machine and load the module. Test it +using the command:

+
cat /dev/kbd
+
+
+
+
+

5. Reset the buffer

+

Reset the buffer if the device is written to. For this step follow the +sections marked with TODO 5 in the skeleton.

+

Implement reset_buffer() and add the write operation to kbd_fops.

+
+

Attention

+

In the write function, use spin_lock_irqsave() and +spin_unlock_irqrestore() for locking when resetting the +buffer.

+

Revisit the Locking section.

+
+

For testing, you will need to create the /dev/kbd character device +using the mknod command before reading from it. The device major and +minor are defined as KBD_MAJOR and KBD_MINOR:

+
mknod /dev/kbd c 42 0
+
+
+

Build, copy and boot the virtual machine and load the module. +Test it using the command:

+
cat /dev/kbd
+
+
+

Press some keys, then run the command echo "clear" > /dev/kbd. +Check the buffer's content again. It should be reset.

+
+
+
+

Extra Exercises

+
+

1. kfifo

+

Implement a keylogger using the +kfifo API.

+
+

Hint

+

Follow the API call examples from the kernel code. +For example, the file bytestream-examples.c.

+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/introduction.html b/refs/pull/405/merge/labs/introduction.html new file mode 100644 index 00000000..2c3de3f5 --- /dev/null +++ b/refs/pull/405/merge/labs/introduction.html @@ -0,0 +1,917 @@ + + + + + + Introduction — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Introduction

+
+

Lab objectives

+
    +
  • presenting the rules and objectives of the Operating Systems 2 lab
  • +
  • introducing the lab documentation
  • +
  • introducing the Linux kernel and related resources
  • +
+
+
+

Keywords

+
    +
  • kernel, kernel programming
  • +
  • Linux, vanilla, http://www.kernel.org
  • +
  • cscope, LXR
  • +
  • gdb, /proc/kcore, addr2line, dump_stack
  • +
+
+
+

About this laboratory

+

The Operating Systems 2 lab is a kernel programming and driver development lab. +The objectives of the laboratory are:

+
    +
  • deepening the notions presented in the course
  • +
  • presentation of kernel programming interfaces (kernel API)
  • +
  • gaining documenting, development and debugging skills on a freestanding +environment
  • +
  • acquiring knowledge and skills for drivers development
  • +
+

A laboratory will present a set of concepts, applications and commands +specific to a given problem. The lab will start with a presentation +(each lab will have a set of slides) (15 minutes) and the remaining +time will be allocated to the lab exercises (80 minutes).

+

For best laboratory performance, we recommend that you read the related slides. +To fully understand a laboratory, we recommend going through the lab support. For +in-depth study, use the supporting documentation.

+
+ +
+

Source code navigation

+
+

cscope

+

Cscope is a tool for +efficient navigation of C sources. To use it, a cscope database must +be generated from the existing sources. In a Linux tree, the command +make ARCH=x86 cscope is sufficient. Specification of the +architecture through the ARCH variable is optional but recommended; +otherwise, some architecture dependent functions will appear multiple +times in the database.

+

You can build the cscope database with the command make +ARCH=x86 COMPILED_SOURCE=1 cscope. This way, the cscope database will +only contain symbols that have already been used in the compile +process before, thus resulting in better performance when searching +for symbols.

+

Cscope can also be used as stand-alone, but it is more useful when +combined with an editor. To use cscope with vim, it is necessary to +install both packages and add the following lines to the file +.vimrc (the machine in the lab already has the settings):

+
if has("cscope")
+        " Look for a 'cscope.out' file starting from the current directory,
+        " going up to the root directory.
+        let s:dirs = split(getcwd(), "/")
+        while s:dirs != []
+                let s:path = "/" . join(s:dirs, "/")
+                if (filereadable(s:path . "/cscope.out"))
+                        execute "cs add " . s:path . "/cscope.out " . s:path . " -v"
+                        break
+                endif
+                let s:dirs = s:dirs[:-2]
+        endwhile
+
+        set csto=0  " Use cscope first, then ctags
+        set cst     " Only search cscope
+        set csverb  " Make cs verbose
+
+        nmap `<C-\>`s :cs find s `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap `<C-\>`g :cs find g `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap `<C-\>`c :cs find c `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap `<C-\>`t :cs find t `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap `<C-\>`e :cs find e `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap `<C-\>`f :cs find f `<C-R>`=expand("`<cfile>`")`<CR>``<CR>`
+        nmap `<C-\>`i :cs find i ^`<C-R>`=expand("`<cfile>`")`<CR>`$`<CR>`
+        nmap `<C-\>`d :cs find d `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap <F6> :cnext <CR>
+        nmap <F5> :cprev <CR>
+
+        " Open a quickfix window for the following queries.
+        set cscopequickfix=s-,c-,d-,i-,t-,e-,g-
+endif
+
+
+

The script searches for a file called cscope.out in the current directory, or +in parent directories. If vim finds this file, you can use the shortcut Ctrl +] +or Ctrl+\ g (the combination control-\ followed by g) to jump directly to +the definition of the word under the cursor (function, variable, structure, etc.). +Similarly, you can use Ctrl+\ s to go where the word under the cursor is used.

+

You can take a cscope-enabled .vimrc file (also contains other goodies) from +https://github.com/ddvlad/cfg/blob/master/_vimrc. +The following guidelines are based on this file, but also show basic vim commands +that have the same effect.

+

If there is more than one result (usually there is), you can move between them +using F6 and F5 (:cnext and :cprev). +You can also open a new panel showing the results using :copen. To close +the panel, use the :cclose command.

+

To return to the previous location, use Ctrl+o (o, not zero). +The command can be used multiple times and works even if cscope changed the +file you are currently editing.

+

To go to a symbol definition directly when vim starts, use vim -t <symbol_name> +(for example vim -t task_struct). Otherwise, if you started vim and want +to search for a symbol by name, use cs find g <symbol_name> (for example +cs find g task_struct).

+

If you found more than one result and opened a panel showing all the matches +(using :copen) and you want to find a symbol of type structure, +it is recommended to search in the results panel (using / -- slash) +the character { (opening brace).

+
+

Important

+

You can get a summary of all the cscope commands using :cs help.

+

For more info, use the vim built-in help command: :h cscope or :h copen.

+
+

If you use emacs, install the xcscope-el package and +add the following lines in ~/.emacs.

+
(require 'xcscope)
+(cscope-setup)
+
+
+

These commands will activate cscope for the C and C++ modes automatically. +C-c s is the key binding prefix and C-c s s is used to +search for a symbol (if you call it when the cursor is over a word, +it will use that). For more details, check https://github.com/dkogan/xcscope.el

+
+
+

clangd

+

Clangd is a language server that provides tools +for navigating C and C++ code. +Language Server Protocol +facilitates features like go-to-definition, find-references, hover, completion, etc., +using semantic whole project analysis.

+

Clangd requires a compilation database to understand the kernel source code. +It can be generated with:

+
make defconfig
+make
+scripts/clang-tools/gen_compile_commands.py
+
+
+

LSP clients:

+ +
+
+

Kscope

+

For a simpler interface, Kscope +is a cscope frontend which uses Qt. It is lightweight, very fast and very +easy to use. It allows searching using regular expressions, call graphs, etc. +Kscope is no longer maintained.

+

There is also a port +of version 1.6 for Qt4 and KDE 4 which keeps the integration of the text +editor Kate and is easier to use than the last version on SourceForge.

+
+
+

LXR Cross-Reference

+

LXR (LXR Cross-Reference) is a tool that allows indexing and +referencing the symbols in the source code of a program using +a web interface. The web interface shows links to +locations in files where a symbol is defined or used. Development website +for LXR is http://sourceforge.net/projects/lxr. Similar tools +are OpenGrok and +Gonzui.

+

Although LXR was originally intended for the Linux kernel sources, it is +also used in the sources of Mozilla, +Apache HTTP Server and +FreeBSD.

+

There are a number of sites that use LXR for cross-referencing +the sources of the Linux kernel, the main site being the original site of +development which does not work anymore. You can +use https://elixir.bootlin.com/.

+

LXR allows searching for an identifier (symbol), by free text +or by file name. The main feature and, at the same +time, the main advantage provided is the ease of finding the declaration +of any global identifier. This way, it facilitates quick access to function +declarations, variables, macro definitions and the code can be easily +navigated. Also, the fact that it can detect what code areas are affected +when a variable or function is changed is a real advantage in the development +and debugging phase.

+
+
+

SourceWeb

+

SourceWeb is a source code indexer +for C and C++. It uses the +framework +provided by the Clang compiler to index the code.

+

The main difference between cscope and SourceWeb is the fact that SourceWeb +is, in a way, a compiler pass. SourceWeb doesn't index all the code, but +only the code that was effectively compiled by the compiler. This way, some +problems are eliminated, such as ambiguities about which variant of a function +defined in multiple places is used. This also means that the indexing takes +more time, because the compiled files must pass one more time through +the indexer to generate the references.

+

Usage example:

+
make oldconfig
+sw-btrace make -j4
+sw-btrace-to-compile-db
+sw-clang-indexer --index-project
+sourceweb index
+
+
+

sw-btrace is a script that adds the libsw-btrace.so +library to LD_PRELOAD. This way, the library is loaded by +every process started by make (basically, the compiler), +registers the commands used to start the processes and generates +a file called btrace.log. This file is then used by +sw-btrace-to-compile-db which converts it to a format defined +by clang: JSON Compilation Database. +This JSON Compilation Database resulted from the above steps is then +used by the indexer, which makes one more pass through the compiled +source files and generates the index used by the GUI.

+

Word of advice: don't index the sources you are working with, but use +a copy, because SourceWeb doesn't have, at this moment, the capability +to regenerate the index for a single file and you will have to regenerate +the complete index.

+
+
+
+

Kernel Debugging

+

Debugging a kernel is a much more difficult process than the debugging +of a program, because there is no support from the operating system. +This is why this process is usually done using two computers, connected +on serial interfaces.

+
+

gdb (Linux)

+

A simpler debug method on Linux, but with many disadvantages, +is local debugging, using gdb, +the uncompressed kernel image (vmlinux) and /proc/kcore +(the real-time kernel image). This method is usually used to inspect +the kernel and detect certain inconsistencies while it runs. The +method is useful especially if the kernel was compiled using the +-g option, which keeps debug information. Some well-known +debug techniques can't be used with this method, such as breakpoints +or data modification.

+
+

Note

+

Because /proc is a virtual filesystem, /proc/kcore +does not physically exist on the disk. It is generated on-the-fly +by the kernel when a program tries to access /proc/kcore.

+

It is used for debugging purposes.

+

From man proc, we have:

+
/proc/kcore
+This file represents the physical memory of the system and is stored in the ELF core file format.  With this pseudo-file, and
+an unstripped kernel (/usr/src/linux/vmlinux) binary, GDB can be used to examine the current state of any kernel data struc‐
+tures.
+
+
+
+

The uncompressed kernel image offers information about the data structures +and symbols it contains.

+
student@eg106$ cd ~/src/linux
+student@eg106$ file vmlinux
+vmlinux: ELF 32-bit LSB executable, Intel 80386, ...
+student@eg106$ nm vmlinux | grep sys_call_table
+c02e535c R sys_call_table
+student@eg106$ cat System.map | grep sys_call_table
+c02e535c R sys_call_table
+
+
+

The nm utility is used to show the symbols in an object or +executable file. In our case, vmlinux is an ELF file. Alternately, +we can use the file System.map to view information about the +symbols in kernel.

+

Then we use gdb to inspect the symbols using the uncompressed +kernel image. A simple gdb session is the following:

+
student@eg106$ cd ~/src/linux
+student@eg106$ gdb --quiet vmlinux
+Using host libthread_db library "/lib/tls/libthread_db.so.1".
+(gdb) x/x 0xc02e535c
+0xc02e535c `<sys_call_table>`:    0xc011bc58
+(gdb) x/16 0xc02e535c
+0xc02e535c `<sys_call_table>`:    0xc011bc58      0xc011482a      0xc01013d3     0xc014363d
+0xc02e536c `<sys_call_table+16>`: 0xc014369f      0xc0142d4e      0xc0142de5     0xc011548b
+0xc02e537c `<sys_call_table+32>`: 0xc0142d7d      0xc01507a1      0xc015042c     0xc0101431
+0xc02e538c `<sys_call_table+48>`: 0xc014249e      0xc0115c6c      0xc014fee7     0xc0142725
+(gdb) x/x sys_call_table
+0xc011bc58 `<sys_restart_syscall>`:       0xffe000ba
+(gdb) x/x &sys_call_table
+0xc02e535c `<sys_call_table>`:    0xc011bc58
+(gdb) x/16 &sys_call_table
+0xc02e535c `<sys_call_table>`:    0xc011bc58      0xc011482a      0xc01013d3     0xc014363d
+0xc02e536c `<sys_call_table+16>`: 0xc014369f      0xc0142d4e      0xc0142de5     0xc011548b
+0xc02e537c `<sys_call_table+32>`: 0xc0142d7d      0xc01507a1      0xc015042c     0xc0101431
+0xc02e538c `<sys_call_table+48>`: 0xc014249e      0xc0115c6c      0xc014fee7     0xc0142725
+(gdb) x/x sys_fork
+0xc01013d3 `<sys_fork>`:  0x3824548b
+(gdb) disass sys_fork
+Dump of assembler code for function sys_fork:
+0xc01013d3 `<sys_fork+0>`:        mov    0x38(%esp),%edx
+0xc01013d7 `<sys_fork+4>`:        mov    $0x11,%eax
+0xc01013dc `<sys_fork+9>`:        push   $0x0
+0xc01013de `<sys_fork+11>`:       push   $0x0
+0xc01013e0 `<sys_fork+13>`:       push   $0x0
+0xc01013e2 `<sys_fork+15>`:       lea    0x10(%esp),%ecx
+0xc01013e6 `<sys_fork+19>`:       call   0xc0111aab `<do_fork>`
+0xc01013eb `<sys_fork+24>`:       add    $0xc,%esp
+0xc01013ee `<sys_fork+27>`:       ret
+End of assembler dump.
+
+
+

It can be noticed that the uncompressed kernel image was used as an argument +for gdb. The image can be found in the root of the kernel sources +after compilation.

+

A few commands used for debugging using gdb are:

+
    +
  • x (examine) - Used to show the contents of the memory area +whose address is specified as an argument to the command (this address +can be the value of a physical address, a symbol or the address of a +symbol). It can take as arguments (preceded by /): the format +to display the data in (x for hexadecimal, d for +decimal, etc.), how many memory units to display and the size of a +memory unit.
  • +
  • disassemble - Used to disassemble a function.
  • +
  • p (print) - Used to evaluate and show the value of an +expression. The format to show the data in can be specified as +an argument (/x for hexadecimal, /d for decimal, etc.).
  • +
+

The analysis of the kernel image is a method of static analysis. If we +want to perform dynamic analysis (analyzing how the kernel runs, not +only its static image) we can use /proc/kcore; this is a dynamic +image (in memory) of the kernel.

+
student@eg106$ gdb ~/src/linux/vmlinux /proc/kcore
+Core was generated by `root=/dev/hda3 ro'.
+#0  0x00000000 in ?? ()
+(gdb) p sys_call_table
+$1 = -1072579496
+(gdb) p /x sys_call_table
+$2 = 0xc011bc58
+(gdb) p /x &sys_call_table
+$3 = 0xc02e535c
+(gdb) x/16 &sys_call_table
+0xc02e535c `<sys_call_table>`:    0xc011bc58      0xc011482a      0xc01013d3     0xc014363d
+0xc02e536c `<sys_call_table+16>`: 0xc014369f      0xc0142d4e      0xc0142de5     0xc011548b
+0xc02e537c `<sys_call_table+32>`: 0xc0142d7d      0xc01507a1      0xc015042c     0xc0101431
+0xc02e538c `<sys_call_table+48>`: 0xc014249e      0xc0115c6c      0xc014fee7     0xc0142725
+
+
+

Using the dynamic image of the kernel is useful for detecting rootkits.

+ +
+
+

Getting a stack trace

+

Sometimes, you will want information about the path the execution +takes to reach a certain point. You can determine this information using +cscope or LXR, but some functions are called from many +execution paths, which makes this method difficult.

+

In these situations, it is useful to get a stack trace, which can be +simply done using the function dump_stack().

+
+
+
+

Documentation

+

Kernel development is a difficult process, compared to user space +programming. The API is different and the complexity of the subsystems +in kernel requires additional preparation. The associated documentation +is heterogeneous, sometimes requiring the inspection of multiple sources +to have a more complete understanding of a certain aspect.

+

The main advantages of the Linux kernel are the access to sources and +the open development system. Because of this, the Internet offers a +large amount of documentation for the kernel.

+

A few links related to the Linux kernel are shown below:

+ +

The links are not comprehensive. Using the Internet and +the kernel source code is essential.

+
+
+

Exercises

+
+

Remarks

+
+

Note

+
    +
  • Usually, the steps used to develop a kernel module are the +following:
      +
    • editing the module source code (on the physical machine);
    • +
    • module compilation (on the physical machine);
    • +
    • generation of the minimal image for the virtual machine; +this image contains the kernel, your module, busybox and +eventually test programs;
    • +
    • starting the virtual machine using QEMU;
    • +
    • running the tests in the virtual machine.
    • +
    +
  • +
  • When using cscope, use ~/src/linux. +If there is no cscope.out file, you can generate it using +the command make ARCH=x86 cscope.
  • +
  • You can find more details about the virtual machine at +Recommended Setup.
  • +
+
+
+

Important

+

Before solving an exercise, carefully read all its bullets.

+
+
+
+

Booting the virtual machine

+

A summary of the virtual machine infrastructure:

+
    +
  • ~/src/linux - Linux kernel sources, needed to +compile modules. The directory contains the file cscope.out, +used for navigation in the source tree.
  • +
  • ~/src/linux/tools/labs/qemu - scripts and auxiliary +files used to generate and run the QEMU VM.
  • +
+

To start the VM, run make boot in the directory ~/src/linux/tools/labs:

+
student@eg106:~$ cd ~/src/linux/tools/labs
+student@eg106:~/src/linux/tools/labs$ make boot
+
+
+

By default, you will not get a prompt or any graphical interface, but you can connect to +a console exposed by the virtual machine using minicom or screen.

+
student@eg106:~/src/linux/tools/labs$ minicom -D serial.pts
+
+<press enter>
+
+qemux86 login:
+Poky (Yocto Project Reference Distro) 2.3 qemux86 /dev/hvc0
+
+
+

Alternatively, you can start the virtual machine with graphical interface support, using +the QEMU_DISPLAY=gtk make boot command.

+
+

Note

+

To access the virtual machine, at the login prompt, enter the +username root; there is no need to enter a password. +The virtual machine will start with the permissions of the +root account.

+
+
+
+

Adding and using a virtual disk

+
+

Note

+

If you don't have the file mydisk.img, you can download +it from the address http://elf.cs.pub.ro/so2/res/laboratoare/mydisk.img. +The file must be placed in tools/labs.

+
+

In the ~/src/linux/tools/labs directory, you have a new virtual +machine disk, in the file mydisk.img. We want to add the disk +to the virtual machine and use it within the virtual machine.

+

Edit qemu/Makefile and add -drive file=mydisk.img,if=virtio,format=raw +to the QEMU_OPTS variable.

+
+

Note

+

There are already two disks added to qemu (disk1.img and disk2.img). You will need +to add the new one after them. In this case, the new disk can be accessed as +/dev/vdd (vda is the root partition, vdb is disk1 and vdc is disk2).

+
+
+

Hint

+

You do not need to manually create the entry for the new disk in /dev +because the virtual machine uses devtmpfs.

+
+

Run make in tools/labs to boot the virtual machine. +Create /test directory and try to mount the new disk:

+
mkdir /test
+mount /dev/vdd /test
+
+
+

We can not mount the virtual disk because we do not have support in the +kernel for the filesystem with which mydisk.img is formatted. You will need +to identify the filesystem for mydisk.img and compile kernel support for that filesystem.

+

Close the virtual machine (close the QEMU window, you do not need to use another command). +Use the file command on the physical machine to find out with which filesystem +the mydisk.img file is formatted. You will identify the btrfs file system.

+

You will need to enable btrfs support in the kernel and recompile the kernel image.

+
+

Warning

+

If you receive an error while executing the make menuconfig +command, you probably do not have the libncurses5-dev +package installed. Install it using the command:

+
sudo apt-get install libncurses5-dev
+
+
+
+
+

Hint

+

Enter the ~/src/linux/ subdirectory. Run make menuconfig +and go to the File systems section. Enable Btrfs filesystem support. +You will need to use the builtin option (not the module), i.e. <*> must appear +next to the option (not <M>).

+

Save the configuration you have made. Use the default configuration file (.config).

+

In the kernel source subdirectory (~/src/linux/) recompile using the command:

+
make
+
+
+

To wait less, you can use the -j option to run multiple jobs in parallel. +Generally, it is recommended to use number of CPUs+1:

+
make -j5
+
+
+
+

After the kernel recompilation finishes, restart the QEMU virtual machine: +that is, launch the make command in the tools/labs subdirectory. You +do not need to copy anything, because the bzImage file is a symlink to the kernel +image you just recompiled.

+

Inside the QEMU virtual machine, repeat the mkdir and mount operations. +With support for the btrfs filesystem, now mount will finish successfully.

+
+

Note

+

When doing your homework, there is no need to recompile the kernel +because you will only use kernel modules. However, it is important +to be familiar with configuring and recompiling a kernel.

+

If you still plan to recompile the kernel, make a backup of the bzImage +file (follow the link in ~/src/linux for the full path). This will allow +you to return to the initial setup in order to have an environment +identical to the one used by vmchecker.

+
+
+
+

GDB and QEMU

+

We can investigate and troubleshoot the QEMU virtual machine in real time.

+
+

Note

+

You can also use the GDB Dashboard plugin for a user-friendly interface. +gdb must be compiled with Python support.

+

In order to install it, you can just run:

+
wget -P ~ git.io/.gdbinit
+
+
+
+

To do this, we start the QEMU virtual machine first. Then, we can connect +with gdb to a running QEMU virtual machine using the command

+
make gdb
+
+
+

We used the QEMU command with the -s parameter, which makes QEMU +listen on port 1234 for connections from gdb. We can do debugging +using a remote target for gdb. The existing Makefile +takes care of the details.

+

When you attach a debugger to a process, the process is suspended. +You can add breakpoints and inspect the current status of the process.

+

Attach to the QEMU virtual machine (using the make gdb command) +and place a breakpoint in the sys_access function using the +following command in the gdb console:

+
break sys_access
+
+
+

At this time, the virtual machine is suspended. To continue executing it (up to the possible call +of the sys_access function), use the command:

+
continue
+
+
+

in the gdb console.

+

At this time, the virtual machine is active and has a usable console. +To make a sys_access call, issue an ls command. +Note that the virtual machine was again suspended by gdb +and the corresponding sys_access callback message appeared within the gdb console.

+

Trace code execution using step instruction, continue or next +instruction. You probably do not understand everything that happens, so use commands +such as list and backtrace to trace the execution.

+
+

Hint

+

At the gdb prompt, you can press Enter +(without anything else) to rerun the last command.

+
+
+
+

4. GDB spelunking

+

Use gdb to display the source code of the function that creates kernel threads +(kernel_thread).

+
+

Note

+

You can use GDB for static kernel analysis using, in the kernel source directory, +a command such as:

+
gdb vmlinux
+
+
+

Go over the gdb (Linux) section of the lab.

+
+

Use gdb to find the address of the jiffies variable in memory and its contents. +The jiffies variable holds the number of ticks (clock beats) since the system started.

+
+

Hint

+

To track the value of the jiffies variable, use dynamic analysis in gdb +by running the command:

+
make gdb
+
+
+

as in the previous exercise.

+

Go over the gdb (Linux) section of the lab.

+
+
+

Hint

+

The jiffies is a 64-bit variable. +You can see that its address is the same as the jiffies_64 variable.

+

To explore the contents of a 64-bit variable, use in the gdb console the command:

+
x/gx & jiffies
+
+
+

If you wanted to display the contents of the 32-bit variable, +you would use in the gdb console the command:

+
x/wx & jiffies
+
+
+
+
+
+

5. Cscope spelunking

+

Use LXR or cscope in the ~/src/linux/ directory to discover +the location of certain structures or functions.

+

Cscope index files are already generated. Use vim and other related commands +to scroll through the source code. For example, use the command:

+
vim
+
+
+

for opening the vim editor. Afterwards, inside the editor, use commands such as:

+

:cs find g task_struct.

+

Find the file in which the following data types are defined:

+
    +
  • struct task_struct
  • +
  • struct semaphore
  • +
  • struct list_head
  • +
  • spinlock_t
  • +
  • struct file_system_type
  • +
+
+

Hint

+

For a certain structure, only its name needs to be searched.

+

For instance, in the case of struct task_struct, +search for the task_struct string.

+
+

Usually, you will get more matches. To locate the one you are interested in, do the following:

+
    +
  1. List all matches by using, in vim, :copen command.
  2. +
  3. Look for the right match (where the structure is defined) by looking for an open brace +({), a single character on the structure definition line. To search for the open +brace, use the construction /{ in vim.
  4. +
  5. On the respective line, press Enter to get into the source code where the variable +is defined.
  6. +
  7. Close the secondary window using the :cclose command.
  8. +
+

Find the file in which the following global kernel variables are declared:

+
    +
  • sys_call_table
  • +
  • file_systems
  • +
  • current
  • +
  • chrdevs
  • +
+
+

Hint

+

To do this, use a vim command with the syntax:

+

:cs f g <symbol>

+

where <symbol> is the name of the symbol being searched.

+
+

Find the file in which the following functions are declared:

+
    +
  • copy_from_user
  • +
  • vmalloc
  • +
  • schedule_timeout
  • +
  • add_timer
  • +
+
+

Hint

+

To do this, use a vim command with the syntax:

+

:cs f g <symbol>

+

where <symbol> is the name of the symbol being searched.

+
+

Scroll through the following sequence of structures:

+
    +
  • struct task_struct
  • +
  • struct mm_struct
  • +
  • struct vm_area_struct
  • +
  • struct vm_operations_struct
  • +
+

That is, you access a structure and then you find fields with the data type of the +next structure, access the respective fields and so on. +Note in which files these structures are defined; this will be useful in the following labs.

+
+

Hint

+

In order to search for a symbol in vim (with cscope support) +when the cursor is placed on it, use the Ctrl+] keyboard shortcut.

+

To return to the previous match (the one before search/jump), use the +Ctrl+o keyboard shortcut.

+

To move forward with the search (to return to matches before Ctrl+o), +use the Ctrl+i keyboard shortcut.

+
+

Following the above instructions, find and go through the function call sequence:

+
    +
  • bio_alloc
  • +
  • bio_alloc_bioset
  • +
  • bvec_alloc
  • +
  • kmem_cache_alloc
  • +
  • slab_alloc
  • +
+
+

Note

+

Read cscope or LXR Cross-Reference sections of the lab.

+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/kernel_api.html b/refs/pull/405/merge/labs/kernel_api.html new file mode 100644 index 00000000..f14577df --- /dev/null +++ b/refs/pull/405/merge/labs/kernel_api.html @@ -0,0 +1,1093 @@ + + + + + + Kernel API — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Kernel API

+
+

Lab objectives

+
+
    +
  • Familiarize yourself with the basic Linux kernel API
  • +
  • Description of memory allocation mechanisms
  • +
  • Description of locking mechanisms
  • +
+
+
+
+

Overview

+

Inside the current lab we present a set of concepts and basic functions required +for starting Linux kernel programming. It is important to note that kernel +programming differs greatly from user space programming. The kernel is a +stand-alone entity that can not use libraries in user-space (not even libc). +As a result, the usual user-space functions (printf, malloc, free, open, read, +write, memcpy, strcpy, etc.) can no longer be used. In conclusion, kernel +programming is based on a totally new and independent API that is unrelated to +the user-space API, whether we refer to POSIX or ANSI C (standard C language +library functions).

+
+
+

Accessing memory

+

An important difference in kernel programming is how to access and allocate +memory. Due to the fact that kernel programming is very close to the physical +machine, there are important rules for memory management. First, it works with +several types of memory:

+
+
    +
  • Physical memory
  • +
  • Virtual memory from the kernel address space
  • +
  • Virtual memory from a process's address space
  • +
  • Resident memory - we know for sure that the accessed pages are present in +physical memory
  • +
+
+

Virtual memory in a process's address space can not be considered resident due +to the virtual memory mechanisms implemented by the operating system: pages may +be swapped or simply may not be present in physical memory as a result of the +demand paging mechanism. The memory in the kernel address space can be resident +or not. Both the data and code segments of a module and the kernel stack of a +process are resident. Dynamic memory may or may not be resident, depending on +how it is allocated.

+

When working with resident memory, things are simple: memory can be accessed at +any time. But if working with non-resident memory, then it can only be accessed +from certain contexts. Non-resident memory can only be accessed from the +process context. Accessing non-resident memory from the context of an +interrupt has unpredictable results and, therefore, when the operating +system detects such access, it will take drastic measures: blocking or +resetting the system to prevent serious corruption.

+

The virtual memory of a process can not be accessed directly from the kernel. +In general, it is totally discouraged to access the address space of a process, +but there are situations where a device driver needs to do it. The typical case +is where the device driver needs to access a buffer from the user-space. In +this case, the device driver must use special features and not directly access +the buffer. This is necessary to prevent access to invalid memory areas.

+

Another difference from user-space programming, relative to memory, is due to +the stack, whose size is fixed and limited. A stack of 4K is used in +Linux, and a stack of 12K is used in Windows. For this reason, the +allocation of large structures on the stack or the use of recursive calls should +be avoided.

+
+
+

Contexts of execution

+

In relation to kernel execution, we distinguish two contexts: process context +and interrupt context. We are in the process context when we run code as a +result of a system call or when we run in the context of a kernel thread. When +we run in a routine to handle an interrupt or a deferrable action, we run in +an interrupt context.

+

Some of the kernel API calls can block the current process. Common examples are +using a semaphore or waiting for a condition. In this case, the process is +put into the WAITING state and another process is running. An interesting +situation occurs when a function that can cause the current process to be +suspended is called from an interrupt context. In this case, there is no +current process, and therefore the results are unpredictable. Whenever the +operating system detects this condition, it will generate an error condition that +will cause the operating system to shut down.

+
+
+

Locking

+

One of the most important features of kernel programming is parallelism. Linux +supports SMP systems with multiple processors and kernel preemptivity. This +makes kernel programming more difficult because access to global variables must +be synchronized with either spinlock primitives or blocking primitives. Although +it is recommended to use blocking primitives, they can not be used in an +interrupt context, so the only locking solution in the context of an interrupt +is spinlocks.

+

Spinlocks are used in order to achieve mutual exclusion. When the code trying to acquire a spinlock can not get +access to the critical region, the current process is not suspended; instead, it +uses the busy-waiting mechanism (waiting in a while() loop for the lock +to be released). +The code that runs in the critical region protected by a spinlock is not allowed +to suspend the current process (it must adhere to the execution conditions in +the interrupt context). Moreover, the CPU will not be released except for +the case of an interrupt. Due to the mechanism used, it is important that a +spinlock is held for as little time as possible.

+
+
+

Preemptivity

+

Linux uses preemptive kernels. The notion of preemptive multitasking should not +be confused with the notion of a preemptive kernel. The notion of preemptive +multitasking refers to the fact that the operating system forcefully interrupts +a process running in user space when its quantum (time slice) expires, in order +to run another process. +A kernel is preemptive if a process running in kernel mode (as a result of a +system call) can be interrupted so that another process is being run.

+

Because of preemptivity, when we share resources between two portions of code +that can run from different process contexts, we need to protect ourselves with +synchronization primitives, even in the case of a single processor.

+
+
+

Linux Kernel API

+
+

Convention indicating errors

+

For Linux kernel programming, the convention used for calling functions to +indicate success is the same as in UNIX programming: 0 for success, or a value +other than 0 for failure. +For failures, negative values are returned as shown in the example below:

+
if (alloc_memory() != 0)
+    return -ENOMEM;
+
+if (user_parameter_valid() != 0)
+    return -EINVAL;
+
+
+

The exhaustive list of errors and a summary explanation can be found in +include/uapi/asm-generic/errno-base.h and in +include/uapi/asm-generic/errno.h.

+
+
+

Strings of characters

+

In Linux, the kernel programmer is provided with the usual routine functions: +strcpy(), strncpy(), strlcpy(), strcat(), +strncat(), strlcat(), strcmp(), strncmp(), +strnicmp(), strchr(), strnchr(), strrchr(), +strstr(), strlen(), memset(), memmove(), +memcmp(), etc. These functions are declared in the +include/linux/string.h header and are implemented in the kernel in the +lib/string.c file.

+
+
+

printk

+

The printf equivalent in the kernel is printk, defined in +include/linux/printk.h. The printk() syntax is very similar +to printf(). The first +parameter of printk() decides the log category into which the current log +message falls:

+
#define KERN_EMERG   "<0>"  /* system is unusable */
+#define KERN_ALERT   "<1>"  /* action must be taken immediately */
+#define KERN_CRIT    "<2>"  /* critical conditions */
+#define KERN_ERR     "<3>"  /* error conditions */
+#define KERN_WARNING "<4>"  /* warning conditions */
+#define KERN_NOTICE  "<5>"  /* normal but significant condition */
+#define KERN_INFO    "<6>"  /* informational */
+#define KERN_DEBUG   "<7>"  /* debug-level messages */
+
+
+

Thus, a warning message in the kernel would be sent with:

+
printk(KERN_WARNING "my_module input string %s\n", buff);
+
+
+

If the logging level is missing from the printk() call, logging is done +with the default level at the time of the call. One thing to keep in mind is +that messages sent with printk() are visible on the console if and +only if their level exceeds the default level set on the console (that is, their priority is higher, +which corresponds to a numerically lower level value than the console log level).

+

To reduce the size of lines when using printk(), it is recommended to +use the following help functions instead of directly using the printk() +call:

+
pr_emerg(fmt, ...); /* similar to printk(KERN_EMERG pr_fmt(fmt), ...); */
+pr_alert(fmt, ...); /* similar to printk(KERN_ALERT pr_fmt(fmt), ...); */
+pr_crit(fmt, ...); /* similar to printk(KERN_CRIT pr_fmt(fmt), ...); */
+pr_err(fmt, ...); /* similar to printk(KERN_ERR pr_fmt(fmt), ...); */
+pr_warn(fmt, ...); /* similar to printk(KERN_WARNING pr_fmt(fmt), ...); */
+pr_notice(fmt, ...); /* similar to printk(KERN_NOTICE pr_fmt(fmt), ...); */
+pr_info(fmt, ...); /* similar to printk(KERN_INFO pr_fmt(fmt), ...); */
+pr_debug(fmt, ...); /* similar to printk(KERN_DEBUG pr_fmt(fmt), ...); */
+
+
+

A special case is pr_debug() that calls the printk() function +only when the DEBUG macro is defined or if dynamic debugging is used.

+
+
+

Memory allocation

+

In Linux only resident memory can be allocated, using kmalloc() call. +A typical kmalloc() call is presented below:

+
#include <linux/slab.h>
+
+string = kmalloc (string_len + 1, GFP_KERNEL);
+if (!string) {
+    //report error: -ENOMEM;
+}
+
+
+

As you can see, the first parameter indicates the size in bytes of the allocated +area. The function returns a pointer to a memory area that can be directly used +in the kernel, or NULL if memory could not be allocated. The second +parameter specifies how allocation should be done and the most commonly used +values for this are:

+
+
    +
  • GFP_KERNEL - using this value may cause the current process to +be suspended. Thus, it can not be used in the interrupt context.
  • +
  • GFP_ATOMIC - using this value ensures that the +kmalloc() function does not suspend the current process. It can be +used anytime.
  • +
+
+

The counterpart to the kmalloc() function is kfree(), a function +that receives as argument an area allocated by kmalloc(). This function +does not suspend the current process and can therefore be called from any +context.

+
+
+

lists

+

Because linked lists are often used, the Linux kernel API provides a unified +way of defining and using lists. This involves using a +struct list_head element in the structure we want to consider as a +list node. The struct list_head is defined in +include/linux/list.h along with all the other functions that manipulate +the lists. The following code shows the definition of +the struct list_head and the use of an element of this type in another +well-known structure in the Linux kernel:

+
struct list_head {
+    struct list_head *next, *prev;
+};
+
+struct task_struct {
+    ...
+    struct list_head children;
+    ...
+};
+
+
+

The usual routines for working with lists are the following:

+
+
    +
  • LIST_HEAD(name) is used to declare the sentinel of a list
  • +
  • INIT_LIST_HEAD(struct list_head *list)() is used to initialize the +sentinel of a list when dynamic allocation is made, by setting the value of +the next and prev fields to list.
  • +
  • list_add(struct list_head *new, struct list_head *head)() adds the +new element after the head element.
  • +
  • list_del(struct list_head *entry)() deletes the item at the +entry address of the list it belongs to.
  • +
  • list_entry(ptr, type, member) returns the structure with the +type type that contains the element ptr from the list, +having the name member within the structure.
  • +
  • list_for_each(pos, head) iterates over a list using +pos as a cursor.
  • +
  • list_for_each_safe(pos, n, head) iterates over a list using +pos as a cursor and n as a temporary cursor. +This macro is used to delete an item from the list.
  • +
+
+

The following code shows how to use these routines:

+
#include <linux/slab.h>
+#include <linux/list.h>
+
+struct pid_list {
+    pid_t pid;
+    struct list_head list;
+};
+
+LIST_HEAD(my_list);
+
+static int add_pid(pid_t pid)
+{
+    struct pid_list *ple = kmalloc(sizeof *ple, GFP_KERNEL);
+
+    if (!ple)
+        return -ENOMEM;
+
+    ple->pid = pid;
+    list_add(&ple->list, &my_list);
+
+    return 0;
+}
+
+static int del_pid(pid_t pid)
+{
+    struct list_head *i, *tmp;
+    struct pid_list *ple;
+
+    list_for_each_safe(i, tmp, &my_list) {
+        ple = list_entry(i, struct pid_list, list);
+        if (ple->pid == pid) {
+            list_del(i);
+            kfree(ple);
+            return 0;
+        }
+    }
+
+    return -EINVAL;
+}
+
+static void destroy_list(void)
+{
+    struct list_head *i, *n;
+    struct pid_list *ple;
+
+    list_for_each_safe(i, n, &my_list) {
+        ple = list_entry(i, struct pid_list, list);
+        list_del(i);
+        kfree(ple);
+    }
+}
+
+
+

The evolution of the list can be seen in the following figure:

+../_images/list_evolution.png +

You see the stack type behavior introduced by the list_add macro, +and the use of a sentinel.

+

From the above example, it can be noticed that the way to define and use a list +(double-linked) is generic and, at the same time, it does not introduce an +additional overhead. The struct list_head is used to maintain the +links between the list elements. It can be noticed that iterating over the list +is also done with this structure, and that retrieving a list element can be done +using list_entry. This idea of implementing and using a list is not +new, as it has already been described in The Art of Computer Programming by +Donald Knuth in the 1980s.

+

Several kernel list functions and macro definitions are presented and explained +in the include/linux/list.h header.

+
+
+

Spinlock

+

spinlock_t (defined in linux/spinlock.h) is the basic type +that implements the spinlock concept in Linux. It describes a spinlock, and the +operations associated with a spinlock are spin_lock_init(), +spin_lock(), spin_unlock(). An example of use is given below:

+
#include <linux/spinlock.h>
+
+DEFINE_SPINLOCK(lock1);
+spinlock_t lock2;
+
+spin_lock_init(&lock2);
+
+spin_lock(&lock1);
+/* critical region */
+spin_unlock(&lock1);
+
+spin_lock(&lock2);
+/* critical region */
+spin_unlock(&lock2);
+
+
+

In Linux, you can use reader-writer spinlocks, useful for readers-writers +problems. +These types of locks are identified by rwlock_t, and the functions +that can work on a reader-writer spinlock are +rwlock_init(), +read_lock(), +write_lock(). +An example of use:

+
#include <linux/spinlock.h>
+
+DEFINE_RWLOCK(lock);
+
+struct pid_list {
+    pid_t pid;
+    struct list_head list;
+};
+
+int have_pid(struct list_head *lh, int pid)
+{
+    struct list_head *i;
+    void *elem;
+
+    read_lock(&lock);
+    list_for_each(i, lh) {
+        struct pid_list *pl = list_entry(i, struct pid_list, list);
+        if (pl->pid == pid) {
+            read_unlock(&lock);
+            return 1;
+        }
+    }
+    read_unlock(&lock);
+
+    return 0;
+}
+
+void add_pid(struct list_head *lh, struct pid_list *pl)
+{
+    write_lock(&lock);
+    list_add(&pl->list, lh);
+    write_unlock(&lock);
+}
+
+
+
+
+

mutex

+

A mutex is a variable of the struct mutex type (defined in +linux/mutex.h). +Functions and macros for working with mutexes are listed below:

+
#include <linux/mutex.h>
+
+/* functions for mutex initialization */
+void mutex_init(struct mutex *mutex);
+DEFINE_MUTEX(name);
+
+/* functions for mutex acquire */
+void mutex_lock(struct mutex *mutex);
+
+/* functions for mutex release */
+void mutex_unlock(struct mutex *mutex);
+
+
+

Operations are similar to classic mutex operations in user-space or spinlock +operations: the mutex is acquired before entering the critical region and it is +released after exiting the critical region. Unlike spinlocks, these operations +can only be used in process context.

+
+
+

Atomic variables

+

Often, you only need to synchronize access to a simple variable, such as a +counter. For this, an atomic_t type can be used (defined in +include/linux/atomic.h), that holds an integer value. Below are some +operations that can be performed on an atomic_t variable.

+
#include <asm/atomic.h>
+
+void atomic_set(atomic_t *v, int i);
+int atomic_read(atomic_t *v);
+void atomic_add(int i, atomic_t *v);
+void atomic_sub(int i, atomic_t *v);
+void atomic_inc(atomic_t *v);
+void atomic_dec(atomic_t *v);
+int atomic_inc_and_test(atomic_t *v);
+int atomic_dec_and_test(atomic_t *v);
+int atomic_cmpxchg(atomic_t *v, int old, int new);
+
+
+
+

Use of atomic variables

+

A common way of using atomic variables is to store the status of an action +(e.g. a flag). So we can use an atomic variable to mark exclusive actions. For +example, we consider that an atomic variable can have the LOCKED and UNLOCKED +values, and if the respective variable equals LOCKED then a specific function +should return -EBUSY. +Such a usage is shown schematically in the code below:

+
#define LOCKED       0
+#define UNLOCKED     1
+
+static atomic_t flag;
+
+/*
+ * my_acquire() - try to take the flag without waiting.
+ *
+ * Returns -EBUSY when the flag was already LOCKED; otherwise the flag has
+ * been switched to LOCKED and the protected work follows.
+ */
+static int my_acquire(void)
+{
+     /*
+      * atomic_cmpxchg() stores LOCKED only when the flag currently holds
+      * UNLOCKED, and hands back the value the flag had before the call.
+      * Test and update are thus done in one atomic step, unlike
+      *      if (flag == UNLOCKED)
+      *              flag = LOCKED;
+      *      else
+      *              return -EBUSY;
+      */
+     if (atomic_cmpxchg(&flag, UNLOCKED, LOCKED) == LOCKED) {
+             printk(KERN_ALERT "Already locked.\n");
+             return -EBUSY;
+     }
+
+     /* The flag is held from here on; do the protected work. */
+     [...]
+}
+
+/* my_release() - set the flag back to UNLOCKED. */
+static void my_release(void)
+{
+     /* Release flag; mark it as unlocked. */
+     atomic_set(&flag, UNLOCKED);
+}
+
+/* my_init() - setup code; among other things, puts the flag in the UNLOCKED state. */
+void my_init(void)
+{
+     [...]
+     /* Atomic variable is initially unlocked. */
+     atomic_set(&flag, UNLOCKED);
+
+     [...]
+}
+
+
+

The above code is the equivalent of using a trylock (such as +pthread_mutex_trylock()).

+

We can also use a variable to store the size of a buffer and for atomic +updates of the respective variable. The code below is such an example:

+
static unsigned char buffer[MAX_SIZE];
+static atomic_t size;
+
+/*
+ * add_to_buffer() - store @value at the index given by size, then
+ * increment size.
+ *
+ * NOTE(review): atomic_read() and atomic_inc() are two separate operations,
+ * and there is no check against MAX_SIZE here -- confirm that callers
+ * serialize additions and never add to a full buffer.
+ */
+static void add_to_buffer(unsigned char value)
+{
+     buffer[atomic_read(&size)] = value;
+     atomic_inc(&size);
+}
+
+/*
+ * remove_from_buffer() - take the most recently added byte out of buffer.
+ *
+ * add_to_buffer() stores at index size and then increments size, so the
+ * last stored element lives at index size - 1: size has to be decremented
+ * before it is used as the index.
+ *
+ * NOTE(review): there is no check for an empty buffer here; callers are
+ * presumably expected to call this only after a matching add_to_buffer().
+ *
+ * Returns the removed byte.
+ */
+static unsigned char remove_from_buffer(void)
+{
+     unsigned char value;
+
+     atomic_dec(&size);
+     value = buffer[atomic_read(&size)];
+
+     return value;
+}
+
+/* reset_buffer() - set size back to 0; the bytes in buffer are left untouched. */
+static void reset_buffer(void)
+{
+     atomic_set(&size, 0);
+}
+
+/* my_init() - setup code; among other things, sets size to 0 and zero-fills buffer. */
+void my_init(void)
+{
+     [...]
+     /* Initialize size and buffer. */
+     atomic_set(&size, 0);
+     memset(buffer, 0, sizeof(buffer));
+
+     [...]
+}
+
+
+
+
+
+

Atomic bitwise operations

+

The kernel provides a set of functions (in asm/bitops.h) that modify or +test bits in an atomic way.

+
#include <asm/bitops.h>
+
+void set_bit(int nr, void *addr);
+void clear_bit(int nr, void *addr);
+void change_bit(int nr, void *addr);
+int test_and_set_bit(int nr, void *addr);
+int test_and_clear_bit(int nr, void *addr);
+int test_and_change_bit(int nr, void *addr);
+
+
+

Addr represents the address of the memory area whose bits are being +modified or tested and nr is the bit on which the operation is +performed.

+
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is kernel_api. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/kernel_api/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using LXR find the definitions of the following symbols in the Linux kernel:

+
+
    +
  • struct list_head
  • +
  • INIT_LIST_HEAD()
  • +
  • list_add()
  • +
  • list_for_each
  • +
  • list_entry
  • +
  • container_of
  • +
  • offsetof
  • +
+
+
+
+

1. Memory allocation in Linux kernel

+

Generate the skeleton for the task named 1-mem and browse the +contents of the mem.c file. Observe the use of kmalloc() +call for memory allocation.

+
+
    +
  1. Compile the source code and load the mem.ko module using +insmod.
  2. +
  3. View the kernel messages using the dmesg command.
  4. +
  5. Unload the kernel module using the rmmod mem command.
  6. +
+
+
+

Note

+

Review the Memory Allocation section in the lab.

+
+
+
+

2. Sleeping in atomic context

+

Generate the skeleton for the task named 2-sched-spin and browse +the contents of the sched-spin.c file.

+
+
    +
  1. Compile the source code and load the module, according to the above info: +(make build and make copy)
  2. +
  3. Notice that it is waiting for 5 seconds until the insertion +order is complete.
  4. +
  5. Unload the kernel module.
  6. +
  7. Look for the lines marked with: TODO 0 to create an atomic +section. Re-compile the source code and reload the module into +the kernel.
  8. +
+
+

You should now get an error. Look at the stack trace. What is the +cause of the error?

+
+

Hint

+

In the error message, follow the line containing the BUG +for a description of the error. You are not allowed to sleep in +atomic context. The atomic context is given by a section +between a lock operation and an unlock on a spinlock.

+
+
+

Note

+

The +schedule_timeout() function, combined with the +set_current_state macro, forces the current process to wait +for 5 seconds.

+
+
+

Note

+

Review the Contexts of execution, Locking and Spinlock +sections.

+
+
+
+

3. Working with kernel memory

+

Generate the skeleton for the task named 3-memory and +browse the contents of the memory.c file. Notice the comments +marked with TODO. You must allocate 4 structures of type struct +task_info and initialize them (in memory_init()), then print and +free them (in memory_exit()).

+
+
    +
  1. (TODO 1) Allocate memory for struct task_info structure and +initialize its fields:

    +
      +
    • The pid field to the PID transmitted as a parameter;
    • +
    • The timestamp field to the value of the jiffies +variable, which holds the number of ticks that have occurred since the +system booted.
    • +
    +
  2. +
  3. (TODO 2) Allocate struct task_info for the current process, +the parent process, the next process, the next process of the next +process, with the following information:

    +
      +
    • PID of the current process, which can be retrieved from +struct task_struct structure, returned by current +macro.
    • +
    +
    +

    Hint

    +

    Search for pid in task_struct.

    +
    +
      +
    • PID of the parent process of the current process.
    • +
    +
    +

    Hint

    +

    Search for the relevant field in the struct task_struct +structure. Look for "parent".

    +
    +
      +
    • PID of the next process from the list of processes, relative to the +current process.
    • +
    +
    +

    Hint

    +

    Use the next_task macro, which returns a pointer to the next +process (i.e. a struct task_struct structure).

    +
    +
      +
    • PID of the next process of the next process, relative to the current +process.
    • +
    +
    +

    Hint

    +

    Call the next_task macro 2 times.

    +
    +
  4. +
  5. (TODO 3) Display the four structures.

    +
      +
    • Use printk() to display their two fields:
    • +
    +

    pid and timestamp.

    +
  6. +
  7. (TODO 4) Release the memory occupied by the structures +(use kfree()).

    +
  8. +
+
+
+

Hint

+
    +
  • You can access the current process using current +macro.
  • +
  • Look for the relevant fields in the struct task_struct +structure (pid, parent).
  • +
  • Use the next_task macro. The macro returns the pointer to +the next process (ie. a struct task_struct* structure).
  • +
+
+
+

Note

+

The struct task_struct structure contains two fields to +designate the parent of a task:

+
    +
  • real_parent points to the process that created the +task or to process with pid 1 (init) if the parent +completed its execution.
  • +
  • parent points to the current parent of the task (the +process that the task's completion will be reported to).
  • +
+

In general, the values of the two fields are the same, but +there are situations where they differ, for example when +using the ptrace() system call.

+
+
+

Hint

+

Review the Memory allocation section in the lab.

+
+
+
+

4. Working with kernel lists

+

Generate the skeleton for the task named 4-list. Browse the +contents of the list.c file and notice the comments marked with +TODO. The current process will add the four structures from the +previous exercise into a list. The list will be built in the +task_info_add_for_current() function which is called when module is +loaded. The list will be printed and deleted in the list_exit() +function and the task_info_purge_list() function.

+
+
    +
  1. (TODO 1) Complete the task_info_add_to_list() function to allocate +a struct task_info structure and add it to the list.
  2. +
  3. (TODO 2) Complete the task_info_purge_list() function to delete +all the elements in the list.
  4. +
  5. Compile the kernel module. Load and unload the module by +following the messages displayed by the kernel.
  6. +
+
+
+

Hint

+

Review the lab's Lists section. When deleting items from +the list, you will need to use either the +list_for_each_safe or list_for_each_entry_safe +macros.

+
+
+
+

5. Working with kernel lists for process handling

+

Generate the skeleton for the task named 5-list-full. Browse the +contents of the list-full.c and notice comments marked with +TODO. In addition to the 4-list functionality we add the +following:

+
+
    +
  • A count field showing how many times a process has been "added" +to the list.

    +
  • +
  • If a process is "added" several times, no new entry is created in +the list, but:

    +
    +
      +
    • Update the timestamp field.
    • +
    • Increment count.
    • +
    +
    +
  • +
  • To implement the counter facility, add a task_info_find_pid() +function that searches for a pid in the existing list.

    +
  • +
  • If found, return the reference to the task_info struct. If +not, return NULL.

    +
  • +
  • An expiration facility. If a process was added more than 3 +seconds ago and if it does not have a count greater than 5 then +it is considered expired and is removed from the list.

    +
  • +
  • The expiration facility is already implemented in the +task_info_remove_expired() function.

    +
  • +
+
    +
  1. (TODO 1) Implement the task_info_find_pid() function.

    +
  2. +
  3. (TODO 2) Change a field of an item in the list so it does not +expire. It must not satisfy a part of the expiration condition +from task_info_remove_expired().

    +
    +

    Hint

    +

    For TODO 2, extract the first element from the list (the one +referred by head.next) and set the count +field to a large enough value. Use atomic_set() function.

    +
    +
  4. +
  5. Compile, copy, load and unload the kernel module following the displayed +messages. +Kernel module loading will take some time, because the module sleeps +by calling the schedule_timeout() function.

    +
  6. +
+
+
+
+

6. Synchronizing list work

+

Generate the skeleton for the task named 6-list-sync.

+
+
    +
  1. Browse the code and look for TODO 1 string.
  2. +
  3. Use a spinlock or a read-write lock to synchronize access to the +list.
  4. +
  5. Compile, load and unload the kernel module.
  6. +
+
+
+

Important

+

Always lock data, not code!

+
+
+

Note

+

Read Spinlock section of the lab.

+
+
+
+

7. Test module calling in our list module

+

Generate the skeleton for the task named 7-list-test and browse +the contents of the list-test.c file. We'll use it as a test +module. It will call functions exported by the 6-list-sync +task. The exported functions are the ones marked with extern in +list-test.c file.

+

Uncomment the commented code from list-test.c. Look for TODO 1.

+

To export the above functions from the module located at 6-list-sync/ +directory, the following steps are required:

+
+
    +
  1. Functions must not be static.
  2. +
  3. Use the EXPORT_SYMBOL macro to export the kernel symbols. For +example: EXPORT_SYMBOL(task_info_remove_expired);. The +macro must be used for each function after the function is defined. +Browse the code and look for the TODO 2 string in the +list-sync.c.
  4. +
  5. From the module in 6-list-sync, remove the code that prevents the +expiration of a list item (it contradicts this exercise).
  6. +
  7. Compile and load the module from 6-list-sync/. Once loaded, it +exposes exported functions and can be used by the test +module. You can check this by searching for the function names +in /proc/kallsyms before and after loading the module.
  8. +
  9. Compile the test module and then load it.
  10. +
  11. Use lsmod to check that the two modules have been loaded. +What do you notice?
  12. +
  13. Unload the kernel test module.
  14. +
+
+

What should be the unload order of the two modules (the module from +6-list-sync and the test module)? What happens if you use another order?

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/kernel_modules.html b/refs/pull/405/merge/labs/kernel_modules.html new file mode 100644 index 00000000..13dc7fb1 --- /dev/null +++ b/refs/pull/405/merge/labs/kernel_modules.html @@ -0,0 +1,1391 @@ + + + + + + Kernel modules — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Kernel modules

+
+

Lab objectives

+
    +
  • creating simple modules
  • +
  • describing the process of kernel module compilation
  • +
  • presenting how a module can be used with a kernel
  • +
  • simple kernel debugging methods
  • +
+
+
+

Kernel Modules Overview

+

A monolithic kernel, though faster than a microkernel, has the disadvantage of +lack of modularity and extensibility. On modern monolithic kernels, this has +been solved by using kernel modules. A kernel module (or loadable kernel module) +is an object file that contains code that can extend the kernel functionality +at runtime (it is loaded as needed); when a kernel module is no longer needed, +it can be unloaded. Most of the device drivers are used in the form of kernel +modules.

+

For the development of Linux device drivers, it is recommended to download the +kernel sources, configure and compile them and then install the compiled version +on the test/development machine.

+
+
+

An example of a kernel module

+

Below is a very simple example of a kernel module. When loading into the kernel, +it will generate the message "Hi". When unloading the kernel module, the +"Bye" message will be generated.

+
#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("My kernel module");
+MODULE_AUTHOR("Me");
+MODULE_LICENSE("GPL");
+
+/* Module entry point, registered with module_init(); logs "Hi". */
+static int dummy_init(void)
+{
+        /*
+         * pr_info() rather than pr_debug(): pr_debug() output is compiled
+         * out unless DEBUG is defined or dynamic debug is enabled, so the
+         * message would not normally reach the kernel log.
+         */
+        pr_info("Hi\n");
+        return 0;
+}
+
+/* Module exit point, registered with module_exit(); logs "Bye". */
+static void dummy_exit(void)
+{
+        /* pr_info() for the same reason as in dummy_init(). */
+        pr_info("Bye\n");
+}
+
+module_init(dummy_init);
+module_exit(dummy_exit);
+
+
+

The generated messages will not be displayed on the console but will be saved +in a specially reserved memory area for this, from where they will be extracted +by the logging daemon (syslog). To display kernel messages, you can use the +dmesg command or inspect the logs:

+
# cat /var/log/syslog | tail -2
+Feb 20 13:57:38 asgard kernel: Hi
+Feb 20 13:57:43 asgard kernel: Bye
+
+# dmesg | tail -2
+Hi
+Bye
+
+
+
+
+

Compiling kernel modules

+

Compiling a kernel module differs from compiling a user program. First, other +headers should be used. Also, the module should not be linked to libraries. +And, last but not least, the module must be compiled with the same options as +the kernel in which we load the module. For these reasons, there is a standard +compilation method (kbuild). This method requires the use of two files: +a Makefile and a Kbuild file.

+

Below is an example of a Makefile:

+
KDIR = /lib/modules/`uname -r`/build
+
+kbuild:
+        make -C $(KDIR) M=`pwd`
+
+clean:
+        make -C $(KDIR) M=`pwd` clean
+
+
+

And the example of a Kbuild file used to compile a module:

+
EXTRA_CFLAGS = -Wall -g
+
+obj-m        = modul.o
+
+
+

As you can see, calling make on the Makefile file in the +example shown will result in the make invocation in the kernel +source directory (/lib/modules/`uname -r`/build) and referring to the +current directory (M = `pwd`). This process ultimately leads to reading +the Kbuild file from the current directory and compiling the module +as instructed in this file.

+
+

Note

+

For labs we will configure different KDIR, according to +the virtual machine specifications:

+
KDIR = /home/student/src/linux
+[...]
+
+
+
+

A Kbuild file contains one or more directives for compiling a kernel +module. The easiest example of such a directive is obj-m = +module.o. Following this directive, a kernel module (ko - kernel +object) will be created, starting from the module.o file. module.o will +be created starting from module.c or module.S. All of these files can +be found in the Kbuild's directory.

+

An example of a Kbuild file that uses several sub-modules is shown +below:

+
EXTRA_CFLAGS = -Wall -g
+
+obj-m        = supermodule.o
+supermodule-y = module-a.o module-b.o
+
+
+

For the example above, the steps to compile are:

+
+
    +
  • compile the module-a.c and module-b.c sources, +resulting in module-a.o and module-b.o objects
  • +
  • module-a.o and module-b.o will then be linked +in supermodule.o
  • +
  • from supermodule.o will be created supermodule.ko +module
  • +
+
+

The suffix of targets in Kbuild determines how they are used, as +follows:

+
+
    +
  • M (modules) is a target for loadable kernel modules
  • +
  • Y (yes) represents a target for object files to be compiled and then +linked to a module ($(module_name)-y) or within the kernel (obj-y)
  • +
  • any other target suffix will be ignored by Kbuild and will not be +compiled
  • +
+
+
+

Note

+

These suffixes are used to easily configure the kernel by running the +make menuconfig command or directly editing the +.config file. This file sets a series of variables that are +used to determine which features are added to the kernel at build +time. For example, when adding BTRFS support with make +menuconfig, the line CONFIG_BTRFS_FS = y is added to the +.config file. The BTRFS kbuild contains the line +obj-$(CONFIG_BTRFS_FS):= btrfs.o, which becomes obj-y:= +btrfs.o. This will compile the btrfs.o object, which will be +linked to the kernel. Before the variable was set, the line became +obj:=btrfs.o and so it was ignored, and the kernel was built +without BTRFS support.

+
+

For more details, see the Documentation/kbuild/makefiles.txt and +Documentation/kbuild/modules.txt files within the kernel sources.

+
+
+

Loading/unloading a kernel module

+

To load a kernel module, use the insmod utility. This utility +receives as a parameter the path to the *.ko file in which the module +was compiled and linked. Unloading the module from the kernel is done using +the rmmod command, which receives the module name as a parameter.

+
$ insmod module.ko
+$ rmmod module.ko
+
+
+

When loading the kernel module, the routine specified as a parameter of the +module_init macro will be executed. Similarly, when the module is unloaded +the routine specified as a parameter of the module_exit will be executed.

+

A complete example of compiling and loading/unloading a kernel module is +presented below:

+
faust:~/lab-01/modul-lin# ls
+Kbuild  Makefile  modul.c
+
+faust:~/lab-01/modul-lin# make
+make -C /lib/modules/`uname -r`/build M=`pwd`
+make[1]: Entering directory `/usr/src/linux-2.6.28.4'
+  LD      /root/lab-01/modul-lin/built-in.o
+  CC [M]  /root/lab-01/modul-lin/modul.o
+  Building modules, stage 2.
+  MODPOST 1 modules
+  CC      /root/lab-01/modul-lin/modul.mod.o
+  LD [M]  /root/lab-01/modul-lin/modul.ko
+make[1]: Leaving directory `/usr/src/linux-2.6.28.4'
+
+faust:~/lab-01/modul-lin# ls
+built-in.o  Kbuild  Makefile  modul.c  Module.markers
+modules.order  Module.symvers  modul.ko  modul.mod.c
+modul.mod.o  modul.o
+
+faust:~/lab-01/modul-lin# insmod modul.ko
+
+faust:~/lab-01/modul-lin# dmesg | tail -1
+Hi
+
+faust:~/lab-01/modul-lin# rmmod modul
+
+faust:~/lab-01/modul-lin# dmesg | tail -2
+Hi
+Bye
+
+
+

Information about modules loaded into the kernel can be found using the +lsmod command or by inspecting the /proc/modules, +/sys/module directories.

+
+
+

Kernel Module Debugging

+

Troubleshooting a kernel module is much more complicated than debugging a +regular program. First, a mistake in a kernel module can lead to blocking the +entire system. Troubleshooting is therefore much slowed down. To avoid reboot, +it is recommended to use a virtual machine (qemu, virtualbox, vmware).

+

When a module containing bugs is inserted into the kernel, it will eventually +generate a kernel oops. +A kernel oops is an invalid operation detected by the kernel and can only +be generated by the kernel. For a stable kernel version, it almost certainly +means that the module contains a bug. After the oops appears, the kernel will +continue to work.

+

When a kernel oops appears, it is very important to save the generated +message. As noted above, messages generated by the kernel are saved in logs and +can be displayed with the dmesg command. To make sure that no kernel +message is lost, it is recommended to insert/test the kernel module directly from the +console, or periodically check the kernel messages. It is worth noting that an oops +can occur because of a programming error, but also because of a hardware error.

+

If a fatal error occurs, after which the system can not return to a stable +state, a kernel panic is +generated.

+

Look at the kernel module below that contains a bug that generates an oops:

+
/*
+ * Oops generating kernel module
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+MODULE_DESCRIPTION ("Oops");
+MODULE_LICENSE ("GPL");
+MODULE_AUTHOR ("PSO");
+
+#define OP_READ         0
+#define OP_WRITE        1
+#define OP_OOPS         OP_WRITE
+
+static int my_oops_init (void)
+{
+        int *a;
+
+        a = (int *) 0x00001234;
+#if OP_OOPS == OP_WRITE
+        *a = 3;
+#elif OP_OOPS == OP_READ    /* read variant: load through the same pointer instead of storing */
+        printk (KERN_ALERT "value = %d\n", *a);    /* *a reads from address 0x00001234 */
+#else
+#error "Unknown op for oops!"
+#endif
+
+        return 0;
+}
+
+static void my_oops_exit (void)
+{
+}
+
+module_init (my_oops_init);
+module_exit (my_oops_exit);
+
+
+

Inserting this module into the kernel will generate an oops:

+
faust:~/lab-01/modul-oops# insmod oops.ko
+[...]
+
+faust:~/lab-01/modul-oops# dmesg | tail -32
+BUG: unable to handle kernel paging request at 00001234
+IP: [<c89d4005>] my_oops_init+0x5/0x20 [oops]
+  *de = 00000000
+Oops: 0002 [#1] PREEMPT DEBUG_PAGEALLOC
+last sysfs file: /sys/devices/virtual/net/lo/operstate
+Modules linked in: oops(+) netconsole ide_cd_mod pcnet32 crc32 cdrom [last unloaded: modul]
+
+Pid: 4157, comm: insmod Not tainted (2.6.28.4 #2) VMware Virtual Platform
+EIP: 0060:[<c89d4005>] EFLAGS: 00010246 CPU: 0
+EIP is at my_oops_init+0x5/0x20 [oops]
+EAX: 00000000 EBX: fffffffc ECX: c89d4300 EDX: 00000001
+ESI: c89d4000 EDI: 00000000 EBP: c5799e24 ESP: c5799e24
+ DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
+Process insmod (pid: 4157, ti=c5799000 task=c665c780 task.ti=c5799000)
+Stack:
+ c5799f8c c010102d c72b51d8 0000000c c5799e58 c01708e4 00000124 00000000
+ c89d4300 c5799e58 c724f448 00000001 c89d4300 c5799e60 c0170981 c5799f8c
+ c014b698 00000000 00000000 c5799f78 c5799f20 00000500 c665cb00 c89d4300
+Call Trace:
+ [<c010102d>] ? _stext+0x2d/0x170
+ [<c01708e4>] ? __vunmap+0xa4/0xf0
+ [<c0170981>] ? vfree+0x21/0x30
+ [<c014b698>] ? load_module+0x19b8/0x1a40
+ [<c035e965>] ? __mutex_unlock_slowpath+0xd5/0x140
+ [<c0140da6>] ? trace_hardirqs_on_caller+0x106/0x150
+ [<c014b7aa>] ? sys_init_module+0x8a/0x1b0
+ [<c0140da6>] ? trace_hardirqs_on_caller+0x106/0x150
+ [<c0240a08>] ? trace_hardirqs_on_thunk+0xc/0x10
+ [<c0103407>] ? sysenter_do_call+0x12/0x43
+Code: <c7> 05 34 12 00 00 03 00 00 00 5d c3 eb 0d 90 90 90 90 90 90 90 90
+EIP: [<c89d4005>] my_oops_init+0x5/0x20 [oops] SS:ESP 0068:c5799e24
+---[ end trace 2981ce73ae801363 ]---
+
+
+

Although relatively cryptic, the message provided by the kernel when +an oops appears provides valuable information about the error. The first lines:

+
BUG: unable to handle kernel paging request at 00001234
+IP: [<c89d4005>] my_oops_init+0x5/0x20 [oops]
+
+
+

Tells us the cause and the address of the instruction that generated the error. +In our case this is an invalid access to memory.

+

Next line

+
+
Oops: 0002 [#1] PREEMPT DEBUG_PAGEALLOC
+

Tells us that it's the first oops (#1). This is important in the context that +an oops can lead to other oopses. Usually only the first oops is relevant. +Furthermore, the oops code (0002) provides information about the error type +(see arch/x86/include/asm/trap_pf.h):

+
+
    +
  • Bit 0 == 0 means no page found, 1 means protection fault
  • +
  • Bit 1 == 0 means read, 1 means write
  • +
  • Bit 2 == 0 means kernel, 1 means user mode
  • +
+
+

In this case, we have a write access that generated the oops (bit 1 is 1).

+

Below is a dump of the registers. It decodes the instruction pointer (EIP) +value and notes that the bug appeared in the my_oops_init function with +a 5-byte offset (EIP: [<c89d4005>] my_oops_init+0x5). The message also +shows the stack content and a backtrace of calls until then.

+

If an invalid read call is generated (#define OP_OOPS OP_READ), the message +will be the same, but the oops code will differ, which would now be 0000:

+
faust:~/lab-01/modul-oops# dmesg | tail -33
+BUG: unable to handle kernel paging request at 00001234
+IP: [<c89c3016>] my_oops_init+0x6/0x20 [oops]
+  *de = 00000000
+Oops: 0000 [#1] PREEMPT DEBUG_PAGEALLOC
+last sysfs file: /sys/devices/virtual/net/lo/operstate
+Modules linked in: oops(+) netconsole pcnet32 crc32 ide_cd_mod cdrom
+
+Pid: 2754, comm: insmod Not tainted (2.6.28.4 #2) VMware Virtual Platform
+EIP: 0060:[<c89c3016>] EFLAGS: 00010292 CPU: 0
+EIP is at my_oops_init+0x6/0x20 [oops]
+EAX: 00000000 EBX: fffffffc ECX: c89c3380 EDX: 00000001
+ESI: c89c3010 EDI: 00000000 EBP: c57cbe24 ESP: c57cbe1c
+ DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
+Process insmod (pid: 2754, ti=c57cb000 task=c66ec780 task.ti=c57cb000)
+Stack:
+ c57cbe34 00000282 c57cbf8c c010102d c57b9280 0000000c c57cbe58 c01708e4
+ 00000124 00000000 c89c3380 c57cbe58 c5db1d38 00000001 c89c3380 c57cbe60
+ c0170981 c57cbf8c c014b698 00000000 00000000 c57cbf78 c57cbf20 00000580
+Call Trace:
+ [<c010102d>] ? _stext+0x2d/0x170
+ [<c01708e4>] ? __vunmap+0xa4/0xf0
+ [<c0170981>] ? vfree+0x21/0x30
+ [<c014b698>] ? load_module+0x19b8/0x1a40
+ [<c035d083>] ? printk+0x0/0x1a
+ [<c035e965>] ? __mutex_unlock_slowpath+0xd5/0x140
+ [<c0140da6>] ? trace_hardirqs_on_caller+0x106/0x150
+ [<c014b7aa>] ? sys_init_module+0x8a/0x1b0
+ [<c0140da6>] ? trace_hardirqs_on_caller+0x106/0x150
+ [<c0240a08>] ? trace_hardirqs_on_thunk+0xc/0x10
+ [<c0103407>] ? sysenter_do_call+0x12/0x43
+Code: <a1> 34 12 00 00 c7 04 24 54 30 9c c8 89 44 24 04 e8 58 a0 99 f7 31
+EIP: [<c89c3016>] my_oops_init+0x6/0x20 [oops] SS:ESP 0068:c57cbe1c
+---[ end trace 45eeb3d6ea8ff1ed ]---
+
+
+
+

objdump

+

Detailed information about the instruction that generated the oops can be found +using the objdump utility. Useful options to use are -d +to disassemble the code and -S for interleaving C code in assembly +language code. For efficient decoding, however, we need the address where the +kernel module was loaded. This can be found in /proc/modules.

+

Here's an example of using objdump on the above module to identify +the instruction that generated the oops:

+
faust:~/lab-01/modul-oops# cat /proc/modules
+oops 1280 1 - Loading 0xc89d4000
+netconsole 8352 0 - Live 0xc89ad000
+pcnet32 33412 0 - Live 0xc895a000
+ide_cd_mod 34952 0 - Live 0xc8903000
+crc32 4224 1 pcnet32, Live 0xc888a000
+cdrom 34848 1 ide_cd_mod, Live 0xc886d000
+
+faust:~/lab-01/modul-oops# objdump -dS --adjust-vma=0xc89d4000 oops.ko
+
+oops.ko:     file format elf32-i386
+
+
+Disassembly of section .text:
+
+c89d4000 <init_module>:
+#define OP_READ         0
+#define OP_WRITE        1
+#define OP_OOPS         OP_WRITE
+
+static int my_oops_init (void)
+{
+c89d4000:       55                      push   %ebp
+#else
+#error "Unknown op for oops!"
+#endif
+
+        return 0;
+}
+c89d4001:       31 c0                   xor    %eax,%eax
+#define OP_READ         0
+#define OP_WRITE        1
+#define OP_OOPS         OP_WRITE
+
+static int my_oops_init (void)
+{
+c89d4003:       89 e5                   mov    %esp,%ebp
+        int *a;
+
+        a = (int *) 0x00001234;
+#if OP_OOPS == OP_WRITE
+        *a = 3;
+c89d4005:       c7 05 34 12 00 00 03    movl   $0x3,0x1234
+c89d400c:       00 00 00
+#else
+#error "Unknown op for oops!"
+#endif
+
+        return 0;
+}
+c89d400f:       5d                      pop    %ebp
+c89d4010:       c3                      ret
+c89d4011:       eb 0d                   jmp    c89d4020 <cleanup_module>
+c89d4013:       90                      nop
+c89d4014:       90                      nop
+c89d4015:       90                      nop
+c89d4016:       90                      nop
+c89d4017:       90                      nop
+c89d4018:       90                      nop
+c89d4019:       90                      nop
+c89d401a:       90                      nop
+c89d401b:       90                      nop
+c89d401c:       90                      nop
+c89d401d:       90                      nop
+c89d401e:       90                      nop
+c89d401f:       90                      nop
+
+c89d4020 <cleanup_module>:
+
+static void my_oops_exit (void)
+{
+c89d4020:       55                      push   %ebp
+c89d4021:       89 e5                   mov    %esp,%ebp
+}
+c89d4023:       5d                      pop    %ebp
+c89d4024:       c3                      ret
+c89d4025:       90                      nop
+c89d4026:       90                      nop
+c89d4027:       90                      nop
+
+
+

Note that the instruction that generated the oops (c89d4005 identified +earlier) is:

+
+
c89d4005: c7 05 34 12 00 00 03 movl $0x3,0x1234
+

That is exactly what was expected - storing value 3 at 0x00001234.

+

The /proc/modules is used to find the address where a kernel module is +loaded. The --adjust-vma option allows you to display instructions +relative to 0xc89d4000. The -l option displays the number of +each line in the source code interleaved with the assembly language code.

+
+
+

addr2line

+

A simpler way to find the code that generated an oops is to use the +addr2line utility:

+
faust:~/lab-01/modul-oops# addr2line -e oops.o 0x5
+/root/lab-01/modul-oops/oops.c:23
+
+
+

Where 0x5 is the value of the program counter (EIP = c89d4005) that +generated the oops, minus the base address of the module (0xc89d4000) +according to /proc/modules

+
+
+

minicom

+

Minicom (or other equivalent utilities, e.g. picocom, +screen) is a utility that can be used to connect and interact with a +serial port. The serial port is the basic method for analyzing kernel messages +or interacting with an embedded system in the development phase. There are two +common ways to connect:

+
    +
  • a serial port where the device we are going to use is /dev/ttyS0
  • +
  • a serial USB port (FTDI) in which case the device we are going to use is +/dev/ttyUSB0.
  • +
+

For the virtual machine used in the lab, the device that we need to use is +displayed after the virtual machine starts:

+
char device redirected to /dev/pts/20 (label virtiocon0)
+
+
+

Minicom use:

+
#for connecting via COM1 and using a speed of 115,200 characters per second
+minicom -b 115200 -D /dev/ttyS0
+
+#For USB serial port connection
+minicom -D /dev/ttyUSB0
+
+#To connect to the serial port of the virtual machine
+minicom -D /dev/pts/20
+
+
+
+
+

netconsole

+

Netconsole is a utility that allows logging of kernel debugging +messages over the network. This is useful when the disk logging system does not +work or when serial ports are not available or when the terminal does not +respond to commands. Netconsole comes in the form of a kernel +module.

+

To work, it needs the following parameters:

+
+
    +
  • port, IP address, and the source interface name of the debug station
  • +
  • port, MAC address, and IP address of the machine to which the debug +messages will be sent
  • +
+
+

These parameters can be configured when the module is inserted into the kernel, +or even while the module is inserted if it has been compiled with the +CONFIG_NETCONSOLE_DYNAMIC option.

+

An example configuration when inserting netconsole kernel module is +as follows:

+
alice:~# modprobe netconsole netconsole=6666@192.168.191.130/eth0,6000@192.168.191.1/00:50:56:c0:00:08
+
+
+

Thus, the debug messages on the station that has the address +192.168.191.130 will be sent to the eth0 interface, having source port +6666. The messages will be sent to 192.168.191.1 with the MAC address +00:50:56:c0:00:08, on port 6000.

+

Messages can be played on the destination station using netcat:

+
bob:~ # nc -l -p 6000 -u
+
+
+

Alternatively, the destination station can configure syslogd to +intercept these messages. More information can be found in +Documentation/networking/netconsole.txt.

+
+
+

Printk debugging

+

The two oldest and most useful debugging aids are Your Brain and Printf.

+

For debugging, a primitive way is often used, but it is quite effective: +printk debugging. Although a debugger can also be used, it is generally +not very useful: simple bugs (uninitialized variables, memory management +problems, etc.) can be easily localized by control messages and the +kernel-decoded oops message.

+

For more complex bugs, even a debugger cannot help us too much unless the +operating system structure is very well understood. When debugging a kernel +module, there are a lot of unknowns in the equation: multiple contexts (we have +multiple processes and threads running at a time), interrupts, virtual +memory, etc.

+

You can use printk to display kernel messages to user space. It is +similar to printf's functionality; the only difference is that the +transmitted message can be prefixed with a string of "<n>", where +n indicates the error level (loglevel) and has values between 0 and +7. Instead of "<n>", the levels can also be coded by symbolic +constants:

+
KERN_EMERG - n = 0
+KERN_ALERT - n = 1
+KERN_CRIT - n = 2
+KERN_ERR - n = 3
+KERN_WARNING - n = 4
+KERN_NOTICE - n = 5
+KERN_INFO - n = 6
+KERN_DEBUG - n = 7
+
+
+

The definitions of all log levels are found in linux/kern_levels.h. +Basically, these log levels are used by the system to route messages sent to +various outputs: console, log files in /var/log etc.

+
+

Note

+

To display printk messages in user space, the printk +log level must be of higher priority than console_loglevel +variable. The default console log level can be configured from +/proc/sys/kernel/printk.

+

For instance, the command:

+
echo 8 > /proc/sys/kernel/printk
+
+
+

will enable all the kernel log messages to be displayed in the +console. That is, the logging level has to be strictly less than the +console_loglevel variable. For example, if the +console_loglevel has a value of 5 (specific to +KERN_NOTICE), only messages with loglevel stricter than 5 +(i.e. KERN_EMERG, KERN_ALERT, KERN_CRIT, +KERN_ERR, KERN_WARNING) will be shown.

+
+

Console-redirected messages can be useful for quickly viewing the effect of +executing the kernel code, but they are no longer so useful if the kernel +encounters an irreparable error and the system freezes. In this case, the logs +of the system must be consulted, as they keep the information between system +restarts. These are found in /var/log and are text files, populated by +syslogd and klogd during the kernel run. syslogd and +klogd take the information from the virtual file system mounted in +/proc. In principle, with syslogd and klogd turned on, +all messages coming from the kernel will go to /var/log/kern.log.

+

A simpler version for debugging is using the /var/log/debug file. It +is populated only with the printk messages from the kernel with the +KERN_DEBUG log level.

+

Given that a production kernel (similar to the one we're probably running with) +contains only release code, our module is among the few that send messages +prefixed with KERN_DEBUG . In this way, we can easily navigate through the +/var/log/debug information by finding the messages corresponding to a +debugging session for our module.

+

Such an example would be the following:

+
# Clear the debug file of previous information (or possibly a backup)
+$ echo "New debug session" > /var/log/debug
+# Run the tests
+# If there is no critical error causing a kernel panic, check the output
+# if a critical error occurs and the machine only responds to a restart,
+  restart the system and check /var/log/debug.
+
+
+

The format of the messages must obviously contain all the information of +interest in order to detect the error, but inserting in the code printk +to provide detailed information can be as time-consuming as writing the code to +solve the problem. This is usually a trade-off between the completeness of the +debugging messages displayed using printk and the time it takes to +insert these messages into the text.

+

A very simple way, less time-consuming for inserting printk and +providing the possibility to analyze the flow of instructions for tests is the +use of the predefined constants __FILE__, __LINE__ and +__func__:

+
+
    +
  • __FILE__ is replaced by the compiler with the name of the source file +that is currently being compiled.
  • +
  • __LINE__ is replaced by the compiler with the line number on which the +current instruction is found in the current source file.
  • +
  • __func__ /__FUNCTION__ is replaced by the compiler with the name +of the function in which the current instruction is found.
  • +
+
+
+

Note

+

__FILE__ and __LINE__ are part of the ANSI C specifications: +__func__ is part of specification C99; __FUNCTION__ is a GNU +C extension and is not portable; However, since we write code for the +Linux kernel, we can use it without any problems.

+
+

The following macro definition can be used in this case:

+
#define PRINT_DEBUG \
+       printk (KERN_DEBUG "[%s]: FUNC:%s: LINE:%d\n", __FILE__, \
+               __FUNCTION__, __LINE__)
+
+
+

Then, at each point where we want to see if it is "reached" in execution, +insert PRINT_DEBUG. This is a simple and quick way, and can yield results by carefully +analyzing the output.

+

The dmesg command is used to view the messages printed with +printk but not appearing on the console.

+

To delete all previous messages from a log file, run:

+
cat /dev/null > /var/log/debug
+
+
+

To delete messages displayed by the dmesg command, run:

+
dmesg -c
+
+
+
+
+

Dynamic debugging

+

Dynamic dyndbg +debugging enables dynamic debugging activation/deactivation. +Unlike printk, it offers more advanced printk options for the +messages we want to display; it is very useful for complex modules or +troubleshooting subsystems. +This significantly reduces the amount of messages displayed, leaving only +those relevant for the debug context. To enable dyndbg, the kernel must be +compiled with the CONFIG_DYNAMIC_DEBUG option. Once configured, +pr_debug(), dev_dbg() and print_hex_dump_debug(), +print_hex_dump_bytes() can be dynamically enabled per call.

+

The /sys/kernel/debug/dynamic_debug/control file from the debugfs (where +/sys/kernel/debug is the path to which debugfs was mounted) is used to +filter messages or to view existing filters.

+
mount -t debugfs none /debug
+
+
+

Debugfs +is a simple file system, used as a kernel-space interface and +user-space interface to configure different debug options. Any debug utility +can create and use its own files /folders in debugfs.

+

For example, to display existing filters in dyndbg, you will use:

+
cat /debug/dynamic_debug/control
+
+
+

And to enable the debug message from line 1603 in the svcsock.c file:

+
echo 'file svcsock.c line 1603 +p' > /debug/dynamic_debug/control
+
+
+

The /debug/dynamic_debug/control file is not a regular file. It shows +the dyndbg settings on the filters. Writing in it with an echo will change +these settings (it will not actually make a write). Be aware that the file +contains settings for dyndbg debugging messages. Do not log in this file.

+
+

Dyndbg Options

+
    +
  • func - just the debug messages from the functions that have the same +name as the one defined in the filter.

    +
    echo 'func svc_tcp_accept +p' > /debug/dynamic_debug/control
    +
    +
    +
  • +
  • file - the name of the file(s) for which we want to display the debug +messages. It can be just the source name, but also the absolute path or +kernel-tree path.

    +
    file svcsock.c
    +file kernel/freezer.c
    +file /usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svcsock.c
    +
    +
    +
  • +
  • module - module name.

    +
    module sunrpc
    +
    +
    +
  • +
  • format - only messages whose display format contains the specified string.

    +
    format "nfsd: SETATTR"
    +
    +
    +
  • +
  • line - the line or lines for which we want to enable debug calls.

    +
    # Triggers debug messages between lines 1603 and 1605 in the svcsock.c file
    +$ echo 'file svcsock.c line 1603-1605 +p' > /sys/kernel/debug/dynamic_debug/control
    +# Enables debug messages from the beginning of the file to line 1605
    +$ echo 'file svcsock.c line -1605 +p' > /sys/kernel/debug/dynamic_debug/control
    +
    +
    +
  • +
+

In addition to the above options, a series of flags can be added, removed, or set +with operators +, - or =:

+
+
    +
  • p activates the pr_debug() .
  • +
  • f includes the name of the function in the printed message.
  • +
  • l includes the line number in the printed message.
  • +
  • m includes the module name in the printed message.
  • +
  • t includes the thread id if it is not called from interrupt context
  • +
  • _ no flag is set.
  • +
+
+
+
+
+

KDB: Kernel debugger

+

The kernel debugger has proven to be very useful to facilitate the development and +debugging process. One of its main advantages is the possibility to perform live debugging. +This allows us to monitor, in real time, the accesses to memory or even modify the memory +while debugging. +The debugger has been integrated in the mainline kernel starting with version 2.6.26-rc1. +KDB is not a source debugger, but for a complete analysis it can be used in parallel with +gdb and symbol files -- see the GDB debugging section.

+

To use KDB, you have the following options:

+
+
    +
  • non-usb keyboard + VGA text console
  • +
  • serial port console
  • +
  • USB EHCI debug port
  • +
+
+

For the lab, we will use a serial interface connected to the host. +The following command will activate GDB over the serial port:

+
echo hvc0 > /sys/module/kgdboc/parameters/kgdboc
+
+
+

KDB is a stop mode debugger, which means that, while it is active, all the other processes +are stopped. The kernel can be forced to enter KDB during execution using the following +SysRq command

+
echo g > /proc/sysrq-trigger
+
+
+

or by using the key combination Ctrl+O g in a terminal connected to the serial port +(for example using minicom).

+

KDB has various commands to control and define the context of the debugged system:

+
+
    +
  • lsmod, ps, kill, dmesg, env, bt (backtrace)
  • +
  • dump trace logs
  • +
  • hardware breakpoints
  • +
  • modifying memory
  • +
+
+

For a better description of the available commands you can use the help command in +the KDB shell. +In the next example, you can notice a simple KDB usage example which sets a hardware +breakpoint to monitor the changes of the mVar variable.

+
# trigger KDB
+echo g > /proc/sysrq-trigger
+# or if we are connected to the serial port issue
+Ctrl-O g
+# breakpoint on write access to the mVar variable
+kdb> bph mVar dataw
+# return from KDB
+kdb> go
+
+
+
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is kernel_modules. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/kernel_modules/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using cscope or LXR find the definitions of the following symbols +in the Linux kernel source code:

+
    +
  • module_init() and module_exit()
      +
    • what do the two macros do? What is init_module and cleanup_module?
    • +
    +
  • +
  • ignore_loglevel
      +
    • What is this variable used for?
    • +
    +
  • +
+
+

Warning

+

If you have problems using cscope, it is possible that the database +is not generated. To generate it, use the following command in the kernel +directory:

+
make ARCH=x86 cscope
+
+
+
+
+

Note

+

When searching for a structure using cscope, use only the +structure name (without struct). So, to search for the +structure struct module, you will use the command

+
+
vim -t module
+
+
+
+

or, in vim, the command

+
+
:cs f g module
+
+
+
+
+
+

Note

+

For more info on using cscope, read the +cscope section in the previous lab.

+
+
+
+

1. Kernel module

+

To work with the kernel modules, we will follow the steps described +above.

+
+
Generate the skeleton for the task named 1-2-test-mod then build the module,
+
by running the following command in tools/labs.
+
+
$ LABS=kernel_modules make skels
+$ make build
+
+
+

These commands will build all the modules in the current +lab skeleton.

+
+

Warning

+

Until after solving exercise 3, you will get a compilation error for +3-error-mod. To avoid this issue, remove the directory +skels/kernel_modules/3-error-mod/ and remove the corresponding +line from skels/Kbuild.

+
+

Start the VM using make console, and perform the following tasks:

+
    +
  • load the kernel module.
  • +
  • list the kernel modules and check if current module is present
  • +
  • unload the kernel module
  • +
  • view the messages displayed at loading/unloading the kernel module using +dmesg command
  • +
+
+

Note

+

Read Loading/unloading a kernel module section. When unloading +a kernel module, you can specify only the module name +(without extension).

+
+
+
+

2. Printk

+

Watch the virtual machine console. Why were the messages displayed directly +to the virtual machine console?

+

Configure the system such that the messages are not displayed directly +on the serial console, and they can only be inspected using dmesg.

+
+

Hint

+

One option is to set the console log level by writing +the desired level to /proc/sys/kernel/printk. +Use a value smaller than the level used for the prints in +the source code of the module.

+
+

Load/unload the module again. +The messages should not be printed to the virtual machine console, +but they should be visible when running dmesg.

+
+
+

3. Error

+

Generate the skeleton for the task named 3-error-mod. Compile the +sources and get the corresponding kernel module.

+

Why have compilation +errors occurred? Hint: How does this module differ from the previous module?

+

Modify the module to solve the cause of those errors, then compile and test +the module.

+
+
+

4. Sub-modules

+

Inspect the C source files mod1.c and mod2.c in 4-multi-mod/. +Module 2 contains only the definition of a function used by module 1.

+

Change the Kbuild file to create the multi_mod.ko module from the +two C source files.

+
+

Hint

+

Read the Compiling kernel modules section of the lab.

+
+

Compile, copy, boot the VM, load and unload the kernel module. Make sure messages +are properly displayed on the console.

+
+
+

5. Kernel oops

+

Enter the directory for the task 5-oops-mod and inspect the +C source file. Notice where the problem will occur. Add the compilation flag +-g in the Kbuild file.

+
+

Hint

+

Read Compiling kernel modules section of the lab.

+
+

Compile the corresponding module and load it into the kernel. Identify the memory +address at which the oops appeared.

+
+

Hint

+

Read the Debugging section of the lab. To identify the +address, follow the oops message and extract the value of +the instruction pointer (EIP) register.

+
+

Determine which instruction has triggered the oops.

+
+

Hint

+

Use the /proc/modules information to get the load address of +the kernel module. Use, on the physical machine, objdump +and/or addr2line. Objdump needs debugging support for +compilation! Read the lab's objdump and addr2line +sections.

+
+

Try to unload the kernel module. Notice that the operation does not +work because there are references from the kernel module within the +kernel since the oops. Until the release of those references (which is +almost impossible in the case of an oops), the module cannot be +unloaded.

+
+
+

6. Module parameters

+

Enter the directory for the task 6-cmd-mod and inspect the C +cmd_mod.c source file. Compile and copy the associated module and +load the kernel module to see the printk message. Then unload the +module from the kernel.

+

Without modifying the sources, load the kernel module so that the +message shown is Early bird gets tired.

+
+

Hint

+

The str variable can be changed by passing a parameter to +the module. Find more information here.

+
+
+
+

7. Proc info

+

Check the skeleton for the task named 7-list-proc. Add code to +display the Process ID (PID) and the executable name for the current +process.

+

Follow the commands marked with TODO. +The information must be displayed both when loading and unloading the +module.

+
+

Note

+
    +
  • In the Linux kernel, a process is described by the +struct task_struct. Use LXR or cscope to find the +definition of struct task_struct.
  • +
  • To find the structure field that contains the name of the +executable, look for the "executable" comment.
  • +
  • The pointer to the structure of the current process +running at a given time in the kernel is given by the +current variable (of the type +struct task_struct*).
  • +
+
+
+

Hint

+

To use current you'll need to include the header +in which the struct task_struct is defined, i.e. +linux/sched.h.

+
+

Compile, copy, boot the VM and load the module. Unload the kernel module.

+

Repeat the loading/unloading operation. Note that the PIDs of the +displayed processes differ. This is because a process is created +from the executable /sbin/insmod when the module is loaded and +when the module is unloaded a process is created from the executable +/sbin/rmmod.

+
+
+
+

Extra Exercises

+
+

1. KDB

+

Go to the 8-kdb directory. Activate KDB over the serial port and enter KDB +mode using SysRq. Connect to the pseudo-terminal linked to virtiocon0 +using minicom, configure KDB to use the hvc0 serial port:

+
echo hvc0 > /sys/module/kgdboc/parameters/kgdboc
+
+
+

and enable it using SysRq (Ctrl + O g). +Review the current system status (help to see the available KDB +commands). Continue the kernel execution using the go command.

+

Load the hello_kdb module. +The module will simulate a bug when writing to the /proc/hello_kdb_bug +file. To simulate a bug, use the below command:

+
echo 1 > /proc/hello_kdb_bug
+
+
+

After running the above command, at every oops/panic the kernel stops the +execution and enters debug mode.

+

Analyze the stacktrace and determine the code that generated the bug. +How can we find out from KDB the address where the module was loaded?

+

In parallel, use GDB in a new window to view the code based on KDB information.

+
+

Hint

+

Load the symbol file. Use info line.

+
+

When writing to /proc/hello_kdb_break, the module will increment the +kdb_write_address variable. Enter KDB and set a breakpoint for each +write access of the kdb_write_address variable. +Return to kernel to trigger a write using:

+
echo 1 > /proc/hello_kdb_break
+
+
+
+
+

2. PS Module

+

Update the created kernel module at proc-info in order to display +information about all the processes in the system, when inserting the kernel +module, not just about the current process. Afterwards, compare the obtained +result with the output of the ps command.

+
+

Hint

+
    +
  • Processes in the system are structured in a circular list.
  • +
  • for_each_... macros (such as for_each_process) are +useful when you want to navigate the items in a list.
  • +
  • To understand how to use a feature or a macro, use LXR or Vim and +cscope and search for usage scenarios.
  • +
+
+
+
+

3. Memory Info

+

Create a kernel module that displays the virtual memory areas of the current +process; for each memory area it will display the start address and the end +address.

+
+

Hint

+
    +
  • Start from an existing kernel module.
  • +
  • Investigate the structures struct task_struct, +struct mm_struct and struct vm_area_struct. A +memory area is indicated by a structure of type struct +vm_area_struct.
  • +
  • Don't forget to include the headers where the necessary structures are +defined.
  • +
+
+
+
+

4. Dynamic Debugging

+

Go to the 9-dyndbg directory and compile the dyndbg.ko module.

+

Familiarize yourself with the debugfs file system mounted in +/debug and analyze the contents of the file +/debug/dynamic_debug/control. Insert the dyndbg.ko module and +notice the new content of the dynamic_debug/control file.

+

What appears extra in the respective file? Run the following command:

+
grep dyndbg /debug/dynamic_debug/control
+
+
+

Configure dyndbg so that only messages marked as "Important" in +my_debug_func() function are displayed when the module is unloaded. +The exercise will only filter out the pr_debug() calls; printk() +calls being always displayed.

+

Specify two ways to filter.

+
+

Hint

+

Read the Dynamic debugging section and look at the dyndbg +options (for example, line, format).

+
+

Perform the filtering and revise the dynamic_debug/control file. What +has changed? How do you know which calls are activated?

+
+

Hint

+

Check the dyndbg flags. Unload the kernel module and observe the +log messages.

+
+
+
+

5. Dynamic Debugging During Initialization

+

As you have noticed, pr_debug() calls can only be activated /filtered +after module insertion. In some situations, it might be helpful to view the +messages from the initialization of the module. This can be done by using a +default (fake) parameter called dyndbg that can be passed as an +argument to initialize the module. With this parameter you can add /delete +dyndbg flags.

+
+

Hint

+

Read the last part of the Dynamic debugging section and see the available +flags (e.g.: +/- p).

+
+

Read the Debug Messages section at Module Initialization Time +and insert the module so that the messages in my_debug_func() (called +dyndbg_init()) are also displayed during initialization.

+
+

Warning

+

In the VM from the lab, you will need to use insmod instead of +modprobe.

+
+

Without unloading the module, deactivate pr_debug() calls.

+
+

Hint

+

You can delete the set flags. Unload the kernel module.

+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/kernel_profiling.html b/refs/pull/405/merge/labs/kernel_profiling.html new file mode 100644 index 00000000..80c42a4a --- /dev/null +++ b/refs/pull/405/merge/labs/kernel_profiling.html @@ -0,0 +1,645 @@ + + + + + + Kernel Profiling — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Kernel Profiling

+
+

Lab Objectives

+
+
    +
  • Familiarize yourself with the basics of Linux kernel profiling
  • +
  • Understanding basic profiling tools
  • +
  • Learning profiling methodologies and good practices
  • +
+
+
+
+

Overview

+

Up until now we have studied how the different components of the Linux kernel +work, and how to write drivers that interface with them in order to provide +support for devices or protocols. This has helped us understand how the Linux +kernel works, but most people will not get to write kernel drivers.

+

Nonetheless, the skills learned will help us to write applications that better +integrate with the whole operating system. In order to do this, one has to have +a good view of both the user space and the kernel space.

+

This session aims to merge the work we have done up until now in the kernel +space with real world use cases where we do not write kernel space code, but we +look through the kernel using profiling tools, in order to debug issues that +we're having when writing regular, low-level, applications.

+

Another focus of this session will be learning a general methodology for +debugging software issues, and we will approach some tools that give us insight +from the kernel on the way our application runs.

+
+
+

Profiling Tools

+

The main tool that we will focus our attention on is perf, which offers +support for tracing applications, and also inspecting general aspects of the +system. We will also be using debugging tools that most people have used in +their day to day life, such as htop, ps, lsof and others.

+
+

perf

+

perf is a tool that instruments the CPU using +tracepoints, kprobes and uprobes. This tool allows us to take a look at what +functions are being called at a given point. This allows us to take a peek at +where the kernel is spending the most time, print out call stacks of functions, +and in general log what the CPU is running.

+

perf integrates modules such as: +* static tracing +* dynamic tracing +* resource monitoring

+

The tracing interface that is offered by perf can be used by itself, using the +perf command together with its subcommands.

+
root@qemux86:~# ./skels/kernel_profiling/perf
+
+ usage: perf [--version] [--help] [OPTIONS] COMMAND [ARGS]
+
+ The most commonly used perf commands are:
+   annotate        Read perf.data (created by perf record) and display annotated code
+   archive         Create archive with object files with build-ids found in perf.data file
+   bench           General framework for benchmark suites
+   buildid-cache   Manage build-id cache.
+   buildid-list    List the buildids in a perf.data file
+   c2c             Shared Data C2C/HITM Analyzer.
+   config          Get and set variables in a configuration file.
+   data            Data file related processing
+   diff            Read perf.data files and display the differential profile
+   evlist          List the event names in a perf.data file
+   ftrace          simple wrapper for kernel's ftrace functionality
+   inject          Filter to augment the events stream with additional information
+   kallsyms        Searches running kernel for symbols
+   kmem            Tool to trace/measure kernel memory properties
+   kvm             Tool to trace/measure kvm guest os
+   list            List all symbolic event types
+   lock            Analyze lock events
+   mem             Profile memory accesses
+   record          Run a command and record its profile into perf.data
+   report          Read perf.data (created by perf record) and display the profile
+   sched           Tool to trace/measure scheduler properties (latencies)
+   script          Read perf.data (created by perf record) and display trace output
+   stat            Run a command and gather performance counter statistics
+   test            Runs sanity tests.
+   timechart       Tool to visualize total system behavior during a workload
+   top             System profiling tool.
+   version         display the version of perf binary
+   probe           Define new dynamic tracepoints
+
+ See 'perf help COMMAND' for more information on a specific command.
+
+
+

In the output above we can see all of perf's subcommands together with a +description of their functionality, the most significant of which are:

+
    +
  • stat - displays statistics such as the number of context switches and page +faults;
  • +
  • top - an interactive interface where we can inspect the most frequent +function calls and their caller. This interface allows us direct feedback +while profiling;
  • +
  • list - lists the static trace points that we can instrument inside the +kernel. These are useful when trying to get an insight from inside the kernel;
  • +
  • probe - add a dynamic trace point that instruments a function call in +order to be recorded by perf;
  • +
  • record - records function calls and stack traces based on tracing points +defined by the user; It can also record specific function calls and their +stack traces. The record is saved in a file, named perf.data by default;
  • +
  • report - displays the information saved in a perf recording.
  • +
+

Another way to use perf's interface is through scripts that wrap over perf that +offer a higher level way of looking at events or data, without needing to know +the intricacies of the command. An example of this is the iosnoop.sh script, +which displays what I/O transfers are taking place.

+
+
+

ps

+

ps is the Linux tool that allows us to monitor the processes that are +running at a given time on the machine, including the kernel threads. This is a +simple and easy to use way of checking at a glance what processes are running on +the CPU, and what is their CPU and memory usage.

+

In order to list all the processes running, we use the ps aux command in the +following way:

+
TODO
+root@qemux86:~/skels/kernel_profiling/0-demo# cd
+ root@qemux86:~# ps aux
+ USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
+ root         1  0.0  0.5   2004  1256 ?        Ss   12:06   0:12 init [5]
+ root         2  0.0  0.0      0     0 ?        S    12:06   0:00 [kthreadd]
+ [...]
+ root       350  4.5  4.4  11132 10688 hvc0     T    12:07  17:21 ./io-app
+ root      1358  0.0  0.0      0     0 ?        I    14:30   0:00 [kworker/u2:1-e
+ root      2293  0.1  1.5   5516  3704 ?        Ss   18:18   0:00 sshd: root@pts/
+ root      2295  0.0  1.3   3968  3232 pts/0    Ss+  18:19   0:00 -sh
+ root      2307  0.0  0.0      0     0 ?        I    18:19   0:00 [kworker/u2:2-e
+ root      2350  0.0  0.7   3032  1792 hvc0     R+   18:26   0:00 ps aux
+ root      2392  2.6  0.0      0     0 ?        D    18:31   0:00 test-script
+
+
+

One piece of information of note is that the 8th column (STAT) represents the state of the +process, S meaning suspended, D suspended due to I/O, and R meaning +running.

+
+
+

time

+

The time command allows us to inspect the amount of time spent by a +process in I/O, running the application code, or running code in kernel space. +This can be useful in order to find out whether an application's issue comes +from running too much in kernel space, so it has some overhead when it does +system calls, or the issue is in the user code.

+
root@qemux86:~# time dd if=/dev/urandom of=./test-file bs=1K count=10
+10+0 records in
+10+0 records out
+10240 bytes (10 kB, 10 KiB) copied, 0.00299749 s, 3.4 MB/s
+
+real        0m0.020s
+user        0m0.001s
+sys 0m0.015s
+
+
+

In the output above we timed the generation of a file using dd. The result +of the timing is displayed at the bottom of output. The values outputted by the +tool are the following:

+
    +
  • real - the amount of time that has passed from the start of the application to +its finishing;
  • +
  • user - time spent running the dd code;
  • +
  • sys - time spent running kernel code on behalf of the process.
  • +
+

We see that the sum of the user and sys values doesn't add up to the +real value. This happens either when the application runs on multiple cores, +in which case the sum might be higher, or the application sleeps, in which case +the sum is lower.

+
+
+

top

+

top is an application that is found on most systems which lists in real time +the applications that are running on the system. top runs interactively, and +it auto-refreshes its output, as opposed to ps. We use this tool when we +want a high level of continuous monitoring.

+
+
+
+

Profiling Methodology

+

When doing profiling, our goal is to identify the cause of a problem. Usually +this problem is observed by someone when their application doesn't work as +expected. When we say that an application did not work as expected, this can +mean different things for different people. For example, one person might +complain that the application has a slowdown, while another might say that the +application runs on the CPU, but it doesn't output anything.

+

The first step in any problem solving context is to understand the default +behaviour of the application we're trying to debug, and to confirm that it is +currently not running within the expected parameters.

+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is kernel_profiling. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/kernel_profiling/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

Note

+

This session will require us to use the perf tracing tool. When running +natively on our systems, we have to install the +linux-tools-<version>-generic package using a package manager in order +to run it. Because in our virtual machine we don't have access to a package +manager, we will be downloading the perf binary from this link. Download the application in +the skels/kernel_profiling directory, and grant it execution +permissions.

+
+
+

Warning

+

When running perf, make sure that you're running the downloaded version, +not the version in the PATH variable.

+
+
+

Note

+

When going through this session's exercises, we will have to run commands in +parallel. In order to do this, we will have to connect to the virtual machine +using SSH. We recommend using the core-image-sato-sdk-qemu image, since it +has the tools that we need. To run the virtual machine using the +core-image-sato-sdk-qemu file system, uncomment line 16 in the +qemu/Makefile file.

+
+
+

Note

+

If you wish to run the perf-tools based scripts that we have included in +the repository, such as iosnoop.sh, you will have to grant it execution +privileges, in order to be copied to the virtual machine file system.

+
+
+

Note

+

In order to improve the course of SO2, its components and the way it is +conducted, your opinions are very useful to us. Please fill the feedback form +on curs.upb.ro platform.

+

The form is anonymous and is active between May 22 and June 2, 2023. The +results will be visible to the SO2 team after all the grades have been +marked.

+

We invite you to evaluate the activity of the SO2 team and specify its +strengths and weaknesses and your suggestions for improving the subject. +Your feedback is very important to us to increase the quality of the subject +in the coming years.

+

We are particularly interested in:

+
+
    +
  • What did you not like and what do you think did not go well?
  • +
  • Why didn't you like it and why do you think it didn't go well?
  • +
  • What should we do to make things better?
  • +
+
+
+
+
+

0. Demo: Profiling I/O Problems

+

When working with I/O, we have to keep in mind that it is one of the slowest +systems in the operating system, compared to memory, which is an order of +magnitude faster, and scheduling, which deals with what is currently running on +the CPU.

+

Because of this, I/O operations have to be thought out, because you might starve +your application by saturating the system with requests. Another issue that you +might face is that the I/O's slow speed might affect your application's +responsiveness, if it waits for the I/O operations to finish.

+

Let's take a look at an application and debug its issues.

+

We are going to run the io-app application, from the 0-demo directory.

+

In order to inspect what is running on the CPU, and look at the stack of the +process, we can use the perf record subcommand in the following way:

+
root@qemux86:~# ./perf record -a -g
+Couldn't synthesize bpf events.
+^C[ perf record: Woken up 7 times to write data ]
+[ perf record: Captured and wrote 1.724 MB perf.data (8376 samples) ]
+
+
+

perf will record values indefinitely, but we can close it using the Ctrl+c +hotkey. We used the -a option in order to probe all CPUs, and -g option, +which records the whole call stack.

+

To visualize the recorded information, we will use the perf report command, +which will bring up a pager which will display the most frequent function calls +that were found on the CPU, and their call stack.

+
root@qemux86:~# ./perf report --header -F overhead,comm,parent
+# Total Lost Samples: 0
+#
+# Samples: 8K of event 'cpu-clock:pppH'
+# Event count (approx.): 2094000000
+#
+# Overhead  Command          Parent symbol
+# ........  ...............  .............
+#
+    58.63%  io-app           [other]
+            |
+             --58.62%--__libc_start_main
+                       main
+                       __kernel_vsyscall
+                       |
+                        --58.61%--__irqentry_text_end
+                                  do_SYSENTER_32
+                                  do_fast_syscall_32
+                                  __noinstr_text_start
+                                  __ia32_sys_write
+                                  ksys_write
+                                  vfs_write
+                                  |
+                                   --58.60%--ext4_file_write_iter
+                                             ext4_buffered_write_iter
+[...]
+
+
+

We have used the --header option in order to print the table header, and -F +overhead,comm,parent, in order to print the percentage of time spent in each call +stack, the command and the caller.

+

We can see that the io-app command is doing some writes in the file system, +and this contributes to much of the load on the system.

+

Armed with this information, we know that there are many I/O calls being done by +the application. In order to look at the size of these requests, we can use the +iosnoop.sh script in order to see how big these requests are.

+
root@qemux86:~/skels/kernel_profiling# ./iosnoop.sh 1
+Tracing block I/O. Ctrl-C to end.
+COMM         PID    TYPE DEV      BLOCK        BYTES     LATms
+io-app       889    WS   254,0    4800512      1310720     2.10
+io-app       889    WS   254,0    4803072      1310720     2.04
+io-app       889    WS   254,0    4805632      1310720     2.03
+io-app       889    WS   254,0    4808192      1310720     2.43
+io-app       889    WS   254,0    4810752      1310720     3.48
+io-app       889    WS   254,0    4813312      1310720     3.46
+io-app       889    WS   254,0    4815872      524288     1.03
+io-app       889    WS   254,0    5029888      1310720     5.82
+io-app       889    WS   254,0    5032448      786432     5.80
+jbd2/vda-43  43     WS   254,0    2702392      8192       0.22
+kworker/0:1H 34     WS   254,0    2702408      4096       0.40
+io-app       889    WS   254,0    4800512      1310720     2.60
+io-app       889    WS   254,0    4803072      1310720     2.58
+[...]
+
+
+

From this output we see that the io-app is writing in a loop from the fact +that the first block 4800512 is repeating, and that it is doing big writes, +since it is writing one megabyte per request. This constant looping adds the +load to the system that we're experiencing.

+
+

1. Investigating Reduced Responsiveness

+

The io.ko module, located in the kernel_profiling/1-io directory, +decreases the system's responsiveness when inserted. We see that the command +line stutters when typing commands, but when running top, we see that the +system's load is not high, and there aren't any processes that are hogging +resources.

+

Find out what the io.ko module is doing and why it is leading to the +stuttering effect that we experience.

+
+

Hint

+

Trace all the functions being called and check where the CPU is +spending most of its time. In order to do this, you can run either perf +record and perf report to view the output, or perf top.

+
+
+
+

2. Launching New Threads

+

We want to run the same function in a loop 100 times in parallel. We have +implemented two solutions inside the scheduling binary file, located in the +kernel_profiling/2-scheduling directory.

+

When executing the scheduling binary, it prints a message in parallel from +100 running instances. We can tune this execution by running the application +either with the first parameter 0 or 1.

+

Find out which solution is better, and why.

+
+
+

3. Tuning cp

+

Our goal is to write a copy of the cp tool integrated in Linux, which has +been implemented by the memory binary, in the kernel_profiling/3-memory +directory. It implements two approaches that we can take for the copy operation:

+
    +
  • reading the contents of the source file in a buffer in memory using the +read() system call, and writing that buffer to the destination file using +the write() system call;
  • +
  • mapping the source and destination files to memory using the mmap system +call, and copying the contents of the source file to the destination in +memory.
  • +
+

Another tunable parameter that we're going to use is the block size of the copies +that we're going to make, either through reads/writes or in memory.

+

1) Investigate which of the two copying mechanisms is faster. For this step, you +will use the 1024 block size.

+

2) Once you have found which copying mechanism is faster, change the block size +parameter and see which value gives you the best copies. Why?

+
+
+

4. I/O Latency

+

We have written a module that reads the content of a disk. When we insert the bio.ko +module, located in the 4-bio directory, we see a large spike in the system's +load, as can be seen in the top command, but we see that the system is still +responsive.

+

Investigate what is causing the increased load to the system. Is it an I/O issue, +or is it a scheduling issue?

+
+

Hint

+

Try to trace the I/O operations using perf, or use the +iosnoop.sh script in order to inspect what I/O is happening at a +certain point.

+
+
+
+

5. Bad ELF

+
+

Note

+

This is a bonus exercise that has been tested on a native Linux system. +It may run under the QEMU virtual machine, but the behavior was weird in our testing. +We recommend you use a native (or VirtualBox or VMware) Linux system.

+
+

We managed to build (as part of a Unikraft build) an ELF file that is valid when doing static analysis, but that can't be executed. +The file is bad_elf, located in the 5-bad-elf/ folder.

+

Running it triggers a segmentation fault message. +Running it using strace shows an error with execve().

+
... skels/kernel_profiling/5-bad-elf$ ./bad_elf
+Segmentation fault
+
+... skels/kernel_profiling/5-bad-elf$ strace ./bad_elf
+execve("./bad_elf", ["./bad_elf"], 0x7ffc3349ba50 /* 70 vars */) = -1 EINVAL (Invalid argument)
+--- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=NULL} ---
++++ killed by SIGSEGV +++
+Segmentation fault (core dumped)
+
+
+

The ELF file itself is valid:

+
... skels/kernel_profiling/5-bad-elf$ readelf -a bad_elf
+
+
+

The issue is to be detected in the kernel.

+

Use either perf, or, better yet ftrace to inspect the kernel function calls done by the program. +Identify the function call that sends out the SIGSEGV signal. +Identify the cause of the issue. +Find that cause in the manual page elf(5).

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/memory_mapping.html b/refs/pull/405/merge/labs/memory_mapping.html new file mode 100644 index 00000000..8587a6d6 --- /dev/null +++ b/refs/pull/405/merge/labs/memory_mapping.html @@ -0,0 +1,674 @@ + + + + + + Memory mapping — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Memory mapping

+
+

Lab objectives

+
    +
  • Understand address space mapping mechanisms
  • +
  • Learn about the most important structures related to memory management
  • +
+

Keywords:

+
    +
  • address space
  • +
  • mmap()
  • +
  • struct page
  • +
  • struct vm_area_struct
  • +
  • struct vm_struct
  • +
  • remap_pfn_range
  • +
  • SetPageReserved()
  • +
  • ClearPageReserved()
  • +
+
+
+

Overview

+

In the Linux kernel it is possible to map a kernel address space to a +user address space. This eliminates the overhead of copying user space +information into the kernel space and vice versa. This can be done +through a device driver and the user space device interface +(/dev).

+

This feature can be used by implementing the mmap() operation +in the device driver's struct file_operations and using the +mmap() system call in user space.

+

The basic unit for virtual memory management is a page, whose size is +usually 4K, but it can be up to 64K on some platforms. Whenever we +work with virtual memory we work with two types of addresses: virtual +address and physical address. All CPU access (including from kernel +space) uses virtual addresses that are translated by the MMU into +physical addresses with the help of page tables.

+

A physical page of memory is identified by the Page Frame Number +(PFN). The PFN can be easily computed from the physical address by +dividing it by the size of the page (or by shifting the physical +address by PAGE_SHIFT bits to the right).

+../_images/paging.png +

For efficiency reasons, the virtual address space is divided into +user space and kernel space. For the same reason, the kernel space +contains a memory mapped zone, called lowmem, which is contiguously +mapped in physical memory, starting from the lowest possible physical +address (usually 0). The virtual address where lowmem is mapped is +defined by PAGE_OFFSET.

+

On a 32bit system, not all available memory can be mapped in lowmem and +because of that there is a separate zone in kernel space called +highmem which can be used to arbitrarily map physical memory.

+

Memory allocated by kmalloc() resides in lowmem and it is +physically contiguous. Memory allocated by vmalloc() is not +contiguous and does not reside in lowmem (it has a dedicated zone in +highmem).

+../_images/kernel-virtmem-map.png +
+
+

Structures used for memory mapping

+

Before discussing the memory mapping mechanism over a device, +we will present some of the basic structures used by the Linux memory +management subsystem. +Some of the basic structures are: struct page, +struct vm_area_struct, struct mm_struct.

+
+

struct page

+

struct page is used to embed information about all physical +pages in the system. The kernel has a struct page structure +for all pages in the system.

+

There are many functions that interact with this structure:

+
    +
  • virt_to_page() returns the page associated with a virtual +address
  • +
  • pfn_to_page() returns the page associated with a page frame +number
  • +
  • page_to_pfn() returns the page frame number associated with a +struct page
  • +
  • page_address() returns the virtual address of a +struct page; this function can be called only for pages from +lowmem
  • +
  • kmap() creates a mapping in kernel for an arbitrary physical +page (can be from highmem) and returns a virtual address that can be +used to directly reference the page
  • +
+
+
+

struct vm_area_struct

+

struct vm_area_struct holds information about a contiguous +virtual memory area. The memory areas of a process can be viewed by +inspecting the maps attribute of the process via procfs:

+
root@qemux86:~# cat /proc/1/maps
+#address          perms offset  device inode     pathname
+08048000-08050000 r-xp 00000000 fe:00 761        /sbin/init.sysvinit
+08050000-08051000 r--p 00007000 fe:00 761        /sbin/init.sysvinit
+08051000-08052000 rw-p 00008000 fe:00 761        /sbin/init.sysvinit
+092e1000-09302000 rw-p 00000000 00:00 0          [heap]
+4480c000-4482e000 r-xp 00000000 fe:00 576        /lib/ld-2.25.so
+4482e000-4482f000 r--p 00021000 fe:00 576        /lib/ld-2.25.so
+4482f000-44830000 rw-p 00022000 fe:00 576        /lib/ld-2.25.so
+44832000-449a9000 r-xp 00000000 fe:00 581        /lib/libc-2.25.so
+449a9000-449ab000 r--p 00176000 fe:00 581        /lib/libc-2.25.so
+449ab000-449ac000 rw-p 00178000 fe:00 581        /lib/libc-2.25.so
+449ac000-449af000 rw-p 00000000 00:00 0
+b7761000-b7763000 rw-p 00000000 00:00 0
+b7763000-b7766000 r--p 00000000 00:00 0          [vvar]
+b7766000-b7767000 r-xp 00000000 00:00 0          [vdso]
+bfa15000-bfa36000 rw-p 00000000 00:00 0          [stack]
+
+
+

A memory area is characterized by a start address, a stop address, +length, permissions.

+

A struct vm_area_struct is created at each mmap() +call issued from user space. A driver that supports the mmap() +operation must complete and initialize the associated +struct vm_area_struct. The most important fields of this +structure are:

+
    +
  • vm_start, vm_end - the beginning and the end of +the memory area, respectively (these fields also appear in +/proc/<pid>/maps);
  • +
  • vm_file - the pointer to the associated file structure (if any);
  • +
  • vm_pgoff - the offset of the area within the file;
  • +
  • vm_flags - a set of flags;
  • +
  • vm_ops - a set of working functions for this area
  • +
  • vm_next, vm_prev - the areas of the same process +are chained by a list structure
  • +
+
+
+

struct mm_struct

+

struct mm_struct encompasses all memory areas associated +with a process. The mm field of struct task_struct +is a pointer to the struct mm_struct of the current process.

+
+
+
+

Device driver memory mapping

+

Memory mapping is one of the most interesting features of a Unix +system. From a driver's point of view, the memory-mapping facility +allows direct memory access to a user space device.

+

To assign a mmap() operation to a driver, the mmap +field of the device driver's struct file_operations must be +implemented. If that is the case, the user space process can then use +the mmap() system call on a file descriptor associated with +the device.

+

The mmap system call takes the following parameters:

+
void *mmap(caddr_t addr, size_t len, int prot,
+           int flags, int fd, off_t offset);
+
+
+

To map memory between a device and user space, the user process must +open the device and issue the mmap() system call with the resulting +file descriptor.

+

The device driver mmap() operation has the following signature:

+
int (*mmap)(struct file *filp, struct vm_area_struct *vma);
+
+
+

The filp field is a pointer to a struct file created when +the device is opened from user space. The vma field is used to +indicate the virtual address space where the memory should be mapped +by the device. A driver should allocate memory (using +kmalloc(), vmalloc(), alloc_pages()) and then +map it to the user address space as indicated by the vma parameter +using helper functions such as remap_pfn_range().

+

remap_pfn_range() will map a contiguous physical address space +into the virtual space represented by vm_area_struct:

+
int remap_pfn_range (struct vm_area_struct *vma, unsigned long addr,
+                     unsigned long pfn, unsigned long size, pgprot_t prot);
+
+
+

remap_pfn_range() expects the following parameters:

+
    +
  • vma - the virtual memory space in which mapping is made;
  • +
  • addr - the virtual address space from where remapping begins; page +tables for the virtual address space between addr and addr + size +will be formed as needed
  • +
  • pfn - the page frame number to which the virtual address should be +mapped
  • +
  • size - the size (in bytes) of the memory to be mapped
  • +
  • prot - protection flags for this mapping
  • +
+

Here is an example of using this function that contiguously maps the +physical memory starting at page frame number pfn (memory that was +previously allocated) to the vma->vm_start virtual address:

+
struct vm_area_struct *vma;
+unsigned long len = vma->vm_end - vma->vm_start;
+int ret ;
+
+ret = remap_pfn_range(vma, vma->vm_start, pfn, len, vma->vm_page_prot);
+if (ret < 0) {
+    pr_err("could not map the address area\n");
+    return -EIO;
+}
+
+
+

To obtain the page frame number of the physical memory we must +consider how the memory allocation was performed. For each +kmalloc(), vmalloc(), alloc_pages(), we must +use a different approach. For kmalloc() we can use something +like:

+
static char *kmalloc_area;
+
+unsigned long pfn = virt_to_phys((void *)kmalloc_area)>>PAGE_SHIFT;
+
+
+

while for vmalloc():

+
static char *vmalloc_area;
+
+unsigned long pfn = vmalloc_to_pfn(vmalloc_area);
+
+
+

and finally for alloc_pages():

+
struct page *page;
+
+unsigned long pfn = page_to_pfn(page);
+
+
+
+

Attention

+

Note that memory allocated with vmalloc() is not +physically contiguous so if we want to map a range allocated +with vmalloc(), we have to map each page individually +and compute the physical address for each page.

+
+

Since the pages are mapped to user space, they might be swapped +out. To avoid this we must set the PG_reserved bit on the page. +Enabling is done using SetPageReserved() while resetting it +(which must be done before freeing the memory) is done with +ClearPageReserved():

+
char *alloc_mmap_pages(int npages)
+{
+    int i;
+    char *mem = kmalloc(PAGE_SIZE * npages, GFP_KERNEL);
+
+    if (!mem)
+        return mem;
+
+    for(i = 0; i < npages * PAGE_SIZE; i += PAGE_SIZE)
+        SetPageReserved(virt_to_page(((unsigned long)mem) + i));
+
+    return mem;
+}
+
+void free_mmap_pages(void *mem, int npages)
+{
+    int i;
+
+    for(i = 0; i < npages * PAGE_SIZE; i += PAGE_SIZE)
+        ClearPageReserved(virt_to_page(((unsigned long)mem) + i));
+
+    kfree(mem);
+}
+
+
+
+ +
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is memory_mapping. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/memory_mapping/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

1. Mapping contiguous physical memory to userspace

+

Implement a device driver that maps contiguous physical memory +(e.g. obtained via kmalloc()) to userspace.

+

Review the Device driver memory mapping section, generate the +skeleton for the task named kmmap and fill in the areas marked +with TODO 1.

+

Start by allocating a memory area of NPAGES+2 pages using kmalloc() +in the module init function and find the first address in the area that is +aligned to a page boundary.

+
+

Hint

+

The size of a page is PAGE_SIZE.

+

Store the allocated area in kmalloc_ptr and the page +aligned address in kmalloc_area:

+

Use PAGE_ALIGN() to determine kmalloc_area.

+
+

Enable the PG_reserved bit of each page with +SetPageReserved(). Clear the bit with +ClearPageReserved() before freeing the memory.

+
+

Hint

+

Use virt_to_page() to translate virtual pages into +physical pages, as required by SetPageReserved() +and ClearPageReserved().

+
+

For verification purpose (using the test below), fill in the first 4 +bytes of each page with the following values: 0xaa, 0xbb, 0xcc, 0xdd.

+

Implement the mmap() driver function.

+
+

Hint

+

For mapping, use remap_pfn_range(). The third +argument for remap_pfn_range() is a page frame number (PFN).

+

To convert from virtual kernel address to physical address, +use virt_to_phys().

+

To convert a physical address to its PFN, shift the address +with PAGE_SHIFT bits to the right.

+
+

For testing, load the kernel module and run:

+
root@qemux86:~# skels/memory_mapping/test/mmap-test 1
+
+
+

If everything goes well, the test will show "matched" messages.

+
+
+

2. Mapping non-contiguous physical memory to userspace

+

Implement a device driver that maps non-contiguous physical memory +(e.g. obtained via vmalloc()) to userspace.

+

Review the Device driver memory mapping section, generate the +skeleton for the task named vmmap and fill in the areas marked +with TODO 1.

+

Allocate a memory area of NPAGES with vmalloc().

+
+

Hint

+

The size of a page is PAGE_SIZE. +Store the allocated area in vmalloc_area. +Memory allocated by vmalloc() is page aligned.

+
+

Enable the PG_reserved bit of each page with +SetPageReserved(). Clear the bit with +ClearPageReserved() before freeing the memory.

+
+

Hint

+

Use vmalloc_to_page() to translate virtual pages +into physical pages used by the functions +SetPageReserved() and ClearPageReserved().

+
+

For verification purpose (using the test below), fill in the first 4 +bytes of each page with the following values: 0xaa, 0xbb, 0xcc, 0xdd.

+

Implement the mmap driver function.

+
+

Hint

+

To convert from virtual vmalloc address to physical address, +use vmalloc_to_pfn() which returns a PFN directly.

+
+
+

Attention

+

vmalloc pages are not physically contiguous so +remap_pfn_range() must be called for each page.

+

Loop through all virtual pages and for each: +* determine the physical address +* map it with remap_pfn_range()

+

Make sure that you determine the physical address +each time and that you use a range of one page for mapping.

+
+

For testing, load the kernel module and run:

+
root@qemux86:~# skels/memory_mapping/test/mmap-test 1
+
+
+

If everything goes well, the test will show "matched" messages.

+
+
+

3. Read / write operations in mapped memory

+

Modify one of the previous modules to allow read / write operations on +your device. This is a didactic exercise to see that the same space +can also be used with the mmap() call and with read() +and write() calls.

+

Fill in areas marked with TODO 2.

+
+

Note

+

The offset parameter sent to the read / write operation can +be ignored as all reads / writes from the test program will +be done with 0 offsets.

+
+

For testing, load the kernel module and run:

+
root@qemux86:~# skels/memory_mapping/test/mmap-test 2
+
+
+
+
+

4. Display memory mapped in procfs

+

Using one of the previous modules, create a procfs file in which you +display the total memory mapped by the calling process.

+

Fill in the areas marked with TODO 3.

+

Create a new entry in procfs (PROC_ENTRY_NAME, defined in +mmap-test.h) that will show the total memory mapped by the process +that called the read() on that file.

+
+

Hint

+

Use proc_create(). For the mode parameter, use 0, +and for the parent parameter use NULL. Use +my_proc_file_ops() for operations.

+
+

In the module exit function, delete the PROC_ENTRY_NAME entry +using remove_proc_entry().

+
+

Note

+

A (complex) use and description of the struct +seq_file interface can be found here in this example .

+

For this exercise, just a simple use of the interface +described here is +sufficient. Check the "extra-simple" API described there.

+
+

In the my_seq_show() function you will need to:

+
    +
  • Obtain the struct mm_struct structure of the current process +using the get_task_mm() function.

    +
    +

    Hint

    +

    The current process is available via the current variable +of type struct task_struct*.

    +
    +
  • +
  • Iterate through the entire struct vm_area_struct list +associated with the process.

    +
    +

    Hint

    +

    Use the variable vma_iterator and start from +mm->mmap. Use the vm_next field of +the struct vm_area_struct to navigate through +the list of memory areas. Stop when you reach NULL.

    +
    +
  • +
  • Use vm_start and vm_end for each area to compute the total size.

    +
  • +
  • Use pr_info("%lx %lx\n", ...) to print vm_start and vm_end for +each area.

    +
  • +
  • To release struct mm_struct, decrement the reference +counter of the structure using mmput().

    +
  • +
  • Use seq_printf() to write to the file. Show only the total count, +no other messages. Do not even print a newline (\n).

    +
  • +
+

In my_seq_open() register the display function +(my_seq_show()) using single_open().

+
+

Note

+

single_open() can use NULL as its third argument.

+
+

For testing, load the kernel module and run:

+
root@qemux86:~# skels/memory_mapping/test/mmap-test 3
+
+
+
+

Note

+

The test waits for a while (it has an internal sleep +instruction). As long as the test waits, use the +pmap command in another console to see the +mappings of the test and compare those to the test results.

+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/labs/networking.html b/refs/pull/405/merge/labs/networking.html new file mode 100644 index 00000000..b4a3e42b --- /dev/null +++ b/refs/pull/405/merge/labs/networking.html @@ -0,0 +1,1387 @@ + + + + + + Networking — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Networking

+
+

Lab objectives

+
+
    +
  • Understanding the Linux kernel networking architecture
  • +
  • Acquiring practical IP packet management skills using a packet filter or +firewall
  • +
  • Familiarize yourself with how to use sockets at the Linux kernel level
  • +
+
+
+
+

Overview

+

The development of the Internet has led to an exponential increase in network +applications and, as a consequence, to increasing the speed and productivity +requirements of an operating system's networking subsystem. The networking +subsystem is not an essential component of an operating system kernel (the Linux +kernel can be compiled without networking support). It is, however, quite +unlikely for a computing system (or even an embedded device) to have a +non-networked operating system due to the need for connectivity. Modern operating +systems use the TCP/IP stack. Their kernel +implements protocols up to the transport layer, while application layer protocols +are typically implemented in user space (HTTP, FTP, SSH, etc.).

+
+

Networking in user space

+

In user space the abstraction of network communication is the socket. The +socket abstracts a communication channel and is the kernel-based TCP/IP stack +interaction interface. An IP socket is associated with an IP address, the +transport layer protocol used (TCP, UDP etc) and a port. Common function calls +that use sockets are: creation (socket), initialization +(bind), connecting (connect), waiting for a connection +(listen, accept), closing a socket (close).

+

Network communication is accomplished via read/write or recv/send calls +for TCP sockets and recvfrom/sendto for UDP sockets. Transmission and +reception operations are transparent to the application, leaving encapsulation +and transmission over network at the kernel's discretion. However, it is +possible to implement the TCP/IP stack in user space using raw sockets (the +PF_PACKET option when creating a socket), or implementing an application +layer protocol in kernel (TUX web server).

+

For more details about user space programming using sockets, see Beej's Guide to +Network Programming Using Internet +Sockets.

+
+
+
+

Linux networking

+

The Linux kernel provides three basic structures for working with network +packets: struct socket, struct sock and struct +sk_buff.

+

The first two are abstractions of a socket:

+
+
    +
  • struct socket is an abstraction very close to user space, i.e. BSD +sockets used to program +network applications;
  • +
  • struct sock or INET socket in Linux terminology is the network +representation of a socket.
  • +
+
+

The two structures are related: the struct socket contains an INET +socket field, and the struct sock has a BSD socket that holds it.

+

The struct sk_buff structure is the representation of a network packet +and its status. The structure is created when a kernel packet is received, +either from the user space or from the network interface.

+
+

The struct socket structure

+

The struct socket structure is the kernel representation of a BSD +socket; the operations that can be executed on it are similar to those offered +by the kernel (through system calls). Common operations with sockets +(creation, initialization/bind, closing, etc.) result in specific system +calls; they work with the struct socket structure.

+

The struct socket operations are described in net/socket.c and +are independent of the protocol type. The struct socket structure is thus +a generic interface over particular network operations implementations. +Typically, the names of these operations begin with the sock_ prefix.

+
+

Operations on the socket structure

+

Socket operations are:

+
+
Creation
+

Creation is similar to calling the socket() function in user space, but the +struct socket created will be stored in the res parameter:

+
+
    +
  • int sock_create(int family, int type, int protocol, struct socket **res) +creates a socket after the socket() system call;
  • +
  • int sock_create_kern(struct net *net, int family, int type, int protocol, +struct socket **res) creates a kernel socket;
  • +
  • int sock_create_lite(int family, int type, int protocol, struct socket **res) +creates a kernel socket without parameter sanity checks.
  • +
+
+

The parameters of these calls are as follows:

+
+
    +
  • net, where it is present, used as reference to the network namespace used; +we will usually initialize it with init_net;
  • +
  • family represents the family of protocols used in the transfer of +information; they usually begin with the PF_ (Protocol Family) string; +the constants representing the family of protocols used are found in +linux/socket.h, of which the most commonly used is PF_INET, for +TCP/IP protocols;
  • +
  • type is the type of socket; the constants used for this parameter are +found in linux/net.h, of which the most used are SOCK_STREAM for +a connection based source-to-destination communication and SOCK_DGRAM +for connectionless communication;
  • +
  • protocol represents the protocol used and is closely related to the +type parameter; the constants used for this parameter are found in +linux/in.h, of which the most used are IPPROTO_TCP for TCP and +IPPROTO_UDP for UDP.
  • +
+
+

To create a TCP socket in kernel space, you must call:

+
struct socket *sock;
+int err;
+
+err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+if (err < 0) {
+        /* handle error */
+}
+
+
+

and for creating UDP sockets:

+
struct socket *sock;
+int err;
+
+err = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+if (err < 0) {
+        /* handle error */
+}
+
+
+

A usage sample is part of the sys_socket() system call handler:

+
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
+{
+      int retval;
+      struct socket *sock;
+      int flags;
+
+      /* Check the SOCK_* constants for consistency.  */
+      BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
+      BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
+      BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
+      BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
+
+      flags = type & ~SOCK_TYPE_MASK;
+      if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+              return -EINVAL;
+      type &= SOCK_TYPE_MASK;
+
+      if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
+              flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+
+      retval = sock_create(family, type, protocol, &sock);
+      if (retval < 0)
+              return retval;
+
+      return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
+}
+
+
+
+
+
Closing
+

Close connection (for sockets using connection) and release associated +resources:

+
+
    +
  • void sock_release(struct socket *sock) calls the release function in +the ops field of the socket structure:
  • +
+
+
void sock_release(struct socket *sock)
+{
+      if (sock->ops) {
+              struct module *owner = sock->ops->owner;
+
+              sock->ops->release(sock);
+              sock->ops = NULL;
+              module_put(owner);
+      }
+      //...
+}
+
+
+
+
+
Sending/receiving messages
+

The messages are sent/received using the following functions:

+
+
    +
  • int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags);
  • +
  • int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags);
  • +
  • int sock_sendmsg(struct socket *sock, struct msghdr *msg);
  • +
  • int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size);
  • +
+
+

The message sending/receiving functions will then call the sendmsg/ +recvmsg function in the ops field of the socket. Functions +containing kernel_ as a prefix are used when the socket is used in the +kernel.

+

The parameters are:

+
+
    +
  • msg, a struct msghdr structure, containing the message to be +sent/received. Among the important components of this structure are msg_name +and msg_namelen, which, for UDP sockets, must be filled in with the address +to which the message is sent (struct sockaddr_in);
  • +
  • vec, a struct kvec structure, containing a pointer to the buffer +containing its data and size; as can be seen, it has a similar structure to the +struct iovec structure (the struct iovec structure +corresponds to the user space data, and the struct kvec structure +corresponds to kernel space data).
  • +
+
+

A usage example can be seen in the sys_sendto() system call handler:

+
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
+              unsigned int, flags, struct sockaddr __user *, addr,
+              int, addr_len)
+{
+      struct socket *sock;
+      struct sockaddr_storage address;
+      int err;
+      struct msghdr msg;
+      struct iovec iov;
+      int fput_needed;
+
+      err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter);
+      if (unlikely(err))
+              return err;
+      sock = sockfd_lookup_light(fd, &err, &fput_needed);
+      if (!sock)
+              goto out;
+
+      msg.msg_name = NULL;
+      msg.msg_control = NULL;
+      msg.msg_controllen = 0;
+      msg.msg_namelen = 0;
+      if (addr) {
+              err = move_addr_to_kernel(addr, addr_len, &address);
+              if (err < 0)
+                      goto out_put;
+              msg.msg_name = (struct sockaddr *)&address;
+              msg.msg_namelen = addr_len;
+      }
+      if (sock->file->f_flags & O_NONBLOCK)
+              flags |= MSG_DONTWAIT;
+      msg.msg_flags = flags;
+      err = sock_sendmsg(sock, &msg);
+
+out_put:
+      fput_light(sock->file, fput_needed);
+out:
+      return err;
+}
+
+
+
+
+
+

The struct socket fields

+
/**
+ *  struct socket - general BSD socket
+ *  @state: socket state (%SS_CONNECTED, etc)
+ *  @type: socket type (%SOCK_STREAM, etc)
+ *  @flags: socket flags (%SOCK_NOSPACE, etc)
+ *  @ops: protocol specific socket operations
+ *  @file: File back pointer for gc
+ *  @sk: internal networking protocol agnostic socket representation
+ *  @wq: wait queue for several uses
+ */
+struct socket {
+      socket_state            state;
+
+      short                   type;
+
+      unsigned long           flags;
+
+      struct socket_wq __rcu  *wq;
+
+      struct file             *file;
+      struct sock             *sk;
+      const struct proto_ops  *ops;
+};
+
+
+

The noteworthy fields are:

+
+
    +
  • ops - the structure that stores pointers to protocol-specific functions;
  • +
  • sk - The INET socket associated with it.
  • +
+
+
+
The struct proto_ops structure
+

The struct proto_ops structure contains the implementations of the specific +operations implemented (TCP, UDP, etc.); these functions will be called from +generic functions through struct socket (sock_release(), +sock_sendmsg(), etc.)

+

The struct proto_ops structure therefore contains a number of function +pointers for specific protocol implementations:

+
struct proto_ops {
+      int             family;
+      struct module   *owner;
+      int             (*release)   (struct socket *sock);
+      int             (*bind)      (struct socket *sock,
+                                    struct sockaddr *myaddr,
+                                    int sockaddr_len);
+      int             (*connect)   (struct socket *sock,
+                                    struct sockaddr *vaddr,
+                                    int sockaddr_len, int flags);
+      int             (*socketpair)(struct socket *sock1,
+                                    struct socket *sock2);
+      int             (*accept)    (struct socket *sock,
+                                    struct socket *newsock, int flags, bool kern);
+      int             (*getname)   (struct socket *sock,
+                                    struct sockaddr *addr,
+                                    int peer);
+      //...
+}
+
+
+

The initialization of the ops field from struct socket is done in +the __sock_create() function, by calling the create() function, +specific to each protocol; the corresponding call in the implementation of the +__sock_create() function is:

+
//...
+      err = pf->create(net, sock, protocol, kern);
+      if (err < 0)
+              goto out_module_put;
+//...
+
+
+

This will instantiate the function pointers with calls specific to the protocol +type associated with the socket. The sock_register() and +sock_unregister() calls are used to fill the net_families vector.

+

For the rest of the socket operations (other than creating, closing, and +sending/receiving a message as described above in the Operations on the socket +structure section), the functions sent via pointers in this structure will be +called. For example, for bind, which associates a socket with an address and port on +the local machine, we will have the following code sequence:

+
#define MY_PORT 60000
+
+struct sockaddr_in addr = {
+      .sin_family = AF_INET,
+      .sin_port = htons (MY_PORT),
+      .sin_addr = { htonl (INADDR_LOOPBACK) }
+};
+
+//...
+      err = sock->ops->bind (sock, (struct sockaddr *) &addr, sizeof(addr));
+      if (err < 0) {
+              /* handle error */
+      }
+//...
+
+
+

As you can see, for transmitting the address and port information that +will be associated with the socket, a struct sockaddr_in is filled.

+
+
+
+
+

The struct sock structure

+

The struct sock describes an INET socket. Such a structure is +associated with a user space socket and implicitly with a struct +socket structure. The structure is used to store information about the status +of a connection. The structure's fields and associated operations usually begin +with the sk_ string. Some fields are listed below:

+
struct sock {
+      //...
+      unsigned int            sk_padding : 1,
+                              sk_no_check_tx : 1,
+                              sk_no_check_rx : 1,
+                              sk_userlocks : 4,
+                              sk_protocol  : 8,
+                              sk_type      : 16;
+      //...
+      struct socket           *sk_socket;
+      //...
+      struct sk_buff          *sk_send_head;
+      //...
+      void                    (*sk_state_change)(struct sock *sk);
+      void                    (*sk_data_ready)(struct sock *sk);
+      void                    (*sk_write_space)(struct sock *sk);
+      void                    (*sk_error_report)(struct sock *sk);
+      int                     (*sk_backlog_rcv)(struct sock *sk,
+                                                struct sk_buff *skb);
+      void                    (*sk_destruct)(struct sock *sk);
+};
+
+
+

+
+
    +
  • sk_protocol is the type of protocol used by the socket;
  • +
  • sk_type is the socket type (SOCK_STREAM, SOCK_DGRAM, etc.);
  • +
  • sk_socket is the BSD socket that holds it;
  • +
  • sk_send_head is the list of struct sk_buff structures for +transmission;
  • +
  • the function pointers at the end are callbacks for different situations.
  • +
+
+

Initializing the struct sock and attaching it to a BSD socket is done +using the create callback registered in net_families (invoked from +__sock_create()). Here's how to initialize the struct sock +structure for the IP protocol, in the inet_create() function:

+
/*
+ *    Create an inet socket.
+ */
+
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+                     int kern)
+{
+
+      struct sock *sk;
+
+      //...
+      err = -ENOBUFS;
+      sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
+      if (!sk)
+              goto out;
+
+      err = 0;
+      if (INET_PROTOSW_REUSE & answer_flags)
+              sk->sk_reuse = SK_CAN_REUSE;
+
+
+      //...
+      sock_init_data(sock, sk);
+
+      sk->sk_destruct    = inet_sock_destruct;
+      sk->sk_protocol    = protocol;
+      sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+      //...
+}
+
+
+
+
+

The struct sk_buff structure

+

The struct sk_buff (socket buffer) describes a network packet. The +structure fields contain information about both the header and packet contents, +the protocols used, the network device used, and pointers to the other +struct sk_buff. A summary description of the content of the structure +is presented below:

+
struct sk_buff {
+      union {
+              struct {
+                      /* These two members must be first. */
+                      struct sk_buff          *next;
+                      struct sk_buff          *prev;
+
+                      union {
+                              struct net_device       *dev;
+                              /* Some protocols might use this space to store information,
+                               * while device pointer would be NULL.
+                               * UDP receive path is one user.
+                               */
+                              unsigned long           dev_scratch;
+                      };
+              };
+
+              struct rb_node  rbnode; /* used in netem & tcp stack */
+      };
+      struct sock             *sk;
+
+        union {
+              ktime_t         tstamp;
+              u64             skb_mstamp;
+      };
+
+      /*
+       * This is the control buffer. It is free to use for every
+       * layer. Please put your private variables there. If you
+       * want to keep them across layers you have to do a skb_clone()
+       * first. This is owned by whoever has the skb queued ATM.
+       */
+      char                    cb[48] __aligned(8);
+
+      union {
+              struct {
+                      unsigned long   _skb_refdst;
+                      void            (*destructor)(struct sk_buff *skb);
+              };
+              struct list_head        tcp_tsorted_anchor;
+      };
+      /* ... */
+
+      unsigned int            len,
+                              data_len;
+      __u16                   mac_len,
+                              hdr_len;
+
+         /* ... */
+
+      __be16                  protocol;
+      __u16                   transport_header;
+      __u16                   network_header;
+      __u16                   mac_header;
+
+      /* private: */
+      __u32                   headers_end[0];
+      /* public: */
+
+      /* These elements must be at the end, see alloc_skb() for details.  */
+      sk_buff_data_t          tail;
+      sk_buff_data_t          end;
+      unsigned char           *head,
+                              *data;
+      unsigned int            truesize;
+      refcount_t              users;
+};
+
+
+

where:

+
+
    +
  • next and prev are pointers to the next and previous elements in the +buffer list;
  • +
  • dev is the device which sends or receives the buffer;
  • +
  • sk is the socket associated with the buffer;
  • +
  • destructor is the callback that deallocates the buffer;
  • +
  • transport_header, network_header, and mac_header are offsets +between the beginning of the packet and the beginning of the various headers +in the packets. They are internally maintained by the various processing +layers through which the packet passes. To get pointers to the headers, use +one of the following functions: tcp_hdr(), udp_hdr(), +ip_hdr(), etc. In principle, each protocol provides a function to +get a reference to the header of that protocol within a received packet. +Keep in mind that the network_header field is not set until the packet +reaches the network layer and the transport_header field is not set +until the packet reaches the transport layer.
  • +
+
+

The structure of an IP header +(struct iphdr) has the following fields:

+
struct iphdr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+      __u8    ihl:4,
+              version:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+      __u8    version:4,
+              ihl:4;
+#else
+#error        "Please fix <asm/byteorder.h>"
+#endif
+      __u8    tos;
+      __be16  tot_len;
+      __be16  id;
+      __be16  frag_off;
+      __u8    ttl;
+      __u8    protocol;
+      __sum16 check;
+      __be32  saddr;
+      __be32  daddr;
+      /*The options start here. */
+};
+
+
+

where:

+
+
    +
  • protocol is the transport layer protocol used;
  • +
  • saddr is the source IP address;
  • +
  • daddr is the destination IP address.
  • +
+
+

The structure of a TCP header +(struct tcphdr) has the following fields:

+
struct tcphdr {
+      __be16  source;
+      __be16  dest;
+      __be32  seq;
+      __be32  ack_seq;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+      __u16   res1:4,
+              doff:4,
+              fin:1,
+              syn:1,
+              rst:1,
+              psh:1,
+              ack:1,
+              urg:1,
+              ece:1,
+              cwr:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+      __u16   doff:4,
+              res1:4,
+              cwr:1,
+              ece:1,
+              urg:1,
+              ack:1,
+              psh:1,
+              rst:1,
+              syn:1,
+              fin:1;
+#else
+#error        "Adjust your <asm/byteorder.h> defines"
+#endif
+      __be16  window;
+      __sum16 check;
+      __be16  urg_ptr;
+};
+
+
+

where:

+
+
    +
  • source is the source port;
  • +
  • dest is the destination port;
  • +
  • syn, ack, fin are the TCP flags used; for a more detailed view, +see this diagram.
  • +
+
+

The structure of a UDP header +(struct udphdr) has the following fields:

+
struct udphdr {
+      __be16  source;
+      __be16  dest;
+      __be16  len;
+      __sum16 check;
+};
+
+
+

where:

+
+
    +
  • source is the source port;
  • +
  • dest is the destination port.
  • +
+
+

An example of accessing the information present in the headers of a network +packet is as follows:

+
struct sk_buff *skb;
+
+struct iphdr *iph = ip_hdr(skb);                 /* IP header */
+/* iph->saddr  - source IP address */
+/* iph->daddr  - destination IP address */
+if (iph->protocol == IPPROTO_TCP) {              /* TCP protocol */
+        struct tcphdr *tcph = tcp_hdr(skb);      /* TCP header */
+        /* tcph->source  - source TCP port */
+        /* tcph->dest    - destination TCP port */
+} else if (iph->protocol == IPPROTO_UDP) {       /* UDP protocol */
+        struct udphdr *udph = udp_hdr(skb);      /* UDP header */
+        /* udph->source  - source UDP port */
+        /* udph->dest    - destination UDP port */
+}
+
+
+
+
+
+

Conversions

+

In different systems, there are several ways of ordering bytes in a word +(Endianness), including: Big +Endian (the most +significant byte first) and Little +Endian (the least +significant byte first). Since a network interconnects systems with different +platforms, the Internet has imposed a standard sequence for the storage of +numerical data, called network byte-order. In +contrast, the byte sequence for the representation of numerical data on the host +computer is called host byte-order. Data received/sent from/to the network is in +the network byte-order format and should be converted between this format and +the host byte-order.

+

For converting we use the following macros:

+
+
    +
  • u16 htons(u16 x) converts a 16 bit integer from host byte-order to +network byte-order (host to network short);
  • +
  • u32 htonl(u32 x) converts a 32 bit integer from host byte-order to +network byte-order (host to network long);
  • +
  • u16 ntohs(u16 x) converts a 16 bit integer from network byte-order to +host byte-order (network to host short);
  • +
  • u32 ntohl(u32 x) converts a 32 bit integer from network byte-order to +host byte-order (network to host long).
  • +
+
+
+
+

netfilter

+

Netfilter is the name of the kernel interface for capturing network packets for +modifying/analyzing them (for filtering, NAT, etc.). The netfilter interface is used in user space by iptables.

+

In the Linux kernel, packet capture using netfilter is done by attaching hooks. +Hooks can be specified in different locations in the path followed by a kernel +network packet, as needed. An organization chart with the route followed by a +packet and the possible areas for a hook can be found here.

+

The header included when using netfilter is linux/netfilter.h.

+

A hook is defined through the struct nf_hook_ops structure:

+
struct nf_hook_ops {
+      /* User fills in from here down. */
+      nf_hookfn               *hook;
+      struct net_device       *dev;
+      void                    *priv;
+      u_int8_t                pf;
+      unsigned int            hooknum;
+      /* Hooks are ordered in ascending priority. */
+      int                     priority;
+};
+
+
+

where:

+
+
    +
  • pf is the protocol family (PF_INET, etc.);
  • +
  • +
    priority is the priority; priorities are defined in
    +
    uapi/linux/netfilter_ipv4.h as follows:
    +
    +
  • +
+
+
enum nf_ip_hook_priorities {
+      NF_IP_PRI_FIRST = INT_MIN,
+      NF_IP_PRI_CONNTRACK_DEFRAG = -400,
+      NF_IP_PRI_RAW = -300,
+      NF_IP_PRI_SELINUX_FIRST = -225,
+      NF_IP_PRI_CONNTRACK = -200,
+      NF_IP_PRI_MANGLE = -150,
+      NF_IP_PRI_NAT_DST = -100,
+      NF_IP_PRI_FILTER = 0,
+      NF_IP_PRI_SECURITY = 50,
+      NF_IP_PRI_NAT_SRC = 100,
+      NF_IP_PRI_SELINUX_LAST = 225,
+      NF_IP_PRI_CONNTRACK_HELPER = 300,
+      NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
+      NF_IP_PRI_LAST = INT_MAX,
+};
+
+
+

+
+
    +
  • dev is the device (network interface) on which the capture is +intended;
  • +
  • hooknum is the type of hook used. When a packet is captured, the +processing mode is defined by the hooknum and hook fields. For IP, +hook types are defined in linux/netfilter.h:
  • +
+
+
enum nf_inet_hooks {
+      NF_INET_PRE_ROUTING,
+      NF_INET_LOCAL_IN,
+      NF_INET_FORWARD,
+      NF_INET_LOCAL_OUT,
+      NF_INET_POST_ROUTING,
+      NF_INET_NUMHOOKS
+};
+
+
+

+
+
    +
  • hook is the handler called when capturing a network packet (packet sent +as a struct sk_buff structure). The priv field is private information +handed to the handler. The capture handler prototype is defined by the +nf_hookfn type:
  • +
+
+
struct nf_hook_state {
+      unsigned int hook;
+      u_int8_t pf;
+      struct net_device *in;
+      struct net_device *out;
+      struct sock *sk;
+      struct net *net;
+      int (*okfn)(struct net *, struct sock *, struct sk_buff *);
+};
+
+typedef unsigned int nf_hookfn(void *priv,
+                             struct sk_buff *skb,
+                             const struct nf_hook_state *state);
+
+
+

For the nf_hookfn() capture function, the priv parameter is the +private information with which the struct nf_hook_ops was +initialized. skb is the pointer to the captured network packet. Based on +skb information, packet filtering decisions are made. The function's +state parameter is the status information related to the packet capture, +including the hook number, the protocol family, the input interface and the +output interface. The hook number is useful for allowing the same function to +be called by several hooks.

+

A capture handler can return one of the constants NF_*:

+
/* Responses from hook functions. */
+#define NF_DROP 0
+#define NF_ACCEPT 1
+#define NF_STOLEN 2
+#define NF_QUEUE 3
+#define NF_REPEAT 4
+#define NF_STOP 5
+#define NF_MAX_VERDICT NF_STOP
+
+
+

NF_DROP is used to filter (ignore) a packet, and NF_ACCEPT is used to +accept a packet and forward it.

+

Registering/unregistering a hook is done using the functions defined in +linux/netfilter.h:

+
/* Function to register/unregister hook points. */
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops);
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops);
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+                        unsigned int n);
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+                           unsigned int n);
+
+
+
+

Attention

+

Prior to version 3.11-rc2 of the Linux kernel, +there are some restrictions related to the use of header extraction functions +from a struct sk_buff structure set as a parameter in a netfilter +hook. While the IP header can be obtained each time using ip_hdr(), +the TCP and UDP headers can be obtained with tcp_hdr() and +udp_hdr() only for packets that come from inside the system rather +than the ones that are received from outside the system. In the latter case, +you must manually calculate the header offset in the packet:

+
// For TCP packets (iph->protocol == IPPROTO_TCP)
+tcph = (struct tcphdr*)((__u32*)iph + iph->ihl);
+// For UDP packets (iph->protocol == IPPROTO_UDP)
+udph = (struct udphdr*)((__u32*)iph + iph->ihl);
+
+
+

This code works in all filtering situations, so it's recommended to use it +instead of header access functions.

+
+

A usage example for a netfilter hook is shown below:

+
#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+static unsigned int my_nf_hookfn(void *priv,
+              struct sk_buff *skb,
+              const struct nf_hook_state *state)
+{
+      /* process packet */
+      //...
+
+      return NF_ACCEPT;
+}
+
+static struct nf_hook_ops my_nfho = {
+      .hook        = my_nf_hookfn,
+      .hooknum     = NF_INET_LOCAL_OUT,
+      .pf          = PF_INET,
+      .priority    = NF_IP_PRI_FIRST
+};
+
+int __init my_hook_init(void)
+{
+      return nf_register_net_hook(&init_net, &my_nfho);
+}
+
+void __exit my_hook_exit(void)
+{
+      nf_unregister_net_hook(&init_net, &my_nfho);
+}
+
+module_init(my_hook_init);
+module_exit(my_hook_exit);
+
+
+
+
+

netcat

+

When developing applications that include networking code, one of the most +used tools is netcat, also nicknamed the "Swiss-army knife for TCP/IP". It allows:

+
+
    +
  • Initiating TCP connections;
  • +
  • Waiting for a TCP connection;
  • +
  • Sending and receiving UDP packets;
  • +
  • Displaying traffic in hexdump format;
  • +
  • Running a program after establishing a connection (e.g., a shell);
  • +
  • Setting special options in sent packets.
  • +
+
+

Initiating TCP connections:

+
nc hostname port
+
+
+

Listening to a TCP port:

+
nc -l -p port
+
+
+

Sending and receiving UDP packets is done by adding the -u command line option.

+
+

Note

+

The command is nc; often netcat is an alias for this +command. There are other implementations of the netcat command, some of which +have slightly different parameters than the classic implementation. Run +man nc or nc -h to check how to use it.

+
+

For more information on netcat, check the following tutorial.

+
+ +
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is networking. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/networking/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

Important

+

You need to make sure that the netfilter support is active in kernel. It +is enabled via CONFIG_NETFILTER. To activate it, run make menuconfig in +the linux directory and check the Network packet filtering framework +(Netfilter) option in Networking support -> Networking options. If it +was not enabled, enable it (as builtin, not external module - it must be +marked with *).

+
+
+

1. Displaying packets in kernel space

+

Write a kernel module that displays the source address and port for TCP packets +that initiate an outbound connection. Start from the code in +1-2-netfilter and fill in the areas marked with TODO 1, taking into +account the comments below.

+

You will need to register a netfilter hook of type NF_INET_LOCAL_OUT as explained +in the netfilter section.

+

The struct sk_buff structure lets you access the packet headers using +specific functions. The ip_hdr() function returns the IP header as a +pointer to a struct iphdr structure. The tcp_hdr() function +returns the TCP header as a pointer to a struct tcphdr structure.

+

The diagram explains how to make a TCP connection. The connection initiation +packet has the SYN flag set in the TCP header and the ACK flag cleared.

+
+

Note

+

To display the source IP address, use the %pI4 format of the printk +function. Details can be found in the kernel documentation (IPv4 +addresses section). The following is an example code snippet that uses +%pI4:

+
printk("IP address is %pI4\n", &iph->saddr);
+
+
+

When using the %pI4 format, the argument to printk is a pointer. Hence the +construction &iph->saddr (with operator & - ampersand) instead of +iph->saddr.

+
+

The source TCP port is, in the TCP header, in the network byte-order format. +Read through the Conversions section. Use ntohs() to convert.

+

For testing, use the 1-2-netfilter/user/test-1.sh file. The test creates +a connection to the localhost, a connection that will be intercepted and +displayed by the kernel module. The script is copied on the virtual machine by +the make copy command only if it is marked as executable. The script +uses the statically compiled netcat tool stored in +skels/networking/netcat; this program must have execution +permissions.

+

After running the checker the output should be similar to the one below:

+
# ./test-1.sh
+[  229.783512] TCP connection initiated from 127.0.0.1:44716
+Should show up in filter.
+Check dmesg output.
+
+
+
+
+

2. Filtering by destination address

+

Extend the module from exercise 1 so that you can specify a destination address +by means of a MY_IOCTL_FILTER_ADDRESS ioctl call. You'll only show packets +containing the specified destination address. To solve this task, fill in the +areas marked with TODO 2 and follow the specifications below.

+

To implement the ioctl routine, you must fill out the my_ioctl function. +Review the section in ioctl. The address sent from user space is in +network byte-order, so there will be NO need for conversion.

+
+

Note

+

The IP address sent via ioctl is sent by address, not by value. The +address must be stored in the ioctl_set_addr variable. For copying use +copy_from_user().

+
+

To compare the addresses, fill out the test_daddr function. Addresses in +network byte-order will be used without having to convert addresses (if they +are equal from left to right they will be equal if reversed too).

+

The test_daddr function must be called from the netfilter hook to display +the connection initialization packets for which the destination address is the +one sent through the ioctl routine. The connection initiation packet has the +SYN flag set in the TCP header and the ACK flag cleared. You have to +check two things:

+
+
    +
  • the TCP flags;
  • +
  • the destination address of the packet (using test_daddr).
  • +
+
+

For testing, use the 1-2-netfilter/user/test-2.sh script. This script +needs to compile the 1-2-netfilter/user/test.c file in the test +executable. Compilation is done automatically on the physical system when +running the make build command. The test script is copied to the +virtual machine only if it is marked as executable. The script uses the +statically compiled netcat tool in skels/networking/netcat; +this executable must have execution permissions.

+

After running the checker the output should be similar to the one below:

+
# ./test-2.sh
+[  797.673535] TCP connection initiated from 127.0.0.1:44721
+Should show up in filter.
+Should NOT show up in filter.
+Check dmesg output.
+
+
+

The test asks for packet filtering first for the 127.0.0.1 IP address and +then for the 127.0.0.2 IP address. The first connection initiation packet +(to 127.0.0.1) is intercepted and displayed by the filter, while the second +(to 127.0.0.2) is not intercepted.

+
+
+

3. Listening on a TCP socket

+

Write a kernel module that creates a TCP socket that listens to connections on +port 60000 on the loopback interface (in init_module). Start from the +code in 3-4-tcp-sock and fill in the areas marked with TODO 1, taking +into account the observations below.

+

Read the Operations on the socket structure and The struct proto_ops +structure sections.

+

The sock socket is a server socket and must be put in the listening +state. That is, the bind and listen operations must be applied to the +socket. For the bind and listen equivalent, in kernel space you will +need to call sock->ops->...; examples of such functions you can call are +sock->ops->bind, sock->ops->listen etc.

+
+

Note

+

To see how to call the sock->ops->bind and sock->ops->listen functions, look at +how they are invoked in the sys_bind() and sys_listen() system +call handlers.

+

Look for the system call handlers in the net/socket.c file in the Linux +kernel source code tree.

+
+
+

Note

+

For the second argument of the listen (backlog) call, use the +LISTEN_BACKLOG.

+
+

Remember to release the socket in the module's exit function and in the area +marked with error labels; use sock_release().

+

For testing, run the 3-4-tcp_sock/test-3.sh script. The script is +copied on the virtual machine by make copy only if it is marked as +executable.

+

After running the test, a TCP socket listening for +connections on port 60000 will be displayed.

+
+
+

4. Accepting connections in kernel space

+

Expand the module from the previous exercise to allow an external connection (no +need to send any message, only accept new connections). Fill in the areas marked +with TODO 2.

+

Read the Operations on the socket structure and The struct proto_ops +structure sections.

+

For the kernel space accept equivalent, see the system call handler for +sys_accept4(). Follow the lnet_sock_accept +implementation, and how the sock->ops->accept call is used. Use 0 as +the value for the second to last argument (flags), and true for the +last argument (kern).

+
+

Note

+

Look for the system call handlers in the net/socket.c file in the Linux +kernel source code tree.

+
+
+

Note

+

The new socket (new_sock) must be created with the +sock_create_lite() function and then its operations must be configured +using

+
newsock->ops = sock->ops;
+
+
+
+

Print the address and port of the destination socket. To find the peer name of a +socket (its address), refer to the sys_getpeername() system call handler.

+
+

Note

+

The first argument for the sock->ops->getname function will be the +connection socket, i.e. new_sock, the one initialized by the accept +call.

+

The last argument of the sock->ops->getname function will be 1, +meaning that we want to know about the endpoint or the peer (remote end or +peer).

+

Display the peer address (indicated by the raddr variable) using the +print_sock_address macro defined in the file.

+
+

Release the newly created socket (after accepting the connection) in the module +exit function and after the error label. After adding the accept code to the +module initialization function, the insmod operation will block until +a connection is established. You can unblock it using netcat on that +port. Consequently, the test script from the previous exercise will not work.

+

For testing, run the 3-4-tcp_sock/test-4.sh script. The script is copied on +the virtual machine by make copy only if it is marked as executable.

+

Nothing special will be displayed (in the kernel buffer). The success of the +test will be defined by the connection establishment. Then use Ctrl+c to +stop the test script, and then you can remove the kernel module.

+
+
+

5. UDP socket sender

+

Write a kernel module that creates a UDP socket and sends the message from the +MY_TEST_MESSAGE macro on the socket to the loopback address on port +60001.

+

Start from the code in 5-udp-sock.

+

Read the Operations on the socket structure and The struct proto_ops +structure sections.

+

To see how to send messages in the kernel space, see the sys_send() +system call handler or Sending/receiving messages.

+
+

Hint

+

The msg_name field of the struct msghdr structure must be +initialized to the destination address (pointer to struct sockaddr) +and the msg_namelen field to the address size.

+

Initialize the msg_flags field of the struct msghdr structure +to 0.

+

Initialize the msg_control and msg_controllen fields of the +struct msghdr structure to NULL and 0 respectively.

+
+

For sending the message use kernel_sendmsg().

+

The message transmission parameters are retrieved from the kernel space. Cast +the struct iovec structure pointer to a struct kvec pointer +in the kernel_sendmsg() call.

+
+

Hint

+

The last two parameters of kernel_sendmsg() are 1 (number of I/O +vectors) and len (message size).

+
+

For testing, use the test-5.sh file. The script is copied on the virtual +machine by the make copy command only if it is marked as executable. +The script uses the statically compiled netcat tool stored in +skels/networking/netcat; this executable must have execution +permissions.

+

For a correct implementation, running the test-5.sh script will cause +the kernelsocket message to be displayed like in the output below:

+
/root # ./test-5.sh
++ pid=1059
++ sleep 1
++ nc -l -u -p 60001
++ insmod udp_sock.ko
+kernelsocket
++ rmmod udp_sock
++ kill 1059
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/address-space-slides.html b/refs/pull/405/merge/lectures/address-space-slides.html new file mode 100644 index 00000000..fef6d4b7 --- /dev/null +++ b/refs/pull/405/merge/lectures/address-space-slides.html @@ -0,0 +1,709 @@ + + + + + + + + Address Space — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Address Space

+ +
    +
  • x86 MMU
      +
    • Segmentation
    • +
    • Paging
    • +
    • TLB
    • +
    +
  • +
  • Linux Address Space
      +
    • User
    • +
    • Kernel
    • +
    • High memory
    • +
    +
  • +
+ + + + +
+
+ +

x86 MMU

+ +

 

+../_images/ditaa-f3703e3f627a948c59f6f960518d5f68eb7becec.png + + + + +
+
+ +

Selectors

+ +

 

+../_images/ditaa-d6845a04f0ec792beec598d2a9f4c5b92c65529e.png +
    +
  • Selectors: CS, DS, SS, ES, FS, GS
  • +
  • Index: indexes the segment descriptor table
  • +
  • TI: selects either the GDT or LDT
  • +
  • RPL: for CS only indicates the running (current) privilege level
  • +
  • GDTR and LDTR registers point to the base of the GDT and LDT
  • +
+ + + + +
+
+ +

Segment descriptor

+ +

 

+../_images/ditaa-5cd4a8fa1ad97cff4bb1f64da13ce9ebfcfc4562.png +
    +
  • Base: linear address for the start of the segment
  • +
  • Limit: size of the segment
  • +
  • G: granularity bit: if set the size is in 4K pages, otherwise in bytes
  • +
  • B/D: data/code
  • +
  • Type: code segment, data/stack, TSS, LDT, GDT
  • +
  • Protection: the minimum privilege level required to access the +segment (RPL is checked against DPL)
  • +
+ + + + +
+
+ +

Segmentation in Linux

+ +
/*
+ * The layout of the per-CPU GDT under Linux:
+ *
+ *   0 - null                                                             <=== cacheline #1
+ *   1 - reserved
+ *   2 - reserved
+ *   3 - reserved
+ *
+ *   4 - unused                                                           <=== cacheline #2
+ *   5 - unused
+ *
+ *  ------- start of TLS (Thread-Local Storage) segments:
+ *
+ *   6 - TLS segment #1                   [ glibc's TLS segment ]
+ *   7 - TLS segment #2                   [ Wine's %fs Win32 segment ]
+ *   8 - TLS segment #3                                                   <=== cacheline #3
+ *   9 - reserved
+ *  10 - reserved
+ *  11 - reserved
+ *
+ *  ------- start of kernel segments:
+ *
+ *  12 - kernel code segment                                              <=== cacheline #4
+ *  13 - kernel data segment
+ *  14 - default user CS
+ *  15 - default user DS
+ *  16 - TSS                                                              <=== cacheline #5
+ *  17 - LDT
+ *  18 - PNPBIOS support (16->32 gate)
+ *  19 - PNPBIOS support
+ *  20 - PNPBIOS support                                                  <=== cacheline #6
+ *  21 - PNPBIOS support
+ *  22 - PNPBIOS support
+ *  23 - APM BIOS support
+ *  24 - APM BIOS support                                                 <=== cacheline #7
+ *  25 - APM BIOS support
+ *
+ *  26 - ESPFIX small SS
+ *  27 - per-cpu                  [ offset to per-cpu data area ]
+ *  28 - stack_canary-20          [ for stack protector ]                 <=== cacheline #8
+ *  29 - unused
+ *  30 - unused
+ *  31 - TSS for double fault handler
+ */
+
+ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+ #ifdef CONFIG_X86_64
+         /*
+          * We need valid kernel segments for data and code in long mode too
+          * IRET will check the segment types  kkeil 2000/10/28
+          * Also sysret mandates a special GDT layout
+          *
+          * TLS descriptors are currently at a different place compared to i386.
+          * Hopefully nobody expects them at a fixed place (Wine?)
+          */
+         [GDT_ENTRY_KERNEL32_CS]         = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER32_CS]   = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ #else
+         [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+         /*
+          * Segments used for calling PnP BIOS have byte granularity.
+          * They code segments and data segments have fixed 64k limits,
+          * the transfer segment sizes are set at run time.
+          */
+         /* 32-bit code */
+         [GDT_ENTRY_PNPBIOS_CS32]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+         /* 16-bit code */
+         [GDT_ENTRY_PNPBIOS_CS16]        = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_DS]          = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_TS1]         = GDT_ENTRY_INIT(0x0092, 0, 0),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_TS2]         = GDT_ENTRY_INIT(0x0092, 0, 0),
+         /*
+          * The APM segments have byte granularity and their bases
+          * are set at run time.  All have 64k limits.
+          */
+         /* 32-bit code */
+         [GDT_ENTRY_APMBIOS_BASE]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+         /* 16-bit code */
+         [GDT_ENTRY_APMBIOS_BASE+1]      = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+         /* data */
+         [GDT_ENTRY_APMBIOS_BASE+2]      = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
+
+         [GDT_ENTRY_ESPFIX_SS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         [GDT_ENTRY_PERCPU]              = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         GDT_STACK_CANARY_INIT
+ #endif
+ } };
+ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+
+ + + + +
+
+ +

Inspecting selectors and segments

+ +

 

+ + + + +
+
+ +

Regular paging

+ +

 

+../_images/ditaa-def299abebe530d760a6c8f16c791bbb016f9238.png + + + + +
+
+ +

Extended paging

+ +../_images/ditaa-709c2e7a68bfcdcfe9c1938d6ef2a0c9b5627931.png + + + + +
+
+ +

Page tables

+ +
    +
  • Both page directory and page table have 1024 entries
  • +
  • Each entry has 4 bytes
  • +
  • The special CR3 register points to the base of the page directory
  • +
  • Page directory entries point to the base of the page table
  • +
  • All tables are stored in memory
  • +
  • All table addresses are physical addresses
  • +
+ + + + +
+
+ +

Page table entry fields

+ +
    +
  • Present/Absent
  • +
  • PFN (Page Frame Number): the 20 most significant bits of the physical address
  • +
  • Accessed - not updated by hardware (can be used by OS for housekeeping)
  • +
  • Dirty - not updated by hardware (can be used by OS for housekeeping)
  • +
  • Access rights: Read/Write
  • +
  • Privilege: User/Supervisor
  • +
  • Page size - only for page directory; if set extended paging is used
  • +
  • PCD (page cache disable), PWT (page write through)
  • +
+ + + + +
+
+ +

Linux paging

+ +../_images/ditaa-5e4d73e3fcb24db9d1f8c16daddf98694c063fe6.png + + + + +
+
+ +

Linux APIs for page table handling

+ +
struct * page;
+pgd_t pgd;
+pmd_t pmd;
+pud_t pud;
+pte_t pte;
+void *laddr, *paddr;
+
+pgd = pgd_offset(mm, vaddr);
+pud = pud_offet(pgd, vaddr);
+pmd = pmd_offset(pud, vaddr);
+pte = pte_offset(pmd, vaddr);
+page = pte_page(pte);
+laddr = page_address(page);
+paddr = virt_to_phys(laddr);
+
+
+ + + + +
+
+ +

What about platforms with fewer than 4 levels of pagination?

+ +
static inline pud_t * pud_offset(pgd_t * pgd,unsigned long address)
+{
+    return (pud_t *)pgd;
+}
+
+static inline pmd_t * pmd_offset(pud_t * pud,unsigned long address)
+{
+    return (pmd_t *)pud;
+}
+
+
+ + + + +
+
+ +

Translation Look-aside Buffer

+ +
    +
  • Caches paging information (PFN, rights, privilege)
  • +
  • Content Addressable Memory / Associative Memory
      +
    • Very small (64-128)
    • +
    • Very fast (single cycle due to parallel search implementation)
    • +
    +
  • +
  • CPUs usually have two TLBs: i-TLB (code) and d-TLB (data)
  • +
  • TLB miss penalty: up to hundreds of cycles
  • +
+ + + + +
+
+ +

TLB invalidation

+ +

Single address invalidation:

+
mov $addr, %eax
+invlpg %(eax)
+
+
+

Full invalidation:

+
mov %cr3, %eax
+mov %eax, %cr3
+
+
+ + + + +
+
+ +

Address space options for 32bit systems

+ +

 

+../_images/ditaa-d5d1129b0298a2ea5f116c9d4b246eb1b888db6b.png + + + + +
+
+ +

Advantages and disadvantages

+ +
    +
  • Disadvantages for dedicated kernel space:
      +
    • Fully invalidating the TLB for every system call
    • +
    +
  • +
  • Disadvantages for shared address space
      +
    • Less address space for both kernel and user processes
    • +
    +
  • +
+ + + + +
+
+ +

Linux address space for 32bit systems

+ +

 

+../_images/ditaa-3985c420def8f30934a72ea8c738a00ed629c298.png + + + + +
+
+ +

Virtual to physical address translations for I/O transfers

+ +
    +
  • Use the virtual address of a kernel buffer in order to copy +data from user space
  • +
  • Walk the page tables to transform the kernel buffer virtual +address to a physical address
  • +
  • Use the physical address of the kernel buffer to start a DMA +transfer
  • +
+ + + + +
+
+ +

Linear mappings

+ +
    +
  • Virtual to physical address space translation is reduced to one +operation (instead of walking the page tables)
  • +
  • Less memory is used to create the page tables
  • +
  • Fewer TLB entries are used for the kernel memory
  • +
+ + + + +
+
+ +

Highmem

+ +

 

+../_images/ditaa-bb8455a43088bf800eece11869f6ff857574605d.png + + + + +
+
+ +

Multi-page permanent mappings

+ +
void* vmalloc(unsigned long size);
+void vfree(void * addr);
+
+void *ioremap(unsigned long offset, unsigned size);
+void iounmap(void * addr);
+
+
+ + + + +
+
+ +

Fixed-mapped linear addresses

+ +
    +
  • Reserved virtual addresses (constants)
  • +
  • Mapped to physical addresses during boot
  • +
+
set_fixmap(idx, phys_addr)
+set_fixmap_nocache(idx, phys_addr)
+
+
+ + + + +
+
+ +

Fixed-mapped linear addresses

+ +
/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * for x86_32: We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+
+enum fixed_addresses {
+#ifdef CONFIG_X86_32
+    FIX_HOLE,
+#else
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+    VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
+#endif
+#endif
+    FIX_DBGP_BASE,
+    FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+    FIX_OHCI1394_BASE,
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+    FIX_APIC_BASE,        /* local (CPU) APIC) -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+    FIX_IO_APIC_BASE_0,
+    FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#ifdef CONFIG_X86_32
+    FIX_KMAP_BEGIN,       /* reserved pte's for temporary kernel mappings */
+    FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+    FIX_PCIE_MCFG,
+#endif
+
+
+ + + + +
+
+ +

Conversion between virtual addresses and fixed address indexes

+ +
#define __fix_to_virt(x)  (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x)  ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+#ifndef __ASSEMBLY__
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-deference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+ static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+ {
+     BUILD_BUG_ON(idx >= __end_of_fixed_addresses);
+     return __fix_to_virt(idx);
+ }
+
+ static inline unsigned long virt_to_fix(const unsigned long vaddr)
+ {
+     BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+     return __virt_to_fix(vaddr);
+ }
+
+
+ inline long fix_to_virt(const unsigned int idx)
+ {
+     if (idx >= __end_of_fixed_addresses)
+         __this_fixmap_does_not_exist();
+     return (0xffffe000UL - (idx << PAGE_SHIFT));
+ }
+
+
+ + + + +
+
+ +

Temporary mappings

+ +
    +
  • kmap_atomic(), kunmap_atomic()
  • +
  • No context switch is permitted in atomic kmap section
  • +
  • Can be used in interrupt context
  • +
  • No locking required
  • +
  • Only invalidates one TLB entry
  • +
+ + + + +
+
+ +

Temporary mappings implementation

+ +
#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot)
+
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
+{
+  unsigned long vaddr;
+  int idx, type;
+
+  type = kmap_atomic_idx_push();
+  idx = type + KM_TYPE_NR*smp_processor_id();
+  vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+  BUG_ON(!pte_none(*(kmap_pte-idx)));
+  set_pte(kmap_pte-idx, mk_pte(page, prot));
+  arch_flush_lazy_mmu_mode();
+
+  return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_high_prot);
+
+static inline int kmap_atomic_idx_push(void)
+{
+  int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+  WARN_ON_ONCE(in_irq() && !irqs_disabled());
+  BUG_ON(idx >= KM_TYPE_NR);
+#endif
+  return idx;
+}
+
+
+ + + + +
+
+ +

Implementation of temporary mappings

+ +
    +
  • Use the fixed-mapped linear addresses
  • +
  • Every CPU has KM_TYPE_NR reserved entries to be used for +temporary mappings
  • +
  • Stack like selection: every user picks the current entry and +increments the "stack" counter
  • +
+ + + + +
+
+ +

Permanent mappings

+ +
    +
  • kmap(), kunmap()
  • +
  • Context switches are allowed
  • +
  • Only available in process context
  • +
  • One page table is reserved for permanent mappings
  • +
  • Page counter
      +
    • 0 - page is not mapped, free and ready to use
    • +
    • 1 - page is not mapped, but may be present in the TLB and needs flushing before use
    • +
    • N - page is mapped N-1 times
    • +
    +
  • +
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/address-space.html b/refs/pull/405/merge/lectures/address-space.html new file mode 100644 index 00000000..c314c4dd --- /dev/null +++ b/refs/pull/405/merge/lectures/address-space.html @@ -0,0 +1,767 @@ + + + + + + Address Space — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Address Space

+

View slides

+
+

Lecture objectives:

+
    +
  • x86 MMU
      +
    • Segmentation
    • +
    • Paging
    • +
    • TLB
    • +
    +
  • +
  • Linux Address Space
      +
    • User
    • +
    • Kernel
    • +
    • High memory
    • +
    +
  • +
+
+
+

x86 MMU

+

The x86 MMU has a segmentation and a pagination unit. The segmentation +unit can be used to define logical memory segments defined by a +logical (virtual) start address, a base linear (mapped) address and a +size. A segment can also restrict access based on the access type +(read, execute, write) or the privilege level (we can define some +segments to be accessible only by kernel for example).

+

When the CPU makes a memory access, it will use the segmentation unit +to translate the logical address to a linear address, based on the +information in the segment descriptor.

+

If pagination is enabled the linear address will be further +transformed into a physical address, using the information from the +page tables.

+

Note that the segmentation unit can not be disabled, so if the MMU has +been enabled, segmentation will always be used.

+

 

+../_images/ditaa-f3703e3f627a948c59f6f960518d5f68eb7becec.png +
+

Selectors

+

A program can use multiple segments and in order to determine which +segment to use, special registers (named selectors) are used. The +basic selectors that are typically used are CS - "Code Selector", DS - +"Data Selector" and SS - "Stack Selector".

+

Instruction fetches will by default use CS, while data access will by +default use DS unless the stack is used (e.g. data access through the +pop and push instructions) in which case SS will be used by default.

+

Selectors have three main fields: the index, the table index and the +running privilege level:

+

 

+../_images/ditaa-d6845a04f0ec792beec598d2a9f4c5b92c65529e.png +

The index will be used to determine which entry of the descriptor +table should be used. TI is used to select either the Global +Descriptor Table (GDT) or the Local Descriptor Table (LDT). The tables +are effectively arrays that start at the location specified in the +special registers GDTR (for GDT) and LDTR (for LDT).

+
+

Note

+

LDT was designed so that applications can define their own +particular segments. Although not many applications use this +feature, Linux (and Windows) provide system calls that +allow an application to create its own segments.

+
+

RPL is only used for CS and it represents the current privilege +level. There are 4 privilege levels, the highest level being 0 (and +typically used by the kernel) and the lowest is 3 (and typically used +by user applications).

+
+
+

Segment descriptor

+

The CPU will use the index field of the selector to access an 8 byte +descriptor:

+

 

+../_images/ditaa-5cd4a8fa1ad97cff4bb1f64da13ce9ebfcfc4562.png +
    +
  • Base: linear address for the start of the segment
  • +
  • Limit: size of the segment
  • +
  • G: granularity bit: if set the size is in 4K pages, otherwise in bytes
  • +
  • B/D: data/code
  • +
  • Type: code segment, data/stack, TSS, LDT, GDT
  • +
  • Protection: the minimum privilege level required to access the +segment (RPL is checked against DPL)
  • +
+

Some of the descriptor fields should be familiar. And that is because +there is some resemblance with Interrupt Descriptors we looked at +previously.

+
+
+

Segmentation in Linux

+

In Linux, segments are not used to define the stack, code or +data. These will be setup using the paging unit as it allows better +granularity and more importantly it allows Linux to use a generic +approach that works on other architectures (that don't have +segmentation support).

+

However, because the segmentation unit can not be disabled Linux must +create 4 generic 0 - 4GB segments for: kernel code, kernel data, user +code and user data.

+

Besides these, Linux uses segments for implementing Thread Local +Storage (TLS) together with the set_thread_area system call.

+

It also uses the TSS segment in order to define the kernel stack to +use when a change of privilege (e.g. system call, interrupt while +running in user-space) occurs.

+
/*
+ * The layout of the per-CPU GDT under Linux:
+ *
+ *   0 - null                                                             <=== cacheline #1
+ *   1 - reserved
+ *   2 - reserved
+ *   3 - reserved
+ *
+ *   4 - unused                                                           <=== cacheline #2
+ *   5 - unused
+ *
+ *  ------- start of TLS (Thread-Local Storage) segments:
+ *
+ *   6 - TLS segment #1                   [ glibc's TLS segment ]
+ *   7 - TLS segment #2                   [ Wine's %fs Win32 segment ]
+ *   8 - TLS segment #3                                                   <=== cacheline #3
+ *   9 - reserved
+ *  10 - reserved
+ *  11 - reserved
+ *
+ *  ------- start of kernel segments:
+ *
+ *  12 - kernel code segment                                              <=== cacheline #4
+ *  13 - kernel data segment
+ *  14 - default user CS
+ *  15 - default user DS
+ *  16 - TSS                                                              <=== cacheline #5
+ *  17 - LDT
+ *  18 - PNPBIOS support (16->32 gate)
+ *  19 - PNPBIOS support
+ *  20 - PNPBIOS support                                                  <=== cacheline #6
+ *  21 - PNPBIOS support
+ *  22 - PNPBIOS support
+ *  23 - APM BIOS support
+ *  24 - APM BIOS support                                                 <=== cacheline #7
+ *  25 - APM BIOS support
+ *
+ *  26 - ESPFIX small SS
+ *  27 - per-cpu                  [ offset to per-cpu data area ]
+ *  28 - stack_canary-20          [ for stack protector ]                 <=== cacheline #8
+ *  29 - unused
+ *  30 - unused
+ *  31 - TSS for double fault handler
+ */
+
+ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+ #ifdef CONFIG_X86_64
+         /*
+          * We need valid kernel segments for data and code in long mode too
+          * IRET will check the segment types  kkeil 2000/10/28
+          * Also sysret mandates a special GDT layout
+          *
+          * TLS descriptors are currently at a different place compared to i386.
+          * Hopefully nobody expects them at a fixed place (Wine?)
+          */
+         [GDT_ENTRY_KERNEL32_CS]         = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER32_CS]   = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ #else
+         [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+         /*
+          * Segments used for calling PnP BIOS have byte granularity.
+          * They code segments and data segments have fixed 64k limits,
+          * the transfer segment sizes are set at run time.
+          */
+         /* 32-bit code */
+         [GDT_ENTRY_PNPBIOS_CS32]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+         /* 16-bit code */
+         [GDT_ENTRY_PNPBIOS_CS16]        = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_DS]          = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_TS1]         = GDT_ENTRY_INIT(0x0092, 0, 0),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_TS2]         = GDT_ENTRY_INIT(0x0092, 0, 0),
+         /*
+          * The APM segments have byte granularity and their bases
+          * are set at run time.  All have 64k limits.
+          */
+         /* 32-bit code */
+         [GDT_ENTRY_APMBIOS_BASE]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+         /* 16-bit code */
+         [GDT_ENTRY_APMBIOS_BASE+1]      = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+         /* data */
+         [GDT_ENTRY_APMBIOS_BASE+2]      = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
+
+         [GDT_ENTRY_ESPFIX_SS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         [GDT_ENTRY_PERCPU]              = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         GDT_STACK_CANARY_INIT
+ #endif
+ } };
+ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+
+
+
+

Inspecting selectors and segments

+

 

+
+
+

x86 Paging

+

The x86 paging unit supports two types of paging: regular and extended paging.

+

Regular paging has 2 levels and a fixed page size of 4KB. The linear +address is split in three fields:

+
    +
  • Directory (the 10 most significant bits)
  • +
  • Table (the next 10 bits)
  • +
  • Offset (the least significant 12 bits)
  • +
+

 

+../_images/ditaa-def299abebe530d760a6c8f16c791bbb016f9238.png +

When extended paging is enabled, a single level is used and pages are +4MB. The linear address is split in two fields:

+
    +
  • Directory (10 most significant bits)
  • +
  • Offset (least significant 22 bits)
  • +
+../_images/ditaa-709c2e7a68bfcdcfe9c1938d6ef2a0c9b5627931.png +
+
+

Page tables

+

We can mix regular and extended paging, the directory page has a bit +that specifies if extended or regular paging should be used. The +special CR3 register points to the base of the page directory and page +directory entries point to the base of the page table.

+

Both page directory and page table have 1024 entries and each entry +has 4 bytes.

+

All tables are stored in memory and the page table addresses are +physical addresses.

+

Page table entry fields:

+
    +
  • Present/Absent
  • +
  • PFN (Page Frame Number): the 20 most significant bits of the physical address
  • +
  • Accessed - set by hardware when the page is accessed, never cleared by hardware (can be used by OS for housekeeping)
  • +
  • Dirty - set by hardware when the page is written, never cleared by hardware (can be used by OS for housekeeping)
  • +
  • Access rights: Read/Write
  • +
  • Privilege: User/Supervisor
  • +
  • Page size - only for page directory; if set extended paging is used
  • +
  • PCD (page cache disable), PWT (page write through)
  • +
+
+
+

Linux paging

+

Linux paging uses 4 levels in order to support 64bit +architectures. The diagram below shows how the various virtual address +chunks are used to index the page tables and compute the physical +address.

+../_images/ditaa-5e4d73e3fcb24db9d1f8c16daddf98694c063fe6.png +

Linux has a common API for creating and walking page tables. Creating +and modifying address spaces for kernel and processes is done using +the same generic code which relies on macros and functions to +translate these generic operations in code that runs on different +architectures.

+

Here is an example of how we can translate a virtual address to a +physical address, using the Linux page table APIs:

+
struct page *page;
+pgd_t pgd;
+pmd_t pmd;
+pud_t pud;
+pte_t pte;
+void *laddr, *paddr;
+
+pgd = pgd_offset(mm, vaddr);
+pud = pud_offset(pgd, vaddr);
+pmd = pmd_offset(pud, vaddr);
+pte = pte_offset(pmd, vaddr);
+page = pte_page(pte);
+laddr = page_address(page);
+paddr = virt_to_phys(laddr);
+
+
+

In order to support architectures with less than 4 levels of +pagination (such as for x86 32bits) some macros and / or functions are +0 / empty:

+
static inline pud_t * pud_offset(pgd_t * pgd,unsigned long address)
+{
+    return (pud_t *)pgd;
+}
+
+static inline pmd_t * pmd_offset(pud_t * pud,unsigned long address)
+{
+    return (pmd_t *)pud;
+}
+
+
+
+
+

Translation Look-aside Buffer

+

When using virtual memory, due to the page table organization, we may +need an extra 1 (x86 extended paging), 2 (x86 regular paging) or 3 +(x86 64bit) memory access(es).

+

A special cache, called Translation Look-aside Buffer (TLB) is used to +speed up translations from virtual address to physical addresses.

+

The TLB has the following properties:

+
    +
  • Caches paging information (PFN, rights, privilege)
  • +
  • Content Addressable Memory / Associative Memory
      +
    • Very small (64-128)
    • +
    • Very fast (single cycle due to parallel search implementation)
    • +
    +
  • +
  • CPUs usually have two TLBs: i-TLB (code) and d-TLB (data)
  • +
  • TLB miss penalty: up to hundreds of cycles
  • +
+

As with other caches, we must be careful to not create consistency +issues.

+

For example, when changing the mapping of one page to point to a +different physical memory location in the page tables, we must +invalidate the associated TLB entry. Otherwise, the MMU will do the +translation to the old physical address instead of the new physical +address.

+

The x86 platform supports TLB invalidation through two types of +operations.

+

Single address invalidation:

+
mov $addr, %eax
+invlpg (%eax)
+
+
+

Full invalidation:

+
mov %cr3, %eax
+mov %eax, %cr3
+
+
+
+
+
+

Linux address space

+
+

Address space options for 32bit systems

+

There are two main options for implementing kernel and user space: +either dedicated address spaces for each, or split a shared address +space.

+

 

+../_images/ditaa-d5d1129b0298a2ea5f116c9d4b246eb1b888db6b.png +

Each has advantages and disadvantages:

+
    +
  • Disadvantages for dedicated kernel space:
      +
    • Fully invalidating the TLB for every system call
    • +
    +
  • +
  • Disadvantages for shared address space
      +
    • Less address space for both kernel and user processes
    • +
    +
  • +
+

Linux uses a split address space for 32 bit systems, although in +the past there were options for supporting a 4G/4G split or a dedicated +kernel address space (on those architectures that support it, +e.g. x86). Linux always uses a split address space for 64 bit systems.

+

An overview of the Linux address space is presented below:

+

 

+../_images/ditaa-3985c420def8f30934a72ea8c738a00ed629c298.png +
+
+

Linear mappings

+

Linear mappings refer to a particular way of mapping virtual pages to +physical pages, where virtual pages V, V + 1, ..., V + n are mapped to +physical pages P, P + 1, ..., P + n.

+

To understand the necessity of linear mappings, we should look at +common kernel operations that involve using both the virtual and +physical address of a page such as an I/O transfer:

+
    +
  • Use the virtual address of a kernel buffer in order to copy +data from user space
  • +
  • Walk the page tables to transform the kernel buffer virtual +address to a physical address
  • +
  • Use the physical address of the kernel buffer to start a DMA +transfer
  • +
+

However, if we use linear mappings and the kernel buffers are in the +linear mapping area, then:

+
    +
  • Virtual to physical address space translation is reduced to one +operation (instead of walking the page tables)
  • +
  • Less memory is used to create the page tables
  • +
  • Fewer TLB entries are used for the kernel memory
  • +
+
+
+

Highmem

+

The "highmem" part of the virtual address space is used to create +arbitrary mappings (as opposed to linear mappings in lowmem). On 32bit +systems the highmem area is absolutely required in order to access +physical memory outside of lowmem. However, highmem is also used on +64bit systems but the use-case there is mainly to allow arbitrary +mappings in kernel space.

+

 

+../_images/ditaa-bb8455a43088bf800eece11869f6ff857574605d.png +

There are multiple types of mappings in the highmem area:

+
    +
  • Multi-page permanent mappings (vmalloc, ioremap)
  • +
  • Temporary 1 page mappings (kmap_atomic)
  • +
  • Permanent 1 page mappings (kmap, fix-mapped linear addresses)
  • +
+

Multiple page mappings allow mapping of ranges of physical memory +into the highmem area. Each such mapping is guarded by a +non-accessible page to catch buffer overflow and underflow errors.

+

The APIs that map multiple pages into highmem are:

+
void* vmalloc(unsigned long size);
+void vfree(void * addr);
+
+void *ioremap(unsigned long offset, unsigned size);
+void iounmap(void * addr);
+
+
+

vmalloc() is used to allocate non-contiguous system memory +pages as a contiguous segment in the kernel virtual address space. It +is useful when allocating large buffers because due to fragmentation +it is unlikely to find free large chunks of physically contiguous memory.

+

ioremap() is used to map device memory or device registers +into the kernel address space. It maps a contiguous physical memory +range into highmem with page caching disabled.

+
+
+

Fixed-mapped linear addresses

+

Fixed-mapped linear addresses are a special class of singular page +mappings that are used for accessing registers of commonly used +peripherals such as the APIC or IO APIC.

+

Typical I/O access for peripherals is to use a base (the kernel +virtual address space where the peripheral registers are mapped) + +offsets for various registers.

+

In order to optimize access, the base is reserved at compile time +(e.g. 0xFFFFF000). Since the base is constant, the various register +accesses of the form base + register offset will also be constant +and thus the compiler will avoid generating an extra instruction.

+

In summary, fixed-mapped linear addresses are:

+
    +
  • Reserved virtual addresses (constants)
  • +
  • Mapped to physical addresses during boot
  • +
+
set_fixmap(idx, phys_addr)
+set_fixmap_nocache(idx, phys_addr)
+
+
+

These addresses are architecture defined and, as an example, this is +the map for x86:

+
/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * for x86_32: We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+
+enum fixed_addresses {
+#ifdef CONFIG_X86_32
+    FIX_HOLE,
+#else
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+    VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
+#endif
+#endif
+    FIX_DBGP_BASE,
+    FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+    FIX_OHCI1394_BASE,
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+    FIX_APIC_BASE,        /* local (CPU) APIC) -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+    FIX_IO_APIC_BASE_0,
+    FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#ifdef CONFIG_X86_32
+    FIX_KMAP_BEGIN,       /* reserved pte's for temporary kernel mappings */
+    FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+    FIX_PCIE_MCFG,
+#endif
+
+
+

Notice how easy it is to do the conversion between the virtual address +and the fixed address indexes:

+
#define __fix_to_virt(x)  (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x)  ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+#ifndef __ASSEMBLY__
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-deference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+ static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+ {
+     BUILD_BUG_ON(idx >= __end_of_fixed_addresses);
+     return __fix_to_virt(idx);
+ }
+
+ static inline unsigned long virt_to_fix(const unsigned long vaddr)
+ {
+     BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+     return __virt_to_fix(vaddr);
+ }
+
+
+ inline long fix_to_virt(const unsigned int idx)
+ {
+     if (idx >= __end_of_fixed_addresses)
+         __this_fixmap_does_not_exist();
+     return (0xffffe000UL - (idx << PAGE_SHIFT));
+ }
+
+
+
+
+

Temporary mappings

+

Temporary mappings can be used to map a single physical page, very +fast, in kernel space. It can be used in interrupt context but the +atomic kmap section, defined in between the kmap_atomic() and +kunmap_atomic() can not be preempted. That is why these are +called temporary mappings, as they can only be used momentarily.

+

Temporary mappings are very fast because there is no locking or +searching required and also there is no full TLB invalidation, just +the particular virtual page will be TLB invalidated.

+

Here are some code snippets that show how temporary mappings are +implemented:

+
#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot)
+
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
+{
+  unsigned long vaddr;
+  int idx, type;
+
+  type = kmap_atomic_idx_push();
+  idx = type + KM_TYPE_NR*smp_processor_id();
+  vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+  BUG_ON(!pte_none(*(kmap_pte-idx)));
+  set_pte(kmap_pte-idx, mk_pte(page, prot));
+  arch_flush_lazy_mmu_mode();
+
+  return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_high_prot);
+
+static inline int kmap_atomic_idx_push(void)
+{
+  int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+  WARN_ON_ONCE(in_irq() && !irqs_disabled());
+  BUG_ON(idx >= KM_TYPE_NR);
+#endif
+  return idx;
+}
+
+
+

Notice that fix-mapped linear addresses and a stack like approach are +used: each CPU has KM_TYPE_NR reserved entries which are used in a +first come first served manner. This allows using multiple temporary +mappings at once, for example one in process context, one in an +interrupt handler, and a few more in tasklets or softirqs.

+
+
+

Permanent mappings

+

Permanent mappings allow users to hold on to a mapping for long +(undefined) periods of time which means that context switches are +allowed after a mapping and before releasing it.

+

This flexibility comes with a price though. A search operation is +performed to find a free entry and they can not be used in interrupt +context - the operation that tries to find a free virtual address page +may block. There is a limited number of permanent mappings available +(typically one page is reserved for permanent mappings).

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/arch-slides.html b/refs/pull/405/merge/lectures/arch-slides.html new file mode 100644 index 00000000..93cabcde --- /dev/null +++ b/refs/pull/405/merge/lectures/arch-slides.html @@ -0,0 +1,244 @@ + + + + + + + + Architecture Layer — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Introduction

+ +
    +
  • Overview of the arch layer
  • +
  • Overview of the boot process
  • +
+ + + + +
+
+ +

Overview of the arch layer

+ +../_images/ditaa-ae895f3a8e26b92bf6c6ecbbd71e2c88912d5607.png + + + + +
+
+ +

Bootstrap

+ +
    +
  • The first kernel code that runs
  • +
  • Typically runs with the MMU disabled
  • +
  • Move / Relocate kernel code
  • +
+ + + + +
+
+ +

Bootstrap

+ +
    +
  • The first kernel code that runs
  • +
  • Typically runs with the MMU disabled
  • +
  • Copy bootloader arguments and determine kernel run location
  • +
  • Move / relocate kernel code to final location
  • +
  • Initial MMU setup - map the kernel
  • +
+ + + + +
+
+ +

Memory Setup

+ +
    +
  • Determine available memory and setup the boot memory allocator
  • +
  • Manages memory regions before the page allocator is setup
  • +
  • Bootmem - used a bitmap to track free blocks
  • +
  • Memblock - deprecates bootmem and adds support for memory ranges
      +
    • Supports both physical and virtual addresses
    • +
    • support NUMA architectures
    • +
    +
  • +
+ + + + +
+
+ +

MMU management

+ +
    +
  • Implements the generic page table manipulation APIs: types, +accessors, flags
  • +
  • Implement TLB management APIs: flush, invalidate
  • +
+ + + + +
+
+ +

Thread Management

+ +
    +
  • Defines the thread type (struct thread_info) and implements +functions for allocating threads (if needed)
  • +
  • Implement copy_thread() and switch_context()
  • +
+ + + + +
+
+ +

Timer Management

+ +
    +
  • Setup the timer tick and provide a time source
  • +
  • Mostly transitioned to platform drivers
      +
    • clock_event_device - for scheduling timers
    • +
    • clocksource - for reading the time
    • +
    +
  • +
+ + + + +
+
+ +

IRQs and exception management

+ +
    +
  • Define interrupt and exception handlers / entry points
  • +
  • Setup priorities
  • +
  • Platform drivers for interrupt controllers
  • +
+ + + + +
+
+ +

System calls

+ +
    +
  • Define system call entry point(s)
  • +
  • Implement user-space access primitives (e.g. copy_to_user)
  • +
+ + + + +
+
+ +

Platform Drivers

+ +
    +
  • Platform and architecture specific drivers
  • +
  • Bindings to platform device enumeration methods (e.g. device tree +or ACPI)
  • +
+ + + + +
+
+ +

Machine specific code

+ +
    +
  • Some architectures use a "machine" / "platform" abstraction
  • +
  • Typical for architectures used in embedded systems with a lot of +variety (e.g. ARM, PowerPC)
  • +
+ + + + +
+
+ +

Boot flow inspection

+ + + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/arch.html b/refs/pull/405/merge/lectures/arch.html new file mode 100644 index 00000000..316dfeec --- /dev/null +++ b/refs/pull/405/merge/lectures/arch.html @@ -0,0 +1,282 @@ + + + + + + Architecture Layer — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Architecture Layer

+

View slides

+
+

Lecture objectives:

+
    +
  • Overview of the arch layer
  • +
  • Overview of the boot process
  • +
+
+
+

Overview of the arch layer

+../_images/ditaa-ae895f3a8e26b92bf6c6ecbbd71e2c88912d5607.png +
+

Boot strap

+
    +
  • The first kernel code that runs
  • +
  • Typically runs with the MMU disabled
  • +
  • Move / Relocate kernel code
  • +
+
+
+

Boot strap

+
    +
  • The first kernel code that runs
  • +
  • Typically runs with the MMU disabled
  • +
  • Copy bootloader arguments and determine kernel run location
  • +
  • Move / relocate kernel code to final location
  • +
  • Initial MMU setup - map the kernel
  • +
+
+
+

Memory setup

+
    +
  • Determine available memory and setup the boot memory allocator
  • +
  • Manages memory regions before the page allocator is setup
  • +
  • Bootmem - used a bitmap to track free blocks
  • +
  • Memblock - deprecates bootmem and adds support for memory ranges
      +
    • Supports both physical and virtual addresses
    • +
    • support NUMA architectures
    • +
    +
  • +
+
+
+

MMU management

+
    +
  • Implements the generic page table manipulation APIs: types, +accessors, flags
  • +
  • Implement TLB management APIs: flush, invalidate
  • +
+
+
+

Thread Management

+
    +
  • Defines the thread type (struct thread_info) and implements +functions for allocating threads (if needed)
  • +
  • Implement copy_thread() and switch_context()
  • +
+
+
+

Time Management

+
    +
  • Setup the timer tick and provide a time source
  • +
  • Mostly transitioned to platform drivers
      +
    • clock_event_device - for scheduling timers
    • +
    • clocksource - for reading the time
    • +
    +
  • +
+
+
+

IRQs and exception management

+
    +
  • Define interrupt and exception handlers / entry points
  • +
  • Setup priorities
  • +
  • Platform drivers for interrupt controllers
  • +
+
+
+

System calls

+
    +
  • Define system call entry point(s)
  • +
  • Implement user-space access primitives (e.g. copy_to_user)
  • +
+
+
+

Platform Drivers

+
    +
  • Platform and architecture specific drivers
  • +
  • Bindings to platform device enumeration methods (e.g. device tree +or ACPI)
  • +
+
+
+

Machine specific code

+
    +
  • Some architectures use a "machine" / "platform" abstraction
  • +
  • Typical for architectures used in embedded systems with a lot of +variety (e.g. ARM, PowerPC)
  • +
+
+
+
+

Overview of the boot process

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/debugging-slides.html b/refs/pull/405/merge/lectures/debugging-slides.html new file mode 100644 index 00000000..692811f1 --- /dev/null +++ b/refs/pull/405/merge/lectures/debugging-slides.html @@ -0,0 +1,830 @@ + + + + + + + + Debugging — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Debugging

+ +
    +
  • decoding an oops/panic
  • +
  • list debugging
  • +
  • memory debugging
  • +
  • locking debugging
  • +
  • profiling
  • +
+ + + + +
+
+ +

Oops module

+ +
static noinline void do_oops(void)
+{
+    *(int*)0x42 = 'a';
+}
+
+static int so2_oops_init(void)
+{
+    pr_info("oops_init\n");
+    do_oops();
+
+    return 0;
+}
+
+static void so2_oops_exit(void)
+{
+    pr_info("oops exit\n");
+}
+
+module_init(so2_oops_init);
+module_exit(so2_oops_exit);
+
+
+ + + + +
+
+ +

Oops information

+ +
root@qemux86:~/skels/debugging/oops# insmod oops.ko
+BUG: unable to handle kernel NULL pointer dereference at 00000042
+IP: do_oops+0x8/0x10 [oops]
+*pde = 00000000
+Oops: 0002 [#1] SMP
+Modules linked in: oops(O+)
+CPU: 0 PID: 234 Comm: insmod Tainted: G           O     4.15.0+ #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+EIP: do_oops+0x8/0x10 [oops]
+CR0: 80050033 CR2: 00000042 CR3: 0785f000 CR4: 00000690
+EIP: 0x44902cc2
+EFLAGS: 00000206 CPU: 0
+EAX: ffffffda EBX: 08afb050 ECX: 0000eef4 EDX: 08afb008
+ESI: 00000000 EDI: bf914dbc EBP: 00000000 ESP: bf914c1c
+DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b
+Code: <a3> 42 00 00 00 5d c3 90 55 89 e5 83 ec 04 c7 04 24 24 70 81 c8 e8
+Killed
+
+
+ + + + +
+
+ +

Oops stacktrace

+ +
root@qemux86:~/skels/debugging/oops# insmod oops.ko
+BUG: unable to handle kernel NULL pointer dereference at 00000042
+Call Trace:
+so2_oops_init+0x17/0x20 [oops]
+do_one_initcall+0x37/0x170
+? cache_alloc_debugcheck_after.isra.19+0x15f/0x2f0
+? __might_sleep+0x32/0x90
+? trace_hardirqs_on_caller+0x11c/0x1a0
+? do_init_module+0x17/0x1c2
+? kmem_cache_alloc+0xa4/0x1e0
+? do_init_module+0x17/0x1c2
+do_init_module+0x46/0x1c2
+load_module+0x1f45/0x2380
+SyS_init_module+0xe5/0x100
+do_int80_syscall_32+0x61/0x190
+entry_INT80_32+0x2f/0x2f
+Killed
+
+
+ + + + +
+
+ +

Debugging

+ +
    +
  • CONFIG_DEBUG_INFO
  • +
  • addr2line
  • +
  • gdb
  • +
  • objdump -dSr
  • +
+ + + + +
+
+ +

addr2line

+ +
$ addr2line -e oops.o  0x08
+$ skels/debugging/oops/oops.c:5
+$ # 0x08 is the offset of the offending instruction inside the oops.ko module
+
+
+ + + + +
+
+ +

objdump

+ +
$ cat /proc/modules
+oops 20480 1 - Loading 0xc8816000 (O+)
+
+$ objdump -dS --adjust-vma=0xc8816000 oops.ko
+c8816000:       b8 61 00 00 00          mov    $0x61,%eax
+
+static noinline void do_oops(void)
+{
+c8816005:       55                      push   %ebp
+c8816006:       89 e5                   mov    %esp,%ebp
+*(int*)0x42 = 'a';
+c8816008:       a3 42 00 00 00          mov    %eax,0x42
+
+
+ + + + +
+
+ +

gdb

+ +
$ gdb ./vmlinux
+
+(gdb) list *(do_panic+0x8)
+0xc1244138 is in do_panic (lib/test_panic.c:8).
+3
+4 static struct timer_list panic_timer;
+5
+6 static void do_panic(struct timer_list *unused)
+7 {
+8         *(int*)0x42 = 'a';
+9 }
+10
+11        static int so2_panic_init(void)
+
+
+ + + + +
+
+ +

Kernel panic

+ +
static struct timer_list panic_timer;
+
+static void do_panic(struct timer_list *unused)
+{
+    *(int*)0x42 = 'a';
+}
+
+static int so2_panic_init(void)
+{
+    pr_info("panic_init\n");
+
+    timer_setup(&panic_timer,  do_panic, 0);
+    mod_timer(&panic_timer, jiffies + 2 * HZ);
+
+    return 0;
+}
+
+
+ + + + +
+
+ +

List debugging

+ +
static inline void list_del(struct list_head *entry)
+{
+        __list_del(entry->prev, entry->next);
+        entry->next = (struct list_head*)LIST_POISON1;
+        entry->prev = (struct list_head*)LIST_POISON2;
+}
+
+BUG: unable to handle kernel NULL pointer dereference at 00000100
+IP: crush+0x80/0xb0 [list]
+
+
+ + + + +
+
+ +

Memory debugging

+ +
    +
  • SLAB/SLUB debugging
  • +
  • KASAN
  • +
  • kmemcheck
  • +
  • DEBUG_PAGEALLOC
  • +
+ + + + +
+
+ +

Slab debugging

+ +
    +
  • CONFIG_DEBUG_SLAB
  • +
  • poison-based memory debuggers
  • +
+../_images/ditaa-5e6f93e563d6e94c14fe3d483f988e0579b05b38.png + + + + +
+
+ +

Use before initialize bugs

+ +
BUG: unable to handle kernel paging request at 5a5a5a5a
+IP: [<c1225063>] __list_del_entry+0x37/0x71
+…
+Call Trace:
+[<c12250a8>] list_del+0xb/0x1b
+[<f1de81a2>] use_before_init+0x31/0x38 [crusher]
+[<f1de8265>] crush_it+0x38/0xa9 [crusher]
+[<f1de82de>] init_module+0x8/0xa [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<f1de82d6>] ? crush_it+0xa9/0xa9 [crusher]
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+
+
+
noinline void use_before_init(void)
+{
+     struct list_m *m = kmalloc(sizeof(*m), GFP_KERNEL);
+
+     printk("%s\n", __func__);
+     list_del(&m->lh);
+}
+
+
+ + + + +
+
+ +

Use after free bug

+ +
BUG: unable to handle kernel paging request at 6b6b6b6b
+IP: [<c1225063>] __list_del_entry+0x37/0x71
+…
+Call Trace:
+[<c12250a8>] list_del+0xb/0x1b
+[<f4c6816a>] use_after_free+0x38/0x3f [crusher]
+[<f4c6827f>] crush_it+0x52/0xa9 [crusher]
+[<f4c682de>] init_module+0x8/0xa [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<f4c682d6>] ? crush_it+0xa9/0xa9 [crusher]
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+
+
+
noinline void use_after_free(void)
+{
+    struct list_m *m = kmalloc(sizeof(*m), GFP_KERNEL);
+
+    printk("%s\n", __func__);
+    kfree(m);
+    list_del(&m->lh);
+}
+
+
+ + + + +
+
+ +

Use after free bug

+ +
# insmod /system/lib/modules/crusher.ko test=use_before_init
+Slab corruption: size-4096 start=ed612000, len=4096
+000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 6b 6b
+
+
+
noinline void use_after_free2(void)
+{
+    char *b = kmalloc(3000, GFP_KERNEL);
+    kfree(b);
+    memset(b, 0, 30);
+    b = kmalloc(3000, GFP_KERNEL);
+    kfree(b);
+}
+
+
+ + + + +
+
+ +

Buffer overflow bugs

+ +
slab error in verify_redzone_free(): cache `dummy': memory outside object was overwritten
+Pid: 1282, comm: insmod Not tainted 3.0.16-mid10-00007-ga4a6b62-dirty #70
+Call Trace:
+[<c10cc1de>] __slab_error+0x17/0x1c
+[<c10cc7ca>] __cache_free+0x12c/0x317
+[<c10ccaba>] kmem_cache_free+0x2b/0xaf
+[<f27f1138>] buffer_overflow+0x4c/0x57 [crusher]
+[<f27f12aa>] crush_it+0x6c/0xa9 [crusher]
+[<f27f12ef>] init_module+0x8/0xd [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+eb002bf8: redzone 1:0xd84156c5635688c0, redzone 2:0x0
+
+
+
noinline void buffer_overflow(void)
+{
+    struct kmem_cache *km = kmem_cache_create("dummy", 3000, 0, 0, NULL);
+    char *b = kmem_cache_alloc(km, GFP_KERNEL);
+
+    printk("%s\n", __func__);
+    memset(b, 0, 3016);
+    kmem_cache_free(km, b);
+}
+
+
+ + + + +
+
+ +

DEBUG_PAGEALLOC

+ +
    +
  • Memory debugger that works at a page level
  • +
  • Detects invalid accesses either by:
      +
    • Filling pages with poison byte patterns and checking the pattern at +reallocation
    • +
    • Unmapping the deallocated pages from kernel space (just a few +architectures)
    • +
    +
  • +
+ + + + +
+
+ +

KASan

+ +
    +
  • dynamic memory error detector
  • +
  • finds use-after-free or out-of-bounds bugs
  • +
  • uses shadow memory to track memory operations
  • +
  • lib/test_kasan.c
  • +
+ + + + +
+
+ +

KASan vs DEBUG_PAGEALLOC

+ +

KASan is slower than DEBUG_PAGEALLOC, but KASan works on sub-page granularity +level, so it is able to find more bugs.

+ + + + +
+
+ +

KASan vs SLUB_DEBUG

+ +
    +
  • SLUB_DEBUG has lower overhead than KASan.
  • +
  • SLUB_DEBUG in most cases is not able to detect bad reads, KASan is able to +detect both reads and writes.
  • +
  • In some cases (e.g. redzone overwritten) SLUB_DEBUG detects bugs only on +allocation/freeing of the object. KASan catches bugs right before they happen, +so we always know the exact place of the first bad read/write.
  • +
+ + + + +
+
+ +

Kmemleak

+ +
    +
  • enable kernel config: CONFIG_DEBUG_KMEMLEAK
  • +
  • setup: mount -t debugfs nodev /sys/kernel/debug
  • +
  • trigger a memory scan: echo scan > /sys/kernel/debug/kmemleak
  • +
  • show memory leaks: cat /sys/kernel/debug/kmemleak
  • +
  • clear all possible leaks: echo clear > /sys/kernel/debug/kmemleak
  • +
+ + + + +
+
+ +

Kmemleak example

+ +
static int leak_init(void)
+{
+    pr_info("%s\n", __func__);
+
+    (void)kmalloc(16, GFP_KERNEL);
+
+    return 0;
+}
+
+MODULE_LICENSE("GPL v2");
+module_init(leak_init);
+
+
+ + + + +
+
+ +

Kmemleak report

+ +
root@qemux86:~# insmod skels/debugging/leak/leak.ko
+leak: loading out-of-tree module taints kernel.
+leak_init
+root@qemux86:~# echo scan > /sys/kernel/debug/kmemleak
+root@qemux86:~# echo scan > /sys/kernel/debug/kmemleak
+kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
+root@qemux86:~# cat /sys/kernel/debug/kmemleak
+unreferenced object 0xd7871500 (size 32):
+comm "insmod", pid 237, jiffies 4294902108 (age 24.628s)
+hex dump (first 32 bytes):
+5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  ZZZZZZZZZZZZZZZZ
+5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a a5  ZZZZZZZZZZZZZZZ.
+backtrace:
+[<(ptrval)>] kmem_cache_alloc_trace+0x163/0x310
+[<(ptrval)>] leak_init+0x2f/0x1000 [leak]
+[<(ptrval)>] do_one_initcall+0x57/0x2e0
+[<(ptrval)>] do_init_module+0x4b/0x1be
+[<(ptrval)>] load_module+0x201a/0x2590
+[<(ptrval)>] sys_init_module+0xfd/0x120
+[<(ptrval)>] do_int80_syscall_32+0x6a/0x1a0
+
+
+ + + + +
+
+ +

Lockdep checker

+ +
    +
  • CONFIG_DEBUG_LOCKDEP
  • +
  • Detects lock inversion, circular dependencies, incorrect usage of locks +(including interrupt context)
  • +
  • Maintains dependency between classes of locks not individual locks
  • +
  • Each scenario is only checked once and hashed
  • +
+ + + + +
+
+ +

AB BA Deadlock Example

+ +
static noinline int thread_a(void *unused)
+{
+  mutex_lock(&a); pr_info("%s acquired A\n", __func__);
+  mutex_lock(&b); pr_info("%s acquired B\n", __func__);
+
+  mutex_unlock(&b);
+  mutex_unlock(&a);
+
+  return 0;
+}
+
+
+
static noinline int thread_b(void *unused)
+{
+  mutex_lock(&b); pr_info("%s acquired B\n", __func__);
+  mutex_lock(&a); pr_info("%s acquired A\n", __func__);
+
+  mutex_unlock(&a);
+  mutex_unlock(&b);
+
+  return 0;
+}
+
+
+ + + + +
+
+ +

AB BA Deadlock Report

+ +
thread_a acquired A
+thread_a acquired B
+thread_b acquired B
+
+======================================================
+WARNING: possible circular locking dependency detected
+4.19.0+ #4 Tainted: G           O
+------------------------------------------------------
+thread_b/238 is trying to acquire lock:
+(ptrval) (a){+.+.}, at: thread_b+0x48/0x90 [locking]
+
+but task is already holding lock:
+(ptrval) (b){+.+.}, at: thread_b+0x27/0x90 [locking]
+
+which lock already depends on the new lock.
+
+
+ + + + +
+
+ +

AB BA Deadlock Report (dependency chain)

+ +
the existing dependency chain (in reverse order) is:
+
+-> #1 (b){+.+.}:
+      __mutex_lock+0x60/0x830
+      mutex_lock_nested+0x20/0x30
+      thread_a+0x48/0x90 [locking]
+      kthread+0xeb/0x100
+      ret_from_fork+0x2e/0x38
+
+-> #0 (a){+.+.}:
+      lock_acquire+0x93/0x190
+      __mutex_lock+0x60/0x830
+      mutex_lock_nested+0x20/0x30
+      thread_b+0x48/0x90 [locking]
+      kthread+0xeb/0x100
+      ret_from_fork+0x2e/0x38
+
+
+ + + + +
+
+ +

AB BA Deadlock Report (unsafe locking scenario)

+ +
other info that might help us debug this:
+
+Possible unsafe locking scenario:
+
+CPU0                    CPU1
+----                    ----
+lock(b);
+                        lock(a);
+                        lock(b);
+lock(a);
+
+*** DEADLOCK ***
+
+
+ + + + +
+
+ +

IRQ Deadlock Example

+ +
static DEFINE_SPINLOCK(lock);
+
+static void timerfn(struct timer_list *unused)
+{
+  pr_info("%s acquiring lock\n", __func__);
+  spin_lock(&lock);   pr_info("%s acquired lock\n", __func__);
+  spin_unlock(&lock); pr_info("%s released lock\n", __func__);
+}
+
+static DEFINE_TIMER(timer, timerfn);
+
+int init_module(void)
+{
+  mod_timer(&timer, jiffies);
+
+  pr_info("%s acquiring lock\n", __func__);
+  spin_lock(&lock);   pr_info("%s acquired lock\n", __func__);
+  spin_unlock(&lock); pr_info("%s released lock\n", __func__);
+  return 0;
+}
+
+
+ + + + +
+
+ +

IRQ Deadlock Report

+ +
init_module acquiring lock
+init_module acquired lock
+init_module released lock
+timerfn acquiring lock
+
+================================
+WARNING: inconsistent lock state
+4.19.0+ #4 Tainted: G           O
+--------------------------------
+inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
+ksoftirqd/0/9 [HC0[0]:SC1[1]:HE1:SE0] takes:
+(ptrval) (lock#4){+.?.}, at: timerfn+0x25/0x60 [locking2]
+{SOFTIRQ-ON-W} state was registered at:
+lock_acquire+0x93/0x190
+_raw_spin_lock+0x39/0x50
+init_module+0x35/0x70 [locking2]
+do_one_initcall+0x57/0x2e0
+do_init_module+0x4b/0x1be
+load_module+0x201a/0x2590
+sys_init_module+0xfd/0x120
+do_int80_syscall_32+0x6a/0x1a0
+restore_all+0x0/0x8d
+
+
+ + + + +
+
+ +

IRQ Deadlock Report

+ +
Possible unsafe locking scenario:
+
+       CPU0
+       ----
+       lock(lock#4);
+       <Interrupt>
+       lock(lock#4);
+
+       *** DEADLOCK ***
+
+1 lock held by ksoftirqd/0/9:
+#0: (ptrval) (/home/tavi/src/linux/tools/labs/skels/./debugging/locking2/locking2.c:13){+.-.}, at: call_timer_f0
+stack backtrace:
+CPU: 0 PID: 9 Comm: ksoftirqd/0 Tainted: G           O      4.19.0+ #4
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
+Call Trace:
+dump_stack+0x66/0x96
+print_usage_bug.part.26+0x1ee/0x200
+mark_lock+0x5ea/0x640
+__lock_acquire+0x4b4/0x17a0
+lock_acquire+0x93/0x190
+_raw_spin_lock+0x39/0x50
+timerfn+0x25/0x60 [locking2]
+
+
+ + + + +
+
+ +

perf

+ +
    +
  • performance counters, tracepoints, kprobes, uprobes
  • +
  • hardware events: CPU cycles, TLB misses, cache misses
  • +
  • software events: page faults, context switches
  • +
  • collects backtraces (user + kernel)
  • +
+ + + + +
+
+ +

Other tools

+ +
    +
  • ftrace
  • +
  • kprobes
  • +
  • sparse
  • +
  • coccinelle
  • +
  • checkpatch.pl
  • +
  • printk
  • +
  • dump_stack()
  • +
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/debugging.html b/refs/pull/405/merge/lectures/debugging.html new file mode 100644 index 00000000..ccb92e11 --- /dev/null +++ b/refs/pull/405/merge/lectures/debugging.html @@ -0,0 +1,902 @@ + + + + + + Debugging — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Debugging

+

View slides

+
+

Lecture objectives:

+

One essential part of Linux kernel development is debugging. In user space we had +the support of the kernel so we could easily stop processes and use gdb to inspect +their behavior. In the kernel, in order to use gdb we need to use a hypervisor like +QEMU or JTAG based hardware interfaces which are not always available. The Linux +kernel provides a set of tools and debug options useful for investigating abnormal +behavior.

+

In this lecture we will learn about:

+
    +
  • decoding an oops/panic
  • +
  • list debugging
  • +
  • memory debugging
  • +
  • locking debugging
  • +
  • profiling
  • +
+
+
+

Decoding an oops/panic

+

An oops is an inconsistent state that the kernel detects inside itself. +Upon detecting an oops the Linux kernel kills the offending process, +prints information that can help debug the problem and continues execution +but with limited reliability.

+

Let's consider the following Linux kernel module:

+
static noinline void do_oops(void)
+{
+    *(int*)0x42 = 'a';
+}
+
+static int so2_oops_init(void)
+{
+    pr_info("oops_init\n");
+    do_oops();
+
+    return 0;
+}
+
+static void so2_oops_exit(void)
+{
+    pr_info("oops exit\n");
+}
+
+module_init(so2_oops_init);
+module_exit(so2_oops_exit);
+
+
+

Notice that the ''do_oops'' function tries to write at an invalid memory address. Because the kernel +cannot find a suitable physical page where to write, it kills the insmod task in the context of +which ''do_oops'' runs. Then it prints the following oops message:

+
+
root@qemux86:~/skels/debugging/oops# insmod oops.ko
+BUG: unable to handle kernel NULL pointer dereference at 00000042
+IP: do_oops+0x8/0x10 [oops]
+*pde = 00000000
+Oops: 0002 [#1] SMP
+Modules linked in: oops(O+)
+CPU: 0 PID: 234 Comm: insmod Tainted: G           O     4.15.0+ #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+EIP: do_oops+0x8/0x10 [oops]
+EFLAGS: 00000292 CPU: 0
+EAX: 00000061 EBX: 00000000 ECX: c7ed3584 EDX: c7ece8dc
+ESI: c716c908 EDI: c8816010 EBP: c7257df0 ESP: c7257df0
+DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
+CR0: 80050033 CR2: 00000042 CR3: 0785f000 CR4: 00000690
+Call Trace:
+so2_oops_init+0x17/0x20 [oops]
+do_one_initcall+0x37/0x170
+? cache_alloc_debugcheck_after.isra.19+0x15f/0x2f0
+? __might_sleep+0x32/0x90
+? trace_hardirqs_on_caller+0x11c/0x1a0
+? do_init_module+0x17/0x1c2
+? kmem_cache_alloc+0xa4/0x1e0
+? do_init_module+0x17/0x1c2
+do_init_module+0x46/0x1c2
+load_module+0x1f45/0x2380
+SyS_init_module+0xe5/0x100
+do_int80_syscall_32+0x61/0x190
+entry_INT80_32+0x2f/0x2f
+EIP: 0x44902cc2
+EFLAGS: 00000206 CPU: 0
+EAX: ffffffda EBX: 08afb050 ECX: 0000eef4 EDX: 08afb008
+ESI: 00000000 EDI: bf914dbc EBP: 00000000 ESP: bf914c1c
+DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b
+Code: <a3> 42 00 00 00 5d c3 90 55 89 e5 83 ec 04 c7 04 24 24 70 81 c8 e8
+EIP: do_oops+0x8/0x10 [oops] SS:ESP: 0068:c7257df0
+CR2: 0000000000000042
+---[ end trace 011848be72f8bb42 ]---
+Killed
+
+
+
+

An oops contains information about the IP which caused the fault, register status, process, +CPU on which the fault happened, like below:

+
root@qemux86:~/skels/debugging/oops# insmod oops.ko
+BUG: unable to handle kernel NULL pointer dereference at 00000042
+IP: do_oops+0x8/0x10 [oops]
+*pde = 00000000
+Oops: 0002 [#1] SMP
+Modules linked in: oops(O+)
+CPU: 0 PID: 234 Comm: insmod Tainted: G           O     4.15.0+ #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+EIP: do_oops+0x8/0x10 [oops]
+CR0: 80050033 CR2: 00000042 CR3: 0785f000 CR4: 00000690
+EIP: 0x44902cc2
+EFLAGS: 00000206 CPU: 0
+EAX: ffffffda EBX: 08afb050 ECX: 0000eef4 EDX: 08afb008
+ESI: 00000000 EDI: bf914dbc EBP: 00000000 ESP: bf914c1c
+DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b
+Code: <a3> 42 00 00 00 5d c3 90 55 89 e5 83 ec 04 c7 04 24 24 70 81 c8 e8
+Killed
+
+
+

Another important thing that an oops can provide is the stack trace of functions called before +the fault happened:

+
root@qemux86:~/skels/debugging/oops# insmod oops.ko
+BUG: unable to handle kernel NULL pointer dereference at 00000042
+Call Trace:
+so2_oops_init+0x17/0x20 [oops]
+do_one_initcall+0x37/0x170
+? cache_alloc_debugcheck_after.isra.19+0x15f/0x2f0
+? __might_sleep+0x32/0x90
+? trace_hardirqs_on_caller+0x11c/0x1a0
+? do_init_module+0x17/0x1c2
+? kmem_cache_alloc+0xa4/0x1e0
+? do_init_module+0x17/0x1c2
+do_init_module+0x46/0x1c2
+load_module+0x1f45/0x2380
+SyS_init_module+0xe5/0x100
+do_int80_syscall_32+0x61/0x190
+entry_INT80_32+0x2f/0x2f
+Killed
+
+
+
+

Decoding an oops

+
    +
  • CONFIG_DEBUG_INFO
  • +
  • addr2line
  • +
  • gdb
  • +
  • objdump -dSr
  • +
+
+
+

addr2line

+

addr2line translates addresses into file names and line numbers. Given +an address in an executable it uses the debugging information to figure out +which file name and line number are associated with it.

+

Modules are loaded at dynamic addresses but are compiled starting with 0 as +a base address. So, in order to find the line number for a given dynamic address +we need to know the module's load address.

+
$ addr2line -e oops.o  0x08
+$ skels/debugging/oops/oops.c:5
+$ # 0x08 is the offset of the offending instruction inside the oops.ko module
+
+
+
+
+

objdump

+

Similarly, we can determine the offending line using objdump:

+
$ cat /proc/modules
+oops 20480 1 - Loading 0xc8816000 (O+)
+
+$ objdump -dS --adjust-vma=0xc8816000 oops.ko
+c8816000:       b8 61 00 00 00          mov    $0x61,%eax
+
+static noinline void do_oops(void)
+{
+c8816005:       55                      push   %ebp
+c8816006:       89 e5                   mov    %esp,%ebp
+*(int*)0x42 = 'a';
+c8816008:       a3 42 00 00 00          mov    %eax,0x42
+
+
+
+
+

gdb

+
$ gdb ./vmlinux
+
+(gdb) list *(do_panic+0x8)
+0xc1244138 is in do_panic (lib/test_panic.c:8).
+3
+4 static struct timer_list panic_timer;
+5
+6 static void do_panic(struct timer_list *unused)
+7 {
+8         *(int*)0x42 = 'a';
+9 }
+10
+11        static int so2_panic_init(void)
+
+
+
+
+

Kernel panic

+

A kernel panic is a special type of oops where the kernel cannot continue execution. For example +if the function do_oops from above was called in the interrupt context, the kernel wouldn't know which process to kill +and it will decide that it is better to crash the kernel and stop execution.

+

Here is a sample code that will generate a kernel panic:

+
static struct timer_list panic_timer;
+
+static void do_panic(struct timer_list *unused)
+{
+    *(int*)0x42 = 'a';
+}
+
+static int so2_panic_init(void)
+{
+    pr_info("panic_init\n");
+
+    timer_setup(&panic_timer,  do_panic, 0);
+    mod_timer(&panic_timer, jiffies + 2 * HZ);
+
+    return 0;
+}
+
+
+

Loading the module will generate the following kernel panic message:

+
root@qemux86:~/skels/debugging/panic# insmod panic.ko
+panic: loading out-of-tree module taints kernel.
+panic_init
+root@qemux86:~/skels/debugging/panic# BUG: unable to handle kernel NULL pointer dereference at 00000042
+IP: do_panic+0x8/0x10 [panic]
+*pde = 00000000
+Oops: 0002 [#1] SMP
+Modules linked in: panic(O)
+CPU: 0 PID: 0 Comm: swapper/0 Tainted: G           O     4.15.0+ #19
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+EIP: do_panic+0x8/0x10 [panic]
+EFLAGS: 00010246 CPU: 0
+EAX: 00000061 EBX: 00000101 ECX: 000002d8 EDX: 00000000
+ESI: c8817000 EDI: c8819200 EBP: c780ff34 ESP: c780ff34
+DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
+CR0: 80050033 CR2: 00000042 CR3: 0716b000 CR4: 00000690
+Call Trace:
+<SOFTIRQ>
+call_timer_fn+0x63/0xf0
+? process_timeout+0x10/0x10
+run_timer_softirq+0x14f/0x170
+? 0xc8817000
+? trace_hardirqs_on_caller+0x9b/0x1a0
+__do_softirq+0xde/0x1f2
+? __irqentry_text_end+0x6/0x6
+do_softirq_own_stack+0x57/0x70
+</SOFTIRQ>
+irq_exit+0x7d/0x90
+smp_apic_timer_interrupt+0x4f/0x90
+? trace_hardirqs_off_thunk+0xc/0x1d
+apic_timer_interrupt+0x3a/0x40
+EIP: default_idle+0xa/0x10
+EFLAGS: 00000246 CPU: 0
+EAX: c15c97c0 EBX: 00000000 ECX: 00000000 EDX: 00000001
+ESI: 00000000 EDI: 00000000 EBP: c15c3f48 ESP: c15c3f48
+DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
+arch_cpu_idle+0x9/0x10
+default_idle_call+0x19/0x30
+do_idle+0x105/0x180
+cpu_startup_entry+0x25/0x30
+rest_init+0x1e3/0x1f0
+start_kernel+0x305/0x30a
+i386_start_kernel+0x95/0x99
+startup_32_smp+0x15f/0x164
+Code: <a3> 42 00 00 00 5d c3 90 55 89 e5 83 ec 08 c7 04 24 24 80 81 c8 e8
+EIP: do_panic+0x8/0x10 [panic] SS:ESP: 0068:c780ff34
+CR2: 0000000000000042
+---[ end trace 77f49f83f2e42f91 ]---
+Kernel panic - not syncing: Fatal exception in interrupt
+Kernel Offset: disabled
+---[ end Kernel panic - not syncing: Fatal exception in interrupt
+
+
+
+
+
+

List debugging

+

In order to catch access to uninitialized elements the kernel uses poison +magic values.

+
static inline void list_del(struct list_head *entry)
+{
+        __list_del(entry->prev, entry->next);
+        entry->next = (struct list_head*)LIST_POISON1;
+        entry->prev = (struct list_head*)LIST_POISON2;
+}
+
+BUG: unable to handle kernel NULL pointer dereference at 00000100
+IP: crush+0x80/0xb0 [list]
+
+
+
+
+

Memory debugging

+

There are several tools for memory debugging:

+
    +
  • SLAB/SLUB debugging
  • +
  • KASAN
  • +
  • kmemcheck
  • +
  • DEBUG_PAGEALLOC
  • +
+
+

Slab debugging

+

Slab debugging uses a memory poison technique to detect several types of memory +bugs in the SLAB/SLUB allocators.

+

The allocated buffers are guarded with memory that has been filled in with +special markers. Any adjacent writes to the buffer will be detected at a later +time when other memory management operations on that buffer are performed +(e.g. when the buffer is freed).

+

Upon allocation of the buffer, the buffer is also filled in with a special +value to potentially detect buffer access before initialization (e.g. if the +buffer holds pointers). The value is selected in such a way that it is unlikely +to be a valid address and as such to trigger kernel bugs at the access time.

+

A similar technique is used when freeing the buffer: the buffer is filled with +another special value that will cause kernel bugs if pointers are accessed after +the memory is freed. In this case, the allocator also checks the next time the +buffer is allocated that the buffer was not modified.

+

The diagram below shows a summary of the way SLAB/SLUB poisoning works:

+
    +
  • CONFIG_DEBUG_SLAB
  • +
  • poison-based memory debuggers
  • +
+../_images/ditaa-5e6f93e563d6e94c14fe3d483f988e0579b05b38.png +

Example of a use-before-initialization bug:

+
BUG: unable to handle kernel paging request at 5a5a5a5a
+IP: [<c1225063>] __list_del_entry+0x37/0x71
+…
+Call Trace:
+[<c12250a8>] list_del+0xb/0x1b
+[<f1de81a2>] use_before_init+0x31/0x38 [crusher]
+[<f1de8265>] crush_it+0x38/0xa9 [crusher]
+[<f1de82de>] init_module+0x8/0xa [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<f1de82d6>] ? crush_it+0xa9/0xa9 [crusher]
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+
+
+
noinline void use_before_init(void)
+{
+     struct list_m *m = kmalloc(sizeof(*m), GFP_KERNEL);
+
+     printk("%s\n", __func__);
+     list_del(&m->lh);
+}
+
+
+

Example of a use-after-free bug:

+
BUG: unable to handle kernel paging request at 6b6b6b6b
+IP: [<c1225063>] __list_del_entry+0x37/0x71
+…
+Call Trace:
+[<c12250a8>] list_del+0xb/0x1b
+[<f4c6816a>] use_after_free+0x38/0x3f [crusher]
+[<f4c6827f>] crush_it+0x52/0xa9 [crusher]
+[<f4c682de>] init_module+0x8/0xa [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<f4c682d6>] ? crush_it+0xa9/0xa9 [crusher]
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+
+
+
noinline void use_after_free(void)
+{
+    struct list_m *m = kmalloc(sizeof(*m), GFP_KERNEL);
+
+    printk("%s\n", __func__);
+    kfree(m);
+    list_del(&m->lh);
+}
+
+
+

Another example of a use-after-free bug is shown below. Note that this time the +bug is detected at the next allocation.

+
# insmod /system/lib/modules/crusher.ko test=use_before_init
+Slab corruption: size-4096 start=ed612000, len=4096
+000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 6b 6b
+
+
+
noinline void use_after_free2(void)
+{
+    char *b = kmalloc(3000, GFP_KERNEL);
+    kfree(b);
+    memset(b, 0, 30);
+    b = kmalloc(3000, GFP_KERNEL);
+    kfree(b);
+}
+
+
+

Finally this is an example of a buffer overflow bug:

+
slab error in verify_redzone_free(): cache `dummy': memory outside object was overwritten
+Pid: 1282, comm: insmod Not tainted 3.0.16-mid10-00007-ga4a6b62-dirty #70
+Call Trace:
+[<c10cc1de>] __slab_error+0x17/0x1c
+[<c10cc7ca>] __cache_free+0x12c/0x317
+[<c10ccaba>] kmem_cache_free+0x2b/0xaf
+[<f27f1138>] buffer_overflow+0x4c/0x57 [crusher]
+[<f27f12aa>] crush_it+0x6c/0xa9 [crusher]
+[<f27f12ef>] init_module+0x8/0xd [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+eb002bf8: redzone 1:0xd84156c5635688c0, redzone 2:0x0
+
+
+
noinline void buffer_overflow(void)
+{
+    struct kmem_cache *km = kmem_cache_create("dummy", 3000, 0, 0, NULL);
+    char *b = kmem_cache_alloc(km, GFP_KERNEL);
+
+    printk("%s\n", __func__);
+    memset(b, 0, 3016);
+    kmem_cache_free(km, b);
+}
+
+
+
+
+

DEBUG_PAGEALLOC

+
    +
  • Memory debugger that works at a page level
  • +
  • Detects invalid accesses either by:
      +
    • Filling pages with poison byte patterns and checking the pattern at +reallocation
    • +
    • Unmapping the deallocated pages from kernel space (just a few +architectures)
    • +
    +
  • +
+
+
+

KASan

+

KASan is a dynamic memory error detector designed to find use-after-free +and out-of-bounds bugs.

+

The main idea of KASAN is to use shadow memory to record whether each byte +of memory is safe to access or not, and use compiler's instrumentation to +check the shadow memory on each memory access.

+

Address sanitizer uses 1 byte of shadow memory to track 8 bytes of kernel +address space. It uses 0-7 to encode the number of consecutive bytes at +the beginning of the eight-byte region that are valid.

+

See The Kernel Address Sanitizer (KASAN) for more information and have a look +at lib/test_kasan.c for an example of problems that KASan can detect.

+
    +
  • dynamic memory error detector
  • +
  • finds use-after-free or out-of-bounds bugs
  • +
  • uses shadow memory to track memory operations
  • +
  • lib/test_kasan.c
  • +
+
+

KASan vs DEBUG_PAGEALLOC

+

KASan is slower than DEBUG_PAGEALLOC, but KASan works on sub-page granularity +level, so it is able to find more bugs.

+
+
+

KASan vs SLUB_DEBUG

+
    +
  • SLUB_DEBUG has lower overhead than KASan.
  • +
  • SLUB_DEBUG in most cases is not able to detect bad reads; KASan is able to +detect both reads and writes.
  • +
  • In some cases (e.g. redzone overwritten) SLUB_DEBUG detects bugs only on +allocation/freeing of the object. KASan catches bugs right before they happen, +so we always know the exact place of the first bad read/write.
  • +
+
+
+
+

Kmemleak

+

Kmemleak provides a way of detecting kernel memory leaks in a way similar to a +tracing garbage collector. Since tracing pointers is not possible in C, kmemleak +scans the kernel stacks as well as dynamically and statically allocated kernel memory for +pointers to allocated buffers. A buffer for which there is no pointer is +considered as leaked. The basic steps to use kmemleak are presented below; for +more information see Kernel Memory Leak Detector

+
    +
  • enable kernel config: CONFIG_DEBUG_KMEMLEAK
  • +
  • setup: mount -t debugfs nodev /sys/kernel/debug
  • +
  • trigger a memory scan: echo scan > /sys/kernel/debug/kmemleak
  • +
  • show memory leaks: cat /sys/kernel/debug/kmemleak
  • +
  • clear all possible leaks: echo clear > /sys/kernel/debug/kmemleak
  • +
+

As an example, let's look at the following simple module:

+
static int leak_init(void)
+{
+    pr_info("%s\n", __func__);
+
+    (void)kmalloc(16, GFP_KERNEL);
+
+    return 0;
+}
+
+MODULE_LICENSE("GPL v2");
+module_init(leak_init);
+
+
+

Loading the module and triggering a kmemleak scan will issue the +following report:

+
root@qemux86:~# insmod skels/debugging/leak/leak.ko
+leak: loading out-of-tree module taints kernel.
+leak_init
+root@qemux86:~# echo scan > /sys/kernel/debug/kmemleak
+root@qemux86:~# echo scan > /sys/kernel/debug/kmemleak
+kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
+root@qemux86:~# cat /sys/kernel/debug/kmemleak
+unreferenced object 0xd7871500 (size 32):
+comm "insmod", pid 237, jiffies 4294902108 (age 24.628s)
+hex dump (first 32 bytes):
+5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  ZZZZZZZZZZZZZZZZ
+5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a a5  ZZZZZZZZZZZZZZZ.
+backtrace:
+[<(ptrval)>] kmem_cache_alloc_trace+0x163/0x310
+[<(ptrval)>] leak_init+0x2f/0x1000 [leak]
+[<(ptrval)>] do_one_initcall+0x57/0x2e0
+[<(ptrval)>] do_init_module+0x4b/0x1be
+[<(ptrval)>] load_module+0x201a/0x2590
+[<(ptrval)>] sys_init_module+0xfd/0x120
+[<(ptrval)>] do_int80_syscall_32+0x6a/0x1a0
+
+
+
+

Note

+

Notice that we did not have to unload the module to detect the memory +leak since kmemleak detects that the allocated buffer is not +reachable anymore.

+
+
+
+
+

Lockdep checker

+
    +
  • CONFIG_DEBUG_LOCKDEP
  • +
  • Detects lock inversion, circular dependencies, incorrect usage of locks +(including interrupt context)
  • +
  • Maintains dependency between classes of locks not individual locks
  • +
  • Each scenario is only checked once and hashed
  • +
+

Let's take for example the following kernel module that runs two kernel threads:

+
static noinline int thread_a(void *unused)
+{
+  mutex_lock(&a); pr_info("%s acquired A\n", __func__);
+  mutex_lock(&b); pr_info("%s acquired B\n", __func__);
+
+  mutex_unlock(&b);
+  mutex_unlock(&a);
+
+  return 0;
+}
+
+
+
static noinline int thread_b(void *unused)
+{
+  mutex_lock(&b); pr_info("%s acquired B\n", __func__);
+  mutex_lock(&a); pr_info("%s acquired A\n", __func__);
+
+  mutex_unlock(&a);
+  mutex_unlock(&b);
+
+  return 0;
+}
+
+
+

Loading this module with lockdep checker active will produce the following +kernel log:

+
thread_a acquired A
+thread_a acquired B
+thread_b acquired B
+
+======================================================
+WARNING: possible circular locking dependency detected
+4.19.0+ #4 Tainted: G           O
+------------------------------------------------------
+thread_b/238 is trying to acquire lock:
+(ptrval) (a){+.+.}, at: thread_b+0x48/0x90 [locking]
+
+but task is already holding lock:
+(ptrval) (b){+.+.}, at: thread_b+0x27/0x90 [locking]
+
+which lock already depends on the new lock.
+
+
+

As you can see, although the deadlock condition did not trigger (because thread +A did not complete execution before thread B started execution) the lockdep +checker identified a potential deadlock scenario.

+

Lockdep checker will provide even more information to help determine what caused +the deadlock, like the dependency chain:

+
the existing dependency chain (in reverse order) is:
+
+-> #1 (b){+.+.}:
+      __mutex_lock+0x60/0x830
+      mutex_lock_nested+0x20/0x30
+      thread_a+0x48/0x90 [locking]
+      kthread+0xeb/0x100
+      ret_from_fork+0x2e/0x38
+
+-> #0 (a){+.+.}:
+      lock_acquire+0x93/0x190
+      __mutex_lock+0x60/0x830
+      mutex_lock_nested+0x20/0x30
+      thread_b+0x48/0x90 [locking]
+      kthread+0xeb/0x100
+      ret_from_fork+0x2e/0x38
+
+
+

and even an unsafe locking scenario:

+
other info that might help us debug this:
+
+Possible unsafe locking scenario:
+
+CPU0                    CPU1
+----                    ----
+lock(b);
+                        lock(a);
+                        lock(b);
+lock(a);
+
+*** DEADLOCK ***
+
+
+

Another example of unsafe locking issues that lockdep checker detects +is unsafe locking from interrupt context. Let's consider the following +kernel module:

+
static DEFINE_SPINLOCK(lock);
+
+static void timerfn(struct timer_list *unused)
+{
+  pr_info("%s acquiring lock\n", __func__);
+  spin_lock(&lock);   pr_info("%s acquired lock\n", __func__);
+  spin_unlock(&lock); pr_info("%s released lock\n", __func__);
+}
+
+static DEFINE_TIMER(timer, timerfn);
+
+int init_module(void)
+{
+  mod_timer(&timer, jiffies);
+
+  pr_info("%s acquiring lock\n", __func__);
+  spin_lock(&lock);   pr_info("%s acquired lock\n", __func__);
+  spin_unlock(&lock); pr_info("%s released lock\n", __func__);
+  return 0;
+}
+
+
+

As in the previous case, loading the module will trigger a lockdep +warning:

+
init_module acquiring lock
+init_module acquired lock
+init_module released lock
+timerfn acquiring lock
+
+================================
+WARNING: inconsistent lock state
+4.19.0+ #4 Tainted: G           O
+--------------------------------
+inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
+ksoftirqd/0/9 [HC0[0]:SC1[1]:HE1:SE0] takes:
+(ptrval) (lock#4){+.?.}, at: timerfn+0x25/0x60 [locking2]
+{SOFTIRQ-ON-W} state was registered at:
+lock_acquire+0x93/0x190
+_raw_spin_lock+0x39/0x50
+init_module+0x35/0x70 [locking2]
+do_one_initcall+0x57/0x2e0
+do_init_module+0x4b/0x1be
+load_module+0x201a/0x2590
+sys_init_module+0xfd/0x120
+do_int80_syscall_32+0x6a/0x1a0
+restore_all+0x0/0x8d
+
+
+

The warning will also provide additional information and a potential unsafe +locking scenario:

+
Possible unsafe locking scenario:
+
+       CPU0
+       ----
+       lock(lock#4);
+       <Interrupt>
+       lock(lock#4);
+
+       *** DEADLOCK ***
+
+1 lock held by ksoftirqd/0/9:
+#0: (ptrval) (/home/tavi/src/linux/tools/labs/skels/./debugging/locking2/locking2.c:13){+.-.}, at: call_timer_f0
+stack backtrace:
+CPU: 0 PID: 9 Comm: ksoftirqd/0 Tainted: G           O      4.19.0+ #4
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
+Call Trace:
+dump_stack+0x66/0x96
+print_usage_bug.part.26+0x1ee/0x200
+mark_lock+0x5ea/0x640
+__lock_acquire+0x4b4/0x17a0
+lock_acquire+0x93/0x190
+_raw_spin_lock+0x39/0x50
+timerfn+0x25/0x60 [locking2]
+
+
+
+
+

perf

+
    +
  • performance counters, tracepoints, kprobes, uprobes
  • +
  • hardware events: CPU cycles, TLB misses, cache misses
  • +
  • software events: page faults, context switches
  • +
  • collects backtraces (user + kernel)
  • +
+
+
+

Other tools

+
    +
  • ftrace
  • +
  • kprobes
  • +
  • sparse
  • +
  • coccinelle
  • +
  • checkpatch.pl
  • +
  • printk
  • +
  • dump_stack()
  • +
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/fs-slides.html b/refs/pull/405/merge/lectures/fs-slides.html new file mode 100644 index 00000000..6a78b011 --- /dev/null +++ b/refs/pull/405/merge/lectures/fs-slides.html @@ -0,0 +1,572 @@ + + + + + + + + Filesystem Management — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Filesystem Management

+ +
    +
  • Filesystem abstractions
  • +
  • Filesystem operations
  • +
  • Linux VFS
  • +
  • Overview of Linux I/O Management
  • +
+ + + + +
+
+ +

Filesystem Abstractions

+ +
    +
  • superblock
  • +
  • file
  • +
  • inode
  • +
  • dentry
  • +
+ + + + +
+
+ +

Filesystem Abstractions - in memory

+ +../_images/ditaa-29f54aaa1a85b819ff29cb7d101a4d646b3b0b06.png + + + + +
+
+ +

Filesystem Abstractions - on storage

+ +../_images/ditaa-bc662dab7bb3d9ba3a37efbf69b82c513dcaadd4.png + + + + +
+
+ +

Simple filesystem example

+ +

 

+../_images/ditaa-8b59fc3f5245ffb5d7089dc80cf2e306c39a62d8.png + + + + +
+
+ +

Overview

+ +../_images/ditaa-6d39f541805ae8197b413ec9c79116382abc4dbc.png + + + + +
+
+ +

Filesystem Operations

+ +
    +
  • Mount
  • +
  • Open a file
  • +
  • Querying file attributes
  • +
  • Reading data from a file
  • +
  • Writing data to a file
  • +
  • Creating a file
  • +
  • Deleting a file
  • +
+ + + + +
+
+ +

Mounting a filesystem

+ +
    +
  • Input: a storage device (partition)
  • +
  • Output: dentry pointing to the root directory
  • +
  • Steps: check device, determine filesystem parameters, locate the root inode
  • +
  • Example: check magic, determine block size, read the root inode and create dentry
  • +
+ + + + +
+
+ +

Opening a file

+ +
    +
  • Input: path
  • +
  • Output: file descriptor
  • +
  • Steps:
      +
    • Determine the filesystem type
    • +
    • For each name in the path: lookup parent dentry, load inode, +load data, find dentry
    • +
    • Create a new file that points to the last dentry
    • +
    • Find a free entry in the file descriptor table and set it to file
    • +
    +
  • +
+ + + + +
+
+ +

Querying file attributes

+ +
    +
  • Input: path
  • +
  • Output: file attributes
  • +
  • Steps:
      +
    • Access file->dentry->inode
    • +
    • Read file attributes from the inode
    • +
    +
  • +
+ + + + +
+
+ +

Reading data from a file

+ +
    +
  • Input: file descriptor, offset, length
  • +
  • Output: data
  • +
  • Steps:
      +
    • Access file->dentry->inode
    • +
    • Determine data blocks
    • +
    • Copy data blocks to memory
    • +
    +
  • +
+ + + + +
+
+ +

Writing data to a file

+ +
    +
  • Input: file descriptor, offset, length, data
  • +
  • Output:
  • +
  • Steps:
      +
    • Allocate one or more data blocks
    • +
    • Add the allocated blocks to the inode and update file size
    • +
    • Copy data from userspace to internal buffers and write them to +storage
    • +
    +
  • +
+ + + + +
+
+ +

Closing a file

+ +
    +
  • Input: file descriptor
  • +
  • Output:
  • +
  • Steps:
      +
    • set the file descriptor entry to NULL
    • +
    • Decrement file reference counter
    • +
    • When the counter reaches 0 free file
    • +
    +
  • +
+ + + + +
+
+ +

Directories

+ +

Directories are special files which contain one or more dentries.

+ + + + +
+
+ +

Creating a file

+ +
    +
  • Input: path
  • +
  • Output:
  • +
  • Steps:
      +
    • Determine the inode directory
    • +
    • Read data blocks and find space for a new dentry
    • +
    • Write back the modified inode directory data blocks
    • +
    +
  • +
+ + + + +
+
+ +

Deleting a file

+ +
    +
  • Input: path
  • +
  • Output:
  • +
  • Steps:
      +
    • determine the parent inode
    • +
    • read parent inode data blocks
    • +
    • find and erase the dentry (check for links)
    • +
    • when last file is closed: deallocate data and inode blocks
    • +
    +
  • +
+ + + + +
+
+ +

Virtual File System

+ +../_images/ditaa-e3a27a84dde42de58bcc5c360e1c4b15062507c2.png + + + + +
+
+ +

Superblock Operations

+ +
    +
  • fill_super
  • +
  • put_super
  • +
  • write_super
  • +
  • read_inode
  • +
+
    +
  • write_inode
  • +
  • evict_inode
  • +
  • statfs
  • +
  • remount_fs
  • +
+
+ + + + +
+
+ +

Inode Operations

+ +
    +
  • create
  • +
  • lookup
  • +
  • link
  • +
  • unlink
  • +
  • symlink
  • +
  • mkdir
  • +
+
    +
  • rmdir
  • +
  • rename
  • +
  • readlink
  • +
  • follow_link
  • +
  • put_link
  • +
  • ...
  • +
+
+ + + + +
+
+ +

The Inode Cache

+ +
    +
  • Caches inodes into memory to avoid costly storage operations
  • +
  • An inode is cached until low memory conditions are triggered
  • +
  • inodes are indexed with a hash table
  • +
  • The inode hash function takes the superblock and inode number as +inputs
  • +
+ + + + +
+
+ +

The Dentry Cache

+ +
    +
  • State:
      +
    • Used – d_inode is valid and the dentry object is in use
    • +
    • Unused – d_inode is valid but the dentry object is not in use
    • +
    • Negative – d_inode is not valid; the inode was not yet loaded +or the file was erased
    • +
    +
  • +
  • Dentry cache
      +
    • List of used dentries (dentry->d_state == used)
    • +
    • List of the most recent used dentries (sorted by access time)
    • +
    • Hash table to avoid searching the tree
    • +
    +
  • +
+ + + + +
+
+ +

The Page Cache

+ +
    +
  • Caches file data and not block device data
  • +
  • Uses the struct address_space to translate file offsets +to block offsets
  • +
  • Used for both read / write and mmap
  • +
  • Uses a radix tree
  • +
+ + + + +
+
+ +

struct address_space

+ +
/**
+ * struct address_space - Contents of a cacheable, mappable object.
+ * @host: Owner, either the inode or the block_device.
+ * @i_pages: Cached pages.
+ * @gfp_mask: Memory allocation flags to use for allocating pages.
+ * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @nr_thps: Number of THPs in the pagecache (non-shmem only).
+ * @i_mmap: Tree of private and shared mappings.
+ * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
+ * @nrpages: Number of page entries, protected by the i_pages lock.
+ * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
+ * @writeback_index: Writeback starts here.
+ * @a_ops: Methods.
+ * @flags: Error bits and flags (AS_*).
+ * @wb_err: The most recent error which has occurred.
+ * @private_lock: For use by the owner of the address_space.
+ * @private_list: For use by the owner of the address_space.
+ * @private_data: For use by the owner of the address_space.
+ */
+struct address_space {
+  struct inode            *host;
+  struct xarray           i_pages;
+  gfp_t                   gfp_mask;
+  atomic_t                i_mmap_writable;
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+  /* number of thp, only for non-shmem files */
+  atomic_t                nr_thps;
+#endif
+  struct rb_root_cached   i_mmap;
+  struct rw_semaphore     i_mmap_rwsem;
+  unsigned long           nrpages;
+  unsigned long           nrexceptional;
+  pgoff_t                 writeback_index;
+  const struct address_space_operations *a_ops;
+  unsigned long           flags;
+  errseq_t                wb_err;
+  spinlock_t              private_lock;
+  struct list_head        private_list;
+  void                    *private_data;
+} __attribute__((aligned(sizeof(long)))) __randomize_layout;
+
+struct address_space_operations {
+  int (*writepage)(struct page *page, struct writeback_control *wbc);
+  int (*readpage)(struct file *, struct page *);
+
+  /* Write back some dirty pages from this mapping. */
+  int (*writepages)(struct address_space *, struct writeback_control *);
+
+  /* Set a page dirty.  Return true if this dirtied it */
+  int (*set_page_dirty)(struct page *page);
+
+  /*
+   * Reads in the requested pages. Unlike ->readpage(), this is
+   * PURELY used for read-ahead!.
+   */
+  int (*readpages)(struct file *filp, struct address_space *mapping,
+                  struct list_head *pages, unsigned nr_pages);
+  void (*readahead)(struct readahead_control *);
+
+  int (*write_begin)(struct file *, struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned flags,
+                          struct page **pagep, void **fsdata);
+  int (*write_end)(struct file *, struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned copied,
+                          struct page *page, void *fsdata);
+
+  /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
+  sector_t (*bmap)(struct address_space *, sector_t);
+  void (*invalidatepage) (struct page *, unsigned int, unsigned int);
+  int (*releasepage) (struct page *, gfp_t);
+  void (*freepage)(struct page *);
+  ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+  /*
+   * migrate the contents of a page to the specified target. If
+   * migrate_mode is MIGRATE_ASYNC, it must not block.
+   */
+  int (*migratepage) (struct address_space *,
+                  struct page *, struct page *, enum migrate_mode);
+  bool (*isolate_page)(struct page *, isolate_mode_t);
+  void (*putback_page)(struct page *);
+  int (*launder_page) (struct page *);
+  int (*is_partially_uptodate) (struct page *, unsigned long,
+                                  unsigned long);
+  void (*is_dirty_writeback) (struct page *, bool *, bool *);
+  int (*error_remove_page)(struct address_space *, struct page *);
+
+  /* swapfile support */
+  int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
+                          sector_t *span);
+  void (*swap_deactivate)(struct file *file);
+};
+
+
+ + + + +
+
+ +

Reading data

+ +
/**
+ * generic_file_read_iter - generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the "read_iter()" routine for all filesystems
+ * that can use the page cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
+ * be returned when no data can be read without waiting for I/O requests
+ * to complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
+ * requests shall be made for the read or for readahead.  When no data
+ * can be read, -EAGAIN shall be returned.  When readahead would be
+ * triggered, a partial, possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+
+/*
+ * Generic "read page" function for block devices that have the normal
+ * get_block functionality. This is most of the block device filesystems.
+ * Reads the page asynchronously --- the unlock_buffer() and
+ * set/clear_buffer_uptodate() functions propagate buffer state into the
+ * page struct once IO has completed.
+ */
+int block_read_full_page(struct page *page, get_block_t *get_block)
+
+
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/fs.html b/refs/pull/405/merge/lectures/fs.html new file mode 100644 index 00000000..e9cc54fd --- /dev/null +++ b/refs/pull/405/merge/lectures/fs.html @@ -0,0 +1,654 @@ + + + + + + Filesystem Management — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Filesystem Management

+

View slides

+
+

Lecture objectives:

+
    +
  • Filesystem abstractions
  • +
  • Filesystem operations
  • +
  • Linux VFS
  • +
  • Overview of Linux I/O Management
  • +
+
+
+

Filesystem Abstractions

+

A filesystem is a way to organize files and directories on storage +devices such as hard disks, SSDs or flash memory. There are many types +of filesystems (e.g. FAT, ext4, btrfs, ntfs) and on one running system +we can have multiple instances of the same filesystem type in use.

+

While filesystems use different data structures to organize the +files, directories, user data and meta (internal) data on storage +devices there are a few common abstractions that are used in almost +all filesystems:

+
    +
  • superblock
  • +
  • file
  • +
  • inode
  • +
  • dentry
  • +
+

Some of these abstractions are present both on disk and in memory +while some are only present in memory.

+

The superblock abstraction contains information about the filesystem +instance such as the block size, the root inode, filesystem size. It +is present both on storage and in memory (for caching purposes).

+

The file abstraction contains information about an opened file such +as the current file pointer. It only exists in memory.

+

The inode is identifying a file on disk. It exists both on storage +and in memory (for caching purposes). An inode identifies a file in a +unique way and has various properties such as the file size, access +rights, file type, etc.

+
+

Note

+

The file name is not a property of the file.

+
+

The dentry associates a name with an inode. It exists both on +storage and in memory (for caching purposes).

+

The following diagram shows the relationship between the various filesystem +abstractions as they are used in memory:

+../_images/ditaa-29f54aaa1a85b819ff29cb7d101a4d646b3b0b06.png +

Note that not all of the one to many relationships between the various +abstractions are depicted.

+

Multiple file descriptors can point to the same file because we can +use the dup() system call to duplicate a file descriptor.

+

Multiple file abstractions can point to the same dentry if we open +the same path multiple times.

+

Multiple dentries can point to the same inode when hard links are +used.

+

The following diagram shows the relationship of the filesystem +abstraction on storage:

+../_images/ditaa-bc662dab7bb3d9ba3a37efbf69b82c513dcaadd4.png +

The diagram shows that the superblock is typically stored at the +beginning of the filesystem and that various blocks are used with +different purposes: some to store dentries, some to store inodes and +some to store user data blocks. There are also blocks used to manage +the available free blocks (e.g. bitmaps for the simple filesystems).

+

The next diagram shows a very simple filesystem where blocks are +grouped together by function:

+
    +
  • the superblock contains information about the block size as well as +the IMAP, DMAP, IZONE and DZONE areas.
  • +
  • the IMAP area is comprised of multiple blocks which contain a +bitmap for inode allocation; it maintains the allocated/free state +for all inodes in the IZONE area
  • +
  • the DMAP area is comprised of multiple blocks which contain a +bitmap for data blocks; it maintains the allocated/free state for +all blocks in the DZONE area
  • +
+

 

+../_images/ditaa-8b59fc3f5245ffb5d7089dc80cf2e306c39a62d8.png +
+
+

Filesystem Operations

+

The following diagram shows a high level overview of how the file +system drivers interact with the rest of the file system "stack". In +order to support multiple filesystem types and instances Linux +implements a large and complex subsystem that deals with filesystem +management. This is called Virtual File System (or sometimes Virtual +File Switch) and it is abbreviated with VFS.

+../_images/ditaa-6d39f541805ae8197b413ec9c79116382abc4dbc.png +

VFS translates the complex file management related system calls to +simpler operations that are implemented by the device drivers. These +are some of the operations that a file system must implement:

+
    +
  • Mount
  • +
  • Open a file
  • +
  • Querying file attributes
  • +
  • Reading data from a file
  • +
  • Writing data to a file
  • +
  • Creating a file
  • +
  • Deleting a file
  • +
+

The next sections will look in-depth at some of these operations.

+
+

Mounting a filesystem

+

A summary of a typical implementation is presented below:

+
    +
  • Input: a storage device (partition)
  • +
  • Output: dentry pointing to the root directory
  • +
  • Steps: check device, determine filesystem parameters, locate the root inode
  • +
  • Example: check magic, determine block size, read the root inode and create dentry
  • +
+
+
+

Opening a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: path
  • +
  • Output: file descriptor
  • +
  • Steps:
      +
    • Determine the filesystem type
    • +
    • For each name in the path: lookup parent dentry, load inode, +load data, find dentry
    • +
    • Create a new file that points to the last dentry
    • +
    • Find a free entry in the file descriptor table and set it to file
    • +
    +
  • +
+
+
+

Querying file attributes

+

A summary of a typical implementation is presented below:

+
    +
  • Input: path
  • +
  • Output: file attributes
  • +
  • Steps:
      +
    • Access file->dentry->inode
    • +
    • Read file attributes from the inode
    • +
    +
  • +
+
+
+

Reading data from a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: file descriptor, offset, length
  • +
  • Output: data
  • +
  • Steps:
      +
    • Access file->dentry->inode
    • +
    • Determine data blocks
    • +
    • Copy data blocks to memory
    • +
    +
  • +
+
+
+

Writing data to a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: file descriptor, offset, length, data
  • +
  • Output:
  • +
  • Steps:
      +
    • Allocate one or more data blocks
    • +
    • Add the allocated blocks to the inode and update file size
    • +
    • Copy data from userspace to internal buffers and write them to +storage
    • +
    +
  • +
+
+
+

Closing a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: file descriptor
  • +
  • Output:
  • +
  • Steps:
      +
    • set the file descriptor entry to NULL
    • +
    • Decrement file reference counter
    • +
    • When the counter reaches 0 free file
    • +
    +
  • +
+
+
+

Directories

+

Directories are special files which contain one or more dentries.

+
+
+

Creating a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: path
  • +
  • Output:
  • +
  • Steps:
      +
    • Determine the inode directory
    • +
    • Read data blocks and find space for a new dentry
    • +
    • Write back the modified inode directory data blocks
    • +
    +
  • +
+
+
+

Deleting a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: path
  • +
  • Output:
  • +
  • Steps:
      +
    • determine the parent inode
    • +
    • read parent inode data blocks
    • +
    • find and erase the dentry (check for links)
    • +
    • when last file is closed: deallocate data and inode blocks
    • +
    +
  • +
+
+
+
+

Linux Virtual File System

+

Although the main purpose for the original introduction of VFS in UNIX +kernels was to support multiple filesystem types and instances, a side +effect was that it simplified filesystem device driver development +since common parts are now implemented in the VFS. Almost all of the +caching and buffer management is dealt with by VFS, leaving just +efficient data storage management to the filesystem device driver.

+

In order to deal with multiple filesystem types, VFS introduced the +common filesystem abstractions previously presented. Note that the +filesystem driver can also use its own particular filesystem +abstractions in memory (e.g. ext4 inode or dentry) and that there +might be a different abstraction on storage as well. Thus we may end +up with three slightly different filesystem abstractions: one for +VFS - always in memory, and two for a particular filesystem - one in +memory used by the filesystem driver, and one on storage.

+../_images/ditaa-e3a27a84dde42de58bcc5c360e1c4b15062507c2.png +
+

Superblock Operations

+

VFS requires that all filesystems implement a set of "superblock +operations".

+

They deal with initializing, updating and freeing the VFS superblock:

+
+
    +
  • fill_super() - reads the filesystem statistics (e.g. total +number of inodes, free number of inodes, total number of blocks, free +number of blocks)
  • +
  • write_super() - updates the superblock information on storage +(e.g. updating the number of free inode or data blocks)
  • +
  • put_super() - free any data associated with the filesystem +instance, called when unmounting a filesystem
  • +
+
+

The next class of operations deals with manipulating filesystem +inodes. These operations will receive VFS inodes as parameters but the +filesystem driver may use its own inode structures internally and, if +so, they will convert in between them as necessary.

+

A summary of the superblock operations is presented below:

+
    +
  • fill_super
  • +
  • put_super
  • +
  • write_super
  • +
  • read_inode
  • +
+
    +
  • write_inode
  • +
  • evict_inode
  • +
  • statfs
  • +
  • remount_fs
  • +
+
+
+
+

Inode Operations

+

The next set of operations that VFS calls when interacting with +filesystem device drivers are the "inode operations". Non-intuitively +these mostly deal with manipulating dentries - looking up a file name, +creating, linking and removing files, dealing with symbolic links, +creating and removing directories.

+

This is the list of the most important inode operations:

+
    +
  • create
  • +
  • lookup
  • +
  • link
  • +
  • unlink
  • +
  • symlink
  • +
  • mkdir
  • +
+
    +
  • rmdir
  • +
  • rename
  • +
  • readlink
  • +
  • follow_link
  • +
  • put_link
  • +
  • ...
  • +
+
+
+
+

The Inode Cache

+

The inode cache is used to avoid reading and writing inodes to and +from storage every time we need to read or update them. The cache uses +a hash table and inodes are indexed with a hash function which takes +as parameters the superblock (of a particular filesystem instance) and +the inode number associated with an inode.

+

inodes are cached until either the filesystem is unmounted, the inode +deleted or the system enters a memory pressure state. When this +happens the Linux memory management system will (among other things) +free inodes from the inode cache based on how often they were +accessed.

+
    +
  • Caches inodes into memory to avoid costly storage operations
  • +
  • An inode is cached until low memory conditions are triggered
  • +
  • inodes are indexed with a hash table
  • +
  • The inode hash function takes the superblock and inode number as +inputs
  • +
+
+
+

The Dentry Cache

+
    +
  • State:
      +
    • Used – d_inode is valid and the dentry object is in use
    • +
    • Unused – d_inode is valid but the dentry object is not in use
    • +
    • Negative – d_inode is not valid; the inode was not yet loaded +or the file was erased
    • +
    +
  • +
  • Dentry cache
      +
    • List of used dentries (dentry->d_state == used)
    • +
    • List of the most recent used dentries (sorted by access time)
    • +
    • Hash table to avoid searching the tree
    • +
    +
  • +
+
+
+

The Page Cache

+
    +
  • Caches file data and not block device data
  • +
  • Uses the struct address_space to translate file offsets +to block offsets
  • +
  • Used for both read / write and mmap
  • +
  • Uses a radix tree
  • +
+
/**
+ * struct address_space - Contents of a cacheable, mappable object.
+ * @host: Owner, either the inode or the block_device.
+ * @i_pages: Cached pages.
+ * @gfp_mask: Memory allocation flags to use for allocating pages.
+ * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @nr_thps: Number of THPs in the pagecache (non-shmem only).
+ * @i_mmap: Tree of private and shared mappings.
+ * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
+ * @nrpages: Number of page entries, protected by the i_pages lock.
+ * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
+ * @writeback_index: Writeback starts here.
+ * @a_ops: Methods.
+ * @flags: Error bits and flags (AS_*).
+ * @wb_err: The most recent error which has occurred.
+ * @private_lock: For use by the owner of the address_space.
+ * @private_list: For use by the owner of the address_space.
+ * @private_data: For use by the owner of the address_space.
+ */
+struct address_space {
+  struct inode            *host;
+  struct xarray           i_pages;
+  gfp_t                   gfp_mask;
+  atomic_t                i_mmap_writable;
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+  /* number of thp, only for non-shmem files */
+  atomic_t                nr_thps;
+#endif
+  struct rb_root_cached   i_mmap;
+  struct rw_semaphore     i_mmap_rwsem;
+  unsigned long           nrpages;
+  unsigned long           nrexceptional;
+  pgoff_t                 writeback_index;
+  const struct address_space_operations *a_ops;
+  unsigned long           flags;
+  errseq_t                wb_err;
+  spinlock_t              private_lock;
+  struct list_head        private_list;
+  void                    *private_data;
+} __attribute__((aligned(sizeof(long)))) __randomize_layout;
+
+struct address_space_operations {
+  int (*writepage)(struct page *page, struct writeback_control *wbc);
+  int (*readpage)(struct file *, struct page *);
+
+  /* Write back some dirty pages from this mapping. */
+  int (*writepages)(struct address_space *, struct writeback_control *);
+
+  /* Set a page dirty.  Return true if this dirtied it */
+  int (*set_page_dirty)(struct page *page);
+
+  /*
+   * Reads in the requested pages. Unlike ->readpage(), this is
+   * PURELY used for read-ahead!.
+   */
+  int (*readpages)(struct file *filp, struct address_space *mapping,
+                  struct list_head *pages, unsigned nr_pages);
+  void (*readahead)(struct readahead_control *);
+
+  int (*write_begin)(struct file *, struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned flags,
+                          struct page **pagep, void **fsdata);
+  int (*write_end)(struct file *, struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned copied,
+                          struct page *page, void *fsdata);
+
+  /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
+  sector_t (*bmap)(struct address_space *, sector_t);
+  void (*invalidatepage) (struct page *, unsigned int, unsigned int);
+  int (*releasepage) (struct page *, gfp_t);
+  void (*freepage)(struct page *);
+  ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+  /*
+   * migrate the contents of a page to the specified target. If
+   * migrate_mode is MIGRATE_ASYNC, it must not block.
+   */
+  int (*migratepage) (struct address_space *,
+                  struct page *, struct page *, enum migrate_mode);
+  bool (*isolate_page)(struct page *, isolate_mode_t);
+  void (*putback_page)(struct page *);
+  int (*launder_page) (struct page *);
+  int (*is_partially_uptodate) (struct page *, unsigned long,
+                                  unsigned long);
+  void (*is_dirty_writeback) (struct page *, bool *, bool *);
+  int (*error_remove_page)(struct address_space *, struct page *);
+
+  /* swapfile support */
+  int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
+                          sector_t *span);
+  void (*swap_deactivate)(struct file *file);
+};
+
+
+
/**
+ * generic_file_read_iter - generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the "read_iter()" routine for all filesystems
+ * that can use the page cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
+ * be returned when no data can be read without waiting for I/O requests
+ * to complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
+ * requests shall be made for the read or for readahead.  When no data
+ * can be read, -EAGAIN shall be returned.  When readahead would be
+ * triggered, a partial, possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+
+/*
+ * Generic "read page" function for block devices that have the normal
+ * get_block functionality. This is most of the block device filesystems.
+ * Reads the page asynchronously --- the unlock_buffer() and
+ * set/clear_buffer_uptodate() functions propagate buffer state into the
+ * page struct once IO has completed.
+ */
+int block_read_full_page(struct page *page, get_block_t *get_block)
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/interrupts-slides.html b/refs/pull/405/merge/lectures/interrupts-slides.html new file mode 100644 index 00000000..aff9d470 --- /dev/null +++ b/refs/pull/405/merge/lectures/interrupts-slides.html @@ -0,0 +1,628 @@ + + + + + + + + Interrupts — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Interrupts

+ +
    +
  • Interrupts and exceptions (x86)
  • +
  • Interrupts and exceptions (Linux)
  • +
  • Deferrable work
  • +
  • Timers
  • +
+ + + + +
+
+ +

Interrupts

+ +
    +
  • synchronous, generated by executing an instruction
  • +
  • asynchronous, generated by an external event
  • +
  • maskable
      +
    • can be ignored
    • +
    • signaled via INT pin
    • +
    +
  • +
  • non-maskable
      +
    • cannot be ignored
    • +
    • signaled via NMI pin
    • +
    +
  • +
+ + + + +
+
+ +

Exceptions

+ +
    +
  • processor detected
      +
    • faults
    • +
    • traps
    • +
    • aborts
    • +
    +
  • +
  • programmed
      +
    • int n
    • +
    +
  • +
+ + + + +
+
+ +

Quiz: interrupt terminology

+ +

For each of the following terms on the left select all the terms +from the right that best describe them.

+
    +
  • Watchdog
  • +
  • Demand paging
  • +
  • Division by zero
  • +
  • Timer
  • +
  • System call
  • +
  • Breakpoint
  • +
+
    +
  • Exception
  • +
  • Interrupt
  • +
  • Maskable
  • +
  • Nonmaskable
  • +
  • Trap
  • +
  • Fault
  • +
+
+ + + + +
+
+ +

Programmable Interrupt Controller

+ +

 

+../_images/ditaa-5db1739b80a83b12505e4ff749b5e69fccd01f1b.png + + + + +
+
+ +

Interrupt controllers in SMP systems

+ +

 

+../_images/ditaa-9d23d02ebdff6eeb6bec8044480f055de9852ecc.png + + + + +
+
+ +

Enabling/disabling the interrupts

+ +
    +
  • at the device level
      +
    • by programming the device control registers
    • +
    +
  • +
  • at the PIC level
      +
    • PIC can be programmed to disable a given IRQ line
    • +
    +
  • +
  • at the CPU level; for example, on x86 one can use the following +instructions:
  • +
+
+
    +
  • cli (CLear Interrupt flag)
  • +
  • sti (SeT Interrupt flag)
  • +
+
+ + + + +
+
+ +

Interrupt priorities

+ +

 

+../_images/ditaa-8b00a68b494f72d54b5fad38c88f7265aadaaa0e.png + + + + +
+
+ +

Quiz: hardware concepts

+ +

Which of the following statements are true?

+
    +
  • The CPU can start processing a new interrupt before the current +one is finished
  • +
  • Interrupts can be disabled at the device level
  • +
  • Lower priority interrupts can not preempt handlers for higher +priority interrupts
  • +
  • Interrupts can be disabled at the interrupt controller level
  • +
  • On SMP systems the same interrupt can be routed to different CPUs
  • +
  • Interrupts can be disabled at the CPU level
  • +
+ + + + +
+
+ +

Interrupt Descriptor Table

+ +
    +
  • it is used as a jump table by the CPU when a given vector is triggered
  • +
  • it is an array of 256 x 8 bytes entries
  • +
  • may reside anywhere in physical memory
  • +
  • processor locates IDT by the means of IDTR
  • +
+ + + + +
+
+ +

Linux IRQ vector layout

+ +

 

+../_images/ditaa-5b3c93f6e612d0cc0e4d4837d92a443627405262.png + + + + +
+
+ +

Interrupt descriptor table entry (gate)

+ +

 

+../_images/ditaa-eff5e0e3b58ce239d5310b22b89c0927be5853bd.png + + + + +
+
+ +

Interrupt handler address

+ +

 

+../_images/ditaa-b2023fce22479e20bbe08fd76eed87e9a0527688.png + + + + +
+
+ +

Interrupt handler stack

+ +

 

+../_images/ditaa-85b69602726fa6143fc3ba0ffdb492454864aacf.png + + + + +
+
+ +

Handling an interrupt request

+ +
    +
  • CPU checks the current privilege level

    +
  • +
  • if need to change privilege level

    +
    +
      +
    • change stack with the one associated with new privilege
    • +
    • save old stack information on the new stack
    • +
    +
    +
  • +
  • save EFLAGS, CS, EIP on stack

    +
  • +
  • save error code on stack in case of an abort

    +
  • +
  • execute the kernel interrupt handler

    +
  • +
+ + + + +
+
+ +

Returning from an interrupt

+ +
    +
  • pop the error code (in case of an abort)
  • +
  • call IRET
      +
    • pops values from the stack and restores the following registers: CS, EIP, EFLAGS
    • +
    • if privilege level changed returns to the old stack and old privilege level
    • +
    +
  • +
+ + + + +
+
+ +

Inspecting the x86 interrupt handling

+ +

 

+ + + + +
+
+ +

Quiz: x86 interrupt handling

+ +

The following gdb commands are used to determine the handler for +the int80 based system call exception. Select and arrange the +commands or output of the commands in the correct order.

+
(void *) 0xc15de780 <entry_SYSENTER_32>
+
+set $idtr_addr=($idtr_entry>>48<<16)|($idtr_entry&0xffff)
+
+print (void*)$idtr_addr
+
+set $idtr = 0xff800000
+
+(void *) 0xc15de874 <entry_INT80_32>
+
+set $idtr = 0xff801000
+
+set $idtr_entry = *(uint64_t*)($idtr + 8 * 128)
+
+monitor info registers
+
+
+ + + + +
+
+ +

Interrupt handling in Linux

+ +

 

+../_images/ditaa-da31e3d17a4d55e5c3dbc0bd5903306418a896ca.png + + + + +
+
+ +

IRQ and exception nesting in Linux

+ +
    +
  • an exception (e.g. page fault, system call) can not preempt an interrupt; +if that occurs it is considered a bug
  • +
  • an interrupt can preempt an exception
  • +
  • an interrupt can not preempt another interrupt (it used to be possible)
  • +
+ + + + +
+
+ +

Interrupt/Exception nesting

+ +

 

+../_images/ditaa-2e49ca6ac606dab4b2b53231cfbe85ff06312d36.png + + + + +
+
+ +

Interrupt context

+ +
+
    +
  • it runs as a result of an IRQ (not of an exception)
  • +
  • there is no well defined process context associated
  • +
  • not allowed to trigger a context switch (no sleep, schedule, or user memory access)
  • +
+
+ + + + +
+
+ +

Deferrable actions

+ +
+
    +
  • Schedule callback functions to run at a later time
  • +
  • Interrupt context deferrable actions
  • +
  • Process context deferrable actions
  • +
  • APIs for initialization, scheduling, and masking
  • +
+
+ + + + +
+
+ +

Soft IRQs

+ +
+

Soft IRQ APIs:

+
+
    +
  • initialize: open_softirq()
  • +
  • activation: raise_softirq()
  • +
  • masking: local_bh_disable(), local_bh_enable()
  • +
+
+

Once activated, the callback function do_softirq() runs either:

+
+
    +
  • after an interrupt handler or
  • +
  • from the ksoftirqd kernel thread
  • +
+
+
+ + + + +
+
+ +

ksoftirqd

+ +
+
    +
  • minimum priority kernel thread
  • +
  • runs softirqs after certain limits are reached
  • +
  • tries to achieve good latency and avoid process starvation
  • +
+
+ + + + +
+
+ +

Types of soft IRQs

+ +
/* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
+   frequency threaded job scheduling. For almost all the purposes
+   tasklets are more than enough. F.e. all serial device BHs et
+   al. should be converted to tasklets, not to softirqs.
+*/
+
+enum
+{
+  HI_SOFTIRQ=0,
+  TIMER_SOFTIRQ,
+  NET_TX_SOFTIRQ,
+  NET_RX_SOFTIRQ,
+  BLOCK_SOFTIRQ,
+  IRQ_POLL_SOFTIRQ,
+  TASKLET_SOFTIRQ,
+  SCHED_SOFTIRQ,
+  HRTIMER_SOFTIRQ,
+  RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */
+
+  NR_SOFTIRQS
+};
+
+
+ + + + +
+
+ +

Packet flood example

+ +

 

+ + + + +
+
+ +

Tasklets

+ +

Tasklets are a dynamic type (not limited to a fixed number) of +deferred work running in interrupt context.

+

Tasklets API:

+
+
    +
  • initialization: tasklet_init()
  • +
  • activation: tasklet_schedule()
  • +
  • masking: tasklet_disable(), tasklet_enable()
  • +
+
+

Tasklets are implemented on top of two dedicated softirqs: +TASKLET_SOFTIRQ and HI_SOFTIRQ

+

Tasklets are also serialized, i.e. the same tasklet can only execute on one processor.

+ + + + +
+
+ +

Workqueues

+ +

Workqueues are a type of deferred work that runs in process context.

+

They are implemented on top of kernel threads.

+

Workqueues API:

+
+
    +
  • init: INIT_WORK
  • +
  • activation: schedule_work()
  • +
+
+ + + + +
+
+ +

Timers

+ +
+

Timers are implemented on top of the TIMER_SOFTIRQ

+

Timer API:

+
    +
  • initialization: setup_timer()
  • +
  • activation: mod_timer()
  • +
+
+ + + + +
+
+ +

Deferrable actions summary

+ +
+
    +
  • softIRQ
      +
    • runs in interrupt context
    • +
    • statically allocated
    • +
    • same handler may run in parallel on multiple cores
    • +
    +
  • +
  • tasklet
      +
    • runs in interrupt context
    • +
    • can be dynamically allocated
    • +
    • same handler runs are serialized
    • +
    +
  • +
  • workqueues
      +
    • run in process context
    • +
    +
  • +
+
+ + + + +
+
+ +

Quiz: Linux interrupt handling

+ +

Which of the following phases of interrupt handling runs with +interrupts disabled at the CPU level?

+
    +
  • Critical
  • +
  • Immediate
  • +
  • Deferred
  • +
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/interrupts.html b/refs/pull/405/merge/lectures/interrupts.html new file mode 100644 index 00000000..5ce40a8f --- /dev/null +++ b/refs/pull/405/merge/lectures/interrupts.html @@ -0,0 +1,755 @@ + + + + + + Interrupts — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Interrupts

+

View slides

+
+

Lecture objectives

+
    +
  • Interrupts and exceptions (x86)
  • +
  • Interrupts and exceptions (Linux)
  • +
  • Deferrable work
  • +
  • Timers
  • +
+
+
+

What is an interrupt?

+

An interrupt is an event that alters the normal execution flow of a +program and can be generated by hardware devices or even by the CPU +itself. When an interrupt occurs the current flow of execution is +suspended and interrupt handler runs. After the interrupt handler runs +the previous execution flow is resumed.

+

Interrupts can be grouped into two categories based on the source of +the interrupt. They can also be grouped into two other categories based +on the ability to postpone or temporarily disable the interrupt:

+
    +
  • synchronous, generated by executing an instruction
  • +
  • asynchronous, generated by an external event
  • +
  • maskable
      +
    • can be ignored
    • +
    • signaled via INT pin
    • +
    +
  • +
  • non-maskable
      +
    • cannot be ignored
    • +
    • signaled via NMI pin
    • +
    +
  • +
+

Synchronous interrupts, usually named exceptions, handle conditions detected by the +processor itself in the course of executing an instruction. Divide by zero or +a system call are examples of exceptions.

+

Asynchronous interrupts, usually named interrupts, are external events generated +by I/O devices. For example a network card generates an interrupt to signal +that a packet has arrived.

+

Most interrupts are maskable, which means we can temporarily postpone +running the interrupt handler when we disable the interrupt until the +time the interrupt is re-enabled. However, there are a few critical +interrupts that can not be disabled/postponed.

+
+

Exceptions

+

There are two sources for exceptions:

+
    +
  • processor detected
      +
    • faults
    • +
    • traps
    • +
    • aborts
    • +
    +
  • +
  • programmed
      +
    • int n
    • +
    +
  • +
+

Processor detected exceptions are raised when an abnormal condition is +detected while executing an instruction.

+

A fault is a type of exception that is reported before the execution of the +instruction and can be usually corrected. The saved EIP is the address of +the instruction that caused the fault, so after the fault is corrected +the program can re-execute the faulty instruction. (e.g page fault).

+

A trap is a type of exception that is reported after the execution of the +instruction in which the exception was detected. The saved EIP is the address +of the instruction after the instruction that caused the trap. (e.g debug trap).

+
+
+

Quiz: interrupt terminology

+

For each of the following terms on the left select all the terms +from right that best describe them.

+
    +
  • Watchdog
  • +
  • Demand paging
  • +
  • Division by zero
  • +
  • Timer
  • +
  • System call
  • +
  • Breakpoint
  • +
+
    +
  • Exception
  • +
  • Interrupt
  • +
  • Maskable
  • +
  • Nonmaskable
  • +
  • Trap
  • +
  • Fault
  • +
+
+
+
+
+

Hardware Concepts

+
+

Programmable Interrupt Controller

+

 

+../_images/ditaa-5db1739b80a83b12505e4ff749b5e69fccd01f1b.png +

A device supporting interrupts has an output pin used for signaling an Interrupt ReQuest. IRQ +pins are connected to a device named Programmable Interrupt Controller (PIC) which is connected +to CPU's INTR pin.

+

A PIC usually has a set of ports used to exchange information with the CPU. When a device +connected to one of the PIC's IRQ lines needs CPU attention the following flow happens:

+
+
    +
  • device raises an interrupt on the corresponding IRQn pin
  • +
  • PIC converts the IRQ into a vector number and writes it to a port for CPU to read
  • +
  • PIC raises an interrupt on CPU INTR pin
  • +
  • PIC waits for CPU to acknowledge an interrupt before raising another interrupt
  • +
  • CPU acknowledges the interrupt then it starts handling the interrupt
  • +
+
+

We will see later how the CPU handles the interrupt. Notice that by +design the PIC won't raise another interrupt until the CPU has acknowledged +the current interrupt.

+
+

Note

+

Once the interrupt is acknowledged by the CPU the interrupt +controller can request another interrupt, regardless of whether the CPU +has finished handling the previous interrupt or not. Thus, depending on +how the OS controls the CPU it is possible to have nested +interrupts.

+
+

The interrupt controller allows each IRQ line to be individually +disabled. This allows simplifying design by making sure that interrupt +handlers are always executed serially.

+
+
+

Interrupt controllers in SMP systems

+

In SMP systems we may have multiple interrupt controllers in the +systems.

+

For example, on the x86 architecture each core has a local APIC used +to process interrupts from locally connected devices like timers or +thermal sensors. Then there is an I/O APIC that is used to distribute IRQs +from external devices to CPU cores.

+

 

+../_images/ditaa-9d23d02ebdff6eeb6bec8044480f055de9852ecc.png +
+
+

Interrupt Control

+

In order to synchronize access to shared data between the interrupt handler +and other potential concurrent activities such as driver initialization or +driver data processing, it is often required to enable and disable interrupts in +a controlled fashion.

+

This can be accomplished at several levels:

+
    +
  • at the device level
      +
    • by programming the device control registers
    • +
    +
  • +
  • at the PIC level
      +
    • PIC can be programmed to disable a given IRQ line
    • +
    +
  • +
  • at the CPU level; for example, on x86 one can use the following +instructions:
  • +
+
+
    +
  • cli (CLear Interrupt flag)
  • +
  • sti (SeT Interrupt flag)
  • +
+
+
+
+

Interrupt priorities

+

Most architectures also support interrupt priorities. When this is +enabled, it permits interrupt nesting only for those interrupts that +have a higher priority than the current priority level.

+

 

+../_images/ditaa-8b00a68b494f72d54b5fad38c88f7265aadaaa0e.png +
+

Note

+

Not all architectures support interrupt priorities. It is also +difficult to define a generic scheme for interrupt +priorities for general use OSes and some kernels (Linux included) +do not use interrupt priorities. On the other hand most RTOS use +interrupt priorities since they are typically used in more +constrained use-cases where it is easier to define interrupt +priorities.

+
+
+
+

Quiz: hardware concepts

+

Which of the following statements are true?

+
    +
  • The CPU can start processing a new interrupt before the current +one is finished
  • +
  • Interrupts can be disabled at the device level
  • +
  • Lower priority interrupts can not preempt handlers for higher +priority interrupts
  • +
  • Interrupts can be disabled at the interrupt controller level
  • +
  • On SMP systems the same interrupt can be routed to different CPUs
  • +
  • Interrupts can be disabled at the CPU level
  • +
+
+
+
+

Interrupt handling on the x86 architecture

+

This section will examine how interrupts are handled by the CPU on the +x86 architecture.

+
+

Interrupt Descriptor Table

+

The interrupt descriptor table (IDT) associates each interrupt or exception +identifier with a descriptor for the instructions that service the associated +event. We will name the identifier as vector number and the associated +instructions as interrupt/exception handler.

+

An IDT has the following characteristics:

+
    +
  • it is used as a jump table by the CPU when a given vector is triggered
  • +
  • it is an array of 256 x 8 bytes entries
  • +
  • may reside anywhere in physical memory
  • +
  • processor locates IDT by the means of IDTR
  • +
+

Below we can find Linux IRQ vector layout. The first 32 entries are reserved +for exceptions, vector 128 is used for syscall interface and the rest are +used mostly for hardware interrupts handlers.

+

 

+../_images/ditaa-5b3c93f6e612d0cc0e4d4837d92a443627405262.png +

On x86 an IDT entry has 8 bytes and it is named gate. There can be 3 types of gates:

+
+
    +
  • interrupt gate, holds the address of an interrupt or exception handler. +Jumping to the handler disables maskable interrupts (IF flag is cleared).
  • +
  • trap gates, similar to an interrupt gate but it does not disable maskable +interrupts while jumping to interrupt/exception handler.
  • +
  • task gates (not used in Linux)
  • +
+
+

Let's have a look at several fields of an IDT entry:

+
+
    +
  • segment selector, index into GDT/LDT to find the start of the code segment where +the interrupt handlers reside
  • +
  • offset, offset inside the code segment
  • +
  • T, represents the type of gate
  • +
  • DPL, minimum privilege required for using the segments content.
  • +
+
+

 

+../_images/ditaa-eff5e0e3b58ce239d5310b22b89c0927be5853bd.png +
+
+

Interrupt handler address

+

In order to find the interrupt handler address we first need to find the start +address of the code segment where interrupt handler resides. For this we +use the segment selector to index into GDT/LDT where we can find the corresponding +segment descriptor. This will provide the start address kept in the 'base' field. +Using base address and the offset we can now go to the start of the interrupt handler.

+

 

+../_images/ditaa-b2023fce22479e20bbe08fd76eed87e9a0527688.png +
+
+

Stack of interrupt handler

+

Similar to control transfer to a normal function, a control transfer +to an interrupt or exception handler uses the stack to store the +information needed for returning to the interrupted code.

+

As can be seen in the figure below, an interrupt pushes the EFLAGS register +before saving the address of the interrupted instruction. Certain types +of exceptions also cause an error code to be pushed on the stack to help +debug the exception.

+

 

+../_images/ditaa-85b69602726fa6143fc3ba0ffdb492454864aacf.png +
+
+

Handling an interrupt request

+

After an interrupt request has been generated the processor runs a sequence of +events that eventually end up with running the kernel interrupt handler:

+
    +
  • CPU checks the current privilege level

    +
  • +
  • if need to change privilege level

    +
    +
      +
    • change stack with the one associated with new privilege
    • +
    • save old stack information on the new stack
    • +
    +
    +
  • +
  • save EFLAGS, CS, EIP on stack

    +
  • +
  • save error code on stack in case of an abort

    +
  • +
  • execute the kernel interrupt handler

    +
  • +
+
+
+

Returning from an interrupt handler

+

Most architectures offer special instructions to clean up the stack and resume +the execution after the interrupt handler has been executed. On x86 IRET is used +to return from an interrupt handler. IRET is similar to RET except that IRET +increments ESP by extra four bytes (because of the flags on stack) and moves the +saved flags into EFLAGS register.

+

To resume the execution after an interrupt the following sequence is used (x86):

+
    +
  • pop the error code (in case of an abort)
  • +
  • call IRET
      +
    • pops values from the stack and restore the following register: CS, EIP, EFLAGS
    • +
    • if privilege level changed returns to the old stack and old privilege level
    • +
    +
  • +
+
+
+

Inspecting the x86 interrupt handling

+

 

+
+
+

Quiz: x86 interrupt handling

+

The following gdb commands are used to determine the handler for +the int80 based system call exception. Select and arrange the +commands or output of the commands in the correct order.

+
(void *) 0xc15de780 <entry_SYSENTER_32>
+
+set $idtr_addr=($idtr_entry>>48<<16)|($idtr_entry&0xffff)
+
+print (void*)$idtr_addr
+
+set $idtr = 0xff800000
+
+(void *) 0xc15de874 <entry_INT80_32>
+
+set $idtr = 0xff801000
+
+set $idtr_entry = *(uint64_t*)($idtr + 8 * 128)
+
+monitor info registers
+
+
+
+
+
+

Interrupt handling in Linux

+

In Linux the interrupt handling is done in three phases: critical, immediate and +deferred.

+

In the first phase the kernel will run the generic interrupt handler that +determines the interrupt number, the interrupt handler for this particular +interrupt and the interrupt controller. At this point any timing critical +actions will also be performed (e.g. acknowledge the interrupt at the interrupt +controller level). Local processor interrupts are disabled for the duration of +this phase and continue to be disabled in the next phase.

+

In the second phase, all of the device driver's handlers associated with this +interrupt will be executed. At the end of this phase, the interrupt controller's +"end of interrupt" method is called to allow the interrupt controller to +reassert this interrupt. The local processor interrupts are enabled at this +point.

+
+

Note

+

It is possible that one interrupt is associated with multiple +devices and in this case it is said that the interrupt is +shared. Usually, when using shared interrupts it is the +responsibility of the device driver to determine if the interrupt +is targeted at its device or not.

+
+

Finally, in the last phase of interrupt handling interrupt context deferrable +actions will be run. These are also sometimes known as "bottom half" of the +interrupt (the upper half being the part of the interrupt handling that runs +with interrupts disabled). At this point, interrupts are enabled on the local +processor.

+

 

+../_images/ditaa-da31e3d17a4d55e5c3dbc0bd5903306418a896ca.png +
+

Nested interrupts and exceptions

+

Linux used to support nested interrupts but this was removed some time +ago in order to avoid increasingly complex solutions to stack +overflows issues - allow just one level of nesting, allow multiple +levels of nesting up to a certain kernel stack depth, etc.

+

However, it is still possible to have nesting between exceptions and +interrupts but the rules are fairly restrictive:

+
    +
  • an exception (e.g. page fault, system call) can not preempt an interrupt; +if that occurs it is considered a bug
  • +
  • an interrupt can preempt an exception
  • +
  • an interrupt can not preempt another interrupt (it used to be possible)
  • +
+

The diagram below shows the possible nesting scenarios:

+

 

+../_images/ditaa-2e49ca6ac606dab4b2b53231cfbe85ff06312d36.png +
+
+

Interrupt context

+

While an interrupt is handled (from the time the CPU jumps to the interrupt +handler until the interrupt handler returns - e.g. IRET is issued) it is said +that code runs in "interrupt context".

+

Code that runs in interrupt context has the following properties:

+
+
    +
  • it runs as a result of an IRQ (not of an exception)
  • +
  • there is no well defined process context associated
  • +
  • not allowed to trigger a context switch (no sleep, schedule, or user memory access)
  • +
+
+
+
+

Deferrable actions

+

Deferrable actions are used to run callback functions at a later time. If +deferrable actions are scheduled from an interrupt handler, the associated callback +function will run after the interrupt handler has completed.

+

There are two large categories of deferrable actions: those that run in +interrupt context and those that run in process context.

+

The purpose of interrupt context deferrable actions is to avoid doing too much +work in the interrupt handler function. Running for too long with interrupts +disabled can have undesired effects such as increased latency or poor system +performance due to missing other interrupts (e.g. dropping network packets +because the CPU did not react in time to dequeue packets from the network +interface and the network card buffer is full).

+

Deferrable actions have APIs to: initialize an instance, activate or +schedule the action and mask/disable and unmask/enable the execution +of the callback function. The latter is used for synchronization purposes between +the callback function and other contexts.

+

Typically the device driver will initialize the deferrable action +structure during the device instance initialization and will activate +/ schedule the deferrable action from the interrupt handler.

+
+
+

Soft IRQs

+

Soft IRQs is the term used for the low-level mechanism that implements deferring +work from interrupt handlers but that still runs in interrupt context.

+
+

Soft IRQ APIs:

+
+
    +
  • initialize: open_softirq()
  • +
  • activation: raise_softirq()
  • +
  • masking: local_bh_disable(), local_bh_enable()
  • +
+
+

Once activated, the callback function do_softirq() runs either:

+
+
    +
  • after an interrupt handler or
  • +
  • from the ksoftirqd kernel thread
  • +
+
+
+

Since softirqs can reschedule themselves or other interrupts can occur that +reschedule them, they can potentially lead to (temporary) process starvation if +checks are not put into place. Currently, the Linux kernel does not allow +running soft irqs for more than MAX_SOFTIRQ_TIME or rescheduling for +more than MAX_SOFTIRQ_RESTART consecutive times.

+

Once these limits are reached a special kernel thread, ksoftirqd is woken up +and all of the rest of pending soft irqs will be run from the context of this +kernel thread.

+

Soft irqs usage is restricted, they are used by a handful of subsystems that have +low latency requirements and high frequency:

+
/* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
+   frequency threaded job scheduling. For almost all the purposes
+   tasklets are more than enough. F.e. all serial device BHs et
+   al. should be converted to tasklets, not to softirqs.
+*/
+
+enum
+{
+  HI_SOFTIRQ=0,
+  TIMER_SOFTIRQ,
+  NET_TX_SOFTIRQ,
+  NET_RX_SOFTIRQ,
+  BLOCK_SOFTIRQ,
+  IRQ_POLL_SOFTIRQ,
+  TASKLET_SOFTIRQ,
+  SCHED_SOFTIRQ,
+  HRTIMER_SOFTIRQ,
+  RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */
+
+  NR_SOFTIRQS
+};
+
+
+
+
+

Packet flood example

+

The following screencast will look at what happens when we flood the +system with a large number of packets. Since at least a part of the +packet processing is happening in softirq we should expect the CPU to +spend most of the time running softirqs but the majority of that +should be in the context of the ksoftirqd thread.

+

 

+
+
+

Tasklets

+

Tasklets are a dynamic type (not limited to a fixed number) of +deferred work running in interrupt context.

+

Tasklets API:

+
+
    +
  • initialization: tasklet_init()
  • +
  • activation: tasklet_schedule()
  • +
  • masking: tasklet_disable(), tasklet_enable()
  • +
+
+

Tasklets are implemented on top of two dedicated softirqs: +TASKLET_SOFTIRQ and HI_SOFTIRQ

+

Tasklets are also serialized, i.e. the same tasklet can only execute on one processor.

+
+
+

Workqueues

+
+

Workqueues are a type of deferred work that runs in process context.

+

They are implemented on top of kernel threads.

+

Workqueues API:

+
+
    +
  • init: INIT_WORK
  • +
  • activation: schedule_work()
  • +
+
+
+
+
+

Timers

+
+

Timers are implemented on top of the TIMER_SOFTIRQ

+

Timer API:

+
    +
  • initialization: setup_timer()
  • +
  • activation: mod_timer()
  • +
+
+
+
+

Deferrable actions summary

+

Here is a cheat sheet which summarizes Linux deferrable actions:

+
+
    +
  • softIRQ
      +
    • runs in interrupt context
    • +
    • statically allocated
    • +
    • same handler may run in parallel on multiple cores
    • +
    +
  • +
  • tasklet
      +
    • runs in interrupt context
    • +
    • can be dynamically allocated
    • +
    • same handler runs are serialized
    • +
    +
  • +
  • workqueues
      +
    • run in process context
    • +
    +
  • +
+
+
+
+

Quiz: Linux interrupt handling

+

Which of the following phases of interrupt handling runs with +interrupts disabled at the CPU level?

+
    +
  • Critical
  • +
  • Immediate
  • +
  • Deferred
  • +
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/intro-slides.html b/refs/pull/405/merge/lectures/intro-slides.html new file mode 100644 index 00000000..5589ceb6 --- /dev/null +++ b/refs/pull/405/merge/lectures/intro-slides.html @@ -0,0 +1,503 @@ + + + + + + + + Introduction — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Introduction

+ +
    +
  • Basic operating systems terms and concepts
  • +
  • Overview of the Linux kernel
  • +
+ + + + +
+
+ +

User vs Kernel

+ +
    +
  • Execution modes
      +
    • Kernel mode
    • +
    • User mode
    • +
    +
  • +
  • Memory protection
      +
    • Kernel-space
    • +
    • User-space
    • +
    +
  • +
+ + + + +
+
+ +

Typical operating system architecture

+ +../_images/ditaa-48374873962ca32ada36c14ab9a83b60f112a1e0.png + + + + +
+
+ +

Monolithic kernel

+ +../_images/ditaa-3dc899167df5e16a230c434cf5d6964cb5868482.png + + + + +
+
+ +

Micro-kernel

+ +../_images/ditaa-c8a3d93d0109b7be6f608871d16adff4aaa933da.png + + + + +
+
+ +

Monolithic kernels can be modular

+ +
    +
  • Components can be enabled or disabled at compile time
  • +
  • Support of loadable kernel modules (at runtime)
  • +
  • Organize the kernel in logical, independent subsystems
  • +
  • Strict interfaces but with low performance overhead: macros, +inline functions, function pointers
  • +
+ + + + +
+
+ +

"Hybrid" kernels

+ +

Many operating systems and kernel experts have dismissed the label +as meaningless, and just marketing. Linus Torvalds said of this +issue:

+

"As to the whole 'hybrid kernel' thing - it's just marketing. It's +'oh, those microkernels had good PR, how can we try to get good PR +for our working kernel? Oh, I know, let's use a cool name and try +to imply that it has all the PR advantages that that other system +has'."

+ + + + +
+
+ +

Address space

+ +
    +
  • Physical address space
      +
    • RAM and peripheral memory
    • +
    +
  • +
  • Virtual address space
      +
    • How the CPU sees the memory (when in protected / paging mode)
    • +
    • Process address space
    • +
    • Kernel address space
    • +
    +
  • +
+ + + + +
+ +
+ +

Execution contexts

+ +
    +
  • Process context
      +
    • Code that runs in user mode, part of a process
    • +
    • Code that runs in kernel mode, as a result of a system call +issued by a process
    • +
    +
  • +
  • Interrupt context
      +
    • Code that runs as a result of an interrupt
    • +
    • Always runs in kernel mode
    • +
    +
  • +
+ + + + +
+
+ +

Multi-tasking

+ +
    +
  • An OS that supports the "simultaneous" execution of multiple processes
  • +
  • Implemented by fast switching between running processes to allow +the user to interact with each program
  • +
  • Implementation:
      +
    • Cooperative
    • +
    • Preemptive
    • +
    +
  • +
+ + + + +
+
+ +

Preemptive kernel

+ +

Preemptive multitasking and preemptive kernels are different terms.

+

A kernel is preemptive if a process can be preempted while running +in kernel mode.

+

However, note that non-preemptive kernels may support preemptive +multitasking.

+ + + + +
+
+ +

Pageable kernel memory

+ +

A kernel supports pageable kernel memory if parts of kernel memory +(code, data, stack or dynamically allocated memory) can be swapped +to disk.

+ + + + +
+
+ +

Kernel stack

+ +

Each process has a kernel stack that is used to maintain the +function call chain and local variables state while it is executing +in kernel mode, as a result of a system call.

+

The kernel stack is small (4KB - 12 KB) so the kernel developer has +to avoid allocating large structures on stack or recursive calls +that are not properly bounded.

+ + + + +
+
+ +

Portability

+ +
    +
  • Architecture and machine specific code (C & ASM)
  • +
  • Independent architecture code (C):
      +
    • kernel core (further split in multiple subsystems)
    • +
    • device drivers
    • +
    +
  • +
+ + + + +
+
+ +

Asymmetric MultiProcessing (ASMP)

+ +../_images/ditaa-cb16db58a2489307b74d4f70256a48c81c65f6c6.png + + + + +
+
+ +

Symmetric MultiProcessing (SMP)

+ +../_images/ditaa-08aff771b3ff7a5525df7b0c090e28c836502788.png + + + + +
+
+ +

CPU Scalability

+ +
    +
  • Use lock free algorithms when possible
  • +
  • Use fine grained locking for high contention areas
  • +
  • Pay attention to algorithm complexity
  • +
+ + + + +
+
+ +

Linux development model

+ +
    +
  • Open source, GPLv2 License
  • +
  • Contributors: companies, academia and independent developers
  • +
  • Development cycle: 3 – 4 months which consists of a 1 - 2 week +merge window followed by bug fixing
  • +
  • Features are only allowed in the merge window
  • +
  • After the merge window a release candidate is done on a weekly +basis (rc1, rc2, etc.)
  • +
+ + + + +
+
+ +

Maintainer hierarchy

+ +
    +
  • Linus Torvalds is the maintainer of the Linux kernel and merges pull +requests from subsystem maintainers
  • +
  • Each subsystem has one or more maintainers that accept patches or +pull requests from developers or device driver maintainers
  • +
  • Each maintainer has its own git tree, e.g.:
      +
    • Linus Torvalds: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    • +
    • David Miller (networking): git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/
    • +
    +
  • +
  • Each subsystem may maintain a -next tree where developers can submit +patches for the next merge window
  • +
+ + + + +
+
+ +

Linux source code layout

+ +../_images/ditaa-f45246aade5ecc7cfb71f7f103a57f95fc7c2b9e.png + + + + +
+
+ +

Linux kernel architecture

+ +../_images/ditaa-b9ffae65be16d30be11b5eca188a7a143b1b8227.png + + + + +
+
+ +

arch

+ +
    +
  • Architecture specific code
  • +
  • May be further sub-divided in machine specific code
  • +
  • Interfacing with the boot loader and architecture specific +initialization
  • +
  • Access to various hardware bits that are architecture or machine +specific such as interrupt controller, SMP controllers, BUS +controllers, exceptions and interrupt setup, virtual memory handling
  • +
  • Architecture optimized functions (e.g. memcpy, string operations, +etc.)
  • +
+ + + + +
+
+ +

Device drivers

+ +
    +
  • Unified device model
  • +
  • Each subsystem has its own specific driver interfaces
  • +
  • Many device driver types (TTY, serial, SCSI, filesystem, ethernet, +USB, framebuffer, input, sound, etc.)
  • +
+ + + + +
+
+ +

Process management

+ +
    +
  • Unix basic process management and POSIX threads support
  • +
  • Processes and threads are abstracted as tasks
  • +
  • Operating system level virtualization
      +
    • Namespaces
    • +
    • Control groups
    • +
    +
  • +
+ + + + +
+
+ +

Memory management

+ +
    +
  • Management of the physical memory: allocating and freeing memory
  • +
  • Management of the virtual memory: paging, swapping, demand +paging, copy on write
  • +
  • User services: user address space management (e.g. mmap(), brk(), +shared memory)
  • +
  • Kernel services: SL*B allocators, vmalloc
  • +
+ + + + +
+
+ +

Block I/O management

+ +../_images/ditaa-0a96997f269a7a9cd0cdc9c9125f6e62e549be94.png + + + + +
+
+ +

Virtual Filesystem Switch

+ +../_images/ditaa-afa57a07e21b1b842554278abe30fea575278452.png + + + + +
+
+ +

Networking stack

+ +../_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png + + + + +
+
+ +

Linux Security Modules

+ +
    +
  • Hooks to extend the default Linux security model
  • +
  • Used by several Linux security extensions:
      +
    • Security Enhanced Linux
    • +
    • AppArmor
    • +
    • Tomoyo
    • +
    • Smack
    • +
    +
  • +
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/intro.html b/refs/pull/405/merge/lectures/intro.html new file mode 100644 index 00000000..3661f8c2 --- /dev/null +++ b/refs/pull/405/merge/lectures/intro.html @@ -0,0 +1,710 @@ + + + + + + Introduction — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Introduction

+

View slides

+
+

Lecture objectives:

+
    +
  • Basic operating systems terms and concepts
  • +
  • Overview of the Linux kernel
  • +
+
+
+

Basic operating systems terms and concepts

+
+

User vs Kernel

+

Kernel and user are two terms that are often used in operating +systems. Their definition is pretty straightforward: The kernel is +the part of the operating system that runs with higher privileges +while user (space) usually means applications running with low +privileges.

+

However these terms are heavily overloaded and might have very +specific meanings in some contexts.

+

User mode and kernel mode are terms that may refer specifically to the +processor execution mode. Code that runs in kernel mode can fully +[1] control the CPU while code that runs in user mode has +certain limitations. For example, local CPU interrupts can only be +disabled or enabled while running in kernel mode. If such an operation +is attempted while running in user mode an exception will be generated +and the kernel will take over to handle it.

+ + + + + +
[1]some processors may have even higher privileges than +kernel mode, e.g. a hypervisor mode, that is only +accessible to code running in a hypervisor (virtual +machine monitor)
+

User space and kernel space may refer specifically to memory +protection or to virtual address spaces associated with either the +kernel or user applications.

+

Grossly simplifying, the kernel space is the memory area that is +reserved to the kernel while user space is the memory area reserved to +a particular user process. The kernel space is access protected so +that user applications can not access it directly, while user space +can be directly accessed from code running in kernel mode.

+
+
+

Typical operating system architecture

+

In the typical operating system architecture (see the figure below) +the operating system kernel is responsible for accessing and sharing the +hardware in a secure and fair manner with multiple applications.

+../_images/ditaa-48374873962ca32ada36c14ab9a83b60f112a1e0.png +

The kernel offers a set of APIs that applications issue which are +generally referred to as "System Calls". These APIs are different from +regular library APIs because they are the boundary at which the +execution mode switches from user mode to kernel mode.

+

In order to provide application compatibility, system calls are rarely +changed. Linux particularly enforces this (as opposed to in kernel +APIs that can change as needed).

+

The kernel code itself can be logically separated in core kernel +code and device drivers code. Device drivers code is responsible for +accessing particular devices while the core kernel code is +generic. The core kernel can be further divided into multiple logical +subsystems (e.g. file access, networking, process management, etc.)

+
+
+

Monolithic kernel

+

A monolithic kernel is one where there is no access protection between +the various kernel subsystems and where public functions can be +directly called between various subsystems.

+../_images/ditaa-3dc899167df5e16a230c434cf5d6964cb5868482.png +

However, most monolithic kernels do enforce a logical separation +between subsystems especially between the core kernel and device +drivers with relatively strict APIs (but not necessarily fixed in +stone) that must be used to access services offered by one subsystem +or device drivers. This, of course, depends on the particular kernel +implementation and the kernel's architecture.

+
+
+

Micro kernel

+

A micro-kernel is one where large parts of the kernel are protected +from each-other, usually running as services in user space. Because +significant parts of the kernel are now running in user mode, the +remaining code that runs in kernel mode is significantly smaller, hence the +micro-kernel term.

+../_images/ditaa-c8a3d93d0109b7be6f608871d16adff4aaa933da.png +

In a micro-kernel architecture the kernel contains just enough code +that allows for message passing between different running +processes. Practically that means implementing the scheduler and an IPC +mechanism in the kernel, as well as basic memory management to setup +the protection between applications and services.

+

One of the advantages of this architecture is that the services are +isolated and hence bugs in one service won't impact other services.

+

As such, if a service crashes we can just restart it without affecting +the whole system. However, in practice this is difficult to achieve +since restarting a service may affect all applications that depend on +that service (e.g. if the file server crashes all applications with +opened file descriptors would encounter errors when accessing them).

+

This architecture imposes a modular approach to the kernel and offers +memory protection between services but at a cost of performance. What +is a simple function call between two services on monolithic kernels +now requires going through IPC and scheduling which will incur a +performance penalty [2].

+ + + + + +
[2]https://lwn.net/Articles/220255/
+
+
+

Micro-kernels vs monolithic kernels

+

Advocates of micro-kernels often suggest that micro-kernels are +superior because of the modular design a micro-kernel +enforces. However, monolithic kernels can also be modular and there +are several approaches that modern monolithic kernels use toward this +goal:

+
    +
  • Components can be enabled or disabled at compile time
  • +
  • Support of loadable kernel modules (at runtime)
  • +
  • Organize the kernel in logical, independent subsystems
  • +
  • Strict interfaces but with low performance overhead: macros, +inline functions, function pointers
  • +
+

There is a class of operating systems that (used to) claim to be +hybrid kernels, in between monolithic and micro-kernels (e.g. Windows, +Mac OS X). However, since all of the typical monolithic services run +in kernel-mode in these operating systems, there is little merit to +qualify them other than monolithic kernels.

+

Many operating systems and kernel experts have dismissed the label +as meaningless, and just marketing. Linus Torvalds said of this +issue:

+

"As to the whole 'hybrid kernel' thing - it's just marketing. It's +'oh, those microkernels had good PR, how can we try to get good PR +for our working kernel? Oh, I know, let's use a cool name and try +to imply that it has all the PR advantages that that other system +has'."

+
+
+

Address space

+

The address space term is an overloaded term that can have different +meanings in different contexts.

+

The physical address space refers to the way the RAM and device +memories are visible on the memory bus. For example, on 32bit Intel +architecture, it is common to have the RAM mapped into the lower +physical address space while the graphics card memory is mapped high +in the physical address space.

+

The virtual address space (or sometimes just address space) refers to +the way the CPU sees the memory when the virtual memory module is +activated (sometimes called protected mode or paging enabled). The +kernel is responsible for setting up a mapping that creates a virtual +address space in which areas of this space are mapped to certain +physical memory areas.

+

Related to the virtual address space there are two other terms that +are often used: process (address) space and kernel (address) space.

+

The process space is (part of) the virtual address space associated +with a process. It is the "memory view" of processes. It is a +continuous area that starts at zero. Where the process's address space +ends depends on the implementation and architecture.

+

The kernel space is the "memory view" of the code that runs in kernel +mode.

+
+
+

User and kernel sharing the virtual address space

+

A typical implementation for user and kernel spaces is one where the +virtual address space is shared between user processes and the kernel.

+

In this case kernel space is located at the top of the address space, +while user space at the bottom. In order to prevent the user processes +from accessing kernel space, the kernel creates mappings that prevent +access to the kernel space from user mode.

+ +
+
+

Execution contexts

+

One of the most important jobs of the kernel is to service interrupts +and to service them efficiently. This is so important that a special +execution context is associated with it.

+

The kernel executes in interrupt context when it runs as a result of +an interrupt. This includes the interrupt handler, but it is not +limited to it, there are other special (software) constructs that run +in interrupt mode.

+

Code running in interrupt context always runs in kernel mode and there +are certain limitations that the kernel programmer has to be aware of +(e.g. not calling blocking functions or accessing user space).

+

Opposed to interrupt context there is process context. Code that runs +in process context can do so in user mode (executing application code) +or in kernel mode (executing a system call).

+
+
+

Multi-tasking

+

Multitasking is the ability of the operating system to +"simultaneously" execute multiple programs. It does so by quickly +switching between running processes.

+

Cooperative multitasking requires the programs to cooperate to achieve +multitasking. A program will run and relinquish CPU control back +to the OS, which will then schedule another program.

+

With preemptive multitasking the kernel will enforce strict limits for +each process, so that all processes have a fair chance of +running. Each process is allowed to run a time slice (e.g. 100ms) +after which, if it is still running, it is forcefully preempted and +another task is scheduled.

+
+
+

Preemptive kernel

+

Preemptive multitasking and preemptive kernels are different terms.

+

A kernel is preemptive if a process can be preempted while running +in kernel mode.

+

However, note that non-preemptive kernels may support preemptive +multitasking.

+
+
+

Pageable kernel memory

+

A kernel supports pageable kernel memory if parts of kernel memory +(code, data, stack or dynamically allocated memory) can be swapped +to disk.

+
+
+

Kernel stack

+

Each process has a kernel stack that is used to maintain the +function call chain and local variables state while it is executing +in kernel mode, as a result of a system call.

+

The kernel stack is small (4KB - 12 KB) so the kernel developer has +to avoid allocating large structures on stack or recursive calls +that are not properly bounded.

+
+
+

Portability

+

In order to increase portability across various architectures and +hardware configurations, modern kernels are organized as follows at the +top level:

+
    +
  • Architecture and machine specific code (C & ASM)
  • +
  • Independent architecture code (C):
      +
    • kernel core (further split in multiple subsystems)
    • +
    • device drivers
    • +
    +
  • +
+

This makes it easier to reuse code as much as possible between +different architectures and machine configurations.

+
+
+

Asymmetric MultiProcessing (ASMP)

+

Asymmetric MultiProcessing (ASMP) is a way of supporting multiple +processors (cores) by a kernel, where a processor is dedicated to the +kernel and all other processors run user space programs.

+

The disadvantage of this approach is that the kernel throughput +(e.g. system calls, interrupt handling, etc.) does not scale with the +number of processors. Since typical processes frequently use system +calls, the scalability of the approach is limited to very specific +systems (e.g. scientific applications).

+../_images/ditaa-cb16db58a2489307b74d4f70256a48c81c65f6c6.png +
+
+

Symmetric MultiProcessing (SMP)

+

As opposed to ASMP, in SMP mode the kernel can run on any of the +existing processors, just as user processes. This approach is more +difficult to implement, because it creates race conditions in the +kernel if two processes run kernel functions that access the same +memory locations.

+

In order to support SMP the kernel must implement synchronization +primitives (e.g. spin locks) to guarantee that only one processor is +executing a critical section.

+../_images/ditaa-08aff771b3ff7a5525df7b0c090e28c836502788.png +
+
+

CPU Scalability

+

CPU scalability refers to how well the performance scales with +the number of cores. There are a few things that the kernel developer +should keep in mind with regard to CPU scalability:

+
    +
  • Use lock free algorithms when possible
  • +
  • Use fine grained locking for high contention areas
  • +
  • Pay attention to algorithm complexity
  • +
+
+
+
+

Overview of the Linux kernel

+
+

Linux development model

+

The Linux kernel is one of the largest open source projects in the world +with thousands of developers contributing code and millions of lines of +code changed for each release.

+

It is distributed under the GPLv2 license, which simply put, +requires that any modification of the kernel done on software that is +shipped to customer should be made available to them (the customers), +although in practice most companies make the source code publicly +available.

+

There are many companies (often competing) that contribute code to the +Linux kernel as well as people from academia and independent +developers.

+

The current development model is based on doing releases at fixed +intervals of time (usually 3 - 4 months). New features are merged into +the kernel during a one or two week merge window. After the merge +window, a release candidate is done on a weekly basis (rc1, rc2, etc.)

+
+
+

Maintainer hierarchy

+

In order to scale the development process, Linux uses a hierarchical +maintainership model:

+
    +
  • Linus Torvalds is the maintainer of the Linux kernel and merges pull +requests from subsystem maintainers
  • +
  • Each subsystem has one or more maintainers that accept patches or +pull requests from developers or device driver maintainers
  • +
  • Each maintainer has its own git tree, e.g.:
      +
    • Linus Torvalds: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    • +
    • David Miller (networking): git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/
    • +
    +
  • +
  • Each subsystem may maintain a -next tree where developers can submit +patches for the next merge window
  • +
+

Since the merge window is only a maximum of two weeks, most of the +maintainers have a -next tree where they accept new features from +developers or maintainers downstream even when the merge window +is closed.

+

Note that bug fixes are accepted even outside the merge window in the +maintainer's tree from where they are pulled by the +upstream maintainer regularly, for every release candidate.

+
+
+

Linux source code layout

+../_images/ditaa-f45246aade5ecc7cfb71f7f103a57f95fc7c2b9e.png +

These are the top level of the Linux source code folders:

+
    +
  • arch - contains architecture specific code; each architecture is +implemented in a specific sub-folder (e.g. arm, arm64, x86)
  • +
  • block - contains the block subsystem code that deals with reading +and writing data from block devices: creating block I/O requests, +scheduling them (there are several I/O schedulers available), +merging requests, and passing them down through the I/O stack to the +block device drivers
  • +
  • certs - implements support for signature checking using certificates
  • +
  • crypto - software implementation of various cryptography algorithms +as well as a framework that allows offloading such algorithms in +hardware
  • +
  • Documentation - documentation for various subsystems, Linux kernel +command line options, description for sysfs files and format, device +tree bindings (supported device tree nodes and format)
  • +
  • drivers - drivers for various devices as well as the Linux driver +model implementation (an abstraction that describes drivers, devices, +buses and the way they are connected)
  • +
  • firmware - binary or hex firmware files that are used by various +device drivers
  • +
  • fs - home of the Virtual Filesystem Switch (generic filesystem code) +and of various filesystem drivers
  • +
  • include - header files
  • +
  • init - the generic (as opposed to architecture specific) +initialization code that runs during boot
  • +
  • ipc - implementation for various Inter Process Communication system +calls such as message queue, semaphores, shared memory
  • +
  • kernel - process management code (including support for kernel +thread, workqueues), scheduler, tracing, time management, generic +irq code, locking
  • +
  • lib - various generic functions such as sorting, checksums, +compression and decompression, bitmap manipulation, etc.
  • +
  • mm - memory management code, for both physical and virtual memory, +including the page, SL*B and CMA allocators, swapping, virtual memory +mapping, process address space manipulation, etc.
  • +
  • net - implementation for various network stacks including IPv4 and +IPv6; BSD socket implementation, routing, filtering, packet +scheduling, bridging, etc.
  • +
  • samples - various driver samples
  • +
  • scripts - parts of the build system, scripts used for building modules, +kconfig the Linux kernel configurator, as well as various other +scripts (e.g. checkpatch.pl that checks if a patch conforms to +the Linux kernel coding style)
  • +
  • security - home of the Linux Security Module framework that allows +extending the default (Unix) security model as well as +implementation for multiple such extensions such as SELinux, smack, +apparmor, tomoyo, etc.
  • +
  • sound - home of ALSA (Advanced Linux Sound Architecture) as well as the +old Linux sound framework (OSS)
  • +
  • tools - various user space tools for testing or interacting with +Linux kernel subsystems
  • +
  • usr - support for embedding an initrd file in the kernel image
  • +
  • virt - home of the KVM (Kernel Virtual Machine) hypervisor
  • +
+
+
+

Linux kernel architecture

+../_images/ditaa-b9ffae65be16d30be11b5eca188a7a143b1b8227.png +
+

arch

+
    +
  • Architecture specific code
  • +
  • May be further sub-divided in machine specific code
  • +
  • Interfacing with the boot loader and architecture specific +initialization
  • +
  • Access to various hardware bits that are architecture or machine +specific such as interrupt controller, SMP controllers, BUS +controllers, exceptions and interrupt setup, virtual memory handling
  • +
  • Architecture optimized functions (e.g. memcpy, string operations, +etc.)
  • +
+

This part of the Linux kernel contains architecture specific code and +may be further sub-divided in machine specific code for certain +architectures (e.g. arm).

+

"Linux was first developed for 32-bit x86-based PCs (386 or +higher). These days it also runs on (at least) the Compaq Alpha AXP, +Sun SPARC and UltraSPARC, Motorola 68000, PowerPC, PowerPC64, ARM, +Hitachi SuperH, IBM S/390, MIPS, HP PA-RISC, Intel IA-64, DEC VAX, AMD +x86-64 and CRIS architectures.”

+

It implements access to various hardware bits that are architecture or +machine specific such as interrupt controller, SMP controllers, BUS +controllers, exceptions and interrupt setup, virtual memory handling.

+

It also implements architecture optimized functions (e.g. memcpy, +string operations, etc.)

+
+
+

Device drivers

+

The Linux kernel uses a unified device model whose purpose is to +maintain internal data structures that reflect the state and structure +of the system. Such information includes what devices are present, +what is their status, what bus they are attached to, to what driver +they are attached, etc. This information is essential for implementing +system wide power management, as well as device discovery and dynamic +device removal.

+

Each subsystem has its own specific driver interface that is tailored +to the devices it represents in order to make it easier to write +correct drivers and to reduce code duplication.

+

Linux supports one of the most diverse sets of device driver types; +some examples are: TTY, serial, SCSI, filesystem, ethernet, USB, +framebuffer, input, sound, etc.

+
+
+

Process management

+

Linux implements the standard Unix process management APIs such as +fork(), exec(), wait(), as well as standard POSIX threads.

+

However, Linux processes and threads are implemented quite +differently than in other kernels. There are no internal structures +implementing processes or threads, instead there is a struct +task_struct that describes an abstract scheduling unit called a task.

+

A task has pointers to resources, such as address space, file +descriptors, IPC ids, etc. The resource pointers for tasks that are +part of the same process point to the same resources, while resources +of tasks of different processes will point to different resources.

+

This peculiarity, together with the clone() and unshare() system +calls, allows for implementing new features such as namespaces.

+

Namespaces are used together with control groups (cgroup) to implement +operating system virtualization in Linux.

+

cgroup is a mechanism to organize processes hierarchically and +distribute system resources along the hierarchy in a controlled and +configurable manner.

+
+
+

Memory management

+

Linux memory management is a complex subsystem that deals with:

+
    +
  • Management of the physical memory: allocating and freeing memory
  • +
  • Management of the virtual memory: paging, swapping, demand +paging, copy on write
  • +
  • User services: user address space management (e.g. mmap(), brk(), +shared memory)
  • +
  • Kernel services: SL*B allocators, vmalloc
  • +
+
+
+

Block I/O management

+

The Linux Block I/O subsystem deals with reading and writing data from +or to block devices: creating block I/O requests, transforming block I/O +requests (e.g. for software RAID or LVM), merging and sorting the +requests and scheduling them via various I/O schedulers to the block +device drivers.

+../_images/ditaa-0a96997f269a7a9cd0cdc9c9125f6e62e549be94.png +
+
+

Virtual Filesystem Switch

+

The Linux Virtual Filesystem Switch implements common / generic +filesystem code to reduce duplication in filesystem drivers. It +introduces certain filesystem abstractions such as:

+
    +
  • inode - describes the file on disk (attributes, location of data +blocks on disk)
  • +
  • dentry - links an inode to a name
  • +
  • file - describes the properties of an opened file (e.g. file +pointer)
  • +
  • superblock - describes the properties of a formatted filesystem +(e.g. number of blocks, block size, location of root directory on +disk, encryption, etc.)
  • +
+../_images/ditaa-afa57a07e21b1b842554278abe30fea575278452.png +

The Linux VFS also implements a complex caching mechanism which +includes the following:

+
    +
  • the inode cache - caches the file attributes and internal file +metadata
  • +
  • the dentry cache - caches the directory hierarchy of a filesystem
  • +
  • the page cache - caches file data blocks in memory
  • +
+
+
+

Networking stack

+../_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png +
+
+

Linux Security Modules

+
    +
  • Hooks to extend the default Linux security model
  • +
  • Used by several Linux security extensions:
      +
    • Security Enhanced Linux
    • +
    • AppArmor
    • +
    • Tomoyo
    • +
    • Smack
    • +
    +
  • +
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/memory-management-slides.html b/refs/pull/405/merge/lectures/memory-management-slides.html new file mode 100644 index 00000000..7c9d9db1 --- /dev/null +++ b/refs/pull/405/merge/lectures/memory-management-slides.html @@ -0,0 +1,550 @@ + + + + + + + + Memory Management — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Memory Management

+ +
    +
  • Physical Memory Management
      +
    • Page allocations
    • +
    • Small allocations
    • +
    +
  • +
  • Virtual Memory Management
  • +
  • Page Fault Handling Overview
  • +
+ + + + +
+
+ +

Physical Memory Management

+ +
    +
  • Algorithms and data structures that keep track of physical memory +pages
  • +
  • Independent of virtual memory management
  • +
  • Both virtual and physical memory management are required for complete +memory management
  • +
  • Physical pages are being tracked using a special data structure: +struct page
  • +
  • All physical pages have an entry reserved in the mem_map +vector
  • +
  • The physical page status may include: a counter for how many +times a page is used, position in swap or file, buffers for this +page, position in the page cache, etc.
  • +
+ + + + +
+
+ +

Memory zones

+ +
    +
  • DMA zone
  • +
  • DMA32 zone
  • +
  • Normal zone (LowMem)
  • +
  • HighMem Zone
  • +
  • Movable Zone
  • +
+ + + + +
+
+ +

Non-Uniform Memory Access

+ +
    +
  • Physical memory is split in between multiple nodes, one for each CPU
  • +
  • There is a single physical address space accessible from every node
  • +
  • Access to the local memory is faster
  • +
  • Each node maintains its own memory zones (e.g. DMA, NORMAL, HIGHMEM, etc.)
  • +
+ + + + +
+
+ +

Page allocation

+ +
/* Allocates 2^order contiguous pages and returns a pointer to the
+ * descriptor for the first page
+ */
+struct page *alloc_pages(gfp_mask, order);
+
+/* allocates a single page */
+struct page *alloc_page(gfp_mask);
+
+
+/* helper functions that return the kernel virtual address */
+void *__get_free_pages(gfp_mask, order);
+void *__get_free_page(gfp_mask);
+void *__get_zero_page(gfp_mask);
+void *__get_dma_pages(gfp_mask, order);
+
+
+ + + + +
+
+ +

Why only allocate pages in chunks of power of 2?

+ +
    +
  • Typical memory allocation algorithms have linear complexity
  • +
  • Why not use paging?
      +
    • Sometimes we do need contiguous memory allocations (for DMA)
    • +
    • Allocation would require page table changes and TLB flushes
    • +
    • Not able to use extended pages
    • +
    • Some architecture directly (in hardware) linearly maps a part +of the address space (e.g. MIPS)
    • +
    +
  • +
+ + + + +
+
+ +

The buddy algorithm

+ +
    +
  • Free blocks are distributed in multiple lists
  • +
  • Each list contains blocks of the same size
  • +
  • The block size is a power of two
  • +
+ + + + +
+
+ +

Allocating a block of size N

+ +
    +
  • If there is a free block in the N-size list, pick the first
  • +
  • If not, look for a free block in the 2N-size list
  • +
  • Split the 2N-size block in two N-size blocks and add them to the +N-size list
  • +
  • Now that we have the N-size list populated, pick the first free +block from that list
  • +
+ + + + +
+
+ +

Freeing a block of size N

+ +
    +
  • If the "buddy" is free coalesce into a 2N-size block
  • +
  • Try until no more free buddy block is found and place the +resulting block in the respective list
  • +
+ + + + +
+
+ +

The Linux implementation

+ +
    +
  • 11 lists for blocks of 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, +1024 pages
  • +
  • Each memory zone has its own buddy allocator
  • +
  • Each zone has a vector of descriptors for free blocks, one entry +for each size
  • +
  • The descriptor contains the number of free blocks and the head of +the list
  • +
  • Blocks are linked in the list using the lru field of +struct page
  • +
  • Free pages have the PG_buddy flag set
  • +
  • The page descriptor keeps a copy of the block size in the private +field to easily check if the "buddy" is free
  • +
+ + + + +
+
+ +

Small allocations

+ +
    +
  • Buddy is used to allocate pages
  • +
  • Many of the kernel subsystems need to allocate buffers smaller +than a page
  • +
  • Typical solution: variable size buffer allocation
      +
    • Leads to external fragmentation
    • +
    +
  • +
  • Alternative solution: fixed size buffer allocation
      +
    • Leads to internal fragmentation
    • +
    +
  • +
  • Compromise: fixed size block allocation with multiple sizes, geometrically distributed
      +
    • e.g.: 32, 64, ..., 131056
    • +
    +
  • +
+ + + + +
+
+ +

The SLAB allocator

+ +
    +
  • Buffers = objects
  • +
  • Uses buddy to allocate a pool of pages for object allocations
  • +
  • Each object (optionally) has a constructor and destructor
  • +
  • Deallocated objects are cached - avoids subsequent calls for +constructors and buddy allocation / deallocation
  • +
+ + + + +
+
+ +

Why SLAB?

+ +
    +
  • The kernel will typically allocate and deallocate multiple times +the same data structures over time (e.g. struct +task_struct) effectively using fixed size allocations. Using the +SLAB reduces the frequency of the more heavy +allocation/deallocation operations.
  • +
  • For variable size buffers (which occurs less frequently) a +geometric distribution of caches with fixed-size can be used
  • +
  • Reduces the memory allocation foot-print since we are searching a +much smaller memory area, compared to buddy which can span over a +larger area
  • +
  • Employs cache optimization techniques (slab coloring)
  • +
+ + + + +
+
+ +

Slab architecture

+ +../_images/slab-overview.png + + + + +
+
+ +

Cache descriptors

+ +
    +
  • A name to identify the cache for stats
  • +
  • object constructor and destructor functions
  • +
  • size of the objects
  • +
  • Flags
  • +
  • Size of the slab in power of 2 pages
  • +
  • GFP masks
  • +
  • One or more slabs, grouped by state: full, partially full, empty
  • +
+ + + + +
+
+ +

SLAB descriptors

+ +
    +
  • Number of objects
  • +
  • Memory region where the objects are stored
  • +
  • Pointer to the first free object
  • +
  • Descriptors are stored either in
      +
    • the SLAB itself (if the object size is lower than 512 or if +internal fragmentation leaves enough space for the SLAB +descriptor)
    • +
    • in generic caches internally used by the SLAB allocator
    • +
    +
  • +
+ + + + +
+
+ +

Slab detailed architecture

+ +../_images/slab-detailed-arch.png + + + + +
+
+ +

Generic vs specific caches

+ +
    +
  • Generic caches are used internally by the slab allocator
      +
    • allocating memory for cache and slab descriptors
    • +
    +
  • +
  • They are also used to implement kmalloc() by implementing +20 caches with object sizes geometrically distributed between +32bytes and 4MB
  • +
  • Specific caches are created on demand by kernel subsystems
  • +
+ + + + +
+
+ +

Object descriptors

+ +../_images/slab-object-descriptors.png + + + + +
+
+ +

Object descriptors

+ +
    +
  • Only used for free objects
  • +
  • An integer that points to the next free object
  • +
  • The last free object uses a terminator value
  • +
  • Internal descriptors - stored in the slab
  • +
  • External descriptors - stored in generic caches
  • +
+ + + + +
+
+ +

SLAB coloring

+ +../_images/slab-coloring.png + + + + +
+
+ +

Virtual memory management

+ +
    +
  • Used in both kernel and user space
  • +
  • Using virtual memory requires:
      +
    • reserving (allocating) a segment in the virtual address space +(be it kernel or user)
    • +
    • allocating one or more physical pages for the buffer
    • +
    • allocating one or more physical pages for page tables and +internal structures
    • +
    • mapping the virtual memory segment to the physical allocated +pages
    • +
    +
  • +
+ + + + +
+
+ +

Address space descriptors

+ +

 

+../_images/ditaa-0eda95a3f39dfac448fd07589656b123d3548328.png + + + + +
+
+ +

Address space descriptors

+ +
    +
  • Page table is used either by:
      +
    • The CPU's MMU
    • +
    • The kernel to handle TLB exception (some RISC processors)
    • +
    +
  • +
  • The address space descriptor is used by the kernel to maintain +high level information such as file and file offset (for mmap +with files), read-only segment, copy-on-write segment, etc.
  • +
+ + + + +
+
+ +

Allocating virtual memory

+ +
    +
  • Search a free area in the address space descriptor
  • +
  • Allocate memory for a new area descriptor
  • +
  • Insert the new area descriptor in the address space descriptor
  • +
  • Allocate physical memory for one or more page tables
  • +
  • Setup the page tables for the newly allocated area in the virtual +address space
  • +
  • Allocating (on demand) physical pages and map them in the virtual +address space by updating the page tables
  • +
+ + + + +
+
+ +

Freeing virtual memory

+ +
    +
  • Removing the area descriptor
  • +
  • Freeing the area descriptor memory
  • +
  • Updating the page tables to remove the area from the virtual +address space
  • +
  • Flushing the TLB for the freed virtual memory area
  • +
  • Freeing physical memory of the page tables associated with the +freed area
  • +
  • Freeing physical memory of the freed virtual memory area
  • +
+ + + + +
+
+ +

Linux virtual memory management

+ +
    +
  • Kernel
      +
    • vmalloc
        +
      • area descriptor: struct vm_struct
      • +
      • address space descriptor: simple linked list of struct vm_struct
      • +
      +
    • +
    +
  • +
  • Userspace
      +
    • area descriptor: struct vm_area_struct
    • +
    • address space descriptor: struct mm_struct, red-black tree
    • +
    +
  • +
+ + + + +
+
+ +

Linux virtual memory management

+ +../_images/page-fault-handling.png + + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/memory-management.html b/refs/pull/405/merge/lectures/memory-management.html new file mode 100644 index 00000000..ab137479 --- /dev/null +++ b/refs/pull/405/merge/lectures/memory-management.html @@ -0,0 +1,433 @@ + + + + + + Memory Management — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Memory Management

+

View slides

+
+

Lecture objectives:

+
    +
  • Physical Memory Management
      +
    • Page allocations
    • +
    • Small allocations
    • +
    +
  • +
  • Virtual Memory Management
  • +
  • Page Fault Handling Overview
  • +
+
+
+

Physical Memory Management

+
    +
  • Algorithms and data structures that keep track of physical memory +pages
  • +
  • Independent of virtual memory management
  • +
  • Both virtual and physical memory management is required for complete +memory management
  • +
  • Physical pages are being tracked using a special data structure: +struct page
  • +
  • All physical pages have an entry reserved in the mem_map +vector
  • +
  • The physical page status may include: a counter for how many +times a page is used, position in swap or file, buffers for this +page, position in the page cache, etc.
  • +
+
+

Memory zones

+
    +
  • DMA zone
  • +
  • DMA32 zone
  • +
  • Normal zone (LowMem)
  • +
  • HighMem Zone
  • +
  • Movable Zone
  • +
+
+
+

Non-Uniform Memory Access

+
    +
  • Physical memory is split in between multiple nodes, one for each CPU
  • +
  • There is a single physical address space accessible from every node
  • +
  • Access to the local memory is faster
  • +
  • Each node maintains its own memory zones (i.e. DMA, NORMAL, HIGHMEM, etc.)
  • +
+
+
+

Page allocation

+
/* Allocates 2^order contiguous pages and returns a pointer to the
+ * descriptor for the first page
+ */
+struct page *alloc_pages(gfp_mask, order);
+
+/* allocates a single page */
+struct page *alloc_page(gfp_mask);
+
+
+/* helper functions that return the kernel virtual address */
+void *__get_free_pages(gfp_mask, order);
+void *__get_free_page(gfp_mask);
+void *__get_zero_page(gfp_mask);
+void *__get_dma_pages(gfp_mask, order);
+
+
+
    +
  • Typical memory allocation algorithms have linear complexity
  • +
  • Why not use paging?
      +
    • Sometimes we do need contiguous memory allocations (for DMA)
    • +
    • Allocation would require page table changes and TLB flushes
    • +
    • Not able to use extended pages
    • +
    • Some architectures directly (in hardware) linearly map a part +of the address space (e.g. MIPS)
    • +
    +
  • +
+
    +
  • Free blocks are distributed in multiple lists
  • +
  • Each list contains blocks of the same size
  • +
  • The block size is a power of two
  • +
+
    +
  • If there is a free block in the N-size list, pick the first
  • +
  • If not, look for a free block in the 2N-size list
  • +
  • Split the 2N-size block in two N-size blocks and add them to the +N-size list
  • +
  • Now that we have the N-size list populated, pick the first free +block from that list
  • +
+
    +
  • If the "buddy" is free coalesce into a 2N-size block
  • +
  • Try until no more free buddy block is found and place the +resulting block in the respective list
  • +
+
    +
  • 11 lists for blocks of 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, +1024 pages
  • +
  • Each memory zone has its own buddy allocator
  • +
  • Each zone has a vector of descriptors for free blocks, one entry +for each size
  • +
  • The descriptor contains the number of free blocks and the head of +the list
  • +
  • Blocks are linked in the list using the lru field of +struct page
  • +
  • Free pages have the PG_buddy flag set
  • +
  • The page descriptor keeps a copy of the block size in the private +field to easily check if the "buddy" is free
  • +
+
+
+

Small allocations

+
    +
  • Buddy is used to allocate pages
  • +
  • Many of the kernel subsystems need to allocate buffers smaller +than a page
  • +
  • Typical solution: variable size buffer allocation
      +
    • Leads to external fragmentation
    • +
    +
  • +
  • Alternative solution: fixed size buffer allocation
      +
    • Leads to internal fragmentation
    • +
    +
  • +
  • Compromise: fixed size block allocation with multiple sizes, geometrically distributed
      +
    • e.g.: 32, 64, ..., 131072
    • +
    +
  • +
+
    +
  • Buffers = objects
  • +
  • Uses buddy to allocate a pool of pages for object allocations
  • +
  • Each object (optionally) has a constructor and destructor
  • +
  • Deallocated objects are cached - avoids subsequent calls for +constructors and buddy allocation / deallocation
  • +
+
    +
  • The kernel will typically allocate and deallocate multiple times +the same data structures over time (e.g. struct +task_struct) effectively using fixed size allocations. Using the +SLAB reduces the frequency of the more heavy +allocation/deallocation operations.
  • +
  • For variable size buffers (which occurs less frequently) a +geometric distribution of caches with fixed-size can be used
  • +
  • Reduces the memory allocation foot-print since we are searching a +much smaller memory area, compared to buddy which can span over a +larger area
  • +
  • Employs cache optimization techniques (slab coloring)
  • +
+../_images/slab-overview.png +
    +
  • A name to identify the cache for stats
  • +
  • object constructor and destructor functions
  • +
  • size of the objects
  • +
  • Flags
  • +
  • Size of the slab in power of 2 pages
  • +
  • GFP masks
  • +
  • One or more slabs, grouped by state: full, partially full, empty
  • +
+
    +
  • Number of objects
  • +
  • Memory region where the objects are stored
  • +
  • Pointer to the first free object
  • +
  • Descriptors are stored either in
      +
    • the SLAB itself (if the object size is lower than 512 or if +internal fragmentation leaves enough space for the SLAB +descriptor)
    • +
    • in generic caches internally used by the SLAB allocator
    • +
    +
  • +
+../_images/slab-detailed-arch.png +
    +
  • Generic caches are used internally by the slab allocator
      +
    • allocating memory for cache and slab descriptors
    • +
    +
  • +
  • They are also used to implement kmalloc() by implementing +20 caches with object sizes geometrically distributed between +32bytes and 4MB
  • +
  • Specific caches are created on demand by kernel subsystems
  • +
+../_images/slab-object-descriptors.png +
    +
  • Only used for free objects
  • +
  • An integer that points to the next free object
  • +
  • The last free object uses a terminator value
  • +
  • Internal descriptors - stored in the slab
  • +
  • External descriptors - stored in generic caches
  • +
+../_images/slab-coloring.png +
+
+
+

Virtual memory management

+
    +
  • Used in both kernel and user space
  • +
  • Using virtual memory requires:
      +
    • reserving (allocating) a segment in the virtual address space +(be it kernel or user)
    • +
    • allocating one or more physical pages for the buffer
    • +
    • allocating one or more physical pages for page tables and +internal structures
    • +
    • mapping the virtual memory segment to the physical allocated +pages
    • +
    +
  • +
+

 

+../_images/ditaa-0eda95a3f39dfac448fd07589656b123d3548328.png +
    +
  • Page table is used either by:
      +
    • The CPU's MMU
    • +
    • The kernel to handle TLB exception (some RISC processors)
    • +
    +
  • +
  • The address space descriptor is used by the kernel to maintain +high level information such as file and file offset (for mmap +with files), read-only segment, copy-on-write segment, etc.
  • +
+
    +
  • Search a free area in the address space descriptor
  • +
  • Allocate memory for a new area descriptor
  • +
  • Insert the new area descriptor in the address space descriptor
  • +
  • Allocate physical memory for one or more page tables
  • +
  • Setup the page tables for the newly allocated area in the virtual +address space
  • +
  • Allocating (on demand) physical pages and map them in the virtual +address space by updating the page tables
  • +
+
    +
  • Removing the area descriptor
  • +
  • Freeing the area descriptor memory
  • +
  • Updating the page tables to remove the area from the virtual +address space
  • +
  • Flushing the TLB for the freed virtual memory area
  • +
  • Freeing physical memory of the page tables associated with the +freed area
  • +
  • Freeing physical memory of the freed virtual memory area
  • +
+
    +
  • Kernel
      +
    • vmalloc
        +
      • area descriptor: struct vm_struct
      • +
      • address space descriptor: simple linked list of struct vm_struct
      • +
      +
    • +
    +
  • +
  • Userspace
      +
    • area descriptor: struct vm_area_struct
    • +
    • address space descriptor: struct mm_struct, red-black tree
    • +
    +
  • +
+
+
+

Page fault handling

+../_images/page-fault-handling.png +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/networking-slides.html b/refs/pull/405/merge/lectures/networking-slides.html new file mode 100644 index 00000000..d7a51db6 --- /dev/null +++ b/refs/pull/405/merge/lectures/networking-slides.html @@ -0,0 +1,514 @@ + + + + + + + + Network Management — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Network Management

+ +
    +
  • Socket implementation
  • +
  • Routing implementation
  • +
  • Network Device Interface
  • +
  • Hardware and Software Acceleration Techniques
  • +
+ + + + +
+
+ +

Network Management Overview

+ +../_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png + + + + +
+
+ +

Sockets Implementation Overview

+ +../_images/ditaa-79e3734c36891f6c04d684aa5caa39f76915dbaf.png + + + + +
+
+ +

Sockets Families and Protocols

+ +../_images/ditaa-bf1244d1a5c3d99bd8d40148d81cb3e5748c0b94.png + + + + +
+
+ +

Example: UDP send

+ +
char c;
+struct sockaddr_in addr;
+int s;
+
+s = socket(AF_INET, SOCK_DGRAM, 0);
+connect(s, (struct sockaddr*)&addr, sizeof(addr));
+write(s, &c, 1);
+close(s);
+
+
+ + + + +
+
+ +

Example: UDP send

+ +../_images/ditaa-ee04e3e544de75375b914f7645c79d5ae46fe6f3.png + + + + +
+
+ +

Network processing phases

+ +
    +
  • Interrupt handler - device driver fetches data from the RX ring, +creates a network packet and queues it to the network stack for +processing
  • +
  • NET_SOFTIRQ - packet goes through the stack layer and it is +processed: decapsulate Ethernet frame, check IP packet and route +it, if local packet decapsulate protocol packet (e.g. TCP) and +queues it to a socket
  • +
  • Process context - application fetches data from the socket queue +or pushes data to the socket queue
  • +
+ + + + +
+
+ +

Packet Routing

+ +../_images/ditaa-528948c80a3fd78b89fb6f7bd69503a58b93a4ae.png + + + + +
+
+ +

Routing Table

+ +
tavi@desktop-tavi:~/src/linux$ ip route list table main
+default via 172.30.240.1 dev eth0
+172.30.240.0/20 dev eth0 proto kernel scope link src 172.30.249.241
+
+tavi@desktop-tavi:~/src/linux$ ip route list table local
+broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1
+local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1
+local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1
+broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1
+broadcast 172.30.240.0 dev eth0 proto kernel scope link src 172.30.249.241
+local 172.30.249.241 dev eth0 proto kernel scope host src 172.30.249.241
+broadcast 172.30.255.255 dev eth0 proto kernel scope link src 172.30.249.241
+
+tavi@desktop-tavi:~/src/linux$ ip rule list
+0:      from all lookup local
+32766:  from all lookup main
+32767:  from all lookup default
+
+
+ + + + +
+
+ +

Routing Policy Database

+ +
    +
  • "Regular" routing only uses the destination address
  • +
  • To increase flexibility a "Routing Policy Database" is used that +allows different routing based on other fields such as the source +address, protocol type, transport ports, etc.
  • +
  • This is encoded as a list of rules that are evaluated based on +their priority (priority 0 is the highest)
  • +
  • Each rule has a selector (how to match the packet) and an +action (what action to take if the packet matches)
  • +
  • Selectors: source address, destination address, type of service (TOS), +input interface, output interface, etc.
  • +
  • Action: lookup / unicast - use given routing table, blackhole - +drop packet, unreachable - send ICMP unreachable message and drop +packet, etc.
  • +
+ + + + +
+
+ +

Routing table processing

+ +
    +
  • Special table for local addresses -> route packets to sockets +based on family, type, ports
  • +
  • Check every routing entry starting with the most specific +routes (e.g. 192.168.0.0/24 is checked before 192.168.0.0/16)
  • +
  • A route matches if the packet destination address logically ANDed +with the subnet mask equals the subnet address
  • +
  • Once a route matches the following information is retrieved: +interface, link layer next-hop address, network next host address
  • +
+ + + + +
+
+ +

Forward Information Database (removed in 3.6)

+ +

 

+../_images/fidb-overview.png + + + + +
+
+ +

Forward Information Database (removed in 3.6)

+ +../_images/fidb-details.png + + + + +
+
+ +

Routing Cache (removed in 3.6)

+ +

 

+../_images/routing-cache.png + + + + +
+
+ +

FIB TRIE

+ +

 

+../_images/fib-trie.png + + + + +
+
+ +

Compressed Trie

+ +

 

+../_images/fib-trie-compressed.png + + + + +
+
+ +

Netfilter

+ +
    +
  • Framework that implements packet filtering and NAT
  • +
  • It uses hooks inserted in key places in the packet flow:
      +
    • NF_IP_PRE_ROUTING
    • +
    • NF_IP_LOCAL_IN
    • +
    • NF_IP_FORWARD
    • +
    • NF_IP_LOCAL_OUT
    • +
    • NF_IP_POST_ROUTING
    • +
    • NF_IP_NUMHOOKS
    • +
    +
  • +
+ + + + +
+
+ +

Network packets (skbs)

+ +../_images/skb.png + + + + +
+
+ +

struct sk_buff

+ +
struct sk_buff {
+    struct sk_buff *next;
+    struct sk_buff *prev;
+
+    struct sock *sk;
+    ktime_t tstamp;
+    struct net_device *dev;
+    char cb[48];
+
+    unsigned int len,
+    data_len;
+    __u16 mac_len,
+    hdr_len;
+
+    void (*destructor)(struct sk_buff *skb);
+
+    sk_buff_data_t transport_header;
+    sk_buff_data_t network_header;
+    sk_buff_data_t mac_header;
+    sk_buff_data_t tail;
+    sk_buff_data_t end;
+
+    unsigned char *head,
+    *data;
+    unsigned int truesize;
+    atomic_t users;
+
+
+ + + + +
+
+ +

skb APIs

+ +
/* reserve head room */
+void skb_reserve(struct sk_buff *skb, int len);
+
+/* add data to the end */
+unsigned char *skb_put(struct sk_buff *skb, unsigned int len);
+
+/* add data to the top */
+unsigned char *skb_push(struct sk_buff *skb, unsigned int len);
+
+/* discard data at the top */
+unsigned char *skb_pull(struct sk_buff *skb, unsigned int len);
+
+/* discard data at the end */
+unsigned char *skb_trim(struct sk_buff *skb, unsigned int len);
+
+unsigned char *skb_transport_header(const struct sk_buff *skb);
+
+void skb_reset_transport_header(struct sk_buff *skb);
+
+void skb_set_transport_header(struct sk_buff *skb, const int offset);
+
+unsigned char *skb_network_header(const struct sk_buff *skb);
+
+void skb_reset_network_header(struct sk_buff *skb);
+
+void skb_set_network_header(struct sk_buff *skb, const int offset);
+
+unsigned char *skb_mac_header(const struct sk_buff *skb);
+
+int skb_mac_header_was_set(const struct sk_buff *skb);
+
+void skb_reset_mac_header(struct sk_buff *skb);
+
+void skb_set_mac_header(struct sk_buff *skb, const int offset);
+
+
+ + + + +
+
+ +

skb data management

+ +

 

+../_images/ditaa-91073cb05a3f537eb54ab10745c307531e6795a0.png + + + + +
+
+ +

Network Device Interface

+ +../_images/net-dev-hw.png + + + + +
+
+ +

Advanced features

+ +
    +
  • Scatter-Gather
  • +
  • Checksum offloading: Ethernet, IP, UDP, TCP
  • +
  • Adaptive interrupt handling (coalescence, adaptive)
  • +
+ + + + +
+
+ +

TCP offload

+ +
    +
  • Full offload - Implement TCP/IP stack in hardware
  • +
  • Issues:
      +
    • Scaling number of connections
    • +
    • Security
    • +
    • Conformance
    • +
    +
  • +
+ + + + +
+
+ +

Performance observation

+ +
    +
  • Performance is proportional with the number of packets to be +processed
  • +
  • Example: if an end-point can process 60K pps
      +
    • 1538 MSS -> 738Mbps
    • +
    • 2038 MSS -> 978Mbps
    • +
    • 9038 MSS -> 4.3Gbps
    • +
    • 20738 MSS -> 9.9Gbps
    • +
    +
  • +
+ + + + +
+
+ +

Stateless offload

+ +
    +
  • The networking stack processes large packets
  • +
  • TX path: the hardware splits large packets in smaller packets +(TCP Segmentation Offload)
  • +
  • RX path: the hardware aggregates small packets into larger +packets (Large Receive Offload - LRO)
  • +
+ + + + +
+
+ +

TCP Segmentation Offload

+ +../_images/tso.png + + + + +
+
+ +

Large Receive Offload

+ +../_images/lro.png + + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/networking.html b/refs/pull/405/merge/lectures/networking.html new file mode 100644 index 00000000..d83f8608 --- /dev/null +++ b/refs/pull/405/merge/lectures/networking.html @@ -0,0 +1,427 @@ + + + + + + Network Management — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Network Management

+

View slides

+
+

Lecture objectives:

+
    +
  • Socket implementation
  • +
  • Routing implementation
  • +
  • Network Device Interface
  • +
  • Hardware and Software Acceleration Techniques
  • +
+
+
+

Network Management Overview

+../_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png +
+
+

Sockets Implementation Overview

+../_images/ditaa-79e3734c36891f6c04d684aa5caa39f76915dbaf.png +
+
+

Sockets Families and Protocols

+../_images/ditaa-bf1244d1a5c3d99bd8d40148d81cb3e5748c0b94.png +
+

Example: UDP send

+
char c;
+struct sockaddr_in addr;
+int s;
+
+s = socket(AF_INET, SOCK_DGRAM, 0);
+connect(s, (struct sockaddr*)&addr, sizeof(addr));
+write(s, &c, 1);
+close(s);
+
+
+../_images/ditaa-ee04e3e544de75375b914f7645c79d5ae46fe6f3.png +
+
+
+

Network processing phases

+
    +
  • Interrupt handler - device driver fetches data from the RX ring, +creates a network packet and queues it to the network stack for +processing
  • +
  • NET_SOFTIRQ - packet goes through the stack layer and it is +processed: decapsulate Ethernet frame, check IP packet and route +it, if local packet decapsulate protocol packet (e.g. TCP) and +queues it to a socket
  • +
  • Process context - application fetches data from the socket queue +or pushes data to the socket queue
  • +
+
+
+

Packet Routing

+../_images/ditaa-528948c80a3fd78b89fb6f7bd69503a58b93a4ae.png +
+

Routing Table(s)

+
tavi@desktop-tavi:~/src/linux$ ip route list table main
+default via 172.30.240.1 dev eth0
+172.30.240.0/20 dev eth0 proto kernel scope link src 172.30.249.241
+
+tavi@desktop-tavi:~/src/linux$ ip route list table local
+broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1
+local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1
+local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1
+broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1
+broadcast 172.30.240.0 dev eth0 proto kernel scope link src 172.30.249.241
+local 172.30.249.241 dev eth0 proto kernel scope host src 172.30.249.241
+broadcast 172.30.255.255 dev eth0 proto kernel scope link src 172.30.249.241
+
+tavi@desktop-tavi:~/src/linux$ ip rule list
+0:      from all lookup local
+32766:  from all lookup main
+32767:  from all lookup default
+
+
+
+
+

Routing Policy Database

+
    +
  • "Regular" routing only uses the destination address
  • +
  • To increase flexibility a "Routing Policy Database" is used that +allows different routing based on other fields such as the source +address, protocol type, transport ports, etc.
  • +
  • This is encoded as a list of rules that are evaluated based on +their priority (priority 0 is the highest)
  • +
  • Each rule has a selector (how to match the packet) and an +action (what action to take if the packet matches)
  • +
  • Selectors: source address, destination address, type of service (TOS), +input interface, output interface, etc.
  • +
  • Action: lookup / unicast - use given routing table, blackhole - +drop packet, unreachable - send ICMP unreachable message and drop +packet, etc.
  • +
+
+
+

Routing table processing

+
    +
  • Special table for local addresses -> route packets to sockets +based on family, type, ports
  • +
  • Check every routing entry starting with the most specific +routes (e.g. 192.168.0.0/24 is checked before 192.168.0.0/16)
  • +
  • A route matches if the packet destination address logically ANDed +with the subnet mask equals the subnet address
  • +
  • Once a route matches the following information is retrieved: +interface, link layer next-hop address, network next host address
  • +
+
+
+

Forwarding Information Database

+

 

+../_images/fidb-overview.png +../_images/fidb-details.png +

 

+../_images/routing-cache.png +

 

+../_images/fib-trie.png +

 

+../_images/fib-trie-compressed.png +
+
+
+

Netfilter

+
    +
  • Framework that implements packet filtering and NAT
  • +
  • It uses hooks inserted in key places in the packet flow:
      +
    • NF_IP_PRE_ROUTING
    • +
    • NF_IP_LOCAL_IN
    • +
    • NF_IP_FORWARD
    • +
    • NF_IP_LOCAL_OUT
    • +
    • NF_IP_POST_ROUTING
    • +
    • NF_IP_NUMHOOKS
    • +
    +
  • +
+
+
+

Network packets / skbs (struct sk_buff)

+../_images/skb.png +
struct sk_buff {
+    struct sk_buff *next;
+    struct sk_buff *prev;
+
+    struct sock *sk;
+    ktime_t tstamp;
+    struct net_device *dev;
+    char cb[48];
+
+    unsigned int len,
+    data_len;
+    __u16 mac_len,
+    hdr_len;
+
+    void (*destructor)(struct sk_buff *skb);
+
+    sk_buff_data_t transport_header;
+    sk_buff_data_t network_header;
+    sk_buff_data_t mac_header;
+    sk_buff_data_t tail;
+    sk_buff_data_t end;
+
+    unsigned char *head,
+    *data;
+    unsigned int truesize;
+    atomic_t users;
+
+
+
/* reserve head room */
+void skb_reserve(struct sk_buff *skb, int len);
+
+/* add data to the end */
+unsigned char *skb_put(struct sk_buff *skb, unsigned int len);
+
+/* add data to the top */
+unsigned char *skb_push(struct sk_buff *skb, unsigned int len);
+
+/* discard data at the top */
+unsigned char *skb_pull(struct sk_buff *skb, unsigned int len);
+
+/* discard data at the end */
+unsigned char *skb_trim(struct sk_buff *skb, unsigned int len);
+
+unsigned char *skb_transport_header(const struct sk_buff *skb);
+
+void skb_reset_transport_header(struct sk_buff *skb);
+
+void skb_set_transport_header(struct sk_buff *skb, const int offset);
+
+unsigned char *skb_network_header(const struct sk_buff *skb);
+
+void skb_reset_network_header(struct sk_buff *skb);
+
+void skb_set_network_header(struct sk_buff *skb, const int offset);
+
+unsigned char *skb_mac_header(const struct sk_buff *skb);
+
+int skb_mac_header_was_set(const struct sk_buff *skb);
+
+void skb_reset_mac_header(struct sk_buff *skb);
+
+void skb_set_mac_header(struct sk_buff *skb, const int offset);
+
+
+

 

+../_images/ditaa-91073cb05a3f537eb54ab10745c307531e6795a0.png +
+
+

Network Device

+../_images/net-dev-hw.png +
    +
  • Scatter-Gather
  • +
  • Checksum offloading: Ethernet, IP, UDP, TCP
  • +
  • Adaptive interrupt handling (coalescence, adaptive)
  • +
+
+
+

Hardware and Software Acceleration Techniques

+
    +
  • Full offload - Implement TCP/IP stack in hardware
  • +
  • Issues:
      +
    • Scaling number of connections
    • +
    • Security
    • +
    • Conformance
    • +
    +
  • +
+
    +
  • Performance is proportional with the number of packets to be +processed
  • +
  • Example: if an end-point can process 60K pps
      +
    • 1538 MSS -> 738Mbps
    • +
    • 2038 MSS -> 978Mbps
    • +
    • 9038 MSS -> 4.3Gbps
    • +
    • 20738 MSS -> 9.9Gbps
    • +
    +
  • +
+
    +
  • The networking stack processes large packets
  • +
  • TX path: the hardware splits large packets in smaller packets +(TCP Segmentation Offload)
  • +
  • RX path: the hardware aggregates small packets into larger +packets (Large Receive Offload - LRO)
  • +
+../_images/tso.png +../_images/lro.png +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/processes-slides.html b/refs/pull/405/merge/lectures/processes-slides.html new file mode 100644 index 00000000..60a65ddf --- /dev/null +++ b/refs/pull/405/merge/lectures/processes-slides.html @@ -0,0 +1,986 @@ + + + + + + + + Processes — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Processes and threads

+ +
    +
  • Process and threads
  • +
  • Context switching
  • +
  • Blocking and waking up
  • +
  • Process context
  • +
+ + + + +
+
+ +

What is a process?

+ +
    +
  • An address space
  • +
  • One or more threads
  • +
  • Opened files
  • +
  • Sockets
  • +
  • Semaphores
  • +
+
    +
  • Shared memory regions
  • +
  • Timers
  • +
  • Signal handlers
  • +
  • Many other resources and status information
  • +
+
+

All this information is grouped in the Process Control Block +(PCB). In Linux this is struct task_struct.

+ + + + +
+
+ +

Overview of process resources

+ +
                +-------------------------------------------------------------------+
+                | dr-x------    2 tavi tavi 0  2021 03 14 12:34 .                   |
+                | dr-xr-xr-x    6 tavi tavi 0  2021 03 14 12:34 ..                  |
+                | lrwx------    1 tavi tavi 64 2021 03 14 12:34 0 -> /dev/pts/4     |
+           +--->| lrwx------    1 tavi tavi 64 2021 03 14 12:34 1 -> /dev/pts/4     |
+           |    | lrwx------    1 tavi tavi 64 2021 03 14 12:34 2 -> /dev/pts/4     |
+           |    | lr-x------    1 tavi tavi 64 2021 03 14 12:34 3 -> /proc/18312/fd |
+           |    +-------------------------------------------------------------------+
+           |                 +----------------------------------------------------------------+
+           |                 | 08048000-0804c000 r-xp 00000000 08:02 16875609 /bin/cat        |
+$ ls -1 /proc/self/          | 0804c000-0804d000 rw-p 00003000 08:02 16875609 /bin/cat        |
+cmdline    |                 | 0804d000-0806e000 rw-p 0804d000 00:00 0 [heap]                 |
+cwd        |                 | ...                                                            |
+environ    |    +----------->| b7f46000-b7f49000 rw-p b7f46000 00:00 0                        |
+exe        |    |            | b7f59000-b7f5b000 rw-p b7f59000 00:00 0                        |
+fd --------+    |            | b7f5b000-b7f77000 r-xp 00000000 08:02 11601524 /lib/ld-2.7.so  |
+fdinfo          |            | b7f77000-b7f79000 rw-p 0001b000 08:02 11601524 /lib/ld-2.7.so  |
+maps -----------+            | bfa05000-bfa1a000 rw-p bffeb000 00:00 0 [stack]                |
+mem                          | ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]                 |
+root                         +----------------------------------------------------------------+
+stat                 +----------------------------+
+statm                |  Name: cat                 |
+status ------+       |  State: R (running)        |
+task         |       |  Tgid: 18205               |
+wchan        +------>|  Pid: 18205                |
+                     |  PPid: 18133               |
+                     |  Uid: 1000 1000 1000 1000  |
+                     |  Gid: 1000 1000 1000 1000  |
+                     +----------------------------+
+
+
+ + + + +
+
+ +

struct task_struct

+ +
$ pahole -C task_struct vmlinux
+
+struct task_struct {
+    struct thread_info thread_info;                  /*     0     8 */
+    volatile long int          state;                /*     8     4 */
+    void *                     stack;                /*    12     4 */
+
+    ...
+
+    /* --- cacheline 45 boundary (2880 bytes) --- */
+    struct thread_struct thread __attribute__((__aligned__(64))); /*  2880  4288 */
+
+    /* size: 7168, cachelines: 112, members: 155 */
+    /* sum members: 7148, holes: 2, sum holes: 12 */
+    /* sum bitfield members: 7 bits, bit holes: 2, sum bit holes: 57 bits */
+    /* paddings: 1, sum paddings: 2 */
+    /* forced alignments: 6, forced holes: 2, sum forced holes: 12 */
+} __attribute__((__aligned__(64)));
+
+
+ + + + +
+
+ +

Inspecting task_struct

+ +

 

+ + + + +
+
+ +

Quiz: Inspect opened files

+ +

Use the debugger to inspect the process named syslogd.

+
    +
  • What command should we use to list the opened file descriptors?
  • +
  • How many file descriptors are opened?
  • +
  • What command should we use to determine the file name for opened file descriptor 3?
  • +
  • What is the filename for file descriptor 3?
  • +
+ + + + +
+
+ +

Threads

+ +
    +
  • Each thread has its own stack and together with the register +values it determines the thread execution state
  • +
  • A thread runs in the context of a process and all threads in the +same process share the resources
  • +
  • The kernel schedules threads not processes and user-level threads +(e.g. fibers, coroutines, etc.) are not visible at the kernel level
  • +
+ + + + +
+
+ +

Classic implementation (Windows)

+ +

 

+../_images/ditaa-4b5c1874d3924d9716f26d4893a3e4f313bf1c43.png + + + + +
+
+ +

Linux implementation

+ +

 

+../_images/ditaa-fd771038e88b95def30ae9bd4df0b7bd6b7b3503.png + + + + +
+
+ +

The clone system call

+ +
    +
  • CLONE_FILES - shares the file descriptor table with the parent
  • +
  • CLONE_VM - shares the address space with the parent
  • +
  • CLONE_FS - shares the filesystem information (root directory, +current directory) with the parent
  • +
  • CLONE_NEWNS - does not share the mount namespace with the parent
  • +
  • CLONE_NEWIPC - does not share the IPC namespace (System V IPC +objects, POSIX message queues) with the parent
  • +
  • CLONE_NEWNET - does not share the networking namespaces (network +interfaces, routing table) with the parent
  • +
+ + + + +
+
+ +

Namespaces and "containers"

+ +
    +
  • Containers = a form of lightweight virtual machines
  • +
  • Container based technologies: LXC, docker
  • +
  • Containers are built on top of kernel namespaces
  • +
  • Kernel namespaces allows isolation of otherwise globally visible +resources
  • +
  • struct nsproxy has multiple namespaces each of which +can be selectively shared between groups of processes
  • +
  • At boot initial namespaces are created (e.g. init_net) +that are by default shared between new processes (e.g. list of +available network interfaces)
  • +
  • New namespaces can be created at runtime and new processes can +point to these new namespaces
  • +
+ + + + +
+
+ +

Accessing the current process

+ +

Accessing the current process is a frequent operation:

+
    +
  • opening a file needs access to struct task_struct's +file field
  • +
  • mapping a new file needs access to struct task_struct's +mm field
  • +
  • Over 90% of the system calls need to access the current process +structure so it needs to be fast
  • +
  • The current macro is available to access the current +process's struct task_struct
  • +
+ + + + +
+
+ +

Accessing the current process on x86

+ +

 

+../_images/ditaa-019489e686a2f60f1594e37458cfcb10320eae0f.png + + + + +
+
+ +

Previous implementation for current (x86)

+ +
/* how to get the current stack pointer from C */
+register unsigned long current_stack_pointer asm("esp") __attribute_used__;
+
+/* how to get the thread information struct from C */
+static inline struct thread_info *current_thread_info(void)
+{
+   return (struct thread_info *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
+#define current current_thread_info()->task
+
+
+ + + + +
+
+ +

Quiz: previous implementation for current (x86)

+ +

What is the size of struct thread_info?

+

Which of the following are potential valid sizes for +struct thread_info: 4095, 4096, 4097?

+ + + + +
+
+ +

Overview of the context switching process

+ +../_images/ditaa-f6b228332baf165f498d8a1bb0bc0bdb91ae50c5.png + + + + +
+
+ +

context_switch

+ +
static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+         struct task_struct *next, struct rq_flags *rf)
+{
+    prepare_task_switch(rq, prev, next);
+
+    /*
+     * For paravirt, this is coupled with an exit in switch_to to
+     * combine the page table reload and the switch backend into
+     * one hypercall.
+     */
+    arch_start_context_switch(prev);
+
+    /*
+     * kernel -> kernel   lazy + transfer active
+     *   user -> kernel   lazy + mmgrab() active
+     *
+     * kernel ->   user   switch + mmdrop() active
+     *   user ->   user   switch
+     */
+    if (!next->mm) {                                // to kernel
+        enter_lazy_tlb(prev->active_mm, next);
+
+        next->active_mm = prev->active_mm;
+        if (prev->mm)                           // from user
+            mmgrab(prev->active_mm);
+        else
+            prev->active_mm = NULL;
+    } else {                                        // to user
+        membarrier_switch_mm(rq, prev->active_mm, next->mm);
+        /*
+         * sys_membarrier() requires an smp_mb() between setting
+         * rq->curr / membarrier_switch_mm() and returning to userspace.
+         *
+         * The below provides this either through switch_mm(), or in
+         * case 'prev->active_mm == next->mm' through
+         * finish_task_switch()'s mmdrop().
+         */
+        switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+        if (!prev->mm) {                        // from kernel
+            /* will mmdrop() in finish_task_switch(). */
+            rq->prev_mm = prev->active_mm;
+            prev->active_mm = NULL;
+        }
+    }
+
+    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+
+    prepare_lock_switch(rq, next, rf);
+
+    /* Here we just switch the register state and the stack. */
+    switch_to(prev, next, prev);
+    barrier();
+
+    return finish_task_switch(prev);
+  }
+
+
+ + + + +
+
+ +

switch_to

+ +
#define switch_to(prev, next, last)               \
+do {                                              \
+    ((last) = __switch_to_asm((prev), (next)));   \
+} while (0)
+
+
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+.pushsection .text, "ax"
+SYM_CODE_START(__switch_to_asm)
+    /*
+     * Save callee-saved registers
+     * This must match the order in struct inactive_task_frame
+     */
+    pushl   %ebp
+    pushl   %ebx
+    pushl   %edi
+    pushl   %esi
+    /*
+     * Flags are saved to prevent AC leakage. This could go
+     * away if objtool would have 32bit support to verify
+     * the STAC/CLAC correctness.
+     */
+    pushfl
+
+    /* switch stack */
+    movl    %esp, TASK_threadsp(%eax)
+    movl    TASK_threadsp(%edx), %esp
+
+  #ifdef CONFIG_STACKPROTECTOR
+    movl    TASK_stack_canary(%edx), %ebx
+    movl    %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+  #endif
+
+  #ifdef CONFIG_RETPOLINE
+    /*
+     * When switching from a shallower to a deeper call stack
+     * the RSB may either underflow or use entries populated
+     * with userspace addresses. On CPUs where those concerns
+     * exist, overwrite the RSB with entries which capture
+     * speculative execution to prevent attack.
+     */
+    FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+    #endif
+
+    /* Restore flags or the incoming task to restore AC state. */
+    popfl
+    /* restore callee-saved registers */
+    popl    %esi
+    popl    %edi
+    popl    %ebx
+    popl    %ebp
+
+    jmp     __switch_to
+  SYM_CODE_END(__switch_to_asm)
+  .popsection
+
+
+ + + + +
+
+ +

Inspecting task_struct

+ +

 

+ + + + +
+
+ +

Quiz: context switch

+ +

We are executing a context switch. Select all of the statements that are true.

+
    +
  • the ESP register is saved in the task structure
  • +
  • the EIP register is saved in the task structure
  • +
  • general registers are saved in the task structure
  • +
  • the ESP register is saved on the stack
  • +
  • the EIP register is saved on the stack
  • +
  • general registers are saved on the stack
  • +
+ + + + +
+
+ +

Task states

+ +../_images/ditaa-0b8cde2be9bbd195ac9dcaeac978a8bbe0d3b805.png + + + + +
+
+ +

Blocking the current thread

+ +
    +
  • Set the current thread state to TASK_UNINTERRUPTIBLE or +TASK_INTERRUPTIBLE
  • +
  • Add the task to a waiting queue
  • +
  • Call the scheduler which will pick up a new task from the READY +queue
  • +
  • Do the context switch to the new task
  • +
+ + + + +
+
+ +

wait_event

+ +
/**
+ * wait_event - sleep until a condition gets true
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ */
+#define wait_event(wq_head, condition)            \
+do {                                              \
+  might_sleep();                                  \
+  if (condition)                                  \
+          break;                                  \
+  __wait_event(wq_head, condition);               \
+} while (0)
+
+#define __wait_event(wq_head, condition)                                  \
+    (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,   \
+                        schedule())
+
+/*
+ * The below macro ___wait_event() has an explicit shadow of the __ret
+ * variable when used from the wait_event_*() macros.
+ *
+ * This is so that both can use the ___wait_cond_timeout() construct
+ * to wrap the condition.
+ *
+ * The type inconsistency of the wait_event_*() __ret variable is also
+ * on purpose; we use long where we can return timeout values and int
+ * otherwise.
+ */
+#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)    \
+({                                                                       \
+    __label__ __out;                                                     \
+    struct wait_queue_entry __wq_entry;                                  \
+    long __ret = ret;       /* explicit shadow */                        \
+                                                                         \
+    init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);     \
+    for (;;) {                                                           \
+        long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
+                                                                         \
+        if (condition)                                                   \
+            break;                                                       \
+                                                                         \
+        if (___wait_is_interruptible(state) && __int) {                  \
+            __ret = __int;                                               \
+            goto __out;                                                  \
+        }                                                                \
+                                                                         \
+        cmd;                                                             \
+    }                                                                    \
+    finish_wait(&wq_head, &__wq_entry);                                  \
+   __out:  __ret;                                                        \
+ })
+
+ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
+ {
+    wq_entry->flags = flags;
+    wq_entry->private = current;
+    wq_entry->func = autoremove_wake_function;
+    INIT_LIST_HEAD(&wq_entry->entry);
+ }
+
+ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+ {
+     unsigned long flags;
+     long ret = 0;
+
+     spin_lock_irqsave(&wq_head->lock, flags);
+     if (signal_pending_state(state, current)) {
+         /*
+          * Exclusive waiter must not fail if it was selected by wakeup,
+          * it should "consume" the condition we were waiting for.
+          *
+          * The caller will recheck the condition and return success if
+          * we were already woken up, we can not miss the event because
+          * wakeup locks/unlocks the same wq_head->lock.
+          *
+          * But we need to ensure that set-condition + wakeup after that
+          * can't see us, it should wake up another exclusive waiter if
+          * we fail.
+          */
+         list_del_init(&wq_entry->entry);
+         ret = -ERESTARTSYS;
+     } else {
+         if (list_empty(&wq_entry->entry)) {
+             if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
+                 __add_wait_queue_entry_tail(wq_head, wq_entry);
+             else
+                 __add_wait_queue(wq_head, wq_entry);
+         }
+         set_current_state(state);
+     }
+     spin_unlock_irqrestore(&wq_head->lock, flags);
+
+     return ret;
+ }
+
+ static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     list_add(&wq_entry->entry, &wq_head->head);
+ }
+
+ static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     list_add_tail(&wq_entry->entry, &wq_head->head);
+ }
+
+ /**
+  * finish_wait - clean up after waiting in a queue
+  * @wq_head: waitqueue waited on
+  * @wq_entry: wait descriptor
+  *
+  * Sets current thread back to running state and removes
+  * the wait descriptor from the given waitqueue if still
+  * queued.
+  */
+ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     unsigned long flags;
+
+     __set_current_state(TASK_RUNNING);
+     /*
+      * We can check for list emptiness outside the lock
+      * IFF:
+      *  - we use the "careful" check that verifies both
+      *    the next and prev pointers, so that there cannot
+      *    be any half-pending updates in progress on other
+      *    CPU's that we haven't seen yet (and that might
+      *    still change the stack area.
+      * and
+      *  - all other users take the lock (ie we can only
+      *    have _one_ other CPU that looks at or modifies
+      *    the list).
+      */
+     if (!list_empty_careful(&wq_entry->entry)) {
+         spin_lock_irqsave(&wq_head->lock, flags);
+         list_del_init(&wq_entry->entry);
+         spin_unlock_irqrestore(&wq_head->lock, flags);
+     }
+ }
+
+
+ + + + +
+
+ +

Waking up a task

+ +
    +
  • Select a task from the waiting queue
  • +
  • Set the task state to TASK_READY
  • +
  • Insert the task into the scheduler's READY queue
  • +
  • On SMP systems this is a complex operation: each processor has its +own queue, queues need to be balanced, CPUs need to be signaled
  • +
+ + + + +
+
+ +

wake_up

+ +
#define wake_up(x)                        __wake_up(x, TASK_NORMAL, 1, NULL)
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @wq_head: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+               int nr_exclusive, void *key)
+{
+    __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+                  int nr_exclusive, int wake_flags, void *key)
+{
+  unsigned long flags;
+  wait_queue_entry_t bookmark;
+
+  bookmark.flags = 0;
+  bookmark.private = NULL;
+  bookmark.func = NULL;
+  INIT_LIST_HEAD(&bookmark.entry);
+
+  do {
+          spin_lock_irqsave(&wq_head->lock, flags);
+          nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+                                          wake_flags, key, &bookmark);
+          spin_unlock_irqrestore(&wq_head->lock, flags);
+  } while (bookmark.flags & WQ_FLAG_BOOKMARK);
+}
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+                            int nr_exclusive, int wake_flags, void *key,
+                  wait_queue_entry_t *bookmark)
+{
+    wait_queue_entry_t *curr, *next;
+    int cnt = 0;
+
+    lockdep_assert_held(&wq_head->lock);
+
+    if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+          curr = list_next_entry(bookmark, entry);
+
+          list_del(&bookmark->entry);
+          bookmark->flags = 0;
+    } else
+          curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+    if (&curr->entry == &wq_head->head)
+          return nr_exclusive;
+
+    list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
+          unsigned flags = curr->flags;
+          int ret;
+
+          if (flags & WQ_FLAG_BOOKMARK)
+                  continue;
+
+          ret = curr->func(curr, mode, wake_flags, key);
+          if (ret < 0)
+                  break;
+          if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+                  break;
+
+          if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+                          (&next->entry != &wq_head->head)) {
+                  bookmark->flags = WQ_FLAG_BOOKMARK;
+                  list_add_tail(&bookmark->entry, &next->entry);
+                  break;
+          }
+    }
+
+    return nr_exclusive;
+}
+
+int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+{
+    int ret = default_wake_function(wq_entry, mode, sync, key);
+
+    if (ret)
+        list_del_init_careful(&wq_entry->entry);
+
+    return ret;
+}
+
+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
+                    void *key)
+{
+    WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+    return try_to_wake_up(curr->private, mode, wake_flags);
+}
+
+/**
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Conceptually does:
+ *
+ *   If (@state & @p->state) @p->state = TASK_RUNNING.
+ *
+ * If the task was not queued/runnable, also place it back on a runqueue.
+ *
+ * This function is atomic against schedule() which would dequeue the task.
+ *
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
+ *
+ * Relies on p->pi_lock stabilizing:
+ *  - p->sched_class
+ *  - p->cpus_ptr
+ *  - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
+ *  - ttwu_queue()       -- new rq, for enqueue of the task;
+ *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
+ *
+ * Return: %true if @p->state changes (an actual wakeup was done),
+ *           %false otherwise.
+ */
+ static int
+ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+ {
+     ...
+
+
+ + + + +
+
+ +

Non preemptive kernel

+ +
    +
  • At every tick the kernel checks to see if the current process has +its time slice consumed
  • +
  • If that happens a flag is set in interrupt context
  • +
  • Before returning to userspace the kernel checks this flag and +calls schedule() if needed
  • +
  • In this case tasks are not preempted while running in kernel mode +(e.g. system call) so there are no synchronization issues
  • +
+ + + + +
+
+ +

Preemptive kernel

+ +
    +
  • Tasks can be preempted even when running in kernel mode
  • +
  • It requires new synchronization primitives to be used in critical +sections: preempt_disable and +preempt_enable
  • +
  • Spinlocks also disable preemption
  • +
  • When a thread needs to be preempted a flag is set and action is +taken (e.g. scheduler is called) when preemption is reactivated
  • +
+ + + + +
+
+ +

Process context

+ +

The kernel is executing in process context when it is running a +system call.

+

In process context there is a well defined context and we can +access the current process data with current

+

In process context we can sleep (wait on a condition).

+

In process context we can access the user-space (unless we are +running in a kernel thread context).

+ + + + +
+
+ +

Kernel threads

+ +

Sometimes the kernel core or device drivers need to perform blocking +operations and thus they need to run in process context.

+

Kernel threads are used exactly for this and are a special class of +tasks that don't have "userspace" resources (e.g. no address space or +opened files).

+ + + + +
+
+ +

Inspecting kernel threads

+ +

 

+ + + + +
+
+ +

Quiz: Kernel gdb scripts

+ +

What is the following change of the lx-ps script trying to +accomplish?

+
diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py
+index 17ec19e9b5bf..7e43c163832f 100644
+--- a/scripts/gdb/linux/tasks.py
++++ b/scripts/gdb/linux/tasks.py
+@@ -75,10 +75,13 @@ class LxPs(gdb.Command):
+     def invoke(self, arg, from_tty):
+         gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM"))
+         for task in task_lists():
+-            gdb.write("{} {:^5} {}\n".format(
++            check = task["mm"].format_string() == "0x0"
++            gdb.write("{} {:^5} {}{}{}\n".format(
+                 task.format_string().split()[0],
+                 task["pid"].format_string(),
+-                task["comm"].string()))
++                "[" if check else "",
++                task["comm"].string(),
++                "]" if check else ""))
+
+
+ LxPs()
+
+
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/processes.html b/refs/pull/405/merge/lectures/processes.html new file mode 100644 index 00000000..80aedeca --- /dev/null +++ b/refs/pull/405/merge/lectures/processes.html @@ -0,0 +1,1084 @@ + + + + + + Processes — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Processes

+

View slides

+
+

Lecture objectives

+
    +
  • Process and threads
  • +
  • Context switching
  • +
  • Blocking and waking up
  • +
  • Process context
  • +
+
+
+

Processes and threads

+

A process is an operating system abstraction that groups together +multiple resources:

+
    +
  • An address space
  • +
  • One or more threads
  • +
  • Opened files
  • +
  • Sockets
  • +
  • Semaphores
  • +
+
    +
  • Shared memory regions
  • +
  • Timers
  • +
  • Signal handlers
  • +
  • Many other resources and status information
  • +
+
+

All this information is grouped in the Process Control Block +(PCB). In Linux this is struct task_struct.

+
+

Overview of process resources

+

A summary of the resources a process has can be obtained from the +/proc/<pid> directory, where <pid> is the process id for the +process we want to look at.

+
                +-------------------------------------------------------------------+
+                | dr-x------    2 tavi tavi 0  2021 03 14 12:34 .                   |
+                | dr-xr-xr-x    6 tavi tavi 0  2021 03 14 12:34 ..                  |
+                | lrwx------    1 tavi tavi 64 2021 03 14 12:34 0 -> /dev/pts/4     |
+           +--->| lrwx------    1 tavi tavi 64 2021 03 14 12:34 1 -> /dev/pts/4     |
+           |    | lrwx------    1 tavi tavi 64 2021 03 14 12:34 2 -> /dev/pts/4     |
+           |    | lr-x------    1 tavi tavi 64 2021 03 14 12:34 3 -> /proc/18312/fd |
+           |    +-------------------------------------------------------------------+
+           |                 +----------------------------------------------------------------+
+           |                 | 08048000-0804c000 r-xp 00000000 08:02 16875609 /bin/cat        |
+$ ls -1 /proc/self/          | 0804c000-0804d000 rw-p 00003000 08:02 16875609 /bin/cat        |
+cmdline    |                 | 0804d000-0806e000 rw-p 0804d000 00:00 0 [heap]                 |
+cwd        |                 | ...                                                            |
+environ    |    +----------->| b7f46000-b7f49000 rw-p b7f46000 00:00 0                        |
+exe        |    |            | b7f59000-b7f5b000 rw-p b7f59000 00:00 0                        |
+fd --------+    |            | b7f5b000-b7f77000 r-xp 00000000 08:02 11601524 /lib/ld-2.7.so  |
+fdinfo          |            | b7f77000-b7f79000 rw-p 0001b000 08:02 11601524 /lib/ld-2.7.so  |
+maps -----------+            | bfa05000-bfa1a000 rw-p bffeb000 00:00 0 [stack]                |
+mem                          | ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]                 |
+root                         +----------------------------------------------------------------+
+stat                 +----------------------------+
+statm                |  Name: cat                 |
+status ------+       |  State: R (running)        |
+task         |       |  Tgid: 18205               |
+wchan        +------>|  Pid: 18205                |
+                     |  PPid: 18133               |
+                     |  Uid: 1000 1000 1000 1000  |
+                     |  Gid: 1000 1000 1000 1000  |
+                     +----------------------------+
+
+
+
+
+

struct task_struct

+

Let's take a close look at struct task_struct. For that we +could just look at the source code, but here we will use a tool called +pahole (part of the dwarves install package) in order to get +some insights about this structure:

+
$ pahole -C task_struct vmlinux
+
+struct task_struct {
+    struct thread_info thread_info;                  /*     0     8 */
+    volatile long int          state;                /*     8     4 */
+    void *                     stack;                /*    12     4 */
+
+    ...
+
+    /* --- cacheline 45 boundary (2880 bytes) --- */
+    struct thread_struct thread __attribute__((__aligned__(64))); /*  2880  4288 */
+
+    /* size: 7168, cachelines: 112, members: 155 */
+    /* sum members: 7148, holes: 2, sum holes: 12 */
+    /* sum bitfield members: 7 bits, bit holes: 2, sum bit holes: 57 bits */
+    /* paddings: 1, sum paddings: 2 */
+    /* forced alignments: 6, forced holes: 2, sum forced holes: 12 */
+} __attribute__((__aligned__(64)));
+
+
+

As you can see it is a pretty large data structure: almost 8KB in size +and 155 fields.

+
+
+

Inspecting task_struct

+

The following screencast is going to demonstrate how we can inspect +the process control block (struct task_struct) by connecting +the debugger to the running virtual machine. We are going to use a +helper gdb command lx-ps to list the processes and the address of +the task_struct for each process.

+

 

+
+
+

Quiz: Inspect a task to determine opened files

+

Use the debugger to inspect the process named syslogd.

+
    +
  • What command should we use to list the opened file descriptors?
  • +
  • How many file descriptors are opened?
  • +
  • What command should we use to determine the file name for opened file descriptor 3?
  • +
  • What is the filename for file descriptor 3?
  • +
+
+
+

Threads

+

A thread is the basic unit that the kernel process scheduler uses to +allow applications to run on the CPU. A thread has the following +characteristics:

+
    +
  • Each thread has its own stack and together with the register +values it determines the thread execution state
  • +
  • A thread runs in the context of a process and all threads in the +same process share the resources
  • +
  • The kernel schedules threads not processes and user-level threads +(e.g. fibers, coroutines, etc.) are not visible at the kernel level
  • +
+

The typical thread implementation is one where the thread is +implemented as a separate data structure which is then linked to the +process data structure. For example, the Windows kernel uses such an +implementation:

+

 

+../_images/ditaa-4b5c1874d3924d9716f26d4893a3e4f313bf1c43.png +

Linux uses a different implementation for threads. The basic unit is +called a task (hence the struct task_struct) and it is used +for both threads and processes. Instead of embedding resources in the +task structure it has pointers to these resources.

+

Thus, if two threads are in the same process they will point to the same +resource structure instance. If two threads are in different processes +they will point to different resource structure instances.

+

 

+../_images/ditaa-fd771038e88b95def30ae9bd4df0b7bd6b7b3503.png +
+
+

The clone system call

+

In Linux a new thread or process is created with the clone() +system call. Both the fork() system call and the +pthread_create() function use the clone() +implementation.

+

It allows the caller to decide what resources should be shared with +the parent and which should be copied or isolated:

+
    +
  • CLONE_FILES - shares the file descriptor table with the parent
  • +
  • CLONE_VM - shares the address space with the parent
  • +
  • CLONE_FS - shares the filesystem information (root directory, +current directory) with the parent
  • +
  • CLONE_NEWNS - does not share the mount namespace with the parent
  • +
  • CLONE_NEWIPC - does not share the IPC namespace (System V IPC +objects, POSIX message queues) with the parent
  • +
  • CLONE_NEWNET - does not share the networking namespaces (network +interfaces, routing table) with the parent
  • +
+

For example, if CLONE_FILES | CLONE_VM | CLONE_FS is used by the +caller then effectively a new thread is created. If these flags are +not used then a new process is created.

+
+
+

Namespaces and "containers"

+

"Containers" are a form of lightweight virtual machines that share the +same kernel instance, as opposed to normal virtualization where a +hypervisor runs multiple VMs, each with its own kernel +instance.

+

Examples of container technologies are LXC - that allows running +lightweight "VM" and docker - a specialized container for running a +single application.

+

Containers are built on top of a few kernel features, one of which is +namespaces. They allow isolation of different resources that would +otherwise be globally visible. For example, without containers, all +processes would be visible in /proc. With containers, processes in one +container will not be visible (in /proc or be killable) to other +containers.

+

To achieve this partitioning, the struct nsproxy structure +is used to group types of resources that we want to partition. It +currently supports IPC, networking, cgroup, mount, UTS, PID, +time namespaces. For example, instead of having a global list for +networking interfaces, the list is part of a struct net. The +system initializes with a default namespace (init_net) and by +default all processes will share this namespace. When a new network +namespace is created a new struct net instance is created and then new processes can +point to that new namespace instead of the default one.

+
+
+

Accessing the current process

+

Accessing the current process is a frequent operation:

+
    +
  • opening a file needs access to struct task_struct's +file field
  • +
  • mapping a new file needs access to struct task_struct's +mm field
  • +
  • Over 90% of the system calls need to access the current process +structure so it needs to be fast
  • +
  • The current macro is available to access the current +process's struct task_struct
  • +
+

In order to support fast access in multi processor configurations a +per CPU variable is used to store and retrieve the pointer to the +current struct task_struct:

+

 

+../_images/ditaa-019489e686a2f60f1594e37458cfcb10320eae0f.png +

Previously the following sequence was used as the implementation for +the current macro:

+
/* how to get the current stack pointer from C */
+register unsigned long current_stack_pointer asm("esp") __attribute_used__;
+
+/* how to get the thread information struct from C */
+static inline struct thread_info *current_thread_info(void)
+{
+   return (struct thread_info *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
+#define current current_thread_info()->task
+
+
+
+
+

Quiz: previous implementation for current (x86)

+

What is the size of struct thread_info?

+

Which of the following are potential valid sizes for +struct thread_info: 4095, 4096, 4097?

+
+
+
+

Context switching

+

The following diagram shows an overview of the Linux kernel context +switch process:

+../_images/ditaa-f6b228332baf165f498d8a1bb0bc0bdb91ae50c5.png +

Note that before a context switch can occur we must do a kernel +transition, either with a system call or with an interrupt. At that +point the user space registers are saved on the kernel stack. At some +point the schedule() function will be called which can decide +that a context switch must occur from T0 to T1 (e.g. because the +current thread is blocking waiting for an I/O operation to complete or +because its allocated time slice has expired).

+

At that point context_switch() will perform architecture +specific operations and will switch the address space if needed:

+
static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+         struct task_struct *next, struct rq_flags *rf)
+{
+    prepare_task_switch(rq, prev, next);
+
+    /*
+     * For paravirt, this is coupled with an exit in switch_to to
+     * combine the page table reload and the switch backend into
+     * one hypercall.
+     */
+    arch_start_context_switch(prev);
+
+    /*
+     * kernel -> kernel   lazy + transfer active
+     *   user -> kernel   lazy + mmgrab() active
+     *
+     * kernel ->   user   switch + mmdrop() active
+     *   user ->   user   switch
+     */
+    if (!next->mm) {                                // to kernel
+        enter_lazy_tlb(prev->active_mm, next);
+
+        next->active_mm = prev->active_mm;
+        if (prev->mm)                           // from user
+            mmgrab(prev->active_mm);
+        else
+            prev->active_mm = NULL;
+    } else {                                        // to user
+        membarrier_switch_mm(rq, prev->active_mm, next->mm);
+        /*
+         * sys_membarrier() requires an smp_mb() between setting
+         * rq->curr / membarrier_switch_mm() and returning to userspace.
+         *
+         * The below provides this either through switch_mm(), or in
+         * case 'prev->active_mm == next->mm' through
+         * finish_task_switch()'s mmdrop().
+         */
+        switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+        if (!prev->mm) {                        // from kernel
+            /* will mmdrop() in finish_task_switch(). */
+            rq->prev_mm = prev->active_mm;
+            prev->active_mm = NULL;
+        }
+    }
+
+    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+
+    prepare_lock_switch(rq, next, rf);
+
+    /* Here we just switch the register state and the stack. */
+    switch_to(prev, next, prev);
+    barrier();
+
+    return finish_task_switch(prev);
+  }
+
+
+

Then it will call the architecture specific switch_to +implementation to switch the registers state and kernel stack. Note +that registers are saved on stack and that the stack pointer is saved +in the task structure:

+
#define switch_to(prev, next, last)               \
+do {                                              \
+    ((last) = __switch_to_asm((prev), (next)));   \
+} while (0)
+
+
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+.pushsection .text, "ax"
+SYM_CODE_START(__switch_to_asm)
+    /*
+     * Save callee-saved registers
+     * This must match the order in struct inactive_task_frame
+     */
+    pushl   %ebp
+    pushl   %ebx
+    pushl   %edi
+    pushl   %esi
+    /*
+     * Flags are saved to prevent AC leakage. This could go
+     * away if objtool would have 32bit support to verify
+     * the STAC/CLAC correctness.
+     */
+    pushfl
+
+    /* switch stack */
+    movl    %esp, TASK_threadsp(%eax)
+    movl    TASK_threadsp(%edx), %esp
+
+  #ifdef CONFIG_STACKPROTECTOR
+    movl    TASK_stack_canary(%edx), %ebx
+    movl    %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+  #endif
+
+  #ifdef CONFIG_RETPOLINE
+    /*
+     * When switching from a shallower to a deeper call stack
+     * the RSB may either underflow or use entries populated
+     * with userspace addresses. On CPUs where those concerns
+     * exist, overwrite the RSB with entries which capture
+     * speculative execution to prevent attack.
+     */
+    FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+    #endif
+
+    /* Restore flags or the incoming task to restore AC state. */
+    popfl
+    /* restore callee-saved registers */
+    popl    %esi
+    popl    %edi
+    popl    %ebx
+    popl    %ebp
+
+    jmp     __switch_to
+  SYM_CODE_END(__switch_to_asm)
+  .popsection
+
+
+

You can notice that the instruction pointer is not explicitly +saved. It is not needed because:

+
+
    +
  • a task will always resume in this function
  • +
  • the schedule() (context_switch() is always +inlined) caller's return address is saved on the kernel stack
  • +
  • a jmp is used to execute __switch_to() which is a function +and when it returns it will pop the original (next task) return +address from the stack
  • +
+
+

The following screencast uses the debugger to set up a breakpoint in +__switch_to_asm and examine the stack during the context switch:

+

 

+
+

Quiz: context switch

+

We are executing a context switch. Select all of the statements that are true.

+
    +
  • the ESP register is saved in the task structure
  • +
  • the EIP register is saved in the task structure
  • +
  • general registers are saved in the task structure
  • +
  • the ESP register is saved on the stack
  • +
  • the EIP register is saved on the stack
  • +
  • general registers are saved on the stack
  • +
+
+
+
+

Blocking and waking up tasks

+
+

Task states

+

The following diagram shows the task (thread) states and the +possible transitions between them:

+../_images/ditaa-0b8cde2be9bbd195ac9dcaeac978a8bbe0d3b805.png +
+
+

Blocking the current thread

+

Blocking the current thread is an important operation we need to +perform to implement efficient task scheduling - we want to run other +threads while I/O operations complete.

+

In order to accomplish this the following operations take place:

+
    +
  • Set the current thread state to TASK_UNINTERRUPTIBLE or +TASK_INTERRUPTIBLE
  • +
  • Add the task to a waiting queue
  • +
  • Call the scheduler which will pick up a new task from the READY +queue
  • +
  • Do the context switch to the new task
  • +
+

Below are some snippets for the wait_event +implementation. Note that the waiting queue is a list with some extra +information like a pointer to the task struct.

+

Also note that a lot of effort is put into making sure no deadlock can +occur between wait_event and wake_up: the task +is added to the list before checking condition, signals are +checked before calling schedule().

+
/**
+ * wait_event - sleep until a condition gets true
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ */
+#define wait_event(wq_head, condition)            \
+do {                                              \
+  might_sleep();                                  \
+  if (condition)                                  \
+          break;                                  \
+  __wait_event(wq_head, condition);               \
+} while (0)
+
+#define __wait_event(wq_head, condition)                                  \
+    (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,   \
+                        schedule())
+
+/*
+ * The below macro ___wait_event() has an explicit shadow of the __ret
+ * variable when used from the wait_event_*() macros.
+ *
+ * This is so that both can use the ___wait_cond_timeout() construct
+ * to wrap the condition.
+ *
+ * The type inconsistency of the wait_event_*() __ret variable is also
+ * on purpose; we use long where we can return timeout values and int
+ * otherwise.
+ */
+#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)    \
+({                                                                       \
+    __label__ __out;                                                     \
+    struct wait_queue_entry __wq_entry;                                  \
+    long __ret = ret;       /* explicit shadow */                        \
+                                                                         \
+    init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);     \
+    for (;;) {                                                           \
+        long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
+                                                                         \
+        if (condition)                                                   \
+            break;                                                       \
+                                                                         \
+        if (___wait_is_interruptible(state) && __int) {                  \
+            __ret = __int;                                               \
+            goto __out;                                                  \
+        }                                                                \
+                                                                         \
+        cmd;                                                             \
+    }                                                                    \
+    finish_wait(&wq_head, &__wq_entry);                                  \
+   __out:  __ret;                                                        \
+ })
+
+ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
+ {
+    wq_entry->flags = flags;
+    wq_entry->private = current;
+    wq_entry->func = autoremove_wake_function;
+    INIT_LIST_HEAD(&wq_entry->entry);
+ }
+
+ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+ {
+     unsigned long flags;
+     long ret = 0;
+
+     spin_lock_irqsave(&wq_head->lock, flags);
+     if (signal_pending_state(state, current)) {
+         /*
+          * Exclusive waiter must not fail if it was selected by wakeup,
+          * it should "consume" the condition we were waiting for.
+          *
+          * The caller will recheck the condition and return success if
+          * we were already woken up, we can not miss the event because
+          * wakeup locks/unlocks the same wq_head->lock.
+          *
+          * But we need to ensure that set-condition + wakeup after that
+          * can't see us, it should wake up another exclusive waiter if
+          * we fail.
+          */
+         list_del_init(&wq_entry->entry);
+         ret = -ERESTARTSYS;
+     } else {
+         if (list_empty(&wq_entry->entry)) {
+             if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
+                 __add_wait_queue_entry_tail(wq_head, wq_entry);
+             else
+                 __add_wait_queue(wq_head, wq_entry);
+         }
+         set_current_state(state);
+     }
+     spin_unlock_irqrestore(&wq_head->lock, flags);
+
+     return ret;
+ }
+
+ static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     list_add(&wq_entry->entry, &wq_head->head);
+ }
+
+ static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     list_add_tail(&wq_entry->entry, &wq_head->head);
+ }
+
+ /**
+  * finish_wait - clean up after waiting in a queue
+  * @wq_head: waitqueue waited on
+  * @wq_entry: wait descriptor
+  *
+  * Sets current thread back to running state and removes
+  * the wait descriptor from the given waitqueue if still
+  * queued.
+  */
+ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     unsigned long flags;
+
+     __set_current_state(TASK_RUNNING);
+     /*
+      * We can check for list emptiness outside the lock
+      * IFF:
+      *  - we use the "careful" check that verifies both
+      *    the next and prev pointers, so that there cannot
+      *    be any half-pending updates in progress on other
+      *    CPU's that we haven't seen yet (and that might
+      *    still change the stack area.
+      * and
+      *  - all other users take the lock (ie we can only
+      *    have _one_ other CPU that looks at or modifies
+      *    the list).
+      */
+     if (!list_empty_careful(&wq_entry->entry)) {
+         spin_lock_irqsave(&wq_head->lock, flags);
+         list_del_init(&wq_entry->entry);
+         spin_unlock_irqrestore(&wq_head->lock, flags);
+     }
+ }
+
+
+
+
+

Waking up a task

+

We can wake-up tasks by using the wake_up primitive. The +following high level operations are performed to wake up a task:

+
    +
  • Select a task from the waiting queue
  • +
  • Set the task state to TASK_RUNNING (i.e. ready to run)
  • +
  • Insert the task into the scheduler's READY queue
  • +
  • On SMP system this is a complex operation: each processor has its +own queue, queues need to be balanced, CPUs needs to be signaled
  • +
+
#define wake_up(x)                        __wake_up(x, TASK_NORMAL, 1, NULL)
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @wq_head: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+               int nr_exclusive, void *key)
+{
+    __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+                  int nr_exclusive, int wake_flags, void *key)
+{
+  unsigned long flags;
+  wait_queue_entry_t bookmark;
+
+  bookmark.flags = 0;
+  bookmark.private = NULL;
+  bookmark.func = NULL;
+  INIT_LIST_HEAD(&bookmark.entry);
+
+  do {
+          spin_lock_irqsave(&wq_head->lock, flags);
+          nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+                                          wake_flags, key, &bookmark);
+          spin_unlock_irqrestore(&wq_head->lock, flags);
+  } while (bookmark.flags & WQ_FLAG_BOOKMARK);
+}
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+                            int nr_exclusive, int wake_flags, void *key,
+                  wait_queue_entry_t *bookmark)
+{
+    wait_queue_entry_t *curr, *next;
+    int cnt = 0;
+
+    lockdep_assert_held(&wq_head->lock);
+
+    if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+          curr = list_next_entry(bookmark, entry);
+
+          list_del(&bookmark->entry);
+          bookmark->flags = 0;
+    } else
+          curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+    if (&curr->entry == &wq_head->head)
+          return nr_exclusive;
+
+    list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
+          unsigned flags = curr->flags;
+          int ret;
+
+          if (flags & WQ_FLAG_BOOKMARK)
+                  continue;
+
+          ret = curr->func(curr, mode, wake_flags, key);
+          if (ret < 0)
+                  break;
+          if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+                  break;
+
+          if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+                          (&next->entry != &wq_head->head)) {
+                  bookmark->flags = WQ_FLAG_BOOKMARK;
+                  list_add_tail(&bookmark->entry, &next->entry);
+                  break;
+          }
+    }
+
+    return nr_exclusive;
+}
+
+int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+{
+    int ret = default_wake_function(wq_entry, mode, sync, key);
+
+    if (ret)
+        list_del_init_careful(&wq_entry->entry);
+
+    return ret;
+}
+
+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
+                    void *key)
+{
+    WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+    return try_to_wake_up(curr->private, mode, wake_flags);
+}
+
+/**
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Conceptually does:
+ *
+ *   If (@state & @p->state) @p->state = TASK_RUNNING.
+ *
+ * If the task was not queued/runnable, also place it back on a runqueue.
+ *
+ * This function is atomic against schedule() which would dequeue the task.
+ *
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
+ *
+ * Relies on p->pi_lock stabilizing:
+ *  - p->sched_class
+ *  - p->cpus_ptr
+ *  - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
+ *  - ttwu_queue()       -- new rq, for enqueue of the task;
+ *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
+ *
+ * Return: %true if @p->state changes (an actual wakeup was done),
+ *           %false otherwise.
+ */
+ static int
+ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+ {
+     ...
+
+
+
+
+
+

Preempting tasks

+

Up until this point we looked at how context switches occur voluntarily +between threads. Next we will look at how preemption is handled. We +will start with the simpler case where the kernel is configured as +non preemptive and then we will move to the preemptive kernel case.

+
+

Non preemptive kernel

+
    +
  • At every tick the kernel checks to see if the current process has +its time slice consumed
  • +
  • If that happens a flag is set in interrupt context
  • +
  • Before returning to userspace the kernel checks this flag and +calls schedule() if needed
  • +
  • In this case tasks are not preempted while running in kernel mode +(e.g. system call) so there are no synchronization issues
  • +
+
+
+

Preemptive kernel

+

In this case the current task can be preempted even if we are running +in kernel mode and executing a system call. This requires using +special synchronization primitives: preempt_disable and +preempt_enable.

+

In order to simplify handling for preemptive kernels and since +synchronization primitives are needed for the SMP case anyway, +preemption is disabled automatically when a spinlock is used.

+

As before, if we run into a condition that requires the preemption of +the current task (its time slice has expired) a flag is set. This +flag is checked whenever the preemption is reactivated, e.g. when +exiting a critical section through a spin_unlock() and if +needed the scheduler is called to select a new task.

+
+
+
+

Process context

+

Now that we have examined the implementation of processes and threads +(tasks), how context switching occurs, how we can block, wake-up and +preempt tasks, we can finally define what the process context is and what +its properties are:

+

The kernel is executing in process context when it is running a +system call.

+

In process context there is a well defined context and we can +access the current process data with current

+

In process context we can sleep (wait on a condition).

+

In process context we can access the user-space (unless we are +running in a kernel thread context).

+
+

Kernel threads

+

Sometimes the kernel core or device drivers need to perform blocking +operations and thus they need to run in process context.

+

Kernel threads are used exactly for this and are a special class of +tasks that don't have "userspace" resources (e.g. no address space or +opened files).

+

The following screencast takes a closer look at kernel threads:

+

 

+
+
+
+

Using gdb scripts for kernel inspection

+

The Linux kernel comes with a predefined set of gdb extra commands we +can use to inspect the kernel during debugging. They will +automatically be loaded as long as gdbinit is properly set up

+
ubuntu@so2:/linux/tools/labs$ cat ~/.gdbinit
+add-auto-load-safe-path /linux/scripts/gdb/vmlinux-gdb.py
+
+
+

All of the kernel specific commands are prefixed with lx-. You can use +TAB in gdb to list all of them:

+
(gdb) lx-
+lx-clk-summary        lx-dmesg              lx-mounts
+lx-cmdline            lx-fdtdump            lx-ps
+lx-configdump         lx-genpd-summary      lx-symbols
+lx-cpus               lx-iomem              lx-timerlist
+lx-device-list-bus    lx-ioports            lx-version
+lx-device-list-class  lx-list-check
+lx-device-list-tree   lx-lsmod
+
+
+

The implementation of the commands can be found at +scripts/gdb/linux. Let's take a closer look at the lx-ps +implementation:

+
task_type = utils.CachedType("struct task_struct")
+
+
+def task_lists():
+ task_ptr_type = task_type.get_type().pointer()
+ init_task = gdb.parse_and_eval("init_task").address
+ t = g = init_task
+
+ while True:
+     while True:
+         yield t
+
+         t = utils.container_of(t['thread_group']['next'],
+                                task_ptr_type, "thread_group")
+         if t == g:
+             break
+
+     t = g = utils.container_of(g['tasks']['next'],
+                                task_ptr_type, "tasks")
+     if t == init_task:
+         return
+
+
+ class LxPs(gdb.Command):
+ """Dump Linux tasks."""
+
+ def __init__(self):
+     super(LxPs, self).__init__("lx-ps", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, from_tty):
+     gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM"))
+     for task in task_lists():
+         gdb.write("{} {:^5} {}\n".format(
+             task.format_string().split()[0],
+             task["pid"].format_string(),
+             task["comm"].string()))
+
+
+
+

Quiz: Kernel gdb scripts

+

What is the following change of the lx-ps script trying to +accomplish?

+
diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py
+index 17ec19e9b5bf..7e43c163832f 100644
+--- a/scripts/gdb/linux/tasks.py
++++ b/scripts/gdb/linux/tasks.py
+@@ -75,10 +75,13 @@ class LxPs(gdb.Command):
+     def invoke(self, arg, from_tty):
+         gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM"))
+         for task in task_lists():
+-            gdb.write("{} {:^5} {}\n".format(
++            check = task["mm"].format_string() == "0x0"
++            gdb.write("{} {:^5} {}{}{}\n".format(
+                 task.format_string().split()[0],
+                 task["pid"].format_string(),
+-                task["comm"].string()))
++                "[" if check else "",
++                task["comm"].string(),
++                "]" if check else ""))
+
+
+ LxPs()
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/smp-slides.html b/refs/pull/405/merge/lectures/smp-slides.html new file mode 100644 index 00000000..7c6e375c --- /dev/null +++ b/refs/pull/405/merge/lectures/smp-slides.html @@ -0,0 +1,811 @@ + + + + + + + + Symmetric Multi-Processing — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Symmetric Multi-Processing

+ +
    +
  • Kernel Concurrency
  • +
  • Atomic operations
  • +
  • Spin locks
  • +
  • Cache thrashing
  • +
  • Optimized spin locks
  • +
  • Process and Interrupt Context Synchronization
  • +
  • Mutexes
  • +
  • Per CPU data
  • +
  • Memory Ordering and Barriers
  • +
  • Read-Copy Update
  • +
+ + + + +
+
+ +

Race conditions

+ +
    +
  • there are at least two execution contexts that run in "parallel":
      +
    • truly run in parallel (e.g. two system calls running on +different processors)
    • +
    • one of the contexts can arbitrarily preempt the other (e.g. an +interrupt preempts a system call)
    • +
    +
  • +
  • the execution contexts perform read-write accesses to shared +memory
  • +
+ + + + +
+
+ +

Race condition: resource counter release

+ +
void release_resource()
+{
+     counter--;   /* the decrement and the test below are two separate steps */
+
+     if (!counter)
+         free_resource();
+}
+
+
+ + + + +
+
+ +

Race condition scenario

+ +

 

+../_images/ditaa-35f7597b35b83bb0025ac2a5f158c9eae23050c8.png + + + + +
+
+ +

Avoiding race conditions

+ +
    +
  • make the critical section atomic (e.g. use atomic +instructions)
  • +
  • disable preemption during the critical section (e.g. disable +interrupts, bottom-half handlers, or thread preemption)
  • +
  • serialize the access to the critical section (e.g. use spin +locks or mutexes to allow only one context or thread in the +critical section)
  • +
+ + + + +
+
+ +

Linux kernel concurrency sources

+ +
    +
  • single core systems, non-preemptive kernel: the current +process can be preempted by interrupts
  • +
  • single core systems, preemptive kernel: above + the +current process can be preempted by other processes
  • +
  • multi-core systems: above + the current process can run +in parallel with another process or with an interrupt running on +another processor
  • +
+ + + + +
+
+ +

Atomic operations

+ +
    +
  • integer based:
      +
    • simple: atomic_inc(), atomic_dec(), +atomic_add(), atomic_sub()
    • +
    • conditional: atomic_dec_and_test(), atomic_sub_and_test()
    • +
    +
  • +
  • bit based:
      +
    • simple: test_bit(), set_bit(), +change_bit()
    • +
    • conditional: test_and_set_bit(), test_and_clear_bit(), +test_and_change_bit()
    • +
    +
  • +
+ + + + +
+
+ +

Using atomic_dec_and_test() to implement resource counter release

+ +
void release_resource()
+{
+    if (atomic_dec_and_test(&counter))
+         free_resource();
+}
+
+
+ + + + +
+
+ +

Atomic operations may not be atomic on SMP systems

+ +

 

+../_images/ditaa-ddd14be50300088958e86912bc5f396797634a3a.png + + + + +
+
+ +

Fixing atomic operations for SMP systems (x86)

+ +

 

+../_images/ditaa-c11fccb956cdf115910f9f72e1dc14cd7ed549ff.png + + + + +
+
+ +

Synchronization with interrupts (x86)

+ +
 #define local_irq_disable() \
+     asm volatile („cli” : : : „memory”)
+
+#define local_irq_enable() \
+    asm volatile („sti” : : : „memory”)
+
+#define local_irq_save(flags) \
+    asm volatile ("pushf ; pop %0" :"=g" (flags)
+                  : /* no input */: "memory") \
+    asm volatile("cli": : :"memory")
+
+#define local_irq_restore(flags) \
+    asm volatile ("push %0 ; popf"
+                  : /* no output */
+                  : "g" (flags) :"memory", "cc");
+
+
+ + + + +
+
+ +

Spin Lock Implementation Example (x86)

+ +
spin_lock:
+    lock bts [my_lock], 0
+    jc spin_lock
+
+/* critical section */
+
+spin_unlock:
+    mov [my_lock], 0
+
+
+

bts dts, src - bit test and set; it copies the src bit from the dts +memory address to the carry flag and then sets it:

+
CF <- dts[src]
+dts[src] <- 1
+
+
+ + + + +
+
+ +

Lock Contention

+ +
    +
  • There is lock contention when at least one core spins trying to +enter the critical section lock
  • +
  • Lock contention grows with the critical section size, time spent +in the critical section and the number of cores in the system
  • +
+ + + + +
+
+ +

Cache Thrashing

+ +

Cache thrashing occurs when multiple cores are trying to read and +write to the same memory resulting in excessive cache misses.

+

Since spin locks continuously access memory during lock contention, +cache thrashing is a common occurrence due to the way cache +coherency is implemented.

+ + + + +
+
+ +

Synchronized caches and memory

+ +

 

+../_images/ditaa-4d63c157487ff8291f2a6e93fe680ec38c1a3212.png + + + + +
+
+ +

Unsynchronized caches and memory

+ +

 

+../_images/ditaa-7ee0f9bb5f5af586e043afd47cfbad0adcc34888.png + + + + +
+
+ +

Cache Coherency Protocols

+ +
    +
  • Bus snooping (sniffing) based: memory bus transactions are +monitored by caches and they take actions to preserve +coherency
  • +
  • Directory based: there is a separate entity (directory) that +maintains the state of caches; caches interact with directory +to preserve coherency
  • +
+

Bus snooping is simpler but it performs poorly when the number of +cores goes beyond 32-64.

+

Directory based cache coherence protocols scale much better (up +to thousands of cores) and are usually used in NUMA systems.

+ + + + +
+
+ +

MESI Cache Coherence Protocol

+ +
    +
  • Caching policy: write back
  • +
  • Cache line states
      +
    • Modified: owned by a single core and dirty
    • +
    • Exclusive: owned by a single core and clean
    • +
    • Shared: shared between multiple cores and clean
    • +
    • Invalid : the line is not cached
    • +
    +
  • +
+ + + + +
+
+ +

MESI State Transitions

+ +
    +
  • Invalid -> Exclusive: read request, all other cores have the line +in Invalid; line loaded from memory
  • +
  • Invalid -> Shared: read request, at least one core has the line +in Shared or Exclusive; line loaded from sibling cache
  • +
  • Invalid/Shared/Exclusive -> Modified: write request; all +other cores invalidate the line
  • +
  • Modified -> Invalid: write request from other core; line is +flushed to memory
  • +
+ + + + +
+
+ +

Cache thrashing due to spin lock contention

+ +

 

+../_images/ditaa-b26d802c286bda6c559b4dcfa8a7fb27f840463e.png + + + + +
+
+ +

Optimized spin lock (KeAcquireSpinLock)

+ +

 

+
spin_lock:
+    rep ; nop
+    test lock_addr, 1
+    jnz spin_lock
+    lock bts lock_addr
+    jc spin_lock
+
+
+
    +
  • we first test the lock read-only, using non-atomic +instructions, to avoid writes and thus invalidate operations +while we spin
  • +
  • only when the lock might be free, we try to acquire it
  • +
+ + + + +
+
+ +

Queued Spin Locks

+ +

 

+../_images/ditaa-58545831034f050660727be99cede213bc4a53c7.png + + + + +
+
+ +

Process and Interrupt Handler Synchronization Deadlock

+ +
    +
  • In the process context we take the spin lock
  • +
  • An interrupt occurs and it is scheduled on the same CPU core
  • +
  • The interrupt handler runs and tries to take the spin lock
  • +
  • The current CPU will deadlock
  • +
+ + + + +
+
+ +

Interrupt Synchronization for SMP

+ +
    +
  • In process context: disable interrupts and acquire a spin lock; +this will protect against race conditions with both interrupts and +other CPU cores (spin_lock_irqsave() and +spin_unlock_irqrestore() combine the two operations)
  • +
  • In interrupt context: take a spin lock; this will protect +against race conditions with other interrupt handlers or process +context running on different processors
  • +
+ + + + +
+
+ +

Bottom-Half Synchronization for SMP

+ +
    +
  • In process context use spin_lock_bh() (which combines +local_bh_disable() and spin_lock()) and +spin_unlock_bh() (which combines spin_unlock() and +local_bh_enable())
  • +
  • In bottom half context use: spin_lock() and +spin_unlock() (or spin_lock_irqsave() and +spin_unlock_irqrestore() if sharing data with interrupt +handlers)
  • +
+ + + + +
+
+ +

Preemption

+ +

 

+

Preemption is configurable: when active it provides better latency +and response time, while when deactivated it provides better +throughput.

+

Preemption is disabled by spin locks and mutexes but it can be +manually disabled as well (by core kernel code).

+ + + + +
+
+ +

Preemption and Bottom-Half Masking

+ +
#define PREEMPT_BITS      8
+#define SOFTIRQ_BITS      8
+#define HARDIRQ_BITS      4
+#define NMI_BITS          1
+
+#define preempt_disable() preempt_count_inc()
+
+#define local_bh_disable() add_preempt_count(SOFTIRQ_OFFSET)
+
+#define local_bh_enable() sub_preempt_count(SOFTIRQ_OFFSET)
+
+#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
+
+#define in_interrupt() irq_count()
+
+asmlinkage void do_softirq(void)
+{
+    if (in_interrupt()) return;
+    ...
+
+
+ + + + +
+
+ +

Mutexes

+ +
    +
  • They don't "waste" CPU cycles; system throughput is better than +with spin locks if the context switch overhead is lower than the average +spinning time
  • +
  • They can't be used in interrupt context
  • +
  • They have a higher latency than spin locks
  • +
+ + + + +
+
+ +

mutex_lock() fast path

+ +
void __sched mutex_lock(struct mutex *lock)
+{
+  might_sleep();
+
+  if (!__mutex_trylock_fast(lock))
+    __mutex_lock_slowpath(lock);
+}
+
+static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
+{
+  unsigned long curr = (unsigned long)current;
+
+  if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+    return true;
+
+  return false;
+}
+
+
+ + + + +
+
+ +

mutex_lock() slow path

+ +
...
+  spin_lock(&lock->wait_lock);
+...
+  /* add waiting tasks to the end of the waitqueue (FIFO): */
+  list_add_tail(&waiter.list, &lock->wait_list);
+...
+  waiter.task = current;
+...
+  for (;;) {
+    /* wait_lock is held here on every iteration */
+    if (__mutex_trylock(lock))
+      goto acquired;
+  ...
+    /* drop wait_lock for the elided part, retake it before retrying */
+    spin_unlock(&lock->wait_lock);
+  ...
+    set_current_state(state);
+    spin_lock(&lock->wait_lock);
+  }
+  spin_lock(&lock->wait_lock);
+acquired:
+  /* reached with wait_lock held */
+  __set_current_state(TASK_RUNNING);
+  mutex_remove_waiter(lock, &waiter, current);
+  /* release wait_lock taken above; taking it again here would self-deadlock */
+  spin_unlock(&lock->wait_lock);
+...
+
+
+ + + + +
+
+ +

mutex_unlock() fast path

+ +
void __sched mutex_unlock(struct mutex *lock)
+{
+  if (__mutex_unlock_fast(lock))
+    return;
+  __mutex_unlock_slowpath(lock, _RET_IP_);
+}
+
+static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
+{
+  unsigned long curr = (unsigned long)current;
+
+  if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
+    return true;
+
+  return false;
+}
+
+void __mutex_lock_slowpath(struct mutex *lock)
+{
+...
+  if (__mutex_waiter_is_first(lock, &waiter))
+          __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+...
+
+
+ + + + +
+
+ +

mutex_unlock() slow path

+ +
...
+spin_lock(&lock->wait_lock);
+if (!list_empty(&lock->wait_list)) {
+  /* get the first entry from the wait-list: */
+  struct mutex_waiter *waiter;
+  waiter = list_first_entry(&lock->wait_list, struct mutex_waiter,
+                            list);
+  next = waiter->task;
+  wake_q_add(&wake_q, next);
+}
+...
+spin_unlock(&lock->wait_lock);
+...
+wake_up_q(&wake_q);
+
+
+ + + + +
+
+ +

Per CPU data

+ +
    +
  • No need to synchronize to access the data
  • +
  • No contention, no performance impact
  • +
  • Well suited for distributed processing where aggregation is only +seldom necessary (e.g. statistics counters)
  • +
+ + + + +
+
+ +

Out of Order Compiler Generated Code

+ + ++++ + + + + + + + + +
C codeCompiler generated code
a = 1;
+b = 2;
+
+
+
MOV R10, 1
+MOV R11, 2
+STORE R11, b
+STORE R10, a
+
+
+
+ + + + +
+
+ +

Barriers

+ +
    +
  • A read barrier (rmb(), smp_rmb()) is used to +make sure that no read operation crosses the barrier; that is, +all read operations before the barrier are complete before +executing the first instruction after the barrier
  • +
  • A write barrier (wmb(), smp_wmb()) is used to +make sure that no write operation crosses the barrier
  • +
  • A simple barrier (mb(), smp_mb()) is used +to make sure that no write or read operation crosses the barrier
  • +
+ + + + +
+
+ +

Read Copy Update (RCU)

+ +
    +
  • Read-only lock-less access at the same time with write access
  • +
  • Write accesses still require locks in order to avoid races +between writers
  • +
  • Requires unidirectional traversal by readers
  • +
+ + + + +
+
+ +

Removal and Reclamation

+ +
    +
  • Removal: removes references to elements. Some old readers may +still see the old reference so we can't free the element.
  • +
  • Reclamation: free the element. This action is postponed until +all existing readers finish traversal (quiescent cycle). New +readers won't affect the quiescent cycle.
  • +
+ + + + +
+
+ +

RCU List Delete

+ +

 

+../_images/ditaa-5193a924360bebc83d2f81188cd0b0093ec01e6a.png + + + + +
+
+ +

RCU list APIs cheat sheet

+ +
/* list traversal */
+rcu_read_lock();
+list_for_each_entry_rcu(i, head) {
+  /* no sleeping, blocking calls or context switch allowed */
+}
+rcu_read_unlock();
+
+
+/* list element delete  */
+spin_lock(&lock);
+list_del_rcu(&node->list);
+spin_unlock(&lock);
+synchronize_rcu();
+kfree(node);
+
+/* list element add  */
+spin_lock(&lock);
+list_add_rcu(&node->list, head);
+spin_unlock(&lock);
+
+
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/smp.html b/refs/pull/405/merge/lectures/smp.html new file mode 100644 index 00000000..b640966b --- /dev/null +++ b/refs/pull/405/merge/lectures/smp.html @@ -0,0 +1,884 @@ + + + + + + Symmetric Multi-Processing — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Symmetric Multi-Processing

+

View slides

+
+

Lecture objectives:

+
    +
  • Kernel Concurrency
  • +
  • Atomic operations
  • +
  • Spin locks
  • +
  • Cache thrashing
  • +
  • Optimized spin locks
  • +
  • Process and Interrupt Context Synchronization
  • +
  • Mutexes
  • +
  • Per CPU data
  • +
  • Memory Ordering and Barriers
  • +
  • Read-Copy Update
  • +
+
+
+

Synchronization basics

+

Because the Linux kernel supports symmetric multi-processing (SMP) it +must use a set of synchronization mechanisms to achieve predictable +results, free of race conditions.

+
+

Note

+

We will use the terms core, CPU and processor as +interchangeable for the purpose of this lecture.

+
+

Race conditions can occur when the following two conditions happen +simultaneously:

+
    +
  • there are at least two execution contexts that run in "parallel":
      +
    • truly run in parallel (e.g. two system calls running on +different processors)
    • +
    • one of the contexts can arbitrarily preempt the other (e.g. an +interrupt preempts a system call)
    • +
    +
  • +
  • the execution contexts perform read-write accesses to shared +memory
  • +
+

Race conditions can lead to erroneous results that are hard to debug, +because they manifest only when the execution contexts are scheduled +on the CPU cores in a very specific order.

+

A classical race condition example is an incorrect implementation for +a release operation of a resource counter:

+
void release_resource()
+{
+     counter--;
+
+     if (!counter)
+         free_resource();
+}
+
+
+

A resource counter is used to keep a shared resource available until +the last user releases it but the above implementation has a race +condition that can cause freeing the resource twice:

+

 

+../_images/ditaa-35f7597b35b83bb0025ac2a5f158c9eae23050c8.png +

In most cases the release_resource() function will only free the +resource once. However, in the scenario above, if thread A is +preempted right after decrementing counter and thread B calls +release_resource() it will cause the resource to be freed. When +resumed, thread A will also free the resource since the counter value +is 0.

+

To avoid race conditions the programmer must first identify the +critical section that can generate a race condition. The critical +section is the part of the code that reads and writes shared memory +from multiple parallel contexts.

+

In the example above, the minimal critical section is starting with +the counter decrement and ending with checking the counter's value.

+

Once the critical section has been identified race conditions can be +avoided by using one of the following approaches:

+
    +
  • make the critical section atomic (e.g. use atomic +instructions)
  • +
  • disable preemption during the critical section (e.g. disable +interrupts, bottom-half handlers, or thread preemption)
  • +
  • serialize the access to the critical section (e.g. use spin +locks or mutexes to allow only one context or thread in the +critical section)
  • +
+
+
+

Linux kernel concurrency sources

+

There are multiple sources of concurrency in the Linux kernel that +depend on the kernel configuration as well as the type of system it +runs on:

+
    +
  • single core systems, non-preemptive kernel: the current +process can be preempted by interrupts
  • +
  • single core systems, preemptive kernel: above + the +current process can be preempted by other processes
  • +
  • multi-core systems: above + the current process can run +in parallel with another process or with an interrupt running on +another processor
  • +
+
+

Note

+

We only discuss kernel concurrency and that is why a +non-preemptive kernel running on a single core system +has interrupts as the only source of concurrency.

+
+
+
+

Atomic operations

+

In certain circumstances we can avoid race conditions by using atomic +operations that are provided by hardware. Linux provides a unified API +to access atomic operations:

+
    +
  • integer based:
      +
    • simple: atomic_inc(), atomic_dec(), +atomic_add(), atomic_sub()
    • +
    • conditional: atomic_dec_and_test(), atomic_sub_and_test()
    • +
    +
  • +
  • bit based:
      +
    • simple: test_bit(), set_bit(), +change_bit()
    • +
    • conditional: test_and_set_bit(), test_and_clear_bit(), +test_and_change_bit()
    • +
    +
  • +
+

For example, we could use atomic_dec_and_test() to make +the resource counter decrement and value checking atomic:

+
void release_resource()
+{
+    if (atomic_dec_and_test(&counter))
+         free_resource();
+}
+
+
+

One complication with atomic operations is encountered in +multi-core systems, where an atomic operation is no longer +atomic at the system level (but still atomic at the core level).

+

To understand why, we need to decompose the atomic operation in memory +loads and stores. Then we can construct race condition scenarios where +the load and store operations are interleaved across CPUs, like in the +example below where incrementing a value from two processors will +produce an unexpected result:

+

 

+../_images/ditaa-ddd14be50300088958e86912bc5f396797634a3a.png +

In order to provide atomic operations on SMP systems different +architectures use different techniques. For example, on x86 a LOCK +prefix is used to lock the system bus while executing the prefixed +operation:

+

 

+../_images/ditaa-c11fccb956cdf115910f9f72e1dc14cd7ed549ff.png +

On ARM the LDREX and STREX instructions are used together to guarantee +atomic access: LDREX loads a value and signals the exclusive monitor +that an atomic operation is in progress. The STREX attempts to store a +new value but only succeeds if the exclusive monitor has not detected +other exclusive operations. So, to implement atomic operations the +programmer must retry the operation (both LDREX and STREX) until the +exclusive monitor signals a success.

+

Although they are often interpreted as "light" or "efficient" +synchronization mechanisms (because they "don't require spinning or +context switches", or because they "are implemented in hardware so +they must be more efficient", or because they "are just instructions +so they must have similar efficiency as other instructions"), as seen +from the implementation details, atomic operations are actually +expensive.

+
+
+

Disabling preemption (interrupts)

+

On single core systems and non preemptive kernels the only source of +concurrency is the preemption of the current thread by an +interrupt. To prevent concurrency it is thus sufficient to disable +interrupts.

+

This is done with architecture specific instructions, but Linux offers +architecture independent APIs to disable and enable interrupts:

+
 #define local_irq_disable() \
+     asm volatile ("cli" : : : "memory")
+
+#define local_irq_enable() \
+    asm volatile ("sti" : : : "memory")
+
+#define local_irq_save(flags) \
+    asm volatile ("pushf ; pop %0" :"=g" (flags)
+                  : /* no input */: "memory") \
+    asm volatile("cli": : :"memory")
+
+#define local_irq_restore(flags) \
+    asm volatile ("push %0 ; popf"
+                  : /* no output */
+                  : "g" (flags) :"memory", "cc");
+
+
+

Although interrupts can be explicitly disabled and enabled with +local_irq_disable() and local_irq_enable(), these APIs +should only be used when the current state of interrupts is +known. They are usually used in core kernel code (like interrupt +handling).

+

For typical cases where we want to avoid interrupts due to concurrency +issues it is recommended to use the local_irq_save() and +local_irq_restore() variants. They take care of saving and +restoring the interrupts states so they can be freely called from +overlapping critical sections without the risk of accidentally +enabling interrupts while still in a critical section, as long as the +calls are balanced.

+
+
+

Spin Locks

+

Spin locks are used to serialize access to a critical section. They +are necessary on multi-core systems where we can have true execution +parallelism. This is a typical spin lock implementation:

+
spin_lock:
+    lock bts [my_lock], 0
+    jc spin_lock
+
+/* critical section */
+
+spin_unlock:
+    mov [my_lock], 0
+
+
+

bts dts, src - bit test and set; it copies the src bit from the dts +memory address to the carry flag and then sets it:

+
CF <- dts[src]
+dts[src] <- 1
+
+
+

As it can be seen, the spin lock uses an atomic instruction to make +sure that only one core can enter the critical section. If there are +multiple cores trying to enter they will continuously "spin" until the +lock is released.

+

While the spin lock avoids race conditions, it can have a significant +impact on the system's performance due to "lock contention":

+
    +
  • There is lock contention when at least one core spins trying to +enter the critical section lock
  • +
  • Lock contention grows with the critical section size, time spent +in the critical section and the number of cores in the system
  • +
+

Another negative side effect of spin locks is cache thrashing.

+

Cache thrashing occurs when multiple cores are trying to read and +write to the same memory resulting in excessive cache misses.

+

Since spin locks continuously access memory during lock contention, +cache thrashing is a common occurrence due to the way cache +coherency is implemented.

+
+
+

Cache coherency in multi-processor systems

+

The memory hierarchy in multi-processor systems is composed of local +CPU caches (L1 caches), shared CPU caches (L2 caches) and the main +memory. To explain cache coherency we will ignore the L2 cache and +only consider the L1 caches and main memory.

+

In the figure below we present a view of the memory hierarchy with two +variables A and B that fall into different cache lines and where +caches and the main memory are synchronized:

+

 

+../_images/ditaa-4d63c157487ff8291f2a6e93fe680ec38c1a3212.png +

In the absence of a synchronization mechanism between the caches and +main memory, when CPU 0 executes A = A + B and CPU 1 executes B = +A + B we will have the following memory view:

+

 

+../_images/ditaa-7ee0f9bb5f5af586e043afd47cfbad0adcc34888.png +

In order to avoid the situation above multi-processor systems use +cache coherency protocols. There are two main types of cache coherency +protocols:

+
    +
  • Bus snooping (sniffing) based: memory bus transactions are +monitored by caches and they take actions to preserve +coherency
  • +
  • Directory based: there is a separate entity (directory) that +maintains the state of caches; caches interact with directory +to preserve coherency
  • +
+

Bus snooping is simpler but it performs poorly when the number of +cores goes beyond 32-64.

+

Directory based cache coherence protocols scale much better (up +to thousands of cores) and are usually used in NUMA systems.

+

A simple cache coherency protocol that is commonly used in practice is +MESI (named after the acronym of the cache line states names: +Modified, Exclusive, Shared and Invalid). Its main +characteristics are:

+
    +
  • Caching policy: write back
  • +
  • Cache line states
      +
    • Modified: owned by a single core and dirty
    • +
    • Exclusive: owned by a single core and clean
    • +
    • Shared: shared between multiple cores and clean
    • +
    • Invalid : the line is not cached
    • +
    +
  • +
+

Issuing read or write requests from CPU cores will trigger state +transitions, as exemplified below:

+
    +
  • Invalid -> Exclusive: read request, all other cores have the line +in Invalid; line loaded from memory
  • +
  • Invalid -> Shared: read request, at least one core has the line +in Shared or Exclusive; line loaded from sibling cache
  • +
  • Invalid/Shared/Exclusive -> Modified: write request; all +other cores invalidate the line
  • +
  • Modified -> Invalid: write request from other core; line is +flushed to memory
  • +
+
+

Note

+

The most important characteristic of the MESI protocol is +that it is a write-invalidate cache protocol. When writing to a +shared location all other caches are invalidated.

+
+

This has important performance impact in certain access patterns, and +one such pattern is contention for a simple spin lock implementation +like we discussed above.

+

To exemplify this issue let's consider a system with three CPU cores, +where the first has acquired the spin lock and it is running the +critical section while the other two are spinning waiting to enter the +critical section:

+

 

+../_images/ditaa-b26d802c286bda6c559b4dcfa8a7fb27f840463e.png +

As it can be seen from the figure above due to the writes issued by +the cores spinning on the lock we see frequent cache line invalidate +operations which means that basically the two waiting cores will flush +and load the cache line while waiting for the lock, creating +unnecessary traffic on the memory bus and slowing down memory accesses +for the first core.

+

Another issue is that most likely data accessed by the first CPU +during the critical section is stored in the same cache line with the +lock (common optimization to have the data ready in the cache after +the lock is acquired). Which means that the cache invalidation +triggered by the two other spinning cores will slow down the execution +of the critical section which in turn triggers more cache invalidate +actions.

+
+
+

Optimized spin locks

+

As we have seen, simple spin lock implementations can have +performance issues due to cache thrashing, especially as the number of +cores increases. To avoid this issue there are two possible strategies:

+
    +
  • reduce the number of writes and thus reduce the number of cache +invalidate operations
  • +
  • avoid the other processors spinning on the same cache line, and thus +avoid the cache invalidate operations
  • +
+

An optimized spin lock implementation that uses the first approach is +presented below:

+

 

+
spin_lock:
+    rep ; nop
+    test lock_addr, 1
+    jnz spin_lock
+    lock bts lock_addr
+    jc spin_lock
+
+
+
    +
  • we first test the lock read only, using a non-atomic +instruction, to avoid writes and thus invalidate operations +while we spin
  • +
  • only when the lock might be free, we try to acquire it
  • +
+

The implementation also uses the PAUSE instruction to avoid +pipeline flushes due to (false positive) memory order violations and +to add a small delay (proportional to the memory bus frequency) to +reduce power consumption.

+

A similar implementation with support for fairness (the CPU cores are +allowed in the critical section based on the time of arrival) is used +in the Linux kernel (the ticket spin lock) +for many architectures.

+

However, for the x86 architecture, the current spin lock +implementation uses a queued spin lock where the CPU cores spin on +different locks (hopefully distributed in different cache lines) to +avoid cache invalidation operations:

+

 

+../_images/ditaa-58545831034f050660727be99cede213bc4a53c7.png +

Conceptually, when a new CPU core tries to acquire the lock and it +fails it will add its private lock to the list of waiting CPU +cores. When the lock owner exits the critical section it unlocks the +next lock in the list, if any.

+

While a read spin optimized spin lock reduces most of the cache +invalidation operations, the lock owner can still generate cache +invalidate operations due to writes to data structures close to the +lock and thus part of the same cache line. This in turn generates +memory traffic on subsequent reads on the spinning cores.

+

Hence, queued spin locks scale much better for large number of cores +as is the case for NUMA systems. And since they have similar fairness +properties as the ticket lock it is the preferred implementation on +the x86 architecture.

+
+
+

Process and Interrupt Context Synchronization

+

Accessing shared data from both process and interrupt context is a +relatively common scenario. On single core systems we can do this by +disabling interrupts, but that won't work on multi-core systems, +as we can have the process running on one CPU core and the interrupt +context running on a different CPU core.

+

Using a spin lock, which was designed for multi-processor systems, +seems like the right solution, but doing so can cause common +deadlock conditions, as detailed by the following scenario:

+
    +
  • In the process context we take the spin lock
  • +
  • An interrupt occurs and it is scheduled on the same CPU core
  • +
  • The interrupt handler runs and tries to take the spin lock
  • +
  • The current CPU will deadlock
  • +
+

To avoid this issue a two fold approach is used:

+
    +
  • In process context: disable interrupts and acquire a spin lock; +this will protect both against interrupt or other CPU cores race +conditions (spin_lock_irqsave() and +spin_unlock_irqrestore() combine the two operations)
  • +
  • In interrupt context: take a spin lock; this will protect +against race conditions with other interrupt handlers or process +context running on different processors
  • +
+

We have the same issue for other interrupt context handlers such as +softirqs, tasklets or timers and while disabling interrupts might +work, it is recommended to use dedicated APIs:

+
    +
  • In process context use spin_lock_bh() (which combines +local_bh_disable() and spin_lock()) and +spin_unlock_bh() (which combines spin_unlock() and +local_bh_enable())
  • +
  • In bottom half context use: spin_lock() and +spin_unlock() (or spin_lock_irqsave() and +spin_unlock_irqrestore() if sharing data with interrupt +handlers)
  • +
+

As mentioned before, another source of concurrency in the Linux kernel +can be other processes, due to preemption.

+

 

+

Preemption is configurable: when active it provides better latency +and response time, while when deactivated it provides better +throughput.

+

Preemption is disabled by spin locks and mutexes but it can be +manually disabled as well (by core kernel code).

+

As for local interrupt enabling and disabling APIs, the bottom half +and preemption APIs allow them to be used in overlapping critical +sections. A counter is used to track the state of bottom half and +preemption. In fact the same counter is used, with different increment +values:

+
#define PREEMPT_BITS      8
+#define SOFTIRQ_BITS      8
+#define HARDIRQ_BITS      4
+#define NMI_BITS          1
+
+#define preempt_disable() preempt_count_inc()
+
+#define local_bh_disable() add_preempt_count(SOFTIRQ_OFFSET)
+
+#define local_bh_enable() sub_preempt_count(SOFTIRQ_OFFSET)
+
+#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
+
+#define in_interrupt() irq_count()
+
+asmlinkage void do_softirq(void)
+{
+    if (in_interrupt()) return;
+    ...
+
+
+
+
+

Mutexes

+

Mutexes are used to protect against race conditions from other CPU +cores but they can only be used in process context. As opposed to +spin locks, while a thread is waiting to enter the critical section it +will not use CPU time, but instead it will be added to a waiting queue +until the critical section is vacated.

+

Since mutexes and spin locks usage intersect, it is useful to compare +the two:

+
    +
  • They don't "waste" CPU cycles; system throughput is better than +spin locks if context switch overhead is lower than medium +spinning time
  • +
  • They can't be used in interrupt context
  • +
  • They have a higher latency than spin locks
  • +
+

Conceptually, the mutex_lock() operation is relatively simple: +if the mutex is not acquired we can take the fast path via an atomic +exchange operation:

+
void __sched mutex_lock(struct mutex *lock)
+{
+  might_sleep();
+
+  if (!__mutex_trylock_fast(lock))
+    __mutex_lock_slowpath(lock);
+}
+
+static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
+{
+  unsigned long curr = (unsigned long)current;
+
+  if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+    return true;
+
+  return false;
+}
+
+
+

otherwise we take the slow path where we add ourselves to the mutex +waiting list and put ourselves to sleep:

+
...
+  spin_lock(&lock->wait_lock);
+...
+  /* add waiting tasks to the end of the waitqueue (FIFO): */
+  list_add_tail(&waiter.list, &lock->wait_list);
+...
+  waiter.task = current;
+...
+  for (;;) {
+    if (__mutex_trylock(lock))
+      goto acquired;
+  ...
+    spin_unlock(&lock->wait_lock);
+  ...
+    set_current_state(state);
+    spin_lock(&lock->wait_lock);
+  }
+  spin_lock(&lock->wait_lock);
+acquired:
+  __set_current_state(TASK_RUNNING);
+  mutex_remove_waiter(lock, &waiter, current);
+  spin_unlock(&lock->wait_lock);
+...
+
+
+

The full implementation is a bit more complex: instead of going to +sleep immediately it performs optimistic spinning if it detects that the lock +owner is currently running on a different CPU as chances are the owner +will release the lock soon. It also checks for signals and handles +mutex debugging for the locking dependency engine debug feature.

+

The mutex_unlock() operation is symmetric: if there are no +waiters on the mutex then we can take the fast path via an atomic exchange +operation:

+
void __sched mutex_unlock(struct mutex *lock)
+{
+  if (__mutex_unlock_fast(lock))
+    return;
+  __mutex_unlock_slowpath(lock, _RET_IP_);
+}
+
+static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
+{
+  unsigned long curr = (unsigned long)current;
+
+  if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
+    return true;
+
+  return false;
+}
+
+void __mutex_lock_slowpath(struct mutex *lock)
+{
+...
+  if (__mutex_waiter_is_first(lock, &waiter))
+          __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+...
+
+
+
+

Note

+

Because struct task_struct is cache aligned, the 7 +lower bits of the owner field can be used for various flags, +such as MUTEX_FLAG_WAITERS.

+
+

Otherwise we take the slow path where we pick up the first waiter from the +list and wake it up:

+
...
+spin_lock(&lock->wait_lock);
+if (!list_empty(&lock->wait_list)) {
+  /* get the first entry from the wait-list: */
+  struct mutex_waiter *waiter;
+  waiter = list_first_entry(&lock->wait_list, struct mutex_waiter,
+                            list);
+  next = waiter->task;
+  wake_q_add(&wake_q, next);
+}
+...
+spin_unlock(&lock->wait_lock);
+...
+wake_up_q(&wake_q);
+
+
+
+
+

Per CPU data

+

Per CPU data avoids race conditions by avoiding the use of shared +data. Instead, an array sized to the maximum possible CPU cores is +used and each core will use its own array entry to read and write +data. This approach certainly has advantages:

+
    +
  • No need to synchronize to access the data
  • +
  • No contention, no performance impact
  • +
  • Well suited for distributed processing where aggregation is only +seldom necessary (e.g. statistics counters)
  • +
+
+
+

Memory Ordering and Barriers

+

Modern processors and compilers employ out-of-order execution to +improve performance. For example, processors can execute "future" +instructions while waiting for current instruction data to be fetched +from memory.

+

Here is an example of out of order compiler generated code:

+ ++++ + + + + + + + + +
C codeCompiler generated code
a = 1;
+b = 2;
+
+
+
MOV R10, 1
+MOV R11, 2
+STORE R11, b
+STORE R10, a
+
+
+
+
+

Note

+

When executing instructions out of order the processor makes +sure that data dependency is observed, i.e. it won't execute +instructions whose input depend on the output of a previous +instruction that has not been executed.

+
+

In most cases out of order execution is not an issue. However, in +certain situations (e.g. communicating via shared memory between +processors or between processors and hardware) we must issue some +instructions before others even without data dependency between them.

+

For this purpose we can use barriers to order memory operations:

+
    +
  • A read barrier (rmb(), smp_rmb()) is used to +make sure that no read operation crosses the barrier; that is, +all read operations before the barrier are complete before +executing the first instruction after the barrier
  • +
  • A write barrier (wmb(), smp_wmb()) is used to +make sure that no write operation crosses the barrier
  • +
  • A simple barrier (mb(), smp_mb()) is used +to make sure that no write or read operation crosses the barrier
  • +
+
+
+

Read Copy Update (RCU)

+

Read Copy Update is a special synchronization mechanism similar to +read-write locks but with significant improvements over them (and some +limitations):

+
    +
  • Read-only lock-less access at the same time with write access
  • +
  • Write accesses still require locks in order to avoid races +between writers
  • +
  • Requires unidirectional traversal by readers
  • +
+

In fact, the read-write locks in the Linux kernel have been deprecated +and then removed, in favor of RCU.

+

Implementing RCU for a new data structure is difficult, but a few +common data structures (lists, queues, trees) do have RCU APIs that +can be used.

+

RCU splits removal updates to the data structures in two phases:

+
    +
  • Removal: removes references to elements. Some old readers may +still see the old reference so we can't free the element.
  • +
  • Reclamation: free the element. This action is postponed until +all existing readers finish traversal (quiescent cycle). New +readers won't affect the quiescent cycle.
  • +
+

As an example, let's take a look at how to delete an element from a +list using RCU:

+

 

+../_images/ditaa-5193a924360bebc83d2f81188cd0b0093ec01e6a.png +

In the first step it can be seen that while readers traverse the list +all elements are referenced. In step two a writer removes +element B. Reclamation is postponed since there are still readers that +hold references to it. In step three a quiescent cycle just expired +and it can be noticed that there are no more references to +element B. Other elements still have references from readers that +started the list traversal after the element was removed. In step 4 we +finally perform reclamation (free the element).

+

Now that we covered how RCU functions at the high level, let's look at +the APIs for traversing the list as well as adding and removing an +element to the list:

+
/* list traversal */
+rcu_read_lock();
+list_for_each_entry_rcu(i, head) {
+  /* no sleeping, blocking calls or context switch allowed */
+}
+rcu_read_unlock();
+
+
+/* list element delete  */
+spin_lock(&lock);
+list_del_rcu(&node->list);
+spin_unlock(&lock);
+synchronize_rcu();
+kfree(node);
+
+/* list element add  */
+spin_lock(&lock);
+list_add_rcu(&node->list, head);
+spin_unlock(&lock);
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/syscalls-slides.html b/refs/pull/405/merge/lectures/syscalls-slides.html new file mode 100644 index 00000000..2eab0d73 --- /dev/null +++ b/refs/pull/405/merge/lectures/syscalls-slides.html @@ -0,0 +1,530 @@ + + + + + + + + System Calls — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

System Calls

+ +
    +
  • Linux system calls implementation
  • +
  • VDSO and virtual syscalls
  • +
  • Accessing user space from system calls
  • +
+ + + + +
+
+ +

System Calls as Kernel services

+ +

 

+../_images/ditaa-e76e44cad2e92f2134ab77f6a09605b29524d039.png + + + + +
+
+ +

System Call Setup

+ +
    +
  • setup information to identify the system call and its parameters
  • +
  • trigger a kernel mode switch
  • +
  • retrieve the result of the system call
  • +
+ + + + +
+
+ +

Linux system call setup

+ +
    +
  • System calls are identified by numbers
  • +
  • The parameters for system calls are machine word sized (32 or 64 +bit) and they are limited to a maximum of 6
  • +
  • Uses registers to store them both (e.g. for 32bit x86: EAX for +system call and EBX, ECX, EDX, ESI, EDI, EBP for parameters)
  • +
+ + + + +
+
+ +

Example of Linux system call setup and handling

+ +../_images/ditaa-eeb919cd078d0ba5021028fa628bb47d7d6866e2.png + + + + +
+
+ +

Linux System Call Dispatcher

+ +
/* Handles int $0x80 */
+__visible void do_int80_syscall_32(struct pt_regs *regs)
+{
+    enter_from_user_mode();
+    local_irq_enable();
+    do_syscall_32_irqs_on(regs);
+}
+
+/* simplified version of the Linux x86 32bit System Call Dispatcher */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+{
+    unsigned int nr = regs->orig_ax;
+
+    if (nr < IA32_NR_syscalls)
+        regs->ax = ia32_sys_call_table[nr](regs->bx, regs->cx,
+                                           regs->dx, regs->si,
+                                           regs->di, regs->bp);
+    syscall_return_slowpath(regs);
+}
+
+
+ + + + +
+
+ +

Inspecting dup2 system call

+ +

 

+ + + + +
+
+ +

System Call Flow Summary

+ +
    +
  • The application is setting up the system call number and +parameters and it issues a trap instruction
  • +
  • The execution mode switches from user to kernel; the CPU switches +to a kernel stack; the user stack and the return address to user +space are saved on the kernel stack
  • +
  • The kernel entry point saves registers on the kernel stack
  • +
  • The system call dispatcher identifies the system call function +and runs it
  • +
  • The user space registers are restored and execution is switched +back to user (e.g. calling IRET)
  • +
  • The user space application resumes
  • +
+ + + + +
+
+ +

System Call Table

+ +
#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
+
+const sys_call_ptr_t ia32_sys_call_table[] = {
+  [0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
+  #include <asm/syscalls_32.h>
+};
+
+
+
__SYSCALL_I386(0, sys_restart_syscall)
+__SYSCALL_I386(1, sys_exit)
+__SYSCALL_I386(2, sys_fork)
+__SYSCALL_I386(3, sys_read)
+__SYSCALL_I386(4, sys_write)
+#ifdef CONFIG_X86_32
+__SYSCALL_I386(5, sys_open)
+#else
+__SYSCALL_I386(5, compat_sys_open)
+#endif
+__SYSCALL_I386(6, sys_close)
+
+
+ + + + +
+
+ +

System Calls Pointer Parameters

+ +
    +
  • Never allow pointers to kernel-space
  • +
  • Check for invalid pointers
  • +
+ + + + +
+
+ +

Pointers to Kernel Space

+ +
    +
  • User access to kernel data if allowed in a write system call
  • +
  • User corrupting kernel data if allowed in a read system call
  • +
+ + + + +
+
+ +

Invalid pointers handling approaches

+ +
    +
  • Check the pointer against the user address space before using it, +or
  • +
  • Avoid checking the pointer and rely on the MMU to detect when the +pointer is invalid and use the page fault handler to determine +that the pointer was invalid
  • +
+ + + + +
+
+ +

Page fault handling

+ +
+
    +
  • Copy on write, demand paging, swapping: both the fault and +faulting addresses are in user space; the fault address is +valid (checked against the user address space)
  • +
  • Invalid pointer used in system call: the faulting address is +in kernel space; the fault address is in user space and it is +invalid
  • +
  • Kernel bug (kernel accesses invalid pointer): same as above
  • +
+
+ + + + +
+
+ +

Marking kernel code that accesses user space

+ +
    +
  • The exact instructions that access user space are recorded in a +table (exception table)
  • +
  • When a page fault occurs the faulting address is checked against +this table
  • +
+ + + + +
+
+ +

Cost analysis for pointer checks vs fault handling

+ + +++++ + + + + + + + + + + + + + + + + +
CostPointer checksFault handling
Valid addressaddress space searchnegligible
Invalid addressaddress space searchexception table search
+ + + + +
+
+ +

Virtual Dynamic Shared Object (VDSO)

+ +
    +
  • a stream of instructions to issue the system call is generated by +the kernel in a special memory area (formatted as an ELF shared +object)
  • +
  • that memory area is mapped towards the end of the user address +space
  • +
  • libc searches for VDSO and if present will use it to issue the +system call
  • +
+ + + + +
+
+ +

Inspecting VDSO

+ +

 

+ + + + +
+
+ +

Virtual System Calls (vsyscalls)

+ +
    +
  • "System calls" that run directly from user space, part of the VDSO
  • +
  • Static data (e.g. getpid())
  • +
  • Dynamic data updated by the kernel in a RW map of the VDSO +(e.g. gettimeofday(), time())
  • +
+ + + + +
+
+ +

Accessing user space from system calls

+ +
/* OK: return -EFAULT if user_ptr is invalid */
+if (copy_from_user(&kernel_buffer, user_ptr, size))
+    return -EFAULT;
+
+/* NOK: only works if user_ptr is valid otherwise crashes kernel */
+memcpy(&kernel_buffer, user_ptr, size);
+
+
+ + + + +
+
+ +

get_user implementation

+ +
#define get_user(x, ptr)                                          \
+({                                                                \
+  int __ret_gu;                                                   \
+  register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX);            \
+  __chk_user_ptr(ptr);                                            \
+  might_fault();                                                  \
+  asm volatile("call __get_user_%P4"                              \
+               : "=a" (__ret_gu), "=r" (__val_gu),                \
+                  ASM_CALL_CONSTRAINT                             \
+               : "0" (ptr), "i" (sizeof(*(ptr))));                \
+  (x) = (__force __typeof__(*(ptr))) __val_gu;                    \
+  __builtin_expect(__ret_gu, 0);                                  \
+})
+
+
+ + + + +
+
+ +

get_user pseudo code

+ +
#define get_user(x, ptr)                \
+    movl ptr, %eax                      \
+    call __get_user_1                   \
+    movl %edx, x                        \
+    movl %eax, result                   \
+
+
+ + + + +
+
+ +

get_user_1 implementation

+ +
.text
+ENTRY(__get_user_1)
+    mov PER_CPU_VAR(current_task), %_ASM_DX
+    cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
+    jae bad_get_user
+    ASM_STAC
+1:  movzbl (%_ASM_AX),%edx
+    xor %eax,%eax
+    ASM_CLAC
+    ret
+ENDPROC(__get_user_1)
+
+bad_get_user:
+    xor %edx,%edx
+    mov $(-EFAULT),%_ASM_AX
+    ASM_CLAC
+    ret
+END(bad_get_user)
+
+_ASM_EXTABLE(1b,bad_get_user)
+
+
+ + + + +
+
+ +

Exception table entry

+ +
/* Exception table entry */
+# define _ASM_EXTABLE_HANDLE(from, to, handler)           \
+  .pushsection "__ex_table","a" ;                         \
+  .balign 4 ;                                             \
+  .long (from) - . ;                                      \
+  .long (to) - . ;                                        \
+  .long (handler) - . ;                                   \
+  .popsection
+
+# define _ASM_EXTABLE(from, to)                           \
+  _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+
+ + + + +
+
+ +

Exception table building

+ +
#define EXCEPTION_TABLE(align)                                    \
+  . = ALIGN(align);                                               \
+  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {               \
+          VMLINUX_SYMBOL(__start___ex_table) = .;                 \
+          KEEP(*(__ex_table))                                     \
+          VMLINUX_SYMBOL(__stop___ex_table) = .;                  \
+  }
+
+
+ + + + +
+
+ +

Exception table handling

+ +
bool ex_handler_default(const struct exception_table_entry *fixup,
+                        struct pt_regs *regs, int trapnr)
+{
+    regs->ip = ex_fixup_addr(fixup);
+    return true;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+    const struct exception_table_entry *e;
+    ex_handler_t handler;
+
+    e = search_exception_tables(regs->ip);
+    if (!e)
+        return 0;
+
+    handler = ex_fixup_handler(e);
+    return handler(e, regs, trapnr);
+}
+
+
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/syscalls.html b/refs/pull/405/merge/lectures/syscalls.html new file mode 100644 index 00000000..db0b1bac --- /dev/null +++ b/refs/pull/405/merge/lectures/syscalls.html @@ -0,0 +1,542 @@ + + + + + + System Calls — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

System Calls

+

View slides

+
+

Lecture objectives:

+
    +
  • Linux system calls implementation
  • +
  • VDSO and virtual syscalls
  • +
  • Accessing user space from system calls
  • +
+
+
+

Linux system calls implementation

+

At a high level system calls are "services" offered by the kernel to +user applications and they resemble library APIs in that they are +described as a function call with a name, parameters, and return value.

+

 

+../_images/ditaa-e76e44cad2e92f2134ab77f6a09605b29524d039.png +

However, on a closer look, we can see that system calls are actually +not function calls, but specific assembly instructions (architecture +and kernel specific) that do the following:

+
    +
  • setup information to identify the system call and its parameters
  • +
  • trigger a kernel mode switch
  • +
  • retrieve the result of the system call
  • +
+

In Linux, system calls are identified by numbers and the parameters +for system calls are machine word sized (32 or 64 bit). There can be a +maximum of 6 system call parameters. Both the system call number and +the parameters are stored in certain registers.

+

For example, on 32bit x86 architecture, the system call identifier is +stored in the EAX register, while parameters in registers EBX, ECX, +EDX, ESI, EDI, EBP.

+

System libraries (e.g. libc) offer functions that implement the +actual system calls in order to make it easier for applications to use +them.

+

When a user to kernel mode transition occurs, the execution flow is +interrupted and it is transferred to a kernel entry point. This is +similar to how interrupts and exceptions are handled (in fact on some +architectures this transition happens as a result of an exception).

+

The system call entry point will save registers (which contain values +from user space, including the system call number and system call +parameters) on the stack and then it will continue with executing the +system call dispatcher.

+
+

Note

+

During the user - kernel mode transition the stack is also +switched from the user stack to the kernel stack. This is +explained in more detail in the interrupts lecture.

+
+../_images/ditaa-eeb919cd078d0ba5021028fa628bb47d7d6866e2.png +

The purpose of the system call dispatcher is to verify the system call +number and run the kernel function associated with the system call.

+
/* Handles int $0x80 */
+__visible void do_int80_syscall_32(struct pt_regs *regs)
+{
+    enter_from_user_mode();
+    local_irq_enable();
+    do_syscall_32_irqs_on(regs);
+}
+
+/* simplified version of the Linux x86 32bit System Call Dispatcher */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+{
+    unsigned int nr = regs->orig_ax;
+
+    if (nr < IA32_NR_syscalls)
+        regs->ax = ia32_sys_call_table[nr](regs->bx, regs->cx,
+                                           regs->dx, regs->si,
+                                           regs->di, regs->bp);
+    syscall_return_slowpath(regs);
+}
+
+
+

To demonstrate the system call flow we are going to use the virtual +machine setup, attach gdb to a running kernel, add a breakpoint to the +dup2 system call and inspect the state.

+

 

+

In summary, this is what happens during a system call:

+
    +
  • The application is setting up the system call number and +parameters and it issues a trap instruction
  • +
  • The execution mode switches from user to kernel; the CPU switches +to a kernel stack; the user stack and the return address to user +space is saved on the kernel stack
  • +
  • The kernel entry point saves registers on the kernel stack
  • +
  • The system call dispatcher identifies the system call function +and runs it
  • +
  • The user space registers are restored and execution is switched +back to user (e.g. calling IRET)
  • +
  • The user space application resumes
  • +
+
+

System call table

+

The system call table is what the system call dispatcher uses to map +system call numbers to kernel functions:

+
#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
+
+const sys_call_ptr_t ia32_sys_call_table[] = {
+  [0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
+  #include <asm/syscalls_32.h>
+};
+
+
+
__SYSCALL_I386(0, sys_restart_syscall)
+__SYSCALL_I386(1, sys_exit)
+__SYSCALL_I386(2, sys_fork)
+__SYSCALL_I386(3, sys_read)
+__SYSCALL_I386(4, sys_write)
+#ifdef CONFIG_X86_32
+__SYSCALL_I386(5, sys_open)
+#else
+__SYSCALL_I386(5, compat_sys_open)
+#endif
+__SYSCALL_I386(6, sys_close)
+
+
+
+
+

System call parameters handling

+

Handling system call parameters is tricky. Since these values are +setup by user space, the kernel can not assume correctness and must +always verify them thoroughly.

+

Pointers have a few important special cases that must be checked:

+
    +
  • Never allow pointers to kernel-space
  • +
  • Check for invalid pointers
  • +
+

Since system calls are executed in kernel mode, they have access to +kernel space and if pointers are not properly checked user +applications might get read or write access to kernel space.

+

For example, let's consider the case where such a check is not made for +the read or write system calls. If the user passes a kernel-space +pointer to a write system call then it can get access to kernel data +by later reading the file. If it passes a kernel-space pointer to a +read system call then it can corrupt kernel memory.

+

Likewise, if a pointer passed by the application is invalid +(e.g. unmapped, read-only for cases where it is used for writing), it +could "crash" the kernel. Two approaches could be used:

+
    +
  • Check the pointer against the user address space before using it, +or
  • +
  • Avoid checking the pointer and rely on the MMU to detect when the +pointer is invalid and use the page fault handler to determine +that the pointer was invalid
  • +
+

Although it sounds tempting, the second approach is not that easy to +implement. The page fault handler uses the fault address (the address +that was accessed), the faulting address (the address of the +instruction that did the access) and information from the user address +space to determine the cause:

+
+
    +
  • Copy on write, demand paging, swapping: both the fault and +faulting addresses are in user space; the fault address is +valid (checked against the user address space)
  • +
  • Invalid pointer used in system call: the faulting address is +in kernel space; the fault address is in user space and it is +invalid
  • +
  • Kernel bug (kernel accesses invalid pointer): same as above
  • +
+
+

But in the last two cases we don't have enough information to +determine the cause of the fault.

+

In order to solve this issue, Linux uses specially crafted APIs (e.g. +copy_to_user()) to access user space:

+
    +
  • The exact instructions that access user space are recorded in a +table (exception table)
  • +
  • When a page fault occurs the faulting address is checked against +this table
  • +
+

Although the fault handling case may be more costly overall depending +on the address space vs exception table size, and it is more complex, +it is optimized for the common case and that is why it is preferred +and used in Linux.

+ +++++ + + + + + + + + + + + + + + + + +
CostPointer checksFault handling
Valid addressaddress space searchnegligible
Invalid addressaddress space searchexception table search
+
+
+
+

Virtual Dynamic Shared Object (VDSO)

+

The VDSO mechanism was born out of the necessity of optimizing the +system call implementation, in a way that does not impact libc with +having to track the CPU capabilities in conjunction with the kernel +version.

+

For example, x86 has two ways of issuing system calls: int 0x80 and +sysenter. The latter is significantly faster so it should be used when +available. However, it is only available for processors newer than +Pentium II and only for kernel versions greater than 2.6.

+

With VDSO the system call interface is decided by the kernel:

+
    +
  • a stream of instructions to issue the system call is generated by +the kernel in a special memory area (formatted as an ELF shared +object)
  • +
  • that memory area is mapped towards the end of the user address +space
  • +
  • libc searches for VDSO and if present will use it to issue the +system call
  • +
+

 

+

An interesting development of the VDSO is the virtual system calls +(vsyscalls) which run directly from user space. These vsyscalls are +also part of VDSO and they are accessing data from the VDSO page that +is either static or modified by the kernel in a separate read-write +map of the VDSO page. Examples of system calls that can be implemented +as vsyscalls are: getpid or gettimeofday.

+
    +
  • "System calls" that run directly from user space, part of the VDSO
  • +
  • Static data (e.g. getpid())
  • +
  • Dynamic data updated by the kernel in a RW map of the VDSO +(e.g. gettimeofday(), time())
  • +
+
+
+

Accessing user space from system calls

+

As we mentioned earlier, user space must be accessed with special APIs +(get_user(), put_user(), copy_from_user(), +copy_to_user()) that check whether the pointer is in user space +and also handle the fault if the pointer is invalid. In case of invalid +pointers, they return a non-zero value.

+
/* OK: return -EFAULT if user_ptr is invalid */
+if (copy_from_user(&kernel_buffer, user_ptr, size))
+    return -EFAULT;
+
+/* NOK: only works if user_ptr is valid otherwise crashes kernel */
+memcpy(&kernel_buffer, user_ptr, size);
+
+
+

Let's examine the simplest API, get_user, as implemented for x86:

+
#define get_user(x, ptr)                                          \
+({                                                                \
+  int __ret_gu;                                                   \
+  register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX);            \
+  __chk_user_ptr(ptr);                                            \
+  might_fault();                                                  \
+  asm volatile("call __get_user_%P4"                              \
+               : "=a" (__ret_gu), "=r" (__val_gu),                \
+                  ASM_CALL_CONSTRAINT                             \
+               : "0" (ptr), "i" (sizeof(*(ptr))));                \
+  (x) = (__force __typeof__(*(ptr))) __val_gu;                    \
+  __builtin_expect(__ret_gu, 0);                                  \
+})
+
+
+

The implementation uses inline assembly, which allows inserting ASM +sequences in C code and also handles access to/from variables in the +ASM code.

+

Based on the size of the type that ptr points to, one of __get_user_1, +__get_user_2 or __get_user_4 will be called. Also, before executing +the assembly call, ptr will be moved to the first register EAX, while +after the completion of the assembly part the value of EAX will be moved +to __ret_gu and the value of the EDX register will be moved to __val_gu.

+

It is equivalent to the following pseudo code:

+
#define get_user(x, ptr)                \
+    movl ptr, %eax                      \
+    call __get_user_1                   \
+    movl %edx, x                        \
+    movl %eax, result                   \
+
+
+

The __get_user_1 implementation for x86 is the following:

+
.text
+ENTRY(__get_user_1)
+    mov PER_CPU_VAR(current_task), %_ASM_DX
+    cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
+    jae bad_get_user
+    ASM_STAC
+1:  movzbl (%_ASM_AX),%edx
+    xor %eax,%eax
+    ASM_CLAC
+    ret
+ENDPROC(__get_user_1)
+
+bad_get_user:
+    xor %edx,%edx
+    mov $(-EFAULT),%_ASM_AX
+    ASM_CLAC
+    ret
+END(bad_get_user)
+
+_ASM_EXTABLE(1b,bad_get_user)
+
+
+

The first two statements check the pointer (which is stored in EAX) +against the addr_limit field of the current task (process) descriptor to +make sure that we don't have a pointer to kernel space.

+

Then, SMAP is disabled, to allow access to user space from the kernel, and the +access to user space is done with the instruction at the 1: label. EAX +is then zeroed to mark success, SMAP is re-enabled, and the call returns.

+

The movzbl instruction is the one that does the access to user space +and its address is captured with the 1: label and stored in a special +section:

+
/* Exception table entry */
+# define _ASM_EXTABLE_HANDLE(from, to, handler)           \
+  .pushsection "__ex_table","a" ;                         \
+  .balign 4 ;                                             \
+  .long (from) - . ;                                      \
+  .long (to) - . ;                                        \
+  .long (handler) - . ;                                   \
+  .popsection
+
+# define _ASM_EXTABLE(from, to)                           \
+  _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+
+

For each instruction that accesses user space we have an entry in the +exception table, that is made up of: the faulting address (from), where +to jump to in case of a fault (to), and a handler function (that implements +the jump logic). All of these addresses are stored as 32-bit values +relative to their position in the exception table, so that they work for both 32 +and 64 bit kernels.

+

All of the exception table entries are then collected in the +__ex_table section by the linker script:

+
#define EXCEPTION_TABLE(align)                                    \
+  . = ALIGN(align);                                               \
+  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {               \
+          VMLINUX_SYMBOL(__start___ex_table) = .;                 \
+          KEEP(*(__ex_table))                                     \
+          VMLINUX_SYMBOL(__stop___ex_table) = .;                  \
+  }
+
+
+

The section is guarded with __start___ex_table and __stop___ex_table +symbols, so that it is easy to find the data from C code. This table +is accessed by the fault handler:

+
bool ex_handler_default(const struct exception_table_entry *fixup,
+                        struct pt_regs *regs, int trapnr)
+{
+    regs->ip = ex_fixup_addr(fixup);
+    return true;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+    const struct exception_table_entry *e;
+    ex_handler_t handler;
+
+    e = search_exception_tables(regs->ip);
+    if (!e)
+        return 0;
+
+    handler = ex_fixup_handler(e);
+    return handler(e, regs, trapnr);
+}
+
+
+

All it does is to set the return address to the one in the to field of +the exception table entry which, in case of the get_user exception +table entry, is bad_get_user, which returns -EFAULT to the caller.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/virt-slides.html b/refs/pull/405/merge/lectures/virt-slides.html new file mode 100644 index 00000000..74b74990 --- /dev/null +++ b/refs/pull/405/merge/lectures/virt-slides.html @@ -0,0 +1,699 @@ + + + + + + + + Virtualization — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

Virtualization

+ +
    +
  • Emulation basics
  • +
  • Virtualization basics
  • +
  • Paravirtualization basics
  • +
  • Hardware support for virtualization
  • +
  • Overview of the Xen hypervisor
  • +
  • Overview of the KVM hypervisor
  • +
+ + + + +
+
+ +

Emulation basics

+ +
    +
  • Instructions are emulated (each time they are executed)
  • +
  • The other system components are also emulated:
      +
    • MMU
    • +
    • Physical memory access
    • +
    • Peripherals
    • +
    +
  • +
  • Target architecture - the architecture that is emulated
  • +
  • Host architecture - the architecture that the emulator runs on
  • +
  • For emulation, the target and host architectures can be different
  • +
+ + + + +
+
+ +

Virtualization basics

+ +
    +
  • Defined in a paper by Popek & Goldberg in 1974
  • +
  • Fidelity
  • +
  • Performance
  • +
  • Security
  • +
+../_images/ditaa-91f08f7db4b54069e16694eab8d75c06400fc47b.png + + + + +
+
+ +

Classic virtualization

+ +
    +
  • Trap & Emulate
  • +
  • Same architecture for host and target
  • +
  • Most of the target instructions are natively executed
  • +
  • Target OS runs in non-privileged mode on the host
  • +
  • Privileged instructions are trapped and emulated
  • +
  • Two machine states: host and guest
  • +
+ + + + +
+
+ +

Software virtualization

+ +
    +
  • Not all architectures can be virtualized; e.g. x86:
      +
    • CS register encodes the CPL
    • +
    • Some instructions don't generate a trap (e.g. popf)
    • +
    +
  • +
  • Solution: emulate instructions using binary translation
  • +
+ + + + +
+
+ +

MMU virtualization

+ +
    +
  • "Fake" VM physical addresses are translated by the host to actual +physical addresses
  • +
  • Guest virtual address -> Guest physical address -> Host Physical Address
  • +
  • The guest page tables are not directly used by the host hardware
  • +
  • VM page tables are verified then translated into a new set of page +tables on the host (shadow page tables)
  • +
+ + + + +
+
+ +

Shadow page tables

+ +

 

+../_images/ditaa-8632e22c6d89bd18f97c9cef127444486b5077df.png + + + + +
+
+ +

Lazy shadow sync

+ +
    +
  • Guest page table changes are typically batched
  • +
  • To avoid repeated traps, checks and transformations map guest +page table entries with write access
  • +
  • Update the shadow page table when
      +
    • The TLB is flushed
    • +
    • In the host page fault handler
    • +
    +
  • +
+ + + + +
+
+ +

I/O emulation

+ +

 

+../_images/ditaa-bb69666d75b9670e542682753fb8cc9b77ff8894.png + + + + +
+
+ +

Example: qemu SiFive UART emulation

+ +
/*
+ * QEMU model of the UART on the SiFive E300 and U500 series SOCs.
+ *
+ * Copyright (c) 2016 Stefan O'Rear
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/log.h"
+#include "chardev/char.h"
+#include "chardev/char-fe.h"
+#include "hw/irq.h"
+#include "hw/char/sifive_uart.h"
+
+/*
+ * Not yet implemented:
+ *
+ * Transmit FIFO using "qemu/fifo8.h"
+ */
+
+/* Returns the state of the IP (interrupt pending) register */
+static uint64_t uart_ip(SiFiveUARTState *s)
+{
+    uint64_t ret = 0;
+
+    uint64_t txcnt = SIFIVE_UART_GET_TXCNT(s->txctrl);
+    uint64_t rxcnt = SIFIVE_UART_GET_RXCNT(s->rxctrl);
+
+    if (txcnt != 0) {
+        ret |= SIFIVE_UART_IP_TXWM;
+    }
+    if (s->rx_fifo_len > rxcnt) {
+        ret |= SIFIVE_UART_IP_RXWM;
+    }
+
+    return ret;
+}
+
+static void update_irq(SiFiveUARTState *s)
+{
+    int cond = 0;
+    if ((s->ie & SIFIVE_UART_IE_TXWM) ||
+        ((s->ie & SIFIVE_UART_IE_RXWM) && s->rx_fifo_len)) {
+        cond = 1;
+    }
+    if (cond) {
+        qemu_irq_raise(s->irq);
+    } else {
+        qemu_irq_lower(s->irq);
+    }
+}
+
+static uint64_t
+uart_read(void *opaque, hwaddr addr, unsigned int size)
+{
+    SiFiveUARTState *s = opaque;
+    unsigned char r;
+    switch (addr) {
+    case SIFIVE_UART_RXFIFO:
+        if (s->rx_fifo_len) {
+            r = s->rx_fifo[0];
+            memmove(s->rx_fifo, s->rx_fifo + 1, s->rx_fifo_len - 1);
+            s->rx_fifo_len--;
+            qemu_chr_fe_accept_input(&s->chr);
+            update_irq(s);
+            return r;
+        }
+        return 0x80000000;
+
+    case SIFIVE_UART_TXFIFO:
+        return 0; /* Should check tx fifo */
+    case SIFIVE_UART_IE:
+        return s->ie;
+    case SIFIVE_UART_IP:
+        return uart_ip(s);
+    case SIFIVE_UART_TXCTRL:
+        return s->txctrl;
+    case SIFIVE_UART_RXCTRL:
+        return s->rxctrl;
+    case SIFIVE_UART_DIV:
+        return s->div;
+    }
+
+    qemu_log_mask(LOG_GUEST_ERROR, "%s: bad read: addr=0x%x\n",
+                  __func__, (int)addr);
+    return 0;
+}
+
+static void
+uart_write(void *opaque, hwaddr addr,
+           uint64_t val64, unsigned int size)
+{
+    SiFiveUARTState *s = opaque;
+    uint32_t value = val64;
+    unsigned char ch = value;
+
+    switch (addr) {
+    case SIFIVE_UART_TXFIFO:
+        qemu_chr_fe_write(&s->chr, &ch, 1);
+        update_irq(s);
+        return;
+    case SIFIVE_UART_IE:
+        s->ie = val64;
+        update_irq(s);
+        return;
+    case SIFIVE_UART_TXCTRL:
+        s->txctrl = val64;
+        return;
+    case SIFIVE_UART_RXCTRL:
+        s->rxctrl = val64;
+        return;
+    case SIFIVE_UART_DIV:
+        s->div = val64;
+        return;
+    }
+    qemu_log_mask(LOG_GUEST_ERROR, "%s: bad write: addr=0x%x v=0x%x\n",
+                  __func__, (int)addr, (int)value);
+}
+
+static const MemoryRegionOps uart_ops = {
+    .read = uart_read,
+    .write = uart_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4
+    }
+};
+
+static void uart_rx(void *opaque, const uint8_t *buf, int size)
+{
+    SiFiveUARTState *s = opaque;
+
+    /* Got a byte.  */
+    if (s->rx_fifo_len >= sizeof(s->rx_fifo)) {
+        printf("WARNING: UART dropped char.\n");
+        return;
+    }
+    s->rx_fifo[s->rx_fifo_len++] = *buf;
+
+    update_irq(s);
+}
+
+static int uart_can_rx(void *opaque)
+{
+    SiFiveUARTState *s = opaque;
+
+    return s->rx_fifo_len < sizeof(s->rx_fifo);
+}
+
+static void uart_event(void *opaque, QEMUChrEvent event)
+{
+}
+
+static int uart_be_change(void *opaque)
+{
+    SiFiveUARTState *s = opaque;
+
+    qemu_chr_fe_set_handlers(&s->chr, uart_can_rx, uart_rx, uart_event,
+        uart_be_change, s, NULL, true);
+
+    return 0;
+}
+
+/*
+ * Create UART device.
+ */
+SiFiveUARTState *sifive_uart_create(MemoryRegion *address_space, hwaddr base,
+    Chardev *chr, qemu_irq irq)
+{
+    SiFiveUARTState *s = g_malloc0(sizeof(SiFiveUARTState));
+    s->irq = irq;
+    qemu_chr_fe_init(&s->chr, chr, &error_abort);
+    qemu_chr_fe_set_handlers(&s->chr, uart_can_rx, uart_rx, uart_event,
+        uart_be_change, s, NULL, true);
+    memory_region_init_io(&s->mmio, NULL, &uart_ops, s,
+                          TYPE_SIFIVE_UART, SIFIVE_UART_MAX);
+    memory_region_add_subregion(address_space, base, &s->mmio);
+    return s;
+}
+
+
+ + + + +
+
+ +

Paravirtualization

+ +
    +
  • Change the guest OS so that it cooperates with the VMM
      +
    • CPU paravirtualization
    • +
    • MMU paravirtualization
    • +
    • I/O paravirtualization
    • +
    +
  • +
  • VMM exposes hypercalls for:
      +
    • activate / deactivate the interrupts
    • +
    • changing page tables
    • +
    • accessing virtualized peripherals
    • +
    +
  • +
  • VMM uses events to trigger interrupts in the VM
  • +
+ + + + +
+
+ +

Intel VT-x

+ +
    +
  • Hardware extension to transform x86 to the point it can be +virtualized "classically"
  • +
  • New execution mode: non-root mode
  • +
  • Each non-root mode instance uses a Virtual Machine Control +Structure (VMCS) to store its state
  • +
  • VMM runs in root mode
  • +
  • VM-entry and VM-exit are used to transition between the two modes
  • +
+ + + + +
+
+ +

Virtual Machine Control Structure

+ +
    +
  • Guest information: state of the virtual CPU
  • +
  • Host information: state of the physical CPU
  • +
  • Saved information:
      +
    • visible state: segment registers, CR3, IDTR, etc.
    • +
    • internal state
    • +
    +
  • +
  • VMCS can not be accessed directly but certain information can be +accessed with special instructions
  • +
+ + + + +
+
+ +

VM entry & exit

+ +
    +
  • VM entry - new instructions that switch the CPU to non-root +mode and load the VM state from a VMCS; host state is saved in +VMCS
  • +
  • Allows injecting interrupts and exceptions in the guest
  • +
  • VM exit will be automatically triggered based on the VMCS +configuration
  • +
  • When VM exit occurs host state is loaded from VMCS, guest state +is saved in VMCS
  • +
+ + + + +
+
+ +

VM execution control fields

+ +
    +
  • Selects conditions which trigger a VM exit; examples:
      +
    • If an external interrupt is generated
    • +
    • If an external interrupt is generated and EFLAGS.IF is set
    • +
    • If CR0-CR4 registers are modified
    • +
    +
  • +
  • Exception bitmap - selects which exceptions will generate a VM +exit
  • +
  • IO bitmap - selects which I/O addresses (IN/OUT accesses) +generate a VM exit
  • +
  • MSR bitmaps - selects which RDMSR or WRMSR instructions will +generate a VM exit
  • +
+ + + + +
+
+ +

Extended Page Tables

+ +
    +
  • Reduces the complexity of MMU virtualization and improves +performance
  • +
  • Access to CR3, INVLPG and page faults do not require VM exit +anymore
  • +
  • The EPT page table is controlled by the VMM
  • +
+../_images/ditaa-cc9a2e995be74ee99646ea4bf0e551d766fa92ef.png + + + + +
+
+ +

VPID

+ +
    +
  • VM entry and VM exit force a TLB flush - losing VMM / VM translations
  • +
  • To avoid this issue a VPID (Virtual Processor ID) tag is +associated with each VM (VPID 0 is reserved for the VMM)
  • +
  • All TLB entries are tagged
  • +
  • At VM entry and exit just the entries associated with the tags +are flushed
  • +
  • When searching the TLB just the current VPID is used
  • +
+ + + + +
+
+ +

I/O virtualization

+ +../_images/ditaa-3901edd823cdc7a6f429ebc37cbc541e650abc96.png + + + + +
+
+ +

I/O MMU

+ +

VT-d protects and translates VM physical addresses using an I/O +MMU (DMA remapping)

+../_images/ditaa-d880751969de8642b2613caaca345d71acea4500.png + + + + +
+
+ +

Interrupt posting

+ +
    +
  • Message Signaled Interrupts (MSI) = DMA writes to the host +address range of the IRQ controller (e.g. 0xFEExxxxx)
  • +
  • Low bits of the address and the data indicate which interrupt +vector to deliver to which CPU
  • +
  • Interrupt remapping table points to the virtual CPU (VMCS) that +should receive the interrupt
  • +
  • I/O MMU will trap the IRQ controller write and look it up in the +interrupt remapping table
      +
    • if that virtual CPU is currently running it will take the +interrupt directly
    • +
    • otherwise a bit is set in a table (Posted Interrupt Descriptor +table) and the interrupt will be injected the next time that vCPU is +run
    • +
    +
  • +
+ + + + +
+
+ +

I/O virtualization

+ +../_images/ditaa-2cb0eb0056bb775d1446843d62241fd660662c96.png + + + + +
+
+ +

SR-IOV

+ +
    +
  • Single Root - Input Output Virtualization
  • +
  • Physical device with multiple Ethernet ports will be shown as +multiple devices on the PCI bus
  • +
  • Physical Function is used for the control and can be configured
      +
    • to present itself as a new PCI device
    • +
    • which VLAN to use
    • +
    +
  • +
  • The new virtual function is enumerated on the bus and can be +assigned to a particular guest
  • +
+ + + + +
+
+ +

qemu

+ +
    +
  • Uses binary translation via Tiny Code Generator (TCG) for +efficient emulation
  • +
  • Supports different target and host architectures (e.g. running +ARM VMs on x86)
  • +
  • Both process and full system level emulation
  • +
  • MMU emulation
  • +
  • I/O emulation
  • +
  • Can be used with KVM for accelerated virtualization
  • +
+ + + + +
+
+ +

KVM

+ +../_images/ditaa-f8fcc760ef5dad50d1038ed3426d0fcce12fd3e6.png + + + + +
+
+ +

KVM

+ +
    +
  • Linux device driver for hardware virtualization (e.g. Intel VT-x, SVM)
  • +
  • IOCTL based interface for managing and running virtual CPUs
  • +
  • VMM components implemented inside the Linux kernel +(e.g. interrupt controller, timers)
  • +
  • Shadow page tables or EPT if present
  • +
  • Uses qemu or virtio for I/O virtualization
  • +
+ + + + +
+
+ +

Xen

+ +
    +
  • Type 1 = Bare Metal Hypervisor
  • +
  • Type 2 = Hypervisor embedded in an existing kernel / OS
  • +
+ + + + +
+
+ +

Xen

+ +../_images/xen-overview.png + + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/lectures/virt.html b/refs/pull/405/merge/lectures/virt.html new file mode 100644 index 00000000..6aeacce8 --- /dev/null +++ b/refs/pull/405/merge/lectures/virt.html @@ -0,0 +1,661 @@ + + + + + + Virtualization — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Virtualization

+

View slides

+
+

Lecture objectives:

+
    +
  • Emulation basics
  • +
  • Virtualization basics
  • +
  • Paravirtualization basics
  • +
  • Hardware support for virtualization
  • +
  • Overview of the Xen hypervisor
  • +
  • Overview of the KVM hypervisor
  • +
+
+
+

Emulation basics

+
    +
  • Instructions are emulated (each time they are executed)
  • +
  • The other system components are also emulated:
      +
    • MMU
    • +
    • Physical memory access
    • +
    • Peripherals
    • +
    +
  • +
  • Target architecture - the architecture that is emulated
  • +
  • Host architecture - the architecture that the emulator runs on
  • +
  • For emulation target and host architectures can be different
  • +
+
+
+

Virtualization basics

+
    +
  • Defined in a paper by Popek & Goldberg in 1974
  • +
  • Fidelity
  • +
  • Performance
  • +
  • Security
  • +
+../_images/ditaa-91f08f7db4b54069e16694eab8d75c06400fc47b.png +
+
+

Classic virtualization

+
    +
  • Trap & Emulate
  • +
  • Same architecture for host and target
  • +
  • Most of the target instructions are natively executed
  • +
  • Target OS runs in non-privileged mode on the host
  • +
  • Privileged instructions are trapped and emulated
  • +
  • Two machine states: host and guest
  • +
+
+
+

Software virtualization

+
    +
  • Not all architectures can be virtualized; e.g. x86:
      +
    • CS register encodes the CPL
    • +
    • Some instructions don't generate a trap (e.g. popf)
    • +
    +
  • +
  • Solution: emulate instructions using binary translation
  • +
+
+
+

MMU virtualization

+
    +
  • "Fake" VM physical addresses are translated by the host to actual +physical addresses
  • +
  • Guest virtual address -> Guest physical address -> Host Physical Address
  • +
  • The guest page tables are not directly used by the host hardware
  • +
  • VM page tables are verified then translated into a new set of page +tables on the host (shadow page tables)
  • +
+
+

Shadow page tables

+

 

+../_images/ditaa-8632e22c6d89bd18f97c9cef127444486b5077df.png +
+
+

Lazy shadow sync

+
    +
  • Guest page table changes are typically batched
  • +
  • To avoid repeated traps, checks and transformations map guest +page table entries with write access
  • +
  • Update the shadow page table when
      +
    • The TLB is flushed
    • +
    • In the host page fault handler
    • +
    +
  • +
+
+
+
+

I/O emulation

+

 

+../_images/ditaa-bb69666d75b9670e542682753fb8cc9b77ff8894.png +
/*
+ * QEMU model of the UART on the SiFive E300 and U500 series SOCs.
+ *
+ * Copyright (c) 2016 Stefan O'Rear
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/log.h"
+#include "chardev/char.h"
+#include "chardev/char-fe.h"
+#include "hw/irq.h"
+#include "hw/char/sifive_uart.h"
+
+/*
+ * Not yet implemented:
+ *
+ * Transmit FIFO using "qemu/fifo8.h"
+ */
+
+/*
+ * Compute the value of the IP (interrupt pending) register.
+ *
+ * TXWM is reported when the TXCNT field extracted from txctrl is non-zero;
+ * RXWM is reported when rx_fifo_len exceeds the RXCNT field extracted from
+ * rxctrl. All other bits of the result are zero.
+ */
+static uint64_t uart_ip(SiFiveUARTState *s)
+{
+    const uint64_t txcnt_field = SIFIVE_UART_GET_TXCNT(s->txctrl);
+    const uint64_t rxcnt_field = SIFIVE_UART_GET_RXCNT(s->rxctrl);
+    const uint64_t tx_pending = (txcnt_field != 0) ? SIFIVE_UART_IP_TXWM : 0;
+    const uint64_t rx_pending =
+        (s->rx_fifo_len > rxcnt_field) ? SIFIVE_UART_IP_RXWM : 0;
+
+    return tx_pending | rx_pending;
+}
+
+/*
+ * Re-evaluate the interrupt line from the enable register and RX FIFO fill.
+ *
+ * The line is raised while the TXWM enable bit is set, or while the RXWM
+ * enable bit is set and the receive FIFO is non-empty; otherwise it is
+ * lowered.
+ *
+ * NOTE(review): unlike uart_ip(), neither term consults the TXCNT/RXCNT
+ * fields of txctrl/rxctrl - confirm this difference is intended.
+ */
+static void update_irq(SiFiveUARTState *s)
+{
+    const int tx_wanted = (s->ie & SIFIVE_UART_IE_TXWM) != 0;
+    const int rx_wanted = (s->ie & SIFIVE_UART_IE_RXWM) != 0 &&
+                          s->rx_fifo_len != 0;
+
+    if (tx_wanted || rx_wanted) {
+        qemu_irq_raise(s->irq);
+    } else {
+        qemu_irq_lower(s->irq);
+    }
+}
+
+/*
+ * MMIO read handler for the UART register window.
+ *
+ * opaque is the SiFiveUARTState, addr is the register offset being read;
+ * size is not consulted.
+ *
+ * RXFIFO pops the oldest byte of the receive FIFO, tells the chardev front
+ * end that more input can be accepted and re-evaluates the IRQ; with an
+ * empty FIFO it returns 0x80000000 instead. TXFIFO always reads as 0. IE,
+ * IP, TXCTRL, RXCTRL and DIV return the corresponding state. Any other
+ * offset is logged as a guest error and reads as 0.
+ */
+static uint64_t
+uart_read(void *opaque, hwaddr addr, unsigned int size)
+{
+    SiFiveUARTState *uart = opaque;
+
+    switch (addr) {
+    case SIFIVE_UART_RXFIFO: {
+        unsigned char head;
+
+        if (!uart->rx_fifo_len) {
+            return 0x80000000;
+        }
+        /* Take the head byte, then slide the remaining bytes down by one. */
+        head = uart->rx_fifo[0];
+        uart->rx_fifo_len--;
+        memmove(uart->rx_fifo, uart->rx_fifo + 1, uart->rx_fifo_len);
+        qemu_chr_fe_accept_input(&uart->chr);
+        update_irq(uart);
+        return head;
+    }
+    case SIFIVE_UART_TXFIFO:
+        return 0; /* Should check tx fifo */
+    case SIFIVE_UART_IE:
+        return uart->ie;
+    case SIFIVE_UART_IP:
+        return uart_ip(uart);
+    case SIFIVE_UART_TXCTRL:
+        return uart->txctrl;
+    case SIFIVE_UART_RXCTRL:
+        return uart->rxctrl;
+    case SIFIVE_UART_DIV:
+        return uart->div;
+    default:
+        break;
+    }
+
+    qemu_log_mask(LOG_GUEST_ERROR, "%s: bad read: addr=0x%x\n",
+                  __func__, (int)addr);
+    return 0;
+}
+
+/*
+ * MMIO write handler for the UART register window.
+ *
+ * opaque is the SiFiveUARTState, addr is the register offset being written
+ * and val64 the value; size is not consulted.
+ *
+ * TXFIFO hands the value, truncated to an unsigned char, to the chardev
+ * front end and re-evaluates the IRQ. IE stores the value and re-evaluates
+ * the IRQ. TXCTRL, RXCTRL and DIV just store the value. Any other offset is
+ * logged as a guest error and otherwise ignored.
+ */
+static void
+uart_write(void *opaque, hwaddr addr,
+           uint64_t val64, unsigned int size)
+{
+    SiFiveUARTState *uart = opaque;
+    uint32_t value = val64;
+
+    switch (addr) {
+    case SIFIVE_UART_TXFIFO: {
+        unsigned char byte = value;
+
+        qemu_chr_fe_write(&uart->chr, &byte, 1);
+        update_irq(uart);
+        break;
+    }
+    case SIFIVE_UART_IE:
+        uart->ie = val64;
+        update_irq(uart);
+        break;
+    case SIFIVE_UART_TXCTRL:
+        uart->txctrl = val64;
+        break;
+    case SIFIVE_UART_RXCTRL:
+        uart->rxctrl = val64;
+        break;
+    case SIFIVE_UART_DIV:
+        uart->div = val64;
+        break;
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: bad write: addr=0x%x v=0x%x\n",
+                      __func__, (int)addr, (int)value);
+        break;
+    }
+}
+
+/*
+ * MMIO access callbacks for the UART register window: reads and writes are
+ * routed to uart_read()/uart_write(), the region is declared
+ * DEVICE_NATIVE_ENDIAN, and only 4-byte accesses are declared valid.
+ */
+static const MemoryRegionOps uart_ops = {
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .write = uart_write,
+    .read = uart_read,
+    .valid.min_access_size = 4,
+    .valid.max_access_size = 4,
+};
+
+/*
+ * Receive handler registered through qemu_chr_fe_set_handlers().
+ *
+ * Appends buf[0] to the receive FIFO and re-evaluates the IRQ. Only the
+ * first byte of buf is consumed; size is not consulted. When rx_fifo_len has
+ * already reached sizeof(rx_fifo) the byte is dropped, a warning is printed
+ * to stdout and the IRQ is left untouched.
+ */
+static void uart_rx(void *opaque, const uint8_t *buf, int size)
+{
+    SiFiveUARTState *uart = opaque;
+
+    if (uart->rx_fifo_len < sizeof(uart->rx_fifo)) {
+        uart->rx_fifo[uart->rx_fifo_len] = buf[0];
+        uart->rx_fifo_len++;
+        update_irq(uart);
+    } else {
+        printf("WARNING: UART dropped char.\n");
+    }
+}
+
+/*
+ * Report whether the receive FIFO has room: returns 1 while rx_fifo_len is
+ * below sizeof(rx_fifo), 0 otherwise.
+ */
+static int uart_can_rx(void *opaque)
+{
+    const SiFiveUARTState *uart = opaque;
+
+    return (uart->rx_fifo_len < sizeof(uart->rx_fifo)) ? 1 : 0;
+}
+
+static void uart_event(void *opaque, QEMUChrEvent event)
+{   /* No-op: nothing is done for any event value; both arguments are unused. */
+}
+
+/*
+ * Re-install this device's handlers (can-receive, receive, event and this
+ * function itself) on its chardev front end. Always returns 0.
+ *
+ * NOTE(review): passed as the fifth argument of qemu_chr_fe_set_handlers(),
+ * presumably the back-end-change hook - confirm against chardev/char-fe.h.
+ */
+static int uart_be_change(void *opaque)
+{
+    SiFiveUARTState *uart = opaque;
+
+    qemu_chr_fe_set_handlers(&uart->chr,
+                             uart_can_rx, uart_rx, uart_event, uart_be_change,
+                             uart, NULL, true);
+    return 0;
+}
+
+/*
+ * Create a UART device and map it into an address space.
+ *
+ * Allocates a zero-initialised SiFiveUARTState with g_malloc0(), records
+ * irq as its interrupt line, initialises its chardev front end with chr and
+ * installs the receive/event handlers, then creates an MMIO region of
+ * SIFIVE_UART_MAX bytes backed by uart_ops and adds it to address_space at
+ * offset base.
+ *
+ * Returns the newly allocated state; nothing in this function frees it.
+ */
+SiFiveUARTState *sifive_uart_create(MemoryRegion *address_space, hwaddr base,
+    Chardev *chr, qemu_irq irq)
+{
+    SiFiveUARTState *uart = g_malloc0(sizeof(*uart));
+
+    uart->irq = irq;
+
+    qemu_chr_fe_init(&uart->chr, chr, &error_abort);
+    qemu_chr_fe_set_handlers(&uart->chr,
+                             uart_can_rx, uart_rx, uart_event, uart_be_change,
+                             uart, NULL, true);
+
+    memory_region_init_io(&uart->mmio, NULL, &uart_ops, uart,
+                          TYPE_SIFIVE_UART, SIFIVE_UART_MAX);
+    memory_region_add_subregion(address_space, base, &uart->mmio);
+
+    return uart;
+}
+
+
+
+
+

Paravirtualization

+
    +
  • Change the guest OS so that it cooperates with the VMM
      +
    • CPU paravirtualization
    • +
    • MMU paravirtualization
    • +
    • I/O paravirtualization
    • +
    +
  • +
  • VMM exposes hypercalls for:
      +
    • activate / deactivate the interrupts
    • +
    • changing page tables
    • +
    • accessing virtualized peripherals
    • +
    +
  • +
  • VMM uses events to trigger interrupts in the VM
  • +
+
+
+

Intel VT-x

+
    +
  • Hardware extension to transform x86 to the point it can be +virtualized "classically"
  • +
  • New execution mode: non-root mode
  • +
  • Each non-root mode instance uses a Virtual Machine Control +Structure (VMCS) to store its state
  • +
  • VMM runs in root mode
  • +
  • VM-entry and VM-exit are used to transition between the two modes
  • +
+
+

Virtual Machine Control Structure

+
    +
  • Guest information: state of the virtual CPU
  • +
  • Host information: state of the physical CPU
  • +
  • Saved information:
      +
    • visible state: segment registers, CR3, IDTR, etc.
    • +
    • internal state
    • +
    +
  • +
  • VMCS can not be accessed directly but certain information can be +accessed with special instructions
  • +
+
+
+

VM entry & exit

+
    +
  • VM entry - new instructions that switch the CPU to non-root +mode and load the VM state from a VMCS; host state is saved in +VMCS
  • +
  • Allows injecting interrupts and exceptions in the guest
  • +
  • VM exit will be automatically triggered based on the VMCS +configuration
  • +
  • When VM exit occurs host state is loaded from VMCS, guest state +is saved in VMCS
  • +
+
+
+

VM execution control fields

+
    +
  • Selects conditions which trigger a VM exit; examples:
      +
    • If an external interrupt is generated
    • +
    • If an external interrupt is generated and EFLAGS.IF is set
    • +
    • If CR0-CR4 registers are modified
    • +
    +
  • +
  • Exception bitmap - selects which exceptions will generate a VM +exit
  • +
  • IO bitmap - selects which I/O addresses (IN/OUT accesses) +generate a VM exit
  • +
  • MSR bitmaps - selects which RDMSR or WRMSR instructions will +generate a VM exit
  • +
+
+
+
+

Extended Page Tables

+
    +
  • Reduces the complexity of MMU virtualization and improves +performance
  • +
  • Access to CR3, INVLPG and page faults do not require VM exit +anymore
  • +
  • The EPT page table is controlled by the VMM
  • +
+../_images/ditaa-cc9a2e995be74ee99646ea4bf0e551d766fa92ef.png +
+

VPID

+
    +
  • VM entry and VM exit force a TLB flush, which loses VMM / VM translations
  • +
  • To avoid this issue a VPID (Virtual Processor ID) tag is +associated with each VM (VPID 0 is reserved for the VMM)
  • +
  • All TLB entries are tagged
  • +
  • At VM entry and exit just the entries associated with the tags +are flushed
  • +
  • When searching the TLB just the current VPID is used
  • +
+
+
+
+

I/O virtualization

+
+
    +
  • Direct access to hardware from a VM - in a controlled fashion
      +
    • Map the MMIO host directly to the guest
    • +
    • Forward interrupts
    • +
    +
  • +
+
+../_images/ditaa-3901edd823cdc7a6f429ebc37cbc541e650abc96.png +

Instead of trapping MMIO as with emulated devices we can allow the +guest to access the MMIO directly by mapping through its page tables.

+

Interrupts from the device are handled by the host kernel and a signal +is sent to the VMM which injects the interrupt to the guest just as +for the emulated devices.

+

VT-d protects and translates VM physical addresses using an I/O +MMU (DMA remapping)

+../_images/ditaa-d880751969de8642b2613caaca345d71acea4500.png +
    +
  • Message Signaled Interrupts (MSI) = DMA writes to the host +address range of the IRQ controller (e.g. 0xFEExxxxx)
  • +
  • Low bits of the address and the data indicate which interrupt +vector to deliver to which CPU
  • +
  • Interrupt remapping table points to the virtual CPU (VMCS) that +should receive the interrupt
  • +
  • I/O MMU will trap the IRQ controller write and look it up in the +interrupt remapping table
      +
    • if that virtual CPU is currently running it will take the +interrupt directly
    • +
    • otherwise a bit is set in a table (Posted Interrupt Descriptor +table) and the interrupt will be injected the next time that vCPU is +run
    • +
    +
  • +
+../_images/ditaa-2cb0eb0056bb775d1446843d62241fd660662c96.png +
    +
  • Single Root - Input Output Virtualization
  • +
  • Physical device with multiple Ethernet ports will be shown as +multiple devices on the PCI bus
  • +
  • Physical Function is used for the control and can be configured
      +
    • to present itself as a new PCI device
    • +
    • which VLAN to use
    • +
    +
  • +
  • The new virtual function is enumerated on the bus and can be +assigned to a particular guest
  • +
+
+
+

qemu

+
    +
  • Uses binary translation via Tiny Code Generator (TCG) for +efficient emulation
  • +
  • Supports different target and host architectures (e.g. running +ARM VMs on x86)
  • +
  • Both process and full system level emulation
  • +
  • MMU emulation
  • +
  • I/O emulation
  • +
  • Can be used with KVM for accelerated virtualization
  • +
+
+
+

KVM

+../_images/ditaa-f8fcc760ef5dad50d1038ed3426d0fcce12fd3e6.png +
    +
  • Linux device driver for hardware virtualization (e.g. Intel VT-x, SVM)
  • +
  • IOCTL based interface for managing and running virtual CPUs
  • +
  • VMM components implemented inside the Linux kernel +(e.g. interrupt controller, timers)
  • +
  • Shadow page tables or EPT if present
  • +
  • Uses qemu or virtio for I/O virtualization
  • +
+
+
+

Type 1 vs Type 2 Hypervisors

+
    +
  • Type 1 = Bare Metal Hypervisor
  • +
  • Type 2 = Hypervisor embedded in an existing kernel / OS
  • +
+
+
+

Xen

+../_images/xen-overview.png +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/objects.inv b/refs/pull/405/merge/objects.inv new file mode 100644 index 00000000..fd87c455 Binary files /dev/null and b/refs/pull/405/merge/objects.inv differ diff --git a/refs/pull/405/merge/search.html b/refs/pull/405/merge/search.html new file mode 100644 index 00000000..cb9bb85d --- /dev/null +++ b/refs/pull/405/merge/search.html @@ -0,0 +1,173 @@ + + + + + + Search — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/refs/pull/405/merge/searchindex.js b/refs/pull/405/merge/searchindex.js new file mode 100644 index 00000000..d74a26a7 --- /dev/null +++ b/refs/pull/405/merge/searchindex.js @@ -0,0 +1 @@ +Search.setIndex({docnames:["index","info/contributing","info/extra-vm","info/vm","labs/arm_kernel_development","labs/block_device_drivers","labs/deferred_work","labs/device_drivers","labs/device_model","labs/filesystems_part1","labs/filesystems_part2","labs/infrastructure","labs/interrupts","labs/introduction","labs/kernel_api","labs/kernel_modules","labs/kernel_profiling","labs/memory_mapping","labs/networking","lectures/address-space","lectures/arch","lectures/debugging","lectures/fs","lectures/interrupts","lectures/intro","lectures/memory-management","lectures/networking","lectures/processes","lectures/smp","lectures/syscalls","lectures/virt","so2/assign-collaboration","so2/assign0-kernel-api","so2/assign1-kprobe-based-tracer","so2/assign2-driver-uart","so2/assign3-software-raid","so2/assign4-transport-protocol","so2/assign5-pitix","so2/assign7-kvm-vmm","so2/grading","so2/index","so2/lab1-intro","so2/lab10-networking","so2/lab11-arm-kernel-development","so2/lab12-kernel-profiling","so2/lab2-kernel-api","so2/lab3-device-drivers","so2/lab4-interrupts","so2/lab5-deferred-work","so2/lab6-memory-mapping","so2/lab7-block-device-drivers","so2/lab8-filesystems-part1","so2/lab9-filesystems-part2","so2/lec1-intro","so2/lec10
-networking","so2/lec11-arch","so2/lec12-profiling","so2/lec12-virtualization","so2/lec2-syscalls","so2/lec3-processes","so2/lec4-interrupts","so2/lec5-smp","so2/lec6-address-space","so2/lec7-memory-management","so2/lec8-filesystems","so2/lec9-debugging"],envversion:53,filenames:["index.rst","info/contributing.rst","info/extra-vm.rst","info/vm.rst","labs/arm_kernel_development.rst","labs/block_device_drivers.rst","labs/deferred_work.rst","labs/device_drivers.rst","labs/device_model.rst","labs/filesystems_part1.rst","labs/filesystems_part2.rst","labs/infrastructure.rst","labs/interrupts.rst","labs/introduction.rst","labs/kernel_api.rst","labs/kernel_modules.rst","labs/kernel_profiling.rst","labs/memory_mapping.rst","labs/networking.rst","lectures/address-space.rst","lectures/arch.rst","lectures/debugging.rst","lectures/fs.rst","lectures/interrupts.rst","lectures/intro.rst","lectures/memory-management.rst","lectures/networking.rst","lectures/processes.rst","lectures/smp.rst","lectures/syscalls.rst","lectures/virt.rst","so2/assign-collaboration.rst","so2/assign0-kernel-api.rst","so2/assign1-kprobe-based-tracer.rst","so2/assign2-driver-uart.rst","so2/assign3-software-raid.rst","so2/assign4-transport-protocol.rst","so2/assign5-pitix.rst","so2/assign7-kvm-vmm.rst","so2/grading.rst","so2/index.rst","so2/lab1-intro.rst","so2/lab10-networking.rst","so2/lab11-arm-kernel-development.rst","so2/lab12-kernel-profiling.rst","so2/lab2-kernel-api.rst","so2/lab3-device-drivers.rst","so2/lab4-interrupts.rst","so2/lab5-deferred-work.rst","so2/lab6-memory-mapping.rst","so2/lab7-block-device-drivers.rst","so2/lab8-filesystems-part1.rst","so2/lab9-filesystems-part2.rst","so2/lec1-intro.rst","so2/lec10-networking.rst","so2/lec11-arch.rst","so2/lec12-profiling.rst","so2/lec12-virtualization.rst","so2/lec2-syscalls.rst","so2/lec3-processes.rst","so2/lec4-interrupts.rst","so2/lec5-smp.rst","so2/lec6-address-space.rst","so2/lec7-memory-management.rst","so2/lec8-filesystems.rst","so2/lec9-debug
ging.rst"],objects:{},objnames:{},objtypes:{},terms:{"0000000c":[15,41],"000002d8":[21,65],"0000eef4":[21,65],"0001b000":[27,59],"001f":[12,47],"001s":[16,44],"005f":[12,47],"006f":[12,47],"007b":[15,21,41,65],"008f":[12,47],"0091ffff":[4,43],"00a0":[12,47],"00a1":[12,47],"00c0":[12,47],"00d8":[21,65],"00df":[12,47],"00f0":[12,47],"00ff":[12,47],"011848be72f8bb42":[21,65],"015s":[16,44],"01f0":[12,47],"01f7":[12,47],"0209c000":[4,43],"0209ffff":[4,43],"020s":[16,44],"021a0000":[4,43],"021a3fff":[4,43],"02x":[5,50],"037a":[12,47],"037b":[12,47],"037f":[12,47],"03c0":[12,47],"03df":[12,47],"03f6":[12,47],"03f8":[12,47],"03ff":[12,47],"0716b000":[21,65],"0785f000":[21,65],"0804c000":[27,59],"0804d000":[27,59],"0806e000":[27,59],"08afb008":[21,65],"08afb050":[21,65],"092e1000":[17,49],"0m0":[16,44],"0ubuntu1":2,"0ul":[28,61],"0x0":[13,15,21,27,38,41,59,65],"0x00000000":[2,13,41],"0x00001234":[15,41],"0x0001234":[15,41],"0x0092":[19,62],"0x009a":[19,62],"0x01":[12,47],"0x08":[12,21,47,65],"0x10":[13,15,21,41,65],"0x100":[21,65],"0x1000":[21,65],"0x100000":38,"0x105":[21,65],"0x106":[15,41],"0x11":[13,41],"0x119":[21,65],"0x11c":[21,65],"0x12":[15,41],"0x120":[21,65],"0x1234":[15,41],"0x12c":[21,65],"0x140":[15,41],"0x14f":[21,65],"0x150":[15,41],"0x15f":[21,65],"0x163":[21,65],"0x164":[21,65],"0x17":[21,65],"0x170":[15,21,41,65],"0x17a0":[21,65],"0x180":[21,65],"0x19":[21,65],"0x190":[21,65],"0x19b8":[15,41],"0x1a":[15,41],"0x1a0":[21,65],"0x1a40":[15,41],"0x1b":[21,65],"0x1b0":[15,41],"0x1be":[21,65],"0x1c":[12,21,47,65],"0x1c2":[21,65],"0x1d":[12,21,47,65],"0x1e":[12,47],"0x1e0":[21,65],"0x1e3":[21,65],"0x1ee":[21,65],"0x1f0":[21,65],"0x1f2":[21,65],"0x1f45":[21,65],"0x2":38,"0x20":[15,21,41,65],"0x200":[21,65],"0x201a":[21,65],"0x21":[15,41],"0x2380":[21,65],"0x25":[21,65],"0x2590":[21,65],"0x27":[21,65],"0x2a":[12,47],"0x2b":[21,65],"0x2d":[15,41],"0x2e":[12,21,47,65],"0x2e0":[21,65],"0x2f":[21,65],"0x2f0":[21,65],"0x2f8":34,"0x2ff":34,"0x3":[15,41],"0x30":[12,15,21,
41,47,65],"0x305":[21,65],"0x30a":[21,65],"0x31":[21,65],"0x310":[21,65],"0x317":[21,65],"0x32":[21,65],"0x35":[21,65],"0x37":[21,65],"0x378":[12,47],"0x379":[12,47],"0x37a":[12,47],"0x38":[13,21,41,65],"0x3824548b":[13,41],"0x39":[21,65],"0x3a":[21,65],"0x3f":[21,65],"0x3f8":[12,34,47],"0x3ff":34,"0x4":38,"0x40":[21,65],"0x400":38,"0x4092":[19,62],"0x409a":[19,62],"0x42":[21,65],"0x43":[15,41],"0x44902cc2":[21,65],"0x46":[21,65],"0x48":[21,65],"0x4b":[21,65],"0x4b4":[21,65],"0x4c":[21,65],"0x4f":[21,65],"0x5":[15,41],"0x50":[21,65],"0x52":[21,65],"0x57":[21,65],"0x5ea":[21,65],"0x6":[15,21,41,65],"0x60":[12,21,47,65],"0x61":[12,21,47,65],"0x63":[21,65],"0x64":[12,47],"0x640":[21,65],"0x65":[12,47],"0x66":[21,65],"0x6a":[21,65],"0x6c":[21,65],"0x7":[21,65],"0x70":[21,65],"0x71":[21,65],"0x72":[21,65],"0x74726976":38,"0x7d":[21,65],"0x7ffc3349ba50":[16,44],"0x8":[21,65],"0x80":[12,21,29,47,58,65],"0x8000":38,"0x80000000":[30,57],"0x830":[21,65],"0x858458f6":[9,51],"0x8a":[15,41],"0x8d":[21,65],"0x9":[21,65],"0x90":[21,65],"0x93":[21,65],"0x95":[21,65],"0x96":[21,65],"0x99":[21,65],"0x9b":[21,65],"0x9c":[12,47],"0x9d":[12,47],"0x9e":[12,47],"0xa":[21,65],"0xa09b":[19,62],"0xa0fb":[19,62],"0xa4":[15,21,41,65],"0xa9":[21,65],"0xaa":[12,17,47,49],"0xae":[12,47],"0xaf":[21,65],"0xb":[21,65],"0xb0":[12,21,47,65],"0xbb":[17,49],"0xc":[13,15,21,41,65],"0xc01013d3":[13,41],"0xc01013d7":[13,41],"0xc01013dc":[13,41],"0xc01013de":[13,41],"0xc01013e0":[13,41],"0xc01013e2":[13,41],"0xc01013e6":[13,41],"0xc01013eb":[13,41],"0xc01013ee":[13,41],"0xc0101431":[13,41],"0xc0111aab":[13,41],"0xc011482a":[13,41],"0xc011548b":[13,41],"0xc0115c6c":[13,41],"0xc011bc58":[13,41],"0xc014249e":[13,41],"0xc0142725":[13,41],"0xc0142d4e":[13,41],"0xc0142d7d":[13,41],"0xc0142de5":[13,41],"0xc014363d":[13,41],"0xc014369f":[13,41],"0xc014fee7":[13,41],"0xc015042c":[13,41],"0xc01507a1":[13,41],"0xc02e535c":[13,41],"0xc02e536c":[13,41],"0xc02e537c":[13,41],"0xc02e538c":[13,41],"0xc092":[19,62],"0xc093":
[19,62],"0xc09a":[19,62],"0xc09b":[19,62],"0xc0f2":[19,62],"0xc0f3":[19,62],"0xc0fa":[19,62],"0xc0fb":[19,62],"0xc10001da":2,"0xc101f136":2,"0xc106a6dd":2,"0xc106a8c5":2,"0xc1244138":[21,65],"0xc13cb14a":2,"0xc13cf2f2":2,"0xc1507a7a":2,"0xc15de780":[23,60],"0xc15de874":[23,60],"0xc8816000":[21,65],"0xc8817000":[21,65],"0xc886d000":[15,41],"0xc888a000":[15,41],"0xc8903000":[15,41],"0xc895a000":[15,41],"0xc89ad000":[15,41],"0xc89d4000":[15,41],"0xc8d":[21,65],"0xcc":[17,49],"0xd":[21,65],"0xd5":[15,41],"0xd7871500":[21,65],"0xd84156c5635688c0":[21,65],"0xdd":[17,49],"0xde":[21,65],"0xe5":[21,65],"0xe77":[21,65],"0xe9":38,"0xeb":[21,65],"0xf0":[15,21,41,65],"0xfd":[21,65],"0xfeexxxxx":[30,57],"0xff":[12,47],"0xff800000":[23,60],"0xff801000":[23,60],"0xffe000ba":[13,41],"0xfffbd000":38,"0xffff":[19,23,60,62],"0xffffe000ul":[19,62],"0xfffff":[19,62],"0xfffff000":[19,62],"100m":[24,53],"10p":38,"12k":[14,45],"14x14":[4,43],"17ec19e9b5bf":[27,59],"1980s":[14,45],"1ubuntu1":[21,65],"209c000":[4,43],"20min":53,"20p":38,"2115503fc3e3":31,"21a0000":[4,43],"2981ce73ae801363":[15,41],"2mb":38,"30p":38,"31s":8,"32bit":[0,4,17,24,27,29,43,49,53,58,59],"32byte":[25,63],"34xc3":39,"3c92a02cc527":31,"3c92a02cc52700d2cd7c50a20297eef8553c207a":31,"3gbp":[26,54],"3nd":53,"3rd":[5,6,9,10,12,13,17,41,47,48,49,50,51,52,53],"400mb":2,"4480c000":[17,49],"4482e000":[17,49],"4482f000":[17,49],"449a9000":[17,49],"449ab000":[17,49],"449ac000":[17,49],"449af000":[17,49],"45eeb3d6ea8ff1":[15,41],"4gb":[19,62],"4kb":[19,24,53,62],"4mb":[19,25,62,63],"4umkcismqm":11,"512m":[4,43],"53asm":2,"5a5a5a5a":[21,65],"60k":[26,54],"628s":[21,65],"64bit":[4,19,43,62],"64k":[17,19,49,62],"6b6b6b6b":[21,65],"738mbp":[26,54],"77f49f83f2e42f91":[21,65],"7e43c163832f":[27,59],"7rwv63e9wf":3,"7th":[16,44],"8kb":[27,59],"978mbp":[26,54],"9f680e8136bf":31,"9fffffff":[4,43],"9gbp":[26,54],"\u00een\u0163ele":53,"\u00een\u021belegerea":53,"\u00eenaint":53,"\u00eensu\u0219irea":53,"\u00eentreb\u0103ril":53,"\u00eentreba\
u0163i":53,"\u00eentreruperi":53,"\u015fi":53,"\u0219erb\u0103nescu":53,"\u0219i":53,"\u0219tefan":53,"a\u0163i":53,"abstract":[0,8,9,18,20,24,27,35,42,51,53,55,59],"activit\u0103\u021bi":53,"activita\u021bi":53,"adun\u0103ri":53,"ajust\u0103ri":53,"b\u0103lu\u021b\u0103":53,"baz\u0103":53,"break":[6,7,13,27,41,46,48,59],"byte":[4,5,7,9,10,12,14,15,16,17,18,19,21,22,23,27,30,34,35,36,37,38,41,42,43,44,45,46,47,49,50,51,52,57,59,60,62,64,65],"c\u00e2nd":53,"c\u00e2te":53,"c\u0103lduro":53,"cas\u0103":53,"case":[1,5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,23,24,27,28,29,30,32,33,34,35,36,37,38,39,41,42,44,45,46,47,48,49,50,51,52,53,57,58,59,60,61,62,65],"catch":[19,21,62,65],"char":[3,5,6,7,8,9,10,12,14,15,17,18,21,26,30,31,33,37,38,41,42,45,46,47,48,49,50,51,52,54,57,65],"class":[0,6,7,19,21,22,24,27,46,48,53,59,62,64,65],"compara\u0163ii":53,"condi\u0163ii":53,"const":[5,6,7,8,9,10,12,18,19,22,26,29,30,31,36,42,46,47,48,50,51,52,54,57,58,62,64],"const\u0103":53,"contribu\u021bii":53,"corec\u021bii":53,"cuno\u0219tin\u021belor":53,"default":[2,3,7,8,10,13,14,15,16,19,24,26,27,34,36,39,41,44,45,46,52,53,54,59,62],"dep\u0103se\u0219t":53,"discu\u0163ii":53,"discu\u021bi":53,"discu\u021bii":53,"dup\u0103":53,"echilibra\u021bi":53,"enum":[6,18,19,22,23,42,48,60,62,64],"exerci\u021bii":53,"export":[8,14,32,45],"fi\u0219ier":[37,53],"fi\u0219ierelor":53,"final":[0,6,17,20,21,23,27,28,31,38,48,49,55,59,60,61,65],"final\u0103":53,"function":[0,2,3,6,7,10,12,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,29,30,31,33,34,35,36,37,38,41,42,44,45,46,47,48,49,52,53,55,57,58,59,60,61,62,63,64,65],"ghi\u021b\u0103":53,"goto":[5,18,27,28,42,50,59,61],"implement\u0103rii":53,"import":[0,5,6,7,8,9,10,13,14,15,16,17,18,21,22,24,27,28,29,39,41,42,44,45,46,48,49,50,51,52,53,58,59,61,64,65],"informa\u021bii":53,"instruc\u021biuni":53,"int":[4,5,6,7,8,9,10,12,14,15,17,18,19,21,22,23,25,26,27,29,30,31,38,41,42,43,45,46,47,48,49,50,51,52,54,57,58,59,60,62,63,64,65],"lauren\u021biu":53,"leg\u0103t
ur\u0103":53,"list\u0103":53,"long":[0,2,5,6,7,9,10,12,17,18,19,22,23,27,28,29,31,37,39,42,46,47,48,49,50,51,52,58,59,60,61,62,64],"mi\u0219u":53,"minim\u0103":53,"new":[0,1,2,5,6,7,9,10,11,13,14,15,17,18,19,21,22,23,24,25,27,28,30,37,38,41,42,45,46,48,49,50,51,52,53,57,59,60,61,62,63,64,65],"null":[6,7,8,9,10,12,14,15,16,17,18,19,21,22,27,30,41,42,44,45,46,47,48,49,51,52,57,59,62,64,65],"participa\u0163i":53,"pozi\u021bionar":0,"preciz\u0103ri":53,"primi\u021bi":53,"propune\u021bi":53,"pu\u0163in":53,"public":[18,24,30,37,39,42,53,57],"puncteaz\u0103":53,"purdil\u0103":53,"r\u0103ducanu":53,"r\u0103spunde\u021bi":53,"r\u0103zvan":53,"re\u021belei":53,"recomand\u0103":53,"return":[0,5,6,7,8,9,10,12,13,14,15,17,18,19,21,22,25,27,28,29,30,31,33,35,38,41,42,45,46,47,48,49,50,51,52,57,58,59,61,62,63,64,65],"short":[12,18,36,42,47,51],"socke\u021bi":53,"static":[4,5,7,8,9,10,12,13,14,15,16,17,18,19,21,23,27,28,29,30,31,32,33,34,35,36,37,38,41,42,43,44,45,46,47,49,50,51,52,57,58,59,60,61,62,65],"super":[9,10,27,51,52,59],"switch":[0,7,9,16,19,21,22,23,28,29,30,38,44,46,51,57,58,60,61,62,64,65],"true":[4,6,8,18,22,23,27,28,29,30,42,43,48,57,58,59,60,61,64],"try":[6,8,9,10,12,13,15,16,21,24,25,27,28,31,41,44,47,48,51,52,53,59,61,63,65],"var":[4,15,16,41,43,44],"virtual\u0103":53,"vo\u0219tri":53,"void":[5,6,7,8,9,10,12,14,15,17,18,19,21,22,23,25,26,27,28,29,30,31,41,42,45,46,47,48,49,50,51,52,54,57,58,59,60,61,62,63,64,65],"while":[5,6,7,8,9,10,12,13,14,15,16,17,18,19,22,23,24,27,28,29,41,42,44,45,46,47,48,49,50,51,52,53,58,59,60,61,62,64],AND:[12,47],Adding:[0,5,50],And:[6,7,15,19,28,31,41,46,48,61,62],Are:2,BHs:[23,60],BUS:[8,24,53],Bus:[0,28,61],But:[12,14,27,29,38,45,47,58,59],CDs:[9,51],FOR:[30,57],For:[1,2,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,27,28,29,30,32,33,34,35,36,37,39,41,42,44,45,46,47,48,49,50,51,52,53,57,58,59,60,61,62,63,64,65],HFS:[10,52],I2S:[4,43],IDE:[7,8,46],IDs:8,Into:[18,42],Its:[7,8,10,46,52],MIS:[12,47],NOT:[4,5,6,7,8,9,10,12,14,15
,16,17,18,41,42,43,44,45,46,47,48,49,50,51,52],Not:[15,21,23,25,30,41,57,60,63,65],ORed:[26,54],OSes:[23,60],One:[5,7,8,10,14,15,16,21,24,25,27,28,41,44,45,46,50,52,53,59,61,63,65],PCs:[24,53],Such:[8,10,14,15,18,24,41,42,45,52,53],TLS:[19,62],TOS:[26,54],That:[6,7,9,13,15,18,19,41,42,46,48,51,62],The:[0,1,2,3,4,5,6,7,11,12,13,14,15,16,17,19,20,21,23,24,25,26,28,29,30,31,32,33,34,35,36,37,38,39,41,43,44,45,46,47,48,49,50,53,54,55,57,58,60,61,62,63,65],Their:[12,18,24,42,47,53],Then:[4,5,6,7,8,9,10,12,13,14,15,16,17,18,21,23,27,28,29,31,41,42,43,44,45,46,47,48,49,50,51,52,58,59,60,61,65],There:[2,4,5,6,8,9,10,12,13,15,17,18,19,21,22,23,24,25,27,28,29,33,34,35,36,39,41,42,43,47,48,49,50,51,52,53,58,59,60,61,62,63,64,65],These:[5,6,7,8,9,10,11,13,14,15,16,18,19,22,23,24,29,37,41,42,44,45,46,48,50,51,52,53,58,60,62,64],Use:[0,4,5,6,7,8,9,10,12,13,15,16,17,18,19,24,27,38,41,42,43,44,46,47,48,49,50,51,52,53,59,62],Used:[13,22,24,25,41,53,63,64],Useful:[0,15,41],Uses:[22,25,27,30,57,59,63,64],Using:[0,5,6,7,8,12,13,14,15,17,18,23,25,28,39,41,42,45,46,47,48,49,50,60,61,63],VCS:31,VFS:[0,10,22,24,52,53,64],VMs:[27,30,38,57,59],Will:[6,23,48,60],With:[4,6,13,15,24,27,29,33,41,43,48,53,58,59],___wait_cond_timeout:[27,59],___wait_ev:[27,59],___wait_is_interrupt:[27,59],__add_wait_queu:[27,59],__add_wait_queue_entry_tail:[27,59],__align:[18,42],__aligned__:[27,59],__always_inlin:[19,27,28,29,58,59,61,62],__assembly__:[19,62],__attribute__:[22,27,59,64],__attribute_used__:[27,59],__be16:[18,36,42],__be32:[18,42],__big_endian_bitfield:[18,42],__bread:[9,51],__builtin_expect:[29,58],__cache_fre:[21,65],__chk_user_ptr:[29,58],__data_len:[5,50],__do_softirq:[21,65],__end_of_fixed_address:[19,62],__ex_tabl:[29,58],__exit:[8,18,31,42],__file__:[15,41],__fix_to_virt:[19,62],__forc:[29,58],__free_pag:35,__func__:[15,21,30,41,57,65],__function__:[15,41],__get_dma_pag:[25,63],__get_free_pag:[25,63],__get_user_1:[29,58],__get_user_2:[29,58],__get_user_4:[29,58],__get_user_:[29,58],__get_zer
o_pag:[25,63],__ia32_sys_writ:[16,44],__init:[8,9,18,31,42,51],__init__:[27,59],__int:[27,59],__inttyp:[29,58],__irqentry_text_end:[16,21,44,65],__kernel_vsyscal:[16,44],__kmalloc:33,__kmap_atomic_idx:[19,62],__label__:[27,59],__libc_start_main:[16,44],__line__:[15,41],__list_del:[21,65],__list_del_entri:[21,65],__little_endian_bitfield:[18,42],__lock_acquir:[21,65],__might_sleep:[21,65],__mutex_lock:[21,65],__mutex_lock_slowpath:[28,61],__mutex_set_flag:[28,61],__mutex_trylock:[28,61],__mutex_trylock_fast:[28,61],__mutex_unlock_fast:[28,61],__mutex_unlock_slowpath:[15,28,41,61],__mutex_waiter_is_first:[28,61],__noinstr_text_start:[16,44],__nr_syscall_compat_max:[29,58],__out:[27,59],__randomize_layout:[22,64],__rcu:[18,42],__request_region:[12,47],__ret:[27,59],__ret_gu:[29,58],__sched:[28,61],__sector:[5,50],__set_current_st:[27,28,59,61],__slab_error:[21,65],__sock_creat:[18,42],__start___ex_t:[29,58],__stop___ex_t:[29,58],__sum16:[18,42],__switch_to:[27,59],__switch_to_asm:[27,59],__syscall_i386:[29,58],__test_and_set_bit:[10,52],__this_cpu_inc_return:[19,62],__this_fixmap_does_not_exist:[19,62],__typeof__:[29,58],__u16:[18,26,37,42,54],__u32:[18,37,42],__u8:[18,36,37,42],__user:[7,18,31,42,46],__val_gu:[29,58],__virt_to_fix:[19,62],__visibl:[29,58],__vunmap:[15,41],__wait_ev:[27,59],__wake_up:[27,59],__wake_up_common:[27,59],__wake_up_common_lock:[27,59],__wq_entri:[27,59],_asm_ax:[29,58],_asm_dx:[29,58],_asm_ext:[29,58],_asm_extable_handl:[29,58],_checker:[32,33,34,35,36],_delai:[6,48],_devic:35,_iget:[10,52],_inode_info:[10,52],_ioc:[7,46],_ioc_non:[7,46],_ioc_read:[7,46],_ioc_writ:[7,46],_one_:[27,59],_raw_spin_lock:[21,65],_really_:[23,60],_register_driv:8,_ret_ip_:[28,61],_skb_refdst:[18,42],_stext:[15,41],_vimrc:[13,41],a_op:[10,22,52,64],aarch64:[4,43],abbrevi:[22,64],abc:[5,50],abil:[7,8,23,24,46,53,60],abl:[5,6,8,9,10,12,21,25,36,47,48,50,51,52,63,65],abnorm:[21,23,60,65],abort:[23,60],about:[0,5,6,7,8,9,10,12,15,17,18,21,22,27,32,33,34,35,36,37,38,39,
42,46,47,48,49,50,51,52,59,64,65],abov:[2,5,6,7,8,9,10,12,13,14,15,16,18,21,28,29,32,33,34,35,36,37,38,41,42,44,45,46,47,48,50,51,52,58,61,65],absenc:[28,61],absent:[3,19,62],absolut:[6,12,15,19,41,47,48,62],ac1af6d88a25:31,academ:39,academia:[24,53],acceler:[0,30,57],accept:[0,5,6,24,35,36,39,48,50,53],access:[0,3,5,6,8,9,10,13,15,16,17,18,19,20,21,22,23,24,28,30,31,32,34,35,37,39,40,41,42,44,48,49,50,51,52,53,55,57,60,61,62,64,65],accessor:[5,20,50,55],accident:[28,61],accomplish:[7,9,12,18,23,27,42,46,47,51,59,60],accord:[5,6,7,8,10,14,15,37,38,39,41,45,46,48,50,52],accordingli:[5,7,10,46,50,52],account:[6,7,13,18,27,33,42,46,48,59],accur:[7,46],achiev:[3,8,14,24,27,28,39,45,53,59,61],ack:[18,42],ack_seq:[18,42],acknowledg:[12,23,47,60],acomodarea:53,acpi:[8,12,20,47,55],acquir:[5,6,9,10,12,13,14,18,21,28,35,39,41,42,45,47,48,50,51,52,61,65],acronym:[28,61],across:[18,19,24,28,42,53,61,62],act:[5,36,50],action:[0,5,6,7,8,12,14,26,28,45,46,47,48,50,54,61],activ:[5,10,12,13,15,16,18,21,23,24,27,28,30,35,39,41,42,44,47,50,51,52,53,57,59,60,61,65],active_mm:[27,59],activit:53,activitatea:53,actual:[5,6,7,8,10,12,15,27,28,29,30,37,41,46,47,48,50,52,57,58,59,61],adapt:[26,54],add:[0,1,4,5,6,7,9,10,12,13,14,15,16,20,22,25,26,27,28,29,32,33,34,35,36,38,41,43,44,45,46,47,48,50,51,52,54,55,58,59,61,63,64],add_disk:[5,50],add_pid:[14,45],add_preempt_count:[28,61],add_stor:8,add_tim:13,add_to_buff:[14,45],add_uevent_var:8,added:[2,5,6,7,8,9,10,12,13,14,15,27,28,33,41,45,46,47,48,50,51,52,59,61],addf:32,adding:[5,8,10,15,18,28,32,41,42,50,52,61],addison:53,addit:[1,6,7,8,9,10,12,13,14,15,16,21,33,34,35,36,41,44,45,46,47,48,51,52,65],addition:[5,9,50,51],addr2lin:[0,13],addr:[14,17,18,19,26,29,30,36,42,45,49,54,57,58,62],addr_len:[18,42],addr_limit:[29,58],addreess:[26,54],addres:[26,54],address:[0,2,3,4,5,6,8,12,13,14,15,17,20,21,25,26,27,28,29,30,33,34,36,38,39,40,41,43,45,47,48,49,50,54,55,57,58,59,61,63,65],address_spac:[10,22,30,52,57,64],address_space_oper:[10,22,52,64],
adher:[14,45],adjac:[5,21,35,50,65],adjust:[15,18,21,39,41,42,65],admin:[7,46],adresarea:53,advanc:[7,15,24,35,39,41,46,53],advantag:[6,7,8,13,15,19,24,28,41,46,48,53,61,62],advic:[13,41],advoc:[24,53],af_inet:[18,26,42,54],af_stp:36,affect:[13,16,24,28,41,44,51,53,61],aften:[6,48],after:[0,1,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,23,24,27,28,29,35,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,58,59,60,61,62,65],afterward:[13,15],again:[4,5,6,7,8,9,10,12,13,14,15,16,17,18,41,42,43,44,45,46,47,48,49,50,51,52],against:[19,27,28,29,39,58,59,61,62],age:[21,65],aggreg:[26,28,54,61],agnost:[18,42],ago:[14,23,45,60],agre:31,ahead:[22,64],aid:[15,41],aim:[12,16,33,44,47],albeit:3,alessandro:53,alex:53,alexandru:53,algorithm:[24,25,53,63],alia:[18,42],alic:[15,41],align:[5,17,22,27,28,29,49,50,58,59,61,64],all:[1,2,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,60,61,62,63,64,65],alloc:[0,4,5,6,7,8,9,10,12,13,17,19,20,21,22,23,24,27,33,35,37,38,39,41,43,46,47,48,49,50,51,52,53,55,59,60,62,64,65],alloc_chrdev_region:[7,46],alloc_disk:[5,50],alloc_inod:[9,10,51,52],alloc_io:[6,48],alloc_memori:[14,45],alloc_mmap_pag:[17,49],alloc_pag:[5,17,25,35,49,50,63],alloc_skb:[18,42],allow:[2,4,5,6,7,8,10,13,14,15,16,17,18,19,23,24,26,27,28,29,30,31,36,39,41,42,43,44,45,46,48,49,50,52,53,54,57,58,59,60,61,62],almost:[7,10,15,22,23,27,41,46,52,59,60,64],alo:38,alon:[13,14,41,45],along:[7,8,12,14,24,30,35,45,46,47,53,57],alpha:[24,53],alreadi:[0,1,2,3,4,5,6,8,9,10,12,13,14,15,16,17,18,21,27,34,35,36,37,41,42,43,44,45,47,48,49,50,51,52,59,65],alsa:[24,53],also:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,27,28,29,30,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,57,58,59,60,61,62,63,64,65],alt:[7,46],alter:[23,60],altern:[0,6,9,10,13,15,25,34,41,48,51,52,63],although:[4,5,6,7,8,9,10,12,13,14,15,19,21,22,24,28,29,41,43,45,46,47,48,50,51,52,53,58,61,62,64,65
],alwai:[5,6,7,10,14,15,19,21,22,23,24,27,29,31,37,38,45,46,48,50,52,53,58,59,60,62,64,65],ambigu:[13,41],amd:[24,53],amend:1,among:[7,8,15,18,22,39,41,42,46,64],amount:[6,7,15,16,33,41,44,46,48],ampersand:[18,42],ana:38,analog:[7,36,46],analogu:[7,46],analysi:[13,15,16,32,33,34,35,36,37,38,41,44,51],analyz:[5,8,12,13,15,16,18,41,42,44,47,50],ani:[1,5,6,8,9,10,12,13,14,15,16,17,18,21,22,23,24,27,28,30,35,36,38,39,41,42,44,45,47,48,49,50,51,52,53,57,59,60,61,64,65],annot:[16,44],announc:[12,39,47],anonym:[16,44],anoth:[2,4,5,6,7,8,12,13,14,16,17,21,23,24,27,28,34,39,43,44,45,46,47,48,49,50,53,59,60,61,65],ansi:[14,15,41,45],answer:[35,36,37,39,51],answer_flag:[18,42],answer_prot:[18,42],anymor:[6,13,21,30,39,41,48,57,65],anyon:[19,62],anyth:[1,9,13,16,44,51],anytim:[14,45],anywai:[27,59],anywher:[23,60],apach:[13,41],apeluri:53,api:[0,5,7,8,9,12,13,17,19,20,23,24,28,29,35,36,38,39,40,41,46,47,49,50,51,53,55,58,60,61,62],api_v:38,apic:[12,19,23,47,60,62],apic_timer_interrupt:[21,65],apm:[19,62],app:[7,16,44,46],apparmor:[24,53],appear:[5,7,8,12,13,15,17,32,41,46,47,49,50],append:[3,4,43],appli:[1,10,18,42,52],applianc:[4,43],applic:[2,4,7,13,16,18,19,24,26,27,29,41,42,43,44,46,53,54,58,59,62],approach:[5,6,8,10,12,16,17,19,24,28,29,39,44,47,48,49,50,52,53,58,61,62],appropri:[5,7,8,9,10,12,35,46,47,50,51,52],approx:[16,44],apr:[2,3,31],april:[33,34],aprofundar:53,apropo:2,apt:[4,13,32,33,34,35,36,37,38,43],arbitrari:[17,19,28,49,61,62],arbitrarili:[17,49],arbori:53,arch:[0,2,3,4,11,12,13,15,41,43,47],arch_cpu_idl:[2,21,65],arch_flush_lazy_mmu_mod:[19,62],arch_safe_halt:2,arch_start_context_switch:[27,59],architectur:[0,4,7,12,13,18,19,21,25,27,28,29,30,40,41,42,43,46,47,57,58,59,61,62,63,65],archiv:[16,32,33,34,35,36,37,38,44],area:[5,7,8,9,12,13,14,15,17,18,19,22,24,25,27,29,33,35,37,41,42,45,46,47,49,50,51,53,58,59,62,63,64],aren:[16,44],arg:[7,16,27,31,44,46,59],argument:[5,6,7,9,10,12,13,14,15,16,17,18,20,33,41,42,44,45,46,47,48,49,50,51,52,55],aris:8,arithmet:[10,
52],arm32:[4,43],arm64:[4,24,43,53],arm:[0,3,6,16,20,24,28,30,40,44,48,53,55,57,61],arm_kernel_develop:[4,43],armi:[18,42],around:2,arpeggio:[7,46],arrai:[4,5,10,19,23,28,36,43,50,52,60,61,62],arrang:[4,23,43,60],array_map:[10,52],arriv:[23,28,60,61],art:[14,45],articl:[7,12,24,38,46,47,53],as_:[22,64],ascend:[18,42],ascii:[0,37,38],asgard:[15,41],asid:0,ask:[18,35,36,37,39,42,51],asm:[2,7,10,12,14,15,18,24,27,28,29,41,42,45,46,47,52,53,58,59,61],asm_call_constraint:[29,58],asm_clac:[29,58],asm_stac:[29,58],asmlinkag:[28,61],asmp:0,aspect:[10,13,16,41,44,52],asrc:[4,43],assembl:[13,15,29,41,58],assig:[0,39],assign:[0,3,5,7,8,17,30,40,46,49,50,57],assignment_grad:39,assist:39,associ:[5,6,7,8,9,10,12,13,14,15,17,18,19,21,22,23,24,25,29,30,33,34,35,36,37,41,42,45,46,47,48,49,50,51,52,53,57,58,60,62,63,64,65],assum:[5,6,7,10,11,29,32,33,34,35,36,46,48,50,52,58],assumpt:[5,37,50],asymmetr:0,asynchron:[5,6,22,23,48,50,60,64],ata:8,atkbd:[12,47],atkbd_interrupt:[12,47],atm:[18,42],atom:[0,5,6,7,10,12,19,27,46,47,48,50,52,59,62],atomic_add:[14,28,45,61],atomic_cmpxchg:[7,14,45,46],atomic_dec:[14,28,45,61],atomic_dec_and_test:[14,28,45,61],atomic_inc:[14,28,45,61],atomic_inc_and_test:[14,45],atomic_kmap:[19,62],atomic_long_cmpxchg_acquir:[28,61],atomic_long_cmpxchg_releas:[28,61],atomic_read:[14,45],atomic_set:[14,45],atomic_sub:[14,28,45,61],atomic_sub_and_test:[28,61],atomic_t:[7,14,22,26,45,46,54,64],attach:[8,13,18,24,29,42,53,58],attack:[27,59],attempt:[9,24,28,51,53,61],attend:39,attent:[16,23,24,44,53,60],attr:[8,10,52],attr_siz:[10,52],attribut:[0,6,7,17,24,46,48,49,53],atunci:53,audio:[4,43],augment:[6,16,44,48],authent:2,author:31,auto:[16,27,44,59],autogener:11,automat:[0,5,7,8,9,10,13,18,27,30,32,33,34,35,36,37,39,41,42,46,50,51,52,57,59],automot:[4,43],autoremove_wake_funct:[27,59],aux:[12,16,44,47],auxiliari:13,avail:[7,8,10,12,15,17,19,20,21,22,24,27,28,29,38,39,41,46,47,49,52,53,55,58,59,61,62,64,65],avoid:[5,6,11,12,14,15,17,19,22,23,24,25,28,29,30,39,41,45,
47,48,49,50,53,57,58,60,61,62,63,64],awai:[27,59],awaken:[27,59],awar:[15,24,41,53],award:53,axp:[24,53],b31a257fd8b8:31,b7761000:[17,49],b7763000:[17,49],b7766000:[17,49],b7767000:[17,49],b7f46000:[27,59],b7f49000:[27,59],b7f59000:[27,59],b7f5b000:[27,59],b7f77000:[27,59],b7f79000:[27,59],b_bdev:[9,51],b_blocknr:[9,51],b_data:[9,10,51,52],b_size:[9,51],b_state:[9,51],back:[5,18,22,24,27,28,29,42,50,53,58,59,61,64],backend:[27,59],background:0,backlog:[18,42],backlog_rcv:[18,42],backtrac:[13,15,21,41,65],backup:[13,15,41],backward:[19,62],bad:[0,21,30,57,65],bad_elf:[16,44],bad_get_us:[29,58],badli:[27,59],balanc:[6,27,28,48,59,61],balign:[29,58],bare:[30,38,57],barrier:[0,27,59],base:[0,1,3,4,5,6,7,8,9,10,12,13,14,15,16,18,19,21,22,23,24,26,28,29,30,36,38,39,40,41,42,43,44,45,46,47,48,50,51,52,53,54,57,58,60,61,62,64,65],bash:37,basi:[6,8,24,48,53],basic:[0,5,6,9,10,13,14,15,16,17,18,19,21,27,36,37,41,42,44,45,48,49,50,51,52,59,62,65],batch:[30,35,57],bcm2835:[4,43],bd_disk:[5,50],bdev:[5,50],bdflush:[9,51],beagl:[4,43],beat:13,becam:[15,41],becaus:[1,2,4,5,6,7,9,10,12,13,14,15,16,17,19,21,22,23,24,27,28,33,36,39,41,43,44,45,46,47,48,49,50,51,52,53,59,60,61,62,64,65],bechler:53,becom:[7,15,41,46],beej:[18,42],been:[5,6,7,9,10,12,13,14,15,16,19,21,23,28,33,41,44,45,46,47,48,50,51,52,60,61,62,65],befor:[4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,26,27,28,29,31,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,54,55,58,59,60,61,62,65],begin:[5,7,9,10,12,15,17,18,21,22,39,41,42,46,47,49,50,51,52,64,65],beginn:[10,52],behalf:[16,44],behav:[12,38,47],behavior:[4,5,7,8,9,14,16,21,39,43,44,45,46,50,51,65],behaviour:[6,12,16,44,47,48],behind:[7,46],being:[5,6,7,8,9,10,12,13,14,15,16,19,23,25,39,41,44,45,46,47,48,50,51,52,60,62,63],believ:39,bellow:[13,18,21,41,42,65],belong:[8,10,14,45,52],below:[4,5,6,7,8,9,10,14,15,17,18,19,21,22,23,24,27,28,38,41,42,43,45,46,48,49,50,51,52,53,59,60,61,62,64,65],bench:[16,44],benchmark:[16,44],benefit:39,benvenuti:53,besid:[3,19,
62],best:[13,16,23,31,41,44,60],better:[1,13,15,16,19,21,28,41,44,51,61,62,65],between:[0,5,7,8,9,10,12,13,14,15,16,17,18,19,21,22,23,24,25,27,28,30,34,35,36,38,39,41,42,44,45,46,47,49,50,51,52,53,57,59,60,61,62,63,64,65],bex:0,bex_add_dev:8,bex_del_dev:8,bex_dev:8,bex_dev_attr:8,bex_devic:8,bex_misc:0,bex_misc_prob:8,bex_misc_remov:8,beyond:[28,39,61],bf914c1c:[21,65],bf914dbc:[21,65],bfa05000:[27,59],bfa15000:[17,49],bfa1a000:[27,59],bfa36000:[17,49],bffeb000:[27,59],bfree:37,bh_dirti:[9,51],bi_disk:[5,50],bi_end_io:[5,50],bi_io_vec:[5,50],bi_it:[5,50],bi_opf:[5,50],bi_priv:[5,50],bi_sector:[5,50],bibliograf:53,bibliografi:0,big:[7,16,18,42,44,46],bin:[4,9,27,36,43,51,59],binari:[4,13,16,24,30,41,43,44,53,57],bind:[3,10,13,18,20,24,36,41,42,52,53,55],bio:[0,8,16,19,21,35,44,62,65],bio_add_pag:[5,35,50],bio_alloc:[5,13,35,50],bio_alloc_bioset:13,bio_clon:[5,50],bio_data:[5,50],bio_data_dir:[5,50],bio_endio:35,bio_for_each_seg:[5,50],bio_put:[5,35,50],bio_structur:[5,50],bio_vec:[5,50],bio_write_messag:[5,50],biodoc:[5,50],bird:[15,41],bison:3,bit:[4,5,7,9,10,12,13,14,15,17,18,19,22,24,27,28,29,30,37,38,41,42,43,45,46,47,49,50,51,52,53,57,58,59,61,62,64],bitfield:[27,59],bitmap:[0,9,20,22,24,30,37,51,53,55,57,64],bitop:[10,14,45,52],bitwis:[0,10,52],black:[25,63],blackhol:[26,54],blk_cleanup_queu:[5,50],blk_mq_alloc_tag_set:[5,50],blk_mq_end_request:[5,50],blk_mq_f_should_merg:[5,50],blk_mq_free_tag_set:[5,50],blk_mq_hw_ctx:[5,50],blk_mq_init_queu:[5,50],blk_mq_op:[5,50],blk_mq_queue_data:[5,50],blk_mq_requeue_request:[5,50],blk_mq_start_request:[5,50],blk_mq_tag:[5,50],blk_mq_tag_set:[5,50],blk_qc_t:[5,50],blk_queue_logical_block_s:[5,50],blk_rq_byte:[5,50],blk_rq_cur_byt:[5,50],blk_rq_is_passthrough:[5,50],blk_rq_po:[5,50],blk_status_t:[5,50],blk_sts_ioerr:[5,50],blk_sts_ok:[5,50],blkdev:[5,50],blkdev_get_by_path:[5,35,50],blkdev_put:[5,35,50],blob:[4,13,37,41,43],block:[0,4,7,8,9,10,12,14,15,16,19,20,22,25,28,34,35,37,40,41,43,44,45,46,47,51,52,55,61,62,63,64],bl
ock_dev:[5,50],block_devic:[1,5,22,50,64],block_device_driv:[5,50],block_device_oper:[0,35],block_iopoll_softirq:[6,48],block_read_full_pag:[10,22,52,64],block_siz:37,block_size_bit:37,block_softirq:[6,23,48,60],block_truncate_pag:[10,52],block_write_begin:[10,52],block_write_full_pag:[10,52],bmap:[10,22,52,64],bmx_misc:8,board:0,bob:[15,41],bone:38,bonu:[16,39,44],book:39,bookmark:[27,59],bool:[6,18,22,28,29,42,48,58,61,64],boot:[0,2,3,6,8,12,14,15,19,24,35,37,41,45,47,48,53,62],bootlin:[13,41,53],bootload:[4,20,43,55],bootmem:[20,55],born:[29,58],both:[5,6,7,8,9,10,12,13,14,15,16,18,19,20,21,22,24,25,27,28,29,30,34,35,39,41,42,44,45,46,47,48,50,51,52,53,55,57,58,59,61,62,63,64,65],bottom:[5,6,12,16,23,24,28,44,47,48,50,53,60,61],bound:[21,24,39,53,65],boundari:[17,24,27,49,53,59],bovet:53,bpf:[16,44],bph:[15,41],brace:[13,41],braid:13,brain:[15,41],branch:1,brd:[2,5,50],breakpoint:[13,15,23,29,41,58,60],brels:[9,51],bridg:[24,53],brief:39,briefli:[31,38],bring:[9,16,44,51],brk:[24,53],broadcast:[2,26,54],broadcom:[4,43],broken:1,brows:[12,14,41,45,47],browser:0,brw:[7,46],bsd:[18,24,42,53],bsp:[4,43],btrace:[13,41],btrf:[13,15,22,41,64],bts:[28,61],bucharest:0,buddi:[25,63],buf:[8,12,30,47,57],bufer:[5,50],buff:[14,18,42,45],buffer:[0,5,7,8,10,14,16,18,21,22,23,25,38,42,44,45,46,50,52,60,63,64,65],buffer_head:[9,10,51,52],buffer_overflow:[21,65],buffer_s:[7,46],bug:[2,14,15,19,21,23,24,29,41,45,53,58,60,62,65],bug_on:[19,62],build:[0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,24,33,35,37,38,41,42,43,44,45,46,47,48,49,50,51,52,53],build_bug_on:[18,19,42,62],buildid:[16,44],built:[2,3,8,9,13,14,15,27,41,45,51,59],builtin:[13,18,42],bullet:[13,41],bundl:[4,43],bus:[0,4,12,24,27,28,30,43,47,53,57,59,61],bus_attr:8,bus_attr_descr:8,bus_attribut:8,bus_create_fil:8,bus_for_each_dev:8,bus_for_each_drv:8,bus_regist:8,bus_remove_fil:8,bus_typ:8,bus_unregist:8,buse:[0,24,53],busi:[6,7,10,12,14,45,46,47,48,52],busybox:[13,41],bv_len:[5,50],bv_offset:[5,50],bv_page:[5,50],bvec
:[5,50],bvec_alloc:13,bvec_it:[5,50],bye:[15,41],byteord:[18,42],bytestream:[12,47],bzimag:[2,3,13],c010102d:[15,41],c0103407:[15,41],c0140da6:[15,41],c014b698:[15,41],c014b7aa:[15,41],c01708e4:[15,41],c0170981:[15,41],c0240a08:[15,41],c02e535c:[13,41],c035d083:[15,41],c035e965:[15,41],c1001072:[21,65],c106b8ae:[21,65],c10cc1de:[21,65],c10cc7ca:[21,65],c10ccaba:[21,65],c1225063:[21,65],c12250a8:[21,65],c14d7d18:[21,65],c15c3f48:[21,65],c15c97c0:[21,65],c2c:[16,44],c5799000:[15,41],c5799e24:[15,41],c5799e58:[15,41],c5799e60:[15,41],c5799f20:[15,41],c5799f78:[15,41],c5799f8c:[15,41],c57b9280:[15,41],c57cb000:[15,41],c57cbe1c:[15,41],c57cbe24:[15,41],c57cbe34:[15,41],c57cbe58:[15,41],c57cbe60:[15,41],c57cbf20:[15,41],c57cbf78:[15,41],c57cbf8c:[15,41],c5db1d38:[15,41],c665c780:[15,41],c665cb00:[15,41],c66ec780:[15,41],c716c908:[21,65],c724f448:[15,41],c7257df0:[21,65],c72b51d8:[15,41],c780ff34:[21,65],c7ece8dc:[21,65],c7ed3584:[21,65],c8816000:[21,65],c8816005:[21,65],c8816006:[21,65],c8816008:[21,65],c8816010:[21,65],c8817000:[21,65],c8819200:[21,65],c89c3010:[15,41],c89c3016:[15,41],c89c3020:[15,41],c89c3380:[15,41],c89d4000:[15,41],c89d4001:[15,41],c89d4003:[15,41],c89d4005:[15,41],c89d400c:[15,41],c89d400f:[15,41],c89d4010:[15,41],c89d4011:[15,41],c89d4013:[15,41],c89d4014:[15,41],c89d4015:[15,41],c89d4016:[15,41],c89d4017:[15,41],c89d4018:[15,41],c89d4019:[15,41],c89d401a:[15,41],c89d401b:[15,41],c89d401c:[15,41],c89d401d:[15,41],c89d401e:[15,41],c89d401f:[15,41],c89d4020:[15,41],c89d4021:[15,41],c89d4023:[15,41],c89d4024:[15,41],c89d4025:[15,41],c89d4026:[15,41],c89d4027:[15,41],c89d4300:[15,41],c99:[7,15,41,46],cach:[0,7,10,16,19,21,24,25,44,46,52,53,62,63,65],cache_alloc_debugcheck_aft:[21,65],cacheabl:[22,64],cachedtyp:[27,59],cachelin:[19,27,59,62],caddr_t:[17,49],calcul:[6,18,35,36,39,42,48],calendar:53,call:[0,4,5,6,7,8,9,10,12,13,15,16,17,18,19,21,22,23,24,25,28,32,33,34,35,36,38,40,41,42,43,44,46,47,48,49,50,51,52,53,60,61,62,63,64,65],call_timer_f0:[21,65
],call_timer_fn:[21,65],callback:[13,18,23,31,42,60],calle:[27,59],caller:[10,12,16,27,29,44,47,52,58,59],cam:53,can:[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,60,61,62,63,64,65],cancel:[6,48],cancel_delayed_work:[6,48],cancel_delayed_work_sync:[6,48],cancel_work:[6,48],cancel_work_sync:[6,48],candid:[24,53],cannot:[12,21,23,27,35,36,37,39,47,59,60,65],capabl:[4,13,29,41,43,58],capac:[5,7,35,46,50],captur:[8,12,16,18,27,29,42,44,47,58,59],card:[7,8,23,24,46,53,60],card_id:8,care:[12,13,19,27,28,47,53,59,61,62],carefulli:[5,13,15,41,50],carri:[5,28,39,50,61],cast:[9,10,18,42,51,52],cat:[4,7,10,12,13,15,17,21,27,32,33,34,41,43,46,47,49,52,59,65],catalog:[39,53],categori:[7,14,23,45,46,60],caught:[19,62],caus:[5,6,14,15,16,18,21,23,28,29,38,41,42,44,45,48,50,58,60,61,65],cbea:31,cclose:[13,41],ccnext:[13,41],cdev:[7,8,31,46],cdev_add:[7,46],cdev_del:[7,46],cdev_init:[7,46],cdrom:[7,15,41,46],central:[4,5,12,43,47,50],cert:[24,53],certain:[4,6,7,8,11,12,13,14,16,23,24,28,29,30,39,41,43,44,45,46,47,48,53,57,58,60,61],certainli:[15,28,41,61],certif:[24,53],cesati:53,cfg:[13,41],cfile:[13,41],cgroup:[24,27,53,59],chain:[6,17,21,24,48,49,53,65],challeng:37,chanc:[12,24,28,32,33,34,35,36,37,38,47,53,61],chang:[0,2,3,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,23,24,25,27,30,34,41,42,43,44,45,46,47,48,49,50,51,52,53,57,59,60,62,63],change_bit:[14,28,45,61],channel:[8,18,32,33,34,35,36,42],chapter:[5,9,10,17,36,49,50,51,52],charact:[0,5,6,8,9,10,13,15,34,37,40,41,48,50,51,52],character:[5,17,49,50],characterist:[23,27,28,59,60,61],chardev:[3,30,57],charg:[12,47],chart:[18,42],chatgpt:39,cheat:[23,39,60],check:[4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28,29,30,38,41,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,60,61,62,63,64,65],checker:[0,18,37,39,42],checkout:1,checkpatch:[21,24,32,33,34,35,36,37,38,53,65],checksum:[24,26,35,36,53,54],child:[4,4
3],children:[14,45],chip:0,chmod:[5,9,10,36,50,51,52],chocol:[10,52],choos:[7,8,9,37,39,46,51],chosen:[7,46],chr:[30,57],chrdev:13,christian:53,chunk:[19,62],cif:[9,51],circ_bbuf_pop:38,circ_bbuf_push:38,circuit:[4,43],circular:[12,15,21,47,65],circumst:[27,28,59,61],clac:[27,59],claim:[24,53],clang:[13,41],clangd:0,clarif:39,clarifi:[9,39,51],class_attr:8,class_attribut:8,class_dev_attr:8,class_device_attribut:8,class_regist:8,class_releas:8,class_unregist:8,classic:[0,12,14,18,28,42,45,47,61],claudiu:53,clean:[1,4,5,6,7,8,9,10,11,12,14,15,16,17,18,23,27,28,41,42,43,44,45,46,47,48,49,50,51,52,59,60,61],cleanup:[6,9,12,47,48,51],cleanup_modul:[7,15,41,46],clear:[10,12,15,17,18,21,23,38,41,42,47,49,52,60,65],clear_bit:[10,14,45,52],clear_buffer_uptod:[22,64],clear_inod:[10,52],clearli:[11,31],clearpagereserv:[17,49],cli:[23,28,60,61],click:1,client:[5,13,41,50],clk:[27,59],clock:[4,13,16,43,44],clock_event_devic:[20,55],clock_update_flag:[27,59],clocksourc:[20,55],clone:[0,1,4,5,24,37,43,50,53],clone_f:[27,59],clone_fil:[27,59],clone_newipc:[27,59],clone_newn:[27,59],clone_newnet:[27,59],clone_vm:[27,59],close:[0,10,13,14,16,24,26,27,28,34,39,41,44,45,52,53,54,59,61],close_disk:[5,50],closer:[6,7,27,29,46,48,58,59],closest:[9,10,51,52],clue:11,cma:[24,53],cmd:[6,7,11,15,27,31,41,46,48,59],cmd_flag:[5,50],cmd_mod:[15,41],cmd_size:[5,50],cmdline:[27,59],cmp:[29,58],cnext:[13,41],cnt:[27,59],coalesc:[25,26,54,63],coc:[13,41],coccinel:[21,65],cod:53,code:[0,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,21,22,23,27,28,29,30,31,32,33,34,35,36,37,39,42,43,44,45,46,47,48,49,50,51,52,57,58,59,60,61,62,64,65],coder:39,coher:0,colegilor:53,collabor:[0,39,40],colleagu:39,collect:[0,6,12,21,29,47,48,58,65],collector:[21,65],color:[25,63],column:[7,12,16,36,44,46,47],com1:[12,15,34,41,47],com2:34,com:[0,1,4,13,31,37,38,41,43,53],combin:[5,13,15,27,28,36,41,50,59,61],come:[5,15,16,18,19,27,41,42,44,50,59,62],comm:[6,15,16,21,27,41,44,48,59,65],command:[1,2,3,4,5,6,7,8,9,10,12,13,14,15,1
6,17,18,22,23,24,27,32,35,36,37,41,42,43,44,45,46,47,48,49,50,51,52,53,59,60,64],command_data:[27,59],comment:[5,7,8,10,14,15,18,27,41,42,45,46,50,52,59],commit:[0,1],common:[4,5,6,7,8,9,12,14,15,18,19,22,24,28,29,39,41,42,43,45,46,47,48,50,51,53,58,61,62,64],commonli:[10,14,16,18,19,28,42,44,45,52,61,62],commun:[5,7,8,9,12,18,24,28,34,36,38,42,46,47,50,51,53,61],compani:[24,53],compaq:[24,53],compar:[1,8,13,15,16,17,18,19,25,28,41,42,44,49,61,62,63],comparison:8,compat:[4,7,24,34,43,46,53],compat_ioctl:[5,50],compat_sys_open:[29,58],compet:[24,53],compil:[0,3,5,6,7,9,10,12,13,14,18,19,21,24,28,33,34,35,36,39,42,45,46,47,48,50,51,52,53,61,62,65],compiled_sourc:[13,41],complain:[16,44],complement:[6,48],complet:[0,2,4,6,7,8,10,11,12,13,14,15,16,17,18,21,22,23,25,27,28,29,34,35,39,41,42,43,44,45,46,47,48,49,52,58,59,60,61,63,64,65],complex:[10,13,15,17,22,23,24,25,27,28,29,30,38,41,49,52,53,57,58,59,60,61,63,64],compli:[32,33,34,35,36,37,38],complic:[5,10,15,28,38,41,50,52,61],compon:[1,4,7,9,10,16,18,24,30,38,39,42,43,44,46,51,52,53,57],componentel:53,compos:[0,1,2,28,61],comprehens:[13,41],compress:[4,24,43,53],compris:[22,64],compromis:[13,25,41,63],comput:[0,4,13,14,17,18,19,41,42,43,45,49,62],comunit:0,concept:[0,7,8,9,13,14,41,45,46,51],conceptu:[27,28,59,61],concern:[27,59],conclud:[17,49],conclus:[14,39,45],concret:[9,51],concurr:[0,23,27,59,60],cond:[30,57],condit:[6,7,8,14,21,22,23,24,27,28,30,39,45,46,48,53,57,59,60,61,64,65],conduct:[16,44],confid:39,config:[4,13,15,16,21,38,41,43,44,65],config_btrfs_f:[15,41],config_debug_highmem:[19,62],config_debug_info:[21,65],config_debug_kmemleak:[21,65],config_debug_lock_alloc:33,config_debug_lockdep:[21,65],config_debug_slab:[21,65],config_dynamic_debug:[15,41],config_netconsole_dynam:[15,41],config_netfilt:[18,42],config_pci_mmconfig:[19,62],config_provide_ohci1394_dma_init:[19,62],config_read_only_thp_for_f:[22,64],config_retpolin:[27,59],config_sched_debug:[27,59],config_stackprotector:[27,59],config_x86_32:[19,
29,58,62],config_x86_64:[19,62],config_x86_io_ap:[19,62],config_x86_local_ap:[19,62],config_x86_vsyscall_emul:[19,62],configdump:[27,59],configur:[1,2,3,4,5,7,8,10,12,13,15,16,18,24,27,28,30,38,41,42,43,44,46,47,50,52,53,57,59,61],confirm:[3,7,46],conflict:[1,8],conform:[24,26,53,54],confus:[8,14,45],conjunct:[10,29,52,58],connect:[0,5,7,8,12,13,15,16,23,24,26,27,36,41,44,46,47,50,53,54,59,60],connectionless:[18,42],consecut:[5,7,10,12,21,23,46,47,50,52,60,65],consequ:[5,7,8,18,27,37,42,46,50,59],consid:[5,7,8,14,17,21,23,28,29,34,35,36,39,45,46,49,50,58,60,61,65],consider:[5,50],consist:[5,7,9,18,19,35,39,42,46,50,51,62],consol:[3,4,5,6,7,8,9,10,12,13,14,15,16,17,18,35,41,42,43,44,45,46,47,48,49,50,51,52],console_loglevel:[15,41],consolid:34,constant:[15,16,18,19,41,42,44,62],constantin:53,constraint:[23,60],construct:[7,10,12,13,18,24,27,28,42,46,47,52,53,59,61],constructor:[25,63],consult:[15,32,33,34,35,36,37,39,41],consum:[4,5,12,15,27,34,38,41,43,47,50,59],consumpt:[28,61],contact:39,contain:[0,1,4,5,6,7,8,9,10,11,12,13,14,15,17,18,21,22,24,25,29,32,35,36,37,39,41,42,43,45,46,47,48,49,50,51,52,53,58,63,64,65],container:2,container_of:[6,7,8,9,10,14,27,45,46,48,51,52,59],content:[0,7,8,9,10,12,13,14,15,16,18,19,22,23,24,28,32,33,37,41,42,44,45,46,47,51,52,53,60,61,62,64],context:[0,5,6,10,12,15,16,19,21,26,34,41,44,47,48,50,52,54,62,65],context_switch:[27,59],contigu:[0,19,25,62,63],continu:[1,2,5,6,8,9,10,12,13,15,16,21,23,24,27,28,29,39,41,44,47,48,50,51,52,53,58,59,60,61,65],contradict:[14,45],contrast:[18,42],contribut:[0,16,24,39,44,53],control:[0,4,5,7,8,9,13,15,18,20,22,24,27,31,33,36,41,42,43,46,50,51,53,55,59,64],convent:0,convention:[10,52],convers:[0,5,19,50,62],convert:[6,13,17,18,22,23,41,42,48,49,60,64],cool:[24,53],cooper:[24,30,53,57],cope:[4,43],copen:[13,41],copi:[0,2,4,5,7,9,10,11,12,13,14,15,16,17,18,19,20,22,24,25,27,29,30,36,37,38,41,42,43,44,45,46,47,49,50,51,52,53,55,57,58,59,62,63,64],copy_from_us:[6,7,13,18,29,42,46,48,58],copy_thread:
[20,55],copy_to_us:[6,7,12,20,29,46,47,48,55,58],copyright:[2,30,57],corbet:53,core:[2,3,4,13,16,23,24,27,28,41,43,44,53,59,60,61],corelat:53,coroutin:[27,59],correct:[7,9,10,18,23,24,27,29,35,39,42,46,51,52,53,58,59,60],correctli:[9,39,51],correl:8,correspond:[4,5,7,8,9,10,12,13,15,18,23,36,41,42,43,46,47,50,51,52,60],corrobor:[14,45],corrupt:[11,14,21,29,35,45,58,65],cortex:[4,43],cost:[24,29,53,58],costli:[22,29,58,64],could:[5,6,10,14,17,21,27,28,29,36,45,48,49,50,52,58,59,61,65],couldn:[16,44],count:[5,7,8,9,12,14,16,17,38,44,45,46,47,49,50,51],counter:[6,8,10,12,14,15,16,17,21,22,25,28,41,44,45,47,48,49,52,61,63,64,65],counterpart:[14,45],coupl:[27,59],cours:[0,13,16,23,24,32,33,34,35,36,39,40,41,44,60],course_grad:39,cover:[10,28,35,39,52,61],cpl:[30,57],cppcheck:[32,33,34,35,36,37,38],cprev:[13,41],cpu0:[4,12,21,43,47,65],cpu1:[21,65],cpu:[0,5,6,12,13,14,15,16,17,19,21,23,25,27,29,30,34,37,41,44,45,47,48,49,50,57,58,59,60,62,63,65],cpu_startup_entri:[2,21,65],cpufreq:[4,43],cpuidle_idle_cal:2,cpuinfo:[4,43],cpuinfo_max_freq:[4,43],cpuinfo_min_freq:[4,43],cpus_ptr:[27,59],cr0:[21,30,57,65],cr0_et:38,cr0_mp:38,cr0_pe:38,cr2:[21,65],cr3:[19,21,30,57,62,65],cr4:[21,30,57,65],craft:[29,58],crash:[5,21,24,29,50,53,58,65],crc32:[15,35,41],crc:35,creat:[0,3,4,6,7,8,12,13,14,15,16,17,18,19,24,25,26,27,28,30,31,32,33,36,37,38,41,42,43,44,45,46,47,48,49,53,54,57,59,61,62,63],create_block_devic:[5,50],create_net:3,create_singlethread_workqueu:[6,48],create_workqueu:[6,48],creation:[0,5,8,9,10,50,51,52],cri:[24,53],critic:[4,12,14,15,23,24,27,28,41,43,45,47,53,59,60,61],crop:[10,52],cross:[0,4,28,43,61],cross_compil:[4,43],crosstool:[4,43],crush:[21,65],crush_it:[21,65],crusher:[21,65],crw:[7,46],cryptic:[15,41],crypto:[3,24,53],crypto_engin:3,cryptographi:[24,53],cscope:[0,15],cscopequickfix:[13,41],cst:[13,41],csto:[13,41],csum:36,csumerr:36,csverb:[13,41],ctag:[13,41],ctl:38,ctrl:[3,7,12,13,15,16,18,41,42,44,46,47],ctx:[10,52],cur:[0,16,44],curr:[27,28,59,61],current:
[0,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,22,23,24,28,29,30,36,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,57,58,60,61,62,64],current_jiffi:[6,48],current_stack_point:[27,59],current_task:[29,58],current_thread_info:[27,59],current_tim:[9,51],cursor:[9,13,14,41,45,51],cursului:0,cursuri:0,custom:[0,24,53],cwd:[27,59],cword:[13,41],cwr:[18,42],cycl:[19,21,28,61,62,65],d_add:[10,52],d_fsdata:[10,52],d_inod:[10,22,52,64],d_instanti:[10,52],d_make_root:[9,10,51,52],d_name:[10,52],d_op:[10,52],d_parent:[10,52],d_sb:[10,52],d_state:[22,64],daddr:[18,42],daemon:[7,15,41,46],dai:[16,24,39,44,53],daniel:53,dar:53,dashboard:13,data:[0,6,8,9,10,13,14,16,18,19,23,24,25,26,27,29,30,32,33,34,35,36,37,38,41,42,44,45,48,51,52,53,54,57,58,59,60,62,63],data_block:[10,52],data_len:[18,26,42,54],data_reg:[12,47],databas:[0,13,15,41],datagram:36,datagram_pol:36,datasheet:34,dataw:[15,41],date:[1,31,39],davem:[24,53],david:[24,53],dax:[22,64],ddvlad:[13,41],deaconescu:53,deactiv:[6,10,12,15,28,30,34,39,41,47,48,52,57,61],deadlin:[0,32,33,34,35,36,37,38],deadlock:[6,12,21,27,28,47,48,59,61,65],deal:[9,16,22,24,44,51,53,64],dealloc:[5,9,18,22,25,33,42,50,51,63,64],dealt:[22,64],debian:3,debug:[0,2,14,16,23,27,28,40,44,45,53,59,60,61],debug_pagealloc:[0,15,41],debugf:[15,21,41,65],debugg:[0,13,21,27,59,65],dec:[24,53],decapsul:[26,54],decid:[14,21,27,29,45,58,59,65],decim:[13,41],decis:[18,42],declar:[4,5,6,8,13,14,41,43,45,48,50],declare_delayed_work:[6,48],declare_tasklet:[6,48],declare_tasklet_dis:[6,48],declare_wait_queue_head:[6,7,46,48],declare_work:[6,48],decod:[0,12,15,41,47],decompos:[28,61],decompress:[24,53],decreas:[16,32,33,34,35,36,37,38,44],decrement:[6,8,10,17,22,28,48,49,52,61,64],dedic:[10,17,19,23,24,28,32,33,34,35,36,37,49,52,53,60,61,62],deduct:39,deep:[18,42],deepen:[13,36,39,41],deeper:[27,59],def:[27,59],default_idl:[2,21,65],default_idle_cal:[21,65],default_wake_funct:[27,59],defconfig:[4,13,41,43],defer:[0,12,19,23,34,40,47,60,62],deferr:[0,6,7,14,45,46,48],deferr
ed_work:[6,48],defin:[2,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,20,23,27,28,29,30,34,35,36,38,41,42,43,44,45,46,47,48,49,50,51,52,55,57,58,59,60,61,62],define_mutex:[14,45],define_per_cpu_page_align:[19,62],define_rwlock:[14,45],define_spinlock:[14,21,45,65],define_tim:[21,65],definit:[5,6,7,8,9,10,12,13,14,15,24,33,34,35,36,41,45,46,47,48,50,51,52,53],deiniti:8,del:0,del_gendisk:[5,50],del_pid:[14,45],del_tim:[6,48],del_timer_sync:[6,48],dela:32,delai:[6,12,28,39,47,48,61],delayed_work:[6,48],delet:[0,6,7,8,9,14,15,17,28,32,36,41,45,46,48,49,51,61],delete_block_devic:[5,50],delf:32,deliv:[30,57],delloc:[21,65],demand:[14,23,24,25,29,45,53,58,60,63],demo:0,demonstr:[4,27,29,43,58,59],den:53,denomin:[9,51],dentri:[0,24,53],dentry_oper:[9,10,51,52],depart:0,depend:[0,1,4,5,6,7,8,12,13,14,21,23,24,28,29,34,41,43,45,46,47,48,50,53,58,60,61,65],depict:[22,64],deploi:3,deploy:[7,10,46,52],depmod:3,deprec:[20,28,55,61],depth:[0,11,13,22,23,35,41,60,64],dequeu:[23,27,59,60],derefer:[21,65],dereferenc:[7,46],deregist:[5,8,50],deregistr:[7,9,46,51],descr:8,describ:[4,5,6,7,8,9,10,12,14,15,17,18,23,24,29,32,33,34,35,36,37,38,41,42,43,45,46,47,48,49,50,51,52,53,58,60],descript:[1,4,5,14,15,16,17,18,24,33,41,42,43,44,45,49,50,53],descriptor:[0,5,7,10,17,22,24,25,27,29,30,38,46,49,50,52,53,57,58,59,63,64],design:[3,7,12,14,19,21,23,24,28,31,45,46,47,53,60,61,62,65],desir:[7,12,15,34,41,46,47],desktop:[26,54],despr:0,dest:[18,42],destin:[0,15,16,22,26,36,39,41,44,54,64],destroi:[0,6,8,10,48,52],destroy_inod:[9,10,51,52],destroy_list:[14,45],destroy_workqueu:[6,48],destruct:[6,48],destructor:[18,25,26,42,54,63],destul:53,detail:[0,2,4,5,6,7,8,9,10,12,13,14,15,16,17,18,27,28,29,30,32,41,42,43,44,45,46,47,48,49,50,51,52,57,58,59,61],detect:[8,9,13,14,15,16,21,23,28,29,34,41,44,45,51,58,60,61,65],detector:[21,65],determin:[0,2,5,7,8,10,12,13,15,17,19,20,21,22,23,29,33,41,46,47,49,50,52,55,58,60,62,64,65],dev:[2,3,4,5,6,7,8,9,10,12,13,15,16,17,18,26,27,31,33,35,38,41,42,43,44,46,47,48,4
9,50,51,52,54,59],dev_add_pack:36,dev_attr:8,dev_attr_:8,dev_dbg:[15,41],dev_id:[8,12,47],dev_kobj:8,dev_nam:[8,9,12,47,51],dev_releas:8,dev_root:8,dev_scratch:[18,42],dev_set_drvdata:8,dev_set_nam:8,dev_t:[7,8,9,46,51],dev_uev:8,develop:[0,3,5,8,9,10,12,13,15,18,21,22,29,37,40,41,42,47,50,51,52,58,64,65],devic:[0,3,6,9,10,13,14,15,16,18,19,20,22,23,27,30,33,34,35,37,40,41,42,44,45,48,51,52,55,57,59,60,62,64],device_address:38,device_attr:8,device_attr_ro:8,device_attribut:8,device_config:38,device_cr:8,device_create_fil:8,device_destroi:8,device_driv:[7,8,46],device_model:8,device_native_endian:[30,57],device_priv:8,device_readi:38,device_regist:8,device_remove_fil:8,device_reset:38,device_statu:38,device_t:38,device_table_t:38,device_unregist:8,devnam:8,devtmpf:13,diagram:[4,9,10,18,19,21,22,23,27,42,43,51,52,59,60,62,64,65],dialout:[7,46],dictat:39,did:[6,7,8,9,10,16,21,23,29,39,44,46,48,51,52,58,60,65],didact:[17,49],didn:[16,44],diff:[16,27,44,59],differ:[4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,22,23,24,26,27,28,30,34,39,41,42,43,44,45,46,47,48,49,50,51,52,53,54,57,59,60,61,62,64],differenti:[4,5,6,12,16,43,44,47,48,50],difficult:[13,14,23,24,28,37,41,45,53,60,61],dificil:53,dificultatea:53,digit:[4,43],dimens:[10,12,47,52],din:53,dinca:53,dir:[5,7,10,13,41,46,50,52],dir_emit:[10,52],direct:[5,7,8,9,10,15,16,17,30,32,33,34,35,36,37,38,39,41,44,46,49,50,51,52,53,57],direct_access:[5,50],direct_data_block:37,direct_io:[22,64],directli:[0,5,6,7,8,9,11,12,13,14,15,17,19,22,24,25,27,29,30,36,37,41,45,46,47,48,49,50,51,53,57,58,59,62,63,64],director:[10,52],directori:[0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18,19,24,27,28,32,33,34,35,36,37,41,42,43,44,45,46,47,48,49,50,51,53,59,61,62],dirti:[9,10,19,21,22,28,51,52,61,62,64,65],disabl:[0,4,6,12,19,20,21,23,24,27,29,34,43,47,48,53,55,58,59,60,62,65],disable_irq:[12,47],disable_irq_nosync:[12,47],disadvantag:[13,15,19,24,41,53,62],disapear:8,disass:[13,41],disassembl:[13,15,41],disc:[10,52],discard:[4,5,6,7,8,9,10,12,14
,15,16,17,18,26,41,42,43,44,45,46,47,48,49,50,51,52,54],disciplina:53,disconnect:36,discourag:[14,45],discov:[4,8,13,43],discover:[4,43],discoveri:[24,53],discret:[18,42],discuss:[5,8,10,12,17,28,38,39,47,49,50,52,61],discutii:53,dishonest:39,disk1:[3,13],disk2:[3,13],disk:[0,7,8,9,10,15,16,22,24,35,37,41,44,46,51,52,53,64],disk_nam:[5,50],dismiss:[24,53],dispatch:[0,29,33,58],dispersi:53,displac:[7,46],displai:[0,3,5,6,7,9,10,12,13,14,15,16,33,41,44,45,46,47,48,50,51,52],dispos:33,distinguish:[12,14,45,47],distribut:[5,23,24,25,28,30,50,53,57,60,61,63],distro:[2,3,4,13,43],div:[30,57],dive:[18,42],divers:[24,53],divid:[17,23,24,39,49,53,60],divis:[7,23,39,46,60],divisor:37,dkogan:[13,41],dma1:[12,47],dma2:[12,47],dma32:[25,63],dma:[8,12,17,19,25,30,47,49,57,62,63],dmap:[22,37,64],dmap_block:37,dmesg:[6,12,14,15,18,27,41,42,45,47,48,59],dnsmasq:3,do_exit:[6,48],do_fast_syscall_32:[16,44],do_fork:[13,41],do_idl:[2,21,65],do_init_modul:[21,65],do_int80_syscall_32:[21,29,58,65],do_one_initcal:[21,65],do_oop:[21,65],do_pan:[21,65],do_softirq:[23,28,60,61],do_softirq_own_stack:[21,65],do_syscall_32_irqs_on:[29,58],do_sysenter_32:[16,44],doc:[0,1,4,38,43],docker:[0,1,27,36,59],dockerfil:0,document:[0,2,5,6,7,9,10,15,18,24,32,33,34,35,36,37,38,42,46,48,50,51,52,53],doe:[5,6,7,8,9,10,12,13,14,15,16,17,23,24,27,29,31,32,33,34,35,36,37,38,39,41,44,45,46,47,48,49,50,51,52,53,58,59,60],doesn:[2,4,5,13,16,22,41,43,44,50,64],doff:[18,42],doing:[6,12,13,16,23,24,28,44,47,48,53,60,61],don:[6,8,13,15,16,19,22,27,28,29,30,37,38,41,44,48,57,58,59,61,62,64],donald:[14,45],done:[1,2,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,23,24,27,28,29,32,33,34,35,36,37,39,41,42,44,45,46,47,48,49,50,51,52,53,58,59,60,61,62],dot:37,doubl:[5,14,19,45,50,62],down:[7,14,15,18,24,28,33,41,42,45,46,53,61],down_interrupt:33,download:[2,3,4,13,15,16,39,41,43,44],downscript:3,downstream:[24,53],dpl:[19,23,60,62],drastic:[14,45],draw:39,drive:[3,5,7,9,13,46,50,51],driver:[0,3,4,6,11,13,14,15,16,18,22,23,26,27,30
,31,37,38,39,40,41,42,43,44,45,48,54,57,59,60,64],driver_ack:38,driver_attr:8,driver_attribut:8,driver_create_fil:8,driver_data:[5,8,50],driver_ok:38,driver_priv:8,driver_regist:8,driver_remove_fil:8,driver_reset:38,driver_statu:38,driver_unregist:8,drm_mxsfb:[4,43],drop:[23,26,30,54,57,60],drop_inod:[9,51],drv:8,drv_attr:8,dsr:[21,65],dst:36,dt_dir:[10,52],dt_reg:[10,52],dt_unknown:[10,52],dtb:[4,43],dts:[4,28,43,61],dtsi:[4,43],due:[14,16,18,19,23,28,39,42,44,45,60,61,62],dummi:[21,31,65],dummy_exit:[15,41],dummy_init:[15,41],dump:[13,15,16,21,27,41,44,59,65],dump_stack:[13,21,41,65],dup2:[29,58],dup:[22,64],duplic:[22,24,53,64],durat:[23,60],dure:[0,5,12,16,19,23,24,27,28,29,38,39,41,44,47,50,53,58,59,60,61,62],dvd:[9,51],dwarv:[27,59],dynam:[0,5,6,7,9,13,14,16,21,23,24,44,45,46,48,50,51,53,60,65],dynamic_debug:[15,41],dyndbg:0,dyndbg_init:15,dzone:[22,37,64],dzone_block:37,e100:3,e300:[30,57],eabi5:[4,43],eabi:[4,43],each:[0,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,23,24,25,26,27,28,29,30,32,33,34,35,39,41,42,43,45,46,47,48,49,50,51,52,53,54,57,58,59,60,61,62,63,64,65],eagain:[22,64],earli:[15,41],earlier:[5,15,29,41,50,58],earlycon:[4,43],eas:[13,41],easi:[6,10,13,16,19,29,41,44,48,52,58,62],easier:[9,13,23,24,29,41,51,53,58,60],easiest:[15,41],easili:[9,13,15,17,21,25,41,49,51,63,65],eax:[13,15,19,21,27,29,41,58,59,62,65],eb002bf8:[21,65],ebp:[15,21,27,29,41,58,59,65],ebusi:[5,7,12,14,45,46,47,50],ebx:[15,21,27,29,41,58,59,65],ece:[18,42],echipa:0,echo:[5,7,8,10,11,12,15,21,32,41,46,47,50,52,65],ecp:8,ecx:[13,15,21,29,41,58,65],ed612000:[21,65],edg:[12,47],edi:[15,21,27,29,41,58,59,65],edit:[1,5,9,10,11,13,15,17,41,49,50,51,52,53],editor:[13,41],edx:[13,15,21,27,29,41,58,59,65],eest:2,efault:[7,29,46,58],efect:[13,41],effect:[5,7,10,13,15,16,19,22,23,25,27,28,41,44,46,50,52,59,60,61,62,63,64],effici:[13,15,17,22,24,27,28,30,41,49,53,57,59,61,64],effort:[27,59],eflag:[15,21,23,30,41,57,60,65],eg106:[1,10,13,39,41,52],eg306:39,egrep:[12,47],ehci:[15,41],eigh
:[21,65],einval:[8,14,16,18,42,44,45],eio:[10,17,49,52],eip:[15,21,23,27,41,59,60,65],either:[6,10,11,12,14,16,18,19,21,22,23,24,25,27,29,34,37,42,44,45,47,48,52,53,58,59,60,62,63,64,65],elcdif:[4,43],electron:[4,43],elem:[14,45],element:[5,6,14,18,21,28,32,33,34,36,37,38,42,45,48,50,61,65],elf32:[15,41],elf:[0,4,13,29,36,41,43,58],elif:[15,18,41,42],elimin:[6,13,17,28,39,41,48,49,61],elixir:[13,41,53],els:[1,6,7,8,10,12,13,14,15,18,19,27,29,30,39,41,42,45,46,47,48,52,57,58,59,62],elsewher:[7,39,46],emac:[13,41],emb:[17,49],embed:[4,10,15,18,20,24,27,30,41,42,43,52,53,55,57,59],emerg:[6,48],emploi:[25,28,61,63],empti:[4,10,12,19,22,25,27,34,43,47,52,59,62,63,64],emul:[0,3,4,43],enabl:[3,6,12,13,15,17,18,19,21,23,24,28,29,33,37,38,41,42,47,48,49,53,58,60,61,62,65],enable_irq:[12,47],encapsul:[18,42],encod:[5,21,26,30,50,54,57,65],encompass:[17,49],encount:[5,15,24,28,39,41,50,53,61],encourag:[4,5,6,7,8,9,10,12,14,15,16,17,18,31,39,41,42,43,44,45,46,47,48,49,50,51,52],encrypt:[24,53],end:[5,6,7,8,9,10,12,13,15,16,17,18,19,21,22,23,24,26,28,29,32,33,35,37,39,41,42,44,46,47,48,49,50,51,52,53,54,58,60,61,62,64,65],endian:[18,30,42,57],endif:[13,15,18,19,22,27,29,41,42,58,59,62,64],endpoint:[18,42],endproc:[29,58],endwhil:[13,41],enforc:[24,53],engin:[0,28,61],enhancednf:[15,41],enhancen:[24,53],enobuf:[18,42],enodev:[8,12,47],enomem:[5,9,10,14,45,50,51,52],enospc:[10,52],enotempti:[10,52],enotti:[7,46],enough:[7,10,12,14,23,24,25,29,33,39,45,46,47,52,53,58,60,63],enqueu:[27,59],ensur:[7,8,10,14,27,35,39,45,46,52,59],enter:[2,3,10,11,12,13,14,15,22,28,41,45,47,52,61,64],enter_from_user_mod:[29,58],enter_lazy_tlb:[27,59],entir:[4,5,8,9,12,15,17,34,35,36,39,41,43,47,49,50,51],entiti:[7,8,9,10,14,28,45,46,51,52,61],entri:[0,5,7,8,9,12,13,14,17,19,20,21,22,23,25,26,27,28,29,33,36,37,38,45,46,47,49,50,51,54,55,58,59,60,61,62,63,64,65],entry_handl:33,entry_int80_32:[21,23,60,65],entry_sysenter_32:[23,60],enumer:[20,30,55,57],env:[8,15,41],environ:[2,8,9,13,27,36,39,41,51,59],ep
t:[30,57],equal:[10,14,18,26,39,42,45,52,54],equat:[15,41],equival:[5,9,14,15,18,29,39,41,42,45,50,51,58],eras:[22,64],erestartsi:[27,59],ernno:[14,45],err:[5,7,8,9,12,18,42,46,47,50,51],err_ptr:[10,52],errno:[3,7,14,45,46],erron:[5,28,50,61],error:[0,5,6,7,8,9,10,11,12,13,16,18,19,21,22,23,24,30,35,36,39,42,44,46,47,48,50,51,52,53,57,60,62,64,65],error_abort:[30,57],error_remove_pag:[22,64],errseq_t:[22,64],esi:[15,21,27,29,41,58,59,65],esp:[13,15,21,23,27,41,59,60,65],especi:[10,13,24,28,41,52,53,61],espfix:[19,62],essenti:[3,6,10,13,18,21,24,31,41,42,48,52,53,65],establish:[2,18,42],estim:51,etc:[4,5,6,7,8,9,10,11,12,13,14,15,18,22,23,24,25,26,27,30,38,39,41,42,43,45,46,47,48,50,51,52,53,54,57,59,60,63,64],eth0:[2,15,26,41,54],ether:[2,12,47],ethernet:[4,24,26,30,36,43,53,54,57],ethi:8,evalu:[4,13,16,26,27,32,33,34,35,36,37,41,43,44,54,59],even:[5,6,8,10,13,14,15,17,18,21,22,23,24,27,28,30,39,41,42,45,48,49,50,52,53,57,59,60,61,64,65],evenli:31,event:[6,7,8,12,16,21,23,27,30,44,46,47,48,57,59,60,65],events_list:[6,48],events_lock:[6,48],eventu:[9,13,15,23,41,51,60],everi:[2,5,6,7,8,13,15,18,19,22,24,25,26,27,41,42,46,48,50,53,54,59,62,63,64],everyon:39,everyth:[9,13,17,27,34,49,51,59],evict_inod:[10,22,52,64],evk:[4,43],evlist:[16,44],evolut:[14,45],ewouldblock:[7,46],ex_fixup_addr:[29,58],ex_fixup_handl:[29,58],ex_handler_default:[29,58],ex_handler_t:[29,58],exact:[21,29,58,65],exactli:[15,27,41,59],exam:39,examen:53,examin:[13,23,27,29,41,58,59,60],exampl:[0,1,4,5,6,7,8,9,10,11,12,13,14,16,17,18,19,21,22,24,27,28,29,30,33,34,35,36,38,39,42,43,44,45,46,47,48,49,50,51,52,53,57,58,59,61,62,64,65],exce:[5,14,39,45,50],exceed:[7,46],except:[0,9,12,14,21,24,25,29,30,32,33,34,35,36,37,38,39,45,47,51,53,57,58,63,65],exception_t:[29,58],exception_table_entri:[29,58],excess:[28,61],exchang:[8,23,28,60,61],exclus:[5,7,10,14,27,28,36,45,46,50,52,59,61],exe:[27,59],exec:[24,53],execut:[0,4,5,6,7,8,9,10,12,13,15,16,18,19,21,23,27,28,29,33,36,38,41,42,43,44,46,47,48,50,51,52,
58,59,60,61,62,65],execv:[16,44],exemplifi:[28,61],exemplificar:53,exercic:[13,41],exercis:[0,1,11,39],exhaust:[14,45],exist:[5,6,8,9,10,11,13,14,15,21,22,24,27,28,30,36,39,41,45,48,50,51,52,53,57,59,61,64,65],exit:[0,3,5,6,7,12,14,17,18,21,27,28,31,38,42,45,46,47,48,49,50,59,61,65],exit_reason:38,expand:[13,18,33,41,42],expect:[2,8,12,15,16,17,19,23,38,41,44,47,49,60,62],expens:[28,61],experi:[16,44],experienc:[16,44],expert:[24,53],expir:[6,14,27,28,39,45,48,59,61],expiri:[6,48],explain:[14,18,28,29,31,42,45,58,61],explan:[14,39,45],explicit:[27,59],explicitli:[7,27,28,39,46,59,61],explor:[0,13],exponenti:[18,42],export_per_cpu_symbol_gpl:[19,62],export_symbol:[8,14,19,45,62],expos:[9,13,14,30,33,38,45,51,57],express:[13,27,41,59],ext2:[9,51],ext3:[9,51],ext4:[2,3,4,9,10,22,43,51,52,64],ext4_buffered_write_it:[16,44],ext4_file_write_it:[16,44],ext4_inode_info:[10,52],ext4_statf:[9,51],extend:[0,15,18,19,24,25,38,41,42,53,62,63],extens:[13,15,24,30,41,53,57],extent:2,extern:[12,14,18,23,25,30,39,42,45,47,57,60,63],extra:[0,2,5,10,17,19,23,27,49,50,52,53,59,60,62],extra_cflag:[15,41],extract:[5,10,14,15,18,41,42,45,50,52],extracurricular:51,eye:[4,43],f1de81a2:[21,65],f1de8265:[21,65],f1de82d6:[21,65],f1de82d:[21,65],f27f1138:[21,65],f27f12aa:[21,65],f27f12ef:[21,65],f4c6816a:[21,65],f4c6827f:[21,65],f4c682d6:[21,65],f4c682de:[21,65],f5118b873294:31,f_flag:[7,18,42,46],f_mode:[7,46],f_op:[7,10,46,52],f_po:[7,46],face:[16,39,44],facebook:53,facil:[6,14,17,45,48,49],facilit:[11,13,15,41],fact:[10,13,14,16,28,29,34,41,44,45,52,58,61],factor:51,faculti:[0,37],fail:[1,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,27,28,39,41,42,43,44,45,46,47,48,49,50,51,52,59,61,62],failur:[5,12,14,45,47,50],fair:[24,28,53,61],fairli:[23,60],fake:[15,30,57],fall:[7,14,28,39,45,46,61],fals:[7,27,28,46,59,61],famili:[0,18,36,42],familiar:[4,9,10,13,14,15,16,18,19,32,33,42,43,44,45,51,52,62],fane:53,faq:[35,36,37],far:8,fashion:[12,23,30,47,57,60],fast:[12,13,19,27,28,41,47,59,61,62],fastcal:[6
,48],faster:[1,3,12,15,16,25,29,41,44,47,58,63],fat:[9,10,22,51,52,64],fatal:[15,21,41,65],fault:[0,15,16,19,21,23,29,30,39,41,44,57,58,60,62,65],faulti:[23,60],faust:[15,41],favor:[28,61],fdinfo:[27,59],fdtdump:[27,59],fe12:2,fe80:2,featur:[2,8,9,12,13,14,15,17,19,24,27,28,41,45,47,49,51,53,59,61,62],feb:[15,41],feedback:[1,16,39,44,53],feel:[4,39,43],fer:[16,44],fetch:[1,7,19,26,28,46,54,61,62],fetch_head:1,few:[0,13,15,19,21,22,23,24,27,28,29,38,41,53,58,59,60,61,62,64,65],ffffe000:[27,59],fffff000:[27,59],ffffffda:[21,65],fffffffc:[15,41],ffree:37,fiber:[27,59],fibmap:[22,64],fidel:[30,57],fiecar:53,field:[0,5,6,7,8,9,10,12,13,14,15,17,19,23,25,26,27,28,29,33,35,36,41,45,46,47,48,49,50,51,52,54,58,59,60,61,62,63],fifo8:[30,57],fifo:[9,10,28,30,51,52,57,61],figur:[7,14,21,23,24,28,45,46,53,60,61,65],file:[0,1,2,3,4,5,6,8,12,13,14,15,16,17,18,21,24,25,29,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,47,48,49,50,53,58,63,65],file_oper:[0,5,8,9,10,17,31,49,50,51,52],file_system:13,file_system_typ:[9,13,51],filenam:[9,10,27,51,52,59],fileread:[13,41],filesystem:[0,4,10,11,13,27,40,41,43,52,59],fileystem:[22,24,53,64],fill:[5,7,9,10,11,12,16,17,18,21,36,42,44,46,47,49,50,51,52,65],fill_return_buff:[27,59],fill_sup:[0,10,22,52,64],filp:[17,22,49,64],filsystem:[22,64],filter:[0,15,16,24,26,41,44,53,54],fin:[18,42],final_d:[10,52],find:[2,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,22,23,29,32,33,34,35,36,37,39,41,42,43,44,45,46,47,48,49,50,51,52,58,60,62,64,65],find_first_bit:[10,52],find_first_zero_bit:[9,10,51,52],fine:[9,24,51,53],fingerprint:2,finish:[5,6,7,9,10,12,13,16,23,28,38,44,46,47,48,50,51,52,60,61],finish_task_switch:[27,59],finish_wait:[27,59],firewal:[18,42],firmwar:[8,24,53],first:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,24,25,28,29,31,32,33,35,36,37,38,41,42,43,44,45,46,47,48,49,50,51,52,53,55,58,60,61,62,63,65],first_minor:[5,50],fit:[30,57],fix:[0,1,5,6,7,14,18,23,24,25,42,45,46,48,50,53,60,63],fix_apic_bas:[19,62],fix_dbgp_bas:[19,62],f
ix_earlycon_mem_bas:[19,62],fix_hol:[19,62],fix_io_apic_base_0:[19,62],fix_io_apic_base_end:[19,62],fix_kmap_begin:[19,62],fix_kmap_end:[19,62],fix_ohci1394_bas:[19,62],fix_pcie_mcfg:[19,62],fix_to_virt:[19,62],fixaddr_start:[19,62],fixaddr_top:[19,62],fixed_address:[19,62],fixmap:[19,62],fixup:[29,58],fixup_except:[29,58],fl_owner_t:[7,46],flag:[5,6,7,9,10,12,14,15,17,18,20,22,23,25,27,28,36,38,41,42,45,46,47,48,49,50,51,52,55,59,60,61,63,64],flash:[22,64],flex:3,flexibl:[5,19,26,50,54,62],flood:0,floppi:[12,47],flow:[15,23,26,29,36,41,54,58,60],flush:[5,7,19,20,25,28,30,46,50,55,57,61,62,63],flush_scheduled_work:[6,48],flush_workqueu:[6,48],fly:[13,41],fmode_excl:[5,50],fmode_read:[5,7,46,50],fmode_t:[5,50],fmode_writ:[5,7,46,50],fmt:[14,45],focu:[0,11,16,44],fold:[28,61],folder:[3,15,16,24,38,41,44,53],follow:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,26,27,28,29,31,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,54,58,59,60,61,62,64,65],follow_link:[22,64],foot:[25,63],fop:[5,7,46,50],for_each:15,for_each_process:15,forc:[1,14,15,27,30,41,45,57,59],forcefulli:[14,24,45,53],forest:[10,52],forev:[2,8],forget:[6,7,15,38,46,48],forgotten:[10,52],fork:[0,24,27,53,59],form:[6,7,9,12,15,16,17,19,27,36,39,41,44,46,47,48,49,51,59,62],formal:39,format:[3,4,5,9,10,12,13,15,18,24,27,29,32,36,37,41,42,43,47,50,51,52,53,58,59],format_str:[27,59],formerli:8,formula:[6,39,48],forum:39,forward:[0,13,18,24,30,42,53,57],found:[1,4,6,7,8,9,10,12,13,14,15,16,17,18,25,27,32,33,34,35,36,37,38,41,42,43,44,45,46,47,48,49,51,52,59,63],foundat:[2,18,30,42,57],four:[12,14,23,45,47,60],fpu:[12,47],fput_light:[18,42],fput_need:[18,42],frag_off:[18,42],fragment:[19,25,62,63],frame:[17,19,26,49,54,62],framebuff:[24,53],framework:[13,16,18,24,26,41,42,44,53,54],frank:53,free:[0,2,6,9,10,13,14,17,18,19,20,21,22,24,25,28,30,34,35,37,39,41,42,45,48,49,51,52,53,55,57,61,62,63,64,65],free_irq:[12,47],free_mmap_pag:[17,49],free_resourc:[28,61],freebsd:[13,41],freed:[2
1,25,28,33,61,63,65],freeli:[28,39,61],freepag:[22,64],freestand:[13,41],freez:[15,41],freezer:[15,41],frequenc:[4,23,25,28,43,60,61,63],frequent:[6,7,16,24,25,27,28,44,46,48,53,59,61,63],friendli:13,from:[0,1,2,4,6,7,8,9,10,11,13,14,15,16,17,18,19,21,24,25,26,27,28,30,32,33,34,35,36,37,38,39,41,42,43,44,45,46,48,49,51,52,53,54,57,59,61,62,63,65],from_tti:[27,59],frontend:[13,41],frozen:39,fs_flag:[9,51],fs_requires_dev:[9,51],fs_super:[9,51],fs_type:[9,51],fs_userns_mount:[9,51],fsdata:[10,22,52,64],fsi:[9,51],fsl_asrc:[4,43],fsname:[10,52],ftdi:[15,41],ftp:[18,42],ftrace:[16,21,44,65],fulfil:[5,50],full:[2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,23,25,26,27,28,30,34,36,41,42,43,44,45,46,47,48,49,50,51,52,54,57,59,60,61,62,63],fulli:[5,12,13,19,24,39,41,47,50,53,62],fun:32,func:[15,27,41,59],funcion:[10,52],further:[0,8,19,24,39,53,62],furthermor:[15,41],futur:[5,6,28,48,50,61],g_malloc0:[30,57],ga4a6b62:[21,65],gain:[12,13,33,36,39,41,47],game:[4,43],garbag:[21,65],gate:[19,23,60,62],gather:[5,16,26,44,50,54],gcc:[3,4,7,43,46],gdb:[0,2,15,23,29,58,60],gdbinit:[13,27,59],gdt:[19,23,60,62],gdt_entry_apmbios_bas:[19,62],gdt_entry_default_user32_c:[19,62],gdt_entry_default_user_c:[19,62],gdt_entry_default_user_d:[19,62],gdt_entry_espfix_ss:[19,62],gdt_entry_init:[19,62],gdt_entry_kernel32_c:[19,62],gdt_entry_kernel_c:[19,62],gdt_entry_kernel_d:[19,62],gdt_entry_percpu:[19,62],gdt_entry_pnpbios_cs16:[19,62],gdt_entry_pnpbios_cs32:[19,62],gdt_entry_pnpbios_d:[19,62],gdt_entry_pnpbios_ts1:[19,62],gdt_entry_pnpbios_ts2:[19,62],gdt_page:[19,62],gdt_stack_canary_init:[19,62],gdtr:[19,62],gen_compile_command:[13,41],gendisk:[0,35],gener:[0,1,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,29,30,32,33,34,35,36,37,38,40,41,42,43,44,45,46,47,48,49,50,52,53,55,57,58,59,60,61,62,63,64,65],generic_block_bmap:[10,52],generic_delete_inod:[9,51],generic_file_llseek:[10,52],generic_file_mmap:[10,52],generic_file_read_it:[10,22,52,64],generic_file_write_it:[10,52],gen
eric_read_dir:[10,52],generic_ro_fop:[7,46],generic_write_end:[10,52],genhd:[5,50],genpd:[27,59],geometr:[25,63],gestiunea:53,get:[0,1,3,4,5,6,7,8,9,12,14,15,16,18,24,27,28,29,31,32,33,34,35,36,37,38,42,43,44,45,46,47,48,50,51,53,58,59,61],get_ascii:[12,47],get_block:[22,64],get_block_t:[22,64],get_char:[12,47],get_idx:[12,47],get_next_ev:[6,48],get_proc:[6,48],get_task_mm:[17,49],get_typ:[27,59],get_us:[7,29,46,58],getattr:[10,52],getcwd:[13,41],getgeo:[5,50],getitng:41,getnam:[18,36,42],getpid:[29,58],getsockopt:36,gettimeofdai:[29,58],gfp:[25,63],gfp_atom:[14,45],gfp_kernel:[9,14,18,21,42,45,51,65],gfp_mask:[22,25,63,64],gfp_noio:[5,50],gfp_t:[22,64],ghioc:53,gid:[9,10,27,37,51,52,59],gid_t:37,git:[1,3,4,5,6,7,8,9,10,12,13,14,15,16,17,18,24,27,31,37,41,42,43,44,45,46,47,48,49,50,51,52,53,59],github:[0,1,4,13,37,39,41,43,53],gitlab:[0,4,32,33,34,35,36,37,43],give:[3,10,16,36,44,52],given:[5,6,7,8,9,10,13,14,15,16,21,23,26,27,34,39,41,44,45,46,48,50,51,52,54,59,60,65],glanc:[16,44],glibc:[19,62],global:[2,5,6,9,12,13,14,19,27,31,41,45,47,48,50,51,59,62],gnu:[2,4,15,30,41,43,57],gnueabihf:[4,43],goal:[1,8,16,24,39,44,53],goe:[17,26,28,49,54,61],going:[1,10,13,15,16,24,27,28,29,41,44,52,53,58,59,61],goldberg:[30,57],gonzui:[13,41],good:[0,16,24,31,40,44,53],goodi:[13,41],googl:[37,53],got:[30,57],gpio1:[4,43],gpio:[4,43],gpl:[2,15,21,31,41,65],gplv2:[24,53],gplv3:2,gpu:[4,43],grade:[0,16,32,33,34,35,36,37,38,40,44],gradual:[7,46],graduat:39,grain:[24,53],grant:[16,44],granular:[19,21,62,65],graph:[13,41],graphic:[4,13,24,43,53],greater:[14,29,45,58],greatli:[14,45],greg:53,grep:[12,13,15,34,41,47],grossli:[24,53],group:[7,8,9,22,23,24,25,27,39,46,51,53,59,60,63,64],grow:[28,61],gtk:[3,12,13,47],guarante:[5,6,12,19,24,28,37,47,48,50,53,61,62],guard:[19,21,29,58,62,65],guest16:38,guest16_end:38,guest:[0,16,30,44,57],guest_16_bit:38,guest_32_bit:38,guest_cod:38,guest_code_s:38,gui:[13,41],guid:[6,7,10,18,31,42,46,48,52],guidelin:[13,41],hack:[13,18,41,42],had:[5,7,21,24
,46,50,53,65],half:[6,12,23,27,28,47,48,59,60,61],hall:53,halt:38,hand:[0,5,11,18,23,42,50,60],handl:[0,5,6,7,8,9,10,15,18,21,24,26,27,28,30,34,35,38,41,42,46,48,50,51,52,53,54,57,59,61,65],handler:[0,5,6,18,19,20,24,26,27,28,29,30,33,42,48,50,53,54,55,57,58,59,61,62],happen:[6,7,8,10,12,13,14,16,21,22,23,27,28,29,44,45,46,47,48,52,58,59,60,61,64,65],happend:[21,65],hard:[4,5,6,7,8,9,10,12,14,15,16,17,18,22,27,28,35,37,41,42,43,44,45,46,47,48,49,50,51,52,59,61,64],hardirq_bit:[28,61],hardirq_mask:[28,61],hardlink:[9,51],hardwar:[0,4,7,8,9,15,19,21,24,25,28,30,34,36,41,43,46,51,53,57,61,62,63,65],hartman:53,hartmut:53,has:[0,2,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28,29,33,35,37,38,39,41,42,44,45,46,47,48,49,50,51,52,53,54,58,59,60,61,62,63,64,65],hash:[10,21,22,33,36,52,64,65],hashtabl:[33,36],have:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,27,28,29,30,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,57,58,59,60,61,62,63,64,65],have_pid:[14,45],haven:[1,27,59],hc0:[21,65],hctx:[5,50],hd_geometri:[5,50],hda1:[7,46],hda2:[7,46],hda3:[13,41],hda:[7,46],hdr_len:[18,26,42,54],hdrerr:36,he1:[21,65],head:[1,9,14,18,25,26,27,28,31,38,42,45,51,53,54,59,61,63],head_32:2,header:[9,10,12,14,15,16,18,24,33,34,35,36,37,41,42,44,45,47,51,52,53],headers_end:[18,42],heap:[17,27,49,59],heavi:[25,63],heavili:[24,53],held:[14,21,39,45,65],hello:[0,38],hello_kdb:15,hello_kdb_break:15,hello_kdb_bug:15,help:[1,2,8,13,14,15,16,17,21,23,32,33,34,35,36,37,38,39,41,44,45,49,60,65],helper:[9,12,17,25,27,47,49,51,59,63],henc:[4,18,24,27,28,42,43,53,59,61],here:[2,4,7,8,9,15,17,18,19,21,22,23,27,28,31,35,41,42,43,46,49,51,59,60,61,62,64,65],heterogen:[13,41],hex:[21,24,53,65],hexadecim:[5,13,41,50],hexdump:[18,42],hi_softirq:[6,23,48,60],hierarch:[8,24,53],hierarchi:[0,4,8,9,10,28,43,51,52,61],high:[6,8,16,19,22,23,24,25,27,28,29,39,44,48,53,58,59,60,61,62,63,64],higher:[5,8,10,12,15,16,19,23,24,28,39,41,44,47,50,52,53,60,61,62],highest:[6,
19,26,32,33,34,35,36,37,38,48,54,62],highmem:[0,17,25,49,63],hint:[9,15,41,51],hitachi:[24,53],hitm:[16,44],hlist_head:[9,51],hlt:[2,38],hog:[6,16,44,48],hold:[4,5,6,8,9,10,12,13,14,17,18,19,21,23,28,33,42,43,45,47,48,49,50,51,52,60,61,62,65],holder:[5,50],hole:[27,59],home:[2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,21,24,39,41,42,43,44,45,46,47,48,49,50,51,52,53,65],homework:[9,13,32,33,34,35,36,37,38,39,51],honest:39,hook:[18,24,26,42,53,54],hooknum:[18,42],hop:[26,54],hope:[30,57],hopefulli:[19,28,61,62],host:[0,1,2,3,7,10,13,15,18,22,26,30,36,38,41,42,46,52,54,57,64],hostnam:[18,42],hotkei:[16,44],hotplug:0,hour:39,housekeep:[19,62],hover:[13,41],how:[0,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28,29,31,34,35,38,41,42,43,44,45,46,47,48,49,51,52,53,54,58,59,60,61,62,63,64,65],howev:[5,6,9,12,13,15,18,19,23,24,28,29,39,41,42,47,48,50,51,53,58,60,61,62],howto:[0,13,41],hpf:[9,51],hrtimer_softirq:[6,23,48,60],html:[0,2,38,53],hton:[18,42],htonl:[18,42],htop:[16,44],http:[0,1,2,4,7,13,18,24,30,31,36,37,38,41,42,43,46,53,57],hundr:[19,62],hvc0:[2,3,13,15,16,41,44],hwaddr:[30,57],hybrid:[24,53],hypercal:[27,30,57,59],hypervisor:[0,21,24,27,53,59,65],i2c1:[4,43],i2c:[4,12,43,47],i386:[3,15,19,41,62],i386_start_kernel:[21,65],i440fx:[21,65],i686:2,i8042:[12,47],i8042_data_reg:[12,47],i8042_kbd_irq:[12,47],i8042_read_data:[12,47],i8042_setup_kbd:[12,47],i8042_status_reg:[12,47],i82559er:3,i_atim:[9,10,51,52],i_blkbit:[10,52],i_block:[10,52],i_cdev:[7,46],i_count:[10,52],i_ctim:[9,10,51,52],i_dev:[10,52],i_fop:[9,10,51,52],i_gid:[9,10,51,52],i_gid_writ:[9,51],i_ino:[10,52],i_map:[10,52],i_mmap:[22,64],i_mmap_rwsem:[22,64],i_mmap_writ:[22,64],i_mod:[9,10,51,52],i_mtim:[9,10,51,52],i_new:[10,52],i_nlink:[10,52],i_op:[9,10,51,52],i_pag:[22,64],i_priv:[10,52],i_rdev:[10,52],i_sb:[10,52],i_siz:[9,10,51,52],i_size_read:[10,52],i_stat:[10,52],i_uid:[9,10,51,52],i_uid_writ:[9,51],ia32_nr_syscal:[29,58],ia32_sys_call_t:[29,58],ia_siz:[10,52],ia_valid:[10,52],iattr:
[10,52],ibm:[24,53],icmp:[26,54],id_tabl:8,ide0:[12,47],ide1:[12,47],ide:8,ide_cd_mod:[15,41],idea:[9,14,21,39,45,51,65],ident:[7,13,46],identif:8,identifi:[4,5,6,7,8,9,10,12,13,14,15,16,17,21,22,23,25,28,29,31,41,43,44,45,46,47,48,49,50,51,52,58,60,61,63,64,65],idl:[2,6,48],ids:[16,24,44,53],idt:[23,60],idtr:[23,30,57,60],idtr_addr:[23,60],idtr_entri:[23,60],idx:[10,19,52,62],ier:[12,47],ifdef:[19,22,27,29,58,59,62,64],iff:[27,59],ifnam:3,ifndef:[19,62],iget_fail:[10,52],iget_lock:[10,52],ignor:[3,7,10,12,15,17,18,23,28,41,42,46,47,49,52,60,61],ignore_loglevel:15,ihl:[18,42],iir:[12,47],illeg:[19,62],illustr:[7,46],imag:[0,3,5,6,7,9,11,12,13,16,24,41,44,46,47,48,50,51,53],imap:[10,22,37,52,64],imap_block:37,img:[3,5,9,13,35,50,51],immedi:[5,14,23,28,35,45,50,60,61],impact:[24,28,29,53,58,61],implement:[0,4,5,6,9,10,14,16,17,18,19,20,22,23,24,25,28,30,31,32,37,38,39,42,43,44,45,48,49,50,51,52,53,55,57,60,61,62,63,64],impli:[24,30,53,57],implic:51,implicitli:[9,18,42,51],import_single_rang:[18,42],importantli:[19,62],impos:[18,24,42,53],imposs:[15,35,41],improv:[5,10,16,28,30,39,44,50,51,52,57,61],imx6ul:[4,43],imx8mm:[4,43],imx8mp:[4,43],imx_v6_v7_defconfig:[4,43],in_interrupt:[28,61],in_irq:[19,62],inactive_task_fram:[27,59],inaddr_loopback:[18,42],inappropri:[12,47],inb:[12,38,47],inb_p:[12,47],inc:2,inc_nlink:[9,51],includ:[1,2,4,5,6,7,8,9,10,12,14,15,16,17,18,21,23,24,25,29,30,36,39,41,42,43,44,45,46,47,48,49,50,51,52,53,57,58,60,63,65],incom:[12,19,27,47,59,62],incomplet:39,inconsist:[13,21,27,41,59,65],incorpor:8,incorrect:[7,21,28,35,39,46,61,65],increas:[0,6,9,16,18,23,24,26,28,32,33,34,35,36,37,38,42,44,48,51,53,54,60,61],increasingli:[23,60],increment:[8,9,10,11,14,15,19,23,28,45,51,52,60,61,62],incur:[24,39,53],indefinit:[16,44],independ:[5,8,12,14,18,24,25,28,42,45,47,50,53,61,63],index:[0,1,5,9,10,12,13,19,22,23,27,36,37,38,41,47,50,51,52,59,60,62,64],indic:[0,5,7,9,10,12,15,17,18,19,22,30,37,39,41,42,46,47,49,50,51,52,57,62,64],indirect:37,indirect_dat
a_block:37,indirectli:[10,52],individu:[0,5,12,17,21,23,34,39,47,49,50,60,65],induct:[9,51],industri:[4,43],ineffici:[5,10,12,47,50,52],inet6:2,inet:[2,18,42],inet_cr:[18,42],inet_protosw_reus:[18,42],inet_sock_destruct:[18,42],inetd:3,infinit:[12,38,47],info:[0,4,5,6,9,12,13,14,21,23,43,45,47,48,50,51,53,60,65],inform:[0,1,5,7,8,9,10,13,14,15,16,17,18,19,21,22,23,24,25,27,29,30,32,33,34,35,36,37,38,39,41,42,44,45,46,49,50,51,52,53,57,58,59,60,62,63,64,65],infrastructur:[0,1,2,13,32,33,34,35,36,37,39],init:[2,4,6,7,8,14,15,16,17,23,24,31,41,43,44,45,46,48,49,53,60],init_delayed_work:[6,48],init_list_head:[14,27,45,59],init_modul:[7,12,15,18,21,41,42,46,47,65],init_nam:8,init_net:[18,27,42,59],init_onc:[10,52],init_ramfs_f:[9,51],init_task:[27,59],init_wait_entri:[27,59],init_waitqueue_head:[7,46],init_work:[6,23,48,60],initi:[0,1,6,7,8,10,12,13,14,17,18,20,21,22,23,24,27,31,36,37,42,45,46,47,48,49,52,53,55,59,60,64,65],initial_flag:[14,45],initrd:[24,53],inject:[16,30,44,57],inl:[12,47],inlin:[19,21,24,27,29,33,53,58,59,62,65],ino:[10,37,52],inod:[0,17,24,31,37,49,53],inode_direct_data_bl:37,inode_direct_data_block:37,inode_init_onc:[9,10,51,52],inode_init_own:[9,10,51,52],inode_newsize_ok:[10,52],inode_oper:[0,9,51],inode_vers:[10,52],inoi:37,input:[4,8,12,14,18,22,24,26,28,30,42,43,45,47,53,54,57,61,64],insert:[5,8,9,10,12,14,15,16,25,26,27,29,41,44,45,47,50,51,52,54,58,59,63],insert_inode_hash:[10,52],insid:[0,4,5,6,7,8,9,10,12,13,14,15,16,17,18,21,23,30,41,42,43,44,45,46,47,48,49,50,51,52,57,60,65],insight:[10,16,27,44,52,59],insmod:[7,8,9,10,12,14,15,18,21,41,42,45,46,47,51,52,65],inspect:[0,2,4,5,8,9,10,12,13,15,16,17,21,29,33,41,43,44,47,49,50,51,52,58,65],inspir:39,instal:[0,1,2,3,4,7,13,15,16,27,32,33,34,35,36,37,38,41,43,44,46,59],install_mod_path:3,instanc:[2,7,9,13,15,16,22,23,27,30,33,37,39,41,44,46,51,57,59,60,64],instanti:[10,18,42,52],instead:[1,5,7,12,14,15,18,19,24,27,28,30,36,41,42,45,46,47,50,53,57,59,61,62],instruct:[2,4,5,12,13,15,17,19,21,23,2
7,28,29,30,32,33,34,35,36,37,38,39,41,43,47,49,50,57,58,59,60,61,62,65],instructor:39,instrument:[4,16,21,33,43,44,65],insuffici:[12,47],int80:[23,60],int_max:[18,42],int_min:[18,42],integ:[9,10,14,18,25,28,42,45,51,52,61,63],integr:[1,4,8,9,13,15,16,39,41,43,44,51],intel:[0,12,13,24,38,41,47,53],intend:[13,18,41,42],intent:[11,39],inter:[24,53],interact:[7,8,9,10,15,16,17,18,22,24,28,32,36,38,41,42,44,46,49,51,52,53,61,64],interactiv:53,intercept:[15,18,33,41,42],interchang:[28,61],interconnect:[18,42],interest:[5,10,13,14,15,16,17,29,41,44,45,49,50,52,58],interfac:[2,4,5,6,7,8,9,12,13,15,16,17,18,21,23,24,26,27,29,30,32,33,35,36,37,39,41,42,43,44,46,47,48,49,50,51,53,54,57,58,59,60,65],interleav:[15,28,41,61],intern:[5,6,7,8,10,12,17,18,22,24,25,30,32,36,42,46,47,48,49,50,52,53,57,63,64],internet:[13,18,39,41,42],interpret:[0,28,61],interract:38,interrog:[12,47],interrupt:[0,4,6,7,8,14,15,19,20,21,24,26,27,29,30,34,40,41,43,45,46,48,53,54,55,57,58,59,62,65],intersect:[28,61],interv:[6,24,48,53],intr:[12,23,47,60],intricaci:[16,44],intro:0,introduc:[4,10,12,13,14,22,24,41,43,45,47,52,53,64],introducer:53,introduct:[0,22,40,64],intuit:[22,64],invalid:[10,14,15,16,19,20,21,28,29,35,41,44,45,52,55,58,61,62,65],invalidatepag:[22,64],inversio:[21,65],invert:[9,51],investig:[0,9,13,15,21,51,65],invit:[16,44],invlpg:[19,30,57,62],invoc:[15,41],invok:[6,9,10,27,48,51,52,59],involv:[5,8,9,10,12,14,19,39,45,47,50,51,52,62],inw:[12,47],iobas:8,ioc0:[12,47],iocb:[22,64],iocb_noio:[22,64],iocb_nowait:[22,64],iocla:53,ioctl:[0,4,5,18,30,31,33,34,36,38,42,43,50,57],ioctl_messag:[7,46],ioctl_set_addr:[18,42],iomem:[4,27,43,59],ionut:31,ioperm:[12,47],iopl:[12,47],iopol:[6,48],ioport:[12,27,34,47,59],ioremap:[19,62],iosnoop:[16,44],iounmap:[19,62],iov:[18,42],iov_it:[22,64],iovec:[18,42],ip_hdr:[18,42],ipc:[24,27,53,59],iph:[18,42],iphdr:[18,42],ipproto_tcp:[18,42],ipproto_udp:[18,42],iptabl:[18,42],iput:[9,10,51,52],ipv4:[18,24,42,53],ipv6:[24,53],iret:[19,23,29,58,60,62],irq:[0,8
,12,24,30,47,53,57],irq_com1:34,irq_com2:34,irq_count:[28,61],irq_exit:[21,65],irq_handl:[12,47],irq_handler_t:[12,47],irq_no:[12,47],irq_non:[12,47],irq_poll_softirq:[6,23,48,60],irq_wake_thread:[12,47],irqf_oneshot:[12,47],irqf_shar:[12,47],irqflag:2,irqn:[23,60],irqreturn_t:[12,47],irqs_dis:[19,62],irrepar:[15,41],is_dirty_writeback:[22,64],is_en:[27,59],is_err:[5,50],is_key_press:[12,47],is_partially_uptod:[22,64],isa:8,isapnp:8,isn:8,iso9660:[9,51],isol:[24,27,53,59],isolate_mode_t:[22,64],isolate_pag:[22,64],isra:[21,65],issu:[1,5,6,7,11,12,13,15,16,17,19,21,23,24,26,27,28,29,30,33,39,41,44,46,47,48,49,50,53,54,57,58,59,60,61,62,65],item:[6,14,15,32,45,48],iter:[0,5,6,8,14,17,22,37,45,48,49,50,64],its:[5,6,7,8,9,10,12,13,14,15,16,17,18,22,23,24,25,27,28,29,30,33,35,37,38,39,41,42,44,45,46,47,48,49,50,51,52,53,57,58,59,60,61,63,64],itself:[3,16,21,23,24,25,30,44,53,57,60,63,65],izon:[22,37,64],izone_block:37,jae:[29,58],jbd2:[16,44],jetson:[4,43],jiffi:[6,13,14,21,45,48,65],jiffies_64:13,jiffies_valu:[6,48],jmp:[15,27,41,59],jnz:[28,61],job:[13,23,24,53,60],john:31,join:[13,41],jonathan:53,joystick:[7,46],json:[13,41],jtag:[21,65],jump:[12,13,23,29,41,47,58,60],june:[16,39,44],just:[1,5,6,8,9,10,12,13,15,17,19,21,22,23,24,27,28,30,41,47,48,49,50,51,52,53,57,59,60,61,62,64,65],juwucd7ldvurncamopepmhqejfftunlaqo:2,kacpid:[6,48],kallsym:[14,16,44,45],karma:53,kasan:0,kate:[13,41],kbd:[12,47],kbd_exit:[12,47],kbd_fop:[12,47],kbd_init:[12,47],kbd_interrupt_handl:[12,47],kbd_major:[12,47],kbd_minor:[12,47],kbd_read:[12,47],kblockd:[6,48],kbuild:[11,15,33,34,35,36,41],kconfig:[24,53],kcore:[13,41],kdb:[0,11],kdb_write_address:15,kde:[13,41],kdir:[2,15,41],keep:[4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,24,25,28,29,37,39,41,42,43,44,45,46,47,48,49,50,51,52,53,58,61,63],kei:[0,2,13,15,26,27,41,54,59],kept:[9,23,51,60],kern:[15,18,41,42],kern_alert:[10,14,15,41,45,52],kern_crit:[14,15,41,45],kern_debug:[14,15,41,45],kern_emerg:[14,15,41,45],kern_err:[5,14,15,41,45,50],kern_
info:[14,15,41,45],kern_level:[15,41],kern_notic:[5,14,15,41,45,50],kern_warn:[14,15,41,45],kernel:[3,5,7,8,9,10,12,17,19,20,22,23,25,26,29,30,33,34,35,36,37,38,39,40,46,47,49,50,51,52,54,55,57,58,60,62,63,64],kernel_:[18,42],kernel_api:[14,45],kernel_buff:[7,29,46,58],kernel_modul:[11,15,41],kernel_profil:[16,44],kernel_recvmsg:[18,42],kernel_sector_s:[5,35,50],kernel_sendmsg:[18,42],kernel_thread:13,kernelnewbi:[13,41],kernelsocket:[18,42],keyboard:[0,3,7,8,13,15,41,46],keycod:[12,47],keylogg:[12,47],keystrok:[12,47],keyword:[0,6,11,12,17,47,48,49],kfifo:[0,34],kfree:[9,14,17,21,28,33,45,49,51,61,65],kfree_mem:33,kgdboc:[15,41],khelper:[6,48],ki_flag:[22,64],kib:[16,44],kill:[15,16,18,21,27,41,42,44,59,65],kill_anon_sup:[9,51],kill_block_sup:[9,51],kill_litter_sup:[9,51],kill_sb:0,killabl:[27,59],kilobyt:[5,50],kiocb:[22,64],kit:[4,43],kkeil:[19,62],klau:53,klogd:[15,41],kludg:[22,64],km_type_nr:[19,62],kmalloc:[10,14,17,21,25,33,45,49,52,63,65],kmalloc_area:[17,49],kmalloc_mem:33,kmalloc_prob:33,kmalloc_probe_entry_handl:33,kmalloc_probe_handl:33,kmalloc_ptr:[17,49],kmap:[17,19,49,62],kmap_atom:[5,19,50,62],kmap_atomic_high_prot:[19,62],kmap_atomic_idx_push:[19,62],kmap_atomic_prot:[19,62],kmap_prot:[19,62],kmap_pt:[19,62],kmem:[16,44],kmem_cach:[21,65],kmem_cache_alloc:[10,13,21,52,65],kmem_cache_alloc_trac:[21,65],kmem_cache_cr:[21,65],kmem_cache_fre:[21,65],kmemcheck:[21,65],kmemleak:0,kmmap:[17,49],knife:[18,42],know:[0,5,12,14,15,16,18,21,24,40,42,44,45,47,50,53,65],knowledg:[5,9,10,13,33,34,36,39,41,50,51,52],known:[2,6,9,10,13,14,23,28,41,45,48,51,52,60,61],knuth:[14,45],kob:8,kobj:8,kobj_typ:8,kobj_uevent_env:8,kobject:0,kobject_get:8,kobject_init:8,kobject_put:8,kobject_set_nam:8,korner:[18,42],kprobe:[0,11,16,21,39,40,44,53,65],kref:8,kretprob:33,kroah:53,kscope:0,kset:8,ksoftirqd:[6,21,23,48,60,65],kstatf:[9,51],ksys_writ:[16,44],kthread:[6,21,48,65],kthread_creat:[6,48],kthread_run:[6,48],kthreadd:[16,44],ktime_t:[18,26,42,54],ktype:8,kunmap_atom:[5,1
9,50,62],kvec:[18,42],kvm:[0,3,16,24,39,40,44,53],kvm_api:38,kvm_api_vers:38,kvm_create_vcpu:38,kvm_create_vm:38,kvm_exit_hlt:38,kvm_exit_io:38,kvm_exit_mmio:38,kvm_fd:38,kvm_get_api_vers:38,kvm_get_sreg:38,kvm_get_vcpu_mmap_s:38,kvm_reg:38,kvm_run:38,kvm_set_sreg:38,kvm_set_tss_addr:38,kvm_set_user_memory_region:38,kworker:[16,44],kzalloc:[9,51],l12:39,lab10:[9,51],lab9:[9,51],lab:[0,2,3,7,8,11,21,27,36,37,39,40,53,59,65],label:[3,4,12,15,18,24,29,41,42,43,47,53,58],labor:0,laboratoar:13,laboratori:[0,2,5,9,10,32,50,51,52],laboratorului:0,lack:[5,15,41,50],laddr:[19,62],languag:[13,14,15,41,45],laptop:39,larg:[6,7,9,14,16,19,22,23,24,26,27,28,44,45,46,48,51,53,54,59,60,61,62,64],larger:[2,8,10,13,19,25,26,41,52,54,62,63],largest:[24,53],last:[6,7,10,12,13,15,18,22,23,25,27,28,29,38,39,41,42,46,47,48,52,58,59,60,61,63,64],late:39,latenc:[0,23,28,34,60,61],later:[2,6,7,8,9,10,21,23,29,30,46,48,51,52,57,58,60,65],latest:[0,1,4,5,6,7,8,9,10,12,14,15,16,17,18,38,41,42,43,44,45,46,47,48,49,50,51,52],latm:[16,44],latter:[18,23,29,42,58,60],launch:[0,13],launder_pag:[22,64],law:2,layer:[0,18,26,36,40,42,54],layout:[0,19,23,35,60,62],lazi:[0,27,59],lcd:[4,43],lcdif:[4,43],ld_preload:[13,41],ldrex:[28,61],ldt:[19,23,60,62],ldtr:[19,62],lea:[13,41],lead:[4,7,8,12,14,15,16,23,25,28,39,41,43,44,45,46,47,60,61,63],leak:[21,65],leak_init:[21,65],leakag:[27,59],learn:[11,16,17,21,34,41,44,49,53,65],least:[4,5,6,10,15,18,19,23,24,28,34,39,41,42,43,48,50,52,53,60,61,62],leav:[3,11,15,18,22,25,41,42,63,64],lectur:[0,36,40],led:[18,42],left:[1,5,10,18,23,42,50,52,60],legaci:8,len:[5,7,10,17,18,21,22,26,36,38,42,46,49,50,52,54,64,65],length:[7,9,10,17,22,36,37,46,49,51,52,64],less:[7,10,13,15,19,25,28,37,41,46,52,61,62,63],let:[6,12,13,16,18,19,21,23,24,27,28,29,38,39,41,42,44,47,48,53,58,59,60,61,62,65],letter:[5,38,50],level:[0,1,7,8,10,12,13,14,15,16,18,19,21,22,23,24,25,27,28,29,30,33,35,36,39,41,42,44,45,46,47,52,53,57,58,59,60,61,62,63,64,65],lib:[4,13,14,15,17,21,24,27,41,43,45,
49,53,59,65],libc:[14,17,29,45,49,58],libncurses5:[3,13],librari:[4,13,14,15,24,29,41,43,45,53,58],libsw:[13,41],libthread_db:[13,41],licens:[2,24,30,31,53,57],life:[16,44],light:[6,28,48,61],lighter:[10,52],lightweight:[13,27,41,59],like:[4,6,7,8,9,10,11,12,13,16,17,18,19,21,23,27,28,39,41,42,43,44,46,47,48,49,51,52,59,60,61,62,65],likewis:[29,58],limit:[6,9,10,12,14,19,21,23,24,28,39,45,47,48,51,52,53,60,61,62,65],lin:[15,41],line:[1,6,7,8,12,13,14,15,16,18,21,23,24,28,32,36,38,41,42,44,45,46,47,48,53,60,61,65],linear:[0,25,63],linearli:[25,63],link:[0,2,3,4,5,8,9,13,14,15,16,21,22,24,25,26,27,36,37,41,43,44,45,50,51,53,54,59,63,64,65],linker:[29,38,58],linu:[24,53],linux:[2,3,5,6,7,9,10,11,15,16,17,21,26,27,30,32,33,34,35,36,37,38,40,44,46,48,49,50,51,52,54,57,59,65],list:[0,2,3,4,5,6,7,8,9,10,11,13,15,16,17,18,22,25,26,27,28,32,33,34,35,36,37,38,39,41,42,43,44,46,48,49,50,51,52,53,54,59,61,63,64],list_add:[6,14,27,45,48,59],list_add_rcu:[28,61],list_add_tail:[27,28,59,61],list_del:[6,14,21,27,45,48,59,65],list_del_init:[27,59],list_del_init_car:[27,59],list_del_rcu:[28,61],list_empti:[27,28,59,61],list_empty_car:[27,59],list_entri:[10,14,45,52],list_exit:[14,45],list_first_entri:[6,27,28,48,59,61],list_for_each:[14,45],list_for_each_entry_rcu:[28,61],list_for_each_entry_saf:[14,45],list_for_each_entry_safe_from:[27,59],list_for_each_saf:[14,45],list_head:[5,6,8,13,14,18,21,22,42,45,48,50,64,65],list_m:[21,65],list_next_entri:[27,59],list_poison1:[21,65],list_poison2:[21,65],lista:0,listen:[0,3,13,36],listen_backlog:[18,42],littl:[14,18,24,42,45,53],live:[15,41],lld:[7,46],llseek:[7,10,46,52],lnet_sock_accept:[18,42],load:[0,4,5,6,7,8,9,10,12,13,14,16,17,21,22,27,28,30,32,34,43,44,45,46,47,48,49,50,51,52,57,59,61,64,65],load_modul:[15,21,41,65],load_offset:[29,58],loadabl:[15,24,41,53],loader:[24,53],loc:[12,47],local:[0,1,2,4,5,6,7,8,10,12,13,14,15,16,17,18,19,23,24,25,26,28,36,37,41,42,43,44,45,46,47,48,49,50,52,53,54,60,61,62,63],local_bh_dis:[6,23,28,48,60,61
],local_bh_en:[6,23,28,48,60,61],local_irq_dis:[12,28,47,61],local_irq_en:[12,28,29,47,58,61],local_irq_restor:[28,61],local_irq_sav:[28,61],localhost:[2,18,42],locat:[1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,22,23,24,28,41,42,43,44,45,46,47,48,49,50,51,52,53,55,60,61,62,64],lock1:[14,45],lock2:[14,45],lock:[0,5,7,10,16,18,19,21,22,24,27,33,42,44,46,50,52,53,59,62,64,65],lock_acquir:[21,65],lock_addr:[28,61],lock_class_kei:[9,51],lockdep:0,lockdep_assert_held:[27,59],locked_ioctl:[5,50],locking2:[21,65],loff_t:[7,9,10,22,31,46,51,52,64],log:[1,8,10,12,13,14,15,16,21,30,41,44,45,47,52,57,65],log_guest_error:[30,57],logic:[5,19,24,26,29,35,50,53,54,58,62],logical_disk_nam:35,logical_disk_s:35,logical_disk_sector:35,login:[2,3,13],loglevel:[3,4,15,41,43],longer:[3,4,5,6,8,12,13,14,15,28,36,41,43,45,47,48,50,61],look:[0,4,6,7,9,12,13,14,15,16,18,21,22,23,25,27,28,29,30,32,41,42,43,44,45,46,47,48,51,57,58,59,60,61,63,64,65],lookup:[0,22,26,37,54,64],loop:[3,10,12,14,16,17,37,38,44,45,47,49,52],loopback:[2,18,36,37,42],lose:[30,57],lost:[15,16,41,44],lot:[12,15,20,27,38,41,47,55,59],love:[9,10,51,52,53],low:[5,16,22,23,24,30,44,50,53,57,60,64],lower:[16,21,23,24,25,28,44,53,60,61,63,65],lower_up:2,lowercas:34,lowest:[6,8,17,19,48,49,62],lowmem:[17,19,25,49,62,63],lpt:8,lro:[26,54],lru:[25,63],lrwx:[27,59],lrwxrwxrwx:3,lsb:[4,13,41,43],lsc:[13,41],lseek:[7,10,46,52],lsmod:[14,15,27,41,45,59],lsof:[16,44],lsp:[13,41],lucru:53,lucrul:53,lvm:[24,53],lwn:[7,24,46,53],lxc:[27,59],lxn:[17,49],lxp:[27,59],lxr:[0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65],mac:[15,24,36,41,53],mac_head:[18,26,42,54],mac_len:[18,26,42,54],machin:[0,1,4,5,6,7,9,10,12,14,15,16,18,24,27,29,33,34,35,36,39,40,41,42,43,44,45,46,47,48,50,51,52,53,58,59],macro:[0,5,6,7,8,10,12,13,14,15,18,19,24,27,33,35,36,37,38,41,42,45,46,47,48,50,52,53,59,62],macrodefinit:[5,9,
50,51],made:[5,6,7,12,13,14,17,18,22,24,29,32,33,34,35,36,38,39,42,45,46,47,48,49,50,53,58,64],madv_merg:38,madvis:38,magic:[7,9,21,22,37,38,46,51,64,65],magic_valu:38,magnet:[7,46],magnitud:[16,44],mai:[5,6,7,8,11,12,14,16,19,22,23,24,25,27,28,29,32,33,34,35,36,37,38,39,44,45,46,47,48,50,53,58,59,60,61,62,63,64],mail:[13,31,32,33,34,35,36,37,41,53],main:[0,1,2,5,6,8,10,12,13,15,16,19,21,22,26,28,37,38,41,44,47,48,50,52,54,61,62,64,65],mainli:[10,19,52,62],mainlin:[15,41],maintain:[0,5,6,8,9,14,18,21,22,25,28,42,45,48,50,51,61,63,64,65],maintainership:[24,53],major:[0,5,8,23,33,34,35,50,60],make:[0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,23,24,27,28,29,31,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,58,59,60,61,62],make_bad_inod:[10,52],makefil:[2,3,5,9,13,15,16,41,44,50,51],makenod:[6,8,48],malloc:[14,45],man:[13,18,41,42],manag:[0,5,7,8,10,12,14,15,16,17,18,21,30,31,32,39,40,41,42,44,45,46,47,49,50,52,57,65],mandat:[19,62],mandatori:[5,32,33,34,35,36,50],mani:[5,6,7,8,10,12,13,14,16,17,19,22,24,25,27,28,33,34,35,36,41,44,45,46,47,48,49,50,52,53,59,61,62,63,64],manifest:[28,61],manipul:[14,20,22,24,45,53,55,64],manner:[24,53],mantain:[13,41],manual:[1,2,6,7,13,16,18,28,42,44,46,48,61],manufactur:[4,43],map:[0,5,7,9,10,12,13,16,20,22,24,25,27,29,30,40,41,44,46,47,50,51,52,53,55,57,58,59,63,64],map_anonym:38,map_bh:[9,10,51,52],map_noreserv:38,map_priv:38,mappabl:[22,64],marc:53,march:32,marco:53,marginea:53,mark:[5,6,7,8,9,10,11,12,14,15,16,17,18,29,36,41,42,44,45,46,47,48,49,50,51,52,58],mark_buffer_dirti:[9,10,51,52],mark_inode_dirti:[10,52],mark_lock:[21,65],marker:[15,21,41,65],market:[24,53],mask:[6,23,25,26,27,48,54,59,60,63],maskabl:[23,60],master:[1,4,5,6,7,8,9,10,12,13,14,15,16,17,18,37,41,42,43,44,45,46,47,48,49,50,51,52,53],match:[8,12,13,17,26,27,41,47,49,54,59],match_devic:8,materi:39,matter:1,max:[7,9,36,38,46,51],max_access_s:[30,57],max_io_ap:[19,62],max_lfs_files:[9,51],max_number_devic:31,max_queue_len:38,max_siz:[14,45],max
_softirq_restart:[23,60],max_softirq_tim:[23,60],maxact:33,maxim:[5,50],maximum:[4,7,9,10,24,28,29,37,39,43,46,51,52,53,58,61],maxlen:38,maxsiz:[7,46],mcimx6ul:[4,43],mcr:[12,47],mean:[1,4,5,6,7,8,9,10,12,13,15,16,18,19,23,24,28,34,35,37,38,39,41,42,43,44,46,47,48,50,51,52,53,60,61,62],meaning:38,meaningless:[24,53],meant:[4,43],meantim:[6,48],measur:[14,16,38,39,44,45],mechan:[5,6,8,14,16,17,23,24,28,29,33,44,45,48,49,50,53,58,60,61],media_chang:[5,50],mediat:[7,46],medic:[4,43],mediu:53,medium:[28,61],meet:[8,34,35,39],megabyt:[16,44],mem:[14,16,17,27,38,44,45,49,59],mem_map:[25,63],membarrier_switch_mm:[27,59],member:[1,7,8,14,18,27,31,42,45,46,59],memblock:[20,55],memcmp:[14,45],memcpi:[5,14,24,29,38,45,50,53,58],memmov:[14,30,45,57],memori:[0,2,5,6,7,8,9,10,12,13,16,19,22,23,27,29,30,33,38,40,41,44,46,47,48,50,51,52,57,58,59,60,62,64],memoriei:53,memory_exit:[14,45],memory_init:[14,45],memory_map:[17,49],memory_region_add_subregion:[30,57],memory_region_init_io:[30,57],memoryregion:[30,57],memoryregionop:[30,57],memset:[14,21,45,65],mention:[4,5,7,28,29,32,33,34,35,36,37,38,43,46,50,58,61],menuconfig:[4,13,15,18,37,41,42,43],merchant:[30,57],mere:38,merg:[0,1,5,16,24,44,50,53],merit:[24,53],mesag:31,mesi:[28,61],messag:[0,1,4,5,6,9,10,12,13,14,15,16,17,21,24,26,27,31,38,41,43,44,45,47,48,49,50,51,52,53,54,59,65],messsag:[30,57],meta:[22,64],metadata:[9,10,24,51,52,53],metainform:[9,36,51],metal:[30,57],method:[5,7,8,9,13,15,20,22,23,38,41,46,50,51,55,60,64],methodolog:0,micro:0,microkernel:[15,24,41,53],mid10:[21,65],mid:[7,46],might:[1,4,15,16,17,18,21,22,24,27,28,29,31,42,43,44,49,53,58,59,61,64,65],might_fault:[29,58],might_sleep:[27,28,59,61],migrat:[22,27,59,64],migrate_async:[22,64],migrate_mod:[22,64],migratepag:[22,64],mihai:53,militaru:53,miller:[24,53],million:[24,53],min:[7,39,46,53],min_access_s:[30,57],mind:[9,12,14,16,18,24,31,42,44,45,47,51,53],minf:0,minfs_add_link:[10,52],minfs_alloc_inod:[9,51],minfs_creat:[10,52],minfs_destroy_inod:[9,51],min
fs_dir_entri:[10,52],minfs_dir_inode_oper:[10,52],minfs_dir_oper:[10,52],minfs_fill_sup:[9,51],minfs_find_entri:[10,52],minfs_fs_typ:[9,51],minfs_iget:[9,51],minfs_inod:[9,10,51,52],minfs_inode_info:[9,10,51,52],minfs_lookup:[10,52],minfs_mount:[9,51],minfs_new_inod:[10,52],minfs_num_entri:[10,52],minfs_op:[9,51],minfs_readdir:[10,52],minfs_sb_info:[9,51],minfs_super_block:[9,51],minicom:[0,2,3,13],minim:[2,3,4,9,13,28,41,43,51,61],minimalist:[10,52],minimum:[4,6,19,23,43,48,60,62],minix:[9,10,37,51,52],minix_add_link:[10,52],minix_alloc_inod:[9,51],minix_aop:[10,52],minix_bmap:[10,52],minix_cr:[10,52],minix_destroy_inod:[9,51],minix_dir_inode_oper:[10,52],minix_dir_oper:[10,52],minix_evict_inod:[10,52],minix_file_inode_oper:[10,52],minix_file_oper:[10,52],minix_fill_sup:[9,51],minix_find_entri:[10,52],minix_get_block:[10,37,52],minix_getattr:[10,52],minix_i:[10,52],minix_iget:[10,52],minix_inod:[10,52],minix_inode_info:[10,52],minix_link:[10,52],minix_lookup:[10,52],minix_mkdir:[10,52],minix_mknod:[10,52],minix_mount:[9,51],minix_new_inod:[10,52],minix_readdir:[10,52],minix_readpag:[10,52],minix_rmdir:[10,52],minix_sb_info:[9,51],minix_setattr:[10,52],minix_symlink:[10,52],minix_trunc:[10,52],minix_unlink:[10,52],minix_v1:[10,52],minix_v1_raw_inod:[10,52],minix_write_begin:[10,52],minix_write_fail:[10,52],minix_write_inod:[10,52],minix_writepag:[10,52],minor:[0,5,8,12,33,34,35,47,50],minu:[15,41],minut:[13,39,41,51,53],mip:[24,25,53,63],misc:0,misc_deregist:8,misc_regist:8,miscdevic:33,miss:[8,14,19,21,23,27,28,33,45,59,60,61,62,65],mistak:[6,15,32,33,34,35,36,37,41,48],mix:[19,62],mk_pte:[19,62],mkdev:[7,46],mkdir:[1,3,9,10,13,22,37,51,52,64],mkf:[9,10,51,52],mknod:[5,6,7,10,12,46,47,48,50,52],mm_struct:[0,13,15,25,63],mmap:[7,10,16,17,22,24,25,36,38,44,46,49,52,53,63,64],mmcblk0:[4,43],mmdrop:[27,59],mmgrab:[27,59],mmio:[30,38,57],mmput:[17,49],mmu:[0,17,25,29,49,58,63],mnt:[4,9,10,43,51,52],mobil:[4,39,43],mod1:[15,41],mod2:[15,41],mod:[11,15,41],mod_nam:8,mod_t
im:[6,21,23,48,60,65],mode:[0,3,5,6,7,9,10,12,13,14,15,17,18,19,24,27,29,30,37,39,41,42,45,46,47,48,49,50,51,52,53,57,58,59,62],mode_nam:[15,41],model:[0,3,6,10,30,48,52,57],modem:[12,47],modern:[15,18,24,28,41,42,53,61],modif:[13,24,34,37,41,53],modifi:[1,4,5,6,7,8,9,10,14,15,17,18,19,21,22,27,28,29,30,39,41,42,43,45,46,48,49,50,51,52,57,58,59,61,62,64,65],modinst:3,modpost:[11,15,41],modprob:[15,41],modul:[0,2,4,5,6,7,8,9,10,11,12,13,16,17,18,21,31,32,33,34,35,36,37,42,43,44,46,47,48,49,50,51,52,65],modular:[10,15,24,41,52,53],module_author:[15,31,41],module_descript:[15,31,41],module_device_t:8,module_exit:[15,18,21,31,41,42,65],module_init:[15,18,21,31,41,42,65],module_licens:[15,21,41,65],module_nam:[12,47],module_put:[18,42],modules_instal:3,moment:[9,10,12,13,41,47,51,52],momentarili:[19,62],mon:31,mon_proc:[6,48],mondai:[32,33,34,36],monitor:[0,6,15,16,23,24,28,33,41,44,48,53,60,61],monolit:53,monolith:[0,15,41],month:[24,53],moodl:[32,33,34,35,36,38,39],more:[2,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,22,23,24,25,27,28,29,30,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,57,58,59,60,61,62,63,64,65],moreov:[14,45],most:[4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,21,22,23,24,26,28,30,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,54,57,60,61,62,64,65],mostli:[4,10,20,22,23,43,52,55,60,64],motorola:[24,53],mount:[0,2,3,4,5,8,10,13,15,21,27,41,43,50,52,59,65],mount_bdev:[9,51],mount_nodev:[9,51],mount_opt:[9,51],mount_pseudo:[9,51],mount_singl:[9,51],mountain:[10,52],mous:[7,8,46],mov:[13,15,19,21,28,29,41,58,61,62,65],movabl:[25,63],move:[5,7,13,20,23,27,29,41,46,50,55,58,59,60],move_addr_to_kernel:[18,42],movl:[15,27,29,41,58,59],movzbl:[29,58],mozilla:[13,41],ms_remountm:[9,51],msg:[18,42],msg_control:[18,42],msg_controllen:[18,42],msg_dontwait:[18,42],msg_flag:[18,42],msg_iter:[18,42],msg_name:[18,42],msg_namelen:[18,42],msghdr:[18,42],msi:[30,57],msleep:[12,47],msr:[30,57],mss:[26,54],msust:[10,52],mtu:2,much:[5,6,7,10,13,15,16,23,24,25,27,28,41
,44,46,48,50,52,53,59,60,61,63],muller:53,mult:53,multi:[0,11,15,19,27,40,41,59,62],multi_mod:[15,41],multicast:2,multilib:[3,7,46],multipl:[2,5,6,8,9,10,11,12,13,14,15,16,19,22,23,24,25,27,28,30,33,35,41,44,45,47,48,50,51,52,53,57,59,60,61,62,63,64],multiplex:36,multiprocess:0,multiprocessor:[6,48],multitask:[14,24,45,53],must:[1,5,6,7,8,9,10,12,13,14,15,17,18,19,22,24,27,28,29,31,32,34,35,36,37,38,39,41,42,45,46,47,48,49,50,51,52,53,58,59,61,62,64],mutex:[0,6,48],mutex_flag_wait:[28,61],mutex_init:[14,45],mutex_lock:[12,14,21,28,33,45,47,61,65],mutex_lock_nest:[21,33,65],mutex_remove_wait:[28,61],mutex_unlock:[14,21,28,33,45,61,65],mutex_wait:[28,61],mutual:[5,14,45,50],mvar:[15,41],mx6ul:[4,43],my_access:[12,47],my_acquir:[14,45],my_baseport:[12,47],my_blkdev_nam:[5,50],my_block_dev:[5,50],my_block_exit:[5,50],my_block_init:[5,50],my_block_major:[5,50],my_block_minor:[5,50],my_block_op:[5,50],my_block_open:[5,50],my_block_releas:[5,50],my_block_request:[5,50],my_block_transf:[5,50],my_bu:8,my_bus_descr:8,my_bus_devic:8,my_bus_device_releas:8,my_bus_exit:8,my_bus_init:8,my_bus_typ:8,my_class:8,my_classdev:8,my_cleanup:8,my_data:[6,7,12,46,47,48],my_debug_func:15,my_dev_releas:8,my_devic:[7,8,46],my_device_data:[6,7,12,46,47,48],my_device_driv:[7,46],my_driv:8,my_exit:8,my_first_minor:[7,46],my_fop:[7,46],my_fork:1,my_handl:[12,47],my_hook_exit:[18,42],my_hook_init:[18,42],my_init:[8,12,14,45,47],my_ioctl:[7,18,42,46],my_ioctl_data:[7,46],my_ioctl_down:[7,46],my_ioctl_filter_address:[18,42],my_ioctl_get_buff:[7,46],my_ioctl_in:[7,46],my_ioctl_print:[7,46],my_ioctl_set_buff:[7,46],my_ioctl_timer_alloc:[6,48],my_ioctl_timer_cancel:[6,48],my_ioctl_timer_mon:[6,48],my_ioctl_timer_set:[6,48],my_ioctl_up:[7,46],my_irq:[12,47],my_list:[14,45],my_lock:[28,61],my_major:[7,46],my_match:8,my_max_minor:[7,46],my_minor:[7,46],my_minor_count:[7,46],my_modul:[14,45],my_nf_hookfn:[18,42],my_nfho:[18,42],my_nr_port:[12,47],my_oops_exit:[15,41],my_oops_init:[15,41],my_open:[7,46],my
_pnp_driv:8,my_pnp_prob:8,my_pnp_remov:8,my_pnp_tbl:8,my_port:[18,42],my_proc_file_op:[17,49],my_queue_op:[5,50],my_read:[7,46],my_register_devic:8,my_register_driv:8,my_releas:[7,14,45,46],my_seq_open:[17,49],my_seq_show:[17,49],my_show_bus_descr:8,my_submit_bio:[5,50],my_test_messag:[18,42],my_thread_f:[6,48],my_uev:8,my_unregister_devic:8,my_unregister_driv:8,my_work:[6,48],my_work_handl:[6,48],my_workqueu:[6,48],my_writ:[7,46],my_xfer_bio:[5,50],my_xfer_request:[5,50],myaddr:[18,42],mybdev:[5,7,46,50],myblock:[5,50],mybu:8,mybus0:8,mycdev:[7,46],myclass0:8,myclass:8,mydev0:8,mydev:8,mydisk:[5,9,13,50,51],mydriv:8,myf:0,myfs_aop:[10,52],myfs_creat:[10,52],myfs_dir_inode_oper:[10,52],myfs_file_inode_oper:[10,52],myfs_file_oper:[10,52],myfs_fill_sup:[9,51],myfs_get_inod:[9,10,51,52],myfs_mkdir:[10,52],myfs_mknod:[10,52],myfs_mount:[9,51],mykthread0:[6,48],myrul:8,name:[3,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,22,23,24,25,27,28,29,32,33,34,35,36,37,38,41,42,43,44,45,46,47,48,49,50,51,52,53,58,59,60,61,62,63,64,65],name_len:[10,52],name_s:8,namefmt:[6,48],namespac:[0,18,24,42,53],nano:[4,43],nat:[18,26,42,54],nativ:[16,30,44,57],native_safe_halt:2,natur:[6,48],navig:[0,1,15,17,35,49],ncp:[9,51],necesar:0,necess:[19,29,58,62],necessari:[5,7,8,10,12,13,14,15,22,28,31,37,38,41,45,46,47,50,52,61,64],necessarili:[24,53],need:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,55,59,60,61,62,63,64,65],neg:[7,12,14,22,28,45,46,47,61,64],neglig:[29,58],neovim:[13,41],nest:[0,6,12,47,48],net:[3,7,8,13,15,18,24,27,36,41,42,46,53,59],net_devic:[18,26,42,54],net_famili:[18,42],net_proto_famili:36,net_rx_softirq:[6,23,48,60],net_softirq:[26,54],net_tx_sofirq:[6,48],net_tx_softirq:[6,23,48,60],netcat:[0,15,41],netconsol:[0,3],netdev:3,netem:[18,42],netfilt:0,netfilter_ipv4:[18,42],network:[0,4,6,8,9,15,23,27,36,40,41,43,48,51,59,60],network_head:[18,26,42,54],neural:[4,43],never:[10,19,29,52,58
,62],new_inod:[9,10,51,52],new_sock:[18,42],newer:[29,58],newli:[5,10,18,25,36,42,50,52,63],newlin:[17,49],newsock:[18,42],nex:38,next:[1,6,7,9,10,12,13,14,15,18,19,21,22,23,24,25,26,27,28,30,32,33,34,35,36,38,39,41,42,45,46,47,48,51,52,53,54,57,59,60,61,62,63,64,65],next_jiffi:[6,48],next_task:[14,45],nf_:[18,42],nf_accept:[18,42],nf_drop:[18,42],nf_hook_op:[18,42],nf_hook_stat:[18,42],nf_hookfn:[18,42],nf_inet_forward:[18,42],nf_inet_hook:[18,42],nf_inet_local_in:[18,42],nf_inet_local_out:[18,42],nf_inet_numhook:[18,42],nf_inet_post_rout:[18,42],nf_inet_pre_rout:[18,42],nf_ip_forward:[26,54],nf_ip_hook_prior:[18,42],nf_ip_local_in:[26,54],nf_ip_local_out:[26,54],nf_ip_numhook:[26,54],nf_ip_post_rout:[26,54],nf_ip_pre_rout:[26,54],nf_ip_pri_conntrack:[18,42],nf_ip_pri_conntrack_confirm:[18,42],nf_ip_pri_conntrack_defrag:[18,42],nf_ip_pri_conntrack_help:[18,42],nf_ip_pri_filt:[18,42],nf_ip_pri_first:[18,42],nf_ip_pri_last:[18,42],nf_ip_pri_mangl:[18,42],nf_ip_pri_nat_dst:[18,42],nf_ip_pri_nat_src:[18,42],nf_ip_pri_raw:[18,42],nf_ip_pri_secur:[18,42],nf_ip_pri_selinux_first:[18,42],nf_ip_pri_selinux_last:[18,42],nf_max_verdict:[18,42],nf_queue:[18,42],nf_register_net_hook:[18,42],nf_repeat:[18,42],nf_stolen:[18,42],nf_stop:[18,42],nf_unregister_net_hook:[18,42],nfs:[9,51],nfsd:[15,41],nic:3,nicknam:[18,42],nmap:[13,41],nmi:[12,23,47,60],nmi_bit:[28,61],noarp:2,nobodi:[6,19,48,62],nobuff:36,node1:[4,43],node:[4,5,7,8,9,10,14,24,25,28,43,45,46,50,51,52,53,61,63],nodev:[21,65],nograph:[4,43],noinlin:[21,65],nok:[29,58],non:[0,4,5,7,12,14,15,18,19,22,23,24,28,29,30,34,41,42,43,45,46,47,50,53,57,58,60,61,62,64],none:[2,3,4,8,9,10,15,41,43,51,52],nonetheless:[16,44],nonmask:[23,60],noop:2,nop:[15,28,41,61],noqueu:2,noret_typ:[6,48],normal:[8,9,14,22,23,25,27,45,51,59,60,63,64],nosock:36,nota:53,notabl:[5,50],notar:0,note:[5,6,7,8,12,13,14,15,16,17,19,21,22,24,27,38,39,41,44,45,46,47,48,49,50,53,59,62,64,65],noteworthi:[15,18,41,42],noth:[1,6,18,22,42,48,64],notic:[4,5,7,8,
10,12,13,14,15,19,21,23,27,28,31,41,43,45,46,47,50,52,59,60,61,62,65],notif:[0,5,12,47,50],notifi:[5,7,8,10,46,50,52],notion:[13,14,36,39,41,45],noul:53,now:[1,4,6,9,10,12,13,14,15,16,22,23,24,25,27,28,38,41,43,44,45,47,48,51,52,53,59,60,61,63,64],nowdai:38,npage:[17,49],npu:[4,43],nr_cpu:[19,62],nr_exclus:[27,59],nr_hw_queue:[5,50],nr_page:[22,64],nr_port:8,nr_sector:[5,50],nr_softirq:[6,23,48,60],nr_thp:[22,64],nrexcept:[22,64],nrpage:[22,64],nsproxi:[27,59],ntf:[9,10,22,51,52,64],ntoh:[18,42],ntohl:[18,42],nttcp:3,num:[7,18,42,46],num_byt:[10,52],numa:[5,20,28,50,55,61],numa_no_nod:[5,50],numa_nod:[5,50],number:[5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28,29,33,34,36,37,38,41,42,44,45,46,47,48,49,50,51,52,53,54,58,59,60,61,62,63,64,65],numer:[18,37,42],nutshel:53,nvidia:[4,43],nvim:[13,41],nxp:[4,43],o_append:[7,46],o_cloexec:[18,42],o_nonblock:[0,18,42],o_rdonli:[7,46],o_rdwr:38,o_sync:[7,46],o_trunc:[7,46],obiectivel:0,obj:[11,15,41],objdump:0,object:[0,8,39],objtool:[27,59],observ:[6,8,12,14,15,16,18,28,42,44,45,47,48,61],obtain:[5,7,10,12,15,17,18,27,33,36,39,42,46,47,49,50,52,59],obviou:39,obvious:[15,41,51],occupi:[9,10,14,37,45,51,52],occur:[6,7,8,12,14,15,19,22,23,25,27,28,29,30,39,41,45,46,47,48,57,58,59,60,61,62,63,64],occurr:[28,32,61],octavian:53,octob:39,odroid:[4,43],off:[6,7,15,31,38,39,41,46,48],off_t:[17,49],offend:[21,65],offer:[7,8,12,13,15,16,18,23,24,28,29,39,41,42,44,46,47,53,58,60,61],offici:[4,39,43],offlin:39,offload:[24,26,53,54],offset:[5,7,15,17,18,19,21,22,23,25,26,31,41,42,46,49,50,54,60,62,63,64,65],offsetof:[14,45],often:[6,7,8,10,12,14,15,18,22,23,24,28,41,42,45,46,47,48,52,53,60,61,64],okfn:[18,42],old:[7,9,10,14,19,23,24,27,28,45,46,51,52,53,59,60,61,62],oldconfig:[13,41],older:39,oldest:[15,41],onc:[1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,21,22,23,26,28,36,41,42,43,44,45,46,47,48,49,50,51,52,54,60,61,62,64,65],one:[4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,22,23,24,25,27,28,29,31,32,33,34,35,36,37,38,39
,41,42,43,44,45,46,47,48,49,50,51,52,53,58,59,60,61,62,63,64],ones:[9,10,14,18,42,45,51,52],onli:[3,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,39,41,42,45,46,47,48,49,50,51,52,53,54,58,59,60,61,62,63,64,65],onlin:[2,39],onto:[4,43],oop:[0,6,7,11,19,46,48,62],oops:[15,41],oops_init:[21,65],op_oop:[15,41],op_read:[15,41],op_writ:[15,41],opaqu:[30,57],open:[0,1,4,5,9,10,12,13,14,17,24,31,34,38,39,41,43,45,47,49,50,51,52,53],open_disk:[5,50],open_softirq:[23,60],opengrok:[13,41],oper:[0,4,5,12,13,15,16,19,21,25,27,31,33,34,35,36,37,39,41,43,44,47,50,59,62,63,65],operar:53,operst:[15,41],opinion:[16,44],oppos:[16,19,24,27,28,38,44,53,59,61,62],ops:[5,8,18,36,42,50],optim:[0,2,5,10,19,24,25,29,50,52,53,58,62,63],optimist:[28,61],option:[0,3,5,6,7,9,10,13,16,18,21,24,25,33,34,37,38,39,42,44,46,48,50,51,52,53,63,65],option_both:34,option_com1:34,option_com2:34,order:[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,21,22,23,24,25,27,29,31,32,33,34,35,36,37,41,42,43,44,45,46,47,48,50,51,52,53,58,59,60,62,63,64,65],orderli:[12,47],org:[2,13,18,24,30,38,42,53,57],organ:[5,7,8,18,19,22,24,42,46,50,53,62,64],orient:[7,36,46],orig_ax:[29,58],origin:[1,12,13,22,27,41,47,59,64],osc:[4,43],osdep:[30,57],osdev:38,osi:36,oss:[24,53],other:[0,2,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,22,23,24,26,27,28,30,36,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,54,57,59,60,61,62,64],otherwis:[6,7,8,9,10,12,13,19,27,28,29,30,39,41,46,47,48,51,52,57,58,59,61,62],our:[0,4,7,12,13,15,16,24,35,38,39,41,43,44,46,47,53],ourselv:[14,28,45,61],out:[2,5,7,9,10,12,13,15,16,17,18,21,28,29,30,39,41,42,44,46,47,49,50,51,52,57,58,61,65],out_blk_init:[5,50],out_err:[5,50],out_module_put:[18,42],out_put:[18,42],outb:[12,38,47],outb_p:[12,47],outbound:[18,42],outl:[12,47],output:[0,3,4,5,7,8,12,15,16,18,22,23,26,28,30,41,42,43,44,46,47,50,54,57,60,61,64],outsid:[18,19,21,24,27,42,53,59,62,65],outw:[12,47],over:[6,7,8,9,10,13,14,15,16,17,18,24,25,27,28,36,41,42,44,45,46,48
,49,51,52,53,59,61,63],overal:[29,31,58],overflow:[19,21,23,60,62,65],overhead:[14,16,17,21,24,28,44,45,49,53,61,65],overlap:[19,28,61,62],overload:[24,53],overview:[0,19,22,25,30,40,57,62,63,64],overwrit:[5,27,50,59],overwritten:[21,65],own:[1,4,9,15,18,19,22,24,25,27,28,41,42,43,51,53,59,61,62,63,64],owner:[5,7,8,9,12,18,22,28,31,36,42,46,47,50,51,61,64],packag:[0,1,2,3,7,13,15,16,18,27,36,41,42,44,46,59],packet:[0,24,36,53],packet_typ:36,pad:[27,59],paddr:[19,62],page:[0,1,4,5,10,12,14,15,16,18,20,21,23,24,27,29,32,33,34,35,36,37,38,39,41,42,43,44,45,47,50,52,53,55,58,59,60,65],page_address:[17,19,49,62],page_align:[17,49],page_mask:[19,62],page_offset:[17,49],page_s:[8,9,17,49,51],page_shift:[9,17,19,49,51,62],page_to_pfn:[17,49],pageabl:0,pagecach:[22,64],pagemap:[9,51],pagep:[10,22,52,64],pager:[16,44],pagin:[19,62],pahlk:53,pahol:[27,59],pai:[24,53],panel:[4,13,41,43],panic:[0,15,41],panic_init:[21,65],panic_tim:[21,65],paper:[30,57],paragraph:39,parallel:[0,5,8,13,14,15,16,19,23,28,41,44,45,50,60,61,62],paramet:[0,5,6,7,8,9,10,12,13,14,16,17,18,22,34,38,42,44,45,46,47,48,49,50,51,52,64],paravirt:[27,59],paravirtu:0,parcur:53,parcurgerea:53,parent:[8,9,10,13,14,16,17,22,27,41,44,45,49,51,52,59,64],parport0:[12,47],parport_pc:8,parport_pc_exit:8,parport_pc_init:8,parport_pc_pnp_driv:8,parport_pc_pnp_prob:8,parport_pc_pnp_remov:8,parport_pc_pnp_tbl:8,pars:8,parse_and_ev:[27,59],part:[0,5,7,14,15,16,18,19,21,22,23,24,25,27,28,29,36,37,39,40,41,42,44,45,46,50,53,58,59,60,61,62,63,64,65],partial:[7,22,25,46,63,64],particip:39,particular:[6,8,9,10,11,12,18,19,22,23,24,30,42,47,48,51,52,53,57,60,62,64],particularli:[16,24,44,53],partit:[5,8,13,22,27,35,50,59,64],partner:31,pass:[5,6,7,8,9,10,13,15,16,18,24,27,29,32,33,34,35,36,37,38,39,41,42,44,46,48,50,51,52,53,58,59],password:[3,13],past:[10,19,52,62],patch:[24,53],path:[2,7,9,13,15,16,18,22,26,27,28,32,33,34,35,36,37,38,41,42,44,46,51,54,59,61,64],pathnam:[17,49],pattern:[21,28,61,65],paus:[28,61],payment:39,pcb:
[27,59],pcd:[19,62],pci:[8,30,57],pcnet32:[15,41],pde:[21,65],peak:[16,44],peanut:[10,52],peculiar:[6,24,48,53],peer:[18,39,42],penal:39,penalti:[0,19,24,53,62],pend:[12,16,23,27,30,44,47,57,59,60],pentium:[29,58],pentru:53,peopl:[16,24,39,44,53],per:[0,5,15,19,27,38,39,41,50,59,62],per_cpu_var:[27,29,58,59],percentag:[16,44],perf:0,perform:[4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,23,24,26,27,28,30,41,42,43,44,45,46,47,48,49,50,51,52,53,54,57,59,60,61,62,65],period:[0,15,19,24,41,53,62],peripher:[4,12,19,30,43,47,57,62],perm:[17,49],perman:[0,2],permiss:[9,10,12,13,16,17,18,36,42,44,47,49,51,52],permit:[2,12,23,39,47,60],perror:38,person:[16,44],perspect:[5,7,46,50],pf_:[18,42],pf_inet:[18,42],pf_packet:[18,36,42],pf_stp:36,pfifo_fast:2,pfn:[17,19,49,62],pfn_to_pag:[17,49],pg_buddi:[25,63],pg_reserv:[17,49],pgd:[19,62],pgd_offset:[19,62],pgd_t:[19,62],pgoff_t:[22,64],pgprot_t:[17,19,49,62],phase:[0,12,13,15,23,28,38,41,47,60,61],phone:[4,39,43],phy:[19,62],phys_addr:[19,62],physic:[0,5,7,8,9,10,12,13,14,15,18,19,20,21,23,24,30,35,41,42,45,46,47,50,51,52,53,55,57,60,62,65],physical_disk1_nam:35,physical_disk2_nam:35,physical_disk_nam:[5,50],pi4:[18,42],pi_lock:[27,59],pic1:[12,47],pic2:[12,47],pic:[23,60],pick:[25,27,28,31,59,61,63],picocom:[15,41],pid:[6,14,15,16,17,18,21,27,33,41,42,44,45,48,49,59,65],pid_list:[14,45],pid_t:[14,45],piec:39,piix:[21,65],pin:[23,60],pio:38,pipe1:3,pipe2:3,pipe:[3,9,10,37,51,52],pipef:[9,51],pipelin:[28,61],pitix_dir_entri:37,pitix_inod:37,pitix_mag:37,pitix_name_len:37,pitix_super_block:37,pitix_vers:37,place:[4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,23,25,26,27,39,41,42,43,44,45,46,47,48,49,50,51,52,54,59,60,62,63,65],plagiar:0,plai:[0,15,41],plan:[6,9,12,13,31,47,48,51],platform:[0,1,4,6,7,8,12,15,16,17,18,19,41,42,43,44,46,47,48,49,62],ple:[14,45],pleas:[2,4,5,6,7,8,9,10,12,14,15,16,17,18,23,31,37,39,41,42,43,44,45,46,47,48,49,50,51,52,60],pll:[4,43],plu:[4,43],plug:0,plugin:13,pm_message_t:8,pmap:[17,49],pmd:[19,62],pmd_off
set:[19,62],pmd_t:[19,62],pnp0400:8,pnp0401:8,pnp:[0,19,62],pnp_bu:8,pnp_bus_match:8,pnp_bus_typ:8,pnp_dev:8,pnp_device_id:8,pnp_device_prob:8,pnp_driver:8,pnp_id:8,pnp_irq:8,pnp_irq_valid:8,pnp_port_len:8,pnp_port_start:8,pnp_port_valid:8,pnp_register_driv:8,pnp_register_drvi:8,pnp_unregister_driv:8,pnpbio:[8,19,62],point:[0,4,5,7,8,9,10,13,14,15,16,17,18,19,20,22,23,24,25,26,27,29,30,34,37,41,42,43,44,45,46,49,50,51,52,53,54,55,57,58,59,60,62,63,64],pointer:[0,5,6,7,8,9,10,12,14,15,17,18,21,22,24,25,27,29,38,41,42,45,46,47,48,49,50,51,52,53,58,59,63,64,65],poison:[21,65],poki:[2,3,13],polici:[0,28,61],politehnica:0,poll:[12,36,47],pool:[6,25,38,48,63],poor:[23,28,60,61],poorli:[28,61],pop:[4,5,6,7,8,9,10,12,14,15,16,17,18,19,23,27,28,41,42,43,44,45,46,47,48,49,50,51,52,59,60,61,62],popek:[30,57],popescu:[31,53],popf:[28,30,57,61],popfl:[27,59],popl:[27,59],popsect:[27,29,58,59],popul:[15,25,27,38,41,59,63],popular:[4,43],port:[0,3,4,6,7,8,13,15,17,18,23,26,30,34,36,38,41,42,43,46,48,49,54,57,60],portabl:[0,15,41],portal:53,portion:[5,14,45,50],portul:53,pos:[10,14,22,45,52,64],posit:[5,7,9,10,25,28,46,50,51,52,61,63],posix:[14,24,27,45,53,59],possibl:[5,6,7,8,9,10,12,13,14,15,17,18,21,22,23,24,27,28,33,36,39,41,42,45,46,47,48,49,50,51,52,53,59,60,61,64,65],post:[30,38,57],postpon:[23,28,60,61],potenti:[1,21,23,27,59,60,65],power:[5,8,24,25,28,50,53,61,63],powerpc64:[24,53],powerpc:[20,24,53,55],ppid:[27,59],ppph:[16,44],pps:[26,54],pr706:39,pr_alert:[14,45],pr_crit:[14,45],pr_debug:[14,15,41,45],pr_emerg:[14,45],pr_err:[14,17,45,49],pr_fmt:[14,45],pr_info:[6,7,12,14,17,21,45,46,47,48,49,65],pr_notic:[14,45],pr_warn:[14,45],practic:[3,16,18,24,28,39,42,44,53,61],pre:[6,48],prebuild:3,preced:[5,13,41,50],precis:[6,48],predefin:[6,7,15,27,36,41,46,48,59],predict:[28,61],preempt:[0,6,15,19,23,24,28,41,48,53,60,61,62],preempt_bit:[28,61],preempt_count:[28,61],preempt_count_inc:[28,61],preempt_dis:[27,28,59,61],preempt_en:[27,59],preemption:[0,27,59],preemptiv:[0,6,28,4
8,61],prefer:[12,23,28,29,39,47,58,60,61],preferred_lft:2,prefix:[11,13,15,18,27,28,41,42,59,61],prentic:53,prepar:[2,4,5,6,7,8,9,10,12,13,14,15,16,17,18,41,42,43,44,45,46,47,48,49,50,51,52],prepare_lock_switch:[27,59],prepare_task_switch:[27,59],prepare_to_wait_ev:[27,59],presenc:[5,9,10,34,50,51,52],present:[0,5,6,7,8,9,10,13,14,15,17,18,19,21,22,24,28,29,30,35,36,37,38,41,42,45,46,48,49,50,51,52,53,57,58,61,62,64,65],preserv:[7,28,46,61],press:[3,12,13,47],pressur:[22,64],pretti:[24,27,53,59],prev:[14,18,21,26,27,42,45,54,59,65],prev_mm:[27,59],prevent:[5,6,14,22,24,27,28,39,45,48,50,53,59,61,64],preview:32,previou:[0,5,6,7,8,9,10,12,13,14,15,17,18,21,23,28,38,39,41,42,45,46,47,48,49,50,51,52,60,61,65],previous:[4,5,9,10,17,19,22,27,43,49,50,51,52,59,62,64],prezentar:53,prezentarea:53,price:[19,62],primari:[8,39],primit:[6,12,14,15,20,24,27,41,45,47,48,53,55,59],prin:53,principl:[15,18,41,42],print:[4,5,6,12,13,14,15,16,17,18,21,23,25,38,41,42,43,44,45,47,48,49,50,60,63,65],print_debug:[15,41],print_hex_dump_byt:[15,41],print_hex_dump_debug:[15,41],print_sock_address:[18,42],print_usage_bug:[21,65],printer:8,printf:[6,14,15,30,41,45,48,57],printk:[0,1,4,5,10,18,21,42,43,50,52,65],prior:[8,18,42],prioriti:[0,6,15,18,20,26,39,41,42,48,54,55],priv:[18,42],privat:[5,6,7,8,9,10,12,18,22,25,27,28,31,37,39,42,46,47,48,50,51,52,59,61,63,64],private_data:[5,7,22,46,50,64],private_list:[22,64],private_lock:[22,64],priviledg:[19,62],privileg:[19,23,24,30,53,57,60,62],privilleg:[16,44],probabl:[13,15,41],probe:[0,12,16,33,44,47],problem:[0,5,7,12,13,14,15,21,34,39,41,45,46,47,50,65],problemat:39,proc:[0,4,5,6,7,9,11,12,13,14,17,21,27,32,33,34,36,43,45,46,47,48,49,50,51,59,65],proc_creat:[17,36,49],proc_entry_nam:[17,49],proc_remov:36,procedur:[38,39],proceed:[12,47],proces:53,procesor:53,process:[0,2,4,8,9,10,12,13,15,16,17,18,19,21,23,29,30,32,33,34,35,36,37,38,40,41,42,43,44,47,49,51,52,57,58,60,62,65],process_timeout:[21,65],processor:[0,4,6,8,12,14,23,24,25,27,29,30,43,4
5,47,48,53,57,58,59,60,63],procf:[0,9,32,33,36,51],produc:[4,21,28,34,38,43,61,65],product:[15,18,41,42],profession:51,profil:[0,21,38,40,53,65],profunzim:53,program:[5,6,7,8,10,12,13,14,15,16,17,18,19,23,24,30,38,41,42,44,45,46,47,48,49,50,52,53,57,60,62],programar:53,programm:[0,7,9,14,24,28,45,46,51,53,61],progress:[27,28,31,59,61],project:[1,2,3,4,9,10,13,18,24,39,41,42,43,51,52,53],promovar:53,prompt:[3,13],propag:[22,64],proper:[7,33,46],properli:[9,10,12,15,24,27,29,41,47,51,52,53,58,59],properti:[4,5,16,19,22,23,24,27,28,43,44,50,53,59,60,61,62,64],proport:[26,28,53,54,61],proportion:39,propuneri:53,prot:[17,19,49,62],prot_writ:38,protect:[6,12,14,15,17,19,22,24,28,30,38,41,45,47,48,49,53,57,61,62,64],protector:[19,62],proto:[26,54],proto_op:[0,36],protocol:[0,8,13,16,18,28,38,39,40,41,42,44,53,61],prototyp:[7,18,42,46],proven:[15,41],provid:[2,3,4,5,8,9,10,12,13,14,15,16,18,19,20,21,23,24,27,28,33,34,35,36,38,39,41,42,43,44,45,47,50,51,52,53,55,59,60,61,62,65],pseudo:[9,13,15,29,41,51,58],psh:[18,42],psi_ttwu_dequeu:[27,59],pso:[15,41],pt_reg:[29,58],pte:[19,62],pte_non:[19,62],pte_offset:[19,62],pte_pag:[19,62],pte_t:[19,62],pthread_creat:[27,59],pthread_mutex_trylock:[14,45],ptr:[14,29,38,45,58],ptrace:[14,45],ptrval:[21,65],pts:[2,3,13,15,16,27,41,44,59],pty:3,pub:[13,24,36,53],publicli:[24,39,53],publish:[30,39,57],pud:[19,62],pud_offet:[19,62],pud_offset:[19,62],pud_t:[19,62],pull:[0,4,5,6,7,8,9,10,12,14,15,16,17,18,24,31,41,42,43,44,45,46,47,48,49,50,51,52,53],punct:53,punctajul:53,punish:39,punit:39,pure:[22,64],purpos:[5,6,8,12,13,17,22,23,24,27,28,29,30,39,41,47,48,49,50,53,57,58,59,60,61,64],push:[1,13,15,19,21,23,26,28,41,54,60,61,62,65],pushf:[28,61],pushfl:[27,59],pushl:[27,59],pushsect:[27,29,58,59],put:[7,14,18,23,24,27,28,42,45,46,53,59,60,61],put_char:[12,47],put_idx:[12,47],put_link:[22,64],put_sup:[9,22,51,64],put_task_struct:[6,48],put_us:[7,12,29,46,47,58],putback_pag:[22,64],pwd:[15,41],pwt:[19,62],python3:3,python:13,q_ctrl:38,q_elem_
t:38,qapi:[30,57],qdisc:2,qemu:[0,1,2,3,5,7,9,15,16,21,32,41,44,46,50,51,65],qemu_chr_fe_accept_input:[30,57],qemu_chr_fe_init:[30,57],qemu_chr_fe_set_handl:[30,57],qemu_chr_fe_writ:[30,57],qemu_displai:[3,12,13,47],qemu_irq:[30,57],qemu_irq_low:[30,57],qemu_irq_rais:[30,57],qemu_log_mask:[30,57],qemu_opt:[5,9,13,50,51],qemuarm:[4,43],qemuchrev:[30,57],qemux86:[2,3,12,13,16,17,21,44,47,49,65],qlen:2,qnx6_find_entri:[10,52],qnx6_readdir:[10,52],qstr:[10,52],qt4:[13,41],qual:[29,58],qualifi:[24,53],qualiti:[16,44],quantiti:33,quantum:[14,45],que:[5,50],queri:[0,7,13,41,46],question:[0,39],queu:[7,18,27,28,42,46,59,61],queue:[0,6,12,18,24,26,27,28,34,36,42,47,48,53,54,59,61],queue_control:38,queue_control_t:38,queue_delayed_work:[6,48],queue_depth:[5,50],queue_rq:[5,50],queue_work:[6,48],queuedata:[5,50],quick:[8,13,15,35,41],quickfix:[13,41],quickli:[15,24,41,53],quickstart:0,quiescent:[28,61],quiet:[13,41],quit:[10,15,18,41,42,52],quiz:[0,39],quizz:39,quota:[10,52],r10:[28,61],r11:[28,61],race:[6,12,24,27,28,47,48,53,59,61],raddr:[18,42],radix:[22,64],raid:[0,24,39,40,53],rais:[23,60],raise_softirq:[23,60],ram:[0,4,7,24,43,46,53],ramdisk:[5,50],ramf:[9,10,51,52],ramfs_aop:[10,52],ramfs_creat:[10,52],ramfs_dir_inode_oper:[10,52],ramfs_file_inode_oper:[10,52],ramfs_file_oper:[10,52],ramfs_fill_sup:[9,51],ramfs_fs_info:[9,51],ramfs_fs_typ:[9,51],ramfs_get_inod:[9,51],ramfs_kill_sb:[9,51],ramfs_mag:[9,51],ramfs_mkdir:[10,52],ramfs_mknod:[10,52],ramfs_mount:[9,51],ramfs_op:[9,51],ramfs_parse_opt:[9,51],ramfs_show_opt:[9,51],random:[5,50],rang:[12,17,19,20,30,34,37,47,49,55,57,62],rare:[24,27,53,59],raspberri:[4,43],rather:[10,18,42,52],raw:[3,5,9,13,18,42,50,51],raw_inod:[10,52],rax:38,razvan:[3,9,51],rb_node:[18,42],rb_root_cach:[22,64],rbnode:[18,42],rc1:[24,53],rc2:[18,24,42,53],rci:[15,41],rcu:[0,6,23,48,60],rcu_read_lock:[28,61],rcu_read_unlock:[28,61],rcu_softirq:[6,23,48,60],rdai:[12,47],rdmsr:[30,57],reach:[7,8,13,15,17,18,22,23,41,42,46,49,60,64],reachabl:[21,65]
,react:[23,60],reactiv:[6,12,27,47,48,59],read:[0,2,4,8,13,14,15,16,19,20,21,23,24,25,29,30,31,32,33,34,35,36,37,38,41,43,44,45,53,55,57,58,60,62,63,65],read_inod:[10,22,52,64],read_it:[10,22,52,64],read_lock:[14,45],read_lock_irq:[12,47],read_lock_irqsav:[12,47],read_unlock:[14,45],read_unlock_irq:[12,47],read_unlock_irqrestor:[12,47],readahead:[22,64],readahead_control:[22,64],readdir:[10,52],readelf:[16,44],reader:[14,28,45,61],readi:[5,7,27,28,46,50,59,61],readlink:[22,64],readm:[32,33,34,35,36,37,38],readpag:[10,22,52,64],real:[0,5,13,15,16,41,44,50],real_mod:38,real_par:[14,45],realiti:[5,50],realli:[27,59],realloc:[21,65],rear:[30,57],rearm:[6,48],reason:[5,6,7,8,9,10,12,13,14,15,17,39,41,45,46,47,48,49,50,51,52],reassert:[23,60],rebas:1,reboot:[8,15,41],rebuild:[0,4,5,6,7,8,9,10,12,14,15,16,17,18,41,42,43,44,45,46,47,48,49,50,51,52],receiv:[0,1,5,6,7,8,10,12,13,14,15,22,26,30,33,36,38,39,41,45,46,47,48,50,52,54,57,64],recent:[7,22,46,64],recept:[18,34,42],recheck:[27,59],reciproc:8,reclam:[28,61],recommend:[0,4,5,6,7,9,10,11,12,13,14,15,16,18,28,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,50,51,52,61],recompil:[4,13,34,43],record:[7,16,21,29,33,39,44,46,58,65],recoveri:35,recurs:[14,24,45,53],recv:[18,36,42],recvfrom:[18,36,42],recvmsg:[18,36,42],red:[25,63],redirect:[3,7,15,39,41,46],redistribut:[2,30,57],redo:[0,35],redon:39,reduc:[0,8,9,14,15,19,24,25,28,30,32,33,34,35,36,37,41,45,51,53,57,61,62,63],redzon:[21,65],ref:53,refcount_t:[18,42],refer:[0,2,3,4,5,6,7,8,9,10,12,14,15,17,18,19,22,24,28,36,37,38,39,42,43,45,46,47,48,49,50,51,52,53,61,62,64],referenc:[7,10,13,28,37,41,46,52,61],reflect:[8,24,53],refresh:[16,44],refus:8,reg:[12,18,29,38,42,47,58],regard:[8,9,24,33,39,51,53],regardless:[23,39,60],regener:[13,41],region:[12,14,20,21,25,27,34,45,47,55,59,63,65],regist:[0,4,13,15,17,18,19,21,23,27,29,30,34,36,37,38,41,42,43,49,57,58,59,60,62,65],register_blkdev:[5,50],register_chrdev_region:[7,46],register_devic:8,register_filesystem:[9,51],registr:[
0,8,9,51,53],registri:[12,47],regular:[0,2,9,13,15,16,19,24,26,41,44,51,53,54,62],regularli:[24,53],reilli:53,reiserf:[9,51],reiter:39,rel:[5,12,14,15,24,28,29,37,39,41,45,47,50,53,58,61],relai:[5,50],relat:[2,8,9,13,14,16,17,18,22,24,32,33,34,35,36,41,42,44,45,49,51,53,64],relationship:[22,64],relativ:53,relax:[6,48],relay_exit:[5,50],relay_init:[5,50],releas:[0,5,8,9,10,12,14,15,17,18,19,21,24,28,31,35,36,37,41,42,45,47,49,50,51,52,53,61,62,65],release_region:[12,47],release_resourc:[28,61],releasepag:[22,64],relev:[1,14,15,41,45],reli:[19,27,29,39,58,59,62],reliabl:[21,65],relinquish:[24,53],reload:[12,14,27,45,47,59],reloc:[20,55],remain:[6,7,9,13,24,41,46,48,51,53],remap:[17,30,49,57],remap_pfn_rang:[17,49],remark:0,rememb:[6,18,36,42,48],remmap:[30,57],remot:[1,2,13,18,36,42],remount:[9,51],remount_f:[9,22,51,64],remov:[0,2,4,5,6,7,10,12,14,15,18,22,23,24,25,27,28,33,41,42,43,45,46,47,48,50,52,53,59,60,61,63,64],remove_from_buff:[14,45],remove_proc_entri:[17,49],renam:[10,22,52,64],rent:39,reorder:[5,50],rep:[28,61],repeat:[13,15,16,30,41,44,57],repeatedli:[12,47],replac:[6,7,9,15,41,46,48,51],replace_lxr:[9,51],repo:[1,3,4,5,6,7,8,9,10,12,14,15,16,17,18,31,32,33,34,35,36,37,41,42,43,44,45,46,47,48,49,50,51,52],report:[2,7,12,14,16,21,23,39,44,45,46,47,60,65],repositori:[0,2,4,5,6,7,8,9,10,12,14,15,16,17,18,37,41,42,43,44,45,46,47,48,49,50,51,52],repres:[5,6,7,8,9,10,12,13,14,15,16,17,18,19,23,24,35,37,38,39,41,42,44,45,46,47,48,49,50,51,52,53,60,62],represant:[4,43],represent:[8,18,42],req:[5,50],req_iter:[5,50],req_op:[5,50],req_op_read:[5,50],req_op_writ:[5,50],request:[0,6,7,9,10,15,16,21,22,24,28,31,33,35,38,39,41,44,46,48,51,52,53,61,64,65],request_irq:[12,47],request_queu:[5,35,50],request_region:[12,47],request_threaded_irq:[12,47],requests_irq:[12,47],requir:[1,2,3,5,6,7,8,9,12,13,14,15,16,17,18,19,22,23,24,25,27,28,30,32,33,34,35,36,37,38,39,41,42,44,45,46,47,48,49,50,51,53,57,59,60,61,62,63,64],rerun:13,res1:[18,42],res:[13,18,36,42],reschedul:[6,23
,48,60],resembl:[0,19,29,58,62],reser:38,reserv:[6,7,10,12,15,19,23,24,25,26,30,36,39,41,46,47,48,52,53,54,57,60,62,63],reset:[0,4,5,6,7,8,9,10,14,15,16,17,18,38,39,41,42,43,44,45,46,48,49,50,51,52],reset_buff:[12,14,45,47],resid:[7,8,9,14,17,23,45,46,49,51,60],resolv:[0,11],resort:39,resourc:[0,2,6,7,8,9,12,13,14,16,18,24,28,39,41,42,44,45,46,47,48,51,53,61],respect:[4,5,7,8,9,13,14,15,17,18,25,35,36,39,42,43,45,46,49,50,51,63],respond:[5,7,8,15,41,46,50],respons:[0,5,7,8,9,10,18,23,24,28,42,46,50,51,52,53,60,61],rest:[5,6,7,9,12,18,22,23,31,33,34,35,36,37,42,46,47,48,50,51,60,64],rest_init:[2,21,65],restart:[13,15,24,41,53],restor:[5,23,27,28,29,50,58,59,60,61],restore_al:[21,65],restrict:[0,6,18,19,23,42,48,60,62],resubmit:39,result:[4,5,7,10,13,14,15,16,17,18,23,24,25,27,28,29,31,32,33,35,38,39,41,42,43,44,45,46,49,50,52,53,58,59,60,61,63],resum:[8,11,23,27,28,29,58,59,60,61],resurs:0,ret:[8,10,13,15,17,23,27,29,30,41,49,52,57,58,59,60],ret_from_fork:[21,65],retain:[5,9,10,33,39,50,51,52],retak:0,retaken:39,retri:[7,28,46,61],retriev:[7,14,18,26,27,29,42,45,46,54,58,59],retroact:39,retval:[18,42],reus:[24,53],revalidate_disk:[5,50],revers:[18,21,42,65],review:[0,1,4,5,6,7,8,9,10,12,14,15,16,17,18,41,42,43,44,45,46,47,48,49,50,51,52],revis:15,revisit:[12,47],rework:[6,48],rezolvarea:53,rflag:38,right:[1,7,9,10,12,13,17,18,19,21,22,23,28,31,37,38,39,42,46,47,49,51,52,60,61,62,64,65],ring:[26,54],rip:38,risc:[4,24,25,43,53,63],risk:[28,61],ritter:53,rmb:[28,61],rmdir:[3,10,22,37,52,64],rmmod:[7,10,12,14,15,18,41,42,45,46,47,52],robert:[9,10,51,52,53],rol:53,role:[5,7,8,9,10,39,46,50,51,52],rom:[5,50],room:[26,39,54],root:[0,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,21,22,24,27,30,37,41,42,43,44,45,46,47,48,49,50,52,53,57,59,64,65],rootf:[0,36],rootkit:[13,41],round:31,rout:[0,15,18,23,24,27,37,41,42,53,59,60],routin:[0,6,7,14,15,18,22,34,41,42,45,46,48,64],row:36,rpi:[4,43],rpl:[19,62],rq_data_dir:[5,50],rq_flag:[27,59],rq_for_each_seg:[5,50],rqcf_act_skip:[27,59],r
qcf_req_skip:[27,59],rs232:[12,47],rsa:2,rsb:[27,59],rsb_clear_loop:[27,59],rsp:38,rss:[16,44],rst:[1,18,42],rtc:[12,47],rto:[23,60],rubini:53,rule:[0,4,5,8,13,14,23,26,31,37,38,40,41,43,45,50,54,60],run:[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,27,28,29,30,35,41,42,43,44,45,46,47,48,49,50,51,52,53,55,57,58,59,60,61,62,64,65],run_timer_softirq:[21,65],runnabl:[27,59],runqueu:[27,59],runtim:[15,24,41,53],rw_semaphor:[22,64],rwlock_init:[14,45],rwlock_t:[14,45],rx_fifo:[30,57],rx_fifo_len:[30,57],rxcnt:[30,57],rxctrl:[30,57],rxpkt:36,s_blocksiz:[9,51],s_blocksize_bit:[9,51],s_dev:[9,51],s_dirt:[9,51],s_flag:[9,51],s_fs_info:[9,51],s_id:[9,51],s_ifdir:[9,51],s_isdir:[9,10,51,52],s_isreg:[9,10,51,52],s_iwusr:8,s_lock_kei:[9,51],s_magic:[9,51],s_maxbyt:[9,51],s_op:[9,51],s_root:[9,10,51,52],s_time_gran:[9,51],s_type:[9,51],s_umount_kei:[9,51],sad:[27,59],saddr:[18,42],safe:[6,19,21,27,48,59,62,65],safest:39,sai:[16,44],said:[23,24,53,60],salturi:53,same:[1,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,22,23,24,25,27,28,29,30,35,36,37,39,41,42,43,44,45,46,47,48,49,50,51,52,53,57,58,59,60,61,62,63,64],sampl:[0,6,16,18,21,24,33,42,44,48,53,65],sanit:[21,65],saniti:[16,18,42,44],sat:2,satisfi:[7,14,45,46],sato:[2,3,16,44],satur:[16,44],save:[7,9,10,12,13,15,16,23,27,28,29,30,41,44,46,47,51,52,57,58,59,60,61],save_mount_opt:[9,51],saw:8,sb_bread:[9,10,51,52],sb_read:[10,52],sbh:[10,52],sbi:[10,52],sbin:[4,8,15,17,41,43,49],sc1:[21,65],scalabl:0,scale:[24,26,28,53,54,61],scan:[12,21,27,47,59,65],scancod:0,scatter:[5,26,50,54],scenario:[5,15,21,23,28,50,60,61,65],sched:[2,6,7,14,15,16,33,41,44,45,46,48],sched_class:[27,59],sched_softirq:[6,23,48,60],sched_task_group:[27,59],schedul:[5,6,7,12,14,16,20,23,24,27,28,33,44,45,46,47,48,50,53,55,59,60,61],schedule_delayed_work:[6,48],schedule_delayed_work_on:[6,48],schedule_on_each_cpu:[6,48],schedule_timeout:[7,13,14,45,46],schedule_work:[6,23,48,60],schemat:[14,45],scheme:[0,23,35,37,60],school:[3,9,51],scienc:0,
scientif:[24,53],scm:[24,53],scope:[2,26,54],score:39,scratch:[4,38,39,43],screen:[13,15,39,41],screencast:[23,27,59,60],scri:53,script:[0,3,5,6,9,10,13,16,18,24,29,32,33,34,35,36,37,38,41,42,44,48,50,51,52,53,58],scroll:13,scsi:[7,8,24,46,53],sdb:[5,50],sdk:[3,16,44],se0:[21,65],search:[0,2,7,9,13,14,15,16,19,22,25,29,30,41,44,45,46,51,57,58,62,63,64],search_exception_t:[29,58],second:[5,6,7,9,10,12,14,15,18,23,29,33,35,36,41,42,45,46,47,48,50,51,52,58,60],secondari:13,seconds_valu:[6,48],section:[1,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,22,23,24,27,28,29,39,41,42,43,44,45,46,47,48,49,50,51,52,53,58,59,60,61,62,64],sector1:35,sector2:35,sector3:35,sector:[5,9,35,50,51],sector_t:[5,10,22,50,52,64],secur:[0,7,26,30,46,54,57],see:[0,1,2,4,5,6,7,8,9,10,12,13,14,15,16,17,18,21,23,24,27,28,29,30,34,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,57,58,59,60,61,65],seed:35,seek:[7,39,46],seem:[7,28,46,61],seen:[2,5,7,8,12,14,16,18,23,27,28,37,42,44,45,46,47,50,59,60,61],segment:[0,5,14,16,23,25,26,30,39,44,45,50,54,57,60,63],seldom:[28,61],select:[4,11,12,19,21,23,27,30,31,43,47,57,59,60,62,65],select_task_rq:[27,59],selector:[0,23,26,38,54,60],self:[27,59],selinux:[24,53],semant:[13,41],semaphor:[13,14,24,27,45,53,59],semest:39,semestru:53,semiconductor:[4,43],send:[0,5,10,12,15,16,30,35,36,37,41,44,47,50,51,52,57],send_ev:[6,48],send_test_bio:[5,50],sender:0,sendmsg:[18,36,42],sendpag:36,sendto:[18,36,42],sens:[9,39,51],sensor:[12,23,47,60],sent:[5,6,7,9,12,14,15,17,18,36,41,42,45,46,47,48,49,50,51],sentinel:[14,45],separ:[4,6,9,17,24,27,28,29,35,43,48,49,51,53,58,59,61],septemb:39,seq:[18,42],seq_fil:[17,49],seq_printf:[17,49],sequenc:[6,7,10,12,13,18,23,27,29,42,46,47,48,52,58,59,60],sequenti:[7,46],sergiu:[2,53],seri:[5,15,30,41,50,57],serial:[0,2,3,7,8,13,15,23,24,27,28,34,41,46,53,59,60,61],serio:[12,47],seriou:[10,14,45,52],serv:[7,19,46,62],server:[13,18,24,41,42,53],servic:[3,8,23,24,26,29,53,54,58,60],session:[13,15,16,39,41,44],set:[0,3,4,6,7,8,9,10,12,13,1
4,15,16,17,18,19,21,22,23,24,25,27,28,29,30,34,37,38,39,41,42,43,44,45,46,47,48,49,51,52,53,57,58,59,60,61,62,63,64,65],set_bit:[10,14,28,45,52,61],set_buffer_new:[10,52],set_capac:[5,35,50],set_current_st:[7,14,27,28,45,46,59,61],set_fixmap:[19,62],set_fixmap_nocach:[19,62],set_page_dirti:[10,22,52,64],set_pt:[19,62],set_task_cpu:[27,59],set_thread_area:[19,62],setattr:[10,15,41,52],setattr_copi:[10,52],setattr_prepar:[10,52],setpagereserv:[17,49],setsockopt:36,setting_up_long_mod:38,setup:[0,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,24,25,27,29,32,41,42,43,44,45,46,47,48,49,50,51,52,53,58,59,62,63,65],setup_tim:[23,60],seven:31,sever:[4,7,9,10,12,14,15,18,21,23,24,38,41,42,43,45,46,47,51,52,53,60,65],sgi:[15,41],sha256:2,shadow:[0,21,22,27,59,64,65],shall:[22,64],shallow:[27,59],share:[0,4,5,7,8,9,10,12,14,15,16,17,18,19,22,23,27,28,31,38,39,41,42,43,44,45,46,47,49,50,51,52,59,60,61,62,64],sheet:[23,60],shell:[2,4,15,18,41,42,43],shift:[12,17,47,49],ship:[24,53],shmem:[22,64],shortcut:[13,41],should:[0,1,4,5,6,7,8,10,12,14,15,16,17,18,19,23,24,27,28,29,30,31,32,33,34,35,36,38,39,41,42,43,44,45,46,47,48,49,50,52,53,57,58,59,60,61,62],show:[2,3,6,7,8,9,12,13,14,15,16,17,18,19,21,22,23,27,38,41,42,44,45,46,47,48,49,51,59,60,62,64,65],show_opt:[9,51],showkei:[12,47],shown:[4,8,10,12,13,14,15,18,21,30,33,41,42,43,45,47,52,57,65],shut:[7,14,45,46],shutdown:[8,36],si_addr:[16,44],si_cod:[16,44],si_kernel:[16,44],si_signo:[16,44],sibl:[28,61],side:[1,22,28,61,64],sifiv:[30,57],sifive_uart:[30,57],sifive_uart_cr:[30,57],sifive_uart_div:[30,57],sifive_uart_get_rxcnt:[30,57],sifive_uart_get_txcnt:[30,57],sifive_uart_i:[30,57],sifive_uart_ie_rxwm:[30,57],sifive_uart_ie_txwm:[30,57],sifive_uart_ip:[30,57],sifive_uart_ip_rxwm:[30,57],sifive_uart_ip_txwm:[30,57],sifive_uart_max:[30,57],sifive_uart_rxctrl:[30,57],sifive_uart_rxfifo:[30,57],sifive_uart_txctrl:[30,57],sifive_uart_txfifo:[30,57],sifiveuartst:[30,57],sign:31,signal:[7,16,23,27,28,30,35,44,46,57,59,60,61],signal_pendi
ng_st:[27,59],signatur:[7,12,17,24,46,47,49,53],signifi:[10,52],signific:[6,12,14,16,18,19,24,28,42,44,45,47,48,53,61,62],significantli:[15,24,29,41,53,58],sigsegv:[16,44],silent:[9,39,51],similar:[6,7,8,9,10,12,13,14,15,18,21,23,28,29,35,39,41,42,45,46,47,48,51,52,58,60,61,65],similarli:[7,9,13,15,41,46,51],simpl:[0,5,6,7,8,9,10,13,14,15,16,17,21,22,24,25,28,31,36,37,38,41,44,45,46,48,49,50,51,52,53,61,63,64,65],simple_:[10,52],simple_device_id:[4,43],simple_dir_inode_oper:[9,51],simple_dir_oper:[9,10,51,52],simple_driv:[4,43],simple_lookup:[10,52],simple_statf:[9,51],simpler:[13,15,22,27,28,41,59,61,64],simplest:[1,3,5,29,50,58],simpli:[2,3,9,13,14,24,41,45,51,53],simplic:[12,37,47],simplifi:[4,5,7,8,9,22,23,24,27,29,32,33,34,35,36,37,38,43,46,50,51,53,58,59,60,64],simplist:[15,41],simqueu:38,simqueue_t:38,simul:[6,7,15,46,48],simultan:[24,28,53,61],simvirtio:38,sin_addr:[18,42],sin_famili:[18,42],sin_port:[18,42],sinc:[1,5,6,8,9,12,13,14,15,16,17,18,19,21,22,23,24,25,27,28,29,33,35,38,41,42,44,45,47,48,49,50,51,53,58,59,60,61,62,63,64,65],sincronizar:53,singl:[4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,25,27,28,30,35,37,41,42,43,44,45,46,47,48,49,50,51,52,57,59,61,62,63],single_open:[17,49],singular:[19,62],sis:[22,64],sistem:[37,53],sit0:2,sit:[2,7,46],site:[13,37,39,41],situat:[8,12,13,14,15,18,28,39,41,42,45,47,61],size:[4,5,7,9,10,12,13,14,16,17,18,19,21,22,24,25,27,28,29,30,31,33,35,37,38,41,42,43,44,45,46,47,49,50,51,52,53,57,58,59,61,62,63,64,65],size_t:[5,7,8,10,17,18,31,42,46,49,50,52],sizeof:[7,9,14,18,21,22,26,29,30,42,45,46,51,54,57,58,64,65],sk_:[18,42],sk_alloc:[18,42],sk_backlog_rcv:[18,42],sk_buff:[0,36],sk_buff_data_t:[18,26,42,54],sk_can_reus:[18,42],sk_data_readi:[18,42],sk_destruct:[18,42],sk_error_report:[18,42],sk_no_check_rx:[18,42],sk_no_check_tx:[18,42],sk_pad:[18,42],sk_prot:[18,42],sk_protocol:[18,42],sk_reus:[18,42],sk_send_head:[18,42],sk_socket:[18,42],sk_state_chang:[18,42],sk_type:[18,42],sk_userlock:[18,42],sk_write_spac:[18,42],skb:[
0,18,42],skb_clone:[18,42],skb_mac_head:[26,54],skb_mac_header_was_set:[26,54],skb_mstamp:[18,42],skb_network_head:[26,54],skb_pull:[26,54],skb_push:[26,54],skb_put:[26,54],skb_reserv:[26,54],skb_reset_mac_head:[26,54],skb_reset_network_head:[26,54],skb_reset_transport_head:[26,54],skb_set_mac_head:[26,54],skb_set_network_head:[26,54],skb_set_transport_head:[26,54],skb_transport_head:[26,54],skb_trim:[26,54],skbuff:[18,42],skel:[4,5,6,7,8,9,10,11,12,14,15,16,17,18,21,37,41,42,43,44,45,46,47,48,49,50,51,52,65],skeleton:[0,1,4,5,6,7,8,9,10,11,12,14,15,16,17,18,32,33,34,35,36,39,41,42,43,44,45,46,47,48,49,50,51,52],skelton:11,skill:[5,13,16,18,35,36,39,41,42,44,50],skip:[5,50],skthread:[6,48],slab:[0,10,14,25,45,52,63],slab_alloc:13,slash:[13,41],sleep:[0,3,5,7,16,17,18,23,27,28,42,44,46,49,50,59,60,61],slice:[14,24,27,45,53,59],slide:[13,19,20,21,22,23,24,25,26,27,28,29,30,39,41,53,54,55,56,57,58,59,60,61,62,63,64,65],slightli:[18,22,42,64],slot:[10,37,39,52],slow:[7,15,16,28,41,44,46,61],slowdown:[16,44],slower:[3,12,21,47,65],slowest:[16,44],slub:[21,65],slub_debug:0,smack:[24,53],small:[0,6,7,8,19,24,26,27,28,38,46,48,53,54,59,61,62],smaller:[7,10,15,24,25,26,41,46,52,53,54,63],smallest:[5,50],smap:[29,58],smbf:[9,51],smp:[0,2,14,19,21,27,28,45,59,61,62,65],smp_apic_timer_interrupt:[21,65],smp_mb:[27,28,59,61],smp_processor_id:[19,62],smp_rmb:[28,61],smp_wmb:[28,61],snif:[28,61],snippet:[5,12,18,19,27,38,42,47,50,59,62],snoop:[28,61],snprintf:[5,8,50],so2:[0,2,3,4,8,9,13,16,27,31,37,40],so2_cdev:[7,46],so2_cdev_test:[7,46],so2_dev:[7,46],so2_device_data:[7,46],so2_oops_exit:[21,65],so2_oops_init:[21,65],so2_panic_init:[21,65],soc:[4,30,43,57],sock1:[18,42],sock2:[18,42],sock:[0,26,54],sock_:[18,42],sock_cloexec:[18,42],sock_creat:[18,42],sock_create_kern:[18,42],sock_create_lit:[18,42],sock_dgram:[18,26,42,54],sock_init_data:[18,42],sock_map_fd:[18,42],sock_max:[18,42],sock_no_:36,sock_no_accept:36,sock_no_getnam:36,sock_no_getsockopt:36,sock_no_ioctl:36,sock_no_li
sten:36,sock_no_mmap:36,sock_no_sendpag:36,sock_no_setsockopt:36,sock_no_shutdown:36,sock_no_socketpair:36,sock_nonblock:[18,42],sock_nospac:[18,42],sock_recvmsg:[18,42],sock_regist:[18,42],sock_releas:[18,42],sock_sendmsg:[18,42],sock_stream:[18,42],sock_type_mask:[18,42],sock_unregist:[18,42],sockaddr:[18,26,42,54],sockaddr_in:[18,26,42,54],sockaddr_len:[18,42],sockaddr_storag:[18,42],sockaddr_stp:36,socket:[0,3,24,27,36,53,59],socket_st:[18,42],socket_wq:[18,42],socketpair:[18,36,42],sockf:[9,51],sockfd_lookup_light:[18,42],sofirq:[6,48],sofitrq:[6,48],soft:0,softirq:[0,19,21,23,28,60,61,62,65],softirq_bit:[28,61],softirq_mask:[28,61],softirq_offset:[28,61],softmmu:[4,43],softwar:[0,2,4,8,16,21,24,39,40,43,44,53,65],sole:39,solut:[5,10,14,16,23,25,28,30,35,39,44,45,50,52,57,60,61,63],solv:[0,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,29,39,41,42,43,44,45,46,47,48,49,50,51,52,58],some:[1,2,5,6,7,8,9,10,12,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,29,30,33,39,41,42,44,45,46,47,48,49,50,51,52,53,55,57,58,59,60,61,62,63,64,65],someon:[1,12,16,39,44,47],someth:[8,12,17,47,49],sometim:[7,13,22,23,24,25,27,41,46,53,59,60,63,64],soon:[6,7,28,46,48,61],sophist:[4,43],sort:[5,22,24,50,53,64],sound:[4,7,8,24,29,43,46,53,58],sourc:[0,1,4,5,6,7,8,9,10,11,12,14,15,16,17,18,20,23,26,27,31,33,34,35,36,37,39,42,43,44,45,46,47,48,49,50,51,52,54,55,59,60],sourceforg:[13,41],sourceweb:0,space:[0,4,5,6,8,12,13,14,15,16,17,20,21,22,25,27,34,35,38,40,41,43,44,45,47,48,49,50,55,59,63,64,65],span:[22,25,63,64],sparc:[24,53],spars:[21,32,33,34,35,36,37,38,65],spdx:31,special:[5,6,7,9,10,12,14,15,18,19,21,22,23,24,25,26,27,28,29,30,34,37,39,41,42,45,46,47,48,50,51,52,53,54,57,58,59,60,61,62,63,64,65],specif:[0,5,6,7,8,9,10,11,12,13,14,15,16,18,24,25,26,27,28,29,33,34,36,39,41,42,44,45,46,47,48,50,51,52,53,54,58,59,61,63],specifi:[5,6,7,8,9,10,12,13,14,15,16,18,19,22,34,36,39,41,42,44,45,46,47,48,50,51,52,62,64],specul:[27,59],speed:[5,7,15,16,18,19,41,42,44,46,50,62],spelunk:0,spend:[16,23,
44,60],spent:[16,28,44,61],spi:[4,12,43,47],spike:[16,44],spin:[0,14,24,45,53],spin_lock:[6,12,14,21,28,45,47,48,61,65],spin_lock_bh:[6,28,48,61],spin_lock_init:[12,14,45,47],spin_lock_irq:[12,47],spin_lock_irqrestor:[28,61],spin_lock_irqsav:[12,27,28,47,59,61],spin_lock_restor:[28,61],spin_unlock:[6,12,14,21,27,28,45,47,48,59,61,65],spin_unlock_bh:[6,28,48,61],spin_unlock_irq:[12,47],spin_unlock_irqrestor:[12,27,47,59],spinlock:[0,6,12,27,47,48,59],spinlock_t:[5,6,12,13,14,22,45,47,48,50,64],split:[0,5,13,19,24,25,26,27,28,38,41,50,53,54,59,61,62,63],spot:39,spread:[9,51],sprintf:8,sram:[4,43],src:[1,2,10,11,13,15,21,26,28,33,34,35,36,41,52,54,61,65],ss_connect:[18,42],sscanf:8,ssd:[22,64],ssh:[0,7,11,16,18,42,44,46],sshd:[16,44],ssignment:31,ssize_t:[7,8,22,31,46,64],ssr:35,ssr_first_minor:35,ssr_major:35,stabil:[27,59],stabl:[15,41],stac:[27,59],stack:[0,9,14,15,16,17,18,19,21,22,26,27,29,35,36,38,42,44,45,49,51,54,58,59,62,64,65],stack_canari:[19,27,59,62],stack_canary_offset:[27,59],stackexchang:39,stacktrac:15,stage:[0,11,15,41],stand:[13,14,41,45],standard:[3,4,5,6,7,8,12,14,15,18,21,24,34,36,41,42,43,45,46,47,48,50,53,65],start:[0,1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,26,27,28,32,33,34,35,36,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,54,59,60,61,62,64,65],start_kernel:[2,21,65],startup:[5,8,50],startup_32_smp:[2,21,65],starv:[16,44],starvat:[23,60],stash:[4,5,6,7,8,9,10,12,14,15,16,17,18,41,42,43,44,45,46,47,48,49,50,51,52],stat:[9,12,16,25,27,44,47,51,59,63],state:[0,2,5,6,7,8,12,13,14,15,18,21,22,24,25,28,29,30,39,41,42,45,46,47,48,50,53,57,58,61,63,64,65],state_add_uevent_s:8,state_in_sysf:8,state_initi:8,state_remove_uevent_s:8,statement:[0,23,27,29,39,58,59,60],statf:[9,22,51,64],station:[15,41],statist:[0,9,16,22,28,36,44,51,61,64],statm:[27,59],statu:[1,4,5,6,7,8,9,10,12,13,14,15,16,17,18,21,24,25,27,38,41,42,43,44,45,46,47,48,49,50,51,52,53,59,63,65],status_reg:[12,47],stdout:38,stduent:[13,41],stefan:[30,57],step:[1,3,4,5,6,7,8
,9,10,11,12,13,14,15,16,17,18,21,22,28,38,41,42,43,44,45,46,47,48,49,50,51,52,61,64,65],sti:[2,23,28,60,61],still:[1,3,5,7,12,13,16,23,24,27,28,44,46,47,50,53,59,60,61],stone:[24,53],stop:[2,4,5,6,7,8,9,10,12,14,15,16,17,18,21,41,42,43,44,45,46,47,48,49,50,51,52,65],storag:[4,5,18,19,22,42,43,50,62,64],store:[0,4,5,7,8,9,10,13,14,15,17,18,19,22,23,25,27,28,29,30,32,33,34,35,36,37,38,41,42,43,45,46,49,50,51,52,57,58,59,60,61,62,63,64],stp:36,stp_bind:36,stp_connect:36,stp_header:36,stp_op:36,stp_proc_full_filenam:36,stp_recvmsg:36,stp_releas:36,stp_sendmsg:36,stp_stat:36,str:[15,41],strace:[16,44],straight:[24,53],strap:0,strategi:[28,61],strcat:[14,45],strchr:[14,45],strcmp:[10,14,45,52],strcpy:[14,45],stream:[16,29,44,58],strength:[16,44],strex:[28,61],strict:[24,53],stricter:[15,41],strictli:[3,15,41],string:[0,4,8,9,10,12,13,15,18,24,27,32,37,41,42,43,47,51,52,53,59],string_len:[14,45],strip:[4,43],strlcat:[14,45],strlcpy:[14,45],strlen:[8,14,45],strncat:[14,45],strnchr:[14,45],strncmp:[8,14,45],strncpy:[14,45],strnicmp:[14,45],strongli:[4,5,6,7,8,9,10,12,14,15,16,17,18,41,42,43,44,45,46,47,48,49,50,51,52],strrchr:[14,45],strstr:[14,45],struc:[13,41],struct:[0,6,8,10,12,13,14,15,19,20,21,22,24,25,28,29,31,33,35,36,37,38,41,45,47,48,52,53,55,58,61,62,63,64,65],structur:[0,4,6,12,13,14,15,22,23,24,25,27,28,31,33,35,36,37,41,43,45,47,48,53,59,60,61,63,64],structura:53,structurii:53,struggl:1,stub:36,student:[0,1,9,10,13,15,36,38,39,41,51,52],studi:[13,16,41,44],stutter:[16,44],style:[0,24,32,33,34,35,36,37,38,53],sub:[0,21,24,53,65],sub_preempt_count:[28,61],subcommand:[16,44],subdirectori:[8,9,10,13,51,52],subgroup:39,subiect:53,subject:[16,44],submiss:[5,32,33,34,35,36,39,50],submit:[0,24,53],submit_bio:[5,35,50],submit_bio_wait:[5,35,50],subnet:[26,54],subsequ:[1,3,6,10,25,28,48,52,61,63],subsys_priv:8,subsystem:[5,6,7,8,9,10,13,15,17,18,22,23,24,25,33,35,36,41,42,46,48,49,50,51,52,53,60,63,64],subtask:38,subtract:39,subtre:[9,51],succ:53,succe:[28,61],success:[4
,7,8,9,12,14,18,27,28,29,42,43,45,46,47,51,58,59,61],successfulli:[5,7,8,9,13,46,50,51],sudo:[3,4,13,32,33,34,35,36,37,38,43],suffic:[6,48],suffici:[6,9,13,17,28,41,48,49,51,61],suffix:[15,41],sugestii:53,suggest:[16,24,44,53],suit:[16,28,37,44,61],suitabl:[8,9,21,51,65],sum:[16,27,39,44,59],sum_of_assignment_scor:39,summar:[23,60],summari:[0,13,14,18,19,21,22,27,29,41,42,45,58,59,62,64,65],summer:39,sun:[24,53],sunrpc:[15,41],super_block:[0,10,52],super_oper:[9,10,51,52],superblock:[0,10,24,37,52,53],superh:[24,53],superior:[5,24,50,53],supermodul:[15,41],superoper:0,supervisor:[19,62],suport:53,suportului:53,support:[0,3,5,7,8,9,10,12,13,14,15,16,17,18,19,20,21,22,23,24,27,28,30,32,34,35,36,37,39,41,42,44,45,46,47,49,50,51,52,53,55,57,59,60,61,62,64,65],sure:[1,2,4,5,6,7,8,9,10,12,14,15,16,17,18,23,27,28,29,35,36,37,38,41,42,43,44,45,46,47,48,49,50,51,52,58,59,60,61],survei:51,surveil:33,suspect:[21,65],suspend:[6,8,13,14,16,23,44,45,48,60],svc_tcp_accept:[15,41],svcsock:[15,41],svm:[30,57],swap:[14,17,24,25,29,45,49,53,58,63],swap_activ:[22,64],swap_deactiv:[22,64],swap_info_struct:[22,64],swapfil:[22,64],swapper:[6,21,48,65],swiss:[18,42],switch_context:[20,55],switch_mm:[27,59],switch_mm_irqs_off:[27,59],switch_to:[27,59],sym:[29,58],sym_code_end:[27,59],sym_code_start:[27,59],symbol:[0,2,3,5,6,7,8,12,13,14,15,16,22,27,29,33,37,41,44,45,46,47,48,50,58,59,64],symbol_nam:[13,41],symlink:[10,13,22,52,64],symmetr:[0,40],symver:[15,41],syn:[18,42],sync:[0,14,21,27,45,59,65],synchron:[0,5,6,12,23,24,27,33,34,35,47,48,50,53,59,60],synchronize_rcu:[28,61],syntax:[6,13,14,45,48],synthes:[16,44],sys:[4,5,8,12,15,16,21,41,43,44,47,50,65],sys_accept4:[18,42],sys_access:13,sys_bind:[18,42],sys_call_ptr_t:[29,58],sys_call_t:[13,41],sys_clos:[29,58],sys_exit:[29,58],sys_fd:38,sys_fork:[13,29,41,58],sys_getpeernam:[18,42],sys_init_modul:[15,21,41,65],sys_listen:[18,42],sys_membarri:[27,59],sys_ni_syscal:[29,58],sys_open:[29,58],sys_read:[29,58],sys_restart_syscal:[13,29,41,58]
,sys_send:[18,42],sys_sendto:[18,42],sys_socket:[18,42],sys_writ:[29,58],syscal:[23,29,58,60],syscall_cal:[21,65],syscall_define3:[18,42],syscall_define6:[18,42],syscall_return_slowpath:[29,58],syscalls_32:[29,58],sysent:[29,58],sysenter_do_cal:[15,41],sysf:[0,5,9,15,24,41,50,51,53],sysfs_dir:8,syslog:[15,41],syslogd:[15,27,41,59],sysret:[19,62],sysrq:[15,41],system:[0,3,5,6,7,8,12,13,14,15,16,17,18,21,30,31,33,35,36,37,39,41,42,44,45,46,47,48,49,50,57,65],sysv:[4,43],sysvinit:[17,49],tab:[2,27,59],tabel:53,tabl:[0,8,9,10,12,16,17,20,22,25,27,33,34,36,44,47,49,51,52,55,59,63,64],tablet:39,tag:[0,30,57],tag_list:[5,50],tag_set:[5,50],tail:[15,18,26,38,41,42,54],tailor:[24,53],taint:[12,15,21,41,47,65],take:[1,2,3,5,6,7,8,9,10,12,13,14,15,16,17,18,21,22,24,26,27,28,30,32,39,41,42,44,45,46,47,48,49,50,51,52,53,54,57,59,61,64,65],taken:[6,9,12,14,33,45,47,48,51],tap0:3,tap1:3,tap:3,tape:[7,46],target:[2,4,11,13,15,22,23,30,41,43,53,57,60,64],task:[0,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,21,23,28,29,31,41,42,43,44,45,46,47,48,49,50,51,52,58,60,61,62,65],task_addr_limit:[29,58],task_dead:[6,48],task_info:[14,45],task_info_add_for_curr:[14,45],task_info_add_to_list:[14,45],task_info_find_pid:[14,45],task_info_purge_list:[14,45],task_info_remove_expir:[14,45],task_interrupt:[6,7,27,46,48,59],task_list:[27,59],task_nam:[4,5,6,7,8,9,10,12,14,15,16,17,18,41,42,43,44,45,46,47,48,49,50,51,52],task_norm:[27,59],task_ptr_typ:[27,59],task_readi:[27,59],task_rq:[27,59],task_run:[7,27,28,46,59,61],task_stack_canari:[27,59],task_struct:[0,6,13,14,15,17,24,25,28,41,45,48,49,53,61,63],task_threadsp:[27,59],task_typ:[27,59],task_uinterrupt:[27,59],task_uninterrupt:[7,27,46,59],tasklet:[0,19,28,61,62],tasklet_dis:[6,23,48,60],tasklet_en:[6,23,48,60],tasklet_hi_schedul:[6,48],tasklet_init:[6,23,48,60],tasklet_schedul:[6,23,48,60],tasklet_sofitirq:[23,60],tasklet_softirq:[6,23,48,60],tasklet_struct:[6,48],tavi:[2,11,21,26,27,53,54,59,65],tcg:[30,57],tcp:[0,26,54],tcp_hdr:[18,42],tcp_sock:[
18,42],tcp_tsorted_anchor:[18,42],tcpdump:0,tcph:[18,42],tcphdr:[18,42],teach:[1,51],team:[0,1,16,32,33,34,35,36,38,39,44],teamwork:39,techniqu:[0,12,13,21,25,28,41,47,61,63,65],technolog:[8,27,59],tell:[10,15,41,52],tema2:31,tema:53,teme:[0,36],temel:53,temelor:0,templat:[1,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,37,41,42,43,44,45,46,47,48,49,50,51,52],temporari:[0,14,23,45,60],temporarili:[23,60],tempt:[29,58],teo:53,teodora:53,teodorescu:53,term:[0,7,8,23,28,30,46,57,60,61],termin:[6,9,15,25,41,48,51,63],terminolog:[0,18,42],ters:[32,33,34,35,36,37,38],test2:8,test:[0,4,5,6,7,8,11,12,13,15,16,17,18,21,24,28,38,39,41,42,43,44,46,47,48,49,50,53,61,65],test_addr:[18,42],test_and_change_bit:[9,14,28,45,51,61],test_and_clear_bit:[9,10,14,28,45,51,52,61],test_and_set_bit:[9,10,14,28,45,51,52,61],test_bit:[28,61],test_daddr:[18,42],test_kasan:[21,65],test_pan:[21,65],testar:53,testului:53,texa:[4,43],text:[13,15,27,29,41,58,59],tgid:[27,59],than:[2,5,7,9,10,12,13,14,15,18,19,21,23,24,25,28,29,32,33,34,35,36,37,38,41,42,45,46,47,50,51,52,53,58,60,61,62,63,65],thei:[0,1,5,6,7,8,9,10,11,12,14,15,17,18,19,22,23,24,25,27,28,29,30,36,39,41,42,45,46,47,48,49,50,51,52,53,57,58,59,60,61,62,63,64],them:[1,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,22,23,24,25,27,28,29,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,58,59,60,61,62,63,64],themselv:[6,23,48,60],theoret:0,therefor:[5,6,9,12,14,15,18,41,42,45,47,48,50,51],thermal:[23,60],thi:[0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,21,22,23,24,25,26,27,28,29,30,31,33,34,36,37,38,39,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,60,61,62,63,64,65],thing:[4,8,9,12,14,16,18,21,22,24,38,42,43,44,45,47,51,53,64,65],think:[16,44],third:[9,12,17,39,47,49,51],this_modul:[5,7,8,9,31,36,46,50,51],thoroughli:[29,58],those:[6,8,9,10,12,15,17,18,19,23,24,27,39,41,42,47,48,49,51,52,53,59,60,62],though:[10,15,19,41,52,62],thought:[16,44],thousand:[24,28,53,61],thp:[22,64],thrash:[28,61],thread:[0,7,9,12,13,14,15,19,21,23,24,28,35,41,45,46,47
,51,53,60,61,62,65],thread_a:[21,65],thread_b:[21,65],thread_fn:[12,47],thread_group:[27,59],thread_info:[20,27,55,59],thread_siz:[27,59],thread_struct:[27,59],threadfn:[6,48],three:[6,7,12,18,19,22,23,28,38,42,46,47,48,60,61,62,64],through:[0,5,7,8,12,13,15,16,17,18,19,24,26,27,30,31,32,36,38,39,41,42,44,46,47,49,50,53,54,57,59,62],throughput:[24,28,53,61],thu:[5,6,7,8,12,13,14,15,18,19,22,23,27,28,33,41,42,45,46,47,48,50,59,60,61,62,64],thursdai:35,tick:[6,13,14,20,27,45,48,55,59],ticket:[28,61],time:[0,1,2,4,5,6,7,8,9,10,12,13,14,15,17,18,19,21,22,23,24,25,27,28,29,30,34,35,37,39,41,42,43,45,46,47,48,49,50,51,52,53,57,58,59,60,61,62,63,64,65],timechart:[16,44],timeout:[6,7,27,46,48,59],timer:[0,4,12,20,21,27,28,30,43,47,55,57,59,61,65],timer_funct:[6,48],timer_list:[6,21,48,65],timer_setup:[6,21,48,65],timer_softirq:[6,23,48,60],timer_timeout:[6,48],timer_type_acct:[6,48],timer_type_alloc:[6,48],timer_type_non:[6,48],timer_type_set:[6,48],timerfn:[21,65],timerlist:[27,59],timestamp:[14,45],tini:[30,57],tip:0,tire:[15,41],tlb:[19,20,21,25,30,53,55,57,62,63,65],tldp:34,tldr:0,tls:[13,41],tmp:[3,11,14,45],to_bex_devic:8,to_my_driv:8,todo1:11,todo2:11,todo:[4,5,6,7,8,9,10,11,12,14,15,16,17,18,31,32,39,41,42,43,44,45,46,47,48,49,50,51,52],togeth:[4,5,10,16,19,22,24,27,28,37,39,43,44,50,52,53,59,61,62,64],tomoyo:[24,53],too:[12,15,16,18,19,23,36,39,41,42,44,47,60,62],tool:[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,24,27,32,33,34,35,36,37,38,41,42,43,45,46,47,48,49,50,51,52,53,59],toolchain:0,top:[0,1,4,5,6,23,24,26,27,31,32,43,48,50,53,54,59,60],topic:[0,11,19,32,33,34,35,36,39,62],torvald:[24,53],tos:[18,42],tot_len:[18,42],total:[5,9,10,12,14,16,17,22,33,39,44,45,47,49,50,51,52,64],touch:[3,9,10,51,52],toward:[24,29,53,58],trace:[0,14,15,16,21,24,44,45,53,65],trace_hardirqs_off_thunk:[21,65],trace_hardirqs_on_cal:[15,21,41,65],trace_hardirqs_on_thunk:[15,41],tracepoint:[16,21,44,65],tracer:[0,39,40,53],tracer_add_process:33,tracer_remove_process:33,track:[13,20,21,
25,28,29,31,34,36,37,55,58,61,63,65],trade:[15,41],tradit:[7,46],tradition:[7,46],traffic:[18,28,42,61],transact:[28,61],transfer:[5,7,12,16,18,19,23,27,29,34,39,42,44,46,47,50,58,59,60,62],transform:[19,24,30,53,57,62],transit:[20,27,28,29,30,39,55,57,58,59,61],translat:[0,10,12,17,21,22,30,38,47,49,52,57,64,65],transmiss:[18,34,42],transmit:[5,6,7,12,14,15,18,30,32,34,41,42,45,46,47,48,50,57],transpar:[18,42],transport:[0,18,26,39,40,42,53,54],transport_head:[18,26,42,54],trap:[23,29,30,57,58,60],trap_pf:[15,41],trapnr:[29,58],travers:[5,28,50,61],treat:[8,9,51],treatment:[6,48],tree:[0,9,10,12,13,15,18,20,21,22,24,25,27,28,32,33,34,35,36,37,38,41,42,47,51,52,53,55,59,61,63,64,65],tri:[8,13,19,21,27,28,41,59,61,62,65],trick:[12,47],tricki:[12,29,47,58],trigger:[15,16,21,22,23,28,29,30,38,41,44,57,58,60,61,64,65],troubleshoot:[13,15,36,39,41],truesiz:[18,26,42,54],truli:[28,61],truncat:[10,37,52],truncate_inode_pag:[10,52],truncate_pagecach:[10,52],truncate_sets:[10,52],try_to_wake_up:[27,59],trylock:[14,45],tss:[19,62],tstamp:[18,26,42,54],ttl:[18,42],ttwu_queu:[27,59],ttwu_runn:[27,59],tty:[6,7,16,24,44,46,48,53],ttymxc0:[4,43],ttys0:[7,15,41,46],ttys1:[7,46],ttyusb0:[15,41],ttyusb:[15,41],ttzuiv5k:2,tuesdai:[37,38],tunabl:[16,44],tune:0,ture:[13,41],turn:[5,6,15,28,39,41,48,50,61],tutori:[9,10,18,42,51,52,53],tux:[18,42],twice:[10,12,28,47,52,61],two:[0,4,5,6,7,8,9,12,13,14,15,16,17,18,19,21,22,23,24,25,27,28,29,30,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,53,57,58,59,60,61,62,63,64,65],txcnt:[30,57],txctrl:[30,57],txpkt:36,txt:[5,6,7,9,10,15,33,41,46,48,50,51,52],type:[0,2,4,5,6,7,9,10,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,35,36,37,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,58,59,60,61,62,63,64,65],type_show:8,type_sifive_uart:[30,57],typedef:[7,12,18,38,42,46,47],typic:[0,2,4,5,6,8,9,12,14,18,19,20,22,23,25,27,28,30,42,43,45,47,48,50,51,55,57,59,60,61,62,63,64],typo:1,u16:[18,38,42],u32:[18,38,42],u500:[30,57],u64:[18,42],u_int
8_t:[18,42],uaccess:[7,46],uapi:[14,18,42,45],uart16550:[31,34],uart16550_dev:31,uart16550_exit:31,uart16550_fop:31,uart16550_init:31,uart16550_interrupt:31,uart16550_ioctl:31,uart16550_ioctl_set_lin:34,uart16550_open:31,uart16550_read:31,uart16550_releas:31,uart16550_writ:31,uart:[0,4,12,30,39,40,43,47,57],uart_be_chang:[30,57],uart_can_rx:[30,57],uart_ev:[30,57],uart_ip:[30,57],uart_op:[30,57],uart_read:[30,57],uart_rx:[30,57],uart_writ:[30,57],ubuntu:[2,3,21,27,59,65],udev:[4,7,8,43,46],udevadm:8,udevinfo:8,udevmonitor:8,udevtest:8,udf:[9,51],udp:[0,36],udp_hdr:[18,42],udp_sock:[18,42],udph:[18,42],udphdr:[18,42],uevent:0,uevent_suppress:8,uid:[9,10,27,37,51,52,59],uid_t:37,uint16_t:38,uint32_t:[30,38,57],uint64_t:[23,30,38,57,60],uint8_t:[30,38,57],ultim:[15,41],ultrasparc:[24,53],umount:[3,9,10,51,52],unabl:[5,12,15,21,41,47,50,65],unalloc:[7,37,46],unalt:[7,46],unam:[2,15,41],unavoid:[27,59],unblock:[7,46],uncertainti:39,uncom:[14,16,44,45],uncompress:[13,41],undefin:[12,19,47,62],under:[4,5,8,13,16,19,24,30,35,41,43,44,50,53,57,62],underflow:[19,27,59,62],underli:[5,50],understand:[4,6,7,9,10,12,13,15,16,17,18,19,28,31,35,36,38,39,41,42,43,44,46,47,48,49,51,52,53,61,62],understood:[15,41],undesir:[23,60],unexpect:[28,61],unfortun:[22,64],unhandl:39,unicast:[26,54],unidirect:[28,61],unifi:[8,14,24,28,45,53,61],uniform:[0,12,47],unikraft:[16,44],uniniti:[15,21,41,65],union:[18,42],uniqu:[5,7,9,10,12,22,46,47,50,51,52,64],unit:[4,5,6,13,17,19,24,27,41,43,48,49,50,53,59,62],univers:0,unix:[7,9,10,14,17,22,24,45,46,49,51,52,53,64],unknown:[10,15,41,52],unless:[10,15,19,27,41,52,59,62],unlik:[8,9,10,14,15,18,19,21,22,41,42,45,51,52,62,64,65],unlink:[10,22,37,52,64],unload:[0,4,5,6,7,8,9,10,12,14,21,32,43,45,46,47,48,50,51,52,65],unlock:[7,14,18,27,28,33,42,45,46,59,61],unlock_buff:[22,64],unlock_new_inod:[10,52],unlocked_ioctl:[7,31,46],unmap:[5,21,29,50,58,65],unmask:[23,60],unmount:[0,5,22,50,64],unnecessari:[28,61],unplan:[6,48],unpredict:[14,45],unreach:[26,54]
,unreferenc:[21,65],unregist:[0,5,8,12,18,42,47,50],unregister_blkdev:[5,50],unregister_chrdev_region:[7,46],unregister_filesystem:[9,51],unregistr:[0,8],unrel:[14,39,45],unreli:[6,48],unsaf:[21,65],unshar:[24,53],unsign:[5,6,7,8,9,10,12,14,17,18,19,22,26,27,28,29,30,31,37,38,42,45,46,47,48,49,50,51,52,54,57,58,59,61,62,64],unstrip:[13,41],until:[7,10,12,14,15,16,18,22,23,25,27,28,34,39,41,42,44,45,46,47,52,59,60,61,63,64],unui:53,unus:[10,12,14,19,21,22,36,37,45,47,52,62,64,65],up_prob:33,up_probe_handl:33,upb:[16,44],updat:[0,1,2,3,5,7,10,11,12,14,15,19,22,25,27,29,30,45,46,47,50,52,57,58,59,62,63,64],update_irq:[30,57],upload:39,upon:[5,8,21,50,65],upper:[5,23,50,60],uppercas:34,uprob:[16,21,44,65],ups:[27,59],upstream:[24,53],urandom:[16,44],urg:[18,42],urg_ptr:[18,42],uri:53,usabl:13,usag:[6,10,13,14,15,16,18,21,23,28,34,41,42,44,45,48,52,60,61,65],usb:[4,8,15,24,41,43,53],usdhc:[4,43],use:[0,2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,51,52,53,54,55,57,58,59,60,61,62,63,64,65],use_after_fre:[21,65],use_after_free2:[21,65],use_before_init:[21,65],use_bio_transf:[5,50],use_pool:38,used:[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,26,27,28,29,30,33,36,37,38,39,41,42,43,44,45,46,47,48,50,51,52,53,54,55,57,58,59,60,61,62,63,64,65],useful:[0,5,10,12,13,14,15,16,18,21,28,30,35,37,41,42,44,45,47,50,52,57,61,65],usefulli:[19,62],user:[0,5,6,7,8,9,10,12,13,14,15,16,17,19,20,21,22,23,25,26,27,28,34,35,38,41,44,45,46,47,48,49,50,51,52,54,55,59,60,61,62,63,64,65],user_buff:[7,31,46],user_parameter_valid:[14,45],user_ptr:[29,58],usermod:8,usernam:13,userpac:[12,47],userspac:[0,5,6,8,22,25,27,48,50,59,63,64],uses:[3,5,6,7,8,10,12,13,14,15,17,18,19,21,22,23,24,25,26,27,28,29,30,35,37,41,42,45,46,47,48,49,50,52,53,54,57,58,59,60,61,62,63,64,65],using:[0,1,2,3,4,5,7,8,9,10,11,12,14,15,16,17,18,19,21,23,24,25,27,28,29,30,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,49,50,5
1,52,53,57,58,59,60,61,62,63,65],usr:[4,13,15,24,41,43,53],usual:[4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,23,24,28,41,42,43,44,45,46,47,48,49,50,51,52,53,60,61,62],util:[5,6,8,9,10,13,15,27,36,41,48,50,51,52,53,59],v1_minix_iget:[9,10,51,52],vacat:[28,61],vaddr:[18,19,42,62],val64:[30,57],val:[7,12,46,47],valentin:53,valhalla:[9,51],vali:53,valid:[5,7,9,10,12,16,19,21,22,27,29,30,44,46,47,50,51,52,57,58,59,62,64,65],valid_lft:2,valu:[4,5,6,7,8,9,10,12,13,14,15,16,17,18,21,23,25,27,28,29,30,34,35,37,38,41,42,43,44,45,46,47,48,49,50,51,52,57,58,59,60,61,63,65],valuabl:[15,41],value1:[4,43],value2:[4,43],vanilla:13,vari:[5,50],variabl:[0,2,3,5,6,7,8,9,10,11,12,13,15,16,17,18,24,25,27,28,29,36,38,41,42,44,46,47,48,49,50,51,52,53,58,59,61,63],variant:[6,13,28,41,48,61],varieti:[20,55],variou:[4,5,6,7,8,9,15,18,19,22,24,28,34,36,41,42,43,46,48,50,51,53,61,62,64],vax:[24,53],vcpu:[30,38,57],vda:[3,13,16,44],vdb:[5,10,13,35,50,52],vdc:[13,35],vdd:[9,13,51],vdso:[0,17,27,49,59],vec:[5,18,42,50],vector:[5,9,18,23,25,30,37,42,50,51,57,60,63],vendor:[4,43],verbos:[3,13,41],veri:[6,8,9,10,13,14,15,16,18,19,22,24,28,38,41,42,44,45,48,51,52,53,61,62,64],verif:[5,9,10,17,39,49,50,51,52],verifi:[5,6,8,9,10,12,27,29,30,32,33,34,35,36,37,38,39,47,48,50,51,52,57,58,59],verify_redzone_fre:[21,65],versa:[7,9,17,46,49,51],version:[0,1,2,3,4,5,6,7,9,10,12,13,14,15,16,17,18,27,29,30,31,36,37,38,41,42,43,44,45,46,47,48,49,50,51,52,57,58,59],vfree:[5,15,19,41,50,62],vfs:[9,10,51,52],vfs_inod:[9,51],vfs_read:[7,46],vfs_write:[16,44],vga:[3,12,15,41,47],via:[0,3,6,7,8,12,15,17,18,23,24,26,28,30,33,35,36,38,41,42,46,47,48,49,53,54,57,60,61],vice:[7,9,17,46,49,51],video:[4,31,43],view:[4,6,7,8,9,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,32,41,42,43,44,45,46,48,49,51,53,54,55,56,57,58,59,60,61,62,63,64,65],vim:[1,13,15,41],vimrc:[13,41],violat:[28,61],virt:[24,38,53],virt_to_fix:[19,62],virt_to_pag:[17,49],virt_to_phi:[17,19,49,62],virtconsol:3,virtio:[3,5,9,13,30,38,50,51,57],virtio
_crypto:3,virtiocon0:[3,15,41],virtual:[0,1,4,5,6,7,8,10,12,14,15,16,17,18,19,20,27,33,34,35,36,39,40,41,42,43,44,45,46,47,48,49,50,52,55,59,62],virtual_cpu:38,virtual_machin:38,virtualbox:[15,16,41,44],virtualizar:53,visibl:[8,14,15,16,24,27,30,41,44,45,53,57,59],visual:[16,44],vlan:[30,57],vm_area_struct:[0,13,15,25,63],vm_end:[17,49],vm_file:[17,49],vm_flag:[17,49],vm_next:[17,49],vm_op:[17,49],vm_operations_struct:13,vm_page_prot:[17,49],vm_pgoff:[17,49],vm_prev:[17,49],vm_share:[22,64],vm_start:[17,49],vm_struct:[17,25,49,63],vma:[15,17,21,41,49,65],vma_iter:[17,49],vmalloc:[5,13,17,19,24,25,49,50,53,62,63],vmalloc_area:[17,49],vmalloc_to_pag:[17,49],vmalloc_to_pfn:[17,49],vmc:[30,57],vmchecker:[13,32,33,34,35,36,37,39,53],vmexit:38,vmlinux:[2,4,13,21,27,41,43,59,65],vmlinux_symbol:[29,58],vmm:[0,30,57],vmmap:[17,49],vmware:[15,16,41,44],vmxnet:[12,47],volatil:[2,27,28,29,38,58,59,61],volum:[7,46],voluntari:[27,59],voluntarili:[6,39,48],vpcu:38,vpid:0,vscode:[13,41],vsyscal:[29,58],vsyscall_addr:[19,62],vsyscall_pag:[19,62],vsz:[16,44],vvar:[17,49],wai:[1,3,4,5,6,7,8,9,12,13,14,15,16,18,19,21,22,24,28,29,31,39,41,42,43,44,45,46,47,48,50,51,53,58,61,62,64,65],wait:[0,6,12,13,14,16,17,18,22,23,24,27,28,34,35,39,42,44,45,47,48,49,53,59,60,61,64],wait_ev:[6,7,27,46,48,59],wait_event_:[27,59],wait_event_interrupt:[7,46],wait_event_interruptible_timeout:[7,46],wait_event_timeout:[7,46],wait_list:[28,61],wait_lock:[28,61],wait_queue_entri:[27,59],wait_queue_entry_t:[27,59],wait_queue_head:[27,59],wait_queue_head_t:[7,46],waiter:[27,28,59,61],waitqueu:[27,28,59,61],waitqueue_walk_break_cnt:[27,59],wake:[0,7,9,28,46,51,61],wake_flag:[27,59],wake_q:[28,61],wake_q_add:[28,61],wake_up:[6,7,27,46,48,59],wake_up_interrupt:[7,46],wake_up_process:[6,48],wake_up_q:[28,61],wakeup:[27,59],walk:[0,19,62],wall:[15,41],want:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,27,28,39,41,42,43,44,45,46,47,48,49,50,51,52,59,61],warn:[2,14,21,30,39,45,57,65],warn_on_onc:[19,27,59,62],warranti:
[2,30,57],wast:[28,61],watch:[15,41],watchdog:[23,60],wb_err:[22,64],wbc:[9,10,22,51,52,64],wchan:[27,59],weak:[16,44],web:[13,18,41,42],websit:[13,41],wednesdai:39,week:[24,39,53],weekli:[24,53],wehrl:53,weight:39,weird:[16,44],weisz:53,welcom:1,well:[4,5,9,10,11,13,14,15,16,17,21,22,23,24,27,28,35,36,37,41,43,44,45,49,50,51,52,53,59,60,61,64,65],were:[5,6,7,12,15,16,19,21,22,27,39,41,44,46,47,48,50,59,62,64,65],weslei:53,wf_:[27,59],wf_sync:[27,59],wget:[13,36],what:[0,4,6,7,8,9,10,12,13,14,15,16,21,24,26,27,29,31,38,41,43,44,45,46,47,48,51,52,53,54,58,59,65],when:[2,3,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,22,23,24,27,28,29,30,33,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,57,58,59,60,61,62,64,65],whenc:[7,46],whenev:[14,17,27,45,49,59],where:[5,6,7,8,9,10,12,13,14,15,16,17,18,19,21,22,23,24,25,27,28,29,33,34,35,36,37,39,41,42,44,45,46,47,48,49,50,51,52,53,58,59,60,61,62,63,64,65],wherev:[9,51],whether:[12,14,16,21,29,39,44,45,47,58,65],which:[0,1,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,27,28,29,30,32,33,34,36,37,38,41,42,43,44,45,46,47,48,49,50,51,52,53,57,58,59,60,61,62,63,64,65],who:[7,39,46],whoever:[18,42],whole:[8,13,16,24,41,44,53],whom:39,whose:[6,9,10,12,13,14,15,24,28,39,41,45,47,48,51,52,53,61],why:[5,6,7,8,9,13,15,16,19,25,28,29,31,41,44,46,48,50,51,58,61,62,63],wide:[24,53],wight:[27,59],wiki:[9,37,38,51],wikibak:[9,51],wikibook:[13,41],win32:[19,62],window:[13,14,15,18,19,24,27,41,42,45,53,59,62],wine:[19,62],wish:[16,39,44],within:[5,6,7,8,9,10,13,14,15,17,18,37,39,41,42,45,46,48,49,50,51,52],without:[5,6,8,9,10,12,13,15,16,18,19,22,24,27,28,30,36,39,41,42,44,47,48,50,51,52,53,57,59,61,62,64],wmb:[28,61],woken:[7,16,23,27,44,46,59,60],won:[6,12,23,24,28,47,48,53,60,61],word:[2,12,13,18,29,41,42,47,58],work:[0,1,2,3,4,5,7,8,9,10,11,12,13,15,16,17,18,19,21,23,24,28,29,34,35,36,38,39,40,41,42,43,44,46,47,49,50,51,52,53,58,60,61,62,65],work_struct:[6,48],worker:[6,48],workload:[16,44],workqueu:[0,24,35,53],workqueue_struct
:[6,48],workspac:11,worksqueue_struct:[6,48],workstat:[9,10,51,52],world:[0,7,16,24,31,38,44,46,53],worth:[4,37,43],would:[2,12,13,14,15,18,22,24,25,27,39,41,42,45,47,53,59,63,64],wouldn:[21,65],wq_entri:[27,59],wq_flag_bookmark:[27,59],wq_flag_exclus:[27,59],wq_head:[27,59],wq_name:[7,46],wrap:[16,27,44,59],wrapper:[16,44],writ:[15,41],write:[0,8,9,10,12,14,15,16,18,19,21,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,41,42,44,45,47,51,52,53,54,57,58,59,60,61,62,63,65],write_begin:[10,22,52,64],write_end:[10,22,52,64],write_inod:[9,10,22,51,52,64],write_it:[10,52],write_lock:[14,45],write_lock_irq:[12,47],write_lock_irqsav:[12,47],write_sup:[22,64],write_unlock:[14,45],write_unlock_irq:[12,47],write_unlock_irqrestor:[12,47],writeback:[22,64],writeback_control:[9,10,22,51,52,64],writeback_index:[22,64],writepag:[10,22,52,64],writer:[14,28,45,61],written:[7,9,10,12,16,34,37,44,46,47,51,52],wrmsr:[30,57],wrong:35,wrote:[16,39,44],www:[2,13,30,38,57],x86:[0,2,3,4,11,12,13,15,24,28,29,30,41,43,47,53,57,58,61],x86_32:[19,62],x86_64:2,x86_feature_rsb_ctxsw:[27,59],xarg:[12,47],xarrai:[22,64],xcscope:[13,41],xen:0,xfs:[9,51],xor:[15,29,36,41,58],xu4:[4,43],year:[16,39,44],yes:[2,15,41],yet:[7,9,10,16,22,27,30,39,44,46,51,52,57,59,64],yield:[15,27,41,59],yocto:[2,3,4,13,43],yocto_imag:[2,3,9,51],you:[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,21,23,27,30,31,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,57,59,60,65],your:[0,1,2,4,5,6,7,13,14,15,16,17,18,31,32,33,34,35,36,37,38,39,41,42,43,44,45,46,48,49,50,51],your_branch_nam:1,your_usernam:1,yourself:[14,15,16,18,42,44,45],yourselv:39,zero:[5,7,9,10,13,23,24,27,29,35,41,46,50,51,52,53,58,59,60],zimag:[2,4,43],zone:[0,17,49],zserg:38,zzzzzzzzzzzzzzz:[21,65],zzzzzzzzzzzzzzzz:[21,65]},titles:["Linux Kernel Teaching","Contributing to linux-kernel-labs","Customizing the Virtual Machine Setup","Recommended Setup","Kernel Development on ARM","Block Device Drivers","Deferred work","Character device 
drivers","Linux Device Model","File system drivers (Part 1)","File system drivers (Part 2)","Infrastructure","I/O access and Interrupts","Introduction","Kernel API","Kernel modules","Kernel Profiling","Memory mapping","Networking","Address Space","Architecture Layer","Debugging","Filesystem Management","Interrupts","Introduction","Memory Management","Network Management","Processes","Symmetric Multi-Processing","System Calls","Virtualization","Collaboration","Assignment 0 - Kernel API","Assignment 1 - Kprobe based tracer","Assignment 2 - Driver UART","Assignment 3 - Software RAID","Assignment 4 - SO2 Transport Protocol","Assignment 5 - PITIX Filesystem","Assignment 7 - SO2 Virtual Machine Manager with KVM","SO2 - General Rules and Grading","Operating Systems 2","SO2 Lab 01 - Introduction","SO2 Lab 10 - Networking","SO2 Lab 11 - Kernel Development on ARM","SO2 Lab 12 - Kernel Profiling","SO2 Lab 02 - Kernel API","SO2 Lab 03 - Character device drivers","SO2 Lab 04 - I/O access and Interrupts","SO2 Lab 05 - Deferred work","SO2 Lab 06 - Memory Mapping","SO2 Lab 07 - Block Device Drivers","SO2 Lab 08 - File system drivers (Part 1)","SO2 Lab 09 - File system drivers (Part 2)","SO2 Lecture 01 - Course overview and Linux kernel introduction","SO2 Lecture 10 - Networking","SO2 Lecture 11 - Architecture Layer","SO2 Lecture 12 - Profiling","SO2 Lecture 12 - Virtualization","SO2 Lecture 02 - System calls","SO2 Lecture 03 - Processes","SO2 Lecture 04 - Interrupts","SO2 Lecture 05 - Symmetric Multi-Processing","SO2 Lecture 06 - Address Space","SO2 Lecture 07 - Memory Management","SO2 Lecture 08 - Filesystem Management","SO2 Lecture 09 - Kernel 
debugging"],titleterms:{"32bit":[19,62],"abstract":[22,64],"class":[8,39],"final":39,"function":[5,8,9,50,51],"import":35,"long":38,"new":[8,16,44],"pozi\u021bionar":53,"return":[23,60],"switch":[24,27,53,59],Adding:[8,13],Bus:8,The:[8,9,10,18,22,27,42,51,52,59,64],Use:[14,31,45],Useful:[5,50],Using:[2,27,38,59],VFS:[9,51],about:[13,41],acceler:[26,54],accept:[18,42],access:[7,12,14,25,27,29,45,46,47,58,59,63],action:[23,60],add:[8,31],addr2lin:[15,21,41,65],address:[7,10,18,19,23,24,42,46,52,53,60,62],alloc:[14,25,45,63],alreadi:[7,46],api:[14,32,45],arch:[20,24,53,55],architectur:[20,23,24,53,55,60],arm:[4,43],ascii:[12,47],asid:[19,62],asmp:[24,53],assig:[32,33,34,35,36,37,38],assign:[31,32,33,34,35,36,37,38,39],asymmetr:[24,53],atom:[14,28,45,61],attribut:[8,22,64],background:[6,12,47,48],bad:[16,44],barrier:[28,61],base:33,basic:[8,24,28,30,53,57,61],between:[6,48],bex:8,bex_misc:8,bibliografi:53,bio:[5,50],bitmap:[10,52],bitwis:[14,45],block:[5,6,24,27,48,50,53,59],block_device_oper:[5,50],board:[4,43],boot:[4,13,20,43,55],buffer:[6,9,12,19,34,47,48,51,62],build:1,bus:8,buse:8,cach:[9,22,28,51,61,64],call:[14,20,27,29,45,55,58,59],chang:[1,31],charact:[7,12,14,45,46,47],checker:[21,65],chip:[4,43],clangd:[13,41],classic:[30,57],clone:[27,59],close:[7,18,22,42,46,64],code:[13,20,24,38,41,53,55],coher:[28,61],collabor:31,commit:31,compil:[4,15,41,43],complet:[5,9,50,51],comunit:53,concept:[23,24,53,60],concurr:[28,61],connect:[2,3,18,42],contain:[2,27,59],content:[5,50],context:[14,23,24,27,28,45,53,59,60,61],contigu:[17,49],contribut:1,control:[6,12,23,30,38,47,48,57,60],convent:[14,45],convers:[18,42],copi:[28,61],cours:53,cpu:[4,24,28,38,43,53,61],creat:[1,5,9,10,22,50,51,52,64],creation:[18,42],cross:[13,41],cscope:[13,41],cur:53,current:[27,59],cursului:53,cursuri:53,custom:2,data:[5,7,12,22,28,46,47,50,61,64],databas:[26,54],deadlin:39,debug:[13,15,21,38,41,65],debug_pagealloc:[21,65],debugg:[2,15,41],decod:[21,65],defer:[6,48],deferr:[23,60],del:8,delet:[
5,10,22,50,52,64],demo:[16,44],dentri:[9,10,22,51,52,64],descriptor:[19,23,60,62],despr:53,destin:[18,42],destroi:[9,51],detail:[33,34,35,36],determin:[27,59],develop:[4,24,43,53],devic:[4,5,7,8,12,17,24,26,38,43,46,47,49,50,53,54],directori:[10,22,52,64],disabl:[28,61],disk:[5,13,50],dispatch:[5,50],displai:[17,18,42,49],docker:2,document:[1,13,41],driver:[5,7,8,9,10,12,17,20,24,34,46,47,49,50,51,52,53,55],dure:15,dynam:[15,29,41,58],dyndbg:[15,41],each:31,echipa:53,elf:[16,44],emul:[30,57],entri:[10,30,52,57],error:[14,15,41,45],exampl:[15,23,26,41,54,60],except:[20,23,55,60],execut:[14,24,30,45,53,57],exercis:[4,5,6,7,8,9,10,12,13,14,15,16,17,18,41,42,43,44,45,46,47,48,49,50,51,52],exit:[30,57],extend:[30,57],extra:[7,12,15,39,46,47],famili:[26,54],fault:[25,63],field:[18,30,42,57],file:[7,9,10,22,27,46,51,52,59,64],file_oper:[7,46],filesystem:[9,22,24,37,51,53,64],fill_sup:[9,51],filter:[18,42],fix:[19,62],flood:[23,60],fork:1,forward:[26,54],free:[5,50],from:[5,12,22,23,29,47,50,58,60,64],further:[5,6,9,10,12,17,18,42,47,48,49,50,51,52],gdb:[13,21,27,41,59,65],gendisk:[5,50],gener:[9,39,51],get:[10,13,41,52],github:31,gitlab:31,good:41,grade:39,guest:38,handl:[12,14,23,25,29,45,47,58,60,63],handler:[12,23,47,60],hardwar:[5,12,23,26,47,50,54,60],hello:[4,43],hierarchi:[24,53],highmem:[19,62],hotplug:8,how:[5,50],hypervisor:[30,57],imag:[2,4,43],implement:[7,8,12,26,27,29,33,34,35,36,46,47,54,58,59],increas:39,indic:[14,45],individu:31,info:[15,41],inform:[4,6,12,26,43,47,48,54],infrastructur:11,initi:[5,9,15,38,50,51],inod:[7,9,10,22,46,51,52,64],inode_oper:[10,52],insid:31,inspect:[19,23,27,59,60,62],intel:[30,57],interpret:[12,47],interrupt:[12,23,28,47,60,61],intro:[4,5,6,7,8,12,14,15,43,45,46,47,48,50],introduct:[13,24,41,53],investig:[16,44],ioctl:[6,7,46,48],irq:[20,23,55,60],iter:[10,52],kasan:[21,65],kdb:[15,41],kei:[12,47],kernel:[0,1,2,4,6,13,14,15,16,18,21,24,27,28,32,41,42,43,44,45,48,53,59,61,65],keyboard:[12,47],keyword:13,kfifo:[12,47],kill_sb:[9,
51],kmemleak:[21,65],know:[35,41],kobject:8,kprobe:33,kscope:[13,41],kvm:[30,38,57],lab:[1,4,5,6,9,10,12,13,14,15,16,17,18,41,42,43,44,45,46,47,48,49,50,51,52],labor:53,laboratori:[7,13,39,41,46],laboratorului:53,latenc:[16,44],launch:[16,44],layer:[5,20,50,55],layout:[24,53],lazi:[30,57],lectur:[19,20,21,22,23,24,25,26,27,28,29,30,39,53,54,55,56,57,58,59,60,61,62,63,64,65],level:[5,50],linear:[19,62],link:[10,52],linux:[0,1,4,8,12,13,14,18,19,22,23,24,28,29,41,42,43,45,47,53,58,60,61,62,64],list:[14,21,45,65],lista:53,listen:[18,42],load:[15,41],local:[9,51],lock:[6,12,14,28,45,47,48,61],lockdep:[21,65],look:[19,62],lookup:[10,52],lxr:[13,41],machin:[2,3,13,20,30,38,55,57],macro:[9,51],maintain:[24,53],major:[7,46],make:1,manag:[20,22,24,25,26,38,53,54,55,63,64],map:[17,19,49,62],memori:[4,14,15,17,20,21,24,25,28,43,45,49,53,55,61,63,65],merg:31,messag:[7,18,42,46],methodolog:[16,44],micro:[24,53],minf:[9,10,51,52],minicom:[15,41],minor:[7,46],misc:8,mm_struct:[17,49],mmu:[19,20,30,55,57,62],mode:38,model:[8,9,24,51,53],modul:[14,15,24,41,45,53],monitor:8,monolith:[24,53],mount:[9,22,51,64],multi:[5,24,28,50,53,61],multiprocess:[24,53],mutex:[14,28,45,61],myf:[9,10,51,52],namespac:[27,59],navig:[13,41],necesar:53,nest:[23,60],netcat:[18,42],netconsol:[15,41],netfilt:[18,26,42,54],network:[18,24,26,42,53,54],non:[17,25,27,49,59,63],notar:53,notif:8,o_nonblock:[7,46],obiectivel:53,objdump:[15,21,41,65],object:[4,5,6,7,9,10,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,32,33,34,35,36,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,57,58,59,60,61,62,63,64,65],oop:[15,21,41,65],open:[7,22,27,46,59,64],oper:[6,7,8,9,10,14,17,18,22,24,28,40,42,45,46,48,49,51,52,53,61,64],optim:[28,61],option:[15,19,41,62],order:[28,61],other:[21,65],our:[14,45],overview:[5,7,8,14,15,16,17,18,20,24,26,27,41,42,44,45,46,49,50,53,54,55,59],packag:[4,43],packet:[18,23,26,42,54,60],page:[17,19,22,25,30,49,57,62,63,64],pageabl:[24,53],panic:[21,65],parallel:[12,47],paramet:[15,29,41,58]
,paravirtu:[30,57],part:[9,10,51,52],penalti:[32,33,34,35,36,37,38,39],per:[28,61],perf:[16,21,44,65],period:[6,48],perman:[19,62],phase:[26,54],physic:[17,25,49,63],pitix:37,plagiar:39,plai:8,platform:[20,55],plug:8,pnp:8,point:39,polici:[26,54],port:[12,47],portabl:[24,53],preempt:[27,59],preemption:[28,61],preemptiv:[14,24,27,45,53,59],present:39,previou:[27,59],printk:[14,15,41,45],prioriti:[23,60],probe:8,problem:[16,44],proc:[15,41],process:[5,6,7,14,20,24,26,27,28,45,46,48,50,53,54,55,59,61],processor:[28,61],procf:[17,49],profil:[16,44,56],programm:[23,60],proto_op:[18,42],protocol:[26,36,54],pull:1,qemu:[4,13,30,43,57],queri:[22,64],question:[32,33,34,35,36,37],queue:[5,7,38,46,50],quickstart:[32,33,34,35,36],quiz:[23,27,59,60],raid:35,ram:[5,50],rcu:[28,61],read:[5,6,7,9,10,12,17,18,22,28,42,46,47,48,49,50,51,52,61,64],real:38,rebuild:2,receiv:[18,42],recommend:3,redo:39,reduc:[16,44],refer:[13,41],regist:[5,7,8,9,12,46,47,50,51],registr:[5,7,46,50],regular:[10,52],releas:[7,46],remark:[13,41],remov:8,repositori:1,request:[1,5,12,23,47,50,60],reset:[12,47],resourc:[27,32,33,34,35,36,37,59],respons:[16,44],restrict:[7,46],resurs:53,retak:39,review:31,root:[9,51],rootf:[4,43],rout:[26,54],routin:[12,47],rule:39,run:38,sampl:36,scalabl:[24,53],scancod:[12,47],scheme:34,script:[27,59],search:[10,52],secur:[24,53],segment:[19,62],selector:[19,62],send:[18,26,42,54],sender:[18,42],serial:[12,47],set:[5,50],setup:[2,3,20,38,55],shadow:[30,57],share:[6,24,29,48,53,58],simpl:[4,43],sk_buff:[18,26,42,54],skb:[26,54],skeleton:[31,38],slab:[21,65],sleep:[14,45],slub_debug:[21,65],small:[25,63],smp:[23,24,53,60],so2:[36,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65],sock:[18,42],socket:[18,26,42,54],soft:[23,60],softirq:[6,48],softwar:[5,26,30,35,50,54,57],sourc:[13,24,28,41,53,61],sourceweb:[13,41],space:[7,10,18,19,24,29,42,46,52,53,58,62],specif:[20,55],spelunk:13,spin:[28,61],spinlock:[14,45],split:31,ssh:2,stack:[13,23,24,41,53,6
0],stage:[5,50],start:[3,31],state:[27,59],statement:[32,33,34,35,36,37],statist:[12,47],store:[12,47],strap:[20,55],string:[14,45],struct:[5,7,9,17,18,26,27,42,46,49,50,51,54,59],structur:[1,5,7,8,9,10,17,18,30,38,42,46,49,50,51,52,57],sub:[15,41],submit:[5,32,33,34,35,36,37,38,39,50],summari:[23,60],super_block:[9,51],superblock:[9,22,51,64],superoper:[10,52],support:[4,43],symbol:[10,52],symmetr:[24,28,53,61],sync:[30,57],synchron:[14,28,45,61],sysf:8,system:[4,9,10,19,20,22,23,24,27,28,29,40,43,51,52,53,55,58,59,60,61,62,64],tabl:[19,23,26,29,30,54,57,58,60,62],tag:[5,50],task:[24,27,38,53,59],task_struct:[27,59],tasklet:[6,23,48,60],tcp:[18,42],tcpdump:36,teach:0,team:31,techniqu:[26,54],teme:53,temelor:53,temporari:[19,62],term:[24,53],terminolog:[23,60],test:[9,10,14,32,33,34,35,36,37,45,51,52],thi:[13,41],thread:[6,16,20,27,44,48,55,59],through:[10,52],time:[16,20,44,55],timer:[6,23,48,60],tip:[32,33,34,35,36,37,38],tldr:38,tool:[16,21,44,65],toolchain:[4,43],top:[16,44],trace:[13,41],tracer:33,translat:[19,62],transport:36,tree:[4,43],tune:[16,44],type:[8,30,57],typic:[24,53],uart:34,udp:[18,26,42,54],uevent:8,uniform:[25,63],unload:[15,41],unmount:[9,51],unregist:[7,9,46,51],unregistr:[7,46],updat:[28,61],use:[5,50],used:[17,49],useful:[9,51],user:[18,24,29,42,53,58],userspac:[12,17,47,49],using:[6,13,48],variabl:[14,45],vdso:[29,58],version:8,via:2,virtual:[2,3,9,13,22,24,25,29,30,38,51,53,57,58,63,64],vm_area_struct:[17,49],vmm:38,vpid:[30,57],wait:[5,7,46,50],wake:[27,59],what:[23,60],work:[6,14,31,45,48],workqueu:[6,23,48,60],world:[4,43],write:[5,7,17,22,46,49,50,64],x86:[19,23,27,59,60,62],xen:[30,57],zone:[25,63]}}) \ No newline at end of file diff --git a/refs/pull/405/merge/so2/assign-collaboration.html b/refs/pull/405/merge/so2/assign-collaboration.html new file mode 100644 index 00000000..4e5bbb7b --- /dev/null +++ b/refs/pull/405/merge/so2/assign-collaboration.html @@ -0,0 +1,332 @@ + + + + + + Collaboration — The Linux Kernel documentation + 
+ + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Collaboration

+

Collaboration is essential in the open source world, and we encourage you +to pick a team partner to work on selected assignments.

+

Here is a simple guide to get you started:

+
+

1. Use GitHub / GitLab

+

The best way to share your work inside the team is to use a version control system (VCS) +in order to track each change. Keep in mind that you must make your repo private and only grant +read/write access rights to team members.

+
+
+

2. Start with a skeleton for the assignment

+

Add init/exit functions, driver operations and global structures that your driver might need.

+
// SPDX-License-Identifier: GPL-2.0
+/*
+ * uart16550.c - UART16550 driver
+ *
+ * Author: John Doe <john.doe@mail.com>
+ * Author: Ionut Popescu <ionut.popescu@mail.com>
+ */
+struct uart16550_dev {
+   struct cdev cdev;
+   /*TODO */
+};
+
+static struct uart16550_dev devs[MAX_NUMBER_DEVICES];
+
+static int uart16550_open(struct inode *inode, struct file *file)
+{
+    /*TODO */
+    return 0;
+}
+
+static int uart16550_release(struct inode *inode, struct file *file)
+{
+   /*TODO */
+   return 0;
+}
+
+static ssize_t uart16550_read(struct file *file,  char __user *user_buffer,
+                              size_t size, loff_t *offset)
+{
+      /*TODO */
+}
+
+static ssize_t uart16550_write(struct file *file,
+                               const char __user *user_buffer,
+                               size_t size, loff_t *offset)
+{
+     /*TODO */
+}
+
+static long
+uart16550_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+      /*TODO */
+      return 0;
+}
+
+static const struct file_operations uart16550_fops = {
+       .owner          = THIS_MODULE,
+       .open           = uart16550_open,
+       .release        = uart16550_release,
+       .read           = uart16550_read,
+       .write          = uart16550_write,
+       .unlocked_ioctl = uart16550_ioctl
+};
+
+static int __init uart16550_init(void)
+{
+  /* TODO: */
+}
+
+static void __exit uart16550_exit(void)
+{
+   /* TODO: */
+}
+
+module_init(uart16550_init);
+module_exit(uart16550_exit);
+
+MODULE_DESCRIPTION("UART16550 Driver");
+MODULE_AUTHOR("John Doe <john.doe@mail.com>");
+MODULE_AUTHOR("Ionut Popescu <ionut.popescu@mail.com>");
+
+
+
+
+

3. Add a commit for each individual change

+

The first commit must always be the skeleton file, and the rest of the code should be on top of the skeleton file. +Please write a good commit message. Explain briefly what the commit does and why it is necessary.

+

Follow the seven rules of writing a good commit message: https://cbea.ms/git-commit/#seven-rules

+
Commit 3c92a02cc52700d2cd7c50a20297eef8553c207a (HEAD -> tema2)
+Author: John Doe <john.doe@mail.com>
+Date:   Mon Apr 4 11:54:39 2022 +0300
+
+  uart16550: Add initial skeleton for assignment #2
+
+  This adds simple skeleton file for uart16550 assignment. Notice
+  module init/exit callbacks and file_operations dummy implementation
+  for open/release/read/write/ioctl.
+
+  Signed-off-by: John Doe <john.doe@mail.com>
+
+
+
+
+

4. Split the work inside the team

+

Add TODOs with each team member's tasks. Try to split the work evenly.

+

Before starting to code, make a plan. On top of your skeleton file, add TODOs with each member's tasks. Agree on global +structures and the overall driver design. Then start coding.

+
+
+

5. Do reviews

+

Create Pull Requests with your commits and go through review rounds with your team members. You can follow How to create a PR video.

+
+
+

6. Merge the work

+

The final work is the result of merging all the pull requests. Following the commit messages +one should clearly understand the progress of the code and how the work was managed inside the team.

+
f5118b873294 uart16550: Add uart16550_interrupt implementation
+2115503fc3e3 uart16550: Add uart16550_ioctl implementation
+b31a257fd8b8 uart16550: Add uart16550_write implementation
+ac1af6d88a25 uart16550: Add uart16550_read implementation
+9f680e8136bf uart16550: Add uart16550_open/release implementation
+3c92a02cc527 uart16550: Add skeleton for SO2 assignment #2
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/assign0-kernel-api.html b/refs/pull/405/merge/so2/assign0-kernel-api.html new file mode 100644 index 00000000..268115fe --- /dev/null +++ b/refs/pull/405/merge/so2/assign0-kernel-api.html @@ -0,0 +1,307 @@ + + + + + + Assignment 0 - Kernel API — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Assignment 0 - Kernel API

+
    +
  • Deadline: Monday, 25 March 2024, 23:59
  • +
+
+

Assignment's Objectives

+
    +
  • getting familiar with the qemu setup
  • +
  • loading/unloading kernel modules
  • +
  • getting familiar with the list API implemented in the kernel
  • +
  • have fun :)
  • +
+
+
+

Statement

+

Write a kernel module called list (the resulting file must be called list.ko) which stores data (strings) +in an internal list.

+

It is mandatory to use the list API +implemented in the kernel. +For details you can take a look at the laboratory 2.

+

The module exports a directory named list to procfs. The directory contains two files:

+
    +
  • management: with write-only access; is the interface for transmitting commands to the kernel module
  • +
  • preview: with read-only access; is the interface through which the internal contents of the kernel list can be viewed.
  • +
+

The code skeleton implements the two procfs files. +You will need to create a list and implement support for adding and reading data. Follow the TODOs in the code for details.

+

To interact with the kernel list, you must write commands (using the echo command) in the /proc/list/management file:

+
    +
  • addf name: adds the name element to the top of the list
  • +
  • adde name: adds the name element to the end of the list
  • +
  • delf name: deletes the first appearance of the name item from the list
  • +
  • dela name: deletes all occurrences of the name element in the list
  • +
+

Viewing the contents of the list is done by viewing the contents of the /proc/list/preview file (use the cat command). +The format contains one element on each line.

+
+
+

Testing

+

In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, +the assignment evaluation will be done automatically with the help of a +test script called _checker. +The test script assumes that the kernel module is called list.ko.

+
+
+

QuickStart

+

It is mandatory to start the implementation of the assignment from the code skeleton found in the list.c file. +You should follow the instructions in the README.md file of the assignment's repo.

+
+

Tips

+

To increase your chances of getting the highest grade, read and follow the Linux kernel +coding style described in the Coding Style document.

+

Also, use the following static analysis tools to verify the code:

+
    +
  • checkpatch.pl
  • +
+
$ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/list.c
+
+
+
    +
  • sparse
  • +
+
$ sudo apt-get install sparse
+$ cd linux
+$ make C=2 /path/to/your/list.c
+
+
+
    +
  • cppcheck
  • +
+
$ sudo apt-get install cppcheck
+$ cppcheck /path/to/your/list.c
+
+
+
+
+

Penalties

+

Information about assignment penalties can be found on the +General Directions page.

+

In exceptional cases (the assignment passes the tests by not complying with the requirements) +and if the assignment does not pass all the tests, the grade may decrease more than mentioned above.

+
+
+

Submitting the assignment

+

The assignment will be graded automatically using the vmchecker-next infrastructure. +The submission will be made on moodle on the course's page to the related assignment. +You will find the submission details in the README.md file of the repo.

+
+
+
+

Resources

+

We recommend that you use gitlab to store your homework. Follow the directions in +README.md file.

+
+
+

Questions

+

For questions about the topic, you can consult the mailing list archives +or you can write a question on the dedicated Teams channel.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/assign1-kprobe-based-tracer.html b/refs/pull/405/merge/so2/assign1-kprobe-based-tracer.html new file mode 100644 index 00000000..df4bb3fa --- /dev/null +++ b/refs/pull/405/merge/so2/assign1-kprobe-based-tracer.html @@ -0,0 +1,375 @@ + + + + + + Assignment 1 - Kprobe based tracer — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Assignment 1 - Kprobe based tracer

+
    +
  • Deadline: Monday, 8 April 2024, 23:59
  • +
+
+

Assignment's Objectives

+
    +
  • gaining knowledge related to the instrumentation of functions in the Linux kernel (kretprobes mechanism)
  • +
  • gaining knowledge regarding the /proc file system from the Linux kernel
  • +
  • get familiar with data structures specific to the Linux kernel (hash table and list)
  • +
+
+
+

Statement

+

Build a kernel operations surveillant.

+

With this surveillant, we aim to intercept:

+
    +
  • kmalloc and kfree calls
  • +
  • schedule calls
  • +
  • up and down_interruptible calls
  • +
  • mutex_lock and mutex_unlock calls
  • +
+

The surveillant will hold, at the process level, the number of calls for each of the above functions. +For the kmalloc and kfree calls the total quantity of allocated and deallocated memory will be +shown.

+

The surveillant will be implemented as a kernel module with the name tracer.ko.

+
+

Implementation details

+

The interception will be done by recording a sample (kretprobe) for each of the above functions. The +surveillant will retain a list/hashtable with the monitored processes and will account for +the above information for these processes.

+

For the control of the list/hashtable with the monitored processes, a char device called /dev/tracer +will be used, with major 10 and minor 42. It will expose an ioctl interface with two arguments:

+
    +
  • the first argument is the request to the monitoring subsystem:

    +
    +
      +
    • TRACER_ADD_PROCESS
    • +
    • TRACER_REMOVE_PROCESS
    • +
    +
    +
  • +
  • the second argument is the PID of the process for which the monitoring request will be executed

    +
  • +
+

In order to create a char device with major 10 you will need to use the miscdevice interface in the kernel. +Definitions of related macros can be found in the tracer.h header.

+

Since the kmalloc function is inline, in order to instrument the allocated amount of memory the __kmalloc +function will be inspected as follows:

+
    +
  • a kretprobe will be used, which will retain the amount of memory allocated and the address of the allocated memory area.
  • +
  • the .entry_handler and .handler fields in the kretprobe structure will be used to retain information about the amount of memory allocated and the address from which the allocated memory starts.
  • +
+
static struct kretprobe kmalloc_probe = {
+   .entry_handler = kmalloc_probe_entry_handler, /* entry handler */
+   .handler = kmalloc_probe_handler, /* return probe handler */
+   .maxactive = 32,
+};
+
+
+

Since the kfree function only receives the address of the memory area to be freed, in order to determine +the total amount of memory freed, we will need to determine its size based on the address of the area. +This is possible because there is an address-size association made when inspecting the __kmalloc function.

+

For the rest of the instrumentation functions it is enough to use a kretprobe.

+
static struct kretprobe up_probe = {
+   .entry_handler = up_probe_handler,
+   .maxactive = 32,
+};
+
+
+

The virtual machine kernel has the CONFIG_DEBUG_LOCK_ALLOC option enabled where the mutex_lock symbol +is a macro that expands to mutex_lock_nested. Thus, in order to obtain information about the mutex_lock +function you will have to instrument the mutex_lock_nested function.

+

Processes that have been added to the list/hashtable and that end their execution will be removed +from the list/hashtable. Also, a process will be removed from the dispatch list/hashtable following +the TRACER_REMOVE_PROCESS operation.

+

The information retained by the surveillant will be displayed via the procfs file system, in the /proc/tracer file. +For each monitored process an entry is created in the /proc/tracer file having as first field the process PID. +The entry will be read-only, and a read operation on it will display the retained results. An example of +displaying the contents of the entry is:

+
$cat /proc/tracer
+PID   kmalloc kfree kmalloc_mem kfree_mem  sched   up     down  lock   unlock
+42    12      12    2048        2048        124    2      2     9      9
+1099  0       0     0           0           1984   0      0     0      0
+1244  0       0     0           0           1221   100   1023   1023   1002
+1337  123     99    125952      101376      193821 992   81921  7421   6392
+
+
+
+
+
+

Testing

+

In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, +the assignment evaluation will be done automatically with the help of a +test script called _checker. +The test script assumes that the kernel module is called tracer.ko.

+
+
+

QuickStart

+

It is mandatory to start the implementation of the assignment from the code skeleton found in the src directory. +There is only one header in the skeleton called tracer.h. +You will provide the rest of the implementation. You can add as many *.c sources and additional *.h headers as you need. +You should also provide a Kbuild file that will compile the kernel module called tracer.ko. +Follow the instructions in the README.md file of the assignment's repo.

+
+

Tips

+

To increase your chances of getting the highest grade, read and follow the Linux kernel +coding style described in the Coding Style document.

+

Also, use the following static analysis tools to verify the code:

+
    +
  • checkpatch.pl
  • +
+
$ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/tracer.c
+
+
+
    +
  • sparse
  • +
+
$ sudo apt-get install sparse
+$ cd linux
+$ make C=2 /path/to/your/tracer.c
+
+
+
    +
  • cppcheck
  • +
+
$ sudo apt-get install cppcheck
+$ cppcheck /path/to/your/tracer.c
+
+
+
+
+

Penalties

+

Information about assignment penalties can be found on the +General Directions page. In addition, the following +elements will be taken into account:

+
    +
  • -2: missing proper disposal of resources (kretprobes, entries in /proc)
  • +
  • -2: data synchronization issues for data used by multiple executing instances (e.g. the list/hashtable)
  • +
+

In exceptional cases (the assignment passes the tests but it is not complying with the requirements) +and if the assignment does not pass all the tests, the grade may decrease more than mentioned above.

+
+
+

Submitting the assignment

+

The assignment will be graded automatically using the vmchecker-next infrastructure. +The submission will be made on moodle on the course's page to the related assignment. +You will find the submission details in the README.md file of the repo.

+
+
+
+

Resources

+ +

We recommend that you use gitlab to store your homework. Follow the directions in +README.

+
+
+

Questions

+

For questions about the topic, you can consult the mailing list archives +or you can write a question on the dedicated Teams channel.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/assign2-driver-uart.html b/refs/pull/405/merge/so2/assign2-driver-uart.html new file mode 100644 index 00000000..cc1a9317 --- /dev/null +++ b/refs/pull/405/merge/so2/assign2-driver-uart.html @@ -0,0 +1,354 @@ + + + + + + Assignment 2 - Driver UART — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Assignment 2 - Driver UART

+
    +
  • Deadline: Monday, 22 April 2024, 23:59
  • +
  • The assignment is individual
  • +
+
+

Assignment's Objectives

+
    +
  • consolidating the knowledge of device drivers
  • +
  • read hardware documentation and track the desired functionality in the documentation
  • +
  • work with interrupts; use of non-blocking functions in interrupt context
  • +
  • use of buffers; synchronization
  • +
  • kernel modules with parameters
  • +
+
+
+

Statement

+

Write a kernel module that implements a driver for the serial port (UART16550). +The device driver must support the two standard serial ports in a PC, COM1 and COM2 (0x3f8 and 0x2f8, +in fact the entire range of 8 addresses 0x3f8-0x3ff and 0x2f8-0x2ff specific to the two ports). +In addition to the standard routines (open, read, write, close), +the driver must also have support for changing communication parameters using an ioctl operation (UART16550_IOCTL_SET_LINE).

+

The driver must use interrupts for both reception and transmission to reduce latency and CPU usage time. +Read and write calls must also be blocking. Assignments that do not meet these requirements will not be considered. +It is recommended that you use a buffer for the read routine and another buffer for the write routine for each serial port in the driver.

+

A blocking read call means that the read routine called from the user-space will be blocked until at least one byte is read +(the read buffer in the kernel is empty and no data can be read). +A blocking write call means that the write routine called from the user-space will be blocked until at least one byte is written +(the write buffer in the kernel is full and no data can be written).

+
+

Buffers Scheme

+../_images/buffers-scheme.png +

Data transfer between the various buffers is a Producer-Consumer problem. Example:

+
    +
  • The process is the producer and the device is the consumer if it is written from the process to the device; the process will block until there is at least one free space in the consumer's buffer
  • +
  • The process is the consumer and the device is the producer if it is read from a process from the device; the process will block until there is at least one element in the producer's buffer.
  • +
+
+
+
+

Implementation Details

+
    +
  • the driver will be implemented as a kernel module named uart16550.ko
  • +
  • the driver will be accessed as a character device driver, with different functions depending on the parameters transmitted to the load module:
      +
    • the major parameter will specify the major with which the device must be registered
    • +
    • the option parameter will specify how it works:
        +
      • OPTION_BOTH: will register both COM1 and COM2, with the major given by the major parameter and the minors 0 (for COM1) and 1 (for COM2);
      • +
      • OPTION_COM1: will only register COM1, with the major major and minor 0;
      • +
      • OPTION_COM2: will only register COM2, with the major major and minor 1;
      • +
      +
    • +
    • to learn how to pass parameters in Linux, see tldp
    • +
    • the default values are major=42 and option=OPTION_BOTH.
    • +
    +
  • +
  • the interrupt number associated with COM1 is 4 (IRQ_COM1) and the interrupt number associated with COM2 is 3 (IRQ_COM2)
  • +
  • the header with the definitions needed for special operations;
  • +
  • a starting point in implementing read / write routines is the example of uppercase / lowercase character device driver; the only difference is that you have to use two buffers, one for read and one for write;
  • +
  • you can use kfifo for buffers;
  • +
  • you do not have to use deferred functions to read / write data from / to ports (you can do everything from interrupt context);
  • +
  • you will need to synchronize the read / write routines with the interrupt handling routine for the routines to be blocking; it is recommended to use synchronization with waiting queues
  • +
  • In order for the assignment to work, the default serial driver must be disabled:
      +
    • cat /proc/ioports | grep serial will detect the presence of the default driver on the regions where COM1 and COM2 are defined
    • +
    • in order to deactivate it, the kernel must be recompiled, either by setting the serial driver as the module, or by deactivating it completely (this modification is already made on the virtual machine)
        +
      • Device Drivers -> Character devices -> Serial driver -> 8250/16550 and compatible serial support.
      • +
      +
    • +
    +
  • +
+
+
+

Testing

+

In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, +the assignment evaluation will be done automatically with the help of a +test script called _checker. +The test script assumes that the kernel module is called uart16550.ko.

+
+
+

QuickStart

+

It is mandatory to start the implementation of the assignment from the code skeleton found in the src directory. +There is only one header in the skeleton called uart16550.h. +You will provide the rest of the implementation. You can add as many *.c sources and additional *.h headers as you need. +You should also provide a Kbuild file that will compile the kernel module called uart16550.ko. +Follow the instructions in the README.md file of the assignment's repo.

+
+

Tips

+

To increase your chances of getting the highest grade, read and follow the Linux kernel +coding style described in the Coding Style document.

+

Also, use the following static analysis tools to verify the code:

+
    +
  • checkpatch.pl
  • +
+
$ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/uart16550.c
+
+
+
    +
  • sparse
  • +
+
$ sudo apt-get install sparse
+$ cd linux
+$ make C=2 /path/to/your/uart16550.c
+
+
+
    +
  • cppcheck
  • +
+
$ sudo apt-get install cppcheck
+$ cppcheck /path/to/your/uart16550.c
+
+
+
+
+

Penalties

+

Information about assignment penalties can be found on the +General Directions page.

+

In exceptional cases (the assignment passes the tests by not complying with the requirements) +and if the assignment does not pass all the tests, the grade may decrease more than mentioned above.

+
+
+

Submitting the assignment

+

The assignment will be graded automatically using the vmchecker-next infrastructure. +The submission will be made on moodle on the course's page to the related assignment. +You will find the submission details in the README.md file of the repo.

+
+
+
+

Resources

+ +

We recommend that you use gitlab to store your homework. Follow the directions in +README.

+
+
+

Questions

+

For questions about the topic, you can consult the mailing list archives +or you can write a question on the dedicated Teams channel.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/assign3-software-raid.html b/refs/pull/405/merge/so2/assign3-software-raid.html new file mode 100644 index 00000000..f3773aae --- /dev/null +++ b/refs/pull/405/merge/so2/assign3-software-raid.html @@ -0,0 +1,366 @@ + + + + + + Assignment 3 - Software RAID — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Assignment 3 - Software RAID

+
    +
  • Deadline: Thursday, 16 May 2024, 23:59
  • +
+

Implementing a software RAID module that uses a logical block device that will read and write data from two physical devices, +ensuring the consistency and synchronization of data from the two physical devices. The type of RAID implemented will be similar to a RAID 1.

+
+

Assignment's Objectives

+
    +
  • in-depth understanding of how the I/O subsystem works.
  • +
  • acquire advanced skills working with bio structures.
  • +
  • work with the block / disk devices in the Linux kernel.
  • +
  • acquire skills to navigate and understand the code and API dedicated to the I/O subsystem in Linux.
  • +
+
+
+

Statement

+

Write a kernel module that implements the RAID software functionality. Software RAID provides an abstraction between +the logical device and the physical devices. The implementation will use RAID scheme 1.

+

The virtual machine has two hard disks that will represent the physical devices: /dev/vdb and /dev/vdc. The operating system +will provide a logical device (block type) that will interface the access from the user space. Writing requests to the logical device +will result in two writes, one for each hard disk. Hard disks are not partitioned. It will be considered that each hard disk has a +single partition that covers the entire disk.

+

Each partition will store a sector along with an associated checksum (CRC32) to ensure error recovery. At each reading, the related +information from both partitions is read. If a sector of the first partition has corrupt data (CRC value is wrong) then the sector +on the second partition will be read; at the same time the sector of the first partition will be corrected. Similar in the case of +a reading of a corrupt sector on the second partition. If a sector has incorrect CRC values on both partitions, an appropriate error +code will be returned.

+
+

Important to know

+

To ensure error recovery, a CRC code is associated with each sector. CRC codes are stored starting at byte LOGICAL_DISK_SIZE of the partition +(macro defined in the assignment header). The disk structure will have the following layout:

+
+-----------+-----------+-----------+     +---+---+---+
+|  sector1  |  sector2  |  sector3  |.....|C1 |C2 |C3 |
++-----------+-----------+-----------+     +---+---+---+
+
+
+

where C1, C2, C3 are the CRC values of sectors sector1, sector2, sector3. The CRC area is found immediately after the first LOGICAL_DISK_SIZE bytes of the partition.

+

As a seed for CRC use 0(zero).

+
+
+
+

Implementation Details

+
    +
  • the kernel module will be named ssr.ko
  • +
  • the logical device will be accessed as a block device with the major SSR_MAJOR and minor SSR_FIRST_MINOR under the name /dev/ssr (via the macro LOGICAL_DISK_NAME)
  • +
  • the virtual device (LOGICAL_DISK_NAME - /dev/ssr) will have the capacity of LOGICAL_DISK_SECTORS (use set_capacity with the struct gendisk structure)
  • +
  • the two disks are represented by the devices /dev/vdb, respectively /dev/vdc, defined by means of macros PHYSICAL_DISK1_NAME, respectively PHYSICAL_DISK2_NAME
  • +
  • to work with the struct block_device structure associated with a physical device, you can use the blkdev_get_by_path and blkdev_put functions
  • +
  • for the handling of requests from the user space, we recommend not to use a request_queue, but to do processing at struct bio level +using the submit_bio field of struct block_device_operations
  • +
  • since data sectors are separated from CRC sectors you will have to build separate bio structures for data and CRC values
  • +
  • to allocate a struct bio for physical disks you can use bio_alloc(); to add data pages to bio use alloc_page() and bio_add_page()
  • +
  • to free up the space allocated for a struct bio you need to release the pages allocated to the bio (using the __free_page() macro ) and call +bio_put()
  • +
  • when generating a struct bio structure, consider that its size must be multiple of the disk sector size (KERNEL_SECTOR_SIZE)
  • +
  • to send a request to a block device and wait for it to end, you can use the submit_bio_wait() function
  • +
  • use bio_endio() to signal the completion of processing a bio structure
  • +
  • for the CRC32 calculation you can use the crc32() macro provided by the kernel
  • +
  • useful macro definitions can be found in the assignment support header
  • +
  • a single request processing function for block devices can be active at one time in a call stack (more details here). +You will need to submit requests for physical devices in a kernel thread; we recommend using workqueues.
  • +
  • For a quick run, use a single bio to batch send the read/write request for CRC values for adjacent sectors. For example, +if you need to send requests for CRCs in sectors 0, 1, ..., 7, use a single bio, not 8 bios.
  • +
  • our recommendations are not mandatory (any solution that meets the requirements of the assignment is accepted)
  • +
+
+
+

Testing

+

In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, +the assignment evaluation will be done automatically with the help of a +test script called _checker. +The test script assumes that the kernel module is called ssr.ko.

+

If, as a result of the testing process, the sectors on both disks contain invalid data, resulting in +read errors that make the module impossible to use, you will need to redo the two disks in the +virtual machine using the commands:

+
$ dd if=/dev/zero of=/dev/vdb bs=1M
+$ dd if=/dev/zero of=/dev/vdc bs=1M
+
+
+

You can also get the same result using the following command to start the virtual machine:

+
$ rm disk{1,2}.img; make console # or rm disk{1,2}.img; make boot
+
+
+
+
+

QuickStart

+

It is mandatory to start the implementation of the assignment from the code skeleton found in the src directory. +There is only one header in the skeleton called ssr.h. +You will provide the rest of the implementation. You can add as many *.c sources and additional *.h headers as you need. +You should also provide a Kbuild file that will compile the kernel module called ssr.ko. +Follow the instructions in the README.md file of the assignment's repo.

+
+

Tips

+

To increase your chances of getting the highest grade, read and follow the Linux kernel +coding style described in the Coding Style document.

+

Also, use the following static analysis tools to verify the code:

+
    +
  • checkpatch.pl
  • +
+
$ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/file.c
+
+
+
    +
  • sparse
  • +
+
$ sudo apt-get install sparse
+$ cd linux
+$ make C=2 /path/to/your/file.c
+
+
+
    +
  • cppcheck
  • +
+
$ sudo apt-get install cppcheck
+$ cppcheck /path/to/your/file.c
+
+
+
+
+

Penalties

+

Information about assignment penalties can be found on the +General Directions page.

+

In exceptional cases (the assignment passes the tests by not complying with the requirements) +and if the assignment does not pass all the tests, the grade may decrease more than mentioned above.

+
+
+

Submitting the assignment

+

The assignment will be graded automatically using the vmchecker-next infrastructure. +The submission will be made on moodle on the course's page to the related assignment. +You will find the submission details in the README.md file of the repo.

+
+
+
+

Resources

+
    +
  • implementation of the RAID software in the Linux kernel
  • +
+

We recommend that you use gitlab to store your homework. Follow the directions in +README.

+
+
+

Questions

+

For questions about the topic, you can consult the mailing list archives +or you can write a question on the dedicated Teams channel.

+

Before you ask a question, make sure that:

+
+
    +
  • you have read the statement of the assignment well
  • +
  • the question is not already presented on the FAQ page
  • +
  • the answer cannot be found in the mailing list archives
  • +
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/assign4-transport-protocol.html b/refs/pull/405/merge/so2/assign4-transport-protocol.html new file mode 100644 index 00000000..da41dc61 --- /dev/null +++ b/refs/pull/405/merge/so2/assign4-transport-protocol.html @@ -0,0 +1,438 @@ + + + + + + Assignment 4 - SO2 Transport Protocol — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Assignment 4 - SO2 Transport Protocol

+
    +
  • Deadline: Monday, 29 May 2023, 23:00
  • +
  • This assignment can be done in teams (max 2). Only one team member must submit the assignment, and the names of the students should be listed in a README file.
  • +
+

Implement a simple datagram transport protocol - STP (SO2 Transport Protocol).

+
+

Assignment's Objectives

+
    +
  • gaining knowledge about the operation of the networking subsystem in the Linux kernel
  • +
  • obtaining skills to work with the basic structures of the networking subsystem in Linux
  • +
  • deepening the notions related to communication and networking protocols by implementing a protocol in an existing protocol stack
  • +
+
+
+

Statement

+

Implement, in the Linux kernel, a protocol called STP (SO2 Transport Protocol), at network and transport level, that works using datagrams (it is not connection-oriented and does not use flow-control elements).

+

The STP protocol acts as a Transport layer protocol (port-based multiplexing) but operates at level 3 (Network) of the OSI stack, above the Data Link level.

+

The STP header is defined by the struct stp_header structure:

+
struct stp_header {
+        __be16 dst;
+        __be16 src;
+        __be16 len;
+        __u8 flags;
+        __u8 csum;
+};
+
+
+

where:

+
+
    +
  • len is the length of the packet in bytes (including the header);
  • +
  • dst and src are the destination and source ports, respectively;
  • +
  • flags contains various flags, currently unused (marked reserved);
  • +
  • csum is the checksum of the entire packet including the header; the checksum is calculated by exclusive OR (XOR) between all bytes.
  • +
+
+

Sockets using this protocol will use the AF_STP family.

+

The protocol must work directly over Ethernet. The ports used are between 1 and 65535. Port 0 is not used.

+

The definition of STP-related structures and macros can be found in the assignment support header.

+
+
+

Implementation Details

+

The kernel module will be named af_stp.ko.

+

You have to define a structure of type net_proto_family, which provides the operation to create STP sockets. +Newly created sockets are not associated with any port or interface and cannot receive / send packets. +You must initialize the socket ops field with the list of operations specific to the STP family. +This field refers to a structure proto_ops which must include the following functions:

+
    +
  • release: releases an STP socket
  • +
  • bind: associates a socket with a port (possibly also an interface) on which packets will be received / sent:
      +
    • sockets may be bound to a port only (without an interface)
    • +
    • sockets associated with only one port will be able to receive packets sent to that port on all interfaces (analogous to UDP sockets associated with only one port); these sockets cannot send packets because the interface from which they can be sent via the standard sockets API cannot be specified
    • +
    • two sockets cannot be bound to the same port-interface combination:
        +
      • if there is a socket already bound to a port and an interface then a second socket cannot be bound to the same port and the same interface or without a specified interface
      • +
      • if there is a socket already bound to a port but without a specified interface then a second socket cannot be bound to the same port (with or without a specified interface)
      • +
      +
    • +
    • we recommend using a hash table for bind instead of other data structures (list, array); in the kernel there is a hash table implementation in the hashtable.h header
    • +
    +
  • +
  • connect: associates a socket with a remote port and hardware address (MAC address) to which packets will be sent / received:
      +
    • this should allow send / recv operations on the socket instead of sendmsg / recvmsg or sendto / recvfrom
    • +
    • once connected to a host, sockets will only accept packets from that host
    • +
    • once connected, the sockets can no longer be disconnected
    • +
    +
  • +
  • sendmsg, recvmsg: send or receive a datagram on an STP socket:
      +
    • for the receive part, metainformation about the host that sent the packet can be stored in the cb field in sk_buff
    • +
    +
  • +
  • poll: the default function datagram_poll will have to be used
  • +
  • for the rest of the operations the predefined stubs in the kernel will have to be used (sock_no_*)
  • +
+
static const struct proto_ops stp_ops = {
+        .family = PF_STP,
+        .owner = THIS_MODULE,
+        .release = stp_release,
+        .bind = stp_bind,
+        .connect = stp_connect,
+        .socketpair = sock_no_socketpair,
+        .accept = sock_no_accept,
+        .getname = sock_no_getname,
+        .poll = datagram_poll,
+        .ioctl = sock_no_ioctl,
+        .listen = sock_no_listen,
+        .shutdown = sock_no_shutdown,
+        .setsockopt = sock_no_setsockopt,
+        .getsockopt = sock_no_getsockopt,
+        .sendmsg = stp_sendmsg,
+        .recvmsg = stp_recvmsg,
+        .mmap = sock_no_mmap,
+        .sendpage = sock_no_sendpage,
+};
+
+
+

Socket operations use a type of address called sockaddr_stp, a type defined in the assignment support header. +For the bind operation, only the port and the index of the interface to which the socket is bound will be considered. +For the receive operation, only the addr and port fields in the structure will be filled in with the MAC address of the host that sent the packet and with the port from which it was sent. +Also, when sending a packet, the destination host will be obtained from the addr and port fields of this structure.

+

You need to register a structure packet_type, using the call dev_add_pack to be able to receive STP packets from the network layer.

+

The protocol will need to provide an interface through the procfs file system for statistics on sent / received packets. +The file must be named /proc/net/stp_stats, specified by the STP_PROC_FULL_FILENAME macro in assignment support header. +The format must be of simple table type with 2 rows: on the first row the header of the table, and on the second row the statistics corresponding to the columns. +The columns of the table must be in order:

+
RxPkts HdrErr CsumErr NoSock NoBuffs TxPkts
+
+
+

where:

+
    +
  • RxPkts - the number of packets received
  • +
  • HdrErr - the number of packets received with header errors (packets too short or with source or destination 0 ports)
  • +
  • CsumErr - the number of packets received with checksum errors
  • +
  • NoSock - the number of received packets for which no destination socket was found
  • +
  • NoBuffs - the number of received packets that could not be received because the socket queue was full
  • +
  • TxPkts - the number of packets sent
  • +
+

To create or delete the entry specified by STP_PROC_FULL_FILENAME we recommend using the functions proc_create and proc_remove.

+
+

Sample Protocol Implementations

+

For examples of protocol implementation, we recommend the implementation of PF_PACKET sockets and the various functions in UDP implementation or IP implementation.

+
+
+
+

Testing

+

In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, +the assignment evaluation will be done automatically with the help of a +test script called _checker. +The test script assumes that the kernel module is called af_stp.ko.

+
+

tcpdump

+

You can use the tcpdump utility to troubleshoot sent packets. +The tests use the loopback interface; to track sent packets you can use a command line of the form:

+
tcpdump -i lo -XX
+
+
+

You can use a static version of tcpdump. +To add to the PATH environment variable in the virtual machine, copy this file to /linux/tools/labs/rootfs/bin. +Create the directory if it does not exist. Remember to give the tcpdump file execution permissions:

+
# Connect to the docker using ./local.sh docker interactive
+cd /linux/tools/labs/rootfs/bin
+wget http://elf.cs.pub.ro/so2/res/teme/tcpdump
+chmod +x tcpdump
+
+
+
+
+
+

QuickStart

+

It is mandatory to start the implementation of the assignment from the code skeleton found in the src directory. +There is only one header in the skeleton called stp.h. +You will provide the rest of the implementation. You can add as many *.c sources and additional *.h headers as you need. +You should also provide a Kbuild file that will compile the kernel module called af_stp.ko. +Follow the instructions in the README.md file of the assignment's repo.

+
+

Tips

+

To increase your chances of getting the highest grade, read and follow the Linux kernel coding style described in the Coding Style document.

+

Also, use the following static analysis tools to verify the code:

+
    +
  • checkpatch.pl

    +
    $ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/file.c
    +
    +
    +
  • +
  • sparse

    +
    $ sudo apt-get install sparse
    +$ cd linux
    +$ make C=2 /path/to/your/file.c
    +
    +
    +
  • +
  • cppcheck

    +
    $ sudo apt-get install cppcheck
    +$ cppcheck /path/to/your/file.c
    +
    +
    +
  • +
+
+
+

Penalties

+

Information about assignment penalties can be found on the General Directions page.

+

In exceptional cases (the assignment passes the tests by not complying with the requirements) and if the assignment does not pass all the tests, the grade may decrease more than mentioned above.

+
+
+

Submitting the assignment

+

The assignment will be graded automatically using the vmchecker-next infrastructure. +The submission will be made on moodle on the course's page to the related assignment. +You will find the submission details in the README.md file of the repo.

+
+
+
+

Resources

+ +

We recommend that you use gitlab to store your homework. Follow the directions in README.

+
+
+

Questions

+

For questions about the topic, you can consult the mailing list archives +or you can write a question on the dedicated Teams channel.

+

Before you ask a question, make sure that:

+
+
    +
  • you have read the statement of the assignment well
  • +
  • the question is not already presented on the FAQ page
  • +
  • the answer cannot be found in the mailing list archives
  • +
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/assign5-pitix.html b/refs/pull/405/merge/so2/assign5-pitix.html new file mode 100644 index 00000000..471d5f5a --- /dev/null +++ b/refs/pull/405/merge/so2/assign5-pitix.html @@ -0,0 +1,354 @@ + + + + + + Assignment 5 - PITIX Filesystem — The Linux Kernel documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Assignment 5 - PITIX Filesystem

+

Deadline: Tuesday, 24 May 2022, 23:00

+
+

Statement

+

Write a kernel module to implement the PITIX file system, version 2. +This file system will only support files and directories. +Support operations for hard or symbolic links will not be implemented. +Also, support operations for special files (pipes, character devices, or blocks) will not be implemented. +Basically you need to implement the following:

+
+
    +
  • for directories: lookup, unlink, mkdir, rmdir, iterate
  • +
  • for files: create, truncate, bitmap functions, see minix_get_block.
  • +
+
+

The rest of the functions either have generic kernel implementations, or you don't have to implement them.

+

The disk structure of the file system is:

+
+--------------+-----------+-----------+------------+-----------------------+
+|              |           |           |            |                       |
+|  superblock  |   imap    |   dmap    |    izone   |         dzone         |
++--------------+-----------+-----------+------------+-----------------------+
+   4096 bytes     1 block     1 block     32 blocks    8*block_size blocks
+
+
+

where:

+
    +
  • Superblock is the superblock (4096 bytes)
  • +
  • Imap contains the bitmap of the blocks occupied by the inodes (1 block)
  • +
  • Dmap contains the bitmap of the blocks occupied by the data (1 block)
  • +
  • Izone contains inodes (32 blocks)
  • +
  • Dzone contains the data (the actual contents of the files) (8 * block_size blocks)
  • +
+

The superblock (on disk) is described by the following structure:

+
struct pitix_super_block {
+        unsigned long magic;
+        __u8 version;
+        __u8 block_size_bits;
+        __u8 imap_block;
+        __u8 dmap_block;
+        __u8 izone_block;
+        __u8 dzone_block;
+        __u16 bfree;
+        __u16 ffree;
+};
+
+
+

where:

+
    +
  • magic must be initialized with PITIX_MAGIC
  • +
  • version must be initialized with 2 (PITIX_VERSION)
  • +
  • block_size_bits is the base-2 logarithm of the block size (the block size is 1 << block_size_bits); the block size can be 512, 1024, 2048, or 4096
  • +
  • imap_block is the block number (relative to the device) of the bit vector used to allocate / release inodes
  • +
  • dmap_block is the block number (relative to the device) for the bit vector used to allocate / release data blocks
  • +
  • izone_block is the number of the first block (relative to the device) of the inode area
  • +
  • dzone_block is the number of the first block (relative to the device) of the data area
  • +
  • bfree is the number of free blocks (unallocated)
  • +
  • ffree is the number of free (unallocated) inodes
  • +
+

The inodes will be stored in the inode area and are described by the following structure:

+
struct pitix_inode {
+        __u32 mode;
+        uid_t uid;
+        gid_t gid;
+        __u32 size;
+        __u32 time;
+        __u16 direct_data_blocks [INODE_DIRECT_DATA_BLOCKS];
+        __u16 indirect_data_block;
+};
+
+
+

where:

+
    +
  • mode represents the access rights and inode type (file or directory) as represented in the kernel
  • +
  • uid represents the UID as it is represented in the kernel
  • +
  • gid represents the GID as it is represented in the kernel
  • +
  • size is the size of the file / directory
  • +
  • time represents the modification time as it is represented in the kernel
  • +
  • direct_data_blocks is a vector (size INODE_DIRECT_DATA_BLOCKS ) that contains indexes of direct data blocks
  • +
  • indirect_data_block is the index of a data block that contains the indexes of indirect data blocks
  • +
+

The index of a data block (direct or indirect) indicates the number of that data block relative to the data area (Dzone). +The size of an index is 2 bytes.

+

As can be seen from its structure, the inode uses a simple routing scheme for data blocks. +Blocks in the range [0, INODE_DIRECT_DATA_BLOCKS) are blocks of direct data and are referenced by elements of the vector direct_data_blocks and blocks in the range [INODE_DIRECT_DATA_BLOCKS, INODE_DIRECT_DATA_BLOCKS + block_size / 2) are indirect data blocks and are referred to by indices within the data block indicated by indirect_data_block.

+

The data block indicated by indirect_data_block must be allocated when we have to refer to a first block of indirect data and must be released when there are no more blocks of indirect data.

+

Unused indexes must be set to 0. +The first block, the one with index 0, is always allocated when formatting. This block cannot be used and, consequently, the value 0:

+
    +
  • in an element of the vector, direct_data_blocks means free slot (that element does not refer to a block of data directly)
  • +
  • indirect_data_block means that no data block is allocated to keep track of indirect data blocks (when no indirect data blocks are needed)
  • +
  • an index within the data block referred to as indirect_data_block means free slot (that index does not refer to an indirect data block)
  • +
+

It is guaranteed that the number of bytes occupied by an inode on the disk is a divisor of the block size.

+

Directories have a single associated block of data (referenced by direct_data_blocks[0]) in which directory entries will be stored. These are described by the following structure:

+
struct pitix_dir_entry {
+        __u32 ino;
+        char name [PITIX_NAME_LEN];
+};
+
+
+

where

+
    +
  • ino is the inode number of the file or directory; this number is an index in the inode area
  • +
  • name is the name of the file or directory; maximum name length is 16 bytes (PITIX_NAME_LEN); if the name length is less than 16 bytes, then the name will end with the ASCII character that has the code 0 (same as for strings)
  • +
+

The root directory will be assigned inode 0 and data block 0.

+

For simplicity, at mkdir it is not necessary to create the entries . (dot) and .. (dot dot) in the new directory; the checker uses this assumption.

+

All numeric values are stored on disk in the CPU's byte order.

+

In the assignment header (https://github.com/linux-kernel-labs/linux/blob/master/tools/labs/templates/assignments/5-pitix/pitix.h) you will find the structures described above together with useful macros and declarations of the main functions to be implemented.

+

The kernel module will be named pitix.ko.

+
+
+

Testing

+
+

Note

+

Enable Loop Devices support using make menuconfig. Device drivers -> Block devices -> Loopback device support

+
+

In order to simplify the assignment evaluation process, but also to reduce the mistakes of the submitted assignments, the assignment evaluation will be done automatically with the help of public tests that are in the new infrastructure.

+

For local testing, use the following commands:

+
$ git clone https://github.com/linux-kernel-labs/linux.git
+$ cd linux/tools/labs
+$ LABS=assignments/5-pitix make skels
+$ #the development of the assignment will be written in the 5-pitix directory
+$ make build
+$ make copy
+$ make boot
+
+
+

Instructions for using the test suite can be found in the README file.

+
+

Tips

+

To increase your chances of getting the highest grade, read and follow the Linux kernel coding style described in the Coding Style document.

+

Also, use the following static analysis tools to verify the code:

+
    +
  • checkpatch.pl
  • +
+
$ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/file.c
+
+
+
    +
  • sparse
  • +
+
$ sudo apt-get install sparse
+$ cd linux
+$ make C=2 /path/to/your/file.c
+
+
+
    +
  • cppcheck
  • +
+
$ sudo apt-get install cppcheck
+$ cppcheck /path/to/your/file.c
+
+
+
+
+

Penalties

+

As a more difficult assignment, it is worth 2 points.

+

Information about assignment penalties can be found on the +General Directions page.

+

In exceptional cases (the assignment passes the tests by not complying with the requirements) +and if the assignment does not pass all the tests, the grade may decrease more than mentioned above.

+
+
+

Submitting the assignment

+

The assignment archive will be submitted to vmchecker, according to the rules on the +rules page.

+

In the vmchecker interface choose the Google Challenge - Sistem de fișiere option for this assignment.

+
+
+
+

Resources

+ +

We recommend that you use GitLab to store your homework. Follow the directions in +README +and on the dedicated Git wiki page.

+

The resources for the assignment can also be found in the so2-assignments repo on GitHub. +The repo contains a Bash script +that helps you create a private repository on the faculty GitLab instance. +Follow the tips from the README and +on the dedicated Wiki page.

+
+
+

Questions

+

For questions about the assignment, you can consult the mailing list archives +or send an e-mail (you must be registered). +Please follow the tips for use of the list.

+

Before you ask a question, make sure that:

+
    +
  • you have read the statement of the assignment well
  • +
  • the question is not already presented on the FAQ page
  • +
  • the answer cannot be found in the mailing list archives
  • +
+
+
+ + +
+
+
+ +
+ +
+

© Copyright The kernel development community.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/assign7-kvm-vmm.html b/refs/pull/405/merge/so2/assign7-kvm-vmm.html new file mode 100644 index 00000000..bdbf5db7 --- /dev/null +++ b/refs/pull/405/merge/so2/assign7-kvm-vmm.html @@ -0,0 +1,508 @@ + + + + + + Assignment 7 - SO2 Virtual Machine Manager with KVM — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Assignment 7 - SO2 Virtual Machine Manager with KVM

+
    +
  • Deadline: Monday, 29 May 2023, 23:00
  • +
  • This assignment can be done in teams (max 2). Only one team member must submit the assignment, and the names of the students should be listed in a README file.
  • +
+

In this assignment we will work on a simple Virtual Machine Manager (VMM). We will be using the KVM API +from the Linux kernel.

+

The assignment has two components: the VM code and the VMM code. We will be using a very simple protocol +to enable the communication between the two components. The protocol is called SIMVIRTIO.

+
+

I. Virtual Machine Manager

+

In general, to build a VMM from scratch we will have to implement three main functionalities: initialize the VMM, initialize the virtual CPU and run the guest code. We will split the implementation of the VMM in these three phases.

+
+

1. Initialize the VMM

+

A VM will be represented in general by three elements, a file descriptor used to interact with the KVM API, a file descriptor per VM used to configure it (e.g. set its memory) and a pointer to the VM's memory. We provide you with the following structure to start from when working with a VM.

+
typedef struct vm {
+        int sys_fd;
+        int fd;
+        char *mem;
+} virtual_machine;
+
+
+

The first step in initializing the KVM VM is to interact with the [KVM_API](https://www.kernel.org/doc/html/latest/virt/kvm/api.html). The KVM API is exposed via /dev/kvm. We will be using ioctl calls to call the API.

+

The snippet below shows how one can call KVM_GET_API_VERSION to get the KVM API Version

+
int kvm_fd = open("/dev/kvm", O_RDWR);
+if (kvm_fd < 0) {
+    perror("open /dev/kvm");
+    exit(1);
+}
+
+int api_ver = ioctl(kvm_fd, KVM_GET_API_VERSION, 0);
+if (api_ver < 0) {
+    perror("KVM_GET_API_VERSION");
+    exit(1);
+}
+
+
+

Let us now go briefly through how a VMM initializes a VM. This is only the bare bones, a VMM may do lots of other things during VM initialization.

+
    +
  1. We first use KVM_GET_API_VERSION to check that we are running the expected version of KVM, KVM_API_VERSION.
  2. +
  3. We now create the VM using KVM_CREATE_VM. Note that calling KVM_CREATE_VM returns a file descriptor. We will be using this file descriptor for the next phases of the setup.
  4. +
  5. (Optional) On Intel based CPUs we will have to call KVM_SET_TSS_ADDR with address 0xfffbd000
  6. +
  7. Next, we allocate the memory for the VM, we will be using mmap for this with PROT_WRITE, MAP_PRIVATE, MAP_ANONYMOUS and MAP_NORESERVE. We recommend allocating 0x100000 bytes for the VM.
  8. +
  9. We flag the memory as MADV_MERGEABLE using madvise
  10. +
  11. Finally, we use KVM_SET_USER_MEMORY_REGION to assign the memory to the VM.
  12. +
+

Make sure you understand what file descriptor to use and when, we use the KVM fd when calling KVM_CREATE_VM, but when interacting with the vm such as calling KVM_SET_USER_MEMORY_REGION we use the VMs +file descriptor

+

TLDR: API used for VM initialization:

+
    +
  • KVM_GET_API_VERSION
  • +
  • KVM_CREATE_VM
  • +
  • KVM_SET_TSS_ADDR
  • +
  • KVM_SET_USER_MEMORY_REGION.
  • +
+
+

2. Initialize a virtual CPU

+

We need a Virtual CPU (VCPU) to store registers.

+
typedef struct vcpu {
+        int fd;
+        struct kvm_run *kvm_run;
+} virtual_cpu;
+
+
+

To create a virtual CPU we will do the following: +1. Call KVM_CREATE_VCPU to create the virtual CPU. This call returns a file descriptor. +2. Use KVM_GET_VCPU_MMAP_SIZE to get the size of the shared memory +3. Allocate the necessary VCPU mem size with mmap. We will be passing the VCPU file descriptor to the mmap call. We can store the result in kvm_run.

+

TLDR: API used for VCPU initialization:

+
    +
  • KVM_CREATE_VCPU
  • +
  • KVM_GET_VCPU_MMAP_SIZE
  • +
+

We recommend using 2MB pages to simplify the translation process

+
+
+
+
+

Running the VM

+
+

Setup real mode

+

At first, the CPU will start in Protected mode. To run any meaningful code, we will switch the CPU to [Real mode](https://wiki.osdev.org/Real_Mode). To do this we will +need to configure several CPU registers.

+
    +
  1. First, we will use KVM_GET_SREGS to get the registers. We use struct kvm_sregs for this task.
  2. +
  3. We will need to set cs.selector and cs.base to 0. We will use KVM_SET_SREGS to set the registers.
  4. +
  5. Next we will clear all FLAGS bits via the rflags register, this means setting rflags to 2 since bit 1 must always be set to 1. We also set the RIP register to 0.
  6. +
+
+
+

Setup long mode

+

Real mode is all right for very simple guests, such as the one found in the folder guest_16_bits. But, +most programs nowadays need 64 bits addresses, and as such we will need to switch to long mode. The following article from OSDev presents all the necessary information about [Setting Up Long Mode](https://wiki.osdev.org/Setting_Up_Long_Mode).

+

In vcpu.h, you may find helpful macros such as CR0_PE, CR0_MP, CR0_ET, etc.

+

Since we will be running a more complex program, we will also create a small stack for our program +regs.rsp = 1 << 20;. Don't forget to set the RIP and RFLAGS registers.

+
+
+

Running

+

After we setup our VCPU in real or long mode we can finally start running code on the VM.

+
    +
  1. We copy to the vm memory the guest code, memcpy(vm->mem, guest_code, guest_code_size) The guest code will be available in two variables which will be discussed below.
  2. +
  3. In an infinite loop we run the following:
  4. +
+
+
    +
  • We call KVM_RUN on the VCPU file descriptor to run the VCPU
  • +
  • Through the shared memory of the VCPU we check the exit_reason parameter to see if the guest has made any requests:
  • +
  • We will handle the following VMEXITs: KVM_EXIT_MMIO, KVM_EXIT_IO and KVM_EXIT_HLT. KVM_EXIT_MMIO is triggered when the VM writes to a MMIO address. KVM_EXIT_IO is called when the VM calls inb or outb. KVM_EXIT_HLT is called when the user does a hlt instruction.
  • +
+
+
+
+

Guest code

+

The VM that is running is also called guest. We will be using the guest to test our implementation.

+
    +
  1. To test the implementation before implementing SIMVIRTIO. The guest will write at address 400 and the RAX register the value 42.
  2. +
  3. To test a more complicated implementation, we will extend the previous program to also write "Hello, world!\n" on port 0xE9 using the outb instruction.
  4. +
  5. To test the implementation of SIMVIRTIO, we will
  6. +
+

How do we get the guest code? The guest code is available at the following static pointers guest16, guest16_end-guest16. The linker script is populating them.

+

## SIMVIRTIO: +For the communication between the guest and the VMM we will implement a very simple protocol called SIMVIRTIO. It's a simplified version of the real protocol used in the real world called virtio.

+

Configuration space:

+ +++++++++ + + + + + + + + + + + + + + + + + + + + +
u32u16u8u8u8u8u8
magic value +Rmax queue len +Rdevice status +Rdriver status +R/Wqueue selector +R/WQ0(TX) CTL +R/WQ1(RX) CTL +R/w
+
+
+

Controller queues

+

We provide you with the following structures and methods for the SIMVIRTIO implementation.

+
typedef uint8_t q_elem_t;
+typedef struct queue_control {
+    // Ptr to current available head/producer index in 'buffer'.
+    unsigned head;
+    // Ptr to last index in 'buffer' used by consumer.
+    unsigned tail;
+} queue_control_t;
+typedef struct simqueue {
+    // MMIO queue control.
+    volatile queue_control_t *q_ctrl;
+    // Size of the queue buffer/data.
+    unsigned maxlen;
+    // Queue data buffer.
+    q_elem_t *buffer;
+} simqueue_t;
+int circ_bbuf_push(simqueue_t *q, q_elem_t data)
+{
+}
+int circ_bbuf_pop(simqueue_t *q, q_elem_t *data)
+{
+}
+
+
+
+
+

Device structures

+
#define MAGIC_VALUE 0x74726976
+#define DEVICE_RESET 0x0
+#define DEVICE_CONFIG 0x2
+#define DEVICE_READY 0x4
+#define DRIVER_ACK 0x0
+#define DRIVER 0x2
+#define DRIVER_OK 0x4
+#define DRIVER_RESET 0x8000
+typedef struct device {
+    uint32_t magic;
+    uint8_t device_status;
+    uint8_t driver_status;
+    uint8_t max_queue_len;
+} device_t;
+typedef struct device_table {
+    uint16_t count;
+    uint64_t device_addresses[10];
+ } device_table_t;
+
+
+

We will be implementing the following handles: +* MMIO (read/write) VMEXIT +* PIO (read/write) VMEXIT

+
+
+
+

Using the skeleton

+
+
+

Debugging

+
+
+

Tasks

+
    +
  1. 30p Implement a simple VMM that runs the code from guest_16_bits. We will be running the VCPU in real mode for this task
  2. +
  3. 20p Extend the previous implementation to run the VCPU in long mode. We will be running the guest_32_bits example
  4. +
  5. 30p Implement the SIMVIRTIO protocol.
  6. +
  7. 10p Implement polling as opposed to VMEXIT. We will use the macro USE_POOLING to switch this option on and off.
  8. +
  9. 10p Add profiling code. Measure the number of VMEXITs triggered by the VMM.
  10. +
+
+

Submitting the assignment

+

The assignment archive will be submitted on Moodle, according to the rules on the rules page.

+
+
+

Tips

+

To increase your chances of getting the highest grade, read and follow the Linux kernel coding style described in the Coding Style document.

+

Also, use the following static analysis tools to verify the code:

+
    +
  • checkpatch.pl

    +
    $ linux/scripts/checkpatch.pl --no-tree --terse -f /path/to/your/file.c
    +
    +
    +
  • +
  • sparse

    +
    $ sudo apt-get install sparse
    +$ cd linux
    +$ make C=2 /path/to/your/file.c
    +
    +
    +
  • +
  • cppcheck

    +
    $ sudo apt-get install cppcheck
    +$ cppcheck /path/to/your/file.c
    +
    +
    +
  • +
+
+
+

Penalties

+

Information about assignment penalties can be found on the General Directions page.

+

In exceptional cases (the assignment passes the tests by not complying with the requirements) and if the assignment does not pass all the tests, the grade may decrease more than mentioned above.

+

## References +We recommend you the following readings before starting to work on the homework: +* [KVM host in a few lines of code](https://zserge.com/posts/kvm/)

+
+
+

TLDR

+
    +
  1. The VMM creates and initializes a virtual machine and a virtual CPU
  2. +
  3. We switch to real mode and run the simple guest code from guest_16_bits
  4. +
  5. We switch to long mode and run the more complex guest from guest_32_bits
  6. +
  7. We implement the SIMVIRTIO protocol. We will describe how it behaves in the following subtasks.
  8. +
  9. The guest writes in the TX queue (queue 0) the ascii code for R which will result in a VMEXIT
  10. +
+

6. The VMM will handle the VMEXIT caused by the previous write in the queue. When the VMM receives the +R letter it will initiate the reset procedure of the device and set the device status to DEVICE_RESET +7. After the reset handling, the guest must set the status of the device to DRIVER_ACK. After this, the guest will write to the TX queue the letter C +8. In the VMM we will initialize the config process when letter C is received. It will set the device status to DEVICE_CONFIG and add a new entry in the device_table +9. After the configuration process is finished, the guest will set the driver status to DRIVER_OK +10. Next, the VMM will set the device status to DEVICE_READY +11. The guest will write in the TX queue "Ana are mere" and will execute a halt +12. The VMM will print to the STDOUT the message received and execute the halt request +13. Finally, the VMM will verify that at address 0x400 and in register RAX is stored the value 42

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/grading.html b/refs/pull/405/merge/so2/grading.html new file mode 100644 index 00000000..953de357 --- /dev/null +++ b/refs/pull/405/merge/so2/grading.html @@ -0,0 +1,447 @@ + + + + + + SO2 - General Rules and Grading — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 - General Rules and Grading

+
+

General Rules

+
+

1. Laboratory

+

There is no formal rule for dividing students; everyone can participate in any laboratory as long as the following rules are respected. +Priority for participation is given to students from the respective group (34xC3 or optional). +The limit of students in a laboratory is 14 people. +Starting from the third week, the participation list in the laboratory is "frozen". +Students who have a retake can participate in any laboratory as long as there are available spots. +Like other students, the participation list is "frozen" starting from the third week. +The division is done on the laboratory hours division page. +You can make up for a maximum of 2 laboratories (you can attend another subgroup) (in those laboratories where there are available spots). +Laboratories cannot be made up retroactively. You cannot make up a laboratory from the previous week within the same laboratory week. +Laboratory activities take place only in the laboratory room. +We encourage you to go through the brief and laboratory exercises at home. +You can solve exercises at home, but you will have to start from scratch in the laboratory.

+
+
+

2. Final deadline for submitting assignments

+

The final deadline for submitting SO2 assignments is Wednesday, May 29, 2024, 23:59. +Beyond this date, assignments cannot be submitted anymore. +Please ensure timely submission of assignments with complete information to be graded. +We will not accept assignments submitted after this date or assignments not submitted on vmchecker-next. +For the testing part, assignments will receive the score indicated from testing on vmchecker-next; tests failed due to reasons unrelated to vmchecker-next will not be graded. +Assignments cannot be submitted for the special June 2023 exam session. +Assignments can be resubmitted after TODO for the September 2024 exam session. +The deadline for submitting assignments for the Fall 2024 session is TODO.

+
+
+

3. Assignment Presentations

+

The SO2 team reserves the right to request presentations for some homework assignments. +A presentation involves a discussion with at least two assistants about the completion of the assignment, the solution used, and any encountered issues. +The purpose of the assignment presentation sessions is to clarify any uncertainties regarding the completion of the assignment and to verify its correctness. +Individuals who will present an assignment will be contacted at least 24 hours in advance by the laboratory assistant. +Most likely, a 15-minute slot before/after the SO2 class or at the end of the SO2 laboratory session will be used.

+
+
+

4. Rules on Assignments

+

The assignments for Operating Systems 2 are individual, except when explicitly stated that an assignment can be solved in a team. +This is because the primary objective of the assignments is for you to acquire or deepen your practical skills. +If the level of collaboration is too high or if you seek solutions online, this objective will not be achieved. +Each assignment is to be completed by a student without consulting the source code of their peers.

+

We understand that teamwork is important, but we do not have the environment to carry out team projects in the Operating Systems 2 course. +If you encounter any problems in completing an assignment, use the discussion list or ask the laboratory assistants or course instructors. +Our role is to help you solve them. +Feel free to rely on the SO2 team.

+

You can discuss among yourselves within the bounds of common sense; that is, you should not dictate a solution to someone, but you can offer a general idea. +If you are the one being asked and providing explanations, please consider redirecting to the discussion list and the SO2 team. +It is not allowed to request the solution to an assignment on a site like StackExchange, Rent a Coder, ChatGPT etc. +You can ask more generic questions, but do not request the solution to the assignment.

+

You can freely use code from the laboratory, skeletons provided by us. +You can use external resources (GitHub, open-source code, or others) as long as they do not represent obvious solutions to the assignments, publicly available with or without intention. +See also the next paragraph.

+

It is not allowed to publish assignment solutions (even after the end of the course). +If you find assignment solutions on GitHub or elsewhere, report them to the discussion list or privately to the laboratory assistant or course instructor. +We reiterate that if you need clarification that you would address to older colleagues or other forums, StackExchange, or other sources, use the discussion list and the SO2 team. +It is the safest and most honest way to solve problems.

+

It is not allowed to transfer files between yourselves. +In general, we recommend not to screen-share with another colleague, whether for inspiration or to help them with their assignment. +Avoid testing an assignment on a colleague's system. +There may be exceptions; you can help someone troubleshoot, but please ensure that it does not transition from "let's solve this problem together" to "let me solve your assignment for you". +However, we recommend using the discussion list or the SO2 team to ask questions.

+
+
+

5. Penalties for Plagiarized Assignments

+

In general, we consider punitive measures as a last resort. +As long as the assignment is completed individually, without problematic source code contribution from external sources, then it is not a plagiarized assignment.

+

The notion of a plagiarized assignment refers to, without limitation, situations such as:

+
+
    +
  • Two assignments that are similar enough to draw this conclusion;
  • +
  • Using source code from the internet that is an obvious solution to the assignment;
  • +
  • Using pieces of code from another colleague;
  • +
  • Accessing another colleague's code during the assignment;
  • +
  • Modifying an existing assignment;
  • +
  • Following another colleague's code;
  • +
  • Direct assistance in completing the assignment (someone else wrote or dictated the code);
  • +
  • Someone else wrote the assignment (voluntarily, for payment, or other benefits).
  • +
  • If two assignments are considered plagiarized, both the source and destination will be penalized equally, without discussions about who plagiarized from whom and whose fault it is.
  • +
+
+
+

Warning

+

Plagiarizing an assignment results in the elimination of points for the assignments completed up to that session. +Any assignment submitted until that session receives a score of 0 and cannot be resubmitted during the current academic year. +If there were instances of plagiarized assignments during the semester, it will be possible to obtain points in the summer, for the September session, from assignments not yet submitted. +We reiterate that our goal is not and will not be penalization for plagiarism. +We consider cheating to be dishonest behavior that will be punished if it occurs. +However, our goal is to prevent cheating; for this purpose, we offer support and resources from the team in all its forms (discussion list, face-to-face discussions with the SO2 team). +Please use them with confidence; we believe that an honest approach to completing assignments will also result in a gain of knowledge and skills for you.

+
+
+
+

6. Retake/Grade Increase

+

In the retake/grade increase session in September, only assignments can be submitted, only the final exam can be retaken, or both. +You can continue to submit assignments with the deadlines from the semester, meaning you can achieve a maximum grade of 7 for each assignment. +Assignments are submitted using the vmchecker-next interface. +If you did not have plagiarized assignments during the semester, you can (re)submit any assignments. +If there were instances of plagiarized assignments during the semester, you can submit only assignments not yet submitted during the semester. +The submission deadline is TODO

+

If you do not wish to retake the final exam, you can choose not to participate in the exam. +Grades will be recorded in the official catalog, according to the SO2 catalog.

+

In the special retake/grade increase session in June, only the final exam can be retaken, and no homework assignments can be submitted.

+

The exam in the retake session will consist of 11 equally weighted topics (for a total of 3 points - one topic is a bonus). Passing the exam is conditional on obtaining 1 point out of the 3 points assigned to the course. In practice, this means correctly solving 3 out of the 11 topics in the exam.

+

In the case of retaking the final exam, the higher grade will be retained (between the semester grade and the grade from the retake session).

+

You can participate in only one exam during a session.

+
+
+

7. Class Redo

+

If you prefer, you can keep the score from the previous academic year for the entire semester's activity (labs, assignments, course work), and only retake the final exam. +You cannot keep the score for individual components of the semester (only assignments or only course work).

+

If you want to keep the score from the previous academic year for the entire semester's activity, you must announce this at the beginning of the semester. +Otherwise, the score from the previous academic year's semester will be reset according to the default mode.

+

By default, the score for the academic year will be reset on October 1. +If you do not graduate from the course during the current academic year, you will need to retake it completely during the next academic year.

+
+
+
+

Grading

+

You must achieve at least 4.5 points out of 10 to pass.

+
+

1. Lectures (3 points)

+
    +
  • Completion of the course is conditioned by obtaining 30% (3 out of 10) of the course score.
  • +
  • The lecture score will be obtained from 11 lecture quizzes to be completed before each class (one quiz is a bonus).
  • +
  • +
    Each course assignment contains a set of 4 questions from the material covered in the previous class (one question is a bonus).
    +
      +
    • There will be no final exam.
    • +
    • Each question is scored with 0 or 1.
    • +
    • A question is scored only if it is fully and correctly answered.
    • +
    • A question answered incompletely or one answered completely but with incorrect specifications or errors will not be scored.
    • +
    • Course assignments cannot be redone.
    • +
    • Each assignment lasts 3 minutes.
    • +
    • The score is obtained from the formula min(sum_of_assignment_scores / 10 * 4/3, 10).
    • +
    • The assignments are closed book.
    • +
    +
    +
    +
  • +
  • +
    For those who cannot attend the course assignments or wish to improve their course score, an assignment will be given at the end of the semester (during the last class) covering all the course material.
    +
      +
    • The end-of-semester assignment (last class) consists of 11 questions for the 3 course points and lasts 60 minutes.
    • +
    • The end-of-semester assignment is open-book. You are allowed to use class notes, books, slides, laptops, or tablets without internet access.
    • +
    • Access with mobile phones is not permitted. Phones must be turned off/silent/deactivated during the exam.
    • +
    • You may download course materials, labs, or other resources for offline use.
    • +
    +
    +
    +
  • +
+
+
+

2. Laboratory (2 points)

+
    +
  • The laboratories are held in EG106, EG306, and PR706.
  • +
  • Completion of the laboratory exercises leads to obtaining 10 or 11 points allocated for the laboratory.
  • +
  • The final grade for the laboratory is calculated using the formula (sum(l1:l12) / 12).
  • +
+
+
+

3. Assignments (5 points + Extra)

+
    +
  • +
    There are 4 Assignments:
    +
      +
    • Assignment 0 - "Kernel API" - 0.5 points
    • +
    • Assignment 1 - "Kprobe based tracer" - 1.5 points
    • +
    • Assignment 2 - "Driver UART" - 1.5 points
    • +
    • Assignment 3 - "Software RAID" - 1.5 points
    • +
    +
    +
    +
  • +
  • +
    Extra activities:
    +
      +
    • SO2 transport protocol - 2 points
    • +
    • SO2 Virtual Machine Manager with KVM - 2 points
    • +
    +
    +
    +
  • +
  • +
    In case the total score for assignments + "Extra" activities exceeds 5 points, the following procedure will be followed:
    +
      +
    • 5 points are considered as part of the total score.
    • +
    • The difference between the total score and 5 points will be proportionally adjusted relative to the grade obtained in the lecture.
    • +
    +
    +
    +
  • +
+
S = A0 + A1 + A2 + A3 + Extra;
+if (S <= 5)
+    assignment_grade = S;
+else
+    assignment_grade = 5 + (S - 5) * course_grade / 3; // 0 <= course_grade <=3
+
+
+
    +
  • +
    The verification and scoring of assignments:
    +
      +
    • Assignments are tested against plagiarism.
    • +
    • Assignments will be automatically verified using the vmchecker-next infrastructure integrated with moodle.
    • +
    • The verification tests are public.
    • +
    • Students who upload their assignments on Moodle must wait for the checker's feedback in the feedback section of the assignment upload page.
    • +
    • The grade listed in the feedback section will be the final grade for the assignment.
    • +
    • There may be exceptional situations where this rule is not considered (for example, if the assignment is implemented solely to pass the tests and does not meet the assignment requirements).
    • +
    • The verification system deducts points (automatically) for certain situations (segmentation faults, unhandled exceptions, compilation errors, or warnings) regardless of the test results.
    • +
    • Deductions are specified in the instructions list and in the assignment statement.
    • +
    • Deductions are subtracted from the assignment grade (maximum of 10) not from the assignment score.
    • +
    +
    +
    +
  • +
  • +
    Late assignments
    +
      +
    • Each assignment has a deadline of 2 weeks from the publication date. (exception! Assignment 0)
    • +
    • After the deadline, 0.25 points per day (out of 10, the maximum grade for each assignment) will be deducted for 12 days (up to a maximum grade of 7).
    • +
    • The deduction is from the grade (maximum 10), not from the score. An assignment incurs deductions of 0.25 points per day from the maximum grade (10), regardless of its score.
    • +
    • For example, if for assignment 3 (scored with 1.5 points) the delay is 4 days, you will receive a deduction of 4 * 0.25 = 1 point from the grade, resulting in a maximum grade of 9, equivalent to a maximum score of 1.35 points.
    • +
    • After 12 days, no further deductions will be made; a maximum grade of 7 can be obtained for an assignment submitted 13 days after the deadline expiration, or 50 days, or more, including during the retake session.
    • +
    +
    +
    +
  • +
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/index.html b/refs/pull/405/merge/so2/index.html new file mode 100644 index 00000000..00c2d690 --- /dev/null +++ b/refs/pull/405/merge/so2/index.html @@ -0,0 +1,247 @@ + + + + + + Operating Systems 2 — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ + +
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab1-intro.html b/refs/pull/405/merge/so2/lab1-intro.html new file mode 100644 index 00000000..5972d832 --- /dev/null +++ b/refs/pull/405/merge/so2/lab1-intro.html @@ -0,0 +1,1683 @@ + + + + + + SO2 Lab 01 - Introduction — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 01 - Introduction

+
+

Lab objectives

+
    +
  • presenting the rules and objectives of the Operating Systems 2 lab
  • +
  • introducing the lab documentation
  • +
  • introducing the Linux kernel and related resources
  • +
  • creating simple modules
  • +
  • describing the process of kernel module compilation
  • +
  • presenting how a module can be used with a kernel
  • +
  • simple kernel debugging methods
  • +
+
+
+

About this laboratory

+

The Operating Systems 2 lab is a kernel programming and driver development lab. +The objectives of the laboratory are:

+
    +
  • deepening the notions presented in the course
  • +
  • presentation of kernel programming interfaces (kernel API)
  • +
  • gaining documenting, development and debugging skills on a freestanding +environment
  • +
  • acquiring knowledge and skills for drivers development
  • +
+

A laboratory will present a set of concepts, applications and commands +specific to a given problem. The lab will start with a presentation +(each lab will have a set of slides) (15 minutes) and the remaining +time will be allocated to the lab exercises (80 minutes).

+

For best laboratory performance, we recommend that you read the related slides. +To fully understand a laboratory, we recommend going through the lab support. For +in-depth study, use the supporting documentation.

+
+ +
+

Documentation

+

Kernel development is a difficult process, compared to user space +programming. The API is different and the complexity of the subsystems +in kernel requires additional preparation. The associated documentation +is heterogeneous, sometimes requiring the inspection of multiple sources +to have a more complete understanding of a certain aspect.

+

The main advantages of the Linux kernel are the access to sources and +the open development system. Because of this, the Internet offers a +large amount of documentation for the kernel.

+

A few links related to the Linux kernel are shown below:

+ +

The links are not comprehensive. Using The Internet and +kernel source code is essential.

+
+
+

Kernel Modules Overview

+

A monolithic kernel, though faster than a microkernel, has the disadvantage of +lack of modularity and extensibility. On modern monolithic kernels, this has +been solved by using kernel modules. A kernel module (or loadable kernel module) +is an object file that contains code that can extend the kernel functionality +at runtime (it is loaded as needed); when a kernel module is no longer needed, +it can be unloaded. Most of the device drivers are used in the form of kernel +modules.

+

For the development of Linux device drivers, it is recommended to download the +kernel sources, configure and compile them and then install the compiled version +on the test/development machine.

+
+
+

An example of a kernel module

+

Below is a very simple example of a kernel module. When loading into the kernel, +it will generate the message "Hi". When unloading the kernel module, the +"Bye" message will be generated.

+
#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("My kernel module");
+MODULE_AUTHOR("Me");
+MODULE_LICENSE("GPL");
+
+static int dummy_init(void)
+{
+        pr_debug("Hi\n");
+        return 0;
+}
+
+static void dummy_exit(void)
+{
+        pr_debug("Bye\n");
+}
+
+module_init(dummy_init);
+module_exit(dummy_exit);
+
+
+

The generated messages will not be displayed on the console but will be saved +in a specially reserved memory area for this, from where they will be extracted +by the logging daemon (syslog). To display kernel messages, you can use the +dmesg command or inspect the logs:

+
# cat /var/log/syslog | tail -2
+Feb 20 13:57:38 asgard kernel: Hi
+Feb 20 13:57:43 asgard kernel: Bye
+
+# dmesg | tail -2
+Hi
+Bye
+
+
+
+
+

Compiling kernel modules

+

Compiling a kernel module differs from compiling a user program. First, other +headers should be used. Also, the module should not be linked to libraries. +And, last but not least, the module must be compiled with the same options as +the kernel in which we load the module. For these reasons, there is a standard +compilation method (kbuild). This method requires the use of two files: +a Makefile and a Kbuild file.

+

Below is an example of a Makefile:

+
KDIR = /lib/modules/`uname -r`/build
+
+kbuild:
+        make -C $(KDIR) M=`pwd`
+
+clean:
+        make -C $(KDIR) M=`pwd` clean
+
+
+

And the example of a Kbuild file used to compile a module:

+
EXTRA_CFLAGS = -Wall -g
+
+obj-m        = modul.o
+
+
+

As you can see, calling make on the Makefile file in the +example shown will result in the make invocation in the kernel +source directory (/lib/modules/`uname -r`/build) and referring to the +current directory (M = `pwd`). This process ultimately leads to reading +the Kbuild file from the current directory and compiling the module +as instructed in this file.

+
+

Note

+

For labs we will configure different KDIR, according to +the virtual machine specifications:

+
KDIR = /home/student/src/linux
+[...]
+
+
+
+

A Kbuild file contains one or more directives for compiling a kernel +module. The easiest example of such a directive is obj-m = +module.o. Following this directive, a kernel module (ko - kernel +object) will be created, starting from the module.o file. module.o will +be created starting from module.c or module.S. All of these files can +be found in the Kbuild's directory.

+

An example of a Kbuild file that uses several sub-modules is shown +below:

+
EXTRA_CFLAGS = -Wall -g
+
+obj-m        = supermodule.o
+supermodule-y = module-a.o module-b.o
+
+
+

For the example above, the steps to compile are:

+
+
    +
  • compile the module-a.c and module-b.c sources, +resulting in module-a.o and module-b.o objects
  • +
  • module-a.o and module-b.o will then be linked +in supermodule.o
  • +
  • from supermodule.o will be created supermodule.ko +module
  • +
+
+

The suffix of targets in Kbuild determines how they are used, as +follows:

+
+
    +
  • M (modules) is a target for loadable kernel modules
  • +
  • Y (yes) represents a target for object files to be compiled and then +linked to a module ($(module_name)-y) or within the kernel (obj-y)
  • +
  • any other target suffix will be ignored by Kbuild and will not be +compiled
  • +
+
+
+

Note

+

These suffixes are used to easily configure the kernel by running the +make menuconfig command or directly editing the +.config file. This file sets a series of variables that are +used to determine which features are added to the kernel at build +time. For example, when adding BTRFS support with make +menuconfig, add the line CONFIG_BTRFS_FS = y to the +.config file. The BTRFS kbuild contains the line +obj-$(CONFIG_BTRFS_FS):= btrfs.o, which becomes obj-y:= +btrfs.o. This will compile the btrfs.o object, which will then be +linked to the kernel. Before the variable was set, the line became +obj:=btrfs.o and so it was ignored, and the kernel was built +without BTRFS support.

+
+

For more details, see the Documentation/kbuild/makefiles.txt and +Documentation/kbuild/modules.txt files within the kernel sources.

+
+
+

Loading/unloading a kernel module

+

To load a kernel module, use the insmod utility. This utility +receives as a parameter the path to the *.ko file in which the module +was compiled and linked. Unloading the module from the kernel is done using +the rmmod command, which receives the module name as a parameter.

+
$ insmod module.ko
+$ rmmod module.ko
+
+
+

When loading the kernel module, the routine specified as a parameter of the +module_init macro will be executed. Similarly, when the module is unloaded +the routine specified as a parameter of the module_exit will be executed.

+

A complete example of compiling and loading/unloading a kernel module is +presented below:

+
faust:~/lab-01/modul-lin# ls
+Kbuild  Makefile  modul.c
+
+faust:~/lab-01/modul-lin# make
+make -C /lib/modules/`uname -r`/build M=`pwd`
+make[1]: Entering directory `/usr/src/linux-2.6.28.4'
+  LD      /root/lab-01/modul-lin/built-in.o
+  CC [M]  /root/lab-01/modul-lin/modul.o
+  Building modules, stage 2.
+  MODPOST 1 modules
+  CC      /root/lab-01/modul-lin/modul.mod.o
+  LD [M]  /root/lab-01/modul-lin/modul.ko
+make[1]: Leaving directory `/usr/src/linux-2.6.28.4'
+
+faust:~/lab-01/modul-lin# ls
+built-in.o  Kbuild  Makefile  modul.c  Module.markers
+modules.order  Module.symvers  modul.ko  modul.mod.c
+modul.mod.o  modul.o
+
+faust:~/lab-01/modul-lin# insmod modul.ko
+
+faust:~/lab-01/modul-lin# dmesg | tail -1
+Hi
+
+faust:~/lab-01/modul-lin# rmmod modul
+
+faust:~/lab-01/modul-lin# dmesg | tail -2
+Hi
+Bye
+
+
+

Information about modules loaded into the kernel can be found using the +lsmod command or by inspecting the /proc/modules, +/sys/module directories.

+
+
+

Kernel Module Debugging

+

Troubleshooting a kernel module is much more complicated than debugging a +regular program. First, a mistake in a kernel module can lead to blocking the +entire system. Troubleshooting is therefore much slowed down. To avoid reboot, +it is recommended to use a virtual machine (qemu, virtualbox, vmware).

+

When a module containing bugs is inserted into the kernel, it will eventually +generate a kernel oops. +A kernel oops is an invalid operation detected by the kernel and can only +be generated by the kernel. For a stable kernel version, it almost certainly +means that the module contains a bug. After the oops appears, the kernel will +continue to work.

+

When a kernel oops appears, it is very important to save the generated +message. As noted above, messages generated by the kernel are saved in logs and +can be displayed with the dmesg command. To make sure that no kernel +message is lost, it is recommended to insert/test the kernel directly from the +console, or periodically check the kernel messages. Noteworthy is that an oops +can occur because of a programming error, but also because of a hardware error.

+

If a fatal error occurs, after which the system can not return to a stable +state, a kernel panic is +generated.

+

Look at the kernel module below that contains a bug that generates an oops:

+
/*
+ * Oops generating kernel module
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+MODULE_DESCRIPTION ("Oops");
+MODULE_LICENSE ("GPL");
+MODULE_AUTHOR ("PSO");
+
+#define OP_READ         0
+#define OP_WRITE        1
+#define OP_OOPS         OP_WRITE
+
+static int my_oops_init (void)
+{
+        int *a;
+
+        a = (int *) 0x00001234;
+#if OP_OOPS == OP_WRITE
+        *a = 3;
+#elif OP_OOPS == OP_READ
+        printk (KERN_ALERT "value = %d\n", *a);
+#else
+#error "Unknown op for oops!"
+#endif
+
+        return 0;
+}
+
+static void my_oops_exit (void)
+{
+}
+
+module_init (my_oops_init);
+module_exit (my_oops_exit);
+
+
+

Inserting this module into the kernel will generate an oops:

+
faust:~/lab-01/modul-oops# insmod oops.ko
+[...]
+
+faust:~/lab-01/modul-oops# dmesg | tail -32
+BUG: unable to handle kernel paging request at 00001234
+IP: [<c89d4005>] my_oops_init+0x5/0x20 [oops]
+  *de = 00000000
+Oops: 0002 [#1] PREEMPT DEBUG_PAGEALLOC
+last sysfs file: /sys/devices/virtual/net/lo/operstate
+Modules linked in: oops(+) netconsole ide_cd_mod pcnet32 crc32 cdrom [last unloaded: modul]
+
+Pid: 4157, comm: insmod Not tainted (2.6.28.4 #2) VMware Virtual Platform
+EIP: 0060:[<c89d4005>] EFLAGS: 00010246 CPU: 0
+EIP is at my_oops_init+0x5/0x20 [oops]
+EAX: 00000000 EBX: fffffffc ECX: c89d4300 EDX: 00000001
+ESI: c89d4000 EDI: 00000000 EBP: c5799e24 ESP: c5799e24
+ DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
+Process insmod (pid: 4157, ti=c5799000 task=c665c780 task.ti=c5799000)
+Stack:
+ c5799f8c c010102d c72b51d8 0000000c c5799e58 c01708e4 00000124 00000000
+ c89d4300 c5799e58 c724f448 00000001 c89d4300 c5799e60 c0170981 c5799f8c
+ c014b698 00000000 00000000 c5799f78 c5799f20 00000500 c665cb00 c89d4300
+Call Trace:
+ [<c010102d>] ? _stext+0x2d/0x170
+ [<c01708e4>] ? __vunmap+0xa4/0xf0
+ [<c0170981>] ? vfree+0x21/0x30
+ [<c014b698>] ? load_module+0x19b8/0x1a40
+ [<c035e965>] ? __mutex_unlock_slowpath+0xd5/0x140
+ [<c0140da6>] ? trace_hardirqs_on_caller+0x106/0x150
+ [<c014b7aa>] ? sys_init_module+0x8a/0x1b0
+ [<c0140da6>] ? trace_hardirqs_on_caller+0x106/0x150
+ [<c0240a08>] ? trace_hardirqs_on_thunk+0xc/0x10
+ [<c0103407>] ? sysenter_do_call+0x12/0x43
+Code: <c7> 05 34 12 00 00 03 00 00 00 5d c3 eb 0d 90 90 90 90 90 90 90 90
+EIP: [<c89d4005>] my_oops_init+0x5/0x20 [oops] SS:ESP 0068:c5799e24
+---[ end trace 2981ce73ae801363 ]---
+
+
+

Although relatively cryptic, the message provided by the kernel on the +appearance of an oops provides valuable information about the error. First line:

+
BUG: unable to handle kernel paging request at 00001234
+EIP: [<c89d4005>] my_oops_init + 0x5 / 0x20 [oops]
+
+
+

Tells us the cause and the address of the instruction that generated the error. +In our case this is an invalid access to memory.

+

Next line

+
+
Oops: 0002 [#1] PREEMPT DEBUG_PAGEALLOC
+

Tells us that it's the first oops (#1). This is important in the context that +an oops can lead to other oopses. Usually only the first oops is relevant. +Furthermore, the oops code (0002) provides information about the error type +(see arch/x86/include/asm/trap_pf.h):

+
+
    +
  • Bit 0 == 0 means no page found, 1 means protection fault
  • +
  • Bit 1 == 0 means read, 1 means write
  • +
  • Bit 2 == 0 means kernel, 1 means user mode
  • +
+
+

In this case, we have a write access that generated the oops (bit 1 is 1).

+

Below is a dump of the registers. It decodes the instruction pointer (EIP) +value and notes that the bug appeared in the my_oops_init function with +a 5-byte offset (EIP: [<c89d4005>] my_oops_init+0x5). The message also +shows the stack content and a backtrace of calls until then.

+

If an invalid read call is generated (#define OP_OOPS OP_READ), the message +will be the same, but the oops code will differ, which would now be 0000:

+
faust:~/lab-01/modul-oops# dmesg | tail -33
+BUG: unable to handle kernel paging request at 00001234
+IP: [<c89c3016>] my_oops_init+0x6/0x20 [oops]
+  *de = 00000000
+Oops: 0000 [#1] PREEMPT DEBUG_PAGEALLOC
+last sysfs file: /sys/devices/virtual/net/lo/operstate
+Modules linked in: oops(+) netconsole pcnet32 crc32 ide_cd_mod cdrom
+
+Pid: 2754, comm: insmod Not tainted (2.6.28.4 #2) VMware Virtual Platform
+EIP: 0060:[<c89c3016>] EFLAGS: 00010292 CPU: 0
+EIP is at my_oops_init+0x6/0x20 [oops]
+EAX: 00000000 EBX: fffffffc ECX: c89c3380 EDX: 00000001
+ESI: c89c3010 EDI: 00000000 EBP: c57cbe24 ESP: c57cbe1c
+ DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
+Process insmod (pid: 2754, ti=c57cb000 task=c66ec780 task.ti=c57cb000)
+Stack:
+ c57cbe34 00000282 c57cbf8c c010102d c57b9280 0000000c c57cbe58 c01708e4
+ 00000124 00000000 c89c3380 c57cbe58 c5db1d38 00000001 c89c3380 c57cbe60
+ c0170981 c57cbf8c c014b698 00000000 00000000 c57cbf78 c57cbf20 00000580
+Call Trace:
+ [<c010102d>] ? _stext+0x2d/0x170
+ [<c01708e4>] ? __vunmap+0xa4/0xf0
+ [<c0170981>] ? vfree+0x21/0x30
+ [<c014b698>] ? load_module+0x19b8/0x1a40
+ [<c035d083>] ? printk+0x0/0x1a
+ [<c035e965>] ? __mutex_unlock_slowpath+0xd5/0x140
+ [<c0140da6>] ? trace_hardirqs_on_caller+0x106/0x150
+ [<c014b7aa>] ? sys_init_module+0x8a/0x1b0
+ [<c0140da6>] ? trace_hardirqs_on_caller+0x106/0x150
+ [<c0240a08>] ? trace_hardirqs_on_thunk+0xc/0x10
+ [<c0103407>] ? sysenter_do_call+0x12/0x43
+Code: <a1> 34 12 00 00 c7 04 24 54 30 9c c8 89 44 24 04 e8 58 a0 99 f7 31
+EIP: [<c89c3016>] my_oops_init+0x6/0x20 [oops] SS:ESP 0068:c57cbe1c
+---[ end trace 45eeb3d6ea8ff1ed ]---
+
+
+
+

objdump

+

Detailed information about the instruction that generated the oops can be found +using the objdump utility. Useful options to use are -d +to disassemble the code and -S for interleaving C code in assembly +language code. For efficient decoding, however, we need the address where the +kernel module was loaded. This can be found in /proc/modules.

+

Here's an example of using objdump on the above module to identify +the instruction that generated the oops:

+
faust:~/lab-01/modul-oops# cat /proc/modules
+oops 1280 1 - Loading 0xc89d4000
+netconsole 8352 0 - Live 0xc89ad000
+pcnet32 33412 0 - Live 0xc895a000
+ide_cd_mod 34952 0 - Live 0xc8903000
+crc32 4224 1 pcnet32, Live 0xc888a000
+cdrom 34848 1 ide_cd_mod, Live 0xc886d000
+
+faust:~/lab-01/modul-oops# objdump -dS --adjust-vma=0xc89d4000 oops.ko
+
+oops.ko:     file format elf32-i386
+
+
+Disassembly of section .text:
+
+c89d4000 <init_module>:
+#define OP_READ         0
+#define OP_WRITE        1
+#define OP_OOPS         OP_WRITE
+
+static int my_oops_init (void)
+{
+c89d4000:       55                      push   %ebp
+#else
+#error "Unknown op for oops!"
+#endif
+
+        return 0;
+}
+c89d4001:       31 c0                   xor    %eax,%eax
+#define OP_READ         0
+#define OP_WRITE        1
+#define OP_OOPS         OP_WRITE
+
+static int my_oops_init (void)
+{
+c89d4003:       89 e5                   mov    %esp,%ebp
+        int *a;
+
+        a = (int *) 0x00001234;
+#if OP_OOPS == OP_WRITE
+        *a = 3;
+c89d4005:       c7 05 34 12 00 00 03    movl   $0x3,0x1234
+c89d400c:       00 00 00
+#else
+#error "Unknown op for oops!"
+#endif
+
+        return 0;
+}
+c89d400f:       5d                      pop    %ebp
+c89d4010:       c3                      ret
+c89d4011:       eb 0d                   jmp    c89c3020 <cleanup_module>
+c89d4013:       90                      nop
+c89d4014:       90                      nop
+c89d4015:       90                      nop
+c89d4016:       90                      nop
+c89d4017:       90                      nop
+c89d4018:       90                      nop
+c89d4019:       90                      nop
+c89d401a:       90                      nop
+c89d401b:       90                      nop
+c89d401c:       90                      nop
+c89d401d:       90                      nop
+c89d401e:       90                      nop
+c89d401f:       90                      nop
+
+c89d4020 <cleanup_module>:
+
+static void my_oops_exit (void)
+{
+c89d4020:       55                      push   %ebp
+c89d4021:       89 e5                   mov    %esp,%ebp
+}
+c89d4023:       5d                      pop    %ebp
+c89d4024:       c3                      ret
+c89d4025:       90                      nop
+c89d4026:       90                      nop
+c89d4027:       90                      nop
+
+
+

Note that the instruction that generated the oops (c89d4005 identified +earlier) is:

+
+
c89d4005: c7 05 34 12 00 00 03 movl $0x3,0x1234
+

That is exactly what was expected - storing value 3 at 0x00001234.

+

The /proc/modules is used to find the address where a kernel module is +loaded. The --adjust-vma option allows you to display instructions +relative to 0xc89d4000. The -l option displays the number of +each line in the source code interleaved with the assembly language code.

+
+
+

addr2line

+

A simpler way to find the code that generated an oops is to use the +addr2line utility:

+
faust:~/lab-01/modul-oops# addr2line -e oops.o 0x5
+/root/lab-01/modul-oops/oops.c:23
+
+
+

Where 0x5 is the value of the program counter (EIP = c89d4005) that +generated the oops, minus the base address of the module (0xc89d4000) +according to /proc/modules

+
+
+

minicom

+

Minicom (or other equivalent utilities, e.g. picocom, +screen) is a utility that can be used to connect and interact with a +serial port. The serial port is the basic method for analyzing kernel messages +or interacting with an embedded system in the development phase. There are two +common ways to connect:

+
    +
  • a serial port where the device we are going to use is /dev/ttyS0
  • +
  • a serial USB port (FTDI) in which case the device we are going to use is +/dev/ttyUSB.
  • +
+

For the virtual machine used in the lab, the device that we need to use is +displayed after the virtual machine starts:

+
char device redirected to /dev/pts/20 (label virtiocon0)
+
+
+

Minicom use:

+
#for connecting via COM1 and using a speed of 115,200 bits per second
+minicom -b 115200 -D /dev/ttyS0
+
+#For USB serial port connection
+minicom -D /dev/ttyUSB0
+
+#To connect to the serial port of the virtual machine
+minicom -D /dev/pts/20
+
+
+
+
+

netconsole

+

Netconsole is a utility that allows logging of kernel debugging +messages over the network. This is useful when the disk logging system does not +work or when serial ports are not available or when the terminal does not +respond to commands. Netconsole comes in the form of a kernel +module.

+

To work, it needs the following parameters:

+
+
    +
  • port, IP address, and the source interface name of the debug station
  • +
  • port, MAC address, and IP address of the machine to which the debug +messages will be sent
  • +
+
+

These parameters can be configured when the module is inserted into the kernel, +or even while the module is inserted if it has been compiled with the +CONFIG_NETCONSOLE_DYNAMIC option.

+

An example configuration when inserting netconsole kernel module is +as follows:

+
alice:~# modprobe netconsole netconsole=6666@192.168.191.130/eth0,6000@192.168.191.1/00:50:56:c0:00:08
+
+
+

Thus, the debug messages on the station that has the address +192.168.191.130 will be sent to the eth0 interface, having source port +6666. The messages will be sent to 192.168.191.1 with the MAC address +00:50:56:c0:00:08, on port 6000.

+

Messages can be displayed on the destination station using netcat:

+
bob:~ # nc -l -p 6000 -u
+
+
+

Alternatively, the destination station can configure syslogd to +intercept these messages. More information can be found in +Documentation/networking/netconsole.txt.

+
+
+

Printk debugging

+

The two oldest and most useful debugging aids are Your Brain and Printf.

+

For debugging, a primitive way is often used, but it is quite effective: +printk debugging. Although a debugger can also be used, it is generally +not very useful: simple bugs (uninitialized variables, memory management +problems, etc.) can be easily localized by control messages and the +kernel-decoded oops message.

+

For more complex bugs, even a debugger cannot help us too much unless the +operating system structure is very well understood. When debugging a kernel +module, there are a lot of unknowns in the equation: multiple contexts (we have +multiple processes and threads running at a time), interrupts, virtual +memory, etc.

+

You can use printk to display kernel messages to user space. It is +similar to printf's functionality; the only difference is that the +transmitted message can be prefixed with a string of "<n>", where +n indicates the error level (loglevel) and has values between 0 and +7. Instead of "<n>", the levels can also be coded by symbolic +constants:

+
KERN_EMERG - n = 0
+KERN_ALERT - n = 1
+KERN_CRIT - n = 2
+KERN_ERR - n = 3
+KERN_WARNING - n = 4
+KERN_NOTICE - n = 5
+KERN_INFO - n = 6
+KERN_DEBUG - n = 7
+
+
+

The definitions of all log levels are found in linux/kern_levels.h. +Basically, these log levels are used by the system to route messages sent to +various outputs: console, log files in /var/log etc.

+
+

Note

+

To display printk messages in user space, the printk +log level must be of higher priority than console_loglevel +variable. The default console log level can be configured from +/proc/sys/kernel/printk.

+

For instance, the command:

+
echo 8 > /proc/sys/kernel/printk
+
+
+

will enable all the kernel log messages to be displayed in the +console. That is, the logging level has to be strictly less than the +console_loglevel variable. For example, if the +console_loglevel has a value of 5 (specific to +KERN_NOTICE), only messages with loglevel stricter than 5 +(i.e. KERN_EMERG, KERN_ALERT, KERN_CRIT, +KERN_ERR, KERN_WARNING) will be shown.

+
+

Console-redirected messages can be useful for quickly viewing the effect of +executing the kernel code, but they are no longer so useful if the kernel +encounters an irreparable error and the system freezes. In this case, the logs +of the system must be consulted, as they keep the information between system +restarts. These are found in /var/log and are text files, populated by +syslogd and klogd during the kernel run. syslogd and +klogd take the information from the virtual file system mounted in +/proc. In principle, with syslogd and klogd turned on, +all messages coming from the kernel will go to /var/log/kern.log.

+

A simpler version for debugging is using the /var/log/debug file. It +is populated only with the printk messages from the kernel with the +KERN_DEBUG log level.

+

Given that a production kernel (similar to the one we're probably running with) +contains only release code, our module is among the few that send messages +prefixed with KERN_DEBUG . In this way, we can easily navigate through the +/var/log/debug information by finding the messages corresponding to a +debugging session for our module.

+

Such an example would be the following:

+
# Clear the debug file of previous information (or possibly a backup)
+$ echo "New debug session" > /var/log/debug
+# Run the tests
+# If there is no critical error causing a kernel panic, check the output
+# if a critical error occurs and the machine only responds to a restart,
+  restart the system and check /var/log/debug.
+
+
+

The format of the messages must obviously contain all the information of +interest in order to detect the error, but inserting in the code printk +to provide detailed information can be as time-consuming as writing the code to +solve the problem. This is usually a trade-off between the completeness of the +debugging messages displayed using printk and the time it takes to +insert these messages into the text.

+

A very simple way, less time-consuming for inserting printk and +providing the possibility to analyze the flow of instructions for tests is the +use of the predefined constants __FILE__, __LINE__ and +__func__:

+
+
    +
  • __FILE__ is replaced by the compiler with the name of the source file +that is currently being compiled.
  • +
  • __LINE__ is replaced by the compiler with the line number on which the +current instruction is found in the current source file.
  • +
  • __func__ /__FUNCTION__ is replaced by the compiler with the name +of the function in which the current instruction is found.
  • +
+
+
+

Note

+

__FILE__ and __LINE__ are part of the ANSI C specification; +__func__ is part of the C99 specification; __FUNCTION__ is a GNU +C extension and is not portable. However, since we write code for the +Linux kernel, we can use it without any problems.

+
+

The following macro definition can be used in this case:

+
#define PRINT_DEBUG \
+       printk (KERN_DEBUG "[%s]: FUNC:%s: LINE:%d\n", __FILE__, \
+               __FUNCTION__, __LINE__)
+
+
+

Then, at each point where we want to see if it is "reached" in execution, +insert PRINT_DEBUG. This is a simple and quick method, and it can yield results by carefully +analyzing the output.

+

The dmesg command is used to view the messages printed with +printk but not appearing on the console.

+

To delete all previous messages from a log file, run:

+
cat /dev/null > /var/log/debug
+
+
+

To delete messages displayed by the dmesg command, run:

+
dmesg -c
+
+
+
+
+

Dynamic debugging

+

Dynamic dyndbg +debugging enables dynamic debugging activation/deactivation. +Unlike printk, it offers more advanced printk options for the +messages we want to display; it is very useful for complex modules or +troubleshooting subsystems. +This significantly reduces the amount of messages displayed, leaving only +those relevant for the debug context. To enable dyndbg, the kernel must be +compiled with the CONFIG_DYNAMIC_DEBUG option. Once configured, +pr_debug(), dev_dbg() and print_hex_dump_debug(), +print_hex_dump_bytes() can be dynamically enabled per call.

+

The /sys/kernel/debug/dynamic_debug/control file from the debugfs (where +/sys/kernel/debug is the path to which debugfs was mounted) is used to +filter messages or to view existing filters.

+
mount -t debugfs none /debug
+
+
+

Debugfs +is a simple file system, used as a kernel-space interface and +user-space interface to configure different debug options. Any debug utility +can create and use its own files /folders in debugfs.

+

For example, to display existing filters in dyndbg, you will use:

+
cat /debug/dynamic_debug/control
+
+
+

And to enable the debug message from line 1603 in the svcsock.c file:

+
echo 'file svcsock.c line 1603 +p' > /debug/dynamic_debug/control
+
+
+

The /debug/dynamic_debug/control file is not a regular file. It shows +the dyndbg settings on the filters. Writing in it with an echo will change +these settings (it will not actually make a write). Be aware that the file +contains settings for dyndbg debugging messages. Do not log in this file.

+
+

Dyndbg Options

+
    +
  • func - just the debug messages from the functions that have the same +name as the one defined in the filter.

    +
    echo 'func svc_tcp_accept +p' > /debug/dynamic_debug/control
    +
    +
    +
  • +
  • file - the name of the file(s) for which we want to display the debug +messages. It can be just the source name, but also the absolute path or +kernel-tree path.

    +
    file svcsock.c
    +file kernel/freezer.c
    +file /usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svcsock.c
    +
    +
    +
  • +
  • module - module name.

    +
    module sunrpc
    +
    +
    +
  • +
  • format - only messages whose display format contains the specified string.

    +
    format "nfsd: SETATTR"
    +
    +
    +
  • +
  • line - the line or lines for which we want to enable debug calls.

    +
    # Triggers debug messages between lines 1603 and 1605 in the svcsock.c file
    +$ echo 'file svcsock.c line 1603-1605 +p' > /sys/kernel/debug/dynamic_debug/control
    +# Enables debug messages from the beginning of the file to line 1605
    +$ echo 'file svcsock.c line -1605 +p' > /sys/kernel/debug/dynamic_debug/control
    +
    +
    +
  • +
+

In addition to the above options, a series of flags can be added, removed, or set +with operators +, - or =:

+
+
    +
  • p activates the pr_debug() .
  • +
  • f includes the name of the function in the printed message.
  • +
  • l includes the line number in the printed message.
  • +
  • m includes the module name in the printed message.
  • +
  • t includes the thread id if it is not called from interrupt context
  • +
  • _ no flag is set.
  • +
+
+
+
+
+

KDB: Kernel debugger

+

The kernel debugger has proven to be very useful to facilitate the development and +debugging process. One of its main advantages is the possibility to perform live debugging. +This allows us to monitor, in real time, the accesses to memory or even modify the memory +while debugging. +The debugger has been integrated in the mainline kernel starting with version 2.6.26-rc1. +KDB is not a source debugger, but for a complete analysis it can be used in parallel with +gdb and symbol files -- see the GDB debugging section.

+

To use KDB, you have the following options:

+
+
    +
  • non-usb keyboard + VGA text console
  • +
  • serial port console
  • +
  • USB EHCI debug port
  • +
+
+

For the lab, we will use a serial interface connected to the host. +The following command will activate GDB over the serial port:

+
echo hvc0 > /sys/module/kgdboc/parameters/kgdboc
+
+
+

KDB is a stop mode debugger, which means that, while it is active, all the other processes +are stopped. The kernel can be forced to enter KDB during execution using the following +SysRq command

+
echo g > /proc/sysrq-trigger
+
+
+

or by using the key combination Ctrl+O g in a terminal connected to the serial port +(for example using minicom).

+

KDB has various commands to control and define the context of the debugged system:

+
+
    +
  • lsmod, ps, kill, dmesg, env, bt (backtrace)
  • +
  • dump trace logs
  • +
  • hardware breakpoints
  • +
  • modifying memory
  • +
+
+

For a better description of the available commands you can use the help command in +the KDB shell. +In the next example, you can notice a simple KDB usage example which sets a hardware +breakpoint to monitor the changes of the mVar variable.

+
# trigger KDB
+echo g > /proc/sysrq-trigger
+# or if we are connected to the serial port issue
+Ctrl-O g
+# breakpoint on write access to the mVar variable
+kdb> bph mVar dataw
+# return from KDB
+kdb> go
+
+
+
+

Note

+

If you want to learn how to easily browse through the Linux source code +and how to debug kernel code, read the Good to know +section.

+
+
+
+
+

Exercises

+
+

Remarks

+
+

Note

+
    +
  • Usually, the steps used to develop a kernel module are the +following:
      +
    • editing the module source code (on the physical machine);
    • +
    • module compilation (on the physical machine);
    • +
    • generation of the minimal image for the virtual machine; +this image contains the kernel, your module, busybox and +eventually test programs;
    • +
    • starting the virtual machine using QEMU;
    • +
    • running the tests in the virtual machine.
    • +
    +
  • +
  • When using cscope, use ~/src/linux. +If there is no cscope.out file, you can generate it using +the command make ARCH=x86 cscope.
  • +
  • You can find more details about the virtual machine at +Recommended Setup.
  • +
+
+
+

Important

+

Before solving an exercise, carefully read all its bullets.

+
+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is kernel_modules. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/kernel_modules/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+
+

1. Kernel module

+

To work with the kernel modules, we will follow the steps described +above.

+
+
Generate the skeleton for the task named 1-2-test-mod then build the module,
+
by running the following command in tools/labs.
+
+
$ LABS=kernel_modules make skels
+$ make build
+
+
+

These commands will build all the modules in the current +lab skeleton.

+
+

Warning

+

Until after solving exercise 3, you will get a compilation error for +3-error-mod. To avoid this issue, remove the directory +skels/kernel_modules/3-error-mod/ and remove the corresponding +line from skels/Kbuild.

+
+

Start the VM using make console, and perform the following tasks:

+
    +
  • load the kernel module.
  • +
  • list the kernel modules and check if current module is present
  • +
  • unload the kernel module
  • +
  • view the messages displayed at loading/unloading the kernel module using +dmesg command
  • +
+
+

Note

+

Read Loading/unloading a kernel module section. When unloading +a kernel module, you can specify only the module name +(without extension).

+
+
+
+

2. Printk

+

Watch the virtual machine console. Why were the messages displayed directly +to the virtual machine console?

+

Configure the system such that the messages are not displayed directly +on the serial console, and they can only be inspected using dmesg.

+
+

Hint

+

One option is to set the console log level by writing +the desired level to /proc/sys/kernel/printk. +Use a value smaller than the level used for the prints in +the source code of the module.

+
+

Load/unload the module again. +The messages should not be printed to the virtual machine console, +but they should be visible when running dmesg.

+
+
+

3. Error

+

Generate the skeleton for the task named 3-error-mod. Compile the +sources and get the corresponding kernel module.

+

Why have compilation +errors occurred? Hint: How does this module differ from the previous module?

+

Modify the module to solve the cause of those errors, then compile and test +the module.

+
+
+

4. Sub-modules

+

Inspect the C source files mod1.c and mod2.c in 4-multi-mod/. +Module 2 contains only the definition of a function used by module 1.

+

Change the Kbuild file to create the multi_mod.ko module from the +two C source files.

+
+

Hint

+

Read the Compiling kernel modules section of the lab.

+
+

Compile, copy, boot the VM, load and unload the kernel module. Make sure messages +are properly displayed on the console.

+
+
+

5. Kernel oops

+

Enter the directory for the task 5-oops-mod and inspect the +C source file. Notice where the problem will occur. Add the compilation flag +-g in the Kbuild file.

+
+

Hint

+

Read Compiling kernel modules section of the lab.

+
+

Compile the corresponding module and load it into the kernel. Identify the memory +address at which the oops appeared.

+
+

Hint

+

Read the Debugging section of the lab. To identify the +address, follow the oops message and extract the value of +the instruction pointer (EIP) register.

+
+

Determine which instruction has triggered the oops.

+
+

Hint

+

Use the /proc/modules information to get the load address of +the kernel module. Use, on the physical machine, objdump +and/or addr2line. Objdump needs debugging support for +compilation! Read the lab's objdump and addr2line +sections.

+
+

Try to unload the kernel module. Notice that the operation does not +work because there are references from the kernel module within the +kernel since the oops; until the release of those references (which is +almost impossible in the case of an oops), the module cannot be +unloaded.

+
+
+

6. Module parameters

+

Enter the directory for the task 6-cmd-mod and inspect the C +cmd_mod.c source file. Compile and copy the associated module and +load the kernel module to see the printk message. Then unload the +module from the kernel.

+

Without modifying the sources, load the kernel module so that the +message shown is Early bird gets tired.

+
+

Hint

+

The str variable can be changed by passing a parameter to +the module. Find more information here.

+
+
+
+

7. Proc info

+

Check the skeleton for the task named 7-list-proc. Add code to +display the Process ID (PID) and the executable name for the current +process.

+

Follow the commands marked with TODO. +The information must be displayed both when loading and unloading the +module.

+
+

Note

+
    +
  • In the Linux kernel, a process is described by the +struct task_struct. Use LXR or cscope to find the +definition of struct task_struct.
  • +
  • To find the structure field that contains the name of the +executable, look for the "executable" comment.
  • +
  • The pointer to the structure of the current process +running at a given time in the kernel is given by the +current variable (of the type +struct task_struct*).
  • +
+
+
+

Hint

+

To use current you'll need to include the header +in which the struct task_struct is defined, i.e. +linux/sched.h.

+
+

Compile, copy, boot the VM and load the module. Unload the kernel module.

+

Repeat the loading/unloading operation. Note that the PIDs of the +displayed processes differ. This is because a process is created +from the executable /sbin/insmod when the module is loaded and +when the module is unloaded a process is created from the executable +/sbin/rmmod.

+
+
+
+

Good to know

+

The following sections contain useful information for getting used to the Linux +kernel code and debugging techniques.

+
+
+

Source code navigation

+
+

cscope

+

Cscope is a tool for +efficient navigation of C sources. To use it, a cscope database must +be generated from the existing sources. In a Linux tree, the command +make ARCH=x86 cscope is sufficient. Specification of the +architecture through the ARCH variable is optional but recommended; +otherwise, some architecture dependent functions will appear multiple +times in the database.

+

You can build the cscope database with the command make +ARCH=x86 COMPILED_SOURCE=1 cscope. This way, the cscope database will +only contain symbols that have already been used in the compile +process before, thus resulting in better performance when searching +for symbols.

+

Cscope can also be used as stand-alone, but it is more useful when +combined with an editor. To use cscope with vim, it is necessary to +install both packages and add the following lines to the file +.vimrc (the machine in the lab already has the settings):

+
if has("cscope")
+        " Look for a 'cscope.out' file starting from the current directory,
+        " going up to the root directory.
+        let s:dirs = split(getcwd(), "/")
+        while s:dirs != []
+                let s:path = "/" . join(s:dirs, "/")
+                if (filereadable(s:path . "/cscope.out"))
+                        execute "cs add " . s:path . "/cscope.out " . s:path . " -v"
+                        break
+                endif
+                let s:dirs = s:dirs[:-2]
+        endwhile
+
+        set csto=0  " Use cscope first, then ctags
+        set cst     " Only search cscope
+        set csverb  " Make cs verbose
+
+        nmap `<C-\>`s :cs find s `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap `<C-\>`g :cs find g `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap `<C-\>`c :cs find c `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap `<C-\>`t :cs find t `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap `<C-\>`e :cs find e `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap `<C-\>`f :cs find f `<C-R>`=expand("`<cfile>`")`<CR>``<CR>`
+        nmap `<C-\>`i :cs find i ^`<C-R>`=expand("`<cfile>`")`<CR>`$`<CR>`
+        nmap `<C-\>`d :cs find d `<C-R>`=expand("`<cword>`")`<CR>``<CR>`
+        nmap <F6> :cnext <CR>
+        nmap <F5> :cprev <CR>
+
+        " Open a quickfix window for the following queries.
+        set cscopequickfix=s-,c-,d-,i-,t-,e-,g-
+endif
+
+
+

The script searches for a file called cscope.out in the current directory, or +in parent directories. If vim finds this file, you can use the shortcut Ctrl+] +or Ctrl+\ g (the combination control-\ followed by g) to jump directly to +the definition of the word under the cursor (function, variable, structure, etc.). +Similarly, you can use Ctrl+\ s to go where the word under the cursor is used.

+

You can take a cscope-enabled .vimrc file (also contains other goodies) from +https://github.com/ddvlad/cfg/blob/master/_vimrc. +The following guidelines are based on this file, but also show basic vim commands +that have the same effect.

+

If there is more than one result (usually there is) you can move between them +using F6 and F5 (:cnext and :cprev). +You can also open a new panel showing the results using :copen. To close +the panel, use the :cclose command.

+

To return to the previous location, use Ctrl+o (o, not zero). +The command can be used multiple times and works even if cscope changed the +file you are currently editing.

+

To go to a symbol definition directly when vim starts, use vim -t <symbol_name> +(for example vim -t task_struct). Otherwise, if you started vim and want +to search for a symbol by name, use cs find g <symbol_name> (for example +cs find g task_struct).

+

If you found more than one result and opened a panel showing all the matches +(using :copen) and you want to find a symbol of type structure, +it is recommended to search in the results panel (using / -- slash) +the character { (opening brace).

+
+

Important

+

You can get a summary of all the cscope commands using :cs help.

+

For more info, use the vim built-in help command: :h cscope or :h copen.

+
+

If you use emacs, install the xcscope-el package and +add the following lines in ~/.emacs.

+
(require 'xcscope)
+(cscope-setup)
+
+
+

These commands will activate cscope for the C and C++ modes automatically. +C-c s is the key bindings prefix and C-c s s is used to +search for a symbol (if you call it when the cursor is over a word, +it will use that). For more details, check https://github.com/dkogan/xcscope.el

+
+
+

clangd

+

Clangd is a language server that provides tools +for navigating C and C++ code. +Language Server Protocol +facilitates features like go-to-definition, find-references, hover, completion, etc., +using semantic whole project analysis.

+

Clangd requires a compilation database to understand the kernel source code. +It can be generated with:

+
make defconfig
+make
+scripts/clang-tools/gen_compile_commands.py
+
+
+

LSP clients:

+ +
+
+

Kscope

+

For a simpler interface, Kscope +is a cscope frontend which uses Qt. It is lightweight, very fast and very +easy to use. It allows searching using regular expressions, call graphs, etc. +Kscope is no longer maintained.

+

There is also a port +of version 1.6 for Qt4 and KDE 4 which keeps the integration of the text +editor Kate and is easier to use than the last version on SourceForge.

+
+
+

LXR Cross-Reference

+

LXR (LXR Cross-Reference) is a tool that allows indexing and +referencing the symbols in the source code of a program using +a web interface. The web interface shows links to +locations in files where a symbol is defined or used. Development website +for LXR is http://sourceforge.net/projects/lxr. Similar tools +are OpenGrok and +Gonzui.

+

Although LXR was originally intended for the Linux kernel sources, it is +also used in the sources of Mozilla, +Apache HTTP Server and +FreeBSD.

+

There are a number of sites that use LXR for cross-referencing the +sources of the Linux kernel, the main site being the original site of +development which does not work anymore. You can +use https://elixir.bootlin.com/.

+

LXR allows searching for an identifier (symbol), after a free text +or after a file name. The main feature and, at the same +time, the main advantage provided is the ease of finding the declaration +of any global identifier. This way, it facilitates quick access to function +declarations, variables, macro definitions and the code can be easily +navigated. Also, the fact that it can detect what code areas are affected +when a variable or function is changed is a real advantage in the development +and debugging phase.

+
+
+

SourceWeb

+

SourceWeb is a source code indexer +for C and C++. It uses the +framework +provided by the Clang compiler to index the code.

+

The main difference between cscope and SourceWeb is the fact that SourceWeb +is, in a way, a compiler pass. SourceWeb doesn't index all the code, but +only the code that was effectively compiled by the compiler. This way, some +problems are eliminated, such as ambiguities about which variant of a function +defined in multiple places is used. This also means that the indexing takes +more time, because the compiled files must pass one more time through +the indexer to generate the references.

+

Usage example:

+
make oldconfig
+sw-btrace make -j4
+sw-btrace-to-compile-db
+sw-clang-indexer --index-project
+sourceweb index
+
+
+

sw-btrace is a script that adds the libsw-btrace.so +library to LD_PRELOAD. This way, the library is loaded by +every process started by make (basically, the compiler), +registers the commands used to start the processes and generates +a file called btrace.log. This file is then used by +sw-btrace-to-compile-db which converts it to a format defined +by clang: JSON Compilation Database. +This JSON Compilation Database resulted from the above steps is then +used by the indexer, which makes one more pass through the compiled +source files and generates the index used by the GUI.

+

Word of advice: don't index the sources you are working with, but use +a copy, because SourceWeb doesn't have, at this moment, the capability +to regenerate the index for a single file and you will have to regenerate +the complete index.

+
+
+
+

Kernel Debugging

+

Debugging a kernel is a much more difficult process than the debugging +of a program, because there is no support from the operating system. +This is why this process is usually done using two computers, connected +on serial interfaces.

+
+

gdb (Linux)

+

A simpler debug method on Linux, but with many disadvantages, +is local debugging, using gdb, +the uncompressed kernel image (vmlinux) and /proc/kcore +(the real-time kernel image). This method is usually used to inspect +the kernel and detect certain inconsistencies while it runs. The +method is useful especially if the kernel was compiled using the +-g option, which keeps debug information. Some well-known +debug techniques can't be used by this method, such as breakpoints +of data modification.

+
+

Note

+

Because /proc is a virtual filesystem, /proc/kcore +does not physically exist on the disk. It is generated on-the-fly +by the kernel when a program tries to access /proc/kcore.

+

It is used for debugging purposes.

+

From man proc, we have:

+
/proc/kcore
+This file represents the physical memory of the system and is stored in the ELF core file format.  With this pseudo-file, and
+an unstripped kernel (/usr/src/linux/vmlinux) binary, GDB can be used to examine the current state of any kernel data struc‐
+tures.
+
+
+
+

The uncompressed kernel image offers information about the data structures +and symbols it contains.

+
student@eg106$ cd ~/src/linux
+student@eg106$ file vmlinux
+vmlinux: ELF 32-bit LSB executable, Intel 80386, ...
+student@eg106$ nm vmlinux | grep sys_call_table
+c02e535c R sys_call_table
+student@eg106$ cat System.map | grep sys_call_table
+c02e535c R sys_call_table
+
+
+

The nm utility is used to show the symbols in an object or +executable file. In our case, vmlinux is an ELF file. Alternately, +we can use the file System.map to view information about the +symbols in kernel.

+

Then we use gdb to inspect the symbols using the uncompressed +kernel image. A simple gdb session is the following:

+
student@eg106$ cd ~/src/linux
+student@eg106$ gdb --quiet vmlinux
+Using host libthread_db library "/lib/tls/libthread_db.so.1".
+(gdb) x/x 0xc02e535c
+0xc02e535c `<sys_call_table>`:    0xc011bc58
+(gdb) x/16 0xc02e535c
+0xc02e535c `<sys_call_table>`:    0xc011bc58      0xc011482a      0xc01013d3     0xc014363d
+0xc02e536c `<sys_call_table+16>`: 0xc014369f      0xc0142d4e      0xc0142de5     0xc011548b
+0xc02e537c `<sys_call_table+32>`: 0xc0142d7d      0xc01507a1      0xc015042c     0xc0101431
+0xc02e538c `<sys_call_table+48>`: 0xc014249e      0xc0115c6c      0xc014fee7     0xc0142725
+(gdb) x/x sys_call_table
+0xc011bc58 `<sys_restart_syscall>`:       0xffe000ba
+(gdb) x/x &sys_call_table
+0xc02e535c `<sys_call_table>`:    0xc011bc58
+(gdb) x/16 &sys_call_table
+0xc02e535c `<sys_call_table>`:    0xc011bc58      0xc011482a      0xc01013d3     0xc014363d
+0xc02e536c `<sys_call_table+16>`: 0xc014369f      0xc0142d4e      0xc0142de5     0xc011548b
+0xc02e537c `<sys_call_table+32>`: 0xc0142d7d      0xc01507a1      0xc015042c     0xc0101431
+0xc02e538c `<sys_call_table+48>`: 0xc014249e      0xc0115c6c      0xc014fee7     0xc0142725
+(gdb) x/x sys_fork
+0xc01013d3 `<sys_fork>`:  0x3824548b
+(gdb) disass sys_fork
+Dump of assembler code for function sys_fork:
+0xc01013d3 `<sys_fork+0>`:        mov    0x38(%esp),%edx
+0xc01013d7 `<sys_fork+4>`:        mov    $0x11,%eax
+0xc01013dc `<sys_fork+9>`:        push   $0x0
+0xc01013de `<sys_fork+11>`:       push   $0x0
+0xc01013e0 `<sys_fork+13>`:       push   $0x0
+0xc01013e2 `<sys_fork+15>`:       lea    0x10(%esp),%ecx
+0xc01013e6 `<sys_fork+19>`:       call   0xc0111aab `<do_fork>`
+0xc01013eb `<sys_fork+24>`:       add    $0xc,%esp
+0xc01013ee `<sys_fork+27>`:       ret
+End of assembler dump.
+
+
+

It can be noticed that the uncompressed kernel image was used as an argument +for gdb. The image can be found in the root of the kernel sources +after compilation.

+

A few commands used for debugging using gdb are:

+
    +
  • x (examine) - Used to show the contents of the memory area +whose address is specified as an argument to the command (this address +can be the value of a physical address, a symbol or the address of a +symbol). It can take as arguments (preceded by /): the format +to display the data in (x for hexadecimal, d for +decimal, etc.), how many memory units to display and the size of a +memory unit.
  • +
  • disassemble - Used to disassemble a function.
  • +
  • p (print) - Used to evaluate and show the value of an +expression. The format to show the data in can be specified as +an argument (/x for hexadecimal, /d for decimal, etc.).
  • +
+

The analysis of the kernel image is a method of static analysis. If we +want to perform dynamic analysis (analyzing how the kernel runs, not +only its static image) we can use /proc/kcore; this is a dynamic +image (in memory) of the kernel.

+
student@eg106$ gdb ~/src/linux/vmlinux /proc/kcore
+Core was generated by `root=/dev/hda3 ro'.
+#0  0x00000000 in ?? ()
+(gdb) p sys_call_table
+$1 = -1072579496
+(gdb) p /x sys_call_table
+$2 = 0xc011bc58
+(gdb) p /x &sys_call_table
+$3 = 0xc02e535c
+(gdb) x/16 &sys_call_table
+0xc02e535c `<sys_call_table>`:    0xc011bc58      0xc011482a      0xc01013d3     0xc014363d
+0xc02e536c `<sys_call_table+16>`: 0xc014369f      0xc0142d4e      0xc0142de5     0xc011548b
+0xc02e537c `<sys_call_table+32>`: 0xc0142d7d      0xc01507a1      0xc015042c     0xc0101431
+0xc02e538c `<sys_call_table+48>`: 0xc014249e      0xc0115c6c      0xc014fee7     0xc0142725
+
+
+

Using the dynamic image of the kernel is useful for detecting rootkits.

+ +
+
+

Getting a stack trace

+

Sometimes, you will want information about the path the execution +follows to reach a certain point. You can determine this information using +cscope or LXR, but some functions are called from many +execution paths, which makes this method difficult.

+

In these situations, it is useful to get a stack trace, which can be +simply done using the function dump_stack().

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab10-networking.html b/refs/pull/405/merge/so2/lab10-networking.html new file mode 100644 index 00000000..3de3e5af --- /dev/null +++ b/refs/pull/405/merge/so2/lab10-networking.html @@ -0,0 +1,1418 @@ + + + + + + SO2 Lab 10 - Networking — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 10 - Networking

+
+

Lab objectives

+
+
    +
  • Understanding the Linux kernel networking architecture
  • +
  • Acquiring practical IP packet management skills using a packet filter or +firewall
  • +
  • Familiarize yourself with how to use sockets at the Linux kernel level
  • +
+
+
+
+

Overview

+

The development of the Internet has led to an exponential increase in network +applications and, as a consequence, to increasing the speed and productivity +requirements of an operating system's networking subsystem. The networking +subsystem is not an essential component of an operating system kernel (the Linux +kernel can be compiled without networking support). It is, however, quite +unlikely for a computing system (or even an embedded device) to have a +non-networked operating system due to the need for connectivity. Modern operating +systems use the TCP/IP stack. Their kernel +implements protocols up to the transport layer, while application layer protocols +are typically implemented in user space (HTTP, FTP, SSH, etc.).

+
+

Networking in user space

+

In user space the abstraction of network communication is the socket. The +socket abstracts a communication channel and is the kernel-based TCP/IP stack +interaction interface. An IP socket is associated with an IP address, the +transport layer protocol used (TCP, UDP etc) and a port. Common function calls +that use sockets are: creation (socket), initialization +(bind), connecting (connect), waiting for a connection +(listen, accept), closing a socket (close).

+

Network communication is accomplished via read/write or recv/send calls +for TCP sockets and recvfrom/sendto for UDP sockets. Transmission and +reception operations are transparent to the application, leaving encapsulation +and transmission over network at the kernel's discretion. However, it is +possible to implement the TCP/IP stack in user space using raw sockets (the +PF_PACKET option when creating a socket), or implementing an application +layer protocol in kernel (TUX web server).

+

For more details about user space programming using sockets, see Beej's Guide to +Network Programming Using Internet +Sockets.

+
+
+
+

Linux networking

+

The Linux kernel provides three basic structures for working with network +packets: struct socket, struct sock and struct +sk_buff.

+

The first two are abstractions of a socket:

+
+
    +
  • struct socket is an abstraction very close to user space, i.e. the BSD +sockets used to program +network applications;
  • +
  • struct sock or INET socket in Linux terminology is the network +representation of a socket.
  • +
+
+

The two structures are related: the struct socket contains an INET +socket field, and the struct sock has a BSD socket that holds it.

+

The struct sk_buff structure is the representation of a network packet +and its status. The structure is created when the kernel receives a packet, +either from user space or from the network interface.

+
+

The struct socket structure

+

The struct socket structure is the kernel representation of a BSD +socket; the operations that can be executed on it are similar to those offered +by the kernel (through system calls). Common operations with sockets +(creation, initialization/bind, closing, etc.) result in specific system +calls; they work with the struct socket structure.

+

The struct socket operations are described in net/socket.c and +are independent of the protocol type. The struct socket structure is thus +a generic interface over particular network operations implementations. +Typically, the names of these operations begin with the sock_ prefix.

+
+

Operations on the socket structure

+

Socket operations are:

+
+
Creation
+

Creation is similar to calling the socket() function in user space, but the +struct socket created will be stored in the res parameter:

+
+
    +
  • int sock_create(int family, int type, int protocol, struct socket **res) +creates a socket after the socket() system call;
  • +
  • int sock_create_kern(struct net *net, int family, int type, int protocol, +struct socket **res) creates a kernel socket;
  • +
  • int sock_create_lite(int family, int type, int protocol, struct socket **res) +creates a kernel socket without parameter sanity checks.
  • +
+
+

The parameters of these calls are as follows:

+
+
    +
  • net, where it is present, used as reference to the network namespace used; +we will usually initialize it with init_net;
  • +
  • family represents the family of protocols used in the transfer of +information; they usually begin with the PF_ (Protocol Family) string; +the constants representing the family of protocols used are found in +linux/socket.h, of which the most commonly used is PF_INET, for +TCP/IP protocols;
  • +
  • type is the type of socket; the constants used for this parameter are +found in linux/net.h, of which the most used are SOCK_STREAM for +a connection based source-to-destination communication and SOCK_DGRAM +for connectionless communication;
  • +
  • protocol represents the protocol used and is closely related to the +type parameter; the constants used for this parameter are found in +linux/in.h, of which the most used are IPPROTO_TCP for TCP and +IPPROTO_UDP for UDP.
  • +
+
+

To create a TCP socket in kernel space, you must call:

+
struct socket *sock;
+int err;
+
+err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+if (err < 0) {
+        /* handle error */
+}
+
+
+

and for creating UDP sockets:

+
struct socket *sock;
+int err;
+
+err = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+if (err < 0) {
+        /* handle error */
+}
+
+
+

A usage sample is part of the sys_socket() system call handler:

+
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
+{
+      int retval;
+      struct socket *sock;
+      int flags;
+
+      /* Check the SOCK_* constants for consistency.  */
+      BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
+      BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
+      BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
+      BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
+
+      flags = type & ~SOCK_TYPE_MASK;
+      if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+              return -EINVAL;
+      type &= SOCK_TYPE_MASK;
+
+      if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
+              flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+
+      retval = sock_create(family, type, protocol, &sock);
+      if (retval < 0)
+              goto out;
+
+      return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
+}
+
+
+
+
+
Closing
+

Close connection (for sockets using connection) and release associated +resources:

+
+
    +
  • void sock_release(struct socket *sock) calls the release function in +the ops field of the socket structure:
  • +
+
+
void sock_release(struct socket *sock)
+{
+      if (sock->ops) {
+              struct module *owner = sock->ops->owner;
+
+              sock->ops->release(sock);
+              sock->ops = NULL;
+              module_put(owner);
+      }
+      //...
+}
+
+
+
+
+
Sending/receiving messages
+

The messages are sent/received using the following functions:

+
+
    +
  • int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags);
  • +
  • int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags);
  • +
  • int sock_sendmsg(struct socket *sock, struct msghdr *msg);
  • +
  • int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size);
  • +
+
+

The message sending/receiving functions will then call the sendmsg/ +recvmsg function in the ops field of the socket. Functions +containing kernel_ as a prefix are used when the socket is used in the +kernel.

+

The parameters are:

+
+
    +
  • msg, a struct msghdr structure, containing the message to be +sent/received. Among the important components of this structure are msg_name +and msg_namelen, which, for UDP sockets, must be filled in with the address +to which the message is sent (struct sockaddr_in);
  • +
  • vec, a struct kvec structure, containing a pointer to the buffer +containing its data and size; as can be seen, it has a similar structure to the +struct iovec structure (the struct iovec structure +corresponds to the user space data, and the struct kvec structure +corresponds to kernel space data).
  • +
+
+

A usage example can be seen in the sys_sendto() system call handler:

+
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
+              unsigned int, flags, struct sockaddr __user *, addr,
+              int, addr_len)
+{
+      struct socket *sock;
+      struct sockaddr_storage address;
+      int err;
+      struct msghdr msg;
+      struct iovec iov;
+      int fput_needed;
+
+      err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter);
+      if (unlikely(err))
+              return err;
+      sock = sockfd_lookup_light(fd, &err, &fput_needed);
+      if (!sock)
+              goto out;
+
+      msg.msg_name = NULL;
+      msg.msg_control = NULL;
+      msg.msg_controllen = 0;
+      msg.msg_namelen = 0;
+      if (addr) {
+              err = move_addr_to_kernel(addr, addr_len, &address);
+              if (err < 0)
+                      goto out_put;
+              msg.msg_name = (struct sockaddr *)&address;
+              msg.msg_namelen = addr_len;
+      }
+      if (sock->file->f_flags & O_NONBLOCK)
+              flags |= MSG_DONTWAIT;
+      msg.msg_flags = flags;
+      err = sock_sendmsg(sock, &msg);
+
+out_put:
+      fput_light(sock->file, fput_needed);
+out:
+      return err;
+}
+
+
+
+
+
+

The struct socket fields

+
/**
+ *  struct socket - general BSD socket
+ *  @state: socket state (%SS_CONNECTED, etc)
+ *  @type: socket type (%SOCK_STREAM, etc)
+ *  @flags: socket flags (%SOCK_NOSPACE, etc)
+ *  @ops: protocol specific socket operations
+ *  @file: File back pointer for gc
+ *  @sk: internal networking protocol agnostic socket representation
+ *  @wq: wait queue for several uses
+ */
+struct socket {
+      socket_state            state;
+
+      short                   type;
+
+      unsigned long           flags;
+
+      struct socket_wq __rcu  *wq;
+
+      struct file             *file;
+      struct sock             *sk;
+      const struct proto_ops  *ops;
+};
+
+
+

The noteworthy fields are:

+
+
    +
  • ops - the structure that stores pointers to protocol-specific functions;
  • +
  • sk - The INET socket associated with it.
  • +
+
+
+
The struct proto_ops structure
+

The struct proto_ops structure contains the implementations of the specific +operations implemented (TCP, UDP, etc.); these functions will be called from +generic functions through struct socket (sock_release(), +sock_sendmsg(), etc.)

+

The struct proto_ops structure therefore contains a number of function +pointers for specific protocol implementations:

+
struct proto_ops {
+      int             family;
+      struct module   *owner;
+      int             (*release)   (struct socket *sock);
+      int             (*bind)      (struct socket *sock,
+                                    struct sockaddr *myaddr,
+                                    int sockaddr_len);
+      int             (*connect)   (struct socket *sock,
+                                    struct sockaddr *vaddr,
+                                    int sockaddr_len, int flags);
+      int             (*socketpair)(struct socket *sock1,
+                                    struct socket *sock2);
+      int             (*accept)    (struct socket *sock,
+                                    struct socket *newsock, int flags, bool kern);
+      int             (*getname)   (struct socket *sock,
+                                    struct sockaddr *addr,
+                                    int peer);
+      //...
+}
+
+
+

The initialization of the ops field from struct socket is done in +the __sock_create() function, by calling the create() function, +specific to each protocol; the corresponding call in the implementation of the +__sock_create() function is:

+
//...
+      err = pf->create(net, sock, protocol, kern);
+      if (err < 0)
+              goto out_module_put;
+//...
+
+
+

This will instantiate the function pointers with calls specific to the protocol +type associated with the socket. The sock_register() and +sock_unregister() calls are used to fill the net_families vector.

+

For the rest of the socket operations (other than creating, closing, and +sending/receiving a message as described above in the Operations on the socket +structure section), the functions referenced by the pointers in this structure will be +called. For example, for bind, which associates a socket with an address and port on +the local machine, we will have the following code sequence:

+
#define MY_PORT 60000
+
+struct sockaddr_in addr = {
+      .sin_family = AF_INET,
+      .sin_port = htons (MY_PORT),
+      .sin_addr = { htonl (INADDR_LOOPBACK) }
+};
+
+//...
+      err = sock->ops->bind (sock, (struct sockaddr *) &addr, sizeof(addr));
+      if (err < 0) {
+              /* handle error */
+      }
+//...
+
+
+

As you can see, for transmitting the address and port information that +will be associated with the socket, a struct sockaddr_in is filled.

+
+
+
+
+

The struct sock structure

+

The struct sock describes an INET socket. Such a structure is +associated with a user space socket and implicitly with a struct +socket structure. The structure is used to store information about the status +of a connection. The structure's fields and associated operations usually begin +with the sk_ string. Some fields are listed below:

+
struct sock {
+      //...
+      unsigned int            sk_padding : 1,
+                              sk_no_check_tx : 1,
+                              sk_no_check_rx : 1,
+                              sk_userlocks : 4,
+                              sk_protocol  : 8,
+                              sk_type      : 16;
+      //...
+      struct socket           *sk_socket;
+      //...
+      struct sk_buff          *sk_send_head;
+      //...
+      void                    (*sk_state_change)(struct sock *sk);
+      void                    (*sk_data_ready)(struct sock *sk);
+      void                    (*sk_write_space)(struct sock *sk);
+      void                    (*sk_error_report)(struct sock *sk);
+      int                     (*sk_backlog_rcv)(struct sock *sk,
+                                                struct sk_buff *skb);
+      void                    (*sk_destruct)(struct sock *sk);
+};
+
+
+

+
+
    +
  • sk_protocol is the type of protocol used by the socket;
  • +
  • sk_type is the socket type (SOCK_STREAM, SOCK_DGRAM, etc.);
  • +
  • sk_socket is the BSD socket that holds it;
  • +
  • sk_send_head is the list of struct sk_buff structures for +transmission;
  • +
  • the function pointers at the end are callbacks for different situations.
  • +
+
+

Initializing the struct sock and attaching it to a BSD socket is done +using the create callback registered in net_families (called by +__sock_create()). Here's how to initialize the struct sock +structure for the IP protocol, in the inet_create() function:

+
/*
+ *    Create an inet socket.
+ */
+
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+                     int kern)
+{
+
+      struct sock *sk;
+
+      //...
+      err = -ENOBUFS;
+      sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
+      if (!sk)
+              goto out;
+
+      err = 0;
+      if (INET_PROTOSW_REUSE & answer_flags)
+              sk->sk_reuse = SK_CAN_REUSE;
+
+
+      //...
+      sock_init_data(sock, sk);
+
+      sk->sk_destruct    = inet_sock_destruct;
+      sk->sk_protocol    = protocol;
+      sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+      //...
+}
+
+
+
+
+

The struct sk_buff structure

+

The struct sk_buff (socket buffer) describes a network packet. The +structure fields contain information about both the header and packet contents, +the protocols used, the network device used, and pointers to the other +struct sk_buff. A summary description of the content of the structure +is presented below:

+
struct sk_buff {
+      union {
+              struct {
+                      /* These two members must be first. */
+                      struct sk_buff          *next;
+                      struct sk_buff          *prev;
+
+                      union {
+                              struct net_device       *dev;
+                              /* Some protocols might use this space to store information,
+                               * while device pointer would be NULL.
+                               * UDP receive path is one user.
+                               */
+                              unsigned long           dev_scratch;
+                      };
+              };
+
+              struct rb_node  rbnode; /* used in netem & tcp stack */
+      };
+      struct sock             *sk;
+
+        union {
+              ktime_t         tstamp;
+              u64             skb_mstamp;
+      };
+
+      /*
+       * This is the control buffer. It is free to use for every
+       * layer. Please put your private variables there. If you
+       * want to keep them across layers you have to do a skb_clone()
+       * first. This is owned by whoever has the skb queued ATM.
+       */
+      char                    cb[48] __aligned(8);
+
+      unsigned long           _skb_refdst;
+      void                    (*destructor)(struct sk_buff *skb);
+        union {
+              struct {
+                      unsigned long   _skb_refdst;
+                      void            (*destructor)(struct sk_buff *skb);
+              };
+              struct list_head        tcp_tsorted_anchor;
+      };
+      /* ... */
+
+      unsigned int            len,
+                              data_len;
+      __u16                   mac_len,
+                              hdr_len;
+
+         /* ... */
+
+      __be16                  protocol;
+      __u16                   transport_header;
+      __u16                   network_header;
+      __u16                   mac_header;
+
+      /* private: */
+      __u32                   headers_end[0];
+      /* public: */
+
+      /* These elements must be at the end, see alloc_skb() for details.  */
+      sk_buff_data_t          tail;
+      sk_buff_data_t          end;
+      unsigned char           *head,
+                              *data;
+      unsigned int            truesize;
+      refcount_t              users;
+};
+
+
+

where:

+
+
    +
  • next and prev are pointers to the next and previous elements in the +buffer list;
  • +
  • dev is the device which sends or receives the buffer;
  • +
  • sk is the socket associated with the buffer;
  • +
  • destructor is the callback that deallocates the buffer;
  • +
  • transport_header, network_header, and mac_header are offsets +between the beginning of the packet and the beginning of the various headers +in the packets. They are internally maintained by the various processing +layers through which the packet passes. To get pointers to the headers, use +one of the following functions: tcp_hdr(), udp_hdr(), +ip_hdr(), etc. In principle, each protocol provides a function to +get a reference to the header of that protocol within a received packet. +Keep in mind that the network_header field is not set until the packet +reaches the network layer and the transport_header field is not set +until the packet reaches the transport layer.
  • +
+
+

The structure of an IP header +(struct iphdr) has the following fields:

+
struct iphdr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+      __u8    ihl:4,
+              version:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+      __u8    version:4,
+              ihl:4;
+#else
+#error        "Please fix <asm/byteorder.h>"
+#endif
+      __u8    tos;
+      __be16  tot_len;
+      __be16  id;
+      __be16  frag_off;
+      __u8    ttl;
+      __u8    protocol;
+      __sum16 check;
+      __be32  saddr;
+      __be32  daddr;
+      /*The options start here. */
+};
+
+
+

where:

+
+
    +
  • protocol is the transport layer protocol used;
  • +
  • saddr is the source IP address;
  • +
  • daddr is the destination IP address.
  • +
+
+

The structure of a TCP header +(struct tcphdr) has the following fields:

+
struct tcphdr {
+      __be16  source;
+      __be16  dest;
+      __be32  seq;
+      __be32  ack_seq;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+      __u16   res1:4,
+              doff:4,
+              fin:1,
+              syn:1,
+              rst:1,
+              psh:1,
+              ack:1,
+              urg:1,
+              ece:1,
+              cwr:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+      __u16   doff:4,
+              res1:4,
+              cwr:1,
+              ece:1,
+              urg:1,
+              ack:1,
+              psh:1,
+              rst:1,
+              syn:1,
+              fin:1;
+#else
+#error        "Adjust your <asm/byteorder.h> defines"
+#endif
+      __be16  window;
+      __sum16 check;
+      __be16  urg_ptr;
+};
+
+
+

where:

+
+
    +
  • source is the source port;
  • +
  • dest is the destination port;
  • +
  • syn, ack, fin are the TCP flags used; for a more detailed view, +see this diagram.
  • +
+
+

The structure of a UDP header +(struct udphdr) has the following fields:

+
struct udphdr {
+      __be16  source;
+      __be16  dest;
+      __be16  len;
+      __sum16 check;
+};
+
+
+

where:

+
+
    +
  • source is the source port;
  • +
  • dest is the destination port.
  • +
+
+

An example of accessing the information present in the headers of a network +packet is as follows:

+
struct sk_buff *skb;
+
+struct iphdr *iph = ip_hdr(skb);                 /* IP header */
+/* iph->saddr  - source IP address */
+/* iph->daddr  - destination IP address */
+if (iph->protocol == IPPROTO_TCP) {              /* TCP protocol */
+        struct tcphdr *tcph = tcp_hdr(skb);      /* TCP header */
+        /* tcph->source  - source TCP port */
+        /* tcph->dest    - destination TCP port */
+} else if (iph->protocol == IPPROTO_UDP) {       /* UDP protocol */
+        struct udphdr *udph = udp_hdr(skb);      /* UDP header */
+        /* udph->source  - source UDP port */
+        /* udph->dest    - destination UDP port */
+}
+
+
+
+
+
+

Conversions

+

In different systems, there are several ways of ordering bytes in a word +(Endianness), including: Big +Endian (the most +significant byte first) and Little +Endian (the least +significant byte first). Since a network interconnects systems with different +platforms, the Internet has imposed a standard sequence for the storage of +numerical data, called network byte-order. In +contrast, the byte sequence for the representation of numerical data on the host +computer is called host byte-order. Data received/sent from/to the network is in +the network byte-order format and should be converted between this format and +the host byte-order.

+

For converting we use the following macros:

+
+
    +
  • u16 htons(u16 x) converts a 16 bit integer from host byte-order to +network byte-order (host to network short);
  • +
  • u32 htonl(u32 x) converts a 32 bit integer from host byte-order to +network byte-order (host to network long);
  • +
  • u16 ntohs(u16 x) converts a 16 bit integer from network byte-order to +host byte-order (network to host short);
  • +
  • u32 ntohl(u32 x) converts a 32 bit integer from network byte-order to +host byte-order (network to host long).
  • +
+
+
+
+

netfilter

+

Netfilter is the name of the kernel interface for capturing network packets for +modifying/analyzing them (for filtering, NAT, etc.). The netfilter interface is used in user space by iptables.

+

In the Linux kernel, packet capture using netfilter is done by attaching hooks. +Hooks can be specified in different locations in the path followed by a kernel +network packet, as needed. A diagram of the route followed by a +packet and the possible locations for a hook can be found here.

+

The header included when using netfilter is linux/netfilter.h.

+

A hook is defined through the struct nf_hook_ops structure:

+
struct nf_hook_ops {
+      /* User fills in from here down. */
+      nf_hookfn               *hook;
+      struct net_device       *dev;
+      void                    *priv;
+      u_int8_t                pf;
+      unsigned int            hooknum;
+      /* Hooks are ordered in ascending priority. */
+      int                     priority;
+};
+
+
+

where:

+
+
    +
  • pf is the protocol family (PF_INET, etc.);
  • +
  • +
    priority is the priority; priorities are defined in
    +
    uapi/linux/netfilter_ipv4.h as follows:
    +
    +
  • +
+
+
enum nf_ip_hook_priorities {
+      NF_IP_PRI_FIRST = INT_MIN,
+      NF_IP_PRI_CONNTRACK_DEFRAG = -400,
+      NF_IP_PRI_RAW = -300,
+      NF_IP_PRI_SELINUX_FIRST = -225,
+      NF_IP_PRI_CONNTRACK = -200,
+      NF_IP_PRI_MANGLE = -150,
+      NF_IP_PRI_NAT_DST = -100,
+      NF_IP_PRI_FILTER = 0,
+      NF_IP_PRI_SECURITY = 50,
+      NF_IP_PRI_NAT_SRC = 100,
+      NF_IP_PRI_SELINUX_LAST = 225,
+      NF_IP_PRI_CONNTRACK_HELPER = 300,
+      NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
+      NF_IP_PRI_LAST = INT_MAX,
+};
+
+
+

+
+
    +
  • dev is the device (network interface) on which the capture is +intended;
  • +
  • hooknum is the type of hook used. When a packet is captured, the +processing mode is defined by the hooknum and hook fields. For IP, +hook types are defined in linux/netfilter.h:
  • +
+
+
enum nf_inet_hooks {
+      NF_INET_PRE_ROUTING,
+      NF_INET_LOCAL_IN,
+      NF_INET_FORWARD,
+      NF_INET_LOCAL_OUT,
+      NF_INET_POST_ROUTING,
+      NF_INET_NUMHOOKS
+};
+
+
+

+
+
    +
  • hook is the handler called when capturing a network packet (packet sent +as a struct sk_buff structure). The priv field is private information +handed to the handler. The capture handler prototype is defined by the +nf_hookfn type:
  • +
+
+
struct nf_hook_state {
+      unsigned int hook;
+      u_int8_t pf;
+      struct net_device *in;
+      struct net_device *out;
+      struct sock *sk;
+      struct net *net;
+      int (*okfn)(struct net *, struct sock *, struct sk_buff *);
+};
+
+typedef unsigned int nf_hookfn(void *priv,
+                             struct sk_buff *skb,
+                             const struct nf_hook_state *state);
+
+
+

For the nf_hookfn() capture function, the priv parameter is the +private information with which the struct nf_hook_ops was +initialized. skb is the pointer to the captured network packet. Based on +skb information, packet filtering decisions are made. The function's +state parameter is the status information related to the packet capture, +including the input interface, the output interface, the protocol family and the hook +number. The hook number is useful for allowing the same function to +be called by several hooks.

+

A capture handler can return one of the constants NF_*:

+
/* Responses from hook functions. */
+#define NF_DROP 0
+#define NF_ACCEPT 1
+#define NF_STOLEN 2
+#define NF_QUEUE 3
+#define NF_REPEAT 4
+#define NF_STOP 5
+#define NF_MAX_VERDICT NF_STOP
+
+
+

NF_DROP is used to filter (ignore) a packet, and NF_ACCEPT is used to +accept a packet and forward it.

+

Registering/unregistering a hook is done using the functions defined in +linux/netfilter.h:

+
/* Function to register/unregister hook points. */
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops);
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops);
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+                        unsigned int n);
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+                           unsigned int n);
+
+
+
+

Attention

+

Prior to version 3.11-rc2 of the Linux kernel, +there were some restrictions related to the use of header extraction functions +from a struct sk_buff structure passed as a parameter in a netfilter +hook. While the IP header can be obtained each time using ip_hdr(), +the TCP and UDP headers can be obtained with tcp_hdr() and +udp_hdr() only for packets that come from inside the system rather +than the ones that are received from outside the system. In the latter case, +you must manually calculate the header offset in the packet:

+
// For TCP packets (iph->protocol == IPPROTO_TCP)
+tcph = (struct tcphdr*)((__u32*)iph + iph->ihl);
+// For UDP packets (iph->protocol == IPPROTO_UDP)
+udph = (struct udphdr*)((__u32*)iph + iph->ihl);
+
+
+

This code works in all filtering situations, so it's recommended to use it +instead of header access functions.

+
+

A usage example for a netfilter hook is shown below:

+
#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+static unsigned int my_nf_hookfn(void *priv,
+              struct sk_buff *skb,
+              const struct nf_hook_state *state)
+{
+      /* process packet */
+      //...
+
+      return NF_ACCEPT;
+}
+
+static struct nf_hook_ops my_nfho = {
+      .hook        = my_nf_hookfn,
+      .hooknum     = NF_INET_LOCAL_OUT,
+      .pf          = PF_INET,
+      .priority    = NF_IP_PRI_FIRST
+};
+
+int __init my_hook_init(void)
+{
+      return nf_register_net_hook(&init_net, &my_nfho);
+}
+
+void __exit my_hook_exit(void)
+{
+      nf_unregister_net_hook(&init_net, &my_nfho);
+}
+
+module_init(my_hook_init);
+module_exit(my_hook_exit);
+
+
+
+
+

netcat

+

When developing applications that include networking code, one of the most +used tools is netcat, also nicknamed the "Swiss-army knife for TCP/IP". It allows:

+
+
    +
  • Initiating TCP connections;
  • +
  • Waiting for a TCP connection;
  • +
  • Sending and receiving UDP packets;
  • +
  • Displaying traffic in hexdump format;
  • +
  • Running a program after establishing a connection (e.g., a shell);
  • +
  • Setting special options in sent packets.
  • +
+
+

Initiating TCP connections:

+
nc hostname port
+
+
+

Listening to a TCP port:

+
nc -l -p port
+
+
+

Sending and receiving UDP packets is done by adding the -u command line option.

+
+

Note

+

The command is nc; often netcat is an alias for this +command. There are other implementations of the netcat command, some of which +have slightly different parameters than the classic implementation. Run +man nc or nc -h to check how to use it.

+
+

For more information on netcat, check the following tutorial.

+
+ +
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is networking. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/networking/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

Important

+

You need to make sure that the netfilter support is active in kernel. It +is enabled via CONFIG_NETFILTER. To activate it, run make menuconfig in +the linux directory and check the Network packet filtering framework +(Netfilter) option in Networking support -> Networking options. If it +was not enabled, enable it (as builtin, not external module - it must be +marked with *).

+
+
+

1. Displaying packets in kernel space

+

Write a kernel module that displays the source address and port for TCP packets +that initiate an outbound connection. Start from the code in +1-2-netfilter and fill in the areas marked with TODO 1, taking into +account the comments below.

+

You will need to register a netfilter hook of type NF_INET_LOCAL_OUT as explained +in the netfilter section.

+

The struct sk_buff structure lets you access the packet headers using +specific functions. The ip_hdr() function returns the IP header as a +pointer to a struct iphdr structure. The tcp_hdr() function +returns the TCP header as a pointer to a struct tcphdr structure.

+

The diagram explains how to make a TCP connection. The connection initiation +packet has the SYN flag set in the TCP header and the ACK flag cleared.

+
+

Note

+

To display the source IP address, use the %pI4 format of the printk +function. Details can be found in the kernel documentation (IPv4 +addresses section). The following is an example code snippet that uses +%pI4:

+
printk("IP address is %pI4\n", &iph->saddr);
+
+
+

When using the %pI4 format, the argument to printk is a pointer. Hence the +construction &iph->saddr (with operator & - ampersand) instead of +iph->saddr.

+
+

The source TCP port is, in the TCP header, in the network byte-order format. +Read through the Conversions section. Use ntohs() to convert.

+

For testing, use the 1-2-netfilter/user/test-1.sh file. The test creates +a connection to the localhost, a connection that will be intercepted and +displayed by the kernel module. The script is copied on the virtual machine by +the make copy command only if it is marked as executable. The script +uses the statically compiled netcat tool stored in +skels/networking/netcat; this program must have execution +permissions.

+

After running the checker the output should be similar to the one below:

+
# ./test-1.sh
+[  229.783512] TCP connection initiated from 127.0.0.1:44716
+Should show up in filter.
+Check dmesg output.
+
+
+
+
+

2. Filtering by destination address

+

Extend the module from exercise 1 so that you can specify a destination address +by means of a MY_IOCTL_FILTER_ADDRESS ioctl call. You'll only show packets +containing the specified destination address. To solve this task, fill in the +areas marked with TODO 2 and follow the specifications below.

+

To implement the ioctl routine, you must fill out the my_ioctl function. +Review the section in ioctl. The address sent from user space is in +network byte-order, so there will be NO need for conversion.

+
+

Note

+

The IP address sent via ioctl is sent by address, not by value. The +address must be stored in the ioctl_set_addr variable. For copying use +copy_from_user().

+
+

To compare the addresses, fill out the test_daddr function. Addresses in +network byte-order will be used without having to convert addresses (if they +are equal from left to right they will be equal if reversed too).

+

The test_daddr function must be called from the netfilter hook to display +the connection initialization packets for which the destination address is the +one sent through the ioctl routine. The connection initiation packet has the +SYN flag set in the TCP header and the ACK flag cleared. You have to +check two things:

+
+
    +
  • the TCP flags;
  • +
  • the destination address of the packet (using test_daddr).
  • +
+
+

For testing, use the 1-2-netfilter/user/test-2.sh script. This script +needs to compile the 1-2-netfilter/user/test.c file in the test +executable. Compilation is done automatically on the physical system when +running the make build command. The test script is copied to the +virtual machine only if it is marked as executable. The script uses the +statically compiled netcat tool in skels/networking/netcat; +this executable must have execution permissions.

+

After running the checker the output should be similar to the one below:

+
# ./test-2.sh
+[  797.673535] TCP connection initiated from 127.0.0.1:44721
+Should show up in filter.
+Should NOT show up in filter.
+Check dmesg output.
+
+
+

The test asks for packet filtering first for the 127.0.0.1 IP address and +then for the 127.0.0.2 IP address. The first connection initiation packet +(to 127.0.0.1) is intercepted and displayed by the filter, while the second +(to 127.0.0.2) is not intercepted.

+
+
+

3. Listening on a TCP socket

+

Write a kernel module that creates a TCP socket that listens to connections on +port 60000 on the loopback interface (in init_module). Start from the +code in 3-4-tcp-sock and fill in the areas marked with TODO 1, taking +into account the observations below.

+

Read the Operations on the socket structure and The struct proto_ops +structure sections.

+

The sock socket is a server socket and must be put in the listening +state. That is, the bind and listen operations must be applied to the +socket. For the bind and listen equivalent, in kernel space you will +need to call sock->ops->...; examples of such functions you can call are +sock->ops->bind, sock->ops->listen etc.

+
+

Note

+

For examples of how to call the sock->ops->bind and sock->ops->listen functions, see +how they are called in the sys_bind() and sys_listen() system +call handlers.

+

Look for the system call handlers in the net/socket.c file in the Linux +kernel source code tree.

+
+
+

Note

+

For the second argument of the listen (backlog) call, use the +LISTEN_BACKLOG.

+
+

Remember to release the socket in the module's exit function and in the area +marked with error labels; use sock_release().

+

For testing, run the 3-4-tcp_sock/test-3.sh script. The script is +copied on the virtual machine by make copy only if it is marked as +executable.

+

After running the test, a TCP socket listening for +connections on port 60000 will be displayed.

+
+
+

4. Accepting connections in kernel space

+

Expand the module from the previous exercise to allow an external connection (no +need to send any message, only accept new connections). Fill in the areas marked +with TODO 2.

+

Read the Operations on the socket structure and The struct proto_ops +structure sections.

+

For the kernel space accept equivalent, see the system call handler for +sys_accept4(). Follow the lnet_sock_accept +implementation, and how the sock->ops->accept call is used. Use 0 as +the value for the second to last argument (flags), and true for the +last argument (kern).

+
+

Note

+

Look for the system call handlers in the net/socket.c file in the Linux +kernel source code tree.

+
+
+

Note

+

The new socket (new_sock) must be created with the +sock_create_lite() function and then its operations must be configured +using

+
new_sock->ops = sock->ops;
+
+
+
+

Print the address and port of the destination socket. To find the peer name of a +socket (its address), refer to the sys_getpeername() system call handler.

+
+

Note

+

The first argument for the sock->ops->getname function will be the +connection socket, i.e. new_sock, the one initialized by the accept +call.

+

The last argument of the sock->ops->getname function will be 1, +meaning that we want to know about the peer (the remote end of the +connection).

+

Display the peer address (indicated by the raddr variable) using the +print_sock_address macro defined in the file.

+
+

Release the newly created socket (after accepting the connection) in the module +exit function and after the error label. After adding the accept code to the +module initialization function, the insmod operation will block until +a connection is established. You can unblock it using netcat on that +port. Consequently, the test script from the previous exercise will not work.

+

For testing, run the 3-4-tcp_sock/test-4.sh script. The script is copied on +the virtual machine by make copy only if it is marked as executable.

+

Nothing special will be displayed (in the kernel buffer). The success of the +test will be defined by the connection establishment. Then use Ctrl+c to +stop the test script, and then you can remove the kernel module.

+
+
+

5. UDP socket sender

+

Write a kernel module that creates a UDP socket and sends the message from the +MY_TEST_MESSAGE macro on the socket to the loopback address on port +60001.

+

Start from the code in 5-udp-sock.

+

Read the Operations on the socket structure and The struct proto_ops +structure sections.

+

To see how to send messages in the kernel space, see the sys_send() +system call handler or Sending/receiving messages.

+
+

Hint

+

The msg_name field of the struct msghdr structure must be +initialized to the destination address (pointer to struct sockaddr) +and the msg_namelen field to the address size.

+

Initialize the msg_flags field of the struct msghdr structure +to 0.

+

Initialize the msg_control and msg_controllen fields of the +struct msghdr structure to NULL and 0 respectively.

+
+

For sending the message use kernel_sendmsg().

+

The message transmission parameters are retrieved from the kernel space. Cast +the struct iovec structure pointer to a struct kvec pointer +in the kernel_sendmsg() call.

+
+

Hint

+

The last two parameters of kernel_sendmsg() are 1 (number of I/O +vectors) and len (message size).

+
+

For testing, use the test-5.sh file. The script is copied on the virtual +machine by the make copy command only if it is marked as executable. +The script uses the statically compiled netcat tool stored in +skels/networking/netcat; this executable must have execution +permissions.

+

For a correct implementation, running the test-5.sh script will cause +the kernelsocket message to be displayed like in the output below:

+
/root # ./test-5.sh
++ pid=1059
++ sleep 1
++ nc -l -u -p 60001
++ insmod udp_sock.ko
+kernelsocket
++ rmmod udp_sock
++ kill 1059
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab11-arm-kernel-development.html b/refs/pull/405/merge/so2/lab11-arm-kernel-development.html new file mode 100644 index 00000000..da649963 --- /dev/null +++ b/refs/pull/405/merge/so2/lab11-arm-kernel-development.html @@ -0,0 +1,621 @@ + + + + + + SO2 Lab 11 - Kernel Development on ARM — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 11 - Kernel Development on ARM

+
+

Lab objectives

+
    +
  • get a feeling of what System on a Chip (SoC) means
  • +
  • get familiar with embedded world using ARM as a supported architecture
  • +
  • understand what a Board Support Package means (BSP)
  • +
  • compile and boot an ARM kernel with Qemu using i.MX6UL platform as an example
  • +
  • get familiar with hardware description using Device Trees
  • +
+
+
+

System on a Chip

+

A System on a Chip (SoC) is an integrated circuit (IC) that integrates an entire system onto it. The components +that can be usually found on an SoC include a central processing unit (CPU), memory, input/output ports, storage devices +together with more sophisticated modules like audio digital interfaces, neural processing units (NPU) or graphical +processing units (GPU).

+
+
SoCs can be used in various applications; the most common are:
+
    +
  • consumer electronics (TV sets, mobile phones, video game consoles)
  • +
  • industrial computers (medical imaging, etc)
  • +
  • automotive
  • +
  • home appliances
  • +
+
+
+

The leading architecture for SoCs is ARM. Worth mentioning here is that there are also x86-based SoC platforms. Another thing +we need to keep an eye on is RISC-V, an open standard instruction set architecture.

+

A simplified view of an ARM platform is shown in the image below:

+../_images/schematic1.png +

We will use NXP's i.MX6UL as our reference platform, but in general all SoCs contain the following building blocks:

+
+
+
    +
  • one or more CPU cores
  • +
  • a system bus
  • +
  • clock and reset module
      +
    • PLL
    • +
    • OSC
    • +
    • reset controller
    • +
    +
  • +
+
+
    +
  • interrupt controller
  • +
  • timers
  • +
  • memory controller
  • +
  • peripheral controllers +
  • +
+
+

Here is the complete block diagram for i.MX6UL platform:

+IMX6UL-BD +

i.MX6UL Evaluation Kit board looks like this:

+imx6ul-evk +

Other popular SoC boards:

+
+
+
+
+

Board Support package

+

A board support package (BSP) is the minimal set of software packages that allows demonstrating the capabilities of a certain hardware platform. This includes:

+
+
    +
  • toolchain
  • +
  • bootloader
  • +
  • Linux kernel image, device tree files and drivers
  • +
  • root filesystem
  • +
+
+

Semiconductor manufacturers usually provide a BSP together with an evaluation board. The BSP is typically bundled using Yocto.

+
+
+

Toolchain

+

Because our development machines are mostly x86-based we need a cross compiler that can produce executable +code for ARM platform.

+

We can build our own cross compiler from scratch using https://crosstool-ng.github.io/ or we can install one:

+
$ sudo apt-get install gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf # for arm32
+$ sudo apt-get install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu     # for arm64
+
+
+

There are several toolchain binaries, depending on the configuration:

+
+
    +
  • With "arm-eabi-gcc" you have the Linux system C library which will make calls into the kernel IOCTLs, e.g. for allocating memory pages to the process.
  • +
  • With "arm-none-eabi-gcc" you are running on a platform which doesn't have an operating system at all - so the C library is different to cope with that.
  • +
+
+
+

Compiling the Linux kernel on ARM

+

Compile the kernel for 32bit ARM boards:

+
# select defconfig based on your platform
+$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make imx_v6_v7_defconfig
+# compile the kernel
+$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8
+
+
+

Compile the kernel for 64bit ARM boards:

+
# for 64bit ARM there is a single config for all supported boards
+$ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make defconfig
+# compile the kernel
+$ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make -j8
+
+
+
+
+
+

Linux kernel image

+

The kernel image binary is named vmlinux and it can be found in the root of the kernel tree. Compressed image used for booting can be found under:

+
    +
  • arch/arm/boot/Image, for arm32
  • +
  • arch/arm64/boot/Image, for arm64
  • +
+
$ file vmlinux
+  vmlinux: ELF 32-bit LSB executable, ARM, EABI5 version 1 (SYSV), statically linked, not stripped
+
+$ file vmlinux
+  vmlinux: ELF 64-bit LSB shared object, ARM aarch64, version 1 (SYSV), statically linked, not stripped
+
+
+
+
+

Rootfs

+

The root filesystem (rootfs) is the filesystem mounted at the top of the file hierarchy (/). It should contain at least +the critical files allowing the system to boot to a shell.

+
root@so2$ tree -d -L 2
+├── bin
+├── boot
+├── dev
+├── etc
+├── home
+│   └── root
+├── lib
+│   └── udev
+├── mnt
+├── proc
+├── sbin
+│   └── init
+├── sys
+├── usr
+│   ├── bin
+│   ├── include
+│   ├── lib
+└── var
+
+
+

As for x86 we will make use of Yocto rootfs images. In order to download an ext4 rootfs image for arm32 one needs to run:

+
$ cd tools/labs/
+$ ARCH=arm make core-image-minimal-qemuarm.ext4
+
+
+
+
+

Device tree

+

Device tree (DT) is a tree structure used to describe the hardware devices in a system. Each node in the tree describes a device, hence it is called a device node. DT was introduced +to provide a way to discover non-discoverable hardware (e.g. a device on an I2C bus). This information was previously stored inside the source code for the Linux kernel. This meant that +each time we needed to modify a node for a device the kernel needed to be recompiled. This no longer holds true as device tree and kernel image are separate binaries now.

+

Device trees are stored inside device tree sources (.dts) and compiled into device tree blobs (.dtb).

+
# compile dtbs
+$ make dtbs
+
+# location for DT sources on arm32
+$ ls arch/arm/boot/dts/
+  imx6ul-14x14-evk.dtb imx6ull-14x14-evk.dtb bcm2835-rpi-a-plus.dts
+
+# location for DT source on arm64
+$ ls arch/arm64/boot/dts/<vendor>
+  imx8mm-evk.dts imx8mp-evk.dts
+
+
+

The following image is a representation of a simple device tree, describing board type, cpu and memory.

+../_images/dts_node1.png +

Notice that a device tree node can be defined using label: name@address:

+
+
    +
  • label, is an identifier used to reference the node from other places
  • +
  • name, node identifier
  • +
  • address, used to differentiate nodes with the same name.
  • +
+
+

A node might contain several properties arranged in the name = value format. The name is a string +and the value can be bytes, strings, or arrays of strings.

+

Here is an example:

+
/ {
+     node@0 {
+          empty-property;
+          string-property = "string value";
+          string-list-property = "string value 1", "string value 2";
+          int-list-property = <value1 value2>;
+
+          child-node@0 {
+                  child-empty-property;
+                  child-string-property = "string value";
+                  child-node-reference = <&child-node1>;
+          };
+
+          child-node1: child-node@1 {
+                  child-empty-property;
+                  child-string-property = "string value";
+          };
+   };
+};
+
+
+
+
+

Qemu

+

We will use qemu-system-arm to boot 32bit ARM platforms. It can be installed from official distro repos, for example:

+
sudo apt-get install -y qemu-system-arm
+
+
+

We strongly recommend using the latest version of qemu-system-arm, built from sources:

+
$ git clone https://gitlab.com/qemu-project/qemu.git
+$ cd qemu
+$ ./configure --target-list=arm-softmmu --disable-docs
+$ make -j8
+$ ./build/qemu-system-arm
+
+
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is arm_kernel_development. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/arm_kernel_development/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

Warning

+

The rules for working with the virtual machine for ARM are modified as follows

+
# modules build
+tools/labs $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make build
+# modules copy
+tools/labs $ ARCH=arm make copy
+# kernel build
+$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8
+
+
+
+
+

0. Intro

+

Inspect the following locations in the Linux kernel code and identify platforms and vendors using +ARM architecture:

+
+
    +
  • 32-bit: arch/arm/boot/dts
  • +
  • 64-bit: arch/arm64/boot/dts
  • +
+
+

Use qemu and look at the supported platforms:

+
../qemu/build/arm-softmmu/qemu-system-arm -M ?
+
+
+
+

Note

+

We used our own compiled version of Qemu for arm32. See Qemu section for more details.

+
+
+
+

1. Boot

+

Use qemu to boot i.MX6UL platform. In order to boot, we first need to compile the kernel. +Review Compiling the Linux kernel on ARM section.

+

Successful compilation will result in the following binaries:

+
+
    +
  • arch/arm/boot/Image, kernel image compiled for ARM
  • +
  • arch/arm/boot/dts/imx6ul-14x14-evk.dtb, device tree blob for i.MX6UL board
  • +
+
+

Review Rootfs section and download core-image-minimal-qemuarm.ext4 rootfs. +Run qemu using the following command:

+
../qemu/build/arm-softmmu/qemu-system-arm -M mcimx6ul-evk -cpu cortex-a7 -m 512M \
+  -kernel arch/arm/boot/zImage -nographic  -dtb arch/arm/boot/dts/imx6ul-14x14-evk.dtb \
+  -append "root=/dev/mmcblk0 rw console=ttymxc0 loglevel=8 earlycon printk" -sd tools/labs/core-image-minimal-qemuarm.ext4
+
+
+
+

Note

+

LCDIF and ASRC devices are not well supported with Qemu. Remove them from compilation.

+
+
$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make menuconfig
+# set FSL_ASRC=n and DRM_MXSFB=n
+$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8
+
+
+

Once the kernel is booted check kernel version and cpu info:

+
$ cat /proc/cpuinfo
+$ cat /proc/version
+
+
+
+
+

2. CPU information

+

Inspect the CPU configuration for NXP i.MX6UL board. Start with arch/arm/boot/dts/imx6ul-14x14-evk.dts.

+
+
    +
  • find cpu@0 device tree node and look for operating-points property.
  • +
  • read the maximum and minimum operating frequency the processor can run
  • +
+
+
$ cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq
+$ cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq
+
+
+
+
+
+
+

3. I/O memory

+

Inspect I/O space configuration for NXP i.MX6UL board. Start with arch/arm/boot/dts/imx6ul-14x14-evk.dts and identify each device mentioned below.

+
$ cat /proc/iomem
+  00900000-0091ffff : 900000.sram sram@900000
+  0209c000-0209ffff : 209c000.gpio gpio@209c000
+  021a0000-021a3fff : 21a0000.i2c i2c@21a0000
+  80000000-9fffffff : System RAM
+
+
+

Identify device tree nodes corresponding to:

+
+
    +
  • System RAM, look for memory@80000000 node in arch/arm/boot/dts/imx6ul-14x14-evk.dtsi. What's the size of the System RAM?
  • +
  • GPIO1, look for gpio@209c000 node in arch/arm/boot/dts/imx6ul.dtsi. What's the size of the I/O space for this device?
  • +
  • I2C1, look for i2c@21a0000 node in arch/arm/boot/dts/imx6ul.dtsi. What's the size of the I/O spaces for this device?
  • +
+
+
+
+

4. Hello World

+

Implement a simple kernel module that prints a message at load/unload time. Compile it and load it on i.MX6UL emulated platform.

+
# modules build
+tools/labs $ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make build
+# modules copy
+tools/labs $ ARCH=arm make copy
+# kernel build
+$ ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- make -j8
+
+
+
+
+

5. Simple device

+

Implement a driver for a simple platform device. Find TODO 1 and notice how simple_driver is declared and registered as a platform driver. +Follow TODO 2 and add the so2,simple-device-v1 and so2,simple-device-v2 compatible strings in the simple_device_ids array.

+

Create two device tree nodes in arch/arm/boot/dts/imx6ul.dtsi under soc node with compatible strings so2,simple-device-v1 and +so2,simple-device-v2 respectively. Then notice the behavior when loading simple_driver module.

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab12-kernel-profiling.html b/refs/pull/405/merge/so2/lab12-kernel-profiling.html new file mode 100644 index 00000000..ee02cd06 --- /dev/null +++ b/refs/pull/405/merge/so2/lab12-kernel-profiling.html @@ -0,0 +1,680 @@ + + + + + + SO2 Lab 12 - Kernel Profiling — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 12 - Kernel Profiling

+
+

Lab Objectives

+
+
    +
  • Familiarize yourself with the basics of Linux kernel profiling
  • +
  • Understanding basic profiling tools
  • +
  • Learning profiling methodologies and good practices
  • +
+
+
+
+

Overview

+

Up until now we have studied how the different components of the Linux kernel +work, and how to write drivers that interface with them in order to provide +support for devices or protocols. This has helped us understand how the Linux +kernel works, but most people will not get to write kernel drivers.

+

Nonetheless, the skills learned will help us to write applications that better +integrate with the whole operating system. In order to do this, one has to have +a good view of both the user space and the kernel space.

+

This session aims to merge the work we have done up until now in the kernel +space with real world use cases where we do not write kernel space code, but we +look through the kernel using profiling tools, in order to debug issues that +we're having when writing regular, low-level, applications.

+

Another focus of this session will be learning a general methodology for +debugging software issues, and we will approach some tools that give us insight +from the kernel on the way our application runs.

+
+
+

Profiling Tools

+

The main tool that we will focus our attention on is perf, which offers +support for tracing applications, and also inspecting general aspects of the +system. We will also be using debugging tools that most people have used in +their day to day life, such as htop, ps, lsof and others.

+
+

perf

+

perf is a tool that instruments the CPU using +tracepoints, kprobes and uprobes. This tool allows us to take a look at what +functions are being called at a given point. This allows us to take a peek at +where the kernel is spending the most time, print out call stacks of functions, +and in general log what the CPU is running.

+

perf integrates modules such as +static tracing, +dynamic tracing, and +resource monitoring.

+

The tracing interface that is offered by perf can be used by itself, using the +perf command together with its subcommands.

+
root@qemux86:~# ./skels/kernel_profiling/perf
+
+ usage: perf [--version] [--help] [OPTIONS] COMMAND [ARGS]
+
+ The most commonly used perf commands are:
+   annotate        Read perf.data (created by perf record) and display annotated code
+   archive         Create archive with object files with build-ids found in perf.data file
+   bench           General framework for benchmark suites
+   buildid-cache   Manage build-id cache.
+   buildid-list    List the buildids in a perf.data file
+   c2c             Shared Data C2C/HITM Analyzer.
+   config          Get and set variables in a configuration file.
+   data            Data file related processing
+   diff            Read perf.data files and display the differential profile
+   evlist          List the event names in a perf.data file
+   ftrace          simple wrapper for kernel's ftrace functionality
+   inject          Filter to augment the events stream with additional information
+   kallsyms        Searches running kernel for symbols
+   kmem            Tool to trace/measure kernel memory properties
+   kvm             Tool to trace/measure kvm guest os
+   list            List all symbolic event types
+   lock            Analyze lock events
+   mem             Profile memory accesses
+   record          Run a command and record its profile into perf.data
+   report          Read perf.data (created by perf record) and display the profile
+   sched           Tool to trace/measure scheduler properties (latencies)
+   script          Read perf.data (created by perf record) and display trace output
+   stat            Run a command and gather performance counter statistics
+   test            Runs sanity tests.
+   timechart       Tool to visualize total system behavior during a workload
+   top             System profiling tool.
+   version         display the version of perf binary
+   probe           Define new dynamic tracepoints
+
+ See 'perf help COMMAND' for more information on a specific command.
+
+
+

In the output above we can see all of perf's subcommands together with a +description of their functionality, the most significant of which are:

+
    +
  • stat - displays statistics such as the number of context switches and page +faults;
  • +
  • top - an interactive interface where we can inspect the most frequent +function calls and their caller. This interface allows us direct feedback +while profiling;
  • +
  • list - lists the static trace points that we can instrument inside the +kernel. These are useful when trying to get an insight from inside the kernel;
  • +
  • probe - add a dynamic trace point that instruments a function call in +order to be recorded by perf;
  • +
  • record - records function calls and stack traces based on tracing points +defined by the user; It can also record specific function calls and their +stack traces. The record is saved in a file, named perf.data by default;
  • +
  • report - displays the information saved in a perf recording.
  • +
+

Another way to use perf's interface is through scripts that wrap over perf that +offer a higher level way of looking at events or data, without needing to know +the intricacies of the command. An example of this is the iosnoop.sh script, +which displays what I/O transfers are taking place.

+
+
+

ps

+

ps is the Linux tool that allows us to monitor the processes that are +running at a given time on the machine, including the kernel threads. This is a +simple and easy to use way of checking at a glance what processes are running on +the CPU, and what is their CPU and memory usage.

+

In order to list all the processes running, we use the ps aux command in the +following way:

+
TODO
+root@qemux86:~/skels/kernel_profiling/0-demo# cd
+ root@qemux86:~# ps aux
+ USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
+ root         1  0.0  0.5   2004  1256 ?        Ss   12:06   0:12 init [5]
+ root         2  0.0  0.0      0     0 ?        S    12:06   0:00 [kthreadd]
+ [...]
+ root       350  4.5  4.4  11132 10688 hvc0     T    12:07  17:21 ./io-app
+ root      1358  0.0  0.0      0     0 ?        I    14:30   0:00 [kworker/u2:1-e
+ root      2293  0.1  1.5   5516  3704 ?        Ss   18:18   0:00 sshd: root@pts/
+ root      2295  0.0  1.3   3968  3232 pts/0    Ss+  18:19   0:00 -sh
+ root      2307  0.0  0.0      0     0 ?        I    18:19   0:00 [kworker/u2:2-e
+ root      2350  0.0  0.7   3032  1792 hvc0     R+   18:26   0:00 ps aux
+ root      2392  2.6  0.0      0     0 ?        D    18:31   0:00 test-script
+
+
+

One piece of information of note is that the 8th column (STAT) represents the state of the +process, S meaning suspended, D suspended due to I/O, and R meaning +running.

+
+
+

time

+

The time command allows us to inspect the amount of time spent by a +process in I/O, running the application code, or running code in kernel space. +This can be useful in order to find out whether an application's issue comes +from running too much in kernel space, so it has some overhead when it does +system calls, or the issue is in the user code.

+
root@qemux86:~# time dd if=/dev/urandom of=./test-file bs=1K count=10
+10+0 records in
+10+0 records out
+10240 bytes (10 kB, 10 KiB) copied, 0.00299749 s, 3.4 MB/s
+
+real        0m0.020s
+user        0m0.001s
+sys 0m0.015s
+
+
+

In the output above we timed the generation of a file using dd. The result +of the timing is displayed at the bottom of output. The values outputted by the +tool are the following:

+
    +
  • real - the amount of time that has passed from the start of the application to +its finishing;
  • +
  • user - time spent running the dd code;
  • +
  • sys - time spent running kernel code on behalf of the process.
  • +
+

We see that the sum of the user and sys values doesn't add up to the +real value. This happens either when the application runs on multiple cores, +in which case the sum might be higher, or the application sleeps, in which case +the sum is lower.

+
+
+

top

+

top is an application that is found on most systems which lists in real time +the applications that are running on the system. top runs interactively, and +it auto-refreshes its output, as opposed to ps. We use this tool when we +want a high level of continuous monitoring.

+
+
+
+

Profiling Methodology

+

When doing profiling, our goal is to identify the cause of a problem. Usually +this problem is observed by someone when their application doesn't work as +expected. When we say that an application did not work as expected, this can +mean different things for different people. For example, one person might +complain that the application has a slowdown, while another might say that the +application runs on the CPU, but it doesn't output anything.

+

The first step in any problem solving context is to understand the default +behaviour of the application we're trying to debug, and to make sure that it is +currently not running within the expected parameters.

+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is kernel_profiling. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/kernel_profiling/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

Note

+

This session will require us to use the perf tracing tool. When running +natively on our systems, we have to install the +linux-tools-<version>-generic package using a package manager in order +to run it. Because in our virtual machine we don't have access to a package +manager, we will be downloading the perf binary from this link. Download the application in +the skels/kernel_profiling directory, and grant it execution +permissions.

+
+
+

Warning

+

When running perf, make sure that you're running the downloaded version, +not the version in the PATH variable.

+
+
+

Note

+

When going through this session's exercises, we will have to run commands in +parallel. In order to do this, we will have to connect to the virtual machine +using SSH. We recommend using the core-image-sato-sdk-qemu image, since it +has the tools that we need. To run the virtual machine using the +core-image-sato-sdk-qemu file system, uncomment line 16 in the +qemu/Makefile file.

+
+
+

Note

+

If you wish to run the perf-tools based scripts that we have included in +the repository, such as iosnoop.sh, you will have to grant it execution +privileges, in order to be copied to the virtual machine file system.

+
+
+

Note

+

In order to improve the course of SO2, its components and the way it is +conducted, your opinions are very useful to us. Please fill the feedback form +on curs.upb.ro platform.

+

The form is anonymous and is active between May 22 and June 2, 2023. The +results will be visible to the SO2 team after all the grades have been +marked.

+

We invite you to evaluate the activity of the SO2 team and specify its +strengths and weaknesses and your suggestions for improving the subject. +Your feedback is very important to us to increase the quality of the subject +in the coming years.

+

We are particularly interested in:

+
+
    +
  • What did you not like and what do you think did not go well?
  • +
  • Why didn't you like it and why do you think it didn't go well?
  • +
  • What should we do to make things better?
  • +
+
+
+
+
+

0. Demo: Profiling I/O Problems

+

When working with I/O, we have to keep in mind that it is one of the slowest +systems in the operating system, compared to memory, which is an order of +magnitude faster, and scheduling, which deals with what is currently running on +the CPU.

+

Because of this, I/O operations have to be thought out, because you might starve +your application by saturating the system with requests. Another issue that you +might face is that the I/O's slow speed might affect your application's +responsiveness, if it waits for the I/O operations to finish.

+

Let's take a look at an application and debug its issues.

+

We are going to run the io-app application, from the 0-demo directory.

+

In order to inspect what is running on the CPU, and look at the stack of the +process, we can use the perf record subcommand in the following way:

+
root@qemux86:~# ./perf record -a -g
+Couldn't synthesize bpf events.
+^C[ perf record: Woken up 7 times to write data ]
+[ perf record: Captured and wrote 1.724 MB perf.data (8376 samples) ]
+
+
+

perf will record values indefinitely, but we can close it using the Ctrl+c +hotkey. We used the -a option in order to probe all CPUs, and the -g option, +which records the whole call stack.

+

To visualize the recorded information, we will use the perf report command, +which will bring up a pager which will display the most frequent function calls +that were found on the CPU, and their call stack.

+
root@qemux86:~# ./perf report --header -F overhead,comm,parent
+# Total Lost Samples: 0
+#
+# Samples: 8K of event 'cpu-clock:pppH'
+# Event count (approx.): 2094000000
+#
+# Overhead  Command          Parent symbol
+# ........  ...............  .............
+#
+    58.63%  io-app           [other]
+            |
+             --58.62%--__libc_start_main
+                       main
+                       __kernel_vsyscall
+                       |
+                        --58.61%--__irqentry_text_end
+                                  do_SYSENTER_32
+                                  do_fast_syscall_32
+                                  __noinstr_text_start
+                                  __ia32_sys_write
+                                  ksys_write
+                                  vfs_write
+                                  |
+                                   --58.60%--ext4_file_write_iter
+                                             ext4_buffered_write_iter
+[...]
+
+
+

We have used the --header option in order to print the table header, and -F +overhead,comm,parent, in order to print the percentage of time spent in the call +stack, the command and the caller.

+

We can see that the io-app command is doing some writes in the file system, +and this contributes to much of the load on the system.

+

Armed with this information, we know that there are many I/O calls being done by +the application. In order to look at the size of these requests, we can use the +iosnoop.sh script in order to see how big these requests are.

+
root@qemux86:~/skels/kernel_profiling# ./iosnoop.sh 1
+Tracing block I/O. Ctrl-C to end.
+COMM         PID    TYPE DEV      BLOCK        BYTES     LATms
+io-app       889    WS   254,0    4800512      1310720     2.10
+io-app       889    WS   254,0    4803072      1310720     2.04
+io-app       889    WS   254,0    4805632      1310720     2.03
+io-app       889    WS   254,0    4808192      1310720     2.43
+io-app       889    WS   254,0    4810752      1310720     3.48
+io-app       889    WS   254,0    4813312      1310720     3.46
+io-app       889    WS   254,0    4815872      524288     1.03
+io-app       889    WS   254,0    5029888      1310720     5.82
+io-app       889    WS   254,0    5032448      786432     5.80
+jbd2/vda-43  43     WS   254,0    2702392      8192       0.22
+kworker/0:1H 34     WS   254,0    2702408      4096       0.40
+io-app       889    WS   254,0    4800512      1310720     2.60
+io-app       889    WS   254,0    4803072      1310720     2.58
+[...]
+
+
+

From this output we see that the io-app is writing in a loop from the fact +that the first block 4800512 is repeating, and that it is doing big writes, +since it is writing more than one megabyte per request. This constant looping adds the +load to the system that we're experiencing.

+
+

1. Investigating Reduced Responsiveness

+

The io.ko module, located in the kernel_profiling/1-io directory, +decreases the system's responsiveness when inserted. We see that the command +line stutters when typing commands, but when running top, we see that the +system's load is not high, and there aren't any processes that are hogging +resources.

+

Find out what the io.ko module is doing and why it is leading to the +stuttering effect that we experience.

+
+

Hint

+

Trace all the functions being called and check where the CPU is +spending most of its time. In order to do this, you can run either perf +record and perf report to view the output, or perf top.

+
+
+
+

2. Launching New Threads

+

We want to run the same function in a loop 100 times in parallel. We have +implemented two solutions inside the scheduling binary file, located in the +kernel_profiling/2-scheduling directory.

+

When executing the scheduling binary, it prints a message in parallel from +100 running instances. We can tune this execution by running the application +either with the first parameter 0 or 1.

+

Find out which solution is better, and why.

+
+
+

3. Tuning cp

+

Our goal is to write a copy of the cp tool integrated in Linux, which has +been implemented by the memory binary, in the kernel_profiling/3-memory +directory. It implements two approaches that we can take for the copy operation:

+
    +
  • reading the contents of the source file in a buffer in memory using the +read() system call, and writing that buffer to the destination file using +the write() system call;
  • +
  • mapping the source and destination files to memory using the mmap system +call, and copying the contents of the source file to the destination in +memory.
  • +
+

Another tunable parameter that we're going to use is the block size of the copies +that we're going to make, either through reads/writes or in memory.

+

1) Investigate which of the two copying mechanisms is faster. For this step, you +will use the 1024 block size.

+

2) Once you have found which copying mechanism is faster, change the block size +parameter and see which value gives you the best copies. Why?

+
+
+

4. I/O Latency

+

We have written a module that reads the content of a disk. When inserting the bio.ko +module, located in the 4-bio directory, we see a large spike in the system's +load, as can be seen in the top command, but we see that the system is still +responsive.

+

Investigate what is causing the increased load to the system. Is it an I/O issue, +or is it a scheduling issue?

+
+

Hint

+

Try to trace the I/O operations using perf, or use the +iosnoop.sh script in order to inspect what I/O is happening at a +certain point.

+
+
+
+

5. Bad ELF

+
+

Note

+

This is a bonus exercise that has been tested on a native Linux system. +It may run under the QEMU virtual machine, but the behavior was weird in our testing. +We recommend you use a native (or VirtualBox or VMware) Linux system.

+
+

We managed to build (as part of a Unikraft build) an ELF file that is valid when doing static analysis, but that can't be executed. +The file is bad_elf, located in the 5-bad-elf/ folder.

+

Running it triggers a segmentation fault message. +Running it using strace shows an error with execve().

+
... skels/kernel_profiling/5-bad-elf$ ./bad_elf
+Segmentation fault
+
+... skels/kernel_profiling/5-bad-elf$ strace ./bad_elf
+execve("./bad_elf", ["./bad_elf"], 0x7ffc3349ba50 /* 70 vars \*/) = -1 EINVAL (Invalid argument)
+--- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=NULL} ---
++++ killed by SIGSEGV +++
+Segmentation fault (core dumped)
+
+
+

The ELF file itself is valid:

+
... skels/kernel_profiling/5-bad-elf$ readelf -a bad_elf
+
+
+

The issue is to be detected in the kernel.

+

Use either perf, or, better yet ftrace to inspect the kernel function calls done by the program. +Identify the function call that sends out the SIGSEGV signal. +Identify the cause of the issue. +Find that cause in the manual page elf(5).

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab2-kernel-api.html b/refs/pull/405/merge/so2/lab2-kernel-api.html new file mode 100644 index 00000000..7a5f2dcf --- /dev/null +++ b/refs/pull/405/merge/so2/lab2-kernel-api.html @@ -0,0 +1,1125 @@ + + + + + + SO2 Lab 02 - Kernel API — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 02 - Kernel API

+
+

Lab objectives

+
+
    +
  • Familiarize yourself with the basic Linux kernel API
  • +
  • Description of memory allocation mechanisms
  • +
  • Description of locking mechanisms
  • +
+
+
+
+

Overview

+

Inside the current lab we present a set of concepts and basic functions required +for starting Linux kernel programming. It is important to note that kernel +programming differs greatly from user space programming. The kernel is a +stand-alone entity that can not use libraries in user-space (not even libc). +As a result, the usual user-space functions (printf, malloc, free, open, read, +write, memcpy, strcpy, etc.) can no longer be used. In conclusion, kernel +programming is based on a totally new and independent API that is unrelated to +the user-space API, whether we refer to POSIX or ANSI C (standard C language +library functions).

+
+
+

Accessing memory

+

An important difference in kernel programming is how to access and allocate +memory. Due to the fact that kernel programming is very close to the physical +machine, there are important rules for memory management. First, it works with +several types of memory:

+
+
    +
  • Physical memory
  • +
  • Virtual memory from the kernel address space
  • +
  • Virtual memory from a process's address space
  • +
  • Resident memory - we know for sure that the accessed pages are present in +physical memory
  • +
+
+

Virtual memory in a process's address space can not be considered resident due +to the virtual memory mechanisms implemented by the operating system: pages may +be swapped or simply may not be present in physical memory as a result of the +demand paging mechanism. The memory in the kernel address space can be resident +or not. Both the data and code segments of a module and the kernel stack of a +process are resident. Dynamic memory may or may not be resident, depending on +how it is allocated.

+

When working with resident memory, things are simple: memory can be accessed at +any time. But if working with non-resident memory, then it can only be accessed +from certain contexts. Non-resident memory can only be accessed from the +process context. Accessing non-resident memory from the context of an +interrupt has unpredictable results and, therefore, when the operating +system detects such access, it will take drastic measures: blocking or +resetting the system to prevent serious corruption.

+

The virtual memory of a process can not be accessed directly from the kernel. +In general, it is totally discouraged to access the address space of a process, +but there are situations where a device driver needs to do it. The typical case +is where the device driver needs to access a buffer from the user-space. In +this case, the device driver must use special features and not directly access +the buffer. This is necessary to prevent access to invalid memory areas.

+

Another difference from user-space programming, relative to memory, is due to +the stack, a stack whose size is fixed and limited. A stack of 4K is used in +Linux, and a stack of 12K is used in Windows. For this reason, the +allocation of large structures on stack or the use of recursive calls should +be avoided.

+
+
+

Contexts of execution

+

In relation to kernel execution, we distinguish two contexts: process context +and interrupt context. We are in the process context when we run code as a +result of a system call or when we run in the context of a kernel thread. When +we run in a routine to handle an interrupt or a deferrable action, we run in +an interrupt context.

+

Some of the kernel API calls can block the current process. Common examples are +using a semaphore or waiting for a condition. In this case, the process is +put into the WAITING state and another process is running. An interesting +situation occurs when a function that can lead to the current process being +suspended is called from an interrupt context. In this case, there is no +current process, and therefore the results are unpredictable. Whenever the +operating system detects this condition, it will generate an error condition that +will cause the operating system to shut down.

+
+
+

Locking

+

One of the most important features of kernel programming is parallelism. Linux +supports SMP systems with multiple processors and kernel preemptivity. This +makes kernel programming more difficult because access to global variables must +be synchronized with either spinlock primitives or blocking primitives. Although +it is recommended to use blocking primitives, they can not be used in an +interrupt context, so the only locking solution in the context of an interrupt +is spinlocks.

+

Spinlocks are used in order to achieve mutual exclusion. When it can not get +access to the critical region, it does not suspend the current process, but it +uses the busy-waiting mechanism (waiting in a while() loop for the lock +to be released). +The code that runs in the critical region protected by a spinlock is not allowed +to suspend the current process (it must adhere to the execution conditions in +the interrupt context). Moreover, the CPU will not be released except for +the case of an interrupt. Due to the mechanism used, it is important that a +spinlock is held for as little time as possible.

+
+
+

Preemptivity

+

Linux uses preemptive kernels. The notion of preemptive multitasking should not +be confused with the notion of a preemptive kernel. The notion of preemptive +multitasking refers to the fact that the operating system forcefully interrupts +a process running in user space when its quantum (time slice) expires, in order +to run another process. +A kernel is preemptive if a process running in kernel mode (as a result of a +system call) can be interrupted so that another process is being run.

+

Because of preemptivity, when we share resources between two portions of code +that can run from different process contexts, we need to protect ourselves with +synchronization primitives, even in the case of a single processor.

+
+
+

Linux Kernel API

+
+

Convention indicating errors

+

For Linux kernel programming, the convention used for calling functions to +indicate success is the same as in UNIX programming: 0 for success, or a value +other than 0 for failure. +For failures, negative values are returned as shown in the example below:

+
if (alloc_memory() != 0)
+    return -ENOMEM;
+
+if (user_parameter_valid() != 0)
+    return -EINVAL;
+
+
+

The exhaustive list of errors and a summary explanation can be found in +include/uapi/asm-generic/errno-base.h and in +include/uapi/asm-generic/errno.h.

+
+
+

Strings of characters

+

In Linux, the kernel programmer is provided with the usual routine functions: +strcpy(), strncpy(), strlcpy(), strcat(), +strncat(), strlcat(), strcmp(), strncmp(), +strnicmp(), strchr(), strnchr(), strrchr(), +strstr(), strlen(), memset(), memmove(), +memcmp(), etc. These functions are declared in the +include/linux/string.h header and are implemented in the kernel in the +lib/string.c file.

+
+
+

printk

+

The printf equivalent in the kernel is printk, defined in +include/linux/printk.h. The printk() syntax is very similar +to printf(). The first +parameter of printk() decides the log category into which the current log +falls:

+
#define KERN_EMERG   "<0>"  /* system is unusable */
+#define KERN_ALERT   "<1>"  /* action must be taken immediately */
+#define KERN_CRIT    "<2>"  /* critical conditions */
+#define KERN_ERR     "<3>"  /* error conditions */
+#define KERN_WARNING "<4>"  /* warning conditions */
+#define KERN_NOTICE  "<5>"  /* normal but significant condition */
+#define KERN_INFO    "<6>"  /* informational */
+#define KERN_DEBUG   "<7>"  /* debug-level messages */
+
+
+

Thus, a warning message in the kernel would be sent with:

+
printk(KERN_WARNING "my_module input string %s\n", buff);
+
+
+

If the logging level is missing from the printk() call, logging is done +with the default level at the time of the call. One thing to keep in mind is +that messages sent with printk() are visible on the console if and +only if their level exceeds the default level set on the console.

+

To reduce the size of lines when using printk(), it is recommended to +use the following help functions instead of directly using the printk() +call:

+
pr_emerg(fmt, ...); /* similar to printk(KERN_EMERG pr_fmt(fmt), ...); */
+pr_alert(fmt, ...); /* similar to printk(KERN_ALERT pr_fmt(fmt), ...); */
+pr_crit(fmt, ...); /* similar to printk(KERN_CRIT pr_fmt(fmt), ...); */
+pr_err(fmt, ...); /* similar to printk(KERN_ERR pr_fmt(fmt), ...); */
+pr_warn(fmt, ...); /* similar to printk(KERN_WARNING pr_fmt(fmt), ...); */
+pr_notice(fmt, ...); /* similar to printk(KERN_NOTICE pr_fmt(fmt), ...); */
+pr_info(fmt, ...); /* similar to printk(KERN_INFO pr_fmt(fmt), ...); */
+pr_debug(fmt, ...); /* similar to printk(KERN_DEBUG pr_fmt(fmt), ...); */
+
+
+

A special case is pr_debug() that calls the printk() function +only when the DEBUG macro is defined or if dynamic debugging is used.

+
+
+

Memory allocation

+

In Linux only resident memory can be allocated, using kmalloc() call. +A typical kmalloc() call is presented below:

+
#include <linux/slab.h>
+
+string = kmalloc (string_len + 1, GFP_KERNEL);
+if (!string) {
+    //report error: -ENOMEM;
+}
+
+
+

As you can see, the first parameter indicates the size in bytes of the allocated +area. The function returns a pointer to a memory area that can be directly used +in the kernel, or NULL if memory could not be allocated. The second +parameter specifies how allocation should be done and the most commonly used +values for this are:

+
+
    +
  • GFP_KERNEL - using this value may cause the current process to +be suspended. Thus, it can not be used in the interrupt context.
  • +
  • GFP_ATOMIC - using this value ensures that the +kmalloc() function does not suspend the current process. It can be +used anytime.
  • +
+
+

The counterpart to the kmalloc() function is kfree(), a function +that receives as argument an area allocated by kmalloc(). This function +does not suspend the current process and can therefore be called from any +context.

+
+
+

lists

+

Because linked lists are often used, the Linux kernel API provides a unified +way of defining and using lists. This involves using a +struct list_head element in the structure we want to consider as a +list node. The struct list_head is defined in +include/linux/list.h along with all the other functions that manipulate +the lists. The following code shows the definition of +the struct list_head and the use of an element of this type in another +well-known structure in the Linux kernel:

+
struct list_head {
+    struct list_head *next, *prev;
+};
+
+struct task_struct {
+    ...
+    struct list_head children;
+    ...
+};
+
+
+

The usual routines for working with lists are the following:

+
+
    +
  • LIST_HEAD(name) is used to declare the sentinel of a list
  • +
  • INIT_LIST_HEAD(struct list_head *list)() is used to initialize the +sentinel of a list when dynamic allocation is made, by setting the value of +the next and prev fields to list.
  • +
  • list_add(struct list_head *new, struct list_head *head)() adds the +new element after the head element.
  • +
  • list_del(struct list_head *entry)() deletes the item at the +entry address of the list it belongs to.
  • +
  • list_entry(ptr, type, member) returns the structure with the +type type that contains the element ptr from the list, +having the name member within the structure.
  • +
  • list_for_each(pos, head) iterates over a list using +pos as a cursor.
  • +
  • list_for_each_safe(pos, n, head) iterates over a list using +pos as a cursor and n as a temporary cursor. +This macro is used to delete an item from the list.
  • +
+
+

The following code shows how to use these routines:

+
#include <linux/slab.h>
+#include <linux/list.h>
+
+struct pid_list {
+    pid_t pid;
+    struct list_head list;
+};
+
+LIST_HEAD(my_list);
+
+static int add_pid(pid_t pid)
+{
+    struct pid_list *ple = kmalloc(sizeof *ple, GFP_KERNEL);
+
+    if (!ple)
+        return -ENOMEM;
+
+    ple->pid = pid;
+    list_add(&ple->list, &my_list);
+
+    return 0;
+}
+
+static int del_pid(pid_t pid)
+{
+    struct list_head *i, *tmp;
+    struct pid_list *ple;
+
+    list_for_each_safe(i, tmp, &my_list) {
+        ple = list_entry(i, struct pid_list, list);
+        if (ple->pid == pid) {
+            list_del(i);
+            kfree(ple);
+            return 0;
+        }
+    }
+
+    return -EINVAL;
+}
+
+static void destroy_list(void)
+{
+    struct list_head *i, *n;
+    struct pid_list *ple;
+
+    list_for_each_safe(i, n, &my_list) {
+        ple = list_entry(i, struct pid_list, list);
+        list_del(i);
+        kfree(ple);
+    }
+}
+
+
+

The evolution of the list can be seen in the following figure:

+../_images/list_evolution1.png +

You see the stack type behavior introduced by the list_add macro, +and the use of a sentinel.

+

From the above example, it can be noticed that the way to define and use a list +(double-linked) is generic and, at the same time, it does not introduce an +additional overhead. The struct list_head is used to maintain the +links between the list elements. It can be noticed that iterating over the list +is also done with this structure, and that retrieving a list element can be done +using list_entry. This idea of implementing and using a list is not +new, as it has already been described in The Art of Computer Programming by +Donald Knuth in the 1980s.

+

Several kernel list functions and macro definitions are presented and explained +in the include/linux/list.h header.

+
+
+

Spinlock

+

spinlock_t (defined in linux/spinlock.h) is the basic type +that implements the spinlock concept in Linux. It describes a spinlock, and the +operations associated with a spinlock are spin_lock_init(), +spin_lock(), spin_unlock(). An example of use is given below:

+
#include <linux/spinlock.h>
+
+DEFINE_SPINLOCK(lock1);
+spinlock_t lock2;
+
+spin_lock_init(&lock2);
+
+spin_lock(&lock1);
+/* critical region */
+spin_unlock(&lock1);
+
+spin_lock(&lock2);
+/* critical region */
+spin_unlock(&lock2);
+
+
+

In Linux, you can use reader-writer spinlocks, useful for readers-writers +problems. +These types of locks are identified by rwlock_t, and the functions +that can work on a reader-writer spinlock are +rwlock_init(), +read_lock(), +write_lock(). +An example of use:

+
#include <linux/spinlock.h>
+
+DEFINE_RWLOCK(lock);
+
+struct pid_list {
+    pid_t pid;
+    struct list_head list;
+};
+
+/*
+ * Check whether @pid is present in the list @lh.
+ *
+ * The list is walked while holding the read side of the reader-writer lock.
+ * Returns 1 if an entry with a matching pid is found, 0 otherwise.
+ */
+int have_pid(struct list_head *lh, int pid)
+{
+    struct list_head *i;
+    int found = 0;
+
+    read_lock(&lock);
+    list_for_each(i, lh) {
+        struct pid_list *pl = list_entry(i, struct pid_list, list);
+        if (pl->pid == pid) {
+            found = 1;
+            break;
+        }
+    }
+    /* single exit path: the lock is released exactly once */
+    read_unlock(&lock);
+
+    return found;
+}
+
+/*
+ * Insert the caller-provided entry @pl into the list @lh while holding the
+ * write side of the reader-writer lock.
+ */
+void add_pid(struct list_head *lh, struct pid_list *pl)
+{
+    write_lock(&lock);
+    list_add(&pl->list, lh);
+    write_unlock(&lock);
+}
+
+
+
+
+

mutex

+

A mutex is a variable of the struct mutex type (defined in +linux/mutex.h). +Functions and macros for working with mutexes are listed below:

+
#include <linux/mutex.h>
+
+/* functions for mutex initialization */
+void mutex_init(struct mutex *mutex);
+DEFINE_MUTEX(name);
+
+/* functions for mutex acquire */
+void mutex_lock(struct mutex *mutex);
+
+/* functions for mutex release */
+void mutex_unlock(struct mutex *mutex);
+
+
+

Operations are similar to classic mutex operations in user-space or spinlock +operations: the mutex is acquired before entering the critical region and it is +released after exiting the critical region. Unlike spinlocks, these operations +can only be used in process context.

+
+
+

Atomic variables

+

Often, you only need to synchronize access to a simple variable, such as a +counter. For this, an atomic_t type can be used (defined in +include/linux/atomic.h), that holds an integer value. Below are some +operations that can be performed on an atomic_t variable.

+
#include <asm/atomic.h>
+
+void atomic_set(atomic_t *v, int i);
+int atomic_read(atomic_t *v);
+void atomic_add(int i, atomic_t *v);
+void atomic_sub(int i, atomic_t *v);
+void atomic_inc(atomic_t *v);
+void atomic_dec(atomic_t *v);
+int atomic_inc_and_test(atomic_t *v);
+int atomic_dec_and_test(atomic_t *v);
+int atomic_cmpxchg(atomic_t *v, int old, int new);
+
+
+
+

Use of atomic variables

+

A common way of using atomic variables is to store the status of an action +(e.g. a flag). So we can use an atomic variable to mark exclusive actions. For +example, we consider that an atomic variable can have the LOCKED and UNLOCKED +values, and if the respective variable equals LOCKED then a specific function +should return -EBUSY. +Such a usage is shown schematically in the code below:

+
#define LOCKED       0
+#define UNLOCKED     1
+
+static atomic_t flag;
+
+static int my_acquire(void)
+{
+     int initial_flag;
+
+     /*
+      * Check if flag is UNLOCKED; if so, lock it and do it atomically.
+      *
+      * This is the atomic equivalent of
+      *      if (flag == UNLOCKED)
+      *              flag = LOCKED;
+      *      else
+      *              return -EBUSY;
+      */
+     initial_flag = atomic_cmpxchg(&flag, UNLOCKED, LOCKED);
+     if (initial_flag == LOCKED) {
+             printk(KERN_ALERT "Already locked.\n");
+             return -EBUSY;
+     }
+
+     /* Do your thing after getting the lock. */
+     [...]
+}
+
+static void my_release(void)
+{
+     /* Release flag; mark it as unlocked. */
+     atomic_set(&flag, UNLOCKED);
+}
+
+void my_init(void)
+{
+     [...]
+     /* Atomic variable is initially unlocked. */
+     atomic_set(&flag, UNLOCKED);
+
+     [...]
+}
+
+
+

The above code is the equivalent of using a trylock (such as +pthread_mutex_trylock()).

+

We can also use a variable to store the size of a buffer and for atomic +updates of the respective variable. The code below is such an example:

+
static unsigned char buffer[MAX_SIZE];
+static atomic_t size;
+
+static void add_to_buffer(unsigned char value)
+{
+     buffer[atomic_read(&size)] = value;
+     atomic_inc(&size);
+}
+
+/*
+ * Remove and return the most recently added byte.
+ *
+ * add_to_buffer() stores at index size and then increments size, so the last
+ * valid element lives at size - 1: decrement first, then read that slot.
+ * NOTE(review): there is no emptiness check, just as add_to_buffer() has no
+ * fullness check - presumably callers are expected to guarantee both.
+ */
+static unsigned char remove_from_buffer(void)
+{
+     unsigned char value;
+
+     atomic_dec(&size);
+     value = buffer[atomic_read(&size)];
+
+     return value;
+}
+
+static void reset_buffer(void)
+{
+     atomic_set(&size, 0);
+}
+
+void my_init(void)
+{
+     [...]
+     /* Initialized buffer and size. */
+     atomic_set(&size, 0);
+     memset(buffer, 0, sizeof(buffer));
+
+     [...]
+}
+
+
+
+
+
+

Atomic bitwise operations

+

The kernel provides a set of functions (in asm/bitops.h) that modify or +test bits in an atomic way.

+
#include <asm/bitops.h>
+
+void set_bit(int nr, void *addr);
+void clear_bit(int nr, void *addr);
+void change_bit(int nr, void *addr);
+int test_and_set_bit(int nr, void *addr);
+int test_and_clear_bit(int nr, void *addr);
+int test_and_change_bit(int nr, void *addr);
+
+
+

Addr represents the address of the memory area whose bits are being +modified or tested and nr is the bit on which the operation is +performed.

+
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is kernel_api. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/kernel_api/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using LXR find the definitions of the following symbols in the Linux kernel:

+
+
    +
  • struct list_head
  • +
  • INIT_LIST_HEAD()
  • +
  • list_add()
  • +
  • list_for_each
  • +
  • list_entry
  • +
  • container_of
  • +
  • offsetof
  • +
+
+
+
+

1. Memory allocation in Linux kernel

+

Generate the skeleton for the task named 1-mem and browse the +contents of the mem.c file. Observe the use of kmalloc() +call for memory allocation.

+
+
    +
  1. Compile the source code and load the mem.ko module using +insmod.
  2. +
  3. View the kernel messages using the dmesg command.
  4. +
  5. Unload the kernel module using the rmmod mem command.
  6. +
+
+
+

Note

+

Review the Memory Allocation section in the lab.

+
+
+
+

2. Sleeping in atomic context

+

Generate the skeleton for the task named 2-sched-spin and browse +the contents of the sched-spin.c file.

+
+
    +
  1. Compile the source code and load the module, according to the above info: +(make build and make copy)
  2. +
  3. Notice that it waits for 5 seconds until the insertion +command completes.
  4. +
  5. Unload the kernel module.
  6. +
  7. Look for the lines marked with: TODO 0 to create an atomic +section. Re-compile the source code and reload the module into +the kernel.
  8. +
+
+

You should now get an error. Look at the stack trace. What is the +cause of the error?

+
+

Hint

+

In the error message, follow the line containing the BUG +for a description of the error. You are not allowed to sleep in +atomic context. The atomic context is given by a section +between a lock operation and an unlock on a spinlock.

+
+
+

Note

+

The +schedule_timeout() function, combined with the +set_current_state macro, forces the current process to wait +for 5 seconds.

+
+
+

Note

+

Review the Contexts of execution, Locking and Spinlock +sections.

+
+
+
+

3. Working with kernel memory

+

Generate the skeleton for the task named 3-memory and +browse the contents of the memory.c file. Notice the comments +marked with TODO. You must allocate 4 structures of type struct +task_info and initialize them (in memory_init()), then print and +free them (in memory_exit()).

+
+
    +
  1. (TODO 1) Allocate memory for struct task_info structure and +initialize its fields:

    +
      +
    • The pid field to the PID transmitted as a parameter;
    • +
    • The timestamp field to the value of the jiffies +variable, which holds the number of ticks that have occurred since the +system booted.
    • +
    +
  2. +
  3. (TODO 2) Allocate struct task_info for the current process, +the parent process, the next process, the next process of the next +process, with the following information:

    +
      +
    • PID of the current process, which can be retrieved from +struct task_struct structure, returned by current +macro.
    • +
    +
    +

    Hint

    +

    Search for pid in task_struct.

    +
    +
      +
    • PID of the parent process of the current process.
    • +
    +
    +

    Hint

    +

    Search for the relevant field in the struct task_struct +structure. Look for "parent".

    +
    +
      +
    • PID of the next process from the list of processes, relative to the +current process.
    • +
    +
    +

    Hint

    +

    Use the next_task macro, which returns a pointer to the next +process (i.e. a struct task_struct structure).

    +
    +
      +
    • PID of the next process of the next process, relative to the current +process.
    • +
    +
    +

    Hint

    +

    Call the next_task macro 2 times.

    +
    +
  4. +
  5. (TODO 3) Display the four structures.

    +
      +
    • Use printk() to display their two fields:
    • +
    +

    pid and timestamp.

    +
  6. +
  7. (TODO 4) Release the memory occupied by the structures +(use kfree()).

    +
  8. +
+
+
+

Hint

+
    +
  • You can access the current process using current +macro.
  • +
  • Look for the relevant fields in the struct task_struct +structure (pid, parent).
  • +
  • Use the next_task macro. The macro returns a pointer to +the next process (i.e. a struct task_struct *).
  • +
+
+
+

Note

+

The struct task_struct structure contains two fields to +designate the parent of a task:

+
    +
  • real_parent points to the process that created the +task or to process with pid 1 (init) if the parent +completed its execution.
  • +
  • parent points to the current parent of the task (the +process to which it will be reported when the task completes +execution).
  • +
+

In general, the values of the two fields are the same, but +there are situations where they differ, for example when +using the ptrace() system call.

+
+
+

Hint

+

Review the Memory allocation section in the lab.

+
+
+
+

4. Working with kernel lists

+

Generate the skeleton for the task named 4-list. Browse the +contents of the list.c file and notice the comments marked with +TODO. The current process will add the four structures from the +previous exercise into a list. The list will be built in the +task_info_add_for_current() function which is called when module is +loaded. The list will be printed and deleted in the list_exit() +function and the task_info_purge_list() function.

+
+
    +
  1. (TODO 1) Complete the task_info_add_to_list() function to allocate +a struct task_info structure and add it to the list.
  2. +
  3. (TODO 2) Complete the task_info_purge_list() function to delete +all the elements in the list.
  4. +
  5. Compile the kernel module. Load and unload the module by +following the messages displayed by the kernel.
  6. +
+
+
+

Hint

+

Review the lab's Lists section. When deleting items from +the list, you will need to use either the +list_for_each_safe or list_for_each_entry_safe +macros.

+
+
+
+

5. Working with kernel lists for process handling

+

Generate the skeleton for the task named 5-list-full. Browse the +contents of the list-full.c and notice comments marked with +TODO. In addition to the 4-list functionality we add the +following:

+
+
    +
  • A count field showing how many times a process has been "added" +to the list.

    +
  • +
  • If a process is "added" several times, no new entry is created in +the list, but:

    +
    +
      +
    • Update the timestamp field.
    • +
    • Increment count.
    • +
    +
    +
  • +
  • To implement the counter facility, add a task_info_find_pid() +function that searches for a pid in the existing list.

    +
  • +
  • If found, return the reference to the task_info struct. If +not, return NULL.

    +
  • +
  • An expiration facility. If a process was added more than 3 +seconds ago and if it does not have a count greater than 5 then +it is considered expired and is removed from the list.

    +
  • +
  • The expiration facility is already implemented in the +task_info_remove_expired() function.

    +
  • +
+
    +
  1. (TODO 1) Implement the task_info_find_pid() function.

    +
  2. +
  3. (TODO 2) Change a field of an item in the list so it does not +expire. It must not satisfy a part of the expiration condition +from task_info_remove_expired().

    +
    +

    Hint

    +

    For TODO 2, extract the first element from the list (the one +referred by head.next) and set the count +field to a large enough value. Use atomic_set() function.

    +
    +
  4. +
  5. Compile, copy, load and unload the kernel module following the displayed +messages. +Kernel module loading will take some time, because sleep() is +being called by schedule_timeout() function.

    +
  6. +
+
+
+
+

6. Synchronizing list work

+

Generate the skeleton for the task named 6-list-sync.

+
+
    +
  1. Browse the code and look for TODO 1 string.
  2. +
  3. Use a spinlock or a read-write lock to synchronize access to the +list.
  4. +
  5. Compile, load and unload the kernel module.
  6. +
+
+
+

Important

+

Always lock data, not code!

+
+
+

Note

+

Read Spinlock section of the lab.

+
+
+
+

7. Test module calling in our list module

+

Generate the skeleton for the task named 7-list-test and browse +the contents of the list-test.c file. We'll use it as a test +module. It will call functions exported by the 6-list-sync +task. The exported functions are the ones marked with extern in +list-test.c file.

+

Uncomment the commented code from 7-list-test.c. Look for TODO 1.

+

To export the above functions from the module located at 6-list-sync/ +directory, the following steps are required:

+
+
    +
  1. Functions must not be static.
  2. +
  3. Use the EXPORT_SYMBOL macro to export the kernel symbols. For +example: EXPORT_SYMBOL(task_info_remove_expired);. The +macro must be used for each function after the function is defined. +Browse the code and look for the TODO 2 string in the +list-sync.c.
  4. +
  5. In the 6-list-sync module, remove the code that prevents the +expiration of a list item (it contradicts this exercise).
  6. +
  7. Compile and load the module from 6-list-sync/. Once loaded, it +exposes exported functions and can be used by the test +module. You can check this by searching for the function names +in /proc/kallsyms before and after loading the module.
  8. +
  9. Compile the test module and then load it.
  10. +
  11. Use lsmod to check that the two modules have been loaded. +What do you notice?
  12. +
  13. Unload the kernel test module.
  14. +
+
+

What should be the unload order of the two modules (the module from +6-list-sync and the test module)? What happens if you use another order?

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab3-device-drivers.html b/refs/pull/405/merge/so2/lab3-device-drivers.html new file mode 100644 index 00000000..02a99dc1 --- /dev/null +++ b/refs/pull/405/merge/so2/lab3-device-drivers.html @@ -0,0 +1,1235 @@ + + + + + + SO2 Lab 03 - Character device drivers — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 03 - Character device drivers

+
+

Laboratory objectives

+
+
    +
  • understand the concepts behind character device driver
  • +
  • understand the various operations that can be performed on character devices
  • +
  • working with waiting queues
  • +
+
+
+
+

Overview

+

In UNIX, hardware devices are accessed by the user through special device +files. These files are grouped into the /dev directory, and system calls +open, read, write, close, lseek, mmap etc. are +redirected by the operating system to the device driver associated with the +physical device. The device driver is a kernel component (usually a module) +that interacts with a hardware device.

+

In the UNIX world there are two categories of device files and thus +device drivers: character and block. This division is done by the speed, +volume and way of organizing the data to be transferred from the device to the +system and vice versa. In the first category, there are slow devices, which +manage a small amount of data, and access to data does not require frequent +seek queries. Examples are devices such as keyboard, mouse, serial ports, +sound card, joystick. In general, operations with these devices (read, write) +are performed sequentially byte by byte. The second category includes devices +where data volume is large, data is organized on blocks, and search is common. +Examples of devices that fall into this category are hard drives, cdroms, ram +disks, magnetic tape drives. For these devices, reading and writing is done at +the data block level.

+

For the two types of device drivers, the Linux kernel offers different APIs. +If for character devices system calls go directly to device drivers, in case of +block devices, the drivers do not work directly with system calls. In +the case of block devices, communication between the user-space and the block +device driver is mediated by the file management subsystem and the block device +subsystem. The role of these subsystems is to prepare the device driver's +necessary resources (buffers), to keep the recently read data in the cache +buffer, and to order the read and write operations for performance reasons.

+
+
+

Majors and minors

+

In UNIX, the devices traditionally had a unique, fixed identifier associated +with them. This tradition is preserved in Linux, although identifiers can be +dynamically allocated (for compatibility reasons, most drivers still use static +identifiers). The identifier consists of two parts: major and minor. The first +part identifies the device type (IDE disk, SCSI disk, serial port, etc.) +and the second one identifies the device (first disk, second serial port, +etc.). Most times, the major identifies the driver, while the minor identifies +each physical device served by the driver. In general, a driver will have an +associated major and will be responsible for all minors associated with that +major.

+
$ ls -la /dev/hda? /dev/ttyS?
+brw-rw----  1 root disk    3,  1 2004-09-18 14:51 /dev/hda1
+brw-rw----  1 root disk    3,  2 2004-09-18 14:51 /dev/hda2
+crw-rw----  1 root dialout 4, 64 2004-09-18 14:52 /dev/ttyS0
+crw-rw----  1 root dialout 4, 65 2004-09-18 14:52 /dev/ttyS1
+
+
+

As can be seen from the example above, device-type information can be found +using the ls command. The special character files are identified by the c +character in the first column of the command output, and the block type by the +character b. In columns 5 and 6 of the result you can see the +major, respectively the minor for each device.

+

Certain major identifiers are statically assigned to devices (in the +Documentation/admin-guide/devices.txt file from the kernel sources). When choosing the +identifier for a new device, you can use two methods: static (choose a number +that does not seem to be used already) or dynamic. The loaded devices, along +with their major identifiers, are listed in /proc/devices.

+

To create a device type file, use the mknod command; the command receives the +type (block or character), major and minor of the device +(mknod name type major minor). Thus, if you want to create a character device +named mycdev with the major 42 and minor 0, use the command:

+
# mknod /dev/mycdev c 42 0
+
+
+

To create the block device with the name mybdev with the major 240 and minor 0 +the command will be:

+
# mknod /dev/mybdev b 240 0
+
+
+

Next, we'll refer to character devices as drivers.

+
+
+

Data structures for a character device

+

In the kernel, a character-type device is represented by +struct cdev, a structure used to register it in the +system. Most driver operations use three important structures: +struct file_operations, struct file and struct inode.

+
+

struct file_operations

+

As mentioned above, the character device drivers receive unaltered system calls +made by users over device-type files. Consequently, implementation of a character +device driver means implementing the system calls specific to files: open, +close, read, write, lseek, mmap, etc. These operations are +described in the fields of the struct file_operations structure:

+
#include <linux/fs.h>
+
+struct file_operations {
+    struct module *owner;
+    loff_t (*llseek) (struct file *, loff_t, int);
+    ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
+    ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
+    [...]
+    long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
+    [...]
+    int (*open) (struct inode *, struct file *);
+    int (*flush) (struct file *, fl_owner_t id);
+    int (*release) (struct inode *, struct file *);
+    [...]
+
+
+

It can be noticed that the signature of the function differs from the system +call that the user uses. The operating system sits between the user and +the device driver to simplify implementation in the device driver.

+

open does not receive the parameter path or the various parameters that control +the file opening mode. Similarly, read, write, release, ioctl, lseek +do not receive as a parameter a file descriptor. Instead, these routines receive as +parameters two structures: file and inode. Both structures represent a file, +but from different perspectives.

+
+
Most parameters for the presented operations have a direct meaning:
+
    +
  • file and inode identify the device type file;
  • +
  • size is the number of bytes to be read or written;
  • +
  • offset is the displacement to be read or written (to be updated +accordingly);
  • +
  • user_buffer is the user-space buffer that is read from / written to;
  • +
  • whence is the way to seek (the position where the search operation starts);
  • +
  • cmd and arg are the parameters sent by the users to the ioctl call (IO +control).
  • +
+
+
+
+
+

inode and file structures

+

An inode represents a file from the point of view of the file system. Attributes +of an inode are the size, rights, times associated with the file. An inode uniquely +identifies a file in a file system.

+

The file structure is still a file, but closer to the user's point of view. +From the attributes of the file structure we list: the inode, the file name, +the file opening attributes, the file position. All open files at a given time +have an associated file structure.

+

To understand the differences between inode and file, we will use an analogy +from object-oriented programming: if we consider a class inode, then the files +are objects, that is, instances of the inode class. Inode represents the static +image of the file (the inode has no state), while the file represents the +dynamic image of the file (the file has state).

+

Returning to device drivers, the two entities are almost always used in +standard ways: the inode is used to determine the major and minor of the device on +which the operation is performed, and the file is used to determine the flags +with which the file was opened, but also to save and access (later) private +data.

+

The file structure contains, among many fields:

+
+
    +
  • f_mode, which specifies read (FMODE_READ) or write +(FMODE_WRITE);
  • +
  • f_flags, which specifies the file opening flags (O_RDONLY, +O_NONBLOCK, O_SYNC, O_APPEND, O_TRUNC, etc.);
  • +
  • f_op, which specifies the operations associated with the file (pointer to +the file_operations structure );
  • +
  • private_data, a pointer that can be used by the programmer to store +device-specific data; The pointer will be initialized to a memory location +assigned by the programmer.
  • +
  • f_pos, the offset within the file
  • +
+
+

The inode structure contains, among much information, an i_cdev +field, which is a pointer to the structure that defines the character +device (when the inode corresponds to a character device).

+
+
+
+

Implementation of operations

+

To implement a device driver, it is recommended that you create a structure +that contains information about the device, information used in the module. In +the case of a driver for a character device, the structure will contain a cdev +structure field to refer to the device. The following example uses the struct +my_device_data:

+
#include <linux/fs.h>
+#include <linux/cdev.h>
+
+struct my_device_data {
+    struct cdev cdev;
+    /* my data starts here */
+    //...
+};
+
+static int my_open(struct inode *inode, struct file *file)
+{
+    struct my_device_data *my_data;
+
+    my_data = container_of(inode->i_cdev, struct my_device_data, cdev);
+
+    file->private_data = my_data;
+    //...
+}
+
+/*
+ * read handler: retrieves the per-device data that my_open() stored in
+ * file->private_data.
+ *
+ * The return type is ssize_t so that the function matches the read member of
+ * struct file_operations and can be assigned to it without a type mismatch.
+ */
+static ssize_t my_read(struct file *file, char __user *user_buffer, size_t size, loff_t *offset)
+{
+    struct my_device_data *my_data;
+
+    /* private_data is a void pointer, so no cast is needed */
+    my_data = file->private_data;
+
+    //...
+}
+
+
+

A structure like my_device_data will contain the data associated with a device. +The cdev field (cdev type) is a character-type device and is used to record it +in the system and identify the device. The pointer to the cdev member can be +found using the i_cdev field of the inode structure (using the container_of +macro). In the private_data field of the file structure, information can be +stored at open which is then available in the read, write, release, etc. +routines.

+
+
+

Registration and unregistration of character devices

+

The registration/unregistration of a device is made by specifying the major and +minor. The dev_t type is used to keep the identifiers of a device (both major +and minor) and can be obtained using the MKDEV macro.

+

For the static assignment and unallocation of device identifiers, the +register_chrdev_region and unregister_chrdev_region functions are used:

+
#include <linux/fs.h>
+
+int register_chrdev_region(dev_t first, unsigned int count, char *name);
+void unregister_chrdev_region(dev_t first, unsigned int count);
+
+
+

It is recommended that device identifiers be dynamically assigned using the +alloc_chrdev_region function.

+

The sequence below reserves my_minor_count devices, starting with my_major +major and my_first_minor minor (if the max value for minor is exceeded, +move to the next major):

+
#include <linux/fs.h>
+...
+
+err = register_chrdev_region(MKDEV(my_major, my_first_minor), my_minor_count,
+                             "my_device_driver");
+if (err != 0) {
+    /* report error */
+    return err;
+}
+...
+
+
+

After assigning the identifiers, the character device will have to be +initialized (cdev_init) and the kernel will have to be notified (cdev_add). The +cdev_add function must be called only after the device is ready to receive +calls. Removing a device is done using the cdev_del function.

+
#include <linux/cdev.h>
+
+void cdev_init(struct cdev *cdev, struct file_operations *fops);
+int cdev_add(struct cdev *dev, dev_t num, unsigned int count);
+void cdev_del(struct cdev *dev);
+
+
+

The following sequence registers and initializes MY_MAX_MINORS devices:

+
#include <linux/fs.h>
+#include <linux/cdev.h>
+
+#define MY_MAJOR       42
+#define MY_MAX_MINORS  5
+
+struct my_device_data {
+    struct cdev cdev;
+    /* my data starts here */
+    //...
+};
+
+struct my_device_data devs[MY_MAX_MINORS];
+
+const struct file_operations my_fops = {
+    .owner = THIS_MODULE,
+    .open = my_open,
+    .read = my_read,
+    .write = my_write,
+    .release = my_release,
+    .unlocked_ioctl = my_ioctl
+};
+
+/*
+ * Reserve MY_MAX_MINORS device numbers under MY_MAJOR, then initialize and
+ * register one cdev per minor.
+ *
+ * Returns 0 on success. If reserving the region or adding any cdev fails,
+ * everything registered so far is undone and the error code is returned.
+ */
+int init_module(void)
+{
+    int i, err;
+
+    err = register_chrdev_region(MKDEV(MY_MAJOR, 0), MY_MAX_MINORS,
+                                 "my_device_driver");
+    if (err != 0) {
+        /* report error */
+        return err;
+    }
+
+    for (i = 0; i < MY_MAX_MINORS; i++) {
+        /* initialize devs[i] fields */
+        cdev_init(&devs[i].cdev, &my_fops);
+        err = cdev_add(&devs[i].cdev, MKDEV(MY_MAJOR, i), 1);
+        if (err != 0)
+            goto out_unwind;
+    }
+
+    return 0;
+
+out_unwind:
+    /* devs[i] was not added; delete only the cdevs before it */
+    while (--i >= 0)
+        cdev_del(&devs[i].cdev);
+    unregister_chrdev_region(MKDEV(MY_MAJOR, 0), MY_MAX_MINORS);
+
+    return err;
+}
+
+
+

While the following sequence deletes and unregisters them:

+
void cleanup_module(void)
+{
+    int i;
+
+    for(i = 0; i < MY_MAX_MINORS; i++) {
+        /* release devs[i] fields */
+        cdev_del(&devs[i].cdev);
+    }
+    unregister_chrdev_region(MKDEV(MY_MAJOR, 0), MY_MAX_MINORS);
+}
+
+
+
+

Note

+

Initialization of the struct my_fops used the initialization +of members by name, defined in C99 standard (see designated +initializers and the file_operations structure). Structure +members who do not explicitly appear in this initialization +will be set to the default value for their type. For +example, after the initialization above, my_fops.mmap will +be NULL.

+
+
+
+

Access to the address space of the process

+

A driver for a device is the interface between an application and hardware. As +a result, we often have to access user-space data. Accessing it can not be done +directly (by dereferencing a user-space pointer). Direct access of a +user-space pointer can lead to incorrect behavior (depending on architecture, a +user-space pointer may not be valid or mapped to kernel-space), a kernel oops +(the user-mode pointer can refer to a non-resident memory area) or security +issues. Proper access to user-space data is done by calling the macros / +functions below:

+
#include <asm/uaccess.h>
+
+put_user(type val, type *address);
+get_user(type val, type *address);
+unsigned long copy_to_user(void __user *to, const void *from, unsigned long n);
+unsigned long copy_from_user(void *to, const void __user *from, unsigned long n);
+
+
+

All macros / functions return 0 in case of success and another value in case of +error and have the following roles:

+
+
    +
  • put_user stores the value val to the user-space address address; +the type can be one of 8, 16, 32 or 64 bits (the maximum supported type depends on the +hardware platform);
  • +
  • get_user analogue to the previous function, only that val will be set to a +value identical to the value at the user-space address given by address;
  • +
  • copy_to_user copies n bytes from the kernel-space address +referenced by from to the user-space address referenced by to;
  • +
  • copy_from_user copies n bytes from the user-space address +referenced by from to the kernel-space address referenced by to.
  • +
+
+

A common section of code that works with these functions is:

+
#include <asm/uaccess.h>
+
+/*
+ * Copy at most size bytes to user space.
+ * Return ''0'' on success and some other value on error.
+ */
+if (copy_to_user(user_buffer, kernel_buffer, size))
+    return -EFAULT;
+else
+    return 0;
+
+
+
+
+

Open and release

+

The open function performs the initialization of a device. In most cases, +these operations refer to initializing the device and filling in specific data +(if it is the first open call). The release function is about releasing +device-specific resources: unlocking specific data and closing the device if +the last call is close.

+

In most cases, the open function will have the following structure:

+
static int my_open(struct inode *inode, struct file *file)
+{
+    struct my_device_data *my_data =
+             container_of(inode->i_cdev, struct my_device_data, cdev);
+
+    /* validate access to device */
+    file->private_data = my_data;
+
+    /* initialize device */
+    ...
+
+    return 0;
+}
+
+
+

A problem that occurs when implementing the open function is access control. +Sometimes a device must be opened by only one user at a time; more specifically, a second +open must not be allowed before the release. To implement this restriction, you +choose a way to handle an open call for an already open device: it can return +an error (-EBUSY), block open calls until a release operation, or shut down +the device before doing the open.

+

A user-space call of the open and close functions on the device results in calls to +my_open and my_release in the driver. An example of a user-space call:

+
int fd = open("/dev/my_device", O_RDONLY);
+if (fd < 0) {
+    /* handle error */
+}
+
+/* do work */
+//..
+
+close(fd);
+
+
+
+
+

Read and write

+

The read and write operations reach the device driver as a +result of a user-space program calling the read or write system calls:

+
if (read(fd, buffer, size) < 0) {
+    /* handle error */
+}
+
+if (write(fd, buffer, size) < 0) {
+    /* handle error */
+}
+
+
+

The read and write functions transfer data between the device and the +user-space: the read function reads the data from the device and transfers it +to the user-space, while the write function reads the user-space data and writes it to the +device. The buffer received as a parameter is a user-space pointer, which is +why it is necessary to use the copy_to_user or copy_from_user functions.

+

The value returned by read or write can be:

+
+
    +
  • the number of bytes transferred; if the returned value is less than the size +parameter (the number of bytes requested), then it means that a partial +transfer was made. Most of the time, the user-space app calls the system call +(read or write) repeatedly until the required amount of data is transferred.
  • +
  • 0 to mark the end of the file in the case of read ; if write returns the +value 0 then it means that no byte has been written and that no error has +occurred; In this case, the user-space application retries the write call.
  • +
  • a negative value indicating an error code.
  • +
+
+

To perform a data transfer consisting of several partial transfers, the +following operations should be performed:

+
+
    +
  • transfer the maximum number of possible bytes between the buffer received +as a parameter and the device (writing to the device/reading from the device +will be done from the offset received as a parameter);
  • +
  • update the offset received as a parameter to the position from which the +next read / write data will begin;
  • +
  • return the number of bytes transferred.
  • +
+
+

The sequence below shows an example for the read function that takes +into account the internal buffer size, user buffer size and the offset:

+
static int my_read(struct file *file, char __user *user_buffer,
+                   size_t size, loff_t *offset)
+{
+    struct my_device_data *my_data = (struct my_device_data *) file->private_data;
+    ssize_t len = min(my_data->size - *offset, size);
+
+    if (len <= 0)
+        return 0;
+
+    /* read data from my_data->buffer to user buffer */
+    if (copy_to_user(user_buffer, my_data->buffer + *offset, len))
+        return -EFAULT;
+
+    *offset += len;
+    return len;
+}
+
+
+

The images below illustrate the read operation and how data is +transferred between the user-space and the driver:

+
+
    +
  1. when the driver has enough data available (starting with the OFFSET +position) to accurately transfer the required size (SIZE) to the user.
  2. +
  3. when a smaller amount is transferred than required.
  4. +
+
+../_images/read1.png +../_images/read21.png +

We can look at the read operation implemented by the driver as a response to a +user-space read request. In this case, the driver is responsible for advancing +the offset according to how much it reads and returning the read size (which +may be less than what is required).

+

The structure of the write function is similar:

+
static int my_write(struct file *file, const char __user *user_buffer,
+                    size_t size, loff_t * offset)
+{
+    struct my_device_data *my_data = (struct my_device_data *) file->private_data;
+    ssize_t len = min(my_data->size - *offset, size);
+
+    if (len <= 0)
+        return 0;
+
+    /* read data from user buffer to my_data->buffer */
+    if (copy_from_user(my_data->buffer + *offset, user_buffer, len))
+        return -EFAULT;
+
+    *offset += len;
+    return len;
+}
+
+
+

The write operation will respond to a write request from user-space. In +this case, depending on the maximum driver capacity (MAXSIZ), it can +write the full requested size or only part of it.

+../_images/write1.png +../_images/write21.png +
+
+

ioctl

+

In addition to read and write operations, a driver needs the ability to perform +certain physical device control tasks. These operations are accomplished by +implementing an ioctl function. Initially, the ioctl system call used the Big Kernel +Lock. That's why the call was gradually replaced with its unlocked version +called unlocked_ioctl. You can read more on LWN: +http://lwn.net/Articles/119652/

+
static long my_ioctl (struct file *file, unsigned int cmd, unsigned long arg);
+
+
+

cmd is the command sent from user-space. If a plain value is sent from the +user-space call, it can be accessed directly through arg. If a buffer is passed, the arg +value will be a pointer to it, and it must be accessed through copy_to_user +or copy_from_user.

+

Before implementing the ioctl function, the numbers corresponding to the +commands must be chosen. One method is to choose consecutive numbers starting +at 0, but it is recommended to use _IOC(dir, type, nr, size) macro definition +to generate ioctl codes. The macro definition parameters are as follows:

+
+
    +
  • dir represents the data transfer (_IOC_NONE , _IOC_READ, +_IOC_WRITE).
  • +
  • type represents the magic number (Documentation/ioctl/ioctl-number.txt);
  • +
  • nr is the ioctl code for the device;
  • +
  • size is the size of the transferred data.
  • +
+
+

The following example shows an implementation of an ioctl function:

+
#include <asm/ioctl.h>
+
+#define MY_IOCTL_IN _IOC(_IOC_WRITE, 'k', 1, sizeof(my_ioctl_data))
+
+static long my_ioctl (struct file *file, unsigned int cmd, unsigned long arg)
+{
+    struct my_device_data *my_data =
+         (struct my_device_data*) file->private_data;
+    my_ioctl_data mid;
+
+    switch(cmd) {
+    case MY_IOCTL_IN:
+        if( copy_from_user(&mid, (my_ioctl_data *) arg,
+                           sizeof(my_ioctl_data)) )
+            return -EFAULT;
+
+        /* process data and execute command */
+
+        break;
+    default:
+        return -ENOTTY;
+    }
+
+    return 0;
+}
+
+
+

At the user-space call for the ioctl function, the my_ioctl function of the +driver will be called. An example of such a user-space call:

+
if (ioctl(fd, MY_IOCTL_IN, buffer) < 0) {
+    /* handle error */
+}
+
+
+
+
+

Waiting queues

+

It is often necessary for a thread to wait for an operation to finish, +but it is desirable that this wait is not busy-waiting. Using waiting +queues we can block a thread until an event occurs. When the condition +is satisfied, elsewhere in the kernel, in another process, in an +interrupt or deferrable work, we will wake up the process.

+

A waiting queue is a list of processes that are waiting for a specific +event. A queue is defined with the wait_queue_head_t type and can +be used by the functions/macros:

+
#include <linux/wait.h>
+
+DECLARE_WAIT_QUEUE_HEAD(wq_name);
+
+void init_waitqueue_head(wait_queue_head_t *q);
+
+int wait_event(wait_queue_head_t q, int condition);
+
+int wait_event_interruptible(wait_queue_head_t q, int condition);
+
+int wait_event_timeout(wait_queue_head_t q, int condition, int timeout);
+
+int wait_event_interruptible_timeout(wait_queue_head_t q, int condition, int timeout);
+
+void wake_up(wait_queue_head_t *q);
+
+void wake_up_interruptible(wait_queue_head_t *q);
+
+
+

The roles of the macros / functions above are:

+
+
    +
  • init_waitqueue_head() initializes the queue; to initialize the +queue at compile time, you can use the DECLARE_WAIT_QUEUE_HEAD macro;
  • +
  • wait_event() and wait_event_interruptible() add the current thread to the +queue while the condition is false, set it to TASK_UNINTERRUPTIBLE or +TASK_INTERRUPTIBLE and call the scheduler to schedule a new thread; waiting +will be interrupted when another thread calls the wake_up function;
  • +
  • wait_event_timeout() and wait_event_interruptible_timeout() have the same +effect as the above functions, only waiting can be interrupted at the end of +the timeout received as a parameter;
  • +
  • wake_up() moves all threads in the TASK_INTERRUPTIBLE and +TASK_UNINTERRUPTIBLE states to the TASK_RUNNING state and removes these threads from the +queue;
  • +
  • wake_up_interruptible() same action, but only threads with TASK_INTERRUPTIBLE +status are woken up.
  • +
+
+

A simple example is that of a thread waiting for the value of a flag to change. The +initializations are done by the sequence:

+
#include <linux/sched.h>
+
+wait_queue_head_t wq;
+int flag = 0;
+
+init_waitqueue_head(&wq);
+
+
+

A thread will wait for the flag to be changed to a value other than zero:

+
wait_event_interruptible(wq, flag != 0);
+
+
+

While another thread will change the flag value and wake up the waiting threads:

+
flag = 1 ;
+wake_up_interruptible (&wq);
+
+
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is device_drivers. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/device_drivers/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using LXR find the definitions +of the following symbols in the Linux kernel:

+
+
    +
  • struct file
  • +
  • struct file_operations
  • +
  • generic_ro_fops
  • +
  • vfs_read()
  • +
+
+
+
+

1. Register/unregister

+

The driver will control a single device with the MY_MAJOR major and +MY_MINOR minor (the macros defined in the kernel/so2_cdev.c file).

+
+
    +
  1. Create /dev/so2_cdev character device node using mknod.

    +
    +

    Hint

    +

    Read Majors and minors section in the lab.

    +
    +
  2. +
  3. Implement the registration and deregistration of the device with the name +so2_cdev, respectively in the init and exit module functions. Implement TODO 1.

    + +
  4. +
  5. Display, using pr_info, a message after the registration and unregistration +operations to confirm that they were successful. Then load the module into the kernel:

    +
    $ insmod so2_cdev.ko
    +
    +
    +

    And see character devices in /proc/devices:

    +
    $ cat /proc/devices | less
    +
    +
    +

    Identify the device type registered with major 42 . Note that /proc/devices +contains only the device types (major) but not the actual devices (i.e. minors).

    +
    +

    Note

    +

    Entries in /dev are not created by loading the module. These can be created +in two ways:

    +
      +
    • manually, using the mknod command as we did above.
    • +
    • automatically using udev daemon
    • +
    +
    +
  6. +
  7. Unload the kernel module

    +
    rmmod so2_cdev
    +
    +
    +
  8. +
+
+
+
+

2. Register an already registered major

+

Modify MY_MAJOR so that it points to an already used major number.

+
+

Hint

+

See /proc/devices to get an already assigned major.

+
+

See errno-base.h +and figure out what the error code means. +Return to the initial configuration of the module.

+
+
+

3. Open and close

+

Run cat /dev/so2_cdev to read data from our char device. +Reading does not work because the driver does not have the open function implemented. +Follow comments marked with TODO 2 and implement them.

+
+
    +
  1. Initialize your device +
  2. +
  3. Implement the open and release functions in the driver.
  4. +
  5. Display a message in the open and release functions.
  6. +
  7. Read the /dev/so2_cdev file again. Follow the messages displayed by the kernel. +We still get an error because the read function is not yet implemented.
  8. +
+
+
+

Note

+

The prototype of a device driver's operations is in the file_operations +structure. Read Open and release section.

+
+
+
+

4. Access restriction

+

Restrict access to the device with atomic variables, so that a single process +can open the device at a time. The rest will receive the "device busy" error +(-EBUSY). Restricting access will be done in the open function implemented by +the driver. Follow comments marked with TODO 3 and implement them.

+
+
    +
  1. Add an atomic_t variable to the device structure.
  2. +
  3. Initialize the variable at module initialization.
  4. +
  5. Use the variable in the open function to restrict access to the device. We +recommend using atomic_cmpxchg().
  6. +
  7. Reset the variable in the release function to restore access to the device.
  8. +
  9. To test your deployment, you'll need to simulate a long-term use of your +device. To simulate a sleep, call the scheduler at the end of the device opening:
  10. +
+
+
set_current_state(TASK_INTERRUPTIBLE);
+schedule_timeout(1000);
+
+
+
+

Note

+

The advantage of the atomic_cmpxchg function is that it can check the +old value of the variable and set it up to a new value, all in one +atomic operation. Read more details about atomic_cmpxchg. +An example of use is here.

+
+
+
+

5. Read operation

+

Implement the read function in the driver. Follow comments marked with TODO 4 and implement them.

+
+
    +
  1. Keep a buffer in so2_device_data structure initialized with the value of MESSAGE macro. +Initializing this buffer will be done in module init function.
  2. +
  3. At a read call, copy the contents of the kernel space buffer into the user +space buffer.
      +
    • Use the copy_to_user() function to copy information from kernel space to +user space.
    • +
    • Ignore the size and offset parameters at this time. You can assume that +the buffer in user space is large enough. You do not need to check the +validity of the size argument of the read function.
    • +
    • The value returned by the read call is the number of bytes transmitted +from the kernel space buffer to the user space buffer.
    • +
    +
  4. +
  5. After implementation, test using cat /dev/so2_cdev.
  6. +
+
+
+

Note

+

The command cat /dev/so2_cdev does not end (use Ctrl+C). +Read the read and write sections and Access to the address space of the process. +If you want to display the offset value use a construction of the form: +pr_info("Offset: %lld \n", *offset); The data type loff_t (used by offset) is a typedef for long long int.

+
+

The cat command reads to the end of the file, and the end of the file is +signaled by returning the value 0 in the read. Thus, for a correct implementation, +you will need to update and use the offset received as a parameter in the read +function and return the value 0 when the user has reached the end of the buffer.

+

Modify the driver so that the cat command ends:

+
+
    +
  1. Use the size parameter.
  2. +
  3. For every read, update the offset parameter accordingly.
  4. +
  5. Ensure that the read function returns the number of bytes that were copied +into the user buffer.
  6. +
+
+
+

Note

+

By dereferencing the offset parameter it is possible to read and move the current +position in the file. Its value needs to be updated every time a read is done +successfully.

+
+
+
+

6. Write operation

+

Add the ability to write a message into kernel buffer to replace the predefined message. Implement +the write function in the driver. Follow comments marked with TODO 5.

+

Ignore the offset parameter at this time. You can assume that the driver buffer is +large enough. You do not need to check the validity of the write function size +argument.

+
+

Note

+

The prototype of a device driver's operations is in the file_operations +structure. +Test using commands:

+
echo "arpeggio"> /dev/so2_cdev
+cat /dev/so2_cdev
+
+
+

Read the read and write sections and Access to the address space of the process

+
+
+
+

7. ioctl operation

+

For this exercise, we want to add the ioctl MY_IOCTL_PRINT to display the +message from the IOCTL_MESSAGE macro in the driver. +Follow the comments marked with TODO 6

+

For this:

+
+
    +
  1. Implement the ioctl function in the driver.
  2. +
  3. We need to use user/so2_cdev_test.c to call the +ioctl function with the appropriate parameters.
  4. +
  5. To test, we will use a user-space program (user/so2_cdev_test.c) +which will call the ioctl function with the required arguments.
  6. +
+
+
+

Note

+

The macro MY_IOCTL_PRINT is defined in the file include/so2_cdev.h, +which is shared between the kernel module and the user-space program.

+

Read the ioctl section in the lab.

+
+
+

Note

+

The user-space code is compiled automatically at make build and +copied at make copy.

+

Because we need to compile the program for qemu machine which is 32 bit, +if your host is 64 bit then you need to install gcc-multilib package.

+
+
+
+
+

Extra Exercises

+
+

Ioctl with messaging

+

Add two ioctl operations to modify the message associated with the +driver. Use fixed-length buffer ( BUFFER_SIZE ).

+
+
    +
  1. Add the following operations to the driver's ioctl function:
      +
    • MY_IOCTL_SET_BUFFER for writing a message to the device;
    • +
    • MY_IOCTL_GET_BUFFER to read a message from your device.
    • +
    +
  2. +
  3. For testing, pass the required command line arguments to the +user-space program.
  4. +
+
+
+

Note

+

Read the ioctl and Access to the address space of the process +sections of the lab.

+
+
+
+

Ioctl with waiting queues

+

Add two ioctl operations to the device driver for queuing.

+
+
    +
  1. Add the following operations to the driver's ioctl function:
      +
    • MY_IOCTL_DOWN to add the process to a queue;
    • +
    • MY_IOCTL_UP to remove the process from a queue.
    • +
    +
  2. +
  3. Fill the device structure with a wait_queue_head_t field and a flag.
  4. +
  5. Do not forget to initialize the wait queue and flag.
  6. +
  7. Remove exclusive access condition from previous exercise
  8. +
  9. For testing, pass the required command line arguments to the +user-space program.
  10. +
+
+

When the process is added to the queue, it will remain blocked in execution; To +run the queue command open a new console in the virtual machine with Alt+F2 ; +You can return to the previous console with Alt+F1. If you're connected via +SSH to the virtual machine, open a new console.

+
+

Note

+

Read the ioctl and Waiting queues sections in the lab.

+
+
+
+

O_NONBLOCK implementation

+
+

Note

+

If a file is opened with the O_NONBLOCK flag, then its +operations will be non-blocking.

+

In case data is not available when performing a read, the following +happens:

+
+
    +
  • if the file has been opened with O_NONBLOCK, the read call +will return -EWOULDBLOCK.
  • +
  • otherwise, the current task (process) will be placed in a waiting +queue and will be unblocked as soon as data becomes available +(in our case, at write).
  • +
+
+
+
    +
  • To allow unblocking the read operation, remove the exclusive access +condition from previous exercises.
  • +
  • You can use the queue defined for the previous exercise.
  • +
  • You can ignore the file offset.
  • +
  • Modify the initial size of data to 0, to allow testing.
  • +
  • For testing, pass the required command line arguments to the +user-space program.
      +
    • when using the n option, the test program will change the open flags +to O_NONBLOCK and then perform a read.
    • +
    +
  • +
  • What are the flags used to open the file when running cat /dev/so2_cdev?
  • +
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab4-interrupts.html b/refs/pull/405/merge/so2/lab4-interrupts.html new file mode 100644 index 00000000..ae95c548 --- /dev/null +++ b/refs/pull/405/merge/so2/lab4-interrupts.html @@ -0,0 +1,1291 @@ + + + + + + SO2 Lab 04 - I/O access and Interrupts — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 04 - I/O access and Interrupts

+
+

Lab objectives

+
    +
  • communication with peripheral devices
  • +
  • implement interrupt handlers
  • +
  • synchronizing interrupts with process context
  • +
+

Keywords: IRQ, I/O port, I/O address, base address, UART, request_region, release_region, inb, outb

+
+
+

Background information

+

A peripheral device is controlled by writing and reading its +registers. Often, a device has multiple registers that can be accessed +at consecutive addresses either in the memory address space or in the +I/O address space. Each device connected to the I/O bus has a set of +I/O addresses, called I/O ports. I/O ports can be mapped to physical +memory addresses so that the processor can communicate with the device +through instructions that work directly with the memory. For +simplicity, we will directly use I/O ports (without mapping to physical +memory addresses) to communicate with physical devices.

+

The I/O ports of each device are structured into a set of specialized +registers to provide a uniform programming interface. Thus, most +devices will have the following types of registers:

+
    +
  • Control registers that receive device commands
  • +
  • Status registers, which contain information about the device's +internal status
  • +
  • Input registers from which data is taken from the device
  • +
  • Output registers in which the data is written to transmit it to the +device
  • +
+

Physical ports are differentiated by the number of bits: they can be +8, 16 or 32-bit ports.

+

For example, the parallel port has 8 8-bit I/O ports starting at base +address 0x378. The data register is found at base address (0x378), status +register at base + 1 (0x379), and control at base address + 2 +(0x37a). The data register is both an input and an output register.

+

Although there are devices that can be fully controlled using I/O +ports or special memory areas, there are situations where this is +insufficient. The main problem that needs to be addressed is that +certain events occur at undefined moments in time and it is +inefficient for the processor (CPU) to interrogate the status of the +device repeatedly (polling). The way to solve this problem is using an +Interrupt ReQuest (IRQ) which is a hardware notification by which the +processor is notified that a particular external event happened.

+

For IRQs to be useful device drivers must implement handlers, i.e. a +particular sequence of code that handles the interrupt. Because in +many situations the number of interrupts available is limited, a +device driver must behave in an orderly fashion with interruptions: +interrupts must be requested before being used and released when they +are no longer needed. In addition, in some situations, device drivers +must share an interrupt or synchronize with interrupts. All of these will be +discussed further.

+

When we need to access shared resources between an interrupt +routine (A) and code running in process context or in bottom-half +context (B), we must use a special synchronization technique. In (A) +we need to use a spinlock primitive, and in (B) we must disable +interrupts AND use a spinlock primitive. Disabling interrupts is not +enough because the interrupt routine can run on a processor other than +the one running (B).

+

Using only a spinlock can lead to a deadlock. The classic example of +deadlock in this case is:

+
    +
  1. We run a process on the X processor, and we acquire the lock
  2. +
  3. Before releasing the lock, an interrupt is generated on the X processor
  4. +
  5. The interrupt handling routine will try to acquire the lock and it +will go into an infinite loop
  6. +
+
+
+

Accessing the hardware

+

In Linux, the I/O ports access is implemented on all architectures and +there are several APIs that can be used.

+
+

Request access to I/O ports

+

Before accessing I/O ports we first must request access to them, to +make sure there is only one user. In order to do so, one must use the +request_region() function:

+
#include <linux/ioport.h>
+
+struct resource *request_region(unsigned long first, unsigned long n,
+                                const char *name);
+
+
+

To release a reserved region one must use the release_region() function:

+
void release_region(unsigned long start, unsigned long n);
+
+
+

For example, the serial port COM1 has the base address 0x3F8 and it +has 8 ports and this is a code snippet of how to request access to +these ports:

+
#include <linux/ioport.h>
+
+#define MY_BASEPORT 0x3F8
+#define MY_NR_PORTS 8
+
+if (!request_region(MY_BASEPORT, MY_NR_PORTS, "com1")) {
+     /* handle error */
+     return -ENODEV;
+}
+
+
+

To release the ports one would use something like:

+
release_region(MY_BASEPORT, MY_NR_PORTS);
+
+
+

Most of the time, port requests are done at the driver initialization +or probe time and the port releasing is done at the removal of the +device or module.

+

All of the port requests can be seen from userspace via the +/proc/ioports file:

+
$ cat /proc/ioports
+0000-001f : dma1
+0020-0021 : pic1
+0040-005f : timer
+0060-006f : keyboard
+0070-0077 : rtc
+0080-008f : dma page reg
+00a0-00a1 : pic2
+00c0-00df : dma2
+00f0-00ff : fpu
+0170-0177 : ide1
+01f0-01f7 : ide0
+0376-0376 : ide1
+0378-037a : parport0
+037b-037f : parport0
+03c0-03df : vga+
+03f6-03f6 : ide0
+03f8-03ff : serial
+...
+
+
+
+
+

Accessing I/O ports

+

After a driver has obtained the desired I/O port range, one can +perform read or write operations on these ports. Since physical ports +are differentiated by the number of bits (8, 16, or 32 bits), there +are different port access functions depending on their size. The +following port access functions are defined in asm/io.h:

+
    +
  • unsigned inb(int port), reads one byte (8 bits) from port
  • +
  • void outb(unsigned char byte, int port), writes one byte (8 bits) to port
  • +
  • unsigned inw(int port), reads two bytes (16 bits) from port
  • +
  • void outw(unsigned short word, int port), writes two bytes (16-bits) to port
  • +
  • unsigned inl (int port), reads four bytes (32-bits) from port
  • +
  • void outl(unsigned long word, int port), writes four bytes (32-bits) to port
  • +
+

The port argument specifies the address of the port where the reads or +writes are done, and its type is platform dependent (may be unsigned +long or unsigned short).

+

Some devices may have problems when the processor is trying to +transfer data too fast to and from the device. To avoid this issue we +may need to insert a delay after an I/O operation and there are functions +you can use that introduce this delay. Their names are similar to +those described above, with the exception that they end in _p: inb_p, +outb_p, etc.

+

For example, the following sequence writes a byte on COM1 serial port +and then reads it:

+
#include <asm/io.h>
+#define MY_BASEPORT 0x3F8
+
+unsigned char value = 0xFF;
+outb(value, MY_BASEPORT);
+value = inb(MY_BASEPORT);
+
+
+
+
+

5. Accessing I/O ports from userspace

+

Although the functions described above are defined for device drivers, +they can also be used in user space by including the <sys/io.h> +header. In order to be used, ioperm or iopl must first be called to +get permission to perform port operations. The ioperm function obtains +permission for individual ports, while iopl for the entire I/O address +space. To use these features, the user must be root.

+

The following sequence used in user space gets permission for the +first 3 ports of the serial port, and then releases them:

+
#include <sys/io.h>
+#define MY_BASEPORT 0x3F8
+
+if (ioperm(MY_BASEPORT, 3, 1)) {
+     /* handle error */
+}
+
+if (ioperm(MY_BASEPORT, 3, 0)) {
+     /* handle error */
+}
+
+
+

The third parameter of the ioperm function is used to request or +release port permission: 1 to get permission and 0 to release.

+
+
+
+

Interrupt handling

+
+

Requesting an interrupt

+

As with other resources, a driver must gain access to an interrupt +line before it can use it and release it at the end of the execution.

+

In Linux, the request to obtain and release an interrupt is done using +the request_irq() and free_irq() functions:

+
#include <linux/interrupt.h>
+
+typedef irqreturn_t (*irq_handler_t)(int, void *);
+
+int request_irq(unsigned int irq_no, irq_handler_t handler,
+                unsigned long flags, const char *dev_name, void *dev_id);
+
+void free_irq(unsigned int irq_no, void *dev_id);
+
+
+

Note that to get an interrupt, the developer calls +request_irq(). When calling this function you must specify the +interrupt number (irq_no), a handler that will be called when the +interrupt is generated (handler), flags that will instruct the +kernel about the desired behaviour (flags), the name of the device +using this interrupt (dev_name), and a pointer that can be +configured by the user at any value, and that has no global +significance (dev_id). Most of the time, dev_id will be +a pointer to the device driver's private data. When the interrupt is +released, using the free_irq() function, the developer must +send the same pointer value (dev_id) along with the same interrupt +number (irq_no). The device name (dev_name) is used to display +statistics in /proc/interrupts.

+

The value that request_irq() returns is 0 if the request was +successful or a negative error code indicating the reason for the +failure. A typical value is -EBUSY which means that the interrupt +was already requested by another device driver.

+

The handler function is executed in interrupt context which means +that we can't call blocking APIs such as mutex_lock() or +msleep(). We must also avoid doing a lot of work in the +interrupt handler and instead use deferred work if needed. The actions +performed in the interrupt handler include reading the device +registers to get the status of the device and acknowledge the +interrupt, operations that most of the time can be performed with +non-blocking calls.

+

There are situations where although a device uses interrupts we can't +read the device's registers in a non-blocking mode (for example a +sensor connected to an I2C or SPI bus whose driver does not guarantee +that bus read / write operations are non-blocking). In this +situation, in the interrupt handler, we must schedule an action that runs in process context +(work queue, kernel thread) to access the device's registers. Because +such a situation is relatively common, the kernel provides the +request_threaded_irq() function to write interrupt handling +routines running in two phases: a process-phase and an interrupt +context phase:

+
#include <linux/interrupt.h>
+
+int request_threaded_irq(unsigned int irq, irq_handler_t handler,
+                         irq_handler_t thread_fn,
+                         unsigned long flags, const char *name, void *dev);
+
+
+

handler is the function running in interrupt context, and will +implement critical operations while the thread_fn function runs in +process context and implements the rest of the operations.

+

The flags that can be passed when requesting an interrupt are:

+
    +
  • IRQF_SHARED informs the kernel that the interrupt can be +shared with other devices. If this flag is not set, then if there is +already a handler associated with the requested interrupt, the +request for interrupt will fail. A shared interrupt is handled in a +special way by the kernel: all the associated interrupt handlers +will be executed until the device that generated the interrupt is +identified. But how can a device driver know if the interrupt +handling routine was activated by an interrupt generated by the +device it manages? Virtually all devices that offer interrupt +support have a status register that can be interrogated in the +handling routine to see if the interrupt was or was not generated by +the device (for example, in the case of the 8250 serial port, this +status register is IIR - Interrupt Identification Register). When +requesting a shared interrupt, the dev_id argument must be unique +and it must not be NULL. Usually it is set to the module's private +data.
  • +
  • IRQF_ONESHOT interrupt will be reactivated after running the process +context routine; Without this flag, the interrupt will be +reactivated after running the handler routine in the context of +the interrupt
  • +
+

Requesting the interrupt can be done either at the initialization of +the driver (init_module()), when the device is probed, or when +the device is used (e.g. during open).

+

The following example performs the interrupt request for the COM1 +serial port:

+
#include <linux/interrupt.h>
+
+#define MY_BASEPORT 0x3F8
+#define MY_IRQ 4
+
+static int my_init(void)
+{
+     [...]
+     struct my_device_data *my_data;
+     int err;
+
+     err = request_irq(MY_IRQ, my_handler, IRQF_SHARED,
+                       "com1", my_data);
+     if (err < 0) {
+         /* handle error*/
+         return err;
+     }
+     [...]
+}
+
+
+

As you can see, the IRQ for serial port COM1 is 4, which is used in +shared mode (IRQF_SHARED).

+
+

Attention

+

When requesting a shared interrupt (IRQF_SHARED) the +dev_id argument can not be NULL.

+
+

To release the interrupt associated with the serial port, the +following operations will be executed:

+
free_irq (MY_IRQ, my_data);
+
+
+

During the initialization function (init_module()), or in the +function that opens the device, interrupts must be activated for the +device. This operation is dependent on the device, but most often +involves setting a bit from the control register.

+

As an example, for the 8250 serial port, the following operations must +be performed to enable interrupts:

+
#include <asm/io.h>
+#define MY_BASEPORT 0x3F8
+
+outb(0x08, MY_BASEPORT+4);
+outb(0x01, MY_BASEPORT+1);
+
+
+

In the above example, two operations are performed:

+
    +
  1. All interruptions are activated by setting bit 3 (Aux Output 2) in +the MCR register - Modem Control Register
  2. +
  3. The RDAI (Received Data Available Interrupt) is activated +by setting the appropriate bit in the IER - Interrupt Enable +Register.
  4. +
+
+
+

Implementing an interrupt handler

+

Let's take a look at the signature of the interrupt handler function:

+
irqreturn_t (*handler)(int irq_no, void *dev_id);
+
+
+

The function receives as parameters the number of the interrupt +(irq_no) and the pointer sent to request_irq() when the +interrupt was requested. The interrupt handling routine must return a +value of type irqreturn_t. For the current kernel +version, there are three valid values: IRQ_NONE, IRQ_HANDLED, +and IRQ_WAKE_THREAD. The device driver must return IRQ_NONE if +it notices that the interrupt has not been generated by the device it +is in charge of. Otherwise, the device driver must return IRQ_HANDLED +if the interrupt can be handled directly from the interrupt context or +IRQ_WAKE_THREAD to schedule the running of the process context +processing function.

+

The skeleton for an interrupt handler is:

+
irqreturn_t my_handler(int irq_no, void *dev_id)
+{
+    struct my_device_data *my_data = (struct my_device_data *) dev_id;
+
+    /* if interrupt is not for this device (shared interrupts) */
+        /* return IRQ_NONE;*/
+
+    /* clear interrupt-pending bit */
+    /* read from device or write to device*/
+
+    return IRQ_HANDLED;
+}
+
+
+

Typically, the first thing executed in the interrupt handler is to +determine whether the interrupt was generated by the device that the +driver manages. This usually involves reading information from the device's +registers that indicates whether the device has generated an +interrupt. The second thing is to reset the interrupt pending bit on +the physical device as most devices will no longer generate +interrupts until this bit has been reset (e.g. for the 8250 +serial port bit 0 in the IIR register must be cleared).

+
+
+

Locking

+

Because the interrupt handlers run in interrupt context the actions +that can be performed are limited: unable to access user space memory, +can't call blocking functions. Also, synchronization using spinlocks is +tricky and can lead to deadlocks if the spinlock used is already +acquired by a process that has been interrupted by the running +handler.

+

However, there are cases where device drivers have to synchronize +using interrupts, such as when data is shared between the interrupt +handler and process context or bottom-half handlers. In these +situations it is necessary to both deactivate the interrupt and use +spinlocks.

+

There are two ways to disable interrupts: disabling all interrupts, at +the processor level, or disabling a particular interrupt at the device +or interrupt controller level. Processor disabling is faster and is +therefore preferred. For this purpose, there are locking functions +that disable and enable interrupts while acquiring and releasing a spinlock at +the same time: spin_lock_irqsave(), +spin_unlock_irqrestore(), spin_lock_irq(), and +spin_unlock_irq():

+
#include <linux/spinlock.h>
+
+void spin_lock_irqsave (spinlock_t * lock, unsigned long flags);
+void spin_unlock_irqrestore (spinlock_t * lock, unsigned long flags);
+
+void spin_lock_irq (spinlock_t * lock);
+void spin_unlock_irq (spinlock_t * lock);
+
+
+

The spin_lock_irqsave() function disables interrupts for the +local processor before it obtains the spinlock; The previous state of +the interrupts is saved in flags.

+

If you are absolutely sure that the interrupts on the current +processor have not already been disabled by someone else and you are +sure you can activate the interrupts when you release the spinlock, +you can use spin_lock_irq().

+

For read / write spinlocks there are similar functions available:

+
    +
  • read_lock_irqsave()
  • +
  • read_unlock_irqrestore()
  • +
  • read_lock_irq()
  • +
  • read_unlock_irq()
  • +
  • write_lock_irqsave()
  • +
  • write_unlock_irqrestore()
  • +
  • write_lock_irq()
  • +
  • write_unlock_irq()
  • +
+

If we want to disable interrupts at the interrupt controller level +(not recommended because disabling a particular interrupt is slower, +and we can not disable shared interrupts) we can do this with +disable_irq(), disable_irq_nosync(), and +enable_irq(). Using these functions will disable the interrupts on +all processors. Calls can be nested: if disable_irq is called twice, +it will require as many calls to enable_irq to enable it. The difference +between disable_irq and disable_irq_nosync is that the first one will +wait for the executing handlers to finish. Because of this, +disable_irq_nosync() is generally faster, but may lead to +races with the interrupt handler, so when not sure use +disable_irq().

+

The following sequence disables and then enables the interrupt for +the COM1 serial port:

+
#define MY_IRQ 4
+
+disable_irq (MY_IRQ);
+enable_irq (MY_IRQ);
+
+
+

It is also possible to disable interrupts at the device level. This +approach is also slower than disabling interrupts at the processor +level, but it works with shared interrupts. The way to accomplish this +is device specific and it usually means we have to clear a bit from +one of the control registers.

+

It is also possible to disable all interrupts for the current +processor independent of taking locks. Disabling all interruptions by +device drivers for synchronization purposes is inappropriate because +races are still possible if the interrupt is handled on another +CPU. For reference, the functions that disable / enable interrupts on +the local processor are local_irq_disable() and +local_irq_enable().

+

In order to use a resource shared between process context and the +interrupt handling routine, the functions described above will be used +as follows:

+
static spinlock_t lock;
+
+/* IRQ handling routine: interrupt context */
+irqreturn_t kbd_interrupt_handle(int irq_no, void * dev_id)
+{
+    ...
+    spin_lock(&lock);
+    /* Critical region - access shared resource */
+    spin_unlock (&lock);
+    ...
+}
+
+/* Process context: Disable interrupts when locking */
+static void my_access(void)
+{
+    unsigned long flags;
+
+    spin_lock_irqsave(&lock, flags);
+    /* Critical region - access shared resource */
+    spin_unlock_irqrestore(&lock, flags);
+
+    ...
+}
+
+void my_init (void)
+{
+    ...
+    spin_lock_init (&lock);
+    ...
+}
+
+
+

The my_access function above runs in process context. To +synchronize access to the shared data, we disable the interrupts and +use the spinlock lock, i.e. the spin_lock_irqsave() and +spin_unlock_irqrestore() functions.

+

In the interrupt handling routine, we use the spin_lock() and +spin_unlock() functions to access the shared resource.

+
+

Note

+

The flags argument for spin_lock_irqsave() and +spin_unlock_irqrestore() is a value and not a pointer but keep +in mind that spin_lock_irqsave() function changes the value of +the flag, since this is actually a macro.

+
+
+
+

Interrupt statistics

+

Information and statistics about system interrupts can be found in +/proc/interrupts or /proc/stat. Only system interrupts with +associated interrupt handlers appear in /proc/interrupts:

+
# cat /proc/interrupts
+                CPU0
+0:           7514294       IO-APIC-edge   timer
+1:              4528       IO-APIC-edge   i8042
+6:                 2       IO-APIC-edge   floppy
+8:                 1       IO-APIC-edge   rtc
+9:                 0       IO-APIC-level  acpi
+12:             2301       IO-APIC-edge   i8042
+15:               41       IO-APIC-edge   ide1
+16:             3230       IO-APIC-level  ioc0
+17:             1016       IO-APIC-level  vmxnet ether
+NMI:               0
+LOC:         7229438
+ERR:               0
+MIS:               0
+
+
+

The first column specifies the IRQ associated with the interrupt. The +following column shows the number of interrupts that were generated +for each processor in the system; The last two columns provide +information about the interrupt controller and the device name that +registered the handler for that interrupt.

+

The /proc/stat file provides information about system activity, +including the number of interrupts generated since the last (re)boot +of the system:

+
# cat /proc/stat | grep in
+intr 7765626 7754228 4620 0 0 0 0 2 0 1 0 0 0 2377 0 0 41 3259 1098 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+
+
+

Each line in the /proc/stat file begins with a keyword that +specifies the meaning of the information on the line. For information +on interrupts, this keyword is intr. The first number on the line +represents the total number of interrupts, and the other numbers +represent the number of interrupts for each IRQ, starting at 0. The +counter includes the number of interrupts for all processors in the +system.

+
+
+
+

Further reading

+ + +
+

Keyboard controller

+
    +
  • Intel 8042
  • +
  • drivers/input/serio/i8042.c
  • +
  • drivers/input/keyboard/atkbd.c
  • +
+
+ +
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is interrupts. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/interrupts/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using LXR, find the definitions of the following symbols in the Linux kernel:

+
    +
  • struct resource
  • +
  • request_region() and __request_region()
  • +
  • request_irq() and request_threaded_irq()
  • +
  • inb() for the x86 architecture.
  • +
+

Analyze the following Linux code:

+
    +
  • Keyboard initialization function i8042_setup_kbd()
  • +
  • The AT or PS/2 keyboard interrupt function atkbd_interrupt()
  • +
+
+
+

Keyboard driver

+

The next exercise's objective is to create a driver that uses the +keyboard IRQ, inspects the incoming key codes and stores them in a +buffer. The buffer will be accessible from userspace via a character +device driver.

+
+
+

1. Request the I/O ports

+

To start with, we aim to allocate memory in the I/O space for hardware +devices. We will see that we cannot allocate space for the keyboard +because the designated region is already allocated. Then we will allocate +I/O space for unused ports.

+

The kbd.c file contains a skeleton for the keyboard driver. Browse +the source code and inspect kbd_init(). Notice that the I/O +ports we need are I8042_STATUS_REG and I8042_DATA_REG.

+

Follow the sections marked with TODO 1 in the skeleton. Request the I/O +ports in kbd_init() and make sure to check for errors and to properly +clean up in case of errors. When requesting, set the reserving caller's ID +string (name) with the MODULE_NAME macro. Also, add code to release the I/O +ports in kbd_exit().

+
+

Note

+

You can review the Request access to I/O ports section before +proceeding.

+
+

Now build the module and copy it to the VM image:

+
tools/labs $ make build
+tools/labs $ make copy
+
+
+

Now start the VM and insert the module:

+
root@qemux86:~# insmod skels/interrupts/kbd.ko
+kbd: loading out-of-tree module taints kernel.
+insmod: can't insert 'skels/interrupts/kbd.ko': Device or resource busy
+
+
+

Notice that you get an error when trying to request the I/O +ports. This is because we already have a driver that has requested the +I/O ports. To validate check the /proc/ioports file for the +STATUS_REG and DATA_REG values:

+
root@qemux86:~# cat /proc/ioports | egrep "(0060|0064)"
+0060-0060 : keyboard
+0064-0064 : keyboard
+
+
+

Let's find out which driver registers these ports and try to remove the +module associated with it.

+
$ find -name \*.c | xargs grep \"keyboard\"
+
+find -name \*.c | xargs grep \"keyboard\" | egrep '(0x60|0x64)'
+...
+./arch/x86/kernel/setup.c:{ .name = "keyboard", .start = 0x60, .end = 0x60,
+./arch/x86/kernel/setup.c:{ .name = "keyboard", .start = 0x64, .end = 0x64
+
+
+

It looks like the I/O ports are registered by the kernel during the +boot, and we won't be able to remove the associated module. Instead, +let's trick the kernel and register ports 0x61 and 0x65.

+

Use the function request_region() (inside the kbd_init() +function) to allocate the ports and the function release_region() +(inside the kbd_exit() function) to release the allocated memory.

+

This time we can load the module and /proc/ioports shows that the +owner of these ports is our module:

+
root@qemux86:~# insmod skels/interrupts/kbd.ko
+kbd: loading out-of-tree module taints kernel.
+Driver kbd loaded
+root@qemux86:~# cat /proc/ioports | grep kbd
+0061-0061 : kbd
+0065-0065 : kbd
+
+
+

Let's remove the module and check that the I/O ports are released:

+
root@qemux86:~# rmmod kbd
+Driver kbd unloaded
+root@qemux86:~# cat /proc/ioports | grep kbd
+root@qemux86:~#
+
+
+
+
+

2. Interrupt handling routine

+

For this task we will implement and register an interrupt handler for +the keyboard interrupt. You can review the Requesting an interrupt +section before proceeding.

+

Follow the sections marked with TODO 2 in the skeleton.

+

First, define an empty interrupt handling routine named +kbd_interrupt_handler().

+
+

Note

+

Since we already have a driver that uses this interrupt we +should report the interrupt as not handled (i.e. return +IRQ_NONE) so that the original driver still has a +chance to process it.

+
+

Then register the interrupt handler routine using +request_irq. The interrupt number is defined by the +I8042_KBD_IRQ macro. The interrupt handling routine must be +requested with IRQF_SHARED to share the interrupt line with +the keyboard driver (i8042).

+
+

Note

+

For shared interrupts, dev_id can not be NULL. Use +&devs[0], that is a pointer to struct kbd. This +structure contains all the information needed for device +management. To see the interrupt in /proc/interrupts, do +not use NULL for dev_name. You can use the MODULE_NAME +macro.

+

If requesting the interrupt fails, make sure to properly +clean up by jumping to the right label, in this case the one +that releases the I/O ports and continues with unregistering +the character device driver.

+
+

Compile, copy and load module in the kernel. Check that the interrupt +line has been registered by looking at /proc/interrupts . Determine +the IRQ number from the source code (see I8042_KBD_IRQ) and verify +that there are two drivers registered at this interrupt line (which +means that we have a shared interrupt line): the i8042 initial driver +and our driver.

+
+

Note

+

More details about the format of the /proc/interrupts can +be found in the Interrupt statistics section.

+
+

Print a message inside the routine to make sure it is called. Compile +and reload the module into the kernel. Check that the interrupt handling +routine is called when you press the keyboard on the virtual machine, +using dmesg. Also note that when you use the serial port no +keyboard interrupt is generated.

+
+

Attention

+

To get access to the keyboard on the virtual machine +boot with "QEMU_DISPLAY=gtk make boot".

+
+
+
+

3. Store ASCII keys to buffer

+

Next, we want to collect the keystrokes in a buffer whose content we +will then send to the user space. For this routine we will add the +following in the interrupt handling:

+
    +
  • capture the pressed keys (only pressed, ignore released)
  • +
  • identify the ASCII characters.
  • +
  • copy the ASCII characters corresponding to the keystrokes and store +them in the buffer of the device
  • +
+

Follow the sections marked TODO 3 in the skeleton.

+
+

Reading the data register

+

First, fill in the i8042_read_data() function to read the +I8042_DATA_REG of the keyboard controller. The function +just needs to return the value of the register. The value of the +register is also called a scancode, which is what is generated at each +keystroke.

+
+

Hint

+

Read the I8042_DATA_REG register using inb() and +store the value in the local variable val. +Revisit the Accessing I/O ports section.

+
+

Call the i8042_read_data() in the +kbd_interrupt_handler() and print the value read.

+

Print information about the keystrokes in the following format:

+
pr_info("IRQ:% d, scancode = 0x%x (%u,%c)\n",
+   irq_no, scancode, scancode, scancode);
+
+
+

Where scancode is the value of the read register using the +i8042_read_data() function.

+

Notice that the scancode (reading of the read register) is not an ASCII +character of the pressed key. We'll have to understand the scancode.

+
+
+

Interpreting the scancode

+

Note that the register value is a scancode, not the ASCII value of the +character pressed. Also note that an interrupt is sent both when the +key is pressed and when the key is released. We only need to select +the code when the key is pressed and then decode the ASCII +character.

+
+

Note

+

To check scancode, we can use the showkey command (showkey +-s).

+

In this form, the command will display the key scancodes for +10 seconds after the last pressed key and then it will +stop. If you press and release a key you will get two +scancodes: one for the pressed key and one for the released +key. E.g.:

+
    +
  • If you press the ENTER key, you will get the 0x1c (key pressed) +and 0x9c (for the released key)

    +
  • +
  • If you press the key a you will get the 0x1e (key pressed) +and 0x9e (for the key release)

    +
  • +
  • If you press b you will get 0x30 (key pressed) and 0xb0 +(for the release key)

    +
  • +
  • If you press the c key, you will get the 0x2e (key +pressed) and 0xae (for the released key)

    +
  • +
  • If you press the Shift key you will get the 0x2a (key +pressed) and 0xaa (for the released key)

    +
  • +
  • If you press the Ctrl key you will get the 0x1d (key +pressed) and 0x9d (for the release key)

    +

    As also indicated in this article, a key +release scancode is 128 (0x80) higher than a key press +scancode. This is how we can distinguish between a key +press scancode and a key release scancode.

    +

    A scancode is translated into a keycode that matches a +key. A key press scancode and a key release scancode +have the same keycode. For the keys shown above we have +the following table:

    + ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    KeyKey Press ScancodeKey Release ScancodeKeycode
    ENTER0x1c0x9c0x1c (28)
    a0x1e0x9e0x1e (30)
    b0x300xb00x30 (48)
    c0x2e0xae0x2e (46)
    Shift0x2a0xaa0x2a (42)
    Ctrl0x1d0x9d0x1d (29)
    +

    The press / release check is performed in the is_key_press() +function and obtaining the ASCII character of a scancode +takes place in the get_ascii() function.

    +
  • +
+
+

In the interrupt handler check the scancode to see if the key is +pressed or released then determine the corresponding ASCII +character.

+
+

Hint

+

To check for press / release, use is_key_press(). +Use get_ascii() function to get the corresponding +ASCII code. Both functions expect the scancode.

+
+
+

Hint

+

To display the received information use the following +format.

+
pr_info("IRQ %d: scancode=0x%x (%u) pressed=%d ch=%c\n",
+        irq_no, scancode, scancode, pressed, ch);
+
+
+

Where scancode is the value of the data register, and ch is +the value returned by the get_ascii() function.

+
+
+
+

Store characters to the buffer

+

We want to collect the pressed characters (not the other keys) into +a circular buffer that can be consumed from user space.

+

Update the interrupt handler to add a pressed ASCII character to the +end of the device buffer. If the buffer is full, the character will be +discarded.

+
+

Hint

+

The device buffer is the field buf in the device's +struct kbd. To get the device data from the interrupt handler +use the following construct:

+
struct kbd *data = (struct kbd *) dev_id;
+
+
+

The buffer's dimension is located in struct kbd's field, +count. The put_idx and get_idx fields +specify the next writing and reading index. Take a look at the +put_char() function's implementation to observe how the data is +added to the circular buffer.

+
+
+

Attention

+

Synchronize the access to the buffer and the helper +indexes with a spinlock. +Define the spinlock in the device struct struct kbd +and initialize it in kbd_init().

+

Use the spin_lock() and spin_unlock() functions +to protect the buffer in the interrupt handler.

+

Revisit the Locking section.

+
+
+
+
+

4. Reading the buffer

+

In order to have access to the keylogger's data, we have to send it to +the user space. We will do this using the /dev/kbd character device. When +reading from this device, we will get the data from the buffer in the kernel +space, where we collected the keys pressed.

+

For this step +follow the sections marked with TODO 4 in the kbd_read() function.

+

Implement get_char() in a similar way to put_char(). Be careful +when implementing the circular buffer.

+

In the kbd_read() function copy the data from the buffer to the +userspace buffer.

+
+

Hint

+

Use get_char() to read a character from the buffer +and put_user() to store it to the user buffer.

+
+
+

Attention

+

In the read function, use spin_lock_irqsave() and +spin_unlock_irqrestore() for locking.

+

Revisit the Locking section.

+
+
+

Attention

+

We cannot use put_user() or copy_to_user() +while holding the lock, as userspace access is not permitted from +atomic contexts.

+

For more info, read the Access to the address space of the +process section in the +previous lab.

+
+

For testing, you will need to create the /dev/kbd character device +node using mknod before reading from it. The device major and +minor numbers are defined as KBD_MAJOR and KBD_MINOR:

+
mknod /dev/kbd c 42 0
+
+
+

Build, copy and boot the virtual machine and load the module. Test it +using the command:

+
cat /dev/kbd
+
+
+
+
+

5. Reset the buffer

+

Reset the buffer if the device is written to. For this step follow the +sections marked with TODO 5 in the skeleton.

+

Implement reset_buffer() and add the write operation to kbd_fops.

+
+

Attention

+

In the write function, use spin_lock_irqsave() and +spin_unlock_irqrestore() for locking when resetting the +buffer.

+

Revisit the Locking section.

+
+

For testing, you will need to create the /dev/kbd character device +node using mknod before reading from it. The device major and +minor numbers are defined as KBD_MAJOR and KBD_MINOR:

+
mknod /dev/kbd c 42 0
+
+
+

Build, copy and boot the virtual machine and load the module. +Test it using the command:

+
cat /dev/kbd
+
+
+

Press some keys, then run the command echo "clear" > /dev/kbd. +Check the buffer's content again. It should be reset.

+
+
+
+

Extra Exercises

+
+

1. kfifo

+

Implement a keylogger using the +kfifo API.

+
+

Hint

+

Follow the API call examples from the kernel code. +For example, the file bytestream-examples.c.

+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab5-deferred-work.html b/refs/pull/405/merge/so2/lab5-deferred-work.html new file mode 100644 index 00000000..f46d32dc --- /dev/null +++ b/refs/pull/405/merge/so2/lab5-deferred-work.html @@ -0,0 +1,1107 @@ + + + + + + SO2 Lab 05 - Deferred work — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 05 - Deferred work

+
+

Lab objectives

+
    +
  • Understanding deferred work (i.e. code scheduled to be executed at a +later time)
  • +
  • Implementation of common tasks that use deferred work
  • +
  • Understanding the peculiarities of synchronization for deferred work
  • +
+

Keywords: softirq, tasklet, struct tasklet_struct, bottom-half +handlers, jiffies, HZ, timer, struct timer_list, spin_lock_bh, +spin_unlock_bh, workqueue, struct work_struct, kernel thread, events/x

+
+
+

Background information

+

Deferred work is a class of kernel facilities that allows one to +schedule code to be executed at a later time. This scheduled code can +run either in the process context or in interrupt context depending +on the type of deferred work. Deferred work is used to complement the +interrupt handler functionality since interrupts have important +requirements and limitations:

+
    +
  • The execution time of the interrupt handler must be as small as +possible
  • +
  • In interrupt context we can not use blocking calls
  • +
+

Using deferred work we can perform the minimum required work in the +interrupt handler and schedule an asynchronous action from the +interrupt handler to run at a later time and execute the rest of the +operations.

+

Deferred work that runs in interrupt context is also known as +bottom-half, since its purpose is to execute the rest of the actions +from an interrupt handler (top-half).

+

Timers are another type of deferred work that are used to schedule the +execution of future actions after a certain amount of time has passed.

+

Kernel threads are not themselves deferred work, but can be used to +complement the deferred work mechanisms. In general, kernel threads +are used as "workers" to process events whose execution contains +blocking calls.

+

There are three typical operations that are used with all types of +deferred work:

+
    +
  1. Initialization. Each type is described by a structure whose +fields will have to be initialized. The handler to be scheduled is +also set at this time.
  2. +
  3. Scheduling. Schedules the execution of the handler as soon as +possible (or after expiry of a timeout).
  4. +
  5. Masking or Canceling. Disables the execution of the +handler. This action can be either synchronous (which guarantees +that the handler will not run after the completion of canceling) or +asynchronous.
  6. +
+
+

Attention

+

When doing deferred work cleanup, like freeing the +structures associated with the deferred work or +removing the module and thus the handler code from the +kernel, always use the synchronous type of canceling +the deferred work.

+
+

The main types of deferred work are kernel threads and softirqs. Work +queues are implemented on top of kernel threads, while tasklets and +timers are implemented on top of softirqs. Bottom-half handlers were the first +implementation of deferred work in Linux, but in the meantime they were +replaced by softirqs. That is why some of the functions presented +contain bh in their name.

+
+
+

Softirqs

+

softirqs can not be used by device drivers, they are reserved for +various kernel subsystems. Because of this there is a fixed number of +softirqs defined at compile time. For the current kernel version we +have the following types defined:

+
enum {
+    HI_SOFTIRQ = 0,
+    TIMER_SOFTIRQ,
+    NET_TX_SOFTIRQ,
+    NET_RX_SOFTIRQ,
+    BLOCK_SOFTIRQ,
+    IRQ_POLL_SOFTIRQ,
+    TASKLET_SOFTIRQ,
+    SCHED_SOFTIRQ,
+    HRTIMER_SOFTIRQ,
+    RCU_SOFTIRQ,
+    NR_SOFTIRQS
+};
+
+
+

Each type has a specific purpose:

+
    +
  • HI_SOFTIRQ and TASKLET_SOFTIRQ - running tasklets
  • +
  • TIMER_SOFTIRQ - running timers
  • +
  • NET_TX_SOFTIRQ and NET_RX_SOFTIRQ - used by the networking subsystem
  • +
  • BLOCK_SOFTIRQ - used by the IO subsystem
  • +
  • IRQ_POLL_SOFTIRQ - used by the IO subsystem to increase performance when the iopoll handler is invoked
  • +
  • SCHED_SOFTIRQ - load balancing
  • +
  • HRTIMER_SOFTIRQ - implementation of high precision timers
  • +
  • RCU_SOFTIRQ - implementation of RCU type mechanisms [1]
  • +
+ + + + + +
[1] RCU is a mechanism by which destructive operations +(e.g. deleting an element from a chained list) are done in two +steps: (1) removing references to deleted data and (2) freeing +the memory of the element. The second step is done only after +we are sure nobody uses the element anymore. The advantage of +this mechanism is that reading the data can be done without +synchronization. For more information see +Documentation/RCU/rcu.txt.
+

The highest priority is the HI_SOFTIRQ type softirqs, followed in +order by the other softirqs defined. RCU_SOFTIRQ has the lowest +priority.

+

Softirqs run in interrupt context, which means that they can +not call blocking functions. If the softirq handler requires calls to +such functions, work queues can be scheduled to execute these blocking +calls.

+
+

Tasklets

+

A tasklet is a special form of deferred work that runs in interrupt +context, just like softirqs. The main difference between softirqs and tasklets +is that tasklets can be allocated dynamically and thus they can be used +by device drivers. A tasklet is represented by struct +tasklet_struct and, like many other kernel structures, it needs to be +initialized before being used. A pre-initialized tasklet can be defined +as follows:

+
void handler(unsigned long data);
+
+DECLARE_TASKLET(tasklet, handler, data);
+DECLARE_TASKLET_DISABLED(tasklet, handler, data);
+
+
+

If we want to initialize the tasklet manually we can use the following +approach:

+
void handler(unsigned long data);
+
+struct tasklet_struct tasklet;
+
+tasklet_init(&tasklet, handler, data);
+
+
+

The data parameter will be sent to the handler when it is executed.

+

Programming tasklets for running is called scheduling. Tasklets are +running from softirqs. Tasklets scheduling is done with:

+
void tasklet_schedule(struct tasklet_struct *tasklet);
+
+void tasklet_hi_schedule(struct tasklet_struct *tasklet);
+
+
+

When using tasklet_schedule, a TASKLET_SOFTIRQ softirq is +scheduled and all tasklets scheduled are run. For +tasklet_hi_schedule, a HI_SOFTIRQ softirq is scheduled.

+

If a tasklet was scheduled multiple times and it did not run between +schedules, it will run once. Once the tasklet has run, it can be +re-scheduled, and will run again at a later time. Tasklets can be +re-scheduled from their handlers.

+

Tasklets can be masked and the following functions can be used:

+
void tasklet_enable(struct tasklet_struct * tasklet);
+void tasklet_disable(struct tasklet_struct * tasklet);
+
+
+

Remember that since tasklets are running from softirqs, blocking calls +can not be used in the handler function.

+
+
+

Timers

+

A particular type of deferred work, very often used, are timers. They +are defined by struct timer_list. They run in interrupt +context and are implemented on top of softirqs.

+

To be used, a timer must first be initialized by calling timer_setup():

+
#include <linux/sched.h>
+
+void timer_setup(struct timer_list * timer,
+                 void (*function)(struct timer_list *),
+                 unsigned int flags);
+
+
+

The above function initializes the internal fields of the structure +and associates function as the timer handler. Since timers are implemented +on top of softirqs, blocking calls can not be used in the code associated +with the handler function.

+

Scheduling a timer is done with mod_timer():

+
int mod_timer(struct timer_list *timer, unsigned long expires);
+
+
+

Where expires is the time (in the future) to run the handler +function. The function can be used to schedule or reschedule a timer.

+

The time unit is the jiffy. The absolute value of a jiffy +is dependent on the platform and it can be found using the +HZ macro that defines the number of jiffies for 1 second. To +convert between jiffies (jiffies_value) and seconds (seconds_value), +the following formulas are used:

+
jiffies_value = seconds_value * HZ ;
+seconds_value = jiffies_value / HZ ;
+
+
+

The kernel maintains a counter that contains the number of jiffies +since the last boot, which can be accessed via the jiffies +global variable or macro. We can use it to calculate a time in the +future for timers:

+
#include <linux/jiffies.h>
+
+unsigned long current_jiffies, next_jiffies;
+unsigned long seconds = 1;
+
+current_jiffies = jiffies;
+next_jiffies = jiffies + seconds * HZ;
+
+
+

To stop a timer, use del_timer() and del_timer_sync():

+
int del_timer(struct timer_list *timer);
+int del_timer_sync(struct timer_list *timer);
+
+
+

These functions can be called for both a scheduled timer and an +unscheduled timer. del_timer_sync() is used to eliminate the +races that can occur on multiprocessor systems, since at the end of +the call it is guaranteed that the timer processing function does not +run on any processor.

+

A frequent mistake in using timers is that we forget to turn off +timers. For example, before removing a module, we must stop the timers +because if a timer expires after the module is removed, the handler +function will no longer be loaded into the kernel and a kernel oops +will be generated.

+

The usual sequence used to initialize and schedule a one-second +timeout is:

+
#include <linux/sched.h>
+
+void timer_function(struct timer_list *);
+
+struct timer_list timer ;
+unsigned long seconds = 1;
+
+timer_setup(&timer, timer_function, 0);
+mod_timer(&timer, jiffies + seconds * HZ);
+
+
+

And to stop it:

+
del_timer_sync(&timer);
+
+
+
+
+

Locking

+

For synchronization between code running in process context (A) and +code running in softirq context (B) we need to use special locking +primitives. We must use spinlock operations augmented with +deactivation of bottom-half handlers on the current processor in (A), +and in (B) only basic spinlock operations. Using spinlocks makes sure +that we don't have races between multiple CPUs while deactivating the +softirqs makes sure that we don't deadlock if the softirq is scheduled +on the same CPU where we already acquired a spinlock.

+

We can use the local_bh_disable() and +local_bh_enable() to disable and enable softirqs handlers (and +since they run on top of softirqs also timers and tasklets):

+
void local_bh_disable(void);
+void local_bh_enable(void);
+
+
+

Nested calls are allowed, the actual reactivation of the softirqs is +done only when all local_bh_disable() calls have been complemented by +local_bh_enable() calls:

+
/* We assume that softirqs are enabled */
+local_bh_disable();  /* Softirqs are now disabled */
+local_bh_disable();  /* Softirqs remain disabled */
+
+local_bh_enable();  /* Softirqs remain disabled */
+local_bh_enable();  /* Softirqs are now enabled */
+
+
+
+

Attention

+

The above calls will disable the softirqs only on the +local processor and they are usually not safe to use on their own; they must be +complemented with spinlocks.

+
+

Most of the time device drivers will use special versions of spinlocks +calls for synchronization like spin_lock_bh() and +spin_unlock_bh():

+
void spin_lock_bh(spinlock_t *lock);
+void spin_unlock_bh(spinlock_t *lock);
+
+
+
+
+
+

Workqueues

+

Workqueues are used to schedule actions to run in process context. The +base unit with which they work is called work. There are two types of +work:

+
    +
  • struct work_struct - it schedules a task to run at +a later time
  • +
  • struct delayed_work - it schedules a task to run after at +least a given time interval
  • +
+

A delayed work uses a timer to run after the specified time +interval. The calls with this type of work are similar to those for +struct work_struct, but have _delayed in their function +names.

+

Before using them a work item must be initialized. There are two types +of macros that can be used, one that declares and initializes the work +item at the same time and one that only initializes the work item (and +the declaration must be done separately):

+
#include <linux/workqueue.h>
+
+DECLARE_WORK(name , void (*function)(struct work_struct *));
+DECLARE_DELAYED_WORK(name, void(*function)(struct work_struct *));
+
+INIT_WORK(struct work_struct *work, void(*function)(struct work_struct *));
+INIT_DELAYED_WORK(struct delayed_work *work, void(*function)(struct work_struct *));
+
+
+

DECLARE_WORK() and DECLARE_DELAYED_WORK() declare and +initialize a work item, and INIT_WORK() and +INIT_DELAYED_WORK() initialize an already declared work item.

+

The following sequence declares and initializes a work item:

+
#include <linux/workqueue.h>
+
+void my_work_handler(struct work_struct *work);
+
+DECLARE_WORK(my_work, my_work_handler);
+
+
+

Or, if we want to initialize the work item separately:

+
void my_work_handler(struct work_struct * work);
+
+struct work_struct my_work;
+
+INIT_WORK(&my_work, my_work_handler);
+
+
+

Once declared and initialized, we can schedule the task using +schedule_work() and schedule_delayed_work():

+
schedule_work(struct work_struct *work);
+
+schedule_delayed_work(struct delayed_work *work, unsigned long delay);
+
+
+

schedule_delayed_work() can be used to plan a work item for +execution with a given delay. The delay time unit is jiffies.

+

Work items can not be masked but they can be canceled by calling +cancel_delayed_work_sync() or cancel_work_sync():

+
int cancel_work_sync(struct work_struct *work);
+int cancel_delayed_work_sync(struct delayed_work *work);
+
+
+

These calls prevent any subsequent execution of the work item. If the +work item is already running at the time of the call, it is allowed to +finish, and the call waits for it to do so. In any case, when these calls return, it is guaranteed that +the task will no longer run.

+
+

Attention

+

While there are versions of these functions that are +not synchronous (e.g. cancel_work()), do not +use them when you are performing cleanup work, otherwise +race conditions could occur.

+
+

We can wait for a workqueue to complete running all of its work items by calling flush_scheduled_work():

+
void flush_scheduled_work(void);
+
+
+

This function is blocking and, therefore, can not be used in interrupt +context. The function will wait for all work items to be completed. +For delayed work items, cancel_delayed_work must be called +before flush_scheduled_work().

+

Finally, the following functions can be used to schedule work items on +a particular processor (schedule_delayed_work_on()), or on all +processors (schedule_on_each_cpu()):

+
int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay);
+int schedule_on_each_cpu(void(*function)(struct work_struct *));
+
+
+

A usual sequence to initialize and schedule a work item is the following:

+
void my_work_handler(struct work_struct *work);
+
+struct work_struct my_work;
+
+INIT_WORK(&my_work, my_work_handler);
+
+schedule_work(&my_work);
+
+
+

And for waiting for termination of a work item:

+
flush_scheduled_work();
+
+
+

As you can see, the my_work_handler function receives the task as +the parameter. To be able to access the module's private data, you can +use container_of():

+
struct my_device_data {
+    struct work_struct my_work;
+    // ...
+};
+
+void my_work_handler(struct work_struct *work)
+{
+   struct my_device_data * my_data;
+
+   my_data = container_of(work, struct my_device_data,  my_work);
+   // ...
+}
+
+
+

Scheduling work items with the functions above will run the handler in +the context of a kernel thread called events/x, where x is the +processor number. The kernel will initialize a kernel thread (or a +pool of workers) for each processor present in the system:

+
$ ps -e
+PID TTY TIME CMD
+1?  00:00:00 init
+2 ?  00:00:00 ksoftirqd / 0
+3 ?  00:00:00 events / 0 <--- kernel thread that runs work items
+4 ?  00:00:00 khelper
+5 ?  00:00:00 kthread
+7?  00:00:00 kblockd / 0
+8?  00:00:00 kacpid
+
+
+

The above functions use a predefined workqueue (called events), and +they run in the context of the events/x thread, as noted +above. Although this is sufficient in most cases, it is a shared +resource and large delays in work items handlers can cause delays for +other queue users. For this reason there are functions for creating +additional queues.

+

A workqueue is represented by struct workqueue_struct. A new +workqueue can be created with these functions:

+
struct workqueue_struct *create_workqueue(const char *name);
+struct workqueue_struct *create_singlethread_workqueue(const char *name);
+
+
+

create_workqueue() uses one thread for each processor in the +system, and create_singlethread_workqueue() uses a single +thread.

+

To add a task in the new queue, use queue_work() or +queue_delayed_work():

+
int queue_work(struct workqueue_struct * queue, struct work_struct *work);
+
+int queue_delayed_work(struct workqueue_struct *queue,
+                       struct delayed_work * work , unsigned long delay);
+
+
+

queue_delayed_work() can be used to plan a work for execution +with a given delay. The time unit for the delay is jiffies.

+

To wait for all work items to finish call flush_workqueue():

+
void flush_workqueue(struct workqueue_struct * queue);
+
+
+

And to destroy the workqueue call destroy_workqueue():

+
void destroy_workqueue(struct workqueue_struct *queue);
+
+
+

The next sequence declares and initializes an additional workqueue, +declares and initializes a work item and adds it to the queue:

+
void my_work_handler(struct work_struct *work);
+
+struct work_struct my_work;
+struct workqueue_struct * my_workqueue;
+
+my_workqueue = create_singlethread_workqueue("my_workqueue");
+INIT_WORK(&my_work, my_work_handler);
+
+queue_work(my_workqueue, &my_work);
+
+
+

And the next code sample shows how to remove the workqueue:

+
flush_workqueue(my_workqueue);
+destroy_workqueue(my_workqueue);
+
+
+

The work items planned with these functions will run in the context of +a new kernel thread called my_workqueue, the name passed to +create_singlethread_workqueue().

+
+
+

Kernel threads

+

Kernel threads have emerged from the need to run kernel code in +process context. Kernel threads are the basis of the workqueue +mechanism. Essentially, a kernel thread is a thread that only runs in +kernel mode and has no user address space or other user attributes.

+

To create a kernel thread, use kthread_create():

+
#include <linux/kthread.h>
+
+struct task_struct *kthread_create(int (*threadfn)(void *data),
+                                      void *data, const char namefmt[], ...);
+
+
+
    +
  • threadfn is a function that will be run by the kernel thread
  • +
  • data is a parameter to be sent to the function
  • +
  • namefmt represents the kernel thread name, as it is displayed in +ps/top; it can contain sequences such as %d, %s etc., which will be replaced +according to the standard printf syntax.
  • +
+

For example, the following call:

+
kthread_create (f, NULL, "%skthread%d", "my", 0);
+
+
+

Will create a kernel thread with the name mykthread0.

+

The kernel thread created with this function will be stopped (in the +TASK_INTERRUPTIBLE state). To start the kernel thread, call the +wake_up_process():

+
#include <linux/sched.h>
+
+int wake_up_process(struct task_struct *p);
+
+
+

Alternatively, you can use kthread_run() to create and run a +kernel thread:

+
struct task_struct * kthread_run(int (*threadfn)(void *data),
+                                 void *data, const char namefmt[], ...);
+
+
+

Even if the programming restrictions for the function running within +the kernel thread are more relaxed and scheduling is closer to +scheduling in userspace, there are, however, some limitations to be +taken into account. We will list below the actions that can or can not +be made from a kernel thread:

+
    +
  • can't access the user address space (even with copy_from_user, +copy_to_user) because a kernel thread does not have a user address +space
  • +
  • can't implement busy wait code that runs for a long time; if the +kernel is compiled without the preemptive option, that code will run +without being preempted by other kernel threads or user processes +thus hogging the system
  • +
  • can call blocking operations
  • +
  • can use spinlocks, but if the hold time of the lock is significant, +it is recommended to use mutexes
  • +
+

The termination of a kernel thread is done voluntarily, within the +function running in the kernel thread, by calling do_exit():

+
fastcall NORET_TYPE void do_exit(long code);
+
+
+

Most of the implementations of kernel threads handlers use the same +model and it is recommended to start using the same model to avoid +common mistakes:

+
#include <linux/kthread.h>
+
+DECLARE_WAIT_QUEUE_HEAD(wq);
+
+// list events to be processed by kernel thread
+struct list_head events_list;
+spinlock_t events_lock;
+
+
+// structure describing the event to be processed
+struct event {
+    struct list_head lh;
+    bool stop;
+    //...
+};
+
+struct event* get_next_event(void)
+{
+    struct event *e;
+
+    spin_lock(&events_lock);
+    e = list_first_entry_or_null(&events_list, struct event, lh);
+    if (e)
+        list_del(&e->lh);
+    spin_unlock(&events_lock);
+
+    return e;
+}
+
+int my_thread_f(void *data)
+{
+    struct event *e;
+
+    while (true) {
+        wait_event(wq, (e = get_next_event()));
+
+        /* Event processing */
+
+        if (e->stop)
+            break;
+    }
+
+    do_exit(0);
+}
+
+/* create and start kthread */
+kthread_run(my_thread_f, NULL, "%skthread%d", "my", 0);
+
+
+

With the template above, the kernel thread requests can be issued +with:

+
void send_event(struct event *ev)
+{
+    spin_lock(&events_lock);
+    list_add(&ev->lh, &events_list);
+    spin_unlock(&events_lock);
+    wake_up(&wq);
+}
+
+
+
+ +
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is deferred_work. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/deferred_work/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using LXR, find the definitions of the following symbols:

+
    +
  • jiffies
  • +
  • struct timer_list
  • +
  • spin_lock_bh() function
  • +
+
+
+

1. Timer

+

We're looking at creating a simple kernel module that displays a +message TIMER_TIMEOUT seconds after the module is loaded into the kernel.

+

Generate the skeleton for the task named 1-2-timer and follow the +sections marked with TODO 1 to complete the task.

+
+

Hint

+

Use pr_info(...). Messages will be displayed on the +console and can also be viewed using dmesg. When scheduling +the timer we need to use the absolute time of the system (in +the future) in number of ticks. The current time of the +system in the number of ticks is given by jiffies. +Thus, the absolute time we need to pass to the timer is +jiffies + TIMER_TIMEOUT * HZ.

+

For more information review the Timers section.

+
+
+
+

2. Periodic timer

+

Modify the previous module to display the message once every +TIMER_TIMEOUT seconds. Follow the section marked with TODO 2 in the +skeleton.

+
+
+

3. Timer control using ioctl

+

We plan to display information about the current process N +seconds after receiving an ioctl call from user space. N is transmitted as +the ioctl parameter.

+

Generate the skeleton for the task named 3-4-5-deferred and +follow the sections marked with TODO 1 in the skeleton driver.

+

You will need to implement the following ioctl operations.

+
    +
  • MY_IOCTL_TIMER_SET to schedule a timer to run after a number of +seconds which is received as an argument to ioctl. The timer does +not run periodically. +This command directly receives a value, not a pointer.
  • +
  • MY_IOCTL_TIMER_CANCEL to deactivate the timer.
  • +
+
+

Note

+

Review ioctl for a way to access the ioctl argument.

+
+
+

Note

+

Review the Timers section for information on enabling / +disabling a timer. In the timer handler, display the current +process identifier (PID) and the process executable image name.

+
+
+

Hint

+

You can find the current process identifier using the pid +and comm fields of the current process. For details, +review proc-info.

+
+
+

Hint

+

To use the device driver from userspace you must create the +device character file /dev/deferred using the mknod +utility. Alternatively, you can run the +3-4-5-deferred/kernel/makenode script that performs this +operation.

+
+

Enable and disable the timer by calling user-space ioctl +operations. Use the 3-4-5-deferred/user/test program to test +planning and canceling of the timer. The program receives the ioctl +type operation and its parameters (if any) on the command line.

+
+

Hint

+

Run the test executable without arguments to observe the +command line options it accepts.

+

To enable the timer after 3 seconds use:

+
./test s 3
+
+
+

To disable the timer use:

+
./test c
+
+
+
+

Note that every time the timer runs, the current process is +swapper/0 with PID 0. This process is the idle process. It is +running when there is nothing else to run. Because the virtual +machine is very light and does not do much it is natural to see this +process most of the time.

+
+
+

4. Blocking operations

+

Next we want to see what happens when we perform blocking operations +in a timer routine. For this we try to call in the timer-handling +routines a function called alloc_io() that simulates a blocking +operation.

+

Modify the module so that when you receive MY_IOCTL_TIMER_ALLOC +command the timer handler will call alloc_io(). Follow the +sections marked with TODO 2 in the skeleton.

+

Use the same timer. To differentiate functionality in the timer +handler, use a flag in the device structure. Use the +TIMER_TYPE_ALLOC and TIMER_TYPE_SET macros defined in the code +skeleton. For initialization, use TIMER_TYPE_NONE.

+

Run the test program to verify the functionality of task 3. Run the +test program again to call alloc_io().

+
+

Note

+

The driver causes an error because a blocking function is +called in atomic context (the timer handler runs in +interrupt context).

+
+
+
+

5. Workqueues

+

We will modify the module to prevent the error observed in the +previous task.

+

To do so, let's call alloc_io() using workqueues. Schedule a +work item from the timer handler. In the work handler (running in +process context) call alloc_io(). Follow the sections +marked with TODO 3 in the skeleton and review the Workqueues +section if needed.

+
+

Hint

+

Add a new field with the type struct work_struct +in your device structure. Initialize this field. Schedule +the work from the timer handler using schedule_work(). +Schedule the timer handler after N seconds from the ioctl.

+
+
+
+

6. Kernel thread

+

Implement a simple module that creates a kernel thread that shows the +current process identifier.

+

Generate the skeleton for the task named 6-kthread and follow the +TODOs from the skeleton.

+
+

Note

+

There are two options for creating and running a thread:

+
    +
  • kthread_run() to create and run the thread
  • +
  • kthread_create() to create a suspended thread and +then start it running with wake_up_process().
  • +
+

Review the Kernel Threads section if needed.

+
+
+

Attention

+

Synchronize the thread termination with module unloading:

+
    +
  • The thread should finish when the module is unloaded
  • +
  • Wait for the kernel thread to exit before continuing +with unloading
  • +
+
+
+

Hint

+

For synchronization use two wait queues and two flags.

+

Review waiting-queues on how to use waiting queue.

+

Use atomic variables for flags. Review Atomic variables.

+
+
+
+

7. Buffer shared between timer and process

+

The purpose of this task is to exercise the synchronization between a +deferrable action (a timer) and process context. Set up a periodic +timer that monitors a list of processes. If one of the processes +terminates, a message is printed. Processes can be dynamically added to +the list. Use the 3-4-5-deferred/kernel/ skeleton as a base and +follow the TODO 4 markings to complete the task.

+

When the MY_IOCTL_TIMER_MON command is received check that the given +process exists and if so add it to the monitored list of +processes and then arm the timer after setting its type.

+
+

Hint

+

Use get_proc() which checks the pid, finds the +associated struct task_struct and allocates a +struct mon_proc item you can add to your +list. Note that the function also increases the reference +counter of the task, so that its memory won't be freed when +the task terminates.

+
+
+

Attention

+

Use a spinlock to protect the access to the list. Note +that since we share data with the timer handler we need +to disable bottom-half handlers in addition to taking +the lock. Review the Locking section.

+
+
+

Hint

+

Collect the information every second from a timer. Use the +existing timer and add new behaviour for it via the +TIMER_TYPE_ACCT. To set the flag, use the t argument of +the test program.

+
+

In the timer handler iterate over the list of monitored processes and +check if they have terminated. If so, print the process name and pid +then remove the process from the list, decrement the task usage +counter so that its memory can be freed and finally free the +struct mon_proc structure.

+
+

Hint

+

Use the state field of struct task_struct. A +task has terminated if its state is TASK_DEAD.

+
+
+

Hint

+

Use put_task_struct() to decrement the task usage +counter.

+
+
+

Attention

+

Make sure you protect the list access with a +spinlock. The simple variant will suffice.

+
+
+

Attention

+

Make sure to use the safe iteration over the list since +we may need to remove an item from the list.

+
+

Rearm the timer after checking the list.

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab6-memory-mapping.html b/refs/pull/405/merge/so2/lab6-memory-mapping.html new file mode 100644 index 00000000..f4528349 --- /dev/null +++ b/refs/pull/405/merge/so2/lab6-memory-mapping.html @@ -0,0 +1,709 @@ + + + + + + SO2 Lab 06 - Memory Mapping — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 06 - Memory Mapping

+
+

Lab objectives

+
    +
  • Understand address space mapping mechanisms
  • +
  • Learn about the most important structures related to memory management
  • +
+

Keywords:

+
    +
  • address space
  • +
  • mmap()
  • +
  • struct page
  • +
  • struct vm_area_struct
  • +
  • struct vm_struct
  • +
  • remap_pfn_range
  • +
  • SetPageReserved()
  • +
  • ClearPageReserved()
  • +
+
+
+

Overview

+

In the Linux kernel it is possible to map a kernel address space to a +user address space. This eliminates the overhead of copying user space +information into the kernel space and vice versa. This can be done +through a device driver and the user space device interface +(/dev).

+

This feature can be used by implementing the mmap() operation +in the device driver's struct file_operations and using the +mmap() system call in user space.

+

The basic unit for virtual memory management is a page, whose size is +usually 4K, but it can be up to 64K on some platforms. Whenever we +work with virtual memory we work with two types of addresses: virtual +address and physical address. All CPU access (including from kernel +space) uses virtual addresses that are translated by the MMU into +physical addresses with the help of page tables.

+

A physical page of memory is identified by the Page Frame Number +(PFN). The PFN can be easily computed from the physical address by +dividing it by the size of the page (or by shifting the physical +address by PAGE_SHIFT bits to the right).

+../_images/paging1.png +

For efficiency reasons, the virtual address space is divided into +user space and kernel space. For the same reason, the kernel space +contains a memory mapped zone, called lowmem, which is contiguously +mapped in physical memory, starting from the lowest possible physical +address (usually 0). The virtual address where lowmem is mapped is +defined by PAGE_OFFSET.

+

On a 32bit system, not all available memory can be mapped in lowmem and +because of that there is a separate zone in kernel space called +highmem which can be used to arbitrarily map physical memory.

+

Memory allocated by kmalloc() resides in lowmem and it is +physically contiguous. Memory allocated by vmalloc() is not +contiguous and does not reside in lowmem (it has a dedicated zone in +highmem).

+../_images/kernel-virtmem-map1.png +
+
+

Structures used for memory mapping

+

Before discussing the memory mapping mechanism over a device, +we will present some of the basic structures used by the Linux memory +management subsystem. +Some of the basic structures are: struct page, +struct vm_area_struct, struct mm_struct.

+
+

struct page

+

struct page is used to embed information about all physical +pages in the system. The kernel has a struct page structure +for all pages in the system.

+

There are many functions that interact with this structure:

+
    +
  • virt_to_page() returns the page associated with a virtual +address
  • +
  • pfn_to_page() returns the page associated with a page frame +number
  • +
  • page_to_pfn() returns the page frame number associated with a +struct page
  • +
  • page_address() returns the virtual address of a +struct page; this function can be called only for pages from +lowmem
  • +
  • kmap() creates a mapping in kernel for an arbitrary physical +page (can be from highmem) and returns a virtual address that can be +used to directly reference the page
  • +
+
+
+

struct vm_area_struct

+

struct vm_area_struct holds information about a contiguous +virtual memory area. The memory areas of a process can be viewed by +inspecting the maps attribute of the process via procfs:

+
root@qemux86:~# cat /proc/1/maps
+#address          perms offset  device inode     pathname
+08048000-08050000 r-xp 00000000 fe:00 761        /sbin/init.sysvinit
+08050000-08051000 r--p 00007000 fe:00 761        /sbin/init.sysvinit
+08051000-08052000 rw-p 00008000 fe:00 761        /sbin/init.sysvinit
+092e1000-09302000 rw-p 00000000 00:00 0          [heap]
+4480c000-4482e000 r-xp 00000000 fe:00 576        /lib/ld-2.25.so
+4482e000-4482f000 r--p 00021000 fe:00 576        /lib/ld-2.25.so
+4482f000-44830000 rw-p 00022000 fe:00 576        /lib/ld-2.25.so
+44832000-449a9000 r-xp 00000000 fe:00 581        /lib/libc-2.25.so
+449a9000-449ab000 r--p 00176000 fe:00 581        /lib/libc-2.25.so
+449ab000-449ac000 rw-p 00178000 fe:00 581        /lib/libc-2.25.so
+449ac000-449af000 rw-p 00000000 00:00 0
+b7761000-b7763000 rw-p 00000000 00:00 0
+b7763000-b7766000 r--p 00000000 00:00 0          [vvar]
+b7766000-b7767000 r-xp 00000000 00:00 0          [vdso]
+bfa15000-bfa36000 rw-p 00000000 00:00 0          [stack]
+
+
+

A memory area is characterized by a start address, a stop address, +length and permissions.

+

A struct vm_area_struct is created at each mmap() +call issued from user space. A driver that supports the mmap() +operation must complete and initialize the associated +struct vm_area_struct. The most important fields of this +structure are:

+
    +
  • vm_start, vm_end - the beginning and the end of +the memory area, respectively (these fields also appear in +/proc/<pid>/maps);
  • +
  • vm_file - the pointer to the associated file structure (if any);
  • +
  • vm_pgoff - the offset of the area within the file;
  • +
  • vm_flags - a set of flags;
  • +
  • vm_ops - a set of working functions for this area
  • +
  • vm_next, vm_prev - the areas of the same process +are chained by a list structure
  • +
+
+
+

struct mm_struct

+

struct mm_struct encompasses all memory areas associated +with a process. The mm field of struct task_struct +is a pointer to the struct mm_struct of the current process.

+
+
+
+

Device driver memory mapping

+

Memory mapping is one of the most interesting features of a Unix +system. From a driver's point of view, the memory-mapping facility +allows user space programs direct access to device memory.

+

To assign a mmap() operation to a driver, the mmap +field of the device driver's struct file_operations must be +implemented. If that is the case, the user space process can then use +the mmap() system call on a file descriptor associated with +the device.

+

The mmap system call takes the following parameters:

+
void *mmap(caddr_t addr, size_t len, int prot,
+           int flags, int fd, off_t offset);
+
+
+

To map memory between a device and user space, the user process must +open the device and issue the mmap() system call with the resulting +file descriptor.

+

The device driver mmap() operation has the following signature:

+
int (*mmap)(struct file *filp, struct vm_area_struct *vma);
+
+
+

The filp parameter is a pointer to a struct file created when +the device is opened from user space. The vma parameter is used to +indicate the virtual address space where the memory should be mapped +by the device. A driver should allocate memory (using +kmalloc(), vmalloc(), alloc_pages()) and then +map it to the user address space as indicated by the vma parameter +using helper functions such as remap_pfn_range().

+

remap_pfn_range() will map a contiguous physical address space +into the virtual space represented by vm_area_struct:

+
int remap_pfn_range (struct vm_area_struct *vma, unsigned long addr,
+                     unsigned long pfn, unsigned long size, pgprot_t prot);
+
+
+

remap_pfn_range() expects the following parameters:

+
    +
  • vma - the virtual memory space in which mapping is made;
  • +
  • addr - the virtual address space from where remapping begins; page +tables for the virtual address space between addr and addr + size +will be formed as needed
  • +
  • pfn - the page frame number to which the virtual address should be +mapped
  • +
  • size - the size (in bytes) of the memory to be mapped
  • +
  • prot - protection flags for this mapping
  • +
+

Here is an example of using this function that contiguously maps the +physical memory starting at page frame number pfn (memory that was +previously allocated) to the vma->vm_start virtual address:

+
struct vm_area_struct *vma;
+unsigned long len = vma->vm_end - vma->vm_start;
+int ret ;
+
+ret = remap_pfn_range(vma, vma->vm_start, pfn, len, vma->vm_page_prot);
+if (ret < 0) {
+    pr_err("could not map the address area\n");
+    return -EIO;
+}
+
+
+

To obtain the page frame number of the physical memory we must +consider how the memory allocation was performed. For each +kmalloc(), vmalloc(), alloc_pages(), we must +use a different approach. For kmalloc() we can use something +like:

+
static char *kmalloc_area;
+
+unsigned long pfn = virt_to_phys((void *)kmalloc_area)>>PAGE_SHIFT;
+
+
+

while for vmalloc():

+
static char *vmalloc_area;
+
+unsigned long pfn = vmalloc_to_pfn(vmalloc_area);
+
+
+

and finally for alloc_pages():

+
struct page *page;
+
+unsigned long pfn = page_to_pfn(page);
+
+
+
+

Attention

+

Note that memory allocated with vmalloc() is not +physically contiguous so if we want to map a range allocated +with vmalloc(), we have to map each page individually +and compute the physical address for each page.

+
+

Since the pages are mapped to user space, they might be swapped +out. To avoid this we must set the PG_reserved bit on the page. +Enabling is done using SetPageReserved() while resetting it +(which must be done before freeing the memory) is done with +ClearPageReserved():

+
void alloc_mmap_pages(int npages)
+{
+    int i;
+    char *mem = kmalloc(PAGE_SIZE * npages);
+
+    if (!mem)
+        return mem;
+
+    for(i = 0; i < npages * PAGE_SIZE; i += PAGE_SIZE)
+        SetPageReserved(virt_to_page(((unsigned long)mem) + i));
+
+    return mem;
+}
+
+void free_mmap_pages(void *mem, int npages)
+{
+    int i;
+
+    for(i = 0; i < npages * PAGE_SIZE; i += PAGE_SIZE)
+        ClearPageReserved(virt_to_page(((unsigned long)mem) + i));
+
+    kfree(mem);
+}
+
+
+
+ +
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is memory_mapping. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/memory_mapping/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

1. Mapping contiguous physical memory to userspace

+

Implement a device driver that maps contiguous physical memory +(e.g. obtained via kmalloc()) to userspace.

+

Review the Device driver memory mapping section, generate the +skeleton for the task named kmmap and fill in the areas marked +with TODO 1.

+

Start by allocating a memory area of NPAGES+2 pages using kmalloc() +in the module init function and find the first address in the area that is +aligned to a page boundary.

+
+

Hint

+

The size of a page is PAGE_SIZE.

+

Store the allocated area in kmalloc_ptr and the page +aligned address in kmalloc_area:

+

Use PAGE_ALIGN() to determine kmalloc_area.

+
+

Enable the PG_reserved bit of each page with +SetPageReserved(). Clear the bit with +ClearPageReserved() before freeing the memory.

+
+

Hint

+

Use virt_to_page() to translate virtual pages into +physical pages, as required by SetPageReserved() +and ClearPageReserved().

+
+

For verification purposes (using the test below), fill in the first 4 +bytes of each page with the following values: 0xaa, 0xbb, 0xcc, 0xdd.

+

Implement the mmap() driver function.

+
+

Hint

+

For mapping, use remap_pfn_range(). The third +argument for remap_pfn_range() is a page frame number (PFN).

+

To convert from virtual kernel address to physical address, +use virt_to_phys().

+

To convert a physical address to its PFN, shift the address +by PAGE_SHIFT bits to the right.

+
+

For testing, load the kernel module and run:

+
root@qemux86:~# skels/memory_mapping/test/mmap-test 1
+
+
+

If everything goes well, the test will show "matched" messages.

+
+
+

2. Mapping non-contiguous physical memory to userspace

+

Implement a device driver that maps non-contiguous physical memory +(e.g. obtained via vmalloc()) to userspace.

+

Review the Device driver memory mapping section, generate the +skeleton for the task named vmmap and fill in the areas marked +with TODO 1.

+

Allocate a memory area of NPAGES with vmalloc().

+
+

Hint

+

The size of a page is PAGE_SIZE. +Store the allocated area in vmalloc_area. +Memory allocated by vmalloc() is page aligned.

+
+

Enable the PG_reserved bit of each page with +SetPageReserved(). Clear the bit with +ClearPageReserved() before freeing the memory.

+
+

Hint

+

Use vmalloc_to_page() to translate virtual pages +into physical pages used by the functions +SetPageReserved() and ClearPageReserved().

+
+

For verification purposes (using the test below), fill in the first 4 +bytes of each page with the following values: 0xaa, 0xbb, 0xcc, 0xdd.

+

Implement the mmap driver function.

+
+

Hint

+

To convert from virtual vmalloc address to physical address, +use vmalloc_to_pfn() which returns a PFN directly.

+
+
+

Attention

+

vmalloc pages are not physically contiguous so +remap_pfn_range() must be called for each page.

+

Loop through all virtual pages and for each: +* determine the physical address +* map it with remap_pfn_range()

+

Make sure that you determine the physical address +each time and that you use a range of one page for mapping.

+
+

For testing, load the kernel module and run:

+
root@qemux86:~# skels/memory_mapping/test/mmap-test 1
+
+
+

If everything goes well, the test will show "matched" messages.

+
+
+

3. Read / write operations in mapped memory

+

Modify one of the previous modules to allow read / write operations on +your device. This is a didactic exercise to see that the same space +can also be used with the mmap() call and with read() +and write() calls.

+

Fill in areas marked with TODO 2.

+
+

Note

+

The offset parameter sent to the read / write operation can +be ignored as all reads / writes from the test program will +be done with 0 offsets.

+
+

For testing, load the kernel module and run:

+
root@qemux86:~# skels/memory_mapping/test/mmap-test 2
+
+
+
+
+

4. Display memory mapped in procfs

+

Using one of the previous modules, create a procfs file in which you +display the total memory mapped by the calling process.

+

Fill in the areas marked with TODO 3.

+

Create a new entry in procfs (PROC_ENTRY_NAME, defined in +mmap-test.h) that will show the total memory mapped by the process +that called the read() on that file.

+
+

Hint

+

Use proc_create(). For the mode parameter, use 0, +and for the parent parameter use NULL. Use +my_proc_file_ops() for operations.

+
+

In the module exit function, delete the PROC_ENTRY_NAME entry +using remove_proc_entry().

+
+

Note

+

A (complex) use and description of the struct +seq_file interface can be found here in this example .

+

For this exercise, just a simple use of the interface +described here is +sufficient. Check the "extra-simple" API described there.

+
+

In the my_seq_show() function you will need to:

+
    +
  • Obtain the struct mm_struct structure of the current process +using the get_task_mm() function.

    +
    +

    Hint

    +

    The current process is available via the current variable +of type struct task_struct*.

    +
    +
  • +
  • Iterate through the entire struct vm_area_struct list +associated with the process.

    +
    +

    Hint

    +

    Use the variable vma_iterator and start from +mm->mmap. Use the vm_next field of +the struct vm_area_struct to navigate through +the list of memory areas. Stop when you reach NULL.

    +
    +
  • +
  • Use vm_start and vm_end for each area to compute the total size.

    +
  • +
  • Use pr_info("%lx %lx\n", ...) to print vm_start and vm_end for +each area.

    +
  • +
  • To release struct mm_struct, decrement the reference +counter of the structure using mmput().

    +
  • +
  • Use seq_printf() to write to the file. Show only the total count, +no other messages. Do not even show newline (\n).

    +
  • +
+

In my_seq_open() register the display function +(my_seq_show()) using single_open().

+
+

Note

+

single_open() can use NULL as its third argument.

+
+

For testing, load the kernel module and run:

+
root@qemux86:~# skels/memory_mapping/test/mmap-test 3
+
+
+
+

Note

+

The test waits for a while (it has an internal sleep +instruction). As long as the test waits, use the +pmap command in another console to see the +mappings of the test and compare those to the test results.

+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab7-block-device-drivers.html b/refs/pull/405/merge/so2/lab7-block-device-drivers.html new file mode 100644 index 00000000..2a98751a --- /dev/null +++ b/refs/pull/405/merge/so2/lab7-block-device-drivers.html @@ -0,0 +1,1397 @@ + + + + + + SO2 Lab 07 - Block Device Drivers — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 07 - Block Device Drivers

+
+

Lab objectives

+
+
    +
  • acquiring knowledge about the behavior of the I/O subsystem on Linux
  • +
  • hands-on activities in structures and functions of block devices
  • +
  • acquiring basic skills for utilizing the API for block devices, by solving +exercises
  • +
+
+
+
+

Overview

+

Block devices are characterized by random access to data organized in fixed-size +blocks. Examples of such devices are hard drives, CD-ROM drives, RAM disks, etc. +The speed of block devices is generally much higher than the speed of character +devices, and their performance is also important. This is why the Linux kernel +handles these 2 types of devices differently (it uses a specialized API).

+

Working with block devices is therefore more complicated than working with +character devices. Character devices have a single current position, while block +devices must be able to move to any position in the device to provide random +access to data. To simplify work with block devices, the Linux kernel provides +an entire subsystem called the block I/O (or block layer) subsystem.

+

From the kernel perspective, the smallest logical unit of addressing is the +block. Although the physical device can be addressed at sector level, the kernel +performs all disk operations using blocks. Since the smallest unit of physical +addressing is the sector, the size of the block must be a multiple of the size +of the sector. Additionally, the block size must be a power of 2 and can not +exceed the size of a page. The size of the block may vary depending on the file +system used, the most common values being 512 bytes, 1 kilobyte and 4 +kilobytes.

+
+
+

Register a block I/O device

+

To register a block I/O device, function register_blkdev() is used. +To deregister a block I/O device, function unregister_blkdev() is +used.

+

Starting with version 4.9 of the Linux kernel, the call to +register_blkdev() is optional. The only operations performed by this +function are the dynamic allocation of a major (if the major argument is 0 when +calling the function) and creating an entry in /proc/devices. In +future kernel versions it may be removed; however, most drivers still call it.

+

Usually, the call to the register function is performed in the module +initialization function, and the call to the deregister function is performed in +the module exit function. A typical scenario is presented below:

+
#include <linux/fs.h>
+
+#define MY_BLOCK_MAJOR           240
+#define MY_BLKDEV_NAME          "mybdev"
+
+static int my_block_init(void)
+{
+    int status;
+
+    status = register_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME);
+    if (status < 0) {
+             printk(KERN_ERR "unable to register mybdev block device\n");
+             return -EBUSY;
+     }
+     //...
+}
+
+static void my_block_exit(void)
+{
+     //...
+     unregister_blkdev(MY_BLOCK_MAJOR, MY_BLKDEV_NAME);
+}
+
+
+
+
+

Register a disk

+

Although the register_blkdev() function obtains a major, it does not +provide a device (disk) to the system. For creating and using block devices +(disks), a specialized interface defined in linux/genhd.h is used.

+

The useful functions defined in linux/genhd.h are to register /allocate +a disk, add it to the system, and de-register /unmount the disk.

+

The alloc_disk() function is used to allocate a disk, and the +del_gendisk() function is used to deallocate it. Adding the disk to the +system is done using the add_disk() function.

+

The alloc_disk() and add_disk() functions are typically used in +the module initialization function, and the del_gendisk() function in +the module exit function.

+
#include <linux/fs.h>
+#include <linux/genhd.h>
+
+#define MY_BLOCK_MINORS       1
+
+static struct my_block_dev {
+    struct gendisk *gd;
+    //...
+} dev;
+
+static int create_block_device(struct my_block_dev *dev)
+{
+    dev->gd = alloc_disk(MY_BLOCK_MINORS);
+    //...
+    add_disk(dev->gd);
+}
+
+static int my_block_init(void)
+{
+    //...
+    create_block_device(&dev);
+}
+
+static void delete_block_device(struct my_block_dev *dev)
+{
+    if (dev->gd)
+        del_gendisk(dev->gd);
+    //...
+}
+
+static void my_block_exit(void)
+{
+    delete_block_device(&dev);
+    //...
+}
+
+
+

As with character devices, it is recommended to use a my_block_dev +structure to store important elements describing the block device.

+

Note that immediately after calling the add_disk() function (actually +even during the call), the disk is active and its methods can be called at any +time. As a result, this function should not be called before the driver is fully +initialized and ready to respond to requests for the registered disk.

+

It can be noticed that the basic structure in working with block devices (disks) +is the struct gendisk structure.

+

After a call to del_gendisk(), the struct gendisk structure +may continue to exist (and the device operations may still be called) if there +are still users (an open operation was called on the device but the associated +release operation has not been called). One solution is to keep the number of +users of the device and call the del_gendisk() function only when there +are no users left of the device.

+
+
+

struct gendisk structure

+

The struct gendisk structure stores information about a disk. As +stated above, such a structure is obtained from the alloc_disk() call +and its fields must be filled before it is sent to the add_disk() +function.

+

The struct gendisk structure has the following important fields:

+
+
    +
  • major, first_minor, minors, describing +the identifiers used by the disk; a disk must have at least one minor; if +the disk allows the partitioning operation, a minor must be allocated for +each possible partition
  • +
  • disk_name, which represents the disk name as it appears in +/proc/partitions and in sysfs (/sys/block)
  • +
  • fops, representing operations associated with the disk
  • +
  • queue, which represents the queue of requests
  • +
  • capacity, which is disk capacity in 512 byte sectors; +it is initialized using the set_capacity() function
  • +
  • private_data, which is a pointer to private data
  • +
+
+

An example of filling a struct gendisk structure is presented below:

+
#include <linux/genhd.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+#define NR_SECTORS                   1024
+
+#define KERNEL_SECTOR_SIZE           512
+
+static struct my_block_dev {
+    //...
+    spinlock_t lock;                /* For mutual exclusion */
+    struct request_queue *queue;    /* The device request queue */
+    struct gendisk *gd;             /* The gendisk structure */
+    //...
+} dev;
+
+static int create_block_device(struct my_block_dev *dev)
+{
+    ...
+    /* Initialize the gendisk structure */
+    dev->gd = alloc_disk(MY_BLOCK_MINORS);
+    if (!dev->gd) {
+        printk (KERN_NOTICE "alloc_disk failure\n");
+        return -ENOMEM;
+    }
+
+    dev->gd->major = MY_BLOCK_MAJOR;
+    dev->gd->first_minor = 0;
+    dev->gd->fops = &my_block_ops;
+    dev->gd->queue = dev->queue;
+    dev->gd->private_data = dev;
+    snprintf (dev->gd->disk_name, 32, "myblock");
+    set_capacity(dev->gd, NR_SECTORS);
+
+    add_disk(dev->gd);
+
+    return 0;
+}
+
+static int my_block_init(void)
+{
+    int status;
+    //...
+    status = create_block_device(&dev);
+    if (status < 0)
+        return status;
+    //...
+}
+
+static void delete_block_device(struct my_block_dev *dev)
+{
+    if (dev->gd) {
+        del_gendisk(dev->gd);
+    }
+    //...
+}
+
+static void my_block_exit(void)
+{
+    delete_block_device(&dev);
+    //...
+}
+
+
+

As stated before, the kernel considers a disk as a vector of 512 byte sectors. +In reality, the devices may have a different size of the sector. To work with +these devices, the kernel needs to be informed about the real size of a sector, +and for all operations the necessary conversions must be made.

+

To inform the kernel about the device sector size, a parameter of the request +queue must be set just after the request queue is allocated, using the +blk_queue_logical_block_size() function. All requests generated by the +kernel will be multiples of this sector size and will be aligned accordingly. +However, communication between the device and the driver will still be performed +in sectors of 512 bytes in size, so conversion should be done each time (an +example of such conversion is when calling the set_capacity() function +in the code above).

+
+
+

struct block_device_operations structure

+

Just as for a character device, operations in struct file_operations +should be completed, so for a block device, the operations in +struct block_device_operations should be completed. The association +of operations is done through the fops field in the +struct gendisk +structure.

+

Some of the fields of the struct block_device_operations structure +are presented below:

+
struct block_device_operations {
+    int (*open) (struct block_device *, fmode_t);
+    int (*release) (struct gendisk *, fmode_t);
+    int (*locked_ioctl) (struct block_device *, fmode_t, unsigned,
+                         unsigned long);
+    int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+    int (*compat_ioctl) (struct block_device *, fmode_t, unsigned,
+                         unsigned long);
+    int (*direct_access) (struct block_device *, sector_t,
+                          void **, unsigned long *);
+    int (*media_changed) (struct gendisk *);
+    int (*revalidate_disk) (struct gendisk *);
+    int (*getgeo)(struct block_device *, struct hd_geometry *);
+    blk_qc_t (*submit_bio) (struct bio *bio);
+    struct module *owner;
+}
+
+
+

open() and release() operations are called directly from user +space by utilities that may perform the following tasks: partitioning, file +system creation, file system verification. In a mount() operation, the +open() function is called directly from the kernel space, the file +descriptor being stored by the kernel. A driver for a block device can not +differentiate between open() calls performed from user space and kernel +space.

+

An example of how to use these two functions is given below:

+
#include <linux/fs.h>
+#include <linux/genhd.h>
+
+static struct my_block_dev {
+    //...
+    struct gendisk * gd;
+    //...
+} dev;
+
+static int my_block_open(struct block_device *bdev, fmode_t mode)
+{
+    //...
+
+    return 0;
+}
+
+static int my_block_release(struct gendisk *gd, fmode_t mode)
+{
+    //...
+
+    return 0;
+}
+
+struct block_device_operations my_block_ops = {
+    .owner = THIS_MODULE,
+    .open = my_block_open,
+    .release = my_block_release
+};
+
+static int create_block_device(struct my_block_dev *dev)
+{
+    //....
+    dev->gd->fops = &my_block_ops;
+    dev->gd->private_data = dev;
+    //...
+}
+
+
+

Please notice that there are no read or write operations. These operations are +performed by the request() function associated with the request queue +of the disk.

+
+
+

Request Queues - Multi-Queue Block Layer

+

Drivers for block devices use queues to store the block I/O requests that will +be processed. A request queue is represented by the +struct request_queue structure. The request queue is made up of a +doubly-linked list of requests and their associated control information. The +requests are added to the queue by higher-level kernel code (for example, file +systems).

+

The block device driver associates each queue with a handling function, which +will be called for each request in the queue +(the struct request structure).

+

In earlier versions of the Linux kernel, each device driver had one or +more associated request queues (struct request_queue), where any client could add +requests, while also being able to reorder them. +The problem with this approach is that it requires a per-queue lock, making it +inefficient in distributed systems.

+

The Multi-Queue Block Queueing Mechanism +solves this issue by splitting the device driver queue in two parts:

+
+
    +
  1. Software staging queues
  2. +
  3. Hardware dispatch queues
  4. +
+
+
+

Software staging queues

+

The staging queues hold requests from the clients before sending them to the +block device driver. To prevent the waiting for a per-queue lock, a staging +queue is allocated for each CPU or node. A software queue is associated to +only one hardware queue.

+

While in this queue, the requests can be merged or reordered, according to an +I/O Scheduler, in order to maximize performance. This means that only the +requests coming from the same CPU or node can be optimized.

+

Staging queues are usually not used by the block device drivers, but only +internally by the I/O subsystem to optimize requests before sending them to the +device drivers.

+
+
+

Hardware dispatch queues

+

The hardware queues (struct blk_mq_hw_ctx) are used to send the +requests from the staging queues to the block device driver. +Once in this queue, the requests can't be merged or reordered.

+

Depending on the underlying hardware, a block device driver can create multiple +hardware queues in order to improve parallelism and maximize performance.

+
+
+

Tag sets

+

A block device driver can accept a request before the previous one is completed. +As a consequence, the upper layers need a way to know when a request is +completed. For this, a "tag" is added to each request upon submission and sent +back using a completion notification after the request is completed.

+

The tags are part of a tag set (struct blk_mq_tag_set), which is +unique to a device. +The tag set structure is allocated and initialized before the request queues +and also stores some of the queues properties.

+
struct blk_mq_tag_set {
+  ...
+  const struct blk_mq_ops   *ops;
+  unsigned int               nr_hw_queues;
+  unsigned int               queue_depth;
+  unsigned int               cmd_size;
+  int                        numa_node;
+  void                      *driver_data;
+  struct blk_mq_tags       **tags;
+  struct list_head           tag_list;
+  ...
+};
+
+
+

Some of the fields in struct blk_mq_tag_set are:

+
+
    +
  • ops - Queue operations, most notably the request handling function.
  • +
  • nr_hw_queues - The number of hardware queues allocated for the device
  • +
  • queue_depth - Hardware queues size
  • +
  • cmd_size - Number of extra bytes allocated with each request, to +be used by the block device driver, if needed.
  • +
  • numa_node - In NUMA systems, the index of the node the storage device is +connected to.
  • +
  • driver_data - Data private to the driver, if needed.
  • +
  • tags - Pointer to an array of nr_hw_queues tag sets.
  • +
  • tag_list - List of request queues using this tag set.
  • +
+
+
+
+

Create and delete a request queue

+

Request queues are created using the blk_mq_init_queue() function and +are deleted using blk_cleanup_queue(). The first function creates both +the hardware and the software queues and initializes their structures.

+

Queue properties, including the number of hardware queues, their capacity and +request handling function are configured using the blk_mq_tag_set +structure, as described above.

+

An example of using these functions is as follows:

+
#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+static struct my_block_dev {
+    //...
+    struct blk_mq_tag_set tag_set;
+    struct request_queue *queue;
+    //...
+} dev;
+
+static blk_status_t my_block_request(struct blk_mq_hw_ctx *hctx,
+                                     const struct blk_mq_queue_data *bd)
+//...
+
+static struct blk_mq_ops my_queue_ops = {
+   .queue_rq = my_block_request,
+};
+
+static int create_block_device(struct my_block_dev *dev)
+{
+    /* Initialize tag set. */
+    dev->tag_set.ops = &my_queue_ops;
+    dev->tag_set.nr_hw_queues = 1;
+    dev->tag_set.queue_depth = 128;
+    dev->tag_set.numa_node = NUMA_NO_NODE;
+    dev->tag_set.cmd_size = 0;
+    dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+    err = blk_mq_alloc_tag_set(&dev->tag_set);
+    if (err) {
+        goto out_err;
+    }
+
+    /* Allocate queue. */
+    dev->queue = blk_mq_init_queue(&dev->tag_set);
+    if (IS_ERR(dev->queue)) {
+        goto out_blk_init;
+    }
+
+    blk_queue_logical_block_size(dev->queue, KERNEL_SECTOR_SIZE);
+
+     /* Assign private data to queue structure. */
+    dev->queue->queuedata = dev;
+    //...
+
+out_blk_init:
+    blk_mq_free_tag_set(&dev->tag_set);
+out_err:
+    return -ENOMEM;
+}
+
+static int my_block_init(void)
+{
+    int status;
+    //...
+    status = create_block_device(&dev);
+    if (status < 0)
+        return status;
+    //...
+}
+
+static void delete_block_device(struct block_dev *dev)
+{
+    //...
+    blk_cleanup_queue(dev->queue);
+    blk_mq_free_tag_set(&dev->tag_set);
+}
+
+static void my_block_exit(void)
+{
+    delete_block_device(&dev);
+    //...
+}
+
+
+

After initializing the tag set structure, the tag lists are allocated using the +blk_mq_alloc_tag_set() function. +The pointer to the function which will process the requests +(my_block_request()) is filled in the my_queue_ops structure and +then the pointer to this structure is added to the tag set.

+

The queue is created using the blk_mq_init_queue() function, based on +the information added in the tag set.

+

As part of the request queue initialization, you can configure the +queuedata field, which is equivalent to the private_data +field in other structures.

+
+
+

Useful functions for processing request queues

+

The queue_rq function from struct blk_mq_ops is used to handle +requests for working with the block device. +This function is the equivalent of read and write functions encountered on +character devices. The function receives the requests for the device as +arguments and can use various functions for processing them.

+

The functions used to process the requests in the handler are described below:

+
+
    +
  • blk_mq_start_request() - must be called before starting processing +a request;
  • +
  • blk_mq_requeue_request() - to re-send the request in the queue;
  • +
  • blk_mq_end_request() - to end request processing and notify the +upper layers.
  • +
+
+
+
+
+

Requests for block devices

+

A request for a block device is described by struct request +structure.

+

The fields of struct request structure include:

+
+
    +
  • cmd_flags: a series of flags including direction (reading or +writing); to find out the direction, the macrodefinition +rq_data_dir is used, which returns 0 for a read request and 1 +for a write request on the device;
  • +
  • __sector: the first sector of the transfer request; if the +device sector has a different size, the appropriate conversion should be +done. To access this field, use the blk_rq_pos macro;
  • +
  • __data_len: the total number of bytes to be transferred; to +access this field the blk_rq_bytes macro is used;
  • +
  • generally, data from the current struct bio will be +transferred; the data size is obtained using the +blk_rq_cur_bytes macro;
  • +
  • bio, a dynamic list of struct bio structures that +is a set of buffers associated to the request; this field is accessed by +macrodefinition rq_for_each_segment if there are multiple +buffers, or by bio_data macrodefinition in case there is only +one associated buffer;
  • +
+
+

We will discuss more about the struct bio structure and its +associated operations in the bio_structure section.

+
+

Create a request

+

Read/write requests are created by code layers superior to the kernel I/O +subsystem. Typically, the subsystem that creates requests for block devices is +the file management subsystem. The I/O subsystem acts as an interface between +the file management subsystem and the block device driver. The main operations +under the responsibility of the I/O subsystem are adding requests to the queue +of the specific block device and sorting and merging requests according to +performance considerations.

+
+
+

Process a request

+

The central part of a block device driver is the request handling function +(queue_rq). In previous examples, the function that fulfilled this role was +my_block_request(). As stated in the +Create and delete a request queue section, this function is associated to the +driver when creating the tag set structure.

+

This function is called when the kernel considers that the driver should process +I/O requests. The function must start processing the requests from the queue, +but it is not mandatory to finish them, as requests may be finished by other +parts of the driver.

+

The request function runs in an atomic context and must follow the rules for +atomic code (it must not call functions that can cause sleep, etc.).

+

Calling the function that processes the requests is asynchronous relative +to the actions of any userspace process and no assumptions about the process +in which the respective function is running should be made. Also, it should not +be assumed that the buffer provided by a request is from kernel space or user +space, any operation that accesses the userspace being erroneous.

+

One of the simplest request handling functions is presented below:

+
static blk_status_t my_block_request(struct blk_mq_hw_ctx *hctx,
+                                     const struct blk_mq_queue_data *bd)
+{
+    struct request *rq = bd->rq;
+    struct my_block_dev *dev = q->queuedata;
+
+    blk_mq_start_request(rq);
+
+    if (blk_rq_is_passthrough(rq)) {
+        printk (KERN_NOTICE "Skip non-fs request\n");
+        blk_mq_end_request(rq, BLK_STS_IOERR);
+        goto out;
+    }
+
+    /* do work */
+    ...
+
+    blk_mq_end_request(rq, BLK_STS_OK);
+
+out:
+    return BLK_STS_OK;
+}
+
+
+

The my_block_request() function performs the following operations:

+
+
    +
  • Get a pointer to the request structure from the bd argument and start +its processing using the blk_mq_start_request() function.
  • +
  • A block device can receive calls which do not transfer data blocks (e.g. +low level operations on the disk, instructions referring to special ways of +accessing the device). Most drivers do not know how to handle these +requests and return an error.
  • +
  • To return an error, blk_mq_end_request() function is called, +BLK_STS_IOERR being the second argument.
  • +
  • The request is processed according to the needs of the associated device.
  • +
  • The request ends. In this case, blk_mq_end_request() function is +called in order to complete the request.
  • +
+
+
+
+
+

struct bio structure

+

Each struct request structure is an I/O block request, but may come +from combining more independent requests from a higher level. The sectors to be +transferred for a request can be scattered into the main memory but they always +correspond to a set of consecutive sectors on the device. The request is +represented as a series of segments, each corresponding to a buffer in memory. +The kernel can combine requests that refer to adjacent sectors but will not +combine write requests with read requests into a single +struct request structure.

+

A struct request structure is implemented as a linked list of +struct bio structures together with information that allows the +driver to retain its current position while processing the request.

+

The struct bio structure is a low-level description of a portion of +a block I/O request.

+
struct bio {
+    //...
+    struct gendisk          *bi_disk;
+    unsigned int            bi_opf;         /* bottom bits req flags, top bits REQ_OP. Use accessors. */
+    //...
+    struct bio_vec          *bi_io_vec;     /* the actual vec list */
+    //...
+    struct bvec_iter        bi_iter;
+    /...
+    void                    *bi_private;
+    //...
+};
+
+
+

In turn, the struct bio structure contains a bi_io_vec +vector of struct bio_vec structures. It consists of the individual +pages in the physical memory to be transferred, the offset within the page and +the size of the buffer. To iterate through a struct bio structure, +we need to iterate through the vector of struct bio_vec and transfer +the data from every physical page. To simplify vector iteration, the +struct bvec_iter structure is used. This structure maintains +information about how many buffers and sectors were consumed during the +iteration. The request type is encoded in the bi_opf field; to +determine it, use the bio_data_dir() function.

+
+

Create a struct bio structure

+

Two functions can be used to create a struct bio structure:

+
+
    +
  • bio_alloc(): allocates space for a new structure; the structure +must be initialized;
  • +
  • bio_clone(): makes a copy of an existing struct bio +structure; the newly obtained structure is initialized with the values of +the cloned structure fields; the buffers are shared with the +struct bio structure that has been cloned so that access to the +buffers has to be done carefully to avoid access to the same memory area +from the two clones;
  • +
+
+

Both functions return a new struct bio structure.

+
+
+

Submit a struct bio structure

+

Usually, a struct bio structure is created by the higher levels of +the kernel (usually the file system). A structure thus created is then +transmitted to the I/O subsystem that gathers more struct bio +structures into a request.

+

For submitting a struct bio structure to the associated I/O device +driver, the submit_bio() function is used. The function receives as +argument an initialized struct bio structure that will be added to +a request from the request queue of an I/O device. From that queue, it can be +processed by the I/O device driver using a specialized function.

+
+
+

Wait for the completion of a struct bio structure

+

Submitting a struct bio structure to a driver has the effect of +adding it to a request from the request queue from where it will be further +processed. Thus, when the submit_bio() function returns, it is not +guaranteed that the processing of the structure has finished. If you want to +wait for the processing of the request to be finished, use the +submit_bio_wait() function.

+

To be notified when the processing of a struct bio structure ends +(when we do not use submit_bio_wait() function), the +bi_end_io field of the structure should be used. This field +specifies the function that will be called at the end of the +struct bio structure processing. You can use the +bi_private field of the structure to pass information to the +function.

+
+
+

Initialize a struct bio structure

+

Once a struct bio structure has been allocated and before being +transmitted, it must be initialized.

+

Initializing the structure involves filling in its important fields. As +mentioned above, the bi_end_io field is used to specify the function +called when the processing of the structure is finished. The +bi_private field is used to store useful data that can be accessed +in the function pointed by bi_end_io.

+

The bi_opf field specifies the type of operation.

+
struct bio *bio = bio_alloc(GFP_NOIO, 1);
+//...
+bio->bi_disk = bdev->bd_disk;
+bio->bi_iter.bi_sector = sector;
+bio->bi_opf = REQ_OP_READ;
+bio_add_page(bio, page, size, offset);
+//...
+
+
+

In the code snippet above we specified the following: the block device to which +the struct bio structure is sent, the start sector, the operation +(REQ_OP_READ or REQ_OP_WRITE) and the content. The content of a +struct bio structure is a buffer described by: a physical page, +the offset in the page and the size of the buffer. A page can be allocated using +the alloc_page() call.

+
+

Note

+

The size field of the bio_add_page() call must be +a multiple of the device sector size.

+
+
+
+

How to use the content of a struct bio structure

+

To use the content of a struct bio structure, the structure's +support pages must be mapped to the kernel address space from where they can be +accessed. For mapping/unmapping, use the kmap_atomic and +the kunmap_atomic macros.

+

A typical example of use is:

+
static void my_block_transfer(struct my_block_dev *dev, size_t start,
+                              size_t len, char *buffer, int dir);
+
+
+static int my_xfer_bio(struct my_block_dev *dev, struct bio *bio)
+{
+    struct bio_vec bvec;
+    struct bvec_iter i;
+    int dir = bio_data_dir(bio);
+
+    /* Do each segment independently. */
+    bio_for_each_segment(bvec, bio, i) {
+        sector_t sector = i.bi_sector;
+        char *buffer = kmap_atomic(bvec.bv_page);
+        unsigned long offset = bvec.bv_offset;
+        size_t len = bvec.bv_len;
+
+        /* process mapped buffer */
+        my_block_transfer(dev, sector, len, buffer + offset, dir);
+
+        kunmap_atomic(buffer);
+    }
+
+    return 0;
+}
+
+
+

As it can be seen from the example above, iterating through a +struct bio requires iterating through all of its segments. A segment +(struct bio_vec) is defined by the physical address page, the offset +in the page and its size.

+

To simplify the processing of a struct bio, use the +bio_for_each_segment macrodefinition. It will iterate through all +segments, and will also update global information stored in an iterator +(struct bvec_iter) such as the current sector as well as other +internal information (segment vector index, number of bytes left to be +processed, etc.).

+

You can store information in the mapped buffer, or extract information.

+

In case request queues are used and you need to process the requests +at struct bio level, use the rq_for_each_segment +macrodefinition instead of the bio_for_each_segment macrodefinition. +This macrodefinition iterates through each segment of each +struct bio structure of a struct request structure and +updates a struct req_iterator structure. The +struct req_iterator contains the current struct bio +structure and the iterator that traverses its segments.

+

A typical example of use is:

+
struct bio_vec bvec;
+struct req_iterator iter;
+
+rq_for_each_segment(bvec, req, iter) {
+    sector_t sector = iter.iter.bi_sector;
+    char *buffer = kmap_atomic(bvec.bv_page);
+    unsigned long offset = bvec.bv_offset;
+    size_t len = bvec.bv_len;
+    int dir = bio_data_dir(iter.bio);
+
+    my_block_transfer(dev, sector, len, buffer + offset, dir);
+
+    kunmap_atomic(buffer);
+}
+
+
+
+
+

Free a struct bio structure

+

Once a kernel subsystem has finished using a struct bio structure, it will have to +release the reference to it. This is done by calling the bio_put() function.

+
+
+

Set up a request queue at struct bio level

+

We have previously seen how we can specify a function to be used to process +requests sent to the driver. The function receives as argument the requests and +carries out processing at struct request level.

+

If, for flexibility reasons, we need to specify a function that carries +out processing at struct bio structure level, we no longer +use request queues and we will need to fill the submit_bio field in the +struct block_device_operations associated to the driver.

+

Below is a typical example of initializing a function that carries out +processing at struct bio structure level:

+
// the declaration of the function that carries out processing
+// :c:type:`struct bio` structures
+static blk_qc_t my_submit_bio(struct bio *bio);
+
+struct block_device_operations my_block_ops = {
+   .owner = THIS_MODULE,
+   .submit_bio = my_submit_bio
+   ...
+};
+
+
+
+
+ +
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is block_device_drivers. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/block_device_drivers/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

0. Intro

+

Using LXR find the definitions of the following symbols in the Linux kernel:

+
+
    +
  • struct bio
  • +
  • struct bio_vec
  • +
  • bio_for_each_segment
  • +
  • struct gendisk
  • +
  • struct block_device_operations
  • +
  • struct request
  • +
+
+
+
+

1. Block device

+

Create a kernel module that allows you to register or deregister a block device. +Start from the files in the 1-2-3-6-ram-disk/kernel directory in the +lab skeleton.

+

Follow the comments marked with TODO 1 in the laboratory skeleton. Use the +existing macrodefinitions (MY_BLOCK_MAJOR, +MY_BLKDEV_NAME). Check the value returned by the register function, +and in case of error, return the error code.

+

Compile the module, copy it to the virtual machine and insert it into the +kernel. Verify that your device was successfully created inside the +/proc/devices. +You will see a device with major 240.

+

Unload the kernel module and check that the device was unregistered.

+
+

Hint

+

Review the Register a block I/O device section.

+
+

Change the MY_BLOCK_MAJOR value to 7. Compile the module, copy it to +the virtual machine, and insert it into the kernel. Notice that the insertion +fails because there is already another driver/device registered in the kernel +with the major 7.

+

Restore the 240 value for the MY_BLOCK_MAJOR macro.

+
+
+

2. Disk registration

+

Modify the previous module to add a disk associated with the driver. Analyze the +macrodefinitions, my_block_dev structure and existing functions from +the ram-disk.c file.

+

Follow the comments marked with TODO 2. Use the +create_block_device() and the delete_block_device() functions.

+
+

Hint

+

Review the Register a disk and Process a request sections.

+
+

Fill in the my_block_request() function to process the request +without actually processing your request: display the "request received" message +and the following information: start sector, total size, data size from the +current struct bio structure, direction. To validate a request type, +use the blk_rq_is_passthrough() (the function returns 0 in the case in +which we are interested, i.e. when the request is generated by the file system).

+
+

Hint

+

To find the needed info, review the Requests for block devices +section.

+
+

Use the blk_mq_end_request() function to finish processing the +request.

+

Insert the module into the kernel and inspect the messages printed +by the module. When a device is added, a request is sent to the device. Check +the presence of /dev/myblock and if it doesn't exist, create the device +using the command:

+
mknod /dev/myblock b 240 0
+
+
+

To generate writing requests, use the command:

+
echo "abc"> /dev/myblock
+
+
+

Notice that a write request is preceded by a read request. The request +is done to read the block from the disk and "update" its content with the +data provided by the user, without overwriting the rest. After reading and +updating, writing takes place.

+
+
+

3. RAM disk

+

Modify the previous module to create a RAM disk: requests to the device will +result in reads/writes in a memory area.

+

The memory area dev->data is already allocated in the source code of +the module using vmalloc() and deallocated using vfree().

+
+

Note

+

Review the Process a request section.

+
+

Follow the comments marked with TODO 3 to complete the +my_block_transfer() function to write/read the request information +in/from the memory area. The function will be called for each request within +the queue processing function: my_block_request(). To write/read +to/from the memory area, use memcpy(). To determine the write/read +information, use the fields of the struct request structure.

+
+

Hint

+

To find out the size of the request data, use the +blk_rq_cur_bytes macro. Do not use the +blk_rq_bytes macro.

+
+
+

Hint

+

To find out the buffer associated to the request, use +bio_data(rq->bio).

+
+
+

Hint

+

A description of useful macros is in the Requests for block devices +section.

+
+
+

Hint

+

You can find useful information in the +block device driver example +from Linux Device Drivers.

+
+

For testing, use the test file user/ram-disk-test.c. +The test program is compiled automatically at make build, copied to the +virtual machine at make copy and can be run on the QEMU virtual machine +using the command:

+
./ram-disk-test
+
+
+

There is no need to insert the module into the kernel, it will be inserted by +the ram-disk-test command.

+

Some tests may fail because of lack of synchronization between the transmitted +data (flush).

+
+
+

4. Read data from the disk

+

The purpose of this exercise is to read data from the +PHYSICAL_DISK_NAME disk (/dev/vdb) directly from the kernel.

+
+

Attention

+

Before solving the exercise, we need to make sure the disk is +added to the virtual machine.

+

Check the variable QEMU_OPTS from qemu/Makefile. +There should already be two extra disks added using -drive ....

+

If there are not, generate a file that we will use as +the disk image using the command: +dd if=/dev/zero of=qemu/mydisk.img bs=1024 count=1 +and add the following option: +-drive file=qemu/mydisk.img,if=virtio,format=raw +to qemu/Makefile (in the QEMU_OPTS variable, +after the root disk).

+
+

Follow the comments marked with TODO 4 in the directory 4-5-relay/ +and implement open_disk() and close_disk(). +Use the blkdev_get_by_path() and blkdev_put() functions. The +device must be opened in read-write mode exclusively +(FMODE_READ | FMODE_WRITE | FMODE_EXCL), and +as holder you must use the current module (THIS_MODULE).

+

Implement the send_test_bio() function. You will have to create a new +struct bio structure and fill it, submit it and wait for it. Read the +first sector of the disk. To wait, call the submit_bio_wait() function.

+
+

Hint

+

The first sector of the disk is the sector with the index 0. +This value must be used to initialize the field +bi_iter.bi_sector of the struct bio.

+

For the read operation, use the REQ_OP_READ macro to +initialize the bi_opf field of the struct bio.

+
+

After finishing the operation, display the first 3 bytes of data read by +struct bio structure. Use the format "% 02x" for printk() +to display the data and the kmap_atomic and kunmap_atomic +macros respectively.

+
+

Hint

+

As an argument for the kmap_atomic() function, just use the +page which is allocated above in the code, in the page +variable.

+
+ +

For testing, use the test-relay-disk script, which is copied on the +virtual machine when running make copy. If it is not copied, make +sure it is executable:

+
chmod +x test-relay-disk
+
+
+

There is no need to load the module into the kernel, it will be loaded by +test-relay-disk.

+

Use the command below to run the script:

+
./test-relay-disk
+
+
+

The script writes "abc" at the beginning of the disk indicated by +PHYSICAL_DISK_NAME. After running, the module will display 61 62 63 +(the corresponding hexadecimal values of letters "a", "b" and "c").

+
+
+

5. Write data to the disk

+

Follow the comments marked with TODO 5 to write a message +(BIO_WRITE_MESSAGE) on the disk.

+

The send_test_bio() function receives as argument the operation type +(read or write). Call in the relay_init() function the function for +reading and in the relay_exit() function the function for writing. We +recommend using the REQ_OP_READ and the REQ_OP_WRITE +macros.

+

Inside the send_test_bio() function, if the operation is write, fill in +the buffer associated to the struct bio structure with the message +BIO_WRITE_MESSAGE. Use the kmap_atomic and the +kunmap_atomic macros to work with the buffer associated to the +struct bio structure.

+
+

Hint

+

You need to update the type of the operation associated to the +struct bio structure by setting the bi_opf field +accordingly.

+
+

For testing, run the test-relay-disk script using the command:

+
./test-relay-disk
+
+
+

The script will display the "read from /dev/sdb: 64 65 66" message at the +standard output.

+
+
+

6. Processing requests from the request queue at struct bio level

+

In the implementation from Exercise 3, we have only processed a +struct bio_vec of the current struct bio from the request. +We want to process all struct bio_vec structures from all +struct bio structures. +For this, we will iterate through all struct bio requests and through +all struct bio_vec structures (also called segments) of each +struct bio.

+

Add, within the ramdisk implementation (1-2-3-6-ram-disk/ directory), +support for processing the requests from the request queue at +struct bio level. Follow the comments marked with TODO 6.

+

Set the USE_BIO_TRANSFER macro to 1.

+

Implement the my_xfer_request() function. Use the +rq_for_each_segment macro to iterate through the bio_vec +structures of each struct bio from the request.

+
+

Hint

+

Review the indications and the code snippets from the +How to use the content of a struct bio structure section.

+
+
+

Hint

+

Use the struct bio segment iterator to get the current +sector (iter.iter.bi_sector).

+
+
+

Hint

+

Use the request iterator to get the reference to the current +struct bio (iter.bio).

+
+
+

Hint

+

Use the bio_data_dir macro to find the reading or writing +direction for a struct bio.

+
+

Use the kmap_atomic and the kunmap_atomic macros to map and unmap +the pages of each struct bio structure and access its associated +buffers. For the actual transfer, call the my_block_transfer() function +implemented in the previous exercise.

+

For testing, use the ram-disk-test.c test file:

+
./ram-disk-test
+
+
+

There is no need to insert the module into the kernel, it will be inserted by +the ram-disk-test executable.

+

Some tests may crash because of lack of synchronization between the transmitted +data (flush).

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab8-filesystems-part1.html b/refs/pull/405/merge/so2/lab8-filesystems-part1.html new file mode 100644 index 00000000..3be92acc --- /dev/null +++ b/refs/pull/405/merge/so2/lab8-filesystems-part1.html @@ -0,0 +1,980 @@ + + + + + + SO2 Lab 08 - File system drivers (Part 1) — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 08 - File system drivers (Part 1)

+
+

Lab objectives

+
+
    +
  • acquiring knowledge about the Virtual Filesystem (VFS) in Linux and understanding concepts regarding 'inode', 'dentry', 'file', superblock and data block.
  • +
  • understanding the process of mounting a file system inside VFS.
  • +
  • knowledge regarding various file system types and understanding differences between file systems with physical support (on disk) and the ones without physical support.
  • +
+
+
+
+

Virtual Filesystem (VFS)

+

The Virtual Filesystem (also known as VFS) is a component of the kernel that handles all system calls related to files and file systems. +VFS is a generic interface between the user and a particular file system. +This abstraction simplifies the implementation of file systems and provides an easier integration of multiple file systems. This way, the implementation of a file system is accomplished by using the API provided by the VFS, and the generic hardware and I/O subsystem communication parts are handled by VFS.

+

From a functional point of view, file systems can be grouped into:

+
+
    +
  • disk file systems (ext3, ext4, xfs, fat, ntfs, etc.)
  • +
  • network file systems (nfs, smbfs/cifs, ncp, etc.)
  • +
  • virtual filesystems (procfs, sysfs, sockfs, pipefs, etc.)
  • +
+
+

A Linux kernel instance will use VFS for the hierarchy (a tree) of directories and files. +A new file system will be added as a VFS subtree using the mount operation. +A file system is usually mounted from the environment for which it was built (from a block type device, from network, etc.). +In particular, however, the VFS can use a normal file as a virtual block device, so it is possible to mount disk file systems over normal files. This way, stacks of file systems can be created.

+

The basic idea of VFS is to provide a single file model that can represent files from any file system. +The file system driver is responsible for bringing its own file representation to this common denominator. +This way the kernel can create a single directory structure that contains the entire system. +There will be a file system that will be the root, the rest being mounted in its various directories.

+
+
+

The general file system model

+

The general file system model, to which any implemented file system needs to be reduced, consists of several well-defined entities: superblock, inode, file, and dentry. +These entities are file system metadata (they contain information about data or other metadata).

+

Model entities interact using some VFS or kernel subsystems: dentry cache, inode cache, buffer cache. +Each entity is treated as an object: it has an associated data structure and a pointer to a table of methods. The induction of particular behavior for each component is done by replacing the associated methods.

+
+

superblock

+

The superblock stores the information needed for a mounted file system:

+
+
    +
  • inode and blocks locations
  • +
  • file system block size
  • +
  • maximum filename length
  • +
  • maximum file size
  • +
  • the location of the root inode
  • +
+
+
+

Localization:

+
+
    +
  • In the case of disk file systems, the superblock has a correspondent in the first block of the disk. (Filesystem Control Block).
  • +
  • In VFS, all superblocks of filesystems are retained in a list of structures of type struct super_block and the methods in structures of type struct super_operations.
  • +
+
+
+
+
+

inode

+

The inode (index node) keeps information about a file in the general sense (abstraction): regular file, directory, special file (pipe, fifo), block device, character device, link, or anything that can be abstracted as a file.

+

An inode stores information like:

+
+
    +
  • file type;
  • +
  • file size;
  • +
  • access rights;
  • +
  • access or modify time;
  • +
  • location of data on the disk (pointers to disk blocks containing data).
  • +
+
+
+

Note

+

Usually, the inode does not contain the file name. The name is stored by the dentry entity. This way, an inode can have multiple names (hardlinks).

+
+
+

Localization:

+

Like the superblock, the inode has a disk correspondent. +The inodes on disk are generally grouped into a specialized area (inode area) separated from the data blocks area; In some file systems, the equivalents of the inodes are spread in the file system structure (FAT); +As a VFS entity, an inode is represented by the structure struct inode and by the operations with it defined in the structure struct inode_operations.

+

Each inode is generally identified by a number. On Linux, the -i argument of the ls command shows the inode number associated with each file:

+
razvan@valhalla:~/school/so2/wiki$ ls -i
+1277956 lab10.wiki  1277962 lab9.wikibak  1277964 replace_lxr.sh
+1277954 lab9.wiki   1277958 link.txt      1277955 homework.wiki
+
+
+
+
+
+

file

+

File is the component of the file system model that is closest to the user. +The structure exists only as a VFS entity in memory and has no physical correspondent on disk.

+

While the inode abstracts a file on the disk, the file structure abstracts an open file. +From the point of view of the process, the file entity abstracts the file. From the point of view of the file system implementation, however, the inode is the entity that abstracts the file.

+

The file structure maintains information such as:

+
+
    +
  • file cursor position;
  • +
  • file opening rights;
  • +
  • pointer to the associated inode (possibly its index).
  • +
+
+
+

Localization:

+
+
    +
  • The structure struct file is the associated VFS entity, and the structure struct file_operations represents the operations associated with it.
  • +
+
+
+
+
+

dentry

+

The dentry (directory entry) associates an inode with a file name.

+

Generally, a dentry structure contains two fields:

+
+
    +
  • an integer that identifies the inode;
  • +
  • a string representing its name.
  • +
+
+

The dentry is a specific part of a path that can be a directory or a file. For example, for the path /bin/vi, dentry objects will be created for /, bin, and vi (a total of 3 dentry objects).

+
+
    +
  • the dentry has a correspondent on the disk, but the correspondence is not direct because each file system keeps the dentries in a specific way
  • +
  • in VFS, the dentry entity is represented by the structure struct dentry and the operations with it are defined in the struct dentry_operations structure.
  • +
+
+
+
+
+

Register and unregister filesystems

+

In the current version, the Linux kernel supports about 50 file systems, including:

+
+
    +
  • ext2/ ext4
  • +
  • reiserfs
  • +
  • xfs
  • +
  • fat
  • +
  • ntfs
  • +
  • iso9660
  • +
  • udf for CDs and DVDs
  • +
  • hpfs
  • +
+
+

On a single system, however, it is unlikely that there will be more than 5-6 file systems. For this reason, file systems (or, more correctly, file system types) are implemented as modules and can be loaded or unloaded at any time.

+

In order to be able to dynamically load / unload a file system module, a file system registration / deregistration API is required. The structure describing a particular file system is struct file_system_type:

+
+
+
#include <linux/fs.h>
+
+struct file_system_type {
+         const char *name;
+         int fs_flags;
+         struct dentry *(*mount) (struct file_system_type *, int,
+                                   const char *, void *);
+         void (*kill_sb) (struct super_block *);
+         struct module *owner;
+         struct file_system_type * next;
+         struct hlist_head fs_supers;
+         struct lock_class_key s_lock_key;
+         struct lock_class_key s_umount_key;
+         //...
+};
+
+
+
+
    +
  • name is a string representing the name that will identify a file system (the argument passed to mount -t).
  • +
  • owner is THIS_MODULE for file systems implemented in modules, and NULL if they are written directly into the kernel.
  • +
  • The mount function reads the superblock from the disk in memory when loading the file system. The function is unique to each file system.
  • +
  • The kill_sb function releases the super-block from memory.
  • +
  • fs_flags specifies the flags with which the file system must be mounted. An example of such flag is FS_REQUIRES_DEV that specifies to VFS that the file system needs a disk (it is not a virtual file system).
  • +
  • fs_supers is a list containing all the superblocks associated with this file system. Since the same file system can be mounted multiple times, there will be a separate superblock for each mount.
  • +
+
+

The registration of a file system into the kernel is generally performed in the module initialization function. For registration, the programmer will have to

+
+
    +
  1. initialize a structure of type struct file_system_type with the name, the flags, the function that implements the superblock reading operation and the reference to the structure that identifies the current module
  2. +
  3. call the register_filesystem() function.
  4. +
+
+

When unloading the module, you must unregister the file system by calling the unregister_filesystem() function.

+

An example of registering a virtual file system is found in the code for ramfs:

+
static struct file_system_type ramfs_fs_type = {
+        .name           = "ramfs",
+        .mount          = ramfs_mount,
+        .kill_sb        = ramfs_kill_sb,
+        .fs_flags       = FS_USERNS_MOUNT,
+};
+
+static int __init init_ramfs_fs(void)
+{
+        if (test_and_set_bit(0, &once))
+                return 0;
+        return register_filesystem(&ramfs_fs_type);
+}
+
+
+
+

Functions mount, kill_sb

+

When mounting the file system, the kernel calls the mount function defined within the structure file_system_type. The function makes a set of initializations and returns a dentry (the structure struct dentry) that represents the mount point directory. Usually mount() is a simple function that calls one of the functions:

+
+
    +
  • mount_bdev(), which mounts a file system stored on a block device
  • +
  • mount_single(), which mounts a file system that shares an instance between all mount operations
  • +
  • mount_nodev(), which mounts a file system that is not on a physical device
  • +
  • mount_pseudo(), a helper function for pseudo-file systems (sockfs, pipefs, generally file systems that can not be mounted)
  • +
+
+

These functions get as parameter a pointer to a function fill_super() that will be called after the superblock initialization to finish its initialization by the driver. An example of such a function can be found in the fill_super section.

+

When unmounting the file system, the kernel calls kill_sb(), which performs cleanup operations and invokes one of the functions:

+
+
    +
  • kill_block_super(), which unmounts a file system on a block device
  • +
  • kill_anon_super(), which unmounts a virtual file system (information is generated when requested)
  • +
  • kill_litter_super(), which unmounts a file system that is not on a physical device (the information is kept in memory)
  • +
+
+

An example for a file system without disk support is the ramfs_mount() function in the ramfs file system:

+
struct dentry *ramfs_mount(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *data)
+{
+        return mount_nodev(fs_type, flags, data, ramfs_fill_super);
+}
+
+
+

An example for a file system from disk is the minix_mount() function in the minix file system:

+
struct dentry *minix_mount(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *data)
+{
+         return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
+}
+
+
+
+
+
+

Superblock in VFS

+

The superblock exists both as a physical entity (entity on disk) and as a VFS entity (within the struct super_block structure). +The superblock contains only metainformation and is used to write and read metadata from the disk (inodes, directory entries). +A superblock (and implicitly the struct super_block structure) will contain information about the block device used, the list of inodes, a pointer to the inode of the file system root directory, and a pointer to the superblock operations.

+
+

The struct super_block structure

+

Part of the struct super_block structure definition is presented below:

+
struct super_block {
+        //...
+        dev_t                   s_dev;              /* identifier */
+        unsigned char           s_blocksize_bits;   /* block size in bits */
+        unsigned long           s_blocksize;        /* block size in bytes */
+        unsigned char           s_dirt;             /* dirty flag */
+        loff_t                  s_maxbytes;         /* max file size */
+        struct file_system_type *s_type;            /* filesystem type */
+        struct super_operations *s_op;              /* superblock methods */
+        //...
+        unsigned long           s_flags;            /* mount flags */
+        unsigned long           s_magic;            /* filesystem’s magic number */
+        struct dentry           *s_root;            /* directory mount point */
+        //...
+        char                    s_id[32];           /* informational name */
+        void                    *s_fs_info;         /* filesystem private info */
+};
+
+
+
+
The superblock stores global information for an instance of a file system:
+
    +
  • the physical device on which it resides
  • +
  • block size
  • +
  • the maximum size of a file
  • +
  • file system type
  • +
  • the operations it supports
  • +
  • magic number (identifies the file system)
  • +
  • the root directory dentry
  • +
+
+
+

Additionally, a generic pointer (void *) stores the private data of the file system. +The superblock can be viewed as an abstract object to which its own data is added when there is a concrete implementation.

+
+
+

Superblock operations

+

The superblock operations are described by the struct super_operations structure:

+
struct super_operations {
+       //...
+       int (*write_inode) (struct inode *, struct writeback_control *wbc);
+       struct inode *(*alloc_inode)(struct super_block *sb);
+       void (*destroy_inode)(struct inode *);
+
+       void (*put_super) (struct super_block *);
+       int (*statfs) (struct dentry *, struct kstatfs *);
+       int (*remount_fs) (struct super_block *, int *, char *);
+       //...
+};
+
+
+

The fields of the structure are function pointers with the following meanings:

+
+
    +
  • write_inode, alloc_inode, destroy_inode write, allocate, respectively release resources associated with an inode and are described in the next lab
  • +
  • put_super is called when the superblock is released at umount; within this function, any resources (generally memory) from the file system's private data must be released;
  • +
  • remount_fs is called when the kernel detects a remount attempt (mount flag MS_REMOUNT); most of the time, the function must detect whether a switch from read-only to read-write or vice versa is attempted; this can be done simply because both the old flags (in sb->s_flags) and the new flags (the flags argument) can be accessed; data is a pointer to the data sent by mount() that represents file system specific options;
  • +
  • statfs is called when a statfs system call is done (try stat -f or df); this call must fill the fields of the struct kstatfs structure, as it is done, for example, in the ext4_statfs() function.
  • +
+
+
+
+
+

The fill_super() function

+

As specified, the fill_super() function is called to finish the superblock initialization. This initialization involves filling the struct super_block structure fields and the initialization of the root directory inode.

+

An example of implementation is the ramfs_fill_super() function which is called to initialize the remaining fields in the superblock:

+
#include <linux/pagemap.h>
+
+#define RAMFS_MAGIC     0x858458f6
+
+static const struct super_operations ramfs_ops = {
+        .statfs         = simple_statfs,
+        .drop_inode     = generic_delete_inode,
+        .show_options   = ramfs_show_options,
+};
+
+static int ramfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct ramfs_fs_info *fsi;
+        struct inode *inode;
+        int err;
+
+        save_mount_options(sb, data);
+
+        fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
+        sb->s_fs_info = fsi;
+        if (!fsi)
+                return -ENOMEM;
+
+        err = ramfs_parse_options(data, &fsi->mount_opts);
+        if (err)
+                return err;
+
+        sb->s_maxbytes          = MAX_LFS_FILESIZE;
+        sb->s_blocksize         = PAGE_SIZE;
+        sb->s_blocksize_bits    = PAGE_SHIFT;
+        sb->s_magic             = RAMFS_MAGIC;
+        sb->s_op                = &ramfs_ops;
+        sb->s_time_gran         = 1;
+
+        inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
+        sb->s_root = d_make_root(inode);
+        if (!sb->s_root)
+                return -ENOMEM;
+
+        return 0;
+}
+
+
+

The kernel provides generic functions to implement operations with file system structures. +The generic_delete_inode() and simple_statfs() functions used in the above code are such functions and can be used to implement the drivers if their functionality is sufficient.

+

The ramfs_fill_super() function in the above code fills some fields in the superblock, then reads the root inode and allocates the root dentry. +Reading the root inode is done in the ramfs_get_inode() function, and consists of allocating a new inode using new_inode() and initializing it. In order to free the inode, iput() is used, and d_make_root() is used to allocate the root dentry.

+

An example implementation for a disk file system is the minix_fill_super() function in the minix file system. +The functionality for the disk file system is similar to that of the virtual file system, with the exception of using the buffer cache. +Also, the minix file system keeps private data using the struct minix_sb_info structure. +A large part of this function deals with the initialization of these private data. +The private data is allocated using the kzalloc() function and stored in the s_fs_info field of the superblock structure.

+

VFS functions typically get as arguments the superblock, an inode and/or a dentry that contain a pointer to the superblock so that these private data can be easily accessed.

+
+
+

Buffer cache

+

Buffer cache is a kernel subsystem that handles caching (both read and write) blocks from block devices. +The base entity used by buffer cache is the struct buffer_head structure. +The most important fields in this structure are:

+
+
    +
  • b_data, pointer to a memory area where the data was read from or where the data must be written to
  • +
  • b_size, buffer size
  • +
  • b_bdev, the block device
  • +
  • b_blocknr, the number of the block on the device that has been loaded or needs to be saved on the disk
  • +
  • b_state, the status of the buffer
  • +
+
+

There are some important functions that work with these structures:

+
+
    +
  • __bread(): reads a block with the given number and given size in a buffer_head structure; in case of success returns a pointer to the buffer_head structure, otherwise it returns NULL;
  • +
  • sb_bread(): does the same thing as the previous function, but the size of the read block is taken from the superblock, as well as the device from which the read is done;
  • +
  • mark_buffer_dirty(): marks the buffer as dirty (sets the BH_Dirty bit); the buffer will be written to the disk at a later time (from time to time the bdflush kernel thread wakes up and writes the buffers to disk);
  • +
  • brelse(): frees up the memory used by the buffer, after it has previously written the buffer on disk if needed;
  • +
  • map_bh(): associates the buffer-head with the corresponding sector.
  • +
+
+
+
+

Functions and useful macros

+

The super block typically contains a map of occupied blocks (by inodes, dentries, data) in the form of a bitmap (vector of bits). To work with such maps, it is recommended to use the following functions:

+
+
    +
  • find_first_zero_bit(), to find the first zero bit in a memory area. The size parameter means the number of bits in the search area;
  • +
  • test_and_set_bit(), to set a bit and get the old value;
  • +
  • test_and_clear_bit(), to delete a bit and get the old value;
  • +
  • test_and_change_bit(), to invert the value of a bit and get the old value.
  • +
+
+

The following macrodefinitions can be used to verify the type of an inode:

+
+
    +
  • S_ISDIR (inode->i_mode) to check if the inode is a directory;
  • +
  • S_ISREG (inode->i_mode) to check if the inode is a regular file (not a link or device file).
  • +
+
+
+
+

Further reading

+
    +
  1. Robert Love -- Linux Kernel Development, Second Edition -- Chapter +12. The Virtual Filesystem
  2. +
  3. Understanding the Linux Kernel, 3rd edition - Chapter 12. The Virtual +Filesystem
  4. +
  5. Linux Virtual File System (presentation)
  6. +
  7. Understanding Unix/Linux Filesystem
  8. +
  9. Creating Linux virtual filesystems
  10. +
  11. The Linux Documentation Project - VFS
  12. +
  13. The "Virtual File System" in Linux
  14. +
  15. A Linux Filesystem Tutorial
  16. +
  17. The Linux Virtual File System
  18. +
  19. Documentation/filesystems/vfs.txt
  20. +
  21. File systems sources
  22. +
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is filesystems. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/filesystems/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

Important

+

In order to have a better understanding of what we do well and what we can do +better, what factors affect your involvement in teaching, extracurricular +but also professional activities, we ask you to complete this survey. The survey is a short one, +having answers with check marks, with an estimated completion time of +3-5 minutes. Obviously, we will send you the analysis of the survey and +use it to improve the teaching activities.

+
+
+

myfs

+

To begin, we plan to get familiar with the interface exposed by the Linux kernel and the Virtual File System (VFS) component. That is why, for the beginning, we will work with a simple, virtual file system (i.e. without physical disk support). The file system is called myfs.

+

For this we will access the myfs/ subdirectory in the laboratory skeleton. We will implement the superblock operations within this lab, and the next lab will continue with the inode operations.

+
+

1. Register and unregister the myfs file system

+

The first step in working with the file system is to register and unregister it. We want to do this for the file system described in myfs.c. Check the file contents and follow the directions marked with TODO 1.

+

The steps you need to take are described in the section Register and unregister filesystems. Use the "myfs" string for the file system name.

+
+

Note

+

Within the file system structure, use the myfs_mount function present in the code skeleton to fill the superblock (done when mounting). In myfs_mount call the function specific to a file system without disk support. As an argument for the specific mount function, use the function of type fill_super defined in the code skeleton. You can review the Functions mount, kill_sb section.

+

To destroy the superblock (done at unmounting) use kill_litter_super, also a function specific to a file system without disk support. The function is already implemented, you need to fill it in the struct file_system_type structure.

+
+

After completing the sections marked with TODO 1, compile the module, copy it to the QEMU virtual machine, and start the virtual machine. Load the kernel module and then check the presence of the myfs file system within the /proc/filesystems file.

+

At the moment, the file system is only registered, it does not expose operations to use it. If we try to mount it, the operation will fail. To try mounting, we create mount point /mnt/myfs/.

+
# mkdir -p /mnt/myfs
+
+
+

and then we use the mount command:

+
# mount -t myfs none /mnt/myfs
+
+
+

The error message we get shows that we have not implemented the operations that work on the superblock. We will have to implement the operations on the superblock and initialize the root inode. We will do this further.

+
+

Note

+

The none argument sent to the mount command indicates that we do not have a device from which to mount, the file system being a virtual one. Similarly, this is how the procfs or sysfs filesystems are mounted on Linux systems.

+
+
+
+

2. Completing myfs superblock

+

To be able to mount the file system, we need to fill its superblock's fields, that is, a generic VFS structure of type struct super_block. +We will fill out the structure within the myfs_fill_super() function; the superblock is represented by the variable sb passed as an argument to the function. +Follow the hints marked with TODO 2.

+
+

Note

+

To fill the myfs_fill_super function, you can start from the example in the section The fill_super() function.

+

For the superblock structure fields, use the macros defined within the code skeleton wherever possible.

+
+

The s_op field in the superblock structure must be initialized to the superblock operations structures (type struct super_operations). You need to define such a structure.

+

For information on defining the struct super_operations structure and filling the superblock, see the section Superblock operations.

+
+

Note

+

Initialize the drop_inode and statfs fields of struct super_operations structure.

+
+

Although the superblock will be properly initialized at this time, the mount operation will continue to fail. +In order for the operation to be successfully completed, the root inode will have to be initialized, which we will do for the next exercise.

+
+
+

3. Initialize myfs root inode

+

The root inode is the inode of the file system root directory (i.e. /). +Initialization is done when the file system is mounted. +The myfs_fill_super function, called at mount, is the one that calls the myfs_get_inode function that creates and initializes an inode. +Typically, this function is used to create and initialize all inodes; in this exercise, however, we will only create the root inode.

+

The inode is allocated inside the myfs_get_inode function (local variable inode, allocated using the new_inode() function call).

+

To successfully complete mounting the file system, you will need to fill the myfs_get_inode function. Follow directions marked with TODO 3. A starting point is the ramfs_get_inode function.

+
+

Note

+

To initialize uid, gid and mode , you can use the inode_init_owner() function as it is used in ramfs_get_inode(). +When you call inode_init_owner(), use NULL as the second parameter because there is no parent directory for the created inode.

+

Initialize the i_atime, i_ctime, and i_mtime of the VFS inode to the value returned by the current_time() function.

+

You will need to initialize the operations for the inode of type directory. To do this, follow the steps:

+
+
    +
  1. Check if this is a directory type inode using the S_ISDIR macro.
  2. +
  3. For the i_op and i_fop fields, use kernel functions that are already implemented:
      +
    • for i_op: simple_dir_inode_operations.
    • +
    • for i_fop: simple_dir_operations
    • +
    +
  4. +
  5. Increase the number of links for the directory using the inc_nlink() function.
  6. +
+
+
+
+
+

4. Test myfs mount and unmount

+

Now we can mount the filesystem. +Follow the steps above to compile the kernel module, copy to the virtual machine, and start the virtual machine, then insert the kernel module, create the mount point /mnt/myfs/, and mount the file system. +We verify that the file system was mounted by inspecting the /proc/mounts file.

+

What inode number does the /mnt/myfs directory have? Why?

+
+

Note

+

To display the inode number of a directory, use the command:

+
ls -di /path/to/directory
+
+
+

where /path/to/directory/ is the path to the directory whose inode number we want to display.

+
+

We check myfs file system statistics using the following command:

+
stat -f /mnt/myfs
+
+
+

We want to see what the mount point /mnt/myfs contains and if we can create files. +For this we run the commands:

+
# ls -la /mnt/myfs
+# touch /mnt/myfs/a.txt
+
+
+

We can see that we can not create the a.txt file on the file system. +This is because we have not implemented the operations to work with inodes in the struct super_operations structure. +We will implement these operations within the next lab.

+

Unmount the file system using the command

+
umount /mnt/myfs
+
+
+

Unload the kernel module corresponding to the file system as well.

+
+

Note

+

To test the entire functionality, you can use the test-myfs.sh script:

+
./test-myfs.sh
+
+
+

The script is copied to the virtual machine using make copy only if it is executable:

+
student@workstation:~/linux/tools/labs$ chmod +x skels/filesystems/myfs/test-myfs.sh
+
+
+
+
+

Note

+

The statistics displayed for the file system are minimal because the information is provided by the simple_statfs function.

+
+
+
+
+

minfs

+

Next, we will implement the basics of a very simple file system, called minfs, with disk support. +We will use a disk in the virtual machine that we will format and mount with the minfs filesystem.

+

For this we will access the minfs/kernel directory from the laboratory skeleton and work with the code in minfs.c. +Just like myfs we will not implement the operations for working with inodes. We will just limit to working with the superblock and, therefore, mounting. +The rest of the operations will be implemented in the next lab.

+

Follow the diagram below to clarify the role of structures within the minfs file system.

+../_images/minfs1.png +
+

1. Registering and unregistering the minfs file system

+
+

Note

+

Before solving the exercise, we need to add a disk to the virtual machine. To do this, generate a file that we will use as the disk image using the following command:

+
dd if=/dev/zero of=mydisk.img bs=1M count=100
+
+
+

and add the -drive file=mydisk.img,if=virtio,format=raw argument to the qemu command in qemu/Makefile (in the QEMU_OPTS variable). +The new argument for the qemu command must be added after the one for the existing disk (YOCTO_IMAGE).

+
+

To register and unregister the file system, you will need to fill in the minfs_fs_type structure and the minfs_mount function in minfs.c. Follow the directions marked with TODO 1.

+
+

Note

+

In the file system structure, for mount, use the minfs_mount function from the code skeleton. +In this function, call the function to mount a file system with disk support (See the Functions mount, kill_sb section. Use mount_bdev()). +Choose the most suitable function for destroying the superblock (done at unmount); keep in mind that it is a file system with disk support. Use the kill_block_super() function.

+

Initialize the fs_flags field of the minfs_fs_type structure with the appropriate value for a file system with disk support. See the section Register and unregister filesystems.

+

The function for filling the superblock is minfs_fill_super.

+
+

After completing the sections marked with TODO 1, compile the module, copy it into the QEMU virtual machine, and start the virtual machine. +Load the kernel module and then check the presence of the minfs file system within the /proc/filesystems file.

+

To test the mounting of the minfs file system we will need to format the disk with its structure. Formatting requires the mkfs.minfs formatting tool from the minfs/user directory. The utility is automatically compiled when running make build and copied to the virtual machine at make copy.

+

After compiling, copying, and starting the virtual machine, format the /dev/vdd using the formatting utility:

+
# ./mkfs.minfs /dev/vdd
+
+
+

Load the kernel module:

+
# insmod minfs.ko
+
+
+

Create mount point /mnt/minfs/:

+
# mkdir -p /mnt/minfs/
+
+
+

and mount the filesystem

+
# mount -t minfs /dev/vdd /mnt/minfs/
+
+
+

The operation fails because the root inode is not initialized.

+
+
+

2. Completing minfs superblock

+

To be able to mount the file system, you will need to fill the superblock (i.e. a structure with type struct super_block) within the minfs_fill_super function; it is the s argument of the function. +The structure of operations on the superblock is already defined: minfs_ops. +Follow the directions marked with TODO 2. You can also follow the implementation of the minix_fill_super function.

+
+

Note

+

Some structures are found in the header file minfs.h.

+

For information on working with buffers, go to the Buffer cache section.

+

Read the first block on the disk (block with index 0). +To read the block, use the sb_bread() function. +Cast the read data (the b_data field in the struct buffer_head structure) to the structure storing the minfs superblock information on the disk: struct minfs_super_block, defined in the source code file.

+

Structure struct minfs_super_block holds file system-specific information that is not found in the struct super_block generic structure (in this case only version). +This additional information (found in struct minfs_super_block (on disk) but not in struct super_block (VFS)) will be stored in the struct minfs_sb_info structure.

+
+

To check the functionality, we need a function for reading the root inode. +For the time being, use the myfs_get_inode function from myfs file system exercises. +Copy the function into the source code and call it the same as you did for myfs. +The third argument when calling the myfs_get_inode function is the inode creation permissions, similar to the virtual file system exercise (myfs).

+

Validate the implementation by executing the commands from the previous exercise.

+
+
+

3. Creating and destroying minfs inodes

+

For mounting, we need to initialize the root inode, and to get the root inode, we need to implement the functions to work with inodes. +That is, you need to implement the minfs_alloc_inode and minfs_destroy_inode functions. +Follow the directions marked with TODO 3. You can use the minix_alloc_inode() and minix_destroy_inode() functions as a model.

+

For the implementation, look at the macros and structures in the minfs.h header file.

+
+

Note

+

For memory allocation/deallocation in minfs_alloc_inode and minfs_destroy_inode, we recommend using kzalloc() and kfree().

+

In minfs_alloc_inode allocate structures with type struct minfs_inode_info, but only return structures with type struct inode, i.e. return those given by the vfs_inode field.

+

In the minfs_alloc_inode function, call inode_init_once() to initialize the inode.

+

In the destroy_inode function, you can access the structure with type struct minfs_inode_info using the container_of macro.

+
+
+

Note

+

In this exercise, you have implemented the minfs_alloc_inode and minfs_destroy_inode functions, but they are not yet called. The correctness of the implementation will be checked at the end of the next exercise.

+
+
+
+

4. Initialize minfs root inode

+

Initializing the root inode is required in order to mount the file system. +For this, you will need to complete the minfs_ops structure with the minfs_alloc_inode and minfs_destroy_inode functions and fill the minfs_iget function.

+

The minfs_iget function is the function called to allocate a VFS inode (i.e. struct inode) and fill it with minfs inode-specific information from the disk (i.e. struct minfs_inode).

+

Follow the directions marked with TODO 4. +Fill out the alloc_inode and destroy_inode fields of struct super_operations structure with the functions implemented in the previous step.

+

The information about the root inode is found in the second block on the disk (the inode with index 1). +Make minfs_iget read the root minfs inode from the disk (struct minfs_inode) and fill in the VFS inode (struct inode).

+

In the minfs_fill_super function, replace the myfs_get_inode call with the minfs_iget function call.

+
+

Note

+

To implement the minfs_iget function, follow the implementation of V1_minix_iget. +To read a block, use the sb_bread() function. +Cast the read data (the b_data field of the struct buffer_head structure) to the minfs inode from the disk (struct minfs_inode).

+

The i_uid, i_gid, i_mode, i_size must be filled in the VFS inode with the values in the minfs inode structure read from disk. +To initialize the i_uid and i_gid fields, use the functions i_uid_write() , and i_gid_write().

+

Initialize the i_atime , i_ctime, and i_mtime fields of the VFS inode to the value returned by the current_time() function.

+

You will need to initialize the operations for the inode with type directory. To do this, follow the steps:

+
+
    +
  1. Check if this is a directory type inode using the S_ISDIR macro.
  2. +
  3. For the i_op and i_fop fields, use kernel functions already implemented:
      +
    • for i_op: simple_dir_inode_operations() .
    • +
    • for i_fop: simple_dir_operations()
    • +
    +
  4. +
  5. Increment the number of links for the directory using the inc_nlink() function.
  6. +
+
+
+
+
+

5. Testing of minfs mount and unmount

+

Now we can mount the filesystem. +Follow the steps above to compile the kernel module, copy to the virtual machine, start the virtual machine, and then insert the kernel module, create mount point /mnt/minfs/ and mount the file system. +We verify that the file system was mounted by investigating the /proc/mounts file.

+

We check that everything is fine by listing the mount point contents /mnt/minfs/:

+
# ls /mnt/minfs/
+
+
+

After mount and verification, unmount the file system and unload the module from the kernel.

+
+

Note

+

Alternatively, to test the entire functionality, you can use the test-minfs.sh script:

+
# ./test-minfs.sh
+
+
+

The script is copied to the virtual machine when running the make copy command only if it is executable.

+
student@workstation:~/linux/tools/labs$ chmod +x skels/filesystems/minfs/user/test-minfs.sh
+
+
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lab9-filesystems-part2.html b/refs/pull/405/merge/so2/lab9-filesystems-part2.html new file mode 100644 index 00000000..1a31376f --- /dev/null +++ b/refs/pull/405/merge/so2/lab9-filesystems-part2.html @@ -0,0 +1,1236 @@ + + + + + + SO2 Lab 09 - File system drivers (Part 2) — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lab 09 - File system drivers (Part 2)

+
+

Lab objectives

+
+
    +
  • Improving the knowledge about inode, file and dentry.
  • +
  • Acquiring knowledge about adding support for working with regular files and directories in VFS (Virtual File System).
  • +
  • Acquiring knowledge about the internal implementation of a file system.
  • +
+
+
+
+

Inode

+

The inode is an essential component of a UNIX file system and, at the same time, an important component of VFS. An inode is a metadata (it has information about information). +An inode uniquely identifies a file on disk and holds information about it (uid, gid, access rights, access times, pointers to data blocks, etc.). +An important aspect is that an inode does not have information about the file name (it is retained by the associated struct dentry structure).

+

The inode refers to a file on the disk. To refer to an open file (associated with a file descriptor within a process), the struct file structure is used. +An inode can have any number of (zero or more) file structures associated (multiple processes can open the same file, or a process can open the same file several times).

+

Inode exists both as a VFS entity (in memory) and as a disk entity (for UNIX, HFS, NTFS, etc.). +The inode in VFS is represented by the structure struct inode. +Like the other structures in VFS, struct inode is a generic structure that covers the options for all supported file types, even those that do not have an associated disk entity (such as FAT).

+
+

The inode structure

+

The inode structure is the same for all file systems. In general, file systems also have private information. These are referenced through the i_private field of the structure. +Conventionally, the structure that keeps that particular information is called <fsname>_inode_info, where fsname represents the file system name. For example, minix and ext4 filesystems store particular information in structures struct minix_inode_info, or struct ext4_inode_info.

+

Some of the important fields of struct inode are:

+
+
    +
  • i_sb : The superblock structure of the file system the inode belongs to.
  • +
  • i_rdev: the device on which this file system is mounted
  • +
  • i_ino : the number of the inode (uniquely identifies the inode within the file system)
  • +
  • i_blkbits: number of bits used for the block size == log2(block size)
  • +
  • i_mode, i_uid, i_gid: access rights, uid, gid
  • +
  • i_size: file/directory/etc. size in bytes
  • +
  • i_mtime, i_atime, i_ctime: last modification time of the file contents, last access time, and last inode (metadata) change time
  • +
  • i_nlink: the number of name entries (dentries) that use this inode; for file systems without links (either hard or symbolic) this is always set to 1
  • +
  • i_blocks: the number of blocks used by the file (all blocks, not just data); this is only used by the quota subsystem
  • +
  • i_op, i_fop: pointers to operations structures: struct inode_operations and struct file_operations; i_mapping->a_ops contains a pointer to struct address_space_operations.
  • +
  • i_count: the inode counter indicating how many kernel components use it.
  • +
+
+

Some functions that can be used to work with inodes:

+
+
    +
  • new_inode(): creates a new inode, sets the i_nlink field to 1 and initializes i_blkbits, i_sb and i_dev;

    +
  • +
  • insert_inode_hash(): adds the inode to the hash table of inodes; an interesting effect of this call is that the inode will be written to the disk if it is marked as dirty;

    +
    +

    Warning

    +

    An inode created with new_inode() is not in the hash table, and unless you have serious reasons not to, you must enter it in the hash table;

    +
    +
  • +
  • mark_inode_dirty(): marks the inode as dirty; at a later moment, it will be written on the disc;

    +
  • +
  • iget_locked(): loads the inode with the given number from the disk, if it is not already loaded;

    +
  • +
  • unlock_new_inode(): used in conjunction with iget_locked(), releases the lock on the inode;

    +
  • +
  • iput(): tells the kernel that the work on the inode is finished; if no one else uses it, it will be destroyed (after being written on the disk if it is marked as dirty);

    +
  • +
  • make_bad_inode(): tells the kernel that the inode cannot be used; it is generally called from the function that reads the inode, when the inode could not be read from the disk or is invalid.

    +
  • +
+
+
+
+

Inode operations

+
+

Getting an inode

+

One of the main inode operations is obtaining an inode (the struct inode in VFS). +Until version 2.6.24 of the Linux kernel, the developer defined a read_inode function. +Starting with version 2.6.25, the developer must define a <fsname>_iget where <fsname> is the name of the file system. +This function is responsible for finding the VFS inode if it exists or creating a new one and filling it with the information from the disk.

+

Generally, this function will call iget_locked() to get the inode structure from VFS. If the inode is newly created then it will need to read the inode from the disk (using sb_bread()) and fill in the useful information.

+

An example of such a function is minix_iget():

+
static struct inode *V1_minix_iget(struct inode *inode)
+{
+      struct buffer_head * bh;
+      struct minix_inode * raw_inode;
+      struct minix_inode_info *minix_inode = minix_i(inode);
+      int i;
+
+      raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh);
+      if (!raw_inode) {
+              iget_failed(inode);
+              return ERR_PTR(-EIO);
+      ...
+}
+
+struct inode *minix_iget(struct super_block *sb, unsigned long ino)
+{
+      struct inode *inode;
+
+      inode = iget_locked(sb, ino);
+      if (!inode)
+              return ERR_PTR(-ENOMEM);
+      if (!(inode->i_state & I_NEW))
+              return inode;
+
+      if (INODE_VERSION(inode) == MINIX_V1)
+              return V1_minix_iget(inode);
+    ...
+}
+
+
+

The minix_iget function gets the VFS inode using iget_locked(). +If the inode is already existing (not new == the I_NEW flag is not set) the function returns. +Otherwise, the function calls the V1_minix_iget() function that will read the inode from the disk using minix_V1_raw_inode() and then complete the VFS inode with the read information.

+
+
+

Superoperations

+

Many of the superoperations (components of the struct super_operations structure used by the superblock) are used when working with inodes. These operations are described next:

+
+
    +
  • alloc_inode: allocates an inode. +Usually, this function allocates a struct <fsname>_inode_info structure and performs basic VFS inode initialization (using inode_init_once()); +minix uses for allocation the kmem_cache_alloc() function that interacts with the SLAB subsystem. +For each allocation, the cache constructor is called, which in the case of minix is the init_once() function. +Alternatively, kmalloc() can be used, in which case the inode_init_once() function should be called. +The alloc_inode() function will be called by the new_inode() and iget_locked() functions.
  • +
  • write_inode : saves/updates the inode received as a parameter on disk; to update the inode, though inefficient, for beginners it is recommended to use the following sequence of operations:
      +
    • load the inode from the disk using the sb_bread() function;
    • +
    • modify the buffer according to the saved inode;
    • +
    • mark the buffer as dirty using mark_buffer_dirty(); the kernel will then handle its writing on the disk;
    • +
    • an example is the minix_write_inode() function in the minix file system
    • +
    +
  • +
  • evict_inode: removes any information about the inode with the number received in the i_ino field from the disk and memory (both the inode on the disk and the associated data blocks). This involves performing the following operations:
      +
    • delete the inode from the disk;
    • +
    • updates disk bitmaps (if any);
    • +
    • delete the inode from the page cache by calling truncate_inode_pages();
    • +
    • delete the inode from memory by calling clear_inode() ;
    • +
    • an example is the minix_evict_inode() function from the minix file system.
    • +
    +
  • +
  • destroy_inode: releases the memory occupied by the inode
  • +
+
+
+
+

inode_operations

+

The inode operations are described by the struct inode_operations structure.

+

Inodes are of several types: file, directory, special file (pipe, fifo), block device, character device, link etc. +For this reason, the operations that an inode needs to implement are different for each type of inode. +Below are detailed operations for a file type inode and a directory inode.

+

The operations of an inode are initialized and accessed using the i_op field of the structure struct inode.

+
+
+
+
+

The file structure

+

The file structure corresponds to a file open by a process and exists only in memory, being associated with an inode. +It is the closest VFS entity to user-space; the structure fields contain familiar information of a user-space file (access mode, file position, etc.) and the operations with it are performed by known system calls (read, write , etc.).

+

The file operations are described by the struct file_operations structure.

+

The file operations for a file system are initialized using the i_fop field of the struct inode structure. +When opening a file, the VFS initializes the f_op field of the struct file structure with address of inode->i_fop, such that subsequent system calls use the value stored in the file->f_op.

+
+
+

Regular files inodes

+

To work with the inode, the i_op and i_fop fields of the inode structure must be filled in. +The type of the inode determines the operations that it needs to implement.

+
+

Regular files inode operations

+

In the minix file system, the minix_file_inode_operations structure is defined for the operations on an inode and for the file operations the minix_file_operations structure is defined:

+
const struct file_operations minix_file_operations = {
+         .llseek         = generic_file_llseek,
+         .read_iter      = generic_file_read_iter,
+         //...
+         .write_iter     = generic_file_write_iter,
+         //...
+         .mmap           = generic_file_mmap,
+         //...
+};
+
+const struct inode_operations minix_file_inode_operations = {
+        .setattr        = minix_setattr,
+        .getattr        = minix_getattr,
+};
+
+        //...
+        if (S_ISREG(inode->i_mode)) {
+                inode->i_op = &minix_file_inode_operations;
+                inode->i_fop = &minix_file_operations;
+        }
+        //...
+
+
+

The functions generic_file_llseek() , generic_file_mmap() , generic_file_read_iter() and generic_file_write_iter() are implemented in the kernel.

+

For simple file systems, only the truncation operation (truncate system call) must be implemented. +Although initially there was a dedicated operation, starting with 3.14 the operation was embedded in setattr: if the size passed in the attributes is different from the current size of the inode, then a truncate operation must be performed. +An example of implementing this verification is in the minix_setattr() function:

+
static int minix_setattr(struct dentry *dentry, struct iattr *attr)
+{
+        struct inode *inode = d_inode(dentry);
+        int error;
+
+        error = setattr_prepare(dentry, attr);
+        if (error)
+                return error;
+
+        if ((attr->ia_valid & ATTR_SIZE) &&
+            attr->ia_size != i_size_read(inode)) {
+                error = inode_newsize_ok(inode, attr->ia_size);
+                if (error)
+                        return error;
+
+                truncate_setsize(inode, attr->ia_size);
+                minix_truncate(inode);
+        }
+
+        setattr_copy(inode, attr);
+        mark_inode_dirty(inode);
+        return 0;
+}
+
+
+

The truncate operation involves:

+
+
    +
  • freeing blocks of data on the disk that are now extra (if the new dimension is smaller than the old one) or allocating new blocks (for cases where the new dimension is larger)
  • +
  • updating disk bit maps (if used);
  • +
  • updating the inode;
  • +
  • filling with zero the space that was left unused from the last block using the block_truncate_page() function.
  • +
+
+

An example of the implementation of the cropping operation is the minix_truncate() function in the minix file system.

+
+
+

Address space operations

+

There is a close link between the address space of a process and files: the execution of the programs is done almost exclusively by mapping the file into the process address space. +Because this approach works very well and is quite general, it can also be used for regular system calls such as read and write.

+

The structure that describes the address space is struct address_space, and the operations with it are described by the structure struct address_space_operations. To initialize the address space operations, fill inode->i_mapping->a_ops of the file type inode.

+

An example is the minix_aops structure in the minix file system:

+
static const struct address_space_operations minix_aops = {
+       .readpage = minix_readpage,
+       .writepage = minix_writepage,
+       .write_begin = minix_write_begin,
+       .write_end = generic_write_end,
+       .bmap = minix_bmap
+};
+
+//...
+if (S_ISREG(inode->i_mode)) {
+      inode->i_mapping->a_ops = &minix_aops;
+}
+//...
+
+
+

The generic_write_end() function is already implemented. +Most of the specific functions are very easy to implement, as follows:

+
static int minix_writepage(struct page *page, struct writeback_control *wbc)
+{
+         return block_write_full_page(page, minix_get_block, wbc);
+}
+
+static int minix_readpage(struct file *file, struct page *page)
+{
+         return block_read_full_page(page, minix_get_block);
+}
+
+static void minix_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+
+        if (to > inode->i_size) {
+                truncate_pagecache(inode, inode->i_size);
+                minix_truncate(inode);
+        }
+}
+
+static int minix_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata)
+{
+        int ret;
+
+        ret = block_write_begin(mapping, pos, len, flags, pagep,
+                                minix_get_block);
+        if (unlikely(ret))
+                minix_write_failed(mapping, pos + len);
+
+        return ret;
+}
+
+static sector_t minix_bmap(struct address_space *mapping, sector_t block)
+{
+         return generic_block_bmap(mapping, block, minix_get_block);
+}
+
+
+

All that needs to be done is to implement minix_get_block, which has to translate a block of a file into a block on the device. +If the flag create received as a parameter is set, a new block must be allocated. +In case a new block is created, the bit map must be updated accordingly. +To notify the kernel not to read the block from the disk, bh must be marked with set_buffer_new(). The buffer must be associated with the block through map_bh().

+
+
+
+

Dentry structure

+

Directories operations use the struct dentry structure. +Its main task is to make links between inodes and filenames. +The important fields of this structure are presented below:

+
struct dentry {
+        //...
+        struct inode             *d_inode;     /* associated inode */
+        //...
+        struct dentry            *d_parent;    /* dentry object of parent */
+        struct qstr              d_name;       /* dentry name */
+        //...
+
+        struct dentry_operations *d_op;        /* dentry operations table */
+        struct super_block       *d_sb;        /* superblock of file */
+        void                     *d_fsdata;    /* filesystem-specific data */
+        //...
+};
+
+
+

Fields meaning:

+
+
    +
  • d_inode: the inode referenced by this dentry;
  • +
  • d_parent: the dentry associated with the parent directory;
  • +
  • d_name: a struct qstr structure that contains the fields name and len (the name and the length of the name).
  • +
  • d_op: operations with dentries, represented by the struct dentry_operations structure. +The kernel implements default operations so there is no need to (re)implement them. Some file systems can do optimizations based on the specific structure of the dentries.
  • +
  • d_fsdata: field reserved for the file system that implements dentry operations;
  • +
+
+
+

Dentry operations

+

The most common operations applied to dentries are:

+
+
    +
  • d_make_root: allocates the root dentry. It is generally used in the function that is called to read the superblock (fill_super), which must initialize the root directory. +So the root inode is obtained from the superblock and is used as an argument to this function, to fill the s_root field from the struct super_block structure.
  • +
  • d_add: associates a dentry with an inode; the dentry received as a parameter in the calls discussed above signifies the entry (name, length) that needs to be created. This function will be used when creating/loading a new inode that does not have a dentry associated with it and has not yet been introduced to the hash table of inodes (at lookup);
  • +
  • d_instantiate: The lighter version of the previous call, in which the dentry was previously added in the hash table.
  • +
+
+
+

Warning

+

d_instantiate must be used to implement create calls (mkdir, mknod, rename, symlink) and NOT d_add.

+
+
+
+
+

Directory inodes operations

+

The operations for directory type inodes have a higher complexity level than the ones for files. +The developer must define operations for inodes and operations for files. +In minix, these operations are defined in minix_dir_inode_operations and minix_dir_operations:

+
struct inode_operations minix_dir_inode_operations = {
+      .create = minix_create,
+      .lookup = minix_lookup,
+      .link = minix_link,
+      .unlink = minix_unlink,
+      .symlink = minix_symlink,
+      .mkdir = minix_mkdir,
+      .rmdir = minix_rmdir,
+      .mknod = minix_mknod,
+      //...
+};
+
+struct file_operations minix_dir_operations = {
+      .llseek = generic_file_llseek,
+      .read = generic_read_dir,
+      .iterate = minix_readdir,
+      //...
+};
+
+        //...
+      if (S_ISDIR(inode->i_mode)) {
+              inode->i_op = &minix_dir_inode_operations;
+              inode->i_fop = &minix_dir_operations;
+              inode->i_mapping->a_ops = &minix_aops;
+      }
+       //...
+
+
+

The only function already implemented is generic_read_dir().

+

The functions that implement the operations on directory inodes are the ones described below.

+
+

Creating an inode

+

The inode creation function is indicated by the field create in the inode_operations structure. +In the minix case, the function is minix_create(). +This function is called by the open and creat system calls. Such a function performs the following operations:

+
+
    +
  1. Introduces a new entry into the physical structure on the disk; the update of the bit maps on the disk must not be forgotten.
  2. +
  3. Configures access rights to those received as a parameter.
  4. +
  5. Marks the inode as dirty with the mark_inode_dirty() function.
  6. +
  7. Instantiates the directory entry (dentry) with the d_instantiate function.
  8. +
+
+
+
+

Creating a directory

+

The directory creation function is indicated by the mkdir field in the inode_operations structure. +In the minix case, the function is minix_mkdir(). +This function is called by the mkdir system call. Such a function performs the following operations:

+
+
    +
  1. Calls minix_create().
  2. +
  3. Allocates a data block for the directory.
  4. +
  5. Creates the "." and ".." entries.
  6. +
+
+
+ + + +
+

Deleting a directory

+

The directory delete function is indicated by the rmdir field in the inode_operations structure. +In the minix case, the function is minix_rmdir(). +This function is called by the rmdir system call. +Such a function performs the following operations:

+
+
    +
  1. Performs the operations done by minix_unlink.
  2. +
  3. Ensures that the directory is empty; otherwise, returns ENOTEMPTY.
  4. +
  5. Also deletes the data blocks.
  6. +
+
+
+
+

Searching for an inode in a directory

+

The function that searches for an entry in a directory and extracts the inode is indicated by the lookup field in the inode_operations structure. +In the minix case, the function is minix_lookup. +This function is called indirectly when information about the inode associated with an entry in a directory is needed. +Such a function performs the following operations:

+
+
    +
  1. Searches in the directory indicated by dir the entry having the name dentry->d_name.name.
  2. +
  3. If the entry is found, it will return NULL and associate the inode with the name using the d_add() function.
  4. +
  5. Otherwise, returns ERR_PTR.
  6. +
+
+
+
+

Iterating through entries in a directory

+

The function which iterates through the entries in a directory (lists the directory contents) is indicated by the field iterate in the struct file_operations structure. +In the minix case, the function is minix_readdir. +This function is called by the readdir system call.

+

The function returns either all entries in the directory or just a part of them when the buffer allocated for it is not large enough. +A call of this function can return:

+
+
    +
  • a number equal to the existing number of entries if there is enough space in the corresponding user space buffer;
  • +
  • a number smaller than the actual number of entries, as much as there was space in the corresponding user space buffer;
  • +
  • 0, where there are no more entries to read.
  • +
+
+

The function will be called consecutively until all available entries are read. The function is called at least twice.

+
+
    +
  • It is only called twice if:
      +
    • the first call reads all entries and returns their number;
    • +
    • the second call returns 0, having no other entries to read.
    • +
    +
  • +
  • It is called more than twice if the first call does not return the total number of entries.
  • +
+
+

The function performs the following operations:

+
+
    +
  1. Iterates over the entries (the dentries) from the current directory.
  2. +
  3. For each dentry found, increments ctx->pos.
  4. +
  5. For each valid dentry (an inode other than 0, for example), calls the dir_emit() function.
  6. +
  7. If the dir_emit() function returns a value other than zero, it means that the buffer in the user space is full and the function returns.
  8. +
+
+

The arguments of the dir_emit function are:

+
+
    +
  • ctx is the directory iteration context, passed as an argument to the iterate function;
  • +
  • name is the name of the entry (a string of characters);
  • +
  • name_len is the length of the entry name;
  • +
  • ino is the inode number associated with the entry;
  • +
  • type identifies the entry type: DT_REG (file), DT_DIR (directory), DT_UNKNOWN etc. DT_UNKNOWN can be used when the entry type is unknown.
  • +
+
+
+
+
+

Bitmap operations

+

When working with the file systems, management information (what block is free or busy, what inode is free or busy) is stored using bitmaps. +For this we often need to use bit operations. Such operations are:

+
+
    +
  • searching the first 0 bit: representing a free block or inode
  • +
  • marking a bit as 1: marking a busy block or inode
  • +
+
+

The bitmap operations are found in headers from include/asm-generic/bitops, especially in find.h and atomic.h. Usual functions, with names indicating their role, are:

+
+
    +
  • find_first_zero_bit()
  • +
  • find_first_bit()
  • +
  • set_bit()
  • +
  • clear_bit()
  • +
  • test_and_set_bit()
  • +
  • test_and_clear_bit()
  • +
+
+

These functions usually receive the address of the bitmap, possibly its size (in bits) and, if necessary, the index of the bit that needs to be activated (set) or deactivated (clear).

+

Some usage examples are listed below:

+
unsigned int map;
+unsigned char array_map[NUM_BYTES];
+size_t idx;
+int changed;
+
+/* Find first zero bit in 32 bit integer. */
+idx = find_first_zero_bit(&map, 32);
+printk (KERN_ALERT "The %zu-th bit is the first zero bit.\n", idx);
+
+/* Find first one bit in NUM_BYTES bytes array. */
+idx = find_first_bit(array_map, NUM_BYTES * 8);
+printk (KERN_ALERT "The %zu-th bit is the first one bit.\n", idx);
+
+/*
+ * Clear the idx-th bit in integer.
+ * It is assumed idx is less than the number of bits in integer.
+ */
+clear_bit(idx, &map);
+
+/*
+ * Test and set the idx-th bit in array.
+ * It is assumed idx is less than the number of bits in array.
+ */
+changed = __test_and_set_bit(idx, &sbi->imap);
+if (changed)
+      printk(KERN_ALERT "%zu-th bit changed\n", idx);
+
+
+
+
+

Further reading

+
    +
  1. Robert Love -- Linux Kernel Development, Second Edition -- Chapter +12. The Virtual Filesystem
  2. +
  3. Understanding the Linux Kernel, 3rd edition - Chapter 12. The Virtual +Filesystem
  4. +
  5. Linux Virtual File System (presentation)
  6. +
  7. Understanding Unix/Linux Filesystem
  8. +
  9. Creating Linux virtual filesystems
  10. +
  11. The Linux Documentation Project - VFS
  12. +
  13. The "Virtual File System" in Linux
  14. +
  15. A Linux Filesystem Tutorial
  16. +
  17. The Linux Virtual File System
  18. +
  19. Documentation/filesystems/vfs.txt
  20. +
  21. File systems sources
  22. +
+
+
+

Exercises

+
+

Important

+

We strongly encourage you to use the setup from this repository.

+
+
To solve exercises, you need to perform these steps:
+
    +
  • prepare skeletons from templates
  • +
  • build modules
  • +
  • start the VM and test the module in the VM.
  • +
+
+
+

The current lab name is filesystems. See the exercises for the task name.

+

The skeleton code is generated from full source examples located in +tools/labs/templates. To solve the tasks, start by generating +the skeleton code for a complete lab:

+
tools/labs $ make clean
+tools/labs $ LABS=<lab name> make skels
+
+
+

You can also generate the skeleton for a single task, using

+
tools/labs $ LABS=<lab name>/<task name> make skels
+
+
+

Once the skeleton drivers are generated, build the source:

+
tools/labs $ make build
+
+
+

Then, start the VM:

+
tools/labs $ make console
+
+
+

The modules are placed in /home/root/skels/filesystems/<task_name>.

+

You DO NOT need to STOP the VM when rebuilding modules! +The local skels directory is shared with the VM.

+

Review the Exercises section for more detailed information.

+
+
+

Warning

+

Before starting the exercises or generating the skeletons, please run git pull inside the Linux repo, +to make sure you have the latest version of the exercises.

+

If you have local changes, the pull command will fail. Check for local changes using git status. +If you want to keep them, run git stash before pull and git stash pop after. +To discard the changes, run git reset --hard master.

+

If you already generated the skeleton before git pull you will need to generate it again.

+
+
+

Important

+

In this lab, we will continue the implementation of the file systems started in the previous one. +For this, we will generate the laboratory skeleton using the following command:

+
TODO=5 LABS=filesystems make skels
+
+
+

After this, we will start the implementation from TODO 5.

+
+
+

myfs

+

For the exercises below, we will use the myfs file system whose implementation we started with the previous lab. +We stopped after mounting the file system and now we will continue with the operations for regular files and directories. +At the end of these exercises, we will be able to create, modify and delete regular directories and files.

+

We will mostly use the inode and dentry VFS structures. +The inode structure defines a file (of any type: regular, directory, link), while the dentry structure defines a name, which is an entry in a directory.

+

For this we will access the myfs directory in the lab skeleton. +The previously generated skeleton contains the solution for the previous lab; we will start from this. As in the previous lab, we will use the ramfs file system as a starting point.

+
+

1. Directory operations

+

To begin with, we will implement the operations for working with directories. +The operations of creating a file or deleting a file are also directory operations; these operations result in adding or deleting a directory entry (dentry).

+

At the end of this exercise we will be able to create and delete entries in the file system. We will not be able to read and write to regular files; we will do so in the next exercise.

+

Follow directions marked with TODO 5 which will guide you through the steps you need to take.

+

You will need to specify the following directory operations:

+
+
    +
  • create a file (create function)
  • +
  • search (lookup function)
  • +
  • link (link function)
  • +
  • create directory (mkdir function)
  • +
  • deletion (rmdir and unlink functions)
  • +
  • create node (mknod)
  • +
  • rename (rename function)
  • +
+
+

For this, define the myfs_dir_inode_operations structure in the code, where marked with TODO 5. +To begin, just define the structure myfs_dir_inode_operations; you will define the structures myfs_file_operations, myfs_file_inode_operations , and myfs_aops in the next exercise.

+
+

Tip

+

Read the section Directory inodes operations

+

As a model, you are following the ramfs_dir_inode_operations structure.

+
+

Implement the mkdir, mknod and create operations inside myfs_mkdir, myfs_mknod and myfs_create. +These operations will allow you to create directories and files in the file system.

+
+

Tip

+

We recommend making the code modular using a mknod function, which you can also use for the next exercise. +For inode reading and allocation, use myfs_get_inode, which is already implemented.

+

As a model, follow the next functions implemented in the ramfs file system:

+
+
    +
  • ramfs_mknod()
  • +
  • ramfs_mkdir()
  • +
  • ramfs_create()
  • +
+
+
+

For the other functions, use generic calls (simple_*) already defined in VFS.

+

In the myfs_get_inode function, initialize the operations fields of the directory inodes:

+
+
    +
  • i_op must be initialized to the address of the structure myfs_dir_inode_operations;
  • +
  • i_fop must be initialized to the address of the structure simple_dir_operations, defined in VFS.
  • +
+
+
+

Note

+

i_op is a pointer to a structure of type struct inode_operations containing operations that have to do with the inode, which are, for a directory, creating a new entry, listing entries, deleting entries, etc.

+

i_fop is a pointer to a structure of type struct file_operations containing operations that have to do with the file structure associated with the inode, such as read, write, and lseek.

+
+
+
Testing
+

Once the module is done, we can test the creation of files and directories. +To do this, we compile the kernel module (using make build) and copy the resulting file (myfs.ko) and the test scripts (test-myfs-{1,2}.sh) to the virtual machine directory (using make copy).

+
+

Note

+

The test scripts are copied to the virtual machine using make copy only if they are executable:

+
student@workstation:~/linux/tools/labs$ chmod +x skels/filesystems/myfs/test-myfs-*.sh
+
+
+
+

After starting the virtual machine, insert the module, create the mount point and mount the file system:

+
# insmod myfs.ko
+# mkdir -p /mnt/myfs
+# mount -t myfs none /mnt/myfs
+
+
+

Now we can create file hierarchies and subdirectories in the mounted directory (/mnt/myfs). +We use commands like the ones below:

+
# touch /mnt/myfs/peanuts.txt
+# mkdir -p /mnt/myfs/mountain/forest
+# touch /mnt/myfs/mountain/forest/tree.txt
+# rm /mnt/myfs/mountain/forest/tree.txt
+# rmdir /mnt/myfs/mountain/forest
+
+
+

At this time we can not read or write files. When running commands such as the following ones we will get errors.

+
# echo "chocolate" > /mnt/myfs/peanuts.txt
+# cat /mnt/myfs/peanuts.txt
+
+
+

This happens because we have not implemented the operations for working with files; we will do so further.

+

To unload the kernel module, use the command

+
umount /mnt/myfs
+rmmod myfs
+
+
+

To test the functionality provided by the kernel module, we can use the dedicated script test-myfs-1.sh. +If the implementation is correct, no error messages will be displayed.

+
+
+
+

2. File operations

+

We want to implement the operations for working with files, which are used for accessing a file's content: read, write, truncate, etc. +For this you will specify the operations described in the structures struct inode_operations, struct file_operations and struct address_space_operations.

+

Follow the locations marked with TODO 6 which will guide you through the steps you need to take.

+

Start by defining myfs_file_inode_operations and myfs_file_operations.

+
+

Tip

+

Read the section Regular files inode operations.

+

Use the generic functions provided by VFS.

+

An example of implementation is the ramfs file system. +Follow the implementation of ramfs_file_inode_operations and ramfs_file_operations.

+
+

Inside the function myfs_get_inode, initialize the operations fields for the regular file inodes:

+
+
    +
  • i_op must be initialized to myfs_file_inode_operations;
  • +
  • i_fop must be initialized to myfs_file_operations.
  • +
+
+

Continue with defining the structure myfs_aops.

+
+

Tip

+

Read the section Address space operations.

+

Use the generic functions provided by VFS.

+

An implementation example is the ramfs file system: the ramfs_aops structure.

+

You do not need to define the function of type set_page_dirty.

+
+

Initialize the i_mapping->a_ops field of the inode structure to myfs_aops.

+
+
Testing
+

For testing, we use the steps described in the previous exercise. +In addition to those steps, we will now be able to read, write and modify a file using commands like the ones below:

+
# echo "chocolate" > /mnt/myfs/peanuts.txt
+# cat /mnt/myfs/peanuts.txt
+
+
+

To test the functionality provided by the module, we can use the dedicated script:

+
# ./test-myfs-2.sh
+
+
+

If the implementation is correct, no error messages will be displayed when running the above script.

+
+
+
+
+

minfs

+

For the exercises below, we will use the minfs file system whose development started in the previous lab. +This is a file system with disk support. +We stopped after mounting the file system and now we will continue with the operations on regular files and directories. +At the end of these exercises we will be able to create and delete entries in the file system.

+

We will mainly use the inode and dentry VFS structures. +The inode structure defines a file (of any type: regular, directory, link), while the dentry structure defines a name, which is a directory entry.

+

For this we will access the minfs/kernel directory from the laboratory skeleton. +The generated skeleton contains the solution from the previous lab; we will start from this. +As in the previous lab, we will use the minix file system as a starting point.

+

We will use the formatting tool mkfs.minfs in the minfs/user directory which is automatically compiled when running make build and copied to the virtual machine at make copy.

+

The formatting tool prepares a virtual machine disk using a command like

+
# ./mkfs.minfs /dev/vdb
+
+
+

After formatting, the disk has a structure like the one in the diagram below:

+../_images/minfs_arch1.png +

As shown in the diagram, minfs is a minimalist file system. +minfs contains a maximum of 32 inodes, each inode having a single data block (the file size is limited to block size). +The super block contains a 32-bit map (imap), each bit indicating the use of an inode.

+
+

Note

+

Before you start working, go through the minfs/kernel/minfs.h header file. +This file contains the structures and macros that will be used in these exercises. +These structures and macros define the file system as described in the diagram above.

+
+
+

1. Iterate operation

+

At first we want to be able to list the contents of the root directory. +For this we must be able to read the entries in the root directory, which means implementing the iterate operation. +The iterate operation is a field within the minfs_dir_operations structure (of type file_operations) and is implemented by the function minfs_readdir. We need to implement this function.

+

Follow directions marked with TODO 5 which will guide you through the steps you need to take.

+
+

Tip

+

Read the section Directory inodes operations

+

As a starting point, follow the minix_readdir() function. +The function is rather complicated, but it gives you an insight into the steps you have to do.

+

Follow, in minfs.c and minfs.h, the definitions of structures struct minfs_inode_info, struct minfs_inode and struct minfs_dir_entry. +You will use them in the minfs_readdir implementation.

+
+

Obtain the inode and the structure struct minfs_inode_info associated with the directory. +The structure struct minfs_inode_info is useful to find out the directory's data block. +From this structure you get the data_block field, representing the data block index on the disk.

+
+

Tip

+

To get the struct minfs_inode_info structure, use list_entry() or container_of().

+
+

Use sb_bread() to read the directory data block.

+
+

Tip

+

The data block of the directory is indicated by the data_block field of the structure struct minfs_inode_info corresponding to the directory.

+

The data in the block is referenced by the b_data field of the buffer_head structure (the usual code will be bh->b_data). +This block (being the data block of a directory) contains an array of at most MINFS_NUM_ENTRIES entries of type struct minfs_dir_entry (directory entries specific to minfs). +Use casting to struct minfs_dir_entry * to work with the data in the block.

+
+

Iterate over all the entries in the data block and fill the user space buffer inside the for loop.

+
+

Tip

+

For each index, get the corresponding entry of type struct minfs_dir_entry by using pointer arithmetic on the bh->b_data field. +Ignore dentries that have an ino field equal to 0. Such a dentry is a free slot in the directory's dentry list.

+

For each valid entry, there is an existing call dir_emit() with the appropriate parameters. This is the call that sends the dentries to the caller (and then to user space).

+

Check the call examples in qnx6_readdir() and minix_readdir().

+
+
+
Testing
+

Once the module is done, we can test the listing of the root directory contents. +To do this, we compile the kernel module (make build) and copy the result to the virtual machine together with the test scripts (minfs/user/test-minfs-{0,1}.sh) and the formatting utility (minfs/user/mkfs.minfs) using make copy, then start the machine.

+
+

Note

+

The test scripts are copied to the virtual machine only if they are executable:

+
student@eg106:~/src/linux/tools/labs$ chmod +x skels/filesystems/minfs/user/test-minfs*.sh
+
+
+
+

After we start the virtual machine, we format the /dev/vdb disk, create the mount point and mount the file system:

+
# ./mkfs.minfs /dev/vdb
+# mkdir -p /mnt/minfs
+# mount -t minfs /dev/vdb /mnt/minfs
+
+
+

Now we can list the contents of the root directory:

+
# ls -l /mnt/minfs
+
+
+

We notice that there is already a file (a.txt); it is created by the formatting utility.

+

We also notice that we are not allowed to display information for a file using the ls command. +This is because we have not implemented the lookup function. We will implement it in the next exercise.

+

To test the functionality provided by the module, we can use the dedicated script:

+
# ./test-minfs-0.sh
+# ./test-minfs-1.sh
+
+
+
+
+
+

2. Lookup operation

+

To properly list the contents of a directory, we need to implement the search functionality, i.e. the lookup operation. +The lookup operation is a field within the minfs_dir_inode_operations structure (of type inode_operations) and is implemented by the minfs_lookup function. +This function (minfs_lookup) needs to be implemented. +We will actually implement the minfs_find_entry function called by minfs_lookup.

+

Follow directions marked with TODO 6 which will tell you the steps you need to take.

+
+

Tip

+

Read the section Directory inodes operations

+

As a starting point, read the functions qnx6_find_entry() and minix_find_entry().

+
+

In the minfs_find_entry function, iterate over the directory where the dentry is: dentry->d_parent->d_inode. +Iterating means going through the entries in the directory's data block (of type struct minfs_dir_entry) and locating, if it exists, the requested entry.

+
+

Tip

+

From the structure of type struct minfs_inode_info corresponding to the directory, find out the data block index and read it (sb_bread()). +You will access the block contents using bh->b_data. +The directory data block contains an array of at most MINFS_NUM_ENTRIES entries of type struct minfs_dir_entry. +Use pointer arithmetic to get entries of type struct minfs_dir_entry from the data block (bh->b_data).

+

Check the presence of the name (stored in the local variable name) in the directory (if there is an entry in the data block whose name is a string equal to the given name). Use strcmp() to verify.

+

Ignore dentries that have an ino field equal to 0. Those dentries are free slots in the directory dentry list.

+

Store in the final_de variable the dentry found. +If you do not find any dentry, then the final_de variable will have the value NULL, the value with which it was initialized.

+
+

Comment out the simple_lookup call in the minfs_lookup function to invoke the implementation of minfs_find_entry.

+
+
Testing
+

For testing, we use the steps described in the previous exercise. +The long file listing (ls -l) of the contents of a directory (root directory) will display permissions and other file-specific information:

+
# ls -l /mnt/minfs
+
+
+

To test the functionality provided by the module, we can use the dedicated scripts:

+
# ./test-minfs-0.sh
+# ./test-minfs-1.sh
+
+
+

If the implementation is correct, no error messages will be displayed when running the scripts above.

+
+

Note

+

After mounting the file system using the command

+
# mount -t minfs /dev/vdb /mnt/minfs
+
+
+

we try to create a file using the command

+
# touch /mnt/minfs/peanuts.txt
+
+
+

We notice that we get an error because we did not implement the directory operations that allow us to create a file. +We will do this for the next exercise.

+
+
+
+
+

3. Create operation

+

In order to allow the creation of a file in a directory, we must implement the create operation. +The create operation is a field in the minfs_dir_inode_operations structure (of type inode_operations) and is implemented by the minfs_create function. We need to implement this function. +In fact, we will implement the minfs_new_inode (which creates and initializes an inode) and minfs_add_link which adds a link (or name or dentry) for the created inode.

+

Follow directions marked with TODO 7 which will guide you through the steps you need to take.

+
+

Tip

+

Read the section Directory inodes operations

+

Inspect the code in the minfs_create and the skeleton of functions minfs_new_inode and minfs_add_link.

+
+

Implement the function minfs_new_inode. Inside this function you will create (using new_inode()) and initialize an inode. The initialization is done using the data from disk.

+
+

Tip

+

Use the minix_new_inode() function as a model. +Find the first free inode in imap (sbi->imap). +Use bitwise operations (find_first_zero_bit and set_bit). +Read the Bitmap operations section.

+

The buffer for the superblock (sbi->sbh) must be marked as dirty .

+

You must initialize the usual fields as it is done for the myfs file system. +Initialize the i_mode field to 0 in the call to inode_init_owner. It will be initialized in the caller later.

+
+

Implement the minfs_add_link function. The function adds a new dentry (struct minfs_dir_entry) to the parent directory data block (dentry->d_parent->d_inode).

+
+

Tip

+

Use the minix_add_link function as a model.

+
+

In minfs_add_link we want to find the first free place for the dentry. +For this, you will iterate over the directory data block and you will find the first free entry. A free dentry has the ino field equal to 0.

+
+

Tip

+

In order to work with the directory, get the inode of type struct minfs_inode_info corresponding to the parent directory (the dir inode). +Do not use the variable inode to get struct minfs_inode_info; that inode belongs to the file, not to the parent directory inside which you want to add the link/dentry. +To get the struct minfs_inode_info structure, use container_of().

+

The structure struct minfs_inode_info is useful for finding the directory data block (the one indicated by the dentry->d_parent->d_inode, which is the dir variable). +From this structure, get the data_block field, representing the index of the data block on the disk. +This block contains the entries in the directory. Use sb_bread() to read the block and then bh->b_data to refer to the data. +The block contains at most MINFS_NUM_ENTRIES entries of type struct minfs_dir_entry.

+

If all entries are occupied, return -ENOSPC.

+

Iterate over the entries in the data block using the variable de and extract the first free entry (for which the ino field is 0).

+

When you have found a free place, fill in the corresponding entry:

+
+
    +
  • the inode->i_ino field in de->ino
  • +
  • the dentry->d_name.name field in de->name
  • +
+
+

Then mark the buffer dirty.

+
+
+
Testing
+

For testing, we use the steps described in the previous exercise. +Now we can create files within the file system:

+
# touch /mnt/minfs/peanuts.txt
+
+
+

To test the functionality provided by the module, we can use the dedicated script:

+
# ./test-minfs-2.sh
+
+
+

If the implementation is correct, no error messages will be displayed when running the above script.

+
+

Note

+

The current implementation of the minfs file system is not definitive. +To be complete, the implementation needs functions to delete files, create and delete directories, rename entries, and modify the contents of a file.

+
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec1-intro-slides.html b/refs/pull/405/merge/so2/lec1-intro-slides.html new file mode 100644 index 00000000..9b49f9f8 --- /dev/null +++ b/refs/pull/405/merge/so2/lec1-intro-slides.html @@ -0,0 +1,769 @@ + + + + + + + + SO2 Lecture 01 - Course overview and Linux kernel introduction — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 01 - Course overview and Linux kernel introduction

+ + + + + +
+
+ +

Echipa

+ +
    +
  • Daniel Băluță (Daniel), Răzvan Deaconescu (Răzvan, RD), Claudiu +Ghioc (Claudiu), Valentin Ghiță (Vali), Sergiu Weisz (Sergiu), +Octavian Purdilă (Tavi)
  • +
  • Alexandru Militaru (Alex), Teodora Șerbănescu (Teo), Ștefan +Teodorescu (Ștefan, Fane), Mihai Popescu (Mihai, Mișu), +Constantin Răducanu, Daniel Dinca, Laurențiu Ștefan
  • +
  • Mult succes în noul semestru!
  • +
+ + + + +
+
+ +

Poziționare curs

+ +../_images/ditaa-fbe06955ffc165cbdc9cb6074abf0db807b3c5cd.png + + + + +
+ +
+ +

Comunitate

+ + + + + + +
+
+ +

Notare

+ +
    +
  • 2 puncte activitate la laborator
  • +
  • 3 puncte „examen”, notare pe parcurs
  • +
  • 5 puncte teme de casă
  • +
  • Activități "extra"
  • +
  • Punctajul din teme de casă + activități extra ce depășește 5 +puncte e corelat direct proporțional cu nota de la examen
  • +
  • Tema 0 - 0,5 puncte
  • +
  • Temele 1, 2, 3 - câte 1,5 puncte fiecare
  • +
  • Condiţii de promovare: nota finală 4.5, nota minimă examen 3
  • +
+ + + + +
+
+ +

Obiectivele cursului

+ +
    +
  • Prezentarea structurii interne a unui sistem de operare
  • +
  • Target: sisteme de operare de uz general
  • +
  • Structura și componentele unui kernel monolitic
  • +
  • Procese, FS, Networking
  • +
  • Memory management
  • +
  • Exemplificare pe Linux
  • +
+ + + + +
+
+ +

Obiectivele laboratorului și a temelor

+ +
    +
  • Însușirea cunoștințelor necesare implementării de device drivere
  • +
  • Înțelegerea în profunzime a cunoștințelor prin rezolvarea de +exerciții
  • +
+ + + + +
+
+ +

Cursuri necesare

+ +
    +
  • Programare: C
  • +
  • SD: tabele de dispersie, arbori echilibrați
  • +
  • IOCLA: lucrul cu registre și instrucțiuni de bază (adunări, comparaţii, salturi)
  • +
  • CN: TLB/CAM, memorie, procesor, I/O
  • +
  • PC, RL: ethernet, IP, sockeți
  • +
  • SO: procese, fișiere, thread-uri, memorie virtuală
  • +
+ + + + +
+
+ +

Despre curs

+ +
    +
  • 12 cursuri
  • +
  • interactiv
  • +
  • participaţi la discuţii
  • +
  • întrebaţi atunci când nu aţi înţeles
  • +
  • destul de “dens”, se recomandă călduros parcurgerea suportului bibliografic înainte şi după curs
  • +
  • 1h:20 prezentare + 20min teste și discuții pe marginea testului
  • +
+ + + + +
+
+ +

Lista cursuri

+ +
    +
  • Introducere
  • +
  • Apeluri de sistem
  • +
  • Procese
  • +
  • Întreruperi
  • +
  • Sincronizare
  • +
  • Adresarea memoriei
  • +
+
    +
  • Gestiunea memoriei
  • +
  • Gestiunea fișierelor
  • +
  • Kernel debugging
  • +
  • Gestiunea rețelei
  • +
  • Virtualizare
  • +
  • Kernel profiling
  • +
+
+ + + + +
+
+ +

Despre laborator

+ +
    +
  • Kernel Modules and Device Drivers
  • +
  • 15 min prezentare / 80 de minute lucru
  • +
  • se punctează activitatea
  • +
  • learn by doing
  • +
+ + + + +
+
+ +

Despre teme

+ +
    +
  • necesare: aprofundare API (laborator) și concepte (curs)
  • +
  • teste publice
  • +
  • suport de testare (vmchecker)
  • +
  • relativ puţin cod de scris dar relativ dificile
  • +
  • dificultatea constă în acomodarea cu noul mediu
  • +
+ + + + +
+
+ +

Lista teme

+ +
    +
  • Tema 0 - Kernel API
  • +
  • Kprobe based tracer
  • +
  • Driver pentru portul serial
  • +
  • Software RAID
  • +
  • SO2 Transport Protocol
  • +
+ + + + +
+
+ +

Bibliografie curs

+ +
    +
  • Linux Kernel Development, 3rd edition, Robert Love, Addison +Wesley, 2010
  • +
  • Understanding the Linux Kernel, 3rd edition, Daniel P. Bovet & +Marco Cesati, O'Reilly 2005
  • +
  • Linux Networking Architecture, Klaus Wehrle, Frank Pahlke, +Hartmut Ritter, Daniel Muller, Marc Bechler, Prentice Hall 2004
  • +
  • Understanding Linux Network Internals, Christian Benvenuti, O'Reilly 2005
  • +
+ + + + +
+
+ +

Bibliografie laborator

+ +
    +
  • Linux Device Drivers, 3rd edition, Alessandro Rubini & Jonathan +Corbet, O'Reilly 2006
  • +
  • Linux Kernel in a Nutshell, Greg Kroah-Hartman, O'Reilly 2005
  • +
+ + + + +
+
+ +

Introduction

+ +
    +
  • Basic operating systems terms and concepts
  • +
  • Overview of the Linux kernel
  • +
+ + + + +
+
+ +

User vs Kernel

+ +
    +
  • Execution modes
      +
    • Kernel mode
    • +
    • User mode
    • +
    +
  • +
  • Memory protection
      +
    • Kernel-space
    • +
    • User-space
    • +
    +
  • +
+ + + + +
+
+ +

Typical operating system architecture

+ +../_images/ditaa-48374873962ca32ada36c14ab9a83b60f112a1e0.png + + + + +
+
+ +

Monolithic kernel

+ +../_images/ditaa-3dc899167df5e16a230c434cf5d6964cb5868482.png + + + + +
+
+ +

Micro-kernel

+ +../_images/ditaa-c8a3d93d0109b7be6f608871d16adff4aaa933da.png + + + + +
+
+ +

Monolithic kernels can be modular

+ +
    +
  • Components can be enabled or disabled at compile time
  • +
  • Support of loadable kernel modules (at runtime)
  • +
  • Organize the kernel in logical, independent subsystems
  • +
  • Strict interfaces but with low performance overhead: macros, +inline functions, function pointers
  • +
+ + + + +
+
+ +

"Hybrid" kernels

+ +

Many operating systems and kernel experts have dismissed the label +as meaningless, and just marketing. Linus Torvalds said of this +issue:

+

"As to the whole 'hybrid kernel' thing - it's just marketing. It's +'oh, those microkernels had good PR, how can we try to get good PR +for our working kernel? Oh, I know, let's use a cool name and try +to imply that it has all the PR advantages that that other system +has'."

+ + + + +
+
+ +

Address space

+ +
    +
  • Physical address space
      +
    • RAM and peripheral memory
    • +
    +
  • +
  • Virtual address space
      +
    • How the CPU sees the memory (when in protected / paging mode)
    • +
    • Process address space
    • +
    • Kernel address space
    • +
    +
  • +
+ + + + +
+ +
+ +

Execution contexts

+ +
    +
  • Process context
      +
    • Code that runs in user mode, part of a process
    • +
    • Code that runs in kernel mode, as a result of a system call +issued by a process
    • +
    +
  • +
  • Interrupt context
      +
    • Code that runs as a result of an interrupt
    • +
    • Always runs in kernel mode
    • +
    +
  • +
+ + + + +
+
+ +

Multi-tasking

+ +
    +
  • An OS that supports the "simultaneous" execution of multiple processes
  • +
  • Implemented by fast switching between running processes to allow +the user to interact with each program
  • +
  • Implementation:
      +
    • Cooperative
    • +
    • Preemptive
    • +
    +
  • +
+ + + + +
+
+ +

Preemptive kernel

+ +

Preemptive multitasking and preemptive kernels are different terms.

+

A kernel is preemptive if a process can be preempted while running +in kernel mode.

+

However, note that non-preemptive kernels may support preemptive +multitasking.

+ + + + +
+
+ +

Pageable kernel memory

+ +

A kernel supports pageable kernel memory if parts of kernel memory +(code, data, stack or dynamically allocated memory) can be swapped +to disk.

+ + + + +
+
+ +

Kernel stack

+ +

Each process has a kernel stack that is used to maintain the +function call chain and local variables state while it is executing +in kernel mode, as a result of a system call.

+

The kernel stack is small (4KB - 12 KB) so the kernel developer has +to avoid allocating large structures on stack or recursive calls +that are not properly bounded.

+ + + + +
+
+ +

Portability

+ +
    +
  • Architecture and machine specific code (C & ASM)
  • +
  • Architecture independent code (C):
      +
    • kernel core (further split in multiple subsystems)
    • +
    • device drivers
    • +
    +
  • +
+ + + + +
+
+ +

Asymmetric MultiProcessing (ASMP)

+ +../_images/ditaa-cb16db58a2489307b74d4f70256a48c81c65f6c6.png + + + + +
+
+ +

Symmetric MultiProcessing (SMP)

+ +../_images/ditaa-08aff771b3ff7a5525df7b0c090e28c836502788.png + + + + +
+
+ +

CPU Scalability

+ +
    +
  • Use lock free algorithms when possible
  • +
  • Use fine grained locking for high contention areas
  • +
  • Pay attention to algorithm complexity
  • +
+ + + + +
+
+ +

Linux development model

+ +
    +
  • Open source, GPLv2 License
  • +
  • Contributors: companies, academia and independent developers
  • +
  • Development cycle: 3 – 4 months which consists of a 1 - 2 week +merge window followed by bug fixing
  • +
  • Features are only allowed in the merge window
  • +
  • After the merge window a release candidate is done on a weekly +basis (rc1, rc2, etc.)
  • +
+ + + + +
+
+ +

Maintainer hierarchy

+ +
    +
  • Linus Torvalds is the maintainer of the Linux kernel and merges pull +requests from subsystem maintainers
  • +
  • Each subsystem has one or more maintainers that accept patches or +pull requests from developers or device driver maintainers
  • +
  • Each maintainer has its own git tree, e.g.:
      +
    • Linus Torvalds: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    • +
    • David Miller (networking): git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/
    • +
    +
  • +
  • Each subsystem may maintain a -next tree where developers can submit +patches for the next merge window
  • +
+ + + + +
+
+ +

Linux source code layout

+ +../_images/ditaa-f45246aade5ecc7cfb71f7f103a57f95fc7c2b9e.png + + + + +
+
+ +

Linux kernel architecture

+ +../_images/ditaa-b9ffae65be16d30be11b5eca188a7a143b1b8227.png + + + + +
+
+ +

arch

+ +
    +
  • Architecture specific code
  • +
  • May be further sub-divided in machine specific code
  • +
  • Interfacing with the boot loader and architecture specific +initialization
  • +
  • Access to various hardware bits that are architecture or machine +specific such as interrupt controller, SMP controllers, BUS +controllers, exceptions and interrupt setup, virtual memory handling
  • +
  • Architecture optimized functions (e.g. memcpy, string operations, +etc.)
  • +
+ + + + +
+
+ +

Device drivers

+ +
    +
  • Unified device model
  • +
  • Each subsystem has its own specific driver interfaces
  • +
  • Many device driver types (TTY, serial, SCSI, filesystem, ethernet, +USB, framebuffer, input, sound, etc.)
  • +
+ + + + +
+
+ +

Process management

+ +
    +
  • Unix basic process management and POSIX threads support
  • +
  • Processes and threads are abstracted as tasks
  • +
  • Operating system level virtualization
      +
    • Namespaces
    • +
    • Control groups
    • +
    +
  • +
+ + + + +
+
+ +

Memory management

+ +
    +
  • Management of the physical memory: allocating and freeing memory
  • +
  • Management of the virtual memory: paging, swapping, demand +paging, copy on write
  • +
  • User services: user address space management (e.g. mmap(), brk(), +shared memory)
  • +
  • Kernel services: SL*B allocators, vmalloc
  • +
+ + + + +
+
+ +

Block I/O management

+ +../_images/ditaa-0a96997f269a7a9cd0cdc9c9125f6e62e549be94.png + + + + +
+
+ +

Virtual Filesystem Switch

+ +../_images/ditaa-afa57a07e21b1b842554278abe30fea575278452.png + + + + +
+
+ +

Networking stack

+ +../_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png + + + + +
+
+ +

Linux Security Modules

+ +
    +
  • Hooks to extend the default Linux security model
  • +
  • Used by several Linux security extensions:
      +
    • Security-Enhanced Linux
    • +
    • AppArmor
    • +
    • Tomoyo
    • +
    • Smack
    • +
    +
  • +
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec1-intro.html b/refs/pull/405/merge/so2/lec1-intro.html new file mode 100644 index 00000000..01327636 --- /dev/null +++ b/refs/pull/405/merge/so2/lec1-intro.html @@ -0,0 +1,916 @@ + + + + + + SO2 Lecture 01 - Course overview and Linux kernel introduction — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 01 - Course overview and Linux kernel introduction

+

View slides

+
+

Echipa

+
    +
  • Daniel Băluță (Daniel), Răzvan Deaconescu (Răzvan, RD), Claudiu +Ghioc (Claudiu), Valentin Ghiță (Vali), Sergiu Weisz (Sergiu), +Octavian Purdilă (Tavi)
  • +
  • Alexandru Militaru (Alex), Teodora Șerbănescu (Teo), Ștefan +Teodorescu (Ștefan, Fane), Mihai Popescu (Mihai, Mișu), +Constantin Răducanu, Daniel Dinca, Laurențiu Ștefan
  • +
  • Mult succes în noul semestru!
  • +
+
+
+

Poziționare curs

+../_images/ditaa-fbe06955ffc165cbdc9cb6074abf0db807b3c5cd.png +
+
+

Resurse

+ +
+
+

Comunitate

+ +
+
+

Notare

+
    +
  • 2 puncte activitate la laborator
  • +
  • 3 puncte „examen”, notare pe parcurs
  • +
  • 5 puncte teme de casă
  • +
  • Activități "extra"
  • +
  • Punctajul din teme de casă + activități extra ce depășește 5 +puncte e corelat direct proporțional cu nota de la examen
  • +
  • Tema 0 - 0,5 puncte
  • +
  • Temele 1, 2, 3 - câte 1,5 puncte fiecare
  • +
  • Condiţii de promovare: nota finală 4.5, nota minimă examen 3
  • +
+
+
+

Obiectivele cursului

+
    +
  • Prezentarea structurii interne a unui sistem de operare
  • +
  • Target: sisteme de operare de uz general
  • +
  • Structura și componentele unui kernel monolitic
  • +
  • Procese, FS, Networking
  • +
  • Memory management
  • +
  • Exemplificare pe Linux
  • +
+
+
+

Obiectivele laboratorului si a temelor

+
    +
  • Însușirea cunoștințelor necesare implementării de device drivere
  • +
  • Înțelegerea în profunzime a cunoștințelor prin rezolvarea de +exerciții
  • +
+
+
+

Cursuri necesare

+
    +
  • Programare: C
  • +
  • SD: tabele de dispersie, arbori echilibrați
  • +
  • IOCLA: lucrul cu registre și instrucțiuni de bază (adunări, comparaţii, salturi)
  • +
  • CN: TLB/CAM, memorie, procesor, I/O
  • +
  • PC, RL: ethernet, IP, sockeți
  • +
  • SO: procese, fișiere, thread-uri, memorie virtuală
  • +
+
+
+

Despre curs

+
    +
  • 12 cursuri
  • +
  • interactiv
  • +
  • participaţi la discuţii
  • +
  • întrebaţi atunci când nu aţi înţeles
  • +
  • destul de “dens”, se recomandă călduros parcurgerea suportului bibliografic înainte şi după curs
  • +
  • 1h:20 prezentare + 20min teste și discuții pe marginea testului
  • +
+
+
+

Lista cursuri

+
    +
  • Introducere
  • +
  • Apeluri de sistem
  • +
  • Procese
  • +
  • Întreruperi
  • +
  • Sincronizare
  • +
  • Adresarea memoriei
  • +
+
    +
  • Gestiunea memoriei
  • +
  • Gestiunea fișierelor
  • +
  • Kernel debugging
  • +
  • Gestiunea rețelei
  • +
  • Virtualizare
  • +
  • Kernel profiling
  • +
+
+
+
+

Despre laborator

+
    +
  • Kernel Modules and Device Drivers
  • +
  • 15 min prezentare / 80 de minute lucru
  • +
  • se punctează activitatea
  • +
  • learn by doing
  • +
+
+
+

Despre teme

+
    +
  • necesare: aprofundare API (laborator) și concepte (curs)
  • +
  • teste publice
  • +
  • suport de testare (vmchecker)
  • +
  • relativ puţin cod de scris dar relativ dificile
  • +
  • dificultatea constă în acomodarea cu noul mediu
  • +
+
+
+

Lista teme

+
    +
  • Tema 0 - Kernel API
  • +
  • Kprobe based tracer
  • +
  • Driver pentru portul serial
  • +
  • Software RAID
  • +
  • SO2 Transport Protocol
  • +
+
+
+

Bibliografie curs

+
    +
  • Linux Kernel Development, 3rd edition, Robert Love, Addison +Wesley, 2010
  • +
  • Understanding the Linux Kernel, 3rd edition, Daniel P. Bovet & +Marco Cesati, O'Reilly 2005
  • +
  • Linux Networking Architecture, Klaus Wehrle, Frank Pahlke, +Hartmut Ritter, Daniel Muller, Marc Bechler, Prentice Hall 2004
  • +
  • Understanding Linux Network Internals, Christian Benvenuti, O'Reilly 2005
  • +
+
+
+

Bibliografie laborator

+
    +
  • Linux Device Drivers, 3rd edition, Alessandro Rubini & Jonathan +Corbet, O'Reilly 2006
  • +
  • Linux Kernel in a Nutshell, Greg Kroah-Hartman, O'Reilly 2005
  • +
+
+
+

Lecture objectives:

+
    +
  • Basic operating systems terms and concepts
  • +
  • Overview of the Linux kernel
  • +
+
+
+

Basic operating systems terms and concepts

+
+

User vs Kernel

+

Kernel and user are two terms that are often used in operating +systems. Their definition is pretty straightforward: The kernel is +the part of the operating system that runs with higher privileges +while user (space) usually means applications running with low +privileges.

+

However these terms are heavily overloaded and might have very +specific meanings in some contexts.

+

User mode and kernel mode are terms that may refer specifically to the +processor execution mode. Code that runs in kernel mode can fully +[1] control the CPU while code that runs in user mode has +certain limitations. For example, local CPU interrupts can only be +disabled or enabled while running in kernel mode. If such an operation +is attempted while running in user mode an exception will be generated +and the kernel will take over to handle it.

+ + + + + +
[1]some processors may have even higher privileges than +kernel mode, e.g. a hypervisor mode, that is only +accessible to code running in a hypervisor (virtual +machine monitor)
+

User space and kernel space may refer specifically to memory +protection or to virtual address spaces associated with either the +kernel or user applications.

+

Grossly simplifying, the kernel space is the memory area that is +reserved to the kernel while user space is the memory area reserved to +a particular user process. The kernel space is access protected so +that user applications can not access it directly, while user space +can be directly accessed from code running in kernel mode.

+
+
+

Typical operating system architecture

+

In the typical operating system architecture (see the figure below) +the operating system kernel is responsible for accessing and sharing the +hardware in a secure and fair manner with multiple applications.

+../_images/ditaa-48374873962ca32ada36c14ab9a83b60f112a1e0.png +

The kernel offers a set of APIs that applications issue which are +generally referred to as "System Calls". These APIs are different from +regular library APIs because they are the boundary at which the +execution mode switches from user mode to kernel mode.

+

In order to provide application compatibility, system calls are rarely +changed. Linux particularly enforces this (as opposed to in kernel +APIs that can change as needed).

+

The kernel code itself can be logically separated into core kernel +code and device drivers code. Device drivers code is responsible for +accessing particular devices while the core kernel code is +generic. The core kernel can be further divided into multiple logical +subsystems (e.g. file access, networking, process management, etc.)

+
+
+

Monolithic kernel

+

A monolithic kernel is one where there is no access protection between +the various kernel subsystems and where public functions can be +directly called between various subsystems.

+../_images/ditaa-3dc899167df5e16a230c434cf5d6964cb5868482.png +

However, most monolithic kernels do enforce a logical separation +between subsystems especially between the core kernel and device +drivers with relatively strict APIs (but not necessarily fixed in +stone) that must be used to access services offered by one subsystem +or device drivers. This, of course, depends on the particular kernel +implementation and the kernel's architecture.

+
+
+

Micro kernel

+

A micro-kernel is one where large parts of the kernel are protected +from each other, usually running as services in user space. Because +significant parts of the kernel are now running in user mode, the +remaining code that runs in kernel mode is significantly smaller, hence +the micro-kernel term.

+../_images/ditaa-c8a3d93d0109b7be6f608871d16adff4aaa933da.png +

In a micro-kernel architecture the kernel contains just enough code +that allows for message passing between different running +processes. Practically that means implementing the scheduler and an IPC +mechanism in the kernel, as well as basic memory management to set up +the protection between applications and services.

+

One of the advantages of this architecture is that the services are +isolated and hence bugs in one service won't impact other services.

+

As such, if a service crashes we can just restart it without affecting +the whole system. However, in practice this is difficult to achieve +since restarting a service may affect all applications that depend on +that service (e.g. if the file server crashes all applications with +opened file descriptors would encounter errors when accessing them).

+

This architecture imposes a modular approach to the kernel and offers +memory protection between services but at a cost of performance. What +is a simple function call between two services on monolithic kernels +now requires going through IPC and scheduling which will incur a +performance penalty [2].

+ + + + + +
[2]https://lwn.net/Articles/220255/
+
+
+

Micro-kernels vs monolithic kernels

+

Advocates of micro-kernels often suggest that micro-kernels are +superior because of the modular design a micro-kernel +enforces. However, monolithic kernels can also be modular and there +are several approaches that modern monolithic kernels use toward this +goal:

+
    +
  • Components can be enabled or disabled at compile time
  • +
  • Support of loadable kernel modules (at runtime)
  • +
  • Organize the kernel in logical, independent subsystems
  • +
  • Strict interfaces but with low performance overhead: macros, +inline functions, function pointers
  • +
+

There is a class of operating systems that (used to) claim to be +hybrid kernels, in between monolithic and micro-kernels (e.g. Windows, +Mac OS X). However, since all of the typical monolithic services run +in kernel-mode in these operating systems, there is little merit to +qualify them other than monolithic kernels.

+

Many operating systems and kernel experts have dismissed the label +as meaningless, and just marketing. Linus Torvalds said of this +issue:

+

"As to the whole 'hybrid kernel' thing - it's just marketing. It's +'oh, those microkernels had good PR, how can we try to get good PR +for our working kernel? Oh, I know, let's use a cool name and try +to imply that it has all the PR advantages that that other system +has'."

+
+
+

Address space

+

The address space term is an overloaded term that can have different +meanings in different contexts.

+

The physical address space refers to the way the RAM and device +memories are visible on the memory bus. For example, on 32bit Intel +architecture, it is common to have the RAM mapped into the lower +physical address space while the graphics card memory is mapped high +in the physical address space.

+

The virtual address space (or sometimes just address space) refers to +the way the CPU sees the memory when the virtual memory module is +activated (sometimes called protected mode or paging enabled). The +kernel is responsible for setting up a mapping that creates a virtual +address space in which areas of this space are mapped to certain +physical memory areas.

+

Related to the virtual address space there are two other terms that +are often used: process (address) space and kernel (address) space.

+

The process space is (part of) the virtual address space associated +with a process. It is the "memory view" of processes. It is a +continuous area that starts at zero. Where the process's address space +ends depends on the implementation and architecture.

+

The kernel space is the "memory view" of the code that runs in kernel +mode.

+
+
+

User and kernel sharing the virtual address space

+

A typical implementation for user and kernel spaces is one where the +virtual address space is shared between user processes and the kernel.

+

In this case kernel space is located at the top of the address space, +while user space at the bottom. In order to prevent the user processes +from accessing kernel space, the kernel creates mappings that prevent +access to the kernel space from user mode.

+ +
+
+

Execution contexts

+

One of the most important jobs of the kernel is to service interrupts +and to service them efficiently. This is so important that a special +execution context is associated with it.

+

The kernel executes in interrupt context when it runs as a result of +an interrupt. This includes the interrupt handler, but it is not +limited to it, there are other special (software) constructs that run +in interrupt mode.

+

Code running in interrupt context always runs in kernel mode and there +are certain limitations that the kernel programmer has to be aware of +(e.g. not calling blocking functions or accessing user space).

+

Opposed to interrupt context there is process context. Code that runs +in process context can do so in user mode (executing application code) +or in kernel mode (executing a system call).

+
+
+

Multi-tasking

+

Multitasking is the ability of the operating system to +"simultaneously" execute multiple programs. It does so by quickly +switching between running processes.

+

Cooperative multitasking requires the programs to cooperate to achieve +multitasking. A program will run and relinquish CPU control back +to the OS, which will then schedule another program.

+

With preemptive multitasking the kernel will enforce strict limits for +each process, so that all processes have a fair chance of +running. Each process is allowed to run a time slice (e.g. 100ms) +after which, if it is still running, it is forcefully preempted and +another task is scheduled.

+
+
+

Preemptive kernel

+

Preemptive multitasking and preemptive kernels are different terms.

+

A kernel is preemptive if a process can be preempted while running +in kernel mode.

+

However, note that non-preemptive kernels may support preemptive +multitasking.

+
+
+

Pageable kernel memory

+

A kernel supports pageable kernel memory if parts of kernel memory +(code, data, stack or dynamically allocated memory) can be swapped +to disk.

+
+
+

Kernel stack

+

Each process has a kernel stack that is used to maintain the +function call chain and local variables state while it is executing +in kernel mode, as a result of a system call.

+

The kernel stack is small (4KB - 12 KB) so the kernel developer has +to avoid allocating large structures on stack or recursive calls +that are not properly bounded.

+
+
+

Portability

+

In order to increase portability across various architectures and +hardware configurations, modern kernels are organized as follows at the +top level:

+
    +
  • Architecture and machine specific code (C & ASM)
  • +
  • Independent architecture code (C):
      +
    • kernel core (further split in multiple subsystems)
    • +
    • device drivers
    • +
    +
  • +
+

This makes it easier to reuse code as much as possible between +different architectures and machine configurations.

+
+
+

Asymmetric MultiProcessing (ASMP)

+

Asymmetric MultiProcessing (ASMP) is a way of supporting multiple +processors (cores) by a kernel, where a processor is dedicated to the +kernel and all other processors run user space programs.

+

The disadvantage of this approach is that the kernel throughput +(e.g. system calls, interrupt handling, etc.) does not scale with the +number of processors. Since typical processes frequently use system +calls, the scalability of the approach is limited to very specific +systems (e.g. scientific applications).

+../_images/ditaa-cb16db58a2489307b74d4f70256a48c81c65f6c6.png +
+
+

Symmetric MultiProcessing (SMP)

+

As opposed to ASMP, in SMP mode the kernel can run on any of the +existing processors, just as user processes. This approach is more +difficult to implement, because it creates race conditions in the +kernel if two processes run kernel functions that access the same +memory locations.

+

In order to support SMP the kernel must implement synchronization +primitives (e.g. spin locks) to guarantee that only one processor is +executing a critical section.

+../_images/ditaa-08aff771b3ff7a5525df7b0c090e28c836502788.png +
+
+

CPU Scalability

+

CPU scalability refers to how well the performance scales with +the number of cores. There are a few things that the kernel developer +should keep in mind with regard to CPU scalability:

+
    +
  • Use lock free algorithms when possible
  • +
  • Use fine grained locking for high contention areas
  • +
  • Pay attention to algorithm complexity
  • +
+
+
+
+

Overview of the Linux kernel

+
+

Linux development model

+

The Linux kernel is one of the largest open source projects in the world +with thousands of developers contributing code and millions of lines of +code changed for each release.

+

It is distributed under the GPLv2 license, which simply put, +requires that any modification of the kernel done on software that is +shipped to customers should be made available to them (the customers), +although in practice most companies make the source code publicly +available.

+

There are many companies (often competing) that contribute code to the +Linux kernel as well as people from academia and independent +developers.

+

The current development model is based on doing releases at fixed +intervals of time (usually 3 - 4 months). New features are merged into +the kernel during a one or two week merge window. After the merge +window, a release candidate is done on a weekly basis (rc1, rc2, etc.)

+
+
+

Maintainer hierarchy

+

In order to scale the development process, Linux uses a hierarchical +maintainership model:

+
    +
  • Linus Torvalds is the maintainer of the Linux kernel and merges pull +requests from subsystem maintainers
  • +
  • Each subsystem has one or more maintainers that accept patches or +pull requests from developers or device driver maintainers
  • +
  • Each maintainer has its own git tree, e.g.:
      +
    • Linus Torvalds: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    • +
    • David Miller (networking): git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/
    • +
    +
  • +
  • Each subsystem may maintain a -next tree where developers can submit +patches for the next merge window
  • +
+

Since the merge window is only a maximum of two weeks, most of the +maintainers have a -next tree where they accept new features from +developers or maintainers downstream even while the merge window +is closed.

+

Note that bug fixes are accepted even outside the merge window in the +maintainer's tree from where they are periodically pulled by the +upstream maintainer, for every release candidate.

+
+
+

Linux source code layout

+../_images/ditaa-f45246aade5ecc7cfb71f7f103a57f95fc7c2b9e.png +

These are the top level of the Linux source code folders:

+
    +
  • arch - contains architecture specific code; each architecture is +implemented in a specific sub-folder (e.g. arm, arm64, x86)
  • +
  • block - contains the block subsystem code that deals with reading +and writing data from block devices: creating block I/O requests, +scheduling them (there are several I/O schedulers available), +merging requests, and passing them down through the I/O stack to the +block device drivers
  • +
  • certs - implements support for signature checking using certificates
  • +
  • crypto - software implementation of various cryptography algorithms +as well as a framework that allows offloading such algorithms in +hardware
  • +
  • Documentation - documentation for various subsystems, Linux kernel +command line options, description for sysfs files and format, device +tree bindings (supported device tree nodes and format)
  • +
  • drivers - driver for various devices as well as the Linux driver +model implementation (an abstraction that describes drivers, devices +buses and the way they are connected)
  • +
  • firmware - binary or hex firmware files that are used by various +device drivers
  • +
  • fs - home of the Virtual Filesystem Switch (generic filesystem code) +and of various filesystem drivers
  • +
  • include - header files
  • +
  • init - the generic (as opposed to architecture specific) +initialization code that runs during boot
  • +
  • ipc - implementation for various Inter Process Communication system +calls such as message queue, semaphores, shared memory
  • +
  • kernel - process management code (including support for kernel +thread, workqueues), scheduler, tracing, time management, generic +irq code, locking
  • +
  • lib - various generic functions such as sorting, checksums, +compression and decompression, bitmap manipulation, etc.
  • +
  • mm - memory management code, for both physical and virtual memory, +including the page, SL*B and CMA allocators, swapping, virtual memory +mapping, process address space manipulation, etc.
  • +
  • net - implementation for various network stacks including IPv4 and +IPv6; BSD socket implementation, routing, filtering, packet +scheduling, bridging, etc.
  • +
  • samples - various driver samples
  • +
  • scripts - parts of the build system, scripts used for building modules, +kconfig the Linux kernel configurator, as well as various other +scripts (e.g. checkpatch.pl that checks if a patch conforms to +the Linux kernel coding style)
  • +
  • security - home of the Linux Security Module framework that allows +extending the default (Unix) security model as well as +implementation for multiple such extensions such as SELinux, smack, +apparmor, tomoyo, etc.
  • +
  • sound - home of ALSA (Advanced Linux Sound Architecture) as well as the +old Linux sound framework (OSS)
  • +
  • tools - various user space tools for testing or interacting with +Linux kernel subsystems
  • +
  • usr - support for embedding an initrd file in the kernel image
  • +
  • virt - home of the KVM (Kernel Virtual Machine) hypervisor
  • +
+
+
+

Linux kernel architecture

+../_images/ditaa-b9ffae65be16d30be11b5eca188a7a143b1b8227.png +
+

arch

+
    +
  • Architecture specific code
  • +
  • May be further sub-divided in machine specific code
  • +
  • Interfacing with the boot loader and architecture specific +initialization
  • +
  • Access to various hardware bits that are architecture or machine +specific such as interrupt controller, SMP controllers, BUS +controllers, exceptions and interrupt setup, virtual memory handling
  • +
  • Architecture optimized functions (e.g. memcpy, string operations, +etc.)
  • +
+

This part of the Linux kernel contains architecture specific code and +may be further sub-divided in machine specific code for certain +architectures (e.g. arm).

+

"Linux was first developed for 32-bit x86-based PCs (386 or +higher). These days it also runs on (at least) the Compaq Alpha AXP, +Sun SPARC and UltraSPARC, Motorola 68000, PowerPC, PowerPC64, ARM, +Hitachi SuperH, IBM S/390, MIPS, HP PA-RISC, Intel IA-64, DEC VAX, AMD +x86-64 and CRIS architectures."

+

It implements access to various hardware bits that are architecture or +machine specific such as interrupt controller, SMP controllers, BUS +controllers, exceptions and interrupt setup, virtual memory handling.

+

It also implements architecture optimized functions (e.g. memcpy, +string operations, etc.)

+
+
+

Device drivers

+

The Linux kernel uses a unified device model whose purpose is to +maintain internal data structures that reflect the state and structure +of the system. Such information includes what devices are present, +what is their status, what bus they are attached to, to what driver +they are attached, etc. This information is essential for implementing +system wide power management, as well as device discovery and dynamic +device removal.

+

Each subsystem has its own specific driver interface that is tailored +to the devices it represents in order to make it easier to write +correct drivers and to reduce code duplication.

+

Linux supports one of the most diverse sets of device driver types, +some examples are: TTY, serial, SCSI, filesystem, ethernet, USB, +framebuffer, input, sound, etc.

+
+
+

Process management

+

Linux implements the standard Unix process management APIs such as +fork(), exec(), wait(), as well as standard POSIX threads.

+

However, Linux processes and threads are implemented quite +differently than in other kernels. There are no internal structures +implementing processes or threads, instead there is a struct +task_struct that describes an abstract scheduling unit called task.

+

A task has pointers to resources, such as address space, file +descriptors, IPC ids, etc. The resource pointers for tasks that are +part of the same process point to the same resources, while resources +of tasks of different processes will point to different resources.

+

This peculiarity, together with the clone() and unshare() system +calls, allows for implementing new features such as namespaces.

+

Namespaces are used together with control groups (cgroup) to implement +operating system virtualization in Linux.

+

cgroup is a mechanism to organize processes hierarchically and +distribute system resources along the hierarchy in a controlled and +configurable manner.

+
+
+

Memory management

+

Linux memory management is a complex subsystem that deals with:

+
    +
  • Management of the physical memory: allocating and freeing memory
  • +
  • Management of the virtual memory: paging, swapping, demand +paging, copy on write
  • +
  • User services: user address space management (e.g. mmap(), brk(), +shared memory)
  • +
  • Kernel services: SL*B allocators, vmalloc
  • +
+
+
+

Block I/O management

+

The Linux Block I/O subsystem deals with reading and writing data from +or to block devices: creating block I/O requests, transforming block I/O +requests (e.g. for software RAID or LVM), merging and sorting the +requests and scheduling them via various I/O schedulers to the block +device drivers.

+../_images/ditaa-0a96997f269a7a9cd0cdc9c9125f6e62e549be94.png +
+
+

Virtual Filesystem Switch

+

The Linux Virtual Filesystem Switch implements common / generic +filesystem code to reduce duplication in filesystem drivers. It +introduces certain filesystem abstractions such as:

+
    +
  • inode - describes the file on disk (attributes, location of data +blocks on disk)
  • +
  • dentry - links an inode to a name
  • +
  • file - describes the properties of an opened file (e.g. file +pointer)
  • +
  • superblock - describes the properties of a formatted filesystem +(e.g. number of blocks, block size, location of root directory on +disk, encryption, etc.)
  • +
+../_images/ditaa-afa57a07e21b1b842554278abe30fea575278452.png +

The Linux VFS also implements a complex caching mechanism which +includes the following:

+
    +
  • the inode cache - caches the file attributes and internal file +metadata
  • +
  • the dentry cache - caches the directory hierarchy of a filesystem
  • +
  • the page cache - caches file data blocks in memory
  • +
+
+
+

Networking stack

+../_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png +
+
+

Linux Security Modules

+
    +
  • Hooks to extend the default Linux security model
  • +
  • Used by several Linux security extensions:
      +
    • Security Enhanced Linux
    • +
    • AppArmor
    • +
    • Tomoyo
    • +
    • Smack
    • +
    +
  • +
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec10-networking-slides.html b/refs/pull/405/merge/so2/lec10-networking-slides.html new file mode 100644 index 00000000..8bca6f1a --- /dev/null +++ b/refs/pull/405/merge/so2/lec10-networking-slides.html @@ -0,0 +1,524 @@ + + + + + + + + SO2 Lecture 10 - Networking — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 10 - Networking

+ + + + + +
+
+ +

Network Management

+ +
    +
  • Socket implementation
  • +
  • Routing implementation
  • +
  • Network Device Interface
  • +
  • Hardware and Software Acceleration Techniques
  • +
+ + + + +
+
+ +

Network Management Overview

+ +../_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png + + + + +
+
+ +

Sockets Implementation Overview

+ +../_images/ditaa-79e3734c36891f6c04d684aa5caa39f76915dbaf.png + + + + +
+
+ +

Sockets Families and Protocols

+ +../_images/ditaa-bf1244d1a5c3d99bd8d40148d81cb3e5748c0b94.png + + + + +
+
+ +

Example: UDP send

+ +
char c;
+struct sockaddr_in addr;
+int s;
+
+s = socket(AF_INET, SOCK_DGRAM, 0);
+connect(s, (struct sockaddr*)&addr, sizeof(addr));
+write(s, &c, 1);
+close(s);
+
+
+ + + + +
+
+ +

Example: UDP send

+ +../_images/ditaa-ee04e3e544de75375b914f7645c79d5ae46fe6f3.png + + + + +
+
+ +

Network processing phases

+ +
    +
  • Interrupt handler - device driver fetches data from the RX ring, +creates a network packet and queues it to the network stack for +processing
  • +
  • NET_SOFTIRQ - packet goes through the stack layer and it is +processed: decapsulate Ethernet frame, check IP packet and route +it, if local packet decapsulate protocol packet (e.g. TCP) and +queues it to a socket
  • +
  • Process context - application fetches data from the socket queue +or pushes data to the socket queue
  • +
+ + + + +
+
+ +

Packet Routing

+ +../_images/ditaa-528948c80a3fd78b89fb6f7bd69503a58b93a4ae.png + + + + +
+
+ +

Routing Table

+ +
tavi@desktop-tavi:~/src/linux$ ip route list table main
+default via 172.30.240.1 dev eth0
+172.30.240.0/20 dev eth0 proto kernel scope link src 172.30.249.241
+
+tavi@desktop-tavi:~/src/linux$ ip route list table local
+broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1
+local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1
+local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1
+broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1
+broadcast 172.30.240.0 dev eth0 proto kernel scope link src 172.30.249.241
+local 172.30.249.241 dev eth0 proto kernel scope host src 172.30.249.241
+broadcast 172.30.255.255 dev eth0 proto kernel scope link src 172.30.249.241
+
+tavi@desktop-tavi:~/src/linux$ ip rule list
+0:      from all lookup local
+32766:  from all lookup main
+32767:  from all lookup default
+
+
+ + + + +
+
+ +

Routing Policy Database

+ +
    +
  • "Regular" routing only uses the destination address
  • +
  • To increase flexibility a "Routing Policy Database" is used that +allows different routing based on other fields such as the source +address, protocol type, transport ports, etc.
  • +
  • This is encoded as a list of rules that are evaluated based on +their priority (priority 0 is the highest)
  • +
  • Each rule has a selector (how to match the packet) and an +action (what action to take if the packet matches)
  • +
  • Selectors: source address, destination address, type of service (TOS), +input interface, output interface, etc.
  • +
  • Action: lookup / unicast - use given routing table, blackhole - +drop packet, unreachable - send ICMP unreachable message and drop +packet, etc.
  • +
+ + + + +
+
+ +

Routing table processing

+ +
    +
  • Special table for local addresses -> route packets to sockets +based on family, type, ports
  • +
  • Check every routing entry, starting with the most specific +routes (e.g. 192.168.0.0/24 is checked before 192.168.0.0/16)
  • +
  • A route matches if the packet destination address logically ANDed +with the subnet mask equals the subnet address
  • +
  • Once a route matches the following information is retrieved: +interface, link layer next-hop address, network next host address
  • +
+ + + + +
+
+ +

Forward Information Database (removed in 3.6)

+ +

 

+../_images/fidb-overview1.png + + + + +
+
+ +

Forward Information Database (removed in 3.6)

+ +../_images/fidb-details1.png + + + + +
+
+ +

Routing Cache (removed in 3.6)

+ +

 

+../_images/routing-cache1.png + + + + +
+
+ +

FIB TRIE

+ +

 

+../_images/fib-trie1.png + + + + +
+
+ +

Compressed Trie

+ +

 

+../_images/fib-trie-compressed1.png + + + + +
+
+ +

Netfilter

+ +
    +
  • Framework that implements packet filtering and NAT
  • +
  • It uses hooks inserted in key places in the packet flow:
      +
    • NF_IP_PRE_ROUTING
    • +
    • NF_IP_LOCAL_IN
    • +
    • NF_IP_FORWARD
    • +
    • NF_IP_LOCAL_OUT
    • +
    • NF_IP_POST_ROUTING
    • +
    • NF_IP_NUMHOOKS
    • +
    +
  • +
+ + + + +
+
+ +

Network packets (skbs)

+ +../_images/skb1.png + + + + +
+
+ +

struct sk_buff

+ +
struct sk_buff {
+    struct sk_buff *next;
+    struct sk_buff *prev;
+
+    struct sock *sk;
+    ktime_t tstamp;
+    struct net_device *dev;
+    char cb[48];
+
+    unsigned int len,
+    data_len;
+    __u16 mac_len,
+    hdr_len;
+
+    void (*destructor)(struct sk_buff *skb);
+
+    sk_buff_data_t transport_header;
+    sk_buff_data_t network_header;
+    sk_buff_data_t mac_header;
+    sk_buff_data_t tail;
+    sk_buff_data_t end;
+
+    unsigned char *head,
+    *data;
+    unsigned int truesize;
+    atomic_t users;
+
+
+ + + + +
+
+ +

skb APIs

+ +
/* reserve head room */
+void skb_reserve(struct sk_buff *skb, int len);
+
+/* add data to the end */
+unsigned char *skb_put(struct sk_buff *skb, unsigned int len);
+
+/* add data to the top */
+unsigned char *skb_push(struct sk_buff *skb, unsigned int len);
+
+/* discard data at the top */
+unsigned char *skb_pull(struct sk_buff *skb, unsigned int len);
+
+/* discard data at the end */
+unsigned char *skb_trim(struct sk_buff *skb, unsigned int len);
+
+unsigned char *skb_transport_header(const struct sk_buff *skb);
+
+void skb_reset_transport_header(struct sk_buff *skb);
+
+void skb_set_transport_header(struct sk_buff *skb, const int offset);
+
+unsigned char *skb_network_header(const struct sk_buff *skb);
+
+void skb_reset_network_header(struct sk_buff *skb);
+
+void skb_set_network_header(struct sk_buff *skb, const int offset);
+
+unsigned char *skb_mac_header(const struct sk_buff *skb);
+
+int skb_mac_header_was_set(const struct sk_buff *skb);
+
+void skb_reset_mac_header(struct sk_buff *skb);
+
+void skb_set_mac_header(struct sk_buff *skb, const int offset);
+
+
+ + + + +
+
+ +

skb data management

+ +

 

+../_images/ditaa-91073cb05a3f537eb54ab10745c307531e6795a0.png + + + + +
+
+ +

Network Device Interface

+ +../_images/net-dev-hw1.png + + + + +
+
+ +

Advanced features

+ +
    +
  • Scatter-Gather
  • +
  • Checksum offloading: Ethernet, IP, UDP, TCP
  • +
  • Adaptive interrupt handling (coalescence, adaptive)
  • +
+ + + + +
+
+ +

TCP offload

+ +
    +
  • Full offload - Implement TCP/IP stack in hardware
  • +
  • Issues:
      +
    • Scaling number of connections
    • +
    • Security
    • +
    • Conformance
    • +
    +
  • +
+ + + + +
+
+ +

Performance observation

+ +
    +
  • Performance is proportional to the number of packets to be +processed
  • +
  • Example: if an end-point can process 60K pps
      +
    • 1538 MSS -> 738Mbps
    • +
    • 2038 MSS -> 978Mbps
    • +
    • 9038 MSS -> 4.3Gbps
    • +
    • 20738 MSS -> 9.9Gbps
    • +
    +
  • +
+ + + + +
+
+ +

Stateless offload

+ +
    +
  • The networking stack processes large packets
  • +
  • TX path: the hardware splits large packets into smaller packets +(TCP Segmentation Offload)
  • +
  • RX path: the hardware aggregates small packets into larger +packets (Large Receive Offload - LRO)
  • +
+ + + + +
+
+ +

TCP Segmentation Offload

+ +../_images/tso1.png + + + + +
+
+ +

Large Receive Offload

+ +../_images/lro1.png + + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec10-networking.html b/refs/pull/405/merge/so2/lec10-networking.html new file mode 100644 index 00000000..fc62e945 --- /dev/null +++ b/refs/pull/405/merge/so2/lec10-networking.html @@ -0,0 +1,462 @@ + + + + + + SO2 Lecture 10 - Networking — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 10 - Networking

+

View slides

+
+

Lecture objectives:

+
    +
  • Socket implementation
  • +
  • Routing implementation
  • +
  • Network Device Interface
  • +
  • Hardware and Software Acceleration Techniques
  • +
+
+
+

Network Management Overview

+../_images/ditaa-a2ded49c8b739635d6742479583443fb10ad120a.png +
+
+

Sockets Implementation Overview

+../_images/ditaa-79e3734c36891f6c04d684aa5caa39f76915dbaf.png +
+
+

Sockets Families and Protocols

+../_images/ditaa-bf1244d1a5c3d99bd8d40148d81cb3e5748c0b94.png +
+

Example: UDP send

+
char c;
+struct sockaddr_in addr;
+int s;
+
+s = socket(AF_INET, SOCK_DGRAM, 0);
+connect(s, (struct sockaddr*)&addr, sizeof(addr));
+write(s, &c, 1);
+close(s);
+
+
+../_images/ditaa-ee04e3e544de75375b914f7645c79d5ae46fe6f3.png +
+
+
+

Network processing phases

+
    +
  • Interrupt handler - device driver fetches data from the RX ring, +creates a network packet and queues it to the network stack for +processing
  • +
  • NET_SOFTIRQ - packet goes through the stack layer and it is +processed: decapsulate Ethernet frame, check IP packet and route +it, if local packet decapsulate protocol packet (e.g. TCP) and +queues it to a socket
  • +
  • Process context - application fetches data from the socket queue +or pushes data to the socket queue
  • +
+
+
+

Packet Routing

+../_images/ditaa-528948c80a3fd78b89fb6f7bd69503a58b93a4ae.png +
+

Routing Table(s)

+
tavi@desktop-tavi:~/src/linux$ ip route list table main
+default via 172.30.240.1 dev eth0
+172.30.240.0/20 dev eth0 proto kernel scope link src 172.30.249.241
+
+tavi@desktop-tavi:~/src/linux$ ip route list table local
+broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1
+local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1
+local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1
+broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1
+broadcast 172.30.240.0 dev eth0 proto kernel scope link src 172.30.249.241
+local 172.30.249.241 dev eth0 proto kernel scope host src 172.30.249.241
+broadcast 172.30.255.255 dev eth0 proto kernel scope link src 172.30.249.241
+
+tavi@desktop-tavi:~/src/linux$ ip rule list
+0:      from all lookup local
+32766:  from all lookup main
+32767:  from all lookup default
+
+
+
+
+

Routing Policy Database

+
    +
  • "Regular" routing only uses the destination address
  • +
  • To increase flexibility a "Routing Policy Database" is used that +allows different routing based on other fields such as the source +address, protocol type, transport ports, etc.
  • +
  • This is encoded as a list of rules that are evaluated based on +their priority (priority 0 is the highest)
  • +
  • Each rule has a selector (how to match the packet) and an +action (what action to take if the packet matches)
  • +
  • Selectors: source address, destination address, type of service (TOS), +input interface, output interface, etc.
  • +
  • Action: lookup / unicast - use given routing table, blackhole - +drop packet, unreachable - send ICMP unreachable message and drop +packet, etc.
  • +
+
+
+

Routing table processing

+
    +
  • Special table for local addresses -> route packets to sockets +based on family, type, ports
  • +
  • Check every routing entry for a match, starting with the most specific +routes (e.g. 192.168.0.0/24 is checked before 192.168.0.0/16)
  • +
  • A route matches if the packet destination address logical ANDed +with the subnet mask equals the subnet address
  • +
  • Once a route matches the following information is retrieved: +interface, link layer next-hop address, network next host address
  • +
+
+
+

Forwarding Information Database

+

 

+../_images/fidb-overview1.png +../_images/fidb-details1.png +

 

+../_images/routing-cache1.png +

 

+../_images/fib-trie1.png +

 

+../_images/fib-trie-compressed1.png +
+
+
+

Netfilter

+
    +
  • Framework that implements packet filtering and NAT
  • +
  • It uses hooks inserted in key places in the packet flow:
      +
    • NF_IP_PRE_ROUTING
    • +
    • NF_IP_LOCAL_IN
    • +
    • NF_IP_FORWARD
    • +
    • NF_IP_LOCAL_OUT
    • +
    • NF_IP_POST_ROUTING
    • +
    • NF_IP_NUMHOOKS
    • +
    +
  • +
+
+
+

Network packets / skbs (struct sk_buff)

+../_images/skb1.png +
struct sk_buff {
+    struct sk_buff *next;
+    struct sk_buff *prev;
+
+    struct sock *sk;
+    ktime_t tstamp;
+    struct net_device *dev;
+    char cb[48];
+
+    unsigned int len,
+    data_len;
+    __u16 mac_len,
+    hdr_len;
+
+    void (*destructor)(struct sk_buff *skb);
+
+    sk_buff_data_t transport_header;
+    sk_buff_data_t network_header;
+    sk_buff_data_t mac_header;
+    sk_buff_data_t tail;
+    sk_buff_data_t end;
+
+    unsigned char *head,
+    *data;
+    unsigned int truesize;
+    atomic_t users;
+
+
+
/* reserve head room */
+void skb_reserve(struct sk_buff *skb, int len);
+
+/* add data to the end */
+unsigned char *skb_put(struct sk_buff *skb, unsigned int len);
+
+/* add data to the top */
+unsigned char *skb_push(struct sk_buff *skb, unsigned int len);
+
+/* discard data at the top */
+unsigned char *skb_pull(struct sk_buff *skb, unsigned int len);
+
+/* discard data at the end */
+unsigned char *skb_trim(struct sk_buff *skb, unsigned int len);
+
+unsigned char *skb_transport_header(const struct sk_buff *skb);
+
+void skb_reset_transport_header(struct sk_buff *skb);
+
+void skb_set_transport_header(struct sk_buff *skb, const int offset);
+
+unsigned char *skb_network_header(const struct sk_buff *skb);
+
+void skb_reset_network_header(struct sk_buff *skb);
+
+void skb_set_network_header(struct sk_buff *skb, const int offset);
+
+unsigned char *skb_mac_header(const struct sk_buff *skb);
+
+int skb_mac_header_was_set(const struct sk_buff *skb);
+
+void skb_reset_mac_header(struct sk_buff *skb);
+
+void skb_set_mac_header(struct sk_buff *skb, const int offset);
+
+
+

 

+../_images/ditaa-91073cb05a3f537eb54ab10745c307531e6795a0.png +
+
+

Network Device

+../_images/net-dev-hw1.png +
    +
  • Scatter-Gather
  • +
  • Checksum offloading: Ethernet, IP, UDP, TCP
  • +
  • Adaptive interrupt handling (coalescence, adaptive)
  • +
+
+
+

Hardware and Software Acceleration Techniques

+
    +
  • Full offload - Implement TCP/IP stack in hardware
  • +
  • Issues:
      +
    • Scaling number of connections
    • +
    • Security
    • +
    • Conformance
    • +
    +
  • +
+
    +
  • Performance is proportional to the number of packets to be +processed
  • +
  • Example: if an end-point can process 60K pps
      +
    • 1538 MSS -> 738Mbps
    • +
    • 2038 MSS -> 978Mbps
    • +
    • 9038 MSS -> 4.3Gbps
    • +
    • 20738 MSS -> 9.9Gbps
    • +
    +
  • +
+
    +
  • The networking stack processes large packets
  • +
  • TX path: the hardware splits large packets into smaller packets +(TCP Segmentation Offload)
  • +
  • RX path: the hardware aggregates small packets into larger +packets (Large Receive Offload - LRO)
  • +
+../_images/tso1.png +../_images/lro1.png +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec11-arch-slides.html b/refs/pull/405/merge/so2/lec11-arch-slides.html new file mode 100644 index 00000000..374b61eb --- /dev/null +++ b/refs/pull/405/merge/so2/lec11-arch-slides.html @@ -0,0 +1,254 @@ + + + + + + + + SO2 Lecture 11 - Architecture Layer — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 11 - Architecture Layer

+ + + + + +
+
+ +

Introduction

+ +
    +
  • Overview of the arch layer
  • +
  • Overview of the boot process
  • +
+ + + + +
+
+ +

Overview of the arch layer

+ +../_images/ditaa-ae895f3a8e26b92bf6c6ecbbd71e2c88912d5607.png + + + + +
+
+ +

Bootstrap

+ +
    +
  • The first kernel code that runs
  • +
  • Typically runs with the MMU disabled
  • +
  • Move / Relocate kernel code
  • +
+ + + + +
+
+ +

Bootstrap

+ +
    +
  • The first kernel code that runs
  • +
  • Typically runs with the MMU disabled
  • +
  • Copy bootloader arguments and determine kernel run location
  • +
  • Move / relocate kernel code to final location
  • +
  • Initial MMU setup - map the kernel
  • +
+ + + + +
+
+ +

Memory Setup

+ +
    +
  • Determine available memory and setup the boot memory allocator
  • +
  • Manages memory regions before the page allocator is setup
  • +
  • Bootmem - used a bitmap to track free blocks
  • +
  • Memblock - deprecates bootmem and adds support for memory ranges
      +
    • Supports both physical and virtual addresses
    • +
    • Supports NUMA architectures
    • +
    +
  • +
+ + + + +
+
+ +

MMU management

+ +
    +
  • Implements the generic page table manipulation APIs: types, +accessors, flags
  • +
  • Implement TLB management APIs: flush, invalidate
  • +
+ + + + +
+
+ +

Thread Management

+ +
    +
  • Defines the thread type (struct thread_info) and implements +functions for allocating threads (if needed)
  • +
  • Implement copy_thread() and switch_context()
  • +
+ + + + +
+
+ +

Timer Management

+ +
    +
  • Setup the timer tick and provide a time source
  • +
  • Mostly transitioned to platform drivers
      +
    • clock_event_device - for scheduling timers
    • +
    • clocksource - for reading the time
    • +
    +
  • +
+ + + + +
+
+ +

IRQs and exception management

+ +
    +
  • Define interrupt and exception handlers / entry points
  • +
  • Setup priorities
  • +
  • Platform drivers for interrupt controllers
  • +
+ + + + +
+
+ +

System calls

+ +
    +
  • Define system call entry point(s)
  • +
  • Implement user-space access primitives (e.g. copy_to_user)
  • +
+ + + + +
+
+ +

Platform Drivers

+ +
    +
  • Platform and architecture specific drivers
  • +
  • Bindings to platform device enumeration methods (e.g. device tree +or ACPI)
  • +
+ + + + +
+
+ +

Machine specific code

+ +
    +
  • Some architectures use a "machine" / "platform" abstraction
  • +
  • Typical for architectures used in embedded systems with a lot of +variety (e.g. ARM, PowerPC)
  • +
+ + + + +
+
+ +

Boot flow inspection

+ + + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec11-arch.html b/refs/pull/405/merge/so2/lec11-arch.html new file mode 100644 index 00000000..7cbf7cb3 --- /dev/null +++ b/refs/pull/405/merge/so2/lec11-arch.html @@ -0,0 +1,317 @@ + + + + + + SO2 Lecture 11 - Architecture Layer — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 11 - Architecture Layer

+

View slides

+
+

Lecture objectives:

+
    +
  • Overview of the arch layer
  • +
  • Overview of the boot process
  • +
+
+
+

Overview of the arch layer

+../_images/ditaa-ae895f3a8e26b92bf6c6ecbbd71e2c88912d5607.png +
+

Boot strap

+
    +
  • The first kernel code that runs
  • +
  • Typically runs with the MMU disabled
  • +
  • Move / Relocate kernel code
  • +
+
+
+

Boot strap

+
    +
  • The first kernel code that runs
  • +
  • Typically runs with the MMU disabled
  • +
  • Copy bootloader arguments and determine kernel run location
  • +
  • Move / relocate kernel code to final location
  • +
  • Initial MMU setup - map the kernel
  • +
+
+
+

Memory setup

+
    +
  • Determine available memory and setup the boot memory allocator
  • +
  • Manages memory regions before the page allocator is setup
  • +
  • Bootmem - used a bitmap to track free blocks
  • +
  • Memblock - deprecates bootmem and adds support for memory ranges
      +
    • Supports both physical and virtual addresses
    • +
    • Supports NUMA architectures
    • +
    +
  • +
+
+
+

MMU management

+
    +
  • Implements the generic page table manipulation APIs: types, +accessors, flags
  • +
  • Implement TLB management APIs: flush, invalidate
  • +
+
+
+

Thread Management

+
    +
  • Defines the thread type (struct thread_info) and implements +functions for allocating threads (if needed)
  • +
  • Implement copy_thread() and switch_context()
  • +
+
+
+

Time Management

+
    +
  • Setup the timer tick and provide a time source
  • +
  • Mostly transitioned to platform drivers
      +
    • clock_event_device - for scheduling timers
    • +
    • clocksource - for reading the time
    • +
    +
  • +
+
+
+

IRQs and exception management

+
    +
  • Define interrupt and exception handlers / entry points
  • +
  • Setup priorities
  • +
  • Platform drivers for interrupt controllers
  • +
+
+
+

System calls

+
    +
  • Define system call entry point(s)
  • +
  • Implement user-space access primitives (e.g. copy_to_user)
  • +
+
+
+

Platform Drivers

+
    +
  • Platform and architecture specific drivers
  • +
  • Bindings to platform device enumeration methods (e.g. device tree +or ACPI)
  • +
+
+
+

Machine specific code

+
    +
  • Some architectures use a "machine" / "platform" abstraction
  • +
  • Typical for architectures used in embedded systems with a lot of +variety (e.g. ARM, PowerPC)
  • +
+
+
+
+

Overview of the boot process

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec12-profiling-slides.html b/refs/pull/405/merge/so2/lec12-profiling-slides.html new file mode 100644 index 00000000..a3c2d027 --- /dev/null +++ b/refs/pull/405/merge/so2/lec12-profiling-slides.html @@ -0,0 +1,70 @@ + + + + + + + + SO2 Lecture 12 - Profiling — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 12 - Profiling

+ + + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec12-profiling.html b/refs/pull/405/merge/so2/lec12-profiling.html new file mode 100644 index 00000000..d28a2221 --- /dev/null +++ b/refs/pull/405/merge/so2/lec12-profiling.html @@ -0,0 +1,157 @@ + + + + + + SO2 Lecture 12 - Profiling — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec12-virtualization-slides.html b/refs/pull/405/merge/so2/lec12-virtualization-slides.html new file mode 100644 index 00000000..fb844dfd --- /dev/null +++ b/refs/pull/405/merge/so2/lec12-virtualization-slides.html @@ -0,0 +1,709 @@ + + + + + + + + SO2 Lecture 12 - Virtualization — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 12 - Virtualization

+ + + + + +
+
+ +

Virtualization

+ +
    +
  • Emulation basics
  • +
  • Virtualization basics
  • +
  • Paravirtualization basics
  • +
  • Hardware support for virtualization
  • +
  • Overview of the Xen hypervisor
  • +
  • Overview of the KVM hypervisor
  • +
+ + + + +
+
+ +

Emulation basics

+ +
    +
  • Instructions are emulated (each time they are executed)
  • +
  • The other system components are also emulated:
      +
    • MMU
    • +
    • Physical memory access
    • +
    • Peripherals
    • +
    +
  • +
  • Target architecture - the architecture that is emulated
  • +
  • Host architecture - the architecture that the emulator runs on
  • +
  • For emulation target and host architectures can be different
  • +
+ + + + +
+
+ +

Virtualization basics

+ +
    +
  • Defined in a paper by Popek & Goldberg in 1974
  • +
  • Fidelity
  • +
  • Performance
  • +
  • Security
  • +
+../_images/ditaa-91f08f7db4b54069e16694eab8d75c06400fc47b.png + + + + +
+
+ +

Classic virtualization

+ +
    +
  • Trap & Emulate
  • +
  • Same architecture for host and target
  • +
  • Most of the target instructions are natively executed
  • +
  • Target OS runs in non-privileged mode on the host
  • +
  • Privileged instructions are trapped and emulated
  • +
  • Two machine states: host and guest
  • +
+ + + + +
+
+ +

Software virtualization

+ +
    +
  • Not all architectures can be virtualized; e.g. x86:
      +
    • CS register encodes the CPL
    • +
    • Some instructions don't generate a trap (e.g. popf)
    • +
    +
  • +
  • Solution: emulate instructions using binary translation
  • +
+ + + + +
+
+ +

MMU virtualization

+ +
    +
  • "Fake" VM physical addresses are translated by the host to actual +physical addresses
  • +
  • Guest virtual address -> Guest physical address -> Host Physical Address
  • +
  • The guest page tables are not directly used by the host hardware
  • +
  • VM page tables are verified then translated into a new set of page +tables on the host (shadow page tables)
  • +
+ + + + +
+
+ +

Shadow page tables

+ +

 

+../_images/ditaa-8632e22c6d89bd18f97c9cef127444486b5077df.png + + + + +
+
+ +

Lazy shadow sync

+ +
    +
  • Guest page table changes are typically batched
  • +
  • To avoid repeated traps, checks and transformations map guest +page table entries with write access
  • +
  • Update the shadow page table when
      +
    • The TLB is flushed
    • +
    • In the host page fault handler
    • +
    +
  • +
+ + + + +
+
+ +

I/O emulation

+ +

 

+../_images/ditaa-bb69666d75b9670e542682753fb8cc9b77ff8894.png + + + + +
+
+ +

Example: qemu SiFive UART emulation

+ +
/*
+ * QEMU model of the UART on the SiFive E300 and U500 series SOCs.
+ *
+ * Copyright (c) 2016 Stefan O'Rear
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/log.h"
+#include "chardev/char.h"
+#include "chardev/char-fe.h"
+#include "hw/irq.h"
+#include "hw/char/sifive_uart.h"
+
+/*
+ * Not yet implemented:
+ *
+ * Transmit FIFO using "qemu/fifo8.h"
+ */
+
+/* Returns the state of the IP (interrupt pending) register */
+static uint64_t uart_ip(SiFiveUARTState *s)
+{
+    uint64_t ret = 0;
+
+    uint64_t txcnt = SIFIVE_UART_GET_TXCNT(s->txctrl);
+    uint64_t rxcnt = SIFIVE_UART_GET_RXCNT(s->rxctrl);
+
+    if (txcnt != 0) {
+        ret |= SIFIVE_UART_IP_TXWM;
+    }
+    if (s->rx_fifo_len > rxcnt) {
+        ret |= SIFIVE_UART_IP_RXWM;
+    }
+
+    return ret;
+}
+
+static void update_irq(SiFiveUARTState *s)
+{
+    int cond = 0;
+    if ((s->ie & SIFIVE_UART_IE_TXWM) ||
+        ((s->ie & SIFIVE_UART_IE_RXWM) && s->rx_fifo_len)) {
+        cond = 1;
+    }
+    if (cond) {
+        qemu_irq_raise(s->irq);
+    } else {
+        qemu_irq_lower(s->irq);
+    }
+}
+
+static uint64_t
+uart_read(void *opaque, hwaddr addr, unsigned int size)
+{
+    SiFiveUARTState *s = opaque;
+    unsigned char r;
+    switch (addr) {
+    case SIFIVE_UART_RXFIFO:
+        if (s->rx_fifo_len) {
+            r = s->rx_fifo[0];
+            memmove(s->rx_fifo, s->rx_fifo + 1, s->rx_fifo_len - 1);
+            s->rx_fifo_len--;
+            qemu_chr_fe_accept_input(&s->chr);
+            update_irq(s);
+            return r;
+        }
+        return 0x80000000;
+
+    case SIFIVE_UART_TXFIFO:
+        return 0; /* Should check tx fifo */
+    case SIFIVE_UART_IE:
+        return s->ie;
+    case SIFIVE_UART_IP:
+        return uart_ip(s);
+    case SIFIVE_UART_TXCTRL:
+        return s->txctrl;
+    case SIFIVE_UART_RXCTRL:
+        return s->rxctrl;
+    case SIFIVE_UART_DIV:
+        return s->div;
+    }
+
+    qemu_log_mask(LOG_GUEST_ERROR, "%s: bad read: addr=0x%x\n",
+                  __func__, (int)addr);
+    return 0;
+}
+
+static void
+uart_write(void *opaque, hwaddr addr,
+           uint64_t val64, unsigned int size)
+{
+    SiFiveUARTState *s = opaque;
+    uint32_t value = val64;
+    unsigned char ch = value;
+
+    switch (addr) {
+    case SIFIVE_UART_TXFIFO:
+        qemu_chr_fe_write(&s->chr, &ch, 1);
+        update_irq(s);
+        return;
+    case SIFIVE_UART_IE:
+        s->ie = val64;
+        update_irq(s);
+        return;
+    case SIFIVE_UART_TXCTRL:
+        s->txctrl = val64;
+        return;
+    case SIFIVE_UART_RXCTRL:
+        s->rxctrl = val64;
+        return;
+    case SIFIVE_UART_DIV:
+        s->div = val64;
+        return;
+    }
+    qemu_log_mask(LOG_GUEST_ERROR, "%s: bad write: addr=0x%x v=0x%x\n",
+                  __func__, (int)addr, (int)value);
+}
+
+static const MemoryRegionOps uart_ops = {
+    .read = uart_read,
+    .write = uart_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4
+    }
+};
+
+static void uart_rx(void *opaque, const uint8_t *buf, int size)
+{
+    SiFiveUARTState *s = opaque;
+
+    /* Got a byte.  */
+    if (s->rx_fifo_len >= sizeof(s->rx_fifo)) {
+        printf("WARNING: UART dropped char.\n");
+        return;
+    }
+    s->rx_fifo[s->rx_fifo_len++] = *buf;
+
+    update_irq(s);
+}
+
+static int uart_can_rx(void *opaque)
+{
+    SiFiveUARTState *s = opaque;
+
+    return s->rx_fifo_len < sizeof(s->rx_fifo);
+}
+
+static void uart_event(void *opaque, QEMUChrEvent event)
+{
+}
+
+static int uart_be_change(void *opaque)
+{
+    SiFiveUARTState *s = opaque;
+
+    qemu_chr_fe_set_handlers(&s->chr, uart_can_rx, uart_rx, uart_event,
+        uart_be_change, s, NULL, true);
+
+    return 0;
+}
+
+/*
+ * Create UART device.
+ */
+SiFiveUARTState *sifive_uart_create(MemoryRegion *address_space, hwaddr base,
+    Chardev *chr, qemu_irq irq)
+{
+    SiFiveUARTState *s = g_malloc0(sizeof(SiFiveUARTState));
+    s->irq = irq;
+    qemu_chr_fe_init(&s->chr, chr, &error_abort);
+    qemu_chr_fe_set_handlers(&s->chr, uart_can_rx, uart_rx, uart_event,
+        uart_be_change, s, NULL, true);
+    memory_region_init_io(&s->mmio, NULL, &uart_ops, s,
+                          TYPE_SIFIVE_UART, SIFIVE_UART_MAX);
+    memory_region_add_subregion(address_space, base, &s->mmio);
+    return s;
+}
+
+
+ + + + +
+
+ +

Paravirtualization

+ +
    +
  • Change the guest OS so that it cooperates with the VMM
      +
    • CPU paravirtualization
    • +
    • MMU paravirtualization
    • +
    • I/O paravirtualization
    • +
    +
  • +
  • VMM exposes hypercalls for:
      +
    • activate / deactivate the interrupts
    • +
    • changing page tables
    • +
    • accessing virtualized peripherals
    • +
    +
  • +
  • VMM uses events to trigger interrupts in the VM
  • +
+ + + + +
+
+ +

Intel VT-x

+ +
    +
  • Hardware extension to transform x86 to the point it can be +virtualized "classically"
  • +
  • New execution mode: non-root mode
  • +
  • Each non-root mode instance uses a Virtual Machine Control +Structure (VMCS) to store its state
  • +
  • VMM runs in root mode
  • +
  • VM-entry and VM-exit are used to transition between the two modes
  • +
+ + + + +
+
+ +

Virtual Machine Control Structure

+ +
    +
  • Guest information: state of the virtual CPU
  • +
  • Host information: state of the physical CPU
  • +
  • Saved information:
      +
    • visible state: segment registers, CR3, IDTR, etc.
    • +
    • internal state
    • +
    +
  • +
  • VMCS can not be accessed directly but certain information can be +accessed with special instructions
  • +
+ + + + +
+
+ +

VM entry & exit

+ +
    +
  • VM entry - new instructions that switch the CPU to non-root +mode and load the VM state from a VMCS; host state is saved in +VMCS
  • +
  • Allows injecting interrupts and exceptions in the guest
  • +
  • VM exit will be automatically triggered based on the VMCS +configuration
  • +
  • When VM exit occurs host state is loaded from VMCS, guest state +is saved in VMCS
  • +
+ + + + +
+
+ +

VM execution control fields

+ +
    +
  • Selects conditions which trigger a VM exit; examples:
      +
    • If an external interrupt is generated
    • +
    • If an external interrupt is generated and EFLAGS.IF is set
    • +
    • If CR0-CR4 registers are modified
    • +
    +
  • +
  • Exception bitmap - selects which exceptions will generate a VM +exit
  • +
  • IO bitmap - selects which I/O addresses (IN/OUT accesses) +generate a VM exit
  • +
  • MSR bitmaps - selects which RDMSR or WRMSR instructions will +generate a VM exit
  • +
+ + + + +
+
+ +

Extended Page Tables

+ +
    +
  • Reduces the complexity of MMU virtualization and improves +performance
  • +
  • Access to CR3, INVLPG and page faults do not require VM exit +anymore
  • +
  • The EPT page table is controlled by the VMM
  • +
+../_images/ditaa-cc9a2e995be74ee99646ea4bf0e551d766fa92ef.png + + + + +
+
+ +

VPID

+ +
    +
  • VM entry and VM exit force a TLB flush - loses VMM / VM translations
  • +
  • To avoid this issue a VPID (Virtual Processor ID) tag is +associated with each VM (VPID 0 is reserved for the VMM)
  • +
  • All TLB entries are tagged
  • +
  • At VM entry and exit just the entries associated with the tags +are flushed
  • +
  • When searching the TLB just the current VPID is used
  • +
+ + + + +
+
+ +

I/O virtualization

+ +../_images/ditaa-3901edd823cdc7a6f429ebc37cbc541e650abc96.png + + + + +
+
+ +

I/O MMU

+ +

VT-d protects and translates VM physical addresses using an I/O +MMU (DMA remapping)

+../_images/ditaa-d880751969de8642b2613caaca345d71acea4500.png + + + + +
+
+ +

Interrupt posting

+ +
    +
  • Message Signaled Interrupts (MSI) = DMA writes to the host +address range of the IRQ controller (e.g. 0xFEExxxxx)
  • +
  • Low bits of the address and the data indicate which interrupt +vector to deliver to which CPU
  • +
  • Interrupt remapping table points to the virtual CPU (VMCS) that +should receive the interrupt
  • +
  • I/O MMU will trap the IRQ controller write and look it up in the +interrupt remapping table
      +
    • if that virtual CPU is currently running it will take the +interrupt directly
    • +
    • otherwise a bit is set in a table (Posted Interrupt Descriptor +table) and the interrupt will be injected the next time that vCPU is +run
    • +
    +
  • +
+ + + + +
+
+ +

I/O virtualization

+ +../_images/ditaa-2cb0eb0056bb775d1446843d62241fd660662c96.png + + + + +
+
+ +

SR-IOV

+ +
    +
  • Single Root - Input Output Virtualization
  • +
  • Physical device with multiple Ethernet ports will be shown as +multiple devices on the PCI bus
  • +
  • Physical Function is used for the control and can be configured
      +
    • to present itself as a new PCI device
    • +
    • which VLAN to use
    • +
    +
  • +
  • The new virtual function is enumerated on the bus and can be +assigned to a particular guest
  • +
+ + + + +
+
+ +

qemu

+ +
    +
  • Uses binary translation via Tiny Code Generator (TCG) for +efficient emulation
  • +
  • Supports different target and host architectures (e.g. running +ARM VMs on x86)
  • +
  • Both process and full system level emulation
  • +
  • MMU emulation
  • +
  • I/O emulation
  • +
  • Can be used with KVM for accelerated virtualization
  • +
+ + + + +
+
+ +

KVM

+ +../_images/ditaa-f8fcc760ef5dad50d1038ed3426d0fcce12fd3e6.png + + + + +
+
+ +

KVM

+ +
    +
  • Linux device driver for hardware virtualization (e.g. Intel VT-x, SVM)
  • +
  • IOCTL based interface for managing and running virtual CPUs
  • +
  • VMM components implemented inside the Linux kernel +(e.g. interrupt controller, timers)
  • +
  • Shadow page tables or EPT if present
  • +
  • Uses qemu or virtio for I/O virtualization
  • +
+ + + + +
+
+ +

Xen

+ +
    +
  • Type 1 = Bare Metal Hypervisor
  • +
  • Type 2 = Hypervisor embedded in an existing kernel / OS
  • +
+ + + + +
+
+ +

Xen

+ +../_images/xen-overview1.png + + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec12-virtualization.html b/refs/pull/405/merge/so2/lec12-virtualization.html new file mode 100644 index 00000000..f752ebc5 --- /dev/null +++ b/refs/pull/405/merge/so2/lec12-virtualization.html @@ -0,0 +1,696 @@ + + + + + + SO2 Lecture 12 - Virtualization — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 12 - Virtualization

+

View slides

+
+

Lecture objectives:

+
    +
  • Emulation basics
  • +
  • Virtualization basics
  • +
  • Paravirtualization basics
  • +
  • Hardware support for virtualization
  • +
  • Overview of the Xen hypervisor
  • +
  • Overview of the KVM hypervisor
  • +
+
+
+

Emulation basics

+
    +
  • Instructions are emulated (each time they are executed)
  • +
  • The other system components are also emulated:
      +
    • MMU
    • +
    • Physical memory access
    • +
    • Peripherals
    • +
    +
  • +
  • Target architecture - the architecture that is emulated
  • +
  • Host architecture - the architecture that the emulator runs on
  • +
  • For emulation target and host architectures can be different
  • +
+
+
+

Virtualization basics

+
    +
  • Defined in a paper by Popek & Goldberg in 1974
  • +
  • Fidelity
  • +
  • Performance
  • +
  • Security
  • +
+../_images/ditaa-91f08f7db4b54069e16694eab8d75c06400fc47b.png +
+
+

Classic virtualization

+
    +
  • Trap & Emulate
  • +
  • Same architecture for host and target
  • +
  • Most of the target instructions are natively executed
  • +
  • Target OS runs in non-privileged mode on the host
  • +
  • Privileged instructions are trapped and emulated
  • +
  • Two machine states: host and guest
  • +
+
+
+

Software virtualization

+
    +
  • Not all architectures can be virtualized; e.g. x86:
      +
    • CS register encodes the CPL
    • +
    • Some instructions don't generate a trap (e.g. popf)
    • +
    +
  • +
  • Solution: emulate instructions using binary translation
  • +
+
+
+

MMU virtualization

+
    +
  • "Fake" VM physical addresses are translated by the host to actual +physical addresses
  • +
  • Guest virtual address -> Guest physical address -> Host Physical Address
  • +
  • The guest page tables are not directly used by the host hardware
  • +
  • VM page tables are verified then translated into a new set of page +tables on the host (shadow page tables)
  • +
+
+

Shadow page tables

+

 

+../_images/ditaa-8632e22c6d89bd18f97c9cef127444486b5077df.png +
+
+

Lazy shadow sync

+
    +
  • Guest page table changes are typically batched
  • +
  • To avoid repeated traps, checks and transformations map guest +page table entries with write access
  • +
  • Update the shadow page table when
      +
    • The TLB is flushed
    • +
    • In the host page fault handler
    • +
    +
  • +
+
+
+
+

I/O emulation

+

 

+../_images/ditaa-bb69666d75b9670e542682753fb8cc9b77ff8894.png +
/*
+ * QEMU model of the UART on the SiFive E300 and U500 series SOCs.
+ *
+ * Copyright (c) 2016 Stefan O'Rear
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/log.h"
+#include "chardev/char.h"
+#include "chardev/char-fe.h"
+#include "hw/irq.h"
+#include "hw/char/sifive_uart.h"
+
+/*
+ * Not yet implemented:
+ *
+ * Transmit FIFO using "qemu/fifo8.h"
+ */
+
+/* Returns the state of the IP (interrupt pending) register */
+static uint64_t uart_ip(SiFiveUARTState *s)
+{
+    uint64_t ret = 0;
+
+    uint64_t txcnt = SIFIVE_UART_GET_TXCNT(s->txctrl);
+    uint64_t rxcnt = SIFIVE_UART_GET_RXCNT(s->rxctrl);
+
+    if (txcnt != 0) {
+        ret |= SIFIVE_UART_IP_TXWM;
+    }
+    if (s->rx_fifo_len > rxcnt) {
+        ret |= SIFIVE_UART_IP_RXWM;
+    }
+
+    return ret;
+}
+
+static void update_irq(SiFiveUARTState *s)
+{
+    int cond = 0;
+    if ((s->ie & SIFIVE_UART_IE_TXWM) ||
+        ((s->ie & SIFIVE_UART_IE_RXWM) && s->rx_fifo_len)) {
+        cond = 1;
+    }
+    if (cond) {
+        qemu_irq_raise(s->irq);
+    } else {
+        qemu_irq_lower(s->irq);
+    }
+}
+
+static uint64_t
+uart_read(void *opaque, hwaddr addr, unsigned int size)
+{
+    SiFiveUARTState *s = opaque;
+    unsigned char r;
+    switch (addr) {
+    case SIFIVE_UART_RXFIFO:
+        if (s->rx_fifo_len) {
+            r = s->rx_fifo[0];
+            memmove(s->rx_fifo, s->rx_fifo + 1, s->rx_fifo_len - 1);
+            s->rx_fifo_len--;
+            qemu_chr_fe_accept_input(&s->chr);
+            update_irq(s);
+            return r;
+        }
+        return 0x80000000;
+
+    case SIFIVE_UART_TXFIFO:
+        return 0; /* Should check tx fifo */
+    case SIFIVE_UART_IE:
+        return s->ie;
+    case SIFIVE_UART_IP:
+        return uart_ip(s);
+    case SIFIVE_UART_TXCTRL:
+        return s->txctrl;
+    case SIFIVE_UART_RXCTRL:
+        return s->rxctrl;
+    case SIFIVE_UART_DIV:
+        return s->div;
+    }
+
+    qemu_log_mask(LOG_GUEST_ERROR, "%s: bad read: addr=0x%x\n",
+                  __func__, (int)addr);
+    return 0;
+}
+
+static void
+uart_write(void *opaque, hwaddr addr,
+           uint64_t val64, unsigned int size)
+{
+    SiFiveUARTState *s = opaque;
+    uint32_t value = val64;
+    unsigned char ch = value;
+
+    switch (addr) {
+    case SIFIVE_UART_TXFIFO:
+        qemu_chr_fe_write(&s->chr, &ch, 1);
+        update_irq(s);
+        return;
+    case SIFIVE_UART_IE:
+        s->ie = val64;
+        update_irq(s);
+        return;
+    case SIFIVE_UART_TXCTRL:
+        s->txctrl = val64;
+        return;
+    case SIFIVE_UART_RXCTRL:
+        s->rxctrl = val64;
+        return;
+    case SIFIVE_UART_DIV:
+        s->div = val64;
+        return;
+    }
+    qemu_log_mask(LOG_GUEST_ERROR, "%s: bad write: addr=0x%x v=0x%x\n",
+                  __func__, (int)addr, (int)value);
+}
+
+static const MemoryRegionOps uart_ops = {
+    .read = uart_read,
+    .write = uart_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4
+    }
+};
+
+static void uart_rx(void *opaque, const uint8_t *buf, int size)
+{
+    SiFiveUARTState *s = opaque;
+
+    /* Got a byte.  */
+    if (s->rx_fifo_len >= sizeof(s->rx_fifo)) {
+        printf("WARNING: UART dropped char.\n");
+        return;
+    }
+    s->rx_fifo[s->rx_fifo_len++] = *buf;
+
+    update_irq(s);
+}
+
+static int uart_can_rx(void *opaque)
+{
+    SiFiveUARTState *s = opaque;
+
+    return s->rx_fifo_len < sizeof(s->rx_fifo);
+}
+
+static void uart_event(void *opaque, QEMUChrEvent event)
+{
+}
+
+static int uart_be_change(void *opaque)
+{
+    SiFiveUARTState *s = opaque;
+
+    qemu_chr_fe_set_handlers(&s->chr, uart_can_rx, uart_rx, uart_event,
+        uart_be_change, s, NULL, true);
+
+    return 0;
+}
+
+/*
+ * Create UART device.
+ */
+SiFiveUARTState *sifive_uart_create(MemoryRegion *address_space, hwaddr base,
+    Chardev *chr, qemu_irq irq)
+{
+    SiFiveUARTState *s = g_malloc0(sizeof(SiFiveUARTState));
+    s->irq = irq;
+    qemu_chr_fe_init(&s->chr, chr, &error_abort);
+    qemu_chr_fe_set_handlers(&s->chr, uart_can_rx, uart_rx, uart_event,
+        uart_be_change, s, NULL, true);
+    memory_region_init_io(&s->mmio, NULL, &uart_ops, s,
+                          TYPE_SIFIVE_UART, SIFIVE_UART_MAX);
+    memory_region_add_subregion(address_space, base, &s->mmio);
+    return s;
+}
+
+
+
+
+

Paravirtualization

+
    +
  • Change the guest OS so that it cooperates with the VMM
      +
    • CPU paravirtualization
    • +
    • MMU paravirtualization
    • +
    • I/O paravirtualization
    • +
    +
  • +
  • VMM exposes hypercalls for:
      +
    • activate / deactivate the interrupts
    • +
    • changing page tables
    • +
    • accessing virtualized peripherals
    • +
    +
  • +
  • VMM uses events to trigger interrupts in the VM
  • +
+
+
+

Intel VT-x

+
    +
  • Hardware extension to transform x86 to the point it can be +virtualized "classically"
  • +
  • New execution mode: non-root mode
  • +
  • Each non-root mode instance uses a Virtual Machine Control +Structure (VMCS) to store its state
  • +
  • VMM runs in root mode
  • +
  • VM-entry and VM-exit are used to transition between the two modes
  • +
+
+

Virtual Machine Control Structure

+
    +
  • Guest information: state of the virtual CPU
  • +
  • Host information: state of the physical CPU
  • +
  • Saved information:
      +
    • visible state: segment registers, CR3, IDTR, etc.
    • +
    • internal state
    • +
    +
  • +
  • VMCS can not be accessed directly but certain information can be +accessed with special instructions
  • +
+
+
+

VM entry & exit

+
    +
  • VM entry - new instructions that switch the CPU to non-root +mode and load the VM state from a VMCS; host state is saved in +VMCS
  • +
  • Allows injecting interrupts and exceptions in the guest
  • +
  • VM exit will be automatically triggered based on the VMCS +configuration
  • +
  • When VM exit occurs host state is loaded from VMCS, guest state +is saved in VMCS
  • +
+
+
+

VM execution control fields

+
    +
  • Selects conditions which trigger a VM exit; examples:
      +
    • If an external interrupt is generated
    • +
    • If an external interrupt is generated and EFLAGS.IF is set
    • +
    • If CR0-CR4 registers are modified
    • +
    +
  • +
  • Exception bitmap - selects which exceptions will generate a VM +exit
  • +
  • IO bitmap - selects which I/O addresses (IN/OUT accesses) +generate a VM exit
  • +
  • MSR bitmaps - selects which RDMSR or WRMSR instructions will +generate a VM exit
  • +
+
+
+
+

Extended Page Tables

+
    +
  • Reduces the complexity of MMU virtualization and improves +performance
  • +
  • Access to CR3, INVLPG and page faults do not require VM exit +anymore
  • +
  • The EPT page table is controlled by the VMM
  • +
+../_images/ditaa-cc9a2e995be74ee99646ea4bf0e551d766fa92ef.png +
+

VPID

+
    +
  • VM entry and VM exit force a TLB flush - loses VMM / VM translations
  • +
  • To avoid this issue a VPID (Virtual Processor ID) tag is +associated with each VM (VPID 0 is reserved for the VMM)
  • +
  • All TLB entries are tagged
  • +
  • At VM entry and exit just the entries associated with the tags +are flushed
  • +
  • When searching the TLB just the current VPID is used
  • +
+
+
+
+

I/O virtualization

+
+
    +
  • Direct access to hardware from a VM - in a controlled fashion
      +
    • Map the MMIO host directly to the guest
    • +
    • Forward interrupts
    • +
    +
  • +
+
+../_images/ditaa-3901edd823cdc7a6f429ebc37cbc541e650abc96.png +

Instead of trapping MMIO as with emulated devices we can allow the +guest to access the MMIO directly by mapping through its page tables.

+

Interrupts from the device are handled by the host kernel and a signal +is sent to the VMM which injects the interrupt to the guest just as +for the emulated devices.

+

VT-d protects and translates VM physical addresses using an I/O +MMU (DMA remapping)

+../_images/ditaa-d880751969de8642b2613caaca345d71acea4500.png +
    +
  • Message Signaled Interrupts (MSI) = DMA writes to the host +address range of the IRQ controller (e.g. 0xFEExxxxx)
  • +
  • Low bits of the address and the data indicate which interrupt +vector to deliver to which CPU
  • +
  • Interrupt remapping table points to the virtual CPU (VMCS) that +should receive the interrupt
  • +
  • I/O MMU will trap the IRQ controller write and look it up in the +interrupt remapping table
      +
    • if that virtual CPU is currently running it will take the +interrupt directly
    • +
    • otherwise a bit is set in a table (Posted Interrupt Descriptor +table) and the interrupt will be injected next time that vCPU is +run
    • +
    +
  • +
+../_images/ditaa-2cb0eb0056bb775d1446843d62241fd660662c96.png +
    +
  • Single Root - Input Output Virtualization
  • +
  • Physical device with multiple Ethernet ports will be shown as +multiple devices on the PCI bus
  • +
  • Physical Function is used for the control and can be configured
      +
    • to present itself as a new PCI device
    • +
    • which VLAN to use
    • +
    +
  • +
  • The new virtual function is enumerated on the bus and can be +assigned to a particular guest
  • +
+
+
+

qemu

+
    +
  • Uses binary translation via Tiny Code Generator (TCG) for +efficient emulation
  • +
  • Supports different target and host architectures (e.g. running +ARM VMs on x86)
  • +
  • Both process and full system level emulation
  • +
  • MMU emulation
  • +
  • I/O emulation
  • +
  • Can be used with KVM for accelerated virtualization
  • +
+
+
+

KVM

+../_images/ditaa-f8fcc760ef5dad50d1038ed3426d0fcce12fd3e6.png +
    +
  • Linux device driver for hardware virtualization (e.g. Intel VT-x, SVM)
  • +
  • IOCTL based interface for managing and running virtual CPUs
  • +
  • VMM components implemented inside the Linux kernel +(e.g. interrupt controller, timers)
  • +
  • Shadow page tables or EPT if present
  • +
  • Uses qemu or virtio for I/O virtualization
  • +
+
+
+

Type 1 vs Type 2 Hypervisors

+
    +
  • Type 1 = Bare Metal Hypervisor
  • +
  • Type 2 = Hypervisor embedded in an existing kernel / OS
  • +
+
+
+

Xen

+../_images/xen-overview1.png +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec2-syscalls-slides.html b/refs/pull/405/merge/so2/lec2-syscalls-slides.html new file mode 100644 index 00000000..93ccc076 --- /dev/null +++ b/refs/pull/405/merge/so2/lec2-syscalls-slides.html @@ -0,0 +1,540 @@ + + + + + + + + SO2 Lecture 02 - System calls — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 02 - System calls

+ + + + + +
+
+ +

System Calls

+ +
    +
  • Linux system calls implementation
  • +
  • VDSO and virtual syscalls
  • +
  • Accessing user space from system calls
  • +
+ + + + +
+
+ +

System Calls as Kernel services

+ +

 

+../_images/ditaa-e76e44cad2e92f2134ab77f6a09605b29524d039.png + + + + +
+
+ +

System Call Setup

+ +
    +
  • setup information to identify the system call and its parameters
  • +
  • trigger a kernel mode switch
  • +
  • retrieve the result of the system call
  • +
+ + + + +
+
+ +

Linux system call setup

+ +
    +
  • System calls are identified by numbers
  • +
  • The parameters for system calls are machine word sized (32 or 64 +bit) and they are limited to a maximum of 6
  • +
  • Uses registers to store them both (e.g. for 32bit x86: EAX for +system call and EBX, ECX, EDX, ESI, EDI, EBP for parameters)
  • +
+ + + + +
+
+ +

Example of Linux system call setup and handling

+ +../_images/ditaa-eeb919cd078d0ba5021028fa628bb47d7d6866e2.png + + + + +
+
+ +

Linux System Call Dispatcher

+ +
/* Handles int $0x80 */
+__visible void do_int80_syscall_32(struct pt_regs *regs)
+{
+    enter_from_user_mode();
+    local_irq_enable();
+    do_syscall_32_irqs_on(regs);
+}
+
+/* simplified version of the Linux x86 32bit System Call Dispatcher */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+{
+    unsigned int nr = regs->orig_ax;
+
+    if (nr < IA32_NR_syscalls)
+        regs->ax = ia32_sys_call_table[nr](regs->bx, regs->cx,
+                                           regs->dx, regs->si,
+                                           regs->di, regs->bp);
+    syscall_return_slowpath(regs);
+}
+
+
+ + + + +
+
+ +

Inspecting dup2 system call

+ +

 

+ + + + +
+
+ +

System Call Flow Summary

+ +
    +
  • The application is setting up the system call number and +parameters and it issues a trap instruction
  • +
  • The execution mode switches from user to kernel; the CPU switches +to a kernel stack; the user stack and the return address to user +space are saved on the kernel stack
  • +
  • The kernel entry point saves registers on the kernel stack
  • +
  • The system call dispatcher identifies the system call function +and runs it
  • +
  • The user space registers are restored and execution is switched +back to user (e.g. calling IRET)
  • +
  • The user space application resumes
  • +
+ + + + +
+
+ +

System Call Table

+ +
#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
+
+const sys_call_ptr_t ia32_sys_call_table[] = {
+  [0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
+  #include <asm/syscalls_32.h>
+};
+
+
+
__SYSCALL_I386(0, sys_restart_syscall)
+__SYSCALL_I386(1, sys_exit)
+__SYSCALL_I386(2, sys_fork)
+__SYSCALL_I386(3, sys_read)
+__SYSCALL_I386(4, sys_write)
+#ifdef CONFIG_X86_32
+__SYSCALL_I386(5, sys_open)
+#else
+__SYSCALL_I386(5, compat_sys_open)
+#endif
+__SYSCALL_I386(6, sys_close)
+
+
+ + + + +
+
+ +

System Calls Pointer Parameters

+ +
    +
  • Never allow pointers to kernel-space
  • +
  • Check for invalid pointers
  • +
+ + + + +
+
+ +

Pointers to Kernel Space

+ +
    +
  • User access to kernel data if allowed in a write system call
  • +
  • User corrupting kernel data if allowed in a read system call
  • +
+ + + + +
+
+ +

Invalid pointers handling approaches

+ +
    +
  • Check the pointer against the user address space before using it, +or
  • +
  • Avoid checking the pointer and rely on the MMU to detect when the +pointer is invalid and use the page fault handler to determine +that the pointer was invalid
  • +
+ + + + +
+
+ +

Page fault handling

+ +
+
    +
  • Copy on write, demand paging, swapping: both the fault and +faulting addresses are in user space; the fault address is +valid (checked against the user address space)
  • +
  • Invalid pointer used in system call: the faulting address is +in kernel space; the fault address is in user space and it is +invalid
  • +
  • Kernel bug (kernel accesses invalid pointer): same as above
  • +
+
+ + + + +
+
+ +

Marking kernel code that accesses user space

+ +
    +
  • The exact instructions that access user space are recorded in a +table (exception table)
  • +
  • When a page fault occurs the faulting address is checked against +this table
  • +
+ + + + +
+
+ +

Cost analysis for pointer checks vs fault handling

+ + +++++ + + + + + + + + + + + + + + + + +
CostPointer checksFault handling
Valid addressaddress space searchnegligible
Invalid addressaddress space searchexception table search
+ + + + +
+
+ +

Virtual Dynamic Shared Object (VDSO)

+ +
    +
  • a stream of instructions to issue the system call is generated by +the kernel in a special memory area (formatted as an ELF shared +object)
  • +
  • that memory area is mapped towards the end of the user address +space
  • +
  • libc searches for VDSO and if present will use it to issue the +system call
  • +
+ + + + +
+
+ +

Inspecting VDSO

+ +

 

+ + + + +
+
+ +

Virtual System Calls (vsyscalls)

+ +
    +
  • "System calls" that run directly from user space, part of the VDSO
  • +
  • Static data (e.g. getpid())
  • +
  • Dynamic data updated by the kernel in a RW map of the VDSO +(e.g. gettimeofday(), time())
  • +
+ + + + +
+
+ +

Accessing user space from system calls

+ +
/* OK: return -EFAULT if user_ptr is invalid */
+if (copy_from_user(&kernel_buffer, user_ptr, size))
+    return -EFAULT;
+
+/* NOK: only works if user_ptr is valid otherwise crashes kernel */
+memcpy(&kernel_buffer, user_ptr, size);
+
+
+ + + + +
+
+ +

get_user implementation

+ +
#define get_user(x, ptr)                                          \
+({                                                                \
+  int __ret_gu;                                                   \
+  register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX);            \
+  __chk_user_ptr(ptr);                                            \
+  might_fault();                                                  \
+  asm volatile("call __get_user_%P4"                              \
+               : "=a" (__ret_gu), "=r" (__val_gu),                \
+                  ASM_CALL_CONSTRAINT                             \
+               : "0" (ptr), "i" (sizeof(*(ptr))));                \
+  (x) = (__force __typeof__(*(ptr))) __val_gu;                    \
+  __builtin_expect(__ret_gu, 0);                                  \
+})
+
+
+ + + + +
+
+ +

get_user pseudo code

+ +
#define get_user(x, ptr)                \
+    movl ptr, %eax                      \
+    call __get_user_1                   \
+    movl %edx, x                        \
+    movl %eax, result                   \
+
+
+ + + + +
+
+ +

get_user_1 implementation

+ +
.text
+ENTRY(__get_user_1)
+    mov PER_CPU_VAR(current_task), %_ASM_DX
+    cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
+    jae bad_get_user
+    ASM_STAC
+1:  movzbl (%_ASM_AX),%edx
+    xor %eax,%eax
+    ASM_CLAC
+    ret
+ENDPROC(__get_user_1)
+
+bad_get_user:
+    xor %edx,%edx
+    mov $(-EFAULT),%_ASM_AX
+    ASM_CLAC
+    ret
+END(bad_get_user)
+
+_ASM_EXTABLE(1b,bad_get_user)
+
+
+ + + + +
+
+ +

Exception table entry

+ +
/* Exception table entry */
+# define _ASM_EXTABLE_HANDLE(from, to, handler)           \
+  .pushsection "__ex_table","a" ;                         \
+  .balign 4 ;                                             \
+  .long (from) - . ;                                      \
+  .long (to) - . ;                                        \
+  .long (handler) - . ;                                   \
+  .popsection
+
+# define _ASM_EXTABLE(from, to)                           \
+  _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+
+ + + + +
+
+ +

Exception table building

+ +
#define EXCEPTION_TABLE(align)                                    \
+  . = ALIGN(align);                                               \
+  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {               \
+          VMLINUX_SYMBOL(__start___ex_table) = .;                 \
+          KEEP(*(__ex_table))                                     \
+          VMLINUX_SYMBOL(__stop___ex_table) = .;                  \
+  }
+
+
+ + + + +
+
+ +

Exception table handling

+ +
bool ex_handler_default(const struct exception_table_entry *fixup,
+                        struct pt_regs *regs, int trapnr)
+{
+    regs->ip = ex_fixup_addr(fixup);
+    return true;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+    const struct exception_table_entry *e;
+    ex_handler_t handler;
+
+    e = search_exception_tables(regs->ip);
+    if (!e)
+        return 0;
+
+    handler = ex_fixup_handler(e);
+    return handler(e, regs, trapnr);
+}
+
+
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec2-syscalls.html b/refs/pull/405/merge/so2/lec2-syscalls.html new file mode 100644 index 00000000..ededaa78 --- /dev/null +++ b/refs/pull/405/merge/so2/lec2-syscalls.html @@ -0,0 +1,577 @@ + + + + + + SO2 Lecture 02 - System calls — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 02 - System calls

+

View slides

+
+

Lecture objectives:

+
    +
  • Linux system calls implementation
  • +
  • VDSO and virtual syscalls
  • +
  • Accessing user space from system calls
  • +
+
+
+

Linux system calls implementation

+

At a high level system calls are "services" offered by the kernel to +user applications and they resemble library APIs in that they are +described as a function call with a name, parameters, and return value.

+

 

+../_images/ditaa-e76e44cad2e92f2134ab77f6a09605b29524d039.png +

However, on a closer look, we can see that system calls are actually +not function calls, but specific assembly instructions (architecture +and kernel specific) that do the following:

+
    +
  • setup information to identify the system call and its parameters
  • +
  • trigger a kernel mode switch
  • +
  • retrieve the result of the system call
  • +
+

In Linux, system calls are identified by numbers and the parameters +for system calls are machine word sized (32 or 64 bit). There can be a +maximum of 6 system call parameters. Both the system call number and +the parameters are stored in certain registers.

+

For example, on 32bit x86 architecture, the system call identifier is +stored in the EAX register, while parameters in registers EBX, ECX, +EDX, ESI, EDI, EBP.

+

System libraries (e.g. libc) offer functions that implement the +actual system calls in order to make it easier for applications to use +them.

+

When a user to kernel mode transition occurs, the execution flow is +interrupted and it is transferred to a kernel entry point. This is +similar to how interrupts and exceptions are handled (in fact on some +architectures this transition happens as a result of an exception).

+

The system call entry point will save registers (which contain values +from user space, including system call number and system call +parameters) on stack and then it will continue with executing the +system call dispatcher.

+
+

Note

+

During the user - kernel mode transition the stack is also +switched from the user stack to the kernel stack. This is +explained in more detail in the interrupts lecture.

+
+../_images/ditaa-eeb919cd078d0ba5021028fa628bb47d7d6866e2.png +

The purpose of the system call dispatcher is to verify the system call +number and run the kernel function associated with the system call.

+
/* Handles int $0x80 */
+__visible void do_int80_syscall_32(struct pt_regs *regs)
+{
+    enter_from_user_mode();
+    local_irq_enable();
+    do_syscall_32_irqs_on(regs);
+}
+
+/* simplified version of the Linux x86 32bit System Call Dispatcher */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+{
+    unsigned int nr = regs->orig_ax;
+
+    if (nr < IA32_NR_syscalls)
+        regs->ax = ia32_sys_call_table[nr](regs->bx, regs->cx,
+                                           regs->dx, regs->si,
+                                           regs->di, regs->bp);
+    syscall_return_slowpath(regs);
+}
+
+
+

To demonstrate the system call flow we are going to use the virtual +machine setup, attach gdb to a running kernel, add a breakpoint to the +dup2 system call and inspect the state.

+

 

+

In summary, this is what happens during a system call:

+
    +
  • The application is setting up the system call number and +parameters and it issues a trap instruction
  • +
  • The execution mode switches from user to kernel; the CPU switches +to a kernel stack; the user stack and the return address to user +space are saved on the kernel stack
  • +
  • The kernel entry point saves registers on the kernel stack
  • +
  • The system call dispatcher identifies the system call function +and runs it
  • +
  • The user space registers are restored and execution is switched +back to user (e.g. calling IRET)
  • +
  • The user space application resumes
  • +
+
+

System call table

+

The system call table is what the system call dispatcher uses to map +system call numbers to kernel functions:

+
#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
+
+const sys_call_ptr_t ia32_sys_call_table[] = {
+  [0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
+  #include <asm/syscalls_32.h>
+};
+
+
+
__SYSCALL_I386(0, sys_restart_syscall)
+__SYSCALL_I386(1, sys_exit)
+__SYSCALL_I386(2, sys_fork)
+__SYSCALL_I386(3, sys_read)
+__SYSCALL_I386(4, sys_write)
+#ifdef CONFIG_X86_32
+__SYSCALL_I386(5, sys_open)
+#else
+__SYSCALL_I386(5, compat_sys_open)
+#endif
+__SYSCALL_I386(6, sys_close)
+
+
+
+
+

System call parameters handling

+

Handling system call parameters is tricky. Since these values are +setup by user space, the kernel can not assume correctness and must +always verify them thoroughly.

+

Pointers have a few important special cases that must be checked:

+
    +
  • Never allow pointers to kernel-space
  • +
  • Check for invalid pointers
  • +
+

Since system calls are executed in kernel mode, they have access to +kernel space and if pointers are not properly checked user +applications might get read or write access to kernel space.

+

For example, let's consider the case where such a check is not made for +the read or write system calls. If the user passes a kernel-space +pointer to a write system call then it can get access to kernel data +by later reading the file. If it passes a kernel-space pointer to a +read system call then it can corrupt kernel memory.

+

Likewise, if a pointer passed by the application is invalid +(e.g. unmapped, read-only for cases where it is used for writing), it +could "crash" the kernel. Two approaches could be used:

+
    +
  • Check the pointer against the user address space before using it, +or
  • +
  • Avoid checking the pointer and rely on the MMU to detect when the +pointer is invalid and use the page fault handler to determine +that the pointer was invalid
  • +
+

Although it sounds tempting, the second approach is not that easy to +implement. The page fault handler uses the fault address (the address +that was accessed), the faulting address (the address of the +instruction that did the access) and information from the user address +space to determine the cause:

+
+
    +
  • Copy on write, demand paging, swapping: both the fault and +faulting addresses are in user space; the fault address is +valid (checked against the user address space)
  • +
  • Invalid pointer used in system call: the faulting address is +in kernel space; the fault address is in user space and it is +invalid
  • +
  • Kernel bug (kernel accesses invalid pointer): same as above
  • +
+
+

But in the last two cases we don't have enough information to +determine the cause of the fault.

+

In order to solve this issue, Linux uses special APIs (e.g. +copy_to_user()) to access user space that are specially +crafted:

+
    +
  • The exact instructions that access user space are recorded in a +table (exception table)
  • +
  • When a page fault occurs the faulting address is checked against +this table
  • +
+

Although the fault handling case may be more costly overall depending +on the address space vs exception table size, and it is more complex, +it is optimized for the common case and that is why it is preferred +and used in Linux.

+ +++++ + + + + + + + + + + + + + + + + +
CostPointer checksFault handling
Valid addressaddress space searchnegligible
Invalid addressaddress space searchexception table search
+
+
+
+

Virtual Dynamic Shared Object (VDSO)

+

The VDSO mechanism was born out of the necessity of optimizing the +system call implementation, in a way that does not impact libc with +having to track the CPU capabilities in conjunction with the kernel +version.

+

For example, x86 has two ways of issuing system calls: int 0x80 and +sysenter. The latter is significantly faster so it should be used when +available. However, it is only available for processors newer than +Pentium II and only for kernel versions greater than 2.6.

+

With VDSO the system call interface is decided by the kernel:

+
    +
  • a stream of instructions to issue the system call is generated by +the kernel in a special memory area (formatted as an ELF shared +object)
  • +
  • that memory area is mapped towards the end of the user address +space
  • +
  • libc searches for VDSO and if present will use it to issue the +system call
  • +
+

 

+

An interesting development of the VDSO is the virtual system calls +(vsyscalls) which run directly from user space. These vsyscalls are +also part of VDSO and they are accessing data from the VDSO page that +is either static or modified by the kernel in a separate read-write +map of the VDSO page. Examples of system calls that can be implemented +as vsyscalls are: getpid or gettimeofday.

+
    +
  • "System calls" that run directly from user space, part of the VDSO
  • +
  • Static data (e.g. getpid())
  • +
  • Dynamic data updated by the kernel in a RW map of the VDSO +(e.g. gettimeofday(), time())
  • +
+
+
+

Accessing user space from system calls

+

As we mentioned earlier, user space must be accessed with special APIs +(get_user(), put_user(), copy_from_user(), +copy_to_user()) that check whether the pointer is in user space +and also handle the fault if the pointer is invalid. In case of invalid +pointers, they return a non-zero value.

+
/* OK: return -EFAULT if user_ptr is invalid */
+if (copy_from_user(&kernel_buffer, user_ptr, size))
+    return -EFAULT;
+
+/* NOK: only works if user_ptr is valid otherwise crashes kernel */
+memcpy(&kernel_buffer, user_ptr, size);
+
+
+

Let's examine the simplest API, get_user, as implemented for x86:

+
#define get_user(x, ptr)                                          \
+({                                                                \
+  int __ret_gu;                                                   \
+  register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX);            \
+  __chk_user_ptr(ptr);                                            \
+  might_fault();                                                  \
+  asm volatile("call __get_user_%P4"                              \
+               : "=a" (__ret_gu), "=r" (__val_gu),                \
+                  ASM_CALL_CONSTRAINT                             \
+               : "0" (ptr), "i" (sizeof(*(ptr))));                \
+  (x) = (__force __typeof__(*(ptr))) __val_gu;                    \
+  __builtin_expect(__ret_gu, 0);                                  \
+})
+
+
+

The implementation uses inline assembly, which allows inserting ASM +sequences in C code and also handles access to/from variables in the +ASM code.

+

Based on the type size of the x variable, one of __get_user_1, +__get_user_2 or __get_user_4 will be called. Also, before executing +the assembly call, ptr will be moved to the first register EAX while +after the completion of assembly part the value of EAX will be moved +to __ret_gu and the EDX register will be moved to __val_gu.

+

It is equivalent to the following pseudo code:

+
#define get_user(x, ptr)                \
+    movl ptr, %eax                      \
+    call __get_user_1                   \
+    movl %edx, x                        \
+    movl %eax, result                   \
+
+
+

The __get_user_1 implementation for x86 is the following:

+
.text
+ENTRY(__get_user_1)
+    mov PER_CPU_VAR(current_task), %_ASM_DX
+    cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
+    jae bad_get_user
+    ASM_STAC
+1:  movzbl (%_ASM_AX),%edx
+    xor %eax,%eax
+    ASM_CLAC
+    ret
+ENDPROC(__get_user_1)
+
+bad_get_user:
+    xor %edx,%edx
+    mov $(-EFAULT),%_ASM_AX
+    ASM_CLAC
+    ret
+END(bad_get_user)
+
+_ASM_EXTABLE(1b,bad_get_user)
+
+
+

The first two statements check the pointer (which is stored in EAX) +against the addr_limit field of the current task (process) descriptor to +make sure that we don't have a pointer to kernel space.

+

Then, SMAP is disabled, to allow access to user space from the kernel, and the +access to user space is done with the instruction at the 1: label. EAX +is then zeroed to mark success, SMAP is re-enabled, and the call returns.

+

The movzbl instruction is the one that does the access to user space +and its address is captured with the 1: label and stored in a special +section:

+
/* Exception table entry */
+# define _ASM_EXTABLE_HANDLE(from, to, handler)           \
+  .pushsection "__ex_table","a" ;                         \
+  .balign 4 ;                                             \
+  .long (from) - . ;                                      \
+  .long (to) - . ;                                        \
+  .long (handler) - . ;                                   \
+  .popsection
+
+# define _ASM_EXTABLE(from, to)                           \
+  _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+
+

For each address that accesses user space we have an entry in the +exception table, that is made up of: the faulting address (from), where +to jump to in case of a fault (to), and a handler function (that implements +the jump logic). All of these addresses are stored as 32-bit offsets +relative to their own location in the exception table, so that they work for both 32 +and 64 bit kernels.

+

All of the exception table entries are then collected in the +__ex_table section by the linker script:

+
#define EXCEPTION_TABLE(align)                                    \
+  . = ALIGN(align);                                               \
+  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {               \
+          VMLINUX_SYMBOL(__start___ex_table) = .;                 \
+          KEEP(*(__ex_table))                                     \
+          VMLINUX_SYMBOL(__stop___ex_table) = .;                  \
+  }
+
+
+

The section is guarded with __start___ex_table and __stop___ex_table +symbols, so that it is easy to find the data from C code. This table +is accessed by the fault handler:

+
bool ex_handler_default(const struct exception_table_entry *fixup,
+                        struct pt_regs *regs, int trapnr)
+{
+    regs->ip = ex_fixup_addr(fixup);
+    return true;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+    const struct exception_table_entry *e;
+    ex_handler_t handler;
+
+    e = search_exception_tables(regs->ip);
+    if (!e)
+        return 0;
+
+    handler = ex_fixup_handler(e);
+    return handler(e, regs, trapnr);
+}
+
+
+

All it does is set the return address to the one in the to field of +the exception table entry which, in the case of the get_user exception +table entry, is bad_get_user, which returns -EFAULT to the caller.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec3-processes-slides.html b/refs/pull/405/merge/so2/lec3-processes-slides.html new file mode 100644 index 00000000..5f8f3870 --- /dev/null +++ b/refs/pull/405/merge/so2/lec3-processes-slides.html @@ -0,0 +1,996 @@ + + + + + + + + SO2 Lecture 03 - Processes — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 03 - Processes

+ + + + + +
+
+ +

Processes and threads

+ +
    +
  • Process and threads
  • +
  • Context switching
  • +
  • Blocking and waking up
  • +
  • Process context
  • +
+ + + + +
+
+ +

What is a process?

+ +
    +
  • An address space
  • +
  • One or more threads
  • +
  • Opened files
  • +
  • Sockets
  • +
  • Semaphores
  • +
+
    +
  • Shared memory regions
  • +
  • Timers
  • +
  • Signal handlers
  • +
  • Many other resources and status information
  • +
+
+

All this information is grouped in the Process Control Block +(PCB). In Linux this is struct task_struct.

+ + + + +
+
+ +

Overview of process resources

+ +
                +-------------------------------------------------------------------+
+                | dr-x------    2 tavi tavi 0  2021 03 14 12:34 .                   |
+                | dr-xr-xr-x    6 tavi tavi 0  2021 03 14 12:34 ..                  |
+                | lrwx------    1 tavi tavi 64 2021 03 14 12:34 0 -> /dev/pts/4     |
+           +--->| lrwx------    1 tavi tavi 64 2021 03 14 12:34 1 -> /dev/pts/4     |
+           |    | lrwx------    1 tavi tavi 64 2021 03 14 12:34 2 -> /dev/pts/4     |
+           |    | lr-x------    1 tavi tavi 64 2021 03 14 12:34 3 -> /proc/18312/fd |
+           |    +-------------------------------------------------------------------+
+           |                 +----------------------------------------------------------------+
+           |                 | 08048000-0804c000 r-xp 00000000 08:02 16875609 /bin/cat        |
+$ ls -1 /proc/self/          | 0804c000-0804d000 rw-p 00003000 08:02 16875609 /bin/cat        |
+cmdline    |                 | 0804d000-0806e000 rw-p 0804d000 00:00 0 [heap]                 |
+cwd        |                 | ...                                                            |
+environ    |    +----------->| b7f46000-b7f49000 rw-p b7f46000 00:00 0                        |
+exe        |    |            | b7f59000-b7f5b000 rw-p b7f59000 00:00 0                        |
+fd --------+    |            | b7f5b000-b7f77000 r-xp 00000000 08:02 11601524 /lib/ld-2.7.so  |
+fdinfo          |            | b7f77000-b7f79000 rw-p 0001b000 08:02 11601524 /lib/ld-2.7.so  |
+maps -----------+            | bfa05000-bfa1a000 rw-p bffeb000 00:00 0 [stack]                |
+mem                          | ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]                 |
+root                         +----------------------------------------------------------------+
+stat                 +----------------------------+
+statm                |  Name: cat                 |
+status ------+       |  State: R (running)        |
+task         |       |  Tgid: 18205               |
+wchan        +------>|  Pid: 18205                |
+                     |  PPid: 18133               |
+                     |  Uid: 1000 1000 1000 1000  |
+                     |  Gid: 1000 1000 1000 1000  |
+                     +----------------------------+
+
+
+ + + + +
+
+ +

struct task_struct

+ +
$ pahole -C task_struct vmlinux
+
+struct task_struct {
+    struct thread_info thread_info;                  /*     0     8 */
+    volatile long int          state;                /*     8     4 */
+    void *                     stack;                /*    12     4 */
+
+    ...
+
+    /* --- cacheline 45 boundary (2880 bytes) --- */
+    struct thread_struct thread __attribute__((__aligned__(64))); /*  2880  4288 */
+
+    /* size: 7168, cachelines: 112, members: 155 */
+    /* sum members: 7148, holes: 2, sum holes: 12 */
+    /* sum bitfield members: 7 bits, bit holes: 2, sum bit holes: 57 bits */
+    /* paddings: 1, sum paddings: 2 */
+    /* forced alignments: 6, forced holes: 2, sum forced holes: 12 */
+} __attribute__((__aligned__(64)));
+
+
+ + + + +
+
+ +

Inspecting task_struct

+ +

 

+ + + + +
+
+ +

Quiz: Inspect opened files

+ +

Use the debugger to inspect the process named syslogd.

+
    +
  • What command should we use to list the opened file descriptors?
  • +
  • How many file descriptors are opened?
  • +
  • What command should we use to determine the file name for opened file descriptor 3?
  • +
  • What is the filename for file descriptor 3?
  • +
+ + + + +
+
+ +

Threads

+ +
    +
  • Each thread has its own stack and together with the register +values it determines the thread execution state
  • +
  • A thread runs in the context of a process and all threads in the +same process share the resources
  • +
  • The kernel schedules threads not processes and user-level threads +(e.g. fibers, coroutines, etc.) are not visible at the kernel level
  • +
+ + + + +
+
+ +

Classic implementation (Windows)

+ +

 

+../_images/ditaa-4b5c1874d3924d9716f26d4893a3e4f313bf1c43.png + + + + +
+
+ +

Linux implementation

+ +

 

+../_images/ditaa-fd771038e88b95def30ae9bd4df0b7bd6b7b3503.png + + + + +
+
+ +

The clone system call

+ +
    +
  • CLONE_FILES - shares the file descriptor table with the parent
  • +
  • CLONE_VM - shares the address space with the parent
  • +
  • CLONE_FS - shares the filesystem information (root directory, +current directory) with the parent
  • +
  • CLONE_NEWNS - does not share the mount namespace with the parent
  • +
  • CLONE_NEWIPC - does not share the IPC namespace (System V IPC +objects, POSIX message queues) with the parent
  • +
  • CLONE_NEWNET - does not share the networking namespaces (network +interfaces, routing table) with the parent
  • +
+ + + + +
+
+ +

Namespaces and "containers"

+ +
    +
  • Containers = a form of lightweight virtual machines
  • +
  • Container based technologies: LXC, docker
  • +
  • Containers are built on top of kernel namespaces
  • +
  • Kernel namespaces allow isolation of otherwise globally visible +resources
  • +
  • struct nsproxy has multiple namespaces each of which +can be selectively shared between groups of processes
  • +
  • At boot initial namespaces are created (e.g. init_net) +that are by default shared between new processes (e.g. list of +available network interfaces)
  • +
  • New namespaces can be created at runtime and new processes can +point to these new namespaces
  • +
+ + + + +
+
+ +

Accessing the current process

+ +

Accessing the current process is a frequent operation:

+
    +
  • opening a file needs access to struct task_struct's +file field
  • +
  • mapping a new file needs access to struct task_struct's +mm field
  • +
  • Over 90% of the system calls need to access the current process +structure so it needs to be fast
  • +
  • The current macro is available to access the current +process's struct task_struct
  • +
+ + + + +
+
+ +

Accessing the current process on x86

+ +

 

+../_images/ditaa-019489e686a2f60f1594e37458cfcb10320eae0f.png + + + + +
+
+ +

Previous implementation for current (x86)

+ +
/* how to get the current stack pointer from C */
+register unsigned long current_stack_pointer asm("esp") __attribute_used__;
+
+/* how to get the thread information struct from C */
+static inline struct thread_info *current_thread_info(void)
+{
+   return (struct thread_info *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
+#define current current_thread_info()->task
+
+
+ + + + +
+
+ +

Quiz: previous implementation for current (x86)

+ +

What is the size of struct thread_info?

+

Which of the following are potential valid sizes for +struct thread_info: 4095, 4096, 4097?

+ + + + +
+
+ +

Overview of the context switching process

+ +../_images/ditaa-f6b228332baf165f498d8a1bb0bc0bdb91ae50c5.png + + + + +
+
+ +

context_switch

+ +
static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+         struct task_struct *next, struct rq_flags *rf)
+{
+    prepare_task_switch(rq, prev, next);
+
+    /*
+     * For paravirt, this is coupled with an exit in switch_to to
+     * combine the page table reload and the switch backend into
+     * one hypercall.
+     */
+    arch_start_context_switch(prev);
+
+    /*
+     * kernel -> kernel   lazy + transfer active
+     *   user -> kernel   lazy + mmgrab() active
+     *
+     * kernel ->   user   switch + mmdrop() active
+     *   user ->   user   switch
+     */
+    if (!next->mm) {                                // to kernel
+        enter_lazy_tlb(prev->active_mm, next);
+
+        next->active_mm = prev->active_mm;
+        if (prev->mm)                           // from user
+            mmgrab(prev->active_mm);
+        else
+            prev->active_mm = NULL;
+    } else {                                        // to user
+        membarrier_switch_mm(rq, prev->active_mm, next->mm);
+        /*
+         * sys_membarrier() requires an smp_mb() between setting
+         * rq->curr / membarrier_switch_mm() and returning to userspace.
+         *
+         * The below provides this either through switch_mm(), or in
+         * case 'prev->active_mm == next->mm' through
+         * finish_task_switch()'s mmdrop().
+         */
+        switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+        if (!prev->mm) {                        // from kernel
+            /* will mmdrop() in finish_task_switch(). */
+            rq->prev_mm = prev->active_mm;
+            prev->active_mm = NULL;
+        }
+    }
+
+    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+
+    prepare_lock_switch(rq, next, rf);
+
+    /* Here we just switch the register state and the stack. */
+    switch_to(prev, next, prev);
+    barrier();
+
+    return finish_task_switch(prev);
+  }
+
+
+ + + + +
+
+ +

switch_to

+ +
#define switch_to(prev, next, last)               \
+do {                                              \
+    ((last) = __switch_to_asm((prev), (next)));   \
+} while (0)
+
+
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+.pushsection .text, "ax"
+SYM_CODE_START(__switch_to_asm)
+    /*
+     * Save callee-saved registers
+     * This must match the order in struct inactive_task_frame
+     */
+    pushl   %ebp
+    pushl   %ebx
+    pushl   %edi
+    pushl   %esi
+    /*
+     * Flags are saved to prevent AC leakage. This could go
+     * away if objtool would have 32bit support to verify
+     * the STAC/CLAC correctness.
+     */
+    pushfl
+
+    /* switch stack */
+    movl    %esp, TASK_threadsp(%eax)
+    movl    TASK_threadsp(%edx), %esp
+
+  #ifdef CONFIG_STACKPROTECTOR
+    movl    TASK_stack_canary(%edx), %ebx
+    movl    %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+  #endif
+
+  #ifdef CONFIG_RETPOLINE
+    /*
+     * When switching from a shallower to a deeper call stack
+     * the RSB may either underflow or use entries populated
+     * with userspace addresses. On CPUs where those concerns
+     * exist, overwrite the RSB with entries which capture
+     * speculative execution to prevent attack.
+     */
+    FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+    #endif
+
+    /* Restore flags or the incoming task to restore AC state. */
+    popfl
+    /* restore callee-saved registers */
+    popl    %esi
+    popl    %edi
+    popl    %ebx
+    popl    %ebp
+
+    jmp     __switch_to
+  SYM_CODE_END(__switch_to_asm)
+  .popsection
+
+
+ + + + +
+
+ +

Inspecting task_struct

+ +

 

+ + + + +
+
+ +

Quiz: context switch

+ +

We are executing a context switch. Select all of the statements that are true.

+
    +
  • the ESP register is saved in the task structure
  • +
  • the EIP register is saved in the task structure
  • +
  • general registers are saved in the task structure
  • +
  • the ESP register is saved on the stack
  • +
  • the EIP register is saved on the stack
  • +
  • general registers are saved on the stack
  • +
+ + + + +
+
+ +

Task states

+ +../_images/ditaa-0b8cde2be9bbd195ac9dcaeac978a8bbe0d3b805.png + + + + +
+
+ +

Blocking the current thread

+ +
    +
  • Set the current thread state to TASK_UNINTERRUPTIBLE or +TASK_INTERRUPTIBLE
  • +
  • Add the task to a waiting queue
  • +
  • Call the scheduler which will pick up a new task from the READY +queue
  • +
  • Do the context switch to the new task
  • +
+ + + + +
+
+ +

wait_event

+ +
/**
+ * wait_event - sleep until a condition gets true
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ */
+#define wait_event(wq_head, condition)            \
+do {                                              \
+  might_sleep();                                  \
+  if (condition)                                  \
+          break;                                  \
+  __wait_event(wq_head, condition);               \
+} while (0)
+
+#define __wait_event(wq_head, condition)                                  \
+    (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,   \
+                        schedule())
+
+/*
+ * The below macro ___wait_event() has an explicit shadow of the __ret
+ * variable when used from the wait_event_*() macros.
+ *
+ * This is so that both can use the ___wait_cond_timeout() construct
+ * to wrap the condition.
+ *
+ * The type inconsistency of the wait_event_*() __ret variable is also
+ * on purpose; we use long where we can return timeout values and int
+ * otherwise.
+ */
+#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)    \
+({                                                                       \
+    __label__ __out;                                                     \
+    struct wait_queue_entry __wq_entry;                                  \
+    long __ret = ret;       /* explicit shadow */                        \
+                                                                         \
+    init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);     \
+    for (;;) {                                                           \
+        long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
+                                                                         \
+        if (condition)                                                   \
+            break;                                                       \
+                                                                         \
+        if (___wait_is_interruptible(state) && __int) {                  \
+            __ret = __int;                                               \
+            goto __out;                                                  \
+        }                                                                \
+                                                                         \
+        cmd;                                                             \
+    }                                                                    \
+    finish_wait(&wq_head, &__wq_entry);                                  \
+   __out:  __ret;                                                        \
+ })
+
+ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
+ {
+    wq_entry->flags = flags;
+    wq_entry->private = current;
+    wq_entry->func = autoremove_wake_function;
+    INIT_LIST_HEAD(&wq_entry->entry);
+ }
+
+ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+ {
+     unsigned long flags;
+     long ret = 0;
+
+     spin_lock_irqsave(&wq_head->lock, flags);
+     if (signal_pending_state(state, current)) {
+         /*
+          * Exclusive waiter must not fail if it was selected by wakeup,
+          * it should "consume" the condition we were waiting for.
+          *
+          * The caller will recheck the condition and return success if
+          * we were already woken up, we can not miss the event because
+          * wakeup locks/unlocks the same wq_head->lock.
+          *
+          * But we need to ensure that set-condition + wakeup after that
+          * can't see us, it should wake up another exclusive waiter if
+          * we fail.
+          */
+         list_del_init(&wq_entry->entry);
+         ret = -ERESTARTSYS;
+     } else {
+         if (list_empty(&wq_entry->entry)) {
+             if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
+                 __add_wait_queue_entry_tail(wq_head, wq_entry);
+             else
+                 __add_wait_queue(wq_head, wq_entry);
+         }
+         set_current_state(state);
+     }
+     spin_unlock_irqrestore(&wq_head->lock, flags);
+
+     return ret;
+ }
+
+ static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     list_add(&wq_entry->entry, &wq_head->head);
+ }
+
+ static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     list_add_tail(&wq_entry->entry, &wq_head->head);
+ }
+
+ /**
+  * finish_wait - clean up after waiting in a queue
+  * @wq_head: waitqueue waited on
+  * @wq_entry: wait descriptor
+  *
+  * Sets current thread back to running state and removes
+  * the wait descriptor from the given waitqueue if still
+  * queued.
+  */
+ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     unsigned long flags;
+
+     __set_current_state(TASK_RUNNING);
+     /*
+      * We can check for list emptiness outside the lock
+      * IFF:
+      *  - we use the "careful" check that verifies both
+      *    the next and prev pointers, so that there cannot
+      *    be any half-pending updates in progress on other
+      *    CPU's that we haven't seen yet (and that might
+      *    still change the stack area.
+      * and
+      *  - all other users take the lock (ie we can only
+      *    have _one_ other CPU that looks at or modifies
+      *    the list).
+      */
+     if (!list_empty_careful(&wq_entry->entry)) {
+         spin_lock_irqsave(&wq_head->lock, flags);
+         list_del_init(&wq_entry->entry);
+         spin_unlock_irqrestore(&wq_head->lock, flags);
+     }
+ }
+
+
+ + + + +
+
+ +

Waking up a task

+ +
    +
  • Select a task from the waiting queue
  • +
  • Set the task state to TASK_READY
  • +
  • Insert the task into the scheduler's READY queue
  • +
  • On SMP systems this is a complex operation: each processor has its +own queue, queues need to be balanced, CPUs need to be signaled
  • +
+ + + + +
+
+ +

wake_up

+ +
#define wake_up(x)                        __wake_up(x, TASK_NORMAL, 1, NULL)
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @wq_head: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+               int nr_exclusive, void *key)
+{
+    __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+                  int nr_exclusive, int wake_flags, void *key)
+{
+  unsigned long flags;
+  wait_queue_entry_t bookmark;
+
+  bookmark.flags = 0;
+  bookmark.private = NULL;
+  bookmark.func = NULL;
+  INIT_LIST_HEAD(&bookmark.entry);
+
+  do {
+          spin_lock_irqsave(&wq_head->lock, flags);
+          nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+                                          wake_flags, key, &bookmark);
+          spin_unlock_irqrestore(&wq_head->lock, flags);
+  } while (bookmark.flags & WQ_FLAG_BOOKMARK);
+}
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+                            int nr_exclusive, int wake_flags, void *key,
+                  wait_queue_entry_t *bookmark)
+{
+    wait_queue_entry_t *curr, *next;
+    int cnt = 0;
+
+    lockdep_assert_held(&wq_head->lock);
+
+    if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+          curr = list_next_entry(bookmark, entry);
+
+          list_del(&bookmark->entry);
+          bookmark->flags = 0;
+    } else
+          curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+    if (&curr->entry == &wq_head->head)
+          return nr_exclusive;
+
+    list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
+          unsigned flags = curr->flags;
+          int ret;
+
+          if (flags & WQ_FLAG_BOOKMARK)
+                  continue;
+
+          ret = curr->func(curr, mode, wake_flags, key);
+          if (ret < 0)
+                  break;
+          if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+                  break;
+
+          if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+                          (&next->entry != &wq_head->head)) {
+                  bookmark->flags = WQ_FLAG_BOOKMARK;
+                  list_add_tail(&bookmark->entry, &next->entry);
+                  break;
+          }
+    }
+
+    return nr_exclusive;
+}
+
+int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+{
+    int ret = default_wake_function(wq_entry, mode, sync, key);
+
+    if (ret)
+        list_del_init_careful(&wq_entry->entry);
+
+    return ret;
+}
+
+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
+                    void *key)
+{
+    WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+    return try_to_wake_up(curr->private, mode, wake_flags);
+}
+
+/**
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Conceptually does:
+ *
+ *   If (@state & @p->state) @p->state = TASK_RUNNING.
+ *
+ * If the task was not queued/runnable, also place it back on a runqueue.
+ *
+ * This function is atomic against schedule() which would dequeue the task.
+ *
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
+ *
+ * Relies on p->pi_lock stabilizing:
+ *  - p->sched_class
+ *  - p->cpus_ptr
+ *  - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
+ *  - ttwu_queue()       -- new rq, for enqueue of the task;
+ *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
+ *
+ * Return: %true if @p->state changes (an actual wakeup was done),
+ *           %false otherwise.
+ */
+ static int
+ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+ {
+     ...
+
+
+ + + + +
+
+ +

Non preemptive kernel

+ +
    +
  • At every tick the kernel checks to see if the current process has +its time slice consumed
  • +
  • If that happens a flag is set in interrupt context
  • +
  • Before returning to userspace the kernel checks this flag and +calls schedule() if needed
  • +
  • In this case tasks are not preempted while running in kernel mode +(e.g. system call) so there are no synchronization issues
  • +
+ + + + +
+
+ +

Preemptive kernel

+ +
    +
  • Tasks can be preempted even when running in kernel mode
  • +
  • It requires new synchronization primitives to be used in critical +sections: preempt_disable and +preempt_enable
  • +
  • Spinlocks also disable preemption
  • +
  • When a thread needs to be preempted a flag is set and action is +taken (e.g. scheduler is called) when preemption is reactivated
  • +
+ + + + +
+
+ +

Process context

+ +

The kernel is executing in process context when it is running a +system call.

+

In process context there is a well defined context and we can +access the current process data with current

+

In process context we can sleep (wait on a condition).

+

In process context we can access the user-space (unless we are +running in a kernel thread context).

+ + + + +
+
+ +

Kernel threads

+ +

Sometimes the kernel core or device drivers need to perform blocking +operations and thus they need to run in process context.

+

Kernel threads are used exactly for this and are a special class of +tasks that don't have "userspace" resources (e.g. no address space or +opened files).

+ + + + +
+
+ +

Inspecting kernel threads

+ +

 

+ + + + +
+
+ +

Quiz: Kernel gdb scripts

+ +

What is the following change of the lx-ps script trying to +accomplish?

+
diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py
+index 17ec19e9b5bf..7e43c163832f 100644
+--- a/scripts/gdb/linux/tasks.py
++++ b/scripts/gdb/linux/tasks.py
+@@ -75,10 +75,13 @@ class LxPs(gdb.Command):
+     def invoke(self, arg, from_tty):
+         gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM"))
+         for task in task_lists():
+-            gdb.write("{} {:^5} {}\n".format(
++            check = task["mm"].format_string() == "0x0"
++            gdb.write("{} {:^5} {}{}{}\n".format(
+                 task.format_string().split()[0],
+                 task["pid"].format_string(),
+-                task["comm"].string()))
++                "[" if check else "",
++                task["comm"].string(),
++                "]" if check else ""))
+
+
+ LxPs()
+
+
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec3-processes.html b/refs/pull/405/merge/so2/lec3-processes.html new file mode 100644 index 00000000..dd483dbb --- /dev/null +++ b/refs/pull/405/merge/so2/lec3-processes.html @@ -0,0 +1,1119 @@ + + + + + + SO2 Lecture 03 - Processes — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 03 - Processes

+

View slides

+
+

Lecture objectives

+
    +
  • Process and threads
  • +
  • Context switching
  • +
  • Blocking and waking up
  • +
  • Process context
  • +
+
+
+

Processes and threads

+

A process is an operating system abstraction that groups together +multiple resources:

+
    +
  • An address space
  • +
  • One or more threads
  • +
  • Opened files
  • +
  • Sockets
  • +
  • Semaphores
  • +
+
    +
  • Shared memory regions
  • +
  • Timers
  • +
  • Signal handlers
  • +
  • Many other resources and status information
  • +
+
+

All this information is grouped in the Process Control Block +(PCB). In Linux this is struct task_struct.

+
+

Overview of process resources

+

A summary of the resources a process has can be obtained from the +/proc/<pid> directory, where <pid> is the process id for the +process we want to look at.

+
                +-------------------------------------------------------------------+
+                | dr-x------    2 tavi tavi 0  2021 03 14 12:34 .                   |
+                | dr-xr-xr-x    6 tavi tavi 0  2021 03 14 12:34 ..                  |
+                | lrwx------    1 tavi tavi 64 2021 03 14 12:34 0 -> /dev/pts/4     |
+           +--->| lrwx------    1 tavi tavi 64 2021 03 14 12:34 1 -> /dev/pts/4     |
+           |    | lrwx------    1 tavi tavi 64 2021 03 14 12:34 2 -> /dev/pts/4     |
+           |    | lr-x------    1 tavi tavi 64 2021 03 14 12:34 3 -> /proc/18312/fd |
+           |    +-------------------------------------------------------------------+
+           |                 +----------------------------------------------------------------+
+           |                 | 08048000-0804c000 r-xp 00000000 08:02 16875609 /bin/cat        |
+$ ls -1 /proc/self/          | 0804c000-0804d000 rw-p 00003000 08:02 16875609 /bin/cat        |
+cmdline    |                 | 0804d000-0806e000 rw-p 0804d000 00:00 0 [heap]                 |
+cwd        |                 | ...                                                            |
+environ    |    +----------->| b7f46000-b7f49000 rw-p b7f46000 00:00 0                        |
+exe        |    |            | b7f59000-b7f5b000 rw-p b7f59000 00:00 0                        |
+fd --------+    |            | b7f5b000-b7f77000 r-xp 00000000 08:02 11601524 /lib/ld-2.7.so  |
+fdinfo          |            | b7f77000-b7f79000 rw-p 0001b000 08:02 11601524 /lib/ld-2.7.so  |
+maps -----------+            | bfa05000-bfa1a000 rw-p bffeb000 00:00 0 [stack]                |
+mem                          | ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]                 |
+root                         +----------------------------------------------------------------+
+stat                 +----------------------------+
+statm                |  Name: cat                 |
+status ------+       |  State: R (running)        |
+task         |       |  Tgid: 18205               |
+wchan        +------>|  Pid: 18205                |
+                     |  PPid: 18133               |
+                     |  Uid: 1000 1000 1000 1000  |
+                     |  Gid: 1000 1000 1000 1000  |
+                     +----------------------------+
+
+
+
+
+

struct task_struct

+

Let's take a close look at struct task_struct. For that we +could just look at the source code, but here we will use a tool called +pahole (part of the dwarves install package) in order to get +some insights about this structure:

+
$ pahole -C task_struct vmlinux
+
+struct task_struct {
+    struct thread_info thread_info;                  /*     0     8 */
+    volatile long int          state;                /*     8     4 */
+    void *                     stack;                /*    12     4 */
+
+    ...
+
+    /* --- cacheline 45 boundary (2880 bytes) --- */
+    struct thread_struct thread __attribute__((__aligned__(64))); /*  2880  4288 */
+
+    /* size: 7168, cachelines: 112, members: 155 */
+    /* sum members: 7148, holes: 2, sum holes: 12 */
+    /* sum bitfield members: 7 bits, bit holes: 2, sum bit holes: 57 bits */
+    /* paddings: 1, sum paddings: 2 */
+    /* forced alignments: 6, forced holes: 2, sum forced holes: 12 */
+} __attribute__((__aligned__(64)));
+
+
+

As you can see it is a pretty large data structure: almost 8KB in size +and 155 fields.

+
+
+

Inspecting task_struct

+

The following screencast is going to demonstrate how we can inspect +the process control block (struct task_struct) by connecting +the debugger to the running virtual machine. We are going to use a +helper gdb command lx-ps to list the processes and the address of +the task_struct for each process.

+

 

+
+
+

Quiz: Inspect a task to determine opened files

+

Use the debugger to inspect the process named syslogd.

+
    +
  • What command should we use to list the opened file descriptors?
  • +
  • How many file descriptors are opened?
  • +
  • What command should we use to determine the file name for opened file descriptor 3?
  • +
  • What is the filename for file descriptor 3?
  • +
+
+
+

Threads

+

A thread is the basic unit that the kernel process scheduler uses to +allow applications to run on the CPU. A thread has the following +characteristics:

+
    +
  • Each thread has its own stack and together with the register +values it determines the thread execution state
  • +
  • A thread runs in the context of a process and all threads in the +same process share the resources
  • +
  • The kernel schedules threads not processes and user-level threads +(e.g. fibers, coroutines, etc.) are not visible at the kernel level
  • +
+

The typical thread implementation is one where the thread is +implemented as a separate data structure which is then linked to the +process data structure. For example, the Windows kernel uses such an +implementation:

+

 

+../_images/ditaa-4b5c1874d3924d9716f26d4893a3e4f313bf1c43.png +

Linux uses a different implementation for threads. The basic unit is +called a task (hence the struct task_struct) and it is used +for both threads and processes. Instead of embedding resources in the +task structure it has pointers to these resources.

+

Thus, if two threads are in the same process they will point to the same +resource structure instance. If two threads are in different processes +they will point to different resource structure instances.

+

 

+../_images/ditaa-fd771038e88b95def30ae9bd4df0b7bd6b7b3503.png +
+
+

The clone system call

+

In Linux a new thread or process is created with the clone() +system call. Both the fork() system call and the +pthread_create() function use the clone() +implementation.

+

It allows the caller to decide what resources should be shared with +the parent and which should be copied or isolated:

+
    +
  • CLONE_FILES - shares the file descriptor table with the parent
  • +
  • CLONE_VM - shares the address space with the parent
  • +
  • CLONE_FS - shares the filesystem information (root directory, +current directory) with the parent
  • +
  • CLONE_NEWNS - does not share the mount namespace with the parent
  • +
  • CLONE_NEWIPC - does not share the IPC namespace (System V IPC +objects, POSIX message queues) with the parent
  • +
  • CLONE_NEWNET - does not share the networking namespaces (network +interfaces, routing table) with the parent
  • +
+

For example, if CLONE_FILES | CLONE_VM | CLONE_FS is used by the +caller then effectively a new thread is created. If these flags are +not used then a new process is created.

+
+
+

Namespaces and "containers"

+

"Containers" are a form of lightweight virtual machines that share the +same kernel instance, as opposed to normal virtualization where a +hypervisor runs multiple VMs, each with its one kernel +instance.

+

Examples of container technologies are LXC - that allows running +lightweight "VM" and docker - a specialized container for running a +single application.

+

Containers are built on top of a few kernel features, one of which is +namespaces. They allow isolation of different resources that would +otherwise be globally visible. For example, without containers, all +processes would be visible in /proc. With containers, processes in one +container will not be visible (in /proc or be killable) to other +containers.

+

To achieve this partitioning, the struct nsproxy structure +is used to group types of resources that we want to partition. It +currently supports IPC, networking, cgroup, mount, PID, and +time namespaces. For example, instead of having a global list for +networking interfaces, the list is part of a struct net. The +system initializes with a default namespace (init_net) and by +default all processes will share this namespace. When a new namespace +is created a new net namespace is created and then new processes can +point to that new namespace instead of the default one.

+
+
+

Accessing the current process

+

Accessing the current process is a frequent operation:

+
    +
  • opening a file needs access to struct task_struct's +file field
  • +
  • mapping a new file needs access to struct task_struct's +mm field
  • +
  • Over 90% of the system calls need to access the current process +structure so it needs to be fast
  • +
  • The current macro is available to access the current +process's struct task_struct
  • +
+

In order to support fast access in multi processor configurations a +per CPU variable is used to store and retrieve the pointer to the +current struct task_struct:

+

 

+../_images/ditaa-019489e686a2f60f1594e37458cfcb10320eae0f.png +

Previously the following sequence was used as the implementation for +the current macro:

+
/* how to get the current stack pointer from C */
+register unsigned long current_stack_pointer asm("esp") __attribute_used__;
+
+/* how to get the thread information struct from C */
+static inline struct thread_info *current_thread_info(void)
+{
+   return (struct thread_info *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
+#define current current_thread_info()->task
+
+
+
+
+

Quiz: previous implementation for current (x86)

+

What is the size of struct thread_info?

+

Which of the following are potential valid sizes for +struct thread_info: 4095, 4096, 4097?

+
+
+
+

Context switching

+

The following diagram shows an overview of the Linux kernel context +switch process:

+../_images/ditaa-f6b228332baf165f498d8a1bb0bc0bdb91ae50c5.png +

Note that before a context switch can occur we must do a kernel +transition, either with a system call or with an interrupt. At that +point the user space registers are saved on the kernel stack. At some +point the schedule() function will be called which can decide +that a context switch must occur from T0 to T1 (e.g. because the +current thread is blocking waiting for an I/O operation to complete or +because its allocated time slice has expired).

+

At that point context_switch() will perform architecture +specific operations and will switch the address space if needed:

+
static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+         struct task_struct *next, struct rq_flags *rf)
+{
+    prepare_task_switch(rq, prev, next);
+
+    /*
+     * For paravirt, this is coupled with an exit in switch_to to
+     * combine the page table reload and the switch backend into
+     * one hypercall.
+     */
+    arch_start_context_switch(prev);
+
+    /*
+     * kernel -> kernel   lazy + transfer active
+     *   user -> kernel   lazy + mmgrab() active
+     *
+     * kernel ->   user   switch + mmdrop() active
+     *   user ->   user   switch
+     */
+    if (!next->mm) {                                // to kernel
+        enter_lazy_tlb(prev->active_mm, next);
+
+        next->active_mm = prev->active_mm;
+        if (prev->mm)                           // from user
+            mmgrab(prev->active_mm);
+        else
+            prev->active_mm = NULL;
+    } else {                                        // to user
+        membarrier_switch_mm(rq, prev->active_mm, next->mm);
+        /*
+         * sys_membarrier() requires an smp_mb() between setting
+         * rq->curr / membarrier_switch_mm() and returning to userspace.
+         *
+         * The below provides this either through switch_mm(), or in
+         * case 'prev->active_mm == next->mm' through
+         * finish_task_switch()'s mmdrop().
+         */
+        switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+        if (!prev->mm) {                        // from kernel
+            /* will mmdrop() in finish_task_switch(). */
+            rq->prev_mm = prev->active_mm;
+            prev->active_mm = NULL;
+        }
+    }
+
+    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+
+    prepare_lock_switch(rq, next, rf);
+
+    /* Here we just switch the register state and the stack. */
+    switch_to(prev, next, prev);
+    barrier();
+
+    return finish_task_switch(prev);
+  }
+
+
+

Then it will call the architecture specific switch_to +implementation to switch the registers state and kernel stack. Note +that registers are saved on stack and that the stack pointer is saved +in the task structure:

+
#define switch_to(prev, next, last)               \
+do {                                              \
+    ((last) = __switch_to_asm((prev), (next)));   \
+} while (0)
+
+
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+.pushsection .text, "ax"
+SYM_CODE_START(__switch_to_asm)
+    /*
+     * Save callee-saved registers
+     * This must match the order in struct inactive_task_frame
+     */
+    pushl   %ebp
+    pushl   %ebx
+    pushl   %edi
+    pushl   %esi
+    /*
+     * Flags are saved to prevent AC leakage. This could go
+     * away if objtool would have 32bit support to verify
+     * the STAC/CLAC correctness.
+     */
+    pushfl
+
+    /* switch stack */
+    movl    %esp, TASK_threadsp(%eax)
+    movl    TASK_threadsp(%edx), %esp
+
+  #ifdef CONFIG_STACKPROTECTOR
+    movl    TASK_stack_canary(%edx), %ebx
+    movl    %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+  #endif
+
+  #ifdef CONFIG_RETPOLINE
+    /*
+     * When switching from a shallower to a deeper call stack
+     * the RSB may either underflow or use entries populated
+     * with userspace addresses. On CPUs where those concerns
+     * exist, overwrite the RSB with entries which capture
+     * speculative execution to prevent attack.
+     */
+    FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+    #endif
+
+    /* Restore flags or the incoming task to restore AC state. */
+    popfl
+    /* restore callee-saved registers */
+    popl    %esi
+    popl    %edi
+    popl    %ebx
+    popl    %ebp
+
+    jmp     __switch_to
+  SYM_CODE_END(__switch_to_asm)
+  .popsection
+
+
+

You can notice that the instruction pointer is not explicitly +saved. It is not needed because:

+
+
    +
  • a task will always resume in this function
  • +
  • the schedule() (context_switch() is always +inlined) caller's return address is saved on the kernel stack
  • +
  • a jmp is used to execute __switch_to() which is a function +and when it returns it will pop the original (next task) return +address from the stack
  • +
+
+

The following screencast uses the debugger to set up a breakpoint in +__switch_to_asm and examine the stack during the context switch:

+

 

+
+

Quiz: context switch

+

We are executing a context switch. Select all of the statements that are true.

+
    +
  • the ESP register is saved in the task structure
  • +
  • the EIP register is saved in the task structure
  • +
  • general registers are saved in the task structure
  • +
  • the ESP register is saved on the stack
  • +
  • the EIP register is saved on the stack
  • +
  • general registers are saved on the stack
  • +
+
+
+
+

Blocking and waking up tasks

+
+

Task states

+

The following diagram shows the task (thread) states and the +possible transitions between them:

+../_images/ditaa-0b8cde2be9bbd195ac9dcaeac978a8bbe0d3b805.png +
+
+

Blocking the current thread

+

Blocking the current thread is an important operation we need to +perform to implement efficient task scheduling - we want to run other +threads while I/O operations complete.

+

In order to accomplish this the following operations take place:

+
    +
  • Set the current thread state to TASK_UNINTERRUPTIBLE or +TASK_INTERRUPTIBLE
  • +
  • Add the task to a waiting queue
  • +
  • Call the scheduler which will pick up a new task from the READY +queue
  • +
  • Do the context switch to the new task
  • +
+

Below are some snippets for the wait_event +implementation. Note that the waiting queue is a list with some extra +information like a pointer to the task struct.

+

Also note that a lot of effort is put into making sure no deadlock can +occur between wait_event and wake_up: the task +is added to the list before checking condition, signals are +checked before calling schedule().

+
/**
+ * wait_event - sleep until a condition gets true
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ */
+#define wait_event(wq_head, condition)            \
+do {                                              \
+  might_sleep();                                  \
+  if (condition)                                  \
+          break;                                  \
+  __wait_event(wq_head, condition);               \
+} while (0)
+
+#define __wait_event(wq_head, condition)                                  \
+    (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,   \
+                        schedule())
+
+/*
+ * The below macro ___wait_event() has an explicit shadow of the __ret
+ * variable when used from the wait_event_*() macros.
+ *
+ * This is so that both can use the ___wait_cond_timeout() construct
+ * to wrap the condition.
+ *
+ * The type inconsistency of the wait_event_*() __ret variable is also
+ * on purpose; we use long where we can return timeout values and int
+ * otherwise.
+ */
+#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)    \
+({                                                                       \
+    __label__ __out;                                                     \
+    struct wait_queue_entry __wq_entry;                                  \
+    long __ret = ret;       /* explicit shadow */                        \
+                                                                         \
+    init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);     \
+    for (;;) {                                                           \
+        long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
+                                                                         \
+        if (condition)                                                   \
+            break;                                                       \
+                                                                         \
+        if (___wait_is_interruptible(state) && __int) {                  \
+            __ret = __int;                                               \
+            goto __out;                                                  \
+        }                                                                \
+                                                                         \
+        cmd;                                                             \
+    }                                                                    \
+    finish_wait(&wq_head, &__wq_entry);                                  \
+   __out:  __ret;                                                        \
+ })
+
+ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
+ {
+    wq_entry->flags = flags;
+    wq_entry->private = current;
+    wq_entry->func = autoremove_wake_function;
+    INIT_LIST_HEAD(&wq_entry->entry);
+ }
+
+ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+ {
+     unsigned long flags;
+     long ret = 0;
+
+     spin_lock_irqsave(&wq_head->lock, flags);
+     if (signal_pending_state(state, current)) {
+         /*
+          * Exclusive waiter must not fail if it was selected by wakeup,
+          * it should "consume" the condition we were waiting for.
+          *
+          * The caller will recheck the condition and return success if
+          * we were already woken up, we can not miss the event because
+          * wakeup locks/unlocks the same wq_head->lock.
+          *
+          * But we need to ensure that set-condition + wakeup after that
+          * can't see us, it should wake up another exclusive waiter if
+          * we fail.
+          */
+         list_del_init(&wq_entry->entry);
+         ret = -ERESTARTSYS;
+     } else {
+         if (list_empty(&wq_entry->entry)) {
+             if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
+                 __add_wait_queue_entry_tail(wq_head, wq_entry);
+             else
+                 __add_wait_queue(wq_head, wq_entry);
+         }
+         set_current_state(state);
+     }
+     spin_unlock_irqrestore(&wq_head->lock, flags);
+
+     return ret;
+ }
+
+ static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     list_add(&wq_entry->entry, &wq_head->head);
+ }
+
+ static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     list_add_tail(&wq_entry->entry, &wq_head->head);
+ }
+
+ /**
+  * finish_wait - clean up after waiting in a queue
+  * @wq_head: waitqueue waited on
+  * @wq_entry: wait descriptor
+  *
+  * Sets current thread back to running state and removes
+  * the wait descriptor from the given waitqueue if still
+  * queued.
+  */
+ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ {
+     unsigned long flags;
+
+     __set_current_state(TASK_RUNNING);
+     /*
+      * We can check for list emptiness outside the lock
+      * IFF:
+      *  - we use the "careful" check that verifies both
+      *    the next and prev pointers, so that there cannot
+      *    be any half-pending updates in progress on other
+      *    CPU's that we haven't seen yet (and that might
+      *    still change the stack area.
+      * and
+      *  - all other users take the lock (ie we can only
+      *    have _one_ other CPU that looks at or modifies
+      *    the list).
+      */
+     if (!list_empty_careful(&wq_entry->entry)) {
+         spin_lock_irqsave(&wq_head->lock, flags);
+         list_del_init(&wq_entry->entry);
+         spin_unlock_irqrestore(&wq_head->lock, flags);
+     }
+ }
+
+
+
+
+

Waking up a task

+

We can wake-up tasks by using the wake_up primitive. The +following high level operations are performed to wake up a task:

+
    +
  • Select a task from the waiting queue
  • +
  • Set the task state to TASK_READY
  • +
  • Insert the task into the scheduler's READY queue
  • +
  • On SMP systems this is a complex operation: each processor has its +own queue, queues need to be balanced, CPUs need to be signaled
  • +
+
#define wake_up(x)                        __wake_up(x, TASK_NORMAL, 1, NULL)
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @wq_head: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+               int nr_exclusive, void *key)
+{
+    __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+                  int nr_exclusive, int wake_flags, void *key)
+{
+  unsigned long flags;
+  wait_queue_entry_t bookmark;
+
+  bookmark.flags = 0;
+  bookmark.private = NULL;
+  bookmark.func = NULL;
+  INIT_LIST_HEAD(&bookmark.entry);
+
+  do {
+          spin_lock_irqsave(&wq_head->lock, flags);
+          nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+                                          wake_flags, key, &bookmark);
+          spin_unlock_irqrestore(&wq_head->lock, flags);
+  } while (bookmark.flags & WQ_FLAG_BOOKMARK);
+}
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+                            int nr_exclusive, int wake_flags, void *key,
+                  wait_queue_entry_t *bookmark)
+{
+    wait_queue_entry_t *curr, *next;
+    int cnt = 0;
+
+    lockdep_assert_held(&wq_head->lock);
+
+    if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+          curr = list_next_entry(bookmark, entry);
+
+          list_del(&bookmark->entry);
+          bookmark->flags = 0;
+    } else
+          curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+    if (&curr->entry == &wq_head->head)
+          return nr_exclusive;
+
+    list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
+          unsigned flags = curr->flags;
+          int ret;
+
+          if (flags & WQ_FLAG_BOOKMARK)
+                  continue;
+
+          ret = curr->func(curr, mode, wake_flags, key);
+          if (ret < 0)
+                  break;
+          if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+                  break;
+
+          if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+                          (&next->entry != &wq_head->head)) {
+                  bookmark->flags = WQ_FLAG_BOOKMARK;
+                  list_add_tail(&bookmark->entry, &next->entry);
+                  break;
+          }
+    }
+
+    return nr_exclusive;
+}
+
+int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+{
+    int ret = default_wake_function(wq_entry, mode, sync, key);
+
+    if (ret)
+        list_del_init_careful(&wq_entry->entry);
+
+    return ret;
+}
+
+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
+                    void *key)
+{
+    WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+    return try_to_wake_up(curr->private, mode, wake_flags);
+}
+
+/**
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Conceptually does:
+ *
+ *   If (@state & @p->state) @p->state = TASK_RUNNING.
+ *
+ * If the task was not queued/runnable, also place it back on a runqueue.
+ *
+ * This function is atomic against schedule() which would dequeue the task.
+ *
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
+ *
+ * Relies on p->pi_lock stabilizing:
+ *  - p->sched_class
+ *  - p->cpus_ptr
+ *  - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
+ *  - ttwu_queue()       -- new rq, for enqueue of the task;
+ *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
+ *
+ * Return: %true if @p->state changes (an actual wakeup was done),
+ *           %false otherwise.
+ */
+ static int
+ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+ {
+     ...
+
+
+
+
+
+

Preempting tasks

+

Up until this point we looked at how context switches occur voluntarily +between threads. Next we will look at how preemption is handled. We +will start with the simpler case where the kernel is configured as +non preemptive and then we will move to the preemptive kernel case.

+
+

Non preemptive kernel

+
    +
  • At every tick the kernel checks to see if the current process has +its time slice consumed
  • +
  • If that happens a flag is set in interrupt context
  • +
  • Before returning to userspace the kernel checks this flag and +calls schedule() if needed
  • +
  • In this case tasks are not preempted while running in kernel mode +(e.g. system call) so there are no synchronization issues
  • +
+
+
+

Preemptive kernel

+

In this case the current task can be preempted even if we are running +in kernel mode and executing a system call. This requires using +special synchronization primitives: preempt_disable and +preempt_enable.

+

In order to simplify handling for preemptive kernels and since +synchronization primitives are needed for the SMP case anyway, +preemption is disabled automatically when a spinlock is used.

+

As before, if we run into a condition that requires the preemption of +the current task (its time slice has expired) a flag is set. This +flag is checked whenever the preemption is reactivated, e.g. when +exiting a critical section through a spin_unlock() and if +needed the scheduler is called to select a new task.

+
+
+
+

Process context

+

Now that we have examined the implementation of processes and threads +(tasks), how context switching occurs, how we can block, wake-up and +preempt tasks, we can finally define what the process context is and what +its properties are:

+

The kernel is executing in process context when it is running a +system call.

+

In process context there is a well defined context and we can +access the current process data with current

+

In process context we can sleep (wait on a condition).

+

In process context we can access the user-space (unless we are +running in a kernel thread context).

+
+

Kernel threads

+

Sometimes the kernel core or device drivers need to perform blocking +operations and thus they need to run in process context.

+

Kernel threads are used exactly for this and are a special class of +tasks that don't have "userspace" resources (e.g. no address space or +opened files).

+

The following screencast takes a closer look at kernel threads:

+

 

+
+
+
+

Using gdb scripts for kernel inspection

+

The Linux kernel comes with a predefined set of gdb extra commands we +can use to inspect the kernel during debugging. They will +automatically be loaded as long as gdbinit is properly set up:

+
ubuntu@so2:/linux/tools/labs$ cat ~/.gdbinit
+add-auto-load-safe-path /linux/scripts/gdb/vmlinux-gdb.py
+
+
+

All of the kernel specific commands are prefixed with lx-. You can use +TAB in gdb to list all of them:

+
(gdb) lx-
+lx-clk-summary        lx-dmesg              lx-mounts
+lx-cmdline            lx-fdtdump            lx-ps
+lx-configdump         lx-genpd-summary      lx-symbols
+lx-cpus               lx-iomem              lx-timerlist
+lx-device-list-bus    lx-ioports            lx-version
+lx-device-list-class  lx-list-check
+lx-device-list-tree   lx-lsmod
+
+
+

The implementation of the commands can be found at +scripts/gdb/linux. Let's take a closer look at the lx-ps +implementation:

+
task_type = utils.CachedType("struct task_struct")
+
+
+def task_lists():
+ task_ptr_type = task_type.get_type().pointer()
+ init_task = gdb.parse_and_eval("init_task").address
+ t = g = init_task
+
+ while True:
+     while True:
+         yield t
+
+         t = utils.container_of(t['thread_group']['next'],
+                                task_ptr_type, "thread_group")
+         if t == g:
+             break
+
+     t = g = utils.container_of(g['tasks']['next'],
+                                task_ptr_type, "tasks")
+     if t == init_task:
+         return
+
+
+ class LxPs(gdb.Command):
+ """Dump Linux tasks."""
+
+ def __init__(self):
+     super(LxPs, self).__init__("lx-ps", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, from_tty):
+     gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM"))
+     for task in task_lists():
+         gdb.write("{} {:^5} {}\n".format(
+             task.format_string().split()[0],
+             task["pid"].format_string(),
+             task["comm"].string()))
+
+
+
+

Quiz: Kernel gdb scripts

+

What is the following change of the lx-ps script trying to +accomplish?

+
diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py
+index 17ec19e9b5bf..7e43c163832f 100644
+--- a/scripts/gdb/linux/tasks.py
++++ b/scripts/gdb/linux/tasks.py
+@@ -75,10 +75,13 @@ class LxPs(gdb.Command):
+     def invoke(self, arg, from_tty):
+         gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM"))
+         for task in task_lists():
+-            gdb.write("{} {:^5} {}\n".format(
++            check = task["mm"].format_string() == "0x0"
++            gdb.write("{} {:^5} {}{}{}\n".format(
+                 task.format_string().split()[0],
+                 task["pid"].format_string(),
+-                task["comm"].string()))
++                "[" if check else "",
++                task["comm"].string(),
++                "]" if check else ""))
+
+
+ LxPs()
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec4-interrupts-slides.html b/refs/pull/405/merge/so2/lec4-interrupts-slides.html new file mode 100644 index 00000000..0d2932d8 --- /dev/null +++ b/refs/pull/405/merge/so2/lec4-interrupts-slides.html @@ -0,0 +1,638 @@ + + + + + + + + SO2 Lecture 04 - Interrupts — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 04 - Interrupts

+ + + + + +
+
+ +

Interrupts

+ +
    +
  • Interrupts and exceptions (x86)
  • +
  • Interrupts and exceptions (Linux)
  • +
  • Deferrable work
  • +
  • Timers
  • +
+ + + + +
+
+ +

Interrupts

+ +
    +
  • synchronous, generated by executing an instruction
  • +
  • asynchronous, generated by an external event
  • +
  • maskable
      +
    • can be ignored
    • +
    • signaled via INT pin
    • +
    +
  • +
  • non-maskable
      +
    • cannot be ignored
    • +
    • signaled via NMI pin
    • +
    +
  • +
+ + + + +
+
+ +

Exceptions

+ +
    +
  • processor detected
      +
    • faults
    • +
    • traps
    • +
    • aborts
    • +
    +
  • +
  • programmed
      +
    • int n
    • +
    +
  • +
+ + + + +
+
+ +

Quiz: interrupt terminology

+ +

For each of the following terms on the left select all the terms +from right that best describe them.

+
    +
  • Watchdog
  • +
  • Demand paging
  • +
  • Division by zero
  • +
  • Timer
  • +
  • System call
  • +
  • Breakpoint
  • +
+
    +
  • Exception
  • +
  • Interrupt
  • +
  • Maskable
  • +
  • Nonmaskable
  • +
  • Trap
  • +
  • Fault
  • +
+
+ + + + +
+
+ +

Programmable Interrupt Controller

+ +

 

+../_images/ditaa-5db1739b80a83b12505e4ff749b5e69fccd01f1b.png + + + + +
+
+ +

Interrupt controllers in SMP systems

+ +

 

+../_images/ditaa-9d23d02ebdff6eeb6bec8044480f055de9852ecc.png + + + + +
+
+ +

Enabling/disabling the interrupts

+ +
    +
  • at the device level
      +
    • by programming the device control registers
    • +
    +
  • +
  • at the PIC level
      +
    • PIC can be programmed to disable a given IRQ line
    • +
    +
  • +
  • at the CPU level; for example, on x86 one can use the following +instructions:
  • +
+
+
    +
  • cli (CLear Interrupt flag)
  • +
  • sti (SeT Interrupt flag)
  • +
+
+ + + + +
+
+ +

Interrupt priorities

+ +

 

+../_images/ditaa-8b00a68b494f72d54b5fad38c88f7265aadaaa0e.png + + + + +
+
+ +

Quiz: hardware concepts

+ +

Which of the following statements are true?

+
    +
  • The CPU can start processing a new interrupt before the current +one is finished
  • +
  • Interrupts can be disabled at the device level
  • +
  • Lower priority interrupts can not preempt handlers for higher +priority interrupts
  • +
  • Interrupts can be disabled at the interrupt controller level
  • +
  • On SMP systems the same interrupt can be routed to different CPUs
  • +
  • Interrupts can be disabled at the CPU level
  • +
+ + + + +
+
+ +

Interrupt Descriptor Table

+ +
    +
  • it is used as a jump table by the CPU when a given vector is triggered
  • +
  • it is an array of 256 x 8 bytes entries
  • +
  • may reside anywhere in physical memory
  • +
  • processor locates IDT by the means of IDTR
  • +
+ + + + +
+
+ +

Linux IRQ vector layout

+ +

 

+../_images/ditaa-5b3c93f6e612d0cc0e4d4837d92a443627405262.png + + + + +
+
+ +

Interrupt descriptor table entry (gate)

+ +

 

+../_images/ditaa-eff5e0e3b58ce239d5310b22b89c0927be5853bd.png + + + + +
+
+ +

Interrupt handler address

+ +

 

+../_images/ditaa-b2023fce22479e20bbe08fd76eed87e9a0527688.png + + + + +
+
+ +

Interrupt handler stack

+ +

 

+../_images/ditaa-85b69602726fa6143fc3ba0ffdb492454864aacf.png + + + + +
+
+ +

Handling an interrupt request

+ +
    +
  • CPU checks the current privilege level

    +
  • +
  • if need to change privilege level

    +
    +
      +
    • change stack with the one associated with new privilege
    • +
    • save old stack information on the new stack
    • +
    +
    +
  • +
  • save EFLAGS, CS, EIP on stack

    +
  • +
  • save error code on stack in case of an abort

    +
  • +
  • execute the kernel interrupt handler

    +
  • +
+ + + + +
+
+ +

Returning from an interrupt

+ +
    +
  • pop the error code (in case of an abort)
  • +
  • call IRET
      +
    • pops values from the stack and restore the following register: CS, EIP, EFLAGS
    • +
    • if privilege level changed returns to the old stack and old privilege level
    • +
    +
  • +
+ + + + +
+
+ +

Inspecting the x86 interrupt handling

+ +

 

+ + + + +
+
+ +

Quiz: x86 interrupt handling

+ +

The following gdb commands are used to determine the handler for +the int80 based system call exception. Select and arrange the +commands or output of the commands in the correct order.

+
(void *) 0xc15de780 <entry_SYSENTER_32>
+
+set $idtr_addr=($idtr_entry>>48<<16)|($idtr_entry&0xffff)
+
+print (void*)$idtr_addr
+
+set $idtr = 0xff800000
+
+(void *) 0xc15de874 <entry_INT80_32>
+
+set $idtr = 0xff801000
+
+set $idtr_entry = *(uint64_t*)($idtr + 8 * 128)
+
+monitor info registers
+
+
+ + + + +
+
+ +

Interrupt handling in Linux

+ +

 

+../_images/ditaa-da31e3d17a4d55e5c3dbc0bd5903306418a896ca.png + + + + +
+
+ +

IRQ and exception nesting in Linux

+ +
    +
  • an exception (e.g. page fault, system call) can not preempt an interrupt; +if that occurs it is considered a bug
  • +
  • an interrupt can preempt an exception
  • +
  • an interrupt can not preempt another interrupt (it used to be possible)
  • +
+ + + + +
+
+ +

Interrupt/Exception nesting

+ +

 

+../_images/ditaa-2e49ca6ac606dab4b2b53231cfbe85ff06312d36.png + + + + +
+
+ +

Interrupt context

+ +
+
    +
  • it runs as a result of an IRQ (not of an exception)
  • +
  • there is no well defined process context associated
  • +
  • not allowed to trigger a context switch (no sleep, schedule, or user memory access)
  • +
+
+ + + + +
+
+ +

Deferrable actions

+ +
+
    +
  • Schedule callback functions to run at a later time
  • +
  • Interrupt context deferrable actions
  • +
  • Process context deferrable actions
  • +
  • APIs for initialization, scheduling, and masking
  • +
+
+ + + + +
+
+ +

Soft IRQs

+ +
+

Soft IRQ APIs:

+
+
    +
  • initialize: open_softirq()
  • +
  • activation: raise_softirq()
  • +
  • masking: local_bh_disable(), local_bh_enable()
  • +
+
+

Once activated, the callback function do_softirq() runs either:

+
+
    +
  • after an interrupt handler or
  • +
  • from the ksoftirqd kernel thread
  • +
+
+
+ + + + +
+
+ +

ksoftirqd

+ +
+
    +
  • minimum priority kernel thread
  • +
  • runs softirqs after certain limits are reached
  • +
  • tries to achieve good latency and avoid process starvation
  • +
+
+ + + + +
+
+ +

Types of soft IRQs

+ +
/* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
+   frequency threaded job scheduling. For almost all the purposes
+   tasklets are more than enough. F.e. all serial device BHs et
+   al. should be converted to tasklets, not to softirqs.
+*/
+
+enum
+{
+  HI_SOFTIRQ=0,
+  TIMER_SOFTIRQ,
+  NET_TX_SOFTIRQ,
+  NET_RX_SOFTIRQ,
+  BLOCK_SOFTIRQ,
+  IRQ_POLL_SOFTIRQ,
+  TASKLET_SOFTIRQ,
+  SCHED_SOFTIRQ,
+  HRTIMER_SOFTIRQ,
+  RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */
+
+  NR_SOFTIRQS
+};
+
+
+ + + + +
+
+ +

Packet flood example

+ +

 

+ + + + +
+
+ +

Tasklets

+ +

Tasklets are a dynamic type (not limited to a fixed number) of +deferred work running in interrupt context.

+

Tasklets API:

+
+
    +
  • initialization: tasklet_init()
  • +
  • activation: tasklet_schedule()
  • +
  • masking: tasklet_disable(), tasklet_enable()
  • +
+
+

Tasklets are implemented on top of two dedicated softirqs: +TASKLET_SOFTIRQ and HI_SOFTIRQ

+

Tasklets are also serialized, i.e. the same tasklet can only execute on one processor.

+ + + + +
+
+ +

Workqueues

+ +

Workqueues are a type of deferred work that runs in process context.

+

They are implemented on top of kernel threads.

+

Workqueues API:

+
+
    +
  • init: INIT_WORK
  • +
  • activation: schedule_work()
  • +
+
+ + + + +
+
+ +

Timers

+ +
+

Timers are implemented on top of the TIMER_SOFTIRQ

+

Timer API:

+
    +
  • initialization: setup_timer()
  • +
  • activation: mod_timer()
  • +
+
+ + + + +
+
+ +

Deferrable actions summary

+ +
+
    +
  • softIRQ
      +
    • runs in interrupt context
    • +
    • statically allocated
    • +
    • same handler may run in parallel on multiple cores
    • +
    +
  • +
  • tasklet
      +
    • runs in interrupt context
    • +
    • can be dynamically allocated
    • +
    • same handler runs are serialized
    • +
    +
  • +
  • workqueues
      +
    • run in process context
    • +
    +
  • +
+
+ + + + +
+
+ +

Quiz: Linux interrupt handling

+ +

Which of the following phases of interrupt handling runs with +interrupts disabled at the CPU level?

+
    +
  • Critical
  • +
  • Immediate
  • +
  • Deferred
  • +
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec4-interrupts.html b/refs/pull/405/merge/so2/lec4-interrupts.html new file mode 100644 index 00000000..f9ab1e6f --- /dev/null +++ b/refs/pull/405/merge/so2/lec4-interrupts.html @@ -0,0 +1,790 @@ + + + + + + SO2 Lecture 04 - Interrupts — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 04 - Interrupts

+

View slides

+
+

Lecture objectives

+
    +
  • Interrupts and exceptions (x86)
  • +
  • Interrupts and exceptions (Linux)
  • +
  • Deferrable work
  • +
  • Timers
  • +
+
+
+

What is an interrupt?

+

An interrupt is an event that alters the normal execution flow of a +program and can be generated by hardware devices or even by the CPU +itself. When an interrupt occurs the current flow of execution is +suspended and interrupt handler runs. After the interrupt handler runs +the previous execution flow is resumed.

+

Interrupts can be grouped into two categories based on the source of +the interrupt. They can also be grouped into two other categories based +on the ability to postpone or temporarily disable the interrupt:

+
    +
  • synchronous, generated by executing an instruction
  • +
  • asynchronous, generated by an external event
  • +
  • maskable
      +
    • can be ignored
    • +
    • signaled via INT pin
    • +
    +
  • +
  • non-maskable
      +
    • cannot be ignored
    • +
    • signaled via NMI pin
    • +
    +
  • +
+

Synchronous interrupts, usually named exceptions, handle conditions detected by the +processor itself in the course of executing an instruction. Divide by zero or +a system call are examples of exceptions.

+

Asynchronous interrupts, usually named interrupts, are external events generated +by I/O devices. For example a network card generates an interrupt to signal +that a packet has arrived.

+

Most interrupts are maskable, which means we can temporarily postpone +running the interrupt handler when we disable the interrupt until the +time the interrupt is re-enabled. However, there are a few critical +interrupts that can not be disabled/postponed.

+
+

Exceptions

+

There are two sources for exceptions:

+
    +
  • processor detected
      +
    • faults
    • +
    • traps
    • +
    • aborts
    • +
    +
  • +
  • programmed
      +
    • int n
    • +
    +
  • +
+

Processor detected exceptions are raised when an abnormal condition is +detected while executing an instruction.

+

A fault is a type of exception that is reported before the execution of the +instruction and can usually be corrected. The saved EIP is the address of +the instruction that caused the fault, so after the fault is corrected +the program can re-execute the faulty instruction (e.g. page fault).

+

A trap is a type of exception that is reported after the execution of the +instruction in which the exception was detected. The saved EIP is the address +of the instruction after the instruction that caused the trap (e.g. debug trap).

+
+
+

Quiz: interrupt terminology

+

For each of the following terms on the left select all the terms +from right that best describe them.

+
    +
  • Watchdog
  • +
  • Demand paging
  • +
  • Division by zero
  • +
  • Timer
  • +
  • System call
  • +
  • Breakpoint
  • +
+
    +
  • Exception
  • +
  • Interrupt
  • +
  • Maskable
  • +
  • Nonmaskable
  • +
  • Trap
  • +
  • Fault
  • +
+
+
+
+
+

Hardware Concepts

+
+

Programmable Interrupt Controller

+

 

+../_images/ditaa-5db1739b80a83b12505e4ff749b5e69fccd01f1b.png +

A device supporting interrupts has an output pin used for signaling an Interrupt ReQuest. IRQ +pins are connected to a device named Programmable Interrupt Controller (PIC) which is connected +to CPU's INTR pin.

+

A PIC usually has a set of ports used to exchange information with the CPU. When a device +connected to one of the PIC's IRQ lines needs CPU attention the following flow happens:

+
+
    +
  • device raises an interrupt on the corresponding IRQn pin
  • +
  • PIC converts the IRQ into a vector number and writes it to a port for CPU to read
  • +
  • PIC raises an interrupt on CPU INTR pin
  • +
  • PIC waits for CPU to acknowledge an interrupt before raising another interrupt
  • +
  • CPU acknowledges the interrupt then it starts handling the interrupt
  • +
+
+

We will see later how the CPU handles the interrupt. Notice that by +design PIC won't raise another interrupt until the CPU has acknowledged +the current interrupt.

+
+

Note

+

Once the interrupt is acknowledged by the CPU the interrupt +controller can request another interrupt, regardless of whether the CPU +has finished handling the previous interrupt or not. Thus, depending on +how the OS controls the CPU it is possible to have nested +interrupts.

+
+

The interrupt controller allows each IRQ line to be individually +disabled. This allows simplifying design by making sure that interrupt +handlers are always executed serially.

+
+
+

Interrupt controllers in SMP systems

+

In SMP systems we may have multiple interrupt controllers in the +system.

+

For example, on the x86 architecture each core has a local APIC used +to process interrupts from locally connected devices like timers or +thermal sensors. Then there is an I/O APIC that is used to distribute IRQs +from external devices to CPU cores.

+

 

+../_images/ditaa-9d23d02ebdff6eeb6bec8044480f055de9852ecc.png +
+
+

Interrupt Control

+

In order to synchronize access to shared data between the interrupt handler +and other potential concurrent activities such as driver initialization or +driver data processing, it is often required to enable and disable interrupts in +a controlled fashion.

+

This can be accomplished at several levels:

+
    +
  • at the device level
      +
    • by programming the device control registers
    • +
    +
  • +
  • at the PIC level
      +
    • PIC can be programmed to disable a given IRQ line
    • +
    +
  • +
  • at the CPU level; for example, on x86 one can use the following +instructions:
  • +
+
+
    +
  • cli (CLear Interrupt flag)
  • +
  • sti (SeT Interrupt flag)
  • +
+
+
+
+

Interrupt priorities

+

Most architectures also support interrupt priorities. When this is +enabled, it permits interrupt nesting only for those interrupts that +have a higher priority than the current priority level.

+

 

+../_images/ditaa-8b00a68b494f72d54b5fad38c88f7265aadaaa0e.png +
+

Note

+

Not all architectures support interrupt priorities. It is also +difficult to define a generic scheme for interrupt +priorities for general use OSes and some kernels (Linux included) +do not use interrupt priorities. On the other hand most RTOS use +interrupt priorities since they are typically used in more +constrained use-cases where it is easier to define interrupt +priorities.

+
+
+
+

Quiz: hardware concepts

+

Which of the following statements are true?

+
    +
  • The CPU can start processing a new interrupt before the current +one is finished
  • +
  • Interrupts can be disabled at the device level
  • +
  • Lower priority interrupts can not preempt handlers for higher +priority interrupts
  • +
  • Interrupts can be disabled at the interrupt controller level
  • +
  • On SMP systems the same interrupt can be routed to different CPUs
  • +
  • Interrupts can be disabled at the CPU level
  • +
+
+
+
+

Interrupt handling on the x86 architecture

+

This section will examine how interrupts are handled by the CPU on the +x86 architecture.

+
+

Interrupt Descriptor Table

+

The interrupt descriptor table (IDT) associates each interrupt or exception +identifier with a descriptor for the instructions that service the associated +event. We will name the identifier as vector number and the associated +instructions as interrupt/exception handler.

+

An IDT has the following characteristics:

+
    +
  • it is used as a jump table by the CPU when a given vector is triggered
  • +
  • it is an array of 256 x 8 bytes entries
  • +
  • may reside anywhere in physical memory
  • +
  • processor locates IDT by the means of IDTR
  • +
+

Below we can find Linux IRQ vector layout. The first 32 entries are reserved +for exceptions, vector 128 is used for syscall interface and the rest are +used mostly for hardware interrupts handlers.

+

 

+../_images/ditaa-5b3c93f6e612d0cc0e4d4837d92a443627405262.png +

On x86 an IDT entry has 8 bytes and it is named gate. There can be 3 types of gates:

+
+
    +
  • interrupt gate, holds the address of an interrupt or exception handler. +Jumping to the handler disables maskable interrupts (IF flag is cleared).
  • +
  • trap gates, similar to an interrupt gate but it does not disable maskable +interrupts while jumping to interrupt/exception handler.
  • +
  • task gates (not used in Linux)
  • +
+
+

Let's have a look at several fields of an IDT entry:

+
+
    +
  • segment selector, index into GDT/LDT to find the start of the code segment where +the interrupt handlers reside
  • +
  • offset, offset inside the code segment
  • +
  • T, represents the type of gate
  • +
  • DPL, minimum privilege required for using the segments content.
  • +
+
+

 

+../_images/ditaa-eff5e0e3b58ce239d5310b22b89c0927be5853bd.png +
+
+

Interrupt handler address

+

In order to find the interrupt handler address we first need to find the start +address of the code segment where interrupt handler resides. For this we +use the segment selector to index into GDT/LDT where we can find the corresponding +segment descriptor. This will provide the start address kept in the 'base' field. +Using base address and the offset we can now go to the start of the interrupt handler.

+

 

+../_images/ditaa-b2023fce22479e20bbe08fd76eed87e9a0527688.png +
+
+

Stack of interrupt handler

+

Similar to control transfer to a normal function, a control transfer +to an interrupt or exception handler uses the stack to store the +information needed for returning to the interrupted code.

+

As can be seen in the figure below, an interrupt pushes the EFLAGS register +before saving the address of the interrupted instruction. Certain types +of exceptions also cause an error code to be pushed on the stack to help +debug the exception.

+

 

+../_images/ditaa-85b69602726fa6143fc3ba0ffdb492454864aacf.png +
+
+

Handling an interrupt request

+

After an interrupt request has been generated the processor runs a sequence of +events that eventually end up with running the kernel interrupt handler:

+
    +
  • CPU checks the current privilege level

    +
  • +
  • if need to change privilege level

    +
    +
      +
    • change stack with the one associated with new privilege
    • +
    • save old stack information on the new stack
    • +
    +
    +
  • +
  • save EFLAGS, CS, EIP on stack

    +
  • +
  • save error code on stack in case of an abort

    +
  • +
  • execute the kernel interrupt handler

    +
  • +
+
+
+

Returning from an interrupt handler

+

Most architectures offer special instructions to clean up the stack and resume +the execution after the interrupt handler has been executed. On x86 IRET is used +to return from an interrupt handler. IRET is similar to RET except that IRET +increments ESP by extra four bytes (because of the flags on stack) and moves the +saved flags into EFLAGS register.

+

To resume the execution after an interrupt the following sequence is used (x86):

+
    +
  • pop the error code (in case of an abort)
  • +
  • call IRET
      +
    • pops values from the stack and restore the following register: CS, EIP, EFLAGS
    • +
    • if privilege level changed returns to the old stack and old privilege level
    • +
    +
  • +
+
+
+

Inspecting the x86 interrupt handling

+

 

+
+
+

Quiz: x86 interrupt handling

+

The following gdb commands are used to determine the handler for +the int80 based system call exception. Select and arrange the +commands or output of the commands in the correct order.

+
(void *) 0xc15de780 <entry_SYSENTER_32>
+
+set $idtr_addr=($idtr_entry>>48<<16)|($idtr_entry&0xffff)
+
+print (void*)$idtr_addr
+
+set $idtr = 0xff800000
+
+(void *) 0xc15de874 <entry_INT80_32>
+
+set $idtr = 0xff801000
+
+set $idtr_entry = *(uint64_t*)($idtr + 8 * 128)
+
+monitor info registers
+
+
+
+
+
+

Interrupt handling in Linux

+

In Linux the interrupt handling is done in three phases: critical, immediate and +deferred.

+

In the first phase the kernel will run the generic interrupt handler that +determines the interrupt number, the interrupt handler for this particular +interrupt and the interrupt controller. At this point any timing critical +actions will also be performed (e.g. acknowledge the interrupt at the interrupt +controller level). Local processor interrupts are disabled for the duration of +this phase and continue to be disabled in the next phase.

+

In the second phase, all of the device driver's handlers associated with this +interrupt will be executed. At the end of this phase, the interrupt controller's +"end of interrupt" method is called to allow the interrupt controller to +reassert this interrupt. The local processor interrupts are enabled at this +point.

+
+

Note

+

It is possible that one interrupt is associated with multiple +devices and in this case it is said that the interrupt is +shared. Usually, when using shared interrupts it is the +responsibility of the device driver to determine if the interrupt +is targeted at its device or not.

+
+

Finally, in the last phase of interrupt handling interrupt context deferrable +actions will be run. These are also sometimes known as "bottom half" of the +interrupt (the upper half being the part of the interrupt handling that runs +with interrupts disabled). At this point, interrupts are enabled on the local +processor.

+

 

+../_images/ditaa-da31e3d17a4d55e5c3dbc0bd5903306418a896ca.png +
+

Nested interrupts and exceptions

+

Linux used to support nested interrupts but this was removed some time +ago in order to avoid increasingly complex solutions to stack +overflow issues - allow just one level of nesting, allow multiple +levels of nesting up to a certain kernel stack depth, etc.

+

However, it is still possible to have nesting between exceptions and +interrupts but the rules are fairly restrictive:

+
    +
  • an exception (e.g. page fault, system call) can not preempt an interrupt; +if that occurs it is considered a bug
  • +
  • an interrupt can preempt an exception
  • +
  • an interrupt can not preempt another interrupt (it used to be possible)
  • +
+

The diagram below shows the possible nesting scenarios:

+

 

+../_images/ditaa-2e49ca6ac606dab4b2b53231cfbe85ff06312d36.png +
+
+

Interrupt context

+

While an interrupt is handled (from the time the CPU jumps to the interrupt +handler until the interrupt handler returns - e.g. IRET is issued) it is said +that code runs in "interrupt context".

+

Code that runs in interrupt context has the following properties:

+
+
    +
  • it runs as a result of an IRQ (not of an exception)
  • +
  • there is no well defined process context associated
  • +
  • not allowed to trigger a context switch (no sleep, schedule, or user memory access)
  • +
+
+
+
+

Deferrable actions

+

Deferrable actions are used to run callback functions at a later time. If +deferrable actions are scheduled from an interrupt handler, the associated callback +function will run after the interrupt handler has completed.

+

There are two large categories of deferrable actions: those that run in +interrupt context and those that run in process context.

+

The purpose of interrupt context deferrable actions is to avoid doing too much +work in the interrupt handler function. Running for too long with interrupts +disabled can have undesired effects such as increased latency or poor system +performance due to missing other interrupts (e.g. dropping network packets +because the CPU did not react in time to dequeue packets from the network +interface and the network card buffer is full).

+

Deferrable actions have APIs to: initialize an instance, activate or +schedule the action and mask/disable and unmask/enable the execution +of the callback function. The latter is used for synchronization purposes between +the callback function and other contexts.

+

Typically the device driver will initialize the deferrable action +structure during the device instance initialization and will activate +/ schedule the deferrable action from the interrupt handler.

+
+
+

Soft IRQs

+

Soft IRQs is the term used for the low-level mechanism that implements deferring +work from interrupt handlers but that still runs in interrupt context.

+
+

Soft IRQ APIs:

+
+
    +
  • initialize: open_softirq()
  • +
  • activation: raise_softirq()
  • +
  • masking: local_bh_disable(), local_bh_enable()
  • +
+
+

Once activated, the callback function do_softirq() runs either:

+
+
    +
  • after an interrupt handler or
  • +
  • from the ksoftirqd kernel thread
  • +
+
+
+

Since softirqs can reschedule themselves or other interrupts can occur that +reschedule them, they can potentially lead to (temporary) process starvation if +checks are not put into place. Currently, the Linux kernel does not allow +running soft irqs for more than MAX_SOFTIRQ_TIME or rescheduling for +more than MAX_SOFTIRQ_RESTART consecutive times.

+

Once these limits are reached a special kernel thread, ksoftirqd is woken up +and all of the rest of pending soft irqs will be run from the context of this +kernel thread.

+

Soft irqs usage is restricted, they are used by a handful of subsystems that have +low latency requirements and high frequency:

+
/* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
+   frequency threaded job scheduling. For almost all the purposes
+   tasklets are more than enough. F.e. all serial device BHs et
+   al. should be converted to tasklets, not to softirqs.
+*/
+
+enum
+{
+  HI_SOFTIRQ=0,
+  TIMER_SOFTIRQ,
+  NET_TX_SOFTIRQ,
+  NET_RX_SOFTIRQ,
+  BLOCK_SOFTIRQ,
+  IRQ_POLL_SOFTIRQ,
+  TASKLET_SOFTIRQ,
+  SCHED_SOFTIRQ,
+  HRTIMER_SOFTIRQ,
+  RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */
+
+  NR_SOFTIRQS
+};
+
+
+
+
+

Packet flood example

+

The following screencast will look at what happens when we flood the +system with a large number of packets. Since at least a part of the +packet processing is happening in softirq we should expect the CPU to +spend most of the time running softirqs but the majority of that +should be in the context of the ksoftirqd thread.

+

 

+
+
+

Tasklets

+

Tasklets are a dynamic type (not limited to a fixed number) of +deferred work running in interrupt context.

+

Tasklets API:

+
+
    +
  • initialization: tasklet_init()
  • +
  • activation: tasklet_schedule()
  • +
  • masking: tasklet_disable(), tasklet_enable()
  • +
+
+

Tasklets are implemented on top of two dedicated softirqs: +TASKLET_SOFTIRQ and HI_SOFTIRQ

+

Tasklets are also serialized, i.e. the same tasklet can only execute on one processor.

+
+
+

Workqueues

+
+

Workqueues are a type of deferred work that runs in process context.

+

They are implemented on top of kernel threads.

+

Workqueues API:

+
+
    +
  • init: INIT_WORK
  • +
  • activation: schedule_work()
  • +
+
+
+
+
+

Timers

+
+

Timers are implemented on top of the TIMER_SOFTIRQ

+

Timer API:

+
    +
  • initialization: setup_timer()
  • +
  • activation: mod_timer()
  • +
+
+
+
+

Deferrable actions summary

+

Here is a cheat sheet which summarizes Linux deferrable actions:

+
+
    +
  • softIRQ
      +
    • runs in interrupt context
    • +
    • statically allocated
    • +
    • same handler may run in parallel on multiple cores
    • +
    +
  • +
  • tasklet
      +
    • runs in interrupt context
    • +
    • can be dynamically allocated
    • +
    • same handler runs are serialized
    • +
    +
  • +
  • workqueues
      +
    • run in process context
    • +
    +
  • +
+
+
+
+

Quiz: Linux interrupt handling

+

Which of the following phases of interrupt handling runs with +interrupts disabled at the CPU level?

+
    +
  • Critical
  • +
  • Immediate
  • +
  • Deferred
  • +
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec5-smp-slides.html b/refs/pull/405/merge/so2/lec5-smp-slides.html new file mode 100644 index 00000000..75b38a6c --- /dev/null +++ b/refs/pull/405/merge/so2/lec5-smp-slides.html @@ -0,0 +1,821 @@ + + + + + + + + SO2 Lecture 05 - Symmetric Multi-Processing — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 05 - Symmetric Multi-Processing

+ + + + + +
+
+ +

Symmetric Multi-Processing

+ +
    +
  • Kernel Concurrency
  • +
  • Atomic operations
  • +
  • Spin locks
  • +
  • Cache thrashing
  • +
  • Optimized spin locks
  • +
  • Process and Interrupt Context Synchronization
  • +
  • Mutexes
  • +
  • Per CPU data
  • +
  • Memory Ordering and Barriers
  • +
  • Read-Copy Update
  • +
+ + + + +
+
+ +

Race conditions

+ +
    +
  • there are at least two execution contexts that run in "parallel":
      +
    • truly run in parallel (e.g. two system calls running on +different processors)
    • +
    • one of the contexts can arbitrary preempt the other (e.g. an +interrupt preempts a system call)
    • +
    +
  • +
  • the execution contexts perform read-write accesses to shared +memory
  • +
+ + + + +
+
+ +

Race condition: resource counter release

+ +
void release_resource()
+{
+     counter--;
+
+     if (!counter)
+         free_resource();
+}
+
+
+ + + + +
+
+ +

Race condition scenario

+ +

 

+../_images/ditaa-35f7597b35b83bb0025ac2a5f158c9eae23050c8.png + + + + +
+
+ +

Avoiding race conditions

+ +
    +
  • make the critical section atomic (e.g. use atomic +instructions)
  • +
  • disable preemption during the critical section (e.g. disable +interrupts, bottom-half handlers, or thread preemption)
  • +
  • serialize the access to the critical section (e.g. use spin +locks or mutexes to allow only one context or thread in the +critical section)
  • +
+ + + + +
+
+ +

Linux kernel concurrency sources

+ +
    +
  • single core systems, non-preemptive kernel: the current +process can be preempted by interrupts
  • +
  • single core systems, preemptive kernel: above + the +current process can be preempted by other processes
  • +
  • multi-core systems: above + the current process can run +in parallel with another process or with an interrupt running on +another processor
  • +
+ + + + +
+
+ +

Atomic operations

+ +
    +
  • integer based:
      +
    • simple: atomic_inc(), atomic_dec(), +atomic_add(), atomic_sub()
    • +
    • conditional: atomic_dec_and_test(), atomic_sub_and_test()
    • +
    +
  • +
  • bit based:
      +
    • simple: test_bit(), set_bit(), +change_bit()
    • +
    • conditional: test_and_set_bit(), test_and_clear_bit(), +test_and_change_bit()
    • +
    +
  • +
+ + + + +
+
+ +

Using atomic_dec_and_test() to implement resource counter release

+ +
void release_resource()
+{
+    if (atomic_dec_and_test(&counter))
+         free_resource();
+}
+
+
+ + + + +
+
+ +

Atomic operations may not be atomic on SMP systems

+ +

 

+../_images/ditaa-ddd14be50300088958e86912bc5f396797634a3a.png + + + + +
+
+ +

Fixing atomic operations for SMP systems (x86)

+ +

 

+../_images/ditaa-c11fccb956cdf115910f9f72e1dc14cd7ed549ff.png + + + + +
+
+ +

Synchronization with interrupts (x86)

+ +
 #define local_irq_disable() \
+     asm volatile ("cli" : : : "memory")
+
+#define local_irq_enable() \
+    asm volatile ("sti" : : : "memory")
+
+#define local_irq_save(flags) \
+    asm volatile ("pushf ; pop %0" :"=g" (flags)
+                  : /* no input */: "memory") \
+    asm volatile("cli": : :"memory")
+
+#define local_irq_restore(flags) \
+    asm volatile ("push %0 ; popf"
+                  : /* no output */
+                  : "g" (flags) :"memory", "cc");
+
+
+ + + + +
+
+ +

Spin Lock Implementation Example (x86)

+ +
spin_lock:
+    lock bts [my_lock], 0
+    jc spin_lock
+
+/* critical section */
+
+spin_unlock:
+    mov [my_lock], 0
+
+
+

bts dts, src - bit test and set; it copies the src bit from the dts +memory address to the carry flag and then sets it:

+
CF <- dts[src]
+dts[src] <- 1
+
+
+ + + + +
+
+ +

Lock Contention

+ +
    +
  • There is lock contention when at least one core spins trying to +enter the critical section lock
  • +
  • Lock contention grows with the critical section size, time spent +in the critical section and the number of cores in the system
  • +
+ + + + +
+
+ +

Cache Thrashing

+ +

Cache thrashing occurs when multiple cores are trying to read and +write to the same memory resulting in excessive cache misses.

+

Since spin locks continuously access memory during lock contention, +cache thrashing is a common occurrence due to the way cache +coherency is implemented.

+ + + + +
+
+ +

Synchronized caches and memory

+ +

 

+../_images/ditaa-4d63c157487ff8291f2a6e93fe680ec38c1a3212.png + + + + +
+
+ +

Unsynchronized caches and memory

+ +

 

+../_images/ditaa-7ee0f9bb5f5af586e043afd47cfbad0adcc34888.png + + + + +
+
+ +

Cache Coherency Protocols

+ +
    +
  • Bus snooping (sniffing) based: memory bus transactions are +monitored by caches and they take actions to preserve +coherency
  • +
  • Directory based: there is a separate entity (directory) that +maintains the state of caches; caches interact with directory +to preserve coherency
  • +
+

Bus snooping is simpler but it performs poorly when the number of +cores goes beyond 32-64.

+

Directory based cache coherence protocols scale much better (up +to thousands of cores) and are usually used in NUMA systems.

+ + + + +
+
+ +

MESI Cache Coherence Protocol

+ +
    +
  • Caching policy: write back
  • +
  • Cache line states
      +
    • Modified: owned by a single core and dirty
    • +
    • Exclusive: owned by a single core and clean
    • +
    • Shared: shared between multiple cores and clean
    • +
    • Invalid : the line is not cached
    • +
    +
  • +
+ + + + +
+
+ +

MESI State Transitions

+ +
    +
  • Invalid -> Exclusive: read request, all other cores have the line +in Invalid; line loaded from memory
  • +
  • Invalid -> Shared: read request, at least one core has the line +in Shared or Exclusive; line loaded from sibling cache
  • +
  • Invalid/Shared/Exclusive -> Modified: write request; all +other cores invalidate the line
  • +
  • Modified -> Invalid: write request from other core; line is +flushed to memory
  • +
+ + + + +
+
+ +

Cache thrashing due to spin lock contention

+ +

 

+../_images/ditaa-b26d802c286bda6c559b4dcfa8a7fb27f840463e.png + + + + +
+
+ +

Optimized spin lock (KeAcquireSpinLock)

+ +

 

+
spin_lock:
+    rep ; nop
+    test lock_addr, 1
+    jnz spin_lock
+    lock bts lock_addr
+    jc spin_lock
+
+
+
    +
  • we first test the lock read only, using non-atomic +instructions, to avoid writes and thus invalidate operations +while we spin
  • +
  • only when the lock might be free, we try to acquire it
  • +
+ + + + +
+
+ +

Queued Spin Locks

+ +

 

+../_images/ditaa-58545831034f050660727be99cede213bc4a53c7.png + + + + +
+
+ +

Process and Interrupt Handler Synchronization Deadlock

+ +
    +
  • In the process context we take the spin lock
  • +
  • An interrupt occurs and it is scheduled on the same CPU core
  • +
  • The interrupt handler runs and tries to take the spin lock
  • +
  • The current CPU will deadlock
  • +
+ + + + +
+
+ +

Interrupt Synchronization for SMP

+ +
    +
  • In process context: disable interrupts and acquire a spin lock; +this will protect both against interrupt or other CPU cores race +conditions (spin_lock_irqsave() and +spin_unlock_irqrestore() combine the two operations)
  • +
  • In interrupt context: take a spin lock; this will protect +against race conditions with other interrupt handlers or process +context running on different processors
  • +
+ + + + +
+
+ +

Bottom-Half Synchronization for SMP

+ +
    +
  • In process context use spin_lock_bh() (which combines +local_bh_disable() and spin_lock()) and +spin_unlock_bh() (which combines spin_unlock() and +local_bh_enable())
  • +
  • In bottom half context use: spin_lock() and +spin_unlock() (or spin_lock_irqsave() and +spin_unlock_irqrestore() if sharing data with interrupt +handlers)
  • +
+ + + + +
+
+ +

Preemption

+ +

 

+

Preemption is configurable: when active it provides better latency +and response time, while when deactivated it provides better +throughput.

+

Preemption is disabled by spin locks and mutexes but it can be +manually disabled as well (by core kernel code).

+ + + + +
+
+ +

Preemption and Bottom-Half Masking

+ +
#define PREEMPT_BITS      8
+#define SOFTIRQ_BITS      8
+#define HARDIRQ_BITS      4
+#define NMI_BITS          1
+
+#define preempt_disable() preempt_count_inc()
+
+#define local_bh_disable() add_preempt_count(SOFTIRQ_OFFSET)
+
+#define local_bh_enable() sub_preempt_count(SOFTIRQ_OFFSET)
+
+#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
+
+#define in_interrupt() irq_count()
+
+asmlinkage void do_softirq(void)
+{
+    if (in_interrupt()) return;
+    ...
+
+
+ + + + +
+
+ +

Mutexes

+ +
    +
  • They don't "waste" CPU cycles; system throughput is better than +spin locks if context switch overhead is lower than medium +spinning time
  • +
  • They can't be used in interrupt context
  • +
  • They have a higher latency than spin locks
  • +
+ + + + +
+
+ +

mutex_lock() fast path

+ +
void __sched mutex_lock(struct mutex *lock)
+{
+  might_sleep();
+
+  if (!__mutex_trylock_fast(lock))
+    __mutex_lock_slowpath(lock);
+}
+
+static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
+{
+  unsigned long curr = (unsigned long)current;
+
+  if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+    return true;
+
+  return false;
+}
+
+
+ + + + +
+
+ +

mutex_lock() slow path

+ +
...
+  spin_lock(&lock->wait_lock);
+...
+  /* add waiting tasks to the end of the waitqueue (FIFO): */
+  list_add_tail(&waiter.list, &lock->wait_list);
+...
+  waiter.task = current;
+...
+  for (;;) {
+    if (__mutex_trylock(lock))
+      goto acquired;
+  ...
+    spin_unlock(&lock->wait_lock);
+  ...
+    set_current_state(state);
+    spin_lock(&lock->wait_lock);
+  }
+  spin_lock(&lock->wait_lock);
+acquired:
+  __set_current_state(TASK_RUNNING);
+  mutex_remove_waiter(lock, &waiter, current);
+  spin_lock(&lock->wait_lock);
+...
+
+
+ + + + +
+
+ +

mutex_unlock() fast path

+ +
void __sched mutex_unlock(struct mutex *lock)
+{
+  if (__mutex_unlock_fast(lock))
+    return;
+  __mutex_unlock_slowpath(lock, _RET_IP_);
+}
+
+static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
+{
+  unsigned long curr = (unsigned long)current;
+
+  if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
+    return true;
+
+  return false;
+}
+
+void __mutex_lock_slowpath(struct mutex *lock)
+{
+...
+  if (__mutex_waiter_is_first(lock, &waiter))
+          __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+...
+
+
+ + + + +
+
+ +

mutex_unlock() slow path

+ +
...
+spin_lock(&lock->wait_lock);
+if (!list_empty(&lock->wait_list)) {
+  /* get the first entry from the wait-list: */
+  struct mutex_waiter *waiter;
+  waiter = list_first_entry(&lock->wait_list, struct mutex_waiter,
+                            list);
+  next = waiter->task;
+  wake_q_add(&wake_q, next);
+}
+...
+spin_unlock(&lock->wait_lock);
+...
+wake_up_q(&wake_q);
+
+
+ + + + +
+
+ +

Per CPU data

+ +
    +
  • No need to synchronize to access the data
  • +
  • No contention, no performance impact
  • +
  • Well suited for distributed processing where aggregation is only +seldom necessary (e.g. statistics counters)
  • +
+ + + + +
+
+ +

Out of Order Compiler Generated Code

+ + ++++ + + + + + + + + +
C codeCompiler generated code
a = 1;
+b = 2;
+
+
+
MOV R10, 1
+MOV R11, 2
+STORE R11, b
+STORE R10, a
+
+
+
+ + + + +
+
+ +

Barriers

+ +
    +
  • A read barrier (rmb(), smp_rmb()) is used to +make sure that no read operation crosses the barrier; that is, +all read operations before the barrier are complete before +executing the first instruction after the barrier
  • +
  • A write barrier (wmb(), smp_wmb()) is used to +make sure that no write operation crosses the barrier
  • +
  • A simple barrier (mb(), smp_mb()) is used +to make sure that no write or read operation crosses the barrier
  • +
+ + + + +
+
+ +

Read Copy Update (RCU)

+ +
    +
  • Read-only lock-less access at the same time with write access
  • +
  • Write accesses still require locks in order to avoid races +between writers
  • +
  • Requires unidirectional traversal by readers
  • +
+ + + + +
+
+ +

Removal and Reclamation

+ +
    +
  • Removal: removes references to elements. Some old readers may +still see the old reference so we can't free the element.
  • +
  • Reclamation: free the element. This action is postponed until +all existing readers finish traversal (quiescent cycle). New +readers won't affect the quiescent cycle.
  • +
+ + + + +
+
+ +

RCU List Delete

+ +

 

+../_images/ditaa-5193a924360bebc83d2f81188cd0b0093ec01e6a.png + + + + +
+
+ +

RCU list APIs cheat sheet

+ +
/* list traversal */
+rcu_read_lock();
+list_for_each_entry_rcu(i, head) {
+  /* no sleeping, blocking calls or context switch allowed */
+}
+rcu_read_unlock();
+
+
+/* list element delete  */
+spin_lock(&lock);
+list_del_rcu(&node->list);
+spin_unlock(&lock);
+synchronize_rcu();
+kfree(node);
+
+/* list element add  */
+spin_lock(&lock);
+list_add_rcu(head, &node->list);
+spin_unlock(&lock);
+
+
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec5-smp.html b/refs/pull/405/merge/so2/lec5-smp.html new file mode 100644 index 00000000..dc01bed9 --- /dev/null +++ b/refs/pull/405/merge/so2/lec5-smp.html @@ -0,0 +1,919 @@ + + + + + + SO2 Lecture 05 - Symmetric Multi-Processing — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 05 - Symmetric Multi-Processing

+

View slides

+
+

Lecture objectives:

+
    +
  • Kernel Concurrency
  • +
  • Atomic operations
  • +
  • Spin locks
  • +
  • Cache thrashing
  • +
  • Optimized spin locks
  • +
  • Process and Interrupt Context Synchronization
  • +
  • Mutexes
  • +
  • Per CPU data
  • +
  • Memory Ordering and Barriers
  • +
  • Read-Copy Update
  • +
+
+
+

Synchronization basics

+

Because the Linux kernel supports symmetric multi-processing (SMP) it +must use a set of synchronization mechanisms to achieve predictable +results, free of race conditions.

+
+

Note

+

We will use the terms core, CPU and processor as +interchangeable for the purpose of this lecture.

+
+

Race conditions can occur when the following two conditions happen +simultaneously:

+
    +
  • there are at least two execution contexts that run in "parallel":
      +
    • truly run in parallel (e.g. two system calls running on +different processors)
    • +
    • one of the contexts can arbitrarily preempt the other (e.g. an +interrupt preempts a system call)
    • +
    +
  • +
  • the execution contexts perform read-write accesses to shared +memory
  • +
+

Race conditions can lead to erroneous results that are hard to debug, +because they manifest only when the execution contexts are scheduled +on the CPU cores in a very specific order.

+

A classical race condition example is an incorrect implementation for +a release operation of a resource counter:

+
void release_resource()
+{
+     counter--;
+
+     if (!counter)
+         free_resource();
+}
+
+
+

A resource counter is used to keep a shared resource available until +the last user releases it but the above implementation has a race +condition that can cause freeing the resource twice:

+

 

+../_images/ditaa-35f7597b35b83bb0025ac2a5f158c9eae23050c8.png +

In most cases the release_resource() function will only free the +resource once. However, in the scenario above, if thread A is +preempted right after decrementing counter and thread B calls +release_resource() it will cause the resource to be freed. When +resumed, thread A will also free the resource since the counter value +is 0.

+

To avoid race conditions the programmer must first identify the +critical section that can generate a race condition. The critical +section is the part of the code that reads and writes shared memory +from multiple parallel contexts.

+

In the example above, the minimal critical section is starting with +the counter decrement and ending with checking the counter's value.

+

Once the critical section has been identified race conditions can be +avoided by using one of the following approaches:

+
    +
  • make the critical section atomic (e.g. use atomic +instructions)
  • +
  • disable preemption during the critical section (e.g. disable +interrupts, bottom-half handlers, or thread preemption)
  • +
  • serialize the access to the critical section (e.g. use spin +locks or mutexes to allow only one context or thread in the +critical section)
  • +
+
+
+

Linux kernel concurrency sources

+

There are multiple sources of concurrency in the Linux kernel that +depend on the kernel configuration as well as the type of system it +runs on:

+
    +
  • single core systems, non-preemptive kernel: the current +process can be preempted by interrupts
  • +
  • single core systems, preemptive kernel: above + the +current process can be preempted by other processes
  • +
  • multi-core systems: above + the current process can run +in parallel with another process or with an interrupt running on +another processor
  • +
+
+

Note

+

We only discuss kernel concurrency and that is why a +non-preemptive kernel running on a single core system +has interrupts as the only source of concurrency.

+
+
+
+

Atomic operations

+

In certain circumstances we can avoid race conditions by using atomic +operations that are provided by hardware. Linux provides a unified API +to access atomic operations:

+
    +
  • integer based:
      +
    • simple: atomic_inc(), atomic_dec(), +atomic_add(), atomic_sub()
    • +
    • conditional: atomic_dec_and_test(), atomic_sub_and_test()
    • +
    +
  • +
  • bit based:
      +
    • simple: test_bit(), set_bit(), +change_bit()
    • +
    • conditional: test_and_set_bit(), test_and_clear_bit(), +test_and_change_bit()
    • +
    +
  • +
+

For example, we could use atomic_dec_and_test() to make +the resource counter decrement and value check atomic:

+
void release_resource()
+{
+    if (atomic_dec_and_test(&counter))
+         free_resource();
+}
+
+
+

One complication with atomic operations is encountered in +multi-core systems, where an atomic operation is no longer +atomic at the system level (but still atomic at the core level).

+

To understand why, we need to decompose the atomic operation in memory +loads and stores. Then we can construct race condition scenarios where +the load and store operations are interleaved across CPUs, like in the +example below where incrementing a value from two processors will +produce an unexpected result:

+

 

+../_images/ditaa-ddd14be50300088958e86912bc5f396797634a3a.png +

In order to provide atomic operations on SMP systems different +architectures use different techniques. For example, on x86 a LOCK +prefix is used to lock the system bus while executing the prefixed +operation:

+

 

+../_images/ditaa-c11fccb956cdf115910f9f72e1dc14cd7ed549ff.png +

On ARM the LDREX and STREX instructions are used together to guarantee +atomic access: LDREX loads a value and signals the exclusive monitor +that an atomic operation is in progress. The STREX attempts to store a +new value but only succeeds if the exclusive monitor has not detected +other exclusive operations. So, to implement atomic operations the +programmer must retry the operation (both LDREX and STREX) until the +exclusive monitor signals a success.

+

Although they are often interpreted as "light" or "efficient" +synchronization mechanisms (because they "don't require spinning or +context switches", or because they "are implemented in hardware so +they must be more efficient", or because they "are just instructions +so they must have similar efficiency as other instructions"), as seen +from the implementation details, atomic operations are actually +expensive.

+
+
+

Disabling preemption (interrupts)

+

On single core systems and non preemptive kernels the only source of +concurrency is the preemption of the current thread by an +interrupt. To prevent concurrency it is thus sufficient to disable +interrupts.

+

This is done with architecture specific instructions, but Linux offers +architecture independent APIs to disable and enable interrupts:

+
 #define local_irq_disable() \
+     asm volatile ("cli" : : : "memory")
+
+#define local_irq_enable() \
+    asm volatile ("sti" : : : "memory")
+
+#define local_irq_save(flags) \
+    asm volatile ("pushf ; pop %0" :"=g" (flags)
+                  : /* no input */: "memory") \
+    asm volatile("cli": : :"memory")
+
+#define local_irq_restore(flags) \
+    asm volatile ("push %0 ; popf"
+                  : /* no output */
+                  : "g" (flags) :"memory", "cc");
+
+
+

Although the interrupts can be explicitly disabled and enabled with +local_irq_disable() and local_irq_enable() these APIs +should only be used when the current state of interrupts is +known. They are usually used in core kernel code (like interrupt +handling).

+

For typical cases where we want to avoid interrupts due to concurrency +issues it is recommended to use the local_irq_save() and +local_irq_restore() variants. They take care of saving and +restoring the interrupts states so they can be freely called from +overlapping critical sections without the risk of accidentally +enabling interrupts while still in a critical section, as long as the +calls are balanced.

+
+
+

Spin Locks

+

Spin locks are used to serialize access to a critical section. They +are necessary on multi-core systems where we can have true execution +parallelism. This is a typical spin lock implementation:

+
spin_lock:
+    lock bts [my_lock], 0
+    jc spin_lock
+
+/* critical section */
+
+spin_unlock:
+    mov [my_lock], 0
+
+
+

bts dts, src - bit test and set; it copies the src bit from the dts +memory address to the carry flag and then sets it:

+
CF <- dts[src]
+dts[src] <- 1
+
+
+

As it can be seen, the spin lock uses an atomic instruction to make +sure that only one core can enter the critical section. If there are +multiple cores trying to enter they will continuously "spin" until the +lock is released.

+

While the spin lock avoids race conditions, it can have a significant +impact on the system's performance due to "lock contention":

+
    +
  • There is lock contention when at least one core spins trying to +enter the critical section lock
  • +
  • Lock contention grows with the critical section size, time spent +in the critical section and the number of cores in the system
  • +
+

Another negative side effect of spin locks is cache thrashing.

+

Cache thrashing occurs when multiple cores are trying to read and +write to the same memory resulting in excessive cache misses.

+

Since spin locks continuously access memory during lock contention, +cache thrashing is a common occurrence due to the way cache +coherency is implemented.

+
+
+

Cache coherency in multi-processor systems

+

The memory hierarchy in multi-processor systems is composed of local +CPU caches (L1 caches), shared CPU caches (L2 caches) and the main +memory. To explain cache coherency we will ignore the L2 cache and +only consider the L1 caches and main memory.

+

In the figure below we present a view of the memory hierarchy with two +variables A and B that fall into different cache lines and where +caches and the main memory are synchronized:

+

 

+../_images/ditaa-4d63c157487ff8291f2a6e93fe680ec38c1a3212.png +

In the absence of a synchronization mechanism between the caches and +main memory, when CPU 0 executes A = A + B and CPU 1 executes B = +A + B we will have the following memory view:

+

 

+../_images/ditaa-7ee0f9bb5f5af586e043afd47cfbad0adcc34888.png +

In order to avoid the situation above multi-processor systems use +cache coherency protocols. There are two main types of cache coherency +protocols:

+
    +
  • Bus snooping (sniffing) based: memory bus transactions are +monitored by caches and they take actions to preserve +coherency
  • +
  • Directory based: there is a separate entity (directory) that +maintains the state of caches; caches interact with directory +to preserve coherency
  • +
+

Bus snooping is simpler but it performs poorly when the number of +cores goes beyond 32-64.

+

Directory based cache coherence protocols scale much better (up +to thousands of cores) and are usually used in NUMA systems.

+

A simple cache coherency protocol that is commonly used in practice is +MESI (named after the acronym of the cache line states names: +Modified, Exclusive, Shared and Invalid). Its main +characteristics are:

+
    +
  • Caching policy: write back
  • +
  • Cache line states
      +
    • Modified: owned by a single core and dirty
    • +
    • Exclusive: owned by a single core and clean
    • +
    • Shared: shared between multiple cores and clean
    • +
    • Invalid : the line is not cached
    • +
    +
  • +
+

Issuing read or write requests from CPU cores will trigger state +transitions, as exemplified below:

+
    +
  • Invalid -> Exclusive: read request, all other cores have the line +in Invalid; line loaded from memory
  • +
  • Invalid -> Shared: read request, at least one core has the line +in Shared or Exclusive; line loaded from sibling cache
  • +
  • Invalid/Shared/Exclusive -> Modified: write request; all +other cores invalidate the line
  • +
  • Modified -> Invalid: write request from other core; line is +flushed to memory
  • +
+
+

Note

+

The most important characteristic of the MESI protocol is +that it is a write-invalidate cache protocol. When writing to a +shared location all other caches are invalidated.

+
+

This has important performance impact in certain access patterns, and +one such pattern is contention for a simple spin lock implementation +like we discussed above.

+

To exemplify this issue let's consider a system with three CPU cores, +where the first has acquired the spin lock and it is running the +critical section while the other two are spinning waiting to enter the +critical section:

+

 

+../_images/ditaa-b26d802c286bda6c559b4dcfa8a7fb27f840463e.png +

As it can be seen from the figure above due to the writes issued by +the cores spinning on the lock we see frequent cache line invalidate +operations which means that basically the two waiting cores will flush +and load the cache line while waiting for the lock, creating +unnecessary traffic on the memory bus and slowing down memory accesses +for the first core.

+

Another issue is that most likely data accessed by the first CPU +during the critical section is stored in the same cache line with the +lock (common optimization to have the data ready in the cache after +the lock is acquired). Which means that the cache invalidation +triggered by the two other spinning cores will slow down the execution +of the critical section which in turn triggers more cache invalidate +actions.

+
+
+

Optimized spin locks

+

As we have seen, simple spin lock implementations can have poor +performance due to cache thrashing, especially as the number of +cores increases. To avoid this issue there are two possible strategies:

+
    +
  • reduce the number of writes and thus reduce the number of cache +invalidate operations
  • +
  • avoid the other processors spinning on the same cache line, and thus +avoid the cache invalidate operations
  • +
+

An optimized spin lock implementation that uses the first approach is +presented below:

+

 

+
spin_lock:
+    rep ; nop
+    test lock_addr, 1
+    jnz spin_lock
+    lock bts lock_addr
+    jc spin_lock
+
+
+
    +
  • we first test the lock read only, using non-atomic +instructions, to avoid writes and thus invalidate operations +while we spin
  • +
  • only when the lock might be free, we try to acquire it
  • +
+

The implementation also uses the PAUSE instruction to avoid +pipeline flushes due to (false positive) memory order violations and +to add a small delay (proportional with the memory bus frequency) to +reduce power consumption.

+

A similar implementation with support for fairness (the CPU cores are +allowed in the critical section based on the time of arrival) is used +in the Linux kernel (the ticket spin lock) +for many architectures.

+

However, for the x86 architecture, the current spin lock +implementation uses a queued spin lock where the CPU cores spin on +different locks (hopefully distributed in different cache lines) to +avoid cache invalidation operations:

+

 

+../_images/ditaa-58545831034f050660727be99cede213bc4a53c7.png +

Conceptually, when a new CPU core tries to acquire the lock and it +fails it will add its private lock to the list of waiting CPU +cores. When the lock owner exits the critical section it unlocks the +next lock in the list, if any.

+

While a read spin optimized spin lock reduces most of the cache +invalidation operations, the lock owner can still generate cache +invalidate operations due to writes to data structures close to the +lock and thus part of the same cache line. This in turn generates +memory traffic on subsequent reads on the spinning cores.

+

Hence, queued spin locks scale much better for a large number of cores +as is the case for NUMA systems. And since they have similar fairness +properties as the ticket lock they are the preferred implementation on +the x86 architecture.

+
+
+

Process and Interrupt Context Synchronization

+

Accessing shared data from both process and interrupt context is a +relatively common scenario. On single core systems we can do this by +disabling interrupts, but that won't work on multi-core systems, +as we can have the process running on one CPU core and the interrupt +context running on a different CPU core.

+

Using a spin lock, which was designed for multi-processor systems, +seems like the right solution, but doing so can cause common +deadlock conditions, as detailed by the following scenario:

+
    +
  • In the process context we take the spin lock
  • +
  • An interrupt occurs and it is scheduled on the same CPU core
  • +
  • The interrupt handler runs and tries to take the spin lock
  • +
  • The current CPU will deadlock
  • +
+

To avoid this issue a two fold approach is used:

+
    +
  • In process context: disable interrupts and acquire a spin lock; +this will protect both against interrupt or other CPU cores race +conditions (spin_lock_irqsave() and +spin_unlock_irqrestore() combine the two operations)
  • +
  • In interrupt context: take a spin lock; this will protect +against race conditions with other interrupt handlers or process +context running on different processors
  • +
+

We have the same issue for other interrupt context handlers such as +softirqs, tasklets or timers and while disabling interrupts might +work, it is recommended to use dedicated APIs:

+
    +
  • In process context use spin_lock_bh() (which combines +local_bh_disable() and spin_lock()) and +spin_unlock_bh() (which combines spin_unlock() and +local_bh_enable())
  • +
  • In bottom half context use: spin_lock() and +spin_unlock() (or spin_lock_irqsave() and +spin_unlock_irqrestore() if sharing data with interrupt +handlers)
  • +
+

As mentioned before, another source of concurrency in the Linux kernel +can be other processes, due to preemption.

+

 

+

Preemption is configurable: when active it provides better latency +and response time, while when deactivated it provides better +throughput.

+

Preemption is disabled by spin locks and mutexes but it can be +manually disabled as well (by core kernel code).

+

As for local interrupt enabling and disabling APIs, the bottom half +and preemption APIs allow them to be used in overlapping critical +sections. A counter is used to track the state of bottom half and +preemption. In fact the same counter is used, with different increment +values:

+
#define PREEMPT_BITS      8
+#define SOFTIRQ_BITS      8
+#define HARDIRQ_BITS      4
+#define NMI_BITS          1
+
+#define preempt_disable() preempt_count_inc()
+
+#define local_bh_disable() add_preempt_count(SOFTIRQ_OFFSET)
+
+#define local_bh_enable() sub_preempt_count(SOFTIRQ_OFFSET)
+
+#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
+
+#define in_interrupt() irq_count()
+
+asmlinkage void do_softirq(void)
+{
+    if (in_interrupt()) return;
+    ...
+
+
+
+
+

Mutexes

+

Mutexes are used to protect against race conditions from other CPU +cores but they can only be used in process context. As opposed to +spin locks, while a thread is waiting to enter the critical section it +will not use CPU time, but instead it will be added to a waiting queue +until the critical section is vacated.

+

Since mutexes and spin locks usage intersect, it is useful to compare +the two:

+
    +
  • They don't "waste" CPU cycles; system throughput is better than +spin locks if context switch overhead is lower than medium +spinning time
  • +
  • They can't be used in interrupt context
  • +
  • They have a higher latency than spin locks
  • +
+

Conceptually, the mutex_lock() operation is relatively simple: +if the mutex is not acquired we can take the fast path via an atomic +exchange operation:

+
void __sched mutex_lock(struct mutex *lock)
+{
+  might_sleep();
+
+  if (!__mutex_trylock_fast(lock))
+    __mutex_lock_slowpath(lock);
+}
+
+static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
+{
+  unsigned long curr = (unsigned long)current;
+
+  if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+    return true;
+
+  return false;
+}
+
+
+

otherwise we take the slow path where we add ourselves to the mutex +waiting list and put ourselves to sleep:

+
...
+  spin_lock(&lock->wait_lock);
+...
+  /* add waiting tasks to the end of the waitqueue (FIFO): */
+  list_add_tail(&waiter.list, &lock->wait_list);
+...
+  waiter.task = current;
+...
+  for (;;) {
+    if (__mutex_trylock(lock))
+      goto acquired;
+  ...
+    spin_unlock(&lock->wait_lock);
+  ...
+    set_current_state(state);
+    spin_lock(&lock->wait_lock);
+  }
+  spin_lock(&lock->wait_lock);
+acquired:
+  __set_current_state(TASK_RUNNING);
+  mutex_remove_waiter(lock, &waiter, current);
+  spin_lock(&lock->wait_lock);
+...
+
+
+

The full implementation is a bit more complex: instead of going to +sleep immediately it performs optimistic spinning if it detects that the lock +owner is currently running on a different CPU, as chances are the owner +will release the lock soon. It also checks for signals and handles +mutex debugging for the locking dependency engine debug feature.

+

The mutex_unlock() operation is symmetric: if there are no +waiters on the mutex then we can take the fast path via an atomic exchange +operation:

+
void __sched mutex_unlock(struct mutex *lock)
+{
+  if (__mutex_unlock_fast(lock))
+    return;
+  __mutex_unlock_slowpath(lock, _RET_IP_);
+}
+
+static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
+{
+  unsigned long curr = (unsigned long)current;
+
+  if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
+    return true;
+
+  return false;
+}
+
+void __mutex_lock_slowpath(struct mutex *lock)
+{
+...
+  if (__mutex_waiter_is_first(lock, &waiter))
+          __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+...
+
+
+
+

Note

+

Because struct task_struct is cache aligned the 7 +lower bits of the owner field can be used for various flags, +such as MUTEX_FLAG_WAITERS.

+
+

Otherwise we take the slow path where we pick up the first waiter from the +list and wake it up:

+
...
+spin_lock(&lock->wait_lock);
+if (!list_empty(&lock->wait_list)) {
+  /* get the first entry from the wait-list: */
+  struct mutex_waiter *waiter;
+  waiter = list_first_entry(&lock->wait_list, struct mutex_waiter,
+                            list);
+  next = waiter->task;
+  wake_q_add(&wake_q, next);
+}
+...
+spin_unlock(&lock->wait_lock);
+...
+wake_up_q(&wake_q);
+
+
+
+
+

Per CPU data

+

Per CPU data avoids race conditions by avoiding the use of shared +data. Instead, an array sized to the maximum possible CPU cores is +used and each core will use its own array entry to read and write +data. This approach certainly has advantages:

+
    +
  • No need to synchronize to access the data
  • +
  • No contention, no performance impact
  • +
  • Well suited for distributed processing where aggregation is only +seldom necessary (e.g. statistics counters)
  • +
+
+
+

Memory Ordering and Barriers

+

Modern processors and compilers employ out-of-order execution to +improve performance. For example, processors can execute "future" +instructions while waiting for current instruction data to be fetched +from memory.

+

Here is an example of out of order compiler generated code:

+ ++++ + + + + + + + + +
C codeCompiler generated code
a = 1;
+b = 2;
+
+
+
MOV R10, 1
+MOV R11, 2
+STORE R11, b
+STORE R10, a
+
+
+
+
+

Note

+

When executing instructions out of order the processor makes +sure that data dependency is observed, i.e. it won't execute +instructions whose input depends on the output of a previous +instruction that has not been executed.

+
+

In most cases out of order execution is not an issue. However, in +certain situations (e.g. communicating via shared memory between +processors or between processors and hardware) we must issue some +instructions before others even without data dependency between them.

+

For this purpose we can use barriers to order memory operations:

+
    +
  • A read barrier (rmb(), smp_rmb()) is used to +make sure that no read operation crosses the barrier; that is, +all read operations before the barrier are complete before +executing the first instruction after the barrier
  • +
  • A write barrier (wmb(), smp_wmb()) is used to +make sure that no write operation crosses the barrier
  • +
  • A simple barrier (mb(), smp_mb()) is used +to make sure that no write or read operation crosses the barrier
  • +
+
+
+

Read Copy Update (RCU)

+

Read Copy Update is a special synchronization mechanism similar to +read-write locks but with significant improvements over them (and some +limitations):

+
    +
  • Read-only lock-less access at the same time with write access
  • +
  • Write accesses still require locks in order to avoid races +between writers
  • +
  • Requires unidirectional traversal by readers
  • +
+

In fact, the read-write locks in the Linux kernel have been deprecated +and then removed, in favor of RCU.

+

Implementing RCU for a new data structure is difficult, but a few +common data structures (lists, queues, trees) do have RCU APIs that +can be used.

+

RCU splits removal updates to the data structures in two phases:

+
    +
  • Removal: removes references to elements. Some old readers may +still see the old reference so we can't free the element.
  • +
  • Elimination: free the element. This action is postponed until +all existing readers finish traversal (quiescent cycle). New +readers won't affect the quiescent cycle.
  • +
+

As an example, let's take a look at how to delete an element from a +list using RCU:

+

 

+../_images/ditaa-5193a924360bebc83d2f81188cd0b0093ec01e6a.png +

In the first step it can be seen that while readers traverse the list +all elements are referenced. In step two a writer removes +element B. Reclamation is postponed since there are still readers that +hold references to it. In step three a quiescent cycle just expired +and it can be noticed that there are no more references to +element B. Other elements still have references from readers that +started the list traversal after the element was removed. In step 4 we +finally perform reclamation (free the element).

+

Now that we have covered how RCU functions at a high level, let's look at +the APIs for traversing the list as well as adding an element to and removing an +element from the list:

+
/* list traversal */
+rcu_read_lock();
+list_for_each_entry_rcu(i, head) {
+  /* no sleeping, blocking calls or context switch allowed */
+}
+rcu_read_unlock();
+
+
+/* list element delete  */
+spin_lock(&lock);
+list_del_rcu(&node->list);
+spin_unlock(&lock);
+synchronize_rcu();
+kfree(node);
+
+/* list element add  */
+spin_lock(&lock);
+list_add_rcu(head, &node->list);
+spin_unlock(&lock);
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec6-address-space-slides.html b/refs/pull/405/merge/so2/lec6-address-space-slides.html new file mode 100644 index 00000000..6d6ca403 --- /dev/null +++ b/refs/pull/405/merge/so2/lec6-address-space-slides.html @@ -0,0 +1,719 @@ + + + + + + + + SO2 Lecture 06 - Address Space — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 06 - Address Space

+ + + + + +
+
+ +

Address Space

+ +
    +
  • x86 MMU
      +
    • Segmentation
    • +
    • Paging
    • +
    • TLB
    • +
    +
  • +
  • Linux Address Space
      +
    • User
    • +
    • Kernel
    • +
    • High memory
    • +
    +
  • +
+ + + + +
+
+ +

x86 MMU

+ +

 

+../_images/ditaa-f3703e3f627a948c59f6f960518d5f68eb7becec.png + + + + +
+
+ +

Selectors

+ +

 

+../_images/ditaa-d6845a04f0ec792beec598d2a9f4c5b92c65529e.png +
    +
  • Selectors: CS, DS, SS, ES, FS, GS
  • +
  • Index: indexes the segment descriptor table
  • +
  • TI: selects either the GDT or LDT
  • +
  • RPL: for CS only indicates the running (current) privilege level
  • +
  • The GDTR and LDTR registers point to the base of the GDT and LDT
  • +
+ + + + +
+
+ +

Segment descriptor

+ +

 

+../_images/ditaa-5cd4a8fa1ad97cff4bb1f64da13ce9ebfcfc4562.png +
    +
  • Base: linear address for the start of the segment
  • +
  • Limit: size of the segment
  • +
  • G: granularity bit: if set the size is in 4K pages otherwise in bytes
  • +
  • B/D: data/code
  • +
  • Type: code segment, data/stack, TSS, LDT, GDT
  • +
  • Protection: the minimum privilege level required to access the +segment (RPL is checked against DPL)
  • +
+ + + + +
+
+ +

Segmentation in Linux

+ +
/*
+ * The layout of the per-CPU GDT under Linux:
+ *
+ *   0 - null                                                             <=== cacheline #1
+ *   1 - reserved
+ *   2 - reserved
+ *   3 - reserved
+ *
+ *   4 - unused                                                           <=== cacheline #2
+ *   5 - unused
+ *
+ *  ------- start of TLS (Thread-Local Storage) segments:
+ *
+ *   6 - TLS segment #1                   [ glibc's TLS segment ]
+ *   7 - TLS segment #2                   [ Wine's %fs Win32 segment ]
+ *   8 - TLS segment #3                                                   <=== cacheline #3
+ *   9 - reserved
+ *  10 - reserved
+ *  11 - reserved
+ *
+ *  ------- start of kernel segments:
+ *
+ *  12 - kernel code segment                                              <=== cacheline #4
+ *  13 - kernel data segment
+ *  14 - default user CS
+ *  15 - default user DS
+ *  16 - TSS                                                              <=== cacheline #5
+ *  17 - LDT
+ *  18 - PNPBIOS support (16->32 gate)
+ *  19 - PNPBIOS support
+ *  20 - PNPBIOS support                                                  <=== cacheline #6
+ *  21 - PNPBIOS support
+ *  22 - PNPBIOS support
+ *  23 - APM BIOS support
+ *  24 - APM BIOS support                                                 <=== cacheline #7
+ *  25 - APM BIOS support
+ *
+ *  26 - ESPFIX small SS
+ *  27 - per-cpu                  [ offset to per-cpu data area ]
+ *  28 - stack_canary-20          [ for stack protector ]                 <=== cacheline #8
+ *  29 - unused
+ *  30 - unused
+ *  31 - TSS for double fault handler
+ */
+
+ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+ #ifdef CONFIG_X86_64
+         /*
+          * We need valid kernel segments for data and code in long mode too
+          * IRET will check the segment types  kkeil 2000/10/28
+          * Also sysret mandates a special GDT layout
+          *
+          * TLS descriptors are currently at a different place compared to i386.
+          * Hopefully nobody expects them at a fixed place (Wine?)
+          */
+         [GDT_ENTRY_KERNEL32_CS]         = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER32_CS]   = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ #else
+         [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+         /*
+          * Segments used for calling PnP BIOS have byte granularity.
+          * They code segments and data segments have fixed 64k limits,
+          * the transfer segment sizes are set at run time.
+          */
+         /* 32-bit code */
+         [GDT_ENTRY_PNPBIOS_CS32]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+         /* 16-bit code */
+         [GDT_ENTRY_PNPBIOS_CS16]        = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_DS]          = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_TS1]         = GDT_ENTRY_INIT(0x0092, 0, 0),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_TS2]         = GDT_ENTRY_INIT(0x0092, 0, 0),
+         /*
+          * The APM segments have byte granularity and their bases
+          * are set at run time.  All have 64k limits.
+          */
+         /* 32-bit code */
+         [GDT_ENTRY_APMBIOS_BASE]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+         /* 16-bit code */
+         [GDT_ENTRY_APMBIOS_BASE+1]      = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+         /* data */
+         [GDT_ENTRY_APMBIOS_BASE+2]      = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
+
+         [GDT_ENTRY_ESPFIX_SS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         [GDT_ENTRY_PERCPU]              = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         GDT_STACK_CANARY_INIT
+ #endif
+ } };
+ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+
+ + + + +
+
+ +

Inspecting selectors and segments

+ +

 

+ + + + +
+
+ +

Regular paging

+ +

 

+../_images/ditaa-def299abebe530d760a6c8f16c791bbb016f9238.png + + + + +
+
+ +

Extended paging

+ +../_images/ditaa-709c2e7a68bfcdcfe9c1938d6ef2a0c9b5627931.png + + + + +
+
+ +

Page tables

+ +
    +
  • Both page directory and page table have 1024 entries
  • +
  • Each entry has 4 bytes
  • +
  • The special CR3 register points to the base of the page directory
  • +
  • Page directory entries point to the base of the page table
  • +
  • All tables are stored in memory
  • +
  • All table addresses are physical addresses
  • +
+ + + + +
+
+ +

Page table entry fields

+ +
    +
  • Present/Absent
  • +
  • PFN (Page Frame Number): the 20 most significant bits of the physical address
  • +
  • Accessed - set by hardware when the page is accessed, but never cleared by hardware (can be used by OS for housekeeping)
  • +
  • Dirty - set by hardware when the page is written, but never cleared by hardware (can be used by OS for housekeeping)
  • +
  • Access rights: Read/Write
  • +
  • Privilege: User/Supervisor
  • +
  • Page size - only for page directory; if set extended paging is used
  • +
  • PCD (page cache disable), PWT (page write through)
  • +
+ + + + +
+
+ +

Linux paging

+ +../_images/ditaa-5e4d73e3fcb24db9d1f8c16daddf98694c063fe6.png + + + + +
+
+ +

Linux APIs for page table handling

+ +
struct * page;
+pgd_t pgd;
+pmd_t pmd;
+pud_t pud;
+pte_t pte;
+void *laddr, *paddr;
+
+pgd = pgd_offset(mm, vaddr);
+pud = pud_offet(pgd, vaddr);
+pmd = pmd_offset(pud, vaddr);
+pte = pte_offset(pmd, vaddr);
+page = pte_page(pte);
+laddr = page_address(page);
+paddr = virt_to_phys(laddr);
+
+
+ + + + +
+
+ +

What about platforms with fewer than 4 levels of pagination?

+ +
static inline pud_t * pud_offset(pgd_t * pgd,unsigned long address)
+{
+    return (pud_t *)pgd;
+}
+
+static inline pmd_t * pmd_offset(pud_t * pud,unsigned long address)
+{
+    return (pmd_t *)pud;
+}
+
+
+ + + + +
+
+ +

Translation Look-aside Buffer

+ +
    +
  • Caches paging information (PFN, rights, privilege)
  • +
  • Content Addressable Memory / Associative Memory
      +
    • Very small (64-128)
    • +
    • Very fast (single cycle due to parallel search implementation)
    • +
    +
  • +
  • CPUs usually have two TLBs: i-TLB (code) and d-TLB (data)
  • +
  • TLB miss penalty: up to hundreds of cycles
  • +
+ + + + +
+
+ +

TLB invalidation

+ +

Single address invalidation:

+
mov $addr, %eax
+invlpg %(eax)
+
+
+

Full invalidation:

+
mov %cr3, %eax
+mov %eax, %cr3
+
+
+ + + + +
+
+ +

Address space options for 32bit systems

+ +

 

+../_images/ditaa-d5d1129b0298a2ea5f116c9d4b246eb1b888db6b.png + + + + +
+
+ +

Advantages and disadvantages

+ +
    +
  • Disadvantages for dedicated kernel space:
      +
    • Fully invalidating the TLB for every system call
    • +
    +
  • +
  • Disadvantages for shared address space
      +
    • Less address space for both kernel and user processes
    • +
    +
  • +
+ + + + +
+
+ +

Linux address space for 32bit systems

+ +

 

+../_images/ditaa-3985c420def8f30934a72ea8c738a00ed629c298.png + + + + +
+
+ +

Virtual to physical address translations for I/O transfers

+ +
    +
  • Use the virtual address of a kernel buffer in order to copy +data from user space
  • +
  • Walk the page tables to transform the kernel buffer virtual +address to a physical address
  • +
  • Use the physical address of the kernel buffer to start a DMA +transfer
  • +
+ + + + +
+
+ +

Linear mappings

+ +
    +
  • Virtual to physical address space translation is reduced to one +operation (instead of walking the page tables)
  • +
  • Less memory is used to create the page tables
  • +
  • Fewer TLB entries are used for the kernel memory
  • +
+ + + + +
+
+ +

Highmem

+ +

 

+../_images/ditaa-bb8455a43088bf800eece11869f6ff857574605d.png + + + + +
+
+ +

Multi-page permanent mappings

+ +
void* vmalloc(unsigned long size);
+void vfree(void * addr);
+
+void *ioremap(unsigned long offset, unsigned size);
+void iounmap(void * addr);
+
+
+ + + + +
+
+ +

Fixed-mapped linear addresses

+ +
    +
  • Reserved virtual addresses (constants)
  • +
  • Mapped to physical addresses during boot
  • +
+
set_fixmap(idx, phys_addr)
+set_fixmap_nocache(idx, phys_addr)
+
+
+ + + + +
+
+ +

Fixed-mapped linear addresses

+ +
/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * for x86_32: We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+
+enum fixed_addresses {
+#ifdef CONFIG_X86_32
+    FIX_HOLE,
+#else
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+    VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
+#endif
+#endif
+    FIX_DBGP_BASE,
+    FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+    FIX_OHCI1394_BASE,
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+    FIX_APIC_BASE,        /* local (CPU) APIC) -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+    FIX_IO_APIC_BASE_0,
+    FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#ifdef CONFIG_X86_32
+    FIX_KMAP_BEGIN,       /* reserved pte's for temporary kernel mappings */
+    FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+    FIX_PCIE_MCFG,
+#endif
+
+
+ + + + +
+
+ +

Conversion between virtual addresses and fixed address indexes

+ +
#define __fix_to_virt(x)  (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x)  ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+#ifndef __ASSEMBLY__
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-deference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+ static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+ {
+     BUILD_BUG_ON(idx >= __end_of_fixed_addresses);
+     return __fix_to_virt(idx);
+ }
+
+ static inline unsigned long virt_to_fix(const unsigned long vaddr)
+ {
+     BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+     return __virt_to_fix(vaddr);
+ }
+
+
+ inline long fix_to_virt(const unsigned int idx)
+ {
+     if (idx >= __end_of_fixed_addresses)
+         __this_fixmap_does_not_exist();
+     return (0xffffe000UL - (idx << PAGE_SHIFT));
+ }
+
+
+ + + + +
+
+ +

Temporary mappings

+ +
    +
  • kmap_atomic(), kunmap_atomic()
  • +
  • No context switch is permitted in atomic kmap section
  • +
  • Can be used in interrupt context
  • +
  • No locking required
  • +
  • Only invalidates one TLB entry
  • +
+ + + + +
+
+ +

Temporary mappings implementation

+ +
#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot)
+
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
+{
+  unsigned long vaddr;
+  int idx, type;
+
+  type = kmap_atomic_idx_push();
+  idx = type + KM_TYPE_NR*smp_processor_id();
+  vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+  BUG_ON(!pte_none(*(kmap_pte-idx)));
+  set_pte(kmap_pte-idx, mk_pte(page, prot));
+  arch_flush_lazy_mmu_mode();
+
+  return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_high_prot);
+
+static inline int kmap_atomic_idx_push(void)
+{
+  int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+  WARN_ON_ONCE(in_irq() && !irqs_disabled());
+  BUG_ON(idx >= KM_TYPE_NR);
+#endif
+  return idx;
+}
+
+
+ + + + +
+
+ +

Implementation of temporary mappings

+ +
    +
  • Use the fixed-mapped linear addresses
  • +
  • Every CPU has KM_TYPE_NR reserved entries to be used for +temporary mappings
  • +
  • Stack like selection: every user picks the current entry and +increments the "stack" counter
  • +
+ + + + +
+
+ +

Permanent mappings

+ +
    +
  • kmap(), kunmap()
  • +
  • Context switches are allowed
  • +
  • Only available in process context
  • +
  • One page table is reserved for permanent mappings
  • +
  • Page counter
      +
    • 0 - page is not mapped, free and ready to use
    • +
    • 1 - page is not mapped, but may be present in the TLB; needs flushing before using
    • +
    • N - page is mapped N-1 times
    • +
    +
  • +
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec6-address-space.html b/refs/pull/405/merge/so2/lec6-address-space.html new file mode 100644 index 00000000..74e8e2f7 --- /dev/null +++ b/refs/pull/405/merge/so2/lec6-address-space.html @@ -0,0 +1,802 @@ + + + + + + SO2 Lecture 06 - Address Space — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 06 - Address Space

+

View slides

+
+

Lecture objectives:

+
    +
  • x86 MMU
      +
    • Segmentation
    • +
    • Paging
    • +
    • TLB
    • +
    +
  • +
  • Linux Address Space
      +
    • User
    • +
    • Kernel
    • +
    • High memory
    • +
    +
  • +
+
+
+

x86 MMU

+

The x86 MMU has a segmentation and a pagination unit. The segmentation +unit can be used to define logical memory segments defined by a +logical (virtual) start address, a base linear (mapped) address and a +size. A segment can also restrict access based on the access type +(read, execute, write) or the privilege level (we can define some +segments to be accessible only by kernel for example).

+

When the CPU makes a memory access, it will use the segmentation unit +to translate the logical address to a linear address, based on the +information in the segment descriptor.

+

If pagination is enabled the linear address will be further +transformed into a physical address, using the information from the +page tables.

+

Note that the segmentation unit can not be disabled, so if the MMU has +been enabled, segmentation will always be used.

+

 

+../_images/ditaa-f3703e3f627a948c59f6f960518d5f68eb7becec.png +
+

Selectors

+

A program can use multiple segments and in order to determine which +segment to use, special registers (named selectors) are used. The +basic selectors that are typically used are CS - "Code Selector", DS - +"Data Selector" and SS - "Stack Selector".

+

Instruction fetches will by default use CS, while data access will by +default use DS unless the stack is used (e.g. data access through the +pop and push instructions) in which case SS will be used by default.

+

Selectors have three main fields: the index, the table index and the +running privilege level:

+

 

+../_images/ditaa-d6845a04f0ec792beec598d2a9f4c5b92c65529e.png +

The index will be used to determine which entry of the descriptor +table should be used. TI is used to select either the Global +Descriptor Table (GDT) or the Local Descriptor Table (LDT). The tables +are effectively arrays that start at the location specified in the +special registers GDTR (for GDT) and LDTR (for LDT).

+
+

Note

+

LDT was designed so that applications can define their own +particular segments. Although not many applications use this +feature, Linux (and Windows) provide system calls that +allow an application to create its own segments.

+
+

RPL is only used for CS and it represents the current privilege +level. There are 4 privilege levels, the highest level being 0 (and +typically used by the kernel) and the lowest is 3 (and typically used +by user applications).

+
+
+

Segment descriptor

+

The CPU will use the index field of the selector to access an 8 byte +descriptor:

+

 

+../_images/ditaa-5cd4a8fa1ad97cff4bb1f64da13ce9ebfcfc4562.png +
    +
  • Base: linear address for the start of the segment
  • +
  • Limit: size of the segment
  • +
  • G: granularity bit: if set the size is in 4K pages otherwise in bytes
  • +
  • B/D: data/code
  • +
  • Type: code segment, data/stack, TSS, LDT, GDT
  • +
  • Protection: the minimum privilege level required to access the +segment (RPL is checked against DPL)
  • +
+

Some of the descriptor fields should be familiar. And that is because +there is some resemblance with Interrupt Descriptors we looked at +previously.

+
+
+

Segmentation in Linux

+

In Linux, segments are not used to define the stack, code or +data. These will be setup using the paging unit as it allows better +granularity and more importantly it allows Linux to use a generic +approach that works on other architectures (that don't have +segmentation support).

+

However, because the segmentation unit can not be disabled Linux must +create 4 generic 0 - 4GB segments for: kernel code, kernel data, user +code and user data.

+

Besides these, Linux uses segments for implementing Thread Local +Storage (TLS) together with the set_thread_area system call.

+

It also uses the TSS segment in order to define the kernel stack to +use when a change of privilege (e.g. system call, interrupt while +running in user-space) occurs.

+
/*
+ * The layout of the per-CPU GDT under Linux:
+ *
+ *   0 - null                                                             <=== cacheline #1
+ *   1 - reserved
+ *   2 - reserved
+ *   3 - reserved
+ *
+ *   4 - unused                                                           <=== cacheline #2
+ *   5 - unused
+ *
+ *  ------- start of TLS (Thread-Local Storage) segments:
+ *
+ *   6 - TLS segment #1                   [ glibc's TLS segment ]
+ *   7 - TLS segment #2                   [ Wine's %fs Win32 segment ]
+ *   8 - TLS segment #3                                                   <=== cacheline #3
+ *   9 - reserved
+ *  10 - reserved
+ *  11 - reserved
+ *
+ *  ------- start of kernel segments:
+ *
+ *  12 - kernel code segment                                              <=== cacheline #4
+ *  13 - kernel data segment
+ *  14 - default user CS
+ *  15 - default user DS
+ *  16 - TSS                                                              <=== cacheline #5
+ *  17 - LDT
+ *  18 - PNPBIOS support (16->32 gate)
+ *  19 - PNPBIOS support
+ *  20 - PNPBIOS support                                                  <=== cacheline #6
+ *  21 - PNPBIOS support
+ *  22 - PNPBIOS support
+ *  23 - APM BIOS support
+ *  24 - APM BIOS support                                                 <=== cacheline #7
+ *  25 - APM BIOS support
+ *
+ *  26 - ESPFIX small SS
+ *  27 - per-cpu                  [ offset to per-cpu data area ]
+ *  28 - stack_canary-20          [ for stack protector ]                 <=== cacheline #8
+ *  29 - unused
+ *  30 - unused
+ *  31 - TSS for double fault handler
+ */
+
+ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+ #ifdef CONFIG_X86_64
+         /*
+          * We need valid kernel segments for data and code in long mode too
+          * IRET will check the segment types  kkeil 2000/10/28
+          * Also sysret mandates a special GDT layout
+          *
+          * TLS descriptors are currently at a different place compared to i386.
+          * Hopefully nobody expects them at a fixed place (Wine?)
+          */
+         [GDT_ENTRY_KERNEL32_CS]         = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER32_CS]   = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ #else
+         [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
+         [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
+         [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+         /*
+          * Segments used for calling PnP BIOS have byte granularity.
+          * They code segments and data segments have fixed 64k limits,
+          * the transfer segment sizes are set at run time.
+          */
+         /* 32-bit code */
+         [GDT_ENTRY_PNPBIOS_CS32]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+         /* 16-bit code */
+         [GDT_ENTRY_PNPBIOS_CS16]        = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_DS]          = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_TS1]         = GDT_ENTRY_INIT(0x0092, 0, 0),
+         /* 16-bit data */
+         [GDT_ENTRY_PNPBIOS_TS2]         = GDT_ENTRY_INIT(0x0092, 0, 0),
+         /*
+          * The APM segments have byte granularity and their bases
+          * are set at run time.  All have 64k limits.
+          */
+         /* 32-bit code */
+         [GDT_ENTRY_APMBIOS_BASE]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+         /* 16-bit code */
+         [GDT_ENTRY_APMBIOS_BASE+1]      = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+         /* data */
+         [GDT_ENTRY_APMBIOS_BASE+2]      = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
+
+         [GDT_ENTRY_ESPFIX_SS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         [GDT_ENTRY_PERCPU]              = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+         GDT_STACK_CANARY_INIT
+ #endif
+ } };
+ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+
+
+
+

Inspecting selectors and segments

+

 

+
+
+

x86 Paging

+

The x86 paging unit supports two types of paging: regular and extended paging.

+

Regular paging has 2 levels and a fixed page size of 4KB. The linear +address is split in three fields:

+
    +
  • Directory (the 10 most significant bits)
  • +
  • Table (the next 10 bits)
  • +
  • Offset (the least significant 12 bits)
  • +
+

 

+../_images/ditaa-def299abebe530d760a6c8f16c791bbb016f9238.png +

When extended paging is enabled, a single level is used and pages are +4MB. The linear address is split in two fields:

+
    +
  • Directory (10 most significant bits)
  • +
  • Offset (least significant 22 bits)
  • +
+../_images/ditaa-709c2e7a68bfcdcfe9c1938d6ef2a0c9b5627931.png +
+
+

Page tables

+

We can mix regular and extended paging; the directory page has a bit +that specifies if extended or regular paging should be used. The +special CR3 register points to the base of the page directory and page +directory entries point to the base of the page table.

+

Both page directory and page table have 1024 entries and each entry +has 4 bytes.

+

All tables are stored in memory and the page table addresses are +physical addresses.

+

Page table entry fields:

+
    +
  • Present/Absent
  • +
  • PFN (Page Frame Number): the 20 most significant bits of the physical address
  • +
  • Accessed - set by hardware when the page is accessed, but never cleared by hardware (can be used by OS for housekeeping)
  • +
  • Dirty - set by hardware when the page is written, but never cleared by hardware (can be used by OS for housekeeping)
  • +
  • Access rights: Read/Write
  • +
  • Privilege: User/Supervisor
  • +
  • Page size - only for page directory; if set extended paging is used
  • +
  • PCD (page cache disable), PWT (page write through)
  • +
+
+
+

Linux paging

+

Linux paging uses 4 levels in order to support 64bit +architectures. The diagram below shows how the various virtual address +chunks are used to index the page tables and compute the physical +address.

+../_images/ditaa-5e4d73e3fcb24db9d1f8c16daddf98694c063fe6.png +

Linux has a common API for creating and walking page tables. Creating +and modifying address spaces for kernel and processes is done using +the same generic code which relies on macros and functions to +translate these generic operations in code that runs on different +architectures.

+

Here is an example of how we can translate a virtual address to a +physical address, using the Linux page table APIs:

+
struct page *page;
+pgd_t pgd;
+pmd_t pmd;
+pud_t pud;
+pte_t pte;
+void *laddr, *paddr;
+
+pgd = pgd_offset(mm, vaddr);
+pud = pud_offset(pgd, vaddr);
+pmd = pmd_offset(pud, vaddr);
+pte = pte_offset(pmd, vaddr);
+page = pte_page(pte);
+laddr = page_address(page);
+paddr = virt_to_phys(laddr);
+
+
+

In order to support architectures with fewer than 4 levels of +paging (such as 32-bit x86), some macros and/or functions are +0 / empty:

+
static inline pud_t * pud_offset(pgd_t * pgd,unsigned long address)
+{
+    return (pud_t *)pgd;
+}
+
+static inline pmd_t * pmd_offset(pud_t * pud,unsigned long address)
+{
+    return (pmd_t *)pud;
+}
+
+
+
+
+

Translation Look-aside Buffer

+

When using virtual memory, due to the page table organization, we may +need an extra 1 (x86 extended paging), 2 (x86 regular paging) or 3 +(x86 64bit) memory access(es).

+

A special cache, called Translation Look-aside Buffer (TLB) is used to +speed up translations from virtual address to physical addresses.

+

The TLB has the following properties:

+
    +
  • Caches paging information (PFN, rights, privilege)
  • +
  • Content Addressable Memory / Associative Memory
      +
    • Very small (64-128)
    • +
    • Very fast (single cycle due to parallel search implementation)
    • +
    +
  • +
  • CPUs usually have two TLBs: i-TLB (code) and d-TLB (data)
  • +
  • TLB miss penalty: up to hundreds of cycles
  • +
+

As with other caches, we must be careful to not create consistency +issues.

+

For example, when changing the mapping of one page to point to a +different physical memory location in the page tables, we must +invalidate the associated TLB entry. Otherwise, the MMU will do the +translation to the old physical address instead of the new physical +address.

+

The x86 platform supports TLB invalidation through two types of +operations.

+

Single address invalidation:

+
mov $addr, %eax
+invlpg (%eax)
+
+
+

Full invalidation:

+
mov %cr3, %eax
+mov %eax, %cr3
+
+
+
+
+
+

Linux address space

+
+

Address space options for 32bit systems

+

There are two main options for implementing kernel and user space: +either dedicated address spaces for each, or split a shared address +space.

+

 

+../_images/ditaa-d5d1129b0298a2ea5f116c9d4b246eb1b888db6b.png +

Each has advantages and disadvantages:

+
    +
  • Disadvantages for dedicated kernel space:
      +
    • Fully invalidating the TLB for every system call
    • +
    +
  • +
  • Disadvantages for shared address space
      +
    • Less address space for both kernel and user processes
    • +
    +
  • +
+

Linux is using a split address space for 32 bit systems, although in +the past there were options for supporting a 4G/4G split or dedicated +kernel address space (on those architectures that support it, +e.g. x86). Linux always uses split address space for 64 bit systems.

+

An overview of the Linux address space is presented below:

+

 

+../_images/ditaa-3985c420def8f30934a72ea8c738a00ed629c298.png +
+
+

Linear mappings

+

Linear mappings refer to a particular way of mapping virtual pages to +physical pages, where virtual pages V, V + 1, ..., V + n are mapped to +physical pages P, P + 1, ..., P + n.

+

To understand the necessity of linear mappings, we should look at +common kernel operations that involve using both the virtual and +physical address of a page, such as an I/O transfer:

+
    +
  • Use the virtual address of a kernel buffer in order to copy +data from user space
  • +
  • Walk the page tables to transform the kernel buffer virtual +address to a physical address
  • +
  • Use the physical address of the kernel buffer to start a DMA +transfer
  • +
+

However, if we use linear mappings and the kernel buffers are in the +linear mapping area, then:

+
    +
  • Virtual to physical address space translation is reduced to one +operation (instead of walking the page tables)
  • +
  • Less memory is used to create the page tables
  • +
  • Fewer TLB entries are used for the kernel memory
  • +
+
+
+

Highmem

+

The "highmem" part of the virtual address space is used to create +arbitrary mappings (as opposed to linear mappings in lowmem). On 32bit +systems the highmem area is absolutely required in order to access +physical memory outside of lowmem. However, highmem is also used on +64bit systems but the use-case there is mainly to allow arbitrary +mappings in kernel space.

+

 

+../_images/ditaa-bb8455a43088bf800eece11869f6ff857574605d.png +

There are multiple types of mappings in the highmem area:

+
    +
  • Multi-page permanent mappings (vmalloc, ioremap)
  • +
  • Temporary 1 page mappings (kmap_atomic)
  • +
  • Permanent 1 page mappings (kmap, fix-mapped linear addresses)
  • +
+

Multiple page mappings allow mapping of ranges of physical memory +into the highmem area. Each such mapping is guarded by a +non-accessible page to catch buffer overflow and underflow errors.

+

The APIs that map multiple pages into highmem are:

+
void* vmalloc(unsigned long size);
+void vfree(void * addr);
+
+void *ioremap(unsigned long offset, unsigned size);
+void iounmap(void * addr);
+
+
+

vmalloc() is used to allocate non-contiguous system memory +pages as a contiguous segment in the kernel virtual address space. It +is useful when allocating large buffers because, due to fragmentation, +it is unlikely to find large free chunks of physically contiguous memory.

+

ioremap() is used to map device memory or device registers +into the kernel address space. It maps a contiguous physical memory +range into highmem with page caching disabled.

+
+
+

Fixed-mapped linear addresses

+

Fixed-mapped linear addresses are a special class of singular page +mappings that are used for accessing registers of commonly used +peripherals such as the APIC or IO APIC.

+

Typical I/O access for peripherals is to use a base (the kernel +virtual address space where the peripheral registers are mapped) + +offsets for various registers.

+

In order to optimize access, the base is reserved at compile time +(e.g. 0xFFFFF000). Since the base is constant, the various register +accesses of the form base + register offset will also be constant +and thus the compiler will avoid generating an extra instruction.

+

In summary, fixed-mapped linear addresses are:

+
    +
  • Reserved virtual addresses (constants)
  • +
  • Mapped to physical addresses during boot
  • +
+
set_fixmap(idx, phys_addr)
+set_fixmap_nocache(idx, phys_addr)
+
+
+

These addresses are architecture defined and, as an example, this is +the map for x86:

+
/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * for x86_32: We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+
+enum fixed_addresses {
+#ifdef CONFIG_X86_32
+    FIX_HOLE,
+#else
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+    VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
+#endif
+#endif
+    FIX_DBGP_BASE,
+    FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+    FIX_OHCI1394_BASE,
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+    FIX_APIC_BASE,        /* local (CPU) APIC) -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+    FIX_IO_APIC_BASE_0,
+    FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#ifdef CONFIG_X86_32
+    FIX_KMAP_BEGIN,       /* reserved pte's for temporary kernel mappings */
+    FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+    FIX_PCIE_MCFG,
+#endif
+
+
+

Notice how easy it is to do the conversion between the virtual address +and the fixed address indexes:

+
#define __fix_to_virt(x)  (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x)  ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+#ifndef __ASSEMBLY__
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-deference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+ static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+ {
+     BUILD_BUG_ON(idx >= __end_of_fixed_addresses);
+     return __fix_to_virt(idx);
+ }
+
+ static inline unsigned long virt_to_fix(const unsigned long vaddr)
+ {
+     BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+     return __virt_to_fix(vaddr);
+ }
+
+
+ inline long fix_to_virt(const unsigned int idx)
+ {
+     if (idx >= __end_of_fixed_addresses)
+         __this_fixmap_does_not_exist();
+     return (0xffffe000UL - (idx << PAGE_SHIFT));
+ }
+
+
+
+
+

Temporary mappings

+

Temporary mappings can be used to map a single physical page, very +fast, in kernel space. It can be used in interrupt context but the +atomic kmap section, defined in between the kmap_atomic() and +kunmap_atomic() can not be preempted. That is why these are +called temporary mappings, as they can only be used momentarily.

+

Temporary mappings are very fast because there is no locking or +searching required and also there is no full TLB invalidation, just +the particular virtual page will be TLB invalidated.

+

Here are some code snippets that show how temporary mappings are +implemented:

+
#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot)
+
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
+{
+  unsigned long vaddr;
+  int idx, type;
+
+  type = kmap_atomic_idx_push();
+  idx = type + KM_TYPE_NR*smp_processor_id();
+  vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+  BUG_ON(!pte_none(*(kmap_pte-idx)));
+  set_pte(kmap_pte-idx, mk_pte(page, prot));
+  arch_flush_lazy_mmu_mode();
+
+  return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_high_prot);
+
+static inline int kmap_atomic_idx_push(void)
+{
+  int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+  WARN_ON_ONCE(in_irq() && !irqs_disabled());
+  BUG_ON(idx >= KM_TYPE_NR);
+#endif
+  return idx;
+}
+
+
+

Notice that fix-mapped linear addresses and a stack-like approach are +used: each CPU has KM_TYPE_NR reserved entries which are used on a +first come, first served basis. This allows using multiple temporary +mappings at once, for example one in process context, one in an +interrupt handler, and a few more in tasklets or softirqs.

+
+
+

Permanent mappings

+

Permanent mappings allow users to hold on to a mapping for long +(undefined) periods of time, which means that context switches are +allowed after a mapping and before releasing it.

+

This flexibility comes with a price though. A search operation is +performed to find a free entry and they can not be used in interrupt +context - the operation that tries to find a free virtual address page +may block. There is a limited number of permanent mappings available +(typically one page is reserved for permanent mappings).

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec7-memory-management-slides.html b/refs/pull/405/merge/so2/lec7-memory-management-slides.html new file mode 100644 index 00000000..b3adc06f --- /dev/null +++ b/refs/pull/405/merge/so2/lec7-memory-management-slides.html @@ -0,0 +1,560 @@ + + + + + + + + SO2 Lecture 07 - Memory Management — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 07 - Memory Management

+ + + + + +
+
+ +

Memory Management

+ +
    +
  • Physical Memory Management
      +
    • Page allocations
    • +
    • Small allocations
    • +
    +
  • +
  • Virtual Memory Management
  • +
  • Page Fault Handling Overview
  • +
+ + + + +
+
+ +

Physical Memory Management

+ +
    +
  • Algorithms and data structures that keep track of physical memory +pages
  • +
  • Independent of virtual memory management
  • +
  • Both virtual and physical memory management is required for complete +memory management
  • +
  • Physical pages are being tracked using a special data structure: +struct page
  • +
  • All physical pages have an entry reserved in the mem_map +vector
  • +
  • The physical page status may include: a counter for how many +times a page is used, position in swap or file, buffers for this +page, position in the page cache, etc.
  • +
+ + + + +
+
+ +

Memory zones

+ +
    +
  • DMA zone
  • +
  • DMA32 zone
  • +
  • Normal zone (LowMem)
  • +
  • HighMem Zone
  • +
  • Movable Zone
  • +
+ + + + +
+
+ +

Non-Uniform Memory Access

+ +
    +
  • Physical memory is split in between multiple nodes, one for each CPU
  • +
  • There is a single physical address space accessible from every node
  • +
  • Access to the local memory is faster
  • +
  • Each node maintains its own memory zones (i.e. DMA, NORMAL, HIGHMEM, etc.)
  • +
+ + + + +
+
+ +

Page allocation

+ +
/* Allocates 2^order contiguous pages and returns a pointer to the
+ * descriptor for the first page
+ */
+struct page *alloc_pages(gfp_mask, order);
+
+/* allocates a single page */
+struct page *alloc_page(gfp_mask);
+
+
+/* helper functions that return the kernel virtual address */
+void *__get_free_pages(gfp_mask, order);
+void *__get_free_page(gfp_mask);
+void *__get_zero_page(gfp_mask);
+void *__get_dma_pages(gfp_mask, order);
+
+
+ + + + +
+
+ +

Why only allocate pages in chunks of power of 2?

+ +
    +
  • Typical memory allocation algorithms have linear complexity
  • +
  • Why not use paging?
      +
    • Sometimes we do need contiguous memory allocations (for DMA)
    • +
    • Allocation would require page table changes and TLB flushes
    • +
    • Not able to use extended pages
    • +
    • Some architectures directly (in hardware) linearly map a part +of the address space (e.g. MIPS)
    • +
    +
  • +
+ + + + +
+
+ +

The buddy algorithm

+ +
    +
  • Free blocks are distributed in multiple lists
  • +
  • Each list contains blocks of the same size
  • +
  • The block size is a power of two
  • +
+ + + + +
+
+ +

Allocating a block of size N

+ +
    +
  • If there is a free block in the N-size list, pick the first
  • +
  • If not, look for a free block in the 2N-size list
  • +
  • Split the 2N-size block in two N-size blocks and add them to the +N-size list
  • +
  • Now that we have the N-size list populated, pick the first free +block from that list
  • +
+ + + + +
+
+ +

Freeing a block of size N

+ +
    +
  • If the "buddy" is free coalesce into a 2N-size block
  • +
  • Try until no more free buddy block is found and place the +resulting block in the respective list
  • +
+ + + + +
+
+ +

The Linux implementation

+ +
    +
  • 11 lists for blocks of 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, +1024 pages
  • +
  • Each memory zone has its own buddy allocator
  • +
  • Each zone has a vector of descriptors for free blocks, one entry +for each size
  • +
  • The descriptor contains the number of free blocks and the head of +the list
  • +
  • Blocks are linked in the list using the lru field of +struct page
  • +
  • Free pages have the PG_buddy flag set
  • +
  • The page descriptor keeps a copy of the block size in the private +field to easily check if the "buddy" is free
  • +
+ + + + +
+
+ +

Small allocations

+ +
    +
  • Buddy is used to allocate pages
  • +
  • Many of the kernel subsystems need to allocate buffers smaller +than a page
  • +
  • Typical solution: variable size buffer allocation
      +
    • Leads to external fragmentation
    • +
    +
  • +
  • Alternative solution: fixed size buffer allocation
      +
    • Leads to internal fragmentation
    • +
    +
  • +
  • Compromise: fixed size block allocation with multiple sizes, geometrically distributed
      +
    • e.g.: 32, 64, ..., 131072
    • +
    +
  • +
+ + + + +
+
+ +

The SLAB allocator

+ +
    +
  • Buffers = objects
  • +
  • Uses buddy to allocate a pool of pages for object allocations
  • +
  • Each object (optionally) has a constructor and destructor
  • +
  • Deallocated objects are cached - avoids subsequent calls for +constructors and buddy allocation / deallocation
  • +
+ + + + +
+
+ +

Why SLAB?

+ +
    +
  • The kernel will typically allocate and deallocate multiple times +the same data structures over time (e.g. struct +task_struct) effectively using fixed size allocations. Using the +SLAB reduces the frequency of the more heavy +allocation/deallocation operations.
  • +
  • For variable size buffers (which occurs less frequently) a +geometric distribution of caches with fixed-size can be used
  • +
  • Reduces the memory allocation foot-print since we are searching a +much smaller memory area, compared to buddy which can span over a +larger area
  • +
  • Employs cache optimization techniques (slab coloring)
  • +
+ + + + +
+
+ +

Slab architecture

+ +../_images/slab-overview1.png + + + + +
+
+ +

Cache descriptors

+ +
    +
  • A name to identify the cache for stats
  • +
  • object constructor and destructor functions
  • +
  • size of the objects
  • +
  • Flags
  • +
  • Size of the slab in power of 2 pages
  • +
  • GFP masks
  • +
  • One or more slabs, grouped by state: full, partially full, empty
  • +
+ + + + +
+
+ +

SLAB descriptors

+ +
    +
  • Number of objects
  • +
  • Memory region where the objects are stored
  • +
  • Pointer to the first free object
  • +
  • Descriptors are stored either in
      +
    • the SLAB itself (if the object size is lower than 512 or if +internal fragmentation leaves enough space for the SLAB +descriptor)
    • +
    • in generic caches internally used by the SLAB allocator
    • +
    +
  • +
+ + + + +
+
+ +

Slab detailed architecture

+ +../_images/slab-detailed-arch1.png + + + + +
+
+ +

Generic vs specific caches

+ +
    +
  • Generic caches are used internally by the slab allocator
      +
    • allocating memory for cache and slab descriptors
    • +
    +
  • +
  • They are also used to implement kmalloc() by implementing +20 caches with object sizes geometrically distributed between +32bytes and 4MB
  • +
  • Specific caches are created on demand by kernel subsystems
  • +
+ + + + +
+
+ +

Object descriptors

+ +../_images/slab-object-descriptors1.png + + + + +
+
+ +

Object descriptors

+ +
    +
  • Only used for free objects
  • +
  • An integer that points to the next free object
  • +
  • The last free object uses a terminator value
  • +
  • Internal descriptors - stored in the slab
  • +
  • External descriptors - stored in generic caches
  • +
+ + + + +
+
+ +

SLAB coloring

+ +../_images/slab-coloring1.png + + + + +
+
+ +

Virtual memory management

+ +
    +
  • Used in both kernel and user space
  • +
  • Using virtual memory requires:
      +
    • reserving (allocating) a segment in the virtual address space +(be it kernel or user)
    • +
    • allocating one or more physical pages for the buffer
    • +
    • allocating one or more physical pages for page tables and +internal structures
    • +
    • mapping the virtual memory segment to the physical allocated +pages
    • +
    +
  • +
+ + + + +
+
+ +

Address space descriptors

+ +

 

+../_images/ditaa-0eda95a3f39dfac448fd07589656b123d3548328.png + + + + +
+
+ +

Address space descriptors

+ +
    +
  • Page table is used either by:
      +
    • The CPU's MMU
    • +
    • The kernel to handle TLB exception (some RISC processors)
    • +
    +
  • +
  • The address space descriptor is used by the kernel to maintain +high level information such as file and file offset (for mmap +with files), read-only segment, copy-on-write segment, etc.
  • +
+ + + + +
+
+ +

Allocating virtual memory

+ +
    +
  • Search a free area in the address space descriptor
  • +
  • Allocate memory for a new area descriptor
  • +
  • Insert the new area descriptor in the address space descriptor
  • +
  • Allocate physical memory for one or more page tables
  • +
  • Setup the page tables for the newly allocated area in the virtual +address space
  • +
  • Allocate (on demand) physical pages and map them in the virtual +address space by updating the page tables
  • +
+ + + + +
+
+ +

Freeing virtual memory

+ +
    +
  • Removing the area descriptor
  • +
  • Freeing the area descriptor memory
  • +
  • Updating the page tables to remove the area from the virtual +address space
  • +
  • Flushing the TLB for the freed virtual memory area
  • +
  • Freeing physical memory of the page tables associated with the +freed area
  • +
  • Freeing physical memory of the freed virtual memory area
  • +
+ + + + +
+
+ +

Linux virtual memory management

+ +
    +
  • Kernel
      +
    • vmalloc
        +
      • area descriptor: struct vm_struct
      • +
      • address space descriptor: simple linked list of struct vm_struct
      • +
      +
    • +
    +
  • +
  • Userspace
      +
    • area descriptor: struct vm_area_struct
    • +
    • address space descriptor: struct mm_struct, red-black tree
    • +
    +
  • +
+ + + + +
+
+ +

Linux virtual memory management

+ +../_images/page-fault-handling1.png + + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec7-memory-management.html b/refs/pull/405/merge/so2/lec7-memory-management.html new file mode 100644 index 00000000..cf36a285 --- /dev/null +++ b/refs/pull/405/merge/so2/lec7-memory-management.html @@ -0,0 +1,468 @@ + + + + + + SO2 Lecture 07 - Memory Management — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 07 - Memory Management

+

View slides

+
+

Lecture objectives:

+
    +
  • Physical Memory Management
      +
    • Page allocations
    • +
    • Small allocations
    • +
    +
  • +
  • Virtual Memory Management
  • +
  • Page Fault Handling Overview
  • +
+
+
+

Physical Memory Management

+
    +
  • Algorithms and data structures that keep track of physical memory +pages
  • +
  • Independent of virtual memory management
  • +
  • Both virtual and physical memory management is required for complete +memory management
  • +
  • Physical pages are being tracked using a special data structure: +struct page
  • +
  • All physical pages have an entry reserved in the mem_map +vector
  • +
  • The physical page status may include: a counter for how many +times a page is used, position in swap or file, buffers for this +page, position in the page cache, etc.
  • +
+
+

Memory zones

+
    +
  • DMA zone
  • +
  • DMA32 zone
  • +
  • Normal zone (LowMem)
  • +
  • HighMem Zone
  • +
  • Movable Zone
  • +
+
+
+

Non-Uniform Memory Access

+
    +
  • Physical memory is split in between multiple nodes, one for each CPU
  • +
  • There is a single physical address space accessible from every node
  • +
  • Access to the local memory is faster
  • +
  • Each node maintains its own memory zones (i.e. DMA, NORMAL, HIGHMEM, etc.)
  • +
+
+
+

Page allocation

+
/* Allocates 2^order contiguous pages and returns a pointer to the
+ * descriptor for the first page
+ */
+struct page *alloc_pages(gfp_mask, order);
+
+/* allocates a single page */
+struct page *alloc_page(gfp_mask);
+
+
+/* helper functions that return the kernel virtual address */
+void *__get_free_pages(gfp_mask, order);
+void *__get_free_page(gfp_mask);
+void *__get_zero_page(gfp_mask);
+void *__get_dma_pages(gfp_mask, order);
+
+
+
    +
  • Typical memory allocation algorithms have linear complexity
  • +
  • Why not use paging?
      +
    • Sometimes we do need contiguous memory allocations (for DMA)
    • +
    • Allocation would require page table changes and TLB flushes
    • +
    • Not able to use extended pages
    • +
    • Some architectures directly (in hardware) linearly map a part +of the address space (e.g. MIPS)
    • +
    +
  • +
+
    +
  • Free blocks are distributed in multiple lists
  • +
  • Each list contains blocks of the same size
  • +
  • The block size is a power of two
  • +
+
    +
  • If there is a free block in the N-size list, pick the first
  • +
  • If not, look for a free block in the 2N-size list
  • +
  • Split the 2N-size block in two N-size blocks and add them to the +N-size list
  • +
  • Now that we have the N-size list populated, pick the first free +block from that list
  • +
+
    +
  • If the "buddy" is free coalesce into a 2N-size block
  • +
  • Try until no more free buddy block is found and place the +resulting block in the respective list
  • +
+
    +
  • 11 lists for blocks of 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, +1024 pages
  • +
  • Each memory zone has its own buddy allocator
  • +
  • Each zone has a vector of descriptors for free blocks, one entry +for each size
  • +
  • The descriptor contains the number of free blocks and the head of +the list
  • +
  • Blocks are linked in the list using the lru field of +struct page
  • +
  • Free pages have the PG_buddy flag set
  • +
  • The page descriptor keeps a copy of the block size in the private +field to easily check if the "buddy" is free
  • +
+
+
+

Small allocations

+
    +
  • Buddy is used to allocate pages
  • +
  • Many of the kernel subsystems need to allocate buffers smaller +than a page
  • +
  • Typical solution: variable size buffer allocation
      +
    • Leads to external fragmentation
    • +
    +
  • +
  • Alternative solution: fixed size buffer allocation
      +
    • Leads to internal fragmentation
    • +
    +
  • +
  • Compromise: fixed size block allocation with multiple sizes, geometrically distributed
      +
    • e.g.: 32, 64, ..., 131072
    • +
    +
  • +
+
    +
  • Buffers = objects
  • +
  • Uses buddy to allocate a pool of pages for object allocations
  • +
  • Each object (optionally) has a constructor and destructor
  • +
  • Deallocated objects are cached - avoids subsequent calls for +constructors and buddy allocation / deallocation
  • +
+
    +
  • The kernel will typically allocate and deallocate multiple times +the same data structures over time (e.g. struct +task_struct) effectively using fixed size allocations. Using the +SLAB reduces the frequency of the more heavy +allocation/deallocation operations.
  • +
  • For variable size buffers (which occurs less frequently) a +geometric distribution of caches with fixed-size can be used
  • +
  • Reduces the memory allocation foot-print since we are searching a +much smaller memory area, compared to buddy which can span over a +larger area
  • +
  • Employs cache optimization techniques (slab coloring)
  • +
+../_images/slab-overview1.png +
    +
  • A name to identify the cache for stats
  • +
  • object constructor and destructor functions
  • +
  • size of the objects
  • +
  • Flags
  • +
  • Size of the slab in power of 2 pages
  • +
  • GFP masks
  • +
  • One or more slabs, grouped by state: full, partially full, empty
  • +
+
    +
  • Number of objects
  • +
  • Memory region where the objects are stored
  • +
  • Pointer to the first free object
  • +
  • Descriptors are stored either in
      +
    • the SLAB itself (if the object size is lower than 512 or if +internal fragmentation leaves enough space for the SLAB +descriptor)
    • +
    • in generic caches internally used by the SLAB allocator
    • +
    +
  • +
+../_images/slab-detailed-arch1.png +
    +
  • Generic caches are used internally by the slab allocator
      +
    • allocating memory for cache and slab descriptors
    • +
    +
  • +
  • They are also used to implement kmalloc() by implementing +20 caches with object sizes geometrically distributed between +32bytes and 4MB
  • +
  • Specific caches are created on demand by kernel subsystems
  • +
+../_images/slab-object-descriptors1.png +
    +
  • Only used for free objects
  • +
  • An integer that points to the next free object
  • +
  • The last free object uses a terminator value
  • +
  • Internal descriptors - stored in the slab
  • +
  • External descriptors - stored in generic caches
  • +
+../_images/slab-coloring1.png +
+
+
+

Virtual memory management

+
    +
  • Used in both kernel and user space
  • +
  • Using virtual memory requires:
      +
    • reserving (allocating) a segment in the virtual address space +(be it kernel or user)
    • +
    • allocating one or more physical pages for the buffer
    • +
    • allocating one or more physical pages for page tables and +internal structures
    • +
    • mapping the virtual memory segment to the physical allocated +pages
    • +
    +
  • +
+

 

+../_images/ditaa-0eda95a3f39dfac448fd07589656b123d3548328.png +
    +
  • Page table is used either by:
      +
    • The CPU's MMU
    • +
    • The kernel to handle TLB exception (some RISC processors)
    • +
    +
  • +
  • The address space descriptor is used by the kernel to maintain +high level information such as file and file offset (for mmap +with files), read-only segment, copy-on-write segment, etc.
  • +
+
    +
  • Search a free area in the address space descriptor
  • +
  • Allocate memory for a new area descriptor
  • +
  • Insert the new area descriptor in the address space descriptor
  • +
  • Allocate physical memory for one or more page tables
  • +
  • Setup the page tables for the newly allocated area in the virtual +address space
  • +
  • Allocate (on demand) physical pages and map them in the virtual +address space by updating the page tables
  • +
+
    +
  • Removing the area descriptor
  • +
  • Freeing the area descriptor memory
  • +
  • Updating the page tables to remove the area from the virtual +address space
  • +
  • Flushing the TLB for the freed virtual memory area
  • +
  • Freeing physical memory of the page tables associated with the +freed area
  • +
  • Freeing physical memory of the freed virtual memory area
  • +
+
    +
  • Kernel
      +
    • vmalloc
        +
      • area descriptor: struct vm_struct
      • +
      • address space descriptor: simple linked list of struct vm_struct
      • +
      +
    • +
    +
  • +
  • Userspace
      +
    • area descriptor: struct vm_area_struct
    • +
    • address space descriptor: struct mm_struct, red-black tree
    • +
    +
  • +
+
+
+

Page fault handling

+../_images/page-fault-handling1.png +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec8-filesystems-slides.html b/refs/pull/405/merge/so2/lec8-filesystems-slides.html new file mode 100644 index 00000000..f757454b --- /dev/null +++ b/refs/pull/405/merge/so2/lec8-filesystems-slides.html @@ -0,0 +1,582 @@ + + + + + + + + SO2 Lecture 08 - Filesystem Management — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 08 - Filesystem Management

+ + + + + +
+
+ +

Filesystem Management

+ +
    +
  • Filesystem abstractions
  • +
  • Filesystem operations
  • +
  • Linux VFS
  • +
  • Overview of Linux I/O Management
  • +
+ + + + +
+
+ +

Filesystem Abstractions

+ +
    +
  • superblock
  • +
  • file
  • +
  • inode
  • +
  • dentry
  • +
+ + + + +
+
+ +

Filesystem Abstractions - in memory

+ +../_images/ditaa-29f54aaa1a85b819ff29cb7d101a4d646b3b0b06.png + + + + +
+
+ +

Filesystem Abstractions - on storage

+ +../_images/ditaa-bc662dab7bb3d9ba3a37efbf69b82c513dcaadd4.png + + + + +
+
+ +

Simple filesystem example

+ +

 

+../_images/ditaa-8b59fc3f5245ffb5d7089dc80cf2e306c39a62d8.png + + + + +
+
+ +

Overview

+ +../_images/ditaa-6d39f541805ae8197b413ec9c79116382abc4dbc.png + + + + +
+
+ +

Filesystem Operations

+ +
    +
  • Mount
  • +
  • Open a file
  • +
  • Querying file attributes
  • +
  • Reading data from a file
  • +
  • Writing data to a file
  • +
  • Creating a file
  • +
  • Deleting a file
  • +
+ + + + +
+
+ +

Mounting a filesystem

+ +
    +
  • Input: a storage device (partition)
  • +
  • Output: dentry pointing to the root directory
  • +
  • Steps: check device, determine filesystem parameters, locate the root inode
  • +
  • Example: check magic, determine block size, read the root inode and create dentry
  • +
+ + + + +
+
+ +

Opening a file

+ +
    +
  • Input: path
  • +
  • Output: file descriptor
  • +
  • Steps:
      +
    • Determine the filesystem type
    • +
    • For each name in the path: lookup parent dentry, load inode, +load data, find dentry
    • +
    • Create a new file that points to the last dentry
    • +
    • Find a free entry in the file descriptor table and set it to file
    • +
    +
  • +
+ + + + +
+
+ +

Querying file attributes

+ +
    +
  • Input: path
  • +
  • Output: file attributes
  • +
  • Steps:
      +
    • Access file->dentry->inode
    • +
    • Read file attributes from the inode
    • +
    +
  • +
+ + + + +
+
+ +

Reading data from a file

+ +
    +
  • Input: file descriptor, offset, length
  • +
  • Output: data
  • +
  • Steps:
      +
    • Access file->dentry->inode
    • +
    • Determine data blocks
    • +
    • Copy data blocks to memory
    • +
    +
  • +
+ + + + +
+
+ +

Writing data to a file

+ +
    +
  • Input: file descriptor, offset, length, data
  • +
  • Output:
  • +
  • Steps:
      +
    • Allocate one or more data blocks
    • +
    • Add the allocated blocks to the inode and update file size
    • +
    • Copy data from userspace to internal buffers and write them to +storage
    • +
    +
  • +
+ + + + +
+
+ +

Closing a file

+ +
    +
  • Input: file descriptor
  • +
  • Output:
  • +
  • Steps:
      +
    • set the file descriptor entry to NULL
    • +
    • Decrement file reference counter
    • +
    • When the counter reaches 0 free file
    • +
    +
  • +
+ + + + +
+
+ +

Directories

+ +

Directories are special files which contain one or more dentries.

+ + + + +
+
+ +

Creating a file

+ +
    +
  • Input: path
  • +
  • Output:
  • +
  • Steps:
      +
    • Determine the inode directory
    • +
    • Read data blocks and find space for a new dentry
    • +
    • Write back the modified inode directory data blocks
    • +
    +
  • +
+ + + + +
+
+ +

Deleting a file

+ +
    +
  • Input: path
  • +
  • Output:
  • +
  • Steps:
      +
    • determine the parent inode
    • +
    • read parent inode data blocks
    • +
    • find and erase the dentry (check for links)
    • +
    • when last file is closed: deallocate data and inode blocks
    • +
    +
  • +
+ + + + +
+
+ +

Virtual File System

+ +../_images/ditaa-e3a27a84dde42de58bcc5c360e1c4b15062507c2.png + + + + +
+
+ +

Superblock Operations

+ +
    +
  • fill_super
  • +
  • put_super
  • +
  • write_super
  • +
  • read_inode
  • +
+
    +
  • write_inode
  • +
  • evict_inode
  • +
  • statfs
  • +
  • remount_fs
  • +
+
+ + + + +
+
+ +

Inode Operations

+ +
    +
  • create
  • +
  • lookup
  • +
  • link
  • +
  • unlink
  • +
  • symlink
  • +
  • mkdir
  • +
+
    +
  • rmdir
  • +
  • rename
  • +
  • readlink
  • +
  • follow_link
  • +
  • put_link
  • +
  • ...
  • +
+
+ + + + +
+
+ +

The Inode Cache

+ +
    +
  • Caches inodes into memory to avoid costly storage operations
  • +
  • An inode is cached until low memory conditions are triggered
  • +
  • inodes are indexed with a hash table
  • +
  • The inode hash function takes the superblock and inode number as +inputs
  • +
+ + + + +
+
+ +

The Dentry Cache

+ +
    +
  • State:
      +
    • Used – d_inode is valid and the dentry object is in use
    • +
    • Unused – d_inode is valid but the dentry object is not in use
    • +
    • Negative – d_inode is not valid; the inode was not yet loaded +or the file was erased
    • +
    +
  • +
  • Dentry cache
      +
    • List of used dentries (dentry->d_state == used)
    • +
    • List of the most recent used dentries (sorted by access time)
    • +
    • Hash table to avoid searching the tree
    • +
    +
  • +
+ + + + +
+
+ +

The Page Cache

+ +
    +
  • Caches file data and not block device data
  • +
  • Uses the struct address_space to translate file offsets +to block offsets
  • +
  • Used for both read / write and mmap
  • +
  • Uses a radix tree
  • +
+ + + + +
+
+ +

struct address_space

+ +
/**
+ * struct address_space - Contents of a cacheable, mappable object.
+ * @host: Owner, either the inode or the block_device.
+ * @i_pages: Cached pages.
+ * @gfp_mask: Memory allocation flags to use for allocating pages.
+ * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @nr_thps: Number of THPs in the pagecache (non-shmem only).
+ * @i_mmap: Tree of private and shared mappings.
+ * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
+ * @nrpages: Number of page entries, protected by the i_pages lock.
+ * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
+ * @writeback_index: Writeback starts here.
+ * @a_ops: Methods.
+ * @flags: Error bits and flags (AS_*).
+ * @wb_err: The most recent error which has occurred.
+ * @private_lock: For use by the owner of the address_space.
+ * @private_list: For use by the owner of the address_space.
+ * @private_data: For use by the owner of the address_space.
+ */
+struct address_space {
+  struct inode            *host;
+  struct xarray           i_pages;
+  gfp_t                   gfp_mask;
+  atomic_t                i_mmap_writable;
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+  /* number of thp, only for non-shmem files */
+  atomic_t                nr_thps;
+#endif
+  struct rb_root_cached   i_mmap;
+  struct rw_semaphore     i_mmap_rwsem;
+  unsigned long           nrpages;
+  unsigned long           nrexceptional;
+  pgoff_t                 writeback_index;
+  const struct address_space_operations *a_ops;
+  unsigned long           flags;
+  errseq_t                wb_err;
+  spinlock_t              private_lock;
+  struct list_head        private_list;
+  void                    *private_data;
+} __attribute__((aligned(sizeof(long)))) __randomize_layout;
+
+struct address_space_operations {
+  int (*writepage)(struct page *page, struct writeback_control *wbc);
+  int (*readpage)(struct file *, struct page *);
+
+  /* Write back some dirty pages from this mapping. */
+  int (*writepages)(struct address_space *, struct writeback_control *);
+
+  /* Set a page dirty.  Return true if this dirtied it */
+  int (*set_page_dirty)(struct page *page);
+
+  /*
+   * Reads in the requested pages. Unlike ->readpage(), this is
+   * PURELY used for read-ahead!.
+   */
+  int (*readpages)(struct file *filp, struct address_space *mapping,
+                  struct list_head *pages, unsigned nr_pages);
+  void (*readahead)(struct readahead_control *);
+
+  int (*write_begin)(struct file *, struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned flags,
+                          struct page **pagep, void **fsdata);
+  int (*write_end)(struct file *, struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned copied,
+                          struct page *page, void *fsdata);
+
+  /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
+  sector_t (*bmap)(struct address_space *, sector_t);
+  void (*invalidatepage) (struct page *, unsigned int, unsigned int);
+  int (*releasepage) (struct page *, gfp_t);
+  void (*freepage)(struct page *);
+  ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+  /*
+   * migrate the contents of a page to the specified target. If
+   * migrate_mode is MIGRATE_ASYNC, it must not block.
+   */
+  int (*migratepage) (struct address_space *,
+                  struct page *, struct page *, enum migrate_mode);
+  bool (*isolate_page)(struct page *, isolate_mode_t);
+  void (*putback_page)(struct page *);
+  int (*launder_page) (struct page *);
+  int (*is_partially_uptodate) (struct page *, unsigned long,
+                                  unsigned long);
+  void (*is_dirty_writeback) (struct page *, bool *, bool *);
+  int (*error_remove_page)(struct address_space *, struct page *);
+
+  /* swapfile support */
+  int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
+                          sector_t *span);
+  void (*swap_deactivate)(struct file *file);
+};
+
+
+ + + + +
+
+ +

Reading data

+ +
/**
+ * generic_file_read_iter - generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the "read_iter()" routine for all filesystems
+ * that can use the page cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
+ * be returned when no data can be read without waiting for I/O requests
+ * to complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
+ * requests shall be made for the read or for readahead.  When no data
+ * can be read, -EAGAIN shall be returned.  When readahead would be
+ * triggered, a partial, possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+
+/*
+ * Generic "read page" function for block devices that have the normal
+ * get_block functionality. This is most of the block device filesystems.
+ * Reads the page asynchronously --- the unlock_buffer() and
+ * set/clear_buffer_uptodate() functions propagate buffer state into the
+ * page struct once IO has completed.
+ */
+int block_read_full_page(struct page *page, get_block_t *get_block)
+
+
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec8-filesystems.html b/refs/pull/405/merge/so2/lec8-filesystems.html new file mode 100644 index 00000000..55b95608 --- /dev/null +++ b/refs/pull/405/merge/so2/lec8-filesystems.html @@ -0,0 +1,689 @@ + + + + + + SO2 Lecture 08 - Filesystem Management — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 08 - Filesystem Management

+

View slides

+
+

Lecture objectives:

+
    +
  • Filesystem abstractions
  • +
  • Filesystem operations
  • +
  • Linux VFS
  • +
  • Overview of Linux I/O Management
  • +
+
+
+

Filesystem Abstractions

+

A filesystem is a way to organize files and directories on storage +devices such as hard disks, SSDs or flash memory. There are many types +of filesystems (e.g. FAT, ext4, btrfs, ntfs) and on one running system +we can have multiple instances of the same filesystem type in use.

+

While filesystems use different data structures to organize the +files, directories, user data and meta (internal) data on storage +devices there are a few common abstractions that are used in almost +all filesystems:

+
    +
  • superblock
  • +
  • file
  • +
  • inode
  • +
  • dentry
  • +
+

Some of these abstractions are present both on disk and in memory +while some are only present in memory.

+

The superblock abstraction contains information about the filesystem +instance such as the block size, the root inode, filesystem size. It +is present both on storage and in memory (for caching purposes).

+

The file abstraction contains information about an opened file such +as the current file pointer. It only exists in memory.

+

The inode is identifying a file on disk. It exists both on storage +and in memory (for caching purposes). An inode identifies a file in a +unique way and has various properties such as the file size, access +rights, file type, etc.

+
+

Note

+

The file name is not a property of the file.

+
+

The dentry associates a name with an inode. It exists both on +storage and in memory (for caching purposes).

+

The following diagram shows the relationship between the various filesystem +abstractions as they are used in memory:

+../_images/ditaa-29f54aaa1a85b819ff29cb7d101a4d646b3b0b06.png +

Note that not all of the one to many relationships between the various +abstractions are depicted.

+

Multiple file descriptors can point to the same file because we can +use the dup() system call to duplicate a file descriptor.

+

Multiple file abstractions can point to the same dentry if we open +the same path multiple times.

+

Multiple dentries can point to the same inode when hard links are +used.

+

The following diagram shows the relationship of the filesystem +abstraction on storage:

+../_images/ditaa-bc662dab7bb3d9ba3a37efbf69b82c513dcaadd4.png +

The diagram shows that the superblock is typically stored at the +beginning of the filesystem and that various blocks are used with +different purposes: some to store dentries, some to store inodes and +some to store user data blocks. There are also blocks used to manage +the available free blocks (e.g. bitmaps for the simple filesystems).

+

The next diagram shows a very simple filesystem where blocks are +grouped together by function:

+
    +
  • the superblock contains information about the block size as well as +the IMAP, DMAP, IZONE and DZONE areas.
  • +
  • the IMAP area is comprised of multiple blocks which contain a +bitmap for inode allocation; it maintains the allocated/free state +for all inodes in the IZONE area
  • +
  • the DMAP area is comprised of multiple blocks which contain a +bitmap for data blocks; it maintains the allocated/free state for +all blocks in the DZONE area
  • +
+

 

+../_images/ditaa-8b59fc3f5245ffb5d7089dc80cf2e306c39a62d8.png +
+
+

Filesystem Operations

+

The following diagram shows a high level overview of how the file +system drivers interact with the rest of the file system "stack". In +order to support multiple filesystem types and instances Linux +implements a large and complex subsystem that deals with filesystem +management. This is called Virtual File System (or sometimes Virtual +File Switch) and it is abbreviated with VFS.

+../_images/ditaa-6d39f541805ae8197b413ec9c79116382abc4dbc.png +

VFS translates the complex file management related system calls to +simpler operations that are implemented by the device drivers. These +are some of the operations that a file system must implement:

+
    +
  • Mount
  • +
  • Open a file
  • +
  • Querying file attributes
  • +
  • Reading data from a file
  • +
  • Writing data to a file
  • +
  • Creating a file
  • +
  • Deleting a file
  • +
+

The next sections will look in-depth at some of these operations.

+
+

Mounting a filesystem

+

A summary of a typical implementation is presented below:

+
    +
  • Input: a storage device (partition)
  • +
  • Output: dentry pointing to the root directory
  • +
  • Steps: check device, determine filesystem parameters, locate the root inode
  • +
  • Example: check magic, determine block size, read the root inode and create dentry
  • +
+
+
+

Opening a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: path
  • +
  • Output: file descriptor
  • +
  • Steps:
      +
    • Determine the filesystem type
    • +
    • For each name in the path: lookup parent dentry, load inode, +load data, find dentry
    • +
    • Create a new file that points to the last dentry
    • +
    • Find a free entry in the file descriptor table and set it to file
    • +
    +
  • +
+
+
+

Querying file attributes

+

A summary of a typical implementation is presented below:

+
    +
  • Input: path
  • +
  • Output: file attributes
  • +
  • Steps:
      +
    • Access file->dentry->inode
    • +
    • Read file attributes from the inode
    • +
    +
  • +
+
+
+

Reading data from a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: file descriptor, offset, length
  • +
  • Output: data
  • +
  • Steps:
      +
    • Access file->dentry->inode
    • +
    • Determine data blocks
    • +
    • Copy data blocks to memory
    • +
    +
  • +
+
+
+

Writing data to a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: file descriptor, offset, length, data
  • +
  • Output:
  • +
  • Steps:
      +
    • Allocate one or more data blocks
    • +
    • Add the allocated blocks to the inode and update file size
    • +
    • Copy data from userspace to internal buffers and write them to +storage
    • +
    +
  • +
+
+
+

Closing a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: file descriptor
  • +
  • Output:
  • +
  • Steps:
      +
    • set the file descriptor entry to NULL
    • +
    • Decrement file reference counter
    • +
    • When the counter reaches 0 free file
    • +
    +
  • +
+
+
+

Directories

+

Directories are special files which contain one or more dentries.

+
+
+

Creating a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: path
  • +
  • Output:
  • +
  • Steps:
      +
    • Determine the inode directory
    • +
    • Read data blocks and find space for a new dentry
    • +
    • Write back the modified inode directory data blocks
    • +
    +
  • +
+
+
+

Deleting a file

+

A summary of a typical implementation is presented below:

+
    +
  • Input: path
  • +
  • Output:
  • +
  • Steps:
      +
    • determine the parent inode
    • +
    • read parent inode data blocks
    • +
    • find and erase the dentry (check for links)
    • +
    • when last file is closed: deallocate data and inode blocks
    • +
    +
  • +
+
+
+
+

Linux Virtual File System

+

Although the main purpose for the original introduction of VFS in UNIX +kernels was to support multiple filesystem types and instances, a side +effect was that it simplified filesystem device driver development +since common parts are now implemented in the VFS. Almost all of the +caching and buffer management is dealt with by VFS, leaving just +efficient data storage management to the filesystem device driver.

+

In order to deal with multiple filesystem types, VFS introduced the +common filesystem abstractions previously presented. Note that the +filesystem driver can also use its own particular filesystem +abstractions in memory (e.g. ext4 inode or dentry) and that there +might be a different abstraction on storage as well. Thus we may end +up with three slightly different filesystem abstractions: one for +VFS - always in memory, and two for a particular filesystem - one in +memory used by the filesystem driver, and one on storage.

+../_images/ditaa-e3a27a84dde42de58bcc5c360e1c4b15062507c2.png +
+

Superblock Operations

+

VFS requires that all filesystems implement a set of "superblock +operations".

+

They deal with initializing, updating and freeing the VFS superblock:

+
+
    +
  • fill_super() - reads the filesystem statistics (e.g. total +number of inodes, free number of inodes, total number of blocks, free +number of blocks)
  • +
  • write_super() - updates the superblock information on storage +(e.g. updating the number of free inodes or data blocks)
  • +
  • put_super() - frees any data associated with the filesystem +instance, called when unmounting a filesystem
  • +
+
+

The next class of operations deals with manipulating filesystem +inodes. These operations will receive VFS inodes as parameters but the +filesystem driver may use its own inode structures internally and, if +so, they will convert in between them as necessary.

+

A summary of the superblock operations is presented below:

+
    +
  • fill_super
  • +
  • put_super
  • +
  • write_super
  • +
  • read_inode
  • +
+
    +
  • write_inode
  • +
  • evict_inode
  • +
  • statfs
  • +
  • remount_fs
  • +
+
+
+
+

Inode Operations

+

The next set of operations that VFS calls when interacting with +filesystem device drivers are the "inode operations". Non-intuitively +these mostly deal with manipulating dentries - looking up a file name, +creating, linking and removing files, dealing with symbolic links, +creating and removing directories.

+

This is the list of the most important inode operations:

+
    +
  • create
  • +
  • lookup
  • +
  • link
  • +
  • unlink
  • +
  • symlink
  • +
  • mkdir
  • +
+
    +
  • rmdir
  • +
  • rename
  • +
  • readlink
  • +
  • follow_link
  • +
  • put_link
  • +
  • ...
  • +
+
+
+
+

The Inode Cache

+

The inode cache is used to avoid reading and writing inodes to and +from storage every time we need to read or update them. The cache uses +a hash table and inodes are indexed with a hash function which takes +as parameters the superblock (of a particular filesystem instance) and +the inode number associated with an inode.

+

inodes are cached until either the filesystem is unmounted, the inode +deleted or the system enters a memory pressure state. When this +happens the Linux memory management system will (among other things) +free inodes from the inode cache based on how often they were +accessed.

+
    +
  • Caches inodes into memory to avoid costly storage operations
  • +
  • An inode is cached until low memory conditions are triggered
  • +
  • inodes are indexed with a hash table
  • +
  • The inode hash function takes the superblock and inode number as +inputs
  • +
+
+
+

The Dentry Cache

+
    +
  • State:
      +
    • Used – d_inode is valid and the dentry object is in use
    • +
    • Unused – d_inode is valid but the dentry object is not in use
    • +
    • Negative – d_inode is not valid; the inode was not yet loaded +or the file was erased
    • +
    +
  • +
  • Dentry cache
      +
    • List of used dentries (dentry->d_state == used)
    • +
    • List of the most recent used dentries (sorted by access time)
    • +
    • Hash table to avoid searching the tree
    • +
    +
  • +
+
+
+

The Page Cache

+
    +
  • Caches file data and not block device data
  • +
  • Uses the struct address_space to translate file offsets +to block offsets
  • +
  • Used for both read / write and mmap
  • +
  • Uses a radix tree
  • +
+
/**
+ * struct address_space - Contents of a cacheable, mappable object.
+ * @host: Owner, either the inode or the block_device.
+ * @i_pages: Cached pages.
+ * @gfp_mask: Memory allocation flags to use for allocating pages.
+ * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @nr_thps: Number of THPs in the pagecache (non-shmem only).
+ * @i_mmap: Tree of private and shared mappings.
+ * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
+ * @nrpages: Number of page entries, protected by the i_pages lock.
+ * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
+ * @writeback_index: Writeback starts here.
+ * @a_ops: Methods.
+ * @flags: Error bits and flags (AS_*).
+ * @wb_err: The most recent error which has occurred.
+ * @private_lock: For use by the owner of the address_space.
+ * @private_list: For use by the owner of the address_space.
+ * @private_data: For use by the owner of the address_space.
+ */
+struct address_space {
+  struct inode            *host;
+  struct xarray           i_pages;
+  gfp_t                   gfp_mask;
+  atomic_t                i_mmap_writable;
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+  /* number of thp, only for non-shmem files */
+  atomic_t                nr_thps;
+#endif
+  struct rb_root_cached   i_mmap;
+  struct rw_semaphore     i_mmap_rwsem;
+  unsigned long           nrpages;
+  unsigned long           nrexceptional;
+  pgoff_t                 writeback_index;
+  const struct address_space_operations *a_ops;
+  unsigned long           flags;
+  errseq_t                wb_err;
+  spinlock_t              private_lock;
+  struct list_head        private_list;
+  void                    *private_data;
+} __attribute__((aligned(sizeof(long)))) __randomize_layout;
+
+struct address_space_operations {
+  int (*writepage)(struct page *page, struct writeback_control *wbc);
+  int (*readpage)(struct file *, struct page *);
+
+  /* Write back some dirty pages from this mapping. */
+  int (*writepages)(struct address_space *, struct writeback_control *);
+
+  /* Set a page dirty.  Return true if this dirtied it */
+  int (*set_page_dirty)(struct page *page);
+
+  /*
+   * Reads in the requested pages. Unlike ->readpage(), this is
+   * PURELY used for read-ahead!.
+   */
+  int (*readpages)(struct file *filp, struct address_space *mapping,
+                  struct list_head *pages, unsigned nr_pages);
+  void (*readahead)(struct readahead_control *);
+
+  int (*write_begin)(struct file *, struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned flags,
+                          struct page **pagep, void **fsdata);
+  int (*write_end)(struct file *, struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned copied,
+                          struct page *page, void *fsdata);
+
+  /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
+  sector_t (*bmap)(struct address_space *, sector_t);
+  void (*invalidatepage) (struct page *, unsigned int, unsigned int);
+  int (*releasepage) (struct page *, gfp_t);
+  void (*freepage)(struct page *);
+  ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+  /*
+   * migrate the contents of a page to the specified target. If
+   * migrate_mode is MIGRATE_ASYNC, it must not block.
+   */
+  int (*migratepage) (struct address_space *,
+                  struct page *, struct page *, enum migrate_mode);
+  bool (*isolate_page)(struct page *, isolate_mode_t);
+  void (*putback_page)(struct page *);
+  int (*launder_page) (struct page *);
+  int (*is_partially_uptodate) (struct page *, unsigned long,
+                                  unsigned long);
+  void (*is_dirty_writeback) (struct page *, bool *, bool *);
+  int (*error_remove_page)(struct address_space *, struct page *);
+
+  /* swapfile support */
+  int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
+                          sector_t *span);
+  void (*swap_deactivate)(struct file *file);
+};
+
+
+
/**
+ * generic_file_read_iter - generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the "read_iter()" routine for all filesystems
+ * that can use the page cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
+ * be returned when no data can be read without waiting for I/O requests
+ * to complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
+ * requests shall be made for the read or for readahead.  When no data
+ * can be read, -EAGAIN shall be returned.  When readahead would be
+ * triggered, a partial, possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+
+/*
+ * Generic "read page" function for block devices that have the normal
+ * get_block functionality. This is most of the block device filesystems.
+ * Reads the page asynchronously --- the unlock_buffer() and
+ * set/clear_buffer_uptodate() functions propagate buffer state into the
+ * page struct once IO has completed.
+ */
+int block_read_full_page(struct page *page, get_block_t *get_block)
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec9-debugging-slides.html b/refs/pull/405/merge/so2/lec9-debugging-slides.html new file mode 100644 index 00000000..e668003a --- /dev/null +++ b/refs/pull/405/merge/so2/lec9-debugging-slides.html @@ -0,0 +1,840 @@ + + + + + + + + SO2 Lecture 09 - Kernel debugging — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +

SO2 Lecture 09 - Kernel debugging

+ + + + + +
+
+ +

Debugging

+ +
    +
  • decoding an oops/panic
  • +
  • list debugging
  • +
  • memory debugging
  • +
  • locking debugging
  • +
  • profiling
  • +
+ + + + +
+
+ +

Oops module

+ +
static noinline void do_oops(void)
+{
+    *(int*)0x42 = 'a';
+}
+
+static int so2_oops_init(void)
+{
+    pr_info("oops_init\n");
+    do_oops();
+
+    return 0;
+}
+
+static void so2_oops_exit(void)
+{
+    pr_info("oops exit\n");
+}
+
+module_init(so2_oops_init);
+module_exit(so2_oops_exit);
+
+
+ + + + +
+
+ +

Oops information

+ +
root@qemux86:~/skels/debugging/oops# insmod oops.ko
+BUG: unable to handle kernel NULL pointer dereference at 00000042
+IP: do_oops+0x8/0x10 [oops]
+*pde = 00000000
+Oops: 0002 [#1] SMP
+Modules linked in: oops(O+)
+CPU: 0 PID: 234 Comm: insmod Tainted: G           O     4.15.0+ #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+EIP: do_oops+0x8/0x10 [oops]
+CR0: 80050033 CR2: 00000042 CR3: 0785f000 CR4: 00000690
+EIP: 0x44902cc2
+EFLAGS: 00000206 CPU: 0
+EAX: ffffffda EBX: 08afb050 ECX: 0000eef4 EDX: 08afb008
+ESI: 00000000 EDI: bf914dbc EBP: 00000000 ESP: bf914c1c
+DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b
+Code: <a3> 42 00 00 00 5d c3 90 55 89 e5 83 ec 04 c7 04 24 24 70 81 c8 e8
+Killed
+
+
+ + + + +
+
+ +

Oops stacktrace

+ +
root@qemux86:~/skels/debugging/oops# insmod oops.ko
+BUG: unable to handle kernel NULL pointer dereference at 00000042
+Call Trace:
+so2_oops_init+0x17/0x20 [oops]
+do_one_initcall+0x37/0x170
+? cache_alloc_debugcheck_after.isra.19+0x15f/0x2f0
+? __might_sleep+0x32/0x90
+? trace_hardirqs_on_caller+0x11c/0x1a0
+? do_init_module+0x17/0x1c2
+? kmem_cache_alloc+0xa4/0x1e0
+? do_init_module+0x17/0x1c2
+do_init_module+0x46/0x1c2
+load_module+0x1f45/0x2380
+SyS_init_module+0xe5/0x100
+do_int80_syscall_32+0x61/0x190
+entry_INT80_32+0x2f/0x2f
+Killed
+
+
+ + + + +
+
+ +

Debugging

+ +
    +
  • CONFIG_DEBUG_INFO
  • +
  • addr2line
  • +
  • gdb
  • +
  • objdump -dSr
  • +
+ + + + +
+
+ +

addr2line

+ +
$ addr2line -e oops.o  0x08
+$ skels/debugging/oops/oops.c:5
+$ # 0x08 is the offset of the offending instruction inside the oops.ko module
+
+
+ + + + +
+
+ +

objdump

+ +
$ cat /proc/modules
+oops 20480 1 - Loading 0xc8816000 (O+)
+
+$ objdump -dS --adjust-vma=0xc8816000 oops.ko
+c8816000:       b8 61 00 00 00          mov    $0x61,%eax
+
+static noinline void do_oops(void)
+{
+c8816005:       55                      push   %ebp
+c8816006:       89 e5                   mov    %esp,%ebp
+*(int*)0x42 = 'a';
+c8816008:       a3 42 00 00 00          mov    %eax,0x42
+
+
+ + + + +
+
+ +

gdb

+ +
$ gdb ./vmlinux
+
+(gdb) list *(do_panic+0x8)
+0xc1244138 is in do_panic (lib/test_panic.c:8).
+3
+4 static struct timer_list panic_timer;
+5
+6 static void do_panic(struct timer_list *unused)
+7 {
+8         *(int*)0x42 = 'a';
+9 }
+10
+11        static int so2_panic_init(void)
+
+
+ + + + +
+
+ +

Kernel panic

+ +
static struct timer_list panic_timer;
+
+static void do_panic(struct timer_list *unused)
+{
+    *(int*)0x42 = 'a';
+}
+
+static int so2_panic_init(void)
+{
+    pr_info("panic_init\n");
+
+    timer_setup(&panic_timer,  do_panic, 0);
+    mod_timer(&panic_timer, jiffies + 2 * HZ);
+
+    return 0;
+}
+
+
+ + + + +
+
+ +

List debugging

+ +
static inline void list_del(struct list_head *entry)
+{
+        __list_del(entry->prev, entry->next);
+        entry->next = (struct list_head*)LIST_POISON1;
+        entry->prev = (struct list_head*)LIST_POISON2;
+}
+
+BUG: unable to handle kernel NULL pointer dereference at 00000100
+IP: crush+0x80/0xb0 [list]
+
+
+ + + + +
+
+ +

Memory debugging

+ +
    +
  • SLAB/SLUB debugging
  • +
  • KASAN
  • +
  • kmemcheck
  • +
  • DEBUG_PAGEALLOC
  • +
+ + + + +
+
+ +

Slab debugging

+ +
    +
  • CONFIG_DEBUG_SLAB
  • +
  • poison-based memory debuggers
  • +
+../_images/ditaa-5e6f93e563d6e94c14fe3d483f988e0579b05b38.png + + + + +
+
+ +

Use before initialize bugs

+ +
BUG: unable to handle kernel paging request at 5a5a5a5a
+IP: [<c1225063>] __list_del_entry+0x37/0x71
+…
+Call Trace:
+[<c12250a8>] list_del+0xb/0x1b
+[<f1de81a2>] use_before_init+0x31/0x38 [crusher]
+[<f1de8265>] crush_it+0x38/0xa9 [crusher]
+[<f1de82de>] init_module+0x8/0xa [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<f1de82d6>] ? crush_it+0xa9/0xa9 [crusher]
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+
+
+
noinline void use_before_init(void)
+{
+     struct list_m *m = kmalloc(sizeof(*m), GFP_KERNEL);
+
+     printk("%s\n", __func__);
+     list_del(&m->lh);
+}
+
+
+ + + + +
+
+ +

Use after free bug

+ +
BUG: unable to handle kernel paging request at 6b6b6b6b
+IP: [<c1225063>] __list_del_entry+0x37/0x71
+…
+Call Trace:
+[<c12250a8>] list_del+0xb/0x1b
+[<f4c6816a>] use_after_free+0x38/0x3f [crusher]
+[<f4c6827f>] crush_it+0x52/0xa9 [crusher]
+[<f4c682de>] init_module+0x8/0xa [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<f4c682d6>] ? crush_it+0xa9/0xa9 [crusher]
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+
+
+
noinline void use_after_free(void)
+{
+    struct list_m *m = kmalloc(sizeof(*m), GFP_KERNEL);
+
+    printk("%s\n", __func__);
+    kfree(m);
+    list_del(&m->lh);
+}
+
+
+ + + + +
+
+ +

Use after free bug

+ +
# insmod /system/lib/modules/crusher.ko test=use_before_init
+Slab corruption: size-4096 start=ed612000, len=4096
+000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 6b 6b
+
+
+
noinline void use_after_free2(void)
+{
+    char *b = kmalloc(3000, GFP_KERNEL);
+    kfree(b);
+    memset(b, 0, 30);
+    b = kmalloc(3000, GFP_KERNEL);
+    kfree(b);
+}
+
+
+ + + + +
+
+ +

Buffer overflow bugs

+ +
slab error in verify_redzone_free(): cache `dummy': memory outside object was overwritten
+Pid: 1282, comm: insmod Not tainted 3.0.16-mid10-00007-ga4a6b62-dirty #70
+Call Trace:
+[<c10cc1de>] __slab_error+0x17/0x1c
+[<c10cc7ca>] __cache_free+0x12c/0x317
+[<c10ccaba>] kmem_cache_free+0x2b/0xaf
+[<f27f1138>] buffer_overflow+0x4c/0x57 [crusher]
+[<f27f12aa>] crush_it+0x6c/0xa9 [crusher]
+[<f27f12ef>] init_module+0x8/0xd [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+eb002bf8: redzone 1:0xd84156c5635688c0, redzone 2:0x0
+
+
+
noinline void buffer_overflow(void)
+{
+    struct kmem_cache *km = kmem_cache_create("dummy", 3000, 0, 0, NULL);
+    char *b = kmem_cache_alloc(km, GFP_KERNEL);
+
+    printk("%s\n", __func__);
+    memset(b, 0, 3016);
+    kmem_cache_free(km, b);
+}
+
+
+ + + + +
+
+ +

DEBUG_PAGEALLOC

+ +
    +
  • Memory debugger that works at a page level
  • +
  • Detects invalid accesses either by:
      +
    • Filling pages with poison byte patterns and checking the pattern at +reallocation
    • +
    • Unmapping the deallocated pages from kernel space (just a few +architectures)
    • +
    +
  • +
+ + + + +
+
+ +

KASan

+ +
    +
  • dynamic memory error detector
  • +
  • finds use-after-free or out-of-bounds bugs
  • +
  • uses shadow memory to track memory operations
  • +
  • lib/test_kasan.c
  • +
+ + + + +
+
+ +

KASan vs DEBUG_PAGEALLOC

+ +

KASan is slower than DEBUG_PAGEALLOC, but KASan works on sub-page granularity +level, so it is able to find more bugs.

+ + + + +
+
+ +

KASan vs SLUB_DEBUG

+ +
    +
  • SLUB_DEBUG has lower overhead than KASan.
  • +
  • SLUB_DEBUG in most cases is not able to detect bad reads; KASan is able to +detect both reads and writes.
  • +
  • In some cases (e.g. redzone overwritten) SLUB_DEBUG detects bugs only on +allocation/freeing of the object. KASan catches bugs right before they happen, +so we always know the exact place of the first bad read/write.
  • +
+ + + + +
+
+ +

Kmemleak

+ +
    +
  • enable kernel config: CONFIG_DEBUG_KMEMLEAK
  • +
  • setup: mount -t debugfs nodev /sys/kernel/debug
  • +
  • trigger a memory scan: echo scan > /sys/kernel/debug/kmemleak
  • +
  • show memory leaks: cat /sys/kernel/debug/kmemleak
  • +
  • clear all possible leaks: echo clear > /sys/kernel/debug/kmemleak
  • +
+ + + + +
+
+ +

Kmemleak example

+ +
static int leak_init(void)
+{
+    pr_info("%s\n", __func__);
+
+    (void)kmalloc(16, GFP_KERNEL);
+
+    return 0;
+}
+
+MODULE_LICENSE("GPL v2");
+module_init(leak_init);
+
+
+ + + + +
+
+ +

Kmemleak report

+ +
root@qemux86:~# insmod skels/debugging/leak/leak.ko
+leak: loading out-of-tree module taints kernel.
+leak_init
+root@qemux86:~# echo scan > /sys/kernel/debug/kmemleak
+root@qemux86:~# echo scan > /sys/kernel/debug/kmemleak
+kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
+root@qemux86:~# cat /sys/kernel/debug/kmemleak
+unreferenced object 0xd7871500 (size 32):
+comm "insmod", pid 237, jiffies 4294902108 (age 24.628s)
+hex dump (first 32 bytes):
+5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  ZZZZZZZZZZZZZZZZ
+5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a a5  ZZZZZZZZZZZZZZZ.
+backtrace:
+[<(ptrval)>] kmem_cache_alloc_trace+0x163/0x310
+[<(ptrval)>] leak_init+0x2f/0x1000 [leak]
+[<(ptrval)>] do_one_initcall+0x57/0x2e0
+[<(ptrval)>] do_init_module+0x4b/0x1be
+[<(ptrval)>] load_module+0x201a/0x2590
+[<(ptrval)>] sys_init_module+0xfd/0x120
+[<(ptrval)>] do_int80_syscall_32+0x6a/0x1a0
+
+
+ + + + +
+
+ +

Lockdep checker

+ +
    +
  • CONFIG_DEBUG_LOCKDEP
  • +
  • Detects lock inversion, circular dependencies, incorrect usage of locks +(including interrupt context)
  • +
  • Maintains dependency between classes of locks not individual locks
  • +
  • Each scenario is only checked once and hashed
  • +
+ + + + +
+
+ +

AB BA Deadlock Example

+ +
static noinline int thread_a(void *unused)
+{
+  mutex_lock(&a); pr_info("%s acquired A\n", __func__);
+  mutex_lock(&b); pr_info("%s acquired B\n", __func__);
+
+  mutex_unlock(&b);
+  mutex_unlock(&a);
+
+  return 0;
+}
+
+
+
static noinline int thread_b(void *unused)
+{
+  mutex_lock(&b); pr_info("%s acquired B\n", __func__);
+  mutex_lock(&a); pr_info("%s acquired A\n", __func__);
+
+  mutex_unlock(&a);
+  mutex_unlock(&b);
+
+  return 0;
+}
+
+
+ + + + +
+
+ +

AB BA Deadlock Report

+ +
thread_a acquired A
+thread_a acquired B
+thread_b acquired B
+
+======================================================
+WARNING: possible circular locking dependency detected
+4.19.0+ #4 Tainted: G           O
+------------------------------------------------------
+thread_b/238 is trying to acquire lock:
+(ptrval) (a){+.+.}, at: thread_b+0x48/0x90 [locking]
+
+but task is already holding lock:
+(ptrval) (b){+.+.}, at: thread_b+0x27/0x90 [locking]
+
+which lock already depends on the new lock.
+
+
+ + + + +
+
+ +

AB BA Deadlock Report (dependency chain)

+ +
the existing dependency chain (in reverse order) is:
+
+-> #1 (b){+.+.}:
+      __mutex_lock+0x60/0x830
+      mutex_lock_nested+0x20/0x30
+      thread_a+0x48/0x90 [locking]
+      kthread+0xeb/0x100
+      ret_from_fork+0x2e/0x38
+
+-> #0 (a){+.+.}:
+      lock_acquire+0x93/0x190
+      __mutex_lock+0x60/0x830
+      mutex_lock_nested+0x20/0x30
+      thread_b+0x48/0x90 [locking]
+      kthread+0xeb/0x100
+      ret_from_fork+0x2e/0x38
+
+
+ + + + +
+
+ +

AB BA Deadlock Report (unsafe locking scenario)

+ +
other info that might help us debug this:
+
+Possible unsafe locking scenario:
+
+CPU0                    CPU1
+----                    ----
+lock(b);
+                        lock(a);
+                        lock(b);
+lock(a);
+
+*** DEADLOCK ***
+
+
+ + + + +
+
+ +

IRQ Deadlock Example

+ +
static DEFINE_SPINLOCK(lock);
+
+static void timerfn(struct timer_list *unused)
+{
+  pr_info("%s acquiring lock\n", __func__);
+  spin_lock(&lock);   pr_info("%s acquired lock\n", __func__);
+  spin_unlock(&lock); pr_info("%s released lock\n", __func__);
+}
+
+static DEFINE_TIMER(timer, timerfn);
+
+int init_module(void)
+{
+  mod_timer(&timer, jiffies);
+
+  pr_info("%s acquiring lock\n", __func__);
+  spin_lock(&lock);   pr_info("%s acquired lock\n", __func__);
+  spin_unlock(&lock); pr_info("%s released lock\n", __func__);
+  return 0;
+}
+
+
+ + + + +
+
+ +

IRQ Deadlock Report

+ +
init_module acquiring lock
+init_module acquired lock
+init_module released lock
+timerfn acquiring lock
+
+================================
+WARNING: inconsistent lock state
+4.19.0+ #4 Tainted: G           O
+--------------------------------
+inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
+ksoftirqd/0/9 [HC0[0]:SC1[1]:HE1:SE0] takes:
+(ptrval) (lock#4){+.?.}, at: timerfn+0x25/0x60 [locking2]
+{SOFTIRQ-ON-W} state was registered at:
+lock_acquire+0x93/0x190
+_raw_spin_lock+0x39/0x50
+init_module+0x35/0x70 [locking2]
+do_one_initcall+0x57/0x2e0
+do_init_module+0x4b/0x1be
+load_module+0x201a/0x2590
+sys_init_module+0xfd/0x120
+do_int80_syscall_32+0x6a/0x1a0
+restore_all+0x0/0x8d
+
+
+ + + + +
+
+ +

IRQ Deadlock Report

+ +
Possible unsafe locking scenario:
+
+       CPU0
+       ----
+       lock(lock#4);
+       <Interrupt>
+       lock(lock#4);
+
+       *** DEADLOCK ***
+
+1 lock held by ksoftirqd/0/9:
+#0: (ptrval) (/home/tavi/src/linux/tools/labs/skels/./debugging/locking2/locking2.c:13){+.-.}, at: call_timer_f0
+stack backtrace:
+CPU: 0 PID: 9 Comm: ksoftirqd/0 Tainted: G           O      4.19.0+ #4
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
+Call Trace:
+dump_stack+0x66/0x96
+print_usage_bug.part.26+0x1ee/0x200
+mark_lock+0x5ea/0x640
+__lock_acquire+0x4b4/0x17a0
+lock_acquire+0x93/0x190
+_raw_spin_lock+0x39/0x50
+timerfn+0x25/0x60 [locking2]
+
+
+ + + + +
+
+ +

perf

+ +
    +
  • performance counters, tracepoints, kprobes, uprobes
  • +
  • hardware events: CPU cycles, TLB misses, cache misses
  • +
  • software events: page faults , context switches
  • +
  • collects backtraces (user + kernel)
  • +
+ + + + +
+
+ +

Other tools

+ +
    +
  • ftrace
  • +
  • kprobes
  • +
  • sparse
  • +
  • coccinelle
  • +
  • checkpatch.pl
  • +
  • printk
  • +
  • dump_stack()
  • +
+ + + + +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/refs/pull/405/merge/so2/lec9-debugging.html b/refs/pull/405/merge/so2/lec9-debugging.html new file mode 100644 index 00000000..35c9774a --- /dev/null +++ b/refs/pull/405/merge/so2/lec9-debugging.html @@ -0,0 +1,933 @@ + + + + + + SO2 Lecture 09 - Kernel debugging — The Linux Kernel documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

SO2 Lecture 09 - Kernel debugging

+

View slides

+
+

Lecture objectives:

+

One essential part of Linux kernel development is debugging. In user space we had +the support of the kernel so we could easily stop processes and use gdb to inspect +their behavior. In the kernel, in order to use gdb we need to use a hypervisor like +QEMU or JTAG based hardware interfaces which are not always available. The Linux +kernel provides a set of tools and debug options useful for investigating abnormal +behavior.

+

In this lecture we will learn about:

+
    +
  • decoding an oops/panic
  • +
  • list debugging
  • +
  • memory debugging
  • +
  • locking debugging
  • +
  • profiling
  • +
+
+
+

Decoding an oops/panic

+

An oops is an inconsistent state that the kernel detects inside itself. +Upon detecting an oops the Linux kernel kills the offending process, +prints information that can help debug the problem and continues execution +but with limited reliability.

+

Let's consider the following Linux kernel module:

+
static noinline void do_oops(void)
+{
+    *(int*)0x42 = 'a';
+}
+
+static int so2_oops_init(void)
+{
+    pr_info("oops_init\n");
+    do_oops();
+
+    return 0;
+}
+
+static void so2_oops_exit(void)
+{
+    pr_info("oops exit\n");
+}
+
+module_init(so2_oops_init);
+module_exit(so2_oops_exit);
+
+
+

Notice that the ''do_oops'' function tries to write to an invalid memory address. Because the kernel +cannot find a suitable physical page where to write, it kills the insmod task in the context of +which ''do_oops'' runs. Then it prints the following oops message:

+
+
root@qemux86:~/skels/debugging/oops# insmod oops.ko
+BUG: unable to handle kernel NULL pointer dereference at 00000042
+IP: do_oops+0x8/0x10 [oops]
+*pde = 00000000
+Oops: 0002 [#1] SMP
+Modules linked in: oops(O+)
+CPU: 0 PID: 234 Comm: insmod Tainted: G           O     4.15.0+ #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+EIP: do_oops+0x8/0x10 [oops]
+EFLAGS: 00000292 CPU: 0
+EAX: 00000061 EBX: 00000000 ECX: c7ed3584 EDX: c7ece8dc
+ESI: c716c908 EDI: c8816010 EBP: c7257df0 ESP: c7257df0
+DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
+CR0: 80050033 CR2: 00000042 CR3: 0785f000 CR4: 00000690
+Call Trace:
+so2_oops_init+0x17/0x20 [oops]
+do_one_initcall+0x37/0x170
+? cache_alloc_debugcheck_after.isra.19+0x15f/0x2f0
+? __might_sleep+0x32/0x90
+? trace_hardirqs_on_caller+0x11c/0x1a0
+? do_init_module+0x17/0x1c2
+? kmem_cache_alloc+0xa4/0x1e0
+? do_init_module+0x17/0x1c2
+do_init_module+0x46/0x1c2
+load_module+0x1f45/0x2380
+SyS_init_module+0xe5/0x100
+do_int80_syscall_32+0x61/0x190
+entry_INT80_32+0x2f/0x2f
+EIP: 0x44902cc2
+EFLAGS: 00000206 CPU: 0
+EAX: ffffffda EBX: 08afb050 ECX: 0000eef4 EDX: 08afb008
+ESI: 00000000 EDI: bf914dbc EBP: 00000000 ESP: bf914c1c
+DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b
+Code: <a3> 42 00 00 00 5d c3 90 55 89 e5 83 ec 04 c7 04 24 24 70 81 c8 e8
+EIP: do_oops+0x8/0x10 [oops] SS:ESP: 0068:c7257df0
+CR2: 0000000000000042
+---[ end trace 011848be72f8bb42 ]---
+Killed
+
+
+
+

An oops contains information about the IP which caused the fault, register status, process, +CPU on which the fault happened like below:

+
root@qemux86:~/skels/debugging/oops# insmod oops.ko
+BUG: unable to handle kernel NULL pointer dereference at 00000042
+IP: do_oops+0x8/0x10 [oops]
+*pde = 00000000
+Oops: 0002 [#1] SMP
+Modules linked in: oops(O+)
+CPU: 0 PID: 234 Comm: insmod Tainted: G           O     4.15.0+ #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+EIP: do_oops+0x8/0x10 [oops]
+CR0: 80050033 CR2: 00000042 CR3: 0785f000 CR4: 00000690
+EIP: 0x44902cc2
+EFLAGS: 00000206 CPU: 0
+EAX: ffffffda EBX: 08afb050 ECX: 0000eef4 EDX: 08afb008
+ESI: 00000000 EDI: bf914dbc EBP: 00000000 ESP: bf914c1c
+DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b
+Code: <a3> 42 00 00 00 5d c3 90 55 89 e5 83 ec 04 c7 04 24 24 70 81 c8 e8
+Killed
+
+
+

Another important thing that an oops can provide is the stack trace of functions called before +the fault happened:

+
root@qemux86:~/skels/debugging/oops# insmod oops.ko
+BUG: unable to handle kernel NULL pointer dereference at 00000042
+Call Trace:
+so2_oops_init+0x17/0x20 [oops]
+do_one_initcall+0x37/0x170
+? cache_alloc_debugcheck_after.isra.19+0x15f/0x2f0
+? __might_sleep+0x32/0x90
+? trace_hardirqs_on_caller+0x11c/0x1a0
+? do_init_module+0x17/0x1c2
+? kmem_cache_alloc+0xa4/0x1e0
+? do_init_module+0x17/0x1c2
+do_init_module+0x46/0x1c2
+load_module+0x1f45/0x2380
+SyS_init_module+0xe5/0x100
+do_int80_syscall_32+0x61/0x190
+entry_INT80_32+0x2f/0x2f
+Killed
+
+
+
+

Decoding an oops

+
    +
  • CONFIG_DEBUG_INFO
  • +
  • addr2line
  • +
  • gdb
  • +
  • objdump -dSr
  • +
+
+
+

addr2line

+

addr2line translates addresses into file names and line numbers. Given +an address in an executable it uses the debugging information to figure out +which file name and line number are associated with it.

+

Modules are loaded at dynamic addresses but are compiled starting with 0 as +a base address. So, in order to find the line number for a given dynamic address +we need to know the module's load address.

+
$ addr2line -e oops.o  0x08
+$ skels/debugging/oops/oops.c:5
+$ # 0x08 is the offset of the offending instruction inside the oops.ko module
+
+
+
+
+

objdump

+

Similarly, we can determine the offending line using objdump:

+
$ cat /proc/modules
+oops 20480 1 - Loading 0xc8816000 (O+)
+
+$ objdump -dS --adjust-vma=0xc8816000 oops.ko
+c8816000:       b8 61 00 00 00          mov    $0x61,%eax
+
+static noinline void do_oops(void)
+{
+c8816005:       55                      push   %ebp
+c8816006:       89 e5                   mov    %esp,%ebp
+*(int*)0x42 = 'a';
+c8816008:       a3 42 00 00 00          mov    %eax,0x42
+
+
+
+
+

gdb

+
$ gdb ./vmlinux
+
+(gdb) list *(do_panic+0x8)
+0xc1244138 is in do_panic (lib/test_panic.c:8).
+3
+4 static struct timer_list panic_timer;
+5
+6 static void do_panic(struct timer_list *unused)
+7 {
+8         *(int*)0x42 = 'a';
+9 }
+10
+11        static int so2_panic_init(void)
+
+
+
+
+

Kernel panic

+

A kernel panic is a special type of oops where the kernel cannot continue execution. For example, +if the function do_oops from above was called in the interrupt context, the kernel wouldn't know what to kill +and it will decide that it is better to crash the kernel and stop execution.

+

Here is a sample code that will generate a kernel panic:

+
static struct timer_list panic_timer;
+
+static void do_panic(struct timer_list *unused)
+{
+    *(int*)0x42 = 'a';
+}
+
+static int so2_panic_init(void)
+{
+    pr_info("panic_init\n");
+
+    timer_setup(&panic_timer,  do_panic, 0);
+    mod_timer(&panic_timer, jiffies + 2 * HZ);
+
+    return 0;
+}
+
+
+

Loading the module will generate the following kernel panic message:

+
root@qemux86:~/skels/debugging/panic# insmod panic.ko
+panic: loading out-of-tree module taints kernel.
+panic_init
+root@qemux86:~/skels/debugging/panic# BUG: unable to handle kernel NULL pointer dereference at 00000042
+IP: do_panic+0x8/0x10 [panic]
+*pde = 00000000
+Oops: 0002 [#1] SMP
+Modules linked in: panic(O)
+CPU: 0 PID: 0 Comm: swapper/0 Tainted: G           O     4.15.0+ #19
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+EIP: do_panic+0x8/0x10 [panic]
+EFLAGS: 00010246 CPU: 0
+EAX: 00000061 EBX: 00000101 ECX: 000002d8 EDX: 00000000
+ESI: c8817000 EDI: c8819200 EBP: c780ff34 ESP: c780ff34
+DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
+CR0: 80050033 CR2: 00000042 CR3: 0716b000 CR4: 00000690
+Call Trace:
+<SOFTIRQ>
+call_timer_fn+0x63/0xf0
+? process_timeout+0x10/0x10
+run_timer_softirq+0x14f/0x170
+? 0xc8817000
+? trace_hardirqs_on_caller+0x9b/0x1a0
+__do_softirq+0xde/0x1f2
+? __irqentry_text_end+0x6/0x6
+do_softirq_own_stack+0x57/0x70
+</SOFTIRQ>
+irq_exit+0x7d/0x90
+smp_apic_timer_interrupt+0x4f/0x90
+? trace_hardirqs_off_thunk+0xc/0x1d
+apic_timer_interrupt+0x3a/0x40
+EIP: default_idle+0xa/0x10
+EFLAGS: 00000246 CPU: 0
+EAX: c15c97c0 EBX: 00000000 ECX: 00000000 EDX: 00000001
+ESI: 00000000 EDI: 00000000 EBP: c15c3f48 ESP: c15c3f48
+DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
+arch_cpu_idle+0x9/0x10
+default_idle_call+0x19/0x30
+do_idle+0x105/0x180
+cpu_startup_entry+0x25/0x30
+rest_init+0x1e3/0x1f0
+start_kernel+0x305/0x30a
+i386_start_kernel+0x95/0x99
+startup_32_smp+0x15f/0x164
+Code: <a3> 42 00 00 00 5d c3 90 55 89 e5 83 ec 08 c7 04 24 24 80 81 c8 e8
+EIP: do_panic+0x8/0x10 [panic] SS:ESP: 0068:c780ff34
+CR2: 0000000000000042
+---[ end trace 77f49f83f2e42f91 ]---
+Kernel panic - not syncing: Fatal exception in interrupt
+Kernel Offset: disabled
+---[ end Kernel panic - not syncing: Fatal exception in interrupt
+
+
+
+
+
+

List debugging

+

In order to catch access to uninitialized elements the kernel uses poison +magic values.

+
static inline void list_del(struct list_head *entry)
+{
+        __list_del(entry->prev, entry->next);
+        entry->next = (struct list_head*)LIST_POISON1;
+        entry->prev = (struct list_head*)LIST_POISON2;
+}
+
+BUG: unable to handle kernel NULL pointer dereference at 00000100
+IP: crush+0x80/0xb0 [list]
+
+
+
+
+

Memory debugging

+

There are several tools for memory debugging:

+
    +
  • SLAB/SLUB debugging
  • +
  • KASAN
  • +
  • kmemcheck
  • +
  • DEBUG_PAGEALLOC
  • +
+
+

Slab debugging

+

Slab debugging uses a memory poison technique to detect several types of memory +bugs in the SLAB/SLUB allocators.

+

The allocated buffers are guarded with memory that has been filled in with +special markers. Any adjacent writes to the buffer will be detected at a later +time when other memory management operations on that buffer are performed +(e.g. when the buffer is freed).

+

Upon allocation of the buffer, the buffer is also filled in with a special +value to potentially detect buffer access before initialization (e.g. if the +buffer holds pointers). The value is selected in such a way that it is unlikely +to be a valid address and as such to trigger kernel bugs at the access time.

+

A similar technique is used when freeing the buffer: the buffer is filled with +another special value that will cause kernel bugs if pointers are accessed after +the memory is freed. In this case, the allocator also checks the next time the +buffer is allocated that the buffer was not modified.

+

The diagram below shows a summary of the way SLAB/SLUB poisoning works:

+
    +
  • CONFIG_DEBUG_SLAB
  • +
  • poison-based memory debuggers
  • +
+../_images/ditaa-5e6f93e563d6e94c14fe3d483f988e0579b05b38.png +

Example of a use before initialize bug:

+
BUG: unable to handle kernel paging request at 5a5a5a5a
+IP: [<c1225063>] __list_del_entry+0x37/0x71
+…
+Call Trace:
+[<c12250a8>] list_del+0xb/0x1b
+[<f1de81a2>] use_before_init+0x31/0x38 [crusher]
+[<f1de8265>] crush_it+0x38/0xa9 [crusher]
+[<f1de82de>] init_module+0x8/0xa [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<f1de82d6>] ? crush_it+0xa9/0xa9 [crusher]
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+
+
+
noinline void use_before_init(void)
+{
+     struct list_m *m = kmalloc(sizeof(*m), GFP_KERNEL);
+
+     printk("%s\n", __func__);
+     list_del(&m->lh);
+}
+
+
+

Example of a use after free bug:

+
BUG: unable to handle kernel paging request at 6b6b6b6b
+IP: [<c1225063>] __list_del_entry+0x37/0x71
+…
+Call Trace:
+[<c12250a8>] list_del+0xb/0x1b
+[<f4c6816a>] use_after_free+0x38/0x3f [crusher]
+[<f4c6827f>] crush_it+0x52/0xa9 [crusher]
+[<f4c682de>] init_module+0x8/0xa [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<f4c682d6>] ? crush_it+0xa9/0xa9 [crusher]
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+
+
+
noinline void use_after_free(void)
+{
+    struct list_m *m = kmalloc(sizeof(*m), GFP_KERNEL);
+
+    printk("%s\n", __func__);
+    kfree(m);
+    list_del(&m->lh);
+}
+
+
+

Another example of a use after free bug is shown below. Note that this time the +bug is detected at the next allocation.

+
# insmod /system/lib/modules/crusher.ko test=use_before_init
+Slab corruption: size-4096 start=ed612000, len=4096
+000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 6b 6b
+
+
+
noinline void use_after_free2(void)
+{
+    char *b = kmalloc(3000, GFP_KERNEL);
+    kfree(b);
+    memset(b, 0, 30);
+    b = kmalloc(3000, GFP_KERNEL);
+    kfree(b);
+}
+
+
+

Finally this is an example of a buffer overflow bug:

+
slab error in verify_redzone_free(): cache `dummy': memory outside object was overwritten
+Pid: 1282, comm: insmod Not tainted 3.0.16-mid10-00007-ga4a6b62-dirty #70
+Call Trace:
+[<c10cc1de>] __slab_error+0x17/0x1c
+[<c10cc7ca>] __cache_free+0x12c/0x317
+[<c10ccaba>] kmem_cache_free+0x2b/0xaf
+[<f27f1138>] buffer_overflow+0x4c/0x57 [crusher]
+[<f27f12aa>] crush_it+0x6c/0xa9 [crusher]
+[<f27f12ef>] init_module+0x8/0xd [crusher]
+[<c1001072>] do_one_initcall+0x72/0x119
+[<c106b8ae>] sys_init_module+0xc8d/0xe77
+[<c14d7d18>] syscall_call+0x7/0xb
+eb002bf8: redzone 1:0xd84156c5635688c0, redzone 2:0x0
+
+
+
noinline void buffer_overflow(void)
+{
+    struct kmem_cache *km = kmem_cache_create("dummy", 3000, 0, 0, NULL);
+    char *b = kmem_cache_alloc(km, GFP_KERNEL);
+
+    printk("%s\n", __func__);
+    memset(b, 0, 3016);
+    kmem_cache_free(km, b);
+}
+
+
+
+
+

DEBUG_PAGEALLOC

+
    +
  • Memory debugger that works at a page level
  • +
  • Detects invalid accesses either by:
      +
    • Filling pages with poison byte patterns and checking the pattern at +reallocation
    • +
    • Unmapping the deallocated pages from kernel space (just a few +architectures)
    • +
    +
  • +
+
+
+

KASan

+

KASan is a dynamic memory error detector designed to find use-after-free +and out-of-bounds bugs.

+

The main idea of KASAN is to use shadow memory to record whether each byte +of memory is safe to access or not, and use compiler's instrumentation to +check the shadow memory on each memory access.

+

Address sanitizer uses 1 byte of shadow memory to track 8 bytes of kernel +address space. It uses 0-7 to encode the number of consecutive bytes at +the beginning of the eight-byte region that are valid.

+

See The Kernel Address Sanitizer (KASAN) for more information and have a look +at lib/test_kasan.c for an example of problems that KASan can detect.

+
    +
  • dynamic memory error detector
  • +
  • finds use-after-free or out-of-bounds bugs
  • +
  • uses shadow memory to track memory operations
  • +
  • lib/test_kasan.c
  • +
+
+

KASan vs DEBUG_PAGEALLOC

+

KASan is slower than DEBUG_PAGEALLOC, but KASan works on sub-page granularity +level, so it is able to find more bugs.

+
+
+

KASan vs SLUB_DEBUG

+
    +
  • SLUB_DEBUG has lower overhead than KASan.
  • +
  • SLUB_DEBUG in most cases is not able to detect bad reads; KASan is able to +detect both reads and writes.
  • +
  • In some cases (e.g. redzone overwritten) SLUB_DEBUG detects bugs only on +allocation/freeing of the object. KASan catches bugs right before they happen, +so we always know the exact place of the first bad read/write.
  • +
+
+
+
+

Kmemleak

+

Kmemleak provides a way of detecting kernel memory leaks in a way similar to a +tracing garbage collector. Since tracing pointers is not possible in C, kmemleak +scans the kernel stacks as well as dynamically and statically allocated kernel memory for +pointers to allocated buffers. A buffer for which there is no pointer is +considered as leaked. The basic steps to use kmemleak are presented below, for +more information see Kernel Memory Leak Detector

+
    +
  • enable kernel config: CONFIG_DEBUG_KMEMLEAK
  • +
  • setup: mount -t debugfs nodev /sys/kernel/debug
  • +
  • trigger a memory scan: echo scan > /sys/kernel/debug/kmemleak
  • +
  • show memory leaks: cat /sys/kernel/debug/kmemleak
  • +
  • clear all possible leaks: echo clear > /sys/kernel/debug/kmemleak
  • +
+

As an example, let's look at the following simple module:

+
static int leak_init(void)
+{
+    pr_info("%s\n", __func__);
+
+    (void)kmalloc(16, GFP_KERNEL);
+
+    return 0;
+}
+
+MODULE_LICENSE("GPL v2");
+module_init(leak_init);
+
+
+

Loading the module and triggering a kmemleak scan will issue the +following report:

+
root@qemux86:~# insmod skels/debugging/leak/leak.ko
+leak: loading out-of-tree module taints kernel.
+leak_init
+root@qemux86:~# echo scan > /sys/kernel/debug/kmemleak
+root@qemux86:~# echo scan > /sys/kernel/debug/kmemleak
+kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
+root@qemux86:~# cat /sys/kernel/debug/kmemleak
+unreferenced object 0xd7871500 (size 32):
+comm "insmod", pid 237, jiffies 4294902108 (age 24.628s)
+hex dump (first 32 bytes):
+5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  ZZZZZZZZZZZZZZZZ
+5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a a5  ZZZZZZZZZZZZZZZ.
+backtrace:
+[<(ptrval)>] kmem_cache_alloc_trace+0x163/0x310
+[<(ptrval)>] leak_init+0x2f/0x1000 [leak]
+[<(ptrval)>] do_one_initcall+0x57/0x2e0
+[<(ptrval)>] do_init_module+0x4b/0x1be
+[<(ptrval)>] load_module+0x201a/0x2590
+[<(ptrval)>] sys_init_module+0xfd/0x120
+[<(ptrval)>] do_int80_syscall_32+0x6a/0x1a0
+
+
+
+

Note

+

Notice that we did not have to unload the module to detect the memory +leak since kmemleak detects that the allocated buffer is not +reachable anymore.

+
+
+
+
+

Lockdep checker

+
    +
  • CONFIG_DEBUG_LOCKDEP
  • +
  • Detects lock inversion, circular dependencies, incorrect usage of locks +(including interrupt context)
  • +
  • Maintains dependency between classes of locks not individual locks
  • +
  • Each scenario is only checked once and hashed
  • +
+

Let's take for example the following kernel module that runs two kernel threads:

+
static noinline int thread_a(void *unused)
+{
+  mutex_lock(&a); pr_info("%s acquired A\n", __func__);
+  mutex_lock(&b); pr_info("%s acquired B\n", __func__);
+
+  mutex_unlock(&b);
+  mutex_unlock(&a);
+
+  return 0;
+}
+
+
+
static noinline int thread_b(void *unused)
+{
+  mutex_lock(&b); pr_info("%s acquired B\n", __func__);
+  mutex_lock(&a); pr_info("%s acquired A\n", __func__);
+
+  mutex_unlock(&a);
+  mutex_unlock(&b);
+
+  return 0;
+}
+
+
+

Loading this module with lockdep checker active will produce the following +kernel log:

+
thread_a acquired A
+thread_a acquired B
+thread_b acquired B
+
+======================================================
+WARNING: possible circular locking dependency detected
+4.19.0+ #4 Tainted: G           O
+------------------------------------------------------
+thread_b/238 is trying to acquire lock:
+(ptrval) (a){+.+.}, at: thread_b+0x48/0x90 [locking]
+
+but task is already holding lock:
+(ptrval) (b){+.+.}, at: thread_b+0x27/0x90 [locking]
+
+which lock already depends on the new lock.
+
+
+

As you can see, although the deadlock condition did not trigger (because thread +A did not complete execution before thread B started execution) the lockdep +checker identified a potential deadlock scenario.

+

Lockdep checker will provide even more information to help determine what caused +the deadlock, like the dependency chain:

+
the existing dependency chain (in reverse order) is:
+
+-> #1 (b){+.+.}:
+      __mutex_lock+0x60/0x830
+      mutex_lock_nested+0x20/0x30
+      thread_a+0x48/0x90 [locking]
+      kthread+0xeb/0x100
+      ret_from_fork+0x2e/0x38
+
+-> #0 (a){+.+.}:
+      lock_acquire+0x93/0x190
+      __mutex_lock+0x60/0x830
+      mutex_lock_nested+0x20/0x30
+      thread_b+0x48/0x90 [locking]
+      kthread+0xeb/0x100
+      ret_from_fork+0x2e/0x38
+
+
+

and even an unsafe locking scenario:

+
other info that might help us debug this:
+
+Possible unsafe locking scenario:
+
+CPU0                    CPU1
+----                    ----
+lock(b);
+                        lock(a);
+                        lock(b);
+lock(a);
+
+*** DEADLOCK ***
+
+
+

Another example of unsafe locking issues that lockdep checker detects +is unsafe locking from interrupt context. Let's consider the following +kernel module:

+
static DEFINE_SPINLOCK(lock); /* the single spinlock taken by both timerfn() and init_module() */
+
+static void timerfn(struct timer_list *unused) /* callback passed to DEFINE_TIMER() for `timer`; its argument is never read */
+{
+  pr_info("%s acquiring lock\n", __func__);
+  spin_lock(&lock);   pr_info("%s acquired lock\n", __func__); /* plain spin_lock() on the same lock that init_module() takes */
+  spin_unlock(&lock); pr_info("%s released lock\n", __func__);
+}
+
+static DEFINE_TIMER(timer, timerfn); /* timer object defined with timerfn() as its callback; armed in init_module() */
+
+int init_module(void) /* module load hook: arms `timer`, then takes and releases `lock` itself */
+{
+  mod_timer(&timer, jiffies); /* expiry is set to the current jiffies value */
+
+  pr_info("%s acquiring lock\n", __func__);
+  spin_lock(&lock);   pr_info("%s acquired lock\n", __func__); /* plain spin_lock(); the lockdep report registers this acquisition as SOFTIRQ-ON-W */
+  spin_unlock(&lock); pr_info("%s released lock\n", __func__);
+  return 0; /* unconditionally returns 0 */
+}
+
+
+

As in the previous case, loading the module will trigger a lockdep +warning:

+
init_module acquiring lock
+init_module acquired lock
+init_module released lock
+timerfn acquiring lock
+
+================================
+WARNING: inconsistent lock state
+4.19.0+ #4 Tainted: G           O
+--------------------------------
+inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
+ksoftirqd/0/9 [HC0[0]:SC1[1]:HE1:SE0] takes:
+(ptrval) (lock#4){+.?.}, at: timerfn+0x25/0x60 [locking2]
+{SOFTIRQ-ON-W} state was registered at:
+lock_acquire+0x93/0x190
+_raw_spin_lock+0x39/0x50
+init_module+0x35/0x70 [locking2]
+do_one_initcall+0x57/0x2e0
+do_init_module+0x4b/0x1be
+load_module+0x201a/0x2590
+sys_init_module+0xfd/0x120
+do_int80_syscall_32+0x6a/0x1a0
+restore_all+0x0/0x8d
+
+
+

The warning will also provide additional information and a potential unsafe +locking scenario:

+
Possible unsafe locking scenario:
+
+       CPU0
+       ----
+       lock(lock#4);
+       <Interrupt>
+       lock(lock#4);
+
+       *** DEADLOCK ***
+
+1 lock held by ksoftirqd/0/9:
+#0: (ptrval) (/home/tavi/src/linux/tools/labs/skels/./debugging/locking2/locking2.c:13){+.-.}, at: call_timer_f0
+stack backtrace:
+CPU: 0 PID: 9 Comm: ksoftirqd/0 Tainted: G           O      4.19.0+ #4
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
+Call Trace:
+dump_stack+0x66/0x96
+print_usage_bug.part.26+0x1ee/0x200
+mark_lock+0x5ea/0x640
+__lock_acquire+0x4b4/0x17a0
+lock_acquire+0x93/0x190
+_raw_spin_lock+0x39/0x50
+timerfn+0x25/0x60 [locking2]
+
+
+
+
+

perf

+
    +
  • performance counters, tracepoints, kprobes, uprobes
  • +
  • hardware events: CPU cycles, TLB misses, cache misses
  • +
  • software events: page faults, context switches
  • +
  • collects backtraces (user + kernel)
  • +
+
+
+

Other tools

+
    +
  • ftrace
  • +
  • kprobes
  • +
  • sparse
  • +
  • coccinelle
  • +
  • checkpatch.pl
  • +
  • printk
  • +
  • dump_stack()
  • +
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file