diff --git a/SQL_production/TM_2023A_data.sql b/SQL_production/TM_2023A_data.sql new file mode 100644 index 0000000..f05f1ae --- /dev/null +++ b/SQL_production/TM_2023A_data.sql @@ -0,0 +1,434 @@ + +----------------------------------------------------------------------------- +----------------------------------------------------------------------------- +---- Data for TMs ---- +---- ---- +---- This code creates the data download tables and the front end data ---- +---- ---- +---- It has been updated for 2022 to include size in the breakdowns and ---- +---- exam cohort - this made a difference to VRQ3 sports studies where ---- +---- different gnumbers were classed as either applied general or tech ---- +---- level. The front end has also had an update on 2021 to include ---- +---- qualification type to disaggregate A and AS level and to give more ---- +---- detail to study skills. Also removed a lot of the unusual manual ---- +---- grade changes. ---- +---- ---- +----------------------------------------------------------------------------- +----------------------------------------------------------------------------- + +-- declare the year and all the names of the base tables to be updated to appropriate tables +DECLARE @RPYEAR AS INTEGER = 2023 + +If object_Id('tempDB..#subj_tab') is not null drop table #subj_tab +select * into #subj_tab from QRD.dbo.Subje01_2023_10_31 +If object_Id('tempDB..#tab2') is not null drop table #tab2 +select * into #tab2 from QRD.dbo.Table2_2023_10_31 +If object_Id('tempDB..#tab3') is not null drop table #tab3 +select * into #tab3 from QRD.dbo.Table3_2023_10_31 +If object_Id('tempDB..#tab4') is not null drop table #tab4 +select * into #tab4 from QRD.dbo.Table4_2023_10_31 +If object_Id('tempDB..#subj') is not null drop table #subj +--select distinct SUBLEVNO, SUBJ, MAPPING into #subj from [L3VA].[U2022].[QUAL_SUBJ_LOOKUP] +select distinct SUBJ, MAPPING into #subj from [L3VA].[U2023].[SUBJCODES_lookup] + +If object_Id('tempDB..#raw_inst') is not null drop table #raw_inst +select * into #raw_inst from KS5_RESTRICTED.[Outputs].[RAW_Inst_POST16_202223A] +If object_Id('tempDB..#raw_LA') is not null drop table #raw_LA +select * into #raw_LA from KS5_RESTRICTED.[Outputs].[RAW_LEA_POST16_202223A] +If object_Id('tempDB..#exam') is not null drop table #exam +select * into #exam from [KS5_RESTRICTED].[Outputs].[Exam_PT_POST16_202223A] +If object_Id('tempDB..#CSCP_lookup') is not null drop table #CSCP_lookup +select * into #CSCP_lookup from KS5_RESTRICTED.Internal.CSCP_subject_lookup +If object_Id('tempDB..#indicator') is not null drop table #indicator +select * into #indicator from KS5_RESTRICTED.[Outputs].[PupilIndicators_POST16_202223A] +If object_Id('tempDB..#PRIOR') is not null drop table #PRIOR +select * into #PRIOR from KS5_STATISTICS_RESTRICTED.[EES_2023A].[PRIORS_KS4end2022] +If object_Id('tempDB..#Allocations') is not null drop table #Allocations +select PUPILID, END_KS into #Allocations from KS5_RESTRICTED.[Outputs].[PupilAllocations_POST16_202223A] + +---------------------------------------------------------- +-- +--Most of this initial set up code is not important for TMs it's taken from the performance tables data download +-- +---------------------------------------------------------- + +--calculating the original PTQ INCLUDE ignoring COVID impacted to use to flag the COVID impacted results +--the spec for PTQ INCLUDE doesn't include removing # but it appears to be the case when QAing with RM - i think there is something somewhere about # quals +If object_Id('tempDB..#QRD_PTQINCLUDE') is not null drop table #QRD_PTQINCLUDE +select distinct a.QUID, case when a.syllabus_ref = '***' then '' else a.syllabus_ref end as syllabus_ref, a.Wolf_Included_1618 as Wolf_Included_1618_original, KS5_Academic, KS5_Subset, +case when syllabus_ref != 'KNO' and (App1618 = 1 and ((KS5_Equivalences = 1 and KS5_Academic = 1) or Wolf_Included_1618 in (1,2,3)) and QUID != '#') then 1 else 0 end PTQ_INCLUDE_original +into #QRD_PTQINCLUDE +from #subj_tab as a +left join #tab4 as b +on a.Qual_Type = b.Qual_Type + +--get the asize and gsize of the different qualications +If object_Id('tempDB..#QRD_SIZE') is not null drop table #QRD_SIZE +select a.QUID, +max(coalesce(b.GCSE_Equivalent, c.GCSE_Equivalent)) as GSIZE, +max(coalesce(b.AL_Equivalent, c.AL_Equivalent)) as ASIZE +into #QRD_SIZE +from #subj_tab as a +left join #tab2 as b +on a.Qual_Type = b.Qual_Type +left join #tab3 as c +on a.QUID = c.QUID +group by a.QUID + + +--Code from Ops to create the school type lookup +If object_Id('tempDB..#insti_data') is not null drop table #insti_data + select distinct r.LEALAB, URN, schname, TAB1618, +case when b.nftype = 20 then 'Sponsored Academy' +when b.nftype = 21 then 'Community School' +when b.nftype = 22 then 'Voluntary Aided School' +when b.nftype = 23 then 'Voluntary Controlled School' +when b.nftype = 24 then 'Foundation School' +when b.nftype = 25 then 'City Technology College' +when b.nftype = 26 then 'Community Special School' +when b.nftype = 27 then 'Foundation Special School' +when b.nftype = 28 then 'Non-Maintained Special School' +when b.nftype = 29 then 'Independent School Approved for SEN pupils' +when b.nftype = 30 then 'Independent School' +when b.nftype = 31 and b.FESITYPE=1 then 'Agriculture and Horticulture College' +when b.nftype = 31 and b.FESITYPE=2 then 'Art, Design and Performing Art College' +when b.nftype = 31 and b.FESITYPE=3 then 'General Further Education College' +when b.nftype = 31 and b.FESITYPE=4 then 'General Further Education College (Special)' +when b.nftype = 31 and b.FESITYPE=5 then 'Sixth Form College' +when b.nftype = 31 and b.FESITYPE=8 then 'Specialist Designated College' +when b.nftype = 31 and b.FESITYPE=9 then 'Tertiary College' +when b.nftype = 32 then 'Community hospital school' +when b.nftype = 33 then 'Foundation hospital school' +when b.nftype = 34 then 'Pupil Referral Unit' +when b.nftype = 35 then 'Sixth Form Centre / Consortia' +when b.nftype = 36 then 'Ministry of Defence Funded College' +when b.nftype = 38 then 'Special College' +when b.nftype = 41 then 'European Schools' +when b.nftype = 42 then 'Playing for Success Centres' +when b.nftype = 43 then 'Offshore Schools' +when b.nftype = 44 then 'Service Childrens Education' +when b.nftype = 45 then 'Higher Education Institutions' +when b.nftype = 46 then 'Welsh Establishment' +when b.nftype = 47 then 'LA Nursery School' +when b.nftype = 48 then 'Other Independent Special School' +when b.nftype = 49 then 'Early Years setting' +when b.nftype = 50 then 'Sponsored Special Academy' +when b.nftype = 51 then 'Converter Academy' +when b.nftype = 52 then 'Free School - Mainstream' +when b.nftype = 53 then 'Free School - Special' +when b.nftype = 54 then 'British Overseas School' +when b.nftype = 55 then 'Special Converter Academy' +when b.nftype = 56 then 'Alternative Provision' +when b.nftype = 57 then 'University Technical College' +when b.nftype = 58 then 'Studio School' +when b.nftype = 59 then 'Free School - 16-19' +when b.nftype = 60 then 'International Schools' +when b.nftype = 63 then 'Academy 16-19 Converter' +when b.nftype = 64 then 'Academy 16-19 Sponsor Led' +when b.nftype = 97 then 'Alternative provision' +when b.nftype = 98 then 'Legacy types' +when b.nftype = 99 then 'Secure Unit' +else b.NFTYPE end as [School or college type] +into #insti_data +from #raw_inst as b +left join #raw_LA as r +on b.LEA = r.LEA + + +/* Get data from exam file. Join on the front end friendly subject names - currently in an internal table on KS5_RESTRICTED +but ultimately we want this to end up in the QRD. Join on the calculated PTQ INCLUDE, ASIZE and GSIZE +Grade X has been removed for TMs +*/ +--DECLARE @RPYEAR AS INTEGER = 2022 +If object_Id('tempDB..#examprep') is not null drop table #examprep +select LAESTAB, URN, PUPILID, a.GNUMBER, a.BRDSUBNO, _QualificationTypeCode as QUAL_TYPE, SUBLEVNO, AMDEXAM, EXAMYEAR, d.ASIZE, d.GSIZE, MAPPING, +SEASON, GRADE, POINTS_1618, POTENTIAL_LEVEL, c.subject_user_engagement as [Subject], coalesce(PTQ_INCLUDE_original, 0) as PTQ_INCLUDE_original, PTQ_INCLUDE, +DISC_ALL, DISC_0_1, DISC_1_2, DISC_0_2, DISC_SINGLE, DISCS_ALL, DISCS_0_1, DISCS_1_2, DISCS_0_2, DISCS_SINGLE, DISCB_ALL, DISCB_0_1, DISCB_1_2, DISCB_0_2, DISCB_SINGLE, +DISC_ALL_FULL, DISC_0_1_FULL, DISC_1_2_FULL, DISC_0_2_FULL, DISC_SINGLE_FULL, _Wolf_Included_1618, EXAMNO +into #examprep +from #exam as a +left join #QRD_PTQINCLUDE as b +on a.GNUMBER = b.QUID and a.BRDSUBNO = b.Syllabus_Ref +left join #CSCP_lookup as c +on a.MAPPING = c.DISC_CODE +left join #QRD_SIZE as d +on a.GNUMBER = d.QUID +where GRADE != 'X' -- added to stop messing up AS level discounting + + +/* +select results for the appropriate provider based on COND and without discounting +make changes to the grade: + - including adding covid impact result based on ptq include - this can be removed once COVID impacted results are out of scope of the data + - setting pending and no result to no result but now seperating Fail out from no result so they are reported seperately + - making changes to IB where the cumulative points is given with a letter grade eg. 32D - we remove the letter grade. In the cases of a fail eg. 32F, the result is listed + as no result - using points to identify this + - sublevno 129 is an IB combined certificate - the grades that seem to exist are K and J where K is awarded and J is unawareded. This has been set as a pass for K and no result for J + in previous code and then later in the code the pass was set to no result. We are not sure why this was done and does not appear correct and so I have left as pass and fail. +*/ +--DECLARE @RPYEAR AS INTEGER = 2022 +If object_Id('tempDB..#examcut') is not null drop table #examcut +select b.*, Qual_Description, z.GPTSPE_1, z.PTSPE_1, +case when PTQ_INCLUDE = 0 and PTQ_INCLUDE_original = 1 then 'COVID result' -- this doesn't matter as I have added the PTQ_INCLUDE = 1 filter to the where statement so this is redundent +when GRADE IN ('Q','R','X') then 'No result' +when GRADE IN ('F','U','N') then 'Fail' +when SUBLEVNO = 130 and POINTS_1618 > 0 then LEFT(GRADE, 2) +when SUBLEVNO = 130 and POINTS_1618 = 0 then 'Fail' +when sublevno=129 then (case when grade='K' then 'Pass' when grade in ('J','F','Q','U','X') then 'Fail' end) +when grade = 'D*' then '*D' + when grade = 'PM' then 'MP' + when grade = 'MD' then 'DM' + when grade = 'D**' then '**D' + when grade = 'DD*' then '*DD' + when grade = 'MDD' then 'DDM' + when grade = 'MMD' then 'DMM' + when grade = 'PMM' then 'MMP' + when grade = 'PPM' then 'MPP' +else GRADE end GRADE_UPDATE, +case when _Wolf_Included_1618 IN (1,2,3) then case when PTSPE_1 <1 then '<1' +when PTSPE_1 >=1 and PTSPE_1 <2 then '1-<2' +when PTSPE_1 >=2 and PTSPE_1 <3 then '2-<3' +when PTSPE_1 >=3 and PTSPE_1 <4 then '3-<4' +when PTSPE_1 >=4 and PTSPE_1 <5 then '4-<5' +when PTSPE_1 >=5 and PTSPE_1 <6 then '5-<6' +when PTSPE_1 >=6 and PTSPE_1 <7 then '6-<7' +when PTSPE_1 >=6 and PTSPE_1 <7 then '6-<7' +when PTSPE_1 >=7 and PTSPE_1 <8 then '7-<8' +when PTSPE_1 >=8 and PTSPE_1 <9 then '8-<9' +when PTSPE_1 >=9 then '9>=' end +else case when GPTSPE_1 <1 then '<1' +when GPTSPE_1 >=1 and GPTSPE_1 <2 then '1-<2' +when GPTSPE_1 >=2 and GPTSPE_1 <3 then '2-<3' +when GPTSPE_1 >=3 and GPTSPE_1 <4 then '3-<4' +when GPTSPE_1 >=4 and GPTSPE_1 <5 then '4-<5' +when GPTSPE_1 >=5 and GPTSPE_1 <6 then '5-<6' +when GPTSPE_1 >=6 and GPTSPE_1 <7 then '6-<7' +when GPTSPE_1 >=6 and GPTSPE_1 <7 then '6-<7' +when GPTSPE_1 >=7 and GPTSPE_1 <8 then '7-<8' +when GPTSPE_1 >=8 and GPTSPE_1 <9 then '8-<9' +when GPTSPE_1 >=9 then '9>=' end end as PRIOR_BAND, +CASE WHEN (COND =1) THEN DISC_ALL -- not moved, discount over all years, otherwise + WHEN (COND =2) THEN DISC_0_1 -- else didn't move in RY and RY-1 and exam is in RY or RY-1 + WHEN (COND =3) THEN DISC_1_2-- else didn't move in RY-1 and RY-2 and exam is in RY-1 or RY-2 + WHEN (COND =4) THEN DISC_0_2-- else same inst in RY and RY-2 and exam is in RY or RY-2 + WHEN (COND =5) THEN DISC_SINGLE-- else exam is in RY + WHEN (COND =6) THEN DISC_SINGLE-- else exam is in RY-1 + WHEN (COND =7) THEN DISC_SINGLE-- else exam is in RY-2 + ELSE 1 END AS DISC, -- points discounting flag + CASE WHEN (COND =1) THEN DISCS_ALL + WHEN (COND =2) THEN DISCS_0_1 + WHEN (COND =3) THEN DISCS_1_2 + WHEN (COND =4) THEN DISCS_0_2 + WHEN (COND =5) THEN DISCS_SINGLE + WHEN (COND =6) THEN DISCS_SINGLE + WHEN (COND =7) THEN DISCS_SINGLE + ELSE 1 END AS DISCS,-- size discounting flag + CASE WHEN (COND =1) THEN DISCB_ALL + WHEN (COND =2) THEN DISCB_0_1 + WHEN (COND =3) THEN DISCB_1_2 + WHEN (COND =4) THEN DISCB_0_2 + WHEN (COND =5) THEN DISCB_SINGLE + WHEN (COND =6) THEN DISCB_SINGLE + WHEN (COND =7) THEN DISCB_SINGLE + ELSE 1 END AS DISCB,-- subset discounting flag, + CASE WHEN (COND =1) THEN DISC_ALL_FULL -- not moved, discount over all years, otherwise + WHEN (COND =2) THEN DISC_0_1_FULL -- else didn't move in RY and RY-1 and exam is in RY or RY-1 + WHEN (COND =3) THEN DISC_1_2_FULL -- else didn't move in RY-1 and RY-2 and exam is in RY-1 or RY-2 + WHEN (COND =4) THEN DISC_0_2_FULL -- else same inst in RY and RY-2 and exam is in RY or RY-2 + WHEN (COND =5) THEN DISC_SINGLE_FULL -- else exam is in RY + WHEN (COND =6) THEN DISC_SINGLE_FULL -- else exam is in RY-1 + WHEN (COND =7) THEN DISC_SINGLE_FULL -- else exam is in RY-2 + ELSE 1 END AS DISC_FULL +into #examcut +from #indicator as a +left join #PRIOR z +on a.PUPILID = z.pupilid and a.KS4_YEAR_CALC = z.KS4_YEAR_CALC +LEFT join #Allocations y +on a.PUPILID = y.PUPILID +left join #examprep as b +on a.PUPILID = b.PUPILID and a.URN = b.URN +left join #tab4 as c +on b.SUBLEVNO = c.Qual_Number +where [TRIGGER] = 1 and RECTYPE = 1 and COND IN (1,2,3,4,5,6,7) and NAT1618 = 1 and END_KS = 1 --?? + and PTQ_INCLUDE = 1 --and PTQ_INCLUDE_original = 1 -- remove COVID impacted quals for 2022 TMs + AND (AMDEXAM NOT IN ('TO','CL', 'NR','D','W') OR AMDEXAM IS NULL) +and ((COND = 1 and ((EXAMYEAR IN (@RPYEAR, (@RPYEAR - 1), (@RPYEAR - 2)) and SEASON = 'S') or (EXAMYEAR IN ((@RPYEAR - 1), (@RPYEAR - 2), (@RPYEAR - 3)) and SEASON = 'W')) ) +OR (COND = 2 and ((EXAMYEAR IN (@RPYEAR, (@RPYEAR - 1)) and SEASON = 'S') or (EXAMYEAR IN ((@RPYEAR - 1), (@RPYEAR - 2)) and SEASON = 'W'))) +OR (COND = 3 and ((EXAMYEAR IN ((@RPYEAR - 1), (@RPYEAR - 2)) and SEASON = 'S') or (EXAMYEAR IN ((@RPYEAR - 2), (@RPYEAR - 3)) and SEASON = 'W')) ) +OR (COND = 4 and ((EXAMYEAR IN (@RPYEAR, (@RPYEAR - 2)) and SEASON = 'S') or (EXAMYEAR IN ((@RPYEAR - 1), (@RPYEAR - 3)) and SEASON = 'W')) ) +OR (COND = 5 and ((EXAMYEAR = @RPYEAR and SEASON = 'S') or (EXAMYEAR = (@RPYEAR - 1) and SEASON = 'W')) ) +OR (COND = 6 and ((EXAMYEAR = (@RPYEAR - 1) and SEASON = 'S') or (EXAMYEAR = (@RPYEAR - 2) and SEASON = 'W')) ) +OR (COND = 7 and ((EXAMYEAR = (@RPYEAR - 2) and SEASON = 'S') or (EXAMYEAR = (@RPYEAR - 3) and SEASON = 'W')) )) + + +--pull all the grades from the QRD tables 2 and 3 for the qualifications to be reported +If object_Id('tempDB..#grade_struc') is not null drop table #grade_struc +select a.*, b.Grade as grade2, c.Grade as grade3, b.points as points2, c.Points as points3, +coalesce(c.grade, b.grade) as QRD_grade, coalesce(c.points, b.points) as QRD_points, +coalesce(c.last_input_year, b.last_input_year) as last_input_year +into #grade_struc +from ( +select distinct GNUMBER, QUAL_TYPE, SUBLEVNO +from #examcut) as a +left join #tab2 as b +on a.QUAL_TYPE = b.Qual_Type +left join #tab3 as c +on a.GNUMBER = c.QUID + +--altering some of the grades for no results and fails and the ordering of distinction, merit and pass +--eg MDD becomes DDM +--DECLARE @RPYEAR AS INTEGER = 2022 +If object_Id('tempDB..#grade_struc_update') is not null drop table #grade_struc_update +select distinct GNUMBER, QRD_points, QUAL_TYPE, +case when QRD_grade IN ('Q','R','X') then 'No result' +when QRD_grade IN ('F','U','N') then 'Fail' +when SUBLEVNO = 130 and QRD_points > 0 then LEFT(QRD_grade, 2) +when SUBLEVNO = 130 and QRD_points = 0 then 'Fail' +when sublevno=129 then (case when QRD_grade='K' then 'Pass' when QRD_grade in ('J','F','Q','U','X') then 'Fail' else QRD_grade end) + when QRD_grade = 'D*' then '*D' + when QRD_grade = 'PM' then 'MP' + when QRD_grade = 'MD' then 'DM' + when QRD_grade = 'D**' then '**D' + when QRD_grade = 'DD*' then '*DD' + when QRD_grade = 'MDD' then 'DDM' + when QRD_grade = 'MMD' then 'DMM' + when QRD_grade = 'PMM' then 'MMP' + when QRD_grade = 'PPM' then 'MPP' +else QRD_grade end as GRADE_update +into #grade_struc_update +from #grade_struc +where last_input_year = @RPYEAR + +--concatenate the grades to make a grade structure but remove fail and no result in contrast to previous years +If object_Id('tempDB..#generic_gradeStruc') is not null drop table #generic_gradeStruc +Select Main.GNUMBER, Main.qual_type, + Left(Main.gradeStructure,Len(Main.gradeStructure)-1) As "gradeStructure" +into #generic_gradeStruc +From + ( + Select distinct ST2.GNUMBER, ST2.qual_type, + ( + Select ST1.GRADE_update + ',' AS [text()] + From #grade_struc_update ST1 + Where ST1.GNUMBER = ST2.GNUMBER and ST1.GRADE_update NOT IN ('No result', 'Fail') + ORDER BY ST1.GNUMBER, QRD_points desc, GRADE_update + For XML PATH ('') + ) gradeStructure + From #grade_struc_update ST2 + ) [Main] + +--create the exam cohort flag +If object_Id('tempDB..#all_dat') is not null drop table #all_dat +select a.*, b.gradeStructure, +case when KS5_Academic = 1 and KS5_Subset = 1 and SUBLEVNO != 101 then 'A level' +when KS5_Academic = 1 then 'Other academic' +when _Wolf_Included_1618 = 1 then 'Tech level' +when _Wolf_Included_1618 = 2 then 'Applied general' +when _Wolf_Included_1618 = 3 then 'Technical certificate' end as Qualification +into #all_dat +from #examcut as a +left join #generic_gradeStruc as b +on a.GNUMBER = b.GNUMBER +left join #QRD_PTQINCLUDE as y +on a.GNUMBER = y.QUID and a.BRDSUBNO = y.Syllabus_Ref + + +--national, qualification and subject +If object_Id('tempDB..#nat_subj') is not null drop table #nat_subj +select Qual_Description, a.SUBLEVNO, Potential_Level, ASIZE, GSIZE, a.MAPPING, SUBJ, [Subject], gradeStructure, PRIOR_BAND, GRADE_UPDATE as GRADE, --Qualification, +count(PUPILID) as total_students +into #nat_subj +from #all_dat as a +left join #insti_data as d +on a.URN = d.URN +left join #subj e +on a.MAPPING = e.MAPPING --and a.SUBLEVNO = e.SUBLEVNO +where --TAB1618 = 1 + ((Qualification IN ('Other academic', 'Tech level', 'Applied general') and DISCS = 0) OR +(Qualification = 'A level' and (DISCS=0 OR (DISCS=1 AND DISC=1 AND DISCB=0))) OR +(Qualification = 'Technical certificate' and DISC_FULL = 0)) +group by Qual_Description, a.SUBLEVNO, Potential_Level, ASIZE, GSIZE, a.MAPPING, SUBJ, [Subject], gradeStructure, PRIOR_BAND, GRADE_UPDATE --Qualification, +order by Qual_Description, SUBLEVNO, ASIZE, GSIZE, [Subject], GRADE_UPDATE, PRIOR_BAND + +--create national data again but for AS so we can ignore usual discounting and implement AS only discounting +If object_Id('tempDB..#nat_subj_AS') is not null drop table #nat_subj_AS +select 'GCE AS level (All)' as Qual_Description, 699 as SUBLEVNO, Potential_Level, ASIZE, GSIZE, a.MAPPING, SUBJ, [Subject], gradeStructure, PRIOR_BAND, GRADE_UPDATE as GRADE, --Qualification, +count(PUPILID) as total_students +into #nat_subj_AS +from (select *, ROW_NUMBER() OVER (PARTITION BY LAESTAB, PUPILID, MAPPING, SUBLEVNO +ORDER BY POINTS_1618 DESC, EXAMYEAR DESC, SEASON DESC, EXAMNO) AS MAINEXAM +from #all_dat +where SUBLEVNO = 121) as a +left join #insti_data as d +on a.URN = d.URN +left join /*( +select distinct MAPPING, SUBJ +from*/ #subj +/*where SUBLEVNO IN (111, 121))*/ e +on a.MAPPING = e.MAPPING --and a.SUBLEVNO = e.SUBLEVNO +where MAINEXAM = 1 --TAB1618 = 1 +group by Qual_Description, a.SUBLEVNO, Potential_Level, ASIZE, GSIZE, a.MAPPING, SUBJ, [Subject], gradeStructure, PRIOR_BAND, GRADE_UPDATE --Qualification, +order by Qual_Description, SUBLEVNO, ASIZE, GSIZE, [Subject], GRADE_UPDATE, PRIOR_BAND + +-- bind the 2 tables together +If object_Id('tempDB..#bind_nat_dat') is not null drop table #bind_nat_dat +select * +into #bind_nat_dat +from +( +select Qual_Description, SUBLEVNO, Potential_Level, ASIZE, GSIZE, MAPPING, [Subject], gradeStructure, PRIOR_BAND, SUBJ, GRADE, total_students +from #nat_subj +where GRADE NOT IN ('COVID result', 'No result') +union +select Qual_Description, SUBLEVNO, Potential_Level, ASIZE, GSIZE, MAPPING, [Subject], gradeStructure, PRIOR_BAND, SUBJ, GRADE, total_students +from #nat_subj_AS +where GRADE NOT IN ('COVID result', 'No result')) as a + +--filter out quals where there are 15 or fewer entries and output final table +If object_Id('tempDB..#TM_data') is not null drop table #TM_data +select b.* +into #TM_data +from ( +select SUBLEVNO, Potential_Level, ASIZE, GSIZE, MAPPING, gradeStructure +from #bind_nat_dat +group by SUBLEVNO, Potential_Level, ASIZE, GSIZE, MAPPING, gradeStructure +having SUM(total_students) > 15) a +left join #bind_nat_dat b +on a.SUBLEVNO = b.SUBLEVNO and a.POTENTIAL_LEVEL = b.POTENTIAL_LEVEL +and a.ASIZE = b.ASIZE and a.GSIZE = b.GSIZE and a.MAPPING = b.MAPPING and a.gradeStructure = b.gradeStructure + + +select * from #TM_data +where PRIOR_BAND is NULL + +-- 11913 rows +-- up to 11914 rows with grade structure added in +-- 955 NULLS in prior band + +select * from #TM_data +where [Subject] is NULL +-- should be zero + + +--drop table [KS5_STATISTICS_RESTRICTED].[TM_2022].[TM_data_2023A] + + +select * +into [KS5_STATISTICS_RESTRICTED].[TM_2023].[TM_data_2023A] +from #TM_data + + +-- Note +-- JB, 05/01/2023 + +-- One of the columns had a character length that caused errors in R, +-- this was identified in SQL using + +-- use KS5_STATISTICS_RESTRICTED +-- select COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH +-- from INFORMATION_SCHEMA.COLUMNS +-- where TABLE_SCHEMA = 'TM_2022' +-- and TABLE_NAME = 'TM_data_2022A' + +-- and then corrected using +-- alter table [TM_2023].[TM_data_2023A] alter column gradeStructure varchar (200) not NULL; diff --git a/background_scripts/data_processing.R b/background_scripts/data_processing.R index 3174862..cdd70b0 100644 --- a/background_scripts/data_processing.R +++ b/background_scripts/data_processing.R @@ -17,144 +17,54 @@ library(readr) rm(list=ls()) # ----------------------------------------------------------------------------------------------------------------------------- -# ---- Things to change between runs ---- +# ---- Source functions ---- # ----------------------------------------------------------------------------------------------------------------------------- -tm_file = "[KS5_STATISTICS_RESTRICTED].[TM_2023].[TM_data_2023U]" +source("./background_scripts/data_processing_func.R") +# ----------------------------------------------------------------------------------------------------------------------------- +# ---- Things to change between runs ---- +# ----------------------------------------------------------------------------------------------------------------------------- ancillary_save_path <- "//lonnetapp01/DSGA2/!!Secure Data/SFR/2023/KS5/November_2023/06_ancillary/" current_year <- "2023U" # ----------------------------------------------------------------------------------------------------------------------------- -# ---- Reading in the data from SQL tables ---- +# ---- Thing to add - Reading in the data from SQL tables and running function ---- # ----------------------------------------------------------------------------------------------------------------------------- - # establish connection to server -con <- DBI::dbConnect(odbc::odbc(), driver = "SQL Server", - server = "VMT1PR-DHSQL02") +con <- DBI::dbConnect(odbc::odbc(), driver = "SQL Server", server = "VMT1PR-DHSQL02") -# Select data from SQL tables +# Select data from SQL tables - need to add the current year for unamended run and change from U to A for the amended run -tm_data_raw <- tbl(con, sql(paste("select * from", tm_file))) %>% collect() +tm_data_raw_2023 <- tbl(con, sql("select * from [KS5_STATISTICS_RESTRICTED].[TM_2023].[TM_data_2023U]")) %>% collect() +tm_data_raw_2022 <- tbl(con, sql("select * from [KS5_STATISTICS_RESTRICTED].[TM_2022].[TM_data_2022A]")) %>% collect() # disconnect DBI::dbDisconnect(con) +# add current year to the function call to produce the processed data - only needs updating for unamended runs as +# the SQL update above deals with version - -# ----------------------------------------------------------------------------------------------------------------------------- -# ---- SORTING NAs ---- -# ----------------------------------------------------------------------------------------------------------------------------- - -colSums(is.na(tm_data_raw)) %>% as.data.frame() -# NAs in prior band which need removing -# NAs in subj because no subj code was assigned within the SQL production code -# SUBJ is assigned by joining on the QUAL_SUBJ_LOOKUP, -# but some subjects are filtered out of the L3VA process if they are not entered in 5 or more institutions -# for the TMs we don't mind this rule, so we've decided to leave them in but we will have to create new SUBJ codes -# to ensure I'm not overwriting any existing SUBJ codes, I'm generating 3 digit random numbers (SUBJ is usually 5 digits) - - -tm_data_subj_na <- tm_data_raw %>% - filter(!(is.na(PRIOR_BAND)), - is.na(SUBJ)) %>% - group_by(Subject) %>% - mutate(SUBJ = sample(100:900,1)) - -tm_data <- tm_data_raw %>% - filter(!(is.na(PRIOR_BAND)), - !(is.na(SUBJ))) %>% - bind_rows(tm_data_subj_na) - -colSums(is.na(tm_data)) %>% as.data.frame() - - - -# ----------------------------------------------------------------------------------------------------------------------------- -# ---- NUMBERS & PERCENTAGES CALCULATED ---- -# ----------------------------------------------------------------------------------------------------------------------------- - - -student_numbers <- tm_data %>% - mutate(GRADE = case_when(GRADE == "Fail" & SUBLEVNO != 130 ~ "U", - TRUE ~ GRADE), - SIZE = case_when(ASIZE == 0 ~ GSIZE, - TRUE ~ ASIZE), - QUAL_ID = paste0(SUBLEVNO, SUBJ, SIZE, gradeStructure), - ROW_ID = paste0(SUBLEVNO, SUBJ, SIZE, PRIOR_BAND, gradeStructure)) %>% - mutate(across(c(everything(), -total_students), ~as.character(.))) %>% - arrange(ROW_ID) %>% - pivot_wider(names_from = GRADE, values_from = total_students) - - - -student_percentages <- student_numbers %>% - janitor::adorn_percentages() %>% - mutate_if(is.numeric, function(x){round(x*100, 2)}) %>% - mutate_if(is.numeric, ~paste0(.x, "%")) - +processed_data_2023 <- TM_data_prod_func(tm_data_raw_2023, 2023) +processed_data_2022 <- TM_data_prod_func(tm_data_raw_2022, 2022) +# This needs updating to make sure it is the current year data as it is used to produce the ancillary data +# should only need updating for unamended run as with the above - -# ----------------------------------------------------------------------------------------------------------------------------- -# ---- LOOKUP & FULL GRADE OPTIONS CALCULATED ---- -# ----------------------------------------------------------------------------------------------------------------------------- - - -qual_lookup <- tm_data %>% - select(Qual_Description, SUBLEVNO, Subject, SUBJ, ASIZE, GSIZE, gradeStructure) %>% - distinct() %>% - mutate(SIZE = case_when(ASIZE == 0 ~ GSIZE, - TRUE ~ ASIZE)) - - -grade_lookup_u <- tm_data %>% - select(SUBLEVNO, SUBJ, ASIZE, GSIZE, gradeStructure) %>% - distinct() %>% - mutate(GRADE = case_when(SUBLEVNO == 130 ~ "Fail", - TRUE ~ "U")) - -grade_lookup_sep <- tm_data %>% - select(SUBLEVNO, SUBJ, ASIZE, GSIZE, gradeStructure) %>% - distinct() %>% - mutate(GRADE = gradeStructure) %>% - separate_rows(. , GRADE, sep = ",") - -grade_lookup <- bind_rows(grade_lookup_u, grade_lookup_sep) %>% - arrange(SUBLEVNO, SUBJ, ASIZE, GSIZE, gradeStructure, GRADE) %>% - mutate(SIZE = case_when(ASIZE == 0 ~ GSIZE, - TRUE ~ ASIZE)) - - - -# would like to sort numeric and character grades differently so that all grades go from low - high -# for numeric this is ascending, for character this is descending -# will need to split again based on numeric and character and then re-combine again -grades_char <- grade_lookup %>% - mutate(char_grade_check = is.na(suppressWarnings(as.numeric(grade_lookup$GRADE)))) %>% # gives extra column - True is character, False is number - filter(char_grade_check == TRUE) %>% - arrange(desc(GRADE)) - -grades_num <- grade_lookup %>% - mutate(char_grade_check = is.na(suppressWarnings(as.numeric(grade_lookup$GRADE)))) %>% # gives extra column - True is character, False is number - filter(char_grade_check == FALSE) %>% - arrange(GRADE) - -grades_ordered_lookup <- bind_rows(grades_char, grades_num) %>% - arrange(SUBLEVNO) - +current_year_data <- processed_data_2023 # ----------------------------------------------------------------------------------------------------------------------------- # ---- ANCILLARY DATA FOR EES ---- # ----------------------------------------------------------------------------------------------------------------------------- -ancillary_data_numbers <- student_numbers %>% +ancillary_data_numbers <- current_year_data$student_numbers %>% + select(-ReportYr) %>% pivot_longer(!c(Qual_Description, SUBLEVNO, Potential_Level, ASIZE, GSIZE, MAPPING, Subject, gradeStructure, PRIOR_BAND, SUBJ, SIZE, QUAL_ID, ROW_ID), names_to = "grade", values_to = "count") %>% @@ -168,10 +78,11 @@ ancillary_data_numbers <- student_numbers %>% grade, count) %>% filter(!is.na(count)) %>% - arrange(qualification_code, subject_code, size, grade_structure, prior_attainment_band, grade) + arrange(qualification_code, subject_code, size, grade_structure, prior_attainment_band, grade, .locale = "en") -ancillary_data_percentages <- student_percentages %>% +ancillary_data_percentages <- current_year_data$student_percentages %>% + select(-ReportYr) %>% pivot_longer(!c(Qual_Description, SUBLEVNO, Potential_Level, ASIZE, GSIZE, MAPPING, Subject, gradeStructure, PRIOR_BAND, SUBJ, SIZE, QUAL_ID, ROW_ID), names_to = "grade", values_to = "percentage") %>% @@ -185,7 +96,7 @@ ancillary_data_percentages <- student_percentages %>% grade, percentage) %>% filter(percentage != "NA%") %>% - arrange(qualification_code, subject_code, size, grade_structure, prior_attainment_band, grade) + arrange(qualification_code, subject_code, size, grade_structure, prior_attainment_band, grade, .locale = "en") ancillary_data <- ancillary_data_numbers %>% @@ -196,44 +107,26 @@ ancillary_data <- ancillary_data_numbers %>% write_csv(ancillary_data, paste0(ancillary_save_path, 'tm_numbers_percentages_', current_year, '.csv')) - # ----------------------------------------------------------------------------------------------------------------------------- -# ---- add academic year to data ---- +# ---- Saving Data ---- # ----------------------------------------------------------------------------------------------------------------------------- -student_numbers <- student_numbers %>% - mutate(ReportYr = 2023) %>% - select(ReportYr, everything()) +# bind all the processed data -student_percentages <- student_percentages %>% - mutate(ReportYr = 2023) %>% - select(ReportYr, everything()) +student_numbers <- lapply(ls(pattern = "processed_data_", .GlobalEnv), function(x){ + bind_rows(mget(x, .GlobalEnv)[[1]]["student_numbers"])}) %>% bind_rows() -qual_lookup <- qual_lookup %>% - mutate(ReportYr = 2023) %>% - select(ReportYr, everything()) +student_percentages <- lapply(ls(pattern = "processed_data_", .GlobalEnv), function(x){ + bind_rows(mget(x, .GlobalEnv)[[1]]["student_percentages"])}) %>% bind_rows() -grades_ordered_lookup <- grades_ordered_lookup %>% - mutate(ReportYr = 2023) %>% - select(ReportYr, everything()) +qual_lookup <- lapply(ls(pattern = "processed_data_", .GlobalEnv), function(x){ + bind_rows(mget(x, .GlobalEnv)[[1]]["qual_lookup"])}) %>% bind_rows() +grades_ordered_lookup <- lapply(ls(pattern = "processed_data_", .GlobalEnv), function(x){ + bind_rows(mget(x, .GlobalEnv)[[1]]["grades_ordered_lookup"])}) %>% bind_rows() -# grade_list %>% filter(SUBLEVNO == 253, SUBJ == 20596, ASIZE == 1) - -# ----------------------------------------------------------------------------------------------------------------------------- -# ---- Saving Data ---- -# ----------------------------------------------------------------------------------------------------------------------------- - -student_numbers_old <- read_rds("./data/all_student_numbers.rds") -student_percentages_old <- read_rds("./data/all_student_percentages.rds") -qual_lookup_old <- read_rds("./data/qual_lookup.rds") -grades_ordered_lookup_old <- read_rds("./data/grade_lookup.rds") - -student_numbers <- bind_rows(student_numbers, student_numbers_old) -student_percentages <- bind_rows(student_percentages, student_percentages_old) -qual_lookup <- bind_rows(qual_lookup, qual_lookup_old) -grades_ordered_lookup <- bind_rows(grades_ordered_lookup, grades_ordered_lookup_old) +# save as rds files saveRDS(student_numbers, "./data/all_student_numbers.rds") saveRDS(student_percentages, "./data/all_student_percentages.rds") @@ -241,4 +134,3 @@ saveRDS(student_percentages, "./data/all_student_percentages.rds") saveRDS(qual_lookup, "./data/qual_lookup.rds") saveRDS(grades_ordered_lookup, "./data/grade_lookup.rds") - diff --git a/background_scripts/data_processing_func.R b/background_scripts/data_processing_func.R new file mode 100644 index 0000000..6c9c554 --- /dev/null +++ b/background_scripts/data_processing_func.R @@ -0,0 +1,124 @@ + + +TM_data_prod_func <- function(sql_data, ReportYear) { + + # ----------------------------------------------------------------------------------------------------------------------------- + # ---- SORTING NAs ---- + # ----------------------------------------------------------------------------------------------------------------------------- + + colSums(is.na(sql_data)) %>% as.data.frame() + # NAs in prior band which need removing + # NAs in subj because no subj code was assigned within the SQL production code + # SUBJ is assigned by joining on the QUAL_SUBJ_LOOKUP, + # but some subjects are filtered out of the L3VA process if they are not entered in 5 or more institutions + # for the TMs we don't mind this rule, so we've decided to leave them in but we will have to create new SUBJ codes + # to ensure I'm not overwriting any existing SUBJ codes, I'm generating 3 digit random numbers (SUBJ is usually 5 digits) + + + tm_data_subj_na <- sql_data %>% + filter(!(is.na(PRIOR_BAND)), + is.na(SUBJ)) %>% + group_by(Subject) %>% + mutate(SUBJ = sample(100:900,1)) + + tm_data <- sql_data %>% + filter(!(is.na(PRIOR_BAND)), + !(is.na(SUBJ))) %>% + bind_rows(tm_data_subj_na) + + colSums(is.na(tm_data)) %>% as.data.frame() + + # ----------------------------------------------------------------------------------------------------------------------------- + # ---- NUMBERS & PERCENTAGES CALCULATED ---- + # ----------------------------------------------------------------------------------------------------------------------------- + + student_numbers <- tm_data %>% + mutate(GRADE = case_when(GRADE == "Fail" & SUBLEVNO != 130 ~ "U", + TRUE ~ GRADE), + SIZE = case_when(ASIZE == 0 ~ GSIZE, + TRUE ~ ASIZE), + QUAL_ID = paste0(SUBLEVNO, SUBJ, SIZE, gradeStructure), + ROW_ID = paste0(SUBLEVNO, SUBJ, SIZE, PRIOR_BAND, gradeStructure)) %>% + mutate(across(c(everything(), -total_students), ~as.character(.))) %>% + arrange(ROW_ID, .locale = "en") %>% + pivot_wider(names_from = GRADE, values_from = total_students) + + + student_percentages <- student_numbers %>% + janitor::adorn_percentages() %>% + mutate_if(is.numeric, function(x){round(x*100, 2)}) %>% + mutate_if(is.numeric, ~paste0(.x, "%")) + + # ----------------------------------------------------------------------------------------------------------------------------- + # ---- LOOKUP & FULL GRADE OPTIONS CALCULATED ---- + # ----------------------------------------------------------------------------------------------------------------------------- + + + qual_lookup <- tm_data %>% + select(Qual_Description, SUBLEVNO, Subject, SUBJ, ASIZE, GSIZE, gradeStructure) %>% + distinct() %>% + mutate(SIZE = case_when(ASIZE == 0 ~ GSIZE, + TRUE ~ ASIZE)) + + + grade_lookup_u <- tm_data %>% + select(SUBLEVNO, SUBJ, ASIZE, GSIZE, gradeStructure) %>% + distinct() %>% + mutate(GRADE = case_when(SUBLEVNO == 130 ~ "Fail", + TRUE ~ "U")) + + grade_lookup_sep <- tm_data %>% + select(SUBLEVNO, SUBJ, ASIZE, GSIZE, gradeStructure) %>% + distinct() %>% + mutate(GRADE = gradeStructure) %>% + separate_rows(. , GRADE, sep = ",") + + grade_lookup <- bind_rows(grade_lookup_u, grade_lookup_sep) %>% + arrange(SUBLEVNO, SUBJ, ASIZE, GSIZE, gradeStructure, GRADE) %>% + mutate(SIZE = case_when(ASIZE == 0 ~ GSIZE, + TRUE ~ ASIZE)) + + + # would like to sort numeric and character grades differently so that all grades go from low - high + # for numeric this is ascending, for character this is descending + # will need to split again based on numeric and character and then re-combine again + grades_char <- grade_lookup %>% + mutate(char_grade_check = is.na(suppressWarnings(as.numeric(grade_lookup$GRADE)))) %>% # gives extra column - True is character, False is number + filter(char_grade_check == TRUE) %>% + arrange(desc(GRADE)) + + grades_num <- grade_lookup %>% + mutate(char_grade_check = is.na(suppressWarnings(as.numeric(grade_lookup$GRADE)))) %>% # gives extra column - True is character, False is number + filter(char_grade_check == FALSE) %>% + arrange(GRADE) + + grades_ordered_lookup <- bind_rows(grades_char, grades_num) %>% + arrange(SUBLEVNO) + + # ----------------------------------------------------------------------------------------------------------------------------- + # ---- add academic year to data ---- + # ----------------------------------------------------------------------------------------------------------------------------- + + student_numbers <- student_numbers %>% + mutate(ReportYr = ReportYear) %>% + select(ReportYr, everything()) + + student_percentages <- student_percentages %>% + mutate(ReportYr = ReportYear) %>% + select(ReportYr, everything()) + + qual_lookup <- qual_lookup %>% + mutate(ReportYr = ReportYear) %>% + select(ReportYr, everything()) + + grades_ordered_lookup <- grades_ordered_lookup %>% + mutate(ReportYr = ReportYear) %>% + select(ReportYr, everything()) + + + # grade_list %>% filter(SUBLEVNO == 253, SUBJ == 20596, ASIZE == 1) + + return(list(student_numbers = student_numbers, student_percentages = student_percentages, + qual_lookup = qual_lookup, grades_ordered_lookup = grades_ordered_lookup)) + +} diff --git a/ui.R b/ui.R index 60b6b3a..f2d52f9 100644 --- a/ui.R +++ b/ui.R @@ -80,7 +80,7 @@ ui <- function(input, output, session) { ), shinyjs::useShinyjs(), customDisconnectMessage(), - useShinydashboard(), + #useShinydashboard(), tags$head(includeHTML(("google-analytics.html"))), tags$head( tags$link(