/****************************************************************** * SAS program to create the MTUS World 5.53, 5.8, and 6.0 datasets * for the UK 1995 time diary survey, * written by Oiching Yeung and Aaron Lai, * Centre for Time Use Research, University of Oxford, * September 2009 ******************************************************************/ ********** Assign a library **********; libname mylib 'C:\personal\MTUS\UK1995'; %let libdir=C:\personal\MTUS\UK1995; ******************************************************************* *********************** Data preparation ****************** ******************************************************************; ** Check serialno **; * Check if serialno is unique, total number of rows is 2005; proc sql; select count(distinct serialno) from mylib.uk1995_orig; quit; * 1976, duplicates found; * Check if serialno is blank; proc sql; select count(*) from mylib.uk1995_orig where serialno eq .; quit; * 27 rows have a blank serialno; * Look at the content of the rows with a blank serialno; data s1; set mylib.uk1995_orig; if serialno eq .; run; * their tot1 (total sleep time) is 1440 but the profile info is available, assign a new number to them; * Get the largest serialno; proc sql; select max(serialno) format z12. from mylib.uk1995_orig; quit; * 21100309505; * Check for missing age and sex; proc sql; select count(*) from mylib.uk1995_orig where serialno eq . and (q112 eq . or q113 eq .); quit; * 0, ok; * Assign new numbers, use a hardcoded number plus sex*100 plus age; data s2; set mylib.uk1995_orig; if serialno eq . then serialno = 22000000000 + q112*100 + q113; run; * Check again if serialno is unique; proc sql; select count(distinct serialno) from s2; quit; * 2003: 2 duplicates; * List the duplicates; proc sql; select serialno from (select serialno, count(*) as cnt from s2 group by 1) where cnt gt 1; quit; * 5016069505 16088249505; * Add 1 to the serialno for each duplicate; proc sort data=s2 out=s2; by serialno region area address; run; data s3 (drop = lstno); set s2; by serialno region area address; format newno best12.; retain lstno newno; lstno = newno; if last.address then newno = serialno; if last.address and serialno eq lstno then newno = serialno + 1; run; * List the modified duplicates; proc sql; select serialno, newno from s3 where serialno ne newno; quit; * Check again if serialno is unique; proc sql; select count(distinct newno) from s3; quit; * 2005; * Create a temp modified data set for further work; data mylib.mod1; set s3; run; * Clean up; proc datasets library=work; delete s1 s2 s3; quit; ** Accuracy check of the date variables **; proc sql; select month, count(*) from mylib.mod1 group by 1; quit; * 3 cases of blank, 1961 cases month=5, and 41 cases of month=6; * assign all months to 5; proc sql; update mylib.mod1 set month = 5 where month eq . or month eq 6; quit; proc sql; select count(*) from mylib.mod1 where date eq .; quit; * 41 is missing; proc sql; select count(*) from mylib.mod1 where day eq .; quit; * 3 is missing; * Compare date/day to desigday; data d1; set mylib.mod1 (keep = newno date desigday day month); select (date); when (7,14,21,28) wdate = 7; when (1,8,15,22,29) wdate = 1; when (2,9,16,23,30) wdate = 2; when (3,10,17,24,31) wdate = 3; when (4,11,18,25) wdate = 4; when (5,12,19,26) wdate = 5; when (6,13,20,27) wdate = 6; otherwise; end; select (day); when (7,14,21,28) wday = 7; when (1,8,15,22,29) wday = 1; when (2,9,16,23,30) wday = 2; when (3,10,17,24,31) wday = 3; when (4,11,18,25) wday = 4; when (5,12,19,26) wday = 5; when (6,13,20,27) wday = 6; otherwise; end; run; proc sql; select count(*) from d1 where wdate ne desigday; quit; * 137; proc sql; select count(*) from d1 where wday ne desigday; quit; * 1982, day does not correspond with desigday well, no good to use; proc sql; select desigday, count(*) from d1 group by 1; quit; proc sql; select wdate, count(*) from d1 group by 1; quit; * pretty even distribution; * create a new desigday variable with the corrections if wdate (based on date) is blank (meaning blank date), then use the original desigday; data d2; set d1 (drop = day wday); if wdate eq . then newdesigday = desigday; else newdesigday = wdate; run; proc sql; select newdesigday, count(*) from d2 group by 1; quit; * even distribution; * Update desigday; proc sql; update mylib.mod1 a set desigday = (select newdesigday from d2 b where a.newno = b.newno) where newno in (select newno from d2); quit; proc sql; select desigday, count(*) from mylib.mod1 group by 1; quit; * matched; * Clean up; proc datasets library=work; delete d1 d2; quit; ** Check if the adult, child, and household variables synchronized **; proc sql; create table h1 as ( select newno, q16 as q16_tot, q12 as q12_adu, q13, q14, q15, q113 as q113_age, case when q17 eq 1 then '(1) 1 adu 16-64' when q17 eq 2 then '(2) 1 adu 65+ ' when q17 eq 3 then '(3) 2 adu 16-64' when q17 eq 4 then '(4) 2 adu 1 65+' when q17 eq 5 then '(5) 3 adu ages ' when q17 eq 6 then '(6) 1-2 child ' when q17 eq 7 then '(7) 3+ child ' else '.' end as q17_hhtypA, case when q18 eq 1 then '(1) one person only' when q18 eq 2 then '(2) mar/coh child+ ' when q18 eq 3 then '(3) mar/coh 0 child' when q18 eq 4 then '(4) lone par child+' when q18 eq 5 then '(5) all others ' else '.' end as q18_hhtypB, case when q114 eq 1 then '(1) HOH ' when q114 eq 2 then '(2) HOH partner ' when q114 eq 3 then '(3) HOH/ptnr child ' when q114 eq 4 then '(4) HOH/ptnr par ' when q114 eq 5 then '(5) Other relation ' when q114 eq 6 then '(6) Non related adu' else '.' end as q114_relHH, case when q115 eq 1 then '(1) Married ' when q115 eq 2 then '(2) Cohabiting' when q115 eq 3 then '(3) Single ' when q115 eq 4 then '(4) Widowed ' when q115 eq 5 then '(5) Divorced ' when q115 eq 6 then '(6) Separated ' else '.' end as q115_marstat, tot5, count(*) from mylib.mod1 where sum(q12, q13, q14, q15) ne q16 or (sum(q13, q14, q15) eq 0 and (q18 eq 2 or q18 eq 4)) or (sum(q13, q14, q15) ge 1 and q18 eq 3) or ((q17 eq 1 or q17 eq 2) and q12 lt 1) or ((q17 eq 3 or q17 eq 4) and q12 lt 2) or (q17 eq 5 and q12 lt 3) or (q17 eq 6 and sum(q13, q14, q15) eq 0) or (q17 eq 7 and sum(q13, q14, q15) lt 3) or (q18 eq 1 and q16 ge 2) group by 1,2,3,4,5,6,7,8,9,10,11,12); quit; * 81 rows; * The following cases can be updated with some assumptions; proc sql; update mylib.mod1 set q12 = 1 where newno eq 14071159505; update mylib.mod1 set q12 = 2 where newno eq 8038309505 or newno eq 9041149505 or newno eq 14073279505 or newno eq 16089279505; update mylib.mod1 set q16 = 2 where newno eq 8035149505 or newno eq 9041149505 or newno eq 12055279505 or newno eq 16087219505; update mylib.mod1 set q16 = 3 where newno eq 6026179505 or newno eq 9039129505 or newno eq 15081259505 or newno eq 21099199505; update mylib.mod1 set q16 = 4 where newno eq 5022279505 or newno eq 8038309505 or newno eq 21099279505; update mylib.mod1 set q16 = 5 where newno eq 2006249505; update mylib.mod1 set q16 = 6 where newno eq 7032259505; update mylib.mod1 set q17 = 1 where newno eq 3007169505 or newno eq 5017239505 or newno eq 16089299505 or newno eq 21098169505; update mylib.mod1 set q17 = 2 where newno eq 13063229505 or newno eq 17090189505; update mylib.mod1 set q17 = 3 where newno eq 5018219505; update mylib.mod1 set q18 = 2 where newno eq 6024239505 or newno eq 7031289505 or newno eq 12054209505 or newno eq 13068029505 or newno eq 13068159505 or newno eq 15078049505; update mylib.mod1 set q18 = 3 where newno eq 12055139505 or newno eq 13068089505 or newno eq 13068199505 or newno eq 14071199505 or newno eq 14071199505; update mylib.mod1 set q18 = 4 where newno eq 2005239505 or newno eq 3008239505 or newno eq 5017149505 or newno eq 9039129505; quit; * Clean up; proc datasets library=work; delete h1; quit; ** Look at the distribution of q73 to determine the income groups **; data i1; set mylib.mod1 (keep= newno q73); if q73 eq 98 then q73a = 1; else if q73 eq 99 then q73a = .; else q73a = q73; run; proc freq data=i1; tables q73a; run; * determined that values 1-4 => group 1 (lowest 25%) 5-13 => group 2 (middle 50%) 14-21 => group 3 (highest 25%); * Clean up; proc datasets library=work; delete i1; quit; ** Check if q111 and tenure synchronized **; proc sql; select q111, tenure, count(*) from mylib.mod1 group by 1,2; quit; * looks like tenure is more precise as q111=2 can be either rent or mortgage shared; ** Check if q116 and lefted synchronized for student **; proc sql; select q116, lefted, count(*) from mylib.mod1 where q116 eq 97 or lefted eq 5 group by 1,2; quit; * ok; ** Check if q130 and cars are synchronized **; proc sql; select q130, cars, count(*) from mylib.mod1 group by 1,2; quit; * looks like cars is more precise; ** Check if emp, q118, q125, and q122 synchronized **; proc sql; create table e1 as ( select case when emp eq 1 then 'full time' when emp eq 2 then 'part time' when emp eq 3 then 'unemploy ' when emp eq 4 then 'inactive ' else '.' end as emp_1, case when q118 eq 1 then 'full time' when q118 eq 2 then 'part time' when q118 eq 3 then 'part time' else '.' end as q118_1, case when q125 eq 1 then 'employed' when q125 eq 2 then 'self-emp' else '.' end as q125_1, case when q122 eq 1 then 'yes in-paid emp' when q122 eq 2 then 'no in-paid emp ' else '.' end as q122_1, count(*) from mylib.mod1 group by 1,2,3,4); quit; * not very synchronized, seems emp is more uniformed and has less missing values; ** Check the employment when q119=blank **; proc sql; create table e2 as ( select case when emp eq 1 then 'full time' when emp eq 2 then 'part time' when emp eq 3 then 'unemploy ' when emp eq 4 then 'inactive ' else '.' end as emp_1, case when q118 eq 1 then 'full time' when q118 eq 2 then 'part time' when q118 eq 3 then 'part time' else '.' end as q118_1, case when q125 eq 1 then 'employed' when q125 eq 2 then 'self-emp' else '.' end as q125_1, case when q122 eq 1 then 'yes in-paid emp' when q122 eq 2 then 'no in-paid emp ' else '.' end as q122_1, q113, count(*) from mylib.mod1 where q119 eq . group by 1,2,3,4,5); quit; * 99% of the q119=blank are either employed/selfemployed, if their age is 65+, then set retired=1, else set to retired=0; * Clean up; proc datasets library=work; delete e1 e2; quit; ** Check the distribution of sex and age **; proc sql; select q112, q113, count(*) from mylib.mod1 group by 1,2; quit; ** Check the values of the activity time slot variables **; data a1; set mylib.mod1(keep= newno _:); run; proc transpose data=a1 out=a2; by newno; run; proc sql; select col1, count(*) from a2 group by 1; quit; * other than activity code 1-31 (31 is probably the code for missing), found 1 case of 12.8, 1 case of 13.2; proc sql; select * from a2 where col1 eq 12.8 or col1 eq 13.2; quit; * newno=14070219505 _12_15=13.2 _12_30=12.8; data a3; set mylib.mod1 (keep= newno _: tot: daytot); if newno eq 14070219505; run; * from the types of activity before and after these time slots, these bad activities can be reassign to activity code 13, note that due to this problem, daytot=1410(although this variable may not be used), need to add 30 mins to tot13 and daytot; * Update the variables; proc sql; update mylib.mod1 set _12_15 = 13 where newno eq 14070219505; update mylib.mod1 set _12_30 = 13 where newno eq 14070219505; update mylib.mod1 set tot13 = tot13 + 30 where newno eq 14070219505; update mylib.mod1 set daytot = daytot + 30 where newno eq 14070219505; quit; * Clean up; proc datasets library=work; delete a1 a2 a3; quit; ** Check the accuracy of the total activity time variables **; * Sum up total activity time from the time slot variables; data ta1; set mylib.mod1(keep= newno _:); run; proc transpose data=ta1 out=ta2; by newno; run; proc sql; create table ta3 as ( select newno, col1 as actv, count(*) as actv_cnt from ta2 group by 1,2); quit; data ta4; set ta3; actv_tot = actv_cnt * 15; run; * Get the total activity time variables; data tt1; set mylib.mod1(keep= newno tot: drop= tot); run; proc transpose data=tt1 out=tt2; by newno; run; data tt3; set tt2; actv = substr(_name_,4,length(_name_)-3) * 1; rename col1 = actv_tot; run; * Compare the values; proc sql; create table c1 as ( select a.*, b.actv_tot as tt_tot from ta4 a join tt3 b on a.newno = b.newno where a.actv eq b.actv and a.actv_tot ne b.actv_tot); quit; * 0, tot* variables are ok to use; * Clean up; proc datasets library=work; delete ta1 ta2 ta3 ta4 tt1 tt2 tt3 c1; quit; ** Calculate imputed personal/household care and imputed sleep **; data i1 (drop= tot31); set mylib.mod1(keep= newno _: tot31); if tot31 gt 0; run; proc transpose data=i1 out=i2; by newno; run; * Assign a date to make the time in sequence since the diary started at 4am; data i3 (rename = (col1=actv)); set i2; length ts $30; if length(compress(_name_)) eq 6 then ts = '01jan2009' || ':' || substr(_name_,2,2) || ':' || substr(_name_,5,2); else if length(compress(_name_)) eq 5 and (substr(compress(_name_),1,3) eq '_0_' or substr(compress(_name_),1,3) eq '_1_' or substr(compress(_name_),1,3) eq '_2_' or substr(compress(_name_),1,3) eq '_3_') then ts = '02jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2); else ts = '01jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2); ep_time = input(ts, datetime.); format ep_time datetime.; run; proc sort data=i3(keep= newno _name_ actv ep_time) out=t_input; by newno ep_time; run; * Test the missing gap; data t_test_missing t_adjustor(drop = actv ep_time); set t_input; by newno ep_time; retain; * 4 variables to denote the pre- and post-missing time and activities; format start_miss_ts datetime13.; format end_miss_ts datetime13.; format lag_ts datetime13.; lag_ts = lag(ep_time); * reset for different resp; if first.newno then do; start_miss_ts = .; end_miss_ts = .; pre_actv = .; post_actv = .; lactv = .; lag_ts = .; end; lactv = lag(actv); drop lactv lag_ts; * check for something then missing; if lactv ne 31 and actv eq 31 then do; start_miss_ts = ep_time; pre_actv = lactv; end_miss_ts = .; post_actv = .; end; length startkey $70; length endkey $70; * check for missing then something; if lactv eq 31 and actv ne 31 then do; end_miss_ts = lag_ts; post_actv = actv; startkey = catt(newno,start_miss_ts); endkey = catt(newno,end_miss_ts); if (end_miss_ts - start_miss_ts) le 60*60*2 then output t_adjustor; end; startkey = catt(newno,start_miss_ts); endkey = catt(newno,end_miss_ts); output t_test_missing; run; data t_test; set t_input; imputed_actv = .; length key $70; key = catt(newno,ep_time); run; * Imputing data case by case; * case: sleep -> missing -> paidwork imputed personal care; proc sql; * imputted personal care; update t_test set imputed_actv = 3 where key in (select startkey from t_adjustor where pre_actv = 1 and post_actv lt 3 ) ; update t_test set imputed_actv = 1 where key in (select startkey from t_adjustor where pre_actv = 1 and post_actv gt 2 ) ; * home to travel; update t_test set imputed_actv = 62 where key in (select startkey from t_adjustor b where (pre_actv lt 7 or pre_actv between 13 and 21) and (post_actv between 8 and 12 or post_actv = 23 or post_actv between 26 and 28) and (end_miss_ts - start_miss_ts) lt 2*15*60 ) ; update t_test set imputed_actv = 1 where key in (select startkey from t_adjustor b where (pre_actv lt 7 or pre_actv between 13 and 21) and (post_actv between 8 and 12 or post_actv = 23 or post_actv between 26 and 28) and (end_miss_ts - start_miss_ts) gt 15*60 ) ; update t_test set imputed_actv = 62 where key in (select endkey from t_adjustor b where (pre_actv lt 7 or pre_actv between 13 and 21) and (post_actv between 8 and 12 or post_actv = 23 or post_actv between 26 and 28) and (end_miss_ts - start_miss_ts) gt 15*60 ) ; * travel to home; update t_test set imputed_actv = 62 where key in (select startkey from t_adjustor b where (post_actv lt 7 or post_actv between 13 and 21) and (pre_actv between 8 and 12 or pre_actv = 23 or pre_actv between 26 and 28) and (end_miss_ts - start_miss_ts) lt 2*15*60 ) ; update t_test set imputed_actv = 1 where key in (select startkey from t_adjustor b where (post_actv lt 7 or post_actv between 13 and 21) and (pre_actv between 8 and 12 or pre_actv = 23 or pre_actv between 26 and 28) and (end_miss_ts - start_miss_ts) gt 15*60 ) ; update t_test set imputed_actv = 62 where key in (select endkey from t_adjustor b where (post_actv lt 7 or post_actv between 13 and 21) and (pre_actv between 8 and 12 or pre_actv = 23 or pre_actv between 26 and 28) and (end_miss_ts - start_miss_ts) gt 15*60 ) ; * travel and then back home; update t_test set imputed_actv = 1 where key in (select endkey from t_adjustor b where (post_actv lt 7 or post_actv between 13 and 21) and (pre_actv = 7) ) ; * home and then travel; update t_test set imputed_actv = 1 where key in (select endkey from t_adjustor b where (pre_actv lt 7 or pre_actv between 13 and 21) and (post_actv = 7) ) ; quit; proc sql; select imputed_actv, count(*) from t_test group by 1; quit; * Create imputed variables; proc sql; create table imp1 as ( select newno, imputed_actv, count(*) as cnt from t_test where imputed_actv ne . group by 1,2); quit; proc sql; create table imp2 as ( select newno, max(case when imputed_actv eq 1 then cnt*15 else 0 end) as imp_m1, max(case when imputed_actv eq 3 then cnt*15 else 0 end) as imp_m3, max(case when imputed_actv eq 62 then cnt*15 else 0 end) as imp_m62 from imp1 group by 1); quit; * Subtract the imputed time from tot31 (total missing); proc sql; create table imp3 as ( select a.*, case when b.newno ne . then b.imp_m1 else 0 end as imp_m1, case when b.newno ne . then b.imp_m3 else 0 end as imp_m3, case when b.newno ne . then b.imp_m62 else 0 end as imp_m62, case when b.newno ne . then a.tot31 - (b.imp_m1 + b.imp_m3 + b.imp_m62) else a.tot31 end as newtot31 from mylib.mod1 a left join imp2 b on a.newno = b.newno); quit; * Check; proc sql; select count(*) from imp3 where newtot31 lt 0; quit; * 0, ok; ** Compute missing continuously 90+ minutes of activity **; proc sort data=t_test out=m1; by newno ep_time; run; * Identify the problem cases; data m2; set m1; by newno; retain cnt; if _n_ eq 1 then cnt = 0; lagactv = lag(actv); if actv eq 31 and imputed_actv eq . then cnt = cnt + 1; if cnt gt 0 and actv ne 31 and lagactv eq 31 then do; if cnt ge 6 then output; cnt = 0; end; run; * Tag the problem cases; proc sql; create table imp4 as ( select a.*, case when b.newno ne . then 1 else 0 end as mis90plus from imp3 a left join (select distinct newno from m2) b on a.newno = b.newno); quit; ** Update the imputed time slots **; * Reassign activity codes (add a 9 to the code) due to the imputed time: 1 -> 91 (imputed personal or household care) 3 -> 93 (imputed sleep) 62 -> 962 (mode of recorded travel); data u1; set t_test (keep = newno _name_ actv imputed_actv); if imputed_actv ne .; select(imputed_actv); when (1) do; newactv = 91; end; when (3) do; newactv = 93; end; when (62) do; newactv = 962; end; otherwise; end; run; * Write the update statements to a SAS program; data _null_; set u1 end=eof; file "&libdir.\imptimeslot.sas"; length aline $ 200; if _n_ eq 1 then do; aline = "proc sql;"; put aline; end; aline = "update imp4 set " || _name_ || " = " || newactv || " where newno = " || newno || " ; "; put aline; if eof eq 1 then do; aline = "quit;"; put aline; end; run; * Execute the SAS program; %include "&libdir.\imptimeslot.sas"; * Create a temp modified data set for further work; data mylib.mod2; set imp4; run; * Clean up; proc datasets library=work; delete i1 i2 i3 t_input t_test_missing t_adjustor t_test imp1 imp2 imp3 imp4 m1 m2 u1; quit; ** Calculate from/to work/education travel time **; ** tot7=travel, tot8=paidwork, tot9=education ** ** tot7 -> av5, av17, av5 -> main63, av17 -> main62 **; data t1 (drop= tot7); set mylib.mod2(keep= newno _: tot7); if tot7 gt 0; run; proc transpose data=t1 out=t2; by newno; run; * Assign a date to make the time in sequence since the diary started at 4am; data t3 (rename = (col1=actv)); set t2; length ts $30; if length(compress(_name_)) eq 6 then ts = '01jan2009' || ':' || substr(_name_,2,2) || ':' || substr(_name_,5,2); else if length(compress(_name_)) eq 5 and (substr(compress(_name_),1,3) eq '_0_' or substr(compress(_name_),1,3) eq '_1_' or substr(compress(_name_),1,3) eq '_2_' or substr(compress(_name_),1,3) eq '_3_') then ts = '02jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2); else ts = '01jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2); ep_time = input(ts, datetime.); format ep_time datetime.; run; proc sort data=t3(keep= newno _name_ actv ep_time) out=t4; by newno ep_time; run; * Find to-work/education and from-work/education time slots; data tr1; set t4; by newno ep_time; retain; format start_tr end_tr datetime.; lagactv = lag(actv); if first.newno then do; lagactv = 0; tag_tr = 0; end; if actv eq 7 and lagactv ne 7 then start_tr = ep_time; if (actv eq 8 or actv eq 9) and lagactv eq 7 then do; end_tr = ep_time; tag_tr = 1; end; if actv eq 7 and (lagactv eq 8 or lagactv eq 9) then start_tr = ep_time; if (actv ne 7 and actv ne 8 and actv ne 9) and lagactv eq 7 then do; end_tr = ep_time; tag_tr = 0; end; run; proc sort data=tr1 out=tr2; by newno descending ep_time ; run; data tr3; set tr2; by newno descending ep_time; retain; format newstart_tr newend_tr datetime.; needfix = 0; if lagactv eq 7 and tag_tr eq 1 then do; newstart_tr = start_tr; newend_tr = end_tr; needfix = 1; end; if actv eq 7 and tag_tr eq 1 then do; newstart_tr = start_tr; newend_tr = end_tr; needfix = 1; end; if actv ne 7 and needfix eq 0 then do; newstart_tr = .; newend_tr = .; end; if ep_time ge newstart_tr and ep_time le newend_tr then needfix = 1; run; * Reassign travel code 7 to 97 for to/from work/education time Write the update statements to a SAS program; data u1; set tr3 (keep = newno _name_ actv needfix); if actv eq 7 and needfix; run; data _null_; set u1 end=eof; file "&libdir.\travtimeslot.sas"; length aline $ 200; if _n_ eq 1 then do; aline = "proc sql;"; put aline; end; aline = "update mylib.mod2 set " || _name_ || " = 97 where newno = " || newno || " ; "; put aline; if eof eq 1 then do; aline = "quit;"; put aline; end; run; * Execute the SAS program; %include "&libdir.\travtimeslot.sas"; * Redefine tot7; proc sql; create table tr4 as ( select a.newno, a.cnt * 15 as av5 from (select newno, count(*) as cnt from u1 group by 1) a ); quit; proc sql; create table tr5 as ( select a.*, case when b.newno ne . then b.av5 else 0 end as cal_av5 from mylib.mod2 a left join tr4 b on a.newno = b.newno); quit; data tr6; set tr5; cal_av17 = tot7 - cal_av5; run; * Check; proc sql; select count(*) from tr6 where cal_av17 lt 0; quit; *0, ok; * Create a temp modified data set for further work; data mylib.mod3; set tr6; run; * Clean up; proc datasets library=work; delete t1 t2 t3 t4 tr1 tr2 tr3 u1 tr4 tr5 tr6; quit; ** Final count the number of episodes **; data e1; set mylib.mod3(keep= newno _:); run; proc transpose data=e1 out=e2; by newno; run; * Assign a date to make the time in sequence since the diary started at 4am; data e3 (rename = (col1=actv)); set e2; length ts $30; if length(compress(_name_)) eq 6 then ts = '01jan2009' || ':' || substr(_name_,2,2) || ':' || substr(_name_,5,2); else if length(compress(_name_)) eq 5 and (substr(compress(_name_),1,3) eq '_0_' or substr(compress(_name_),1,3) eq '_1_' or substr(compress(_name_),1,3) eq '_2_' or substr(compress(_name_),1,3) eq '_3_') then ts = '02jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2); else ts = '01jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2); ep_time = input(ts, datetime.); format ep_time datetime.; run; proc sort data=e3(keep= newno _name_ actv ep_time) out=e4; by newno ep_time; run; * Get episode count; data e5; set e4; by newno ep_time; retain; if first.newno then ep_cnt = 1; if first.newno ne 1 and actv ne lag(actv) then ep_cnt = ep_cnt + 1; run; * Get the number of episodes; proc sql; create table ep1 as ( select newno, max(ep_cnt) as num_ep from e5 group by 1); quit; * Append the number of episodes variable; proc sql; create table ep2 as ( select a.*, b.num_ep from mylib.mod3 a left join ep1 b on a.newno = b.newno); quit; * Create a temp modified data set for further work; data mylib.mod4; set ep2; run; * Clean up; proc datasets library=work; delete e1 e2 e3 e4 e5 ep1 ep2; quit; ** Fix the cases that have tot10 (breaks) but no tot8 (paid work) and emp not equal 1 or 2 (not employed) **; data b1 (keep= newno tot10); set mylib.mod4 (keep= newno tot8 tot18 tot10 emp); if tot8 eq 0 and tot18 eq 0 and tot10 gt 0 and emp ne 1 and emp ne 2; run; * 40 cases; proc sql; create table b2 as ( select a.*, case when b.newno eq . then a.tot10 else 0 end as tot10_wrk, case when b.newno ne . then b.tot10 else 0 end as tot10_oth from mylib.mod4 a left join b1 b on a.newno = b.newno ); quit; * Create a temp modified data set for further work; data mylib.mod5; set b2; run; * Clean up; proc datasets library=work; delete b1 b2; quit; * Create the final modified data set for the conversion; data mylib.uk1995_mod; set mylib.mod5; run; ******************************************************************* ************* Harmonized variables for 5.53 and 5.8 ************* ******************************************************************; data mylib.hvar1 (keep = newno date desigday wta _: imp_m: newtot31 mis90plus num_ep cal_av:); set mylib.uk1995_mod; _countreg = 37; *_country = 6; _survey = 1995; _swave = 0; _msamp = 0; _hldid = 0; _persid = newno; _id = _persid; _parntid1 = -9; _parntid2 = -9; _partid = -9; select (desigday); * 1=sun; when (1) _day = 2; when (2) _day = 3; when (3) _day = 4; when (4) _day = 5; when (5) _day = 6; when (6) _day = 7; when (7) _day = 1; otherwise _day = -8; end; _month = month; _year = _survey; _diary = 1; _badcase = 0; * a placeholder, to be updated later; * q16 is total people in household; * q18 is household type; * q115 is marital status; if q16 eq 1 or q18 eq 1 then _hhtype = 1; else if q16 eq 2 and (q115 eq 1 or q115 eq 2) then _hhtype = 2; else if q16 gt 2 and (q18 eq 2 or q18 eq 3) then _hhtype = 3; else if q16 gt 2 and (q115 eq 1 or q115 eq 2) then _hhtype = 3; else _hhtype = 4; if q16 ne . then _hhldsize = q16; else _hhldsize = -8; * q13/q14/q15 is number of children age 0-4/5-10/11-15; * q113 is age 16 or older; * q114 is relationship to household: 3 = partner's child; if (q113 eq 16 or q113 eq 17) and q114 eq 3 then _nchild = sum(q13, q14, q15) + 1; * diarist plus the number of child; else _nchild = sum(q13, q14, q15); * q14 is age 5-10 and q15 is age 11-15, so agekidx values 2 and 3 may not be precise; if q13 ge 1 then _agekidx = 1; else if q14 ge 1 then _agekidx = 2; else if q15 ge 1 then _agekidx = 3; else if q113 eq 16 or q113 eq 17 then _agekidx = 3; else if q113 eq 18 then _agekidx = 4; else if sum(q13, q14, q15) eq 0 then _agekidx = -7; else _agekidx = -8; _agekid2 = -9; * q73 is gross income and use it as is; if q73 ne . then _incorig = q73; else _incorig = -8; * The income group distribution is determined above; if (q73 ge 1 and q73 le 4) or q73 eq 98 then _income = 1; else if q73 ge 5 and q73 le 13 then _income = 2; else if q73 ge 14 and q73 le 21 then _income = 3; else _income = -8; if tenure eq 1 or tenure eq 2 then _ownhome = 1; else if tenure eq 3 or tenure eq 4 then _ownhome = 2; else _ownhome = 3; _urban = -9; _computer = -9; if cars eq 0 then _vehicle = 0; else if cars eq 1 then _vehicle = 3; else if cars eq 2 or cars eq 3 then _vehicle = 4; else _vehicle = -8; _sex = q112; * q113 is age; if q113 lt 80 and q113 ne . then _age = q113; else if q113 ge 80 then _age = 80; else _age = -8; * q13/q14/q15 is number of children age 0-4/5-10/11-15; * q113 is age; * q114 is relationship to household: 3 = partner's child; if q113 ge 18 and q113 le 39 and sum(q13, q14, q15) eq 0 then _famstat = 0; else if q113 ge 18 and q13 ge 1 then _famstat = 1; else if q113 ge 18 and q13 eq 0 and sum(q14, q15) ge 1 then _famstat = 2; else if q113 ge 40 and sum(q13, q14, q15) eq 0 then _famstat = 3; else if q113 ne . and q113 lt 18 and q114 eq 3 then _famstat = 4; else if q113 ne . and q113 lt 18 then _famstat = 5; else _famstat = -8; * q114 is relationship to household: 3 = partner's child; * q115 is marital status; if q114 eq 3 and (q115 eq 3 or q115 eq 4 or q115 eq 5 or q115 eq 6) then _cphome = 1; else if q114 eq 1 or q114 eq 2 or q114 eq 4 or q114 eq 5 or q114 eq 6 then _cphome = 0; else _cphome = -8; * q18 is household type B; if q18 eq 4 then _singpar = 1; else if q18 eq 1 or q18 eq 2 or q18 eq 3 or q18 eq 5 then _singpar = 0; else _singpar = -8; * q114 is relationship to household; if q114 eq 1 or q114 eq 2 or q114 eq 3 or q114 eq 4 then _relrefp = q114; else if q114 eq 5 then _relrefp = 9; else if q114 eq 6 then _relrefp = 10; else _relrefp = -8; * q115 is marital status; if (q115 eq 1 or q115 eq 2) then _civstat = 1; else if q115 eq 3 or q115 eq 4 or q115 eq 5 or q115 eq 6 then _civstat = 2; else _civstat = -8; if q115 eq 1 then _cohab = 0; else if q115 eq 2 then _cohab = 1; else if q115 eq 3 or q115 eq 4 or q115 eq 5 or q115 eq 6 then _cohab = -7; else _cohab = -8; _citizen = -9; * emp is economic status; if emp eq 1 then _empstat = 1; else if emp eq 2 then _empstat = 2; else if emp eq 3 then _empstat = 4; else if emp eq 4 then _empstat = 4; else _empstat = -8; if _empstat eq 1 or _empstat eq 2 or _empstat eq 3 then _emp = 1; else if _empstat eq 4 then _emp = 0; else _emp = -8; if emp eq 1 or emp eq 2 or emp eq 4 then _unemp = 0; else if emp eq 3 then _unemp = 1; else _unemp = -8; * q116 is age left full education, 97 = still in full time education; if q116 eq 97 then _student = 1; else if q116 eq 99 or q116 eq . then _student = -8; else _student = 0; * q119 is last week activity see above for the finding on q119; if q119 eq 7 then _retired = 1; else if q119 eq . and q113 ge 65 then _retired = 1; * age 65+; else _retired = 0; if q113 ge 70 and _empstat eq 4 then _retired = 1; * age 70+ and not in paid work; _empsp = -9; _workhrs = -9; _empinclm = -9; _occup = -9; _sector = -9; * lefted is age completed full time education; * q113 is age; if lefted ne . then _educa = lefted; else _educa = -8; if lefted eq 1 or lefted eq 2 or lefted eq 3 then _edtry = lefted; else if lefted eq 4 then _edtry = 3; else if lefted eq 5 and q113 gt 0 and q113 le 17 then _edtry = 1; else if lefted eq 5 and q113 eq 18 then _edtry = 2; else if lefted eq 5 and q113 ge 19 then _edtry = 3; else if lefted eq 6 then _edtry = 1; else _edtry = -8; _rushed = -9; _health = -9; _carer = -9; _disab = -9; _av1 = sum(tot8,tot10_wrk); _av2 = tot18; _av3 = -9; _av4 = tot9; _av5 = cal_av5; _av6 = tot3; _av7 = sum(tot13,tot14); _av8 = sum(tot6,tot15); _av9 = tot16; _av10 = tot11; _av11 = tot5; _av12 = -9; _av13 = sum(tot2,imp_m1); _av14 = -9; _av15 = tot4; _av16 = sum(tot1,imp_m3); _av17 = sum(imp_m62,cal_av17); _av18 = tot26; _av19 = tot25; _av20 = -9; _av21 = -9; _av22 = -9; _av23 = sum(tot12,tot28); _av24 = tot27; _av25 = -9; _av26 = -9; _av27 = -9; _av28 = tot23; _av29 = tot24; _av30 = -9; _av31 = tot19; _av32 = -9; _av33 = tot17; _av34 = -9; _av35 = tot21; _av36 = tot29; _av37 = tot20; _av38 = -9; _av39 = -9; _av40 = sum(tot22,tot30,tot10_oth); _av41 = newtot31; _ocombwt = wta; * A placeholder, to be calculater later; _propwt = 0.0; * 5.8 specific variables; _main1 = imp_m1; _main2 = tot1; _main3 = imp_m3; _main4 = tot2; _main5 = -9; _main6 = tot4; _main7 = tot8; _main8 = tot18; _main9 = -9; _main10 = -9; _main11 = -9; _main12 = tot10_wrk; _main13 = -9; _main14 = -9; _main15 = tot9; _main16 = tot17; _main17 = -9; _main18 = tot3; _main19 = -9; _main20 = tot13; _main21 = tot14; _main22 = tot15; _main23 = -9; _main24 = tot11; _main25 = -9; _main26 = -9; _main27 = -9; _main28 = tot5; _main29 = -9; _main30 = -9; _main31 = -9; _main32 = tot6; _main33 = sum(tot12,tot28); _main34 = -9; _main35 = tot26; _main36 = -9; _main37 = tot27; _main38 = -9; _main39 = tot23; _main40 = -9; _main41 = -9; _main42 = tot25; _main43 = -9; _main44 = -9; _main45 = -9; _main46 = tot16; _main47 = -9; _main48 = tot24; _main49 = tot20; _main50 = -9; _main51 = sum(tot30,tot10_oth); _main52 = -9; _main53 = -9; _main54 = tot22; _main55 = tot29; _main56 = tot21; _main57 = -9; _main58 = -9; _main59 = tot19; _main60 = -9; _main61 = -9; _main62 = sum(imp_m62,cal_av17); _main63 = cal_av5; _main64 = -9; _main65 = -9; _main66 = -9; _main67 = -9; _main68 = -9; _main69 = newtot31; _sppart = -9; * UK supplement variables; select (region); when (1,2,3,4,5,6) _region = 1; when (7,8,9) _region = 2; when (10) _region = 3; when (11,12,13,14) _region = 4; when (15) _region = 5; when (16,17) _region = 6; when (18,19,20,21) _region = 7; otherwise _region = -8; end; * q116 is age left full education; if q116 eq 97 then _aglftsch = 0; else if q116 le 15 and q116 ne . then _aglftsch = 15; else if q116 ge 21 then _aglftsch = 21; else if q116 eq . then _aglftsch = -8; else _aglftsch = q116; * q71 is ethnic group; select (q71); when (1) _ethnic = 1; when (2,3,4) _ethnic = 2; when (5,6,7) _ethnic = 3; when (8,9) _ethnic = 4; otherwise _ethnic = -8; end; run; ******************************************************************* ************* Adjusted variables for 5.53 and 5.8 ************* ******************************************************************; ** Define the categories for badcase **; * Tag the cases with missing data or with a situation; data b1; set mylib.hvar1 (keep= newno _day _sex _age num_ep mis90plus _main:); nosleep = 0; noeat = 0; nocare = 0; notravex = 0; noday = 0; nosexage = 0; less7ep = 0; careother = 0; homeallday = 0; mis2basicplus = 0; if sum(_main2,_main3) eq 0 then nosleep = 1; if sum(_main6,_main18,_main39) eq 0 then noeat = 1; if sum(_main1,_main4) eq 0 then nocare = 1; if sum(_main42,_main62,_main63) eq 0 then notravex = 1; misbasic = sum(nosleep, noeat, nocare, notravex); if _day lt 1 then noday = 1; if _sex lt 1 or _age lt 1 then nosexage = 1; if num_ep lt 7 then less7ep = 1; if misbasic ge 2 then mis2basicplus = 1; if _main28 gt 1 or _main32 gt 1 then careother = 1; * at home all day: home time at least 1000 minutes and no travel time; if sum(_main1,_main2,_main3,_main4,_main8,_main18, _main20,_main21,_main22,_main55,_main59) ge 1000 and sum(_main62, _main63) eq 0 then homeallday = 1; * missing 2 or more basic activities and exception; if misbasic ge 2 then do; if careother or homeallday or (misbasic eq 2 and num_ep ge 15) then mis2basicplus = 0; else mis2basicplus = 1; end; run; * Identify bad cases; data b2; set b1; badcase = 0; if noday and nosexage and mis90plus and less7ep and mis2basicplus then badcase = 31; else if noday and mis90plus and less7ep and mis2basicplus then badcase = 30; else if nosexage and mis90plus and less7ep and mis2basicplus then badcase = 29; else if nosexage and noday and less7ep and mis2basicplus then badcase = 28; else if nosexage and noday and mis90plus and mis2basicplus then badcase = 27; else if nosexage and noday and mis90plus and less7ep then badcase = 26; else if mis90plus and less7ep and mis2basicplus then badcase = 25; else if noday and less7ep and mis2basicplus then badcase = 24; else if noday and mis90plus and mis2basicplus then badcase = 23; else if noday and mis90plus and less7ep then badcase = 22; else if nosexage and mis2basicplus and less7ep then badcase = 21; else if nosexage and mis90plus and mis2basicplus then badcase = 20; else if nosexage and mis90plus and less7ep then badcase = 19; else if nosexage and noday and mis2basicplus then badcase = 18; else if nosexage and noday and less7ep then badcase = 17; else if nosexage and noday and mis90plus then badcase = 16; else if less7ep and mis2basicplus then badcase = 15; else if mis90plus and mis2basicplus then badcase = 14; else if mis90plus and less7ep then badcase = 13; else if noday and mis2basicplus then badcase = 12; else if noday and less7ep then badcase = 11; else if noday and mis90plus then badcase = 10; else if nosexage and mis2basicplus then badcase = 9; else if nosexage and less7ep then badcase = 8; else if nosexage and mis90plus then badcase = 7; else if nosexage and noday then badcase = 6; else if mis2basicplus then badcase = 5; else if less7ep then badcase = 4; else if mis90plus then badcase = 3; else if noday then badcase = 2; else if nosexage then badcase = 1; run; * List the distribution of badcases; proc sql; select badcase, count(*) from b2 group by 1; quit; * 1887 badcase=0; * Update badcase; proc sql; update mylib.hvar1 a set _badcase = (select badcase from b2 b where a.newno = b.newno) where newno in (select newno from b2) ; quit; proc sql; select _badcase, count(*) from mylib.hvar1 group by 1; quit; * ok; ** Calculate propwt **; * Check wta; proc sql; select newno, wta, _badcase from mylib.hvar1 where wta eq . or wta eq 0; quit; * one good case: newno=14071159505 wta=0; proc sql; create table w1 as ( select * from mylib.uk1995_orig where serialno eq 14071159505 ); quit; * looks like the assignment of wta=0 was triggered by q12=0 (# of adults); * Get the sex, age, marital status, and education of this case; proc sql; select newno, _sex, _age, _civstat, _cohab, _educa, _edtry, wta from mylib.hvar1 where newno eq 14071159505; quit; * Get the median of wta from a similar group; proc sql; create table w2 as ( select newno, _sex, _age, _civstat, _cohab, _educa, _edtry, wta from mylib.hvar1 where _sex eq 1 and _age eq 29 and _civstat eq 2 and _cohab eq -7 and _educa eq 3 and _edtry eq 3 ); quit; * 8 cases, median is wta=0.527909426; * Update the wta; proc sql; update mylib.hvar1 a set wta = 0.527909426 where newno eq 14071159505; quit; * Check weight distribution; proc means data=mylib.hvar1 n min mean max sum; var wta; class _badcase; run; * 1883 good cases (_badcase eq 0); * Make adjustment to the weight due to the identified bad cases, account for age, sex and employment status; data grp1; set mylib.hvar1 (keep= newno wta _age _sex _day _emp _badcase); if _age lt 18 and _sex eq 1 then agesexgp = 1; if _age gt 17 and _age lt 26 and _sex eq 1 and _emp eq 1 then agesexgp = 2; if _age gt 17 and _age lt 26 and _sex eq 1 and _emp lt 1 then agesexgp = 3; if _age gt 25 and _age lt 40 and _sex eq 1 and _emp eq 1 then agesexgp = 4; if _age gt 25 and _age lt 40 and _sex eq 1 and _emp lt 1 then agesexgp = 5; if _age gt 39 and _age lt 60 and _sex eq 1 and _emp eq 1 then agesexgp = 6; if _age gt 39 and _age lt 60 and _sex eq 1 and _emp lt 1 then agesexgp = 7; if _age gt 59 and _age lt 71 and _sex eq 1 then agesexgp = 8; if _age gt 70 and _sex eq 1 then agesexgp = 9; if _age lt 18 and _sex eq 2 then agesexgp = 10; if _age gt 17 and _age lt 26 and _sex eq 2 and _emp eq 1 then agesexgp = 11; if _age gt 17 and _age lt 26 and _sex eq 2 and _emp lt 1 then agesexgp = 12; if _age gt 25 and _age lt 40 and _sex eq 2 and _emp eq 1 then agesexgp = 13; if _age gt 25 and _age lt 40 and _sex eq 2 and _emp lt 1 then agesexgp = 14; if _age gt 39 and _age lt 60 and _sex eq 2 and _emp eq 1 then agesexgp = 15; if _age gt 39 and _age lt 60 and _sex eq 2 and _emp lt 1 then agesexgp = 16; if _age gt 59 and _age lt 71 and _sex eq 2 then agesexgp = 17; if _age gt 70 and _sex eq 2 then agesexgp = 18; if _badcase gt 0 then agesexgp = 99; * set the bad cases to zero weight; if agesexgp ne 99 then wt2 = wta; else if agesexgp eq 99 then wt2 = 0; run; proc sql; create table grpnu as ( select agesexgp, sum(wt2) as grouptot, count(*) as groupn from grp1 group by 1 ); quit; proc sql; create table daynu as ( select agesexgp, _day, sum(wt2) as daytot, count(*) as dayn from grp1 group by 1, 2 ); quit; proc sql; create table grp2 as ( select a.*, b.grouptot, b.groupn, c.daytot, c.dayn from grp1 a left join grpnu b on a.agesexgp = b.agesexgp left join daynu c on a.agesexgp = c.agesexgp and a._day = c._day ); quit; data grp3; set grp2; exptot = grouptot/7; if _badcase eq 0 then pwt = (exptot/daytot)/(dayn/daytot); else pwt = 0; run; data grp4; set grp3; newwt = pwt*(2005/1883); run; proc means data=grp4 n min mean max sum; var newwt; run; * something is not right, mean=0.9946613 and sum=1994.30; * NEED TO WORK ON...; * Update prompwt; proc sql; update mylib.hvar1 a set _propwt = (select newwt from grp4 b where a.newno = b.newno) where newno in (select newno from grp4); quit; * Clean up; proc datasets library=work; delete b1 b2 w1 w2 grpnu daynu grp1 grp2 grp3 grp4; quit; ******************************************************************* ****************** Harmonized variables for 6.0 ***************** ******************************************************************; ** Calculate the time variables **; proc transpose data=mylib.uk1995_mod(keep= newno _:) out=t1; by newno; run; * 192480; * Assign a date to make the time in sequence since the diary started at 4am; data t2 (rename = (col1=actv)); set t1; length ts $30; if length(compress(_name_)) eq 6 then ts = '01jan2009' || ':' || substr(_name_,2,2) || ':' || substr(_name_,5,2); else if length(compress(_name_)) eq 5 and (substr(compress(_name_),1,3) eq '_0_' or substr(compress(_name_),1,3) eq '_1_' or substr(compress(_name_),1,3) eq '_2_' or substr(compress(_name_),1,3) eq '_3_') then ts = '02jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2); else ts = '01jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2); ep_time = input(ts, datetime.); format ep_time datetime.; run; proc sort data=t2(keep= newno actv ep_time) out=t3; by newno ep_time; run; * Get episode count; data t4; set t3; by newno ep_time; retain; if first.newno then ep_cnt = 1; if first.newno ne 1 and actv ne lag(actv) then ep_cnt = ep_cnt + 1; run; proc sql; create table t5 as ( select newno, ep_cnt as _epnum, actv, min(ep_time) as ep_start, max(ep_time) as ep_end from t4 group by 1,2,3); quit; data t6; set t5; format ep_start ep_end datetime.; _clockst = compress(hour(ep_start)) || '.' || compress(minute(ep_start)); run; data t7; set t6; format begtime datetime.; if _n_ eq 1 then begtime = ep_start; retain; _start = hour(ep_start - begtime)*60 + minute(ep_start - begtime); if _epnum ne 1 then _start = _start + 1; _end = hour(ep_end - begtime)*60 + minute(ep_end - begtime) + 15; _time = _end - _start; if _epnum ne 1 then _time = _time + 1; run; * Assign the activity variables; data a1; set t7; select (actv); when (1) do; _av=16; _main=2; _sec=-9; _inout=1; _eloc=1; end; when (2) do; _av=13; _main=4; _sec=-9; _inout=1; _eloc=1; end; when (3) do; _av=6; _main=18; _sec=-9; _inout=1; _eloc=1; end; when (4) do; _av=15; _main=6; _sec=-9; _inout=1; _eloc=1; end; when (5) do; _av=11; _main=28; _sec=-9; _inout=1; _eloc=-8; end; when (6) do; _av=8; _main=32; _sec=-9; _inout=1; _eloc=-8; end; when (7) do; _av=17; _main=62; _sec=-9; _inout=3; _eloc=8; end; when (97) do; _av=5; _main=63; _sec=-9; _inout=3; _eloc=8; end; * computed to/from work/education; when (8) do; _av=1; _main=7; _sec=-9; _inout=-8; _eloc=3; end; when (9) do; _av=4; _main=15; _sec=-9; _inout=1; _eloc=4; end; when (10) do; _av=1; _main=12; _sec=-9; _inout=-8; _eloc=-8; end; when (11) do; _av=10; _main=24; _sec=-9; _inout=1; _eloc=5; end; when (12) do; _av=23; _main=33; _sec=-9; _inout=-8; _eloc=-8; end; when (13) do; _av=7; _main=20; _sec=-9; _inout=1; _eloc=1; end; when (14) do; _av=7; _main=21; _sec=-9; _inout=1; _eloc=1; end; when (15) do; _av=8; _main=22; _sec=-9; _inout=-8; _eloc=1; end; when (16) do; _av=9; _main=46; _sec=-9; _inout=2; _eloc=-8; end; when (17) do; _av=33; _main=16; _sec=-9; _inout=1; _eloc=1; end; when (18) do; _av=2; _main=8; _sec=-9; _inout=1; _eloc=1; end; when (19) do; _av=31; _main=59; _sec=-9; _inout=1; _eloc=1; end; when (20) do; _av=37; _main=49; _sec=-9; _inout=-8; _eloc=-8; end; when (21) do; _av=35; _main=56; _sec=-9; _inout=-8; _eloc=-8; end; when (22) do; _av=40; _main=54; _sec=-9; _inout=1; _eloc=-8; end; when (23) do; _av=28; _main=39; _sec=-9; _inout=1; _eloc=6; end; when (24) do; _av=29; _main=48; _sec=-9; _inout=1; _eloc=-8; end; when (25) do; _av=19; _main=42; _sec=-9; _inout=-8; _eloc=-8; end; when (26) do; _av=18; _main=35; _sec=-9; _inout=-8; _eloc=-8; end; when (27) do; _av=24; _main=37; _sec=-9; _inout=1; _eloc=-8; end; when (28) do; _av=23; _main=33; _sec=-9; _inout=-8; _eloc=-8; end; when (29) do; _av=36; _main=55; _sec=-9; _inout=-8; _eloc=1; end; when (30) do; _av=40; _main=51; _sec=-9; _inout=-8; _eloc=-8; end; when (31) do; _av=41; _main=69; _sec=-9; _inout=-8; _eloc=-8; end; when (91) do; _av=13; _main=1; _sec=-9; _inout=1; _eloc=1; end; * imputed personal/household care; when (93) do; _av=16; _main=3; _sec=-9; _inout=1; _eloc=1; end; * imputed sleep; when (962) do; _av=17; _main=62; _sec=-9; _inout=3; _eloc=8; end; * imputed travel; otherwise; end; run; * Create the rest of the variables; data mylib.hvar2; merge a1 (in=a) mylib.hvar1 (in=b keep= newno date _countreg _survey _swave _msamp _hldid _persid _id _day _month _year _diary _badcase _sex _age); by newno; if a; if date ne . then _cday = date; else _cday = -8; _mtrav = -9; _ict = -9; _alone = -9; _child = -9; _sppart = -9; _oad = -9; run; ******************************************************************* ****************** Quality checks ***************** ******************************************************************; * Run each manually to check; ** Step 1: data distribution **; * badcase; proc sql; select _badcase, count(*) from mylib.hvar1 group by 1; quit; ** Step 2: propwt **; * propwt; proc sql; select sum(_propwt) from mylib.hvar1; quit; * 1994.296; * each day of the week should appear in equal proportion (14.286%); proc freq data=mylib.hvar1; tables _day; run; * ranging from 12.72% to 17.16%; * sex and age group; data tmp; set mylib.hvar1 (keep= newno _sex _age _propwt); if _age le 18 then agegrp = 1; else if _age le 25 then agegrp = 2; else if _age le 40 then agegrp = 3; else if _age le 60 then agegrp = 4; else if _age le 70 then agegrp = 5; else if _age gt 70 then agegrp = 6; else agegrp = 0; run; proc sql; select _sex, agegrp, count(*) from tmp group by 1,2; quit; * Both male and female have higher numbers in age groups 3 & 4, female has higher numbers in age group 5 & 6 then male; ** Step 3: cross checking data **; * Means of age, hhldsize and nchild by categories of famstat; proc means data=mylib.hvar1 n min mean max; var _age _hhldsize _nchild; class _famstat; run; * ok; * Median of age by retired; proc means data=mylib.hvar1 n min mean median max; var _age; class _retired; run; * ok, the median age of retired is 71 and the median age of non-retired is 40; * nchild is less then hhldsize; proc sql; select _hhldsize, _nchild, count(*) from mylib.hvar1 where _hhldsize lt _nchild group by 1,2; quit; * 0; ** Cross-tabs of famstat by nchild, hhtype, agekidx, civstat and cphome **; * - most should be cphome=1 if famstat=4 - famstat=5 should always map to cphome=0; proc sql; select _famstat, _cphome, count(*) from mylib.hvar1 group by 1,2; quit; * all 32 famstat=4 has cphome=1; * no famstat=5 cases; * famstat=1,2,4 should not match to hhtype=1,2; proc sql; select _famstat, _hhtype, count(*) from mylib.hvar1 group by 1,2; quit; * 12055279505: famstat=1 and hhtype=2, this woman is a single parent, and lives in a 2 person household, but also is listed as married 15081119505: famstat=2 and hhtype=2, this is a married man in a couple with a child; * fix these 2 cases; proc sql; update mylib.hvar1 set _hhtype=4 where newno eq 12055279505; * other household types; update mylib.hvar1 set _civstat=2 where newno eq 12055279505; * as she does not live with a partner; update mylib.hvar1 set _cohab=-7 where newno eq 12055279505; * again, she is not living with a partner; update mylib.hvar1 set _hhtype=3 where newno eq 15081119505; * couple + others; update mylib.hvar1 set _hhldsize=3 where newno eq 15081119505; quit; * - famstat=0,3 should match to agekidx=-7 or 4 & nchild=0 - other famstat values should have no agekidx=-7 or 4 or nchild=0 values - famstat=1 should match to agekidx=1 cases only - famstat=2 should match to agekidx=2,3 cases only - famstat=4,5 should match to agekidx=1,2 or 3; proc sql; select _famstat, _agekidx, _nchild, count(*) from mylib.hvar1 group by 1,2,3; quit; * ok; ** Cross-tab agekidx by nchild **; * - nchild=0 should match to all cases agekidx=-7 or agekidx=4 - no positive cases of nchild should have agekidx=-7 or agekidx=4 - no cases of agekidx>-7 and <4 should match to nchild=0; proc sql; select _agekidx, _nchild, count(*) from mylib.hvar1 group by 1,2; quit; * ok; ** Cross-tabs empstat by emp, unemp, student, retired disab **; * emp=1 should correspond only with empstat=1,2,3; proc sql; select _emp, _empstat, count(*) from mylib.hvar1 group by 1,2; quit; * Most people coded as unemployed should not be working, more students and retired should be empstat=4, but some students and retired people should be coded as working; proc sql; select _student, _retired, _empstat, _emp, _unemp, count(*) from mylib.hvar1 group by 1,2,3,4,5; quit; * ok; * Check cases with unemployment but got work hours; proc sql; create table t1 as ( select * from mylib.mod5 where newno in (select newno from mylib.hvar1 where _unemp eq 1 and _av1 gt 1) ); quit; * 2 cases have unknown break time, assign empstat as unknown job hours; * 9039279505 and 11048169505; proc sql; update mylib.hvar1 set _empstat=3 where newno eq 9039279505 or newno eq 11048169505; quit; ** Cross-tabs with empsp and partid by civstat and relrefp **; * People not in couples should have values of -7 for empsp and partid; * no data to check; * relrep=2 (spouse of reference person) and civstat=1 should not match with empsp or partid=-7; * no data to check; ** Cross-tab civstat by hhtype relrefp partid cohab cphome **; * - no hhtype=2 should match with civstat=2 - few civstat=1 should match to hhtype=1 (single person household); proc sql; select _hhtype, _civstat, count(*) from mylib.hvar1 group by 1,2; quit; * ok; * Diarists with relrefp values of 2 (spouse of reference person) should have a civstat value of 1 (in couple); proc sql; select _relrefp, _civstat, count(*) from mylib.hvar1 group by 1,2; quit; * ok; * All people with cohab=1 or cohab=0 should have a civstat=1; proc sql; select _cohab, _civstat, count(*) from mylib.hvar1 group by 1,2; quit; * ok; * All people with cphome=1 should be civstat=2; proc sql; select _cphome, _civstat, count(*) from mylib.hvar1 group by 1,2; quit; * ok; ** Cross-tab hhtype by relrefp **; * - hhtype=1 only should correspond with relrefp=1 - hhtype=2 should correspond with relrefp values of 1 and 2 - hhtypes 3 and 4 should correspond with all relrefp values; proc sql; select _hhtype, _relrefp, count(*) from mylib.hvar1 group by 1,2; quit; * ok; * not all relrefp values because this survey does not have sibling or in-laws data; ** Compute a summary of the total minutes recorded in the diary, should equal 1440 **; * summing the total time by adding up all AV variables that can be computed for the survey; data tmp; set mylib.hvar1 (keep= newno _av:); retain; array ct{41} _av:; actvct = 0; do i = 1 to 41; if ct{i} ne -9 then actvct = actvct + ct{i}; end; run; proc sql; select count(*) from tmp where actvct ne 1440; quit; * 0; * summing the total time by adding up all MAIN variables that can be computed for the survey; data tmp; set mylib.hvar1 (keep= newno _main:); retain; array ct{69} _main:; actvct = 0; do i = 1 to 69; if ct{i} ne -9 then actvct = actvct + ct{i}; end; run; proc sql; select count(*) from tmp where actvct ne 1440; quit; * 0; * summing the total time by adding up all time variables for each diary in the World 6 file; proc sql; select count(*) from (select newno, sum(_time) as tmct from mylib.hvar2 group by 1) a where a.tmct ne 1440; quit; * 0; ** Step 4: time distributions in the basic activities by days of week **; * av1 is paid work, av2 is paid work at home, av4 is school, and av16 is sleep; proc sql; select _day, sum(_av1) as wk, sum(_av2) as wkhm, sum(_av4) as sch, sum(_av16) as slp, count(*) from mylib.hvar1 group by 1; quit; * looks about right; * Step 5: min and max values **; proc means data=mylib.hvar1 n min mean max; var _hhldsize _ownhome _nchild _agekidx _income _vehicle; class _hhtype; run; * ok, no discrepancies; ** Step 6: check one person household and its relrefp **; proc sql; select _hhtype, _relrefp, count(*) from mylib.hvar1 where _hhtype eq 1 group by 1,2; quit; ******************************************************************* ************************ Create datasets ************************ ******************************************************************; ** Create the 5.53 dataset **; * Read the dataset format into memory; proc format cntlin = mylib.MTUSschemaW553; run; * Create the dataset; data mylib.UK1995W553 (keep= countreg survey swave msamp hldid persid id parntid1 parntid2 partid day month year diary badcase hhtype hhldsize nchild agekidx agekid2 incorig income ownhome urban computer vehicle sex age famstat cphome singpar relrefp civstat cohab citizen empstat emp unemp student retired empsp workhrs empinclm occup sector educa edtry rushed health carer disab av1 av2 av3 av4 av5 av6 av7 av8 av9 av10 av11 av12 av13 av14 av15 av16 av17 av18 av19 av20 av21 av22 av23 av24 av25 av26 av27 av28 av29 av30 av31 av32 av33 av34 av35 av36 av37 av38 av39 av40 av41 ocombwt propwt); set mylib.schema553 mylib.hvar1; countreg = _countreg; *country = _country; survey = _survey; swave = _swave; msamp = _msamp; hldid = _hldid; persid = _persid; id = _id; parntid1 = _parntid1; parntid2 = _parntid2; partid = _partid; day = _day; month = _month; year = _year; diary = _diary; badcase = _badcase; hhtype = _hhtype; hhldsize = _hhldsize; nchild = _nchild; agekidx = _agekidx; agekid2 = _agekid2; incorig = _incorig; income = _income; ownhome = _ownhome; urban = _urban; computer = _computer; vehicle = _vehicle; sex = _sex; age = _age; famstat = _famstat; cphome = _cphome; singpar = _singpar; relrefp = _relrefp; civstat = _civstat; cohab = _cohab; citizen = _citizen; empstat = _empstat; emp = _emp; unemp = _unemp; student = _student; retired = _retired; empsp = _empsp; workhrs = _workhrs; empinclm = _empinclm; occup = _occup; sector = _sector; educa = _educa; edtry = _edtry; rushed = _rushed; health = _health; carer = _carer; disab = _disab; av1 = _av1; av2 = _av2; av3 = _av3; av4 = _av4; av5 = _av5; av6 = _av6; av7 = _av7; av8 = _av8; av9 = _av9; av10 = _av10; av11 = _av11; av12 = _av12; av13 = _av13; av14 = _av14; av15 = _av15; av16 = _av16; av17 = _av17; av18 = _av18; av19 = _av19; av20 = _av20; av21 = _av21; av22 = _av22; av23 = _av23; av24 = _av24; av25 = _av25; av26 = _av26; av27 = _av27; av28 = _av28; av29 = _av29; av30 = _av30; av31 = _av31; av32 = _av32; av33 = _av33; av34 = _av34; av35 = _av35; av36 = _av36; av37 = _av37; av38 = _av38; av39 = _av39; av40 = _av40; av41 = _av41; ocombwt = _ocombwt; propwt = _propwt; run; * Remove the dummy row carried over from the schema; data mylib.UK1995W553; set mylib.UK1995W553; if countreg eq . then delete; run; * Remove the formats for viewing the actual values; data v1; set mylib.UK1995W553; format _all_; run; ** Create the 5.8 datasets **; * Read the dataset format into memory; proc format cntlin = mylib.MTUSschemaW58; run; * Create the general dataset; data set58 (keep= countreg survey swave msamp hldid persid id parntid1 parntid2 partid day month year diary badcase hhtype hhldsize nchild agekidx agekid2 incorig income ownhome urban computer vehicle sex age famstat cphome singpar relrefp civstat cohab citizen empstat emp unemp student retired empsp workhrs empinclm occup sector educa edtry rushed health carer disab av1 av2 av3 av4 av5 av6 av7 av8 av9 av10 av11 av12 av13 av14 av15 av16 av17 av18 av19 av20 av21 av22 av23 av24 av25 av26 av27 av28 av29 av30 av31 av32 av33 av34 av35 av36 av37 av38 av39 av40 av41 ocombwt propwt main1 main2 main3 main4 main5 main6 main7 main8 main9 main10 main11 main12 main13 main14 main15 main16 main17 main18 main19 main20 main21 main22 main23 main24 main25 main26 main27 main28 main29 main30 main31 main32 main33 main34 main35 main36 main37 main38 main39 main40 main41 main42 main43 main44 main45 main46 main47 main48 main49 main50 main51 main52 main53 main54 main55 main56 main57 main58 main59 main60 main61 main62 main63 main64 main65 main66 main67 main68 main69 sppart); set mylib.schema58 mylib.hvar1; countreg = _countreg; *country = _country; survey = _survey; swave = _swave; msamp = _msamp; hldid = _hldid; persid = _persid; id = _id; parntid1 = _parntid1; parntid2 = _parntid2; partid = _partid; day = _day; month = _month; year = _year; diary = _diary; badcase = _badcase; hhtype = _hhtype; hhldsize = _hhldsize; nchild = _nchild; agekidx = _agekidx; agekid2 = _agekid2; incorig = _incorig; income = _income; ownhome = _ownhome; urban = _urban; computer = _computer; vehicle = _vehicle; sex = _sex; age = _age; famstat = _famstat; cphome = _cphome; singpar = _singpar; relrefp = _relrefp; civstat = _civstat; cohab = _cohab; citizen = _citizen; empstat = _empstat; emp = _emp; unemp = _unemp; student = _student; retired = _retired; empsp = _empsp; workhrs = _workhrs; empinclm = _empinclm; occup = _occup; sector = _sector; educa = _educa; edtry = _edtry; rushed = _rushed; health = _health; carer = _carer; disab = _disab; av1 = _av1; av2 = _av2; av3 = _av3; av4 = _av4; av5 = _av5; av6 = _av6; av7 = _av7; av8 = _av8; av9 = _av9; av10 = _av10; av11 = _av11; av12 = _av12; av13 = _av13; av14 = _av14; av15 = _av15; av16 = _av16; av17 = _av17; av18 = _av18; av19 = _av19; av20 = _av20; av21 = _av21; av22 = _av22; av23 = _av23; av24 = _av24; av25 = _av25; av26 = _av26; av27 = _av27; av28 = _av28; av29 = _av29; av30 = _av30; av31 = _av31; av32 = _av32; av33 = _av33; av34 = _av34; av35 = _av35; av36 = _av36; av37 = _av37; av38 = _av38; av39 = _av39; av40 = _av40; av41 = _av41; ocombwt = _ocombwt; propwt = _propwt; main1 = _main1; main2 = _main2; main3 = _main3; main4 = _main4; main5 = _main5; main6 = _main6; main7 = _main7; main8 = _main8; main9 = _main9; main10 = _main10; main11 = _main11; main12 = _main12; main13 = _main13; main14 = _main14; main15 = _main15; main16 = _main16; main17 = _main17; main18 = _main18; main19 = _main19; main20 = _main20; main21 = _main21; main22 = _main22; main23 = _main23; main24 = _main24; main25 = _main25; main26 = _main26; main27 = _main27; main28 = _main28; main29 = _main29; main30 = _main30; main31 = _main31; main32 = _main32; main33 = _main33; main34 = _main34; main35 = _main35; main36 = _main36; main37 = _main37; main38 = _main38; main39 = _main39; main40 = _main40; main41 = _main41; main42 = _main42; main43 = _main43; main44 = _main44; main45 = _main45; main46 = _main46; main47 = _main47; main48 = _main48; main49 = _main49; main50 = _main50; main51 = _main51; main52 = _main52; main53 = _main53; main54 = _main54; main55 = _main55; main56 = _main56; main57 = _main57; main58 = _main58; main59 = _main59; main60 = _main60; main61 = _main61; main62 = _main62; main63 = _main63; main64 = _main64; main65 = _main65; main66 = _main66; main67 = _main67; main68 = _main68; main69 = _main69; sppart = _sppart; run; * Create the adult and kid datasets; data mylib.UK1995W58 mylib.UK1995W58kid; set set58; if countreg eq . then delete; * remove the dummy row carried over from the schema; if age ge 1 and age le 17 then output mylib.UK1995W58kid; else output mylib.UK1995W58; run; * Remove the formats for viewing the actual values; data v2; set mylib.UK1995W58; format _all_; run; data v3; set mylib.UK1995W58kid; format _all_; run; ** Create the 6.0 datasets **; * Read the dataset format into memory; proc format cntlin = mylib.MTUSschemaW60; run; * Create the general dataset; data set60 (keep= countreg survey swave msamp hldid persid id day cday month year diary badcase sex age time clockst start end epnum main sec av inout eloc mtrav ict alone child sppart oad); set mylib.schema60 mylib.hvar2; countreg = _countreg; survey = _survey; swave = _swave; msamp = _msamp; hldid = _hldid; persid = _persid; id = _id; day = _day; cday = _cday; month = _month; year = _year; diary = _diary; badcase = _badcase; sex = _sex; age = _age; time = _time; clockst = _clockst; start = _start; end = _end; epnum = _epnum; main = _main; sec = _sec; av = _av; inout = _inout; eloc = _eloc; mtrav = _mtrav; ict = _ict; alone = _alone; child = _child; sppart = _sppart; oad = _oad; run; * Create the adult and kid datasets; data mylib.UK1995W60 mylib.UK1995W60kid; set set60; if countreg eq . then delete; * remove the dummy row carried over from the schema; if age ge 1 and age le 17 then output mylib.UK1995W60kid; else output mylib.UK1995W60; run; * Remove the formats for viewing the actual values; data v4; set mylib.UK1995W60; format _all_; run; data v5; set mylib.UK1995W60kid; format _all_; run; ** Create the UK Supplement dataset **; * Read the dataset formats into memory; proc format cntlin = mylib.MTUSschemaUKSup; run; * Create the dataset; data mylib.UK1995extravar (keep = survey hldid persid id sexd aged incolab educalab empinlab aglftsch ethnic region animal); set mylib.schemaUKSup mylib.hvar1; survey = _survey; hldid = _hldid; persid = _persid; id = _id; sexd = _sex; aged = _age; incolab = _incorig; educalab = _educa; empinlab = _empinclm; aglftsch = _aglftsch; ethnic = _ethnic; region = _region; animal = -9; run; * Remove the dummy row carried over from the schema; data mylib.UK1995extravar; set mylib.UK1995extravar; if survey eq . then delete; run; * Remove the formats for viewing the actual values; data v5; set mylib.UK1995extravar; format _all_; run; * Clean up the temp datasets; /* proc datasets library=mylib; delete mod1 uk1995_mod hvar1 hvar2; quit; */ * eof;