/******************************************************************
 * SAS program to create the MTUS World 5.53, 5.8, and 6.0 datasets
 * for the UK 1995 time diary survey,
 * written by Oiching Yeung and Aaron Lai,
 * Centre for Time Use Research, University of Oxford,
 * September 2009
 ******************************************************************/


********** Assign a library **********;
libname mylib 'C:\personal\MTUS\UK1995';
%let libdir=C:\personal\MTUS\UK1995;


*******************************************************************
***********************     Data preparation     ******************
******************************************************************;

** Check serialno **;

* Check if serialno is unique, total number of rows is 2005;
proc sql;
  select count(distinct serialno)
  from mylib.uk1995_orig;
quit;
* 1976, duplicates found;

* Check if serialno is blank;
proc sql;
  select count(*)
  from mylib.uk1995_orig
  where serialno eq .;
quit;
* 27 rows have a blank serialno;

* Look at the content of the rows with a blank serialno;
data s1;
  set mylib.uk1995_orig;
	if serialno eq .;
run;
* their tot1 (total sleep time) is 1440 but the profile info is available,
  assign a new number to them;

* Get the largest serialno;
proc sql;
  select max(serialno) format z12.
  from mylib.uk1995_orig;
quit;
* 21100309505;

* Check for missing age and sex;
proc sql;
  select count(*)
  from mylib.uk1995_orig
	where serialno eq .
	and (q112 eq . or q113 eq .);
quit;
* 0, ok;

* Assign new numbers, use a hardcoded number plus sex*100 plus age; 
data s2;
  set mylib.uk1995_orig;
	if serialno eq . then serialno = 22000000000 + q112*100 + q113;
run;

* Check again if serialno is unique;
proc sql;
  select count(distinct serialno)
  from s2;
quit;
* 2003: 2 duplicates;

* List the duplicates;
proc sql;
  select serialno
  from (select serialno, count(*) as cnt
        from s2
        group by 1)
  where cnt gt 1;
quit;
* 5016069505
  16088249505;

* Add 1 to the serialno for each duplicate;

proc sort data=s2 out=s2; 
  by serialno region area address;
run;

data s3 (drop = lstno);
  set s2;
  by serialno region area address;
  format newno best12.;
  retain lstno newno;
  lstno = newno;
  if last.address then newno = serialno;
  if last.address and serialno eq lstno then newno = serialno + 1;
run;

* List the modified duplicates;
proc sql; 
  select serialno, newno 
  from s3
  where serialno ne newno;
quit;

* Check again if serialno is unique;
proc sql;
  select count(distinct newno)
  from s3;
quit;
* 2005;

* Create a temp modified data set for further work;
data mylib.mod1; set s3; run;

* Clean up;
proc datasets library=work; delete s1 s2 s3; quit;


** Accuracy check of the date variables **;

proc sql;
  select month, count(*)
  from mylib.mod1
  group by 1;
quit;
* 3 cases of blank, 1961 cases month=5, and 41 cases of month=6;

* assign all months to 5;
proc sql;
  update mylib.mod1 set month = 5 where month eq . or month eq 6;
quit;

proc sql;
  select count(*)
  from mylib.mod1
  where date eq .;
quit;
* 41 is missing;

proc sql;
  select count(*)
  from mylib.mod1
  where day eq .;
quit;
* 3 is missing;

* Compare date/day to desigday;
data d1;
  set mylib.mod1 (keep = newno date desigday day month);
	select (date);
	  when (7,14,21,28) wdate = 7;
		when (1,8,15,22,29) wdate = 1;
		when (2,9,16,23,30) wdate = 2;
		when (3,10,17,24,31) wdate = 3;
		when (4,11,18,25) wdate = 4;
		when (5,12,19,26) wdate = 5;
		when (6,13,20,27) wdate = 6;	
		otherwise;
	end;
	select (day);
	  when (7,14,21,28) wday = 7;
		when (1,8,15,22,29) wday = 1;
		when (2,9,16,23,30) wday = 2;
		when (3,10,17,24,31) wday = 3;
		when (4,11,18,25) wday = 4;
		when (5,12,19,26) wday = 5;
		when (6,13,20,27) wday = 6;		
		otherwise;
	end;
run;

proc sql;
  select count(*)
  from d1
  where wdate ne desigday;
quit;
* 137;

proc sql;
  select count(*)
  from d1
  where wday ne desigday;
quit;
* 1982, day does not correspond with desigday well, no good to use;

proc sql;
  select desigday, count(*)
  from d1
  group by 1;
quit;
proc sql;
  select wdate, count(*)
  from d1
  group by 1;
quit;
* pretty even distribution;

* create a new desigday variable with the corrections
  if wdate (based on date) is blank (meaning blank date), then use the original desigday;
data d2;
  set d1 (drop = day wday);
  if wdate eq . then newdesigday = desigday;
	else newdesigday = wdate;
run;

proc sql;
  select newdesigday, count(*)
  from d2
  group by 1;
quit;
* even distribution;

* Update desigday;
proc sql;
  update mylib.mod1 a
	set desigday = (select newdesigday
	                from d2 b
							    where a.newno = b.newno)
  where newno in (select newno from d2);
quit;

proc sql;
  select desigday, count(*)
  from mylib.mod1
  group by 1;
quit;
* matched;

* Clean up;
proc datasets library=work; delete d1 d2; quit;


** Check if the adult, child, and household variables synchronized **;

proc sql;
  create table h1 as (
  select newno, q16 as q16_tot, q12 as q12_adu, q13, q14, q15, q113 as q113_age,
    case when q17 eq 1 then '(1) 1 adu 16-64'
         when q17 eq 2 then '(2) 1 adu 65+  '
			   when q17 eq 3 then '(3) 2 adu 16-64'
			   when q17 eq 4 then '(4) 2 adu 1 65+'
			   when q17 eq 5 then '(5) 3 adu ages '
			   when q17 eq 6 then '(6) 1-2 child  '
			   when q17 eq 7 then '(7) 3+ child   '
         else '.' end as q17_hhtypA,
    case when q18 eq 1 then '(1) one person only'
	       when q18 eq 2 then '(2) mar/coh child+ '
		  	 when q18 eq 3 then '(3) mar/coh 0 child'
			   when q18 eq 4 then '(4) lone par child+'
			   when q18 eq 5 then '(5) all others     '
         else '.' end as q18_hhtypB,
    case when q114 eq 1 then '(1) HOH            '
	       when q114 eq 2 then '(2) HOH partner    '
			   when q114 eq 3 then '(3) HOH/ptnr child '
			   when q114 eq 4 then '(4) HOH/ptnr par   '
			   when q114 eq 5 then '(5) Other relation '
			   when q114 eq 6 then '(6) Non related adu'
         else '.' end as q114_relHH, 
    case when q115 eq 1 then '(1) Married   '
	       when q115 eq 2 then '(2) Cohabiting'
			   when q115 eq 3 then '(3) Single    ' 
			   when q115 eq 4 then '(4) Widowed   '
		  	 when q115 eq 5 then '(5) Divorced  '  
			   when q115 eq 6 then '(6) Separated '
	       else '.' end as q115_marstat,
    tot5, count(*)
  from mylib.mod1
  where sum(q12, q13, q14, q15) ne q16 
	  or (sum(q13, q14, q15) eq 0 and (q18 eq 2 or q18 eq 4))
	  or (sum(q13, q14, q15) ge 1 and q18 eq 3)
	  or ((q17 eq 1 or q17 eq 2) and q12 lt 1)
	  or ((q17 eq 3 or q17 eq 4) and q12 lt 2)
  	or (q17 eq 5 and q12 lt 3)
	  or (q17 eq 6 and sum(q13, q14, q15) eq 0)
	  or (q17 eq 7 and sum(q13, q14, q15) lt 3)
    or (q18 eq 1 and q16 ge 2)
  group by 1,2,3,4,5,6,7,8,9,10,11,12);
quit;
* 81 rows;

* The following cases can be updated with some assumptions;
proc sql;
  update mylib.mod1 set q12 = 1 where newno eq 14071159505;
	update mylib.mod1 set q12 = 2 where newno eq 8038309505
                                   or newno eq 9041149505
                                   or newno eq 14073279505
                                   or newno eq 16089279505;
  update mylib.mod1 set q16 = 2 where newno eq 8035149505
                                   or newno eq 9041149505
                                   or newno eq 12055279505
                                   or newno eq 16087219505;
	update mylib.mod1 set q16 = 3 where newno eq 6026179505
                                   or newno eq 9039129505
                                   or newno eq 15081259505                                         
                                   or newno eq 21099199505;
	update mylib.mod1 set q16 = 4 where newno eq 5022279505
                                   or newno eq 8038309505
                                   or newno eq 21099279505;
  update mylib.mod1 set q16 = 5 where newno eq 2006249505;
	update mylib.mod1 set q16 = 6 where newno eq 7032259505;
  update mylib.mod1 set q17 = 1 where newno eq 3007169505 
                                   or newno eq 5017239505
                                   or newno eq 16089299505
                                   or newno eq 21098169505;
  update mylib.mod1 set q17 = 2 where newno eq 13063229505
                                   or newno eq 17090189505;
	update mylib.mod1 set q17 = 3 where newno eq 5018219505;
	update mylib.mod1 set q18 = 2 where newno eq 6024239505
                                   or newno eq 7031289505
                                   or newno eq 12054209505
                                   or newno eq 13068029505
																	 or newno eq 13068159505
																	 or newno eq 15078049505;
  update mylib.mod1 set q18 = 3 where newno eq 12055139505                                     
                                   or newno eq 13068089505
                                   or newno eq 13068199505
																	 or newno eq 14071199505
																	 or newno eq 14071199505;
  update mylib.mod1 set q18 = 4 where newno eq 2005239505
                                   or newno eq 3008239505 
                                   or newno eq 5017149505
                                   or newno eq 9039129505;
quit;

* Clean up;
proc datasets library=work; delete h1; quit;


** Look at the distribution of q73 to determine the income groups **;

data i1;
  set mylib.mod1 (keep= newno q73);
	if q73 eq 98 then q73a = 1;
	else if q73 eq 99 then q73a = .;
	else q73a = q73;
run;

proc freq data=i1;
  tables q73a;
run;
* determined that values
    1-4 => group 1 (lowest 25%)
    5-13 => group 2 (middle 50%)
    14-21 => group 3 (highest 25%);

* Clean up;
proc datasets library=work; delete i1; quit;


** Check if q111 and tenure synchronized **;
proc sql;
  select q111, tenure, count(*)
  from mylib.mod1
  group by 1,2;
quit;
* looks like tenure is more precise as q111=2 can be either rent or mortgage shared;


** Check if q116 and lefted synchronized for student **;
proc sql;
  select q116, lefted, count(*)
  from mylib.mod1
	where q116 eq 97 or lefted eq 5
  group by 1,2;
quit;
* ok;

** Check if q130 and cars are synchronized **;
proc sql;
  select q130, cars, count(*)
  from mylib.mod1
  group by 1,2;
quit;
* looks like cars is more precise;


** Check if emp, q118, q125, and q122 synchronized **;
proc sql;
create table e1 as (
  select
    case when emp eq 1 then 'full time' 
		     when emp eq 2 then 'part time'
				 when emp eq 3 then 'unemploy '
				 when emp eq 4 then 'inactive '
    else '.' end as emp_1,
		case when q118 eq 1 then 'full time' 
		     when q118 eq 2 then 'part time'
				 when q118 eq 3 then 'part time'
    else '.' end as q118_1,
		case when q125 eq 1 then 'employed' 
		     when q125 eq 2 then 'self-emp'
    else '.' end as q125_1,
		case when q122 eq 1 then 'yes in-paid emp' 
		     when q122 eq 2 then 'no in-paid emp '
    else '.' end as q122_1,
    count(*)
	from mylib.mod1
	group by 1,2,3,4);
quit;
* not very synchronized, seems emp is more uniformed and has less missing values;

** Check the employment when q119=blank **;
proc sql;
create table e2 as (
  select
    case when emp eq 1 then 'full time' 
		     when emp eq 2 then 'part time'
				 when emp eq 3 then 'unemploy '
				 when emp eq 4 then 'inactive '
    else '.' end as emp_1,
		case when q118 eq 1 then 'full time' 
		     when q118 eq 2 then 'part time'
				 when q118 eq 3 then 'part time'
    else '.' end as q118_1,
		case when q125 eq 1 then 'employed' 
		     when q125 eq 2 then 'self-emp'
    else '.' end as q125_1,
		case when q122 eq 1 then 'yes in-paid emp' 
		     when q122 eq 2 then 'no in-paid emp '
    else '.' end as q122_1, q113,
    count(*)
	from mylib.mod1
	where q119 eq .
	group by 1,2,3,4,5);
quit;
* 99% of the q119=blank are either employed/selfemployed,
  if their age is 65+, then set retired=1, else set to retired=0;

* Clean up;
proc datasets library=work; delete e1 e2; quit;


** Check the distribution of sex and age **;
proc sql;
  select q112, q113, count(*)
  from mylib.mod1
  group by 1,2;
quit;


** Check the values of the activity time slot variables **;

data a1;
  set mylib.mod1(keep= newno _:);
run;

proc transpose data=a1 out=a2;
  by newno;
run;

proc sql;
  select col1, count(*)
	from a2
	group by 1;
quit;
* other than activity code 1-31 (31 is probably the code for missing), 
  found 1 case of 12.8, 1 case of 13.2;

proc sql;
  select *
	from a2
	where col1 eq 12.8 or col1 eq 13.2;
quit;
* newno=14070219505 _12_15=13.2 _12_30=12.8;

data a3;
  set mylib.mod1 (keep= newno _: tot: daytot);
	if newno eq 14070219505;
run;
* from the types of activity before and after these time slots,
  these bad activities can be reassign to activity code 13,
  note that due to this problem, daytot=1410(although this variable may not be used),
  need to add 30 mins to tot13 and daytot;

* Update the variables;
proc sql;
  update mylib.mod1 set _12_15 = 13 where newno eq 14070219505;
  update mylib.mod1 set _12_30 = 13 where newno eq 14070219505;
	update mylib.mod1 set tot13 = tot13 + 30 where newno eq 14070219505;
	update mylib.mod1 set daytot = daytot + 30 where newno eq 14070219505;
quit;

* Clean up;
proc datasets library=work; delete a1 a2 a3; quit;


** Check the accuracy of the total activity time variables **;

* Sum up total activity time from the time slot variables;

data ta1;
  set mylib.mod1(keep= newno _:);
run;

proc transpose data=ta1 out=ta2;
  by newno;
run;

proc sql;
create table ta3 as (
  select newno, col1 as actv, count(*) as actv_cnt
	from ta2
	group by 1,2);
quit;

data ta4;
  set ta3;
	actv_tot = actv_cnt * 15;
run;

* Get the total activity time variables;

data tt1;
  set mylib.mod1(keep= newno tot:
                 drop= tot);
run;

proc transpose data=tt1 out=tt2;
  by newno;
run;

data tt3;
  set tt2;
	actv = substr(_name_,4,length(_name_)-3) * 1;
	rename col1 = actv_tot;
run;

* Compare the values;
proc sql;
create table c1 as (
  select a.*, b.actv_tot as tt_tot
	from ta4 a
	join tt3 b
	on a.newno = b.newno
	where a.actv eq b.actv
	and a.actv_tot ne b.actv_tot);
quit;
* 0, tot* variables are ok to use;

* Clean up;
proc datasets library=work; delete ta1 ta2 ta3 ta4 tt1 tt2 tt3 c1; quit;


** Calculate imputed personal/household care and imputed sleep **;

data i1 (drop= tot31);
  set mylib.mod1(keep= newno _: tot31);
	if tot31 gt 0;
run;

proc transpose data=i1 out=i2;
  by newno;
run;

* Assign a date to make the time in sequence since the diary started at 4am;
data i3 (rename = (col1=actv));
  set i2;
	length ts $30;

	if length(compress(_name_)) eq 6 then
    ts = '01jan2009' || ':' || substr(_name_,2,2) || ':' || substr(_name_,5,2);
	else if length(compress(_name_)) eq 5 
          and (substr(compress(_name_),1,3) eq '_0_'
               or substr(compress(_name_),1,3) eq '_1_'
					     or substr(compress(_name_),1,3) eq '_2_'
							 or substr(compress(_name_),1,3) eq '_3_') then
		ts = '02jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2);
	else ts = '01jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2);

	ep_time = input(ts, datetime.);
	format ep_time datetime.; 
run;

proc sort data=i3(keep= newno _name_ actv ep_time) out=t_input;
  by newno ep_time;
run;

* Test the missing gap;

data t_test_missing t_adjustor(drop = actv ep_time);
	set t_input;
	by newno ep_time;

	retain;

	* 4 variables to denote the pre- and post-missing time and activities;
	format start_miss_ts datetime13.;
	format end_miss_ts datetime13.;
	format lag_ts datetime13.;

	lag_ts = lag(ep_time);

	* reset for different resp;
	if first.newno then do;
		start_miss_ts = .;
		end_miss_ts = .;
		pre_actv = .;
		post_actv = .;
		lactv = .;
		lag_ts = .;
	end;

	lactv = lag(actv);

	drop lactv lag_ts;

	* check for something then missing;
	if lactv ne 31 and actv eq 31 then do;
		start_miss_ts = ep_time;
		pre_actv = lactv;

		end_miss_ts = .;
		post_actv = .;
	end;

	length startkey $70;
	length endkey $70;

	* check for missing then something;
	if lactv eq 31 and actv ne 31 then do;
		end_miss_ts = lag_ts;
		post_actv = actv;

		startkey = catt(newno,start_miss_ts);
		endkey = catt(newno,end_miss_ts);
		if (end_miss_ts - start_miss_ts) le 60*60*2 then output t_adjustor;
	end;

	startkey = catt(newno,start_miss_ts);
	endkey = catt(newno,end_miss_ts);
	output t_test_missing;
run;

data t_test;
	set t_input;

	imputed_actv = .;

	length key $70;
	key = catt(newno,ep_time);
run;

* Imputing data case by case;

* case: sleep -> missing -> paidwork
  imputed personal care;
proc sql;
	* imputted personal care;
	update t_test set imputed_actv = 3 
	where key in (select 
		startkey
		from t_adjustor
		where pre_actv = 1 and post_actv lt 3
		)
	;

	update t_test set imputed_actv = 1 
	where key in (select 
		startkey
		from t_adjustor
		where pre_actv = 1 and post_actv gt 2
		)
	;

	* home to travel;
	update t_test set imputed_actv = 62 
	where key in (select 
		startkey
		from t_adjustor b
		where (pre_actv lt 7 or pre_actv between 13 and 21) and
			(post_actv between 8 and 12 or post_actv = 23 or post_actv between 26 and 28)
			and (end_miss_ts - start_miss_ts) lt 2*15*60
		)
	;

	update t_test set imputed_actv = 1 
	where key in (select 
		startkey
		from t_adjustor b
		where (pre_actv lt 7 or pre_actv between 13 and 21) and
			(post_actv between 8 and 12 or post_actv = 23 or post_actv between 26 and 28)
			and (end_miss_ts - start_miss_ts) gt 15*60
		)
	;

	update t_test set imputed_actv = 62 
	where key in (select 
		endkey
		from t_adjustor b
		where (pre_actv lt 7 or pre_actv between 13 and 21) and
			(post_actv between 8 and 12 or post_actv = 23 or post_actv between 26 and 28)
			and (end_miss_ts - start_miss_ts) gt 15*60
		)
	;

	* travel to home;
	update t_test set imputed_actv = 62 
	where key in (select 
		startkey
		from t_adjustor b
		where (post_actv lt 7 or post_actv between 13 and 21) and
			(pre_actv between 8 and 12 or pre_actv = 23 or pre_actv between 26 and 28)
			and (end_miss_ts - start_miss_ts) lt 2*15*60
		)
	;

	update t_test set imputed_actv = 1 
	where key in (select 
		startkey
		from t_adjustor b
		where (post_actv lt 7 or post_actv between 13 and 21) and
			(pre_actv between 8 and 12 or pre_actv = 23 or pre_actv between 26 and 28)
			and (end_miss_ts - start_miss_ts) gt 15*60
		)
	;

	update t_test set imputed_actv = 62 
	where key in (select 
		endkey
		from t_adjustor b
		where (post_actv lt 7 or post_actv between 13 and 21) and
			(pre_actv between 8 and 12 or pre_actv = 23 or pre_actv between 26 and 28)
			and (end_miss_ts - start_miss_ts) gt 15*60
		)
	;

	* travel and then back home;
	update t_test set imputed_actv = 1 
	where key in (select 
		endkey
		from t_adjustor b
		where (post_actv lt 7 or post_actv between 13 and 21) and
			(pre_actv = 7)
		)
	;

	* home and then travel;
	update t_test set imputed_actv = 1 
	where key in (select 
		endkey
		from t_adjustor b
		where (pre_actv lt 7 or pre_actv between 13 and 21) and
			(post_actv = 7)
		)
	;
	quit;

proc sql;
  select imputed_actv, count(*)
  from t_test
  group by 1;
quit;

* Create imputed variables;

proc sql;
create table imp1 as (
  select newno, imputed_actv, count(*) as cnt
  from t_test
	where imputed_actv ne .
  group by 1,2);
quit;

proc sql;
create table imp2 as (
  select newno,
		max(case when imputed_actv eq 1 then cnt*15 else 0 end) as imp_m1,
		max(case when imputed_actv eq 3 then cnt*15 else 0 end) as imp_m3,
		max(case when imputed_actv eq 62 then cnt*15 else 0 end) as imp_m62
	from imp1
  group by 1);
quit;

* Subtract the imputed time from tot31 (total missing);
proc sql;
create table imp3 as (
  select a.*,
    case when b.newno ne . then b.imp_m1 else 0 end as imp_m1,
    case when b.newno ne . then b.imp_m3 else 0 end as imp_m3,
		case when b.newno ne . then b.imp_m62 else 0 end as imp_m62,
		case when b.newno ne . then a.tot31 - (b.imp_m1 + b.imp_m3 + b.imp_m62)
      else a.tot31 end as newtot31
  from mylib.mod1 a
	left join imp2 b
	on a.newno = b.newno);
quit;

* Check;
proc sql;
  select count(*)
  from imp3
  where newtot31 lt 0;
quit;
* 0, ok;

** Compute missing continuously 90+ minutes of activity **;

proc sort data=t_test out=m1;
  by newno ep_time;
run;

* Identify the problem cases;
data m2;
  set m1;
	by newno;
	retain cnt;

	if _n_ eq 1 then cnt = 0;
	lagactv = lag(actv);

	if actv eq 31 and imputed_actv eq . then cnt = cnt + 1;
	if cnt gt 0 and actv ne 31 and lagactv eq 31 then do;
	  if cnt ge 6 then output;
		cnt = 0;
	end;
run;

* Tag the problem cases;
proc sql;
create table imp4 as (
  select a.*, case when b.newno ne . then 1 else 0 end as mis90plus
  from imp3 a
	left join (select distinct newno from m2) b
	on a.newno = b.newno);
quit;


** Update the imputed time slots **;

* Reassign activity codes (add a 9 to the code) due to the imputed time:
   1 -> 91 (imputed personal or household care)
   3 -> 93 (imputed sleep)
  62 -> 962 (mode of recorded travel); 
data u1;
  set t_test (keep = newno _name_ actv imputed_actv);

	if imputed_actv ne .;
	select(imputed_actv);
		when (1) do; newactv = 91; end;
		when (3) do; newactv = 93; end;
		when (62) do; newactv = 962; end;
		otherwise;
	end;
run;

* Write the update statements to a SAS program;
data _null_;
	set u1 end=eof;
	file "&libdir.\imptimeslot.sas";

	length aline $ 200;

	if _n_ eq 1 then do;
		aline = "proc sql;";
		put aline;
	end;

	aline = "update imp4 set " || _name_ || " = " || newactv || " where newno = " || newno || " ; ";
	put aline;

	if eof eq 1 then do;
		aline = "quit;";
		put aline;
	end;
run;

* Execute the SAS program;
%include "&libdir.\imptimeslot.sas";

* Create a temp modified data set for further work;
data mylib.mod2; set imp4; run;

* Clean up;
proc datasets library=work; 
  delete i1 i2 i3 t_input t_test_missing t_adjustor t_test imp1 imp2 imp3 imp4 m1 m2 u1; 
quit;


** Calculate from/to work/education travel time **;
** tot7=travel, tot8=paidwork, tot9=education **
** tot7 -> av5, av17,  av5 -> main63,  av17 -> main62 **;

data t1 (drop= tot7);
  set mylib.mod2(keep= newno _: tot7);
	if tot7 gt 0;
run;

proc transpose data=t1 out=t2;
  by newno;
run;

* Assign a date to make the time in sequence since the diary started at 4am;
data t3 (rename = (col1=actv));
  set t2;
	length ts $30;

	if length(compress(_name_)) eq 6 then
    ts = '01jan2009' || ':' || substr(_name_,2,2) || ':' || substr(_name_,5,2);
	else if length(compress(_name_)) eq 5 
          and (substr(compress(_name_),1,3) eq '_0_'
               or substr(compress(_name_),1,3) eq '_1_'
					     or substr(compress(_name_),1,3) eq '_2_'
							 or substr(compress(_name_),1,3) eq '_3_') then
		ts = '02jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2);
	else ts = '01jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2);

	ep_time = input(ts, datetime.);
	format ep_time datetime.; 
run;

proc sort data=t3(keep= newno _name_ actv ep_time) out=t4;
  by newno ep_time;
run;

* Find to-work/education and from-work/education time slots;

data tr1;
  set t4;
	by newno ep_time;
  retain;
  format start_tr end_tr datetime.;

	lagactv = lag(actv);

	if first.newno then do;
		lagactv = 0;
		tag_tr = 0;
	end;

	if actv eq 7 and lagactv ne 7 then start_tr = ep_time;
	if (actv eq 8 or actv eq 9) and lagactv eq 7 then do;
    end_tr = ep_time;
		tag_tr = 1;
	end;
  if actv eq 7 and (lagactv eq 8 or lagactv eq 9) then start_tr = ep_time;
	if (actv ne 7 and actv ne 8 and actv ne 9) and lagactv eq 7 then do;
    end_tr = ep_time;
		tag_tr = 0;
	end;
run;

proc sort data=tr1 out=tr2;
  by newno descending ep_time ;
run;

data tr3;
  set tr2;
	by newno descending ep_time;
  retain;
	format newstart_tr newend_tr datetime.;

	needfix = 0;

	if lagactv eq 7 and tag_tr eq 1 then do;
		newstart_tr = start_tr;
		newend_tr = end_tr;
		needfix = 1;
	end;

	if actv eq 7 and tag_tr eq 1 then do;
		newstart_tr = start_tr;
		newend_tr = end_tr;
		needfix = 1;
	end;

	if actv ne 7 and needfix eq 0 then do;
		newstart_tr = .;
		newend_tr = .;
	end;

	if ep_time ge newstart_tr and ep_time le newend_tr then needfix = 1;
run;

* Reassign travel code 7 to 97 for to/from work/education time
  Write the update statements to a SAS program;

data u1;
  set tr3 (keep = newno _name_ actv needfix);
	if actv eq 7 and needfix;
run;

data _null_;
	set u1 end=eof;
	file "&libdir.\travtimeslot.sas";

	length aline $ 200;

	if _n_ eq 1 then do;
		aline = "proc sql;";
		put aline;
	end;

	aline = "update mylib.mod2 set " || _name_ || " = 97 where newno = " || newno || " ; ";
	put aline;

	if eof eq 1 then do;
		aline = "quit;";
		put aline;
	end;
run;

* Execute the SAS program;
%include "&libdir.\travtimeslot.sas";

* Redefine tot7;

proc sql;
create table tr4 as (
  select a.newno, a.cnt * 15 as av5
	from (select newno, count(*) as cnt
	      from u1
	      group by 1) a
);
quit;

proc sql;
create table tr5 as (
  select a.*,
         case when b.newno ne . then b.av5 else 0 end as cal_av5
	from mylib.mod2 a
	left join tr4 b
	on a.newno = b.newno);
quit;

data tr6;
  set tr5;
  cal_av17 = tot7 - cal_av5;
run;

* Check;
proc sql;
  select count(*)
  from tr6
  where cal_av17 lt 0;
quit;
*0, ok;

* Create a temp modified data set for further work;
data mylib.mod3; set tr6; run;

* Clean up;
proc datasets library=work; 
  delete t1 t2 t3 t4 tr1 tr2 tr3 u1 tr4 tr5 tr6; 
quit;


** Final count the number of episodes **;

data e1;
  set mylib.mod3(keep= newno _:);
run;

proc transpose data=e1 out=e2;
  by newno;
run;

* Assign a date to make the time in sequence since the diary started at 4am;
data e3 (rename = (col1=actv));
  set e2;
	length ts $30;

	if length(compress(_name_)) eq 6 then
    ts = '01jan2009' || ':' || substr(_name_,2,2) || ':' || substr(_name_,5,2);
	else if length(compress(_name_)) eq 5 
          and (substr(compress(_name_),1,3) eq '_0_'
               or substr(compress(_name_),1,3) eq '_1_'
					     or substr(compress(_name_),1,3) eq '_2_'
							 or substr(compress(_name_),1,3) eq '_3_') then
		ts = '02jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2);
	else ts = '01jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2);

	ep_time = input(ts, datetime.);
	format ep_time datetime.; 
run;

proc sort data=e3(keep= newno _name_ actv ep_time) out=e4;
  by newno ep_time;
run;

* Get episode count;
data e5;
  set e4;
	by newno ep_time;
	retain;

	if first.newno then ep_cnt = 1;
	if first.newno ne 1 and actv ne lag(actv) then ep_cnt = ep_cnt + 1;
run;

* Get the number of episodes;
proc sql;
  create table ep1 as (
    select newno, max(ep_cnt) as num_ep
    from e5
    group by 1);
quit;

* Append the number of episodes variable;
proc sql;
create table ep2 as (
  select a.*, b.num_ep   
  from mylib.mod3 a
	left join ep1 b
	on a.newno = b.newno);
quit;

* Create a temp modified data set for further work;
data mylib.mod4; set ep2; run;

* Clean up;
proc datasets library=work; 
  delete e1 e2 e3 e4 e5 ep1 ep2; 
quit;


** Fix the cases that have tot10 (breaks) but no tot8 (paid work) and emp not equal 1 or 2 (not employed) **;

data b1 (keep= newno tot10);
  set mylib.mod4 (keep= newno tot8 tot18 tot10 emp);
  if tot8 eq 0 and tot18 eq 0 and tot10 gt 0 and emp ne 1 and emp ne 2;
run;
* 40 cases;

proc sql;
  create table b2 as (
    select a.*,
		  case when b.newno eq . then a.tot10 else 0 end as tot10_wrk,
			case when b.newno ne . then b.tot10 else 0 end as tot10_oth
		from mylib.mod4 a
		left join b1 b
		on a.newno = b.newno
);
quit;

* Create a temp modified data set for further work;
data mylib.mod5; set b2; run;

* Clean up;
proc datasets library=work; 
  delete b1 b2; 
quit;

* Create the final modified data set for the conversion;
data mylib.uk1995_mod; set mylib.mod5; run;


*******************************************************************
*************  Harmonized variables for 5.53 and 5.8  *************
******************************************************************;

data mylib.hvar1 (keep = newno date desigday wta _:
                         imp_m: newtot31 mis90plus num_ep cal_av:);
  set mylib.uk1995_mod;

  _countreg = 37;
  *_country = 6;
  _survey = 1995;
  _swave = 0;
  _msamp = 0;
  _hldid = 0;
  _persid = newno;
  _id = _persid;
  _parntid1 = -9;
  _parntid2 = -9;
  _partid = -9;

	select (desigday); * 1=sun;
	  when (1) _day = 2;
		when (2) _day = 3;
		when (3) _day = 4;
		when (4) _day = 5;
		when (5) _day = 6;
		when (6) _day = 7;
		when (7) _day = 1;	
		otherwise _day = -8;
	end;

	_month = month; 
  _year = _survey;
  _diary = 1;

  _badcase = 0;  * a placeholder, to be updated later;

	* q16 is total people in household;
	* q18 is household type;
	* q115 is marital status;

	if q16 eq 1 or q18 eq 1 then _hhtype = 1;
	else if q16 eq 2 and (q115 eq 1 or q115 eq 2) then _hhtype = 2;
  else if q16 gt 2 and (q18 eq 2 or q18 eq 3) then _hhtype = 3;
	else if q16 gt 2 and (q115 eq 1 or q115 eq 2) then _hhtype = 3;
  else _hhtype = 4;

	if q16 ne . then _hhldsize = q16;
  else _hhldsize = -8;

	* q13/q14/q15 is number of children age 0-4/5-10/11-15;
	* q113 is age 16 or older;
	* q114 is relationship to household: 3 = partner's child;
  if (q113 eq 16 or q113 eq 17) and q114 eq 3 
			then _nchild = sum(q13, q14, q15) + 1;  * diarist plus the number of child;
	else _nchild = sum(q13, q14, q15);

  * q14 is age 5-10 and q15 is age 11-15, so agekidx values 2 and 3 may not be precise;
  if q13 ge 1 then _agekidx = 1;
	else if q14 ge 1 then _agekidx = 2;
	else if q15 ge 1 then _agekidx = 3;
  else if q113 eq 16 or q113 eq 17 then _agekidx = 3;
	else if q113 eq 18 then _agekidx = 4;
	else if sum(q13, q14, q15) eq 0 then _agekidx = -7;
  else _agekidx = -8;

  _agekid2 = -9;
  
	* q73 is gross income and use it as is;
	if q73 ne . then _incorig = q73;
	else _incorig = -8;

	* The income group distribution is determined above;
  if (q73 ge 1 and q73 le 4) or q73 eq 98 then _income = 1;
  else if q73 ge 5 and q73 le 13 then _income = 2;
  else if q73 ge 14 and q73 le 21 then _income = 3;
  else _income = -8; 

	if tenure eq 1 or tenure eq 2 then _ownhome = 1;
	else if tenure eq 3 or tenure eq 4 then _ownhome = 2;
	else _ownhome = 3;

  _urban = -9;
  _computer = -9;

	if cars eq 0 then _vehicle = 0;
	else if cars eq 1 then _vehicle = 3;
	else if cars eq 2 or cars eq 3 then _vehicle = 4;
  else _vehicle = -8;

  _sex = q112;

	* q113 is age;
	if q113 lt 80 and q113 ne . then _age = q113;
	else if q113 ge 80 then _age = 80;
	else _age = -8;

	* q13/q14/q15 is number of children age 0-4/5-10/11-15;
	* q113 is age;
	* q114 is relationship to household: 3 = partner's child;

  if q113 ge 18 and q113 le 39 and sum(q13, q14, q15) eq 0 then _famstat = 0;
	else if q113 ge 18 and q13 ge 1 then _famstat = 1;
	else if q113 ge 18 and q13 eq 0 and sum(q14, q15) ge 1 then _famstat = 2;
  else if q113 ge 40 and sum(q13, q14, q15) eq 0 then _famstat = 3;
  else if q113 ne . and q113 lt 18 and q114 eq 3 then _famstat = 4;
	else if q113 ne . and q113 lt 18 then _famstat = 5;
  else _famstat = -8;

	* q114 is relationship to household: 3 = partner's child;
	* q115 is marital status;
	if q114 eq 3 and (q115 eq 3 or q115 eq 4 or q115 eq 5 or q115 eq 6) then _cphome = 1;
	else if q114 eq 1 or q114 eq 2 or q114 eq 4 or q114 eq 5 or q114 eq 6 then _cphome = 0;
  else _cphome = -8;

	* q18 is household type B;
	if q18 eq 4 then _singpar = 1;
	else if q18 eq 1 or q18 eq 2 or q18 eq 3 or q18 eq 5 then _singpar = 0;
	else _singpar = -8;

	* q114 is relationship to household;
	if q114 eq 1 or q114 eq 2 or q114 eq 3 or q114 eq 4 then _relrefp = q114;
  else if q114 eq 5 then _relrefp = 9;
  else if q114 eq 6 then _relrefp = 10;
  else _relrefp = -8;
 
	* q115 is marital status;

	if (q115 eq 1 or q115 eq 2) then _civstat = 1;
	else if q115 eq 3 or q115 eq 4 or q115 eq 5 or q115 eq 6 then _civstat = 2;
	else _civstat = -8;

	if q115 eq 1 then _cohab = 0;
  else if q115 eq 2 then _cohab = 1;
	else if q115 eq 3 or q115 eq 4 or q115 eq 5 or q115 eq 6 then _cohab = -7;
	else _cohab = -8;

  _citizen = -9;

	* emp is economic status;

  if emp eq 1 then _empstat = 1;
	else if emp eq 2 then _empstat = 2;
	else if emp eq 3 then _empstat = 4;
	else if emp eq 4 then _empstat = 4;
	else _empstat = -8;

	if _empstat eq 1 or _empstat eq 2 or _empstat eq 3 then _emp = 1;
	else if _empstat eq 4 then _emp = 0;
	else _emp = -8;

	if emp eq 1 or emp eq 2	or emp eq 4 then _unemp = 0;
	else if emp eq 3 then _unemp = 1;
  else _unemp = -8;

	* q116 is age left full education, 97 = still in full time education;
	if q116 eq 97 then _student = 1;
	else if q116 eq 99 or q116 eq . then _student = -8;
	else _student = 0;

	* q119 is last week activity
	  see above for the finding on q119;
	if q119 eq 7 then _retired = 1;
  else if q119 eq . and q113 ge 65 then _retired = 1; * age 65+;
	else _retired = 0; 
	if q113 ge 70 and _empstat eq 4 then _retired = 1;  * age 70+ and not in paid work;

  _empsp = -9;
  _workhrs = -9;
  _empinclm = -9;
  _occup = -9;
  _sector = -9;

	* lefted is age completed full time education;
  * q113 is age;

	if lefted ne . then _educa = lefted;
	else _educa = -8;

  if lefted eq 1 or lefted eq 2 or lefted eq 3 then _edtry = lefted;
	else if lefted eq 4 then _edtry = 3;
	else if lefted eq 5 and q113 gt 0 and q113 le 17 then _edtry = 1;
	else if lefted eq 5 and q113 eq 18 then _edtry = 2;
	else if lefted eq 5 and q113 ge 19 then _edtry = 3;
	else if lefted eq 6 then _edtry = 1;
  else _edtry = -8;

  _rushed = -9;
  _health = -9;
	_carer = -9;
  _disab = -9;

  _av1 = sum(tot8,tot10_wrk);
  _av2 = tot18;
  _av3 = -9;
  _av4 = tot9;
  _av5 = cal_av5;
  _av6 = tot3;
  _av7 = sum(tot13,tot14);
  _av8 = sum(tot6,tot15);
  _av9 = tot16;
  _av10 = tot11;
  _av11 = tot5;
  _av12 = -9;
  _av13 = sum(tot2,imp_m1);
  _av14 = -9;
  _av15 = tot4;
  _av16 = sum(tot1,imp_m3);
  _av17 = sum(imp_m62,cal_av17);
  _av18 = tot26;
  _av19 = tot25;
  _av20 = -9;
  _av21 = -9;
  _av22 = -9;
  _av23 = sum(tot12,tot28);
  _av24 = tot27;
  _av25 = -9;
  _av26 = -9;
  _av27 = -9;
  _av28 = tot23;
  _av29 = tot24;
  _av30 = -9;
  _av31 = tot19;
  _av32 = -9;
  _av33 = tot17;
  _av34 = -9;
  _av35 = tot21;
  _av36 = tot29;
  _av37 = tot20;
  _av38 = -9;
  _av39 = -9;
  _av40 = sum(tot22,tot30,tot10_oth);
  _av41 = newtot31;

  _ocombwt = wta;

  * A placeholder, to be calculater later;
  _propwt = 0.0;

  * 5.8 specific variables;

  _main1 = imp_m1;
  _main2 = tot1;
  _main3 = imp_m3;
  _main4 = tot2;
  _main5 = -9;
  _main6 = tot4;
  _main7 = tot8;
  _main8 = tot18;
  _main9 = -9;
  _main10 = -9;
  _main11 = -9;
  _main12 = tot10_wrk;
  _main13 = -9;
  _main14 = -9;
  _main15 = tot9;
  _main16 = tot17;
  _main17 = -9;
  _main18 = tot3;
  _main19 = -9;
  _main20 = tot13;
  _main21 = tot14;
  _main22 = tot15;
  _main23 = -9;
  _main24 = tot11;
  _main25 = -9;
  _main26 = -9;
  _main27 = -9;
  _main28 = tot5;
  _main29 = -9;
  _main30 = -9;
  _main31 = -9;
  _main32 = tot6;
  _main33 = sum(tot12,tot28);
  _main34 = -9;
  _main35 = tot26;
  _main36 = -9;
  _main37 = tot27;
  _main38 = -9;
  _main39 = tot23;
  _main40 = -9;
  _main41 = -9;
  _main42 = tot25;
  _main43 = -9;
  _main44 = -9;
  _main45 = -9;
  _main46 = tot16;
  _main47 = -9;
  _main48 = tot24;
  _main49 = tot20;
  _main50 = -9;
  _main51 = sum(tot30,tot10_oth);
  _main52 = -9;
  _main53 = -9;
  _main54 = tot22;
  _main55 = tot29;
  _main56 = tot21;
  _main57 = -9;
  _main58 = -9;
  _main59 = tot19;
  _main60 = -9;
  _main61 = -9;
  _main62 = sum(imp_m62,cal_av17);
  _main63 = cal_av5;
  _main64 = -9;
  _main65 = -9;
  _main66 = -9;
  _main67 = -9;
  _main68 = -9;
	_main69 = newtot31;

	_sppart = -9;

	* UK supplement variables;

	select (region);
    when (1,2,3,4,5,6) _region = 1;
    when (7,8,9) _region = 2;
    when (10) _region = 3;
	  when (11,12,13,14) _region = 4;
	  when (15) _region = 5;
	  when (16,17) _region = 6;
	  when (18,19,20,21) _region = 7;
    otherwise _region = -8;
  end;

	* q116 is age left full education;
  if q116 eq 97 then _aglftsch = 0; 
  else if q116 le 15 and q116 ne . then _aglftsch = 15;
  else if q116 ge 21 then _aglftsch = 21;
	else if q116 eq . then _aglftsch = -8;
	else _aglftsch = q116;

	* q71 is ethnic group;
	select (q71);
    when (1) _ethnic = 1;
    when (2,3,4) _ethnic = 2;
    when (5,6,7) _ethnic = 3;
	  when (8,9) _ethnic = 4;
    otherwise _ethnic = -8;
  end;

run;


*******************************************************************
*************  Adjusted variables for 5.53 and 5.8  *************
******************************************************************;

** Define the categories for badcase **;

* Tag the cases with missing data or with a situation;
data b1;
  set mylib.hvar1 (keep= newno _day _sex _age num_ep mis90plus _main:);

  nosleep = 0;
  noeat = 0;
  nocare = 0;
  notravex = 0;
  noday = 0;
  nosexage = 0;
  less7ep = 0;
  careother = 0;
  homeallday = 0;
  mis2basicplus = 0;

  if sum(_main2,_main3) eq 0 then nosleep = 1;
  if sum(_main6,_main18,_main39) eq 0 then noeat = 1;
  if sum(_main1,_main4) eq 0 then nocare = 1;
  if sum(_main42,_main62,_main63) eq 0 then notravex = 1;
  misbasic = sum(nosleep, noeat, nocare, notravex);

  if _day lt 1 then noday = 1;
  if _sex lt 1 or _age lt 1 then nosexage = 1;
  if num_ep lt 7 then less7ep = 1;
  if misbasic ge 2 then mis2basicplus = 1;
  if _main28 gt 1 or _main32 gt 1 then careother = 1;

  * at home all day: home time at least 1000 minutes and no travel time;
  if sum(_main1,_main2,_main3,_main4,_main8,_main18,
         _main20,_main21,_main22,_main55,_main59) ge 1000
     and sum(_main62, _main63) eq 0 then homeallday = 1;

  * missing 2 or more basic activities and exception;
  if misbasic ge 2
    then do;
      if careother or homeallday or (misbasic eq 2 and num_ep ge 15) then mis2basicplus = 0;
      else mis2basicplus = 1;
    end;
run;

* Identify bad cases;
data b2;
  set b1;

  badcase = 0;

  if noday and nosexage and mis90plus and less7ep and mis2basicplus then badcase = 31;
  else if noday and mis90plus and less7ep and mis2basicplus then badcase = 30;
  else if nosexage and mis90plus and less7ep and mis2basicplus then badcase = 29;
  else if nosexage and noday and less7ep and mis2basicplus then badcase = 28;
  else if nosexage and noday and mis90plus and mis2basicplus then badcase = 27;
  else if nosexage and noday and mis90plus and less7ep then badcase = 26;
  else if mis90plus and less7ep and mis2basicplus then badcase = 25;
  else if noday and less7ep and mis2basicplus then badcase = 24;
  else if noday and mis90plus and mis2basicplus then badcase = 23;
  else if noday and mis90plus and less7ep then badcase = 22;
  else if nosexage and mis2basicplus and less7ep then badcase = 21;
  else if nosexage and mis90plus and mis2basicplus then badcase = 20;
  else if nosexage and mis90plus and less7ep then badcase = 19;
  else if nosexage and noday and mis2basicplus then badcase = 18;
  else if nosexage and noday and less7ep then badcase = 17;
  else if nosexage and noday and mis90plus then badcase = 16;
  else if less7ep and mis2basicplus then badcase = 15;
  else if mis90plus and mis2basicplus then badcase = 14;
  else if mis90plus and less7ep then badcase = 13;
  else if noday and mis2basicplus then badcase = 12;
  else if noday and less7ep then badcase = 11;
  else if noday and mis90plus then badcase = 10;
  else if nosexage and mis2basicplus then badcase = 9;
  else if nosexage and less7ep then badcase = 8;
  else if nosexage and mis90plus then badcase = 7;
  else if nosexage and noday then badcase = 6;
  else if mis2basicplus then badcase = 5;
  else if less7ep then badcase = 4;
  else if mis90plus then badcase = 3;
  else if noday then badcase = 2;
  else if nosexage then badcase = 1;
run;

* List the distribution of badcases;
proc sql;
  select badcase, count(*)
  from b2
  group by 1;
quit;
* 1887 badcase=0;

* Update badcase;
proc sql;
  update mylib.hvar1 a
	set _badcase = (select badcase
	                from b2 b
							    where a.newno = b.newno)
  where newno in (select newno from b2)
;
quit;

proc sql;
  select _badcase, count(*)
  from mylib.hvar1
  group by 1;
quit;
* ok;

** Calculate propwt **; 

* Check wta;
proc sql;
  select newno, wta, _badcase
	from mylib.hvar1
	where wta eq . or wta eq 0;
quit;
* one good case: newno=14071159505 wta=0;

proc sql;
create table w1 as (
  select *
  from mylib.uk1995_orig
  where serialno eq 14071159505
);
quit;
* looks like the assignment of wta=0 was triggered by q12=0 (# of adults);

* Get the sex, age, marital status, and education of this case;
proc sql;
  select newno, _sex, _age, _civstat, _cohab, _educa, _edtry, wta
  from mylib.hvar1
  where newno eq 14071159505;
quit;

* Get the median of wta from a similar group;
proc sql;
create table w2 as (
  select newno, _sex, _age, _civstat, _cohab, _educa, _edtry, wta
  from mylib.hvar1
  where _sex eq 1
  and _age eq 29
  and _civstat eq 2
	and _cohab eq -7
	and _educa eq 3
	and _edtry eq 3
);
quit;
* 8 cases, median is wta=0.527909426;

* Update the wta;
proc sql;
  update mylib.hvar1 a
	set wta = 0.527909426
  where newno eq 14071159505;
quit;

* Check weight distribution;
proc means data=mylib.hvar1 n min mean max sum;
	var wta;
	class _badcase;
run;
* 1883 good cases (_badcase eq 0);

* Make adjustment to the weight due to the identified bad cases,
  account for age, sex and employment status;

data grp1;
  set mylib.hvar1 (keep= newno wta _age _sex _day _emp _badcase);
  if _age lt 18 and _sex eq 1 then agesexgp = 1;
  if _age gt 17 and _age lt 26 and _sex eq 1 and _emp eq 1 then agesexgp = 2;
  if _age gt 17 and _age lt 26 and _sex eq 1 and _emp lt 1 then agesexgp = 3;
	if _age gt 25 and _age lt 40 and _sex eq 1 and _emp eq 1 then agesexgp = 4;
  if _age gt 25 and _age lt 40 and _sex eq 1 and _emp lt 1 then agesexgp = 5;
  if _age gt 39 and _age lt 60 and _sex eq 1 and _emp eq 1 then agesexgp = 6;
  if _age gt 39 and _age lt 60 and _sex eq 1 and _emp lt 1 then agesexgp = 7;
  if _age gt 59 and _age lt 71 and _sex eq 1 then agesexgp = 8;
  if _age gt 70 and _sex eq 1 then agesexgp = 9;
  if _age lt 18 and _sex eq 2 then agesexgp = 10;
  if _age gt 17 and _age lt 26 and _sex eq 2 and _emp eq 1 then agesexgp = 11;
  if _age gt 17 and _age lt 26 and _sex eq 2 and _emp lt 1 then agesexgp = 12;
  if _age gt 25 and _age lt 40 and _sex eq 2 and _emp eq 1 then agesexgp = 13;
  if _age gt 25 and _age lt 40 and _sex eq 2 and _emp lt 1 then agesexgp = 14;
  if _age gt 39 and _age lt 60 and _sex eq 2 and _emp eq 1 then agesexgp = 15;
  if _age gt 39 and _age lt 60 and _sex eq 2 and _emp lt 1 then agesexgp = 16;
  if _age gt 59 and _age lt 71 and _sex eq 2 then agesexgp = 17;
  if _age gt 70 and _sex eq 2 then agesexgp = 18;
	if _badcase gt 0 then agesexgp = 99;

	* set the bad cases to zero weight;
	if agesexgp ne 99 then wt2 = wta;
	else if agesexgp eq 99 then wt2 = 0;
run;

proc sql;
  create table grpnu as (
    select agesexgp, sum(wt2) as grouptot, count(*) as groupn
	  from grp1
    group by 1
  );
quit;

proc sql;
  create table daynu as (
    select agesexgp, _day, sum(wt2) as daytot, count(*) as dayn
	  from grp1
    group by 1, 2
  );
quit;

proc sql;
  create table grp2 as (
	  select a.*, b.grouptot, b.groupn, c.daytot, c.dayn
		from grp1 a
		left join grpnu b
		on a.agesexgp = b.agesexgp
		left join daynu c
    on a.agesexgp = c.agesexgp
		and a._day = c._day
	);
quit;

data grp3;
  set grp2;
	exptot = grouptot/7;
  if _badcase eq 0 then pwt = (exptot/daytot)/(dayn/daytot);
	else pwt = 0;
run;

data grp4;
  set grp3;
  newwt = pwt*(2005/1883);
run;

proc means data=grp4 n min mean max sum;
	var newwt;
run;
* something is not right, mean=0.9946613 and sum=1994.30;
* NEED TO WORK ON...;

* Update prompwt;
proc sql;
  update mylib.hvar1 a
	set _propwt = (select newwt
	                from grp4 b
							    where a.newno = b.newno)
  where newno in (select newno from grp4);
quit;

* Clean up;
proc datasets library=work; 
  delete b1 b2 w1 w2 grpnu daynu grp1 grp2 grp3 grp4; 
quit;


*******************************************************************
******************  Harmonized variables for 6.0  *****************
******************************************************************;

** Calculate the time variables **;

proc transpose data=mylib.uk1995_mod(keep= newno _:) out=t1;
  by newno;
run;
* 192480;

* Assign a date to make the time in sequence since the diary started at 4am;
data t2 (rename = (col1=actv));
  set t1;
	length ts $30;

	if length(compress(_name_)) eq 6 then
    ts = '01jan2009' || ':' || substr(_name_,2,2) || ':' || substr(_name_,5,2);
	else if length(compress(_name_)) eq 5 
          and (substr(compress(_name_),1,3) eq '_0_'
               or substr(compress(_name_),1,3) eq '_1_'
					     or substr(compress(_name_),1,3) eq '_2_'
							 or substr(compress(_name_),1,3) eq '_3_') then
		ts = '02jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2);
	else ts = '01jan2009' || ':' || substr(compress(_name_),2,1) || ':' || substr(compress(_name_),4,2);

	ep_time = input(ts, datetime.);
	format ep_time datetime.; 
run;

proc sort data=t2(keep= newno actv ep_time) out=t3;
  by newno ep_time;
run;

* Get episode count;
data t4;
  set t3;
	by newno ep_time;
	retain;

	if first.newno then ep_cnt = 1;
	if first.newno ne 1 and actv ne lag(actv) then ep_cnt = ep_cnt + 1;
run;

proc sql;
  create table t5 as (
    select newno, ep_cnt as _epnum, actv, min(ep_time) as ep_start, max(ep_time) as ep_end
    from t4
    group by 1,2,3);
quit;

data t6;
  set t5;
  format ep_start ep_end datetime.;
	_clockst = compress(hour(ep_start)) || '.' || compress(minute(ep_start));
run;

data t7;
  set t6;
  format begtime datetime.;
	if _n_ eq 1 then begtime = ep_start; retain;

	_start = hour(ep_start - begtime)*60 + minute(ep_start - begtime);
	if _epnum ne 1 then _start = _start + 1;

	_end = hour(ep_end - begtime)*60 + minute(ep_end - begtime) + 15;

	_time = _end - _start;
	if _epnum ne 1 then _time = _time + 1;
run;

* Assign the activity variables;

data a1;
  set t7;
	select (actv);
	  when (1) do; _av=16; _main=2; _sec=-9; _inout=1; _eloc=1; end;
		when (2) do; _av=13; _main=4; _sec=-9; _inout=1; _eloc=1; end;
		when (3) do; _av=6; _main=18; _sec=-9; _inout=1; _eloc=1; end;
		when (4) do; _av=15; _main=6; _sec=-9; _inout=1; _eloc=1; end;
		when (5) do; _av=11; _main=28; _sec=-9; _inout=1; _eloc=-8; end;
		when (6) do; _av=8; _main=32; _sec=-9; _inout=1; _eloc=-8; end;
		when (7) do; _av=17; _main=62; _sec=-9; _inout=3; _eloc=8; end;
		when (97) do; _av=5; _main=63; _sec=-9; _inout=3; _eloc=8; end; * computed to/from work/education;
		when (8) do; _av=1; _main=7; _sec=-9; _inout=-8; _eloc=3; end;
		when (9) do; _av=4; _main=15; _sec=-9; _inout=1; _eloc=4; end;
		when (10) do; _av=1; _main=12; _sec=-9; _inout=-8; _eloc=-8; end;
		when (11) do; _av=10; _main=24; _sec=-9; _inout=1; _eloc=5; end;
		when (12) do; _av=23; _main=33; _sec=-9; _inout=-8; _eloc=-8; end;
		when (13) do; _av=7; _main=20; _sec=-9; _inout=1; _eloc=1; end;
		when (14) do; _av=7; _main=21; _sec=-9; _inout=1; _eloc=1; end;
		when (15) do; _av=8; _main=22; _sec=-9; _inout=-8; _eloc=1; end;
		when (16) do; _av=9; _main=46; _sec=-9; _inout=2; _eloc=-8; end;
		when (17) do; _av=33; _main=16; _sec=-9; _inout=1; _eloc=1; end;
		when (18) do; _av=2; _main=8; _sec=-9; _inout=1; _eloc=1; end;
		when (19) do; _av=31; _main=59; _sec=-9; _inout=1; _eloc=1; end;
		when (20) do; _av=37; _main=49; _sec=-9; _inout=-8; _eloc=-8; end;
		when (21) do; _av=35; _main=56; _sec=-9; _inout=-8; _eloc=-8; end;
		when (22) do; _av=40; _main=54; _sec=-9; _inout=1; _eloc=-8; end;
		when (23) do; _av=28; _main=39; _sec=-9; _inout=1; _eloc=6; end;
		when (24) do; _av=29; _main=48; _sec=-9; _inout=1; _eloc=-8; end;
		when (25) do; _av=19; _main=42; _sec=-9; _inout=-8; _eloc=-8; end;
		when (26) do; _av=18; _main=35; _sec=-9; _inout=-8; _eloc=-8; end;
		when (27) do; _av=24; _main=37; _sec=-9; _inout=1; _eloc=-8; end;
		when (28) do; _av=23; _main=33; _sec=-9; _inout=-8; _eloc=-8; end;
		when (29) do; _av=36; _main=55; _sec=-9; _inout=-8; _eloc=1; end;
		when (30) do; _av=40; _main=51; _sec=-9; _inout=-8; _eloc=-8; end;
		when (31) do; _av=41; _main=69; _sec=-9; _inout=-8; _eloc=-8; end;
		when (91) do; _av=13; _main=1; _sec=-9; _inout=1; _eloc=1; end; * imputed personal/household care;
		when (93) do; _av=16; _main=3; _sec=-9; _inout=1; _eloc=1; end; * imputed sleep;
		when (962) do; _av=17; _main=62; _sec=-9; _inout=3; _eloc=8; end; * imputed travel;
		otherwise;
	end;
run;

* Create the rest of the variables;
data mylib.hvar2;
  merge a1 (in=a)
        mylib.hvar1 (in=b keep= newno date _countreg _survey _swave _msamp _hldid _persid _id
                           _day _month _year _diary _badcase _sex _age);
  by newno;
	if a;
	
	if date ne . then _cday = date;
	else _cday = -8;

  _mtrav = -9;
  _ict = -9;
	_alone = -9;
  _child = -9;
	_sppart = -9;
	_oad = -9;

run;


*******************************************************************
******************  Quality checks  *****************
******************************************************************;
* Run each manually to check;

** Step 1: data distribution **;

* badcase;
proc sql;
select _badcase, count(*)
from mylib.hvar1
group by 1;
quit;

** Step 2: propwt **;

* propwt;
proc sql;
select sum(_propwt)
from mylib.hvar1;
quit;
* 1994.296;

* each day of the week should appear in equal proportion (14.286%);
proc freq data=mylib.hvar1; tables _day; run;
* ranging from 12.72% to 17.16%;

* sex and age group;
data tmp;
  set mylib.hvar1 (keep= newno _sex _age _propwt);
  if _age le 18 then agegrp = 1;
  else if _age le 25 then agegrp = 2;
  else if _age le 40 then agegrp = 3;
  else if _age le 60 then agegrp = 4;
  else if _age le 70 then agegrp = 5;
  else if _age gt 70 then agegrp = 6;
	else agegrp = 0;
run;
proc sql;
select _sex, agegrp, count(*)
from tmp
group by 1,2;
quit;
* Both male and female have higher numbers in age groups 3 & 4,
  female has higher numbers in age group 5 & 6 then male;

** Step 3: cross checking data **;

* Means of age, hhldsize and nchild by categories of famstat;
proc means data=mylib.hvar1 n min mean max;
	var _age _hhldsize _nchild;
	class _famstat;
run;
* ok;

* Median of age by retired;
proc means data=mylib.hvar1 n min mean median max;
	var _age;
	class _retired;
run;
* ok, the median age of retired is 71 and the median age of non-retired is 40;

* nchild is less then hhldsize;
proc sql;
select _hhldsize, _nchild, count(*)
from mylib.hvar1
where _hhldsize lt _nchild
group by 1,2;
quit;
* 0;

** Cross-tabs of famstat by nchild, hhtype, agekidx, civstat and cphome **;

* - most should be cphome=1 if famstat=4
  - famstat=5 should always map to cphome=0;
proc sql;
select _famstat, _cphome, count(*)
from mylib.hvar1
group by 1,2;
quit;
* all 32 famstat=4 has cphome=1;
* no famstat=5 cases;

* famstat=1,2,4 should not match to hhtype=1,2;
proc sql;
select _famstat, _hhtype, count(*)
from mylib.hvar1
group by 1,2;
quit;
* 12055279505: famstat=1 and hhtype=2, this woman is a single parent, and lives in a 2 person household, but also is
listed as married
  15081119505: famstat=2 and hhtype=2, this is a married man in a couple with a child;
* fix these 2 cases;
proc sql;
  update mylib.hvar1 set _hhtype=4 where newno eq 12055279505; * other household types;
  update mylib.hvar1 set _civstat=2 where newno eq 12055279505; * as she does not live with a partner;
  update mylib.hvar1 set _cohab=-7 where newno eq 12055279505; * again, she is not living with a partner;
  update mylib.hvar1 set _hhtype=3 where newno eq 15081119505; * couple + others;
  update mylib.hvar1 set _hhldsize=3 where newno eq 15081119505;
quit;

* - famstat=0,3 should match to agekidx=-7 or 4 & nchild=0
  - other famstat values should have no agekidx=-7 or 4 or nchild=0 values
  - famstat=1 should match to agekidx=1 cases only
  - famstat=2 should match to agekidx=2,3 cases only
  - famstat=4,5 should match to agekidx=1,2 or 3;
proc sql;
select _famstat, _agekidx, _nchild, count(*)
from mylib.hvar1
group by 1,2,3;
quit;
* ok;

** Cross-tab agekidx by nchild **;

* - nchild=0 should match to all cases agekidx=-7 or agekidx=4
  - no positive cases of nchild should have agekidx=-7 or agekidx=4
  - no cases of agekidx>-7 and <4 should match to nchild=0;
proc sql;
select _agekidx, _nchild, count(*)
from mylib.hvar1
group by 1,2;
quit;
* ok;

** Cross-tabs empstat by emp, unemp, student, retired disab **;

* emp=1 should correspond only with empstat=1,2,3;
proc sql;
select _emp, _empstat, count(*)
from mylib.hvar1
group by 1,2;
quit;

* Most people coded as unemployed should not be working,
  more students and retired should be empstat=4,
  but some students and retired people should be coded as working;
proc sql;
select _student, _retired, _empstat, _emp, _unemp, count(*)
from mylib.hvar1
group by 1,2,3,4,5;
quit;
* ok;

* Check cases with unemployment but got work hours;
proc sql;
create table t1 as (
  select *
	from mylib.mod5
	where newno in (select newno
                  from mylib.hvar1
	                where _unemp eq 1
	                and _av1 gt 1)
  );
quit;
* 2 cases have unknown break time, assign empstat as unknown job hours;
* 9039279505 and 11048169505;
proc sql;
  update mylib.hvar1 set _empstat=3 where newno eq 9039279505 or newno eq 11048169505;
quit;

** Cross-tabs with empsp and partid by civstat and relrefp **;

* People not in couples should have values of -7 for empsp and partid;
* no data to check;

* relrep=2 (spouse of reference person) and civstat=1 should not match with empsp 
  or partid=-7;
* no data to check;

** Cross-tab civstat by hhtype relrefp partid cohab cphome **;

* - no hhtype=2 should match with civstat=2
  - few civstat=1 should match to hhtype=1 (single person household);
proc sql;
select _hhtype, _civstat, count(*)
from mylib.hvar1
group by 1,2;
quit;
* ok;

* Diarists with relrefp values of 2 (spouse of reference person) 
  should have a civstat value of 1 (in couple);
proc sql;
select _relrefp, _civstat, count(*)
from mylib.hvar1
group by 1,2;
quit;
* ok;

* All people with cohab=1 or cohab=0 should have a civstat=1;
proc sql;
select _cohab, _civstat, count(*)
from mylib.hvar1
group by 1,2;
quit;
* ok;

* All people with cphome=1 should be civstat=2;
proc sql;
select _cphome, _civstat, count(*)
from mylib.hvar1
group by 1,2;
quit;
* ok;

** Cross-tab hhtype by relrefp **;

* - hhtype=1 only should correspond with relrefp=1
  - hhtype=2 should correspond with relrefp values of 1 and 2
  - hhtypes 3 and 4 should correspond with all relrefp values;
proc sql;
select _hhtype, _relrefp, count(*)
from mylib.hvar1
group by 1,2;
quit;
* ok;
* not all relrefp values because this survey does not have sibling or in-laws data;

** Compute a summary of the total minutes recorded in the diary, should equal 1440 **;

* summing the total time by adding up all AV variables that can be computed for the survey;

data tmp;
  set mylib.hvar1 (keep= newno _av:);
  retain;
  array ct{41} _av:;

  actvct = 0;
  do i = 1 to 41;
	  if ct{i} ne -9 then actvct = actvct + ct{i};
  end;
run;

proc sql;
select count(*)
from tmp
where actvct ne 1440;
quit;
* 0;

* summing the total time by adding up all MAIN variables that can be computed for the survey;

data tmp;
  set mylib.hvar1 (keep= newno _main:);
  retain;
  array ct{69} _main:;

  actvct = 0;
  do i = 1 to 69;
	  if ct{i} ne -9 then actvct = actvct + ct{i};
  end;
run;

proc sql;
select count(*)
from tmp
where actvct ne 1440;
quit;
* 0;

* summing the total time by adding up all time variables for each diary in the World 6 file;
proc sql;
select count(*)
from (select newno, sum(_time) as tmct from mylib.hvar2 group by 1) a
where a.tmct ne 1440;
quit;
* 0;

** Step 4: time distributions in the basic activities by days of week **;

* av1 is paid work, av2 is paid work at home, av4 is school, and av16 is sleep;
proc sql;
select _day, sum(_av1) as wk, sum(_av2) as wkhm, sum(_av4) as sch, sum(_av16) as slp, count(*)
from mylib.hvar1
group by 1;
quit;
* looks about right;

* Step 5: min and max values **;

proc means data=mylib.hvar1 n min mean max;
	var _hhldsize _ownhome _nchild _agekidx _income _vehicle;
	class _hhtype;
run;
* ok, no discrepancies;

** Step 6: check one person household and its relrefp **;

proc sql;
select _hhtype, _relrefp, count(*)
from mylib.hvar1
where _hhtype eq 1
group by 1,2;
quit;


*******************************************************************
************************  Create datasets  ************************
******************************************************************;

** Create the 5.53 dataset **; 

* Read the dataset format into memory;
proc format cntlin = mylib.MTUSschemaW553; run;

* Create the dataset;
data mylib.UK1995W553 
       (keep= countreg survey swave msamp 
              hldid persid id parntid1 parntid2 partid 
              day month year diary badcase 
              hhtype hhldsize nchild agekidx agekid2 
              incorig income ownhome urban computer vehicle 
              sex age famstat cphome singpar relrefp 
              civstat cohab citizen empstat emp unemp 
              student retired empsp workhrs empinclm 
              occup sector educa edtry rushed health carer disab 
              av1 av2 av3 av4 av5 av6 av7 av8 av9
              av10 av11 av12 av13 av14 av15 av16 av17 av18 av19 
              av20 av21 av22 av23 av24 av25 av26 av27 av28 av29 
              av30 av31 av32 av33 av34 av35 av36 av37 av38 av39 
              av40 av41 ocombwt propwt);
  set mylib.schema553 mylib.hvar1;

  countreg = _countreg;
  *country = _country;
  survey = _survey;
  swave = _swave;
  msamp = _msamp;
  hldid = _hldid;
  persid = _persid;
  id = _id;
  parntid1 = _parntid1;
  parntid2 = _parntid2;
  partid = _partid;
  day = _day;
  month = _month;
  year = _year;
  diary = _diary;
  badcase = _badcase;
  hhtype = _hhtype;
  hhldsize = _hhldsize;
  nchild = _nchild;
  agekidx = _agekidx;
  agekid2 = _agekid2;
  incorig = _incorig;
  income = _income;
  ownhome = _ownhome;
  urban = _urban;
  computer = _computer;
  vehicle = _vehicle;
  sex = _sex;
  age = _age;
  famstat = _famstat;
  cphome = _cphome;
  singpar = _singpar;
  relrefp = _relrefp;
  civstat = _civstat;
  cohab = _cohab;
  citizen = _citizen;
  empstat = _empstat;
  emp = _emp;
  unemp = _unemp;
  student = _student;
  retired = _retired;
  empsp = _empsp;
  workhrs = _workhrs;
  empinclm = _empinclm;
  occup = _occup;
  sector = _sector;
  educa = _educa;
  edtry = _edtry;
  rushed = _rushed;
  health = _health;
  carer = _carer;
  disab = _disab;
  av1 = _av1;
  av2 = _av2;
  av3 = _av3;
  av4 = _av4;
  av5 = _av5;
  av6 = _av6;
  av7 = _av7;
  av8 = _av8;
  av9 = _av9;
  av10 = _av10;
  av11 = _av11;
  av12 = _av12;
  av13 = _av13;
  av14 = _av14;
  av15 = _av15;
  av16 = _av16;
  av17 = _av17;
  av18 = _av18;
  av19 = _av19;
  av20 = _av20;
  av21 = _av21;
  av22 = _av22;
  av23 = _av23;
  av24 = _av24;
  av25 = _av25;
  av26 = _av26;
  av27 = _av27;
  av28 = _av28;
  av29 = _av29;
  av30 = _av30;
  av31 = _av31;
  av32 = _av32;
  av33 = _av33;
  av34 = _av34;
  av35 = _av35;
  av36 = _av36;
  av37 = _av37;
  av38 = _av38;
  av39 = _av39;
  av40 = _av40;
  av41 = _av41;
  ocombwt = _ocombwt;
  propwt = _propwt;
run;

* Remove the dummy row carried over from the schema;
data mylib.UK1995W553;
  set mylib.UK1995W553;
  if countreg eq . then delete;
run;

* Remove the formats for viewing the actual values;
data v1;
  set mylib.UK1995W553;
	format _all_;
run;


** Create the 5.8 datasets **; 

* Read the dataset format into memory;
proc format cntlin = mylib.MTUSschemaW58; run;

* Create the general dataset;
data set58
       (keep= countreg survey swave msamp 
              hldid persid id parntid1 parntid2 partid 
              day month year diary badcase 
              hhtype hhldsize nchild agekidx agekid2 
              incorig income ownhome urban computer vehicle 
              sex age famstat cphome singpar relrefp 
              civstat cohab citizen empstat emp unemp 
              student retired empsp workhrs empinclm 
              occup sector educa edtry rushed health carer disab 
              av1 av2 av3 av4 av5 av6 av7 av8 av9
              av10 av11 av12 av13 av14 av15 av16 av17 av18 av19 
              av20 av21 av22 av23 av24 av25 av26 av27 av28 av29 
              av30 av31 av32 av33 av34 av35 av36 av37 av38 av39 
              av40 av41 ocombwt propwt
							main1 main2 main3 main4 main5 main6 main7 main8 main9 
              main10 main11 main12 main13 main14 main15 main16 main17 main18 main19 
              main20 main21 main22 main23 main24 main25 main26 main27 main28 main29 
              main30 main31 main32 main33 main34 main35 main36 main37 main38 main39 
              main40 main41 main42 main43 main44 main45 main46 main47 main48 main49
              main50 main51 main52 main53 main54 main55 main56 main57 main58 main59
              main60 main61 main62 main63 main64 main65 main66 main67 main68 main69
              sppart);
  set mylib.schema58 mylib.hvar1;

  countreg = _countreg;
  *country = _country;
  survey = _survey;
  swave = _swave;
  msamp = _msamp;
  hldid = _hldid;
  persid = _persid;
  id = _id;
  parntid1 = _parntid1;
  parntid2 = _parntid2;
  partid = _partid;
  day = _day;
  month = _month;
  year = _year;
  diary = _diary;
  badcase = _badcase;
  hhtype = _hhtype;
  hhldsize = _hhldsize;
  nchild = _nchild;
  agekidx = _agekidx;
  agekid2 = _agekid2;
  incorig = _incorig;
  income = _income;
  ownhome = _ownhome;
  urban = _urban;
  computer = _computer;
  vehicle = _vehicle;
  sex = _sex;
  age = _age;
  famstat = _famstat;
  cphome = _cphome;
  singpar = _singpar;
  relrefp = _relrefp;
  civstat = _civstat;
  cohab = _cohab;
  citizen = _citizen;
  empstat = _empstat;
  emp = _emp;
  unemp = _unemp;
  student = _student;
  retired = _retired;
  empsp = _empsp;
  workhrs = _workhrs;
  empinclm = _empinclm;
  occup = _occup;
  sector = _sector;
  educa = _educa;
  edtry = _edtry;
  rushed = _rushed;
  health = _health;
  carer = _carer;
  disab = _disab;
  av1 = _av1;
  av2 = _av2;
  av3 = _av3;
  av4 = _av4;
  av5 = _av5;
  av6 = _av6;
  av7 = _av7;
  av8 = _av8;
  av9 = _av9;
  av10 = _av10;
  av11 = _av11;
  av12 = _av12;
  av13 = _av13;
  av14 = _av14;
  av15 = _av15;
  av16 = _av16;
  av17 = _av17;
  av18 = _av18;
  av19 = _av19;
  av20 = _av20;
  av21 = _av21;
  av22 = _av22;
  av23 = _av23;
  av24 = _av24;
  av25 = _av25;
  av26 = _av26;
  av27 = _av27;
  av28 = _av28;
  av29 = _av29;
  av30 = _av30;
  av31 = _av31;
  av32 = _av32;
  av33 = _av33;
  av34 = _av34;
  av35 = _av35;
  av36 = _av36;
  av37 = _av37;
  av38 = _av38;
  av39 = _av39;
  av40 = _av40;
  av41 = _av41;
  ocombwt = _ocombwt;
  propwt = _propwt;
  main1 = _main1;
  main2 = _main2;
  main3 = _main3;
  main4 = _main4;
  main5 = _main5;
  main6 = _main6;
  main7 = _main7;
  main8 = _main8;
  main9 = _main9;
  main10 = _main10;
  main11 = _main11;
  main12 = _main12;
  main13 = _main13;
  main14 = _main14;
  main15 = _main15;
  main16 = _main16;
  main17 = _main17;
  main18 = _main18;
  main19 = _main19;
  main20 = _main20;
  main21 = _main21;
  main22 = _main22;
  main23 = _main23;
  main24 = _main24;
  main25 = _main25;
  main26 = _main26;
  main27 = _main27;
  main28 = _main28;
  main29 = _main29;
  main30 = _main30;
  main31 = _main31;
  main32 = _main32;
  main33 = _main33;
  main34 = _main34;
  main35 = _main35;
  main36 = _main36;
  main37 = _main37;
  main38 = _main38;
  main39 = _main39;
  main40 = _main40;
  main41 = _main41;
  main42 = _main42;
  main43 = _main43;
  main44 = _main44;
  main45 = _main45;
  main46 = _main46;
  main47 = _main47;
  main48 = _main48;
  main49 = _main49;
  main50 = _main50;
  main51 = _main51;
  main52 = _main52;
  main53 = _main53;
  main54 = _main54;
  main55 = _main55;
  main56 = _main56;
  main57 = _main57;
  main58 = _main58;
  main59 = _main59;
  main60 = _main60;
  main61 = _main61;
  main62 = _main62;
  main63 = _main63;
  main64 = _main64;
  main65 = _main65;
  main66 = _main66;
  main67 = _main67;
  main68 = _main68;
  main69 = _main69;
  sppart = _sppart;
run;

* Create the adult and kid datasets;
data mylib.UK1995W58 mylib.UK1995W58kid;
  set set58;
  if countreg eq . then delete; * remove the dummy row carried over from the schema;
  if age ge 1 and age le 17 then output mylib.UK1995W58kid;
	else output mylib.UK1995W58;
run;

* Remove the formats for viewing the actual values;
data v2;
  set mylib.UK1995W58;
	format _all_;
run;
data v3;
  set mylib.UK1995W58kid;
	format _all_;
run;


** Create the 6.0 datasets **; 

* Read the dataset format into memory;
proc format cntlin = mylib.MTUSschemaW60; run;

* Create the general dataset;
data set60
       (keep= countreg survey swave msamp
              hldid persid id 
              day cday month year diary badcase
              sex age 
              time clockst start end epnum
              main sec av
              inout eloc mtrav 
              ict alone child sppart oad);
  set mylib.schema60 mylib.hvar2;

	countreg = _countreg;
  survey = _survey;
  swave = _swave;
  msamp = _msamp;
  hldid = _hldid;
  persid = _persid;
  id = _id;
  day = _day;
  cday = _cday;
  month = _month;
  year = _year;
  diary = _diary;
  badcase = _badcase;
  sex = _sex;
  age = _age;
  time = _time;
  clockst = _clockst;
  start = _start;
  end = _end;
  epnum = _epnum;
  main = _main;
  sec = _sec;
  av = _av;
  inout = _inout;
  eloc = _eloc;
  mtrav = _mtrav;
  ict = _ict;
  alone = _alone;
  child = _child;
  sppart = _sppart;
  oad = _oad;
run;

* Create the adult and kid datasets;
data mylib.UK1995W60 mylib.UK1995W60kid;
  set set60;
  if countreg eq . then delete; * remove the dummy row carried over from the schema;
  if age ge 1 and age le 17 then output mylib.UK1995W60kid;
	else output mylib.UK1995W60;
run;

* Remove the formats for viewing the actual values;
data v4;
  set mylib.UK1995W60;
	format _all_;
run;
data v5;
  set mylib.UK1995W60kid;
	format _all_;
run;


** Create the UK Supplement dataset **; 

* Read the dataset formats into memory;
proc format cntlin = mylib.MTUSschemaUKSup; run;

* Create the dataset;
data mylib.UK1995extravar
	(keep = survey hldid persid id
	        sexd aged incolab educalab empinlab
          aglftsch ethnic region animal);
  set mylib.schemaUKSup mylib.hvar1;

  survey = _survey;
	hldid = _hldid;
	persid = _persid;
	id = _id;
	sexd = _sex;
	aged = _age;
	incolab = _incorig;
	educalab = _educa;
	empinlab = _empinclm;
  aglftsch = _aglftsch;
	ethnic = _ethnic;
  region = _region;
	animal = -9;
run;

* Remove the dummy row carried over from the schema;
data mylib.UK1995extravar;
  set mylib.UK1995extravar;
  if survey eq . then delete;
run;

* Remove the formats for viewing the actual values;
data v5;
  set mylib.UK1995extravar;
	format _all_;
run;


* Clean up the temp datasets;
/*
proc datasets library=mylib; delete mod1 uk1995_mod hvar1 hvar2; quit;
*/

* eof;