************************************************************
************************************************************
***														 ***
***		    Do-file for working with pairfam data   	 ***
***			   	 EVENT-HISTORY ANALYSIS					 ***
***              	DATA SET "BIOPART"		             ***
***                   Release 13.0			             ***
***	  													 ***
***					   May 2022		   	                 ***
***														 ***
***			    Author: Nina Schumann					 ***
***														 ***
************************************************************
************************************************************

* This do-file shows some common operations for working with
* the generated data set "biopart" when applying event-history analyses.
* Adapt it to your problem, and you should be ready
* to start with your analysis.
*
* Note for waves 12 & 13: Before running this do-file, please append the datasets 
* anchor12_capi.dta (anchor13_capi.dta) and anchor12_cati.dta (anchor13_cati.dta) 
* and name the combined dataset anchor12.dta (anchor13.dta). For more information
* on the mode change in W12/W13, please see the Data Manual, Chapter 12. 


***************************************************************************
***                     PRELIMINARIES                                   ***
***************************************************************************

* Start from an empty session and switch off the --more-- paging of output
clear all
set more off

* Directory of the original data; replace the placeholder before running
global inpath `""insert your datapath here""'

***********************************************************
****   LOADING THE DATA      ************
***********************************************************

* Change to the data directory and load the partner episode data
cd ${inpath}
use biopart, clear

* Inspect the first ten rows to see how the data set is organised
browse in 1/10
		

***********************************************************
***** GENERATING EPISODE DATA ON UNION DURATION *****
***********************************************************

***** 1) Preparing the data for event-history analysis (EHA) ****

* Retain only the variables used in this do-file
keep id sample partindex intdat* dob sexp relbeg relend b?beg b?end ///
	partcurrw* *flag* cohbeg cohend

* relbeg==-3: no relationship episode exists, so there is nothing to analyse
drop if relbeg == -3
sort id partindex

* Failure indicator (event): 1 = separation/divorce, 0 = still ongoing.
* relend==-99 identifies unions that have not ended (they are censored below).
generate separ = (relend != -99)
tabulate separ, missing

* Date of the last interview = largest interview date across all waves
egen intdat_max = rowmax(intdatw*)

* Ongoing (censored) unions end, for analysis purposes, at the last interview
replace relend = intdat_max if relend == -99

* Remove unions whose start or end date is missing (code -7)
drop if inlist(-7, relbeg, relend)

* Remove unions that ended because the partner died (code -66)
drop if relend == -66

* Age (in years) at the start and at the end of the union
generate ageunion    = (relbeg - dob) / 12
generate ageendunion = (relend - dob) / 12

* Unions starting before age 10 are not considered credible
drop if ageunion < 10

* Unions ending before age 14 should not occur in the data by design
drop if ageendunion < 14

* Inspect all flag variables, then remove unions flagged for an
* inconsistent year of birth of the partner (biopartflag4)
tab1 biopartflag?, missing
drop if biopartflag4 == 1

* Number of failures remaining after the case selection
tabulate separ, missing

* Union duration = end date minus start date.
* A duration of exactly 0 is set to 0.5 so that it is strictly positive.
generate dur = relend - relbeg
replace dur = 0.5 if dur == 0
tabulate dur, missing



***** 2) Analyzing the data ************

* Tell Stata that these are event-history data:
* process time is "dur", the event/failure indicator is "separ"
stset dur, failure(separ)

* Compare the original variables with the st variables that stset created
list id partindex relbeg relend dur separ _t0 _t _d in 1/10, nolabel

* Smoothed hazard function with confidence band
sts graph, hazard ci width(2) tmax(316) xlabel(0(24)316)





********************************************
******* TAKING REGARD OF BREAKS ************
********************************************

* Until now we have ignored the information on union breaks. For over 1,600 unions
* we have a first break. For some of these unions we have further breaks. 
* For unions with a break, the duration is too long if we ignore the breaks.
* Strategy: A break is treated like a separation (the start of a break defines
*           the end of a union episode, the end of a break defines the start 
*           of a new union episode). Thus, we will end up with a
*           dataset with multiple episodes for each union with break(s).

***** 1) Imputing missing date information ****

* Break dates are stored in b1beg/b1end ... b7beg/b7end; -7 marks a missing date.
* If exactly one of the two dates of a break is missing, impute it with the
* other one (i.e. assume a very short break).
* If both dates are missing nothing is imputed: such a break has no positive
* begin date, so it is not counted in "nbreaks" below.
* NOTE(review): a break with both dates missing is skipped rather than
* physically removed; if a valid later break followed it, the episode
* numbering built from "nbreaks" further below would not line up - confirm
* that this constellation does not occur in the data.
forvalues z = 1/7 {
	quietly replace b`z'beg = b`z'end if b`z'beg == -7 & b`z'end > 0
	quietly replace b`z'end = b`z'beg if b`z'beg > 0   & b`z'end == -7
}

* Number of unions with (valid) information for each break.
* "foreach" replaces the out-of-date "for" command used previously.
foreach v of varlist b?beg {
	quietly count if `v' > 0
	display as text "count if `v'>0: " as result r(N)
}

* Number of breaks per union
generate nbreaks = 0
forvalues x = 1/7 {
	replace nbreaks = nbreaks + 1 if b`x'beg > 0
}
tabulate nbreaks, missing
* One union has 7 breaks. For this union we have to generate 8 episodes.



***** 2) Defining the beginning and end dates of the episodes ****

* We checked that breaks lie between the beginning and the end of the union and do
* not overlap within a union. They are stored in chronological order.
* Furthermore, the duration of a break is always >=0.	

* Start dates of the episodes:
* episode 1 starts with the union itself; episode k (k = 2,...,8) starts
* when the previous break (k-1) ends.
generate relbeg1 = relbeg
forvalues epi = 2/8 {
	local brk = `epi' - 1
	generate relbeg`epi' = b`brk'end if b`brk'end > 0
}

* End dates of the episodes:
* episode k (k = 1,...,7) ends when break k starts; the last episode of a
* union (episode number nbreaks+1) ends at the end date of the union.
forvalues epi = 1/7 {
	generate relend`epi' = b`epi'beg if b`epi'beg > 0
	replace  relend`epi' = relend    if nbreaks + 1 == `epi'
}

* An 8th episode exists only if there is a 7th break; it ends with the union
generate relend8 = relend if b7end > 0


***** 3) Switch to long-format (multiple episodes per union) data ****

* Every row is one union at this point: number the unions consecutively
generate unionid = _n

* Keep the episode-specific dates plus the variables needed later on.
* The original relbeg/relend are dropped here, so reshape can create
* new long-format variables with these names.
keep unionid id intdat* dob sexp relbeg1-relbeg8 relend1-relend8 ///
	nbreaks separ partcurrw* sample cohbeg cohend

* One row per union and episode number (epinr = 1,...,8)
reshape long relbeg relend, i(unionid) j(epinr)

* Episode slots that a union does not use have no dates: remove them
drop if relbeg == . | relend == .
tabulate epinr, missing


***** 4) Preparing the data for EHA *********

* Failure indicator: 1 = separation/divorce.
* Every episode of a union except its last one ended with a break, and a
* break counts as a separation; the last episode keeps its original value.
sort unionid epinr
by unionid: replace separ = 1 if _n != _N
tabulate separ, missing

* Episode duration; a duration of exactly 0 is set to 0.5
generate dur = relend - relbeg
replace dur = 0.5 if dur == 0


***** 5) Analyzing the data ************

* Declare the episode data (breaks taken into account) as event-history data
stset dur, failure(separ)

* Smoothed hazard function with confidence band
sts graph, hazard ci width(2) tmax(316) xlabel(0(24)316)




********************************************
******* EHA WITH TIME-CONSTANT COVARIATES ************
********************************************

* Two time-constant covariates: cohort, country of birth

* Add cohort and country of birth from the anchor data. Each merge is
* followed by a check of the match pattern by sample; rows that exist only
* in the anchor file (_merge==2) are removed.

* Step 1: anchor1 (pairfam main sample)
merge m:1 id using anchor1, keepusing(cohort cob)
tabulate sample _merge, missing
drop if _merge == 2
drop _merge

* Step 2: anchor1_dd (DemoDiff subsample); "update" fills in values
* that are still missing after step 1
merge m:1 id using anchor1_dd, keepusing(cohort cob) nolabel update
tabulate sample _merge, missing
drop if _merge == 2
drop _merge

* Step 3: anchor11 (refreshment subsample), again filling in missing values
merge m:1 id using anchor11, keepusing(cohort cob) nolabel update
tabulate sample _merge, missing
drop if _merge == 2
drop _merge
	
* Cohort	
tab cohort, m

* Country of birth: Federal Republic of Germany, German Democratic Republic, other
tab cob, m
recode cob -7=.                 // -7 is set to system missing
recode cob 4/23=3               // codes 4 to 23 are merged into category 3 ("Other")
* "replace" keeps this step from aborting with r(110) when a value label named
* "cob" already exists (e.g. when the section is run a second time, or when a
* label of that name was copied from the anchor data by the merge above).
lab def cob 1 "Federal Republic of Germany" 2 "German Democratic Republic" 3 "Other", replace
lab val cob cob

* Sort data set
sort id unionid

* Multivariate Cox model; standard errors are clustered on the person (id),
* because one person can contribute several unions/episodes
stcox i.cohort i.cob, vce(cluster id)  

* Conditional Effect Plots (rate function)
* First plot: one hazard curve per cohort
stcurve, hazard at1(cohort=1) at2(cohort=2) at3(cohort=3) at4(cohort=4)
* Second plot: one hazard curve per country of birth.
* The y-axis title previously read "separatioon rate" (typo), fixed here.
stcurve, hazard at1(cob=1) at2(cob=2) at3(cob=3) xsize(4.5)				///
       lwidth(thick thick thick)                                        ///
       xtitle("union duration in months", size(large) margin(0 0 0 2))  ///
	   ytitle("separation rate", size(large))                			///            
	   legend(pos(1) ring(0) row(2) order(1 2 3) lab(1 "FRG")           ///
		lab(2 "GDR") lab(3 "other") size(medlarge))   					///	
       xlabel(, labsize(medium))                                 ///
	   ylabel(, angle(0) grid labsize(medium) format(%5.3f))

	



********************************************
******* EHA WITH TIME-VARYING COVARIATES ************
********************************************

* additional time-varying covariate: cohabitation 

* Look at the current data structure (several episodes per union possible)
list id unionid relbeg relend dur _t0 _t _d in 1/10, nolabel

* Keep exactly one episode per union: here the first one (epinr==1).
* Reason given by the original author: with more than one episode per
* unionid, stset would create the st variables for only one, randomly
* selected episode. Keeping the first episode is merely a suggestion; the
* first, the last, or any other specific episode could be chosen instead,
* depending on the research question.
keep if epinr == 1

* stsplit needs an ID variable, and not every observation is a different
* union, so the union identifier is declared via id(unionid)
stset dur, failure(separ) id(unionid)

* Episode splitting: cut every spell at each month of process time.
* T0 holds the start of each resulting sub-episode.
stsplit T0, every(1)

* Time-varying covariate "cohab" (currently cohabiting: 1 = yes, 0 = no).
* Ongoing cohabitations (cohend==-99) are given the last interview date as
* their end. A month counts as cohabiting if its date (relbeg+T0) is on or
* after cohbeg and before cohend.
* NOTE(review): negative codes in cohbeg are compared like ordinary dates
* here, so a union with a negative cohbeg and a valid cohend is coded as
* cohabiting from its start - confirm that this is intended.
replace cohend = intdat_max if cohend == -99
generate cohab = ((relbeg + T0) >= cohbeg & (relbeg + T0) < cohend)
tabulate cohab, missing

* Re-join adjacent sub-episodes with identical values (no loss of
* information). T0 differs between sub-episodes and therefore has to be
* dropped before stjoin.
drop T0
stjoin

* Look at the data structure again (original author's note: the first ID now
* has 8 rows because relationship 5 was split)
list id unionid relbeg relend dur cohab _t0 _t _d in 1/10, nolabel

* Multivariate Cox model including the time-varying covariate
stcox i.cohort i.cob cohab, vce(cluster id)