*! Polychoric correlations -- v.1.4.3, by Stas Kolenikov program define polychoric, rclass version 8.2 #delimit ; syntax varlist(min=2 numeric) [if] [in] [aw fw pw /], [pca pw VERBose NOLOG SCore(str) dots IMissing NSCore(int 0) noINIT *] ; #delimit cr * PCA to perform PCA * pw for pairwise correlations * verbose to output the correlation type, rho, s.e., and goodness of fit if applicable * NOLOG * score to generate scores from PCA * dots to entertain the user with % signs * imissing to believe that the missing of an ordinal variables should be imputed zero if "`imissing'" == "" { local imissing not } if "`score'"=="" & `nscore'>0 { di as err "cannot specify nscore without score" exit 198 } * else local nscore=5 * this is a bit weird: what if the user specifies -score- * without the name of the new variable? Then it gets into `options' if index("`options'","score") { di as err "must specify new variable prefix with -score-" exit 198 } if `:word count `varlist'' == 2{ local verbose verbose } tempvar w1 if "`weight'" ~= "" { qui g double `w1' = `exp' local www [`weight'=`w1'] } else { qui g double `w1' = 1 local www [pw=`w1'] local exp `w1' } * that way, we always have weights if "`score'"~="" { confirm new var `score'1 } if "`pw'"==""{ marksample touse } else { marksample touse, novar } tokenize `varlist' local nvar: word count `varlist' tempname corrmat mat `corrmat' = J(`nvar',`nvar',.) forvalues i=1/`nvar' { mat `corrmat'[`i',`i']=1 } mat rown `corrmat' = `varlist' mat coln `corrmat' = `varlist' * compress `varlist' local i=1 local ndots = `nvar'*(`nvar'-1)/2 local idots 0 while "``i''"~="" { local j = `i'+1 while "``j''"~="" { polych1 ``i'' ``j'' `www' if `touse' , `options' `init' if "`dots'"~="" { if int(`idots'/`ndots'*10)-int(`++idots'/`ndots'*10)~=0 { di as text int(10*`idots'/`ndots') "0%" _c } else { di as text "." _c } } mat `corrmat'[`i',`j']=r(rho) mat `corrmat'[`j',`i']=r(rho) if "`verbose'"=="verbose" { di _n /// as text "Variables : " as res "``i'' ``j''" _n /// as text "Type : " as res r(type) _n /// as text "Rho = " as res r(rho) _n /// as text "S.e. = " as res r(se_rho) if "`r(type)'" == "polychoric" { di as text "Goodness of fit tests:" _n /// as text "Pearson G2 = " as res r(G2) /// as text ", Prob( >chi2(" as res r(dfG2) as text")) = " as res r(pG2) _n /// as text "LR X2 = " as res r(X2) /// as text ", Prob( >chi2(" as res r(dfX2) as text")) = " as res r(pX2) } } local `j++' } local i=`i'+1 tokenize `varlist' } return add if "`verbose'" == "" { di as text _n "Polychoric correlation matrix" mat li `corrmat', noheader } if "`pca'"~="" { return clear polypca `corrmat' `touse' `www' `imissing' `nscore' `score' return add /* if "`score'"~="" { cap noi sum `score'* } */ } return matrix R `corrmat' end prog def polypca, rclass * perform PCA with the estimated matrix args corrmat touse www imissing nscore score * the correlation matrix * parse the weights tokenize `www' , parse(" [=]") * should become `1' == [, `2' == weight type, `3' == "=", `4' == exp, `5' == ] local exp `4' tempname X V value if `nscore'==0 local nscore . mat symeigen `X' `V' = `corrmat' * `V' are the eigenvalues, `X' are the eigenvectors local nvar = colsof(`corrmat') local p = min(`nvar',`nscore') di _n as text "Principal component analysis" _n(2) " k {c |} Eigenvalues {c |} Proportion explained {c |} Cum. explained" di as text "{dup 4:{c -}}{c +}{dup 15:{c -}}{c +}{dup 24:{c -}}{c +}{dup 18:{c -}}" local sum=0 forvalues i=1/`nvar' { return scalar lambda`i' = `V'[1,`i'] local sum = `sum'+return(lambda`i') #delimit ; di as res " "`i' as text " {c |} " as res %9.6f `V'[1,`i'] _col(21) as text "{c |} " as res %9.6f `V'[1,`i']/`nvar' _col(46) as text "{c |} " as res %8.6f `sum'/`nvar' ; #delimit cr } if "`score'"~= "" { * set trace on local varlist : rownames `X' tempvar tt ii ** mat li `X' di as text _n _col(15) "{bf: Scoring coefficients}" _n(2) /// " Variable {c |} Coeff. 1 {c |} Coeff. 2 {c |} Coeff. 3 " /// _n "{dup 54:{c -}}" _c qui foreach x of varlist `varlist' { noi di _n as res " `x'" _col(16) _c * is it continuous or discrete? cap drop `tt' cap drop __tt`x' * cap confirm byte var `x' * need to properly determine if continuous or discrete cap inspect `x' if r(N_unique)>9 { * continuous * egen __tt`x' = std(`x') if `touse' sum `x' [iw=`exp'] g double __tt`x' = (`x'-r(mean))/r(sd) if `touse' forvalues i=1/3 { noi di as text " {c |} " as res %9.6f /* sqrt(`V'[1,`i'])* */`X'[rownumb(`X',"`x'"),`i'] " " _c } } else { /* cap tab `x' if `touse' if r(r) > 10 | _rc == 134 { * quasi-continuous * egen __tt`x' = std(`x') if `touse' sum `x' [iw=`exp'] g double __tt`x' = (`x'-r(mean))/r(sd) if `touse' forvalues i=1/3 { noi di as text " {c |} " as res %9.6f `X'[rownumb(`X',"`x'"),`i'] " " _c } } else { */ * discrete; make it a centered categorized normal ** noi di as text " : ordinal" _c local ncat = r(N_unique) sum `exp', mean local N = r(sum) sort `touse' `x' cap drop `ii' egen byte `ii' = group(`x') if `touse' local p0 = 0.1/`N' local t0 = invnorm(`p0') ** noi di `t0' forvalues k=1/`ncat' { sum `exp' if `ii'<=`k' , mean local p`k' = (r(sum)-0.5)/`N' local t`k' = invnorm(`p`k'') ** noi di `t`k'' } local p`ncat' = (`N'-0.1)/`N' local t`ncat' = invnorm(`p`ncat'') ** noi di `t`ncat'' gen double __tt`x' = 0 if `touse' forvalues k=1/`ncat' { local k1 = `k'-1 scalar `value' = ( exp(-.5*`t`k1''*`t`k1'') - exp(-.5*`t`k''*`t`k'') ) /// /(sqrt(2*_pi)*(norm(`t`k'')-norm(`t`k1'') ) ) replace __tt`x' = `value' if `touse' & `ii'==`k' * need to determine what was the original category sum `x' if `ii'==`k' noi di _n _col(14) as res %-2.0f r(mean) _c forvalues i=1/3 { noi di as text " {c |} " as res %9.6f /* sqrt(`V'[1,`i'])* */ `X'[rownumb(`X',"`x'"),`i']*`value' " " _c } } if "`imissing'" == "imissing" { replace _tt`x' = 0 if `touse' & mi(`x') } ** } } } di nobreak { qui forvalues i=1/`p' { * we'll score `p' components prefixed by `score' gen double `score'`i'=0 if `touse' foreach x of varlist `varlist' { replace `score'`i' = `score'`i' + /* sqrt(`V'[1,`i'])* */ `X'[rownumb(`X',"`x'"),`i']*__tt`x' if `touse' } } } } return matrix eigenvalues `V' return matrix eigenvectors `X' polyquit end prog def polych1, rclass syntax varlist(numeric min=2 max=2) if [aw fw pw /] [, noINIT * ] *** !!! какие-то глюки с score nscore local www [`weight'=`exp'] marksample touse local x1 `1' local x2 `2' forvalues i = 1/2 { /* prior to 1.3.2 cap confirm byte var `x`i'' if _rc==7 { local call `call'c continue } cap tab `x`i'' * if more than 10 categories -- treat as continuous if r(r)>9 | _rc==134 { */ cap inspect `x`i'' if r(N_unique)>9 { local call `call'c } else { local call `call'd } } if "`call'"=="cc" { * both are continuous ** qui corr `x1' `x2' `www' if `touse' tempname A qui mat accum `A' = `x1' `x2' `www' if `touse' , nocons dev return scalar sum_w = r(N) qui count if !mi(`x1') & !mi(`x2') & !mi(`exp') return scalar N=r(N) mat `A' = corr(`A') return scalar rho = `A'[1,2] return local type Pearson return scalar se_rho = sqrt( (1-return(rho)*return(rho))/(return(N)-2) ) exit } if "`call'"=="dc" { * the first variable has to be continuous, and the second, discrete local call cd `x2' `x1' `www' if `touse' return local type polyserial } if "`call'"=="cd" { * the first variable has to be continuous, and the second, discrete local call cd `x1' `x2' `www' if `touse' return local type polyserial } if "`call'"=="dd" { * the first variable has to be continuous, and the second, discrete local call dd `x2' `x1' `www' if `touse' return local type polychoric } cap noi corr`call' , `init' `options' *********** * set trace off if _rc==1 { polyquit exit 1 } return add end prog def corrdd, rclass sort * the module to compute the polychoric correlation **************** * set tracedepth 2 * set trace on syntax varlist(numeric min=2 max=2) if [aw fw pw /], [ * noINIT ITERate(int 50) SEArch(str) ] cap confirm integer number `search' if !_rc { local searchstr search(quietly) repeat(`search') } else { if "`search'" == "" { local searchstr search(off) } else { local searchstr search(`search') } } marksample touse * compute the thresholds tempvar x1 x2 cap drop __POLY* eret clear ret clear qui forvalues k = 1/2 { sort `touse' ``k'' egen `x`k'' = group(``k'') if `touse' tab `x`k'' local r`k' = r(r) sum `exp' if `touse', meanonly local N = r(sum) return scalar sum_w = `N' return scalar N = r(N) gen __POLY`k'hi = . gen __POLY`k'lo = . forvalues h = 1/`r`k'' { * create the variables: upper threshold - lower threshold local h1 = `h'-1 sum `exp' if `x`k''<=`h' & `touse', meanonly replace __POLY`k'hi = cond(`h'==`r`k'',10,invnorm( (r(sum)-.5)/`N' ) ) if `x`k'' == `h' & `touse' sum `exp' if `x`k'' <= `h1' & `touse', meanonly replace __POLY`k'lo = cond(`h'==1,-10,invnorm( (r(sum)-.5)/`N' ) ) if `x`k'' == `h' & `touse' } } qui corr `1' `2' if `touse' local mcorr = r(rho) local mcorr = sign(`mcorr')*min(0.9, (1+abs(`mcorr'))/2) if "`init'" == "noinit" { local initstr init(_cons = `mcorr') } else { local initstr } * shouldn't -collapse- come somewhere here so that we don't have * to compute a complicated bivariate distribution for too many observations? * * needed further: __POLY([1|2][hi|lo]&pi); sum of weights; `touse' * no, for some reason, it did not work: the s.e.s are wrong qui gen double __POLYpi = . preserve qui keep if `touse' cap ml model lf polych_ll (rho: `touse' =) if `touse' /// [fw=`counts'] [`weight' = `exp'] /// , maximize `options' search(off) bounds(rho: -1 1) init(_cons = 0) iter(0) nolog if _rc==1 { polyquit exit 1 } local ll0c = e(ll) cap noi ml model lf polych_ll (rho: `touse' =) if `touse' /// [fw=`counts'] [`weight' = `exp'] /// , maximize `options' `searchstr' bounds(rho: -1 1) /// `initstr' iter(`iterate') nolog local rc=_rc if `rc'==1 { polyquit exit 1 } else if `rc' { cap noi ml model lf polych_ll (rho: `touse' =) if `touse' /// [fw=`counts'] [`weight' = `exp'] /// , maximize `options' search(quietly) repeat(10) bounds(rho: -1 1) /// iter(`iterate') nolog } return scalar rho = _b[_cons] return scalar se_rho = _se[_cons] collapse (sum) `exp' (mean) `touse' (mean) __POLY* if `touse', by(`x1' `x2') * tests * null hypothesis: no structure local df0 = `r1'*`r2' - `r1' - `r2' tempvar ll pp tempvar counts qui g `ll' = sum( `exp'*ln(`exp'/`N') ) local ll0 = `ll'[_N] * Likelihood ratio return scalar G2 = 2*(`ll0'-e(ll)) return scalar dfG2 = `df0' return scalar pG2 = chi2tail(`df0',return(G2)) * Pearson chi-square qui g double `pp' = sum( ((`exp'/`N'-__POLYpi)^2)/__POLYpi ) return scalar X2 = `pp'[_N]*`N' return scalar dfX2 = `df0' return scalar pX2 = chi2tail(`df0',return(X2)) restore * no correlation return scalar LR0 = -2*(`ll0c'-e(ll)) return scalar pLR0 = chi2tail(1,return(LR0)) polyquit end prog def corrcd, rclass * the module to compute the polyserial correlation syntax varlist(numeric min=2 max=2) if [aw fw pw /], [ * ITERate(int 50) noINIT SEArch(str) ] * the first variable is continuous, the second is discrete cap confirm integer number `search' if !_rc { local searchstr search(quietly) repeat(`search') } else { if "`search'" == "" { local searchstr search(off) } else { local searchstr search(`search') } } marksample touse * thresholds for the discrete part tempvar x1 x2 cap drop __POLY* ret clear qui{ * egen `x1' = std(`1') if `touse' sum `1' [iw=`exp'] g double `x1' = (`1'-r(mean))/r(sd) if `touse' sort `touse' `2' `1' egen `x2' = group(`2') if `touse' tab `x2' if `touse' local r2 = r(r) sum `exp' if `touse', mean local N = r(sum) return scalar N = r(N) return scalar sum_w = `N' gen __POLY2hi = . gen __POLY2lo = . forvalues h = 1/`r2' { * create the variables: upper threshold - lower threshold local h1 = `h'-1 sum `exp' if `x2'<=`h' & `touse' replace __POLY2hi = cond(`h'==`r2',10,invnorm( (r(sum)-.5)/`N' ) ) if `x2' == `h' & `touse' sum `exp' if `x2' <= `h1' & `touse' replace __POLY2lo = cond(`h'==1,-10,invnorm( (r(sum)-.5)/`N' ) ) if `x2' == `h' & `touse' } spearman `1' `2' if `touse' local mcorr = r(rho) } if "`init'" == "noinit" { local initstr init(_cons = `mcorr') } else { local initstr init(_cons = 0) } eret clear cap ml model lf polyser_ll (rho: `x1' =) if `touse' [`weight'=`exp'] , /// maximize `options' search(off) bounds(rho: -1 1) init(_cons = 0) iter(0) nolog if _rc==1 { polyquit exit 1 } local ll0c = e(ll) cap noi ml model lf polyser_ll (rho: `x1' =) if `touse' [`weight'=`exp'] , /// maximize `options' `searchstr' bounds(rho: -1 1) `initstr' nolog iter(`iterate') local rc=_rc if `rc'==1 { polyquit exit 1 } if `rc' { cap noi ml model lf polyser_ll (rho: `x1' =) if `touse' [`weight'=`exp'] , /// maximize `options' `bounds(rho: -1 1) nolog iter(`iterate') /// search(quietly) repeat(10) } * return the correlation coefficient return scalar rho = _b[_cons] return scalar se_rho = _se[_cons] * no correlation return scalar LR0 = -2*(`ll0c'-e(ll)) return scalar pLR0 = chi2tail(1,return(LR0)) end pro def polyquit cap drop __POLY* end exit History: v.1.1 -- Aug 2003 The basic development of everything v.1.2 -- November 2003 -- weights accomodated -- imissing option v.1.3 -- February 2004 -- polychoricpca as a separate command -- nscore option added -- changed the order of arguments in polychoric.polypca v.1.3.2 -- -inspect- is used to count the number of categories in place of -tab-; no need to -compress- v.1.3.3 -- iterate, search, and other stuff to failsafe convergence v.1.3.4 -- init string changed, score option clarified v.1.4 -- weights dealt with properly v.1.4.1 -- April 27, 2004 -- bug with PCA fixed (categorical variables not recognized properly) v.1.4.2 -- output the original category numbers v.1.4.3 -- the default matrix is filled with missing values rather than zeroes