*! Version 3.4 26 May 2014 *! Jean-Benoit Hardouin ************************************************************************************************************ * hcavar: Hierachical Clusters Analysis (HCA) of variables * Version 3.4: May 26, 2014 /* DETECT option available for polytomous items */ * * Use the Detect Stata program (ssc install detect) * * Historic : * Under the name of -hcaccprox- * Version 1 [2004-01-18], Jean-Benoit Hardouin * Version 2 [2004-05-12], Jean-Benoit Hardouin * Version 3 [2005-12-31], Jean-Benoit Hardouin * Version 3.1 [2006-01-15], Jean-Benoit Hardouin /* correction if there is only one individual with a given score*/ * Version 3.2 [2010-04-15], Jean-Benoit Hardouin /* Possibility to use Polytomous Items with CCOR, CCOV and MH*/ * Version 3.3 [2014-05-07], Jean-Benoit Hardouin, Bastien Perrot /* HTML option, if option*/ * Version 3.4 [2014-05-26], Jean-Benoit Hardouin, Bastien Perrot /* DETECT option available for polytomous items */ * * Jean-benoit Hardouin - Department of Biomathematics and Biostatistics - University of Nantes - France * EA 4275 "Biostatistics, Clinical Research and Subjective Measures in Health Sciences" * jean-benoit.hardouin@univ-nantes.fr * * News about this program :http://www.anaqol.org * * Copyright 2004-2006, 2010 Jean-Benoit Hardouin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * ************************************************************************************************************ program define hcavar34, rclass version 9 syntax varlist(min=2 numeric) [if] [in] [,PROX(string) METHod(string) PARTition(numlist) MEASures DETect MATrix(string) noDENDROgram HTML(string)] tempfile hcaccproxfile qui save `hcaccproxfile',replace preserve if "`if'"!="" { qui keep `if' } if "`html'"!="" { //set scheme sj //local htmlregion "graphregion(fcolor(white) ifcolor(white))" di "<!-- SphereCalc start of response -->" di "<pre>" } local nbitems : word count `varlist' tokenize `varlist' local type=0 forvalues i=1/`nbitems' { qui drop if ``i''==. qui inspect ``i'' if r(N_unique)>`type'&r(N_unique)!=. { local type=r(N_unique) } else if r(N_unique)>`type'&r(N_unique)==. { local type "100" } } if `type'==100 { local type ">99" } tempname proximity whereitems local prox=lower("`prox'") local method=lower("`method'") matrix define `proximity'=J(`nbitems',`nbitems',0) matrix define `whereitems'=J(`=`nbitems'-1',`nbitems',0) /**************************PROXIMITIES MEASURES DESCRIPTION************************/ if "`matrix'"!="" { local desprox="Defined by the user" } if "`prox'"=="" { local prox="pearson" } else if "`prox'"=="a" { local prox="jaccard" } else if "`prox'"=="ad" { local prox="matching" } else if "`prox'"=="corr" { local prox="pearson" } if "`type'">"2"&"`prox'"!="pearson"&"`prox'"!="ccov"&"`prox'"!="ccor"&"`prox'"!="mh" { di in red "Only the {hi:pearson}, {hi:ccov} and {hi:ccor} measures of proximity are available with ordinal or numerous variables" di in red "Please correct your {hi:prox} option." exit } if "`partition'"==""&"`detect'"!="" { di in ye "option partition() required" error 198 } local existmeas=0 foreach i in jaccard matching pearson russel dice ccor mh ccov { if "`prox'"=="`i'" { local existmeas=1 } } if `existmeas'==0 { di in red "You must define an existing measure of proximity (jaccard(a), matching(ad), pearson(cor), russel, dice, ccov, ccor, mh)." di in red "Please correct your {hi:prox} option." exit } if "`prox'"=="ccov"|"`prox'"=="mh" { local proxmin=0 } if "`prox'"=="matching" { local desprox="Matching" } else if "`prox'"=="jaccard" { local desprox="Jaccard" } else if "`prox'"=="russel" { local desprox="Russel" } else if "`prox'"=="dice" { local desprox="Dice" } else if "`prox'"=="pearson" { local desprox="Pearson" } else if "`prox'"=="ccov" { local desprox="Conditional covariances" } else if "`prox'"=="ccor" { local desprox="Conditional correlations" } else if "`prox'"=="mh" { local desprox="Mantel Hanzel" } /**************************PROXIMITIES MEASURES DESCRIPTION************************/ if "`method'"=="upgma"|"`method'"=="" { local method="average" } if "`method'"=="wpgma"|"`method'"=="" { local method="waverage" } local vermethod=0 foreach i in average waverage single centroid median complete wards { if "`method'"=="`i'" { local vermethod=1 } } if `vermethod'==0 { di in red "You must define an existing method to define the proximity between two clusters of items:" di in red _col(10) "- single: single linkage" di in red _col(10) "- complete: complete linkage " di in red _col(10) "- average(UPGMA): Unweighted Pair-Group Method of Average" di in red _col(10) "- waverage(WPGMA): Unweighted Pair-Group Method of Average" di in red _col(10) "- wards: Ward's linkage" di in red "Please correct your method option" exit } if "`method'"=="single"|"`method'"=="singlelinkage" { local method single local desmethod="Single linkage" } else if "`method'"=="complete"|"`method'"=="completelinkage" { local desmethod="Complete linkage" } else if "`method'"=="median"|"`method'"=="medianlinkage" { local desmethod="Median linkage (no dendrogram)" } else if "`method'"=="centroid"|"`method'"=="centroidlinkage" { local desmethod="Centroid linkage (no dendrogram)" } else if "`method'"=="average"|"`method'"=="averagelinkage" { local desmethod="Unweighted Pair-Group Method of Average" } else if "`method'"=="waverage"|"`method'"=="waveragelinkage" { local desmethod="Weighted Pair-Group Method of Average" } else if "`method'"=="wards"|"`method'"=="wardslinkage" { local desmethod="Ward's linkage" } forvalues i=1/`nbitems' { matrix `whereitems'[1,`i']=`i' } tempvar score genscore `varlist',score(`score') qui su `score' local maxscore=r(max) forvalues k=0/`maxscore' { qui count if `score'==`k' local nk`k'=r(N) } qui count local N=r(N) di in green "{hline 80}" di in green "Number of individuals with none missing values: " in ye `N' di in green "Maximal number of modalities for a variable: " in ye "`type'" di in green "Proximity measures: " in ye "`desprox'" di in green "Method to aggregate clusters: " in ye "`desmethod'" di in green "{hline 80}" di di /*************************Measure of proximities*********************************/ if "`matrix'"=="" { forvalues i=1/`nbitems' { forvalues j=`=`i'+1'/`nbitems' { /***********************************Proximity AD*************************/ if "`prox'"=="matching" { /*ad*/ qui count if ``i''==1&``j''==1 local tmp11=r(N) qui count if ``i''==0&``j''==0 local tmp00=r(N) matrix `proximity'[`i',`j']=sqrt(1-(`tmp11'+`tmp00')/`N') matrix `proximity'[`j',`i']=`proximity'[`i',`j'] } /***********************************Proximity A**************************/ else if "`prox'"=="jaccard" { /*a*/ qui count if ``i''==1&``j''==1 local tmp11=r(N) qui count if ``i''==0&``j''==0 local tmp00=r(N) matrix `proximity'[`i',`j']=sqrt(1-`tmp11'/(`N'-`tmp00')) matrix `proximity'[`j',`i']=`proximity'[`i',`j'] } /***********************************Proximity Russel**************************/ else if "`prox'"=="russel" { qui count if ``i''==1&``j''==1 local tmp11=r(N) matrix `proximity'[`i',`j']=sqrt(1-`tmp11'/`N') matrix `proximity'[`j',`i']=`proximity'[`i',`j'] } /***********************************Proximity A**************************/ else if "`prox'"=="dice" { qui count if ``i''==1&``j''==1 local tmp11=r(N) qui count if ``i''==0&``j''==0 local tmp00=r(N) matrix `proximity'[`i',`j']=sqrt(1-2*`tmp11'/(`N'+`tmp11'-`tmp00')) matrix `proximity'[`j',`i']=`proximity'[`i',`j'] } /**********************************Proximity COR*************************/ else if "`prox'"=="pearson" { /*corr*/ qui corr ``i'' ``j'' matrix `proximity'[`i',`j']=sqrt(2*(1-r(rho))) matrix `proximity'[`j',`i']=`proximity'[`i',`j'] } /***********************************Proximity CCOV**********************/ else if "`prox'"=="ccov" { local dij=0 local Ntemp=`N' forvalues k=1/`=`maxscore'-1' { if `nk`k''!=0 { if `nk`k''>1 { qui corr ``i'' ``j'' if `score'==`k',cov local covi`i'j`j'k`k'=r(cov_12) } else if `nk`k''==1 { local Ntemp=`Ntemp'-1 local covi`i'j`j'k`k'=0 } else { local covi`i'j`j'k`k'=0 } local dij=`dij'+`covi`i'j`j'k`k''*`nk`k'' } } matrix `proximity'[`i',`j']=-`dij'/`Ntemp' matrix `proximity'[`j',`i']=`proximity'[`i',`j'] if `proxmin'<`dij'/`Ntemp' { local proxmin=`dij'/`Ntemp' } } /***********************************Proximity CCOR**********************/ else if "`prox'"=="ccor" { local dij=0 local nnull=0 local Ntemp=`N' forvalues k=1/`=`maxscore'-1' { if `nk`k''!=0 { if `nk`k''>1 { qui corr ``i'' ``j'' if `score'==`k' local cori`i'j`j'k`k'=r(rho) } else if `nk`k''==1 { local Ntemp=`Ntemp'-1 local cori`i'j`j'k`k'=0 } else { local cori`i'j`j'k`k'=0 } if `cori`i'j`j'k`k''!=. { local dij=`dij'+`cori`i'j`j'k`k''*`nk`k'' } else if `cori`i'j`j'k`k''==. { local nnull=`nnull'+`nk`k'' } } } matrix `proximity'[`i',`j']=sqrt(2*(1-`dij'/(`Ntemp'-`nnull'))) matrix `proximity'[`j',`i']=`proximity'[`i',`j'] } /***********************************Proximity MH************************/ else if "`prox'"=="mh" { local numij=0 local denom=0 forvalues k=1/`=`maxscore'-1' { if `nk`k''!=0 { qui count if ``i''==1&``j''==1&`score'==`k' local A=r(N) qui count if ``i''==0&``j''==1&`score'==`k' local B=r(N) qui count if ``i''==1&``j''==0&`score'==`k' local C=r(N) qui count if ``i''==0&``j''==0&`score'==`k' local D=r(N) local numij=`numij'+`A'*`D'/`nk`k'' local denomij=`denomij'+`B'*`C'/`nk`k'' } } matrix `proximity'[`i',`j']=-log(`numij'/`denomij') matrix `proximity'[`j',`i']=`proximity'[`i',`j'] if `proxmin'<log(`numij'/`denomij') { local proxmin=-`proximity'[`i',`j'] } } } } if "`prox'"=="ccov"|"`prox'"=="mh" { forvalues i=1/`nbitems' { forvalues j=`=`i'+1'/`nbitems' { matrix `proximity'[`i',`j']=`proximity'[`i',`j']+`proxmin' if `proximity'[`i',`j']<0 { matrix `proximity'[`i',`j']=0 } matrix `proximity'[`j',`i']=`proximity'[`i',`j'] } } } } /**********************END OD THE COMPUTING OF THE PROXIMITIES**************************************/ else { matrix `proximity'=`matrix' } matrix rowname `proximity'=`varlist' matrix colname `proximity'=`varlist' if "`measures'"!="" { di in green "{hline 50}" di in green "Measures of proximity between the items" di in green "{hline 50}" matrix list `proximity', noheader di } /**********************CLUSTERING PROCEDURE **********************************************/ qui clustermat `method' `proximity',clear labelvar(name) local hor "hor" if "`method'"!="centroid"&"`method'"!="median"&"`dendrogram'"=="" { if "`html'" != "" { qui local saving "saving(`c(tmpdir)'/`html'_dendro,replace) nodraw" qui cluster dendro ,labels(name) hor ylabel(,angle(0)) title("Hierarchical Cluster Analysis on variables") subtitle("`desmethod'") xtitle("`desprox' proximities") `saving' qui graph use `c(tmpdir)'/`html'_dendro.gph qui graph export `c(tmpdir)'/`html'_dendro.eps, replace di "<br />" di "<img src=" _char(34) "/data/`html'_dendro.png" _char(34) di " class=" _char(34) "resgraph" _char(34) " alt=" _char(34) "dendro" _char(34) " title= " _char(34) "Hierarchical Cluster Analysis on variables - click to enlarge" _char(34) " width=" _char(34) "350" _char(34) " height=" _char(34) "240" _char(34) " >" } else { qui cluster dendro ,labels(name) hor ylabel(,angle(0)) title("Hierarchical Cluster Analysis on variables") subtitle("`desmethod'") xtitle("`desprox' proximities") } } if "`partition'"!="" { foreach i of numlist `partition' { qui cluster gen cluster`i'=group(`i') } tempname clusters mkmat cluster* ,mat(`clusters') matrix rownames `clusters'=`varlist' local compteur=0 foreach i of numlist `partition' { local ++compteur di di in green "{hline 30}" di in green "Partition in `i' cluster(s)" di in green "{hline 30}" di forvalues j=1/`i' { local cluster`i'_`j' local nbi`i'_`j'=0 forvalues k=1/`nbitems' { if `clusters'[`k',`compteur']==`j' { local cluster`i'_`j' `cluster`i'_`j'' ``k'' local ++nbi`i'_`j' } } di in green "Cluster `j': " in ye "`cluster`i'_`j''" } } return matrix clusters=`clusters' } /**********************DETECT OPTION **************************************************/ use `hcaccproxfile',clear if "`detect'"!="" { foreach i of numlist `partition' { local liste local part forvalues j=1/`i' { local liste "`liste' `cluster`i'_`j''" local part "`part' `nbi`i'_`j''" } qui detect `liste',part(`part') local detect`i'=r(DETECT) local Iss`i'=r(Iss) local R`i'=r(R) } tempname indexes matrix define `indexes'=J(`compteur',4,0) matrix colnames `indexes'=Clusters DETECT Iss R di "" di in green "{hline 50}" di in green "Indexes to compare the partitions of the items" di in green "{hline 50}" di "" di in green _col(29) "DETECT" _col(43) "Iss" _col(56) "R" local compteur=0 foreach k of numlist `partition' { local ++compteur matrix `indexes'[`compteur',1]=`k' matrix `indexes'[`compteur',2]=`detect`k'' matrix `indexes'[`compteur',3]=`Iss`k'' matrix `indexes'[`compteur',4]=`R`k'' di _col(5) in green "`k' cluster(s):" _col(27) in yellow %8.5f `detect`k'' _col(38) %8.5f `Iss`k'' _col(49) %8.5f `R`k'' } return matrix indexes=`indexes' } return local nbvar=`nbitems' return matrix measures=`proximity' restore, not *use `hcaccproxfile',clear end