/*=====================================================================
  Citation Count Analysis Script
  Author:        Brian Nosek
  Creation Date: 14 August 2009
  Last revision: 26 March 2010

  Purpose: reads a tab-delimited file of per-researcher citation
  indicators, derives log-transformed and career-stage-adjusted scores,
  builds individual-level and department-level composites, and runs the
  descriptive, correlation and regression analyses on them.

  Input : C:\primary\Data\Citations\individuals.txt (tab-delimited,
          first row skipped as a header)
  Output: listing output, plus citationresiduals.html written through
          ODS to C:\primary\Data\Citations\
 =====================================================================*/

libname web 'C:\primary\Data\Citations\';

/*---------------------------------------------------------------------
  GLOSSARY OF INDICES

  CORE INDIVIDUAL INDICES
    Cumulative impact
      TotalCites  = total citation count from Publish or Perish analysis
      hindex      = h calculation based on Publish or Perish analysis,
                    Hirsch 2005
      hmindex     = fractionalized h correcting for number of
                    co-authors, correlated >.95 with h
      eindex      = very similar to and correlates extremely highly
                    with total citation count
      Icumulative = average of the above four after standardizing each
                    to mean 0 and std 1
    Career-stage impact (named byYcite, byYh, byYe, byYhm in the
    original notes; the variables in this script are citeCS, hCS, eCS,
    hmCS)
      citeCS      = log(total citations) minus the value expected for
                    the years since PhD from a linear regression; the
                    log corrects for heteroscedasticity
      hCS         = same construction using log(h-index)
      eCS, hmCS   = same
      Icareerstage= average of the above four after standardizing each
                    to mean 0 and std 1

  CORE DEPARTMENT INDICES
    Cumulative impact
      Dcites      = sum of total citation count among core faculty
      Dh          = sum of h-indices among core faculty
      De          = sum of e-indices
      Dhm         = sum of hm-indices
      Dcumulative = average of the above four after standardizing each
                    to mean 0 and std 1
    Career-stage adjusted impact
      DciteCS     = average of citeCS among core faculty
      DhCS        = average of hCS among core faculty
      DeCS, DhmCS = same
      Dcareerstage= average of the above four after standardizing each
                    to mean 0 and std 1
    Aggregate indicator
      Daggregate  = average of Dcumulative and Dcareerstage

  ALTERNATIVE INDICES available that were not used in analysis
      mindex      = hindex / years since PhD (Hirsch, 2005)
      WOStotal    = Web of Science total citation count; just used as a
                    reliability check for the Publish or Perish count
 ---------------------------------------------------------------------*/

/*=====================================================================
  READ AND PREPARE THE INDIVIDUAL-LEVEL DATA
 =====================================================================*/
data citations;
    informat lastname $25.;
    informat firstname middlen whosearched $15.;
    informat gender ethnicity $1.;
    informat univ $40.;
    informat searchdate $11.;

    /* TRUNCOVER keeps INPUT on the current record when a line has
       fewer fields than variables, instead of reading the remaining
       values from the next line.
       NOTE(review): DSD is deliberately not added. Without it, runs of
       consecutive tabs count as a single delimiter; confirm that the
       file never contains empty fields. */
    infile "C:\primary\Data\Citations\individuals.txt"
           delimiter='09'x firstobs=2 truncover;
    input lastname $ firstname $ middlen $ gender $ ethnicity $
          PhDyear Univ $ SearchDate $ WhoSearched $
          CleanComplexity TotalCites PubYears
          hindex eindex hmindex WOStotal updated;

    /* Years since PhD relative to 2010, so a 2009 PhD is marked as
       1 year. */
    YsincePhD = 2010 - PhDyear;
    /* New and updated searches done for the PSPB revision get an
       extra half year. */
    if updated = 1 then YsincePhD = YsincePhD + 0.5;

    /* Correcting a typo in the institution name. */
    if univ = "Brown" then univ = "Brown University";

    /* Display labels. */
    informat wholename $40.;
    wholename = trim(lastname)||", "||trim(firstname);
    informat pyear $4.;
    /* Explicit conversion; a plain assignment would rely on automatic
       numeric-to-character conversion and write a NOTE to the log. */
    pyear = put(phdyear, 4.);
    informat nameinst $80.;
    nameinst = trim(firstname)||" "||trim(lastname)||", "||trim(univ)
               ||" ("||trim(Pyear)||" PhD)";

    /* Log transforms used for the career-stage regressions. */
    logcite = log(totalcites);
    logh    = log(hindex);
    loge    = log(eindex);
    loghm   = log(hmindex);
    /* Retain absolute zero; corrects one outlier with an h of 1.
       A missing value compares lower than any number in SAS, so a
       missing loghm is also set to 0 here. */
    if loghm < 0 then loghm = 0;
    /* Corrects one outlier with an e of 0 (log of 0 is missing). */
    if loge = . then loge = 0;

    /* Alternative index, not used in analysis (Hirsch, 2005). */
    mindex = hindex/ysincePhD;

    /* Dummy variable: constant analysis variable so that PROC MEANS N
       gives counts per class level. */
    i = 1;

    /* Normalize gender codes to lower case. */
    if gender = "F" then gender = "f";
    else if gender = "M" then gender = "m";

    /* Collapse ethnicity to w / o. Codes in neither list leave ethnic
       blank. The second list includes a literal double-quote
       character. NOTE(review): presumably a data-entry artifact -
       confirm against the source file. */
    if ethnicity in ("W", "w") then ethnic = "w";
    else if ethnicity in ('"', 'a', 'b', 'h', 'o', 'A', 'B', 'H', 'O') then ethnic = "o";

    /* Breakdown by graduation decade. PhDyear of 1949 or earlier, or
       missing, leaves PhDdec missing. */
    if PhDyear > 1999 then PhDdec = 2000;
    else if PhDyear > 1989 then PhDdec = 1990;
    else if PhDyear > 1979 then PhDdec = 1980;
    else if PhDyear > 1969 then PhDdec = 1970;
    else if PhDyear > 1959 then PhDdec = 1960;
    else if PhDyear > 1949 then PhDdec = 1950;
run;

/*=====================================================================
  BASIC INFORMATION AND DEMOGRAPHICS OF SAMPLE
 =====================================================================*/
proc contents data=citations; run;

/* Includes mean years since PhD. */
proc means data=citations; run;

proc means data=citations n; class PhDdec;    var i; run;
proc means data=citations n; class gender;    var i; run;
proc means data=citations n; class ethnicity; var i; run;
proc means data=citations n; class ethnic;    var i; run;
proc means data=citations n; class updated;   var i; run;

/* Representation of universities and how many searches were completed
   by each researcher. */
proc means data=citations n; class univ;        var i; run;
proc means data=citations n; class whosearched; var i; run;

/* Data for J. Schwartz illustration in intro. */
proc univariate data=citations;
    var totalcites;
    where 1999 < PhDyear < 2006;
run;

/*=====================================================================
  OVERALL CORRELATIONS AMONG BASIC INDICES
 =====================================================================*/
proc corr data=citations;
    var YsincePhD totalcites hindex eindex hmindex wostotal;
run;

proc corr data=citations;
    var totalcites hindex eindex hmindex wostotal;
    partial YsincePhD;
run;

proc means data=citations;
    var totalcites wostotal;
    where wostotal > -1;
run;

proc corr data=citations alpha;
    var totalcites hindex eindex hmindex;
run;

/* Plots of citation and h data by years since PhD illustrate
   heteroscedasticity, which threatens interpretation of regression
   estimates and comparability of deviation scores across the career
   span. */
proc reg data=citations; model totalcites = YsincePhD / stb; run;
proc reg data=citations; model hindex     = YsincePhD / stb; run;
proc reg data=citations; model eindex     = YsincePhD / stb; run;
proc reg data=citations; model hmindex    = YsincePhD / stb; run;

/* The log of citations and h was used and these non-linear adjustments
   eliminated these correlations. That suggests that the meaning of
   citeCS and hCS are comparable across the career span. */

/* Regressions to generate citation and h intercepts and slopes across
   the whole sample. */
proc reg data=citations; model logcite = YsincePhD / r; run;
proc reg data=citations; model logh    = YsincePhD / r; run;
proc reg data=citations; model loge    = YsincePhD / r; run;
proc reg data=citations; model loghm   = YsincePhD / r; run;

proc means data=citations;
    var logcite logh loge loghm;
run;

/* Distribution analysis example for logcite. */
filename grafout 'C:\primary\Data\Citations\';
ods html path=GrafOut file="citationresiduals.html";
ods graphics on;
proc reg data=citations plots(unpackpanels);
    model logcite = YsincePhD;
run;
quit;
ods graphics off;
ods html close;

/* Illustrating averages for each year since PhD as comparison to
   regression estimates. */
proc means data=citations; class YsincePhD; var totalcites; run;
proc means data=citations; class YsincePhD; var eindex;     run;
proc means data=citations; class YsincePhD; var hindex;     run;
proc means data=citations; class YsincePhD; var hmindex;    run;

/* Refit regressions just for early career folks: fewer than 11 years
   since PhD. */
proc reg data=citations; model logcite = YsincePhD; where YsincePhD < 11; run;
proc reg data=citations; model loge    = YsincePhD; where YsincePhD < 11; run;
proc reg data=citations; model logh    = YsincePhD; where YsincePhD < 11; run;
proc reg data=citations; model loghm   = YsincePhD; where YsincePhD < 11; run;

/*=====================================================================
  CAREER-STAGE INDICATORS: DEVIATIONS FROM THE REGRESSION LINE
 =====================================================================*/
/* NOTE(review): the intercepts and slopes below are hard-coded;
   presumably transcribed from the PROC REG output above - confirm
   whenever the input data change. */
data citations;
    set citations;

    /* Deviation from expected citation value. */
    citeINT = 5.461;
    citeSLOPE = .0803;
    citeCS = logcite - (citeINT + YsincePhD*citeSLOPE);
    /* For early career analysis: testing whether systematically more
       early-career researchers are under the expected regression
       value. -1 = below expected, 1 = at or above expected. */
    if citeCS < 0 then citeEXP = -1; else citeEXP = 1;

    /* Deviation from expected h value. */
    hINT = 1.929;
    hSLOPE = .0413;
    hCS = logh - (hINT + YsincePhD*hSLOPE);
    if hCS < 0 then hEXP = -1; else hEXP = 1;

    /* Deviation from expected e value. */
    eINT = 2.505;
    eSLOPE = .0383;
    eCS = loge - (eINT + YsincePhD*eSLOPE);
    if eCS < 0 then eEXP = -1; else eEXP = 1;

    /* Deviation from expected hm value. */
    hmINT = 1.220;
    hmSLOPE = .0485;
    hmCS = loghm - (hmINT + YsincePhD*hmSLOPE);
    if hmCS < 0 then hmEXP = -1; else hmEXP = 1;
run;

/* Simple correlations among key variables. */
proc corr data=citations;
    var YsincePhD totalcites hindex citeCS hCS;
run;

proc corr data=citations;
    var YsincePhD eindex hmindex eCS hmCS;
run;

proc corr data=citations;
    var YsincePhD totalcites hindex citeCS hCS;
    with eindex hmindex eCS hmCS;
run;

/* Retain raw citation and h variables before standardizing. */
data citations;
    set citations;
    RAWcites = totalcites;
    RAWh     = hindex;
    RAWe     = eindex;
    RAWhm    = hmindex;
run;

/* Standardizing key variables in place (mean 0, std 1). */
proc standard data=citations mean=0 std=1 out=citations;
    var totalcites hindex eindex hmindex citecs hcs ecs hmcs;
run;

/* Creating aggregate scores. */
data citations;
    set citations;
    Icareerstage = mean(citecs, hcs, ecs, hmcs);
    Icumulative  = mean(totalcites, hindex, eindex, hmindex);
    /* Blend of both indicators, not used in article:
       Iaggregate = mean(Icareerstage, Icumulative); */
run;

/* Means and correlations of all key variables. */
proc means data=citations; run;

proc corr data=citations;
    var Icumulative Icareerstage;
run;

proc corr data=citations;
    var Icumulative Icareerstage;
    with totalcites hindex eindex hmindex citecs hcs ecs hmcs;
run;

/* Ranking of cumulative scores. */
proc sort data=citations; by Icumulative; run;

proc print data=citations;
    var nameinst Icumulative RAWcites RAWe RAWh RAWhm;
run;

proc print data=citations;
    var nameinst Icumulative totalcites eindex hindex hmindex;
run;

/* Disabled:
proc print data=citations;
    var wholename Icumulative Icareerstage;run;
*/

/* Ranking of career-stage scores. */
proc sort data=citations; by Icareerstage; run;

proc print data=citations;
    var nameinst Icareerstage citeCS eCS hCS hmCS;
run;

/* Early career analysis: shows that early career estimates are off. */
proc means data=citations; class citeEXP; var citeCS; where ysincephd < 6; run;
proc means data=citations; class eEXP;    var eCS;    where ysincephd < 6; run;
proc means data=citations; class hEXP;    var hCS;    where ysincephd < 6; run;
proc means data=citations; class hmEXP;   var hmCS;   where ysincephd < 6; run;

proc means data=citations; class citeEXP; var citeCS; where 5 < ysincephd < 11; run;
proc means data=citations; class eEXP;    var eCS;    where 5 < ysincephd < 11; run;
proc means data=citations; class hEXP;    var hCS;    where 5 < ysincephd < 11; run;
proc means data=citations; class hmEXP;   var hmCS;   where 5 < ysincephd < 11; run;

/* Disabled export of the individual-level results:
proc contents;run;
data web.forlisa; set citations;
    keep nameinst YsincePhD wholename univ RAWcites RAWh RAWe RAWhm
         Icumulative Icareerstage citeCS eCS hCS hmCS;
run;
*/

/* END OF BY INDIVIDUAL ANALYSES */

/*=====================================================================
  TRYING A MULTILEVEL MODEL
 =====================================================================*/
/* Fits an intercept-only model with a random intercept per university
   for one outcome. PROC MIXED accepts a single dependent variable per
   MODEL statement, so each outcome gets its own call.
   NOTE(review): the earlier revision listed all four outcomes on one
   MODEL statement, which PROC MIXED rejects; confirm that four
   separate univariate models match the intent.
   Open questions carried over from the original notes: use CORRB to
   see correlations between fixed effects; should TYPE=UN be in this
   model; CL shows confidence intervals. */
%macro mixed_by_univ(dv);
    proc mixed data=citations noclprint noinfo ord;
        class univ;
        model &dv = / cl;
        random intercept / subject=univ cl;
    run;
%mend mixed_by_univ;

%mixed_by_univ(citeEXP)
%mixed_by_univ(eEXP)
%mixed_by_univ(hEXP)
%mixed_by_univ(hmEXP)

/*=====================================================================
  BY DEPARTMENT ANALYSES
 =====================================================================*/
/* Creating by-department variables: one row per university with the
   number of non-missing citeCS values (deptN) and the department mean
   of each indicator. NWAY keeps only the per-university rows in the
   output dataset (no overall row). Means keep the names of their
   analysis variables. */
proc means data=citations n mean nway;
    class univ;
    var citeCS hCS eCS hmCS RAWcites RAWh RAWe RAWhm YsincePhD;
    output out=Dcitations (drop=_type_ _freq_)
           n(citeCS)=deptN
           mean(citeCS hCS eCS hmCS RAWcites RAWh RAWe RAWhm YsincePhD)=;
run;

data Dcitations;
    set Dcitations;
    /* Department sums reconstructed as N * mean. */
    Drawcites = deptN*RAWcites;
    Drawh     = deptN*RAWh;
    Drawe     = deptN*RAWe;
    Drawhm    = deptN*RAWhm;
    /* Mean years since PhD in the department. */
    Dyears = YsincePhD;
    drop ysincePhD;
    /* Copies that get standardized below; the Draw* versions keep the
       raw sums. */
    Dcites = Drawcites;
    Dh     = Drawh;
    De     = Drawe;
    Dhm    = Drawhm;
run;

/* Standardizing key variables. */
proc standard data=Dcitations mean=0 std=1 out=Dcitations;
    var Dcites Dh De Dhm citeCS hCS eCS hmCS;
run;

/* Creating aggregated variables. */
data Dcitations;
    set Dcitations;
    DciteCS = citeCS;
    DhCS    = hCS;
    DeCS    = eCS;
    DhmCS   = hmCS;
    drop citeCS hCS eCS hmCS;
    Dcumulative  = mean(Dcites, Dh, De, Dhm);
    Dcareerstage = mean(DciteCS, DhCS, DeCS, DhmCS);
    Daggregate   = mean(Dcumulative, Dcareerstage);
run;

/* Basic means and correlations of key variables. */
proc means data=Dcitations; run;

proc corr data=Dcitations;
    var deptN Dyears Dcumulative Drawcites Drawh Drawe Drawhm;
run;

proc corr data=Dcitations;
    var deptN Dyears Dcareerstage DciteCS DhCS DeCS DhmCS;
run;

proc corr data=Dcitations;
    var deptN Dyears Dcumulative Drawcites Drawh Drawe Drawhm;
    with deptN Dyears Dcareerstage DciteCS DhCS DeCS DhmCS;
run;

/* Cumulative rankings. */
proc sort data=Dcitations; by Dcumulative; run;

proc print data=Dcitations;
    var univ deptN Dyears Dcumulative Dcites De Dh Dhm;
run;

proc print data=Dcitations;
    var univ deptN Dyears Dcumulative Drawcites Drawe Drawh Drawhm;
run;

/* How much does size and seniority of department account for
   Dcumulative ranks? */
proc reg data=Dcitations;
    model Dcumulative = deptN Dyears;
run;

/* Disabled:
*what does it look like with average cites and h rather than cumulative (sum)?;
proc standard mean=0 std=1 out=Dcitations;
    var rawcites rawh rawe rawhm;run;
data Dcitations; set Dcitations;
    Davgcum = mean(rawcites, rawh, rawe, rawhm);
proc corr; var deptN Dyears Davgcum rawcites rawh rawe rawhm;run;
proc sort; by Davgcum;
proc print; var univ deptN Dyears Davgcum rawcites rawe rawh rawhm; run;
*end of "average" footnote analysis;
*/

/* Career stage rankings. */
proc sort data=Dcitations; by Dcareerstage; run;

proc print data=Dcitations;
    var univ deptN Dyears Dcareerstage DciteCS DeCS DhCS DhmCS;
run;

/* How much does size and seniority of department account for
   Dcareerstage ranks? */
proc reg data=Dcitations;
    model Dcareerstage = deptN Dyears;
run;

/* Aggregate of cumulative and career stage and rankings. */
proc sort data=Dcitations; by Daggregate; run;

proc print data=Dcitations;
    var univ deptN Dyears Daggregate Dcumulative Dcareerstage;
run;

/* Mizzou and Rochester focus, for general discussion. */
proc print data=citations;
    var wholename Icumulative Icareerstage;
    where univ = "University of Missouri";
run;

proc print data=citations;
    var wholename Icumulative Icareerstage;
    where univ = "University of Rochester";
run;

/* END OF BY DEPARTMENT ANALYSES */

/*=====================================================================
  GENDER AND RACE/ETHNICITY ANALYSES
 =====================================================================*/
/* Every step below names the individual-level dataset explicitly, so
   no separate step is needed to make it the most recent dataset. */

/* Gender and ethnicity mean comparisons. */
proc means data=citations;
    class gender;
    var ysincePhD Icumulative Icareerstage;
run;

proc means data=citations;
    class ethnic;
    var ysincePhD Icumulative Icareerstage;
run;

proc means data=citations;
    class PhDdec gender;
    var Icareerstage;
run;

proc means data=citations;
    class PhDdec ethnic;
    var Icareerstage;
run;

/* Two regression strategies for testing gender and race/ethnicity
   differences, and the contribution of years since PhD. */
proc glm data=citations;
    class gender ethnic;
    model Icumulative = gender ethnic;
run;

proc glm data=citations;
    class gender ethnic;
    model Icumulative = gender ethnic ysincephd;
run;

proc glm data=citations;
    class gender ethnic;
    model Icareerstage = gender ethnic;
run;
quit;