#This file exemplifies programming #Sala-i-Martin : I jsut ran 2 million regressions # in R #BASIC STRUCTURE: # there is a small regression function (OLSest) and a # simulation function (SalaiMartin2mn) that calls OLSest many times and # calulates weighted parameter averages # # SalaiMartin2mn assesses the robustness of one covariate ("examined_regressor") # by calculating its coefficient and stnd. error in all regressions consisting # of some fixed variables (keepfix_vector) and all possible combinations of the # other variables included in a given data set ################################## # OLSest(yXmat,bincludeintercept=FALSE,bwhite=TRUE) # #This is a simple OLS function with White heteroskedasticity-consistent variance # INPUTS: # yXmat: [numeric matrix] having the dependent in the first column and the regressors thereafter, can contain NAs # bincludeintercept(optional, default false): [logical] if set to TRUE and intercept is included in regression # bwhite(optional,default true): [logical] if TTRUE, then a White Covariance matrix is added to function output # OUTPUT: # a list having the following elements: # $betahat: [numeric vector] vector of OLS coefficients; intercept is included as first element of betahat # $RSS: [numeric scalar] residual sum of squares # $betavar: [numeric matrix] standard OLS beta covariance matrix: sigma^2 (X'X)^-1 # $betavarwhite: [numeric matrix] White heteroskedasticity-consistent beta covariance # estimator: (X'X)^-1 X' diag(e_t^2) X (X'X)^-1 # $loglik: [numeric scalar] log-likelihood OLSest<- function(yXmat,bincludeintercept=FALSE,bwhite=TRUE) { yXomit=na.omit(yXmat) # throw out NAs yvec=as.vector(yXomit[,1]) nbobs=length(yvec) if (bincludeintercept) { Xmat=yXomit Xmat[,1]=1 } else { Xmat=yXomit[,-1] } degfree=nbobs-ncol(Xmat) XtXinv=chol2inv(chol(crossprod(Xmat))) #(X'X)^-1 Xty=crossprod(Xmat,yvec) #X'y betavec=as.vector(crossprod(XtXinv,Xty)) # (X'X)^-1 X'y RSS=as.vector(crossprod(yvec)-crossprod(betavec,Xty)) #y'y- b'X'y= ehat'ehat VCV_normal = XtXinv *RSS/degfree # sigma^2 (X'X)^-1 loglikeli=-nbobs/2*(log(2*pi)+log(RSS/nbobs)+1) if (bwhite) { #white estimator resids=as.vector(yvec-Xmat%*%betavec) VCV_white=XtXinv %*% crossprod(Xmat*resids) %*% XtXinv } else { VCV_white=VCV_normal #fill VCV_white in any case } return(list(betahat=betavec,sumsq=RSS,betavar=VCV_normal,betavar_white=VCV_white,loglik=loglikeli)) } ############################ # SalaiMartin2mn(dependent, regressors, examined_regressor,keepfix_vector=numeric(0),include_intercept=TRUE,number_of_changing_regressors=3) # # This function examines the robustness of one regressor by including a large number of other regressors in the model # Y= R b_ri + Z b_zi + X_i b_xi + E_i # where R is the examined regressor, Z a number of fixed regressors to be included in each regression set # and X_i is the i-th combination of e.g. 3 variables taken out of the remaining regressors # INPUTS: # dependent: [data.frame, 1 column] dependent variable # regressors: [data.frame] regressors # examined_regressor: [numeric or character] either the name of the regressor to be assessed, or its integer index in the data.frame "regressors", (R above) # keepfix_vector(optional, default none): [numeric vector or character vector] names or index (referring to data.frame "regressors") of the variables to be included in each regression (Z above) # set =numeric(0) or =0 if you do not want to have any fixed regressors # include_intercept (optional, default true): [logical] if set to TRUE, includes intercept in each regression # number_of_changing_regressors (optional, default 3): [integer scalar] number of 'random' variables to be included in each regression # OUTPUT: # a list containing two sublists: # sublist $WEIGHTED contains the weigthed average of parameters # $weighted$betahat: [numeric scalar] weigthed av. of betas for examined_regressor # $weighted$betaSE: [numeric scalar] weigthed av. of OLS standard errors for examined_regressor # $weighted$Pval: [numeric scalar] Prob value for the t-test of $betahat and $betaSE # $weighted$betaSEwhite: [numeric scalar] weigthed av. of White standard errors for examined_regressor # $weighted$Pval: [numeric scalar] Prob value for the t-test of $betahat and $betaSEwhite # sublist $SIMPLE contains the simple average of parameters (i.e. not weighted) SalaiMartin2mn <- function(dependent, regressors, examined_regressor,keepfix_vector=numeric(0),include_intercept=TRUE,number_of_changing_regressors=3) { #get input yall=data.matrix(dependent) Xall=data.matrix(regressors) yXall=cbind(yall,Xall) ncolall=ncol(Xall) ######## USABILITY CHECKS ############### if (sum(which(keepfix_vector==examined_regressor))>0) {stop("examined_regressor must not be element of keepfix_vector!!")} #check if examined not element of keepfix examined_regressor=examined_regressor[1] #ensure that examined is scalar if (sum(keepfix_vector==0)>0) {keepfix_vector=integer(0)} #if 0 in keepfix, then do not keep any regs fixed #if examined or keepfix are characters, look for the respective indices: if (is.character(examined_regressor)) { examined_regressor=which(names(regressors)==examined_regressor) } if (is.character(keepfix_vector)) { keepfix_vector=which(!(is.na(match(names(regressors),keepfix_vector)))) } #just to ensure that there are no strange numbers... examined_regressor=as.integer(examined_regressor) keepfix_vector=as.integer(keepfix_vector) ########## REAL CALC ################# #get combinations of all variables that are not in examined or keepfix combiset=combn((1:ncolall)[-c(examined_regressor,keepfix_vector)],number_of_changing_regressors) #initialize storing variables for on-the-fly summing total_lik <- 0 total_beta <- 0 total_betavariance <- 0 total_betavarianceWhite <- 0 totalW_beta <- 0 totalW_betavariance <- 0 totalW_betavarianceWhite <- 0 #in OLSest the first coef may be the intercept: adjust for that examined_index_in_output=1+as.integer(include_intercept) cat(as.character(Sys.time()),": Starting Simulation runs...\n") #the big loop: for (irun in 1:ncol(combiset)) { # get the regressor indices, with examined bieng first regindices2include=c(examined_regressor,sort(c(combiset[,irun],keepfix_vector))) # extract the respective X matrix yX2regress=yXall[,c(1,(regindices2include+1))] #estimate model=OLSest(yXmat=yX2regress,bincludeintercept=include_intercept,bwhite=TRUE) #sum up 'on the fly' and weight with likelihood currlik=exp(model$loglik) totalW_beta<-totalW_beta + model$betahat[examined_index_in_output]*currlik totalW_betavariance <- totalW_betavariance + as.vector(model$betavar[examined_index_in_output,examined_index_in_output])*currlik totalW_betavarianceWhite <- totalW_betavarianceWhite + as.vector(model$betavar_white[examined_index_in_output,examined_index_in_output])*currlik total_lik <- total_lik + currlik #sum up on the fly for simple averages total_beta<-total_beta + model$betahat[examined_index_in_output] total_betavariance <- total_betavariance + as.vector(model$betavar[examined_index_in_output,examined_index_in_output]) total_betavarianceWhite <- total_betavarianceWhite + as.vector(model$betavar_white[examined_index_in_output,examined_index_in_output]) #inform user about progress if (irun %% 5000==0) {cat(as.character(Sys.time()),": run number ",irun,"\n")} } ######### FINAL CALCS ########## #calc simple averages av_beta <- total_beta / irun av_betavariance <- total_betavariance / irun av_betaSE <- sqrt(av_betavariance) av_betavarianceWhite <- total_betavarianceWhite / irun av_betaWhiteSE <- sqrt(av_betavarianceWhite) pval_t <- (1-pt(abs(av_beta/av_betaSE),nrow(yXall)))*2 pval_t_white <- (1-pt(abs(av_beta/av_betaWhiteSE),nrow(yXall)))*2 #calc weighted averages Wav_beta <- totalW_beta / total_lik Wav_betavariance <- totalW_betavariance / total_lik Wav_betaSE <- sqrt(Wav_betavariance) Wav_betavarianceWhite <- totalW_betavarianceWhite / total_lik Wav_betaWhiteSE <- sqrt(Wav_betavarianceWhite) Wpval_t <- (1-pt(abs(Wav_beta/Wav_betaSE),nrow(yXall)))*2 Wpval_t_white <- (1-pt(abs(Wav_beta/Wav_betaWhiteSE),nrow(yXall)))*2 #Output lists... av_outlist=list(betahat=av_beta ,betaSE=av_betaSE,Pval=pval_t,betaSEwhite=av_betaWhiteSE,Pval_white=pval_t_white) Wav_outlist=list(betahat=Wav_beta ,betaSE=Wav_betaSE,Pval=Wpval_t,betaSEwhite=Wav_betaWhiteSE,Pval_white=Wpval_t_white) ######### DISPLAY RESUTLS ################## av_outtable=av_outlist names(av_outtable)=c("Av.Beta","Std.Err.","Prob.","White Std.Err.","White Prob.") Wav_outtable=Wav_outlist names(Wav_outtable)=c("Av.Beta","Std.Err.","Prob.","White Std.Err.","White Prob.") #this is just to have nicely named output tables #print header info cat("\n\n--------- RESULTS ------------------------------------\n") cat(" Tested variable number",examined_regressor,":",names(regressors)[examined_regressor],"\n") if (sum(keepfix_vector)>0) { cat(" Fixed regressors",paste(keepfix_vector, collapse=", "),":",names(regressors)[keepfix_vector],"\n") } else { cat(" Fixed regressors: none\n") } cat(" ",irun,"iterations over all other regressors\n") #print parameters cat("\n------- WEIGHTED AVERAGE PARAMETERS -------------------------------------------\n") print(unlist(Wav_outtable)) cat("\n------- SIMPLE AVERAGE PARAMETERS ----------------------------------------------\n") print(unlist(av_outtable)) cat("--------------------------------------------------------------------------------\n") ####### RETURN FUNCITON OUTPUT ################### return(list(weighted=Wav_outlist, simple=av_outlist)) } #GET DATA #millions.csv ist just the excel file millions.xls saved as CSV (in comma-separated-value format), nothing else growthdata = read.csv("millions.csv",na.strings=c("#NV","."),header=T,row.names=2,nrows=134) #we have to tell R to read in only 134 rows, since the excel file contained several more rows of junk #moreover we set row.names=2 (the country codes are in the second column), and specify what NA is in the Excel CSV format. #DO SALA-I-MARTIN testoutput=SalaiMartin2mn(growthdata[,3],growthdata[,4:64],"X39",c("X1","X2","X3"),T)