#This file exemplifies programming
#Sala-i-Martin : I jsut ran 2 million regressions
# in R

#BASIC STRUCTURE:
#    there is a small regression function (OLSest) and a 
#    simulation function (SalaiMartin2mn) that calls OLSest many times and  
#    calulates weighted parameter averages
#
#    SalaiMartin2mn assesses the robustness of one covariate ("examined_regressor")
#    by calculating its coefficient and stnd. error in all regressions consisting
#    of some fixed variables (keepfix_vector) and all possible combinations of the 
#    other variables included in a given data set


##################################
# OLSest(yXmat,bincludeintercept=FALSE,bwhite=TRUE) 
#
#This is a simple OLS function with White heteroskedasticity-consistent variance
# INPUTS:
#    yXmat: [numeric matrix] having the dependent in the first column and the regressors thereafter, can contain NAs 
#    bincludeintercept(optional, default false): [logical] if set to TRUE and intercept is included in regression
#    bwhite(optional,default true): [logical] if TTRUE, then a White Covariance matrix is added to function output
# OUTPUT:
# a list having the following elements:   
#    $betahat: [numeric vector] vector of OLS coefficients; intercept is included as first element of betahat
#    $RSS: [numeric scalar] residual sum of squares
#    $betavar: [numeric matrix] standard OLS beta covariance matrix: sigma^2 (X'X)^-1
#    $betavarwhite: [numeric matrix] White heteroskedasticity-consistent beta covariance 
#            estimator: (X'X)^-1 X' diag(e_t^2)  X (X'X)^-1
#    $loglik: [numeric scalar] log-likelihood
OLSest<- function(yXmat,bincludeintercept=FALSE,bwhite=TRUE) {
    yXomit=na.omit(yXmat) # throw out NAs
    yvec=as.vector(yXomit[,1])
    nbobs=length(yvec)
    if (bincludeintercept) {
        Xmat=yXomit
        Xmat[,1]=1
    } else {
        Xmat=yXomit[,-1]
    }

    degfree=nbobs-ncol(Xmat)

    XtXinv=chol2inv(chol(crossprod(Xmat))) #(X'X)^-1
    Xty=crossprod(Xmat,yvec) #X'y
    betavec=as.vector(crossprod(XtXinv,Xty)) # (X'X)^-1 X'y
    RSS=as.vector(crossprod(yvec)-crossprod(betavec,Xty)) #y'y- b'X'y= ehat'ehat
    VCV_normal = XtXinv *RSS/degfree # sigma^2 (X'X)^-1
    loglikeli=-nbobs/2*(log(2*pi)+log(RSS/nbobs)+1) 
    if (bwhite) { #white estimator
        resids=as.vector(yvec-Xmat%*%betavec)
        VCV_white=XtXinv %*% crossprod(Xmat*resids) %*% XtXinv
    } else {
        VCV_white=VCV_normal #fill VCV_white in any case
    }
    return(list(betahat=betavec,sumsq=RSS,betavar=VCV_normal,betavar_white=VCV_white,loglik=loglikeli))
}


############################
# SalaiMartin2mn(dependent, regressors, examined_regressor,keepfix_vector=numeric(0),include_intercept=TRUE,number_of_changing_regressors=3) 
#
#    This function examines the robustness of one regressor by including a large number of other regressors in the model
#    Y= R b_ri + Z b_zi + X_i b_xi + E_i 
#    where R is the examined regressor, Z a number of fixed regressors to be included in each regression set
#    and X_i is the i-th combination of e.g. 3 variables taken out of the remaining regressors

# INPUTS:
#    dependent: [data.frame, 1 column] dependent variable
#    regressors: [data.frame] regressors
#    examined_regressor: [numeric or character] either the name of the regressor to be assessed, or its integer index in the data.frame "regressors", (R above)
#    keepfix_vector(optional, default none): [numeric vector or character vector] names or index (referring to data.frame "regressors") of the variables to be included in each regression (Z above)
#        set =numeric(0) or =0 if you do not want to have any fixed regressors
#    include_intercept (optional, default true): [logical] if set to TRUE, includes intercept in each regression
#    number_of_changing_regressors (optional, default 3): [integer scalar] number of 'random' variables to be included in each regression
# OUTPUT: 
# a list containing two sublists:
# sublist $WEIGHTED contains the weigthed average of parameters
#    $weighted$betahat: [numeric scalar] weigthed av. of betas for examined_regressor
#    $weighted$betaSE: [numeric scalar] weigthed av. of OLS standard errors for examined_regressor
#    $weighted$Pval: [numeric scalar] Prob value for the t-test of $betahat and $betaSE
#    $weighted$betaSEwhite: [numeric scalar] weigthed av. of White standard errors for examined_regressor
#    $weighted$Pval: [numeric scalar] Prob value for the t-test of $betahat and $betaSEwhite
# sublist $SIMPLE contains the simple average of parameters (i.e. not weighted)


SalaiMartin2mn <- function(dependent, regressors, examined_regressor,keepfix_vector=numeric(0),include_intercept=TRUE,number_of_changing_regressors=3) {

    #get input    
    yall=data.matrix(dependent)
    Xall=data.matrix(regressors)
    yXall=cbind(yall,Xall)
    ncolall=ncol(Xall)
    
    ######## USABILITY CHECKS ###############
    if (sum(which(keepfix_vector==examined_regressor))>0) {stop("examined_regressor must not be element of keepfix_vector!!")}
        #check if examined not element of keepfix

    examined_regressor=examined_regressor[1] #ensure that examined is scalar
    if (sum(keepfix_vector==0)>0) {keepfix_vector=integer(0)} #if 0 in keepfix, then do not keep any regs fixed
    
    #if examined or keepfix are characters, look for the respective indices:
    if (is.character(examined_regressor)) {
        examined_regressor=which(names(regressors)==examined_regressor)
    }
    if (is.character(keepfix_vector)) {
        keepfix_vector=which(!(is.na(match(names(regressors),keepfix_vector))))
    }

    #just to ensure that there are no strange numbers...
    examined_regressor=as.integer(examined_regressor)
    keepfix_vector=as.integer(keepfix_vector)


    ########## REAL CALC #################

    #get combinations of all variables that are not in examined or keepfix
    combiset=combn((1:ncolall)[-c(examined_regressor,keepfix_vector)],number_of_changing_regressors)
     
    #initialize storing variables for on-the-fly summing
        total_lik <- 0
        total_beta <- 0
        total_betavariance <- 0
        total_betavarianceWhite <- 0
        totalW_beta <- 0
        totalW_betavariance <- 0
        totalW_betavarianceWhite <- 0

        #in OLSest the first coef may be the intercept: adjust for that
        examined_index_in_output=1+as.integer(include_intercept) 
  
    cat(as.character(Sys.time()),": Starting Simulation runs...\n")

    #the big loop:    
    for (irun in 1:ncol(combiset)) {
        # get the regressor indices, with examined bieng first
        regindices2include=c(examined_regressor,sort(c(combiset[,irun],keepfix_vector)))
        # extract the respective X matrix
        yX2regress=yXall[,c(1,(regindices2include+1))]
    
        #estimate
        model=OLSest(yXmat=yX2regress,bincludeintercept=include_intercept,bwhite=TRUE)
    
        #sum up 'on the fly' and weight with likelihood
        currlik=exp(model$loglik)
        totalW_beta<-totalW_beta + model$betahat[examined_index_in_output]*currlik
        totalW_betavariance <- totalW_betavariance + as.vector(model$betavar[examined_index_in_output,examined_index_in_output])*currlik
        totalW_betavarianceWhite <- totalW_betavarianceWhite + as.vector(model$betavar_white[examined_index_in_output,examined_index_in_output])*currlik
        total_lik <- total_lik + currlik
        #sum up on the fly for simple averages
        total_beta<-total_beta + model$betahat[examined_index_in_output]
        total_betavariance <- total_betavariance + as.vector(model$betavar[examined_index_in_output,examined_index_in_output])
        total_betavarianceWhite <- total_betavarianceWhite + as.vector(model$betavar_white[examined_index_in_output,examined_index_in_output])
    
        #inform user about progress
        if (irun %% 5000==0) {cat(as.character(Sys.time()),": run number ",irun,"\n")}
    }
    
    ######### FINAL CALCS ##########
    #calc simple averages
    av_beta <- total_beta / irun
    av_betavariance <- total_betavariance / irun
    av_betaSE <- sqrt(av_betavariance)
    av_betavarianceWhite <- total_betavarianceWhite / irun
    av_betaWhiteSE <- sqrt(av_betavarianceWhite)
    pval_t <- (1-pt(abs(av_beta/av_betaSE),nrow(yXall)))*2
    pval_t_white <- (1-pt(abs(av_beta/av_betaWhiteSE),nrow(yXall)))*2
    
    #calc weighted averages
    Wav_beta <- totalW_beta / total_lik
    Wav_betavariance <- totalW_betavariance / total_lik
    Wav_betaSE <- sqrt(Wav_betavariance)
    Wav_betavarianceWhite <- totalW_betavarianceWhite / total_lik
    Wav_betaWhiteSE <- sqrt(Wav_betavarianceWhite)
    Wpval_t <- (1-pt(abs(Wav_beta/Wav_betaSE),nrow(yXall)))*2
    Wpval_t_white <- (1-pt(abs(Wav_beta/Wav_betaWhiteSE),nrow(yXall)))*2

    #Output lists...
    av_outlist=list(betahat=av_beta ,betaSE=av_betaSE,Pval=pval_t,betaSEwhite=av_betaWhiteSE,Pval_white=pval_t_white)
    Wav_outlist=list(betahat=Wav_beta ,betaSE=Wav_betaSE,Pval=Wpval_t,betaSEwhite=Wav_betaWhiteSE,Pval_white=Wpval_t_white)

    ######### DISPLAY RESUTLS ##################
    av_outtable=av_outlist
    names(av_outtable)=c("Av.Beta","Std.Err.","Prob.","White Std.Err.","White Prob.")
    Wav_outtable=Wav_outlist
    names(Wav_outtable)=c("Av.Beta","Std.Err.","Prob.","White Std.Err.","White Prob.")
        #this is just to have nicely named output tables

    #print header info
    cat("\n\n--------- RESULTS ------------------------------------\n")
    cat("   Tested variable number",examined_regressor,":",names(regressors)[examined_regressor],"\n")
    if (sum(keepfix_vector)>0) {
        cat("   Fixed regressors",paste(keepfix_vector, collapse=", "),":",names(regressors)[keepfix_vector],"\n")
    } else {
        cat("   Fixed regressors: none\n")
    }
    cat("  ",irun,"iterations over all other regressors\n")
    #print parameters
    cat("\n------- WEIGHTED AVERAGE PARAMETERS -------------------------------------------\n")
    print(unlist(Wav_outtable))
    cat("\n------- SIMPLE AVERAGE PARAMETERS ----------------------------------------------\n")
    print(unlist(av_outtable))
    cat("--------------------------------------------------------------------------------\n")

    ####### RETURN FUNCITON OUTPUT ###################
    return(list(weighted=Wav_outlist, simple=av_outlist))
}


#GET DATA
#millions.csv ist just the excel file millions.xls saved as CSV (in comma-separated-value format), nothing else

growthdata = read.csv("millions.csv",na.strings=c("#NV","."),header=T,row.names=2,nrows=134)

#we have to tell R to read in only 134 rows, since the excel file contained several more rows of junk
#moreover we set row.names=2 (the country codes are in the second column), and specify what NA is in the Excel CSV format.


#DO SALA-I-MARTIN
testoutput=SalaiMartin2mn(growthdata[,3],growthdata[,4:64],"X39",c("X1","X2","X3"),T)