# ------- GETTING DATA ---------------- wages=read.table("wagesmicrodata_tabs.txt", header=TRUE) # read data from file attach(wages) # get column names out of data.frame #create new variables out of data: nonwhite<- race!=3 hispanic <- race==2 # create ney data.frame with dependent var and regressors: newdata=data.frame(log(wage),sex,union,nonwhite, hispanic, education, experience, experience^2) detach(wages) # delete the wages columns # assign nice names to data frame entries: names(newdata)[1]="lwage" names(newdata)[2]="female" names(newdata)[8]="experience2" N=length(nonwhite) #determeine sample length intercept=rep(1,N) # create 'one-vector' #create matrices out of data frame: yall <- newdata[,1] #dependent Xall <- as.matrix(newdata[,-1]) #regressors - design matrix Xall <- cbind(intercept, Xall) #add one-vector to design matrix k <- dim(Xall)[2] #number of variables (incl intercept) # ------ DO FIRST REGRESSION ----------- #this is actually the most inefficent way to do a regression... # never do it like that in real life! XtXinv <- solve( t(Xall) %*% Xall ) # (X'X)^-1 P <- Xall %*% XtXinv %*% t(Xall) # P= X (X'X)^-1 X' M <- diag(N) - P # M= I - X (X'X)^-1 X' b <- XtXinv %*% t(Xall) %*% yall #estimators bhat=(X'X)^-1 X'y ehat <- yall - Xall %*% b #residuals SSR= t(ehat) %*% ehat #sum of sq of resids Sigma2hat <- SSR/(N-k) #sigma^2 estimator (= standd error of regression ^2) stderrs <- sqrt(diag(XtXinv) * Sigma2hat) #get standard errors of beta out of sigmahat^2 * (X'X)^-1 yall_demean=yall-mean(yall) R2 <- 1-SSR/(t(yall_demean) %*% yall_demean) #Rsquared Ftest=R2/(1-R2)*(N-k)/(k-1) #joint test whether all params (Except constant) are signif - "F-test" Ftest_pval=1-pf(Ftest,k-1,N-k) #---------- DISPLAY OUTPUT -------------- cat("\n _______ JOINT REGRESSION: ____________\n") cat("\n coefficients and standard errors : \n") #bind coefs & stderrs: bs=cbind(b,stderrs) colnames(bs)=c("coefs","std.errs") print(bs) #put out R2 and F: cat("\nR-squared: ",format(R2, digits=4)) cat("\n\nJoint significance of parameters:\nF-stat ",Ftest, ", Pval: ", format(Ftest_pval),"\n\n") #put out P and M: cat("\n first entries of P:\n") print(P[1:10,1:5]) cat("\n first entries of M:\n") print(M[1:10,1:5]) # ------------- SPLIT-SAMPLE REGRESSION AND TEST ----------------- femx=Xall[,2]*Xall[,-2] # Xall[,-(1:2)] is Xall without 'female'. Xall[,2]*Xall[,-2] makes all male entries 0 malx=(1-Xall[,2])*Xall[,-2] #make all female entries zero Xsplit=cbind(femx,malx) #combine to large design matrix bsplit <- solve(t(Xsplit) %*% Xsplit) %*% t(Xsplit) %*% yall #regress - it's the same b as from two independent regressions ehat=yall - Xsplit %*% bsplit #residuals m=dim(femx)[2] #number of regressors in femx k=(2*m) # nb total regressors R=cbind(diag(m),-diag(m)) #big R matrix of format (I, -I) # do wald test / chow test: muval=t(R %*% bsplit) %*% solve(R %*% solve(t(Xsplit) %*% Xsplit) %*% t(R)) %*% (R %*% bsplit) fstat=muval / (t(ehat) %*% ehat) *(N-k)/m # -------------- DISPLAY OUTPUT ---------------- splitcoefs = array(bsplit,c(m,2)) rownames(splitcoefs) = rownames(bsplit)[1:m] colnames(splitcoefs) = c("female","male") cat("\n\n ______ SPLIT SAMPLE TEST ________\n \n independent coefficients:\n") print(splitcoefs) cat("\n ------------------------\n\n") cat("F test for equality:\n") cat("F-stat: ",fstat, "pval: ",1-pf(fstat,m,N-k),"\n") #Done by Stefan Zeugner, 2007