# ------- GETTING DATA ----------------


wages=read.table("wagesmicrodata_tabs.txt", header=TRUE) # read data from file
attach(wages) # get column names out of data.frame

#create new variables out of data:
nonwhite<- race!=3
hispanic <- race==2

# create ney data.frame with dependent var and regressors:
newdata=data.frame(log(wage),sex,union,nonwhite, hispanic, education, experience, experience^2)

detach(wages) # delete the wages columns

# assign nice names to data frame entries:
names(newdata)[1]="lwage"
names(newdata)[2]="female"
names(newdata)[8]="experience2"

N=length(nonwhite) #determeine sample length
intercept=rep(1,N) # create 'one-vector'

#create matrices out of data frame:
yall <- newdata[,1] #dependent
Xall <- as.matrix(newdata[,-1]) #regressors - design matrix
Xall <- cbind(intercept, Xall) #add one-vector to design matrix

k <- dim(Xall)[2] #number of variables (incl intercept)


# ------ DO FIRST REGRESSION -----------
#this is actually the most inefficent way to do a regression...
# never do it like that in real life!

XtXinv <- solve( t(Xall) %*% Xall ) # (X'X)^-1

P <- Xall %*% XtXinv  %*% t(Xall) # P= X (X'X)^-1 X'
M <- diag(N) - P # M= I - X (X'X)^-1 X'

b <- XtXinv  %*% t(Xall) %*% yall #estimators bhat=(X'X)^-1 X'y

ehat <- yall - Xall %*% b #residuals 
SSR= t(ehat) %*% ehat #sum of sq of resids
Sigma2hat <- SSR/(N-k) #sigma^2 estimator (= standd error of regression ^2)

stderrs <- sqrt(diag(XtXinv) * Sigma2hat) #get standard errors of beta out of sigmahat^2 * (X'X)^-1
yall_demean=yall-mean(yall)
R2 <- 1-SSR/(t(yall_demean) %*% yall_demean) #Rsquared
Ftest=R2/(1-R2)*(N-k)/(k-1) #joint test whether all params (Except constant) are signif - "F-test"
Ftest_pval=1-pf(Ftest,k-1,N-k)

#---------- DISPLAY OUTPUT --------------
cat("\n _______ JOINT REGRESSION: ____________\n")
cat("\n   coefficients and standard errors : \n")

#bind coefs & stderrs:
bs=cbind(b,stderrs)
colnames(bs)=c("coefs","std.errs")
print(bs)

#put out R2 and F:
cat("\nR-squared: ",format(R2, digits=4))
cat("\n\nJoint significance of parameters:\nF-stat ",Ftest, ", Pval: ", format(Ftest_pval),"\n\n")

#put out P and M:
cat("\n first entries of P:\n")
print(P[1:10,1:5])
cat("\n first entries of M:\n")
print(M[1:10,1:5])


# ------------- SPLIT-SAMPLE REGRESSION AND TEST -----------------

femx=Xall[,2]*Xall[,-2] 
    # Xall[,-(1:2)] is Xall without 'female'. Xall[,2]*Xall[,-2] makes all male entries 0 
malx=(1-Xall[,2])*Xall[,-2] #make all female entries zero

Xsplit=cbind(femx,malx) #combine to large design matrix

bsplit <- solve(t(Xsplit) %*% Xsplit) %*% t(Xsplit) %*% yall #regress - it's the same b as from two independent regressions
ehat=yall - Xsplit %*% bsplit #residuals

m=dim(femx)[2] #number of regressors in femx
k=(2*m) # nb total regressors

R=cbind(diag(m),-diag(m)) #big R matrix of format (I, -I)

# do wald test / chow test:
muval=t(R %*% bsplit) %*% solve(R %*% solve(t(Xsplit) %*% Xsplit) %*% t(R)) %*% (R %*% bsplit) 
fstat=muval / (t(ehat) %*% ehat) *(N-k)/m


# -------------- DISPLAY OUTPUT ----------------
splitcoefs = array(bsplit,c(m,2))
rownames(splitcoefs) = rownames(bsplit)[1:m]
colnames(splitcoefs) = c("female","male")

cat("\n\n ______ SPLIT SAMPLE TEST ________\n \n independent coefficients:\n")
print(splitcoefs)
cat("\n    ------------------------\n\n")
cat("F test for equality:\n")
cat("F-stat: ",fstat, "pval: ",1-pf(fstat,m,N-k),"\n")


#Done by Stefan Zeugner, 2007