# EASY R FOR REGRESSIONS
# Notes on how to use the free statistical package R in Windows.
# I will show how to load data into it and run regressions.
# The hash mark means this is a comment line.
# February 26, 2010.
# Professor Eric Rasmusen, erasmuse@Indiana.edu.
# See http://www.rasmusen.org/a/r.htm for more files.
# Some of this draft is untested.

# R is a wonderfully crafted statistics package with
# amazingly bad documentation.
# These notes show how to make it do the basics of regressions.

######################################
# To download R, go to The Comprehensive R Archive Network
# at http://cran.case.edu/
# Download it and install it on your computer, which is easy to do.

######################################
# GETTING STARTED

# Click on the R icon that the installation program created on
# your desktop. A window will open up.

# To find out what directory R is writing files to, reset it to the D:
# root directory, and check again to see whether it worked, type:

getwd()
setwd("D:\\")
getwd()

# Notice how backslashes need to be doubled for R to read them.
# A subdirectory would look like
# "D:\\_Take-to-office\\rfiles\\"

# Now let's fix it so R doesn't show us lots of meaningless digits:
# scientific notation is discouraged at intensity 6, and 4 digits
# are suggested (there is no way to require only 4).

options(scipen = 6)
options(digits = 4)

# Only uncomment the next line if you know why.
# options(defaultPackages = c("car", "Hmisc"))

######################################
# READING IN DATA

# A data file is just plain text. The format is like this
# file, called r-data.txt:

#   price floor area rooms age cent.heat state
# 1 52.00 111.0  830     5  6.2       no Illinois
# 2    60 128.0  710     5  7.5       no Illinois
# 3    35 101.0 1000     5  4.2       no Illinois
# 4    50 131.0  690     6  8.8       no Indiana
# 5    20  93.0  900     5    1      yes Ohio
# 6    57 101.0 1000     5    4       no Illinois
# 7    80   100  690     6    8       no Indiana
# 8    30    90  800     5    2      yes Ohio

# Notice the lack of an observation number on the first line.
# Capitalization matters: price and Price could
# be different variables.
# Spacing and tab characters don't matter, but
# line breaks do, I think.
# Notice how binary yes/no variables and string variables
# are easily accommodated.

# The first read-in command creates an object called "data2":

data2 <- read.table("r-data.txt")

# That uses the file D:/r-data.txt, since we set D:/ as
# the working directory.

# In case we want to use lags or time-series functions, transform
# the dataset thus:

tdata2 <- ts(data2)

# Then create some lags if you want --- one and four periods here:

temp1 <- lag(tdata2, -1)
temp4 <- lag(tdata2, -4)
tdata3 <- ts.union(tdata2, temp1, temp4)

# Now convert it back to a normal dataset:

data3 <- data.frame(tdata3)

# Unfortunately, R has given stupid names to many of the
# variables. To see the stupid names, type

colnames(data3)

# Thus, let's convert at least some of them back.
# R is very bad at renaming.
# First, let's get R to put our original names in quotes:

colnames(data2)

# Then let's cut-and-paste those into a command to fix up data3:

colnames(data3)[1:7] <- c("price", "floor", "area", "rooms",
                          "age", "cent.heat", "state")

# Cut and paste from that last command and insert 1's and 4's:

colnames(data3)[8:14] <- c("price1", "floor1", "area1", "rooms1",
                           "age1", "cent.heat1", "state1")
colnames(data3)[15:21] <- c("price4", "floor4", "area4", "rooms4",
                            "age4", "cent.heat4", "state4")
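# An untested shortcut, if you would rather not cut-and-paste: build
# all 21 names at once with paste(). This assumes the columns of data3
# came out in the order original, lag-1, lag-4, which is how ts.union()
# stacked them above.

colnames(data3) <- c(colnames(data2),
                     paste(colnames(data2), "1", sep = ""),
                     paste(colnames(data2), "4", sep = ""))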
# Now look at your dataset to see if the names and lags turned
# out right. Notice that the cent.heat and state variables have
# been made numerical.

data3

# Make this dataset the default dataset to use for this session:

attach(data3)

######################################
# SUMMARY STATISTICS AND PLOTTING

# We can get some summary statistics.
# Use your original dataset without the lags, thus:

summary(data2)

# To get a correlation matrix, use your original dataset without
# the lags, thus:

cor(data2)

# The cor() command ignores our request to show only 4 digits.
# To get it down to 2 digits, read it into Excel and change it there.

# To plot two variables, type

plot(price, floor, main = "Figure 2: Price and Floor")

# To save a figure as a jpg file, go to File, Save as, Jpeg on the
# top menu.

######################################
# RUNNING A REGRESSION

# To do a regression, create a regression output object called
# "output4":

output4 <- lm(price ~ price1 + floor + area + floor*area + cent.heat)

# Then display the output:

summary(output4)

# If you want the residuals, variance-covariance matrix, or
# predicted values, type

residuals5 <- output4$resid
residuals5
vcov5 <- vcov(output4)
vcov5
pred5 <- output4$fitted.values
pred5

# The summary(), residuals(), and vcov() commands partly ignore
# our request to show only 4 digits.
# To get them down to 2 digits, cut-and-paste into Excel.

######################################
# ODDS AND ENDS

# We can plot a data histogram.
# First get rid of all the observations with missing data:

data3a <- na.omit(data3)
attach(data3a)

# Then create and plot the histogram:

var234 <- hist(price)
plot(var234, main = "Figure 1: A histogram of price")

# We can also do kernel densities, for variables that
# are close to continuous. Not true here, but try it anyway:

var123 <- density(price)
plot(var123, main = "Figure 3: A kernel density estimate of price")

# Finally, let's go back to our original dataset:

attach(data3)

##################
# If you don't want an intercept in a regression, type

output6 <- lm(price ~ 0 + floor + area + cent.heat)
summary(output6)

##################
# To plot two variables and the regression line for a
# TWO-VARIABLE regression:

output5 <- lm(price ~ floor)
plot(floor, price)
abline(output5$coef)

##################
# I don't have fixed state effects working yet.
# What is below is mistaken.
# To put in fixed state effects, type

output7 <- lm(price ~ floor + area + age + factor(state))
summary(output7)

# The dummy variables for "state" will be created automatically.

##################
# To run a logit, convert the y-variable to have only
# values of 0 or 1, thus:

cent.heat <- cent.heat - 1
cent.heat

# Then run the regression:

output8 <- glm(cent.heat ~ floor + area, family = binomial(link = "logit"))
summary(output8)

##################
# If you have your data in spreadsheet form, save
# it as a *.csv file, which is comma-separated. Then
# your R data input command would be

data2a <- read.csv("D:\\_Take-to-office\\r-data.csv")

# I haven't tested this one.

##################
# We can write the data out to a file called data3.csv,
# no variable column names, comma-separated:

write.table(data3, col.names = FALSE, sep = ",", file = "data3.csv")

# We can write the data out to a file called data3.txt,
# with variable column names, space-separated:

write.table(data3, col.names = TRUE, sep = " ", file = "data3.txt")

######################################
# USING PACKAGES FOR SPECIAL COMMANDS

##################
# To do an F-test for b2 + b3 = 1, first download
# the "car" package.
# On the top menu, pick
#   Packages, Select Repository, (pick any site), Install Package, car.
# Then, whenever you want to use "car", pick
#   Packages, Load Package, car.
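# You can also skip the menus and do the same thing from the command
# line with the standard install.packages() and library() commands.
# I haven't tested this route here; R may still ask you to pick a
# CRAN mirror the first time.

install.packages("car")   # only needed once
library(car)              # needed each session you want to use it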
# Finally, type (BUT THIS DOESN'T WORK YET):

output6 <- lm(price ~ floor + area + cent.heat)
rhs <- c(1)
restricted6 <- rbind(c(0, 0, 1, 1))
linear.hypothesis(output6, restricted6, rhs)

##################
# To run a Durbin-Watson test for autocorrelation, load the car
# package as described above and then type

output9 <- lm(price ~ floor + area)
durbin.watson(output9, max.lag = 2)

##################
# To save regression output in a LaTeX file, download and load the
# "Hmisc" package in the same way as the "car" package described
# above, and type

output10 <- lm(price ~ floor + area + floor*area + cent.heat)
latex(summary(output10)$coef)

# You'll get something pretty close to LaTeX, with
# standard errors, t-stats, and p-values.
# It will be in a file called "summary.tex" in your
# working directory.

######################################
# REFERENCES

# Mark Gardener, "Using R for statistical analyses":
#   http://www.gardenersown.co.uk/Education/Lectures/R/regression.htm#what_is_R
# Gardener is the best documentation, giving full step-by-step
# procedures.

# Grant Farnsworth, "Econometrics in R":
#   http://cran.r-project.org/doc/contrib/Farnsworth-EconometricsInR.pdf
# Farnsworth is the best longer reference for economists.

# MAT 356 R Tutorial, Spring 2004:
#   http://math.illinoisstate.edu/dhkim/rstuff/rtutor.html
# I learned some things from it.

# "An Introduction to R", by Venables, Smith, et al. is bad.

# B. D. Ripley and D. J. Murdoch, "R for Windows FAQ":
#   http://cran.r-project.org/bin/windows/base/rw-FAQ.html
# It probably won't be useful.

######################################