###############################################
# A Brief Introduction to R
# Eric Vansteenberghe
# (adapted from Erin Hartman and Danny Hidalgo's code and Mark J. Bennet and Dirk L. Hugen book Financial analytics with R)
# Fall 2016
###############################################
# What is R?
# - language and environment mostly used for statistical computing and graphics
# - based on S, a statistical programming language started at Bell Labs
# - has a basis in Fortran and Scheme
# - extensible, capable of plugging in Fortran, C, C+, and others
# - open source (and free!)
# - many people contribute an abundance of useful packages
# Some Downsides?
# - inefficient for big computation problems
# - bad for big data, though getting better
# Installation
# can be found at:
#
# - download the binary for your OS (or install from source, if you wish)
# - run the installer and follow the instructions
# Text Editing, Workflow and Coding Best Practices
# good work flow will save you from headaches and heartaches!
# 1. always map your work flow in a summary file
# 2. code well, clearly, and consistently, and comment obsessively!
# 3. always log your results and data output
# 4. good file management starts with you!
# Getting to R
#
#######
# HELP
#######
# Most useful tool in R, help
# Starts an interactive html help file
help.start()
# help function for 'mean'
help(mean)
# also help for 'mean'
?mean
# search
# suppose we want to find the command for the product of a vector
help.search("product")
# note that now, we can find the help file
help(read.csv)
#############
# Navigating
############
#see what working directory you are in:
getwd()
# change to another working directory
setwd("/Users/skimeur/Google Drive/empirical_finance")
getwd()
# list all objects in your workspace (currently should be empty)
ls()
#Use a library (if not installed, use Tools, install package)
library(Matching)
#############
# Assignment
############
# there are two way of doing assignment, <- and =
# <- was historically the way of assignment, = can also be used and should make no difference for most of our usages
x <- 5
y = 2
assign("a",10)
x
y
a
#############
# Basic Math
#############
z <- x + y
z
z - 2
x * 5
y / 3
######################
# Classes and Casting
######################
# A class refers to the type of an object.
# Casting refers to recasting an object to another type of class
# there are a few major classes:
# integer and numeric:
x <- 1
class(x)
#x <- as.integer(x)
class(as.integer(x))
class(x)
x <- 2.5
class(x)
as.integer(x)
# character:
y <- "test"
class(y)
y <- as.numeric(y)
# there is also matrix, array, list, expression, and many more, which we will encounter later
###############
# Entering Data
###############
# note many facilities for importing data of various types into R:
# read.csv [comma], read.table [tab,space], read.dta [stata], read.xpt [sas], read.spss [.sav for spss]
p <-scan(n=3)
1
2
3
p
# matrices: [can be tricky though]
s <- t(matrix(scan(n=6),2))
1 4
2 5
3 6
s
# remove particular data from the workspace
rm(s,p,x,y,z,a)
#create a vector of elements using c(), with a "Not Available" element
x = c(1.3,1.2,1.3,NA,1.4,1.5)
######
# Plot
######
#Plot the data
plot(x,ylab = "values")
#test which element of the vector x is "Not Available"
is.na(x)
!is.na(x)
x[1]
x[4]
x[4]
x <- x[!is.na(x)]
#Filter for prices above 1.3
threshold <- 1.3
z=x[x>threshold]
length(x)
# You can see a NA, remove it
#Compute the difference of the log of x (as it they were prices)
y=diff(log(x))
#Rounding y
round(y,2)
#####################
# Defining a function
#####################
power <- function(x,y){
return(x^y)
}
#apply this function to our vector x
power(x,3)
#It is important to not here that you can use a local variable in the function definition, like x and y and it won't affect the actual value of you variable x and y outside of this function
#If you need to go around this, you can use the super-assignment: <<-
powerbis <- function(x,y){
x<<-4
return(x^y)
}
powerbis(x,2)
x
#By using this super-assignement <<- we changed the value of x even outside of our function (this is not usually wanted)
typeof(powerbis)
typeof(x)
typeof(z)
#we define another function
sincos <-function(x,y){
return(sin(x)+cos(y))
}
#####################
# if - else condition
####################
sinusite=1
if(sinusite == 2){
str = "power(2,3)"
} else{
str = "sincos(2,3)"
}
######
# eval
######
#Now that we have as an output a string (str), we can evaluate the string as an R expression using parse() and then evaluate it with eval()
eval(parse(text=str))
sincos(2,3)
power(2,3)
sinusite=2
eval(parse(text=str))
#Although we changed the value of sinusite, it didn't changed our result
#In order to have a dynamic evalution we can use:
evaluons<-function(sinusite){
ifelse(sinusite==2,
eval(parse(text="power(2,3)")),
eval(parse(text="sincos(2,3)")))
}
#Now it is more dynamic
sinusite=1
evaluons(sinusite)
sinusite=2
evaluons(sinusite)
##########
# For loop
##########
#We can go one step further with defining a for loop to apply the function to each element of the vector sinusite
#simple for loop example
nombre=1e8
for(i in 1:10){
nombre=nombre/i
}
print(nombre)
nombre2=1e8
print(nombre2/factorial(10))
#Now we apply it to our case
sinusite=c(1,2)
for(i in 1:length(sinusite)){
print(evaluons(sinusite[i]))
}
########
# sapply
########
#We can also simply use the function sapply() to apply our function to each element of our vectore sinusite
sapply(sinusite,evaluons)
x <- (1:20)
cube <- function(x){return(x^3)}
sapply(x,cube)
# clearing our workspace
rm(list = ls())
#we want to search for (sin(x) == cos(x) in the vincinity of 3.5
f<-function(x){
return(sin(x))
}
g<-function(x){
return(cos(x))
}
x=c(0,0.1,0.2,1)
f(x)
g(x)
f(x)==g(x)
# plot f: x -> f(x)
seq(1,20,by=1)==1:20
z <- seq(0,10,by = pi/10)
plot(z,f(z),"l")
lines(z,g(z))
############
# While loop
############
#we want to search in the vector [3.5,4]
y=0
difference<-abs(f(y)-g(y))
tolerance <- 0.001
steps <- 0.0001
while(difference > tolerance){
difference<-abs(f(y)-g(y))
y=y+steps
}
#we found an y for which sin(y) approx cos(y)
f(y)
g(y)
y
cos(pi/4)
sin(pi/4)
y-pi/4
########
# fslove
########
#finding cos(x)-sinf(x) = 0 varying x around 0 is what the fsolve function does for us
h<-function(x){
return(abs(cos(x)-sin(x)))
}
library(pracma)
resultat <- fsolve(h,0)
resultat$x
fsolve(h,0)$x - pi/4
# clearing our workspace
rm(list = ls())
#########
# Arrays
#########
#We can define two vectors and bind them into an array
#Define a vector of 1 and another as a sequence
ones <- rep(1,5)
tows <- seq(1, 100, by=20)
length(ones) == length(tows)
Array=cbind(ones,tows)
Array
# Accessing an element in the array Array[row,column]
# The indexing of row and column start at 1 (when it start at 0 with Python!)
Array[2,2]
#removing the second row
Array[-2,]
# subset
Array[1:3,]
###########
# Matrices
##########
mat2by4 <- matrix(1:8, nrow=2,ncol=4)
mat2by4
#Square of this matrix
#To multiply two matrices, use %*%
squaremat<-mat2by4%*%t(mat2by4)
squaremat
#recall that 1*2+3*4+5*6+7*8 = 100
################
# Error Handling
################
?tryCatch
#Taken from http://www.win-vector.com/blog/2012/10/error-handling-in-r/
inputs = list(1, 2, 4, -5, 'oops', 0, 10)
for (element in inputs) {
print(paste("log of", element, "=", log(element)))
}
#We got some error message that prevented the executions of the whole list
#We use try to avoid this and go through the whole list
for (input in inputs) {
try(print(paste("log of", input, "=", log(input))))
}
#To chose what to return in case of error, you can use tryCatch
for (input in inputs) {
tryCatch(print(paste("log of", input, "=", log(input))),
warning = function(w) {print(paste("negative argument", input));log(-input)},
error = function(e) {print(paste("non-numeric argument", input));NaN})
}
# clearing our workspace
rm(list = ls())
#######################
# Setting the precision
#######################
# you can chose how many digits are shown
# reminder: using 64-bit doubles your target accuracy is around 16 decimal digits
options(digits = 10)
pi
##########################
# Use of random generation
##########################
#We want to plot the density of the normal distribution unsiong rnorm
?rnorm
plot(density(rnorm(50,0,1)))
#Now with more observations we are getting closer to the theoretical distribution
plot(density(rnorm(1e6,0,1)))
#We can get random samples
set.seed(1)
sample(10)
set.seed(1)
sample(10)
#If we want to make our random sample reproducible, we can use set.seed()
set.seed(1)
x <- rbinom(10,10,0.1)
y <- rbinom(10,10,0.1)
set.seed(1)
z <- rbinom(10,10,0.1)
sum(x - y)
sum(x - z)
# rm(.Random.seed, envir=.GlobalEnv)
# to reset seed
######################
# Working with strings
######################
# Create a list of strings
tickers <- c("RNO","SG","BNP")
# Find in which position SG is located
match("SG",tickers)
# Concatenate strings
liste <- paste("RNO","SG","BNP",sep = ",")
print(liste)
# Get a substring
substr(liste,5,6)
#############
# Data Frames
#############
# Data Frames is a sequence of rows where the columns are heterogeneously typed
L3 <- LETTERS[1:3]
fac <- sample(L3 , 10 , replace=TRUE)
df <- data.frame(x = 1,y=1:10,fac=fac)
df
df$fac
#Working on the DataFrame column names
names(df)
names(df) <- c(names(df)[1:2],"factor")
#We can write this DataFrame to a csv file
write.csv(df,file = "df.csv",row.names = FALSE)
# Accessing elements of a DataFrame
df[1:3,2]
# Accessing a column (as if it was a list, see below)
df[[3]]
df[,3]
# Nabigating through a DataFrame
df <- data.frame(x=1:4,y=5:8,z=9:12,a=13:16)
print(paste("df has",dim(df)[1],"rows and",dim(df)[2],"columns"))
df
df^2
df <- df^2
for (i in 1:dim(df)[1]) {
for (j in 1:dim(df)[2]){
df[i,j] <- df[i,j]^2
}
}
df
# merging (horizontal) and rbinding (vertical) two data frames
?merge
# we take 3 data frames
df1 <- data.frame(A = 1,B=1:10,C=sample(LETTERS[1:3] , 10 , replace=TRUE))
df2 <- data.frame(B=1:10,D=11:20)
df3 <- data.frame(A=1000,B=1001:1010,C="bob")
df4 <- data.frame(B=1:9,C="alors?")
# we can merge both df1 and df2 keeping B
df.merged <- merge(df1,df2,by="B")
# if both data frame don't have the same number of rows
df.merged.bis <- merge(df1,df4, by = "B")
# if you want to keep all rows and add NAs
df.merged.bis.all <- merge(df1,df4, by = "B",all.x=TRUE)
# we can stack vertically data frame df1 and data frame df3
df.stacked <- rbind(df1,df3)
# but to stack vertically you need to make sure that the columns of the data frames match
df.stacked.bis <- rbind(df1,df2)
# or use rbind.fill
library(plyr)
df.stacked.bis <- rbind.fill(df1,df2)
# clearing our workspace
rm(list = ls())
########
# Lists
#######
# List are similar to vectors except that they are recursively formed; you can construct list of lists
# The list() operator allow to keep the types of the different items you add to the list
v <- c(1,c(1,1),"A")
l <- list(1,c(1,1),"A")
v
l
# Return c(1,1) as a list
l[2]
# Return c(1,1) as a itself (a vector)
l[[2]]
# Illustration of the advantages of using lists
A <- matrix(c("RNO","SG","BNP"))
B <- matrix(c(1.1,1.2,1.3))
res <- list(A,B)
res[[1]]
res[[2]]
# clearing our workspace
rm(list = ls())
##################################
# Loading Data from external files
##################################
# Exercise :
# 1: create a function that give you log (x- threshold)
# 2: use tryCatch to return a NA in case there is a warning
# 3: apply your function to the house prices in Farmer 2015
# 4: for the NA in your create list, replace with the value of the previous result
data <- read.csv(file = "R_data/farmer_2015.csv", sep = ";")
thr <- 5
myfunc(thr,6)
x <- 7
y <- 6
log(-1)
log("oops")
tryCatch(log(x - thr),
warning = function(w) {;log(y - thr)},
error = function(e) {;log(y - thr)})
test <- sapply(data[,2], myfunc)
library(zoo)
test <- zoo(test)
test <- na.locf(test)
head(data)
# what do we know about 'data'
# gives us the dimensions as nrow, ncol
dim(data)
# gives the column names of data
names(data)
# we can use the $ to access a named column of our object
# by typing objectName$columnName
data$house_price
# gives us the length of a vector
length(data$house_price)
# loading data from an R library
# use the lalonde data set in the Matching library
data(lalonde)
ls()
names(lalonde)
dim(lalonde)
lalonde$age
# Some Basic Commands:
# Making Objects:
# sequence
# a sequence from 0 to 5 in 0.5 increments
seq.1 <- seq(0, 5, 0.5)
seq.1
class(seq.1)
# a sequence of integers
seq2 <- 1:10
seq2
class(seq2)
# making squares
myX <- 1:4
myX
# note: for an array, ^2 will to element wise operation
myX <- myX^2
myX
# assign some names to our array usinc "c()", which concatenates
names(myX)
names(myX) <- c("1 squared", "2 squared", "3 squared", "4 squared")
myX
class(myX)
# matrices
mat1 <- matrix(data = 1:10, nrow = 5, ncol = 2, byrow = TRUE)
mat1
mat2 <- matrix(1:10, 5, 2, byrow = FALSE)
mat2
# element wise logical comparison
# == will test for equality in the two compared objects
mat1 == mat2
ls()
x == y
x == z
x == x
# clearing our workspace
rm(list = ls())
ls()
# data frames
# a data frame is a list of variables of the same length with unique row names.
# it is best to put your data in a data frame when working with it
# when you read in data, R coerces it into a data.frame class
data <- read.csv(file = "R_data/farmer_2015.csv", sep= ";")
class(data)
names(data)
rownames(data)
# we can also make our own data frame
x <- 1:16
y <- seq(1, 4, .2)
length(x)
length(y)
myData <- data.frame( X = x, Y = y )
names(myData)
dim(myData)
class(myData)
myData$X
myData$Y
# attach allows us to access the columns of a data frame without typing the object name and run some diagnostics
attach(myData)
X
mean(X)
min(X)
Y
max(Y)
sd(Y)
detach(myData)
# Indexing... Super Important
# indexing arrays:
x
# return the first five elements of x
x[1:5]
# set the first five elements of x to NA
x[1:5] <- NA
x
# check for missing values, returns a logical vector
is.na(x)
class(is.na(x))
# putting a ! infront of a logical command will negate it
!is.na(x)
# find the number of missing values and non-missing values
sum(is.na(x))
sum(!is.na(x))
# check that all elements are accounted for
sum(is.na(x)) + sum(!is.na(x)) == length(x)
# making a new vector from a subset of an existing vector
# in this case, make a vector of all non-missing values of x
x2 <- x[!is.na(x)]
x2
length(x2)
# in this case, drop the first three values
x3 <- x[-c(1:3)]
x3
# indexing matrices:
# same as before, except now we must include a row and column value
data
# are there any missing data points in data$x1
sum(is.na(data$house_price))
# what about in the whole data set
sum(is.na(data))
# element row = 2, column = 1
data[2, 1]
# rows in which x1 < 0
data[data$house_price < 0, ]
# rows in which x1 < 0 and only columns 1 to 3
data[data$house_price < 0, 1:3]
attach(data)
data[house_price < 0, 1:3]
detach(data)
# indexing variables
# find all values of y1 for which x1 is also negative
data$ln_stock[data$x1 < 0]
# finding subsets
# note: we have >, <, >=, <=, !=, == as logical operators to use
# we can use & to do and statements and | to do or statements
library(MASS)
data(painters)
names(painters)
rownames(painters)
attach(painters)
painters[School == "A", ]
rownames(painters[School == "A", ])
# the & means that the row must meet both conditions
# putting nothing after the comma means "all columns"
painters[School == "A" & Drawing < 15, ]
subPainters <- painters[School == "A" & Drawing < 15, ]
subPainters
colours <- painters[School == "A" & Drawing >= 15, 3]
colours
painters[School == "A" | School == "B", ]
painters[School == "C" | School == "D", c(1:3)]
# putting nothing before the comma means "all rows"
painters[, 2]
detach(painters)
# String manipulation in R
# 'paste()' is helpful
# great for text data, naming things, searching over variables
vstring=painters[1:3,1]
paste(vstring[1:3],'-',sep='')
paste(vstring[1],'-',vstring[2],'-',vstring[3],'-',sep='')
# 'stringr' package is very useful for manipulating string objects in R
library(stringr)
str_length(paste(vstring[1],'-',vstring[2],'-',vstring[3],'-',sep=''))
str_sub(paste(vstring[1],'-',vstring[2],'-',vstring[3],'-',sep=''),1,5)
# For Loops
# we can use for loops to iterate through a process
# we index with i, and the for loop iterates through the process from the starting value to
# the last value, in this case starting at 1, incrementing by 1 each time, until we reach 5
for( i in 1:5) {
print(paste("Surfs up! I'm on interation", i))
}
for( i in 5:1) {
print(paste("Surfs down! I'm on interation", i))
}
# in this case we don't plug in consecutive values for i
for( i in c(3,10,9)) {
print(paste("Bummer! I've got i equal to", i))
}
# making squares using a for loop
squares <- NULL
for( i in 1:5) {
squares <- c(squares, i^2)
print(squares)
print(paste("The mean is currently", mean(squares)))
}
squares
mean(squares)
# dealing with columns of data
data
for( i in 2:ncol(data)) {
print(paste("The mean of column", i, "is", mean(data[, i]) ))
}
# find the max for each of the rows of data using a for loop
# note: the object we index by doesn't have to be 'i', that is just convention
for( j in 1:nrow(data)) {
print(paste("The max of row", j, "is", max(data[j, 2:ncol(data)]) ))
}
# Apply
# apply is similar to the idea of a for loop, but it is taylored to matrices and data frame
help(apply)
# we must pass in our object, the margin we want to iterate over (1 means rows, 2 means columns), and a function
# find the means of all the columns of data
means <- apply(data[,2:ncol(data)], 2, mean)
means
# find the max of all the rows of data
rowMax <- apply(data, 1, max)
rowMax
# Linear Regression:
data(cars)
attach(cars)
help(lm)
# make a new variable, speed^2
speed2 = speed^2
# model 1, just speed on distance with intercept
lm1 = lm(dist ~ speed)
summary(lm1)
# model 2, speed and speed^2 on distance with intercept
lm2 = lm(dist ~ speed + speed2)
summary(lm2)
# model 3, speed and speed^2 no intercept
lm3 = lm(dist ~ speed + speed2 - 1)
summary(lm3)
detach(cars)
# addition operators
# : sequence operator
# counts whole numbers from n to m; used in looping for instance
1:9
# integer sequences
seq(from=1,to=9,by=.7)
# matrix operations
dim(data)[2]==dim(data)[1]
databis<-data[,2:ncol(data)]
X=as.matrix(databis[,1:(dim(databis)[2]/2)])
Y=as.matrix(databis[,(dim(databis)[2]/2+1):dim(databis)[2]])
dim(X)
dim(Y)
#Transpose
t(X)
#Matrix multiplicatoin
t(Y)%*%X
#inverse
solve(t(Y)%*%X)
chol(solve(t(Y)%*%X))
#Testing a function (you need to install tempdisagg first)
library(tempdisagg)
data(swisspharma)
# one indicator, no intercept
mod1 <- td(sales.a ~ 0 + exports.q)
summary(mod1)
plot(mod1)
# disaggregated and true series
ts.plot(cbind(predict(mod1), sales.q), col = c("red", "blue"),
main = "Quarterly Sales, Pharmaceuticals and Chemicals")
legend("topleft", c("Estimated values", "True values"),
lty = 1, col = c("red", "blue"), inset = 0.05)