n <- 100 
x <- rnorm(n, 5, 1) # generate sample of size n=100 from N(5,1)

mean(x)          # arithmetic mean
(prod(x))^(1/n)  # geometric mean
n/sum(1/x)       # harmonical mean

var(x)           # empirical variance
sd(x) / mean(x)  # empirical coefficient of variation

(sum((x - mean(x))^3)/n) / (sd(x)^3)         # Skewness
(sum((x - mean(x))^4)/n) / (sd(x)^4) - 3     # Excess

median(x)        # empirical median
quantile(x, probs = seq(0, 1, 1/4))  # series of empirical quantiles
range(x)

summary(x)
####################################################################################
# write your own "summary" function
# define the function named "my.summary"
my.summary <- function(data) {
  mys <- c(0, 0, 0, 0, 0) # initialize output
  mys[1] <- mean(data)
  mys[2] <- median(data)
  mys[3] <- sd(data)
  mys[4] <- (quantile(data, 0.75)-quantile(data, 0.25)) / 1.349
  mys[5] <- length(data)
  names(mys) <- c("mean", "median", "std.dev(mom)", "std.dev(iqr)", "n")
  return(mys)
}
options(digits=4)  # set print option
my.summary(x)      # function call

####################################################################################
# write your own empirical distribution function
edf <- function(x, ...){
  s <- sort(x);   n <- length(x)
  Fn <- c(0, (1:n)/n, 1);  eps <- min(diff(s))
  min <- floor(s[1]) - 2*eps;   max <- ceiling(s[n]) + 2*eps
  plot(c(min, s, max), Fn, type = "s", ylab = "edf", ...)
}

par(mfrow=c(3,4))  # series of several edf's on 1 page
for (i in 1:12)    # start loop
  edf(rnorm(20), xlim=c(-3, 3), xlab="x", ylim=c(0, 1))

par(mfrow=c(1,1))  # 500 edf's in 1 plot
for (i in 1:500){
  edf(rnorm(20), xlim=c(-3, 3), xlab="x", ylim=c(0, 1))
  par(new = T)     # don't open new graphic window
}
par(new = F)       # reset to new graphic window

####################################################################################
# Monte Carlo estimate of the type I error alpha
R <- 1000         # define number of replications
n <- 50           # define sample size
pn <- pt <- 1:R   # initialize 2 vectors (lists)
for (r in 1:R) {  # start loop
  z <- rnorm(n)   # draw sample from a N(0,1) population
  t <- rt(n, 1)   # draw sample from a t_1 population
  pn[r] <- t.test(z)$p.value  # save respective p-value
  pt[r] <- t.test(t)$p.value
}                 # end loop
sum(pn > 0.95); sum(pt > 0.95) # count the number of p-values larger 0.95

par(mfrow = c(1, 3))
plot(seq(-4, 4, .1), dnorm(seq(-4, 4, .1)), type="l", xlab="x", ylab="Density")
lines(seq(-4, 4, .1), dt(seq(-4, 4, .1), 1), lty=2, col="2") # adds 'lines' to the plot

hist(pn, xlab="p-value", main="N(0,1)", ylim=c(0,200)) # shows uniform behaviour
hist(pt, xlab="p-value", main="t(1)", ylim=c(0,200))   # no longer uniformly distributed
par(mfrow=c(1,1)) # set back to default

####################################################################################
# Tests
# KS test on petrol data
petrol <- c(11.5, 11.8, 12.0, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2) 
ks.test(petrol, "pnorm", 12, 1, alternative="two.sided") # Test on N(12, 1)

# Wilcoxon test on median 180
h <- c(179, 177, 178, 174, 170, 185, 175, 179, 176, 169, 186, 189, 168, 170, 174)
wilcox.test(h, mu=180)

# recalculate the above Wilcoxon statistic by using vectors
d <- h - 180             # build vector of differences
r.absd <- rank(abs(d))   # calc the ranks of absolute diff's
z <- (d >= 0)            # take only positive diff's
w <- t(z) %*% r.absd     # the Wilcoxon statistic - now a 1*1 matrix
w                        # print w

#####################################################################################
# Working with vectors and matrices 
# Create a 3*2 matrix
A <- matrix(c(1, 3, 2, 5, 6, 4), byrow=T, ncol=2);  A

# Create a 2*2 matrix
B <- matrix(c(2, 2, 3, 1), byrow=T, ncol=2);  B

# Matrix multiplication (operator '%*%')
A %*% B

# Square each element of a matrix
B^2

# Multiply a square matrix by itself
B %*% B

# Find the inverse of a square matrix
solve(B)

# Extract the diagonal of a square matrix
diag(B)

# add up diagonal elements
sum(diag(B))

# square root of the diagonal matrix
sqrt(diag(B))

# Concatenate a matrix (i.e. stack the columns)
c(A)

# Transpose a matrix
t(A)

# Bind the columns of two matrices
cbind(t(A),B)

########################################################################################
# EDA
e <- rexp(100, 1)  # generate 100 random variates from an exponential(1) population
boxplot(e)         # draw a boxplot of these values
hist(e)            # a histogram
d <- density(e, kernel="gaussian")
plot(d)            # a density estimate

########################################################################################
# External Data Sets
# read data
X <- matrix(scan("houses.dat"), byrow=T, ncol=5)
# check 45th row, 1st column
X[45,1]
# print all rows of 1st column
X[ ,1]
# calc mean of 1st column
mean(X[ ,1]) 
# calc variances of the columns
v <- apply(X, 2, var); v
# calc std-dev's of the columns
s <- sqrt(v); s
var(X) # variance-covariance matrix of the entire data set

# alternative handling using data frames
HP <- read.table("houses.dat", col.names=c("Price", "Size", "Bed", "Bath", "New"))

# show all objects available
objects()
# list object Price in data frame HP
HP$Price
# add object HP in search path
attach(HP)
summary(HP) 

boxplot(Price~New)  # draw boxplot for each value of "new"
plot(Size, Price, type="n", ylim=c(0,350), xlim=c(0,4)) # empty scatterplot
# add ponts to this scatterplot, where plotting character and colour depends on Bath
points(Size, Price, pch=Bath+1, col=Bath+1) 

table(New, Bath)
Pricef <- cut(Price, breaks=15+60*(0:5))  # factorize Price 
Sizef <- cut(Size, breaks=0.8*(0:5))      # factorize Size
table(Pricef, Sizef)                      # and tabulate their frequencies