n <- 100 x <- rnorm(n, 5, 1) # generate sample of size n=100 from N(5,1) mean(x) # arithmetic mean (prod(x))^(1/n) # geometric mean n/sum(1/x) # harmonical mean var(x) # empirical variance sd(x) / mean(x) # empirical coefficient of variation (sum((x - mean(x))^3)/n) / (sd(x)^3) # Skewness (sum((x - mean(x))^4)/n) / (sd(x)^4) - 3 # Excess median(x) # empirical median quantile(x, probs = seq(0, 1, 1/4)) # series of empirical quantiles range(x) summary(x) #################################################################################### # write your own "summary" function # define the function named "my.summary" my.summary <- function(data) { mys <- c(0, 0, 0, 0, 0) # initialize output mys[1] <- mean(data) mys[2] <- median(data) mys[3] <- sd(data) mys[4] <- (quantile(data, 0.75)-quantile(data, 0.25)) / 1.349 mys[5] <- length(data) names(mys) <- c("mean", "median", "std.dev(mom)", "std.dev(iqr)", "n") return(mys) } options(digits=4) # set print option my.summary(x) # function call #################################################################################### # write your own empirical distribution function edf <- function(x, ...){ s <- sort(x); n <- length(x) Fn <- c(0, (1:n)/n, 1); eps <- min(diff(s)) min <- floor(s[1]) - 2*eps; max <- ceiling(s[n]) + 2*eps plot(c(min, s, max), Fn, type = "s", ylab = "edf", ...) } par(mfrow=c(3,4)) # series of several edf's on 1 page for (i in 1:12) # start loop edf(rnorm(20), xlim=c(-3, 3), xlab="x", ylim=c(0, 1)) par(mfrow=c(1,1)) # 500 edf's in 1 plot for (i in 1:500){ edf(rnorm(20), xlim=c(-3, 3), xlab="x", ylim=c(0, 1)) par(new = T) # don't open new graphic window } par(new = F) # reset to new graphic window #################################################################################### # Monte Carlo estimate of the type I error alpha R <- 1000 # define number of replications n <- 50 # define sample size pn <- pt <- 1:R # initialize 2 vectors (lists) for (r in 1:R) { # start loop z <- rnorm(n) # draw sample from a N(0,1) population t <- rt(n, 1) # draw sample from a t_1 population pn[r] <- t.test(z)$p.value # save respective p-value pt[r] <- t.test(t)$p.value } # end loop sum(pn > 0.95); sum(pt > 0.95) # count the number of p-values larger 0.95 par(mfrow = c(1, 3)) plot(seq(-4, 4, .1), dnorm(seq(-4, 4, .1)), type="l", xlab="x", ylab="Density") lines(seq(-4, 4, .1), dt(seq(-4, 4, .1), 1), lty=2, col="2") # adds 'lines' to the plot hist(pn, xlab="p-value", main="N(0,1)", ylim=c(0,200)) # shows uniform behaviour hist(pt, xlab="p-value", main="t(1)", ylim=c(0,200)) # no longer uniformly distributed par(mfrow=c(1,1)) # set back to default #################################################################################### # Tests # KS test on petrol data petrol <- c(11.5, 11.8, 12.0, 12.4, 12.5, 12.6, 12.8, 12.9, 13.0, 13.2) ks.test(petrol, "pnorm", 12, 1, alternative="two.sided") # Test on N(12, 1) # Wilcoxon test on median 180 h <- c(179, 177, 178, 174, 170, 185, 175, 179, 176, 169, 186, 189, 168, 170, 174) wilcox.test(h, mu=180) # recalculate the above Wilcoxon statistic by using vectors d <- h - 180 # build vector of differences r.absd <- rank(abs(d)) # calc the ranks of absolute diff's z <- (d >= 0) # take only positive diff's w <- t(z) %*% r.absd # the Wilcoxon statistic - now a 1*1 matrix w # print w ##################################################################################### # Working with vectors and matrices # Create a 3*2 matrix A <- matrix(c(1, 3, 2, 5, 6, 4), byrow=T, ncol=2); A # Create a 2*2 matrix B <- matrix(c(2, 2, 3, 1), byrow=T, ncol=2); B # Matrix multiplication (operator '%*%') A %*% B # Square each element of a matrix B^2 # Multiply a square matrix by itself B %*% B # Find the inverse of a square matrix solve(B) # Extract the diagonal of a square matrix diag(B) # add up diagonal elements sum(diag(B)) # square root of the diagonal matrix sqrt(diag(B)) # Concatenate a matrix (i.e. stack the columns) c(A) # Transpose a matrix t(A) # Bind the columns of two matrices cbind(t(A),B) ######################################################################################## # EDA e <- rexp(100, 1) # generate 100 random variates from an exponential(1) population boxplot(e) # draw a boxplot of these values hist(e) # a histogram d <- density(e, kernel="gaussian") plot(d) # a density estimate ######################################################################################## # External Data Sets # read data X <- matrix(scan("houses.dat"), byrow=T, ncol=5) # check 45th row, 1st column X[45,1] # print all rows of 1st column X[ ,1] # calc mean of 1st column mean(X[ ,1]) # calc variances of the columns v <- apply(X, 2, var); v # calc std-dev's of the columns s <- sqrt(v); s var(X) # variance-covariance matrix of the entire data set # alternative handling using data frames HP <- read.table("houses.dat", col.names=c("Price", "Size", "Bed", "Bath", "New")) # show all objects available objects() # list object Price in data frame HP HP$Price # add object HP in search path attach(HP) summary(HP) boxplot(Price~New) # draw boxplot for each value of "new" plot(Size, Price, type="n", ylim=c(0,350), xlim=c(0,4)) # empty scatterplot # add ponts to this scatterplot, where plotting character and colour depends on Bath points(Size, Price, pch=Bath+1, col=Bath+1) table(New, Bath) Pricef <- cut(Price, breaks=15+60*(0:5)) # factorize Price Sizef <- cut(Size, breaks=0.8*(0:5)) # factorize Size table(Pricef, Sizef) # and tabulate their frequencies