knitr = literate statistical programming package for R (an alternative to Sweave)
“Ask yourselves, what problem have you solved, ever, that was worth solving, where you knew all of the given information in advance? Where you didn’t have a surplus of information and have to filter it out, or you had insufficient information and have to go find some?” - Dan Meyer
# If it isn't installed, install the kernlab package with install.packages()
library(kernlab)
data(spam)
set.seed(3435)
trainIndicator = rbinom(4601, size = 1, prob = 0.5)
table(trainIndicator)
## trainIndicator
## 0 1
## 2314 2287
trainSpam = spam[trainIndicator == 1, ]
testSpam = spam[trainIndicator == 0, ]
Exploratory data analysis:
names(), summary(), head() = examine the variable names, summary statistics, and first few rows of the training set
table(trainSpam$type) = tabulate how many messages are spam vs. nonspam
plot(trainSpam$capitalAve ~ trainSpam$type) = compare the average run length of capital letters between the two groups
plot(log10(trainSpam$capitalAve + 1) ~ trainSpam$type) = the same comparison on the log10 scale; 1 is added before taking the log because many of the values are 0
plot(log10(trainSpam[, 1:4] + 1)) = pairwise scatterplots of the first four (log-transformed) predictors
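Collected into one runnable block (using the trainSpam data frame created above), these exploratory steps look like this:
names(trainSpam)
head(trainSpam)
summary(trainSpam)
# how many spam vs. nonspam messages in the training set
table(trainSpam$type)
# capitalAve is heavily right-skewed, so also compare it on the log10 scale
# (adding 1 avoids taking the log of zero)
plot(trainSpam$capitalAve ~ trainSpam$type)
plot(log10(trainSpam$capitalAve + 1) ~ trainSpam$type)
# pairwise scatterplots of the first four (log-transformed) predictors
plot(log10(trainSpam[, 1:4] + 1))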
# hierarchical clustering of the predictors; the raw scale is dominated by skewed variables
hCluster = hclust(dist(t(trainSpam[, 1:57])))
# redo the clustering on the log10(x + 1) scale, which is more informative
hClusterUpdated = hclust(dist(t(log10(trainSpam[, 1:55] + 1))))
# recode the outcome as 0 (nonspam) / 1 (spam) for logistic regression
trainSpam$numType = as.numeric(trainSpam$type) - 1
costFunction = function(x, y) sum(x != (y > 0.5))
cvError = rep(NA, 55)
library(boot)
for (i in 1:55) {
# creates formula with one variable and the result
lmFormula = reformulate(names(trainSpam)[i], response = "numType")
glmFit = glm(lmFormula, family = "binomial", data = trainSpam)
# cross validated error
cvError[i] = cv.glm(trainSpam, glmFit, costFunction, 2)$delta[2]
}
# Which predictor has minimum cross-validated error?
names(trainSpam)[which.min(cvError)]
## [1] "charDollar"
# Use the best model from the group
predictionModel = glm(numType ~ charDollar,family="binomial",data=trainSpam)
# Get predictions on the test set
predictionTest = predict(predictionModel,testSpam)
predictedSpam = rep("nonspam",dim(testSpam)[1])
# Classify as 'spam' for those with prob > 0.5
predictedSpam[predictionModel$fitted > 0.5] = "spam"
# Classification table
table(predictedSpam, testSpam$type)
##
## predictedSpam nonspam spam
## nonspam 1346 458
## spam 61 449
# Error rate
(61 + 458)/(1346 + 458 + 61 + 449)
## [1] 0.2242869
knitr
Markdown syntax:
*text* = italics
**text** = bold
# Heading = main heading (## Heading and ### Heading for secondary/tertiary headings)
- first element = unordered list item
1. first element = ordered (numbered) list item
[text](url) = inline link
[text][1] = reference-style link \(\rightarrow\) later in the document, define all of the links in this format: [1]: url "text"

knitr package
======== underneath a line of text = indicates the title of the document (large text)
$expression$ = indicates a LaTeX expression/formatting
`text` = changes text to code format (typewriter font)
```{r name, echo = FALSE, results = "hide"} ... ``` = R code chunk
name = name of the code chunk
echo = FALSE = turns off the echo of the R code in the chunk, so only the result is displayed
results = "hide" = hides the results from being placed in the markdown document
`r variable` = prints the value of that variable directly inline with the text
```{r scatterplot, fig.height = 4, fig.width = 6} ... plot() ... ``` = inserts a plot into the markdown document
scatterplot = name of this code chunk (can be anything)
fig.height = 4 = adjusts the height of the figure; specifying the height alone produces a rectangular plot rather than the default square one
fig.width = 6 = adjusts the width of the figure
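Putting these pieces together, a minimal R Markdown file might look like the sketch below; the title, text, chunk names, simulated data, and link are invented for illustration.
My Analysis Report
==================

This sentence uses *italics*, **bold**, and `code` formatting, plus a [reference link][1].

```{r simulatedata, echo = FALSE, results = "hide"}
x <- rnorm(100)   # simulated data so the inline expression below has something to report
```

The mean of x is `r round(mean(x), 2)`.

```{r scatterplot, fig.height = 4, fig.width = 6}
plot(x, pch = 19)
```

[1]: http://www.example.com "placeholder link"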
xtable package (install with install.packages("xtable"))
xtable() prints the table in HTML format, which is better presented than the plain-text default
library(datasets)
library(xtable)
fit <- lm(Ozone ~ Wind + Temp + Solar.R, data = airquality)
xt <- xtable(summary(fit))
print(xt, "html")
| | Estimate | Std. Error | t value | Pr(>|t|) |
|---|---|---|---|---|
| (Intercept) | -64.3421 | 23.0547 | -2.79 | 0.0062 |
| Wind | -3.3336 | 0.6544 | -5.09 | 0.0000 |
| Temp | 1.6521 | 0.2535 | 6.52 | 0.0000 |
| Solar.R | 0.0598 | 0.0232 | 2.58 | 0.0112 |
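In an R Markdown document this table would normally be generated inside a chunk with results = "asis" (see the chunk options below), so the HTML produced by xtable passes straight through to the output instead of being treated as ordinary printed results; a minimal sketch (the chunk name fitmodel is arbitrary):
```{r fitmodel, results = "asis"}
library(xtable)
fit <- lm(Ozone ~ Wind + Temp + Solar.R, data = airquality)
xt <- xtable(summary(fit))
print(xt, type = "html")
```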
Setting global options:
```{r setoptions, echo = FALSE} opts_chunk$set(echo = FALSE, results = "hide") ``` = sets the default option to not print the code/results unless otherwise specified in a particular chunk

Common chunk options:
results = "asis" OR "hide"
"asis" = output to stay in original format and not compiled into HTMLecho = TRUE OR FALSEfig.height = numericfig.width = numericcache = TRUElibrary(knitr)
# process an R Markdown document from the R console
library(knitr)
setwd(<working directory>)
knit2html("document.Rmd")
browseURL("document.html")
knitr documents
.Rmd document \(\rightarrow\) knitr processes the file into Markdown (.md) \(\rightarrow\) the Markdown file is converted to HTML (e.g. by knit2html())
download.file("url", "filename") = convenient way to download a file from within R
sessionInfo() = prints the R version, operating system, locale, and base/attached/loaded packages
set.seed() = can be used to specify the seed for the random number generator in R, so "random" results can be reproduced
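A small sketch of these helpers in use; the URL and file name are placeholders rather than anything from the analyses above:
# download a data file (placeholder URL and destination)
download.file("https://example.com/dataset.csv", destfile = "dataset.csv")
# fix the seed so the 'random' draws below are the same on every run
set.seed(3435)
rnorm(3)
# record the R version, OS, locale, and loaded packages for the write-up
sessionInfo()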
cacher Package
the cacher package parses R source files and creates the necessary cache directories/subdirectories (analogous to the cache = TRUE option in knitr)
the cachepackage() function creates a cacher package that stores the source file together with its cached results, so the analysis can be distributed to and verified by others

Working with someone else's cacher package:
clonecache(id = "####") = clones the cacher package with that id and loads data from the cache
showfiles() = lists the R scripts available in the cache
sourcefile("name.R") = loads a cached R file
code() = prints the content of the R file line by line
graphcode() = plots a graph showing the dependencies/structure of the code
objectcode("object") = shows the lines of code that were used to generate that specific object (tracing all the way back to reading in the data)
runcode() = executes the code by loading data from the cached database (much faster than running everything from scratch)
checkcode() = evaluates all expressions from scratch
checkobjects() = checks the integrity of data objects (i.e. detects possible data corruption)
loadcache() = loads pointers to the data objects in the database
library(cacher)
clonecache(id = "092dcc7dda4b93e42f23e038a60e1d44dbec7b3f")
clonecache(id = "092d") ## effectively the same as above
# output: created cache directory '.cache'
showfiles() # show files stored in cache
# output: [1] "top20.R"
sourcefile("top20.R") # load R script
code() # examine the content of the code
# output:
# source file: top20.R
# 1 cities <- readLines("citylist.txt")
# 2 classes <- readLines("colClasses.txt")
# 3 vars <- c("date", "dow", "death",
# 4 data <- lapply(cities, function(city) {
# 5 names(data) <- cities
# 6 estimates <- sapply(data, function(city) {
# 7 effect <- weighted.mean(estimates[1,
# 8 stderr <- sqrt(1/sum(1/estimates[2,
graphcode() # generate graph showing structure of code
objectcode("data")
# output:
# source file: top20.R
# 1 cities <- readLines("citylist.txt")
# 2 classes <- readLines("colClasses.txt")
# 3 vars <- c("date", "dow", "death", "tmpd", "rmtmpd", "dptp", "rmdptp", "l1pm10tmean")
# 4 data <- lapply(cities, function(city) {
# filename <- file.path("data", paste(city, "csv", sep = "."))
# d0 <- read.csv(filename, colClasses = classes, nrow = 5200)
# d0[, vars]
# })
# 5 names(data) <- cities
loadcache()
ls()
# output:
# [1] "cities" "classes" "data" "effect"
# [5] "estimates" "stderr" "vars"
cities
# output:
# / transferring cache db file b8fd490bcf1d48cd06...
# [1] "la" "ny" "chic" "dlft" "hous" "phoe"
# [7] "staa" "sand" "miam" "det" "seat" "sanb"
# [13] "sanj" "minn" "rive" "phil" "atla" "oakl"
# [19] "denv" "clev"
effect
# output:
# / transferring cache db file 584115c69e5e2a4ae5...
# [1] 0.0002313219
stderr
# output:
# / transferring cache db file 81b6dc23736f3d72c6...
# [1] 0.000052457