R

R is a programming language and software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing.

Examples

Assign values

> a=16
> b<-1/3

Print values

Get all variables you filled so far

> ls()

> myVector <- c(3,5,7,9)
> myVector

 3 5 7 9

Get 3. element

> myVector[3]

[1] 7

Get type of vector (index 0 returns an empty vector of the same type)

> myVector[0]

numeric(0)

Create a vector of a given size in which every value is 0

a <- numeric(10)

Get an overview of the data in a vector

> summary(myVector)

Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
3.0     4.5     6.0     6.0     7.5     9.0

For data with more than one column, get the summary only for the column "Clicks"

> summary(myOtherVector$Clicks)

Add 1 to every element in the vector

> myVector

 3 5 7 9

> myVector<-myVector+1
> myVector

  4  6  8 10

Watch out if vectors have a different size! R recycles the shorter vector to match the length of the longer one.

> a=c(1,2,3,4)
> b=c(5,6)
> a+b

  6  8  8 10

Calculate

> b=sqrt(16)
> e=exp(1)
> e

 2.718282
> log(e)

 1

Mean

> mean(myVector)

 7
> var(myVector)

 6.666667
> round(myVector, digits=2)

Sum

> sum(myVector)

 28
> sort(question1,decreasing = TRUE)

 "Weekly"      "Weekly"      "Sometimes"   "Sometimes"   "Sometimes"   "Sometimes"   "Once a year" "Never"
> min(question1)

 "Never"
> max(question1)

 "Weekly"

Filter Only smaller 5

> myValues<-c(1,8,2,4,7,1,2,3)
> myValues[myValues<5]

 1 2 4 1 2 3
> myValues[myValues==2 | myValues==1]

 1 2 1 2

Get help on a command

> ?summary
> help(summary)

File commands

> dir()
> getwd()

Columns may have names, get all the names

> names(myCSVFile)

Get values from a column by name (column has name "Clicks")

> myCSVFile$Clicks

Get all column and all row names

> attributes(myCSVFile)

Strings

> s <- "Hello World"

Factors Sometimes columns do not contain numbers but names or labels from a discrete set of possible values. For example, every row might belong to group A, B or C, so you have a column Group and every row has one of those labels in it. R calls such a value a Factor. If your labels are numbers (e.g. 1, 2 or 3) you can tell R that they are Factors.

> myCSVFile$Reviews<-factor(myCSVFile$Reviews)

Data Frames Data Frames can group several different variables into one object

myFrame <- data.frame(A=a,B=b)

Logical

> spock=5>2
> spock

 TRUE

Matrix

> myMatrix=matrix(c(4,3,2,5,6,7,8,9,0),ncol=3,byrow=TRUE)
> myMatrix

[,1] [,2] [,3]
[1,]    4    3    2
[2,]    5    6    7
[3,]    8    9    0
> colnames(myMatrix) <- c("A","B","C")
> myMatrix

A B C
[1,] 4 3 2
[2,] 5 6 7
[3,] 8 9 0

Tables Get some factors and a table to find out how often the different values appear

> people<-factor(c("Smoker", "NonSmoker", "NonSmoker", "Smoker", "Smoker", "Smoker"))
> myTable<-table(people)
> myTable

people
NonSmoker    Smoker
2         4

A matrix with named columns and rows may also be turned into a table

> myMatrix=matrix(c(4,3,2),ncol=3,byrow=TRUE)
> colnames(myMatrix) <- c("A","B","C")
> rownames(myMatrix) <- c("R1")
> myTable<-as.table(myMatrix)
> myTable

A B C
R1 4 3 2

Two Way Tables

Assume you asked 8 people 2 questions. A two-way table shows, for each answer to the first question, how often each answer to the second question was given by the same people.

> question1<-c("Sometimes","Sometimes","Never","Weekly","Weekly","Sometimes","Sometimes","Once a year")
> question2<-c("Maybe","Maybe","Yes","Maybe","Maybe","No","Yes","No")
> myTable<-table(question1,question2)
> myTable

question2
question1     Maybe No Yes
Never           0  0   1
Once a year     0  1   0
Sometimes       2  1   1
Weekly          2  0   0

Probability

Plotting

Get linear regression model for y=f(x)

> year <- c(2000 ,   2001  ,  2002  ,  2003 ,   2004)
> rate <- c(9.34 ,   8.50  ,  7.62  ,  6.93  ,  6.60)
> fit <- lm(rate ~ year)
> plot(year,rate)
> abline(fit)
> plot(fit)

Demo data There is some demo data included

> ?mtcars
> mtcars

mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
...

Correlation Get test data

> myData<-with(mtcars, data.frame(mpg, hp, wt))
> summary(myData)

mpg              hp              wt
Min.   :10.40   Min.   : 52.0   Min.   :1.513
1st Qu.:15.43   1st Qu.: 96.5   1st Qu.:2.581
Median :19.20   Median :123.0   Median :3.325
Mean   :20.09   Mean   :146.7   Mean   :3.217
3rd Qu.:22.80   3rd Qu.:180.0   3rd Qu.:3.610
Max.   :33.90   Max.   :335.0   Max.   :5.424

Packages

> library(reshape2)

Error in library(reshape2) : there is no package called ‘reshape2’

> install.packages("reshape2")

Installing package into ‘/usr/local/lib/R/site-library’
> library(reshape2)
> install.packages('ggplot2', dep = TRUE)

ggplot2 : Quick correlation matrix heatmap http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization

> myData<-with(mtcars, data.frame(mpg, hp, wt))

Calculate Correlation

> myCorrelation=round(cor(myData),2)
> myCorrelation

mpg    hp    wt
mpg  1.00 -0.78 -0.87
hp  -0.78  1.00  0.66
wt  -0.87  0.66  1.00

Filter out the upper triangle

> upper.tri(myCorrelation)

[,1]  [,2]  [,3]
[1,] FALSE  TRUE  TRUE
[2,] FALSE FALSE  TRUE
[3,] FALSE FALSE FALSE

Overwrite it with NA

> myCorrelation[upper.tri(myCorrelation)] <- NA
> myCorrelation

mpg   hp wt
mpg  1.00   NA NA
hp  -0.78 1.00 NA
wt  -0.87 0.66  1

Only one correlation per line

> library(reshape2)
> myDataMelted=melt(myCorrelation)
> myDataMelted

Var1 Var2 value
1  mpg  mpg  1.00
2   hp  mpg -0.78
3   wt  mpg -0.87
4  mpg   hp    NA
5   hp   hp  1.00
6   wt   hp  0.66
7  mpg   wt    NA
8   hp   wt    NA
9   wt   wt  1.00

Filter out the NA

> myDataMelted=na.omit(myDataMelted)
> myDataMelted

Var1 Var2 value
1  mpg  mpg  1.00
2   hp  mpg -0.78
3   wt  mpg -0.87
5   hp   hp  1.00
6   wt   hp  0.66
9   wt   wt  1.00

Plot it

> library(ggplot2)
> ggplot(data = myDataMelted, aes(x=Var1, y=Var2, fill=value)) + geom_tile()

Bigger example, fancier

> library(reshape2)
> myData<-with(mtcars, data.frame(mpg, cyl,  disp,  hp, drat,    wt,  qsec, vs, am, gear, carb))
> myCorrelation=round(cor(myData),2)
> hc=hclust(as.dist(1-myCorrelation)/2)
> myCorrelationSorted<-myCorrelation[hc$order, hc$order]
> myCorrelationSorted[lower.tri(myCorrelation)] <- NA
> data<-na.omit(melt(myCorrelationSorted))
> library(ggplot2)
> plotMe<-ggplot(data, aes(Var2, Var1, fill = value))+  geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1),
name="Pearson\nCorrelation") + theme_minimal()+ theme(axis.text.x = element_text(angle = 45, vjust =
1, size = 12, hjust = 1))+coord_fixed()
> print(plotMe)
> plotMe +  geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +  theme(
axis.title.x = element_blank(),  axis.title.y = element_blank(), panel.grid.major = element_blank(),
panel.border = element_blank(), panel.background = element_blank(), axis.ticks = element_blank(),
legend.justification = c(1, 0), legend.position = c(0.6, 0.7), legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1, title.position = "top", title.hjust =
0.5))