Showing posts with label R. Show all posts
Showing posts with label R. Show all posts

R - Plot


  • Plot cumulative distribution function (CDF)

 > Y = simlist$V1 # or just use random data Y = rnorm(100) 
 > str(Y) # structure of value Y  
  num [1:229920] 0 0.01837 0 0.00955 0.01 ...  
 > P = ecdf(Y)  
 > plot(P)  
 > P(0.05)  
 [1] 0.993663  

Example - Date


  • Use function

 > d1 <- date()  
 > class(d1)  
 [1] "character"  
 > d1  
 [1] "Tue Dec 23 17:46:34 2014"  
 > d2 <- Sys.Date()  
 > class(d2)  
 [1] "Date"  
 > d2  
 [1] "2014-12-23"  
 > format(d2,"%a %b %d")  
 [1] "Tue Dec 23"  
 > d3 <- "1jan2014"  
 > d4 <- as.Date(d3,"%d%b%Y")  
 > d4  
 [1] "2014-01-01"  
 > d2 - d4  
 Time difference of 356 days  
 > as.numeric(d2-d4)  
 [1] 356  
 > weekdays(d2)  
 [1] "Tuesday"  
 > months(d2)  
 [1] "December"  
 > julian(d2)  # days from origin date (1970-01-01)
 [1] 16427  
 attr(,"origin")  
 [1] "1970-01-01"  

  • Use library

 > library(lubridate)  
 > ymd("20140108")  
 [1] "2014-01-08 UTC"  
 > ymd_hms("2014-01-01 10:10:10")  
 [1] "2014-01-01 10:10:10 UTC"  
 > ymd_hms("2014-01-01 10:10:10", tz = "Pacific/Auckland")  
 [1] "2014-01-01 10:10:10 NZDT"  
 > wday( ymd("20140108"))  
 [1] 4  
 > wday( ymd("20140108"), label = TRUE)  
 [1] Wed  
 Levels: Sun < Mon < Tues < Wed < Thurs < Fri < Sat  

Example - Create directory, download and summarizing data


  • Check the current working directory and create "RDirectory" if it doesn't exist.
1:  > getwd()  
2:  [1] "/Users/guangyuan"  
3:  > file.exists("./Rdirectory")  
4:  [1] FALSE  
5:  > if(!file.exists("./RDirectory")) {  
6:  +   dir.create("./RDirectory")  
7:  + }  
8:  > file.exists("./Rdirectory")  
9:  [1] TRUE  

  • Download file and read
1:  > url <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"  
2:  > download.file(url, destfile = "./RDirectory/restaurants.csv", method = "curl")  
3:  > data <- read.csv("./RDirectory/restaurants.csv")  

  • Summary of the data
1:  > head(data, n=3) # get first 3 lines  
2:  > tail(data, n=3) # get last 3 lines  
3:  > summary(data)  
4:  > str(data) # display the structure of the object  

  • Make table
1:  > table(data$zipCode, useNA = "ifany")  
2:  > table(data$councilDistrict, data$zipCode)  
3:     -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213 21214 21215 21216 21217 21218 21220 21222  
4:   1    0   0  37   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0   7  
5:   2    0   0   0   3  27   0   0   0   0   0   0   0   0   0   0   0   0   0   0  
6:   3    0   0   0   0   0   0   0   0   0   0   0   2  17   0   0   0   3   0   0  
7:   4    0   0   0   0   0   0   0   0   0   0  27   0   0   0   0   0   0   0   0  
8:   5    0   0   0   0   0   3   0   6   0   0   0   0   0  31   0   0   0   0   0  
9:   6    0   0   0   0   0   0   0   1  19   0   0   0   0  15   1   0   0   0   0  

  • Check zipcode
 > table(ifelse(data$zipCode <0 ,TRUE, FALSE))  
 FALSE TRUE   
  1326   1   
 > table(data$zipCode <0)  
 FALSE TRUE   
  1326   1  
  • Categorization
 > data$zipGroups <- cut(data$zipCode, breaks = quantile(data$zipCode))  
 > table(data$zipGroups)  
 (-2.123e+04,2.12e+04] (2.12e+04,2.122e+04] (2.122e+04,2.123e+04] (2.123e+04,2.129e+04]   
          337          375          282          332   
 > table(data$zipGroups,data$zipCode)  
             -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213 21214 21215 21216 21217  
  (-2.123e+04,2.12e+04]   0  136  201   0   0   0   0   0   0   0   0   0   0   0   0   0  
  (2.12e+04,2.122e+04]    0   0   0  27  30   4   1   8  23  41  28  31  17  54  10  32  
 > library(Hmisc)  # use Hmisc package
 > data$zipGroups <- cut2(data$zipCode,g = 4)  
 > table(data$zipGroups)  
 [-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]   
       338      375      300      314   
  • Mutate to create new variable
 library(Hmisc); library(plyr)  
 > data2 <- mutate(data, zipGroups = cut2(zipCode, g=4))  
 > table(data2$zipGroups)  
 [-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]   
       338      375      300      314   
  • NA Values
1:  > sum(is.na(data$councilDistrict))  # count NA values; is.na() returns TRUE (counted as 1) for each NA
2:  [1] 0  
3:  > any(is.na(data$councilDistrict))  # is any value in that column NA?
4:  [1] FALSE  
5:  > all(data$zipCode>0)  # are all zipCode > 0
6:  [1] FALSE  
7:  > colSums(is.na(data))  
8:        name     zipCode  neighborhood councilDistrict policeDistrict   Location.1   
9:         0        0        0        0        0        0   
10:  > all(colSums(is.na(data)) == 0)  
11:  [1] TRUE  
12:  > table(data$zipCode %in% c("21213","21214"))  # zipcode in 21213 or 21214
13:  FALSE TRUE   
14:   1279  48   
15:  > data[data$zipCode %in% c("21213","21214"),]  
16:                     name zipCode           neighborhood councilDistrict policeDistrict  
17:  39              BERMUDA BAR  21213          Broadway East       12    EASTERN  
18:  44    BIG BAD WOLF'S HOUSE OF BARBEQUE  21214 Harford-Echodale/Perring Parkway        3  NORTHEASTERN  
  • Check data size
1:  > object.size(data)  
2:  242752 bytes  
3:  > print(object.size(data), units="Mb")  
4:  0.2 Mb  

  • Use xtabs
 > data(UCBAdmissions)  
 > ucba <- as.data.frame(UCBAdmissions)  
 > summary(ucba)  
    Admit    Gender  Dept    Freq     
  Admitted:12  Male :12  A:4  Min.  : 8.0   
  Rejected:12  Female:12  B:4  1st Qu.: 80.0   
               C:4  Median :170.0   
               D:4  Mean  :188.6   
               E:4  3rd Qu.:302.5   
               F:4  Max.  :512.0   
 > crosstab <- xtabs(Freq ~ Gender + Admit, data = ucba) 
 > crosstab  
     Admit  
 Gender  Admitted Rejected  
  Male    1198   1493  
  Female   557   1278  

  • Use ftable
 > warpbreaks$replicate  
  [1] 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9  
 > summary(warpbreaks)  
    breaks   wool  tension  replicate  
  Min.  :10.00  A:27  L:18  Min.  :1   
  1st Qu.:18.25  B:27  M:18  1st Qu.:3   
  Median :26.00     H:18  Median :5   
  Mean  :28.15         Mean  :5   
  3rd Qu.:34.00         3rd Qu.:7   
  Max.  :70.00         Max.  :9   
 > crosstab <- xtabs(breaks ~ .,data = warpbreaks)  
 > crosstab  
 , , replicate = 1  
   tension  
 wool L M H  
   A 26 18 36  
   B 27 42 20  
 , , replicate = 2  
   tension  
 wool L M H  
   A 30 21 21  
   B 14 26 21  
 , , replicate = 3  
   tension  
 wool L M H  
   A 54 29 24  
   B 29 19 24  
 , , replicate = 4  
   tension  
 wool L M H  
   A 25 17 18  
   B 19 16 17  
 , , replicate = 5  
   tension  
 wool L M H  
   A 70 12 10  
   B 29 39 13  
 , , replicate = 6  
   tension  
 wool L M H  
   A 52 18 43  
   B 31 28 15  
 , , replicate = 7  
   tension  
 wool L M H  
   A 51 35 28  
   B 41 21 15  
 , , replicate = 8  
   tension  
 wool L M H  
   A 26 30 15  
   B 20 39 16  
 , , replicate = 9  
   tension  
 wool L M H  
   A 67 36 26  
   B 44 29 28  
 > ftable(crosstab)  
        replicate 1 2 3 4 5 6 7 8 9  
 wool tension                     
 A  L         26 30 54 25 70 52 51 26 67  
    M         18 21 29 17 12 18 35 30 36  
    H         36 21 24 18 10 43 28 15 26  
 B  L         27 14 29 19 29 31 41 20 44  
    M         42 26 19 16 39 28 21 39 29  
    H         20 21 24 17 13 15 15 16 28  

  • Change class
 > class(data$zipCode)  
 [1] "integer"  
 > data$zipCodeFacter <- as.factor(data$zipCode)  

Install Packages in R

1. Use command

 > install.packages("KernSmooth")  
 trying URL 'http://cran.rstudio.com/bin/macosx/mavericks/contrib/3.1/KernSmooth_2.23-13.tgz'  
 Content type 'application/x-gzip' length 90611 bytes (88 Kb)  
 opened URL  
 ==================================================  
 downloaded 88 Kb  
 The downloaded binary packages are in  
      /var/folders/_s/n4303w0908s0vds7yhl10t8c0000gq/T//RtmpJTnLfh/downloaded_packages  
 > library(KernSmooth)  
 KernSmooth 2.23 loaded  
 Copyright M. P. Wand 1997-2009  

2. Use RStudio menu



R - Sort dataframe with multiple columns


  • An example showing how to sort a data frame by multiple columns (including in descending order).
  •  > data  # if we have data.frame (variable data here)
       b x y z  
     1 Hi A 8 1  
     2 Med D 3 1  
     3 Hi A 9 1  
     4 Low C 9 2    
     > data[order(data[,4]),]  # sort by 4th column, which is z here
       b x y z  
     1 Hi A 8 1  
     2 Med D 3 1  
     3 Hi A 9 1  
     4 Low C 9 2  
     > data[order(-data[,4]),]  # sort by 4th column, with descending order 
       b x y z  
     4 Low C 9 2  
     1 Hi A 8 1  
     2 Med D 3 1  
     3 Hi A 9 1  
     > data[order(-data[,4], data[,3]),]  # sort by 4th column(z), then sort by 3rd column(y)
       b x y z  
     4 Low C 9 2  
     2 Med D 3 1  
     1 Hi A 8 1  
     3 Hi A 9 1  
    

R - Sampling Data


  • sample function
    • sample takes a sample of the specified size from the elements of x using either with or without replacement.
    •  > sample(1:6, 4, replace = TRUE)  
       [1] 5 3 3 2  # duplicated numbers can be generated when replace = TRUE
      
    • sample without duplicated number
    •  > sample(1:20, 10)  
        [1] 16 15 2 20 12 1 13 5 18 11  
      
    • predefined English alphabet variable in R - LETTERS
    •  > LETTERS  
        [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U" "V" "W" "X" "Y" "Z"  
      
    • sample 10 characters from "a" and "b", each with a probability of 50%
    •  > sample(c("a","b"), 10, replace = TRUE, prob = c(0.5, 0.5))  
        [1] "b" "a" "a" "b" "a" "a" "a" "a" "a" "a"  
      
    • if we would like to generate 10 binomial values (0 or 1), we could use rbinom()
    •  > rbinom(n = 10, size = 1, prob = 0.7)  
        [1] 1 0 1 1 0 0 0 1 1 1  
      
    • replicate() evaluates an expression n times; here the results are collected into a matrix (one column per replication)
    •  > replicate(10,rbinom(n = 10, size = 1, prob = 0.7))  
          [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]  
        [1,]  0  1  1  1  0  0  1  1  1   1  
        [2,]  1  1  1  1  1  1  1  1  1   1  
        [3,]  0  1  1  1  1  0  1  1  1   1  
        [4,]  1  0  1  1  0  1  1  1  0   1  
        [5,]  0  0  1  1  1  1  0  1  0   0  
        [6,]  1  0  1  0  1  1  0  1  1   0  
        [7,]  1  0  0  1  1  0  1  1  1   0  
        [8,]  0  1  1  0  1  1  1  1  1   0  
        [9,]  0  0  1  1  1  1  1  1  1   1  
       [10,]  1  0  1  1  1  1  0  0  0   1  
      

R - lapply, sapply, vapply, tapply and mapply

  • lapply
    • loop over a list and evaluate a function on each element
    •  > x <- list(a=c(1:3),b=c(4:6))  
       > lapply(x,sum)  
       $a  
       [1] 6  
       $b  
       [1] 15  
       > x <- 1:3  
       > lapply(x,runif)  # generate vectors with 1, 2 and 3 elements, drawn from the uniform distribution on the interval from min (default: 0) to max (default: 1)
       [[1]]  
       [1] 0.8910183  
       [[2]]  
       [1] 0.6254153 0.2044462  
       [[3]]  
       [1] 0.05463454 0.37686384 0.45033247  
      
  • sapply
    • same as lapply but will try to simplify the result if possible.
    • That means it returns a vector if every element of the result has length 1, a matrix if every element has the same length greater than 1, and a list otherwise.
    •  > x <- list(a=c(1:5),b=c(4:5))  
       > lapply(x,sum)  
       $a  
       [1] 15  
       $b  
       [1] 9  
       > sapply(x,sum)  
        a b   
       15 9   
      
  • vapply
    • use vapply() to state the result format we expect; it raises an error if the result does not match.
    •  > x <- data.frame(sex = c("male","female","male","female"),age = 26:29)  
       > x  
          sex age  
       1  male 26  
       2 female 27  
       3  male 28  
       4 female 29  
       > sapply(x,length)  
       sex age   
        4  4   
       > sapply(x,class)  
          sex    age   
        "factor" "integer"   
       > vapply(x, class, character(1))  # expect character(1) here as result format
          sex    age   
        "factor" "integer" 
      > table(x$sex) # check how many females or males in each group
      female   male 
           2      2   
      
  • tapply
    • apply a function to each group defined by a second variable
    • for example, we want to get the mean value of female and male groups
    •  > tapply(x$age, x$sex, mean)  
       female  male   
         28   27   
      
  • mapply 
    • applies a function in parallel over a set of arguments.
    •  > x <- list(rep(1,4),rep(2,3),rep(3,2),rep(4,1))  
       > x  
       [[1]]  
       [1] 1 1 1 1  
       [[2]]  
       [1] 2 2 2  
       [[3]]  
       [1] 3 3  
       [[4]]  
       [1] 4  
       > y <- mapply(rep, 1:4, 4:1)  # use mapply to achieve the same result more concisely
       > y  
       [[1]]  
       [1] 1 1 1 1  
       [[2]]  
       [1] 2 2 2  
       [[3]]  
       [1] 3 3  
       [[4]]  
       [1] 4  
      

R - Date & Time

  • Date
    • Represented by the Date class, stored internally as number of days since 1970-01-01
    •  > x <- as.Date("2014-09-09")  # create date
       > x  
       [1] "2014-09-09"  
       > unclass(x)  
       [1] 16322  # number of days since 1970-01-01
      
  • Time
    • Represented by the POSIXct or POSIXlt class
    • POSIXct: large integer
    •  > y <- Sys.time()  
       > y <- as.POSIXct(y)  
       > names(unclass(y))  
       NULL  
       > unclass(y)  
       [1] 1411258804  
      
    • POSIXlt: contains a lot of useful information (sec, min, hour, ...)
    •  > y <- as.POSIXlt(y)  
       > names(unclass(y))  
        [1] "sec"  "min"  "hour"  "mday"  "mon"  "year"   
        [7] "wday"  "yday"  "isdst" "zone"  "gmtoff"  
      
    • Create from string
    •  > time <- c("September 21, 2014 1:27")  
       > x <- strptime(time, "%B %d, %Y %H:%M")  
       > x  
       [1] "2014-09-21 01:27:00 BST"  
       > class(x)  
       [1] "POSIXlt" "POSIXt"   
      
    • Check time differences
    •  > y <- Sys.time()  
       > y  
       [1] "2014-09-21 01:34:42 BST"  
       > x - y  
       Time difference of -7.709296 mins  
      

R - Binding values to a symbol


  • Scoping: Binding values to a symbol
    • If we define a function named "mean", how will R resolve it?
    •  > mean <- function(x,y){  
       +   x+y  
       + }  
       > mean(1,4)  # it will search mean from user environment (or global environment) first
       [1] 5  
       > search()  # check search order in R
        [1] ".GlobalEnv"    "tools:rstudio"   "package:stats"   "package:graphics" "package:grDevices"  
        [6] "package:utils"   "package:datasets" "package:methods"  "Autoloads"     "package:base"    
      
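    • Free variable: a variable used in a function that is neither an argument nor defined locally; its value is searched for in the environment in which the function is defined (Lexical Scoping).
    • If it can't be found in that environment, the search continues in the top-level environment (workspace, packages)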
    •  > z <- 11  
       > fun <- function(x,y){  
       +   x+y-z  # z is a free variable here
       + }  
       > fun(12,13)  
       [1] 14  
      
    • Define function inside a function
    •  > make.power <- function(n){  
       +   pow <- function(x){  
       +     x^n  
       +   }  
       +   pow  
       + }  
       > cube <- make.power(3)  
       > cube(2)  
       [1] 8  
       > ls(environment(cube))  # list what is in the function closure (environment)
       [1] "n"  "pow"  
       > get("n",environment(cube))  # get value of n
       [1] 3  
       > get("pow",environment(cube))  # get value of pow
       function(x){  
           x^n  
         }  
       <environment: 0x000000000856e658>  
      
    • Lexical Scoping vs. Dynamic Scoping (with dynamic scoping, values are searched for in the environment from which the function is called)
    •  > a <- 1  
       > f <- function(b){  
       +   a <- 10  
       +   a + b + g(b)  # a is 10 in f() but 1 in g(b), because R uses lexical scoping; a would be 10 in g(b) under dynamic scoping
       + }  
       > g <- function(b){  
       +   a + b  
       + }  
       > f(1)  
       [1] 13  
      

R - Function

  • Function in R
    • is an R Object of class "function"
    • It has "named arguments" that can have default values, e.g. the mean function in R calculates the arithmetic mean and has an na.rm argument with default value "FALSE".
    •  > x <- c(1,2,NA,5)  
       > mean(x)  # can't calculate mean because of the NA value
       [1] NA  
       > mean(x,na.rm = TRUE)  # turn on remove NA value argument
       [1] 2.666667  
       > mean(na.rm = TRUE, x) # the same even change the position of arguments
       [1] 2.666667  
      
    • Define a sum function to return sum of two arguments
    •  > sumfun <- function(x,y){  
       +   return(x+y)  
       + }  
       > sumfun(1,4)  
       [1] 5  
      
    • Lazy Evaluation: arguments are evaluated only when they are needed
    •  > f <- function(x,y){  
       +   x*2  # y is never evaluated; the same as return(x*2)
       + }  
       > f(4)  # return result without error
       [1] 8  
      

R - Control Structure

  • If... else...
    • Example: print msg if your age is under 19
    •  > if(age <19){  
       +   print("you are not adult")  
       + } else {  
       +   print("you are adult")  
       + }  
       [1] "you are not adult"  
      
  • For loop
    • Assigns successive values from a vector to an iterator variable. 
    • Example: loop a vector
    •  > x <- c("a","B","c") 
       > for(i in x){  
       +   print(i)  
       + }  
       [1] "a"  
       [1] "B"  
       [1] "c"  
      
    • Use seq_along() function for looping
    •  > for(i in seq_along(x)){  # seq_along returns integer vector 1:3
       +   print(x[i])  
       + }  
       [1] "a"  
       [1] "B"  
       [1] "c"  
      
    • Loop for matrix using seq_len()
    •  > x <- matrix(1:4,2,2)  
       > for(i in seq_len(nrow(x))){  
       +   for(j in seq_len(ncol(x))){  
       +     print(x[i,j])  
       +   }  
       + }  
       [1] 1  
       [1] 3  
       [1] 2  
       [1] 4  
      # difference between seq_len() and seq_along()
       > nrow(x)  
       [1] 2  
       > seq_len(nrow(x))  
       [1] 1 2  
       > seq_along(nrow(x))  
       [1] 1  
      
  • While loop
    • Loops while the testing condition is true
    •  > while(age <19){  
       +   print("you can't enter the club")  
       +   age <- age+1  
       + }  
       [1] "you can't enter the club"  
       [1] "you can't enter the club"  
      
  • next keyword
    • Skip an iteration of a loop 
    •  > for(i in 1:4){  
       +   if(i<2){  
       +     next  # skip to next iteration
       +   }  
       +   print("Hello")  
       + }  
       [1] "Hello"  
       [1] "Hello"  
       [1] "Hello"  
      

R - Extract subset of Object


  • Get subset from Vector
    • [] returns an object of the same class as the object it is extracted from
    •  > x <- 1:10  
       > x[x>5]  # extract the subset of elements greater than 5
       [1] 6 7 8 9 10  
       > x[2]  
       [1] 2  
       > x[3]  
       [1] 3  
      
    • Get 1,3,5th elements from x
    •  > x[c(1,3,5)]  
       [1] 1 3 5  
      
    • What if we want to get all elements except 1,3,5th elements?
    •  > x[c(-1,-3,-5)]  # or simply use x[-c(1,3,5)]
       [1] 2 4 6 7 8 9 10  
      
  • Get subset from Data Frame
    • Examples
    •  > x  
        var1 var2 var3  
       1  3  10  11  
       2  2  6  15  
       3  4  8  12  
       4  5  7  14  
       5  1  9  13  
       > x[,1] # get first column data  
       [1] 3 2 4 5 1  
       > x[,"var3"] # get var3 column data  
       [1] 11 15 12 14 13  
       > x[2:3,"var3"] # get 2,3rd data in column var3  
       [1] 15 12  
       > x[(x$var2 >8 & x$var3 <15),] # get all columns of the rows where var2 > 8 and var3 < 15  
        var1 var2 var3  
       1  3  10  11  
       5  1  9  13  
      
    • Example for dealing with NA values
    •  > x$var1[c(1,4)] = NA  
       > x  
        var1 var2 var3  
       3  NA  8  12  
       5  1  9  13  
       2  2  6  15  
       4  NA  7  14  
       1  3  10  11  
       > x[x$var1>1,]  # lines with NA will show up as well
          var1 var2 var3  
       NA   NA  NA  NA  
       2    2  6  15  
       NA.1  NA  NA  NA  
       1    3  10  11  
       > x[which(x$var1>1),]  # use which to ignore NA line
        var1 var2 var3  
       2  2  6  15  
       1  3  10  11 
    • Example of sorting
    •  > sort(x$var1)  
       [1] 1 2 3  
       > sort(x$var1, decreasing=TRUE)  
       [1] 3 2 1  
       > sort(x$var1, na.last = TRUE)  
       [1] 1 2 3 NA NA  
      
    • Example of reordering frame
    •  > x[order(x$var2),]  
        var1 var2 var3  
       2  2  6  15  
       4  NA  7  14  
       3  NA  8  12  
       5  1  9  13  
       1  3  10  11  
       > library(plyr)  # use plyr package
       > arrange(x,var3)  
        var1 var2 var3  
       1  3  10  11  
       2  NA  8  12  
       3  1  9  13  
       4  NA  7  14  
       5  2  6  15  
       > arrange(x,desc(var3)  
       + )  
        var1 var2 var3  
       1  2  6  15  
       2  NA  7  14  
       3  1  9  13  
       4  NA  8  12  
       5  3  10  11  
      
    • Example of adding column
    •  > x$var4 <- rnorm(5)  
       > x  
        var1 var2 var3    var4  
       3  NA  8  12 0.01046482  
       5  1  9  13 0.06659688  
       2  2  6  15 0.91059308  
       4  NA  7  14 1.26587778  
       1  3  10  11 -1.46815620  
       > y <- cbind(x, rnorm(5))  
       > y  
        var1 var2 var3    var4  rnorm(5)  
       3  NA  8  12 0.01046482 0.4359661  
       5  1  9  13 0.06659688 -0.7918177  
       2  2  6  15 0.91059308 -0.0485241  
       4  NA  7  14 1.26587778 -0.5299538  
       1  3  10  11 -1.46815620 0.1181559  
      
  • Get subset from Matrix
    • Example for getting one element from matrix and one row from it
    •  > x <- matrix(1:6,2,3)  
       > x  
          [,1] [,2] [,3]  
       [1,]  1  3  5  
       [2,]  2  4  6  
       > x[1,2]  # get element in first row, second column
       [1] 3  
       > x[1,]  # get first row
       [1] 1 3 5  
      
    • Subsetting returns a vector by default; you can get a matrix instead by using the drop = FALSE argument
    •  > x[1,2,drop = FALSE]  
          [,1]  
       [1,]  3  
      
  • Get subset from List
    • Example for getting subset from List
    •  > x <- list(male=c(1:4),female=c(5:10))  # create a list
       > x  # print
       $male  
       [1] 1 2 3 4  
       $female  
       [1] 5 6 7 8 9 10  
       > x[1]  # single bracket return list
       $male  
       [1] 1 2 3 4  
       > x[[1]]  # double bracket return vector
       [1] 1 2 3 4  
       > x$male  # use name to return vector
       [1] 1 2 3 4  
      
    • Use a variable as the index to get a subset
    •  > var <- "male"  
       > x[[var]]  
       [1] 1 2 3 4  
      
  • Deal with NA values
    • Get complete cases over two vectors
    •  > x <- c("male","female",NA,"male")  
       > y <- c("female",NA,NA,"female")  
       > completeflag <- complete.cases(x,y)  # save complete flag TRUE,FALSE,FALSE,TRUE
       > x[completeflag]  
       [1] "male" "male"  
      
    • Remove NA from vector
    •  > x <- c(1:10,NA)  
       > x  
        [1] 1 2 3 4 5 6 7 8 9 10 NA  
       > naflag <- is.na(x)  
       > x[naflag]  
       [1] NA  
       > x[!naflag]  
        [1] 1 2 3 4 5 6 7 8 9 10  
      
  • Sample data from vector(s)
    • Use sample() function to get number of samples from target
    •  > x <- c(1:10)  
       > y <- c("a","b","c","d")  
       > sample(c(x,y),4)  
       [1] "3" "10" "2" "c"   
      
  • How to count the number of NA values?
    • R treats TRUE as 1 and FALSE as 0, so we could check sum of the vector to see the count.
    •  > x <- rep(NA, 10)  
       > y <- c(1:10)  
       > z <- sample(c(x,y),5)  
       > z  
       [1] 9 NA 8 NA NA  
       > naflag <- is.na(z)  
       > naflag  
       [1] FALSE TRUE FALSE TRUE TRUE  
       > sum(naflag)  
       [1] 3  
      

R - Matrix, List, Factor, Data Frame


  • Matrix
    • A matrix is a vector with a dimension attribute (the dimension itself is an integer vector of length 2: nrow and ncol).
    •  > m <- matrix(1:10, nrow = 2, ncol = 5) # create matrix   
       > m # print matrix  
          [,1] [,2] [,3] [,4] [,5]  
       [1,]  1  3  5  7  9  
       [2,]  2  4  6  8  10  
       > dim(m) # get dimension of m  
       [1] 2 5  
       > attributes(m) # get attributes of m  
       $dim  
       [1] 2 5  
      
    • As you can see, a matrix is filled in column-first order (column-wise).
    • Matrix can be created from vector by adding a dimension attribute.
    •  > m <- 1:10  # vector
       > dim(m) <- c(2,5)  # add dimension attribute to vector, m will become Matrix
       > m  
          [,1] [,2] [,3] [,4] [,5]  
       [1,]  1  3  5  7  9  
       [2,]  2  4  6  8  10  
      
    • Matrix can be created from cbind() or rbind() function.
    •  > a <- 1:2  
       > b <- 3:4  
       > c <- 5:6  
       > d <- 7:8  
       > e <- 9:10  
       > cbind(a,b,c,d,e)  # create Matrix with cbind()
          a b c d e  
       [1,] 1 3 5 7 9  
       [2,] 2 4 6 8 10  
       > f <- 1:10  # what if the length of the column is different?
       > cbind(a,b,c,d,e,f)  # create Matrix with cbind(), f is length 10
          a b c d e f  
        [1,] 1 3 5 7 9 1  
        [2,] 2 4 6 8 10 2  
        [3,] 1 3 5 7 9 3  
        [4,] 2 4 6 8 10 4  
        [5,] 1 3 5 7 9 5  
        [6,] 2 4 6 8 10 6  
        [7,] 1 3 5 7 9 7  
        [8,] 2 4 6 8 10 8  
        [9,] 1 3 5 7 9 9  
       [10,] 2 4 6 8 10 10  
    • How to multiply matrices?
    •  > x <- matrix(1:4, 2, 2); y <- matrix(rep(10,4),2,2)  
       > y  
          [,1] [,2]  
       [1,]  10  10  
       [2,]  10  10  
       > x  
          [,1] [,2]  
       [1,]  1  3  
       [2,]  2  4  
       > x * y  # this multiplies element-wise 
          [,1] [,2]  
       [1,]  10  30  
       [2,]  20  40  
       > x %*% y  # matrix multiplication
          [,1] [,2]  
       [1,]  40  40  
       [2,]  60  60  
      
  • List
    • A special type of vector that can contain objects of different classes
    •  > x <- list(TRUE, "list", 1L)  
       > x  
       [[1]]  
       [1] TRUE  
       [[2]]  
       [1] "list"  
       [[3]]  
       [1] 1  
  • Factor
    • used for categorical data 
    •  > x <- factor(c("male","male","female","female"))  # create factor
       > x  # print x
       [1] male  male  female female  
       Levels: female male  
       > table(x)  # call table() to show how many items there are for each level
       x  
       female  male   
          2   2   
      
    • Factor levels have an order; you can set the order when creating the factor with the levels argument of factor()
    •  > x <- factor(c("male","male","female","female"), levels = c("male","female"))  
       > x  
       [1] male  male  female female  
       Levels: male female  # order has been changed from example above
  • Data Frame
    • for storing tabular data, can have different classes of objects in each column.
    •  > x <- data.frame(sex = c("male","female", "male", "male", "female"), age = 26:30)  
       > x  
          sex age  
       1  male 26  
       2 female 27  
       3  male 28  
       4  male 29  
       5 female 30  
      
    • Use nrow() and ncol() function to get the no. of rows and columns
    •  > nrow(x)  
       [1] 5  
       > ncol(x)  
       [1] 2  
      
    • How to change column names in data frame? -> use colnames()
    •  > my_data  
        patients X1 X2 X3 X4 X5  
       1   Bill 1 5 9 13 17  
       2   Gina 2 6 10 14 18  
       3  Kelly 3 7 11 15 19  
       4   Sean 4 8 12 16 20  
       > cnames <- c("patient","age","weight","bp","rating","test")  
       > colnames(my_data) <- cnames  
       > my_data  
        patient age weight bp rating test  
       1  Bill  1   5 9   13  17  
       2  Gina  2   6 10   14  18  
       3  Kelly  3   7 11   15  19  
       4  Sean  4   8 12   16  20  
      
    • Check how much memory the dataset is occupying.
    •  > object.size(x)  
       1312 bytes  
      
    • If the data frame is big and has thousands of rows, you can check the first several rows using head(). (Use tail() for the last n rows)
    •  > head(x)  # head() function returns first 6 rows
          sex age  
       1  male 26  
       2 female 27  
       3  male 28  
       4  male 29  
       5 female 30  
       > head(x,3)  # specify how many rows you want to see
          sex age  
       1  male 26  
       2 female 27  
       3  male 28  
      
  • Sequence
    • There are several ways to create sequence
    •  > 1:20  
        [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20  
       > seq(1,20)  # seq() does exactly the same thing as the : operator
        [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20  
       > seq(1,20,by=0.5)  # set by argument so that increase by 0.5 each time
        [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0  
       [16] 8.5 9.0 9.5 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 15.5  
       [31] 16.0 16.5 17.0 17.5 18.0 18.5 19.0 19.5 20.0  
      
       > x <- seq(1,20,length = 5)  # create with length 5, (values between 1~20)
       > length(x)  
       [1] 5  
       > 1:length(x)  # use length to create vector 
       [1] 1 2 3 4 5   
       > seq_along(x)  # the same as 1:length(x)
       [1] 1 2 3 4 5   
       > rep(1, times = 10)  # rep replicate value 1 for 10 times
        [1] 1 1 1 1 1 1 1 1 1 1  
       > rep(c(0,1,2), times = 5)  # replicate 0,1,2 for 5 times
        [1] 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2  
       > rep(c(0,1,2), each = 5)  # replicate 0 for 5 times and 1 for 5 times...
        [1] 0 0 0 0 0 1 1 1 1 1 2 2 2 2 2  
      
  • Change class
 > yesno <- sample(c("yes","no"), size = 10, replace = TRUE)  
 > class(yesno)  
 [1] "character"  
 > yesnoFactor <- as.factor(yesno)  
 > yesnoFactor  
  [1] yes no yes no yes yes yes no no no   
 > yesnoFactor <- factor(yesno, levels = c("yes","no"))  
 > yesnoFactor  
  [1] yes no yes no yes yes yes no no no   
 Levels: yes no  
 > as.numeric(yesnoFactor)  
  [1] 1 2 1 2 1 1 1 2 2 2  

R - Hello World (Data Types and Vector)

  • Start : Install R and R Studio 

  • Five basic Classes of Objects(data types) in R
    • character
    • numeric (if you give 1 for variable, it will give numeric Object)
    • integer (you could give variable x with 1L for giving integer Object)
    • complex
    • logical (TRUE or FALSE)
  • Inf and NaN
    • Inf means infinite, for example, 1/0 will give you Inf
    • NaN means Not a Number, for example, 0/0 will give you NaN
  • Comment
    • # indicates comment
    •   x <- 100 # this is comment  
      
  • "Hello World"
    • Give variable x "Hello World" and print it to the console
    •  > x <- "Hello World"  # <- symbol is the assignment operator
       > print(x)  
       [1] "Hello World"  
      
  • Vector : A basic Object that contains Objects of the same Class
    • In "Hello World" example, x is a Vector and the first element is "Hello World"
    • Create Vector using c() function & vector() function
    •  > x <- c(0.5, 0.6)  
       > x  # print x
       [1] 0.5 0.6  
       > x <- vector("numeric", length = 10)  
       > x  # print x
        [1] 0 0 0 0 0 0 0 0 0 0  
      
    • What if we create vector with different Objects? A: Coercion will occur
    •  > y <- c(1.7, "a")  
       > y  # 1.7 will be changed to character "1.7"
       [1] "1.7" "a"   
      
    • Connect string with paste() function
    •  > a <- c("Blog","address","is")  
       > a  
       [1] "Blog"  "address" "is"     
       > paste(a,collapse=" ")  
       [1] "Blog address is"  
       > b <- c(a,"parklize.blogspot.com")  
       > b  
       [1] "Blog"         "address"        "is"            
       [4] "parklize.blogspot.com"  
       > paste(b,collapse=" ")  
       [1] "Blog address is parklize.blogspot.com"  
       > c <- paste("Blog","address","is","parklize.blogspot.com",sep=" ")  
       > c  
       [1] "Blog address is parklize.blogspot.com"  
      
    • Check whether two vectors are identical or not
    •  > x <- c(1:5)  
       > y <- 1:5  
       > identical(x,y)  
       [1] TRUE  
       > names(x) <- c("a","b","c","d","e")  
       > identical(x,y)  
       [1] FALSE  
      
  • Explicit Coercion
    • Object could be coerced from one class to another class with as.* function
    •  > y <- 1.7  
       > class(y)  
       [1] "numeric"  
       > y <- as.character(y)  # explicitly coerce to character Object
       > class(y)  
       [1] "character"  
      
  • Get help for function
    • str(<function name>) prints brief information about a function, and ?<function name> opens its help page with more details
    •  > str(matrix)  # check matrix() function in brief
       function (data = NA, nrow = 1, ncol = 1, byrow = FALSE, dimnames = NULL)   
       > ?matrix  # get help page on the matrix() function
  • Get help for operator
    • If you want to know what the operator ":" does...use ?':' and it will open the help page.
    •  > ?':'  
       starting httpd help server ... done  
      
    • List variables in your workspace
    •  > x <- 1:4  
       > ls()  
       [1] "x"