Guangyuan's Research and Development Blog: R

Showing posts with label R. Show all posts

R - Plot

Plot cumulative distribution function (CDF)

 > Y = simlist$V1 # or just use random data Y = rnorm(100) 
 > str(Y) # structure of value Y  
  num [1:229920] 0 0.01837 0 0.00955 0.01 ...  
 > P = ecdf(Y)  
 > plot(P)  
 > P(0.05)  
 [1] 0.993663

Example - Date

Use function

 > d1 <- date()  
 > class(d1)  
 [1] "character"  
 > d1  
 [1] "Tue Dec 23 17:46:34 2014"  
 > d2 <- Sys.Date()  
 > class(d2)  
 [1] "Date"  
 > d2  
 [1] "2014-12-23"  
 > format(d2,"%a %b %d")  
 [1] "Tue Dec 23"  
 > d3 <- "1jan2014"  
 > d4 <- as.Date(d3,"%d%b%Y")  
 > d4  
 [1] "2014-01-01"  
 > d2 - d4  
 Time difference of 356 days  
 > as.numeric(d2-d4)  
 [1] 356  
 > weekdays(d2)  
 [1] "Tuesday"  
 > months(d2)  
 [1] "December"  
 > julian(d2)  # days from origin date (1970-01-01)
 [1] 16427  
 attr(,"origin")  
 [1] "1970-01-01"

Use library

 > library(lubridate)  
 > ymd("20140108")  
 [1] "2014-01-08 UTC"  
 > ymd_hms("2014-01-01 10:10:10")  
 [1] "2014-01-01 10:10:10 UTC"  
 > ymd_hms("2014-01-01 10:10:10", tz = "Pacific/Auckland")  
 [1] "2014-01-01 10:10:10 NZDT"  
 > wday( ymd("20140108"))  
 [1] 4  
 > wday( ymd("20140108"), label = TRUE)  
 [1] Wed  
 Levels: Sun < Mon < Tues < Wed < Thurs < Fri < Sat

Example - Create directory, download and summarizing data

Check the current working directory and create "RDirectory" if it doesn't exist.

1:  > getwd()  
2:  [1] "/Users/guangyuan"  
3:  > file.exists("./Rdirectory")  
4:  [1] FALSE  
5:  > if(!file.exists("./RDirectory")) {  
6:  +   dir.create("./RDirectory")  
7:  + }  
8:  > file.exists("./Rdirectory")  
9:  [1] TRUE

Download file and read

1:  > url <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"  
2:  > download.file(url, destfile = "./RDirectory/restaurants.csv", method = "curl")  
3:  > data <- read.csv("./RDirectory/restaurants.csv")

Summary of the data

1:  > head(data, n=3) # get first 3 lines  
2:  > tail(data, n=3) # get last 3 lines  
3:  > summary(data)  
4:  > str(data) # display the structure of the object

Make table

1:  > table(data$zipCode, useNA = "ifany")  
2:  > table(data$councilDistrict, data$zipCode)  
3:     -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213 21214 21215 21216 21217 21218 21220 21222  
4:   1    0   0  37   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0   7  
5:   2    0   0   0   3  27   0   0   0   0   0   0   0   0   0   0   0   0   0   0  
6:   3    0   0   0   0   0   0   0   0   0   0   0   2  17   0   0   0   3   0   0  
7:   4    0   0   0   0   0   0   0   0   0   0  27   0   0   0   0   0   0   0   0  
8:   5    0   0   0   0   0   3   0   6   0   0   0   0   0  31   0   0   0   0   0  
9:   6    0   0   0   0   0   0   0   1  19   0   0   0   0  15   1   0   0   0   0

Check zipcode

 > table(ifelse(data$zipCode <0 ,TRUE, FALSE))  
 FALSE TRUE   
  1326   1   
 > table(data$zipCode <0)  
 FALSE TRUE   
  1326   1

Categorization

 > data$zipGroups <- cut(data$zipCode, breaks = quantile(data$zipCode))  
 > table(data$zipGroups)  
 (-2.123e+04,2.12e+04] (2.12e+04,2.122e+04] (2.122e+04,2.123e+04] (2.123e+04,2.129e+04]   
          337          375          282          332   
 > table(data$zipGroups,data$zipCode)  
             -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213 21214 21215 21216 21217  
  (-2.123e+04,2.12e+04]   0  136  201   0   0   0   0   0   0   0   0   0   0   0   0   0  
  (2.12e+04,2.122e+04]    0   0   0  27  30   4   1   8  23  41  28  31  17  54  10  32

 > library(Hmisc)  # use Hmisc package
 > data$zipGroups <- cut2(data$zipCode,g = 4)  
 > table(data$zipGroups)  
 [-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]   
       338      375      300      314

Mutate to create new variable

 library(Hmisc); library(plyr)  
 > data2 <- mutate(data, zipGroups = cut2(zipCode, g=4))  
 > table(data2$zipGroups)  
 [-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]   
       338      375      300      314

NA Values

1:  > sum(is.na(data$councilDistrict))  # get NA count, is.na() return 1 if it is NA
2:  [1] 0  
3:  > any(is.na(data$councilDistrict))  # is any value in that column NA?
4:  [1] FALSE  
5:  > all(data$zipCode>0)  # are all zipCode > 0
6:  [1] FALSE  
7:  > colSums(is.na(data))  
8:        name     zipCode  neighborhood councilDistrict policeDistrict   Location.1   
9:         0        0        0        0        0        0   
10:  > all(colSums(is.na(data)==0))  
11:  [1] TRUE  
12:  > table(data$zipCode %in% c("21213","21214"))  # zipcode in 21213 or 21214
13:  FALSE TRUE   
14:   1279  48   
15:  > data[data$zipCode %in% c("21213","21214"),]  
16:                     name zipCode           neighborhood councilDistrict policeDistrict  
17:  39              BERMUDA BAR  21213          Broadway East       12    EASTERN  
18:  44    BIG BAD WOLF'S HOUSE OF BARBEQUE  21214 Harford-Echodale/Perring Parkway        3  NORTHEASTERN

Check data size

1:  > object.size(data)  
2:  242752 bytes  
3:  > print(object.size(data), units="Mb")  
4:  0.2 Mb

Use xtabs

 > data(UCBAdmissions)  
 > ucba <- as.data.frame(UCBAdmissions)  
 > summary(ucba)  
    Admit    Gender  Dept    Freq     
  Admitted:12  Male :12  A:4  Min.  : 8.0   
  Rejected:12  Female:12  B:4  1st Qu.: 80.0   
               C:4  Median :170.0   
               D:4  Mean  :188.6   
               E:4  3rd Qu.:302.5   
               F:4  Max.  :512.0   
 > crosstab <- xtabs(Freq ~ Gender + Admit, data = ucba) 
 > crosstab  
     Admit  
 Gender  Admitted Rejected  
  Male    1198   1493  
  Female   557   1278

Use ftable

> warpbreaks$replicate [1] 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 > summary(warpbreaks) breaks wool tension replicate Min. :10.00 A:27 L:18 Min. :1 1st Qu.:18.25 B:27 M:18 1st Qu.:3 Median :26.00 H:18 Median :5 Mean :28.15 Mean :5 3rd Qu.:34.00 3rd Qu.:7 Max. :70.00 Max. :9 > crosstab <- xtabs(breaks ~ .,data = warpbreaks) > crosstab , , replicate = 1 tension wool L M H A 26 18 36 B 27 42 20 , , replicate = 2 tension wool L M H A 30 21 21 B 14 26 21 , , replicate = 3 tension wool L M H A 54 29 24 B 29 19 24 , , replicate = 4 tension wool L M H A 25 17 18 B 19 16 17 , , replicate = 5 tension wool L M H A 70 12 10 B 29 39 13 , , replicate = 6 tension wool L M H A 52 18 43 B 31 28 15 , , replicate = 7 tension wool L M H A 51 35 28 B 41 21 15 , , replicate = 8 tension wool L M H A 26 30 15 B 20 39 16 , , replicate = 9 tension wool L M H A 67 36 26 B 44 29 28 > ftable(crosstab)replicate 1 2 3 4 5 6 7 8 9 wool tension A L 26 30 54 25 70 52 51 26 67 M 18 21 29 17 12 18 35 30 36 H 36 21 24 18 10 43 28 15 26 B L 27 14 29 19 29 31 41 20 44 M 42 26 19 16 39 28 21 39 29 H 20 21 24 17 13 15 15 16 28

Change class

> class(data$zipCode) [1] "integer" > data$zipCodeFacter <- as.factor(data$zipCode)

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

Install Packages in R

1. Use command

> install.packages("KernSmooth") trying URL 'http://cran.rstudio.com/bin/macosx/mavericks/contrib/3.1/KernSmooth_2.23-13.tgz' Content type 'application/x-gzip' length 90611 bytes (88 Kb) opened URL ================================================== downloaded 88 Kb The downloaded binary packages are in /var/folders/_s/n4303w0908s0vds7yhl10t8c0000gq/T//RtmpJTnLfh/downloaded_packages > library(KernSmooth) KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009

2. Use RStudio menu

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

R - Sort dataframe with multiple columns

An example to show how to sort (including descending order) dataframe with multiple column values.

> data # if we have data.frame (variable data here)b x y z 1 Hi A 8 1 2 Med D 3 1 3 Hi A 9 1 4 Low C 9 2 > data[order(data[,4]),] #sort by 4th column, which is z hereb x y z 1 Hi A 8 1 2 Med D 3 1 3 Hi A 9 1 4 Low C 9 2 > data[order(-data[,4]),] #sort by 4th column, with descending order b x y z 4 Low C 9 2 1 Hi A 8 1 2 Med D 3 1 3 Hi A 9 1 > data[order(-data[,4], data[,3]),] #sort by 4th column(z), then sort by 3rd column(y)b x y z 4 Low C 9 2 2 Med D 3 1 1 Hi A 8 1 3 Hi A 9 1

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

R - Sampling Data

sample function

sample takes a sample of the specified size from the elements of x, either with or without replacement.

> sample(1:6, 4, replace = TRUE) [1] 5 3 3 2 #duplicated number could be generated with replace option

sample without duplicated number

> sample(1:20, 10) [1] 16 15 2 20 12 1 13 5 18 11

predefined English alphabet variable in R - LETTERS

> LETTERS [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U" "V" "W" "X" "Y" "Z"

sample 10 characters with "a" and "b" with probability of 50% respectively

> sample(c("a","b"), 10, replace = TRUE, prob = c(0.5, 0.5)) [1] "b" "a" "a" "b" "a" "a" "a" "a" "a" "a"

If we would like to generate 10 numbers of binomial data (0 or 1), we could use rbinom()

> rbinom(n = 10, size = 1, prob = 0.7) [1] 1 0 1 1 0 0 0 1 1 1

replicate() creates matrix, (replicate the function for n times)

> replicate(10,rbinom(n = 10, size = 1, prob = 0.7)) [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [1,] 0 1 1 1 0 0 1 1 1 1 [2,] 1 1 1 1 1 1 1 1 1 1 [3,] 0 1 1 1 1 0 1 1 1 1 [4,] 1 0 1 1 0 1 1 1 0 1 [5,] 0 0 1 1 1 1 0 1 0 0 [6,] 1 0 1 0 1 1 0 1 1 0 [7,] 1 0 0 1 1 0 1 1 1 0 [8,] 0 1 1 0 1 1 1 1 1 0 [9,] 0 0 1 1 1 1 1 1 1 1 [10,] 1 0 1 1 1 1 0 0 0 1

Back to the list

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

R - lapply, sapply, vapply, tapply and mapply

lapply

loop over a list and evaluate a function on each element

> x <- list(a=c(1:3),b=c(4:6)) > lapply(x,sum) $a [1] 6 $b [1] 15 > x <- 1:3 > lapply(x,runif) # generate vector with number of elements with 1,2,3, uniform distribution on the interval from min(defalt:0) to max(default:1). [[1]] [1] 0.8910183 [[2]] [1] 0.6254153 0.2044462 [[3]] [1] 0.05463454 0.37686384 0.45033247

sapply

same as lapply but will try to simplify the result if possible.

That is, it returns a vector if every element of the result has length 1, a matrix if every element has the same length greater than 1, and a list otherwise.

> x <- list(a=c(1:5),b=c(4:5)) > lapply(x,sum) $a [1] 15 $b [1] 9 > sapply(x,sum) a b 15 9

vapply

Use vapply() to specify the result format we expect; it raises an error if a result does not match.

> x <- data.frame(sex = c("male","female","male","female"),age = 26:29) > x sex age 1 male 26 2 female 27 3 male 28 4 female 29 > sapply(x,length) sex age 4 4 > sapply(x,class) sex age "factor" "integer" > vapply(x, class, character(1))# expect character(1) here as result format sex age "factor" "integer"

> table(x$sex) # check how many females or males in each groupfemale male 2 2

tapply

Apply a function to each group of values defined by a second (grouping) variable

for example, we want to get the mean value of female and male groups

> tapply(x$age, x$sex, mean) female male 28 27

mapply

applies a function in parallel over a set of arguments.

> x <- list(rep(1,4),rep(2,3),rep(3,2),rep(4,1)) > x [[1]] [1] 1 1 1 1 [[2]] [1] 2 2 2 [[3]] [1] 3 3 [[4]] [1] 4 > y <- mapply(rep, 1:4, 4:1) # use mapply shortly achieve the same goal > y [[1]] [1] 1 1 1 1 [[2]] [1] 2 2 2 [[3]] [1] 3 3 [[4]] [1] 4

Back to the list

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

R - Date & Time

Date

Represented by the Date class, stored internally as number of days since 1970-01-01

> x <- as.Date("2014-09-09") # create date > x [1] "2014-09-09" > unclass(x) [1] 16322 # number of days since 1970-01-01

Time

Represented by the POSIXct or POSIXlt class

POSIXct: large integer

> y <- Sys.time() > y <- as.POSIXct(y) > names(unclass(y)) NULL > unclass(y) [1] 1411258804

POSIXlt: a list with many useful pieces of information

> y <- as.POSIXlt(y) > names(unclass(y)) [1] "sec" "min" "hour" "mday" "mon" "year" [7] "wday" "yday" "isdst" "zone" "gmtoff"

Create from string

> time <- c("September 21, 2014 1:27") > x <- strptime(time, "%B %d, %Y %H:%M") > x [1] "2014-09-21 01:27:00 BST" > class(x) [1] "POSIXlt" "POSIXt"

Check time differences

> y <- Sys.time() > y [1] "2014-09-21 01:34:42 BST" > x - y Time difference of -7.709296 mins

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

R - Binding values to a symbol

Scoping: Binding values to a symbol

If we define a function named "mean", how will R recognize it?

> mean <- function(x,y){ + x+y + } > mean(1,4) # it will search mean from user environment (or global environment) first [1] 5 > search() # check search order in R [1] ".GlobalEnv" "tools:rstudio" "package:stats" "package:graphics" "package:grDevices" [6] "package:utils" "package:datasets" "package:methods" "Autoloads" "package:base"

A free variable is looked up in the environment in which the function was defined (Lexical Scoping).

If it can't be found in that environment, R continues the search through the enclosing environments up to the top-level environment (workspace, then packages)

> z <- 11 > fun <- function(x,y){ + x+y-z # z is a free variable here + } > fun(12,13) [1] 14

Define function inside a function

> make.power <- function(n){ + pow <- function(x){ + x^n + } + pow + } > cube <- make.power(3) > cube(2) [1] 8 > ls(environment(cube)) #list what in function closure (environment)[1] "n" "pow" > get("n",environment(cube)) #get value of n[1] 3 > get("pow",environment(cube)) #get value of powfunction(x){ x^n } <environment: 0x000000000856e658>

Lexical Scoping vs. Dynamic Scoping (under dynamic scoping, free variables are searched for in the environment from which the function is called)

> a <- 1 > f <- function(b){ + a <- 10 + a + b + g(b) # a is 10 in f() and 1 in g(b) since R is using lexical scoping, a is 10 in g(b) if it is Dynamic Scoping + }> g <- function(b){ + a + b + } > f(1) [1] 13

Back to the list

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

R - Function

Function in R

is an R Object of class "function"

It has "named arguments", which can have default values; e.g. the mean function in R calculates the arithmetic mean and has an na.rm argument with default value "FALSE".

> x <- c(1,2,NA,5) > mean(x) # can't calculate mean because of the NA value [1] NA > mean(x,na.rm = TRUE) # turn on remove NA value argument [1] 2.666667 > mean(na.rm = TRUE, x) # the same even change the position of arguments [1] 2.666667

Define a sum function to return sum of two arguments

> sumfun <- function(x,y){ + return(x+y) + } > sumfun(1,4) [1] 5

Lazy Evaluation: function arguments are evaluated only when they are needed

> f <- function(x,y){ + x*2 # no evaluation for y, the same as return(x*2) + } > f(4) # return result without error [1] 8

Back to the list

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

R - Control Structure

If... else...

Example: print msg if your age is under 19

> if(age <19){ + print("you are not adult") + } else { + print("you are adult") + } [1] "you are not adult"

For loop

Assigns successive values from a vector to an iterator variable.

Example: loop a vector

> x <- c("a","B","c") > for(i in x){ + print(i) + } [1] "a" [1] "B" [1] "c"

Use seq_along() function for looping

> for(i in seq_along(x)){ # seq_along returns integer vector 1:3 + print(x[i]) + } [1] "a" [1] "B" [1] "c"

Loop for matrix using seq_len()

> x <- matrix(1:4,2,2) > for(i in seq_len(nrow(x))){ + for(j in seq_len(ncol(x))){ + print(x[i,j]) + } + } [1] 1 [1] 3 [1] 2 [1] 4

# difference between seq_len() and seq_along() > nrow(x) [1] 2 > seq_len(nrow(x)) [1] 1 2 > seq_along(nrow(x)) [1] 1

While loop

Loops while the testing condition is true

> while(age <19){ + print("you can't enter the club") + age <- age+1 + } [1] "you can't enter the club" [1] "you can't enter the club"

next keyword

Skip an iteration of a loop

> for(i in 1:4){ + if(i<2){ + next # skip to next iteration + } + print("Hello") + } [1] "Hello" [1] "Hello" [1] "Hello"

Back to the list

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

R - Extract subset of Object

Get subset from Vector

[] returns an object of the same class as the original object

> x <- 1:10 > x[x>5] # extract subset which includes [1] 6 7 8 9 10 > x[2] [1] 2 > x[3] [1] 3

Get 1,3,5th elements from x

> x[c(1,3,5)] [1] 1 3 5

What if we want to get all elements except 1,3,5th elements?

> x[c(-1,-3,-5)] # or simply use x[-c(1,3,5)][1] 2 4 6 7 8 9 10

Get subset from Frame

Examples

> x var1 var2 var3 1 3 10 11 2 2 6 15 3 4 8 12 4 5 7 14 5 1 9 13 > x[,1] # get first column data [1] 3 2 4 5 1 > x[,"var3"] # get var3 column data [1] 11 15 12 14 13 > x[2:3,"var3"] # get 2,3rd data in column var3 [1] 15 12 > x[(x$var2 >8 & x$var3 <15),] # get all column data, with 2nd data in column var2 >8 and 3rd in column var3 <15 var1 var2 var3 1 3 10 11 5 1 9 13

Example for dealing with NA values

> x$var1[c(1,4)] = NA > x var1 var2 var3 3 NA 8 12 5 1 9 13 2 2 6 15 4 NA 7 14 1 3 10 11 > x[x$var1>1,] # lines with NA will show up as well var1 var2 var3 NA NA NA NA 2 2 6 15 NA.1 NA NA NA 1 3 10 11 > x[which(x$var1>1),] # use which to ignore NA line var1 var2 var3 2 2 6 15 1 3 10 11

Example of sorting

> sort(x$var1) [1] 1 2 3 > sort(x$var1, decreasing=TRUE) [1] 3 2 1 > sort(x$var1, na.last = TRUE) [1] 1 2 3 NA NA

Example of reordering frame

> x[order(x$var2),] var1 var2 var3 2 2 6 15 4 NA 7 14 3 NA 8 12 5 1 9 13 1 3 10 11 > library(plyr) # use plyr package> arrange(x,var3) var1 var2 var3 1 3 10 11 2 NA 8 12 3 1 9 13 4 NA 7 14 5 2 6 15 > arrange(x,desc(var3) + ) var1 var2 var3 1 2 6 15 2 NA 7 14 3 1 9 13 4 NA 8 12 5 3 10 11

Example of adding column

> x$var4 <- rnorm(5) > x var1 var2 var3 var4 3 NA 8 12 0.01046482 5 1 9 13 0.06659688 2 2 6 15 0.91059308 4 NA 7 14 1.26587778 1 3 10 11 -1.46815620 > y <- cbind(x, rnorm(5)) > y var1 var2 var3 var4 rnorm(5) 3 NA 8 12 0.01046482 0.4359661 5 1 9 13 0.06659688 -0.7918177 2 2 6 15 0.91059308 -0.0485241 4 NA 7 14 1.26587778 -0.5299538 1 3 10 11 -1.46815620 0.1181559

Get subset from Matrix

Example for getting one element from matrix and one row from it

> x <- matrix(1:6,2,3) > x [,1] [,2] [,3] [1,] 1 3 5 [2,] 2 4 6 > x[1,2] # get element in first row, second column [1] 3 > x[1,] # get first row [1] 1 3 5

It will get vector by default, you could also get matrix by using drop argument

> x[1,2,drop = FALSE] [,1] [1,] 3

Get subset from List

Example for getting subset from List

> x <- list(male=c(1:4),female=c(5:10)) # create a list > x # print $male [1] 1 2 3 4 $female [1] 5 6 7 8 9 10 > x[1] # single bracket return list $male [1] 1 2 3 4 > x[[1]] # double bracket return vector [1] 1 2 3 4 > x$male # use name to return vector [1] 1 2 3 4

Use a variable as the index to get a subset

> var <- "male" > x[[var]] [1] 1 2 3 4

Deal with NA values

Get complete cases over two vectors

> x <- c("male","female",NA,"male") > y <- c("female",NA,NA,"female") > completeflag <- complete.cases(x,y) # save complete flag TRUE,FALSE,FALSE,TRUE > x[completeflag] [1] "male" "male"

Remove NA from vector

> x <- c(1:10,NA) > x [1] 1 2 3 4 5 6 7 8 9 10 NA > naflag <- is.na(x) > x[naflag] [1] NA > x[!naflag] [1] 1 2 3 4 5 6 7 8 9 10

Sample data from vector(s)

Use sample() function to get number of samples from target

> x <- c(1:10) > y <- c("a","b","c","d") > sample(c(x,y),4) [1] "3" "10" "2" "c"

How to count the number of NA values?

R treats TRUE as 1 and FALSE as 0, so we could check sum of the vector to see the count.

> x <- rep(NA, 10) > y <- c(1:10) > z <- sample(c(x,y),5) > z [1] 9 NA 8 NA NA > naflag <- is.na(z) > naflag [1] FALSE TRUE FALSE TRUE TRUE > sum(naflag) [1] 3

Back to the list

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

R - Matrix, List, Factor, Data Frame

Matrix

A matrix is a vector with a dimension attribute (the dimension itself is an integer vector of length 2: nrow and ncol).

> m <- matrix(1:10, nrow = 2, ncol = 5) # create matrix > m # print matrix [,1] [,2] [,3] [,4] [,5] [1,] 1 3 5 7 9 [2,] 2 4 6 8 10 > dim(m) # get dimension of m [1] 2 5 > attributes(m) # get attributes of m $dim [1] 2 5

As you can see, the matrix is filled in column-first order (column-wise).

Matrix can be created from vector by adding a dimension attribute.

> m <- 1:10 # vector > dim(m) <- c(2,5) # add dimension attribute to vector, m will become Matrix > m [,1] [,2] [,3] [,4] [,5] [1,] 1 3 5 7 9 [2,] 2 4 6 8 10

Matrix can be created from cbind() or rbind() function.

> a <- 1:2 > b <- 3:4 > c <- 5:6 > d <- 7:8 > e <- 9:10 > cbind(a,b,c,d,e) # create Matrix with cbind() a b c d e [1,] 1 3 5 7 9 [2,] 2 4 6 8 10 > f <- 1:10 # what if the length of the column is different? > cbind(a,b,c,d,e,f) # create Matrix with cbind(), f is length 10 a b c d e f [1,] 1 3 5 7 9 1 [2,] 2 4 6 8 10 2 [3,] 1 3 5 7 9 3 [4,] 2 4 6 8 10 4 [5,] 1 3 5 7 9 5 [6,] 2 4 6 8 10 6 [7,] 1 3 5 7 9 7 [8,] 2 4 6 8 10 8 [9,] 1 3 5 7 9 9 [10,] 2 4 6 8 10 10

How to multiply matrices?

> x <- matrix(1:4, 2, 2); y <- matrix(rep(10,4),2,2) > y [,1] [,2] [1,] 10 10 [2,] 10 10 > x [,1] [,2] [1,] 1 3 [2,] 2 4 > x * y # it just do multiple in element-wise [,1] [,2] [1,] 10 30 [2,] 20 40 > x %*% y # matrix multiplication[,1] [,2] [1,] 40 40 [2,] 60 60

List

A special type of vector that can contain objects of different classes

> x <- list(TRUE, "list", 1L) > x [[1]] [1] TRUE [[2]] [1] "list" [[3]] [1] 1

Factor

used for categorical data

> x <- factor(c("male","male","female","female")) # create factor > x # print x [1] male male female female Levels: female male > table(x) #call table() to show how many items have in each labelsx female male 2 2

Factor levels have an order; you can set the order when creating the factor with factor()

> x <- factor(c("male","male","female","female"), levels = c("male","female")) > x [1] male male female female Levels: male female # order has been changed from example above

Data Frame

for storing tabular data, can have different classes of objects in each column.

> x <- data.frame(sex = c("male","female", "male", "male", "female"), age = 26:30) > x sex age 1 male 26 2 female 27 3 male 28 4 male 29 5 female 30

Use nrow() and ncol() function to get the no. of rows and columns

> nrow(x) [1] 5 > ncol(x) [1] 2

How to change column names in data frame? -> use colnames()

> my_data patients X1 X2 X3 X4 X5 1 Bill 1 5 9 13 17 2 Gina 2 6 10 14 18 3 Kelly 3 7 11 15 19 4 Sean 4 8 12 16 20 > cnames <- c("patient","age","weight","bp","rating","test") >colnames(my_data) <- cnames > my_data patient age weight bp rating test 1 Bill 1 5 9 13 17 2 Gina 2 6 10 14 18 3 Kelly 3 7 11 15 19 4 Sean 4 8 12 16 20

Check how much memory the dataset is occupying.

> object.size(x) 1312 bytes

Suppose the data frame is pretty big and has thousands of rows; you could check the first several rows using head(). (Use tail() for the last n rows)

> head(x) # head() function returns first 6 rowssex age 1 male 26 2 female 27 3 male 28 4 male 29 5 female 30 > head(x,3) #specify how many rows you want to seesex age 1 male 26 2 female 27 3 male 28

Sequence

There are several ways to create sequence

> 1:20 [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 > seq(1,20) #seq() does exactly the same thing : does[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 > seq(1,20,by=0.5) # set by argument so that increase by 0.5 each time[1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0 [16] 8.5 9.0 9.5 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 15.5 [31] 16.0 16.5 17.0 17.5 18.0 18.5 19.0 19.5 20.0

> x <- seq(1,20,length = 5) # create with length 5, (values between 1~20) > length(x) [1] 5 > 1:length(x) # use length to create vector [1] 1 2 3 4 5 > seq_along(x) # the same as 1:length(x) [1] 1 2 3 4 5 > rep(1, times = 10) # rep replicate value 1 for 10 times [1] 1 1 1 1 1 1 1 1 1 1 > rep(c(0,1,2), times = 5) #replicate 0,1,2 for 5 times[1] 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 > rep(c(0,1,2), each = 5) #replicate 0 for 5 times and 1 for 5 times...[1] 0 0 0 0 0 1 1 1 1 1 2 2 2 2 2

Change class

> yesno <- sample(c("yes","no"), size = 10, replace = TRUE) > class(yesno) [1] "character" > yesnoFactor <-as.factor(yesno)> yesnoFactor [1] yes no yes no yes yes yes no no no > yesnoFactor <- factor(yesno, levels = c("yes","no"))> yesnoFactor [1] yes no yes no yes yes yes no no no Levels: yes no > as.numeric(yesnoFactor) [1] 1 2 1 2 1 1 1 2 2 2

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

R - Hello World (Data Types and Vector)

Start : Install R and R Studio

Five basic Classes of Objects(data types) in R

character

numeric (if you give 1 for variable, it will give numeric Object)

integer (you could give variable x with 1L for giving integer Object)

complex

logical (TRUE or FALSE)

Inf and NaN

Inf means infinite, for example, 1/0 will give you Inf

NaN means Not a Number, for example, 0/0 will give you NaN

Comment

# indicates comment

x <- 100 # this is comment

"Hello World"

Give variable x "Hello World" and print it to the console

> x <- "Hello World" # <- symbol is the assignment operator > print(x) [1] "Hello World"

Vector : A basic Object that contains elements of the same Class

In "Hello World" example, x is a Vector and the first element is "Hello World"

Create Vector using c() function & vector() function

> x <- c(0.5, 0.6) > x # print x [1] 0.5 0.6 > x <- vector("numeric", length = 10) > x # print x [1] 0 0 0 0 0 0 0 0 0 0

What if we create vector with different Objects? A: Coercion will occur

> y <- c(1.7, "a") > y # 1.7 will be changed to character "1.7" [1] "1.7" "a"

Connect string with paste() function

> a <- c("Blog","address","is") > a [1] "Blog" "address" "is" > paste(a,collapse=" ") [1] "Blog address is" > b <- c(a,"parklize.blogspot.com") > b [1] "Blog" "address" "is" [4] "parklize.blogspot.com" > paste(b,collapse=" ") [1] "Blog address is parklize.blogspot.com" > c <- paste("Blog","address","is","parklize.blogspot.com",sep=" ") > c [1] "Blog address is parklize.blogspot.com"

Check whether vectors are identical or not

> x <- c(1:5) > y <- 1:5 > identical(x,y) [1] TRUE > names(x) <- c("a","b","c","d","e") >identical(x,y) [1] FALSE

Explicit Coercion

Object could be coerced from one class to another class with as.* function

> y <- 1.7 > class(y) [1] "numeric" > y <- as.character(y) # explicitly coerce to character Object > class(y) [1] "character"

Get help for function

str(function_name) gives brief information about a function, and ?function_name opens its help page for more details

> str(matrix) # check matrix() function in brief function (data = NA, nrow = 1, ncol = 1, byrow = FALSE, dimnames = NULL) > ?matrix # get help page on the matrix() function

Get help for operator

If you want to know what the operator ":" does...use ?':' and it will open the help page.

> ?':' starting httpd help server ... done

List variables in your workspace

> x <- 1:4 > ls() [1] "x"

Back to the list

No comments:
Email This BlogThis!Share to X Share to Facebook Share to Pinterest

Labels: programming, R, tutorial

Older Posts Home
View mobile version

Subscribe to: Posts (Atom)

Research

Development

Search This Blog

Total Pageviews