Example - Create a directory, download data, and summarize it


  • Check the current working directory and create "RDirectory" if it doesn't exist.
1:  > getwd()  
2:  [1] "/Users/guangyuan"  
3:  > file.exists("./RDirectory")  
4:  [1] FALSE  
5:  > if(!file.exists("./RDirectory")) {  
6:  +   dir.create("./RDirectory")  
7:  + }  
8:  > file.exists("./RDirectory")  
9:  [1] TRUE  

  • Download file and read
1:  > url <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"  
2:  > download.file(url, destfile = "./RDirectory/restaurants.csv", method = "curl")  
3:  > data <- read.csv("./RDirectory/restaurants.csv")  

  • Summary of the data
1:  > head(data, n=3) # get the first 3 rows  
2:  > tail(data, n=3) # get the last 3 rows  
3:  > summary(data)  
4:  > str(data) # display the structure of the object  

  • Make table
1:  > table(data$zipCode, useNA = "ifany")  
2:  > table(data$councilDistrict, data$zipCode)  
3:     -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213 21214 21215 21216 21217 21218 21220 21222  
4:   1    0   0  37   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0   7  
5:   2    0   0   0   3  27   0   0   0   0   0   0   0   0   0   0   0   0   0   0  
6:   3    0   0   0   0   0   0   0   0   0   0   0   2  17   0   0   0   3   0   0  
7:   4    0   0   0   0   0   0   0   0   0   0  27   0   0   0   0   0   0   0   0  
8:   5    0   0   0   0   0   3   0   6   0   0   0   0   0  31   0   0   0   0   0  
9:   6    0   0   0   0   0   0   0   1  19   0   0   0   0  15   1   0   0   0   0  

  • Check zipcode
 > table(ifelse(data$zipCode <0 ,TRUE, FALSE))  
 FALSE TRUE   
  1326   1   
 > table(data$zipCode <0)  
 FALSE TRUE   
  1326   1  
  • Categorization
 > data$zipGroups <- cut(data$zipCode, breaks = quantile(data$zipCode))  
 > table(data$zipGroups)  
 (-2.123e+04,2.12e+04] (2.12e+04,2.122e+04] (2.122e+04,2.123e+04] (2.123e+04,2.129e+04]   
          337          375          282          332   
 > table(data$zipGroups,data$zipCode)  
             -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213 21214 21215 21216 21217  
  (-2.123e+04,2.12e+04]   0  136  201   0   0   0   0   0   0   0   0   0   0   0   0   0  
  (2.12e+04,2.122e+04]    0   0   0  27  30   4   1   8  23  41  28  31  17  54  10  32  
 > library(Hmisc)  # use Hmisc package
 > data$zipGroups <- cut2(data$zipCode,g = 4)  
 > table(data$zipGroups)  
 [-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]   
       338      375      300      314   
  • Mutate to create new variable
 > library(Hmisc); library(plyr)  
 > data2 <- mutate(data, zipGroups = cut2(zipCode, g=4))  
 > table(data2$zipGroups)  
 [-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]   
       338      375      300      314   
  • NA Values
1:  > sum(is.na(data$councilDistrict))  # count NAs: is.na() returns TRUE for each NA, and sum() counts each TRUE as 1
2:  [1] 0  
3:  > any(is.na(data$councilDistrict))  # is any value in that column NA?
4:  [1] FALSE  
5:  > all(data$zipCode>0)  # are all zipCode > 0
6:  [1] FALSE  
7:  > colSums(is.na(data))  
8:        name     zipCode  neighborhood councilDistrict policeDistrict   Location.1   
9:         0        0        0        0        0        0   
10:  > all(colSums(is.na(data)) == 0)  
11:  [1] TRUE  
12:  > table(data$zipCode %in% c("21213","21214"))  # zipcode in 21213 or 21214
13:  FALSE TRUE   
14:   1279  48   
15:  > data[data$zipCode %in% c("21213","21214"),]  
16:                     name zipCode           neighborhood councilDistrict policeDistrict  
17:  39              BERMUDA BAR  21213          Broadway East       12    EASTERN  
18:  44    BIG BAD WOLF'S HOUSE OF BARBEQUE  21214 Harford-Echodale/Perring Parkway        3  NORTHEASTERN  
  • Check data size
1:  > object.size(data)  
2:  242752 bytes  
3:  > print(object.size(data), units="Mb")  
4:  0.2 Mb  

  • Use xtabs
 > data(UCBAdmissions)  
 > ucba <- as.data.frame(UCBAdmissions)  
 > summary(ucba)  
    Admit    Gender  Dept    Freq     
  Admitted:12  Male :12  A:4  Min.  : 8.0   
  Rejected:12  Female:12  B:4  1st Qu.: 80.0   
               C:4  Median :170.0   
               D:4  Mean  :188.6   
               E:4  3rd Qu.:302.5   
               F:4  Max.  :512.0   
 > crosstab <- xtabs(Freq ~ Gender + Admit, data = ucba) 
 > crosstab  
     Admit  
 Gender  Admitted Rejected  
  Male    1198   1493  
  Female   557   1278  

  • Use ftable
 > warpbreaks$replicate  
  [1] 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9  
 > summary(warpbreaks)  
    breaks   wool  tension  replicate  
  Min.  :10.00  A:27  L:18  Min.  :1   
  1st Qu.:18.25  B:27  M:18  1st Qu.:3   
  Median :26.00     H:18  Median :5   
  Mean  :28.15         Mean  :5   
  3rd Qu.:34.00         3rd Qu.:7   
  Max.  :70.00         Max.  :9   
 > crosstab <- xtabs(breaks ~ .,data = warpbreaks)  
 > crosstab  
 , , replicate = 1  
   tension  
 wool L M H  
   A 26 18 36  
   B 27 42 20  
 , , replicate = 2  
   tension  
 wool L M H  
   A 30 21 21  
   B 14 26 21  
 , , replicate = 3  
   tension  
 wool L M H  
   A 54 29 24  
   B 29 19 24  
 , , replicate = 4  
   tension  
 wool L M H  
   A 25 17 18  
   B 19 16 17  
 , , replicate = 5  
   tension  
 wool L M H  
   A 70 12 10  
   B 29 39 13  
 , , replicate = 6  
   tension  
 wool L M H  
   A 52 18 43  
   B 31 28 15  
 , , replicate = 7  
   tension  
 wool L M H  
   A 51 35 28  
   B 41 21 15  
 , , replicate = 8  
   tension  
 wool L M H  
   A 26 30 15  
   B 20 39 16  
 , , replicate = 9  
   tension  
 wool L M H  
   A 67 36 26  
   B 44 29 28  
 > ftable(crosstab)  
        replicate 1 2 3 4 5 6 7 8 9  
 wool tension                     
 A  L         26 30 54 25 70 52 51 26 67  
    M         18 21 29 17 12 18 35 30 36  
    H         36 21 24 18 10 43 28 15 26  
 B  L         27 14 29 19 29 31 41 20 44  
    M         42 26 19 16 39 28 21 39 29  
    H         20 21 24 17 13 15 15 16 28  

  • Change class
 > class(data$zipCode)  
 [1] "integer"  
 > data$zipCodeFacter <- as.factor(data$zipCode)