- Check the current working directory and create "RDirectory" if it doesn't exist.
1: > getwd()
2: [1] "/Users/guangyuan"
3: > file.exists("./RDirectory")
4: [1] FALSE
5: > if(!file.exists("./RDirectory")) {
6: + dir.create("./RDirectory")
7: + }
8: > file.exists("./RDirectory")
9: [1] TRUE
1: > url <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
2: > download.file(url, destfile = "./RDirectory/restaurants.csv", method = "curl")
3: > data <- read.csv("./RDirectory/restaurants.csv")
1: > head(data, n=3) # get first 3 lines
2: > tail(data, n=3) # get last 3 lines
3: > summary(data)
4: > str(data) # display the structure of the object
1: > table(data$zipCode, useNA = "ifany")
2: > table(data$councilDistrict, data$zipCode)
3: -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213 21214 21215 21216 21217 21218 21220 21222
4: 1 0 0 37 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 7
5: 2 0 0 0 3 27 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6: 3 0 0 0 0 0 0 0 0 0 0 0 2 17 0 0 0 3 0 0
7: 4 0 0 0 0 0 0 0 0 0 0 27 0 0 0 0 0 0 0 0
8: 5 0 0 0 0 0 3 0 6 0 0 0 0 0 31 0 0 0 0 0
9: 6 0 0 0 0 0 0 0 1 19 0 0 0 0 15 1 0 0 0 0
> table(ifelse(data$zipCode <0 ,TRUE, FALSE))
FALSE TRUE
1326 1
> table(data$zipCode <0)
FALSE TRUE
1326 1
> data$zipGroups <-
cut(data$zipCode, breaks = quantile(data$zipCode))
> table(data$zipGroups)
(-2.123e+04,2.12e+04] (2.12e+04,2.122e+04] (2.122e+04,2.123e+04] (2.123e+04,2.129e+04]
337 375 282 332
> table(data$zipGroups,data$zipCode)
-21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213 21214 21215 21216 21217
(-2.123e+04,2.12e+04] 0 136 201 0 0 0 0 0 0 0 0 0 0 0 0 0
(2.12e+04,2.122e+04] 0 0 0 27 30 4 1 8 23 41 28 31 17 54 10 32
>
library(Hmisc)
# use Hmisc package
> data$zipGroups <- cut2(data$zipCode,g = 4)
> table(data$zipGroups)
[-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]
338 375 300 314
- Use mutate() (from plyr) to create a new variable
library(Hmisc); library(plyr)
> data2 <- mutate(data, zipGroups = cut2(zipCode, g=4))
> table(data2$zipGroups)
[-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]
338 375 300 314
1: > sum(is.na(data$councilDistrict)) # count NAs: is.na() returns TRUE for each NA, and sum() counts TRUE as 1
2: [1] 0
3: > any(is.na(data$councilDistrict)) # is any value in that column NA?
4: [1] FALSE
5: > all(data$zipCode>0) # are all zipCode > 0
6: [1] FALSE
7: > colSums(is.na(data))
8: name zipCode neighborhood councilDistrict policeDistrict Location.1
9: 0 0 0 0 0 0
10: > all(colSums(is.na(data)) == 0)
11: [1] TRUE
12: > table(data$zipCode %in% c("21213","21214")) # zipcode in 21213 or 21214
13: FALSE TRUE
14: 1279 48
15: > data[data$zipCode %in% c("21213","21214"),]
16: name zipCode neighborhood councilDistrict policeDistrict
17: 39 BERMUDA BAR 21213 Broadway East 12 EASTERN
18: 44 BIG BAD WOLF'S HOUSE OF BARBEQUE 21214 Harford-Echodale/Perring Parkway 3 NORTHEASTERN
1: > object.size(data)
2: 242752 bytes
3: > print(object.size(data), units="Mb")
4: 0.2 Mb
> data(UCBAdmissions)
> ucba <- as.data.frame(UCBAdmissions)
> summary(ucba)
Admit Gender Dept Freq
Admitted:12 Male :12 A:4 Min. : 8.0
Rejected:12 Female:12 B:4 1st Qu.: 80.0
C:4 Median :170.0
D:4 Mean :188.6
E:4 3rd Qu.:302.5
F:4 Max. :512.0
> crosstab <- xtabs(Freq ~ Gender + Admit, data = ucba)
> crosstab
Admit
Gender Admitted Rejected
Male 1198 1493
Female 557 1278
> warpbreaks$replicate
[1] 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9
> summary(warpbreaks)
breaks wool tension replicate
Min. :10.00 A:27 L:18 Min. :1
1st Qu.:18.25 B:27 M:18 1st Qu.:3
Median :26.00 H:18 Median :5
Mean :28.15 Mean :5
3rd Qu.:34.00 3rd Qu.:7
Max. :70.00 Max. :9
> crosstab <- xtabs(breaks ~ .,data = warpbreaks)
> crosstab
, , replicate = 1
tension
wool L M H
A 26 18 36
B 27 42 20
, , replicate = 2
tension
wool L M H
A 30 21 21
B 14 26 21
, , replicate = 3
tension
wool L M H
A 54 29 24
B 29 19 24
, , replicate = 4
tension
wool L M H
A 25 17 18
B 19 16 17
, , replicate = 5
tension
wool L M H
A 70 12 10
B 29 39 13
, , replicate = 6
tension
wool L M H
A 52 18 43
B 31 28 15
, , replicate = 7
tension
wool L M H
A 51 35 28
B 41 21 15
, , replicate = 8
tension
wool L M H
A 26 30 15
B 20 39 16
, , replicate = 9
tension
wool L M H
A 67 36 26
B 44 29 28
> ftable(crosstab)
replicate 1 2 3 4 5 6 7 8 9
wool tension
A L 26 30 54 25 70 52 51 26 67
M 18 21 29 17 12 18 35 30 36
H 36 21 24 18 10 43 28 15 26
B L 27 14 29 19 29 31 41 20 44
M 42 26 19 16 39 28 21 39 29
H 20 21 24 17 13 15 15 16 28
> class(data$zipCode)
[1] "integer"
> data$zipCodeFacter <- as.factor(data$zipCode)