- Subsetting a vector
- Subsetting a data frame
- Merging data
x = c(1:10) x
## [1] 1 2 3 4 5 6 7 8 9 10
## Use square brackets to get one element of the vector x[5]
## [1] 5
## Use a logical argument x[x > 6]
## [1] 7 8 9 10
## Using a logical vector to subset x good = x > 6 good
## [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
x[good]
## [1] 7 8 9 10
load("/Users/yadi/UofM/R-QME/Data_Subs_Merge.RData")
mydata1
## id age gender ## 1 1 23 1 ## 2 2 19 1 ## 3 3 34 0 ## 4 4 31 1 ## 5 5 28 1 ## 6 6 29 0 ## 7 7 20 0 ## 8 8 30 0 ## 9 9 26 1 ## 10 10 19 1
## Indexing a data frame mydata1[2,2]
## [1] 19
mydata1[2, ]
## id age gender ## 2 2 19 1
mydata1[ ,1]
## [1] 1 2 3 4 5 6 7 8 9 10
## Subsetting variables by column number mydata1[ ,2:3]
## age gender ## 1 23 1 ## 2 19 1 ## 3 34 0 ## 4 31 1 ## 5 28 1 ## 6 29 0 ## 7 20 0 ## 8 30 0 ## 9 26 1 ## 10 19 1
## Subsetting variables by column number mydata1[ ,c(1,3)]
## id gender ## 1 1 1 ## 2 2 1 ## 3 3 0 ## 4 4 1 ## 5 5 1 ## 6 6 0 ## 7 7 0 ## 8 8 0 ## 9 9 1 ## 10 10 1
## Subsetting variables by name mydata1[c("age", "gender")]
## age gender ## 1 23 1 ## 2 19 1 ## 3 34 0 ## 4 31 1 ## 5 28 1 ## 6 29 0 ## 7 20 0 ## 8 30 0 ## 9 26 1 ## 10 19 1
## Using logical argument mydata1[mydata1$age > 25, ]
## id age gender ## 3 3 34 0 ## 4 4 31 1 ## 5 5 28 1 ## 6 6 29 0 ## 8 8 30 0 ## 9 9 26 1
## Using subset() subset(mydata1, age > 25)
## id age gender ## 3 3 34 0 ## 4 4 31 1 ## 5 5 28 1 ## 6 6 29 0 ## 8 8 30 0 ## 9 9 26 1
subset(mydata1, age > 25 & gender == 1)
## id age gender ## 4 4 31 1 ## 5 5 28 1 ## 9 9 26 1
Data with missing values
mydata2
## id age gender ## 1 1 23 1 ## 2 2 19 1 ## 3 3 34 0 ## 4 4 31 NA ## 5 5 28 1 ## 6 6 29 0 ## 7 7 NA 0 ## 8 8 30 0 ## 9 9 26 1 ## 10 10 19 1
Two ways of removing missing data
Using complete.cases()
good = complete.cases(mydata2) good
## [1] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE
mydata2[good, ]
## id age gender ## 1 1 23 1 ## 2 2 19 1 ## 3 3 34 0 ## 5 5 28 1 ## 6 6 29 0 ## 8 8 30 0 ## 9 9 26 1 ## 10 10 19 1
Using na.omit()
na.omit(mydata2)
## id age gender ## 1 1 23 1 ## 2 2 19 1 ## 3 3 34 0 ## 5 5 28 1 ## 6 6 29 0 ## 8 8 30 0 ## 9 9 26 1 ## 10 10 19 1
Using is.na()
mydata2[!is.na(mydata2$age), ]
## id age gender ## 1 1 23 1 ## 2 2 19 1 ## 3 3 34 0 ## 4 4 31 NA ## 5 5 28 1 ## 6 6 29 0 ## 8 8 30 0 ## 9 9 26 1 ## 10 10 19 1
Example 1: merging two data sets by a single identifier
data1
## id age ## 1 1 23 ## 2 2 19 ## 3 3 34 ## 4 4 31 ## 5 5 28
data2
## id gender ## 1 1 1 ## 2 2 1 ## 3 3 0 ## 4 4 0 ## 5 5 1
Example 1: merging two data sets by a single identifier
merge(data1, data2, by = "id")
## id age gender ## 1 1 23 1 ## 2 2 19 1 ## 3 3 34 0 ## 4 4 31 0 ## 5 5 28 1
Example 2: merging unsorted data
data3
## id gender ## 1 3 0 ## 2 4 0 ## 3 1 1 ## 4 5 1 ## 5 2 1
merge(data1, data3, by = "id")
## id age gender ## 1 1 23 1 ## 2 2 19 1 ## 3 3 34 0 ## 4 4 31 0 ## 5 5 28 1
Example 3: merging when one data set has fewer observations
data1
## id age ## 1 1 23 ## 2 2 19 ## 3 3 34 ## 4 4 31 ## 5 5 28
data4
## id gender ## 1 1 1 ## 2 2 1 ## 3 3 0 ## 4 4 0
Example 3: merging when one data set has fewer observations
merge(data1, data4, by = "id")
## id age gender ## 1 1 23 1 ## 2 2 19 1 ## 3 3 34 0 ## 4 4 31 0
merge(data1, data4, by = "id", all.x = TRUE)
## id age gender ## 1 1 23 1 ## 2 2 19 1 ## 3 3 34 0 ## 4 4 31 0 ## 5 5 28 NA
Example 4: merging clustered data by a single identifier
data5
## id.student id.class math.score gender ## 1 1 101 600 1 ## 2 2 101 700 1 ## 3 3 101 550 0 ## 4 4 101 790 1 ## 5 5 201 450 1 ## 6 6 201 640 0 ## 7 7 201 580 0 ## 8 8 301 670 0 ## 9 9 301 720 1 ## 10 10 301 590 1
Example 4: merging clustered data by a single identifier
data6
## id.class ses.class teach.exp ## 1 101 5 11 ## 2 201 4 3 ## 3 301 3 7
Example 4: merging clustered data by a single identifier
merge(data5, data6, by = "id.class")
## id.class id.student math.score gender ses.class teach.exp ## 1 101 1 600 1 5 11 ## 2 101 2 700 1 5 11 ## 3 101 3 550 0 5 11 ## 4 101 4 790 1 5 11 ## 5 201 5 450 1 4 3 ## 6 201 6 640 0 4 3 ## 7 201 7 580 0 4 3 ## 8 301 8 670 0 3 7 ## 9 301 9 720 1 3 7 ## 10 301 10 590 1 3 7
Example 5: merging clustered data by two identifiers
data7
## id.student id.school id.class math.score gender ## 1 1 1 101 600 1 ## 2 2 1 101 700 1 ## 3 3 1 101 550 0 ## 4 4 1 102 790 1 ## 5 5 1 102 450 1 ## 6 6 2 101 640 0 ## 7 7 2 101 580 0 ## 8 8 2 102 670 0 ## 9 9 2 102 720 1 ## 10 10 2 102 590 1
Example 5: merging clustered data by two identifiers
data8
## id.school id.class ses.class teach.exp ## 1 1 101 5 11 ## 2 1 102 4 3 ## 3 2 101 3 7 ## 4 2 102 6 6
Example 5: merging clustered data by two identifiers
merge(data7, data8, by = c("id.school", "id.class"))
## id.school id.class id.student math.score gender ses.class teach.exp ## 1 1 101 1 600 1 5 11 ## 2 1 101 2 700 1 5 11 ## 3 1 101 3 550 0 5 11 ## 4 1 102 4 790 1 4 3 ## 5 1 102 5 450 1 4 3 ## 6 2 101 6 640 0 3 7 ## 7 2 101 7 580 0 3 7 ## 8 2 102 8 670 0 6 6 ## 9 2 102 9 720 1 6 6 ## 10 2 102 10 590 1 6 6