Data Preparation I: Subsetting and Merging

  • Subsetting a vector
  • Subsetting a data frame
  • Merging data

Subsetting a vector

x = c(1:10)
x
##  [1]  1  2  3  4  5  6  7  8  9 10
## Use square brackets to get one element of the vector
x[5]
## [1] 5
## Use a logical argument
x[x > 6]
## [1]  7  8  9 10

Subsetting a vector

## Using a logical vector to subset x
good = x > 6
good
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
x[good]
## [1]  7  8  9 10

Subsetting a data frame

load("/Users/yadi/UofM/R-QME/Data_Subs_Merge.RData")
mydata1
##    id age gender
## 1   1  23      1
## 2   2  19      1
## 3   3  34      0
## 4   4  31      1
## 5   5  28      1
## 6   6  29      0
## 7   7  20      0
## 8   8  30      0
## 9   9  26      1
## 10 10  19      1

Subsetting a data frame

## Indexing a data frame
mydata1[2,2]
## [1] 19
mydata1[2, ]
##   id age gender
## 2  2  19      1
mydata1[ ,1]
##  [1]  1  2  3  4  5  6  7  8  9 10

Subsetting a data frame

## Subsetting variables by column number
mydata1[ ,2:3]
##    age gender
## 1   23      1
## 2   19      1
## 3   34      0
## 4   31      1
## 5   28      1
## 6   29      0
## 7   20      0
## 8   30      0
## 9   26      1
## 10  19      1

Subsetting a data frame

## Subsetting variables by column number
mydata1[ ,c(1,3)]
##    id gender
## 1   1      1
## 2   2      1
## 3   3      0
## 4   4      1
## 5   5      1
## 6   6      0
## 7   7      0
## 8   8      0
## 9   9      1
## 10 10      1

Subsetting a data frame

## Subsetting variables by name
mydata1[c("age", "gender")]
##    age gender
## 1   23      1
## 2   19      1
## 3   34      0
## 4   31      1
## 5   28      1
## 6   29      0
## 7   20      0
## 8   30      0
## 9   26      1
## 10  19      1

Subsetting a data frame

## Using logical argument
mydata1[mydata1$age > 25, ]
##   id age gender
## 3  3  34      0
## 4  4  31      1
## 5  5  28      1
## 6  6  29      0
## 8  8  30      0
## 9  9  26      1

Subsetting a data frame

## Using subset()
subset(mydata1, age > 25)
##   id age gender
## 3  3  34      0
## 4  4  31      1
## 5  5  28      1
## 6  6  29      0
## 8  8  30      0
## 9  9  26      1
subset(mydata1, age > 25 & gender == 1)
##   id age gender
## 4  4  31      1
## 5  5  28      1
## 9  9  26      1

Subsetting a data frame

Data with missing values

mydata2
##    id age gender
## 1   1  23      1
## 2   2  19      1
## 3   3  34      0
## 4   4  31     NA
## 5   5  28      1
## 6   6  29      0
## 7   7  NA      0
## 8   8  30      0
## 9   9  26      1
## 10 10  19      1

Subsetting a data frame

Two ways of removing missing data

  • Remove all missing data in the file
    • complete.cases()
    • na.omit()
  • Remove missing data of some of the variables
    • is.na()

Subsetting a data frame

Using complete.cases()

good = complete.cases(mydata2)
good
##  [1]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
mydata2[good, ]
##    id age gender
## 1   1  23      1
## 2   2  19      1
## 3   3  34      0
## 5   5  28      1
## 6   6  29      0
## 8   8  30      0
## 9   9  26      1
## 10 10  19      1

Subsetting a data frame

Using na.omit()

na.omit(mydata2)
##    id age gender
## 1   1  23      1
## 2   2  19      1
## 3   3  34      0
## 5   5  28      1
## 6   6  29      0
## 8   8  30      0
## 9   9  26      1
## 10 10  19      1

Subsetting a data frame

Using is.na()

mydata2[!is.na(mydata2$age), ]
##    id age gender
## 1   1  23      1
## 2   2  19      1
## 3   3  34      0
## 4   4  31     NA
## 5   5  28      1
## 6   6  29      0
## 8   8  30      0
## 9   9  26      1
## 10 10  19      1

Merging data

Example 1: merging two data sets by a single identifier

data1
##   id age
## 1  1  23
## 2  2  19
## 3  3  34
## 4  4  31
## 5  5  28
data2
##   id gender
## 1  1      1
## 2  2      1
## 3  3      0
## 4  4      0
## 5  5      1

Merging data

Example 1: merging two data sets by a single identifier

merge(data1, data2, by = "id")
##   id age gender
## 1  1  23      1
## 2  2  19      1
## 3  3  34      0
## 4  4  31      0
## 5  5  28      1

Merging data

Example 2: merging unsorted data

data3
##   id gender
## 1  3      0
## 2  4      0
## 3  1      1
## 4  5      1
## 5  2      1
merge(data1, data3, by = "id")
##   id age gender
## 1  1  23      1
## 2  2  19      1
## 3  3  34      0
## 4  4  31      0
## 5  5  28      1

Merging data

Example 3: merging when one data set has fewer observations

data1
##   id age
## 1  1  23
## 2  2  19
## 3  3  34
## 4  4  31
## 5  5  28
data4
##   id gender
## 1  1      1
## 2  2      1
## 3  3      0
## 4  4      0

Merging data

Example 3: merging when one data set has fewer observations

merge(data1, data4, by = "id")
##   id age gender
## 1  1  23      1
## 2  2  19      1
## 3  3  34      0
## 4  4  31      0
merge(data1, data4, by = "id", all.x = TRUE)
##   id age gender
## 1  1  23      1
## 2  2  19      1
## 3  3  34      0
## 4  4  31      0
## 5  5  28     NA

Merging data

Example 4: merging clustered data by a single identifier

data5
##    id.student id.class math.score gender
## 1           1      101        600      1
## 2           2      101        700      1
## 3           3      101        550      0
## 4           4      101        790      1
## 5           5      201        450      1
## 6           6      201        640      0
## 7           7      201        580      0
## 8           8      301        670      0
## 9           9      301        720      1
## 10         10      301        590      1

Merging data

Example 4: merging clustered data by a single identifier

data6
##   id.class ses.class teach.exp
## 1      101         5        11
## 2      201         4         3
## 3      301         3         7

Merging data

Example 4: merging clustered data by a single identifier

merge(data5, data6, by = "id.class")
##    id.class id.student math.score gender ses.class teach.exp
## 1       101          1        600      1         5        11
## 2       101          2        700      1         5        11
## 3       101          3        550      0         5        11
## 4       101          4        790      1         5        11
## 5       201          5        450      1         4         3
## 6       201          6        640      0         4         3
## 7       201          7        580      0         4         3
## 8       301          8        670      0         3         7
## 9       301          9        720      1         3         7
## 10      301         10        590      1         3         7

Merging data

Example 5: merging clustered data by two identifiers

data7
##    id.student id.school id.class math.score gender
## 1           1         1      101        600      1
## 2           2         1      101        700      1
## 3           3         1      101        550      0
## 4           4         1      102        790      1
## 5           5         1      102        450      1
## 6           6         2      101        640      0
## 7           7         2      101        580      0
## 8           8         2      102        670      0
## 9           9         2      102        720      1
## 10         10         2      102        590      1

Merging data

Example 5: merging clustered data by two identifiers

data8
##   id.school id.class ses.class teach.exp
## 1         1      101         5        11
## 2         1      102         4         3
## 3         2      101         3         7
## 4         2      102         6         6

Merging data

Example 5: merging clustered data by two identifiers

merge(data7, data8, by = c("id.school", "id.class"))
##    id.school id.class id.student math.score gender ses.class teach.exp
## 1          1      101          1        600      1         5        11
## 2          1      101          2        700      1         5        11
## 3          1      101          3        550      0         5        11
## 4          1      102          4        790      1         4         3
## 5          1      102          5        450      1         4         3
## 6          2      101          6        640      0         3         7
## 7          2      101          7        580      0         3         7
## 8          2      102          8        670      0         6         6
## 9          2      102          9        720      1         6         6
## 10         2      102         10        590      1         6         6

Final notes

  • Download the Data_Subs_Merge.RData file so you can reproduce the examples in this presentation
  • Note that you need to assign the examples presented to a new data frame
    • new_data = mydata1[c("age", "gender")]
    • new_data = subset(mydata1, age > 25 & gender == 1)
    • new_data = merge(data1, data4, by = "id", all.x = TRUE)