Homework 10

For Loops and Randomization Tests

Worked on some parts of the code with Stephen Peters Collaer

Question 1 - Using a for loop, write a function to calculate the number of zeroes in a numeric vector. Before entering the loop, set up a counter variable counter <- 0. Inside the loop, add 1 to counter each time you have a zero in the matrix. Finally, use return(counter) for the output.

##################################################
# FUNCTION: num_zero
# purpose: count number of zeros 
# input: x vectors of numeric
# output: count of number of zeros
#------------------------------------------------- 
num_zeros <- function(vector1 = sample(0:3, size = 1000, replace = TRUE)){
  counter <- 0
  for(i in 1:length(vector1)){
    if(vector1[i]== 0){
      counter <- counter + 1
    }
  }
  return(counter)
} 
 
num_zeros()

## [1] 255

Question 2 - Use subsetting instead of a loop to rewrite the function as a single line of code.

##################################################
# FUNCTION: num_zero_sub
# purpose: count number of zeros 
# input: x vectors of numeric
# output: count of number of zeros
#------------------------------------------------- 
num_zeros_sub <- function(vector1 = sample(0:3, size = 1000, replace = TRUE)){
 length(vector1[vector1 == 0])
}
  
num_zeros_sub()

## [1] 267

Question 3 - Write a function that takes as input two integers representing the number of rows and columns in a matrix. The output is a matrix of these dimensions in which each element is the product of the row number x the column number.

##################################################
# FUNCTION: mat_product
# purpose: multiply the row number x by column number
# input: number of rows and columns in a matrix
# output: product of the row number and column number 
#------------------------------------------------- 

mat_product <- function(nrow=4, ncol=4){
  mat <- matrix(nrow=4, ncol=4)
  for(i in 1:nrow){
    for(j in 1:ncol){
      mat[i,j] <- i*j
    }
  }
  print(mat)
}

mat_product()

##      [,1] [,2] [,3] [,4]
## [1,]    1    2    3    4
## [2,]    2    4    6    8
## [3,]    3    6    9   12
## [4,]    4    8   12   16

Question 4 - In the next few lectures, you will learn how to do a randomization test on your data. We will complete some of the steps today to practice calling custom functions within a for loop. Use the code from the March 31st lecture (Randomization Tests) to complete the following steps:

a. Simulate a dataset with 3 groups of data, each group drawn from a distribution with a different mean. The final data frame should have 1 column for group and 1 column for the response variable.

group <- c(rep("A", 100), rep("B", 100), rep("C", 100))
A <- rnorm(n=100, mean = 1, sd = 0.1)
B <- rnorm(n=100, mean = 2, sd = 0.1)
C <- rnorm(n=100, mean = 3, sd = 0.1)

response <- c(A, B, C)
d_frame <- data.frame(group, response)

print(d_frame)

##     group  response
## 1       A 0.9489109
## 2       A 1.0330776
## 3       A 0.8346273
## 4       A 0.8276148
## 5       A 0.9280703
## 6       A 1.0646061
## 7       A 1.0985055
## 8       A 0.8992640
## 9       A 1.1522294
## 10      A 0.8544384
## 11      A 1.1393708
## 12      A 0.9712650
## 13      A 0.9011366
## 14      A 1.1382998
## 15      A 1.1191562
## 16      A 1.1225984
## 17      A 1.2727916
## 18      A 1.1526323
## 19      A 1.0770748
## 20      A 1.1176793
## 21      A 0.9018112
## 22      A 0.9695369
## 23      A 1.0187102
## 24      A 1.0184800
## 25      A 0.9457880
## 26      A 0.9829977
## 27      A 0.9059845
## 28      A 0.8345225
## 29      A 1.0164207
## 30      A 1.0812333
## 31      A 1.0927025
## 32      A 1.1044958
## 33      A 0.9405686
## 34      A 1.0223136
## 35      A 0.8439077
## 36      A 0.8806514
## 37      A 1.1312564
## 38      A 1.0358736
## 39      A 1.1394001
## 40      A 0.9244434
## 41      A 1.2854223
## 42      A 1.1371510
## 43      A 0.9052031
## 44      A 1.0287769
## 45      A 1.0351112
## 46      A 0.9082507
## 47      A 1.0763453
## 48      A 1.1755073
## 49      A 0.8956280
## 50      A 1.1702495
## 51      A 0.8490703
## 52      A 1.0677705
## 53      A 1.1067605
## 54      A 1.0507611
## 55      A 0.9466799
## 56      A 0.7482978
## 57      A 0.8735492
## 58      A 1.0037703
## 59      A 0.9088471
## 60      A 1.0563512
## 61      A 1.0856103
## 62      A 1.0223159
## 63      A 1.1497415
## 64      A 1.0304237
## 65      A 1.0980262
## 66      A 0.9989163
## 67      A 0.9227823
## 68      A 0.8624161
## 69      A 1.0630950
## 70      A 0.9656033
## 71      A 0.9211142
## 72      A 1.0588867
## 73      A 0.9847022
## 74      A 1.1580262
## 75      A 1.1210450
## 76      A 0.7413241
## 77      A 0.9392109
## 78      A 0.9609425
## 79      A 0.8703884
## 80      A 0.9986392
## 81      A 0.9303418
## 82      A 0.9763315
## 83      A 1.0004204
## 84      A 0.8465336
## 85      A 0.9537340
## 86      A 0.9022313
## 87      A 1.1080124
## 88      A 0.9588087
## 89      A 1.0116749
## 90      A 0.9591948
## 91      A 1.1483071
## 92      A 1.0157307
## 93      A 1.0770544
## 94      A 0.9678731
## 95      A 1.1021428
## 96      A 0.9023021
## 97      A 0.8564305
## 98      A 1.1766782
## 99      A 0.8211142
## 100     A 1.0487605
## 101     B 1.9350425
## 102     B 1.9252076
## 103     B 1.9242677
## 104     B 2.1445266
## 105     B 1.9313615
## 106     B 2.0032403
## 107     B 1.9131659
## 108     B 2.0976241
## 109     B 2.1488031
## 110     B 2.1492973
## 111     B 1.8777874
## 112     B 2.1293482
## 113     B 1.9499672
## 114     B 1.9761347
## 115     B 2.0687658
## 116     B 2.1087312
## 117     B 2.0193473
## 118     B 1.9342212
## 119     B 1.8626402
## 120     B 1.9971120
## 121     B 2.0997790
## 122     B 1.9037990
## 123     B 2.0495504
## 124     B 1.7797637
## 125     B 1.9897129
## 126     B 1.8387306
## 127     B 2.0564937
## 128     B 1.8693414
## 129     B 1.9833618
## 130     B 1.9799649
## 131     B 2.1044764
## 132     B 2.0222013
## 133     B 2.0982842
## 134     B 1.7888821
## 135     B 2.1614057
## 136     B 2.1119164
## 137     B 2.1642945
## 138     B 2.1436314
## 139     B 1.9586575
## 140     B 2.1672318
## 141     B 1.9245933
## 142     B 2.0597950
## 143     B 1.9553836
## 144     B 2.0451524
## 145     B 1.8780734
## 146     B 1.7526507
## 147     B 1.8996536
## 148     B 1.8639164
## 149     B 2.1441310
## 150     B 2.0019437
## 151     B 2.0087503
## 152     B 1.9311017
## 153     B 1.9548209
## 154     B 2.2359463
## 155     B 1.9480998
## 156     B 2.0514466
## 157     B 2.1871640
## 158     B 1.8359306
## 159     B 1.9174075
## 160     B 2.0013353
## 161     B 2.1573155
## 162     B 2.0875571
## 163     B 1.9369616
## 164     B 1.9611189
## 165     B 2.2073903
## 166     B 2.0788114
## 167     B 1.9772477
## 168     B 2.1809822
## 169     B 2.0231169
## 170     B 1.9293785
## 171     B 1.9838162
## 172     B 2.1868840
## 173     B 2.1114603
## 174     B 2.0377402
## 175     B 2.0437561
## 176     B 1.9230077
## 177     B 2.0010833
## 178     B 1.9158758
## 179     B 2.0707335
## 180     B 1.9190631
## 181     B 1.9279236
## 182     B 1.9632803
## 183     B 1.9833505
## 184     B 1.9774111
## 185     B 2.0410429
## 186     B 2.0300099
## 187     B 2.1427943
## 188     B 1.9389083
## 189     B 1.9416626
## 190     B 2.1464809
## 191     B 1.9655888
## 192     B 1.9584963
## 193     B 2.0294502
## 194     B 1.9736974
## 195     B 2.1115239
## 196     B 1.9899114
## 197     B 1.9509922
## 198     B 1.9469498
## 199     B 2.0040985
## 200     B 2.0253210
## 201     C 2.9142005
## 202     C 3.0293451
## 203     C 3.1440213
## 204     C 2.9015792
## 205     C 3.0918242
## 206     C 2.8118241
## 207     C 3.0284454
## 208     C 2.9952916
## 209     C 3.0360730
## 210     C 3.1164510
## 211     C 3.0314961
## 212     C 2.8819189
## 213     C 2.9427198
## 214     C 3.0264214
## 215     C 3.1628165
## 216     C 2.9289604
## 217     C 3.1091060
## 218     C 2.9788696
## 219     C 3.2546441
## 220     C 2.8648756
## 221     C 2.9691935
## 222     C 2.8938797
## 223     C 3.0071675
## 224     C 2.9635530
## 225     C 3.0619730
## 226     C 3.1696635
## 227     C 3.0059378
## 228     C 3.0627872
## 229     C 2.9285083
## 230     C 3.0078925
## 231     C 3.1763767
## 232     C 3.0032900
## 233     C 3.0220418
## 234     C 2.9982807
## 235     C 3.0823149
## 236     C 2.9158113
## 237     C 2.8169287
## 238     C 2.7593314
## 239     C 3.2632982
## 240     C 2.9630279
## 241     C 2.8558924
## 242     C 2.9772777
## 243     C 2.8802364
## 244     C 3.0076954
## 245     C 3.0712354
## 246     C 3.0376540
## 247     C 3.1477763
## 248     C 3.1214968
## 249     C 2.9966423
## 250     C 2.9184339
## 251     C 2.9905827
## 252     C 3.2833795
## 253     C 3.3244830
## 254     C 3.0984188
## 255     C 2.9157338
## 256     C 2.9985695
## 257     C 2.8881090
## 258     C 2.7713280
## 259     C 2.9664566
## 260     C 2.7427248
## 261     C 3.1537383
## 262     C 3.0308490
## 263     C 2.9653083
## 264     C 3.0300088
## 265     C 2.9265617
## 266     C 2.9474016
## 267     C 3.0016044
## 268     C 2.8213021
## 269     C 2.8666134
## 270     C 3.0180791
## 271     C 2.8335380
## 272     C 2.9166608
## 273     C 2.8844372
## 274     C 3.0939092
## 275     C 3.1112040
## 276     C 2.9064751
## 277     C 2.9513503
## 278     C 2.9046469
## 279     C 3.1726304
## 280     C 3.0993397
## 281     C 2.9003703
## 282     C 3.0765356
## 283     C 3.0575354
## 284     C 2.9406127
## 285     C 3.0200952
## 286     C 2.9242395
## 287     C 3.0478469
## 288     C 2.8545539
## 289     C 3.0895001
## 290     C 2.9976796
## 291     C 2.9487557
## 292     C 3.0525375
## 293     C 3.1073801
## 294     C 3.0077684
## 295     C 3.0241566
## 296     C 3.0833803
## 297     C 3.0024078
## 298     C 3.0463204
## 299     C 2.9137278
## 300     C 3.1812110

b. Write a custom function that 1) reshuffles the response variable, and 2) calculates the mean of each group in the reshuffled data. Store the means in a vector of length 3.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

##################################################
# FUNCTION: reshuffle
# purpose: get means of reshuffled data
# input: groups of data (A, B, C)
# output: vector length of 3 with means of each group
#------------------------------------------------- 
reshuffle <- function(df = d_frame, response = response){
  colnames(d_frame) <- c('group', 'response')
  new_shuff <- sample(d_frame$response, replace=FALSE)
    d_frame <- data.frame(group, new_shuff)
  sum_mean <- d_frame %>% group_by(group) %>% summarise(mean = mean(new_shuff))
      return(sum_mean)
}

reshuffle()

## # A tibble: 3 x 2
##   group  mean
##   <chr> <dbl>
## 1 A      1.92
## 2 B      2.06
## 3 C      2.03

c. Use a for loop to repeat the function in b 100 times. Store the results in a data frame that has 1 column indicating the replicate number and 1 column for each new group mean, for a total of 4 columns.

library(tidyr)

results <- data.frame()
  
for(i in 1:100){
  shuff <- sample(d_frame$response, replace=FALSE)
    d_frame_shuff <- data.frame(group, shuff)
  sum_mean <- d_frame_shuff %>% group_by(group) %>%    
    summarise(mean = mean(shuff))
  df <- sum_mean %>% pivot_wider(names_from = group, values_from = mean)
  final_df <- cbind(i, df)
  
  results <- rbind(results, final_df)
}

print(results)

##       i        A        B        C
## 1     1 1.923411 2.035020 2.055238
## 2     2 2.030707 1.979730 2.003233
## 3     3 1.853715 2.037623 2.122331
## 4     4 2.097272 1.893838 2.022559
## 5     5 2.027032 1.918857 2.067780
## 6     6 1.977747 1.971023 2.064899
## 7     7 1.898422 2.086438 2.028809
## 8     8 2.030071 2.006963 1.976635
## 9     9 2.078044 1.929922 2.005703
## 10   10 2.010550 1.983847 2.019272
## 11   11 1.864892 2.126383 2.022394
## 12   12 2.106846 1.910628 1.996195
## 13   13 2.022738 2.066754 1.924177
## 14   14 1.971465 2.057347 1.984857
## 15   15 1.904061 1.967805 2.141804
## 16   16 2.027785 2.051297 1.934586
## 17   17 1.955615 1.937979 2.120076
## 18   18 1.937583 2.102317 1.973770
## 19   19 2.171021 1.903089 1.939560
## 20   20 1.956471 1.974855 2.082343
## 21   21 2.092374 1.852835 2.068460
## 22   22 1.936598 2.043768 2.033303
## 23   23 1.988238 1.950242 2.075190
## 24   24 2.047977 1.995502 1.970191
## 25   25 2.031919 1.985752 1.995999
## 26   26 1.942426 2.070877 2.000367
## 27   27 1.999995 1.904903 2.108771
## 28   28 2.123316 2.051599 1.838754
## 29   29 1.998609 1.960040 2.055020
## 30   30 2.019434 2.023497 1.970738
## 31   31 1.994219 1.960954 2.058496
## 32   32 2.047033 1.951307 2.015329
## 33   33 1.920407 2.112264 1.980998
## 34   34 1.962063 1.966501 2.085106
## 35   35 2.022413 1.995585 1.995671
## 36   36 2.071889 2.009238 1.932543
## 37   37 2.023599 1.889132 2.100938
## 38   38 2.080219 1.876835 2.056616
## 39   39 2.004464 2.005163 2.004042
## 40   40 1.938638 2.013901 2.061130
## 41   41 2.006407 1.882779 2.124483
## 42   42 1.921604 1.950145 2.141921
## 43   43 1.996196 2.009823 2.007650
## 44   44 1.975729 1.951066 2.086874
## 45   45 2.063703 1.939013 2.010954
## 46   46 1.908313 2.057554 2.047802
## 47   47 2.044262 2.044148 1.925259
## 48   48 1.996681 1.923701 2.093287
## 49   49 1.950984 2.040889 2.021797
## 50   50 1.915258 1.925806 2.172606
## 51   51 1.851950 2.001538 2.160182
## 52   52 2.000794 2.026642 1.986233
## 53   53 2.023366 1.970777 2.019526
## 54   54 2.000014 1.944787 2.068868
## 55   55 1.928525 2.071486 2.013659
## 56   56 2.042812 1.901493 2.069365
## 57   57 2.103384 1.973640 1.936645
## 58   58 1.980743 2.002707 2.030219
## 59   59 1.924965 2.037180 2.051525
## 60   60 2.035173 1.981270 1.997227
## 61   61 2.072519 1.984268 1.956882
## 62   62 2.036915 1.946984 2.029770
## 63   63 2.038274 1.927484 2.047912
## 64   64 1.921718 2.058201 2.033750
## 65   65 2.090776 1.952403 1.970490
## 66   66 2.090776 1.909257 2.013636
## 67   67 2.012563 1.958442 2.042664
## 68   68 2.102882 1.879183 2.031604
## 69   69 1.990617 2.033730 1.989323
## 70   70 1.992289 2.024140 1.997240
## 71   71 1.973903 2.049412 1.990355
## 72   72 2.042303 1.827604 2.143763
## 73   73 2.005450 1.941992 2.066228
## 74   74 1.973146 2.094665 1.945859
## 75   75 2.027318 2.017377 1.968974
## 76   76 2.008129 1.934366 2.071175
## 77   77 2.053437 2.033708 1.926524
## 78   78 2.105689 1.923611 1.984369
## 79   79 2.100935 1.964620 1.948114
## 80   80 2.088997 1.937100 1.987572
## 81   81 1.892863 2.106216 2.014590
## 82   82 1.903224 2.103143 2.007302
## 83   83 1.934262 1.819598 2.259810
## 84   84 1.987191 1.963475 2.063004
## 85   85 1.973065 2.007247 2.033357
## 86   86 1.934785 2.010399 2.068485
## 87   87 1.950479 2.006244 2.056947
## 88   88 2.080891 1.908086 2.024692
## 89   89 2.048759 1.992310 1.972600
## 90   90 2.027251 2.083008 1.903410
## 91   91 1.984666 2.034617 1.994386
## 92   92 2.037621 1.982131 1.993918
## 93   93 1.873871 2.103404 2.036395
## 94   94 1.930436 2.082988 2.000244
## 95   95 1.973970 1.984898 2.054801
## 96   96 1.999138 2.045327 1.969204
## 97   97 2.109939 1.876144 2.027586
## 98   98 1.995805 2.038296 1.979568
## 99   99 2.084939 1.944884 1.983846
## 100 100 2.089501 1.998830 1.925339

d. Use qplot() to create a histogram of the means for each reshuffled group. Or, if you want a challenge, use ggplot() to overlay all 3 histograms in the same figure. How do the distributions of reshuffled means compare to the original means?

library(ggplot2)

# Calculate quantiles A - 

quantile(x=A, probs = 0.975)

##    97.5% 
## 1.176122

mean_plotA <- ggplot(data = results, aes(x=A)) +
    geom_histogram(color = "black", fill = "Steelblue3") +
    geom_vline(aes(xintercept=mean(A)),
            color="red", linetype="dashed", size=1) + 
  geom_vline(aes(xintercept= quantile(x=A, probs= 0.025)),
             color="black", size=1) +
  geom_vline(aes(xintercept= quantile(x=A, probs= 0.975)),
             color="black", size=1)

print(mean_plotA)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

mean_plotB <- ggplot(data = results, aes(x=B)) +
    geom_histogram(color = "black", fill = "darkolivegreen4") +
    geom_vline(aes(xintercept=mean(B)),
            color="red", linetype="dashed", size=1) + 
  geom_vline(aes(xintercept= quantile(x=B, probs= 0.025)),
             color="black", size=1) +
  geom_vline(aes(xintercept= quantile(x=B, probs= 0.975)),
             color="black", size=1)

print(mean_plotB)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

mean_plotC <- ggplot(data = results, aes(x=C)) +
    geom_histogram(color = "black", fill = "darksalmon") +
    geom_vline(aes(xintercept=mean(C)),
            color="red", linetype="dashed", size=1) + 
  geom_vline(aes(xintercept= quantile(x=C, probs= 0.025)),
             color="black", size=1) +
  geom_vline(aes(xintercept= quantile(x=C, probs= 0.975)),
             color="black", size=1)

print(mean_plotC)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Looking at my histograms, you can see that the means are different so I added a mean line (red) on the histogram graphs of the observed values and the quantile lines (red) represent the 95% confidence interval

Worked with Stephen on this code

5. Use the code from the upcoming April 2nd lecture (Randomization Tests) to design and conduct a randomization test for some of your own data.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.6     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

simulated <- results %>% select(A:C) %>% pivot_longer(cols= A:C, names_to = "group", values_to = "response")
  
  
##################################################
# function: get_pval_obs
# calculate p value from obs
# input: vector of simulated metrics
# output: lower, upper tail probability values
#------------------------------------------------- 
get_pval_obs <- function(df = d_frame) {
  test <- aov( response ~ group, data = df)
  p_val <- summary(test)[[1]][["Pr(>F)"]][1]
  return(p_val)
}

get_pval_obs()

## [1] 2.829588e-262

x_obs <- get_pval_obs()


##################################################
# function: get_pval_sim
# calculate p value from simulation
# input: list of observed metric, and vector of simulated metrics
# output: lower, upper tail probability values
#------------------------------------------------- 
get_pval_sim <- function(df = simulated) {
  test <- aov( response ~ group, data = df)
  p_val <- summary(test)[[1]][["Pr(>F)"]][1]
  return(p_val)
}

get_pval_sim()

## [1] 0.0004797687

Home Page