-
Notifications
You must be signed in to change notification settings - Fork 1
/
04-reanalysisIII.Rmd
447 lines (356 loc) · 15 KB
/
04-reanalysisIII.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
# Re-analysis 3 {#studyIII}
This re-analysis is from the study
> Marton, Giulia, et al. "Patients’ health locus of control and preferences about the role that they want to play in the medical decision-making process." Psychology, Health & Medicine (2020): 1-7
```{r warning=F, message=F}
library(bpcs)
library(tidyverse)
library(knitr)
library(loo)
library(cmdstanr)
PATH_TO_CMDSTAN <- paste(Sys.getenv("HOME"), '/.cmdstan/cmdstan-2.27.0', sep = '')
set_cmdstan_path(PATH_TO_CMDSTAN)
set.seed(99)
```
## Importing the data
The data from this paper was made available upon request and below we exemplify a few rows of how the original dataset looks like.
Let's starting importing the data.
```{r}
d<-read.table("data/MHLC.txt", sep="\t", header=T)
d<-as.data.frame(d)
sample_n(d, size=5) %>%
kable(caption = 'Sample of rows from the original data') %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
kableExtra::scroll_box(width = "100%")
```
As we can see, the data is in a wide format. Before we pivot it to longer let's add a column that indicates the subject ID.
```{r}
d <- d %>% mutate(SubjectID = row_number())
```
Now let's pivot it to the longer format
```{r}
cols_to_pivot<-colnames(d)[1:10]
d_longer<-tidyr::pivot_longer(d, cols=all_of(cols_to_pivot), names_to='comparison', values_to='y')
```
Now let's divide the comparison into two vectors (choice0 and choice1). So it fits the bpcs format
```{r}
comp_cols <- str_split_fixed(d_longer$comparison, "", 2)
d_longer$choice0 <- comp_cols[,1]
d_longer$choice1 <- comp_cols[,2]
```
The data frame now looks like this:
```{r}
dplyr::sample_n(d_longer, size=10) %>%
kable() %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
kableExtra::scroll_box(width = "100%")
```
Now that we have setup the data frame correctly for the bpcs package, we can use it to model the problem.
Let's just rename a few values so it is easier to understand.
```{r}
#choice0
d_longer$choice0 <- recode(d_longer$choice0, 'A'='Active')
d_longer$choice0 <- recode(d_longer$choice0, 'B'='Active-Collaborative')
d_longer$choice0 <- recode(d_longer$choice0, 'C'='Collaborative')
d_longer$choice0 <- recode(d_longer$choice0, 'D'='Passive-Collaborative')
d_longer$choice0 <- recode(d_longer$choice0, 'E'='Passive')
#choice1
d_longer$choice1 <- recode(d_longer$choice1, 'A'='Active')
d_longer$choice1 <- recode(d_longer$choice1, 'B'='Active-Collaborative')
d_longer$choice1 <- recode(d_longer$choice1, 'C'='Collaborative')
d_longer$choice1 <- recode(d_longer$choice1, 'D'='Passive-Collaborative')
d_longer$choice1 <- recode(d_longer$choice1, 'E'='Passive')
```
Our final step in the preparation of the dataset is to standardize the values of the MHLOC scales. The values provided in the dataset correspond to the sum of the values of the scale and ranges from 3-18 or from 6-36. Therefore we will scale them accordingly for more stable inference
```{r}
d_longer <- d_longer %>%
mutate(Internal = scale(MHLC_INTERNAL),
Chance = scale(MHLC_CHANCE),
Doctors = scale(MHLC_DOCTORS),
OtherPeople = scale(MHLC_OTHER_PEOPLE))
```
The final modified dataset looks like:
```{r}
sample_n(d_longer, size=10) %>%
kable() %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
kableExtra::scroll_box(width = "100%")
```
Now we can proceed with the analysis.
## Simple Bradley-Terry models
Let's start with a simple BT without considering any subject predictors. Just to evaluate the average probability of people being Active, Active-Collaborative, Collaborative, Passive-Collaborative or Passive.
```{r r3bt, eval=F}
m1 <-
bpc(
d_longer,
player0 = 'choice0',
player1 = 'choice1',
result_column = 'y',
model_type = 'bt',
priors = list(prior_lambda_std = 3.0),
iter = 2000,
show_chain_messages = T
)
save_bpc_model(m1, 'm_hloc', './fittedmodels')
```
```{r echo=F}
m1 <- load_bpc_model('./fittedmodels/m_hloc.RDS')
```
Let's investigate model convergence with shinystan and with check convergence function. Everything seems fine.
```{r eval=F}
launch_shinystan(m1)
```
Now let's get the waic
```{r cache=T}
m1_waic <- get_waic(m1)
m1_waic
```
## Subject predictors model
We have different HLC that can be used as predictors for the response. Let's create a single model with all the predictors.
```{r r3btS, eval=F}
m2 <-
bpc(
d_longer,
player0 = 'choice0',
player1 = 'choice1',
result_column = 'y',
subject_predictors = c('Internal', 'Chance', 'Doctors', 'OtherPeople'),
model_type = 'bt-subjectpredictors',
priors = list(prior_lambda_std = 3.0,
prior_S_std = 3.0),
iter = 2000,
show_chain_messages = T
)
save_bpc_model(m2, 'm_mhloc_subjectpred', './fittedmodels')
```
```{r echo=F}
m2 <- load_bpc_model('fittedmodels/m_mhloc_subjectpred.RDS')
```
Diagnostics:
```{r eval=F}
launch_shinystan(m2)
```
```{r cache=T}
m2_waic <- get_waic(m2)
m2_waic
```
## Subject predictors and random effects
Now let's create a third model that also compensate for multiple judgment with a random effects variable.
This model adds 153*4 variables, so sampling will take longer and be a bit more complex.
```{r r3btSU, eval=F}
m3 <-
bpc(
d_longer,
player0 = 'choice0',
player1 = 'choice1',
result_column = 'y',
subject_predictors = c('Internal', 'Chance', 'Doctors', 'OtherPeople'),
cluster = c('SubjectID'),
model_type = 'bt-subjectpredictors-U',
priors = list(prior_lambda_std = 3.0,
prior_S_std = 3.0,
prior_U1_std = 3.0),
iter = 2000,
show_chain_messages = T
)
save_bpc_model(m3, 'm_mhloc_subjectpred_U', './fittedmodels')
```
```{r echo=F}
m3 <- load_bpc_model('fittedmodels/m_mhloc_subjectpred_U.RDS')
```
Diagnostics
For a quick check
```{r cache=T}
check_convergence_diagnostics(m3)
```
```{r eval=F}
launch_shinystan(m3)
```
```{r cache=T, message=F, warning=F}
m3_waic <- get_waic(m3)
m3_waic
```
## Comparing the WAIC
Let's compare the WAIC of the three models
```{r}
loo::loo_compare(m1_waic,m2_waic, m3_waic)
```
## Plots and tables
Now that we see that all models have proper convergence and that the model m3 has a better fit we will generate some plots and tables to help understand the problem
```{r cache=T}
lambda <- get_parameters(m3, params = 'lambda', n_eff = T,keep_par_name = F)
Spar <- get_parameters(m3, params = 'S', n_eff = T)
U_std <- get_parameters(m3, params = 'U1_std', n_eff = T)
```
Let's create a custom table based on these parameters. But first we will rename the parameters a bit so the table reads a bit better. We are here removing the S[*] from the parameter
```{r}
Spar$Parameter <- stringr::str_sub(Spar$Parameter,start = 3, end=-2)
```
### Table lambda and U
Now we can create the table
```{r}
#merge the datasets
df_table <- rbind(lambda, U_std)
rownames(df_table)<- NULL
#creating the table
df_table %>%
kable(caption = 'Lambda parameters of the model and the random effects standard deviation',
digits = 2,
col.names = c('Parameter', 'Mean','Median','HPD lower', 'HPD lower','N. Eff. Samples')) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
kableExtra::scroll_box(width = "100%")
```
```{r echo=F, eval=F}
df_table %>%
kable(format='latex',
caption = 'Lambda parameters of the model and the random effects standard deviation',
digits = 2,
col.names = c('Parameter', 'Mean','Median','HPD lower', 'HPD lower', 'N. Eff. Samples'),
booktabs=T) %>%
kableExtra::kable_styling()
```
### Subject predictors table
```{r}
S <- Spar %>% tidyr::separate(Parameter,c('Role','MHLOC'), sep=",")
S$MHLOC <- recode(S$MHLOC, 'OtherPeople'= 'Other people')
rownames(S) <- NULL
```
```{r}
S %>%
dplyr::arrange(Role) %>%
select(-Role) %>%
kable(format='html',
caption = 'Subject predictors parameters by role',
digits = 2,
col.names = c('Parameter', 'Mean', 'Median','HPD lower', 'HPD lower', 'N. Eff. Samples'),
booktabs=T) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
kableExtra::pack_rows("Active", 1, 4) %>%
kableExtra::pack_rows("Active-Collaborative", 5, 8) %>%
kableExtra::pack_rows("Collaborative", 9, 12) %>%
kableExtra::pack_rows("Passive-Collaborative", 13, 16) %>%
kableExtra::pack_rows("Passive", 17, 20) %>%
kableExtra::scroll_box(width = "100%")
```
```{r echo=F, eval=F}
S %>%
dplyr::arrange(Role) %>%
select(-Role) %>%
kable(format='latex',
caption = 'Subject predictors parameters by role',
digits = 2,
col.names = c('Parameter', 'Mean', 'Median','HPD lower', 'HPD lower', 'N. Eff. Samples'),
booktabs=T) %>%
kableExtra::kable_styling() %>%
kableExtra::pack_rows("Active", 1, 4) %>%
kableExtra::pack_rows("Active-Collaborative", 5, 8) %>%
kableExtra::pack_rows("Collaborative", 9, 12) %>%
kableExtra::pack_rows("Passive-Collaborative", 13, 16) %>%
kableExtra::pack_rows("Passive", 17, 20)
```
### Plot
```{r}
S$Role <- factor(S$Role, levels = c('Active', 'Active-Collaborative', 'Collaborative', 'Passive-Collaborative', 'Passive'))
ggplot(S, aes(x=MHLOC))+
geom_pointrange(aes(
ymin = HPD_lower,
ymax = HPD_higher,
y = Mean,
group=MHLOC))+
facet_wrap(~Role, nrow = 3) + #Dividing the plot into three by species
labs(title = 'Subject predictors estimates with the HPD interval',
y = 'Estimate',
x = 'HLOC dimension')+
theme_bw()+ # A black and white theme
# scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
theme(legend.position="bottom") #small adjustments to the theme
```
### Probability tables
First let's create a data frame with the cases we want to investigate. We will utilize the model_type option in the probabilities table so we can average out the values of the random effects in the subjects. In this table, we will only investigate a few of the conditions, but of course it is possible to do a much more expansive analysis
Let's create a new data frame with the conditions we want to investigate.
Mainly we are just investigating:
* Between the choice of Active and Passive how does going from -2 to 2 standard deviations over the mean influences the probability in the Internal and the Doctors
* Between the choice of Active-Collaborative and Collaborative how does going from -2 to 2 standard deviations over the mean influences the probability in the Internal and the Doctors
* Between the choice of Collaborative and Passive-Collaborative how does going from -2 to 2 standard deviations over the mean influences the probability in the Internal and the Doctors
```{r}
newdata <- tibble::tribble(~choice1, ~choice0, ~Internal, ~Chance, ~Doctors, ~OtherPeople,
"Active", "Passive", 0, 0, 0, 0,
"Active", "Passive", -2, 0, 0, 0,
"Active", "Passive", 2, 0, 0, 0,
"Active", "Passive", 0, 0, -2, 0,
"Active", "Passive", 0, 0, 2, 0,
"Active-Collaborative", "Collaborative", 0, 0, 0, 0,
"Active-Collaborative", "Collaborative", -2, 0, 0, 0,
"Active-Collaborative", "Collaborative", 2, 0, 0, 0,
"Active-Collaborative", "Collaborative", 0, 0, -2, 0,
"Active-Collaborative", "Collaborative", 0, 0, 2, 0,
"Collaborative", "Passive-Collaborative", 0, 0, 0, 0,
"Collaborative", "Passive-Collaborative", 0, -2, 0, 0,
"Collaborative", "Passive-Collaborative", 0, 2, 0, 0,
"Collaborative", "Passive-Collaborative", 0, 0, 0, -2,
"Collaborative", "Passive-Collaborative", 0, 0, 0, 2
)
#Now we can calculate the probabilities
prob_hloc <-
get_probabilities_df(
m3,
newdata = newdata,
model_type = 'bt-subjectpredictors') #here we are assuming zero for the random effects
```
```{r}
prob_hloc %>%
select(-j_beats_i) %>%
rename(Probability=i_beats_j) %>%
kable(caption = "Probabilities of selecting role i instead of j based on changes of the values of the HLOC dimensions",
digits = 2,
booktabs = T) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
kableExtra::add_header_above(c("Roles" = 2, "HLOC dimensions" = 4, " "=1)) %>%
kableExtra::scroll_box(width = "100%")
```
```{r eval=F, echo=F}
#Recoding to make it a bit shorter
prob_hloc$i <- recode(prob_hloc$i, 'Active-Collaborative'='Act.-Collab.')
prob_hloc$i <- recode(prob_hloc$i, 'Collaborative'='Collab.')
prob_hloc$j <- recode(prob_hloc$j, 'Collaborative'='Collab.')
prob_hloc$j <- recode(prob_hloc$j, 'Passive-Collaborative'='Pass.-Collab.')
prob_hloc %>%
select(-j_beats_i) %>%
rename(Probability=i_beats_j) %>%
kable(
format='latex',
caption = "Probabilities of selecting role i instead of j based on changes of the values of the HLOC dimensions",
digits = 2,
booktabs = T) %>%
kableExtra::add_header_above(c("Roles" = 2, "HLOC dimensions" = 4, " "=1))
```
## Assessing fitness of items and judges
For simplification we will be using only the median (and not the full posterior) to assess the fitness of judges
```{r}
std_judges <- U_std$Median #variance of the judges
U <- get_parameters(m3, params = 'U1', n_eff = T)
U_median <- U$Median
U_median <- U_median[-length(U_median)]
U_fit <- data.frame(U=U_median, y=rep(0,length(U_median)), q90=as.factor(as.numeric(U_median > std_judges*1.65)), q95=as.factor(as.numeric(U_median > std_judges*1.96)))
ggplot(data=U_fit, aes(x=U, y=y, color=q90))+
geom_jitter() +
geom_vline(xintercept = c(-1.65*std_judges,1.65*3), linetype = "dashed", color="red")+
geom_vline(xintercept = c(-1.95*std_judges,1.95*3), linetype = "dashed", color="cyan")+
theme(legend.position="none", axis.title.y = element_blank()) #small adjustments to the theme
```
From this figure we can see that none of the judges rated a value outside the 90% and the 95% interval
Comparing to the prior distribution (with std of 3.0)
```{r}
ggplot(data=U_fit, aes(x=U, y=y, color=q90))+
geom_jitter() +
geom_vline(xintercept = c(-1.65*3,1.65*3), linetype = "dashed", color="red")+
geom_vline(xintercept = c(-1.95*3,1.95*3), linetype = "dashed", color="cyan")+
theme(legend.position="none", axis.title.y = element_blank()) #small adjustments to the theme
```
Finally we can look also if any of the judges can be considered to be outliers
Let's use a Hample filter on the median values
```{r}
lower_bound <- median(U_median) - 3 * mad(U_median, constant = 1)
upper_bound <- median(U_median) + 3 * mad(U_median, constant = 1)
which(U_median< lower_bound | U_median > upper_bound)
```
Within the values we have a few pairs of judges/items to be consider outliers, although without a misfit in terms of model