set.seed(1) G <- 2 N <- 1000 R <- 1000 q <- c(0.6, 0.4) e <- c(0.3, 0.7) N_t <- N * q * e tau_fs <- c(0.2, 0.3)
outcome <- foreach (g = 1:G, .combine = "rbind") %do% { outcome_g <- data.frame( g = g, y_0 = rnorm(N * q[g], mean = 0, sd = 1), y_1 = rnorm(N * q[g], mean = tau_fs[g], sd = 1) ) return(outcome_g) } head(outcome)
## g y_0 y_1 ## 1 1 -0.6264538 -0.14106698 ## 2 1 0.1836433 1.70242453 ## 3 1 -0.8356286 0.72830771 ## 4 1 1.5952808 0.74219136 ## 5 1 0.3295078 0.06332664 ## 6 1 -0.8204684 -0.93673385
outcome <- outcome %>% dplyr::group_by(g) %>% dplyr::mutate(w = (1:length(g) %in% sample(length(g), N_t[g]))) %>% dplyr::ungroup() head(outcome)
## # A tibble: 6 x 4 ## g y_0 y_1 w ## <int> <dbl> <dbl> <lgl> ## 1 1 -0.626 -0.141 FALSE ## 2 1 0.184 1.70 FALSE ## 3 1 -0.836 0.728 FALSE ## 4 1 1.60 0.742 FALSE ## 5 1 0.330 0.0633 FALSE ## 6 1 -0.820 -0.937 FALSE
outcome_observed <- outcome %>% dplyr::mutate(y = y_0 * (1 - w) + y_1 * w) %>% dplyr::select(g, y, w) head(outcome_observed)
## # A tibble: 6 x 3 ## g y w ## <int> <dbl> <lgl> ## 1 1 -0.626 FALSE ## 2 1 0.184 FALSE ## 3 1 -0.836 FALSE ## 4 1 1.60 FALSE ## 5 1 0.330 FALSE ## 6 1 -0.820 FALSE
weight <- outcome_observed %>% dplyr::group_by(g) %>% dplyr::summarise( e = sum(w) / length(w), q = length(g) ) %>% dplyr::ungroup() %>% dplyr::mutate( q = q / sum(q), lambda_0 = q, lambda_1 = q * e * (1 - e) / sum(q * e * (1 - e)) ) weight
## # A tibble: 2 x 5 ## g e q lambda_0 lambda_1 ## <int> <dbl> <dbl> <dbl> <dbl> ## 1 1 0.3 0.6 0.6 0.6 ## 2 2 0.7 0.4 0.4 0.4
statistics_observed <- outcome_observed %>% dplyr::group_by(g) %>% dplyr::summarise( tau = sum(y * w) / sum(w) - sum(y * (1 - w) / sum(1 - w)), ) %>% dplyr::ungroup() statistics_observed
## # A tibble: 2 x 2 ## g tau ## <int> <dbl> ## 1 1 0.175 ## 2 2 0.324
statistics_observed <- statistics_observed %>% dplyr::left_join(weight, by = "g") %>% dplyr::summarise( tau_0 = sum(tau * lambda_0) %>% abs(), tau_1 = sum(tau * lambda_1) %>% abs() ) statistics_observed
## # A tibble: 1 x 2 ## tau_0 tau_1 ## <dbl> <dbl> ## 1 0.235 0.235
statistics_simulated <- foreach (r = 1:R, .combine = "rbind") %do% { statistics_simulated <- outcome_observed %>% dplyr::left_join(weight, by = "g") %>% dplyr::group_by(g) %>% dplyr::mutate(w = (1:length(g) %in% sample(length(g), N_t[g]))) %>% dplyr::summarise( tau = mean(y * w) - mean(y * (1 - w)), ) %>% dplyr::ungroup() %>% dplyr::left_join(weight, by = "g") %>% dplyr::summarise( tau_0 = sum(tau * lambda_0) %>% abs(), tau_1 = sum(tau * lambda_1) %>% abs() ) return(statistics_simulated) }
p_value_0 <- mean(statistics_observed$tau_0 < statistics_simulated$tau_0) p_value_0
## [1] 0
p_value_1 <- mean(statistics_observed$tau_1 < statistics_simulated$tau_1) p_value_1
## [1] 0
We can estimate the difference in means for each stratum: \[ \hat{\tau}_{fsg} \equiv \overline{Y}_{tg}^{obs} - \overline{Y}_{cg}^{obs} \equiv \frac{1}{N_{tg}} \sum_{i: G_{ig} = 1} W_i \cdot Y_i^{obs} - \frac{1}{N_{cg}} \sum_{i: G_{ig} = 0} (1 - W_i) \cdot Y_i^{obs}. \]
Then, we can take the weighted average: \[ \hat{\tau}_{fs} \equiv \sum_{g = 1}^G q_g \hat{\tau}_{fsg}. \]
With the fixed stratum size, the unbiasedness of each within-stratum estimator implies unbiasedness of \(\hat{\tau}_{fs}\) to \(\tau_{fs}\).