We present a working R (R Core Team, 2017) implementation and the code for an NSDUH example. The function ‘cs_sampling’ is a wrapper that takes a compiled Stan model (Carpenter, 2015), computes MCMC draws from the (pseudo) posterior, extracts the gradient function via ‘rstan’ (Stan Development Team, 2016), creates a replicate design, and estimates the variance of the gradient via the ‘survey’ package (Lumley, 2016). We then compute and apply the sandwich adjustment of equation 15. The resampling method used to estimate $J_{\pi}(\theta)$ in Algorithm 1 corresponds to a special case of the ‘mrbbootstrap’ replication option (Preston, 2009). For the weighted logistic Stan model, see Appendix B of the supplementary information of Williams and Savitsky (2018a): doi:10.1214/18-BA1143SUPP.

##################
# wt_logistic.stan
##################
/* Logistic regression with exponentiated weights */
functions{
  /**
   * Return the log probability of a sampling-weighted bernoulli
   * likelihood for an n-vector of observations, y
   *
   * @param y Array of binary (0/1) outcomes
   * @param mu Vector containing the linear predictor (logit scale)
   * @param weights Vector of observation-specific sampling weights
   * @param n Number of observations (length of y, mu, weights)
   */
  real wt_bin_lpmf(int[] y, vector mu, vector weights, int n){
    real check_term;
    check_term = 0.0;
    for( i in 1:n ) {
      check_term = check_term +
                   weights[i] * bernoulli_logit_lpmf(y[i] | mu[i]);
    }
    return check_term;
  }
}
data{
  int n;             // number of observations
  int k;             // number of linear predictors
  int y[n];          // response variable
  vector[n] weights; // observation-indexed weights
  matrix[n, k] X;    // design matrix
}
parameters{
  vector[k] beta; /* regression coefficients of the linear predictor */
}
transformed parameters{
  vector[n] mu;
  mu = X * beta; /* linear predictor */
} /* end transformed parameters block */
model{
  /* improper uniform prior on beta over (-inf, inf) */
  /* directly update the log probability with the weighted likelihood */
  target += wt_bin_lpmf(y | mu, weights, n);
}

###############
# cs_sampling.r
###############
require(rstan)
require(survey)
require(plyr)

# Take in a compiled Stan model, a list of Stan data, the names of the
# sampled parameters, and a survey design object (or replicate design).
# Return the Stan fit together with the original and adjusted parameter draws.
cs_sampling <- function(svydes, mod_stan, par_stan, data_stan,
                        ctrl_stan = list(chains = 1, iter = 2000,
                                         warmup = 1000, thin = 1),
                        rep_design = FALSE,
                        ctrl_rep = list(replicates = 100,
                                        type = "mrbbootstrap")){
  # run the Stan model
  print("stan fitting")
  out_stan <- sampling(object = mod_stan, data = data_stan, pars = par_stan,
                       chains = ctrl_stan$chains, iter = ctrl_stan$iter,
                       warmup = ctrl_stan$warmup, thin = ctrl_stan$thin)

  # posterior mean (across all chains)
  par_samps <- extract(out_stan, pars = par_stan, permuted = FALSE)
  par_hat <- colMeans(par_samps, dims = 2) # dims = 1 by chain; dims = 2 across chains

  # estimate the Hessian at the posterior mean
  Hhat <- -1*optimHess(par_hat, gr = function(x){grad_log_prob(out_stan, x)})

  # create a replicate design, unless one was supplied
  if(rep_design == TRUE){svyrep <- svydes
  } else{
    svyrep <- as.svrepdesign(design = svydes, type = ctrl_rep$type,
                             replicates = ctrl_rep$replicates)
  }

  # estimate Jhat = Var(gradient) via replication
  print("gradient evaluation")
  rep_tmp <- withReplicates(design = svyrep, theta = grad_par,
                            stanmod = mod_stan, standata = data_stan,
                            par_stan = par_stan, par_hat = par_hat)
  Jhat <- vcov(rep_tmp)

  # compute the sandwich adjustment of equation 15
  Hi <- solve(Hhat)
  V1 <- Hi%*%Jhat%*%Hi
  R1 <- chol(V1)
  R2i <- chol(Hi)
  R2 <- solve(R2i)
  R2R1 <- R2%*%R1

  # adjust the samples
  par_adj <- aaply(par_samps, 1, DEadj, par_hat = par_hat, R2R1 = R2R1,
                   .drop = FALSE) # keeps the dimensions of par_samps
  return(list(stan_fit = out_stan, sampled_parms = par_samps,
              adjusted_parms = par_adj))
} # end of cs_sampling
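To see why the rotation applied by ‘DEadj’ yields the intended covariance: draws centered at the posterior mean with covariance $\hat{H}^{-1}$ are mapped to draws with the sandwich covariance $\hat{H}^{-1}\hat{J}\hat{H}^{-1}$, while the mean is preserved. The following minimal sketch verifies this numerically; it is an illustration added here, not part of the published supplement, and Hhat, Jhat, par_hat, and the simulated draws are all toy values.

# Illustrative check of the DEadj rotation (toy values, not NSDUH output).
# If x ~ N(par_hat, Hi) and U = chol(Hi), then for R2R1 = solve(U) %*% R1
# the rotated draws have covariance t(R1) %*% R1 = Hi %*% Jhat %*% Hi.
library(MASS) # for mvrnorm
set.seed(1)
Hhat <- matrix(c(2, 0.3, 0.3, 1), 2, 2) # toy Hessian
Jhat <- matrix(c(3, 0.5, 0.5, 2), 2, 2) # toy Var(gradient)
Hi <- solve(Hhat)
par_hat <- c(0.5, -1)
draws <- mvrnorm(1e5, mu = par_hat, Sigma = Hi) # stand-in for unadjusted draws
R1 <- chol(Hi %*% Jhat %*% Hi)
R2R1 <- solve(chol(Hi)) %*% R1
adj <- sweep(draws, 2, par_hat) %*% R2R1 + rep(1, 1e5) %o% par_hat
colMeans(adj)      # approximately par_hat: the mean is preserved
cov(adj)           # approximately Hi %*% Jhat %*% Hi
Hi %*% Jhat %*% Hi # target sandwich covariance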
###################
# helper functions
###################
# grad_par is passed as theta to withReplicates(): for each set of replicate
# weights pwts, it re-instantiates the Stan model without sampling (0 chains)
# and evaluates the gradient of the log probability at par_hat.
# Stan will pass warnings from calling 0 chains, but still creates an
# out_stan object with a grad_log_prob() method.
grad_par <- function(pwts, svydata, stanmod, standata, par_stan, par_hat){
  # the svydata argument is ignored; it allows access to the survey object's data
  standata$weights <- pwts
  out_stan <- sampling(object = stanmod, data = standata, pars = par_stan,
                       chains = 0, warmup = 0)
  gradpar <- grad_log_prob(out_stan, par_hat)
  return(gradpar)
} # end of grad_par

# apply the matrix rotation to a single draw
DEadj <- function(par, par_hat, R2R1){
  par_adj <- (par - par_hat)%*%R2R1 + par_hat
  return(par_adj)
}

########################################################
# National Survey on Drug Use and Health example
# Estimate a logistic regression model via Stan;
# adjust the parameter distribution for the complex
# sample design
########################################################
setwd("...")
source("cs_sampling.r") # requires rstan
rstan_options(auto_write = TRUE)
mod <- stan_model("wt_logistic.stan") # compile the Stan code (doi:10.1214/18-BA1143SUPP)

# import the NSDUH data:
# https://www.datafiles.samhsa.gov/study-dataset/national-survey-drug-use-and-health-2014-nsduh-2014-ds0001-nid16876
load(file = "NSDUH_2014.RData")

# subset and clean up the large file
library(stringr)
# change names to all upper case
names(PUF2014_090718) <- str_to_upper(names(PUF2014_090718))
# QUESTID2: individual ID
# CIGMON:   past month smoking
# AMDEY2_U: past year depression (adults)
# CATAG6:   age groups
# ANALWT_C: analysis weights
# VESTR:    variance estimation strata
# VEREP:    variance estimation PSU (nested within strata)
dat14 <- PUF2014_090718[, c("QUESTID2", "CIGMON", "AMDEY2_U", "CATAG6",
                            "ANALWT_C", "VESTR", "VEREP")]
rm(PUF2014_090718); gc() # clean up memory

# subset to adults
dat14 <- dat14[as.numeric(dat14$CATAG6) > 1,]
# normalize the weights to sum to the sample size
dat14$WTS <- dat14$ANALWT_C*(length(dat14$ANALWT_C)/sum(dat14$ANALWT_C))

# create the survey design object
svy14 <- svydesign(ids = ~VEREP, strata = ~VESTR, weights = ~WTS,
                   data = dat14, nest = TRUE)

# create the list of inputs
X <- model.matrix( ~ AMDEY2_U, data = dat14)
y <- dat14$CIGMON
k <- dim(X)[2]
n <- length(y)
weights <- dat14$WTS
data_stan <- list(y = array(y, dim = n), X = X, k = k, n = n,
                  weights = array(weights, dim = n))
par_stan <- c("beta") # subset of parameters of interest (declared as beta in wt_logistic.stan)

# run Stan and the adjustment code
set.seed(12345) # fix the output for comparison
mod1 <- cs_sampling(svydes = svy14, mod_stan = mod, par_stan = par_stan,
                    data_stan = data_stan)
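Before comparing with the design-based MLE, the returned object can be checked directly. The brief inspection below is an illustrative addition, not part of the original listing: because the rotation in ‘DEadj’ is centered at the estimated posterior mean, the adjusted draws retain the means of the original draws while their spread is rescaled to the sandwich covariance.

# illustrative inspection of the cs_sampling output (added here, not in the
# original listing)
dim(mod1$sampled_parms) # iterations x chains x parameters
# posterior means are preserved exactly by the adjustment
colMeans(mod1$sampled_parms, dims = 2)
colMeans(mod1$adjusted_parms, dims = 2)
# spread is rescaled; typically wider under informative sampling and clustering
apply(mod1$sampled_parms[,1,], 2, sd)
apply(mod1$adjusted_parms[,1,], 2, sd)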
## compare to svyglm
svyglm1 <- svyglm(CIGMON ~ AMDEY2_U, design = svy14, family = quasibinomial())
# asymptotic normal 90% ellipse for the MLE
library(car)
MLell <- data.frame(ellipse(center = coef(svyglm1), shape = vcov(svyglm1),
                            radius = sqrt(qchisq(0.90, 2)), draw = FALSE),
                    Adjust = NA)
names(MLell) <- c("th0", "th1", "Adjust")
# asymptotic normal 90% intervals for the MLE
qML0 <- qnorm(c(0.05, 0.95), coef(svyglm1)[1], sqrt(diag(vcov(svyglm1)))[1])
qML1 <- qnorm(c(0.05, 0.95), coef(svyglm1)[2], sqrt(diag(vcov(svyglm1)))[2])

## plot before/after adjustment
library(ggplot2)
# joint distribution
datpl <- data.frame(rbind(mod1$sampled_parms[,1,], mod1$adjusted_parms[,1,]),
                    as.factor(c(rep("NO", 1000), rep("YES", 1000))))
names(datpl) <- c("th0", "th1", "Adjust")
plt1 <- ggplot(datpl, aes(th0, th1, color = Adjust, shape = Adjust)) +
  geom_point() +
  stat_ellipse(level = 0.90, type = "norm", size = 2) +
  geom_path(data = MLell, aes(th0, th1), size = 2, linetype = "dashed") +
  labs(x = expression(theta[0]), y = expression(theta[1]))
print(plt1)

# marginal distributions
df1 <- data.frame(datpl[,-2], par = 0, ML = coef(svyglm1)[1],
                  q5ML = qML0[1], q95ML = qML0[2])
df2 <- data.frame(datpl[,-1], par = 1, ML = coef(svyglm1)[2],
                  q5ML = qML1[1], q95ML = qML1[2])
names(df1) <- names(df2) <- c("Est", "Adjust", "Par", "ML", "q5ML", "q95ML")
datpl2 <- rbind(df1, df2)
plt2 <- ggplot(datpl2, aes(y = Est, x = Adjust, color = Adjust, fill = Adjust)) +
  geom_hline(aes(yintercept = ML), size = 1, linetype = "dotted") +
  geom_hline(aes(yintercept = q5ML), size = 1, linetype = "dashed") +
  geom_hline(aes(yintercept = q95ML), size = 1, linetype = "dashed") +
  geom_violin(trim = TRUE, draw_quantiles = c(0.05, 0.5, 0.95),
              alpha = 0.5, size = 1.5) +
  facet_grid(Par ~ ., scales = "free",
             labeller = label_bquote(theta[.(Par)])) +
  labs(x = NULL, y = NULL)
print(plt2)
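The comparison visualized by the violin plots can also be read off numerically. The short tabulation below is an illustrative addition, not part of the original listing, contrasting 90% equal-tailed intervals from the unadjusted and adjusted draws with the asymptotic normal MLE intervals:

# 90% equal-tailed intervals from the draws versus the asymptotic MLE
# intervals (an illustrative addition, not in the original listing)
ci90 <- function(x) quantile(x, c(0.05, 0.95))
rbind(unadjusted_th0 = ci90(mod1$sampled_parms[,1,1]),
      adjusted_th0   = ci90(mod1$adjusted_parms[,1,1]),
      ML_th0         = qML0,
      unadjusted_th1 = ci90(mod1$sampled_parms[,1,2]),
      adjusted_th1   = ci90(mod1$adjusted_parms[,1,2]),
      ML_th1         = qML1)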