R/simulate_hierarchically_sparse_data.R
genHierSparseData.Rd
function to generate data with hierarchical sparsity
genHierSparseData( ncats, nvars, nobs, nobs.test = 100, hier.sparsity.param = 0.5, avg.hier.zeros = NULL, prop.zero.vars = 0.5, effect.size.max = 0.5, misspecification.prop = 0, family = c("gaussian", "binomial", "coxph"), sd = 1, snr = NULL, beta = NULL, tau = 10, covar = 0 )
Argument | Description |
---|---|
ncats | number of categories to stratify on |
nvars | number of variables |
nobs | number of observations per stratum to simulate |
nobs.test | number of independent test observations per stratum to simulate |
hier.sparsity.param | parameter between 0 and 1 which determines how much hierarchical sparsity there is. To achieve a desired total level of sparsity among the variables with hierarchical sparsity, this parameter can be estimated using the function 'estimate.hier.sparsity.param' |
avg.hier.zeros | desired proportion of zero variables among the variables with hierarchical zero patterns. If this is specified, it overrides the given hier.sparsity.param value, which is then estimated instead. This estimation takes a while. |
prop.zero.vars | proportion of all variables that will be zero across all strata |
effect.size.max | maximum magnitude of the true effect sizes |
misspecification.prop | proportion of variables with hierarchical missingness misspecified |
family | family for the response variable |
sd | standard deviation for gaussian simulations |
snr | signal-to-noise ratio (only used for `family = "gaussian"`) |
beta | a matrix of true beta values. If given, then no beta will be created and data will be simulated from the given beta |
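tau | rate parameter for the exponential distribution used when generating time-to-event outcomes (presumably only relevant for `family = "coxph"`) |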
covar | scalar, pairwise covariance term for covariates |
# Examples for genHierSparseData(): simulate stratified data whose true
# coefficients follow a hierarchical sparsity pattern, then fit a
# cross-validated vennLasso model on the simulated data.

# fix the RNG state so the simulated data are reproducible
set.seed(123)

dat.sim <- genHierSparseData(ncats = 3, nvars = 100, nobs = 200)

# estimate hier.sparsity.param for 0.15 total proportion of nonzero variables
# among vars with hierarchical zero patterns
# (wrapped in `if (FALSE)` so this block is never executed when the
# examples are run)
if (FALSE) {
  hsp <- estimate.hier.sparsity.param(ncats = 3, nvars = 50,
                                      avg.hier.zeros = 0.15, nsims = 100)
}

# the above results in the following value
hsp <- 0.6270698

# check that this does indeed achieve the desired level of sparsity
mean(replicate(50, mean(genHierSparseBeta(ncats = 3, nvars = 50,
                                          hier.sparsity.param = hsp) != 0)))
#> [1] 0.1527

dat.sim2 <- genHierSparseData(ncats = 3, nvars = 100, nobs = 200,
                              hier.sparsity.param = hsp)

sparseBeta <- genHierSparseBeta(ncats = 3, nvars = 100,
                                hier.sparsity.param = hsp)

## generate data with already generated beta
dat.sim3 <- genHierSparseData(ncats = 3, nvars = 100, nobs = 200,
                              beta = sparseBeta)

## complete example:
## 50% sparsity:
hsp <- 0.2626451

dat.sim <- genHierSparseData(ncats = 3, nvars = 25,
                             nobs = 150, nobs.test = 1000,
                             hier.sparsity.param = hsp,
                             prop.zero.vars = 0.5,
                             effect.size.max = 0.25,
                             family = "gaussian")

# training and test pieces of the simulated data set
x        <- dat.sim$x
x.test   <- dat.sim$x.test
y        <- dat.sim$y
y.test   <- dat.sim$y.test
grp      <- dat.sim$group.ind
grp.test <- dat.sim$group.ind.test

fit.adapt <- cv.vennLasso(x, y,
                          grp,
                          adaptive.lasso = TRUE,
                          nlambda        = 25,
                          family         = "gaussian",
                          abs.tol        = 1e-5,
                          rel.tol        = 1e-5,
                          maxit          = 1000,
                          irls.maxit     = 15L,
                          gamma          = 0.2,
                          standardize    = FALSE,
                          intercept      = TRUE,
                          nfolds         = 3,
                          model.matrix   = TRUE)

# predict on the test set at the lambda stored as lambda.min in the CV fit
preds.a <- predict(fit.adapt$vennLasso.fit, x.test, grp.test,
                   s = fit.adapt$lambda.min,
                   type = 'response')