#' Generate sequential household case IDs
#'
#' Builds `n` consecutive identifiers starting at 201011000001 and returns
#' them as plain-digit strings.
#'
#' @param n Number of IDs to generate (a single non-negative number).
#' @return A character vector of length `n`; `character(0)` when `n` is 0.
generate_case_ids <- function(n) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 0)
  start_id <- 201011000001
  # seq_len() handles n = 0, where seq(start, start - 1, by = 1) would error.
  # sprintf("%.0f") always prints plain digits, whereas as.character() can
  # fall back to scientific notation for large round doubles.
  sprintf("%.0f", start_id + seq_len(n) - 1)
}
# 10 Appendix A: Sample Data
10.1 Introduction
The sample data used in this book was generated from the Malawi Integrated Household Survey Fifth Edition 2018-2019 downloaded from here.
The data was generated randomly using the following code:
10.2 Define functions used
10.2.1 Create case_id generation function
10.2.2 Create HHID generation function
#' Generate random household IDs
#'
#' Each ID is a string of 32 characters drawn with replacement from the
#' digits 0-9 and the letters a-f. Draws use the current RNG state, so call
#' `set.seed()` beforehand for reproducible IDs.
#'
#' @param n Number of IDs to generate (a single non-negative number).
#' @return A character vector of length `n`; `character(0)` when `n` is 0.
generate_HHIDs <- function(n) {
  hex_chars <- c(0:9, letters[1:6])
  # seq_len() avoids the 1:0 trap (which would yield two IDs for n = 0), and
  # vapply() guarantees a character vector even for empty input.
  vapply(
    seq_len(n),
    function(i) paste(sample(hex_chars, 32, replace = TRUE), collapse = ""),
    character(1)
  )
}
# 10.3 Set seed and number of households to generate
# Set seed so the random draws below are reproducible
set.seed(123)
# Set number of households to generate
households <- 100
# 10.4 Load Original data and extract food and unit lists
# Import Malawi IHS5 HCES consumption module data
original_data <-
  haven::read_dta(here::here("data-ignore", "IHS5", "HH_MOD_G1.dta"))

# Extract "standard" food list from the original data
# (one row per distinct food code, hh_g02)
food_list <- original_data |>
  dplyr::distinct(hh_g02)

# Extract "non-standard" food lists from the original data:
# every distinct (food code, free-text description) pair with a non-empty
# description ...
other_food_list_options <- original_data |>
  dplyr::distinct(hh_g02, hh_g01_oth) |>
  dplyr::filter(hh_g01_oth != "")

# ... and the food codes that have at least one such description.
# arrange() is given the sort key explicitly; with no arguments it is a no-op.
other_food_list_codes <- other_food_list_options |>
  dplyr::distinct(hh_g02) |>
  dplyr::arrange(hh_g02)

# Extract Food unit lists from the original data
# (distinct combinations of the unit-related columns)
food_unit_lists <- original_data |>
  dplyr::distinct(hh_g03b, hh_g03b_label, hh_g03b_oth, hh_g03c, hh_g03c_1)

# Number of foods in the food list
n_foods <- nrow(food_list)
# 10.5 Data creation
10.5.1 Create HHIDs
# Create case_ids (one per household)
case_id <- generate_case_ids(households)
# Generate HHIDs (one per household)
hhids <- generate_HHIDs(households)
# 10.5.2 Create data
# Draw one of the free-text "other" descriptions recorded in
# `other_food_list_options` for the food code `code`.
pick_other_food <- function(code) {
  other_food_list_options |>
    dplyr::filter(hh_g02 == code) |>
    dplyr::pull(hh_g01_oth) |>
    sample(1)
}

# Build one row per (household, food item) and fill it with random draws.
sample_data <- tibble::tibble(
  case_id = rep(case_id, each = n_foods),
  HHID = rep(hhids, each = n_foods),
  hh_g00_1 = 2,
  hh_g00_2 = 2,
  # Unnamed data-frame argument: its column (hh_g02) is spliced into the
  # tibble, with the food list repeated once per household.
  food_list |> dplyr::slice(rep(seq_len(dplyr::n()), households)),
  hh_g01 = sample(
    original_data$hh_g01,
    # replace = T,
    # Use n_foods rather than a hard-coded 142 so the length always matches
    # the other columns.
    size = households * n_foods
  )
) |>
  # Add "other food items"
  # NOTE: case_when() evaluates every right-hand side for every row, so all
  # twelve draws happen per row, in this order. The order is kept as-is so
  # the random stream (and therefore the generated data) is unchanged.
  dplyr::rowwise() |>
  dplyr::mutate(
    hh_g01_oth = dplyr::case_when(
      hh_g02 == 414 & hh_g01 == 1 ~ pick_other_food(414),
      hh_g02 == 515 & hh_g01 == 1 ~ pick_other_food(515),
      hh_g02 == 117 & hh_g01 == 1 ~ pick_other_food(117),
      hh_g02 == 830 & hh_g01 == 1 ~ pick_other_food(830),
      hh_g02 == 310 & hh_g01 == 1 ~ pick_other_food(310),
      hh_g02 == 412 & hh_g01 == 1 ~ pick_other_food(412),
      hh_g02 == 610 & hh_g01 == 1 ~ pick_other_food(610),
      hh_g02 == 916 & hh_g01 == 1 ~ pick_other_food(916),
      hh_g02 == 209 & hh_g01 == 1 ~ pick_other_food(209),
      hh_g02 == 709 & hh_g01 == 1 ~ pick_other_food(709),
      hh_g02 == 818 & hh_g01 == 1 ~ pick_other_food(818),
      hh_g02 == 804 & hh_g01 == 1 ~ pick_other_food(804),
      TRUE ~ ""
    )
  ) |>
  # Random quantity (1..10 or 0.5..9.5) only where hh_g01 == 1. The data is
  # still rowwise here, so a fresh value is drawn per row.
  dplyr::mutate(
    hh_g03a = dplyr::case_when(
      hh_g01 == 1 ~ sample(c(1:10, 0.5:10), 1),
      TRUE ~ NA
    )
  ) |>
  # Random row index into food_unit_lists, again only where hh_g01 == 1.
  # NOTE(review): 214 is presumably nrow(food_unit_lists) - confirm.
  dplyr::mutate(
    unit_key = dplyr::case_when(
      hh_g01 == 1 ~ sample(1:214, 1),
      TRUE ~ NA
    )
  ) |>
  # Look up the unit columns for the drawn index (NA index gives NA values).
  dplyr::mutate(
    hh_g03b = food_unit_lists$hh_g03b[unit_key],
    hh_g03b_label = food_unit_lists$hh_g03b_label[unit_key],
    hh_g03b_oth = food_unit_lists$hh_g03b_oth[unit_key],
    hh_g03c = food_unit_lists$hh_g03c[unit_key],
    hh_g03c_1 = food_unit_lists$hh_g03c_1[unit_key]
  ) |>
  # Drop the helper index. With a leading negative selection, the positive
  # names that followed it did not reorder or drop anything.
  dplyr::select(-"unit_key") |>
  # Leave rowwise mode so later steps work on an ordinary tibble.
  dplyr::ungroup()
# Add the rest of the columns
# Prepend a zero-row copy of the original module so the sample carries every
# original column (filled with NA). slice(0) is empty by construction, so no
# real survey rows can leak into the sample, whatever case_id contains.
sample_data <- original_data |>
  dplyr::slice(0) |>
  dplyr::bind_rows(sample_data)

# Attach stata column labels
for (i in names(sample_data)) {
  attr(sample_data[[i]], "label") <- attr(original_data[[i]], "label")
}

# Export sample data as stata file
haven::write_dta(
  sample_data,
  here::here("data", "sample_data", "MWI-IHSV", "HH_MOD_G1_vMAPS.dta")
)
# 10.5.3 Create hh_mod_a_filt.dta file
# One row per household with a randomly assigned region code (1, 2 or 3),
# written out as the household filter file.
sample_data |>
  dplyr::distinct(case_id, HHID) |>
  # Draw per row (rather than one vectorised draw) to keep the random stream,
  # and therefore the generated regions, unchanged.
  dplyr::rowwise() |>
  dplyr::mutate(region = sample(1:3, 1)) |>
  haven::write_dta(
    here::here("data", "sample_data", "MWI-IHSV", "hh_mod_a_filt_vMAPS.dta")
  )
# 10.5.4 Create hh_roster.dta
# Import original roster from IHS5
ihs5_roster <-
  haven::read_dta(here::here("data-ignore", "IHS5", "HH_MOD_B.dta"))

# Create a data frame with the case_ids and HHIDs of our sample data
sample_roster <- sample_data |>
  dplyr::distinct(case_id, HHID)

# Replicate each row a random number of times between 1 and 10 to simulate
# household members
n <- sample(1:10, nrow(sample_roster), replace = TRUE)
sample_roster <- sample_roster[rep(seq_len(nrow(sample_roster)), times = n), ]

# Create other variables: hh_b03 and hh_b05a are drawn per member from the
# values observed in the original roster; hh_b05b is a random 1..11 only
# where hh_b05a < 5, otherwise NA.
sample_roster <- sample_roster |>
  dplyr::rowwise() |>
  dplyr::mutate(
    hh_b03 = sample(ihs5_roster$hh_b03, 1),
    hh_b05a = sample(ihs5_roster$hh_b05a, 1),
    hh_b05b = dplyr::case_when(
      hh_b05a < 5 ~ sample(1:11, 1),
      TRUE ~ NA
    )
  )

# Add the other blank columns from the original dataset. slice(0) gives an
# empty template by construction, instead of relying on no case_id being "".
sample_roster <- ihs5_roster |>
  dplyr::slice(0) |>
  dplyr::bind_rows(sample_roster)

# Attach stata column labels
for (i in names(sample_roster)) {
  attr(sample_roster[[i]], "label") <- attr(ihs5_roster[[i]], "label")
}

# Write out the sample roster
haven::write_dta(
  sample_roster,
  here::here("data", "sample_data", "MWI-IHSV", "HH_MOD_B_vMAPS.dta")
)
# 10.5.5 Create sample “HH_MOD_D.dta”
# Import original data
original_health <-
  haven::read_dta(here::here("data-ignore", "IHS5", "HH_MOD_D.dta"))

# Use the sample_roster to create a sample_health dataset: one row per
# roster member, with hh_d05a / hh_d05b drawn from the values observed in
# the original module.
sample_health <- sample_roster |>
  dplyr::select(case_id, HHID) |>
  dplyr::rowwise() |>
  dplyr::mutate(
    hh_d05a = sample(c(original_health$hh_d05a), 1),
    hh_d05b = sample(original_health$hh_d05b, 1)
  )

# Add the other blank columns from the original dataset. slice(0) gives an
# empty template by construction, instead of relying on no case_id being "".
sample_health <- original_health |>
  dplyr::slice(0) |>
  dplyr::bind_rows(sample_health)

# Attach stata column labels. The labels must come from original_health;
# copying from sample_health onto itself was a no-op.
for (i in names(sample_health)) {
  attr(sample_health[[i]], "label") <- attr(original_health[[i]], "label")
}

# Write out the sample health module
haven::write_dta(
  sample_health,
  here::here("data", "sample_data", "MWI-IHSV", "HH_MOD_D_vMAPS.dta")
)