7 Extract Demographic

1 Read Preprocessed File

We read output data from the previous section.

2 Demographics and Behavioral parameters

2.1 Age and Sex

age_years will be mapped from the column Age.

sex is grouped as follows:

Sex	sex
Female	0
Male	1

Code

# Build the harmonised age and sex columns for cohort B.
# - `Age` is checked with harmonisation::is_integer_vector (NA allowed) and
#   then cast to the integer column `age_years`.
# - `Sex` is recoded to `sex`: "Female" -> "0", "Male" -> "1"; any other
#   value becomes NA and is rejected by the `col_vals_in_set` check below.
age_gender_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id", 
                  "Age", 
                  "Sex")) |>
  # Validate this cohort's own `Age` column (the check previously pointed at
  # `cohort_A_data[["age"]]`, which belongs to a different cohort).
  pointblank::col_vals_expr(
    expr = ~ harmonisation::is_integer_vector(
      cohort_B_data[["Age"]],
      allow_na = TRUE)
  ) |>
  dplyr::mutate(
    # Convert age to type integer
    age_years = as.integer(.data[["Age"]]),
    # Recode sex as a character code first ...
    sex = dplyr::case_when(
      .data[["Sex"]] == "Female" ~ "0",
      .data[["Sex"]] == "Male" ~ "1",
      .default = NA_character_
    ),
    # ... then convert categorical columns to factors with a fixed level order
    `Sex` = forcats::fct_relevel(
      .data[["Sex"]],
      c("Female", "Male")
    ),
    sex = forcats::fct_relevel(
      .data[["sex"]],
      c("0", "1")),
  ) |>
  # Place each harmonised column next to its source column for easy review
  dplyr::relocate(
    "sex",
    .after = "Sex"
  ) |> 
  dplyr::relocate(
    "age_years",
    .after = "Age"
  ) |>
  pointblank::col_vals_in_set(
    columns = "sex",
    set = c("0", "1")
  ) |> 
  # Age must lie in (0, 100]; missing ages are allowed
  pointblank::col_vals_between(
    columns = "age_years",
    left = 0,
    right = 100,
    inclusive = c(FALSE, TRUE),
    na_pass = TRUE
  )

Code

# Render the interactive table (with CSV download) only when tables are
# requested and the document is being knitted to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(age_gender_data)
}

Remove unnecessary columns so that we can merge with the other fields.

Code

# Keep only the id and the harmonised columns; the raw source columns are
# no longer needed once the mapping has been reviewed.
age_gender_data <- dplyr::select(
  age_gender_data,
  -c("Age", "Sex")
)

2.2 Height, Weight, BMI and BSA

height_cm will be mapped from the column Height. weight_kg will be mapped from the column Weight.

bsa_m2 (in m^2) will be calculated as sqrt([Height(cm) x Weight(kg)]/3600). bmi will be calculated as Weight(kg)/((Height(m))^2).

All values are then rounded to two decimal places.

Code

# Derive the harmonised body measurements for cohort B.
# BSA and BMI are computed from the raw (unrounded) Height and Weight, and
# every output column is rounded to 2 digits afterwards.
body_measurement_data <- cohort_B_data |>
  dplyr::select("cohort_unique_id", "Height", "Weight") |>
  dplyr::mutate(
    height_cm = harmonisation::round_to_nearest_digit(
      .data[["Height"]],
      digits = 2
    ),
    weight_kg = harmonisation::round_to_nearest_digit(
      .data[["Weight"]],
      digits = 2
    ),
    # sqrt(height x weight / 3600)
    bsa_m2 = harmonisation::round_to_nearest_digit(
      sqrt((.data[["Height"]] * .data[["Weight"]]) / 3600),
      digits = 2
    ),
    # weight / height^2, with height divided by 100 first
    bmi = harmonisation::round_to_nearest_digit(
      .data[["Weight"]] / ((.data[["Height"]] / 100)^2),
      digits = 2
    )
  ) |>
  # BMI must lie in [10, 50]; missing values are allowed
  pointblank::col_vals_between(
    columns = "bmi",
    left = 10,
    right = 50,
    inclusive = c(TRUE, TRUE),
    na_pass = TRUE
  )

Code

# Render the interactive table (with CSV download) only when tables are
# requested and the document is being knitted to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(body_measurement_data)
}

Remove unnecessary columns so that we can merge with the other fields.

Code

# Keep only the id and the harmonised columns; drop the raw source columns.
body_measurement_data <- dplyr::select(
  body_measurement_data,
  -c("Height", "Weight")
)

2.3 Smoking History

smoke_current is grouped as follows:

Smoke History	smoke_current
non-smoker	0
past smoker	0
current smoker	1
NA	-1

smoke_past is grouped as follows:

Smoke History	smoke_past
non-smoker	0
past smoker	1
current smoker	0
NA	-1

We do a check to ensure that only the following scenarios can occur:

smoke_current as 1 and smoke_past as 0 for current smokers
smoke_current as 0 and smoke_past as 1 for past smokers
smoke_current as 0 and smoke_past as 0 for non-smokers
smoke_current as -1 and smoke_past as -1 for unknown

Code

# Build the harmonised smoking columns for cohort B from `Smoke History`.
# - smoke_current: "1" for current smokers, "0" for non/past smokers.
# - smoke_past:    "1" for past smokers, "0" for non/current smokers.
# - A missing `Smoke History` is coded "-1" in both columns.
# Any other value maps to NA and is rejected by the checks below.
smoking_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id", 
                  "Smoke History")) |>
  dplyr::mutate(
    smoke_current = dplyr::case_when(
      is.na(.data[["Smoke History"]]) ~ "-1",
      .data[["Smoke History"]] == "non-smoker" ~ "0",
      .data[["Smoke History"]] == "past smoker" ~ "0",
      .data[["Smoke History"]] == "current smoker" ~ "1",
      .default = NA_character_
    ),
    smoke_current = forcats::fct_relevel(
      .data[["smoke_current"]],
      c("0", "1")), 
    smoke_past = dplyr::case_when(
      is.na(.data[["Smoke History"]]) ~ "-1",
      .data[["Smoke History"]] == "non-smoker" ~ "0",
      .data[["Smoke History"]] == "past smoker" ~ "1",
      .data[["Smoke History"]] == "current smoker" ~ "0",
      .default = NA_character_
    ),
    smoke_past = forcats::fct_relevel(
      .data[["smoke_past"]],
      c("0", "1")),
    `Smoke History` = forcats::fct(
      .data[["Smoke History"]]
    )
  ) |>
  pointblank::col_vals_in_set(
    columns = c("smoke_current", "smoke_past"),
    set = c("0", "1", "-1")
  ) |> 
  # Cross-column consistency: only these (current, past) pairs are valid:
  #   ("1", "0") current smoker, ("-1", "-1") unknown,
  #   ("0", "0") non-smoker,     ("0", "1") past smoker.
  # The unknown case compares against the string "-1"; the earlier `-"1"`
  # applied unary minus to a string, which is an error in R.
  pointblank::col_vals_expr(
    expr = pointblank::expr(
      (.data[["smoke_current"]] == "1" & .data[["smoke_past"]] == "0") |
      (.data[["smoke_current"]] == "-1" & .data[["smoke_past"]] == "-1") |
      (.data[["smoke_current"]] == "0" & .data[["smoke_past"]] %in% c("0", "1"))
    )
  )

Code

# Render the interactive table (with CSV download) only when tables are
# requested and the document is being knitted to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(smoking_data)
}

Remove unnecessary columns so that we can merge with the other fields.

Code

# Keep only the id and the harmonised columns; drop the raw source column.
smoking_data <- dplyr::select(
  smoking_data,
  -c("Smoke History")
)

2.4 Chest Pain

2.4.1 Shortness of Breath

have_sob is grouped as follows:

Dyspnea	have_sob
no	0
yes	1

Code

# Build the harmonised shortness-of-breath flag for cohort B.
# `Dyspnea` "no" -> "0", "yes" -> "1"; anything else becomes NA.
shortness_of_breath_data <- cohort_B_data |>
  dplyr::select("cohort_unique_id", "Dyspnea") |>
  dplyr::mutate(
    # Recode and fix the factor level order in one step
    have_sob = forcats::fct_relevel(
      dplyr::case_when(
        .data[["Dyspnea"]] == "no" ~ "0",
        .data[["Dyspnea"]] == "yes" ~ "1",
        .default = NA_character_
      ),
      c("0", "1")
    ),
    Dyspnea = forcats::fct_relevel(
      as.character(.data[["Dyspnea"]]),
      c("no", "yes")
    )
  ) |>
  # "-1" is accepted here although the recode above never produces it
  pointblank::col_vals_in_set(
    columns = c("have_sob"),
    set = c("0", "1", "-1")
  )

Code

# Render the interactive table (with CSV download) only when tables are
# requested and the document is being knitted to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(shortness_of_breath_data)
}

Remove unnecessary columns so that we can merge with the other fields.

Code

# Keep only the id and the harmonised column; drop the raw source column.
shortness_of_breath_data <- dplyr::select(
  shortness_of_breath_data,
  -c("Dyspnea")
)

2.4.2 Have chest pain or not

have_chest_pain is grouped as follows:

Chest Pain Character	have_chest_pain
no chest pain	0
typical, atypical or nonanginal	1

Code

# Build the harmonised chest-pain flag for cohort B.
# `Chest Pain Character` "no chest pain" -> "0"; "typical", "atypical" or
# "nonanginal" -> "1"; anything else becomes NA and fails the check below.
have_chest_pain_data <- cohort_B_data |>
  dplyr::select("cohort_unique_id", "Chest Pain Character") |>
  dplyr::mutate(
    # Recode and fix the factor level order in one step
    have_chest_pain = forcats::fct_relevel(
      dplyr::case_when(
        .data[["Chest Pain Character"]] %in% c("no chest pain") ~ "0",
        .data[["Chest Pain Character"]] %in%
          c("typical", "atypical", "nonanginal") ~ "1",
        .default = NA_character_
      ),
      c("0", "1")
    ),
    `Chest Pain Character` = forcats::fct_relevel(
      as.character(.data[["Chest Pain Character"]]),
      c("no chest pain", "typical", "atypical", "nonanginal")
    )
  ) |>
  pointblank::col_vals_in_set(
    columns = c("have_chest_pain"),
    set = c("0", "1")
  )

Code

# Render the interactive table (with CSV download) only when tables are
# requested and the document is being knitted to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(have_chest_pain_data)
}

Remove unnecessary columns so that we can merge with the other fields.

Code

# Keep only the id and the harmonised column; drop the raw source column.
have_chest_pain_data <- dplyr::select(
  have_chest_pain_data,
  -c("Chest Pain Character")
)

2.4.3 Symptomatic or Asymptomatic

symptoms is grouped as follows:

have_sob	have_chest_pain	symptoms
-1	-1	-1
0	0	0
0 or 1	1	1
1	0	2

Code

# Combine `have_sob` and `have_chest_pain` into a single `symptoms` code:
#   "-1" both unknown ("-1"), "0" neither symptom,
#   "1" chest pain (with or without shortness of breath),
#   "2" shortness of breath only.
# Both joins are strict: every id must match exactly once on each side.
symptoms_data <- cohort_B_data |>
  dplyr::select("cohort_unique_id") |>
  dplyr::inner_join(
    shortness_of_breath_data,
    by = dplyr::join_by("cohort_unique_id"),
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  dplyr::inner_join(
    have_chest_pain_data,
    by = dplyr::join_by("cohort_unique_id"),
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  dplyr::mutate(
    # Recode and fix the factor level order in one step
    symptoms = forcats::fct_relevel(
      dplyr::case_when(
        .data[["have_chest_pain"]] == "-1" &
          .data[["have_sob"]] == "-1" ~ "-1",
        .data[["have_chest_pain"]] == "0" &
          .data[["have_sob"]] == "0" ~ "0",
        .data[["have_chest_pain"]] == "1" &
          .data[["have_sob"]] %in% c("0", "1") ~ "1",
        .data[["have_chest_pain"]] == "0" &
          .data[["have_sob"]] == "1" ~ "2",
        .default = NA_character_
      ),
      c("0", "1", "2")
    )
  ) |>
  pointblank::col_vals_in_set(
    columns = c("symptoms"),
    set = c("0", "1", "2")
  )

Code

# Render the interactive table (with CSV download) only when tables are
# requested and the document is being knitted to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(symptoms_data)
}

Remove unnecessary columns so that we can merge with the other fields.

Code

# Keep only the id and `symptoms`; the two input flags are merged in from
# their own tables later.
symptoms_data <- dplyr::select(
  symptoms_data,
  -c("have_chest_pain", "have_sob")
)

2.4.4 Chest Pain Type

chest_pain_type is grouped as follows:

Dyspnea	Chest Pain Character	chest_pain_type
no	no chest pain	0
no or yes	typical	1
no or yes	atypical	2
no or yes	nonanginal	3
yes	no chest pain	4

Code

# Build the harmonised `chest_pain_type` code for cohort B from
# `Chest Pain Character` and `Dyspnea`:
#   "0" no chest pain and no dyspnea
#   "1" typical, "2" atypical, "3" nonanginal (dyspnea "no" or "yes")
#   "4" no chest pain but dyspnea
# Any other combination maps to NA and is rejected by the check below.
chest_pain_type_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id", "Chest Pain Character", "Dyspnea")) |>
  dplyr::mutate(
    chest_pain_type = dplyr::case_when(
      (.data[["Chest Pain Character"]] == "no chest pain" &
       .data[["Dyspnea"]] == "no" 
      ) ~ "0",        
      (.data[["Chest Pain Character"]] == "typical" &
       .data[["Dyspnea"]] %in% c("no", "yes")
      ) ~ "1",
      (.data[["Chest Pain Character"]] == "atypical" &
       .data[["Dyspnea"]] %in% c("no", "yes")
      ) ~ "2", 
      (.data[["Chest Pain Character"]] == "nonanginal" &
       .data[["Dyspnea"]] %in% c("no", "yes")
      ) ~ "3", 
      (.data[["Chest Pain Character"]] == "no chest pain" &
       .data[["Dyspnea"]] == "yes" 
      ) ~ "4",  
      .default = NA_character_
    ),
    `Chest Pain Character` = forcats::fct_relevel(
      as.character(.data[["Chest Pain Character"]]),
      c("no chest pain", "typical", "atypical", "nonanginal")
    ),
    `Dyspnea` = forcats::fct_relevel(
      as.character(.data[["Dyspnea"]]),
      c("no", "yes")
    ),
    # List every produced code, including "4", so the level order is fully
    # explicit and agrees with the validation set below.
    chest_pain_type = forcats::fct_relevel(
      .data[["chest_pain_type"]],
      c("0", "1", "2", "3", "4"))
  ) |> 
  dplyr::relocate(
    "Chest Pain Character",
    .after = "cohort_unique_id"
  ) |> 
  pointblank::col_vals_in_set(
    columns = c("chest_pain_type"),
    set = c("0", "1", "2", "3", "4")
  )

Code

# Render the interactive table (with CSV download) only when tables are
# requested and the document is being knitted to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(chest_pain_type_data)
}

Remove unnecessary columns so that we can merge with the other fields.

Code

# Keep only the id and the harmonised column; drop the raw source columns.
chest_pain_type_data <- dplyr::select(
  chest_pain_type_data,
  -c("Dyspnea", "Chest Pain Character")
)

2.4.5 Combined chest pain related tables

We combine all chest pain related tables together.

Code

join_specification <- dplyr::join_by("cohort_unique_id")

# Fold the chest pain related tables onto the cohort ids, one strict
# one-to-one inner join at a time (left to right, same order as listed).
chest_pain_data <- Reduce(
  f = \(joined_so_far, next_table) {
    dplyr::inner_join(
      joined_so_far,
      next_table,
      by = join_specification,
      unmatched = "error",
      relationship = "one-to-one"
    )
  },
  x = list(
    have_chest_pain_data,
    chest_pain_type_data,
    shortness_of_breath_data,
    symptoms_data
  ),
  init = dplyr::select(cohort_B_data, "cohort_unique_id")
)

# All four harmonised columns must be present ...
testthat::expect_true(
  pointblank::has_columns(
    chest_pain_data,
    columns = c(
      "have_sob",
      "have_chest_pain",
      "symptoms",
      "chest_pain_type"
    )
  )
)

# ... and nothing else besides the id column.
testthat::expect_equal(
  ncol(chest_pain_data),
  5
)

2.5 Combine Demographics

We combine all the data to give the demo_behave_data.

Code

join_specification <- dplyr::join_by("cohort_unique_id")

# Fold every demographic/behavioural table onto the cohort ids with strict
# one-to-one inner joins, then move bsa_m2 and bmi right after `sex`.
demo_behave_data <- Reduce(
  f = \(joined_so_far, next_table) {
    dplyr::inner_join(
      joined_so_far,
      next_table,
      by = join_specification,
      unmatched = "error",
      relationship = "one-to-one"
    )
  },
  x = list(
    age_gender_data,
    body_measurement_data,
    smoking_data,
    chest_pain_data
  ),
  init = dplyr::select(cohort_B_data, "cohort_unique_id")
) |>
  dplyr::relocate(
    c("bsa_m2", "bmi"),
    .after = "sex"
  )

# Every harmonised column must be present ...
testthat::expect_true(
  pointblank::has_columns(
    demo_behave_data,
    columns = c(
      "age_years", "sex",
      "height_cm", "weight_kg", "bsa_m2", "bmi",
      "smoke_current", "smoke_past",
      "have_sob", "have_chest_pain",
      "symptoms", "chest_pain_type"
    )
  )
)

# ... and nothing else besides the id column (12 + 1).
testthat::expect_equal(
  ncol(demo_behave_data),
  13
)

3 Write Preprocessed File

We output data to be used for the next section.

Code

# Write the combined demographic table as parquet under the folders given
# by the report parameters.
nanoparquet::write_parquet(
  demo_behave_data,
  file = here::here(
    params$analysis_folder,
    params$harmonisation_folder,
    params$preprocessing_folder,
    "02_demographic_data.parquet"
  )
)