Code
<- readxl::read_excel(
cohort_A_data path = here::here("data-raw",
"Cohort_A",
"data_to_harmonise_age_issue.xlsx"),
sheet = "Sheet1",
col_types = c(
"text", "numeric"
) )
We read the data and have the following warnings
This warning occurs because we expect the second column Age
to be numeric but there exists some text columns.
Suppose we ask the collaborator to fix the age column and the collaborator returns a new file. To ensure that there are no messages, we can use testthat::expect_no_condition
.
Here is an example when it gives an error with the old file
Error: Expected `readxl::read_excel(...)` to run without any conditions.
ℹ Actually got a <simpleWarning> with text:
Expecting numeric in B7 / R7C2: got 'missing'
We can read the new file in the following way. However, this method means that you will need to read the file twice.
testthat::expect_no_condition(
readxl::read_excel(
path = here::here("data-raw",
"Cohort_A",
"data_to_harmonise_age_issue_fixed.xlsx"),
sheet = "Sheet1",
col_types = c(
"text", "numeric"
)
)
)
cohort_A_data <- readxl::read_excel(
path = here::here("data-raw",
"Cohort_A",
"data_to_harmonise_age_issue_fixed.xlsx"),
sheet = "Sheet1",
col_types = c(
"text", "numeric"
)
)
To read the file only once, we can use the tee pipe operator %T>%
.
We can use pointblank::rows_distinct
to check if the column Serial Number
has unique values.
cohort_A_data <- readxl::read_excel(
path = here::here("data-raw",
"Cohort_A",
"data_to_harmonise_age_issue_fixed.xlsx"),
sheet = "Sheet1",
col_types = c(
"text", "numeric"
)
) %T>%
testthat::expect_no_condition() |>
dplyr::rename(cohort_unique_id = "Serial Number") |>
# Remove rows when the ID value is NA
dplyr::filter(!is.na(.data[["cohort_unique_id"]])) |>
dplyr::mutate(
cohort_unique_id = as.character(cohort_unique_id)
) |>
# Remove white spaces in column names
dplyr::rename_all(stringr::str_trim) |>
# Check if cohort id is unique
pointblank::rows_distinct(
columns = "cohort_unique_id",
)
Sometimes the collaborator will not give you a new file and will only respond with an email acknowledging that it is an error.
You will need to edit the values yourself. It is best not to edit the file as you may forget to make the manual change if the collaborator gives you a new version a few months later with the same error.
It is also advised to record such changes before data harmonisation.
We read the data with the some issues with the weight.
cohort_A_data <- readxl::read_excel(
path = here::here("data-raw",
"Cohort_A",
"data_to_harmonise.xlsx"),
sheet = "Sheet1",
col_types = c(
"text", # unique id
"numeric", "text", # age and sex
"numeric", "numeric", # height and weight
"numeric", "numeric", "numeric", "numeric", # smoking history
"numeric", "numeric" # symptoms
)
) %T>%
testthat::expect_no_condition() |>
dplyr::rename(cohort_unique_id = "Serial Number") |>
# Remove rows when the ID value is NA
dplyr::filter(!is.na(.data[["cohort_unique_id"]])) |>
dplyr::mutate(
cohort_unique_id = as.character(cohort_unique_id)
) |>
# Remove white spaces in column names
dplyr::rename_all(stringr::str_trim) |>
# Check if cohort id is unique
pointblank::rows_distinct(
columns = "cohort_unique_id",
)
Here are the following patient’s height that needs to be updated.
weight_data <- cohort_A_data |>
dplyr::select(c("cohort_unique_id", "weight")) |>
# Check if these patient IDs are present
pointblank::col_vals_make_subset(
columns = c("cohort_unique_id"),
set = c("A018")
) |>
dplyr::mutate(
updated_weight = dplyr::case_when(
.data[["cohort_unique_id"]] == "A018" & .data[["weight"]] == 215.4 ~ 90.1,
.default = .data[["weight"]]
),
)
Remove unnecessary columns so that we can merge with the other fields.
We check if the corrections are made based on the collaborator request. Changes are made manually on the excel file as the collaborator is no longer providing newer version of the data.
weight
changed from 215.4kg to 90.1kg for patient A018
cohort_A_data |>
# Check if these patient IDs are present
pointblank::expect_col_vals_make_subset(
columns = c("cohort_unique_id"),
set = c("A018")
) |>
pointblank::expect_col_vals_expr(
expr = pointblank::expr(
dplyr::case_when(
.data[["cohort_unique_id"]] %in% "A018" ~
isTRUE(all.equal(
target = 90.1,
current = cohort_A_data[["weight"]][which(cohort_A_data[["cohort_unique_id"]] == "A018")],
tolerance = 0.0001)
))
)
)
We output data to be used for the next session.