2 Read Cohort A Data

1 Read Data

We read the data and have the following warnings

Code

cohort_A_data <- readxl::read_excel(
  path = here::here("data-raw",
                    "Cohort_A",
                    "data_to_harmonise_age_issue.xlsx"),
  sheet = "Sheet1",
  col_types = c(
    "text", "numeric"
    )
  )

This warning occurs because we expect the second column Age to be numeric but there exists some text columns.

Suppose we ask the collaborator to fix the age column and the collaborator returns a new file. To ensure that there are no messages, we can use testthat::expect_no_condition.

Here is an example when it gives an error with the old file

Code

testthat::expect_no_condition(
  readxl::read_excel(
  path = here::here("data-raw",
                    "Cohort_A",
                    "data_to_harmonise_age_issue.xlsx"),
  sheet = "Sheet1",
  col_types = c(
    "text", "numeric"
    )
  )
)

Error: Expected `readxl::read_excel(...)` to run without any conditions.
ℹ Actually got a <simpleWarning> with text:
  Expecting numeric in B7 / R7C2: got 'missing'

We can read the new file in the following way. However, this method means that you will need to read the file twice.

Code

testthat::expect_no_condition(
  readxl::read_excel(
  path = here::here("data-raw",
                    "Cohort_A",
                    "data_to_harmonise_age_issue_fixed.xlsx"),
  sheet = "Sheet1",
  col_types = c(
    "text", "numeric"
    )
  )
)

cohort_A_data <- readxl::read_excel(
  path = here::here("data-raw",
                    "Cohort_A",
                    "data_to_harmonise_age_issue_fixed.xlsx"),
  sheet = "Sheet1",
  col_types = c(
    "text", "numeric"
    )
  )

To read the file only once, we can use the tee pipe operator %T>%.

Code

cohort_A_data <- readxl::read_excel(
  path = here::here("data-raw",
                    "Cohort_A",
                    "data_to_harmonise_age_issue_fixed.xlsx"),
  sheet = "Sheet1",
  col_types = c(
    "text", "numeric"
    )
  ) %T>%
  testthat::expect_no_condition()

2 Check for unique patient id

We can use pointblank::rows_distinct to check if the column Serial Number has unique values.

Code

cohort_A_data <- readxl::read_excel(
  path = here::here("data-raw",
                    "Cohort_A",
                    "data_to_harmonise_age_issue_fixed.xlsx"),
  sheet = "Sheet1",
  col_types = c(
    "text", "numeric"
    )
  ) %T>%
  testthat::expect_no_condition() |> 
  dplyr::rename(cohort_unique_id = "Serial Number") |> 
  # Remove rows when the ID value is NA
  dplyr::filter(!is.na(.data[["cohort_unique_id"]])) |>
  dplyr::mutate(
    cohort_unique_id = as.character(cohort_unique_id)
  ) |> 
  # Remove white spaces in column names
  dplyr::rename_all(stringr::str_trim) |> 
  # Check if cohort id is unique
  pointblank::rows_distinct(
    columns = "cohort_unique_id",
  )

3 Clean Weight columns

Sometimes the collaborator will not give you a new file and will only respond with an email acknowledging that it is an error.

You will need to edit the values yourself. It is best not to edit the file as you may forget to make the manual change if the collaborator gives you a new version a few months later with the same error.

It is also advised to record such changes before data harmonisation.

We read the data with the some issues with the weight.

Code

cohort_A_data <- readxl::read_excel(
  path = here::here("data-raw",
                    "Cohort_A",
                    "data_to_harmonise.xlsx"),
  sheet = "Sheet1",
  col_types = c(
    "text", # unique id
    "numeric", "text", # age and sex
    "numeric", "numeric", # height and weight
    "numeric", "numeric", "numeric", "numeric", # smoking history
    "numeric", "numeric" # symptoms
    )
  ) %T>%
  testthat::expect_no_condition() |> 
  dplyr::rename(cohort_unique_id = "Serial Number") |> 
  # Remove rows when the ID value is NA
  dplyr::filter(!is.na(.data[["cohort_unique_id"]])) |>
  dplyr::mutate(
    cohort_unique_id = as.character(cohort_unique_id)
  ) |> 
  # Remove white spaces in column names
  dplyr::rename_all(stringr::str_trim) |> 
  # Check if cohort id is unique
  pointblank::rows_distinct(
    columns = "cohort_unique_id",
  )

4 Update Weight

Here are the following patient’s height that needs to be updated.

A018 has a weight of 215.4kg. Value is changed to 90 kg.

Code

weight_data <- cohort_A_data |>
  dplyr::select(c("cohort_unique_id", "weight")) |>
  # Check if these patient IDs are present
  pointblank::col_vals_make_subset(
    columns = c("cohort_unique_id"),
    set = c("A018")
  ) |> 
  dplyr::mutate(
    updated_weight = dplyr::case_when(
      .data[["cohort_unique_id"]] == "A018" & .data[["weight"]] == 215.4 ~ 90.1,
      .default = .data[["weight"]]
    ),
  )

Code

if (params$show_table && knitr::is_html_output()) {
  weight_data |> 
    dplyr::filter(
      .data[["cohort_unique_id"]] %in% 
        c("A018")
    ) |> 
    harmonisation::reactable_with_download_csv_button()
}

Remove unnecessary columns so that we can merge with the other fields.

Code

weight_data <- weight_data |>
  dplyr::select(-c("weight"))

5 Merge updated weight

Code

join_specification <- dplyr::join_by("cohort_unique_id")

cohort_A_data <- cohort_A_data |>
  dplyr::inner_join(weight_data, 
                   by = join_specification,
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::mutate(
    `weight` = .data[["updated_weight"]]
  ) |> 
  dplyr::select(-c("updated_weight"))

6 Check corrections

We check if the corrections are made based on the collaborator request. Changes are made manually on the excel file as the collaborator is no longer providing newer version of the data.

weight changed from 215.4kg to 90.1kg for patient A018

Code

cohort_A_data |> 
  # Check if these patient IDs are present
  pointblank::expect_col_vals_make_subset(
    columns = c("cohort_unique_id"),
    set = c("A018")
  ) |> 
  pointblank::expect_col_vals_expr(  
    expr = pointblank::expr( 
      dplyr::case_when(
        .data[["cohort_unique_id"]] %in% "A018" ~ 
          isTRUE(all.equal(
            target = 90.1,
            current = cohort_A_data[["weight"]][which(cohort_A_data[["cohort_unique_id"]] == "A018")],
            tolerance = 0.0001)
          ))
    )
  )

7 Write Preprocessed File

We output data to be used for the next session.

Code

cohort_A_data |> 
  nanoparquet::write_parquet(
    file = here::here(
      params$analysis_folder,
      params$harmonisation_folder,
      params$preprocessing_folder,
      "01_Cohort_A_cleaned.parquet"
  )
)