Missing Values

Published

Last Updated on 31 August 2024

Minimising the rate of missing values (NAs) is one of the main objectives of ReSECT data quality control. This document presents the main relationships between NAs and quantifies the number of NAs per centre for certain fields considered key to the anatomical lung resection process. This information complements the filters available within the platform, which should be used to determine which cases need to be reviewed.

Important

For the platform's built-in data filters to detect missing values, it is NECESSARY that ALL the forms are activated at the time each patient is included in the anatomical lung resection process. A form is activated by completing its first variable, regardless of whether the form is fully or partially completed at that time.

Pre-OP

Code
# Pre-OP: UpSet plot of the combinations of missing values across the key
# pre-operative variables.
datosClinicos_rpa %>%
  # When the diffusion test was not performed ("No") the DLCO value is set to
  # "-" so that it does not show up as a missing value.
  mutate(
    dlco_percent_ = ifelse(realizacion_de_difusion == "No", "-", dlco_percent)
  ) %>%
  select(
    Hospital = hospital,
    Gender = sexo,
    BMI = indice_de_masa_corporal,
    Smoking = tabaquismo,
    Comorbidities = comorbilidades,
    # Label typos fixed: "FEV1(%)" now follows the "DLCO (%)" spacing and the
    # stray trailing comma in "Previous Thoracic Surg," is replaced by a period.
    `FEV1 (%)` = fev1_percent,
    `DLCO (%)` = dlco_percent_,
    ASA = riesgo_asa,
    ECOG = ecog,
    Dyspnea = grado_de_disnea,
    `Previous Thoracic Surg.` = cirugia_toracica_previa
  ) %>%
  gg_miss_upset(nsets = 10)

Code
# Pre-OP: number of missing values (NAs) per hospital for the key
# pre-operative variables, drawn as a heat map (one tile per
# hospital x variable).
nas_preop <- datosClinicos_rpa %>%
  # When the diffusion test was not performed ("No") the DLCO value is set to
  # "-" so that it is not counted as an NA below.
  mutate(
    dlco_percent_ = ifelse(realizacion_de_difusion == "No", "-", dlco_percent)
  ) %>%
  select(
    Hospital = hospital,
    Gender = sexo,
    BMI = indice_de_masa_corporal,
    Smoking = tabaquismo,
    Comorbidities = comorbilidades,
    # These two labels must match the levels passed to fct_relevel() below.
    # Previously they were "FEV1(%)" and "Previous Thoracic Surg,", which did
    # not match and left both variables out of the intended order.
    `FEV1 (%)` = fev1_percent,
    `DLCO (%)` = dlco_percent_,
    ASA = riesgo_asa,
    ECOG = ecog,
    Dyspnea = grado_de_disnea,
    `Previous Thoracic Surg.` = cirugia_toracica_previa
  ) %>%
  group_by(Hospital) %>%
  # across() replaces the deprecated summarise_all(funs(...)).
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of patients per hospital; printed on the plot as a reference.
pacientes_preop <- datosClinicos_rpa %>%
  count(Hospital = hospital)

nas_preop %>%
  pivot_longer(!Hospital, names_to = "Variable", values_to = "nas") %>%
  full_join(pacientes_preop, by = "Hospital") %>%
  group_by(Hospital) %>%
  # Keep the patient count on the first row of each hospital only, so that it
  # is printed once per hospital instead of on every tile.
  mutate(n = ifelse(duplicated(n), NA, n)) %>%
  ungroup() %>%
  ggplot(aes(
    x = Hospital,
    y = fct_relevel(
      Variable, "Gender", "BMI", "Smoking", "Comorbidities",
      "FEV1 (%)", "DLCO (%)", "ASA", "ECOG", "Dyspnea",
      "Previous Thoracic Surg."
    ),
    fill = nas
  )) +
  geom_tile() +
  # na.rm = TRUE: the rows blanked above are dropped without a warning.
  geom_text(
    aes(label = n),
    color = "white", size = 1.8, nudge_y = -0.3, na.rm = TRUE
  ) +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(size = 7, margin = margin(r = 0)),
    plot.caption = element_text(face = "italic", color = "steelblue", size = 6)
  ) +
  labs(
    fill = "Nº NAs", y = NULL, x = NULL,
    caption = "\nThe color of the tiles should be interpreted in conjunction with the number 
       of patients recruited by each department used to compute the number of missing values in this plot"
  )

Code
# Pre-OP: percentage of missing values (NAs) per hospital for the key
# pre-operative variables, drawn as a heat map.
nas_preop <- datosClinicos_rpa %>%
  # When the diffusion test was not performed ("No") the DLCO value is set to
  # "-" so that it is not counted as an NA below.
  mutate(
    dlco_percent_ = ifelse(realizacion_de_difusion == "No", "-", dlco_percent)
  ) %>%
  select(
    Hospital = hospital,
    Gender = sexo,
    BMI = indice_de_masa_corporal,
    Smoking = tabaquismo,
    Comorbidities = comorbilidades,
    # These two labels must match the levels passed to fct_relevel() below.
    # Previously they were "FEV1(%)" and "Previous Thoracic Surg,", which did
    # not match and left both variables out of the intended order.
    `FEV1 (%)` = fev1_percent,
    `DLCO (%)` = dlco_percent_,
    ASA = riesgo_asa,
    ECOG = ecog,
    Dyspnea = grado_de_disnea,
    `Previous Thoracic Surg.` = cirugia_toracica_previa
  ) %>%
  group_by(Hospital) %>%
  # across() replaces the deprecated summarise_all(funs(...)).
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of patients per hospital: denominator of the percentages.
pacientes <- datosClinicos_rpa %>% count(Hospital = hospital)

percent_nas_preop <- inner_join(x = nas_preop, y = pacientes, by = "Hospital") %>%
  # Convert the NA counts to percentages. Hospital and the denominator n are
  # excluded explicitly rather than relying on is.numeric().
  mutate(across(-c(Hospital, n), ~ .x * 100 / n)) %>%
  select(-n)

percent_nas_preop %>%
  pivot_longer(!Hospital, names_to = "Variable", values_to = "percent_nas") %>%
  ggplot(aes(
    x = Hospital,
    y = fct_relevel(
      Variable, "Gender", "BMI", "Smoking", "Comorbidities",
      "FEV1 (%)", "DLCO (%)", "ASA", "ECOG", "Dyspnea",
      "Previous Thoracic Surg."
    ),
    fill = percent_nas
  )) +
  geom_tile() +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(size = 7)
  ) +
  labs(fill = "% NAs", y = NULL, x = NULL)

Surgery

Code
# Surgery: UpSet plot of the combinations of missing values across the key
# surgical variables.
datosClinicos_rpa %>%
  # When the final approach is not "Toracotomía" the conversion field is set
  # to "-" so that it does not show up as a missing value.
  # (The former `lobectomia` recode was removed: the column was never selected,
  # so it had no effect on the plot.)
  mutate(
    conversion = ifelse(abordaje_final != "Toracotomía", "-", conversion)
  ) %>%
  select(
    Hospital = hospital,
    Age = edad_del_paciente_a_la_fecha_de_intervencion,
    Procedure = procedimiento_pulmonar,
    Approach = abordaje_final,
    Lymphad. = tipo_de_linfadenectomia_hilio_mediastinica,
    Conversion = conversion,
    `Functioning Seg.` = numero_de_segmentos_funcionantes_resecados
  ) %>%
  gg_miss_upset(nsets = 8)

Code
# Surgery: number of missing values (NAs) per hospital for the key surgical
# variables, drawn as a heat map (one tile per hospital x variable).
nas_cirugia <- datosClinicos_rpa %>%
  # When the final approach is not "Toracotomía" the conversion field is set
  # to "-" so that it is not counted as an NA below.
  # (The former `lobectomia` recode was removed: the column was never selected,
  # so it had no effect on the result.)
  mutate(
    conversion = ifelse(abordaje_final != "Toracotomía", "-", conversion)
  ) %>%
  select(
    Hospital = hospital,
    Age = edad_del_paciente_a_la_fecha_de_intervencion,
    Procedure = procedimiento_pulmonar,
    Approach = abordaje_final,
    Lymphad. = tipo_de_linfadenectomia_hilio_mediastinica,
    Conversion = conversion,
    `Functioning Seg.` = numero_de_segmentos_funcionantes_resecados
  ) %>%
  group_by(Hospital) %>%
  # across() replaces the deprecated summarise_all(funs(...)).
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of patients per hospital; printed on the plot as a reference.
pacientes_cirugia <- datosClinicos_rpa %>%
  count(Hospital = hospital)

nas_cirugia %>%
  pivot_longer(!Hospital, names_to = "Variable", values_to = "nas") %>%
  full_join(pacientes_cirugia, by = "Hospital") %>%
  group_by(Hospital) %>%
  # Keep the patient count on the first row of each hospital only, so that it
  # is printed once per hospital instead of on every tile.
  mutate(n = ifelse(duplicated(n), NA, n)) %>%
  ungroup() %>%
  ggplot(aes(
    x = Hospital,
    y = fct_relevel(
      Variable, "Age", "Procedure", "Approach", "Conversion",
      "Lymphad.", "Functioning Seg."
    ),
    fill = nas
  )) +
  geom_tile() +
  # na.rm = TRUE: the rows blanked above are dropped without a warning.
  geom_text(
    aes(label = n),
    color = "white", size = 1.8, nudge_y = -0.4, na.rm = TRUE
  ) +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(size = 7, margin = margin(r = 0)),
    plot.caption = element_text(face = "italic", color = "steelblue", size = 6)
  ) +
  labs(
    fill = "Nº NAs", y = NULL, x = NULL,
    caption = "\nThe color of the tiles should be interpreted in conjunction with the number 
       of patients recruited by each department used to compute the number of missing values in this plot"
  )

Code
# Surgery: percentage of missing values (NAs) per hospital for the key
# surgical variables, drawn as a heat map.
nas_cirugia <- datosClinicos_rpa %>%
  # When the final approach is not "Toracotomía" the conversion field is set
  # to "-" so that it is not counted as an NA below.
  # (The former `lobectomia` recode was removed: the column was never selected,
  # so it had no effect on the result.)
  mutate(
    conversion = ifelse(abordaje_final != "Toracotomía", "-", conversion)
  ) %>%
  select(
    Hospital = hospital,
    Age = edad_del_paciente_a_la_fecha_de_intervencion,
    Procedure = procedimiento_pulmonar,
    Approach = abordaje_final,
    Lymphad. = tipo_de_linfadenectomia_hilio_mediastinica,
    Conversion = conversion,
    `Functioning Seg.` = numero_de_segmentos_funcionantes_resecados
  ) %>%
  group_by(Hospital) %>%
  # across() replaces the deprecated summarise_all(funs(...)).
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of patients per hospital: denominator of the percentages.
pacientes <- datosClinicos_rpa %>% count(Hospital = hospital)

percent_nas_cirugia <- inner_join(x = nas_cirugia, y = pacientes, by = "Hospital") %>%
  # Convert the NA counts to percentages. Hospital and the denominator n are
  # excluded explicitly rather than relying on is.numeric().
  mutate(across(-c(Hospital, n), ~ .x * 100 / n)) %>%
  select(-n)

percent_nas_cirugia %>%
  pivot_longer(!Hospital, names_to = "Variable", values_to = "percent_nas") %>%
  ggplot(aes(
    x = Hospital,
    y = fct_relevel(
      Variable, "Age", "Procedure", "Approach", "Conversion",
      "Lymphad.", "Functioning Seg."
    ),
    fill = percent_nas
  )) +
  geom_tile() +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(size = 7)
  ) +
  labs(fill = "% NAs", y = NULL, x = NULL)

Post-OP

Code
# Post-OP: UpSet plot of the combinations of missing values across the key
# post-operative variables.
datosClinicos_rpa %>%
  select(
    Hospital = hospital,
    `Postop Care` = cuidados_inmediatos_postop,
    `Non-Expected ICU` = ingreso_no_esperado_en_cuidados_intermedios_o_intensivos,
    Reintervention = reintervencion_quirurgica,
    Complications = complicaciones_postoperatorias,
    `Clavien-Dindo` = grado_complicaciones_postoperatorias,
    `Discharge Date` = fecha_de_alta,
    `Discharge Status` = estado_de_alta
  ) %>%
  # When there were no complications ("No") the grade is set to "-" so that
  # it does not show up as a missing value.
  mutate(
    `Clavien-Dindo` = ifelse(Complications == "No", "-", `Clavien-Dindo`)
  ) %>%
  gg_miss_upset(nsets = 7)

Code
# Post-OP: number of missing values (NAs) per hospital for the key
# post-operative variables, drawn as a heat map (one tile per
# hospital x variable).
nas_postop <- datosClinicos_rpa %>%
  # When there were no complications ("No") the Clavien-Dindo grade is set to
  # "-" so that it is not counted as an NA below.
  mutate(grado = ifelse(
    complicaciones_postoperatorias == "No", "-",
    grado_complicaciones_postoperatorias
  )) %>%
  select(
    Hospital = hospital,
    `Postop Care` = cuidados_inmediatos_postop,
    `Non-Expected ICU` = ingreso_no_esperado_en_cuidados_intermedios_o_intensivos,
    Reintervention = reintervencion_quirurgica,
    Complications = complicaciones_postoperatorias,
    `Clavien-Dindo` = grado,
    `Discharge Date` = fecha_de_alta,
    `Discharge Status` = estado_de_alta
  ) %>%
  group_by(Hospital) %>%
  # across() replaces the deprecated summarise_all(funs(...)).
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of patients per hospital; printed on the plot as a reference.
pacientes_postop <- datosClinicos_rpa %>%
  count(Hospital = hospital)

nas_postop %>%
  pivot_longer(!Hospital, names_to = "Variable", values_to = "nas") %>%
  full_join(pacientes_postop, by = "Hospital") %>%
  group_by(Hospital) %>%
  # Keep the patient count on the first row of each hospital only, so that it
  # is printed once per hospital instead of on every tile.
  mutate(n = ifelse(duplicated(n), NA, n)) %>%
  ungroup() %>%
  ggplot(aes(
    x = Hospital,
    y = fct_relevel(
      Variable, "Postop Care", "Non-Expected ICU", "Reintervention",
      "Complications", "Clavien-Dindo", "Discharge Date", "Discharge Status"
    ),
    fill = nas
  )) +
  geom_tile() +
  # na.rm = TRUE: the rows blanked above are dropped without a warning.
  geom_text(
    aes(label = n),
    color = "white", size = 1.8, nudge_y = -0.35, na.rm = TRUE
  ) +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(size = 7, margin = margin(r = 0)),
    plot.caption = element_text(face = "italic", color = "steelblue", size = 6)
  ) +
  labs(
    fill = "Nº NAs", y = NULL, x = NULL,
    caption = "\nThe color of the tiles should be interpreted in conjunction with the number 
       of patients recruited by each department used to compute the number of missing values in this plot"
  )

Code
# Post-OP: percentage of missing values (NAs) per hospital for the key
# post-operative variables, drawn as a heat map.
nas_postop <- datosClinicos_rpa %>%
  # When there were no complications ("No") the Clavien-Dindo grade is set to
  # "-" so that it is not counted as an NA below.
  mutate(grado = ifelse(
    complicaciones_postoperatorias == "No", "-",
    grado_complicaciones_postoperatorias
  )) %>%
  select(
    Hospital = hospital,
    `Postop Care` = cuidados_inmediatos_postop,
    `Non-Expected ICU` = ingreso_no_esperado_en_cuidados_intermedios_o_intensivos,
    Reintervention = reintervencion_quirurgica,
    Complications = complicaciones_postoperatorias,
    `Clavien-Dindo` = grado,
    `Discharge Date` = fecha_de_alta,
    `Discharge Status` = estado_de_alta
  ) %>%
  group_by(Hospital) %>%
  # across() replaces the deprecated summarise_all(funs(...)).
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of patients per hospital: denominator of the percentages.
pacientes <- datosClinicos_rpa %>% count(Hospital = hospital)

percent_nas_postop <- inner_join(x = nas_postop, y = pacientes, by = "Hospital") %>%
  # Convert the NA counts to percentages. Hospital and the denominator n are
  # excluded explicitly rather than relying on is.numeric().
  mutate(across(-c(Hospital, n), ~ .x * 100 / n)) %>%
  select(-n)

percent_nas_postop %>%
  pivot_longer(!Hospital, names_to = "Variable", values_to = "percent_nas") %>%
  ggplot(aes(
    x = Hospital,
    y = fct_relevel(
      Variable, "Postop Care", "Non-Expected ICU", "Reintervention",
      "Complications", "Clavien-Dindo", "Discharge Date", "Discharge Status"
    ),
    fill = percent_nas
  )) +
  geom_tile() +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(size = 7)
  ) +
  labs(fill = "% NAs", y = NULL, x = NULL)

Diagnosis

Code
# Diagnosis: UpSet plot of the combinations of missing values across the key
# diagnostic variables, restricted to primary malignant tumours that are not
# a recurrence of an already known tumour.
datosClinicos_rpa %>%
  # NOTE(review): rows where either filter column is NA are dropped by
  # filter(); confirm that this is intended for a missing-values report.
  filter(
    grupo_diagnostico == "Tumoral Maligno Primario" &
      caracter_de_la_enfermedad != "Recurrencia de un Tumor Maligno Primario ya conocido"
  ) %>%
  select(
    Hospital = hospital,
    Histology = diagnostico_histologico,
    Location = localizacion_tumoral,
    `Nc CT-Scan` = descriptor_n_segun_tac,
    `Invasive Staging` = estadificacion_invasiva_ganglionar,
    Neoadjuvant = neoadyuvancia,
    Tp = descriptor_t_patologico,
    Np = descriptor_n_patologico,
    Mp = descriptor_m_patologico,
    `Complete Resection` = reseccion_completa
  ) %>%
  # nsets raised from 7 to 9: nine clinical variables are selected, and a
  # smaller value would leave some of them out of the plot.
  gg_miss_upset(nsets = 9)

Code
# Diagnosis: number of missing values (NAs) per hospital, drawn as a heat map.
# "Diagnosis Group" is computed over all patients; the remaining variables
# only over primary malignant tumours that are not a recurrence.

# NAs in the diagnosis group, over all patients.
nas_grupodx <- datosClinicos_rpa %>%
  select(
    Hospital = hospital,
    `Diagnosis Group` = grupo_diagnostico
  ) %>%
  group_by(Hospital) %>%
  # across() replaces the deprecated summarise_all(funs(...)).
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of patients per hospital; printed on the plot as a reference.
pacientes_grupodx <- datosClinicos_rpa %>%
  count(Hospital = hospital, name = "n_grupodx")

# NAs in the tumour-specific variables, over primary malignant (non-recurrent)
# tumours only.
nas_dx <- datosClinicos_rpa %>%
  filter(
    grupo_diagnostico == "Tumoral Maligno Primario" &
      caracter_de_la_enfermedad != "Recurrencia de un Tumor Maligno Primario ya conocido"
  ) %>%
  select(
    Hospital = hospital,
    Histology = diagnostico_histologico,
    Location = localizacion_tumoral,
    `Nc CT-Scan` = descriptor_n_segun_tac,
    `Invasive Staging` = estadificacion_invasiva_ganglionar,
    Neoadjuvant = neoadyuvancia,
    Tp = descriptor_t_patologico,
    Np = descriptor_n_patologico,
    Mp = descriptor_m_patologico,
    `Complete Resection` = reseccion_completa
  ) %>%
  group_by(Hospital) %>%
  summarise(across(everything(), ~ sum(is.na(.x))))

nas_dx %>%
  full_join(nas_grupodx, by = "Hospital") %>%
  # Hospitals with no row in nas_dx get NA counts from the full join; show
  # them as 0. Hospital itself is left untouched so the join key below is
  # never rewritten.
  mutate(across(-Hospital, ~ replace(.x, is.na(.x), 0))) %>%
  pivot_longer(!Hospital, names_to = "Variable", values_to = "nas") %>%
  full_join(pacientes_grupodx, by = "Hospital") %>%
  group_by(Hospital) %>%
  # Keep the patient count on the first row of each hospital only, so that it
  # is printed once per hospital instead of on every tile.
  mutate(n = ifelse(duplicated(n_grupodx), NA, n_grupodx)) %>%
  ungroup() %>%
  ggplot(aes(
    x = Hospital,
    y = fct_relevel(
      factor(Variable), "Diagnosis Group", "Histology", "Location",
      "Nc CT-Scan", "Invasive Staging", "Neoadjuvant", "Tp", "Np", "Mp",
      "Complete Resection"
    ),
    fill = nas
  )) +
  geom_tile() +
  # na.rm = TRUE: the rows blanked above are dropped without a warning.
  geom_text(
    aes(label = n),
    color = "white", size = 1.8, nudge_y = -1.3, na.rm = TRUE
  ) +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(size = 7, margin = margin(r = -20)),
    plot.caption = element_text(face = "italic", color = "steelblue", size = 6)
  ) +
  labs(
    fill = "Nº NAs", y = NULL, x = NULL,
    caption = "\nThe color of the tiles should be interpreted in conjunction with the number 
       of patients recruited by each department used to compute the number of missing values in this plot. \n 
       All characteristics, except ‘Diagnosis Group’, refer to patients with primary malignant 
       tumors that were not recurrence of a previous tumor."
  )

Code
# Diagnosis: percentage of missing values (NAs) per hospital, drawn as a
# heat map.

# Diagnosis group variable (all patients) ----

nas_grupodx <- datosClinicos_rpa %>%
  select(
    Hospital = hospital,
    `Diagnosis Group` = grupo_diagnostico
  ) %>%
  group_by(Hospital) %>%
  # across() replaces the deprecated summarise_all(funs(...)).
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of patients per hospital: denominator for "Diagnosis Group".
pacientes_grupodx <- datosClinicos_rpa %>%
  count(Hospital = hospital, name = "n_grupodx")

percent_nas_grupodx <- full_join(nas_grupodx, pacientes_grupodx, by = "Hospital") %>%
  mutate(`Diagnosis Group` = 100 * `Diagnosis Group` / n_grupodx) %>%
  select(-n_grupodx)

# Variables for primary malignant tumours (no recurrence) ----

nas_dx <- datosClinicos_rpa %>%
  filter(
    grupo_diagnostico == "Tumoral Maligno Primario" &
      caracter_de_la_enfermedad != "Recurrencia de un Tumor Maligno Primario ya conocido"
  ) %>%
  select(
    Hospital = hospital,
    Histology = diagnostico_histologico,
    Location = localizacion_tumoral,
    `Nc CT-Scan` = descriptor_n_segun_tac,
    `Invasive Staging` = estadificacion_invasiva_ganglionar,
    Neoadjuvant = neoadyuvancia,
    Tp = descriptor_t_patologico,
    Np = descriptor_n_patologico,
    Mp = descriptor_m_patologico,
    `Complete Resection` = reseccion_completa
  ) %>%
  group_by(Hospital) %>%
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of primary malignant (non-recurrent) cases per hospital: denominator
# for the tumour-specific variables. Same filter as nas_dx.
pacientes_dx <- datosClinicos_rpa %>%
  filter(
    grupo_diagnostico == "Tumoral Maligno Primario" &
      caracter_de_la_enfermedad != "Recurrencia de un Tumor Maligno Primario ya conocido"
  ) %>%
  count(Hospital = hospital, name = "n_dx")

percent_nas_dx <- full_join(x = nas_dx, y = pacientes_dx, by = "Hospital") %>%
  # Convert the NA counts to percentages. Hospital and the denominator n_dx
  # are excluded explicitly rather than relying on is.numeric().
  mutate(across(-c(Hospital, n_dx), ~ .x * 100 / n_dx)) %>%
  select(-n_dx)

# Join ----

percent_nas <- percent_nas_grupodx %>%
  full_join(percent_nas_dx, by = "Hospital") %>%
  # Hospitals with no row in percent_nas_dx get NA from the full join; show
  # them as 0. Hospital itself is left untouched.
  mutate(across(-Hospital, ~ replace(.x, is.na(.x), 0)))

percent_nas %>%
  pivot_longer(!Hospital, names_to = "Variable", values_to = "percent_nas") %>%
  ggplot(aes(
    x = Hospital,
    y = fct_relevel(
      factor(Variable), "Diagnosis Group", "Histology", "Location",
      "Nc CT-Scan", "Invasive Staging", "Neoadjuvant", "Tp", "Np", "Mp",
      "Complete Resection"
    ),
    fill = percent_nas
  )) +
  geom_tile() +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(size = 7),
    plot.caption = element_text(face = "italic", color = "steelblue", size = 6)
  ) +
  labs(
    fill = "% NAs", y = NULL, x = NULL,
    caption = "All characteristics, except ‘Diagnosis Group’, refer to patients with primary malignant 
       tumors that were not recurrence of a previous tumor."
  )

Short-Term Follow-Up

Code
# Short-term follow-up: UpSet plot of the combinations of missing values
# across the follow-up variables.
datosClinicos_rpa %>%
  # Exclude only the patients whose complication grade is
  # "Grado V = Exitus". `%in%` never returns NA, so rows with a missing grade
  # are kept; the previous `!=` comparison silently dropped every row whose
  # grade was NA.
  filter(!grado_complicaciones_postoperatorias %in% "Grado V = Exitus") %>%
  select(
    Hospital = hospital,
    Readmission = reingreso_hospitalario,
    `30-day Status` = estado_a_30_dias,
    `90-day Status` = estado_a_90_dias
  ) %>%
  gg_miss_upset(nsets = 7)

Code
# Short-term follow-up: number of missing values (NAs) per hospital for
# readmission and 30-/90-day status, drawn as a heat map.

nas_short <- datosClinicos_rpa %>%
  # Exclude the patients graded "Grado V = Exitus". That value belongs to
  # grado_complicaciones_postoperatorias; the filter previously tested
  # complicaciones_postoperatorias (the field compared with "No" elsewhere in
  # this report), so it never excluded them. `%in%` keeps rows whose grade
  # is NA.
  filter(!grado_complicaciones_postoperatorias %in% "Grado V = Exitus") %>%
  select(
    Hospital = hospital,
    Readmission = reingreso_hospitalario,
    `30-day Status` = estado_a_30_dias,
    `90-day Status` = estado_a_90_dias
  ) %>%
  group_by(Hospital) %>%
  # across() replaces the deprecated summarise_all(funs(...)).
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of patients per hospital after the same exclusion; printed on the
# plot as a reference.
pacientes_short <- datosClinicos_rpa %>%
  filter(!grado_complicaciones_postoperatorias %in% "Grado V = Exitus") %>%
  count(Hospital = hospital)

nas_short %>%
  pivot_longer(!Hospital, names_to = "Variable", values_to = "nas") %>%
  full_join(pacientes_short, by = "Hospital") %>%
  group_by(Hospital) %>%
  # Keep the patient count on the first row of each hospital only, so that it
  # is printed once per hospital instead of on every tile.
  mutate(n = ifelse(duplicated(n), NA, n)) %>%
  ungroup() %>%
  ggplot(aes(
    x = Hospital,
    y = fct_relevel(
      factor(Variable), "Readmission", "30-day Status", "90-day Status"
    ),
    fill = nas
  )) +
  geom_tile() +
  # na.rm = TRUE: the rows blanked above are dropped without a warning.
  geom_text(
    aes(label = n),
    color = "white", size = 1.8, nudge_y = -0.45, na.rm = TRUE
  ) +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(size = 7, margin = margin(r = -5)),
    plot.caption = element_text(face = "italic", color = "steelblue", size = 6)
  ) +
  labs(
    fill = "Nº NAs", x = NULL, y = NULL,
    caption = "\nThe color of the tiles should be interpreted in conjunction with the number 
       of patients recruited by each department used to compute the number of missing values in this plot"
  )

Code
# Short-term follow-up: percentage of missing values (NAs) per hospital for
# readmission and 30-/90-day status, drawn as a heat map.
nas_short <- datosClinicos_rpa %>%
  # Exclude the patients graded "Grado V = Exitus". That value belongs to
  # grado_complicaciones_postoperatorias; the filter previously tested
  # complicaciones_postoperatorias (the field compared with "No" elsewhere in
  # this report), so it never excluded them. `%in%` keeps rows whose grade
  # is NA.
  filter(!grado_complicaciones_postoperatorias %in% "Grado V = Exitus") %>%
  select(
    Hospital = hospital,
    Readmission = reingreso_hospitalario,
    `30-day Status` = estado_a_30_dias,
    `90-day Status` = estado_a_90_dias
  ) %>%
  group_by(Hospital) %>%
  # across() replaces the deprecated summarise_all(funs(...)).
  summarise(across(everything(), ~ sum(is.na(.x))))

# Number of patients per hospital after the same exclusion: denominator of
# the percentages.
pacientes_short <- datosClinicos_rpa %>%
  filter(!grado_complicaciones_postoperatorias %in% "Grado V = Exitus") %>%
  count(Hospital = hospital)

nas_short %>%
  full_join(pacientes_short, by = "Hospital") %>%
  # Any NA introduced by the full join is shown as 0. Hospital itself is left
  # untouched.
  mutate(across(-Hospital, ~ replace(.x, is.na(.x), 0))) %>%
  # Convert the NA counts to percentages of the patients in each hospital.
  mutate(across(-c(Hospital, n), ~ .x * 100 / n)) %>%
  select(-n) %>%
  pivot_longer(!Hospital, names_to = "Variable", values_to = "percent_nas") %>%
  ggplot(aes(
    x = Hospital,
    y = fct_relevel(
      factor(Variable), "Readmission", "30-day Status", "90-day Status"
    ),
    fill = percent_nas
  )) +
  geom_tile() +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(size = 7)
  ) +
  labs(fill = "% NAs", x = NULL, y = NULL)

Back to top