Workbook document

Author
Affiliation

Brian ODonnell

Charité - Universitätsmedizin Berlin

Goals

Code
 #| label: load-packages
 #| include: false
 #| echo: false

library(tidyverse)
library(tidymodels)
library(table1)
library(slider)
library(ggcorrplot)
library(vip)
library(summarytools)
library(glmnet)
library(readxl)
library(kableExtra)

Quarto table practice

Testing out a Quarto-rendered table1 function to make a summary table.

Code
# Load the built-in mtcars data and summarise mpg and cylinder count,
# stratified by engine shape (vs: 0 = V-shaped, 1 = straight).
data("mtcars")

tbl_spec <- ~ mpg + cyl | factor(vs)
table1::table1(tbl_spec, data = mtcars, caption = "CARS: mpg and cyl by vs")
CARS: mpg and cyl by vs
0
(N=18)
1
(N=14)
Overall
(N=32)
mpg
Mean (SD) 16.6 (3.86) 24.6 (5.38) 20.1 (6.03)
Median [Min, Max] 15.7 [10.4, 26.0] 22.8 [17.8, 33.9] 19.2 [10.4, 33.9]
cyl
Mean (SD) 7.44 (1.15) 4.57 (0.938) 6.19 (1.79)
Median [Min, Max] 8.00 [4.00, 8.00] 4.00 [4.00, 6.00] 6.00 [4.00, 8.00]

Noshow data practice

Now let's start importing the no-show data as practice.

Code
 #| label: load-data
 #| include: true
 #| echo: false
 
# Import the Kaggle no-show appointments extract, then standardise the
# column names and coerce every *day column to Date.
raw_appts <- read_csv("kaggle.csv")

dta <- raw_appts %>%
  janitor::clean_names() %>%
  mutate(across(contains("day"), as.Date))

Explore the data. Start by counting the number of appointments and noshows per patient.

Code
# Appointment counts per patient, split by show/no-show status.
# .groups = "drop_last" reproduces tally()'s grouping (keeps patient_id).
tots <- dta %>%
  group_by(patient_id, no_show) %>%
  summarise(n = n(), .groups = "drop_last")

head(tots)
# A tibble: 6 × 3
# Groups:   patient_id [6]
  patient_id no_show     n
       <dbl> <chr>   <int>
1     39218. No          1
2     43742. No          1
3     93780. No          1
4    141724. No          1
5    537615. No          1
6   5628261  Yes         1
Code
# tots %>%
#   ggplot() +
#   geom_histogram(aes(x = n))

# tots %>% arrange(-n)

# Bucket patients by their appointment count, then plot the share of all
# patient/no-show rows that falls in each bucket, faceted by status.
tots %>%
  mutate(n_group =
    case_when(
      n == 1        ~ "1",
      # BUG FIX: this branch covers counts 2..5, not 1..5 — the previous
      # label "1-5" overlapped the "1" bucket above.
      n > 1 & n < 6 ~ "2-5",
      TRUE          ~ "6+"
    )
  ) %>%
  ungroup() %>%
  group_by(no_show, n_group) %>%
  # Denominator is every row of tots, so percentages sum to 1 overall.
  summarise(percent = n() / nrow(tots)) %>%
  ungroup() %>%
  # janitor::tabyl(n_group,no_show) %>%
  as_tibble() %>%
  ggplot() +
  geom_col(aes(x = n_group, y = percent)) +
  facet_grid(~no_show, scales = "fixed")

On the noshow var: here “no”=appointment was made, and “yes”=appointment was noshow.

Patient load per month per clinic (here we pretend neighborhood is the clinic)

Code
# library(summarytools)

# Monthly patient load per clinic (neighbourhood stands in for clinic):
# distinct patients, total appointments, and appointments per patient.
dta_pts_per_month <- dta %>%
  mutate(appt_month = lubridate::month(appointment_day)) %>%
  group_by(appt_month, neighbourhood) %>%
  summarize(
    patients = n_distinct(patient_id),
    appts    = n()
  ) %>%
  ungroup() %>%
  # Keep months after April only.
  filter(appt_month > 4) %>%
  mutate(pa_ratio = appts / patients)

summarytools::descr(dta_pts_per_month)
Descriptive Statistics  
dta_pts_per_month  
N: 160  

                    appt_month     appts   pa_ratio   patients
----------------- ------------ --------- ---------- ----------
             Mean         5.49    670.58       1.42     446.88
          Std.Dev         0.50    823.45       0.23     509.53
              Min         5.00      1.00       1.00       1.00
               Q1         5.00    133.50       1.22      98.00
           Median         5.00    399.50       1.36     281.50
               Q3         6.00    843.50       1.58     591.50
              Max         6.00   5761.00       2.33    3375.00
              MAD         0.00    426.99       0.26     305.42
              IQR         1.00    705.50       0.36     492.75
               CV         0.09      1.23       0.16       1.14
         Skewness         0.02      2.69       0.55       2.45
      SE.Skewness         0.19      0.19       0.19       0.19
         Kurtosis        -2.01     10.37       0.33       8.44
          N.Valid       160.00    160.00     160.00     160.00
        Pct.Valid       100.00    100.00     100.00     100.00
Code
# %>% summarize(patient_avg=mean(patients),
#             appt_avg=mean(appts))

# Distribution of monthly appointment totals across clinics.
ggplot(dta_pts_per_month) +
  geom_histogram(aes(x = appts), bins = 20)

Check out distributions of all the vars including patients appts ratio at each clinic per month.

Code
# Render an HTML data-frame summary of the per-clinic monthly metrics.
month_summary <- summarytools::dfSummary(dta_pts_per_month,
                                         plain.ascii = FALSE)
print(month_summary,
      method       = "render",
      varnumbers   = FALSE,
      valid.col    = FALSE,
      graph.magnif = 0.76)

Data Frame Summary

dta_pts_per_month

Dimensions: 160 x 5
Duplicates: 0
Variable Stats / Values Freqs (% of Valid) Graph Missing
appt_month [numeric]
Min : 5
Mean : 5.5
Max : 6
5 : 81 ( 50.6% )
6 : 79 ( 49.4% )
0 (0.0%)
neighbourhood [character]
1. AEROPORTO
2. ANDORINHAS
3. ANTÔNIO HONÓRIO
4. ARIOVALDO FAVALESSA
5. BARRO VERMELHO
6. BELA VISTA
7. BENTO FERREIRA
8. BOA VISTA
9. BONFIM
10. CARATOÍRA
[ 71 others ]
2 ( 1.2% )
2 ( 1.2% )
2 ( 1.2% )
2 ( 1.2% )
2 ( 1.2% )
2 ( 1.2% )
2 ( 1.2% )
2 ( 1.2% )
2 ( 1.2% )
2 ( 1.2% )
140 ( 87.5% )
0 (0.0%)
patients [integer]
Mean (sd) : 446.9 (509.5)
min ≤ med ≤ max:
1 ≤ 281.5 ≤ 3375
IQR (CV) : 492.8 (1.1)
146 distinct values 0 (0.0%)
appts [integer]
Mean (sd) : 670.6 (823.5)
min ≤ med ≤ max:
1 ≤ 399.5 ≤ 5761
IQR (CV) : 705.5 (1.2)
153 distinct values 0 (0.0%)
pa_ratio [numeric]
Mean (sd) : 1.4 (0.2)
min ≤ med ≤ max:
1 ≤ 1.4 ≤ 2.3
IQR (CV) : 0.4 (0.2)
155 distinct values 0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.3.2)
2024-04-18

Now let's make a line graph of all the appointments. First let's test whether there is any difference between no-shows and shows for every day of observation.

We’ll first do a T test

Code
# Build a daily time line of appointment counts, split by no-show status.
dta_tl <-dta %>% 
  ungroup() %>% 
  # keep the outcome plus both date columns (appointment_day, scheduled_day)
  select(no_show,contains("_day")) %>% 
  #   add_count(appointment_day, name="n_appts") %>% 
  # add_count(scheduled_day, name="n_scheds") %>% 
  # reshape to one row per (no_show, date_type, date)
  pivot_longer(cols=all_of(c("appointment_day",
                           "scheduled_day")),
               names_to="date_type",
               values_to="date") %>% 
  # arrange(desc(date))
  # count rows for every unique combination of the remaining columns
  group_by(across(everything())) %>%
  summarise(n = n()) %>% 
  # summarise() leaves the data grouped by (no_show, date_type), so
  # complete() fills missing calendar days within each of those groups;
  # replace_na then turns the filled NAs into true zero counts
  complete(date = seq.Date(min(date), max(date), by="day")) %>% 
  mutate(n=replace_na(n,0)) %>% 
  ungroup() %>%
  # filter(date>as.Date("2016-04-01")) %>% 
  # keep only the appointment-day series for plotting/testing below
  filter(date_type=="appointment_day") %>%
  group_by(no_show) %>%
  arrange(date) %>% 
  # trailing mean over the current day plus the 7 preceding days —
  # NOTE(review): .before = days(7) makes this an 8-day window despite
  # the name mean_seven_day; confirm intent
  mutate(mean_seven_day = slide_index_dbl( #### Give it an index for a sliding window
    n,
    .i=date,
    .f = ~mean(.x, na.rm=TRUE),
    .before = days(7)
  )) %>% ungroup()


# dta_test<-dta_tl %>% 
#   select(date,no_show,n) %>% 
#   pivot_wider(names_from=no_show,
#               values_from=n)

# noshow_n_days<-dta_tl %>% filter(no_show=="No") %>% select(n)
# noshow_y_days<-dta_tl %>% filter(no_show=="Yes") %>% select(n)

# t.test(noshow_n_days$n ~ noshow_y_days$n)


## Welch two-sample t-test: do daily appointment counts differ between
## the show and no-show series? Result is tidied and rendered as a table.
t.test(n~no_show, data=dta_tl) %>% 
  broom::tidy() %>% 
  kableExtra::kable()
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
1607.049 2151.415 544.3659 5.984218 3e-07 45.16183 1066.219 2147.878 Welch Two Sample t-test two.sided

The difference is significant at least.

Now make the line graph, any changes over time?

Code
# Daily appointment counts (bars) with the trailing mean overlaid (line),
# faceted by no-show status. The legend is suppressed because only one
# date_type ("appointment_day") remains after the earlier filter.
dta_tl %>%
  ggplot(aes(x = date)) +
  geom_col(aes(y = n), fill = "grey80", alpha = 0.5) +
  # `linewidth` replaces the line-size aesthetic `size`, which is
  # deprecated for lines since ggplot2 3.4.
  geom_line(aes(y = mean_seven_day, color = date_type), linewidth = 0.5) +
  scale_x_date(date_labels = "%Y %b %d") +
  facet_wrap(~no_show, ncol = 1, scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Was the appointment a no show? Shown over duration of dataset")

Code
# Inspect the structure of the daily time-line tibble.
str(dta_tl)
tibble [82 × 5] (S3: tbl_df/tbl/data.frame)
 $ no_show       : chr [1:82] "No" "Yes" "No" "Yes" ...
 $ date_type     : chr [1:82] "appointment_day" "appointment_day" "appointment_day" "appointment_day" ...
 $ date          : Date[1:82], format: "2016-04-29" "2016-04-29" ...
 $ n             : int [1:82] 2602 633 0 0 0 0 3515 861 3425 831 ...
 $ mean_seven_day: num [1:82] 2602 633 1301 316 867 ...
Code
# install.packages("slider")
# library(slider)
# install.packages("tidyquant")

# summary(dta$appointment_day)

Looks like not much difference in temporal trends of noshows.

Cleaning and Cross Correlation

That was fun. Now let's clean up the data to perform a logistic regression and then see how to use prediction methods…

1) Clean up data

Code
# Clean the appointment data for modelling.
dta_clean <- dta %>%
  # Remove duplicate rows that differ only by appointment_id
  # (same patient, same details, same day).
  filter(!duplicated(across(-appointment_id))) %>%
  # Day-of-week of the appointment, plus a Friday indicator.
  mutate(wday = wday(appointment_day, label = TRUE, abbr = TRUE),
         is_friday = as.factor(if_else(wday == "Fri", 1, 0))) %>%
  # Total appointment volume at each clinic/neighbourhood.
  add_count(neighbourhood, name = "neigh_appts") %>%
  # Factorise covariates and derive the scheduling lead time.
  mutate(
    gender = as.factor(gender),
    gender_male = as.factor(if_else(gender == "M", 1, 0)),
    # Modelling outcome: 1 = attended, 0 = no-show.
    attended = as.factor(case_match(no_show,
                                    "No" ~ 1,
                                    "Yes" ~ 0)),
    # Days between booking the appointment and the appointment itself.
    days_since_call = as.double(difftime(appointment_day, scheduled_day,
                                         units = "days"))
  ) %>%
  mutate(across(scholarship:sms_received, as.factor)) %>%
  mutate(across(contains("_id"), as.factor)) %>%
  # Per-patient totals and a chronological running visit tally.
  arrange(patient_id, appointment_day) %>%
  add_count(patient_id, name = "appt_count") %>%
  group_by(patient_id) %>%
  # row_number() replaces the mutate(one = 1, cumsum(one)) idiom.
  mutate(appt_tally = row_number()) %>%
  ungroup()

# Overview of the cleaned data, excluding the ID columns.
dta_clean %>%
  select(!patient_id & !appointment_id) %>%
  skimr::skim()
Data summary
Name Piped data
Number of rows 106305
Number of columns 20
_______________________
Column type frequency:
character 2
Date 2
factor 11
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
neighbourhood 0 1 5 27 0 81 0
no_show 0 1 2 3 0 2 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
scheduled_day 0 1 2015-11-10 2016-06-08 2016-05-10 111
appointment_day 0 1 2016-04-29 2016-06-08 2016-05-18 27

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1 FALSE 2 F: 69588, M: 36717
scholarship 0 1 FALSE 2 0: 95880, 1: 10425
hipertension 0 1 FALSE 2 0: 85156, 1: 21149
diabetes 0 1 FALSE 2 0: 98572, 1: 7733
alcoholism 0 1 FALSE 2 0: 103330, 1: 2975
handcap 0 1 FALSE 5 0: 104186, 1: 1928, 2: 177, 3: 11
sms_received 0 1 FALSE 2 0: 70831, 1: 35474
wday 0 1 TRUE 6 Tue: 24876, Wed: 24818, Mon: 21803, Fri: 18062
is_friday 0 1 FALSE 2 0: 88243, 1: 18062
gender_male 0 1 FALSE 2 0: 69588, 1: 36717
attended 0 1 FALSE 2 1: 84615, 0: 21690

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 37.12 23.17 -1 18 37 56 115 ▇▇▇▂▁
neigh_appts 0 1 2615.11 1811.84 1 1332 2208 3169 7413 ▆▇▃▁▂
days_since_call 0 1 10.41 15.37 -6 0 4 15 179 ▇▁▁▁▁
appt_count 0 1 2.83 2.96 1 1 2 3 35 ▇▁▁▁▁
appt_tally 0 1 1.91 1.87 1 1 1 2 35 ▇▁▁▁▁
Code
# Preview the cleaned tibble.
dta_clean
# A tibble: 106,305 × 22
   patient_id   appointment_id gender scheduled_day appointment_day   age
   <fct>        <fct>          <fct>  <date>        <date>          <dbl>
 1 39217.84439  5751990        F      2016-05-31    2016-06-03         44
 2 43741.75652  5760144        M      2016-06-01    2016-06-01         39
 3 93779.52927  5712759        F      2016-05-18    2016-05-18         33
 4 141724.16655 5637648        M      2016-04-29    2016-05-02         12
 5 537615.28476 5637728        F      2016-04-29    2016-05-06         14
 6 5628261      5680449        M      2016-05-10    2016-05-13         13
 7 11831856     5718578        M      2016-05-19    2016-05-19         16
 8 22638656     5580835        F      2016-04-14    2016-05-03         22
 9 22638656     5715081        F      2016-05-18    2016-06-08         23
10 52168938     5704816        F      2016-05-16    2016-05-16         28
# ℹ 106,295 more rows
# ℹ 16 more variables: neighbourhood <chr>, scholarship <fct>,
#   hipertension <fct>, diabetes <fct>, alcoholism <fct>, handcap <fct>,
#   sms_received <fct>, no_show <chr>, wday <ord>, is_friday <fct>,
#   neigh_appts <int>, gender_male <fct>, attended <fct>,
#   days_since_call <dbl>, appt_count <int>, appt_tally <dbl>

2) Correlation matrix with cleaned data

Code
# install.packages("ggcorrplot")

# Keep only the factor covariates for the correlation matrix, dropping
# the ID columns, the multi-level handcap variable, the redundant
# gender_male dummy, and the day-of-week factor.
dta_clean_noid <- dta_clean %>%
  select(where(is.factor)) %>%
  select(!patient_id & !appointment_id & !handcap & !gender_male & !wday)

dta_clean_noid
# A tibble: 106,305 × 8
   gender scholarship hipertension diabetes alcoholism sms_received is_friday
   <fct>  <fct>       <fct>        <fct>    <fct>      <fct>        <fct>    
 1 F      0           0            0        0          0            1        
 2 M      0           0            1        0          0            0        
 3 F      0           0            0        0          0            0        
 4 M      0           0            0        0          0            0        
 5 F      0           0            0        0          1            1        
 6 M      0           0            0        0          0            1        
 7 M      0           0            0        0          0            0        
 8 F      0           0            0        0          1            0        
 9 F      0           0            0        0          1            0        
10 F      0           0            0        0          0            0        
# ℹ 106,295 more rows
# ℹ 1 more variable: attended <fct>
Code
# One-hot encode every factor (no intercept) and plot the pairwise
# correlation matrix of the resulting dummy columns.
model.matrix(~ 0 + ., data = dta_clean_noid) %>%
  cor(use = "pairwise.complete.obs") %>%
  # show.diag was the shorthand `F`; always spell out FALSE/TRUE.
  ggcorrplot::ggcorrplot(show.diag = FALSE, type = "lower",
                         lab = TRUE, lab_size = 2, digits = 3)

Predictions with Logistic Regression

Following workflow from: https://www.tidymodels.org/start/case-study/

1) Split into training and validation and test sets

Code
# Some of these vars are not necessary now or are better handled by the
# recipe; the outcome is still `attended`.
set.seed(123)
dta_clean2 <- dta_clean %>%
  ungroup() %>%
  select(!c(gender, scheduled_day, wday, is_friday, no_show,
            neighbourhood, patient_id))

# Hold out a test set, stratified on the outcome.
splits      <- initial_split(dta_clean2, strata = attended)

appts_other <- training(splits)
appts_test  <- testing(splits)

# appts_other %>%
#   count(attended) %>%
#   mutate(prop = n/sum(n))
#
# appts_test %>%
#   count(attended) %>%
#   mutate(prop = n/sum(n))

# Carve a validation set out of the non-test data (80% train / 20% val).
# `prop` takes a scalar — it was wrapped in c(), which rsample does not
# expect. NOTE(review): validation_split() is deprecated in newer
# rsample in favour of validation_set(); kept here for compatibility.
val_set <- validation_split(appts_other,
                            strata = attended,
                            prop = 0.8)

2) Build the model

Code
# LASSO-penalised logistic regression (mixture = 1 is pure L1); the
# penalty is left free for tuning, fit with the glmnet engine.
lr_mod <- set_engine(
  logistic_reg(penalty = tune(), mixture = 1),
  "glmnet"
)

3) Build the “recipe” to preprocess the data

Code
# Preprocessing recipe for the logistic model. Step order matters:
# date features are extracted before the raw date column is removed,
# and dummy encoding happens before zero-variance removal/normalisation.
lr_recipe <- recipe(attended ~ ., data = appts_other) %>% 
  # keep appointment_id for bookkeeping but exclude it from predictors
  update_role(appointment_id, new_role = "ID") %>% 
  # derive day-of-week and month, then drop the raw date
  step_date(appointment_day, features = c("dow", "month")) %>%               
  step_rm(appointment_day) %>% 
  # step_corr(all_numeric_predictors(), threshold = .5)
  # one-hot encode factors, drop constant columns, then centre/scale
  step_dummy(all_nominal_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_predictors())

4) Now we add a workflow, combining the model and recipe

Code
# Bundle the preprocessing recipe and the model spec into one workflow.
lr_workflow <- workflow() %>%
  add_recipe(lr_recipe) %>%
  add_model(lr_mod)

5) Create penalty values for tuning

Code
# 40 candidate penalties, log-spaced from 1e-4 to 1e-1.
lr_reg_grid <- tibble(penalty = 10^seq(from = -4, to = -1, length.out = 40))

6) Now we apply these each of these penalty values to the workflow (ie with recipe and model)

Code
set.seed(321)
# install.packages("glmnet")

# Fit the workflow at every candidate penalty on the validation split,
# keeping predictions so ROC curves can be recovered later.
lr_res <- lr_workflow %>% 
  tune_grid(val_set,
            grid = lr_reg_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))

# Penalties are log-spaced, so plot them on a log axis — a linear axis
# piles 39 of the 40 points into the left edge.
lr_plot <- 
  lr_res %>% 
  collect_metrics() %>% 
  ggplot(aes(x = penalty, y = mean)) + 
  geom_point() + 
  geom_line() + 
  scale_x_log10() +
  ylab("Area under the ROC Curve")

lr_plot 

Let's try to find the best model… looks like a higher penalty actually performs better!!

Code
# Metrics for every candidate penalty, sorted ascending, with a row id
# so individual candidates can be referenced by position.
lr_metrics <- collect_metrics(lr_res)
rowid_to_column(arrange(lr_metrics, penalty))
# A tibble: 40 × 8
   rowid  penalty .metric .estimator  mean     n std_err .config              
   <int>    <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1     1 0.0001   roc_auc binary     0.652     1      NA Preprocessor1_Model01
 2     2 0.000119 roc_auc binary     0.652     1      NA Preprocessor1_Model02
 3     3 0.000143 roc_auc binary     0.652     1      NA Preprocessor1_Model03
 4     4 0.000170 roc_auc binary     0.652     1      NA Preprocessor1_Model04
 5     5 0.000203 roc_auc binary     0.652     1      NA Preprocessor1_Model05
 6     6 0.000242 roc_auc binary     0.652     1      NA Preprocessor1_Model06
 7     7 0.000289 roc_auc binary     0.652     1      NA Preprocessor1_Model07
 8     8 0.000346 roc_auc binary     0.652     1      NA Preprocessor1_Model08
 9     9 0.000412 roc_auc binary     0.652     1      NA Preprocessor1_Model09
10    10 0.000492 roc_auc binary     0.652     1      NA Preprocessor1_Model10
# ℹ 30 more rows
Code
### Row 38 chosen by inspection: high mean roc_auc while also being the
### highest penalty among the near-ties.
lr_best <- lr_res %>%
  collect_metrics() %>%
  arrange(penalty) %>%
  filter(row_number() == 38)

OK, now let's see how the ROC curve looks for the best-fit logistic model….

Code
# ROC curve for the selected penalty. `.pred_0` is the predicted
# probability of the first factor level ("0"), which yardstick treats
# as the event by default.
best_preds <- collect_predictions(lr_res, parameters = lr_best)

lr_auc <- best_preds %>%
  roc_curve(attended, .pred_0) %>%
  mutate(model = "Logistic Regression")

autoplot(lr_auc)

Code
# Inspect the validation-set predictions for the chosen penalty.
collect_predictions(lr_res, parameters = lr_best)
# A tibble: 15,947 × 7
   id         .pred_0 .pred_1  .row penalty attended .config              
   <chr>        <dbl>   <dbl> <int>   <dbl> <fct>    <chr>                
 1 validation   0.206   0.794    14  0.0702 0        Preprocessor1_Model38
 2 validation   0.209   0.791    24  0.0702 0        Preprocessor1_Model38
 3 validation   0.208   0.792    26  0.0702 0        Preprocessor1_Model38
 4 validation   0.208   0.792    27  0.0702 0        Preprocessor1_Model38
 5 validation   0.205   0.795    31  0.0702 0        Preprocessor1_Model38
 6 validation   0.203   0.797    36  0.0702 0        Preprocessor1_Model38
 7 validation   0.205   0.795    39  0.0702 0        Preprocessor1_Model38
 8 validation   0.202   0.798    43  0.0702 0        Preprocessor1_Model38
 9 validation   0.203   0.797    44  0.0702 0        Preprocessor1_Model38
10 validation   0.203   0.797    53  0.0702 0        Preprocessor1_Model38
# ℹ 15,937 more rows

It's… not great!

But we can see how it would fit onto the test data now.

Code
# Refit with the chosen penalty. NOTE: the previous version passed
# importance="BIC" to set_engine — `importance` is a ranger argument
# (copied from the tidymodels case study's random-forest model) and does
# nothing for glmnet; vip() derives importance from the coefficients.
last_log_mod <- logistic_reg(penalty = lr_best$penalty[1], mixture = 1) %>%
  set_engine("glmnet")

# Swap the finalized model spec into the existing workflow.
last_log_workflow <- lr_workflow %>%
  update_model(last_log_mod)

# Fit on the full training data and evaluate once on the held-out test set.
last_lr_fit <- last_log_workflow %>%
  last_fit(splits)

# Show test-set metrics (accuracy and ROC AUC).
last_lr_fit %>%
  collect_metrics()
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.796 Preprocessor1_Model1
2 roc_auc  binary         0.696 Preprocessor1_Model1

About same ROC as with validation set…

7) Finally, we look at the top variables/features included

Code
# Variable-importance plot for the final glmnet fit: top 20 features,
# coloured by coefficient sign.
final_fit <- extract_fit_parsnip(last_lr_fit)
vip(final_fit, num_features = 20, mapping = aes(fill = Sign)) +
  scale_y_continuous(expand = c(0, 0))

Looks like the days since the last call was most important by far!!!

Next after was the SMS received… but from correlation matrix, it seemed to have an inverse effect. Should explore more!

Code
# 
# cor.test(dta_clean$sms_received, dta_clean$no_show)
#   
# chisq.test(dta_clean$sms_received, dta_clean$no_show)

DHIS2 Hypertension Registry Test Run

Code
# library(readxl)
# Import the DHIS2 hypertension registry export and standardise names.
dhis<-readxl::read_xls("dhis.xls") %>% 
  janitor::clean_names()

dhis_clean <- dhis %>%
  # Coerce date columns to Date and blood-pressure columns to numeric
  # (the previous comment said "character" but the code makes doubles).
  mutate(across(contains("date"), as.Date)) %>%
  mutate(across(contains("mm_hg"), as.double)) %>%
  # Surrogate patient ID from address + given name + date of birth.
  group_by(gen_address_current, gen_given_name, gen_date_of_birth) %>%
  mutate(patient_id = cur_group_id()) %>%
  mutate(
    # Dummy: does the patient have a named supporter?
    has_support = !is.na(supporters_name),
    sex = gen_sex,
    # Age in whole years at registration.
    age_at_entry = floor(lubridate::time_length(
      difftime(registration_date, gen_date_of_birth), "years"))
  ) %>%
  add_count(name = "total_patient_visits") %>% # visits per patient group
  ungroup() %>%
  # Chronological running tally of visits per patient.
  arrange(patient_id, event_date) %>%
  group_by(patient_id) %>%
  mutate(appt_tally = row_number()) %>%
  ungroup() %>%
  # Days elapsed since registration: event minus registration.
  # BUG FIX: the arguments were reversed (registration - event), which
  # made "days since entry" negative for every follow-up visit.
  mutate(days_since_entry = as.double(difftime(event_date, registration_date,
                                               units = "days"))) %>%
  # Drop the raw identifying columns.
  select(!starts_with("gen") & !supporters_name)

# dhis %>% 
#   count(has_support)

# dhis_clean %>% head()
# 
# dhis %>% 
#   colnames()

Still need to clean up the prescriptions etc…

Process it for the Table1

Code
# Baseline (first-visit) table: retention status, demographics, baseline
# SBP category, and baseline amlodipine prescription.
dhis_table <- dhis_clean %>%
  mutate(
    patient_returned = if_else(total_patient_visits > 1,
                               "1+ Visits After Entry",
                               "LTFU After Entry"),
    # case_when evaluates top-down, so each later branch already
    # excludes the higher thresholds.
    sbp_level = case_when(
      ncd_systole_mm_hg >= 140 ~ "very_high",
      ncd_systole_mm_hg >= 130 ~ "high",
      ncd_systole_mm_hg >= 120 ~ "elevated",
      is.na(ncd_systole_mm_hg) ~ NA,
      TRUE                     ~ "normal"
    ),
    sbp_level = fct_relevel(sbp_level,
                            c("normal", "elevated", "high", "very_high")),
    age_group = as.factor(cut(age_at_entry,
                              breaks = c(40, 55, 60, 75, 100),
                              include.lowest = TRUE,
                              labels = c("40-55", "55-60", "60-75", "75-100")))
  ) %>%
  select(patient_returned, sex, age_group, appt_tally,
         "sbp_baseline" = sbp_level,
         "amlodipine_baseline" = med_amlodipine) %>%
  filter(appt_tally == 1)

# hist(dhis_clean$age_at_entry)


table1::table1(
  ~ sex + age_group + sbp_baseline + amlodipine_baseline | factor(patient_returned),
  data = dhis_table
)
1+ Visits After Entry
(N=1526)
LTFU After Entry
(N=277)
Overall
(N=1803)
sex
Female 767 (50.3%) 142 (51.3%) 909 (50.4%)
Male 759 (49.7%) 135 (48.7%) 894 (49.6%)
age_group
40-55 51 (3.3%) 64 (23.1%) 115 (6.4%)
55-60 269 (17.6%) 62 (22.4%) 331 (18.4%)
60-75 1206 (79.0%) 151 (54.5%) 1357 (75.3%)
75-100 0 (0%) 0 (0%) 0 (0%)
sbp_baseline
normal 159 (10.4%) 10 (3.6%) 169 (9.4%)
elevated 119 (7.8%) 17 (6.1%) 136 (7.5%)
high 59 (3.9%) 56 (20.2%) 115 (6.4%)
very_high 1188 (77.9%) 194 (70.0%) 1382 (76.7%)
Missing 1 (0.1%) 0 (0%) 1 (0.1%)
amlodipine_baseline
Amlodipine(5mg) 183 (12.0%) 19 (6.9%) 202 (11.2%)
Missing 1343 (88.0%) 258 (93.1%) 1601 (88.8%)
Code
# dhis_clean %>% 
#   filter(is.na(ncd_systole_mm_hg))

# dhis_clean %>% 
#   count(total_patient_visits)

# dhis_clean %>% 
#   count()

From here:

  • Transform hypertension registry data into “long” appointment dataset

  • Define and import climate / econ variables and add them

  • Validation criteria for prediction model?

  • Cox PH - Discrete time or continuous time?

Analysis Plan - Outline

  • Import DHIS2 line list data

  • transform variables as needed and do a data inspection

    • avg total number of visits per client

    • appts per month enrolled

    • missing data analysis

    • days since last appt for each pnt

    • mean patient load per month per clinic

    • hist of total vists by age/region/sex

    • dummy vars for each of the medications prescribed

      • and whether prescribed within appropriate window since registration
  • crosstabs of patient level variables including by attendance status groups

    • registered without follow up

    • had follow up and never dropped out until endline

    • had a follow up

    • sex and 10year age bands

    • SBP above or below 180

    • BMI range at enrollment

  • Add date of next appt to each line

    • Variable if falls within time of ec or env shock

    • dummy var for season of enrollment

  • Patient return model

    • logistic reg

    • include all vars, then LASSO for vars that are relevant

    • bootstrap resampling

  • IFF enough patients return for 2+ FU visits (~40%) then also do a survival analysis for LTFU

    • Cox PH model

    • include all vars, then LASSO for vars that are relevant

    • bootstrap resampling