Hands-on Exercise 4

Author

Tan Zhi Hao

Published

February 4, 2023

Modified

March 11, 2023

# Load (installing when missing) every package used in this exercise.
# Each package is listed once; the original call repeated tidyverse and plotly.
pacman::p_load(
  ggstatsplot, tidyverse, plotly, crosstalk, DT, ggdist,
  gganimate, gifski, gapminder, FunnelPlotR, knitr
)

# Exam scores used by the statistical-plot examples below.
exam <- read_csv("data/Exam_data.csv")

# Fix the RNG seed so that simulation-based results are reproducible.
set.seed(1234)

# One-sample test of ENGLISH scores against a reference value of 60,
# drawn as a histogram with Bayesian statistics in the annotation.
exam %>%
  gghistostats(
    x          = ENGLISH,
    test.value = 60,
    type       = "bayes",
    xlab       = "English score"
  )

# Two-group comparison of MATHS scores by GENDER using a
# non-parametric ("np") test.
# NOTE(review): `messages` may no longer be a recognised argument in
# current ggstatsplot releases — confirm against the installed version.
exam %>%
  ggbetweenstats(
    x        = GENDER,
    y        = MATHS,
    type     = "np",
    messages = FALSE
  )

# Multi-group comparison of ENGLISH scores across RACE with a parametric
# ("p") test, plus pairwise comparisons whose p-values are FDR-adjusted.
# NOTE(review): `mean.ci` and `messages` may be ignored by current
# ggstatsplot releases — confirm against the installed version.
exam %>%
  ggbetweenstats(
    x                    = RACE,
    y                    = ENGLISH,
    type                 = "p",
    mean.ci              = TRUE,
    pairwise.comparisons = TRUE,
    pairwise.display     = "s",
    p.adjust.method      = "fdr",
    messages             = FALSE
  )

# Correlation between MATHS and ENGLISH scores as an annotated scatter
# plot; the marginal distributions are switched off.
exam %>%
  ggscatterstats(
    x        = MATHS,
    y        = ENGLISH,
    marginal = FALSE
  )

# Bin MATHS scores into four bands and test their association with GENDER.
# `include.lowest = TRUE` closes the first interval on the left so that a
# score of exactly 0 lands in the first band instead of becoming NA (with the
# default right-closed intervals, 0 itself would fall outside "(0,60]").
exam1 <- exam %>%
  mutate(
    MATHS_bins = cut(
      MATHS,
      breaks = c(0, 60, 75, 85, 100),
      include.lowest = TRUE
    )
  )

ggbarstats(
  exam1,
  x = MATHS_bins,
  y = GENDER
)

pacman::p_load(readxl, performance, parameters, see)
# Import the "data" worksheet of the Toyota Corolla workbook and print it.
car_resale <- read_xls(
  "data/ToyotaCorolla.xls",
  sheet = "data"
)
car_resale
# A tibble: 1,436 × 38
      Id Model       Price Age_0…¹ Mfg_M…² Mfg_Y…³     KM Quart…⁴ Weight Guara…⁵
   <dbl> <chr>       <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>  <dbl>   <dbl>
 1    81 TOYOTA Cor… 18950      25       8    2002  20019     100   1180       3
 2     1 TOYOTA Cor… 13500      23      10    2002  46986     210   1165       3
 3     2 TOYOTA Cor… 13750      23      10    2002  72937     210   1165       3
 4     3  TOYOTA Co… 13950      24       9    2002  41711     210   1165       3
 5     4 TOYOTA Cor… 14950      26       7    2002  48000     210   1165       3
 6     5 TOYOTA Cor… 13750      30       3    2002  38500     210   1170       3
 7     6 TOYOTA Cor… 12950      32       1    2002  61000     210   1170       3
 8     7  TOYOTA Co… 16900      27       6    2002  94612     210   1245       3
 9     8 TOYOTA Cor… 18600      30       3    2002  75889     210   1245       3
10    44 TOYOTA Cor… 16950      27       6    2002 110404     234   1255       3
# … with 1,426 more rows, 28 more variables: HP_Bin <chr>, CC_bin <chr>,
#   Doors <dbl>, Gears <dbl>, Cylinders <dbl>, Fuel_Type <chr>, Color <chr>,
#   Met_Color <dbl>, Automatic <dbl>, Mfr_Guarantee <dbl>,
#   BOVAG_Guarantee <dbl>, ABS <dbl>, Airbag_1 <dbl>, Airbag_2 <dbl>,
#   Airco <dbl>, Automatic_airco <dbl>, Boardcomputer <dbl>, CD_Player <dbl>,
#   Central_Lock <dbl>, Powered_Windows <dbl>, Power_Steering <dbl>,
#   Radio <dbl>, Mistlamps <dbl>, Sport_Model <dbl>, Backseat_Divider <dbl>, …
# Multiple linear regression of resale price on age, manufacturing year,
# mileage, weight and guarantee period.
model <- lm(
  Price ~ Age_08_04 + Mfg_Year + KM + Weight + Guarantee_Period,
  data = car_resale
)

# Print the call and the fitted coefficients.
model

Call:
lm(formula = Price ~ Age_08_04 + Mfg_Year + KM + Weight + Guarantee_Period, 
    data = car_resale)

Coefficients:
     (Intercept)         Age_08_04          Mfg_Year                KM  
      -2.637e+06        -1.409e+01         1.315e+03        -2.323e-02  
          Weight  Guarantee_Period  
       1.903e+01         2.770e+01  
# Variance inflation factors for every term of `model`.
model %>% check_collinearity()
# Check for Multicollinearity

Low Correlation

             Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
               KM 1.46 [1.37, 1.57]         1.21      0.68     [0.64, 0.73]
           Weight 1.41 [1.32, 1.51]         1.19      0.71     [0.66, 0.76]
 Guarantee_Period 1.04 [1.01, 1.17]         1.02      0.97     [0.86, 0.99]

High Correlation

      Term   VIF     VIF 95% CI Increased SE Tolerance Tolerance 95% CI
 Age_08_04 31.07 [28.08, 34.38]         5.57      0.03     [0.03, 0.04]
  Mfg_Year 31.16 [28.16, 34.48]         5.58      0.03     [0.03, 0.04]
# Keep the collinearity check so that it can be plotted.
check_c <- model %>% check_collinearity()
check_c %>% plot()

# Refit the regression with the same predictors as `model` except Mfg_Year.
model1 <- lm(
  Price ~ Age_08_04 + KM + Weight + Guarantee_Period,
  data = car_resale
)

# Normality-of-residuals check and its plot.
check_n <- model1 %>% check_normality()
check_n %>% plot()

# Heteroscedasticity check and its plot.
check_h <- model1 %>% check_heteroscedasticity()
check_h %>% plot()

# Combined panel of model diagnostics.
model1 %>% check_model()

# Coefficient plot built from the parameters table.
model1 %>%
  parameters() %>%
  plot()

# Coefficient plot with statistical annotations from ggstatsplot.
ggcoefstats(
  model1,
  output = "plot"
)

# Per-race summary of MATHS scores: group size, mean, standard deviation,
# 95% t-based confidence limits and the standard error of the mean.
exam <- read_csv("data/Exam_data.csv")
my_sum <- exam %>%
  group_by(RACE) %>%
  summarise(
    n = n(),
    mean = mean(MATHS),
    sd = sd(MATHS),
    lci = t.test(MATHS, conf.level = 0.95)$conf.int[1],
    uci = t.test(MATHS, conf.level = 0.95)$conf.int[2]
  ) %>%
  # Standard error of the mean is sd / sqrt(n). The earlier sd / sqrt(n - 1)
  # overstated it and disagreed with the t.test() limits above, which are
  # based on sqrt(n). Any `se` figures rendered before this fix are slightly
  # too large and need re-rendering.
  mutate(se = sd / sqrt(n))
knitr::kable(head(my_sum), format = 'html')
RACE n mean sd lci uci se
Chinese 193 76.50777 15.69040 74.28011 78.73544 1.132357
Indian 12 60.66667 23.35237 45.82928 75.50406 7.041005
Malay 108 57.44444 21.13478 53.41288 61.47601 2.043177
Others 9 69.66667 10.72381 61.42362 77.90971 3.791438
# Mean MATHS score per race (red point) with +/- one standard error bars.
ggplot(my_sum) +
  geom_errorbar(
    aes(
      x = RACE,
      ymin = mean - se,
      ymax = mean + se
    ),
    width = 0.2,
    colour = "black",
    alpha = 0.9,
    # ggplot2 3.4.0 deprecates `size` for line widths in favour of `linewidth`.
    linewidth = 0.5
  ) +
  # geom_point() already uses the identity stat, so it is not repeated here.
  geom_point(
    aes(
      x = RACE,
      y = mean
    ),
    color = "red",
    size = 1.5,
    alpha = 1
  ) +
  ggtitle("Standard error of mean maths score by race")

# Mean MATHS score per race (red point) with its 95% confidence interval.
ggplot(my_sum) +
  geom_errorbar(
    aes(
      x = RACE,
      ymin = lci,
      ymax = uci
    ),
    width = 0.2,
    colour = "black",
    alpha = 0.9,
    # ggplot2 3.4.0 deprecates `size` for line widths in favour of `linewidth`.
    linewidth = 0.5
  ) +
  # geom_point() already uses the identity stat, so it is not repeated here.
  geom_point(
    aes(
      x = RACE,
      y = mean
    ),
    color = "red",
    size = 1.5,
    alpha = 1
  ) +
  ggtitle("95% confidence interval of mean maths score by race")

# Point + multiple-interval plot of MATHS by RACE with ggdist defaults.
# The subtitle previously read "Multuple"; the typo is corrected.
exam %>%
  ggplot(aes(x = RACE,
             y = MATHS)) +
  stat_pointinterval() +
  labs(
    title = "Visualising confidence intervals of mean math score",
    subtitle = "Mean Point + Multiple-interval plot"
  )

# Median point with a single 95% quantile interval of MATHS by RACE.
# `.point` / `.interval` are arguments of ggdist::point_interval(), not of
# stat_pointinterval(); the summary is selected through `point_interval`.
# The labels now name the median, which is the statistic actually requested.
exam %>%
  ggplot(aes(x = RACE, y = MATHS)) +
  stat_pointinterval(
    .width = 0.95,
    point_interval = "median_qi"
  ) +
  labs(
    title = "Visualising confidence intervals of median math score",
    subtitle = "Median Point + Multiple-interval plot"
  )

# Same point + interval display of MATHS by RACE, with the legend hidden.
maths_by_race <- aes(x = RACE, y = MATHS)

ggplot(exam, mapping = maths_by_race) +
  stat_pointinterval(show.legend = FALSE) +
  labs(
    subtitle = "Mean Point + Multiple-interval plot",
    title = "Visualising confidence intervals of mean math score"
  )

# Gradient + interval plot of MATHS by RACE, filled in sky blue, legend shown.
ggplot(exam, mapping = aes(x = RACE, y = MATHS)) +
  stat_gradientinterval(fill = "skyblue", show.legend = TRUE) +
  labs(
    subtitle = "Gradient + interval plot",
    title = "Visualising confidence intervals of mean math score"
  )

# ungeviz is installed from GitHub. Install it only when it is missing, so
# that re-running the script does not reinstall the package every time.
if (!requireNamespace("ungeviz", quietly = TRUE)) {
  devtools::install_github("wilkelab/ungeviz")
}
library(ungeviz)

# Hypothetical outcome plot: jittered raw MATHS scores per race, overlaid
# with a horizontal line per race drawn from 25 resampled draws and animated
# one draw at a time.
ggplot(data = exam,
       (aes(x = factor(RACE), y = MATHS))) +
  geom_point(position = position_jitter(
    height = 0.3, width = 0.05),
    size = 0.4, color = "#0072B2", alpha = 1/2) +
  geom_hpline(data = sampler(25, group = RACE), height = 0.6, color = "#D55E00") +
  theme_bw() +
  # `.draw` is a generated column indicating the sample draw
  transition_states(.draw, transition_length = 1, state_length = 3)

# Hypothetical outcome plot: jittered MATHS scores per race with an animated
# horizontal line per race, one frame per resampled draw (25 draws).
score_jitter <- position_jitter(height = 0.3, width = 0.05)

exam %>%
  ggplot(mapping = aes(x = factor(RACE), y = MATHS)) +
  geom_point(
    position = score_jitter,
    size = 0.4,
    color = "#0072B2",
    alpha = 1/2
  ) +
  geom_hpline(
    data = sampler(25, group = RACE),
    height = 0.6,
    color = "#D55E00"
  ) +
  theme_bw() +
  transition_states(.draw, transition_length = 1, state_length = 3)

# Import the Jakarta COVID-19 data and convert every character column to a
# factor. across(where(...)) replaces the superseded mutate_if().
covid19 <- read_csv("data/COVID-19_DKI_Jakarta.csv") %>%
  mutate(across(where(is.character), as.factor))
# Funnel plot with default settings, one point per sub-district.
# NOTE(review): numerator and denominator are the reverse of the later
# funnel_plot() calls in this file (which use Death / Positive) — confirm
# this is intentional.
funnel_plot(
  group       = covid19$`Sub-district`,
  numerator   = covid19$Positive,
  denominator = covid19$Death
)

A funnel plot object with 267 points of which 0 are outliers. 
Plot is adjusted for overdispersion. 
# Funnel plot of deaths over positive cases per sub-district, using the "PR"
# data type and fixed axis ranges.
funnel_plot(
  group       = covid19$`Sub-district`,
  numerator   = covid19$Death,
  denominator = covid19$Positive,
  data_type   = "PR",
  xrange      = c(0, 6500),
  yrange      = c(0, 0.05)
)

A funnel plot object with 267 points of which 7 are outliers. 
Plot is adjusted for overdispersion. 
# Same "PR" funnel plot with point labels turned off and a custom title and
# axis labels.
funnel_plot(
  group       = covid19$`Sub-district`,
  numerator   = covid19$Death,
  denominator = covid19$Positive,
  data_type   = "PR",
  xrange      = c(0, 6500),
  yrange      = c(0, 0.05),
  label       = NA,
  title       = "Cumulative COVID-19 Fatality Rate by Cumulative Total Number of COVID-19 Positive Cases",
  x_label     = "Cumulative COVID-19 Positive Cases",
  y_label     = "Cumulative Fatality Rate"
)

A funnel plot object with 267 points of which 7 are outliers. 
Plot is adjusted for overdispersion. 
# Per-sub-district fatality rate and its binomial standard error; rows whose
# rate is not positive are dropped.
df <- covid19 %>%
  mutate(
    rate = Death / Positive,
    rate.se = sqrt((rate * (1 - rate)) / (Positive))
  ) %>%
  filter(rate > 0)

# Overall rate, weighting each sub-district by the inverse of its variance.
fit.mean <- weighted.mean(df$rate, w = 1 / df$rate.se^2)

# Control limits evaluated at every case count from 1 up to the largest
# observed number of positive cases.
number.seq <- seq(from = 1, to = max(df$Positive), by = 1)
limit.se <- sqrt((fit.mean * (1 - fit.mean)) / (number.seq))

number.ll95 <- fit.mean - 1.96 * limit.se
number.ul95 <- fit.mean + 1.96 * limit.se
number.ll999 <- fit.mean - 3.29 * limit.se
number.ul999 <- fit.mean + 3.29 * limit.se

dfCI <- data.frame(
  number.ll95, number.ul95,
  number.ll999, number.ul999,
  number.seq, fit.mean
)
# Hand-built funnel plot: fatality rate against the number of positive cases
# per sub-district, with 95% (dashed) and 99.9% (solid) control limits taken
# from `dfCI` and a horizontal reference line at the overall rate `fit.mean`.
p <- ggplot(df, aes(x = Positive, y = rate)) +
  # `label` is mapped here so that ggplotly() can show the sub-district name
  # in its tooltip.
  geom_point(aes(label = `Sub-district`),
             alpha = 0.4) +
  # ggplot2 3.4.0 deprecates `size` for line widths in favour of `linewidth`.
  geom_line(data = dfCI,
            aes(x = number.seq,
                y = number.ll95),
            linewidth = 0.4,
            colour = "grey40",
            linetype = "dashed") +
  geom_line(data = dfCI,
            aes(x = number.seq,
                y = number.ul95),
            linewidth = 0.4,
            colour = "grey40",
            linetype = "dashed") +
  geom_line(data = dfCI,
            aes(x = number.seq,
                y = number.ll999),
            linewidth = 0.4,
            colour = "grey40") +
  geom_line(data = dfCI,
            aes(x = number.seq,
                y = number.ul999),
            linewidth = 0.4,
            colour = "grey40") +
  # A single reference line. Mapping `fit.mean` from `dfCI` drew one identical
  # line per row of that data frame.
  geom_hline(yintercept = fit.mean,
             linewidth = 0.4,
             colour = "grey40") +
  coord_cartesian(ylim = c(0, 0.05)) +
  # Label the upper limits at their right-hand end. The previous labels were
  # placed at negative y values, outside the visible 0-0.05 range, and the
  # outer limits (z = 3.29) are 99.9% limits rather than 99%.
  annotate("text",
           x = max(dfCI$number.seq),
           y = dfCI$number.ul95[nrow(dfCI)],
           label = "95%", size = 3, colour = "grey40",
           hjust = 1, vjust = 1.5) +
  annotate("text",
           x = max(dfCI$number.seq),
           y = dfCI$number.ul999[nrow(dfCI)],
           label = "99.9%", size = 3, colour = "grey40",
           hjust = 1, vjust = -0.5) +
  ggtitle("Cumulative Fatality Rate by Cumulative Number of COVID-19 Cases") +
  xlab("Cumulative Number of COVID-19 Cases") +
  ylab("Cumulative Fatality Rate") +
  theme_light() +
  theme(plot.title = element_text(size = 12),
        legend.position = c(0.91, 0.85),
        legend.title = element_text(size = 7),
        legend.text = element_text(size = 7),
        legend.background = element_rect(colour = "grey60", linetype = "dotted"),
        legend.key.height = unit(0.3, "cm"))
p

# Convert the static funnel plot to an interactive plotly widget whose
# tooltip shows the label, x and y values of each point.
tooltip_fields <- c("label", "x", "y")
fp_ggplotly <- ggplotly(p, tooltip = tooltip_fields)
fp_ggplotly