#----------------------------------------------
# Name:    R Script - An Intuitive Introduction to R
# Author:  Okan Sarioglu
# GitHub:  Okan2022
# E-Mail:  o.sarioglu@gmx.de
# LinkedIn: @osarioglu
#----------------------------------------------

# This R script belongs to the course "An Intuitive Introduction to R".
# Its purpose is not only to let you read the code but also to execute
# it directly to get familiar with R. This script contains only the
# code from the course.

#-------------
# Instructions
#-------------

# Work through this R script from beginning to end. You only need to
# follow two rules.

# The rules are simple:

# First, execute these lines, since they are required
# (load packages and simulate data):

#=====================RUN THESE LINES FIRST====================================#

# Simulating the data
source(
  "https://raw.githubusercontent.com/Okan2022/bookdown_aiitr_en/main/script/simulating_data_aiitr.R"
)

#=====================RUN THESE LINES FIRST====================================#

# Check your environment. If the message
# "The simulated data has been loaded successfully!" appears in your
# console, everything is fine!

#====================IMPORTANT MESSAGE!!!!!====================================#

# Second, you cannot run individual lines of code at random. That may
# result in error messages. To guarantee success, always run everything
# between two "#------" dividers. Only then will the code run without
# errors.

# And now: enjoy the course!


#-----------------
# 1. Fundamentals
#-----------------

#-------------------------------------------------------------------------------

### Mathematical operations in R

# The hashtag lets you write comments

1 + 1 # Addition

1 - 1 # Subtraction

1 * 1 # Multiplication

1 / 1 # Division

2^(1 / 2) # Mixed term

#-------------------------------------------------------------------------------

### Comparison operators

1 < 3   # TRUE

5 >= 8  # FALSE

11 != 10 # TRUE

22 == 22 # TRUE

7 < 3   # FALSE

5 <= 2 + 3 # TRUE

#-------------------------------------------------------------------------------

### Logical operators (&, |, !)

5 & 4 < 8  # TRUE

5 | 4 < 8  # TRUE

!5 > 2     # FALSE

#-------------------------------------------------------------------------------

### Using functions

sqrt(x = 36) # Square root

exp(x = 0)   # Exponential function

print("Over 7 bridges you must go") # print arbitrary text

#-------------------------------------------------------------------------------

### Getting help

?exp()       # question-mark method
help(exp)    # help function

#-------------------------------------------------------------------------------

### Assigning objects and printing them

Pizza <- 7.50 # object Pizza

Coke <- 3.50  # object Coke

Pizza + Coke  # adding the two objects

#-------------------------------------------------------------------------------

Offer <- Pizza + Coke # store the sum

Offer   # print the object

Offer^2 # square the term

#-------------------------------------------------------------------------------

### Vectors

food <- c("Pizza", "Kebab", "Curry",
          "Fish", "Burrito") # food vector

print(food)

prices <- c(7.50, 6.00, 8.50, 3.00, 11.00) # price vector

print(prices)

coke_prices <- c(3.50, 3, 4, 2.50, 3) # coke prices

print(coke_prices)

#-------------------------------------------------------------------------------

combined_prices <- prices + coke_prices # add the prices

print(combined_prices)

#-------------------------------------------------------------------------------

### Object classes

# Check the classes
class(prices)
class(food)
class(coke_prices)

#-------------------------------------------------------------------------------

# Change the class of a variable
coke_prices_character <- as.character(coke_prices)

# Check it
class(coke_prices_character)
print(coke_prices_character)

#-------------------------------------------------------------------------------

### Building matrices

# Bind by columns
price_index <- cbind(food,
                     prices,
                     coke_prices)

print(price_index)

# Bind by rows
price_index2 <- rbind(food,
                      prices,
                      coke_prices)

print(price_index2)

#-------------------------------------------------------------------------------

# Simulate a matrix
matrix_example <- matrix(1:20, nrow = 4, ncol = 5, byrow = T)

# Check it
print(matrix_example)
dim(matrix_example)

# What happens if byrow = FALSE?
matrix_example2 <- matrix(1:20, nrow = 4, ncol = 5, byrow = F)

print(matrix_example2)
dim(matrix_example2)

#-------------------------------------------------------------------------------

### Working with matrices

row    <- 1
column <- 1

# Print it
print(object1 <- matrix_example[row, ])         # first row
print(object2 <- matrix_example[, column])      # first column
print(object3 <- matrix_example[row, column])   # value at first row, first column

print(matrix_example)

#-------------------------------------------------------------------------------

# More information

nrow(matrix_example) # number of rows
ncol(matrix_example) # number of columns
dim(matrix_example)  # dimensions

#-------------------------------------------------------------------------------

### Data frames

# Build an example data frame
df_example <- data.frame(
  country = c("Austria", "England", "Brazil", "Germany"),
  capital = c("Vienna", "London", "Brasilia", "Berlin"),
  pop     = c(9.04, 55.98, 215.3, 83.8),
  europe  = c(TRUE, FALSE, TRUE, TRUE)
)

# Check it
print(df_example)

#-------------------------------------------------------------------------------

# Access columns
df_example$country

# Inspect a single observation
df_example$country[3]

# Apply a condition to the data frame
df_example$country[df_example$pop > 60]

#-------------------------------------------------------------------------------

### If-else statements with one condition

grade <- 1.7

if (grade < 2) {
  print("Good Job")
} # if(test expression), then {body expression}

grade <- 2.5

if (grade < 2) {
  print("Good Job")
} # The condition is not met, so nothing happens

#-------------------------------------------------------------------------------

### If statements with an else branch

grade <- 3.3

if (grade <= 2) {
  print("Good Job")
} else {
  print("Life goes on")
}

grade <- 1.3

if (grade <= 2) {
  print("Good Job")
} else {
  print("Life goes on")
}

#-------------------------------------------------------------------------------

### The ifelse() function

ifelse(grade <= 2, "Good Job", "Life goes on")

#-------------------------------------------------------------------------------

### If-else ladders / multiple conditions

grade <- 3.3

if (grade == 1.0) {
  print("Amazing")
} else if (grade > 1.0 & grade <= 2.0) {
  print("Good Job")
} else if (grade > 2.0 & grade <= 3.0) {
  print("OK")
} else if (grade > 3.0 & grade <= 4.0) {
  print("Life goes on")
} else {
  print("No expression found")
}

grade <- 1.7

if (grade == 1.0) {
  print("Amazing")
} else if (grade > 1.0 & grade <= 2.0) {
  print("Good Job")
} else if (grade > 2.0 & grade <= 3.0) {
  print("OK")
} else if (grade > 3.0 & grade <= 4.0) {
  print("Life goes on")
} else {
  print("No expression found")
}

grade <- 5.0

if (grade == 1.0) {
  print("Amazing")
} else if (grade > 1.0 & grade <= 2.0) {
  print("Good Job")
} else if (grade > 2.0 & grade <= 3.0) {
  print("OK")
} else if (grade > 3.0 & grade <= 4.0) {
  print("Life goes on")
} else {
  print("No expression found")
}

#-------------------------------------------------------------------------------

### Nested ifelse()

grade <- 1.7

ifelse(grade == 1.0, "Amazing",
       ifelse(grade > 1 & grade <= 2, "Good Job",
              ifelse(grade > 2 & grade <= 3, "OK",
                     ifelse(grade > 3 & grade <= 4, "Life goes on",
                            "No expression found"
                     )
              )
       )
)

grade <- 3.3

ifelse(grade == 1.0, "Amazing",
       ifelse(grade > 1 & grade <= 2, "Good Job",
              ifelse(grade > 2 & grade <= 3, "OK",
                     ifelse(grade > 3 & grade <= 4, "Life goes on",
                            "No expression found")
              )
       )
)

# Logic: ifelse(test expression, value if TRUE, ifelse(test expression 2, ...))


#----------------------
# 2. Data Manipulation
#----------------------

#-------------------------------------------------------------------------------

### Loading packages

if (!require("pacman")) install.packages("pacman")

pacman::p_load("tidyverse", "psych", "gapminder")

#-------------------------------------------------------------------------------

### The ESS data
# The simulated ESS dataset has already been loaded via source().
# Let us have a look:

head(ess)

#-------------------------------------------------------------------------------

### Pipelines / piping

# Vector with three random numbers
q <- c(6, 3, 8)

# First mean, then exponent, then square root — nested
sqrt(exp(mean(q)))

# Using a pipe
q %>%
  mean() %>%
  exp() %>%
  sqrt()

#-------------------------------------------------------------------------------

### The filter() function: one condition

# Keep only respondents from Hungary
d1 <- ess %>%
  filter(cntry == "HU")

head(d1)

# Keep only respondents younger than 40
d2 <- ess %>%
  filter(agea <= 40)

head(d2)

#-------------------------------------------------------------------------------

### The filter() function: multiple conditions

# Keep cases from Hungary and France
d1 <- ess %>%
  filter(cntry %in% c("HU", "FR"))

head(d1)

# Keep cases under 40 from Hungary and France
d2 <- ess %>%
  filter(cntry %in% c("HU", "FR") &
           agea <= 40)

head(d2)

#-------------------------------------------------------------------------------

### The select() function

# Select the relevant variables
d1 <- ess %>%
  select(year, cntry, happy, agea, gndr, eisced)

head(d1)

# Drop a column by prefixing it with a minus sign
d2 <- d1 %>%
  select(-agea)

head(d2)

#-------------------------------------------------------------------------------

### Combining select() with filter()

d1 <- ess %>%
  filter(agea < 40) %>%
  select(year, cntry, happy, agea, gndr, eisced)

head(d1)

#-------------------------------------------------------------------------------

### The arrange() function

# Ascending order
d1 <- ess %>%
  filter(agea < 40) %>%
  select(year, cntry, happy, agea, gndr, eisced) %>%
  arrange(agea)

head(d1)

# Descending order
d1 <- ess %>%
  filter(agea < 40) %>%
  select(year, cntry, happy, agea, gndr, eisced) %>%
  arrange(desc(agea))

head(d1)

#-------------------------------------------------------------------------------

### rename() and relocate()

# Rename variables
d1 <- ess %>%
  filter(agea < 40) %>%
  select(year, cntry, happy, agea, gndr, eisced) %>%
  arrange(desc(agea)) %>%
  rename(country   = cntry,
         age       = agea,
         education = eisced,
         female    = gndr)

head(d1)

#-------------------------------------------------------------------------------

# Reorder variables
d1 <- ess %>%
  filter(agea < 40) %>%
  select(year, cntry, happy, agea, gndr, eisced) %>%
  arrange(desc(agea)) %>%
  rename(country   = cntry,
         age       = agea,
         education = eisced,
         female    = gndr) %>%
  relocate(education, age, female, country, happy, year)

head(d1)

# Move after a column
d2 <- ess %>%
  filter(agea < 40) %>%
  select(year, cntry, happy, agea, gndr, eisced) %>%
  arrange(desc(agea)) %>%
  rename(country   = cntry,
         age       = agea,
         education = eisced,
         female    = gndr) %>%
  relocate(country, .after = age)

head(d2)

# Move before a column
d3 <- ess %>%
  filter(agea < 40) %>%
  select(year, cntry, happy, agea, gndr, eisced) %>%
  arrange(desc(agea)) %>%
  rename(country   = cntry,
         age       = agea,
         education = eisced,
         female    = gndr) %>%
  relocate(country, .before = age)

head(d3)

#-------------------------------------------------------------------------------

### The mutate() function

# Mutate a variable
d1 <- ess %>%
  mutate(happy_10 = happy * 10)

head(d1)

#-------------------------------------------------------------------------------

# More mutating with class conversion
d2 <- ess %>%
  mutate(new_variable = happy * 10 / eisced + 67,
         female_char  = as.character(gndr)) %>%
  select(female_char, new_variable)

head(d2)

#-------------------------------------------------------------------------------

### Recoding with mutate() and recode()

# Overview of the variable
table(ess$happy)

# Recode the variables
d1 <- ess %>%
  mutate(
    gndr_fac  = as.factor(gndr),
    happy_cat = dplyr::recode(happy,
                              `1`  = 0, `2`  = 0, `3`  = 0, `4`  = 0,
                              `5`  = 1,
                              `6`  = 2, `7`  = 2, `8`  = 2, `9`  = 2, `10` = 2,
                              `77` = NA_real_, `88` = NA_real_, `99` = NA_real_),
    female    = dplyr::recode(gndr_fac,
                              `1` = "Male",
                              `2` = "Female",
                              `9` = NA_character_))

# Check the result
table(d1$happy_cat)
table(d1$female)

#-------------------------------------------------------------------------------

### Recoding with mutate() and case_when()

d1 <- ess %>%
  mutate(
    gndr_fac  = as.factor(gndr),
    happy_cat = case_when(
      happy < 5  ~ 0,
      happy == 5 ~ 1,
      happy > 5  ~ 2),
    female = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female"
    ))

table(d1$female)
table(d1$happy_cat)

#-------------------------------------------------------------------------------

### Recoding with mutate() and ifelse()

d1 <- ess %>%
  mutate(
    gndr_fac  = as.factor(gndr),
    happy_cat = ifelse(happy < 5, 0,
                       ifelse(happy == 5, 1,
                              ifelse(happy > 5, 2, NA))),
    female    = ifelse(gndr_fac == 1, "Male",
                       ifelse(gndr_fac == 2, "Female", NA))
  )

table(d1$happy_cat)
table(d1$female)

#-------------------------------------------------------------------------------

### Handling missing values

# Full workflow: filter, select, arrange, rename, mutate
d1 <- ess %>%
  filter(agea >= 40) %>%
  select(year, cntry, netusoft, agea, eisced, gndr, happy) %>%
  arrange(desc(agea)) %>%
  rename(
    internet_use = netusoft,
    age          = agea,
    education    = eisced,
    female       = gndr) %>%
  mutate(
    internet_use = case_when(
      internet_use > 5 ~ NA_real_,
      TRUE ~ internet_use),
    age = case_when(
      age == 999 ~ NA_real_,
      TRUE ~ age),
    education = case_when(
      education %in% c(55, 77, 88, 99) ~ NA_real_,
      TRUE ~ education),
    female = case_when(
      female == 1 ~ 0,
      female == 2 ~ 1,
      female == 9 ~ NA_real_,
      TRUE ~ female),
    happy = case_when(
      happy %in% c(77, 88, 99) ~ NA_real_,
      TRUE ~ happy)
  ) %>%
  arrange(age)

head(d1)

#-------------------------------------------------------------------------------

# Drop NAs with tidyverse
d2 <- d1 %>%
  drop_na()

colSums(is.na(d2))

# Drop NAs with base R
d3 <- na.omit(d1)

colSums(is.na(d3))

#-------------------------------------------------------------------------------

### group_by() and summarize() — one grouping variable

d1 <- ess %>%
  mutate(
    gndr_fac = as.factor(gndr),
    female = case_when(
      gndr_fac == 1 ~ "Male",
      gndr_fac == 2 ~ "Female",
      gndr_fac == 9 ~ NA_character_
    )) %>%
  drop_na() %>%
  group_by(female) %>%
  dplyr::summarize(average_happiness = mean(happy))

head(d1)

#-------------------------------------------------------------------------------

### group_by() and summarize() — multiple grouping variables and metrics

d1 <- ess %>%
  mutate(
    country  = cntry,
    gndr_fac = as.factor(gndr),
    female = case_when(
      gndr_fac == 1 ~ "Male",
      gndr_fac == 2 ~ "Female",
      gndr_fac %in% c(77, 88, 99) ~ NA_character_),
    age = case_when(
      agea == 999 ~ NA_real_,
      TRUE ~ agea)
  ) %>%
  drop_na() %>%
  group_by(country, female) %>%
  dplyr::summarize(average_happiness = mean(happy),
                   median_happiness  = median(happy),
                   average_age       = mean(age),
                   median_age        = median(age))

head(d1)

#-------------------------------------------------------------------------------

### Merging datasets — simulating World Bank data

countries <- c("BE", "BG", "CH", "EE", "FR", "GB")
indicator <- c("NY.GDP.PCAP.CD", "TX.VAL.FUEL.ZS.UN", "EN.ATM.CO2E.KT")

# Simulate the data (alternative to the World Bank API call)
wb <- data.frame(
  iso2c             = c("BE", "BG", "CH", "EE", "FR", "GB"),
  NY.GDP.PCAP.CD    = c(45587.97, 10148.34, 85897.78, 23565.18,
                        39179.74, 40217.01),
  TX.VAL.FUEL.ZS.UN = c(5.021, 4.644, 0.6111, 4.863, 1.886, 7.062),
  EN.ATM.CO2E.KT    = c(85364.10, 34138.10, 34916.10, 7097.52,
                        267154.70, 308650.30)
)

head(wb)

# Clean the World Bank data
wb <- wb %>%
  select(iso2c, NY.GDP.PCAP.CD, TX.VAL.FUEL.ZS.UN, EN.ATM.CO2E.KT) %>%
  arrange(iso2c) %>%
  rename(gdp_per_cap = NY.GDP.PCAP.CD,
         oil_exp     = TX.VAL.FUEL.ZS.UN,
         co2         = EN.ATM.CO2E.KT) %>%
  mutate(oil_exp = round(oil_exp, 2))

head(wb)

#-------------------------------------------------------------------------------

# Prepare ESS for merging
d1 <- ess %>%
  filter(cntry == c("BE", "BG", "CZ", "EE", "FI")) %>%
  rename(iso2c = cntry) %>%
  group_by(iso2c, year) %>%
  summarise(happy_agg = round(mean(happy), 2))

head(d1)

#-------------------------------------------------------------------------------

### left_join() and right_join() with one identifier

merged_data <- left_join(d1, wb,
                         by = "iso2c")

head(merged_data)

merged_data2 <- right_join(d1, wb,
                           by = "iso2c")

head(merged_data2)

#-------------------------------------------------------------------------------

### left_join() and right_join() with two identifiers

# New World Bank data with a year variable
wb <- data.frame(
  iso3c             = c("BEL", "BEL", "BGR", "BGR"),
  iso2c             = c("BE", "BE", "BG", "BG"),
  year              = c(2019, 2020, 2019, 2020),
  NY.GDP.PCAP.CD    = c(46641.72, 45587.97, 9874.336, 10148.34),
  TX.VAL.FUEL.ZS.UN = c(7.38, 5.02, 9.53, 4.64),
  EN.ATM.CO2E.KT    = c(92989.4, 85364.10, 39159.9, 34138.10)
)

# Clean it
wb <- wb %>%
  select(-iso3c) %>%
  arrange(iso2c) %>%
  rename(gdp_per_cap = NY.GDP.PCAP.CD,
         oil_exp     = TX.VAL.FUEL.ZS.UN,
         co2         = EN.ATM.CO2E.KT) %>%
  mutate(oil_exp = round(oil_exp, 2))

head(wb)

# Data with year for merging
d1 <- data.frame(
  iso2c     = c("BE", "BE", "BG", "BG", "CZ", "CZ"),
  year      = c(2019, 2020, 2019, 2020, 2019, 2020),
  happy_agg = c(5.95, 6.76, 6.56, 7.54, 6.27, 6.88)
)

head(d1)

# left_join() with two identifiers
merged_data3 <- left_join(d1, wb,
                          by = c("iso2c", "year"))

head(merged_data3)

# right_join() with two identifiers
merged_data4 <- right_join(d1, wb,
                           by = c("iso2c", "year"))

head(merged_data4)


#----------------------
# 3. Data Visualisation
#----------------------

#-------------------------------------------------------------------------------

pacman::p_load("tidyverse", "babynames", "sf", "ggridges",
               "rnaturalearth", "rnaturalearthdata", "forcats", "tmap")

#-------------------------------------------------------------------------------

### Introduction to ggplot2

ggplot() # empty frame

#-------------------------------------------------------------------------------

### Histograms — basic histogram

# Overview of data1
glimpse(data1)

ggplot(data1, aes(x = value)) +
  geom_histogram()

#-------------------------------------------------------------------------------

# Histogram with colour and fill
ggplot(data1, aes(x = value)) +
  geom_histogram(color = "white", fill = "#69b3a2")

#-------------------------------------------------------------------------------

# Histogram with axis labels and scaling
ggplot(data1, aes(x = value)) +
  geom_histogram(color = "white", fill = "#69b3a2") +
  labs(
    x = "Value",
    y = "Count",
    title = "A Histogram") +
  scale_x_continuous(breaks = seq(-4, 4, 1),
                     limits = c(-4, 4))

#-------------------------------------------------------------------------------

# Histogram with theme_minimal()
ggplot(data1, aes(x = value)) +
  geom_histogram(color = "white", fill = "#69b3a2") +
  labs(
    x = "Value",
    y = "Count",
    title = "A Histogram") +
  scale_x_continuous(breaks = seq(-4, 4, 1),
                     limits = c(-4, 4)) +
  theme_minimal()

#-------------------------------------------------------------------------------

### Histogram with binwidth

# binwidth = 0.1
ggplot(data1, aes(x = value)) +
  geom_histogram(color = "white", fill = "#69b3a2",
                 binwidth = 0.1) +
  labs(
    x = "Value",
    y = "Count",
    title = "A Histogram with binwidth = 0.1") +
  scale_x_continuous(breaks = seq(-4, 4, 1),
                     limits = c(-4, 4)) +
  theme_minimal()

# binwidth = 0.6
ggplot(data1, aes(x = value)) +
  geom_histogram(color = "white", fill = "#69b3a2",
                 binwidth = 0.6) +
  labs(
    x = "Value",
    y = "Count",
    title = "A Histogram with binwidth = 0.6") +
  scale_x_continuous(breaks = seq(-4, 4, 1),
                     limits = c(-4, 4)) +
  theme_minimal()

#-------------------------------------------------------------------------------

### Multiple histograms

# Two histograms in one plot
ggplot(data2, aes(x = value, fill = type)) +
  geom_histogram(color = "#e9ecef",
                 position = "identity") +
  theme_bw()

# Two histograms with transparency and manual colours
ggplot(data2, aes(x = value, fill = type)) +
  geom_histogram(color = "#e9ecef",
                 alpha = 0.6,
                 position = "identity") +
  scale_fill_manual(values = c("#8AA4D6", "#E89149")) +
  theme_bw()

#-------------------------------------------------------------------------------

### Density plots

# Basic density plot
ggplot(data1, aes(x = value)) +
  geom_density()

# Density plot with styling
ggplot(data1, aes(x = value)) +
  geom_density(color = "white",
               fill = "orange",
               alpha = 0.6) +
  labs(
    x = "Value",
    y = "Density",
    title = "A Density Plot") +
  scale_x_continuous(breaks = seq(-4, 4, 1),
                     limits = c(-4, 4)) +
  theme_minimal()

# Multiple density plots
ggplot(data2, aes(x = value, fill = type)) +
  geom_density(color = "#0a0a0a",
               alpha = 0.9,
               position = "identity") +
  scale_fill_manual(values = c("#FDE725FF",
                               "#440154FF")) +
  theme_minimal()

#-------------------------------------------------------------------------------

### Boxplots

# Basic boxplot
ggplot(data1, aes(x = value)) +
  geom_boxplot()

# Nice-looking boxplot
ggplot(data1, aes(x = value)) +
  geom_boxplot() +
  labs(
    x = "Value",
    y = "Frequency",
    title = "A Boxplot") +
  scale_x_continuous(breaks = seq(-4, 4, 1),
                     limits = c(-4, 4)) +
  theme_classic()

#-------------------------------------------------------------------------------

### Multiple boxplots: income by age group

# Basic grouped boxplot
ggplot(data3, aes(x = age,
                  y = income, fill = age)) +
  geom_boxplot()

# Nice-looking boxplot with adjustments
ggplot(data3, aes(x = age, y = income, fill = age)) +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  scale_fill_manual(values = c("#acf6c8", "#ecec53", "#D1BC8A")) +
  labs(
    title = "Comparison of Income Distribution by Age Group",
    x = "Age",
    y = "Income"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

#-------------------------------------------------------------------------------

### Ranking: bar plot

# Basic bar plot
ggplot(data4, aes(x = name, y = strength)) +
  geom_bar(stat = "identity")

# Nice-looking bar plot
ggplot(data4, aes(x = name, y = strength)) +
  geom_bar(stat = "identity", fill = "#AE388B") +
  labs(
    x = "",
    y = "Strength",
    title = "Strength of Fictional Characters"
  ) +
  theme_test()

#-------------------------------------------------------------------------------

### Bar plot for counting categories

ggplot(data5, aes(x = hero)) +
  geom_bar(fill = "#AE388B") +
  labs(
    x = "",
    y = "Frequency",
    title = "Who is your favourite character?"
  ) +
  scale_y_continuous(breaks = seq(0, 10, 1)) +
  theme_test()

#-------------------------------------------------------------------------------

### Flipping bar plots with coord_flip()

# Plot 1
ggplot(data4, aes(x = name, y = strength)) +
  geom_bar(stat = "identity", fill = "#AE388B") +
  labs(
    x = "",
    y = "Strength",
    title = "Strength of Fictional Characters"
  ) +
  theme_test() +
  coord_flip()

# Plot 2
ggplot(data5, aes(x = hero)) +
  geom_bar(fill = "#AE388B") +
  labs(
    x = "",
    y = "Frequency",
    title = "Who is your favourite character?"
  ) +
  scale_y_continuous(breaks = seq(0, 10, 1)) +
  theme_test() +
  coord_flip()

#-------------------------------------------------------------------------------

### Sorting with fct_reorder()

# Plot 1 — sorted
ggplot(data4, aes(x = fct_reorder(name, strength), y = strength)) +
  geom_bar(stat = "identity", fill = "#AE388B") +
  labs(
    x = "",
    y = "Strength",
    title = "Strength of Fictional Characters"
  ) +
  theme_test()

# Plot 2 — sorted and flipped
ggplot(data4, aes(x = fct_reorder(name, strength), y = strength)) +
  geom_bar(stat = "identity", fill = "#AE388B") +
  labs(
    x = "",
    y = "Strength",
    title = "Strength of Fictional Characters"
  ) +
  theme_test() +
  coord_flip()

#-------------------------------------------------------------------------------

### Grouped and stacked bar plots

# Grouped bar plot side by side (dodge)
ggplot(data6, aes(x = age, y = value, fill = female)) +
  geom_bar(position = "dodge", stat = "identity")

# Stacked bar plot (stack)
ggplot(data6, aes(x = age, y = value, fill = female)) +
  geom_bar(position = "stack", stat = "identity")

#-------------------------------------------------------------------------------

### Bar plots with colour palettes

# Plot 1 — dodge with scale_fill_brewer
ggplot(data6, aes(x = age, y = value, fill = female)) +
  geom_bar(position = "dodge", stat = "identity",
           width = 0.35) +
  scale_fill_brewer(palette = "Accent") +
  scale_y_continuous(breaks = seq(0, 15, 1)) +
  labs(
    x = "Age Group",
    y = "Average Well-being Score",
    title = "Effect of Age on Well-being\nby Gender"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Plot 2 — stack with scale_fill_brewer
ggplot(data6, aes(x = age, y = value, fill = female)) +
  geom_bar(position = "stack", stat = "identity",
           width = 0.35) +
  scale_fill_brewer(palette = "Accent") +
  scale_y_continuous(breaks = seq(0, 15, 2)) +
  labs(
    x = "Age Group",
    y = "Average Well-being Score",
    title = "Effect of Age on Well-being by Gender"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())

#-------------------------------------------------------------------------------

### Line charts

# Basic line chart
ggplot(data7, aes(x = date, y = y)) +
  geom_line()

# Dashed line chart with adjustments
ggplot(data7, aes(x = date, y = y)) +
  geom_line(color = "#0F52BA", linetype = "dashed",
            linewidth = 1) +
  scale_y_continuous(breaks = seq(-1, 6, 1),
                     limits = c(-1, 6)) +
  scale_x_continuous(breaks = seq(2000, 2024, 2)) +
  labs(
    y = "",
    x = "Year",
    title = "A Line Chart"
  ) +
  theme_bw()

#-------------------------------------------------------------------------------

### Multiple lines

# Two lines
ggplot(data7) +
  geom_line(aes(x = date, y = y)) +
  geom_line(aes(x = date, y = y2))

# Multiple lines with individual styling
ggplot(data7) +
  geom_line(aes(x = date, y = y),
            linetype = "twodash",
            linewidth = 1,
            color = "#365E32") +
  geom_line(aes(x = date, y = y2),
            linetype = "longdash",
            linewidth = 1,
            color = "#FD9B63") +
  scale_y_continuous(breaks = seq(-5, 6, 1),
                     limits = c(-5, 6)) +
  scale_x_continuous(breaks = seq(2000, 2024, 2)) +
  labs(
    y = "",
    x = "Year",
    title = "A Line Chart"
  ) +
  theme_bw()

#-------------------------------------------------------------------------------

### Grouped line charts — baby names

# Inspect the dataset
head(babynames)

# Restrict to three names
babynames_cut <- babynames %>%
  filter(name %in% c("Emma", "Kimberly", "Ruth")) %>%
  filter(sex == "F")

# Basic grouped line chart
ggplot(babynames_cut, aes(x = year, y = n,
                          group = name,
                          color = name)) +
  geom_line()

# Nicely styled line chart
ggplot(babynames_cut, aes(x = year, y = n,
                          group = name,
                          color = name)) +
  geom_line(linewidth = 1) +
  scale_color_brewer(palette = "Set1") +
  labs(
    x = "Year",
    y = "Number of babies with this name",
    title = "Popularity of Baby Names Over Time",
    color = "Name"
  ) +
  theme_minimal()

#-------------------------------------------------------------------------------

### Scatter plots — basic scatter plot

ggplot(data_point, aes(x = marketing_budget,
                       y = sales)) +
  geom_point()

# With styling
ggplot(data_point, aes(x = marketing_budget,
                       y = sales)) +
  geom_point(color = "#99582a") +
  scale_x_continuous(breaks = seq(0, 10000, 2000)) +
  labs(
    x = "Marketing Budget",
    y = "Sales per Unit",
    title = "Chocolate Milk: Sales and Marketing"
  ) +
  theme_classic()

#-------------------------------------------------------------------------------

### Scatter plots with multiple groups

# Coloured by group
ggplot(data8, aes(x = marketing_budget,
                  y = sales,
                  color = name)) +
  geom_point() +
  scale_color_manual(values = c("#e71d36",
                                "#260701")) +
  scale_x_continuous(breaks = seq(0, 10000, 2000)) +
  labs(
    x = "Marketing Budget",
    y = "Sales per Unit",
    title = "Chocolate: Sales and Marketing",
    color = "Product"
  ) +
  theme_classic()

# Using shapes instead of colours
ggplot(data8, aes(x = marketing_budget,
                  y = sales,
                  shape = name)) +
  geom_point(size = 2.5) +
  scale_x_continuous(breaks = seq(0, 10000, 2000)) +
  labs(
    x = "Marketing Budget",
    y = "Sales per Unit",
    title = "Chocolate: Sales and Marketing",
    shape = "Product"
  ) +
  theme_classic()

# Combining colour AND shape
ggplot(data8, aes(x = marketing_budget,
                  y = sales,
                  shape = name,
                  color = name)) +
  geom_point(size = 2.5) +
  scale_color_manual(values = c("#e71d36",
                                "#260701")) +
  scale_x_continuous(breaks = seq(0, 10000, 2000)) +
  labs(
    x = "Marketing Budget",
    y = "Sales per Unit",
    title = "Chocolate: Sales and Marketing",
    shape = "",
    color = ""
  ) +
  theme_classic()

#-------------------------------------------------------------------------------

### Plots with facet_wrap()

# Simple facet_wrap by quarter
ggplot(data8, aes(x = marketing_budget,
                  y = sales)) +
  geom_point() +
  facet_wrap(~ quarters)

# With styling
ggplot(data8, aes(x = marketing_budget,
                  y = sales)) +
  geom_point(color = "#99582a") +
  scale_x_continuous(breaks = seq(0, 10000, 2000)) +
  labs(
    x = "Marketing Budget",
    y = "Sales per Unit",
    title = "Chocolate: Sales and Marketing"
  ) +
  theme_classic() +
  facet_wrap(~ quarters)

# facet_wrap combined with shape and colour
ggplot(data8, aes(x = marketing_budget,
                  y = sales,
                  shape = name,
                  color = name)) +
  geom_point(size = 2.5) +
  scale_color_manual(values = c("#e71d36",
                                "#260701")) +
  scale_x_continuous(breaks = seq(0, 10000, 2000)) +
  labs(
    x = "Marketing Budget",
    y = "Sales per Unit",
    title = "Chocolate: Sales and Marketing",
    shape = "",
    color = ""
  ) +
  theme_classic() +
  facet_wrap(~ quarters)

#-------------------------------------------------------------------------------

### facet_grid() with city x year

ggplot(data9, aes(x = Month, y = Temperature,
                  group = Year, color = factor(Year))) +
  geom_line() +
  labs(title = "Average Monthly Temperature (Jan-Apr, 2018-2020)",
       x = "Month",
       y = "Temperature (°C)",
       color = "Year") +
  theme_bw() +
  theme(legend.position = "none") +
  facet_grid(Year ~ City)

#-------------------------------------------------------------------------------

### Combining different chart types — dual-axis chart

ggplot(data10, aes(x = months)) +
  geom_bar(aes(y = n_deaths), stat = "identity",
           fill = "#FF8080", alpha = 0.6) +
  geom_line(aes(y = avg_temp * scale_factor, group = 1),
            color = "#2c2c2c", linewidth = 1, linetype = "dashed") +
  scale_y_continuous(
    name = "Number of Traffic Deaths",
    sec.axis = sec_axis(~ . / scale_factor,
                        name = "Average Temperature (Celsius)")
  ) +
  labs(x = "",
       title = "Number of Traffic Deaths and Average Temperature per Month") +
  theme_bw() +
  theme(
    axis.title.y.left  = element_text(color = "#FF8080"),
    axis.title.y.right = element_text(color = "#2c2c2c")
  )

#-------------------------------------------------------------------------------

### Violin plot

ggplot(sports_data, aes(x = sport, y = height, fill = sport)) +
  geom_violin(trim = FALSE) +
  labs(
    title = "Distribution of Athletes' Heights by Sport",
    x = "Sport",
    y = "Height (cm)"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title      = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title.x    = element_text(size = 14),
    axis.title.y    = element_text(size = 14)
  ) +
  scale_fill_brewer(palette = "RdBu")

#-------------------------------------------------------------------------------

### Ridgeline plot

ggplot(ridgeline_data, aes(x = value, y = distribution, fill = distribution)) +
  geom_density_ridges() +
  scale_fill_brewer(palette = "Dark2") +
  labs(
    x = "Values",
    y = "Distribution",
    title = "A Ridgeline Chart"
  ) +
  theme_ridges() +
  theme(legend.position = "none")

#-------------------------------------------------------------------------------

### Lollipop chart

# Basic
ggplot(data4, aes(x = name, y = strength)) +
  geom_point() +
  geom_segment(aes(x = name, xend = name, y = 0, yend = strength))

# Nice-looking
ggplot(data4, aes(x = name, y = strength)) +
  geom_segment(aes(x = name, xend = name, y = 0, yend = strength),
               color = "grey") +
  geom_point(size = 4, color = "#74B72E") +
  labs(x = "Fictional Character",
       y = "Strength",
       title = "Strength of Fictional Characters") +
  theme_light() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.border       = element_blank(),
    axis.ticks.x       = element_blank()
  )

#-------------------------------------------------------------------------------

### Maps — world map with income groups

# Load country-level shapefiles
world <- ne_countries(scale = "medium", returnclass = "sf")
world <- world %>%
  filter(gdp_year == 2019) %>%
  mutate(`Income Group` = case_when(
    income_grp %in% c("1. High income: OECD",
                      "2. High income: nonOECD") ~ "1. High Income",
    income_grp == "3. Upper middle income"       ~ "2. Upper Middle Income",
    income_grp == "4. Lower middle income"       ~ "3. Lower Middle Income",
    income_grp == "5. Low income"                ~ "4. Low Income")
  )

# Plot with tmap
tmap_mode("view")
tm_shape(world) +
  tm_polygons("Income Group",
              title   = "Income Groups",
              palette = "viridis",
              style   = "cat",
              id      = "sovereignt")

# Switch back to plot mode
tmap_mode("plot")


#------------------------------
# 4. Exploratory Data Analysis
#------------------------------

#-------------------------------------------------------------------------------

pacman::p_load("summarytools", "SmartEDA", "skimr",
               "naniar", "gtsummary", "dlookr",
               "DataExplorer", "psych", "ggplot2",
               "palmerpenguins", "dplyr", "tidyr", "corrplot")

# The penguins dataset has already been loaded via source()
head(penguins)

#-------------------------------------------------------------------------------

### Measures of central tendency — mode

uniq_vals <- unique(penguins$bill_length_mm)          # unique values
freq      <- tabulate(match(penguins$bill_length_mm,
                            uniq_vals))               # counts of values
uniq_vals[which.max(freq)]                             # most frequent value

#-------------------------------------------------------------------------------

### Measures of central tendency — mean and median

mean(penguins$bill_length_mm)
median(penguins$bill_length_mm)

#-------------------------------------------------------------------------------

### Measures of dispersion

IQR(penguins$bill_length_mm)
var(penguins$bill_length_mm)
sd(penguins$bill_length_mm)

#-------------------------------------------------------------------------------

### Cross tabulations / contingency tables

# Base R
table(penguins$species, penguins$island)

# summarytools
summarytools::ctable(penguins$species, penguins$island)

# gtsummary
gtsummary::tbl_cross(data = penguins,
                     row  = species,
                     col  = island)

#-------------------------------------------------------------------------------

### Correlation — Pearson

cor(penguins$bill_length_mm, penguins$body_mass_g,
    method = "pearson")

# Spearman
cor(penguins$bill_length_mm, penguins$body_mass_g,
    method = "spearman")

# Kendall
cor(penguins$bill_length_mm, penguins$body_mass_g,
    method = "kendall")

#-------------------------------------------------------------------------------

### Correlation — scatter plot

ggplot(penguins, aes(x = bill_length_mm, y = body_mass_g)) +
  geom_point(color = "#0077b6") +
  labs(x = "Length in mm",
       y = "Body mass in g",
       title = "Relationship between Length (in mm) and Body Mass (in g)") +
  theme_bw()

#-------------------------------------------------------------------------------

### Visualising three correlation types

ggplot(df_cor, aes(x = x, y = y)) +
  geom_point(color = "steelblue", size = 2) +
  facet_wrap(~ relationship, nrow = 1) +
  labs(title = "Strong Positive, Negative and No Correlation",
       x = "X", y = "Y") +
  theme_bw(base_size = 18)

#-------------------------------------------------------------------------------

### Correlation matrix

# Numeric subset
penguins_numeric <- penguins %>%
  select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g) %>%
  drop_na()

# Calculate the correlation matrix
corr_matrix <- cor(penguins_numeric)

# Simple visualisation
corrplot(corr_matrix, method = "color")

# With coefficients
corrplot(corr_matrix, method = "color", type = "upper",
         addCoef.col = "black", tl.col = "black", tl.srt = 45)

# Circle method with coefficients
corrplot(corr_matrix, method = "circle", type = "upper",
         addCoef.col = "black", tl.col = "black", tl.srt = 45)

# Circle method without coefficients
corrplot(corr_matrix, method = "circle", type = "upper",
         tl.col = "black", tl.srt = 45)

#-------------------------------------------------------------------------------

### Working with EDA packages — psych

# describe() applied to the entire dataset
psych::describe(penguins)

# Correlation test
psych::corr.test(penguins_numeric)

# pairs.panels: visualise correlations
psych::pairs.panels(penguins_numeric)

#-------------------------------------------------------------------------------

### skimr

skimr::skim(penguins)

# Without distribution sparklines
# skimr::skim_without_charts(penguins)

#-------------------------------------------------------------------------------

### summarytools

# Dataset summary
dfSummary(penguins)

# Use view() for a formatted output (in RStudio)
# view(dfSummary(penguins))

# Frequency table for a single variable
freq(penguins$species)

# Descriptive statistics
descr(penguins)

# Cross tabulations
ctable(penguins$species, penguins$island)

#-------------------------------------------------------------------------------

### naniar — missing values

# Tabular
naniar::miss_var_summary(penguins_raw)

# Structure of missing values
naniar::gg_miss_upset(penguins_raw)

# Visualisation of missing values
naniar::vis_miss(penguins_raw)

#-------------------------------------------------------------------------------

### gtsummary — publication-ready tables

# Overview
gtsummary::tbl_summary(penguins)

# Grouped by sex
penguins %>%
  tbl_summary(by = sex)

# Cross tabulation
penguins %>%
  tbl_cross(
    row = species,
    col = island
  )

#-------------------------------------------------------------------------------

### dlookr

# Diagnose
diagnose(penguins) %>%
  print()

# describe from dlookr
dlookr::describe(penguins)

# Generate an EDA report (commented out; would create an HTML file)
# dlookr::eda_paged_report(penguins, output_format = "html")

#-------------------------------------------------------------------------------

### DataExplorer

# Basic information
introduce(penguins)

# Missing values
plot_missing(penguins_raw)

# Correlations
plot_correlation(penguins_numeric)

# Automated report (commented out)
# create_report(penguins)

#-------------------------------------------------------------------------------

### SmartEDA

# Structure and missing values
ExpData(penguins, type = 1)

# Descriptive statistics for numeric variables
ExpNumStat(penguins)

# Automated EDA report (commented out)
# ExpReport(data = penguins, Target = "species",
#           label = "Penguin Species",
#           op_file = "Samp1.html", Rc = 3)


#------------------
# 5. Data Analysis
#------------------

#-------------------------------------------------------------------------------

pacman::p_load("tidyverse", "ggpubr", "gapminder",
               "sjPlot", "GGally", "car", "margins", "plotly")

#-------------------------------------------------------------------------------

### Linear regression — visualising the data

ggplot(df, aes(x, y)) +
  geom_point() +
  theme_bw() +
  scale_x_continuous(breaks = seq(0, 10, by = 1)) +
  scale_y_continuous(breaks = seq(0, 20, by = 2))

#-------------------------------------------------------------------------------

### Visually fitting the OLS line with residuals

ggplot(df, aes(x, y)) +
  geom_point() +
  theme_bw() +
  scale_x_continuous(breaks = seq(0, 10, by = 1)) +
  scale_y_continuous(breaks = seq(0, 30, by = 5)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_segment(aes(x = x, y = y, xend = x,
                   yend = predict(lm(y ~ x, data = df))),
               linewidth = 0.5)

#-------------------------------------------------------------------------------

### Calculating OLS by hand

# First, compute the covariance
cov <- sum((df$x - mean(df$x)) * (df$y - mean(df$y)))

# Variance of x
x_sq <- sum((df$x - mean(df$x))^2)
x_sq

# Slope
slope <- cov / x_sq
print(slope)

# Intercept
beta_0 <- mean(df$y) - (slope * mean(df$x))
beta_0

#-------------------------------------------------------------------------------

### Linear regression with lm()

# Fit the model
model1 <- lm(y ~ x,
             data = df)

# Summary
summary(model1)

#-------------------------------------------------------------------------------

### Predictions from a linear regression

# Manual calculation using the estimated coefficients
df$y_hat <- 1.6821 + 1.5394 * df$x

# Automatic with predict()
df$auto_y_hat <- predict(model1)

# Check it
head(df)

#-------------------------------------------------------------------------------

### Standard errors — RMSE

#Getting the residuals
residuals <- df$y_hat - df$y

#Getting the squared residuals
squared_residuals <- residuals^2

#Getting the mean of the squared residuals
mean_sq_resid <- mean(squared_residuals)

#Getting the RSME
rmse <- sqrt(mean_sq_resid)

#Or in one line
#rmse <- sqrt(mean((df$y_hat - df$y)^2))
#-------------------------------------------------------------------------------

### Standard error of the estimator (by hand)

#Getting the residuals
residuals <- df$y - df$y_hat

#getting sigma squared
sigma_sq <- sum(residuals^2) / (nrow(df) - 2)

#getting standard error of beta 1
se_beta1 <- sqrt(sigma_sq / sum((df$x - mean(df$x))^2))

#print it
se_beta1

#-------------------------------------------------------------------------------

### t-value / t-statistic

# From the regression
t_value_intercept <- -0.32773 / 0.30271
t_value_x         <- 0.67949 / 0.04813

print(t_value_intercept)
print(t_value_x)

#-------------------------------------------------------------------------------

### t-distributions with different degrees of freedom

ggplot(densities, aes(x = x, y = density, color = distribution)) +
  geom_line() +
  theme_minimal() +
  labs(x = "x", y = "Density",
       title = "t-distributions with different degrees of freedom") +
  scale_color_manual(values = c("black", "red", "green", "blue")) +
  scale_x_continuous("X", seq(-5, 5, 1), limits = c(-5, 5))

#-------------------------------------------------------------------------------

### Identifying the t-statistic visually

t_density <- function(x) dt(x, df = 28)

ggplot(t_value_data, aes(x = x, y = density)) +
  geom_line(lineend = "round") +
  stat_function(fun = t_density, geom = "area", fill = "gray",
                alpha = 0.75, xlim = c(-5, -1.701), n = 10000) +
  stat_function(fun = t_density, geom = "area", fill = "gray",
                alpha = 0.75, xlim = c(5, 1.701), n = 10000) +
  geom_vline(xintercept = -1.701, linetype = "dashed",
             colour = "red") +
  geom_vline(xintercept =  1.701, linetype = "dashed",
             colour = "red") +
  ggtitle("t-distribution with 28 degrees of freedom",
          subtitle = "The grey area marks the interval of significant values at the 95% level") +
  geom_segment(x = -1.082653,
               xend = -1.082653,
               yend = dt(-1.082653, df = 28),
               y = -1,
               color = "pink",
               linetype = "dashed",
               linewidth = 0.2) +
  annotate("point", x = -1.082653, y = dt(-1.082653, df = 28),
           color = "pink") +
  scale_x_continuous("X", seq(-5, 5, 1), limits = c(-5, 5)) +
  theme_classic() +
  theme(legend.position = "none")

#-------------------------------------------------------------------------------

### p-values by hand

p_value_1 <- 2 * pt(-abs(t_value_intercept), 28)
p_value_2 <- 2 * pt(-abs(t_value_x), 28)

print(p_value_1)
print(p_value_2)

#-------------------------------------------------------------------------------

### Confidence intervals — visualising the simulation

# Plot of the 100 simulated CIs (CIs was created in the simulation script)
ggplot(data = CIs) +
  geom_pointrange(
    aes(
      x     = estimates,
      xmin  = lower_ci,
      xmax  = upper_ci,
      y     = id,
      color = missed
    )
  ) +
  geom_vline(
    aes(xintercept = true_mean)
  ) +
  scale_color_manual(values = c("red", "azure4")) +
  theme_minimal() +
  labs(
    title    = "Confidence Interval for the Mean",
    subtitle = "True population parameter = 24",
    x        = "Estimates",
    y        = "Sample",
    color    = "Is the true population parameter contained in the CI?"
  ) +
  theme(legend.position = "top") +
  scale_x_continuous(breaks = c(seq(15, 30, by = 1)))

#-------------------------------------------------------------------------------

### Confidence intervals for regression coefficients

# Automatic
confint(model1)

# By hand: intercept
ci_lower_int <- model1$coefficients[1] -
  1.96 * summary(model1)$coef[, "Std. Error"][1]
ci_upper_int <- model1$coefficients[1] +
  1.96 * summary(model1)$coef[, "Std. Error"][1]

print(ci_lower_int)
print(ci_upper_int)

# By hand: estimator for x
ci_lower_est <- model1$coefficients[2] -
  1.96 * summary(model1)$coef[, "Std. Error"][2]
ci_upper_est <- model1$coefficients[2] +
  1.96 * summary(model1)$coef[, "Std. Error"][2]

print(ci_lower_est)
print(ci_upper_est)

#-------------------------------------------------------------------------------

### Multivariate regression

lm(y ~ x + categorical_variable, data = df) %>%
  summary()

#-------------------------------------------------------------------------------

### Categorical variables — table

table(df$categorical_variable)

#-------------------------------------------------------------------------------

### Model with a categorical variable

model2 <- lm(y ~ categorical_variable,
             data = df)

summary(model2)

#-------------------------------------------------------------------------------

### Model with + 0 (no intercept)

model3 <- lm(y ~ categorical_variable + 0,
             data = df)

summary(model3)

# Verify the coefficient difference manually
result <- coefficients(model3)[2] - coefficients(model3)[1]
print(result)

#-------------------------------------------------------------------------------

### Interaction effects

# Fit the interaction model
model_interaction <- lm(coding_ability ~
                          hours_spent * this_course,
                        data = df_int)

# Summary
summary(model_interaction)

#-------------------------------------------------------------------------------

### Plotting the interaction effect

plot_model(model_interaction, type = "int") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  labs(title = "Comparison of Coding Skills by Course Type",
       x = "Hours Spent",
       y = "R Coding Skills") +
  scale_color_manual(
    values = c("red", "blue"),
    labels = c("Other Courses", "This Course")
  ) +
  theme_sjplot() +
  theme(legend.position = "bottom",
        legend.title    = element_blank())


#-----------------------
# 6. Loops and Functions
#-----------------------

#-------------------------------------------------------------------------------

### For loops — motivation: lots of if-else

grade <- 4.0

if (grade == 1.0) {
  print("Excellent")
} else if (grade > 1.0 & grade <= 2.0) {
  print("Good Job")
} else if (grade > 2.0 & grade <= 3.0) {
  print("OK")
} else if (grade > 3.0 & grade <= 4.0) {
  print("Life goes on")
}

grade <- 3.3

if (grade == 1.0) {
  print("Excellent")
} else if (grade > 1.0 & grade <= 2.0) {
  print("Good Job")
} else if (grade > 2.0 & grade <= 3.0) {
  print("OK")
} else if (grade > 3.0 & grade <= 4.0) {
  print("Life goes on")
}

grade <- 2.3

if (grade == 1.0) {
  print("Excellent")
} else if (grade > 1.0 & grade <= 2.0) {
  print("Good Job")
} else if (grade > 2.0 & grade <= 3.0) {
  print("OK")
} else if (grade > 3.0 & grade <= 4.0) {
  print("Life goes on")
}

#-------------------------------------------------------------------------------

### Grade vector and loop

# Grade vector
grades <- c(1.7, 3.3, 4.0, 2.3, 1.0)

# The for loop
for (i in 1:length(grades)) {
  if (grades[i] == 1.0) {
    print("Excellent")
  } else if (grades[i] > 1.0 & grades[i] <= 2.0) {
    print("Good Job")
  } else if (grades[i] > 2.0 & grades[i] <= 3.0) {
    print("OK")
  } else if (grades[i] > 3.0 & grades[i] <= 4.0) {
    print("Life goes on")
  }
}

#-------------------------------------------------------------------------------

### Looping over a number vector

num <- c(1, 2, 3, 4, 5, 249)

for (i in num) {
  print(stringr::str_c("This is iteration number ", i, "."))
}

#-------------------------------------------------------------------------------

### Nested loops — tic-tac-toe

# Define the matrix
ttt <- matrix(c("X", "O", "X",
                "O", "X", "O",
                "O", "X", "O"), nrow = 3, ncol = 3, byrow = TRUE)

# Nested loop
for (i in 1:nrow(ttt)) {
  for (j in 1:ncol(ttt)) {
    print(paste("In row", i, "and column",
                j, "the board contains", ttt[i, j]))
  }
}

#-------------------------------------------------------------------------------

### The apply() function family

# Build a matrix
mat <- matrix(1:10, nrow = 5, ncol = 6)
head(mat)

# Column-wise mean, sum and standard deviation
apply(mat, 2, mean)
apply(mat, 2, sum)
apply(mat, 2, sd)

# The equivalent loop
for (i in 1:ncol(mat)) {
  avg_column <- mean(mat[, i])
  print(avg_column)
}

#-------------------------------------------------------------------------------

### Writing your own functions

# Simple addition function
add <- function(x, y) {
  result <- x + y
  return(result)
}

add(2, 7)

#-------------------------------------------------------------------------------

### Area of a circle

aoc <- function(radius) {
  pi   <- 3.14159
  area <- pi * radius^2
  return(area)
}

aoc(5)

#-------------------------------------------------------------------------------

### A more complex function — classroom

classroom <- function(x) {
  for (i in 1:length(x)) {
    for (j in 1:length(x[[i]])) {
      student <- x[[i]][j]
      if (student == 1) {
        comment <- "Alice"
      } else if (student == 2) {
        comment <- "Bob"
      } else if (student == 3) {
        comment <- "Cathy"
      } else if (student == 4) {
        comment <- "David"
      } else if (student == 5) {
        comment <- "Eva"
      } else {
        comment <- paste("Unknown student", student,
                         "is doing something interesting.")
      }
      cat("In row", i, "column", j, ":", comment, "\n")
    }
  }
}

seating_plan <- list(
  c(1, 5, 2),
  c(4, 3, 7)
)

classroom(seating_plan)


#-------------------------
# 7. Further Explanations
#-------------------------

#-------------------------------------------------------------------------------

pacman::p_load("dplyr", "tidyr", "ggpubr", "gapminder",
               "kableExtra", "car")

#-------------------------------------------------------------------------------

### Probability theory — rolling one die

dice_role <- 3
print(dice_role)

#-------------------------------------------------------------------------------

### 1000 dice rolls — uniform distribution (data: dice_rolls)

ggplot(dice_rolls, aes(x = factor(roll))) +
  geom_bar(fill = "#89CFF0", color = "gray") +
  labs(
    title = "Distribution of 1,000 Dice Rolls",
    x     = "Die Face",
    y     = "Frequency"
  ) +
  theme_minimal()

#-------------------------------------------------------------------------------

### Theoretical uniform distribution

ggplot(uniform_dis, aes(x = Outcome, y = Probability)) +
  geom_point() +
  labs(
    title = "Uniform Probability Distribution of a Fair Die",
    x     = "Die Face",
    y     = "Probability"
  ) +
  theme_minimal()

#-------------------------------------------------------------------------------

### Cumulative distribution of the die

ggplot(uniform_dis, aes(x = Outcome, y = cumsum)) +
  geom_point() +
  labs(
    title = "Cumulative Distribution of a Fair Die",
    x     = "Die Face",
    y     = "Probability"
  ) +
  ylim(0, 1) +
  theme_minimal()

#-------------------------------------------------------------------------------

### Bernoulli distribution — fair coin

ggplot(fair_coin, aes(x = outcome)) +
  geom_bar(fill = "#89CFF0", width = 0.35) +
  labs(
    title = "Simulation of 10 Tosses of a Fair Coin",
    x     = "Outcome",
    y     = "Frequency"
  ) +
  theme_minimal()

# Unfair coin
ggplot(unfair_coin, aes(x = outcome)) +
  geom_bar(fill = "#89CFF0", width = 0.35) +
  labs(
    title = "Simulation of 10 Tosses of an Unfair Coin",
    x     = "Outcome",
    y     = "Frequency"
  ) +
  theme_minimal()

#-------------------------------------------------------------------------------

### Probabilities of an exact number of heads — dbinom()

# Fair coin: P(5 heads in 10 tosses)
dbinom(
  x    = 5,
  size = 10,
  prob = 0.5
)

# Unfair coin: P(3 heads in 10 tosses)
dbinom(
  x    = 3,
  size = 10,
  prob = 0.28
)

#-------------------------------------------------------------------------------

### Visualising binomial distributions

ggplot(theoretical_probs, aes(x = heads, y = prob)) +
  geom_point(size = 1.5, color = "black") +
  facet_wrap(~ coin_type,
             labeller = as_labeller(c(
               biased   = "Biased coin (p = 0.28)",
               unbiased = "Fair coin (p = 0.5)"))) +
  labs(title = "Theoretical Binomial Distribution of Number of Heads in 10 Tosses",
       x     = "Number of Heads",
       y     = "Probability") +
  scale_x_continuous(breaks = 0:10) +
  theme_bw()

#-------------------------------------------------------------------------------

### Cumulative binomial distributions

ggplot(theoretical_probs, aes(x = heads, y = cumsum_prob)) +
  geom_point(size = 1.5, color = "black") +
  facet_wrap(~ coin_type,
             labeller = as_labeller(c(
               biased   = "Biased coin (p = 0.28)",
               unbiased = "Fair coin (p = 0.5)")),
             nrow = 2) +
  labs(
    title = "Cumulative Distribution of Number of Heads in 10 Tosses",
    x     = "Number of Heads",
    y     = "Cumulative Probability"
  ) +
  scale_x_continuous(breaks = 0:10) +
  theme_bw()

#-------------------------------------------------------------------------------

### Normal distribution — PDF

ggplot(snd, aes(x = sample)) +
  geom_density() +
  labs(title = "Normal Distribution",
       x = "Value of the Random Variable",
       y = "Density") +
  scale_x_continuous(breaks = -5:5,
                     limits = c(-5, 5)) +
  theme_minimal()

# CDF
ggplot(snd, aes(x = sample)) +
  stat_ecdf(geom = "step", color = "black") +
  labs(title = "Cumulative Distribution Function (CDF)",
       x = "Value of the Random Variable",
       y = "Cumulative Probability") +
  theme_minimal()

#-------------------------------------------------------------------------------

### Visualising the Central Limit Theorem

ggplot(data_clt, aes(x = mean)) +
  geom_density(fill = "skyblue", alpha = 0.6) +
  facet_wrap(~ rolls, scales = "free", ncol = 2) +
  labs(title = "Demonstration of the Central Limit Theorem with Dice Rolls",
       x     = "Sample Mean",
       y     = "Density") +
  scale_x_continuous(breaks = 1:6,
                     limits = c(1, 6)) +
  theme_minimal()

#-------------------------------------------------------------------------------

### Overview of common distributions

ggplot(plot_df, aes(x = x, y = y)) +
  geom_line(data = filter(plot_df,
                          !distribution %in% c("Poisson (lambda=3)")),
            color = "steelblue", linewidth = 1) +
  geom_point(data = filter(plot_df, distribution == "Poisson (lambda=3)"),
             color = "steelblue", size = 1) +
  facet_wrap(~ distribution, scales = "free", ncol = 3) +
  labs(title = "Overview of Common Statistical Distributions",
       x = "x", y = "Density / Probability") +
  theme_minimal(base_size = 14)

#-------------------------------------------------------------------------------

### Working with distributions — simulate IQ

ggplot(sample_iq, aes(x = height)) +
  geom_density(linewidth = 1, color = "#E35335") +
  labs(
    x = "IQ",
    y = "Frequency"
  ) +
  scale_x_continuous(breaks = seq(40, 160, 20),
                     limits = c(40, 160)) +
  theme_minimal()

#-------------------------------------------------------------------------------

### Probabilities with dnorm()

dnorm(x = 100, mean = 100, sd = 15) # IQ 100
dnorm(x = 87,  mean = 100, sd = 15) # IQ 87
dnorm(x = 140, mean = 100, sd = 15) # IQ 140

#-------------------------------------------------------------------------------

### Cumulative probabilities with pnorm()

pnorm(q = 100, mean = 100, sd = 15)
pnorm(q = 87,  mean = 100, sd = 15)
pnorm(q = 140, mean = 100, sd = 15)

#-------------------------------------------------------------------------------

### Quantiles with qnorm()

qnorm(p = 0.5,       mean = 100, sd = 15)
qnorm(p = 0.1930623, mean = 100, sd = 15)
qnorm(p = 0.9961696, mean = 100, sd = 15)

#-------------------------------------------------------------------------------

### Regression diagnostics — model and residuals

# Model
model1 <- lm(y ~ x, data = df)

# Predictions for y
df$y_hat <- 1.6821 + 1.5394 * df$x

# Residuals by hand
df$residuals <- df$y - df$y_hat
head(df)

# Residuals automatically
df$residuals_auto <- residuals(model1)
head(df)

#-------------------------------------------------------------------------------

### Residual plot

ggplot(df, aes(x, residuals_auto)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  scale_y_continuous("Residuals", seq(-6, 6, 1),
                     limits = c(-6, 6)) +
  scale_x_continuous("Fitted values", seq(0, 10, 1),
                     limits = c(0, 10)) +
  theme_bw()

#-------------------------------------------------------------------------------

### Homoscedasticity vs. heteroscedasticity

homoscedastic_plot <- ggplot(df_homo_hetero, aes(x = x, y = y_homo)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous("Y", seq(-0.5, 3.5, 0.5), limits = c(-0.5, 3.5)) +
  labs(title = "Homoscedastic plot") +
  theme_minimal()

heteroscedastic_plot <- ggplot(df_homo_hetero,
                               aes(x = x, y = y_hetero)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Heteroscedastic plot") +
  scale_y_continuous("Y", seq(-0.5, 3.5, 0.5), limits = c(-0.5, 3.5)) +
  theme_minimal()

facet_plots <- ggarrange(homoscedastic_plot,
                         heteroscedastic_plot, nrow = 1)
print(facet_plots)

#-------------------------------------------------------------------------------

### TSS, ESS, R-squared

tss        <- sum((df$y - mean(df$y))^2)
ess        <- sum((df$y_hat - mean(df$y))^2)
r_squared  <- ess / tss
r_squared

# Automatic
summary(model1)$r.squared

#-------------------------------------------------------------------------------

### Influential outliers

# Plot with two regression lines (with/without outlier)
ggplot(data_outlier, aes(x = x1, y = y1)) +
  geom_point(shape = 20, size = 3) +
  geom_abline(aes(slope     = model_outlier$coefficients[2],
                  intercept = model_outlier$coefficients[1],
                  color     = "Model with outlier"),
              linewidth = 0.75, show.legend = TRUE) +
  geom_abline(aes(slope     = model_without_outlier$coefficients[2],
                  intercept = model_without_outlier$coefficients[1],
                  color     = "Model without outlier"),
              linewidth = 0.75, show.legend = TRUE) +
  xlab("Independent variable") +
  theme_classic() +
  theme(legend.position = c(0.15, 0.9),
        legend.title    = element_blank())

#-------------------------------------------------------------------------------

### Cook's distance

data_outlier$cooks_distance <- cooks.distance(model_outlier)

ggplot(data_outlier, aes(x = x1, y = cooks_distance)) +
  geom_point(colour = "darkgreen", size = 3, alpha = 0.5) +
  labs(y = "Cook's Distance", x = "Independent variable") +
  geom_hline(yintercept = 1, linetype = "dashed") +
  theme_bw()

#-------------------------------------------------------------------------------

### Functional form — quadratic data

# Simple linear model on quadratic data
model_simple <- lm(Y_quadratic ~ X_quadratic, data = df2)
summary(model_simple)

# Plot with linear fit
ggplot(df2, aes(x = X_quadratic, y = Y_quadratic)) +
  geom_point(shape = 20, size = 3) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()

#-------------------------------------------------------------------------------

### Quadratic model (correctly using poly())

model_quadratic <- lm(Y_quadratic ~ poly(X_quadratic, 2),
                      data = df2)
summary(model_quadratic)

# Plot with quadratic fit
ggplot(df2, aes(x = X_quadratic, y = Y_quadratic)) +
  geom_point(shape = 20, size = 3) +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2),
              color = "red",
              se = FALSE) +
  scale_x_continuous("X", breaks = seq(-5, 5, 1),
                     limits = c(-5, 5)) +
  ylab("Y") +
  theme_bw()

#-------------------------------------------------------------------------------

### Gapminder — non-linear relationship

head(gapminder)

# Without log transformation
ggplot(gapminder, aes(gdpPercap, lifeExp)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  scale_y_continuous("Life expectancy", seq(30, 80, 10),
                     limits = c(30, 80)) +
  theme_bw()

# With log transformation
ggplot(gapminder, aes(log(gdpPercap), lifeExp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous("Life expectancy", seq(30, 80, 10),
                     limits = c(30, 80)) +
  xlab("GDP per capita") +
  theme_bw()

#-------------------------------------------------------------------------------

### Independent observations — time series

ggplot(df_time_series, aes(date, y_time)) +
  geom_line() +
  ylab("Y") +
  xlab("Year") +
  theme_bw()

#-------------------------------------------------------------------------------

### Adjusted R-squared (multivariate regression)

# Multivariate model
multivariate_model <- lm(y ~ x + categorical_variable, data = df)

summary(multivariate_model)

# Extract
summary(multivariate_model)$adj.r.squared

# By hand
adj_r_squared <- 1 - (((1 - summary(multivariate_model)$r.squared) *
                         (nrow(df) - 1)) / (nrow(df) - 2 - 1))
print(adj_r_squared)

#-------------------------------------------------------------------------------

### Omitted variable bias (OVB)

# Model without temperature
model_without_temperature <- lm(violence_crime_true ~ ice_cream_sales,
                                data = data_ice)

# Model with only temperature
model_only_temperature <- lm(violence_crime_true ~ temperature,
                             data = data_ice)

# Full model
model_with_temperature <- lm(violence_crime_true ~ ice_cream_sales + temperature,
                             data = data_ice)

# Summaries
summary(model_without_temperature)
summary(model_only_temperature)
summary(model_with_temperature)

# Comparison table (helper function table_ovb was defined in the simulation script)
table_ovb(model_without_temperature, model_with_temperature)

#-------------------------------------------------------------------------------

### Multicollinearity

# Model with correlated predictors
grades_model <- lm(grades ~ learning_time + gaming_time, data = df_grades)
summary(grades_model)

#-------------------------------------------------------------------------------

### Checking correlations

cormatrix_data <- df_grades %>%
  dplyr::select(learning_time, gaming_time)

cormatrix <- cor(cormatrix_data)
round(cormatrix, 2)

#-------------------------------------------------------------------------------

### Variance Inflation Factor (VIF)

vif(grades_model)

#-------------------------------------------------------------------------------

# End of the script!
print("You have reached the end of the script — congratulations!")