# Required packages are installed and loaded here. If more packages are needed, they can be added to the list
packages <- c("foreign" , "dplyr", "sjPlot", "lattice", "lmtest", "ggplot2", "Hmisc", "rcompanion", "kableExtra")
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages], repos = "https://cran.r-project.org/")
}
invisible(lapply(packages, library, character.only = TRUE))

# Read data/data_election.csv into data_us, then render its first 10 rows
# as a scrollable kable table.
data_us <- read.csv("data/data_election.csv")
data_us %>%
  head(10) %>%
  knitr::kable(
    booktabs = TRUE,
    caption = 'A table of the first 10 rows of the vote data.'
  ) %>%
  kable_paper() %>%
  scroll_box(width = "100%", height = "100%")

# Dimensions of the data: number of rows (cases) and number of columns
dim(data_us)

# Absolute frequencies of each value of vote
table(data_us$vote)

# Relative frequencies (shares of all counted cases) of vote
proportions(table(data_us$vote))

# Relative frequencies (shares of all counted cases) of trump
proportions(table(data_us$trump))

# Cross tabs of trump (rows) by education (columns). Each table shows the
# observed counts plus one kind of percentage; the summary line is hidden.

# Column percentages (show.col.prc = TRUE)
tab_xtab(
  var.row = data_us$trump,
  var.col = data_us$education,
  show.obs = TRUE,
  show.col.prc = TRUE,
  show.summary = FALSE
)

# Row percentages (show.row.prc = TRUE)
tab_xtab(
  var.row = data_us$trump,
  var.col = data_us$education,
  show.obs = TRUE,
  show.row.prc = TRUE,
  show.summary = FALSE
)

# Cell (total) percentages (show.cell.prc = TRUE)
tab_xtab(
  var.row = data_us$trump,
  var.col = data_us$education,
  show.obs = TRUE,
  show.cell.prc = TRUE,
  show.summary = FALSE
)

# Histograms of age, one panel per value of trump, laid out side by side.
# Bar heights are percentages of the observations within each panel.
histogram(
  ~ age | trump,
  data = data_us,
  type = "percent",
  breaks = 10,
  ylim = c(0, 12),
  layout = c(2, 1),
  scales = list(relation = "free"),
  col = "grey",
  main = "Left: trump=0 (in support of Biden), Right: trump=1 (in support of Trump)",
  xlab = "Age in years",
  ylab = "Percent of observations"
)

# Mean, median and standard deviation of age for Biden supporters
# (trump == 0); missing values are dropped via na.rm = TRUE
with(data_us, mean(age[trump == 0], na.rm = TRUE))
with(data_us, median(age[trump == 0], na.rm = TRUE))
with(data_us, sd(age[trump == 0], na.rm = TRUE))

# Mean, median and standard deviation of age for Trump supporters
# (trump == 1); missing values are dropped via na.rm = TRUE
with(data_us, mean(age[trump == 1], na.rm = TRUE))
with(data_us, median(age[trump == 1], na.rm = TRUE))
with(data_us, sd(age[trump == 1], na.rm = TRUE))

# Recode age into three numeric categories:
#   1 = 35 or younger, 2 = older than 35 up to 59, 3 = older than 59.
# The middle bracket starts directly above 35 so the brackets are contiguous;
# a lower bound of `age > 36` would leave age 36 unmatched and coded as NA.
# Rows with a missing age match no condition and stay NA.
data_us <- data_us %>%
  mutate(age_cat =
           case_when(age <= 35 ~ 1,
                     age > 35 & age <= 59 ~ 2,
                     age > 59 ~ 3))

# Cross tab of trump (rows) by age category (columns) with column percentages
tab_xtab(var.row = data_us$trump, var.col = data_us$age_cat, show.col.prc = TRUE, show.obs = TRUE, show.summary = FALSE)

# Recode age into one labelled categorical variable with three groups:
#   "young" = 35 or younger, "middle-aged" = older than 35 up to 59,
#   "old" = older than 59.
# The middle bracket starts directly above 35 so the brackets are contiguous;
# a lower bound of `age > 36` would leave age 36 unmatched and coded as NA.
# Rows with a missing age match no condition and stay NA.
data_us <- data_us %>%
  mutate(age_cat_nom =
           case_when(age <= 35 ~ "young",
                     age > 35 & age <= 59 ~ "middle-aged",
                     age > 59 ~ "old"))

# Convert to a factor with explicit level order (young < middle-aged < old)
# so tables and plots list the groups by age rather than alphabetically
data_us$age_cat_nom <- factor(data_us$age_cat_nom, levels = c("young", "middle-aged", "old"))

# Label the trump indicator as text: 0 -> "Favoring Biden",
# 1 -> "Favoring Trump"; any other value (including NA) becomes NA
data_us <- mutate(
  data_us,
  trump_nom = case_when(
    trump == 1 ~ "Favoring Trump",
    trump == 0 ~ "Favoring Biden"
  )
)

# Count cases per combination of age group and voting intention, keeping
# only combinations where neither variable is missing
data_us_counted <- subset(
  count(data_us, age_cat_nom, trump_nom),
  !is.na(age_cat_nom) & !is.na(trump_nom)
)

# Stacked bar chart: within each age group, the share of each voting
# intention (bars are rescaled to 100% by position = "fill")
ggplot(
  data_us_counted,
  aes(x = age_cat_nom, y = n, fill = trump_nom)
) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    fill = "Voting intention US presidential election",
    x = "Age groups",
    y = "Shares"
  ) +
  theme_minimal()

# Same data with the roles swapped: within each voting intention, the share
# of each age group, drawn as horizontal bars (coord_flip)
ggplot(
  data_us_counted,
  aes(x = trump_nom, y = n, fill = age_cat_nom)
) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  labs(
    fill = "Age groups",
    x = "Voting intention US presidential election",
    y = "Shares"
  ) +
  theme_minimal()

# Read data/data_states.csv into data_states, then render its first 10 rows
# as a scrollable kable table.
data_states <- read.csv("data/data_states.csv")
data_states %>%
  head(10) %>%
  knitr::kable(
    booktabs = TRUE,
    caption = 'A table of the first 10 rows of the regional vote data.'
  ) %>%
  kable_paper() %>%
  scroll_box(width = "100%", height = "100%")

# Scatter plot of perc_trump (y) against perc_higheducation (x);
# both axes are labelled with percent formatting
sc1 <- ggplot(
  data = data_states,
  aes(x = perc_higheducation, y = perc_trump)
) +
  geom_point() +
  labs(
    x = "Share of highly-educated persons in %",
    y = "Support of Trump in %"
  ) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent)
sc1

# Means of perc_trump and perc_higheducation (missing values dropped)
mean(data_states$perc_trump, na.rm = TRUE)
mean(data_states$perc_higheducation, na.rm = TRUE)

# Scatter plot with dashed red reference lines at the two variable means.
# The line positions are computed from data_states instead of being typed in
# as numeric literals, so they stay correct whenever the data change.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept here for compatibility with older ggplot2 versions.
sc2 <- ggplot(data = data_states, aes(x = perc_higheducation, y = perc_trump)) +
  geom_point() +
  xlab("Share of highly-educated persons in %") +
  ylab("Support of Trump in %") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent) +
  geom_hline(yintercept = mean(data_states$perc_trump, na.rm = TRUE),
             linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = mean(data_states$perc_higheducation, na.rm = TRUE),
             linetype = "dashed", color = "red", size = 1)
sc2

# Scatter plot with a fitted linear regression line (lm) and no
# confidence band (se = FALSE)
sc3 <- ggplot(
  data = data_states,
  aes(x = perc_higheducation, y = perc_trump)
) +
  geom_point() +
  labs(
    x = "Share of highly-educated persons in %",
    y = "Support of Trump in %"
  ) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent) +
  geom_smooth(method = lm, se = FALSE)
sc3

# Correlation matrix (via Hmisc::rcorr) for the share of highly-educated
# persons and Trump support across the rows of data_states
vars <- c("perc_higheducation", "perc_trump")
cor.vars <- data_states[vars]
cor.vars %>%
  as.matrix() %>%
  rcorr()

# Cross tab of trump (rows) by education (columns) with column percentages;
# show.summary = TRUE adds summary statistics at the bottom of the table
tab_xtab(
  var.row = data_us$trump,
  var.col = data_us$education,
  show.obs = TRUE,
  show.col.prc = TRUE,
  show.summary = TRUE
)

# Chi-squared test and Cramér's V for the association between trump and
# education.
# chisq.test() is given the two vectors directly and cross-tabulates them
# itself.
chisq.test(data_us$trump, data_us$education)
# Contingency table of trump (rows) by education (columns), stored so that it
# can be passed to cramerV() below.
cramerV_tabelle <- table(data_us$trump, data_us$education)
# Cramér's V computed from the contingency table.
cramerV(cramerV_tabelle)