#---
#title: "III Inferential Statistics - Case Study Satisfaction with Government"
#author: "Conrad Ziller, University of Duisburg-Essen"
#---

#Load Packages
packages <- c("haven" , "dplyr", "ggplot2")
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages], repos = "https://cran.r-project.org/")
}
invisible(lapply(packages, library, character.only = TRUE))


# Simulate a Bernoulli variable and plot its relative frequencies ----
# 10,000 draws with replacement from {1, 0}; 1 is drawn with probability 0.3
# and 0 with probability 0.7.
possible_values <- c(1, 0)
Bernoulli <- sample(
  x = possible_values,
  size = 10000,
  replace = TRUE,
  prob = c(0.3, 0.7)
)

# Observed shares of 0 and 1 in the simulated draws
prop.table(table(Bernoulli))

# Build the histogram object without drawing it, then overwrite its density
# slot with percentages so the y-axis reads as "percent of draws"
h <- hist(Bernoulli, plot = FALSE)
h$density <- h$counts / sum(h$counts) * 100
plot(h, freq = FALSE, axes = FALSE)

# Custom axes: label the two outcomes and mark percentages in steps of 10
axis(side = 1, at = c(0, 1), labels = c("Blue", "Red"))
axis(side = 2, at = seq(0, 70, by = 10))

# Draw a standard normal distribution ----
# Grid of 1001 equally spaced z-values on [-4, 4] at which the density is
# evaluated. (The former `x = rnorm(z)` line was removed: its result was never
# used, and it only consumed random numbers.)
z <- seq(-4, 4, length.out = 1001)
plot(
  x = z,
  y = dnorm(z),
  type = "l",
  bty = "n",
  main = "Standard normal distribution",
  ylab = "Probability density",
  xlab = "z",
  xlim = c(-3, 3)
)
axis(1, at = seq(-4, 4, by = 1))

# Annotate the density function with the 5% probability mass tails
# (2.5% in each tail, bounded by the 2.5% and 97.5% quantiles).
lower_cut <- qnorm(0.025)
upper_cut <- qnorm(0.975)

# Each polygon follows the density curve over the tail and is then closed
# along the baseline (y = 0).
polygon(
  x = c(z[z <= lower_cut], lower_cut, min(z)),
  y = c(dnorm(z[z <= lower_cut]), 0, 0),
  col = grey(0.8)
)
polygon(
  x = c(z[z >= upper_cut], max(z), upper_cut),
  y = c(dnorm(z[z >= upper_cut]), 0, 0),
  col = grey(0.8)
)


# Central Limit Theorem: histogram of simulated values ----
# Fix the seed so the simulated values are reproducible, then draw 100,000
# values from a normal distribution with mean 4.2 and standard deviation 1.
set.seed(123)
data <- rnorm(n = 100000, mean = 4.2, sd = 1)

# Density-scaled histogram of the simulated values
hist(
  data,
  freq = FALSE,
  col = "gray",
  xlab = "Data Values",
  main = "Means of government satisfaction"
)

# Overlay the normal density implied by the sample mean and sample sd
curve(
  dnorm(x, mean = mean(data), sd = sd(data)),
  col = "black",
  lwd = 2,
  add = TRUE
)


# Load ESS data, then show a histogram and summary of stfgov ----
# Read the Stata file from the local data/ folder, decoding text as latin1.
data_ess <- read_dta(file = "data/ESS9_DE.dta", encoding = "latin1")

# Histogram of satisfaction with the government; bin widths are chosen by the
# Freedman-Diaconis rule ("FD").
hist(data_ess$stfgov, breaks = "FD")

# Summary statistics of the same variable
summary(data_ess$stfgov)


# Confidence interval for the mean of stfgov ----
# We store the mean, the standard deviation and the number of observations as
# objects, because this way we can refer to them later on.
# n is the number of non-missing observations; it is derived from the data
# (instead of being typed in by hand) so that it always matches the values
# that enter mean() and sd() below.
n <- sum(!is.na(data_ess$stfgov))
xbar <- mean(data_ess$stfgov, na.rm = TRUE)
s <- sd(data_ess$stfgov, na.rm = TRUE)

# Set confidence level with 1-alpha (alpha = our willingness to be wrong in
# repeated samples)
conf.level <- 0.95

# Critical z-value for a two-sided interval: the (1 + conf.level) / 2 quantile
# of the standard normal distribution
z <- qnorm((1 + conf.level) / 2)

# Confidence interval: mean +/- z * standard error, with s / sqrt(n) as the
# standard error of the mean
lower.ci <- xbar - z * s / sqrt(n)
upper.ci <- xbar + z * s / sqrt(n)

# Print confidence intervals
cat("The", conf.level*100,"% confidence interval for the population mean is (",round(lower.ci, 2), ",", round(upper.ci, 2),").\n")


# T-Test
# One-sample t-test; its reported confidence interval uses the same
# confidence level as the manual calculation above.
t.test(data_ess$stfgov, conf.level = conf.level)


# Confidence interval plot ----
# One-row data frame holding the point estimate and both interval bounds
df <- data.frame(xbar = xbar, lower.ci = lower.ci, upper.ci = upper.ci)

# The estimate is placed at the dummy position x = 1; coord_flip() turns the
# plot so the interval is drawn horizontally.
ggplot(data = df, mapping = aes(x = 1, y = xbar)) +
  geom_point(size = 3, shape = 21, fill = "white", colour = "black") +
  geom_errorbar(mapping = aes(ymin = lower.ci, ymax = upper.ci), width = 0.2) +
  scale_x_continuous(breaks = seq(0, 10, by = 1), limits = c(1, 1)) +
  coord_flip() +
  labs(x = "", y = "Mean with 95% CI") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.x = element_blank()
  )


# Dichotomized government satisfaction ----
# Recode of variable: values up to 5 become 0, values above 5 become 1;
# missing values match neither condition and stay NA.
data_ess <- data_ess %>%
  mutate(
    stfgov_di = case_when(
      stfgov <= 5 ~ 0,
      stfgov > 5 ~ 1
    )
  )

# Proportion via table command
prop.table(table(data_ess$stfgov_di))

# Correspondence with the mean of the summary command
summary(data_ess$stfgov_di)

# Set confidence level with 1-alpha
conf.level <- 0.95

# Calculating the critical z-value for a two-sided test
z <- qnorm((1 + conf.level) / 2)

# Sample proportion and number of valid cases, both taken from the data
# instead of being typed in by hand, so the interval cannot drift out of
# sync with the recoded variable.
p_hat <- mean(data_ess$stfgov_di, na.rm = TRUE)
n <- sum(!is.na(data_ess$stfgov_di))

# Calculation of the confidence interval: proportion +/- z * standard error,
# where the standard error of a proportion is sqrt(p * (1 - p) / n)
se_p <- sqrt(p_hat * (1 - p_hat) / n)
lower.ci <- p_hat - z * se_p
upper.ci <- p_hat + z * se_p

# Print the confidence intervals
cat("The", conf.level*100, "% confidence interval for the proportion value is (", round(lower.ci, 2), ",", round(upper.ci, 2), ").\n")

# Test using t.test (same confidence level as the manual calculation)
t.test(data_ess$stfgov_di, conf.level = conf.level)


# Correlation coefficient ----
# Correlation test between household income (hinctnta) and satisfaction with
# the government (stfgov); the result is stored and printed.
cor <- cor.test(data_ess$hinctnta, data_ess$stfgov)
print(cor)


# Graphical representation of the CI ----
# Linear fit of stfgov on hinctnta with a 95% confidence band around the line
ciplot <- ggplot(data = data_ess, mapping = aes(x = hinctnta, y = stfgov)) +
  geom_smooth(method = lm, se = TRUE, level = 0.95) +
  labs(
    x = "Income (Deciles)",
    y = "Satisfaction with the government"
  )
print(ciplot)


# Hypothesis test of a difference in means ----
# Split respondents into two income groups: hinctnta values up to 6 are coded
# 0, values above 6 are coded 1; missing values stay NA.
data_ess <- mutate(
  data_ess,
  hinctnta_di = case_when(
    hinctnta <= 6 ~ 0,
    hinctnta > 6 ~ 1
  )
)
summary(data_ess$hinctnta_di)


# Test implementation
# Observed difference in mean satisfaction: higher-income group (1) minus
# lower-income group (0)
with(
  data_ess,
  mean(stfgov[hinctnta_di == 1], na.rm = TRUE) -
    mean(stfgov[hinctnta_di == 0], na.rm = TRUE)
)

# Two-sample t-test comparing the same two groups
t.test(data_ess$stfgov[data_ess$hinctnta_di==1], data_ess$stfgov[data_ess$hinctnta_di==0])


# Hypothesis test of correlation ----
# Same correlation test between income and government satisfaction as in the
# correlation section; the parentheses around the assignment print the result
# right after storing it.
(cor <- cor.test(data_ess$hinctnta, data_ess$stfgov))