#---
#title: "I Univariate Statistics – Case Study Socio-Demographic Reporting"
#author: "Conrad Ziller, University of Duisburg-Essen"
#---


# Package setup: every package named in `packages` is installed from CRAN when
# it is not yet available and is then attached. Add further packages to the
# vector below if they are needed.
packages <- c(
  "tidyverse", "readxl", "knitr", "stargazer", "ggplot2",
  "sf", "dplyr", "viridis", "Hmisc", "kableExtra"
)

# Logical vector: TRUE where the package is already installed
installed_packages <- packages %in% rownames(installed.packages())

# Install only the packages that are missing
if (!all(installed_packages)) {
  install.packages(
    packages[!installed_packages],
    repos = "https://cran.r-project.org/"
  )
}

# Attach all packages; invisible() suppresses the list returned by lapply()
invisible(
  lapply(packages, library, character.only = TRUE)
)


# Read data
# Attach the package "readxl", which provides read_excel(). In the markdown
# document, all required packages are installed in the background using the
# command "install.packages('packagename')". In the downloadable script files,
# the installation and activation of the packages is automated.
library(readxl)

# Import the data from Excel format. The directory where the file is located
# can be changed (e.g., "C:/user/documents/folderxyz/inkar_nrw.xls").
data_nrw <- read_excel(path = "data/inkar_nrw.xls")


# First ten rows
head(data_nrw, n = 10)


# Read a so-called shapefile for NRW. It contains the polygons (e.g., district
# boundaries) needed to build maps and is merged with the structural data.
nrw_shp <- st_read("data/dvg2krs_nw.shp")

# Convert the join key to numeric. Going through as.character() guards against
# KN having been read as a factor, where as.numeric() alone would return the
# internal level codes.
# NOTE(review): presumably data_nrw$KN is numeric as well - confirm.
nrw_shp$KN <- as.numeric(as.character(nrw_shp$KN))

# right_join() keeps every row of data_nrw and attaches the matching geometry
nrw_shp <- nrw_shp %>%
  right_join(data_nrw, by = "KN")

# Building the map using the "ggplot2" package: districts are filled according
# to their unemployment rate; missing values are drawn in "grey100".
# NOTE(review): since ggplot2 3.4.0 line widths are set via `linewidth`;
# confirm that `size = 0.1` still has the intended effect on the borders.
ggplot() +
  geom_sf(data = nrw_shp, aes(fill = unemp), color = "gray", size = 0.1) +
  ggtitle("North-Rhine Westphalia, Germany") +
  guides(fill = guide_colorbar(title = "Unemployment in %")) +
  # The stray trailing comma of the original call was removed: it passed an
  # empty argument on through `...`.
  scale_fill_gradientn(
    colours = rev(magma(3)),
    na.value = "grey100"
  )


# Histograms
library(lattice)

# Generate the variable for the population density (population / flaeche)
data_nrw$popdens <- with(data_nrw, population / flaeche)

# Histograms of the four variables named in the formula
histogram(
  ~ popdens + unemp + avage + crimerate,
  data = data_nrw,
  type = "percent",                  # bar heights as percent of observations
  breaks = 10,
  col = "grey",
  xlab = "",
  ylab = "Percent of observations",
  scales = list(relation = "free"),  # axis limits are not shared across panels
  layout = c(2, 1)                   # two columns, one row
)


## Alternative approach with the "hist" function
# hist(data_nrw$unemp)
# hist(data_nrw$avage)
# hist(data_nrw$crimerate)


# Summary of all variables in the data set
summary(data_nrw)


# Graphical representation of the dispersion of 'unemp' and 'crimerate'
# Each observation is drawn as one point; height = 0 means the points are only
# jittered horizontally, so the plotted values themselves are not altered.
s_unemp <- ggplot(
  data = data_nrw,
  aes(x = reorder(area, area), y = unemp, color = area)
) +
  geom_jitter(height = 0) +
  ggtitle("Unemployment rate 2020 in %")

# Add a red horizontal reference line and hide the x-axis annotation.
# NOTE(review): 7.432 is hard-coded - presumably the mean of 'unemp' as shown
# by summary(); confirm, and update it if the data change.
s_unemp +
  geom_hline(yintercept = 7.432, linetype = "solid", color = "red", size = 0.1) +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 4),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )


# Same plot for the crime rate
s_crime <- ggplot(
  data = data_nrw,
  aes(x = reorder(area, area), y = crimerate, color = area)
) +
  geom_jitter(height = 0) +
  ggtitle("Crime rate 2020")

# NOTE(review): 6244 is hard-coded - presumably the mean of 'crimerate' as
# shown by summary(); confirm, and update it if the data change.
s_crime +
  geom_hline(yintercept = 6244, linetype = "solid", color = "red", size = 0.1) +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 4),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )


# Measures of dispersion for 'unemp' and 'crimerate'.
# Missing values are dropped consistently (na.rm = TRUE) in every statistic.
# Previously only max()/min() did so, while sd() and mean() would have returned
# NA and quantile() would have stopped with an error if a value was missing.

# Standard deviation, range and coefficient of variation of unemployment
sd(data_nrw$unemp, na.rm = TRUE)
range <- max(data_nrw$unemp, na.rm = TRUE) - min(data_nrw$unemp, na.rm = TRUE)
range
# Coefficient of variation: how much percent of the mean is the standard
# deviation?
varcoef <- sd(data_nrw$unemp, na.rm = TRUE) /
  mean(data_nrw$unemp, na.rm = TRUE) * 100
varcoef


# Standard deviation, range and coefficient of variation of crime rate
sd(data_nrw$crimerate, na.rm = TRUE)
range <- max(data_nrw$crimerate, na.rm = TRUE) -
  min(data_nrw$crimerate, na.rm = TRUE)
range
varcoef <- sd(data_nrw$crimerate, na.rm = TRUE) /
  mean(data_nrw$crimerate, na.rm = TRUE) * 100
varcoef


# Quartiles and IQR for unemployment
quantile(data_nrw$unemp, na.rm = TRUE)

# quantile() returns the 0%, 25%, 50%, 75% and 100% quantiles by default, so
# element 4 minus element 2 is the interquartile range (75% - 25%).
unemp.score.quart <- quantile(data_nrw$unemp, names = FALSE, na.rm = TRUE)
unemp.score.quart[4] - unemp.score.quart[2]


# Quartiles and IQR for crime rate
quantile(data_nrw$crimerate, na.rm = TRUE)

crimerate.score.quart <- quantile(
  data_nrw$crimerate,
  names = FALSE,
  na.rm = TRUE
)
crimerate.score.quart[4] - crimerate.score.quart[2]


# Boxplots
# Vertical boxplot of the unemployment rate
boxplot(
  data_nrw$unemp,
  main = "Unemployment rate",
  ylab = "in %",
  col = "blue",
  horizontal = FALSE
)

# Vertical boxplot of the crime rate
boxplot(
  data_nrw$crimerate,
  main = "Crime rate",
  ylab = "in cases per 100.000 inhab.",
  col = "orange",
  horizontal = FALSE
)


# Scatterplots
# Crime rate (y) against unemployment rate (x), points only
sc1 <- ggplot(data_nrw, aes(x = unemp, y = crimerate)) +
  geom_point() +
  labs(x = "Unemployment rate 2020 in %", y = "Crime rate")
sc1

# The same scatterplot with a fitted linear regression line added
# (se = FALSE: no confidence band); this overwrites 'sc1'
sc1 <- ggplot(data_nrw, aes(x = unemp, y = crimerate)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  labs(x = "Unemployment rate in %", y = "Crime rate")
sc1


# Pearson's correlation coefficient
# Select the two variables of interest and pass them to rcorr() as a matrix
vars <- c("unemp", "crimerate")
cor.vars <- data_nrw[vars]
cor.vars %>%
  as.matrix() %>%
  rcorr()


# Regression
# Model 1: crime rate regressed on an intercept and the unemployment rate
model1 <- lm(
  formula = crimerate ~ 1 + unemp,
  data = data_nrw
)
# Model 2: additionally includes the population density
model2 <- lm(
  formula = crimerate ~ 1 + unemp + popdens,
  data = data_nrw
)

# Both models side by side in one text table
stargazer(model1, model2, type = "text")