# stats-R/Project.R

# Task 1


# b)
# Probability mass function of a geometric distribution with success
# probability 0.45, evaluated for k = 0, ..., 12. Note that dgeom() counts the
# number of failures before the first success, so its support starts at 0.
X <- 0:12
# Spell out `prob` in full instead of relying on partial argument matching.
Y <- dgeom(X, prob = 0.45)
# Vertical spikes (type = "h") topped with filled dots give a stem plot.
plot(X, Y, type = "h", main = "Geometric distribution (p=0.45)",
     ylab = "P(X=k)")
points(X, Y, pch = 16)

# Task 2

# b)
# Probability mass function of a binomial distribution with 30 trials and
# success probability 0.6, evaluated over its full support k = 0, ..., 30.
X <- 0:30
Y <- dbinom(X, size = 30, prob = 0.6)
# Vertical spikes (type = "h") topped with filled dots give a stem plot.
plot(X, Y, type = "h", main = "Binomial distribution (p=0.6)",
     ylab = "P(X=k)")
points(X, Y, pch = 16)

# Task 3

# 2)
# Poisson probabilities P(X = k) for k = 0, ..., 40 with rate lambda = 20,
# drawn as a stem plot: spikes first, then a filled dot on top of each spike.
X <- seq.int(from = 0L, to = 40L)
Y <- dpois(x = X, lambda = 20)
plot(
  x = X,
  y = Y,
  type = "h",
  ylab = "P(X=k)",
  main = "Poisson distribution (lambda=20)"
)
points(x = X, y = Y, pch = 16)

# Task 4
# Load the first data set and report its basic descriptive statistics.
library(readr)
data_set1 <- read_csv("data_set1.csv")
# View() opens a GUI data viewer, so only call it in an interactive session;
# this keeps the script runnable through Rscript or source().
if (interactive()) {
  View(data_set1)
}

# 1) The name of the column is "Val".
names(data_set1)
# 2) There are 1029 rows.
nrow(data_set1)
# All statistics below are computed on the Val column explicitly, rather than
# on the whole data frame, so they stay valid if further columns are added.
# 3) Max = 109.379
max(data_set1$Val)
# 4) Min = 4.193534
min(data_set1$Val)
# 5) Mean = 50.49665
mean(data_set1$Val)
# 6) Median = 50.52415
median(data_set1$Val)
# 7) Variance = 218.7175
var(data_set1$Val)
# 8) Standard deviation = 14.7891
sd(data_set1$Val)

# Task 5
# Compare the empirical distribution of data_set1$Val with a normal
# distribution that has the same mean and standard deviation.
# 1)
library(readr)
data_set1 <- read_csv("data_set1.csv")
# Compute the sample statistics once and reuse them below.
val_mean <- mean(data_set1$Val)
val_sd <- sd(data_set1$Val)
# Bandwidth of the kernel density estimate drawn in steps 2) and 3).
kde_bw <- 3
# Integer grid covering the full data range plus three bandwidths on each
# side, which is how far density() extends its estimate by default (cut = 3).
# A fixed 0:100 grid would clip everything above 100.
X <- floor(min(data_set1$Val) - 3 * kde_bw):ceiling(max(data_set1$Val) + 3 * kde_bw)
Y <- dnorm(X, mean = val_mean, sd = val_sd)
plot(X, Y, type = "l", ylim = c(0, 0.03),
     main = "Data set vs normal distribution")
# 2)
d <- density(data_set1$Val, bw = kde_bw)
# 3)
lines(d, col = "red")
# 4)
abline(v = val_mean, col = "green")

# Task 6
# Pairwise correlations between all mtcars variables, rounded to two decimals.
# Reading the hp row of the printed matrix, the 4 variables most correlated
# with hp are: mpg, cyl, disp, carb
cars <- mtcars
cor_matrix <- cor(cars)
round(cor_matrix, digits = 2)

# Task 7
# 1) Linear regression of hp on cyl, disp, carb and mpg.
model <- lm(formula = hp ~ cyl + disp + carb + mpg, data = mtcars)
# 2) Predictions of the model for the rows it was fitted on.
hp_hat <- predict(object = model)
# 3) Residual = observed hp minus predicted hp.
residuals <- with(mtcars, hp - hp_hat)
# 4) Kernel density estimate of the residuals and its plot.
hpplot <- density(x = residuals)
plot(x = hpplot, main = "Density of residuals")
# 6) Coefficient of determination of the fitted model.
summary(model)[["r.squared"]] # 0.8594845 - Correct and accurate

# Task 8
# Compare the distribution of the stick lengths in data_set2$Val with a normal
# distribution that has the same mean and standard deviation.
library(readr)
data_set2 <- read_csv("data_set2.csv")
stick_len <- data_set2$Val
stick_mean <- mean(stick_len)
# Kernel density estimate of the observed lengths.
d <- density(stick_len, bw = 1)
# Evaluate the normal curve on a fine grid spanning the same x-range as the
# density estimate. A `min:max` sequence would step by 1 from a fractional
# start and would not line up with the data range.
X <- seq(min(d$x), max(d$x), length.out = 200)
Y <- dnorm(X, mean = stick_mean, sd = sd(stick_len))
# Size the y-axis to the taller of the two curves so neither is clipped.
plot(X, Y, type = "l", ylim = c(0, max(Y, d$y)),
     main = "Normal distribution of stick lengths")
lines(d, col = "red")
abline(v = stick_mean, col = "green")
# The length of the sticks is not acceptable: the sample mean is much higher
# than the value under the null hypothesis (µ = 30), and most values are not
# close to 30.
# NOTE(review): this conclusion is read off the plot only; a one-sample test
# such as t.test(data_set2$Val, mu = 30) would be needed to quantify it.