X

Plane crash data

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A Simple Skript for planecrash data from: 
# http://www.planecrashinfo.com/database.htm
# The aviation accident database includes: 
# All civil and commercial aviation accidents of scheduled and non-scheduled 
# passenger airliners worldwide, which resulted in a fatality 
# (including all U.S. Part 121 and Part 135 fatal accidents)
# All cargo, positioning, ferry and test flight fatal accidents.
# All military transport accidents with 10 or more fatalities.
# All commercial and military helicopter accidents with greater than 
# 10 fatalities.
# All civil and military airship accidents involving fatalities.
# Aviation accidents involving the death of famous people.
# Aviation accidents or incidents of noteworthy interest.

# Database Format
# Date:	 Date of accident,  in the format - January 01, 2001
# Time:	 Local time, in 24 hr. format unless otherwise specified
# Airline/Op:	 Airline or operator of the aircraft
# Flight #:	 Flight number assigned by the aircraft operator
# Route:	 Complete or partial route flown prior to the accident
# AC Type:	 Aircraft type
# Reg:	 ICAO registration of the aircraft
# cn / ln:	 Construction or serial number / Line or fuselage number
# Aboard:	 Total aboard (passengers / crew)
# Fatalities:	 Total fatalities aboard (passengers / crew)
# Ground:	 Total killed on the ground
# Summary:	 Brief description of the accident and cause if known


# to be done: histograms, t test?, types of chrashes, 

# country filter?, region filter, plane filter
# text analysis, 

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# packages

library("XML")
library("quantmod")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# 

rm(list = ls(all = TRUE))
getwd()
#system("ls")
setwd("~/ownCloud/STA_Statistics/PlaneCrashData")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# get the available years

url <- "http://www.planecrashinfo.com/database.htm"
tables <- readHTMLTable(url,header=FALSE)
years <- NULL
for(i in 1:length(tables[[2]][,1])){
  for(e in 2:length(tables[[2]][1,])){
    years<-c(years,levels(tables[[2]][i,e])[i])
  }
}
years
years<-na.omit(as.numeric(years))
years

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# get all data tables

table.crash.all <- list()

for(i in 1:length(years)){
  url2 <- paste("http://www.planecrashinfo.com/",
                years[i],"/",years[i],".htm",sep="")
  assign(paste("table.crash.",years[i],sep=""),readHTMLTable(url2))
  table.crash.all[[i]] <- get(paste("table.crash.",years[i],sep=""))
  print(paste("Year",years[i],"downloaded"))
  }
#table.crash.all
length(table.crash.all)
save(table.crash.all,file="planecrashdata_list.RData")
for(i in 1:length(years)){
  data<-table.crash.all[[i]]
  save(data,file=paste("planecrashdata_",years[i],".RData",sep=""))
}
for(i in 1:length(years)){
  data<-table.crash.all[[i]]
  write.csv(data,file=paste("planecrashdata_",years[i],".csv",sep=""))
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Plane Crashes per Year

years.crashes <- data.frame()
for(i in 1:length(years)){
  years.crashes[i,1] <- years[i]
  years.crashes[i,2] <- length(as.data.frame(table.crash.all[[i]])[,1])
}
years.crashes
rownames(years.crashes)<-years.crashes[,1]

par(mfrow=c(1,1))

plot(years.crashes,type="b",main="Plane Crashes per Year",
     xlab="Year",ylab="number of crashes")
points(years.crashes,col="red")

par(mfrow=c(1,1))

setwd("/Users/impac/ownCloud/STA_Statistics/PlaneCrashData/plots")

jpeg(file="Plane Crashes per Year-points.jpeg", width = 1080, 
     height = 1080, pointsize = 12, quality = 75)

plot(years.crashes,type="b",main="Plane Crashes per Year",
     xlab="Year",ylab="number of crashes")

points(years.crashes,col="red")
dev.off()

jpeg(file="Plane Crashes per Year-bars.jpeg", width = 1080, 
     height = 1080, pointsize = 12, quality = 75)

plot(years.crashes,type="h",main="Plane Crashes per Year",
     xlab="Year",ylab="number of crashes")

points(years.crashes,col="red")
dev.off()

jpeg(file="Plane Crashes per Year-steps.jpeg", width = 1080, 
     height = 1080, pointsize = 12, quality = 75)

plot(years.crashes,type="s",main="Plane Crashes per Year",
     xlab="Year",ylab="number of crashes")

points(years.crashes,col="red")
dev.off()

par(mfrow=c(2,2))

plot(years.crashes[1:30,],type="h",main=paste("Plane Crashes per Year",
                           years.crashes[1,1],"-",years.crashes[30,1]),xlab="Year",
                           ylab="number of crashes")

points(years.crashes[1:30,],col="red")
abline(h=mean(years.crashes[1:30,2]),col="blue")

plot(years.crashes[31:60,],type="h",main=paste("Plane Crashes per Year",years.crashes[31,1],"-",years.crashes[60,1]),xlab="Year",ylab="number of crashes")
points(years.crashes[31:60,],col="red")
abline(h=mean(years.crashes[31:60,2]),col="blue")

plot(years.crashes[61:80,],type="h",main=paste("Plane Crashes per Year",years.crashes[61,1],"-",years.crashes[80,1]),xlab="Year",ylab="number of crashes")
points(years.crashes[61:80,],col="red")
abline(h=mean(years.crashes[61:80,2]),col="blue")

plot(years.crashes[81:length(years.crashes[,1]),],type="h",main=paste("Plane Crashes per Year",years.crashes[81,1],"-",years.crashes[length(years.crashes[,1]),1]),xlab="Year",ylab="number of crashes")
points(years.crashes[81:length(years.crashes[,1]),],col="red")
abline(h=mean(years.crashes[81:length(years.crashes[,1]),2]),col="blue")


#jpeg(file="Plane Crashes per Year.jpeg", width = 1080, height = 1080, pointsize = 12, quality = 75)
#par(mfrow=c(2,2))

#plot(years.crashes[1:30,],type="h",main=paste("Plane Crashes per Year",years.crashes[1,1],"-",years.crashes[30,1]),xlab="Year",ylab="number of crashes")
#points(years.crashes[1:30,],col="red")
#abline(h=mean(years.crashes[1:30,2]),col="blue")

#plot(years.crashes[31:60,],type="h",main=paste("Plane Crashes per Year",years.crashes[31,1],"-",years.crashes[60,1]),xlab="Year",ylab="number of crashes")
#points(years.crashes[31:60,],col="red")
#abline(h=mean(years.crashes[31:60,2]),col="blue")

#plot(years.crashes[61:80,],type="h",main=paste("Plane Crashes per Year",years.crashes[61,1],"-",years.crashes[80,1]),xlab="Year",ylab="number of crashes")
#points(years.crashes[61:80,],col="red")
#abline(h=mean(years.crashes[61:80,2]),col="blue")

#plot(years.crashes[81:length(years.crashes[,1]),],type="h",main=paste("Plane Crashes per Year",years.crashes[81,1],"-",years.crashes[length(years.crashes[,1]),1]),xlab="Year",ylab="number of crashes")
#points(years.crashes[81:length(years.crashes[,1]),],col="red")
#abline(h=mean(years.crashes[81:length(years.crashes[,1]),2]),col="blue")

#dev.off()

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Histograms of the number of crashes

par(mfrow=c(1,1))

hist(years.crashes[,2], nclass=length(years.crashes[,2]), 
     freq=TRUE, main='histogram of plane crashes',
     xlab="number of chrashes per year", col = "lightblue", border = "blue")

hist(years.crashes[,2], nclass=length(years.crashes[,2]), freq=FALSE, 
     main='histogram of plane crashes',xlab="number of chrashes per year", 
     col = "lightblue", border = "blue")

curve(dnorm(x, mean=mean(years.crashes[,2]),sd=sd(years.crashes[,2])), 
      add=TRUE, col="red")

par(mfrow=c(1,2))

qqnorm(years.crashes[,2])
qqline(years.crashes[,2], col = 2)
plot(density(years.crashes[,2]),log='y', main='')
curve(dnorm(x, mean=mean(years.crashes[,2]),
            sd=sd(years.crashes[,2])), log="y", add=TRUE, col="red")

# different periods

par(mfrow=c(1,2))

# 1980-1999
#hist(years.crashes[61:80,2], nclass=20, freq=TRUE, 
#     main='',xlab="number of chrashes per year", col = "lightblue", 
#     border = "blue")

hist(years.crashes[61:80,2], nclass=20, freq=FALSE, main='1980-1999',xlab="number of chrashes per year", col = "lightblue", border = "blue")
curve(dnorm(x, mean=mean(years.crashes[61:80,2]),sd=sd(years.crashes[61:80,2])), add=TRUE, col="red")

# 2000-2020
#hist(years.crashes[81:length(years.crashes[,1]),2], nclass=length(years.crashes[81:length(years.crashes[,1]),2]), freq=TRUE, main='',xlab="number of chrashes per year", col = "lightblue", border = "blue")
hist(years.crashes[81:length(years.crashes[,1]),2], nclass=length(years.crashes[81:length(years.crashes[,1]),2]), freq=FALSE, main='2000-2020',xlab="number of chrashes per year", col = "lightblue", border = "blue")
curve(dnorm(x,mean=mean(years.crashes[81:length(years.crashes[,1]),2]),sd=sd(years.crashes[81:length(years.crashes[,1]),2])), add=TRUE, col="red")

par(mfrow=c(2,2))

qqnorm(years.crashes[61:80,2])
qqline(years.crashes[61:80,2], col = 2)
plot(density(years.crashes[61:80,2]),log='y', main='')
curve(dnorm(x, mean=mean(years.crashes[61:80,2]),sd=sd(years.crashes[61:80,2])), log="y", add=TRUE, col="red")

qqnorm(years.crashes[81:length(years.crashes[,1]),2])
qqline(years.crashes[81:length(years.crashes[,1]),2], col = 2)
plot(density(years.crashes[81:length(years.crashes[,1]),2]),log='y', main='')
curve(dnorm(x, mean=mean(years.crashes[81:length(years.crashes[,1]),2]),sd=sd(years.crashes[81:length(years.crashes[,1]),2])), log="y", add=TRUE, col="red")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# boxplots

par(mfrow=c(1,1))

boxplot(years.crashes[,2], main="number of plain crashes per year 1920 - 2020", xlab="", ylab="plain crashes per year")

boxplot(years.crashes[,2], notch=TRUE, col=(c("gold","darkgreen")), main="number of plain crashes per year 1920 - 2020", ylab="plain crashes per year")

years.crashes[1:30,3]<-rep("1920-1949",30)
years.crashes[31:60,3]<-rep("1950-1979",30)278750
years.crashes[61:80,3]<-rep("1980-1999",20)
years.crashes[81:length(years.crashes[,1]),3]<-rep("2000-2020",21)

colnames(years.crashes)<-c("year","ncrashes","cat")

boxplot(ncrashes~cat, data=years.crashes, notch=TRUE, col=(c("green","blue","blue","green")), main="number of plain crashes per year - groups", xlab="")

# start from here
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# t test

# independent 2-group t-test    t.test(y~x) # where y is numeric and x is a binary factor
# independent 2-group t-test    t.test(y1,y2) # where y1 and y2 are numeric
# paired t-test                 t.test(y1,y2,paired=TRUE) # where y1 & y2 are numeric
# one sample t-test             t.test(y,mu=3) # Ho: mu=3

t.test(years.crashes[61:80,2],years.crashes[81:length(years.crashes[,1]),2])

# Welch Two Sample t-test
#
# data:  years.crashes[61:80, 2] and years.crashes[81:length(years.crashes[, 1]), 2]
# t = 5.4142, df = 31.453, p-value = 6.268e-06
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
#   14.70178 32.45536
# sample estimates:
#   mean of x mean of y 
# 59.15000  35.57143 

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# rate of difference

par(mfrow=c(3,1))

rate<-na.omit(ROC(years.crashes[,2]))
plot(rate,type="l")
plot(cumsum(rate),type="l")
hist(rate, nclass=20)

rate<-na.omit(ROC(years.crashes[61:80,2]))
plot(rate,type="l")
plot(cumsum(rate),type="l")
hist(rate, nclass=20)

rate<-na.omit(ROC(years.crashes[81:length(years.crashes[,1]),2]))
plot(rate,type="l")
plot(cumsum(rate),type="l")
hist(rate, nclass=20)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# One big Table

table.crash.one.table <- data.frame(table.crash.all[[1]],stringsAsFactors = TRUE)
colnames(table.crash.one.table)<-c("date","location","type","fatalities")

for(i in 2:length(years)){
  data <- data.frame(table.crash.all[[i]],stringsAsFactors = TRUE)
  colnames(data)<-c("date","location","type","fatalities")
  table.crash.one.table <- rbind(table.crash.one.table,data)
}
table.crash.one.table

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# create a time series object

date <- gsub("Jan","01",table.crash.one.table[,1])
date <- gsub("Feb","02",date)
date <- gsub("Mar","03",date)
date <- gsub("Apr","04",date)
date <- gsub("May","05",date)
date <- gsub("Jun","06",date)
date <- gsub("Jul","07",date)
date <- gsub("Aug","08",date)
date <- gsub("Sep","09",date)
date <- gsub("Oct","10",date)
date <- gsub("Nov","11",date)
date <- gsub("Dec","12",date)

gsub("/","-",table.crash.one.table[,4])

fatalities <- strsplit(gsub("/","-",table.crash.one.table[,4]),'-')
number<-NULL
for(i in 1:length(fatalities)){number<-c(number,fatalities[[i]][1])}
#as.numeric(number)
table.crash.one.table[,5]<-as.numeric(number)


#strsplit(gsub("(","-",fatalities[[i]][2], fixed = TRUE),"-")[1]
number2<-NULL
for(i in 1:length(fatalities)){number2<-c(number2,
                strsplit(gsub("(","-",fatalities[[i]][2], fixed = TRUE),"-")[[1]][1])}
number2
table.crash.one.table[,6]<-as.numeric(number2)

colnames(table.crash.one.table)<-c("date","location","type",
                                   "fatalities","fatalities-passengers","fatalities-crew")

#table.crash.one.table.xts <- xts(table.crash.one.table[,2:6], order.by=as.Date(date,"%d %m %Y"))
#colnames(table.crash.one.table.xts)<-c("location","type","fatalities",
#"passengerfatalities","crewfatalities")

table.crash.one.table.xts <- xts(table.crash.one.table[,5:6], order.by=as.Date(date,"%d %m %Y"))
colnames(table.crash.one.table.xts)<-c("totalfatalities","totalaboard")

table.crash.one.table.xts$totalfatalities
table.crash.one.table.xts$totalaboard

plot(table.crash.one.table.xts$totalfatalities, main="Plane Crash Total Fatalities")

plot(table.crash.one.table.xts$totalaboard, main="Plane Crash Total Aboard")

plot(table.crash.one.table.xts$totalaboard, main="Plane Crash Total Aboard vs Total Fatalities")
lines(table.crash.one.table.xts$totalfatalities,col="red")

plot(table.crash.one.table.xts$totalaboard["2000/2020"], 
     main="Plane Crash Total Aboard vs Total Fatalities",type="c")

plot(table.crash.one.table.xts$totalaboard["2000/2020"], 
     main="Plane Crash Total Aboard vs Total Fatalities",type="o")

plot(table.crash.one.table.xts$totalaboard["2000/2020"], 
     main="Plane Crash Total Aboard vs Total Fatalities",type="h")

plot(table.crash.one.table.xts$totalaboard["2000/2020"], 
     main="Plane Crash Total Aboard vs Total Fatalities",type="b")
points(table.crash.one.table.xts$totalfatalities["2000/2020"],col="red")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Total killed on the ground



number3<-NULL
for(i in 1:length(fatalities)){
  sub <- gsub("(","-",fatalities[[i]][2], fixed = TRUE)
  sub <- gsub(")","-",sub, fixed = TRUE)
  number3<-c(number3,strsplit(sub,"-")[[1]][2])
}
number3
table.crash.one.table[,7]<-as.numeric(number3)

colnames(table.crash.one.table)<-c("date","location","type",
                          "fatalities","fatalities-passengers","fatalities-crew","totalkilledontheground")

table.crash.one.table.xts <- xts(table.crash.one.table[,5:7], order.by=as.Date(date,"%d %m %Y"))
colnames(table.crash.one.table.xts)<-c("totalfatalities","totalaboard","totalkilledontheground")


plot(table.crash.one.table.xts$totalkilledontheground, main="Plane Crash Total killed on the ground")
plot(table.crash.one.table.xts$totalkilledontheground, main="Plane Crash Total killed on the ground",type="b")
plot(table.crash.one.table.xts$totalkilledontheground, main="Plane Crash Total killed on the ground",type="c")

# Martin Stoppacher                                                             #
# office@martinstoppacher.com                                                   #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#################################################################################

 

Martin Stoppacher: