# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # A Simple Skript for planecrash data from: # http://www.planecrashinfo.com/database.htm # The aviation accident database includes: # All civil and commercial aviation accidents of scheduled and non-scheduled # passenger airliners worldwide, which resulted in a fatality # (including all U.S. Part 121 and Part 135 fatal accidents) # All cargo, positioning, ferry and test flight fatal accidents. # All military transport accidents with 10 or more fatalities. # All commercial and military helicopter accidents with greater than # 10 fatalities. # All civil and military airship accidents involving fatalities. # Aviation accidents involving the death of famous people. # Aviation accidents or incidents of noteworthy interest. # Database Format # Date: Date of accident, in the format - January 01, 2001 # Time: Local time, in 24 hr. format unless otherwise specified # Airline/Op: Airline or operator of the aircraft # Flight #: Flight number assigned by the aircraft operator # Route: Complete or partial route flown prior to the accident # AC Type: Aircraft type # Reg: ICAO registration of the aircraft # cn / ln: Construction or serial number / Line or fuselage number # Aboard: Total aboard (passengers / crew) # Fatalities: Total fatalities aboard (passengers / crew) # Ground: Total killed on the ground # Summary: Brief description of the accident and cause if known # to be done: histograms, t test?, types of chrashes, # country filter?, region filter, plane filter # text analysis, # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # packages library("XML") library("quantmod") # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # rm(list = ls(all = TRUE)) getwd() #system("ls") setwd("~/ownCloud/STA_Statistics/PlaneCrashData") # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # get the available years url <- "http://www.planecrashinfo.com/database.htm" tables <- readHTMLTable(url,header=FALSE) years <- NULL for(i in 1:length(tables[[2]][,1])){ for(e in 2:length(tables[[2]][1,])){ years<-c(years,levels(tables[[2]][i,e])[i]) } } years years<-na.omit(as.numeric(years)) years # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # get all data tables table.crash.all <- list() for(i in 1:length(years)){ url2 <- paste("http://www.planecrashinfo.com/", years[i],"/",years[i],".htm",sep="") assign(paste("table.crash.",years[i],sep=""),readHTMLTable(url2)) table.crash.all[[i]] <- get(paste("table.crash.",years[i],sep="")) print(paste("Year",years[i],"downloaded")) } #table.crash.all length(table.crash.all) save(table.crash.all,file="planecrashdata_list.RData") for(i in 1:length(years)){ data<-table.crash.all[[i]] save(data,file=paste("planecrashdata_",years[i],".RData",sep="")) } for(i in 1:length(years)){ data<-table.crash.all[[i]] write.csv(data,file=paste("planecrashdata_",years[i],".csv",sep="")) } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Plane Crashes per Year years.crashes <- data.frame() for(i in 1:length(years)){ years.crashes[i,1] <- years[i] years.crashes[i,2] <- length(as.data.frame(table.crash.all[[i]])[,1]) } years.crashes rownames(years.crashes)<-years.crashes[,1] par(mfrow=c(1,1)) plot(years.crashes,type="b",main="Plane Crashes per Year", xlab="Year",ylab="number of crashes") points(years.crashes,col="red") par(mfrow=c(1,1)) setwd("/Users/impac/ownCloud/STA_Statistics/PlaneCrashData/plots") jpeg(file="Plane Crashes per Year-points.jpeg", width = 1080, height = 1080, pointsize = 12, quality = 75) plot(years.crashes,type="b",main="Plane Crashes per Year", xlab="Year",ylab="number of crashes") points(years.crashes,col="red") dev.off() jpeg(file="Plane Crashes per Year-bars.jpeg", width = 1080, height = 1080, pointsize = 12, quality = 75) plot(years.crashes,type="h",main="Plane Crashes per Year", xlab="Year",ylab="number of crashes") points(years.crashes,col="red") dev.off() jpeg(file="Plane Crashes per Year-steps.jpeg", width = 1080, height = 1080, pointsize = 12, quality = 75) plot(years.crashes,type="s",main="Plane Crashes per Year", xlab="Year",ylab="number of crashes") points(years.crashes,col="red") dev.off() par(mfrow=c(2,2)) plot(years.crashes[1:30,],type="h",main=paste("Plane Crashes per Year", years.crashes[1,1],"-",years.crashes[30,1]),xlab="Year", ylab="number of crashes") points(years.crashes[1:30,],col="red") abline(h=mean(years.crashes[1:30,2]),col="blue") plot(years.crashes[31:60,],type="h",main=paste("Plane Crashes per Year",years.crashes[31,1],"-",years.crashes[60,1]),xlab="Year",ylab="number of crashes") points(years.crashes[31:60,],col="red") abline(h=mean(years.crashes[31:60,2]),col="blue") plot(years.crashes[61:80,],type="h",main=paste("Plane Crashes per Year",years.crashes[61,1],"-",years.crashes[80,1]),xlab="Year",ylab="number of crashes") points(years.crashes[61:80,],col="red") abline(h=mean(years.crashes[61:80,2]),col="blue") plot(years.crashes[81:length(years.crashes[,1]),],type="h",main=paste("Plane Crashes per Year",years.crashes[81,1],"-",years.crashes[length(years.crashes[,1]),1]),xlab="Year",ylab="number of crashes") points(years.crashes[81:length(years.crashes[,1]),],col="red") abline(h=mean(years.crashes[81:length(years.crashes[,1]),2]),col="blue") #jpeg(file="Plane Crashes per Year.jpeg", width = 1080, height = 1080, pointsize = 12, quality = 75) #par(mfrow=c(2,2)) #plot(years.crashes[1:30,],type="h",main=paste("Plane Crashes per Year",years.crashes[1,1],"-",years.crashes[30,1]),xlab="Year",ylab="number of crashes") #points(years.crashes[1:30,],col="red") #abline(h=mean(years.crashes[1:30,2]),col="blue") #plot(years.crashes[31:60,],type="h",main=paste("Plane Crashes per Year",years.crashes[31,1],"-",years.crashes[60,1]),xlab="Year",ylab="number of crashes") #points(years.crashes[31:60,],col="red") #abline(h=mean(years.crashes[31:60,2]),col="blue") #plot(years.crashes[61:80,],type="h",main=paste("Plane Crashes per Year",years.crashes[61,1],"-",years.crashes[80,1]),xlab="Year",ylab="number of crashes") #points(years.crashes[61:80,],col="red") #abline(h=mean(years.crashes[61:80,2]),col="blue") #plot(years.crashes[81:length(years.crashes[,1]),],type="h",main=paste("Plane Crashes per Year",years.crashes[81,1],"-",years.crashes[length(years.crashes[,1]),1]),xlab="Year",ylab="number of crashes") #points(years.crashes[81:length(years.crashes[,1]),],col="red") #abline(h=mean(years.crashes[81:length(years.crashes[,1]),2]),col="blue") #dev.off() # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Histograms of the number of crashes par(mfrow=c(1,1)) hist(years.crashes[,2], nclass=length(years.crashes[,2]), freq=TRUE, main='histogram of plane crashes', xlab="number of chrashes per year", col = "lightblue", border = "blue") hist(years.crashes[,2], nclass=length(years.crashes[,2]), freq=FALSE, main='histogram of plane crashes',xlab="number of chrashes per year", col = "lightblue", border = "blue") curve(dnorm(x, mean=mean(years.crashes[,2]),sd=sd(years.crashes[,2])), add=TRUE, col="red") par(mfrow=c(1,2)) qqnorm(years.crashes[,2]) qqline(years.crashes[,2], col = 2) plot(density(years.crashes[,2]),log='y', main='') curve(dnorm(x, mean=mean(years.crashes[,2]), sd=sd(years.crashes[,2])), log="y", add=TRUE, col="red") # different periods par(mfrow=c(1,2)) # 1980-1999 #hist(years.crashes[61:80,2], nclass=20, freq=TRUE, # main='',xlab="number of chrashes per year", col = "lightblue", # border = "blue") hist(years.crashes[61:80,2], nclass=20, freq=FALSE, main='1980-1999',xlab="number of chrashes per year", col = "lightblue", border = "blue") curve(dnorm(x, mean=mean(years.crashes[61:80,2]),sd=sd(years.crashes[61:80,2])), add=TRUE, col="red") # 2000-2020 #hist(years.crashes[81:length(years.crashes[,1]),2], nclass=length(years.crashes[81:length(years.crashes[,1]),2]), freq=TRUE, main='',xlab="number of chrashes per year", col = "lightblue", border = "blue") hist(years.crashes[81:length(years.crashes[,1]),2], nclass=length(years.crashes[81:length(years.crashes[,1]),2]), freq=FALSE, main='2000-2020',xlab="number of chrashes per year", col = "lightblue", border = "blue") curve(dnorm(x,mean=mean(years.crashes[81:length(years.crashes[,1]),2]),sd=sd(years.crashes[81:length(years.crashes[,1]),2])), add=TRUE, col="red") par(mfrow=c(2,2)) qqnorm(years.crashes[61:80,2]) qqline(years.crashes[61:80,2], col = 2) plot(density(years.crashes[61:80,2]),log='y', main='') curve(dnorm(x, mean=mean(years.crashes[61:80,2]),sd=sd(years.crashes[61:80,2])), log="y", add=TRUE, col="red") qqnorm(years.crashes[81:length(years.crashes[,1]),2]) qqline(years.crashes[81:length(years.crashes[,1]),2], col = 2) plot(density(years.crashes[81:length(years.crashes[,1]),2]),log='y', main='') curve(dnorm(x, mean=mean(years.crashes[81:length(years.crashes[,1]),2]),sd=sd(years.crashes[81:length(years.crashes[,1]),2])), log="y", add=TRUE, col="red") # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # boxplots par(mfrow=c(1,1)) boxplot(years.crashes[,2], main="number of plain crashes per year 1920 - 2020", xlab="", ylab="plain crashes per year") boxplot(years.crashes[,2], notch=TRUE, col=(c("gold","darkgreen")), main="number of plain crashes per year 1920 - 2020", ylab="plain crashes per year") years.crashes[1:30,3]<-rep("1920-1949",30) years.crashes[31:60,3]<-rep("1950-1979",30)278750 years.crashes[61:80,3]<-rep("1980-1999",20) years.crashes[81:length(years.crashes[,1]),3]<-rep("2000-2020",21) colnames(years.crashes)<-c("year","ncrashes","cat") boxplot(ncrashes~cat, data=years.crashes, notch=TRUE, col=(c("green","blue","blue","green")), main="number of plain crashes per year - groups", xlab="") # start from here # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # t test # independent 2-group t-test t.test(y~x) # where y is numeric and x is a binary factor # independent 2-group t-test t.test(y1,y2) # where y1 and y2 are numeric # paired t-test t.test(y1,y2,paired=TRUE) # where y1 & y2 are numeric # one sample t-test t.test(y,mu=3) # Ho: mu=3 t.test(years.crashes[61:80,2],years.crashes[81:length(years.crashes[,1]),2]) # Welch Two Sample t-test # # data: years.crashes[61:80, 2] and years.crashes[81:length(years.crashes[, 1]), 2] # t = 5.4142, df = 31.453, p-value = 6.268e-06 # alternative hypothesis: true difference in means is not equal to 0 # 95 percent confidence interval: # 14.70178 32.45536 # sample estimates: # mean of x mean of y # 59.15000 35.57143 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # rate of difference par(mfrow=c(3,1)) rate<-na.omit(ROC(years.crashes[,2])) plot(rate,type="l") plot(cumsum(rate),type="l") hist(rate, nclass=20) rate<-na.omit(ROC(years.crashes[61:80,2])) plot(rate,type="l") plot(cumsum(rate),type="l") hist(rate, nclass=20) rate<-na.omit(ROC(years.crashes[81:length(years.crashes[,1]),2])) plot(rate,type="l") plot(cumsum(rate),type="l") hist(rate, nclass=20) # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # One big Table table.crash.one.table <- data.frame(table.crash.all[[1]],stringsAsFactors = TRUE) colnames(table.crash.one.table)<-c("date","location","type","fatalities") for(i in 2:length(years)){ data <- data.frame(table.crash.all[[i]],stringsAsFactors = TRUE) colnames(data)<-c("date","location","type","fatalities") table.crash.one.table <- rbind(table.crash.one.table,data) } table.crash.one.table # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # create a time series object date <- gsub("Jan","01",table.crash.one.table[,1]) date <- gsub("Feb","02",date) date <- gsub("Mar","03",date) date <- gsub("Apr","04",date) date <- gsub("May","05",date) date <- gsub("Jun","06",date) date <- gsub("Jul","07",date) date <- gsub("Aug","08",date) date <- gsub("Sep","09",date) date <- gsub("Oct","10",date) date <- gsub("Nov","11",date) date <- gsub("Dec","12",date) gsub("/","-",table.crash.one.table[,4]) fatalities <- strsplit(gsub("/","-",table.crash.one.table[,4]),'-') number<-NULL for(i in 1:length(fatalities)){number<-c(number,fatalities[[i]][1])} #as.numeric(number) table.crash.one.table[,5]<-as.numeric(number) #strsplit(gsub("(","-",fatalities[[i]][2], fixed = TRUE),"-")[1] number2<-NULL for(i in 1:length(fatalities)){number2<-c(number2, strsplit(gsub("(","-",fatalities[[i]][2], fixed = TRUE),"-")[[1]][1])} number2 table.crash.one.table[,6]<-as.numeric(number2) colnames(table.crash.one.table)<-c("date","location","type", "fatalities","fatalities-passengers","fatalities-crew") #table.crash.one.table.xts <- xts(table.crash.one.table[,2:6], order.by=as.Date(date,"%d %m %Y")) #colnames(table.crash.one.table.xts)<-c("location","type","fatalities", #"passengerfatalities","crewfatalities") table.crash.one.table.xts <- xts(table.crash.one.table[,5:6], order.by=as.Date(date,"%d %m %Y")) colnames(table.crash.one.table.xts)<-c("totalfatalities","totalaboard") table.crash.one.table.xts$totalfatalities table.crash.one.table.xts$totalaboard plot(table.crash.one.table.xts$totalfatalities, main="Plane Crash Total Fatalities") plot(table.crash.one.table.xts$totalaboard, main="Plane Crash Total Aboard") plot(table.crash.one.table.xts$totalaboard, main="Plane Crash Total Aboard vs Total Fatalities") lines(table.crash.one.table.xts$totalfatalities,col="red") plot(table.crash.one.table.xts$totalaboard["2000/2020"], main="Plane Crash Total Aboard vs Total Fatalities",type="c") plot(table.crash.one.table.xts$totalaboard["2000/2020"], main="Plane Crash Total Aboard vs Total Fatalities",type="o") plot(table.crash.one.table.xts$totalaboard["2000/2020"], main="Plane Crash Total Aboard vs Total Fatalities",type="h") plot(table.crash.one.table.xts$totalaboard["2000/2020"], main="Plane Crash Total Aboard vs Total Fatalities",type="b") points(table.crash.one.table.xts$totalfatalities["2000/2020"],col="red") # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Total killed on the ground number3<-NULL for(i in 1:length(fatalities)){ sub <- gsub("(","-",fatalities[[i]][2], fixed = TRUE) sub <- gsub(")","-",sub, fixed = TRUE) number3<-c(number3,strsplit(sub,"-")[[1]][2]) } number3 table.crash.one.table[,7]<-as.numeric(number3) colnames(table.crash.one.table)<-c("date","location","type", "fatalities","fatalities-passengers","fatalities-crew","totalkilledontheground") table.crash.one.table.xts <- xts(table.crash.one.table[,5:7], order.by=as.Date(date,"%d %m %Y")) colnames(table.crash.one.table.xts)<-c("totalfatalities","totalaboard","totalkilledontheground") plot(table.crash.one.table.xts$totalkilledontheground, main="Plane Crash Total killed on the ground") plot(table.crash.one.table.xts$totalkilledontheground, main="Plane Crash Total killed on the ground",type="b") plot(table.crash.one.table.xts$totalkilledontheground, main="Plane Crash Total killed on the ground",type="c") # Martin Stoppacher # # office@martinstoppacher.com # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # #################################################################################