#Load the complete HTML file into memory
html <- readLines(url("https://en.wikipedia.org/wiki/Opinion_polling_for_the_New_Zealand_general_election,_2008"),encoding="UTF-8")
closeAllConnections()
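#Sanity check (added as a hedge): everything below hard-codes the fourth table
#and fixed column positions, which assumes the page markup at the time of
#writing (one tag per line); if the page has since changed, fail early rather
#than silently parse the wrong table
stopifnot(length(grep("<table",html)) >= 4)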
#The fourth table is the opinion poll data
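#grep returns the line numbers of every match, so pairing the fourth "<table"
#with the fourth "</table" assumes the tables are not nested in the page source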
tbl <- html[(grep("<table.*",html)[4]):(grep("</table.*",html)[4])]
#Now split it into the rows, based on the <tr> tag
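#The cumsum trick: each literal "<tr>" line increments the counter, so every
#line up to the next "<tr>" gets the same group id, e.g.
#cumsum(c("<tr>","a","<tr>","b")=="<tr>") gives 1 1 2 2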
tbl.rows <- split(tbl,cumsum(tbl=="<tr>"))
#Now extract the data
survey.dat <- lapply(tbl.rows,function(x) {
#Start by only considering where we have <td> tags
td.tags <- x[grep("<td",x)]
#Polling data appears in columns 3-6
dat <- td.tags[3:6]
#Now strip the markup and convert to numeric format
dat <- gsub("<td>|</td>","",dat)
dat <- gsub("%","",dat)
dat <- gsub("-","0",dat)
dat <- as.numeric(dat)
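#Any cell that still contains markup becomes NA here (with a coercion warning)
#rather than stopping the script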
#Getting the date strings is a little harder. The approach is to exploit the
#title="date" hyperlinks to generate a set of dates
date.str <- td.tags[2] #Dates are in the second column
date.str <- gsub("<sup.*</sup>","",date.str) #Throw out anything between superscript tags, as it's a reference to the source
titles <- gregexpr("(?U)title=\".*\"",date.str,perl=TRUE)[[1]] #Find the location of the title tags
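#gregexpr returns the start position of every match, with the lengths stored
#in the "match.length" attribute; (?U) turns on PCRE ungreedy matching, so .*
#stops at the first closing quote instead of the last one on the line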
#Now, extract the actual date strings
date.strings <- character(length(titles)) #rep(NULL,n) is just NULL; pre-allocate a character vector instead
for(i in 1:length(titles)) {
date.strings[i] <- substr(date.str,titles[i]+7,titles[i]+attr(titles,"match.length")[i]-2)
}
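#e.g. for the match title="20 July": the start position is the 't' of title=,
#so start+7 lands on the '2' and start+match.length-2 stops just short of the
#closing quote, leaving the bare date string "20 July"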
yr <- rev(date.strings)[1] #The last title attribute in the cell is the year itself
dates <- rep(as.POSIXct(Sys.time()),length(date.strings)-1) #Pre-allocate a vector that already has POSIXct class; the values are overwritten below
for(i in 1:(length(date.strings)-1)) {
dates[i] <- as.POSIXct(strptime(paste(date.strings[i],yr),"%B %d %Y"))
}
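#Note: strptime's %B matches full month names in the current locale, and the
#Wikipedia dates are in English; on a non-English locale you may need e.g.
#  Sys.setlocale("LC_TIME","C")
#before parsing (left as a comment here so the script's behaviour is unchanged)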
survey.time <- mean(dates) #Use the midpoint of the fieldwork period as the survey's nominal date
#And now return results
return(data.frame(Date=survey.time,t(dat)))
})
polls <- do.call(rbind,survey.dat) #Stack the per-poll one-row data frames into a single data frame
#Now make the graph!
polls <- subset(polls,!is.na(polls$Date))
polls$Date <- as.double(polls$Date) #loess() needs a numeric predictor; POSIXct converts to seconds since 1970-01-01
colnames(polls) <- c("Date","Helen Clark","Don Brash","John Key","Winston Peters")
parties <- colnames(polls)[-1]
cols <- c("red","purple","blue","black")
ticks <- ISOdate(c(2005,rep(2006,3),rep(2007,3),rep(2008,3)),c(9,rep(c(1,5,9),3)),1) #Axis ticks at Sep 2005, then Jan/May/Sep of 2006-2008
xlims <- range(as.double(c(ticks,ISOdate(2009,2,1)))) #Extend the x-range beyond the last poll so the estimate labels have room
png("NZ_opinion_polls_2005-2008 -PPM.png",width=778,height=487,pointsize=16)
par(mar=c(3,4,1,1))
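#First draw an empty plot (pch=NA) to establish the axes and limits; the
#points and smoothers are layered on afterwards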
matplot(polls$Date,polls[,parties],pch=NA,xlim=xlims,ylab="Party support (%)",xlab="",col=cols,xaxt="n",ylim=c(0,60))
abline(h=seq(0,95,by=5),col="lightgrey",lty=3)
abline(v=as.double(ticks),col="lightgrey",lty=3)
#Now add loess smoothers
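#span=0.33 means each local fit uses roughly a third of the polls: small
#enough to track medium-term movement, large enough not to chase outliers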
smoothed <- list()
for(i in 1:length(parties)) {
smoother <- loess(polls[,i+1] ~ polls[,"Date"],span=0.33)
smoothed[[i]] <- predict(smoother,se=TRUE)
# polygon(c(polls[,"Date"],rev(polls[,"Date"])),
# c(smoothed[[i]]$fit+smoothed[[i]]$se.fit*1.96,rev(smoothed[[i]]$fit-smoothed[[i]]$se.fit*1.96)),
# col=rgb(0.5,0.5,0.5,0.5),border=NA)
lines(polls[,"Date"],pmax(0,smoothed[[i]]$fit),col=cols[i],lwd=2) #Constraints it to be positive
}
matpoints(polls$Date,polls[,parties],pch=20,col=cols)
legend("topleft",legend=parties,col=cols,pch=20,bg="white",lwd=2)
axis(1,at=as.double(ticks),labels=format(ticks,format="%b\n%Y"),cex.axis=0.8)
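#Repeat the tick marks on the right-hand axis, but with blank labels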
axis(4,at=axTicks(4),labels=rep("",length(axTicks(4))))
#Add best estimates
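#Each label shows the latest smoothed estimate with a half-width of 1.96
#standard errors, i.e. an approximate 95% confidence interval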
for(i in 1:length(smoothed)) {
lbl <- sprintf("%4.0f%% ± %2.0f",abs(round(rev(smoothed[[i]]$fit)[1],0)),round(1.96*rev(smoothed[[i]]$se.fit)[1],0))
text(rev(polls$Date)[1],rev(smoothed[[i]]$fit)[1],labels=lbl,pos=4,col=cols[i])
}
dev.off()