User:Ixfd64/revision sizes
This is an R script for visualizing the byte sizes of Wikipedia pages over time. I originally wrote this for an assignment for my Stat 133 class, but I have decided to release it to the public. Other Wikipedians are welcome to improve it as they see fit.
Source code
# current version: 1.2.0
# last update: November 19, 2009
# Load the Tcl/Tk bindings. library() stops with an error when the package is
# unavailable, whereas require() would only return FALSE and let the script
# fail later at the first Tk call.
library(tcltk)

# Main window of the GUI.
base <- tktoplevel()
tkwm.title(base, 'Wikipedia revision sizes')

# Frame that holds the two input rows.
nfrm <- tkframe(base)

# Tcl variables bound to the entry fields below; getpage() reads them back
# when the user presses 'Run'.
revisions.tcl <- tclVar('')
page.tcl <- tclVar('')

# "Page" row: a label followed by a 25-character entry for the page title.
f1 <- tkframe(nfrm)
tkpack(tklabel(f1, text = 'Page'), side = 'left')
tkpack(tkentry(f1, width = 25, textvariable = page.tcl), side = 'left')

# "Revisions" row: a label followed by an 8-character entry for the number
# of revisions to request.
f2 <- tkframe(nfrm)
tkpack(tklabel(f2, text = 'Revisions'), side = 'left')
tkpack(tkentry(f2, width = 8, textvariable = revisions.tcl), side = 'left')

# Stack the two rows inside the main frame, then show the frame.
tkpack(f1, side = 'top')
tkpack(f2, side = 'top')
tkpack(nfrm)

# Language code of the Wikipedia to query; it becomes the host name prefix
# of the history URL built in getpage().
# complete list available at http://meta.wikimedia.org/wiki/List_of_Wikipedias
lang <- 'en'
# Extract the revision sizes (in bytes) from the lines of a history page.
# Returns a numeric vector in the order the sizes appear in `history`;
# numeric(0) when no size span is found. A size shown as "(empty)" is 0.
.revision.sizes <- function(history) {
  byte.pat <- '<span class="history-size">(\\([0-9bytesmtp ,]+\\))</span>'
  # regmatches() returns every match on every line, so a line carrying more
  # than one size span is handled as well.
  spans <- unlist(regmatches(history,
                             gregexpr(byte.pat, history, ignore.case = TRUE)))
  if (length(spans) == 0) {
    return(numeric(0))
  }
  # Keep only the parenthesised label, e.g. "(1,234 bytes)" or "(empty)".
  labels <- sub(byte.pat, '\\1', spans, ignore.case = TRUE)
  labels <- gsub('[(), ]', '', labels)
  labels[tolower(labels) == 'empty'] <- '0'
  # Drop the unit text; whatever is left is the byte count.
  as.numeric(gsub('[^0-9]', '', labels))
}

# Fetch the revision history of the requested page and plot revision sizes.
#
# Callback for the 'Run' button. Reads the page title and the number of
# revisions from the Tcl variables page.tcl and revisions.tcl, downloads the
# history page from the Wikipedia selected by `lang`, extracts the size shown
# for each revision and draws a line plot of the sizes.
#
# Arguments:
#   ...  accepted and ignored.
# Returns:
#   NULL, invisibly. Problems (download failure, missing page, no sizes
#   found) are reported in a message box instead of raising an error.
getpage <- function(...) {
  revisions <- as.numeric(tclvalue(revisions.tcl))
  page <- as.character(tclvalue(page.tcl))

  # Spaces become underscores; other reserved characters such as '+' are
  # percent-encoded. URLencode() leaves input that already contains
  # percent-escapes untouched, so a title typed as '1%2B1' keeps working.
  page.loc <- utils::URLencode(gsub(' ', '_', page, fixed = TRUE),
                               reserved = TRUE)

  # format() avoids scientific notation (as.character(1e5) is "1e+05").
  historylink <- paste0('http://', lang, '.wikipedia.org/w/index.php?title=',
                        page.loc, '&limit=',
                        format(revisions, scientific = FALSE),
                        '&action=history')

  # A failed download is reported to the user rather than surfacing as an
  # error inside the Tk callback.
  history <- tryCatch(
    readLines(historylink, warn = FALSE),
    error = function(e) {
      tkmessageBox(title = 'Error',
                   message = paste('Could not download the revision history:',
                                   conditionMessage(e)),
                   type = 'ok')
      NULL
    }
  )
  if (is.null(history)) {
    return(invisible(NULL))
  }

  if (any(grepl('no revision history', history, fixed = TRUE))) {
    tkmessageBox(title = 'Error', message = 'This page does not exist!',
                 type = 'ok')
    return(invisible(NULL))
  }

  revisiondata <- .revision.sizes(history)
  if (length(revisiondata) == 0) {
    # Nothing to plot; the original code failed here on an empty vector.
    tkmessageBox(title = 'Error',
                 message = 'No revision sizes were found in the page history.',
                 type = 'ok')
    return(invisible(NULL))
  }

  # Plot in reverse page order. NOTE(review): this assumes the history page
  # lists the newest revision first, so index 1 is the oldest one fetched.
  plot(rev(revisiondata), xlab = 'Revision index',
       ylab = paste0('Revision size of "', page, '" (bytes)'), type = 'l')
  invisible(NULL)
}
# Quit the program by destroying the main window.
# Callback for the 'Quit' button; any arguments are accepted and ignored.
destroy <- function(...) {
  tkdestroy(base)
}
# Bottom frame that holds the two action buttons.
bfrm <- tkframe(base)

# 'Run' invokes getpage(); it sits at the left edge of the frame.
run.button <- tkbutton(bfrm, text = 'Run', command = getpage)
tkpack(run.button, side = 'left')

# 'Quit' invokes destroy(); it sits at the right edge of the frame.
quit.button <- tkbutton(bfrm, text = 'Quit', command = destroy)
tkpack(quit.button, side = 'right')

# Attach the button frame to the bottom of the main window.
tkpack(bfrm, side = 'bottom')
Usage
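This script requires R with Tcl/Tk support (the tcltk package, which the script loads). The best way to use the script is to save it as a text file in your R working directory. For example, if you name the file wpgraph.txt, you can run the script by entering source('wpgraph.txt') in the R console. In the window that opens, enter a page title and the number of revisions to fetch, then click Run to plot the revision sizes.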
Limitations
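- Titles containing characters that have a special meaning in URLs may not be processed correctly when typed literally. If such a title fails to load, enter it in percent-encoded form; for example, "1+1" can be entered as 1%2B1.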