#!/usr/bin/python
# -*- coding: utf8 -*-
import numpy as np
import pylab as pl
import urllib
import StringIO
import os
import re
from lxml import etree
# Base URL of the result list queried by this script. The query string selects
# event "MAL" and asks for 100 results per request; further parameters
# (gender filter, page number) are appended by the code below.
website = (
    'http://results.scc-events.com/2014/'
    '?event=MAL&pid=list&num_results=100'
)
def fetch_times_from_page(pagename):
    """Download one result page and return the time strings found in it.

    The document at *pagename* is fetched and parsed as HTML.  For every
    ``tr`` element that has exactly 10 child elements, the text of the
    child at index 8 is kept when it consists of exactly ``HH:MM:SS``
    (three two-digit groups separated by colons, nothing else).

    Args:
        pagename: URL handed to ``urllib.urlopen``.

    Returns:
        A list of the matching time strings in the order the rows are
        iterated; empty when no row on the page qualifies.
    """
    time_pattern = re.compile(r'\A(\d{2}):(\d{2}):(\d{2})\Z')

    page = urllib.urlopen(pagename)
    try:
        page_text = page.read()
    finally:
        # Release the connection even if reading fails.
        page.close()

    page_xml = etree.parse(StringIO.StringIO(page_text), etree.HTMLParser())
    times_txt = []
    for element in page_xml.iter('tr'):
        if len(element) != 10:
            continue
        # NOTE(review): index 8 is presumably the finish-time column of the
        # result table -- confirm against the page layout.
        t_text = element[8].text
        if t_text is None:
            # A cell without leading text (e.g. only child markup) has
            # .text == None, which re.match would reject with a TypeError.
            continue
        if time_pattern.match(t_text) is not None:
            times_txt.append(t_text)
    return times_txt
def fetch_list(pagename):
    """Collect the time strings from all result pages of one listing.

    Pages are requested as ``pagename + '&page=' + str(i)`` for
    i = 0, 1, 2, ... via ``fetch_times_from_page``; the loop stops at the
    first page that yields no time strings.  Progress is printed to stdout.

    Args:
        pagename: Listing URL without the ``&page=`` parameter.

    Returns:
        A single list with the time strings of all fetched pages, in
        page order.
    """
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement and also parses as a Python 3 call.
    print('fetching data from website')
    times_txt = []
    i = 0
    # NOTE(review): termination relies on the server returning a page
    # without matching rows once i runs past the last page -- confirm.
    while True:
        print('fetching set %d' % i)
        new_times = fetch_times_from_page(pagename + '&page=' + str(i))
        if not new_times:
            print('no more new entries on page %d' % i)
            break
        # Extend in place instead of rebuilding the whole list per page.
        times_txt.extend(new_times)
        i += 1
    return times_txt
# Running times in hours, keyed by the gender code used in the query.
times = {}
# Compiled once; every cached line is checked against the same HH:MM:SS form.
time_pattern = re.compile(r'\A(\d{2}):(\d{2}):(\d{2})\Z')
for gender in ['M', 'W']:
    fname = 'marathon_' + gender + '.txt'
    if not os.path.isfile(fname):
        # have no local file -> need to download data
        pagename = website + '&search[se' + 'x]=' + gender
        times_list = fetch_list(pagename)
        # Cache the raw time strings, one per line, so later runs skip the
        # download.
        with open(fname, 'w') as f:
            f.write('\n'.join(times_list))
    times[gender] = []
    with open(fname, 'r') as f:
        for line in f:
            match = time_pattern.match(line.strip())
            if match is None:
                # Skip blank or malformed lines.
                continue
            hours, minutes, seconds = match.groups()
            # Convert HH:MM:SS to fractional hours.
            t = float(hours) + float(minutes) / 60. + float(seconds) / 3600.
            times[gender].append(t)
# Histogram bin width in hours; edges run from 2 in steps of w up to 7+w
# (end value exclusive, as np.arange defines it).
w = 0.2
bin_edges = np.arange(2, 7+w, w)

# Normalised histograms (density=True) of both groups over the same bins.
histM, edg = np.histogram(times['M'], bin_edges, density=True)
histW, edg = np.histogram(times['W'], bin_edges, density=True)

# x positions of the data points: the centre of each bin.
t_arr = edg[:-1] + w/2

pl.figure(figsize=(50/9., 35/9.))
for hist, fmt, lbl in ((histM, 'bo-', 'm'), (histW, 'ro-', 'w')):
    pl.plot(t_arr, hist, fmt, label=lbl)

# Axes decoration.
pl.xlim(2, 7)
pl.xlabel('t [h]')
pl.ylabel('P')
pl.title('41. Berlin Marathon running times')
pl.grid(True)
pl.legend(loc='upper right')

pl.tight_layout()
pl.savefig('running_times_41Berlin_marathon.svg')