From: Benjamin Mako Hill
Date: Sat, 9 May 2015 15:13:28 +0000 (-0700)
Subject: Merge branch 'traffic-timeseries' of github.com:arokem/matplotlib-cdsw
X-Git-Url: https://projects.mako.cc/source/matplotlib-cdsw/commitdiff_plain/6e94163bfc436e165e143cbc6259054537f554a9?hp=bd1a50d7dc3b3026090730b9853e64aa31cb9463

Merge branch 'traffic-timeseries' of github.com:arokem/matplotlib-cdsw
---

Review notes:
 * The line structure of this patch was rebuilt from whitespace-collapsed
   text; every hunk below was checked against the line counts in its
   hunk header. Exact in-line spacing could not be recovered, so the
   Python bodies assume 4-space indentation (TODO: confirm against the
   original commit if byte-exactness matters).
 * load_hp_data.py was amended during review: the data file is now read
   inside a `with` block so the handle is closed after parsing. The hunk
   grew from 31 to 32 lines, and the blob id on its `index` line no
   longer matches the amended content.

diff --git a/001-hello-plot.py b/001-hello-plot.py
new file mode 100644
index 0000000..882c22e
--- /dev/null
+++ b/001-hello-plot.py
@@ -0,0 +1,13 @@
+"""
+
+hello_plot.py
+
+A first plot with matplotlib
+
+"""
+
+import matplotlib.pyplot as plt
+figure, axis = plt.subplots(1)
+axis.plot([1,2,3], [2,4,8])
+plt.show()
+
diff --git a/002-subplots.py b/002-subplots.py
new file mode 100644
index 0000000..0c6749b
--- /dev/null
+++ b/002-subplots.py
@@ -0,0 +1,36 @@
+"""
+
+Make a slightly more elaborate plot with subplots
+
+Save the figure in the end
+
+"""
+import matplotlib.pyplot as plt
+import math
+
+# Make some data to plot
+x = []
+y1 = []
+y2 = []
+
+for i in range(100):
+    x.append(i)
+    y1.append(math.sin(i * (2 * math.pi / 100)))
+    y2.append(math.cos(i * (2 * math.pi/ 100)))
+
+# First, create an empty figure with 2 subplots
+# - The function plt.subplots returns an object for the figure and for each axes
+# - There are multiple ways to accomplish this same goal, but this is probably the
+# simplest - notice that each subplot is associated with one of the axes objects.
+fig, (ax1, ax2) = plt.subplots(2)
+
+# Next, put one line on the first axis and both lines on the second axis
+# - On the second axes, add a legend to distinguish the two lines
+ax1.plot(x, y1)
+
+ax2.plot(x, y1, label='sin') # The labels are what appear in the legend
+ax2.plot(x, y2, label='cos')
+ax2.legend()
+
+# Finally, save the figure as a png file
+fig.savefig('myfig.png')
\ No newline at end of file
diff --git a/003-plot-timeseries.py b/003-plot-timeseries.py
new file mode 100644
index 0000000..f0fe255
--- /dev/null
+++ b/003-plot-timeseries.py
@@ -0,0 +1,45 @@
+"""
+003-plot-timeseries.py
+
+Plot data from the Harry Potter data-set as a time-series
+
+"""
+
+
+import matplotlib.pyplot as plt
+import load_hp_data as hp
+
+# We can play with styles:
+#plt.style.use('bmh')
+plt.style.use('ggplot')
+# To see available styles, type:
+#plt.style.available
+
+fig, ax = plt.subplots(1)
+ax.plot(hp.columns['timestamp'], hp.columns['size'])
+ax.set_xlabel('Time')
+ax.set_ylabel('Size of the edit')
+
+plt.show()
+
+
+# Challenge: Is edit size related to how long it's been since the last edit?
+# => Plot the relationship between edit size and the time since the last edit:
+
+## Hint 1: the number of seconds between two edits is:
+
+#delta_time1 = (hp.columns['timestamp'][1] - hp.columns['timestamp'][0]).total_seconds()
+
+## Hint 2:
+
+# You can give `plt.plot` more arguments to control the shape/size/color
+# of the markers used. For example, try:
+
+# ax.plot([1,2,3], [2,4,8], '.')
+# ax.plot([1,2,3], [2,4,8], 'r.')
+
+# And see online documentation here:
+# http://matplotlib.org/api/pyplot_summary.html
+# http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot
+
+
diff --git a/004-plot-histogram.py b/004-plot-histogram.py
new file mode 100644
index 0000000..b938fea
--- /dev/null
+++ b/004-plot-histogram.py
@@ -0,0 +1,24 @@
+"""
+004-plot-histogram.py
+
+Plot a histogram of edit sizes
+
+"""
+
+import matplotlib.pyplot as plt
+import load_hp_data as hp
+
+plt.style.use('ggplot')
+
+fig, ax = plt.subplots(1)
+ax.hist(hp.columns['size'], bins=1000)
+ax.set_xlabel('Size of the edit')
+ax.set_ylabel('')
+ax.set_title('Edit size distribution')
+
+# Maybe don't really need that axis to be so long:
+# ax.set_xlim([0, 200000])
+plt.show()
+
+## Challenge : A 'mega-user' is a user with more than 1000 edits.
+# Plot a bar chart with the maximal edit size for each one of the mega-users
diff --git a/README b/README
index 4625944..34835e1 100644
--- a/README
+++ b/README
@@ -1,4 +1,3 @@
 Using this dataset requires that you first download the following dataset:
 
-http://communitydata.cc/~mako/wikipedia_bios.csv
-
+http://communitydata.cc/~mako/hp_wiki.tsv
diff --git a/load_hp_data.py b/load_hp_data.py
new file mode 100644
index 0000000..55d497c
--- /dev/null
+++ b/load_hp_data.py
@@ -0,0 +1,32 @@
+""" load_hp_data.py
+
+A module for loading data from the Harry Potter wikipedia data set
+
+"""
+import csv
+from datetime import datetime
+
+columns = {}
+rows = []
+
+# Read inside a `with` block so the file is closed as soon as parsing is done:
+with open('hp_wiki.tsv', 'r') as f:
+    reader = csv.DictReader(f, delimiter='\t')
+
+    for fieldname in reader.fieldnames:
+        columns[fieldname] = []
+
+    for row in reader:
+        # Convert timestamp from a string to a date:
+        row['timestamp'] = datetime.strptime(row['timestamp'], '%Y-%m-%d %H:%M:%S')
+        # Convert size from a string to an integer:
+        row['size'] = int(row['size'])
+        rows.append(row)
+
+# Sort these things, so that they give you nice ordered time-series
+sort_rows = sorted(rows, key=lambda row: row['timestamp'], reverse=False)
+
+rows = sort_rows
+for row in sort_rows:
+    for fieldname, value in row.items():
+        columns[fieldname].append(value)