From 7bd9bbbbb11d7585b76e63e354c0c779d5a8c547 Mon Sep 17 00:00:00 2001 From: arokem Date: Thu, 7 May 2015 17:16:55 -0700 Subject: [PATCH] Add more examples from the HP data-set. --- 001-hello-plot.py | 2 +- 003-plot-timeseries.py | 38 ++++++++++++++++++++++++++++++++++++++ 004-plot-histogram.py | 22 ++++++++++++++++++++++ load_hp_data.py | 18 +++++++++++++----- 4 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 003-plot-timeseries.py create mode 100644 004-plot-histogram.py diff --git a/001-hello-plot.py b/001-hello-plot.py index 227b56c..0690470 100644 --- a/001-hello-plot.py +++ b/001-hello-plot.py @@ -8,6 +8,6 @@ A first plot with matplotlib import matplotlib.pyplot as plt figure, axis = plt.subplots(1) -plt.plot([1,2,3], [1,2,3]) +plt.plot([1,2,3], [2,4,8]) plt.show() diff --git a/003-plot-timeseries.py b/003-plot-timeseries.py new file mode 100644 index 0000000..ec98776 --- /dev/null +++ b/003-plot-timeseries.py @@ -0,0 +1,38 @@ +""" +003-plot-timeseries.py + +Plot data from the Harry Potter data-set as a time-series + +""" + + +import matplotlib.pyplot as plt +import load_hp_data as hp + +# We can play with styles: +#plt.style.use('bmh') +plt.style.use('ggplot') +# To see available styles, type: +#plt.style.available + +fig, ax = plt.subplots(1) +ax.plot(hp.columns['timestamp'], hp.columns['size']) +ax.set_xlabel('Time') +ax.set_ylabel('Size of the edit') + +plt.show() + + +# Challenge: plot the relationship between edit size and time between edits. Use these hints: + +## Hint 1: + +#delta_time1 = hp.columns['timestamp'][1] - hp.columns['timestamp'][0] + +## Hint 2: + +# You can give `plt.plot` more arguments to control the shape/size/color +# of the markers used. 
For example, try: + +# ax.plot([1,2,3], [2,4,8], '.') +# ax.plot([1,2,3], [2,4,8], 'r.') diff --git a/004-plot-histogram.py b/004-plot-histogram.py new file mode 100644 index 0000000..785a068 --- /dev/null +++ b/004-plot-histogram.py @@ -0,0 +1,22 @@ +""" +004-plot-histogram.py + +Plot a histogram of edit sizes + +""" + +import matplotlib.pyplot as plt +import load_hp_data as hp + +plt.style.use('ggplot') + +fig, ax = plt.subplots(1) +ax.hist(hp.columns['size'], bins=1000) +ax.set_xlabel('Size of the edit') +ax.set_ylabel('') +ax.set_title('Edit size distribution') + +# Maybe don't really need that axis to be so long: +# ax.set_xlim([0, 200000]) + +plt.show() diff --git a/load_hp_data.py b/load_hp_data.py index 85cf142..0fe78c5 100644 --- a/load_hp_data.py +++ b/load_hp_data.py @@ -16,8 +16,16 @@ for fieldname in reader.fieldnames: rows = [] for row in reader: - # Convert timestamp from a string to a date: - row['timestamp'] = datetime.strptime(row['timestamp'], '%Y-%m-%d %H:%M:%S') - rows.append(row) - for fieldname, value in row.items(): - columns[fieldname].append(value) \ No newline at end of file + # Convert timestamp from a string to a date: + row['timestamp'] = datetime.strptime(row['timestamp'], '%Y-%m-%d %H:%M:%S') + # Convert size from a string to an integer: + row['size'] = int(row['size']) + rows.append(row) + +# Sort these things, so that they give you nice time-series +sort_rows = sorted(rows, key=lambda row: row['timestamp'], reverse=False) + +rows = sort_rows +for row in sort_rows: + for fieldname, value in row.items(): + columns[fieldname].append(value) -- 2.30.2