Add more examples from the HP data-set.

author arokem <arokem@gmail.com>

Fri, 8 May 2015 00:16:55 +0000 (17:16 -0700)

committer Ariel Rokem <arokem@gmail.com>

Fri, 8 May 2015 00:16:55 +0000 (17:16 -0700)
author arokem <arokem@gmail.com>
Fri, 8 May 2015 00:16:55 +0000 (17:16 -0700)
committer Ariel Rokem <arokem@gmail.com>
Fri, 8 May 2015 00:16:55 +0000 (17:16 -0700)
diff --git a/001-hello-plot.py b/001-hello-plot.py

index 227b56cdd079ce3caa21f19c87bb8096b71759da..06904708e84e353b2a2366e5ba283685ed693375 100644 (file)
--- a/001-hello-plot.py
+++ b/001-hello-plot.py
@@ -8,6 +8,6 @@ A first plot with matplotlib
  
  import matplotlib.pyplot as plt 
  figure, axis = plt.subplots(1)
-plt.plot([1,2,3], [1,2,3])
+plt.plot([1,2,3], [2,4,8])
  plt.show()
  
diff --git a/003-plot-timeseries.py b/003-plot-timeseries.py

new file mode 100644 (file)

index 0000000..ec98776
--- /dev/null
+++ b/003-plot-timeseries.py
@@ -0,0 +1,38 @@
+""" 
+003-plot-timeseries.py 
+
+Plot data from the Harry Potter data-set as a time-series
+
+""" 
+
+
+import matplotlib.pyplot as plt 
+import load_hp_data as hp
+
+# We can play with styles:
+#plt.style.use('bmh')
+plt.style.use('ggplot') 
+# To see available styles, type: 
+#plt.style.available
+
+fig, ax = plt.subplots(1)
+ax.plot(hp.columns['timestamp'],  hp.columns['size'])
+ax.set_xlabel('Time')
+ax.set_ylabel('Size of the edit')
+
+plt.show()
+
+
+# Challenge: plot the relationship between edit size and the time since the previous edit.
+
+## Hint 1: 
+
+#delta_time1 = hp.columns['timestamp'][1] - hp.columns['timestamp'][0]
+
+## Hint 2: 
+
+# You can give `plt.plot` more arguments to control the shape/size/color 
+# of the markers used. For example, try: 
+
+# ax.plot([1,2,3], [2,4,8], '.')
+# ax.plot([1,2,3], [2,4,8], 'r.')
diff --git a/004-plot-histogram.py b/004-plot-histogram.py

new file mode 100644 (file)

index 0000000..785a068
--- /dev/null
+++ b/004-plot-histogram.py
@@ -0,0 +1,22 @@
+"""
+004-plot-histogram.py 
+
+Plot a histogram of edit sizes 
+
+"""
+
+import matplotlib.pyplot as plt 
+import load_hp_data as hp
+
+plt.style.use('ggplot')
+
+fig, ax = plt.subplots(1)
+ax.hist(hp.columns['size'], bins=1000)
+ax.set_xlabel('Size of the edit')
+ax.set_ylabel('')
+ax.set_title('Edit size distribution')
+
+# The x-axis probably doesn't need to extend this far; uncomment to limit it:
+# ax.set_xlim([0, 200000])
+
+plt.show()
diff --git a/load_hp_data.py b/load_hp_data.py

index 85cf142852ba96fd1fa2f937619e4ed30b838f9f..0fe78c55c1186d76bf1fd0f2033eb84274654bbe 100644 (file)
--- a/load_hp_data.py
+++ b/load_hp_data.py
@@ -16,8 +16,16 @@ for fieldname in reader.fieldnames:
  
  rows = []
  for row in reader:
-       # Convert timestamp from a string to a date:
-       row['timestamp'] = datetime.strptime(row['timestamp'], '%Y-%m-%d %H:%M:%S')
-       rows.append(row)
-       for fieldname, value in row.items():
-               columns[fieldname].append(value)
-\ No newline at end of file
+    # Convert timestamp from a string to a date:
+    row['timestamp'] = datetime.strptime(row['timestamp'], '%Y-%m-%d %H:%M:%S')
+    # Convert size from a string to an integer:
+    row['size'] = int(row['size'])
+    rows.append(row)
+
+# Sort the rows chronologically, so that the columns form a proper time series
+sort_rows = sorted(rows, key=lambda row: row['timestamp'], reverse=False)
+
+rows = sort_rows
+for row in sort_rows:
+    for fieldname, value in row.items():
+        columns[fieldname].append(value)
author	arokem <arokem@gmail.com>
	Fri, 8 May 2015 00:16:55 +0000 (17:16 -0700)
committer	Ariel Rokem <arokem@gmail.com>
	Fri, 8 May 2015 00:16:55 +0000 (17:16 -0700)
001-hello-plot.py		patch \| blob \| history
003-plot-timeseries.py	[new file with mode: 0644]	patch \| blob
004-plot-histogram.py	[new file with mode: 0644]	patch \| blob
load_hp_data.py		patch \| blob \| history