From 0ba375a2bddabc25295762e9f7be8c2f4a90d204 Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill <mako@atdot.cc>
Date: Wed, 29 Apr 2015 14:44:04 -0700
Subject: [PATCH] improvements based on a run-through in COM597G

- Added "encoding='utf-8'" to all calls to open(), which fixes bugs on
  Windows.

- Changed output files from CSV to TSV to deal with titles and such
  that contain commas

- Fixed "True"/"False" typo (thinko?) in hpwp-minor.py program.
---
 .gitignore            |  2 +-
 build_hpwp_dataset.py |  6 +++---
 hpwp-minor.py         |  6 +++---
 hpwp-trend.py         | 12 ++++++------
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore
index 72699f7..3c8ddf3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1 @@
-/*.csv
+/*.tsv
diff --git a/build_hpwp_dataset.py b/build_hpwp_dataset.py
index 7cb5816..4ad96c9 100644
--- a/build_hpwp_dataset.py
+++ b/build_hpwp_dataset.py
@@ -90,8 +90,8 @@ articles_json = r.json()
 articles = articles_json["*"][0]["a"]["*"]
 
 # open a filie to write all the output
-output = open("hp_wiki.csv", "w")
-output.write(",".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]) + "\n")
+output = open("hp_wiki.tsv", "w", encoding="utf-8")
+output.write("\t".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]) + "\n")
 
 # for every article
 for article in articles:
@@ -102,7 +102,7 @@ for article in articles:
     # get the list of revisions from our function and then interating through it printinig it out
     revisions = get_article_revisions(title)
     for rev in revisions:
-        output.write(",".join(['"' + rev["title"] + '"', '"' + rev["user"] + '"',
+        output.write("\t".join(['"' + rev["title"] + '"', '"' + rev["user"] + '"',
                                rev["timestamp"], str(rev["size"]), str(rev["anon"]),
                                str(rev["minor"]), str(rev["revid"])]) + "\n")
 
diff --git a/hpwp-minor.py b/hpwp-minor.py
index 4e173f2..c327015 100644
--- a/hpwp-minor.py
+++ b/hpwp-minor.py
@@ -1,12 +1,12 @@
 from csv import DictReader
 
-input_file = open("hp_wiki.csv", 'r')
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
 
 num_edits = 0
 num_anon = 0
-for row in DictReader(input_file):
+for row in DictReader(input_file, delimiter="\t"):
     num_edits = num_edits + 1
-    if row["anon"] == "False":
+    if row["anon"] == "True":
         num_anon = num_anon + 1
 
 prop_anon = num_anon / num_edits
diff --git a/hpwp-trend.py b/hpwp-trend.py
index 939406e..aaa2b70 100644
--- a/hpwp-trend.py
+++ b/hpwp-trend.py
@@ -1,10 +1,10 @@
 from csv import DictReader
 
 # read in the input file and count by day
-input_file = open("hp_wiki.csv", 'r')
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
 
 edits_by_day = {}
-for row in DictReader(input_file):
+for row in DictReader(input_file, delimiter="\t"):
     day_string = row['timestamp'][0:10]
 
     if day_string in edits_by_day:
@@ -15,13 +15,13 @@ for row in DictReader(input_file):
 input_file.close()
 
 # output the counts by day
-output_file = open("hp_edits_by_day.csv", "w")
+output_file = open("hp_edits_by_day.tsv", "w", encoding='utf-8')
 
 # write a header
-output_file.write("date,edits\n")
+output_file.write("date\tedits\n")
 
 # iterate through every day and print out data into the file
-for day_string in edits_by_day:
-    output_file.write(",".join([day_string, str(edits_by_day[day_string])]) + "\n")
+for day_string in edits_by_day.keys():
+    output_file.write("\t".join([day_string, str(edits_by_day[day_string])]) + "\n")
 
 output_file.close()
-- 
2.39.5