improvements based on a run-through in COM597G
author Benjamin Mako Hill <mako@atdot.cc>
Wed, 29 Apr 2015 21:44:04 +0000 (14:44 -0700)
committer Benjamin Mako Hill <mako@atdot.cc>
Wed, 29 Apr 2015 21:50:54 +0000 (14:50 -0700)
- Added "encoding='utf-8'" to all calls to open(), which fixes bugs on
  Windows.

- Changed output files from CSV to TSV to deal with titles and other
  fields that contain commas.

- Fixed "True"/"False" typo (thinko?) in hpwp-minor.py program.

.gitignore
build_hpwp_dataset.py
hpwp-minor.py
hpwp-trend.py

index 72699f7d10b0befb8cb4d9936f747b4ca8a80c07..3c8ddf31d18b93464f10ab2cbd42dd65995dd1c5 100644 (file)
@@ -1 +1 @@
-/*.csv
+/*.tsv
index 7cb58168d1a2b3fd6c2b2338f7edf147394a2557..4ad96c9cb5ac0839502d93d4f03a1a65e16ec20e 100644 (file)
@@ -90,8 +90,8 @@ articles_json = r.json()
 articles = articles_json["*"][0]["a"]["*"]
 
 # open a filie to write all the output
-output = open("hp_wiki.csv", "w")
-output.write(",".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]) + "\n")
+output = open("hp_wiki.tsv", "w", encoding="utf-8")
+output.write("\t".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]) + "\n")
 
 # for every article
 for article in articles:
@@ -102,7 +102,7 @@ for article in articles:
     # get the list of revisions from our function and then interating through it printinig it out
     revisions = get_article_revisions(title)
     for rev in revisions:
-        output.write(",".join(['"' + rev["title"] + '"', '"' + rev["user"] + '"',
+        output.write("\t".join(['"' + rev["title"] + '"', '"' + rev["user"] + '"',
                                rev["timestamp"], str(rev["size"]), str(rev["anon"]),
                                str(rev["minor"]), str(rev["revid"])]) + "\n")
 
index 4e173f2daa4db6aaeffebed47e9e3ff99181ee52..c327015f78f0370a23c30da037deed6a7ce476fc 100644 (file)
@@ -1,12 +1,12 @@
 from csv import DictReader
 
-input_file = open("hp_wiki.csv", 'r')
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
 
 num_edits = 0
 num_anon = 0
-for row in DictReader(input_file):
+for row in DictReader(input_file, delimiter="\t"):
     num_edits = num_edits + 1
-    if row["anon"] == "False":
+    if row["anon"] == "True":
         num_anon = num_anon + 1
 
 prop_anon = num_anon / num_edits
index 939406e5a2b902d586b736a2736e6b39cac3cd28..aaa2b70a0ef3a088fcad40d174571ccf806ff154 100644 (file)
@@ -1,10 +1,10 @@
 from csv import DictReader
 
 # read in the input file and count by day
-input_file = open("hp_wiki.csv", 'r')
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
 
 edits_by_day = {}
-for row in DictReader(input_file):
+for row in DictReader(input_file, delimiter="\t"):
     day_string = row['timestamp'][0:10]
 
     if day_string in edits_by_day:
@@ -15,13 +15,13 @@ for row in DictReader(input_file):
 input_file.close()
 
 # output the counts by day
-output_file = open("hp_edits_by_day.csv", "w")
+output_file = open("hp_edits_by_day.tsv", "w", encoding='utf-8')
 
 # write a header
-output_file.write("date,edits\n")
+output_file.write("date\tedits\n")
 
 # iterate through every day and print out data into the file
-for day_string in edits_by_day:
-    output_file.write(",".join([day_string, str(edits_by_day[day_string])]) + "\n")
+for day_string in edits_by_day.keys():
+    output_file.write("\t".join([day_string, str(edits_by_day[day_string])]) + "\n")
 
 output_file.close()

Benjamin Mako Hill || Want to submit a patch?