From 0ba375a2bddabc25295762e9f7be8c2f4a90d204 Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Wed, 29 Apr 2015 14:44:04 -0700 Subject: [PATCH] improvements based on a run-through in COM597G - Added "encoding='utf-8'" to all calls to open(), which fixes bugs on Windows. - Changed output files from CSV to TSV to deal with titles and such with commas - Fixed "True"/"False" typo (thinko?) in hpwp-minor.py program. --- .gitignore | 2 +- build_hpwp_dataset.py | 6 +++--- hpwp-minor.py | 6 +++--- hpwp-trend.py | 12 ++++++------ 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index 72699f7..3c8ddf3 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -/*.csv +/*.tsv diff --git a/build_hpwp_dataset.py b/build_hpwp_dataset.py index 7cb5816..4ad96c9 100644 --- a/build_hpwp_dataset.py +++ b/build_hpwp_dataset.py @@ -90,8 +90,8 @@ articles_json = r.json() articles = articles_json["*"][0]["a"]["*"] # open a filie to write all the output -output = open("hp_wiki.csv", "w") -output.write(",".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]) + "\n") +output = open("hp_wiki.tsv", "w", encoding="utf-8") +output.write("\t".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]) + "\n") # for every article for article in articles: @@ -102,7 +102,7 @@ for article in articles: # get the list of revisions from our function and then interating through it printinig it out revisions = get_article_revisions(title) for rev in revisions: - output.write(",".join(['"' + rev["title"] + '"', '"' + rev["user"] + '"', + output.write("\t".join(['"' + rev["title"] + '"', '"' + rev["user"] + '"', rev["timestamp"], str(rev["size"]), str(rev["anon"]), str(rev["minor"]), str(rev["revid"])]) + "\n") diff --git a/hpwp-minor.py b/hpwp-minor.py index 4e173f2..c327015 100644 --- a/hpwp-minor.py +++ b/hpwp-minor.py @@ -1,12 +1,12 @@ from csv import DictReader -input_file = open("hp_wiki.csv", 'r') +input_file = open("hp_wiki.tsv", 
'r', encoding="utf-8") num_edits = 0 num_anon = 0 -for row in DictReader(input_file): +for row in DictReader(input_file, delimiter="\t"): num_edits = num_edits + 1 - if row["anon"] == "False": + if row["anon"] == "True": num_anon = num_anon + 1 prop_anon = num_anon / num_edits diff --git a/hpwp-trend.py b/hpwp-trend.py index 939406e..aaa2b70 100644 --- a/hpwp-trend.py +++ b/hpwp-trend.py @@ -1,10 +1,10 @@ from csv import DictReader # read in the input file and count by day -input_file = open("hp_wiki.csv", 'r') +input_file = open("hp_wiki.tsv", 'r', encoding="utf-8") edits_by_day = {} -for row in DictReader(input_file): +for row in DictReader(input_file, delimiter="\t"): day_string = row['timestamp'][0:10] if day_string in edits_by_day: @@ -15,13 +15,13 @@ for row in DictReader(input_file): input_file.close() # output the counts by day -output_file = open("hp_edits_by_day.csv", "w") +output_file = open("hp_edits_by_day.tsv", "w", encoding='utf-8') # write a header -output_file.write("date,edits\n") +output_file.write("date\tedits\n") # iterate through every day and print out data into the file -for day_string in edits_by_day: - output_file.write(",".join([day_string, str(edits_by_day[day_string])]) + "\n") +for day_string in edits_by_day.keys(): + output_file.write("\t".join([day_string, str(edits_by_day[day_string])]) + "\n") output_file.close() -- 2.39.5