improvements based on a run-through in COM597G

author Benjamin Mako Hill <mako@atdot.cc>

Wed, 29 Apr 2015 21:44:04 +0000 (14:44 -0700)

committer Benjamin Mako Hill <mako@atdot.cc>

Wed, 29 Apr 2015 21:50:54 +0000 (14:50 -0700)
author Benjamin Mako Hill <mako@atdot.cc>
Wed, 29 Apr 2015 21:44:04 +0000 (14:44 -0700)
committer Benjamin Mako Hill <mako@atdot.cc>
Wed, 29 Apr 2015 21:50:54 +0000 (14:50 -0700)
diff --git a/.gitignore b/.gitignore

index 72699f7d10b0befb8cb4d9936f747b4ca8a80c07..3c8ddf31d18b93464f10ab2cbd42dd65995dd1c5 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1 @@
-/*.csv
+/*.tsv
diff --git a/build_hpwp_dataset.py b/build_hpwp_dataset.py

index 7cb58168d1a2b3fd6c2b2338f7edf147394a2557..4ad96c9cb5ac0839502d93d4f03a1a65e16ec20e 100644 (file)
--- a/build_hpwp_dataset.py
+++ b/build_hpwp_dataset.py
@@ -90,8 +90,8 @@ articles_json = r.json()
  articles = articles_json["*"][0]["a"]["*"]
  
  # open a file to write all the output
  articles = articles_json["*"][0]["a"]["*"]
  
  # open a file to write all the output
-output = open("hp_wiki.csv", "w")
-output.write(",".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]) + "\n")
+output = open("hp_wiki.tsv", "w", encoding="utf-8")
+output.write("\t".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]) + "\n")
  
  # for every article
  for article in articles:
  
  # for every article
  for article in articles:
@@ -102,7 +102,7 @@ for article in articles:
      # get the list of revisions from our function and then iterate through it, printing it out
      revisions = get_article_revisions(title)
      for rev in revisions:
      # get the list of revisions from our function and then iterate through it, printing it out
      revisions = get_article_revisions(title)
      for rev in revisions:
-        output.write(",".join(['"' + rev["title"] + '"', '"' + rev["user"] + '"',
+        output.write("\t".join(['"' + rev["title"] + '"', '"' + rev["user"] + '"',
                                 rev["timestamp"], str(rev["size"]), str(rev["anon"]),
                                 str(rev["minor"]), str(rev["revid"])]) + "\n")
  
                                 rev["timestamp"], str(rev["size"]), str(rev["anon"]),
                                 str(rev["minor"]), str(rev["revid"])]) + "\n")
  
diff --git a/hpwp-minor.py b/hpwp-minor.py

index 4e173f2daa4db6aaeffebed47e9e3ff99181ee52..c327015f78f0370a23c30da037deed6a7ce476fc 100644 (file)
--- a/hpwp-minor.py
+++ b/hpwp-minor.py
@@ -1,12 +1,12 @@
  from csv import DictReader
  
  from csv import DictReader
  
-input_file = open("hp_wiki.csv", 'r')
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
  
  num_edits = 0
  num_anon = 0
  
  num_edits = 0
  num_anon = 0
-for row in DictReader(input_file):
+for row in DictReader(input_file, delimiter="\t"):
      num_edits = num_edits + 1
      num_edits = num_edits + 1
-    if row["anon"] == "False":
+    if row["anon"] == "True":
          num_anon = num_anon + 1
  
  prop_anon = num_anon / num_edits
          num_anon = num_anon + 1
  
  prop_anon = num_anon / num_edits
diff --git a/hpwp-trend.py b/hpwp-trend.py

index 939406e5a2b902d586b736a2736e6b39cac3cd28..aaa2b70a0ef3a088fcad40d174571ccf806ff154 100644 (file)
--- a/hpwp-trend.py
+++ b/hpwp-trend.py
@@ -1,10 +1,10 @@
  from csv import DictReader
  
  # read in the input file and count by day
  from csv import DictReader
  
  # read in the input file and count by day
-input_file = open("hp_wiki.csv", 'r')
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
  
  edits_by_day = {}
  
  edits_by_day = {}
-for row in DictReader(input_file):
+for row in DictReader(input_file, delimiter="\t"):
      day_string = row['timestamp'][0:10]
  
      if day_string in edits_by_day:
      day_string = row['timestamp'][0:10]
  
      if day_string in edits_by_day:
@@ -15,13 +15,13 @@ for row in DictReader(input_file):
  input_file.close()
  
  # output the counts by day
  input_file.close()
  
  # output the counts by day
-output_file = open("hp_edits_by_day.csv", "w")
+output_file = open("hp_edits_by_day.tsv", "w", encoding='utf-8')
  
  # write a header
  
  # write a header
-output_file.write("date,edits\n")
+output_file.write("date\tedits\n")
  
  # iterate through every day and print out data into the file
  
  # iterate through every day and print out data into the file
-for day_string in edits_by_day:
-    output_file.write(",".join([day_string, str(edits_by_day[day_string])]) + "\n")
+for day_string in edits_by_day.keys():
+    output_file.write("\t".join([day_string, str(edits_by_day[day_string])]) + "\n")
  
  output_file.close()
  
  output_file.close()
author	Benjamin Mako Hill <mako@atdot.cc>
	Wed, 29 Apr 2015 21:44:04 +0000 (14:44 -0700)
committer	Benjamin Mako Hill <mako@atdot.cc>
	Wed, 29 Apr 2015 21:50:54 +0000 (14:50 -0700)
.gitignore		patch | blob | history
build_hpwp_dataset.py		patch | blob | history
hpwp-minor.py		patch | blob | history
hpwp-trend.py		patch | blob | history