X-Git-Url: https://projects.mako.cc/source/harrypotter-wikipedia-cdsw/blobdiff_plain/c4b1a145625537c7ac8131b133e6d85c587ff203..0ba375a2bddabc25295762e9f7be8c2f4a90d204:/build_hpwp_dataset.py diff --git a/build_hpwp_dataset.py b/build_hpwp_dataset.py index 7cb5816..4ad96c9 100644 --- a/build_hpwp_dataset.py +++ b/build_hpwp_dataset.py @@ -90,8 +90,8 @@ articles_json = r.json() articles = articles_json["*"][0]["a"]["*"] # open a filie to write all the output -output = open("hp_wiki.csv", "w") -output.write(",".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]) + "\n") +output = open("hp_wiki.tsv", "w", encoding="utf-8") +output.write("\t".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]) + "\n") # for every article for article in articles: @@ -102,7 +102,7 @@ for article in articles: # get the list of revisions from our function and then interating through it printinig it out revisions = get_article_revisions(title) for rev in revisions: - output.write(",".join(['"' + rev["title"] + '"', '"' + rev["user"] + '"', + output.write("\t".join(['"' + rev["title"] + '"', '"' + rev["user"] + '"', rev["timestamp"], str(rev["size"]), str(rev["anon"]), str(rev["minor"]), str(rev["revid"])]) + "\n")