X-Git-Url: https://projects.mako.cc/source/wikiq/blobdiff_plain/86aeece4b3a7ffa495b4c09441cffeec8e579f34..a8bbe4703654e6d6131368c0c42995f8933b3dce:/wikiq.c diff --git a/wikiq.c b/wikiq.c index 4d254f2..a8cf97d 100644 --- a/wikiq.c +++ b/wikiq.c @@ -10,6 +10,8 @@ #include #include "expat.h" #include +#include "disorder.h" +#include "md5.h" // timestamp of the form 2003-11-07T00:43:23Z #define DATE_LENGTH 10 @@ -209,10 +211,20 @@ static void write_row(revisionData *data) { - // TODO: make it so you can specify fields to output - // note that date and time are separated by a space, to match postgres's - // timestamp format - printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s", + // get md5sum + md5_state_t state; + md5_byte_t digest[16]; + char md5_hex_output[2 * 16 + 1]; + md5_init(&state); + md5_append(&state, (const md5_byte_t *)data->text, data->text_size); + md5_finish(&state, digest); + int di; + for (di = 0; di < 16; ++di) { + sprintf(md5_hex_output + di * 2, "%02x", digest[di]); + } + + // print line of tsv output + printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\n", data->title, data->articleid, data->revid, @@ -221,54 +233,19 @@ write_row(revisionData *data) (data->editor[0] != '\0') ? "0" : "1", // anon? data->editor, data->editorid, - (data->minor) ? "1" : "0"); - switch (data->output_type) - { - case SIMPLE: - printf("\t%i\n", (unsigned int) strlen(data->text)); - //printf("\n"); - break; - case FULL: - printf("\t%s\t%s\n", data->comment, data->text); - break; + (data->minor) ? "1" : "0", + (unsigned int) data->text_size, + shannon_H(data->text, data->text_size), + md5_hex_output + ); + + // + if (data->output_type == FULL) { + printf("comment:%s\ntext:\n%s\n", data->comment, data->text); } } -char -*append(char *entry, char *newstr) -{ - char *newbuff; - int len; - len = (strlen(entry)+strlen(newstr))*sizeof(char) + 1; - newbuff = (char*) realloc(entry, len); - strcat(newbuff, newstr); - return newbuff; -} - -char -*cache(char *entry, char *newstr) -{ - char *newbuff; - int len; - len = strlen(newstr)*sizeof(char) + 1; // include space for the '\0' ! - newbuff = (char*) malloc(len); - strcpy(newbuff,newstr); - return newbuff; - -} - -char -*store(char *entry, char *newstr) -{ - char *newbuff; - if (entry == NULL) - newbuff = cache(entry, newstr); - else - newbuff = append(entry, newstr); - return newbuff; -} - void split_timestamp(revisionData *data) { @@ -278,19 +255,6 @@ split_timestamp(revisionData *data) strncpy(data->time, timeinstamp, TIME_LENGTH); } -/* currently unused */ -static int -is_whitespace(char *string) { - int len = strlen(string); - while (isspace(string[0]) && strlen(string) > 0) { - string++; - } - if (strcmp(string, "") == 0) - return 1; - else - return 0; -} - // like strncat but with previously known length char* strlcatn(char *dest, const char *src, size_t dest_len, size_t n) @@ -444,7 +408,7 @@ void print_usage(char* argv[]) { fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n"); fprintf(stderr, "a tab-separated stream of revisions on standard out:\n"); fprintf(stderr, "\n"); - fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor, revlength\n"); + fprintf(stderr, "title, articleid, revid, timestamp, anon, editor, editorid, minor, revlength, reventropy, revmd5\n"); fprintf(stderr, "\n"); fprintf(stderr, "author: Erik Garrison \n"); }