#include <stdlib.h>
#include "expat.h"
#include <getopt.h>
+#include "disorder.h"
+#include "md5.h"
// timestamp of the form 2003-11-07T00:43:23Z
#define DATE_LENGTH 10
write_row(revisionData *data)
{
- // TODO: make it so you can specify fields to output
- // note that date and time are separated by a space, to match postgres's
- // timestamp format
- printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
+ // Writes one revision as a single tab-separated row on stdout; the row
+ // ends with the text size, shannon_H() of the text and the MD5 of the
+ // text in hex.  When data->output_type is FULL, the revision comment and
+ // the text are printed after the row.
+ //
+ // hex-encode the MD5 digest of the revision text: 16 digest bytes become
+ // 32 hex characters plus the terminating NUL
+ md5_state_t state;
+ md5_byte_t digest[16];
+ char md5_hex_output[2 * 16 + 1];
+ md5_init(&state);
+ md5_append(&state, (const md5_byte_t *)data->text, data->text_size);
+ md5_finish(&state, digest);
+ int di;
+ for (di = 0; di < 16; ++di) {
+ sprintf(md5_hex_output + di * 2, "%02x", digest[di]);
+ }
+
+ // print line of tsv output; the size is cast to unsigned int below, so it
+ // is formatted with %u rather than a signed conversion
+ printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%u\t%f\t%s\n",
data->title,
data->articleid,
data->revid,
(data->editor[0] != '\0') ? "0" : "1", // anon?
data->editor,
data->editorid,
- (data->minor) ? "1" : "0");
- switch (data->output_type)
- {
- case SIMPLE:
- printf("\t%i\n", (unsigned int) strlen(data->text));
- //printf("\n");
- break;
- case FULL:
- printf("\t%s\t%s\n", data->comment, data->text);
- break;
+ (data->minor) ? "1" : "0",
+ (unsigned int) data->text_size,
+ shannon_H(data->text, data->text_size),
+ md5_hex_output
+ );
+
+ // FULL output: follow the row with the revision comment and the raw text
+ if (data->output_type == FULL) {
+ printf("comment:%s\ntext:\n%s\n", data->comment, data->text);
}
}
-char
-*append(char *entry, char *newstr)
-{
- char *newbuff;
- int len;
- len = (strlen(entry)+strlen(newstr))*sizeof(char) + 1;
- newbuff = (char*) realloc(entry, len);
- strcat(newbuff, newstr);
- return newbuff;
-}
-
-char
-*cache(char *entry, char *newstr)
-{
- char *newbuff;
- int len;
- len = strlen(newstr)*sizeof(char) + 1; // include space for the '\0' !
- newbuff = (char*) malloc(len);
- strcpy(newbuff,newstr);
- return newbuff;
-
-}
-
-char
-*store(char *entry, char *newstr)
-{
- char *newbuff;
- if (entry == NULL)
- newbuff = cache(entry, newstr);
- else
- newbuff = append(entry, newstr);
- return newbuff;
-}
-
void
split_timestamp(revisionData *data)
{
strncpy(data->time, timeinstamp, TIME_LENGTH);
}
-/* currently unused */
-static int
-is_whitespace(char *string) {
- int len = strlen(string);
- while (isspace(string[0]) && strlen(string) > 0) {
- string++;
- }
- if (strcmp(string, "") == 0)
- return 1;
- else
- return 0;
-}
-
// like strncat but with previously known length
char*
strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
fprintf(stderr, "\n");
- fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor, revlength\n");
+ fprintf(stderr, "title, articleid, revid, timestamp, anon, editor, editorid, minor, revlength, reventropy, revmd5\n");
fprintf(stderr, "\n");
fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
}